debian-mirror-gitlab/lib/gitlab/memory/watchdog.rb

# frozen_string_literal: true

module Gitlab
  module Memory
    # A background thread that observes Ruby heap fragmentation and calls
    # into a handler when the Ruby heap has been fragmented for an extended
    # period of time.
    #
    # See Gitlab::Metrics::Memory for how heap fragmentation is defined.
    #
    # To decide whether a given fragmentation level is being exceeded,
    # the watchdog regularly polls the GC. Whenever a violation occurs
    # a strike is issued. If the maximum number of strikes is reached,
    # a handler is invoked to deal with the situation.
    #
    # The duration for which a process may be above a given fragmentation
    # threshold is computed as `max_strikes * sleep_time_seconds`.
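    #
    # For example, with the defaults defined below (DEFAULT_MAX_STRIKES = 5,
    # DEFAULT_SLEEP_TIME_SECONDS = 300), a process may remain above a
    # threshold for roughly 5 * 300s = 25 minutes before a handler is invoked.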
    class Watchdog
      DEFAULT_SLEEP_TIME_SECONDS = 60 * 5
      DEFAULT_MAX_HEAP_FRAG = 0.5
      DEFAULT_MAX_MEM_GROWTH = 3.0
      DEFAULT_MAX_STRIKES = 5

      # This handler does nothing. It returns `false` to indicate to the
      # caller that the situation has not been dealt with so it will
      # receive calls repeatedly if fragmentation remains high.
      #
      # This is useful for "dress rehearsals" in production since it allows
      # us to observe how frequently the handler is invoked before taking action.
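      #
      # For example, a watchdog built without an explicit handler (the default
      # below is `NullHandler.instance`) will only log violations and increment
      # the Prometheus counters.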
      class NullHandler
        include Singleton

        def call
          # NOP
          false
        end
      end

      # This handler sends SIGTERM and considers the situation handled.
      class TermProcessHandler
        def initialize(pid = $$)
          @pid = pid
        end

        def call
          Process.kill(:TERM, @pid)
          true
        end
      end

      # This handler invokes Puma's graceful termination handler, which takes
      # into account a configurable grace period during which a process may
      # remain unresponsive to a SIGTERM.
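      #
      # Note: this handler reads its configuration from `::Puma.cli_config`,
      # so it is only usable in a process that was started through the Puma CLI.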
      class PumaHandler
        def initialize(puma_options = ::Puma.cli_config.options)
          @worker = ::Puma::Cluster::WorkerHandle.new(0, $$, 0, puma_options)
        end

        def call
          @worker.term
          true
        end
      end

      # max_heap_fragmentation:
      #   The degree to which the Ruby heap is allowed to be fragmented. Range [0,1].
      # max_mem_growth:
      #   A multiplier for how much excess private memory a worker can map compared to a reference process
      #   (itself or the primary in a pre-fork server.)
      # max_strikes:
      #   How many times the process is allowed to be above max_heap_fragmentation before
      #   a handler is invoked.
      # sleep_time_seconds:
      #   Used to control the frequency with which the watchdog will wake up and poll the GC.
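      #
      # A minimal usage sketch (the values are illustrative, not recommendations):
      #
      #   watchdog = Gitlab::Memory::Watchdog.new(
      #     handler: Gitlab::Memory::Watchdog::TermProcessHandler.new,
      #     max_strikes: 3,
      #     sleep_time_seconds: 60
      #   )
      #   Thread.new { watchdog.call } # poll in a background thread
      #   # ... later, e.g. during shutdown:
      #   watchdog.stop                # the loop exits after the current sleep interval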
      def initialize(
        handler: NullHandler.instance,
        logger: Logger.new($stdout),
        max_heap_fragmentation: ENV['GITLAB_MEMWD_MAX_HEAP_FRAG']&.to_f || DEFAULT_MAX_HEAP_FRAG,
        max_mem_growth: ENV['GITLAB_MEMWD_MAX_MEM_GROWTH']&.to_f || DEFAULT_MAX_MEM_GROWTH,
        max_strikes: ENV['GITLAB_MEMWD_MAX_STRIKES']&.to_i || DEFAULT_MAX_STRIKES,
        sleep_time_seconds: ENV['GITLAB_MEMWD_SLEEP_TIME_SEC']&.to_i || DEFAULT_SLEEP_TIME_SECONDS,
        **options)
        super(**options)

        @handler = handler
        @logger = logger
        @sleep_time_seconds = sleep_time_seconds
        @max_strikes = max_strikes

        @stats = {
          heap_frag: {
            max: max_heap_fragmentation,
            strikes: 0
          },
          mem_growth: {
            max: max_mem_growth,
            strikes: 0
          }
        }

        @alive = true

        init_prometheus_metrics(max_heap_fragmentation)
      end

      attr_reader :max_strikes, :sleep_time_seconds

      def max_heap_fragmentation
        @stats[:heap_frag][:max]
      end

      def max_mem_growth
        @stats[:mem_growth][:max]
      end

      def strikes(stat)
        @stats[stat][:strikes]
      end

      def call
        @logger.info(log_labels.merge(message: 'started'))

        while @alive
          sleep(@sleep_time_seconds)

          next unless Feature.enabled?(:gitlab_memory_watchdog, type: :ops)

          monitor_heap_fragmentation
          monitor_memory_growth
        end

        @logger.info(log_labels.merge(message: 'stopped'))
      end

      def stop
        @alive = false
      end

      private
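      # Shared strike accounting for all monitored conditions. The block receives
      # the per-condition stats hash and must return a pair of [ok, labels]:
      # consecutive violations accumulate strikes, a healthy reading resets them,
      # and once strikes exceed @max_strikes the handler is invoked through
      # memory_limit_exceeded_callback.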
      def monitor_memory_condition(stat_key)
        return unless @alive

        stat = @stats[stat_key]

        ok, labels = yield(stat)

        if ok
          stat[:strikes] = 0
        else
          stat[:strikes] += 1
          @counter_violations.increment(reason: stat_key.to_s)
        end

        if stat[:strikes] > @max_strikes
          @alive = !memory_limit_exceeded_callback(stat_key, labels)
          stat[:strikes] = 0
        end
      end
      def monitor_heap_fragmentation
        monitor_memory_condition(:heap_frag) do |stat|
          heap_fragmentation = Gitlab::Metrics::Memory.gc_heap_fragmentation

          [
            heap_fragmentation <= stat[:max],
            {
              message: 'heap fragmentation limit exceeded',
              memwd_cur_heap_frag: heap_fragmentation,
              memwd_max_heap_frag: stat[:max]
            }
          ]
        end
      end
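      # Compares this worker's unique set size (USS) against a multiple of the
      # reference process' USS. For example, with max_mem_growth = 3.0 and a
      # reference USS of 1 GB, the condition is violated once the worker's own
      # USS exceeds 3 GB.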
      def monitor_memory_growth
        monitor_memory_condition(:mem_growth) do |stat|
          worker_uss = Gitlab::Metrics::System.memory_usage_uss_pss[:uss]
          reference_uss = reference_mem[:uss]
          memory_limit = stat[:max] * reference_uss

          [
            worker_uss <= memory_limit,
            {
              message: 'memory limit exceeded',
              memwd_uss_bytes: worker_uss,
              memwd_ref_uss_bytes: reference_uss,
              memwd_max_uss_bytes: memory_limit
            }
          ]
        end
      end

      # On pre-fork systems this would be the primary process memory from which workers fork.
      # Otherwise it is the current process' memory.
      #
      # We initialize this lazily because in the initializer the application may not have
      # finished booting yet, which would yield an incorrect baseline.
      def reference_mem
        @reference_mem ||= Gitlab::Metrics::System.memory_usage_uss_pss(pid: Gitlab::Cluster::PRIMARY_PID)
      end
      def memory_limit_exceeded_callback(stat_key, handler_labels)
        all_labels = log_labels.merge(handler_labels)
          .merge(memwd_cur_strikes: strikes(stat_key))
        @logger.warn(all_labels)
        @counter_violations_handled.increment(reason: stat_key.to_s)

        handler.call
      end

      def handler
        # This allows us to keep the watchdog running but turn it into "friendly mode" where
        # all that happens is we collect logs and Prometheus events for fragmentation violations.
        return NullHandler.instance unless Feature.enabled?(:enforce_memory_watchdog, type: :ops)

        @handler
      end
      def log_labels
        {
          pid: $$,
          worker_id: worker_id,
          memwd_handler_class: handler.class.name,
          memwd_sleep_time_s: @sleep_time_seconds,
          memwd_max_strikes: @max_strikes,
          memwd_rss_bytes: process_rss_bytes
        }
      end

      def worker_id
        ::Prometheus::PidProvider.worker_id
      end

      def process_rss_bytes
        Gitlab::Metrics::System.memory_usage_rss
      end
      def init_prometheus_metrics(max_heap_fragmentation)
        @heap_frag_limit = Gitlab::Metrics.gauge(
          :gitlab_memwd_heap_frag_limit,
          'The configured limit for how fragmented the Ruby heap is allowed to be'
        )
        @heap_frag_limit.set({}, max_heap_fragmentation)

        default_labels = { pid: worker_id }

        @counter_violations = Gitlab::Metrics.counter(
          :gitlab_memwd_violations_total,
          'Total number of times a Ruby process violated a memory threshold',
          default_labels
        )
        @counter_violations_handled = Gitlab::Metrics.counter(
          :gitlab_memwd_violations_handled_total,
          'Total number of times Ruby process memory violations were handled',
          default_labels
        )
      end
    end
  end
end