# frozen_string_literal: true

module Gitlab
  module Memory
    # A background thread that monitors Ruby memory and calls
    # into a handler when the Ruby process violates defined limits
    # for an extended period of time.
    #
    # Lifecycle: build an instance, adjust it through #configure, then run
    # #call (it blocks in a sleep/monitor loop until #stop is invoked or a
    # handler reports that the violation has been dealt with).
    class Watchdog
      # This handler does nothing. It returns `false` to indicate to the
      # caller that the situation has not been dealt with so it will
      # receive calls repeatedly if fragmentation remains high.
      #
      # This is useful for "dress rehearsals" in production since it allows
      # us to observe how frequently the handler is invoked before taking action.
      class NullHandler
        include Singleton

        # @return [false] always; the watchdog loop therefore keeps running.
        def call
          # NOP
          false
        end
      end

      # This handler sends SIGTERM and considers the situation handled.
      class TermProcessHandler
        # @param pid [Integer] process to signal; defaults to the current process.
        def initialize(pid = Process.pid)
          @pid = pid
        end

        # Sends TERM to the configured pid.
        #
        # @return [true] always; the watchdog loop stops after this returns.
        def call
          Process.kill(:TERM, @pid)
          true
        end
      end

      # This handler invokes Puma's graceful termination handler, which takes
      # into account a configurable grace period during which a process may
      # remain unresponsive to a SIGTERM.
      class PumaHandler
        # @param puma_options the options handed to the worker handle;
        #   defaults to the options of the running Puma CLI configuration.
        def initialize(puma_options = ::Puma.cli_config.options)
          # NOTE(review): the two literal zeros are presumably the worker index
          # and phase expected by WorkerHandle - confirm against the Puma version in use.
          @worker = ::Puma::Cluster::WorkerHandle.new(0, Process.pid, 0, puma_options)
        end

        # Asks the worker handle to terminate the current process.
        #
        # @return [true] always; the watchdog loop stops after this returns.
        def call
          @worker.term
          true
        end
      end

      # Builds a watchdog with a fresh Configuration, marks it as alive and
      # registers the Prometheus counters used by #monitor.
      def initialize
        @configuration = Configuration.new
        @alive = true

        init_prometheus_metrics
      end

      ##
      # Configuration for Watchdog, use like:
      #
      #   watchdog.configure do |config|
      #     config.handler = Gitlab::Memory::Watchdog::TermProcessHandler.new
      #     config.sleep_time_seconds = 60
      #     config.logger = Gitlab::AppLogger
      #     config.monitors do |stack|
      #       stack.push MyMonitorClass, args*, max_strikes:, kwargs**, &block
      #     end
      #   end
      #
      # The configured handler is invoked through `#call`, so assign an object
      # that responds to `call` (e.g. a handler instance), not a handler class.
      #
      # @yieldparam config [Configuration] the configuration owned by this watchdog
      def configure
        yield @configuration
      end

      # Runs the watchdog loop: logs 'started', then repeatedly sleeps for the
      # configured interval and runs the monitors (only while the
      # `gitlab_memory_watchdog` ops flag is enabled) until the watchdog is no
      # longer alive, and finally logs 'stopped'.
      def call
        logger.info(log_labels.merge(message: 'started'))

        while @alive
          sleep(sleep_time_seconds)

          monitor if Feature.enabled?(:gitlab_memory_watchdog, type: :ops)
        end

        logger.info(log_labels.merge(message: 'stopped'))
      end

      # Marks the watchdog as no longer alive. The loop in #call observes the
      # flag on its next check, and #monitor stops processing further results.
      def stop
        @alive = false
      end

      private

      # Walks the results yielded by the configured monitors. Every violated
      # threshold is counted; once a result reports that its strikes are
      # exceeded, the handler is invoked and its return value decides whether
      # the watchdog stays alive (`true` from the handler stops it).
      def monitor
        @configuration.monitors.call_each do |result|
          # The watchdog may have been stopped while earlier results were processed.
          break unless @alive

          next unless result.threshold_violated?

          @counter_violations.increment(reason: result.monitor_name)

          next unless result.strikes_exceeded?

          @alive = !memory_limit_exceeded_callback(result.monitor_name, result.payload)
        end
      end

      # Logs the violation, counts it as handled and delegates to the handler.
      #
      # @param monitor_name the name reported by the violating monitor
      # @param monitor_payload a hash-like payload merged into the log entry
      # @return whatever the handler's `call` returns (truthy means "handled")
      def memory_limit_exceeded_callback(monitor_name, monitor_payload)
        all_labels = log_labels.merge(monitor_payload)
        logger.warn(all_labels)
        @counter_violations_handled.increment(reason: monitor_name)

        handler.call
      end

      # @return the handler to invoke: the configured one when enforcement is
      #   enabled, otherwise the NullHandler singleton.
      def handler
        # This allows us to keep the watchdog running but turn it into "friendly mode" where
        # all that happens is we collect logs and Prometheus events for fragmentation violations.
        return NullHandler.instance unless Feature.enabled?(:enforce_memory_watchdog, type: :ops)

        @configuration.handler
      end

      # @return the logger taken from the configuration
      def logger
        @configuration.logger
      end

      # @return the pause between two monitoring rounds, taken from the configuration
      def sleep_time_seconds
        @configuration.sleep_time_seconds
      end

      # @return [Hash] labels attached to every log entry emitted by the watchdog
      def log_labels
        {
          pid: Process.pid,
          worker_id: worker_id,
          memwd_handler_class: handler.class.name,
          memwd_sleep_time_s: sleep_time_seconds,
          memwd_rss_bytes: process_rss_bytes
        }
      end

      # @return the `:total` entry of the RSS usage reported by Gitlab::Metrics::System
      def process_rss_bytes
        Gitlab::Metrics::System.memory_usage_rss[:total]
      end

      # @return the worker id reported by the Prometheus pid provider
      def worker_id
        ::Prometheus::PidProvider.worker_id
      end

      # Registers the two counters incremented by #monitor and
      # #memory_limit_exceeded_callback.
      def init_prometheus_metrics
        # NOTE: the `pid` label carries the Prometheus worker id, not the OS pid.
        default_labels = { pid: worker_id }

        @counter_violations = Gitlab::Metrics.counter(
          :gitlab_memwd_violations_total,
          'Total number of times a Ruby process violated a memory threshold',
          default_labels
        )
        @counter_violations_handled = Gitlab::Metrics.counter(
          :gitlab_memwd_violations_handled_total,
          'Total number of times Ruby process memory violations were handled',
          default_labels
        )
      end
    end
  end
end