# frozen_string_literal: true

module Gitlab
  module Memory
    # A background thread that monitors Ruby memory and calls
    # into a handler when the Ruby process violates defined limits
    # for an extended period of time.
    #
    # Lifecycle as implemented below:
    # * `configure` yields the configuration object so callers can set it up.
    # * `call` runs the sleep/monitor loop until the watchdog is told to stop.
    # * `stop` ends the loop and forwards the stop request to the handler
    #   when the handler supports it.
    class Watchdog
      def initialize
        @configuration = Configuration.new
        @alive = true
        # Set by `stop_working`; initialised here so it is always defined
        # when `call` reads it for the final `stopped` event.
        @stop_reason = nil
      end

      ##
      # Configuration for Watchdog, see Gitlab::Memory::Watchdog::Configurator
      # for examples.
      #
      # Yields the watchdog's configuration object to the given block.
      def configure
        yield configuration
      end

      # Runs the watchdog loop.
      #
      # Reports a `started` event, then repeatedly sleeps for
      # `sleep_time_seconds` and runs one `monitor` pass for as long as the
      # watchdog is alive. Once the loop ends, reports a `stopped` event that
      # carries the stop reason (nil-valued labels are dropped by `compact`).
      def call
        event_reporter.started(log_labels)

        while @alive
          sleep(sleep_time_seconds)

          monitor
        end

        event_reporter.stopped(log_labels(memwd_reason: @stop_reason).compact)
      end

      # Marks the watchdog as stopped and asks the handler to stop as well,
      # if the handler responds to `stop`.
      #
      # The loop in `call` observes the change the next time it checks
      # `@alive`.
      def stop
        stop_working(reason: 'background task stopped')

        # `handler` re-evaluates the feature flag on every call, so resolve it
        # once: the `respond_to?` check and the `stop` call must target the
        # same object.
        current_handler = handler
        current_handler.stop if current_handler.respond_to?(:stop)
      end

      private

      attr_reader :configuration

      # NOTE(review): `delegate ... to:` is not core Ruby; presumably this is
      # ActiveSupport's Module#delegate - confirm before changing this line.
      delegate :event_reporter, :monitors, :sleep_time_seconds, to: :configuration

      # Runs a single monitoring pass.
      #
      # Stops the watchdog when no monitors are configured. Otherwise walks
      # the monitor results: every threshold violation is reported, and a
      # result whose strikes are exceeded triggers `strike_exceeded_callback`.
      def monitor
        if monitors.empty?
          stop_working(reason: 'monitors are not configured')
          return
        end

        monitors.call_each do |result|
          # The watchdog may have been stopped while iterating (by `stop` or
          # by a previous result's callback); skip the remaining results.
          break unless @alive

          next unless result.threshold_violated?

          event_reporter.threshold_violated(result.monitor_name)

          next unless result.strikes_exceeded?

          strike_exceeded_callback(result.monitor_name, result.payload)
        end
      end

      # Reacts to a monitor that exceeded its allowed strikes: reports the
      # event, enqueues a heap dump, and invokes the handler. The watchdog
      # stops only when the handler call returns a truthy value.
      def strike_exceeded_callback(monitor_name, monitor_payload)
        event_reporter.strikes_exceeded(monitor_name, log_labels(monitor_payload))

        Gitlab::Memory::Reports::HeapDump.enqueue!

        stop_working(reason: 'successfully handled') if handler.call
      end

      # Returns the handler to use: the null handler while the
      # `enforce_memory_watchdog` ops flag is disabled, otherwise the
      # configured handler.
      def handler
        # This allows us to keep the watchdog running but turn it into "friendly mode" where
        # all that happens is we collect logs and Prometheus events for fragmentation violations.
        return Handlers::NullHandler.instance unless Feature.enabled?(:enforce_memory_watchdog, type: :ops)

        configuration.handler
      end

      # Builds the label hash attached to reported events.
      #
      # Returns a new hash made of `extra` plus the handler class name and
      # the sleep time. On a key collision the watchdog's own labels win,
      # because they are the argument to `merge`.
      def log_labels(extra = {})
        extra.merge(
          memwd_handler_class: handler.class.name,
          memwd_sleep_time_s: sleep_time_seconds
        )
      end

      # Records why the watchdog stops and flips it to not-alive.
      #
      # Does nothing when the watchdog is already stopped, so the first
      # recorded reason is kept.
      def stop_working(reason:)
        return unless @alive

        @stop_reason = reason
        @alive = false
      end
    end
  end
end
|