debian-mirror-gitlab/lib/gitlab/sidekiq_daemon/memory_killer.rb

# frozen_string_literal: true

module Gitlab
  module SidekiqDaemon
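    # Watchdog daemon that runs inside the Sidekiq process, samples the
    # worker's RSS on an interval and restarts Sidekiq when memory stays
    # out of range.
    #
    # A minimal usage sketch (assuming the `Daemon` superclass, i.e.
    # `Gitlab::Daemon`, provides `.instance` and `#start` as it does
    # elsewhere in this repository):
    #
    #   Gitlab::SidekiqDaemon::MemoryKiller.instance.start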
    class MemoryKiller < Daemon
      include ::Gitlab::Utils::StrongMemoize

      # Today's 64-bit CPUs support at most 256 TB of memory, which is more than enough.
      MAX_MEMORY_KB = 256 * 1024 * 1024 * 1024
      # RSS below `soft_limit_rss` is considered safe
      SOFT_LIMIT_RSS_KB = ENV.fetch('SIDEKIQ_MEMORY_KILLER_MAX_RSS', 2000000).to_i
      # A worker whose RSS goes above `hard_limit_rss` will be stopped
      HARD_LIMIT_RSS_KB = ENV.fetch('SIDEKIQ_MEMORY_KILLER_HARD_LIMIT_RSS', MAX_MEMORY_KB).to_i
      # RSS in the range (soft_limit_rss, hard_limit_rss) is allowed for GRACE_BALLOON_SECONDS
      GRACE_BALLOON_SECONDS = ENV.fetch('SIDEKIQ_MEMORY_KILLER_GRACE_TIME', 15 * 60).to_i
      # Check RSS every CHECK_INTERVAL_SECONDS, with a minimum of 2 seconds
      CHECK_INTERVAL_SECONDS = [ENV.fetch('SIDEKIQ_MEMORY_KILLER_CHECK_INTERVAL', 3).to_i, 2].max
      # Give Sidekiq up to 30 seconds to allow existing jobs to finish after exceeding the limit
      SHUTDOWN_TIMEOUT_SECONDS = ENV.fetch('SIDEKIQ_MEMORY_KILLER_SHUTDOWN_WAIT', 30).to_i
      # Developers/admins should always set `memory_killer_max_memory_growth_kb` explicitly.
      # When it is not set, default to 300 MB as an extra safety margin.
      DEFAULT_MAX_MEMORY_GROWTH_KB = 300_000

      # Phases of the memory killer
      PHASE = {
        running: 1,
        above_soft_limit: 2,
        stop_fetching_new_jobs: 3,
        shutting_down: 4,
        killing_sidekiq: 5
      }.freeze

      def initialize
        super

        @enabled = true
        @metrics = init_metrics
        @sidekiq_daemon_monitor = Gitlab::SidekiqDaemon::Monitor.instance
      end

      private
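
      # Register the Prometheus gauges (current RSS, soft/hard limits, phase)
      # and the running-jobs counter that the daemon updates on every check.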
      def init_metrics
        {
          sidekiq_current_rss: ::Gitlab::Metrics.gauge(:sidekiq_current_rss, 'Current RSS of Sidekiq Worker'),
          sidekiq_memory_killer_soft_limit_rss: ::Gitlab::Metrics.gauge(:sidekiq_memory_killer_soft_limit_rss, 'Current soft_limit_rss of Sidekiq Worker'),
          sidekiq_memory_killer_hard_limit_rss: ::Gitlab::Metrics.gauge(:sidekiq_memory_killer_hard_limit_rss, 'Current hard_limit_rss of Sidekiq Worker'),
          sidekiq_memory_killer_phase: ::Gitlab::Metrics.gauge(:sidekiq_memory_killer_phase, 'Current phase of Sidekiq Worker'),
          sidekiq_memory_killer_running_jobs: ::Gitlab::Metrics.counter(:sidekiq_memory_killer_running_jobs_total, 'Current running jobs when limit was reached')
        }
      end
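
      # Take a fresh snapshot of the current RSS, the (job-adjusted) soft limit,
      # the hard limit and total system memory for the given phase, and publish
      # them as Prometheus gauges.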
      def refresh_state(phase)
        @phase = PHASE.fetch(phase)
        @current_rss = get_rss_kb
        @soft_limit_rss = get_soft_limit_rss_kb
        @hard_limit_rss = get_hard_limit_rss_kb
        @memory_total = get_memory_total_kb

        # track the current state as prometheus gauges
        @metrics[:sidekiq_memory_killer_phase].set({}, @phase)
        @metrics[:sidekiq_current_rss].set({}, @current_rss)
        @metrics[:sidekiq_memory_killer_soft_limit_rss].set({}, @soft_limit_rss)
        @metrics[:sidekiq_memory_killer_hard_limit_rss].set({}, @hard_limit_rss)
      end
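
      # Main daemon loop: sleep for CHECK_INTERVAL_SECONDS, check whether RSS
      # is within range, and trigger a graceful Sidekiq restart when it is not.
      # StandardError is logged and swallowed so the watchdog keeps running;
      # other exceptions are logged and re-raised.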
      def run_thread
        Sidekiq.logger.info(
          class: self.class.to_s,
          action: 'start',
          pid: pid,
          message: 'Starting Gitlab::SidekiqDaemon::MemoryKiller Daemon'
        )

        while enabled?
          begin
            sleep(CHECK_INTERVAL_SECONDS)
            restart_sidekiq unless rss_within_range?
          rescue StandardError => e
            log_exception(e, __method__)
          rescue Exception => e # rubocop:disable Lint/RescueException
            log_exception(e, __method__)
            raise e
          end
        end
      ensure
        Sidekiq.logger.warn(
          class: self.class.to_s,
          action: 'stop',
          pid: pid,
          message: 'Stopping Gitlab::SidekiqDaemon::MemoryKiller Daemon'
        )
      end

      def log_exception(exception, method)
        Sidekiq.logger.warn(
          class: self.class.to_s,
          pid: pid,
          message: "Exception from #{method}: #{exception.message}"
        )
      end

      def stop_working
        @enabled = false
      end

      def enabled?
        @enabled
      end
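
      # Escalating restart sequence: SIGTSTP stops the fetching of new jobs,
      # SIGTERM asks Sidekiq to shut down gracefully, and SIGKILL to the whole
      # process group is the last resort if the process is still alive afterwards.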
      def restart_sidekiq
        return if Feature.enabled?(:sidekiq_memory_killer_read_only_mode, type: :ops)

        # Tell Sidekiq to stop fetching new jobs
        # We first send the signal and then wait the given time
        # We also monitor the number of running jobs and allow an early restart
        refresh_state(:stop_fetching_new_jobs)
        signal_and_wait(SHUTDOWN_TIMEOUT_SECONDS, 'SIGTSTP', 'stop fetching new jobs')
        return unless enabled?

        # Tell Sidekiq to restart itself
        # To be extra safe, wait `Sidekiq[:timeout] + 2` seconds before SIGKILL
        refresh_state(:shutting_down)
        signal_and_wait(Sidekiq[:timeout] + 2, 'SIGTERM', 'gracefully shut down')
        return unless enabled?

        # Ideally we should never reach this condition
        # Wait for Sidekiq to shut down gracefully, and kill it if it didn't
        # Kill the whole pgroup, so we can be sure no children are left behind
        refresh_state(:killing_sidekiq)
        signal_pgroup('SIGKILL', 'die')
      end
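
      # Grace-balloon check. With the defaults (soft limit 2,000,000 KB, grace
      # period 15 minutes), an RSS of e.g. 2,100,000 KB is tolerated as long as
      # it drops back under the soft limit within 15 minutes and never crosses
      # the hard limit; otherwise this returns false and a restart is triggered.
      # (The 2,100,000 KB figure is only an illustrative value.)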
      def rss_within_range?
        refresh_state(:running)

        deadline = Gitlab::Metrics::System.monotonic_time + GRACE_BALLOON_SECONDS.seconds
        loop do
          return true unless enabled?

          # RSS above the hard limit should trigger a forcible shutdown right away
          break if @current_rss > @hard_limit_rss

          # RSS dropped below the soft limit
          return true if @current_rss < @soft_limit_rss

          # RSS did not go below the soft limit within the deadline, restart
          break if Gitlab::Metrics::System.monotonic_time > deadline

          sleep(CHECK_INTERVAL_SECONDS)
          refresh_state(:above_soft_limit)
          log_rss_out_of_range(false)
        end

        # There are two ways to break out of the loop:
        # - above the hard limit, or
        # - above the soft limit after the deadline
        # When above the hard limit, we immediately go to `stop_fetching_new_jobs`,
        # so ignore the `above hard limit` case and always set `above_soft_limit` here
        refresh_state(:above_soft_limit)
        log_rss_out_of_range

        false
      end

      def log_rss_out_of_range(deadline_exceeded = true)
        reason = out_of_range_description(@current_rss,
                                          @hard_limit_rss,
                                          @soft_limit_rss,
                                          deadline_exceeded)

        running_jobs = fetch_running_jobs

        Sidekiq.logger.warn(
          class: self.class.to_s,
          pid: pid,
          message: 'Sidekiq worker RSS out of range',
          current_rss: @current_rss,
          soft_limit_rss: @soft_limit_rss,
          hard_limit_rss: @hard_limit_rss,
          memory_total_kb: @memory_total,
          reason: reason,
          running_jobs: running_jobs)

        increment_worker_counters(running_jobs, deadline_exceeded)
      end

      def increment_worker_counters(running_jobs, deadline_exceeded)
        running_jobs.each do |job|
          @metrics[:sidekiq_memory_killer_running_jobs].increment({ worker_class: job[:worker_class], deadline_exceeded: deadline_exceeded })
        end
      end

      def fetch_running_jobs
        @sidekiq_daemon_monitor.jobs.map do |jid, job|
          {
            jid: jid,
            worker_class: job[:worker_class].name
          }
        end
      end

      def out_of_range_description(rss, hard_limit, soft_limit, deadline_exceeded)
        if rss > hard_limit
          "current_rss(#{rss}) > hard_limit_rss(#{hard_limit})"
        elsif deadline_exceeded
          "current_rss(#{rss}) > soft_limit_rss(#{soft_limit}) longer than GRACE_BALLOON_SECONDS(#{GRACE_BALLOON_SECONDS})"
        else
          "current_rss(#{rss}) > soft_limit_rss(#{soft_limit})"
        end
      end

      def get_memory_total_kb
        Gitlab::Metrics::System.memory_total / 1.kilobytes
      end

      def get_rss_kb
        Gitlab::Metrics::System.memory_usage_rss[:total] / 1.kilobytes
      end

      def get_soft_limit_rss_kb
        SOFT_LIMIT_RSS_KB + rss_increase_by_jobs
      end

      def get_hard_limit_rss_kb
        HARD_LIMIT_RSS_KB
      end
      def signal_and_wait(time, signal, explanation)
        Sidekiq.logger.warn(
          class: self.class.to_s,
          pid: pid,
          signal: signal,
          explanation: explanation,
          wait_time: time,
          message: "Sending signal and waiting"
        )
        Process.kill(signal, pid)

        deadline = Gitlab::Metrics::System.monotonic_time + time

        # Sleep until thread killed or timeout reached
        sleep(CHECK_INTERVAL_SECONDS) while enabled? && Gitlab::Metrics::System.monotonic_time < deadline
      end
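
      # When this process is its own process group leader, signal PID 0 so the
      # signal reaches the whole group (and any child processes); otherwise
      # fall back to signalling just our own PID.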
      def signal_pgroup(signal, explanation)
        if Process.getpgrp == pid
          pid_or_pgrp_str = 'PGRP'
          pid_to_signal = 0
        else
          pid_or_pgrp_str = 'PID'
          pid_to_signal = pid
        end

        Sidekiq.logger.warn(
          class: self.class.to_s,
          signal: signal,
          pid: pid,
          message: "sending Sidekiq worker #{pid_or_pgrp_str}-#{pid} #{signal} (#{explanation})"
        )
        Process.kill(signal, pid_to_signal)
      end
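
      # The soft limit is raised by the memory growth that the currently running
      # jobs are expected (via their sidekiq_options) to add on top of the
      # baseline RSS.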
      def rss_increase_by_jobs
        @sidekiq_daemon_monitor.jobs.sum do |_, job|
          rss_increase_by_job(job)
        end
      end
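
      # Expected RSS growth for a single job, capped at its configured maximum.
      # Illustrative example (hypothetical values): a worker declaring
      # `memory_killer_memory_growth_kb: 10` that has been running for 50
      # seconds contributes [10 * 50, 300_000].min = 500 KB to the soft limit.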
      def rss_increase_by_job(job)
        memory_growth_kb = get_job_options(job, 'memory_killer_memory_growth_kb', 0).to_i
        max_memory_growth_kb = get_job_options(job, 'memory_killer_max_memory_growth_kb', DEFAULT_MAX_MEMORY_GROWTH_KB).to_i

        return 0 if memory_growth_kb == 0

        time_elapsed = [Gitlab::Metrics::System.monotonic_time - job[:started_at], 0].max
        [memory_growth_kb * time_elapsed, max_memory_growth_kb].min
      end
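
      # Read a memory-killer option from the worker class's sidekiq_options,
      # falling back to the default if the key is missing or the lookup raises
      # (for example when the worker class cannot be resolved).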
      def get_job_options(job, key, default)
        job[:worker_class].sidekiq_options.fetch(key, default)
      rescue StandardError
        default
      end

      def pid
        Process.pid
      end
    end
  end
end