debian-mirror-gitlab/lib/gitlab/database/load_balancing/service_discovery.rb

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

248 lines
8.3 KiB
Ruby
Raw Normal View History

2021-09-04 01:27:46 +05:30
# frozen_string_literal: true
require 'net/dns'
require 'resolv'
module Gitlab
module Database
module LoadBalancing
# Service discovery of secondary database hosts.
#
# Service discovery works by periodically looking up a DNS record. If the
# DNS record returns a new list of hosts, this class will update the load
# balancer with said hosts. Requests may continue to use the old hosts
# until they complete.
class ServiceDiscovery
2021-11-11 11:23:49 +05:30
EmptyDnsResponse = Class.new(StandardError)
2021-10-27 15:23:28 +05:30
attr_reader :interval, :record, :record_type, :disconnect_timeout,
:load_balancer
2021-09-04 01:27:46 +05:30
MAX_SLEEP_ADJUSTMENT = 10
2021-11-11 11:23:49 +05:30
MAX_DISCOVERY_RETRIES = 3
RETRY_DELAY_RANGE = (0.1..0.2).freeze
2021-09-04 01:27:46 +05:30
RECORD_TYPES = {
'A' => Net::DNS::A,
'SRV' => Net::DNS::SRV
}.freeze
Address = Struct.new(:hostname, :port) do
def to_s
port ? "#{hostname}:#{port}" : hostname
end
def <=>(other)
self.to_s <=> other.to_s
end
end
# nameserver - The nameserver to use for DNS lookups.
# port - The port of the nameserver.
# record - The DNS record to look up for retrieving the secondaries.
# record_type - The type of DNS record to look up
# interval - The time to wait between lookups.
# disconnect_timeout - The time after which an old host should be
# forcefully disconnected.
# use_tcp - Use TCP instaed of UDP to look up resources
2021-10-27 15:23:28 +05:30
# load_balancer - The load balancer instance to use
2023-01-13 00:05:48 +05:30
# rubocop:disable Metrics/ParameterLists
2021-10-27 15:23:28 +05:30
def initialize(
2021-11-11 11:23:49 +05:30
load_balancer,
2021-10-27 15:23:28 +05:30
nameserver:,
port:,
record:,
record_type: 'A',
interval: 60,
disconnect_timeout: 120,
2023-01-13 00:05:48 +05:30
use_tcp: false,
max_replica_pools: nil
2021-10-27 15:23:28 +05:30
)
2021-09-04 01:27:46 +05:30
@nameserver = nameserver
@port = port
@record = record
@record_type = record_type_for(record_type)
@interval = interval
@disconnect_timeout = disconnect_timeout
@use_tcp = use_tcp
2021-10-27 15:23:28 +05:30
@load_balancer = load_balancer
2023-01-13 00:05:48 +05:30
@max_replica_pools = max_replica_pools
2023-03-17 16:20:25 +05:30
@nameserver_ttl = 1.second.ago # Begin with an expired ttl to trigger a nameserver dns lookup
2021-09-04 01:27:46 +05:30
end
2023-01-13 00:05:48 +05:30
# rubocop:enable Metrics/ParameterLists
2021-09-04 01:27:46 +05:30
def start
Thread.new do
loop do
2021-10-27 15:23:28 +05:30
next_sleep_duration = perform_service_discovery
2021-09-04 01:27:46 +05:30
# We slightly randomize the sleep() interval. This should reduce
# the likelihood of _all_ processes refreshing at the same time,
# possibly putting unnecessary pressure on the DNS server.
2021-10-27 15:23:28 +05:30
sleep(next_sleep_duration + rand(MAX_SLEEP_ADJUSTMENT))
2021-09-04 01:27:46 +05:30
end
end
end
2021-10-27 15:23:28 +05:30
def perform_service_discovery
2021-11-11 11:23:49 +05:30
MAX_DISCOVERY_RETRIES.times do
return refresh_if_necessary
rescue StandardError => error
# Any exceptions that might occur should be reported to
# Sentry, instead of silently terminating this thread.
Gitlab::ErrorTracking.track_exception(error)
Gitlab::Database::LoadBalancing::Logger.error(
event: :service_discovery_failure,
message: "Service discovery encountered an error: #{error.message}",
host_list_length: load_balancer.host_list.length
)
# Slightly randomize the retry delay so that, in the case of a total
# dns outage, all starting services do not pressure the dns server at the same time.
sleep(rand(RETRY_DELAY_RANGE))
2023-07-09 08:55:56 +05:30
rescue Exception => error # rubocop:disable Lint/RescueException
# All exceptions are logged to find any pattern and solve https://gitlab.com/gitlab-org/gitlab/-/issues/364370
# This will be removed in https://gitlab.com/gitlab-org/gitlab/-/merge_requests/120173
Gitlab::Database::LoadBalancing::Logger.error(
event: :service_discovery_unexpected_exception,
message: "Service discovery encountered an uncaught error: #{error.message}"
)
raise
2021-11-11 11:23:49 +05:30
end
2021-10-27 15:23:28 +05:30
interval
end
2021-09-04 01:27:46 +05:30
# Refreshes the hosts, but only if the DNS record returned a new list of
# addresses.
#
# The return value is the amount of time (in seconds) to wait before
# checking the DNS record for any changes.
def refresh_if_necessary
interval, from_dns = addresses_from_dns
current = addresses_from_load_balancer
2021-11-11 11:23:49 +05:30
if from_dns != current
::Gitlab::Database::LoadBalancing::Logger.info(
event: :host_list_update,
message: "Updating the host list for service discovery",
host_list_length: from_dns.length,
old_host_list_length: current.length
)
replace_hosts(from_dns)
end
2021-09-04 01:27:46 +05:30
interval
end
# Replaces all the hosts in the load balancer with the new ones,
# disconnecting the old connections.
#
# addresses - An Array of Address structs to use for the new hosts.
def replace_hosts(addresses)
old_hosts = load_balancer.host_list.hosts
load_balancer.host_list.hosts = addresses.map do |addr|
Host.new(addr.hostname, load_balancer, port: addr.port)
end
# We must explicitly disconnect the old connections, otherwise we may
# leak database connections over time. For example, if a request
# started just before we added the new hosts it will use an old
# host/connection. While this connection will be checked in and out,
# it won't be explicitly disconnected.
old_hosts.each do |host|
2021-10-27 15:23:28 +05:30
host.disconnect!(timeout: disconnect_timeout)
2021-09-04 01:27:46 +05:30
end
end
# Returns an Array containing:
#
# 1. The time to wait for the next check.
# 2. An array containing the hostnames of the DNS record.
def addresses_from_dns
response = resolver.search(record, record_type)
resources = response.answer
addresses =
case record_type
when Net::DNS::A
addresses_from_a_record(resources)
when Net::DNS::SRV
addresses_from_srv_record(response)
end
2023-01-13 00:05:48 +05:30
addresses = sampler.sample(addresses)
2021-11-11 11:23:49 +05:30
raise EmptyDnsResponse if addresses.empty?
2021-09-04 01:27:46 +05:30
# Addresses are sorted so we can directly compare the old and new
# addresses, without having to use any additional data structures.
[new_wait_time_for(resources), addresses.sort]
end
def new_wait_time_for(resources)
wait = resources.first&.ttl || interval
# The preconfigured interval acts as a minimum amount of time to
# wait.
wait < interval ? interval : wait
end
def addresses_from_load_balancer
load_balancer.host_list.host_names_and_ports.map do |hostname, port|
Address.new(hostname, port)
end.sort
end
def resolver
2023-03-17 16:20:25 +05:30
return @resolver if defined?(@resolver) && @nameserver_ttl.future?
response = Resolver.new(@nameserver).resolve
@nameserver_ttl = response.ttl
@resolver = Net::DNS::Resolver.new(
nameservers: response.address,
2021-09-04 01:27:46 +05:30
port: @port,
use_tcp: @use_tcp
)
end
private
def record_type_for(type)
RECORD_TYPES.fetch(type) do
raise(ArgumentError, "Unsupported record type: #{type}")
end
end
def addresses_from_srv_record(response)
srv_resolver = SrvResolver.new(resolver, response.additional)
response.answer.map do |r|
address = srv_resolver.address_for(r.host.to_s)
next unless address
Address.new(address.to_s, r.port)
end.compact
end
def addresses_from_a_record(resources)
resources.map { |r| Address.new(r.address.to_s) }
end
2023-01-13 00:05:48 +05:30
def sampler
@sampler ||= ::Gitlab::Database::LoadBalancing::ServiceDiscovery::Sampler
.new(max_replica_pools: @max_replica_pools)
end
2021-09-04 01:27:46 +05:30
end
end
end
end