# app/services/ci/register_job_service.rb
# frozen_string_literal: true
2017-08-17 22:00:37 +05:30
module Ci
# This class responsible for assigning
# proper pending build to runner on runner API request
class RegisterJobService
2022-08-27 11:52:29 +05:30
include ::Gitlab::Ci::Artifacts::Logger
2021-04-17 20:07:23 +05:30
attr_reader :runner, :metrics
2017-08-17 22:00:37 +05:30
2021-04-17 20:07:23 +05:30
TEMPORARY_LOCK_TIMEOUT = 3.seconds
2018-10-15 14:42:47 +05:30
2020-07-28 23:09:34 +05:30
Result = Struct.new(:build, :build_json, :valid?)
2017-08-17 22:00:37 +05:30
2021-04-29 21:17:54 +05:30
##
# The queue depth limit number has been determined by observing 95
# percentile of effective queue depth on gitlab.com. This is only likely to
# affect 5% of the worst case scenarios.
MAX_QUEUE_DEPTH = 45
2021-04-17 20:07:23 +05:30
2017-08-17 22:00:37 +05:30
def initialize(runner)
@runner = runner
2021-04-17 20:07:23 +05:30
@metrics = ::Gitlab::Ci::Queue::Metrics.new(runner)
2017-08-17 22:00:37 +05:30
end
2018-11-08 19:23:39 +05:30
def execute(params = {})
2021-11-18 22:05:49 +05:30
db_all_caught_up =
::Ci::Runner.sticking.all_caught_up?(:runner, runner.id)
2021-09-04 01:27:46 +05:30
2021-04-17 20:07:23 +05:30
@metrics.increment_queue_operation(:queue_attempt)
2021-09-04 01:27:46 +05:30
result = @metrics.observe_queue_time(:process, @runner.runner_type) do
2021-04-17 20:07:23 +05:30
process_queue(params)
end
2021-09-04 01:27:46 +05:30
# Since we execute this query against replica it might lead to false-positive
# We might receive the positive response: "hi, we don't have any more builds for you".
# This might not be true. If our DB replica is not up-to date with when runner event was generated
# we might still have some CI builds to be picked. Instead we should say to runner:
# "Hi, we don't have any more builds now, but not everything is right anyway, so try again".
# Runner will retry, but again, against replica, and again will check if replication lag did catch-up.
if !db_all_caught_up && !result.build
metrics.increment_queue_operation(:queue_replication_lag)
::Ci::RegisterJobService::Result.new(nil, false) # rubocop:disable Cop/AvoidReturnFromBlocks
else
result
end
2021-04-17 20:07:23 +05:30
end
private
def process_queue(params)
valid = true
depth = 0
each_build(params) do |build|
depth += 1
@metrics.increment_queue_operation(:queue_iteration)
if depth > max_queue_depth
@metrics.increment_queue_operation(:queue_depth_limit)
valid = false
break
end
# We read builds from replicas
# It is likely that some other concurrent connection is processing
# a given build at a given moment. To avoid an expensive compute
# we perform an exclusive lease on Redis to acquire a build temporarily
unless acquire_temporary_lock(build.id)
@metrics.increment_queue_operation(:build_temporary_locked)
# We failed to acquire lock
# - our queue is not complete as some resources are locked temporarily
# - we need to re-process it again to ensure that all builds are handled
valid = false
next
end
result = process_build(build, params)
next unless result
if result.valid?
@metrics.register_success(result.build)
@metrics.observe_queue_depth(:found, depth)
return result # rubocop:disable Cop/AvoidReturnFromBlocks
else
# The usage of valid: is described in
# handling of ActiveRecord::StaleObjectError
valid = false
end
end
@metrics.increment_queue_operation(:queue_conflict) unless valid
@metrics.observe_queue_depth(:conflict, depth) unless valid
@metrics.observe_queue_depth(:not_found, depth) if valid
@metrics.register_failure
Result.new(nil, nil, valid)
end
# rubocop: disable CodeReuse/ActiveRecord
def each_build(params, &blk)
2021-11-18 22:05:49 +05:30
queue = ::Ci::Queue::BuildQueueService.new(runner)
2021-09-30 23:02:18 +05:30
2021-11-18 22:05:49 +05:30
builds = begin
if runner.instance_type?
queue.builds_for_shared_runner
elsif runner.group_type?
queue.builds_for_group_runner
else
queue.builds_for_project_runner
2021-11-11 11:23:49 +05:30
end
2021-11-18 22:05:49 +05:30
end
2017-08-17 22:00:37 +05:30
2021-11-18 22:05:49 +05:30
if runner.ref_protected?
builds = queue.builds_for_protected_runner(builds)
end
2018-03-17 18:26:18 +05:30
2021-11-18 22:05:49 +05:30
# pick builds that does not have other tags than runner's one
builds = queue.builds_matching_tag_ids(builds, runner.tags.ids)
2018-03-17 18:26:18 +05:30
2021-11-18 22:05:49 +05:30
# pick builds that have at least one tag
unless runner.run_untagged?
builds = queue.builds_with_any_tags(builds)
end
2019-09-04 21:01:54 +05:30
2021-11-18 22:05:49 +05:30
# pick builds that older than specified age
if params.key?(:job_age)
builds = queue.builds_queued_before(builds, params[:job_age].seconds.ago)
end
2019-12-26 22:10:19 +05:30
2021-11-18 22:05:49 +05:30
build_ids = retrieve_queue(-> { queue.execute(builds) })
2021-04-29 21:17:54 +05:30
2021-11-18 22:05:49 +05:30
@metrics.observe_queue_size(-> { build_ids.size }, @runner.runner_type)
build_ids.each { |build_id| yield Ci::Build.find(build_id) }
2017-08-17 22:00:37 +05:30
end
2018-12-05 23:21:45 +05:30
# rubocop: enable CodeReuse/ActiveRecord
2017-08-17 22:00:37 +05:30
2021-04-29 21:17:54 +05:30
def retrieve_queue(queue_query_proc)
2021-09-04 01:27:46 +05:30
##
# We want to reset a load balancing session to discard the side
# effects of writes that could have happened prior to this moment.
#
::Gitlab::Database::LoadBalancing::Session.clear_session
2021-04-29 21:17:54 +05:30
@metrics.observe_queue_time(:retrieve, @runner.runner_type) do
queue_query_proc.call
end
end
2019-12-26 22:10:19 +05:30
def process_build(build, params)
2021-04-17 20:07:23 +05:30
unless build.pending?
@metrics.increment_queue_operation(:build_not_pending)
2022-05-07 20:08:51 +05:30
2022-07-16 23:28:13 +05:30
##
# If this build can not be picked because we had stale data in
# `ci_pending_builds` table, we need to respond with 409 to retry
# this operation.
#
if ::Ci::UpdateBuildQueueService.new.remove!(build)
return Result.new(nil, nil, false)
2022-05-07 20:08:51 +05:30
end
2021-04-17 20:07:23 +05:30
return
end
2022-04-04 11:22:00 +05:30
if runner.matches_build?(build)
2021-04-17 20:07:23 +05:30
@metrics.increment_queue_operation(:build_can_pick)
else
@metrics.increment_queue_operation(:build_not_pick)
return
end
2019-12-26 22:10:19 +05:30
# In case when 2 runners try to assign the same build, second runner will be declined
# with StateMachines::InvalidTransition or StaleObjectError when doing run! or save method.
if assign_runner!(build, params)
2020-07-28 23:09:34 +05:30
present_build!(build)
2019-12-26 22:10:19 +05:30
end
2021-04-17 20:07:23 +05:30
rescue ActiveRecord::StaleObjectError
2019-12-26 22:10:19 +05:30
# We are looping to find another build that is not conflicting
# It also indicates that this build can be picked and passed to runner.
# If we don't do it, basically a bunch of runners would be competing for a build
# and thus we will generate a lot of 409. This will increase
# the number of generated requests, also will reduce significantly
# how many builds can be picked by runner in a unit of time.
# In case we hit the concurrency-access lock,
# we still have to return 409 in the end,
# to make sure that this is properly handled by runner.
2021-04-17 20:07:23 +05:30
@metrics.increment_queue_operation(:build_conflict_lock)
Result.new(nil, nil, false)
rescue StateMachines::InvalidTransition
@metrics.increment_queue_operation(:build_conflict_transition)
2020-07-28 23:09:34 +05:30
Result.new(nil, nil, false)
2021-06-08 01:23:25 +05:30
rescue StandardError => ex
2021-04-17 20:07:23 +05:30
@metrics.increment_queue_operation(:build_conflict_exception)
2020-07-28 23:09:34 +05:30
# If an error (e.g. GRPC::DeadlineExceeded) occurred constructing
# the result, consider this as a failure to be retried.
2019-12-26 22:10:19 +05:30
scheduler_failure!(build)
track_exception_for_build(ex, build)
# skip, and move to next one
nil
end
2021-04-17 20:07:23 +05:30
def max_queue_depth
2021-09-04 01:27:46 +05:30
MAX_QUEUE_DEPTH
2021-04-17 20:07:23 +05:30
end
2020-07-28 23:09:34 +05:30
# Force variables evaluation to occur now
def present_build!(build)
# We need to use the presenter here because Gitaly calls in the presenter
# may fail, and we need to ensure the response has been generated.
presented_build = ::Ci::BuildRunnerPresenter.new(build) # rubocop:disable CodeReuse/Presenter
2022-08-27 11:52:29 +05:30
log_artifacts_context(build)
log_build_dependencies_size(presented_build)
2021-09-30 23:02:18 +05:30
build_json = ::API::Entities::Ci::JobRequest::Response.new(presented_build).to_json
2020-07-28 23:09:34 +05:30
Result.new(build, build_json, true)
end
2022-08-27 11:52:29 +05:30
def log_build_dependencies_size(presented_build)
return unless ::Feature.enabled?(:ci_build_dependencies_artifacts_logger, type: :ops)
presented_build.all_dependencies.then do |dependencies|
size = dependencies.sum do |build|
build.available_artifacts? ? build.artifacts_file.size : 0
end
log_build_dependencies(size: size, count: dependencies.size) if size > 0
end
end
2018-11-18 11:00:15 +05:30
def assign_runner!(build, params)
build.runner_id = runner.id
build.runner_session_attributes = params[:session] if params[:session].present?
2020-10-24 23:57:45 +05:30
failure_reason, _ = pre_assign_runner_checks.find { |_, check| check.call(build, params) }
2018-11-18 11:00:15 +05:30
2020-10-24 23:57:45 +05:30
if failure_reason
2021-04-17 20:07:23 +05:30
@metrics.increment_queue_operation(:runner_pre_assign_checks_failed)
2020-10-24 23:57:45 +05:30
build.drop!(failure_reason)
else
2021-04-17 20:07:23 +05:30
@metrics.increment_queue_operation(:runner_pre_assign_checks_success)
2020-10-24 23:57:45 +05:30
build.run!
2018-12-13 13:39:08 +05:30
end
2020-10-24 23:57:45 +05:30
!failure_reason
2018-11-18 11:00:15 +05:30
end
2021-04-17 20:07:23 +05:30
def acquire_temporary_lock(build_id)
return true unless Feature.enabled?(:ci_register_job_temporary_lock, runner)
key = "build/register/#{build_id}"
Gitlab::ExclusiveLease
.new(key, timeout: TEMPORARY_LOCK_TIMEOUT.to_i)
.try_obtain
end
2019-12-26 22:10:19 +05:30
def scheduler_failure!(build)
2021-04-17 20:07:23 +05:30
Gitlab::OptimisticLocking.retry_lock(build, 3, name: 'register_job_scheduler_failure') do |subject|
2019-12-26 22:10:19 +05:30
subject.drop!(:scheduler_failure)
end
2021-06-08 01:23:25 +05:30
rescue StandardError => ex
2019-12-26 22:10:19 +05:30
build.doom!
# This requires extra exception, otherwise we would loose information
# why we cannot perform `scheduler_failure`
track_exception_for_build(ex, build)
end
def track_exception_for_build(ex, build)
2020-01-01 13:55:28 +05:30
Gitlab::ErrorTracking.track_exception(ex,
2019-12-26 22:10:19 +05:30
build_id: build.id,
build_name: build.name,
build_stage: build.stage,
pipeline_id: build.pipeline_id,
project_id: build.project_id
2020-01-01 13:55:28 +05:30
)
2019-12-26 22:10:19 +05:30
end
2020-10-24 23:57:45 +05:30
def pre_assign_runner_checks
{
missing_dependency_failure: -> (build, _) { !build.has_valid_build_dependencies? },
runner_unsupported: -> (build, params) { !build.supported_runner?(params.dig(:info, :features)) },
2022-01-26 12:08:38 +05:30
archived_failure: -> (build, _) { build.archived? },
2021-11-11 11:23:49 +05:30
project_deleted: -> (build, _) { build.project.pending_delete? },
2022-06-21 17:19:12 +05:30
builds_disabled: -> (build, _) { !build.project.builds_enabled? },
user_blocked: -> (build, _) { build.user&.blocked? }
2020-10-24 23:57:45 +05:30
}
end
2017-08-17 22:00:37 +05:30
end
end
2019-12-04 20:38:33 +05:30
2021-06-08 01:23:25 +05:30
Ci::RegisterJobService.prepend_mod_with('Ci::RegisterJobService')