2018-11-18 11:00:15 +05:30
|
|
|
# frozen_string_literal: true
|
|
|
|
|
2017-08-17 22:00:37 +05:30
|
|
|
module Ci
|
|
|
|
# This class is responsible for assigning a
|
|
|
|
# proper pending build to a runner on the runner's API request
|
|
|
|
class RegisterJobService
|
2021-04-17 20:07:23 +05:30
|
|
|
# Readers for the runner being served and the queue metrics collector;
# both are assigned in #initialize.
attr_reader :runner, :metrics
|
2017-08-17 22:00:37 +05:30
|
|
|
|
2021-04-17 20:07:23 +05:30
|
|
|
# Timeout of the per-build exclusive lease taken in #acquire_temporary_lock
# (passed to the lease as an integer via `.to_i`).
TEMPORARY_LOCK_TIMEOUT = 3.seconds
|
2018-10-15 14:42:47 +05:30
|
|
|
|
2020-07-28 23:09:34 +05:30
|
|
|
# Outcome of a queue pass: the assigned build (or nil), its serialized JSON
# payload (or nil), and a validity flag that is false whenever the pass hit a
# conflict, a temporarily locked build or the queue depth limit.
Result = Struct.new(:build, :build_json, :valid?)
|
2017-08-17 22:00:37 +05:30
|
|
|
|
2021-04-17 20:07:23 +05:30
|
|
|
# Maximum number of queued builds inspected per request when the
# `gitlab_ci_builds_queue_limit` feature flag is enabled (see #max_queue_depth).
MAX_QUEUE_DEPTH = 50
|
|
|
|
|
2017-08-17 22:00:37 +05:30
|
|
|
# Builds a service bound to a single runner.
#
# runner - the runner requesting a job; it is also handed to the queue
#          metrics collector created here.
def initialize(runner)
  @runner = runner
  @metrics = ::Gitlab::Ci::Queue::Metrics.new(runner)
end
|
|
|
|
|
2018-11-08 19:23:39 +05:30
|
|
|
# Entry point: records a queue attempt and walks the queue looking for a
# build to hand to the runner.
#
# params - Hash of request options forwarded to #process_queue (the rest of
#          this class reads :job_age, :session and :info from it).
#
# Returns the value of the metrics timing block -- presumably the Result
# built by #process_queue (depends on Metrics#observe_queue_time).
def execute(params = {})
  @metrics.increment_queue_operation(:queue_attempt)
  @metrics.observe_queue_time { process_queue(params) }
end
|
|
|
|
|
|
|
|
private
|
|
|
|
|
|
|
|
# Walks the candidate builds one by one until one is successfully assigned.
#
# params - Hash of request options, passed through to #each_build and
#          #process_build.
#
# Returns the Result of the first successfully assigned build. When nothing
# could be assigned it returns an empty Result whose validity flag is false
# if any candidate was skipped because of a lock, a conflict or the queue
# depth limit.
def process_queue(params)
  queue_valid = true
  position = 0

  each_build(params) do |candidate|
    position += 1
    @metrics.increment_queue_operation(:queue_iteration)

    # Stop scanning once the configured depth is exceeded; the queue was not
    # fully inspected, so the pass is reported as invalid.
    if position > max_queue_depth
      @metrics.increment_queue_operation(:queue_depth_limit)
      queue_valid = false
      break
    end

    # We read builds from replicas.
    # It is likely that some other concurrent connection is processing
    # a given build at a given moment. To avoid an expensive compute
    # we take an exclusive lease on Redis to acquire a build temporarily.
    unless acquire_temporary_lock(candidate.id)
      @metrics.increment_queue_operation(:build_temporary_locked)

      # We failed to acquire the lock:
      # - our queue is not complete as some resources are locked temporarily
      # - we need to re-process it again to ensure that all builds are handled
      queue_valid = false
      next
    end

    outcome = process_build(candidate, params)
    next unless outcome

    # The usage of valid: is described in
    # handling of ActiveRecord::StaleObjectError
    unless outcome.valid?
      queue_valid = false
      next
    end

    @metrics.register_success(outcome.build)
    @metrics.observe_queue_depth(:found, position)

    return outcome # rubocop:disable Cop/AvoidReturnFromBlocks
  end

  if queue_valid
    @metrics.observe_queue_depth(:not_found, position)
  else
    @metrics.increment_queue_operation(:queue_conflict)
    @metrics.observe_queue_depth(:conflict, position)
  end

  @metrics.register_failure

  Result.new(nil, nil, queue_valid)
end
|
|
|
|
|
|
|
|
# rubocop: disable CodeReuse/ActiveRecord
|
|
|
|
# Yields every build the runner may be offered, in queue order.
#
# params - Hash of request options; only :job_age is read here (a number of
#          seconds, used to keep builds queued before that long ago).
# blk    - block receiving each candidate build.
#
# The base scope depends on the runner type (instance, group or project).
# With the `ci_register_job_service_one_by_one` flag enabled only ids are
# loaded up front and each build is fetched individually right before it is
# yielded; otherwise the whole relation is iterated directly.
def each_build(params, &blk)
  candidates =
    if runner.instance_type?
      builds_for_shared_runner
    elsif runner.group_type?
      builds_for_group_runner
    else
      builds_for_project_runner
    end

  # pick builds that do not have any tags other than the runner's ones
  candidates = candidates.matches_tag_ids(runner.tags.ids)

  # pick builds that have at least one tag
  candidates = candidates.with_any_tags unless runner.run_untagged?

  # pick builds that are older than the specified age
  candidates = candidates.queued_before(params[:job_age].seconds.ago) if params.key?(:job_age)

  unless Feature.enabled?(:ci_register_job_service_one_by_one, runner)
    @metrics.observe_queue_size(-> { candidates.to_a.size })

    return candidates.each(&blk)
  end

  ids = candidates.pluck(:id)

  @metrics.observe_queue_size(-> { ids.size })

  ids.each { |id| yield Ci::Build.find(id) }
end
|
2018-12-05 23:21:45 +05:30
|
|
|
# rubocop: enable CodeReuse/ActiveRecord
|
2017-08-17 22:00:37 +05:30
|
|
|
|
2019-12-26 22:10:19 +05:30
|
|
|
# Attempts to assign a single build to the runner.
#
# build  - the candidate build.
# params - Hash of request options, forwarded to #assign_runner!.
#
# Returns:
# - a valid Result when the build was assigned and presented,
# - an invalid Result (valid? == false) on a stale-object or state-transition
#   conflict,
# - nil when the build should simply be skipped (not pending, not pickable,
#   dropped by a pre-assign check, or failed with an unexpected error).
def process_build(build, params)
  unless build.pending?
    @metrics.increment_queue_operation(:build_not_pending)
    return
  end

  unless runner.can_pick?(build)
    @metrics.increment_queue_operation(:build_not_pick)
    return
  end

  @metrics.increment_queue_operation(:build_can_pick)

  # In case when 2 runners try to assign the same build, second runner will be declined
  # with StateMachines::InvalidTransition or StaleObjectError when doing run! or save method.
  present_build!(build) if assign_runner!(build, params)
rescue ActiveRecord::StaleObjectError
  # We are looping to find another build that is not conflicting
  # It also indicates that this build can be picked and passed to runner.
  # If we don't do it, basically a bunch of runners would be competing for a build
  # and thus we will generate a lot of 409. This will increase
  # the number of generated requests, also will reduce significantly
  # how many builds can be picked by runner in a unit of time.
  # In case we hit the concurrency-access lock,
  # we still have to return 409 in the end,
  # to make sure that this is properly handled by runner.
  @metrics.increment_queue_operation(:build_conflict_lock)

  Result.new(nil, nil, false)
rescue StateMachines::InvalidTransition
  @metrics.increment_queue_operation(:build_conflict_transition)

  Result.new(nil, nil, false)
rescue => error
  @metrics.increment_queue_operation(:build_conflict_exception)

  # If an error (e.g. GRPC::DeadlineExceeded) occurred constructing
  # the result, consider this as a failure to be retried.
  scheduler_failure!(build)
  track_exception_for_build(error, build)

  # skip, and move to next one
  nil
end
|
|
|
|
|
2021-04-17 20:07:23 +05:30
|
|
|
# Maximum number of queue entries #process_queue inspects in one pass.
#
# Returns MAX_QUEUE_DEPTH when the `gitlab_ci_builds_queue_limit` feature
# flag is enabled for the runner, otherwise ::Gitlab::Database::MAX_INT_VALUE.
# The value is memoized for the lifetime of the service instance.
def max_queue_depth
  return @max_queue_depth if @max_queue_depth

  limited = Feature.enabled?(:gitlab_ci_builds_queue_limit, runner, default_enabled: false)

  @max_queue_depth = limited ? MAX_QUEUE_DEPTH : ::Gitlab::Database::MAX_INT_VALUE
end
|
|
|
|
|
2020-07-28 23:09:34 +05:30
|
|
|
# Force variables evaluation to occur now: the build is serialized to JSON
# immediately, so that any failure while generating the response surfaces
# here rather than later.
#
# build - the build that has just been assigned to the runner.
#
# Returns a valid Result carrying the build and its JSON payload.
def present_build!(build)
  # We need to use the presenter here because Gitaly calls in the presenter
  # may fail, and we need to ensure the response has been generated.
  presenter = ::Ci::BuildRunnerPresenter.new(build) # rubocop:disable CodeReuse/Presenter
  payload = ::API::Entities::JobRequest::Response.new(presenter).to_json

  Result.new(build, payload, true)
end
|
|
|
|
|
2018-11-18 11:00:15 +05:30
|
|
|
# Binds the build to the runner and either starts it or drops it.
#
# build  - the build being assigned; its runner_id (and session attributes,
#          when params[:session] is present) are set before the checks run.
# params - Hash of request options; passed to every pre-assign check.
#
# The first failing entry of #pre_assign_runner_checks decides the failure
# reason the build is dropped with; when none fails the build is run.
#
# Returns true when the build was started, false when it was dropped.
def assign_runner!(build, params)
  build.runner_id = runner.id

  session = params[:session]
  build.runner_session_attributes = session if session.present?

  failed_check = pre_assign_runner_checks.find { |_reason, check| check.call(build, params) }
  failure_reason = failed_check&.first

  if failure_reason
    @metrics.increment_queue_operation(:runner_pre_assign_checks_failed)

    build.drop!(failure_reason)

    return false
  end

  @metrics.increment_queue_operation(:runner_pre_assign_checks_success)

  build.run!

  true
end
|
|
|
|
|
2021-04-17 20:07:23 +05:30
|
|
|
# Tries to take a short-lived exclusive lease for the given build.
#
# build_id - id of the build; used to build the lease key.
#
# Returns true when the `ci_register_job_temporary_lock` feature flag is
# disabled for the runner; otherwise returns whatever
# Gitlab::ExclusiveLease#try_obtain returns (the caller treats a falsy value
# as "build is locked by someone else").
def acquire_temporary_lock(build_id)
  if Feature.enabled?(:ci_register_job_temporary_lock, runner)
    lease = Gitlab::ExclusiveLease.new("build/register/#{build_id}", timeout: TEMPORARY_LOCK_TIMEOUT.to_i)
    lease.try_obtain
  else
    true
  end
end
|
|
|
|
|
2019-12-26 22:10:19 +05:30
|
|
|
# Drops the build with the :scheduler_failure reason, going through
# Gitlab::OptimisticLocking.retry_lock (called with 3 -- presumably the
# number of retries -- and a lock name).
#
# build - the build that could not be handed to the runner.
#
# If dropping itself raises, the build is doomed instead and the error is
# reported to error tracking.
def scheduler_failure!(build)
  Gitlab::OptimisticLocking.retry_lock(build, 3, name: 'register_job_scheduler_failure') do |locked_build|
    locked_build.drop!(:scheduler_failure)
  end
rescue => error
  build.doom!

  # This requires an extra exception report, otherwise we would lose the
  # information about why we cannot perform `scheduler_failure`
  track_exception_for_build(error, build)
end
|
|
|
|
|
|
|
|
# Reports an exception to Gitlab::ErrorTracking together with identifying
# details of the build it happened for.
#
# ex    - the exception to report.
# build - the build; its id, name, stage, pipeline_id and project_id are
#         attached as context.
def track_exception_for_build(ex, build)
  context = {
    build_id: build.id,
    build_name: build.name,
    build_stage: build.stage,
    pipeline_id: build.pipeline_id,
    project_id: build.project_id
  }

  Gitlab::ErrorTracking.track_exception(ex, **context)
end
|
|
|
|
|
2018-12-05 23:21:45 +05:30
|
|
|
# rubocop: disable CodeReuse/ActiveRecord
|
2017-08-17 22:00:37 +05:30
|
|
|
# Candidate builds for an instance (shared) runner.
#
# Returns the #new_builds relation restricted to projects that allow shared
# runners and builds, ordered so that projects with fewer builds currently
# running on shared runners come first (ties broken by build id).
def builds_for_shared_runner
  # don't run projects which have not enabled shared runners and builds
  eligible = new_builds
    .joins(:project)
    .where(projects: { shared_runners_enabled: true, pending_delete: false })
    .joins('LEFT JOIN project_features ON ci_builds.project_id = project_features.project_id')
    .where('project_features.builds_access_level IS NULL or project_features.builds_access_level > 0')

  # Implement fair scheduling
  # this returns builds that are ordered by number of running builds
  # we prefer projects that don't use shared runners at all
  running_sql = running_builds_for_shared_runners.to_sql

  eligible
    .joins("LEFT JOIN (#{running_sql}) AS project_builds ON ci_builds.project_id=project_builds.project_id")
    .order(Arel.sql('COALESCE(project_builds.running_builds, 0) ASC'), 'ci_builds.id ASC')
end
|
2018-12-05 23:21:45 +05:30
|
|
|
# rubocop: enable CodeReuse/ActiveRecord
|
2017-08-17 22:00:37 +05:30
|
|
|
|
2018-12-05 23:21:45 +05:30
|
|
|
# rubocop: disable CodeReuse/ActiveRecord
|
2018-10-15 14:42:47 +05:30
|
|
|
# Candidate builds for a project runner.
#
# Returns the #new_builds relation limited to the runner's projects that are
# not deleted and have builds enabled, ordered by ascending id.
def builds_for_project_runner
  pending = new_builds
  runner_projects = runner.projects.without_deleted.with_builds_enabled

  pending.where(project: runner_projects).order('id ASC')
end
|
2018-12-05 23:21:45 +05:30
|
|
|
# rubocop: enable CodeReuse/ActiveRecord
|
2018-10-15 14:42:47 +05:30
|
|
|
|
2018-12-05 23:21:45 +05:30
|
|
|
# rubocop: disable CodeReuse/ActiveRecord
|
2018-10-15 14:42:47 +05:30
|
|
|
# Candidate builds for a group runner.
#
# Returns the #new_builds relation limited to projects that live in the
# runner's groups or any of their descendants, have group runners and builds
# enabled and are not deleted, ordered by ascending id.
def builds_for_group_runner
  # Workaround for weird Rails bug, that makes `runner.groups.to_sql` to return `runner_id = NULL`
  runner_groups = ::Group.joins(:runner_namespaces).merge(runner.runner_namespaces)

  namespaces = Gitlab::ObjectHierarchy.new(runner_groups).base_and_descendants

  eligible_projects = Project
    .where(namespace_id: namespaces)
    .with_group_runners_enabled
    .with_builds_enabled
    .without_deleted

  new_builds.where(project: eligible_projects).order('id ASC')
end
|
2018-12-05 23:21:45 +05:30
|
|
|
# rubocop: enable CodeReuse/ActiveRecord
|
2017-08-17 22:00:37 +05:30
|
|
|
|
2018-12-05 23:21:45 +05:30
|
|
|
# rubocop: disable CodeReuse/ActiveRecord
|
2017-08-17 22:00:37 +05:30
|
|
|
# Per-project count of builds currently running on instance-type runners.
#
# Returns a relation selecting `project_id` and `count(*) AS running_builds`,
# grouped by project; #builds_for_shared_runner embeds it as a subquery.
def running_builds_for_shared_runners
  running_on_shared = Ci::Build.running.where(runner: Ci::Runner.instance_type)

  running_on_shared
    .group(:project_id)
    .select(:project_id, 'count(*) AS running_builds')
end
|
2018-12-05 23:21:45 +05:30
|
|
|
# rubocop: enable CodeReuse/ActiveRecord
|
2017-08-17 22:00:37 +05:30
|
|
|
|
|
|
|
# Base scope shared by all runner types: pending, unstarted builds.
#
# Returns the relation, additionally narrowed to protected refs when the
# runner itself is ref-protected.
def new_builds
  pending = Ci::Build.pending.unstarted

  runner.ref_protected? ? pending.ref_protected : pending
end
|
|
|
|
|
2020-10-24 23:57:45 +05:30
|
|
|
# Checks evaluated, in insertion order, before a build is started.
#
# Returns a Hash mapping a failure reason to a lambda taking (build, params);
# a lambda returning true means the build must be dropped with that reason
# (see #assign_runner!).
def pre_assign_runner_checks
  dependencies_missing = ->(build, _params) { !build.has_valid_build_dependencies? }
  features_unsupported = ->(build, params) { !build.supported_runner?(params.dig(:info, :features)) }
  already_archived = ->(build, _params) { build.archived? }

  {
    missing_dependency_failure: dependencies_missing,
    runner_unsupported: features_unsupported,
    archived_failure: already_archived
  }
end
|
2017-08-17 22:00:37 +05:30
|
|
|
end
|
|
|
|
end
|
2019-12-04 20:38:33 +05:30
|
|
|
|
|
|
|
# Hooks the 'EE::Ci::RegisterJobService' extension into this service via
# `prepend_if_ee` (presumably a no-op outside Enterprise Edition -- confirm
# against the helper's definition).
Ci::RegisterJobService.prepend_if_ee('EE::Ci::RegisterJobService')
|