debian-mirror-gitlab/app/workers/repository_check/batch_worker.rb

95 lines
2.9 KiB
Ruby
Raw Normal View History

2018-11-08 19:23:39 +05:30
# frozen_string_literal: true
2016-06-02 11:05:42 +05:30
module RepositoryCheck
class BatchWorker
2018-03-17 18:26:18 +05:30
include ApplicationWorker
2018-11-08 19:23:39 +05:30
include RepositoryCheckQueue
include ExclusiveLeaseGuard
2016-11-03 12:29:30 +05:30
2016-06-02 11:05:42 +05:30
# Time budget, in seconds, for one batch run: `perform_repository_checks`
# stops picking up new projects once this much time has elapsed.
RUN_TIME = 3600

# Row limit handed to each of the two ID queries combined in `project_ids`.
BATCH_SIZE = 10_000

# Value returned by `lease_timeout` for the per-shard lease.
LEASE_TIMEOUT = 1.hour

# Shard name stored by `perform`; read by `lease_key` and `projects_on_shard`.
attr_reader :shard_name
# Entry point of the worker: runs a batch of repository checks for one shard.
#
# The shard name is stored so that `lease_key` and `projects_on_shard` can
# read it. Nothing is done when repository checks are disabled in the current
# settings or when the shard is not reported healthy.
#
# @param shard_name name of the repository storage shard whose projects
#   should be checked
# @return the result of `try_obtain_lease`, or nil when a guard returns early
def perform(shard_name)
  @shard_name = shard_name

  return unless Gitlab::CurrentSettings.repository_checks_enabled
  return unless Gitlab::ShardHealthCache.healthy_shard?(shard_name)

  # `try_obtain_lease` is defined outside this class body (presumably by the
  # included ExclusiveLeaseGuard -- confirm); the lease is keyed by `lease_key`.
  try_obtain_lease do
    perform_repository_checks
  end
end
2018-10-15 14:42:47 +05:30
2018-11-08 19:23:39 +05:30
# Lifetime of the per-shard batch lease.
#
# @return the LEASE_TIMEOUT constant
def lease_timeout
  LEASE_TIMEOUT
end
# Key of the batch lease. It embeds the shard name, so each shard gets its
# own lease key.
#
# @return [String] "repository_check_batch_worker:" followed by the shard name
def lease_key
  "repository_check_batch_worker:#{shard_name}"
end
# Checks the projects returned by `project_ids` one after another until the
# RUN_TIME budget is used up or the list is exhausted.
#
# A project is skipped when its per-project lease cannot be obtained.
#
# @return the ID array when the list is exhausted, nil when the time budget
#   ends the loop early
def perform_repository_checks
  # Measure elapsed time on the monotonic clock so that wall-clock
  # adjustments cannot shorten or stretch the run.
  started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)

  # This loop will break after a little more than one hour ('a little
  # more' because `git fsck` may take a few minutes), or if it runs out of
  # projects to check. By default sidekiq-cron will start a new
  # RepositoryCheckWorker each hour so that as long as there are repositories to
  # check, only one (or two) will be checked at a time.
  project_ids.each do |project_id|
    elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at
    break if elapsed >= RUN_TIME

    next unless try_obtain_lease_for_project(project_id)

    SingleRepositoryWorker.new.perform(project_id)
  end
end
2016-11-03 12:29:30 +05:30
2016-06-02 11:05:42 +05:30
private
2016-11-03 12:29:30 +05:30
2016-06-02 11:05:42 +05:30
# Project.find_each does not support WHERE clauses and
# Project.find_in_batches does not support ordering. So we just build an
# array of IDs. This is OK because we do it only once an hour, because
# getting IDs from Postgres is not terribly slow, and because no user
# has to sit and wait for this query to finish.
# IDs of the projects to check in this run: never-checked projects first,
# followed by projects whose last check is old.
#
# Each query is limited to BATCH_SIZE rows on its own, so the combined array
# can hold up to twice that many IDs.
#
# @return [Array] concatenation of the two ID lists
def project_ids
  never_checked_project_ids(BATCH_SIZE) + old_checked_project_ids(BATCH_SIZE)
end
2018-12-05 23:21:45 +05:30
# rubocop: disable CodeReuse/ActiveRecord
2018-10-15 14:42:47 +05:30
# IDs of projects on the current shard that have no recorded repository
# check and were created more than 24 hours ago.
#
# @param batch_size maximum number of IDs to fetch
# @return the plucked `id` values
def never_checked_project_ids(batch_size)
  projects_on_shard
    .where(last_repository_check_at: nil)
    .where('created_at < ?', 24.hours.ago)
    .limit(batch_size)
    .pluck(:id)
end
2018-12-05 23:21:45 +05:30
# rubocop: enable CodeReuse/ActiveRecord
2018-10-15 14:42:47 +05:30
2018-12-05 23:21:45 +05:30
# rubocop: disable CodeReuse/ActiveRecord
2018-10-15 14:42:47 +05:30
# IDs of projects on the current shard whose last repository check is more
# than one month old, least recently checked first.
#
# @param batch_size maximum number of IDs to fetch
# @return the plucked `id` values
def old_checked_project_ids(batch_size)
  projects_on_shard
    .where.not(last_repository_check_at: nil)
    .where('last_repository_check_at < ?', 1.month.ago)
    .reorder(last_repository_check_at: :asc)
    .limit(batch_size)
    .pluck(:id)
end
2018-12-05 23:21:45 +05:30
# rubocop: enable CodeReuse/ActiveRecord
2016-11-03 12:29:30 +05:30
2018-12-05 23:21:45 +05:30
# rubocop: disable CodeReuse/ActiveRecord
2018-11-08 19:23:39 +05:30
# Base scope for both ID queries: projects whose `repository_storage`
# equals the shard this worker was started for.
#
# @return the result of `Project.where` for that condition
def projects_on_shard
  storage_filter = { repository_storage: shard_name }

  Project.where(storage_filter)
end
2018-12-05 23:21:45 +05:30
# rubocop: enable CodeReuse/ActiveRecord
2018-11-08 19:23:39 +05:30
# Tries to take the per-project exclusive lease that guards a repository
# check.
#
# @param id project ID, interpolated into the lease key
# @return the result of `Gitlab::ExclusiveLease#try_obtain`; the caller in
#   `perform_repository_checks` skips the project when it is falsy
def try_obtain_lease_for_project(id)
  # Use a 24-hour timeout because on servers/projects where 'git fsck' is
  # super slow we definitely do not want to run it twice in parallel.
  Gitlab::ExclusiveLease.new(
    "project_repository_check:#{id}",
    timeout: 24.hours
  ).try_obtain
end
end
end