debian-mirror-gitlab/lib/gitlab/github_import/parallel_scheduling.rb

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

255 lines
8.4 KiB
Ruby
Raw Normal View History

2018-03-17 18:26:18 +05:30
# frozen_string_literal: true
module Gitlab
module GithubImport
module ParallelScheduling
2023-04-23 21:23:45 +05:30
# The import context handed to #initialize, plus the page counter built from it.
attr_reader :project, :client, :page_counter

# Cache keys derived in #initialize from the project ID and collection method.
attr_reader :already_imported_cache_key,
  :job_waiter_cache_key,
  :job_waiter_remaining_cache_key
2018-03-17 18:26:18 +05:30
# The base cache key to use for tracking already imported objects.
# `%{project}` and `%{collection}` are filled in by #initialize.
ALREADY_IMPORTED_CACHE_KEY =
  'github-importer/already-imported/%{project}/%{collection}'

# The base cache key to use for storing the job waiter key.
JOB_WAITER_CACHE_KEY =
  'github-importer/job-waiter/%{project}/%{collection}'

# The base cache key to use for storing the number of remaining job waiter
# jobs.
JOB_WAITER_REMAINING_CACHE_KEY =
  'github-importer/job-waiter-remaining/%{project}/%{collection}'
2018-03-17 18:26:18 +05:30
# project - An instance of `Project`.
# client - An instance of `Gitlab::GithubImport::Client`.
# parallel - When set to true the objects will be imported in parallel.
def initialize(project, client, parallel: true)
  @project = project
  @client = client
  @parallel = parallel
  @page_counter = PageCounter.new(project, collection_method)

  # Every cache key is scoped to the same project/collection pair, so the
  # substitution values are built once and reused.
  key_params = { project: project.id, collection: collection_method }

  @already_imported_cache_key = ALREADY_IMPORTED_CACHE_KEY % key_params
  @job_waiter_cache_key = JOB_WAITER_CACHE_KEY % key_params
  @job_waiter_remaining_cache_key = JOB_WAITER_REMAINING_CACHE_KEY % key_params
end
# Returns the `parallel` flag given to #initialize; #execute uses it to pick
# between #parallel_import and #sequential_import.
def parallel?
  @parallel
end
# Runs the import, either in parallel or sequentially depending on
# #parallel?, and returns whatever the chosen import method returned.
#
# Any StandardError is reported to
# `Gitlab::Import::ImportFailureService.track` and then re-raised.
def execute
  info(project.id, message: "starting importer")

  retval =
    if parallel?
      parallel_import
    else
      sequential_import
    end

  # Once we have completed all work we can remove our "already exists"
  # cache so we don't put too much pressure on Redis.
  #
  # We don't immediately remove it since it's technically possible for
  # other instances of this job to still run, instead we set the
  # expiration time to a lower value. This prevents the other jobs from
  # still scheduling duplicates. Since all work has already been
  # completed those jobs will just cycle through any remaining pages while
  # not scheduling anything.
  Gitlab::Cache::Import::Caching.expire(already_imported_cache_key, Gitlab::Cache::Import::Caching::SHORTER_TIMEOUT)

  info(project.id, message: "importer finished")

  retval
rescue StandardError => e
  Gitlab::Import::ImportFailureService.track(
    project_id: project.id,
    error_source: self.class.name,
    exception: e,
    fail_import: abort_on_failure,
    metrics: true
  )

  raise(e)
end
# Imports all the objects in sequence in the current thread.
#
# Each object yielded by #each_object_to_import is converted with
# #object_representation and handed to a new `importer_class` instance.
def sequential_import
  each_object_to_import do |object|
    repr = object_representation(object)

    importer_class.new(repr, project, client).execute
  end
end
# Imports all objects in parallel by scheduling a Sidekiq job for every
# individual object.
#
# Raises a RuntimeError when #parallel_import_batch is blank; otherwise
# returns the result of #spread_parallel_import.
def parallel_import
  raise 'Batch settings must be defined for parallel import' if parallel_import_batch.blank?

  spread_parallel_import
end
2023-05-27 22:25:52 +05:30
# Schedules one `sidekiq_worker_class` job per object to import, delaying
# each job according to #calculate_job_delay so the jobs are spread out in
# batches.
#
# After every scheduled job the shared "remaining jobs" counter is
# incremented in the cache and mirrored onto the job waiter.
#
# Returns the job waiter.
def spread_parallel_import
  enqueued_job_counter = 0

  each_object_to_import do |object|
    repr = object_representation(object)
    job_delay = calculate_job_delay(enqueued_job_counter)

    sidekiq_worker_class.perform_in(job_delay, project.id, repr.to_hash, job_waiter.key)
    enqueued_job_counter += 1

    job_waiter.jobs_remaining = Gitlab::Cache::Import::Caching.increment(job_waiter_remaining_cache_key)
  end

  job_waiter
end
# The method that will be called for traversing through all the objects to
# import, yielding them to the supplied block.
#
# Objects are converted with `to_h` before being yielded. Objects that are
# already marked as imported are skipped.
def each_object_to_import
  repo = project.import_source

  # We inject the page number here to make sure that all importers always
  # start where they left off. Simply starting over wouldn't work for
  # repositories with a lot of data (e.g. tens of thousands of comments).
  options = collection_options.merge(page: page_counter.current)

  client.each_page(collection_method, repo, options) do |page|
    # Technically it's possible that the same work is performed multiple
    # times, as Sidekiq doesn't guarantee there will ever only be one
    # instance of a job. In such a scenario it's possible for one job to
    # have a lower page number (e.g. 5) compared to another (e.g. 10). In
    # this case we skip over all the objects until we have caught up,
    # reducing the number of duplicate jobs scheduled by the provided
    # block.
    next unless page_counter.set(page.number)

    page.objects.each do |object|
      object = object.to_h

      next if already_imported?(object)

      if increment_object_counter?(object)
        Gitlab::GithubImport::ObjectCounter.increment(project, object_type, :fetched)
      end

      yield object

      # We mark the object as imported immediately so we don't end up
      # scheduling it multiple times.
      mark_as_imported(object)
    end
  end
end
2022-11-25 23:54:43 +05:30
# Hook consulted by #each_object_to_import before bumping the `:fetched`
# object counter for an object. This default counts every object.
def increment_object_counter?(_object)
  true
end
2018-03-17 18:26:18 +05:30
# Returns true if the given object has already been imported, false
# otherwise.
#
# object - The object to check.
def already_imported?(object)
  id = id_for_already_imported_cache(object)

  Gitlab::Cache::Import::Caching.set_includes?(already_imported_cache_key, id)
end
# Marks the given object as "already imported" by adding its cache ID to the
# set stored under `already_imported_cache_key`.
#
# object - The object to mark.
def mark_as_imported(object)
  id = id_for_already_imported_cache(object)

  Gitlab::Cache::Import::Caching.set_add(already_imported_cache_key, id)
end
2021-09-30 23:02:18 +05:30
# The object type passed to `ObjectCounter.increment` when an object is
# fetched. Must be implemented by the including class.
def object_type
  raise NotImplementedError, "#{self.class} must implement ##{__method__}"
end
2018-03-17 18:26:18 +05:30
# Returns the ID to use for the cache used for checking if an object has
# already been imported or not. Must be implemented by the including class.
#
# object - The object we may want to import.
def id_for_already_imported_cache(object)
  raise NotImplementedError, "#{self.class} must implement ##{__method__}"
end
# The class used for converting API responses to Hashes when performing
# the import. Must be implemented by the including class.
def representation_class
  raise NotImplementedError, "#{self.class} must implement ##{__method__}"
end
# The class to use for importing objects when importing them sequentially.
# Must be implemented by the including class.
def importer_class
  raise NotImplementedError, "#{self.class} must implement ##{__method__}"
end
# The Sidekiq worker class used for scheduling the importing of objects in
# parallel. Must be implemented by the including class.
def sidekiq_worker_class
  raise NotImplementedError, "#{self.class} must implement ##{__method__}"
end
# The name of the method to call to retrieve the data to import. Must be
# implemented by the including class.
def collection_method
  raise NotImplementedError, "#{self.class} must implement ##{__method__}"
end
2022-05-07 20:08:51 +05:30
# Default batch settings for parallel import (can be redefined in Importer classes)
#
# `size` is the number of jobs per batch and `delay` is the extra delay
# added for each further batch (see #calculate_job_delay).
def parallel_import_batch
  { size: 1000, delay: 1.minute }
end
2021-10-27 15:23:28 +05:30
# Value passed as `fail_import:` to the failure tracker when #execute rescues
# an error. Defaults to false.
def abort_on_failure
  false
end
2018-03-17 18:26:18 +05:30
# Extra options handed to the client when fetching the collection. The
# current page number is merged in by #each_object_to_import. Empty by
# default.
def collection_options
  {}
end
2021-02-22 17:27:13 +05:30
private
2022-08-27 11:52:29 +05:30
# Extra data passed as the second argument to
# `representation_class.from_api_response`. Empty by default.
def additional_object_data
  {}
end
2022-10-11 01:57:18 +05:30
# Builds the representation of an API object by delegating to
# `representation_class.from_api_response`, passing along
# #additional_object_data.
def object_representation(object)
  builder = representation_class

  builder.from_api_response(object, additional_object_data)
end
2021-02-22 17:27:13 +05:30
# Logs an info-level entry built from #log_attributes.
#
# project_id - The ID of the project being imported.
# extra - Additional attributes to include in the log entry.
def info(project_id, extra = {})
  Logger.info(log_attributes(project_id, extra))
end
# Builds the attributes for a log entry: the caller-supplied `extra` values
# overlaid with the project ID, the importer class name and the parallel
# flag (the latter three win on key conflicts).
def log_attributes(project_id, extra = {})
  standard_attributes = {
    project_id: project_id,
    importer: importer_class.name,
    parallel: parallel?
  }

  extra.merge(standard_attributes)
end
2023-04-23 21:23:45 +05:30
# Returns the memoized JobWaiter for this importer.
#
# The waiter key is read from the cache, or generated and written to the
# cache when none is stored yet. The initial number of remaining jobs is
# read from the cache as well.
def job_waiter
  @job_waiter ||= begin
    key = Gitlab::Cache::Import::Caching.read(job_waiter_cache_key)
    key ||= Gitlab::Cache::Import::Caching.write(job_waiter_cache_key, JobWaiter.generate_key)

    # `to_i` already maps a missing (nil) cache value to 0, so no further
    # fallback is needed.
    jobs_remaining = Gitlab::Cache::Import::Caching.read(job_waiter_remaining_cache_key).to_i

    JobWaiter.new(jobs_remaining, key)
  end
end
# Computes the delay for the job at position `job_index`: one batch delay per
# full batch of jobs scheduled before it, plus one second.
def calculate_job_delay(job_index)
  batch = parallel_import_batch
  completed_batches = job_index / batch[:size]

  completed_batches * batch[:delay] + 1.second
end
2018-03-17 18:26:18 +05:30
end
end
end