# frozen_string_literal: true module Gitlab module Database module Migrations # BatchedBackgroundMigrations are a new approach to scheduling and executing background migrations, which uses # persistent state in the database to track each migration. This avoids having to batch over an entire table and # schedule a large number of sidekiq jobs upfront. It also provides for more flexibility as the migration runs, # as it can be paused and restarted, and have configuration values like the batch size updated dynamically as the # migration runs. # # For now, these migrations are not considered ready for general use, for more information see the tracking epic: # https://gitlab.com/groups/gitlab-org/-/epics/6751 module BatchedBackgroundMigrationHelpers BATCH_SIZE = 1_000 # Number of rows to process per job SUB_BATCH_SIZE = 100 # Number of rows to process per sub-batch BATCH_CLASS_NAME = 'PrimaryKeyBatchingStrategy' # Default batch class for batched migrations BATCH_MIN_VALUE = 1 # Default minimum value for batched migrations BATCH_MIN_DELAY = 2.minutes.freeze # Minimum delay between batched migrations # Creates a batched background migration for the given table. A batched migration runs one job # at a time, computing the bounds of the next batch based on the current migration settings and the previous # batch bounds. Each job's execution status is tracked in the database as the migration runs. The given job # class must be present in the Gitlab::BackgroundMigration module, and the batch class (if specified) must be # present in the Gitlab::BackgroundMigration::BatchingStrategies module. # # If migration with same job_class_name, table_name, column_name, and job_arguments already exists, this helper # will log an warning and not create a new one. # # job_class_name - The background migration job class as a string # batch_table_name - The name of the table the migration will batch over # batch_column_name - The name of the column the migration will batch over # job_arguments - Extra arguments to pass to the job instance when the migration runs # job_interval - The pause interval between each job's execution, minimum of 2 minutes # batch_min_value - The value in the column the batching will begin at # batch_max_value - The value in the column the batching will end at, defaults to `SELECT MAX(batch_column)` # batch_class_name - The name of the class that will be called to find the range of each next batch # batch_size - The maximum number of rows per job # sub_batch_size - The maximum number of rows processed per "iteration" within the job # # *Returns the created BatchedMigration record* # # Example: # # queue_batched_background_migration( # 'CopyColumnUsingBackgroundMigrationJob', # :events, # :id, # job_interval: 2.minutes, # other_job_arguments: ['column1', 'column2']) # # Where the the background migration exists: # # class Gitlab::BackgroundMigration::CopyColumnUsingBackgroundMigrationJob # def perform(start_id, end_id, batch_table, batch_column, sub_batch_size, *other_args) # # do something # end # end def queue_batched_background_migration( # rubocop:disable Metrics/ParameterLists job_class_name, batch_table_name, batch_column_name, *job_arguments, job_interval:, batch_min_value: BATCH_MIN_VALUE, batch_max_value: nil, batch_class_name: BATCH_CLASS_NAME, batch_size: BATCH_SIZE, max_batch_size: nil, sub_batch_size: SUB_BATCH_SIZE, gitlab_schema: nil ) Gitlab::Database::QueryAnalyzers::RestrictAllowedSchemas.require_dml_mode! gitlab_schema ||= gitlab_schema_from_context Gitlab::Database::BackgroundMigration::BatchedMigration.reset_column_information if Gitlab::Database::BackgroundMigration::BatchedMigration.for_configuration(gitlab_schema, job_class_name, batch_table_name, batch_column_name, job_arguments).exists? Gitlab::AppLogger.warn "Batched background migration not enqueued because it already exists: " \ "job_class_name: #{job_class_name}, table_name: #{batch_table_name}, column_name: #{batch_column_name}, " \ "job_arguments: #{job_arguments.inspect}" return end job_interval = BATCH_MIN_DELAY if job_interval < BATCH_MIN_DELAY batch_max_value ||= connection.select_value(<<~SQL) SELECT MAX(#{connection.quote_column_name(batch_column_name)}) FROM #{connection.quote_table_name(batch_table_name)} SQL status_event = batch_max_value.nil? ? :finish : :execute batch_max_value ||= batch_min_value migration = Gitlab::Database::BackgroundMigration::BatchedMigration.new( job_class_name: job_class_name, table_name: batch_table_name, column_name: batch_column_name, job_arguments: job_arguments, interval: job_interval, min_value: batch_min_value, max_value: batch_max_value, batch_class_name: batch_class_name, batch_size: batch_size, sub_batch_size: sub_batch_size, status_event: status_event ) if migration.job_class.respond_to?(:job_arguments_count) && migration.job_class.job_arguments_count != job_arguments.count raise "Wrong number of job arguments for #{migration.job_class_name} " \ "(given #{job_arguments.count}, expected #{migration.job_class.job_arguments_count})" end # Below `BatchedMigration` attributes were introduced after the # initial `batched_background_migrations` table was created, so any # migrations that ran relying on initial table schema would not know # about columns introduced later on because this model is not # isolated in migrations, which is why we need to check for existence # of these columns first. if migration.respond_to?(:max_batch_size) migration.max_batch_size = max_batch_size end if migration.respond_to?(:total_tuple_count) # We keep track of the estimated number of tuples to reason later # about the overall progress of a migration. migration.total_tuple_count = Gitlab::Database::SharedModel.using_connection(connection) do Gitlab::Database::PgClass.for_table(batch_table_name)&.cardinality_estimate end end if migration.respond_to?(:gitlab_schema) migration.gitlab_schema = gitlab_schema end migration.save! migration end def finalize_batched_background_migration(job_class_name:, table_name:, column_name:, job_arguments:) Gitlab::Database::QueryAnalyzers::RestrictAllowedSchemas.require_dml_mode! if transaction_open? raise 'The `finalize_batched_background_migration` cannot be run inside a transaction. ' \ 'You can disable transactions by calling `disable_ddl_transaction!` in the body of ' \ 'your migration class.' end Gitlab::Database::BackgroundMigration::BatchedMigration.reset_column_information migration = Gitlab::Database::BackgroundMigration::BatchedMigration.find_for_configuration( gitlab_schema_from_context, job_class_name, table_name, column_name, job_arguments) raise 'Could not find batched background migration' if migration.nil? with_restored_connection_stack do |restored_connection| Gitlab::Database::QueryAnalyzers::RestrictAllowedSchemas.with_suppressed do Gitlab::Database::BackgroundMigration::BatchedMigrationRunner.finalize( job_class_name, table_name, column_name, job_arguments, connection: restored_connection) end end end # Deletes batched background migration for the given configuration. # # job_class_name - The background migration job class as a string # table_name - The name of the table the migration iterates over # column_name - The name of the column the migration will batch over # job_arguments - Migration arguments # # Example: # # delete_batched_background_migration( # 'CopyColumnUsingBackgroundMigrationJob', # :events, # :id, # ['column1', 'column2']) def delete_batched_background_migration(job_class_name, table_name, column_name, job_arguments) Gitlab::Database::QueryAnalyzers::RestrictAllowedSchemas.require_dml_mode! Gitlab::Database::BackgroundMigration::BatchedMigration.reset_column_information Gitlab::Database::BackgroundMigration::BatchedMigration .for_configuration( gitlab_schema_from_context, job_class_name, table_name, column_name, job_arguments ).delete_all end def gitlab_schema_from_context if respond_to?(:allowed_gitlab_schemas) # Gitlab::Database::Migration::V2_0 Array(allowed_gitlab_schemas).first else # Gitlab::Database::Migration::V1_0 :gitlab_main end end def ensure_batched_background_migration_is_finished(job_class_name:, table_name:, column_name:, job_arguments:, finalize: true) Gitlab::Database::QueryAnalyzers::RestrictAllowedSchemas.require_dml_mode! Gitlab::Database::BackgroundMigration::BatchedMigration.reset_column_information migration = Gitlab::Database::BackgroundMigration::BatchedMigration.find_for_configuration( Gitlab::Database.gitlab_schemas_for_connection(connection), job_class_name, table_name, column_name, job_arguments ) configuration = { job_class_name: job_class_name, table_name: table_name, column_name: column_name, job_arguments: job_arguments } return Gitlab::AppLogger.warn "Could not find batched background migration for the given configuration: #{configuration}" if migration.nil? return if migration.finished? finalize_batched_background_migration(job_class_name: job_class_name, table_name: table_name, column_name: column_name, job_arguments: job_arguments) if finalize return if migration.reload.finished? # rubocop:disable Cop/ActiveRecordAssociationReload raise "Expected batched background migration for the given configuration to be marked as 'finished', " \ "but it is '#{migration.status_name}':" \ "\t#{configuration}" \ "\n\n" \ "Finalize it manually by running the following command in a `bash` or `sh` shell:" \ "\n\n" \ "\tsudo gitlab-rake gitlab:background_migrations:finalize[#{job_class_name},#{table_name},#{column_name},'#{job_arguments.to_json.gsub(',', '\,')}']" \ "\n\n" \ "For more information, check the documentation" \ "\n\n" \ "\thttps://docs.gitlab.com/ee/user/admin_area/monitoring/background_migrations.html#database-migrations-failing-because-of-batched-background-migration-not-finished" end end end end end