182 lines
5.7 KiB
Ruby
182 lines
5.7 KiB
Ruby
# frozen_string_literal: true
|
|
|
|
module Gitlab
|
|
module BackgroundMigration
|
|
# This class finds all non-hashed uploaded file paths and saves them to a
|
|
# `untracked_files_for_uploads` table.
|
|
class PrepareUntrackedUploads # rubocop:disable Metrics/ClassLength
|
|
# For bulk_queue_background_migration_jobs_by_range
|
|
include Database::MigrationHelpers
|
|
include ::Gitlab::Utils::StrongMemoize
|
|
|
|
FIND_BATCH_SIZE = 500
|
|
RELATIVE_UPLOAD_DIR = "uploads".freeze
|
|
ABSOLUTE_UPLOAD_DIR = File.join(
|
|
Gitlab.config.uploads.storage_path,
|
|
RELATIVE_UPLOAD_DIR
|
|
)
|
|
FOLLOW_UP_MIGRATION = 'PopulateUntrackedUploads'.freeze
|
|
START_WITH_ROOT_REGEX = %r{\A#{Gitlab.config.uploads.storage_path}/}.freeze
|
|
EXCLUDED_HASHED_UPLOADS_PATH = "#{ABSOLUTE_UPLOAD_DIR}/@hashed/*".freeze
|
|
EXCLUDED_TMP_UPLOADS_PATH = "#{ABSOLUTE_UPLOAD_DIR}/tmp/*".freeze
|
|
|
|
# This class is used to iterate over batches of
|
|
# `untracked_files_for_uploads` rows.
|
|
class UntrackedFile < ActiveRecord::Base
|
|
include EachBatch
|
|
|
|
self.table_name = 'untracked_files_for_uploads'
|
|
end
|
|
|
|
def perform
|
|
ensure_temporary_tracking_table_exists
|
|
|
|
# Since Postgres < 9.5 does not have ON CONFLICT DO NOTHING, and since
|
|
# doing inserts-if-not-exists without ON CONFLICT DO NOTHING would be
|
|
# slow, start with an empty table for Postgres < 9.5.
|
|
# That way we can do bulk inserts at ~30x the speed of individual
|
|
# inserts (~20 minutes worth of inserts at GitLab.com scale instead of
|
|
# ~10 hours).
|
|
# In all other cases, installations will get both bulk inserts and the
|
|
# ability for these jobs to retry without having to clear and reinsert.
|
|
clear_untracked_file_paths unless can_bulk_insert_and_ignore_duplicates?
|
|
|
|
store_untracked_file_paths
|
|
|
|
if UntrackedFile.all.empty?
|
|
drop_temp_table
|
|
else
|
|
schedule_populate_untracked_uploads_jobs
|
|
end
|
|
end
|
|
|
|
private
|
|
|
|
def ensure_temporary_tracking_table_exists
|
|
table_name = :untracked_files_for_uploads
|
|
|
|
unless ActiveRecord::Base.connection.data_source_exists?(table_name)
|
|
UntrackedFile.connection.create_table table_name do |t|
|
|
t.string :path, limit: 600, null: false
|
|
t.index :path, unique: true
|
|
end
|
|
end
|
|
end
|
|
|
|
def clear_untracked_file_paths
|
|
UntrackedFile.delete_all
|
|
end
|
|
|
|
def store_untracked_file_paths
|
|
return unless Dir.exist?(ABSOLUTE_UPLOAD_DIR)
|
|
|
|
each_file_batch(ABSOLUTE_UPLOAD_DIR, FIND_BATCH_SIZE) do |file_paths|
|
|
insert_file_paths(file_paths)
|
|
end
|
|
end
|
|
|
|
def each_file_batch(search_dir, batch_size, &block)
|
|
cmd = build_find_command(search_dir)
|
|
|
|
Open3.popen2(*cmd) do |stdin, stdout, status_thread|
|
|
yield_paths_in_batches(stdout, batch_size, &block)
|
|
|
|
raise "Find command failed" unless status_thread.value.success?
|
|
end
|
|
end
|
|
|
|
def yield_paths_in_batches(stdout, batch_size, &block)
|
|
paths = []
|
|
|
|
stdout.each_line("\0") do |line|
|
|
paths << line.chomp("\0").sub(START_WITH_ROOT_REGEX, '')
|
|
|
|
if paths.size >= batch_size
|
|
yield(paths)
|
|
paths = []
|
|
end
|
|
end
|
|
|
|
yield(paths) if paths.any?
|
|
end
|
|
|
|
def build_find_command(search_dir)
|
|
cmd = %W[find -L #{search_dir}
|
|
-type f
|
|
! ( -path #{EXCLUDED_HASHED_UPLOADS_PATH} -prune )
|
|
! ( -path #{EXCLUDED_TMP_UPLOADS_PATH} -prune )
|
|
-print0]
|
|
|
|
ionice = which_ionice
|
|
cmd = %W[#{ionice} -c Idle] + cmd if ionice
|
|
|
|
log_msg = "PrepareUntrackedUploads find command: \"#{cmd.join(' ')}\""
|
|
Rails.logger.info log_msg
|
|
|
|
cmd
|
|
end
|
|
|
|
def which_ionice
|
|
Gitlab::Utils.which('ionice')
|
|
rescue StandardError
|
|
# In this case, returning false is relatively safe,
|
|
# even though it isn't very nice
|
|
false
|
|
end
|
|
|
|
def insert_file_paths(file_paths)
|
|
sql = insert_sql(file_paths)
|
|
|
|
ActiveRecord::Base.connection.execute(sql)
|
|
end
|
|
|
|
def insert_sql(file_paths)
|
|
if postgresql_pre_9_5?
|
|
"INSERT INTO #{table_columns_and_values_for_insert(file_paths)};"
|
|
elsif postgresql?
|
|
"INSERT INTO #{table_columns_and_values_for_insert(file_paths)}"\
|
|
" ON CONFLICT DO NOTHING;"
|
|
else # MySQL
|
|
"INSERT IGNORE INTO"\
|
|
" #{table_columns_and_values_for_insert(file_paths)};"
|
|
end
|
|
end
|
|
|
|
def table_columns_and_values_for_insert(file_paths)
|
|
values = file_paths.map do |file_path|
|
|
ActiveRecord::Base.send(:sanitize_sql_array, ['(?)', file_path]) # rubocop:disable GitlabSecurity/PublicSend
|
|
end.join(', ')
|
|
|
|
"#{UntrackedFile.table_name} (path) VALUES #{values}"
|
|
end
|
|
|
|
def postgresql?
|
|
strong_memoize(:postgresql) do
|
|
Gitlab::Database.postgresql?
|
|
end
|
|
end
|
|
|
|
def can_bulk_insert_and_ignore_duplicates?
|
|
!postgresql_pre_9_5?
|
|
end
|
|
|
|
def postgresql_pre_9_5?
|
|
strong_memoize(:postgresql_pre_9_5) do
|
|
postgresql? && Gitlab::Database.version.to_f < 9.5
|
|
end
|
|
end
|
|
|
|
def schedule_populate_untracked_uploads_jobs
|
|
bulk_queue_background_migration_jobs_by_range(
|
|
UntrackedFile, FOLLOW_UP_MIGRATION)
|
|
end
|
|
|
|
def drop_temp_table
|
|
unless Rails.env.test? # Dropping a table intermittently breaks test cleanup
|
|
UntrackedFile.connection.drop_table(:untracked_files_for_uploads,
|
|
if_exists: true)
|
|
end
|
|
end
|
|
end
|
|
end
|
|
end
|