debian-mirror-gitlab/lib/gitlab/database/batch_count.rb

171 lines
6.3 KiB
Ruby
Raw Normal View History

2020-03-13 15:44:24 +05:30
# frozen_string_literal: true
# For large tables, PostgreSQL can take a long time to count rows due to MVCC.
# Implements a distinct and ordinary batch counter
# Needs indexes on the column below to calculate max, min and range queries
# For larger tables, use a higher batch_size together with index optimization
2020-04-22 19:07:51 +05:30
#
# To avoid a potentially complex, time-consuming query when calculating min and max for batch_distinct_count,
# the start and finish can be passed explicitly
#
2021-01-03 14:25:43 +05:30
# Grouped relations can be used as well. However, the preferred batch count should be around 10K because group by count is more expensive.
#
2020-03-13 15:44:24 +05:30
# See https://gitlab.com/gitlab-org/gitlab/-/merge_requests/22705
2020-04-22 19:07:51 +05:30
#
2020-03-13 15:44:24 +05:30
# Examples:
# extend ::Gitlab::Database::BatchCount
# batch_count(User.active)
# batch_count(::Clusters::Cluster.aws_installed.enabled, :cluster_id)
2021-01-03 14:25:43 +05:30
# batch_count(Namespace.group(:type))
2020-03-13 15:44:24 +05:30
# batch_distinct_count(::Project, :creator_id)
2020-04-22 19:07:51 +05:30
# batch_distinct_count(::Project.with_active_services.service_desk_enabled.where(time_period), start: ::User.minimum(:id), finish: ::User.maximum(:id))
2021-01-03 14:25:43 +05:30
# batch_distinct_count(Project.group(:visibility_level), :creator_id)
2020-10-24 23:57:45 +05:30
# batch_sum(User, :sign_in_count)
2021-01-03 14:25:43 +05:30
# batch_sum(Issue.group(:state_id), :weight)
2020-03-13 15:44:24 +05:30
module Gitlab
module Database
# Convenience entry points around BatchCounter.
#
# The methods can be mixed into another object (`extend`/`include`) or
# called directly on the module itself (`BatchCount.batch_count(...)`),
# because the module is also included into its own singleton class below.
module BatchCount
  # Counts the rows of +relation+ batch by batch.
  #
  # @param relation the relation handed to BatchCounter
  # @param column column used to slice the batches; when nil, BatchCounter
  #   falls back to the relation's primary key
  # @param batch_size [Integer, nil] rows per batch; nil lets BatchCounter pick a default
  # @param start [Integer, nil] lower bound of the batched range; nil lets BatchCounter look it up
  # @param finish [Integer, nil] upper bound of the batched range; nil lets BatchCounter look it up
  # @return whatever BatchCounter#count returns
  def batch_count(relation, column = nil, batch_size: nil, start: nil, finish: nil)
    BatchCounter.new(relation, column: column).count(batch_size: batch_size, start: start, finish: finish)
  end

  # Same as #batch_count, but asks BatchCounter to run in +:distinct+ mode.
  #
  # @param relation the relation handed to BatchCounter
  # @param column column used to slice the batches and to apply DISTINCT on
  # @param batch_size [Integer, nil] rows per batch; nil lets BatchCounter pick a default
  # @param start [Integer, nil] lower bound of the batched range
  # @param finish [Integer, nil] upper bound of the batched range
  # @return whatever BatchCounter#count returns
  def batch_distinct_count(relation, column = nil, batch_size: nil, start: nil, finish: nil)
    BatchCounter.new(relation, column: column).count(mode: :distinct, batch_size: batch_size, start: start, finish: finish)
  end

  # Sums +column+ over +relation+ batch by batch.
  #
  # The batching column is deliberately left nil here, so BatchCounter slices
  # on the relation's primary key while +column+ is only the summed value.
  #
  # @param relation the relation handed to BatchCounter
  # @param column column passed to the +sum+ operation
  # @param batch_size [Integer, nil] rows per batch; nil lets BatchCounter pick a default
  # @param start [Integer, nil] lower bound of the batched range
  # @param finish [Integer, nil] upper bound of the batched range
  # @return whatever BatchCounter#count returns
  def batch_sum(relation, column, batch_size: nil, start: nil, finish: nil)
    BatchCounter.new(relation, column: nil, operation: :sum, operation_args: [column]).count(batch_size: batch_size, start: start, finish: finish)
  end

  # Expose the helpers as module-level methods as well.
  class << self
    include BatchCount
  end
end
# Runs an aggregate (+count+ by default, or e.g. +sum+) over a relation in
# consecutive ranges of a column and merges the per-batch results.
class BatchCounter
  # Value returned instead of a result when batching is refused or gives up.
  FALLBACK = -1
  MIN_REQUIRED_BATCH_SIZE = 1_250
  DEFAULT_SUM_BATCH_SIZE = 1_000
  MAX_ALLOWED_LOOPS = 10_000
  SLEEP_TIME_IN_SECONDS = 0.01 # 10 msec sleep
  ALLOWED_MODES = [:itself, :distinct].freeze
  # Used as the maximum when neither the caller nor the relation provides one.
  FALLBACK_FINISH = 0
  # Added to the upper bound so the exclusive range still covers the maximum value.
  OFFSET_BY_ONE = 1

  # Each query should take < 500ms https://gitlab.com/gitlab-org/gitlab/-/merge_requests/22705
  DEFAULT_DISTINCT_BATCH_SIZE = 10_000
  DEFAULT_BATCH_SIZE = 100_000

  # @param relation the relation to aggregate over
  # @param column column used to slice batches; defaults to +relation.primary_key+
  # @param operation [Symbol] aggregate method sent to each batch relation
  # @param operation_args [Array, nil] arguments splatted into +operation+
  def initialize(relation, column: nil, operation: :count, operation_args: nil)
    @relation = relation
    @column = column || relation.primary_key
    @operation = operation
    @operation_args = operation_args
  end

  # Tells whether the requested batching should be refused.
  #
  # True when the batch size is too small for the operation, when the range
  # would need MAX_ALLOWED_LOOPS or more batches, or when the range is empty.
  #
  # @param finish [Integer] exclusive upper bound
  # @param batch_size [Integer]
  # @param start [Integer] inclusive lower bound
  # @return [Boolean]
  def unwanted_configuration?(finish, batch_size, start)
    (@operation == :count && batch_size <= MIN_REQUIRED_BATCH_SIZE) ||
      (@operation == :sum && batch_size < DEFAULT_SUM_BATCH_SIZE) ||
      (finish - start) / batch_size >= MAX_ALLOWED_LOOPS ||
      start >= finish
  end

  # Runs the configured operation batch by batch and returns the merged result.
  #
  # @param batch_size [Integer, nil] rows per batch; nil selects a default
  #   based on the mode and operation
  # @param mode [Symbol] one of ALLOWED_MODES
  # @param start [Integer, nil] lower bound; nil uses the relation's minimum (or 0)
  # @param finish [Integer, nil] upper bound; nil uses the relation's maximum
  #   (or FALLBACK_FINISH); OFFSET_BY_ONE is added in either case
  # @return the merged per-batch results, or FALLBACK when the configuration
  #   is unwanted or a query is canceled and the batch size cannot be halved
  # @raise [RuntimeError] inside an open transaction, for an unsupported mode,
  #   or when start/finish is negative
  def count(batch_size: nil, mode: :itself, start: nil, finish: nil)
    raise 'BatchCount can not be run inside a transaction' if ActiveRecord::Base.connection.transaction_open?

    check_mode!(mode)

    # non-distinct have better performance
    batch_size ||= batch_size_for_mode_and_operation(mode, @operation)

    start = actual_start(start)
    finish = actual_finish(finish)

    raise "Batch counting expects positive values only for #{@column}" if start < 0 || finish < 0
    return FALLBACK if unwanted_configuration?(finish, batch_size, start)

    results = nil
    batch_start = start

    while batch_start < finish
      batch_end = [batch_start + batch_size, finish].min
      batch_relation = build_relation_batch(batch_start, batch_end, mode)

      begin
        results = merge_results(results, batch_relation.send(@operation, *@operation_args)) # rubocop:disable GitlabSecurity/PublicSend
        # Only advance after a successful query, so a canceled batch is retried.
        batch_start = batch_end
      rescue ActiveRecord::QueryCanceled => error
        # retry with a safe batch size & warmer cache
        if batch_size >= 2 * MIN_REQUIRED_BATCH_SIZE
          batch_size /= 2
        else
          log_canceled_batch_fetch(batch_start, mode, batch_relation.to_sql, error)
          return FALLBACK
        end
      end

      sleep(SLEEP_TIME_IN_SECONDS)
    end

    results
  end

  # Combines the accumulated result with the result of one more batch.
  #
  # Hash results are merged key by key, adding values for keys present on
  # both sides (the accumulated Hash is mutated); anything else is added with +.
  #
  # @param results accumulated result so far, or nil for the first batch
  # @param object result of the current batch
  # @return the new accumulated result
  def merge_results(results, object)
    return object unless results

    if object.is_a?(Hash)
      results.merge!(object) { |_, a, b| a + b }
    else
      results + object
    end
  end

  private

  # Builds the relation for one batch: the column selected, the mode applied,
  # restricted to start...finish.
  def build_relation_batch(start, finish, mode)
    @relation.select(@column).public_send(mode).where(between_condition(start, finish)) # rubocop:disable GitlabSecurity/PublicSend
  end

  # Picks the default batch size: sums use their own default, otherwise the
  # size depends on whether the mode is distinct.
  def batch_size_for_mode_and_operation(mode, operation)
    return DEFAULT_SUM_BATCH_SIZE if operation == :sum

    mode == :distinct ? DEFAULT_DISTINCT_BATCH_SIZE : DEFAULT_BATCH_SIZE
  end

  # Builds the range condition (finish excluded), either from an Arel
  # attribute or as a plain hash keyed by the column.
  def between_condition(start, finish)
    return @column.between(start...finish) if @column.is_a?(Arel::Attributes::Attribute)

    { @column => start...finish }
  end

  # Caller-supplied start, else the column minimum ignoring grouping, else 0.
  def actual_start(start)
    start || @relation.unscope(:group, :having).minimum(@column) || 0
  end

  # Caller-supplied finish, else the column maximum ignoring grouping, else
  # FALLBACK_FINISH; shifted by one because batch ranges exclude their end.
  def actual_finish(finish)
    (finish || @relation.unscope(:group, :having).maximum(@column) || FALLBACK_FINISH) + OFFSET_BY_ONE
  end

  # Validates the mode against ALLOWED_MODES, the relation's distinct setting
  # and the batching column.
  def check_mode!(mode)
    raise "The mode #{mode.inspect} is not supported" unless ALLOWED_MODES.include?(mode)
    raise 'Use distinct count for optimized distinct counting' if @relation.limit(1).distinct_value.present? && mode != :distinct
    raise 'Use distinct count only with non id fields' if @column == :id && mode == :distinct
  end

  # Logs the batch whose query was canceled, just before counting gives up.
  def log_canceled_batch_fetch(batch_start, mode, query, error)
    Gitlab::AppJsonLogger
      .error(
        event: 'batch_count',
        relation: @relation.table_name,
        operation: @operation,
        operation_args: @operation_args,
        start: batch_start,
        mode: mode,
        query: query,
        message: "Query has been canceled with message: #{error.message}"
      )
  end
end
end
end