# frozen_string_literal: true

# https://www.periscopedata.com/blog/medians-in-sql.html
module Gitlab
|
|
|
|
module Database
|
|
|
|
module Median
|
2018-03-27 19:54:05 +05:30
|
|
|
NotSupportedError = Class.new(StandardError)
|
|
|
|
|
2016-09-29 09:46:39 +05:30
|
|
|
def median_datetime(arel_table, query_so_far, column_sym)
|
2018-03-27 19:54:05 +05:30
|
|
|
extract_median(execute_queries(arel_table, query_so_far, column_sym)).presence
|
|
|
|
end
|
|
|
|
|
|
|
|
def median_datetimes(arel_table, query_so_far, column_sym, partition_column)
|
|
|
|
extract_medians(execute_queries(arel_table, query_so_far, column_sym, partition_column)).presence
|
2016-09-29 09:46:39 +05:30
|
|
|
end
|
|
|
|
|
|
|
|
def extract_median(results)
|
|
|
|
result = results.compact.first
|
|
|
|
|
2019-10-12 21:52:04 +05:30
|
|
|
result = result.first.presence
|
2018-03-27 19:54:05 +05:30
|
|
|
|
2019-10-12 21:52:04 +05:30
|
|
|
result['median']&.to_f if result
|
2016-09-29 09:46:39 +05:30
|
|
|
end
|
|
|
|
|
2018-03-27 19:54:05 +05:30
|
|
|
def extract_medians(results)
|
|
|
|
median_values = results.compact.first.values
|
|
|
|
|
|
|
|
median_values.each_with_object({}) do |(id, median), hash|
|
|
|
|
hash[id.to_i] = median&.to_f
|
|
|
|
end
|
|
|
|
end
|
|
|
|
|
|
|
|
def pg_median_datetime_sql(arel_table, query_so_far, column_sym, partition_column = nil)
|
2016-09-29 09:46:39 +05:30
|
|
|
# Create a CTE with the column we're operating on, row number (after sorting by the column
|
|
|
|
# we're operating on), and count of the table we're operating on (duplicated across) all rows
|
|
|
|
# of the CTE. For example, if we're looking to find the median of the `projects.star_count`
|
|
|
|
# column, the CTE might look like this:
|
|
|
|
#
|
|
|
|
# star_count | row_id | ct
|
|
|
|
# ------------+--------+----
|
|
|
|
# 5 | 1 | 3
|
|
|
|
# 9 | 2 | 3
|
|
|
|
# 15 | 3 | 3
|
2018-03-27 19:54:05 +05:30
|
|
|
#
|
|
|
|
# If a partition column is used we will do the same operation but for separate partitions,
|
|
|
|
# when that happens the CTE might look like this:
|
|
|
|
#
|
|
|
|
# project_id | star_count | row_id | ct
|
|
|
|
# ------------+------------+--------+----
|
|
|
|
# 1 | 5 | 1 | 2
|
|
|
|
# 1 | 9 | 2 | 2
|
|
|
|
# 2 | 10 | 1 | 3
|
|
|
|
# 2 | 15 | 2 | 3
|
|
|
|
# 2 | 20 | 3 | 3
|
2016-09-29 09:46:39 +05:30
|
|
|
cte_table = Arel::Table.new("ordered_records")
|
2018-03-27 19:54:05 +05:30
|
|
|
|
2016-09-29 09:46:39 +05:30
|
|
|
cte = Arel::Nodes::As.new(
|
|
|
|
cte_table,
|
2018-03-27 19:54:05 +05:30
|
|
|
arel_table.project(*rank_rows(arel_table, column_sym, partition_column)).
|
2016-09-29 09:46:39 +05:30
|
|
|
# Disallow negative values
|
|
|
|
where(arel_table[column_sym].gteq(zero_interval)))
|
|
|
|
|
|
|
|
# From the CTE, select either the middle row or the middle two rows (this is accomplished
|
|
|
|
# by 'where cte.row_id between cte.ct / 2.0 AND cte.ct / 2.0 + 1'). Find the average of the
|
|
|
|
# selected rows, and this is the median value.
|
2018-03-27 19:54:05 +05:30
|
|
|
result =
|
|
|
|
cte_table
|
|
|
|
.project(*median_projections(cte_table, column_sym, partition_column))
|
|
|
|
.where(
|
|
|
|
Arel::Nodes::Between.new(
|
|
|
|
cte_table[:row_id],
|
|
|
|
Arel::Nodes::And.new(
|
|
|
|
[(cte_table[:ct] / Arel.sql('2.0')),
|
|
|
|
(cte_table[:ct] / Arel.sql('2.0') + 1)]
|
|
|
|
)
|
2016-09-29 09:46:39 +05:30
|
|
|
)
|
|
|
|
)
|
2018-03-27 19:54:05 +05:30
|
|
|
.with(query_so_far, cte)
|
|
|
|
|
|
|
|
result.group(cte_table[partition_column]).order(cte_table[partition_column]) if partition_column
|
|
|
|
|
|
|
|
result.to_sql
|
2016-09-29 09:46:39 +05:30
|
|
|
end
|
|
|
|
|
|
|
|
private
|
|
|
|
|
2018-03-27 19:54:05 +05:30
|
|
|
def execute_queries(arel_table, query_so_far, column_sym, partition_column = nil)
|
2019-10-12 21:52:04 +05:30
|
|
|
queries = pg_median_datetime_sql(arel_table, query_so_far, column_sym, partition_column)
|
2018-03-27 19:54:05 +05:30
|
|
|
|
|
|
|
Array.wrap(queries).map { |query| ActiveRecord::Base.connection.execute(query) }
|
|
|
|
end
|
|
|
|
|
2016-09-29 09:46:39 +05:30
|
|
|
def average(args, as)
|
|
|
|
Arel::Nodes::NamedFunction.new("AVG", args, as)
|
|
|
|
end
|
|
|
|
|
2018-03-27 19:54:05 +05:30
|
|
|
def rank_rows(arel_table, column_sym, partition_column)
|
|
|
|
column_row = arel_table[column_sym].as(column_sym.to_s)
|
|
|
|
|
|
|
|
if partition_column
|
|
|
|
partition_row = arel_table[partition_column]
|
|
|
|
row_id =
|
|
|
|
Arel::Nodes::Over.new(
|
|
|
|
Arel::Nodes::NamedFunction.new('rank', []),
|
|
|
|
Arel::Nodes::Window.new.partition(arel_table[partition_column])
|
|
|
|
.order(arel_table[column_sym])
|
|
|
|
).as('row_id')
|
|
|
|
|
2019-02-15 15:39:39 +05:30
|
|
|
count = arel_table.from.from(arel_table.alias)
|
|
|
|
.project('COUNT(*)')
|
2018-03-27 19:54:05 +05:30
|
|
|
.where(arel_table[partition_column].eq(arel_table.alias[partition_column]))
|
|
|
|
.as('ct')
|
|
|
|
|
|
|
|
[partition_row, column_row, row_id, count]
|
|
|
|
else
|
|
|
|
row_id =
|
|
|
|
Arel::Nodes::Over.new(
|
|
|
|
Arel::Nodes::NamedFunction.new('row_number', []),
|
|
|
|
Arel::Nodes::Window.new.order(arel_table[column_sym])
|
|
|
|
).as('row_id')
|
|
|
|
|
2019-09-30 21:07:59 +05:30
|
|
|
count = arel_table.where(arel_table[column_sym].gteq(zero_interval)).project("COUNT(1)").as('ct')
|
2018-03-27 19:54:05 +05:30
|
|
|
|
|
|
|
[column_row, row_id, count]
|
|
|
|
end
|
|
|
|
end
|
|
|
|
|
|
|
|
def median_projections(table, column_sym, partition_column)
|
|
|
|
projections = []
|
|
|
|
projections << table[partition_column] if partition_column
|
|
|
|
projections << average([extract_epoch(table[column_sym])], "median")
|
|
|
|
projections
|
|
|
|
end
|
|
|
|
|
2016-09-29 09:46:39 +05:30
|
|
|
def extract_epoch(arel_attribute)
|
|
|
|
Arel.sql(%Q{EXTRACT(EPOCH FROM "#{arel_attribute.relation.name}"."#{arel_attribute.name}")})
|
|
|
|
end
|
|
|
|
|
2017-08-17 22:00:37 +05:30
|
|
|
def extract_diff_epoch(diff)
|
|
|
|
Arel.sql(%Q{EXTRACT(EPOCH FROM (#{diff.to_sql}))})
|
|
|
|
end
|
|
|
|
|
2016-09-29 09:46:39 +05:30
|
|
|
# Need to cast '0' to an INTERVAL before we can check if the interval is positive
|
|
|
|
def zero_interval
|
|
|
|
Arel::Nodes::NamedFunction.new("CAST", [Arel.sql("'0' AS INTERVAL")])
|
|
|
|
end
|
|
|
|
end
|
|
|
|
end
|
|
|
|
end
|