debian-mirror-gitlab/lib/gitlab/database/similarity_score.rb

120 lines
4.9 KiB
Ruby
Raw Normal View History

2020-10-24 23:57:45 +05:30
# frozen_string_literal: true
module Gitlab
module Database
class SimilarityScore
EMPTY_STRING = Arel.sql("''").freeze
EXPRESSION_ON_INVALID_INPUT = Arel::Nodes::NamedFunction.new('CAST', [Arel.sql('0').as('integer')]).freeze
DEFAULT_MULTIPLIER = 1
2021-01-03 14:25:43 +05:30
DISPLAY_NAME = self.name.underscore.freeze
# Adds a "magic" comment in the generated SQL expression in order to be able to tell if we're sorting by similarity.
# Example: /* gitlab/database/similarity_score */ SIMILARITY(COALESCE...
SIMILARITY_FUNCTION_CALL_WITH_ANNOTATION = "/* #{DISPLAY_NAME} */ SIMILARITY".freeze
2020-10-24 23:57:45 +05:30
# This method returns an Arel expression that can be used in an ActiveRecord query to order the resultset by similarity.
#
# Note: Calculating similarity score for large volume of records is inefficient. use SimilarityScore only for smaller
# resultset which is already filtered by other conditions (< 10_000 records).
#
# ==== Parameters
# * +search+ - [String] the user provided search string
# * +rules+ - [{ column: COLUMN, multiplier: 1 }, { column: COLUMN_2, multiplier: 0.5 }] rules for the scoring.
# * +column+ - Arel column expression, example: Project.arel_table["name"]
# * +multiplier+ - Integer or Float to increase or decrease the score (optional, defaults to 1)
#
# ==== Use case
#
# We'd like to search for projects by path, name and description. We want to rank higher the path and name matches, since
# it's more likely that the user was remembering the path or the name of the project.
#
# Rules:
# [
# { column: Project.arel_table['path'], multiplier: 1 },
# { column: Project.arel_table['name'], multiplier: 1 },
# { column: Project.arel_table['description'], multiplier: 0.5 }
# ]
#
# ==== Examples
#
# Similarity calculation based on one column:
#
# Gitlab::Database::SimilarityScore.build_expession(search: 'my input', rules: [{ column: Project.arel_table['name'] }])
#
# Similarity calculation based on two column, where the second column has lower priority:
#
# Gitlab::Database::SimilarityScore.build_expession(search: 'my input', rules: [
# { column: Project.arel_table['name'], multiplier: 1 },
# { column: Project.arel_table['description'], multiplier: 0.5 }
# ])
#
# Integration with an ActiveRecord query:
#
# table = Project.arel_table
#
# order_expression = Gitlab::Database::SimilarityScore.build_expession(search: 'input', rules: [
# { column: table['name'], multiplier: 1 },
# { column: table['description'], multiplier: 0.5 }
# ])
#
# Project.where("name LIKE ?", '%' + 'input' + '%').order(order_expression.desc)
#
# The expression can be also used in SELECT:
#
# results = Project.select(order_expression.as('similarity')).where("name LIKE ?", '%' + 'input' + '%').order(similarity: :desc)
# puts results.map(&:similarity)
#
def self.build_expression(search:, rules:)
return EXPRESSION_ON_INVALID_INPUT if search.blank? || rules.empty?
quoted_search = ActiveRecord::Base.connection.quote(search.to_s)
first_expression, *expressions = rules.map do |rule|
rule_to_arel(quoted_search, rule)
end
# (SIMILARITY ...) + (SIMILARITY ...)
expressions.inject(first_expression) do |expression1, expression2|
Arel::Nodes::Addition.new(expression1, expression2)
end
end
2021-01-03 14:25:43 +05:30
def self.order_by_similarity?(arel_query)
arel_query.to_sql.include?(SIMILARITY_FUNCTION_CALL_WITH_ANNOTATION)
end
2020-10-24 23:57:45 +05:30
# (SIMILARITY(COALESCE(column, ''), 'search_string') * CAST(multiplier AS numeric))
def self.rule_to_arel(search, rule)
Arel::Nodes::Grouping.new(
Arel::Nodes::Multiplication.new(
similarity_function_call(search, column_expression(rule)),
multiplier_expression(rule)
)
)
end
# COALESCE(column, '')
def self.column_expression(rule)
Arel::Nodes::NamedFunction.new('COALESCE', [rule.fetch(:column), EMPTY_STRING])
end
# SIMILARITY(COALESCE(column, ''), 'search_string')
def self.similarity_function_call(search, column)
2021-01-03 14:25:43 +05:30
Arel::Nodes::NamedFunction.new(SIMILARITY_FUNCTION_CALL_WITH_ANNOTATION, [column, Arel.sql(search)])
2020-10-24 23:57:45 +05:30
end
# CAST(multiplier AS numeric)
def self.multiplier_expression(rule)
quoted_multiplier = ActiveRecord::Base.connection.quote(rule.fetch(:multiplier, DEFAULT_MULTIPLIER).to_s)
Arel::Nodes::NamedFunction.new('CAST', [Arel.sql(quoted_multiplier).as('numeric')])
end
private_class_method :rule_to_arel
private_class_method :column_expression
private_class_method :similarity_function_call
private_class_method :multiplier_expression
end
end
end