debian-mirror-gitlab/app/models/concerns/pg_full_text_searchable.rb

# frozen_string_literal: true

# This module adds PG full-text search capabilities to a model.
# A `search_data` association with a `search_vector` column is required.
#
# Declare the fields that will be part of the search vector with their
# corresponding weights. Possible values for weight are A, B, C, or D.
# For example:
#
# include PgFullTextSearchable
# pg_full_text_searchable columns: [{ name: 'title', weight: 'A' }, { name: 'description', weight: 'B' }]
#
# This module sets up an after_commit hook that updates the search data
# when the searchable columns are changed. You will need to implement the
# `#persist_pg_full_text_search_vector` method that does the actual insert or update.
#
# This also adds a `pg_full_text_search` scope so you can do:
#
# Model.pg_full_text_search("some search term")

module PgFullTextSearchable
  extend ActiveSupport::Concern

  LONG_WORDS_REGEX = %r([A-Za-z0-9+/@]{50,}).freeze
  TSVECTOR_MAX_LENGTH = 1.megabyte.freeze
  TEXT_SEARCH_DICTIONARY = 'english'
  URL_SCHEME_REGEX = %r{(?<=\A|\W)\w+://(?=\w+)}.freeze
  TSQUERY_DISALLOWED_CHARACTERS_REGEX = %r{[^a-zA-Z0-9 .@/\-_"]}.freeze

  def update_search_data!
    tsvector_sql_nodes = self.class.pg_full_text_searchable_columns.map do |column, weight|
      tsvector_arel_node(column, weight)&.to_sql
    end

    persist_pg_full_text_search_vector(Arel.sql(tsvector_sql_nodes.compact.join(' || ')))
  rescue ActiveRecord::StatementInvalid => e
    raise unless e.cause.is_a?(PG::ProgramLimitExceeded) && e.message.include?('string is too long for tsvector')

    Gitlab::AppJsonLogger.error(
      message: 'Error updating search data: string is too long for tsvector',
      class: self.class.name,
      model_id: self.id
    )
  end

  private

  def persist_pg_full_text_search_vector(search_vector)
    raise NotImplementedError
  end

  def tsvector_arel_node(column, weight)
    return if self[column].blank?

    column_text = self[column].gsub(LONG_WORDS_REGEX, ' ')
    column_text = column_text[0..(TSVECTOR_MAX_LENGTH - 1)]
    column_text = ActiveSupport::Inflector.transliterate(column_text)

    Arel::Nodes::NamedFunction.new(
      'setweight',
      [
        Arel::Nodes::NamedFunction.new(
          'to_tsvector',
          [Arel::Nodes.build_quoted(TEXT_SEARCH_DICTIONARY), Arel::Nodes.build_quoted(column_text)]
        ),
        Arel::Nodes.build_quoted(weight)
      ]
    )
  end

  included do
    cattr_reader :pg_full_text_searchable_columns do
      {}
    end
  end

  class_methods do
    def pg_full_text_searchable(columns:)
      raise 'Full text search columns already defined!' if pg_full_text_searchable_columns.present?

      columns.each do |column|
        pg_full_text_searchable_columns[column[:name]] = column[:weight]
      end

      # When multiple updates are done in a transaction, `saved_changes` will only report the latest save
      # and we may miss an update to the searchable columns.
      # As a workaround, we set a dirty flag here and update the search data in `after_save_commit`.
      after_save do
        next unless pg_full_text_searchable_columns.keys.any? { |f| saved_changes.has_key?(f) }

        @update_pg_full_text_search_data = true
      end

      # We update this outside the transaction because this could raise an error if the resulting tsvector
      # is too long. When that happens, we still persist the create / update but the model will not have a
      # search data record. This is fine in most cases because this is a very rare occurrence and only happens
      # with strings that are most likely unsearchable anyway.
      #
      # We also do not want to use a subtransaction here due to: https://gitlab.com/groups/gitlab-org/-/epics/6540
      after_save_commit do
        update_search_data! if @update_pg_full_text_search_data
        @update_pg_full_text_search_data = nil
      end
    end

    def pg_full_text_search(query, matched_columns: [])
      search_data_table = reflect_on_association(:search_data).klass.arel_table

      joins(:search_data).where(
        Arel::Nodes::InfixOperation.new(
          '@@',
          search_data_table[:search_vector],
          Arel::Nodes::NamedFunction.new(
            'to_tsquery',
            [Arel::Nodes.build_quoted(TEXT_SEARCH_DICTIONARY), build_tsquery(query, matched_columns)]
          )
        )
      )
    end

    private

    def build_tsquery(query, matched_columns)
      # URLs get broken up into separate words when : is removed below, so we just remove the whole scheme.
      query = remove_url_scheme(query)
      # Remove accents from search term to match indexed data
      query = ActiveSupport::Inflector.transliterate(query)
      # Prevent users from using tsquery operators that can cause syntax errors.
      query = filter_allowed_characters(query)

      weights = matched_columns.map do |column_name|
        pg_full_text_searchable_columns[column_name]
      end.compact.join
      prefix_search_suffix = ":*#{weights}"

      tsquery = Gitlab::SQL::Pattern.split_query_to_search_terms(query).map do |search_term|
        case search_term
        when /\A\d+\z/ # Handles https://gitlab.com/gitlab-org/gitlab/-/issues/375337
          "(#{search_term + prefix_search_suffix} | -#{search_term + prefix_search_suffix})"
        when /\s/
          search_term.split.map { |t| "#{t}:#{weights}" }.join(' <-> ')
        else
          search_term + prefix_search_suffix
        end
      end.join(' & ')

      Arel::Nodes.build_quoted(tsquery)
    end

    def remove_url_scheme(query)
      query.gsub(URL_SCHEME_REGEX, '')
    end

    def filter_allowed_characters(query)
      query.gsub(TSQUERY_DISALLOWED_CHARACTERS_REGEX, ' ')
    end
  end
end
New upstream version 14.9.4+ds1 2022-05-07 20:08:51 +05:30			`# frozen_string_literal: true`

			`# This module adds PG full-text search capabilities to a model.`
			# A `search_data` association with a `search_vector` column is required.
			`#`
			`# Declare the fields that will be part of the search vector with their`
			`# corresponding weights. Possible values for weight are A, B, C, or D.`
			`# For example:`
			`#`
			`# include PgFullTextSearchable`
			`# pg_full_text_searchable columns: [{ name: 'title', weight: 'A' }, { name: 'description', weight: 'B' }]`
			`#`
			`# This module sets up an after_commit hook that updates the search data`
			`# when the searchable columns are changed. You will need to implement the`
			# `#persist_pg_full_text_search_vector` method that does the actual insert or update.
			`#`
			# This also adds a `pg_full_text_search` scope so you can do:
			`#`
			`# Model.pg_full_text_search("some search term")`

			`module PgFullTextSearchable`
			`extend ActiveSupport::Concern`

			`LONG_WORDS_REGEX = %r([A-Za-z0-9+/@]{50,}).freeze`
			`TSVECTOR_MAX_LENGTH = 1.megabyte.freeze`
			`TEXT_SEARCH_DICTIONARY = 'english'`
New upstream version 15.1.3+ds1 2022-07-23 23:45:48 +05:30			`URL_SCHEME_REGEX = %r{(?<=\A\|\W)\w+://(?=\w+)}.freeze`
New upstream version 15.6.4+ds1 2023-01-13 00:05:48 +05:30			`TSQUERY_DISALLOWED_CHARACTERS_REGEX = %r{[^a-zA-Z0-9 .@/\-_"]}.freeze`
New upstream version 14.9.4+ds1 2022-05-07 20:08:51 +05:30
			`def update_search_data!`
			`tsvector_sql_nodes = self.class.pg_full_text_searchable_columns.map do \|column, weight\|`
			`tsvector_arel_node(column, weight)&.to_sql`
			`end`

			`persist_pg_full_text_search_vector(Arel.sql(tsvector_sql_nodes.compact.join(' \|\| ')))`
			`rescue ActiveRecord::StatementInvalid => e`
			`raise unless e.cause.is_a?(PG::ProgramLimitExceeded) && e.message.include?('string is too long for tsvector')`

			`Gitlab::AppJsonLogger.error(`
			`message: 'Error updating search data: string is too long for tsvector',`
			`class: self.class.name,`
			`model_id: self.id`
			`)`
			`end`

			`private`

			`def persist_pg_full_text_search_vector(search_vector)`
			`raise NotImplementedError`
			`end`

			`def tsvector_arel_node(column, weight)`
			`return if self[column].blank?`

			`column_text = self[column].gsub(LONG_WORDS_REGEX, ' ')`
			`column_text = column_text[0..(TSVECTOR_MAX_LENGTH - 1)]`
			`column_text = ActiveSupport::Inflector.transliterate(column_text)`

			`Arel::Nodes::NamedFunction.new(`
			`'setweight',`
			`[`
			`Arel::Nodes::NamedFunction.new(`
			`'to_tsvector',`
			`[Arel::Nodes.build_quoted(TEXT_SEARCH_DICTIONARY), Arel::Nodes.build_quoted(column_text)]`
			`),`
			`Arel::Nodes.build_quoted(weight)`
			`]`
			`)`
			`end`

			`included do`
			`cattr_reader :pg_full_text_searchable_columns do`
			`{}`
			`end`
			`end`

			`class_methods do`
			`def pg_full_text_searchable(columns:)`
			`raise 'Full text search columns already defined!' if pg_full_text_searchable_columns.present?`

			`columns.each do \|column\|`
			`pg_full_text_searchable_columns[column[:name]] = column[:weight]`
			`end`

New upstream version 15.0.4+ds1 2022-07-16 23:28:13 +05:30			# When multiple updates are done in a transaction, `saved_changes` will only report the latest save
			`# and we may miss an update to the searchable columns.`
			# As a workaround, we set a dirty flag here and update the search data in `after_save_commit`.
			`after_save do`
			`next unless pg_full_text_searchable_columns.keys.any? { \|f\| saved_changes.has_key?(f) }`

			`@update_pg_full_text_search_data = true`
			`end`

New upstream version 14.9.4+ds1 2022-05-07 20:08:51 +05:30			`# We update this outside the transaction because this could raise an error if the resulting tsvector`
			`# is too long. When that happens, we still persist the create / update but the model will not have a`
			`# search data record. This is fine in most cases because this is a very rare occurrence and only happens`
			`# with strings that are most likely unsearchable anyway.`
			`#`
			`# We also do not want to use a subtransaction here due to: https://gitlab.com/groups/gitlab-org/-/epics/6540`
			`after_save_commit do`
New upstream version 15.0.4+ds1 2022-07-16 23:28:13 +05:30			`update_search_data! if @update_pg_full_text_search_data`
			`@update_pg_full_text_search_data = nil`
New upstream version 14.9.4+ds1 2022-05-07 20:08:51 +05:30			`end`
			`end`

New upstream version 15.6.4+ds1 2023-01-13 00:05:48 +05:30			`def pg_full_text_search(query, matched_columns: [])`
New upstream version 14.9.4+ds1 2022-05-07 20:08:51 +05:30			`search_data_table = reflect_on_association(:search_data).klass.arel_table`

			`joins(:search_data).where(`
			`Arel::Nodes::InfixOperation.new(`
			`'@@',`
			`search_data_table[:search_vector],`
			`Arel::Nodes::NamedFunction.new(`
New upstream version 15.6.4+ds1 2023-01-13 00:05:48 +05:30			`'to_tsquery',`
			`[Arel::Nodes.build_quoted(TEXT_SEARCH_DICTIONARY), build_tsquery(query, matched_columns)]`
New upstream version 14.9.4+ds1 2022-05-07 20:08:51 +05:30			`)`
			`)`
			`)`
			`end`
New upstream version 15.1.3+ds1 2022-07-23 23:45:48 +05:30
			`private`

New upstream version 15.6.4+ds1 2023-01-13 00:05:48 +05:30			`def build_tsquery(query, matched_columns)`
			`# URLs get broken up into separate words when : is removed below, so we just remove the whole scheme.`
			`query = remove_url_scheme(query)`
			`# Remove accents from search term to match indexed data`
			`query = ActiveSupport::Inflector.transliterate(query)`
			`# Prevent users from using tsquery operators that can cause syntax errors.`
			`query = filter_allowed_characters(query)`

			`weights = matched_columns.map do \|column_name\|`
			`pg_full_text_searchable_columns[column_name]`
			`end.compact.join`
			`prefix_search_suffix = ":*#{weights}"`

			`tsquery = Gitlab::SQL::Pattern.split_query_to_search_terms(query).map do \|search_term\|`
			`case search_term`
			`when /\A\d+\z/ # Handles https://gitlab.com/gitlab-org/gitlab/-/issues/375337`
			`"(#{search_term + prefix_search_suffix} \| -#{search_term + prefix_search_suffix})"`
			`when /\s/`
			`search_term.split.map { \|t\| "#{t}:#{weights}" }.join(' <-> ')`
			`else`
			`search_term + prefix_search_suffix`
			`end`
			`end.join(' & ')`

			`Arel::Nodes.build_quoted(tsquery)`
			`end`

			`def remove_url_scheme(query)`
			`query.gsub(URL_SCHEME_REGEX, '')`
			`end`

			`def filter_allowed_characters(query)`
			`query.gsub(TSQUERY_DISALLOWED_CHARACTERS_REGEX, ' ')`
New upstream version 15.1.3+ds1 2022-07-23 23:45:48 +05:30			`end`
New upstream version 14.9.4+ds1 2022-05-07 20:08:51 +05:30			`end`
			`end`