Determine fuzziness of bleve indexer by keyword length (#29706)

also bleve did match on fuzzy search and the other way around. this also fix that bug.

(cherry picked from commit b9c57fb78e8e0d80d786d8e1da433b6c7ebf2f1c)

Conflicts:
	tests/integration/repo_search_test.go
	simple conflict resolution in the tests
This commit is contained in:
6543 2024-03-23 16:45:13 +01:00 committed by Earl Warren
parent b73117127c
commit ab5f0b7558
WARNING! Although there is a key with this ID in the database it does not verify this commit! This commit is SUSPICIOUS.
GPG key ID: 0579CB2928A78A00
4 changed files with 29 additions and 37 deletions

View file

@ -39,6 +39,8 @@ import (
const ( const (
unicodeNormalizeName = "unicodeNormalize" unicodeNormalizeName = "unicodeNormalize"
maxBatchSize = 16 maxBatchSize = 16
// fuzzyDenominator determines the levenshtein distance per each character of a keyword
fuzzyDenominator = 4
) )
func addUnicodeNormalizeTokenFilter(m *mapping.IndexMappingImpl) error { func addUnicodeNormalizeTokenFilter(m *mapping.IndexMappingImpl) error {
@ -239,15 +241,12 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int
keywordQuery query.Query keywordQuery query.Query
) )
phraseQuery := bleve.NewMatchPhraseQuery(opts.Keyword)
phraseQuery.FieldVal = "Content"
phraseQuery.Analyzer = repoIndexerAnalyzer
keywordQuery = phraseQuery
if opts.IsKeywordFuzzy { if opts.IsKeywordFuzzy {
phraseQuery := bleve.NewMatchPhraseQuery(opts.Keyword) phraseQuery.Fuzziness = len(opts.Keyword) / fuzzyDenominator
phraseQuery.FieldVal = "Content"
phraseQuery.Analyzer = repoIndexerAnalyzer
keywordQuery = phraseQuery
} else {
prefixQuery := bleve.NewPrefixQuery(opts.Keyword)
prefixQuery.FieldVal = "Content"
keywordQuery = prefixQuery
} }
if len(opts.RepoIDs) > 0 { if len(opts.RepoIDs) > 0 {

View file

@ -20,17 +20,11 @@ func NumericEqualityQuery(value int64, field string) *query.NumericRangeQuery {
} }
// MatchPhraseQuery generates a match phrase query for the given phrase, field and analyzer // MatchPhraseQuery generates a match phrase query for the given phrase, field and analyzer
func MatchPhraseQuery(matchPhrase, field, analyzer string) *query.MatchPhraseQuery { func MatchPhraseQuery(matchPhrase, field, analyzer string, fuzziness int) *query.MatchPhraseQuery {
q := bleve.NewMatchPhraseQuery(matchPhrase) q := bleve.NewMatchPhraseQuery(matchPhrase)
q.FieldVal = field q.FieldVal = field
q.Analyzer = analyzer q.Analyzer = analyzer
return q q.Fuzziness = fuzziness
}
// PrefixQuery generates a match prefix query for the given prefix and field
func PrefixQuery(matchPrefix, field string) *query.PrefixQuery {
q := bleve.NewPrefixQuery(matchPrefix)
q.FieldVal = field
return q return q
} }

View file

@ -35,7 +35,11 @@ func addUnicodeNormalizeTokenFilter(m *mapping.IndexMappingImpl) error {
}) })
} }
const maxBatchSize = 16 const (
maxBatchSize = 16
// fuzzyDenominator determines the levenshtein distance per each character of a keyword
fuzzyDenominator = 4
)
// IndexerData an update to the issue indexer // IndexerData an update to the issue indexer
type IndexerData internal.IndexerData type IndexerData internal.IndexerData
@ -156,19 +160,16 @@ func (b *Indexer) Search(ctx context.Context, options *internal.SearchOptions) (
var queries []query.Query var queries []query.Query
if options.Keyword != "" { if options.Keyword != "" {
fuzziness := 0
if options.IsFuzzyKeyword { if options.IsFuzzyKeyword {
queries = append(queries, bleve.NewDisjunctionQuery([]query.Query{ fuzziness = len(options.Keyword) / fuzzyDenominator
inner_bleve.MatchPhraseQuery(options.Keyword, "title", issueIndexerAnalyzer),
inner_bleve.MatchPhraseQuery(options.Keyword, "content", issueIndexerAnalyzer),
inner_bleve.MatchPhraseQuery(options.Keyword, "comments", issueIndexerAnalyzer),
}...))
} else {
queries = append(queries, bleve.NewDisjunctionQuery([]query.Query{
inner_bleve.PrefixQuery(options.Keyword, "title"),
inner_bleve.PrefixQuery(options.Keyword, "content"),
inner_bleve.PrefixQuery(options.Keyword, "comments"),
}...))
} }
queries = append(queries, bleve.NewDisjunctionQuery([]query.Query{
inner_bleve.MatchPhraseQuery(options.Keyword, "title", issueIndexerAnalyzer, fuzziness),
inner_bleve.MatchPhraseQuery(options.Keyword, "content", issueIndexerAnalyzer, fuzziness),
inner_bleve.MatchPhraseQuery(options.Keyword, "comments", issueIndexerAnalyzer, fuzziness),
}...))
} }
if len(options.RepoIDs) > 0 || options.AllPublic { if len(options.RepoIDs) > 0 || options.AllPublic {

View file

@ -46,7 +46,7 @@ func testSearchRepo(t *testing.T, useExternalIndexer bool) {
if useExternalIndexer { if useExternalIndexer {
gitReference = "/commit/" gitReference = "/commit/"
executeIndexer(t, repo, code_indexer.UpdateRepoIndexer) code_indexer.UpdateRepoIndexer(repo)
} }
testSearch(t, "/user2/repo1/search?q=Description&page=1", gitReference, []string{"README.md"}) testSearch(t, "/user2/repo1/search?q=Description&page=1", gitReference, []string{"README.md"})
@ -58,12 +58,14 @@ func testSearchRepo(t *testing.T, useExternalIndexer bool) {
repo, err = repo_model.GetRepositoryByOwnerAndName(db.DefaultContext, "user2", "glob") repo, err = repo_model.GetRepositoryByOwnerAndName(db.DefaultContext, "user2", "glob")
assert.NoError(t, err) assert.NoError(t, err)
executeIndexer(t, repo, code_indexer.UpdateRepoIndexer) code_indexer.UpdateRepoIndexer(repo)
testSearch(t, "/user2/glob/search?q=loren&page=1", gitReference, []string{"a.txt"}) testSearch(t, "/user2/glob/search?q=loren&page=1", gitReference, []string{"a.txt"})
testSearch(t, "/user2/glob/search?q=file3&page=1", gitReference, []string{"x/b.txt"}) testSearch(t, "/user2/glob/search?q=loren&page=1&t=match", gitReference, []string{"a.txt"})
testSearch(t, "/user2/glob/search?q=file4&page=1", gitReference, []string{}) testSearch(t, "/user2/glob/search?q=file3&page=1", gitReference, []string{"x/b.txt", "a.txt"})
testSearch(t, "/user2/glob/search?q=file5&page=1", gitReference, []string{}) testSearch(t, "/user2/glob/search?q=file3&page=1&t=match", gitReference, []string{"x/b.txt", "a.txt"})
testSearch(t, "/user2/glob/search?q=file4&page=1&t=match", gitReference, []string{"x/b.txt", "a.txt"})
testSearch(t, "/user2/glob/search?q=file5&page=1&t=match", gitReference, []string{"x/b.txt", "a.txt"})
} }
} }
@ -88,7 +90,3 @@ func testSearch(t *testing.T, url, gitRef string, expected []string) {
checkResultLinks(t, gitRef, doc) checkResultLinks(t, gitRef, doc)
} }
func executeIndexer(t *testing.T, repo *repo_model.Repository, op func(*repo_model.Repository)) {
op(repo)
}