forgejo-federation/modules/indexer/issues/meilisearch/meilisearch.go
6543 38c3cc4eb7
Patch in exact search for meilisearch (#29671)
meilisearch does not have an search option to contorl fuzzynes per query
right now:
 - https://github.com/meilisearch/meilisearch/issues/1192
 - https://github.com/orgs/meilisearch/discussions/377
 - https://github.com/meilisearch/meilisearch/discussions/1096

so we have to create a workaround by post-filter the search result in
gitea until this is addressed.

For future works I added an option in backend only atm, to enable
fuzzynes for issue indexer too.
And also refactored the code so the fuzzy option is equal in logic to
code indexer

---
*Sponsored by Kithara Software GmbH*
Conflicts:
	routers/web/repo/search.go
	trivial context confict s/isMatch/isFuzzy/
2024-03-11 23:37:00 +07:00

322 lines
8.9 KiB
Go

// Copyright 2023 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package meilisearch
import (
"context"
"errors"
"strconv"
"strings"
indexer_internal "code.gitea.io/gitea/modules/indexer/internal"
inner_meilisearch "code.gitea.io/gitea/modules/indexer/internal/meilisearch"
"code.gitea.io/gitea/modules/indexer/issues/internal"
"github.com/meilisearch/meilisearch-go"
)
const (
issueIndexerLatestVersion = 3
// TODO: make this configurable if necessary
maxTotalHits = 10000
)
// ErrMalformedResponse is never expected as we initialize the indexer ourself and so define the types.
var ErrMalformedResponse = errors.New("meilisearch returned unexpected malformed content")
var _ internal.Indexer = &Indexer{}
// Indexer implements Indexer interface
type Indexer struct {
inner *inner_meilisearch.Indexer
indexer_internal.Indexer // do not composite inner_meilisearch.Indexer directly to avoid exposing too much
}
// NewIndexer creates a new meilisearch indexer
func NewIndexer(url, apiKey, indexerName string) *Indexer {
settings := &meilisearch.Settings{
// The default ranking rules of meilisearch are: ["words", "typo", "proximity", "attribute", "sort", "exactness"]
// So even if we specify the sort order, it could not be respected because the priority of "sort" is so low.
// So we need to specify the ranking rules to make sure the sort order is respected.
// See https://www.meilisearch.com/docs/learn/core_concepts/relevancy
RankingRules: []string{"sort", // make sure "sort" has the highest priority
"words", "typo", "proximity", "attribute", "exactness"},
SearchableAttributes: []string{
"title",
"content",
"comments",
},
DisplayedAttributes: []string{
"id",
"title",
"content",
"comments",
},
FilterableAttributes: []string{
"repo_id",
"is_public",
"is_pull",
"is_closed",
"label_ids",
"no_label",
"milestone_id",
"project_id",
"project_board_id",
"poster_id",
"assignee_id",
"mention_ids",
"reviewed_ids",
"review_requested_ids",
"subscriber_ids",
"updated_unix",
},
SortableAttributes: []string{
"updated_unix",
"created_unix",
"deadline_unix",
"comment_count",
"id",
},
Pagination: &meilisearch.Pagination{
MaxTotalHits: maxTotalHits,
},
}
inner := inner_meilisearch.NewIndexer(url, apiKey, indexerName, issueIndexerLatestVersion, settings)
indexer := &Indexer{
inner: inner,
Indexer: inner,
}
return indexer
}
// Index will save the index data
func (b *Indexer) Index(_ context.Context, issues ...*internal.IndexerData) error {
if len(issues) == 0 {
return nil
}
for _, issue := range issues {
_, err := b.inner.Client.Index(b.inner.VersionedIndexName()).AddDocuments(issue)
if err != nil {
return err
}
}
// TODO: bulk send index data
return nil
}
// Delete deletes indexes by ids
func (b *Indexer) Delete(_ context.Context, ids ...int64) error {
if len(ids) == 0 {
return nil
}
for _, id := range ids {
_, err := b.inner.Client.Index(b.inner.VersionedIndexName()).DeleteDocument(strconv.FormatInt(id, 10))
if err != nil {
return err
}
}
// TODO: bulk send deletes
return nil
}
// Search searches for issues by given conditions.
// Returns the matching issue IDs
func (b *Indexer) Search(ctx context.Context, options *internal.SearchOptions) (*internal.SearchResult, error) {
query := inner_meilisearch.FilterAnd{}
if len(options.RepoIDs) > 0 {
q := &inner_meilisearch.FilterOr{}
q.Or(inner_meilisearch.NewFilterIn("repo_id", options.RepoIDs...))
if options.AllPublic {
q.Or(inner_meilisearch.NewFilterEq("is_public", true))
}
query.And(q)
}
if options.IsPull.Has() {
query.And(inner_meilisearch.NewFilterEq("is_pull", options.IsPull.Value()))
}
if options.IsClosed.Has() {
query.And(inner_meilisearch.NewFilterEq("is_closed", options.IsClosed.Value()))
}
if options.NoLabelOnly {
query.And(inner_meilisearch.NewFilterEq("no_label", true))
} else {
if len(options.IncludedLabelIDs) > 0 {
q := &inner_meilisearch.FilterAnd{}
for _, labelID := range options.IncludedLabelIDs {
q.And(inner_meilisearch.NewFilterEq("label_ids", labelID))
}
query.And(q)
} else if len(options.IncludedAnyLabelIDs) > 0 {
query.And(inner_meilisearch.NewFilterIn("label_ids", options.IncludedAnyLabelIDs...))
}
if len(options.ExcludedLabelIDs) > 0 {
q := &inner_meilisearch.FilterAnd{}
for _, labelID := range options.ExcludedLabelIDs {
q.And(inner_meilisearch.NewFilterNot(inner_meilisearch.NewFilterEq("label_ids", labelID)))
}
query.And(q)
}
}
if len(options.MilestoneIDs) > 0 {
query.And(inner_meilisearch.NewFilterIn("milestone_id", options.MilestoneIDs...))
}
if options.ProjectID != nil {
query.And(inner_meilisearch.NewFilterEq("project_id", *options.ProjectID))
}
if options.ProjectBoardID != nil {
query.And(inner_meilisearch.NewFilterEq("project_board_id", *options.ProjectBoardID))
}
if options.PosterID != nil {
query.And(inner_meilisearch.NewFilterEq("poster_id", *options.PosterID))
}
if options.AssigneeID != nil {
query.And(inner_meilisearch.NewFilterEq("assignee_id", *options.AssigneeID))
}
if options.MentionID != nil {
query.And(inner_meilisearch.NewFilterEq("mention_ids", *options.MentionID))
}
if options.ReviewedID != nil {
query.And(inner_meilisearch.NewFilterEq("reviewed_ids", *options.ReviewedID))
}
if options.ReviewRequestedID != nil {
query.And(inner_meilisearch.NewFilterEq("review_requested_ids", *options.ReviewRequestedID))
}
if options.SubscriberID != nil {
query.And(inner_meilisearch.NewFilterEq("subscriber_ids", *options.SubscriberID))
}
if options.UpdatedAfterUnix != nil {
query.And(inner_meilisearch.NewFilterGte("updated_unix", *options.UpdatedAfterUnix))
}
if options.UpdatedBeforeUnix != nil {
query.And(inner_meilisearch.NewFilterLte("updated_unix", *options.UpdatedBeforeUnix))
}
if options.SortBy == "" {
options.SortBy = internal.SortByCreatedAsc
}
sortBy := []string{
parseSortBy(options.SortBy),
"id:desc",
}
skip, limit := indexer_internal.ParsePaginator(options.Paginator, maxTotalHits)
searchRes, err := b.inner.Client.Index(b.inner.VersionedIndexName()).Search(options.Keyword, &meilisearch.SearchRequest{
Filter: query.Statement(),
Limit: int64(limit),
Offset: int64(skip),
Sort: sortBy,
MatchingStrategy: "all",
})
if err != nil {
return nil, err
}
hits, err := nonFuzzyWorkaround(searchRes, options.Keyword, options.IsFuzzyKeyword)
if err != nil {
return nil, err
}
return &internal.SearchResult{
Total: searchRes.EstimatedTotalHits,
Hits: hits,
}, nil
}
func parseSortBy(sortBy internal.SortBy) string {
field := strings.TrimPrefix(string(sortBy), "-")
if strings.HasPrefix(string(sortBy), "-") {
return field + ":desc"
}
return field + ":asc"
}
// nonFuzzyWorkaround is needed as meilisearch does not have an exact search
// and you can only change "typo tolerance" per index. So we have to post-filter the results
// https://www.meilisearch.com/docs/learn/configuration/typo_tolerance#configuring-typo-tolerance
// TODO: remove once https://github.com/orgs/meilisearch/discussions/377 is addressed
func nonFuzzyWorkaround(searchRes *meilisearch.SearchResponse, keyword string, isFuzzy bool) ([]internal.Match, error) {
hits := make([]internal.Match, 0, len(searchRes.Hits))
for _, hit := range searchRes.Hits {
hit, ok := hit.(map[string]any)
if !ok {
return nil, ErrMalformedResponse
}
if !isFuzzy {
keyword = strings.ToLower(keyword)
// declare a anon func to check if the title, content or at least one comment contains the keyword
found, err := func() (bool, error) {
// check if title match first
title, ok := hit["title"].(string)
if !ok {
return false, ErrMalformedResponse
} else if strings.Contains(strings.ToLower(title), keyword) {
return true, nil
}
// check if content has a match
content, ok := hit["content"].(string)
if !ok {
return false, ErrMalformedResponse
} else if strings.Contains(strings.ToLower(content), keyword) {
return true, nil
}
// now check for each comment if one has a match
// so we first try to cast and skip if there are no comments
comments, ok := hit["comments"].([]any)
if !ok {
return false, ErrMalformedResponse
} else if len(comments) == 0 {
return false, nil
}
// now we iterate over all and report as soon as we detect one match
for i := range comments {
comment, ok := comments[i].(string)
if !ok {
return false, ErrMalformedResponse
}
if strings.Contains(strings.ToLower(comment), keyword) {
return true, nil
}
}
// we got no match
return false, nil
}()
if err != nil {
return nil, err
} else if !found {
continue
}
}
issueID, ok := hit["id"].(float64)
if !ok {
return nil, ErrMalformedResponse
}
hits = append(hits, internal.Match{
ID: int64(issueID),
})
}
return hits, nil
}