Skip to content

Commit

Permalink
MB-61640: Fuzzy Dynamic Scoring
Browse files Browse the repository at this point in the history
 - Removed levenshtein distance calculation from bleve
 - Propagated edit distances from vellum
 - Removed all changes for wildcard
  • Loading branch information
Likith101 committed Aug 9, 2024
1 parent 1c0f49e commit 4fa2c95
Show file tree
Hide file tree
Showing 3 changed files with 67 additions and 3 deletions.
59 changes: 59 additions & 0 deletions index_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2953,3 +2953,62 @@ func TestCopyIndex(t *testing.T) {
}
}
}

// TestFuzzyScoring indexes the terms "ab", "abc" and "abcd", runs a fuzzy
// query for "ab" with fuzziness 2, and asserts that the hit at (0-based)
// rank i scores exactly maxScore/(i+1), where maxScore is the score of the
// top hit.
func TestFuzzyScoring(t *testing.T) {
	tmpIndexPath := createTmpIndexPath(t)
	defer cleanupTmpIndexPath(t, tmpIndexPath)

	mp := NewIndexMapping()
	mp.DefaultAnalyzer = "simple"
	idx, err := New(tmpIndexPath, mp)
	if err != nil {
		t.Fatal(err)
	}
	// Close the index on every exit path (including early t.Fatal calls) so
	// it is released before the deferred temp-path cleanup runs.
	defer func() {
		if cerr := idx.Close(); cerr != nil {
			t.Error(cerr)
		}
	}()

	batch := idx.NewBatch()

	docs := []map[string]interface{}{
		{
			"textField": "ab",
		},
		{
			"textField": "abc",
		},
		{
			"textField": "abcd",
		},
	}

	// Each document uses its own text as its ID.
	for _, doc := range docs {
		err := batch.Index(fmt.Sprintf("%v", doc["textField"]), doc)
		if err != nil {
			t.Fatal(err)
		}
	}

	err = idx.Batch(batch)
	if err != nil {
		t.Fatal(err)
	}

	query := NewFuzzyQuery("ab")
	query.Fuzziness = 2
	searchRequest := NewSearchRequestOptions(query, 10, 0, true)
	res, err := idx.Search(searchRequest)
	if err != nil {
		// Fatal, not Error: res cannot be inspected after a failed search.
		t.Fatal(err)
	}
	if len(res.Hits) == 0 {
		t.Fatal("expected at least one hit, got none")
	}

	maxScore := res.Hits[0].Score

	for i, hit := range res.Hits {
		expected := maxScore / float64(i+1)
		if expected != hit.Score {
			t.Errorf("expected score - %f, got score - %f", expected, hit.Score)
		}
	}
}
9 changes: 6 additions & 3 deletions search/searcher/search_fuzzy.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,11 @@ func NewFuzzySearcher(ctx context.Context, indexReader index.IndexReader, term s
}

var candidates []string
var editDistances []uint8
var dictBytesRead uint64
if fuzzyCandidates != nil {
candidates = fuzzyCandidates.candidates
editDistances = fuzzyCandidates.editDistances
dictBytesRead = fuzzyCandidates.bytesRead
}

Expand All @@ -68,7 +70,7 @@ func NewFuzzySearcher(ctx context.Context, indexReader index.IndexReader, term s
}

return NewMultiTermSearcherBoosted(ctx, indexReader, candidates, field,
boost, fuzzyCandidates.editDistances, options, true)
boost, editDistances, options, true)
}

type fuzzyCandidates struct {
Expand All @@ -95,7 +97,7 @@ func findFuzzyCandidateTerms(indexReader index.IndexReader, term string,
candidates: make([]string, 0),
editDistances: make([]uint8, 0),
}
var reuse []int

// in case of advanced reader implementations directly call
// the levenshtein automaton based iterator to collect the
// candidate terms
Expand All @@ -112,7 +114,7 @@ func findFuzzyCandidateTerms(indexReader index.IndexReader, term string,
tfd, err := fieldDict.Next()
for err == nil && tfd != nil {
rv.candidates = append(rv.candidates, tfd.Term)
rv.editDistances = append(rv.editDistances, tfd.Distance)
rv.editDistances = append(rv.editDistances, tfd.EditDistance)

Check failure on line 117 in search/searcher/search_fuzzy.go

View workflow job for this annotation

GitHub Actions / test (1.20.x, macos-latest)

tfd.EditDistance undefined (type *index.DictEntry has no field or method EditDistance)

Check failure on line 117 in search/searcher/search_fuzzy.go

View workflow job for this annotation

GitHub Actions / test (1.21.x, ubuntu-latest)

tfd.EditDistance undefined (type *index.DictEntry has no field or method EditDistance)

Check failure on line 117 in search/searcher/search_fuzzy.go

View workflow job for this annotation

GitHub Actions / test (1.21.x, macos-latest)

tfd.EditDistance undefined (type *index.DictEntry has no field or method EditDistance)

Check failure on line 117 in search/searcher/search_fuzzy.go

View workflow job for this annotation

GitHub Actions / test (1.22.x, ubuntu-latest)

tfd.EditDistance undefined (type *index.DictEntry has no field or method EditDistance)

Check failure on line 117 in search/searcher/search_fuzzy.go

View workflow job for this annotation

GitHub Actions / test (1.22.x, macos-latest)

tfd.EditDistance undefined (type *index.DictEntry has no field or method EditDistance)
if tooManyClauses(len(rv.candidates)) {
return nil, tooManyClausesErr(field, len(rv.candidates))
}
Expand All @@ -139,6 +141,7 @@ func findFuzzyCandidateTerms(indexReader index.IndexReader, term string,
}()

// enumerate terms and check levenshtein distance
var reuse []int
tfd, err := fieldDict.Next()
for err == nil && tfd != nil {
var ld int
Expand Down
2 changes: 2 additions & 0 deletions search/searcher/search_multi_term.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ func NewMultiTermSearcher(ctx context.Context, indexReader index.IndexReader, te
options, limit)
}

// NewMultiTermSearcherBoosted works like NewMultiTermSearcher, but additionally
// boosts each individual term based on its edit distance from the query term.
func NewMultiTermSearcherBoosted(ctx context.Context, indexReader index.IndexReader, terms []string,
field string, boost float64, editDistances []uint8, options search.SearcherOptions, limit bool) (
search.Searcher, error) {
Expand Down

0 comments on commit 4fa2c95

Please sign in to comment.