Skip to content

Commit

Permalink
MB-61640: Fuzzy Dynamic Scoring
Browse files Browse the repository at this point in the history
 - Added new implementations of certain functions to allow passing of edit distances per term
 - Propagated edit distances from vellum
 - Boosting each term based on the inverse of its fuzziness
  • Loading branch information
Likith101 committed Oct 22, 2024
1 parent fab6e1e commit 6c536a8
Show file tree
Hide file tree
Showing 3 changed files with 121 additions and 5 deletions.
59 changes: 59 additions & 0 deletions index_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2953,3 +2953,62 @@ func TestCopyIndex(t *testing.T) {
}
}
}

// TestFuzzyScoring indexes the terms "ab", "abc" and "abcd", runs a fuzzy
// query for "ab" with fuzziness 2, and checks that the i-th hit (0-based)
// scores maxScore/(i+1), where maxScore is the score of the first hit.
func TestFuzzyScoring(t *testing.T) {
	tmpIndexPath := createTmpIndexPath(t)
	defer cleanupTmpIndexPath(t, tmpIndexPath)

	mp := NewIndexMapping()
	mp.DefaultAnalyzer = "simple"
	idx, err := New(tmpIndexPath, mp)
	if err != nil {
		t.Fatal(err)
	}
	// Close the index on every exit path (including early t.Fatal calls) so
	// the temporary directory can be removed by the cleanup deferred above.
	defer func() {
		if cerr := idx.Close(); cerr != nil {
			t.Error(cerr)
		}
	}()

	batch := idx.NewBatch()

	docs := []map[string]interface{}{
		{
			"textField": "ab",
		},
		{
			"textField": "abc",
		},
		{
			"textField": "abcd",
		},
	}

	// Each document is stored under an ID equal to its text value.
	for _, doc := range docs {
		err := batch.Index(fmt.Sprintf("%v", doc["textField"]), doc)
		if err != nil {
			t.Fatal(err)
		}
	}

	err = idx.Batch(batch)
	if err != nil {
		t.Fatal(err)
	}

	query := NewFuzzyQuery("ab")
	query.Fuzziness = 2
	searchRequest := NewSearchRequestOptions(query, 10, 0, true)
	res, err := idx.Search(searchRequest)
	if err != nil {
		// Must be fatal: res is not usable when the search failed.
		t.Fatal(err)
	}

	// Guard the res.Hits[0] access below and make a missing hit an explicit
	// failure instead of an index-out-of-range panic.
	if len(res.Hits) != len(docs) {
		t.Fatalf("expected %d hits, got %d", len(docs), len(res.Hits))
	}

	maxScore := res.Hits[0].Score

	// Scores are floating point; allow a tiny tolerance rather than
	// requiring bit-exact equality.
	const tolerance = 1e-9
	for i, hit := range res.Hits {
		expected := maxScore / float64(i+1)
		diff := expected - hit.Score
		if diff < 0 {
			diff = -diff
		}
		if diff > tolerance {
			t.Errorf("expected score - %f, got score - %f", expected, hit.Score)
		}
	}
}
16 changes: 11 additions & 5 deletions search/searcher/search_fuzzy.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,11 @@ func NewFuzzySearcher(ctx context.Context, indexReader index.IndexReader, term s
}

var candidates []string
var editDistances []uint8
var dictBytesRead uint64
if fuzzyCandidates != nil {
candidates = fuzzyCandidates.candidates
editDistances = fuzzyCandidates.editDistances
dictBytesRead = fuzzyCandidates.bytesRead
}

Expand All @@ -67,13 +69,14 @@ func NewFuzzySearcher(ctx context.Context, indexReader index.IndexReader, term s
}
}

return NewMultiTermSearcher(ctx, indexReader, candidates, field,
boost, options, true)
return NewMultiTermSearcherBoosted(ctx, indexReader, candidates, field,
boost, editDistances, options, true)
}

type fuzzyCandidates struct {
candidates []string
bytesRead uint64
candidates []string
editDistances []uint8
bytesRead uint64
}

func reportIOStats(ctx context.Context, bytesRead uint64) {
Expand All @@ -91,7 +94,8 @@ func reportIOStats(ctx context.Context, bytesRead uint64) {
func findFuzzyCandidateTerms(indexReader index.IndexReader, term string,
fuzziness int, field, prefixTerm string) (rv *fuzzyCandidates, err error) {
rv = &fuzzyCandidates{
candidates: make([]string, 0),
candidates: make([]string, 0),
editDistances: make([]uint8, 0),
}

// in case of advanced reader implementations directly call
Expand All @@ -110,6 +114,7 @@ func findFuzzyCandidateTerms(indexReader index.IndexReader, term string,
tfd, err := fieldDict.Next()
for err == nil && tfd != nil {
rv.candidates = append(rv.candidates, tfd.Term)
rv.editDistances = append(rv.editDistances, tfd.EditDistance)

Check failure on line 117 in search/searcher/search_fuzzy.go

View workflow job for this annotation

GitHub Actions / test (1.20.x, ubuntu-latest)

tfd.EditDistance undefined (type *index.DictEntry has no field or method EditDistance)

Check failure on line 117 in search/searcher/search_fuzzy.go

View workflow job for this annotation

GitHub Actions / test (1.20.x, macos-latest)

tfd.EditDistance undefined (type *index.DictEntry has no field or method EditDistance)

Check failure on line 117 in search/searcher/search_fuzzy.go

View workflow job for this annotation

GitHub Actions / test (1.20.x, windows-latest)

tfd.EditDistance undefined (type *index.DictEntry has no field or method EditDistance)

Check failure on line 117 in search/searcher/search_fuzzy.go

View workflow job for this annotation

GitHub Actions / test (1.21.x, ubuntu-latest)

tfd.EditDistance undefined (type *index.DictEntry has no field or method EditDistance)

Check failure on line 117 in search/searcher/search_fuzzy.go

View workflow job for this annotation

GitHub Actions / test (1.21.x, macos-latest)

tfd.EditDistance undefined (type *index.DictEntry has no field or method EditDistance)

Check failure on line 117 in search/searcher/search_fuzzy.go

View workflow job for this annotation

GitHub Actions / test (1.22.x, ubuntu-latest)

tfd.EditDistance undefined (type *index.DictEntry has no field or method EditDistance)

Check failure on line 117 in search/searcher/search_fuzzy.go

View workflow job for this annotation

GitHub Actions / test (1.22.x, macos-latest)

tfd.EditDistance undefined (type *index.DictEntry has no field or method EditDistance)
if tooManyClauses(len(rv.candidates)) {
return nil, tooManyClausesErr(field, len(rv.candidates))
}
Expand Down Expand Up @@ -144,6 +149,7 @@ func findFuzzyCandidateTerms(indexReader index.IndexReader, term string,
ld, exceeded, reuse = search.LevenshteinDistanceMaxReuseSlice(term, tfd.Term, fuzziness, reuse)
if !exceeded && ld <= fuzziness {
rv.candidates = append(rv.candidates, tfd.Term)
rv.editDistances = append(rv.editDistances, uint8(ld))
if tooManyClauses(len(rv.candidates)) {
return nil, tooManyClausesErr(field, len(rv.candidates))
}
Expand Down
51 changes: 51 additions & 0 deletions search/searcher/search_multi_term.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,31 @@ func NewMultiTermSearcher(ctx context.Context, indexReader index.IndexReader, te
options, limit)
}

// NewMultiTermSearcherBoosted builds a searcher over the given terms in the
// same manner as NewMultiTermSearcher, except that the per-term searchers are
// created through makeBatchSearchersBoosted, which receives editDistances
// alongside the terms.
//
// When the number of terms trips tooManyClauses, the call is either handed to
// optimizeMultiTermSearcher (if the options allow the disjunction
// optimization; editDistances are not forwarded on that path) or, when limit
// is set, rejected with tooManyClausesErr.
func NewMultiTermSearcherBoosted(ctx context.Context, indexReader index.IndexReader, terms []string,
	field string, boost float64, editDistances []uint8, options search.SearcherOptions, limit bool) (
	search.Searcher, error) {

	if tooManyClauses(len(terms)) {
		switch {
		case optionsDisjunctionOptimizable(options):
			return optimizeMultiTermSearcher(ctx, indexReader, terms, field, boost, options)
		case limit:
			return nil, tooManyClausesErr(field, len(terms))
		}
	}

	// One searcher per term, each carrying its own edit distance.
	termSearchers, err := makeBatchSearchersBoosted(ctx, indexReader, terms, field, boost,
		editDistances, options)
	if err != nil {
		return nil, err
	}

	// Combine the per-term searchers into a single disjunction.
	return newMultiTermSearcherInternal(ctx, indexReader, termSearchers, field, boost,
		options, limit)
}

func NewMultiTermSearcherBytes(ctx context.Context, indexReader index.IndexReader, terms [][]byte,
field string, boost float64, options search.SearcherOptions, limit bool) (
search.Searcher, error) {
Expand Down Expand Up @@ -151,6 +176,32 @@ func makeBatchSearchers(ctx context.Context, indexReader index.IndexReader, term
return qsearchers, nil
}

// makeBatchSearchersBoosted creates one term searcher per entry in terms.
// The boost handed to the i-th searcher is boost * 1/(editDistances[i]+1), so
// a term with edit distance 0 keeps the full boost and larger distances are
// scaled down.
//
// If editDistances is nil, or has no entry for a given term, that term's
// boost is left unscaled (multiplier 1).
//
// On any error, every searcher created so far is closed and a nil slice is
// returned together with the error.
func makeBatchSearchersBoosted(ctx context.Context, indexReader index.IndexReader, terms []string, field string,
	boost float64, editDistances []uint8, options search.SearcherOptions) ([]search.Searcher, error) {

	qsearchers := make([]search.Searcher, len(terms))
	qsearchersClose := func() {
		for _, searcher := range qsearchers {
			if searcher != nil {
				// Best-effort cleanup; the original error is what gets returned.
				_ = searcher.Close()
			}
		}
	}
	for i, term := range terms {
		// Default to a neutral multiplier so a missing edit distance never
		// zeroes out the boost.
		editMultiplier := 1.0
		if i < len(editDistances) {
			// Convert before adding 1: in uint8 arithmetic 255+1 wraps to 0,
			// which would turn the multiplier into +Inf.
			editMultiplier = 1 / (float64(editDistances[i]) + 1)
		}
		var err error
		qsearchers[i], err = NewTermSearcher(ctx, indexReader, term, field, boost*editMultiplier, options)
		if err != nil {
			qsearchersClose()
			return nil, err
		}
	}
	return qsearchers, nil
}

func optimizeMultiTermSearcherBytes(ctx context.Context, indexReader index.IndexReader, terms [][]byte,
field string, boost float64, options search.SearcherOptions) (
search.Searcher, error) {
Expand Down

0 comments on commit 6c536a8

Please sign in to comment.