Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MB-35347: Synonym Search #2090

Merged
merged 38 commits into from
Dec 19, 2024
Merged
Show file tree
Hide file tree
Changes from 34 commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
f6170c3
real first draft
CascadingRadium Nov 6, 2024
263e990
fix bug
CascadingRadium Nov 6, 2024
c5fc548
small fix the first draft
CascadingRadium Nov 6, 2024
542d34a
glue code for indexing path
CascadingRadium Nov 15, 2024
6627dcb
unit test
CascadingRadium Nov 15, 2024
32c67af
query path first draft
CascadingRadium Nov 18, 2024
784f45b
minor fixes
CascadingRadium Nov 22, 2024
f19cedc
bug fixes and unit tests for single index implementation
CascadingRadium Nov 29, 2024
91be472
remove regex optimization
CascadingRadium Nov 29, 2024
9baf914
add default synonym sources
CascadingRadium Nov 30, 2024
dd692bf
refactor code
CascadingRadium Dec 3, 2024
4d4440e
alias path code
CascadingRadium Dec 3, 2024
7b86533
Presearch Code Refactor
CascadingRadium Dec 6, 2024
d58474f
fix comment
CascadingRadium Dec 6, 2024
3908df3
Add ExtractFields API with unit test
CascadingRadium Dec 6, 2024
31973e0
bug fix
CascadingRadium Dec 6, 2024
2130f3c
final fixes to alias query path
CascadingRadium Dec 9, 2024
5494ea1
Merge branch 'presearchRefactor' into synonyms
CascadingRadium Dec 9, 2024
59c3193
rebase
CascadingRadium Dec 9, 2024
cad9e79
fix bug
CascadingRadium Dec 9, 2024
f3d0ac5
optimization
CascadingRadium Dec 9, 2024
e3b1d5b
bleve APIs
CascadingRadium Dec 9, 2024
b469373
minor fix
CascadingRadium Dec 9, 2024
67815ad
bug fix
CascadingRadium Dec 9, 2024
55f6d4c
make default_synonym_source omitempty
CascadingRadium Dec 10, 2024
1a6dd1e
fix bugs
CascadingRadium Dec 10, 2024
ee71211
refactor bleve APIs
CascadingRadium Dec 10, 2024
a4d83ac
add additional methods to interface
CascadingRadium Dec 10, 2024
1cf2bfd
update interface name
CascadingRadium Dec 11, 2024
302147b
go.mod update
CascadingRadium Dec 12, 2024
2971072
merge master
CascadingRadium Dec 17, 2024
0db25b4
Merge branch 'master' into synonyms
CascadingRadium Dec 17, 2024
e062cd7
reposition
CascadingRadium Dec 17, 2024
ccb4a71
Merge branch 'master' into synonyms
CascadingRadium Dec 18, 2024
41fc99e
refactor
CascadingRadium Dec 19, 2024
0f11d73
minor fix
CascadingRadium Dec 19, 2024
c731844
test fix
CascadingRadium Dec 19, 2024
148d32a
Bump up zap/v16
abhinavdangeti Dec 19, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions analysis/type.go
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,15 @@ type DateTimeParser interface {
ParseDateTime(string) (time.Time, string, error)
}

// SynonymSourceType is the string identifier "synonym".
// NOTE(review): presumably the type name under which synonym sources are
// registered or looked up — confirm against its users.
const SynonymSourceType = "synonym"

// SynonymSourceVisitor is a callback that receives a name together with a
// SynonymSource and may report a failure through its error result.
// NOTE(review): how callers react to a non-nil error is not visible here.
type SynonymSourceVisitor func(name string, item SynonymSource) error

// SynonymSource exposes two string properties of a synonym source: an
// analyzer and a collection.
type SynonymSource interface {
// Analyzer returns a string; presumably the name of the analyzer applied to
// the synonym terms — TODO confirm against implementations.
Analyzer() string
// Collection returns a string; presumably the name of the synonym collection
// this source reads from — TODO confirm against implementations.
Collection() string
CascadingRadium marked this conversation as resolved.
Show resolved Hide resolved
}

// ByteArrayConverter turns a raw byte slice into a Go value.
type ByteArrayConverter interface {
// Convert takes a byte slice and returns the resulting value, or an error
// if the bytes could not be converted.
Convert([]byte) (interface{}, error)
}
15 changes: 15 additions & 0 deletions document/document.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,13 @@ func NewDocument(id string) *Document {
}
}

// NewSynonymDocument creates a Document with the given id and an empty,
// non-nil Fields slice. No other Document field is initialized.
func NewSynonymDocument(id string) *Document {
	doc := &Document{id: id}
	doc.Fields = []Field{}
	return doc
}

func (d *Document) Size() int {
sizeInBytes := reflectStaticSizeDocument + size.SizeOfPtr +
len(d.id)
Expand Down Expand Up @@ -133,3 +140,11 @@ func (d *Document) VisitComposite(visitor index.CompositeFieldVisitor) {
// HasComposite reports whether the document holds at least one composite
// field.
func (d *Document) HasComposite() bool {
return len(d.CompositeFields) > 0
}

// VisitSynonymFields calls visitor once for every entry of d.Fields that
// implements index.SynonymField, in slice order. Fields of any other kind
// are skipped.
func (d *Document) VisitSynonymFields(visitor index.SynonymFieldVisitor) {
	for i := range d.Fields {
		synField, isSynonym := d.Fields[i].(index.SynonymField)
		if !isSynonym {
			continue
		}
		visitor(synField)
	}
}
143 changes: 143 additions & 0 deletions document/field_synonym.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
// Copyright (c) 2024 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package document

import (
"reflect"

"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/size"
index "github.com/blevesearch/bleve_index_api"
)

// reflectStaticSizeSynonymField caches the static (shallow) size in bytes of
// a SynonymField value. It is computed once, in init.
var reflectStaticSizeSynonymField int

func init() {
	reflectStaticSizeSynonymField = int(reflect.TypeOf(SynonymField{}).Size())
}

// DefaultSynonymIndexingOptions is the indexing option set assigned to every
// field created by NewSynonymField: index.IndexField only.
const DefaultSynonymIndexingOptions = index.IndexField

// SynonymField is a document field holding one synonym definition: optional
// input terms, the synonym terms, and the analyzer applied to both.
// The term-to-synonyms map is only built by Analyze, so IterateSynonyms
// yields nothing before Analyze has run.
type SynonymField struct {
// name is the field name returned by Name.
name string
// analyzer is applied to every input and synonym term in Analyze.
analyzer analysis.Analyzer
// options are the indexing options returned by Options.
options index.FieldIndexingOptions
// input holds the raw input terms; may be empty.
input []string
// synonyms holds the raw synonym terms.
synonyms []string
// numPlainTextBytes is returned by NumPlainTextBytes.
// NOTE(review): nothing in the visible code assigns it, so it stays zero
// unless set elsewhere — confirm.
numPlainTextBytes uint64

// synonymMap maps each analyzed term to its analyzed synonyms; it is
// populated during analysis (see Analyze).
synonymMap map[string][]string
}

// Size returns an estimate of the field's memory footprint in bytes: the
// static struct size, one pointer, and the length of the field name.
// NOTE(review): the input and synonyms slices and synonymMap are not part of
// the estimate — confirm that this is intentional.
func (s *SynonymField) Size() int {
return reflectStaticSizeSynonymField + size.SizeOfPtr +
len(s.name)
}

// Name returns the name of the field.
func (s *SynonymField) Name() string {
return s.name
}

// ArrayPositions always returns nil for a synonym field.
func (s *SynonymField) ArrayPositions() []uint64 {
return nil
}

// Options returns the indexing options stored on the field.
func (s *SynonymField) Options() index.FieldIndexingOptions {
return s.options
}

// NumPlainTextBytes returns the stored numPlainTextBytes counter.
// NOTE(review): the visible code never assigns this counter, so the result
// is zero unless it is set elsewhere — confirm.
func (s *SynonymField) NumPlainTextBytes() uint64 {
return s.numPlainTextBytes
}

// AnalyzedLength always returns 0 for a synonym field.
func (s *SynonymField) AnalyzedLength() int {
return 0
}

// EncodedFieldType returns the single-byte type tag of synonym fields, 'y'.
func (s *SynonymField) EncodedFieldType() byte {
return 'y'
}

// AnalyzedTokenFrequencies always returns nil for a synonym field; the
// analyzed data is exposed through IterateSynonyms instead.
func (s *SynonymField) AnalyzedTokenFrequencies() index.TokenFrequencies {
return nil
}

// Analyze runs the field's analyzer over every input term and every synonym
// term, then builds synonymMap from the analyzed terms. When the field has
// no input terms, a nil input slice is handed to processSynonymData.
func (s *SynonymField) Analyze() {
	var inputTerms []string
	if n := len(s.input); n > 0 {
		inputTerms = make([]string, n)
		for i := range s.input {
			inputTerms[i] = analyzeSynonymTerm(s.input[i], s.analyzer)
		}
	}

	synonymTerms := make([]string, len(s.synonyms))
	for i := range s.synonyms {
		synonymTerms[i] = analyzeSynonymTerm(s.synonyms[i], s.analyzer)
	}

	s.synonymMap = processSynonymData(inputTerms, synonymTerms)
}

// Value always returns nil for a synonym field.
func (s *SynonymField) Value() []byte {
return nil
}

// IterateSynonyms calls visitor once per entry of synonymMap, passing the
// term and its synonym list. Entries are visited in map iteration order,
// which is unspecified. Nothing is visited before Analyze has populated
// the map.
func (s *SynonymField) IterateSynonyms(visitor func(term string, synonyms []string)) {
	for word, equivalents := range s.synonymMap {
		visitor(word, equivalents)
	}
}

// NewSynonymField builds a SynonymField with the given name, analyzer, input
// terms and synonym terms, using DefaultSynonymIndexingOptions. The input and
// synonyms slices are stored as passed (not copied). The synonym map stays
// empty until Analyze is called.
func NewSynonymField(name string, analyzer analysis.Analyzer, input []string, synonyms []string) *SynonymField {
	field := new(SynonymField)
	field.name = name
	field.analyzer = analyzer
	field.options = DefaultSynonymIndexingOptions
	field.input = input
	field.synonyms = synonyms
	return field
}

// processSynonymData builds the term -> synonyms map for a synonym
// definition.
//
// If input is non-empty the mapping is unidirectional: every input term maps
// to the synonyms slice itself (all entries share that one slice, it is not
// copied).
//
// If input is empty the mapping is bidirectional: every distinct synonym maps
// to all the other distinct synonyms, in first-occurrence order. Repeated
// entries in synonyms (for example terms that collapse to the same token
// after analysis) are de-duplicated first, so a term is never listed as its
// own synonym and no list contains repeats.
//
// The returned map is never nil.
func processSynonymData(input []string, synonyms []string) map[string][]string {
	if len(input) > 0 {
		// Map each term to the same list of synonyms.
		synonymMap := make(map[string][]string, len(input))
		for _, term := range input {
			synonymMap[term] = synonyms
		}
		return synonymMap
	}

	// Drop repeated synonyms while keeping the original ordering.
	unique := make([]string, 0, len(synonyms))
	seen := make(map[string]struct{}, len(synonyms))
	for _, syn := range synonyms {
		if _, dup := seen[syn]; dup {
			continue
		}
		seen[syn] = struct{}{}
		unique = append(unique, syn)
	}

	// Precompute a map where each synonym points to all other synonyms.
	synonymMap := make(map[string][]string, len(unique))
	for i, term := range unique {
		others := make([]string, 0, len(unique)-1)
		others = append(others, unique[:i]...)
		others = append(others, unique[i+1:]...)
		synonymMap[term] = others
	}
	return synonymMap
}

// analyzeSynonymTerm runs analyzer over term and returns the text of the
// first token produced. Any further tokens are ignored. If the analyzer
// yields no tokens at all, the original term is returned unchanged.
func analyzeSynonymTerm(term string, analyzer analysis.Analyzer) string {
	if tokens := analyzer.Analyze([]byte(term)); len(tokens) > 0 {
		return string(tokens[0].Term)
	}
	return term
}
2 changes: 2 additions & 0 deletions error.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ const (
ErrorEmptyID
ErrorIndexReadInconsistency
ErrorTwoPhaseSearchInconsistency
ErrorSynonymSearchNotSupported
)

// Error represents a more strongly typed bleve error for detecting
Expand All @@ -49,4 +50,5 @@ var errorMessages = map[Error]string{
ErrorEmptyID: "document ID cannot be empty",
ErrorIndexReadInconsistency: "index read inconsistency detected",
ErrorTwoPhaseSearchInconsistency: "2-phase search failed, likely due to an overlapping topology change",
ErrorSynonymSearchNotSupported: "synonym search not supported",
}
6 changes: 3 additions & 3 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,20 @@ go 1.21
require (
github.com/RoaringBitmap/roaring v1.9.3
github.com/bits-and-blooms/bitset v1.12.0
github.com/blevesearch/bleve_index_api v1.1.13
github.com/blevesearch/bleve_index_api v1.2.0
github.com/blevesearch/geo v0.1.20
github.com/blevesearch/go-faiss v1.0.24
github.com/blevesearch/go-metrics v0.0.0-20201227073835-cf1acfcdf475
github.com/blevesearch/go-porterstemmer v1.0.3
github.com/blevesearch/goleveldb v1.0.1
github.com/blevesearch/gtreap v0.1.1
github.com/blevesearch/scorch_segment_api/v2 v2.2.16
github.com/blevesearch/scorch_segment_api/v2 v2.3.0
github.com/blevesearch/segment v0.9.1
github.com/blevesearch/snowball v0.6.1
github.com/blevesearch/snowballstem v0.9.0
github.com/blevesearch/stempel v0.2.0
github.com/blevesearch/upsidedown_store_api v1.0.2
github.com/blevesearch/vellum v1.0.11
github.com/blevesearch/vellum v1.1.0
github.com/blevesearch/zapx/v11 v11.3.10
github.com/blevesearch/zapx/v12 v12.3.10
github.com/blevesearch/zapx/v13 v13.3.10
Expand Down
12 changes: 6 additions & 6 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ github.com/RoaringBitmap/roaring v1.9.3 h1:t4EbC5qQwnisr5PrP9nt0IRhRTb9gMUgQF4t4
github.com/RoaringBitmap/roaring v1.9.3/go.mod h1:6AXUsoIEzDTFFQCe1RbGA6uFONMhvejWj5rqITANK90=
github.com/bits-and-blooms/bitset v1.12.0 h1:U/q1fAF7xXRhFCrhROzIfffYnu+dlS38vCZtmFVPHmA=
github.com/bits-and-blooms/bitset v1.12.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
github.com/blevesearch/bleve_index_api v1.1.13 h1:+nrA6oRJr85aCPyqaeZtsruObwKojutfonHJin/BP48=
github.com/blevesearch/bleve_index_api v1.1.13/go.mod h1:PbcwjIcRmjhGbkS/lJCpfgVSMROV6TRubGGAODaK1W8=
github.com/blevesearch/bleve_index_api v1.2.0 h1:/DXMMWBwx/UmGKM1xDhTwDoJI5yQrG6rqRWPFcOgUVo=
github.com/blevesearch/bleve_index_api v1.2.0/go.mod h1:PbcwjIcRmjhGbkS/lJCpfgVSMROV6TRubGGAODaK1W8=
github.com/blevesearch/geo v0.1.20 h1:paaSpu2Ewh/tn5DKn/FB5SzvH0EWupxHEIwbCk/QPqM=
github.com/blevesearch/geo v0.1.20/go.mod h1:DVG2QjwHNMFmjo+ZgzrIq2sfCh6rIHzy9d9d0B59I6w=
github.com/blevesearch/go-faiss v1.0.24 h1:K79IvKjoKHdi7FdiXEsAhxpMuns0x4fM0BO93bW5jLI=
Expand All @@ -19,8 +19,8 @@ github.com/blevesearch/gtreap v0.1.1/go.mod h1:QaQyDRAT51sotthUWAH4Sj08awFSSWzgY
github.com/blevesearch/mmap-go v1.0.2/go.mod h1:ol2qBqYaOUsGdm7aRMRrYGgPvnwLe6Y+7LMvAB5IbSA=
github.com/blevesearch/mmap-go v1.0.4 h1:OVhDhT5B/M1HNPpYPBKIEJaD0F3Si+CrEKULGCDPWmc=
github.com/blevesearch/mmap-go v1.0.4/go.mod h1:EWmEAOmdAS9z/pi/+Toxu99DnsbhG1TIxUoRmJw/pSs=
github.com/blevesearch/scorch_segment_api/v2 v2.2.16 h1:uGvKVvG7zvSxCwcm4/ehBa9cCEuZVE+/zvrSl57QUVY=
github.com/blevesearch/scorch_segment_api/v2 v2.2.16/go.mod h1:VF5oHVbIFTu+znY1v30GjSpT5+9YFs9dV2hjvuh34F0=
github.com/blevesearch/scorch_segment_api/v2 v2.3.0 h1:vxCjbXAkkEBSb4AB3Iqgr/EJcPyYRsiGxpcvsS8E1Dw=
github.com/blevesearch/scorch_segment_api/v2 v2.3.0/go.mod h1:5y+TgXYSx+xJGaCwSlvy9G/UJBIY5wzvIkhvhBm2ATc=
github.com/blevesearch/segment v0.9.1 h1:+dThDy+Lvgj5JMxhmOVlgFfkUtZV2kw49xax4+jTfSU=
github.com/blevesearch/segment v0.9.1/go.mod h1:zN21iLm7+GnBHWTao9I+Au/7MBiL8pPFtJBJTsk6kQw=
github.com/blevesearch/snowball v0.6.1 h1:cDYjn/NCH+wwt2UdehaLpr2e4BwLIjN4V/TdLsL+B5A=
Expand All @@ -31,8 +31,8 @@ github.com/blevesearch/stempel v0.2.0 h1:CYzVPaScODMvgE9o+kf6D4RJ/VRomyi9uHF+PtB
github.com/blevesearch/stempel v0.2.0/go.mod h1:wjeTHqQv+nQdbPuJ/YcvOjTInA2EIc6Ks1FoSUzSLvc=
github.com/blevesearch/upsidedown_store_api v1.0.2 h1:U53Q6YoWEARVLd1OYNc9kvhBMGZzVrdmaozG2MfoB+A=
github.com/blevesearch/upsidedown_store_api v1.0.2/go.mod h1:M01mh3Gpfy56Ps/UXHjEO/knbqyQ1Oamg8If49gRwrQ=
github.com/blevesearch/vellum v1.0.11 h1:SJI97toEFTtA9WsDZxkyGTaBWFdWl1n2LEDCXLCq/AU=
github.com/blevesearch/vellum v1.0.11/go.mod h1:QgwWryE8ThtNPxtgWJof5ndPfx0/YMBh+W2weHKPw8Y=
github.com/blevesearch/vellum v1.1.0 h1:CinkGyIsgVlYf8Y2LUQHvdelgXr6PYuvoDIajq6yR9w=
github.com/blevesearch/vellum v1.1.0/go.mod h1:QgwWryE8ThtNPxtgWJof5ndPfx0/YMBh+W2weHKPw8Y=
github.com/blevesearch/zapx/v11 v11.3.10 h1:hvjgj9tZ9DeIqBCxKhi70TtSZYMdcFn7gDb71Xo/fvk=
github.com/blevesearch/zapx/v11 v11.3.10/go.mod h1:0+gW+FaE48fNxoVtMY5ugtNHHof/PxCqh7CnhYdnMzQ=
github.com/blevesearch/zapx/v12 v12.3.10 h1:yHfj3vXLSYmmsBleJFROXuO08mS3L1qDCdDK81jDl8s=
Expand Down
63 changes: 63 additions & 0 deletions index.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ package bleve

import (
"context"
"fmt"

"github.com/blevesearch/bleve/v2/index/upsidedown"

Expand Down Expand Up @@ -63,6 +64,36 @@ func (b *Batch) Index(id string, data interface{}) error {
return nil
}

// IndexSynonym adds a synonym definition to the batch under the given id, as
// part of the named synonym collection.
//
// It returns ErrorEmptyID when id is empty, ErrorSynonymSearchNotSupported
// when the index mapping does not implement mapping.SynonymMapping, and an
// error when definition is nil, fails Validate, or cannot be mapped to a
// synonym document. On success the synonym document is queued on the
// internal batch and the batch size counters are updated.
func (b *Batch) IndexSynonym(id string, collection string, definition *SynonymDefinition) error {
	if id == "" {
		return ErrorEmptyID
	}
	if eventIndex, ok := b.index.(index.EventIndex); ok {
		eventIndex.FireIndexEvent()
	}
	synMap, ok := b.index.Mapping().(mapping.SynonymMapping)
	if !ok {
		return ErrorSynonymSearchNotSupported
	}

	// Reject a nil definition explicitly: its Input and Synonyms fields are
	// read below, which would otherwise panic on a nil pointer.
	if definition == nil {
		return fmt.Errorf("synonym definition cannot be nil")
	}
	if err := definition.Validate(); err != nil {
		return err
	}

	doc := document.NewSynonymDocument(id)
	err := synMap.MapSynonymDocument(doc, collection, definition.Input, definition.Synonyms)
	if err != nil {
		return err
	}
	b.internal.Update(doc)

	b.lastDocSize = uint64(doc.Size() +
		len(id) + size.SizeOfString) // overhead from internal
	b.totalSize += b.lastDocSize

	return nil
}

// LastDocSize returns the size recorded for the document most recently added
// to the batch (the document's Size plus per-entry overhead).
func (b *Batch) LastDocSize() uint64 {
return b.lastDocSize
}
Expand Down Expand Up @@ -323,3 +354,35 @@ type IndexCopyable interface {
// FileSystemDirectory is the default implementation for the
// index.Directory interface.
type FileSystemDirectory string

// SynonymDefinition represents a synonym mapping in Bleve.
// Each instance associates one or more input terms with a list of synonyms,
// defining how terms are treated as equivalent in searches.
type SynonymDefinition struct {
	// Input is an optional list of terms for unidirectional synonym mapping.
	// When terms are specified in Input, they will map to the terms in Synonyms,
	// making the relationship unidirectional (each Input maps to all Synonyms).
	// If Input is omitted, the relationship is bidirectional among all Synonyms.
	Input []string `json:"input,omitempty"`

	// Synonyms is a list of terms that are considered equivalent.
	// If Input is specified, each term in Input will map to each term in Synonyms.
	// If Input is not specified, the Synonyms list will be treated bidirectionally,
	// meaning each term in Synonyms is treated as synonymous with all others.
	Synonyms []string `json:"synonyms"`
}

// Validate checks that the definition is usable for indexing.
// It returns an error when the receiver is nil or when Synonyms is empty,
// and nil otherwise. Input is optional and is not checked.
func (sd *SynonymDefinition) Validate() error {
	// A nil receiver is reported as an error rather than panicking on the
	// field access below.
	if sd == nil {
		return fmt.Errorf("synonym definition cannot be nil")
	}
	if len(sd.Synonyms) == 0 {
		return fmt.Errorf("synonym definition must have at least one synonym")
	}
	return nil
}

// SynonymIndex is an Index that can also index synonym definitions alongside
// regular documents. Synonyms are grouped by collection name and define term
// relationships used for query expansion in searches.
type SynonymIndex interface {
Index
// IndexSynonym indexes a synonym definition under the specified id, as part
// of the specified collection. It returns an error if the definition could
// not be indexed.
IndexSynonym(id string, collection string, definition *SynonymDefinition) error
}
Loading
Loading