Skip to content

Commit

Permalink
MB-35347: Synonym Search (#2090)
Browse files Browse the repository at this point in the history
- Allow setting up `synonym_sources` in the index mapping, which will
follow its own ingest pipeline, ingesting special synonym definitions
using the `IndexSynonym()` API.
- A `synonym_source` can be set like an analyzer to a field mapping and
can be set as a default option at the document mapping or the index
mapping level.
- Each `synonym_source` can have its own analyzer, making it flexible to
allow for compatibility with the language analyzer specified for its
corresponding mapping.
- Compatibility with every term-based query where the term gets expanded
to include its synonyms at query time.
- Dependencies:
- blevesearch/bleve_index_api@v1.2.0 -
blevesearch/bleve_index_api#57
- blevesearch/scorch_segment_api@v2.3.0 -
blevesearch/scorch_segment_api#46
- blevesearch/vellum@v1.1.0 -
blevesearch/vellum#22
- blevesearch/zapx@v16@latest -
blevesearch/zapx#268

---------

Co-authored-by: Abhinav Dangeti <[email protected]>
  • Loading branch information
CascadingRadium and abhinavdangeti authored Dec 19, 2024
1 parent 3e63d1c commit 77458c4
Show file tree
Hide file tree
Showing 33 changed files with 2,333 additions and 79 deletions.
9 changes: 9 additions & 0 deletions analysis/type.go
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,15 @@ type DateTimeParser interface {
ParseDateTime(string) (time.Time, string, error)
}

// SynonymSourceType is the type identifier string for synonym sources.
// NOTE(review): presumably used as the type name when registering or
// configuring a synonym source — confirm against callers.
const SynonymSourceType = "synonym"

// SynonymSourceVisitor is a callback that receives the name of a synonym
// source together with the source itself, and may return an error.
// NOTE(review): how a non-nil error affects the traversal is decided by the
// caller and is not visible here.
type SynonymSourceVisitor func(name string, item SynonymSource) error

// SynonymSource describes a configured source of synonym definitions.
type SynonymSource interface {
// Analyzer returns a string identifying the analyzer associated with this
// source (presumably the analyzer's registered name — TODO confirm).
Analyzer() string
// Collection returns a string identifying the synonym collection this
// source draws from (presumably the collection name — TODO confirm).
Collection() string
}

// ByteArrayConverter turns a raw byte slice into a value.
type ByteArrayConverter interface {
// Convert takes a byte slice and returns a value or an error. The
// concrete type of the returned value is left to the implementation.
Convert([]byte) (interface{}, error)
}
15 changes: 15 additions & 0 deletions document/document.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,13 @@ func NewDocument(id string) *Document {
}
}

// NewSynonymDocument creates a Document with the given id and an empty,
// non-nil Fields slice, ready to receive synonym fields.
func NewSynonymDocument(id string) *Document {
	doc := new(Document)
	doc.id = id
	doc.Fields = make([]Field, 0)
	return doc
}

func (d *Document) Size() int {
sizeInBytes := reflectStaticSizeDocument + size.SizeOfPtr +
len(d.id)
Expand Down Expand Up @@ -133,3 +140,11 @@ func (d *Document) VisitComposite(visitor index.CompositeFieldVisitor) {
// HasComposite reports whether the document holds at least one composite
// field.
func (d *Document) HasComposite() bool {
return len(d.CompositeFields) > 0
}

// VisitSynonymFields calls visitor once for every field of the document that
// implements index.SynonymField, in the order the fields are stored. Fields
// of any other kind are skipped.
func (d *Document) VisitSynonymFields(visitor index.SynonymFieldVisitor) {
	for _, field := range d.Fields {
		synField, isSynonym := field.(index.SynonymField)
		if !isSynonym {
			continue
		}
		visitor(synField)
	}
}
149 changes: 149 additions & 0 deletions document/field_synonym.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
// Copyright (c) 2024 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package document

import (
"reflect"

"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/size"
index "github.com/blevesearch/bleve_index_api"
)

// reflectStaticSizeSynonymField caches the static (shallow) size in bytes of
// a SynonymField value, as reported by reflection. It is set once in init.
var reflectStaticSizeSynonymField int

func init() {
	reflectStaticSizeSynonymField = int(reflect.TypeOf(SynonymField{}).Size())
}

// DefaultSynonymIndexingOptions is the indexing option set assigned to every
// field built by NewSynonymField.
const DefaultSynonymIndexingOptions = index.IndexField

// SynonymField is a document field that carries a single synonym definition:
// an optional list of input terms plus a list of synonyms. Calling Analyze
// turns the raw terms into the term -> synonyms map exposed through
// IterateSynonyms.
type SynonymField struct {
// name is the field name returned by Name.
name string
// analyzer is applied to every input term and synonym in Analyze.
analyzer analysis.Analyzer
// options is returned by Options; NewSynonymField sets it to
// DefaultSynonymIndexingOptions.
options index.FieldIndexingOptions
// input holds the raw (un-analyzed) input terms; may be empty.
input []string
// synonyms holds the raw (un-analyzed) synonym terms.
synonyms []string
// numPlainTextBytes is returned by NumPlainTextBytes. NewSynonymField does
// not set it, so it stays zero unless assigned elsewhere.
numPlainTextBytes uint64

// populated during analysis: maps each analyzed term to its synonyms
synonymMap map[string][]string
}

// Size returns the accounted size of the field in bytes: the static struct
// size, one pointer, and the length of the field name. The input terms,
// synonyms and the synonym map are not included in the figure.
func (s *SynonymField) Size() int {
	total := reflectStaticSizeSynonymField
	total += size.SizeOfPtr
	total += len(s.name)
	return total
}

// Name returns the name of the field.
func (s *SynonymField) Name() string {
return s.name
}

// ArrayPositions always returns nil; a synonym field records no array
// positions.
func (s *SynonymField) ArrayPositions() []uint64 {
return nil
}

// Options returns the indexing options stored on the field.
func (s *SynonymField) Options() index.FieldIndexingOptions {
return s.options
}

// NumPlainTextBytes returns the stored plain-text byte count of the field.
func (s *SynonymField) NumPlainTextBytes() uint64 {
return s.numPlainTextBytes
}

// AnalyzedLength always returns 0 for a synonym field.
func (s *SynonymField) AnalyzedLength() int {
return 0
}

// EncodedFieldType returns the single-byte tag 'y' used for this field type.
func (s *SynonymField) EncodedFieldType() byte {
return 'y'
}

// AnalyzedTokenFrequencies always returns nil; the result of analysing a
// synonym field is exposed through IterateSynonyms instead.
func (s *SynonymField) AnalyzedTokenFrequencies() index.TokenFrequencies {
return nil
}

// Analyze runs every input term and every synonym through the field's
// analyzer and stores the resulting term -> synonyms map in s.synonymMap.
//
// A term is kept only when the analyzer reduces it to exactly one token (see
// analyzeSynonymTerm); every other term is dropped.
//
// When the definition has input terms the mapping is unidirectional: each
// surviving input term maps to the surviving synonyms. When it has no input
// terms the mapping is bidirectional among the surviving synonyms.
//
// If input terms were supplied but none of them survives analysis, no mapping
// is produced at all. Without this guard the empty analyzed input would be
// indistinguishable from "no input given", and a unidirectional definition
// would silently be indexed as a bidirectional one.
func (s *SynonymField) Analyze() {
	unidirectional := len(s.input) > 0

	var analyzedInput []string
	if unidirectional {
		analyzedInput = make([]string, 0, len(s.input))
		for _, term := range s.input {
			if analyzedTerm := analyzeSynonymTerm(term, s.analyzer); analyzedTerm != "" {
				analyzedInput = append(analyzedInput, analyzedTerm)
			}
		}
		if len(analyzedInput) == 0 {
			// Every input term was rejected: there is nothing to map from.
			s.synonymMap = nil
			return
		}
	}

	analyzedSynonyms := make([]string, 0, len(s.synonyms))
	for _, syn := range s.synonyms {
		if analyzedTerm := analyzeSynonymTerm(syn, s.analyzer); analyzedTerm != "" {
			analyzedSynonyms = append(analyzedSynonyms, analyzedTerm)
		}
	}
	s.synonymMap = processSynonymData(analyzedInput, analyzedSynonyms)
}

// Value always returns nil; a synonym field has no raw byte value.
func (s *SynonymField) Value() []byte {
return nil
}

// IterateSynonyms calls visitor once for each entry of the synonym map built
// by Analyze, passing the term and its synonym list. Entries are visited in
// Go's unspecified map iteration order; nothing is visited if Analyze has not
// populated the map.
func (s *SynonymField) IterateSynonyms(visitor func(term string, synonyms []string)) {
	for key, equivalents := range s.synonymMap {
		visitor(key, equivalents)
	}
}

// NewSynonymField builds a SynonymField named name that will use analyzer to
// process the given input terms and synonyms. The slices are stored as
// passed (not copied) and the field's options are set to
// DefaultSynonymIndexingOptions. The synonym map is left unset until Analyze
// is called.
func NewSynonymField(name string, analyzer analysis.Analyzer, input []string, synonyms []string) *SynonymField {
	field := new(SynonymField)
	field.name = name
	field.analyzer = analyzer
	field.options = DefaultSynonymIndexingOptions
	field.input = input
	field.synonyms = synonyms
	return field
}

// processSynonymData builds the term -> synonyms map for one definition.
//
// With a non-empty input the mapping is unidirectional: every input term maps
// to the very same synonyms slice (shared, not copied). With an empty input
// the mapping is bidirectional: every synonym maps to a fresh slice holding
// all the other entries of synonyms, in their original order. The returned
// map is never nil.
func processSynonymData(input []string, synonyms []string) map[string][]string {
	if len(input) > 0 {
		unidirectional := make(map[string][]string, len(input))
		for _, term := range input {
			unidirectional[term] = synonyms
		}
		return unidirectional
	}

	bidirectional := make(map[string][]string, len(synonyms))
	for idx, term := range synonyms {
		// Everything before idx followed by everything after it.
		peers := make([]string, 0, len(synonyms)-1)
		peers = append(peers, synonyms[:idx]...)
		peers = append(peers, synonyms[idx+1:]...)
		bidirectional[term] = peers
	}
	return bidirectional
}

// analyzeSynonymTerm runs term through analyzer and returns the resulting
// token's text when analysis yields exactly one token. Any other outcome
// (no tokens, or more than one) returns the empty string, which callers
// treat as "drop this term".
func analyzeSynonymTerm(term string, analyzer analysis.Analyzer) string {
	tokens := analyzer.Analyze([]byte(term))
	if len(tokens) != 1 {
		return ""
	}
	return string(tokens[0].Term)
}
2 changes: 2 additions & 0 deletions error.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ const (
ErrorEmptyID
ErrorIndexReadInconsistency
ErrorTwoPhaseSearchInconsistency
ErrorSynonymSearchNotSupported
)

// Error represents a more strongly typed bleve error for detecting
Expand All @@ -49,4 +50,5 @@ var errorMessages = map[Error]string{
ErrorEmptyID: "document ID cannot be empty",
ErrorIndexReadInconsistency: "index read inconsistency detected",
ErrorTwoPhaseSearchInconsistency: "2-phase search failed, likely due to an overlapping topology change",
ErrorSynonymSearchNotSupported: "synonym search not supported",
}
8 changes: 4 additions & 4 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -5,26 +5,26 @@ go 1.21
require (
github.com/RoaringBitmap/roaring v1.9.3
github.com/bits-and-blooms/bitset v1.12.0
github.com/blevesearch/bleve_index_api v1.1.13
github.com/blevesearch/bleve_index_api v1.2.0
github.com/blevesearch/geo v0.1.20
github.com/blevesearch/go-faiss v1.0.24
github.com/blevesearch/go-metrics v0.0.0-20201227073835-cf1acfcdf475
github.com/blevesearch/go-porterstemmer v1.0.3
github.com/blevesearch/goleveldb v1.0.1
github.com/blevesearch/gtreap v0.1.1
github.com/blevesearch/scorch_segment_api/v2 v2.2.16
github.com/blevesearch/scorch_segment_api/v2 v2.3.0
github.com/blevesearch/segment v0.9.1
github.com/blevesearch/snowball v0.6.1
github.com/blevesearch/snowballstem v0.9.0
github.com/blevesearch/stempel v0.2.0
github.com/blevesearch/upsidedown_store_api v1.0.2
github.com/blevesearch/vellum v1.0.11
github.com/blevesearch/vellum v1.1.0
github.com/blevesearch/zapx/v11 v11.3.10
github.com/blevesearch/zapx/v12 v12.3.10
github.com/blevesearch/zapx/v13 v13.3.10
github.com/blevesearch/zapx/v14 v14.3.10
github.com/blevesearch/zapx/v15 v15.3.17
github.com/blevesearch/zapx/v16 v16.1.11-0.20241217210710-e1dde3e9876d
github.com/blevesearch/zapx/v16 v16.1.11-0.20241219160422-82553cdd4b38
github.com/couchbase/moss v0.2.0
github.com/golang/protobuf v1.3.2
github.com/spf13/cobra v1.7.0
Expand Down
16 changes: 8 additions & 8 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ github.com/RoaringBitmap/roaring v1.9.3 h1:t4EbC5qQwnisr5PrP9nt0IRhRTb9gMUgQF4t4
github.com/RoaringBitmap/roaring v1.9.3/go.mod h1:6AXUsoIEzDTFFQCe1RbGA6uFONMhvejWj5rqITANK90=
github.com/bits-and-blooms/bitset v1.12.0 h1:U/q1fAF7xXRhFCrhROzIfffYnu+dlS38vCZtmFVPHmA=
github.com/bits-and-blooms/bitset v1.12.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
github.com/blevesearch/bleve_index_api v1.1.13 h1:+nrA6oRJr85aCPyqaeZtsruObwKojutfonHJin/BP48=
github.com/blevesearch/bleve_index_api v1.1.13/go.mod h1:PbcwjIcRmjhGbkS/lJCpfgVSMROV6TRubGGAODaK1W8=
github.com/blevesearch/bleve_index_api v1.2.0 h1:/DXMMWBwx/UmGKM1xDhTwDoJI5yQrG6rqRWPFcOgUVo=
github.com/blevesearch/bleve_index_api v1.2.0/go.mod h1:PbcwjIcRmjhGbkS/lJCpfgVSMROV6TRubGGAODaK1W8=
github.com/blevesearch/geo v0.1.20 h1:paaSpu2Ewh/tn5DKn/FB5SzvH0EWupxHEIwbCk/QPqM=
github.com/blevesearch/geo v0.1.20/go.mod h1:DVG2QjwHNMFmjo+ZgzrIq2sfCh6rIHzy9d9d0B59I6w=
github.com/blevesearch/go-faiss v1.0.24 h1:K79IvKjoKHdi7FdiXEsAhxpMuns0x4fM0BO93bW5jLI=
Expand All @@ -19,8 +19,8 @@ github.com/blevesearch/gtreap v0.1.1/go.mod h1:QaQyDRAT51sotthUWAH4Sj08awFSSWzgY
github.com/blevesearch/mmap-go v1.0.2/go.mod h1:ol2qBqYaOUsGdm7aRMRrYGgPvnwLe6Y+7LMvAB5IbSA=
github.com/blevesearch/mmap-go v1.0.4 h1:OVhDhT5B/M1HNPpYPBKIEJaD0F3Si+CrEKULGCDPWmc=
github.com/blevesearch/mmap-go v1.0.4/go.mod h1:EWmEAOmdAS9z/pi/+Toxu99DnsbhG1TIxUoRmJw/pSs=
github.com/blevesearch/scorch_segment_api/v2 v2.2.16 h1:uGvKVvG7zvSxCwcm4/ehBa9cCEuZVE+/zvrSl57QUVY=
github.com/blevesearch/scorch_segment_api/v2 v2.2.16/go.mod h1:VF5oHVbIFTu+znY1v30GjSpT5+9YFs9dV2hjvuh34F0=
github.com/blevesearch/scorch_segment_api/v2 v2.3.0 h1:vxCjbXAkkEBSb4AB3Iqgr/EJcPyYRsiGxpcvsS8E1Dw=
github.com/blevesearch/scorch_segment_api/v2 v2.3.0/go.mod h1:5y+TgXYSx+xJGaCwSlvy9G/UJBIY5wzvIkhvhBm2ATc=
github.com/blevesearch/segment v0.9.1 h1:+dThDy+Lvgj5JMxhmOVlgFfkUtZV2kw49xax4+jTfSU=
github.com/blevesearch/segment v0.9.1/go.mod h1:zN21iLm7+GnBHWTao9I+Au/7MBiL8pPFtJBJTsk6kQw=
github.com/blevesearch/snowball v0.6.1 h1:cDYjn/NCH+wwt2UdehaLpr2e4BwLIjN4V/TdLsL+B5A=
Expand All @@ -31,8 +31,8 @@ github.com/blevesearch/stempel v0.2.0 h1:CYzVPaScODMvgE9o+kf6D4RJ/VRomyi9uHF+PtB
github.com/blevesearch/stempel v0.2.0/go.mod h1:wjeTHqQv+nQdbPuJ/YcvOjTInA2EIc6Ks1FoSUzSLvc=
github.com/blevesearch/upsidedown_store_api v1.0.2 h1:U53Q6YoWEARVLd1OYNc9kvhBMGZzVrdmaozG2MfoB+A=
github.com/blevesearch/upsidedown_store_api v1.0.2/go.mod h1:M01mh3Gpfy56Ps/UXHjEO/knbqyQ1Oamg8If49gRwrQ=
github.com/blevesearch/vellum v1.0.11 h1:SJI97toEFTtA9WsDZxkyGTaBWFdWl1n2LEDCXLCq/AU=
github.com/blevesearch/vellum v1.0.11/go.mod h1:QgwWryE8ThtNPxtgWJof5ndPfx0/YMBh+W2weHKPw8Y=
github.com/blevesearch/vellum v1.1.0 h1:CinkGyIsgVlYf8Y2LUQHvdelgXr6PYuvoDIajq6yR9w=
github.com/blevesearch/vellum v1.1.0/go.mod h1:QgwWryE8ThtNPxtgWJof5ndPfx0/YMBh+W2weHKPw8Y=
github.com/blevesearch/zapx/v11 v11.3.10 h1:hvjgj9tZ9DeIqBCxKhi70TtSZYMdcFn7gDb71Xo/fvk=
github.com/blevesearch/zapx/v11 v11.3.10/go.mod h1:0+gW+FaE48fNxoVtMY5ugtNHHof/PxCqh7CnhYdnMzQ=
github.com/blevesearch/zapx/v12 v12.3.10 h1:yHfj3vXLSYmmsBleJFROXuO08mS3L1qDCdDK81jDl8s=
Expand All @@ -43,8 +43,8 @@ github.com/blevesearch/zapx/v14 v14.3.10 h1:SG6xlsL+W6YjhX5N3aEiL/2tcWh3DO75Bnz7
github.com/blevesearch/zapx/v14 v14.3.10/go.mod h1:qqyuR0u230jN1yMmE4FIAuCxmahRQEOehF78m6oTgns=
github.com/blevesearch/zapx/v15 v15.3.17 h1:NkkMI98pYLq/uHnB6YWcITrrLpCVyvZ9iP+AyfpW1Ys=
github.com/blevesearch/zapx/v15 v15.3.17/go.mod h1:vXRQzJJvlGVCdmOD5hg7t7JdjUT5DmDPhsAfjvtzIq8=
github.com/blevesearch/zapx/v16 v16.1.11-0.20241217210710-e1dde3e9876d h1:XUZzJwWrRqRJwigYWE7iB2nYBP6rjcU3x+InZtvQOGo=
github.com/blevesearch/zapx/v16 v16.1.11-0.20241217210710-e1dde3e9876d/go.mod h1:wZc3SFjKlrqxkiUkT+HVBBBBTX8oqXxUb2gjE+CMgIE=
github.com/blevesearch/zapx/v16 v16.1.11-0.20241219160422-82553cdd4b38 h1:iJ3Q3sbyo2d0bjfb720RmGjj7cqzh/EdP3528ggDIMY=
github.com/blevesearch/zapx/v16 v16.1.11-0.20241219160422-82553cdd4b38/go.mod h1:JTZseJiEpogtkepKSubIKAmfgbQiOReJXfmjxB1qta4=
github.com/couchbase/ghistogram v0.1.0 h1:b95QcQTCzjTUocDXp/uMgSNQi8oj1tGwnJ4bODWZnps=
github.com/couchbase/ghistogram v0.1.0/go.mod h1:s1Jhy76zqfEecpNWJfWUiKZookAFaiGOEoyzgHt9i7k=
github.com/couchbase/moss v0.2.0 h1:VCYrMzFwEryyhRSeI+/b3tRBSeTpi/8gn5Kf6dxqn+o=
Expand Down
63 changes: 63 additions & 0 deletions index.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ package bleve

import (
"context"
"fmt"

"github.com/blevesearch/bleve/v2/index/upsidedown"

Expand Down Expand Up @@ -63,6 +64,36 @@ func (b *Batch) Index(id string, data interface{}) error {
return nil
}

// IndexSynonym adds a synonym definition to the batch under the given id,
// as part of the named collection.
//
// It returns ErrorEmptyID when id is empty, ErrorSynonymSearchNotSupported
// when the index mapping does not implement mapping.SynonymMapping, and any
// error reported by definition.Validate or by the mapping while building the
// synonym document. On success the batch's last-document and total size
// counters are updated.
func (b *Batch) IndexSynonym(id string, collection string, definition *SynonymDefinition) error {
	if id == "" {
		return ErrorEmptyID
	}
	// The index event fires before the mapping and definition checks.
	if notifier, fires := b.index.(index.EventIndex); fires {
		notifier.FireIndexEvent()
	}

	synonymMapping, supported := b.index.Mapping().(mapping.SynonymMapping)
	if !supported {
		return ErrorSynonymSearchNotSupported
	}
	if err := definition.Validate(); err != nil {
		return err
	}

	synDoc := document.NewSynonymDocument(id)
	if err := synonymMapping.MapSynonymDocument(synDoc, collection, definition.Input, definition.Synonyms); err != nil {
		return err
	}
	b.internal.Update(synDoc)

	// Document size plus the overhead from internal (id bytes + string header).
	docSize := synDoc.Size() + len(id) + size.SizeOfString
	b.lastDocSize = uint64(docSize)
	b.totalSize += b.lastDocSize
	return nil
}

// LastDocSize returns the size recorded for the document most recently added
// to the batch.
func (b *Batch) LastDocSize() uint64 {
return b.lastDocSize
}
Expand Down Expand Up @@ -323,3 +354,35 @@ type IndexCopyable interface {
// FileSystemDirectory is the default implementation for the
// index.Directory interface.
type FileSystemDirectory string

// SynonymDefinition represents a synonym mapping in Bleve.
// Each instance associates one or more input terms with a list of synonyms,
// defining how terms are treated as equivalent in searches.
type SynonymDefinition struct {
	// Input is an optional list of terms for unidirectional synonym mapping.
	// When terms are specified in Input, they will map to the terms in Synonyms,
	// making the relationship unidirectional (each Input maps to all Synonyms).
	// If Input is omitted, the relationship is bidirectional among all Synonyms.
	Input []string `json:"input,omitempty"`

	// Synonyms is a list of terms that are considered equivalent.
	// If Input is specified, each term in Input will map to each term in Synonyms.
	// If Input is not specified, the Synonyms list will be treated bidirectionally,
	// meaning each term in Synonyms is treated as synonymous with all others.
	Synonyms []string `json:"synonyms"`
}

// Validate checks that the definition can be indexed.
//
// It returns an error when the receiver is nil or when Synonyms is empty, and
// nil otherwise. The nil check lets callers that were handed a nil
// *SynonymDefinition get an error back instead of a nil-pointer panic.
func (sd *SynonymDefinition) Validate() error {
	if sd == nil {
		return fmt.Errorf("synonym definition must not be nil")
	}
	if len(sd.Synonyms) == 0 {
		return fmt.Errorf("synonym definition must have at least one synonym")
	}
	return nil
}

// SynonymIndex supports indexing synonym definitions alongside regular documents.
// Synonyms, grouped by collection name, define term relationships for query expansion in searches.
// It embeds Index, so every SynonymIndex is also usable as a regular Index.
type SynonymIndex interface {
Index
// IndexSynonym indexes a synonym definition, with the specified id and belonging to the specified collection.
// It returns an error if the definition could not be indexed.
IndexSynonym(id string, collection string, definition *SynonymDefinition) error
}
Loading

0 comments on commit 77458c4

Please sign in to comment.