MB-57888: Updated Merge Process to Support Index Update #280

Open · wants to merge 4 commits into master
2 changes: 2 additions & 0 deletions build.go
@@ -21,6 +21,7 @@ import (
"math"
"os"

index "github.com/blevesearch/bleve_index_api"
"github.com/blevesearch/vellum"
)

@@ -169,6 +170,7 @@ func InitSegmentBase(mem []byte, memCRC uint32, chunkMode uint32, numDocs uint64
sectionsIndexOffset: sectionsIndexOffset,
fieldDvReaders: make([]map[uint16]*docValueReader, len(segmentSections)),
docValueOffset: 0, // docValueOffsets identified automatically by the section
updatedFields: make(map[string]*index.UpdateFieldInfo),
fieldFSTs: make(map[uint16]*vellum.FST),
vecIndexCache: newVectorIndexCache(),
synIndexCache: newSynonymIndexCache(),
4 changes: 2 additions & 2 deletions go.mod
@@ -4,10 +4,10 @@ go 1.21

require (
github.com/RoaringBitmap/roaring/v2 v2.4.5
github.com/blevesearch/bleve_index_api v1.2.1
github.com/blevesearch/bleve_index_api v1.2.2-0.20250207163324-9af4b44c3810
github.com/blevesearch/go-faiss v1.0.24
github.com/blevesearch/mmap-go v1.0.4
github.com/blevesearch/scorch_segment_api/v2 v2.3.3
github.com/blevesearch/scorch_segment_api/v2 v2.3.4-0.20250210150338-cf1ed08c650d
github.com/blevesearch/vellum v1.1.0
github.com/golang/snappy v0.0.4
github.com/spf13/cobra v1.7.0
8 changes: 4 additions & 4 deletions go.sum
@@ -2,14 +2,14 @@ github.com/RoaringBitmap/roaring/v2 v2.4.5 h1:uGrrMreGjvAtTBobc0g5IrW1D5ldxDQYe2
github.com/RoaringBitmap/roaring/v2 v2.4.5/go.mod h1:FiJcsfkGje/nZBZgCu0ZxCPOKD/hVXDS2dXi7/eUFE0=
github.com/bits-and-blooms/bitset v1.12.0 h1:U/q1fAF7xXRhFCrhROzIfffYnu+dlS38vCZtmFVPHmA=
github.com/bits-and-blooms/bitset v1.12.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
github.com/blevesearch/bleve_index_api v1.2.1 h1:IuXwLvmyp7I7+e0FOA68gcHHLfzSQ4AqQ8wVab5uxk0=
github.com/blevesearch/bleve_index_api v1.2.1/go.mod h1:rKQDl4u51uwafZxFrPD1R7xFOwKnzZW7s/LSeK4lgo0=
github.com/blevesearch/bleve_index_api v1.2.2-0.20250207163324-9af4b44c3810 h1:SFNLq10DkzDR2yyDjoEl5dge3QWw6k3hRkAFWVoMh8c=
github.com/blevesearch/bleve_index_api v1.2.2-0.20250207163324-9af4b44c3810/go.mod h1:rKQDl4u51uwafZxFrPD1R7xFOwKnzZW7s/LSeK4lgo0=
github.com/blevesearch/go-faiss v1.0.24 h1:K79IvKjoKHdi7FdiXEsAhxpMuns0x4fM0BO93bW5jLI=
github.com/blevesearch/go-faiss v1.0.24/go.mod h1:OMGQwOaRRYxrmeNdMrXJPvVx8gBnvE5RYrr0BahNnkk=
github.com/blevesearch/mmap-go v1.0.4 h1:OVhDhT5B/M1HNPpYPBKIEJaD0F3Si+CrEKULGCDPWmc=
github.com/blevesearch/mmap-go v1.0.4/go.mod h1:EWmEAOmdAS9z/pi/+Toxu99DnsbhG1TIxUoRmJw/pSs=
github.com/blevesearch/scorch_segment_api/v2 v2.3.3 h1:LtyQ1Wltja54bchqwgY20SvVe6HltUL4PsAPH3UNrQI=
github.com/blevesearch/scorch_segment_api/v2 v2.3.3/go.mod h1:LXidEjeenMdbcLKP/UdZi1HJOny61FbhslAh5SgN5Ik=
github.com/blevesearch/scorch_segment_api/v2 v2.3.4-0.20250210150338-cf1ed08c650d h1:8MZpw7vMvv0zNsowDsD9/QqAOqgEzkTVrlHpp5ib4js=
github.com/blevesearch/scorch_segment_api/v2 v2.3.4-0.20250210150338-cf1ed08c650d/go.mod h1:oup/GcMIiq8rqKTl7GxsjmqjKaHu5QXysCMBsmSqXeQ=
github.com/blevesearch/vellum v1.1.0 h1:CinkGyIsgVlYf8Y2LUQHvdelgXr6PYuvoDIajq6yR9w=
github.com/blevesearch/vellum v1.1.0/go.mod h1:QgwWryE8ThtNPxtgWJof5ndPfx0/YMBh+W2weHKPw8Y=
github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
55 changes: 48 additions & 7 deletions merge.go
@@ -24,6 +24,7 @@ import (
"sort"

"github.com/RoaringBitmap/roaring/v2"
index "github.com/blevesearch/bleve_index_api"
seg "github.com/blevesearch/scorch_segment_api/v2"
"github.com/golang/snappy"
)
@@ -109,6 +110,19 @@ func mergeSegmentBases(segmentBases []*SegmentBase, drops []*roaring.Bitmap, pat
return newDocNums, uint64(cr.Count()), nil
}

func filterFields(fieldsInv []string, fieldInfo map[string]*index.UpdateFieldInfo) []string {
rv := make([]string, 0)
for _, field := range fieldsInv {
if val, ok := fieldInfo[field]; ok {
if val.Deleted {
continue
}
}
rv = append(rv, field)
}
return rv
}

func mergeToWriter(segments []*SegmentBase, drops []*roaring.Bitmap,
chunkMode uint32, cr *CountHashWriter, closeCh chan struct{}) (
newDocNums [][]uint64, numDocs, storedIndexOffset uint64,
@@ -117,6 +131,8 @@ func mergeToWriter(segments []*SegmentBase, drops []*roaring.Bitmap,

var fieldsSame bool
fieldsSame, fieldsInv = mergeFields(segments)
updatedFields := mergeUpdatedFields(segments)
fieldsInv = filterFields(fieldsInv, updatedFields)
fieldsMap = mapFields(fieldsInv)

numDocs = computeNewDocCount(segments, drops)
@@ -130,15 +146,16 @@ func mergeToWriter(segments []*SegmentBase, drops []*roaring.Bitmap,
// offsets in the fields section index of the file (the final merged file).
mergeOpaque := map[int]resetable{}
args := map[string]interface{}{
"chunkMode": chunkMode,
"fieldsSame": fieldsSame,
"fieldsMap": fieldsMap,
"numDocs": numDocs,
"chunkMode": chunkMode,
"fieldsSame": fieldsSame,
"fieldsMap": fieldsMap,
"numDocs": numDocs,
"updatedFields": updatedFields,
}

if numDocs > 0 {
storedIndexOffset, newDocNums, err = mergeStoredAndRemap(segments, drops,
fieldsMap, fieldsInv, fieldsSame, numDocs, cr, closeCh)
fieldsMap, fieldsInv, fieldsSame, numDocs, cr, closeCh, updatedFields)
if err != nil {
return nil, 0, 0, nil, nil, 0, err
}
@@ -358,7 +375,7 @@ type varintEncoder func(uint64) (int, error)

func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap,
fieldsMap map[string]uint16, fieldsInv []string, fieldsSame bool, newSegDocCount uint64,
w *CountHashWriter, closeCh chan struct{}) (uint64, [][]uint64, error) {
w *CountHashWriter, closeCh chan struct{}, updatedFields map[string]*index.UpdateFieldInfo) (uint64, [][]uint64, error) {
var rv [][]uint64 // The remapped or newDocNums for each segment.

var newDocNum uint64
@@ -397,7 +414,7 @@ func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap,
// optimize when the field mapping is the same across all
// segments and there are no deletions, via byte-copying
// of stored docs bytes directly to the writer
if fieldsSame && (dropsI == nil || dropsI.GetCardinality() == 0) {
if fieldsSame && (dropsI == nil || dropsI.GetCardinality() == 0) && len(updatedFields) == 0 {
err := segment.copyStoredDocs(newDocNum, docNumOffsets, w)
if err != nil {
return 0, nil, err
@@ -471,6 +488,11 @@

// now walk the non-"_id" fields in order
for fieldID := 1; fieldID < len(fieldsInv); fieldID++ {
if val, ok := updatedFields[fieldsInv[fieldID]]; ok {
if val.Store {
continue
}
}
storedFieldValues := vals[fieldID]

stf := typs[fieldID]
@@ -606,6 +628,25 @@ func mergeFields(segments []*SegmentBase) (bool, []string) {
return fieldsSame, rv
}

func mergeUpdatedFields(segments []*SegmentBase) map[string]*index.UpdateFieldInfo {
fieldInfo := make(map[string]*index.UpdateFieldInfo)

for _, segment := range segments {
for field, info := range segment.updatedFields {
if _, ok := fieldInfo[field]; !ok {
fieldInfo[field] = info
} else {
fieldInfo[field].Deleted = fieldInfo[field].Deleted || info.Deleted
fieldInfo[field].Index = fieldInfo[field].Index || info.Index
fieldInfo[field].Store = fieldInfo[field].Store || info.Store
fieldInfo[field].DocValues = fieldInfo[field].DocValues || info.DocValues
}
}

}
return fieldInfo
}

func isClosed(closeCh chan struct{}) bool {
select {
case <-closeCh:
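
Note (illustrative, not part of the diff): a minimal standalone sketch of how the new merge.go helpers behave when two segments disagree about a field. It mirrors the flag-OR logic of mergeUpdatedFields and the drop-on-Deleted behaviour of filterFields; the field names and flag values below are hypothetical.

package main

import (
	"fmt"

	index "github.com/blevesearch/bleve_index_api"
)

func main() {
	// Per-segment update info, as it would sit in SegmentBase.updatedFields.
	segA := map[string]*index.UpdateFieldInfo{
		"price": {Store: true},
	}
	segB := map[string]*index.UpdateFieldInfo{
		"price":    {DocValues: true},
		"obsolete": {Deleted: true},
	}

	// Combine across segments by OR-ing each flag (as mergeUpdatedFields does).
	merged := map[string]*index.UpdateFieldInfo{}
	for _, seg := range []map[string]*index.UpdateFieldInfo{segA, segB} {
		for field, info := range seg {
			cur, ok := merged[field]
			if !ok {
				cp := *info
				merged[field] = &cp
				continue
			}
			cur.Deleted = cur.Deleted || info.Deleted
			cur.Index = cur.Index || info.Index
			cur.Store = cur.Store || info.Store
			cur.DocValues = cur.DocValues || info.DocValues
		}
	}

	// Drop deleted fields from the merged field list (as filterFields does),
	// so no section writes data for them into the new segment.
	fieldsInv := []string{"_id", "price", "obsolete"}
	kept := make([]string, 0, len(fieldsInv))
	for _, field := range fieldsInv {
		if info, ok := merged[field]; ok && info.Deleted {
			continue
		}
		kept = append(kept, field)
	}

	fmt.Println(kept)             // [_id price]
	fmt.Println(*merged["price"]) // Store and DocValues are both true
}
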
16 changes: 13 additions & 3 deletions section_faiss_vector_index.go
@@ -105,6 +105,9 @@ func (v *faissVectorIndexSection) Merge(opaque map[int]resetable, segments []*Se
if _, ok := sb.fieldsMap[fieldName]; !ok {
continue
}
if info, ok := vo.updatedFields[fieldName]; ok && info.Index {
continue
}

// check if the section address is a valid one for "fieldName" in the
// segment sb. the local fieldID (fetched by the fieldsMap of the sb)
@@ -703,9 +706,10 @@ func (v *faissVectorIndexSection) getvectorIndexOpaque(opaque map[int]resetable)

func (v *faissVectorIndexSection) InitOpaque(args map[string]interface{}) resetable {
rv := &vectorIndexOpaque{
fieldAddrs: make(map[uint16]int),
vecIDMap: make(map[int64]*vecInfo),
vecFieldMap: make(map[uint16]*indexContent),
fieldAddrs: make(map[uint16]int),
vecIDMap: make(map[int64]*vecInfo),
vecFieldMap: make(map[uint16]*indexContent),
updatedFields: make(map[string]*index.UpdateFieldInfo),
}
for k, v := range args {
rv.Set(k, v)
@@ -744,6 +748,8 @@ type vectorIndexOpaque struct {
// index to be built.
vecFieldMap map[uint16]*indexContent

updatedFields map[string]*index.UpdateFieldInfo

tmp0 []byte
}

@@ -790,4 +796,8 @@ func (v *vectorIndexOpaque) Reset() (err error) {
}

func (v *vectorIndexOpaque) Set(key string, val interface{}) {
switch key {
case "updatedFields":
v.updatedFields = val.(map[string]*index.UpdateFieldInfo)
}
}
23 changes: 18 additions & 5 deletions section_inverted_text_index.go
@@ -82,7 +82,8 @@ func (i *invertedTextIndexSection) AddrForField(opaque map[int]resetable, fieldI
func mergeAndPersistInvertedSection(segments []*SegmentBase, dropsIn []*roaring.Bitmap,
fieldsInv []string, fieldsMap map[string]uint16, fieldsSame bool,
newDocNumsIn [][]uint64, newSegDocCount uint64, chunkMode uint32,
w *CountHashWriter, closeCh chan struct{}) (map[int]int, uint64, error) {
updatedFields map[string]*index.UpdateFieldInfo, w *CountHashWriter,
closeCh chan struct{}) (map[int]int, uint64, error) {
var bufMaxVarintLen64 []byte = make([]byte, binary.MaxVarintLen64)
var bufLoc []uint64

@@ -126,10 +127,15 @@ func mergeAndPersistInvertedSection(segments []*SegmentBase, dropsIn []*roaring.
return nil, 0, seg.ErrClosed
}

if info, ok := updatedFields[fieldName]; ok && info.Index {
continue
}

dict, err2 := segment.dictionary(fieldName)
if err2 != nil {
return nil, 0, err2
}

if dict != nil && dict.fst != nil {
itr, err2 := dict.fst.Iterator(nil, nil)
if err2 != nil && err2 != vellum.ErrIteratorDone {
@@ -244,7 +250,7 @@ func mergeAndPersistInvertedSection(segments []*SegmentBase, dropsIn []*roaring.

postItr = postings.iterator(true, true, true, postItr)

if fieldsSame {
if fieldsSame && len(updatedFields) == 0 {
// can optimize by copying freq/norm/loc bytes directly
lastDocNum, lastFreq, lastNorm, err = mergeTermFreqNormLocsByCopying(
term, postItr, newDocNums[itrI], newRoaring,
@@ -317,7 +323,9 @@ func mergeAndPersistInvertedSection(segments []*SegmentBase, dropsIn []*roaring.
if isClosed(closeCh) {
return nil, 0, seg.ErrClosed
}

if info, ok := updatedFields[fieldName]; ok && info.DocValues {
continue
}
fieldIDPlus1 := uint16(segment.fieldsMap[fieldName])
if dvIter, exists := segment.fieldDvReaders[SectionInvertedTextIndex][fieldIDPlus1-1]; exists &&
dvIter != nil {
@@ -398,7 +406,7 @@ func (i *invertedTextIndexSection) Merge(opaque map[int]resetable, segments []*S
w *CountHashWriter, closeCh chan struct{}) error {
io := i.getInvertedIndexOpaque(opaque)
fieldAddrs, _, err := mergeAndPersistInvertedSection(segments, drops, fieldsInv,
io.FieldsMap, io.fieldsSame, newDocNumsIn, io.numDocs, io.chunkMode, w, closeCh)
io.FieldsMap, io.fieldsSame, newDocNumsIn, io.numDocs, io.chunkMode, io.updatedFields, w, closeCh)
if err != nil {
return err
}
@@ -905,7 +913,8 @@ func (i *invertedIndexOpaque) getOrDefineField(fieldName string) int {

func (i *invertedTextIndexSection) InitOpaque(args map[string]interface{}) resetable {
rv := &invertedIndexOpaque{
fieldAddrs: map[int]int{},
fieldAddrs: map[int]int{},
updatedFields: make(map[string]*index.UpdateFieldInfo),
}
for k, v := range args {
rv.Set(k, v)
@@ -969,6 +978,8 @@ type invertedIndexOpaque struct {

fieldAddrs map[int]int

updatedFields map[string]*index.UpdateFieldInfo

fieldsSame bool
numDocs uint64
}
@@ -1035,5 +1046,7 @@ func (i *invertedIndexOpaque) Set(key string, val interface{}) {
i.FieldsMap = val.(map[string]uint16)
case "numDocs":
i.numDocs = val.(uint64)
case "updatedFields":
i.updatedFields = val.(map[string]*index.UpdateFieldInfo)
}
}
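
Note (illustrative, not part of the diff): the guard added in this file appears twice, keyed off info.Index when merging dictionaries/postings and off info.DocValues when merging doc values; the faiss and synonym sections use the same Index check. A hypothetical helper that makes the intent explicit, written against the zap package (the PR itself inlines the check at each call site):

package zap

import index "github.com/blevesearch/bleve_index_api"

// skipFieldDataOnUpdate reports whether merged output for fieldName should be
// omitted because the field's data is being rewritten as part of an index
// update. docValues selects the doc-values check; otherwise the index-data
// check applies. Name and placement are hypothetical.
func skipFieldDataOnUpdate(updatedFields map[string]*index.UpdateFieldInfo,
	fieldName string, docValues bool) bool {
	info, ok := updatedFields[fieldName]
	if !ok {
		return false
	}
	if docValues {
		return info.DocValues
	}
	return info.Index
}
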
22 changes: 17 additions & 5 deletions section_synonym_index.go
@@ -99,6 +99,8 @@ type synonymIndexOpaque struct {

// A map linking thesaurus IDs to their corresponding thesaurus' file offsets
thesaurusAddrs map[int]int

updatedFields map[string]*index.UpdateFieldInfo
}

// Set the fieldsMap and results in the synonym index opaque before the section processes a synonym field.
@@ -108,6 +110,8 @@ func (so *synonymIndexOpaque) Set(key string, value interface{}) {
so.results = value.([]index.Document)
case "fieldsMap":
so.FieldsMap = value.(map[string]uint16)
case "updatedFields":
so.updatedFields = value.(map[string]*index.UpdateFieldInfo)
}
}

@@ -399,6 +403,7 @@ func (s *synonymIndexSection) getSynonymIndexOpaque(opaque map[int]resetable) *s
func (s *synonymIndexSection) InitOpaque(args map[string]interface{}) resetable {
rv := &synonymIndexOpaque{
thesaurusAddrs: map[int]int{},
updatedFields: make(map[string]*index.UpdateFieldInfo),
}
for k, v := range args {
rv.Set(k, v)
@@ -452,7 +457,7 @@ func (s *synonymIndexSection) Merge(opaque map[int]resetable, segments []*Segmen
drops []*roaring.Bitmap, fieldsInv []string, newDocNumsIn [][]uint64,
w *CountHashWriter, closeCh chan struct{}) error {
so := s.getSynonymIndexOpaque(opaque)
thesaurusAddrs, fieldIDtoThesaurusID, err := mergeAndPersistSynonymSection(segments, drops, fieldsInv, newDocNumsIn, w, closeCh)
thesaurusAddrs, fieldIDtoThesaurusID, err := mergeAndPersistSynonymSection(segments, drops, fieldsInv, newDocNumsIn, so.updatedFields, w, closeCh)
if err != nil {
return err
}
@@ -553,7 +558,8 @@ func writeSynTermMap(synTermMap map[uint32]string, w *CountHashWriter, bufMaxVar
}

func mergeAndPersistSynonymSection(segments []*SegmentBase, dropsIn []*roaring.Bitmap,
fieldsInv []string, newDocNumsIn [][]uint64, w *CountHashWriter,
fieldsInv []string, newDocNumsIn [][]uint64,
updatedFields map[string]*index.UpdateFieldInfo, w *CountHashWriter,
closeCh chan struct{}) (map[int]int, map[uint16]int, error) {

var bufMaxVarintLen64 []byte = make([]byte, binary.MaxVarintLen64)
@@ -601,9 +607,15 @@ func mergeAndPersistSynonymSection(segments []*SegmentBase, dropsIn []*roaring.B
return nil, nil, seg.ErrClosed
}

thes, err2 := segment.thesaurus(fieldName)
if err2 != nil {
return nil, nil, err2
var thes *Thesaurus
var err2 error
if info, ok := updatedFields[fieldName]; ok && info.Index {
thes = nil
} else {
thes, err2 = segment.thesaurus(fieldName)
if err2 != nil {
return nil, nil, err2
}
}
if thes != nil && thes.fst != nil {
itr, err2 := thes.fst.Iterator(nil, nil)
11 changes: 11 additions & 0 deletions segment.go
@@ -25,6 +25,7 @@ import (
"unsafe"

"github.com/RoaringBitmap/roaring/v2"
index "github.com/blevesearch/bleve_index_api"
mmap "github.com/blevesearch/mmap-go"
segment "github.com/blevesearch/scorch_segment_api/v2"
"github.com/blevesearch/vellum"
@@ -109,6 +110,8 @@ type SegmentBase struct {
fieldDvNames []string // field names cached in fieldDvReaders
size uint64

updatedFields map[string]*index.UpdateFieldInfo

m sync.Mutex
fieldFSTs map[uint16]*vellum.FST

@@ -952,3 +955,11 @@ func (s *SegmentBase) loadDvReaders() error {

return nil
}

func (s *SegmentBase) GetUpdatedFields() map[string]*index.UpdateFieldInfo {
return s.updatedFields
}

func (s *SegmentBase) PutUpdatedFields(updatedFields map[string]*index.UpdateFieldInfo) {
s.updatedFields = updatedFields
}
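
Note (illustrative, not part of the diff): a hypothetical caller-side flow for the new exported accessors, sketched under the assumption that the module path is github.com/blevesearch/zapx/v16 and the package name is zap. The helper, package name, field names and flag values are made up for illustration.

package segmerge

import (
	"fmt"

	index "github.com/blevesearch/bleve_index_api"
	zap "github.com/blevesearch/zapx/v16"
)

// tagAndInspect records per-field update info on a segment before it is
// handed to the merger, then reads it back via the new accessor.
func tagAndInspect(sb *zap.SegmentBase) {
	sb.PutUpdatedFields(map[string]*index.UpdateFieldInfo{
		"description": {Store: true, DocValues: true}, // stored value and doc values to be rewritten
		"old_field":   {Deleted: true},                // field dropped entirely on merge
	})

	for field, info := range sb.GetUpdatedFields() {
		fmt.Printf("%s: deleted=%v index=%v store=%v docvalues=%v\n",
			field, info.Deleted, info.Index, info.Store, info.DocValues)
	}
}
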