Commit
keeping legacy behaviour as an option + error handling
Thejas-bhat committed Dec 3, 2024
1 parent 525b363 commit a4b2982
Showing 3 changed files with 106 additions and 63 deletions.
12 changes: 7 additions & 5 deletions index/scorch/introducer.go
@@ -352,15 +352,15 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) {
creator: "introduceMerge",
}

// iterate through current segments
// newSegmentDeleted := roaring.NewBitmap()
var running, docsToPersistCount, memSegments, fileSegments uint64
var droppedSegmentFiles []string
newSegmentDeleted := make([]*roaring.Bitmap, len(nextMerge.new))
for i := range newSegmentDeleted {
// create a bitmap to track the obsolete docs per newly merged segment
newSegmentDeleted[i] = roaring.NewBitmap()
}

var running, docsToPersistCount, memSegments, fileSegments uint64
var droppedSegmentFiles []string
// iterate through current segments
for i := range root.segment {
segmentID := root.segment[i].id
if segSnapAtMerge, ok := nextMerge.mergedSegHistory[segmentID]; ok {
@@ -439,7 +439,7 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) {
fsr.UpdateFieldStats(stats)
}

// put new segment at end
// put the merged segment at the end of newSnapshot
newSnapshot.segment = append(newSnapshot.segment, &SegmentSnapshot{
id: nextMerge.id[i],
segment: newMergedSegment, // take ownership for nextMerge.new's ref-count
@@ -467,6 +467,8 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) {
if skipped {
atomic.AddUint64(&s.stats.TotFileMergeIntroductionsObsoleted, 1)
} else {
// tbd: should this stat be the total number of merged segments introduced,
// or the number of merge introductions performed?
atomic.AddUint64(&s.stats.TotIntroducedSegmentsMerge, 1)
}

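To make the new bookkeeping above easier to follow, here is a minimal standalone sketch (not the actual introducer code; the import path and the trimmed-down mergedSegmentHistory type are assumptions) of the idea: a deletion that lands on an old segment while its merge is in flight is remapped through oldNewDocIDs into the obsolete bitmap of the merged segment produced by that worker.

package main

import (
	"fmt"

	"github.com/RoaringBitmap/roaring"
)

// mergedSegmentHistory mirrors the fields visible in this diff: which worker's
// merged segment now owns an old segment's docs, plus the old->new doc number
// mapping produced by the merge.
type mergedSegmentHistory struct {
	workerID     uint64
	oldNewDocIDs []uint64
}

func main() {
	// one obsolete-tracking bitmap per newly merged segment, as in introduceMerge
	newSegmentDeleted := []*roaring.Bitmap{roaring.NewBitmap(), roaring.NewBitmap()}

	// old segment 7's live docs were merged into worker 1's new segment
	mergedSegHistory := map[uint64]*mergedSegmentHistory{
		7: {workerID: 1, oldNewDocIDs: []uint64{10, 11, 12}},
	}

	// a deletion that hit old doc 2 of segment 7 while the merge was running is
	// remapped into the obsolete bitmap of the merged segment that owns it now
	h := mergedSegHistory[7]
	newSegmentDeleted[h.workerID].Add(uint32(h.oldNewDocIDs[2]))

	fmt.Println(newSegmentDeleted[1].Contains(12)) // true
}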
24 changes: 19 additions & 5 deletions index/scorch/merge.go
@@ -468,7 +468,9 @@ func cumulateBytesRead(sbs []segment.Segment) uint64 {

func closeNewMergedSegments(segs []segment.Segment) error {
for _, seg := range segs {
_ = seg.DecRef()
if seg != nil {
_ = seg.DecRef()
}
}
return nil
}
@@ -482,13 +484,16 @@ func (s *Scorch) mergeSegmentBasesParallel(snapshot *IndexSnapshot, flushableObj

var wg sync.WaitGroup
// we're tracking the merged segments and their doc numbers per worker
// to be able to introduce them all at once
// to be able to introduce them all at once, so the first dimension of the
// slices here corresponds to the workerID
newDocNumsSet := make([][][]uint64, len(flushableObjs))
newMergedSegments := make([]segment.Segment, len(flushableObjs))
newMergedSegmentIDs := make([]uint64, len(flushableObjs))
numFlushes := len(flushableObjs)
var numSegments, newMergedCount uint64
errs := make([]error, numFlushes)

// deploy the workers to merge and flush the batches of segments in parallel
for i := 0; i < numFlushes; i++ {
wg.Add(1)
go func(segsBatch []segment.Segment, dropsBatch []*roaring.Bitmap, id int) {
@@ -499,15 +504,15 @@ func (s *Scorch) mergeSegmentBasesParallel(snapshot *IndexSnapshot, flushableObj
newDocNums, _, err :=
s.segPlugin.Merge(segsBatch, dropsBatch, path, s.closeCh, s)
if err != nil {
// handle error
errs[id] = err
atomic.AddUint64(&s.stats.TotMemMergeErr, 1)
return
}
newMergedSegmentIDs[id] = newSegmentID
newDocNumsSet[id] = newDocNums
newMergedSegments[id], err = s.segPlugin.Open(path)
if err != nil {
// handle error
errs[id] = err
atomic.AddUint64(&s.stats.TotMemMergeErr, 1)
return
}
@@ -517,6 +522,15 @@ }
}

wg.Wait()

// return the first error reported by any of the merge workers
for _, err := range errs {
if err != nil {
// close the new merged segments opened so far
_ = closeNewMergedSegments(newMergedSegments)

// tbd: need a better way to surface errors from multiple workers
return nil, nil, err
}
}

atomic.AddUint64(&s.stats.TotMemMergeZapEnd, 1)

memMergeZapTime := uint64(time.Since(memMergeZapStartTime))
@@ -536,7 +550,7 @@ func (s *Scorch) mergeSegmentBasesParallel(snapshot *IndexSnapshot, flushableObj
for i, flushable := range flushableObjs {
for j, idx := range flushable.sbIdxs {
ss := snapshot.segment[idx]
// oldSegmentSnapshot.id -> {threadIdx, oldSegmentSnapshot, docIDs}
// oldSegmentSnapshot.id -> {workerID, oldSegmentSnapshot, docIDs}
sm.mergedSegHistory[ss.id] = &mergedSegmentHistory{
workerID: uint64(i),
oldNewDocIDs: newDocNumsSet[i][j],
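A reduced, self-contained sketch of the per-worker error handling pattern this change introduces in mergeSegmentBasesParallel: each goroutine records its outcome in its own slot of a pre-sized error slice, and after wg.Wait() any recorded failure triggers cleanup of the partially produced segments (closeNewMergedSegments in the real code). The mergeOne helper and the worker count here are illustrative stand-ins, not scorch APIs.

package main

import (
	"errors"
	"fmt"
	"sync"
)

// mergeOne stands in for the per-batch merge+open work done by each persister
// worker; it is a placeholder, not a scorch function.
func mergeOne(id int) (string, error) {
	if id == 2 {
		return "", errors.New("simulated merge failure")
	}
	return fmt.Sprintf("merged-segment-%d", id), nil
}

func main() {
	numWorkers := 4
	results := make([]string, numWorkers)
	errs := make([]error, numWorkers) // one slot per worker, no locking needed

	var wg sync.WaitGroup
	for i := 0; i < numWorkers; i++ {
		wg.Add(1)
		go func(id int) {
			defer wg.Done()
			results[id], errs[id] = mergeOne(id)
		}(i)
	}
	wg.Wait()

	// after all workers finish, surface the first failure and release
	// whatever was produced (closeNewMergedSegments in the real code)
	for _, err := range errs {
		if err != nil {
			fmt.Println("cleaning up partial results, returning:", err)
			return
		}
	}
	fmt.Println("all merges succeeded:", results)
}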
133 changes: 80 additions & 53 deletions index/scorch/persister.go
@@ -369,11 +369,17 @@ type flushable struct {
totDocs uint64
}

var DefaultNumPersisterWorkers = 4
var DefaultNumPersisterWorkers = 1

// maximum amount of in-memory segment data that a single worker is allowed to
// merge in one operation.
var DefaultMaxSizeInMemoryMerge = 200 * 1024 * 1024
var DefaultMaxSizeInMemoryMerge = 0

func legacyFlushBehaviour() bool {
// DefaultMaxSizeInMemoryMerge = 0 is a special value to preserve the legacy
// one-shot in-memory merge + flush behaviour.
return DefaultMaxSizeInMemoryMerge == 0 && DefaultNumPersisterWorkers == 1
}

// persistSnapshotMaybeMerge examines the snapshot and might merge and
// persist the in-memory zap segments if there are enough of them
@@ -390,63 +396,84 @@ func (s *Scorch) persistSnapshotMaybeMerge(snapshot *IndexSnapshot) (
var numSegsToFlushOut int
var totDocs uint64

for i, snapshot := range snapshot.segment {
if totSize >= DefaultMaxSizeInMemoryMerge {
if len(sbs) >= DefaultMinSegmentsForInMemoryMerge {
numSegsToFlushOut += len(sbs)
val := &flushable{
segments: make([]segment.Segment, len(sbs)),
drops: make([]*roaring.Bitmap, len(sbsDrops)),
sbIdxs: make([]int, len(sbsIndexes)),
totDocs: totDocs,
}
copy(val.segments, sbs)
copy(val.drops, sbsDrops)
copy(val.sbIdxs, sbsIndexes)
flushSet = append(flushSet, val)

oldSegIdxs = append(oldSegIdxs, sbsIndexes...)
sbs = sbs[:0]
sbsDrops = sbsDrops[:0]
sbsIndexes = sbsIndexes[:0]
totSize = 0
totDocs = 0
// legacy behaviour: merge + flush all the in-memory segments in one shot
if legacyFlushBehaviour() {
val := &flushable{
segments: make([]segment.Segment, 0),
drops: make([]*roaring.Bitmap, 0),
sbIdxs: make([]int, 0),
totDocs: totDocs,
}
for i, snapshot := range snapshot.segment {
if _, ok := snapshot.segment.(segment.PersistedSegment); !ok {
val.segments = append(val.segments, snapshot.segment)
val.drops = append(val.drops, snapshot.deleted)
val.sbIdxs = append(val.sbIdxs, i)
oldSegIdxs = append(oldSegIdxs, i)
val.totDocs += snapshot.segment.Count()
numSegsToFlushOut++
}
}

if len(flushSet) >= DefaultNumPersisterWorkers {
break
}
flushSet = append(flushSet, val)
} else {
for i, snapshot := range snapshot.segment {
if totSize >= DefaultMaxSizeInMemoryMerge {
if len(sbs) >= DefaultMinSegmentsForInMemoryMerge {
numSegsToFlushOut += len(sbs)
val := &flushable{
segments: make([]segment.Segment, len(sbs)),
drops: make([]*roaring.Bitmap, len(sbsDrops)),
sbIdxs: make([]int, len(sbsIndexes)),
totDocs: totDocs,
}
copy(val.segments, sbs)
copy(val.drops, sbsDrops)
copy(val.sbIdxs, sbsIndexes)
flushSet = append(flushSet, val)

oldSegIdxs = append(oldSegIdxs, sbsIndexes...)
sbs = sbs[:0]
sbsDrops = sbsDrops[:0]
sbsIndexes = sbsIndexes[:0]
totSize = 0
totDocs = 0
}
}

if _, ok := snapshot.segment.(segment.PersistedSegment); !ok {
sbs = append(sbs, snapshot.segment)
sbsDrops = append(sbsDrops, snapshot.deleted)
sbsIndexes = append(sbsIndexes, i)
totDocs += snapshot.segment.Count()
totSize += snapshot.segment.Size()
}
}
if len(flushSet) >= DefaultNumPersisterWorkers {
break
}

// if there were too few segments just merge them all as part of a single worker
if len(flushSet) < DefaultNumPersisterWorkers {
numSegsToFlushOut += len(sbs)
val := &flushable{
segments: make([]segment.Segment, len(sbs)),
drops: make([]*roaring.Bitmap, len(sbsDrops)),
sbIdxs: make([]int, len(sbsIndexes)),
totDocs: totDocs,
if _, ok := snapshot.segment.(segment.PersistedSegment); !ok {
sbs = append(sbs, snapshot.segment)
sbsDrops = append(sbsDrops, snapshot.deleted)
sbsIndexes = append(sbsIndexes, i)
totDocs += snapshot.segment.Count()
totSize += snapshot.segment.Size()
}
}
// if there were too few segments just merge them all as part of a single worker
if len(flushSet) < DefaultNumPersisterWorkers {
numSegsToFlushOut += len(sbs)
val := &flushable{
segments: make([]segment.Segment, len(sbs)),
drops: make([]*roaring.Bitmap, len(sbsDrops)),
sbIdxs: make([]int, len(sbsIndexes)),
totDocs: totDocs,
}
copy(val.segments, sbs)
copy(val.drops, sbsDrops)
copy(val.sbIdxs, sbsIndexes)
flushSet = append(flushSet, val)

oldSegIdxs = append(oldSegIdxs, sbsIndexes...)
sbs = sbs[:0]
sbsDrops = sbsDrops[:0]
sbsIndexes = sbsIndexes[:0]
totSize = 0
totDocs = 0
}
copy(val.segments, sbs)
copy(val.drops, sbsDrops)
copy(val.sbIdxs, sbsIndexes)
flushSet = append(flushSet, val)

oldSegIdxs = append(oldSegIdxs, sbsIndexes...)
sbs = sbs[:0]
sbsDrops = sbsDrops[:0]
sbsIndexes = sbsIndexes[:0]
totSize = 0
totDocs = 0
}

if numSegsToFlushOut < DefaultMinSegmentsForInMemoryMerge {
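Putting the two knobs together, a hedged sketch of how an embedder might choose between the legacy one-shot flush and the parallel flush path. The import path, and the assumption that these exported package-level defaults are intended to be tuned by embedders, are mine, not something this commit states.

package main

import "github.com/blevesearch/bleve/v2/index/scorch"

func main() {
	// legacy one-shot behaviour (this commit's defaults): legacyFlushBehaviour()
	// returns true and a single worker merges + flushes all in-memory segments
	scorch.DefaultNumPersisterWorkers = 1
	scorch.DefaultMaxSizeInMemoryMerge = 0

	// opt in to parallel flushing: several workers, each batching roughly
	// 200 MiB of in-memory segment data per merge
	scorch.DefaultNumPersisterWorkers = 4
	scorch.DefaultMaxSizeInMemoryMerge = 200 * 1024 * 1024
}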
