-
Notifications
You must be signed in to change notification settings - Fork 689
/
Copy pathsearch_knn.go
626 lines (575 loc) · 20.5 KB
/
search_knn.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
// Copyright (c) 2023 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//go:build vectors
// +build vectors
package bleve
import (
"context"
"encoding/json"
"fmt"
"sort"
"github.com/blevesearch/bleve/v2/document"
"github.com/blevesearch/bleve/v2/search"
"github.com/blevesearch/bleve/v2/search/collector"
"github.com/blevesearch/bleve/v2/search/query"
index "github.com/blevesearch/bleve_index_api"
)
const supportForVectorSearch = true
type knnOperator string
// Must be updated only at init
var BleveMaxK = int64(10000)
type SearchRequest struct {
ClientContextID string `json:"client_context_id,omitempty"`
Query query.Query `json:"query"`
Size int `json:"size"`
From int `json:"from"`
Highlight *HighlightRequest `json:"highlight"`
Fields []string `json:"fields"`
Facets FacetsRequest `json:"facets"`
Explain bool `json:"explain"`
Sort search.SortOrder `json:"sort"`
IncludeLocations bool `json:"includeLocations"`
Score string `json:"score,omitempty"`
SearchAfter []string `json:"search_after"`
SearchBefore []string `json:"search_before"`
KNN []*KNNRequest `json:"knn"`
KNNOperator knnOperator `json:"knn_operator"`
// PreSearchData will be a map that will be used
// in the second phase of any 2-phase search, to provide additional
// context to the second phase. This is useful in the case of index
// aliases where the first phase will gather the PreSearchData from all
// the indexes in the alias, and the second phase will use that
// PreSearchData to perform the actual search.
// The currently accepted map configuration is:
//
// "_knn_pre_search_data_key": []*search.DocumentMatch
PreSearchData map[string]interface{} `json:"pre_search_data,omitempty"`
sortFunc func(sort.Interface)
}
// Vector takes precedence over vectorBase64 in case both fields are given
type KNNRequest struct {
Field string `json:"field"`
Vector []float32 `json:"vector"`
VectorBase64 string `json:"vector_base64"`
K int64 `json:"k"`
Boost *query.Boost `json:"boost,omitempty"`
// Search parameters for the field's vector index part of the segment.
// Value of it depends on the field's backing vector index implementation.
//
// For Faiss IVF index, supported search params are:
// - ivf_nprobe_pct : int // percentage of total clusters to search
// - ivf_max_codes_pct : float // percentage of total vectors to visit to do a query (across all clusters)
//
// Consult go-faiss to know all supported search params
Params json.RawMessage `json:"params"`
// Filter query to use with kNN pre-filtering.
// Supports pre-filtering with all existing types of query clauses.
FilterQuery query.Query `json:"filter,omitempty"`
}
func (r *SearchRequest) AddKNN(field string, vector []float32, k int64, boost float64) {
b := query.Boost(boost)
r.KNN = append(r.KNN, &KNNRequest{
Field: field,
Vector: vector,
K: k,
Boost: &b,
})
}
func (r *SearchRequest) AddKNNWithFilter(field string, vector []float32, k int64,
boost float64, filterQuery query.Query) {
b := query.Boost(boost)
r.KNN = append(r.KNN, &KNNRequest{
Field: field,
Vector: vector,
K: k,
Boost: &b,
FilterQuery: filterQuery,
})
}
func (r *SearchRequest) AddKNNOperator(operator knnOperator) {
r.KNNOperator = operator
}
// UnmarshalJSON deserializes a JSON representation of
// a SearchRequest
func (r *SearchRequest) UnmarshalJSON(input []byte) error {
type tempKNNReq struct {
Field string `json:"field"`
Vector []float32 `json:"vector"`
VectorBase64 string `json:"vector_base64"`
K int64 `json:"k"`
Boost *query.Boost `json:"boost,omitempty"`
Params json.RawMessage `json:"params"`
FilterQuery json.RawMessage `json:"filter,omitempty"`
}
var temp struct {
Q json.RawMessage `json:"query"`
Size *int `json:"size"`
From int `json:"from"`
Highlight *HighlightRequest `json:"highlight"`
Fields []string `json:"fields"`
Facets FacetsRequest `json:"facets"`
Explain bool `json:"explain"`
Sort []json.RawMessage `json:"sort"`
IncludeLocations bool `json:"includeLocations"`
Score string `json:"score"`
SearchAfter []string `json:"search_after"`
SearchBefore []string `json:"search_before"`
KNN []*tempKNNReq `json:"knn"`
KNNOperator knnOperator `json:"knn_operator"`
PreSearchData json.RawMessage `json:"pre_search_data"`
}
err := json.Unmarshal(input, &temp)
if err != nil {
return err
}
if temp.Size == nil {
r.Size = 10
} else {
r.Size = *temp.Size
}
if temp.Sort == nil {
r.Sort = search.SortOrder{&search.SortScore{Desc: true}}
} else {
r.Sort, err = search.ParseSortOrderJSON(temp.Sort)
if err != nil {
return err
}
}
r.From = temp.From
r.Explain = temp.Explain
r.Highlight = temp.Highlight
r.Fields = temp.Fields
r.Facets = temp.Facets
r.IncludeLocations = temp.IncludeLocations
r.Score = temp.Score
r.SearchAfter = temp.SearchAfter
r.SearchBefore = temp.SearchBefore
r.Query, err = query.ParseQuery(temp.Q)
if err != nil {
return err
}
if r.Size < 0 {
r.Size = 10
}
if r.From < 0 {
r.From = 0
}
r.KNN = make([]*KNNRequest, len(temp.KNN))
for i, knnReq := range temp.KNN {
r.KNN[i] = &KNNRequest{}
r.KNN[i].Field = temp.KNN[i].Field
r.KNN[i].Vector = temp.KNN[i].Vector
r.KNN[i].VectorBase64 = temp.KNN[i].VectorBase64
r.KNN[i].K = temp.KNN[i].K
r.KNN[i].Boost = temp.KNN[i].Boost
r.KNN[i].Params = temp.KNN[i].Params
if len(knnReq.FilterQuery) == 0 {
// Setting this to nil to avoid ParseQuery() setting it to a match none
r.KNN[i].FilterQuery = nil
} else {
r.KNN[i].FilterQuery, err = query.ParseQuery(knnReq.FilterQuery)
}
}
r.KNNOperator = temp.KNNOperator
if r.KNNOperator == "" {
r.KNNOperator = knnOperatorOr
}
if temp.PreSearchData != nil {
r.PreSearchData, err = query.ParsePreSearchData(temp.PreSearchData)
if err != nil {
return err
}
}
return nil
}
// -----------------------------------------------------------------------------
func copySearchRequest(req *SearchRequest, preSearchData map[string]interface{}) *SearchRequest {
rv := SearchRequest{
Query: req.Query,
Size: req.Size + req.From,
From: 0,
Highlight: req.Highlight,
Fields: req.Fields,
Facets: req.Facets,
Explain: req.Explain,
Sort: req.Sort.Copy(),
IncludeLocations: req.IncludeLocations,
Score: req.Score,
SearchAfter: req.SearchAfter,
SearchBefore: req.SearchBefore,
KNN: req.KNN,
KNNOperator: req.KNNOperator,
PreSearchData: preSearchData,
}
return &rv
}
var (
knnOperatorAnd = knnOperator("and")
knnOperatorOr = knnOperator("or")
)
func createKNNQuery(req *SearchRequest, eligibleDocsMap map[int][]index.IndexInternalID,
requiresFiltering map[int]bool) (
query.Query, []int64, int64, error) {
if requestHasKNN(req) {
// first perform validation
err := validateKNN(req)
if err != nil {
return nil, nil, 0, err
}
var subQueries []query.Query
kArray := make([]int64, 0, len(req.KNN))
sumOfK := int64(0)
for i, knn := range req.KNN {
// If it's a filtered kNN but has no eligible filter hits, then
// do not run the kNN query.
if requiresFiltering[i] && len(eligibleDocsMap[i]) <= 0 {
continue
}
knnQuery := query.NewKNNQuery(knn.Vector)
knnQuery.SetFieldVal(knn.Field)
knnQuery.SetK(knn.K)
knnQuery.SetBoost(knn.Boost.Value())
knnQuery.SetParams(knn.Params)
if len(eligibleDocsMap[i]) > 0 {
knnQuery.SetFilterQuery(knn.FilterQuery)
filterResults, exists := eligibleDocsMap[i]
if exists {
knnQuery.SetFilterResults(filterResults)
}
}
subQueries = append(subQueries, knnQuery)
kArray = append(kArray, knn.K)
sumOfK += knn.K
}
rv := query.NewDisjunctionQuery(subQueries)
rv.RetrieveScoreBreakdown(true)
return rv, kArray, sumOfK, nil
}
return nil, nil, 0, nil
}
func validateKNN(req *SearchRequest) error {
if req.KNN != nil &&
req.KNNOperator != "" &&
req.KNNOperator != knnOperatorOr &&
req.KNNOperator != knnOperatorAnd {
return fmt.Errorf("unknown knn operator: %s", req.KNNOperator)
}
for _, q := range req.KNN {
if q == nil {
return fmt.Errorf("knn query cannot be nil")
}
if len(q.Vector) == 0 && q.VectorBase64 != "" {
// consider vector_base64 only if vector is not provided
decodedVector, err := document.DecodeVector(q.VectorBase64)
if err != nil {
return err
}
q.Vector = decodedVector
}
if q.K <= 0 || len(q.Vector) == 0 {
return fmt.Errorf("k must be greater than 0 and vector must be non-empty")
}
if q.K > BleveMaxK {
return fmt.Errorf("k must be less than %d", BleveMaxK)
}
}
switch req.KNNOperator {
case knnOperatorAnd, knnOperatorOr, "":
// Valid cases, do nothing
default:
return fmt.Errorf("knn_operator must be either 'and' / 'or'")
}
return nil
}
func addSortAndFieldsToKNNHits(req *SearchRequest, knnHits []*search.DocumentMatch, reader index.IndexReader, name string) (err error) {
requiredSortFields := req.Sort.RequiredFields()
var dvReader index.DocValueReader
var updateFieldVisitor index.DocValueVisitor
if len(requiredSortFields) > 0 {
dvReader, err = reader.DocValueReader(requiredSortFields)
if err != nil {
return err
}
updateFieldVisitor = func(field string, term []byte) {
req.Sort.UpdateVisitor(field, term)
}
}
for _, hit := range knnHits {
if len(requiredSortFields) > 0 {
err = dvReader.VisitDocValues(hit.IndexInternalID, updateFieldVisitor)
if err != nil {
return err
}
}
req.Sort.Value(hit)
err, _ = LoadAndHighlightFields(hit, req, "", reader, nil)
if err != nil {
return err
}
hit.Index = name
}
return nil
}
func (i *indexImpl) runKnnCollector(ctx context.Context, req *SearchRequest, reader index.IndexReader, preSearch bool) ([]*search.DocumentMatch, error) {
// maps the index of the KNN query in the req to the pre-filter hits aka
// eligible docs' internal IDs .
filterHitsMap := make(map[int][]index.IndexInternalID)
// Indicates if this query requires filtering downstream
// No filtering required if it's a match all query/no filters applied.
requiresFiltering := make(map[int]bool)
for idx, knnReq := range req.KNN {
// TODO Can use goroutines for this filter query stuff - do it if perf results
// show this to be significantly slow otherwise.
filterQ := knnReq.FilterQuery
if filterQ == nil {
requiresFiltering[idx] = false
continue
}
if _, ok := filterQ.(*query.MatchAllQuery); ok {
// Equivalent to not having a filter query.
requiresFiltering[idx] = false
continue
}
if isMatchNoneQuery(filterQ) {
// Filtering required since no hits are eligible.
requiresFiltering[idx] = true
// a match none query just means none the documents are eligible
// hence, we can save on running the query.
continue
}
// Applies to all supported types of queries.
filterSearcher, _ := filterQ.Searcher(ctx, reader, i.m, search.SearcherOptions{
Score: "none", // just want eligible hits --> don't compute scores if not needed
})
// Using the index doc count to determine collector size since we do not
// have an estimate of the number of eligible docs in the index yet.
indexDocCount, err := i.DocCount()
if err != nil {
return nil, err
}
filterColl := collector.NewEligibleCollector(int(indexDocCount))
err = filterColl.Collect(ctx, filterSearcher, reader)
if err != nil {
return nil, err
}
filterHits := filterColl.IDs()
if len(filterHits) > 0 {
filterHitsMap[idx] = filterHits
}
// set requiresFiltering regardless of whether there're filtered hits or
// not to later decide whether to consider the knnQuery or not
requiresFiltering[idx] = true
}
// Add the filter hits when creating the kNN query
KNNQuery, kArray, sumOfK, err := createKNNQuery(req, filterHitsMap, requiresFiltering)
if err != nil {
return nil, err
}
knnSearcher, err := KNNQuery.Searcher(ctx, reader, i.m, search.SearcherOptions{
Explain: req.Explain,
})
if err != nil {
return nil, err
}
knnCollector := collector.NewKNNCollector(kArray, sumOfK)
err = knnCollector.Collect(ctx, knnSearcher, reader)
if err != nil {
return nil, err
}
knnHits := knnCollector.Results()
if !preSearch {
knnHits = finalizeKNNResults(req, knnHits)
}
// at this point, irrespective of whether it is a preSearch or not,
// the knn hits are populated with Sort and Fields.
// it must be ensured downstream that the Sort and Fields are not
// re-evaluated, for these hits.
// also add the index names to the hits, so that when early
// exit takes place after the first phase, the hits will have
// a valid value for Index.
err = addSortAndFieldsToKNNHits(req, knnHits, reader, i.name)
if err != nil {
return nil, err
}
return knnHits, nil
}
func setKnnHitsInCollector(knnHits []*search.DocumentMatch, req *SearchRequest, coll *collector.TopNCollector) {
if len(knnHits) > 0 {
newScoreExplComputer := func(queryMatch *search.DocumentMatch, knnMatch *search.DocumentMatch) (float64, *search.Explanation) {
totalScore := queryMatch.Score + knnMatch.Score
if !req.Explain {
// exit early as we don't need to compute the explanation
return totalScore, nil
}
return totalScore, &search.Explanation{Value: totalScore, Message: "sum of:", Children: []*search.Explanation{queryMatch.Expl, knnMatch.Expl}}
}
coll.SetKNNHits(knnHits, search.ScoreExplCorrectionCallbackFunc(newScoreExplComputer))
}
}
func finalizeKNNResults(req *SearchRequest, knnHits []*search.DocumentMatch) []*search.DocumentMatch {
// if the KNN operator is AND, then we need to filter out the hits that
// do not have match the KNN queries.
if req.KNNOperator == knnOperatorAnd {
idx := 0
for _, hit := range knnHits {
if len(hit.ScoreBreakdown) == len(req.KNN) {
knnHits[idx] = hit
idx++
}
}
knnHits = knnHits[:idx]
}
// fix the score using score breakdown now
// if the score is none, then we need to set the score to 0.0
// if req.Explain is true, then we need to use the expl breakdown to
// finalize the correct explanation.
for _, hit := range knnHits {
hit.Score = 0.0
if req.Score != "none" {
for _, score := range hit.ScoreBreakdown {
hit.Score += score
}
}
if req.Explain {
childrenExpl := make([]*search.Explanation, 0, len(hit.ScoreBreakdown))
for i := range hit.ScoreBreakdown {
childrenExpl = append(childrenExpl, hit.Expl.Children[i])
}
hit.Expl = &search.Explanation{Value: hit.Score, Message: "sum of:", Children: childrenExpl}
}
// we don't need the score breakdown anymore
// so we can set it to nil
hit.ScoreBreakdown = nil
}
return knnHits
}
// when we are setting KNN hits in the preSearchData, we need to make sure that
// the KNN hit goes to the right index. This is because the KNN hits are
// collected from all the indexes in the alias, but the preSearchData is
// specific to each index. If alias A1 contains indexes I1 and I2 and
// the KNN hits collected from both I1 and I2, and merged to get top K
// hits, then the top K hits need to be distributed to I1 and I2,
// so that the preSearchData for I1 contains the top K hits from I1 and
// the preSearchData for I2 contains the top K hits from I2.
func validateAndDistributeKNNHits(knnHits []*search.DocumentMatch, indexes []Index) (map[string][]*search.DocumentMatch, error) {
// create a set of all the index names of this alias
indexNames := make(map[string]struct{}, len(indexes))
for _, index := range indexes {
indexNames[index.Name()] = struct{}{}
}
segregatedKnnHits := make(map[string][]*search.DocumentMatch)
for _, hit := range knnHits {
// for each hit, we need to perform a validation check to ensure that the stack
// is still valid.
//
// if the stack is empty, then we have an inconsistency/abnormality
// since any hit with an empty stack is supposed to land on a leaf index,
// and not an alias. This cannot happen in normal circumstances. But
// performing this check to be safe. Since we extract the stack top
// in the following steps.
if len(hit.IndexNames) == 0 {
return nil, ErrorTwoPhaseSearchInconsistency
}
// since the stack is not empty, we need to check if the top of the stack
// is a valid index name, of an index that is part of this alias. If not,
// then we have an inconsistency that could be caused due to a topology
// change.
stackTopIdx := len(hit.IndexNames) - 1
top := hit.IndexNames[stackTopIdx]
if _, exists := indexNames[top]; !exists {
return nil, ErrorTwoPhaseSearchInconsistency
}
if stackTopIdx == 0 {
// if the stack consists of only one index, then popping the top
// would result in an empty slice, and handle this case by setting
// indexNames to nil. So that the final search results will not
// contain the indexNames field.
hit.IndexNames = nil
} else {
hit.IndexNames = hit.IndexNames[:stackTopIdx]
}
segregatedKnnHits[top] = append(segregatedKnnHits[top], hit)
}
return segregatedKnnHits, nil
}
func requestHasKNN(req *SearchRequest) bool {
return len(req.KNN) > 0
}
// returns true if the search request contains a KNN request that can be
// satisfied by just performing a preSearch, completely bypassing the
// actual search.
func isKNNrequestSatisfiedByPreSearch(req *SearchRequest) bool {
// if req.Query is not match_none => then we need to go to phase 2
// to perform the actual query.
if !isMatchNoneQuery(req.Query) {
return false
}
// req.Query is a match_none query
//
// if request contains facets, we need to perform phase 2 to calculate
// the facet result. Since documents were removed as part of the
// merging process after phase 1, if the facet results were to be calculated
// during phase 1, then they will be now be incorrect, since merging would
// remove some documents.
if req.Facets != nil {
return false
}
// the request is a match_none query and does not contain any facets
// so we can satisfy the request using just the preSearch result.
return true
}
func constructKnnPreSearchData(mergedOut map[string]map[string]interface{}, preSearchResult *SearchResult,
indexes []Index) (map[string]map[string]interface{}, error) {
distributedHits, err := validateAndDistributeKNNHits([]*search.DocumentMatch(preSearchResult.Hits), indexes)
if err != nil {
return nil, err
}
for _, index := range indexes {
mergedOut[index.Name()][search.KnnPreSearchDataKey] = distributedHits[index.Name()]
}
return mergedOut, nil
}
func addKnnToDummyRequest(dummyReq *SearchRequest, realReq *SearchRequest) {
dummyReq.KNN = realReq.KNN
dummyReq.KNNOperator = knnOperatorOr
dummyReq.Explain = realReq.Explain
dummyReq.Fields = realReq.Fields
dummyReq.Sort = realReq.Sort
}
func newKnnPreSearchResultProcessor(req *SearchRequest) *knnPreSearchResultProcessor {
kArray := make([]int64, len(req.KNN))
for i, knnReq := range req.KNN {
kArray[i] = knnReq.K
}
knnStore := collector.GetNewKNNCollectorStore(kArray)
return &knnPreSearchResultProcessor{
addFn: func(sr *SearchResult, indexName string) {
for _, hit := range sr.Hits {
// tag the hit with the index name, so that when the
// final search result is constructed, the hit will have
// a valid path to follow along the alias tree to reach
// the index.
hit.IndexNames = append(hit.IndexNames, indexName)
knnStore.AddDocument(hit)
}
},
finalizeFn: func(sr *SearchResult) {
// passing nil as the document fixup function, because we don't need to
// fixup the document, since this was already done in the first phase,
// hence error is always nil.
// the merged knn hits are finalized and set in the search result.
sr.Hits, _ = knnStore.Final(nil)
},
}
}