Skip to content

Commit

Permalink
initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
mnvx committed Sep 8, 2020
0 parents commit 27a4e74
Show file tree
Hide file tree
Showing 7 changed files with 463 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
.idea
vendor
coverage.out
13 changes: 13 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Absolute directory of the last entry in MAKEFILE_LIST (presumably this
# Makefile, as long as nothing was included before this line).
PATH_THIS:=$(realpath $(dir $(lastword ${MAKEFILE_LIST})))
# Directory in which the targets below run their commands.
DIR:=$(PATH_THIS)


# help: print the names and descriptions of the available targets.
help:
@echo " test"
@echo " Run tests"


# test: change to DIR and run `go test` for every package below it.
.PHONY: test
test:
@cd $(DIR) \
&& go test ./...
35 changes: 35 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# In-memory search index

## How to use

```go
import "github.com/twelvedata/searchindex"

// Values to index
searchList := searchindex.SearchList{
    &searchindex.SearchItem{
        Key: "AAPL",
        Data: &SymbolInfo{Symbol: "AAPL", Exchange: "NASDAQ", Instrument: "Apple Inc"},
    },
    &searchindex.SearchItem{
        Key: "AMZN",
        Data: &SymbolInfo{Symbol: "AMZN", Exchange: "NASDAQ", Instrument: "Amazon.com Inc"},
    },
}

// Build the index
searchIndex := searchindex.NewSearchIndex(searchList, 10, nil, nil, true, nil)

// Search
result := searchIndex.Search(searchindex.SearchParams{
    Text: "aa",
    OutputSize: 10,
    Matching: searchindex.Beginning,
})
```

Run tests:

```bash
make test
```
8 changes: 8 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
module github.com/twelvedata/searchindex

go 1.12

require (
github.com/iancoleman/orderedmap v0.0.0-20190318233801-ac98e3ecb4b0
golang.org/x/text v0.3.3
)
5 changes: 5 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
github.com/iancoleman/orderedmap v0.0.0-20190318233801-ac98e3ecb4b0 h1:i462o439ZjprVSFSZLZxcsoAe592sZB1rci2Z8j4wdk=
github.com/iancoleman/orderedmap v0.0.0-20190318233801-ac98e3ecb4b0/go.mod h1:N0Wam8K1arqPXNWjMo21EXnBPOPp36vB07FNRdD2geA=
golang.org/x/text v0.3.3 h1:cokOdA+Jmi5PJGXLlLllQSgYigAEfHXJAERHVMaCc2k=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
280 changes: 280 additions & 0 deletions search_index.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,280 @@
package searchindex

import (
"github.com/iancoleman/orderedmap"
"golang.org/x/text/runes"
"golang.org/x/text/transform"
"golang.org/x/text/unicode/norm"
"reflect"
"regexp"
s "sort"
"strings"
"unicode"
)

// SearchIndexInterface is the set of operations of a search index; it is the
// type NewSearchIndex returns.
type SearchIndexInterface interface {
// AppendData adds the given items to the index.
AppendData(data SearchList)
// Search looks up params.Text and returns the Data values of the matches.
Search(params SearchParams) []SearchData
}

// SearchIndex is the index implementation created by NewSearchIndex.
type SearchIndex struct {
// NOTE(review): embedded interface that NewSearchIndex never assigns, so it
// stays nil there; AppendData and Search are implemented directly on
// SearchIndex.
SearchIndexInterface
// index is the root node of the key tree.
index Index
// limit is the size Search falls back to when the requested output size is
// not positive or exceeds it.
limit int
// preprocessFunc splits a key or a search text into its normalized parts.
preprocessFunc func(key string, stopWords map[string]bool) []string
// sortFunc is the "less" function AppendData sorts prepared items with.
sortFunc func(i, j int, data interface{}) bool
// indexParts makes AppendData index every suffix of a key's parts, not only
// the full key.
indexParts bool
// stopWords is the set handed to preprocessFunc on every call.
stopWords map[string]bool
}

// Index is one node of the key tree built by addToIndex.
type Index struct {
// children maps the next single byte of a key (as a string) to the *Index
// node for it, in insertion order.
children *orderedmap.OrderedMap
// key is the complete key that ends at this node; it is only set on nodes
// that store data.
key string
// data holds the items stored under key.
data SearchList
}

// Matching modes accepted in SearchParams.Matching.
const (
	// Strict returns only the items stored under exactly the searched key.
	Strict = iota
	// Beginning additionally returns the items of every key that starts
	// with the searched key.
	Beginning
)

// SearchParams describes a single Search request.
type SearchParams struct {
// Text is the query; Search runs it through the index's preprocess function
// before the lookup.
Text string
// OutputSize is the requested number of results; a value that is not
// positive or exceeds the index limit is replaced by that limit.
OutputSize int
// Matching selects the matching mode: Strict or Beginning.
Matching int
// StartValues are placed at the beginning of the result, are excluded from
// the matches found in the index, and count against OutputSize.
StartValues []SearchData
}

// SearchData is the caller-defined payload attached to an indexed item.
type SearchData interface{}

// SearchItem pairs the text to index (Key) with the payload returned for it.
type SearchItem struct {
	Key  string
	Data SearchData
}

// SearchList is a list of items to index.
type SearchList []*SearchItem

// defaultSortFunc is the ordering used when NewSearchIndex gets no sort
// function: it reports whether the key of item i sorts before the key of
// item j. data must hold a SearchList; any other type makes the type
// assertion panic.
func defaultSortFunc(i, j int, data interface{}) bool {
	list := data.(SearchList)
	left, right := list[i], list[j]
	return left.Key < right.Key
}

// Patterns used by defaultPreprocessFunc. They are compiled once when the
// package is initialized instead of on every call, because the function runs
// for every indexed key and for every search.
var (
	// defaultPunctuationRe matches one punctuation character to be replaced
	// by a space.
	defaultPunctuationRe = regexp.MustCompile("[`'\".,:;\\?!+\\-–*=<>_~@#№$%^&()|/\\\\]")
	// defaultWhitespaceRe matches a run of whitespace to be collapsed into a
	// single space.
	defaultWhitespaceRe = regexp.MustCompile("\\s+")
)

// defaultPreprocessFunc is the preprocessing used when NewSearchIndex gets no
// preprocess function. It replaces punctuation by spaces, collapses
// whitespace, trims surrounding spaces, lower-cases the text, strips
// combining marks ("São, Österreich" becomes "Sao, Osterreich"), splits the
// result on single spaces and drops every part that is a key of stopWords.
//
// The returned slice is nil when every part is a stop word. An empty input
// yields one empty part unless "" is itself a stop word.
func defaultPreprocessFunc(key string, stopWords map[string]bool) []string {
	processed := defaultPunctuationRe.ReplaceAllString(key, " ")
	processed = defaultWhitespaceRe.ReplaceAllString(processed, " ")
	processed = strings.Trim(processed, " ")
	processed = strings.ToLower(processed)

	// Decompose, remove the non-spacing marks, recompose. The transformer is
	// created per call, so no transformer state is shared between calls.
	t := transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC)
	// On failure keep the un-normalized text rather than the partial output
	// transform.String returns together with an error.
	if normalized, _, err := transform.String(t, processed); err == nil {
		processed = normalized
	}

	// Exclude stop words.
	var result []string
	for _, part := range strings.Split(processed, " ") {
		if _, isStopWord := stopWords[part]; !isStopWord {
			result = append(result, part)
		}
	}
	return result
}

// NewSearchIndex builds an index and fills it with data.
//
//   - limit is the size Search falls back to when the requested output size
//     is not positive or exceeds it.
//   - sort orders the prepared items inside AppendData; nil selects
//     defaultSortFunc.
//   - preprocess splits keys and search texts into parts; nil selects
//     defaultPreprocessFunc.
//   - indexParts additionally indexes every suffix of a key's parts.
//   - stopWords are run through the preprocess function themselves (with an
//     empty stop-word set) and every resulting part becomes a stop word.
func NewSearchIndex(
	data SearchList,
	limit int,
	sort func(i, j int, data interface{}) bool,
	preprocess func(key string, stopWords map[string]bool) []string,
	indexParts bool,
	stopWords []string,
) SearchIndexInterface {
	if preprocess == nil {
		preprocess = defaultPreprocessFunc
	}
	if sort == nil {
		sort = defaultSortFunc
	}

	// Normalize the stop words the same way keys are normalized, so they can
	// be compared with key parts directly.
	stopWordSet := make(map[string]bool)
	for _, word := range stopWords {
		for _, part := range preprocess(word, make(map[string]bool)) {
			stopWordSet[part] = true
		}
	}

	created := &SearchIndex{
		index:          Index{children: orderedmap.New()},
		limit:          limit,
		preprocessFunc: preprocess,
		sortFunc:       sort,
		indexParts:     indexParts,
		stopWords:      stopWordSet,
	}
	created.AppendData(data)

	return created
}

// AppendData adds data to the index.
//
// Every item is copied, its key is split by the preprocess function, and one
// entry is created for the full list of parts joined by spaces. With
// indexParts enabled an entry is also created for each suffix of the parts.
// The entries are sorted with the index's sort function, grouped by key in
// that order, and each group is handed to addToIndex.
//
// NOTE(review): the receiver is a value, so c.index is a copy of the root
// node. Child nodes are reached through the shared children map, but what
// addToIndex assigns to the root node itself (an entry whose key is "") is
// written to that copy only.
func (c SearchIndex) AppendData(data SearchList) {
	// Build the entries to index from copies of the caller's items.
	var entries SearchList
	for _, original := range copyOriginalData(data) {
		parts := c.preprocessFunc(original.Key, c.stopWords)
		suffixCount := len(parts)
		if !c.indexParts && suffixCount > 1 {
			// Only the full key is indexed.
			suffixCount = 1
		}
		for from := 0; from < suffixCount; from++ {
			entry := *original
			entry.Key = strings.Join(parts[from:], " ")
			entries = append(entries, &entry)
		}
	}

	s.Slice(entries, func(i, j int) bool {
		return c.sortFunc(i, j, entries)
	})

	// Group by key; the ordered map keeps the keys in sorted order.
	grouped := orderedmap.New()
	for _, entry := range entries {
		bucket := SearchList{entry}
		if existing, ok := grouped.Get(entry.Key); ok {
			bucket = append(existing.(SearchList), entry)
		}
		grouped.Set(entry.Key, bucket)
	}

	for _, key := range grouped.Keys() {
		bucket, _ := grouped.Get(key)
		addToIndex(&c.index, key, key, bucket.(SearchList))
	}
}

// copyOriginalData returns a list of shallow copies of the items in data, so
// that changing a copy's Key leaves the caller's item untouched. The Data
// value of each copy is the same value the original holds. A nil item makes
// the dereference panic.
func copyOriginalData(data SearchList) SearchList {
	copied := make(SearchList, 0, len(data))
	for _, item := range data {
		clone := *item
		copied = append(copied, &clone)
	}
	return copied
}

// addToIndex stores data under key in the tree rooted at index.
//
// It walks keyTail one byte at a time, creating the missing child nodes, and
// assigns key and data to the node reached at the end. Whatever data that
// node held before is replaced.
func addToIndex(index *Index, keyTail string, key string, data SearchList) {
	node := index
	for pos := 0; pos < len(keyTail); pos++ {
		step := keyTail[pos : pos+1]
		child, ok := node.children.Get(step)
		if !ok {
			child = &Index{children: orderedmap.New()}
			node.children.Set(step, child)
		}
		node = child.(*Index)
	}
	node.key = key
	node.data = data
}

// Search looks up params.Text in the index and returns params.StartValues
// followed by the Data values of the matching items.
//
// The effective output size is params.OutputSize, or the index limit when
// OutputSize is not positive or exceeds the limit. Start values count
// against that size and are never repeated among the matches. When there are
// at least as many start values as the effective size, only the start values
// are returned. A negative index limit keeps its existing meaning of "no
// cap" on the matches.
func (c SearchIndex) Search(params SearchParams) []SearchData {
	outputSize := params.OutputSize
	if outputSize <= 0 || outputSize > c.limit {
		outputSize = c.limit
	}

	// Remember the identity of the start values so the lookup can skip them.
	// Values without a pointer identity (nil, strings, structs, ...) cannot
	// equal any indexed pointer and would make reflect.Value.Pointer panic,
	// so they are left out of the set.
	start := make(map[uintptr]bool, len(params.StartValues))
	for _, item := range params.StartValues {
		v := reflect.ValueOf(item)
		switch v.Kind() {
		case reflect.Chan, reflect.Func, reflect.Map, reflect.Ptr, reflect.Slice, reflect.UnsafePointer:
			start[v.Pointer()] = true
		}
	}

	// A negative budget is read as "no cap" further down, so more start
	// values than the output size must not turn the budget negative.
	remaining := outputSize - len(params.StartValues)
	if outputSize >= 0 && remaining < 0 {
		remaining = 0
	}

	data := c.searchInIndex(
		&c.index,
		strings.Join(c.preprocessFunc(params.Text, c.stopWords), " "),
		params.Matching,
		remaining,
		start,
	)

	// Start values first, matches after them.
	result := make([]SearchData, 0, len(params.StartValues)+len(data))
	result = append(result, params.StartValues...)
	result = append(result, data...)

	return result
}

// searchInIndex walks from index along key, one byte per node, and returns
// the Data values collected by searchList from the node reached. It returns
// an empty, non-nil slice when the tree has no node for key.
func (c SearchIndex) searchInIndex(index *Index, key string, matching int, outputSize int, start map[uintptr]bool) []SearchData {
	node := index
	for pos := 0; pos < len(key); pos++ {
		child, ok := node.children.Get(key[pos : pos+1])
		if !ok {
			return make([]SearchData, 0)
		}
		node = child.(*Index)
	}

	// found tracks the payloads collected during this search.
	found := make(map[uintptr]bool)
	matched := c.searchList(node, make(SearchList, 0), matching, outputSize, found, start)
	return c.getData(matched)
}

// searchList appends the items stored at index to items and, in Beginning
// mode, does the same for every node below index in child insertion order.
// It returns the grown list.
//
// An item is skipped when the pointer held in its Data is already a key of
// found (collected earlier in this search) or of start (supplied by the
// caller). Collection stops as soon as outputSize items are gathered: an
// outputSize of 0 collects nothing, a negative one disables the cap. Data
// must hold a value reflect.Value.Pointer accepts, otherwise the call panics.
func (c SearchIndex) searchList(index *Index, items SearchList, matching int, outputSize int, found map[uintptr]bool, start map[uintptr]bool) SearchList {
	if outputSize == 0 {
		return items
	}
	capped := outputSize > 0
	if capped && len(items) >= outputSize {
		return items
	}

	for _, item := range index.data {
		// Items are identified by their payload pointer, so the same payload
		// indexed under several keys is reported once.
		ptr := reflect.ValueOf(item.Data).Pointer()
		if _, seen := found[ptr]; seen {
			continue
		}
		if _, excluded := start[ptr]; excluded {
			continue
		}
		items = append(items, item)
		found[ptr] = true
		if capped && len(items) >= outputSize {
			return items
		}
	}

	childKeys := index.children.Keys()
	if len(childKeys) == 0 || matching != Beginning {
		return items
	}
	for _, childKey := range childKeys {
		child, _ := index.children.Get(childKey)
		items = c.searchList(child.(*Index), items, matching, outputSize, found, start)
	}
	return items
}

// getData returns the Data value of every item in data, in the same order.
// The result is never nil, even for an empty list.
func (c SearchIndex) getData(data SearchList) []SearchData {
	values := make([]SearchData, 0, len(data))
	for _, item := range data {
		values = append(values, item.Data)
	}
	return values
}
Loading

0 comments on commit 27a4e74

Please sign in to comment.