forked from WorldBrain/Memex
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpipeline.test.js
148 lines (130 loc) · 4.78 KB
/
pipeline.test.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
/* eslint-env jest */
import pipeline, { extractTerms } from './pipeline'
import * as DATA from './pipeline.test.data'
/**
 * Assertion helper shared by the term-extraction cases.
 *
 * Passes `input` to `extractTerms` and asserts, via Jest's `toEqual`, that the
 * result equals a `Set` built from `output`.
 *
 * @param {Object} args
 * @param {*} args.input - Value handed to `extractTerms` unchanged.
 * @param {Iterable} [args.output=DATA.EXPECTED_TERMS] - Expected terms; they
 *     are wrapped in a `Set`, so duplicates and ordering are irrelevant.
 */
function testExtractTerms({ input, output = DATA.EXPECTED_TERMS }) {
    const actualTerms = extractTerms(input)
    const termsAssertion = expect(actualTerms)

    termsAssertion.toEqual(new Set(output))
}
/**
 * Jest suite for the search-index pipeline module.
 *
 * The first case runs the default-exported `pipeline` on a fixture page and
 * compares the result with `DATA.EXPECTED_OUTPUT`. Every other case feeds one
 * variation of the same sentence to `extractTerms` through `testExtractTerms`,
 * which falls back to `DATA.EXPECTED_TERMS` when no `output` is supplied.
 */
describe('Search index pipeline', () => {
    test('process a document', async () => {
        const result = await pipeline({
            pageDoc: DATA.PAGE_1,
            bookmarkDocs: [],
            visits: ['12345'],
            rejectNoContent: true,
        })

        // objectContaining: only the keys listed in the fixture are compared
        expect(result).toEqual(expect.objectContaining(DATA.EXPECTED_OUTPUT))
    })

    // Baseline sentence; the cases below are all variations of it
    test('extract terms from a document', () => {
        testExtractTerms({
            input: 'very often the people forget to optimize important code',
        })
    })

    test('extract terms from a document removing URLs', () => {
        testExtractTerms({
            input:
                'very often the people (https://thepeople.com) forget to optimize important code',
        })
    })

    test('extract terms from a document combining punctuation', () => {
        testExtractTerms({
            input: "very often people's forget to optimize important code",
            // The apostrophe is expected to be squeezed out, not split on
            output: ['peoples', 'forget', 'optimize', 'important', 'code'],
        })
    })

    test('extract terms from a document removing diacritics', () => {
        testExtractTerms({
            input: 'very often the péople forget to óptimize important code',
        })
    })

    test('extract terms from a document normalizing weird spaces', () => {
        testExtractTerms({
            // U+2007 figure space, U+202F narrow no-break space, U+00A0 no-break space
            input:
                'very often\u{2007}the people\u{202F}forget to optimize important\u{A0}code',
        })
    })

    test('extract terms from a document _including_ words with numbers', () => {
        testExtractTerms({
            input:
                'very often the people (like Punkdude123) forget to optimize important code',
            output: [...DATA.EXPECTED_TERMS, 'punkdude123'],
        })
    })

    test('extract terms from a document _including_ emails', () => {
        testExtractTerms({
            // The address has to be spelled out literally: an obfuscated
            // placeholder such as "[email protected]" cannot yield the
            // 'punkdude123@gmail' term expected below.
            input:
                'very often the people (punkdude123@gmail.com) forget to optimize important code',
            output: [...DATA.EXPECTED_TERMS, 'punkdude123@gmail'],
        })
    })

    // https://xkcd.com/37
    test('extract terms from a document _including_ words found in "dash-words"', () => {
        testExtractTerms({
            input:
                'very often the people forget to optimize important-ass code, important-ass-code, and important ass-code',
            // Hyphenated compounds are expected alongside the extra part 'ass'
            output: [
                ...DATA.EXPECTED_TERMS,
                'important-ass-code',
                'important-ass',
                'ass-code',
                'ass',
            ],
        })
    })

    test('extract terms from a document ignoring - spaced - hyphens', () => {
        testExtractTerms({
            input:
                'very - often - the - people forget - to - optimize important code',
            output: DATA.EXPECTED_TERMS,
        })
    })

    test('extract terms from a document removing useless whitespace', () => {
        testExtractTerms({
            // Runs of several spaces, so this case differs from the baseline
            // sentence and really exercises whitespace handling.
            input:
                'very  often the   people forget  to optimize important   code',
        })
    })

    test('extract terms from a document removing random digits', () => {
        // 1-, 3- and 6-digit numbers are expected to leave the default term
        // set untouched; only the 4-digit number is expected to be kept.
        testExtractTerms({
            input: 'very often the 5 people forget to optimize important code',
        })
        testExtractTerms({
            input:
                'very often the 555 people forget to optimize important code',
        })
        testExtractTerms({
            input:
                'very often the 5555 people forget to optimize important code',
            output: [
                '5555',
                'people',
                'forget',
                'optimize',
                'important',
                'code',
            ],
        })
        testExtractTerms({
            input:
                'very often the 555555 people forget to optimize important code',
        })
    })

    test('extract terms from a document removing long words', () => {
        testExtractTerms({
            input:
                'very often the hippopotomonstrosesquippedaliophobic people forget to optimize important code',
        })
    })

    test('extract terms from a document _including_ words with many consonants', () => {
        testExtractTerms({
            input:
                'very often the people from Vrchlabí forget to optimize important code',
            // Expected lower-cased and without the diacritic
            output: [...DATA.EXPECTED_TERMS, 'vrchlabi'],
        })
    })

    test('extract terms from a document removing duplicate words', () => {
        testExtractTerms({
            input:
                'very often the people forget to people optimize important code',
        })
    })
})