-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdemo.py
executable file
·47 lines (42 loc) · 1.29 KB
/
demo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
#!/usr/bin/env python3
#
# This file is very similar to the one used every year
# for the #NeoloxismoDoAno neology selection process.
# As such, it is a perfect example of how to use the library.
# We hope it helps you understand how the library is used.
#
from raposa.core.pipeline import BasicPipeline
from raposa.core.tokenizers import RegexTokenizer
from raposa.core.tubes import LowercaseAdaptor, UrlRemoval, MentionRemoval, HashtagRemoval, EmojiRemoval, RegexRemoval, PunctRemoval, NumberRemoval
from raposa.langs.gl.tubes import GLXiadaFilter, GLEstravizFilter, GLToponymFilter, GLWikipediaFilter
from raposa.langs.es.tubes import ESFirstNamesFilter, ESLastNamesFilter
# Stage 1 -- preprocessing.
# A sub-pipeline built from the lowercase adaptor and the URL, mention,
# hashtag and emoji removal tubes, in that order.
_preprocessing_stage = BasicPipeline([
    LowercaseAdaptor(),
    UrlRemoval(),
    MentionRemoval(),
    HashtagRemoval(),
    EmojiRemoval()
])

# Stage 2 -- word massaging & filtering.
# This sub-pipeline gets its own RegexTokenizer; the pattern matches runs of
# '.', ',', ';', ':', '_', whitespace and quote characters (presumably used
# as the token separator -- confirm against RegexTokenizer).
# Its tubes strip ordinal marks (º/ª), numbers and punctuation, then apply
# the Galician (Xiada, Estraviz, toponym, Wikipedia) and Spanish
# (first-name, last-name) filters in the listed order.
_word_filtering_stage = BasicPipeline(
    tokenizer=RegexTokenizer(r'[.,;:_\s\'\"]+'),
    tubes=[
        RegexRemoval(r'[ºª]+'),
        NumberRemoval(),
        PunctRemoval(),
        GLXiadaFilter(),
        GLEstravizFilter(),
        GLToponymFilter(),
        GLWikipediaFilter(),
        ESFirstNamesFilter(),
        ESLastNamesFilter()
    ],
)

# The complete pipeline chains both stages.
# NOTE(review): reads_from_gen=False presumably means the pipeline is fed one
# string at a time rather than a generator -- confirm in BasicPipeline.
pipe = BasicPipeline(
    [_preprocessing_stage, _word_filtering_stage],
    reads_from_gen=False,
)
# Feed input.txt through the pipeline one line at a time and write every
# word the pipeline yields to output.txt, one word per line.
#
# Both files are opened explicitly as UTF-8. Without an explicit encoding,
# open() falls back to the platform/locale default, which would mis-decode
# (or fail to write) the non-ASCII characters this script is meant to handle
# on systems whose default is not UTF-8.
# NOTE(review): assumes input.txt is UTF-8 encoded -- confirm for your data.
with open("input.txt", encoding="utf-8") as in_file, \
        open("output.txt", mode='w', encoding="utf-8") as out_file:
    for line in in_file:
        # pipe.pipe(line) is iterated for the words that survive the pipeline.
        for word in pipe.pipe(line):
            out_file.write(word + "\n")