"""prepare.py

Text-preparation helpers for NLP work: normalization, stemming,
lemmatization, tokenization, and stopword removal built on NLTK.
"""
import re
import unicodedata

import nltk
from nltk.corpus import stopwords
from nltk.tokenize.toktok import ToktokTokenizer
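
# NOTE: stopword removal and lemmatization rely on NLTK corpora; if they
# are missing, download them once with:
#   nltk.download('stopwords')
#   nltk.download('wordnet')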


def normalize(string):
    """Lowercase, ASCII-fold, and scrub a string down to plain words."""
    # lowercase the string
    string = string.lower()
    # replace newlines with a space
    string = re.sub(r'\n', ' ', string)
    # normalize unicode characters to their closest ASCII equivalents
    string = unicodedata.normalize('NFKD', string).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    # replace . and / with a space so URLs and paths split into words
    string = re.sub(r'\.', ' ', string)
    string = re.sub(r'/', ' ', string)
    # rewrite language names whose symbols would otherwise be stripped;
    # 'c++' must be handled before 'c+'
    string = re.sub(r'c\+\+', 'cplusplus', string)
    string = re.sub(r'c\+', 'cplus ', string)
    string = re.sub(r'c#', 'csharp ', string)
    string = re.sub(r'f#', 'fsharp ', string)
    # replace brackets, braces, and parentheses with a space
    string = re.sub(r'[\[\](){}]', ' ', string)
    # remove remaining special characters and digits
    string = re.sub(r'[^a-z\s]', '', string)
    # collapse runs of whitespace last, so the removals above
    # cannot leave stray double spaces behind
    string = re.sub(r'\s+', ' ', string).strip()
    return string
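
# Illustrative example of the full scrub:
#   normalize("Day 1: C++ / F# solution!\n") -> 'day cplusplus fsharp solution'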


def stem(string):
    """Return the string with every word reduced to its Porter stem."""
    ps = nltk.stem.PorterStemmer()
    stems = [ps.stem(word) for word in string.split()]
    return ' '.join(stems)


def lemmatize(string):
    """Return the string with every word reduced to its WordNet lemma."""
    wnl = nltk.stem.WordNetLemmatizer()
    lemmas = [wnl.lemmatize(word) for word in string.split()]
    return ' '.join(lemmas)


def tokenize(string):
    """Tokenize the string and rejoin the tokens into one space-separated string."""
    tokenizer = ToktokTokenizer()
    return tokenizer.tokenize(string, return_str=True)


def remove_stopwords(tokenized_string,
                     extra_words=('advent', 'of', 'code', 'aoc', 'day', 'solution',
                                  'httpsgithub', 'githubhttpsimg', 'hpphttpsgithub',
                                  'httpsadventofcode', 'adventofcode',
                                  'commstksgadventofcodeblobmasterreflections',
                                  'httpadventofcode', 'com', 'github', 'http', 'https',
                                  'adventofcodehttps', 'master', 'blob', 'username',
                                  'reponame', 'svg', 'src'),
                     exclude_words=()):
    """Drop English stopwords plus project-specific noise words from a tokenized string."""
    words = tokenized_string.split()
    stopword_list = set(stopwords.words('english'))
    # drop the excluded words from the stopword list so they survive filtering
    stopword_list -= set(exclude_words)
    # add in the caller-specified extra words
    stopword_list |= set(extra_words)
    filtered_words = [w for w in words if w not in stopword_list]
    return ' '.join(filtered_words)
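

if __name__ == '__main__':
    # Minimal usage sketch (not part of the original module): run the full
    # pipeline on a sample string; the sample text here is illustrative.
    raw = "Day 1: my C++ and F# Advent of Code solutions!\n"
    cleaned = remove_stopwords(tokenize(normalize(raw)))
    print(stem(cleaned))       # stemmed variant
    print(lemmatize(cleaned))  # lemmatized variant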