#!/usr/bin/env python
## Accessory functions for NLP
import pickle
import pprint
import re
import string

import nltk
import pandas as pd
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

### Set these according to your setup ###
nltk_path = 'data/'
google_vec_file = nltk_path + 'GoogleNews-vectors-negative300.bin.gz'

### Download the NLTK corpora used below ###
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

def get_wordnet_pos(treebank_tag):
    """Convert the part-of-speech naming scheme
    from the nltk default to that which is
    recognized by the WordNet lemmatizer."""
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def preprocess_series_text(data):
    """Perform complete preprocessing on a pandas Series,
    including removal of tokens containing digits, normalization
    (lowercasing), punctuation removal, tokenization, stop word
    removal, and lemmatization."""
    # setup nltk path (disabled)
    # if nltk_path:
    #     nltk.data.path.insert(0, nltk_path)
    # remove words containing digits and make lowercase
    alphanum_re = re.compile(r"""\w*\d\w*""")
    alphanum_lambda = lambda x: alphanum_re.sub('', x.strip().lower())
    data = data.map(alphanum_lambda)
    # remove punctuation
    punc_re = re.compile('[%s]' % re.escape(string.punctuation))
    punc_lambda = lambda x: punc_re.sub(' ', x)
    data = data.map(punc_lambda)
    # tokenize words
    data = data.map(word_tokenize)
    # remove stop words
    sw = stopwords.words('english')
    sw_lambda = lambda x: list(filter(lambda y: y not in sw, x))
    data = data.map(sw_lambda)
    # part-of-speech tagging--must convert tags to the format used by the lemmatizer
    data = data.map(nltk.pos_tag)
    pos_lambda = lambda x: [(y[0], get_wordnet_pos(y[1])) for y in x]
    data = data.map(pos_lambda)
    # lemmatization
    lemmatizer = WordNetLemmatizer()
    lem_lambda = lambda x: [lemmatizer.lemmatize(*y) for y in x]
    data = data.map(lem_lambda)
    return data.map(' '.join)

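# Example (sketch): on a toy Series such as
#     pd.Series(["The 2 cats are running fast!"])
# the pipeline above yields roughly "cat run fast" -- digits, punctuation, and
# stop words are stripped, and the remaining tokens are lemmatized according
# to their part of speech. The sample sentence is invented for illustration
# and is not part of any project dataset.
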
def read_pickle_file(fileName):
    """
    @fileName: path to a pickle file saved on local disk
    @return: the deserialized object
    """
    with open(fileName, 'rb') as pkl_file:
        data = pickle.load(pkl_file)
    return data

def get_score(lda_model, doc2vec):
    """
    @lda_model: trained LDA/LSA topic model
    @doc2vec: vector representation of a document accepted by the model
    """
    for index, score in sorted(lda_model[doc2vec], key=lambda tup: -1 * tup[1]):
        print("\nScore: {}\nTopic: {}\nWords: {}".format(score, index, lda_model.print_topic(index, 10)))

def write_pickle_file(data, fileName):
    """
    @data: object to serialize
    @fileName: pickle file name to write to
    """
    with open(fileName, 'wb') as output:
        pickle.dump(data, output)
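
# Minimal smoke test (a sketch; 'example.pkl' is an arbitrary file name chosen
# for illustration and is not referenced elsewhere in the project). It
# round-trips a small object through the pickle helpers when the module is
# run directly rather than imported.
if __name__ == '__main__':
    write_pickle_file({'answer': 42}, 'example.pkl')
    print(read_pickle_file('example.pkl'))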