nlp_demo.py
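"""Small NLP demo: stem and lemmatize documents with NLTK, build bag-of-words
(count or tf-idf) matrices with scikit-learn, and find topics with the lda
package."""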
import numpy as np
import lda
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


def do_lemmatize(docs):
    """Stem words (normalize words within the same part of speech) and
    lemmatize words (normalize words across parts of speech)."""
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
stemmed_docs = []
for di, doc in enumerate(docs):
if di % 100 == 99:
print("\tProcessed doc %d of %d" % (di + 1, len(docs)))
words = [w.strip() for w in doc.split(' ') if w.strip()]
stem_words = [stemmer.stem(w) for w in words]
new_words = [lemmatizer.lemmatize(w) for w in stem_words]
stemmed_docs.append(' '.join(new_words))
return stemmed_docs


def do_vectorize(docs, type="count", min_word_length=3, min_df=1, sentiment_weight=0.0):
if type == "count":
# Count words
cls = CountVectorizer
elif type == "tfidf":
# Count words, normalize by frequency
cls = TfidfVectorizer
else:
raise NotImplementedError()
    vectorizer = cls(
        min_df=min_df, token_pattern=r'(?u)\b[a-zA-Z]{%d,}\b' % min_word_length,
        max_df=1.0,  # keep terms regardless of document frequency
        analyzer='word', stop_words="english",
        encoding='utf-8', decode_error='ignore')
vectorizer.fit(docs)
X = vectorizer.transform(docs)
    # Vocabulary terms ordered by column index, so vocab[i] labels column i of X
    vocab = np.asarray(list(vectorizer.vocabulary_.keys()))
    vocab = vocab[np.argsort(list(vectorizer.vocabulary_.values()))]
    # freqs = np.reshape(np.asarray(X.sum(axis=0)), (X.shape[1],))
    # vocab = vocab[np.argsort(freqs)]  # ordered by counts
if sentiment_weight > 0:
# from nltk.sentiment.vader import SentimentIntensityAnalyzer
# sid = SentimentIntensityAnalyzer()
# sentiments_df = body_df.apply(sid.polarity_scores)
raise NotImplementedError()
return X, vocab, vectorizer


def do_lda(lda_mat, vectorizer, vocab, n_topics=10, n_top_words=10, n_iter=1500,
           model=None, verbose=1, random_state=1, return_model=False):
    """
    Uses LDA to algorithmically find topics in a bag of words.

    Parameters
    ----------
    lda_mat : [document x word] count matrix, as returned by do_vectorize
    vectorizer : the fitted vectorizer that produced lda_mat
    vocab : array of vocabulary terms, aligned with the columns of lda_mat
    n_topics : the number of topics for LDA to find
    n_top_words : when creating topic labels, how many words to keep
    n_iter : number of sampling iterations
    model : optional pre-fit lda.LDA model; if given, fitting is skipped
    verbose : int (optional), logging level
    random_state : seed for a reproducible fit
    return_model : if True, also return the fitted model

    Returns
    -------
    lda_labels : the words that most represent each topic
    lda_output_mat : [document x topic] weight matrix
    lda_cats : the argmax over topics for each document
    lda_mat : the [document x word] count matrix that was fit
    model : the fitted lda.LDA model (only when return_model is True)
    """
n_docs = lda_mat.shape[0]
if n_docs < n_topics and model is None:
raise ValueError("Must have more docs than topics! ({n_docs} < {n_topics})".format(
n_docs=n_docs, n_topics=n_topics))
if model is None:
if verbose > 0:
print("Running LDA for {n_topics} topics for {n_iter} iterations.".format(
n_topics=n_topics, n_iter=n_iter))
model = lda.LDA(n_topics=n_topics, n_iter=n_iter, random_state=random_state)
model.vectorizer = vectorizer
model.vocab = vocab
model.fit(lda_mat) # .astype(np.int64))
lda_labels = []
    t_word = model.topic_word_
    # Order topics by the L2 norm of their word distributions, largest first
    topic_order_idx = np.argsort(np.linalg.norm(t_word, axis=1))[::-1]
    topic_word = t_word[topic_order_idx]
for ti, topic_dist in enumerate(topic_word):
topic_words = vocab[np.argsort(topic_dist)][::-1]
topic_words = topic_words[:n_top_words]
lda_labels.append(' '.join(topic_words))
if verbose > 0:
print('Topic {}: {}'.format(ti + 1, ' '.join(topic_words)))
if verbose > 0:
print("\tBuilding [document x topic weight] output matrix")
lda_cats = np.zeros(n_docs, dtype=int)
lda_output_mat = np.zeros((n_docs, n_topics))
    for x in range(n_docs):
        lda_output_mat[x, :] = model.doc_topic_[x][topic_order_idx]
        lda_cats[x] = np.argmax(lda_output_mat[x, :])
if return_model:
return lda_labels, lda_output_mat, lda_cats, lda_mat, model
else:
return lda_labels, lda_output_mat, lda_cats, lda_mat


if __name__ == '__main__':
article_text = """
Police have said they are considering manslaughter charges in relation to the deadly Grenfell Tower blaze as they revealed that both the insulation and tiles at the building failed safety tests.
Det Supt Fiona McCormack, who is overseeing the investigation, said on Friday that officers had established that the initial cause of the fire was a fridge-freezer and that it was not started deliberately.
She said they were trying to get to the bottom of why the fire started so quickly. Insulation
"""
docs = [article_text] * 20
l_docs = do_lemmatize(docs)
vc_docs = do_vectorize(l_docs, type="count")
vt_docs = do_vectorize(l_docs, type="tfidf")
# print lda_categories(docs)
print "Original text: ", article_text
print "Stemmed & counted text: ", vc_docs[0].todense(), vc_docs[1]
print "Stemmed & counted (tfidf) text: ", vt_docs[0].todense(), vt_docs[1]