relevance_ranking.py
import os
from collections import Counter, defaultdict
from itertools import chain

from utils import tokenize, save_obj, load_obj
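# Probabilistic relevance ranking of thesaurus terms for the NLM_500 corpus:
# for each thesaurus term we estimate, per vocabulary word, the probability of
# appearing in documents tagged with that term (p) and in untagged documents
# (q), then rank candidate tags for unseen documents by the resulting score.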
# import some stopwords
with open('stopwords.txt') as f:
    stopwords = [s.rstrip() for s in f]
# import the documents and their annotations
documents = []
annotations = []
for f in sorted(os.listdir('./data/NLM_500/documents/')):
    filename = './data/NLM_500/documents/' + f
    if filename.endswith('.txt'):
        with open(filename, encoding='ISO-8859-1') as doc_file:
            documents.append(doc_file.read())
    elif filename.endswith('.key'):
        with open(filename, encoding='ISO-8859-1') as key_file:
            annotations.append([a.rstrip().lower() for a in key_file])
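# NOTE: sorting the directory listing keeps each <name>.key next to its
# <name>.txt, so documents[i] and annotations[i] describe the same article
# (assuming every .txt file has a matching .key file).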
# print(documents[0])
# print()
# print(annotations[0])
# tokenize documents
print('[ + ] Tokenizing documents')
documents = [tokenize(d, stopwords) for d in documents]
documents = [list(set(d)) for d in documents]
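# Each document is reduced to its set of distinct terms, so the counts
# computed later are document frequencies (term presence/absence).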
# Output some info about the data
vocab = []
thesaurus = []
for doc in documents:
    vocab += list(set(doc))
for tw in annotations:
    thesaurus += tw
vocab = sorted(set(vocab))
thesaurus = sorted(set(thesaurus))
intersection = sorted(set(thesaurus).intersection(vocab))
print('Vocab size:', len(vocab))
print('Thesaurus size:', len(thesaurus))
print('Intersection size:', len(intersection))
# Count the tags that annotate at least min_doc documents
nb_tags = 0
min_doc = 5
tag_dist = Counter(chain.from_iterable(annotations))
for count in tag_dist.values():
    if count >= min_doc:
        nb_tags += 1
print('[ ! ] Tags annotating at least {} documents: {}'.format(min_doc, nb_tags))
# Init training
# TODO: cross validation
# Get training set
documents_train = documents[:375]
annotations_train = annotations[:375]
# Get test set
documents_test = documents[375:]
annotations_test = annotations[375:]
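# NOTE: fixed split, the first 375 documents for training and the rest for
# testing; cross-validation is still a TODO (see above).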
def word_occurence(documents, vocab):
    # Init a zero vector of size vocab
    word_count_idx = {w: 0 for w in vocab}
    for doc in documents:
        # Update counts over the whole collection
        for word in doc:
            word_count_idx[word] += 1
    return word_count_idx
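# compute_mle_vector() turns the raw document counts into smoothed estimates:
# adding 0.5 to each count and 1 to N keeps every probability strictly between
# 0 and 1, so terms absent from one class cannot zero out the score products.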
def compute_mle_vector(word_occurence, N):
    mle_vector = dict()
    for w in word_occurence:
        mle_vector[w] = (word_occurence[w] + 0.5) / (N + 1)
    return mle_vector
def compute_relevance_matrices(documents, annotations, thesaurus, vocab):
    for th in thesaurus:
        with open('last_th.txt', 'w') as f:
            f.write(th)
        # helps separate relevant docs from non-relevant ones
        corpus = defaultdict(list)
        # mark relevance for all documents
        for doc, tags in zip(documents, annotations):
            if th in tags:
                corpus['relevant'].append(doc)
            else:
                corpus['nonrelevant'].append(doc)
        # Word occurrences in relevant and non-relevant documents
        rel_count_vec = word_occurence(corpus['relevant'], vocab)
        non_count_vec = word_occurence(corpus['nonrelevant'], vocab)
        # Number of relevant and non-relevant documents
        N_rel = len(corpus['relevant'])
        N_non = len(corpus['nonrelevant'])
        # Compute smoothed maximum-likelihood estimates
        p_prob = compute_mle_vector(rel_count_vec, N_rel)
        q_prob = compute_mle_vector(non_count_vec, N_non)
        # save probabilities on disk
        save_obj(obj=(p_prob, q_prob), name=th)
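# score() multiplies, over the terms of a new document, the ratio
# p_t * (1 - q_t) / (q_t * (1 - p_t)), where p_t and q_t are the smoothed
# probabilities estimated for the relevant and non-relevant classes; a larger
# product means the document looks more like the documents tagged with the term.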
def score(p_vec, q_vec, new_doc):
    '''
    Score how likely new_doc is to be relevant, given the per-term
    probabilities p_vec (relevant class) and q_vec (non-relevant class).
    '''
    num_prod = 1
    denum_prod = 1
    score = 0
    for t in new_doc:
        if t not in p_vec:
            # skip words unseen at training time
            continue
        num_prod *= p_vec[t] * (1 - q_vec[t])
        denum_prod *= q_vec[t] * (1 - p_vec[t])
    try:
        score = num_prod / denum_prod
    except ZeroDivisionError:
        # denum_prod can underflow to 0.0 on long documents
        pass
    return score
def predict_tags(thesaurus, new_doc, n_best=10):
    scores = dict()
    for th in thesaurus:
        p_vec, q_vec = load_obj(th)
        scores[th] = score(p_vec, q_vec, new_doc)
    # highest scores first
    scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    return scores[:n_best]
def test():
    # define some toy data
    docs = []
    doc1 = ['i', 'love', 'paris']
    doc2 = ['i', 'love', 'cats']
    doc3 = ['am', 'allergic', 'cats']
    docs.append(doc1)
    docs.append(doc2)
    docs.append(doc3)
    voc = doc1 + doc2 + doc3
    voc = list(set(voc))
    print('vocab:', voc)
    # Here thesaurus and annotations are the same
    thes = ['animals', 'city', 'health']
    anno = [['city'], ['animals'], ['health']]
    # check relevance matrix
    compute_relevance_matrices(docs, anno, thes, voc)
    for t in thes:
        print('Thesaurus word:', t)
        t_p, t_q = load_obj(t)
        for w in voc:
            print('word:', w, 't_p:', t_p[w], 't_q:', t_q[w])
        print()
    doc4 = ['cats', 'love', 'am']
    print(doc4)
    doc4_scores = predict_tags(thes, doc4)
    for th_s in doc4_scores:
        print('thesaurus:', th_s[0], 'relevance:', th_s[1])
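# Driver: run the toy sanity check first; training on the NLM_500 split is
# currently commented out below, so scoring relies on probability files
# presumably saved on disk by an earlier run.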
print('\n========== TEST ============\n')
test()
print('\n========== TRAIN ============\n')
# compute_relevance_matrices(documents_train, annotations_train,
# thesaurus, vocab)
print('[ + ] finished training')
print('\n========== SCORE ============\n')
print('[...] Computing document relevance')
results = []
for doc, tags in zip(documents_test, annotations_test):
    predicted = predict_tags(thesaurus, doc, n_best=len(tags))
    results.append((predicted, tags))
print('[...] Computing model accuracy')
accuracy = 0
for predicted, tags in results:
    # predicted is a list of (tag, score) pairs; compare tag names only
    predicted_tags = set(th for th, s in predicted)
    accuracy += len(predicted_tags.intersection(tags)) / len(tags)
accuracy = 100 * accuracy / len(results)
print('[ + ] Model accuracy: {} %'.format(accuracy))