"""
Knowledge index maker
@author Tao PR (github.com/starcolon)
"""
import numpy as np
import os
import sys
import argparse
import word2vec
from termcolor import colored
from collections import Counter
from pylib.knowledge.graph import Knowledge
from pylib.knowledge.datasource import MineDB
from nltk.tokenize.punkt import PunktSentenceTokenizer

arguments = argparse.ArgumentParser()
arguments.add_argument('--verbose', dest='verbose', action='store_true', help='Turn verbose output on.')
arguments.add_argument('--limit', type=int, default=100, help='Maximum number of topics to index.')
arguments.add_argument('--root', type=str, default=None, help='Supply the OrientDB password for the root account.')
arguments.add_argument('--modelpath', type=str, default='./models/word2vec.bin', help='Path to the word2vec binary model.')
args = vars(arguments.parse_args(sys.argv[1:]))
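
# Example invocation (a sketch -- assumes a local OrientDB instance and a trained
# word2vec binary at ./models/word2vec.bin; substitute your own root password):
#   python build_index.py --root <orientdb-root-password> --limit 200 --verbose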


def collect_wordbag(kb, model):
    """Yield a weighted bag of words for each topic in the knowledge base."""
    print(colored('Iterating through topics...', 'cyan'))
    n = 0
    for topic in kb:
        n += 1
        kws = list(kb.keywords_in_topic(topic.title, with_edge_count=True))

        # Frequency of [w] in the current topic
        cnt = Counter([kw.w for kw in kws])

        # Normalise with global frequency
        for word in kws:
            cnt[word.w] /= word.freq

        # Normalise the topic counter to unit length
        norm = np.linalg.norm(list(cnt.values()))
        cnt0 = {k: v / norm for k, v in cnt.items()}

        # Expand the bag with similar words suggested by word2vec
        cnt = {}
        for word, freq in cnt0.items():
            cnt[word] = freq
            try:
                indexes, metrics = model.cosine(word)
                synonyms = model.generate_response(indexes, metrics).tolist()
                for syn, confidence in synonyms:
                    if confidence < 0.85:
                        break
                    cnt[syn] = confidence * freq
            except:
                # The word is not in the word2vec vocabulary; keep its own weight only.
                pass

        yield (n, topic, cnt)
        if n >= args['limit']:
            break


def load_word2vec_model(path):
    """Load a pre-trained word2vec binary model from disk."""
    if not os.path.isfile(path):
        print(colored('[ERROR] word2vec model does not exist.', 'red'))
        raise RuntimeError('Model does not exist')
    print(colored('[Model] loading binary model.', 'cyan'))
    return word2vec.WordVectors.from_binary(path, encoding='ISO-8859-1')


def add_to_index(index, bag):
    print('------------------------------------')
    n, topic, cnt = bag
    print('...Constructing : {}'.format(colored(topic.title, 'magenta')))
    print('...#{} {}'.format(n, cnt))
    words, weights = zip(*cnt.items())
    index.add(topic.title, words, weights, verbose=False)


if __name__ == '__main__':
    # Load the word2vec model
    model_path = os.path.realpath(args['modelpath'])
    model = load_word2vec_model(model_path)

    # Initialise the knowledge graph database
    print(colored('Initialising knowledge graph database...', 'cyan'))
    kb = Knowledge('localhost', 'vor', 'root', args['root'])

    # Collect topic wordbags
    wb = collect_wordbag(kb, model)

    # Create the knowledge index
    index = Knowledge('localhost', 'vorindex', 'root', args['root'])
    index.clear()
    for bag in wb:
        add_to_index(index, bag)

    print(colored('[DONE] all processes ended', 'green'))