Skip to content

Commit 23e5984

Browse files
committed
initial state
0 parents  commit 23e5984

File tree

3 files changed

+71
-0
lines changed

3 files changed

+71
-0
lines changed

classify.py

+22
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
"""Cluster the word vectors of a saved Word2Vec model and print one cluster.

The script fits k-means to the model's embedding matrix, orders the clusters
by how many words they contain, and prints every word that belongs to the
cluster at position ``CLUSTER_RANK`` of that ordering.
"""
from collections import Counter

from gensim.models import word2vec
from sklearn.cluster import KMeans

# Run from the project directory: cd PycharmProjects/play_gensim/

# Location of the saved model to inspect.
MODEL_PATH = u'C:\\Users\\michar\\PycharmProjects\\play_gensim\\w2v_model_size_100_window_5.model'
# Number of k-means clusters to fit.
N_CLUSTERS = 50
# Position (0 = fewest members) of the cluster to print once the clusters are
# ordered by size.
CLUSTER_RANK = 2

w2v = word2vec.Word2Vec.load(MODEL_PATH)

kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=0).fit(w2v.syn0)

# Count the members of each cluster exactly once.  The previous version ran
# the counting loop twice, which doubled every count (the ordering below was
# unaffected, but the stored counts were wrong).
labels_count = Counter(kmeans.labels_)

# Cluster labels ordered from fewest to most members.
sorted_labels = sorted(labels_count, key=labels_count.get)

# NOTE: despite its name this is not the smallest cluster; it is the one at
# CLUSTER_RANK in the size ordering (the third smallest for CLUSTER_RANK = 2).
minimal_label = sorted_labels[CLUSTER_RANK]
for i, l in enumerate(kmeans.labels_):
    if l == minimal_label:
        # Row i of the clustered matrix is looked up in the model's word list.
        print(w2v.index2word[i])

main.py

+29
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
2+
from gensim.models import word2vec
3+
import time
4+
5+
6+
def convert_corpora_to_sentence_iter():
    """Return a gensim ``Text8Corpus`` opened on the relative path ``text8/text8``."""
    return word2vec.Text8Corpus('text8/text8')
9+
10+
11+
def train_and_save_model(sentences, fname, **kwargs):
    """Build a Word2Vec model from *sentences*, save it to *fname*, and return it.

    Any extra keyword arguments are passed straight through to the
    ``word2vec.Word2Vec`` constructor.
    """
    trained = word2vec.Word2Vec(sentences, **kwargs)
    trained.save(fname)
    return trained
15+
16+
def get_model_name(**kw):
    """Return the model file name for the given ``model_size`` and ``window``.

    Both values must be supplied as keyword arguments; they are substituted
    into the file-name template.
    """
    template = u"w2v_model_size_{model_size}_window_{window}.model"
    return template.format(**kw)
18+
19+
# Shows whether the module is being run as a script or imported.
print(__name__)
if __name__ == '__main__':
    # Train and save one model per (vector size, window) combination,
    # reporting how long each training run took.
    sentences_iter = convert_corpora_to_sentence_iter()
    for model_size in range(10, 200, 10):
        for window in (5, 7, 10):
            # time.time() measures elapsed wall-clock seconds on every
            # platform.  time.clock() reported CPU time on Unix and was
            # removed in Python 3.8.
            t0 = time.time()
            print("Training with size={}, window={}".format(model_size, window))
            fname = get_model_name(model_size=model_size, window=window)
            # The model is saved to fname by the helper; the returned object
            # is not needed here.
            train_and_save_model(sentences_iter, fname, size=model_size, window=window)
            t1 = time.time()
            print("Training took {} secs".format(t1 - t0))

meassure_convergence.py

+20
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
from main import get_model_name
2+
from gensim.models import word2vec
3+
from sklearn.cluster import KMeans
4+
from sklearn.metrics import homogeneity_completeness_v_measure as hom_v_score
5+
6+
# Number of k-means clusters fitted to every model's vectors.
NCLUSTERS = 50


def _cluster_labels(model_name):
    """Load the saved model *model_name* and return its k-means cluster labels."""
    path = u'C:\\Users\\michar\\PycharmProjects\\play_gensim\\' + model_name
    vectors = word2vec.Word2Vec.load(path).syn0
    return KMeans(n_clusters=NCLUSTERS, random_state=0).fit(vectors).labels_


# Maps each window size to a list of (model_size, score) pairs.
results = {}
for window_size in (5, 7, 10):
    results[window_size] = []
    # The size-190 model of each window serves as the reference clustering.
    reference_labels = _cluster_labels(
        get_model_name(model_size=190, window=window_size))
    for model_size in range(10, 200, 10):
        candidate_labels = _cluster_labels(
            get_model_name(model_size=model_size, window=window_size))

        # Keep the third value returned by homogeneity_completeness_v_measure
        # (the v-measure, going by the function's name).
        score = hom_v_score(candidate_labels, reference_labels)[2]
        print("window {}, size {}, score {}".format(window_size, model_size, score))
        results[window_size].append((model_size, score))

0 commit comments

Comments
 (0)