# uk_og_11.py (forked from rujunhan/ConditionalEmbeddings)
import os

import gensim.models
import numpy as np
import pandas as pd
import seaborn as sns
from pathlib import Path
from sklearn.metrics.pairwise import cosine_similarity
from model_to_vectors import load_model
import matplotlib
import matplotlib.pyplot as plt
from tqdm import tqdm
from bias_utils import *

matplotlib.use('pdf')
#os.chdir('..')
# Load HistWords (HW) COHA vectors
histwords_dir = '../Replication-Garg-2018/data/coha-word'
histwords = load_coha_HistWords(input_dir=histwords_dir, only_nonzero=True)
# Load UK_OG_11
bbb_vecs = load_BBB_nonzero(
    input_dir=os.path.join(Path(__file__).parent, 'data/UK/results'), file_stamp='UK',
    run_id='UK_OG_11', only_nonzero=False, match_vectors=None)
# BBB global embeddings
model = load_model(
    "data/UK/results/model_best_UK_UK_OG_11.pth.tar",
    "data/UK/processed/vocabUK_freq.npy")
bbb_global_emb = model.word_input_embeddings
# Dump the global embeddings to a temporary word2vec-format text file so gensim can load them
with open("TEMPORARYFILE.txt", "w") as f:
    for word, emb in bbb_global_emb.items():
        f.write(f"{word} {' '.join(map(str, emb))}\n")
global_vecs = gensim.models.KeyedVectors.load_word2vec_format("TEMPORARYFILE.txt", binary=False, no_header=True)
os.remove("TEMPORARYFILE.txt")
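# Alternative (a minimal sketch, assuming gensim 4.x): build the KeyedVectors directly
# from the embedding dict and skip the temporary-file round trip.
#   dim = len(next(iter(bbb_global_emb.values())))
#   global_vecs = gensim.models.KeyedVectors(vector_size=dim)
#   global_vecs.add_vectors(list(bbb_global_emb.keys()),
#                           np.array(list(bbb_global_emb.values())))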
""""
# Explore cosine similarities
cs_df = pd.DataFrame()
for decade, model in histwords.items():
    w = model.vectors
    w = pd.DataFrame(w)
    cs = cosine_similarity(w)
    cs_df = pd.concat([cs_df, pd.DataFrame.from_dict(
        {'vectors': ['HW'], 'decade': [int(decade)], 'median_cs': [np.median(cs)]})])
for decade, model in bbb_vecs.items():
    w = model.vectors
    w = pd.DataFrame(w)
    cs = cosine_similarity(w)
    cs_df = pd.concat([cs_df, pd.DataFrame.from_dict(
        {'vectors': ['BBB-Decadal'], 'decade': [int(decade)], 'median_cs': [np.median(cs)]})])
emb_df = pd.DataFrame()
for w, emb in bbb_global_emb.items():
    w_df = pd.DataFrame(emb.reshape(1, -1))
    emb_df = pd.concat([emb_df, w_df])
cs = cosine_similarity(emb_df)
med_cs = np.median(cs)
for decade in bbb_vecs.keys():
    cs_df = pd.concat([cs_df, pd.DataFrame.from_dict(
        {'vectors': ['BBB-Global'], 'decade': [int(decade)], 'median_cs': [med_cs]})])
ax = sns.lineplot(data=cs_df, x='decade', y='median_cs', hue='vectors')
ax.set(xlabel='Decade', ylabel='Median cosine similarity between words')
"""
# Cosine similarities: specific decade (understand distribution)
cs_decade = '2000'
embeddings = {
    'HistWords': histwords[cs_decade].vectors,
    'BBB-Decadal': bbb_vecs[cs_decade].vectors,
    'BBB-Global': global_vecs.vectors
}
def plot_cs(embeddings):
    cs_df = pd.DataFrame()
    for name, embedding in tqdm(embeddings.items()):
        decade_cs = cosine_similarity(pd.DataFrame(embedding))
        # Mask the upper triangle (including the diagonal) so each word pair is counted once
        mask = np.zeros(decade_cs.shape, dtype='bool')
        mask[np.triu_indices(len(decade_cs))] = True
        decade_cs = pd.DataFrame(decade_cs).mask(mask, None)
        decade_cs = pd.melt(decade_cs)
        decade_cs = decade_cs.loc[~decade_cs['value'].isna()]
        decade_cs['model'] = name
        # Randomly sample 5% of values so memory doesn't crash
        decade_cs = decade_cs.sample(frac=0.05, replace=False)
        cs_df = pd.concat([cs_df, decade_cs])
    g = sns.FacetGrid(cs_df, row='model', sharex=True, aspect=3)
    g.map_dataframe(sns.kdeplot, x='value')
    g.set_titles(row_template="{row_name}")
    g.fig.suptitle('')
    # Derive the output filename from the decade being plotted
    g.figure.savefig(os.path.join('results/embeddings', f'cosine_sim{cs_decade}s.png'), dpi=800)

plot_cs(embeddings)
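# For a quick numeric summary to go with the KDE plot, one could also print the median
# pairwise similarity per model (a sketch; the strictly lower triangle counts each pair once):
#   for name, embedding in embeddings.items():
#       cs = cosine_similarity(embedding)
#       print(name, np.median(cs[np.tril_indices(len(cs), k=-1)]))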
"""
# Measure performance of the global vectors
eval_dir = Path(__file__).parent / "data" / "COHA" / "evaluation"
eval_score = pd.DataFrame()
score, sections = global_vecs.evaluate_word_analogies(str(eval_dir / 'questions-words.txt'))
for section_dict in sections:
    if len(section_dict['correct']) + len(section_dict['incorrect']) == 0:
        accuracy = None
    else:
        accuracy = len(section_dict['correct']) / (len(section_dict['correct']) + len(section_dict['incorrect']))
    eval_score = pd.concat([eval_score, pd.DataFrame.from_dict(
        {'task': ['analogy'], 'section': [section_dict['section']], 'accuracy': [accuracy],
         'vectors': ['BBB-Global']})])
# Word similarity (Bruni et al. 2012 -- used in HistWords)
pearson, spearman, oov = global_vecs.evaluate_word_pairs(str(eval_dir / 'MEN_dataset_natural_form_full.txt'))
eval_score = pd.concat(
    [eval_score, pd.DataFrame.from_dict(
        {'task': ['Bruni'], 'section': ['pearson_stat'], 'accuracy': [pearson.statistic],
         'vectors': ['BBB-Global']})])
# Visualize HW
nonzero_df = pd.DataFrame()
for decade_str in histwords.keys():
    w = histwords[decade_str].vectors
    # nonzero = np.abs(w) > 1e-6
    nonzero = w
    nonzero = pd.DataFrame(nonzero)
    nonzero['decade'] = int(decade_str)
    nonzero['word'] = histwords[decade_str].key_to_index.keys()
    nonzero_df = pd.concat([nonzero_df, nonzero])
embed_df = nonzero_df.copy()
nonzero_df = pd.melt(nonzero_df, id_vars=['word', 'decade'], var_name='dim', value_name='element')
"""
"""
def facet_heatmap(data, color, **kws):
    data = data.pivot_table(values='element', index='word', columns='dim')
    sns.heatmap(data, cbar=True)

g = sns.FacetGrid(nonzero_df, col='decade', col_wrap=5)
g.map_dataframe(facet_heatmap)
g.set_titles(row_template="{row_name}", col_template='{col_name}')
g.fig.suptitle('')
g.figure.savefig("test.png", dpi=800)
"""