"""
Algorithm that generates pre-segmentations on a target corpus. See the write-up for more detail on
how this process works.
"""
import json
import pickle
from collections import defaultdict
from bpe import get_vocabulary, get_vocabulary_freq_table
import networkx as nx
import sys
import os
import numpy as np
sys.path.append("./evaluation/")
from evaluate_seg import get_gs_data
import pdb
import time
def process_json(json_contents, vocab, word_vectors, num_examples=float("inf")):
"""
Extends a list of example morphological transformations to the given corpus/vocab. Does this
via a set of vector embeddings, which need to be trained before-hand.
Arguments:
json_contents -- json object containing the example transformations (format currently defined by the
json output from John's Soricut and Och implementation)
vocab -- Dict of (word -> freq). Extracted from target corpus.
word_vectors -- Dict of word -> embeddings. Trained on target corpus.
num_examples -- for some input rule from json file, max number of example direction vectors to store (for speed).
Returns:
morph_transforms -- This is a weirdly formatted dictionary (to speed up later code).
General structure: Rule Kind -> From String -> To String -> Example Direction Vectors.
"""
    #Build a three-level dictionary: rule kind -> from string -> to string -> list of example direction vectors.
transforms_dict = defaultdict( lambda: defaultdict( lambda: defaultdict(lambda: []) ) )
for change in json_contents:
        #Skip rules that have no example transformations.
if "transformations" not in change:
continue
from_str = change["rule"]["from"]
to_str = change["rule"]["to"]
rule_kind = change["rule"]["kind"]
        #Skip same-length rules to prevent infinite loops while propagating; we only want "reductions".
if len(from_str) == len(to_str):
continue
rule_inverted = len(from_str) < len(to_str)
if rule_inverted:
from_str, to_str = to_str, from_str
from_str, to_str = get_drop_string(from_str, to_str, rule_kind)
transformations = sorted(change["transformations"], key=lambda x: float(x["cosine"]), reverse=True)
d_vectors = []
for trans in transformations:
            #OPTIMIZATION: Only keep a small number of example direction vectors for each rule.
if len(d_vectors) == num_examples:
break
if rule_inverted:
longer_word = trans["rule"]["output"]
shorter_word = trans["rule"]["input"]
else:
longer_word = trans["rule"]["input"]
shorter_word = trans["rule"]["output"]
if longer_word in vocab and shorter_word in vocab:
direction_vector = word_vectors[shorter_word] - word_vectors[longer_word]
d_vectors.append(direction_vector)
transforms_dict[rule_kind][from_str][to_str].extend(d_vectors)
#Build a final list of all transformations.
morph_transforms = []
for rule_kind in transforms_dict:
for from_str in transforms_dict[rule_kind]:
for to_str in transforms_dict[rule_kind][from_str]:
d_vectors = tuple(transforms_dict[rule_kind][from_str][to_str])
morph_transforms.append((rule_kind, from_str, to_str, d_vectors))
#Heuristic: Try transforms that lead to shorter words first.
morph_transforms = sorted(morph_transforms, key=lambda x: len(x[2]))
return morph_transforms
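# Illustrative sketch of the input and output shapes (field names are the keys this function
# reads; the concrete rule and words are made up for illustration):
#   json_contents = [
#       {"rule": {"from": "", "to": "s", "kind": "s"},
#        "transformations": [
#            {"rule": {"input": "cat", "output": "cats"}, "cosine": "0.83"},
#        ]},
#   ]
# would yield a single generalized transform
#   ("s", "s", "", (vec("cat") - vec("cats"),))
# i.e. "dropping a trailing 's' maps a longer word to a shorter word in the vocabulary".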
def get_drop_string(from_str, to_str, rule_kind):
"""
Gets drop strings (from string and to string) using the old and new words.
Arguments:
from_str -- Longer word in transformation. Will be whittled down.
to_str -- Shorter word in transformation. Will be whittled down.
rule_kind -- indicates prefix or suffix.
Returns:
from_str -- Smallest substring of longer word that, when removed from longer word, makes
the remaining string a substring of the shorter word.
to_str -- Smallest substring of shorter word that, when removed from shorter word, makes
the remaining string a substring of the longer word.
"""
if rule_kind == "s":
for char in to_str[:]:
if from_str[0] == char:
from_str = from_str[1:]
to_str = to_str[1:]
else:
break
else:
for char in to_str[::-1]:
if from_str[-1] == char:
from_str = from_str[:-1]
to_str = to_str[:-1]
else:
break
return from_str, to_str
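# Worked example (illustrative): for a suffix rule whose strings are "making" -> "make",
# the shared prefix "mak" is stripped from both, leaving from_str = "ing" and to_str = "e",
# i.e. the transformation "drop 'ing', append 'e'".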
def compute_preseg(vocabulary, word_vectors, morph_transforms, test_set=None, threshold=0.5):
"""
Heart and soul of pre-segmentation algorithm.
Arguments:
vocabulary -- Dict of (word -> freq). Extracted from target corpus.
word_vectors -- Dict of word -> embeddings. Trained on target corpus.
morph_transforms -- generalized morphological transformations (from process_json)
test_set -- If present, a smaller set of words to pre-segment.
threshold -- minimum cosine similarity needed to trigger a pre-segmentation.
Returns:
None. Writes a pickle of the generated pre-segmentations to the save directory give in args,
as well as a list of pre-segmentations generated.
"""
propogation_graph = nx.DiGraph()
vocab = list(vocabulary.keys())
#Go from longer words to shorter ones since we apply "drop" rules
vocab = sorted(vocab, key = lambda x: len(x), reverse=True)
presegs = {}
if test_set is None:
target_words = vocab
else:
target_words = sorted(test_set, key = lambda x: len(x), reverse=True)
i = 0
words_to_do = len(target_words)
start = time.time()
while target_words:
# if (i % 100 == 0):
# print("Processing " + str(i) + "/" + str(words_to_do))
if (i % 500 == 0):
print("")
print("Processing " + str(i) + "/" + str(words_to_do))
time_elapsed = time.time() - start
print("Time passed: " + str(time_elapsed))
print("")
#Don't change something while you iterate over it!
word = target_words[0]
target_words = target_words[1:]
presegs[word] = [word]
possible_change = test_transforms(word, morph_transforms, vocab, word_vectors, threshold)
if possible_change:
change_kind, drop_str, parent_word = possible_change
if change_kind == "s":
if spell_change_mode:
presegs[word] = [parent_word, drop_str]
else:
presegs[word] = [word[:-len(drop_str)], drop_str]
propogation_graph.add_edge(parent_word, word, link= [0])
else:
if spell_change_mode:
presegs[word] = [drop_str, parent_word]
else:
presegs[word] = [drop_str, word[len(drop_str):]]
propogation_graph.add_edge(parent_word, word, link = [1])
print(presegs[word])
            #Core of the propagation algorithm!
if use_propogation:
propogate_to_children(propogation_graph, presegs, word, 0, drop_str, change_kind)
i += 1
    to_write = sorted(presegs.keys())
    segs_output_obj = open(os.path.join(save_dir, "pre_segs.txt"), "w+")
for word in to_write:
final_seg = presegs[word]
delimited_seg = " ".join(final_seg)
segs_output_obj.write(word + ": " + delimited_seg)
segs_output_obj.write('\n')
segs_output_obj.close()
with open(os.path.join(save_dir, "presegs_ckpt.txt"), "wb+") as checkpoint_file:
pickle.dump(presegs, checkpoint_file)
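# Illustrative output (assuming a transform "drop 'ing', append 'e'" fires on "giving" with
# the parent word "give" present in the vocabulary):
#   spell_change_mode off: presegs["giving"] == ["giv", "ing"]
#   spell_change_mode on:  presegs["giving"] == ["give", "ing"]
# The text file lists one "word: seg1 seg2 ..." line per word; the pickle stores the same dict.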
def propogate_to_children(graph, presegs, word, prev_idx, drop_str, kind):
"""
Propogates segmentations of a word to longer words of which it is a base. Uses a graph where
each word is a node with edges to longer words it is a base of.
Example: give -> giving.
"""
try:
for child in graph.successors(word):
link = graph[word][child]["link"]
            #BUG FIX: Sometimes things get propagated in one word but not another, meaning link indices may not line up.
if len(link) <= prev_idx:
continue
idx = link[prev_idx]
segment = presegs[child][idx]
if kind == "s":
if segment[-len(drop_str):] == drop_str and len(segment) > len(drop_str):
presegs[child][idx] = segment[:-len(drop_str)]
presegs[child].insert(idx + 1, drop_str)
print("")
print("PROPOGATION!")
print(presegs[child])
print("")
link.append(max(link) + 1)
propogate_to_children(graph, presegs, child, idx, drop_str, kind)
else:
if segment[:len(drop_str)] == drop_str and len(segment) > len(drop_str):
presegs[child][idx] = drop_str
presegs[child].insert(idx + 1, segment[len(drop_str):])
print("")
print("PROPOGATION!")
print(presegs[child])
print("")
link.append(max(link) + 1)
propogate_to_children(graph, presegs, child, idx, drop_str, kind)
except Exception as e:
pdb.set_trace()
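# Worked example (illustrative): suppose "makings" was processed first and segmented as
# ["making", "s"], adding the edge making -> makings with link [0]. When "making" is later
# segmented by dropping "ing", this function rewrites the child's first segment, giving
# presegs["makings"] == ["mak", "ing", "s"].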
def cosine_similarity(vec1, vec2):
    """Cosine similarity between two vectors: dot(vec1, vec2) / (|vec1| * |vec2|)."""
    return np.dot(vec1, vec2)/(np.linalg.norm(vec1)*np.linalg.norm(vec2))
def check_transform_similarity(word, new_string, d_vectors, vocab, word_vectors, threshold):
"""
Test whether a transformation should be applied to a word, creating a pre-segmentation for that word.
Arguments:
word -- word we are testing a transformation on.
new_string -- word that applying this transformation would create.
d_vectors -- Example direction vectors for the transformation being tested.
vocab -- Dict of (word -> freq). Extracted from target corpus.
word_vectors -- Dict of word -> embeddings. Trained on target corpus.
threshold -- minimum cosine similarity needed to trigger a pre-segmentation.
Returns:
Boolean that indicates whether transformation passed the cosine similarity threshold.
"""
if new_string in vocab:
        candidate_direction_vector = word_vectors[new_string] - word_vectors[word]
        for direction_vector in d_vectors:
            if cosine_similarity(candidate_direction_vector, direction_vector) > threshold:
print(word + " ---> " + new_string)
return True
return False
def test_transforms(word, morph_transforms, vocab, word_vectors, threshold):
"""
For a single word, iterate through all generalized transformations and potentially create a
new pre-segmentation for the current word.
Arguments:
word -- word we are testing a transformation on.
morph_transforms -- generalized morphological transformations (from process_json)
vocab -- Dict of (word -> freq). Extracted from target corpus.
word_vectors -- Dict of word -> embeddings. Trained on target corpus.
threshold -- minimum cosine similarity needed to trigger a pre-segmentation.
Returns:
IF some transformation passed the threshold, return the type (prefix or suffix) that
transformation was, the from string (dropped from the input/longer word), and the new word that
was formed.
"""
for transform in morph_transforms:
rule_kind, from_str, to_str, d_vectors = transform
i = len(from_str)
if i < len(word):
if rule_kind == "s":
if word[-i:] != from_str:
continue
new_string = word[:-i] + to_str
else:
if word[:i] != from_str:
continue
new_string = to_str + word[i:]
if check_transform_similarity(word, new_string, d_vectors, vocab, word_vectors, threshold):
return rule_kind, from_str, new_string
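# Worked example (illustrative): given a generalized transform ("s", "ing", "e", d_vectors)
# and the word "making", from_str "ing" matches the word's suffix, so the candidate
# new_string is "make". The transform is accepted only if "make" is in the vocabulary and
# vec("make") - vec("making") is within the cosine threshold of one of the stored
# example direction vectors.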
if __name__ == '__main__':
data_directory = sys.argv[1]
vectors_file = sys.argv[2]
transforms_file = sys.argv[3]
    #Flag (0/1) that sets whether to use the propagation algorithm.
use_propogation = int(sys.argv[4])
    #Flag (0/1) that sets whether to undo spelling changes in segmentations.
spell_change_mode = int(sys.argv[5])
threshold = float(sys.argv[6])
k = int(sys.argv[7])
save_dir = sys.argv[8]
word_vectors = pickle.load(open(vectors_file, "rb"))
json_contents = json.load(open(transforms_file, "r"))
vocab = get_vocabulary(open(data_directory, "r"))
test_set = list(vocab.keys())
    #Use Hyperparameter 2: block pre-segmentations on the k most frequent words.
test_set = sorted(test_set, key = lambda x: vocab[x], reverse=True)
test_set = test_set[k:]
morph_transforms = process_json(json_contents, vocab, word_vectors)
compute_preseg(vocab, word_vectors, morph_transforms, test_set=test_set, threshold=threshold)