Skip to content

Commit e3a08b5

Browse files
Masoud authored and Masoud committed
Added scoring to the WE filters and support for choosing the config file
1 parent d9ee52f commit e3a08b5

File tree

8 files changed: +46 -28 lines changed

config.json

+5-6
Original file line numberDiff line numberDiff line change
@@ -12,21 +12,20 @@
1212
"normalize scores": "true",
1313
"emit scores": "false",
1414
"no out files": "true",
15-
"max decision": 0
15+
"max decision": -1
1616
},
1717

1818
"policies": [
1919
["OneNo", "off"],
20-
["TwentyNo", "off"],
21-
["MajorityVoting", "off"],
22-
["SingleFilterPolicy", "off"]
20+
["TwentyNo", "on"],
21+
["MajorityVoting", "off"]
2322
],
2423

2524
"filters": [
2625
["SampleFilter", "off"],
2726
["LengthStats", "off"],
2827

29-
["LengthRatio", "off"],
28+
["LengthRatio", "on"],
3029
["ReverseLengthRatio", "off"],
3130
["WordRatio", "off"],
3231
["ReverseWordRatio", "off"],
@@ -37,7 +36,7 @@
3736

3837
["Lang_Identifier", "off"],
3938

40-
["AlignedProportion", "on"],
39+
["AlignedProportion", "off"],
4140
["BigramAlignedProportion", "off"],
4241
["NumberOfUnalignedSequences", "off"],
4342
["LongestAlignedSequence", "off"],

filters/WE_Average/WE_Average.py

+6-4
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ def finalize(self):
129129
def process_tu(self, tu, num_of_finished_scans):
130130
if (num_of_finished_scans == 0 and self.num_of_scans == 1) or num_of_finished_scans == 2:
131131
if len(tu.src_phrase) == 0 or len(tu.trg_phrase) == 0:
132-
return
132+
return [0]
133133

134134
src_vectors = []
135135
for w in tu.src_tokens:
@@ -138,7 +138,7 @@ def process_tu(self, tu, num_of_finished_scans):
138138
src_vectors.append(self.vectors[index])
139139

140140
if len(src_vectors) == 0:
141-
return
141+
return [0]
142142
src_rep = np.median(src_vectors, axis=0)
143143

144144
trg_vectors = []
@@ -148,15 +148,17 @@ def process_tu(self, tu, num_of_finished_scans):
148148
trg_vectors.append(self.vectors[index])
149149

150150
if len(trg_vectors) == 0:
151-
return
151+
return [0]
152152
trg_rep = np.median(trg_vectors, axis=0)
153153

154154
distance = cosine(src_rep, trg_rep)
155155

156156
self.n += 1
157157
self.sum += distance
158158
self.sum_sq += distance * distance
159-
159+
160+
return [distance]
161+
160162
elif num_of_finished_scans == 0:
161163
self.all_words += tu.src_tokens
162164
self.all_words += tu.trg_tokens

filters/WE_BestAlignScore/WE_BestAlignScore.py

+5-3
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ def finalize(self):
129129
def process_tu(self, tu, num_of_finished_scans):
130130
if (num_of_finished_scans == 0 and self.num_of_scans == 1) or num_of_finished_scans == 2:
131131
if len(tu.src_phrase) == 0 or len(tu.trg_phrase) == 0:
132-
return
132+
return [0]
133133

134134
index = 0
135135
src_vectors = []
@@ -139,7 +139,7 @@ def process_tu(self, tu, num_of_finished_scans):
139139
src_vectors.append(self.vectors[index])
140140

141141
if len(src_vectors) == 0:
142-
return
142+
return [0]
143143

144144
trg_vectors = []
145145
for w in tu.trg_tokens:
@@ -148,7 +148,7 @@ def process_tu(self, tu, num_of_finished_scans):
148148
trg_vectors.append(self.vectors[index])
149149

150150
if len(trg_vectors) == 0:
151-
return
151+
return [0]
152152

153153
avg_distance = 0.0
154154
min_src_dist = [1.0] * len(src_vectors)
@@ -172,6 +172,8 @@ def process_tu(self, tu, num_of_finished_scans):
172172
self.sum += avg_distance
173173
self.sum_sq += avg_distance * avg_distance
174174

175+
return [avg_distance]
176+
175177
elif num_of_finished_scans == 0:
176178
self.all_words += tu.src_tokens
177179
self.all_words += tu.trg_tokens

filters/WE_Median/WE_Median.py

+5-3
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ def finalize(self):
129129
def process_tu(self, tu, num_of_finished_scans):
130130
if (num_of_finished_scans == 0 and self.num_of_scans == 1) or num_of_finished_scans == 2:
131131
if len(tu.src_phrase) == 0 or len(tu.trg_phrase) == 0:
132-
return
132+
return [0]
133133

134134
src_vectors = []
135135
for w in tu.src_tokens:
@@ -138,7 +138,7 @@ def process_tu(self, tu, num_of_finished_scans):
138138
src_vectors.append(self.vectors[index])
139139

140140
if len(src_vectors) == 0:
141-
return
141+
return [0]
142142
src_rep = np.sum(src_vectors, axis=0)
143143

144144
trg_vectors = []
@@ -148,7 +148,7 @@ def process_tu(self, tu, num_of_finished_scans):
148148
trg_vectors.append(self.vectors[index])
149149

150150
if len(trg_vectors) == 0:
151-
return
151+
return [0]
152152
trg_rep = np.sum(trg_vectors, axis=0)
153153

154154
distance = cosine(src_rep, trg_rep)
@@ -157,6 +157,8 @@ def process_tu(self, tu, num_of_finished_scans):
157157
self.sum += distance
158158
self.sum_sq += distance * distance
159159

160+
return [distance]
161+
160162
elif num_of_finished_scans == 0:
161163
self.all_words += tu.src_tokens
162164
self.all_words += tu.trg_tokens

filters/WE_ScoreAlign_BestForRest/WE_ScoreAlign_BestForRest.py

+6-4
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,7 @@ def finalize(self):
130130
def process_tu(self, tu, num_of_finished_scans):
131131
if (num_of_finished_scans == 0 and self.num_of_scans == 1) or num_of_finished_scans == 2:
132132
if len(tu.src_phrase) == 0 or len(tu.trg_phrase) == 0:
133-
return
133+
return [0]
134134

135135
index = -1
136136
src_vectors = {}
@@ -139,7 +139,7 @@ def process_tu(self, tu, num_of_finished_scans):
139139
index = self.all_words[w]
140140
src_vectors[i] = self.vectors[index]
141141
if index == -1:
142-
return
142+
return [0]
143143

144144
index = -1
145145
trg_vectors = {}
@@ -148,7 +148,7 @@ def process_tu(self, tu, num_of_finished_scans):
148148
index = self.all_words[w]
149149
trg_vectors[i] = self.vectors[index]
150150
if index == -1:
151-
return
151+
return [0]
152152

153153
trg_mark = Set()
154154
avg_distance = 0.0
@@ -177,13 +177,15 @@ def process_tu(self, tu, num_of_finished_scans):
177177
counter += 1
178178

179179
if counter == 0:
180-
return
180+
return [0]
181181
avg_distance /= counter
182182

183183
self.n += 1
184184
self.sum += avg_distance
185185
self.sum_sq += avg_distance * avg_distance
186186

187+
return [avg_distance]
188+
187189
elif num_of_finished_scans == 0:
188190
self.all_words += tu.src_tokens
189191
self.all_words += tu.trg_tokens

filters/WE_ScoreOtherAlignment/WE_ScoreOtherAlignment.py

+7-5
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ def finalize(self):
129129
def process_tu(self, tu, num_of_finished_scans):
130130
if (num_of_finished_scans == 0 and self.num_of_scans == 1) or num_of_finished_scans == 2:
131131
if len(tu.src_phrase) == 0 or len(tu.trg_phrase) == 0:
132-
return
132+
return [0]
133133

134134
index = -1
135135
src_vectors = []
@@ -141,7 +141,7 @@ def process_tu(self, tu, num_of_finished_scans):
141141
src_vectors.append(None)
142142

143143
if index == -1:
144-
return
144+
return [0]
145145

146146
index = -1
147147
trg_vectors = []
@@ -153,7 +153,7 @@ def process_tu(self, tu, num_of_finished_scans):
153153
trg_vectors.append(None)
154154

155155
if index == -1:
156-
return
156+
return [0]
157157

158158
avg_distance = 0.0
159159
counter = 0.0
@@ -162,7 +162,7 @@ def process_tu(self, tu, num_of_finished_scans):
162162
t_w = align_pair[1]
163163

164164
if s_w >= len(src_vectors) or t_w >= len(trg_vectors):
165-
return
165+
return [0]
166166
if src_vectors[s_w] is None or trg_vectors[t_w] is None:
167167
continue
168168
dist = cosine(src_vectors[s_w], trg_vectors[t_w])
@@ -171,13 +171,15 @@ def process_tu(self, tu, num_of_finished_scans):
171171
counter += 1
172172

173173
if counter == 0:
174-
return
174+
return [0]
175175
avg_distance /= counter
176176

177177
self.n += 1
178178
self.sum += avg_distance
179179
self.sum_sq += avg_distance * avg_distance
180180

181+
return [avg_distance]
182+
181183
elif num_of_finished_scans == 0:
182184
self.all_words += tu.src_tokens
183185
self.all_words += tu.trg_tokens

main.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from tm_manager import *
2+
import sys
23
"""
34
TMoP - Translation Memory Open-Source Purifier by Matteo Negri, Masoud Jalili Sabet and Marco Turchi, October 2015
45
@@ -32,6 +33,9 @@
3233
"""
3334

3435
if __name__ == "__main__":
35-
manager = TMManager()
36+
config_file = ""
37+
if len(sys.argv) > 1:
38+
config_file = sys.argv[1]
39+
manager = TMManager(config_file)
3640

3741
manager.run()

tm_manager.py

+7-2
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ class TMManager:
4747
"""
4848

4949
#
50-
def __init__(self):
50+
def __init__(self, conf_file_name=""):
5151
# Adding filter folder to path. after this filters can import 'AbstractFilter'.
5252
sys.path.append(os.getcwd() + '/filters/')
5353
sys.path.append(os.getcwd() + '/policies/')
@@ -72,6 +72,8 @@ def __init__(self):
7272
self.have_token = False
7373
self.create_out_files = True
7474

75+
self.config_file_name = conf_file_name
76+
7577
#
7678
def load_options_from_config_file(self):
7779
"""
@@ -84,7 +86,10 @@ def load_options_from_config_file(self):
8486

8587
print "Loading Options fro the config file ...\n"
8688

87-
conf_file = open("config.json")
89+
if self.config_file_name:
90+
conf_file = open(self.config_file_name)
91+
else:
92+
conf_file = open("config.json")
8893
try:
8994
config = json.load(conf_file)
9095
except ValueError:

0 commit comments

Comments
 (0)