Skip to content

Commit a59babd

Browse files
Masoud authored and committed
new features, like saving the stats files
1 parent a240315 commit a59babd

File tree

29 files changed

+1353
-196
lines changed

29 files changed

+1353
-196
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
*.pyo
44
test*
55
note.txt
6+
quartiles
67
# Except this file
78
!.gitignore
89

config.json

+23-16
Original file line numberDiff line numberDiff line change
@@ -1,36 +1,43 @@
11
{
22
"options": {
3-
"input file": "sample_en_it.csv",
3+
"input file1": "sample_en_it.csv",
4+
"input file": "en_it_1m_with_bgold",
45

5-
"align file": "sample_align",
6-
"token file": "sample_token",
6+
"align file1": "sample_align",
7+
"token file1": "sample_token",
8+
"align file": "en_it_1m_with_bgold.align",
9+
"token file": "en_it_1m_with_bgold.tok",
710

811
"output folder": "output",
9-
"source language": "English",
10-
"target language": "Italian",
12+
"source language": "en",
13+
"target language": "it",
1114

12-
"no out files": 1
15+
"normalize scores": "true",
16+
"emit scores": "true",
17+
"no out files": "true",
18+
19+
"max decision": "0"
1320
},
1421

1522
"policies": [
16-
["OneNo", "off"],
23+
["OneNo", "on"],
1724
["TwentyNo", "off"],
1825
["MajorityVoting", "off"],
1926
["SingleFilterPolicy", "off"]
2027
],
2128

2229
"filters": [
23-
["SampleFilter", "on"],
30+
["SampleFilter", "off"],
2431
["LengthStats", "off"],
2532

26-
["LengthRatio", "off"],
27-
["ReverseLengthRatio", "off"],
28-
["WordRatio", "off"],
29-
["ReverseWordRatio", "off"],
30-
["WordLength", "off"],
31-
["TagFinder", "off"],
32-
["RepeatedChars", "off"],
33-
["RepeatedWords", "off"],
33+
["LengthRatio", "on"],
34+
["ReverseLengthRatio", "off"],
35+
["WordRatio", "off"],
36+
["ReverseWordRatio", "off"],
37+
["WordLength", "off"],
38+
["TagFinder", "off"],
39+
["RepeatedChars", "off"],
40+
["RepeatedWords", "off"],
3441

3542
["Lang_Identifier", "off"],
3643

data/sample_token

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
Having regard to the Opinion of the European Parliament ; VISTO IL PARERE DEL PARLAMENTO EUROPEO ,
2+
Measurement of fuel consumption MISURA DEL CONSUMO DI CARBURANTE
3+
in the case of carbon monoxide , d = 1 7 250 ; PER L'OSSIDO DI CARBONIO , D = 1,250 ;
4+
Common compensation procedures Metodi comuni di compensazione
5+
The provisions of Articles 10 to 13 shall apply . Sono applicabili le disposizioni degli articolo da 10 a 13 .

filters/AlignedProportion/AlignedProportion.py

+76-5
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,15 @@
11
# sys.path.append(os.getcwd() + '/..') # Uncomment for standalone running
22
from abstract_filter import *
3+
import os.path
34
import math
5+
import numpy as np
46

57

68
class AlignedProportion(AbstractFilter):
79
def __init__(self):
10+
self.var_mult = 2
11+
# self.var_mult = 100 - self.var_mult
12+
813
self.num_of_scans = 1
914
self.src_language = ""
1015
self.trg_language = ""
@@ -21,15 +26,42 @@ def __init__(self):
2126
self.trg_mean = 0.0
2227
self.trg_var = 0.0
2328

29+
self.src_scores = []
30+
self.trg_scores = []
31+
self.s_thresh = 0.0
32+
self.t_thresh = 0.0
33+
2434
#
25-
def initialize(self, source_language, target_language):
35+
def initialize(self, source_language, target_language, extra_args):
2636
self.num_of_scans = 1
27-
self.src_language = source_language
28-
self.trg_language = target_language
37+
self.src_language = extra_args['source language']
38+
self.trg_language = extra_args['target language']
39+
self.normalize = extra_args['normalize scores']
40+
self.model_filename = "models/" + extra_args['input filename'] + "__AlignedProportion.stats"
41+
if self.normalize:
42+
self.model_filename += "_n"
43+
44+
if os.path.isfile(self.model_filename):
45+
self.num_of_scans = 0
46+
47+
f = open(self.model_filename, 'r')
48+
l = f.readline().strip().split("\t")
49+
self.src_mean = float(l[1])
50+
self.src_var = float(l[2])
51+
52+
l = f.readline().strip().split("\t")
53+
self.trg_mean = float(l[1])
54+
self.trg_var = float(l[2])
55+
56+
f.close()
57+
print "Loaded stats from the model file."
2958

3059
return
3160

3261
def finalize(self):
62+
if self.num_of_scans == 0:
63+
return
64+
3365
if self.n <= 1:
3466
self.n = 2.0
3567
self.src_mean = self.src_sum / self.n
@@ -44,6 +76,30 @@ def finalize(self):
4476
print "source mean & deviation:", self.src_mean, "\t", self.src_var
4577
print "target mean & deviation:", self.trg_mean, "\t", self.trg_var
4678

79+
f = open(self.model_filename, 'w')
80+
81+
f.write("source\t" + str(self.src_mean) + "\t" + str(self.src_var) + "\n")
82+
f.write("target\t" + str(self.trg_mean) + "\t" + str(self.trg_var) + "\n")
83+
84+
f.close()
85+
86+
self.s_thresh = np.percentile(self.src_scores, self.var_mult)
87+
self.t_thresh = np.percentile(self.trg_scores, self.var_mult)
88+
89+
f = open("models/quartiles", "a")
90+
91+
f.write("Aligned Proportion")
92+
f.write("\t" + str(np.percentile(self.src_scores, 25)))
93+
f.write("\t" + str(np.percentile(self.src_scores, 50)))
94+
f.write("\t" + str(np.percentile(self.src_scores, 75)))
95+
96+
f.write("\t" + str(np.percentile(self.trg_scores, 25)))
97+
f.write("\t" + str(np.percentile(self.trg_scores, 50)))
98+
f.write("\t" + str(np.percentile(self.trg_scores, 75)))
99+
f.write("\n")
100+
101+
f.close()
102+
47103
#
48104
def process_tu(self, tu, num_of_finished_scans):
49105
src_set = set([x[0] for x in tu.alignment])
@@ -53,17 +109,31 @@ def process_tu(self, tu, num_of_finished_scans):
53109
trg_size = float(len(tu.trg_tokens))
54110

55111
if src_size == 0 or trg_size == 0:
56-
return
112+
return [0.0, 0.0]
57113

58114
self.n += 1
59115
src_ratio = float(len(src_set)) / src_size
116+
# if src_ratio > 1:
117+
# print src_set
118+
# print tu.src_tokens
60119
trg_ratio = float(len(trg_set)) / trg_size
120+
# if trg_ratio > 1:
121+
# print trg_set
122+
# print tu.trg_tokens
123+
124+
src_ratio = min(src_ratio, 1.0)
125+
trg_ratio = min(trg_ratio, 1.0)
61126

62127
self.src_sum += src_ratio
63128
self.src_sum_sq += src_ratio * src_ratio
64129
self.trg_sum += trg_ratio
65130
self.trg_sum_sq += trg_ratio * trg_ratio
66131

132+
self.src_scores.append(src_ratio)
133+
self.trg_scores.append(trg_ratio)
134+
135+
return [src_ratio, trg_ratio]
136+
67137
#
68138
def do_after_a_full_scan(self, num_of_finished_scans):
69139
pass
@@ -84,6 +154,7 @@ def decide(self, tu):
84154
src_ratio = abs(src_ratio - self.src_mean)
85155
trg_ratio = abs(trg_ratio - self.trg_mean)
86156

87-
if src_ratio > 2 * self.src_var or trg_ratio > 2 * self.trg_var:
157+
if src_ratio > self.var_mult * self.src_var or trg_ratio > self.var_mult * self.trg_var:
158+
# if src_ratio < self.s_thresh or trg_ratio < self.t_thresh:
88159
return 'reject'
89160
return 'accept'

filters/AlignedSequenceLength/AlignedSequenceLength.py

+98-17
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,15 @@
11
# sys.path.append(os.getcwd() + '/..') # Uncomment for standalone running
22
from abstract_filter import *
3+
import os.path
34
import math
5+
import numpy as np
46

57

68
class AlignedSequenceLength(AbstractFilter):
79
def __init__(self):
10+
self.var_mult = 2
11+
# self.var_mult = 100 - self.var_mult
12+
813
self.num_of_scans = 1
914
self.src_language = ""
1015
self.trg_language = ""
@@ -22,15 +27,42 @@ def __init__(self):
2227
self.trg_mean = 0.0
2328
self.trg_var = 0.0
2429

30+
self.src_scores = []
31+
self.trg_scores = []
32+
self.s_thresh = 0.0
33+
self.t_thresh = 0.0
34+
2535
#
26-
def initialize(self, source_language, target_language):
36+
def initialize(self, source_language, target_language, extra_args):
2737
self.num_of_scans = 1
28-
self.src_language = source_language
29-
self.trg_language = target_language
38+
self.src_language = extra_args['source language']
39+
self.trg_language = extra_args['target language']
40+
self.normalize = extra_args['normalize scores']
41+
self.model_filename = "models/" + extra_args['input filename'] + "__AlignedSequenceLength.stats"
42+
if self.normalize:
43+
self.model_filename += "_n"
44+
45+
if os.path.isfile(self.model_filename):
46+
self.num_of_scans = 0
47+
48+
f = open(self.model_filename, 'r')
49+
l = f.readline().strip().split("\t")
50+
self.src_mean = float(l[1])
51+
self.src_var = float(l[2])
52+
53+
l = f.readline().strip().split("\t")
54+
self.trg_mean = float(l[1])
55+
self.trg_var = float(l[2])
56+
57+
f.close()
58+
print "Loaded stats from the model file."
3059

3160
return
3261

3362
def finalize(self):
63+
if self.num_of_scans == 0:
64+
return
65+
3466
if self.src_n <= 1:
3567
self.src_n = 2.0
3668
self.src_mean = self.src_sum / self.src_n
@@ -43,6 +75,30 @@ def finalize(self):
4375
self.trg_var = (self.trg_sum_sq - (self.trg_sum * self.trg_sum) / self.trg_n) / (self.trg_n - 1)
4476
self.trg_var = math.sqrt(self.trg_var)
4577

78+
f = open(self.model_filename, 'w')
79+
80+
f.write("source\t" + str(self.src_mean) + "\t" + str(self.src_var) + "\n")
81+
f.write("target\t" + str(self.trg_mean) + "\t" + str(self.trg_var) + "\n")
82+
83+
f.close()
84+
85+
self.s_thresh = np.percentile(self.src_scores, self.var_mult)
86+
self.t_thresh = np.percentile(self.trg_scores, self.var_mult)
87+
88+
f = open("models/quartiles", "a")
89+
90+
f.write("AlignedSequenceLength")
91+
f.write("\t" + str(np.percentile(self.src_scores, 25)))
92+
f.write("\t" + str(np.percentile(self.src_scores, 50)))
93+
f.write("\t" + str(np.percentile(self.src_scores, 75)))
94+
95+
f.write("\t" + str(np.percentile(self.trg_scores, 25)))
96+
f.write("\t" + str(np.percentile(self.trg_scores, 50)))
97+
f.write("\t" + str(np.percentile(self.trg_scores, 75)))
98+
f.write("\n")
99+
100+
f.close()
101+
46102
#
47103
def process_tu(self, tu, num_of_finished_scans):
48104
src_set = set([x[0] for x in tu.alignment])
@@ -51,37 +107,57 @@ def process_tu(self, tu, num_of_finished_scans):
51107
trg_size = float(len(tu.trg_tokens))
52108

53109
if src_size == 0 or trg_size == 0:
54-
return
110+
return [0.0, 0.0]
55111

56112
src_bar = set([i for i in range(int(src_size))])
57113
trg_bar = set([i for i in range(int(trg_size))])
58114
src_set = src_bar - src_set
59115
trg_set = trg_bar - trg_set
60116

61117
last = -1
118+
n = 0.0
119+
smean = 0.0
62120
for current in src_set:
63121
if current - last > 1:
64-
self.src_n += 1
65-
self.src_sum += (current - last - 1)
66-
self.src_sum_sq += (current - last - 1) * (current - last - 1)
122+
n += 1
123+
smean += (current - last - 1)
67124
last = current
68125
if src_size - last > 1:
69-
self.src_n += 1
70-
self.src_sum += (src_size - last - 1)
71-
self.src_sum_sq += (src_size - last - 1) * (src_size - last - 1)
126+
n += 1
127+
smean += (src_size - last - 1)
128+
129+
smean /= max(n, 1.0)
130+
if self.normalize:
131+
smean = min(smean, 4.0) / 4.0
132+
133+
self.src_n += 1
134+
self.src_sum += smean
135+
self.src_sum_sq += smean * smean
72136

73137
last = -1
138+
n = 0.0
139+
tmean = 0.0
74140
for current in trg_set:
75141
if current - last > 1:
76-
self.trg_n += 1
77-
self.trg_sum += (current - last - 1)
78-
self.trg_sum_sq += (current - last - 1) * (current - last - 1)
142+
n += 1
143+
tmean += (current - last - 1)
79144
last = current
80145
if trg_size - last > 1:
81-
self.trg_n += 1
82-
self.trg_sum += (trg_size - last - 1)
83-
self.trg_sum_sq += (trg_size - last - 1) * (trg_size - last - 1)
146+
n += 1
147+
tmean += (trg_size - last - 1)
148+
149+
tmean /= max(n, 1.0)
150+
if self.normalize:
151+
tmean = min(tmean, 4.0) / 4.0
152+
153+
self.trg_n += 1
154+
self.trg_sum += tmean
155+
self.trg_sum_sq += tmean * tmean
156+
157+
self.src_scores.append(smean)
158+
self.trg_scores.append(tmean)
84159

160+
return [smean, tmean]
85161

86162
#
87163
def do_after_a_full_scan(self, num_of_finished_scans):
@@ -112,6 +188,8 @@ def decide(self, tu):
112188
if n < 1:
113189
n = 1.0
114190
src_mean /= n
191+
if self.normalize:
192+
src_mean = min(src_mean, 4.0) / 4.0
115193

116194
n = 0.0
117195
trg_mean = 0.0
@@ -127,10 +205,13 @@ def decide(self, tu):
127205
if n < 1:
128206
n = 1.0
129207
trg_mean /= n
208+
if self.normalize:
209+
trg_mean = min(trg_mean, 4.0) / 4.0
130210

131211
src_mean = abs(src_mean - self.src_mean)
132212
trg_mean = abs(trg_mean - self.trg_mean)
133213

134-
if src_mean > 2 * self.src_var or trg_mean > 2 * self.trg_var:
214+
if src_mean > self.var_mult * self.src_var or trg_mean > self.var_mult * self.trg_var:
215+
# if src_mean < self.s_thresh or trg_mean < self.t_thresh:
135216
return 'reject'
136217
return 'accept'

0 commit comments

Comments
 (0)