Skip to content

Commit 3091230

Browse files
Masoud authored and Masoud committed
version 1.2
1 parent a59babd commit 3091230

File tree

26 files changed: +590 -244 lines changed

config.json

+15-19
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,22 @@
11
{
22
"options": {
3-
"input file1": "sample_en_it.csv",
4-
"input file": "en_it_1m_with_bgold",
3+
"input file": "sample_en_it.csv",
54

6-
"align file1": "sample_align",
7-
"token file1": "sample_token",
8-
"align file": "en_it_1m_with_bgold.align",
9-
"token file": "en_it_1m_with_bgold.tok",
5+
"align file": "sample_align",
6+
"token file": "sample_token",
107

118
"output folder": "output",
129
"source language": "en",
1310
"target language": "it",
1411

1512
"normalize scores": "true",
16-
"emit scores": "true",
13+
"emit scores": "false",
1714
"no out files": "true",
18-
19-
"max decision": "0"
15+
"max decision": 0
2016
},
2117

2218
"policies": [
23-
["OneNo", "on"],
19+
["OneNo", "off"],
2420
["TwentyNo", "off"],
2521
["MajorityVoting", "off"],
2622
["SingleFilterPolicy", "off"]
@@ -30,18 +26,18 @@
3026
["SampleFilter", "off"],
3127
["LengthStats", "off"],
3228

33-
["LengthRatio", "on"],
34-
["ReverseLengthRatio", "of"],
35-
["WordRatio", "of"],
36-
["ReverseWordRatio", "of"],
37-
["WordLength", "of"],
38-
["TagFinder", "of"],
39-
["RepeatedChars", "of"],
40-
["RepeatedWords", "of"],
29+
["LengthRatio", "off"],
30+
["ReverseLengthRatio", "off"],
31+
["WordRatio", "off"],
32+
["ReverseWordRatio", "off"],
33+
["WordLength", "off"],
34+
["TagFinder", "off"],
35+
["RepeatedChars", "off"],
36+
["RepeatedWords", "off"],
4137

4238
["Lang_Identifier", "off"],
4339

44-
["AlignedProportion", "off"],
40+
["AlignedProportion", "on"],
4541
["BigramAlignedProportion", "off"],
4642
["NumberOfUnalignedSequences", "off"],
4743
["LongestAlignedSequence", "off"],

data/sample_tok

-5
This file was deleted.

filters/AlignedProportion/AlignedProportion.py

+31-12
Original file line numberDiff line numberDiff line change
@@ -31,35 +31,52 @@ def __init__(self):
3131
self.s_thresh = 0.0
3232
self.t_thresh = 0.0
3333

34+
self.model_exist = False
35+
3436
#
3537
def initialize(self, source_language, target_language, extra_args):
3638
self.num_of_scans = 1
3739
self.src_language = extra_args['source language']
3840
self.trg_language = extra_args['target language']
3941
self.normalize = extra_args['normalize scores']
40-
self.model_filename = "models/" + extra_args['input filename'] + "__AlignedProportion.stats"
42+
self.model_filename = "models/AlignedProportion.stats"
4143
if self.normalize:
4244
self.model_filename += "_n"
4345

4446
if os.path.isfile(self.model_filename):
45-
self.num_of_scans = 0
46-
47+
lang_pair = self.src_language + self.trg_language
4748
f = open(self.model_filename, 'r')
48-
l = f.readline().strip().split("\t")
49-
self.src_mean = float(l[1])
50-
self.src_var = float(l[2])
5149

52-
l = f.readline().strip().split("\t")
53-
self.trg_mean = float(l[1])
54-
self.trg_var = float(l[2])
50+
l = f.readline()
51+
while l:
52+
if lang_pair not in l:
53+
l = f.readline()
54+
continue
55+
56+
# found the statistics
57+
self.model_exist = True
58+
self.num_of_scans = 0
59+
60+
l = f.readline().strip().split("\t")
61+
self.src_mean = float(l[1])
62+
self.src_var = float(l[2])
63+
64+
l = f.readline().strip().split("\t")
65+
self.trg_mean = float(l[1])
66+
self.trg_var = float(l[2])
67+
68+
break
5569

5670
f.close()
57-
print "Loaded stats from the model file."
71+
if self.model_exist:
72+
print "Loaded stats from the model file."
5873

74+
if extra_args['emit scores'] == True:
75+
self.num_of_scans = 1
5976
return
6077

6178
def finalize(self):
62-
if self.num_of_scans == 0:
79+
if self.model_exist:
6380
return
6481

6582
if self.n <= 1:
@@ -76,7 +93,9 @@ def finalize(self):
7693
print "source mean & deviation:", self.src_mean, "\t", self.src_var
7794
print "target mean & deviation:", self.trg_mean, "\t", self.trg_var
7895

79-
f = open(self.model_filename, 'w')
96+
f = open(self.model_filename, 'a')
97+
lang_pair = self.src_language + self.trg_language
98+
f.write("\n" + lang_pair + "\n")
8099

81100
f.write("source\t" + str(self.src_mean) + "\t" + str(self.src_var) + "\n")
82101
f.write("target\t" + str(self.trg_mean) + "\t" + str(self.trg_var) + "\n")

filters/AlignedSequenceLength/AlignedSequenceLength.py

+29-12
Original file line numberDiff line numberDiff line change
@@ -38,29 +38,44 @@ def initialize(self, source_language, target_language, extra_args):
3838
self.src_language = extra_args['source language']
3939
self.trg_language = extra_args['target language']
4040
self.normalize = extra_args['normalize scores']
41-
self.model_filename = "models/" + extra_args['input filename'] + "__AlignedSequenceLength.stats"
41+
self.model_filename = "models/AlignedSequenceLength.stats"
4242
if self.normalize:
4343
self.model_filename += "_n"
4444

4545
if os.path.isfile(self.model_filename):
46-
self.num_of_scans = 0
47-
46+
lang_pair = self.src_language + self.trg_language
4847
f = open(self.model_filename, 'r')
49-
l = f.readline().strip().split("\t")
50-
self.src_mean = float(l[1])
51-
self.src_var = float(l[2])
5248

53-
l = f.readline().strip().split("\t")
54-
self.trg_mean = float(l[1])
55-
self.trg_var = float(l[2])
49+
l = f.readline()
50+
while l:
51+
if lang_pair not in l:
52+
l = f.readline()
53+
continue
54+
55+
# found the statistics
56+
self.model_exist = True
57+
self.num_of_scans = 0
58+
59+
l = f.readline().strip().split("\t")
60+
self.src_mean = float(l[1])
61+
self.src_var = float(l[2])
62+
63+
l = f.readline().strip().split("\t")
64+
self.trg_mean = float(l[1])
65+
self.trg_var = float(l[2])
66+
67+
break
5668

5769
f.close()
58-
print "Loaded stats from the model file."
70+
if self.model_exist:
71+
print "Loaded stats from the model file."
5972

73+
if extra_args['emit scores'] == True:
74+
self.num_of_scans = 1
6075
return
6176

6277
def finalize(self):
63-
if self.num_of_scans == 0:
78+
if self.model_exist:
6479
return
6580

6681
if self.src_n <= 1:
@@ -75,7 +90,9 @@ def finalize(self):
7590
self.trg_var = (self.trg_sum_sq - (self.trg_sum * self.trg_sum) / self.trg_n) / (self.trg_n - 1)
7691
self.trg_var = math.sqrt(self.trg_var)
7792

78-
f = open(self.model_filename, 'w')
93+
f = open(self.model_filename, 'a')
94+
lang_pair = self.src_language + self.trg_language
95+
f.write("\n" + lang_pair + "\n")
7996

8097
f.write("source\t" + str(self.src_mean) + "\t" + str(self.src_var) + "\n")
8198
f.write("target\t" + str(self.trg_mean) + "\t" + str(self.trg_var) + "\n")

filters/BigramAlignedProportion/BigramAlignedProportion.py

+29-12
Original file line numberDiff line numberDiff line change
@@ -37,29 +37,44 @@ def initialize(self, source_language, target_language, extra_args):
3737
self.src_language = extra_args['source language']
3838
self.trg_language = extra_args['target language']
3939
self.normalize = extra_args['normalize scores']
40-
self.model_filename = "models/" + extra_args['input filename'] + "__BigramAlignedProportion.stats"
40+
self.model_filename = "models/BigramAlignedProportion.stats"
4141
if self.normalize:
4242
self.model_filename += "_n"
4343

4444
if os.path.isfile(self.model_filename):
45-
self.num_of_scans = 0
46-
45+
lang_pair = self.src_language + self.trg_language
4746
f = open(self.model_filename, 'r')
48-
l = f.readline().strip().split("\t")
49-
self.src_mean = float(l[1])
50-
self.src_var = float(l[2])
5147

52-
l = f.readline().strip().split("\t")
53-
self.trg_mean = float(l[1])
54-
self.trg_var = float(l[2])
48+
l = f.readline()
49+
while l:
50+
if lang_pair not in l:
51+
l = f.readline()
52+
continue
53+
54+
# found the statistics
55+
self.model_exist = True
56+
self.num_of_scans = 0
57+
58+
l = f.readline().strip().split("\t")
59+
self.src_mean = float(l[1])
60+
self.src_var = float(l[2])
61+
62+
l = f.readline().strip().split("\t")
63+
self.trg_mean = float(l[1])
64+
self.trg_var = float(l[2])
65+
66+
break
5567

5668
f.close()
57-
print "Loaded stats from the model file."
69+
if self.model_exist:
70+
print "Loaded stats from the model file."
5871

72+
if extra_args['emit scores'] == True:
73+
self.num_of_scans = 1
5974
return
6075

6176
def finalize(self):
62-
if self.num_of_scans == 0:
77+
if self.model_exist:
6378
return
6479

6580
if self.n <= 1:
@@ -76,7 +91,9 @@ def finalize(self):
7691
print "source mean & deviation:", self.src_mean, "\t", self.src_var
7792
print "target mean & deviation:", self.trg_mean, "\t", self.trg_var
7893

79-
f = open(self.model_filename, 'w')
94+
f = open(self.model_filename, 'a')
95+
lang_pair = self.src_language + self.trg_language
96+
f.write("\n" + lang_pair + "\n")
8097

8198
f.write("source\t" + str(self.src_mean) + "\t" + str(self.src_var) + "\n")
8299
f.write("target\t" + str(self.trg_mean) + "\t" + str(self.trg_var) + "\n")

filters/FirstUnalignedWord/FirstUnalignedWord.py

+29-12
Original file line numberDiff line numberDiff line change
@@ -36,29 +36,44 @@ def initialize(self, source_language, target_language, extra_args):
3636
self.src_language = extra_args['source language']
3737
self.trg_language = extra_args['target language']
3838
self.normalize = extra_args['normalize scores']
39-
self.model_filename = "models/" + extra_args['input filename'] + "__FirstUnalignedWord.stats"
39+
self.model_filename = "models/FirstUnalignedWord.stats"
4040
if self.normalize:
4141
self.model_filename += "_n"
4242

4343
if os.path.isfile(self.model_filename):
44-
self.num_of_scans = 0
45-
44+
lang_pair = self.src_language + self.trg_language
4645
f = open(self.model_filename, 'r')
47-
l = f.readline().strip().split("\t")
48-
self.src_mean = float(l[1])
49-
self.src_var = float(l[2])
5046

51-
l = f.readline().strip().split("\t")
52-
self.trg_mean = float(l[1])
53-
self.trg_var = float(l[2])
47+
l = f.readline()
48+
while l:
49+
if lang_pair not in l:
50+
l = f.readline()
51+
continue
52+
53+
# found the statistics
54+
self.model_exist = True
55+
self.num_of_scans = 0
56+
57+
l = f.readline().strip().split("\t")
58+
self.src_mean = float(l[1])
59+
self.src_var = float(l[2])
60+
61+
l = f.readline().strip().split("\t")
62+
self.trg_mean = float(l[1])
63+
self.trg_var = float(l[2])
64+
65+
break
5466

5567
f.close()
56-
print "Loaded stats from the model file."
68+
if self.model_exist:
69+
print "Loaded stats from the model file."
5770

71+
if extra_args['emit scores'] == True:
72+
self.num_of_scans = 1
5873
return
5974

6075
def finalize(self):
61-
if self.num_of_scans == 0:
76+
if self.model_exist:
6277
return
6378

6479
self.src_mean = self.src_sum / self.n
@@ -69,7 +84,9 @@ def finalize(self):
6984
self.trg_var = (self.trg_sum_sq - (self.trg_sum * self.trg_sum) / self.n) / (self.n - 1)
7085
self.trg_var = math.sqrt(self.trg_var)
7186

72-
f = open(self.model_filename, 'w')
87+
f = open(self.model_filename, 'a')
88+
lang_pair = self.src_language + self.trg_language
89+
f.write("\n" + lang_pair + "\n")
7390

7491
f.write("source\t" + str(self.src_mean) + "\t" + str(self.src_var) + "\n")
7592
f.write("target\t" + str(self.trg_mean) + "\t" + str(self.trg_var) + "\n")

filters/Lang_Identifier/Lang_Identifier.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,13 @@ def __init__(self):
1010
self.trg_language = ""
1111

1212
def initialize(self, source_language, target_language, extra_args):
13-
self.num_of_scans = 1
13+
self.num_of_scans = 0
1414
self.src_language = extra_args['source language']
1515
self.trg_language = extra_args['target language']
1616
self.normalize = extra_args['normalize scores']
1717

18+
if extra_args['emit scores'] == True:
19+
self.num_of_scans = 1
1820
langid.load_model()
1921
return
2022

0 commit comments

Comments
 (0)