Add score return values to the WE filters and allow choosing the config file via a command-line argument

Masoud · Masoud · commit e3a08b5bbf0f · 2016-08-07T23:58:41.000+04:30
diff --git a/config.json b/config.json
@@ -12,21 +12,20 @@
 		"normalize scores":			"true",
 		"emit scores":				"false",
 		"no out files":				"true",
-		"max decision":				0
+		"max decision":				-1
 	},
 
 	"policies": [
 		["OneNo",					"off"],
-		["TwentyNo",				"off"],
-		["MajorityVoting",			"off"],
-		["SingleFilterPolicy",		"off"]
+		["TwentyNo",				"on"],
+		["MajorityVoting",			"off"]
 	],
 
 	"filters": [
 		["SampleFilter",					"off"],
 		["LengthStats",						"off"],
 
-		["LengthRatio",						"off"],
+		["LengthRatio",						"on"],
 		["ReverseLengthRatio",				"off"],
 		["WordRatio",						"off"],
 		["ReverseWordRatio",				"off"],
@@ -37,7 +36,7 @@
 
 		["Lang_Identifier",					"off"],
 
-		["AlignedProportion",				"on"],
+		["AlignedProportion",				"off"],
 		["BigramAlignedProportion",			"off"],
 		["NumberOfUnalignedSequences",		"off"],
 		["LongestAlignedSequence",			"off"],
diff --git a/filters/WE_Average/WE_Average.py b/filters/WE_Average/WE_Average.py
@@ -129,7 +129,7 @@ def finalize(self):
 	def process_tu(self, tu, num_of_finished_scans):
 		if (num_of_finished_scans == 0 and self.num_of_scans == 1) or num_of_finished_scans == 2:
 			if len(tu.src_phrase) == 0 or len(tu.trg_phrase) == 0:
-				return
+				return [0]
 
 			src_vectors = []
 			for w in tu.src_tokens:
@@ -138,7 +138,7 @@ def process_tu(self, tu, num_of_finished_scans):
 					src_vectors.append(self.vectors[index])
 
 			if len(src_vectors) == 0:
-				return
+				return [0]
 			src_rep = np.median(src_vectors, axis=0)
 
 			trg_vectors = []
@@ -148,15 +148,17 @@ def process_tu(self, tu, num_of_finished_scans):
 					trg_vectors.append(self.vectors[index])
 
 			if len(trg_vectors) == 0:
-				return
+				return [0]
 			trg_rep = np.median(trg_vectors, axis=0)
 
 			distance = cosine(src_rep, trg_rep)
 
 			self.n += 1
 			self.sum += distance
 			self.sum_sq += distance * distance
-			
+
+			return [distance]
+
 		elif num_of_finished_scans == 0:
 			self.all_words += tu.src_tokens
 			self.all_words += tu.trg_tokens
diff --git a/filters/WE_BestAlignScore/WE_BestAlignScore.py b/filters/WE_BestAlignScore/WE_BestAlignScore.py
@@ -129,7 +129,7 @@ def finalize(self):
 	def process_tu(self, tu, num_of_finished_scans):
 		if (num_of_finished_scans == 0 and self.num_of_scans == 1) or num_of_finished_scans == 2:
 			if len(tu.src_phrase) == 0 or len(tu.trg_phrase) == 0:
-				return
+				return [0]
 
 			index = 0
 			src_vectors = []
@@ -139,7 +139,7 @@ def process_tu(self, tu, num_of_finished_scans):
 					src_vectors.append(self.vectors[index])
 
 			if len(src_vectors) == 0:
-				return
+				return [0]
 
 			trg_vectors = []
 			for w in tu.trg_tokens:
@@ -148,7 +148,7 @@ def process_tu(self, tu, num_of_finished_scans):
 					trg_vectors.append(self.vectors[index])
 
 			if len(trg_vectors) == 0:
-				return
+				return [0]
 
 			avg_distance = 0.0
 			min_src_dist = [1.0] * len(src_vectors)
@@ -172,6 +172,8 @@ def process_tu(self, tu, num_of_finished_scans):
 			self.sum += avg_distance
 			self.sum_sq += avg_distance * avg_distance
 
+			return [avg_distance]
+
 		elif num_of_finished_scans == 0:
 			self.all_words += tu.src_tokens
 			self.all_words += tu.trg_tokens
diff --git a/filters/WE_Median/WE_Median.py b/filters/WE_Median/WE_Median.py
@@ -129,7 +129,7 @@ def finalize(self):
 	def process_tu(self, tu, num_of_finished_scans):
 		if (num_of_finished_scans == 0 and self.num_of_scans == 1) or num_of_finished_scans == 2:
 			if len(tu.src_phrase) == 0 or len(tu.trg_phrase) == 0:
-				return
+				return [0]
 
 			src_vectors = []
 			for w in tu.src_tokens:
@@ -138,7 +138,7 @@ def process_tu(self, tu, num_of_finished_scans):
 					src_vectors.append(self.vectors[index])
 
 			if len(src_vectors) == 0:
-				return
+				return [0]
 			src_rep = np.sum(src_vectors, axis=0)
 
 			trg_vectors = []
@@ -148,7 +148,7 @@ def process_tu(self, tu, num_of_finished_scans):
 					trg_vectors.append(self.vectors[index])
 
 			if len(trg_vectors) == 0:
-				return
+				return [0]
 			trg_rep = np.sum(trg_vectors, axis=0)
 
 			distance = cosine(src_rep, trg_rep)
@@ -157,6 +157,8 @@ def process_tu(self, tu, num_of_finished_scans):
 			self.sum += distance
 			self.sum_sq += distance * distance
 
+			return [distance]
+
 		elif num_of_finished_scans == 0:
 			self.all_words += tu.src_tokens
 			self.all_words += tu.trg_tokens
diff --git a/filters/WE_ScoreAlign_BestForRest/WE_ScoreAlign_BestForRest.py b/filters/WE_ScoreAlign_BestForRest/WE_ScoreAlign_BestForRest.py
@@ -130,7 +130,7 @@ def finalize(self):
 	def process_tu(self, tu, num_of_finished_scans):
 		if (num_of_finished_scans == 0 and self.num_of_scans == 1) or num_of_finished_scans == 2:
 			if len(tu.src_phrase) == 0 or len(tu.trg_phrase) == 0:
-				return
+				return [0]
 
 			index = -1
 			src_vectors = {}
@@ -139,7 +139,7 @@ def process_tu(self, tu, num_of_finished_scans):
 					index = self.all_words[w]
 					src_vectors[i] = self.vectors[index]
 			if index == -1:
-				return
+				return [0]
 
 			index = -1
 			trg_vectors = {}
@@ -148,7 +148,7 @@ def process_tu(self, tu, num_of_finished_scans):
 					index = self.all_words[w]
 					trg_vectors[i] = self.vectors[index]
 			if index == -1:
-				return
+				return [0]
 
 			trg_mark = Set()
 			avg_distance = 0.0
@@ -177,13 +177,15 @@ def process_tu(self, tu, num_of_finished_scans):
 				counter += 1
 
 			if counter == 0:
-				return
+				return [0]
 			avg_distance /= counter
 
 			self.n += 1
 			self.sum += avg_distance
 			self.sum_sq += avg_distance * avg_distance
 
+			return [avg_distance]
+
 		elif num_of_finished_scans == 0:
 			self.all_words += tu.src_tokens
 			self.all_words += tu.trg_tokens
diff --git a/filters/WE_ScoreOtherAlignment/WE_ScoreOtherAlignment.py b/filters/WE_ScoreOtherAlignment/WE_ScoreOtherAlignment.py
@@ -129,7 +129,7 @@ def finalize(self):
 	def process_tu(self, tu, num_of_finished_scans):
 		if (num_of_finished_scans == 0 and self.num_of_scans == 1) or num_of_finished_scans == 2:
 			if len(tu.src_phrase) == 0 or len(tu.trg_phrase) == 0:
-				return
+				return [0]
 
 			index = -1
 			src_vectors = []
@@ -141,7 +141,7 @@ def process_tu(self, tu, num_of_finished_scans):
 					src_vectors.append(None)
 
 			if index == -1:
-				return
+				return [0]
 
 			index = -1
 			trg_vectors = []
@@ -153,7 +153,7 @@ def process_tu(self, tu, num_of_finished_scans):
 					trg_vectors.append(None)
 
 			if index == -1:
-				return
+				return [0]
 
 			avg_distance = 0.0
 			counter = 0.0
@@ -162,7 +162,7 @@ def process_tu(self, tu, num_of_finished_scans):
 				t_w = align_pair[1]
 
 				if s_w >= len(src_vectors) or t_w >= len(trg_vectors):
-					return
+					return [0]
 				if src_vectors[s_w] is None or trg_vectors[t_w] is None:
 					continue
 				dist = cosine(src_vectors[s_w], trg_vectors[t_w])
@@ -171,13 +171,15 @@ def process_tu(self, tu, num_of_finished_scans):
 				counter += 1
 
 			if counter == 0:
-				return
+				return [0]
 			avg_distance /= counter
 
 			self.n += 1
 			self.sum += avg_distance
 			self.sum_sq += avg_distance * avg_distance
 
+			return [avg_distance]
+
 		elif num_of_finished_scans == 0:
 			self.all_words += tu.src_tokens
 			self.all_words += tu.trg_tokens
diff --git a/main.py b/main.py
@@ -1,4 +1,5 @@
 from tm_manager import *
+import sys
 """
 TMoP - Translation Memory Open-Source Purifier by Matteo Negri, Masoud Jalili Sabet and Marco Turchi, October 2015
 
@@ -32,6 +33,9 @@
 """
 
 if __name__ == "__main__":
-	manager = TMManager()
+	config_file = ""
+	if len(sys.argv) > 1:
+		config_file = sys.argv[1]
+	manager = TMManager(config_file)
 
 	manager.run()
diff --git a/tm_manager.py b/tm_manager.py
@@ -47,7 +47,7 @@ class TMManager:
 	"""
 
 	#
-	def __init__(self):
+	def __init__(self, conf_file_name=""):
 		# Adding filter folder to path. after this filters can import 'AbstractFilter'.
 		sys.path.append(os.getcwd() + '/filters/')
 		sys.path.append(os.getcwd() + '/policies/')
@@ -72,6 +72,8 @@ def __init__(self):
 		self.have_token = False
 		self.create_out_files = True
 
+		self.config_file_name = conf_file_name
+
 	#
 	def load_options_from_config_file(self):
 		"""
@@ -84,7 +86,10 @@ def load_options_from_config_file(self):
 
 		print "Loading Options fro the config file ...\n"
 
-		conf_file = open("config.json")
+		if self.config_file_name:
+			conf_file = open(self.config_file_name)
+		else:
+			conf_file = open("config.json")
 		try:
 			config = json.load(conf_file)
 		except ValueError: