@@ -27,11 +27,14 @@ def get_ngram_count(words, verbose = ""):
27
27
28
28
## end of initialization
29
29
30
- if len (sys .argv ) != 2 :
30
+ if not len (sys .argv ) in [ 2 , 3 , 4 ] :
31
31
print u"You have to enter token sequence to check in quotes"
32
32
print u"\t e.g. \" New York Rangers Jaromír Jágr\" "
33
+ print u"\t the 2nd argument is first correct n-gram e.g. \" New York Rangers\" "
33
34
sys .exit (1 )
34
35
36
+ ##print "#test#sequence bi_dice ngram_dice bi_mi ngram_mi diff4 diff1"
37
+
35
38
tokens = sys .argv [1 ].decode ("utf-8" ).split (" " )
36
39
37
40
sequence_count = get_ngram_count (tokens )
@@ -80,14 +83,14 @@ def get_ngram_count(words, verbose = ""):
80
83
print "NEG-1: %i" % (real1_neg + real2_neg )
81
84
print
82
85
83
- if low_remove > (pos1_neg + pos2_neg ):
84
- low_remove = pos1_neg + pos2_neg
85
- low_remove_idx = i
86
-
87
- if low_remove4 > (pos1 + pos2 - pos1_neg - pos2_neg ):
88
- low_remove4 = (pos1 + pos2 - pos1_neg - pos2_neg )
86
+ if low_remove4 > (pos1_neg + pos2_neg ):
87
+ low_remove4 = pos1_neg + pos2_neg
89
88
low_remove4_idx = i
90
89
90
+ if low_remove > (pos1 + pos2 - pos1_neg - pos2_neg ):
91
+ low_remove = (pos1 + pos2 - pos1_neg - pos2_neg )
92
+ low_remove_idx = i
93
+
91
94
### logDice
92
95
if (pos1 + pos2 == 0 ) or ((2.0 * sequence_count / (pos1 + pos2 )) == 0 ):
93
96
print "logDice (n-gram): N/A"
@@ -157,5 +160,11 @@ def get_ngram_count(words, verbose = ""):
157
160
print "MIN bigram MI: %s %s" % (tokens [0 :low_bigram_mi_idx + 1 ], tokens [low_bigram_mi_idx + 1 :])
158
161
print "MAX ngram MI: %s %s" % (tokens [0 :high_ngram_mi_idx + 1 ], tokens [high_ngram_mi_idx + 1 :])
159
162
160
- print "DIFF4 ngram-remove: %s %s" % (tokens [0 :low_remove_idx + 1 ], tokens [low_remove_idx + 1 :])
161
- print "DIFF1 ngram-remove: %s %s" % (tokens [0 :low_remove4_idx + 1 ], tokens [low_remove4_idx + 1 :])
163
+ print "DIFF4 ngram-remove: %s %s" % (tokens [0 :low_remove4_idx + 1 ], tokens [low_remove4_idx + 1 :])
164
+ print "DIFF1 ngram-remove: %s %s" % (tokens [0 :low_remove_idx + 1 ], tokens [low_remove_idx + 1 :])
165
+
166
+ ## test output against data (sys.argv[2])
167
+ tokens_ngram1_test = sys .argv [2 ].decode ("utf-8" ).split (" " )
168
+ count = len (tokens_ngram1_test ) - 1 # we want last index that is inside
169
+
170
+ print u"#test# '%s' %d %d %d %d %d %d" % (sys .argv [1 ].decode ("utf-8" ), int (count == low_bigram_dice_idx ), int (count == high_ngram_dice_idx ), int (count == low_bigram_mi_idx ), int (count == high_ngram_mi_idx ), int (count == low_remove4_idx ), int (count == low_remove_idx ))
0 commit comments