Skip to content

Commit a18f98d

Browse files
committed
[bugfix] Fix DIFF-4/DIFF-1 variables and add test output
1 parent 8eeb415 commit a18f98d

File tree

1 file changed

+18
-9
lines changed

1 file changed

+18
-9
lines changed

bin/corp-ngram.py

+18 -9
Original file line number | Diff line number | Diff line change
@@ -27,11 +27,14 @@ def get_ngram_count(words, verbose = ""):
2727

2828
## end of initialization
2929

30-
if len(sys.argv) != 2:
30+
if not len(sys.argv) in [2, 3, 4]:
3131
print u"You have to enter token sequence to check in quotes"
3232
print u"\te.g. \"New York Rangers Jaromír Jágr\""
33+
print u"\tthe 2nd argument is first correct n-gram e.g. \"New York Rangers\""
3334
sys.exit(1)
3435

36+
##print "#test#sequence bi_dice ngram_dice bi_mi ngram_mi diff4 diff1"
37+
3538
tokens = sys.argv[1].decode("utf-8").split(" ")
3639

3740
sequence_count = get_ngram_count(tokens)
@@ -80,14 +83,14 @@ def get_ngram_count(words, verbose = ""):
8083
print "NEG-1: %i" % (real1_neg + real2_neg)
8184
print
8285

83-
if low_remove > (pos1_neg + pos2_neg):
84-
low_remove = pos1_neg + pos2_neg
85-
low_remove_idx = i
86-
87-
if low_remove4 > (pos1 + pos2 - pos1_neg - pos2_neg):
88-
low_remove4 = (pos1 + pos2 - pos1_neg - pos2_neg)
86+
if low_remove4 > (pos1_neg + pos2_neg):
87+
low_remove4 = pos1_neg + pos2_neg
8988
low_remove4_idx = i
9089

90+
if low_remove > (pos1 + pos2 - pos1_neg - pos2_neg):
91+
low_remove = (pos1 + pos2 - pos1_neg - pos2_neg)
92+
low_remove_idx = i
93+
9194
### logDice
9295
if (pos1 + pos2 == 0) or ((2.0 * sequence_count / (pos1 + pos2)) == 0):
9396
print "logDice (n-gram): N/A"
@@ -157,5 +160,11 @@ def get_ngram_count(words, verbose = ""):
157160
print "MIN bigram MI: %s %s" % (tokens[0:low_bigram_mi_idx+1], tokens[low_bigram_mi_idx+1:])
158161
print "MAX ngram MI: %s %s" % (tokens[0:high_ngram_mi_idx+1], tokens[high_ngram_mi_idx+1:])
159162

160-
print "DIFF4 ngram-remove: %s %s" % (tokens[0:low_remove_idx+1], tokens[low_remove_idx+1:])
161-
print "DIFF1 ngram-remove: %s %s" % (tokens[0:low_remove4_idx+1], tokens[low_remove4_idx+1:])
163+
print "DIFF4 ngram-remove: %s %s" % (tokens[0:low_remove4_idx+1], tokens[low_remove4_idx+1:])
164+
print "DIFF1 ngram-remove: %s %s" % (tokens[0:low_remove_idx+1], tokens[low_remove_idx+1:])
165+
166+
## test output against data (sys.argv[2])
167+
tokens_ngram1_test = sys.argv[2].decode("utf-8").split(" ")
168+
count = len(tokens_ngram1_test) - 1 # we want last index that is inside
169+
170+
print u"#test# '%s' %d %d %d %d %d %d" % (sys.argv[1].decode("utf-8"), int(count == low_bigram_dice_idx), int(count == high_ngram_dice_idx), int(count == low_bigram_mi_idx), int(count == high_ngram_mi_idx), int(count == low_remove4_idx), int (count == low_remove_idx))

0 commit comments

Comments
 (0)