-
Notifications
You must be signed in to change notification settings - Fork 14
/
Copy pathheldout_sample.py
49 lines (39 loc) · 1.59 KB
/
heldout_sample.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
#!/usr/bin/python3
# coding: utf-8
import sys
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic
from convert2synsets import calc_similarity
if __name__ == '__main__':
    # Usage: <script> METHOD CORPUS < word_pairs.tsv > synset_similarities.tsv
    #
    # Reads tab-separated "word0<TAB>word1<TAB>sim" rows from stdin (lines
    # starting with '#' are comments), collects every noun synset of the words
    # seen, and prints to stdout one "synset<TAB>other_synset<TAB>similarity"
    # row for each collected synset paired with every other WordNet noun
    # synset. Progress and clipping notices go to stderr.
    if len(sys.argv) < 3:
        # Fail with a readable message instead of a bare IndexError.
        sys.exit('Usage: %s METHOD CORPUS < word_pairs.tsv' % sys.argv[0])
    method = sys.argv[1]  # jcn or lch
    corpus = sys.argv[2]  # semcor or brown

    maxval = 1000.0  # This value will be assigned to extremely high-similarity pairs (like 1e+300)
    minval = 0.0001  # Similarities below this threshold are clipped to 0.0

    ic = wordnet_ic.ic('ic-%s.dat' % corpus)

    words = set()
    for line in sys.stdin:
        line = line.strip()
        # Skip blank lines (e.g. a trailing empty line) and comment lines.
        if not line or line.startswith('#'):
            continue
        # Each data row must have exactly three tab-separated fields; a
        # malformed row still raises ValueError. The third field is not used.
        word0, word1, _sim = line.split('\t')
        words.add(word0.strip())
        words.add(word1.strip())

    # Gather all noun synsets for every word in the input.
    synsets = set()
    for w in words:
        synsets.update(wn.synsets(w, 'n'))

    print('%d synsets produced from %d words' % (len(synsets), len(words)), file=sys.stderr)
    all_synsets = list(wn.all_synsets('n'))

    # Iterate in name order so the output is reproducible between runs
    # (plain set iteration order is not stable).
    for synset in sorted(synsets, key=lambda syn: syn.name()):
        print('Calculating similarities for', synset, file=sys.stderr)
        for other in all_synsets:
            if other == synset:
                continue  # never pair a synset with itself
            pair = (synset, other)
            similarity = calc_similarity(pair, method, ic)
            if similarity > maxval:
                print('Clipped similarity to %f' % maxval, pair, similarity, file=sys.stderr)
                similarity = maxval
            elif similarity < minval:
                print('Clipped similarity to 0.0', pair, similarity, file=sys.stderr)
                similarity = 0.0
            print('\t'.join((synset.name(), other.name(), str(similarity))))