-
Notifications
You must be signed in to change notification settings - Fork 0
/
Snakefile.bs
181 lines (150 loc) · 6.39 KB
/
Snakefile.bs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
## Bootstrapping
from os.path import exists
# Defaults for the bootstrap stage.  ``cnf`` is defined elsewhere in the
# workflow; the three names registered here (BOOTWORK, SEED, ITERS) are used
# as plain globals by the functions and rules below.
cnf("BOOTWORK", WORK + "/bootstrap")
cnf("SEED", "42")
cnf("ITERS", "100000")

# Variant names found on disk: one entry per directory under GUESS whose name
# matches the pattern.  Each pattern has exactly one wildcard, hence the
# single-element unpacking of the glob_wildcards() result.
(AWE_NN_VAR,) = glob_wildcards(GUESS + "/awe_nn.{awe_nn_var,[^/]+}/")
(SUPWSD_VAR,) = glob_wildcards(GUESS + "/supwsd.{supwsd_var,[^/]+}.x1st/")
(LESK_VAR,) = glob_wildcards(GUESS + "/lesk.{lesk_var,[^/]+}/")
(LESK_PP_VAR,) = glob_wildcards(GUESS + "/lesk_pp.{lesk_pp_var,[^/]+}/")
# Utils
def nicks_of_selected(selected):
    """Return the nicks of every experiment matched by a selection.

    ``selected`` is iterated as ``(label, filter)`` pairs.  The label is
    ignored; each filter is passed to ``SnakeMake.get_nicks`` and the nicks it
    produces are collected into one flat list, in iteration order.  Duplicates
    are kept.
    """
    # The loop variable is deliberately not called ``filter`` so that the
    # builtin of that name is not shadowed.
    return [
        nick
        for _label, exp_filter in selected
        for nick in SnakeMake.get_nicks(exp_filter)
    ]
def cmp_inputs(wildcards):
from wsdeval.selectedexps import UNSUP_SELECTED, SUP_SELECTED, XLING_SELECTED
def sup_var():
train, test = wildcards.cmp_var.split("-")
return train + "-train/" + test + "-dev.pkl"
def unsup_var():
test = wildcards.cmp_var
return test + "-dev.pkl"
if wildcards.cmp == "over":
result = []
# Add supervised
result.extend((
BOOTWORK + "/resamples/" + nick + "/" + training_corpus + "-train/{cmp_var,[^/]+}.pkl"
for nick in nicks_of_selected(SUP_SELECTED)
for training_corpus in CORPUS_NAMES
))
# Add xling
result.extend((
BOOTWORK + "/resamples/" + nick + "/semcor/{cmp_var,[^/]+}.pkl"
for nick in nicks_of_selected(XLING_SELECTED)
))
# Add unsupervised
result.extend((
BOOTWORK + "/resamples/" + nick + "/{cmp_var,[^/]+}.pkl"
for nick in nicks_of_selected(UNSUP_SELECTED)
))
return result
elif wildcards.cmp == "awe_nn":
train, test = wildcards.cmp_var.split("-")
return [BOOTWORK + "/resamples/{cmp,[^/\.]+}." + var + "/" + sup_var() for var in AWE_NN_VAR]
elif wildcards.cmp == "supwsd":
return [BOOTWORK + "/resamples/{cmp,[^/\.]+}." + var + ".x1st/" + sup_var() for var in SUPWSD_VAR]
elif wildcards.cmp == "lesk_pp":
return [BOOTWORK + "/resamples/{cmp,[^/\.]+}." + var + "/" + unsup_var() for var in LESK_PP_VAR]
else: # lesk
freq = ".freq" in wildcards.cmp_var
if freq:
corpus = wildcards.cmp_var.replace(".freq", "")
vars = [var for var in LESK_VAR if ".freq" in var]
else:
corpus = wildcards.cmp_var
vars = [var for var in LESK_VAR if ".freq" not in var]
return [BOOTWORK + "/resamples/{cmp,[^/\.]+}." + var + "/" + corpus + "-dev.pkl" for var in vars]
# Top levels
def over_boot_results():
    """Yield the ``over`` comparison databases for every corpus and segment.

    For each name in ``CORPUS_NAMES`` paired with each entry of
    ``TEST_SEGMENT`` two paths under ``BOOTWORK + "/over"`` are produced: the
    ``nsd-best-`` database first, then the ``cld-`` one.
    """
    for corpus in CORPUS_NAMES:
        for segment in TEST_SEGMENT:
            for prefix in ("/over/nsd-best-", "/over/cld-"):
                yield BOOTWORK + prefix + corpus + "-" + segment + ".db"
def awe_boot_results():
    """Yield the ``awe_nn`` ``nsd-best`` database for every train/test pair.

    Both the training and the test corpus range over the entries obtained by
    iterating ``CORPUS_DIR_MAP``; the training corpus varies slowest.
    """
    yield from (
        BOOTWORK + "/awe_nn/nsd-best-" + train + "-" + test + ".db"
        for train in CORPUS_DIR_MAP
        for test in CORPUS_DIR_MAP
    )
def supwsd_boot_results():
    """Yield the ``supwsd`` ``nsd-best`` database for every train/test pair.

    Both the training and the test corpus range over the entries obtained by
    iterating ``CORPUS_DIR_MAP``; the training corpus varies slowest.
    """
    out_prefix = BOOTWORK + "/supwsd/nsd-best-"
    for train in CORPUS_DIR_MAP:
        for test in CORPUS_DIR_MAP:
            yield out_prefix + train + "-" + test + ".db"
def lesk_boot_results():
    """Yield the ``lesk`` ``nsd-best`` databases.

    All corpora from ``CORPUS_DIR_MAP`` are emitted with the ``.freq`` marker
    first, then all of them again without it.
    """
    for marker in (".freq", ""):
        yield from (
            BOOTWORK + "/lesk/nsd-best-" + corpus + marker + ".db"
            for corpus in CORPUS_DIR_MAP
        )
def lesk_pp_boot_results():
    """Yield one ``lesk_pp`` ``nsd-best`` database per entry of ``CORPUS_DIR_MAP``."""
    out_prefix = BOOTWORK + "/lesk_pp/nsd-best-"
    yield from (out_prefix + corpus + ".db" for corpus in CORPUS_DIR_MAP)
# Aggregate target: requests every path yielded by over_boot_results().
# It declares no output and no command of its own.
rule bootstrap_over:
    input:
        list(over_boot_results())
# Aggregate target: requests every path yielded by awe_boot_results().
# It declares no output and no command of its own.
rule bootstrap_awe_dev:
    input:
        list(awe_boot_results())
# Aggregate target: requests every path yielded by supwsd_boot_results().
# It declares no output and no command of its own.
rule bootstrap_supwsd_dev:
    input:
        list(supwsd_boot_results())
# Aggregate target: requests every path yielded by lesk_boot_results().
# It declares no output and no command of its own.
rule bootstrap_lesk_dev:
    input:
        list(lesk_boot_results())
# Aggregate target: requests every path yielded by lesk_pp_boot_results().
# It declares no output and no command of its own.
rule bootstrap_lesk_pp_dev:
    input:
        list(lesk_pp_boot_results())
# Aggregate target combining the over, awe_nn, supwsd and lesk results, in
# that order.
# NOTE(review): lesk_pp_boot_results() is not part of this list although a
# separate bootstrap_lesk_pp_dev target exists -- confirm whether the omission
# is intentional.
rule bootstrap_all:
    input:
        [
            *over_boot_results(),
            *awe_boot_results(),
            *supwsd_boot_results(),
            *lesk_boot_results(),
        ]
# Schedule
# Creates a resampling schedule pickle for one gold key file of a test corpus
# by running the `sigtest create-schedule` subcommand of scripts/expc.py with
# the configured SEED and ITERS.
rule create_schedule:
    input:
        # The CORPUS_DIR_MAP entry is called with no arguments to obtain the
        # corpus directory; the gold key path is appended to it.
        lambda wildcards: CORPUS_DIR_MAP[wildcards.test_corpus]() + "/" + wildcards.gold_key
    output:
        BOOTWORK + "/schedules/{test_corpus,[^-/.]+}/{gold_key,.+}.pkl"
    shell:
        "python scripts/expc.py sigtest create-schedule" +
        " --seed " + SEED +
        " --iters " + ITERS +
        " {input} {output}"
# Bootstrapping sup
# Resamples the results of one supervised experiment (nick + training corpus
# segment) on one test corpus segment, using the schedule built for that
# segment's corpus.sup.key file.
rule resample_sup:
    input:
        schedule=BOOTWORK + "/schedules/{test_corpus}/{test_seg}/corpus.sup.key.pkl",
        gold=lambda wildcards: CORPUS_DIR_MAP[wildcards.test_corpus]() + "/{test_seg}/corpus.sup.key",
    output:
        BOOTWORK + "/resamples/{nick,[^-/]+}/{train_corpusseg,[^/.]+}/{test_corpus,[^-/.]+}-{test_seg,[^-/.]+}.pkl"
    run:
        plain_base = wildcards.nick + "/" + wildcards.train_corpusseg
        # When the plain directory is missing under GUESS, the same name with
        # an "f" appended is used instead.
        # NOTE(review): the meaning of the "f" suffix is not visible here --
        # confirm against how the guess directories are named.
        train_base = plain_base if exists(GUESS + "/" + plain_base) else plain_base + "f"
        guess_arg = GUESS + "/" + train_base + "/{wildcards.test_corpus}-{wildcards.test_seg} "
        results_arg = RESULTS + "/" + train_base + "/{wildcards.test_corpus}-{wildcards.test_seg}.db"
        shell(
            "python scripts/expc.py sigtest resample" +
            " {output} {input.gold} " +
            guess_arg +
            results_arg +
            " {input.schedule}"
        )
# Bootstrapping unsup
# Resamples the results of one unsupervised experiment (identified by nick
# alone) on one test corpus segment, using the schedule built for that
# segment's corpus.sup.key file.
rule resample_unsup:
    input:
        schedule=BOOTWORK + "/schedules/{test_corpus}/{test_seg}/corpus.sup.key.pkl",
        gold=lambda wildcards: CORPUS_DIR_MAP[wildcards.test_corpus]() + "/{test_seg}/corpus.sup.key",
    output:
        BOOTWORK + "/resamples/{nick,[^/]+}/{test_corpus,[^-/.]+}-{test_seg,[^-/.]+}.pkl"
    shell:
        "python scripts/expc.py sigtest resample" +
        " {output} {input.gold} " +
        GUESS + "/{wildcards.nick}/{wildcards.test_corpus}-{wildcards.test_seg} " +
        RESULTS + "/{wildcards.nick}/{wildcards.test_corpus}-{wildcards.test_seg}.db" +
        " {input.schedule} "
# Compare f1s
# Runs the `sigtest compare-resampled` subcommand of scripts/expc.py on all
# resample pickles selected by cmp_inputs() and writes one comparison
# database per (cmp, cmp_var) pair.
rule compare_f1s:
    input:
        cmp_inputs
    output:
        # Raw string: "\." is not a valid escape sequence in an ordinary
        # literal; the pattern text itself is unchanged.
        BOOTWORK + r"/{cmp,[^/\.]+}/cmp-{cmp_var,[^/]+}.db"
    shell:
        "python scripts/expc.py sigtest compare-resampled {input} {output}"
# Get info about NSD from best
# Derives the nsd-best database from the matching cmp database by running the
# `sigtest nsd-from-best` subcommand of scripts/expc.py.
rule nsd_from_best:
    input:
        # Raw strings: "\." is not a valid escape sequence in an ordinary
        # literal; the pattern text itself is unchanged.
        BOOTWORK + r"/{cmp,[^/\.]+}/cmp-{cmp_var,[^/]+}.db"
    output:
        BOOTWORK + r"/{cmp,[^/\.]+}/nsd-best-{cmp_var,[^/]+}.db"
    shell:
        "python scripts/expc.py sigtest nsd-from-best {input} {output}"
# Get CLD
# Derives the cld database from the matching cmp database by running the
# `sigtest cld` subcommand of scripts/expc.py.
rule cld:
    input:
        # Raw strings: "\." is not a valid escape sequence in an ordinary
        # literal; the pattern text itself is unchanged.
        BOOTWORK + r"/{cmp,[^/\.]+}/cmp-{cmp_var,[^/]+}.db"
    output:
        BOOTWORK + r"/{cmp,[^/\.]+}/cld-{cmp_var,[^/]+}.db"
    shell:
        "python scripts/expc.py sigtest cld {input} {output}"