-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMakefile
101 lines (76 loc) · 3.92 KB
/
Makefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# eval, repro and clean name actions, not files. repro only runs a script and
# never creates a file called "repro", so it must be phony as well.
.PHONY: eval repro clean
# With no prerequisites, .SECONDARY marks every target as secondary, so files
# built as intermediate steps of a pattern-rule chain are not deleted.
.SECONDARY:
# Delete a target whose recipe exits non-zero, so a half-written file (for
# example a failed download) is not treated as up to date on the next run.
.DELETE_ON_ERROR:
# Date stamp of the Wikipedia dump snapshot used in the URLs and file names below.
# To track the newest dump instead, the URL would have this shape:
# LANG_WIKI_DUMP_URL = https://dumps.wikimedia.org/LANGwiki/latest/LANGwiki-latest-pages-articles.xml.bz2
LATEST_DUMP = 20241201

# Per-language URLs of the pages-articles dump for the snapshot above.
CS_WIKI_DUMP_URL = https://dumps.wikimedia.org/cswiki/$(LATEST_DUMP)/cswiki-$(LATEST_DUMP)-pages-articles.xml.bz2
PL_WIKI_DUMP_URL = https://dumps.wikimedia.org/plwiki/$(LATEST_DUMP)/plwiki-$(LATEST_DUMP)-pages-articles.xml.bz2
RU_WIKI_DUMP_URL = https://dumps.wikimedia.org/ruwiki/$(LATEST_DUMP)/ruwiki-$(LATEST_DUMP)-pages-articles.xml.bz2
SK_WIKI_DUMP_URL = https://dumps.wikimedia.org/skwiki/$(LATEST_DUMP)/skwiki-$(LATEST_DUMP)-pages-articles.xml.bz2
UK_WIKI_DUMP_URL = https://dumps.wikimedia.org/ukwiki/$(LATEST_DUMP)/ukwiki-$(LATEST_DUMP)-pages-articles.xml.bz2
# Build the .frqwl and .ipa.wls files for uk, sk, pl, cs and ru, then run
# optimize_params.py at reduced scheduling priority.
eval: $(addprefix work/,$(addsuffix .frqwl,uk sk pl cs ru) $(addsuffix .ipa.wls,uk sk pl cs ru))
	nice python optimize_params.py
# Build the same .frqwl and .ipa.wls inputs as eval (uk, sk, pl, cs, ru), then
# run evaluate_data_mix.py.
repro: $(addprefix work/,$(addsuffix .frqwl,uk sk pl cs ru) $(addsuffix .ipa.wls,uk sk pl cs ru))
	python evaluate_data_mix.py
# Derive work/<lang>.ipa.wls from work/<lang>.ipa.wlh by deleting every '-'
# character from the input.
work/%.ipa.wls: work/%.ipa.wlh
	< $< tr -d '-' > $@
# Build the wlh2ipawlh converter with cargo in release mode, with RUSTFLAGS
# targeting the host CPU, then create the work/ipacache directory.
# NOTE(review): work/ipacache is presumably the converter's cache directory;
# confirm against the wlh2ipawlh sources.
wlh2ipawlh/target/release/wlh2ipawlh:
	# `&&` rather than `;`: if the cd fails, cargo must not run in the wrong directory.
	cd wlh2ipawlh && RUSTFLAGS="-C target-cpu=native -C opt-level=3 -C codegen-units=1" cargo build --release
	mkdir -p work/ipacache
# Convert work/<lang>.wlh into work/<lang>.ipa.wlh with the wlh2ipawlh binary,
# at reduced scheduling priority.
work/%.ipa.wlh: work/%.wlh wlh2ipawlh/target/release/wlh2ipawlh
	# `make clean` empties work/ (removing work/ipacache) but keeps the converter
	# binary, so the directory is recreated here rather than only when the binary
	# is built. NOTE(review): assumes the converter uses work/ipacache; confirm.
	mkdir -p work/ipacache
	nice ./wlh2ipawlh/target/release/wlh2ipawlh $< $@
# Run wiki2frqwl.py over the extracted wiki directory work/<lang>wikidir (the
# directory holding the .extraction_complete stamp) to write work/<lang>.frqwl.
work/%.frqwl: work/%wikidir/.extraction_complete
	python wiki2frqwl.py $(<D) $@
# sk gets its own rule so that frqwl2wls.py is called with --len=10000; all
# other languages go through the generic work/%.wls rule without that option.
work/sk.wls: work/sk.frqwl
	python frqwl2wls.py --len=10000 $@ $<
# Generic conversion: frqwl2wls.py takes the output .wls path first and the
# input .frqwl path second.
work/%.wls: work/%.frqwl
	python frqwl2wls.py $@ $<
# Run wikiextractor on the decompressed dump, writing into work/<lang>wikidir.
# The .extraction_complete stamp inside that directory is touched only after
# wikiextractor has finished, and is what dependent rules key on.
work/%wikidir/.extraction_complete: work/%wiki-$(LATEST_DUMP)-pages-articles.xml
	wikiextractor -o $(@D) $<
	touch $@
# Decompress a .xml.bz2 archive to the .xml file next to it. bzip2 is called
# without a keep option, so the output name is derived from the input name.
%.xml: %.xml.bz2
	bzip2 --decompress $<
# Download the plwiki pages-articles dump from PL_WIKI_DUMP_URL.
# The transfer goes to a temporary name and is renamed only when wget
# succeeds, so a failed download never leaves a truncated archive that make
# would consider up to date.
work/plwiki-$(LATEST_DUMP)-pages-articles.xml.bz2:
	mkdir -p $(@D)
	wget $(PL_WIKI_DUMP_URL) -O $@.tmp && mv $@.tmp $@
# Download the skwiki pages-articles dump from SK_WIKI_DUMP_URL.
# The transfer goes to a temporary name and is renamed only when wget
# succeeds, so a failed download never leaves a truncated archive that make
# would consider up to date.
work/skwiki-$(LATEST_DUMP)-pages-articles.xml.bz2:
	mkdir -p $(@D)
	wget $(SK_WIKI_DUMP_URL) -O $@.tmp && mv $@.tmp $@
# Download the ukwiki pages-articles dump from UK_WIKI_DUMP_URL.
# The transfer goes to a temporary name and is renamed only when wget
# succeeds, so a failed download never leaves a truncated archive that make
# would consider up to date.
work/ukwiki-$(LATEST_DUMP)-pages-articles.xml.bz2:
	mkdir -p $(@D)
	wget $(UK_WIKI_DUMP_URL) -O $@.tmp && mv $@.tmp $@
# Download the ruwiki pages-articles dump from RU_WIKI_DUMP_URL.
# The transfer goes to a temporary name and is renamed only when wget
# succeeds, so a failed download never leaves a truncated archive that make
# would consider up to date.
work/ruwiki-$(LATEST_DUMP)-pages-articles.xml.bz2:
	mkdir -p $(@D)
	wget $(RU_WIKI_DUMP_URL) -O $@.tmp && mv $@.tmp $@
# Download the cswiki pages-articles dump from CS_WIKI_DUMP_URL.
# The transfer goes to a temporary name and is renamed only when wget
# succeeds, so a failed download never leaves a truncated archive that make
# would consider up to date.
work/cswiki-$(LATEST_DUMP)-pages-articles.xml.bz2:
	mkdir -p $(@D)
	wget $(CS_WIKI_DUMP_URL) -O $@.tmp && mv $@.tmp $@
# Generic download for any language code: the stem ($*) selects the
# <lang>wiki directory on dumps.wikimedia.org and $(@F) is the archive's file
# name. The transfer goes to a temporary name and is renamed only when wget
# succeeds, so a failed download never leaves a truncated archive behind.
work/%wiki-$(LATEST_DUMP)-pages-articles.xml.bz2:
	mkdir -p $(@D)
	wget https://dumps.wikimedia.org/$*wiki/$(LATEST_DUMP)/$(@F) -O $@.tmp && mv $@.tmp $@
# sh patterns are named differently (sh-latn) and the Serbo-Croatian Wikipedia
# uses Latin script, so sh has its own rule instead of the generic work/%.wlh.
# The pattern file is fetched once into work/ (via a temporary name, so a
# failed download does not leave an empty file that passes the -f test).
# hyph.py output is written to $@.tmp and renamed only on success.
work/sh.wlh: work/sh.wls
	@if [ ! -f work/hyph-sh-latn.tex ]; then \
		wget https://raw.githubusercontent.com/hyphenation/tex-hyphen/master/hyph-utf8/tex/generic/hyph-utf8/patterns/tex/hyph-sh-latn.tex -O work/hyph-sh-latn.tex.tmp \
		&& mv work/hyph-sh-latn.tex.tmp work/hyph-sh-latn.tex; \
	fi
	python hyph.py work/hyph-sh-latn.tex $< > $@.tmp && mv $@.tmp $@
# Fetch the csskhyphen.pat pattern file used by the sk and cs .wlh rules.
# The download is skipped when the file already exists, and goes through a
# temporary name so a failed transfer does not leave an empty pattern file.
work/csskhyphen.pat:
	@if [ ! -f $@ ]; then \
		wget https://raw.githubusercontent.com/tensojka/cshyphen/master/csskhyphen.pat -O $@.tmp \
		&& mv $@.tmp $@; \
	fi
# Run hyph.py with the csskhyphen.pat patterns over the sk word list.
# Output goes to $@.tmp and is renamed only if hyph.py succeeds, so a failed
# run never leaves a partial work/sk.wlh behind.
work/sk.wlh: work/sk.wls work/csskhyphen.pat
	python hyph.py work/csskhyphen.pat $< > $@.tmp && mv $@.tmp $@
# Run hyph.py with the csskhyphen.pat patterns over the cs word list.
# Output goes to $@.tmp and is renamed only if hyph.py succeeds, so a failed
# run never leaves a partial work/cs.wlh behind.
work/cs.wlh: work/cs.wls work/csskhyphen.pat
	python hyph.py work/csskhyphen.pat $< > $@.tmp && mv $@.tmp $@
# Generic rule: fetch hyph-<lang>.tex from the tex-hyphen repository unless it
# is already in work/, then run hyph.py with those patterns over the word list.
# Both the download and the hyph.py output go through temporary names that are
# renamed only on success, so failures do not leave partial files behind.
work/%.wlh: work/%.wls
	@if [ ! -f work/hyph-$*.tex ]; then \
		wget https://raw.githubusercontent.com/hyphenation/tex-hyphen/master/hyph-utf8/tex/generic/hyph-utf8/patterns/tex/hyph-$*.tex -O work/hyph-$*.tex.tmp \
		&& mv work/hyph-$*.tex.tmp work/hyph-$*.tex; \
	fi
	python hyph.py work/hyph-$*.tex $< > $@.tmp && mv $@.tmp $@
# Run parse_ground_truth.py over the ukwiktionary dump and capture its stdout.
# The script is a prerequisite, so editing it triggers a rebuild. Output is
# written to $@.tmp and renamed only on success (as the .wlh rules do), so a
# crashing script cannot leave a truncated ground-truth file.
groundtruth/uk-full-wiktionary.wlh: work/ukwiktionary-20240920-pages-articles.xml parse_ground_truth.py
	python parse_ground_truth.py $< > $@.tmp && mv $@.tmp $@
# Download the 20240920 ukwiktionary dump and decompress it in place.
# work/ is created first, because this rule can be the first one to run in a
# fresh checkout (the other download rules create it the same way).
work/ukwiktionary-20240920-pages-articles.xml:
	mkdir -p $(@D)
	wget -O $@.bz2 https://dumps.wikimedia.org/ukwiktionary/20240920/$(@F).bz2
	bzip2 -d $@.bz2
# Remove everything matched by work/* (downloads, extracted wikis and derived
# lists); the work/ directory itself is kept.
clean:
	rm -r -f work/*