-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbuild_corpora.sh
executable file
·33 lines (29 loc) · 1.69 KB
/
build_corpora.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
#!/bin/bash
#
# Build the corpus files under CTBsegs/ (Chinese Treebank) and UDsegs/
# (Universal Dependencies Chinese). Run from the directory that contains
# build_ctb.py, tools/ and ctb8.0/ (all paths below are relative).

# Print an error message on stderr and abort the whole script.
# Arguments: $* - message text
die() {
  printf 'build_corpora: %s\n' "$*" >&2
  exit 1
}

echo "Building CTB files..."
echo " ...splitting into train, dev, test, and all..."
# split into train, dev, test, and all
# (presumably writes CTBsegs/ctb-{train,dev,test}-simp-segmented.txt, which
# are read below -- confirm against build_ctb.py)
python3 build_ctb.py || die "build_ctb.py failed"

echo " ...concatenating corpora..."
mkdir -p CTBsegs || die "cannot create CTBsegs/"
# NOTE(review): the comment above says build_ctb.py also produces an "all"
# split; this concatenation (re)creates the "all" file regardless -- confirm
# which of the two is meant to win.
cat ctb8.0/data/segmented/*.txt > CTBsegs/ctb-all-simp-segmented.txt \
  || die "cannot concatenate ctb8.0/data/segmented/*.txt"

# Running text = segmented text with every space removed.
for split in train dev test all; do
  sed 's/ //g' \
    < "CTBsegs/ctb-${split}-simp-segmented.txt" \
    > "CTBsegs/ctb-${split}-simp-running.txt" \
    || die "cannot build CTBsegs/ctb-${split}-simp-running.txt"
done

echo " ...transliterating... (simp2trad)..."
# Only the "all" split gets a traditional-character version.
for form in segmented running; do
  python3 tools/simp2trad.py \
    < "CTBsegs/ctb-all-simp-${form}.txt" \
    > "CTBsegs/ctb-all-trad-${form}.txt" \
    || die "simp2trad failed for CTBsegs/ctb-all-simp-${form}.txt"
done
echo "done!"
echo "Building UD files..."

# Directory holding the UD Chinese .conllu files. Defaults to the location
# this script has always read from; set UD_CHINESE_DIR to override.
ud_dir=~/repos/UD_Chinese
if [[ -n "${UD_CHINESE_DIR:-}" ]]; then
  ud_dir="${UD_CHINESE_DIR}"
fi

# Print the spaceless raw sentences of a CoNLL-U file, one per line.
# Arguments: $1 - path to a .conllu file
# Outputs:   the value of every "# text = ..." comment, with spaces removed
ud_running_text() {
  sed -n 's/^# text = //p' < "$1" | tr -d ' '
}

# Print the segmented tokens of a CoNLL-U file.
# Arguments: $1 - path to a .conllu file
# Outputs:   column 2 of every tab-separated token line, each followed by a
#            single space, all on one line with no trailing newline
# Every comment line (anything starting with '#') and every line without a
# tab is skipped, so comments other than "# text"/"# sent_id" no longer leak
# into the output.
# NOTE(review): sentence boundaries are dropped here (the whole file becomes
# one line), unlike the running-text output; kept as-is because downstream
# consumers may rely on it -- confirm.
ud_segmented_text() {
  awk -F '\t' '!/^#/ && NF >= 2 { printf "%s ", $2 }' < "$1"
}

# The output directory is not created anywhere else in this script.
mkdir -p UDsegs

for conllu in "${ud_dir}"/*.conllu; do
  # An unmatched glob stays literal; report it instead of feeding it to sed.
  if [[ ! -f "${conllu}" ]]; then
    echo " warning: no .conllu files found in ${ud_dir}" >&2
    continue
  fi
  # zh-ud-train.conllu -> UDsegs/UD-train ; derived from the file name only,
  # so output always lands in UDsegs/ whatever the input directory is called.
  base="${conllu##*/}"
  base="${base%.conllu}"
  stem="UDsegs/UD${base#zh-ud}"
  # extract spaceless sentences
  ud_running_text "${conllu}" > "${stem}-trad-running.txt"
  # extract segmented sentences
  ud_segmented_text "${conllu}" > "${stem}-trad-segmented.txt"
done

echo " ...transliterating... (trad2simp)..."
for subcorp in train test dev; do
  for form in running segmented; do
    src="UDsegs/UD-${subcorp}-trad-${form}.txt"
    # Skip (rather than emit an empty simplified file) when the input is absent.
    if [[ ! -f "${src}" ]]; then
      echo " warning: ${src} not found; skipping" >&2
      continue
    fi
    python3 tools/trad2simp.py < "${src}" > "UDsegs/UD-${subcorp}-simp-${form}.txt"
  done
done
echo "done!"