Skip to content

Commit 6c6d3e8

Browse files
huangruizhe and Piotr Żelasko authored
Updates to PR #95 (#106)
* UTF-8 in validate_vocab
* Change all shebangs to require python 3
* Force utf-8 encoding for every file opened in Python
* Update python2.7 -> python3 requirement; best guess about package names
* Fix bad encoding kwarg insertions
* Fixes in automatic replacements
* enforced the encoding of sys.stdout, sys.stdin, sys.stderr
* no need for the default python to be python3 any more

Co-authored-by: Piotr Żelasko <[email protected]>
1 parent 4f9e134 commit 6c6d3e8

33 files changed

+2525
-1548
lines changed

egs/swbd/local/map_acronyms_transcripts.py

+19-8
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,26 @@
1-
#!/usr/bin/env python
1+
#!/usr/bin/env python3
22
# convert acronyms in swbd transcript to fisher convention
33
# according to first two columns in the input acronyms mapping
44

55
import argparse
66
import re
77
__author__ = 'Minhua Wu'
8+
9+
# If the encoding of the default sys.stdout is not utf-8,
10+
# force it to be utf-8. See PR #95.
11+
if hasattr(sys.stdout, 'encoding') and sys.stdout.encoding.lower() != "utf-8":
12+
import codecs
13+
sys.stdout = codecs.getwriter("utf-8")(sys.stdout.detach())
14+
sys.stderr = codecs.getwriter("utf-8")(sys.stderr.detach())
15+
sys.stdin = codecs.getreader("utf-8")(sys.stdin.detach())
16+
817
parser = argparse.ArgumentParser(description='format acronyms to a._b._c.')
918
parser.add_argument('-i', '--input', help='Input transcripts', required=True)
1019
parser.add_argument('-o', '--output', help='Output transcripts', required=True)
11-
parser.add_argument('-M', '--Map', help='Input acronyms mapping', required=True)
20+
parser.add_argument('-M',
21+
'--Map',
22+
help='Input acronyms mapping',
23+
required=True)
1224
args = parser.parse_args()
1325

1426
fin_map = open(args.Map, "r")
@@ -22,7 +34,6 @@
2234
del dict_acronym_noi['I']
2335
del dict_acronym_noi['i']
2436

25-
2637
fin_trans = open(args.input, "r")
2738
fout_trans = open(args.output, "w")
2839
for line in fin_trans:
@@ -32,15 +43,15 @@
3243
for i in range(L):
3344
if items[i] == 'I':
3445
x = 0
35-
while(i-1-x >= 0 and re.match(r'^[A-Z]$', items[i-1-x])):
46+
while (i - 1 - x >= 0 and re.match(r'^[A-Z]$', items[i - 1 - x])):
3647
x += 1
3748
y = 0
38-
while(i+1+y < L and re.match(r'^[A-Z]$', items[i+1+y])):
49+
while (i + 1 + y < L and re.match(r'^[A-Z]$', items[i + 1 + y])):
3950
y += 1
4051

41-
if x+y > 0:
42-
for bias in range(-x, y+1):
43-
items[i+bias] = dict_acronym[items[i+bias]]
52+
if x + y > 0:
53+
for bias in range(-x, y + 1):
54+
items[i + bias] = dict_acronym[items[i + bias]]
4455
# Second pass mapping (not mapping 'i' and 'I')
4556
for i in range(len(items)):
4657
if items[i] in dict_acronym_noi.keys():

scripts/check_dependencies.sh

+5-18
Original file line numberDiff line numberDiff line change
@@ -41,29 +41,16 @@ if ! which awk >&/dev/null; then
4141
add_packages gawk gawk gawk
4242
fi
4343

44-
if which python >&/dev/null ; then
45-
version=`/usr/bin/env python 2>&1 --version | awk '{print $2}' `
46-
if [[ $version != "2.7"* && $version != "3."* ]] ; then
47-
status=1
48-
if which python2.7 >&/dev/null ; then
49-
echo "$0: python 2.7 is not the default python (lower version python does not "
50-
echo "$0: have packages that are required by pocolm). You should make it default"
51-
else
52-
echo "$0: python 2.7 is not installed"
53-
add_packages python2.7 python2.7 python2.7
54-
fi
55-
fi
56-
57-
else
58-
echo "$0: python 2.7 is not installed"
59-
add_packages python2.7 python2.7 python2.7
44+
if ! which python3 >&/dev/null ; then
45+
echo "$0: python3 is not installed"
46+
add_packages python3 python3 python3
6047
fi
6148

62-
if ! python -c 'import numpy' >&/dev/null; then
49+
if ! python3 -c 'import numpy' >&/dev/null; then
6350
echo "$0: python-numpy is not installed"
6451
# I'm not sure if this package name is OK for all distributions, this is what
6552
# it seems to be called on Debian. We'll have to investigate this.
66-
add_packages numpy python-numpy python-numpy
53+
add_packages numpy python3-numpy python3-numpy
6754
fi
6855

6956
printed=false

scripts/cleanup_count_dir.py

+19-10
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,26 @@
1-
#!/usr/bin/env python
1+
#!/usr/bin/env python3
22

33
# we're using python 3.x style print but want it to work in python 2.x,
44
from __future__ import print_function
55
import os
66
import argparse
77
import sys
88

9-
parser = argparse.ArgumentParser(description="Cleanup the largish files. "
10-
"This may be called when the counts no longer useful.",
11-
epilog="E.g. cleanup_count_dir.py data/lm/work/counts_20000_3",
12-
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
9+
# If the encoding of the default sys.stdout is not utf-8,
10+
# force it to be utf-8. See PR #95.
11+
if hasattr(sys.stdout, 'encoding') and sys.stdout.encoding.lower() != "utf-8":
12+
import codecs
13+
sys.stdout = codecs.getwriter("utf-8")(sys.stdout.detach())
14+
sys.stderr = codecs.getwriter("utf-8")(sys.stderr.detach())
15+
sys.stdin = codecs.getreader("utf-8")(sys.stdin.detach())
1316

14-
parser.add_argument("count_dir",
15-
help="Directory to cleanup")
17+
parser = argparse.ArgumentParser(
18+
description="Cleanup the largish files. "
19+
"This may be called when the counts no longer useful.",
20+
epilog="E.g. cleanup_count_dir.py data/lm/work/counts_20000_3",
21+
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
22+
23+
parser.add_argument("count_dir", help="Directory to cleanup")
1624

1725
args = parser.parse_args()
1826

@@ -34,12 +42,12 @@ def CleanupDir(count_dir, ngram_order, num_train_sets):
3442
if os.system("validate_count_dir.py " + args.count_dir) != 0:
3543
sys.exit("command validate_count_dir.py {0} failed".format(args.count_dir))
3644

37-
f = open(os.path.join(args.count_dir, 'ngram_order'))
45+
f = open(os.path.join(args.count_dir, 'ngram_order'), encoding="utf-8")
3846
line = f.readline()
3947
ngram_order = int(line)
4048
f.close()
4149

42-
f = open(os.path.join(args.count_dir, 'num_train_sets'))
50+
f = open(os.path.join(args.count_dir, 'num_train_sets'), encoding="utf-8")
4351
line = f.readline()
4452
num_train_sets = int(line)
4553
f.close()
@@ -50,7 +58,8 @@ def CleanupDir(count_dir, ngram_order, num_train_sets):
5058
# find split-dir and cleanup
5159
entities = os.listdir(args.count_dir)
5260
for dirname in entities:
53-
if os.path.isdir(os.path.join(args.count_dir, dirname)) and dirname[0:5] == 'split':
61+
if os.path.isdir(os.path.join(args.count_dir,
62+
dirname)) and dirname[0:5] == 'split':
5463
for n in range(1, int(dirname[5:]) + 1):
5564
count_dir = os.path.join(args.count_dir, dirname, str(n))
5665
if os.path.isdir(count_dir):

scripts/cleanup_int_dir.py

+16-8
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,26 @@
1-
#!/usr/bin/env python
1+
#!/usr/bin/env python3
22

33
# we're using python 3.x style print but want it to work in python 2.x,
44
from __future__ import print_function
55
import os
66
import argparse
77
import sys
88

9-
parser = argparse.ArgumentParser(description="Cleanup the largish files. "
10-
"This may be called when the ints no longer useful.",
11-
epilog="E.g. cleanup_int_dir.py data/lm/work/int_20000",
12-
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
9+
# If the encoding of the default sys.stdout is not utf-8,
10+
# force it to be utf-8. See PR #95.
11+
if hasattr(sys.stdout, 'encoding') and sys.stdout.encoding.lower() != "utf-8":
12+
import codecs
13+
sys.stdout = codecs.getwriter("utf-8")(sys.stdout.detach())
14+
sys.stderr = codecs.getwriter("utf-8")(sys.stderr.detach())
15+
sys.stdin = codecs.getreader("utf-8")(sys.stdin.detach())
1316

14-
parser.add_argument("int_dir",
15-
help="Directory in which to find the data")
17+
parser = argparse.ArgumentParser(
18+
description="Cleanup the largish files. "
19+
"This may be called when the ints no longer useful.",
20+
epilog="E.g. cleanup_int_dir.py data/lm/work/int_20000",
21+
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
22+
23+
parser.add_argument("int_dir", help="Directory in which to find the data")
1624

1725
args = parser.parse_args()
1826

@@ -22,7 +30,7 @@
2230
if os.system("validate_int_dir.py " + args.int_dir) != 0:
2331
sys.exit("command validate_int_dir.py {0} failed".format(args.int_dir))
2432

25-
f = open(os.path.join(args.int_dir, 'num_train_sets'))
33+
f = open(os.path.join(args.int_dir, 'num_train_sets'), encoding="utf-8")
2634
line = f.readline()
2735
num_train_sets = int(line)
2836
f.close()

0 commit comments

Comments
 (0)