Skip to content

Commit 9677b48

Browse files
authored
Update GetNumWords to use utf-8 encoding (#109)
* Update GetNumWords to use utf-8 encoding

  This change fixes the following error:

  ```
  Traceback (most recent call last):
    File "/home/sourya4/kaldi/egs/tamil_telugu_proj/s5_r3/../../../tools/pocolm/scripts/prepare_int_data.py", line 168, in <module>
      num_words = GetNumWords(args.vocab)
    File "/home/sourya4/kaldi/egs/tamil_telugu_proj/s5_r3/../../../tools/pocolm/scripts/prepare_int_data.py", line 75, in GetNumWords
      universal_newlines=True)
    File "/usr/lib/python3.6/subprocess.py", line 356, in check_output
      **kwargs).stdout
    File "/usr/lib/python3.6/subprocess.py", line 425, in run
      stdout, stderr = process.communicate(input, timeout=timeout)
    File "/usr/lib/python3.6/subprocess.py", line 850, in communicate
      stdout = self.stdout.read()
    File "/usr/lib/python3.6/encodings/ascii.py", line 26, in decode
      return codecs.ascii_decode(input, self.errors)[0]
  UnicodeDecodeError: 'ascii' codec can't decode byte 0xe0 in position 0: ordinal not in range(128)
  # exited with return code 1 after 0.3 seconds
  ```

* Update GetNumWords to use utf-8 encoding

  This change fixes the following error:

  ```
  Traceback (most recent call last):
    File "/home/sourya4/kaldi/egs/tamil_telugu_proj/s5_r3/../../../tools/pocolm/scripts/prune_lm_dir.py", line 613, in <module>
      num_words = GetNumWords(args.lm_dir_in)
    File "/home/sourya4/kaldi/egs/tamil_telugu_proj/s5_r3/../../../tools/pocolm/scripts/prune_lm_dir.py", line 220, in GetNumWords
      universal_newlines=True)
    File "/usr/lib/python3.6/subprocess.py", line 356, in check_output
      **kwargs).stdout
    File "/usr/lib/python3.6/subprocess.py", line 425, in run
      stdout, stderr = process.communicate(input, timeout=timeout)
    File "/usr/lib/python3.6/subprocess.py", line 850, in communicate
      stdout = self.stdout.read()
    File "/usr/lib/python3.6/encodings/ascii.py", line 26, in decode
      return codecs.ascii_decode(input, self.errors)[0]
  UnicodeDecodeError: 'ascii' codec can't decode byte 0xe0 in position 0: ordinal not in range(128)
  ```
1 parent 76f2807 commit 9677b48

File tree

2 files changed

+4
-2
lines changed

2 files changed

+4
-2
lines changed

scripts/prepare_int_data.py

+2-1
Original file line number | Diff line number | Diff line change
@@ -72,7 +72,8 @@ def GetNumWords(vocab):
7272
command = "tail -n 1 {0}".format(vocab)
7373
line = subprocess.check_output(command,
7474
shell=True,
75-
universal_newlines=True)
75+
universal_newlines=True,
76+
encoding='utf-8')
7677
try:
7778
a = line.split()
7879
assert len(a) == 2

scripts/prune_lm_dir.py

+2-1
Original file line number | Diff line number | Diff line change
@@ -217,7 +217,8 @@ def GetNumWords(lm_dir_in):
217217
command = "tail -n 1 {0}/words.txt".format(lm_dir_in)
218218
line = subprocess.check_output(command,
219219
shell=True,
220-
universal_newlines=True)
220+
universal_newlines=True,
221+
encoding='utf-8')
221222
try:
222223
a = line.split()
223224
assert len(a) == 2

0 commit comments

Comments
 (0)