danpovey
diff --git a/‎egs/swbd/local/map_acronyms_transcripts.py
+19-8 b/‎egs/swbd/local/map_acronyms_transcripts.py
+19-8
diff --git a/‎scripts/check_dependencies.sh
+5-18 b/‎scripts/check_dependencies.sh
+5-18
diff --git a/‎scripts/cleanup_count_dir.py
+19-10 b/‎scripts/cleanup_count_dir.py
+19-10
diff --git a/‎scripts/cleanup_int_dir.py
+16-8 b/‎scripts/cleanup_int_dir.py
+16-8
@@ -1,14 +1,27 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 # convert acronyms in swbd transcript to fisher convention
-# accoring to first two columns in the input acronyms mapping
+# according to first two columns in the input acronyms mapping
 
 import argparse
 import re
+import sys
 __author__ = 'Minhua Wu'
+
+# If the encoding of the default sys.stdout is not utf-8,
+# force it to be utf-8. See PR #95.
+if hasattr(sys.stdout, 'encoding') and sys.stdout.encoding.lower() != "utf-8":
+    import codecs
+    sys.stdout = codecs.getwriter("utf-8")(sys.stdout.detach())
+    sys.stderr = codecs.getwriter("utf-8")(sys.stderr.detach())
+    sys.stdin = codecs.getreader("utf-8")(sys.stdin.detach())
+
 parser = argparse.ArgumentParser(description='format acronyms to a._b._c.')
 parser.add_argument('-i', '--input', help='Input transcripts', required=True)
 parser.add_argument('-o', '--output', help='Output transcripts', required=True)
-parser.add_argument('-M', '--Map', help='Input acronyms mapping', required=True)
+parser.add_argument('-M',
+                    '--Map',
+                    help='Input acronyms mapping',
+                    required=True)
 args = parser.parse_args()
 
 fin_map = open(args.Map, "r")
@@ -22,7 +35,6 @@
 del dict_acronym_noi['I']
 del dict_acronym_noi['i']
 
-
 fin_trans = open(args.input, "r")
 fout_trans = open(args.output, "w")
 for line in fin_trans:
@@ -32,15 +44,15 @@
     for i in range(L):
         if items[i] == 'I':
             x = 0
-            while(i-1-x >= 0 and re.match(r'^[A-Z]$', items[i-1-x])):
+            while (i - 1 - x >= 0 and re.match(r'^[A-Z]$', items[i - 1 - x])):
                 x += 1
             y = 0
-            while(i+1+y < L and re.match(r'^[A-Z]$', items[i+1+y])):
+            while (i + 1 + y < L and re.match(r'^[A-Z]$', items[i + 1 + y])):
                 y += 1
 
-            if x+y > 0:
-                for bias in range(-x, y+1):
-                    items[i+bias] = dict_acronym[items[i+bias]]
+            if x + y > 0:
+                for bias in range(-x, y + 1):
+                    items[i + bias] = dict_acronym[items[i + bias]]
     # Second pass mapping (not mapping 'i' and 'I')
     for i in range(len(items)):
         if items[i] in dict_acronym_noi.keys():
 
@@ -41,29 +41,16 @@ if ! which awk >&/dev/null; then
   add_packages gawk gawk gawk
 fi
 
-if which python >&/dev/null ; then
-  version=`/usr/bin/env python 2>&1 --version | awk '{print $2}' `
-  if [[ $version != "2.7"* && $version != "3."* ]] ; then
-    status=1
-    if which python2.7 >&/dev/null ; then
-      echo "$0: python 2.7 is not the default python (lower version python does not "
-      echo "$0: have packages that are required by pocolm). You should make it default"
-    else
-      echo "$0: python 2.7 is not installed"
-      add_packages python2.7 python2.7 python2.7
-    fi
-  fi
-
-else
-  echo "$0: python 2.7 is not installed"
-  add_packages python2.7 python2.7 python2.7
+if ! which python3 >&/dev/null ; then
+  echo "$0: python3 is not installed"
+  add_packages python3 python3 python3
 fi
 
-if ! python -c 'import numpy' >&/dev/null; then
+if ! python3 -c 'import numpy' >&/dev/null; then
-  echo "$0: python-numpy is not installed"
+  echo "$0: numpy is not installed for python3"
   # I'm not sure if this package name is OK for all distributions, this is what
   # it seems to be called on Debian.  We'll have to investigate this.
-  add_packages numpy python-numpy python-numpy
+  add_packages numpy python3-numpy python3-numpy
 fi
 
 printed=false
 
@@ -1,18 +1,26 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 
-# we're using python 3.x style print but want it to work in python 2.x,
+# the script is python 3 only now; this import is a harmless leftover no-op
 from __future__ import print_function
 import os
 import argparse
 import sys
 
-parser = argparse.ArgumentParser(description="Cleanup the largish files. "
-                                 "This may be called when the counts no longer useful.",
-                                 epilog="E.g. cleanup_count_dir.py data/lm/work/counts_20000_3",
-                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+# If the encoding of the default sys.stdout is not utf-8,
+# force it to be utf-8. See PR #95.
+if hasattr(sys.stdout, 'encoding') and sys.stdout.encoding.lower() != "utf-8":
+    import codecs
+    sys.stdout = codecs.getwriter("utf-8")(sys.stdout.detach())
+    sys.stderr = codecs.getwriter("utf-8")(sys.stderr.detach())
+    sys.stdin = codecs.getreader("utf-8")(sys.stdin.detach())
 
-parser.add_argument("count_dir",
-                    help="Directory to cleanup")
+parser = argparse.ArgumentParser(
+    description="Cleanup the largish files. "
+    "This may be called when the counts no longer useful.",
+    epilog="E.g. cleanup_count_dir.py data/lm/work/counts_20000_3",
+    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+
+parser.add_argument("count_dir", help="Directory to cleanup")
 
 args = parser.parse_args()
 
@@ -34,12 +42,12 @@ def CleanupDir(count_dir, ngram_order, num_train_sets):
 if os.system("validate_count_dir.py " + args.count_dir) != 0:
     sys.exit("command validate_count_dir.py {0} failed".format(args.count_dir))
 
-f = open(os.path.join(args.count_dir, 'ngram_order'))
+f = open(os.path.join(args.count_dir, 'ngram_order'), encoding="utf-8")
 line = f.readline()
 ngram_order = int(line)
 f.close()
 
-f = open(os.path.join(args.count_dir, 'num_train_sets'))
+f = open(os.path.join(args.count_dir, 'num_train_sets'), encoding="utf-8")
 line = f.readline()
 num_train_sets = int(line)
 f.close()
@@ -50,7 +58,8 @@ def CleanupDir(count_dir, ngram_order, num_train_sets):
 # find split-dir and cleanup
 entities = os.listdir(args.count_dir)
 for dirname in entities:
-    if os.path.isdir(os.path.join(args.count_dir, dirname)) and dirname[0:5] == 'split':
+    if os.path.isdir(os.path.join(args.count_dir,
+                                  dirname)) and dirname[0:5] == 'split':
         for n in range(1, int(dirname[5:]) + 1):
             count_dir = os.path.join(args.count_dir, dirname, str(n))
             if os.path.isdir(count_dir):
 
@@ -1,18 +1,26 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 
-# we're using python 3.x style print but want it to work in python 2.x,
+# the script is python 3 only now; this import is a harmless leftover no-op
 from __future__ import print_function
 import os
 import argparse
 import sys
 
-parser = argparse.ArgumentParser(description="Cleanup the largish files. "
-                                 "This may be called when the ints no longer useful.",
-                                 epilog="E.g. cleanup_int_dir.py data/lm/work/int_20000",
-                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+# If the encoding of the default sys.stdout is not utf-8,
+# force it to be utf-8. See PR #95.
+if hasattr(sys.stdout, 'encoding') and sys.stdout.encoding.lower() != "utf-8":
+    import codecs
+    sys.stdout = codecs.getwriter("utf-8")(sys.stdout.detach())
+    sys.stderr = codecs.getwriter("utf-8")(sys.stderr.detach())
+    sys.stdin = codecs.getreader("utf-8")(sys.stdin.detach())
 
-parser.add_argument("int_dir",
-                    help="Directory in which to find the data")
+parser = argparse.ArgumentParser(
+    description="Cleanup the largish files. "
+    "This may be called when the ints no longer useful.",
+    epilog="E.g. cleanup_int_dir.py data/lm/work/int_20000",
+    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+
+parser.add_argument("int_dir", help="Directory in which to find the data")
 
 args = parser.parse_args()
 
@@ -22,7 +30,7 @@
 if os.system("validate_int_dir.py " + args.int_dir) != 0:
     sys.exit("command validate_int_dir.py {0} failed".format(args.int_dir))
 
-f = open(os.path.join(args.int_dir, 'num_train_sets'))
+f = open(os.path.join(args.int_dir, 'num_train_sets'), encoding="utf-8")
 line = f.readline()
 num_train_sets = int(line)
 f.close()