"
+ exit 1;
+fi
+
+if [ ! -s "$(which normalizer_main)" ] ; then
+ echo "Sparrowhawk normalizer is not installed!"
+ echo "Go to $KALDI_ROOT/tools and execute install_sparrowhawk.sh and try again!"
+ exit 1
+fi
+
+txtdir=$1
+textdir=$(realpath $txtdir)
+outdir=$(realpath $2)
+
+workdir=$outdir/tmp
+if [ $stage -le 0 ]; then
+ rm -rf $outdir
+ mkdir -p $workdir
+ mkdir -p $textdir/splits
+ mkdir -p $outdir/data
+ split -l 1000000 $textdir/in.txt $textdir/splits/out
+ numsplits=0
+ for x in $textdir/splits/*; do
+ numsplits=$((numsplits+1))
+ ln -s $x $outdir/data/$numsplits
+ done
+ echo $numsplits
+ cp $SPARROWHAWK_ROOT/documentation/grammars/sentence_boundary_exceptions.txt .
+ $train_cmd --max_jobs_run 100 JOB=1:$numsplits $outdir/sparrowhawk/log/JOB.log \
+ local/run_norm.sh \
+ sparrowhawk_configuration.ascii_proto \
+ $SPARROWHAWK_ROOT/language-resources/esp/sparrowhawk/ \
+ $outdir/data \
+ JOB \
+ $outdir/sparrowhawk/
+ cat $outdir/sparrowhawk/*.txt | sed "/^$/d" > $outdir/text_normalized
+
+ # Check whether any numbers remain in the normalized output
+ awk '{for(i=1;i<=NF;i++) {if (!seen[$i]) {print $i; seen[$i]=1} }}' \
+ $outdir/text_normalized > $outdir/unique_words
+ grep "[0-9]" $outdir/unique_words | sort -u > $outdir/numbers
+fi
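
A note on the number check above: the awk/grep pair collects the unique tokens of the normalized text and keeps those that still contain digits, i.e. anything Sparrowhawk failed to verbalize. A rough Python equivalent (illustration only, not part of the patch; "text_normalized" stands in for $outdir/text_normalized):

    # Collect unique tokens from the normalized text and keep those that still
    # contain digits -- leftovers the normalizer did not verbalize.
    seen, numbers = set(), set()
    with open("text_normalized", encoding="utf-8") as f:
        for line in f:
            for tok in line.split():
                if tok not in seen:
                    seen.add(tok)
                    if any(c.isdigit() for c in tok):
                        numbers.add(tok)
    print("\n".join(sorted(numbers)))
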
diff --git a/egs/fisher_callhome_spanish/s5/local/ctm.sh b/egs/fisher_callhome_spanish/s5/local/ctm.sh
index 62860a10b7b..7d09f574580 100755
--- a/egs/fisher_callhome_spanish/s5/local/ctm.sh
+++ b/egs/fisher_callhome_spanish/s5/local/ctm.sh
@@ -19,9 +19,9 @@ fi
steps/get_ctm.sh $data_dir $lang_dir $decode_dir
# Make sure that channel markers match
-#perl -i -pe "s:\s.*_fsp-([AB]): \1:g" data/dev/stm
-#ls exp/tri5a/decode_dev/score_*/dev.ctm | xargs -I {} perl -i -pe 's:fsp\s1\s:fsp A :g' {}
-#ls exp/tri5a/decode_dev/score_*/dev.ctm | xargs -I {} perl -i -pe 's:fsp\s2\s:fsp B :g' {}
+#sed -i "s:\s.*_fsp-([AB]): \1:g" data/dev/stm
+#ls exp/tri5a/decode_dev/score_*/dev.ctm | xargs -I {} sed -i -r 's:fsp\s1\s:fsp A :g' {}
+#ls exp/tri5a/decode_dev/score_*/dev.ctm | xargs -I {} sed -i -r 's:fsp\s2\s:fsp B :g' {}
# Get the environment variables
. /export/babel/data/software/env.sh
diff --git a/egs/fisher_callhome_spanish/s5/local/flatten_gigaword/flatten_all_gigaword.sh b/egs/fisher_callhome_spanish/s5/local/flatten_gigaword/flatten_all_gigaword.sh
new file mode 100755
index 00000000000..242359e7c28
--- /dev/null
+++ b/egs/fisher_callhome_spanish/s5/local/flatten_gigaword/flatten_all_gigaword.sh
@@ -0,0 +1,15 @@
+#!/usr/bin/env bash
+set -e
+
+# Path to Gigaword corpus with all data files decompressed.
+export GIGAWORDDIR=$1
+# The directory to write output to
+export OUTPUTDIR=$2
+# The number of jobs to run at once
+export NUMJOBS=$3
+
+echo "Flattening Gigaword with ${NUMJOBS} processes..."
+mkdir -p $OUTPUTDIR
+find ${GIGAWORDDIR}/data/*/* -type f -print -exec local/flatten_gigaword/run_flat.sh {} ${OUTPUTDIR} \;
+echo "Combining the flattened files into one..."
+cat ${OUTPUTDIR}/*.flat > ${OUTPUTDIR}/flattened_gigaword.txt
diff --git a/egs/fisher_callhome_spanish/s5/local/flatten_gigaword/flatten_one_gigaword.py b/egs/fisher_callhome_spanish/s5/local/flatten_gigaword/flatten_one_gigaword.py
new file mode 100644
index 00000000000..29f6766dd84
--- /dev/null
+++ b/egs/fisher_callhome_spanish/s5/local/flatten_gigaword/flatten_one_gigaword.py
@@ -0,0 +1,61 @@
+# -*- coding: utf-8 -*-
+
+import logging
+import os
+import re
+import spacy
+import gzip
+
+from argparse import ArgumentParser
+from bs4 import BeautifulSoup
+
+en_nlp = spacy.load("es")
+
+
+def flatten_one_gigaword_file(file_path):
+    f = gzip.open(file_path)
+    html = f.read()
+    # Parse the text with BeautifulSoup
+    soup = BeautifulSoup(html, "html.parser")
+
+    # Iterate over all items and get the text for each.
+    all_paragraphs = []
+    for paragraph in soup("p"):
+        # Turn inter-paragraph newlines into spaces
+        paragraph = paragraph.get_text()
+        paragraph = re.sub(r"\n+", "\n", paragraph)
+        paragraph = paragraph.replace("\n", " ")
+        # Tokenize the paragraph into words
+        tokens = en_nlp.tokenizer(paragraph)
+        words = [str(token) for token in tokens if not
+                 str(token).isspace()]
+        if len(words) < 3:
+            continue
+        all_paragraphs.append(words)
+    # Return a list of strings, where each string is a
+    # space-tokenized paragraph.
+    return [" ".join(paragraph) for paragraph in all_paragraphs]
+
+
+if __name__ == "__main__":
+ log_fmt = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+ logging.basicConfig(level=logging.INFO, format=log_fmt)
+ logger = logging.getLogger(__name__)
+
+ parser = ArgumentParser(description=("Flatten a gigaword data file for "
+ "use in language modeling."))
+ parser.add_argument("--gigaword-path", required=True,
+ metavar="", type=str,
+ help=("Path to Gigaword directory, with "
+ "all .gz files unzipped."))
+ parser.add_argument("--output-dir", required=True, metavar="",
+ type=str, help=("Directory to write final flattened "
+ "Gigaword file."))
+
+ A = parser.parse_args()
+ all_paragraphs = flatten_one_gigaword_file(A.gigaword_path)
+ output_path = os.path.join(A.output_dir,
+ os.path.basename(A.gigaword_path) + ".flat")
+ with open(output_path, "w") as output_file:
+ for paragraph in all_paragraphs:
+ output_file.write("{}\n".format(paragraph))
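
To make the flattening logic concrete, here is a small standalone sketch of what the BeautifulSoup/regex steps in flatten_one_gigaword.py do to one Gigaword-style document (the markup below is invented for illustration; the real script additionally tokenizes with spaCy and drops paragraphs shorter than three tokens):

    import re
    from bs4 import BeautifulSoup

    doc = '<DOC id="X1" type="story"><TEXT><P>El gobierno anuncio\nhoy nuevas medidas.</P></TEXT></DOC>'
    soup = BeautifulSoup(doc, "html.parser")
    for p in soup("p"):  # html.parser lowercases tag names, so <P> matches "p"
        text = p.get_text()
        text = re.sub(r"\n+", "\n", text).replace("\n", " ")
        print(text.strip())
    # -> El gobierno anuncio hoy nuevas medidas.
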
diff --git a/egs/fisher_callhome_spanish/s5/local/flatten_gigaword/run_flat.sh b/egs/fisher_callhome_spanish/s5/local/flatten_gigaword/run_flat.sh
new file mode 100755
index 00000000000..6b236be0ab9
--- /dev/null
+++ b/egs/fisher_callhome_spanish/s5/local/flatten_gigaword/run_flat.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+set -e
+
+. ./path_venv.sh
+
+# Path to Gigaword corpus with all data files decompressed.
+GIGAWORDPATH=$1
+# The directory to write output to
+OUTPUTDIR=$2
+file=$(basename ${GIGAWORDPATH})
+if [ ! -e ${OUTPUTDIR}/${file}.flat ]; then
+ echo "flattening to ${OUTPUTDIR}/${file}.flat"
+ python local/flatten_gigaword/flatten_one_gigaword.py --gigaword-path ${GIGAWORDPATH} --output-dir ${OUTPUTDIR}
+else
+ echo "skipping ${file}.flat"
+fi
+
diff --git a/egs/fisher_callhome_spanish/s5/local/fsp_data_prep.sh b/egs/fisher_callhome_spanish/s5/local/fsp_data_prep.sh
index 11d65da3e95..22b98a6c9db 100755
--- a/egs/fisher_callhome_spanish/s5/local/fsp_data_prep.sh
+++ b/egs/fisher_callhome_spanish/s5/local/fsp_data_prep.sh
@@ -133,6 +133,7 @@ if [ $stage -le 2 ]; then
sed 's:::g' | \
sed 's:foreign>::g' | \
+ sed 's:\[noise\]:[noise] :g' | \
sed 's:>::g' | \
#How do you handle numbers?
grep -v '()' | \
diff --git a/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh b/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh
index 779298305c4..7b2de2db392 100755
--- a/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh
+++ b/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh
@@ -105,8 +105,9 @@ if [ $stage -le 4 ]; then
cp "$tmpdir/lexicon.1" "$tmpdir/lexicon.2"
# Add prons for laughter, noise, oov
- w=$(grep -v sil $dir/silence_phones.txt | tr '\n' '|')
- perl -i -ne "print unless /\[(${w%?})\]/" $tmpdir/lexicon.2
+ for w in `grep -v sil $dir/silence_phones.txt`; do
+ sed -i "/\[$w\]/d" $tmpdir/lexicon.2
+ done
for w in `grep -v sil $dir/silence_phones.txt`; do
echo "[$w] $w"
diff --git a/egs/fisher_callhome_spanish/s5/local/get_data_weights.pl b/egs/fisher_callhome_spanish/s5/local/get_data_weights.pl
new file mode 100755
index 00000000000..ca5b2a46f8e
--- /dev/null
+++ b/egs/fisher_callhome_spanish/s5/local/get_data_weights.pl
@@ -0,0 +1,39 @@
+#!/usr/bin/env perl
+
+# Nagendra Kumar Goel
+
+# This takes two arguments:
+# 1) Pocolm training output folder
+# 2) rnnlm weights file name (for output)
+
+use POSIX;
+use List::Util qw[min max];
+
+if (@ARGV != 2) {
+ die "Usage: get_data_weights.pl \n";
+}
+
+$pdir = shift @ARGV;
+$out = shift @ARGV;
+
+open(P, "<$pdir/metaparameters") || die "Could not open $pdir/metaparameters";
+open(N, "<$pdir/names") || die "Could not open $pdir/names" ;
+open(O, ">$out") || die "Could not open $out for writing" ;
+
+my %scores = ();
+
+while(<N>) {
+ @n = split(/\s/,$_);
+ $name = $n[1];
+ $w = <P>;
+ @w = split(/\s/,$w);
+ $weight = $w[1];
+ $scores{$name} = $weight;
+}
+
+$min = min(values %scores);
+
+for(keys %scores) {
+ $weightout = POSIX::ceil($scores{$_} / $min);
+ print O "$_\t1\t$weightout\n";
+}
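
In other words, get_data_weights.pl turns pocolm's per-corpus metaparameter weights into integer repetition counts for RNNLM training: each corpus gets ceil(weight / min_weight). A tiny worked example (corpus names and weights below are made up; the real values come from the pocolm names and metaparameters files):

    import math

    # Hypothetical corpora and pocolm weights (the real ones are read from
    # $pocolm_dir/names and $pocolm_dir/metaparameters).
    weights = {"fisher": 0.45, "gigaword": 0.09}

    w_min = min(weights.values())
    for corpus, w in weights.items():
        # Same formula as the Perl above: POSIX::ceil($weight / $min).
        print("{}\t1\t{}".format(corpus, math.ceil(w / w_min)))
    # fisher    1   5
    # gigaword  1   1
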
diff --git a/egs/fisher_callhome_spanish/s5/local/get_rnnlm_wordlist.py b/egs/fisher_callhome_spanish/s5/local/get_rnnlm_wordlist.py
new file mode 100755
index 00000000000..fc13a7af701
--- /dev/null
+++ b/egs/fisher_callhome_spanish/s5/local/get_rnnlm_wordlist.py
@@ -0,0 +1,34 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+#
+# 2018 Saikiran Valluri, GoVivace inc.
+
+import os, sys
+
+if len(sys.argv) < 5:
+    print("Usage: python get_rnnlm_wordlist.py <lexicon-wordlist> <pocolm-wordlist> <rnnlm-wordlist-out> <oov-wordlist>")
+    sys.exit()
+
+lexicon_words = open(sys.argv[1], 'r', encoding="utf-8")
+pocolm_words = open(sys.argv[2], 'r', encoding="utf-8")
+rnnlm_wordsout = open(sys.argv[3], 'w', encoding="utf-8")
+oov_wordlist = open(sys.argv[4], 'w', encoding="utf-8")
+
+line_count=0
+lexicon=[]
+
+for line in lexicon_words:
+    lexicon.append(line.split()[0])
+    rnnlm_wordsout.write(line.split()[0] + " " + str(line_count) + '\n')
+    line_count = line_count + 1
+
+for line in pocolm_words:
+    if not line.split()[0] in lexicon:
+        oov_wordlist.write(line.split()[0] + '\n')
+        rnnlm_wordsout.write(line.split()[0] + " " + str(line_count) + '\n')
+        line_count = line_count + 1
+
+lexicon_words.close()
+pocolm_words.close()
+rnnlm_wordsout.close()
+oov_wordlist.close()
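
One practical remark on get_rnnlm_wordlist.py: `line.split()[0] in lexicon` scans a Python list, which becomes quadratic when the pocolm wordlist is large. A set-based sketch of the same merge, with the same argument order (an alternative illustration, not the script shipped in this patch):

    import sys

    def merge_wordlists(lexicon_path, pocolm_path, out_path, oov_path):
        # Same output as get_rnnlm_wordlist.py, but with O(1) membership tests.
        seen, idx = set(), 0
        with open(lexicon_path, encoding="utf-8") as lex, \
             open(pocolm_path, encoding="utf-8") as poco, \
             open(out_path, "w", encoding="utf-8") as out, \
             open(oov_path, "w", encoding="utf-8") as oov:
            for line in lex:
                word = line.split()[0]
                seen.add(word)
                out.write("{} {}\n".format(word, idx))
                idx += 1
            for line in poco:
                word = line.split()[0]
                if word not in seen:
                    oov.write(word + "\n")
                    out.write("{} {}\n".format(word, idx))
                    idx += 1

    if __name__ == "__main__":
        merge_wordlists(*sys.argv[1:5])
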
diff --git a/egs/fisher_callhome_spanish/s5/local/get_unigram_weights_vocab.py b/egs/fisher_callhome_spanish/s5/local/get_unigram_weights_vocab.py
new file mode 100644
index 00000000000..3ecd16772d7
--- /dev/null
+++ b/egs/fisher_callhome_spanish/s5/local/get_unigram_weights_vocab.py
@@ -0,0 +1,33 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+#
+# 2018 Saikiran Valluri, GoVivace inc.
+
+import os, sys
+
+if len(sys.argv) < 3:
+    print("Usage : python get_unigram_weights_vocab.py <pocolm-dir> <unigram-weights-out>")
+    print("  Generates the unigram weights for the second-pass vocabulary from the first-pass pocolm training metaparameters.")
+    sys.exit()
+
+pocolmdir=sys.argv[1]
+unigramwts=open(sys.argv[2], 'w')
+
+names = open(pocolmdir+"/names", 'r')
+metaparams = open(pocolmdir+"/metaparameters", 'r')
+
+name_mapper={}
+for line in names:
+    fields = line.split()
+    name_mapper[fields[0]] = fields[1]
+
+lns = metaparams.readlines()
+for lineno in range(len(name_mapper.keys())):
+    line = lns[lineno]
+    fileid = line.split()[0].split("_")[-1]
+    weight = line.split()[1]
+    unigramwts.write(name_mapper[fileid] + " " + weight + "\n")
+
+names.close()
+unigramwts.close()
+metaparams.close()
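
For concreteness, the pairing this script performs between pocolm's names and metaparameters files looks like the following (the file contents are invented for illustration; the real parameter names come out of train_lm.py):

    # names: "<integer-id> <corpus-name>" per line
    names_lines = ["1 fisher", "2 gigaword"]
    # metaparameters: the first len(names) lines carry the per-corpus weights
    metaparam_lines = ["count_scale_1 0.45", "count_scale_2 0.09", "order2_D1 0.81"]

    name_mapper = dict(line.split() for line in names_lines)
    for lineno in range(len(name_mapper)):
        fileid = metaparam_lines[lineno].split()[0].split("_")[-1]
        weight = metaparam_lines[lineno].split()[1]
        print(name_mapper[fileid], weight)
    # fisher 0.45
    # gigaword 0.09
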
diff --git a/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py b/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py
index c7aa6affb11..c7e0f140d2f 100755
--- a/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py
+++ b/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py
@@ -1,11 +1,14 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
+
# Copyright 2014 Gaurav Kumar. Apache 2.0
# 2018 Nagendra Kumar Goel, Saikiran Valluri, GoVivace inc., Avaaya
+
+# Merges unique words from Spanish Fisher, Gigaword and the LDC Spanish lexicon
from __future__ import print_function
-import sys, re
+import sys
+import re
import json
import codecs
import operator
diff --git a/egs/fisher_callhome_spanish/s5/local/pocolm_cust.sh b/egs/fisher_callhome_spanish/s5/local/pocolm_cust.sh
new file mode 100755
index 00000000000..0a5649c2a79
--- /dev/null
+++ b/egs/fisher_callhome_spanish/s5/local/pocolm_cust.sh
@@ -0,0 +1,120 @@
+#!/usr/bin/env bash
+
+# This script generates Pocolm-estimated language models with various
+# data sources in the data/text folder and places the output in data/lm.
+
+set -euo pipefail
+
+. ./path.sh
+
+export POCOLM_ROOT=$(cd $KALDI_ROOT/tools/pocolm/; pwd -P)
+export PATH=$PATH:$POCOLM_ROOT/scripts
+
+
+wordlist=None
+num_word=100000
+pocolm_stage=1
+ngram_order=3
+lm_dir=
+arpa_dir=
+textdir=
+max_memory='--max-memory=8G'
+
+. ./cmd.sh
+. ./utils/parse_options.sh
+
+
+# If you do not want to set a memory limit for "sort", you can use
+#max_memory=
+# Choices for the max-memory can be:
+# 1) integer + 'K', 'M', 'G', ...
+# 2) integer + 'b', meaning unit is byte and no multiplication
+# 3) integer + '%', meaning a percentage of memory
+# 4) integer, default unit is 'K'
+
+fold_dev_opt=
+# If you want to fold the dev-set in to the 'swbd1' set to produce the final
+# model, un-comment the following line. For use in the Kaldi example script for
+# ASR, this isn't suitable because the 'dev' set is the first 10k lines of the
+# switchboard data, which we also use as dev data for speech recognition
+# purposes.
+#fold_dev_opt="--fold-dev-into=swbd1"
+
+bypass_metaparam_optim_opt=
+# If you want to bypass the metaparameter optimization steps with specific metaparameters
+# un-comment the following line, and change the numbers to some appropriate values.
+# You can find the values from output log of train_lm.py.
+# These example metaparameter values are for a 3-gram model trained with train_lm.py.
+# the dev perplexity should be close to the non-bypassed model.
+#bypass_metaparam_optim_opt="--bypass-metaparameter-optimization=0.091,0.867,0.753,0.275,0.100,0.018,0.902,0.371,0.183,0.070"
+# Note: to use these example parameters, you may need to remove the .done files
+# to make sure make_lm_dir.py is called and trains only the 3-gram model.
+#for order in 3; do
+#rm -f ${lm_dir}/${num_word}_${order}.pocolm/.done
+
+#limit_unk_history_opt=
+# If you want to limit the left of <unk> in the history of an n-gram,
+# un-comment the following line
+limit_unk_history_opt="--limit-unk-history=true"
+
+for order in ${ngram_order}; do
+ # decide on the vocabulary.
+ # Note: you'd use --wordlist if you had a previously determined word-list
+ # that you wanted to use.
+ lm_name="${num_word}_${order}"
+ min_counts=''
+ # Note: the following might be a more reasonable setting:
+ # min_counts='fisher=2 swbd1=1'
+ if [ -n "${min_counts}" ]; then
+ lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`"
+ fi
+ unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm
+ train_lm.py --num-words=${num_word} --num-splits=5 --warm-start-ratio=10 ${max_memory} \
+ --min-counts=${min_counts} \
+ --keep-int-data=true ${fold_dev_opt} ${bypass_metaparam_optim_opt} \
+ ${limit_unk_history_opt} ${textdir} ${order} ${lm_dir}/work ${unpruned_lm_dir}
+
+ if [ $pocolm_stage -eq 2 ];then
+ mkdir -p ${arpa_dir}
+ format_arpa_lm.py ${max_memory} ${unpruned_lm_dir} | gzip -c > ${arpa_dir}/${lm_name}_${order}gram_unpruned.arpa.gz
+
+ # example of pruning. note: the threshold can be less than or more than one.
+ get_data_prob.py ${max_memory} ${textdir}/dev.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity'
+ for threshold in 1.0 2.0 4.0; do
+ pruned_lm_dir=${lm_dir}/${lm_name}_prune${threshold}.pocolm
+ prune_lm_dir.py --final-threshold=${threshold} ${max_memory} ${unpruned_lm_dir} ${pruned_lm_dir} 2>&1 | tail -n 5 | head -n 3
+ get_data_prob.py ${max_memory} ${textdir}/dev.txt ${pruned_lm_dir} 2>&1 | grep -F '[perplexity'
+
+ format_arpa_lm.py ${max_memory} ${pruned_lm_dir} | gzip -c > ${arpa_dir}/${lm_name}_${order}gram_prune${threshold}.arpa.gz
+
+ done
+
+ # example of pruning by size.
+ size=1000000
+ pruned_lm_dir=${lm_dir}/${lm_name}_prune${size}.pocolm
+ prune_lm_dir.py --target-num-ngrams=${size} ${max_memory} ${unpruned_lm_dir} ${pruned_lm_dir} 2>&1 | tail -n 8 | head -n 6 | grep -v 'log-prob changes'
+ get_data_prob.py ${textdir}/dev.txt ${max_memory} ${pruned_lm_dir} 2>&1 | grep -F '[perplexity'
+
+ format_arpa_lm.py ${max_memory} ${pruned_lm_dir} | gzip -c > ${arpa_dir}/${lm_name}_${order}gram_prune${size}.arpa.gz
+ fi
+done
+
+# (run local/srilm_baseline.sh ${num_word} to see the following result e.g. local/srilm_baseline.sh 40000 )
+
+# the following does some self-testing, including checking
+# that the computed derivatives are accurate.
+# local/self_test.sh
+
+# perplexities from pocolm-estimated language models with pocolm's interpolation
+# method from orders 3, 4, and 5 are:
+# order 3: optimize_metaparameters.py: final perplexity without barrier function was -4.358818 (perplexity: 78.164689)
+# order 4: optimize_metaparameters.py: final perplexity without barrier function was -4.309507 (perplexity: 74.403797)
+# order 5: optimize_metaparameters.py: final perplexity without barrier function was -4.301741 (perplexity: 73.828181)
+
+# note, the perplexities from pocolm-estimated language models with SRILM's
+# interpolation from orders 3 and 4 are (from local/pocolm_with_srilm_combination.sh),
+# 78.8449 and 75.2202 respectively.
+
+# note, the perplexities from SRILM-estimated language models with SRILM's
+# interpolation tool from orders 3 and 4 are (from local/srilm_baseline.sh),
+# 78.9056 and 75.5528 respectively.
diff --git a/egs/fisher_callhome_spanish/s5/local/rnnlm.sh b/egs/fisher_callhome_spanish/s5/local/rnnlm.sh
new file mode 100755
index 00000000000..3850910f312
--- /dev/null
+++ b/egs/fisher_callhome_spanish/s5/local/rnnlm.sh
@@ -0,0 +1,83 @@
+#!/bin/bash
+
+# Copyright 2012 Johns Hopkins University (author: Daniel Povey)
+# 2015 Guoguo Chen
+# 2017 Hainan Xu
+# 2017 Xiaohui Zhang
+
+# This script trains an RNNLM on the Fisher Spanish + Gigaword LM-training data.
+
+# rnnlm/train_rnnlm.sh: best iteration (out of 35) was 34, linking it to final iteration.
+# rnnlm/train_rnnlm.sh: train/dev perplexity was 41.9 / 50.0.
+# Train objf: -5.07 -4.43 -4.25 -4.17 -4.12 -4.07 -4.04 -4.01 -3.99 -3.98 -3.96 -3.94 -3.92 -3.90 -3.88 -3.87 -3.86 -3.85 -3.84 -3.83 -3.82 -3.81 -3.80 -3.79 -3.78 -3.78 -3.77 -3.77 -3.76 -3.75 -3.74 -3.73 -3.73 -3.72 -3.71
+# Dev objf: -10.32 -4.68 -4.43 -4.31 -4.24 -4.19 -4.15 -4.13 -4.10 -4.09 -4.05 -4.03 -4.02 -4.00 -3.99 -3.98 -3.98 -3.97 -3.96 -3.96 -3.95 -3.94 -3.94 -3.94 -3.93 -3.93 -3.93 -3.92 -3.92 -3.92 -3.92 -3.91 -3.91 -3.91 -3.91
+
+
+dir=Spanish_gigawrd/rnnlm
+pocolm_dir=Spanish_gigawrd/work_pocolm/lm/110000_3.pocolm_pruned
+wordslist=
+embedding_dim=1024
+lstm_rpd=256
+lstm_nrpd=256
+stage=0
+train_stage=-30
+text_dir=Spanish_gigawrd/text_lm
+
+. ./cmd.sh
+. ./utils/parse_options.sh
+
+mkdir -p $dir/config
+set -e
+
+for f in $text_dir/dev.txt; do
+ [ ! -f $f ] && \
+ echo "$0: expected file $f to exist;" && exit 1
+done
+
+if [ $stage -le 0 ]; then
+ if [ -f $text_dir/unigram_weights ] ; then
+ mv $text_dir/unigram_weights $pocolm_dir/
+ fi
+ cp $wordslist $dir/config/words.txt
+ n=`cat $dir/config/words.txt | wc -l`
+ echo " $n" >> $dir/config/words.txt
+
+ # Words that are not present in words.txt but are in the training or dev data will be
+ # mapped to <unk> during training.
+ echo "<unk>" >$dir/config/oov.txt
+ local/get_data_weights.pl $pocolm_dir $dir/config/data_weights.txt
+ rnnlm/get_unigram_probs.py --vocab-file=$dir/config/words.txt \
+ --unk-word="" \
+ --data-weights-file=$dir/config/data_weights.txt \
+ $text_dir | awk 'NF==2' >$dir/config/unigram_probs.txt
+
+ # choose features
+ rnnlm/choose_features.py --unigram-probs=$dir/config/unigram_probs.txt \
+ --use-constant-feature=true \
+ --special-words='<s>,</s>,<brk>,<unk>,[noise],[laughter]' \
+ $dir/config/words.txt > $dir/config/features.txt
+fi
+
+if [ $stage -le 1 ]; then
+ cat >$dir/config/xconfig <<EOF
+ input dim=$embedding_dim name=input
+ relu-renorm-layer name=tdnn1 dim=$embedding_dim input=Append(0, IfDefined(-1))
+ fast-lstmp-layer name=lstm1 cell-dim=$embedding_dim recurrent-projection-dim=$lstm_rpd non-recurrent-projection-dim=$lstm_nrpd
+ relu-renorm-layer name=tdnn2 dim=$embedding_dim input=Append(0, IfDefined(-3))
+ fast-lstmp-layer name=lstm2 cell-dim=$embedding_dim recurrent-projection-dim=$lstm_rpd non-recurrent-projection-dim=$lstm_nrpd
+ relu-renorm-layer name=tdnn3 dim=$embedding_dim input=Append(0, IfDefined(-3))
+ output-layer name=output include-log-softmax=false dim=$embedding_dim
+EOF
+ rnnlm/validate_config_dir.sh $text_dir $dir/config
+fi
+
+if [ $stage -le 2 ]; then
+ rnnlm/prepare_rnnlm_dir.sh $text_dir $dir/config $dir
+fi
+
+if [ $stage -le 3 ]; then
+ rnnlm/train_rnnlm.sh --num-jobs-initial 1 --num-jobs-final 2 \
+ --stage $train_stage --num-epochs 5 --cmd "$train_cmd" $dir
+fi
+
+exit 0
diff --git a/egs/fisher_callhome_spanish/s5/local/rnnlm/train_rnnlm.sh b/egs/fisher_callhome_spanish/s5/local/rnnlm/train_rnnlm.sh
deleted file mode 100755
index 3713fe228d6..00000000000
--- a/egs/fisher_callhome_spanish/s5/local/rnnlm/train_rnnlm.sh
+++ /dev/null
@@ -1,101 +0,0 @@
-#!/bin/bash
-
-# Copyright 2012 Johns Hopkins University (author: Daniel Povey) Tony Robinson
-# 2017 Hainan Xu
-# 2017 Ke Li
-
-# This script is similar to rnnlm_lstm_tdnn_a.sh except for adding L2 regularization.
-
-# local/rnnlm/train_rnnlm.sh: best iteration (out of 18) was 17, linking it to final iteration.
-# local/rnnlm/train_rnnlm.sh: train/dev perplexity was 45.6 / 68.7.
-# Train objf: -651.50 -4.44 -4.26 -4.15 -4.08 -4.03 -4.00 -3.97 -3.94 -3.92 -3.90 -3.89 -3.88 -3.86 -3.85 -3.84 -3.83 -3.82
-# Dev objf: -10.76 -4.68 -4.47 -4.38 -4.33 -4.29 -4.28 -4.27 -4.26 -4.26 -4.25 -4.24 -4.24 -4.24 -4.23 -4.23 -4.23 -4.23
-
-# Begin configuration section.
-dir=exp/rnnlm_lstm_tdnn_1b
-embedding_dim=200
-embedding_l2=0.005 # embedding layer l2 regularize
-comp_l2=0.005 # component-level l2 regularize
-output_l2=0.005 # output-layer l2 regularize
-epochs=90
-mic=
-stage=-10
-train_stage=0
-
-. ./cmd.sh
-. ./utils/parse_options.sh
-[ -z "$cmd" ] && cmd=$train_cmd
-
-train=data/train/text
-dev=data/dev2/text # We at no stage in run.sh should decode dev2 partition for results!
-wordlist=data/lang/words.txt
-text_dir=data/local/rnnlm/text
-mkdir -p $dir/config
-set -e
-
-for f in $train $dev $wordlist; do
- [ ! -f $f ] && \
- echo "$0: expected file $f to exist; search for run.sh and utils/prepare_lang.sh in run.sh" && exit 1
-done
-
-if [ $stage -le 0 ]; then
- mkdir -p $text_dir
- cat $train | cut -d ' ' -f2- > $text_dir/ami.txt
- cat $dev | cut -d ' ' -f2- > $text_dir/dev.txt
-fi
-
-if [ $stage -le 1 ]; then
- cp $wordlist $dir/config/
- n=`cat $dir/config/words.txt | wc -l`
- echo " $n" >> $dir/config/words.txt
-
- # words that are not present in words.txt but are in the training or dev data, will be
- # mapped to <unk> during training.
- echo "<unk>" >$dir/config/oov.txt
-
- cat > $dir/config/data_weights.txt <<EOF
-ami   1   1.0
-EOF
-
- rnnlm/get_unigram_probs.py --vocab-file=$dir/config/words.txt \
- --unk-word="<unk>" \
- --data-weights-file=$dir/config/data_weights.txt \
- $text_dir | awk 'NF==2' >$dir/config/unigram_probs.txt
-
- # choose features
- rnnlm/choose_features.py --unigram-probs=$dir/config/unigram_probs.txt \
- --use-constant-feature=true \
- --top-word-features 10000 \
- --min-frequency 1.0e-03 \
- --special-words='<s>,</s>,<brk>,<unk>,[noise],[laughter]' \
- $dir/config/words.txt > $dir/config/features.txt
-
-lstm_opts="l2-regularize=$comp_l2"
-tdnn_opts="l2-regularize=$comp_l2"
-output_opts="l2-regularize=$output_l2"
-
- cat >$dir/config/xconfig < $dir/normalize/$job/substitute.sh
+
+bash $dir/normalize/$job/substitute.sh | \
+ sed "s: 's:'s:g" | sed "s: 'm:'m:g" | \
+ sed "s: \s*: :g" > $dir/normalize/$job/text
+
+local/clean_abbrevs_text.py $dir/normalize/$job/text $data/"$job"_processed
+tr 'A-ZÂÁÀÄÊÉÈËÏÍÎÖÓÔÖÚÙÛÑÇ' 'a-zâáàäêéèëïíîöóôöúùûñç' < $data/"$job"_processed > $dir/normalize/$job/text
+
+normalizer_main --config=$config --path_prefix=$path_prefix <$dir/normalize/$job/text >$dir/$job.txt
+
+exit 0;
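
A caveat on the tr-based lowercasing in run_norm.sh above: GNU tr operates on bytes, so the multibyte UTF-8 accented characters in those two ranges are not guaranteed to map cleanly. If that ever becomes an issue, a Unicode-aware Python filter does the same job (a sketch, not part of the patch):

    import sys

    # Unicode-aware lowercasing, equivalent in intent to the tr mapping above.
    for line in sys.stdin:
        sys.stdout.write(line.lower())
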
diff --git a/egs/fisher_callhome_spanish/s5/local/train_pocolm.sh b/egs/fisher_callhome_spanish/s5/local/train_pocolm.sh
new file mode 100755
index 00000000000..b8b3ca35ef9
--- /dev/null
+++ b/egs/fisher_callhome_spanish/s5/local/train_pocolm.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+
+stage=-2
+num_words_pocolm=110000
+prune_size=1000000
+
+. ./path.sh
+. ./cmd.sh
+. ./utils/parse_options.sh
+
+set -euo pipefail
+
+export POCOLM_ROOT=$(cd $KALDI_ROOT/tools/pocolm/; pwd -P)
+export PATH=$PATH:$POCOLM_ROOT/scripts
+
+textdir=$1
+pocolm_dir=$2
+
+
+if [ $stage -le -2 ]; then
+ echo "****"
+ echo " POCOLM experiment : Running STAGE 1 : 2-gram Pocolm general closed vocabulary model"
+ echo " Will estimate the metaparams to be used as unigram weights for stage 2 ....."
+ echo "****"
+ if [ -e "$textdir"/unigram_weights ]; then
+ rm "$textdir"/unigram_weights
+ fi
+ if [ -e "$pocolm_dir" ]; then
+ rm -r "$pocolm_dir"
+ fi
+
+ bash local/pocolm_cust.sh --num-word 0 --ngram-order 2 --pocolm-stage 1 --lm-dir "$pocolm_dir"/lm \
+ --arpa-dir "$pocolm_dir"/arpa --textdir "$textdir"
+
+fi
+
+if [ $stage -le -1 ];then
+ echo "********"
+ echo "POCOLM experiment : RUNNING STAGE 2 : 3gram POCOLM using unigram wts estimates in 1st stage....."
+ echo "********"
+
+ echo " " > "$pocolm_dir"/lm/work/.unigram_weights.done
+ python local/get_unigram_weights_vocab.py "$pocolm_dir"/lm/0_2.pocolm/ "$textdir"/unigram_weights
+ bash local/pocolm_cust.sh --num-word "$num_words_pocolm" --lm-dir "$pocolm_dir"/lm \
+ --arpa-dir "$pocolm_dir"/arpa --textdir "$textdir"
+ prune_lm_dir.py --target-num-ngrams=$prune_size "$pocolm_dir"/lm/"$num_words_pocolm"_3.pocolm \
+ "$pocolm_dir"/lm/"$num_words_pocolm"_3.pocolm_pruned_"$prune_size"
+ mkdir -p "$pocolm_dir"/arpa
+ format_arpa_lm.py "$pocolm_dir"/lm/"$num_words_pocolm"_3.pocolm_pruned_"$prune_size" | \
+ gzip -c > "$pocolm_dir"/arpa/"$num_words_pocolm"_3_pruned_"$prune_size".arpa.gz
+fi
+
+
+exit 0;
diff --git a/egs/fisher_callhome_spanish/s5/path.sh b/egs/fisher_callhome_spanish/s5/path.sh
index 17ffb0369f8..201edd95876 100755
--- a/egs/fisher_callhome_spanish/s5/path.sh
+++ b/egs/fisher_callhome_spanish/s5/path.sh
@@ -1,6 +1,11 @@
-export KALDI_ROOT=`pwd`/../../..
+export KALDI_ROOT=`pwd`/../../../
+[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH
[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
. $KALDI_ROOT/tools/config/common_path.sh
-export LC_ALL=C
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/dpovey/libs
+
+export SPARROWHAWK_ROOT=$KALDI_ROOT/tools/sparrowhawk
+export PATH=$SPARROWHAWK_ROOT/bin:$PATH
+export LC_ALL=C.UTF-8
+export LANG=C.UTF-8
diff --git a/egs/fisher_callhome_spanish/s5/run.sh b/egs/fisher_callhome_spanish/s5/run.sh
index 6e2752a7b68..70d4d0555a4 100755
--- a/egs/fisher_callhome_spanish/s5/run.sh
+++ b/egs/fisher_callhome_spanish/s5/run.sh
@@ -4,14 +4,25 @@
# Copyright 2014 Gaurav Kumar. Apache 2.0
# Recipe for Fisher/Callhome-Spanish
-stage=0
-train_stage=-20
+stage=-1
+lmstage=-2
+
+# GIGAWORD RNNLM training based options below.
+# GIGAWORD RAW CORPUS DATA is assumed to be already downloaded in the gigaword_datapath.
+train_rnnlm=false
+start_textcleanup=false # WARNING : this starts from flattening the Gigaword corpus and runs through preparing the text folder.
+ # If you already have the normalised Gigaword text somewhere, you can bypass the
+ # time-consuming text cleanup (~1 week) by leaving this option set to false.
+addtraintext=true # If true, this option appends the Fisher train text to the Gigaword corpus text file, to
+ # perform the A, A + G, Dev type POCOLM training configuration.
+ # A = Fisher train text, G = Gigaword text.
+num_words_pocolm=100000
train_sgmm2=false
# call the next line with the directory where the Spanish Fisher data is
# (the values below are just an example).
sfisher_speech=/export/corpora/LDC/LDC2010S01
-sfisher_transcripts=/export/corpora/LDC/LDC2010T04
+sfisher_transcripts=/export/c03/svalluri//LDC2010T04
spanish_lexicon=/export/corpora/LDC/LDC96L16
split=local/splits/split_fisher
@@ -19,15 +30,17 @@ callhome_speech=/export/corpora/LDC/LDC96S35
callhome_transcripts=/export/corpora/LDC/LDC96T17
split_callhome=local/splits/split_callhome
+gigaword_datapath=/export/c03/svalluri/Spanish_gigaword_rawcorpus/data # GIGAWORD RAW CORPUS DATA DOWNLOAD PATH
+rnnlm_workdir=workdir_rnnlm_Spanish_gigaword
mfccdir=`pwd`/mfcc
. ./cmd.sh
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;
-set -e
+set -eou pipefail
-if [ $stage -le 1 ]; then
+if [ $stage -le -1 ]; then
local/fsp_data_prep.sh $sfisher_speech $sfisher_transcripts
local/callhome_data_prep.sh $callhome_speech $callhome_transcripts
@@ -37,19 +50,14 @@ if [ $stage -le 1 ]; then
# ES gigaword corpus to bring the total to 64k words. The ES frequency sorted
# wordlist is downloaded if it is not available.
local/fsp_prepare_dict.sh $spanish_lexicon
+ # Let's keep the original dict copy for G2P training
+ cp -r data/local/dict data/local/dict_orig
+ (
+ steps/dict/train_g2p_seq2seq.sh data/local/dict_orig/lexicon.txt exp/g2p || touch exp/g2p/.error
+ ) &
# Added c,j, v to the non silences phones manually
- utils/prepare_lang.sh data/local/dict "<unk>" data/local/lang data/lang
-
- # Make sure that you do not use your test and your dev sets to train the LM
- # Some form of cross validation is possible where you decode your dev/set based on an
- # LM that is trained on everything but that that conversation
- # When in doubt about what your data partitions should be use local/fsp_ideal_data_partitions.pl
- # to get the numbers. Depending on your needs, you might have to change the size of
- # the splits within that file. The default paritions are based on the Kaldi + Joshua
- # requirements which means that I have very large dev and test sets
- local/fsp_train_lms.sh $split
- local/fsp_create_test_lang.sh
+ utils/prepare_lang.sh data/local/dict_orig "<unk>" data/local/lang_orig data/lang_orig
utils/fix_data_dir.sh data/local/data/train_all
@@ -70,34 +78,65 @@ if [ $stage -le 1 ]; then
cp -r data/local/data/callhome_train_all data/callhome_train_all
- # Creating data partitions for the pipeline
- # We need datasets for both the ASR and SMT system
- # We have 257455 utterances left, so the partitions are roughly as follows
- # ASR Train : 100k utterances
- # ASR Tune : 17455 utterances
- # ASR Eval : 20k utterances
- # MT Train : 100k utterances
- # MT Tune : Same as the ASR eval set (Use the lattices from here)
- # MT Eval : 20k utterances
- # The dev and the test sets need to be carefully chosen so that there is no conversation/speaker
- # overlap. This has been setup and the script local/fsp_ideal_data_partitions provides the numbers that are needed below.
- # As noted above, the LM has not been trained on the dev and the test sets.
- #utils/subset_data_dir.sh --first data/train_all 158126 data/dev_and_test
- #utils/subset_data_dir.sh --first data/dev_and_test 37814 data/asr_dev_and_test
- #utils/subset_data_dir.sh --last data/dev_and_test 120312 data/mt_train_and_test
- #utils/subset_data_dir.sh --first data/asr_dev_and_test 17662 data/dev
- #utils/subset_data_dir.sh --last data/asr_dev_and_test 20152 data/test
- #utils/subset_data_dir.sh --first data/mt_train_and_test 100238 data/mt_train
- #utils/subset_data_dir.sh --last data/mt_train_and_test 20074 data/mt_test
- #rm -r data/dev_and_test
- #rm -r data/asr_dev_and_test
- #rm -r data/mt_train_and_test
-
local/create_splits.sh $split
local/callhome_create_splits.sh $split_callhome
+
fi
+if [ $stage -le 0 ]; then
+ if $start_textcleanup; then
+ echo "WARNING : Starting from cleaning up and normalizing the Gigword text"
+ echo " This might take few days........... You can skip out this stage "
+ echo " by setting start_textcleanup=false, and having normalised_gigaword_corpus/text_normalized ready inside $rnnlm_workdir."
+
+ mkdir -p "$rnnlm_workdir"/gigaword_rawtext
+ local/flatten_gigaword/flatten_all_gigaword.sh "$gigaword_datapath" "$rnnlm_workdir"/flattened_gigaword_corpus 24
+ cat "$rnnlm_workdir"/flattened_gigaword_corpus/*.flat > "$rnnlm_workdir"/gigaword_rawtext/in.txt
+ local/clean_txt_dir.sh "$rnnlm_workdir"/gigaword_rawtext/ \
+ "$rnnlm_workdir"/normalised_gigaword_corpus/
+ fi
+ mkdir -p "$rnnlm_workdir"/text_lm
+ cut -d " " -f 2- data/train/text > "$rnnlm_workdir"/text_lm/train.txt
+ cut -d " " -f 2- data/dev2/text > "$rnnlm_workdir"/text_lm/dev.txt # For RNNLM and POCOLM training we use dev2/text as dev file.
+ cp "$rnnlm_workdir"/normalised_gigaword_corpus/text_normalized "$rnnlm_workdir"/text_lm/spanish_gigaword_normalised.txt
+ if $addtraintext; then
+ cat "$rnnlm_workdir"/text_lm/train.txt >> "$rnnlm_workdir"/text_lm/spanish_gigaword_normalised.txt
+ fi
+fi
+
+if [ $stage -le 1 ]; then
+ local/train_pocolm.sh --stage $lmstage --num-words-pocolm $num_words_pocolm "$rnnlm_workdir"/text_lm/ "$rnnlm_workdir"/pocolm
+ local/get_rnnlm_wordlist.py data/lang_orig/words.txt "$rnnlm_workdir"/pocolm/lm/"$num_words_pocolm"_3.pocolm/words.txt \
+ "$rnnlm_workdir"/rnnlm_wordlist "$rnnlm_workdir"/oov_pocolmwords
+ if $train_rnnlm; then
+ local/rnnlm.sh --stage $lmstage --dir "$rnnlm_workdir"/rnnlm --pocolm-dir "$rnnlm_workdir"/pocolm/lm/"$num_words_pocolm"_3.pocolm \
+ --wordslist "$rnnlm_workdir"/rnnlm_wordlist --text-dir "$rnnlm_workdir"/text_lm
+ fi
+fi
+
+
if [ $stage -le 2 ]; then
+ wait # wait till G2P training finishes
+ if [ -f exp/g2p/.error ]; then
+ rm exp/g2p/.error || true
+ echo "Fail to train the G2P model." && exit 1;
+ fi
+ steps/dict/apply_g2p_seq2seq.sh "$rnnlm_workdir"/oov_pocolmwords exp/g2p "$rnnlm_workdir"/oov_g2p.lex
+ cat "$rnnlm_workdir"/oov_g2p.lex/lexicon.lex data/local/dict/lexicon.txt | sed "/^[[:space:]]*$/d" | sort | uniq > "$rnnlm_workdir"/lexicon_extended.txt
+ cp "$rnnlm_workdir"/lexicon_extended.txt data/local/dict/lexicon.txt # Replacing original lexicon with extended version.
+
+ utils/prepare_lang.sh data/local/dict "<unk>" data/local/lang data/lang
+
+ # Make sure that you do not use your test and your dev sets to train the LM
+ # Some form of cross validation is possible where you decode your dev/set based on an
+ # LM that is trained on everything but that conversation
+ # When in doubt about what your data partitions should be use local/fsp_ideal_data_partitions.pl
+ # to get the numbers. Depending on your needs, you might have to change the size of
+ # the splits within that file. The default partitions are based on the Kaldi + Joshua
+ # requirements which means that I have very large dev and test sets
+ local/fsp_train_lms.sh $split
+ local/fsp_create_test_lang.sh
+
# Now compute CMVN stats for the train, dev and test subsets
steps/compute_cmvn_stats.sh data/dev exp/make_mfcc/dev $mfccdir
steps/compute_cmvn_stats.sh data/test exp/make_mfcc/test $mfccdir
@@ -264,8 +303,11 @@ for iter in 1 2 3 4; do
data/lang_test data/dev/ exp/sgmm5/decode_dev $decode
done
) &
-
fi
-local/chain/run_tdnn_1g.sh --stage $stage --train-stage $train_stage || exit 1;
+wait;
+
+if [ $stage -le 6 ]; then
+ local/chain/run_tdnn_1g.sh --stage 0 --gigaword-workdir $rnnlm_workdir || exit 1;
+fi
exit 0;
diff --git a/egs/wsj/s5/steps/dict/apply_g2p_seq2seq.sh b/egs/wsj/s5/steps/dict/apply_g2p_seq2seq.sh
new file mode 100644
index 00000000000..e6e316ec6b1
--- /dev/null
+++ b/egs/wsj/s5/steps/dict/apply_g2p_seq2seq.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+
+# Copyright 2018 Govivace Inc. (Author: Valluri Saikiran)
+# Apache License 2.0
+
+# This script applies a g2p model using CMUsphinx/seq2seq.
+
+stage=0
+encoding='utf-8'
+
+echo "$0 $@" # Print the command line for logging
+
+[ -f ./path.sh ] && . ./path.sh; # source the path.
+. utils/parse_options.sh || exit 1;
+
+set -u
+set -e
+
+if [ $# != 3 ]; then
+ echo "Usage: $0 [options] "
+ echo " where is the OOV wordlist "
+ echo " is directory where the models will be stored"
+ exit 1;
+fi
+
+lexicon=$1
+wdir=$2
+outdir=$3
+
+mkdir -p $outdir
+
+[ ! -f $lexicon ] && echo "Cannot find $lexicon" && exit
+
+if [ ! -s "$(which g2p-seq2seq)" ] ; then
+ echo "g2p-seq2seq was not found!"
+ echo "Go to $KALDI_ROOT/tools and execute extras/install_g2p_seq2seq.sh"
+ exit 1
+fi
+
+g2p-seq2seq --decode $lexicon --model_dir $wdir --output $outdir/lexicon.lex
+
diff --git a/egs/wsj/s5/steps/dict/train_g2p_seq2seq.sh b/egs/wsj/s5/steps/dict/train_g2p_seq2seq.sh
new file mode 100644
index 00000000000..e0389171fd5
--- /dev/null
+++ b/egs/wsj/s5/steps/dict/train_g2p_seq2seq.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+
+# Copyright 2018 Govivace Inc. (Author: Valluri Saikiran)
+# Apache License 2.0
+
+# This script trains a g2p model using CMUsphinx/seq2seq.
+
+stage=0
+encoding='utf-8'
+
+echo "$0 $@" # Print the command line for logging
+
+[ -f ./path.sh ] && . ./path.sh; # source the path.
+. utils/parse_options.sh || exit 1;
+
+set -u
+set -e
+
+if [ $# != 2 ]; then
+ echo "Usage: $0 [options] "
+ echo " where is the training lexicon (one pronunciation per "
+ echo " word per line, with lines like 'hello h uh l ow') and"
+ echo " is directory where the models will be stored"
+ exit 1;
+fi
+
+lexicon=$1
+wdir=$2
+
+[ ! -f $lexicon ] && echo "Cannot find $lexicon" && exit
+
+if [ ! -s "$(which g2p-seq2seq)" ]; then
+ echo "g2p-seq2seq was not found!"
+ echo "Go to $KALDI_ROOT/tools and execute extras/install_g2p_seq2seq.sh"
+ exit 1
+fi
+
+g2p-seq2seq --max_epochs 12 --train $lexicon --model_dir $wdir
+
diff --git a/scripts/rnnlm/show_word_features.py b/scripts/rnnlm/show_word_features.py
index 4335caed5d8..2beede5acc6 100755
--- a/scripts/rnnlm/show_word_features.py
+++ b/scripts/rnnlm/show_word_features.py
@@ -7,6 +7,7 @@
import argparse
import sys
+
sys.stdout = open(1, 'w', encoding='utf-8', closefd=False)
import re
diff --git a/tools/extras/install_g2p_seq2seq.sh b/tools/extras/install_g2p_seq2seq.sh
new file mode 100644
index 00000000000..c9979b8b961
--- /dev/null
+++ b/tools/extras/install_g2p_seq2seq.sh
@@ -0,0 +1,5 @@
+if [ ! -e g2p-seq2seq ];then
+ git clone https://github.com/cmusphinx/g2p-seq2seq.git
+ cd g2p-seq2seq/
+ python setup.py install
+fi
diff --git a/tools/install_g2p_seq2seq.sh b/tools/install_g2p_seq2seq.sh
new file mode 120000
index 00000000000..77715305f74
--- /dev/null
+++ b/tools/install_g2p_seq2seq.sh
@@ -0,0 +1 @@
+extras/install_g2p_seq2seq.sh
\ No newline at end of file
diff --git a/tools/install_sparrowhawk.sh b/tools/install_sparrowhawk.sh
new file mode 100755
index 00000000000..b6a7af211f5
--- /dev/null
+++ b/tools/install_sparrowhawk.sh
@@ -0,0 +1,73 @@
+#!/bin/bash
+export LDFLAGS="-L`pwd`/openfst/lib"
+export CXXFLAGS="-I`pwd`/openfst/include"
+stage=0
+
+if [ $stage -le 0 ] ; then
+ rm -rf re2 protobuf sparrowhawk*
+ git clone -b feature/Spanish_normalizer https://github.com/spokencloud/sparrowhawk-resources.git || exit 1;
+ patch -p0 < sparrowhawk-resources/local/Makefile.patch || exit 1;
+ make openfst || exit 1;
+ git clone https://github.com/mjansche/thrax.git
+ export LDFLAGS=-L`pwd`/openfst/lib
+ export CXXFLAGS=-I`pwd`/openfst/include
+ cd thrax
+ autoreconf --force --install || exit 1;
+ ./configure --prefix=`pwd` || exit 1;
+ make || exit 1;
+ make install || exit 1;
+ cd ..
+ git clone https://github.com/google/re2.git || exit 1;
+ cd re2/
+ make -j 20 || exit 1;
+ make test || exit 1;
+ make install prefix=`pwd` || exit 1;
+ cd ..
+ git clone https://github.com/google/protobuf.git || exit 1;
+ cd protobuf/
+ ./autogen.sh || exit 1;
+ ./configure --prefix=`pwd` || exit 1;
+ make -j 20 || exit 1;
+ make install || exit 1;
+ cd ..
+fi
+
+if [ $stage -le 1 ]; then
+ git clone https://github.com/google/sparrowhawk.git || exit 1;
+ patch -p0 < sparrowhawk-resources/local/sparrowhawk.patch || exit 1;
+ cd sparrowhawk/ || exit 1;
+ mkdir lib
+ mkdir bin
+ mkdir include
+ cp -r ../openfst/lib/* lib/ || exit 1;
+ cp -r ../protobuf/lib/* lib/ || exit 1;
+ cp -r ../re2/lib/* lib/ || exit 1;
+ cp -r ../thrax/lib/* lib/ || exit 1;
+ cp -r ../openfst/include/* include/ || exit 1;
+ cp -r ../protobuf/include/* include/ || exit 1;
+ cp -r ../re2/include/* include/ || exit 1;
+ cp -r ../thrax/include/* include/ || exit 1;
+ cp ../protobuf/bin/protoc bin/. || exit 1;
+ export PATH=`pwd`/bin:$PATH
+ aclocal || exit 1;
+ automake || exit 1;
+ ./configure --prefix=`pwd` CPPFLAGS="-I`pwd`/include" LDFLAGS="-L`pwd`/lib" || exit 1;
+ make || exit 1;
+ make install || exit 1;
+ cd ..
+fi
+
+if [ $stage -le 2 ]; then
+ cp -r sparrowhawk-resources/language-resources sparrowhawk/ || exit 1;
+ cd sparrowhawk/language-resources/esp/textnorm/classifier || exit 1;
+ . ./path.sh || exit 1;
+ python2 create_far.py ascii.syms universal_depot_ascii universal_depot universal_depot.far
+ thraxmakedep tokenize_and_classify.grm || exit 1;
+ make || exit 1;
+ cd ../verbalizer
+ python2 create_far.py ascii.syms number_names_depot_ascii number_names_depot number_names_depot.far
+ cp -r ../classifier/universal_depot.far .
+ thraxmakedep verbalize.grm || exit 1;
+ make || exit 1;
+ cd ../../../../..
+fi