diff --git a/egs/fisher_callhome_spanish/s5/RESULTS b/egs/fisher_callhome_spanish/s5/RESULTS deleted file mode 100644 index 66613163cea..00000000000 --- a/egs/fisher_callhome_spanish/s5/RESULTS +++ /dev/null @@ -1,38 +0,0 @@ --------------------------------------------------------------------------------------- -Triphone with mono alignment (small) --------------------------------------------------------------------------------------- -%WER 53.70 [ 21570 / 40170, 2618 ins, 6013 del, 12939 sub ] exp/tri1/decode_dev/wer_14_0.0 - --------------------------------------------------------------------------------------- -Triphone with tri alignments --------------------------------------------------------------------------------------- -%WER 53.18 [ 21364 / 40170, 2889 ins, 5533 del, 12942 sub ] exp/tri2/decode_dev/wer_13_0.0 - --------------------------------------------------------------------------------------- -Triphone + LDA + MLLT --------------------------------------------------------------------------------------- -%WER 46.95 [ 18858 / 40170, 2636 ins, 5197 del, 11025 sub ] exp/tri3a/decode_dev/wer_14_0.0 - --------------------------------------------------------------------------------------- -+ SAT + fMLLR --------------------------------------------------------------------------------------- -%WER 42.86 [ 17217 / 40170, 2556 ins, 4633 del, 10028 sub ] exp/tri4a/decode_dev/wer_15_0.0 - --------------------------------------------------------------------------------------- -+ More leaves and gaussians --------------------------------------------------------------------------------------- -%WER 40.48 [ 16261 / 40170, 2689 ins, 4130 del, 9442 sub ] exp/tri5a/decode_dev/wer_14_0.0 - --------------------------------------------------------------------------------------- -+ bMMI + SGMM --------------------------------------------------------------------------------------- -%WER 38.43 [ 15437 / 40170, 2800 ins, 3685 del, 8952 sub ] exp/sgmm5/decode_dev/wer_10_0.0 -%WER 36.90 [ 14821 / 40170, 2708 ins, 3552 del, 8561 sub ] exp/sgmm5_mmi_b0.1/decode_dev_it1/wer_10_0.0 -%WER 36.09 [ 14499 / 40170, 2511 ins, 3737 del, 8251 sub ] exp/sgmm5_mmi_b0.1/decode_dev_it2/wer_11_0.0 -%WER 35.48 [ 14252 / 40170, 2672 ins, 3370 del, 8210 sub ] exp/sgmm5_mmi_b0.1/decode_dev_it3/wer_10_0.0 -%WER 35.16 [ 14122 / 40170, 2701 ins, 3287 del, 8134 sub ] exp/sgmm5_mmi_b0.1/decode_dev_it4/wer_10_0.0 - --------------------------------------------------------------------------------------- -pNorm-Ensemble DNN --------------------------------------------------------------------------------------- -%WER 35.13 [ 14113 / 40170, 2680 ins, 3405 del, 8028 sub ] exp/tri6a_dnn/decode_dev/wer_11_0.0 diff --git a/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh b/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh index 7f407552c2e..3e400914521 100755 --- a/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh +++ b/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh @@ -27,6 +27,7 @@ nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. affix=1g #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. 
common_egs_dir= reporting_email= +gigaword_workdir= # LSTM/chain options train_stage=-10 @@ -254,11 +255,6 @@ if [ $stage -le 21 ]; then fi -rnnlmdir=exp/rnnlm_lstm_tdnn_1b -if [ $stage -le 22 ]; then - local/rnnlm/train_rnnlm.sh --dir $rnnlmdir || exit 1; -fi - if [ $stage -le 23 ]; then frames_per_chunk=$(echo $chunk_width | cut -d, -f1) rm $dir/.error 2>/dev/null || true @@ -277,8 +273,11 @@ if [ $stage -le 23 ]; then --online-ivector-dir exp/nnet3/ivectors_${data}_hires \ $tree_dir/graph_${lmtype} data/${data}_hires ${dir}/decode_${lmtype}_${data} || exit 1; done - bash local/rnnlm/lmrescore_nbest.sh 1.0 data/lang_test $rnnlmdir data/${data}_hires/ \ - ${dir}/decode_${lmtype}_${data} $dir/decode_rnnLM_${lmtype}_${data} || exit 1; + if [ $gigaword_workdir ]; then + lmtype=fsp_train + bash rnnlm/lmrescore_nbest.sh 1.0 data/lang_test $gigaword_workdir/rnnlm data/${data}_hires/ \ + ${dir}/decode_${lmtype}_${data} $dir/decode_gigaword_RNNLM_${lmtype}_${data} || exit 1; + fi ) || touch $dir/.error & done wait diff --git a/egs/fisher_callhome_spanish/s5/local/clean_abbrevs_text.py b/egs/fisher_callhome_spanish/s5/local/clean_abbrevs_text.py new file mode 100644 index 00000000000..7d92eb9fe3a --- /dev/null +++ b/egs/fisher_callhome_spanish/s5/local/clean_abbrevs_text.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# 2018 Saikiran Valluri, GoVivace inc., + +import os, sys +import re +import codecs + +if len(sys.argv) < 3: + print("Usage : python clean_abbrevs_text.py ") + print(" Processes the text before text normalisation to convert uppercase words as space separated letters") + sys.exit() + +inputfile=codecs.open(sys.argv[1], encoding='utf-8') +outputfile=codecs.open(sys.argv[2], encoding='utf-8', mode='w') + +for line in inputfile: + words = line.split() + textout = "" + wordcnt = 0 + for word in words: + if re.match(r"\b([A-ZÂÁÀÄÊÉÈËÏÍÎÖÓÔÖÚÙÛÑÇ])+[']?s?\b", word): + if wordcnt > 0: + word = re.sub('\'?s', 's', word) + textout = textout + " ".join(word) + " " + else: + textout = textout + word + " " + else: + textout = textout + word + " " + if word.isalpha(): wordcnt = wordcnt + 1 + outputfile.write(textout.strip()+ '\n') + +inputfile.close() +outputfile.close() diff --git a/egs/fisher_callhome_spanish/s5/local/clean_txt_dir.sh b/egs/fisher_callhome_spanish/s5/local/clean_txt_dir.sh new file mode 100755 index 00000000000..5d25e3a3fd2 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5/local/clean_txt_dir.sh @@ -0,0 +1,57 @@ +#!/bin/bash + +# Script to clean up gigaword LM text +# Removes punctuations, does case normalization + +stage=0 +nj=500 + +. ./path.sh +. ./cmd.sh +. ./utils/parse_options.sh + +set -euo pipefail + +if [ $# -ne 2 ]; then + echo "Usage: $0 " + exit 1; +fi + +if [ ! -s `which normalizer_main` ] ; then + echo "Sparrowhawk normalizer was not found installed !" + echo "Go to $KALDI_ROOT/tools and execute install_sparrowhawk.sh and try again!" + exit 1 +fi + +txtdir=$1 +textdir=$(realpath $txtdir) +outdir=$(realpath $2) + +workdir=$outdir/tmp +if [ $stage -le 0 ]; then + rm -rf $outdir + mkdir -p $workdir + mkdir -p $textdir/splits + mkdir -p $outdir/data + split -l 1000000 $textdir/in.txt $textdir/splits/out + numsplits=0 + for x in $textdir/splits/*; do + numsplits=$((numsplits+1)) + ln -s $x $outdir/data/$numsplits + done + echo $numsplits + cp $SPARROWHAWK_ROOT/documentation/grammars/sentence_boundary_exceptions.txt . 
+ $train_cmd --max_jobs_run 100 JOB=1:$numsplits $outdir/sparrowhawk/log/JOB.log \ + local/run_norm.sh \ + sparrowhawk_configuration.ascii_proto \ + $SPARROWHAWK_ROOT/language-resources/esp/sparrowhawk/ \ + $outdir/data \ + JOB \ + $outdir/sparrowhawk/ + cat $outdir/sparrowhawk/*.txt | sed "/^$/d" > $outdir/text_normalized + + # check if numbers are there in normalized output + awk '{for(i=1;i<=NF;i++) {if (!seen[$i]) {print $i; seen[$i]=1} }}' \ + $outdir/text_normalized > $outdir/unique_words + grep "[0-9]" $outdir/unique_words | sort -u > $outdir/numbers +fi diff --git a/egs/fisher_callhome_spanish/s5/local/ctm.sh b/egs/fisher_callhome_spanish/s5/local/ctm.sh index 62860a10b7b..7d09f574580 100755 --- a/egs/fisher_callhome_spanish/s5/local/ctm.sh +++ b/egs/fisher_callhome_spanish/s5/local/ctm.sh @@ -19,9 +19,9 @@ fi steps/get_ctm.sh $data_dir $lang_dir $decode_dir # Make sure that channel markers match -#perl -i -pe "s:\s.*_fsp-([AB]): \1:g" data/dev/stm -#ls exp/tri5a/decode_dev/score_*/dev.ctm | xargs -I {} perl -i -pe 's:fsp\s1\s:fsp A :g' {} -#ls exp/tri5a/decode_dev/score_*/dev.ctm | xargs -I {} perl -i -pe 's:fsp\s2\s:fsp B :g' {} +#sed -i "s:\s.*_fsp-([AB]): \1:g" data/dev/stm +#ls exp/tri5a/decode_dev/score_*/dev.ctm | xargs -I {} sed -i -r 's:fsp\s1\s:fsp A :g' {} +#ls exp/tri5a/decode_dev/score_*/dev.ctm | xargs -I {} sed -i -r 's:fsp\s2\s:fsp B :g' {} # Get the environment variables . /export/babel/data/software/env.sh diff --git a/egs/fisher_callhome_spanish/s5/local/flatten_gigaword/flatten_all_gigaword.sh b/egs/fisher_callhome_spanish/s5/local/flatten_gigaword/flatten_all_gigaword.sh new file mode 100755 index 00000000000..242359e7c28 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5/local/flatten_gigaword/flatten_all_gigaword.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash +set -e + +# Path to Gigaword corpus with all data files decompressed. +export GIGAWORDDIR=$1 +# The directory to write output to +export OUTPUTDIR=$2 +# The number of jobs to run at once +export NUMJOBS=$3 + +echo "Flattening Gigaword with ${NUMJOBS} processes..." +mkdir -p $OUTPUTDIR +find ${GIGAWORDDIR}/data/*/* -type f -print -exec local/flatten_gigaword/run_flat.sh {} ${OUTPUTDIR} \; +echo "Combining the flattened files into one..." +cat ${OUTPUTDIR}/*.flat > ${OUTPUTDIR}/flattened_gigaword.txt diff --git a/egs/fisher_callhome_spanish/s5/local/flatten_gigaword/flatten_one_gigaword.py b/egs/fisher_callhome_spanish/s5/local/flatten_gigaword/flatten_one_gigaword.py new file mode 100644 index 00000000000..29f6766dd84 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5/local/flatten_gigaword/flatten_one_gigaword.py @@ -0,0 +1,61 @@ +# -*- coding: utf-8 -*- + +import logging +import os +import re +import spacy +import gzip + +from argparse import ArgumentParser +from bs4 import BeautifulSoup + +en_nlp = spacy.load("es") + + +def flatten_one_gigaword_file(file_path): + f = gzip.open(file_path) + html = f.read() + # Parse the text with BeautifulSoup + soup = BeautifulSoup(html, "html.parser") + + # Iterate over all
<p>
items and get the text for each. + all_paragraphs = [] + for paragraph in soup("p"): + # Turn inter-paragraph newlines into spaces + paragraph = paragraph.get_text() + paragraph = re.sub(r"\n+", "\n", paragraph) + paragraph = paragraph.replace("\n", " ") + # Tokenize the paragraph into words + tokens = en_nlp.tokenizer(paragraph) + words = [str(token) for token in tokens if not + str(token).isspace()] + if len(words) < 3: + continue + all_paragraphs.append(words) + # Return a list of strings, where each string is a + # space-tokenized paragraph. + return [" ".join(paragraph) for paragraph in all_paragraphs] + + +if __name__ == "__main__": + log_fmt = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + logging.basicConfig(level=logging.INFO, format=log_fmt) + logger = logging.getLogger(__name__) + + parser = ArgumentParser(description=("Flatten a gigaword data file for " + "use in language modeling.")) + parser.add_argument("--gigaword-path", required=True, + metavar="", type=str, + help=("Path to Gigaword directory, with " + "all .gz files unzipped.")) + parser.add_argument("--output-dir", required=True, metavar="", + type=str, help=("Directory to write final flattened " + "Gigaword file.")) + + A = parser.parse_args() + all_paragraphs = flatten_one_gigaword_file(A.gigaword_path) + output_path = os.path.join(A.output_dir, + os.path.basename(A.gigaword_path) + ".flat") + with open(output_path, "w") as output_file: + for paragraph in all_paragraphs: + output_file.write("{}\n".format(paragraph)) diff --git a/egs/fisher_callhome_spanish/s5/local/flatten_gigaword/run_flat.sh b/egs/fisher_callhome_spanish/s5/local/flatten_gigaword/run_flat.sh new file mode 100755 index 00000000000..6b236be0ab9 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5/local/flatten_gigaword/run_flat.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash +set -e + +. ./path_venv.sh + +# Path to Gigaword corpus with all data files decompressed. +GIGAWORDPATH=$1 +# The directory to write output to +OUTPUTDIR=$2 +file=$(basename ${GIGAWORDPATH}) +if [ ! -e ${OUTPUTDIR}/${file}.flat ]; then + echo "flattening to ${OUTPUTDIR}/${file}.flat" + python local/flatten_gigaword/flatten_one_gigaword.py --gigaword-path ${GIGAWORDPATH} --output-dir ${OUTPUTDIR} +else + echo "skipping ${file}.flat" +fi + diff --git a/egs/fisher_callhome_spanish/s5/local/fsp_data_prep.sh b/egs/fisher_callhome_spanish/s5/local/fsp_data_prep.sh index 11d65da3e95..22b98a6c9db 100755 --- a/egs/fisher_callhome_spanish/s5/local/fsp_data_prep.sh +++ b/egs/fisher_callhome_spanish/s5/local/fsp_data_prep.sh @@ -133,6 +133,7 @@ if [ $stage -le 2 ]; then sed 's:::g' | \ sed 's:foreign>::g' | \ + sed 's:\[noise\]:[noise] :g' | \ sed 's:>::g' | \ #How do you handle numbers? 
grep -v '()' | \ diff --git a/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh b/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh index 779298305c4..7b2de2db392 100755 --- a/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh +++ b/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh @@ -105,8 +105,9 @@ if [ $stage -le 4 ]; then cp "$tmpdir/lexicon.1" "$tmpdir/lexicon.2" # Add prons for laughter, noise, oov - w=$(grep -v sil $dir/silence_phones.txt | tr '\n' '|') - perl -i -ne "print unless /\[(${w%?})\]/" $tmpdir/lexicon.2 + for w in `grep -v sil $dir/silence_phones.txt`; do + sed -i "/\[$w\]/d" $tmpdir/lexicon.2 + done for w in `grep -v sil $dir/silence_phones.txt`; do echo "[$w] $w" diff --git a/egs/fisher_callhome_spanish/s5/local/get_data_weights.pl b/egs/fisher_callhome_spanish/s5/local/get_data_weights.pl new file mode 100755 index 00000000000..ca5b2a46f8e --- /dev/null +++ b/egs/fisher_callhome_spanish/s5/local/get_data_weights.pl @@ -0,0 +1,39 @@ +#!/usr/bin/env perl + +# Nagendra Kumar Goel + +# This takes two arguments: +# 1) Pocolm training output folder +# 2) rnnlm weights file name (for output) + +use POSIX; +use List::Util qw[min max]; + +if (@ARGV != 2) { + die "Usage: get_data_weights.pl \n"; +} + +$pdir = shift @ARGV; +$out = shift @ARGV; + +open(P, "<$pdir/metaparameters") || die "Could not open $pdir/metaparameters"; +open(N, "<$pdir/names") || die "Could not open $pdir/names" ; +open(O, ">$out") || die "Could not open $out for writing" ; + +my %scores = (); + +while() { + @n = split(/\s/,$_); + $name = $n[1]; + $w =
<N>
; + @w = split(/\s/,$w); + $weight = $w[1]; + $scores{$name} = $weight; +} + +$min = min(values %scores); + +for(keys %scores) { + $weightout = POSIX::ceil($scores{$_} / $min); + print O "$_\t1\t$weightout\n"; +} diff --git a/egs/fisher_callhome_spanish/s5/local/get_rnnlm_wordlist.py b/egs/fisher_callhome_spanish/s5/local/get_rnnlm_wordlist.py new file mode 100755 index 00000000000..fc13a7af701 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5/local/get_rnnlm_wordlist.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# 2018 Saikiran Valluri, GoVivace inc. + +import os, sys + +if len(sys.argv) < 5: + print( "Usage: python get_rnnlm_wordlist.py ") + sys.exit() + +lexicon_words = open(sys.argv[1], 'r', encoding="utf-8") +pocolm_words = open(sys.argv[2], 'r', encoding="utf-8") +rnnlm_wordsout = open(sys.argv[3], 'w', encoding="utf-8") +oov_wordlist = open(sys.argv[4], 'w', encoding="utf-8") + +line_count=0 +lexicon=[] + +for line in lexicon_words: + lexicon.append(line.split()[0]) + rnnlm_wordsout.write(line.split()[0] + " " + str(line_count)+'\n') + line_count = line_count + 1 + +for line in pocolm_words: + if not line.split()[0] in lexicon: + oov_wordlist.write(line.split()[0]+'\n') + rnnlm_wordsout.write(line.split()[0] + " " + str(line_count)+'\n') + line_count = line_count + 1 + +lexicon_words.close() +pocolm_words.close() +rnnlm_wordsout.close() +oov_wordlist.close() diff --git a/egs/fisher_callhome_spanish/s5/local/get_unigram_weights_vocab.py b/egs/fisher_callhome_spanish/s5/local/get_unigram_weights_vocab.py new file mode 100644 index 00000000000..3ecd16772d7 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5/local/get_unigram_weights_vocab.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# 2018 Saikiran Valluri, GoVivace inc. + +import os, sys + +if len(sys.argv) < 3: + print("Usage : python . ") + print(" Used for generating the unigram weights for second pass vocabulary from the first pass pocolm training metaparameters.") + sys.exit() + +pocolmdir=sys.argv[1] +unigramwts=open(sys.argv[2], 'w') + +names = open(pocolmdir+"/names", 'r') +metaparams = open(pocolmdir+"/metaparameters", 'r') + +name_mapper={} +for line in names: + fields=line.split() + name_mapper[fields[0]] = fields[1] + +lns = metaparams.readlines() +for lineno in range(len(name_mapper.keys())): + line = lns[lineno] + fileid = line.split()[0].split("_")[-1] + weight = line.split()[1] + unigramwts.write(name_mapper[fileid] + " " + weight + "\n") + +names.close() +unigramwts.close() +metaparams.close() diff --git a/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py b/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py index c7aa6affb11..c7e0f140d2f 100755 --- a/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py +++ b/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py @@ -1,11 +1,14 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # + # Copyright 2014 Gaurav Kumar. 
Apache 2.0 # 2018 Nagendra Kumar Goel, Saikiran Valluri, GoVivace inc., Avaaya + # Merges unique words from Spanish Fisher, Gigaword and the LDC spanish lexicon from __future__ import print_function -import sys, re +import sys +import re import json import codecs import operator diff --git a/egs/fisher_callhome_spanish/s5/local/pocolm_cust.sh b/egs/fisher_callhome_spanish/s5/local/pocolm_cust.sh new file mode 100755 index 00000000000..0a5649c2a79 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5/local/pocolm_cust.sh @@ -0,0 +1,120 @@ +#!/usr/bin/env bash + +# this script generates Pocolm-estimated language models with various +# data sources in data/text folder and places the output in data/lm. + +set -euo pipefail + +. ./path.sh + +export POCOLM_ROOT=$(cd $KALDI_ROOT/tools/pocolm/; pwd -P) +export PATH=$PATH:$POCOLM_ROOT/scripts + + +wordlist=None +num_word=100000 +pocolm_stage=1 +ngram_order=3 +lm_dir= +arpa_dir= +textdir= +max_memory='--max-memory=8G' + +. ./cmd.sh +. ./utils/parse_options.sh + + +# If you do not want to set memory limitation for "sort", you can use +#max_memory= +# Choices for the max-memory can be: +# 1) integer + 'K', 'M', 'G', ... +# 2) integer + 'b', meaning unit is byte and no multiplication +# 3) integer + '%', meaning a percentage of memory +# 4) integer, default unit is 'K' + +fold_dev_opt= +# If you want to fold the dev-set in to the 'swbd1' set to produce the final +# model, un-comment the following line. For use in the Kaldi example script for +# ASR, this isn't suitable because the 'dev' set is the first 10k lines of the +# switchboard data, which we also use as dev data for speech recognition +# purposes. +#fold_dev_opt="--fold-dev-into=swbd1" + +bypass_metaparam_optim_opt= +# If you want to bypass the metaparameter optimization steps with specific metaparameters +# un-comment the following line, and change the numbers to some appropriate values. +# You can find the values from output log of train_lm.py. +# These example numbers of metaparameters is for 3-gram model running with train_lm.py. +# the dev perplexity should be close to the non-bypassed model. +#bypass_metaparam_optim_opt="--bypass-metaparameter-optimization=0.091,0.867,0.753,0.275,0.100,0.018,0.902,0.371,0.183,0.070" +# Note: to use these example parameters, you may need to remove the .done files +# to make sure the make_lm_dir.py be called and tain only 3-gram model +#for order in 3; do +#rm -f ${lm_dir}/${num_word}_${order}.pocolm/.done + +#limit_unk_history_opt= +# If you want to limit the left of in the history of a n-gram +# un-comment the following line +limit_unk_history_opt="--limit-unk-history=true" + +for order in ${ngram_order}; do + # decide on the vocabulary. + # Note: you'd use --wordlist if you had a previously determined word-list + # that you wanted to use. 
+ lm_name="${num_word}_${order}" + min_counts='' + # Note: the following might be a more reasonable setting: + # min_counts='fisher=2 swbd1=1' + if [ -n "${min_counts}" ]; then + lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`" + fi + unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm + train_lm.py --num-words=${num_word} --num-splits=5 --warm-start-ratio=10 ${max_memory} \ + --min-counts=${min_counts} \ + --keep-int-data=true ${fold_dev_opt} ${bypass_metaparam_optim_opt} \ + ${limit_unk_history_opt} ${textdir} ${order} ${lm_dir}/work ${unpruned_lm_dir} + + if [ $pocolm_stage -eq 2 ];then + mkdir -p ${arpa_dir} + format_arpa_lm.py ${max_memory} ${unpruned_lm_dir} | gzip -c > ${arpa_dir}/${lm_name}_${order}gram_unpruned.arpa.gz + + # example of pruning. note: the threshold can be less than or more than one. + get_data_prob.py ${max_memory} ${textdir}/dev.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' + for threshold in 1.0 2.0 4.0; do + pruned_lm_dir=${lm_dir}/${lm_name}_prune${threshold}.pocolm + prune_lm_dir.py --final-threshold=${threshold} ${max_memory} ${unpruned_lm_dir} ${pruned_lm_dir} 2>&1 | tail -n 5 | head -n 3 + get_data_prob.py ${max_memory} ${textdir}/dev.txt ${pruned_lm_dir} 2>&1 | grep -F '[perplexity' + + format_arpa_lm.py ${max_memory} ${pruned_lm_dir} | gzip -c > ${arpa_dir}/${lm_name}_${order}gram_prune${threshold}.arpa.gz + + done + + # example of pruning by size. + size=1000000 + pruned_lm_dir=${lm_dir}/${lm_name}_prune${size}.pocolm + prune_lm_dir.py --target-num-ngrams=${size} ${max_memory} ${unpruned_lm_dir} ${pruned_lm_dir} 2>&1 | tail -n 8 | head -n 6 | grep -v 'log-prob changes' + get_data_prob.py ${textdir}/dev.txt ${max_memory} ${pruned_lm_dir} 2>&1 | grep -F '[perplexity' + + format_arpa_lm.py ${max_memory} ${pruned_lm_dir} | gzip -c > ${arpa_dir}/${lm_name}_${order}gram_prune${size}.arpa.gz + fi +done + +# (run local/srilm_baseline.sh ${num_word} to see the following result e.g. local/srilm_baseline.sh 40000 ) + +# the following does does some self-testing, including +# that the computed derivatives are accurate. +# local/self_test.sh + +# perplexities from pocolm-estimated language models with pocolm's interpolation +# method from orders 3, 4, and 5 are: +# order 3: optimize_metaparameters.py: final perplexity without barrier function was -4.358818 (perplexity: 78.164689) +# order 4: optimize_metaparameters.py: final perplexity without barrier function was -4.309507 (perplexity: 74.403797) +# order 5: optimize_metaparameters.py: final perplexity without barrier function was -4.301741 (perplexity: 73.828181) + +# note, the perplexities from pocolm-estimated language models with SRILM's +# interpolation from orders 3 and 4 are (from local/pocolm_with_srilm_combination.sh), +# 78.8449 and 75.2202 respectively. + +# note, the perplexities from SRILM-estimated language models with SRILM's +# interpolation tool from orders 3 and 4 are (from local/srilm_baseline.sh), +# 78.9056 and 75.5528 respectively. diff --git a/egs/fisher_callhome_spanish/s5/local/rnnlm.sh b/egs/fisher_callhome_spanish/s5/local/rnnlm.sh new file mode 100755 index 00000000000..3850910f312 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5/local/rnnlm.sh @@ -0,0 +1,83 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (author: Daniel Povey) +# 2015 Guoguo Chen +# 2017 Hainan Xu +# 2017 Xiaohui Zhang + +# This script trains LMs on the swbd LM-training data. + +# rnnlm/train_rnnlm.sh: best iteration (out of 35) was 34, linking it to final iteration. 
+# rnnlm/train_rnnlm.sh: train/dev perplexity was 41.9 / 50.0. +# Train objf: -5.07 -4.43 -4.25 -4.17 -4.12 -4.07 -4.04 -4.01 -3.99 -3.98 -3.96 -3.94 -3.92 -3.90 -3.88 -3.87 -3.86 -3.85 -3.84 -3.83 -3.82 -3.81 -3.80 -3.79 -3.78 -3.78 -3.77 -3.77 -3.76 -3.75 -3.74 -3.73 -3.73 -3.72 -3.71 +# Dev objf: -10.32 -4.68 -4.43 -4.31 -4.24 -4.19 -4.15 -4.13 -4.10 -4.09 -4.05 -4.03 -4.02 -4.00 -3.99 -3.98 -3.98 -3.97 -3.96 -3.96 -3.95 -3.94 -3.94 -3.94 -3.93 -3.93 -3.93 -3.92 -3.92 -3.92 -3.92 -3.91 -3.91 -3.91 -3.91 + + +dir=Spanish_gigawrd/rnnlm +pocolm_dir=Spanish_gigawrd/work_pocolm/lm/110000_3.pocolm_pruned +wordslist= +embedding_dim=1024 +lstm_rpd=256 +lstm_nrpd=256 +stage=0 +train_stage=-30 +text_dir=Spanish_gigawrd/text_lm + +. ./cmd.sh +. ./utils/parse_options.sh + +mkdir -p $dir/config +set -e + +for f in $text_dir/dev.txt; do + [ ! -f $f ] && \ + echo "$0: expected file $f to exist;" && exit 1 +done + +if [ $stage -le 0 ]; then + if [ -f $text_dir/unigram_weights ] ; then + mv $text_dir/unigram_weights $pocolm_dir/ + fi + cp $wordslist $dir/config/words.txt + n=`cat $dir/config/words.txt | wc -l` + echo " $n" >> $dir/config/words.txt + + # words that are not present in words.txt but are in the training or dev data, will be + # mapped to during training. + echo "" >$dir/config/oov.txt + local/get_data_weights.pl $pocolm_dir $dir/config/data_weights.txt + rnnlm/get_unigram_probs.py --vocab-file=$dir/config/words.txt \ + --unk-word="" \ + --data-weights-file=$dir/config/data_weights.txt \ + $text_dir | awk 'NF==2' >$dir/config/unigram_probs.txt + + # choose features + rnnlm/choose_features.py --unigram-probs=$dir/config/unigram_probs.txt \ + --use-constant-feature=true \ + --special-words=',,,,[noise],[laughter]' \ + $dir/config/words.txt > $dir/config/features.txt +fi + +if [ $stage -le 1 ]; then + cat <$dir/config/xconfig + input dim=$embedding_dim name=input + relu-renorm-layer name=tdnn1 dim=$embedding_dim input=Append(0, IfDefined(-1)) + fast-lstmp-layer name=lstm1 cell-dim=$embedding_dim recurrent-projection-dim=$lstm_rpd non-recurrent-projection-dim=$lstm_nrpd + relu-renorm-layer name=tdnn2 dim=$embedding_dim input=Append(0, IfDefined(-3)) + fast-lstmp-layer name=lstm2 cell-dim=$embedding_dim recurrent-projection-dim=$lstm_rpd non-recurrent-projection-dim=$lstm_nrpd + relu-renorm-layer name=tdnn3 dim=$embedding_dim input=Append(0, IfDefined(-3)) + output-layer name=output include-log-softmax=false dim=$embedding_dim +EOF + rnnlm/validate_config_dir.sh $text_dir $dir/config +fi + +if [ $stage -le 2 ]; then + rnnlm/prepare_rnnlm_dir.sh $text_dir $dir/config $dir +fi + +if [ $stage -le 3 ]; then + rnnlm/train_rnnlm.sh --num-jobs-initial 1 --num-jobs-final 2 \ + --stage $train_stage --num-epochs 5 --cmd "$train_cmd" $dir +fi + +exit 0 diff --git a/egs/fisher_callhome_spanish/s5/local/rnnlm/train_rnnlm.sh b/egs/fisher_callhome_spanish/s5/local/rnnlm/train_rnnlm.sh deleted file mode 100755 index 3713fe228d6..00000000000 --- a/egs/fisher_callhome_spanish/s5/local/rnnlm/train_rnnlm.sh +++ /dev/null @@ -1,101 +0,0 @@ -#!/bin/bash - -# Copyright 2012 Johns Hopkins University (author: Daniel Povey) Tony Robinson -# 2017 Hainan Xu -# 2017 Ke Li - -# This script is similar to rnnlm_lstm_tdnn_a.sh except for adding L2 regularization. - -# local/rnnlm/train_rnnlm.sh: best iteration (out of 18) was 17, linking it to final iteration. -# local/rnnlm/train_rnnlm.sh: train/dev perplexity was 45.6 / 68.7. 
-# Train objf: -651.50 -4.44 -4.26 -4.15 -4.08 -4.03 -4.00 -3.97 -3.94 -3.92 -3.90 -3.89 -3.88 -3.86 -3.85 -3.84 -3.83 -3.82 -# Dev objf: -10.76 -4.68 -4.47 -4.38 -4.33 -4.29 -4.28 -4.27 -4.26 -4.26 -4.25 -4.24 -4.24 -4.24 -4.23 -4.23 -4.23 -4.23 - -# Begin configuration section. -dir=exp/rnnlm_lstm_tdnn_1b -embedding_dim=200 -embedding_l2=0.005 # embedding layer l2 regularize -comp_l2=0.005 # component-level l2 regularize -output_l2=0.005 # output-layer l2 regularize -epochs=90 -mic= -stage=-10 -train_stage=0 - -. ./cmd.sh -. ./utils/parse_options.sh -[ -z "$cmd" ] && cmd=$train_cmd - -train=data/train/text -dev=data/dev2/text # We at no stage in run.sh should decode dev2 partition for results! -wordlist=data/lang/words.txt -text_dir=data/local/rnnlm/text -mkdir -p $dir/config -set -e - -for f in $train $dev $wordlist; do - [ ! -f $f ] && \ - echo "$0: expected file $f to exist; search for run.sh and utils/prepare_lang.sh in run.sh" && exit 1 -done - -if [ $stage -le 0 ]; then - mkdir -p $text_dir - cat $train | cut -d ' ' -f2- > $text_dir/ami.txt - cat $dev | cut -d ' ' -f2- > $text_dir/dev.txt -fi - -if [ $stage -le 1 ]; then - cp $wordlist $dir/config/ - n=`cat $dir/config/words.txt | wc -l` - echo " $n" >> $dir/config/words.txt - - # words that are not present in words.txt but are in the training or dev data, will be - # mapped to during training. - echo "" >$dir/config/oov.txt - - cat > $dir/config/data_weights.txt <$dir/config/unigram_probs.txt - - # choose features - rnnlm/choose_features.py --unigram-probs=$dir/config/unigram_probs.txt \ - --use-constant-feature=true \ - --top-word-features 10000 \ - --min-frequency 1.0e-03 \ - --special-words=',,,,[noise],[laughter]' \ - $dir/config/words.txt > $dir/config/features.txt - -lstm_opts="l2-regularize=$comp_l2" -tdnn_opts="l2-regularize=$comp_l2" -output_opts="l2-regularize=$output_l2" - - cat >$dir/config/xconfig < $dir/normalize/$job/substitute.sh + +bash $dir/normalize/$job/substitute.sh | \ + sed "s: 's:'s:g" | sed "s: 'm:'m:g" | \ + sed "s: \s*: :g" > $dir/normalize/$job/text + +local/clean_abbrevs_text.py $dir/normalize/$job/text $data/"$job"_processed +tr 'A-ZÂÁÀÄÊÉÈËÏÍÎÖÓÔÖÚÙÛÑÇ' 'a-zâáàäêéèëïíîöóôöúùûñç' < $data/"$job"_processed > $dir/normalize/$job/text + +normalizer_main --config=$config --path_prefix=$path_prefix <$dir/normalize/$job/text >$dir/$job.txt + +exit 0; diff --git a/egs/fisher_callhome_spanish/s5/local/train_pocolm.sh b/egs/fisher_callhome_spanish/s5/local/train_pocolm.sh new file mode 100755 index 00000000000..b8b3ca35ef9 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5/local/train_pocolm.sh @@ -0,0 +1,54 @@ +#!/bin/bash + +stage=-2 +num_words_pocolm=110000 +prune_size=1000000 + +. ./path.sh +. ./cmd.sh +. ./utils/parse_options.sh + +set -euo pipefail + +export POCOLM_ROOT=$(cd $KALDI_ROOT/tools/pocolm/; pwd -P) +export PATH=$PATH:$POCOLM_ROOT/scripts + +textdir=$1 +pocolm_dir=$2 + + +if [ $stage -le -2 ]; then + echo "****" + echo " POCOLM experiment : Running STAGE 1 : 2-gram Pocolm general closed vocabulary model" + echo " Will estimate the metaparams to be used as unigram weights for stage 2 ....." 
+ echo "****" + if [ -e "$textdir"/unigram_weights ]; then + rm "$textdir"/unigram_weights + fi + if [ -e "$pocolm_dir" ]; then + rm -r "$pocolm_dir" + fi + + bash local/pocolm_cust.sh --num-word 0 --ngram-order 2 --pocolm-stage 1 --lm-dir "$pocolm_dir"/lm \ + --arpa-dir "$pocolm_dir"/arpa --textdir "$textdir" + +fi + +if [ $stage -le -1 ];then + echo "********" + echo "POCOLM experiment : RUNNING STAGE 2 : 3gram POCOLM using unigram wts estimates in 1st stage....." + echo "********" + + echo " " > "$pocolm_dir"/lm/work/.unigram_weights.done + python local/get_unigram_weights_vocab.py "$pocolm_dir"/lm/0_2.pocolm/ "$textdir"/unigram_weights + bash local/pocolm_cust.sh --num-word "$num_words_pocolm" --lm-dir "$pocolm_dir"/lm \ + --arpa-dir "$pocolm_dir"/arpa --textdir "$textdir" + prune_lm_dir.py --target-num-ngrams=$prune_size "$pocolm_dir"/lm/"$num_words_pocolm"_3.pocolm \ + "$pocolm_dir"/lm/"$num_words_pocolm"_3.pocolm_pruned_"$prune_size" + mkdir -p "$pocolm_dir"/arpa + format_arpa_lm.py "$pocolm_dir"/lm/"$num_words_pocolm"_3.pocolm_pruned_"$prune_size" | \ + gzip -c > "$pocolm_dir"/arpa/"$num_words_pocolm"_3_pruned_"$prune_size".arpa.gz +fi + + +exit 0; diff --git a/egs/fisher_callhome_spanish/s5/path.sh b/egs/fisher_callhome_spanish/s5/path.sh index 17ffb0369f8..201edd95876 100755 --- a/egs/fisher_callhome_spanish/s5/path.sh +++ b/egs/fisher_callhome_spanish/s5/path.sh @@ -1,6 +1,11 @@ -export KALDI_ROOT=`pwd`/../../.. +export KALDI_ROOT=`pwd`/../../../ +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 . $KALDI_ROOT/tools/config/common_path.sh -export LC_ALL=C export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/dpovey/libs + +export SPARROWHAWK_ROOT=$KALDI_ROOT/tools/sparrowhawk +export PATH=$SPARROWHAWK_ROOT/bin:$PATH +export LC_ALL=C.UTF-8 +export LANG=C.UTF-8 diff --git a/egs/fisher_callhome_spanish/s5/run.sh b/egs/fisher_callhome_spanish/s5/run.sh index 6e2752a7b68..70d4d0555a4 100755 --- a/egs/fisher_callhome_spanish/s5/run.sh +++ b/egs/fisher_callhome_spanish/s5/run.sh @@ -4,14 +4,25 @@ # Copyright 2014 Gaurav Kumar. Apache 2.0 # Recipe for Fisher/Callhome-Spanish -stage=0 -train_stage=-20 +stage=-1 +lmstage=-2 + +# GIGAWORD RNNLM training based options below. +# GIGAWORD RAW CORPUS DATA is assumed to be already downloaded in the gigaword_datapath. +train_rnnlm=false +start_textcleanup=false # WARNING : IT starts from flattening gigaword corpus to preparing text folder. + # If you already have the normalised gigword text somewhere, you can bypass the + # time consuming text cleanup (~1 week) by setting this option false. +addtraintext=true # If true, this option appends the Fisher train text to the Gigaword corpus textfile, to + # perform the A, A + G, Dev type POCOLM training configuration. + # A=fsp train, G=gigword text, +num_words_pocolm=100000 train_sgmm2=false # call the next line with the directory where the Spanish Fisher data is # (the values below are just an example). 
sfisher_speech=/export/corpora/LDC/LDC2010S01 -sfisher_transcripts=/export/corpora/LDC/LDC2010T04 +sfisher_transcripts=/export/c03/svalluri//LDC2010T04 spanish_lexicon=/export/corpora/LDC/LDC96L16 split=local/splits/split_fisher @@ -19,15 +30,17 @@ callhome_speech=/export/corpora/LDC/LDC96S35 callhome_transcripts=/export/corpora/LDC/LDC96T17 split_callhome=local/splits/split_callhome +gigaword_datapath=/export/c03/svalluri/Spanish_gigaword_rawcorpus/data # GIGAWORD RAW CORPUS DATA DOWNLOAD PATH +rnnlm_workdir=workdir_rnnlm_Spanish_gigaword mfccdir=`pwd`/mfcc . ./cmd.sh if [ -f path.sh ]; then . ./path.sh; fi . parse_options.sh || exit 1; -set -e +set -eou pipefail -if [ $stage -le 1 ]; then +if [ $stage -le -1 ]; then local/fsp_data_prep.sh $sfisher_speech $sfisher_transcripts local/callhome_data_prep.sh $callhome_speech $callhome_transcripts @@ -37,19 +50,14 @@ if [ $stage -le 1 ]; then # ES gigaword corpus to bring the total to 64k words. The ES frequency sorted # wordlist is downloaded if it is not available. local/fsp_prepare_dict.sh $spanish_lexicon + # Let's keep the original dict copy for G2P training + cp -r data/local/dict data/local/dict_orig + ( + steps/dict/train_g2p_seq2seq.sh data/local/dict_orig/lexicon.txt exp/g2p || touch exp/g2p/.error + ) & # Added c,j, v to the non silences phones manually - utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang - - # Make sure that you do not use your test and your dev sets to train the LM - # Some form of cross validation is possible where you decode your dev/set based on an - # LM that is trained on everything but that that conversation - # When in doubt about what your data partitions should be use local/fsp_ideal_data_partitions.pl - # to get the numbers. Depending on your needs, you might have to change the size of - # the splits within that file. The default paritions are based on the Kaldi + Joshua - # requirements which means that I have very large dev and test sets - local/fsp_train_lms.sh $split - local/fsp_create_test_lang.sh + utils/prepare_lang.sh data/local/dict_orig "" data/local/lang_orig data/lang_orig utils/fix_data_dir.sh data/local/data/train_all @@ -70,34 +78,65 @@ if [ $stage -le 1 ]; then cp -r data/local/data/callhome_train_all data/callhome_train_all - # Creating data partitions for the pipeline - # We need datasets for both the ASR and SMT system - # We have 257455 utterances left, so the partitions are roughly as follows - # ASR Train : 100k utterances - # ASR Tune : 17455 utterances - # ASR Eval : 20k utterances - # MT Train : 100k utterances - # MT Tune : Same as the ASR eval set (Use the lattices from here) - # MT Eval : 20k utterances - # The dev and the test sets need to be carefully chosen so that there is no conversation/speaker - # overlap. This has been setup and the script local/fsp_ideal_data_partitions provides the numbers that are needed below. - # As noted above, the LM has not been trained on the dev and the test sets. 
- #utils/subset_data_dir.sh --first data/train_all 158126 data/dev_and_test - #utils/subset_data_dir.sh --first data/dev_and_test 37814 data/asr_dev_and_test - #utils/subset_data_dir.sh --last data/dev_and_test 120312 data/mt_train_and_test - #utils/subset_data_dir.sh --first data/asr_dev_and_test 17662 data/dev - #utils/subset_data_dir.sh --last data/asr_dev_and_test 20152 data/test - #utils/subset_data_dir.sh --first data/mt_train_and_test 100238 data/mt_train - #utils/subset_data_dir.sh --last data/mt_train_and_test 20074 data/mt_test - #rm -r data/dev_and_test - #rm -r data/asr_dev_and_test - #rm -r data/mt_train_and_test - local/create_splits.sh $split local/callhome_create_splits.sh $split_callhome + fi +if [ $stage -le 0 ]; then + if $start_textcleanup; then + echo "WARNING : Starting from cleaning up and normalizing the Gigword text" + echo " This might take few days........... You can skip out this stage " + echo " by setting start_textcleanup=false, and having normalised_gigaword_corpus/text_normalized ready inside $rnnlm_workdir." + + mkdir -p "$rnnlm_workdir"/gigaword_rawtext + local/flatten_gigaword/flatten_all_gigaword.sh "$gigaword_datapath" "$rnnlm_workdir"/flattened_gigaword_corpus 24 + cat "$rnnlm_workdir"/flattened_gigaword_corpus/*.flat > "$rnnlm_workdir"/gigaword_rawtext/in.txt + local/clean_txt_dir.sh "$rnnlm_workdir"/gigaword_rawtext/ \ + "$rnnlm_workdir"/normalised_gigaword_corpus/ + fi + mkdir -p "$rnnlm_workdir"/text_lm + cut -d " " -f 2- data/train/text > "$rnnlm_workdir"/text_lm/train.txt + cut -d " " -f 2- data/dev2/text > "$rnnlm_workdir"/text_lm/dev.txt # For RNNLM and POCOLM training we use dev2/text as dev file. + cp "$rnnlm_workdir"/normalised_gigaword_corpus/text_normalized "$rnnlm_workdir"/text_lm/spanish_gigaword_normalised.txt + if $addtraintext; then + cat "$rnnlm_workdir"/text_lm/train.txt >> "$rnnlm_workdir"/text_lm/spanish_gigaword_normalised.txt + fi +fi + +if [ $stage -le 1 ]; then + local/train_pocolm.sh --stage $lmstage --num-words-pocolm $num_words_pocolm "$rnnlm_workdir"/text_lm/ "$rnnlm_workdir"/pocolm + local/get_rnnlm_wordlist.py data/lang_orig/words.txt "$rnnlm_workdir"/pocolm/lm/"$num_words_pocolm"_3.pocolm/words.txt \ + "$rnnlm_workdir"/rnnlm_wordlist "$rnnlm_workdir"/oov_pocolmwords + if $train_rnnlm; then + local/rnnlm.sh --stage $lmstage --dir "$rnnlm_workdir"/rnnlm --pocolm-dir "$rnnlm_workdir"/pocolm/lm/"$num_words_pocolm"_3.pocolm \ + --wordslist "$rnnlm_workdir"/rnnlm_wordlist --text-dir "$rnnlm_workdir"/text_lm + fi +fi + + if [ $stage -le 2 ]; then + wait # wait till G2P training finishes + if [ -f exp/g2p/.error ]; then + rm exp/g2p/.error || true + echo "Fail to train the G2P model." && exit 1; + fi + steps/dict/apply_g2p_seq2seq.sh "$rnnlm_workdir"/oov_pocolmwords exp/g2p "$rnnlm_workdir"/oov_g2p.lex + cat "$rnnlm_workdir"/oov_g2p.lex/lexicon.lex data/local/dict/lexicon.txt | sed "/^[[:space:]]*$/d" | sort | uniq > "$rnnlm_workdir"/lexicon_extended.txt + cp "$rnnlm_workdir"/lexicon_extended.txt data/local/dict/lexicon.txt # Replacing original lexicon with extended version. + + utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang + + # Make sure that you do not use your test and your dev sets to train the LM + # Some form of cross validation is possible where you decode your dev/set based on an + # LM that is trained on everything but that that conversation + # When in doubt about what your data partitions should be use local/fsp_ideal_data_partitions.pl + # to get the numbers. 
Depending on your needs, you might have to change the size of + # the splits within that file. The default paritions are based on the Kaldi + Joshua + # requirements which means that I have very large dev and test sets + local/fsp_train_lms.sh $split + local/fsp_create_test_lang.sh + # Now compute CMVN stats for the train, dev and test subsets steps/compute_cmvn_stats.sh data/dev exp/make_mfcc/dev $mfccdir steps/compute_cmvn_stats.sh data/test exp/make_mfcc/test $mfccdir @@ -264,8 +303,11 @@ for iter in 1 2 3 4; do data/lang_test data/dev/ exp/sgmm5/decode_dev $decode done ) & - fi -local/chain/run_tdnn_1g.sh --stage $stage --train-stage $train_stage || exit 1; +wait; + +if [ $stage -le 6 ]; then + local/chain/run_tdnn_1g.sh --stage 0 --gigaword-workdir $rnnlm_workdir || exit 1; +fi exit 0; diff --git a/egs/wsj/s5/steps/dict/apply_g2p_seq2seq.sh b/egs/wsj/s5/steps/dict/apply_g2p_seq2seq.sh new file mode 100644 index 00000000000..e6e316ec6b1 --- /dev/null +++ b/egs/wsj/s5/steps/dict/apply_g2p_seq2seq.sh @@ -0,0 +1,41 @@ +#!/bin/bash + +# Copyright 2018 Govivace Inc. (Author: Valluri Saikiran) +# Apache License 2.0 + +# This script applies a g2p model using CMUsphinx/seq2seq. + +stage=0 +encoding='utf-8' + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. utils/parse_options.sh || exit 1; + +set -u +set -e + +if [ $# != 3 ]; then + echo "Usage: $0 [options] " + echo " where is the OOV wordlist " + echo " is directory where the models will be stored" + exit 1; +fi + +lexicon=$1 +wdir=$2 +outdir=$3 + +mkdir -p $outdir + +[ ! -f $lexicon ] && echo "Cannot find $lexicon" && exit + +if [ ! -s `which g2p-seq2seq` ] ; then + echo "g2p-seq2seq was not found !" + echo "Go to $KALDI_ROOT/tools and execute extras/install_g2p_seq2seq.sh" + exit 1 +fi + +g2p-seq2seq --decode $lexicon --model_dir $wdir --output $outdir/lexicon.lex + diff --git a/egs/wsj/s5/steps/dict/train_g2p_seq2seq.sh b/egs/wsj/s5/steps/dict/train_g2p_seq2seq.sh new file mode 100644 index 00000000000..e0389171fd5 --- /dev/null +++ b/egs/wsj/s5/steps/dict/train_g2p_seq2seq.sh @@ -0,0 +1,39 @@ +#!/bin/bash + +# Copyright 2018 Govivace Inc. (Author: Valluri Saikiran) +# Apache License 2.0 + +# This script trains a g2p model using CMUsphinx/seq2seq. + +stage=0 +encoding='utf-8' + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. utils/parse_options.sh || exit 1; + +set -u +set -e + +if [ $# != 2 ]; then + echo "Usage: $0 [options] " + echo " where is the training lexicon (one pronunciation per " + echo " word per line, with lines like 'hello h uh l ow') and" + echo " is directory where the models will be stored" + exit 1; +fi + +lexicon=$1 +wdir=$2 + +[ ! -f $lexicon ] && echo "Cannot find $lexicon" && exit + +if [ ! -s `which g2p-seq2seq` ]; then + echo "g2p-seq2seq was not found !" 
+ echo "Go to $KALDI_ROOT/tools and execute extras/install_g2p_seq2seq.sh" + exit 1 +fi + +g2p-seq2seq --max_epochs 12 --train $lexicon --model_dir $wdir + diff --git a/scripts/rnnlm/show_word_features.py b/scripts/rnnlm/show_word_features.py index 4335caed5d8..2beede5acc6 100755 --- a/scripts/rnnlm/show_word_features.py +++ b/scripts/rnnlm/show_word_features.py @@ -7,6 +7,7 @@ import argparse import sys + sys.stdout = open(1, 'w', encoding='utf-8', closefd=False) import re diff --git a/tools/extras/install_g2p_seq2seq.sh b/tools/extras/install_g2p_seq2seq.sh new file mode 100644 index 00000000000..c9979b8b961 --- /dev/null +++ b/tools/extras/install_g2p_seq2seq.sh @@ -0,0 +1,5 @@ +if [ ! -e g2p-seq2seq ];then + git clone https://github.com/cmusphinx/g2p-seq2seq.git + cd g2p-seq2seq/ + python setup.py install +fi diff --git a/tools/install_g2p_seq2seq.sh b/tools/install_g2p_seq2seq.sh new file mode 120000 index 00000000000..77715305f74 --- /dev/null +++ b/tools/install_g2p_seq2seq.sh @@ -0,0 +1 @@ +extras/install_g2p_seq2seq.sh \ No newline at end of file diff --git a/tools/install_sparrowhawk.sh b/tools/install_sparrowhawk.sh new file mode 100755 index 00000000000..b6a7af211f5 --- /dev/null +++ b/tools/install_sparrowhawk.sh @@ -0,0 +1,73 @@ +#!/bin/bash +export LDFLAGS="-L`pwd`/openfst/lib" +export CXXFLAGS="-I`pwd`/openfst/include" +stage=0 + +if [ $stage -le 0 ] ; then + rm -rf re2 protobuf sparrowhawk* + git clone -b feature/Spanish_normalizer https://github.com/spokencloud/sparrowhawk-resources.git || exit 1; + patch -p0 < sparrowhawk-resources/local/Makefile.patch || exit 1; + make openfst || exit 1; + git clone https://github.com/mjansche/thrax.git + export LDFLAGS=-L`pwd`/openfst/lib + export CXXFLAGS=-I`pwd`/openfst/include + cd thrax + autoreconf --force --install || exit 1; + ./configure --prefix=`pwd` || exit 1; + make || exit 1; + make install || exit 1; + cd .. + git clone https://github.com/google/re2.git || exit 1; + cd re2/ + make -j 20 || exit 1; + make test || exit 1; + make install prefix=`pwd` || exit 1; + cd .. + git clone https://github.com/google/protobuf.git || exit 1; + cd protobuf/ + ./autogen.sh || exit 1; + ./configure --prefix=`pwd` || exit 1; + make -j 20 || exit 1; + make install || exit 1; + cd .. +fi + +if [ $stage -le 1 ]; then + git clone https://github.com/google/sparrowhawk.git || exit 1; + patch -p0 < sparrowhawk-resources/local/sparrowhawk.patch || exit 1; + cd sparrowhawk/ || exit 1; + mkdir lib + mkdir bin + mkdir include + cp -r ../openfst/lib/* lib/ || exit 1; + cp -r ../protobuf/lib/* lib/ || exit 1; + cp -r ../re2/lib/* lib/ || exit 1; + cp -r ../thrax/lib/* lib/ || exit 1; + cp -r ../openfst/include/* include/ || exit 1; + cp -r ../protobuf/include/* include/ || exit 1; + cp -r ../re2/include/* include/ || exit 1; + cp -r ../thrax/include/* include/ || exit 1; + cp ../protobuf/bin/protoc bin/. || exit 1; + export PATH=`pwd`/bin:$PATH + aclocal || exit 1; + automake || exit 1; + ./configure --prefix=`pwd` CPPFLAGS="-I`pwd`/include" LDFLAGS="-L`pwd`/lib" || exit 1; + make || exit 1; + make install || exit 1; + cd .. +fi + +if [ $stage -le 2 ]; then + cp -r sparrowhawk-resources/language-resources sparrowhawk/ || exit 1; + cd sparrowhawk/language-resources/esp/textnorm/classifier || exit 1; + . 
./path.sh || exit 1; + python2 create_far.py ascii.syms universal_depot_ascii universal_depot universal_depot.far + thraxmakedep tokenize_and_classify.grm || exit 1; + make || exit 1; + cd ../verbalizer + python2 create_far.py ascii.syms number_names_depot_ascii number_names_depot number_names_depot.far + cp -r ../classifier/universal_depot.far . + thraxmakedep verbalize.grm || exit 1; + make || exit 1; + cd ../../../../.. +fi