diff --git a/egs/fisher_callhome_spanish/s5/RESULTS b/egs/fisher_callhome_spanish/s5/RESULTS deleted file mode 100644 index 66613163cea..00000000000 --- a/egs/fisher_callhome_spanish/s5/RESULTS +++ /dev/null @@ -1,38 +0,0 @@ --------------------------------------------------------------------------------------- -Triphone with mono alignment (small) --------------------------------------------------------------------------------------- -%WER 53.70 [ 21570 / 40170, 2618 ins, 6013 del, 12939 sub ] exp/tri1/decode_dev/wer_14_0.0 - --------------------------------------------------------------------------------------- -Triphone with tri alignments --------------------------------------------------------------------------------------- -%WER 53.18 [ 21364 / 40170, 2889 ins, 5533 del, 12942 sub ] exp/tri2/decode_dev/wer_13_0.0 - --------------------------------------------------------------------------------------- -Triphone + LDA + MLLT --------------------------------------------------------------------------------------- -%WER 46.95 [ 18858 / 40170, 2636 ins, 5197 del, 11025 sub ] exp/tri3a/decode_dev/wer_14_0.0 - --------------------------------------------------------------------------------------- -+ SAT + fMLLR --------------------------------------------------------------------------------------- -%WER 42.86 [ 17217 / 40170, 2556 ins, 4633 del, 10028 sub ] exp/tri4a/decode_dev/wer_15_0.0 - --------------------------------------------------------------------------------------- -+ More leaves and gaussians --------------------------------------------------------------------------------------- -%WER 40.48 [ 16261 / 40170, 2689 ins, 4130 del, 9442 sub ] exp/tri5a/decode_dev/wer_14_0.0 - --------------------------------------------------------------------------------------- -+ bMMI + SGMM --------------------------------------------------------------------------------------- -%WER 38.43 [ 15437 / 40170, 2800 ins, 3685 del, 8952 sub ] exp/sgmm5/decode_dev/wer_10_0.0 -%WER 36.90 [ 14821 / 40170, 2708 ins, 3552 del, 8561 sub ] exp/sgmm5_mmi_b0.1/decode_dev_it1/wer_10_0.0 -%WER 36.09 [ 14499 / 40170, 2511 ins, 3737 del, 8251 sub ] exp/sgmm5_mmi_b0.1/decode_dev_it2/wer_11_0.0 -%WER 35.48 [ 14252 / 40170, 2672 ins, 3370 del, 8210 sub ] exp/sgmm5_mmi_b0.1/decode_dev_it3/wer_10_0.0 -%WER 35.16 [ 14122 / 40170, 2701 ins, 3287 del, 8134 sub ] exp/sgmm5_mmi_b0.1/decode_dev_it4/wer_10_0.0 - --------------------------------------------------------------------------------------- -pNorm-Ensemble DNN --------------------------------------------------------------------------------------- -%WER 35.13 [ 14113 / 40170, 2680 ins, 3405 del, 8028 sub ] exp/tri6a_dnn/decode_dev/wer_11_0.0 diff --git a/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh b/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh index 7f407552c2e..3e400914521 100755 --- a/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh +++ b/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh @@ -27,6 +27,7 @@ nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. affix=1g #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. 
common_egs_dir= reporting_email= +gigaword_workdir= # LSTM/chain options train_stage=-10 @@ -254,11 +255,6 @@ if [ $stage -le 21 ]; then fi -rnnlmdir=exp/rnnlm_lstm_tdnn_1b -if [ $stage -le 22 ]; then - local/rnnlm/train_rnnlm.sh --dir $rnnlmdir || exit 1; -fi - if [ $stage -le 23 ]; then frames_per_chunk=$(echo $chunk_width | cut -d, -f1) rm $dir/.error 2>/dev/null || true @@ -277,8 +273,11 @@ if [ $stage -le 23 ]; then --online-ivector-dir exp/nnet3/ivectors_${data}_hires \ $tree_dir/graph_${lmtype} data/${data}_hires ${dir}/decode_${lmtype}_${data} || exit 1; done - bash local/rnnlm/lmrescore_nbest.sh 1.0 data/lang_test $rnnlmdir data/${data}_hires/ \ - ${dir}/decode_${lmtype}_${data} $dir/decode_rnnLM_${lmtype}_${data} || exit 1; + if [ $gigaword_workdir ]; then + lmtype=fsp_train + bash rnnlm/lmrescore_nbest.sh 1.0 data/lang_test $gigaword_workdir/rnnlm data/${data}_hires/ \ + ${dir}/decode_${lmtype}_${data} $dir/decode_gigaword_RNNLM_${lmtype}_${data} || exit 1; + fi ) || touch $dir/.error & done wait diff --git a/egs/fisher_callhome_spanish/s5/local/clean_abbrevs_text.py b/egs/fisher_callhome_spanish/s5/local/clean_abbrevs_text.py new file mode 100644 index 00000000000..7d92eb9fe3a --- /dev/null +++ b/egs/fisher_callhome_spanish/s5/local/clean_abbrevs_text.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# 2018 Saikiran Valluri, GoVivace inc., + +import os, sys +import re +import codecs + +if len(sys.argv) < 3: + print("Usage : python clean_abbrevs_text.py ") + print(" Processes the text before text normalisation to convert uppercase words as space separated letters") + sys.exit() + +inputfile=codecs.open(sys.argv[1], encoding='utf-8') +outputfile=codecs.open(sys.argv[2], encoding='utf-8', mode='w') + +for line in inputfile: + words = line.split() + textout = "" + wordcnt = 0 + for word in words: + if re.match(r"\b([A-ZÂÁÀÄÊÉÈËÏÍÎÖÓÔÖÚÙÛÑÇ])+[']?s?\b", word): + if wordcnt > 0: + word = re.sub('\'?s', 's', word) + textout = textout + " ".join(word) + " " + else: + textout = textout + word + " " + else: + textout = textout + word + " " + if word.isalpha(): wordcnt = wordcnt + 1 + outputfile.write(textout.strip()+ '\n') + +inputfile.close() +outputfile.close() diff --git a/egs/fisher_callhome_spanish/s5/local/clean_txt_dir.sh b/egs/fisher_callhome_spanish/s5/local/clean_txt_dir.sh new file mode 100755 index 00000000000..5d25e3a3fd2 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5/local/clean_txt_dir.sh @@ -0,0 +1,57 @@ +#!/bin/bash + +# Script to clean up gigaword LM text +# Removes punctuations, does case normalization + +stage=0 +nj=500 + +. ./path.sh +. ./cmd.sh +. ./utils/parse_options.sh + +set -euo pipefail + +if [ $# -ne 2 ]; then + echo "Usage: $0 " + exit 1; +fi + +if [ ! -s `which normalizer_main` ] ; then + echo "Sparrowhawk normalizer was not found installed !" + echo "Go to $KALDI_ROOT/tools and execute install_sparrowhawk.sh and try again!" + exit 1 +fi + +txtdir=$1 +textdir=$(realpath $txtdir) +outdir=$(realpath $2) + +workdir=$outdir/tmp +if [ $stage -le 0 ]; then + rm -rf $outdir + mkdir -p $workdir + mkdir -p $textdir/splits + mkdir -p $outdir/data + split -l 1000000 $textdir/in.txt $textdir/splits/out + numsplits=0 + for x in $textdir/splits/*; do + numsplits=$((numsplits+1)) + ln -s $x $outdir/data/$numsplits + done + echo $numsplits + cp $SPARROWHAWK_ROOT/documentation/grammars/sentence_boundary_exceptions.txt . 
+ $train_cmd --max_jobs_run 100 JOB=1:$numsplits $outdir/sparrowhawk/log/JOB.log \ + local/run_norm.sh \ + sparrowhawk_configuration.ascii_proto \ + $SPARROWHAWK_ROOT/language-resources/esp/sparrowhawk/ \ + $outdir/data \ + JOB \ + $outdir/sparrowhawk/ + cat $outdir/sparrowhawk/*.txt | sed "/^$/d" > $outdir/text_normalized + + # check if numbers are there in normalized output + awk '{for(i=1;i<=NF;i++) {if (!seen[$i]) {print $i; seen[$i]=1} }}' \ + $outdir/text_normalized > $outdir/unique_words + grep "[0-9]" $outdir/unique_words | sort -u > $outdir/numbers +fi diff --git a/egs/fisher_callhome_spanish/s5/local/ctm.sh b/egs/fisher_callhome_spanish/s5/local/ctm.sh index 62860a10b7b..7d09f574580 100755 --- a/egs/fisher_callhome_spanish/s5/local/ctm.sh +++ b/egs/fisher_callhome_spanish/s5/local/ctm.sh @@ -19,9 +19,9 @@ fi steps/get_ctm.sh $data_dir $lang_dir $decode_dir # Make sure that channel markers match -#perl -i -pe "s:\s.*_fsp-([AB]): \1:g" data/dev/stm -#ls exp/tri5a/decode_dev/score_*/dev.ctm | xargs -I {} perl -i -pe 's:fsp\s1\s:fsp A :g' {} -#ls exp/tri5a/decode_dev/score_*/dev.ctm | xargs -I {} perl -i -pe 's:fsp\s2\s:fsp B :g' {} +#sed -i "s:\s.*_fsp-([AB]): \1:g" data/dev/stm +#ls exp/tri5a/decode_dev/score_*/dev.ctm | xargs -I {} sed -i -r 's:fsp\s1\s:fsp A :g' {} +#ls exp/tri5a/decode_dev/score_*/dev.ctm | xargs -I {} sed -i -r 's:fsp\s2\s:fsp B :g' {} # Get the environment variables . /export/babel/data/software/env.sh diff --git a/egs/fisher_callhome_spanish/s5/local/flatten_gigaword/flatten_all_gigaword.sh b/egs/fisher_callhome_spanish/s5/local/flatten_gigaword/flatten_all_gigaword.sh new file mode 100755 index 00000000000..242359e7c28 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5/local/flatten_gigaword/flatten_all_gigaword.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash +set -e + +# Path to Gigaword corpus with all data files decompressed. +export GIGAWORDDIR=$1 +# The directory to write output to +export OUTPUTDIR=$2 +# The number of jobs to run at once +export NUMJOBS=$3 + +echo "Flattening Gigaword with ${NUMJOBS} processes..." +mkdir -p $OUTPUTDIR +find ${GIGAWORDDIR}/data/*/* -type f -print -exec local/flatten_gigaword/run_flat.sh {} ${OUTPUTDIR} \; +echo "Combining the flattened files into one..." +cat ${OUTPUTDIR}/*.flat > ${OUTPUTDIR}/flattened_gigaword.txt diff --git a/egs/fisher_callhome_spanish/s5/local/flatten_gigaword/flatten_one_gigaword.py b/egs/fisher_callhome_spanish/s5/local/flatten_gigaword/flatten_one_gigaword.py new file mode 100644 index 00000000000..29f6766dd84 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5/local/flatten_gigaword/flatten_one_gigaword.py @@ -0,0 +1,61 @@ +# -*- coding: utf-8 -*- + +import logging +import os +import re +import spacy +import gzip + +from argparse import ArgumentParser +from bs4 import BeautifulSoup + +en_nlp = spacy.load("es") + + +def flatten_one_gigaword_file(file_path): + f = gzip.open(file_path) + html = f.read() + # Parse the text with BeautifulSoup + soup = BeautifulSoup(html, "html.parser") + + # Iterate over all
<p>
items and get the text for each. + all_paragraphs = [] + for paragraph in soup("p"): + # Turn inter-paragraph newlines into spaces + paragraph = paragraph.get_text() + paragraph = re.sub(r"\n+", "\n", paragraph) + paragraph = paragraph.replace("\n", " ") + # Tokenize the paragraph into words + tokens = en_nlp.tokenizer(paragraph) + words = [str(token) for token in tokens if not + str(token).isspace()] + if len(words) < 3: + continue + all_paragraphs.append(words) + # Return a list of strings, where each string is a + # space-tokenized paragraph. + return [" ".join(paragraph) for paragraph in all_paragraphs] + + +if __name__ == "__main__": + log_fmt = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + logging.basicConfig(level=logging.INFO, format=log_fmt) + logger = logging.getLogger(__name__) + + parser = ArgumentParser(description=("Flatten a gigaword data file for " + "use in language modeling.")) + parser.add_argument("--gigaword-path", required=True, + metavar="", type=str, + help=("Path to Gigaword directory, with " + "all .gz files unzipped.")) + parser.add_argument("--output-dir", required=True, metavar="", + type=str, help=("Directory to write final flattened " + "Gigaword file.")) + + A = parser.parse_args() + all_paragraphs = flatten_one_gigaword_file(A.gigaword_path) + output_path = os.path.join(A.output_dir, + os.path.basename(A.gigaword_path) + ".flat") + with open(output_path, "w") as output_file: + for paragraph in all_paragraphs: + output_file.write("{}\n".format(paragraph)) diff --git a/egs/fisher_callhome_spanish/s5/local/flatten_gigaword/run_flat.sh b/egs/fisher_callhome_spanish/s5/local/flatten_gigaword/run_flat.sh new file mode 100755 index 00000000000..6b236be0ab9 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5/local/flatten_gigaword/run_flat.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash +set -e + +. ./path_venv.sh + +# Path to Gigaword corpus with all data files decompressed. +GIGAWORDPATH=$1 +# The directory to write output to +OUTPUTDIR=$2 +file=$(basename ${GIGAWORDPATH}) +if [ ! -e ${OUTPUTDIR}/${file}.flat ]; then + echo "flattening to ${OUTPUTDIR}/${file}.flat" + python local/flatten_gigaword/flatten_one_gigaword.py --gigaword-path ${GIGAWORDPATH} --output-dir ${OUTPUTDIR} +else + echo "skipping ${file}.flat" +fi + diff --git a/egs/fisher_callhome_spanish/s5/local/fsp_data_prep.sh b/egs/fisher_callhome_spanish/s5/local/fsp_data_prep.sh index 11d65da3e95..22b98a6c9db 100755 --- a/egs/fisher_callhome_spanish/s5/local/fsp_data_prep.sh +++ b/egs/fisher_callhome_spanish/s5/local/fsp_data_prep.sh @@ -133,6 +133,7 @@ if [ $stage -le 2 ]; then sed 's:::g' | \ sed 's:foreign>::g' | \ + sed 's:\[noise\]:[noise] :g' | \ sed 's:>::g' | \ #How do you handle numbers? 
grep -v '()' | \ diff --git a/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh b/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh index 779298305c4..7b2de2db392 100755 --- a/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh +++ b/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh @@ -105,8 +105,9 @@ if [ $stage -le 4 ]; then cp "$tmpdir/lexicon.1" "$tmpdir/lexicon.2" # Add prons for laughter, noise, oov - w=$(grep -v sil $dir/silence_phones.txt | tr '\n' '|') - perl -i -ne "print unless /\[(${w%?})\]/" $tmpdir/lexicon.2 + for w in `grep -v sil $dir/silence_phones.txt`; do + sed -i "/\[$w\]/d" $tmpdir/lexicon.2 + done for w in `grep -v sil $dir/silence_phones.txt`; do echo "[$w] $w" diff --git a/egs/fisher_callhome_spanish/s5/local/get_data_weights.pl b/egs/fisher_callhome_spanish/s5/local/get_data_weights.pl new file mode 100755 index 00000000000..ca5b2a46f8e --- /dev/null +++ b/egs/fisher_callhome_spanish/s5/local/get_data_weights.pl @@ -0,0 +1,39 @@ +#!/usr/bin/env perl + +# Nagendra Kumar Goel + +# This takes two arguments: +# 1) Pocolm training output folder +# 2) rnnlm weights file name (for output) + +use POSIX; +use List::Util qw[min max]; + +if (@ARGV != 2) { + die "Usage: get_data_weights.pl \n"; +} + +$pdir = shift @ARGV; +$out = shift @ARGV; + +open(P, "<$pdir/metaparameters") || die "Could not open $pdir/metaparameters"; +open(N, "<$pdir/names") || die "Could not open $pdir/names" ; +open(O, ">$out") || die "Could not open $out for writing" ; + +my %scores = (); + +while() { + @n = split(/\s/,$_); + $name = $n[1]; + $w =
<N>
; + @w = split(/\s/,$w); + $weight = $w[1]; + $scores{$name} = $weight; +} + +$min = min(values %scores); + +for(keys %scores) { + $weightout = POSIX::ceil($scores{$_} / $min); + print O "$_\t1\t$weightout\n"; +} diff --git a/egs/fisher_callhome_spanish/s5/local/get_rnnlm_wordlist.py b/egs/fisher_callhome_spanish/s5/local/get_rnnlm_wordlist.py new file mode 100755 index 00000000000..fc13a7af701 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5/local/get_rnnlm_wordlist.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# 2018 Saikiran Valluri, GoVivace inc. + +import os, sys + +if len(sys.argv) < 5: + print( "Usage: python get_rnnlm_wordlist.py ") + sys.exit() + +lexicon_words = open(sys.argv[1], 'r', encoding="utf-8") +pocolm_words = open(sys.argv[2], 'r', encoding="utf-8") +rnnlm_wordsout = open(sys.argv[3], 'w', encoding="utf-8") +oov_wordlist = open(sys.argv[4], 'w', encoding="utf-8") + +line_count=0 +lexicon=[] + +for line in lexicon_words: + lexicon.append(line.split()[0]) + rnnlm_wordsout.write(line.split()[0] + " " + str(line_count)+'\n') + line_count = line_count + 1 + +for line in pocolm_words: + if not line.split()[0] in lexicon: + oov_wordlist.write(line.split()[0]+'\n') + rnnlm_wordsout.write(line.split()[0] + " " + str(line_count)+'\n') + line_count = line_count + 1 + +lexicon_words.close() +pocolm_words.close() +rnnlm_wordsout.close() +oov_wordlist.close() diff --git a/egs/fisher_callhome_spanish/s5/local/get_unigram_weights_vocab.py b/egs/fisher_callhome_spanish/s5/local/get_unigram_weights_vocab.py new file mode 100644 index 00000000000..3ecd16772d7 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5/local/get_unigram_weights_vocab.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# 2018 Saikiran Valluri, GoVivace inc. + +import os, sys + +if len(sys.argv) < 3: + print("Usage : python . ") + print(" Used for generating the unigram weights for second pass vocabulary from the first pass pocolm training metaparameters.") + sys.exit() + +pocolmdir=sys.argv[1] +unigramwts=open(sys.argv[2], 'w') + +names = open(pocolmdir+"/names", 'r') +metaparams = open(pocolmdir+"/metaparameters", 'r') + +name_mapper={} +for line in names: + fields=line.split() + name_mapper[fields[0]] = fields[1] + +lns = metaparams.readlines() +for lineno in range(len(name_mapper.keys())): + line = lns[lineno] + fileid = line.split()[0].split("_")[-1] + weight = line.split()[1] + unigramwts.write(name_mapper[fileid] + " " + weight + "\n") + +names.close() +unigramwts.close() +metaparams.close() diff --git a/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py b/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py index c7aa6affb11..c7e0f140d2f 100755 --- a/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py +++ b/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py @@ -1,11 +1,14 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # + # Copyright 2014 Gaurav Kumar. 
Apache 2.0 # 2018 Nagendra Kumar Goel, Saikiran Valluri, GoVivace inc., Avaaya + # Merges unique words from Spanish Fisher, Gigaword and the LDC spanish lexicon from __future__ import print_function -import sys, re +import sys +import re import json import codecs import operator diff --git a/egs/fisher_callhome_spanish/s5/local/pocolm_cust.sh b/egs/fisher_callhome_spanish/s5/local/pocolm_cust.sh new file mode 100755 index 00000000000..0a5649c2a79 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5/local/pocolm_cust.sh @@ -0,0 +1,120 @@ +#!/usr/bin/env bash + +# this script generates Pocolm-estimated language models with various +# data sources in data/text folder and places the output in data/lm. + +set -euo pipefail + +. ./path.sh + +export POCOLM_ROOT=$(cd $KALDI_ROOT/tools/pocolm/; pwd -P) +export PATH=$PATH:$POCOLM_ROOT/scripts + + +wordlist=None +num_word=100000 +pocolm_stage=1 +ngram_order=3 +lm_dir= +arpa_dir= +textdir= +max_memory='--max-memory=8G' + +. ./cmd.sh +. ./utils/parse_options.sh + + +# If you do not want to set memory limitation for "sort", you can use +#max_memory= +# Choices for the max-memory can be: +# 1) integer + 'K', 'M', 'G', ... +# 2) integer + 'b', meaning unit is byte and no multiplication +# 3) integer + '%', meaning a percentage of memory +# 4) integer, default unit is 'K' + +fold_dev_opt= +# If you want to fold the dev-set in to the 'swbd1' set to produce the final +# model, un-comment the following line. For use in the Kaldi example script for +# ASR, this isn't suitable because the 'dev' set is the first 10k lines of the +# switchboard data, which we also use as dev data for speech recognition +# purposes. +#fold_dev_opt="--fold-dev-into=swbd1" + +bypass_metaparam_optim_opt= +# If you want to bypass the metaparameter optimization steps with specific metaparameters +# un-comment the following line, and change the numbers to some appropriate values. +# You can find the values from output log of train_lm.py. +# These example numbers of metaparameters is for 3-gram model running with train_lm.py. +# the dev perplexity should be close to the non-bypassed model. +#bypass_metaparam_optim_opt="--bypass-metaparameter-optimization=0.091,0.867,0.753,0.275,0.100,0.018,0.902,0.371,0.183,0.070" +# Note: to use these example parameters, you may need to remove the .done files +# to make sure the make_lm_dir.py be called and tain only 3-gram model +#for order in 3; do +#rm -f ${lm_dir}/${num_word}_${order}.pocolm/.done + +#limit_unk_history_opt= +# If you want to limit the left of in the history of a n-gram +# un-comment the following line +limit_unk_history_opt="--limit-unk-history=true" + +for order in ${ngram_order}; do + # decide on the vocabulary. + # Note: you'd use --wordlist if you had a previously determined word-list + # that you wanted to use. 
+ lm_name="${num_word}_${order}" + min_counts='' + # Note: the following might be a more reasonable setting: + # min_counts='fisher=2 swbd1=1' + if [ -n "${min_counts}" ]; then + lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`" + fi + unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm + train_lm.py --num-words=${num_word} --num-splits=5 --warm-start-ratio=10 ${max_memory} \ + --min-counts=${min_counts} \ + --keep-int-data=true ${fold_dev_opt} ${bypass_metaparam_optim_opt} \ + ${limit_unk_history_opt} ${textdir} ${order} ${lm_dir}/work ${unpruned_lm_dir} + + if [ $pocolm_stage -eq 2 ];then + mkdir -p ${arpa_dir} + format_arpa_lm.py ${max_memory} ${unpruned_lm_dir} | gzip -c > ${arpa_dir}/${lm_name}_${order}gram_unpruned.arpa.gz + + # example of pruning. note: the threshold can be less than or more than one. + get_data_prob.py ${max_memory} ${textdir}/dev.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' + for threshold in 1.0 2.0 4.0; do + pruned_lm_dir=${lm_dir}/${lm_name}_prune${threshold}.pocolm + prune_lm_dir.py --final-threshold=${threshold} ${max_memory} ${unpruned_lm_dir} ${pruned_lm_dir} 2>&1 | tail -n 5 | head -n 3 + get_data_prob.py ${max_memory} ${textdir}/dev.txt ${pruned_lm_dir} 2>&1 | grep -F '[perplexity' + + format_arpa_lm.py ${max_memory} ${pruned_lm_dir} | gzip -c > ${arpa_dir}/${lm_name}_${order}gram_prune${threshold}.arpa.gz + + done + + # example of pruning by size. + size=1000000 + pruned_lm_dir=${lm_dir}/${lm_name}_prune${size}.pocolm + prune_lm_dir.py --target-num-ngrams=${size} ${max_memory} ${unpruned_lm_dir} ${pruned_lm_dir} 2>&1 | tail -n 8 | head -n 6 | grep -v 'log-prob changes' + get_data_prob.py ${textdir}/dev.txt ${max_memory} ${pruned_lm_dir} 2>&1 | grep -F '[perplexity' + + format_arpa_lm.py ${max_memory} ${pruned_lm_dir} | gzip -c > ${arpa_dir}/${lm_name}_${order}gram_prune${size}.arpa.gz + fi +done + +# (run local/srilm_baseline.sh ${num_word} to see the following result e.g. local/srilm_baseline.sh 40000 ) + +# the following does does some self-testing, including +# that the computed derivatives are accurate. +# local/self_test.sh + +# perplexities from pocolm-estimated language models with pocolm's interpolation +# method from orders 3, 4, and 5 are: +# order 3: optimize_metaparameters.py: final perplexity without barrier function was -4.358818 (perplexity: 78.164689) +# order 4: optimize_metaparameters.py: final perplexity without barrier function was -4.309507 (perplexity: 74.403797) +# order 5: optimize_metaparameters.py: final perplexity without barrier function was -4.301741 (perplexity: 73.828181) + +# note, the perplexities from pocolm-estimated language models with SRILM's +# interpolation from orders 3 and 4 are (from local/pocolm_with_srilm_combination.sh), +# 78.8449 and 75.2202 respectively. + +# note, the perplexities from SRILM-estimated language models with SRILM's +# interpolation tool from orders 3 and 4 are (from local/srilm_baseline.sh), +# 78.9056 and 75.5528 respectively. diff --git a/egs/fisher_callhome_spanish/s5/local/rnnlm.sh b/egs/fisher_callhome_spanish/s5/local/rnnlm.sh new file mode 100755 index 00000000000..3850910f312 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5/local/rnnlm.sh @@ -0,0 +1,83 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (author: Daniel Povey) +# 2015 Guoguo Chen +# 2017 Hainan Xu +# 2017 Xiaohui Zhang + +# This script trains LMs on the swbd LM-training data. + +# rnnlm/train_rnnlm.sh: best iteration (out of 35) was 34, linking it to final iteration. 
+# rnnlm/train_rnnlm.sh: train/dev perplexity was 41.9 / 50.0. +# Train objf: -5.07 -4.43 -4.25 -4.17 -4.12 -4.07 -4.04 -4.01 -3.99 -3.98 -3.96 -3.94 -3.92 -3.90 -3.88 -3.87 -3.86 -3.85 -3.84 -3.83 -3.82 -3.81 -3.80 -3.79 -3.78 -3.78 -3.77 -3.77 -3.76 -3.75 -3.74 -3.73 -3.73 -3.72 -3.71 +# Dev objf: -10.32 -4.68 -4.43 -4.31 -4.24 -4.19 -4.15 -4.13 -4.10 -4.09 -4.05 -4.03 -4.02 -4.00 -3.99 -3.98 -3.98 -3.97 -3.96 -3.96 -3.95 -3.94 -3.94 -3.94 -3.93 -3.93 -3.93 -3.92 -3.92 -3.92 -3.92 -3.91 -3.91 -3.91 -3.91 + + +dir=Spanish_gigawrd/rnnlm +pocolm_dir=Spanish_gigawrd/work_pocolm/lm/110000_3.pocolm_pruned +wordslist= +embedding_dim=1024 +lstm_rpd=256 +lstm_nrpd=256 +stage=0 +train_stage=-30 +text_dir=Spanish_gigawrd/text_lm + +. ./cmd.sh +. ./utils/parse_options.sh + +mkdir -p $dir/config +set -e + +for f in $text_dir/dev.txt; do + [ ! -f $f ] && \ + echo "$0: expected file $f to exist;" && exit 1 +done + +if [ $stage -le 0 ]; then + if [ -f $text_dir/unigram_weights ] ; then + mv $text_dir/unigram_weights $pocolm_dir/ + fi + cp $wordslist $dir/config/words.txt + n=`cat $dir/config/words.txt | wc -l` + echo " $n" >> $dir/config/words.txt + + # words that are not present in words.txt but are in the training or dev data, will be + # mapped to during training. + echo "" >$dir/config/oov.txt + local/get_data_weights.pl $pocolm_dir $dir/config/data_weights.txt + rnnlm/get_unigram_probs.py --vocab-file=$dir/config/words.txt \ + --unk-word="" \ + --data-weights-file=$dir/config/data_weights.txt \ + $text_dir | awk 'NF==2' >$dir/config/unigram_probs.txt + + # choose features + rnnlm/choose_features.py --unigram-probs=$dir/config/unigram_probs.txt \ + --use-constant-feature=true \ + --special-words=',,,,[noise],[laughter]' \ + $dir/config/words.txt > $dir/config/features.txt +fi + +if [ $stage -le 1 ]; then + cat <$dir/config/xconfig + input dim=$embedding_dim name=input + relu-renorm-layer name=tdnn1 dim=$embedding_dim input=Append(0, IfDefined(-1)) + fast-lstmp-layer name=lstm1 cell-dim=$embedding_dim recurrent-projection-dim=$lstm_rpd non-recurrent-projection-dim=$lstm_nrpd + relu-renorm-layer name=tdnn2 dim=$embedding_dim input=Append(0, IfDefined(-3)) + fast-lstmp-layer name=lstm2 cell-dim=$embedding_dim recurrent-projection-dim=$lstm_rpd non-recurrent-projection-dim=$lstm_nrpd + relu-renorm-layer name=tdnn3 dim=$embedding_dim input=Append(0, IfDefined(-3)) + output-layer name=output include-log-softmax=false dim=$embedding_dim +EOF + rnnlm/validate_config_dir.sh $text_dir $dir/config +fi + +if [ $stage -le 2 ]; then + rnnlm/prepare_rnnlm_dir.sh $text_dir $dir/config $dir +fi + +if [ $stage -le 3 ]; then + rnnlm/train_rnnlm.sh --num-jobs-initial 1 --num-jobs-final 2 \ + --stage $train_stage --num-epochs 5 --cmd "$train_cmd" $dir +fi + +exit 0 diff --git a/egs/fisher_callhome_spanish/s5/local/rnnlm/train_rnnlm.sh b/egs/fisher_callhome_spanish/s5/local/rnnlm/train_rnnlm.sh deleted file mode 100755 index 3713fe228d6..00000000000 --- a/egs/fisher_callhome_spanish/s5/local/rnnlm/train_rnnlm.sh +++ /dev/null @@ -1,101 +0,0 @@ -#!/bin/bash - -# Copyright 2012 Johns Hopkins University (author: Daniel Povey) Tony Robinson -# 2017 Hainan Xu -# 2017 Ke Li - -# This script is similar to rnnlm_lstm_tdnn_a.sh except for adding L2 regularization. - -# local/rnnlm/train_rnnlm.sh: best iteration (out of 18) was 17, linking it to final iteration. -# local/rnnlm/train_rnnlm.sh: train/dev perplexity was 45.6 / 68.7. 
-# Train objf: -651.50 -4.44 -4.26 -4.15 -4.08 -4.03 -4.00 -3.97 -3.94 -3.92 -3.90 -3.89 -3.88 -3.86 -3.85 -3.84 -3.83 -3.82 -# Dev objf: -10.76 -4.68 -4.47 -4.38 -4.33 -4.29 -4.28 -4.27 -4.26 -4.26 -4.25 -4.24 -4.24 -4.24 -4.23 -4.23 -4.23 -4.23 - -# Begin configuration section. -dir=exp/rnnlm_lstm_tdnn_1b -embedding_dim=200 -embedding_l2=0.005 # embedding layer l2 regularize -comp_l2=0.005 # component-level l2 regularize -output_l2=0.005 # output-layer l2 regularize -epochs=90 -mic= -stage=-10 -train_stage=0 - -. ./cmd.sh -. ./utils/parse_options.sh -[ -z "$cmd" ] && cmd=$train_cmd - -train=data/train/text -dev=data/dev2/text # We at no stage in run.sh should decode dev2 partition for results! -wordlist=data/lang/words.txt -text_dir=data/local/rnnlm/text -mkdir -p $dir/config -set -e - -for f in $train $dev $wordlist; do - [ ! -f $f ] && \ - echo "$0: expected file $f to exist; search for run.sh and utils/prepare_lang.sh in run.sh" && exit 1 -done - -if [ $stage -le 0 ]; then - mkdir -p $text_dir - cat $train | cut -d ' ' -f2- > $text_dir/ami.txt - cat $dev | cut -d ' ' -f2- > $text_dir/dev.txt -fi - -if [ $stage -le 1 ]; then - cp $wordlist $dir/config/ - n=`cat $dir/config/words.txt | wc -l` - echo " $n" >> $dir/config/words.txt - - # words that are not present in words.txt but are in the training or dev data, will be - # mapped to during training. - echo "" >$dir/config/oov.txt - - cat > $dir/config/data_weights.txt <$dir/config/unigram_probs.txt - - # choose features - rnnlm/choose_features.py --unigram-probs=$dir/config/unigram_probs.txt \ - --use-constant-feature=true \ - --top-word-features 10000 \ - --min-frequency 1.0e-03 \ - --special-words=',,,,[noise],[laughter]' \ - $dir/config/words.txt > $dir/config/features.txt - -lstm_opts="l2-regularize=$comp_l2" -tdnn_opts="l2-regularize=$comp_l2" -output_opts="l2-regularize=$output_l2" - - cat >$dir/config/xconfig < $dir/normalize/$job/substitute.sh + +bash $dir/normalize/$job/substitute.sh | \ + sed "s: 's:'s:g" | sed "s: 'm:'m:g" | \ + sed "s: \s*: :g" > $dir/normalize/$job/text + +local/clean_abbrevs_text.py $dir/normalize/$job/text $data/"$job"_processed +tr 'A-ZÂÁÀÄÊÉÈËÏÍÎÖÓÔÖÚÙÛÑÇ' 'a-zâáàäêéèëïíîöóôöúùûñç' < $data/"$job"_processed > $dir/normalize/$job/text + +normalizer_main --config=$config --path_prefix=$path_prefix <$dir/normalize/$job/text >$dir/$job.txt + +exit 0; diff --git a/egs/fisher_callhome_spanish/s5/local/train_pocolm.sh b/egs/fisher_callhome_spanish/s5/local/train_pocolm.sh new file mode 100755 index 00000000000..b8b3ca35ef9 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5/local/train_pocolm.sh @@ -0,0 +1,54 @@ +#!/bin/bash + +stage=-2 +num_words_pocolm=110000 +prune_size=1000000 + +. ./path.sh +. ./cmd.sh +. ./utils/parse_options.sh + +set -euo pipefail + +export POCOLM_ROOT=$(cd $KALDI_ROOT/tools/pocolm/; pwd -P) +export PATH=$PATH:$POCOLM_ROOT/scripts + +textdir=$1 +pocolm_dir=$2 + + +if [ $stage -le -2 ]; then + echo "****" + echo " POCOLM experiment : Running STAGE 1 : 2-gram Pocolm general closed vocabulary model" + echo " Will estimate the metaparams to be used as unigram weights for stage 2 ....." 
+ echo "****" + if [ -e "$textdir"/unigram_weights ]; then + rm "$textdir"/unigram_weights + fi + if [ -e "$pocolm_dir" ]; then + rm -r "$pocolm_dir" + fi + + bash local/pocolm_cust.sh --num-word 0 --ngram-order 2 --pocolm-stage 1 --lm-dir "$pocolm_dir"/lm \ + --arpa-dir "$pocolm_dir"/arpa --textdir "$textdir" + +fi + +if [ $stage -le -1 ];then + echo "********" + echo "POCOLM experiment : RUNNING STAGE 2 : 3gram POCOLM using unigram wts estimates in 1st stage....." + echo "********" + + echo " " > "$pocolm_dir"/lm/work/.unigram_weights.done + python local/get_unigram_weights_vocab.py "$pocolm_dir"/lm/0_2.pocolm/ "$textdir"/unigram_weights + bash local/pocolm_cust.sh --num-word "$num_words_pocolm" --lm-dir "$pocolm_dir"/lm \ + --arpa-dir "$pocolm_dir"/arpa --textdir "$textdir" + prune_lm_dir.py --target-num-ngrams=$prune_size "$pocolm_dir"/lm/"$num_words_pocolm"_3.pocolm \ + "$pocolm_dir"/lm/"$num_words_pocolm"_3.pocolm_pruned_"$prune_size" + mkdir -p "$pocolm_dir"/arpa + format_arpa_lm.py "$pocolm_dir"/lm/"$num_words_pocolm"_3.pocolm_pruned_"$prune_size" | \ + gzip -c > "$pocolm_dir"/arpa/"$num_words_pocolm"_3_pruned_"$prune_size".arpa.gz +fi + + +exit 0; diff --git a/egs/fisher_callhome_spanish/s5/path.sh b/egs/fisher_callhome_spanish/s5/path.sh index 17ffb0369f8..201edd95876 100755 --- a/egs/fisher_callhome_spanish/s5/path.sh +++ b/egs/fisher_callhome_spanish/s5/path.sh @@ -1,6 +1,11 @@ -export KALDI_ROOT=`pwd`/../../.. +export KALDI_ROOT=`pwd`/../../../ +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 . $KALDI_ROOT/tools/config/common_path.sh -export LC_ALL=C export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/dpovey/libs + +export SPARROWHAWK_ROOT=$KALDI_ROOT/tools/sparrowhawk +export PATH=$SPARROWHAWK_ROOT/bin:$PATH +export LC_ALL=C.UTF-8 +export LANG=C.UTF-8 diff --git a/egs/fisher_callhome_spanish/s5/run.sh b/egs/fisher_callhome_spanish/s5/run.sh index 6e2752a7b68..70d4d0555a4 100755 --- a/egs/fisher_callhome_spanish/s5/run.sh +++ b/egs/fisher_callhome_spanish/s5/run.sh @@ -4,14 +4,25 @@ # Copyright 2014 Gaurav Kumar. Apache 2.0 # Recipe for Fisher/Callhome-Spanish -stage=0 -train_stage=-20 +stage=-1 +lmstage=-2 + +# GIGAWORD RNNLM training based options below. +# GIGAWORD RAW CORPUS DATA is assumed to be already downloaded in the gigaword_datapath. +train_rnnlm=false +start_textcleanup=false # WARNING : IT starts from flattening gigaword corpus to preparing text folder. + # If you already have the normalised gigword text somewhere, you can bypass the + # time consuming text cleanup (~1 week) by setting this option false. +addtraintext=true # If true, this option appends the Fisher train text to the Gigaword corpus textfile, to + # perform the A, A + G, Dev type POCOLM training configuration. + # A=fsp train, G=gigword text, +num_words_pocolm=100000 train_sgmm2=false # call the next line with the directory where the Spanish Fisher data is # (the values below are just an example). 
sfisher_speech=/export/corpora/LDC/LDC2010S01 -sfisher_transcripts=/export/corpora/LDC/LDC2010T04 +sfisher_transcripts=/export/c03/svalluri//LDC2010T04 spanish_lexicon=/export/corpora/LDC/LDC96L16 split=local/splits/split_fisher @@ -19,15 +30,17 @@ callhome_speech=/export/corpora/LDC/LDC96S35 callhome_transcripts=/export/corpora/LDC/LDC96T17 split_callhome=local/splits/split_callhome +gigaword_datapath=/export/c03/svalluri/Spanish_gigaword_rawcorpus/data # GIGAWORD RAW CORPUS DATA DOWNLOAD PATH +rnnlm_workdir=workdir_rnnlm_Spanish_gigaword mfccdir=`pwd`/mfcc . ./cmd.sh if [ -f path.sh ]; then . ./path.sh; fi . parse_options.sh || exit 1; -set -e +set -eou pipefail -if [ $stage -le 1 ]; then +if [ $stage -le -1 ]; then local/fsp_data_prep.sh $sfisher_speech $sfisher_transcripts local/callhome_data_prep.sh $callhome_speech $callhome_transcripts @@ -37,19 +50,14 @@ if [ $stage -le 1 ]; then # ES gigaword corpus to bring the total to 64k words. The ES frequency sorted # wordlist is downloaded if it is not available. local/fsp_prepare_dict.sh $spanish_lexicon + # Let's keep the original dict copy for G2P training + cp -r data/local/dict data/local/dict_orig + ( + steps/dict/train_g2p_seq2seq.sh data/local/dict_orig/lexicon.txt exp/g2p || touch exp/g2p/.error + ) & # Added c,j, v to the non silences phones manually - utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang - - # Make sure that you do not use your test and your dev sets to train the LM - # Some form of cross validation is possible where you decode your dev/set based on an - # LM that is trained on everything but that that conversation - # When in doubt about what your data partitions should be use local/fsp_ideal_data_partitions.pl - # to get the numbers. Depending on your needs, you might have to change the size of - # the splits within that file. The default paritions are based on the Kaldi + Joshua - # requirements which means that I have very large dev and test sets - local/fsp_train_lms.sh $split - local/fsp_create_test_lang.sh + utils/prepare_lang.sh data/local/dict_orig "" data/local/lang_orig data/lang_orig utils/fix_data_dir.sh data/local/data/train_all @@ -70,34 +78,65 @@ if [ $stage -le 1 ]; then cp -r data/local/data/callhome_train_all data/callhome_train_all - # Creating data partitions for the pipeline - # We need datasets for both the ASR and SMT system - # We have 257455 utterances left, so the partitions are roughly as follows - # ASR Train : 100k utterances - # ASR Tune : 17455 utterances - # ASR Eval : 20k utterances - # MT Train : 100k utterances - # MT Tune : Same as the ASR eval set (Use the lattices from here) - # MT Eval : 20k utterances - # The dev and the test sets need to be carefully chosen so that there is no conversation/speaker - # overlap. This has been setup and the script local/fsp_ideal_data_partitions provides the numbers that are needed below. - # As noted above, the LM has not been trained on the dev and the test sets. 
- #utils/subset_data_dir.sh --first data/train_all 158126 data/dev_and_test - #utils/subset_data_dir.sh --first data/dev_and_test 37814 data/asr_dev_and_test - #utils/subset_data_dir.sh --last data/dev_and_test 120312 data/mt_train_and_test - #utils/subset_data_dir.sh --first data/asr_dev_and_test 17662 data/dev - #utils/subset_data_dir.sh --last data/asr_dev_and_test 20152 data/test - #utils/subset_data_dir.sh --first data/mt_train_and_test 100238 data/mt_train - #utils/subset_data_dir.sh --last data/mt_train_and_test 20074 data/mt_test - #rm -r data/dev_and_test - #rm -r data/asr_dev_and_test - #rm -r data/mt_train_and_test - local/create_splits.sh $split local/callhome_create_splits.sh $split_callhome + fi +if [ $stage -le 0 ]; then + if $start_textcleanup; then + echo "WARNING : Starting from cleaning up and normalizing the Gigword text" + echo " This might take few days........... You can skip out this stage " + echo " by setting start_textcleanup=false, and having normalised_gigaword_corpus/text_normalized ready inside $rnnlm_workdir." + + mkdir -p "$rnnlm_workdir"/gigaword_rawtext + local/flatten_gigaword/flatten_all_gigaword.sh "$gigaword_datapath" "$rnnlm_workdir"/flattened_gigaword_corpus 24 + cat "$rnnlm_workdir"/flattened_gigaword_corpus/*.flat > "$rnnlm_workdir"/gigaword_rawtext/in.txt + local/clean_txt_dir.sh "$rnnlm_workdir"/gigaword_rawtext/ \ + "$rnnlm_workdir"/normalised_gigaword_corpus/ + fi + mkdir -p "$rnnlm_workdir"/text_lm + cut -d " " -f 2- data/train/text > "$rnnlm_workdir"/text_lm/train.txt + cut -d " " -f 2- data/dev2/text > "$rnnlm_workdir"/text_lm/dev.txt # For RNNLM and POCOLM training we use dev2/text as dev file. + cp "$rnnlm_workdir"/normalised_gigaword_corpus/text_normalized "$rnnlm_workdir"/text_lm/spanish_gigaword_normalised.txt + if $addtraintext; then + cat "$rnnlm_workdir"/text_lm/train.txt >> "$rnnlm_workdir"/text_lm/spanish_gigaword_normalised.txt + fi +fi + +if [ $stage -le 1 ]; then + local/train_pocolm.sh --stage $lmstage --num-words-pocolm $num_words_pocolm "$rnnlm_workdir"/text_lm/ "$rnnlm_workdir"/pocolm + local/get_rnnlm_wordlist.py data/lang_orig/words.txt "$rnnlm_workdir"/pocolm/lm/"$num_words_pocolm"_3.pocolm/words.txt \ + "$rnnlm_workdir"/rnnlm_wordlist "$rnnlm_workdir"/oov_pocolmwords + if $train_rnnlm; then + local/rnnlm.sh --stage $lmstage --dir "$rnnlm_workdir"/rnnlm --pocolm-dir "$rnnlm_workdir"/pocolm/lm/"$num_words_pocolm"_3.pocolm \ + --wordslist "$rnnlm_workdir"/rnnlm_wordlist --text-dir "$rnnlm_workdir"/text_lm + fi +fi + + if [ $stage -le 2 ]; then + wait # wait till G2P training finishes + if [ -f exp/g2p/.error ]; then + rm exp/g2p/.error || true + echo "Fail to train the G2P model." && exit 1; + fi + steps/dict/apply_g2p_seq2seq.sh "$rnnlm_workdir"/oov_pocolmwords exp/g2p "$rnnlm_workdir"/oov_g2p.lex + cat "$rnnlm_workdir"/oov_g2p.lex/lexicon.lex data/local/dict/lexicon.txt | sed "/^[[:space:]]*$/d" | sort | uniq > "$rnnlm_workdir"/lexicon_extended.txt + cp "$rnnlm_workdir"/lexicon_extended.txt data/local/dict/lexicon.txt # Replacing original lexicon with extended version. + + utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang + + # Make sure that you do not use your test and your dev sets to train the LM + # Some form of cross validation is possible where you decode your dev/set based on an + # LM that is trained on everything but that that conversation + # When in doubt about what your data partitions should be use local/fsp_ideal_data_partitions.pl + # to get the numbers. 
Depending on your needs, you might have to change the size of + # the splits within that file. The default paritions are based on the Kaldi + Joshua + # requirements which means that I have very large dev and test sets + local/fsp_train_lms.sh $split + local/fsp_create_test_lang.sh + # Now compute CMVN stats for the train, dev and test subsets steps/compute_cmvn_stats.sh data/dev exp/make_mfcc/dev $mfccdir steps/compute_cmvn_stats.sh data/test exp/make_mfcc/test $mfccdir @@ -264,8 +303,11 @@ for iter in 1 2 3 4; do data/lang_test data/dev/ exp/sgmm5/decode_dev $decode done ) & - fi -local/chain/run_tdnn_1g.sh --stage $stage --train-stage $train_stage || exit 1; +wait; + +if [ $stage -le 6 ]; then + local/chain/run_tdnn_1g.sh --stage 0 --gigaword-workdir $rnnlm_workdir || exit 1; +fi exit 0; diff --git a/egs/wsj/s5/steps/dict/apply_g2p_seq2seq.sh b/egs/wsj/s5/steps/dict/apply_g2p_seq2seq.sh new file mode 100644 index 00000000000..e6e316ec6b1 --- /dev/null +++ b/egs/wsj/s5/steps/dict/apply_g2p_seq2seq.sh @@ -0,0 +1,41 @@ +#!/bin/bash + +# Copyright 2018 Govivace Inc. (Author: Valluri Saikiran) +# Apache License 2.0 + +# This script applies a g2p model using CMUsphinx/seq2seq. + +stage=0 +encoding='utf-8' + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. utils/parse_options.sh || exit 1; + +set -u +set -e + +if [ $# != 3 ]; then + echo "Usage: $0 [options] " + echo " where is the OOV wordlist " + echo " is directory where the models will be stored" + exit 1; +fi + +lexicon=$1 +wdir=$2 +outdir=$3 + +mkdir -p $outdir + +[ ! -f $lexicon ] && echo "Cannot find $lexicon" && exit + +if [ ! -s `which g2p-seq2seq` ] ; then + echo "g2p-seq2seq was not found !" + echo "Go to $KALDI_ROOT/tools and execute extras/install_g2p_seq2seq.sh" + exit 1 +fi + +g2p-seq2seq --decode $lexicon --model_dir $wdir --output $outdir/lexicon.lex + diff --git a/egs/wsj/s5/steps/dict/train_g2p_seq2seq.sh b/egs/wsj/s5/steps/dict/train_g2p_seq2seq.sh new file mode 100644 index 00000000000..e0389171fd5 --- /dev/null +++ b/egs/wsj/s5/steps/dict/train_g2p_seq2seq.sh @@ -0,0 +1,39 @@ +#!/bin/bash + +# Copyright 2018 Govivace Inc. (Author: Valluri Saikiran) +# Apache License 2.0 + +# This script trains a g2p model using CMUsphinx/seq2seq. + +stage=0 +encoding='utf-8' + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. utils/parse_options.sh || exit 1; + +set -u +set -e + +if [ $# != 2 ]; then + echo "Usage: $0 [options] " + echo " where is the training lexicon (one pronunciation per " + echo " word per line, with lines like 'hello h uh l ow') and" + echo " is directory where the models will be stored" + exit 1; +fi + +lexicon=$1 +wdir=$2 + +[ ! -f $lexicon ] && echo "Cannot find $lexicon" && exit + +if [ ! -s `which g2p-seq2seq` ]; then + echo "g2p-seq2seq was not found !" 
+ echo "Go to $KALDI_ROOT/tools and execute extras/install_g2p_seq2seq.sh" + exit 1 +fi + +g2p-seq2seq --max_epochs 12 --train $lexicon --model_dir $wdir + diff --git a/scripts/rnnlm/show_word_features.py b/scripts/rnnlm/show_word_features.py index 4335caed5d8..2beede5acc6 100755 --- a/scripts/rnnlm/show_word_features.py +++ b/scripts/rnnlm/show_word_features.py @@ -7,6 +7,7 @@ import argparse import sys + sys.stdout = open(1, 'w', encoding='utf-8', closefd=False) import re diff --git a/tools/extras/install_g2p_seq2seq.sh b/tools/extras/install_g2p_seq2seq.sh new file mode 100644 index 00000000000..c9979b8b961 --- /dev/null +++ b/tools/extras/install_g2p_seq2seq.sh @@ -0,0 +1,5 @@ +if [ ! -e g2p-seq2seq ];then + git clone https://github.com/cmusphinx/g2p-seq2seq.git + cd g2p-seq2seq/ + python setup.py install +fi diff --git a/tools/install_g2p_seq2seq.sh b/tools/install_g2p_seq2seq.sh new file mode 120000 index 00000000000..77715305f74 --- /dev/null +++ b/tools/install_g2p_seq2seq.sh @@ -0,0 +1 @@ +extras/install_g2p_seq2seq.sh \ No newline at end of file diff --git a/tools/install_sparrowhawk.sh b/tools/install_sparrowhawk.sh new file mode 100755 index 00000000000..b6a7af211f5 --- /dev/null +++ b/tools/install_sparrowhawk.sh @@ -0,0 +1,73 @@ +#!/bin/bash +export LDFLAGS="-L`pwd`/openfst/lib" +export CXXFLAGS="-I`pwd`/openfst/include" +stage=0 + +if [ $stage -le 0 ] ; then + rm -rf re2 protobuf sparrowhawk* + git clone -b feature/Spanish_normalizer https://github.com/spokencloud/sparrowhawk-resources.git || exit 1; + patch -p0 < sparrowhawk-resources/local/Makefile.patch || exit 1; + make openfst || exit 1; + git clone https://github.com/mjansche/thrax.git + export LDFLAGS=-L`pwd`/openfst/lib + export CXXFLAGS=-I`pwd`/openfst/include + cd thrax + autoreconf --force --install || exit 1; + ./configure --prefix=`pwd` || exit 1; + make || exit 1; + make install || exit 1; + cd .. + git clone https://github.com/google/re2.git || exit 1; + cd re2/ + make -j 20 || exit 1; + make test || exit 1; + make install prefix=`pwd` || exit 1; + cd .. + git clone https://github.com/google/protobuf.git || exit 1; + cd protobuf/ + ./autogen.sh || exit 1; + ./configure --prefix=`pwd` || exit 1; + make -j 20 || exit 1; + make install || exit 1; + cd .. +fi + +if [ $stage -le 1 ]; then + git clone https://github.com/google/sparrowhawk.git || exit 1; + patch -p0 < sparrowhawk-resources/local/sparrowhawk.patch || exit 1; + cd sparrowhawk/ || exit 1; + mkdir lib + mkdir bin + mkdir include + cp -r ../openfst/lib/* lib/ || exit 1; + cp -r ../protobuf/lib/* lib/ || exit 1; + cp -r ../re2/lib/* lib/ || exit 1; + cp -r ../thrax/lib/* lib/ || exit 1; + cp -r ../openfst/include/* include/ || exit 1; + cp -r ../protobuf/include/* include/ || exit 1; + cp -r ../re2/include/* include/ || exit 1; + cp -r ../thrax/include/* include/ || exit 1; + cp ../protobuf/bin/protoc bin/. || exit 1; + export PATH=`pwd`/bin:$PATH + aclocal || exit 1; + automake || exit 1; + ./configure --prefix=`pwd` CPPFLAGS="-I`pwd`/include" LDFLAGS="-L`pwd`/lib" || exit 1; + make || exit 1; + make install || exit 1; + cd .. +fi + +if [ $stage -le 2 ]; then + cp -r sparrowhawk-resources/language-resources sparrowhawk/ || exit 1; + cd sparrowhawk/language-resources/esp/textnorm/classifier || exit 1; + . 
./path.sh || exit 1; + python2 create_far.py ascii.syms universal_depot_ascii universal_depot universal_depot.far + thraxmakedep tokenize_and_classify.grm || exit 1; + make || exit 1; + cd ../verbalizer + python2 create_far.py ascii.syms number_names_depot_ascii number_names_depot number_names_depot.far + cp -r ../classifier/universal_depot.far . + thraxmakedep verbalize.grm || exit 1; + make || exit 1; + cd ../../../../.. +fi