#!/bin/bash

# This script trains LMs on the WSJ LM-training data.
# It requires that you have already run wsj_extend_dict.sh,
# to get the larger-size dictionary including all of CMUdict
# plus any OOVs and possible acronyms that we could easily
# derive pronunciations for.

# This script takes no command-line arguments.

dir=data/local/local_lm
srcdir=data/local/dict_larger
mkdir -p $dir
. ./path.sh || exit 1; # for KALDI_ROOT
export PATH=$KALDI_ROOT/tools/kaldi_lm:$PATH

( # First make sure the kaldi_lm toolkit is installed.
  cd $KALDI_ROOT/tools || exit 1;
  if [ -d kaldi_lm ]; then
    echo Not installing the kaldi_lm toolkit since it is already there.
  else
    echo Downloading and installing the kaldi_lm tools
    if [ ! -f kaldi_lm.tar.gz ]; then
      wget http://merlin.fit.vutbr.cz/kaldi/kaldi_lm.tar.gz || exit 1;
    fi
    tar -xvzf kaldi_lm.tar.gz || exit 1;
    cd kaldi_lm
    make || exit 1;
    echo Done making the kaldi_lm tools
  fi
) || exit 1;

if [ ! -f $srcdir/cleaned.gz -o ! -f $srcdir/lexicon.txt ]; then
  echo "Expecting files $srcdir/cleaned.gz and $srcdir/lexicon.txt to exist";
  echo "You need to run local/wsj_extend_dict.sh before running this script."
  exit 1;
fi

# Get a wordlist-- keep everything but silence, which should not appear in
# the LM.
awk '{print $1}' $srcdir/lexicon.txt | grep -v -w '!SIL' > $dir/wordlist.txt

# Get training data with OOV words (w.r.t. our current vocab) replaced with <UNK>.
echo "Getting training data with OOV words replaced with <UNK> (train_nounk.gz)"
gunzip -c $srcdir/cleaned.gz | awk -v w=$dir/wordlist.txt \
  'BEGIN{while((getline<w)>0) v[$1]=1;}
   {for (i=1;i<=NF;i++) if ($i in v) printf $i" "; else printf "<UNK> "; print ""}' | sed 's/ $//g' \
  | gzip -c > $dir/train_nounk.gz

# Get unigram counts (without bos/eos, but this doesn't matter here, it's
# only to get the word-map, which treats them specially & doesn't need their
# counts).
# Add a 1-count for each word in the word-list by including that in the data,
# so all words appear.
gunzip -c $dir/train_nounk.gz | cat - $dir/wordlist.txt | \
  awk '{ for(x=1;x<=NF;x++) count[$x]++; } END{for(w in count){print count[w], w;}}' | \
  sort -nr > $dir/unigram.counts

# Get "mapped" words-- a character encoding of the words that makes the common
# words very short.
cat $dir/unigram.counts | awk '{print $2}' | get_word_map.pl "<s>" "</s>" "<UNK>" > $dir/word_map

# Convert the training data to the mapped word forms.
gunzip -c $dir/train_nounk.gz | awk -v wmap=$dir/word_map \
  'BEGIN{while((getline<wmap)>0)map[$1]=$2;}
   { for(n=1;n<=NF;n++) { printf map[$n]; if(n<NF){ printf " "; } else { print ""; }}}' \
  | gzip -c > $dir/train.gz

# To save disk space, remove the un-mapped training data.  We could
# easily generate it again if needed.
rm $dir/train_nounk.gz

train_lm.sh --arpa --lmtype 3gram-mincount $dir
#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 141.444826
# 7.8 million N-grams.

prune_lm.sh --arpa 6.0 $dir/3gram-mincount/
# 1.45 million N-grams.
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 165.394139

train_lm.sh --arpa --lmtype 4gram-mincount $dir
#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 126.734180
# 10.3 million N-grams.

prune_lm.sh --arpa 7.0 $dir/4gram-mincount
# 1.50 million N-grams
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 155.663757

exit 0

### Below here, this script is showing various commands that
### were run during LM tuning.

train_lm.sh --arpa --lmtype 3gram-mincount $dir
#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 141.444826
# 7.8 million N-grams.
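# A quick way to sanity-check the N-gram counts quoted in these notes is to look
# at the \data\ header of the ARPA file that train_lm.sh writes.  The path below
# assumes the kaldi_lm output layout puts the unpruned LM at
# 3gram-mincount/lm_unpruned.gz; adjust it if your version names it differently.
gunzip -c $dir/3gram-mincount/lm_unpruned.gz | head -n 8
# The header has one "ngram N=..." line per order; summing them gives the total
# N-gram count.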
prune_lm.sh --arpa 3.0 $dir/3gram-mincount/
#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 156.408740
# 2.5 million N-grams.

prune_lm.sh --arpa 6.0 $dir/3gram-mincount/
# 1.45 million N-grams.
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 165.394139

train_lm.sh --arpa --lmtype 4gram-mincount $dir
#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 126.734180
# 10.3 million N-grams.

prune_lm.sh --arpa 3.0 $dir/4gram-mincount
#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 143.206294
# 2.6 million N-grams.

prune_lm.sh --arpa 4.0 $dir/4gram-mincount
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 146.927717
# 2.15 million N-grams.

prune_lm.sh --arpa 5.0 $dir/4gram-mincount
# 1.86 million N-grams
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 150.162023

prune_lm.sh --arpa 7.0 $dir/4gram-mincount
# 1.50 million N-grams
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 155.663757

train_lm.sh --arpa --lmtype 3gram $dir
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 135.692866
# 20.0 million N-grams

! which ngram-count \
  && echo "SRILM tools not installed so not doing the comparison" && exit 1;

#################
# You could finish the script here if you wanted.
# Below is to show how to do baselines with SRILM.
# You'd have to install the SRILM toolkit first.

heldout_sent=10000 # Don't change this if you want the results to be comparable
                   # with the kaldi_lm results.

sdir=$dir/srilm # in case we want to use SRILM to double-check perplexities.
mkdir -p $sdir
gunzip -c $srcdir/cleaned.gz | head -$heldout_sent > $sdir/cleaned.heldout
gunzip -c $srcdir/cleaned.gz | tail -n +$heldout_sent > $sdir/cleaned.train
(echo "<s>"; echo "</s>" ) | cat - $dir/wordlist.txt > $sdir/wordlist.final.s

# 3-gram:
ngram-count -text $sdir/cleaned.train -order 3 -limit-vocab -vocab $sdir/wordlist.final.s -unk \
  -map-unk "<UNK>" -kndiscount -interpolate -lm $sdir/srilm.o3g.kn.gz
ngram -lm $sdir/srilm.o3g.kn.gz -ppl $sdir/cleaned.heldout # consider -debug 2
#file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 478 OOVs
#0 zeroprobs, logprob= -491456 ppl= 141.457 ppl1= 177.437

# Trying 4-gram:
ngram-count -text $sdir/cleaned.train -order 4 -limit-vocab -vocab $sdir/wordlist.final.s -unk \
  -map-unk "<UNK>" -kndiscount -interpolate -lm $sdir/srilm.o4g.kn.gz
ngram -order 4 -lm $sdir/srilm.o4g.kn.gz -ppl $sdir/cleaned.heldout
#file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 478 OOVs
#0 zeroprobs, logprob= -480939 ppl= 127.233 ppl1= 158.822

#3-gram with pruning:
ngram-count -text $sdir/cleaned.train -order 3 -limit-vocab -vocab $sdir/wordlist.final.s -unk \
  -prune 0.0000001 -map-unk "<UNK>" -kndiscount -interpolate -lm $sdir/srilm.o3g.pr7.kn.gz
ngram -lm $sdir/srilm.o3g.pr7.kn.gz -ppl $sdir/cleaned.heldout
#file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 478 OOVs
#0 zeroprobs, logprob= -510828 ppl= 171.947 ppl1= 217.616
# Around 2.25M N-grams.
# Note: this is closest to the experiment done with "prune_lm.sh --arpa 3.0 $dir/3gram-mincount/"
# above, which gave 2.5 million N-grams and a perplexity of 156.

# Note: all SRILM experiments above fully discount all singleton 3-grams and 4-grams.
# You can use -gt3min 0 and -gt4min 0 to stop this (this will be comparable to
# the kaldi_lm experiments above without "-mincount"), as in the example below.
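# As a concrete illustration of that note, a 3-gram baseline that keeps singleton
# trigrams would look roughly like the sketch below.  This command was not part
# of the original tuning runs; the output name srilm.o3g.kn.nomin.gz is just
# illustrative, and the 4-gram analogue would additionally pass -gt4min 0.
ngram-count -text $sdir/cleaned.train -order 3 -limit-vocab -vocab $sdir/wordlist.final.s -unk \
  -map-unk "<UNK>" -gt3min 0 -kndiscount -interpolate -lm $sdir/srilm.o3g.kn.nomin.gz
ngram -lm $sdir/srilm.o3g.kn.nomin.gz -ppl $sdir/cleaned.heldout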
## From here is how to train with
# IRSTLM.  This is not really working at the moment.

export IRSTLM=$KALDI_ROOT/tools/irstlm/
idir=$dir/irstlm
mkdir -p $idir
gunzip -c $srcdir/cleaned.gz | tail -n +$heldout_sent | $IRSTLM/scripts/add-start-end.sh | \
  gzip -c > $idir/train.gz

$IRSTLM/bin/dict -i=WSJ.cleaned.irstlm.txt -o=dico -f=y -sort=no
cat dico | gawk 'BEGIN{while (getline<"vocab.20k.nooov") v[$1]=1; print "DICTIONARY 0 "length(v);}
                 FNR>1{if ($1 in v) {print $0;}}' > vocab.irstlm.20k

$IRSTLM/bin/build-lm.sh -i "gunzip -c $idir/train.gz" -o $idir/lm_3gram.gz -p yes \
  -n 3 -s improved-kneser-ney -b yes

# Testing perplexity with SRILM tools:
ngram -lm $idir/lm_3gram.gz -ppl $sdir/cleaned.heldout
#data/local/local_lm/irstlm/lm_3gram.gz: line 162049: warning: non-zero probability for <unk> in closed-vocabulary LM
#file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 0 OOVs
#0 zeroprobs, logprob= -513670 ppl= 175.041 ppl1= 221.599

# Perplexity is very bad: it should be ~141, since we used the -p option, not 175.
# But adding -debug 3 to the command line shows that the IRSTLM LM does not seem
# to sum to one properly, so it seems that it produces an LM that isn't
# interpretable in the normal way as an ARPA LM.
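# To see the normalization problem described above for yourself, re-run the
# SRILM perplexity command with a higher debug level: with -debug 3, ngram also
# sums the word probabilities for each context, and the sums should come out
# very close to 1 for a well-formed ARPA LM.  (The exact debug output format
# depends on your SRILM version; this is just a quick diagnostic.)
ngram -debug 3 -lm $idir/lm_3gram.gz -ppl $sdir/cleaned.heldout 2>&1 | head -n 40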