#!/bin/bash
# Copyright 2012-2013  Johns Hopkins University (Author: Daniel Povey);
#                      Arnab Ghoshal

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# This script prepares a directory such as data/lang/, in the standard format,
# given a source directory containing a dictionary lexicon.txt in a form like:
# word phone1 phone2 ... phoneN
# per line (alternate prons would be separate lines), or a dictionary with probabilities
# called lexiconp.txt in a form:
# word pron-prob phone1 phone2 ... phoneN
# (with 0.0 < pron-prob <= 1.0); note: if lexiconp.txt exists, we use it even if
# lexicon.txt exists.
# and also files silence_phones.txt, nonsilence_phones.txt, optional_silence.txt
# and extra_questions.txt
# Here, silence_phones.txt and nonsilence_phones.txt are lists of silence and
# non-silence phones respectively (where silence includes various kinds of
# noise, laugh, cough, filled pauses etc., and nonsilence phones includes the
# "real" phones.)
# In each line of those files is a list of phones, and the phones on each line
# are assumed to correspond to the same "base phone", i.e. they will be
# different stress or tone variations of the same basic phone.
# The file "optional_silence.txt" contains just a single phone (typically SIL)
# which is used for optional silence in the lexicon.
# extra_questions.txt might be empty; typically will consist of lists of phones,
# all members of each list with the same stress or tone; and also possibly a
# list for the silence phones.  This will augment the automatically generated
# questions (note: the automatically generated ones will treat all the
# stress/tone versions of a phone the same, so will not "get to ask" about
# stress or tone).

# This script adds word-position-dependent phones and constructs a host of other
# derived files, that go in data/lang/.

# Begin configuration section.
num_sil_states=5
num_nonsil_states=3
position_dependent_phones=true
# position_dependent_phones is false also when position dependent phones and word_boundary.txt
# have been generated by another source
reverse=false
share_silence_phones=false  # if true, then share pdfs of different silence
                            # phones together.
sil_prob=0.5
make_individual_sil_models=false # enforce individual models for all silence phones
# end configuration sections

# Sourced after the defaults above; the usage text below lists the
# corresponding --option-name flags.
. utils/parse_options.sh

# Exactly four positional arguments are required (see the assignments below).
if [ $# -ne 4 ]; then
  cat <<'EOF'
usage: utils/prepare_lang.sh <dict-src-dir> <oov-dict-entry> <tmp-dir> <lang-dir>
e.g.: utils/prepare_lang.sh data/local/dict <SPOKEN_NOISE> data/local/lang data/lang
<dict-src-dir> should contain the following files:
 extra_questions.txt  lexicon.txt nonsilence_phones.txt  optional_silence.txt  silence_phones.txt
See http://kaldi.sourceforge.net/data_prep.html#data_prep_lang_creating for more info.
options:
     --num-sil-states <number of states>             # default: 5, #states in silence models.
     --num-nonsil-states <number of states>          # default: 3, #states in non-silence models.
     --position-dependent-phones (true|false)        # default: true; if true, use _B, _E, _S & _I
                                                     # markers on phones to indicate word-internal positions.
     --reverse (true|false)                          # reverse lexicon.
     --share-silence-phones (true|false)             # default: false; if true, share pdfs of
                                                     # all silence phones.
     --sil-prob <probability of silence>             # default: 0.5 [must have 0 <= silprob < 1]
     --make-individual-sil-models (true|false)       # default: false; make non-{shared,split} states for each silphone
EOF
  exit 1;
fi

srcdir=$1    # source dictionary directory (read; lexicon(p).txt may be created in it)
oov_word=$2  # word that OOVs are mapped to (written to $dir/oov.txt later on)
tmpdir=$3    # scratch directory for intermediate lexicons and maps
dir=$4       # output lang directory
mkdir -p "$dir" "$tmpdir" "$dir/phones" || exit 1;

[ -f path.sh ] && . ./path.sh

if ! utils/validate_dict_dir.pl "$srcdir"; then
  echo "*Error validating directory $srcdir*"
  exit 1;
fi

# Make sure both lexicon.txt and lexiconp.txt exist, deriving whichever one
# is missing from the other.
if [[ ! -f $srcdir/lexicon.txt ]]; then
  echo "**Creating $srcdir/lexicon.txt from $srcdir/lexiconp.txt"
  # Drop the second field (the pron-prob).
  perl -ape 's/(\S+\s+)\S+\s+(.+)/$1$2/;' < "$srcdir/lexiconp.txt" > "$srcdir/lexicon.txt" || exit 1;
fi
if [[ ! -f $srcdir/lexiconp.txt ]]; then
  echo "**Creating $srcdir/lexiconp.txt from $srcdir/lexicon.txt"
  # Insert a pron-prob of 1.0 after the word.
  perl -ape 's/(\S+\s+)(.+)/${1}1.0\t$2/;' < "$srcdir/lexicon.txt" > "$srcdir/lexiconp.txt" || exit 1;
fi

# Validate again now that both lexicon files are present; run quietly first and
# only repeat with output shown if it fails.
if ! utils/validate_dict_dir.pl "$srcdir" >&/dev/null; then
  utils/validate_dict_dir.pl "$srcdir"  # show the output.
  echo "Validation failed (second time)"
  exit 1;
fi

if $position_dependent_phones; then
  # Create $tmpdir/lexiconp.original from $srcdir/lexiconp.txt by
  # adding the markers _B, _E, _S, _I depending on word position.
  # In this recipe, these markers apply to silence also.
  # Do this starting from lexiconp.txt only.
  perl -ane '@A=split(" ",$_); $w = shift @A; $p = shift @A; @A>0||die;
         if(@A==1) { print "$w $p $A[0]_S\n"; } else { print "$w $p $A[0]_B ";
         for($n=1;$n<@A-1;$n++) { print "$A[$n]_I "; } print "$A[$n]_E\n"; } ' \
    < "$srcdir/lexiconp.txt" > "$tmpdir/lexiconp.original" || exit 1;

  # create $tmpdir/phone_map.txt
  # this has the format (on each line)
  # <original phone> <version 1 of original phone> <version 2> ...
  # where the versions depend on the position of the phone within a word.
  # For instance, we'd have:
  # AA AA_B AA_E AA_I AA_S
  # for (B)egin, (E)nd, (I)nternal and (S)ingleton
  # and in the case of silence
  # SIL SIL SIL_B SIL_E SIL_I SIL_S
  # [because SIL on its own is one of the variants; this is for when it doesn't
  #  occur inside a word but as an option in the lexicon.]

  # This phone map expands the phone lists into all the word-position-dependent
  # versions of the phone lists.
  # The unquoted $(cat ...) is deliberate: each whitespace-separated phone gets
  # its own line in the map.  For silence the empty suffix is listed twice: the
  # first copy is the map key, the second is the bare variant.
  cat <(for x in $(cat "$srcdir/silence_phones.txt"); do
          for y in "" "" "_B" "_E" "_I" "_S"; do echo -n "$x$y "; done; echo; done) \
      <(for x in $(cat "$srcdir/nonsilence_phones.txt"); do
          for y in "" "_B" "_E" "_I" "_S"; do echo -n "$x$y "; done; echo; done) \
    > "$tmpdir/phone_map.txt" || exit 1;
else
  # No position markers: the lexicon is used as-is and every phone maps to itself.
  cp "$srcdir/lexiconp.txt" "$tmpdir/lexiconp.original" || exit 1;
  cat "$srcdir/silence_phones.txt" "$srcdir/nonsilence_phones.txt" | \
    sed 's/ /\n/g' | awk '(NF>0){print}' > "$tmpdir/phones" || exit 1;
  paste -d' ' "$tmpdir/phones" "$tmpdir/phones" > "$tmpdir/phone_map.txt" || exit 1;
fi

if $reverse; then
  echo "reversing lexicon."
  # Keep word and pron-prob in place; print the phones in reverse order.
  awk '{printf "%s %s ",$1, $2;for(i=NF;i>2;i--){printf "%s ",$i;}printf "\n"}' \
    < "$tmpdir/lexiconp.original" > "$tmpdir/lexiconp.txt" || exit 1;
else
  mv "$tmpdir/lexiconp.original" "$tmpdir/lexiconp.txt" || exit 1;
fi

mkdir -p "$dir/phones" || exit 1;  # various sets of phones...

# Sets of phones for use in clustering, and making monophone systems.

if $share_silence_phones; then
  # build a roots file that will force all the silence phones to share the
  # same pdf's. [three distinct states, only the transitions will differ.]
  # 'shared'/'not-shared' means, do we share the 3 states of the HMM
  # in the same tree-root?
  # Sharing across models(phones) is achieved by writing several phones
  # into one line of roots.txt (shared/not-shared doesn't affect this).
  # 'shared split' means we have 1 tree-root for the 3 states of the HMM
  # (but we get to ask about the HMM-position when we split).
  # 'not-shared not-split' means we have separate tree roots for the 3 states,
  # but we never split the tree so they remain stumps
  # so all phones in the line correspond to the same model.

  if $make_individual_sil_models; then
    # One line of sets.txt per line of silence_phones.txt; the first $nsil
    # lines of sets.txt are therefore the silence sets.
    nsil=$(wc -l < "$srcdir/silence_phones.txt" | awk '{print $1}') || exit 1;
    cat "$srcdir/silence_phones.txt" | awk '{printf("%s\n", $0); }' | cat - "$srcdir/nonsilence_phones.txt" | \
      utils/apply_map.pl "$tmpdir/phone_map.txt" > "$dir/phones/sets.txt" || exit 1;
    awk -v nsil="$nsil" '{if(NR<=nsil) print "not-shared", "not-split", $0; else print "shared", "split", $0;}' \
      < "$dir/phones/sets.txt" > "$dir/phones/roots.txt" || exit 1;
  else
    # Join all silence phones onto a single first line, so they form one set.
    cat "$srcdir/silence_phones.txt" | awk '{printf("%s ", $0); } END{printf("\n");}' | cat - "$srcdir/nonsilence_phones.txt" | \
      utils/apply_map.pl "$tmpdir/phone_map.txt" > "$dir/phones/sets.txt" || exit 1;
    awk '{if(NR==1) print "not-shared", "not-split", $0; else print "shared", "split", $0;}' \
      < "$dir/phones/sets.txt" > "$dir/phones/roots.txt" || exit 1;
  fi
else
  # different silence phones will have different GMMs.  [note: here, all "shared split" means
  # is that we may have one GMM for all the states, or we can split on states.  because they're
  # context-independent phones, they don't see the context.]
  cat "$srcdir"/{,non}silence_phones.txt | utils/apply_map.pl "$tmpdir/phone_map.txt" \
    > "$dir/phones/sets.txt" || exit 1;
  awk '{print "shared", "split", $0;}' < "$dir/phones/sets.txt" > "$dir/phones/roots.txt" || exit 1;
fi

# Flat, one-phone-per-line lists of the (mapped) silence and non-silence phones.
cat "$srcdir/silence_phones.txt" | utils/apply_map.pl "$tmpdir/phone_map.txt" | \
  awk '{for(n=1;n<=NF;n++) print $n;}' > "$dir/phones/silence.txt" || exit 1;
cat "$srcdir/nonsilence_phones.txt" | utils/apply_map.pl "$tmpdir/phone_map.txt" | \
  awk '{for(n=1;n<=NF;n++) print $n;}' > "$dir/phones/nonsilence.txt" || exit 1;
cp "$srcdir/optional_silence.txt" "$dir/phones/optional_silence.txt" || exit 1;
# The context-independent phones are exactly the silence phones.
cp "$dir/phones/silence.txt" "$dir/phones/context_indep.txt" || exit 1;

cat "$srcdir/extra_questions.txt" | utils/apply_map.pl "$tmpdir/phone_map.txt" \
  > "$dir/phones/extra_questions.txt" || exit 1;

# Want extra questions about the word-start/word-end stuff. Make it separate for
# silence and non-silence.  Probably doesn't matter, as silence will rarely
# be inside a word.
if $position_dependent_phones; then
  # One extra question per position suffix, listing every phone with that
  # suffix.  The unquoted $(cat ...) is deliberate: the phone lists are split
  # on whitespace.
  for suffix in _B _E _I _S; do
    (for x in $(cat "$srcdir/nonsilence_phones.txt"); do echo -n "$x$suffix "; done; echo) \
      >>"$dir/phones/extra_questions.txt"
  done
  # Silence additionally gets a question for the bare (suffix-less) variant.
  for suffix in "" _B _E _I _S; do
    (for x in $(cat "$srcdir/silence_phones.txt"); do echo -n "$x$suffix "; done; echo) \
      >>"$dir/phones/extra_questions.txt"
  done
fi

# add disambig symbols to the lexicon in $tmpdir/lexiconp.txt
# and produce $tmpdir/lexiconp_disambig.txt

# The value captured from add_lex_disambig.pl is used below as the number of
# disambiguation symbols it needed.
ndisambig=$(utils/add_lex_disambig.pl --pron-probs "$tmpdir/lexiconp.txt" "$tmpdir/lexiconp_disambig.txt") || exit 1;
ndisambig=$((ndisambig + 1)); # add one disambig symbol for silence in lexicon FST.
echo "$ndisambig" > "$tmpdir/lex_ndisambig" || exit 1;

# Format of lexiconp_disambig.txt:
# !SIL	1.0   SIL_S
# <SPOKEN_NOISE>	1.0   SPN_S #1
# <UNK>	1.0  SPN_S #2
# <NOISE>	1.0  NSN_S
# !EXCLAMATION-POINT	1.0  EH2_B K_I S_I K_I L_I AH0_I M_I EY1_I SH_I AH0_I N_I P_I OY2_I N_I T_E

# Disambiguation symbols #0 .. #$ndisambig, one per line.
for ((n = 0; n <= ndisambig; n++)); do echo "#$n"; done > "$dir/phones/disambig.txt" || exit 1;

# Create phone symbol table.
# The epsilon symbol <eps> must be the first entry so that it gets id 0.
echo "<eps>" | cat - "$dir"/phones/{silence,nonsilence,disambig}.txt | \
  awk '{n=NR-1; print $1, n;}' > "$dir/phones.txt" || exit 1;

# Create a file that describes the word-boundary information for
# each phone.  5 categories.
if $position_dependent_phones; then
  cat "$dir"/phones/{silence,nonsilence}.txt | \
    awk '/_I$/{print $1, "internal"; next;} /_B$/{print $1, "begin"; next; }
         /_S$/{print $1, "singleton"; next;} /_E$/{print $1, "end"; next; }
         {print $1, "nonword";} ' > "$dir/phones/word_boundary.txt" || exit 1;
else
  # word_boundary.txt might have been generated by another source
  if [ -f "$srcdir/word_boundary.txt" ]; then
    cp "$srcdir/word_boundary.txt" "$dir/phones/word_boundary.txt" || exit 1;
  fi
fi

# Create word symbol table.
# <eps> gets id 0, the sorted unique words get 1..N, and #0 gets N+1.
awk '{print $1}' "$tmpdir/lexiconp.txt" | sort | uniq | \
  awk 'BEGIN{print "<eps> 0";} {printf("%s %d\n", $1, NR);} END{printf("#0 %d\n", NR+1);} ' \
  > "$dir/words.txt" || exit 1;

# format of $dir/words.txt:
#<eps> 0
#!EXCLAMATION-POINT 1
#!SIL 2
#"CLOSE-QUOTE 3
#...

silphone=$(cat "$srcdir/optional_silence.txt") || exit 1;
if [ -z "$silphone" ]; then
  echo "You have no optional-silence phone; it is required in the current scripts"
  echo "but you may use the option --sil-prob 0.0 to stop it being used."
  exit 1;
fi

# create $dir/phones/align_lexicon.{txt,int}.
# This is the new-new style of lexicon aligning.

# First remove pron-probs from the lexicon.
perl -ape 's/(\S+\s+)\S+\s+(.+)/$1$2/;' <"$tmpdir/lexiconp.txt" >"$tmpdir/align_lexicon.txt" || exit 1;

# Note: here, $silphone will have no suffix e.g. _S because it occurs as optional-silence,
# and is not part of a word.  The entry needs <eps> as its word field so that
# the word-to-integer mapping below can resolve it.
if [ -n "$silphone" ]; then
  echo "<eps> $silphone" >> "$tmpdir/align_lexicon.txt" || exit 1;
fi

# Duplicate the word in the first two fields, then sort and de-duplicate.
perl -ane '@A = split; print $A[0], " ", join(" ", @A), "\n";' < "$tmpdir/align_lexicon.txt" | \
  sort | uniq > "$dir/phones/align_lexicon.txt" || exit 1;

# create phones/align_lexicon.int
# Fields 3- are phones, fields 1-2 are words.
utils/sym2int.pl -f 3- "$dir/phones.txt" < "$dir/phones/align_lexicon.txt" | \
  utils/sym2int.pl -f 1-2 "$dir/words.txt" > "$dir/phones/align_lexicon.int" || exit 1;

# Create the basic L.fst without disambiguation symbols, for use
# in training.
utils/make_lexicon_fst.pl --pron-probs "$tmpdir/lexiconp.txt" $sil_prob $silphone | \
  fstcompile --isymbols="$dir/phones.txt" --osymbols="$dir/words.txt" \
  --keep_isymbols=false --keep_osymbols=false | \
  fstarcsort --sort_type=olabel > "$dir/L.fst" || exit 1;

# The file oov.txt contains a word that we will map any OOVs to during
# training.
echo "$oov_word" > "$dir/oov.txt" || exit 1;
utils/sym2int.pl "$dir/words.txt" < "$dir/oov.txt" > "$dir/oov.int" || exit 1;
# integer version of oov symbol, used in some scripts.

# Create these lists of phones in colon-separated integer list form too,
# for purposes of being given to programs as command-line options.
# For each phone list write the integer form (.int, one id per line) and the
# colon-separated form (.csl, a single line such as 1:2:3).
for f in silence nonsilence optional_silence disambig context_indep; do
  utils/sym2int.pl "$dir/phones.txt" <"$dir/phones/$f.txt" >"$dir/phones/$f.int" || exit 1;
  # Build the .csl from the .int just written; the leading ':' is stripped by sed.
  awk '{printf(":%d", $1);} END{printf "\n"}' <"$dir/phones/$f.int" | \
    sed s/:// > "$dir/phones/$f.csl" || exit 1;
done

for x in sets extra_questions; do
  utils/sym2int.pl "$dir/phones.txt" <"$dir/phones/$x.txt" > "$dir/phones/$x.int" || exit 1;
done

# The first two fields of roots.txt are the shared/split keywords, so only
# fields 3- are phones.
utils/sym2int.pl -f 3- "$dir/phones.txt" <"$dir/phones/roots.txt" \
  > "$dir/phones/roots.int" || exit 1;

#if $position_dependent_phones; then
# word_boundary.txt is optional, so test for the file itself.
if [ -f "$dir/phones/word_boundary.txt" ]; then
  utils/sym2int.pl -f 1 "$dir/phones.txt" <"$dir/phones/word_boundary.txt" \
    > "$dir/phones/word_boundary.int" || exit 1;
fi

silphonelist=$(cat "$dir/phones/silence.csl") || exit 1;
nonsilphonelist=$(cat "$dir/phones/nonsilence.csl") || exit 1;
utils/gen_topo.pl "$num_nonsil_states" "$num_sil_states" "$nonsilphonelist" "$silphonelist" \
  >"$dir/topo" || exit 1;

# Create the lexicon FST with disambiguation symbols, and put it in lang_test.
# There is an extra step where we create a loop to "pass through" the
# disambiguation symbols from G.fst.
# Match the symbol name exactly: a substring search for "#0" would also pick
# up any other symbol that merely contains it and return several ids.
phone_disambig_symbol=$(awk '$1 == "#0" {print $2}' "$dir/phones.txt")
word_disambig_symbol=$(awk '$1 == "#0" {print $2}' "$dir/words.txt")

# The last disambiguation symbol, #$ndisambig, is the one passed for silence.
utils/make_lexicon_fst.pl --pron-probs "$tmpdir/lexiconp_disambig.txt" $sil_prob $silphone '#'$ndisambig | \
  fstcompile --isymbols="$dir/phones.txt" --osymbols="$dir/words.txt" \
  --keep_isymbols=false --keep_osymbols=false | \
  fstaddselfloops "echo $phone_disambig_symbol |" "echo $word_disambig_symbol |" | \
  fstarcsort --sort_type=olabel > "$dir/L_disambig.fst" || exit 1;

echo "$(basename $0): validating output directory"
if ! utils/validate_lang.pl "$dir"; then
  echo "$(basename $0): error validating output"
  exit 1;
fi

exit 0;