#!/bin/bash

# Creates the data directories needed to run a syllable-based system.  The new
# system lives in a sibling directory with the same name as the current one
# plus a "-syllables" suffix.  Syllable sequences for the training and dev
# data are obtained from the alignments of an existing system (exp/tri5*),
# and a language model is then trained on those syllable sequences.
#
# Run from the top of an existing word-based setup.  Inputs read by this
# script: conf/common_vars.sh, ./lang.conf, data/{local,lang,train,dev2h,dev10h},
# exp/tri5 and exp/tri5_ali.  The variables boost_sil, dev2h_nj and train_cmd
# are not set here; presumably they come from the two sourced config files.
#
# Most stages are guarded by a ".done" marker (or by the existence of their
# output), so the script can be re-run after a failure.

set -e           # Exit on non-zero return code from any command.
set -o pipefail  # A pipeline fails if any command in it fails.
set -u           # Fail on an undefined variable.

. conf/common_vars.sh || exit 1
. ./lang.conf || exit 1

# extra config:
syllable_ins_prob=0.01  # relative to prob of the OOV word ($oov_word)

# NOTE(review): the three symbols below had been reduced to empty strings in
# the copy of this script that was reviewed (apparently angle-bracket tokens
# were stripped).  The values are a best reconstruction -- confirm them against
# data/local/lexicon.txt and data/lang/words.txt before relying on them.
oov_word="<unk>"          # word-level entry that absorbs syllables we cannot form into words
silence_word="<silence>"  # word-level silence entry, removed from G.fst output
syllable_oov="<oov>"      # OOV symbol that prepare_lang.sh requires for the syllable lang dirs

# We will create a directory with the same name as this one, but
# with "-syllables" at the end of its name.
src_dir=$(pwd)
target="${src_dir}-syllables"

if [[ ! -d "$target" ]]; then
  # Verify all prerequisites before creating anything: a partially populated
  # target directory would otherwise be skipped by this guard on the next run.
  for x in steps utils local conf lang.conf; do
    if [[ ! -s "$x" ]]; then
      echo "No such file or directory $x" >&2
      exit 1
    fi
  done

  echo ---------------------------------------------------------------------
  echo "Creating directory $target for the syllable system"
  echo ---------------------------------------------------------------------
  mkdir -p "$target"
  cp ./*.sh "$target"
  for x in steps utils local conf lang.conf; do
    if [[ -L "$x" ]]; then
      # Already a link here: copy the link itself over.
      cp -d "$x" "$target"
    else
      # Otherwise create a relative link back to this directory.
      ln -s "../$(basename "$src_dir")/$x" "$target"
    fi
  done
fi

mkdir -p "$target/data"
for dir in raw_train_data raw_dev2h_data raw_dev10h_data; do
  # Best effort: the link already exists when the script is re-run.
  ln -s "$src_dir/data/$dir" "$target/data/" || true
done

if [[ ! -f data/local_withprob/.done ]]; then
  echo -------------------------------------------------------------------------------
  echo "Creating lexicon with probabilities, data/local_withprob/lexiconp.txt on $(date)"
  echo -------------------------------------------------------------------------------

  cp -rT data/local data/local_withprob
  steps/get_lexicon_probs.sh data/train data/lang exp/tri5 data/local/lexicon.txt \
    exp/tri5_lexprobs data/local_withprob/lexiconp.txt || exit 1
  touch data/local_withprob/.done
fi

mkdir -p "$target/data/local"  # we'll put some stuff here...

if [[ ! -f "$target/data/local/.done" ]]; then
  echo ---------------------------------------------------------------------
  echo "Creating syllable-level lexicon in directory $target/data/local/w2s"
  echo ---------------------------------------------------------------------

  mkdir -p "$target/data/local/w2s"
  if ! local/make_syllable_lexicon.sh --pron-probs true data/local_withprob/lexiconp.txt \
      "$target/data/local/w2s/lexiconp.txt" "$target/data/local/lexicon.txt"; then
    echo "Error creating syllable lexicon" >&2
    exit 1
  fi

  for f in extra_questions.txt silence_phones.txt nonsilence_phones.txt optional_silence.txt; do
    cp "data/local/$f" "$target/data/local/$f"
  done
  touch "$target/data/local/.done"
fi

if [[ ! -f "$target/data/lang/.done" ]]; then
  echo ------------------------------------------------------------------------
  echo "Creating lang directories $target/data/lang and $target/data/lang_nopos"
  echo ------------------------------------------------------------------------

  # Create the "lang" directory for the per-syllable setup.
  # Note: the position dependency will now be on the syllable level, not the
  # phone level.
  # We specify zero silence prob, because we'll be putting the silence in the
  # language model in order to reduce the chances of it being inserted between
  # syllables where it shouldn't belong.
  # The OOV symbol doesn't really matter for this setup as there will be no
  # OOVs; the scripts just require it.
  utils/prepare_lang.sh --sil-prob 0.0 \
    "$target/data/local/" "$syllable_oov" "$target/data/local/tmp" "$target/data/lang"
  utils/prepare_lang.sh --sil-prob 0.0 --position-dependent-phones false \
    "$target/data/local/" "$syllable_oov" "$target/data/local/tmp" "$target/data/lang_nopos"

  # Later stages require word-symbol tables of lang and lang_nopos be the same.
  cmp "$target/data/lang/words.txt" "$target/data/lang_nopos/words.txt" || exit 1

  touch "$target/data/lang/.done"
fi

if [[ ! -f "$target/data/local/w2s/.done" ]]; then
  echo ---------------------------------------------------------------------
  echo "Creating word-to-syllable lexicon FSTs"
  echo --------------------------------------------------------------------

  cp data/lang/words.txt "$target/data/local/w2s/words.txt"
  # The syllable-level word table acts as the "phone" table of this lexicon.
  cp "$target/data/lang/words.txt" "$target/data/local/w2s/phones.txt"

  # This FST will be used for alignment of the current system to get
  # syllable-level alignments.
  utils/make_lexicon_fst.pl --pron-probs "$target/data/local/w2s/lexiconp.txt" \
      0.5 "$(cat data/lang/phones/optional_silence.txt)" \
    | fstcompile --osymbols=data/lang/words.txt --isymbols="$target/data/lang/words.txt" \
    | fstarcsort --sort_type=olabel > "$target/data/local/w2s/L.fst"

  touch "$target/data/local/w2s/.done"
fi

if [[ ! -f "$target/data/local/w2s_extended/.done" ]]; then
  echo ---------------------------------------------------------------------
  echo "Creating lexicon in $target/data/local/w2s_extended, for word decoding"
  echo "(allows penalized insertion of syllables we can't form into words)."
  echo --------------------------------------------------------------------

  dir="$target/data/local/w2s_extended"
  mkdir -p "$dir"

  # Copy every line of the word-to-syllable lexicon, then append one extra
  # pronunciation "$oov_word <prob> <syllable>" for each syllable that never
  # occurs as a single-syllable pronunciation.  Arguments: insertion prob,
  # OOV word.  The lexicon is read from stdin.
  perl -e '
    $syll_prob = shift @ARGV;
    $oov_word = shift @ARGV;
    ($syll_prob > 0.0 && $syll_prob <= 1.0) || die "Bad prob $syll_prob";
    while (<STDIN>) {
      @A = split; $word = shift @A; $prob = shift @A;
      if (!($prob > 0.0 && $prob <= 1.0)) { die "Bad pron-prob $prob in line $_"; }
      if (@A == 1) { $seen{$A[0]} = 1; }  # saw this syllable as singleton.
      foreach $syl (@A) { $is_syllable{$syl} = 1; }
      print;  # all lines of the lexicon get printed, plus some more.
    }
    # Sorted so that the generated lexicon is reproducible between runs.
    foreach $syllable (sort keys %is_syllable) {
      if (! defined $seen{$syllable}) {  # did not see as singleton pron.
        print "$oov_word $syll_prob $syllable\n";  # new pron: OOV word -> syllable.
      }
    }
  ' "$syllable_ins_prob" "$oov_word" \
    < "$target/data/local/w2s/lexiconp.txt" > "$dir/lexiconp.txt"

  ndisambig=$(utils/add_lex_disambig.pl --pron-probs "$dir/lexiconp.txt" "$dir/lexiconp_disambig.txt")
  # Add one disambig symbol for silence in lexicon FST. [may not be needed here]
  ndisambig=$((ndisambig + 1))
  for ((n = 0; n <= ndisambig; n++)); do echo "#$n"; done > "$dir/disambig.txt"

  # Syllable-level symbol table: the syllable words followed by #1..#N,
  # renumbered consecutively from 0.
  awk '{print $1}' "$target/data/lang/words.txt" \
    | cat - <(for ((n = 1; n <= ndisambig; n++)); do echo "#$n"; done) \
    | awk '{n=NR-1; print $1, n;}' > "$dir/syllables.txt"
  utils/sym2int.pl "$dir/syllables.txt" "$dir/disambig.txt" > "$dir/disambig.int"
  cp data/lang/words.txt "$dir/words.txt"  # word-level symbol table.

  # Make the lexicon with disambig symbols; determinize; remove disambig symbols.
  utils/make_lexicon_fst.pl --pron-probs "$dir/lexiconp_disambig.txt" 0.5 SIL "#$ndisambig" \
    | fstcompile --osymbols="$dir/words.txt" --isymbols="$dir/syllables.txt" \
    | fstdeterminizestar | fstrmsymbols "$dir/disambig.int" > "$dir/Ldet.fst"

  # Remove the #0 symbol from the input of word-level G.fst, and remove the
  # silence word from the output, and copy it over.  We remove silence from
  # the output because it won't matter for scoring and because it was causing
  # some problems for phone-alignment which is part of word alignment (a kind
  # of blowup from symbols getting out of sync; problems only happen if we
  # have 1 symbol per phone).
  disambig_id=$(grep -w "#0" data/lang/words.txt | awk '{print $2}')
  silence_id=$(grep -w "$silence_word" data/lang/words.txt | awk '{print $2}')
  fstrmsymbols --remove-from-output=false "echo $disambig_id|" data/lang/G.fst \
    | fstrmsymbols --remove-from-output=true "echo $silence_id|" \
    | fstarcsort > "$dir/G.fst"

  mkdir -p "$dir/tmp"
  # Get syllable-level lexicon with word-position dependent phones
  # (drop the first field of align_lexicon.txt).
  cut -d ' ' -f 2- "$target/data/lang/phones/align_lexicon.txt" > "$dir/tmp/syllable_lexicon.txt"
  # Remove pron-probs from word-level lexicon.
  perl -ape 's/(\S+\s+)\S+\s+(.+)/$1$2/;' < "$dir/lexiconp.txt" > "$dir/lexicon.txt"
  utils/apply_map.pl -f 2- "$dir/tmp/syllable_lexicon.txt" < "$dir/lexicon.txt" \
    > "$dir/tmp/word2phone_lexicon.txt"
  # Add optional silence.  It only appears in the form SIL_S, because silence
  # is always a word in the syllable setup, never optional.
  echo "$silence_word SIL_S" >> "$dir/tmp/word2phone_lexicon.txt"
  # Duplicate the first field.
  awk '{print $1, $0;}' < "$dir/tmp/word2phone_lexicon.txt" > "$dir/word_align_lexicon.txt"
  utils/sym2int.pl -f 1-2 "$dir/words.txt" < "$dir/word_align_lexicon.txt" \
    | utils/sym2int.pl -f 3- "$target/data/lang/phones.txt" > "$dir/word_align_lexicon.int"

  # Create the marker this stage's guard tests, so re-runs skip it.
  touch "$dir/.done"
fi

if [[ ! -f "$target/data/train/.done" ]]; then
  echo ---------------------------------------------------------------------
  echo "Using the data alignments to get syllable-level pronunciations"
  echo --------------------------------------------------------------------
  local/get_syllable_text.sh data/train data/lang "$target/data/lang_nopos" \
    "$target/data/local/w2s/L.fst" \
    exp/tri5_ali exp/tri5_align_syllables "$target/data/train"
  touch "$target/data/train/.done"
fi

if [[ ! -f exp/tri5_dev2h_ali/.done ]]; then
  echo ---------------------------------------------------------------------
  echo "Aligning dev data using transcripts (needed for LM dev set)"
  echo --------------------------------------------------------------------
  steps/align_fmllr.sh --retry-beam 80 \
    --boost-silence "$boost_sil" --nj "${dev2h_nj}" --cmd "$train_cmd" \
    data/dev2h data/lang exp/tri5 exp/tri5_dev2h_ali
  touch exp/tri5_dev2h_ali/.done
fi

if [[ ! -f "$target/data/dev2h/.done" ]]; then
  echo ---------------------------------------------------------------------
  echo "Using the dev-data alignments to get syllable-level pronunciations"
  echo --------------------------------------------------------------------
  local/get_syllable_text.sh data/dev2h data/lang "$target/data/lang_nopos" \
    "$target/data/local/w2s/L.fst" \
    exp/tri5_dev2h_ali exp/tri5_align_dev2h_syllables "$target/data/dev2h"
  touch "$target/data/dev2h/.done"
fi

if [[ ! -d "$target/data/dev10h" ]]; then
  echo ---------------------------------------------------------------------
  echo "Copying data directory $target/data/dev10h/"
  echo --------------------------------------------------------------------
  ## Note: the "text" in this directory is not really correct, it is word-based
  ## not syllable based.  This doesn't really matter as we won't ever use that.
  cp -rT data/dev10h "$target/data/dev10h"
  rm -rf "$target/data/dev10h"/split*
  rm -rf "$target/data/dev10h/kws"
fi

for n in 2 10; do
  # The marker is a regular file, so it is tested with -f and created below.
  if [[ -d "data/dev${n}h/kws" && ! -f "$target/data/dev${n}h/kws/.done" ]]; then
    echo ---------------------------------------------------------------------
    echo "Preparing kws directory in $target/data/dev${n}h/kws"
    echo --------------------------------------------------------------------
    # First copy the old dir, then run a script for syllable-specific setup.
    cp -rT "data/dev${n}h/kws" "$target/data/dev${n}h/kws"
    # Strip the pron-prob column from the word-to-syllable lexicon.
    perl -ape 's/(\S+\s+)\S+\s+(.+)/$1$2/;' \
      < "$target/data/local/w2s/lexiconp.txt" > "$target/data/local/w2s/lexicon.txt"
    local/kws_data_prep_syllables.sh --silence-word SIL \
      "$target/data/lang" "$target/data/dev${n}h" "$target/data/local/w2s/lexicon.txt" \
      "$target/data/dev${n}h/kws"
    touch "$target/data/dev${n}h/kws/.done"
  fi
done

if [[ ! -f "$target/data/srilm/lm.gz" ]]; then
  echo ---------------------------------------------------------------------
  echo "Training LM in $target/data/srilm"
  echo --------------------------------------------------------------------
  local/train_lms_srilm.sh --dev-text "$target/data/dev2h/text" --train-text "$target/data/train/text" \
    --words-file "$target/data/lang/words.txt" "$target/data" "$target/data/srilm"
fi

if [[ ! -f "$target/data/lang/G.fst" ]]; then
  echo ---------------------------------------------------------------------
  echo "Creating $target/data/lang/G.fst"
  echo --------------------------------------------------------------------
  local/arpa2G_syllables.sh "$target/data/srilm/lm.gz" "$target/data/lang" "$target/data/lang"
fi

exit 0

# At this point you go to the directory that it created, and run run-1-main.sh