#!/usr/bin/env bash
#
# Usage: $0 <arpa-lm-gz> <lang-dir> <dest-dir>
#
#   <arpa-lm-gz>  gzipped ARPA language model (read with gunzip -c)
#   <lang-dir>    directory containing words.txt (symbol table: "symbol id")
#   <dest-dir>    output directory; G.fst is written here
#
# This is as arpa2G.sh but specialized for the per-syllable setup.  This is
# specific to the BABEL setup.
# The difference from arpa2G.sh is that (1) we have to change <unk> to <oov>,
# because <oov> is the name of the phone that was chosen to represent the
# unknown word [note: <unk> is special to SRILM, which is why it appears in
# the vocab]; and (2) we have a special step with fstrhocompose which we use
# to ensure that silence cannot appear twice in succession.  [Silence appears
# in the language model, which would naturally allow it to appear twice in
# succession.]
#
# NOTE(review): the angle-bracket tokens (<unk>, <oov>, <s>, </s>) and the
# here-document feeding fstcompile were reconstructed from the description
# above and the surrounding pipeline -- confirm against arpa2G.sh.

if [ $# -lt 3 ]; then
  echo "Usage: $0 <arpa-lm-gz> <lang-dir> <dest-dir>" >&2
  exit 1
fi

lmfile=$1
langdir=$2
destdir=$3

words=$langdir/words.txt
rho_fst=$destdir/rho.fst

mkdir -p "$destdir" || exit 1

# Make FST that we compose with to disallow >1 silence in a row.

# The rho (match-anything-else) label must not collide with a real symbol, so
# it is taken as one past the id on the last line of words.txt.
last_id=$(tail -n 1 "$words" | awk '{print $2}') || exit 1
case $last_id in
  '' | *[!0-9]*)
    echo "Error getting last-id from $words" >&2
    exit 1
    ;;
esac

# Rejecting anything that is not a single integer also catches the case where
# more than one line of words.txt matches SIL.
silence_id=$(grep -w SIL "$words" | awk '{print $2}') || exit 1
case $silence_id in
  '' | *[!0-9]*)
    echo "Error getting silence-id from $words" >&2
    exit 1
    ;;
esac

rho=$((last_id + 1))

# rho.fst is only an intermediate file; remove it on every exit path,
# including failures of the pipeline below.
trap 'rm -f -- "$rho_fst"' EXIT

# state 0 is start-state.  state 1 is state after we saw silence.  state 2 is
# "dead state/failure state" that is not coaccessible.
# The trailing "0" and "1" lines mark states 0 and 1 as final.
fstcompile > "$rho_fst" <<EOF || exit 1
0 1 $silence_id $silence_id
0 0 $rho $rho
1 2 $silence_id $silence_id
1 0 $rho $rho
0
1
EOF

# Drop n-grams with adjacent sentence-boundary symbols, rename the unknown
# word, convert to an FST over the words.txt symbols, then compose with
# rho.fst so that two silences in a row are impossible.
gunzip -c "$lmfile" \
  | grep -v '<s> <s>' \
  | grep -v '</s> <s>' \
  | grep -v '</s> </s>' \
  | sed 's/<unk>/<oov>/g' \
  | arpa2fst - \
  | fstprint \
  | utils/eps2disambig.pl \
  | utils/s2eps.pl \
  | fstcompile --isymbols="$words" --osymbols="$words" \
      --keep_isymbols=false --keep_osymbols=false \
  | fstrhocompose "$rho" - "$rho_fst" \
  | fstrmepsilon > "$destdir/G.fst" || exit 1

# Informational only: a non-stochastic G.fst is reported but is not an error.
fstisstochastic "$destdir/G.fst" || true

exit 0