#!/bin/bash
# Copyright 2012 Navdeep Jaitly
# Copyright 2010-2011 Microsoft Corporation

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# Formats the files prepared under data/local into data/lang, data/lang_test
# and one directory per data set (train, test, dev).
#
# To be run from one directory above this script.

# Pick up the environment from path.sh when it is present.
if [ -f path.sh ]; then . path.sh; fi

# Gzipped ARPA-format language model.
arpa_lm=data/local/lm/biphone/lm_unpruned.gz
# Space-separated list of data sets; deliberately expanded unquoted below so
# that it splits into one word per set.
data_list="train test dev"

for x in lang lang_test $data_list; do
  mkdir -p "data/$x" || exit 1
done

# Copy stuff into its final location:
for x in $data_list; do
  cp "data/local/$x.spk2utt" "data/$x/spk2utt" || exit 1
  cp "data/local/$x.utt2spk" "data/$x/utt2spk" || exit 1
  cp "data/local/${x}_wav.scp" "data/$x/wav.scp" || exit 1
  cp "data/local/${x}_trans.txt" "data/$x/text" || exit 1
  # Restrict the speaker-to-gender map to the speakers listed in this set's
  # spk2utt.
  scripts/filter_scp.pl "data/$x/spk2utt" data/local/spk2gender.map \
    > "data/$x/spk2gender" || exit 1
done

# Word and phone symbol tables are both derived from the lexicon.
scripts/make_words_symtab.pl < data/local/lexicon.txt \
  > data/lang/words.txt || exit 1
scripts/make_phones_symtab.pl < data/local/lexicon.txt \
  > data/lang/phones.txt || exit 1
cp data/lang/words.txt data/lang_test/words.txt || exit 1

# This would in general be a space-separated list of all silence phones.
# E.g. "sil vn"
silphones="sil"

# Generate colon-separated lists of silence and non-silence phones.
scripts/silphones.pl data/lang/phones.txt "$silphones" \
  data/lang/silphones.csl data/lang/nonsilphones.csl || exit 1

# The script writes data/local/lexicon_disambig.txt; its stdout is captured
# and used as the number of disambiguation symbols.
ndisambig=$(scripts/add_lex_disambig.pl data/local/lexicon.txt \
  data/local/lexicon_disambig.txt) || exit 1
# Add one disambig symbol for silence in lexicon FST.
ndisambig=$((ndisambig + 1))
# Build the phone symbol table that is used as the input symbol table of the
# disambiguated lexicon below; $ndisambig is the disambiguation symbol count.
scripts/add_disambig.pl data/lang/phones.txt "$ndisambig" \
  > data/lang_test/phones_disambig.txt || exit 1
cp data/lang_test/phones_disambig.txt data/lang/ || exit 1 # needed for MMI.

echo "Creating L.fst"
silprob=0.3 # same prob as word
# Lexicon text -> compiled FST (phones in, words out), arc-sorted on output
# labels.
scripts/make_lexicon_fst.pl data/local/lexicon.txt "$silprob" sil \
  | fstcompile --isymbols=data/lang/phones.txt --osymbols=data/lang/words.txt \
      --keep_isymbols=false --keep_osymbols=false \
  | fstarcsort --sort_type=olabel > data/lang/L.fst
echo "Done creating L.fst"

# L_disambig.fst has the disambiguation symbols (c.f. Mohri's papers)
echo "Creating L_disambig.fst"
# The extra "#N" argument names the last disambiguation symbol, i.e. the one
# that was added to the count for silence.
scripts/make_lexicon_fst.pl data/local/lexicon_disambig.txt "$silprob" sil \
    "#${ndisambig}" \
  | fstcompile --isymbols=data/lang_test/phones_disambig.txt \
      --osymbols=data/lang_test/words.txt \
      --keep_isymbols=false --keep_osymbols=false \
  | fstarcsort --sort_type=olabel \
  > data/lang_test/L_disambig.fst
echo "Done creating L_disambig.fst"
cp data/lang_test/L_disambig.fst data/lang/ || exit 1 # Needed for MMI training.

echo "Creating G.fst"
# Earlier variant of the pipeline, kept for reference. It differs from the
# active one only by the extra scripts/eps2disambig.pl stage.
#gunzip -c "$arpa_lm" | \
#  grep -v '<s> <s>' | \
#  grep -v '</s> <s>' | \
#  grep -v '</s> </s>' | \
#  arpa2fst - | fstprint | \
#  scripts/remove_oovs.pl /dev/null | \
#  scripts/eps2disambig.pl | scripts/s2eps.pl | \
#  fstcompile --isymbols=data/lang/words.txt --osymbols=data/lang_test/words.txt --keep_isymbols=false \
#  --keep_osymbols=false > data/lang_test/G.fst

# Drop n-grams that contain an impossible sentence-boundary sequence before
# converting the ARPA model. The patterns must name the boundary tokens
# explicitly: a bare ' ' pattern would discard every line containing a space,
# which includes all multi-word n-grams.
gunzip -c "$arpa_lm" \
  | grep -v '<s> <s>' \
  | grep -v '</s> <s>' \
  | grep -v '</s> </s>' \
  | arpa2fst - | fstprint \
  | scripts/remove_oovs.pl /dev/null \
  | scripts/s2eps.pl \
  | fstcompile --isymbols=data/lang/words.txt \
      --osymbols=data/lang_test/words.txt \
      --keep_isymbols=false --keep_osymbols=false \
  > data/lang_test/G.fst

# Informational only: the result of fstisstochastic is reported, not enforced.
echo "G.fst created. How stochastic is it ?"
fstisstochastic data/lang_test/G.fst

# Checking that G.fst is determinizable.
fstdeterminize data/lang_test/G.fst /dev/null || echo Error determinizing G.

# Checking that L_disambig.fst is determinizable.
fstdeterminize data/lang_test/L_disambig.fst /dev/null || echo Error determinizing L.
# Checking that disambiguated lexicon times G is determinizable
fsttablecompose data/lang_test/L_disambig.fst data/lang_test/G.fst \
  | fstdeterminize > /dev/null || echo Error

# Checking that LG is stochastic:
echo "How stochastic is LG.fst."
# NOTE(review): this first call reports on G.fst alone, under the LG banner;
# the composed L*G check is the pipeline that follows. Kept so that the
# script's output is unchanged.
fstisstochastic data/lang_test/G.fst
fsttablecompose data/lang/L.fst data/lang_test/G.fst \
  | fstisstochastic

# Checking that LG_disambig.fst is stochastic:
echo "How stochastic is LG_disambig.fst."
fsttablecompose data/lang_test/L_disambig.fst data/lang_test/G.fst \
  | fstisstochastic

## Check lexicon.
## just have a look and make sure it seems sane.
echo "First few lines of lexicon FST:"
fstprint --isymbols=data/lang/phones.txt --osymbols=data/lang/words.txt \
  data/lang/L.fst | head

# Turn the colon-separated phone lists into space-separated ones.
silphonelist=$(sed 's/:/ /g' data/lang/silphones.csl)
nonsilphonelist=$(sed 's/:/ /g' data/lang/nonsilphones.csl)

# Fill in the topology prototype. NONSILENCEPHONES has to be substituted
# first, because SILENCEPHONES is a substring of it.
sed -e "s:NONSILENCEPHONES:$nonsilphonelist:" \
    -e "s:SILENCEPHONES:$silphonelist:" \
    conf/topo.proto > data/lang/topo

# Mirror the shared language files into the test language directory.
for x in phones.txt words.txt silphones.csl nonsilphones.csl topo; do
  cp "data/lang/$x" "data/lang_test/$x" || exit 1
done

echo timit_format_data succeeded.