#!/bin/bash

# Copyright 2012  Arnab Ghoshal
# Copyright 2010-2011  Microsoft Corporation

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# Formats the train/dev/test data and builds the language directories.
#
# Reads its inputs from data/local/ (relative to the working directory) and
# writes data/{train,dev,test}, data/lang and data/lang_test.
#
# Required arguments:
#   --hmm-proto=FILE   Prototype of the HMM topology
#   --work-dir=DIR     Working directory (the script changes into it and
#                      sources path.sh from there)

# Options are set here rather than on the shebang line so that they also
# apply when the script is started as "bash <script>".
set -o nounset
set -o errexit
set -o pipefail

# Prints all arguments to stderr (backslash escapes are interpreted) and
# terminates the script with exit status 1.
function error_exit () {
  echo -e "$@" >&2
  exit 1
}

# Takes one "--option=DIR" argument and prints the absolute path of DIR.
# Exits with an error if DIR is not an existing directory.
function read_dirname () {
  local dir_name="${1#*=}"   # everything after the first '='
  local abs_path
  [ -d "$dir_name" ] || error_exit "Argument '$dir_name' not a directory"
  # Declaration and assignment are separate so a failing cd is not masked by
  # the exit status of 'local'. cd's own output (CDPATH) is discarded.
  abs_path=$(cd -- "$dir_name" >/dev/null 2>&1 && pwd) \
    || error_exit "Argument '$dir_name' not a directory"
  echo "$abs_path"
}

# Prints the second field of the first line of symbol table $1 whose first
# field is exactly $2. Exits with an error if there is no such line.
function symbol_id () {
  local table="$1" symbol="$2" id
  id=$(awk -v sym="$symbol" '$1 == sym { print $2; exit }' "$table")
  [ -n "$id" ] || error_exit "Symbol '$symbol' not found in '$table'"
  echo "$id"
}

PROG=$(basename -- "$0")

usage="Usage: $PROG \n"
usage+="Prepare train, dev, test file lists.\n\n"
usage+="Required arguments:\n"
usage+="  --hmm-proto=FILE\tPrototype of the HMM topology\n"
usage+="  --work-dir=DIR\t\tWorking directory\n"

PROTO=""
WDIR=""

while [ $# -gt 0 ]; do
  case "$1" in
    --help)
      echo -e "$usage"
      exit 0
      ;;
    --hmm-proto=*)
      PROTO="${1#*=}"
      # Quoted on purpose: an unquoted empty value would turn this into
      # "[ -f ]", which is true.
      [ -f "$PROTO" ] || error_exit "Cannot find HMM prototype file '$PROTO'"
      # The prototype is read only after changing into the working directory,
      # so a relative path has to be made absolute now.
      proto_dir=$(cd -- "$(dirname -- "$PROTO")" >/dev/null 2>&1 && pwd) \
        || error_exit "Cannot find HMM prototype file '$PROTO'"
      PROTO="${proto_dir%/}/$(basename -- "$PROTO")"
      shift
      ;;
    --work-dir=*)
      WDIR=$(read_dirname "$1")
      shift
      ;;
    *)
      echo "Unknown argument: $1, exiting" >&2
      error_exit "$usage"
      ;;
  esac
done

# Both options are required; report a missing one with the usage text instead
# of failing later on an unbound variable.
if [ -z "$PROTO" ] || [ -z "$WDIR" ]; then
  error_exit "$usage"
fi

cd -- "$WDIR"
# NOTE(review): path.sh is presumably what puts the helper scripts and FST
# tools used below on PATH -- confirm against the recipe's path.sh.
. path.sh

echo "Preparing train data"

# (0) Create a directory to contain files needed in training:
for x in train dev test; do
  mkdir -p "data/$x"
  cp "data/local/${x}_wav.scp" "data/$x/wav.scp"
  cp "data/local/${x}.trans2" "data/$x/text"
  cp "data/local/${x}.spk2utt" "data/$x/spk2utt"
  cp "data/local/${x}.utt2spk" "data/$x/utt2spk"
done

mkdir -p data/lang
cp data/local/phones.txt data/lang/
cp data/local/words.txt data/lang/

# (1) Generate colon-separated lists of silence and non-silence phones
silphones="cl epi sil vcl"
silphones.pl data/lang/phones.txt "$silphones" \
  data/lang/silphones.csl data/lang/nonsilphones.csl

# (2) Create the L.fst without disambiguation symbols, for use in training.
make_lexicon_fst.pl data/local/lexicon.txt 0.5 sil \
  | fstcompile --isymbols=data/lang/phones.txt \
      --osymbols=data/lang/words.txt --keep_isymbols=false \
      --keep_osymbols=false \
  | fstarcsort --sort_type=olabel > data/lang/L.fst

# (3) Create phonesets.txt and extra_questions.txt.
timit_make_questions.pl -i data/lang/phones.txt \
  -m data/lang/phonesets_mono.txt -r data/lang/roots.txt
grep -v sil data/lang/phonesets_mono.txt \
  > data/lang/phonesets_cluster.txt
echo "$silphones" > data/lang/extra_questions.txt

# (4) Finally, for training, create the HMM topology prototype:
silphonelist=$(sed 's/:/ /g' data/lang/silphones.csl)
nonsilphonelist=$(sed 's/:/ /g' data/lang/nonsilphones.csl)
# NONSILENCEPHONES has to be replaced first: SILENCEPHONES is a substring of
# it and would otherwise match inside the longer placeholder.
sed -e "s:NONSILENCEPHONES:$nonsilphonelist:" \
    -e "s:SILENCEPHONES:$silphonelist:" "$PROTO" > data/lang/topo

echo "Preparing test data"

# (0) Copy over some files common to train and test:
mkdir -p data/lang_test
for f in phones.txt words.txt L.fst silphones.csl nonsilphones.csl; do
  cp "data/lang/$f" data/lang_test/
done

# (1) Create a list of phones including the disambiguation symbols.
#     --include-zero includes the #0 symbol that is passed from G.fst
ndisambig=$(cat data/local/lex_ndisambig)
[ -n "$ndisambig" ] || error_exit "data/local/lex_ndisambig is empty"
add_disambig.pl --include-zero data/lang_test/phones.txt "$ndisambig" \
  > data/lang_test/phones_disambig.txt
cp data/lang_test/phones_disambig.txt data/lang/  # for MMI.

# (2) Create the lexicon FST with disambiguation symbols. There is an extra
#     step where we create a loop to "pass through" the disambiguation symbols
#     from G.fst.
phone_disambig_symbol=$(symbol_id data/lang_test/phones_disambig.txt '#0')
word_disambig_symbol=$(symbol_id data/lang_test/words.txt '#0')

# The two fstaddselfloops arguments are passed as "echo <id> |" strings,
# exactly as before.
make_lexicon_fst.pl data/local/lexicon_disambig.txt 0.5 sil "#$ndisambig" \
  | fstcompile --isymbols=data/lang_test/phones_disambig.txt \
      --osymbols=data/lang_test/words.txt --keep_isymbols=false \
      --keep_osymbols=false \
  | fstaddselfloops "echo $phone_disambig_symbol |" \
      "echo $word_disambig_symbol |" \
  | fstarcsort --sort_type=olabel > data/lang_test/L_disambig.fst

# Needed for discriminative training
cp data/lang_test/L_disambig.fst data/lang/

# (3) Convert the language model to FST, and create decoding configuration.
timit_format_lms.sh data

echo "Succeeded in formatting data."