#!/bin/bash

# Copyright 2012  Arnab Ghoshal
# Copyright 2010-2011  Microsoft Corporation

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# For each language code (GE PO SP SW) found under <work-dir>/data, this script
#   * copies the train/dev/eval lists from data/<L>/local into data/<L>/<set>,
#   * builds data/<L>/lang (L.fst, phone lists, questions, HMM topology), and
#   * builds data/<L>/lang_test (disambiguated and alignment lexicon FSTs),
# and finally hands the 'data' directory to gp_format_lms_edin.sh.
#
# Usage: see the 'usage' string below (--hmm-proto=FILE --work-dir=DIR).

# Options are set here rather than in the shebang so they are still in effect
# when the script is run as 'bash <script>'.
set -o errexit
set -o nounset
set -o pipefail

# Print a message (backslash escapes are interpreted) to stderr and exit 1.
function error_exit () {
  echo -e "$@" >&2
  exit 1
}

# Given an option of the form --name=DIR, print the absolute path of DIR.
# Exits with an error if DIR is not an existing, enterable directory.
function read_dirname () {
  # Strip everything up to and including the first '='.
  local dir_name=${1#*=}
  [ -d "$dir_name" ] || error_exit "Argument '$dir_name' not a directory"
  # Declaration is kept separate from the assignment so that a failing 'cd'
  # is not hidden by the exit status of 'local'.
  local abs_dir
  abs_dir=$(cd -- "$dir_name" >/dev/null 2>&1 && pwd) \
    || error_exit "Cannot change to directory '$dir_name'"
  printf '%s\n' "$abs_dir"
}

# Print the second column of the line whose first column is exactly '#0' in
# the symbol table $1. Exits with an error if there is no such line.
function disambig_zero_id () {
  local table=$1
  local id
  id=$(awk '$1 == "#0" { print $2 }' "$table")
  [ -n "$id" ] || error_exit "No '#0' symbol found in '$table'"
  printf '%s\n' "$id"
}

PROG=$(basename "$0")
usage="Usage: $PROG \n"
usage+=" Prepare train, dev, eval file lists for a language.\n\n"
usage+=" Required arguments:\n"
usage+=" --hmm-proto=FILE\tPrototype of the HMM topology\n"
usage+=" --work-dir=DIR\t\tWorking directory\n"

if [ $# -eq 0 ]; then
  error_exit "$usage"
fi

PROTO=
WDIR=
while [ $# -gt 0 ]; do
  case "$1" in
    --help)
      echo -e "$usage"
      exit 0
      ;;
    --hmm-proto=*)
      proto_arg=${1#*=}
      [ -f "$proto_arg" ] \
        || error_exit "Cannot find HMM prototype file '$proto_arg'"
      # Make the path absolute: the prototype is read after the 'cd' into the
      # working directory below, where a relative path would no longer refer
      # to the file that was just checked.
      proto_dir=$(cd -- "$(dirname -- "$proto_arg")" >/dev/null 2>&1 && pwd) \
        || error_exit "Cannot resolve directory of '$proto_arg'"
      PROTO=$proto_dir/$(basename -- "$proto_arg")
      shift
      ;;
    --work-dir=*)
      WDIR=$(read_dirname "$1")
      shift
      ;;
    *)
      echo "Unknown argument: $1, exiting" >&2
      error_exit "$usage"
      ;;
  esac
done

# Both options are required; fail before touching any data if one is missing.
[ -n "$PROTO" ] || error_exit "Missing required argument --hmm-proto\n\n$usage"
[ -n "$WDIR" ] || error_exit "Missing required argument --work-dir\n\n$usage"

cd "$WDIR"
# Sourced without a directory component, as in the original script: bash
# looks the name up in PATH first and then in the current directory.
. path.sh

# Language codes processed by both the training and the test preparation.
languages=(GE PO SP SW)

echo "Preparing train data"
for LCODE in "${languages[@]}"; do
  local_dir=data/$LCODE/local
  lang_dir=data/$LCODE/lang

  # (0) Create a directory to contain files needed in training:
  for x in train dev eval; do
    set_dir=data/$LCODE/$x
    mkdir -p "$set_dir"
    cp "$local_dir/${x}_${LCODE}_wav.scp" "$set_dir/wav.scp"
    cp "$local_dir/${x}_${LCODE}.trans2" "$set_dir/text"
    cp "$local_dir/${x}_${LCODE}.spk2utt" "$set_dir/spk2utt"
    cp "$local_dir/${x}_${LCODE}.utt2spk" "$set_dir/utt2spk"
  done

  mkdir -p "$lang_dir"
  cp "$local_dir/phones.txt" "$lang_dir/"
  cp "$local_dir/words.txt" "$lang_dir/"

  # (1) Generate colon-separated lists of silence and non-silence phones, and
  # the file 'oov.txt' containing a word that all OOVs map to during training.
  silphones="SIL SPN"
  silphones.pl "$lang_dir/phones.txt" "$silphones" \
    "$lang_dir/silphones.csl" "$lang_dir/nonsilphones.csl"
  # NOTE(review): this writes a single empty line, whereas the comment above
  # describes a word; confirm which OOV token is intended here.
  echo "" > "$lang_dir/oov.txt"

  # (2) Create the L.fst without disambiguation symbols, for use in training.
  make_lexicon_fst.pl "$local_dir/lexicon_${LCODE}.txt" 0.5 SIL \
    | fstcompile --isymbols="$lang_dir/phones.txt" \
        --osymbols="$lang_dir/words.txt" --keep_isymbols=false \
        --keep_osymbols=false \
    | fstarcsort --sort_type=olabel > "$lang_dir/L.fst"

  # (3) Create phonesets.txt and extra_questions.txt.
  gp_make_questions.pl -i "$lang_dir/phones.txt" \
    -m "$lang_dir/phonesets_mono.txt" -r "$lang_dir/roots.txt"
  # gp_extra_questions_${LCODE}.pl -i data/$LCODE/lang/phones.txt \
  #   -e data/$LCODE/lang/extra_questions.txt
  grep -v SIL "$lang_dir/phonesets_mono.txt" \
    > "$lang_dir/phonesets_cluster.txt"

  # (4) Finally, for training, create the HMM topology prototype by filling
  # the phone lists (space-separated) into the placeholders of $PROTO.
  # NONSILENCEPHONES is substituted first because SILENCEPHONES is a
  # substring of it.
  silphonelist=$(sed 's/:/ /g' "$lang_dir/silphones.csl")
  nonsilphonelist=$(sed 's/:/ /g' "$lang_dir/nonsilphones.csl")
  sed -e "s:NONSILENCEPHONES:$nonsilphonelist:" \
      -e "s:SILENCEPHONES:$silphonelist:" "$PROTO" > "$lang_dir/topo"
done

echo "Preparing test data"
for LCODE in "${languages[@]}"; do
  local_dir=data/$LCODE/local
  lang_dir=data/$LCODE/lang
  test_dir=data/$LCODE/lang_test

  # (0) Copy over some files common to train and test:
  mkdir -p "$test_dir"
  for f in phones.txt words.txt L.fst silphones.csl nonsilphones.csl; do
    cp "$lang_dir/$f" "$test_dir/"
  done

  # (1) Create a list of phones including the disambiguation symbols.
  # --include-zero includes the #0 symbol that is passed from G.fst
  ndisambig=$(cat "$local_dir/lex_ndisambig")
  add_disambig.pl --include-zero "$test_dir/phones.txt" "$ndisambig" \
    > "$test_dir/phones_disambig.txt"
  cp "$test_dir/phones_disambig.txt" "$lang_dir/"  # for MMI.

  # (2) Create the lexicon FST with disambiguation symbols. There is an extra
  # step where we create a loop to "pass through" the disambiguation symbols
  # from G.fst.
  phone_disambig_symbol=$(disambig_zero_id "$test_dir/phones_disambig.txt")
  word_disambig_symbol=$(disambig_zero_id "$test_dir/words.txt")

  make_lexicon_fst.pl "$local_dir/lexicon_disambig_${LCODE}.txt" 0.5 SIL \
      "#${ndisambig}" \
    | fstcompile --isymbols="$test_dir/phones_disambig.txt" \
        --osymbols="$test_dir/words.txt" --keep_isymbols=false \
        --keep_osymbols=false \
    | fstaddselfloops "echo $phone_disambig_symbol |" \
        "echo $word_disambig_symbol |" \
    | fstarcsort --sort_type=olabel > "$test_dir/L_disambig.fst"

  # Needed for discriminative training
  cp "$test_dir/L_disambig.fst" "$lang_dir/"

  # (3) Create L_align.fst, which is as L.fst but with alignment symbols (#1
  # and #2 at the beginning and end of words, on the input side). These are
  # used to work out word boundaries. Useful if we ever need to create ctm's
  awk '{printf("%s #1 ", $1); for (n=2; n <= NF; n++) { printf("%s ", $n); } print "#2"; }' \
      "$local_dir/lexicon_${LCODE}.txt" \
    | make_lexicon_fst.pl - 0.5 SIL \
    | fstcompile --isymbols="$test_dir/phones_disambig.txt" \
        --osymbols="$test_dir/words.txt" --keep_isymbols=false \
        --keep_osymbols=false \
    | fstarcsort --sort_type=olabel > "$test_dir/L_align.fst"
done

# Convert the different available language models to FSTs, and create separate
# decoding configurations for each. -- This is very Edinburgh specific.
# TODO(arnab): The core formatting is done in a format_lm function inside this
# script, which will be common across setups, so it can probably be taken out
# and put as a separate script in the utils directory.
gp_format_lms_edin.sh data

echo "Succeeded in formatting data."