#!/bin/bash
# Copyright 2010-2012 Microsoft Corporation; Arnab Ghoshal

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# To be run from ..

# This script does training-data alignment given a model built using
# CMN + delta + delta-delta features.  It splits the data into chunks
# (four by default, see --num-jobs) and hands one job per chunk to
# submit_jobs.sh.
# Its output, all in its own experimental directory, is one TASK_ID.cmvn and
# one TASK_ID.ali.gz per job, plus tree, final.mdl and final.occs (the last
# three are just copied from the source directory).

# Option to use precompiled graphs from last phase, if these
# are available (i.e. if they were built with the same data).
# These must be split into the same number of pieces as there are jobs.

# Print a message (backslash escapes are interpreted) on stderr and exit 1.
# Arguments: the message words.
error_exit() {
  printf '%b\n' "$*" >&2
  exit 1
}

# Parse an integer option value and print it in canonical form on stdout.
# Arguments: $1 - either "ARG" or "--switch=ARG".
# On a non-integer value the error is reported through error_exit.  When
# called inside $( ... ) that only terminates the subshell, so callers must
# check the exit status of the substitution.
readint() {
  local retval=${1/#*=/}  # In case --switch=ARG format was used
  # Group 1 is the optional sign, group 2 the digits without leading zeros
  # (a lone "0" is kept, so it reaches the caller's range check).
  [[ "$retval" =~ ^(-?)0*([0-9]+)$ ]] \
    || error_exit "Argument \"$retval\" not an integer."
  printf '%s\n' "${BASH_REMATCH[1]}${BASH_REMATCH[2]}"
}

njobs=4     # Default number of jobs
qcmd=""     # Options for the submit_jobs.sh script
oldgraphs=false

PROG=$(basename -- "$0")
# The four positional arguments are the ones read into data, lang, srcdir
# and dir below.
usage="Usage: $PROG [options] <data-dir> <lang-dir> <src-dir> <exp-dir>
 e.g.: $PROG data/train data/lang exp/tri1 exp/tri1_ali

Options:
  --help          Print this message and exit
  --num-jobs INT  Number of parallel jobs to run (default=$njobs).
  --qcmd STRING   Command for submitting a job to a grid engine (e.g. qsub)
                  including switches.
  --use-graphs    Reuse older graphs
"

while [[ $# -gt 0 ]]; do
  # Strip all leading spaces from the argument before matching it.
  opt=${1#"${1%%[! ]*}"}
  case "$opt" in
    --help)
      printf '%s\n' "$usage"
      exit 0
      ;;
    --num-jobs | --num-jobs=*)
      if [[ "$opt" == --num-jobs ]]; then
        [[ $# -ge 2 ]] || error_exit "--num-jobs requires an argument."
        shift
      fi
      # "|| exit 1": readint runs in a subshell, so its error_exit cannot
      # stop this script by itself.
      njobs=$(readint "$1") || exit 1
      [[ "$njobs" -ge 1 ]] \
        || error_exit "--num-jobs arg '$njobs' not positive."
      shift
      ;;
    --qcmd)
      [[ $# -ge 2 ]] || error_exit "--qcmd requires an argument."
      qcmd=" --qcmd=$2"
      shift 2
      ;;
    --qcmd=*)
      qcmd=" --qcmd=${opt#--qcmd=}"
      shift
      ;;
    --use-graphs)
      oldgraphs=true
      shift
      ;;
    -*)
      echo "Unknown argument: $1, exiting" >&2
      printf '%s\n' "$usage" >&2
      exit 1
      ;;
    *)
      break  # end of options: interpreted as the data-dir
      ;;
  esac
done

if [[ $# -ne 4 ]]; then
  printf '%s\n' "$usage" >&2
  exit 1
fi

# Source the path.sh that was just tested for in the current directory
# (a bare ". path.sh" would search $PATH first).
if [[ -f path.sh ]]; then
  . ./path.sh
fi

data=$1
lang=$2
srcdir=$3
dir=$4

# oov_opt is spliced into a command string later on, so the single quotes
# around the OOV word are part of the value on purpose.
if [[ -f "$lang/oov.txt" ]]; then
  oov_opt="--map-oov '$(cat "$lang/oov.txt")'"
else
  oov_opt='--ignore-oov'
fi

mkdir -p "$dir" || error_exit "Cannot create directory $dir."

# Create copy of the tree and model and occs...
cp "$srcdir"/{tree,final.mdl,final.occs} "$dir" || exit 1

scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"

# (Re)split the data if the split directory is missing or older than feats.scp.
if [[ ! -d "$data/split$njobs" \
      || "$data/split$njobs" -ot "$data/feats.scp" ]]; then
  split_data.sh "$data" "$njobs" || error_exit "Splitting data failed."
fi

echo "Computing cepstral mean and variance statistics"
# TASK_ID is passed through literally; presumably submit_jobs.sh substitutes
# the job index for it -- confirm against that script.
submit_jobs.sh "$qcmd" "--njobs=$njobs" "--log=$dir/cmvnTASK_ID.log" \
  compute-cmvn-stats "--spk2utt=ark:$data/split$njobs/TASK_ID/spk2utt" \
  "scp:$data/split$njobs/TASK_ID/feats.scp" "ark:$dir/TASK_ID.cmvn" \
  || error_exit "Computing CMN/CVN stats failed."

# Align all training data using the supplied model.
echo "Aligning data from $data"

# Per-job feature pipeline: apply the job's CMVN stats (mean only, since
# --norm-vars=false) and append deltas.  TASK_ID is left literal here;
# presumably submit_jobs.sh substitutes the job index -- confirm there.
feats="ark:apply-cmvn --norm-vars=false --utt2spk=ark:$data/utt2spk ark:$dir/TASK_ID.cmvn scp:$data/split$njobs/TASK_ID/feats.scp ark:- | add-deltas ark:- ark:- |"

if [[ "$oldgraphs" == true ]]; then
  # Reuse precompiled graphs from $srcdir; make sure there is one per job.
  # A brace range such as {1..$njobs} cannot be used for this: brace expansion
  # happens before parameter expansion, so the range would stay literal and
  # the check would always fail.  Job indices are taken to run from 1 to
  # $njobs, the range the check is meant to cover -- confirm against
  # submit_jobs.sh.
  for ((n = 1; n <= njobs; n++)); do
    [[ -f "$srcdir/$n.fsts.gz" ]] \
      || error_exit "Missing FSTs with --use-graphs option specified."
  done

  # $scale_opts is deliberately unquoted: it holds several options.
  # shellcheck disable=SC2086
  submit_jobs.sh "$qcmd" "--njobs=$njobs" "--log=$dir/alignTASK_ID.log" \
    gmm-align-compiled $scale_opts --beam=10 --retry-beam=40 \
    "$dir/final.mdl" "ark:gunzip -c $srcdir/TASK_ID.fsts.gz|" "$feats" \
    "ark:|gzip -c >$dir/TASK_ID.ali.gz" \
    || error_exit "Error doing alignment."
else
  # compute integer form of transcripts.
  tra="ark:sym2int.pl $oov_opt --ignore-first-field $lang/words.txt $data/split$njobs/TASK_ID/text|"

  # We could just use gmm-align in the next line, but it's less efficient as
  # it compiles the training graphs one by one.
  # The escaped \| hands a literal "|" to submit_jobs.sh as an argument, so
  # the pipe is not created by this shell.
  # shellcheck disable=SC2086
  submit_jobs.sh "$qcmd" "--njobs=$njobs" "--log=$dir/alignTASK_ID.log" \
    compile-train-graphs "$dir/tree" "$dir/final.mdl" "$lang/L.fst" \
    "$tra" ark:- \| \
    gmm-align-compiled $scale_opts --beam=10 --retry-beam=40 \
    "$dir/final.mdl" ark:- "$feats" "ark:|gzip -c >$dir/TASK_ID.ali.gz" \
    || error_exit "Error doing alignment."
fi

echo "Done aligning data."