#!/bin/bash -u # Copyright 2012 Arnab Ghoshal # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, # MERCHANTABLITY OR NON-INFRINGEMENT. # See the Apache 2 License for the specific language governing permissions and # limitations under the License. # This script shows the steps needed to build a recognizer for certain languages # of the GlobalPhone corpus. # !!! NOTE: The current recipe assumes that you have pre-built LMs. echo "This shell script may run as-is on your system, but it is recommended that you run the commands one by one by copying and pasting into the shell." #exit 1; [ -f cmd.sh ] && source ./cmd.sh \ || echo "cmd.sh not found. Jobs may not execute properly." # CHECKING FOR AND INSTALLING REQUIRED TOOLS: # This recipe requires shorten (3.6.1) and sox (14.3.2). # If they are not found, the local/gp_install.sh script will install them. local/gp_check_tools.sh $PWD path.sh . path.sh || { echo "Cannot source path.sh"; exit 1; } # Set the locations of the GlobalPhone corpus and language models GP_CORPUS=/mnt/matylda2/data/GLOBALPHONE # GP_LM=/mnt/matylda6/ijanda/GLOBALPHONE_LM GP_LM=$PWD/language_models # Set the languages that will actually be processed # export GP_LANGUAGES="CZ FR GE PL PO RU SP VN" export GP_LANGUAGES="CZ FR GE PL PO SP" # The following data preparation step actually converts the audio files from # shorten to WAV to take out the empty files and those with compression errors. local/gp_data_prep.sh --config-dir=$PWD/conf --corpus-dir=$GP_CORPUS \ --languages="$GP_LANGUAGES" local/gp_dict_prep.sh --config-dir $PWD/conf $GP_CORPUS $GP_LANGUAGES # # Use the following to map to X-SAMPA phoneset # local/gp_dict_prep.sh --config-dir $PWD/conf --map-dir $PWD/conf/xsampa_map \ # $GP_CORPUS $GP_LANGUAGES for L in $GP_LANGUAGES; do utils/prepare_lang.sh --position-dependent-phones true \ data/$L/local/dict "" data/$L/local/lang_tmp data/$L/lang \ >& data/$L/prepare_lang.log || exit 1; done # Convert the different available language models to FSTs, and create separate # decoding configurations for each. for L in $GP_LANGUAGES; do # $highmem_cmd data/$L/format_lm.log \ # local/gp_format_lm.sh --filter-vocab-sri false $GP_LM $L & $highmem_cmd data/$L/format_lm.log \ local/gp_format_lm.sh --filter-vocab-sri true $GP_LM $L & done wait # Now make MFCC features. for L in $GP_LANGUAGES; do mfccdir=mfcc/$L for x in train dev eval; do ( steps/make_mfcc.sh --nj 6 --cmd "$train_cmd" data/$L/$x \ exp/$L/make_mfcc/$x $mfccdir; steps/compute_cmvn_stats.sh data/$L/$x exp/$L/make_mfcc/$x $mfccdir; ) & done done wait; for L in $GP_LANGUAGES; do mkdir -p exp/$L/mono; steps/train_mono.sh --nj 10 --cmd "$train_cmd" \ data/$L/train data/$L/lang exp/$L/mono >& exp/$L/mono/train.log & done wait; for L in $GP_LANGUAGES; do for lm_suffix in tgpr_sri; do ( graph_dir=exp/$L/mono/graph_${lm_suffix} mkdir -p $graph_dir $highmem_cmd $graph_dir/mkgraph.log \ utils/mkgraph.sh --mono data/$L/lang_test_${lm_suffix} exp/$L/mono \ $graph_dir steps/decode.sh --nj 5 --cmd "$decode_cmd" $graph_dir data/$L/dev \ exp/$L/mono/decode_dev_${lm_suffix} ) & done done for L in $GP_LANGUAGES; do ( mkdir -p exp/$L/mono_ali steps/align_si.sh --nj 10 --cmd "$train_cmd" \ data/$L/train data/$L/lang exp/$L/mono exp/$L/mono_ali \ >& exp/$L/mono_ali/align.log num_states=$(grep "^$L" conf/tri.conf | cut -f2) num_gauss=$(grep "^$L" conf/tri.conf | cut -f3) mkdir -p exp/$L/tri1 steps/train_deltas.sh --cmd "$train_cmd" --cluster-thresh 100 \ $num_states $num_gauss data/$L/train data/$L/lang exp/$L/mono_ali \ exp/$L/tri1 >& exp/$L/tri1/train.log ) & done wait; for L in $GP_LANGUAGES; do for lm_suffix in tgpr_sri; do ( graph_dir=exp/$L/tri1/graph_${lm_suffix} mkdir -p $graph_dir $highmem_cmd $graph_dir/mkgraph.log \ utils/mkgraph.sh data/$L/lang_test_${lm_suffix} exp/$L/tri1 $graph_dir steps/decode.sh --nj 5 --cmd "$decode_cmd" $graph_dir data/$L/dev \ exp/$L/tri1/decode_dev_${lm_suffix} ) & done done # SAT-trained triphone systems: MFCC feats for L in $GP_LANGUAGES; do ( mkdir -p exp/$L/tri1_ali_fmllr steps/align_fmllr.sh --nj 10 --cmd "$train_cmd" \ data/$L/train data/$L/lang exp/$L/tri1 exp/$L/tri1_ali_fmllr \ >& exp/$L/tri1_ali_fmllr/align.log || exit 1; num_states=$(grep "^$L" conf/tri.conf | cut -f2) num_gauss=$(grep "^$L" conf/tri.conf | cut -f3) mkdir -p exp/$L/tri2a steps/train_sat.sh --cmd "$train_cmd" --cluster-thresh 100 \ $num_states $num_gauss data/$L/train data/$L/lang exp/$L/tri1_ali_fmllr \ exp/$L/tri2a >& exp/$L/tri2a/train.log ) & done wait; for L in $GP_LANGUAGES; do for lm_suffix in tgpr_sri; do ( graph_dir=exp/$L/tri2a/graph_${lm_suffix} mkdir -p $graph_dir $highmem_cmd $graph_dir/mkgraph.log \ utils/mkgraph.sh data/$L/lang_test_${lm_suffix} exp/$L/tri2a $graph_dir steps/decode_fmllr.sh --nj 5 --cmd "$decode_cmd" $graph_dir data/$L/dev \ exp/$L/tri2a/decode_dev_${lm_suffix} ) & done done # for L in $GP_LANGUAGES; do # mode=4 # # Doing this only for the LMs whose vocabs were limited using SRILM, since the # # other approach didn't yield LMs for all languages. # steps/lmrescore.sh --mode $mode --cmd "$decode_cmd" \ # data/$L/lang_test_tgpr_sri data/$L/lang_test_tg_sri data/$L/dev \ # exp/$L/tri2a/decode_dev_tgpr_sri exp/$L/tri2a/decode_dev_tg_sri$mode # done # SGMMs starting from non-SAT triphone system, both with and without # speaker vectors. for L in $GP_LANGUAGES; do ( mkdir -p exp/$L/tri1_ali steps/align_si.sh --nj 10 --cmd "$train_cmd" \ data/$L/train data/$L/lang exp/$L/tri1 exp/$L/tri1_ali \ >& exp/$L/tri1_ali/align.log mkdir -p exp/$L/ubm2a steps/train_ubm.sh --cmd "$train_cmd" \ 400 data/$L/train data/$L/lang exp/$L/tri1_ali exp/$L/ubm2a \ >& exp/$L/ubm2a/train.log || exit 1; num_states=$(grep "^$L" conf/sgmm.conf | cut -f2) num_substates=$(grep "^$L" conf/sgmm.conf | cut -f3) mkdir -p exp/$L/sgmm2a steps/train_sgmm.sh --cmd "$train_cmd" --cluster-thresh 100 --spk-dim 0 \ $num_states $num_substates data/$L/train data/$L/lang exp/$L/tri1_ali \ exp/$L/ubm2a/final.ubm exp/$L/sgmm2a >& exp/$L/sgmm2a/train.log mkdir -p exp/$L/sgmm2b steps/train_sgmm.sh --cmd "$train_cmd" --cluster-thresh 100 \ $num_states $num_gauss data/$L/train data/$L/lang exp/$L/tri1_ali \ exp/$L/ubm2a/final.ubm exp/$L/sgmm2b >& exp/$L/sgmm2b/train.log ) & done wait for L in $GP_LANGUAGES; do # Need separate decoding graphs for models with and without speaker vectors, # since the trees may be different. for sgmm in sgmm2a sgmm2b; do for lm_suffix in tgpr_sri; do ( graph_dir=exp/$L/$sgmm/graph_${lm_suffix} mkdir -p $graph_dir $highmem_cmd $graph_dir/mkgraph.log \ utils/mkgraph.sh data/$L/lang_test_${lm_suffix} exp/$L/$sgmm $graph_dir steps/decode_sgmm.sh --nj 5 --cmd "$decode_cmd" $graph_dir data/$L/dev \ exp/$L/$sgmm/decode_dev_${lm_suffix} ) & done # loop over LMs done # loop over model with and without speaker vecs done # loop over languages # Train SGMMs using SAT features for L in $GP_LANGUAGES; do ( mkdir -p exp/$L/ubm2c steps/train_ubm.sh --cmd "$train_cmd" \ 400 data/$L/train data/$L/lang exp/$L/tri1_ali_fmllr exp/$L/ubm2c \ >& exp/$L/ubm2c/train.log || exit 1; num_states=$(grep "^$L" conf/tri.conf | cut -f2) num_gauss=$(grep "^$L" conf/tri.conf | cut -f3) mkdir -p exp/$L/sgmm2c steps/train_sgmm.sh --cmd "$train_cmd" --cluster-thresh 100 \ $num_states $num_gauss data/$L/train data/$L/lang exp/$L/tri1_ali_fmllr \ exp/$L/ubm2c/final.ubm exp/$L/sgmm2c >& exp/$L/sgmm2c/train.log ) & done wait for L in $GP_LANGUAGES; do for lm_suffix in tgpr_sri; do ( graph_dir=exp/$L/sgmm2c/graph_${lm_suffix} mkdir -p $graph_dir $highmem_cmd $graph_dir/mkgraph.log \ utils/mkgraph.sh data/$L/lang_test_${lm_suffix} exp/$L/sgmm2c $graph_dir steps/decode_sgmm.sh --nj 5 --cmd "$decode_cmd" \ --transform-dir exp/$L/tri2a/decode_dev_${lm_suffix} \ $graph_dir data/$L/dev exp/$L/sgmm2c/decode_dev_${lm_suffix} ) & done done