#!/bin/bash

# Resegmentation recipe.
#
# Stages, in the order they appear below:
#   1. Align a 30k subset with exp/tri3b and train an LDA+MLLT system
#      (exp/tri4b_seg) on it; build a phone decoding graph for that system.
#   2. Copy data/{train,eval2000} to data_reseg/*_orig with the segmentation
#      and transcripts removed (one "utterance" per wav.scp entry), extract
#      features, and decode them with the phone graph.
#   3. Resegment both sets from the decode output, recover the training
#      text for the new segments, and re-extract features.
#   4. Train a SAT system (exp/tri4c_reseg) on the resegmented training data
#      and decode the resegmented eval2000 with it.
#   5. (After "exit 0", never reached) an experiment that keeps segments
#      designated as noise-only.
#
# Uses $train_cmd and $decode_cmd, presumably defined by cmd.sh -- confirm.
# Must be run from the recipe directory that contains steps/, utils/,
# data/ and exp/ (all paths below are relative).

. cmd.sh

# Separate feature directory: don't use "mfcc", because of the way names are
# assigned within that dir we would overwrite the old data.
mfccdir=mfcc_reseg

# --- Stage 1: segmentation model -------------------------------------------

steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
  data/train_30k_nodup data/lang exp/tri3b exp/tri3b_ali_30k_nodup || exit 1;

steps/train_lda_mllt.sh --cmd "$train_cmd" --realign-iters "" \
  1000 10000 data/train_30k_nodup data/lang exp/tri3b_ali_30k_nodup \
  exp/tri4b_seg || exit 1;

steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
  data/train data/lang exp/tri3b exp/tri3b_ali_all || exit 1;

# Make the phone decoding-graph.
steps/make_phone_graph.sh data/lang exp/tri3b_ali_all exp/tri4b_seg || exit 1;

# --- Stage 2: unsegmented copies of the data + first-pass decode ------------

mkdir -p data_reseg
mkdir -p "$mfccdir"

for data in train eval2000; do
  orig_dir=data_reseg/${data}_orig

  cp -rT "data/${data}" "$orig_dir" || exit 1;
  # Drop any split* subdirectories that came along with the copy.
  rm -r "$orig_dir"/split*
  # Remove everything that depends on the old segmentation; rm just prints
  # an error and carries on for files a data set does not have.
  for f in text utt2spk spk2utt feats.scp cmvn.scp segments; do
    rm "$orig_dir/$f"
  done

  # Each wav.scp entry becomes its own utterance and its own speaker, so
  # utt2spk and spk2utt have identical contents.
  awk '{print $1, $1;}' "$orig_dir/wav.scp" | \
    tee "$orig_dir/spk2utt" > "$orig_dir/utt2spk"

  steps/make_mfcc.sh --compress true --nj 20 --cmd "$train_cmd" \
    "$orig_dir" "exp/make_mfcc/${data}_orig" "$mfccdir" || exit 1;
  # caution: the new speakers don't correspond to the old ones, since they
  # now have "sw0" at the start..
  steps/compute_cmvn_stats.sh --two-channel \
    "$orig_dir" "exp/make_mfcc/${data}_orig" "$mfccdir" || exit 1;
done

steps/decode_nolats.sh --write-words false --write-alignments true \
  --cmd "$decode_cmd" --nj 60 --beam 7.0 --max-active 1000 \
  exp/tri4b_seg/phone_graph data_reseg/train_orig \
  exp/tri4b_seg/decode_train_orig || exit 1;

steps/decode_nolats.sh --write-words false --write-alignments true \
  --cmd "$decode_cmd" --nj 10 --beam 7.0 --max-active 1000 \
  exp/tri4b_seg/phone_graph data_reseg/eval2000_orig \
  exp/tri4b_seg/decode_eval2000_orig || exit 1;

# --- Stage 3: resegment ----------------------------------------------------

# Here: resegment.
# Note: it would be perfectly possible to use exp/tri3b_ali_train here instead
# of exp/tri4b_seg/decode_train_orig.  In this case we'd be relying on the
# transcripts.  I chose not to do this for more consistency with what happens
# in test time.
steps/resegment_data.sh --cmd "$train_cmd" data_reseg/train_orig data/lang \
  exp/tri4b_seg/decode_train_orig data_reseg/train \
  exp/tri4b_resegment_train || exit 1;

steps/resegment_data.sh --cmd "$train_cmd" data_reseg/eval2000_orig data/lang \
  exp/tri4b_seg/decode_eval2000_orig data_reseg/eval2000 \
  exp/tri4b_resegment_eval2000 || exit 1;

# We need all the training data to be aligned (not just "train_nodup"), in
# order to get the resegmented "text".
# NOTE(review): this is the same command as the one that produced
# exp/tri3b_ali_all above, only with a different output directory; kept
# because both directory names are referenced below/above.
steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
  data/train data/lang exp/tri3b exp/tri3b_ali_train || exit 1;

# Get the file data_reseg/train/text
steps/resegment_text.sh --cmd "$train_cmd" data/train data/lang \
  exp/tri3b_ali_train data_reseg/train exp/tri4b_resegment_train || exit 1;

for data in train eval2000; do
  utils/fix_data_dir.sh "data_reseg/${data}"
  utils/validate_data_dir.sh --no-feats --no-text "data_reseg/${data}"
  steps/make_mfcc.sh --compress true --nj 40 --cmd "$train_cmd" \
    "data_reseg/${data}" "exp/make_mfcc/${data}_reseg" "$mfccdir" || exit 1;
  steps/compute_cmvn_stats.sh \
    "data_reseg/${data}" "exp/make_mfcc/${data}_reseg" "$mfccdir" || exit 1;
  utils/fix_data_dir.sh "data_reseg/${data}" || exit 1;
done

# --- Stage 4: train and decode on the resegmented data ----------------------

# Note: we'll be comparing tri4b, which was trained on train_nodup, with
# tri4c_reseg, which was trained on *all* the resegmented data.  However, it's
# comparable because the actual hours of data is less in tri4c_reseg: 265h,
# versus 284 in the nodup data.
# cat data/train_nodup/segments | awk '{nf += $4 - $3; } END{print nf /3600;}'
# 284.433
# cat data_reseg/train/segments | awk '{nf += $4 - $3; } END{print nf /3600;}'
# 265.154

steps/align_fmllr.sh --nj 40 --cmd "$train_cmd" \
  data_reseg/train data/lang exp/tri3b exp/tri3b_ali_reseg || exit 1;

steps/train_sat.sh --cmd "$train_cmd" \
  11500 200000 data_reseg/train data/lang exp/tri3b_ali_reseg \
  exp/tri4c_reseg || exit 1;

# One background subshell per language model: build the graph, then decode.
for lm_suffix in tg fsh_tgpr; do
  (
    graph_dir=exp/tri4c_reseg/graph_sw1_${lm_suffix}
    # $train_cmd is left unquoted on purpose so that a command with options
    # splits into separate words.
    # shellcheck disable=SC2086
    $train_cmd "$graph_dir/mkgraph.log" \
      utils/mkgraph.sh "data/lang_sw1_${lm_suffix}" exp/tri4c_reseg \
      "$graph_dir" || exit 1;
    steps/decode_fmllr.sh --nj 30 --cmd "$decode_cmd" \
      --config conf/decode.config \
      "$graph_dir" data_reseg/eval2000 \
      "exp/tri4c_reseg/decode_eval2000_sw1_${lm_suffix}"
  ) &
done

# Let the background decodes finish before the script reports success.
# (A bare "wait" does not propagate their exit statuses.)
wait

exit 0;

# Below is experimental.  It is never executed because of the "exit 0" above.
# I'm figuring out whether we should keep the segments that the 1st pass
# designated as noise.

steps/resegment_data.sh --cmd "$train_cmd" \
  --segmentation-opts "--remove-noise-only-segments false" \
  data_reseg/eval2000_orig data/lang \
  exp/tri4b_seg/decode_eval2000_orig data_reseg/eval2000_with_noise \
  exp/tri4b_resegment_eval2000_with_noise || exit 1;

for data in eval2000_with_noise; do
  utils/fix_data_dir.sh "data_reseg/${data}"
  utils/validate_data_dir.sh --no-feats --no-text "data_reseg/${data}"
  steps/make_mfcc.sh --compress true --nj 40 --cmd "$train_cmd" \
    "data_reseg/${data}" "exp/make_mfcc/${data}_reseg" "$mfccdir" || exit 1;
  steps/compute_cmvn_stats.sh \
    "data_reseg/${data}" "exp/make_mfcc/${data}_reseg" "$mfccdir" || exit 1;
  utils/fix_data_dir.sh "data_reseg/${data}" || exit 1;
done

for lm_suffix in tg fsh_tgpr; do
  (
    graph_dir=exp/tri4c_reseg/graph_sw1_${lm_suffix}
    # shellcheck disable=SC2086
    $train_cmd "$graph_dir/mkgraph.log" \
      utils/mkgraph.sh "data/lang_sw1_${lm_suffix}" exp/tri4c_reseg \
      "$graph_dir" || exit 1;
    steps/decode_fmllr.sh --nj 30 --cmd "$decode_cmd" \
      --config conf/decode.config \
      "$graph_dir" data_reseg/eval2000_with_noise \
      "exp/tri4c_reseg/decode_eval2000_with_noise_sw1_${lm_suffix}"
  ) &
done
wait