#!/bin/bash

# Copyright 2014  Vimal Manohar, Johns Hopkins University (Author: Jan Trmal)
# Apache 2.0

# Re-segments a data directory using a phone decoder.
#
# Steps performed:
#   stage 0: decode <data-dir> with steps/decode_nolats.sh using
#            <model-dir>/phone_graph, writing alignments to
#            <model-dir>/decode_<basename of data-dir>.
#   stage 1: turn the alignments into per-frame phone symbol sequences and
#            write one <temp-dir>/pred/<key>.pred file per input line.
#   then:    build <temp-dir>/phone_map.txt (phone -> 0 silence / 1 noise /
#            2 speech), run local/resegment/segmentation.py to create
#            <output-dir>/segments, and populate <output-dir> with wav.scp,
#            reco2file_and_channel, utt2spk and spk2utt.

set -o pipefail
set -e

# Defaults; each may be overridden on the command line via parse_options.sh.
nj=8
cmd=run.pl
stage=0
segmentation_opts="--isolated-resegmentation --min-inter-utt-silence-length 1.0 --silence-proportion 0.05"
decoder_extra_opts=""
reference_rttm=   # accepted as an option; not referenced by the code below
get_text=false    # Get text corresponding to new segments in ${output_dir}.
                  # Assuming text is in $data/$type directory.
                  # Does not work very well because the data does not get
                  # aligned to many training transcriptions.
                  # (accepted as an option; not referenced by the code below)
noise_oov=false   # If true, an <oov> entry in silence.txt is labelled noise
                  # instead of speech.
beam=7.0
max_active=1000

# Debugging aid: log the exact invocation.
echo "$0" "$@"

[ -f ./path.sh ] && . ./path.sh
. parse_options.sh || exit 1

# Enabled only after the sourced helpers have run, as in the original ordering.
set -u

if [[ $# -ne 5 ]]; then
  echo "Usage: $0 [options] <data-dir> <lang-dir> <model-dir> <temp-dir> <output-dir>"
  echo " Options:"
  echo "    --cmd (run.pl|queue.pl...)      # specify how to run the sub-processes."
  echo "    --nj <numjobs>    # Number of parallel jobs. "
  echo "                      For the standard data directories of dev10h, dev2h and eval"
  echo "                      this is taken from the lang.conf file"
  echo "    --stage <n>       # Skip steps before stage n (0: decode, 1: per-frame predictions)"
  echo "    --beam <float>    # Decoding beam passed to steps/decode_nolats.sh"
  echo "    --max-active <n>  # max-active passed to steps/decode_nolats.sh"
  echo "    --decoder-extra-opts '--opt1 opt1val' # extra options for steps/decode_nolats.sh"
  echo "    --segmentation-opts '--opt1 opt1val --opt2 opt2val' # options for segmentation.py"
  echo "    --noise-oov (true|false)  # Label <oov> in the silence list as noise instead of speech"
  echo "    --reference-rttm <rttm-file>  # Reference RTTM file that will be used for analysis of the segmentation"
  echo "    --get-text (true|false)  # Convert text from base data directory to correspond to the new segments"
  echo
  echo "e.g.:"
  echo "$0 data/dev10h data/lang exp/tri4b_seg exp/tri4b_resegment_dev10h data/dev10h.seg"
  exit 1
fi

datadir=$1     # The base data directory that contains at least the files wav.scp and reco2file_and_channel
lang=$2
model_dir=$3   # Segmentation model directory created using local/resegment/run_segmentation_train.sh
temp_dir=$4    # Temporary directory to store some intermediate files during segmentation
output_dir=$5  # The target directory

###############################################################################
#
# Phone Decoder
#
###############################################################################

mkdir -p "$temp_dir"

dirid=$(basename "$datadir")
decode_dir="$model_dir/decode_${dirid}"

total_time=0
t1=$(date +%s)

if [[ "$stage" -le 0 ]]; then
  # $decoder_extra_opts and $cmd are deliberately unquoted: each may hold
  # several whitespace-separated words that must become separate arguments.
  steps/decode_nolats.sh $decoder_extra_opts --write-words false --write-alignments true \
    --cmd "$cmd" --nj "$nj" --beam "$beam" --max-active "$max_active" \
    "$model_dir/phone_graph" "$datadir" "$decode_dir" || exit 1
fi

if [[ "$stage" -le 1 ]]; then
  if [[ ! -f "$decode_dir/ali.1.gz" ]]; then
    echo "File $decode_dir/ali.1.gz does not exist!"
    exit 1
  fi

  # The escaped pipes and the quoted '>' are handed to $cmd as literal
  # arguments, so the pipeline runs inside each parallel job rather than here.
  $cmd JOB=1:"$nj" "$decode_dir/log/predict.JOB.log" \
    gunzip -c "$decode_dir/ali.JOB.gz" \| \
    ali-to-phones --per-frame=true "$model_dir/final.mdl" ark:- ark,t:- \| \
    utils/int2sym.pl -f 2- "$lang/phones.txt" \| \
    gzip -c '>' "$temp_dir/pred.JOB.gz" || exit 1

  mkdir -p "$temp_dir/pred"

  # Split the combined predictions into one "<key>.pred" file per line, where
  # <key> is the first space-separated field. The target directory is passed
  # through the environment so that special characters in $temp_dir cannot
  # alter the Perl source.
  gunzip -c "$temp_dir"/pred.*.gz \
    | PRED_DIR="$temp_dir/pred" perl -ne '
        ($file, $phones) = split / /, $_, 2;
        open($fh, ">", "$ENV{PRED_DIR}/$file.pred") or die $!;
        print {$fh} "$file $phones";
        close($fh);' || exit 1
fi

t2=$(date +%s)
total_time=$((total_time + t2 - t1))
echo "SI decoding done in $((t2-t1)) seconds"

###############################################################################
#
# Resegmenter
#
###############################################################################

# Restart the stopwatch so the resegmentation figure below does not also
# include the decoding time already reported above.
t1=$(date +%s)

optsil_file="$lang/phones/optional_silence.txt"
if [[ "$(wc -w < "$optsil_file")" -ne 1 ]]; then
  echo "Error: this script only works if $lang/phones/optional_silence.txt contains exactly one entry."
  echo "You'd have to modify the script to handle other cases."
  exit 1
fi

# silphone will typically be "sil" or "SIL".
silphone=$(awk 'NF {print $1; exit}' "$optsil_file")

# 3 sets of phones: 0 is silence, 1 is noise, 2 is speech.
#   * the optional silence phone                     -> 0
#   * other silence.txt entries                      -> 1, except that entries
#     containing "SIL" -> 0 and, unless --noise-oov true, <oov> -> 2
#   * nonsilence.txt entries                         -> 2, except that
#     <...>-bracketed entries -> 1, with <oov> set back to 2
# NOTE(review): the literal "<oov>" in the sed expressions is assumed to be the
# OOV symbol of this lang directory -- confirm. A catch-all pattern in its
# place would cancel the noise labelling entirely and would also rewrite phone
# names that contain the digit 1.
{
  echo "$silphone 0"
  # grep exits 1 when silence.txt holds nothing but the optional silence
  # phone; that is not an error, so only exit codes above 1 are propagated.
  if [[ "$noise_oov" != "true" ]]; then
    { grep -v -w -- "$silphone" "$lang/phones/silence.txt" || [[ $? -eq 1 ]]; } \
      | awk '{print $1, 1;}' \
      | sed 's/SIL\(.*\)1/SIL\10/' \
      | sed 's/<oov>\(.*\)1/<oov>\12/'
  else
    { grep -v -w -- "$silphone" "$lang/phones/silence.txt" || [[ $? -eq 1 ]]; } \
      | awk '{print $1, 1;}' \
      | sed 's/SIL\(.*\)1/SIL\10/'
  fi
  awk '{print $1, 2;}' "$lang/phones/nonsilence.txt" \
    | sed 's/\(<.*>.*\)2/\11/' \
    | sed 's/<oov>\(.*\)1/<oov>\12/'
} > "$temp_dir/phone_map.txt"

mkdir -p "$output_dir"
mkdir -p "$temp_dir/log"

# $segmentation_opts is deliberately unquoted (it holds several options).
local/resegment/segmentation.py --verbose 2 $segmentation_opts \
  "$temp_dir/pred" "$temp_dir/phone_map.txt" 2> "$temp_dir/log/resegment.log" \
  | sort > "$output_dir/segments" || exit 1

if [[ ! -s "$output_dir/segments" ]]; then
  echo "Zero segments created during segmentation process."
  echo "That means something failed. Try the cause and re-run!"
  exit 1
fi

t2=$(date +%s)
total_time=$((total_time + t2 - t1))
echo "Resegment data done in $((t2-t1)) seconds"

for file in reco2file_and_channel wav.scp; do
  if [[ ! -f "$datadir/$file" ]]; then
    echo "Expected file $datadir/$file to exist"
    exit 1
  fi
  cp "$datadir/$file" "$output_dir/$file"
done

# We'll make the speaker-ids be the same as the recording-ids (e.g. conversation
# sides). This will normally be OK for telephone data.
awk '{print $1, $2}' "$output_dir/segments" > "$output_dir/utt2spk" || exit 1
utils/utt2spk_to_spk2utt.pl "$output_dir/utt2spk" > "$output_dir/spk2utt" || exit 1

# Columns 3 and 4 of the segments file are used as start and end times in
# seconds.
dur_hours=$(awk '{num_secs += $4 - $3;} END{print (num_secs/3600);}' "$output_dir/segments")
echo "Extracted segments of total length of $dur_hours hours audio"

echo ---------------------------------------------------------------------
echo "Resegment data Finished successfully on" "$(date)"
echo ---------------------------------------------------------------------

exit 0