#!/bin/bash
# Creating a UEM decoding setup with CMU segmentation from Florian (Feb 15, 2013).
#
# Converts a CMU segmentation database file into a Kaldi data directory
# (segments, utt2spk, spk2utt, wav.scp, text, reco2file_and_channel).
#
# Positional arguments:
#   $1  CMU segmentation database file
#   $2  audio path; recordings are looked up as $2/audio/<file>.sph or .wav
#   $3  output data directory (created if missing)
#
# Options (set through parse_options.sh):
#   --dummy-text <true|false>  when no --text is given, write a placeholder
#                              transcript for every segment (default: true)
#   --text <file>              existing text file to copy into the data dir
#   --filelist <file>          fixed-string patterns (grep -F -f); only
#                              segments matching one of them are kept

# Kept in the body rather than on the shebang line so that it also applies
# when the script is started as "bash <script>".
set -e

dummy_text=true
text=
filelist=
#end of configuration

[ -f ./path.sh ] && . ./path.sh
[ -f ./cmd.sh ] && . ./cmd.sh
. parse_options.sh || exit 1

if [ $# -ne 3 ]; then
  echo "$0: Converts the CMU segmentation database file into a kaldi data directory for UEM decoding"
  echo ""
  echo "cmu_ume2kaldi_dir.sh <cmu-utt-database> <audio-path> <output-data-dir>"
  echo "example: cmu_ume2kaldi_dir.sh db-tag-eval-utt.dat /export/babel/data/106-tagalog/audio data/eval.uem"
  echo "Was called with: $*"
  exit 1
fi

database=$1
audiopath=$2
datadir=$3

echo "$0" "$@"
mkdir -p "$datadir"

# 1. Create the segments file:
if [ ! -f "$database" ]; then
  echo "Database file $1 does not exist!"
  exit 1
fi

echo "Converting $(basename "$database") to kaldi directory $datadir "
# First perl pass: reduce every database line to "<file-id> <from> <to>".
# Second perl pass: derive the utterance ID from the file ID (drop the first
# three "_"-separated fields, map inLine/scripted to channel A and outLine to
# channel B) and append 100 * <to> as a zero-padded suffix.
perl -pe 's:.+(BABEL):BABEL:; s:\}\s+\{FROM\s+: :; s:\}\s+\{TO\s+: :; s:\}.+::;' < "$database" \
  | perl -ne '@K = split;
              $utteranceID = $K[0];
              $utteranceID =~ s:[^_]+_[^_]+_[^_]+_::;
              $utteranceID =~ s:([^_]+)_(.+)_(inLine|scripted):${1}_A_${2}:;
              $utteranceID =~ s:([^_]+)_(.+)_outLine:${1}_B_${2}:;
              $utteranceID .= sprintf ("_%06i", (100*$K[2]));
              printf("%s %s %.2f %.2f\n", $utteranceID, $K[0], $K[1], $K[2]);' \
  | sort > "$datadir/segments"

if [ -n "$filelist" ]; then
  # Keep the unfiltered list for reference and retain only matching segments.
  mv "$datadir/segments" "$datadir/segments.full"
  grep -F -f "$filelist" "$datadir/segments.full" > "$datadir/segments"
  # Number of distinct recordings that were dropped by the filter.
  l=$(grep -v -F -f "$filelist" "$datadir/segments.full" | cut -f 2 -d ' ' | sort -u | wc -l)
  echo "Because of using filelist, $l files omitted"
fi

# 2. Create the utt2spk file:
echo "Creating the $datadir/utt2spk file"
# The speaker ID is the utterance ID up to and including the _A/_B channel tag.
cut -f1 -d' ' "$datadir/segments" \
  | perl -ne 'chomp; m:([^_]+_[AB]).*:; print "$_ $1\n";' \
  | sort > "$datadir/utt2spk"

# 3. Create the spk2utt file:
echo "Creating the $datadir/spk2utt file"
perl -ne '{chomp; @K=split; $utt{$K[1]}.=" $K[0]";}
          END{foreach $spk (sort keys %utt) {
            printf("%s%s\n", $spk, $utt{$spk});
          }
         }' < "$datadir/utt2spk" \
  | sort > "$datadir/spk2utt"

# 4. Create the wav.scp file:
# The lookups are tested directly in the "if": under "set -e" a failing
# assignment on a line of its own would abort the script before the error
# message could be printed.
if ! sph2pipe=$(command -v sph2pipe || command -v "$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe"); then
  echo "Could not find sph2pipe binary. Add it to PATH"
  exit 1
fi
if ! sox=$(command -v sox); then
  echo "Could not find sox binary. Add it to PATH"
  exit 1
fi

echo "Creating the $datadir/wav.scp file"
# pipefail is confined to this subshell so that the "exit 1" raised inside
# the loop (itself a pipeline stage) makes the whole pipeline fail.
(
  set -o pipefail
  while read -r _ file _; do
    if [ -f "$audiopath/audio/$file.sph" ]; then
      echo "$file $sph2pipe -f wav -p -c 1 $audiopath/audio/$file.sph |"
    elif [ -f "$audiopath/audio/$file.wav" ]; then
      echo "$file $sox $audiopath/audio/$file.wav -r 8000 -c 1 -b 16 -t wav - downsample |"
    else
      echo "Audio file $audiopath/audio/$file.sph does not exist!" >&2
      exit 1
    fi
  done < "$datadir/segments" | sort -u > "$datadir/wav.scp"
) || {
  echo "Error producing the wav.scp file"
  exit 1
}

l1=$(wc -l < "$datadir/wav.scp")
echo "wav.scp contains $l1 files"
if [ -n "$filelist" ]; then
  l2=$(wc -l < "$filelist")
  echo "filelist $(basename "$filelist") contains $l2 files"
  if [ "$l1" -ne "$l2" ]; then
    echo "WARNING: Not all files from the specified fileset made their way into wav.scp"
  fi
fi

# 5. Create the text file:
echo "Creating the $datadir/text file"
if [ -n "$text" ]; then
  # Explicit if/else: "cp ... || echo ... && exit 1" groups as
  # "(cp || echo) && exit 1" and therefore exits even after a successful copy.
  if ! cp "$text" "$datadir/text"; then
    echo "Could not copy the source text file \"$text\" "
    exit 1
  fi
elif $dummy_text; then
  # $dummy_text holds the command name "true" or "false".
  cut -f1 -d' ' "$datadir/segments" \
    | sed -e 's/$/ IGNORE_TIME_SEGMENT_IN_SCORING/' \
    | sort > "$datadir/text"
fi

# 6. reco2file_and_channel
echo "Creating the $datadir/reco2file_and_channel file"
# Every recording maps to itself on channel "A".
cut -f 1 -d ' ' "$datadir/wav.scp" \
  | while IFS= read -r f; do
      echo "$f $f A"
    done > "$datadir/reco2file_and_channel"

echo "Everything done"