#!/bin/bash -u

# Copyright 2012 Navdeep Jaitly
# Copyright 2010-2011 Microsoft Corporation

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# To be run from one directory above this script.

# The input is the LDC distribution of the TIMIT corpus.
# The script's argument is the (absolute) path of a directory which contains
# either TIMIT/TRAIN and TIMIT/TEST or timit/train and timit/test.

# Note: when creating your own data preparation scripts, it's a good idea
# to make sure that the speaker id (if present) is a prefix of the utterance
# id, that the output scp file is sorted on utterance id, and that the
# transcription file is exactly the same length as the scp file and is also
# sorted on utterance id (missing transcriptions should be removed from the
# scp file using e.g.
# scripts/filter_scp.pl)

# Fail on any use of an unset variable, also when the script is started as
# "bash timit_data_prep.sh" (in which case shebang options are not applied).
set -u

# Exactly one argument is expected: the root of the TIMIT distribution.
if [[ $# -ne 1 ]]; then
  echo "Usage: ../../local/timit_data_prep.sh /path/to/TIMIT"
  exit 1;
fi

TIMIT_ROOT=$1
S3_ROOT=$(pwd)

# Every output file of this script is written to data/local.  Stop right away
# if that directory cannot be entered, so nothing is written elsewhere.
mkdir -p data/local
cd data/local || exit 1

# Detect whether the corpus uses upper-case or lower-case directory names.
# NOTE: this test runs after the cd above, so a relative TIMIT_ROOT is
# resolved against data/local; pass an absolute path.
lower_case=0
upper_case=0
if [[ -d "$TIMIT_ROOT/TIMIT/TRAIN" && -d "$TIMIT_ROOT/TIMIT/TEST" ]]; then
  upper_case=1
  train_folder=$TIMIT_ROOT/TIMIT/TRAIN
  test_folder=$TIMIT_ROOT/TIMIT/TEST
  spkr_info_file=$TIMIT_ROOT/TIMIT/DOC/SPKRINFO.TXT
elif [[ -d "$TIMIT_ROOT/timit/train" && -d "$TIMIT_ROOT/timit/test" ]]; then
  lower_case=1
  train_folder=$TIMIT_ROOT/timit/train
  test_folder=$TIMIT_ROOT/timit/test
  spkr_info_file=$TIMIT_ROOT/timit/doc/spkrinfo.txt
else
  echo "Error: run.sh requires a directory argument (an absolute pathname) that contains TIMIT/TRAIN and TIMIT/TEST or timit/train and timit/test."
  exit 1;
fi

# List all training waveforms, skipping every path that matches
# sa<digit>.wav (case-insensitive).
find "$train_folder" -iname "*.wav" \
  | perl -ane 'if (! m/sa[0-9].wav/i){ print $_ ; }' > train_sph.flist

# make_trans.pl also creates the utterance id's and the kaldi-format scp file.
"$S3_ROOT/local/make_trans.pl" trn train_sph.flist train_trans.txt train_sph.scp || exit 1;

# Sort both files in place on the utterance id; "sort -o" may name one of its
# own inputs, so no scratch file is needed.
sort -k 1 -o train_trans.txt train_trans.txt || exit 1
sort -k 1 -o train_sph.scp train_sph.scp || exit 1

# sph2pipe is looked up three directories above the directory the script was
# started from.
sph2pipe=$(cd "$S3_ROOT" && cd ../../.. && echo "$PWD/tools/sph2pipe_v2.5/sph2pipe")
if [[ ! -f "$sph2pipe" ]]; then
  echo "Could not find the sph2pipe program at $sph2pipe";
  exit 1;
fi

# Turn "<utt-id> <sphere-file>" into "<utt-id> <sph2pipe> -f wav <sphere-file> |".
# The program path is passed with -v rather than spliced into the awk source,
# so unusual characters in it cannot corrupt the awk program.
awk -v sph2pipe="$sph2pipe" '{printf("%s %s -f wav %s |\n", $1, sph2pipe, $2);}' \
  < train_sph.scp > train_wav.scp

# The speaker id is cut out of the utterance id by the regular expression.
perl -ane 'm/^(\w+_(\w+)\w_\w+) / || die; print "$1 $2\n"' \
  < train_wav.scp > train.utt2spk
sort -k 2 train.utt2spk | "$S3_ROOT/scripts/utt2spk_to_spk2utt.pl" > train.spk2utt

echo "Creating coretest set."
test_speakers="mdab0 mwbt0 felc0 mtas1 mwew0 fpas0 mjmp0 mlnt0 fpkt0 mlll0 mtls0 fjlm0 mbpm0 mklt0 fnlp0 mcmj0 mjdh0 fmgd0 mgrt0 mnjm0 fdhc0 mjln0 mpam0 fmld0" dev_speakers="faks0 fdac1 fjem0 mgwt0 mjar0 mmdb1 mmdm2 mpdf0 fcmh0 fkms0 mbdg0 mbwm0 mcsh0 fadg0" dev_speakers="${dev_speakers} fdms0 fedw0 mgjf0 mglb0 mrtk0 mtaa0 mtdt0 mthc0 mwjg0 fnmr0 frew0 fsem0 mbns0 mmjr0 mdls0 mdlf0" dev_speakers="${dev_speakers} mdvc0 mers0 fmah0 fdrw0 mrcs0 mrjm4 fcal1 mmwh0 fjsj0 majc0 mjsw0 mreb0 fgjd0 fjmg0 mroa0 mteb0 mjfc0 mrjr0 fmml0 mrws1" if [ $upper_case == 1 ] ; then test_speakers=`echo $test_speakers | tr '[:lower:]' '[:upper:]'` dev_speakers=`echo $dev_speakers | tr '[:lower:]' '[:upper:]'` fi rm -f test_sph.flist for speaker in $test_speakers ; do echo -n $speaker " " ( find $test_folder/*/${speaker} -iname "*.wav" | perl -ane 'if (! m/sa[0-9].wav/i){ print $_ ; }' ) >> test_sph.flist done echo "" num_lines=`wc -l test_sph.flist | awk '{print $1}'` echo "# of utterances in coretest set = ${num_lines}" echo "Creating dev set." rm -f dev_sph.flist for speaker in $dev_speakers ; do echo -n $speaker " " ( find $test_folder/*/${speaker} -iname "*.wav" | perl -ane 'if (! m/sa[0-9].wav/i){ print $_ ; }' ) >> dev_sph.flist done echo "" num_lines=`wc -l dev_sph.flist | awk '{print $1}'` echo "# of utterances in dev set = ${num_lines}" # make_trans.pl also creates the utterance id's and the kaldi-format scp file. 
# For the coretest ("test") and development ("dev") sets, build the same
# files that exist for the training set: sorted transcripts, the scp files,
# and the utt2spk / spk2utt maps.
for test in test dev ; do
  echo "Finalizing ${test}"
  "$S3_ROOT/local/make_trans.pl" "${test}" "${test}_sph.flist" "${test}_trans.txt" "${test}_sph.scp" || exit 1;

  # Sort in place on the utterance id; "sort -o" may name one of its own
  # inputs, so no scratch file is needed.
  sort -k 1 -o "${test}_trans.txt" "${test}_trans.txt" || exit 1
  sort -k 1 -o "${test}_sph.scp" "${test}_sph.scp" || exit 1

  # The sph2pipe path is passed with -v rather than spliced into the awk
  # source, so unusual characters in it cannot corrupt the awk program.
  awk -v sph2pipe="$sph2pipe" '{printf("%s %s -f wav %s |\n", $1, sph2pipe, $2);}' \
    < "${test}_sph.scp" > "${test}_wav.scp"

  # The speaker id is cut out of the utterance id by the regular expression.
  perl -ane 'm/^(\w+_(\w+)\w_\w+) / || die; print "$1 $2\n"' \
    < "${test}_wav.scp" > "${test}.utt2spk"
  sort -k 2 "${test}.utt2spk" | "$S3_ROOT/scripts/utt2spk_to_spk2utt.pl" > "${test}.spk2utt"
done

# Need to set these on the basis of file name first characters.
# The exit status of the pipeline below is that of its last stage only, so a
# missing speaker-info file has to be caught explicitly; otherwise an empty
# map would be written and success reported.
if [[ ! -r "$spkr_info_file" ]]; then
  echo "Could not read the speaker info file $spkr_info_file" >&2
  exit 1
fi

# Lower-case the file, drop every line containing ';', and print
# "<col2><col1> <col2>" per line (presumably col1 is the speaker id and col2
# the sex -- confirm against the corpus documentation).
perl -ane 'tr/A-Z/a-z/;print;' < "$spkr_info_file" | grep -v ';' \
  | awk '{print $2$1, $2}' | sort | uniq > spk2gender.map || exit 1;

echo timit_data_prep succeeded.