#!/bin/bash

# Copyright 2010-2011 Microsoft Corporation
# Copyright 2012 Vassil Panayotov

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED ON AN *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS
# OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY
# IMPLIED WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# To be run from one directory above this script.

# Note: when creating your own data preparation scripts, it's a good idea
# to make sure that the speaker id (if present) is a prefix of the utterance
# id, that the output scp file is sorted on utterance id, and that the
# transcription file is exactly the same length as the scp file and is also
# sorted on utterance id (missing transcriptions should be removed from the
# scp file using e.g. scripts/filter_scp.pl).

if [ $# != 1 ]; then
  echo "Usage: ../../local/RM_data_prep.sh /path/to/RM"
  exit 1;
fi

export LC_ALL=C

RMROOT=$1

mkdir -p data/local
cd data/local || exit 1

if [ ! -f $RMROOT/RM_G.fst -o ! -d $RMROOT/rm1 ]; then
  echo "Required data is missing. You can download the data by running ./getdata.sh"
  exit 1;
fi

# Make lists of the training and test feature files.
cat $RMROOT/rm1/etc/rm1_train.fileids | \
  xargs -I_x_ echo $RMROOT/rm1/feat/_x_.mfc > train.flist

cat $RMROOT/rm1/etc/rm1_test.fileids | \
  xargs -I_x_ echo $RMROOT/rm1/feat/_x_.mfc > test.flist

# make_trans.pl also creates the utterance ids and the kaldi-format scp file.
# This is needed because the original "al_sents.snr" file is not available
# (and because CMU's train utterances have tags like '<s>' added).
cat $RMROOT/rm1/etc/rm1_train.transcription |\
  tr '[a-z]' '[A-Z]' |\
  sed -E -e 's:</?S>: :g' -e 's:\([0-9]\): :g' -e 's: +: :g' -e 's:^ +::' |\
  cat $RMROOT/rm1/etc/rm1_test.transcription - \
  > al_sents.snr

# training set
../../local/make_trans.pl trn train.flist al_sents.snr train_trans.txt train.scp
mv train_trans.txt tmp; sort -k 1 tmp > train_trans.txt
mv train.scp tmp; sort -k 1 tmp > train.scp
rm tmp

# test set
../../local/make_trans.pl test test.flist al_sents.snr test_trans.txt test.scp
mv test_trans.txt tmp; sort -k 1 tmp > test_trans.txt
mv test.scp tmp; sort -k 1 tmp > test.scp
rm tmp

# We already have the features, so the sph2pipe step is skipped, and given the
# limited data, speaker-dependent processing is not used either.

# "wp_gram.txt" is no longer available from LDC's website, so we just use a
# pre-built grammar WFST (G.fst). The word-pair grammar is a finite-state
# description of the allowed utterances, which simply enumerates the words that
# can follow each word in the vocabulary. G.fst is constructed by adding
# outgoing arcs to each node representing a word, one for each word that is
# allowed to follow it, with the probability mass distributed uniformly among
# these arcs.
#../../scripts/make_rm_lm.pl $RMROOT/LDC93S3B/disc_1/doc/wp_gram.txt > G.txt || exit 1;
cp $RMROOT/RM_G.fst ./G.fst

# Convert CMU's lexicon to the form which the other scripts expect
# (keep only the first pronunciation variant and convert the phones to lower case).
cat $RMROOT/rm1/etc/rm1.dic | \
  egrep -v '\(' | \
  sed -e "s/^\([[:alnum:]-]\+\('[[:alpha:]]\+\)\?\)\(.*\)/\1\L\3/g" > lexicon.txt

echo RM_data_prep succeeded.
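
# Illustration only (comments, nothing below is executed): a sketch of how a
# word-pair grammar entry maps onto G.fst arcs, assuming the grammar simply
# lists, for each vocabulary word, the words allowed to follow it. The words
# and state numbers here are hypothetical and chosen just for the example.
#
#   Hypothetical word-pair entry:   Corresponding G.fst arcs, OpenFst text
#     SHOW -> ME, ALL               format "src dest ilabel olabel weight":
#                                     1 2 ME  ME  0.6931   # -log(1/2)
#                                     1 3 ALL ALL 0.6931   # -log(1/2)
#
# That is, the state reached after "SHOW" gets one outgoing arc per allowed
# successor word, each with weight -log(1/N) where N is the number of allowed
# successors (2 here) -- the uniform distribution described in the comment
# above the G.fst copy step.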