#!/bin/bash # Copyright 2012 Navdeep Jaitly # Derived from swbd/s3/local/swbd_p1_train_lms.sh scripts. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, # MERCHANTABLITY OR NON-INFRINGEMENT. # See the Apache 2 License for the specific language governing permissions and # limitations under the License. # To be run from one directory above this script. # This script takes no arguments. It assumes you have already run # timit_data_prep.sh. # It takes as input the file # [argument 1]/train_trans.txt # and uses it to create the lexicon (just the phones) and the biphone language model. # Creates folder [argument 1]/lm if [ $# != 1 ]; then echo "Usage: ../../local/timit_train_lms.sh [data path]" echo "eg: ../../local/timit_train_lms.sh data/local" exit 1; fi dir=$1/lm trans_file=$1/train_trans.txt phones_file=$1/phones.txt lex_file=$1/lexicon.txt if [ ! -e $trans_file ]; then echo "Transcript file $trans_file not found. Did you run local/timit_data_prep.sh" exit 1; fi mkdir -p $dir export LC_ALL=C # You'll get errors about things being not sorted, if you # have a different locale. export PATH=$PATH:`pwd`/../../../tools/kaldi_lm ( # First make sure the kaldi_lm toolkit is installed. cd ../../../tools || exit 1; if [ -d kaldi_lm ]; then echo Not installing the kaldi_lm toolkit since it is already there. else echo Downloading and installing the kaldi_lm tools if [ ! -f kaldi_lm.tar.gz ]; then wget http://www.danielpovey.com/files/kaldi/kaldi_lm.tar.gz || exit 1; fi tar -xvzf kaldi_lm.tar.gz || exit 1; cd kaldi_lm make || exit 1; echo Done making the kaldi_lm tools fi ) || exit 1; mkdir -p $dir echo "Creating phones file, and monophone lexicon (mapping phones to itself)." cat $trans_file | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq | awk '{print tolower($1) ; }' > $phones_file cat $phones_file | awk '{print toupper($1) " " $1 ; }' > $lex_file cat $trans_file | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \ sort -nr > $dir/word.counts # Get counts from acoustic training transcripts, and add one-count # for each word in the lexicon. cat $trans_file | awk '{for(n=2;n<=NF;n++) print $n; }' | \ cat - <(cat $lex_file | awk '{print $1}') | \ sort | uniq -c | sort -nr > $dir/unigram.counts # note: we probably won't really make use of as there aren't any OOVs cat $dir/unigram.counts | awk '{print $2}' | local/get_word_map.pl "" "" > $dir/word_map # note: ignore 2nd field of train.txt, it's the utterance-id. cat $trans_file | awk -v wmap=$dir/word_map 'BEGIN{while((getline0)map[$1]=$2;} { for(n=2;n<=NF;n++) { printf map[$n]; if(n$dir/train.gz ! merge_ngrams &/dev/null && \ echo merge_ngrams not found in kaldi_lm. You need to have kaldi_lm on your path OR && \ echo You can do the following: && \ echo 1. Install the latest version from http://www.danielpovey.com/files/kaldi/kaldi_lm.tar.gz && \ echo 2. you delete kaldi_lm, and kaldi_lm.tar.gz in the tools folder. This script will automatically install it. && \ exit 1; echo "Creating biphone model" local/create_biphone_lm.sh $dir