#!/bin/bash # E.g. of usage: ./convert_htk.sh ../.. /mnt/matylda5/jhu09/setup/CH1/English/exp/xwrd.R0_800_TB500/hmm84/MMF /mnt/matylda5/jhu09/setup/CH1/English/exp/xwrd.R0_800_TB500/hmm10_800_500/cluster.trees convert_dir # This script takes as input the mmf and trees file and creates kaldi.tree and kaldi.mdl # as output. make_linear=no cleanup=yes for foo in 1 2 3; do if [ "$1" == "--linear-topology" ]; then make_linear=yes; shift; fi if [ "$1" == "--no-cleanup" ]; then cleanup=no; shift; fi done rootdir=$1 # e.g. /homes/eva/q/qpovey/UBM-ASR/branches/clean/ mmf=$2 #e.g. /mnt/matylda5/jhu09/setup/CH1/English/exp/xwrd.R0_800_TB500/hmm84/MMF trees=$3 # e.g. /mnt/matylda5/jhu09/setup/CH1/English/exp/xwrd.R0_800_TB500/hmm10_800_500/cluster.trees outdir=$4 # place to put tmp stuff + kaldi models. # Convert relative to absolute pathnames in arguments. D=`dirname "$rootdir"` B=`basename "$rootdir"` rootdir="`cd \"$D\" 2>/dev/null && pwd || echo \"$D\"`/$B" D=`dirname "$trees"` B=`basename "$trees"` trees="`cd \"$D\" 2>/dev/null && pwd || echo \"$D\"`/$B" D=`dirname "$mmf"` B=`basename "$mmf"` mmf="`cd \"$D\" 2>/dev/null && pwd || echo \"$D\"`/$B" export PATH=$PATH:$rootdir/misc/htk_conversion/:$rootdir/src/gmmbin/ mkdir -p $outdir cd $outdir mmf2trans.pl < $mmf > trans1.txt if [ $make_linear == "yes" ]; then make_linear.pl < trans1.txt > trans2.txt else cp trans1.txt trans2.txt fi grep TRANSITION trans2.txt | awk '{print $2}' | sort > phonelist.txt # Work out phones that appear in questions but not data (will remove them.) grep QS $trees | \ perl -ane ' foreach $a ( split(",", $_) ) { $a =~ m:\"(.+)\-\*: && print "$1\n"; $a =~ m:\"\*\+(.+)\": && print "$1\n"; } ' \ | sort | uniq > phonelist_tree.txt if diff phonelist.txt phonelist_tree.txt | grep -v a | grep -v '>'; then echo "Phones seen in tree but not model: error." exit 1; fi diff phonelist.txt phonelist_tree.txt | grep '>' | awk '{print $2}' > bad_phones.txt # bad_phones.txt is phones that appear in the tree file but not the model file. # Will just delete them from the tree # Get initial tree. parse_trees.pl $trees > trees1.txt # Add stump "trees" for the context-independent phones. This assumes they are macros-- otherwise, we have a problem. get_ci_phone_trees.pl $mmf | cat trees1.txt - > trees2.txt # Remove "bad" phones (unseen phones) from the trees. perl -e 'foreach $p (@ARGV) { $is_bad{$p} = 1; }; while() { foreach $a(split(" ",$_)) { if(!$is_bad{$a}) { print "$a "; } } print "\n"; } ' `cat bad_phones.txt` < trees2.txt > trees3.txt # Add eps to every question that has "sil" in it (should produce better end-of-file behaviour.) # have to treat the first symbol on each line specially (would be the phone whose tree it is). cat trees3.txt | awk '{for(n=1;n<=NF;n++) { if(n>1 && $n == "sil") { printf("sil "); } else { printf("%s ", $n); } } printf("\n"); }' > trees4.txt # Extract the GMMs for each state. get_hmm_states.pl $mmf > states1.txt cat states1.txt | grep STATE | awk '{print $2}' | sort > statelist.txt # Make symbol-tables for the phones and states. echo '' | cat - phonelist.txt | awk '{ printf("%s %d\n", $1, NR-1); }' > phones.txt cat statelist.txt | awk '{ printf("%s %d\n", $1, NR-1); }' > statesyms.txt integerize.pl phones.txt < trans2.txt > trans3.txt # work out map from phone-id to number of emitting states for that phone. grep TRANSITION trans3.txt | awk '{print $2 " " $3-2; }' > phone2len.txt trans2topo.pl < trans3.txt > kaldi.topo integerize.pl statesyms.txt < states1.txt > states2.txt integerize.pl phones.txt statesyms.txt < trees4.txt > trees5.txt # Sort the integers in the questions (the things between square brackets), # the Kaldi code requires this. cat trees5.txt | perl -ane ' chop; @A = (split("\\[",$_)); print shift @A; foreach $x (@A) { ($y,$z)=split("\\]",$x); $y = join(" ", sort{$a <=> $b}(split(" ", $y))); print " [ $y ] $z "; } print "\n"; ' > trees6.txt tree_convert.pl phone2len.txt trees6.txt > kaldi.tree dim=`grep -w MEAN $mmf | head -1 | awk '{print $2}'` # probably 39. convert_states.pl $dim < states2.txt > kaldi.am_gmm gmm-init-trans --binary=false kaldi.topo kaldi.am_gmm kaldi.tree kaldi.mdl || exit 1; # clean up: if [ $cleanup == "yes" ]; then mv phones.txt tmpa; mv statesyms.txt tmpb; rm *.txt kaldi.am_gmm mv tmpa phones.txt mv tmpb statesyms.txt fi # now, kaldi.tree and kaldi.mdl are a Kaldi-format model; kaldi.topo may # also be useful for some purposes.