#!/bin/bash

# Checks that a data directory is internally consistent.
#
# Usage: <this-script> [--no-feats] [--no-text] [--no-wav] data-dir
#
# Files examined inside data-dir:
#   utt2spk, spk2utt       required and non-empty
#   text                   required unless --no-text
#   wav.scp                required unless --no-wav
#   feats.scp              required unless --no-feats
#   segments, reco2file_and_channel, cmvn.scp, spk2gender, vad.scp
#                          optional; checked only when present
#
# Every file that is checked must be sorted (LC_ALL=C) and free of duplicates
# on its first field, and its first-field id list must agree with the id list
# it is compared against below.
#
# Exit status: 0 after printing "Successfully validated data-directory ...",
# 1 after printing the first problem found.
#
# NOTE: calls utils/spk2utt_to_utt2spk.pl by relative path, so it has to be
# run from a directory in which that path resolves.

no_feats=false
no_wav=false
no_text=false

# The three flags may appear in any order, before the data-dir argument.
while [ $# -gt 0 ]; do
  case "$1" in
    --no-feats) no_feats=true; shift ;;
    --no-text)  no_text=true;  shift ;;
    --no-wav)   no_wav=true;   shift ;;
    *) break ;;
  esac
done

if [ $# -ne 1 ]; then
  echo "Usage: $0 [--no-feats] [--no-text] [--no-wav] data-dir"
  echo "e.g.: $0 data/train"
  # Without this exit the script would carry on with an empty/wrong $1.
  exit 1
fi

data=$1

if [ ! -d "$data" ]; then
  echo "$0: no such directory $data"
  exit 1
fi

for f in spk2utt utt2spk; do
  if [ ! -f "$data/$f" ]; then
    echo "$0: no such file $f"
    exit 1
  fi
  if [ ! -s "$data/$f" ]; then
    echo "$0: empty file $f"
    exit 1
  fi
done

# Every utt2spk line must have exactly two fields.
if ! awk '{if (NF != 2) exit(1); }' < "$data/utt2spk"; then
  echo "$0: $data/utt2spk has wrong format."
  # Explicit status: a bare "exit" would return the status of the echo (0).
  exit 1
fi

tmpdir=$(mktemp -d) || { echo "$0: could not create temporary directory"; exit 1; }
# Clean up on every exit path. Signals are turned into a normal exit so that
# the script stops (instead of continuing with its temp dir removed) and the
# EXIT trap still runs.
trap 'rm -rf "$tmpdir"' EXIT
trap 'exit 1' HUP INT PIPE TERM

# Byte-wise collation so that "sorted" means the same thing on every system.
export LC_ALL=C

# Prints the first whitespace-separated field of every line of file $1.
first_field() {
  awk '{print $1}' < "$1"
}

# Exits with an error unless the first fields of file $1 are strictly
# increasing, i.e. sorted and without duplicates.
check_sorted_and_uniq() {
  local file=$1
  if ! first_field "$file" | sort -u | cmp -s - <(first_field "$file"); then
    echo "$0: file $file is not in sorted order or has duplicates"
    exit 1
  fi
}

# Prints the first and last six lines of "diff $1 $2" followed by the line
# counts of both files.
partial_diff() {
  local n1 n2
  diff "$1" "$2" | head -n 6
  echo "..."
  diff "$1" "$2" | tail -n 6
  n1=$(wc -l < "$1")
  n2=$(wc -l < "$2")
  echo "[Lengths are $1=$n1 versus $2=$n2]"
}

# Exits with an error, showing a partial diff, unless id-list files $1 and $2
# are identical. $3 describes the two lists for the error message.
require_same_ids() {
  local expected=$1 actual=$2 what=$3
  if ! cmp -s "$expected" "$actual"; then
    echo "$0: Error: in $data, $what"
    echo "$0: differ, partial diff is:"
    partial_diff "$expected" "$actual"
    exit 1
  fi
}

check_sorted_and_uniq "$data/utt2spk"

# utt2spk must also already be in order when sorted on the speaker field.
if ! sort -k2 < "$data/utt2spk" | cmp -s - "$data/utt2spk"; then
  echo "$0: utt2spk is not in sorted order when sorted first on speaker-id "
  echo "(fix this by making speaker-ids prefixes of utt-ids)"
  exit 1
fi

check_sorted_and_uniq "$data/spk2utt"

# spk2utt, converted back to utt2spk form, must reproduce utt2spk.
if ! cmp -s <(awk '{print $1, $2;}' < "$data/utt2spk") \
            <(utils/spk2utt_to_utt2spk.pl "$data/spk2utt"); then
  echo "$0: spk2utt and utt2spk do not seem to match"
  exit 1
fi

# Reference utterance-id list used by the checks below.
first_field "$data/utt2spk" > "$tmpdir/utts"
num_utts=$(wc -l < "$tmpdir/utts")

# ---- text -------------------------------------------------------------------
if [ ! -f "$data/text" ] && [ "$no_text" = false ]; then
  echo "$0: no such file $data/text (if this is by design, specify --no-text)"
  exit 1
fi

if [ -f "$data/text" ]; then
  check_sorted_and_uniq "$data/text"
  first_field "$data/text" > "$tmpdir/utts.txt"
  require_same_ids "$tmpdir/utts" "$tmpdir/utts.txt" \
    "utterance lists extracted from utt2spk and text"
fi

# ---- wav.scp / segments / reco2file_and_channel -------------------------------
if [ -f "$data/segments" ] && [ ! -f "$data/wav.scp" ]; then
  echo "$0: in directory $data, segments file exists but no wav.scp"
  exit 1
fi

if [ ! -f "$data/wav.scp" ] && [ "$no_wav" = false ]; then
  echo "$0: no such file $data/wav.scp (if this is by design, specify --no-wav)"
  exit 1
fi

if [ -f "$data/wav.scp" ]; then
  check_sorted_and_uniq "$data/wav.scp"

  if [ -f "$data/segments" ]; then
    # We have a segments file -> interpret wav file as "recording-ids" not
    # utterance-ids.
    check_sorted_and_uniq "$data/segments"

    # Each line needs four fields with field 4 greater than field 3.
    if ! awk '{if (NF != 4 || !($4 > $3)) { print "Bad line in segments file", $0; exit(1); }}' \
           < "$data/segments"; then
      echo "$0: badly formatted segments file"
      exit 1
    fi

    segments_len=$(wc -l < "$data/segments")
    # The utterance ids in segments must be the ones in utt2spk. (This used to
    # compare against text, which has already been verified above, so the
    # check could never fire; it also did not stop the script.) Kept
    # conditional on text being present, as before.
    if [ -f "$data/text" ]; then
      if ! cmp -s "$tmpdir/utts" <(first_field "$data/segments"); then
        echo "$0: Utterance list differs between $data/utt2spk and $data/segments "
        echo "$0: Lengths are $segments_len vs $num_utts"
        exit 1
      fi
    fi

    # Recording ids referenced by segments (field 2) must be exactly the keys
    # of wav.scp.
    awk '{print $2}' < "$data/segments" | sort -u > "$tmpdir/recordings"
    first_field "$data/wav.scp" > "$tmpdir/recordings.wav"
    require_same_ids "$tmpdir/recordings" "$tmpdir/recordings.wav" \
      "recording-ids extracted from segments and wav.scp"

    if [ -f "$data/reco2file_and_channel" ]; then
      # this file is needed only for ctm scoring; it's indexed by recording-id.
      check_sorted_and_uniq "$data/reco2file_and_channel"
      # Channel "1" is tolerated with a warning; anything else that is not
      # A or B is an error.
      if ! awk '{
             if (NF != 3 || ($3 != "A" && $3 != "B" )) {
               if ( NF == 3 && $3 == "1" ) {
                 warning_issued = 1;
               } else {
                 print "Bad line ", $0; exit 1;
               }
             }
           }
           END {
             if (warning_issued == 1) {
               print "The channel should be marked as A or B, not 1! You should change it ASAP! "
             }
           }' < "$data/reco2file_and_channel"; then
        echo "$0: badly formatted reco2file_and_channel file"
        exit 1
      fi
      first_field "$data/reco2file_and_channel" > "$tmpdir/recordings.r2fc"
      require_same_ids "$tmpdir/recordings" "$tmpdir/recordings.r2fc" \
        "recording-ids extracted from segments and reco2file_and_channel"
    fi
  else
    # No segments file -> assume wav.scp indexed by utterance.
    first_field "$data/wav.scp" > "$tmpdir/utts.wav"
    require_same_ids "$tmpdir/utts" "$tmpdir/utts.wav" \
      "utterance lists extracted from utt2spk and wav.scp"

    if [ -f "$data/reco2file_and_channel" ]; then
      # this file is needed only for ctm scoring; it's indexed by recording-id.
      check_sorted_and_uniq "$data/reco2file_and_channel"
      if ! awk '{if (NF != 3 || ($3 != "A" && $3 != "B")) { print "Bad line ", $0; exit 1; }}' \
             < "$data/reco2file_and_channel"; then
        echo "$0: badly formatted reco2file_and_channel file"
        exit 1
      fi
      first_field "$data/reco2file_and_channel" > "$tmpdir/utts.r2fc"
      # There is no segments file in this branch; the ids are compared with
      # the utt2spk list, and the message now says so.
      require_same_ids "$tmpdir/utts" "$tmpdir/utts.r2fc" \
        "utterance-ids extracted from utt2spk and reco2file_and_channel"
    fi
  fi
fi

# ---- feats.scp --------------------------------------------------------------
if [ ! -f "$data/feats.scp" ] && [ "$no_feats" = false ]; then
  echo "$0: no such file $data/feats.scp (if this is by design, specify --no-feats)"
  exit 1
fi

if [ -f "$data/feats.scp" ]; then
  check_sorted_and_uniq "$data/feats.scp"
  first_field "$data/feats.scp" > "$tmpdir/utts.feats"
  require_same_ids "$tmpdir/utts" "$tmpdir/utts.feats" \
    "utterance-ids extracted from utt2spk and features"
fi

# ---- cmvn.scp (keyed by speaker) --------------------------------------------
if [ -f "$data/cmvn.scp" ]; then
  check_sorted_and_uniq "$data/cmvn.scp"
  first_field "$data/cmvn.scp" > "$tmpdir/speakers.cmvn"
  first_field "$data/spk2utt" > "$tmpdir/speakers"
  require_same_ids "$tmpdir/speakers" "$tmpdir/speakers.cmvn" \
    "speaker lists extracted from spk2utt and cmvn"
fi

# ---- spk2gender -------------------------------------------------------------
if [ -f "$data/spk2gender" ]; then
  check_sorted_and_uniq "$data/spk2gender"
  # Two fields per line, the second being "m" or "f".
  if ! awk '{if (!((NF == 2 && ($2 == "m" || $2 == "f")))) exit 1; }' \
         < "$data/spk2gender"; then
    echo "Mal-formed spk2gender file"
    exit 1
  fi
  first_field "$data/spk2gender" > "$tmpdir/speakers.spk2gender"
  first_field "$data/spk2utt" > "$tmpdir/speakers"
  require_same_ids "$tmpdir/speakers" "$tmpdir/speakers.spk2gender" \
    "speaker lists extracted from spk2utt and spk2gender"
fi

# ---- vad.scp ----------------------------------------------------------------
if [ -f "$data/vad.scp" ]; then
  check_sorted_and_uniq "$data/vad.scp"
  if ! cmp -s <(first_field "$data/utt2spk") <(first_field "$data/vad.scp"); then
    echo "$0: error: in $data, vad.scp and utt2spk do not have identical utterance-id list"
    # A mismatch is a validation failure; do not fall through to "Successfully
    # validated".
    exit 1
  fi
fi

echo "Successfully validated data-directory $data"