#!/usr/bin/env perl use warnings; use strict; use utf8; use Data::Dumper; sub float_gt { my ($A, $B) = @_; #print Dumper(\@_); if ( ($A - $B) < 1e-12 ) { return 0; } elsif ($A > $B ) { return 1; } else { return 0; } } binmode(STDOUT, ":utf8"); binmode(STDERR, ":utf8"); my $datadir=$ARGV[0]; my $rttm_filename=$ARGV[1]; my $filename=""; my %rttm; my @times; open(rttm_f, "<:utf8", $rttm_filename) or die "Cannot open the RTTM file"; while ( ) { chop; my @elems = split; my $_filename= $elems[1]; my $_time=$elems[3]; my $_dur=$elems[4]; my $_text=$elems[5]; #We could simply pull-out the vector of times #from the hash, but in case the RTTM is not sorted #there might be some other problem somewhere #(as the RTTMs are normally sorted). So instead of being #"smart", let's make the user notice! if ( exists($rttm{$_filename}) ) { die "The RTTM file is not sorted!"; } if ( $filename ne $_filename ) { if ( $filename ne "" ) { #print $filename . "\n"; my @tmp = @times; $rttm{$filename} = \@tmp; #if ($filename eq "BABEL_BP_101_10470_20111118_172644_inLine" ) { # print "$filename\n"; # print Dumper($rttm{$filename}); #} #print Dumper($rttm{"BABEL_BP_101_10470_20111118_172644_inLine"}); } @times = (); $filename = $_filename; } #I don't really know what is the distinction between all #of these. Let's throw away the SPEAKER, as it does not #really contain information that is to be found in the transcript #and keep the others if ( $elems[0] eq "LEXEME") { push @times, [$_time, $_time + $_dur, $_text]; } elsif ( $elems[0] eq "NON-SPEECH" ) { push @times, [$_time, $_time + $_dur, $_text]; } elsif ( $elems[0] eq "NON-LEX" ) { push @times, [$_time, $_time + $_dur, $_text]; } elsif ( $elems[0] eq "SPEAKER") { ; } else { #This is just a safety precaution if a new flag/type appears. die "Unknown first element $elems[0] of line '" . join(" ", @elems) . "'\n"; } #We compare the two last entries of the #times vector, if they #are ordered properly. Again, this is just a safety recaution #In a well-formed RTTM, this is normal. if ( (@times > 1) && float_gt($times[-2][1], $times[-1][0]) ) { #print Dumper(\@times); my $A = $times[-2][0]; my $B = $times[-1][0]; my $Aend = $times[-2][1]; my $Bend = $times[-1][1]; #print "WARNING: Elements in the RTTM file are not sorted for FILENAME $filename!\n"; #print $times[-2][0] . " " . $times[-2][1] - $times[-2][0]. " " . $times[-2][2] . "\n"; #print $times[-1][0] . " " . $times[-1][1] - $times[-1][0]. " " . $times[-1][2] . "\n"; #print "\n"; my @sorted = sort {$a <=> $b} ($A, $B, $Aend, $Bend); #print Dumper(\@sorted); $times[-1][0] = $sorted[0]; $times[-1][1] = $sorted[2]; #We omit the gap between these two words $times[-2][0] = $sorted[2]; $times[-2][1] = $sorted[3]; } } if ( $filename ne "" ) { #print $filename . "\n"; $rttm{$filename} = \@times; } close(rttm_f); open(segments_f, "<:utf8", "$datadir/segments") or die "Cannot open file $datadir/segments"; while ( ) { chop; my ($segmentname, $filename, $start, $end) = split; if (! exists $rttm{$filename} ) { print "Filename $filename does not exists in the RTTM file\n"; die; } my @times = @{$rttm{$filename}}; my $i; my $j; #if ($segmentname ne "10470_A_20111118_172644_000000" ) { # next; #} #print $filename . "\n"; #print Dumper(\@times); $i = 0; #print $start . " " . $times[$i][0] . " " . $times[$i][1] . "\n"; while (($i < @times) && ( $times[$i][1] < $start ) ) { $i += 1; }; $j = $i; while (($j < @times) && ( $times[$j][0] < $end ) ) { $j += 1; }; print $segmentname . " "; while ( $i < $j ) { #print Dumper($times[$i]); print $times[$i][2] . " "; $i += 1; } print "\n"; #die } close(segments_f);