#!/usr/bin/perl # Copyright 2012 Johns Hopkins University (Author: Guoguo Chen) # Apache 2.0. # use strict; use warnings; use Getopt::Long; use XML::Simple; use Data::Dumper; use File::Basename; my $tolerance = 0.5; sub ReadKwslist { my $kwslist_in = shift @_; my $source = "STDIN"; if ($kwslist_in ne "-") { open(I, "<$kwslist_in") || die "Fail to open kwslist $kwslist_in.\n"; $source = "I"; } # Read in the kwslist and parse it. Note that this is a naive parse -- I simply # assume that the kwslist is "properly" generated my @KWS; my (@info, $kwid, $tbeg, $dur, $file, $score, $channel); my ($kwlist_filename, $language, $system_id) = ("", "", ""); while (<$source>) { chomp; if (/[0]\" language=\"$info->[1]\" system_id=\"$info->[2]\">\n"; my $prev_kw = ""; foreach my $kwentry (@{$KWS}) { if ($prev_kw ne $kwentry->[0]) { if ($prev_kw ne "") {$kwslist .= " \n";} $kwslist .= " [0]\" oov_count=\"0\">\n"; $prev_kw = $kwentry->[0]; } $kwslist .= " [1]\" channel=\"$kwentry->[2]\" tbeg=\"$kwentry->[3]\" dur=\"$kwentry->[4]\" score=\"$kwentry->[5]\" decision=\"$kwentry->[6]\""; if (defined($kwentry->[7])) {$kwslist .= " threshold=\"$kwentry->[7]\"";} if (defined($kwentry->[8])) {$kwslist .= " raw_score=\"$kwentry->[8]\"";} $kwslist .= "/>\n"; } $kwslist .= " \n"; $kwslist .= "\n"; return $kwslist; } sub KwslistTimeCompare { my ($a, $b) = @_; if ($a->[0] eq $b->[0]) { if ($a->[1] eq $b->[1]) { if (abs($a->[3]-$b->[3]) <= $tolerance) { if (abs($a->[3]+$a->[4]-$b->[3]-$b->[4]) <= $tolerance) { return 0; } else { return ($a->[3]+$a->[4]) <=> ($b->[3]+$b->[4]); } } else { return $a->[3] <=> $b->[3]; } } else { return $a->[1] cmp $b->[1]; } } else { $a->[0] cmp $b->[0]; } } sub KwslistTimeSort { return KwslistTimeCompare($a, $b); } my $Usage = < w2 ... e.g.: naive_comb.pl 0.5 kwslist1.xml 0.5 kwslist2.xml ... kwslist_comb.xml Allowed options: --method : Use different combination method (int, default = 1) 1 -- Weighted sum 2 -- Weighted "powered" --power : The power of method 2 (float, default = 0.5) --tolerance : Tolerance for being the same hits (float, default = 0.5) EOU my $method = 1; my $power = 0.5; GetOptions('tolerance=f' => \$tolerance, 'method=i' => \$method, 'power=f' => \$power, 'inv-power=f' => sub { (my $opt, my $val) = @_; $power = 1.0/$val;}); @ARGV >= 3 || die $Usage; # Workout the input/output source @ARGV % 2 == 1 || die "Bad number of (weight, kwslist) pair.\n"; my @kwslist_file = (); my @weight = (); while (@ARGV != 1) { my $w = shift @ARGV; $w =~ m/^[0-9.]*$/ || die "Bad weight: $w.\n"; push(@weight, $w); push(@kwslist_file, shift @ARGV); } my $output = shift @ARGV; # Open the first kwslist my ($info, $KWS) = @{ReadKwslist($kwslist_file[0])}; # Open the rest kwslists my @kwslist = (); for (my $i = 1; $i < @kwslist_file; $i ++) { push(@kwslist, @{ReadKwslist($kwslist_file[$i])}[1]); } # Process the first kwslist my @KWS = sort KwslistTimeSort @{$KWS}; my $w = shift @weight; foreach my $kwentry (@$KWS) { if ($method == 1) { $kwentry->[5] = $kwentry->[5] * $w; } elsif ($method == 2) { $kwentry->[5] = ($kwentry->[5]**$power) * $w; } else { die "Method not defined.\n"; } } # Start merging the rest kwslists while (@kwslist > 0) { my $w = shift @weight; my @kws = sort KwslistTimeSort @{shift @kwslist}; # We'll take time information from the first system my ($i, $j) = (0, 0); my @from_kws; while ($i < @KWS and $j < @kws) { my $cmp = KwslistTimeCompare($KWS[$i], $kws[$j]); if ($cmp == 0) { if ($method == 1) { $KWS[$i]->[5] += $kws[$j]->[5] * $w; } elsif ($method == 2) { $KWS[$i]->[5] += ($kws[$j]->[5]**$power) * $w; } else { die "Method not defined.\n"; } $i ++; $j ++; } elsif ($cmp == -1) { $i ++; } else { if ($method == 1) { $kws[$j]->[5] = $kws[$j]->[5] * $w; } elsif ($method == 2) { $kws[$j]->[5] = ($kws[$j]->[5]**$power) * $w; } else { die "Method not defined.\n"; } push(@from_kws, $kws[$j]); $j ++; } } while ($j < @kws) { if ($method == 1) { $kws[$j]->[5] = $kws[$j]->[5] * $w; } elsif ($method == 2) { $kws[$j]->[5] = ($kws[$j]->[5]**$power) * $w; } else { die "Method not defined.\n"; } push(@from_kws, $kws[$j]); $j ++; } # Sort again @from_kws = (@KWS, @from_kws); @KWS = sort KwslistTimeSort @from_kws; } if ($method == 2) { foreach my $kwentry (@KWS) { $kwentry->[5] = $kwentry->[5]**(1.0/$power); } } # Sorting and pringting my $kwslist = PrintKwslist(\@{$info}, \@KWS); if ($output eq "-") { print $kwslist; } else { open(O, ">$output") || die "Fail to open output file: $output\n"; print O $kwslist; close(O); }