#!/usr/bin/env python

# Copyright 2014  Vimal Manohar
# Apache 2.0

import os, glob, argparse, sys, re, time
from argparse import ArgumentParser

use_numpy = True
try:
    import numpy as np
except ImportError:
    use_numpy = False

# Global stats for analysis taking an RTTM file as reference
global_analysis_get_initial_segments = None
global_analysis_set_nonspeech_proportion = None
global_analysis_final = None

def mean(l):
    if len(l) > 0:
        return float(sum(l)) / len(l)
    return 0

# Analysis class
# Stores statistics like the confusion matrix, the lengths of the segments etc.
class Analysis:
    def __init__(self, file_id, frame_shift, prefix):
        self.confusion_matrix = [0] * 9
        self.type_counts = [ [[] for j in range(0,9)] for i in range(0,3) ]
        self.state_count = [ [] for i in range(0,9) ]
        self.markers = [ [] for i in range(0,9) ]
        self.phones = [ [] for i in range(0,9) ]
        self.min_length = [0] * 9
        self.max_length = [0] * 9
        self.mean_length = [0] * 9
        self.percentile25 = [0] * 9
        self.percentile50 = [0] * 9
        self.percentile75 = [0] * 9
        self.file_id = file_id
        self.frame_shift = frame_shift
        self.prefix = prefix

    # Add the statistics of this object to another object a.
    # Typically used in a global object to accumulate stats
    # from local objects.
    def add(self, a):
        for i in range(0,9):
            self.confusion_matrix[i] += a.confusion_matrix[i]
            self.state_count[i] += a.state_count[i]

    # Print the confusion matrix.
    # The interpretation of 'speech', 'noise' and 'silence' changes through
    # the different post-processing stages; e.g. at the end, speech and
    # silence correspond respectively to 'in segment' and 'out of segment'.
    def write_confusion_matrix(self, write_hours = False, file_handle = sys.stderr):
        file_handle.write("Total counts: \n")
        name = ['Silence as silence', \
                'Silence as noise', \
                'Silence as speech', \
                'Noise as silence', \
                'Noise as noise', \
                'Noise as speech', \
                'Speech as silence', \
                'Speech as noise', \
                'Speech as speech']
        for j in range(0,9):
            if self.frame_shift != None:
                # The conventional usage is for frame_shift to have a value.
                # But this function can also handle other counts, like the
                # number of segments: it is called to print raw counts
                # instead of seconds from functions like merge_segments.
                if write_hours:
                    # Write stats in hours instead of seconds
                    file_handle.write("File %s: %s : %s : %8.3f hrs\n" % (self.file_id, self.prefix, name[j], self.confusion_matrix[j] * self.frame_shift / 3600.0))
                else:
                    file_handle.write("File %s: %s : %s : %8.3f seconds\n" % (self.file_id, self.prefix, name[j], self.confusion_matrix[j] * self.frame_shift))
                # End if write_hours
            else:
                file_handle.write("File %s: %s : Confusion: Type %d : %8.3f counts\n" % (self.file_id, self.prefix, j, self.confusion_matrix[j]))
            # End if
        # End for loop over 9 cells of confusion matrix
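    # Note on indexing (illustrative, derived from the analysis code below):
    # the 9 cells are laid out as 3*reference_class + hypothesized_class,
    # with classes 0 = silence, 1 = noise, 2 = speech. For example, a frame
    # that is noise in the reference but hypothesized as speech falls in
    # cell 3*1 + 2 = 5, printed as 'Noise as speech'.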
    # Print the total stats, which are just the row and column sums of the
    # 3x3 confusion matrix.
    def write_total_stats(self, write_hours = True, file_handle = sys.stderr):
        file_handle.write("Total Stats: \n")
        name = ['Actual Silence', \
                'Actual Noise', \
                'Actual Speech']
        for j in [0,1,2]:
            if self.frame_shift != None:
                # The conventional usage is for frame_shift to have a value.
                # But this function can also handle other counts, like the
                # number of segments: it is called to print raw counts
                # instead of seconds from functions like merge_segments.
                if write_hours:
                    # Write stats in hours instead of seconds
                    file_handle.write("File %s: %s : %s : %8.3f hrs\n" % (self.file_id, self.prefix, name[j], sum(self.confusion_matrix[3*j:3*j+3]) * self.frame_shift / 3600.0))
                else:
                    file_handle.write("File %s: %s : %s : %8.3f seconds\n" % (self.file_id, self.prefix, name[j], sum(self.confusion_matrix[3*j:3*j+3]) * self.frame_shift))
                # End if write_hours
            else:
                file_handle.write("File %s: %s : %s : %8.3f counts\n" % (self.file_id, self.prefix, name[j], sum(self.confusion_matrix[3*j:3*j+3])))
            # End if
        # End for loop over 3 rows of confusion matrix

        name = ['Predicted Silence', \
                'Predicted Noise', \
                'Predicted Speech']
        for j in [0,1,2]:
            if self.frame_shift != None:
                # See the note above: frame_shift may be None, in which case
                # raw counts are printed instead of seconds.
                if write_hours:
                    # Write stats in hours instead of seconds
                    file_handle.write("File %s: %s : %s : %8.3f hrs\n" % (self.file_id, self.prefix, name[j], sum(self.confusion_matrix[j:7+j:3]) * self.frame_shift / 3600.0))
                else:
                    file_handle.write("File %s: %s : %s : %8.3f seconds\n" % (self.file_id, self.prefix, name[j], sum(self.confusion_matrix[j:7+j:3]) * self.frame_shift))
                # End if write_hours
            else:
                file_handle.write("File %s: %s : %s : %8.3f counts\n" % (self.file_id, self.prefix, name[j], sum(self.confusion_matrix[j:7+j:3])))
            # End if
        # End for loop over 3 columns of confusion matrix

    # Print detailed stats of the lengths of each of the 3 types of frames
    # in the different kinds of segments.
    def write_type_stats(self, file_handle = sys.stderr):
        for j in range(0,3):
            # 3 types of frames: silence, noise, speech.
            # Typically, we store the number of frames of each type here.
            for i in range(0,9):
                # 2^3 = 8 kinds of segments, like 'segment contains only
                # silence', 'segment contains only noise', 'segment contains
                # noise and speech'. For compatibility with the rest of the
                # analysis code, the for loop is over 9 kinds.
                max_length = max([0]+self.type_counts[j][i])
                min_length = min([10000]+self.type_counts[j][i])
                mean_length = mean(self.type_counts[j][i])
                # Default the percentiles to 0 so the write below works even
                # when numpy is not available.
                percentile25 = 0
                percentile50 = 0
                percentile75 = 0
                if use_numpy:
                    try:
                        percentile25 = np.percentile(self.type_counts[j][i], 25)
                    except ValueError:
                        percentile25 = 0
                    try:
                        percentile50 = np.percentile(self.type_counts[j][i], 50)
                    except ValueError:
                        percentile50 = 0
                    try:
                        percentile75 = np.percentile(self.type_counts[j][i], 75)
                    except ValueError:
                        percentile75 = 0
                file_handle.write("File %s: %s : TypeStats: Type %d %d: Min: %4d Max: %4d Mean: %4d percentile25: %4d percentile50: %4d percentile75: %4d\n" % (self.file_id, self.prefix, j, i, min_length, max_length, mean_length, percentile25, percentile50, percentile75))
            # End for loop over 9 different kinds of segments
        # End for loop over 3 types of frames
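    # Note on the segment kinds (illustrative, derived from
    # segmentation_analysis below): index 0 = silence only, 1 = noise only,
    # 2 = speech only, 3 = silence+noise, 4 = silence+speech,
    # 5 = noise+speech, 6 = silence+noise+speech; indices 7 and 8 are unused.
    # A line "TypeStats: Type 1 5: ..." thus summarizes the noise-frame
    # counts within segments that contain both noise and speech.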
    # Print detailed stats of each cell of the confusion matrix.
    # The stats include different statistical measures, like the mean, max,
    # min and median of the lengths of contiguous runs of frames in each of
    # the 9 cells of the confusion matrix.
    def write_length_stats(self, file_handle = sys.stderr):
        for i in range(0,9):
            self.max_length[i] = max([0]+self.state_count[i])
            self.min_length[i] = min([10000]+self.state_count[i])
            self.mean_length[i] = mean(self.state_count[i])
            if use_numpy:
                try:
                    self.percentile25[i] = np.percentile(self.state_count[i], 25)
                except ValueError:
                    self.percentile25[i] = 0
                try:
                    self.percentile50[i] = np.percentile(self.state_count[i], 50)
                except ValueError:
                    self.percentile50[i] = 0
                try:
                    self.percentile75[i] = np.percentile(self.state_count[i], 75)
                except ValueError:
                    self.percentile75[i] = 0
            file_handle.write("File %s: %s : Length: Type %d: Min: %4d Max: %4d Mean: %4d percentile25: %4d percentile50: %4d percentile75: %4d\n" % (self.file_id, self.prefix, i, self.min_length[i], self.max_length[i], self.mean_length[i], self.percentile25[i], self.percentile50[i], self.percentile75[i]))
        # End for loop over 9 cells

    # Print detailed stats of each cell of the confusion matrix.
    # Similar in structure to the function above, but this also prints
    # additional details. The format is:
    # Markers: Type <cell>: <start_frame> (<num_of_frames>) (<hypothesized_phones>)
    # The hypothesized_phones can be looked at to see what phones are
    # present in the hypothesis from start_frame for num_of_frames frames.
    def write_markers(self, file_handle = sys.stderr):
        file_handle.write("Start frames of different segments:\n")
        for j in range(0,9):
            if self.phones[j] == []:
                file_handle.write("File %s: %s : Markers: Type %d: %s\n" % (self.file_id, self.prefix, j, str(sorted([str(self.markers[j][i])+' ('+ str(self.state_count[j][i])+ ')' for i in range(0, len(self.state_count[j]))], key=lambda x:int(x.split()[0])))))
            else:
                file_handle.write("File %s: %s : Markers: Type %d: %s\n" % (self.file_id, self.prefix, j, str(sorted([str(self.markers[j][i])+' ('+ str(self.state_count[j][i])+') ( ' + str(self.phones[j][i]) + ')' for i in range(0, len(self.state_count[j]))], key=lambda x:int(x.split()[0])))))
        # End for loop over 9 cells
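# Note on the RTTM fields used below (0-indexed columns): column 0 is the
# type (LEXEME, NON-LEX, NON-SPEECH, SPEAKER, ...), column 1 the file-id,
# column 3 the start time in seconds, column 4 the duration in seconds,
# column 5 the word and column 6 the category. For a hypothetical line
#   LEXEME BABEL_101_A_inLine 1 12.34 0.56 hello lex
# with frame_shift 0.01, the reference for that file is padded with "0"
# (silence) up to frame int(12.34/0.01 + 0.5) = 1234 and then extended with
# int(0.56/0.01 + 0.5) = 56 frames of "2" (speech).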
# Function to read a standard IARPA Babel RTTM file,
# as structured on Jan 16, 2014.
def read_rttm_file(rttm_file, temp_dir, frame_shift):
    file_id = None
    this_file = []
    ref_file_handle = None
    reference = {}
    for line in open(rttm_file).readlines():
        splits = line.strip().split()
        type1 = splits[0]
        if type1 == "SPEAKER":
            continue
        if splits[1] != file_id:
            # A different file_id. Need to open a different file to write.
            if this_file != []:
                # If this_file is empty, no reference RTTM corresponding to
                # the file_id has been read yet, which happens at the start of
                # a file_id. Otherwise a contiguous segment of the previous
                # file_id has been processed, so write it to the file
                # corresponding to the previous file_id.
                try:
                    ref_file_handle.write(' '.join(this_file))
                    # Close the previous file, if any
                    ref_file_handle.close()
                    this_file = []
                except AttributeError:
                    # Ignore AttributeError. It is expected.
                    pass
            # End if
            file_id = splits[1]
            if (file_id not in reference):
                # First time seeing this file_id. Open a new file for writing.
                reference[file_id] = 1
                try:
                    ref_file_handle = open(temp_dir+"/"+file_id+".ref", 'w')
                except IOError:
                    sys.stderr.write("Unable to open " + temp_dir+"/"+file_id+".ref for writing\n")
                    sys.exit(1)
                ref_file_handle.write(file_id + "\t")
            else:
                # This file_id has been seen before, but not in the previous
                # iteration. The file has already been closed, so open it
                # for append.
                try:
                    this_file = open(temp_dir+"/"+file_id+".ref").readline().strip().split()[1:]
                    ref_file_handle = open(temp_dir+"/"+file_id+".ref", 'a')
                except IOError:
                    sys.stderr.write("Unable to open " + temp_dir+"/"+file_id+".ref for appending\n")
                    sys.exit(1)
            # End if
        # End if
        i = len(this_file)
        category = splits[6]
        word = splits[5]
        start_time = int(float(splits[3])/frame_shift + 0.5)
        duration = int(float(splits[4])/frame_shift + 0.5)
        if i < start_time:
            this_file.extend(["0"]*(start_time - i))
        if type1 == "NON-LEX":
            if category == "other":
                # "other" is taken as silence
                this_file.extend(["0"]*duration)
            else:
                this_file.extend(["1"]*duration)
        if type1 == "LEXEME":
            this_file.extend(["2"]*duration)
        if type1 == "NON-SPEECH":
            this_file.extend(["1"]*duration)
    ref_file_handle.write(' '.join(this_file))
    ref_file_handle.close()

# Stats class to store some basic stats about the number of times the
# post-processor goes through particular loops or blocks of code in the
# algorithm. This is just for debugging.
class Stats:
    def __init__(self):
        self.inter_utt_nonspeech = 0
        self.merge_nonspeech_segment = 0
        self.merge_segments = 0
        self.split_segments = 0
        self.silence_only = 0
        self.noise_only = 0

    def print_stats(self):
        sys.stderr.write("Inter-utt nonspeech: %d\n" % self.inter_utt_nonspeech)
        sys.stderr.write("Merge nonspeech segment: %d\n" % self.merge_nonspeech_segment)
        sys.stderr.write("Merge segment: %d\n" % self.merge_segments)
        sys.stderr.write("Split segments: %d\n" % self.split_segments)
        sys.stderr.write("Noise only: %d\n" % self.noise_only)
        sys.stderr.write("Silence only: %d\n" % self.silence_only)

    def reset(self):
        self.inter_utt_nonspeech = 0
        self.merge_nonspeech_segment = 0
        self.merge_segments = 0
        self.split_segments = 0
        self.silence_only = 0
        self.noise_only = 0

# Timer class to time blocks of code
class Timer:
    def __enter__(self):
        self.start = time.clock()
        return self

    def __exit__(self, *args):
        self.end = time.clock()
        self.interval = self.end - self.start
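# Note on the frame labels used by JointResegmenter (illustrative, derived
# from map_prediction below): each label jointly encodes this channel's class
# and the other channel's class as str(3*this + that), with 0 = silence,
# 1 = noise, 2 = speech. So "0"-"2" are silence on this channel, "3"-"5"
# noise, and "6"-"8" speech; e.g. "7" means speech here while the other
# channel has noise. set_nonspeech_proportion() later relabels non-speech
# frames "0"-"5" as "9"-"14" when it pulls them inside a segment.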
# The main class for post-processing a file.
# This does the segmentation either by looking at the file in isolation
# or by looking at both channels simultaneously.
class JointResegmenter:
    def __init__(self, P, A, f, options, phone_map, stats = None, reference = None):
        # Pointers to prediction arrays, and initialization
        self.P = P                     # Predicted phones
        self.B = [ i for i in A ]      # Original predicted classes
        self.A = A                     # Predicted classes
        self.file_id = f               # File name
        self.N = len(A)                # Length of the prediction (= number of frames in the audio file)
        self.S = [False] * self.N      # Array of start boundary markers
        self.E = [False] * (self.N+1)  # Array of end boundary markers
        self.phone_map = phone_map
        self.options = options         # Configuration
        self.frame_shift = options.frame_shift

        # Convert lengths in seconds to frames
        self.max_frames = int(options.max_segment_length / options.frame_shift)
        self.hard_max_frames = int(options.hard_max_segment_length / options.frame_shift)
        self.min_inter_utt_nonspeech_length = int(options.min_inter_utt_silence_length / options.frame_shift)

        if ( options.remove_noise_only_segments == "false" ):
            self.remove_noise_segments = False
        elif ( options.remove_noise_only_segments == "true" ):
            self.remove_noise_segments = True
        # End of configuration

        # Define frame-type constants
        self.THIS_SILENCE = ("0","1","2")
        self.THIS_NOISE = ("3","4","5")
        self.THIS_SPEECH = ("6", "7", "8")
        self.THIS_SPEECH_THAT_SIL = ("6",)
        self.THIS_SPEECH_THAT_NOISE = ("7",)
        self.THIS_SIL_CONVERT_THAT_SIL = ("9",)
        self.THIS_SIL_CONVERT_THAT_NOISE = ("10",)
        self.THIS_SIL_CONVERT = ("9","10","11")
        self.THIS_SILENCE_CONVERT = ("9","10","11")
        self.THIS_NOISE_CONVERT_THAT_SIL = ("12",)
        self.THIS_NOISE_CONVERT_THAT_NOISE = ("13",)
        self.THIS_NOISE_CONVERT = ("12","13","14")
        self.THIS_NOISE_OR_SILENCE = self.THIS_NOISE + self.THIS_SILENCE
        self.THIS_SILENCE_OR_NOISE = self.THIS_NOISE + self.THIS_SILENCE
        self.THIS_CONVERT = self.THIS_SILENCE_CONVERT + self.THIS_NOISE_CONVERT
        self.THIS_SILENCE_PLUS = self.THIS_SILENCE + self.THIS_SILENCE_CONVERT
        self.THIS_NOISE_PLUS = self.THIS_NOISE + self.THIS_NOISE_CONVERT
        self.THIS_SPEECH_PLUS = self.THIS_SPEECH + self.THIS_CONVERT

        if stats != None:
            self.stats = stats

        self.reference = None
        if reference != None:
            if len(reference) < self.N:
                self.reference = reference + ["0"] * (self.N - len(reference))
                assert (len(self.reference) == self.N)
            else:
                self.reference = reference

    # This function restricts the output to length N
    def restrict(self, N):
        self.B = self.B[0:N]
        self.A = self.A[0:N]
        self.S = self.S[0:N]
        self.E = self.E[0:N+1]
        if sum(self.S) == sum(self.E) + 1:
            self.E[N] = True
        self.N = N

    # Main resegment function that calls the others
    def resegment(self):
        with Timer() as t:
            self.get_initial_segments()
        if self.options.verbose > 1:
            sys.stderr.write("For %s: get_initial_segments took %f sec\n" % (self.file_id, t.interval))
        with Timer() as t:
            self.set_nonspeech_proportion()
        if self.options.verbose > 1:
            sys.stderr.write("For %s: set_nonspeech_proportion took %f sec\n" % (self.file_id, t.interval))
        with Timer() as t:
            self.merge_segments()
        if self.options.verbose > 1:
            sys.stderr.write("For %s: merge took %f sec\n" % (self.file_id, t.interval))
        with Timer() as t:
            self.split_long_segments()
        if self.options.verbose > 1:
            sys.stderr.write("For %s: split took %f sec\n" % (self.file_id, t.interval))
        if self.remove_noise_segments:
            with Timer() as t:
                self.remove_noise_only_segments()
            if self.options.verbose > 1:
                sys.stderr.write("For %s: remove took %f sec\n" % (self.file_id, t.interval))
        elif self.min_inter_utt_nonspeech_length > 0.0:
            # This is the typical one with the augmented training setup
            self.remove_silence_only_segments()
        if self.options.verbose > 1:
            sys.stderr.write("For file %s\n" % self.file_id)
            self.stats.print_stats()
            sys.stderr.write("\n")
            self.stats.reset()
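    # Note on the boundary arrays (illustrative): S[i] marks a frame that
    # starts a segment and E[i] marks the first frame after a segment ends,
    # which is why E has N+1 entries. Every stage called from resegment()
    # is expected to preserve the pairing invariant sum(S) == sum(E).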
    def get_initial_segments(self):
        for i in range(0, self.N):
            if (i > 0) and self.A[i-1] != self.A[i]:
                # This frame is different from the previous frame.
                if self.A[i] in self.THIS_SPEECH:
                    # This frame is speech.
                    if self.A[i-1] in self.THIS_SPEECH:
                        # Both this frame and the previous one are speech, but
                        # they are different, e.g. "8 7". So this is the end
                        # of the previous region and the beginning of the
                        # next region.
                        self.S[i] = True
                        self.E[i] = True
                    else:
                        # The previous frame is non-speech, but this one is
                        # not. So this frame is the beginning of a new
                        # segment.
                        self.S[i] = True
                else:
                    # This frame is non-speech.
                    if self.A[i-1] in self.THIS_SPEECH:
                        # The previous frame is speech, but this one is not.
                        # So this frame is the end of the previous segment.
                        self.E[i] = True
            elif i == 0 and self.A[i] in self.THIS_SPEECH:
                # The frame is speech. So this is the start of a new segment.
                self.S[i] = True
        if self.A[self.N-1] in self.THIS_SPEECH:
            # Handle the special case where the last frame of the file is
            # speech, not non-speech.
            self.E[self.N] = True
        assert(sum(self.S) == sum(self.E))

        ###########################################################################
        # Analysis section
        self.C = ["0"] * self.N
        C = self.C
        a = Analysis(self.file_id, self.frame_shift, "Analysis after get_initial_segments")
        if self.reference != None:
            count = 0
            for i in range(0,self.N):
                if self.reference[i] == "0" and self.A[i] in self.THIS_SILENCE:
                    C[i] = "0"
                elif self.reference[i] == "0" and self.A[i] in self.THIS_NOISE:
                    C[i] = "1"
                elif self.reference[i] == "0" and self.A[i] in self.THIS_SPEECH:
                    C[i] = "2"
                elif self.reference[i] == "1" and self.A[i] in self.THIS_SILENCE:
                    C[i] = "3"
                elif self.reference[i] == "1" and self.A[i] in self.THIS_NOISE:
                    C[i] = "4"
                elif self.reference[i] == "1" and self.A[i] in self.THIS_SPEECH:
                    C[i] = "5"
                elif self.reference[i] == "2" and self.A[i] in self.THIS_SILENCE:
                    C[i] = "6"
                elif self.reference[i] == "2" and self.A[i] in self.THIS_NOISE:
                    C[i] = "7"
                elif self.reference[i] == "2" and self.A[i] in self.THIS_SPEECH:
                    C[i] = "8"
                if i > 0 and C[i-1] != C[i]:
                    a.state_count[int(C[i-1])].append(count)
                    a.markers[int(C[i-1])].append(i - count)
                    a.phones[int(C[i-1])].append(' '.join(set(self.P[i-count:i])))
                    count = 1
                else:
                    count += 1
            for j in range(0,9):
                a.confusion_matrix[j] = sum([C[i] == str(j) for i in range(0,self.N)])
            global_analysis_get_initial_segments.add(a)
        if self.reference != None and self.options.verbose > 0:
            a.write_confusion_matrix()
            a.write_length_stats()
        if self.reference != None and self.options.verbose > 1:
            a.write_markers()
        ###########################################################################
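    # Worked example for get_initial_segments (illustrative): with
    # A = ["0", "8", "8", "3", "7", "7"], the loop sets S[1] and S[4] and
    # E[3], and the final-frame special case sets E[6], giving one segment
    # over frames 1-2 and another over frames 4-5.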
    def set_nonspeech_proportion(self):
        num_speech_frames = 0
        in_segment = False
        # Active frames are the frames that are either segment starts
        # or segment ends.
        active_frames = []
        for n in range(0, self.N + 1):
            if self.E[n]:
                assert(in_segment)
                in_segment = False
                active_frames.append(n)
            if n < self.N and self.S[n]:
                assert(not in_segment)
                in_segment = True
                active_frames.append(n)
            if n < self.N:
                if in_segment:
                    # Count the number of speech frames
                    num_speech_frames += 1
        assert (not in_segment)
        if num_speech_frames == 0:
            sys.stderr.write("%s: Warning: no speech found for recording %s\n" % (sys.argv[0], self.file_id))

        # Set the number of non-speech frames to be added depending on the
        # silence proportion. The target number of frames in the segments
        # is computed as below:
        target_segment_frames = int(num_speech_frames / (1.0 - self.options.silence_proportion))
        # The number of frames currently in the segments
        num_segment_frames = num_speech_frames

        count = 0
        while num_segment_frames < target_segment_frames:
            count += 1
            changed = False
            for i in range(0, len(active_frames)):
                # At each active frame, try to include a non-speech frame in
                # the segment, thus padding the speech segments with some
                # non-speech frames. These converted non-speech frames are
                # labelled 9...14 depending on whether they were originally
                # 0...5 respectively.
                n = active_frames[i]
                if self.E[n] and n < self.N and not self.S[n]:
                    # This must be the beginning of a non-speech region.
                    # Include some of this non-speech in the segments.
                    assert (self.A[n] not in self.THIS_SPEECH)
                    # Convert the non-speech frame to be included in a segment
                    self.A[n] = str(int(self.B[n]) + 9)
                    if self.B[n-1] != self.B[n]:
                        # At this frame there is a transition from one type
                        # of non-speech (0, 1 ... 5) to another, so it is the
                        # start of a segment. Also add it to the end of the
                        # active frames list.
                        self.S[n] = True
                        active_frames.append(n+1)
                    else:
                        # We need to extend the segment end since we have
                        # included a non-speech frame. Remove the current
                        # segment end mark and move it to the next frame.
                        self.E[n] = False
                        active_frames[i] = n + 1
                        self.E[n+1] = True
                    # Increment the number of frames in the segments
                    num_segment_frames += 1
                    changed = True
                if n < self.N and self.S[n] and n > 0 and not self.E[n]:
                    # This must be the beginning of a speech region.
                    # Include some non-speech before it in the segments.
                    assert (self.A[n-1] not in self.THIS_SPEECH)
                    self.A[n-1] = str(int(self.B[n-1]) + 9)
                    if self.B[n-1] != self.B[n]:
                        self.E[n] = True
                        active_frames.append(n-1)
                    else:
                        self.S[n] = False
                        active_frames[i] = n - 1
                        self.S[n-1] = True
                    num_segment_frames += 1
                    changed = True
                if num_segment_frames >= target_segment_frames:
                    break
            if not changed:
                # Avoid an infinite loop: if nothing changed, break.
                break
        if num_segment_frames < target_segment_frames:
            proportion = float(num_segment_frames - num_speech_frames) / num_segment_frames
            sys.stderr.write("%s: Warning: for recording %s, only got a proportion %f of non-speech frames, versus target %f\n" % (sys.argv[0], self.file_id, proportion, self.options.silence_proportion))

        ###########################################################################
        # Analysis section
        self.C = ["0"] * self.N
        C = self.C
        a = Analysis(self.file_id, self.frame_shift, "Analysis after set_nonspeech_proportion")
        if self.reference != None:
            count = 0
            for i in range(0,self.N):
                if self.reference[i] == "0" and self.A[i] in (self.THIS_SILENCE + self.THIS_NOISE):
                    C[i] = "0"
                elif self.reference[i] == "0" and self.A[i] in self.THIS_CONVERT:
                    C[i] = "1"
                elif self.reference[i] == "0" and self.A[i] in self.THIS_SPEECH:
                    C[i] = "2"
                elif self.reference[i] == "1" and self.A[i] in (self.THIS_SILENCE + self.THIS_NOISE):
                    C[i] = "3"
                elif self.reference[i] == "1" and self.A[i] in self.THIS_CONVERT:
                    C[i] = "4"
                elif self.reference[i] == "1" and self.A[i] in self.THIS_SPEECH:
                    C[i] = "5"
                elif self.reference[i] == "2" and self.A[i] in (self.THIS_SILENCE + self.THIS_NOISE):
                    C[i] = "6"
                elif self.reference[i] == "2" and self.A[i] in self.THIS_CONVERT:
                    C[i] = "7"
                elif self.reference[i] == "2" and self.A[i] in self.THIS_SPEECH:
                    C[i] = "8"
                if i > 0 and C[i-1] != C[i]:
                    a.state_count[int(C[i-1])].append(count)
                    a.markers[int(C[i-1])].append(i - count)
                    a.phones[int(C[i-1])].append(' '.join(set(self.P[i-count:i])))
                    count = 1
                else:
                    count += 1
            for j in range(0,9):
                a.confusion_matrix[j] = sum([C[i] == str(j) for i in range(0,self.N)])
            global_analysis_set_nonspeech_proportion.add(a)
        if self.reference != None and self.options.verbose > 0:
            a.write_confusion_matrix()
            a.write_length_stats()
        if self.reference != None and self.options.verbose > 1:
            a.write_markers()
        ###########################################################################
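    # Worked example for the padding target (illustrative): with
    # num_speech_frames = 900 and --silence-proportion 0.1,
    # target_segment_frames = int(900 / 0.9) = 1000, so up to 100 non-speech
    # frames are converted (labels 9...14) and pulled inside the segments.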
    def merge_segments(self):
        # Get the lists of frames which have segment start and segment end
        # markers.
        segment_starts = [i for i, val in enumerate(self.S) if val]
        segment_ends = [i for i, val in enumerate(self.E) if val]
        assert (sum(self.S) == sum(self.E))

        if self.options.verbose > 3:
            sys.stderr.write("Length of segment starts before non-speech adding: %d\n" % len(segment_starts))
        if self.min_inter_utt_nonspeech_length > 0.0:
            segment_starts = list(set([0] + segment_starts + segment_ends + [self.N]))
            segment_starts.sort()
            segment_starts.pop()
            segment_ends = list(set([0] + segment_starts + segment_ends + [self.N]))
            segment_ends.sort()
            segment_ends.pop(0)
            if self.options.verbose > 3:
                sys.stderr.write("Length of segment starts after non-speech adding: %d\n" % len(segment_starts))
            for i in segment_starts:
                self.S[i] = True
            for i in segment_ends:
                self.E[i] = True
        # Just a check: there must always be an equal number of segment
        # starts and segment ends.
        assert (len(segment_starts) == len(segment_ends))

        # A boundary is a frame which is both a segment start and a segment
        # end. The list of boundaries is obtained in the following step,
        # along with a few statistics, like the type of transition at the
        # boundary and the length of the segment on either side of it.
        boundaries = []
        i = 0
        j = 0
        while i < len(segment_starts) and j < len(segment_ends):
            if segment_ends[j] < segment_starts[i]:
                # The segment end marker is before the segment start marker.
                # This means that this segment end marker corresponds to a
                # segment that is before the one indicated by the segment
                # start marker. So advance the segment end pointer to the
                # next segment end to check if that is a 'boundary'.
                j += 1
            elif segment_ends[j] > segment_starts[i]:
                # The segment end marker is after the segment start marker.
                # This means that this segment end marker would correspond to
                # the segment indicated by the segment start marker. So
                # advance the segment start pointer to the next segment start
                # to check if that is a 'boundary'.
                i += 1
            else:
                assert(i < len(segment_starts) and j < len(segment_ends))
                # A boundary:
                # Find the segment score as the min of the lengths of the
                # segments to the left and to the right. This segment score
                # will be used to prioritize merging of the segment with its
                # neighbor.
                assert ((j + 1) < len(segment_ends))
                segment_score = min(segment_starts[i] - segment_starts[i-1], \
                        segment_ends[j+1] - segment_ends[j])
                # Also find the type of transition of the segments at the
                # boundary. This is also used to prioritize the merging of
                # the segments.
                boundaries.append((segment_ends[j], segment_score, \
                        self.transition_type(segment_ends[j])))
                # Sort the boundaries based on segment score
                boundaries.sort(key = lambda x: x[1])
                # Then sort based on the type of transition, keeping the list
                # sorted within each transition type based on segment score
                boundaries.sort(key = lambda x: x[2])
                i += 1
                j += 1
            # End if
        # End while loop
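        # Note on the ordering (illustrative): each boundary is a tuple
        # (frame, segment_score, transition_type). Because Python's sort is
        # stable, the two sorts above leave the list ordered primarily by
        # transition type and, within a type, by the min of the two adjacent
        # segment lengths, so short segments at low-numbered (speech-like)
        # transitions are considered for merging first.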
        # Begin merging segments by removing the start and end marks at the
        # boundaries to be merged.
        count = 0
        for b in boundaries:
            count += 1
            segment_length = 0

            if self.min_inter_utt_nonspeech_length > 0.0 and not self.E[b[0]]:
                # This happens only if the boundary is at the end of a
                # non-speech region that has already been merged or removed:
                # b[0] will then no longer be an end mark.
                continue

            # Count the number of frames in the segment to the left of the
            # boundary
            p = b[0] - 1
            while p >= 0:
                if self.S[p]:
                    break
                # End if
                p -= 1
            # End while loop
            p_left = p
            segment_length += b[0] - p

            # Count the number of frames in the segment to the right of the
            # boundary
            p = b[0] + 1
            while p <= self.N:
                if self.E[p]:
                    break
                p += 1
            assert (self.min_inter_utt_nonspeech_length == 0 or p == self.N or self.S[p] or self.A[p] in self.THIS_SILENCE_OR_NOISE)

            if self.min_inter_utt_nonspeech_length > 0 and self.A[b[0]] in self.THIS_SILENCE_OR_NOISE:
                assert(b[2] == 6 or b[2] == 7)
                if (p - b[0]) > self.min_inter_utt_nonspeech_length:
                    # This is a non-speech segment that is longer than the
                    # minimum inter-utterance non-speech length. Therefore
                    # treat this non-speech as inter-utterance non-speech
                    # and remove it from the segments.
                    self.S[b[0]] = False
                    self.E[p] = False
                    # Count the number of times the inter-utterance
                    # non-speech length is greater than the set threshold.
                    # This is the number of times a silence is not merged
                    # with adjacent speech.
                    self.stats.inter_utt_nonspeech += 1
                    # This boundary is no longer valid, so continue to the
                    # next boundary.
                    continue
                # End if
                # This non-speech segment is shorter than the minimum
                # inter-utterance non-speech length. It is possible to merge
                # this segment with the adjacent ones, as long as the length
                # of the segment after merging stays within limits.
                p_temp = p
                p += 1
                while p <= self.N:
                    if self.E[p]:
                        break
                    p += 1
                # End while loop
                segment_length += p - b[0]
                if segment_length < self.max_frames:
                    # Merge the non-speech segment with the segments on
                    # either side.
                    # Count the number of times a segment merge happens
                    self.stats.merge_nonspeech_segment += 1
                    if p_temp < self.N:
                        self.S[p_temp] = False
                        self.E[p_temp] = False
                    self.S[b[0]] = False
                    self.E[b[0]] = False
                    continue
                else:
                    # The merged segment would be longer than max_frames.
                    # Therefore treat this non-speech as inter-utterance
                    # non-speech and remove it from the segments.
                    self.S[b[0]] = False
                    self.E[p_temp] = False
                    continue
                # End if
            elif self.min_inter_utt_nonspeech_length > 0 and (b[2] == 8 or b[2] == 9):
                assert(p_left == 0)
                if b[0] - p_left > self.min_inter_utt_nonspeech_length:
                    self.S[p_left] = False
                    self.E[b[0]] = False
                    continue
                # End if
            # End if

            segment_length += p - b[0]
            if segment_length < self.max_frames:
                self.stats.merge_segments += 1
                self.S[b[0]] = False
                self.E[b[0]] = False
            # End if
        # End for loop over boundaries
        assert (sum(self.S) == sum(self.E))

        ###########################################################################
        # Analysis section
        if self.reference != None and self.options.verbose > 3:
            a = self.segmentation_analysis("Analysis after merge_segments")
            a.write_confusion_matrix()
            if self.reference != None and self.options.verbose > 4:
                a.write_type_stats()
            # End if
            if self.reference != None and self.options.verbose > 4:
                a.write_markers()
            # End if
        # End if
        ###########################################################################
    # End function merge_segments

    def split_long_segments(self):
        assert (sum(self.S) == sum(self.E))
        # A while loop is used instead of 'for n in range(...)' so that the
        # skip to the end of a processed segment actually takes effect
        # (reassigning the loop variable of a Python for loop has no effect).
        n = 0
        while n < self.N:
            if self.S[n]:
                p = n + 1
                while p <= self.N:
                    if self.E[p]:
                        break
                    p += 1
                segment_length = p - n
                if segment_length > self.hard_max_frames:
                    # Count the number of times long segments are split
                    self.stats.split_segments += 1
                    num_pieces = int((float(segment_length) / self.hard_max_frames) + 0.99999)
                    sys.stderr.write("%s: Warning: for recording %s, " \
                            % (sys.argv[0], self.file_id) \
                            + "splitting segment of length %f seconds into %d pieces " \
                            % (segment_length * self.frame_shift, num_pieces) \
                            + "(--hard-max-segment-length %f)\n" \
                            % self.options.hard_max_segment_length)
                    frames_per_piece = int(segment_length / num_pieces)
                    for i in range(1,num_pieces):
                        q = n + i * frames_per_piece
                        self.S[q] = True
                        self.E[q] = True
                if p - 1 > n:
                    n = p - 1
            n += 1
        assert (sum(self.S) == sum(self.E))
    # End function split_long_segments

    def remove_silence_only_segments(self):
        # As in split_long_segments, a while loop is used so that the skip
        # to the end of a processed segment takes effect.
        n = 0
        while n < self.N:
            # Run through to find a segment start
            if self.S[n]:
                p = n
                saw_nonsilence = False
                # From the segment start, go till the segment end to see
                # if there is speech in it
                while p <= self.N:
                    if self.E[p] and p != n:
                        break
                    if p < self.N and self.A[p] not in self.THIS_SILENCE:
                        saw_nonsilence = True
                    p += 1
                # End of while loop through the segment
                assert (p > self.N or self.E[p])
                if not saw_nonsilence:
                    # Count the number of silence-only segments
                    self.stats.silence_only += 1
                    self.S[n] = False
                    self.E[p] = False
                # End if
                if p - 1 > n:
                    # Skip to the end of the segment, since it has already
                    # been processed
                    n = p - 1
                # End if
            n += 1

        if self.reference != None and self.options.verbose > 3:
            a = self.segmentation_analysis("Analysis after remove_silence_only_segments")
            a.write_confusion_matrix()
            if self.reference != None and self.options.verbose > 4:
                a.write_type_stats()
            # End if
            if self.reference != None and self.options.verbose > 4:
                a.write_markers()
            # End if
        # End if
    # End function remove_silence_only_segments
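    # Worked example for split_long_segments (illustrative): with
    # hard_max_frames = 1500 and a segment of 3200 frames starting at n,
    # num_pieces = int(3200/1500 + 0.99999) = 3 and frames_per_piece = 1066,
    # so new start/end marks are placed at n+1066 and n+2132.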
    def remove_noise_only_segments(self):
        # A while loop is used so that the skip to the end of a processed
        # segment takes effect.
        n = 0
        while n < self.N:
            if self.S[n]:
                p = n
                saw_speech = False
                while p <= self.N:
                    if self.E[p] and p != n:
                        break
                    if p < self.N and self.A[p] in self.THIS_SPEECH:
                        saw_speech = True
                    p += 1
                assert (self.E[p])
                if not saw_speech:
                    # Count the number of segments with no speech
                    self.stats.noise_only += 1
                    self.S[n] = False
                    self.E[p] = False
                # End if
                if p - 1 > n:
                    n = p - 1
                # End if
            # End if
            n += 1
        # End loop over frames

        ###########################################################################
        # Analysis section
        if self.reference != None and self.options.verbose > 3:
            a = self.segmentation_analysis("Analysis after remove_noise_only_segments")
            a.write_confusion_matrix()
            if self.reference != None and self.options.verbose > 4:
                a.write_type_stats()
            # End if
            if self.reference != None and self.options.verbose > 4:
                a.write_markers()
            # End if
        # End if
        ###########################################################################
    # End function remove_noise_only_segments

    # Return the transition type from frame j-1 to frame j
    def transition_type(self, j):
        assert (j > 0)
        assert (self.A[j-1] != self.A[j] or self.A[j] in self.THIS_CONVERT)
        if self.A[j-1] in (self.THIS_SPEECH_THAT_NOISE + self.THIS_SPEECH_THAT_SIL) and self.A[j] in (self.THIS_SPEECH_THAT_NOISE + self.THIS_SPEECH_THAT_SIL):
            return 0
        if self.A[j-1] in self.THIS_SPEECH and self.A[j] in self.THIS_SPEECH:
            return 1
        if self.A[j-1] in (self.THIS_SPEECH + self.THIS_NOISE_CONVERT_THAT_SIL + self.THIS_NOISE_CONVERT_THAT_NOISE) and self.A[j] in (self.THIS_SPEECH + self.THIS_NOISE_CONVERT_THAT_SIL + self.THIS_NOISE_CONVERT_THAT_NOISE):
            return 2
        if self.A[j-1] in (self.THIS_SPEECH + self.THIS_NOISE_CONVERT) and self.A[j] in (self.THIS_SPEECH + self.THIS_NOISE_CONVERT):
            return 3
        if self.A[j-1] in (self.THIS_SPEECH + self.THIS_NOISE_CONVERT + self.THIS_SIL_CONVERT_THAT_SIL + self.THIS_SIL_CONVERT_THAT_NOISE) and self.A[j] in (self.THIS_SPEECH + self.THIS_NOISE_CONVERT + self.THIS_SIL_CONVERT_THAT_SIL + self.THIS_SIL_CONVERT_THAT_NOISE):
            return 4
        if self.A[j-1] in (self.THIS_SPEECH + self.THIS_CONVERT) and self.A[j] in (self.THIS_SPEECH + self.THIS_CONVERT):
            return 5
        if self.A[j-1] in self.THIS_SPEECH_PLUS and self.A[j] in (self.THIS_SPEECH_PLUS + self.THIS_NOISE):
            return 6
        if self.A[j-1] in self.THIS_SPEECH_PLUS and self.A[j] in (self.THIS_SPEECH_PLUS + self.THIS_SILENCE):
            return 7
        if self.A[j-1] in (self.THIS_SPEECH_PLUS + self.THIS_NOISE) and self.A[j] in self.THIS_SPEECH_PLUS:
            return 8
        if self.A[j-1] in (self.THIS_SPEECH_PLUS + self.THIS_SILENCE) and self.A[j] in self.THIS_SPEECH_PLUS:
            return 9
        assert (False)
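    # Note on transition types (illustrative): the return value 0-9 ranks
    # how "speech-like" both sides of the boundary are, and merge_segments
    # merges low-numbered transitions first. For example, speech followed by
    # original noise returns 6, speech followed by original silence returns
    # 7, and the mirror images return 8 and 9.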
    # Output the final segments
    def print_segments(self, out_file_handle = sys.stdout):
        # We also do some sanity checking here.
        segments = []
        assert (self.N == len(self.S))
        assert (self.N + 1 == len(self.E))

        max_end_time = 0
        n = 0
        while n < self.N:
            if self.E[n] and not self.S[n]:
                sys.stderr.write("%s: Error: Ending segment before starting it: n=%d\n" % (sys.argv[0], n))
            if self.S[n]:
                p = n + 1
                while p < self.N and not self.E[p]:
                    assert (not self.S[p])
                    p += 1
                assert (p == self.N or self.E[p])
                segments.append((n,p))
                max_end_time = p
                if p < self.N and self.S[p]:
                    n = p - 1
                else:
                    n = p
            n += 1

        if len(segments) == 0:
            sys.stderr.write("%s: Warning: no segments for recording %s\n" % (sys.argv[0], self.file_id))
            sys.exit(1)

        ############################################################################
        # Analysis section
        self.C = ["0"] * self.N
        C = self.C
        a = Analysis(self.file_id, self.frame_shift, "Analysis final")
        if self.reference != None:
            count = 0
            in_seg = False
            for i in range(0,self.N):
                if in_seg and self.E[i]:
                    in_seg = False
                if i == 0 and self.S[i]:
                    in_seg = True
                if not in_seg and self.S[i]:
                    in_seg = True
                if self.reference[i] == "0" and not in_seg:
                    C[i] = "0"
                elif self.reference[i] == "0" and in_seg:
                    C[i] = "2"
                elif self.reference[i] == "1" and not in_seg:
                    C[i] = "3"
                elif self.reference[i] == "1" and in_seg:
                    C[i] = "5"
                elif self.reference[i] == "2" and not in_seg:
                    C[i] = "6"
                elif self.reference[i] == "2" and in_seg:
                    C[i] = "8"
                if i > 0 and C[i-1] != C[i]:
                    a.state_count[int(C[i-1])].append(count)
                    a.markers[int(C[i-1])].append(i - count)
                    a.phones[int(C[i-1])].append(' '.join(set(self.P[i-count:i])))
                    count = 1
                else:
                    count += 1
            for j in range(0,9):
                a.confusion_matrix[j] = sum([C[i] == str(j) for i in range(0,self.N)])
            if self.options.verbose > 0:
                a.write_confusion_matrix()
                a.write_length_stats()
            if self.options.verbose > 1:
                a.write_markers()
            global_analysis_final.add(a)
        ############################################################################

        # We print the times in hundredths of a second (regardless of the
        # value of frame_shift), and first need to know how many digits we
        # need, since we print with a zero-padded format like "%05d".
        max_end_time_hundredths_second = int(100.0 * self.frame_shift * max_end_time)
        num_digits = 1
        i = 1
        while i < max_end_time_hundredths_second:
            i *= 10
            num_digits += 1
        format_str = r"%0" + "%d" % num_digits + "d"   # e.g. "%05d"
        for start, end in segments:
            assert (end > start)
            start_seconds = "%.2f" % (self.frame_shift * start)
            end_seconds = "%.2f" % (self.frame_shift * end)
            start_str = format_str % (start * self.frame_shift * 100.0)
            end_str = format_str % (end * self.frame_shift * 100.0)
            utterance_id = "%s%s%s%s%s" % (self.file_id, self.options.first_separator, start_str, self.options.second_separator, end_str)
            # Output:
            out_file_handle.write("%s %s %s %s\n" % (utterance_id, self.file_id, start_seconds, end_seconds))
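    # Worked example for print_segments (illustrative, with a hypothetical
    # recording "rec1"): a single segment spanning frames [123, 456) at
    # frame_shift 0.01 gives max_end_time_hundredths_second = 456, hence a
    # "%04d" format, and the output line
    #   rec1-0123-0456 rec1 1.23 4.56
    # assuming the default "-" separators.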
    # Some intermediate-stage analysis of the segmentation
    def segmentation_analysis(self, title = "Analysis"):
        # In this analysis, we are trying to find, in each segment, the
        # number of frames that are speech, noise and silence in the
        # reference RTTM.
        # First get the segment starts and segment ends.
        # Note that they are in sync by construction.
        segment_starts = [i for i in range(0,self.N) if self.S[i]]
        segment_ends = [i for i in range(0,self.N+1) if self.E[i]]
        D = {}
        for i,st in enumerate(segment_starts):
            en = segment_ends[i]
            types = {}
            for val in self.reference[st:en]:
                # The segment is defined by the indices st:en.
                # Count the number of frames in the segment that are silence,
                # speech and noise in the reference.
                types[val] = types.get(val,0) + 1
            # End for loop over a particular segment
            # Make a tuple out of the counts of the types of frames
            D[st] = (en, types.get("0",0), types.get("1", 0), types.get("2", 0))
        # End for loop over all segments

        a = Analysis(self.file_id, None, title)
        for st, info in D.items():
            en = info[0]
            if info[1] > 0 and info[2] == 0 and info[3] == 0:
                # All frames silence
                a.confusion_matrix[0] += 1
                a.state_count[0].append((en-st,)+info[1:])
                a.type_counts[0][0].append(info[1])
                a.type_counts[1][0].append(info[2])
                a.type_counts[2][0].append(info[3])
                a.markers[0].append(st)
            elif info[1] == 0 and info[2] > 0 and info[3] == 0:
                # All frames noise
                a.confusion_matrix[1] += 1
                a.state_count[1].append((en-st,)+info[1:])
                a.type_counts[0][1].append(info[1])
                a.type_counts[1][1].append(info[2])
                a.type_counts[2][1].append(info[3])
                a.markers[1].append(st)
            elif info[1] == 0 and info[2] == 0 and info[3] > 0:
                # All frames speech
                a.confusion_matrix[2] += 1
                a.state_count[2].append((en-st,)+info[1:])
                a.type_counts[0][2].append(info[1])
                a.type_counts[1][2].append(info[2])
                a.type_counts[2][2].append(info[3])
                a.markers[2].append(st)
            elif info[1] > 0 and info[2] > 0 and info[3] == 0:
                # Segment contains both silence and noise
                a.confusion_matrix[3] += 1
                a.state_count[3].append((en-st,)+info[1:])
                a.type_counts[0][3].append(info[1])
                a.type_counts[1][3].append(info[2])
                a.type_counts[2][3].append(info[3])
                a.markers[3].append(st)
            elif info[1] > 0 and info[2] == 0 and info[3] > 0:
                # Segment contains both silence and speech
                a.confusion_matrix[4] += 1
                a.type_counts[0][4].append(info[1])
                a.type_counts[1][4].append(info[2])
                a.type_counts[2][4].append(info[3])
                a.state_count[4].append((en-st,)+info[1:])
                a.markers[4].append(st)
            elif info[1] == 0 and info[2] > 0 and info[3] > 0:
                # Segment contains both noise and speech
                a.confusion_matrix[5] += 1
                a.state_count[5].append((en-st,)+info[1:])
                a.type_counts[0][5].append(info[1])
                a.type_counts[1][5].append(info[2])
                a.type_counts[2][5].append(info[3])
                a.markers[5].append(st)
            elif info[1] > 0 and info[2] > 0 and info[3] > 0:
                # Segment contains silence, noise and speech
                a.confusion_matrix[6] += 1
                a.state_count[6].append((en-st,)+info[1:])
                a.type_counts[0][6].append(info[1])
                a.type_counts[1][6].append(info[2])
                a.type_counts[2][6].append(info[3])
                a.markers[6].append(st)
            else:
                # Should never be here
                assert (False)
            # End if
        # End for loop over all stats
        return a
    # End function segmentation_analysis

def map_prediction(A1, A2, phone_map, speech_cap = None, f = None):
    if A2 == None:
        # Isolated segmentation
        B = []
        prev_x = None
        len_x = 0
        i = 0
        for x in A1:
            if prev_x == None or x == prev_x:
                len_x += 1
            else:
                assert (len_x > 0)
                #sys.stderr.write("PHONE_LENGTH %s %d %s %d\n" % (prev_x, len_x, f, i - len_x))
                if phone_map[prev_x] == "0":
                    B.extend(["0"] * len_x)
                elif (speech_cap != None and len_x > speech_cap) or phone_map[prev_x] == "1":
                    B.extend(["4"] * len_x)
                elif phone_map[prev_x] == "2":
                    B.extend(["8"] * len_x)
                # End if
                len_x = 1
            # End if
            prev_x = x
            i += 1
        # End for
        try:
            assert (len_x > 0)
        except AssertionError as e:
            sys.stderr.write(repr(e) + "\n")
            sys.stderr.write("In file %s\n" % f)
            sys.exit(1)
        if phone_map[prev_x] == "0":
            B.extend(["0"] * len_x)
        elif (speech_cap != None and len_x > speech_cap) or phone_map[prev_x] == "1":
            B.extend(["4"] * len_x)
        elif phone_map[prev_x] == "2":
            B.extend(["8"] * len_x)
        # End if
        return B
    # End if (isolated segmentation)

    # Assuming len(A1) >= len(A2); otherwise A1 and A2 must be interchanged
    # before being passed to this function.
    B1 = []
    B2 = []
    for i in range(0, len(A2)):
        if phone_map[A1[i]] == "0" and phone_map[A2[i]] == "0":
            B1.append("0")
            B2.append("0")
        if phone_map[A1[i]] == "0" and phone_map[A2[i]] == "1":
            B1.append("1")
            B2.append("3")
        if phone_map[A1[i]] == "0" and phone_map[A2[i]] == "2":
            B1.append("2")
            B2.append("6")
        if phone_map[A1[i]] == "1" and phone_map[A2[i]] == "0":
            B1.append("3")
            B2.append("1")
        if phone_map[A1[i]] == "1" and phone_map[A2[i]] == "1":
            B1.append("4")
            B2.append("4")
        if phone_map[A1[i]] == "1" and phone_map[A2[i]] == "2":
            B1.append("5")
            B2.append("7")
        if phone_map[A1[i]] == "2" and phone_map[A2[i]] == "0":
            B1.append("6")
            B2.append("2")
        if phone_map[A1[i]] == "2" and phone_map[A2[i]] == "1":
            B1.append("7")
            B2.append("5")
        if phone_map[A1[i]] == "2" and phone_map[A2[i]] == "2":
            B1.append("8")
            B2.append("8")
    for i in range(len(A2), len(A1)):
        if phone_map[A1[i]] == "0":
            B1.append("0")
            B2.append("0")
        if phone_map[A1[i]] == "1":
            B1.append("3")
            B2.append("1")
        if phone_map[A1[i]] == "2":
            B1.append("6")
            B2.append("2")
    return (B1, B2)
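# Worked example for map_prediction (illustrative): in the joint case the
# labels follow the 3*this + that encoding, so a frame whose phone maps to
# class "2" (speech) on channel 1 and class "1" (noise) on channel 2 yields
# B1[i] == "7" and B2[i] == "5". Past the end of the shorter channel, the
# other channel is treated as silence ("0").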
phone_map[A1[i]] == "0" and phone_map[A2[i]] == "0": B1.append("0") B2.append("0") if phone_map[A1[i]] == "0" and phone_map[A2[i]] == "1": B1.append("1") B2.append("3") if phone_map[A1[i]] == "0" and phone_map[A2[i]] == "2": B1.append("2") B2.append("6") if phone_map[A1[i]] == "1" and phone_map[A2[i]] == "0": B1.append("3") B2.append("1") if phone_map[A1[i]] == "1" and phone_map[A2[i]] == "1": B1.append("4") B2.append("4") if phone_map[A1[i]] == "1" and phone_map[A2[i]] == "2": B1.append("5") B2.append("7") if phone_map[A1[i]] == "2" and phone_map[A2[i]] == "0": B1.append("6") B2.append("2") if phone_map[A1[i]] == "2" and phone_map[A2[i]] == "1": B1.append("7") B2.append("5") if phone_map[A1[i]] == "2" and phone_map[A2[i]] == "2": B1.append("8") B2.append("8") for i in range(len(A2), len(A1)): if phone_map[A1[i]] == "0": B1.append("0") B2.append("0") if phone_map[A1[i]] == "1": B1.append("3") B2.append("1") if phone_map[A1[i]] == "2": B1.append("6") B2.append("2") return (B1, B2) def main(): parser = ArgumentParser(description='Get segmentation arguments') parser.add_argument('--verbose', type=int, \ dest='verbose', default=0, \ help='Give higher verbose for more logging (default: %(default)s)') parser.add_argument('--silence-proportion', type=float, \ dest='silence_proportion', default=0.05, \ help="The amount of silence at the sides of segments is " \ + "tuned to give this proportion of silence. (default: %(default)s)") parser.add_argument('--frame-shift', type=float, \ dest='frame_shift', default=0.01, \ help="Time difference between adjacent frame (default: %(default)s)s") parser.add_argument('--max-segment-length', type=float, \ dest='max_segment_length', default=10.0, \ help="Maximum segment length while we are marging segments (default: %(default)s)") parser.add_argument('--hard-max-segment-length', type=float, \ dest='hard_max_segment_length', default=15.0, \ help="Hard maximum on the segment length above which the segment " \ + "will be broken even if in the middle of speech (default: %(default)s)") parser.add_argument('--first-separator', type=str, \ dest='first_separator', default="-", \ help="Separator between recording-id and start-time (default: %(default)s)") parser.add_argument('--second-separator', type=str, \ dest='second_separator', default="-", \ help="Separator between start-time and end-time (default: %(default)s)") parser.add_argument('--remove-noise-only-segments', type=str, \ dest='remove_noise_only_segments', default="true", choices=("true", "false"), \ help="Remove segments that have only noise. 
(default: %(default)s)") parser.add_argument('--min-inter-utt-silence-length', type=float, \ dest='min_inter_utt_silence_length', default=1.0, \ help="Minimum silence that must exist between two separate utterances (default: %(default)s)"); parser.add_argument('--channel1-file', type=str, \ dest='channel1_file', default="inLine", \ help="String that matches with the channel 1 file (default: %(default)s)") parser.add_argument('--channel2-file', type=str, \ dest='channel2_file', default="outLine", \ help="String that matches with the channel 2 file (default: %(default)s)") parser.add_argument('--isolated-resegmentation', \ dest='isolated_resegmentation', \ action='store_true', help="Do not do joint segmentation (default: %(default)s)") parser.add_argument('--max-length-diff', type=float, \ dest='max_length_diff', default=1.0, \ help="Maximum difference in the lengths of the two channels for joint " \ + "segmentation to be done (default: %(default)s)") parser.add_argument('--reference-rttm', dest='reference_rttm', \ help="RTTM file to compare and get statistics (default: %(default)s)") parser.add_argument('--speech-cap-length', type=float, default=None, \ help="Maximum length in seconds of a particular speech phone prediction." \ + "\nAny length above this will be considered as noise") parser.add_argument('prediction_dir', \ help='Directory where the predicted phones (.pred files) are found') parser.add_argument('phone_map', \ help='Phone Map file that maps from phones to classes') parser.add_argument('output_segments', nargs='?', default="-", \ help='Output segments file') parser.usage=':'.join(parser.format_usage().split(':')[1:]) \ + 'e.g. : %(prog)s exp/tri4b_whole_resegment_dev10h/pred exp/tri4b_whole_resegment_dev10h/phone_map.txt data/dev10h.seg/segments' options = parser.parse_args() sys.stderr.write(' '.join(sys.argv) + "\n") if not ( options.silence_proportion \ > 0.01 and options.silence_proportion < 0.99 ): sys.stderr.write("%s: Error: Invalid silence-proportion value %f\n" \ % options.silence_proportion) sys.exit(1) if not ( options.remove_noise_only_segments == "false" or options.remove_noise_only_segments == "true" ): sys.stderr.write("%s: Error: Invalid value for remove-noise-only segments %s. 
    if options.output_segments == '-':
        out_file = sys.stdout
    else:
        try:
            out_file = open(options.output_segments, 'w')
        except IOError as e:
            sys.stderr.write("%s: %s: Unable to open file %s\n" % (sys.argv[0], e, options.output_segments))
            sys.exit(1)
    # End if

    phone_map = {}
    try:
        for line in open(options.phone_map).readlines():
            phone, cls = line.strip().split()
            phone_map[phone] = cls
    except IOError as e:
        sys.stderr.write(repr(e) + "\n")
        sys.exit(1)

    prediction_dir = options.prediction_dir
    channel1_file = options.channel1_file
    channel2_file = options.channel2_file

    temp_dir = prediction_dir + "/../rttm_classes"
    os.system("mkdir -p %s" % temp_dir)
    if options.reference_rttm != None:
        read_rttm_file(options.reference_rttm, temp_dir, options.frame_shift)
    else:
        temp_dir = None

    stats = Stats()

    pred_files = dict([ (f.split('/')[-1][0:-5], False) \
            for f in glob.glob(os.path.join(prediction_dir, "*.pred")) ])

    global global_analysis_get_initial_segments
    global_analysis_get_initial_segments = Analysis("TOTAL_Get_Initial_Segments", options.frame_shift, "Global Analysis after get_initial_segments")
    global global_analysis_set_nonspeech_proportion
    global_analysis_set_nonspeech_proportion = Analysis("TOTAL_set_nonspeech_proportion", options.frame_shift, "Global Analysis after set_nonspeech_proportion")
    global global_analysis_final
    global_analysis_final = Analysis("TOTAL_Final", options.frame_shift, "Global Analysis Final")

    speech_cap = None
    if options.speech_cap_length != None:
        speech_cap = int( options.speech_cap_length / options.frame_shift )
    # End if

    for f in pred_files:
        if pred_files[f]:
            continue
        if re.match(".*_"+channel1_file, f) is None:
            if re.match(".*_"+channel2_file, f) is None:
                sys.stderr.write("%s does not match pattern .*_%s or .*_%s\n" \
                        % (f, channel1_file, channel2_file))
                sys.exit(1)
            else:
                f1 = f
                f2 = f
                f1 = re.sub("(.*_)"+channel2_file, r"\1"+channel1_file, f1)
        else:
            f1 = f
            f2 = f
            f2 = re.sub("(.*_)"+channel1_file, r"\1"+channel2_file, f2)

        if options.isolated_resegmentation or f2 not in pred_files or f1 not in pred_files:
            pred_files[f] = True
            try:
                A = open(os.path.join(prediction_dir, f+".pred")).readline().strip().split()[1:]
            except IndexError:
                sys.stderr.write("Incorrect format of file %s/%s.pred\n" % (prediction_dir, f))
                sys.exit(1)
            B = map_prediction(A, None, phone_map, speech_cap, f)
            if temp_dir != None:
                try:
                    reference = open(os.path.join(temp_dir, f+".ref")).readline().strip().split()[1:]
                except IOError:
                    reference = None
            else:
                reference = None
            r = JointResegmenter(A, B, f, options, phone_map, stats, reference)
            r.resegment()
            r.print_segments(out_file)
        else:
            if pred_files[f1] and pred_files[f2]:
                continue
            pred_files[f1] = True
            pred_files[f2] = True
            try:
                A1 = open(os.path.join(prediction_dir, f1+".pred")).readline().strip().split()[1:]
            except IndexError:
                sys.stderr.write("Incorrect format of file %s/%s.pred\n" % (prediction_dir, f1))
                sys.exit(1)
            try:
                A2 = open(os.path.join(prediction_dir, f2+".pred")).readline().strip().split()[1:]
            except IndexError:
                sys.stderr.write("Incorrect format of file %s/%s.pred\n" % (prediction_dir, f2))
                sys.exit(1)
            if len(A1) < len(A2):
                A3 = A1
                A1 = A2
                A2 = A3
                f3 = f1
                f1 = f2
                f2 = f3
            # End if
            if (len(A1) - len(A2)) > options.max_length_diff / options.frame_shift:
                sys.stderr.write( \
                        "%s: Warning: Lengths of %s and %s differ by more than %f. " \
                        % (sys.argv[0], f1, f2, options.max_length_diff) \
                        + "So using isolated resegmentation\n")
                B1 = map_prediction(A1, None, phone_map, speech_cap)
                B2 = map_prediction(A2, None, phone_map, speech_cap)
            else:
                B1, B2 = map_prediction(A1, A2, phone_map, speech_cap)
            # End if
" \ % (sys.argv[0], f1,f2, options.max_length_diff) \ + "So using isolated resegmentation\n") B1 = map_prediction(A1, None, phone_map, speech_cap) B2 = map_prediction(A2, None, phone_map, speech_cap) else: B1,B2 = map_prediction(A1, A2, phone_map, speech_cap) # End if if temp_dir != None: try: reference1 = open(os.path.join(temp_dir, f1+".ref")).readline().strip().split()[1:] except IOError: reference1 = None else: reference1 = None r1 = JointResegmenter(A1, B1, f1, options, phone_map, stats, reference1) r1.resegment() r1.print_segments(out_file) if temp_dir != None: try: reference2 = open(os.path.join(temp_dir, f2+".ref")).readline().strip().split()[1:] except IOError: reference2= None else: reference2 = None r2 = JointResegmenter(A1, B2, f2, options, phone_map, stats, reference2) r2.resegment() r2.restrict(len(A2)) r2.print_segments(out_file) # End if # End for loop over files if options.reference_rttm != None: global_analysis_get_initial_segments.write_confusion_matrix(True) global_analysis_get_initial_segments.write_total_stats(True) global_analysis_get_initial_segments.write_length_stats() global_analysis_set_nonspeech_proportion.write_confusion_matrix(True) global_analysis_set_nonspeech_proportion.write_total_stats(True) global_analysis_set_nonspeech_proportion.write_length_stats() global_analysis_final.write_confusion_matrix(True) global_analysis_final.write_total_stats(True) global_analysis_final.write_length_stats() if __name__ == '__main__': main()