/*************************************************************************** * This file is part of the 'Shout LVCS Recognition toolkit'. * *************************************************************************** * Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010 by Marijn Huijbregts * * * * This program is free software; you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * * the Free Software Foundation; version 2 of the License. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU General Public License * * along with this program; if not, write to the * * Free Software Foundation, Inc., * * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * ***************************************************************************/ #include #include #include #include #include "standard.h" #include "shout_cluster.h" #include "shout-misc.h" #include "shoutconfig.h" using namespace StringFunctions; #include extern int SEGMENT_INITIAL_DATA_SEGMENTS; // extern int MINPASSES; extern int MERGE_METHOD; extern int STOP_METHOD; ///////////////////////////////////////////////////////////////////////////////////////////////////// /// This is the main function for the shout_cluster application. ///////////////////////////////////////////////////////////////////////////////////////////////////// int main(int argc, char *argv[]) { ShoutConfig myConfig(APPID_CLUSTER, argc, argv); char *feature2 = NULL; if(myConfig.parIsDefined(ARGUMENT_SECOND_STREAM)) { feature2 = myConfig.getStringValue(ARGUMENT_SECOND_STREAM); } char *feature3 = NULL; if(myConfig.parIsDefined(ARGUMENT_THIRD_STREAM)) { feature3 = myConfig.getStringValue(ARGUMENT_THIRD_STREAM); } int mergeAtaTime = 1; if(myConfig.parIsDefined(ARGUMENT_FAST_MERGE)) { mergeAtaTime = (int)(myConfig.getFloatValue(ARGUMENT_FAST_MERGE)); if(mergeAtaTime <= 0) { USER_ERROR("The 'fast merge' number must be 1 or higher."); } } char fileSAD2[MAXSTR]; strcpy(fileSAD2,""); if(myConfig.parIsDefined(ARGUMENT_META_IN)) { strcpy(fileSAD2,myConfig.getStringValue(ARGUMENT_META_IN)); strcat(fileSAD2,".train"); } int maxNumberOfClusters = -1; if(myConfig.parIsDefined(ARGUMENT_MAX_CLUSTERS)) { maxNumberOfClusters = (int)(myConfig.getFloatValue(ARGUMENT_MAX_CLUSTERS)); } Shout_Cluster Shout_Cluster(myConfig.getStringValue(ARGUMENT_AUDIOFILE), myConfig.getStringValue(ARGUMENT_LABEL), fileSAD2, myConfig.getStringValue(ARGUMENT_META_IN), myConfig.getStringValue(ARGUMENT_META_OUT), feature2, 0.9, feature3, mergeAtaTime, maxNumberOfClusters, myConfig.parIsDefined(ARGUMENT_WIDEN_SEGMENTS)); return 0; } ///////////////////////////////////////////////////////////////////////////////////////////////////// /// This constructor reads a feature file in order to segment and cluster it. /// /// Energy based Speaker Activity Detection is used for a first rough speech/non-speech clustering. /// After that, the method in Train_Speaker_Segmenter is used to obtain a speaker segmentation. ///////////////////////////////////////////////////////////////////////////////////////////////////// Shout_Cluster::Shout_Cluster(char *featureFile, char *label, char *inputRTTM_train, char *inputRTTM_decode, char *outputRTTM, char *featureFile2, double initWeight, char *featureFile3, int fastMerge, int maxClusters, bool widen) { FeaturePool_Multistream_file files[3]; char refCheat[MAXSTR]; refCheat[0] = 0; strcpy(refCheat, inputRTTM_decode); strcat(refCheat,".ref"); FILE *test = fopen(inputRTTM_train,"rt"); if(test == NULL) { inputRTTM_train = inputRTTM_decode; } else { fclose(test); } files[0].fileName = featureFile; files[0].fileType = FILE_TYPE_RAW_DIARIZATION; PRINTF2("Feature file: %s\n",featureFile); FILE *testFile = fopen(featureFile2,"rb"); bool useSecondStream = false; bool useThirdStream = false; if(testFile != NULL) { PRINTF2("Second feature file: %s\n",featureFile2); useSecondStream = true; fclose(testFile); files[1].fileName = featureFile2; files[1].fileType = FILE_TYPE_TEXT; testFile = fopen(featureFile3,"rb"); if(testFile != NULL) { PRINTF2("Third feature file: %s\n",featureFile3); useThirdStream = true; fclose(testFile); files[2].fileName = featureFile3; files[2].fileType = FILE_TYPE_TEXT; } else { PRINTF("Not using a third feature file.\n"); } } else { PRINTF("Not using a second feature file.\n"); } PRINTF2("Label: %s\n",label); PRINTF2("RTTM for training: %s\n",inputRTTM_train); PRINTF2("RTTM for decoding: %s\n",inputRTTM_decode); PRINTF2("RTTM for cheating: %s\n",refCheat); PRINTF2("Output RTTM: %s\n",outputRTTM); PRINTF("Loading feature vector file...\n"); int len = strlen(featureFile); FeaturePool *feaPool = NULL; FILE_TYPE fileType = FILE_TYPE_SHOUT; if((featureFile[len-3] == 'r' && featureFile[len-2] == 'a' && featureFile[len-1] == 'w') || (featureFile[len-3] == 'R' && featureFile[len-2] == 'A' && featureFile[len-1] == 'W') ) { PRINTF("Raw file detected...\n"); fileType = FILE_TYPE_RAW_DIARIZATION; } if(useThirdStream) { double weight[3] = {0.75,0.25,0.05}; feaPool = new FeaturePool(3,files,weight,true); } else if(useSecondStream) { double weight[2] = {initWeight,1.0-initWeight}; feaPool = new FeaturePool(2,files,weight,true); } else { feaPool = new FeaturePool(featureFile,fileType,false,1.0,false,true); } char filterName[2000]; strcpy(filterName,featureFile); strcat(filterName,".asr"); // feaPool->setFilter(filterName,FEATURE_SPEAKER_SIZE,0.8); feaPool->enableFilter(false); int sadID_train = feaPool->claimSegmentationID(); int sadID_decode = feaPool->claimSegmentationID(); int segID_cheat = -1; // segID_cheat = maxClusters; // maxClusters = 40; PRINTF2("Done, total number of frames: %d\n",feaPool->getPoolLength()); // Use the input RTTM file to create segments: PRINTF("Reading SAD RTTM file for training...\n"); FILE *inSegFile = fopen(inputRTTM_train,"rt"); if(inSegFile == NULL) { USER_ERROR("ERROR: Could not read the input RTTM file for training!\n"); } char segmentString[60]; strcpy(segmentString,"SPEECH"); strcpy(&(segmentString[20]),"SIL"); strcpy(&(segmentString[40]),"SOUND"); feaPool->readRTTM(sadID_train,segmentString,3,20,0,inSegFile); fclose(inSegFile); PRINTF("Reading SAD RTTM file for decoding...\n"); inSegFile = fopen(inputRTTM_decode,"rt"); if(inSegFile == NULL) { USER_ERROR("ERROR: Could not read the input RTTM file for decoding!\n"); } feaPool->readRTTM(sadID_decode,segmentString,3,20,0,inSegFile); fclose(inSegFile); PRINTF("Reading reference RTTM file for cheating...\n"); inSegFile = NULL; if(refCheat != NULL && strlen(refCheat) > 0) { inSegFile = fopen(refCheat,"rt"); } if(inSegFile == NULL) { PRINTF("Could not open reference file. I guess we're not going to cheat today...\n"); } else { segID_cheat = feaPool->claimSegmentationID(); feaPool->readRTTM(segID_cheat,NULL,-1,-1,3,inSegFile); fclose(inSegFile); } PRINTF("Creating features...\n"); fflush(stdout); if(useSecondStream) { feaPool->createNewMultiStreamPool(files, sadID_train); } else { feaPool->createNewPool(featureFile, NULL, NULL, false, false, sadID_train); } // Now train! PRINTF("Start training..\n"); // segmenter = new Train_Segmenter(feaPool,sadID_train,sadID_decode,label, fastMerge, 14, 18, widen); segmenter = new Train_Segmenter(feaPool,sadID_train,sadID_decode,label, fastMerge, 16, maxClusters, widen, NULL); segmenter->train(0, label, outputRTTM, NULL, segID_cheat); delete segmenter; } ///////////////////////////////////////////////////////////////////////////////////////////////////// /// The destructor is empty. ///////////////////////////////////////////////////////////////////////////////////////////////////// Shout_Cluster::~Shout_Cluster() { }