/*************************************************************************** * This file is part of the 'Shout LVCS Recognition toolkit'. * *************************************************************************** * Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010 by Marijn Huijbregts * * * * This program is free software; you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * * the Free Software Foundation; version 2 of the License. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU General Public License * * along with this program; if not, write to the * * Free Software Foundation, Inc., * * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * ***************************************************************************/ #include #include #include #include "standard.h" #include "shout_vtln.h" #include "lexicaltree.h" #include "featurepool.h" #include "phonemodel.h" #include "shout-misc.h" #include "featureextraction.h" using namespace StringFunctions; using namespace WriteFileLittleBigEndian; #define MAX_STRING (5000) ///////////////////////////////////////////////////////////////////////////////////////////////////// /// \todo Documentation ///////////////////////////////////////////////////////////////////////////////////////////////////// Shout_VTLN::Shout_VTLN(char *method, char *AMFile, char *audioRttmIn, char *audioName, char *audioRttmOut, double *totals) { lastSpeakerVTLN = VTLN_STEPS / 2; SegmentationAdmin segmentList; int vectorSize = VTLN_DEFAULT_MFCC_NUMBER+VTLN_DEFAULT_MFCC_ENERGY + VTLN_DEFAULT_MFCC_ZEROCROSS + 2*VTLN_DEFAULT_MFCC_DELTA*(VTLN_DEFAULT_MFCC_NUMBER+VTLN_DEFAULT_MFCC_ENERGY+VTLN_DEFAULT_MFCC_ZEROCROSS); int numberOfModels = 0; int numberOfClusters = 0; int useModel = -1; PhoneModel **models = NULL; if(audioRttmOut != NULL) { PRINTF("# Reading acoustic models...\n"); fflush(stdout); } // Creating the acoustic models: FILE *modelFile = fopen(AMFile, "rb"); if(modelFile==NULL) { USER_ERROR("No valid AM file passed\n"); } int vectSize; freadEndianSafe(&vectSize,1,sizeof(vectSize),modelFile); if(vectorSize != vectSize) { USER_ERROR("This AM file contains models with incompatible vector size!"); } freadEndianSafe(&numberOfModels,1,sizeof(numberOfModels),modelFile); freadEndianSafe(&numberOfClusters,1,sizeof(numberOfClusters),modelFile); if(numberOfClusters > 1) { USER_WARNING("This AM contains clusters. Only the first cluster will be used!\n"); } if(numberOfModels <= 0 || numberOfClusters <= 0) { USER_ERROR("This AM file does not contain any acoustic models!\n"); } else { models = new PhoneModel*[numberOfModels]; for(int i=0;igetStatistics()->name); fflush(stdout); } if(strncmp(models[i]->getStatistics()->name,"SPEECH ",6)==0) { useModel = i; } } if(audioRttmOut != NULL) { PRINTF2("# Using model %02d\n",useModel); fflush(stdout); } if(useModel < 0) { USER_ERROR("Sorry, no 'SPEECH' AM model in this file."); } fclose(modelFile); modelFile = NULL; } FeaturePool *featurePool[VTLN_STEPS]; double vtln = VTLN_START; char label[MAX_STRING]; FILE *listFile; int segID = 0; for(int vtcount=0;vtcountclaimSegmentationID(); if(vtcount == 0) { segID = s; } featurePool[vtcount]->readRTTM(s,NULL,-1,-1,3,listFile,-1,0,0,label); fclose(listFile); featurePool[vtcount]->createNewPool(audioName, NULL, NULL, false, false, s, vtln); // Segments are used to find SAD... vtln+=VTLN_STEPSIZE; } if(strcmp(method,"SEGMENT") == 0) { Vector *vector = featurePool[0]->getFirstVectorFirstSegment(&segmentList,segID,-1,true); while(vector != NULL) { int bestVtln = VTLN_STEPS/2 + 1; int time = featurePool[0]->getCurSegmentStart(&segmentList); int length = featurePool[0]->getCurSegmentLen(&segmentList); double bestScore = -9.0e300; for(int vtcount=0;vtcountgetLogPDFProbability(0,featurePool[vtcount]->getVector(NULL,i)); } if(like > bestScore) { bestScore = like; bestVtln = vtcount; } } featurePool[0]->setSegmentID(TRAIN_CONTEXTRIGHT,bestVtln,&segmentList); vector = featurePool[0]->getFirstVectorNextSegment(&segmentList,true); } if(audioRttmOut != NULL) { listFile = fopen(audioRttmOut,"wt"); featurePool[0]->writeRTTM(segID, label, "SPK", listFile, 0); fclose(listFile); } } else if(strcmp(method,"SPEAKER") == 0) { for(int spk=0;spkgetNumberOfSpeakers(segID);spk++) { int spkID = featurePool[0]->getSpeakerID(segID,spk); int bestVtln = VTLN_STEPS/2 + 1; double bestScore = -9.0e300; for(int vtcount=0;vtcountgetFirstVectorFirstSegment(&segmentList,segID,spkID,false); while(vector != NULL) { int time = featurePool[0]->getCurSegmentStart(&segmentList); int length = featurePool[0]->getCurSegmentLen(&segmentList); for(int i=time;igetLogPDFProbability(0,featurePool[vtcount]->getVector(NULL,i)); } vector = featurePool[0]->getFirstVectorNextSegment(&segmentList,false); } if(audioRttmOut != NULL) { PRINTF3("Score using VTLN factor number %02d\t%f\n",vtcount,like); } if(totals != NULL) { totals[vtcount] = like; } if(like > bestScore) { bestScore = like; bestVtln = vtcount; } } lastSpeakerVTLN = bestVtln; // Set the ID for each segment: Vector *vector = featurePool[0]->getFirstVectorFirstSegment(&segmentList,segID,spkID,false); while(vector != NULL) { featurePool[0]->setSegmentID(TRAIN_CONTEXTRIGHT,bestVtln,&segmentList); vector = featurePool[0]->getFirstVectorNextSegment(&segmentList,false); } } if(audioRttmOut != NULL) { listFile = fopen(audioRttmOut,"wt"); featurePool[0]->writeRTTM(segID, label, "SPK", listFile, 0); fclose(listFile); } } else { USER_ERROR("Sorry, this VTLN method is not yet implemented... Use 'SEGMENT' for now.\n"); } /* for(int i=0;i