/***************************************************************************
 *   This file is part of the 'Shout LVCS Recognition toolkit'.           *
 ***************************************************************************
 *   Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010                     *
 *   by Marijn Huijbregts                                                 *
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify *
 *   it under the terms of the GNU General Public License as published by *
 *   the Free Software Foundation; version 2 of the License.              *
 *                                                                         *
 *   This program is distributed in the hope that it will be useful,      *
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of       *
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the         *
 *   GNU General Public License for more details.                         *
 *                                                                         *
 *   You should have received a copy of the GNU General Public License    *
 *   along with this program; if not, write to the                        *
 *   Free Software Foundation, Inc.,                                      *
 *   59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.             *
 ***************************************************************************/

#include "languagemodel.h"
// The original angle-bracket includes were lost during extraction; the system
// headers below cover the stdio, string, math and assert functionality used in this file.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <assert.h>
#include "hash.h"
#include "shout-misc.h"

using namespace WriteFileLittleBigEndian;

/////////////////////////////////////////////////////////////////////////////////////////////////////
/// The standard constructor only initialises some internal variables.
/////////////////////////////////////////////////////////////////////////////////////////////////////

LanguageModel::LanguageModel()
{
  uni_lmData         = NULL;
  bi_lmData          = NULL;
  tri_lmData         = NULL;
  four_lmData        = NULL;
  uni_tableLength    = 0;
  bi_tableLength     = 0;
  tri_tableLength    = 0;
  four_tableLength   = 0;
  bi_Hash            = NULL;
  tri_Hash           = NULL;
  four_Hash          = NULL;
  tri_HashListSearch = NULL;
  bi_lmDataList      = NULL;
  tri_lmDataList     = NULL;
}

/////////////////////////////////////////////////////////////////////////////////////////////////////
/// Creates a unigram LM.
/////////////////////////////////////////////////////////////////////////////////////////////////////

LanguageModel::LanguageModel(int nrUni)
{
  uni_tableLength  = nrUni;
  uni_lmData       = new LMEntryType_1[uni_tableLength];
  bi_lmData        = NULL;
  tri_lmData       = NULL;
  four_lmData      = NULL;
  bi_tableLength   = 0;
  tri_tableLength  = 0;
  four_tableLength = 0;
  bi_Hash          = NULL;
  tri_Hash         = NULL;
  four_Hash        = NULL;
  // Loop body reconstructed (lost during extraction): start with empty unigram entries.
  for(int i=0;i<uni_tableLength;i++)
  {
    uni_lmData[i].p       = 0.0;
    uni_lmData[i].backoff = 0.0;
  }
}

/////////////////////////////////////////////////////////////////////////////////////////////////////
/// Loads a language model from a binary LM file. (The original opening of this constructor was
/// lost during extraction; the signature, the memory-pool setup and the unigram section are
/// reconstructed by analogy with the bigram and trigram sections that survived.)
/////////////////////////////////////////////////////////////////////////////////////////////////////

LanguageModel::LanguageModel(const char *binlmFileName)
{
  tri_HashListSearch = NULL;
  bi_lmDataList      = NULL;
  tri_lmDataList     = NULL;
  uni_lmData         = NULL;
  bi_lmData          = NULL;
  tri_lmData         = NULL;
  four_lmData        = NULL;
  bi_Hash            = NULL;
  tri_Hash           = NULL;
  four_Hash          = NULL;
  uni_tableLength    = 0;
  bi_tableLength     = 0;
  tri_tableLength    = 0;
  four_tableLength   = 0;

  FILE *binlmFile = fopen(binlmFileName,"rb");
  assert(binlmFile != NULL);
  long fileOffsets[4] = {0,0,0,0};
  long offset         = 0;
  if(DO_MEMORY_MAPPED_FILE_LM)
  {
    // Assumption: 'memoryPool' receives the raw file contents so that the n-gram tables
    // can be used in place. The original setup code was lost during extraction.
    fseek(binlmFile,0,SEEK_END);
    long fileLength = ftell(binlmFile);
    fseek(binlmFile,0,SEEK_SET);
    memoryPool = new char[fileLength];
    fread(memoryPool,1,fileLength,binlmFile);
    fseek(binlmFile,0,SEEK_SET);
  }

  // Unigram table (reconstructed by analogy with the bigram/trigram sections below):
  if(DO_MEMORY_MAPPED_FILE_LM)
  {
    uni_tableLength = *((int*)(memoryPool+offset));
    offset += sizeof(int);
    fileOffsets[0]  = offset;
    offset += uni_tableLength * (sizeof(float)*2);
  }
  else
  {
    freadEndianSafe(&(uni_tableLength),1,sizeof(uni_tableLength),binlmFile);
    uni_lmData = new LMEntryType_1[uni_tableLength];
    for(int ientry=0;ientry<uni_tableLength;ientry++)
    {
      freadEndianSafe(&(uni_lmData[ientry].p),      1,sizeof(uni_lmData[ientry].p),      binlmFile);
      freadEndianSafe(&(uni_lmData[ientry].backoff),1,sizeof(uni_lmData[ientry].backoff),binlmFile);
    }
  }

  // Bigram table:
  if(DO_MEMORY_MAPPED_FILE_LM)
  {
    bi_tableLength = *((int*)(memoryPool+offset));
    offset += sizeof(int);
  }
  else
  {
    freadEndianSafe(&(bi_tableLength),1,sizeof(bi_tableLength),binlmFile);
  }
  if(bi_tableLength > 0)
  {
    if(DO_MEMORY_MAPPED_FILE_LM)
    {
      fileOffsets[1] = offset;
      offset += bi_tableLength * (sizeof(float)*2 + sizeof(int)*2);
      // fseek(binlmFile,bi_tableLength * (sizeof(float)*2 + sizeof(int)*2), SEEK_CUR);
    }
    else
    {
      bi_lmData = new LMEntryType_2[bi_tableLength];
      for(int ientry=0;ientry<bi_tableLength;ientry++)   // loop body reconstructed
      {
        freadEndianSafe(bi_lmData[ientry].words,2,sizeof(int),binlmFile);
        freadEndianSafe(&(bi_lmData[ientry].p),1,sizeof(bi_lmData[ientry].p),binlmFile);
#if LM_NGRAM_DEPTH > 1
        freadEndianSafe(&(bi_lmData[ientry].backoff),1,sizeof(bi_lmData[ientry].backoff),binlmFile);
#endif
      }
    }
/*
    int temp;
    freadEndianSafe(&(temp),1,sizeof(temp),binlmFile);
    assert(temp == uni_tableLength);
    if(DO_MEMORY_MAPPED_FILE_LM)
    {
      bi_lmDataList = &((LMListSearch_2*)(memoryPool+ftell(binlmFile)))[0];
      fseek(binlmFile,uni_tableLength * (sizeof(int)*2), SEEK_CUR);
    }
    else
    {
      bi_lmDataList = new LMListSearch_2[uni_tableLength];
      for(int ientry=0;ientry<uni_tableLength;ientry++)   // loop body reconstructed
      {
        freadEndianSafe(&(bi_lmDataList[ientry].lowestIndex),1,sizeof(int),binlmFile);
        freadEndianSafe(&(bi_lmDataList[ientry].length),     1,sizeof(int),binlmFile);
      }
    }
*/
    // Assumed by analogy with tri_Hash below; the original line was lost during extraction:
    bi_Hash = new Hash(binlmFile, memoryPool, &offset);
  }

#if LM_NGRAM_DEPTH > 1
  // Trigram table:
  if(DO_MEMORY_MAPPED_FILE_LM)
  {
    tri_tableLength = *((int*)(memoryPool+offset));
    offset += sizeof(int);
  }
  else
  {
    freadEndianSafe(&(tri_tableLength),1,sizeof(tri_tableLength),binlmFile);
  }
  if(tri_tableLength > 0)
  {
    if(DO_MEMORY_MAPPED_FILE_LM)
    {
      fileOffsets[2] = offset;
      offset += tri_tableLength * (sizeof(float) + sizeof(int)*3);
      // fseek(binlmFile,tri_tableLength * (sizeof(float) + sizeof(int)*3), SEEK_CUR);
    }
    else
    {
      tri_lmData = new LMEntryType_3[tri_tableLength];
      for(int ientry=0;ientry<tri_tableLength;ientry++)   // loop body reconstructed
      {
        freadEndianSafe(tri_lmData[ientry].words,3,sizeof(int),binlmFile);
        freadEndianSafe(&(tri_lmData[ientry].p),1,sizeof(tri_lmData[ientry].p),binlmFile);
#if LM_NGRAM_DEPTH > 2
        freadEndianSafe(&(tri_lmData[ientry].backoff),1,sizeof(tri_lmData[ientry].backoff),binlmFile);
#endif
      }
    }
    tri_Hash = new Hash(binlmFile, memoryPool, &offset);
/*
    freadEndianSafe(&(tri_tableLengthList),1,sizeof(tri_tableLengthList),binlmFile);
    if(DO_MEMORY_MAPPED_FILE_LM)
    {
      tri_lmDataList = &((LMListSearch_3*)(memoryPool+ftell(binlmFile)))[0];
      fseek(binlmFile,tri_tableLengthList * (sizeof(int)*4), SEEK_CUR);
    }
    else
    {
      tri_lmDataList = new LMListSearch_3[tri_tableLengthList];
      for(int ientry=0;ientry<tri_tableLengthList;ientry++)   // loop body reconstructed
      {
        freadEndianSafe(tri_lmDataList[ientry].words,          2,sizeof(int),binlmFile);
        freadEndianSafe(&(tri_lmDataList[ientry].lowestIndex), 1,sizeof(int),binlmFile);
        freadEndianSafe(&(tri_lmDataList[ientry].length),      1,sizeof(int),binlmFile);
      }
    }
*/
  }
#endif

/*
#if LM_NGRAM_DEPTH > 2
  four_tableLength = *((int*)(memoryPool+offset));
  offset += sizeof(int);
  // freadEndianSafe(&(four_tableLength),1,sizeof(four_tableLength),binlmFile);
  if(four_tableLength > 0)
  {
    four_lmData = new LMEntryType_4[four_tableLength];
    for(int ientry=0;ientry<four_tableLength;ientry++)   // loop body reconstructed
    {
      freadEndianSafe(four_lmData[ientry].words,4,sizeof(int),binlmFile);
      freadEndianSafe(&(four_lmData[ientry].p),1,sizeof(four_lmData[ientry].p),binlmFile);
#if LM_NGRAM_DEPTH > 3
      freadEndianSafe(&(four_lmData[ientry].backoff),1,sizeof(four_lmData[ientry].backoff),binlmFile);
#endif
    }
    four_Hash = new Hash(binlmFile);
  }
#endif
*/
  fclose(binlmFile);
  if(DO_MEMORY_MAPPED_FILE_LM)
  {
    uni_lmData = &((LMEntryType_1*)(memoryPool+fileOffsets[0]))[0];
    bi_lmData  = &((LMEntryType_2*)(memoryPool+fileOffsets[1]))[0];
    tri_lmData = &((LMEntryType_3*)(memoryPool+fileOffsets[2]))[0];
    //four_lmData = ((LMEntryType_4*)(memoryPool+fileOffsets[3]))[0];
  }
}
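/////////////////////////////////////////////////////////////////////////////////////////////////////
/// Usage sketch (illustration only, not part of the original toolkit). The file name is invented
/// and the constructor signature follows the reconstruction above; the authoritative declaration
/// is in languagemodel.h.
///
/// \code
///   LanguageModel lm("lm.bin");      // hypothetical binary LM file
///   lm.printInfo();                  // sizes of the 1- to 4-gram tables
///   assert(lm.checkValidity());      // debug: are the tables consistent with the hash tables?
/// \endcode
/////////////////////////////////////////////////////////////////////////////////////////////////////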
/////////////////////////////////////////////////////////////////////////////////////////////////////
/// Given the current word history, the last word ID is returned.
/////////////////////////////////////////////////////////////////////////////////////////////////////

int LanguageModel::getLastWordID(int *wordHistory)
{
  int wordID = -1;
  if(wordHistory != NULL)
  {
#if LM_NGRAM_DEPTH > 2
    if(wordHistory[2] >= 0)
    {
      wordID = tri_lmData[wordHistory[2]].words[2];
    }
#endif
#if LM_NGRAM_DEPTH > 1
    if(wordID < 0 && wordHistory[1] >= 0)
    {
      wordID = bi_lmData[wordHistory[1]].words[1];
    }
#endif
    if(wordID < 0)
    {
      wordID = wordHistory[0];
    }
  }
  return wordID;
}

/////////////////////////////////////////////////////////////////////////////////////////////////////
/// Fills allP with the LM probability of every word in the vocabulary, given the history lmHistory.
/// (Parts of this method were lost during extraction and are reconstructed; see the comments below.)
/////////////////////////////////////////////////////////////////////////////////////////////////////

void LanguageModel::getAllP(int *lmHistory, float *allP)
{
  if(lmHistory == NULL)
  {
    for(int i=0;i<uni_tableLength;i++)   // loop body reconstructed: no history, use the unigrams
    {
      allP[i] = uni_lmData[i].p;
    }
    return;
  }
  int w1 = -1;
  int w2 = -1;
  if(lmHistory[1] >= 0)
  {
    w1 = lmHistory[0];
    w2 = bi_lmData[lmHistory[1]].words[1];
  }
  else if(bi_tableLength > 0)
  {
    w2 = lmHistory[0];
  }
  bool pSet[uni_tableLength];
  for(int i=0;i<uni_tableLength;i++)
  {
    pSet[i] = false;
  }
  if(w1 >= 0 && w2 >= 0)
  {
    int wordHistoryNew[2] = {w1,w2};
    int wIndex = tri_HashListSearch->getIndex(wordHistoryNew);
    if(tri_lmDataList[wIndex].words[0] == wordHistoryNew[0] && tri_lmDataList[wIndex].words[1] == wordHistoryNew[1])
    {
      int start = tri_lmDataList[wIndex].lowestIndex;
      int end   = start + tri_lmDataList[wIndex].length;
      for(int i=start;i<end;i++)   // loop body reconstructed: trigram continuations of (w1,w2)
      {
        allP[tri_lmData[i].words[2]] = tri_lmData[i].p;
        pSet[tri_lmData[i].words[2]] = true;
      }
    }
  }
  if(w2 >= 0)
  {
    int start = bi_lmDataList[w2].lowestIndex;
    int end   = start + bi_lmDataList[w2].length;
    for(int i=start;i<end;i++)   // loop body reconstructed: bigram continuations of w2
    {
      if(!pSet[bi_lmData[i].words[1]])
      {
        allP[bi_lmData[i].words[1]] = bi_lmData[i].p;
        pSet[bi_lmData[i].words[1]] = true;
      }
    }
  }
  // Reconstructed unigram fallback. (Assumption: the lost original may also have added the
  // appropriate back-off weights here.)
  for(int i=0;i<uni_tableLength;i++)
  {
    if(!pSet[i])
    {
      allP[i] = uni_lmData[i].p;
    }
  }
}
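/////////////////////////////////////////////////////////////////////////////////////////////////////
/// Minimal sketch of a getAllP() call (illustration only, assuming LM_NGRAM_DEPTH == 3). The
/// history array would normally come out of a previous getP() call, and 'vocabularySize' is a
/// hypothetical variable assumed to equal the unigram table length.
///
/// \code
///   float *allP = new float[vocabularySize];           // vocabularySize: assumed uni_tableLength
///   int lmHistory[LM_NGRAM_DEPTH] = {wordID, -1, -1};  // unigram history only
///   lm.getAllP(lmHistory, allP);                       // allP[w] = (log) P(w | history) for all w
///   delete[] allP;
/// \endcode
/////////////////////////////////////////////////////////////////////////////////////////////////////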
/////////////////////////////////////////////////////////////////////////////////////////////////////
/// This method returns the probability of newWordID given the word history wordHistory, and stores
/// the updated history in wordHistoryOut. (The original comment, signature and first branch were
/// lost during extraction; they are reconstructed from the surviving branches and helper methods.)
/////////////////////////////////////////////////////////////////////////////////////////////////////

float LanguageModel::getP(int newWordID, int *wordHistory, int *wordHistoryOut)
{
  float res = 0.0;
  if(wordHistory == NULL || wordHistory[0] < 0 || bi_tableLength == 0)  // No history: use the unigram (condition reconstructed)
  {
    res = getUniGramP(newWordID,wordHistoryOut);
#if LM_NGRAM_DEPTH > 1
    wordHistoryOut[1] = -1;
#endif
#if LM_NGRAM_DEPTH > 2
    wordHistoryOut[2] = -1;
#endif
  }
#if LM_NGRAM_DEPTH > 1
  else if(tri_tableLength > 0 && wordHistory[1] == -1) // We only have a uni-gram history
  {
    res = getBiGramP(newWordID,wordHistory,wordHistoryOut);
#if LM_NGRAM_DEPTH > 2
    wordHistoryOut[2] = -1;
#endif
  }
#endif
#if LM_NGRAM_DEPTH > 2
  else if(four_tableLength > 0 && wordHistory[2] == -1) // We only have a bi-gram history
  {
    res = getTriGramP(newWordID,wordHistory,wordHistoryOut);
  }
#endif
  else // Full house..
  {
    int w[LM_NGRAM_DEPTH+1];
    if(tri_tableLength == 0)
    {
      res = getBiGramP(newWordID,wordHistory,w);
      if(w[1] != -1)
      {
        wordHistoryOut[0] = bi_lmData[w[1]].words[1];
      }
      else
      {
        wordHistoryOut[0] = w[0];
      }
#if LM_NGRAM_DEPTH > 1
      wordHistoryOut[1] = -1;
#endif
#if LM_NGRAM_DEPTH > 2
      wordHistoryOut[2] = -1;
#endif
    }
    else if(four_tableLength == 0)
    {
#if LM_NGRAM_DEPTH > 1
      res = getTriGramP(newWordID,wordHistory,w);
      if(w[2] != -1)
      {
        int w2[2] = {tri_lmData[w[2]].words[1],tri_lmData[w[2]].words[2]};
        int r = getBiGramIndex(w2);
        if(r == -1)
        {
          wordHistoryOut[0] = tri_lmData[w[2]].words[2];
          wordHistoryOut[1] = -1;
        }
        else
        {
          wordHistoryOut[0] = tri_lmData[w[2]].words[1];
          wordHistoryOut[1] = r;
        }
      }
      else
      {
        wordHistoryOut[0] = w[0];
        wordHistoryOut[1] = w[1];
      }
#endif
#if LM_NGRAM_DEPTH > 2
      wordHistoryOut[2] = -1;
#endif
    }
    else
    {
#if LM_NGRAM_DEPTH > 2
      res = get4GramP(newWordID,wordHistory,w);
      if(w[3] != -1)
      {
        int w2[3] = {four_lmData[w[3]].words[1],four_lmData[w[3]].words[2],four_lmData[w[3]].words[3]};
        int r  = getTriGramIndex(w2);
        int r2 = getBiGramIndex(w2);
        if(r == -1)
        {
          if(r2 == -1)
          {
            wordHistoryOut[0] = four_lmData[w[3]].words[3];
            wordHistoryOut[1] = -1;
            wordHistoryOut[2] = -1;
          }
          else
          {
            wordHistoryOut[0] = four_lmData[w[3]].words[2];
            wordHistoryOut[1] = r2;
            wordHistoryOut[2] = -1;
          }
        }
        else
        {
          wordHistoryOut[0] = four_lmData[w[3]].words[1];
          wordHistoryOut[1] = r2;   // the bigram index belongs in slot 1 (the original overwrote slot 0)
          wordHistoryOut[2] = r;
        }
      }
      else
      {
        wordHistoryOut[0] = w[0];
        wordHistoryOut[1] = w[1];
        wordHistoryOut[2] = w[2];
      }
#endif
    }
  }
  return res;
}

/////////////////////////////////////////////////////////////////////////////////////////////////////
/// This method returns the unigram probability and updates the new LM history.
/// (this method is a helper method of getP() )
/////////////////////////////////////////////////////////////////////////////////////////////////////

float LanguageModel::getUniGramP(int newWordID, int *wordHistoryNew)
{
  wordHistoryNew[0] = newWordID;
  return uni_lmData[newWordID].p;
}

/////////////////////////////////////////////////////////////////////////////////////////////////////
/// This method returns the bigram probability and updates the new LM history.
/// If needed it will back off to getUniGramP(). (this method is a helper method of getP() )
/////////////////////////////////////////////////////////////////////////////////////////////////////

float LanguageModel::getBiGramP(int newWordID,int *wordHistory, int *wordHistoryNew)
{
  wordHistoryNew[0] = wordHistory[0];
  wordHistoryNew[1] = newWordID;
  wordHistoryNew[1] = getBiGramIndex(wordHistoryNew);
  if(wordHistoryNew[1] < 0)
  {
    if(bi_tableLength > 0)
    {
      return (uni_lmData[wordHistory[0]].backoff + getUniGramP(newWordID,wordHistoryNew));
    }
    return getUniGramP(newWordID,wordHistoryNew);
  }
  else
  {
    return bi_lmData[wordHistoryNew[1]].p;
  }
  return SMALLEST_NEGATIVE;
}
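/////////////////////////////////////////////////////////////////////////////////////////////////////
/// Worked example of the back-off above (values are invented; the additive combination suggests
/// the probabilities and back-off weights are stored in the log domain):
///
/// \code
///   // The bigram (w0, newWordID) is absent from the table, so getBiGramP() backs off:
///   float uniP    = -2.5f;           // (log) unigram probability of newWordID
///   float backoff = -0.3f;           // back-off weight stored with the history word w0
///   float result  = backoff + uniP;  // -2.8, the value getBiGramP() returns
/// \endcode
/////////////////////////////////////////////////////////////////////////////////////////////////////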
/////////////////////////////////////////////////////////////////////////////////////////////////////
/// This method returns the trigram probability and updates the new LM history.
/// If needed it will back off to getBiGramP(). (this method is a helper method of getP() )
/////////////////////////////////////////////////////////////////////////////////////////////////////

float LanguageModel::getTriGramP(int newWordID,int *wordHistory, int *wordHistoryNew)
{
#if LM_NGRAM_DEPTH > 1
  /* returns a triple history. This triple is cut back to 2 at getP... */
  wordHistoryNew[0] = wordHistory[0];
  wordHistoryNew[1] = bi_lmData[wordHistory[1]].words[1];
  wordHistoryNew[2] = newWordID;
  wordHistoryNew[2] = getTriGramIndex(wordHistoryNew);
  if(wordHistoryNew[2] < 0)
  {
    wordHistoryNew[0] = wordHistoryNew[1];
    wordHistoryNew[1] = -1;
    wordHistoryNew[2] = -1;
    if(tri_tableLength > 0)
    {
      return (bi_lmData[wordHistory[1]].backoff + getBiGramP(newWordID,wordHistoryNew,wordHistoryNew));
    }
    return getBiGramP(newWordID,wordHistoryNew,wordHistoryNew);
  }
  else
  {
    wordHistoryNew[1] = wordHistory[1];
    return tri_lmData[wordHistoryNew[2]].p;
  }
#else
  // Not used in this case:
  newWordID      = newWordID;
  wordHistory    = wordHistory;
  wordHistoryNew = wordHistoryNew;
#endif
  return SMALLEST_NEGATIVE;
}

/////////////////////////////////////////////////////////////////////////////////////////////////////
/// This method returns the 4-gram probability and updates the new LM history.
/// If needed it will back off to getTriGramP(). (this method is a helper method of getP() )
/////////////////////////////////////////////////////////////////////////////////////////////////////

float LanguageModel::get4GramP(int newWordID,int *wordHistory, int *wordHistoryNew)
{
#if LM_NGRAM_DEPTH > 2
  /* returns a 'quatro' ;-) history. This quatro is cut back to 3 at getP... */
  wordHistoryNew[0] = wordHistory[0];
  wordHistoryNew[1] = tri_lmData[wordHistory[2]].words[1];
  wordHistoryNew[2] = tri_lmData[wordHistory[2]].words[2];
  wordHistoryNew[3] = newWordID;
  wordHistoryNew[3] = get4GramIndex(wordHistoryNew);
  if(wordHistoryNew[3] < 0)
  {
    wordHistoryNew[0] = wordHistoryNew[1];
    int w[2] = {wordHistoryNew[1],wordHistoryNew[2]};
    wordHistoryNew[1] = getBiGramIndex(w);
    assert(wordHistoryNew[1] >= 0);
    wordHistoryNew[2] = -1;
    wordHistoryNew[3] = -1;
    if(four_tableLength > 0)
    {
      return (tri_lmData[wordHistory[2]].backoff + getTriGramP(newWordID,wordHistoryNew,wordHistoryNew));
    }
    return getTriGramP(newWordID,wordHistoryNew,wordHistoryNew);
  }
  else
  {
    wordHistoryNew[1] = wordHistory[1];
    wordHistoryNew[2] = wordHistory[2];
    return four_lmData[wordHistoryNew[3]].p;
  }
#else
  // Not used in this case:
  newWordID      = newWordID;
  wordHistory    = wordHistory;
  wordHistoryNew = wordHistoryNew;
#endif
  return SMALLEST_NEGATIVE;
}

/////////////////////////////////////////////////////////////////////////////////////////////////////
/// This method returns the index into the bigram array for the word pair w[0] and w[1].
/// If this bigram does not exist, it will return -1.
/////////////////////////////////////////////////////////////////////////////////////////////////////

int LanguageModel::getBiGramIndex(int *w)
{
  if(bi_Hash != NULL)
  {
    int res = bi_Hash->getIndex(w);
    if(res >= 0 && compareKeys(bi_lmData[res].words,w,2))
    {
      return res;
    }
  }
  return -1;
}

/////////////////////////////////////////////////////////////////////////////////////////////////////
/// This method returns the index into the trigram array for the word triple w[0], w[1] and w[2].
/// If this trigram does not exist, it will return -1.
/////////////////////////////////////////////////////////////////////////////////////////////////////

int LanguageModel::getTriGramIndex(int *w)
{
  int res = tri_Hash->getIndex(w);
  if(res >= 0 && compareKeys(tri_lmData[res].words,w,3))
  {
    return res;
  }
  return -1;
}
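/////////////////////////////////////////////////////////////////////////////////////////////////////
/// The lookup pattern used by the index methods above, as a sketch (word IDs invented, written as
/// if inside the class): the hash function returns a candidate slot, and compareKeys() must then
/// confirm that the stored n-gram really is the queried one, because a key that is not in the
/// table can still hash to a valid slot.
///
/// \code
///   int w[3] = {12, 7, 345};                     // hypothetical trigram of word IDs
///   int res  = tri_Hash->getIndex(w);            // candidate index (may be a false hit)
///   bool hit = (res >= 0 && compareKeys(tri_lmData[res].words, w, 3));
/// \endcode
/////////////////////////////////////////////////////////////////////////////////////////////////////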
/////////////////////////////////////////////////////////////////////////////////////////////////////
/// This method returns the index into the 4-gram array for the words w[0], w[1], w[2] and w[3].
/// If this 4-gram does not exist, it will return -1.
/////////////////////////////////////////////////////////////////////////////////////////////////////

int LanguageModel::get4GramIndex(int *w)
{
  int res = four_Hash->getIndex(w);
  if(res >= 0 && compareKeys(four_lmData[res].words,w,4))
  {
    return res;
  }
  return -1;
}

/////////////////////////////////////////////////////////////////////////////////////////////////////
/// This method is only used for debugging purposes. It checks if the language model is consistent
/// with the hash tables.
/////////////////////////////////////////////////////////////////////////////////////////////////////

bool LanguageModel::checkValidity()
{
  // First check the bigram table:
  for(int i=0;i<bi_tableLength;i++)   // loop opening reconstructed
  {
    int w[2] = {bi_lmData[i].words[0],bi_lmData[i].words[1]};
    int res  = bi_Hash->getIndex(w);
    if(res < 0)
    {
      PRINTF2("No return from %d\n",i);
    }
    else if(res != i)
    {
      // The words are integer IDs, so they are printed with %d (the original cast them to char*).
      PRINTF5("ERROR IN: %d\t%d\t%d\t%f\n",i,bi_lmData[i].words[0],bi_lmData[i].words[1],bi_lmData[i].p);
      return false;
    }
  }
  // Same for the trigram:
  for(int i=0;i<tri_tableLength;i++)   // loop opening reconstructed
  {
    int w[3] = {tri_lmData[i].words[0],tri_lmData[i].words[1],tri_lmData[i].words[2]};
    int res  = tri_Hash->getIndex(w);
    if(res < 0)
    {
      PRINTF2("No return from %d\n",i);
    }
    else if(res != i)
    {
      PRINTF6("ERROR IN: %d\t%d\t%d\t%d\t%f\n",i,tri_lmData[i].words[0],tri_lmData[i].words[1],tri_lmData[i].words[2],tri_lmData[i].p);
      return false;
    }
  }
  return true;
}

/////////////////////////////////////////////////////////////////////////////////////////////////////
/// This method prints some information about the language model to standard output.
/////////////////////////////////////////////////////////////////////////////////////////////////////

void LanguageModel::printInfo()
{
  printf("1-gram table: %d\n", uni_tableLength);
  printf("2-gram table: %d\n", bi_tableLength);
  printf("3-gram table: %d\n", tri_tableLength);
  printf("4-gram table: %d\n", four_tableLength);
}
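/////////////////////////////////////////////////////////////////////////////////////////////////////
/// End-to-end sketch (illustration only, assuming LM_NGRAM_DEPTH == 3 and log-domain
/// probabilities): scoring a word sequence by threading the history arrays through consecutive
/// getP() calls, the way a decoder would. The word IDs are invented, and -1 is taken to mean
/// 'no history yet', following the checks in getP().
///
/// \code
///   int history[LM_NGRAM_DEPTH]    = {-1,-1,-1};
///   int newHistory[LM_NGRAM_DEPTH] = {-1,-1,-1};
///   int sentence[3]                = {45, 982, 13};   // hypothetical word IDs
///   float total = 0.0f;
///   for(int i=0;i<3;i++)
///   {
///     total += lm.getP(sentence[i], history, newHistory);
///     memcpy(history, newHistory, sizeof(history));
///   }
/// \endcode
/////////////////////////////////////////////////////////////////////////////////////////////////////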