/***************************************************************************
 *   This file is part of the 'Shout LVCS Recognition toolkit'.            *
 ***************************************************************************
 *   Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010                      *
 *   by Marijn Huijbregts                                                  *
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU General Public License as published by  *
 *   the Free Software Foundation; version 2 of the License.               *
 *                                                                         *
 *   This program is distributed in the hope that it will be useful,       *
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
 *   GNU General Public License for more details.                          *
 *                                                                         *
 *   You should have received a copy of the GNU General Public License     *
 *   along with this program; if not, write to the                         *
 *   Free Software Foundation, Inc.,                                       *
 *   59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.              *
 ***************************************************************************/

#include "standard.h"
#include "shout_lm2bin.h"
#include "lexicaltree.h"
#include "trainhash.h"
#include "shout-misc.h"
#include "shoutconfig.h"

using namespace StringFunctions;
using namespace WriteFileLittleBigEndian;

// The names of the six system headers were lost in this copy of the file.
// The ones listed here are an assumption: together they cover the calls
// made below (printf, atof, exit, strncmp, assert, gettimeofday).
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <math.h>
#include <sys/time.h>

// Wrapped in do/while(0) so the two statements stay together when the macro
// is used without braces (the original expanded to two bare statements).
#define WARNING(a) do { printf("Warning: %s\n",a); return; } while(0)

#define MAPbi_lmData(i)   ((LMEntryType_2*)memMap->map(i))
#define MAPtri_lmData(i)  ((LMEntryType_3*)memMap->map(i))
#define MAPfour_lmData(i) ((LMEntryType_4*)memMap->map(i))

int main(int argc, char *argv[])
{
  ShoutConfig myConfig(APPID_LM2BIN, argc, argv);

  Shout_lm2bin myLm2Bin(myConfig.getStringValue(ARGUMENT_DCT_BIN),
                        myConfig.getStringValue(ARGUMENT_LM_ARPA),
                        myConfig.getStringValue(ARGUMENT_LM_BIN));
  return 0;
}

/////////////////////////////////////////////////////////////////////////////
/// sortWords sorts the numberOfItems word IDs in wordList into sortList.
/////////////////////////////////////////////////////////////////////////////

void Shout_lm2bin::sortWords(int numberOfItems, int *wordList, int *sortList)
{
  int nrWords[numberOfWords];
  for(int i=0;i<numberOfWords;i++)
  {
    nrWords[i] = 0;
  }
  /* (The rest of this method was lost in this copy of the file. Judging
     from the histogram initialisation above and the disabled sorting code
     further down, it counting-sorts the word IDs of wordList into
     sortList.) */
}

/////////////////////////////////////////////////////////////////////////////
/// The constructor does all the work: it loads the vocabulary from the
/// binary lexical tree (lexName), parses the ARPA language model (lmName)
/// and writes the n-gram tables and their lookup hashes to the binary LM
/// file (binlmName). Its opening lines were lost in this copy; the
/// signature below follows from the call in main().
/////////////////////////////////////////////////////////////////////////////

Shout_lm2bin::Shout_lm2bin(char *lexName, char *lmName, char *binlmName)
{
  // (lost: initialisation of the table lengths; the text resumes here)
#if LM_NGRAM_DEPTH > 2
  four_tableLength = 0;
#endif
  struct timeval time1;
  struct timeval time2;
  gettimeofday( &time1, NULL );

  FILE *lexFile = fopen(lexName,"rb");
  if(lexFile == NULL)
  {
    USER_ERROR("The lexical tree file could not be opened!");
  }
  freadEndianSafe(&numberOfWords,1,sizeof(numberOfWords),lexFile);  // (read twice in the original;
  freadEndianSafe(&numberOfWords,1,sizeof(numberOfWords),lexFile);  //  the first value is overwritten)
  vocabulary = new char*[numberOfWords];
  for(int i=0;i<numberOfWords;i++)
  {
    // (lost: vocabulary[i] is read from lexFile here)
  }

  /* A large span of the constructor is missing from this copy of the file.
     In it, the lexical tree is loaded into wordTree, the ARPA file and the
     binary output file binlmFile are opened, the \data\ header is parsed
     into uni_tableLength, bi_tableLength, tri_tableLength and
     four_tableLength, uni_lmData and the memory-mapped bigram staging file
     (memMap) are allocated, and the line-by-line parsing loop begins. The
     loop and switch openings below are reconstructions; 'lmFile' is an
     assumed name for the ARPA file handle. */
  while(!feof(lmFile))  // (reconstructed)
  {
    // (lost: the next line is read into str, and the section headers
    //  "\data\" and "\1-grams:" advance the state counter)
    switch(state)       // (reconstructed)
    {
      case 1: // Reading unigram stats:
        // (lost: the "\2-grams:" transition, which presumably writes the
        //  unigram table to binlmFile. The four lines below are restored
        //  by analogy with the bigram/trigram cases.)
        splitList(str,str2);
        p = atof(str);
        splitList(str2,str);
        RightTrim(str);
        if(strlen(str)>0)
        {
          backoff = atof(str);
        }
        else
        {
          backoff = 0.0;
        }
        if(p == -99.0)
        {
          p = -9e30; // Small, but not to be ignored!
        }
        wordID = wordTree->getWordID(str2);
        if(wordID >= 0)
        {
          if(wordID >= uni_tableLength)
          {
            USER_ERROR2("The lexical tree contains at least one word that does not exist in the language model file:",vocabulary[wordID]);
          }
          uni_lmData[wordID].p       = p;
          uni_lmData[wordID].backoff = backoff;
        }
        else
        {
          if(strlen(str2)>0)
          {
            printf("Warning: word \"%s\" does not exist in the lexical tree. This word will be ignored...\n",str2);
          }
        }
        break;
      case 2: // Reading bigram stats:
        if((strncmp("\\3-grams:",str,9)==0) || (strncmp("\\end\\",str,5)==0))
        {
          printf("\b\b\b\b\b\b\b\b\b\b%10d\n",counter); fflush(stdout);
          bi_tableLength = counter; // We may have won some entries...
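          // "Won" entries: n-grams containing a word that is missing from
          // the lexical tree are skipped while reading, so counter can end
          // up smaller than the count announced in the \data\ header; the
          // table is truncated to the entries actually kept. The disabled
          // block below appears to be an older in-memory sort of the bigram
          // table, kept for reference; the active code after it instead
          // builds a collision-free TrainHash over the memory-mapped table.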
          /* (Parts of this disabled block were lost in this copy of the
              file; the gaps are marked with "(...)".)
          // Sorting:
          printf("Sorting the bigrams...\n"); fflush(stdout);
          int *sortList  = new int[bi_tableLength]; assert(sortList != NULL);
          int *sortList2 = new int[bi_tableLength]; assert(sortList2 != NULL);
          int *sortList3 = new int[bi_tableLength]; assert(sortList3 != NULL);
          int *wordList  = new int[bi_tableLength]; assert(wordList != NULL);
          bi_lmDataList = new LMListSearch_2[numberOfWords]; assert(bi_lmDataList != NULL);
          for(int i=0;i< (...)
            while(diff > 0)
            {
              bi_lmDataList[index].length = sum - bi_lmDataList[index].lowestIndex;
              index++;
              bi_lmDataList[index].lowestIndex = sum;
              diff--;
            }
          }
          bi_lmDataList[numberOfWords-1].length = bi_tableLength - bi_lmDataList[numberOfWords-1].lowestIndex;
          for(int i=0;i< (...) > 0)
          {
            for(int i2=0;i2< (...)
          #if LM_NGRAM_DEPTH > 1
            reOrderBi[i].backoff = bi_lmData[sortList[i]].backoff;
          #endif
          }
          delete[] sortList; delete[] sortList2; delete[] sortList3; delete[] wordList;
          delete[] bi_lmData;
          bi_lmData = reOrderBi;
          reOrderBi = NULL;
          */

          // Now create the bigram hash table!
          TrainHash *hash = NULL;
          bool done = false;
          printf("Bigram hash (f) : %10d",0); fflush(stdout);
          while(!done)
          {
            if(hash != NULL)
            {
              delete hash;
            }
            hash = new TrainHash(bi_tableLength,2,numberOfWords);
            memMap->setWritePermission(false);
            int factorCounter = 0;
            while(!done && factorCounter < 999)
            {
              printf("\b\b\b%03d",factorCounter); fflush(stdout);
              factorCounter++;
              hash->initialiseMapping();
              int i = 0;
              done  = true;
              while(i<bi_tableLength && done)  // (loop header restored; lost in this copy)
              {
                done = hash->fillMapping(MAPbi_lmData(i)->words);
                i++;
              }
            }
          }
          hash->finalizeHash();
          for(int i=0;i<bi_tableLength;i++)    // (loop header restored)
          {
            int hashResult = hash->getIndex((int*)MAPbi_lmData(i));
            if(hashResult != i)
            {
              printf("\nError in hash: (%d,%d) - %d %d\n",i,hashResult,MAPbi_lmData(i)->words[0],MAPbi_lmData(i)->words[1]);
              exit(1);
            }
          }
          // Done with the bigram hash table.
          fwriteEndianSafe(&(bi_tableLength),1,sizeof(bi_tableLength),binlmFile);
          for(int ientry=0;ientry<bi_tableLength;ientry++)  // (loop header restored)
          {
            fwriteEndianSafe(MAPbi_lmData(ientry)->words, 2,sizeof(MAPbi_lmData(ientry)->words[0]),binlmFile);
            fwriteEndianSafe(&(MAPbi_lmData(ientry)->p), 1,sizeof(MAPbi_lmData(ientry)->p),binlmFile);
#if LM_NGRAM_DEPTH > 1
            fwriteEndianSafe(&(MAPbi_lmData(ientry)->backoff), 1,sizeof(MAPbi_lmData(ientry)->backoff),binlmFile);
#endif
          }
          /*
          fwriteEndianSafe(&(numberOfWords),1,sizeof(numberOfWords),binlmFile);
          for(int ientry=0;ientry< (...)  (the rest of this disabled block,
                                           which wrote the bi_lmDataList
                                           search table, was lost)
          */
          hash->storeHash(binlmFile);
          delete hash;
          bi_Hash = NULL;
          hash    = NULL;
          if(memMap != NULL)
          {
            delete memMap;
          }
          memMap = new MemMappedFile(sizeof(LMEntryType_3),tri_tableLength,4096*1000, "tmp.lm");
          counter        = 0;
          displayCounter = 0;
          if(strncmp("\\end\\",str,5)==0)
          {
            state=100; // Done.
          }
          else
          {
#if LM_NGRAM_DEPTH > 1
            printf("\nNumber of trigrams : %10d",counter); fflush(stdout);
#else
            printf("\n"); fflush(stdout);
            state = 100; // Done..
#endif
            state++;
          }
          break;
        }
        splitList(str,str2);
        p = atof(str);
        if(p == -99.0)
        {
          p = -9e30; // Small, but not to be ignored!
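          // A log probability of -99.0 is the usual ARPA placeholder for
          // "no real probability" (SRILM, for example, writes it for the
          // <s> unigram). Mapping it to -9e30 keeps the entry addressable
          // while guaranteeing it never wins a comparison.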
        }
        splitList(str2,str);
        splitList(str,str3);
        RightTrim(str3);
#if LM_NGRAM_DEPTH > 1
        if(strlen(str3)>0)
        {
          backoff = atof(str3);
        }
        else
        {
          backoff = 0.0;
        }
#endif
        wordID  = wordTree->getWordID(str2);
        wordID2 = wordTree->getWordID(str);
        if(wordID >=0 && wordID2 >= 0)
        {
          if(counter >= bi_tableLength)
          {
            USER_ERROR("The arpa LM is invalid: more bi-grams than noted at the top of the file!");
          }
          MAPbi_lmData(counter)->words[0] = wordID;
          MAPbi_lmData(counter)->words[1] = wordID2;
          MAPbi_lmData(counter)->p        = p;
#if LM_NGRAM_DEPTH > 1
          MAPbi_lmData(counter)->backoff  = backoff;
#endif
          counter++;
          displayCounter++;
          if(displayCounter > 9999)
          {
            displayCounter = 0;
            printf("\b\b\b\b\b\b\b\b\b\b%10d",counter); fflush(stdout);
          }
        }
        break;
      case 3: // Reading trigram stats:
        if((strncmp("\\4-grams:",str,9)==0) || (strncmp("\\end\\",str,5)==0))
        {
          printf("\b\b\b\b\b\b\b\b\b\b%10d",counter); fflush(stdout);
          tri_tableLength = counter; // We may have won some entries...
          /* (Parts of this disabled block were lost in this copy of the
              file; the gaps are marked with "(...)".)
          // Sorting:
          printf("\nSorting the trigrams...\n"); fflush(stdout);
          int *sortList  = new int[tri_tableLength];
          int *sortList2 = new int[tri_tableLength];
          int *sortList3 = new int[tri_tableLength];
          int *wordList  = new int[tri_tableLength];
          int sortHelp[numberOfWords+1];
          for(int i=0;i< (...) >= 0);
          }
          int index = 0;
          int sum   = 0;
          sortHelp[0] = 0;
          for(int i=1;i< (...)
            while(diff > 0)
            {
              index++;
              sortHelp[index] = sum;
              diff--;
            }
          }
          for(int i=index+1;i< (...) > 0)
          {
            for(int i2=0;i2< (...)
          #if LM_NGRAM_DEPTH > 2
            reOrderTri[i].backoff = tri_lmData[sortList[i]].backoff;
          #endif
          }
          delete[] tri_lmData;
          tri_lmData = reOrderTri;
          reOrderTri = NULL;
          tri_tableLengthList = 1;
          int prevID1 = tri_lmData[0].words[0];
          int prevID2 = tri_lmData[0].words[1];
          for(int li=1;li< (...)  (the rest of this disabled block was lost)
          */
#if LM_NGRAM_DEPTH > 1
          TrainHash *hash = NULL;
          bool done = false;
          printf("\nTrigram hash (f) : %10d",0); fflush(stdout);
          while(!done)
          {
            if(hash != NULL)
            {
              delete hash;
            }
            hash = new TrainHash(tri_tableLength,3,numberOfWords);
            memMap->setWritePermission(false);
            int factorCounter = 0;
            while(!done && factorCounter < 999)
            {
              printf("\b\b\b%03d",factorCounter); fflush(stdout);
              factorCounter++;
              hash->initialiseMapping();
              int i = 0;
              done  = true;
              while(i<tri_tableLength && done)  // (loop header restored; lost in this copy)
              {
                done = hash->fillMapping(MAPtri_lmData(i)->words);
                i++;
              }
            }
          }
          hash->finalizeHash();
          for(int i=0;i<tri_tableLength;i++)    // (loop header restored)
          {
            int hashResult = hash->getIndex((int*)MAPtri_lmData(i));
            if(hashResult != i)
            {
              printf("Error in hash: (%d,%d)- %d %d %d\n",
                     i,hashResult,MAPtri_lmData(i)->words[0],MAPtri_lmData(i)->words[1],MAPtri_lmData(i)->words[2]);
              exit(1);
            }
          }
          fwriteEndianSafe(&(tri_tableLength),1,sizeof(tri_tableLength),binlmFile);
          if(tri_tableLength > 0)
          {
            for(int ientry=0;ientry<tri_tableLength;ientry++)  // (loop header restored)
            {
              fwriteEndianSafe(MAPtri_lmData(ientry)->words, 3,sizeof(MAPtri_lmData(ientry)->words[0]),binlmFile);
              fwriteEndianSafe(&(MAPtri_lmData(ientry)->p), 1,sizeof(MAPtri_lmData(ientry)->p),binlmFile);
#if LM_NGRAM_DEPTH > 2
              fwriteEndianSafe(&(MAPtri_lmData(ientry)->backoff), 1,sizeof(MAPtri_lmData(ientry)->backoff),binlmFile);
#endif
            }
            hash->storeHash(binlmFile);
          }
          delete hash;
          tri_Hash = NULL;
          hash     = NULL;
          if(tri_tableLength > 0)
          {
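            // The disabled block below apparently built a secondary
            // "trigram list" lookup structure: for each (word1,word2) pair,
            // tri_lmDataList records the range of trigram entries starting
            // with that pair, and tri_HashListSearch hashes the pairs. It
            // was already disabled in the source.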
            /* (Parts of this disabled block were lost in this copy of the
                file; the gaps are marked with "(...)".)
            // Now create the trigram-list hash table!
            hash = NULL;
            done = false;
            printf("Trigram-list hash (f) : %10d",0); fflush(stdout);
            while(!done)
            {
              if(hash != NULL)
              {
                delete hash;
              }
              hash = new TrainHash(tri_tableLengthList,2,numberOfWords);
              tri_HashListSearch = hash;
              int factorCounter = 0;
              while(!done && factorCounter < 999)
              {
                printf("\b\b\b%03d",factorCounter); fflush(stdout);
                factorCounter++;
                hash->initialiseMapping();
                int i = 0;
                done  = true;
                while(i< (...)
                {
                  done = hash->fillMapping(tri_lmDataList[i].words);
                  i++;
                }
              }
            }
            hash->finalizeHash();
            for(int i=0;i< (...)
            {
              int hashResult = hash->getIndex((int*)&tri_lmDataList[i]);
              if(hashResult != i)
              {
                printf("\nError in hash: (%d,%d) - %d\n",i,hashResult,tri_lmDataList[i].words[0]);
                exit(1);
              }
            }
            // Sorting information:
            fwriteEndianSafe(&(tri_tableLengthList),1,sizeof(tri_tableLengthList),binlmFile);
            for(int ientry=0;ientry< (...)  (the write loop was lost)
            hash->storeHash(binlmFile);
            delete hash;
            tri_HashListSearch = NULL;
            hash = NULL;
            */
          }
          if(memMap != NULL)
          {
            delete memMap;
          }
          memMap = new MemMappedFile(sizeof(LMEntryType_4),four_tableLength,4096*1000, "tmp.lm");
#endif
          // Done with the 3-gram hash.
          counter        = 0;
          displayCounter = 0;
          if(strncmp("\\end\\",str,5)==0)
          {
            state=100; // Done.
          }
          else
          {
#if LM_NGRAM_DEPTH > 2
            printf("\nNumber of 4-grams : %10d",counter); fflush(stdout);
#else
            printf("\n"); fflush(stdout);
            state = 100; // Done..
#endif
            state++;
          }
          break;
        }
        splitList(str,str2);
        p = atof(str);
        if(p == -99.0)
        {
          p = -9e30; // Small, but not to be ignored!
        }
        splitList(str2,str);
        splitList(str,str3);
        splitList(str3,str4);
        RightTrim(str4);
#if LM_NGRAM_DEPTH > 2
        if(strlen(str4)>0)
        {
          backoff = atof(str4);
        }
        else
        {
          backoff = 0.0;
        }
#endif
        wordID  = wordTree->getWordID(str2);
        wordID2 = wordTree->getWordID(str);
        wordID3 = wordTree->getWordID(str3);
        if(wordID >=0 && wordID2 >= 0 && wordID3 >=0)
        {
          if(counter >= tri_tableLength)
          {
            USER_ERROR("The arpa LM is invalid: more tri-grams than noted at the top of the file!");
          }
          MAPtri_lmData(counter)->words[0] = wordID;
          MAPtri_lmData(counter)->words[1] = wordID2;
          MAPtri_lmData(counter)->words[2] = wordID3;
          MAPtri_lmData(counter)->p        = p;
#if LM_NGRAM_DEPTH > 2
          MAPtri_lmData(counter)->backoff  = backoff;
#endif
          counter++;
          displayCounter++;
          if(displayCounter > 9999)
          {
            displayCounter = 0;
            printf("\b\b\b\b\b\b\b\b\b\b%10d",counter); fflush(stdout);
          }
        }
        break;
      case 4: // Reading 4-gram stats:
        if(strncmp("\\end\\",str,5)==0)
        {
          printf("\b\b\b\b\b\b\b\b\b\b%10d\n",counter); fflush(stdout);
          four_tableLength = counter; // We may have won some entries...
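          // The 4-gram block below mirrors the bigram and trigram paths:
          // retry TrainHash factors until every memory-mapped entry maps
          // without collision, verify the mapping, then write the table
          // length, the packed entries and the serialised hash to the
          // binary LM.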
          // Create the 4-gram hash:
#if LM_NGRAM_DEPTH > 2
          TrainHash *hash = NULL;
          bool done = false;
          printf("\n4-gram hash (f) : %10d",0); fflush(stdout);
          while(!done)
          {
            if(hash != NULL)
            {
              delete hash;
            }
            hash = new TrainHash(four_tableLength,4,numberOfWords);
            memMap->setWritePermission(false);
            int factorCounter = 0;
            while(!done && factorCounter < 999)
            {
              printf("\b\b\b%03d",factorCounter); fflush(stdout);
              factorCounter++;
              hash->initialiseMapping();
              int i = 0;
              done  = true;
              while(i<four_tableLength && done)  // (loop header restored; lost in this copy)
              {
                done = hash->fillMapping(i,MAPfour_lmData(i)->words);
                i++;
              }
            }
          }
          hash->finalizeHash();
          for(int i=0;i<four_tableLength;i++)    // (loop header restored)
          {
            int hashResult = hash->getIndex((int*)MAPfour_lmData(i));
            if(hashResult != i)
            {
              printf("Error in hash: (%d,%d)- %d %d %d %d\n",
                     i,hashResult,
                     MAPfour_lmData(i)->words[0],MAPfour_lmData(i)->words[1],
                     MAPfour_lmData(i)->words[2],MAPfour_lmData(i)->words[3]);
              exit(1);
            }
          }
          fwriteEndianSafe(&(four_tableLength),1,sizeof(four_tableLength),binlmFile);
          if(four_tableLength > 0)
          {
            for(int ientry=0;ientry<four_tableLength;ientry++)  // (loop header restored)
            {
              fwriteEndianSafe(MAPfour_lmData(ientry)->words, 4,sizeof(MAPfour_lmData(ientry)->words[0]),binlmFile);
              fwriteEndianSafe(&(MAPfour_lmData(ientry)->p), 1,sizeof(MAPfour_lmData(ientry)->p),binlmFile);
#if LM_NGRAM_DEPTH > 3
              fwriteEndianSafe(&(MAPfour_lmData(ientry)->backoff), 1,sizeof(MAPfour_lmData(ientry)->backoff),binlmFile);
#endif
            }
            hash->storeHash(binlmFile);
          }
          delete hash;
          four_Hash = NULL;
          hash      = NULL;
#endif
          // Done.
          state++;
          break;
        }
        splitList(str,str2);
        p = atof(str);
        if(p == -99.0)
        {
          p = -9e30; // Small, but not to be ignored!
        }
        splitList(str2,str);
        splitList(str,str3);
        splitList(str3,str4);
        splitList(str4,str5);
        RightTrim(str5);
#if LM_NGRAM_DEPTH > 3
        /*
        if(strlen(str5)>0)
        {
          backoff = atof(str5);
        }
        else
        {
          backoff = 0.0;
        }
        */
#endif
        wordID  = wordTree->getWordID(str2);
        wordID2 = wordTree->getWordID(str);
        wordID3 = wordTree->getWordID(str3);
        wordID4 = wordTree->getWordID(str4);
        if(wordID >=0 && wordID2 >= 0 && wordID3 >=0 && wordID4 >= 0)
        {
          if(counter >= four_tableLength)
          {
            USER_ERROR("The arpa LM is invalid: more 4-grams than noted at the top of the file!");
          }
          MAPfour_lmData(counter)->words[0] = wordID;
          MAPfour_lmData(counter)->words[1] = wordID2;
          MAPfour_lmData(counter)->words[2] = wordID3;
          MAPfour_lmData(counter)->words[3] = wordID4;
          MAPfour_lmData(counter)->p        = p;
#if LM_NGRAM_DEPTH > 3
          // MAPfour_lmData(counter)->backoff = backoff;
#endif
          counter++;
          displayCounter++;
          if(displayCounter > 9999)
          {
            displayCounter = 0;
            printf("\b\b\b\b\b\b\b\b\b\b%10d",counter); fflush(stdout);
          }
        }
        break;
      default:
        break;
    }
  }

  if(tri_tableLength == 0)
  {
    // If the ARPA file contained no trigram section, the trigram count was
    // never written inside the loop above; write the zero explicitly so
    // the binary LM is complete.
    fwriteEndianSafe(&(tri_tableLength),1,sizeof(tri_tableLength),binlmFile);
  }
  fclose(binlmFile);

  gettimeofday( &time2, NULL );
  int sec = (time2.tv_sec) - (time1.tv_sec);
  int min = ((int)sec/60);
  sec     = sec - min*60;
  printf("\n----------------------------------------------------\n");
  printf("-- Done. It took %d minutes and %d seconds.\n", min,sec);
  printf("----------------------------------------------------\n");
}

/////////////////////////////////////////////////////////////////////////////
/// The destructor is empty.
/////////////////////////////////////////////////////////////////////////////

Shout_lm2bin::~Shout_lm2bin()
{
}