/*************************************************************************** * This file is part of the 'Shout LVCS Recognition toolkit'. * *************************************************************************** * Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010 by Marijn Huijbregts * * * * This program is free software; you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * * the Free Software Foundation; version 2 of the License. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU General Public License * * along with this program; if not, write to the * * Free Software Foundation, Inc., * * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * ***************************************************************************/ #ifndef LEXICALTREE_H #define LEXICALTREE_H #include "standard.h" #include "phonemodel.h" #include "languagemodel.h" #include "shout_maketrainset.h" #ifdef TEST_ARTICULATORY #include "articulatorystream.h" #endif struct LMLAType; #define NUMBER_OF_BLAME_CLUSTERS (6) #define MAX_PDF_CALC_SIZE (1024000) #ifdef CELL_PROCESSOR typedef struct _control_block { unsigned long long out_addr; //beginning address of the output array unsigned long long in_addr[MAX_PDF_CALC_SIZE]; unsigned char in_length[MAX_PDF_CALC_SIZE]; float vector[39]; int nrPDFs; unsigned char pad[24]; } control_block_getfastp_t; typedef struct _control_block_lmla { unsigned long long out_addr; //beginning address of the output array unsigned long long in_addr; int numberOfCompressedNodes; unsigned char pad[8]; } control_block_createlmla_t; #endif //////////////////////////////////////////////////////////////////////////// /// \brief Used for N-Best/lattice calculations: //////////////////////////////////////////////////////////////////////////// struct NBestList { int nrWords; int noSilWords; float totAM; float totLM; }; //////////////////////////////////////////////////////////////////////////// /// \brief This type is used to keep track of analysis administration. //////////////////////////////////////////////////////////////////////////// struct AnalysisSettings { WLRTracker **prunePathAnalyses; int prunePathAnalysesLength; int beamCategories[3]; bool containsOOV; int errorRegionActive; // In this record so that we do not have to pass it in the recursive error-function. int *binDistr; float *bestScore; float *correctScore; float *bestStateScore; int *stateRanking; float maxBeam; float maxStateBeam; float maxEndStateBeam; int maxHistogram; int maxStateHistogram; LexicalNode *endStateActive; int currentLMHist[LM_NGRAM_DEPTH]; NBestList refStats; }; //////////////////////////////////////////////////////////////////////////// /// \brief This type is used to make a global cache table for Lanuage Model /// Look-Ahead (LMLA). /// /// The 'used' parameter is added for making pruning faster. The wordHistory /// defines the LM history for which this record is created. 'lookAhead' is /// an array (of length 'number of compressed noded') and next is used to /// link the structures together. //////////////////////////////////////////////////////////////////////////// struct LMLAGlobalListType { int wordHistory[LM_NGRAM_DEPTH]; int collissionTime; TokenType *collissionNode; LMLAGlobalListType *next; float *lookAhead; }; //////////////////////////////////////////////////////////////////////////// /// \brief A lexical node is a single node of the lexical tree or Pronounciation /// Prefix Tree (PPT). It contains token, AM and word information. /// /// Because we use a static PPT (no copies are made), for each HMM state in /// a node, it is possible that there are multiple valid tokens linked to it. /// /// The modelID points to the PhoneModel object (AM model). If wordID is not /// -1, this node is a word-end node. /// /// Each node can have a following node (next) in the tree. It may also have /// a parallel node. Finaly it is possible that a node points to another /// lexical tree structure. This typically happens with alignment- or grammar-tasks. //////////////////////////////////////////////////////////////////////////// struct LexicalNode { TokenType **tokenSeq; ///< Normally of length NONSIL_NUMBER_OF_STATES (3) LexicalNode *nextTree; ///< If wordID != -1, this points to the start node of the next tree (for LVCSR back to the top) LexicalNode *next; ///< Next node. Multiple 'next' nodes are linked together (in the next node) with 'parallel'. LexicalNode *parallel; ///< Parallel node. TokenType **inputToken; int modelID; int contextPrev; ///< Previous modelID (left context) int contextNext; ///< Next modelID (right context) int contextKey; ///< For speed-up reasons pre-calculated: contextPrev * numberOfPhones + contextNext int wordID; ///< -1 if not an end node. Otherwise the ID of the word. int depth; int tokenSeqLength; int compressedTreeIndex; bool nodeIsActive; ///< One or more states of this node are being used. bool toBeDeletedFromList; ///< When this switch is TRUE. The node will be deleted from the 'active list'. }; //////////////////////////////////////////////////////////////////////////// /// \brief The LexicalNodeList is needed for administration of all currently /// active nodes. //////////////////////////////////////////////////////////////////////////// struct LexicalNodeList { LexicalNodeList *next; LexicalNode *node; }; #ifdef SEARCH_STATISTICS_ON //////////////////////////////////////////////////////////////////////////// /// \brief This structure keeps record of different kind of statistics during /// decoding. It may be switched on and off by the compiler switch SEARCH_STATISTICS_ON //////////////////////////////////////////////////////////////////////////// struct SearchStatistics { int nrOfHistPruning; int nrOfCleanUps; int nrOfLMLATables_Temp; int nrOfLMLATables; int nrOfActiveNodes; int nrOfActiveTokens; int nrOfLMLATables_Prev; int nrOfActiveNodes_Prev; int nrOfActiveTokens_Prev; int nrOfLMLATables_Max; int nrOfActiveNodes_Max; int nrOfActiveTokens_Max; }; #endif struct FastCompressedTree { int wordID; int nextIndex; bool hasPar; }; //////////////////////////////////////////////////////////////////////////// /// \brief This class contains all lexical tree functionality and also almost /// all decoder steps. /// /// The lexical tree (or Pronunciation Prefix Tree (PPT)) is a special form /// of a pronunciation vocabulary. All words that start with the same phones /// are put together. As soon as the pronunciation differs, the tree splits into /// two sub-trees. (for example: "word" and "worst" will share the first two /// phones.) /// /// This class handles the structure of the tree using LexicalNode, but it /// also handles tokens being propagated through the tree (including pruning), /// LM probabilities being added to tokens at tree leaves, LMLA, keeping track /// of word- and phone-histories, picking the final recognition and storing /// statistics. //////////////////////////////////////////////////////////////////////////// class LexicalTree { protected: LMLAGlobalListType *lookAheadUnigram; #ifdef TEST_ARTICULATORY ArticulatoryStream *myArtStream; #endif FastCompressedTree *fastCompressedTree; int numberOfCompressedNodes; int nodeListLength; LexicalNode **nodeArray; bool latticeGeneration; LexicalNode **latticeWordList; int latticeWordListLength; int *tokenDepthAdmin; int biggestNodeDepth; int startTime; FILE *outputFile; ///< Output hyp file. FILE *tdFile; ///< Token Distribution File. AnalysisSettings *analysisSettings; ///< Used to keep track of analysis administration (blame assignment). int blameAssingment[NUMBER_OF_BLAME_CLUSTERS]; int analyse_deletions[NUMBER_OF_BLAME_CLUSTERS]; int analyse_insertions[NUMBER_OF_BLAME_CLUSTERS]; int analyse_substitutions[NUMBER_OF_BLAME_CLUSTERS]; int analyse_totRefWords; int totCategories[3]; DecoderSettings settings; WLRType *bestRecPath; ///< Used for adaptation, it would be inconvenient to find the best token each frame. int initialLMHist[LM_NGRAM_DEPTH]; ///< The history words at the start of a new utterence. int nrOfTokens; ///< The number of tokens in the tree. int timeStamp; int numberOfPhones; ///< The number of acoustic models types (not the number in the tree). int numberOfWords; ///< The number of words in the DCT. LexicalNode **treeEnd; ///< The matrix of end-of-word nodes (so called phase-out). LexicalNode **treeStart; ///< The matrix of start-of-word nodes (phase-in) LexicalNode *grammarStart; ///< The start of the tree or (forced-align) grammar LexicalNode *endNode; ///< The last node. Only used for forced-align grammars. int grammarStartContext; ///< Initial left context when starting the system (for forced-align). WLRType *wlrStart; ///< The first Word-Link-Record (linked together in WLRType). WLRTypeList *wlrNBest; ///< Each time frame the N-Best wlrs are marked as being one of the best! LatticeNode *latticeAdmin; ///< A list of all lattice nodes. int latticeN; int latticeL; char **vocabulary; ///< The vocabulary. PhoneModel **phoneModels; ///< An array of all distinct acoustic models. int *wordLength; TokenType *bestToken; ///< A pointer to the best token in the tree. float bestL; ///< The likelihood of token 'bestToken'. LanguageModel *languageModel; ///< The used language model. LexicalNodeList *nodeList; ///< A list of all active nodes for the following time frame (so that the tree does not have to be searched) bool alignParallel; ///< Currently not used! LMLAGlobalListType *lmla_list[LMLA_CACHE]; ///< The LMLA data structure. (see paper) float *transitionPenalty; ///< The transition penalty and short-word-penalty can be calculated once in the constructor. int intervalTimer; float *phoneLoopConfidence; int phoneLoopConfidenceOffset; LexicalNode **borrowWordList; double *vList; int threadsRunning; MixGaussian **pdfUpdateList; #ifdef SEARCH_STATISTICS_ON SearchStatistics sentenceStats; SearchStatistics globalStats; #endif int startOfSentenceWord; ///< The index of the \ word. int endOfSentenceWord; ///< The index of the \ word. bool currentlyAligning; ///< If TRUE, the system will be aligning. Otherwise it does LVCSR. public: LexicalTree (FILE *outFile); LexicalTree (FILE *outFile, FILE *treeFile, bool useT = true); ~LexicalTree (); public: // for multi-threading void processNode (LexicalNode *node, Vector *v); protected: float oldcreateLMLAs(LMLAGlobalListType *lmlaGlobal, LexicalNode *node, float *allLMP); void setNodeContext (LexicalNode *node,int leftContext); void createLatticeNodeGroups(WLRType *w, double lmScore, int wordID); void copyLMHistory(int *lmHistory1, int *lmHistory2); bool compareLMHistory(int *lmHistory1, int *lmHistory2); void printTokenDistribution(); int createLatticeLMRescoring(LatticeNode *l, NBestList *scoreList, float amTot, float lmTot, int nrWords, int noSilWords, int *lmHist, int sentenceID = 0); int countLatticePaths(LatticeNode *l, int sentenceID = 0); void sortLatticePaths(LatticeNode *l); void setWlrNBest(WLRType *wlr); int getWordFromWLR(WLRType *wlr); LatticeNode *findLatticeNodes(WLRType *wlr); void printLMParStats (bool outputXML); void createLattice(); void lattice_copyNonSilArcs(LatticeNode *source, LatticeNode *dest, int loopID); void lattice_removeDoubleArcs(); void deleteLatticeAdmin(); bool addWordStringToAlignment(LexicalNode *word); int getLastModelForContext(LexicalNode *node, int wordID); void calcErrorRegionStats(WLRType *wlr, WLRType *lastCorrect, int firstRef, int lastRef, int *nrWordHyp, float *scoreLM); int printErrorString(int errorID); void processVector_LMLAReordering(); void processVector_LMLAReordering_prepare(); void processVector_processNodes(Vector **v); void createActiveNodesList(); void processVector_prune_processNodesOutput(); void processVector_pruneLM(); void processVector_grammar(); void processVector_administrationCleanup(); void getBestGrammarEndToken(TokenType **bestT, bool notFinishedAllowed); void setNodeLocationPars (LexicalNode *node, bool fromParallel); void setDepthLevel (LexicalNode *node, int phone, int depth); void readTree (LexicalNode *node, FILE *treeFile, int length); void deleteLookAheadList (); void deleteTree (LexicalNode *node, bool isPar); void initialiseNode (LexicalNode *node); void pruneWithMinBeam (LexicalNode *node, float minLikelihood_0); void deleteNodes (); virtual void processNodeOutput (LexicalNode *node); void processWord (int wordID, TokenType *token, char isSil, LexicalNode *resultNode); void getBestPath (WLRType *wlr, bool outputXML, bool complete); void errorAnalysis (WLRType *wlr, int depth, bool outputXML); void addNodeToList (LexicalNode *node); void pruneToken(TokenType **token, float minLikelihood, float binSize = 0.0, int *bins = NULL); void touchWLRpath (WLRType *w); void touchWLRs (TokenType *token); bool printWordPronunciation(LexicalNode *node, int wordID); void checkTreeRobustness (LexicalNode *node); void initialiseSystem (); void setTreeStartEndMatrix (); TokenType *findBestToken (bool addEndOfSentence, bool complete, bool outputXML, bool notFinishedAllowed = false); LMLAGlobalListType *getLMLATable (int *lmHistory, bool onlyPrepare); int getLMLAHashKey (int *lmHistory); void createLMLAs (LMLAGlobalListType *lmlaGlobal, float *allLMP); void prepareLMLACreation (LexicalNode *node); void pruneLMLA (); LexicalNode *getPhoneString (LexicalNode *node, int wordID); void getPhoneAlignment (const char *prefix, PLRType *pt, int lastFrame, bool confident); LexicalNode *findCorrectNode (PLRType *pt, int wordID); #ifdef SEARCH_STATISTICS_ON void updateStats (); #endif public: bool alignmentIsEmpty (); int latticeBaumWelch_numberNodes (LexicalNode *node, int number, bool clear); void latticeBaumWelch_setLikelihoods (LexicalNode *node, Vector *t, int time, int numberOfStates, double *latticeLikelihood); void latticeBaumWelch_calculatePosteriors (double *latticeLikelihood, LexicalNode *node, double incomingScore, double *latticeBaumWelchAlfa, double *latticeBaumWelchBeta, double *posteriors, int time, int numberOfStates); void latticeBaumWelch_mmi_accumulatorsPosteriors(LexicalNode *node, double *posteriors, int numberOfStates, Vector *observation); void latticeBaumWelch_printPosteriors (LexicalNode *node, double *posteriors, int time, int numberOfStates, int timeOffset); void latticeBaumWelch_forward (double *latticeLikelihood, LexicalNode *node, double incomingScore, double *latticeBaumWelchAlfa, int time, int numberOfStates); void latticeBaumWelch_initForward (double *latticeBaumWelchAlfa); double latticeBaumWelch_backward (double *latticeLikelihood, LexicalNode *node, double *latticeBaumWelchBeta, int time, int numberOfStates, double normFactor, double *resArray); void latticeBaumWelch_initBackward(double *latticeBaumWelchBeta, int offset); void setLattice (FILE *latFile); char *getAlignmentString (); void printInitialSettings(const char *amName, const char *dctName, const char *backName, const char *lmName, bool outXML); void printFinalSettings (bool outXML, int totMilliSec, int totTime); void updateGlobalStats (); void checkAMs (int nrM, PhoneModel **models, bool outXML); void setAMs (PhoneModel **models); void setLM (LanguageModel *lm); int getWordID (const char *word); char* getWord (int wordID); int getNumberOfWords (); bool setForcedAlign (const char *string); void setAlignParallel (); virtual void initialiseTree (int startTime = 0); void setInitialLMHistory (const char *word1, const char *word2); void processVector (Vector **v, int time); void adaptAMs (Vector *v, int time); void getLogging (const char *string); #ifdef TEST_ARTICULATORY void testArticulatory (ArticulatoryStream *s); #endif int getBestIDSequence (int *idList, int maxLength, bool showSil, bool notFinishedAllowed); void getBestRecognition (bool addEndOfSentence, bool outputXML, bool complete, const char *label = NULL, int milliSec = 0, int totLength = 0, const char *beginTime = NULL, const char *endTime = NULL); double getBestRecognitionScore(); void storePLConfidence (int time); void setPhoneLoop (int nrP, PhoneModel **models); void overwritePrunePars (bool doHist, bool doBeam, bool doEndBeam, double beam, double state_beam, double endstate_beam,bool lmla,int histState, int hist); void overwriteWeightPars (double lmScale, double transPenalty, double silPenalty); bool safeBestRecognition (bool addEndOfSentence); LexicalNode *borrowPhoneString (const char *string); bool addForcedAlignOOV (LexicalNode *oovNode); void createWordTree (); void setPhoneLoopConfidence (float *phoneConf, int offset = 0); void printLattice (FILE *latFile, const char *label, int timeEnd); void printNBestList (FILE *nbestFile = NULL, LatticeNode *node = NULL); void setTokenDistributionFile (FILE *tdFile); void setLatticeGeneration(bool setting); }; #endif