/*************************************************************************** * This file is part of the 'Shout LVCS Recognition toolkit'. * *************************************************************************** * Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010 by Marijn Huijbregts * * * * This program is free software; you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * * the Free Software Foundation; version 2 of the License. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU General Public License * * along with this program; if not, write to the * * Free Software Foundation, Inc., * * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * ***************************************************************************/ #ifndef MODEL_H #define MODEL_H #include "standard.h" #include "mixgaussian.h" #include "vector.h" #include "featurepool.h" struct WLRType; struct WLRTracker; struct PLRType; struct TokenType; struct LatticeNode; struct WLRList; struct LMLAGlobalListType; struct LexicalNode; // So that we can make initialiseToken static extern bool doPhoneAlignment; typedef enum { UNKNOWN = 0, OOV, SEARCH, AMLM, LM, AM } BlameCluster; #define MAX_CLUSTERS (10) //////////////////////////////////////////////////////////////////////////// /// \brief This structure contains the name of an acoustic model and the /// number of samples that are used during training (per cluster). //////////////////////////////////////////////////////////////////////////// struct DataStats { char name[11]; int nrOfSamples[MAX_CLUSTERS]; }; //////////////////////////////////////////////////////////////////////////// /// \brief The PLRType, the Phone Link Record Type is the structure that contains /// the phone history information for a single word stored in a WLR (of struct WLRType). /// /// This structure is ONLY used when phone alignment is enabled! /// /// It contains the phoneID that is beeing pronounced, the time at which the /// sound started (timeStamp) and the likelihood that the particular phone is /// pronounced given the history defined by the previous variable. //////////////////////////////////////////////////////////////////////////// struct PLRType // Phone Link Record { int phoneID; int contextKey; unsigned char stateOffset[2]; int timeStamp; float likelihood; PLRType *previous; }; //////////////////////////////////////////////////////////////////////////// /// \brief The WLRType, the Word Link Record is the structure that contains /// word history information for tokens (of struct TokenType). /// /// The wordID variable defines the word that is recognized. /// The timeStamp variable contains the time at which the word is started. /// COMBlikelihood contains the combined AM/LM likelihood. /// (likelihoods are in the log scale). The variable usedAt is added for /// administrative purposes. This variable is updated each time frame as long as /// there is at least a single token history containing this WLR. If the usedAt /// variable is not updated, the WLR is deleted. The variable adminNext is used /// to make create a long list of WLRs. The previous variable points to the /// previous word pronounced. This variable is used to track the total word history. /// /// The phoneAlignment variable points to the phone alignment history of this /// word. This variable is ONLY used when phone alignment is enabled! /// //////////////////////////////////////////////////////////////////////////// struct WLRType // Word Link Record { int lmHistory[LM_NGRAM_DEPTH]; char isSil; int timeStamp; LatticeNode *lattice; WLRType **nBest; float COMBlikelihood; float LMlikelihood; int usedAt; WLRType *adminNext; WLRType *previous; PLRType *phoneAlignment; }; //////////////////////////////////////////////////////////////////////////// /// The NBest list administration is done using this data type: //////////////////////////////////////////////////////////////////////////// struct WLRTypeList { WLRType **nBest; WLRTypeList *next; }; //////////////////////////////////////////////////////////////////////////// /// The lattice administration is done using this data type: //////////////////////////////////////////////////////////////////////////// struct LatticeNode { int nodeNr; int wordID; int timeBegin; int timeEnd; WLRType *exampleWord; WLRList *inArcs; WLRList *outArcs; LatticeNode *adminNext; }; ///////////////////////////////////////////////////////////////////////////////// /// The WLRList is a list of WLR's :-) (used for arc-administration in lattices) ///////////////////////////////////////////////////////////////////////////////// struct WLRList { WLRType *wlr; WLRList *next; float amScore; // Also available in wlr, but this safes calculations. float lmScore; // Also available in wlr, but this safes calculations. float totScore; // Also available in wlr, but this safes calculations. }; //////////////////////////////////////////////////////////////////////////// /// \brief The WLRTracker type, is created to track the single correct recognition /// obtained during forced alignment. It can be used to check when the correct /// path was left because of (for example) pruning. /// /// The contextNext parameter can only be used when phone recognition is enabled. //////////////////////////////////////////////////////////////////////////// struct WLRTracker { WLRType w; WLRType *linkWord; int wordID; int errorRegionID; BlameCluster errorCategory; int contextNext; }; //////////////////////////////////////////////////////////////////////////// /// \brief The PhoneModel class defines the phone models, the LexicalTree class the tree structure and the TokenType /// struct is the glue between them. LexicalTree is responsible for the token flow between phones and for /// language model lookahead, while PhoneModels will fill the likelihood variables and decises if a /// token may be passed within the phone model. /// /// The TokenType structure is the data type of each token. Because a token /// will always be in a list and it shall be possible to search forwards and backwards /// in that list, the next and previous variables are added to the type. /// /// The likelihood variable and lookAheadV(alue) contain the current likelihood of the token /// and the part of this likelihood that is added due to language model lookahead. /// lookAheadV is stored in order to quickly being able to substract the lookahead from /// the real likelihood and replace it with a new value. /// /// The lookahead value is calculated with help of an entire LM lookahead tree. This tree contains all /// possible lookahead values and is shared between all tokens with the same LM history. The variable /// lmLookAhead contains a pointer to this lookahead tree. /// /// In order to make backtracking word history possible a Word Link Record list is stored in the path variable. /// If phone alignment is enabled, the phonePath variable contains the phone history information. /// //////////////////////////////////////////////////////////////////////////// struct TokenType { TokenType *next; float likelihood; WLRType *path; PLRType *phonePath; float lookAheadV; LMLAGlobalListType *lmLookAhead; }; //////////////////////////////////////////////////////////////////////////// /// \brief This structure contains all data for one single gaussian mixture /// state set. This includes the PDF and state transition probabilities. //////////////////////////////////////////////////////////////////////////// struct MixtureSet { MixGaussian *state; double transitionP_toSelf; double transitionP_toNext; double currentVectorP; }; //////////////////////////////////////////////////////////////////////////// /// \brief This structure contains a summary of the data in an acoustic /// model stored as a PhoneModel object. /// /// The parameters 'frameMeanLikelihood' and 'unused' are added for future use. /// Please ignore them (especialy frameMeanLikelihood, which is filled falsely). //////////////////////////////////////////////////////////////////////////// /// \todo Change this data type! (there is an 'unUsed' in read/write and isSil is double). struct ModelStats { char name[10]; int nrOfGaussians; int nrOfContexts; int maxNrOfContexts; int nrOfTrainOcc; double likelihood; double frameMeanLikelihood; double isSil; // Now a double, because there was already a double variable here "unused"... }; //////////////////////////////////////////////////////////////////////////// /// \brief The PhoneModel class handles likelihood calculation of phones given an observation sequence. /// /// For this implementation, the phone models are represented by Hidden Markov Models (HMM). /// Training the HMM is done by the TrainPhoneModel class. TrainPhoneModel stores the HMM paramaters /// in a binary file (all models together form the binary acoustic model file) and PhoneModel /// can load the model in memory at startup. /// /// PhoneModel objects are able to determine the likelihood that a phone is pronounced, /// given an observation sequence AND a TokenType string. PhoneModel does not store /// state tokens itself, it only handles the (static) parameters needed to calculate HMM likelihoods. /// Because the user handles the token administration (in LexicalTree), it is possible /// to use a phone model in more than one node, without copying its parameters. /// /// The acoustic models are context-dependent models. During training (TrainPhoneModel) it is decided /// which contexts share one or more states and transition probabilities. Each model contains a pool /// of states, stored in the variable mixtureSetData. The arrays stateMix_1, stateMix_2 and stateMix_3 /// contain for each context the index of mixtureSetData. stateMix_1 is used to determine which state /// of the pool-of-states should be used for the first state of the HMM. stateMix_2 is used for the /// second and stateMix_3 for the third. The index for stateMix_x is calculated as followes: /// /// contextKey = leftContext * numberOfPhones + rightContext /// /// Therefore, the first state of the context 'A' - 'l' - 's' (where this model is 'l' and the left/right /// context is 'A'/'s' and 'A' is the 2nd phone and 's' the 8th) can be found by: /// /// First HMM-state = mixtureSetData[stateMix_1[2*54+8]] /// //////////////////////////////////////////////////////////////////////////// class PhoneModel { public: /// \todo This is a quick fix to check if this can help SAT training. Will change it back to protected later! MixtureSet *mixtureSetData; ///< The entire set of states and transition parameters used by this model. protected: // Attributes: ModelStats statistics; ///< Statistical information about this acoustic model. int *timeStamp; ///< The model only calculates some context info once for every 'timestamp' frame. int *stateMix_1; ///< An array with the index i for mixtureSetData[i] for every context (state 1). int *stateMix_2; ///< An array with the index i for mixtureSetData[i] for every context (state 2). int *stateMix_3; ///< An array with the index i for mixtureSetData[i] for every context (state 3). bool isSil; int dimensions; int silRinglastPos; float *weightRinglastPos; public: // Constructor and destructor: PhoneModel (int dim = ASR_DEFAULT_VECTORSIZE); PhoneModel (FILE *inFile, int dim = ASR_DEFAULT_VECTORSIZE, bool onlyUseFastP = false); PhoneModel (MixGaussian *mix, double toNext); ~PhoneModel (); // Methods: protected: static bool replaceTokenLM (TokenType *nt,TokenType *token, float like, float *bestL); static void addChain_lmla (TokenType **tokenNew, float likelihood, TokenType *token, int index, DecoderSettings *settings, float *bestL); public: // INLINE FUNCTIONS FOR CODE OPTIMALIZATION: inline int dim () const {return dimensions;} inline bool isSilModel () const {return isSil;} static void addChain (TokenType **tokenNew, float likelihood, TokenType *token, int stateNr, int curTime, DecoderSettings *settings, float *bestL, bool checkCollission); static void addChain_ordered(TokenType **tokenNew, float likelihood, TokenType *token, int stateNr, int curTime, DecoderSettings *settings, float *bestL); static void initialiseToken(TokenType **token); void setSilTrans (double tr); int getStateNr (int contextKey, int state); ModelStats *getStatistics (void); int getNumberOfGaussians(); virtual int touchPDF (int contextKey, int t, MixGaussian **updateThese, double **resultHere); virtual void processVector (int contextKey, Vector *v, int t, int index, TokenType **token, int tLength, TokenType *inToken, DecoderSettings *settings, float *bestL); virtual void getOutput (int contextKey, TokenType **token, int tLength, TokenType **outToken, DecoderSettings *settings, float *bestL); virtual double getLookaheadLogP (double *vectorList, int timeStamp, bool doSecondHalf); void copyGaussians (MixGaussian *destMixGaussian, int maxNmbr); void resetPhoneAdmin (); void mapAdaptMeans (); void adapt_setInitialNode (Adapt_AM_TreeNode *node); void adapt_setNode (); void adapt_addAcumulatorData (int state, int contextKey, Vector *observation, double probability = 1.0); void adapt_setHelperMatrices (); void adapt_clear (); void adapt_adapt (); void adapt_setVarTrans (); void adapt_adaptVar (); void adapt_unAdapt (); void adapt_setAcumulators (int useLabel, int useSegmentation, FeaturePool *usePool); void writeAccumulators (FILE *file, FILE *fileF = NULL, FILE *fileST = NULL, bool doBinary = false); void addAccumulators (FILE *file); void writeModel (FILE *outFile); double getLogPDFProbability (int contextKey, Vector *v); double getPDFProbability (int contextKey, Vector *v, int stateNr, int time); double getTransition (int contextKey, bool toSelf, int stateNr); void getSilSumHist (Vector **histogram); static PLRType *copyPhonePath (PLRType *pP); static void initialisePhonePath (PLRType *t); void printModel(FILE *fileMean, FILE *fileVariance, FILE *fileWeight); int printInfo(Vector *v); }; #endif