/**
* @file recog.h
*
 * @brief Engine instance definitions
*
* This file defines the engine instance and all its sub instances.
* The top instance is Recog, and it consists of several
* sub instances for LM, AM, and recognition process instances.
*
* Each sub-instance keeps pointer to corresponding jconf setting
* part, and also has pointers to other instances to use.
* PROCESS_AM will be generated for each acoustic model, and PROCESS_LM
* will be for each language model.
*
 * MFCCCalc will be generated for each required MFCC frontend type,
 * determined by inspecting all AMs and the GMM.  AMs and GMMs that require
 * exactly the same MFCC frontend configuration will share a single MFCCCalc.
*
*
*
*
* Recog
* +- *JCONF
* +- input related work area
* +- MFCCCalc[] (linked list) (generated from HMM + GMM)
* +- PROCESS_AM[] (linked list)
* +- *pointer to JCONF_AM
* +- *pointer to MFCCCalc
* +- hmminfo, hmm_gs
* +- hmmwrk
* +- multipath, ccd_flag, cmn_loaded
* +- PROCESS_LM[] (linked list)
* +- *pointer to JCONF_LM
* +- *pointer to PROCESS_AM
* +- lmtype, lmvar
* +- winfo
* +- ngram or grammars
* +- lmfunc
* +- RecogProcess process[] (linked list)
* +- *pointer to JCONF_SEARCH
* +- *pointer to PROCESS_AM
* +- *pointer to PROCESS_LM
* +- lmtype, lmvar
* +- misc. param
* +- GMMCalc
* +- *JCONF_AM for GMM
* +- *pointer to MFCCCalc
*
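 * The sub instances above are chained as singly linked lists from Recog.
 * A minimal sketch of how they are typically walked (the variable names
 * are illustrative only):
 *
 * @code
 * MFCCCalc     *m;
 * PROCESS_AM   *am;
 * PROCESS_LM   *lm;
 * RecogProcess *p;
 *
 * for (m = recog->mfcclist; m; m = m->next) { }        // MFCC frontends
 * for (am = recog->amlist; am; am = am->next) { }      // acoustic models
 * for (lm = recog->lmlist; lm; lm = lm->next) { }      // language models
 * for (p = recog->process_list; p; p = p->next) {      // search processes
 *   // p->config, p->am and p->lm point to the instances used by p
 * }
 * @endcode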
*
* @author Akinobu Lee
* @date Fri Feb 16 13:42:28 2007
*
* $Revision: 1.17 $
*
*/
/*
* Copyright (c) 1991-2012 Kawahara Lab., Kyoto University
* Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology
* Copyright (c) 2005-2012 Julius project team, Nagoya Institute of Technology
* All rights reserved
*/
/*
*/
#ifndef __J_RECOG_H__
#define __J_RECOG_H__
#include <sent/stddefs.h>
#include <sent/hmm.h>
#include <sent/vocabulary.h>
#include <sent/ngram2.h>
#include <sent/dfa.h>
#include <julius/wchmm.h>
#include <julius/search.h>
#include <julius/callback.h>
#include <julius/jconf.h>
/*
How tokens are managed:
   o tlist[][] is a token stocker.  It holds all tokens in a sequential
   buffer.  They are malloced first on startup, and referred to by ID during
   the Viterbi procedure.  In word-pair mode, each token also has a link to
   another token to allow a node to have more than one token.

   o token[n] holds the current ID number of the token associated with
   lexicon tree node 'n'.
*/
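/*
  A minimal sketch (assuming a sentinel such as TOKENID_UNDEFINED marks an
  empty node; the actual helper functions live in the beam search code) of
  how the token on lexicon tree node 'node' is looked up in the current
  frame work area 'tn' of an FSBeam 'd':

    TOKENID id = d->token[node];            // token currently on this node
    if (id != TOKENID_UNDEFINED) {
      TOKEN2 *tk = &(d->tlist[d->tn][id]);  // token entity in the stocker
      // ... read or update the token here
    }
*/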
/**
* Work area for the first pass
*
*/
typedef struct __FSBeam__ {
/* token stocker */
TOKEN2 *tlist[2]; ///< Token space to hold all token entities.
TOKENID *tindex[2]; ///< Token index corresponding to @a tlist for sort
int maxtnum; ///< Allocated number of tokens (will grow)
int expand_step; ///< Number of tokens to be increased per expansion
  boolean expanded;		///< TRUE if tlist[] and tindex[] have been expanded at the last create_token()
int tnum[2]; ///< Current number of tokens used in @a tlist
int n_start; ///< Start index of in-beam nodes on @a tindex
  int n_end;			///< End index of in-beam nodes on @a tindex
int tl; ///< Current work area id (0 or 1, swapped for each frame)
int tn; ///< Next work area id (0 or 1, swapped for each frame)
#ifdef SCORE_PRUNING
LOGPROB score_pruning_max; ///< Maximum score at current frame
LOGPROB score_pruning_threshold;///< Score threshold for score pruning
int score_pruning_count; ///< Number of tokens pruned by score (debug)
#endif
/* Active token list */
TOKENID *token; ///< Active token list that holds currently assigned tokens for each tree node
#ifdef UNIGRAM_FACTORING
/* for wordend processing with 1-gram factoring */
LOGPROB wordend_best_score; ///< Best score of word-end nodes
int wordend_best_node; ///< Node id of the best wordend nodes
  TRELLIS_ATOM *wordend_best_tre; ///< Trellis word corresponding to the above
WORD_ID wordend_best_last_cword; ///< Last context-aware word of above
#endif
int totalnodenum; ///< Allocated number of nodes in @a token
TRELLIS_ATOM bos; ///< Special token for beginning-of-sentence
boolean nodes_malloced; ///< Flag to check if tokens already allocated
LOGPROB lm_weight; ///< Language score weight (local copy)
LOGPROB lm_penalty; ///< Word insertion penalty (local copy)
LOGPROB lm_penalty_trans; ///< Additional insertion penalty for transparent words (local copy)
LOGPROB penalty1; ///< Word insertion penalty for DFA (local copy)
#if defined(WPAIR) && defined(WPAIR_KEEP_NLIMIT)
  boolean wpair_keep_nlimit;	///< Keep only up to N tokens per node in word-pair approximation (local copy from jconf)
#endif
/* for short-pause segmentation */
boolean in_sparea; ///< TRUE when we are in a pause area now
int tmp_sparea_start; ///< Memorize where the current pause area begins
#ifdef SP_BREAK_RESUME_WORD_BEGIN
  WORD_ID tmp_sp_break_last_word; ///< Keep the max word hypothesis at the beginning of this segment as the starting word of the next segment
#else
  WORD_ID last_tre_word;	///< Keep the max word hypothesis at the end of this segment as the starting word of the next segment
#endif
boolean first_sparea; ///< TRUE when we are in the first pause area
  int sp_duration;		///< Number of current successive sp frames
#ifdef SPSEGMENT_NAIST
boolean after_trigger; ///< TRUE if speech already triggered
int trigger_duration; ///< Current speech duration at uptrigger detection
boolean want_rewind; ///< TRUE if process wants mfcc rewinding
int rewind_frame; ///< Place to rewind to
boolean want_rewind_reprocess; ///< TRUE if requires re-processing after rewind
#endif
  char *pausemodelnames;	///< Pause model names (string) for segment detection
  char **pausemodel;		///< List of pause model names for segment detection
  int pausemodelnum;		///< Number of entries in @a pausemodel
} FSBeam;
/**
* Work area for realtime processing of 1st pass
*
*/
typedef struct __RealBeam__ {
/* input parameter */
int maxframelen; ///< Maximum allowed input frame length
SP16 *window; ///< Window buffer for MFCC calculation
int windowlen; ///< Buffer length of @a window
int windownum; ///< Currently left samples in @a window
/* for short-pause segmentation */
boolean last_is_segmented; ///< TRUE if last pass was a segmented input
SP16 *rest_Speech; ///< Speech samples left unprocessed by segmentation at previous segment
int rest_alloc_len; ///< Allocated length of rest_Speech
int rest_len; ///< Current stored length of rest_Speech
} RealBeam;
/**
* Work area for the 2nd pass
*
*/
typedef struct __StackDecode__ {
int hypo_len_count[MAXSEQNUM+1]; ///< Count of popped hypothesis per each length
int maximum_filled_length; ///< Current least beam-filled depth
#ifdef SCAN_BEAM
LOGPROB *framemaxscore; ///< Maximum score of each frame on 2nd pass for score enveloping
#endif
NODE *stocker_root; ///< Node stocker for recycle
int popctr; ///< Num of popped hypotheses from stack
int genectr; ///< Num of generated hypotheses
int pushctr; ///< Num of hypotheses actually pushed to stack
int finishnum; ///< Num of found sentence hypothesis
NODE *current; ///< Current node for debug
#ifdef CONFIDENCE_MEASURE
LOGPROB cm_alpha; ///< alpha scaling value from jconf
# ifdef CM_MULTIPLE_ALPHA
LOGPROB *cmsumlist; ///< Sum of cm score for each alpha coef.
int cmsumlistlen; ///< Allocated length of cmsumlist.
# endif
# ifdef CM_SEARCH
  LOGPROB cm_tmpbestscore;	///< Temporary best score for summing up scores
# ifndef CM_MULTIPLE_ALPHA
LOGPROB cm_tmpsum; ///< Sum of CM score
# endif
int l_stacksize; ///< Local stack size for CM
int l_stacknum; ///< Num of hypo. in local stack for CM
NODE *l_start; ///< Top node of local stack for CM
NODE *l_bottom; ///< bottom node of local stack for CM
# endif
# ifdef CM_NBEST
  LOGPROB *sentcm;		///< Confidence score of each sentence
  LOGPROB *wordcm;		///< Confidence score of each word voted from @a sentcm
int sentnum; ///< Allocated length of @a sentcm
int wordnum; ///< Allocated length of @a wordcm
# endif
#endif /* CONFIDENCE_MEASURE */
  LOGPROB *wordtrellis[2];	///< Buffer to compute the Viterbi path of a word
  LOGPROB *g;			///< Buffer to hold source Viterbi scores
HMM_Logical **phmmseq; ///< Phoneme sequence to be computed
int phmmlen_max; ///< Maximum length of @a phmmseq.
boolean *has_sp; ///< Mark which phoneme allow short pause for multi-path mode
#ifdef GRAPHOUT_PRECISE_BOUNDARY
short *wend_token_frame[2]; ///< Propagating token of word-end frame to detect corresponding end-of-words at word head
LOGPROB *wend_token_gscore[2]; ///< Propagating token of scores at word-end to detect corresponding end-of-words at word head
short *wef; ///< Work area for word-end frame tokens for v2
LOGPROB *wes; ///< Work area for word-end score tokens for v2
#endif
WORD_ID *cnword; ///< Work area for N-gram computation
WORD_ID *cnwordrev; ///< Work area for N-gram computation
} StackDecode;
/**
* User LM function entry point
*
*/
typedef struct {
  LOGPROB (*uniprob)(WORD_INFO *, WORD_ID, LOGPROB); ///< Pointer to function returning word occurrence probability
LOGPROB (*biprob)(WORD_INFO *, WORD_ID, WORD_ID, LOGPROB); ///< Pointer to function returning a word probability given a word context (corresponds to bi-gram)
LOGPROB (*lmprob)(WORD_INFO *, WORD_ID *, int, WORD_ID, LOGPROB); ///< Pointer to function returning LM probability
} LMFunc;
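/*
 * A minimal sketch of a user-defined 1-gram probability function that could
 * be stored in LMFunc.uniprob (the function name and the fixed penalty are
 * illustrative; registration of user LM functions is done elsewhere in
 * JuliusLib):
 *
 *   static LOGPROB my_uniprob(WORD_INFO *winfo, WORD_ID w, LOGPROB ngram_prob)
 *   {
 *     // fall back to the original N-gram score with a constant penalty
 *     return ngram_prob - 2.0;
 *   }
 *
 *   // lm->lmfunc.uniprob = my_uniprob;
 */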
/**
* Work area for GMM calculation
*
*/
typedef struct __gmm_calc__{
  LOGPROB *gmm_score;		///< Current accumulated scores for each GMM
  boolean *is_voice;		///< TRUE if the corresponding model designates speech, FALSE if noise
int framecount; ///< Current frame count
  short OP_nstream;		///< Number of input streams for GMM
  VECT *OP_vec_stream[MAXSTREAMNUM]; ///< Input vector of each stream at the current frame
  short OP_veclen_stream[MAXSTREAMNUM];	///< Vector length of each stream
LOGPROB *OP_calced_score; ///< Work area for Gaussian pruning on GMM: scores
int *OP_calced_id; ///< Work area for Gaussian pruning on GMM: id
int OP_calced_num; ///< Work area for Gaussian pruning on GMM: number of above
int OP_calced_maxnum; ///< Work area for Gaussian pruning on GMM: size of allocated area
int OP_gprune_num; ///< Number of Gaussians to be computed in Gaussian pruning
VECT *OP_vec; ///< Local workarea to hold the input vector of current frame
short OP_veclen; ///< Local workarea to hold the length of above
HTK_HMM_Data *max_d; ///< Hold model of the maximum score
int max_i; ///< Index of max_d
#ifdef CONFIDENCE_MEASURE
LOGPROB gmm_max_cm; ///< Hold maximum score
#endif
#ifdef GMM_VAD
  LOGPROB *rates;		///< Voice rate of recent N frames (cycle buffer)
  int nframe;			///< Length of @a rates
  boolean filled;		///< TRUE if the cycle buffer @a rates has been filled at least once
int framep; ///< Current frame pointer
  boolean in_voice;		///< TRUE if currently in a voice area
  boolean up_trigger;		///< TRUE when an up trigger has been detected
  boolean down_trigger;		///< TRUE when a down trigger has been detected
  boolean after_trigger;	///< TRUE while currently processing a speech segment
boolean want_rewind; ///< TRUE if GMM wants rewinding its MFCC
boolean want_rewind_reprocess; ///< TRUE if GMM wants re-processing after rewind
int rewind_frame; ///< Frame to rewind
int duration; ///< Current GMM duration work
#endif
} GMMCalc;
/**
* Alignment result, valid when forced alignment was done
*
*/
typedef struct __sentence_align__ {
int num; ///< Number of units
short unittype; ///< Unit type (one of PER_*)
  WORD_ID *w;			///< Word sequence by ID (PER_WORD)
  HMM_Logical **ph;		///< Phone sequence (PER_PHONEME, PER_STATE)
  short *loc;			///< Sequence of state locations within a phone (PER_STATE)
  boolean *is_iwsp;		///< TRUE if PER_STATE and this is the inter-word pause state in multi-path mode
  int *begin_frame;		///< List of beginning frames
  int *end_frame;		///< List of ending frames
LOGPROB *avgscore; ///< Score averaged by frames
LOGPROB allscore; ///< Re-computed acoustic score
struct __sentence_align__ *next; ///< data chain pointer
} SentenceAlign;
/**
* Output result structure
*
*/
typedef struct __sentence__ {
  WORD_ID word[MAXSEQNUM];	///< Sequence of word IDs
int word_num; ///< Number of words in the sentence
LOGPROB score; ///< Likelihood (LM+AM)
LOGPROB confidence[MAXSEQNUM]; ///< Word confidence scores
LOGPROB score_lm; ///< Language model likelihood (scaled) for N-gram
LOGPROB score_am; ///< Acoustic model likelihood for N-gram
int gram_id; ///< The grammar ID this sentence belongs to for DFA
  SentenceAlign *align;		///< Forced alignment result (valid when forced alignment was performed)
} Sentence;
/**
* A/D-in work area
*
*/
typedef struct __adin__ {
/* functions */
/// Pointer to function for device initialization (call once on startup)
boolean (*ad_standby)(int, void *);
/// Pointer to function to open audio stream for capturing
boolean (*ad_begin)(char *);
/// Pointer to function to close audio stream capturing
boolean (*ad_end)();
/// Pointer to function to begin / restart recording
boolean (*ad_resume)();
/// Pointer to function to pause recording
boolean (*ad_pause)();
/// Pointer to function to terminate current recording immediately
boolean (*ad_terminate)();
/// Pointer to function to read samples
int (*ad_read)(SP16 *, int);
/// Pointer to function to return current input source name (filename, devname, etc.)
char * (*ad_input_name)();
/* configuration parameters */
int thres; ///< Input Level threshold (0-32767)
int noise_zerocross; ///< Computed threshold of zerocross num in the cycle buffer
int nc_max; ///< Computed number of fragments for tail margin
  int chunk_size;		///< Audio processing unit in samples
  boolean adin_cut_on;		///< TRUE if input segmentation by silence is enabled
  boolean silence_cut_default;	///< Device-dependent default value of @a adin_cut_on
  boolean strip_flag;		///< TRUE if invalid zero samples should be skipped
  boolean enable_thread;	///< TRUE if the input device needs threading
  boolean need_zmean;		///< TRUE if zmeansource (DC offset removal) should be performed
float level_coef; ///< Input level scaling factor
/* work area */
  int c_length;			///< Computed length of cycle buffer for zero-cross, actually equals the head margin length
  int c_offset;			///< Static data DC offset (obsolete, should be 0)
SP16 *swapbuf; ///< Buffer for re-triggering in tail margin
int sbsize; ///< Size of @a swapbuf
int sblen; ///< Current length of @a swapbuf
int rest_tail; ///< Samples not processed yet in swap buffer
ZEROCROSS zc; ///< Work area for zero-cross computation
#ifdef HAVE_PTHREAD
/* Variables related to POSIX threading */
pthread_t adin_thread; ///< Thread information
pthread_mutex_t mutex; ///< Lock primitive
SP16 *speech; ///< Unprocessed samples recorded by A/D-in thread
int speechlen; ///< Current length of @a speech
/*
* Semaphore to start/stop recognition.
*
   * If TRUE, the A/D-in thread will store incoming samples to @a speech and
   * the main thread will detect and process them.
   * If FALSE, the A/D-in thread will still capture input and check the trigger
   * in the same way, but will not store the samples to @a speech.
*
*/
boolean transfer_online;
/**
   * TRUE if buffer overflow occurred in the A/D-in thread.
*
*/
boolean adinthread_buffer_overflowed;
/**
* TRUE if adin thread ended
*
*/
boolean adinthread_ended;
  boolean ignore_speech_while_recog; ///< TRUE to ignore speech input between calls, while waiting for the recognition process
#endif
/* Input data buffer */
SP16 *buffer; ///< Temporary buffer to hold input samples
int bpmax; ///< Maximum length of @a buffer
int bp; ///< Current point to store the next data
int current_len; ///< Current length of stored samples
SP16 *cbuf; ///< Buffer for flushing cycle buffer just after detecting trigger
  boolean down_sample;		///< TRUE if down-sampling from 48kHz to 16kHz is performed
  SP16 *buffer48;		///< Another temporary buffer to hold 48kHz inputs
  int io_rate;			///< Frequency conversion rate (should always be 3 for 48kHz-to-16kHz conversion)
boolean is_valid_data; ///< TRUE if we are now triggered
int nc; ///< count of current tail silence segments
boolean end_of_stream; ///< TRUE if we have reached the end of stream
boolean need_init; ///< if TRUE, initialize buffer on startup
DS_BUFFER *ds; ///< Filter buffer for 48-to-16 conversion
  boolean rehash;		///< TRUE if rehashing is wanted when rewinding on decoder-based VAD
  boolean input_side_segment;	///< TRUE if segmentation was requested by ad_read
  unsigned int total_captured_len; ///< Total number of recorded samples from start until now
  unsigned int last_trigger_sample; ///< Sample at which the last speech area was triggered
  unsigned int last_trigger_len; ///< Length of the last speech area
char current_input_name[MAXPATHLEN]; ///< File or device name of current input
} ADIn;
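/*
 * A minimal sketch of an ad_read()-style callback that feeds samples from a
 * memory buffer (the static variables and the end-of-stream return value
 * used here are illustrative; check the actual audio input sources for the
 * exact return-code contract):
 *
 *   static SP16 *src_buf;
 *   static int src_len, src_pos;
 *
 *   static int my_ad_read(SP16 *buf, int sampnum)
 *   {
 *     int n = src_len - src_pos;
 *     if (n <= 0) return -1;                  // assume -1 means end of stream
 *     if (n > sampnum) n = sampnum;
 *     memcpy(buf, &src_buf[src_pos], n * sizeof(SP16));
 *     src_pos += n;
 *     return n;                               // number of samples stored
 *   }
 */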
/**
 * Recognition result output structure.  You may want to use it together
 * with the model data to get fully detailed results.
*
*/
typedef struct __Output__ {
/**
* 1: recognition in progress
* 0: recognition succeeded (at least one candidate has been found)
* -1: search failed, no candidate has been found
* -2: input rejected by short input
* -3: input rejected by GMM
*
*/
int status;
int num_frame; ///< Number of frames of the recognized part
  int length_msec;		///< Length of the recognized part in milliseconds
Sentence *sent; ///< List of (N-best) recognition result sentences
int sentnum; ///< Number of sentences
WordGraph *wg1; ///< List of word graph generated on 1st pass
int wg1_num; ///< Num of words in the wg1
WordGraph *wg; ///< List of word graph
CN_CLUSTER *confnet; ///< List of confusion network clusters
Sentence pass1; ///< Recognition result on the 1st pass
} Output;
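/*
 * A minimal sketch of reading the N-best output, e.g. inside a
 * CALLBACK_RESULT handler (the loop variables are illustrative; word
 * strings are taken from the word dictionary of the process's LM):
 *
 *   RecogProcess *r;
 *   Sentence *s;
 *   int n, i;
 *
 *   for (r = recog->process_list; r; r = r->next) {
 *     if (!r->live || r->result.status < 0) continue;  // failed or rejected
 *     for (n = 0; n < r->result.sentnum; n++) {
 *       s = &(r->result.sent[n]);
 *       for (i = 0; i < s->word_num; i++) {
 *         printf("%s ", r->lm->winfo->woutput[s->word[i]]);
 *       }
 *       printf("(score=%f)\n", s->score);
 *     }
 *   }
 */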
/**********************************************************************/
/**********************************************************************/
/**********************************************************************/
/**
* instance for a parameter vector computation
*
*/
typedef struct __mfcc_calc__ {
/**
* Unique id
*
*/
short id;
/**
* Parameter setting (entity in JCONF_AM)
*
*/
Value *para;
/**
* TRUE if the para came from "-htkconf"
*
*/
boolean htk_loaded;
/**
* TRUE if the para came from binhmm embedded header
*
*/
boolean hmm_loaded;
/**
* Check input parameter type with header of the hmmdefs
* (-notypecheck to unset)
*/
boolean paramtype_check_flag;
/**
* Parameter extraction work area
*
*/
MFCCWork *wrk;
/**
* Parameter vector sequence to be recognized
*
*/
HTK_Param *param;
/**
* Rest parameter for next segment for short-pause segmentation
*/
HTK_Param *rest_param;
/**
* Work area and setting for cepstral mean normalization
*
*/
struct {
/**
* CMN: load initial cepstral mean from file at startup (-cmnload)
*/
char *load_filename;
/**
* CMN: update cepstral mean while recognition
* (-cmnnoupdate to unset)
*/
boolean update;
/**
* CMN: save cepstral mean to file at end of every recognition (-cmnsave)
*/
char *save_filename;
/**
     * CMN: MAP weight for the initial cepstral mean (-cmnmapweight)
*/
float map_weight;
/**
* TRUE if CMN parameter loaded from file at boot up
*/
boolean loaded;
/**
* realtime CMN work area
*
*/
CMNWork *wrk;
} cmn;
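  /*
   * Example jconf lines corresponding to the CMN settings above
   * (file names and the weight value are illustrative):
   *
   *   -cmnload initial.cmn      load an initial cepstral mean at startup
   *   -cmnsave latest.cmn       save the updated mean after each input
   *   -cmnmapweight 100.0       MAP weight for the initial mean
   *   -cmnnoupdate              do not update the mean while recognizing
   */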
/**
* Work area for front-end processing
*
*/
struct {
/**
* Estimated noise spectrum
*/
float *ssbuf;
/**
* Length of @a ssbuf
*/
int sslen;
/**
* Alpha coefficient for spectral subtraction
*
*/
float ss_alpha;
/**
* Flooring coefficient for spectral subtraction
*
*/
float ss_floor;
/**
* SS: compute noise spectrum from head silence on file input (-sscalc)
*/
boolean sscalc;
/**
* With "-sscalc", specify noise length at input head in msec (-sscalclen)
*/
int sscalc_len;
/**
* Load noise spectrum data from file (-ssload), that was made by "mkss".
*/
char *ssload_filename;
/**
* Parameter extraction work area for spectral subtraction
*
*/
MFCCWork *mfccwrk_ss;
} frontend;
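  /*
   * Example jconf lines for the spectral subtraction settings above
   * (file name and value are illustrative):
   *
   *   -sscalc                   estimate noise spectrum from head silence (file input)
   *   -sscalclen 300            length of the head noise part in msec
   *   -ssload noise.ss          or load a noise spectrum file made by "mkss"
   */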
/**
   * Work area for energy normalization in real-time processing
*
*/
ENERGYWork ewrk;
/**
* delta MFCC cycle buffer
*
*/
DeltaBuf *db;
/**
* accel MFCC cycle buffer
*
*/
DeltaBuf *ab;
/**
   * Working buffer holding the MFCC vector currently being computed
*
*/
VECT *tmpmfcc;
/**
* FALSE indicates that the current frame (f) is not valid and should
* not be used for recognition
*
*/
boolean valid;
/**
* Current frame
*
*/
int f;
/**
* Processed frame length when segmented
*
*/
int last_time;
/**
   * Re-start frame if segmented
*
*/
int sparea_start;
/**
   * TRUE if a parent instance has decided to segment
*
*/
boolean segmented;
/**
   * TRUE if an input function has decided to segment
*
*/
boolean segmented_by_input;
/**
   * ID of a plugin module if MFCC should be obtained via a plugin
*
*/
int plugin_source;
/**
* Function entry points for plugin input
*
*/
struct {
/// Pointer to function for device initialization (call once on startup)
boolean (*fv_standby)();
/// Pointer to function to open audio stream for capturing
boolean (*fv_begin)();
/// Pointer to function to read samples
int (*fv_read)(VECT *, int);
/// Pointer to function to close audio stream capturing
boolean (*fv_end)();
/// Pointer to function to begin / restart recording
boolean (*fv_resume)();
/// Pointer to function to pause recording
boolean (*fv_pause)();
/// Pointer to function to terminate current recording immediately
boolean (*fv_terminate)();
/// Pointer to function to return current input name
char * (*fv_input_name)();
} func;
#ifdef POWER_REJECT
  float avg_power;		///< Average input power, used for power-based rejection
#endif
/**
* pointer to next
*
*/
struct __mfcc_calc__ *next;
} MFCCCalc;
/**
* instance for an AM.
*
*/
typedef struct __process_am__ {
/**
* Configuration parameters
*
*/
JCONF_AM *config;
/**
* Corresponding input parameter vector instance
*
*/
MFCCCalc *mfcc;
/**
* Main phoneme HMM
*/
HTK_HMM_INFO *hmminfo;
/**
* HMM for Gaussian Selection
*/
HTK_HMM_INFO *hmm_gs;
/**
* Work area and outprob cache for HMM output probability computation
*/
HMMWork hmmwrk;
/**
* pointer to next
*
*/
struct __process_am__ *next;
} PROCESS_AM;
/**
* instance for a LM.
*
*/
typedef struct __process_lm__ {
/**
* Configuration parameters
*
*/
JCONF_LM *config;
/**
* Corresponding AM
*
*/
PROCESS_AM *am;
/**
   * The LM type of this model holder; set from the Jconf used for loading
*
*/
int lmtype;
/**
   * The LM variation type of this model holder; set from the Jconf
   * used for loading
*
*/
int lmvar;
/**
* Main Word dictionary for all LM types
*/
WORD_INFO *winfo;
/**
* Main N-gram language model (do not use with grammars)
*/
NGRAM_INFO *ngram;
/**
* List of all loaded grammars (do not use with ngram)
*/
MULTIGRAM *grammars;
/**
* Current maximum value of assigned grammar ID.
* A new grammar ID will be assigned to each new grammar.
*
*/
int gram_maxid;
/**
   * Global DFA for recognition.  This will be generated from @a grammars,
   * concatenating each DFA into one.
*/
DFA_INFO *dfa;
/**
* TRUE if modified in multigram_update()
*
*/
boolean global_modified;
/**
* LM User function entry point
*
*/
LMFunc lmfunc;
/**
* pointer to next
*
*/
struct __process_lm__ *next;
} PROCESS_LM;
/**
* instance for a decoding, i.e. set of LM, AM and parameters
*
*/
typedef struct __recogprocess__ {
/**
   * TRUE if this instance is alive, or FALSE when temporarily disabled.
*
*/
boolean live;
/**
   * 1 if this instance should be made alive at the next recognition,
   * -1 if it should be made dead at the next recognition,
   * or 0 to leave it unchanged.
*
*/
short active;
/**
* search configuration data
*
*/
JCONF_SEARCH *config;
/**
* acoustic model instance to use
*
*/
PROCESS_AM *am;
/**
* language model instance to use
*
*/
PROCESS_LM *lm;
/**
* Language model type: one of LM_UNDEF, LM_NGRAM, LM_DFA
*
*/
int lmtype;
/**
* Variation type of language model: one of LM_NGRAM, LM_DFA_GRAMMAR,
* LM_DFA_WORD
*
*/
int lmvar;
/**
   * Whether to handle phone context dependency (local copy from jconf)
*/
boolean ccd_flag;
/**
* Word-conjunction HMM as tree lexicon
*/
WCHMM_INFO *wchmm;
/**
* Actual beam width of 1st pass (will be set on startup)
*/
int trellis_beam_width;
/**
* Word trellis index generated at the 1st pass
*/
BACKTRELLIS *backtrellis;
/**
* Work area for the first pass
*/
FSBeam pass1;
/**
* Work area for second pass
*
*/
StackDecode pass2;
/**
* Word sequence of best hypothesis on 1st pass
*/
WORD_ID pass1_wseq[MAXSEQNUM];
/**
* Number of words in @a pass1_wseq
*/
int pass1_wnum;
/**
* Score of @a pass1_wseq
*/
LOGPROB pass1_score;
/**
   * Last maximum word hypothesis at the beginning point, for short-pause segmentation
*/
WORD_ID sp_break_last_word;
/**
* Last (not transparent) context word for LM for short-pause segmentation
*/
WORD_ID sp_break_last_nword;
/**
* Allow override of last context word from result of 2nd pass for short-pause segmentation
*/
boolean sp_break_last_nword_allow_override;
/**
* Search start word on 2nd pass for short-pause segmentation
*/
WORD_ID sp_break_2_begin_word;
/**
* Search end word on 2nd pass for short-pause segmentation
*/
WORD_ID sp_break_2_end_word;
/**
* Input length in frames
*/
int peseqlen;
/**
* GraphOut: total number of words in the generated graph
*/
int graph_totalwordnum;
/**
* Recognition results
*
*/
Output result;
/**
   * Graphout: set from the value of jconf->graph.enabled
*
*/
boolean graphout;
/**
   * Temporary matrix work area to hold the order relations between words
* for confusion network construction.
*
*/
char *order_matrix;
/**
* Number of words to be expressed in the order matrix for confusion network
* construction.
*
*/
int order_matrix_count;
#ifdef DETERMINE
int determine_count;
LOGPROB determine_maxnodescore;
boolean determined;
LOGPROB determine_last_wid;
boolean have_determine;
#endif
/**
* TRUE if has something to output at CALLBACK_RESULT_PASS1_INTERIM.
*
*/
boolean have_interim;
/**
   * User-defined data hook.  JuliusLib does not touch its content.
*
*/
void *hook;
/**
* Pointer to next instance
*
*/
struct __recogprocess__ *next;
} RecogProcess;
/**
* Top level instance for the whole recognition process
*
*/
typedef struct __Recog__ {
/*******************************************/
/**
* User-specified configuration parameters
*
*/
Jconf *jconf;
/*******************************************/
/**
* A/D-in buffers
*
*/
ADIn *adin;
/**
* Work area for the realtime processing of first pass
*/
RealBeam real;
/**
* Linked list of MFCC calculation/reading instances
*
*/
MFCCCalc *mfcclist;
/**
* Linked list of acoustic model instances
*
*/
PROCESS_AM *amlist;
/**
* Linked list of language model instances
*
*/
PROCESS_LM *lmlist;
/**
* Linked list of recognition process instances
*
*/
RecogProcess *process_list;
/**
* TRUE when engine is processing a segment (for short-pause segmentation)
*
*/
boolean process_segment;
/*******************************************/
/* inputs */
/**
* Input speech data
*/
SP16 *speech;
/**
   * Allocated length of @a speech
*
*/
int speechalloclen;
/**
* Input length in samples
*/
int speechlen;
/**
* Input length in frames
*/
int peseqlen;
/*******************************************/
/**
* GMM definitions
*
*/
HTK_HMM_INFO *gmm;
/**
* Pointer to MFCC instance for GMM
*
*/
MFCCCalc *gmmmfcc;
/**
* Work area for GMM calculation
*
*/
GMMCalc *gc;
/*******************************************/
/* misc. */
/**
   * Status flag indicating whether recognition is alive or not.  If
   * TRUE, the process is currently activated, either monitoring an
   * audio input or recognizing the current input.  If FALSE, recognition
   * is disabled until an activation command arrives from a client.
   * While disabled, all inputs are ignored.
   *
   * If set to FALSE in the program, Julius/Julian will stop after
   * the current recognition ends, and enter the disabled status.
*
*/
boolean process_active;
/**
   * If set to TRUE, Julius/Julian stops recognition immediately, terminating
   * the current recognition process, and enters the disabled status.
*
*/
boolean process_want_terminate;
/**
   * If set to TRUE, Julius/Julian stops recognition softly.  If it is
   * performing the 1st pass, it immediately segments the current input,
   * processes the 2nd pass, and outputs the result.  Then it enters
   * the disabled status.
*
*/
boolean process_want_reload;
/**
   * When to refresh the global lexicon if a new grammar is received
   * during recognition (for DFA)
*
*/
short gram_switch_input_method;
/**
   * TRUE if the audio stream is now open and the engine is either listening
   * to the audio stream or recognizing speech.  FALSE on startup or when
   * paused by a module command.
*
*/
boolean process_online;
/**
* Function pointer to parameter vector computation for realtime 1st pass.
* default: RealTimeMFCC() in realtime-1stpass.c
*
*/
boolean (*calc_vector)(MFCCCalc *, SP16 *, int);
/**
   * TRUE when input has been triggered and some recognition has started,
   * FALSE if the engine terminated with no input.
*
*/
boolean triggered;
/**
* Callback entry point
*
*/
void (*callback_function[SIZEOF_CALLBACK_ID][MAX_CALLBACK_HOOK])();
/**
* Callback user data
*
*/
void *callback_user_data[SIZEOF_CALLBACK_ID][MAX_CALLBACK_HOOK];
/**
   * Number of callbacks registered for each callback code
*
*/
int callback_function_num[SIZEOF_CALLBACK_ID];
/**
* Callback function code list
*
*/
int callback_list_code[MAX_CALLBACK_HOOK*SIZEOF_CALLBACK_ID];
/**
* Callback function location list
*
*/
int callback_list_loc[MAX_CALLBACK_HOOK*SIZEOF_CALLBACK_ID];
/**
* Number of callbacks
*
*/
int callback_num;
/*******************************************/
/**
   * User-defined data hook.  JuliusLib does not touch its content.
*
*/
void *hook;
} Recog;
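/*
 * Typical life cycle of a Recog instance when used through JuliusLib
 * (a sketch: "my.jconf" and the output_result handler are placeholders,
 * the helper functions are declared in other JuliusLib headers, and all
 * error checking is omitted):
 *
 *   Jconf *jconf = j_config_load_file_new("my.jconf");
 *   Recog *recog = j_create_instance_from_jconf(jconf);
 *   callback_add(recog, CALLBACK_RESULT, output_result, NULL);
 *   if (j_adin_init(recog) && j_open_stream(recog, NULL) == 0) {
 *     j_recognize_stream(recog);
 *     j_close_stream(recog);
 *   }
 *   j_recog_free(recog);
 */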
#endif /* __J_RECOG_H__ */