/**
* @file mfcc.h
*
*
* @brief MFCC計算のための定義
*
* このファイルには,音声波形データからMFCC形式の特徴量ベクトル系列を
* 計算するための構造体の定義およびデフォルト値が含まれています.
* デフォルト値は Julius とともに配布されている音響モデルで使用している
* 値であり,HTKのデフォルトとは値が異なる部分がありますので注意して下さい.
*
*
* @brief Definitions for MFCC computation
*
* This file contains structures and default values for extracting speech
* parameter vectors of Mel-Frequency Cepstral Coefficients (MFCC).
* The default values here are the ones used in the standard acoustic models
* distributed together with Julius, and some of them differ from the
* HTK defaults, so be careful with the default values.
*
*
* @sa libsent/src/wav2mfcc/wav2mfcc.c
* @sa libsent/src/wav2mfcc/wav2mfcc-pipe.c
* @sa julius/wav2mfcc.c
* @sa julius/realtime-1stpass.c
*
* @author Akinobu LEE
* @date Fri Feb 11 03:40:52 2005
*
* $Revision: 1.6 $
*
*/
/************************************************************************/
/* mfcc.h */
/* */
/* Author : Yuichiro Nakano */
/************************************************************************/
#ifndef __MFCC_H__
#define __MFCC_H__
/// DEBUG: define this to enable debug messages for sin/cos table operations.
/// It is explicitly undefined here, so the messages are off unless this line is changed.
#undef MFCC_TABLE_DEBUG
#define CPMAX 500 ///< Maximum number of frames to store cepstral mean for realtime CMN update
#define CPSTEP 5 ///< Allocation step of the cmean list per sentence
#include
#include
#include
#include
/*
 * Default analysis parameters.
 * As stated in the file header, these are the values used by the acoustic
 * models distributed with Julius; some of them differ from the HTK defaults.
 */
#define DEF_SMPPERIOD 625 ///< Default sampling period in 100ns units (625 * 100ns = 62.5us, i.e. 16kHz)
#define DEF_FRAMESIZE 400 ///< Default window size in samples, similar to WINDOWSIZE in HTK (unit is different)
#define DEF_FFTNUM 512 ///< Number of FFT steps
#define DEF_FRAMESHIFT 160 ///< Default frame shift length in samples
#define DEF_PREENPH 0.97 ///< Default pre-emphasis coefficient, corresponds to PREEMCOEF in HTK (macro name spelling kept as is: it is the public name)
#define DEF_MFCCDIM 12 ///< Default number of MFCC dimensions, corresponds to NUMCEPS in HTK
#define DEF_CEPLIF 22 ///< Default cepstral liftering coefficient, corresponds to CEPLIFTER in HTK
#define DEF_FBANK 24 ///< Default number of filterbank channels, corresponds to NUMCHANS in HTK
#define DEF_DELWIN 2 ///< Default delta window size, corresponds to DELTAWINDOW in HTK
#define DEF_ACCWIN 2 ///< Default acceleration window size, corresponds to ACCWINDOW in HTK
#define DEF_SILFLOOR 50.0 ///< Default energy silence floor in dB, corresponds to SILFLOOR in HTK
#define DEF_ESCALE 1.0 ///< Default scaling coefficient of log energy, corresponds to ESCALE in HTK
#define DEF_SSALPHA 2.0 ///< Default alpha coefficient for spectral subtraction
#define DEF_SSFLOOR 0.5 ///< Default flooring coefficient for spectral subtraction
/* Version history of the embeddable part of the Value structure: */
/* version 2 ... ss_floor and ss_alpha removed */
/* version 3 ... usepower added */
/* NOTE(review): presumably this must be incremented whenever the embedded
   fields of Value change -- confirm against the code that reads/writes it. */
#define VALUE_VERSION 3 ///< Integer version number of Value, for embedding
/// MFCC configuration parameter values.
/// The first group of fields is the part intended for embedding (see
/// VALUE_VERSION); the second group is derived from the acoustic model
/// header or from run-time variables.
typedef struct {
int smp_period; ///< Sampling period in 100ns units
int smp_freq; ///< Sampling frequency
int framesize; ///< Window size in samples, similar to WINDOWSIZE in HTK (unit is different)
int frameshift; ///< Frame shift length in samples
float preEmph; ///< Pre-emphasis coefficient, corresponds to PREEMCOEF in HTK
int lifter; ///< Cepstral liftering coefficient, corresponds to CEPLIFTER in HTK
int fbank_num; ///< Number of filterbank channels, corresponds to NUMCHANS in HTK
int delWin; ///< Delta window size, corresponds to DELTAWINDOW in HTK
int accWin; ///< Acceleration window size, corresponds to ACCWINDOW in HTK
float silFloor; ///< Energy silence floor in dB, corresponds to SILFLOOR in HTK
float escale; ///< Scaling coefficient of log energy, corresponds to ESCALE in HTK
int hipass; ///< High frequency cut-off in fbank analysis, -1 if disabled, corresponds to HIFREQ in HTK
int lopass; ///< Low frequency cut-off in fbank analysis, -1 if disabled, corresponds to LOFREQ in HTK
int enormal; ///< 1 to normalise raw energy, 0 if disabled, corresponds to ENORMALISE in HTK
int raw_e; ///< 1 to use raw energy, 0 if disabled, corresponds to RAWENERGY in HTK
int zmeanframe; ///< 1 to apply zero mean per frame, like ZMEANSOURCE in HTK
int usepower; ///< 1 to use power instead of magnitude in filterbank analysis
float vtln_alpha; ///< Warping factor for VTLN, corresponds to WARPFREQ in HTK
float vtln_upper; ///< High frequency cut-off for VTLN, corresponds to WARPUCUTOFF in HTK
float vtln_lower; ///< Low frequency cut-off for VTLN, corresponds to WARPLCUTOFF in HTK
/* The items below do not need to be embedded, because they can be
determined from the acoustic model header, or should be computed
from run-time variables. */
int delta; ///< 1 if delta coefficients need to be computed
int acc; ///< 1 if acceleration coefficients need to be computed
int energy; ///< 1 if the energy coefficient needs to be computed
int c0; ///< 1 to use the 0'th cepstral parameter, 0 if disabled, corresponds to _0 qualifier in HTK
int absesup; ///< 1 if absolute energy should be suppressed
int cmn; ///< 1 to use Cepstrum Mean Normalization, 0 if disabled, corresponds to _Z qualifier in HTK
int cvn; ///< 1 to use cepstral variance normalization, else 0
int mfcc_dim; ///< Number of MFCC dimensions
int baselen; ///< Number of base MFCC dimensions including energies
int vecbuflen; ///< Vector length needed for computation
int veclen; ///< Resulting length of vector
int loaded; ///< 1 if these parameters were loaded from an HTK config file or binhmm header
}Value;
/// Workspace for filterbank analysis.
/// The array ranges below are the ones given by the original author
/// (1-based notation); the allocation itself is not visible in this header.
typedef struct {
int fftN; ///< Number of FFT points
int n; ///< log2(fftN)
int klo; ///< FFT index of the lopass cut-off
int khi; ///< FFT index of the hipass cut-off
float fres; ///< Scaled FFT resolution
float *cf; ///< Array[1..pOrder+1] of centre frequencies
short *loChan; ///< Array[1..fftN/2] of loChan index
float *loWt; ///< Array[1..fftN/2] of loChan weighting
float *Re; ///< Array[1..fftN] of FFT channels (real part)
float *Im; ///< Array[1..fftN] of FFT channels (imaginary part)
} FBankInfo;
/// Cycle buffer for delta computation
typedef struct {
float **mfcc; ///< MFCC buffer
int veclen; ///< Vector length of above
float *vec; ///< Points to the current MFCC
int win; ///< Delta window length
int len; ///< Length of the buffer (= win*2+1)
int store; ///< Next storing point in the buffer
boolean *is_on; ///< TRUE if data filled (NOTE(review): presumably one flag per buffer slot -- confirm in wav2mfcc-pipe.c)
int B; ///< B coefficient for delta computation
} DeltaBuf;
/// Work area for MFCC computation.
/// Note that the table fields exist only when MFCC_SINCOS_TABLE is defined,
/// so the layout of this structure depends on that macro.
typedef struct {
float *bf; ///< Local buffer to hold windowed waveform
double *fbank; ///< Local buffer to hold filterbank
FBankInfo fb; ///< Local buffer to hold filterbank information
int bflen; ///< Length of above (NOTE(review): presumably the length of @a bf rather than @a fb -- confirm)
#ifdef MFCC_SINCOS_TABLE
double *costbl_hamming; ///< Cos table for hamming window
int costbl_hamming_len; ///< Length of above
/* cos/-sin table for FFT */
double *costbl_fft; ///< Cos table for FFT
double *sintbl_fft; ///< Sin table for FFT
int tbllen; ///< Length of above
/* cos table for MakeMFCC */
double *costbl_makemfcc; ///< Cos table for DCT
int costbl_makemfcc_len; ///< Length of above
/* sin table for WeightCepstrum */
double *sintbl_wcep; ///< Sin table for cepstrum weighting
int sintbl_wcep_len; ///< Length of above
#endif /* MFCC_SINCOS_TABLE */
float sqrt2var; ///< Work area that holds value of sqrt(2.0) / fbank_num
float *ssbuf; ///< Pointer to noise spectrum for spectral subtraction (SS)
int ssbuflen; ///< Length of @a ssbuf
float ss_floor; ///< Flooring value for SS
float ss_alpha; ///< Alpha scaling value for SS
} MFCCWork;
/**
* Structure to hold the per-sentence sum of MFCC for realtime CMN.
*
*/
typedef struct {
float *mfcc_sum; ///< Sum of MFCC parameters
float *mfcc_var; ///< Variance sum of MFCC parameters
int framenum; ///< Number of frames summed
} CMEAN;
/**
* Work area for real-time CMN (cepstral mean normalization),
* also holding the state for CVN (cepstral variance normalization).
*
*/
typedef struct {
CMEAN *clist; ///< List of MFCC sums for previous inputs
int clist_max; ///< Allocated number of CMEAN in clist
int clist_num; ///< Currently filled number of CMEAN in clist
float cweight; ///< Weight of initial cepstral mean
float *cmean_init; ///< Initial cepstral mean for each input
float *cvar_init; ///< Initial cepstral standard deviation for each input
int mfcc_dim; ///< Base MFCC dimension (to apply CMN)
int veclen; ///< Full MFCC vector length
boolean mean; ///< TRUE if CMN is enabled
boolean var; ///< TRUE if CVN is enabled
boolean cmean_init_set; ///< TRUE if cmean_init (and cvar_init) was set
CMEAN now; ///< Work area to hold current cepstral mean
} CMNWork;
/**
* Work area for energy normalization on real-time input.
*
*/
typedef struct {
LOGPROB max_last; ///< Maximum energy value of the last input
LOGPROB min_last; ///< Minimum floored energy value of the last input
LOGPROB max; ///< Maximum energy value of the current input
} ENERGYWork;
#ifdef __cplusplus
extern "C" {
#endif
/**** mfcc-core.c ****/
/* MFCC work area handling.
   NOTE(review): judging from the names, these create a work area, compute
   MFCC using it, and release it -- confirm the exact contract (ownership,
   buffer sizes) in mfcc-core.c. */
MFCCWork *WMP_work_new(Value *para);
void WMP_calc(MFCCWork *w, float *mfcc, Value *para);
void WMP_free(MFCCWork *w);
/* Get filterbank information (returns a boolean status) */
boolean InitFBank(MFCCWork *w, Value *para);
/* Counterpart of InitFBank for an FBankInfo (presumably releases it -- confirm) */
void FreeFBank(FBankInfo *fb);
/* Apply hamming window */
void Hamming (float *wave, int framesize, MFCCWork *w);
/* Apply pre-emphasis filter */
void PreEmphasise (float *wave, int framesize, float preEmph);
/* Return mel-frequency */
float Mel(int k, float fres);
/* Apply FFT */
void FFT(float *xRe, float *xIm, int p, MFCCWork *w);
/* Convert wave -> mel-frequency filterbank */
void MakeFBank(float *wave, MFCCWork *w, Value *para);
/* Apply the DCT to filterbank */
void MakeMFCC(float *mfcc, Value *para, MFCCWork *w);
/* Calculate 0'th cepstral parameter */
float CalcC0(MFCCWork *w, Value *para);
/* Calculate log raw energy */
float CalcLogRawE(float *wave, int framesize);
/* Zero mean source by frame */
void ZMeanFrame(float *wave, int framesize);
/* Re-scale cepstral coefficients */
void WeightCepstrum (float *mfcc, Value *para, MFCCWork *w);
/**** wav2mfcc-buffer.c ****/
/* Batch (whole-utterance) versions of the feature extraction steps. */
/* Convert wave -> MFCC_E_D_(Z) (batch) */
int Wav2MFCC(SP16 *wave, float **mfcc, Value *para, int nSamples, MFCCWork *w);
/* Calculate delta coefficients (batch) */
void Delta(float **c, int frame, Value *para);
/* Calculate acceleration coefficients (batch) */
void Accel(float **c, int frame, Value *para);
/* Normalise log energy (batch) */
void NormaliseLogE(float **c, int frame_num, Value *para);
/* Cepstrum Mean Normalization (batch) */
void CMN(float **mfcc, int frame_num, int dim);
/* NOTE(review): presumably mean and variance normalization (batch) -- confirm in wav2mfcc-buffer.c */
void MVN(float **mfcc, int frame_num, Value *para);
/**** wav2mfcc-pipe.c ****/
/* Cycle buffer (DeltaBuf) handling for delta computation.
   NOTE(review): the meaning of the boolean return values is not visible in
   this header -- confirm in wav2mfcc-pipe.c. */
DeltaBuf *WMP_deltabuf_new(int veclen, int windowlen);
void WMP_deltabuf_free(DeltaBuf *db);
void WMP_deltabuf_prepare(DeltaBuf *db);
boolean WMP_deltabuf_proceed(DeltaBuf *db, float *new_mfcc);
boolean WMP_deltabuf_flush(DeltaBuf *db);
/* Real-time CMN using a CMNWork work area */
CMNWork *CMN_realtime_new(Value *para, float weight);
void CMN_realtime_free(CMNWork *c);
void CMN_realtime_prepare(CMNWork *c);
void CMN_realtime(CMNWork *c, float *mfcc);
void CMN_realtime_update(CMNWork *c, HTK_Param *param);
/* Load / save CMN data from / to a file; both return a boolean status */
boolean CMN_load_from_file(CMNWork *c, char *filename);
boolean CMN_save_to_file(CMNWork *c, char *filename);
/* Energy normalization on real-time input using an ENERGYWork work area */
void energy_max_init(ENERGYWork *energy);
void energy_max_prepare(ENERGYWork *energy, Value *para);
LOGPROB energy_max_normalize(ENERGYWork *energy, LOGPROB f, Value *para);
/**** ss.c ****/
/* Spectral subtraction (SS).
   Both functions return a float buffer and report a length through slen.
   NOTE(review): the "new_" prefix suggests the returned buffer is newly
   allocated and owned by the caller -- confirm in ss.c. */
float *new_SS_load_from_file(char *filename, int *slen);
float *new_SS_calculate(SP16 *wave, int wavelen, int *slen, MFCCWork *w, Value *para);
/**** para.c *****/
/* Handling of the Value parameter set.
   NOTE(review): the one-line descriptions below are inferred from the
   function names and signatures only -- confirm details in para.c. */
/* Presumably resets all parameters to an "undefined" state */
void undef_para(Value *para);
/* Presumably fills in the Julius defaults (see the DEF_* macros) */
void make_default_para(Value *para);
/* Presumably fills in the HTK defaults */
void make_default_para_htk(Value *para);
/* Presumably applies the values of src onto dst */
void apply_para(Value *dst, Value *src);
/* Parse an HTK config file into para; returns a boolean status */
boolean htk_config_file_parse(char *HTKconffile, Value *para);
/* Presumably derives the remaining parameters from an acoustic model header (parameter type and vector size) */
void calc_para_from_header(Value *para, short param_type, short vec_size);
/* Presumably outputs the parameters to the stream fp */
void put_para(FILE *fp, Value *para);
#ifdef __cplusplus
}
#endif
#endif /* __MFCC_H__ */