/** * @file wav2mfcc.c * * * @brief 特徴量ベクトル(MFCC)系列の算出(非実時間版) * * 入力された音声波形から,特徴ベクトル系列を抽出します. * Julius/Julianで抽出できる特徴ベクトルは,MFCC の任意次元数のもので, * _0, _E, _D, _A, _Z, _N の任意の組合わせをサポートします. * そのほか,窓長やフレームシフト,帯域カットなどのパラメータを指定できます. * 認識時には,音響モデルのヘッダとチェックが行われ,CMNの有無など * が決定されます. * * ここの関数は,バッファ上に蓄積された音声波形データを一度に * 特徴ベクトル系列に変換するもので,ファイル入力などに用いられます. * マイク入力などで,入力と平行に認識を行う場合は,ここの関数ではなく, * realtime-1stpass.c 内で行われます. * * * * @brief Calculate feature vector (MFCC) sequence (non on-the-fly ver.) * * Parameter vector sequence extraction of input speech is done * here. The supported parameter is MFCC, with any combination of * all the qualifiers in HTK: _0, _E, _D, _A, _Z, _N. Acoustic model * for recognition should be trained with the same parameter type. * You can specify other parameters such as window size, frame shift, * high/low frequency cut-off via runtime options. At startup, Julius * will check for the parameter types of acoustic model if it conforms * the limitation, and determine whether other additional processing * is needed such as Cepstral Mean Normalization. * * Functions below are used to convert fully buffered whole sentence * utterance, and typically used for audio file input. When input * is concurrently processed with recognition process at 1st pass, * in case of microphone input, the MFCC computation will be done * within functions in realtime-1stpass.c instead of these. * * * @author Akinobu Lee * @date Sun Sep 18 19:40:34 2005 * * $Revision: 1.4 $ * */ /* * Copyright (c) 1991-2012 Kawahara Lab., Kyoto University * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology * Copyright (c) 2005-2012 Julius project team, Nagoya Institute of Technology * All rights reserved */ #include #include /** * * 音声波形データから MFCC パラメータを抽出する. * エンジンインスタンス内の MFCC 計算インスタンスごとにパラメータ抽出が * 行われ,それぞれの mfcc->param に格納される. * * @param speech [in] 音声波形データ * @param speechlen [in] @a speech の長さ(単位:サンプル数) * @param recog [in] エンジンインスタンス * * @return 成功時 TRUE, エラー時 FALSE を返す. * * * Extract MFCC parameters with sentence CMN from given waveform. * Parameters will be computed for each MFCC calculation instance * in the engine instance, and stored in mfcc->param for each. * * @param speech [in] buffer of speech waveform * @param speechlen [in] length of @a speech in samples * @param recog [in] engine instance * * @return TRUE on success, FALSE on error. * * * @callgraph * @callergraph */ boolean wav2mfcc(SP16 speech[], int speechlen, Recog *recog) { int framenum; int len; Value *para; MFCCCalc *mfcc; /* calculate frame length from speech length, frame size and frame shift */ framenum = (int)((speechlen - recog->jconf->input.framesize) / recog->jconf->input.frameshift) + 1; if (framenum < 1) { jlog("WARNING: input too short (%d samples), ignored\n", speechlen); return FALSE; } for(mfcc=recog->mfcclist;mfcc;mfcc=mfcc->next) { if (mfcc->frontend.ssload_filename) { /* setup for spectral subtraction using file */ if (mfcc->frontend.ssbuf == NULL) { /* load noise spectrum for spectral subtraction from file (once) */ if ((mfcc->frontend.ssbuf = new_SS_load_from_file(mfcc->frontend.ssload_filename, &(mfcc->frontend.sslen))) == NULL) { jlog("ERROR: wav2mfcc: failed to read noise spectrum from file \"%s\"\n", mfcc->frontend.ssload_filename); return FALSE; } } } if (mfcc->frontend.sscalc) { /* compute noise spectrum from head silence for each input */ len = mfcc->frontend.sscalc_len * recog->jconf->input.sfreq / 1000; if (len > speechlen) len = speechlen; #ifdef SSDEBUG jlog("DEBUG: [%d]\n", len); #endif mfcc->frontend.ssbuf = new_SS_calculate(speech, len, &(mfcc->frontend.sslen), mfcc->frontend.mfccwrk_ss, mfcc->para); } } /* compute mfcc from speech file for each mfcc instances */ for(mfcc=recog->mfcclist;mfcc;mfcc=mfcc->next) { para = mfcc->para; /* malloc new param */ param_init_content(mfcc->param); if (param_alloc(mfcc->param, framenum, para->veclen) == FALSE) { jlog("ERROR: failed to allocate memory for converted parameter vectors\n"); return FALSE; } if (mfcc->frontend.ssload_filename || mfcc->frontend.sscalc) { /* make link from mfccs to this buffer */ mfcc->wrk->ssbuf = mfcc->frontend.ssbuf; mfcc->wrk->ssbuflen = mfcc->frontend.sslen; mfcc->wrk->ss_alpha = mfcc->frontend.ss_alpha; mfcc->wrk->ss_floor = mfcc->frontend.ss_floor; } /* make MFCC from speech data */ if (Wav2MFCC(speech, mfcc->param->parvec, para, speechlen, mfcc->wrk) == FALSE) { jlog("ERROR: failed to compute MFCC from input speech\n"); if (mfcc->frontend.sscalc) { free(mfcc->frontend.ssbuf); mfcc->frontend.ssbuf = NULL; } return FALSE; } /* set miscellaneous parameters */ mfcc->param->header.samplenum = framenum; mfcc->param->header.wshift = para->smp_period * para->frameshift; mfcc->param->header.sampsize = para->veclen * sizeof(VECT); /* not compressed */ mfcc->param->header.samptype = F_MFCC; if (para->delta) mfcc->param->header.samptype |= F_DELTA; if (para->acc) mfcc->param->header.samptype |= F_ACCL; if (para->energy) mfcc->param->header.samptype |= F_ENERGY; if (para->c0) mfcc->param->header.samptype |= F_ZEROTH; if (para->absesup) mfcc->param->header.samptype |= F_ENERGY_SUP; if (para->cmn) mfcc->param->header.samptype |= F_CEPNORM; mfcc->param->veclen = para->veclen; mfcc->param->samplenum = framenum; if (mfcc->frontend.sscalc) { free(mfcc->frontend.ssbuf); mfcc->frontend.ssbuf = NULL; } } return TRUE; } /* end of file */