/** * @file wav2mfcc-buffer.c * * * @brief 音声波形から MFCC 特徴量へ変換する(発話単位) * * ここでは音声波形全体を単位として MFCC ベクトル系列へ変換する関数が定義 * されています.フレーム単位で抽出を行う関数は wav2mfcc-pipe.c に * 記述されています * * ここで抽出できるのは MFCC[_0][_E][_D][_A][_Z] の形式です. * * * * @brief Convert speech inputs into MFCC parameter vectors (per utterance) * * This file contains functions to convert the whole speech input * to MFCC vector array. The frame-wise MFCC computation needed for * real-time recognition is defined in wav2mfcc-pipe.c. * * The supported format is MFCC[_0][_E][_D][_A][_Z]. * * * @author Akinobu LEE * @date Thu Feb 17 17:43:35 2005 * * $Revision: 1.6 $ * */ /************************************************************************/ /* wav2mfcc.c Convert Speech file to MFCC_E_D_(Z) file */ /*----------------------------------------------------------------------*/ /* Author : Yuichiro Nakano */ /* */ /* Copyright(C) Yuichiro Nakano 1996-1998 */ /*----------------------------------------------------------------------*/ /************************************************************************/ #include #include /** * Convert wave data to MFCC. Also does spectral subtraction * if @a ssbuf specified. * * @param wave [in] waveform data * @param mfcc [out] buffer to store the resulting MFCC parameter vector [t][0..veclen-1], should be already allocated * @param para [in] configuration parameters * @param nSamples [in] length of waveform data * @param w [i/o] MFCC calculation work area * * @return the number of processed frames. */ int Wav2MFCC(SP16 *wave, float **mfcc, Value *para, int nSamples, MFCCWork *w) { int i, k, t; int end = 0, start = 1; int frame_num; /* Number of samples in output file */ /* set noise spectrum if any */ if (w->ssbuf != NULL) { /* check ssbuf length */ if (w->ssbuflen != w->bflen) { jlog("Error: mfcc-core: noise spectrum length not match\n"); return FALSE; } } frame_num = (int)((nSamples - para->framesize) / para->frameshift) + 1; for(t = 0; t < frame_num; t++){ if(end != 0) start = end - (para->framesize - para->frameshift) - 1; k = 1; for(i = start; i <= start + para->framesize; i++){ w->bf[k] = (float)wave[i - 1]; k++; } end = i; /* Calculate base MFCC coefficients */ WMP_calc(w, mfcc[t], para); } /* Normalise Log Energy */ if (para->energy && para->enormal) NormaliseLogE(mfcc, frame_num, para); /* Delta (consider energy suppress) */ if (para->delta) Delta(mfcc, frame_num, para); /* Acceleration */ if (para->acc) Accel(mfcc, frame_num, para); /* Cepstrum Mean and/or Variance Normalization */ if (para->cmn && ! para->cvn) CMN(mfcc, frame_num, para->mfcc_dim + (para->c0 ? 1 : 0)); else if (para->cmn || para->cvn) MVN(mfcc, frame_num, para); return(frame_num); } /** * Normalise log energy * * @param mfcc [i/o] array of MFCC vectors * @param frame_num [in] number of frames * @param para [in] configuration parameters */ void NormaliseLogE(float **mfcc, int frame_num, Value *para) { float max, min, f; int t; int l; l = para->mfcc_dim; if (para->c0) l++; /* find max log energy */ max = mfcc[0][l]; for(t = 0; t < frame_num; t++) if(mfcc[t][l] > max) max = mfcc[t][l]; /* set the silence floor */ min = max - (para->silFloor * LOG_TEN) / 10.0; /* normalise */ for(t = 0; t < frame_num; t++){ f = mfcc[t][l]; if (f < min) f = min; mfcc[t][l] = 1.0 - (max - f) * para->escale; } } /** * Calculate delta coefficients * * @param c [i/o] MFCC vectors, in which the delta coeff. will be appended. * @param frame [in] number of frames * @param para [in] configuration parameters */ void Delta(float **c, int frame, Value *para) { int theta, t, n, B = 0; float A1, A2, sum; for(theta = 1; theta <= para->delWin; theta++) B += theta * theta; for(n = para->baselen - 1; n >=0; n--){ for(t = 0; t < frame; t++){ sum = 0; for(theta = 1; theta <= para->delWin; theta++){ /* Replicate the first or last vector */ /* at the beginning and end of speech */ if (t - theta < 0) A1 = c[0][n]; else A1 = c[t - theta][n]; if (t + theta >= frame) A2 = c[frame - 1][n]; else A2 = c[t + theta][n]; sum += theta * (A2 - A1); } sum /= (2.0 * B); if (para->absesup) { c[t][para->baselen + n - 1] = sum; } else { c[t][para->baselen + n] = sum; } } } } /** * Calculate acceleration coefficients. * * @param c [i/o] MFCC vectors, in which the delta coeff. will be appended. * @param frame [in] number of frames * @param para [in] configuration parameters */ void Accel(float **c, int frame, Value *para) { int theta, t, n, B = 0; int src, dst; float A1, A2, sum; for(theta = 1; theta <= para->accWin; theta++) B += theta * theta; for(t = 0; t < frame; t++){ src = para->baselen * 2 - 1; if (para->absesup) src--; dst = src + para->baselen; for(n = 0; n < para->baselen; n++){ sum = 0; for(theta = 1; theta <= para->accWin; theta++){ /* Replicate the first or last vector */ /* at the beginning and end of speech */ if (t - theta < 0) A1 = c[0][src]; else A1 = c[t - theta][src]; if (t + theta >= frame) A2 = c[frame - 1][src]; else A2 = c[t + theta][src]; sum += theta * (A2 - A1); } c[t][dst] = sum / (2 * B); src--; dst--; } } } /** * Cepstrum Mean Normalization (buffered) * Cepstral mean will be computed within the given MFCC vectors. * * @param mfcc [i/o] array of MFCC vectors * @param frame_num [in] number of frames * @param dim [in] total dimension of MFCC vectors */ void CMN(float **mfcc, int frame_num, int dim) { int i, t; float *mfcc_ave, *sum; mfcc_ave = (float *)mycalloc(dim, sizeof(float)); sum = (float *)mycalloc(dim, sizeof(float)); for(i = 0; i < dim; i++){ sum[i] = 0.0; for(t = 0; t < frame_num; t++) sum[i] += mfcc[t][i]; mfcc_ave[i] = sum[i] / frame_num; } for(t = 0; t < frame_num; t++){ for(i = 0; i < dim; i++) mfcc[t][i] = mfcc[t][i] - mfcc_ave[i]; } free(sum); free(mfcc_ave); } /** * Cepstrum Mean/Variance Normalization (buffered) * * @param mfcc [i/o] array of MFCC vectors * @param frame_num [in] number of frames * @param para [in] configuration parameters */ void MVN(float **mfcc, int frame_num, Value *para) { int i, t; float *mfcc_mean, *mfcc_sd; float x; int basedim; basedim = para->mfcc_dim + (para->c0 ? 1 : 0); mfcc_mean = (float *)mycalloc(para->veclen, sizeof(float)); if (para->cvn) mfcc_sd = (float *)mycalloc(para->veclen, sizeof(float)); /* get mean */ for(i = 0; i < para->veclen; i++){ mfcc_mean[i] = 0.0; for(t = 0; t < frame_num; t++) mfcc_mean[i] += mfcc[t][i]; mfcc_mean[i] /= (float)frame_num; } if (para->cvn) { /* get standard deviation */ for(i = 0; i < para->veclen; i++){ mfcc_sd[i] = 0.0; for(t = 0; t < frame_num; t++) { x = mfcc[t][i] - mfcc_mean[i]; mfcc_sd[i] += x * x; } mfcc_sd[i] = sqrt(mfcc_sd[i] / (float)frame_num); } } for(t = 0; t < frame_num; t++){ if (para->cmn) { /* mean normalization (base MFCC only) */ for(i = 0; i < basedim; i++) mfcc[t][i] -= mfcc_mean[i]; } if (para->cvn) { /* variance normalization (full MFCC) */ for(i = 0; i < para->veclen; i++) mfcc[t][i] /= mfcc_sd[i]; } } if (para->cvn) free(mfcc_sd); free(mfcc_mean); }