// featbin/process-pitch-feats.cc // Copyright 2013 Bagher BabaAli // Johns Hopkins University (author: Daniel Povey) // // See ../../COPYING for clarification regarding multiple authors // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, // MERCHANTABLITY OR NON-INFRINGEMENT. // See the Apache 2 License for the specific language governing permissions and // limitations under the License. #include "base/kaldi-common.h" #include "util/common-utils.h" namespace kaldi { // For the probability-of-voicing features (the first element of // each row), change p -> log ((p + 0.0001) / (1.00001 - p)) // This makes it more Gaussian; otherwise it clumps up near // the edges of the range [0, 1]. void ProcessPovFeatures(Matrix *mat) { int32 num_frames = mat->NumRows(); for (int32 i = 0; i < num_frames; i++) { BaseFloat p = (*mat)(i, 0); KALDI_ASSERT(p >= 0.0 && p <= 1.0); (*mat)(i, 0) = log((p + 0.0001) / (1.0001 - p)); } } void TakeLogOfPitch(Matrix *mat) { int32 num_frames = mat->NumRows(); for (int32 i = 0; i < num_frames; i++) { KALDI_ASSERT((*mat)(i, 1) > 0.0); (*mat)(i, 1) = log((*mat)(i, 1)); } } // Subtract the moving average over a largish window // (e.g. 151 frames) void SubtractMovingAverage(int32 normalization_window_size, Matrix *mat) { int32 num_frames = mat->NumRows(); Vector temp_pitch(num_frames); Matrix &features = *mat; int32 i; for (i = 0; i < num_frames; i++) temp_pitch(i) = features(i, 1); // Moving Window Normalization BaseFloat mean = 0.0; int32 mid_win = (normalization_window_size - 1) / 2; for (i = 0; (i < num_frames) && (i < normalization_window_size); i++) { mean += features(i, 1); } mean /= i; if (num_frames <= normalization_window_size) { for (i = 0; i < num_frames; i++) { features(i, 1) -= mean; } } else { for (i = 0; i <= mid_win; i++) { features(i, 1) -= mean; } for (i = (mid_win + 1); i < num_frames; i++) { if (i + (mid_win + 1) < num_frames) mean -= (temp_pitch(i - (mid_win + 1)) - temp_pitch(i + (mid_win + 1))) / normalization_window_size; features(i,1) -= mean; } } } // Set to the moving average over a small window, e.g. 5 frames. void SetToMovingAverage(int32 average_window_size, Matrix *mat) { int32 num_frames = mat->NumRows(); Matrix &features = *mat; Vector temp_pitch(num_frames); int32 width = (average_window_size - 1) / 2, i; // e.g. if average_window_size is 5, width will equal 2. for (i = width; i < num_frames - width ; i++) { temp_pitch(i) = features(i, 1); for(int j = 1; j <= width; ++j) { temp_pitch(i) += (features(i - j, 1) + features(i + j, 1)); } temp_pitch(i) /= (2 * width + 1); } for (i = width; i < num_frames - width; i++) features(i, 1) = temp_pitch(i); } } // namespace kaldi int main(int argc, char *argv[]) { try { using namespace kaldi; const char *usage = "This is a rather special-purpose program which processes 2-dimensional\n" "features consisting of (prob-of-voicing, pitch) into something suitable\n" "to put into a speech recognizer. First use interpolate-feats\n" "Usage: process-pitch-feats [options...] \n"; // construct all the global objects ParseOptions po(usage); int32 normalization_window_size = 151; // should be odd number int32 average_window_size = 5; // Register the options po.Register("normalization-window-size", &normalization_window_size, "Size of window used for " "moving window nomalization (must be odd)."); po.Register("average-window-size", &average_window_size, "Size of moving average window (must be odd)."); // parse options (+filling the registered variables) po.Read(argc, argv); if (po.NumArgs() != 2) { po.PrintUsage(); exit(1); } KALDI_ASSERT(average_window_size > 0 && average_window_size % 2 == 1 && "--average-window-size option must be an odd positive number."); KALDI_ASSERT(normalization_window_size > 0 && normalization_window_size % 2 == 1 && "--normalization-window-size option must be an odd positive number."); std::string input_rspecifier = po.GetArg(1); std::string output_wspecifier = po.GetArg(2); SequentialBaseFloatMatrixReader reader(input_rspecifier); BaseFloatMatrixWriter kaldi_writer; // typedef to TableWriter. if (!kaldi_writer.Open(output_wspecifier)) KALDI_ERR << "Could not initialize output with wspecifier " << output_wspecifier; int32 num_done = 0, num_err = 0; for (; !reader.Done(); reader.Next()) { std::string utt = reader.Key(); Matrix features = reader.Value(); int num_frames = features.NumRows(); if (num_frames == 0 && features.NumCols() != 2) { KALDI_WARN << "Feature file has bad size " << features.NumRows() << " by " << features.NumCols(); num_err++; continue; } ProcessPovFeatures(&features); TakeLogOfPitch(&features); SubtractMovingAverage(normalization_window_size, &features); SetToMovingAverage(average_window_size, &features); kaldi_writer.Write(utt, features); num_done++; if (num_done % 10 == 0) KALDI_LOG << "Processed " << num_done << " utterances"; KALDI_VLOG(2) << "Processed features for key " << utt; } KALDI_LOG << "Done " << num_done << " utterances, " << num_err << " with errors."; return (num_done != 0 ? 0 : 1); } catch(const std::exception &e) { std::cerr << e.what(); return -1; } }