// featbin/interpolate-pitch.cc // Copyright 2013 Bagher BabaAli // Johns Hopkins University (author: Daniel Povey) // // See ../../COPYING for clarification regarding multiple authors // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, // MERCHANTABLITY OR NON-INFRINGEMENT. // See the Apache 2 License for the specific language governing permissions and // limitations under the License. #include "base/kaldi-common.h" #include "util/common-utils.h" namespace kaldi { struct PitchInterpolatorOptions { BaseFloat pitch_interval; // Discretization interval [affects efficiency] BaseFloat interpolator_factor; // This affects the tendency of the algorithm to // follow the observed pitch contours versus pick its own path which will tend // to be closer to a straight line. BaseFloat max_voicing_prob; // p(voicing) we use at the end of the range when it was observed // at one. (probably 0.9 is suitable; allows to not follow observed pitch even if p(voicing)=1. BaseFloat max_pitch_change_per_frame; PitchInterpolatorOptions(): pitch_interval(4.0), interpolator_factor(1.0e-05), max_voicing_prob(0.9), max_pitch_change_per_frame(10.0) { } void Register(OptionsItf *po) { po->Register("pitch-interval", &pitch_interval, "Frequency interval in Hz, used " "for the pitch interpolation and smoothing algorithm."); po->Register("interpolator-factor", &interpolator_factor, "Factor affecting the " "interpolation algorithm; setting it closer to zero will cause " "it to follow the measured pitch more faithfully but less " "smoothly"); po->Register("max-voicing-prob", &max_voicing_prob, "Probability of voicing the " "algorithm uses as the observed p(voicing) approaches 1; having " "value <1 allows it to interpolate even if p(voicing) = 1"); po->Register("max-pitch-change-per-frame", &max_pitch_change_per_frame, "This value should be set large enough to no longer affect the " "results, but the larger it is the slower the algorithm will be."); } void Check() const { KALDI_ASSERT(pitch_interval > 0.0 && pitch_interval < 20.0 && interpolator_factor > 0.0 && interpolator_factor < 1.0 && max_voicing_prob <= 1.0 && max_voicing_prob >= 0.5 && max_pitch_change_per_frame > 2.0 * pitch_interval); } }; struct PitchInterpolatorStats { int64 num_frames_tot; int64 num_frames_zero; // #frames that were zero in original pitch. int64 num_frames_changed; // #frames that were not zero originally, but // which the algorithm changed. PitchInterpolatorStats(): num_frames_tot(0), num_frames_zero(0), num_frames_changed(0) { } void Print() { BaseFloat zero_percent = num_frames_zero * 100.0 / num_frames_tot, changed_percent = num_frames_changed * 100.0 / num_frames_tot; KALDI_LOG << "Over " << num_frames_tot << " frames, " << zero_percent << "% were zero at input, and " << changed_percent << "% were not zero but were changed."; } }; class PitchInterpolator { public: PitchInterpolator(const PitchInterpolatorOptions &opts, Matrix *mat, PitchInterpolatorStats *stats): opts_(opts) { opts.Check(); InitValues(*mat); Forward(); Backtrace(mat, stats); } private: void InitValues(const Matrix &mat) { BaseFloat pitch_interval = opts_.pitch_interval; num_frames_ = mat.NumRows(); KALDI_ASSERT(mat.NumCols() == 2); BaseFloat min_pitch = 1.0e+10, max_pitch = 0.0; pitch_.resize(num_frames_); p_voicing_.resize(num_frames_); for (int32 f = 0; f < num_frames_; f++) { BaseFloat p_voicing = mat(f, 0), pitch = mat(f, 1); p_voicing *= opts_.max_voicing_prob; if (pitch == 0.0) { p_voicing = 0.0; // complete uncertainty about real pitch. } else { if (pitch < min_pitch) min_pitch = pitch; if (pitch > max_pitch) max_pitch = pitch; } p_voicing_[f] = p_voicing; } if (max_pitch == 0.0) { // No voiced frames at all. min_pitch = 100.0; max_pitch = 100.0; } if (max_pitch <= min_pitch + (2.0 * pitch_interval)) { max_pitch = min_pitch + 2.0 * pitch_interval; } // avoid crashes. // Note: the + 2 here is for edge effects. num_pitches_ = floor((max_pitch - min_pitch) / pitch_interval + 0.5) + 2; KALDI_ASSERT(num_pitches_ >= 3); min_pitch_.resize(num_frames_); for (int32 f = 0; f < num_frames_; f++) { min_pitch_[f] = min_pitch - pitch_interval * RandUniform(); // bottom of // discretization range for each frame is randomly different. BaseFloat pitch = mat(f, 1); if (pitch == 0.0) { pitch_[f] = 0; // This will actually be a don't-care value; we just put in // some value that won't crash the algorithm. } else { int32 int_pitch = floor((pitch - min_pitch_[f]) / pitch_interval + 0.5); KALDI_ASSERT(int_pitch >= 0 && int_pitch < num_pitches_); pitch_[f] = int_pitch; } } } void MultiplyObsProb(int32 t) { // For the forward computation: // Multiplies the observation probabilities into alpha at time t. // constant_prob is the constant part that does not depend on the pitch value: BaseFloat constant_prob = (1.0 - p_voicing_[t]) * opts_.interpolator_factor, specified_prob = p_voicing_[t] + constant_prob; // specified_prob adds in the extra probability mass at the observed pitch value. BaseFloat log_constant_prob = log(constant_prob), log_ratio = log(specified_prob / constant_prob); log_alpha_.Add(log_constant_prob); // add log_constant_prob to all pitches at this time. log_alpha_(pitch_[t]) += log_ratio; // corrects this to be like adding // log(specified_prob) to the observed pitch at this time. Note: if pitch_[t] == 0, // this won't have any effect because log_ratio will be zero too. Vector temp_rand(num_pitches_); temp_rand.SetRandn(); // Set to Gaussian noise. Type of noise doesn't really matter. log_alpha_.AddVec(0.01, temp_rand); // We add a small amount of noise to the // observation probabilities; this has the effect of breaking symmetries in // a more random way to overcome certain weirdnesses that could otherwise // happen due to the discretization. } // This function updates log_alpha_, as a function of prev_log_alpha_; it also // updates back_pointers_[t]; void ComputeTransitionProb(int32 t) { KALDI_ASSERT(t > 0); BaseFloat pitch_interval = opts_.pitch_interval; back_pointers_[t].resize(num_pitches_); // Transition probability between pitch p and p' on times t-1 and t // is (p - p')^2, with the pitch measured in Hz. We're doing Viterbi, // so always pick the max over the previous frame's t. KALDI_ASSERT(t > 0 && t < num_frames_); int32 K = floor(opts_.max_pitch_change_per_frame / pitch_interval + 0.5); // K is max #bins we can move; a kind of pruning, for speed. for (int32 p = 0; p < num_pitches_; p++) { int32 min_prev_p = p - K, max_prev_p = p + K; if (min_prev_p < 0) min_prev_p = 0; if (max_prev_p >= num_pitches_) max_prev_p = num_pitches_ - 1; BaseFloat best_logprob = -1.0e+10; int32 best_prev_p = -1; for (int32 prev_p = min_prev_p; prev_p <= max_prev_p; prev_p++) { BaseFloat delta_pitch = (min_pitch_[t-1] + prev_p * pitch_interval) - (min_pitch_[t] + p * pitch_interval); BaseFloat this_logprob = prev_log_alpha_(prev_p) - 0.5 * delta_pitch * delta_pitch; if (this_logprob > best_logprob) { best_logprob = this_logprob; best_prev_p = prev_p; } } back_pointers_[t][p] = best_prev_p; log_alpha_(p) = best_logprob; } } void Forward() { // Viterbi in a discrete model of the pitch, in which the observation // probability of a pitch is p(voicing) at the observed pitch, and // interpolator_factor_ * 1.0 - p(voicing) at all other pitches. the // transition log-probability is -0.5 times the squared difference in pitch. // [We measure this in Hz, not in integer values, to make it more invariant // to the discretization interval]. back_pointers_.resize(num_frames_); log_alpha_.Resize(num_pitches_); prev_log_alpha_.Resize(num_pitches_); log_alpha_.Set(0.0); MultiplyObsProb(0); for (int32 t = 1; t < num_frames_; t++) { log_alpha_.Swap(&prev_log_alpha_); ComputeTransitionProb(t); MultiplyObsProb(t); } } void Backtrace(Matrix *mat, PitchInterpolatorStats *stats) { const BaseFloat pitch_interval = opts_.pitch_interval; BaseFloat *p_begin = log_alpha_.Data(), *p_end = p_begin + num_pitches_, *p_best = std::max_element(p_begin, p_end); std::vector best_pitch(num_frames_); int32 best_p = p_best - p_begin; // best discrete pitch p at time T-1. for (int32 t = num_frames_ - 1; t >= 0; t--) { { // Update stats: stats->num_frames_tot++; if (pitch_[t] == 0) stats->num_frames_zero++; else if (best_p != pitch_[t]) stats->num_frames_changed++; } BaseFloat pitch = min_pitch_[t] + pitch_interval * best_p; (*mat)(t, 1) = pitch; KALDI_ASSERT(best_p >= 0 && best_p < num_pitches_); if (t > 0) best_p = back_pointers_[t][best_p]; } } const PitchInterpolatorOptions &opts_; std::vector min_pitch_; // Bottom of discretization range... // previously this was a BaseFloat, but for better pseudo-randomization we // have a slightly perturbed value for each frame now, so it's a vector. int32 num_frames_; // number of frames; int32 num_pitches_; // Number of discrete pitch intervals. std::vector pitch_; // observed pitch, discretized; [it's don't-care if algorithm had no // observation (0)] std::vector p_voicing_; // p(voicing) times max_voicing_prob_; or zero if // pitch was 0.0 for this frame. std::vector > back_pointers_; // at each t, points to best pitch // on time t-1. Vector log_alpha_; Vector prev_log_alpha_; }; // Linear Interpolation for places where the pitch value is zero void LinearlyInterpolatePitch(Matrix *mat) { int32 num_frames = mat->NumRows(); int i = 0; Matrix &features = *mat; while (i < num_frames) { if(features(i, 1) == 0.0) { int start = i - 1; int end = i; while( (features(end, 1)) == 0.0 && (end < num_frames)) end++; BaseFloat end_value = -1, start_value = -1; if (end < num_frames) end_value = features(end, 1); if (start > 0) start_value = features(start, 1); if (start_value < 0 && end_value < 0) { // the whole file is unvoiced -> just put an arbitrary value, // it will all be normalized out anyway. start_value = 1.0; end_value = 1.0; } // If we don't have a value for one end of the range, i.e. at the start or // end, set it to 0.9 times the pitch value that we have at the other end // of the range. The reason we don't set it to that value itself, is that // then over this segment we would have zero time-derivative, so if we // took time derivatives we would have an artificial spike at zero. if (start_value < 0.0) start_value = 0.9 * end_value; if (end_value < 0.0) end_value = 0.9 * start_value; for(int k = start + 1; k < end; k++) features(k, 1) = start_value + (end_value - start_value) / (end - start) * (k - start); i = end; } i++; } } } // namespace kaldi int main(int argc, char *argv[]) { try { using namespace kaldi; const char *usage = "This is a rather special-purpose program which processes 2-dimensional\n" "features consisting of (prob-of-voicing, pitch). By default we do model-based\n" "pitch smoothing and interpolation (see code), or if --linear-interpolation=true,\n" "just linear interpolation across gaps where pitch == 0 (not predicted).\n" "Usage: interpolate-pitch [options...]

\n"; // construct all the global objects ParseOptions po(usage); bool linear_interpolation = false; PitchInterpolatorOptions interpolate_opts; po.Register("linear-interpolation", &linear_interpolation, "If true, just do simple linear " "interpolation across gaps (else, model-based)"); interpolate_opts.Register(&po); // parse options (+filling the registered variables) po.Read(argc, argv); if (po.NumArgs() != 2) { po.PrintUsage(); exit(1); } std::string input_rspecifier = po.GetArg(1); std::string output_wspecifier = po.GetArg(2); SequentialBaseFloatMatrixReader reader(input_rspecifier); BaseFloatMatrixWriter kaldi_writer; // typedef to TableWriter. if (!kaldi_writer.Open(output_wspecifier)) KALDI_ERR << "Could not initialize output with wspecifier " << output_wspecifier; int32 num_done = 0, num_err = 0; PitchInterpolatorStats stats; for (; !reader.Done(); reader.Next()) { std::string utt = reader.Key(); Matrix features = reader.Value(); int num_frames = features.NumRows(); if (num_frames == 0 && features.NumCols() != 2) { KALDI_WARN << "Feature file has bad size " << features.NumRows() << " by " << features.NumCols(); num_err++; continue; } if (linear_interpolation) LinearlyInterpolatePitch(&features); else { // work happens in constructor of this class. PitchInterpolator pi(interpolate_opts, &features, &stats); } kaldi_writer.Write(utt, features); num_done++; if (num_done % 10 == 0) KALDI_LOG << "Processed " << num_done << " utterances"; KALDI_VLOG(2) << "Processed features for key " << utt; } if (!linear_interpolation) stats.Print(); KALDI_LOG << "Done " << num_done << " utterances, " << num_err << " with errors."; return (num_done != 0 ? 0 : 1); } catch(const std::exception &e) { std::cerr << e.what(); return -1; } }