// transform/exponential-transform.h

// Copyright 2009-2011  Microsoft Corporation

// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.

#ifndef KALDI_TRANSFORM_EXPONENTIAL_TRANSFORM_H_
#define KALDI_TRANSFORM_EXPONENTIAL_TRANSFORM_H_

#include "base/kaldi-common.h"
#include "matrix/matrix-lib.h"
#include "transform/fmllr-diag-gmm.h"

namespace kaldi {

// We define an exponential transform as a transform of the form
// W_s = D_s exp(t_s A) B, which takes x^+ -> x (where ^+ means appending a one);
// only t_s and D_s are speaker-specific.  t_s is roughly analogous to the log
// of the VTLN warp factor.
// D_s is either a diagonal or an offset-only fMLLR matrix (or just
// the "default" transform [ I ; 0 ]), depending on options.
// "exp" here is the matrix exponential, defined by
// exp(A) = I + A + (1/2!) A^2 + (1/3!) A^3 + ...
// (a toy evaluation of this series is sketched after the class below).
// Note that the last row of A is 0 0 0 ... 0 and the last row of B is
// 0 0 0 ... 0 1.  The "globally trained" quantities are A and B;
// we train A and B on separate iterations.

enum EtNormalizeType {
  kEtNormalizeOffset,
  kEtNormalizeDiag,
  kEtNormalizeNone
};

// Note: Revision 121 corresponds to the submitted version of the ASRU paper.
// There has been a correction to the update for A since then.

class ExponentialTransformAccsA;

// Class ExponentialTransform holds just the globally shared parts of the
// exponential transform, i.e. A_ and B_.
class ExponentialTransform {
 public:
  ExponentialTransform() { }  // typically use this constructor only prior to
  // calling Read().

  ExponentialTransform(int32 dim, EtNormalizeType norm_type, int32 seed = 0) {
    Init(dim, norm_type, seed);
  }

  void Init(int32 dim,
            EtNormalizeType norm_type,
            int32 seed = 0);  // Initializes A to a pseudo-random unit-norm matrix
  // (with last row zero), and B to unity.  "dim" is the feature dim, so both A
  // and B are of dimension dim+1.

  // SetNormalizeType sets the normalization type.  It only allows you to
  // increase the normalization type, i.e. None->Offset, None->Diag, or
  // Offset->Diag.
  void SetNormalizeType(EtNormalizeType norm_type);

  // ComputeTransform does not attempt to work out the objective-function change,
  // because of possible confusion about what the correct baseline should be.
  // You can use FmllrAuxFuncDiagGmm to measure the change.
  // (See the usage sketch following this class.)
  void ComputeTransform(const FmllrDiagGmmAccs &accs,
                        MatrixBase<BaseFloat> *Ws,  // output fMLLR transform; should be size dim x dim+1
                        BaseFloat *t,
                        MatrixBase<BaseFloat> *Ds,
                        BaseFloat *objf_impr = NULL,  // versus just B
                        BaseFloat *count = NULL);

  int32 Dim() const { return A_.NumRows() - 1; }  // returns feature dim.

  // Ds is the first term in
  // fmllr_mat = W_s = D_s exp(t_s A) B, which is a diagonal-only
  // fMLLR (or possibly just mean-offset or [ I ; 0 ], depending on whether
  // norm_type_ is {Diag, Offset, None}).

  void Write(std::ostream &os, bool binary) const;
  void Read(std::istream &is, bool binary);

  /// Returns B minus its last row, which is the closest thing to a "default
  /// transform" that we have.
  void GetDefaultTransform(Matrix<BaseFloat> *transform) const;

  void GetATransform(Matrix<BaseFloat> *transform) const {
    transform->CopyFromMat(A_);
  }

  /// Make B unit; this can be useful for combining the B part of the
  /// transform with MLLT.
  void MakeBUnit() { B_.SetUnit(); }

  void ComputeDs(const MatrixBase<BaseFloat> &Ws,
                 BaseFloat t,
                 MatrixBase<BaseFloat> *Ds) const;  // Computes the D_s matrix,
  // given W_s and the value of t.

  // Takes "Cpart", which is a d x d STC/MLLT matrix, and applies
  // it to the transform by doing A <-- C A C^{-1}, B <-- C B,
  // where C is Cpart extended with an extra row and column with values
  // equal to those in the unit matrix.
  void ApplyC(const MatrixBase<BaseFloat> &Cpart);

  friend class ExponentialTransformAccsA;
  friend class ExponentialTransformAccsANew;
 protected:
  Matrix<BaseFloat> A_;  // d+1 by d+1 matrix; last row 0 0 0 .. 0 0.
  Matrix<BaseFloat> B_;  // d+1 by d+1 matrix; last row 0 0 0 .. 0 1.
  EtNormalizeType norm_type_;  // tells us how to train D_s.
 private:
  static void ComposeAffineTransforms(const MatrixBase<BaseFloat> &A,
                                      const MatrixBase<BaseFloat> &B,
                                      MatrixBase<BaseFloat> *C);
};
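// The following is a minimal sketch (a hypothetical helper, not part of the
// Kaldi API) that makes the Taylor-series definition of the matrix exponential
// above concrete.  A production implementation would use scaling-and-squaring
// rather than a raw truncated series; this is illustration only.
inline void ExampleMatrixExponential(const MatrixBase<BaseFloat> &A,
                                     BaseFloat t,
                                     Matrix<BaseFloat> *exp_tA,
                                     int32 num_terms = 20) {
  int32 d = A.NumRows();
  KALDI_ASSERT(A.NumCols() == d);
  exp_tA->Resize(d, d);
  exp_tA->SetUnit();  // the k = 0 term: the identity.
  Matrix<BaseFloat> term(d, d), next(d, d);
  term.SetUnit();
  for (int32 k = 1; k < num_terms; k++) {
    // next <-- (t/k) term A, so "term" accumulates (t A)^k / k!
    next.AddMatMat(t / k, term, kNoTrans, A, kNoTrans, 0.0);
    term.CopyFromMat(next);
    exp_tA->AddMat(1.0, term);  // add the k'th term of the series.
  }
}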
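// A usage sketch for ComputeTransform (the wrapper function and its name are
// hypothetical; only the ExponentialTransform calls are declared above).
// "accs" is assumed to hold standard fMLLR statistics accumulated for one
// speaker on un-transformed features.
inline void ExampleComputeSpeakerTransform(ExponentialTransform *et,
                                           const FmllrDiagGmmAccs &accs,
                                           Matrix<BaseFloat> *Ws) {
  int32 dim = et->Dim();
  Ws->Resize(dim, dim + 1);  // W_s maps x^+ (dim+1) to x (dim).
  Matrix<BaseFloat> Ds(dim, dim + 1);
  BaseFloat t, objf_impr, count;
  // Fills in W_s = D_s exp(t_s A) B, the warp factor t_s, and D_s.
  et->ComputeTransform(accs, Ws, &t, &Ds, &objf_impr, &count);
  // To measure the improvement against a baseline of your choice, evaluate
  // FmllrAuxFuncDiagGmm (from transform/fmllr-diag-gmm.h) on Ws.
}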
struct ExponentialTransformUpdateAOptions {
  BaseFloat learning_rate;
  bool renormalize;  // renormalize A and recenter the warp factors on each
                     // iteration of the update.
  ExponentialTransformUpdateAOptions():
      learning_rate(1.0), renormalize(true) { }
  void Register(OptionsItf *po) {
    po->Register("learning-rate", &learning_rate,
                 "Learning rate for updating A (make <1 if instability suspected)\n");
    po->Register("renormalize", &renormalize,
                 "True if you want to renormalize the warp factors on each "
                 "iteration of update (recommended).");
  }
};

// This class does the accumulation and update for the "A" part of the
// global transform.
class ExponentialTransformAccsA {
 public:
  // AccumulateForSpeaker does the accumulation for one speaker,
  // given standard fMLLR accs that have been accumulated on the
  // un-transformed data.
  void AccumulateForSpeaker(const FmllrDiagGmmAccs &accs,
                            const ExponentialTransform &et,
                            const MatrixBase<BaseFloat> &Ds,
                            BaseFloat t);

  ExponentialTransformAccsA() { }  // typically use this constructor prior to Read().

  ExponentialTransformAccsA(int32 dim) { Init(dim); }

  void Init(int32 dim);

  void Write(std::ostream &os, bool binary) const;
  void Read(std::istream &is, bool binary, bool add = false);

  // Updates the matrix A (also changes B as a side effect).
  // (See the training-loop sketch at the end of this file.)
  void Update(const ExponentialTransformUpdateAOptions &opts,
              ExponentialTransform *et,
              BaseFloat *objf_impr,
              BaseFloat *count);

 private:
  double beta_;  // sum of speaker betas; for diagnostics.
  double beta_t_;  // sum of speaker betas times t; for the log-det term.
  std::vector<SpMatrix<double> > G_;  // Like the G stats of
  // fMLLR, taken after the B transform.  Summed over speakers and
  // weighted by t^2.

  Matrix<double> Ahat_;  // local gradient w.r.t. the first d rows of A.
  // Note: \hat{A} in the paper has an extra row; this is never used.
};

}  // End namespace kaldi

#endif  // KALDI_TRANSFORM_EXPONENTIAL_TRANSFORM_H_
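// A rough sketch of one iteration of training the global A matrix (comments
// only; the loop structure and names like "spk_accs", "Ds_s", "t_s" are
// hypothetical, but the calls are the ones declared above):
//
//   ExponentialTransformAccsA accs_a(et.Dim());
//   for (/* each speaker s */) {
//     // spk_accs: FmllrDiagGmmAccs accumulated on un-transformed features;
//     // Ds_s and t_s: this speaker's current D_s and warp factor.
//     accs_a.AccumulateForSpeaker(spk_accs, et, Ds_s, t_s);
//   }
//   ExponentialTransformUpdateAOptions opts;  // learning_rate = 1.0, renormalize = true.
//   BaseFloat objf_impr, count;
//   accs_a.Update(opts, &et, &objf_impr, &count);  // modifies A (and B as a side effect).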