// nnet2/nnet-nnet.h

// Copyright 2011-2012  Karel Vesely
//                      Johns Hopkins University (author: Daniel Povey)

// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED ON AN *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS
// OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY
// IMPLIED WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.

#ifndef KALDI_NNET2_NNET_NNET_H_
#define KALDI_NNET2_NNET_NNET_H_

#include "base/kaldi-common.h"
#include "util/kaldi-io.h"
#include "matrix/matrix-lib.h"
#include "nnet2/nnet-component.h"

#include <iostream>
#include <sstream>
#include <vector>

namespace kaldi {
namespace nnet2 {


/*
  This neural net is basically a series of Components, and is a fairly
  passive object that mainly acts as a store for the Components.  Training
  is handled by a separate class NnetTrainer(), and extracting likelihoods
  for decoding is handled by DecodableNnetCpu().

  There are a couple of things that make this class more than just a vector of
  Components.

  (1) It handles frame splicing (temporal context).
  We'd like to encompass the approach described in
  http://www.fit.vutbr.cz/research/groups/speech/publi/2011/vesely_asru2011_00042.pdf
  where at a certain point they splice together frames -10, -5, 0, +5 and +10.
  It seems that it's not necessarily best to splice together a contiguous
  sequence of frames.

  (2) It handles situations where the input features have two parts--
  a "frame-specific" part (the normal features), and a "speaker-specific", or
  at least utterance-specific part that does not vary with the frame index.
  These features are provided separately from the frame-specific ones, to avoid
  redundancy.
*/

class Nnet {
 public:
  /// Returns number of components-- think of this as similar to # of layers,
  /// but e.g. the nonlinearity and the linear part count as separate
  /// components, so the number of components will be more than the number of
  /// layers.
  int32 NumComponents() const { return components_.size(); }

  const Component &GetComponent(int32 c) const;

  Component &GetComponent(int32 c);

  /// Sets the c'th component to "component", taking ownership of the pointer
  /// and deleting the corresponding one that we own.
  void SetComponent(int32 c, Component *component);

  /// Returns the LeftContext() summed over all the Components... this is the
  /// entire left-context in frames that the network requires.
  int32 LeftContext() const;

  /// Returns the RightContext() summed over all the Components... this is the
  /// entire right-context in frames that the network requires.
  int32 RightContext() const;
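  // Illustrative note (an editorial sketch, not part of the original header):
  // contexts add up over components.  With the splicing scheme described in
  // the comment at the top of this file, a SpliceComponent over offsets
  // {-10, -5, 0, +5, +10} as the first component would by itself contribute
  // 10 frames of left-context and 10 of right-context, so LeftContext() and
  // RightContext() would each return 10 plus whatever context any later
  // components require.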
  /// The output dimension of the network-- typically
  /// the number of pdfs.
  int32 OutputDim() const;

  /// Dimension of the input features, e.g. 13 or 40.  Does not
  /// take account of frame splicing-- that is done with the "chunk"
  /// mechanism, where you provide chunks of features over time.
  int32 InputDim() const;

  void ZeroStats();  // zeroes the stats on the nonlinear layers.

  /// Copies only the statistics in layers of type NonlinearComponent, from
  /// this neural net, leaving everything else fixed.
  void CopyStatsFrom(const Nnet &nnet);

  int32 NumUpdatableComponents() const;

  /// Scales the parameters of each of the updatable components.
  /// Here, scale_params is a vector of size equal to
  /// NumUpdatableComponents().
  void ScaleComponents(const VectorBase<BaseFloat> &scales);

  /// Excise any components of type DropoutComponent or AdditiveNoiseComponent.
  void RemoveDropout();

  /// Calls SetDropoutScale for all the dropout nodes.
  void SetDropoutScale(BaseFloat scale);

  /// Replace any components of type AffineComponentPreconditioned with
  /// components of type AffineComponent.
  void RemovePreconditioning();

  /// For each updatable component, adds to it
  /// the corresponding element of "other" times the
  /// appropriate element of "scales" (which has the
  /// same format as for ScaleComponents(), i.e.
  /// one entry for each updatable component).
  void AddNnet(const VectorBase<BaseFloat> &scales,
               const Nnet &other);

  /// Scales all the Components with the same scale.  This applies to
  /// UpdatableComponents, and (unlike the ScaleComponents function) to
  /// SoftmaxComponents.
  void Scale(BaseFloat scale);

  /// Adds to *this, the other neural net times the scale "alpha".  This
  /// applies to UpdatableComponents, and (unlike the other AddNnet function)
  /// to SoftmaxComponents.
  void AddNnet(BaseFloat alpha,
               const Nnet &other);

  /// Turns the last affine layer into two layers of the same type, with a
  /// smaller dimension in between-- we're keeping the top singular values of
  /// the matrix.
  void LimitRankOfLastLayer(int32 dimension);

  /// This version of AddNnet adds to *this, alpha times *other, and then
  /// scales *other by beta.  The reason why we make this a separate function
  /// is for multithreading reasons (otherwise you could do
  /// AddNnet(alpha, *other) and then other->Scale(beta)).
  void AddNnet(BaseFloat alpha,
               Nnet *other,
               BaseFloat beta);

  /// Removes final components from the neural network (used for
  /// debugging).
  void Resize(int32 num_components);

  /// Where possible, collapse multiple affine or linear components in a
  /// sequence into a single one by composing the transforms.  If
  /// match_updatableness=true, this will not collapse, say, an
  /// AffineComponent with a FixedAffineComponent or FixedLinearComponent.
  /// If false, it will collapse them.  This function won't necessarily
  /// work for all pairs of such layers.  It currently only works where
  /// one of each pair is an AffineComponent.
  void Collapse(bool match_updatableness);

  /// Sets the index_ values of the components.
  void SetIndexes();

  Nnet(const Nnet &other);  // Copy constructor.

  Nnet(const Nnet &nnet1, const Nnet &nnet2);  // Constructor that takes two
  // nnets: it concatenates the layers.

  Nnet() { }

  Nnet &operator = (const Nnet &other);  // assignment operator.

  /// Initialize from config file.
  /// Each line of the config is either a comment line starting
  /// with whitespace then #, or it is a line that specifies one
  /// layer of the network, as accepted by Component::InitFromString().
  /// An example non-comment line is:
  /// AffineComponent learning-rate=0.01 l2-penalty=0.001 input-dim=10 output-dim=15 param-stddev=0.1
  void Init(std::istream &is);
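  // Editorial sketch (not in the original header): a minimal config of the
  // kind Init() accepts might look like the following; the component types
  // are from nnet2/nnet-component.h and the dimensions are just placeholders
  // (input-dim=360 here is 40 features times 9 spliced frames):
  //
  //   # comment lines like this one are ignored
  //   SpliceComponent input-dim=40 left-context=4 right-context=4
  //   AffineComponent learning-rate=0.01 input-dim=360 output-dim=1000 param-stddev=0.03
  //   TanhComponent dim=1000
  //   AffineComponent learning-rate=0.01 input-dim=1000 output-dim=500 param-stddev=0.03
  //   SoftmaxComponent dim=500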
  /// This Init method works from a vector of components.  It will take
  /// ownership of the pointers and resize the vector to zero to avoid a
  /// chance of the caller deallocating them.
  void Init(std::vector<Component*> *components);

  /// Appends this component to the components already in the neural net.
  /// Takes ownership of the pointer.
  void Append(Component *new_component);

  virtual ~Nnet() { Destroy(); }

  std::string Info() const;  // some human-readable summary info.

  /*
  std::string LrateInfo() const;  // some info on the learning rates,
                                  // in human-readable form.

  // the same, broken down by sets.
  std::string LrateInfo(const std::vector<std::vector<int32> > &final_sets)
      const;

  // the same, broken down by sets, for shrinkage rates.
  std::string SrateInfo(const std::vector<std::vector<int32> > &final_sets)
      const;

  // Mix up by increasing the dimension of the output of the softmax layer (and
  // the input of the linear layer).  This is exactly analogous to mixing up
  // Gaussians in a GMM-HMM system, and we use a similar power rule to allocate
  // new ones [so a "category" gets an allocation of indices/Gaussians
  // proportional to a power "power" of its total occupancy].
  void MixUp(int32 target_tot_neurons,
             BaseFloat power,  // e.g. 0.2.
             BaseFloat perturb_stddev);

  void Init(const Nnet1InitInfo &init_info);
  */

  void Destroy();

  void Write(std::ostream &os, bool binary) const;

  void Read(std::istream &is, bool binary);
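  // Editorial sketch (not in the original header) of writing and re-reading a
  // network with the Kaldi I/O classes from util/kaldi-io.h; "final.nnet" is
  // just a placeholder filename:
  //
  //   {
  //     Output ko("final.nnet", true /* binary */);
  //     nnet.Write(ko.Stream(), true);
  //   }
  //   {
  //     bool binary_in;
  //     Input ki("final.nnet", &binary_in);
  //     nnet.Read(ki.Stream(), binary_in);
  //   }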
  void SetZero(bool treat_as_gradient);  // Sets all parameters to zero and if
  // treat_as_gradient == true, also tells components to "think of themselves
  // as gradients" (affects some of the update code).  Also zeroes stats stored
  // with things of type NonlinearComponent.

  /// [This function is only used in the binary nnet-train.cc which is
  /// currently not being used].  This is used to separately adjust learning
  /// rates of each layer, after each "phase" of training.  We basically ask
  /// (using the validation gradient), do we wish we had gone further in this
  /// direction?  Yes -> increase learning rate, no -> decrease it.  The inputs
  /// have dimension NumUpdatableComponents().
  void AdjustLearningRates(
      const VectorBase<BaseFloat> &old_model_old_gradient,
      const VectorBase<BaseFloat> &new_model_old_gradient,
      const VectorBase<BaseFloat> &old_model_new_gradient,
      const VectorBase<BaseFloat> &new_model_new_gradient,
      BaseFloat measure_at,  // where to measure gradient, on line between old
                             // and new model; 0.5 < measure_at <= 1.0.
      BaseFloat learning_rate_ratio,
      BaseFloat max_learning_rate);

  /// Scale all the learning rates in the neural net by this factor.
  void ScaleLearningRates(BaseFloat factor);

  /// Set all the learning rates in the neural net to this value.
  void SetLearningRates(BaseFloat learning_rate);

  /// Set all the learning rates in the neural net to these values
  /// (one for each updatable layer).
  void SetLearningRates(const VectorBase<BaseFloat> &learning_rates);

  /// Get all the learning rates in the neural net (the output
  /// must have dim equal to NumUpdatableComponents()).
  void GetLearningRates(VectorBase<BaseFloat> *learning_rates) const;

  // This sets *dot_prod to the dot product of *this with the validation
  // gradient "other", separately for each updatable component.  The vector
  // must have size equal to this->NumUpdatableComponents().  Warning:
  // previously it had to have size equal to this->NumComponents().  This is
  // used in updating learning rates and shrinkage rates.
  void ComponentDotProducts(
      const Nnet &other,
      VectorBase<BaseFloat> *dot_prod) const;

  void Check() const;  // Consistency check.

  void ResetGenerators();  // resets random-number generators for all
  // random components.  You must also set srand() for this to be
  // effective.

  // The following three functions are used for vectorizing
  // the parameters-- used, for example, in L-BFGS.
  virtual int32 GetParameterDim() const;
  virtual void Vectorize(VectorBase<BaseFloat> *params) const;
  virtual void UnVectorize(const VectorBase<BaseFloat> &params);

  friend class NnetUpdater;
  friend class DecodableNnet;
 private:
  std::vector<Component*> components_;
};


} // namespace nnet2
} // namespace kaldi

#endif
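// Editorial usage sketch (not part of the original header), showing how the
// Scale() and AddNnet() members declared above combine to average the
// parameters of two networks, e.g. models trained in parallel; nnet1 and
// nnet2 are placeholder variables:
//
//   kaldi::nnet2::Nnet avg(nnet1);  // copy-construct from the first model.
//   avg.Scale(0.5);                 // halve its parameters.
//   avg.AddNnet(0.5, nnet2);        // add half of the second model's params.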