// nnet2/combine-nnet-fast.h // Copyright 2012 Johns Hopkins University (author: Daniel Povey) // See ../../COPYING for clarification regarding multiple authors // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, // MERCHANTABLITY OR NON-INFRINGEMENT. // See the Apache 2 License for the specific language governing permissions and // limitations under the License. #ifndef KALDI_NNET2_COMBINE_NNET_FAST_H_ #define KALDI_NNET2_COMBINE_NNET_FAST_H_ #include "nnet2/nnet-update.h" #include "nnet2/nnet-compute.h" #include "util/parse-options.h" #include "itf/options-itf.h" // Compare with combine-nnet.h. What we're doing is taking // a set of neural nets, and combining them with combination weights // (separate weights for each updatable layer), and optimizing // these weights using a validation set, // This is a faster implementation // with multi-threading and more careful preconditioning. // To get the pre-conditioning, we divide the validation subset // up into small-ish batches (e.g. 100 frames), and compute the // neural net gradient for each one. We then compute the parameter // gradient (i.e. the gradient w.r.t. the combination weights we're // optimizing) for each batch, and use the scatter of these as a // kind of Fisher matrix for preconditioning. namespace kaldi { namespace nnet2 { /** Configuration class that controls neural net combination, where we combine a number of neural nets, trying to find for each layer the optimal weighted combination of the different neural-net parameters. */ struct NnetCombineFastConfig { int32 initial_model; // If provided, the index of the initial model to start // the optimization from. int32 num_lbfgs_iters; int32 num_threads; BaseFloat initial_impr; BaseFloat fisher_floor; // Flooring value we use for Fisher matrix (mainly // makes a difference in pnorm systems, where there // are don't-care directions in parameter space. BaseFloat alpha; // A smoothing value we use in getting the Fisher matrix. int32 fisher_minibatch_size; // e.g. 64; a relatively small minibatch size we // use in the Fisher matrix computation (smaller will generally mean more accurate // preconditioning but will slow down the computation). int32 minibatch_size; // e.g. 1028; a larger minibatch size we use in // the gradient computation. int32 max_lbfgs_dim; BaseFloat regularizer; NnetCombineFastConfig(): initial_model(-1), num_lbfgs_iters(10), num_threads(1), initial_impr(0.01), fisher_floor(1.0e-20), alpha(0.01), fisher_minibatch_size(64), minibatch_size(1024), max_lbfgs_dim(10), regularizer(0.0) {} void Register(OptionsItf *po) { po->Register("initial-model", &initial_model, "Specifies where to start the " "optimization from. If 0 ... #models-1, then specifies the model; " "if >= #models, then the average of all inputs; if <0, chosen " "automatically from the previous options."); po->Register("num-lbfgs-iters", &num_lbfgs_iters, "Maximum number of function " "evaluations for L-BFGS to use when optimizing combination weights"); po->Register("initial-impr", &initial_impr, "Amount of objective-function change " "We aim for on the first iteration."); po->Register("num-threads", &num_threads, "Number of threads to use in " "multi-core computation"); po->Register("fisher-floor", &fisher_floor, "Floor for diagonal of Fisher matrix (used in preconditioning)"); po->Register("alpha", &alpha, "Value we use in smoothing the Fisher matrix " "with its diagonal, in preconditioning the update."); po->Register("fisher-minibatch-size", &fisher_minibatch_size, "Size of minibatch " "used in computation of Fisher matrix (smaller -> better " "preconditioning"); po->Register("minibatch-size", &minibatch_size, "Minibatch size used in computing " "gradients (only affects speed)"); po->Register("max-lbfgs-dim", &max_lbfgs_dim, "Maximum dimension to use in " "L-BFGS (will not get higher than this even if the dimension " "of the space gets higher.)"); po->Register("regularizer", ®ularizer, "Add to the objective " "function (which is average log-like per frame), -0.5 * " "regularizer * square of parameters."); } }; void CombineNnetsFast(const NnetCombineFastConfig &combine_config, const std::vector &validation_set, const std::vector &nnets_in, Nnet *nnet_out); } // namespace nnet2 } // namespace kaldi #endif