// latbin/lattice-oracle.cc // Copyright 2011 Gilles Boulianne // 2013 Johns Hopkins University (author: Daniel Povey) // // See ../../COPYING for clarification regarding multiple authors // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, // MERCHANTABLITY OR NON-INFRINGEMENT. // See the Apache 2 License for the specific language governing permissions and // limitations under the License. #include "base/kaldi-common.h" #include "util/common-utils.h" #include "fstext/fstext-lib.h" #include "lat/kaldi-lattice.h" namespace kaldi { using std::vector; using std::set; typedef unordered_set LabelSet; void ReadSymbolList(const std::string &rxfilename, fst::SymbolTable *word_syms, LabelSet *lset) { Input ki(rxfilename); std::string line; KALDI_ASSERT(lset != NULL); lset->clear(); while (getline(ki.Stream(), line)) { std::string sym; std::istringstream ss(line); ss >> sym >> std::ws; if (ss.fail() || !ss.eof()) { KALDI_ERR << "Bad line in symbol list: "<< line << ", file is: " << PrintableRxfilename(rxfilename); } fst::StdArc::Label lab = word_syms->Find(sym.c_str()); if (lab == fst::SymbolTable::kNoSymbol) { KALDI_ERR << "Can't find symbol in symbol table: " << line << ", file is: " << PrintableRxfilename(rxfilename); } lset->insert(lab); } } void MapWildCards(const LabelSet &wildcards, fst::StdVectorFst *ofst) { // map all wildcards symbols to epsilons for (fst::StateIterator siter(*ofst); !siter.Done(); siter.Next()) { fst::StdArc::StateId s = siter.Value(); for (fst::MutableArcIterator aiter(ofst, s); !aiter.Done(); aiter.Next()) { fst::StdArc arc(aiter.Value()); LabelSet::const_iterator it = wildcards.find(arc.ilabel); if (it != wildcards.end()) { KALDI_VLOG(4) << "MapWildCards: mapping symbol " << arc.ilabel << " to epsilon" << endl; arc.ilabel = 0; } it = wildcards.find(arc.olabel); if (it != wildcards.end()) {arc.olabel = 0;} aiter.SetValue(arc); } } } // convert from Lattice to standard FST // also maps wildcard symbols to epsilons // then removes epsilons void ConvertLatticeToUnweightedAcceptor(const kaldi::Lattice &ilat, const LabelSet &wildcards, fst::StdVectorFst *ofst) { // first convert from lattice to normal FST fst::ConvertLattice(ilat, ofst); // remove weights, project to output, sort according to input arg fst::Map(ofst, fst::RmWeightMapper()); fst::Project(ofst, fst::PROJECT_OUTPUT); // The words are on the output side MapWildCards(wildcards, ofst); fst::RmEpsilon(ofst); // Don't tolerate epsilons as they make it hard to tally errors fst::ArcSort(ofst, fst::StdILabelCompare()); } void CreateEditDistance(const fst::StdVectorFst &fst1, const fst::StdVectorFst &fst2, fst::StdVectorFst *pfst) { using namespace fst; typedef StdArc StdArc; typedef StdArc::Weight Weight; typedef StdArc::Label Label; Weight correct_cost(0.0); Weight substitution_cost(1.0); Weight insertion_cost(1.0); Weight deletion_cost(1.0); // create set of output symbols in fst1 std::vector