/**
* @file init_ngram.c
*
*
* @brief N-gramファイルをメモリに読み込み単語辞書と対応を取る
*
*
*
* @brief Load N-gram file into memory and setup with word dictionary
*
*
* @author Akinobu LEE
* @date Wed Feb 16 07:40:53 2005
*
* $Revision: 1.9 $
*
*/
/*
* Copyright (c) 1991-2012 Kawahara Lab., Kyoto University
* Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology
* Copyright (c) 2005-2012 Julius project team, Nagoya Institute of Technology
* All rights reserved
*/
#include
#include
#include
/**
 * Read and set up N-gram data from a binary format file.
 *
 * Opens the file, passes the stream to ngram_read_bin(), closes it, and
 * then assigns the default unknown (=OOV) word ID through
 * set_default_unknown_id().
 *
 * @param ndata [out] pointer to N-gram data structure to store the data
 * @param bin_ngram_file [in] file name of the binary N-gram
 *
 * @return TRUE on success, FALSE if the file could not be opened, read
 * or closed.
 */
boolean
init_ngram_bin(NGRAM_INFO *ndata, char *bin_ngram_file)
{
  FILE *fp;

  jlog("Stat: init_ngram: reading in binary n-gram from %s\n", bin_ngram_file);
  if ((fp = fopen_readfile(bin_ngram_file)) == NULL) {
    jlog("Error: init_ngram: failed to open \"%s\"\n", bin_ngram_file);
    return FALSE;
  }
  if (ngram_read_bin(fp, ndata) == FALSE) {
    jlog("Error: init_ngram: failed to read \"%s\"\n", bin_ngram_file);
    /* Do not leak the stream on the failure path.  The close result is
       ignored because the function is already reporting an error. */
    (void)fclose_readfile(fp);
    return FALSE;
  }
  if (fclose_readfile(fp) == -1) {
    jlog("Error: init_ngram: failed to close \"%s\"\n", bin_ngram_file);
    return FALSE;
  }

  /* set default unknown (=OOV) word id */
  set_default_unknown_id(ndata);

  jlog("Stat: init_ngram: finished reading n-gram\n");
  return TRUE;
}
/**
 * Read and set up N-gram data from an ARPA format file.
 *
 * Resets ndata->root, records the requested direction in ndata->dir,
 * reads the file through ngram_read_arpa(), and then assigns the default
 * unknown (=OOV) word ID through set_default_unknown_id().
 *
 * @param ndata [out] pointer to N-gram data structure to store the data
 * @param ngram_file [in] file name of the ARPA N-gram file
 * @param dir [in] direction (DIR_LR | DIR_RL)
 *
 * @return TRUE on success, FALSE if the file could not be opened, read
 * or closed.
 */
boolean
init_ngram_arpa(NGRAM_INFO *ndata, char *ngram_file, int dir)
{
  FILE *fp;

  ndata->root = NULL;
  ndata->dir = dir;

  jlog("Stat: init_ngram: reading in ARPA %s n-gram from %s\n", (ndata->dir == DIR_LR) ? "forward" : "backward", ngram_file);

  /* read the n-gram; its direction is whatever the caller gave in dir */
  if ((fp = fopen_readfile(ngram_file)) == NULL) {
    jlog("Error: init_ngram: failed to open \"%s\"\n", ngram_file);
    return FALSE;
  }
  if (ngram_read_arpa(fp, ndata, FALSE) == FALSE) {
    jlog("Error: init_ngram: failed to read \"%s\"\n", ngram_file);
    /* Do not leak the stream on the failure path.  The close result is
       ignored because the function is already reporting an error. */
    (void)fclose_readfile(fp);
    return FALSE;
  }
  if (fclose_readfile(fp) == -1) {
    jlog("Error: init_ngram: failed to close \"%s\"\n", ngram_file);
    return FALSE;
  }

  /* set default unknown (=OOV) word id */
  set_default_unknown_id(ndata);

  jlog("Stat: init_ngram: finished reading n-gram\n");
  return TRUE;
}
/**
 * Read additional LR 2-gram for 1st pass.
 *
 * The file is read through ngram_read_arpa() with its third argument set
 * to TRUE (the main ARPA reader in this file passes FALSE).  Unlike the
 * other readers here, this function does not touch the unknown word ID.
 *
 * @param ndata [out] pointer to N-gram data structure to store the data
 * @param bigram_file [in] file name of ARPA 2-gram file
 *
 * @return TRUE on success, FALSE if the file could not be opened, read
 * or closed.
 */
boolean
init_ngram_arpa_additional(NGRAM_INFO *ndata, char *bigram_file)
{
  FILE *fp;

  jlog("Stat: init_ngram: reading in additional LR 2-gram for the 1st pass from %s\n", bigram_file);
  if ((fp = fopen_readfile(bigram_file)) == NULL) {
    jlog("Error: init_ngram: failed to open \"%s\"\n", bigram_file);
    return FALSE;
  }
  if (ngram_read_arpa(fp, ndata, TRUE) == FALSE) {
    jlog("Error: init_ngram: failed to read \"%s\"\n", bigram_file);
    /* Do not leak the stream on the failure path.  The close result is
       ignored because the function is already reporting an error. */
    (void)fclose_readfile(fp);
    return FALSE;
  }
  if (fclose_readfile(fp) == -1) {
    jlog("Error: init_ngram: failed to close \"%s\"\n", bigram_file);
    return FALSE;
  }

  jlog("Stat: init_ngram: finished reading LR 2-gram\n");
  return TRUE;
}
/**
 * Make correspondence between word dictionary and N-gram vocabulary.
 *
 * Every dictionary word is looked up with make_ngram_ref() and the result
 * is stored in winfo->wton[].  Words that resolve to the unknown word ID
 * are counted in ndata->unk_num, and log10 of that count is stored in
 * ndata->unk_num_log (0.0 when the count is zero).
 *
 * @param ndata [i/o] word/class N-gram, the unknown word information will be set.
 * @param winfo [i/o] word dictionary, the word-to-ngram-entry mapping will be done here.
 *
 * @return TRUE when all words were mapped, FALSE if at least one word
 * resolved to WORD_INVALID.
 */
boolean
make_voca_ref(NGRAM_INFO *ndata, WORD_INFO *winfo)
{
  int w;
  int unmapped = 0;	/* number of dictionary words with no N-gram entry */

  jlog("Stat: init_ngram: mapping dictonary words to n-gram entries\n");

  ndata->unk_num = 0;
  for (w = 0; w < winfo->num; w++) {
    winfo->wton[w] = make_ngram_ref(ndata, winfo->wname[w]);
    if (winfo->wton[w] == WORD_INVALID) {
      unmapped++;
    } else if (winfo->wton[w] == ndata->unk_id) {
      (ndata->unk_num)++;
    }
  }

  /* the whole dictionary is scanned first so the total can be reported */
  if (unmapped > 0) {
    jlog("Error: --- Failed to map %d words in dictionary to N-gram\n", unmapped);
    jlog("Error: --- Specify the word to which those words are mapped with \"-mapunk\" (default: \"\" or \"\"\n");
    return FALSE;
  }

  if (ndata->unk_num != 0) {
    ndata->unk_num_log = (float)log10(ndata->unk_num);
  } else {
    /* avoid taking log10 of zero */
    ndata->unk_num_log = 0.0;
  }

  jlog("Stat: init_ngram: finished word-to-ngram mapping\n");
  return TRUE;
}
/**
 * @brief Set default unknown word ID to the N-gram data.
 *
 * UNK_WORD_DEFAULT is looked up first; if it is not in the N-gram,
 * UNK_WORD_DEFAULT2 is tried.  When one of them is found the model is
 * marked as open vocabulary (isopen = TRUE), otherwise as closed
 * vocabulary (isopen = FALSE).  ndata->unk_num is reset to 0 in all cases.
 *
 * @param ndata [out] N-gram data to set unknown word ID.
 */
void
set_default_unknown_id(NGRAM_INFO *ndata)
{
  char *candidate = UNK_WORD_DEFAULT;

  ndata->unk_id = ngram_lookup_word(ndata, candidate);
  if (ndata->unk_id == WORD_INVALID) {
    /* primary name is absent: fall back to the alternative one */
    candidate = UNK_WORD_DEFAULT2;
    ndata->unk_id = ngram_lookup_word(ndata, candidate);
  }

  if (ndata->unk_id != WORD_INVALID) {
    jlog("Stat: init_ngram: found unknown word entry \"%s\"\n", candidate);
    ndata->isopen = TRUE;
  } else {
    jlog("Stat: init_ngram: neither \"%s\" nor \"%s\" was found, assuming close vocabulary LM\n", UNK_WORD_DEFAULT, UNK_WORD_DEFAULT2);
    ndata->isopen = FALSE;
  }

  ndata->unk_num = 0;
}
/**
 * @brief Set user-specified word ID to the N-gram data.
 *
 * The given word is looked up in the N-gram vocabulary.  If it exists, it
 * becomes the unknown word entry and the model is marked as open
 * vocabulary.  If it does not exist, only a message is logged and
 * ndata is left unchanged.
 *
 * @param ndata [out] N-gram data to set unknown word ID.
 * @param str [in] word name string of unknown word
 */
void
set_unknown_id(NGRAM_INFO *ndata, char *str)
{
  WORD_ID w;

  w = ngram_lookup_word(ndata, str);
  if (w == WORD_INVALID) {
    /* terminate the line so the next log entry is not appended to it */
    jlog("Stat: init_ngram: \"%s\" not found\n", str);
  } else {
    jlog("Stat: init_ngram: unknown word entry was set to \"%s\"\n", str);
    ndata->unk_id = w;
    ndata->isopen = TRUE;
  }
}
/**
 * @brief Fix unigram probability of BOS / EOS word.
 *
 * The unigram probabilities of the N-gram entries mapped from the
 * dictionary's head and tail silence words are inspected.  If the BOS one
 * equals -99, it is overwritten with the EOS value; otherwise, if the EOS
 * one equals -99, it is overwritten with the BOS value.  A warning is
 * logged whenever a value is replaced.
 *
 * This is the case when the LM is trained by SRILM, which assigns
 * unigram probability of "-99" to the beginning-of-sentence word,
 * and causes search on reverse direction to fail.
 *
 * @param ndata [i/o] N-gram data
 * @param winfo [i/o] Vocabulary information
 *
 */
void
fix_uniprob_srilm(NGRAM_INFO *ndata, WORD_INFO *winfo)
{
  WORD_ID bos = winfo->wton[winfo->head_silwid];
  WORD_ID eos = winfo->wton[winfo->tail_silwid];

  if (ndata->d[0].prob[bos] == -99.0) {
    /* BOS carries the placeholder value: borrow the EOS probability */
    jlog("Warning: BOS word \"%s\" has unigram prob of \"-99\"\n", ndata->wname[bos]);
    jlog("Warning: assigining value of EOS word \"%s\": %f\n", ndata->wname[eos], ndata->d[0].prob[eos]);
    ndata->d[0].prob[bos] = ndata->d[0].prob[eos];
    return;
  }

  if (ndata->d[0].prob[eos] != -99.0) {
    /* neither entry needs fixing */
    return;
  }

  /* EOS carries the placeholder value: borrow the BOS probability */
  jlog("Warning: EOS word \"%s\" has unigram prob of \"-99\"\n", ndata->wname[eos]);
  jlog("Warning: assigining value of BOS word \"%s\": %f\n", ndata->wname[bos], ndata->d[0].prob[bos]);
  ndata->d[0].prob[eos] = ndata->d[0].prob[bos];
}