/**
* @file ngram_read_arpa.c
*
*
* @brief ARPA形式のN-gramファイルを読み込む
*
* ARPA形式のN-gramファイルを用いる場合,2-gram と逆向き 3-gram を
* それぞれ別々のファイルから読み込みます.
*
*
*
* @brief Read ARPA format N-gram files
*
* When N-gram data is given in ARPA format, both 2-gram file and
* reverse 3-gram file should be specified.
*
*
* @sa ngram2.h
*
* @author Akinobu LEE
* @date Wed Feb 16 16:52:24 2005
*
* $Revision: 1.20 $
*
*/
/*
* Copyright (c) 1991-2012 Kawahara Lab., Kyoto University
* Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology
* Copyright (c) 2005-2012 Julius project team, Nagoya Institute of Technology
* All rights reserved
*/
/* $Id: ngram_read_arpa.c,v 1.20 2012/08/31 05:17:28 sumomo Exp $ */
/* words should be alphabetically sorted */
#include
#include
static char buf[800]; ///< Line buffer shared by all readers in this file; filled through getl() and tokenized in place
static char pbuf[800]; ///< Scratch buffer: holds an untouched copy of the current line for log messages, and the expected section marker while scanning sections
/**
 * Read the "ngram N=count" header lines that follow the data marker and
 * collect the number of entries of each N-gram order.
 *
 * Reading stops at the first line beginning with a backslash (the next
 * section marker) or at end of input; that line is left in the shared
 * line buffer for the caller to inspect.
 *
 * @param fp [in] file pointer
 * @param numlist [out] newly allocated table; element N-1 holds the entry
 * count of the N-gram.  Orders that are absent or declared empty are set
 * to 0.  The table is allocated even when an error is returned, so the
 * caller must always release it.
 *
 * @return the largest N that has a non-empty entry count (0 if none),
 * or -1 on error.
 */
static int
get_total_info(FILE *fp, NNID **numlist)
{
  int order;
  int max_order = 0;
  unsigned long entry_num;
  int capacity = 10;
  int new_capacity;
  int i;

  *numlist = (NNID *)mymalloc(sizeof(NNID) * capacity);
  for (i = 0; i < capacity; i++) {
    (*numlist)[i] = 0;
  }

  while (getl(buf, sizeof(buf), fp) != NULL && buf[0] != '\\') {
    if (!strnmatch(buf, "ngram", 5)) continue;

    /* both fields must be converted, otherwise the values are garbage */
    if (sscanf(buf, "ngram %d = %lu", &order, &entry_num) != 2 || order < 1) {
      jlog("Error: ngram_read_arpa: invalid n-gram count line: \"%s\"\n", buf);
      return -1;
    }
    /* check maximum number */
    if (entry_num > NNID_MAX) {
      jlog("Error: too big %d-gram (exceeds %d bit)\n", order, (int)(sizeof(NNID) * 8));
      return -1;
    }
    /* ignore empty entry */
    if (entry_num == 0) {
      jlog("Warning: empty %d-gram, skipped\n", order);
      continue;
    }
    if (max_order < order) max_order = order;
    if (order > capacity) {
      /* grow far enough for this order, not just by one doubling */
      new_capacity = capacity * 2;
      if (new_capacity < order) new_capacity = order;
      *numlist = (NNID *)myrealloc(*numlist, sizeof(NNID) * new_capacity);
      for (i = capacity; i < new_capacity; i++) {
        (*numlist)[i] = 0;
      }
      capacity = new_capacity;
    }
    (*numlist)[order - 1] = entry_num;
  }

  return(max_order);
}
/**
 * Read word/class entry names and 1-gram data from an ARPA file.
 *
 * Each line is parsed as "prob name [back-off weight]"; a missing back-off
 * weight is stored as 0.0.  The names are registered both in the name
 * table and in the name index tree, and the probabilities / back-off
 * weights are stored in the 1-gram tuple data.  Reading stops at the first
 * line beginning with a backslash or at end of input.
 *
 * @param fp [in] file pointer
 * @param ndata [out] N-gram to set the read data.
 *
 * @return TRUE on success, FALSE on a parse error, on a duplicated word
 * entry, or when the number of entries does not match the header value.
 */
static boolean
set_unigram(FILE *fp, NGRAM_INFO *ndata)
{
  WORD_ID nid;
  int resid;
  LOGPROB prob, bo_wt;
  char *name, *p;
  boolean ok_p = TRUE;
  NGRAM_TUPLE_INFO *t;

  t = &(ndata->d[0]);

  /* malloc name area */
  ndata->wname = (char **)mymalloc(sizeof(char *) * ndata->max_word_num);
  for (nid = 0; nid < ndata->max_word_num; nid++) {
    ndata->wname[nid] = NULL;
  }

  /* malloc data area */
  t->bgn_upper = NULL;
  t->bgn_lower = NULL;
  t->bgn = NULL;
  t->num = NULL;
  t->bgnlistlen = 0;
  t->nnid2wid = NULL;
  t->prob = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), t->totalnum);
  t->bo_wt = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), t->totalnum);
  t->context_num = t->totalnum;
  t->nnid2ctid_upper = NULL;
  t->nnid2ctid_lower = NULL;

  nid = 0;
  while (getl(buf, sizeof(buf), fp) != NULL && buf[0] != '\\') {
    if ((p = strtok(buf, DELM)) == NULL) {
      jlog("Error: ngram_read_arpa: 1-gram: failed to parse, corrupted or invalid data?\n");
      return FALSE;
    }
    prob = (LOGPROB)atof(p);
    if ((p = strtok(NULL, DELM)) == NULL) {
      jlog("Error: ngram_read_arpa: 1-gram: failed to parse, corrupted or invalid data?\n");
      return FALSE;
    }
    /* reject surplus entries before anything is written to the tables,
       which only have room for max_word_num entries */
    if (nid >= ndata->max_word_num) {
      jlog("Error: ngram_read_arpa: num of 1-gram is bigger than header value (%d)\n", ndata->max_word_num);
      return FALSE;
    }
    name = strcpy((char *)mymalloc(strlen(p)+1), p);
    if ((p = strtok(NULL, DELM)) == NULL) {
      bo_wt = 0.0;
    } else {
      bo_wt = (LOGPROB)atof(p);
    }

    /* register word entry name */
    ndata->wname[nid] = name;

    /* add entry name to index tree */
    if (ndata->root == NULL) {
      ndata->root = ptree_make_root_node(nid, &(ndata->mroot));
    } else {
      resid = ptree_search_data(name, ndata->root);
      if (resid != -1 && strmatch(name, ndata->wname[resid])) { /* already exist */
        jlog("Error: ngram_read_arpa: duplicate word entry \"%s\" at #%d and #%d in 1-gram\n", name, resid, nid);
        /* this slot is reused by the next entry: drop the rejected name */
        ndata->wname[nid] = NULL;
        free(name);
        ok_p = FALSE;
        continue;
      } else {
        ptree_add_entry(name, nid, ndata->wname[resid], &(ndata->root), &(ndata->mroot));
      }
    }

    /* register entry info */
    t->prob[nid] = prob;
    t->bo_wt[nid] = bo_wt;
    nid++;
  }

  if (nid != t->totalnum) {
    jlog("Error: ngram_read_arpa: num of 1-gram (%d) not equal to header value (%d)\n", nid, t->totalnum);
    return FALSE;
  }
  if (ok_p == TRUE) {
    jlog("Stat: ngram_read_arpa: read %d 1-gram entries\n", nid);
  }
  return ok_p;
}
/**
 * Read the 1-gram section of an additional N-gram file and store only the
 * back-off weights, looked up by word name in the already loaded N-gram.
 *
 * Each line is parsed as "prob name [back-off weight]"; the probability is
 * skipped and a missing back-off weight is stored as 0.0.  Words that are
 * never listed in the file keep a back-off weight of 0.0.  Reading stops
 * at the first line beginning with a backslash or at end of input.
 *
 * @param fp [in] file pointer
 * @param ndata [out] N-gram to store the read data.
 *
 * @return TRUE on success, FALSE on a parse error, when a word is not
 * found in the loaded N-gram, or when more entries than the vocabulary
 * size are read.
 */
static boolean
add_unigram(FILE *fp, NGRAM_INFO *ndata)
{
  WORD_ID read_word_num = 0;
  WORD_ID nid;
  LOGPROB bo_wt;
  char *name, *p;
  boolean ok_p = TRUE;
  boolean mismatched = FALSE;

  ndata->bo_wt_1 = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), ndata->max_word_num);
  /* give words that are missing from this file a defined weight */
  for (nid = 0; nid < ndata->max_word_num; nid++) {
    ndata->bo_wt_1[nid] = 0.0;
  }

  while (getl(buf, sizeof(buf), fp) != NULL && buf[0] != '\\') {
    /* first token is the probability, which is not used here */
    if (strtok(buf, DELM) == NULL) {
      jlog("Error: ngram_read_arpa: RL 1-gram: failed to parse, corrupted or invalid data?\n");
      return FALSE;
    }
    /* the token stays valid inside buf while the rest of the line is
       tokenized, so no copy of the name is needed */
    if ((name = strtok(NULL, DELM)) == NULL) {
      jlog("Error: ngram_read_arpa: RL 1-gram: failed to parse, corrupted or invalid data?\n");
      return FALSE;
    }
    if ((p = strtok(NULL, DELM)) == NULL) {
      bo_wt = 0.0;
    } else {
      bo_wt = (LOGPROB)atof(p);
    }

    /* add bo_wt_rl to existing 1-gram entry */
    nid = ngram_lookup_word(ndata, name);
    if (nid == WORD_INVALID) {
      if (mismatched == FALSE) {
        jlog("Error: ngram_read_arpa: vocabulary mismatch between LR n-gram and RL n-gram\n");
        mismatched = TRUE;
      }
      jlog("Error: ngram_read_arpa: \"%s\" does not appears in LR n-gram\n", name);
      ok_p = FALSE;
    } else {
      ndata->bo_wt_1[nid] = bo_wt;
    }

    read_word_num++;
    if (read_word_num > ndata->max_word_num) {
      jlog("Error: ngram_read_arpa: vocabulary size of RL n-gram is bigger than header value (%d)\n", ndata->max_word_num);
      return FALSE;
    }
  }

  if (ok_p == TRUE) {
    jlog("Stat: ngram_read_arpa: read %d 1-gram entries\n", read_word_num);
  }
  return ok_p;
}
/**
 * Read the 2-gram section of an additional N-gram file and store its
 * probabilities alongside the 2-gram entries of the already loaded N-gram.
 *
 * Each line is parsed as "prob word1 word2".  When the loaded N-gram is a
 * backward (RL) one, the word pair is reversed before the lookup.  Entries
 * that have no counterpart in the loaded 2-gram are reported and ignored.
 * Reading stops at the first line beginning with a backslash or at end of
 * input.
 *
 * @param fp [in] file pointer
 * @param ndata [i/o] N-gram to set the read data.
 *
 * @return TRUE on success, FALSE on a parse error or when a word does not
 * exist in the 1-gram.
 */
static boolean
add_bigram(FILE *fp, NGRAM_INFO *ndata)
{
  WORD_ID w[2], wtmp;
  LOGPROB prob;
  NNID bi_count = 0;
  NNID n2;
  NNID total = ndata->d[1].totalnum;
  int percent;
  boolean ok_p = TRUE;
  char *s;

  ndata->p_2 = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), total);

  while (getl(buf, sizeof(buf), fp) != NULL && buf[0] != '\\') {
    /* strtok() destroys buf, so keep the whole line for messages */
    strcpy(pbuf, buf);

    if (++bi_count % 100000 == 0) {
      /* widen before multiplying so that large counts do not wrap, and
         do not divide by an empty total */
      percent = (total > 0) ? (int)((unsigned long long)bi_count * 100 / total) : 0;
      jlog("Stat: ngram_read_arpa: 2-gram read %lu (%d%%)\n", (unsigned long)bi_count, percent);
    }

    if ((s = strtok(buf, DELM)) == NULL) {
      jlog("Error: ngram_read_arpa: 2-gram: failed to parse, corrupted or invalid data?\n");
      return FALSE;
    }
    prob = (LOGPROB)atof(s);

    if ((s = strtok(NULL, DELM)) == NULL) {
      jlog("Error: ngram_read_arpa: 2-gram: failed to parse, corrupted or invalid data?\n");
      return FALSE;
    }
    w[0] = ngram_lookup_word(ndata, s);
    if (w[0] == WORD_INVALID) {
      jlog("Error: ngram_read_arpa: 2-gram #%lu: \"%s\": \"%s\" not exist in 1-gram\n", (unsigned long)bi_count, pbuf, s);
      ok_p = FALSE;
      continue;
    }

    if ((s = strtok(NULL, DELM)) == NULL) {
      jlog("Error: ngram_read_arpa: 2-gram: failed to parse, corrupted or invalid data?\n");
      return FALSE;
    }
    w[1] = ngram_lookup_word(ndata, s);
    if (w[1] == WORD_INVALID) {
      jlog("Error: ngram_read_arpa: 2-gram #%lu: \"%s\": \"%s\" not exist in 1-gram\n", (unsigned long)bi_count, pbuf, s);
      ok_p = FALSE;
      continue;
    }

    if (ndata->dir == DIR_RL) {
      /* word order should be reversed */
      wtmp = w[0];
      w[0] = w[1];
      w[1] = wtmp;
    }

    n2 = search_ngram(ndata, 2, w);
    if (n2 == NNID_INVALID) {
      /* report the position in the file; n2 carries no information here */
      jlog("Warning: ngram_read_arpa: 2-gram #%lu: \"%s\": (%s,%s) not exist in LR 2-gram (ignored)\n", (unsigned long)bi_count, pbuf, ndata->wname[w[0]], ndata->wname[w[1]]);
    } else {
      ndata->p_2[n2] = prob;
    }
  }

  if (ok_p == TRUE) {
    jlog("Stat: ngram_read_arpa: 2-gram read %lu end\n", (unsigned long)bi_count);
  }
  return ok_p;
}
/**
* Read the N-gram section of a given order N (N >= 2) from an ARPA
* N-gram file and store it into the N-th tuple slot of the N-gram data.
*
* For every stored entry the probability, the ID of the last word and,
* when present on the line, the back-off weight are recorded.  Entries
* are grouped by their (N-1)-gram context: for each context the index of
* its first entry and the number of its entries are kept.  When the
* 24-bit index mode is enabled for this order, the first-entry index is
* held as separate upper and lower parts.
*
* NOTE(review): in this copy of the file several statements in the body
* (the lines starting with "for(i=0;i...") look truncated, so the line
* reading loop and the context lookup are not visible here -- confirm
* against the upstream source before modifying this function.
*
* @param fp [in] file pointer
* @param ndata [out] N-gram to set the read data.
* @param n [in] order of the section to read; values below 2 are rejected
*
* @return TRUE when the section was stored without any error, FALSE
* otherwise.
*/
static boolean
set_ngram(FILE *fp, NGRAM_INFO *ndata, int n)
{
NNID i;
WORD_ID *w;
WORD_ID *w_last;
LOGPROB p, bowt;
NNID nnid;
NNID cid, cid_last;
boolean ok_p = TRUE;
char *s;
NGRAM_TUPLE_INFO *t;
NGRAM_TUPLE_INFO *tprev;
NNID ntmp;
if (n < 2) {
jlog("Error: ngram_read_arpa: unable to process 1-gram\n");
return FALSE;
}
/* work areas of n word IDs each; presumably w holds the entry being read
and w_last the previously stored one (see the comparisons below) */
w = (WORD_ID *)mymalloc(sizeof(WORD_ID) * n);
w_last = (WORD_ID *)mymalloc(sizeof(WORD_ID) * n);
t = &(ndata->d[n-1]);
tprev = &(ndata->d[n-2]);
/* initialize pointer storage to access from (N-1)-gram */
t->bgnlistlen = tprev->context_num;
if (t->is24bit) {
/* 24-bit mode: first-entry index is kept as separate upper/lower parts */
t->bgn_upper = (NNID_UPPER *)mymalloc_big(sizeof(NNID_UPPER), t->bgnlistlen);
t->bgn_lower = (NNID_LOWER *)mymalloc_big(sizeof(NNID_LOWER), t->bgnlistlen);
for(i = 0; i < t->bgnlistlen; i++) {
t->bgn_upper[i] = NNID_INVALID_UPPER;
t->bgn_lower[i] = 0;
}
} else {
t->bgn = (NNID *)mymalloc_big(sizeof(NNID), t->bgnlistlen);
for(i = 0;i < t->bgnlistlen; i++) {
t->bgn[i] = NNID_INVALID;
}
}
/* number of entries per context, starting from zero */
t->num = (WORD_ID *)mymalloc_big(sizeof(WORD_ID), t->bgnlistlen);
for(i = 0; i < t->bgnlistlen; i++) {
t->num[i] = 0;
}
/* allocate data area */
t->nnid2wid = (WORD_ID *)mymalloc_big(sizeof(WORD_ID), t->totalnum);
t->prob = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), t->totalnum);
/* the back-off weight area is allocated later, only when a weight is found */
t->bo_wt = NULL;
t->nnid2ctid_upper = NULL;
t->nnid2ctid_lower = NULL;
nnid = 0;
cid = cid_last = NNID_INVALID;
/* NOTE(review): the next statement appears truncated in this copy */
for(i=0;itotalnum);
}
/* N-gram probability */
if ((s = strtok(buf, DELM)) == NULL) {
jlog("Error: ngram_read_arpa: %d-gram: failed to parse, corrupted or invalid data?\n", n);
free(w_last); free(w);
return FALSE;
}
p = (LOGPROB)atof(s);
/* read in context word and lookup the ID */
/* NOTE(review): the next statement appears truncated in this copy */
for(i=0;iwname[w_m], ndata->wname[w_r], n-1);
jlog("Warning: ngram_read_arpa: %d-gram #%d: \"%s\": context (",
n, nnid+1, pbuf);
/* NOTE(review): the next statement appears truncated in this copy */
for(i=0;iwname[w[i]]);
}
jlog(") not exist in %d-gram (ignored)\n", n-1);
ok_p = FALSE;
continue;
}
if (cid_last != NNID_INVALID) {
/* close last entry */
if (t->is24bit) {
/* rebuild the first-entry index from its upper and lower parts */
ntmp = ((NNID)(t->bgn_upper[cid_last]) << 16) + (NNID)(t->bgn_lower[cid_last]);
} else {
ntmp = t->bgn[cid_last];
}
t->num[cid_last] = nnid - ntmp;
}
/* the next context word should be a new entry */
if (t->is24bit) {
if (t->bgn_upper[cid] != NNID_INVALID_UPPER) {
jlog("Error: ngram_read_arpa: %d-gram #%d: \"%s\": word order is not the same as 1-gram\n", n, nnid+1, pbuf);
free(w_last); free(w);
return FALSE;
}
/* split the index: low 16 bits go to bgn_lower, the rest to bgn_upper */
ntmp = nnid & 0xffff;
t->bgn_lower[cid] = ntmp;
ntmp = nnid >> 16;
t->bgn_upper[cid] = ntmp;
} else {
if (t->bgn[cid] != NNID_INVALID) {
jlog("Error: ngram_read_arpa: %d-gram #%d: \"%s\": word order is not the same as 1-gram\n", n, nnid+1, pbuf);
free(w_last); free(w);
return FALSE;
}
t->bgn[cid] = nnid;
}
cid_last = cid;
/* reset the last word so that the checks below are skipped for the
first entry of this context */
w_last[n-1] = WORD_INVALID;
}
/* store the probabilities of the target word */
if (w[n-1] == w_last[n-1]) {
jlog("Error: ngram_read_arpa: %d-gram #%d: \"%s\": duplicated entry\n", n, nnid+1, pbuf);
ok_p = FALSE;
continue;
} else if (w_last[n-1] != WORD_INVALID && w[n-1] < w_last[n-1]) {
jlog("Error: ngram_read_arpa: %d-gram #%d: \"%s\": word order is not the same as 1-gram\n", n, nnid+1, pbuf);
free(w_last); free(w);
return FALSE;
}
/* if the 2-gram has back-off entries, store them here */
if ((s = strtok(NULL, DELM)) != NULL) {
bowt = (LOGPROB) atof(s);
if (t->bo_wt == NULL) {
t->bo_wt = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), t->totalnum);
/* NOTE(review): the next statement appears truncated in this copy */
for(i=0;ibo_wt[i] = 0.0;
}
t->bo_wt[nnid] = bowt;
} else {
if (t->bo_wt != NULL) t->bo_wt[nnid] = 0.0;
}
/* store the entry info */
t->nnid2wid[nnid] = w[n-1];
t->prob[nnid] = p;
nnid++;
/* NOTE(review): the next statement appears truncated in this copy */
for(i=0;i t->totalnum) {
jlog("Error: ngram_read_arpa: %d-gram: read num (%d) not match the header value (%d)\n", n, nnid, t->totalnum);
free(w_last); free(w);
return FALSE;
}
}
/* set the last entry */
if (t->is24bit) {
ntmp = ((NNID)(t->bgn_upper[cid_last]) << 16) + (NNID)(t->bgn_lower[cid_last]);
} else {
ntmp = t->bgn[cid_last];
}
t->num[cid_last] = nnid - ntmp;
/* with back-off weights present, the context count of this order equals
its entry count; it sizes the index of the next order (see bgnlistlen) */
if (t->bo_wt != NULL) t->context_num = t->totalnum;
if (ok_p == TRUE) {
jlog("Stat: ngram_read_arpa: %d-gram read %d end\n", n, nnid);
}
free(w_last); free(w);
return ok_p;
}
/**
 * Read in one ARPA N-gram file.
 *
 * In normal mode the header counts, the 1-gram section and every higher
 * order section up to the largest order declared in the header are read
 * and stored into a freshly allocated tuple table.  In addition mode the
 * file is treated as a supplementary forward N-gram: only the back-off
 * weights of its 1-gram section and the probabilities of its 2-gram
 * section are added to the already loaded N-gram.
 *
 * @param fp [in] file pointer
 * @param ndata [out] N-gram data to store the read data
 * @param addition [in] TRUE if going to read additional 2-gram
 *
 * @return TRUE on success, FALSE on failure.
 */
boolean
ngram_read_arpa(FILE *fp, NGRAM_INFO *ndata, boolean addition)
{
  int i, n;
  NNID *num;

  /* source file is not a binary N-gram */
  ndata->from_bin = FALSE;
  ndata->bigram_index_reversed = FALSE;

  /* read until `\data\' found */
  while (getl(buf, sizeof(buf), fp) != NULL && strncmp(buf, "\\data\\", 6) != 0)
    ;

  if (addition) {
    /* reading additional forward 2-gram for the 1st pass */
    if (ndata->n < 2) {
      jlog("Error: base N-gram should be longer than 2-gram\n");
      return FALSE;
    }
    /* read n-gram total info */
    n = get_total_info(fp, &num);
    if (n == -1) {		/* error */
      free(num);
      return FALSE;
    }
    /* check N limit */
    if (n < 2) {
      jlog("Error: forward N-gram for pass1 is does not contain 2-gram\n");
      free(num);
      return FALSE;
    }
    if (n > 2) {
      jlog("Warning: forward N-gram for pass1 contains %d-gram, only 2-gram will be used\n", n);
    }
    /* check if the numbers are the same with already read n-gram */
    for (i = 0; i < 2; i++) {
      if (ndata->d[i].totalnum != num[i]) {
        jlog("Warning: ngram_read_arpa: %d-gram total num differ between forward N-gram and backward N-gram, may cause some error\n", i+1);
      }
    }
    free(num);

    /* read additional 1-gram data; each reader leaves the section marker
       that ended it in buf, so it is checked before the next section */
    if (!strnmatch(buf, "\\1-grams", 8)) {
      jlog("Error: ngram_read_arpa: 1-gram not found for additional LR 2-gram\n");
      return FALSE;
    }
    jlog("Stat: ngram_read_arpa: reading 1-gram part...\n");
    if (add_unigram(fp, ndata) == FALSE) return FALSE;

    /* read 2-gram data */
    if (!strnmatch(buf, "\\2-grams", 8)) {
      jlog("Error: ngram_read_arpa: 2-gram not found for additional LR 2-gram\n");
      return FALSE;
    }
    jlog("Stat: ngram_read_arpa: reading 2-gram part...\n");
    if (add_bigram(fp, ndata) == FALSE) return FALSE;

    /* ignore the rest */
    if (strnmatch(buf, "\\3-grams", 8)) {
      jlog("Warning: forward n-gram contains more than 3-gram, ignored\n");
    }

  } else {
    /* read n-gram total info */
    n = get_total_info(fp, &num);
    if (n == -1) {		/* error */
      free(num);
      return FALSE;
    }
    /* without any valid order there is no 1-gram slot to work with */
    if (n < 1) {
      jlog("Error: ngram_read_arpa: no valid N-gram entry count found in header\n");
      free(num);
      return FALSE;
    }
    jlog("Stat: ngram_read_arpa: this is %d-gram file\n", n);
    ndata->d = (NGRAM_TUPLE_INFO *)mymalloc(sizeof(NGRAM_TUPLE_INFO) * n);
    memset(ndata->d, 0, sizeof(NGRAM_TUPLE_INFO) * n);
    for (i = 0; i < n; i++) {
      ndata->d[i].totalnum = num[i];
    }
    free(num);

    /* set word num */
    if (ndata->d[0].totalnum > MAX_WORD_NUM) {
      jlog("Error: ngram_read_arpa: N-gram vocabulary size exceeds the limit (%d)\n", MAX_WORD_NUM);
      return FALSE;
    }
    ndata->max_word_num = ndata->d[0].totalnum;

    /* check if each N-gram allows 24bit and back-off compaction mode */
    /* for fast access, 1-gram and 2-gram always use non-compaction mode */
    for (i = 0; i < n; i++) {
      if (i < 2) {
        ndata->d[i].is24bit = FALSE;
      } else {
        /* for 3-gram and later 24 bit mode is preferred,
           but should be disabled if number of entries is over 2^24 */
        if (ndata->d[i].totalnum > NNID_MAX_24) {
          jlog("Warning: ngram_read_arpa: num of %d-gram exceeds 24bit, now switch to %dbit index\n", i+1, (int)(sizeof(NNID) * 8));
          ndata->d[i].is24bit = FALSE;
        } else {
          ndata->d[i].is24bit = TRUE;
        }
      }
    }
    /* disable ct_compaction flag while reading ARPA data */
    for (i = 0; i < n; i++) {
      ndata->d[i].ct_compaction = FALSE;
    }

    /* read 1-gram data */
    if (!strnmatch(buf, "\\1-grams", 8)) {
      jlog("Error: ngram_read_arpa: data format error: 1-gram not found\n");
      return FALSE;
    }
    jlog("Stat: ngram_read_arpa: reading 1-gram part...\n");
    if (set_unigram(fp, ndata) == FALSE) return FALSE;

    for (i = 2; i <= n; i++) {
      /* read n-gram data in turn; compare the whole marker so that
         orders with two or more digits are matched correctly */
      sprintf(pbuf, "\\%d-grams", i);
      if (!strnmatch(buf, pbuf, strlen(pbuf))) {
        jlog("Error: ngram_read_arpa: data format error: %d-gram not found\n", i);
        return FALSE;
      }
      jlog("Stat: ngram_read_arpa: reading %d-gram part...\n", i);
      if (set_ngram(fp, ndata, i) == FALSE) return FALSE;
    }

    /* finished reading file */
    if (!strnmatch(buf, "\\end", 4)) {
      jlog("Error: ngram_read_arpa: data format error: end marker \"\\end\" not found\n");
      return FALSE;
    }

    ndata->n = n;

    for (i = 2; i <= n; i++) {
      if (ndata->d[i-1].bo_wt != NULL) {
        /* perform back-off compaction */
        if (ngram_compact_context(ndata, i) == FALSE) return FALSE;
      }
    }

    /* swap the begin-of-sentence and end-of-sentence words for backward
       SRILM N-gram */
    if (ndata->dir == DIR_RL) {
      WORD_ID bos, eos;
      char *p;

      bos = ngram_lookup_word(ndata, BEGIN_WORD_DEFAULT);
      eos = ngram_lookup_word(ndata, END_WORD_DEFAULT);
      if (!ndata->bos_eos_swap) {
        /* check */
        if (bos != WORD_INVALID && eos != WORD_INVALID && ndata->d[0].prob[bos] == -99) {
          jlog("Stat: \"P(%s) = -99\" in reverse N-gram, may be trained by SRILM\n", BEGIN_WORD_DEFAULT);
          jlog("Stat: going to swap \"%s\" and \"%s\"\n", BEGIN_WORD_DEFAULT, END_WORD_DEFAULT);
          ndata->bos_eos_swap = TRUE;
        }
      }
      if (ndata->bos_eos_swap) {
        if (bos == WORD_INVALID) {
          jlog("Error: ngram_read_arpa: try to swap bos/eos but \"%s\" not found in N-gram\n", BEGIN_WORD_DEFAULT);
        }
        if (eos == WORD_INVALID) {
          jlog("Error: ngram_read_arpa: try to swap bos/eos but \"%s\" not found in N-gram\n", END_WORD_DEFAULT);
        }
        if (bos == WORD_INVALID || eos == WORD_INVALID) {
          return FALSE;
        }
        /* do swap */
        jlog("Stat: ngram_read_arpa: swap \"%s\" and \"%s\" at backward N-gram\n", BEGIN_WORD_DEFAULT, END_WORD_DEFAULT);
        /* swap name buffer */
        p = ndata->wname[bos];
        ndata->wname[bos] = ndata->wname[eos];
        ndata->wname[eos] = p;
        /* replace index */
        ptree_replace_data(BEGIN_WORD_DEFAULT, eos, ndata->root);
        ptree_replace_data(END_WORD_DEFAULT, bos, ndata->root);
      }
    }
  }

#ifdef CLASS_NGRAM
  /* skip in-class word entries (they should be in word dictionary) */
  if (getl(buf, sizeof(buf), fp) != NULL) {
    if (strnmatch(buf, "\\class", 6)) {
      jlog("Stat: ngram_read_arpa: skipping in-class word entries...\n");
    }
  }
#endif

  bi_prob_func_set(ndata);

  return TRUE;
}