/**
* @file ngram_read_bin.c
*
*
* @brief バイナリ形式のN-gramファイルを読み込む
*
* バイナリ形式では 2-gram と逆向き 3-gram が1つのファイルに
* 収められています.バイナリ形式はJulius独自形式のみをサポートしており,
* 他のバイナリ形式と互換性はありませんので注意して下さい.
*
* rev.3.5 より,バイナリN-gramのファイル形式の一部が変更されました.
* バイトオーダーが Big endian 固定からマシン依存に変更され(ヘッダに
* 変換時の条件を記述), またインデックスの 24bit 化
* および 2-gram のバックオフデータの圧縮も行われました.
* これにより,3.5 以降の mkbingram で生成したバイナリN-gramは,
* 3.4.2以前の Julius では使えませんので注意してください.
* (ヘッダチェックでエラーとなる)
*
* なお 3.5 以降の Julius では従来のモデルも問題なく読める.この場合,
* インデックスの 24bit 化とバックオフの圧縮はモデル読み込み時に
* その都度行われる.またバイトオーダーはヘッダを見て適宜変換するので,
* 異なるバイトオーダーのマシンで生成した
* バイナリN-gramでも問題なく読める.もちろん従来のモデルもそのまま
* 読み込める.
*
*
*
*
* @brief Read binary format N-gram file
*
* In binary format, both 2-gram and reverse 3-gram are stored
* together in one file. This binary format is not
* compatible with other binary format of language model.
*
* From 3.5, internal format of binary N-gram has changed for using
* machine-dependent natural byte order (previously fixed to big endian),
* 24bit index and 2-gram backoff compression. So, binary N-gram
* generated by mkbingram of 3.5 and later will not work on 3.4.2 and
* earlier versions.
*
* There is full upward- and cross-machine compatibility in 3.5. Old
* binary N-gram files can still be read directly, in which case the conversion
* to 24bit index will be performed just after the model has been read.
* Byte order is also determined from the header information, so a
* binary N-gram can still be used across machines of different byte order.
*
*
* @author Akinobu LEE
* @date Wed Feb 16 17:12:08 2005
*
* $Revision: 1.11 $
*
*/
/*
* Copyright (c) 1991-2012 Kawahara Lab., Kyoto University
* Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology
* Copyright (c) 2005-2012 Julius project team, Nagoya Institute of Technology
* All rights reserved
*/
#include
#include
/* State of the file currently being loaded, shared by the readers below. */
static int file_version; ///< Format version (3, 4 or 5) detected by check_header()
static boolean need_swap; ///< TRUE if multi-byte units read by rdnfunc() must be byte-swapped
#ifdef WORDS_INT
static boolean need_conv; ///< TRUE if word IDs are stored as 2 bytes and must be widened to WORD_ID while reading
static boolean words_int_retry = FALSE; ///< TRUE while re-reading a file after the first pass suggested 2-byte word IDs
#endif
/**
 * Read helpers that leave the calling function on failure.
 *
 * Each macro calls the corresponding read function and executes
 * "return FALSE" in the caller when that function reports failure, so
 * they can only be used inside functions returning boolean.  The
 * expansion is wrapped in do { } while (0) so that it behaves as a
 * single statement even inside an unbraced if/else.
 */
#define rdn(A,B,C,D) do { if (rdnfunc(A,B,C,D) == FALSE) return FALSE; } while (0)
#define rdn_wordid(A,B,C,D) do { if (rdn_wordid_func(A,B,C,D) == FALSE) return FALSE; } while (0)
/**
* Binary read function with byte swap
*
* @param fp [in] file pointer
* @param buf [out] data buffer
* @param unitbyte [in] unit size in bytes
* @param unitnum [in] number of unit to read.
*/
static boolean
rdnfunc(FILE *fp, void *buf, size_t unitbyte, size_t unitnum)
{
size_t tmp;
if ((tmp = myfread(buf, unitbyte, unitnum, fp)) < unitnum) {
jlog("Error: ngram_read_bin: failed to read %d bytes\n", unitbyte*unitnum);
return FALSE;
}
if (need_swap) {
if (unitbyte != 1) {
swap_bytes(buf, unitbyte, unitnum);
}
}
return TRUE;
}
#ifdef WORDS_INT
/**
 * Read an array of word IDs, widening 2-byte IDs to WORD_ID if requested.
 *
 * Without conversion the data is read directly as WORD_ID units.  With
 * conversion, @a unitnum unsigned short values are read into the start of
 * @a buf and then expanded in place, so @a buf must always be large
 * enough for @a unitnum WORD_ID values.
 *
 * @param fp [in] file pointer
 * @param buf [out] data buffer (room for @a unitnum WORD_ID values)
 * @param unitnum [in] number of word IDs to read
 * @param need_conv [in] TRUE if the file stores 2-byte IDs to be widened
 *
 * @return TRUE on success, FALSE if the read failed.
 */
static boolean
rdn_wordid_func(FILE *fp, void *buf, int unitnum, boolean need_conv)
{
  unsigned short *narrow;
  WORD_ID *wide;
  WORD_ID value;
  int remain;

  if (!need_conv) {
    /* stored size already matches WORD_ID: plain read */
    rdn(fp, buf, sizeof(WORD_ID), unitnum);
    return TRUE;
  }

  /* the file holds unsigned short units */
  rdn(fp, buf, sizeof(unsigned short), unitnum);

  /* widen from the last element backwards so that a wide store never
     overwrites a narrow value that has not been read yet */
  narrow = (unsigned short *)buf;
  wide = (WORD_ID *)buf;
  for (remain = unitnum; remain > 0; remain--) {
    value = narrow[remain - 1];
    wide[remain - 1] = value;
  }
  return TRUE;
}
#endif
/**
 * Read the file header and set up the reader state from it.
 *
 * Reads BINGRAM_HDSIZE bytes and, from their content, sets file_version
 * (3, 4 or 5), need_swap and, when built with WORDS_INT, need_conv.
 * Both flags are reset on entry so that nothing is carried over from a
 * previously loaded file when a header section is absent.
 *
 * @param fp [in] file pointer
 *
 * @return TRUE if the header was recognized, FALSE on read failure, on an
 * unknown ID string, on an unsupported word ID size, or when a version 4
 * or later header carries no byte order information.
 */
static boolean
check_header(FILE *fp)
{
  char buf[BINGRAM_HDSIZE], *p;

  rdn(fp, buf, 1, BINGRAM_HDSIZE);
  p = buf;

  /* start from a known state: need_swap is otherwise left untouched when
     a version 4 or later header has no word size section */
  need_swap = FALSE;
#ifdef WORDS_INT
  need_conv = FALSE;
#endif

  /* version check */
  if (strnmatch(p, BINGRAM_IDSTR, strlen(BINGRAM_IDSTR))) {
    /* bingram file made by mkbingram before 3.4.2 */
    file_version = 3;
    p += strlen(BINGRAM_IDSTR) + 1;
  } else if (strnmatch(p, BINGRAM_IDSTR_V4, strlen(BINGRAM_IDSTR_V4))) {
    /* bingram file made by mkbingram later than 3.5 */
    file_version = 4;
    p += strlen(BINGRAM_IDSTR_V4) + 1;
  } else if (strnmatch(p, BINGRAM_IDSTR_V5, strlen(BINGRAM_IDSTR_V5))) {
    /* bingram file made by JuliusLib-4 and later */
    file_version = 5;
    p += strlen(BINGRAM_IDSTR_V5) + 1;
  } else {
    /* not a bingram file */
    jlog("Error: ngram_read_bin: invalid header\n");
    return FALSE;
  }

  /* word size check (for bingram built by mkbingram 3.3p5 and later) */
  if (strnmatch(p, BINGRAM_SIZESTR_HEAD, strlen(BINGRAM_SIZESTR_HEAD))) {
    p += strlen(BINGRAM_SIZESTR_HEAD);
    if (! strnmatch(p, BINGRAM_SIZESTR_BODY, strlen(BINGRAM_SIZESTR_BODY))) {
      /* word size does not match (int / short) */
#ifdef WORDS_INT
      if (strnmatch(p, BINGRAM_SIZESTR_BODY_2BYTE, strlen(BINGRAM_SIZESTR_BODY_2BYTE))) {
        /* this is 2-byte word ID, will convert while reading */
        jlog("Warning: ngram_read_bin: 2-bytes bingram, converting to 4 bytes\n");
        need_conv = TRUE;
        p += strlen(BINGRAM_SIZESTR_BODY_2BYTE) + 1;
      } else {
        jlog("Error: ngram_read_bin: unknown word byte size!\n");
        return FALSE;
      }
#else
      if (strnmatch(p, BINGRAM_SIZESTR_BODY_4BYTE, strlen(BINGRAM_SIZESTR_BODY_4BYTE))) {
        /* conversion from 4 bytes to 2 bytes is not implemented */
        jlog("Error: ngram_read_bin: cannot handle 4-bytes bingram\n");
        jlog("Error: ngram_read_bin: please use Julius compiled with --enable-words-int\n");
        return FALSE;
      } else {
        jlog("Error: ngram_read_bin: unknown word byte size!\n");
        return FALSE;
      }
#endif
    } else {
      p += strlen(BINGRAM_SIZESTR_BODY) + 1;
    }

    /* byte order check (v4 (rev.3.5) and later) */
    if (file_version >= 4) {
      if (!strnmatch(p, BINGRAM_BYTEORDER_HEAD, strlen(BINGRAM_BYTEORDER_HEAD))) {
        jlog("Error: ngram_read_bin: no information for byte order??\n");
        return FALSE;
      }
      p += strlen(BINGRAM_BYTEORDER_HEAD);
      if (! strnmatch(p, BINGRAM_NATURAL_BYTEORDER, strlen(BINGRAM_NATURAL_BYTEORDER))) {
        /* file endian and running endian is different, need swapping */
        need_swap = TRUE;
      } else {
        need_swap = FALSE;
      }
      p += strlen(BINGRAM_NATURAL_BYTEORDER) + 1;
    }
  } /* if no BINGRAM_SIZESTR_HEAD found, just pass it */

  /* in case of V3 bingram file, the unit size of word_id and its byte order
     cannot be determined from the header.  In that case, we assume
     byteorder to be a BIG ENDIAN.  The word_id unit size (2byte in normal,
     or 4byte if bingram generated with mkbingram with --enable-words-int)
     will be automagically detected.
  */
  if (file_version < 4) {
    /* assume input as big endian */
#ifdef WORDS_BIGENDIAN
    need_swap = FALSE;
#else
    need_swap = TRUE;
#endif
  }

  return TRUE;
}
/**
 * Read the body of a version 5 binary N-gram file.
 *
 * The header must already have been consumed by check_header().  Reads
 * the N-gram order, direction and index flag, the per-order tuple counts,
 * the word name table, and then the data of each order, followed by the
 * optional extra 1-gram back-off weights and extra LR 2-gram
 * probabilities.
 *
 * On failure the function returns immediately; buffers allocated up to
 * that point are not released here.
 *
 * @param fp [in] file pointer
 * @param ndata [out] N-gram data to store the read data
 *
 * @return TRUE on success, FALSE on read failure or inconsistent data.
 */
static boolean
ngram_read_bin_v5(FILE *fp, NGRAM_INFO *ndata)
{
  int i,n,len;
  char *w, *p;
  NGRAM_TUPLE_INFO *t;

  /* read some info extended from version 5 */
  rdn(fp, &(ndata->n), sizeof(int), 1);
  rdn(fp, &(ndata->dir), sizeof(int), 1);
  rdn(fp, &(ndata->bigram_index_reversed), sizeof(boolean), 1);
  jlog("Stat: ngram_read_bin_v5: this is %s %d-gram file\n", (ndata->dir == DIR_LR) ? "forward" : "backward", ndata->n);
  /* d[0] is accessed unconditionally below, so at least 1-gram is required */
  if (ndata->n < 1) {
    jlog("Error: ngram_read_bin_v5: invalid N-gram order: %d\n", ndata->n);
    return FALSE;
  }

  /* read total info and set max_word_num */
  ndata->d = (NGRAM_TUPLE_INFO *)mymalloc(sizeof(NGRAM_TUPLE_INFO) * ndata->n);
  memset(ndata->d, 0, sizeof(NGRAM_TUPLE_INFO) * ndata->n);
  for(n=0;n<ndata->n;n++) {
    rdn(fp, &(ndata->d[n].totalnum), sizeof(NNID), 1);
  }
  ndata->max_word_num = ndata->d[0].totalnum;

  /* read wname: one buffer of "len" bytes holding the NUL-terminated
     names back to back */
  rdn(fp, &len, sizeof(int), 1);
  if (len < 0) {
    jlog("Error: ngram_read_bin_v5: invalid word name table size: %d\n", len);
    return FALSE;
  }
  w = mymalloc(len);
  rdn(fp, w, 1, len);
  /* assign a pointer to the head of each name */
  ndata->wname = (char **)mymalloc(sizeof(char *) * ndata->max_word_num);
  ndata->wname[0] = NULL;
  p = w; i = 0;
  while (p < w + len) {
    /* never write past the pointer table, whatever the file contains */
    if (i >= ndata->max_word_num) {
      jlog("Error: ngram_read_bin_v5: wname error??\n");
      return FALSE;
    }
    ndata->wname[i++] = p;
    /* never scan past the buffer if the last name is not terminated */
    while (p < w + len && *p != '\0') p++;
    if (p >= w + len) {
      jlog("Error: ngram_read_bin_v5: wname error??\n");
      return FALSE;
    }
    p++;
  }
  if (i != ndata->max_word_num) {
    jlog("Error: ngram_read_bin_v5: wname error??\n");
    return FALSE;
  }

  /* read N-gram */
  for(n=0;n<ndata->n;n++) {
    jlog("stat: ngram_read_bin_v5: reading %d-gram\n", n+1);

    t = &(ndata->d[n]);

    rdn(fp, &(t->is24bit), sizeof(boolean), 1);
    rdn(fp, &(t->ct_compaction), sizeof(boolean), 1);
    rdn(fp, &(t->bgnlistlen), sizeof(NNID), 1);
    rdn(fp, &(t->context_num), sizeof(NNID), 1);

    if (n > 0) {
      /* 2-gram and up: index from the context to the entry list */
      if (t->is24bit) {
        t->bgn_upper = (NNID_UPPER *)mymalloc_big(sizeof(NNID_UPPER), t->bgnlistlen);
        rdn(fp, t->bgn_upper, sizeof(NNID_UPPER), t->bgnlistlen);
        t->bgn_lower = (NNID_LOWER *)mymalloc_big(sizeof(NNID_LOWER), t->bgnlistlen);
        rdn(fp, t->bgn_lower, sizeof(NNID_LOWER), t->bgnlistlen);
      } else {
        t->bgn = (NNID *)mymalloc_big(sizeof(NNID), t->bgnlistlen);
        rdn(fp, t->bgn, sizeof(NNID), t->bgnlistlen);
      }
      t->num = (WORD_ID *)mymalloc_big(sizeof(WORD_ID), t->bgnlistlen);
#ifdef WORDS_INT
      rdn_wordid(fp, t->num, t->bgnlistlen, need_conv);
#else
      rdn(fp, t->num, sizeof(WORD_ID), t->bgnlistlen);
#endif
      t->nnid2wid = (WORD_ID *)mymalloc_big(sizeof(WORD_ID), t->totalnum);
#ifdef WORDS_INT
      rdn_wordid(fp, t->nnid2wid, t->totalnum, need_conv);
#else
      rdn(fp, t->nnid2wid, sizeof(WORD_ID), t->totalnum);
#endif
    } else {
      /* 1-gram has no index */
      t->bgn_upper = NULL;
      t->bgn_lower = NULL;
      t->bgn = NULL;
      t->num = NULL;
      t->bgnlistlen = 0;
      t->nnid2wid = NULL;
    }

    t->prob = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), t->totalnum);
    rdn(fp, t->prob, sizeof(LOGPROB), t->totalnum);

    /* flag: 1 if back-off weights follow */
    rdn(fp, &i, sizeof(int), 1);
    if (i == 1) {
      t->bo_wt = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), t->context_num);
      rdn(fp, t->bo_wt, sizeof(LOGPROB), t->context_num);
    } else {
      t->bo_wt = NULL;
    }
    /* flag: 1 if the tuple-to-context ID table follows */
    rdn(fp, &i, sizeof(int), 1);
    if (i == 1) {
      t->nnid2ctid_upper = (NNID_UPPER *)mymalloc_big(sizeof(NNID_UPPER), t->totalnum);
      t->nnid2ctid_lower = (NNID_LOWER *)mymalloc_big(sizeof(NNID_LOWER), t->totalnum);
      rdn(fp, t->nnid2ctid_upper, sizeof(NNID_UPPER), t->totalnum);
      rdn(fp, t->nnid2ctid_lower, sizeof(NNID_LOWER), t->totalnum);
    } else {
      t->nnid2ctid_upper = NULL;
      t->nnid2ctid_lower = NULL;
    }
  }

  /* flag: 1 if the additional 1-gram back-off weights follow */
  rdn(fp, &i, sizeof(int), 1);
  if (i == 1) {
    ndata->bo_wt_1 = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), ndata->d[0].context_num);
    rdn(fp, ndata->bo_wt_1, sizeof(LOGPROB), ndata->d[0].context_num);
  } else {
    ndata->bo_wt_1 = NULL;
  }
  /* flag: 1 if the additional LR 2-gram probabilities follow */
  rdn(fp, &i, sizeof(int), 1);
  if (i == 1) {
    /* the table is sized by d[1], which exists only for 2-gram and up */
    if (ndata->n < 2) {
      jlog("Error: ngram_read_bin_v5: additional LR 2-gram found in %d-gram file\n", ndata->n);
      return FALSE;
    }
    jlog("Stat: ngram_read_bin_v5: reading additional LR 2-gram\n");
    ndata->p_2 = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), ndata->d[1].totalnum);
    rdn(fp, ndata->p_2, sizeof(LOGPROB), ndata->d[1].totalnum);
  } else {
    ndata->p_2 = NULL;
  }

  return TRUE;
}
/**
 * Read the body of a version 3 or version 4 binary N-gram file.
 *
 * The header must already have been consumed by check_header().  These
 * formats always hold a backward 3-gram with a reversed 2-gram index, so
 * the order, direction and index flag are set here instead of being read.
 * For a version 3 file whose 3-gram count fits in 24 bits, the 3-gram
 * index is split into upper/lower parts after reading and the 2-gram
 * context is compacted with ngram_compact_context().
 *
 * When built with WORDS_INT, the 2-gram entry counts are checked after
 * the 1-gram part has been read.  If a count exceeds the vocabulary size
 * the file is taken to hold 2-byte word IDs: everything allocated so far
 * is released, the file is rewound, words_int_retry is set, @a retry_ret
 * is set to 1 and FALSE is returned so that the caller can start over.
 *
 * On any other failure the function returns immediately; buffers
 * allocated up to that point are not released here.
 *
 * @param fp [in] file pointer
 * @param ndata [out] N-gram data to store the read data
 * @param retry_ret [out] set to 1 when the caller should read the file
 * again with word ID conversion; not written otherwise
 *
 * @return TRUE on success, FALSE on failure or when a retry is requested.
 */
static boolean
ngram_read_bin_compat(FILE *fp, NGRAM_INFO *ndata, int *retry_ret)
{
  int i,n,len;
  char *w, *p;
  NNID *n3_bgn = NULL;		/* temporary 3-gram index of v3, 24bit case only */
  NNID d, ntmp;
  NGRAM_TUPLE_INFO *t, *tt, *ttt;

  /* old binary N-gram assumes these types */
  ndata->bigram_index_reversed = TRUE;
  ndata->n = 3;
  ndata->dir = DIR_RL;

  /* read total info and set max_word_num */
  ndata->d = (NGRAM_TUPLE_INFO *)mymalloc(sizeof(NGRAM_TUPLE_INFO) * ndata->n);
  memset(ndata->d, 0, sizeof(NGRAM_TUPLE_INFO) * ndata->n);
  for(n=0;n<ndata->n;n++) {
    rdn(fp, &(ndata->d[n].totalnum), sizeof(NNID), 1);
  }
  ndata->max_word_num = ndata->d[0].totalnum;

  /* version 4 stores the number of 2-gram contexts */
  if (file_version == 4) {
    rdn(fp, &(ndata->d[1].context_num), sizeof(NNID), 1);
  }

  /* only the 3-gram index may use the 24bit form */
  for(n=0;n<ndata->n;n++) {
    if (n < 2) {
      ndata->d[n].is24bit = FALSE;
    } else {
      if (ndata->d[n].totalnum >= NNID_MAX_24) {
	/* sizeof yields size_t: convert to match "%d" */
	jlog("Warning: ngram_read_bin_compat: num of %d-gram exceeds 24bit, now switch to %dbit index\n", n+1, (int)(sizeof(NNID) * 8));
	ndata->d[n].is24bit = FALSE;
      } else {
	ndata->d[n].is24bit = TRUE;
      }
    }
    ndata->d[n].nnid2ctid_upper = NULL;
    ndata->d[n].nnid2ctid_lower = NULL;
  }
  /* mark context compaction for 2-gram and up, not for 1-gram */
  ndata->d[0].ct_compaction = FALSE;
  for(n=1;n<ndata->n;n++) {
    ndata->d[n].ct_compaction = TRUE;
  }

  /* read wname: one buffer of "len" bytes holding the NUL-terminated
     names back to back */
  rdn(fp, &len, sizeof(int), 1);
  if (len < 0) {
    jlog("Error: ngram_read_bin_compat: invalid word name table size: %d\n", len);
    return FALSE;
  }
  w = mymalloc(len);
  rdn(fp, w, 1, len);
  /* assign a pointer to the head of each name */
  ndata->wname = (char **)mymalloc(sizeof(char *) * ndata->max_word_num);
  p = w; i = 0;
  while (p < w + len) {
    /* never write past the pointer table, whatever the file contains */
    if (i >= ndata->max_word_num) {
      jlog("Error: ngram_read_bin_compat: wname error??\n");
      return FALSE;
    }
    ndata->wname[i++] = p;
    /* never scan past the buffer if the last name is not terminated */
    while (p < w + len && *p != '\0') p++;
    if (p >= w + len) {
      jlog("Error: ngram_read_bin_compat: wname error??\n");
      return FALSE;
    }
    p++;
  }
  if (i != ndata->max_word_num) {
    jlog("Error: ngram_read_bin_compat: wname error??\n");
    return FALSE;
  }

  /* malloc 1-gram */
  t = &(ndata->d[0]);
  tt = &(ndata->d[1]);
  ttt = &(ndata->d[2]);

  t->bgn_upper = NULL;
  t->bgn_lower = NULL;
  t->bgn = NULL;
  t->num = NULL;
  t->bgnlistlen = 0;
  t->nnid2wid = NULL;
  t->nnid2ctid_upper = NULL;
  t->nnid2ctid_lower = NULL;

  t->context_num = t->totalnum;
  t->prob = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), t->totalnum);
  ndata->bo_wt_1 = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), t->context_num);
  t->bo_wt = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), t->context_num);
  tt->bgnlistlen = t->context_num;
  tt->bgn = (NNID *)mymalloc_big(sizeof(NNID), tt->bgnlistlen);
  tt->num = (WORD_ID *)mymalloc_big(sizeof(WORD_ID), tt->bgnlistlen);

  /* read 1-gram */
  jlog("stat: ngram_read_bin_compat: reading 1-gram\n");
  rdn(fp, t->prob, sizeof(LOGPROB), t->totalnum);
  rdn(fp, ndata->bo_wt_1, sizeof(LOGPROB), t->context_num);
  rdn(fp, t->bo_wt, sizeof(LOGPROB), t->context_num);
  rdn(fp, tt->bgn, sizeof(NNID), tt->bgnlistlen);
#ifdef WORDS_INT
  rdn_wordid(fp, tt->num, tt->bgnlistlen, need_conv);
#else
  rdn(fp, tt->num, sizeof(WORD_ID), tt->bgnlistlen);
#endif

#ifdef WORDS_INT
  {
    /* check if we are wrongly reading word_id=2byte bingram
       (if bingram version >= 4, this should not happen because the
       header correctly tells the word_id byte size.  This will
       occur only if all the conditions below are met:
       - you run Julius with --enable-words-int,
       - you use old bingram of version <= 3, and
       - you use bingram file converted without --enable-words-int)
    */
    WORD_ID wid;
    for(wid=0;wid<ndata->max_word_num;wid++) {
      if (ndata->d[1].num[wid] > ndata->max_word_num) {
	if (words_int_retry) {
	  jlog("Error: ngram_read_bin_compat: retry failed, wrong bingram format\n");
	  return FALSE;
	}
	jlog("Warning: ngram_read_bin_compat: incorrect data, may be a 2-byte v3 bingram, retry with conversion\n");
	/* release everything allocated so far; wname[0] is the head of
	   the name buffer */
	free(ndata->wname[0]);
	free(ndata->wname);
	ndata->wname = NULL;
	free(t->prob);
	free(ndata->bo_wt_1);
	ndata->bo_wt_1 = NULL;
	free(t->bo_wt);
	free(tt->bgn);
	free(tt->num);
	/* the tuple array itself is allocated again on the retry */
	free(ndata->d);
	ndata->d = NULL;
	myfrewind(fp);
	words_int_retry = TRUE;
	*retry_ret = 1;
	return FALSE;
      }
    }
  }
#endif

  /* malloc the rest */
  tt->nnid2wid = (WORD_ID *)mymalloc_big(sizeof(WORD_ID), tt->totalnum);
  tt->prob = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), tt->totalnum);
  ndata->p_2 = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), tt->totalnum);
  if (file_version == 4) {	/* context compaction and 24bit */
    tt->nnid2ctid_upper = (NNID_UPPER *)mymalloc_big(sizeof(NNID_UPPER), tt->totalnum);
    tt->nnid2ctid_lower = (NNID_LOWER *)mymalloc_big(sizeof(NNID_LOWER), tt->totalnum);
    tt->bo_wt = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), tt->context_num);
    ttt->bgnlistlen = tt->context_num;
    ttt->bgn_upper = (NNID_UPPER *)mymalloc_big(sizeof(NNID_UPPER), ttt->bgnlistlen);
    ttt->bgn_lower = (NNID_LOWER *)mymalloc_big(sizeof(NNID_LOWER), ttt->bgnlistlen);
    ttt->num = (WORD_ID *)mymalloc_big(sizeof(WORD_ID), ttt->bgnlistlen);
  } else {
    /* version 3: one context per 2-gram tuple */
    tt->context_num = tt->totalnum;
    tt->bo_wt = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), tt->context_num);
    ttt->bgnlistlen = tt->context_num;
    ttt->num = (WORD_ID *)mymalloc_big(sizeof(WORD_ID), ttt->bgnlistlen);
    if (ttt->is24bit) {
      ttt->bgn_upper = (NNID_UPPER *)mymalloc_big(sizeof(NNID_UPPER), ttt->bgnlistlen);
      ttt->bgn_lower = (NNID_LOWER *)mymalloc_big(sizeof(NNID_LOWER), ttt->bgnlistlen);
      /* the file holds a full-width index: read it here, split it later */
      n3_bgn = (NNID *)mymalloc_big(sizeof(NNID), ttt->bgnlistlen);
    } else {
      ttt->bgn = (NNID *)mymalloc_big(sizeof(NNID), ttt->bgnlistlen);
    }
  }

  ttt->nnid2wid = (WORD_ID *)mymalloc_big(sizeof(WORD_ID), ttt->totalnum);
  ttt->prob = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), ttt->totalnum);
  ttt->bo_wt = NULL;

  /* read 2-gram*/
  jlog("Stat: ngram_read_bin_compat: reading 2-gram\n");
#ifdef WORDS_INT
  rdn_wordid(fp, tt->nnid2wid, tt->totalnum, need_conv);
#else
  rdn(fp, tt->nnid2wid, sizeof(WORD_ID), tt->totalnum);
#endif
  rdn(fp, ndata->p_2, sizeof(LOGPROB), tt->totalnum);
  rdn(fp, tt->prob, sizeof(LOGPROB), tt->totalnum);
  if (file_version == 4) {
    rdn(fp, tt->nnid2ctid_upper, sizeof(NNID_UPPER), tt->totalnum);
    rdn(fp, tt->nnid2ctid_lower, sizeof(NNID_LOWER), tt->totalnum);
    rdn(fp, tt->bo_wt, sizeof(LOGPROB), tt->context_num);
    rdn(fp, ttt->bgn_upper, sizeof(NNID_UPPER), ttt->bgnlistlen);
    rdn(fp, ttt->bgn_lower, sizeof(NNID_LOWER), ttt->bgnlistlen);
#ifdef WORDS_INT
    rdn_wordid(fp, ttt->num, ttt->bgnlistlen, need_conv);
#else
    rdn(fp, ttt->num, sizeof(WORD_ID), ttt->bgnlistlen);
#endif
  } else {
    rdn(fp, tt->bo_wt, sizeof(LOGPROB), tt->context_num);
    if (ttt->is24bit) {
      rdn(fp, n3_bgn, sizeof(NNID), ttt->bgnlistlen);
      /* split each index into its lower 16 bits and the bits above */
      for(d=0;d<ttt->bgnlistlen;d++) {
	if (n3_bgn[d] == NNID_INVALID) {
	  ttt->bgn_lower[d] = 0;
	  ttt->bgn_upper[d] = NNID_INVALID_UPPER;
	} else {
	  ntmp = n3_bgn[d] & 0xffff;
	  ttt->bgn_lower[d] = ntmp;
	  ntmp = n3_bgn[d] >> 16;
	  ttt->bgn_upper[d] = ntmp;
	}
      }
    } else {
      rdn(fp, ttt->bgn, sizeof(NNID), ttt->bgnlistlen);
    }
#ifdef WORDS_INT
    rdn_wordid(fp, ttt->num, ttt->bgnlistlen, need_conv);
#else
    rdn(fp, ttt->num, sizeof(WORD_ID), ttt->bgnlistlen);
#endif
  }

  /* read 3-gram*/
  jlog("Stat: ngram_read_bin_compat: reading 3-gram\n");
#ifdef WORDS_INT
  rdn_wordid(fp, ttt->nnid2wid, ttt->totalnum, need_conv);
#else
  rdn(fp, ttt->nnid2wid, sizeof(WORD_ID), ttt->totalnum);
#endif
  rdn(fp, ttt->prob, sizeof(LOGPROB), ttt->totalnum);

  /* compact the 2-gram back-off and 3-gram links */
  /* NOTE(review): for a version 3 file whose 3-gram index is not 24bit no
     compaction is run here, although ct_compaction was set to TRUE for
     2-gram and up above -- confirm that this is intended */
  if (file_version != 4) {
    if (ttt->is24bit) {
      free(n3_bgn);
      if (ngram_compact_context(ndata, 2) == FALSE) return FALSE;
    }
  }

  return TRUE;
}
/**
 * Load a binary N-gram file into an N-gram data structure.
 *
 * Checks the header, dispatches to the reader for the detected file
 * version, then builds the word lookup tree and sets the 2-gram
 * probability function.  When built with WORDS_INT, the version 3/4
 * reader may ask for a second pass with word ID conversion forced; the
 * whole sequence, header check included, is then repeated.
 *
 * @param fp [in] file pointer
 * @param ndata [out] N-gram data to store the read data
 *
 * @return TRUE on success, FALSE on failure.
 */
boolean
ngram_read_bin(FILE *fp, NGRAM_INFO *ndata)
{
  int retry;
  boolean loaded;

#ifdef WORDS_INT
  /* the first pass never forces conversion */
  words_int_retry = FALSE;
#endif

  /* one pass per iteration; a second iteration happens only when the
     reader asked for a retry, with words_int_retry = TRUE */
  do {
    retry = 0;
    ndata->from_bin = TRUE;

    /* check initial header */
    if (check_header(fp) == FALSE) return FALSE;
    jlog("Stat: ngram_read_bin: file version: %d\n", file_version);

#ifdef WORDS_INT
    /* in retry mode, force word_id conversion */
    if (words_int_retry) need_conv = TRUE;
    if (need_conv) jlog("Stat: ngram_read_bin: word-id size conversion enabled\n");
#endif

    if (file_version > 4) {
      loaded = ngram_read_bin_v5(fp, ndata);
    } else {
      loaded = ngram_read_bin_compat(fp, ndata, &retry);
    }

    if (loaded == FALSE) {
#ifdef WORDS_INT
      /* give up unless the reader requested another pass */
      if (retry != 1) return FALSE;
#else
      return FALSE;
#endif
    }
  } while (loaded == FALSE);

  /* make word search tree for later lookup */
  jlog("Stat: ngram_read_bin: making entry name index\n");
  ngram_make_lookup_tree(ndata);

  bi_prob_func_set(ndata);

  return TRUE;
}