/**
* @file ngram_write_bin.c
*
*
* @brief N-gramをバイナリ形式でファイルに書き出す
*
* rev.3.5 より,読み込みの高速性を考慮して書き出しのバイトオーダーを
* Big endian 固定からマシン依存に変更された.またインデックスの 24bit 化
* および 2-gram のバックオフデータの圧縮も行うなど,ファイル形式の
* 内部仕様が一部変更された.これにより,3.5 以降の mkbingram で
* で生成したバイナリN-gramは, 3.4.2以前の Julius では使えない.
* (ヘッダチェックでエラーとなる)
*
* なお 3.5 以降の Julius では従来のモデルも問題なく読める.この場合,
* インデックスの 24bit 化とバックオフの圧縮はモデル読み込み時に
* その都度行われる.
*
* バイトオーダーに関してヘッダに記述することで,読み込み時に判定して
* 読み込む.これにより,異なるバイトオーダーのマシンで生成した
* バイナリN-gramでも問題なく読める.もちろん従来のモデルもそのまま
* 読み込める.
*
*
*
* @brief Write a whole N-gram data to a file in binary format
*
* From 3.5, internal format of binary N-gram has changed for using
* machine-dependent natural byte order (previously fixed to big endian),
* 24bit index and 2-gram backoff compression. So, binary N-gram
* generated by mkbingram of 3.5 and later will not work on 3.4.2 and
* earlier versions.
*
* There is full upward- and cross-machine compatibility in 3.5. Old
* binary N-gram files can still be read directly, in which case the conversion
* to 24bit index will be performed just after the model has been read.
* The byte order is also determined from the header information, so a
* binary N-gram can still be used across different machines.
*
*
* @author Akinobu LEE
* @date Wed Feb 16 17:23:16 2005
*
* $Revision: 1.6 $
*
*/
/*
* Copyright (c) 1991-2012 Kawahara Lab., Kyoto University
* Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology
* Copyright (c) 2005-2012 Julius project team, Nagoya Institute of Technology
* All rights reserved
*/
#include
#include
static boolean need_swap; ///< TRUE if need byte swap

/**
 * Write data through wrtfunc() and make the enclosing function return
 * FALSE immediately when the write fails.
 *
 * The expansion is wrapped in do { } while (0) so that "wrt(...);" is a
 * single statement: a bare "if" expansion would silently capture a
 * following "else" when the macro is used in an unbraced if/else.
 */
#define wrt(A,B,C,D) do { if (wrtfunc(A,B,C,D) == FALSE) return FALSE; } while (0)

static unsigned int count; ///< Number of bytes written by wrtfunc() since the last reset
/**
 * Reset the written-byte counter to zero.
 *
 * Declared with an explicit (void) parameter list so that the definition
 * is a real prototype and calls passing arguments are diagnosed.
 */
void
reset_wrt_counter(void)
{
  count = 0;
}
/**
 * Get the current value of the written-byte counter.
 *
 * Declared with an explicit (void) parameter list so that the definition
 * is a real prototype and calls passing arguments are diagnosed.
 *
 * @return the number of bytes accumulated in the counter.
 */
static unsigned int
get_wrt_counter(void)
{
  return count;
}
/**
* Binary write function, with byte swapping if needed.
*
* @param fp [in] file pointer
* @param buf [in] data buffer to write
* @param unitbyte [in] unit size in bytes
* @param unitnum [in] number of unit to write
*/
static boolean
wrtfunc(FILE *fp, void *buf, size_t unitbyte, size_t unitnum)
{
if (need_swap == TRUE && unitbyte != 1) {
swap_bytes((char *)buf, unitbyte, unitnum);
}
if (myfwrite(buf, unitbyte, unitnum, fp) < unitnum) {
jlog("Error: write_ngram_bin: failed to write %d bytes", unitbyte*unitnum);
return FALSE;
}
if (need_swap == TRUE && unitbyte != 1) {
swap_bytes((char *)buf, unitbyte, unitnum);
}
count += unitbyte * unitnum;
return TRUE;
}
/**
* Write header information, with identifier string.
*
* The header text is built as: format id string, newline, size
* descriptor, a space, byte-order descriptor, newline, then the user
* string.  A block of exactly BINGRAM_HDSIZE bytes is then written.
*
* @param fp [in] file pointer
* @param str [in,out] user header string (any string within BINGRAM_HDSIZE
* bytes is allowed); it is shortened in place when the header would be
* too long
*
* @return TRUE on success, FALSE when the write fails.
*/
static boolean
write_header(FILE *fp, char *str)
{
char buf[BINGRAM_HDSIZE];
int i, totallen;
/* NOTE(review): the next line looks truncated -- the text between a '<'
   and a '>' appears to have been lost, so the loop condition/body and the
   computation of totallen are not visible here.  Restore it from the
   upstream source before building. */
for(i=0;i= BINGRAM_HDSIZE) {
jlog("Warning: write_bingram: header too long, last will be truncated\n");
/* cut the excess characters off the tail of the user string; note that
   this writes into the caller's buffer */
i = strlen(str) - (totallen - BINGRAM_HDSIZE);
str[i] = '\0';
}
/* NOTE(review): sprintf() is unbounded.  If totallen is the length of the
   formatted header (its computation is not visible here), the truncation
   above leaves exactly BINGRAM_HDSIZE characters, so the terminating NUL
   would be stored one byte past the end of buf -- confirm, and consider
   cutting one more character or using a bounded formatter. */
sprintf(buf, "%s\n%s%s %s%s\n%s", BINGRAM_IDSTR_V5, BINGRAM_SIZESTR_HEAD, BINGRAM_SIZESTR_BODY, BINGRAM_BYTEORDER_HEAD, BINGRAM_NATURAL_BYTEORDER, str);
/* the whole fixed-size buffer is written, not just the formatted text */
wrt(fp, buf, 1, BINGRAM_HDSIZE);
return TRUE;
}
/**
 * Write a whole N-gram data in binary format.
 *
 * Output order:
 *  -# the header block written by write_header()
 *  -# N, direction and the bigram-index-reversed flag
 *  -# the number of tuples of every order
 *  -# total byte length of the word names, then each NUL-terminated name
 *  -# the tuple tables of every order
 *  -# the optional additional LR 2-gram parameters (bo_wt_1, p_2)
 *
 * Every optional array is preceded by an int flag: 1 when the array
 * follows, 0 when it is absent.  The byte counter is reset on entry and
 * the total is logged on success.
 *
 * @param fp [in] file pointer
 * @param ndata [in] N-gram data to write
 * @param headerstr [in] user header string (write_header() may shorten it
 * in place when it is too long)
 *
 * @return TRUE on success, FALSE on failure
 */
boolean
ngram_write_bin(FILE *fp, NGRAM_INFO *ndata, char *headerstr)
{
  int i, n;
  int flag;
  unsigned int len;
  int wlen;
  NGRAM_TUPLE_INFO *t;

  reset_wrt_counter();

  /* write initial header */
  if (write_header(fp, headerstr) == FALSE) return FALSE;

  /* swap not needed any more */
  need_swap = FALSE;

  /* write some header info */
  wrt(fp, &(ndata->n), sizeof(int), 1);
  wrt(fp, &(ndata->dir), sizeof(int), 1);
  wrt(fp, &(ndata->bigram_index_reversed), sizeof(boolean), 1);

  /* write total info */
  for (n = 0; n < ndata->n; n++) {
    wrt(fp, &(ndata->d[n].totalnum), sizeof(NNID), 1);
  }

  /* unk_*, isopen, max_word_num are set after read, so need not save */

  /* write wname: total length first, then the names themselves */
  wlen = 0;
  for (i = 0; i < ndata->max_word_num; i++) {
    wlen += strlen(ndata->wname[i]) + 1;
  }
  wrt(fp, &wlen, sizeof(int), 1);
  for (i = 0; i < ndata->max_word_num; i++) {
    wrt(fp, ndata->wname[i], 1, strlen(ndata->wname[i]) + 1); /* include \0 */
  }

  /* write N-gram */
  for (n = 0; n < ndata->n; n++) {
    t = &(ndata->d[n]);

    wrt(fp, &(t->is24bit), sizeof(boolean), 1);
    wrt(fp, &(t->ct_compaction), sizeof(boolean), 1);
    wrt(fp, &(t->bgnlistlen), sizeof(NNID), 1);
    wrt(fp, &(t->context_num), sizeof(NNID), 1);

    /* index tables exist only for 2-gram and above */
    if (n > 0) {
      if (t->is24bit) {
        wrt(fp, t->bgn_upper, sizeof(NNID_UPPER), t->bgnlistlen);
        wrt(fp, t->bgn_lower, sizeof(NNID_LOWER), t->bgnlistlen);
      } else {
        wrt(fp, t->bgn, sizeof(NNID), t->bgnlistlen);
      }
      wrt(fp, t->num, sizeof(WORD_ID), t->bgnlistlen);
      wrt(fp, t->nnid2wid, sizeof(WORD_ID), t->totalnum);
    }

    wrt(fp, t->prob, sizeof(LOGPROB), t->totalnum);

    /* back-off weights (optional) */
    flag = (t->bo_wt) ? 1 : 0;
    wrt(fp, &flag, sizeof(int), 1);
    if (flag) {
      wrt(fp, t->bo_wt, sizeof(LOGPROB), t->context_num);
    }

    /* tuple id -> context id mapping (optional) */
    flag = (t->nnid2ctid_upper) ? 1 : 0;
    wrt(fp, &flag, sizeof(int), 1);
    if (flag) {
      wrt(fp, t->nnid2ctid_upper, sizeof(NNID_UPPER), t->totalnum);
      wrt(fp, t->nnid2ctid_lower, sizeof(NNID_LOWER), t->totalnum);
    }
  }

  /* write additional LR 2-gram */
  flag = (ndata->bo_wt_1) ? 1 : 0;
  wrt(fp, &flag, sizeof(int), 1);
  if (flag) {
    wrt(fp, ndata->bo_wt_1, sizeof(LOGPROB), ndata->d[0].context_num);
  }

  flag = (ndata->p_2) ? 1 : 0;
  wrt(fp, &flag, sizeof(int), 1);
  if (flag) {
    wrt(fp, ndata->p_2, sizeof(LOGPROB), ndata->d[1].totalnum);
  }

  len = get_wrt_counter();
  /* "%lu" requires an unsigned long argument, so convert explicitly */
  jlog("Stat: ngram_write_bin: wrote %lu bytes (%.1f MB)\n", (unsigned long)len, len / 1048576.0);

  return TRUE;
}