/** * @file ngram_write_bin.c * * * @brief N-gramをバイナリ形式でファイルに書き出す * * rev.3.5 より,読み込みの高速性を考慮して書き出しのバイトオーダーを * Big endian 固定からマシン依存に変更された.またインデックスの 24bit 化 * および 2-gram のバックオフデータの圧縮も行うなど,ファイル形式の * 内部仕様が一部変更された.これにより,3.5 以降の mkbingram で * で生成したバイナリN-gramは, 3.4.2以前の Julius では使えない. * (ヘッダチェックでエラーとなる) * * なお 3.5 以降の Julius では従来のモデルも問題なく読める.この場合, * インデックスの 24bit 化とバックオフの圧縮はモデル読み込み時に * その都度行われる. * * バイトオーダーに関してヘッダに記述することで,読み込み時に判定して * 読み込む.これにより,異なるバイトオーダーのマシンで生成した * バイナリN-gramでも問題なく読める.もちろん従来のモデルもそのまま * 読み込める. * * * * @brief Write a whole N-gram data to a file in binary format * * From 3.5, internal format of binary N-gram has changed for using * machine-dependent natural byte order (previously fixed to big endian), * 24bit index and 2-gram backoff compression. So, binary N-gram * generated by mkbingram of 3.5 and later will not work on 3.4.2 and * earlier versions. * * There is full upward- and cross-machine compatibility in 3.5. Old * binary N-gram files still can be read directly, in which case the conversion * to 24bit index will performed just after model has been read. * Byte order will also considered by header information, so * binary N-gram still can be used among different machines. * * * @author Akinobu LEE * @date Wed Feb 16 17:23:16 2005 * * $Revision: 1.6 $ * */ /* * Copyright (c) 1991-2012 Kawahara Lab., Kyoto University * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology * Copyright (c) 2005-2012 Julius project team, Nagoya Institute of Technology * All rights reserved */ #include #include static boolean need_swap; ///< TRUE if need byte swap #define wrt(A,B,C,D) if (wrtfunc(A,B,C,D) == FALSE) return FALSE static unsigned int count; void reset_wrt_counter() { count = 0; } static unsigned int get_wrt_counter() { return count; } /** * Binary write function, with byte swapping if needed. * * @param fp [in] file pointer * @param buf [in] data buffer to write * @param unitbyte [in] unit size in bytes * @param unitnum [in] number of unit to write */ static boolean wrtfunc(FILE *fp, void *buf, size_t unitbyte, size_t unitnum) { if (need_swap == TRUE && unitbyte != 1) { swap_bytes((char *)buf, unitbyte, unitnum); } if (myfwrite(buf, unitbyte, unitnum, fp) < unitnum) { jlog("Error: write_ngram_bin: failed to write %d bytes", unitbyte*unitnum); return FALSE; } if (need_swap == TRUE && unitbyte != 1) { swap_bytes((char *)buf, unitbyte, unitnum); } count += unitbyte * unitnum; return TRUE; } /** * Write header information, with identifier string. * * @param fp [in] file pointer * @param str [in] user header string (any string within BINGRAM_HDSIZE * bytes is allowed) * @param version [in] file format version id */ static boolean write_header(FILE *fp, char *str) { char buf[BINGRAM_HDSIZE]; int i, totallen; for(i=0;i= BINGRAM_HDSIZE) { jlog("Warning: write_bingram: header too long, last will be truncated\n"); i = strlen(str) - (totallen - BINGRAM_HDSIZE); str[i] = '\0'; } sprintf(buf, "%s\n%s%s %s%s\n%s", BINGRAM_IDSTR_V5, BINGRAM_SIZESTR_HEAD, BINGRAM_SIZESTR_BODY, BINGRAM_BYTEORDER_HEAD, BINGRAM_NATURAL_BYTEORDER, str); wrt(fp, buf, 1, BINGRAM_HDSIZE); return TRUE; } /** * Write a whole N-gram data in binary format. * * @param fp [in] file pointer * @param ndata [in] N-gram data to write * @param headerstr [in] user header string * * @return TRUE on success, FALSE on failure */ boolean ngram_write_bin(FILE *fp, NGRAM_INFO *ndata, char *headerstr) { int i,n; unsigned int len; int wlen; NGRAM_TUPLE_INFO *t; reset_wrt_counter(); /* write initial header */ if (write_header(fp, headerstr) == FALSE) return FALSE; /* swap not needed any more */ need_swap = FALSE; /* write some header info */ wrt(fp, &(ndata->n), sizeof(int), 1); wrt(fp, &(ndata->dir), sizeof(int), 1); wrt(fp, &(ndata->bigram_index_reversed), sizeof(boolean), 1); /* write total info */ for(n=0;nn;n++) { wrt(fp, &(ndata->d[n].totalnum), sizeof(NNID), 1); /*jlog("ngram %d=%d\n",n+1,ndata->ngram_num[n]);*/ } /* unk_*, isopen, max_word_num are set after read, so need not save */ /* write wname */ wlen = 0; for(i=0;imax_word_num;i++) { wlen += strlen(ndata->wname[i]) + 1; } wrt(fp, &wlen, sizeof(int), 1); for(i=0;imax_word_num;i++) { wrt(fp, ndata->wname[i], 1, strlen(ndata->wname[i]) + 1); /* include \0 */ } /* write N-gram */ for(n=0;nn;n++) { t = &(ndata->d[n]); wrt(fp, &(t->is24bit), sizeof(boolean), 1); wrt(fp, &(t->ct_compaction), sizeof(boolean), 1); wrt(fp, &(t->bgnlistlen), sizeof(NNID), 1); wrt(fp, &(t->context_num), sizeof(NNID), 1); if (n > 0) { if (t->is24bit) { wrt(fp, t->bgn_upper, sizeof(NNID_UPPER), t->bgnlistlen); wrt(fp, t->bgn_lower, sizeof(NNID_LOWER), t->bgnlistlen); } else { wrt(fp, t->bgn, sizeof(NNID), t->bgnlistlen); } wrt(fp, t->num, sizeof(WORD_ID), t->bgnlistlen); wrt(fp, t->nnid2wid, sizeof(WORD_ID), t->totalnum); } wrt(fp, t->prob, sizeof(LOGPROB), t->totalnum); if (t->bo_wt) { i = 1; wrt(fp, &i, sizeof(int), 1); wrt(fp, t->bo_wt, sizeof(LOGPROB), t->context_num); } else { i = 0; wrt(fp, &i, sizeof(int), 1); } if (t->nnid2ctid_upper) { i = 1; wrt(fp, &i, sizeof(int), 1); wrt(fp, t->nnid2ctid_upper, sizeof(NNID_UPPER), t->totalnum); wrt(fp, t->nnid2ctid_lower, sizeof(NNID_LOWER), t->totalnum); } else { i = 0; wrt(fp, &i, sizeof(int), 1); } } /* write additional LR 2-gram */ if (ndata->bo_wt_1) { i = 1; wrt(fp, &i, sizeof(int), 1); wrt(fp, ndata->bo_wt_1, sizeof(LOGPROB), ndata->d[0].context_num); } else { i = 0; wrt(fp, &i, sizeof(int), 1); } if (ndata->p_2) { i = 1; wrt(fp, &i, sizeof(int), 1); wrt(fp, ndata->p_2, sizeof(LOGPROB), ndata->d[1].totalnum); } else { i = 0; wrt(fp, &i, sizeof(int), 1); } len = get_wrt_counter(); jlog("Stat: ngram_write_bin: wrote %lu bytes (%.1f MB)\n", len, len / 1048576.0); return TRUE; }