/**
* @file ngram2.h
*
*
* @brief Definitions for word N-gram
*
* This file defines a structure for word N-gram language model.
 * Julius now supports N-gram of arbitrary N.
*
 * Both directions of N-gram are supported: forward (left-to-right)
 * and backward (right-to-left).  Since the final recognition
 * process is done in the right-to-left direction, using a backward
 * N-gram is recommended.
*
* A forward 2-gram is necessary for the 1st recognition pass. If a
* forward N-gram is specified, Julius simply use its 2-gram part for
* the 1st pass. If only backward N-gram is specified, Julius calculate
* the forward probability from the defined backward N-gram by the
* equation "P(w_2|w_1) = P(w_1|w_2) * P(w_2) / P(w_1)." If both
* forward N-gram and backward N-gram are specified, Julius uses the
* 2-gram part of the forward n-gram at the 1st pass, and use the
* backward N-gram at the 2nd pass as the main LM. Note that the last
* behavior is the same as previous versions (<=3.5.x)
*
* ARPA standard format and Julius binary format is supported. The
* binary format can be loaded much faster at startup, so it is
* recommended to use binary format by converting from ARPA format
* N-gram beforehand. All combination of N-gram (forward only,
* backward only, forward 2-gram + backward N-gram) is supported.
*
* @sa mkbingram
*
* For memory efficiency of holding the huge word N-gram on memory, Julius
* merges the two language model into one structure. So the forward bigram and
* reverse trigram should meet the following requirements:
*
* - their vocabularies should be the same.
* - their unigram probabilities of each word should be the same.
* - the same bigram tuple sets are defined.
* - the bigram tuples for context word sequences of existing trigram
* tuples should exist in both.
*
 * The first three requirements can be fulfilled easily if you train the
* forward bigram and reverse trigram on the same training text.
* The last condition can be qualified if you set a cut-off value of trigram
* which is larger or equal to that of bigram. These conditions are checked
* when Julius or mkbingram reads in the ARPA models, and output error if
* not cleared.
*
* From 3.5, tuple ID on 2-gram changed from 32bit to 24bit, and 2-gram
* back-off weights will not be saved if the corresponding 3-gram is empty.
* They will be performed when reading N-gram to reduce memory size.
*
*
* @brief 単語N-gram言語モデルの定義
*
* このファイルには単語N-gram言語モデルを格納するための構造体定義が
* 含まれています.Julius はN-gramにおいて任意の N をサポートしました.
*
* 通常の前向き (left-to-right) と後向き (right-to-left) の N-gram が
* サポートされています.認識の最終パス(第2パス)は後向きに行われるので,
* 後向き N-gram を使用することを推奨します.
*
* 第1パスの実行には前向き2-gramが必要です.前向き N-gram のみが
* 与えられた場合,Julius はその2-gramの部分を使います.
* 後向きN-gramのみが与えられた場合,Julius は
* 式 "P(w_2|w_1) = P(w_1|w_2) * P(w_2) / P(w_1)" にしたがって
* 前向き2-gramを推定します.前向きと後向きの両方指定された場合は,
* 前向きN-gramの2-gram部分が第1パスで用いられ,第2パスでは後向きN-gram
* が使われます.この両方指定したときの挙動は以前のバージョン (<=3.5.3)
* と同じです.
*
* 入力ファイル形式は
* ARPA形式とJulius独自のバイナリ形式の2つをサポートしています.
* 読み込みは後者のほうが高速です.前向き,後向き,両方の
* 全てのパターンに対応しています.
*
* @sa mkbingram
*
* NGRAM_INFO ではメモリ量節約のため,これらを一つの構造体で表現しています.
* このことから,Julius は使用するこれら2つの言語モデルが
* 以下を満たすことを要求します.
*
* - 語彙が同一であること
* - 各語彙の1-gram確率が同一であること
* - 同じ 2-gram tuple 集合が定義されていること
* - 3-gram のコンテキストである単語組の2-gramが定義されていること
*
* 上記の前提のほとんどは,これらの2つのN-gramを同一のコーパスから
* 学習することで満たされます.最後の条件については,3-gram のカットオフ
* 値に 2-gram のカットオフ値と同値かそれ以上の値を指定すればOKです.
* 与えられたN-gramが上記を満たさない場合,Julius はエラーを出します.
*
*
* @author Akinobu LEE
* @date Fri Feb 11 15:04:02 2005
*
* $Revision: 1.12 $
*
*/
/*
* Copyright (c) 1991-2012 Kawahara Lab., Kyoto University
* Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology
* Copyright (c) 2005-2012 Julius project team, Nagoya Institute of Technology
* All rights reserved
*/
#ifndef __SENT_NGRAM2_H__
#define __SENT_NGRAM2_H__
#include <sent/stddefs.h>
#include <sent/ptree.h>
typedef unsigned int NNID; ///< Type definition for N-gram entry ID (full)
#define NNID_INVALID 0xffffffff ///< Value to indicate no id (full)
#define NNID_MAX 0xfffffffe ///< Value of maximum value (full)
typedef unsigned char NNID_UPPER; ///< N-gram entry ID (24bit: upper bit)
typedef unsigned short NNID_LOWER; ///< N-gram entry ID (24bit: lower bit)
#define NNID_INVALID_UPPER 255 ///< Value to indicate no id at NNID_UPPER (24bit)
#define NNID_MAX_24 16711679 ///< Allowed maximum number of id (255*65536-1) (24bit)
/// Default word string of beginning-of-sentence word
#define BEGIN_WORD_DEFAULT "<s>"
/// Default word string of end-of-sentence word
#define END_WORD_DEFAULT "</s>"
/// Default word string of unknown word for open vocabulary
#define UNK_WORD_DEFAULT "<unk>"
/// Alternative default word string of unknown word (upper-case variant)
#define UNK_WORD_DEFAULT2 "<UNK>"
/// Maximum length of unknown word string
#define UNK_WORD_MAXLEN 30
/**
 * N-gram entries for a m-gram (1 <= m <= N).
 *
 * Holds all tuples of one order m as flat arrays, grouped by their
 * (m-1)-gram context and referenced by a beginning index and count.
 * Tuple IDs are stored either as 32bit (@a bgn) or as split 24bit
 * (@a bgn_upper / @a bgn_lower) indexes, selected by @a is24bit.
 */
typedef struct {
NNID totalnum; ///< Number of defined tuples
boolean is24bit; ///< TRUE if this m-gram uses 24bit index for tuples instead of 32bit
NNID bgnlistlen; ///< Length of bgn and num, should be the same as @a context_num of (m-1)-gram
NNID_UPPER *bgn_upper; ///< Beginning ID of a tuple set whose context is the (m-1) tuple for 24bit mode (upper 8bit)
NNID_LOWER *bgn_lower; ///< Beginning ID of a tuple set whose context is the (m-1) tuple for 24bit mode (lower 16bit)
NNID *bgn; ///< Beginning ID of a tuple set whose context is the (m-1) tuple for 32bit mode
WORD_ID *num; ///< Size of a tuple set whose context is the (m-1) tuple
WORD_ID *nnid2wid; ///< List of Word IDs of edge word of the tuple
LOGPROB *prob; ///< Log probabilities of edge word of the tuple
NNID context_num; ///< Number of tuples to be a context of (m+1)-gram (= number of defined back-off weights)
LOGPROB *bo_wt; ///< Back-off weights for (m+1)-gram, the length is @a context_num if @a ct_compaction is TRUE, or @a totalnum if FALSE.
boolean ct_compaction; ///< TRUE if use compacted index for back-off contexts
NNID_UPPER *nnid2ctid_upper; ///< Index to map tuple ID of this m-gram to valid context id (upper 8bit)
NNID_LOWER *nnid2ctid_lower; ///< Index to map tuple ID of this m-gram to valid context id (lower 16bit)
} NGRAM_TUPLE_INFO;
/**
 * @brief Main N-gram structure
 *
 * bigrams and trigrams are stored in the form of sequential lists.
 * They are grouped by the same context, and referred from the
 * context ((N-1)-gram) data by the beginning ID and its number.
 *
 * @a d holds one NGRAM_TUPLE_INFO per order (1-gram ... N-gram).
 */
typedef struct __ngram_info__ {
int n; ///< N-gram order (ex. 3 for 3-gram)
int dir; ///< direction (either DIR_LR or DIR_RL)
boolean from_bin; ///< TRUE if source was bingram, otherwise ARPA
boolean bigram_index_reversed; ///< TRUE if read from old (<=3.5.3) bingram, in which case the 2-gram tuple index is reversed (DIR_LR) against the RL 3-gram.
boolean bos_eos_swap; ///< TRUE if swap BOS and SOS on backward N-gram
WORD_ID max_word_num; ///< N-gram vocabulary size
char **wname; ///< List of word strings.
PATNODE *root; ///< Root of index tree to search n-gram word ID from its name
WORD_ID unk_id; ///< Word ID of unknown word.
int unk_num; ///< Number of dictionary words that are not in this N-gram vocabulary
LOGPROB unk_num_log; ///< Log10 value of @a unk_num, used for calculating probability of unknown words
boolean isopen; ///< TRUE if dictionary has unknown words, which does not appear in this N-gram
NGRAM_TUPLE_INFO *d; ///< Main body of N-gram info (array of length @a n, one entry per order)
/* for pass1 */
LOGPROB *bo_wt_1; ///< back-off weights for 2-gram on 1st pass
LOGPROB *p_2; ///< 2-gram prob for the 1st pass
LOGPROB (*bigram_prob)(struct __ngram_info__ *, WORD_ID, WORD_ID); ///< Pointer of a function to compute bigram probability on the 1st pass. See bi_prob_func_set() for details
BMALLOC_BASE *mroot; ///< Pointer for block memory allocation for lookup index
} NGRAM_INFO;
/* Definitions for binary N-gram */
/// Header string to identify version of bingram (v3: <= rev.3.4.2)
#define BINGRAM_IDSTR "julius_bingram_v3"
/// Header string to identify version of bingram (v4: <= rev.3.5.3)
#define BINGRAM_IDSTR_V4 "julius_bingram_v4"
/// Header string to identify version of bingram (v5: >= rev.4.0)
#define BINGRAM_IDSTR_V5 "julius_bingram_v5"
/// Bingram header size in bytes
#define BINGRAM_HDSIZE 512
/// Bingram header info string to identify the unit byte (head)
#define BINGRAM_SIZESTR_HEAD "word="
/// Bingram header string that indicates 4 bytes unit
#define BINGRAM_SIZESTR_BODY_4BYTE "4byte(int)"
/// Bingram header string that indicates 2 bytes unit
#define BINGRAM_SIZESTR_BODY_2BYTE "2byte(unsigned short)"
/* The word ID unit written in a bingram follows the compile-time
   width of WORD_ID: 4 bytes when WORDS_INT is defined, else 2 bytes. */
#ifdef WORDS_INT
#define BINGRAM_SIZESTR_BODY BINGRAM_SIZESTR_BODY_4BYTE
#else
#define BINGRAM_SIZESTR_BODY BINGRAM_SIZESTR_BODY_2BYTE
#endif
/// Bingram header info string to identify the byte order (head) (v4)
#define BINGRAM_BYTEORDER_HEAD "byteorder="
/// Bingram header info string to identify the byte order (body) (v4):
/// the byte order of the machine that wrote the file
#ifdef WORDS_BIGENDIAN
#define BINGRAM_NATURAL_BYTEORDER "BE"
#else
#define BINGRAM_NATURAL_BYTEORDER "LE"
#endif
#ifdef __cplusplus
extern "C" {
#endif
/* function declaration */
NNID search_ngram(NGRAM_INFO *ndata, int n, WORD_ID *w);
LOGPROB ngram_prob(NGRAM_INFO *ndata, int n, WORD_ID *w);
LOGPROB uni_prob(NGRAM_INFO *ndata, WORD_ID w);
LOGPROB bi_prob(NGRAM_INFO *ndata, WORD_ID w1, WORD_ID w2);
void bi_prob_func_set(NGRAM_INFO *ndata);
boolean ngram_read_arpa(FILE *fp, NGRAM_INFO *ndata, boolean addition);
boolean ngram_read_bin(FILE *fp, NGRAM_INFO *ndata);
boolean ngram_write_bin(FILE *fp, NGRAM_INFO *ndata, char *header_str);
boolean ngram_compact_context(NGRAM_INFO *ndata, int n);
void ngram_make_lookup_tree(NGRAM_INFO *ndata);
WORD_ID ngram_lookup_word(NGRAM_INFO *ndata, char *wordstr);
WORD_ID make_ngram_ref(NGRAM_INFO *, char *);
NGRAM_INFO *ngram_info_new();
void ngram_info_free(NGRAM_INFO *ngram);
boolean init_ngram_bin(NGRAM_INFO *ndata, char *ngram_file);
boolean init_ngram_arpa(NGRAM_INFO *ndata, char *ngram_file, int dir);
boolean init_ngram_arpa_additional(NGRAM_INFO *ndata, char *bigram_file);
void set_default_unknown_id(NGRAM_INFO *ndata);
void set_unknown_id(NGRAM_INFO *ndata, char *str);
void print_ngram_info(FILE *fp, NGRAM_INFO *ndata);
#include
boolean make_voca_ref(NGRAM_INFO *ndata, WORD_INFO *winfo);
void fix_uniprob_srilm(NGRAM_INFO *ndata, WORD_INFO *winfo);
#ifdef __cplusplus
}
#endif
#endif /* __SENT_NGRAM2_H__ */