/** * @file ngram_access.c * * * @brief 単語列・クラス列の N-gram 確率を求める * * * * * @brief Get N-gram probability of a word/class sequence. * * * @author Akinobu LEE * @date Wed Feb 16 07:46:18 2005 * * $Revision: 1.7 $ * */ /* * Copyright (c) 1991-2012 Kawahara Lab., Kyoto University * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology * Copyright (c) 2005-2012 Julius project team, Nagoya Institute of Technology * All rights reserved */ #include #include #undef ADEBUG /** * Search for n-gram tuple. * * @param ndata [in] word/class N-gram * @param n [in] N of N-gram * @param nid_prev [in] context (N-1)-gram tuple ID * @param wkey [in] the target word ID * * @return corresponding index to the 2-gram data part if found, or * NNID_INVALID if the tuple does not exist in 2-gram. */ static NNID search_ngram_core(NGRAM_INFO *ndata, int n, NNID nid_prev, WORD_ID wkey) { NGRAM_TUPLE_INFO *t, *tprev; NNID nnid; NNID left,right,mid; NNID x; if (ndata->bigram_index_reversed && n == 2) { /* old binary format builds 1gram->2gram mapping using LR 2-gram, although the main model is RL 3-gram. This hacks this problem */ x = nid_prev; nid_prev = wkey; wkey = x; } t = &(ndata->d[n-1]); tprev = &(ndata->d[n-2]); if (tprev->ct_compaction) { nnid = tprev->nnid2ctid_upper[nid_prev]; if (nnid == NNID_INVALID_UPPER) return (NNID_INVALID); nnid = (nnid << 16) + (NNID)(tprev->nnid2ctid_lower[nid_prev]); } else { nnid = nid_prev; } if (t->is24bit) { left = t->bgn_upper[nnid]; if (left == NNID_INVALID_UPPER) return (NNID_INVALID); left = (left << 16) + (NNID)(t->bgn_lower[nnid]); } else { left = t->bgn[nnid]; if (left == NNID_INVALID) return (NNID_INVALID); } right = left + t->num[nnid] - 1; while(left < right) { mid = (left + right) / 2; if (t->nnid2wid[mid] < wkey) { left = mid + 1; } else { right = mid; } } if (t->nnid2wid[left] == wkey) { return (left); } else { return (NNID_INVALID); } } /** * Search for N-tuples. * * @param ndata [in] word/class N-gram * @param n [in] N of N-gram (= number of words in @a w) * @param w [in] word sequence * * @return */ NNID search_ngram(NGRAM_INFO *ndata, int n, WORD_ID *w) { int i; NNID prev, next; if (n == 1) { /* wid = nnid in 1-gram */ return(w[0]); } prev = w[0]; for(i=2;i<=n;i++) { next = search_ngram_core(ndata, i, prev, w[i-1]); if (next == NNID_INVALID) { return NNID_INVALID; } prev = next; } return(next); } /** * Get N-gram probability of the last word w_n, given context w_1^n-1. * * @param ndata [in] word/class N-gram * @param n [in] N of N-gram (= number of words in @a w) * @param w [in] word sequence * * @return */ LOGPROB ngram_prob(NGRAM_INFO *ndata, int n, WORD_ID *w) { int i; NNID prev, next, bid; LOGPROB p; NGRAM_TUPLE_INFO *t; if (n > ndata->n) { jlog("ERROR: no %d-gram exist (max %d)\n", n, ndata->n); return LOG_ZERO; } #ifdef ADEBUG printf("["); if (n > 1) { for(i=0;iwname[w[i]]); printf("| "); } printf("%s]\n", ndata->wname[w[n-1]]); #endif /* unigram */ if (n == 1) { p = ndata->d[0].prob[w[0]]; if (w[0] == ndata->unk_id) p -= ndata->unk_num_log; #ifdef ADEBUG printf("hit: %f\n", p); #endif return(p); } /* parse for ngram to reach the N-gram tuple */ prev = w[0]; for(i=2;i<=n;i++) { next = search_ngram_core(ndata, i, prev, w[i-1]); if (next == NNID_INVALID) break; prev = next; } if (next == NNID_INVALID) { /* not reached */ /* both back-off or fallback uses (n-1) gram of the target word */ /* recursive call to get the fallback likelihood */ #ifdef ADEBUG printf("--(not found)->\n"); #endif p = ngram_prob(ndata, n-1, &(w[1])); if (i == n) { /* the last parse was terminated at last step */ /* get back-off weight on prev */ t = &(ndata->d[i-2]); if (t->ct_compaction) { if ((bid = t->nnid2ctid_upper[prev]) == NNID_INVALID_UPPER) { /* in case back-off entry not found, it means bo_wt == 0.0 */ #ifdef ADEBUG printf("fall: %f\n", p); #endif return(p); } else { bid = (bid << 16) + (NNID)(t->nnid2ctid_lower[prev]); } } else { bid = prev; } /* return back-off likelihood */ #ifdef ADEBUG printf("back: %f + %f\n", t->bo_wt[bid], p); #endif return(t->bo_wt[bid] + p); } else { /* previous context not found, fallback to (n-1)-gram */ return(p); } } /* n-gram found */ /* trigram exist */ p = ndata->d[n-1].prob[next]; if (w[n-1] == ndata->unk_id) p -= ndata->unk_num_log; #ifdef ADEBUG printf("hit: %f\n", p); #endif return(p); } /* ---------------------------------------------------------------------- */ /* separate access functions for the 1st pass */ /** * Get 1-gram probability of @f$w@f$ in log10. * * @param ndata [in] word/class N-gram * @param w [in] word/class ID in N-gram * * @return log10 probability @f$\log p(w)@f$. */ LOGPROB uni_prob(NGRAM_INFO *ndata, WORD_ID w) { if (w != ndata->unk_id) { return(ndata->d[0].prob[w]); } else { return(ndata->d[0].prob[w] - ndata->unk_num_log); } } /** * Find bi-gram entry. Assumes ct_compaction and is24bit is FALSE on 2-gram * * @param ndata [in] N-gram data that holds the 2-gram * @param w_context [in] context word ID * @param w [in] target word ID * * @return the ID of N-gram tuple entry ID where the (w_context, w) exists. * */ static NNID search_bigram(NGRAM_INFO *ndata, WORD_ID w_context, WORD_ID w) { /* do binary search to find bigram entry */ /* assume ct_compaction and is24bit is FALSE on 2-gram */ NNID left,right,mid; /* n2 */ NGRAM_TUPLE_INFO *t; t = &(ndata->d[1]); if ((left = t->bgn[w_context]) == NNID_INVALID) /* has no bigram */ return (NNID_INVALID); right = left + t->num[w_context] - 1; while(left < right) { mid = (left + right) / 2; if (t->nnid2wid[mid] < w) { left = mid + 1; } else { right = mid; } } if (t->nnid2wid[left] == w) { return (left); } else { return (NNID_INVALID); } } /** * Get LR bi-gram prob: for LR N-gram * * @param ndata [in] N-gram data that holds the 2-gram * @param w1 [in] left context word * @param w2 [in] right target word * * @return the log N-gram probability P(w2|w1) * */ static LOGPROB bi_prob_normal(NGRAM_INFO *ndata, WORD_ID w1, WORD_ID w2) { NNID n2; LOGPROB prob; /* index is LR */ /* prob is in main N-gram area */ if ((n2 = search_bigram(ndata, w1, w2)) != NNID_INVALID) { prob = ndata->d[1].prob[n2]; } else { prob = ndata->d[0].bo_wt[w1] + ndata->d[0].prob[w2]; } if (w2 != ndata->unk_id) { return(prob); } else { return(prob - ndata->unk_num_log); } } /** * Get LR bi-gram prob: for RL N-gram with additional LR 2-gram, in * old bingram format. The old format has 2-gram index in reversed * orfer, so this function is for old bingram formats. * * @param ndata [in] N-gram data that holds the 2-gram * @param w1 [in] left context word * @param w2 [in] right target word * * @return the log N-gram probability P(w2|w1) * */ static LOGPROB bi_prob_additional_oldbin(NGRAM_INFO *ndata, WORD_ID w1, WORD_ID w2) { NNID n2; LOGPROB prob; /* index is LR */ /* prob is in additional N-gram area */ if ((n2 = search_bigram(ndata, w1, w2)) != NNID_INVALID) { prob = ndata->p_2[n2]; } else { prob = ndata->bo_wt_1[w1] + ndata->d[0].prob[w2]; } if (w2 != ndata->unk_id) { return(prob); } else { return(prob - ndata->unk_num_log); } } /** * Get LR bi-gram prob: for RL N-gram with additional LR 2-gram. * * @param ndata [in] N-gram data that holds the 2-gram * @param w1 [in] left context word * @param w2 [in] right target word * * @return the log N-gram probability P(w2|w1) * */ static LOGPROB bi_prob_additional(NGRAM_INFO *ndata, WORD_ID w1, WORD_ID w2) { NNID n2; LOGPROB prob; /* index is RL */ /* prob is in additional N-gram area */ if ((n2 = search_bigram(ndata, w2, w1)) != NNID_INVALID) { prob = ndata->p_2[n2]; } else { prob = ndata->bo_wt_1[w1] + ndata->d[0].prob[w2]; } if (w2 != ndata->unk_id) { return(prob); } else { return(prob - ndata->unk_num_log); } } /** * Get LR bi-gram prob: for RL N-gram with no LR 2-gram. * This function will compute the LR 2-gram from the RL 2-gram. * * @param ndata [in] N-gram data that holds the 2-gram * @param w1 [in] left context word * @param w2 [in] right target word * * @return the log N-gram probability P(w2|w1) * */ static LOGPROB bi_prob_compute(NGRAM_INFO *ndata, WORD_ID w1, WORD_ID w2) { NNID n2; LOGPROB prob; /* index is RL */ /* no additional N-gram, compute it directly */ /* get p(w1|w2) */ if ((n2 = search_bigram(ndata, w2, w1)) != NNID_INVALID) { prob = ndata->d[1].prob[n2]; } else { prob = ndata->d[0].bo_wt[w2] + ndata->d[0].prob[w1]; } /* p(w2|w1) = p(w1|w2) * p(w2) / p(w1) */ prob = prob + ndata->d[0].prob[w2] - ndata->d[0].prob[w1]; if (w2 != ndata->unk_id) { return(prob); } else { return(prob - ndata->unk_num_log); } } /** * Get 2-gram probability * This function is not used in Julius, since each function of bi_prob_* * will be called directly from the search. * * @param ndata [in] N-gram data that holds the 2-gram * @param w1 [in] left context word * @param w2 [in] right target word * * @return the log N-gram probability P(w2|w1) * */ LOGPROB bi_prob(NGRAM_INFO *ndata, WORD_ID w1, WORD_ID w2) { LOGPROB p; if (ndata->bigram_index_reversed) { /* old binary format */ /* RL 3-gram with additional LR 2-gram, index by LR */ /* indexes are LR (swap not needed), probs are in additional area */ p = bi_prob_additional_oldbin(ndata, w1, w2); } else if (ndata->dir == DIR_LR) { /* LR 3-gram, index by LR */ p = bi_prob_normal(ndata, w1, w2); } else if (ndata->bo_wt_1 != NULL) { /* RL 3-gram with additional LR 2-gram, index by RL */ p = bi_prob_additional(ndata, w1, w2); } else { /* RL 3-gram only, index by RL */ p = bi_prob_compute(ndata, w1, w2); } return p; } /** * Determinte which bi-gram computation function to be used according to * the N-gram type, and set pointer to the proper function into the * N-gram data. * * @param ndata [i/o] N-gram information to use * */ void bi_prob_func_set(NGRAM_INFO *ndata) { if (ndata->bigram_index_reversed) { /* old binary format */ /* RL 3-gram with additional LR 2-gram, index by LR */ /* indexes are LR (swap not needed), probs are in additional area */ ndata->bigram_prob = bi_prob_additional_oldbin; } else if (ndata->dir == DIR_LR) { /* LR 3-gram, index by LR */ ndata->bigram_prob = bi_prob_normal; } else if (ndata->bo_wt_1 != NULL) { /* RL 3-gram with additional LR 2-gram, index by RL */ ndata->bigram_prob = bi_prob_additional; } else { /* RL 3-gram only, index by RL */ ndata->bigram_prob = bi_prob_compute; } }