/** * @file mkbingram.c * * * @brief ARPA標準形式のN-gramから Julius 用のバイナリN-gramに変換する. * * Julius で使える ARPA 標準形式の (前向き)2-gram と 後ろ向き * 3-gram を,単一のバイナリN-gramに変換する. * * バイナリN-gramの形式はまた古い形式(3.4.2以前) * のバイナリN-gramを 3.5 以降の新しい形式に変換することもできる. * * * * @brief * * * @author Akinobu LEE * @date Thu Mar 24 12:22:27 2005 * * $Revision: 1.6 $ * */ /* * Copyright (c) 1991-2012 Kawahara Lab., Kyoto University * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology * Copyright (c) 2005-2012 Julius project team, Nagoya Institute of Technology * All rights reserved */ /* mkbingram --- make binary n-gram for JULIUS from ARPA standard format */ /* $Id: mkbingram.c,v 1.6 2012/08/11 10:44:02 sumomo Exp $ */ #include #include #include #include #include "charconv.h" static NGRAM_INFO *ngram; void usage(char *s) { printf("mkbingram: convert ARPA format N-gram to binary format for Julius\n"); printf("\nUsage: %s [options...] outfile\n", s); printf("\n options:\n"); printf(" -nlr file forward N-gram in ARPA format\n"); printf(" -nrl file backward N-gram in ARPA format\n"); printf(" -d bingramfile Julius binary N-gram file input\n"); printf(" -c from to convert character code\n"); printf(" -swap swap \"%s\" and \"%s\"\n", BEGIN_WORD_DEFAULT, END_WORD_DEFAULT); printf("\n When both \"-nlr\" and \"-nrl\" are specified, \n"); printf(" Julius will use the BACKWARD N-gram as main LM\n"); printf(" and use the forward 2-gram only at the 1st pass\n"); printf("\nLibrary configuration: "); confout_version(stdout); confout_lm(stdout); printf("\n"); } int main(int argc, char *argv[]) { FILE *fp; char header[512]; time_t now; char *binfile, *lrfile, *rlfile, *outfile; int i; char *from_code, *to_code, *buf; boolean charconv_enabled = FALSE; boolean force_swap = FALSE; WORD_ID w; binfile = lrfile = rlfile = outfile = NULL; from_code = to_code = NULL; if (argc <= 1) { usage(argv[0]); return -1; } for(i=1;i= argc) { printf("Error: no argument for option \"%s\"\n", argv[i]); usage(argv[0]); return -1; } binfile = argv[i]; } else if (argv[i][1] == 'n') { switch(argv[i][2]) { case 'l': if (++i >= argc) { printf("Error: no argument for option \"%s\"\n", argv[i]); usage(argv[0]); return -1; } lrfile = argv[i]; break; case 'r': if (++i >= argc) { printf("Error: no argument for option \"%s\"\n", argv[i]); usage(argv[0]); return -1; } rlfile = argv[i]; break; default: printf("Error: no such option \"%s\"\n", argv[i]); usage(argv[0]); return -1; } } else if (argv[i][1] == 'c') { if (++i >= argc) { printf("Error: no argument for option \"%s\"\n", argv[i]); usage(argv[0]); return -1; } from_code = strcpy((char*)mymalloc(strlen(argv[i])+1), argv[i]); if (++i >= argc) { printf("Error: no argument for option \"%s\"\n", argv[i]); usage(argv[0]); free(from_code); return -1; } to_code = strcpy((char*)mymalloc(strlen(argv[i])+1),argv[i]); charconv_enabled = TRUE; } else if (argv[i][1] == 's') { force_swap = TRUE; } } else { if (outfile == NULL) { outfile = argv[i]; } else { printf("Error: more than one output file\n"); usage(argv[0]); return -1; } } } if (!outfile) { printf("Error: no output file specified\n"); usage(argv[0]); return -1; } if (binfile) { if (lrfile || rlfile) { printf("Error: both binary file and ARPA file are specified\n"); usage(argv[0]); return -1; } printf("bingram: %s\n", binfile); } else { if (rlfile) { printf("backward n-gram: %s\n", rlfile); if (lrfile) { printf("additional forward 2-gram for 1st pass: %s\n", lrfile); } } else if (lrfile) { printf("forward n-gram: %s\n", lrfile); } else { printf("Error: no input N-gram file specified\n"); usage(argv[0]); return -1; } } printf("\nSTART LOADING\n\n"); /* make header string */ now = time(NULL); if (binfile) { sprintf(header, "converted at %s\nfrom bingram = %s\n", ctime(&now), binfile); } else { if (rlfile && lrfile) { sprintf(header, "converted at %s\nfrom n-gram = %s, LR 2-gram = %s\n", ctime(&now), rlfile, lrfile); } else if (rlfile) { sprintf(header, "converted at %s\nfrom n-gram = %s\n", ctime(&now), rlfile); } else { sprintf(header, "converted at %s\nfrom n-gram = %s\n", ctime(&now), lrfile); } } ngram = ngram_info_new(); if (binfile) { /* read in bingram */ if (init_ngram_bin(ngram, binfile) == FALSE) return -1; } else { /* read in ARPA n-gram */ if (force_swap) { ngram->bos_eos_swap = TRUE; } if (rlfile) { if (init_ngram_arpa(ngram, rlfile, DIR_RL) == FALSE) return -1; if (lrfile) { if (init_ngram_arpa_additional(ngram, lrfile) == FALSE) return -1; } } else if (lrfile) { if (init_ngram_arpa(ngram, lrfile, DIR_LR) == FALSE) return -1; } } print_ngram_info(stdout, ngram); if (charconv_enabled == TRUE) { /* do character conversion */ if (charconv_setup(from_code, to_code) == -1) { fprintf(stderr, "failed to setup character convertsion\n"); return -1; } buf = (char *)mymalloc(4096); for (w = 0; w < ngram->max_word_num; w++) { charconv(ngram->wname[w], buf, 4096); ngram->wname[w] = mybmalloc2(strlen(buf)+1, &(ngram->mroot)); strcpy(ngram->wname[w], buf); } free(buf); } /* write in JULIUS binary format */ if ((fp = fopen_writefile(outfile)) == NULL) { fprintf(stderr, "failed to open \"%s\"\n", outfile); return -1; } printf("\nWriting in v5 format to \"%s\"...\n", outfile); if (ngram_write_bin(fp, ngram, header) == FALSE){/* failed */ fprintf(stderr, "failed to write \"%s\"\n",outfile); return -1; } fclose_writefile(fp); printf("completed\n"); return 0; }