/*************************************************************************/ /* */ /* Centre for Speech Technology Research */ /* University of Edinburgh, UK */ /* Copyright (c) 1995,1996 */ /* All Rights Reserved. */ /* */ /* Permission is hereby granted, free of charge, to use and distribute */ /* this software and its documentation without restriction, including */ /* without limitation the rights to use, copy, modify, merge, publish, */ /* distribute, sublicense, and/or sell copies of this work, and to */ /* permit persons to whom this work is furnished to do so, subject to */ /* the following conditions: */ /* 1. The code must retain the above copyright notice, this list of */ /* conditions and the following disclaimer. */ /* 2. Any modifications must be clearly marked as such. */ /* 3. Original authors' names are not deleted. */ /* 4. The authors' names are not used to endorse or promote products */ /* derived from this software without specific prior written */ /* permission. */ /* */ /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */ /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */ /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */ /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */ /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */ /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */ /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */ /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */ /* THIS SOFTWARE. */ /* */ /*************************************************************************/ /* Authors: Alan W Black and Simon King */ /* Date : January 1997 */ /*-----------------------------------------------------------------------*/ /* A simple use of the Viterbi decoder */ /* */ /*=======================================================================*/ #include #include #include #include "EST.h" EST_read_status load_TList_of_StrVector(EST_TList &w, const EST_String &filename, const int vec_len); static void print_results(EST_Relation &wstream); static bool do_search(EST_Relation &wstream); static EST_VTPath *vit_npath(EST_VTPath *p,EST_VTCandidate *c,EST_Features &f); static EST_VTCandidate *vit_candlist(EST_Item *s,EST_Features &f); static void top_n_candidates(EST_VTCandidate* &all_c); static void load_vocab(const EST_String &vfile); static void add_word(EST_Relation &w, const EST_String &word, int pos); static void load_wstream(const EST_String &filename, const EST_String &vfile, EST_Relation &w, EST_Track &obs); static void load_given(const EST_String &filename, const int ngram_order); static double find_gram_prob(EST_VTPath *p,int *state); // special stuff for non-sliding window ngrams static double find_extra_gram_prob(EST_VTPath *p,int *state, int time); static void get_history(EST_StrVector &history, EST_VTPath *p); static void fill_window(EST_StrVector &window,EST_StrVector &history, EST_VTPath *p,const int time); static int is_a_special(const EST_String &s, int &val); static int max_history=0; static EST_Ngrammar ngram; static EST_String pstring = SENTENCE_START_MARKER; static EST_String ppstring = SENTENCE_END_MARKER; static float lm_scale = 1.0; static float ob_scale = 1.0; static float ob_scale2 = 1.0; // pruning beam widths static float beam=-1; static float ob_beam=-1; static int n_beam = -1; static bool trace_on = FALSE; // always logs static double ob_log_prob_floor = SAFE_LOG_ZERO; static double ob_log_prob_floor2 = SAFE_LOG_ZERO; static double lm_log_prob_floor = SAFE_LOG_ZERO; int btest_debug = FALSE; static EST_String out_file = ""; static EST_StrList vocab; static EST_Track observations; static EST_Track observations2; static EST_TList given; // to do : convert to array for speed int using_given=FALSE; // default is that obs are already logs int take_logs = FALSE; int num_obs = 1; /** @name viterbi Combine n-gram model and likelihoods to estimate posterior probabilities * @id viterbi-manual * @toc */ //@{ /**@name Synopsis */ //@{ //@synopsis /** viterbi is a simple time-synchronous Viterbi decoder. It finds the most likely sequence of items drawn from a fixed vocabulary, given frame-by-frame observation probabilities for each item in that vocabulary, and a ngram grammar. Possible uses include: Simple speech recogniser back end viterbi can optionally use two sets of frame-by-frame observation probabilities in a weighted-sum fashion. Also, the ngram language model is not restricted to the conventional sliding window type in which the previous n-1 items are the ngram context. Items in the ngram context at each frame may be given. In this case, the user must provide a file containing the ngram context: one (n-1) tuple per line. To include items from the partial Viterbi path so far (i.e. found at recognition time, not given) the special notation <-N> is used where N indicates the distance back to the item required. For example <-1> would indicate the item on the partial Viterbi path at the last frame. See \Ref{Examples}. Pruning Three types of pruning are available to reduce the size of the search space and therefore speed up the search: Observation pruning Top-N pruning at each frame Fixed width beam pruning */ //@} /**@name Options */ //@{ //@options //@} int main(int argc, char **argv) { EST_StrList files; EST_Option al; EST_Relation wstream; double floor; // a temporary parse_command_line(argc, argv, EST_String("[observations file] -o [output file]\n")+ "Summary: find the most likely path through a sequence of\n"+ " observations, constrained by a language model.\n"+ "-ngram Grammar file, required\n"+ "-given ngram left contexts, per frame\n"+ "-vocab File with names of vocabulary, this\n"+ " must be same number as width of observations, required\n"+ "-ob_type Observation type : likelihood .... and change doc\"probs\" or \"logs\" (default is \"logs\")\n"+ "\nFloor values and scaling (scaling is applied after floor value)\n"+ "-lm_floor LM floor probability\n"+ "-lm_scale LM scale factor factor (applied to log prob)\n"+ "-ob_floor Observations floor probability\n"+ "-ob_scale Observation scale factor (applied to prob or log prob, depending on -ob_type)\n\n"+ "-prev_tag \n"+ " tag before sentence start\n"+ "-prev_prev_tag \n"+ " all words before 'prev_tag'\n"+ "-last_tag \n"+ " after sentence end\n"+ "-default_tags use default tags of "+SENTENCE_START_MARKER+"," SENTENCE_END_MARKER+" and "+SENTENCE_END_MARKER+"\n"+ " respectively\n\n"+ "-observes2 second observations (overlays first, ob_type must be same)\n"+ "-ob_floor2 \n"+ "-ob_scale2 \n\n"+ "-ob_prune observation pruning beam width (log) probability\n"+ "-n_prune top-n pruning of observations\n"+ "-prune pruning beam width (log) probability\n"+ "-trace show details of search as it proceeds\n", files, al); out_file = al.present("-o") ? al.val("-o") : (EST_String)"-"; if (files.length() != 1) { cerr << argv[0]; cerr << ": you must give exactly one observations file on the command line"; cerr << endl; cerr << "(use -observes2 for optional second observations)" << endl; exit(-1); } if (al.present("-ngram")) { ngram.load(al.val("-ngram")); } else { cerr << argv[0] << ": no ngram specified" << endl; exit(-1); } if(!al.present("-vocab")) { cerr << "You must provide a vocabulary file !" << endl; exit(-1); } load_wstream(files.first(),al.val("-vocab"),wstream,observations); if (al.present("-observes2")) { load_wstream(al.val("-observes2"),al.val("-vocab"),wstream,observations2); num_obs = 2; } if (al.present("-given")) { load_given(al.val("-given"),ngram.order()); using_given=TRUE; } if (al.present("-lm_scale")) lm_scale = al.fval("-lm_scale"); else lm_scale = 1.0; if (al.present("-ob_scale")) ob_scale = al.fval("-ob_scale"); else ob_scale = 1.0; if (al.present("-ob_scale2")) ob_scale2 = al.fval("-ob_scale2"); else ob_scale2 = 1.0; if (al.present("-prev_tag")) pstring = al.val("-prev_tag"); if (al.present("-prev_prev_tag")) ppstring = al.val("-prev_prev_tag"); // pruning if (al.present("-prune")) beam = al.fval("-prune"); else beam = -1; // no pruning if (al.present("-ob_prune")) ob_beam = al.fval("-ob_prune"); else ob_beam = -1; // no pruning if (al.present("-n_prune")) { n_beam = al.ival("-n_prune"); if(n_beam <= 0) { cerr << "WARNING : " << n_beam; cerr << " is not a reasonable value for -n_prune !" << endl; } } else n_beam = -1; // no pruning if (al.present("-trace")) trace_on = TRUE; // language model floor if (al.present("-lm_floor")) { floor = al.fval("-lm_floor"); if(floor < 0) { cerr << "Error : LM floor probability is negative !" << endl; exit(-1); } else if(floor > 1) { cerr << "Error : LM floor probability > 1 " << endl; exit(-1); } lm_log_prob_floor = safe_log(floor); } // observations floor if (al.present("-ob_floor")) { floor = al.fval("-ob_floor"); if(floor < 0) { cerr << "Error : Observation floor probability is negative !" << endl; exit(-1); } else if(floor > 1) { cerr << "Error : Observation floor probability > 1 " << endl; exit(-1); } ob_log_prob_floor = safe_log(floor); } if (al.present("-ob_floor2")) { floor = al.fval("-ob_floor2"); if(floor < 0) { cerr << "Error : Observation2 floor probability is negative !" << endl; exit(-1); } else if(floor > 1) { cerr << "Error : Observation2 floor probability > 1 " << endl; exit(-1); } ob_log_prob_floor2 = safe_log(floor); } if (al.present("-ob_type")) { if(al.val("-ob_type") == "logs") take_logs = false; else if(al.val("-ob_type") == "probs") take_logs = true; else { cerr << "\"" << al.val("-ob_type") << "\" is not a valid ob_type : try \"logs\" or \"probs\"" << endl; exit(-1); } } if(do_search(wstream)) print_results(wstream); else cerr << "No path could be found." << endl; return 0; } static void print_results(EST_Relation &wstream) { EST_Item *s; float pscore; EST_String predict; FILE *fd; if (out_file == "-") fd = stdout; else if ((fd = fopen(out_file,"wb")) == NULL) { cerr << "can't open \"" << out_file << "\" for output" << endl; exit(-1); } for (s=wstream.head(); s != 0 ; s=s->next()) { predict = s->f("best").string(); pscore = s->f("best_score"); fprintf(fd,"%s %f\n",(const char *)predict,pscore); } if (out_file != "") fclose(fd); } static bool do_search(EST_Relation &wstream) { // Apply Ngram to matrix of probs int states; states = ngram.num_states(); EST_Viterbi_Decoder vc(vit_candlist,vit_npath,states); vc.initialise(&wstream); if((beam > 0) || (ob_beam > 0)) vc.set_pruning_parameters(beam,ob_beam); if(trace_on) { vc.turn_on_trace(); cerr << "Starting Viterbi search..." << endl; } vc.search(); return vc.result("best"); // adds fields to w with best values } static void load_wstream(const EST_String &filename, const EST_String &vfile, EST_Relation &w, EST_Track &obs) { // Load in vocab and probs into Stream (this isn't general) EST_String word, pos; int i=-1; if(vocab.empty()) load_vocab(vfile); if (obs.load(filename,0.10) != 0) { cerr << "can't find observations file \"" << filename << "\"" << endl; exit(-1); } if (vocab.length() != obs.num_channels()) { cerr << "Number in vocab (" << vocab.length() << ") not equal to observation's width (" << obs.num_channels() << ")" << endl; exit(-1); } if(w.empty()) for (i=0; i < obs.num_frames(); i++) add_word(w,itoString(i),i); } static void load_given(const EST_String &filename, const int ngram_order) { EST_String word, pos; EST_Litem *p; int i,j; if (load_TList_of_StrVector(given,filename,ngram_order-1) != 0) { cerr << "can't load given file \"" << filename << "\"" << endl; exit(-1); } // set max history for (p = given.head(); p; p = p->next()) { for(i=0;i max_history)) max_history = -j; } } static void load_vocab(const EST_String &vfile) { // Load vocabulary (strings) EST_TokenStream ts; if (ts.open(vfile) == -1) { cerr << "can't find vocab file \"" << vfile << "\"" << endl; exit(-1); } while (!ts.eof()) if (ts.peek() != "") vocab.append(ts.get().string()); ts.close(); } static void add_word(EST_Relation &w, const EST_String &word, int pos) { EST_Item *item = w.append(); item->set_name(word); item->set("pos",pos); } static EST_VTCandidate *vit_candlist(EST_Item *s,EST_Features &f) { // Return a list of new candidates from this point double prob=1.0,prob2=1.0; int i; EST_Litem *p; int observe; EST_VTCandidate *all_c = 0; EST_VTCandidate *c; (void)f; observe = s->f("pos"); // index for observations TRACK for (i=0,p=vocab.head(); i < observations.num_channels(); i++,p=p->next()) { c = new EST_VTCandidate; c->name = vocab(p); // to be more efficient this could be the index prob = observations.a(observe,i); if(num_obs == 2) prob2 = observations2.a(observe,i); if(take_logs) { prob = safe_log10(prob); if (prob < ob_log_prob_floor) prob = ob_log_prob_floor; if(num_obs == 2) { prob2 = safe_log10(prob2); if (prob2 < ob_log_prob_floor2) prob2 = ob_log_prob_floor2; } } else // already in logs { if (prob < ob_log_prob_floor) prob = ob_log_prob_floor; if ((num_obs == 2) && (prob2 < ob_log_prob_floor2)) prob2 = ob_log_prob_floor2; } prob *= ob_scale; prob2 *= ob_scale2; if(num_obs == 2) c->score = prob + prob2; else c->score = prob; c->next = all_c; c->s = s; all_c = c; } if(n_beam > 0) { // N.B. this might be very time-consuming top_n_candidates(all_c); } return all_c; } static EST_VTPath *vit_npath(EST_VTPath *p,EST_VTCandidate *c, EST_Features &f) { // Build a (potential) new path link from this previous path and // This candidate EST_VTPath *np = new EST_VTPath; double lprob,prob; EST_String prev,ttt; (void)f; np->c = c; np->from = p; // are we using extra info ? if(using_given) // time of candidate is // c->s->f("pos"); prob = find_extra_gram_prob(np,&np->state,c->s->f("pos")); else prob = find_gram_prob(np,&np->state); lprob = safe_log10(prob); if (lprob < lm_log_prob_floor) lprob = lm_log_prob_floor; lprob *= lm_scale; np->f.set("lscore",(c->score+lprob)); // simonk : changed prob to lprob if (p==0) np->score = (c->score+lprob); else np->score = (c->score+lprob) + p->score; return np; } static double find_gram_prob(EST_VTPath *p,int *state) { // Look up transition probability from *state for name. // Return probability and update state double prob=0.0,nprob; int i,f=FALSE; EST_VTPath *pp; EST_StrVector window(ngram.order()); for (pp=p->from,i=ngram.order()-2; i >= 0; i--) { if (pp != 0) { window[i] = pp->c->name.string(); pp = pp->from; } else if (f) window[i] = ppstring; else { window[i] = pstring; f = TRUE; } } window[ngram.order()-1] = p->c->name.string(); const EST_DiscreteProbDistribution &pd = ngram.prob_dist(window); if (pd.samples() == 0) prob = 0; else prob = (double)pd.probability(p->c->name.string()); for (i=0; i < ngram.order()-1; i++) window[i] = window(i+1); ngram.predict(window,&nprob,state); return prob; } static double find_extra_gram_prob(EST_VTPath *p,int *state,int time) { int i; double prob=0.0,nprob; EST_StrVector window(ngram.order()); EST_StrVector history(max_history); get_history(history,p); fill_window(window,history,p,time); /* cerr << "Looking up ngram "; for(i=0;ic->name.string()); // shift history, adding latest item at 'end' (0) if(max_history>0) { for(i=history.length()-1;i>0;i--) history[i] = history(i-1); history[0] = p->c->name.string(); } fill_window(window,history,p,time+1); ngram.predict(window,&nprob,state); //cerr << endl << endl; return prob; } static void get_history(EST_StrVector &history, EST_VTPath *p) { EST_VTPath *pp; int i,f=FALSE; for (pp=p->from,i=0; i < history.length(); i++) { if (pp != 0) { history[i] = pp->c->name.string(); pp = pp->from; } else if (f) history[i] = ppstring; else { history[i] = pstring; f = TRUE; } } } static void fill_window(EST_StrVector &window,EST_StrVector &history, EST_VTPath *p,const int time) { // Look up transition probability from *state for name. // Return probability and update state int i,j; EST_String s; // can we even do it? if( time >= given.length() ) return; // format should be run-time defined, but try this for now // first n-1 things in window come from 'given' // last one is predictee // also want vocab and grammar mismatch allowed !!!!!! // predictee window[ngram.order()-1] = p->c->name.string(); // given info for this time EST_StrVector *this_g = &(given.nth(time)); // inefficient to count down a list for(i=0;i (0) j=-2 -> (1) etc. else window[i] = (*this_g)(i); } } static int is_a_special(const EST_String &s, int &val) { // special is "" EST_String tmp; if(s.contains("<") && s.contains(">")) { tmp = s.after("<"); tmp = tmp.before(">"); val = atoi(tmp); //cerr << "special " << tmp << "=" << val << endl; return TRUE; } return FALSE; } static void top_n_candidates(EST_VTCandidate* &all_c) { // keep the n most likely candidates // avoiding a full sort of the (potentially long) list EST_VTCandidate *top_c=NULL,*p,*q,*this_best, *prev_to_best; int i; if(n_beam < 1) return; // do nothing // here we assume big is always good //if(!big_is_good) //score_multiplier = -1; for(i=0;inext) { //cerr << "item : " << p->score << endl; if(p->score > this_best->score) { this_best = p; prev_to_best=q; } } if(this_best == NULL) break; // give up now - must have run out of candidates // move best candidate over to new list if(prev_to_best == NULL) // best was head of list all_c = this_best->next; else // best was not head of all_c prev_to_best->next = this_best->next; this_best->next = top_c; top_c = this_best; } delete all_c; all_c = top_c; /* cerr << "Here are the top " << n_beam << " candidates" << endl; for(p=all_c;p != NULL;p=p->next) cerr << p->score << endl; */ } /**@name Examples Example 'given' file (items f and g are in the vocabulary), the ngram is a 4-gram. <-2> g g <-1> g f <-1> f g <-2> g g <-3> g g <-1> g f */ //@{ //@} //@}