/*************************************************************************/ /* */ /* Centre for Speech Technology Research */ /* University of Edinburgh, UK */ /* Copyright (c) 1996 */ /* All Rights Reserved. */ /* */ /* Permission is hereby granted, free of charge, to use and distribute */ /* this software and its documentation without restriction, including */ /* without limitation the rights to use, copy, modify, merge, publish, */ /* distribute, sublicense, and/or sell copies of this work, and to */ /* permit persons to whom this work is furnished to do so, subject to */ /* the following conditions: */ /* 1. The code must retain the above copyright notice, this list of */ /* conditions and the following disclaimer. */ /* 2. Any modifications must be clearly marked as such. */ /* 3. Original authors' names are not deleted. */ /* 4. The authors' names are not used to endorse or promote products */ /* derived from this software without specific prior written */ /* permission. */ /* */ /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */ /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */ /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */ /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */ /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */ /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */ /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */ /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */ /* THIS SOFTWARE. */ /* */ /*************************************************************************/ /* Author : Alan W Black */ /* Date : December 1997 */ /*-----------------------------------------------------------------------*/ /* */ /* A Regular grammar compiler, its pretty free about the grammar */ /* Actually it will take full context free grammars and convert them */ /* up to a specified rewrite depth */ /* */ /* Based loosely on "Finite State Machines from Features Grammars" by */ /* Black, International Workshop of Parsing Technologies, CMU 89 */ /* */ /*=======================================================================*/ #include #include "EST_cutils.h" #include "EST_WFST.h" static LISP find_rewrites(LISP rules, LISP terms, LISP nonterms); static LISP rg_find_nt_ts(LISP rules,LISP sets); static LISP prod_join(LISP n, LISP p); static int production_index(LISP state, EST_WFST_MultiStateIndex &index, int proposed); void rgcompile(LISP rg, EST_WFST &all_wfst) { // Build a transducer from given regular grammar. LISP nt_ts,nonterms,terms,rewrites; LISP sets=siod_nth(2,rg); LISP rules=siod_nth(3,rg); nt_ts = rg_find_nt_ts(rules,sets); nonterms = car(nt_ts); terms = cdr(nt_ts); rewrites = find_rewrites(rules,terms,nonterms); if (rewrites == NIL) return; // left recursive or no rules all_wfst.build_from_rg(terms,terms, car(car(rules)), // distinguished symbol rewrites, sets,terms,25); } static LISP find_rewrites(LISP rules, LISP terms, LISP nonterms) { // Find the full rewrites of each nonterminal until a terminal // appears as the first item LISP nt,r; LISP rewrites = NIL; (void)terms; // got lazy and haven't done this recursively yet ... for (nt=nonterms; nt != NIL; nt=cdr(nt)) { LISP nn = NIL; for (r=rules; r != NIL; r=cdr(r)) if (car(car(r)) == car(nt)) // depend on symbols being eq nn = cons(cdr(cdr(car(r))),nn); rewrites = cons(cons(car(nt),nn),rewrites); } return rewrites; } static LISP rg_find_nt_ts(LISP rules,LISP sets) { // Find the alphabets used in the rules LISP terms=NIL,nonterms=NIL,r,s,set,t; for (r=rules; r != NIL; r=cdr(r)) if (!siod_member_str(get_c_string(car(car(r))),nonterms)) nonterms = cons(car(car(r)),nonterms); for (r=rules; r != NIL; r=cdr(r)) for (s=cdr(cdr(car(r))); s != NIL; s=cdr(s)) if ((!siod_member_str(get_c_string(car(s)),terms)) && (!siod_member_str(get_c_string(car(s)),nonterms)) && (!siod_assoc_str(get_c_string(car(s)),sets))) terms = cons(car(s),terms); else if ((set=siod_assoc_str(get_c_string(car(s)),sets))) { for (t=cdr(set); t != 0; t=cdr(t)) if (!siod_member_str(get_c_string(car(t)),terms)) terms = cons(car(t),terms); } return cons(nonterms,terms); } void EST_WFST::build_from_rg(LISP inalpha, LISP outalpha, LISP distinguished, LISP rewrites, LISP sets, LISP terms, int max_depth) { // This is sort of similar to determinising in that the "state" // is represented by a list of numbers, i.e. the remainder of // of production LISP current, start_state, remainder, set, new_prod; int ns, current_state; const char *current_sym; LISP agenda = NIL; EST_WFST_MultiStateIndex index(100); (void)max_depth; int c=0; clear(); init(inalpha,outalpha); int i_epsilon = in_epsilon(); int o_epsilon = out_epsilon(); // Create a starting state and add it to this WFST p_start_state = add_state(wfst_nonfinal); start_state = cons(flocons((double)p_start_state), cons(distinguished,NIL)); production_index(start_state,index,p_start_state); agenda = cons(start_state,agenda); // initialize agenda while (agenda != NIL) { current = car(agenda); agenda = cdr(agenda); current_state = get_c_int(car(current)); current_sym = get_c_string(car(cdr(current))); remainder = cdr(cdr(current)); if ((c % 1000)== 0) cout << summary() << " Agenda " << siod_llength(agenda) << endl; c++; if ((set=siod_assoc_str(current_sym,sets))) { ns = production_index(remainder,index,p_num_states); // add transitions for each member of set for (LISP s=cdr(set); s != NIL; s=cdr(s)) p_states[current_state] ->add_transition(0.0, // no weights ns, p_in_symbols.name(get_c_string(car(s))), p_out_symbols.name(get_c_string(car(s)))); if (remainder == NIL) add_state(wfst_final); else if (ns == p_num_states) // its a new remainder { add_state(wfst_nonfinal); agenda = cons(cons(flocons(ns),remainder),agenda); } } else if (siod_member_str(current_sym,terms)) { ns = production_index(remainder,index,p_num_states); // Add transition for this terminal symbol p_states[current_state] ->add_transition(0.0, // no weights ns, p_in_symbols.name(current_sym), p_out_symbols.name(current_sym)); if (remainder == NIL) add_state(wfst_final); else if (ns == p_num_states) // its a new remainder { add_state(wfst_nonfinal); agenda = cons(cons(flocons(ns),remainder),agenda); } } else // its a non-terminal so simply rewrite { for (LISP p=cdr(siod_assoc_str(current_sym,rewrites)); p != NIL; p=cdr(p)) { new_prod = prod_join(car(p),remainder); ns = production_index(new_prod,index,p_num_states); p_states[current_state] ->add_transition(0.0, // no weights ns,i_epsilon,o_epsilon); if (ns == p_num_states) // its a new remainder { if (new_prod == NIL) add_state(wfst_final); else { add_state(wfst_nonfinal); agenda = cons(cons(flocons(ns),new_prod),agenda); } } } } } } static int production_index(LISP state, EST_WFST_MultiStateIndex &index, int proposed) { // Returns proposed if ms is not already in index, otherwise // returns the value that was proposed when it was first new. // I'll have to make this more efficient in future. EST_String istring(""); LISP p; for (p=state; p != NIL; p = cdr(p)) istring += EST_String(get_c_string(car(p))) + " "; int ns,found; for (p=state; p != NIL; p = cdr(p)) istring += EST_String(get_c_string(car(p))) + " "; ns = index.val(istring,found); if (found) return ns; else { index.add_item(istring,proposed); return proposed; } } static LISP prod_join(LISP n, LISP p) { if (n == NIL) return p; else return cons(car(n),prod_join(cdr(n),p)); }