/*************************************************************************/ /* */ /* Centre for Speech Technology Research */ /* University of Edinburgh, UK */ /* Copyright (c) 1996 */ /* All Rights Reserved. */ /* */ /* Permission is hereby granted, free of charge, to use and distribute */ /* this software and its documentation without restriction, including */ /* without limitation the rights to use, copy, modify, merge, publish, */ /* distribute, sublicense, and/or sell copies of this work, and to */ /* permit persons to whom this work is furnished to do so, subject to */ /* the following conditions: */ /* 1. The code must retain the above copyright notice, this list of */ /* conditions and the following disclaimer. */ /* 2. Any modifications must be clearly marked as such. */ /* 3. Original authors' names are not deleted. */ /* 4. The authors' names are not used to endorse or promote products */ /* derived from this software without specific prior written */ /* permission. */ /* */ /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */ /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */ /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */ /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */ /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */ /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */ /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */ /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */ /* THIS SOFTWARE. */ /* */ /*************************************************************************/ /* Author : Alan W Black */ /* Date : December 1997 */ /*-----------------------------------------------------------------------*/ /* */ /* A LTS rule compiler, where rules are contextual rewrite rules. Rules */ /* are of the for LC [ x ] RC => y where LC and RC are regexs on the */ /* tape only and x and y are simple strings on symbols. That is the */ /* standard form of LTS rules used in Festival. */ /* */ /*=======================================================================*/ #include #include "EST_cutils.h" #include "EST_WFST.h" static LISP lts_find_feasible_pairs(LISP rules); static LISP make_fp(LISP in, LISP out); static LISP find_outs(LISP rule); static LISP find_ins(LISP rule); static LISP add_alpha(LISP n, LISP existing); static LISP lts_find_alphabets(LISP rules); static void ltsrule_compile(LISP inalpha, LISP outalpha, LISP fp, LISP sets, LISP rule, EST_WFST &a, EST_WFST ¬_a); static LISP analyse_rule(LISP rule); static LISP expand_sets(LISP l, LISP fp, LISP sets); static LISP expand_set(LISP p, LISP fp, LISP sets); static LISP find_notMAP(LISP MAP,LISP fp); void ltscompile(LISP lts_rules, EST_WFST &all_wfst) { // Build a transducer from given LTS rules. Because the interpretation // of these rules is normally ordered and the WFST is not, the // complement of each cumulative WFST must be generated before // adding the next rule LISP r; LISP fp; // feasible pairs, those pairs with rules (rather than IxO) LISP inalpha,outalpha,alphas; LISP sets=siod_nth(2,lts_rules); LISP rules=siod_nth(3,lts_rules); EST_WFST nots; alphas = lts_find_alphabets(lts_rules); inalpha = car(alphas); outalpha = cdr(alphas); fp = lts_find_feasible_pairs(rules); // set up an empty WFST, accepts nothing all_wfst.build_from_regex(inalpha,outalpha,NIL); // matches things not matched by rules, everything to begin with nots.build_from_regex(inalpha,outalpha, cons(rintern("*"), cons(cons(rintern("or"),fp),NIL))); nots.save("-"); for (r=rules; r != NIL; r=cdr(r)) { EST_WFST a, not_a,b,c,d; // all = all u (all' n r) ltsrule_compile(inalpha,outalpha,fp,sets,car(r),a,not_a); pprint(car(r)); a.save("-"); c.intersection(a,nots); c.save("-"); // Add for next rule b.uunion(nots,not_a); not_a.save("-"); b.save("-"); nots = b; d.uunion(all_wfst,c); all_wfst = d; all_wfst.save("-"); } } static LISP lts_find_alphabets(LISP rules) { // Find the alphabets used in the rules LISP r; LISP in=NIL, out=NIL; for (r=siod_nth(3,rules); r != NIL; r=cdr(r)) { in = add_alpha(find_ins(car(r)),in); out = add_alpha(find_outs(car(r)),out); } return cons(in,out); } static LISP add_alpha(LISP n, LISP existing) { // Add values in n if not already in existing LISP t; LISP e=existing; for (t=n; t != NIL; t=cdr(t)) if (!siod_member_str(get_c_string(car(t)),e)) e = cons(car(t),e); return e; } static LISP find_ins(LISP rule) { // find all symbols in [] in rule LISP c; int state=FALSE; LISP ins = NIL; for (c=rule; c != NIL; c=cdr(c)) { if (streq("[",get_c_string(car(c)))) state=TRUE; else if (streq("]",get_c_string(car(c)))) break; else if (state) ins = cons(car(c),ins); } return reverse(ins); } static LISP find_outs(LISP rule) { // find all symbols after = rule LISP c; int state=FALSE; LISP outs = NIL; for (c=rule; c != NIL; c=cdr(c)) { if (streq("=",get_c_string(car(c)))) state=TRUE; else if (state) outs = cons(car(c),outs); } return reverse(outs); } static LISP lts_find_feasible_pairs(LISP rules) { // Find the set of pairs that have rules associated with them // This effectively defines the transducer alphabet. // We take the surface part in [] and the part after the = and // linearly match them to form a set of pairs, padded with epsilon // if necessary. LISP fp = NIL; LISP r; for (r=rules; r != NIL; r=cdr(r)) { LISP in = find_ins(car(r)); LISP out = find_outs(car(r)); LISP pairs = make_fp(in,out); for (LISP p=pairs; p != NIL; p=cdr(p)) { if (!siod_member_str(get_c_string(car(p)),fp)) fp = cons(car(p),fp); } } return fp; } static LISP make_fp(LISP in, LISP out) { // Returns a list of pairs by matching each member of in to out // padding the shorted one with _epsilon_ if necessary LISP i,o; LISP fp=NIL; EST_String is,os; int m; if (siod_llength(in) > siod_llength(out)) m = siod_llength(in); else m = siod_llength(out); for (i=in,o=out ; m > 0; --m,i=cdr(i),o=cdr(o)) { if (i == NIL) is = "__epsilon__"; else is = get_c_string(car(i)); if (o == NIL) os = "__epsilon__"; else os = get_c_string(car(o)); fp = cons(strintern(is+"/"+os),fp); } return reverse(fp); } static void ltsrule_compile(LISP inalpha, LISP outalpha, LISP fp, LISP sets, LISP rule, EST_WFST &a, EST_WFST ¬_a) { // Return two regexs, one matching with rewrites and another // that matches things this rule doesn't match. LISP LC,MAP,RC,notMAP,r; r = analyse_rule(rule); LC = siod_nth(0,r); MAP = siod_nth(1,r); RC = siod_nth(2,r); LC = expand_sets(LC,fp,sets); RC = expand_sets(RC,fp,sets); notMAP = find_notMAP(MAP,fp); LISP kk = cons(LC,cons(MAP,cons(RC,NIL))); cout << "kk rule" << endl; pprint(kk); a.kkrule_compile(inalpha,outalpha,fp,kk,NIL); // (or (* ) (not )) ;; everything except the rule LISP regex_r = cons(rintern("and"),append(LC,append(MAP,RC))); // LISP nn = cons(rintern("or"), // cons(cons(rintern("*"),cons(cons(rintern("or"),fp),NIL)), // cons(cons(rintern("not"),cons(regex_r,NIL)), // NIL))); LISP nn = cons(rintern("not"),cons(regex_r,NIL)); not_a.build_from_regex(inalpha,outalpha,nn); } static LISP analyse_rule(LISP rule) { // return the left context, map and right context; LISP LC=NIL, RC=NIL, in=NIL, out=NIL; LISP l; int state=0; for (l=rule; l != NIL; l=cdr(l)) { if ((state==0) && (!streq("[",get_c_string(car(l))))) LC = cons(car(l),LC); else if ((state==0) && (streq("[",get_c_string(car(l))))) state = 1; else if ((state==1) && (!streq("]",get_c_string(car(l))))) in = cons(car(l),in); else if ((state==1) && (streq("]",get_c_string(car(l))))) state = 2; else if ((state==2) && (!streq("=",get_c_string(car(l))))) RC = cons(car(l),RC); else if ((state==2) && (streq("=",get_c_string(car(l))))) state = 3; else if (state == 3) { out = l; break; } } return cons(reverse(LC), cons(make_fp(reverse(in),out), cons(reverse(RC),NIL))); } static LISP expand_sets(LISP l, LISP fp, LISP sets) { // Expand sets in l and fix regex characters LISP r,es=NIL; for (r=l; r != NIL; r=cdr(r)) { LISP s = expand_set(car(r),fp,sets); if (cdr(r) && (streq("*",get_c_string(car(cdr(r)))))) { es = cons(cons(rintern("*"),s),es); r=cdr(r); } else if (cdr(r) && (streq("+",get_c_string(car(cdr(r)))))) { es = cons(cons(rintern("+"),s),es); r=cdr(r); } else es = cons(cons(rintern("and"),s),es); } return reverse(es); } static LISP expand_set(LISP p, LISP fp, LISP sets) { // expand p with respect to sets and feasible pairs LISP set = siod_assoc_str(get_c_string(p),sets); LISP s,f; LISP r=NIL; if (set == NIL) set = cons(p,NIL); for (s=set; s != NIL; s=cdr(s)) { for (f=fp; f != NIL; f=cdr(f)) { EST_String ss = get_c_string(car(s)); EST_String sf = get_c_string(car(f)); if (sf.contains(ss+"/")) r = cons(car(f),r); } } return reverse(r); } static LISP find_notMAP(LISP MAP,LISP fp) { // Returns REGEX that matches everything except MAP, this doesn't // try all possible epsilons though LISP r,notrp=NIL,m,np; EST_String s,l,p,sr,lr,rr; for (m=MAP; m != NIL; m=cdr(m)) { p = get_c_string(car(m)); if (p.contains("/")) { s = p.before("/"); l = p.after("/"); } else { s = p; l = p; } for (np=NIL,r=fp; r != NIL; r = cdr(r)) { rr = get_c_string(car(r)); if (rr.contains("/")) { sr = rr.before("/"); lr = rr.after("/"); } else { sr = rr; lr = rr; } if ((s == sr) && (l != lr)) np = cons(car(r),np); } notrp = cons(cons(rintern("or"),np),notrp); } return reverse(notrp); }