/*************************************************************************/ /* */ /* Centre for Speech Technology Research */ /* University of Edinburgh, UK */ /* Copyright (c) 1996 */ /* All Rights Reserved. */ /* */ /* Permission is hereby granted, free of charge, to use and distribute */ /* this software and its documentation without restriction, including */ /* without limitation the rights to use, copy, modify, merge, publish, */ /* distribute, sublicense, and/or sell copies of this work, and to */ /* permit persons to whom this work is furnished to do so, subject to */ /* the following conditions: */ /* 1. The code must retain the above copyright notice, this list of */ /* conditions and the following disclaimer. */ /* 2. Any modifications must be clearly marked as such. */ /* 3. Original authors' names are not deleted. */ /* 4. The authors' names are not used to endorse or promote products */ /* derived from this software without specific prior written */ /* permission. */ /* */ /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */ /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */ /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */ /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */ /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */ /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */ /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */ /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */ /* THIS SOFTWARE. */ /* */ /*************************************************************************/ /* Author : Simon King & Alan W Black */ /* Date : February 1997 */ /*-----------------------------------------------------------------------*/ /* */ /* Auxiliary functions for EST_Ngram class */ /* */ /*=======================================================================*/ #include #include #include "EST_String.h" #include "EST_Ngrammar.h" static bool ExponentialFit(EST_DVector &N, double &a, double &b, int first, int last) { // fit the function N(r) = e^a * r^b where a,b are constants // to the points N(first)...N(last) inclusive // i.e. a straight line fit in the (natural) log domain // minimising the mean squared error (in log domain), is : // limitation : r must be an integer (it is the index of v) // default if (last == -1) last = N.n()-1; if (first < 0) { cerr << "ExponentialFit : first must be >= 0" << endl; return false; } if (last >= N.n()-1) { cerr << "ExponentialFit : last must be < N.n()-1 = " << N.n()-1 << endl; } if (first == last) { a=log(N(first)); b=0; return true; } double ElnNr=0.0,ElnNrlnr=0.0, Elnr=0.0,Elnr2=0.0, R=0.0; for(int r=first;r<=last;r++) { R += 1; if ( N(r) > 0) { ElnNr += log( N(r) ); ElnNrlnr += log( N(r) ) * log( (double)r ); } Elnr += log( (double)r ); Elnr2 += log( (double)r ) * log( (double)r ); } // and the answer is : b = ( (ElnNr*Elnr/R) - ElnNrlnr ) / ( (Elnr*Elnr/R) - Elnr2); a = (ElnNr - (b*Elnr) ) / R; return true; } static bool smooth_ExponentialFit(EST_DVector &N, int first, int last) { double a=0.0,b=0.0; if (!ExponentialFit(N,a,b,first,last)) { cerr << "smooth_ExponentialFit : ExponentialFit failed !" << endl; return false; } for(int r=first;r<=last;r++) N[r] = exp(a)* pow((double)r, b); return true; } void make_f_of_f(EST_BackoffNgrammarState *s,void *params) { EST_Litem *k; double freq; EST_String name; EST_DVector *ff = (EST_DVector*)params; int max = ff->n(); for (k=s->pdf_const().item_start(); !s->pdf_const().item_end(k); k = s->pdf_const().item_next(k)) { s->pdf_const().item_freq(k,name,freq); if(freq+0.5 < max) (*ff)[(int)(freq+0.5)] += 1; } } void get_max_f(EST_BackoffNgrammarState *s,void *params) { EST_Litem *k; double freq; EST_String name; double *max = (double*)params; for (k=s->pdf_const().item_start(); !s->pdf_const().item_end(k); k = s->pdf_const().item_next(k)) { s->pdf_const().item_freq(k,name,freq); if(freq > *max) *max = freq; } } void map_f_of_f(EST_BackoffNgrammarState *s,void *params) { EST_Litem *k; double freq; EST_String name; //cerr << "map_f_of_f : visited " << *s << endl; EST_DVector *map = (EST_DVector*)params; int max = map->n(); for (k=s->pdf_const().item_start(); !s->pdf_const().item_end(k); k = s->pdf_const().item_next(k)) { s->pdf_const().item_freq(k,name,freq); if (freq+0.5 < max) { double nfreq = (*map)((int)(freq+0.5)); s->pdf().set_frequency(name,nfreq); } } } void zero_small_f(EST_BackoffNgrammarState *s,void *params) { EST_Litem *k; double freq; EST_String name; double *min = (double*)params; for (k=s->pdf_const().item_start(); !s->pdf_const().item_end(k); k = s->pdf_const().item_next(k)) { s->pdf_const().item_freq(k,name,freq); if (freq < *min) { //cerr << "zeroing " << freq << " < " << *min << endl; s->pdf().override_frequency(k,0.0); } } } void frequency_of_frequencies(EST_DVector &ff, EST_Ngrammar &n,int this_order) { int i,size; EST_Litem *k; double max=0.0; // if ff has zero size, do complete frequency of frequencies // otherwise just fill in frequencies from 1 to ff.n()-1 bool complete = (bool)(ff.n() == 0); switch(n.representation()) { case EST_Ngrammar::sparse: case EST_Ngrammar::dense: { size = n.num_states(); if (complete) { // find highest frequency in EST_Ngram for(i=0;i max) max = n.p_states[i].pdf().samples(); ff.resize((int)(max+1.5)); ff.fill(0.0); } // Sum the frequencies for(i=0;i N.n()-2) { maxcount = N.n()-2; cerr << "adjusted_frequencies_BasicGoodTuring :"; cerr << " maxcount is too big, reducing it to " << maxcount << endl; } M.resize(N.n()); int r; for(r=0; r<=maxcount;r++) { // don't like this bit, but what can you do ? if( (N(r+1) == 0) || (N(r) == 0) ) M[r] = r; else M[r] = (r + 1) * N(r+1) / N(r); } // and do not map higher counts for(;r N.n()-2) { maxcount = N.n()-2; cerr << "smoothed_frequency_distribution_ExponentialFit :" << " maxcount too big, reducing it to " << maxcount << endl; } // the idea to fit this particular fn was by Steve Isard // we ignore r=0 in doing the fit if (!smooth_ExponentialFit(N,1,maxcount+1)) cerr << "smooth_ExponentialFit failed !" << endl; return; } bool Good_Turing_smooth(EST_Ngrammar &ngrammar, int maxcount, int mincount) { // works for any N // since it just remaps frequencies // frequencies above maxcount are not changed // for discounting -- see discount routine ! if (ngrammar.entry_type() != EST_Ngrammar::frequencies) { cerr << "EST_Ngram: cannot Good-Turing smooth ngram:" << " entries are not frequencies" << endl; return false; } switch(ngrammar.representation()) { case EST_Ngrammar::sparse: case EST_Ngrammar::dense: { EST_DVector freqs,mapped_freqs; // grammar is of a single order - simple // Find frequency distribution frequency_of_frequencies(freqs,ngrammar); // smoothing should be optional - to do smoothed_frequency_distribution_ExponentialFit(freqs,maxcount-1); // Build map of frequencies adjusted_frequencies_BasicGoodTuring(mapped_freqs,freqs,maxcount); // Map all frequencies in grammar to Good Turing Smoothed values map_frequencies(ngrammar,mapped_freqs); } break; case EST_Ngrammar::backoff: { cerr << "Smoothing of backed of grammars is not available!" << endl; return false; (void)mincount; /* // need to smooth for each order independently int i,o; double threshold; //for (o=1;o<=ngrammar.order();o++) for(o=ngrammar.order();o<=ngrammar.order();o++) { EST_DVector freqs,mapped_freqs; frequency_of_frequencies(freqs,ngrammar,o); cerr << "FREQ : " << freqs << endl; if(freqs.n() < 2) { // i.e. only unseen things - zero them all threshold = 2; // 1 ? if(o>1) ngrammar.backoff_traverse(ngrammar.backoff_representation, &zero_small_f, (void*)(&threshold), o-1); continue; } int max = maxcount; if(max > freqs.n() - 2) max = freqs.n() - 2; if(max > 2) // need at least 3 points // smoothing should be optional - to do smoothed_frequency_distribution_ExponentialFit(freqs,max); cerr << "SMOOTHED : " << freqs << endl; adjusted_frequencies_BasicGoodTuring(mapped_freqs,freqs,max); // initial map of frequencies modifies total counts // so pdfs are correct map_frequencies(ngrammar,mapped_freqs,o); cerr << "Map for " << o << " : "; for(i=0;i freqs.n() - 2) max = freqs.n() - 2; if(max > 2) { // need at least 3 points // smoothing should be optional - to do // April 97 // to overcome problems with N(x)=0, shift data points, fit, and shift back for(i=0;i<=max+1;i++) freqs[i] += 1; smoothed_frequency_distribution_ExponentialFit(freqs,max); for(i=0;i<=max+1;i++) { freqs[i] -= 1; // possibly unnecesary check if ( freqs(i) < 0 ) freqs[i] = 0; } } adjusted_frequencies_BasicGoodTuring(mapped_freqs,freqs,max); // and put it in the discount array ngrammar.backoff_discount[o-1].resize(freqs.n()); for(i=(int)ngrammar.backoff_threshold;i<=max;i++) { ngrammar.backoff_discount[o-1][i] = (double)i - mapped_freqs(i); // sanity check if( ngrammar.backoff_discount[o-1][i] < 0) { ngrammar.backoff_discount[o-1][i] = 0; } } for(;i