/*************************************************************************/
/*                                                                       */
/*                Centre for Speech Technology Research                  */
/*                     University of Edinburgh, UK                       */
/*                       Copyright (c) 1996,1997                         */
/*                        All Rights Reserved.                           */
/*                                                                       */
/*  Permission is hereby granted, free of charge, to use and distribute  */
/*  this software and its documentation without restriction, including   */
/*  without limitation the rights to use, copy, modify, merge, publish,  */
/*  distribute, sublicense, and/or sell copies of this work, and to      */
/*  permit persons to whom this work is furnished to do so, subject to   */
/*  the following conditions:                                            */
/*   1. The code must retain the above copyright notice, this list of    */
/*      conditions and the following disclaimer.                         */
/*   2. Any modifications must be clearly marked as such.                */
/*   3. Original authors' names are not deleted.                         */
/*   4. The authors' names are not used to endorse or promote products   */
/*      derived from this software without specific prior written        */
/*      permission.                                                      */
/*                                                                       */
/*  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK        */
/*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
/*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
/*  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE     */
/*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
/*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
/*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
/*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
/*  THIS SOFTWARE.                                                       */
/*                                                                       */
/*************************************************************************/
/*                  Author :  Simon King & Alan W Black                  */
/*                  Date   :  February 1997                              */
/*-----------------------------------------------------------------------*/
/*                                                                       */
/*                  IO functions for EST_Ngram class                     */
/*                                                                       */
/*=======================================================================*/

// Standard headers used below (streams, stdio, atoi/atof, pow); the exact
// header names were lost, so these are the ones the code actually needs.
#include <iostream>
#include <fstream>
#include <cstring>
#include "EST_unix.h"
#include <cstdio>
#include <cstdlib>
#include <cmath>
#include "EST_String.h"
#include "EST_Ngrammar.h"
#include "EST_Token.h"
#include "EST_cutils.h"

EST_read_status
load_ngram_htk_ascii(const EST_String filename, EST_Ngrammar &n)
{
    (void)filename;
    (void)n;
    return wrong_format;
}

EST_read_status
load_ngram_htk_binary(const EST_String filename, EST_Ngrammar &n)
{
    (void)filename;
    (void)n;
    return wrong_format;
}
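// ARPA / MIT-LL n-gram files, as read by load_ngram_arpa below, look
// roughly like this -- an illustrative sketch, not output of this code:
//
//     \data\
//     ngram 1=4
//     ngram 2=7
//
//     \1-grams:
//     -1.2041  the   -0.3010
//     ...
//     \2-grams:
//     -0.6990  the cat
//     ...
//     \end\
//
// Each entry line holds a log10 probability, the words of the n-gram and,
// for orders below the maximum, an optional log10 backoff weight.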
EST_read_status
load_ngram_arpa(const EST_String filename, EST_Ngrammar &n, const EST_StrList &vocab)
{
    EST_TokenStream ts;
    EST_String s;
    int i,j,k, order=0;
    double occur,weight;
    int this_num,this_order;

    if (ts.open(filename) == -1)
        return misc_read_error;

    // find backslash data backslash
    while ((!ts.eof()) && !ts.get().string().contains("\\data\\"));

    if (ts.eof())
    {
        ts.close();
        return wrong_format;
    }

    // find order and numbers of ngrams

    // somewhere to keep numbers
    EST_IVector nums(100); // not going to have anything bigger than a 100-gram !

    while (!ts.eof())
    {
        // have we got to next section
        if (ts.peek().string().contains("-grams:"))
            break;

        s=ts.get_upto_eoln().string();

        if (s.contains("ngram ") && s.contains("="))
        {
            s=s.after("ngram ");
            this_order=atoi(s.before("="));
            this_num=atoi(s.after("="));
            //cerr << "There are " << this_num << " " << this_order
            //     << "-grams" << endl;
            nums[this_order] = this_num;

            if (this_order > order)
                order = this_order;
        }
    }

    if (order==0)
    {
        //cerr << "No ngram ?=? in header !" << endl;
        ts.close();
        return wrong_format;
    }

    //cerr << "Initialising " << order << "-grammar" << endl;
    if (!n.init(order,EST_Ngrammar::backoff,vocab))
        return misc_read_error;

    // read data
    for (i=1; i<=order; i++)
    {
        EST_StrVector window(i);

        // find start of data for this order "-grams:"
        EST_String tmp = "\\" + itoString(i) + "-grams:";
        while (!ts.eof())
        {
            s=ts.get().string();
            if (s.contains(tmp))
                break;
        }

        if (ts.eof())
        {
            cerr << "Unexpected end of grammar file whilst looking for '"
                 << tmp << "'" << endl;
            return misc_read_error;
        }

        //cerr << "Found order " << i << " : " << tmp << endl;
        //cerr << "Looking for " << nums(i) << " ngrams" << endl;

        // look for nums(i) ngrams
        for (j=0; j<nums(i); j++)
        {
            // Each entry is: log10(prob)  w1 ... wi  [log10(backoff weight)].
            // Only the read pattern below is certain; the calls that stored
            // the probability (and backoff weight) into n are lost here.
            occur = atof(ts.get().string());    // log10 probability

            for (k=0; k<i; k++)                 // the i words of the ngram
                window[k] = ts.get().string();

            weight = pow(10.0,occur);           // linear probability
            (void)weight;                       // store step for 'window' is missing
        }
    }

    ts.close();
    return format_ok;
}

// ====================================================================

EST_read_status
load_ngram_cstr_bin(const EST_String filename, EST_Ngrammar &n)
{
    // Only the second half of this function survives.  The declarations
    // and file opening below are a sketch of what it needs: the header and
    // vocabulary are read through 'ts', and the run-length encoded
    // frequency data -- 'num_entries' doubles -- are read into 'dd'
    // through the stdio handle 'ifd'.
    EST_TokenStream ts;
    int i,j;
    EST_Litem *k;
    int num_entries=0;
    double approx_num_samples=0.0;
    double *dd=NULL;
    FILE *ifd;

    if ((ifd=fopen(filename,"rb")) == NULL)
        return misc_read_error;
    if (ts.open(filename) == -1)
    {
        fclose(ifd);
        return misc_read_error;
    }

    // (header, vocabulary and frequency-array reading is missing here;
    //  after it, dd[0..num_entries-1] holds the frequencies and j is 0)
    j = 0;

    for (i=0; i < n.num_states(); i++)
    {
        if (j >= num_entries)
        {
            cerr << "EST_Ngrammar::load_ngram_cstr_bin unexpected end of frequency data" << endl;
            ts.close();
            fclose(ifd);
            return misc_read_error;
        }

        for (k=n.p_states[i].pdf().item_start();
             (!n.p_states[i].pdf().item_end(k)) && (j < num_entries) ;
             k = n.p_states[i].pdf().item_next(k))
        {
            n.p_states[i].pdf().set_frequency(k,dd[j]);

            // Update global info too
            approx_num_samples += dd[j]; // probably not right
            n.vocab_pdf.cumulate(k,dd[j]);

            // Number of consecutive occurrences of this frequency as in
            // dd[j+1] if its a negative number
            if (j+1 >= num_entries)
                j++;
            else if (dd[j+1] < -1)
                dd[j+1]++;
            else if (dd[j+1] == -1)
                j += 2;
            else
                j++;
        }
    }

    // With smoothing num_samples might not be as exact as you like
    n.p_num_samples = (int)approx_num_samples;

    delete [] dd;

    ts.close();
    fclose(ifd);

    return format_ok;
}
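// Note on the frequency block decoded above (and written again by
// save_ngram_cstr_bin below): frequencies are stored as doubles with a
// simple run-length encoding, a value followed by a negative count.
// An illustrative sketch (not real data):
//
//     2.0  5.0 -3.0  1.0      decodes to      2.0  5.0  5.0  5.0  1.0
//
// i.e. -3.0 says the preceding 5.0 fills three consecutive slots; the
// decoder bumps the stored count (dd[j+1]++) once per extra slot until
// it reaches -1, then steps past the value/count pair (j += 2).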
"; } if (total_freq > 0) { freq = this_pdf.frequency(n.p_sentence_end_marker); if(freq == lfreq) { lcount++; *ost << "*" << lcount << " " << endl; } else { if (lcount > 1) *ost << "*" << lcount << " "; else *ost << " "; if(freq > 0) { double base_prob = freq / total_freq; // and floor/scale it *ost << floor + ( base_prob * (1-floor_prob_total) ) << endl; } else *ost << floor << endl; } } return write_ok; } EST_write_status save_ngram_htk_ascii(const EST_String filename, EST_Ngrammar &n, double floor) { ostream *ost; // only for bigram if(n.order() != 2) { cerr << "Can only save bigrams in htk_ascii format" << endl; return misc_write_error; } if (floor < 0) { cerr << "Negative floor probability does not make sense !" << endl; return misc_write_error; } if (filename == "-") ost = &cout; else ost = new ofstream(filename); if(!(*ost)) return write_fail; if(floor * (n.pred_vocab->length()-1) > 1) { floor = 1.0 / (double)(n.pred_vocab->length()-1); cerr << "ERROR : floor is impossibly large, scaling it to "; cerr << floor << endl; } int i; if(n.p_sentence_start_marker == "") { cerr << "Can't save in HTK format as no sentence start/end tags" << " were given !" << endl; return misc_write_error; } // need '!ENTER' (or whatever) as first word- that's HTK for you save_ngram_htk_ascii_sub(n.p_sentence_start_marker,ost,n,floor); // the real words for(i=0;ilength();i++) { if ( (n.vocab->name(i) != n.p_sentence_start_marker) && (n.vocab->name(i) != n.p_sentence_end_marker) && (n.vocab->name(i) != OOV_MARKER) ) save_ngram_htk_ascii_sub(n.vocab->name(i),ost,n,floor); } if(!n.closed_vocab()) save_ngram_htk_ascii_sub(OOV_MARKER,ost,n,floor); save_ngram_htk_ascii_sub(n.p_sentence_end_marker,ost,n,floor); if(ost != &cout) delete ost; return write_ok; } /* EST_write_status save_ngram_htk_binary(const EST_String filename, EST_Ngrammar &n) { return write_ok; } */ void count_ngram_arpa_sub(EST_Ngrammar *n, EST_StrVector &ngram, void *count) { if(n->ngram_exists(ngram)) *((double*)count) += 1; } void save_ngram_arpa_sub(EST_Ngrammar *n, EST_StrVector &ngram, void *ost) { int i; if(n->ngram_exists(ngram)) { *((ostream*)(ost)) << safe_log10(n->probability(ngram)) << " "; for(i=0;irepresentation() == EST_Ngrammar::backoff) && (n->order() > ngram.n()) ) *((ostream*)(ost)) << safe_log10(n->get_backoff_weight(ngram)); //<< " = " //<< n->get_backoff_weight(ngram) << " "; *((ostream*)(ost)) << endl; } } EST_write_status save_ngram_arpa(const EST_String filename, EST_Ngrammar &n) { // ARPA MIT-LL format - see HTK manual !! 
/*
EST_write_status
save_ngram_htk_binary(const EST_String filename,
                      EST_Ngrammar &n)
{
    return write_ok;
}
*/

void
count_ngram_arpa_sub(EST_Ngrammar *n, EST_StrVector &ngram, void *count)
{
    if (n->ngram_exists(ngram))
        *((double*)count) += 1;
}

void
save_ngram_arpa_sub(EST_Ngrammar *n, EST_StrVector &ngram, void *ost)
{
    int i;

    if (n->ngram_exists(ngram))
    {
        *((ostream*)(ost)) << safe_log10(n->probability(ngram)) << " ";
        for (i=0; i < ngram.n(); i++)
            *((ostream*)(ost)) << ngram(i) << " ";

        if ( (n->representation() == EST_Ngrammar::backoff) &&
             (n->order() > ngram.n()) )
            *((ostream*)(ost)) << safe_log10(n->get_backoff_weight(ngram));
            //<< " = "
            //<< n->get_backoff_weight(ngram) << " ";

        *((ostream*)(ost)) << endl;
    }
}

EST_write_status
save_ngram_arpa(const EST_String filename, EST_Ngrammar &n)
{
    // ARPA MIT-LL format - see HTK manual !!

    ostream *ost;
    int i,num_n,o;

    if (filename == "-")
        ost = &cout;
    else
        ost = new ofstream(filename);

    if (!(*ost))
        return write_fail;

    //n.set_entry_type(EST_Ngrammar::probabilities);
    //n.make_htk_compatible(); // fix enter/exit probs
    //*ost << *(n.vocab) << endl;

    // count number of ngrams
    num_n = (int)n.samples();
    (void)num_n;
    *ost << "\\data\\" << endl;

    double *count = new double;

    if (n.representation() == EST_Ngrammar::backoff)
    {
        // The remainder of this function did not survive; what follows is a
        // sketch built around count_ngram_arpa_sub/save_ngram_arpa_sub above
        // (n.iterate is assumed to visit every possible o-gram).
        for (o=1; o<=n.order(); o++)
        {
            EST_StrVector ngram(o);
            for (i=0; i<o; i++)
                ngram[i] = "";
            *count = 0;
            n.iterate(ngram,&count_ngram_arpa_sub,(void*)count);
            *ost << "ngram " << o << "=" << *count << endl;
        }

        for (o=1; o<=n.order(); o++)
        {
            *ost << endl << "\\" << o << "-grams:" << endl;
            EST_StrVector ngram(o);
            for (i=0; i<o; i++)
                ngram[i] = "";
            n.iterate(ngram,&save_ngram_arpa_sub,(void*)ost);
        }
    }

    *ost << "\\end\\" << endl;

    delete count;
    if (ost != &cout)
        delete ost;

    return write_ok;
}

// ====================================================================

EST_write_status
save_ngram_cstr_ascii(const EST_String filename, EST_Ngrammar &n,
                      const bool trace, double floor)
{
    // The opening of this function did not survive: the signature is
    // inferred from the uses of 'floor' and 'trace' elsewhere in this
    // file, and the header written before the vocabulary lines is lost.
    ostream *ost;
    int i;

    (void)trace;

    if (filename == "-")
        ost = &cout;
    else
        ost = new ofstream(filename);

    if (!(*ost))
        return write_fail;

    for (i=0; i < n.vocab->length(); i++)
        *ost << n.vocab->name(i) << " ";
    *ost << endl;

    for (i=0; i < n.pred_vocab->length(); i++)
        *ost << n.pred_vocab->name(i) << " ";
    *ost << endl;

    if (n.representation() == EST_Ngrammar::dense)
        n.print_freqs(*ost,floor);
    else if (n.representation() == EST_Ngrammar::backoff)
    {
        int total_ngrams = (int)pow(float(n.get_vocab_length()),float(n.order()-1));

        for (i=0; i < total_ngrams; i++)
        {
            // body lost: printed the (floored) frequencies for the i'th
            // context, as in the dense case above
        }
    }

    if (ost != &cout)
        delete ost;

    return write_ok;
}

// ====================================================================

EST_write_status
save_ngram_wfst(const EST_String filename, EST_Ngrammar &n)
{
    // Only the header-writing part of this function survives; the name,
    // signature, fopen and the header lines up to the "in" alphabet are
    // inferred from the surviving fprintf calls.
    FILE *ost;
    int i;

    if ((ost = fopen(filename,"wb")) == NULL)
        return misc_write_error;

    fprintf(ost,"EST_File fst\n");
    fprintf(ost,"DataType ascii\n");
    fprintf(ost,"in \"(");
    for (i=0; i < n.vocab->length(); i++)
        fprintf(ost," %s\n",(const char *)n.vocab->name(i));
    fprintf(ost," )\"\n");
    fprintf(ost,"out \"(");
    for (i=0; i < n.vocab->length(); i++)
        fprintf(ost," %s\n",(const char *)n.vocab->name(i));
    fprintf(ost," )\"\n");
    fprintf(ost,"NumStates %d\n",n.num_states());
    fprintf(ost,"EST_Header_End\n");

    for (i=0; i < n.num_states(); i++)
    {
        // body lost: wrote the transitions of state i
    }

    fclose(ost);
    return write_ok;
}

// ====================================================================

EST_write_status
save_ngram_cstr_bin(const EST_String filename, EST_Ngrammar &n,
                    const bool trace)
{
    // The opening of this function did not survive: it opens 'filename' on
    // the stdio handle 'ofd' and writes the binary-format header before
    // the vocabulary lines below; the signature is inferred from the use
    // of 'trace' further down.
    FILE *ofd;
    int i;

    if ((ofd = fopen(filename,"wb")) == NULL)
        return misc_write_error;

    for (i=0; i < n.vocab->length(); i++)
        fprintf(ofd,"%s ",(const char *)n.vocab->name(i));
    fprintf(ofd,"\n");

    for (i=0; i < n.pred_vocab->length(); i++)
        fprintf(ofd,"%s ",(const char *)n.pred_vocab->name(i));
    fprintf(ofd,"\n");

    // We use a simple form of run-length encoding, if consecutive
    // values are equal only a length is printed.  lengths are
    // negative as frequencies (even smoothed ones) can never be -ve

    if ( trace )
        cerr << "Saving ..." << endl;

    if (n.representation() == EST_Ngrammar::dense)
    {
        for (i=0; i < n.num_states(); i++)
        {
            // remainder lost: this loop writes each state's frequencies as
            // doubles, collapsing runs of equal values into a value plus a
            // negative repeat count, matching load_ngram_cstr_bin above
        }
    }

    fclose(ofd);
    return write_ok;
}
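// Example use of the loaders/savers above -- a sketch only; the file
// names and the word list are hypothetical:
//
//     EST_StrList vocab;            // must cover every word in the model
//     vocab.append("<s>");
//     vocab.append("</s>");
//     vocab.append("the");
//
//     EST_Ngrammar ngram;
//     if (load_ngram_arpa("model.arpa", ngram, vocab) == format_ok)
//         save_ngram_arpa("-", ngram);   // "-" writes to stdout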