/*************************************************************************/ /* */ /* Centre for Speech Technology Research */ /* University of Edinburgh, UK */ /* Copyright (c) 1996 */ /* All Rights Reserved. */ /* */ /* Permission is hereby granted, free of charge, to use and distribute */ /* this software and its documentation without restriction, including */ /* without limitation the rights to use, copy, modify, merge, publish, */ /* distribute, sublicense, and/or sell copies of this work, and to */ /* permit persons to whom this work is furnished to do so, subject to */ /* the following conditions: */ /* 1. The code must retain the above copyright notice, this list of */ /* conditions and the following disclaimer. */ /* 2. Any modifications must be clearly marked as such. */ /* 3. Original authors' names are not deleted. */ /* 4. The authors' names are not used to endorse or promote products */ /* derived from this software without specific prior written */ /* permission. */ /* */ /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */ /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */ /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */ /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */ /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */ /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */ /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */ /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */ /* THIS SOFTWARE. */ /* */ /*************************************************************************/ /* Author : Simon King & Alan W Black */ /* Date : February 1997 */ /*-----------------------------------------------------------------------*/ /* */ /* A general class for ngrams (bi-gram, tri-gram etc) */ /* */ /*=======================================================================*/ #ifndef __EST_NGRAMMAR_H__ #define __EST_NGRAMMAR_H__ #include #include using namespace std; #include "EST_String.h" #include "EST_Val.h" #include "EST_rw_status.h" #include "EST_types.h" #include "EST_FMatrix.h" #include "EST_TList.h" #include "EST_StringTrie.h" #include "EST_simplestats.h" #include "EST_PST.h" #include "EST_string_aux.h" #include "EST_math.h" // HTK style #define SENTENCE_START_MARKER "!ENTER" #define SENTENCE_END_MARKER "!EXIT" #define OOV_MARKER "!OOV" #define EST_NGRAMBIN_MAGIC 1315402337 // for compressed save/load #define GZIP_FILENAME_EXTENSION "gz" #define COMPRESS_FILENAME_EXTENSION "Z" // Ultimate floor #define TINY_FREQ 1.0e-10 // ngram state - represents the N-1 word history and contains // the pdf of the next word class EST_NgrammarState { private: protected: EST_DiscreteProbDistribution p_pdf; int p_id; // a 'name' public: EST_NgrammarState() : p_pdf() { init(); }; EST_NgrammarState(int id,EST_Discrete *d){clear();init(id,d);}; EST_NgrammarState(int id,const EST_DiscreteProbDistribution &pdf) {clear();init(id,pdf);}; EST_NgrammarState(const EST_NgrammarState &s); EST_NgrammarState(const EST_NgrammarState *const s); ~EST_NgrammarState(); EST_IVector path; // how we got here // initialise void clear(); void init(); void init(int id, EST_Discrete *d); void init(int id, const EST_DiscreteProbDistribution &pdf); // build void cumulate(const int index, const double count=1) {p_pdf.cumulate(index,count);}; void cumulate(const EST_String &word, const double count=1) {p_pdf.cumulate(word,count);}; // access int id() const {return p_id; }; const EST_DiscreteProbDistribution &pdf_const() const {return p_pdf; }; EST_DiscreteProbDistribution &pdf() {return p_pdf; }; double probability(const EST_String &w) const {return p_pdf.probability(w);} double probability(int w) const {return p_pdf.probability(w);} double frequency(const EST_String &w) const {return p_pdf.frequency(w);} double frequency(int w) const {return p_pdf.frequency(w);} const EST_String &most_probable(double *prob = NULL) const {return p_pdf.most_probable(prob);} friend ostream& operator<<(ostream& s, const EST_NgrammarState &a); }; class EST_BackoffNgrammarState { private: protected: int p_level; // = 0 for root node double backoff_weight; EST_DiscreteProbDistribution p_pdf; EST_StringTrie children; EST_BackoffNgrammarState* add_child(const EST_Discrete *d, const EST_StrVector &words); EST_BackoffNgrammarState* add_child(const EST_Discrete *d, const EST_IVector &words); public: EST_BackoffNgrammarState() { init(); }; EST_BackoffNgrammarState(const EST_Discrete *d,int level) {clear();init(d,level);}; EST_BackoffNgrammarState(const EST_DiscreteProbDistribution &pdf,int level) {clear();init(pdf,level);}; EST_BackoffNgrammarState(const EST_BackoffNgrammarState &s); EST_BackoffNgrammarState(const EST_BackoffNgrammarState *const s); ~EST_BackoffNgrammarState(); // initialise void clear(); void init(); void init(const EST_Discrete *d, int level); void init(const EST_DiscreteProbDistribution &pdf, int level); // build bool accumulate(const EST_StrVector &words, const double count=1); bool accumulate(const EST_IVector &words, const double count=1); // access const EST_DiscreteProbDistribution &pdf_const() const {return p_pdf; }; EST_DiscreteProbDistribution &pdf() {return p_pdf; }; double probability(const EST_String &w) const {return p_pdf.probability(w);} double frequency(const EST_String &w) const {return p_pdf.frequency(w);} const EST_String &most_probable(double *prob = NULL) const {return p_pdf.most_probable(prob);} const int level() const {return p_level;} EST_BackoffNgrammarState* get_child(const EST_String &word) const { return (EST_BackoffNgrammarState*)children.lookup(word); } EST_BackoffNgrammarState* get_child(const int word) const { return (EST_BackoffNgrammarState*)children.lookup(p_pdf.get_discrete()->name(word)); } void remove_child(EST_BackoffNgrammarState *child, const EST_String &name); // recursive delete of contents and children void zap(); const EST_BackoffNgrammarState *const get_state(const EST_StrVector &words) const; bool ngram_exists(const EST_StrVector &words, const double threshold) const; const double get_backoff_weight() const {return backoff_weight; } const double get_backoff_weight(const EST_StrVector &words) const; bool set_backoff_weight(const EST_StrVector &words, const double w); void frequency_of_frequencies(EST_DVector &ff); void print_freqs(ostream &os,const int order,EST_String followers=""); friend ostream& operator<<(ostream& s, const EST_BackoffNgrammarState &a); }; class EST_Ngrammar { public: // 3 representations : sparse, dense and backed off. User specifies which. enum representation_t {sparse, dense, backoff}; // now only keep frequencies (or log frequencies) // probabilities (or log probabilities) can be done // on the fly quickly enough enum entry_t {frequencies, log_frequencies}; protected: // each instance of an EST_Ngrammar is a grammar of fixed order // e.g. a bigram (order = 2) int p_order; int p_num_samples; double p_number_of_sentences; // which were used to build this grammar EST_String p_sentence_start_marker; EST_String p_sentence_end_marker; // only one representation in use at a time representation_t p_representation; entry_t p_entry_type; // sparse representation is a tree structure // holding only those N-grams which were seen EST_PredictionSuffixTree sparse_representation; bool init_sparse_representation(); // dense representation is just an array of all states bool init_dense_representation(); // backoff representation is also a tree structure // but the root state pdf is the most recent word in the // ngram and going down the tree is going back in time.... // here is the root node : EST_BackoffNgrammarState *backoff_representation; double backoff_threshold; // need a non-zero unigram floor to enable backing off double backoff_unigram_floor_freq; // instead of simple discounting, we have a (possibly) different // discount per order and per frequency // e.g. backoff_discount[2](4) contains the discount to be // applied to a trigram frequency of 4 // backoff_discount[0] is unused (we don't discount unigrams) EST_DVector *backoff_discount; const double get_backoff_discount(const int order, const double freq) const; bool init_backoff_representation(); void prune_backoff_representation(EST_BackoffNgrammarState *start_state=NULL); // remove any zero frequency branches void backoff_restore_unigram_states(); int p_num_states; // == p_vocab_size ^ (p_ord-1) if fully dense EST_NgrammarState *p_states; // state id is index into this array int find_dense_state_index(const EST_IVector &words, int index=0) const; // and the reverse const EST_StrVector &make_ngram_from_index(const int i) const; // vocabulary EST_Discrete *vocab; EST_Discrete *pred_vocab; // may be different from state vocab bool init_vocab(const EST_StrList &wordlist); bool init_vocab(const EST_StrList &word_list, const EST_StrList &pred_list); // make sure vocab matches a given wordlist bool check_vocab(const EST_StrList &wordlist); EST_DiscreteProbDistribution vocab_pdf; // overall pdf const EST_String &lastword(const EST_StrVector &words) const { return words(p_order-1); } const int lastword(const EST_IVector &words) const { return words(p_order-1); } // are we allowing out-of-vocabulary words, or is the vocabulary closed? bool allow_oov; bool sparse_to_dense(); bool dense_to_sparse(); // these aren't sorted yet ... void take_logs(); void take_exps(); void freqs_to_probs(); // just calls normalise bool build_sparse(const EST_String &filename, const EST_String &prev, const EST_String &prev_prev, const EST_String &last); // for dense and backoff bool build_ngram(const EST_String &filename, const EST_String &prev, const EST_String &prev_prev, const EST_String &last, const EST_String &input_format); // go through all matching ngrams ( *(ngram[i])="" matches anything ) void iterate(EST_StrVector &words, void (*function)(EST_Ngrammar *n, EST_StrVector &words, void *params), void *params); // same, but with a constant Ngrammar void const_iterate(EST_StrVector &words, void (*function)(const EST_Ngrammar *const n, EST_StrVector &words, void *params), void *params) const; bool p_init(int o, representation_t r); // new filename returned of we had to copy stdin to a // temporary file - must delete it later ! bool oov_preprocess(const EST_String &filename, EST_String &new_filename, const EST_String &what); const EST_NgrammarState &find_state_const(const EST_StrVector &words)const; EST_NgrammarState &find_state(const EST_StrVector &words); const EST_NgrammarState &find_state_const(const EST_IVector &words) const; EST_NgrammarState &find_state(const EST_IVector &words); // special versions for backoff grammars const EST_DiscreteProbDistribution &backoff_prob_dist(const EST_StrVector &words) const; const double backoff_reverse_probability_sub(const EST_StrVector &words, const EST_BackoffNgrammarState *root) const; const double backoff_probability(const EST_StrVector &words, const bool trace=false) const; const double backoff_reverse_probability(const EST_StrVector &words) const; const EST_String & backoff_most_probable(const EST_StrVector &words, double *prob = NULL) const; // backoff representation isn't a nice array of states // so use this to visit every node in the tree // and apply the function to that node void backoff_traverse(EST_BackoffNgrammarState *start_state, void (*function)(EST_BackoffNgrammarState *s, void *params), void *params); // visit every node at a given level void backoff_traverse(EST_BackoffNgrammarState *start_state, void (*function)(EST_BackoffNgrammarState *s, void *params), void *params, const int level); public: EST_Ngrammar() {default_values();} EST_Ngrammar(int o, representation_t r, const EST_StrList &wordlist) { default_values(); init(o,r,wordlist); } // When state trans vocab differs from prediction vocab EST_Ngrammar(int o, representation_t r, const EST_StrList &wordlist, const EST_StrList &predlist) { default_values(); init(o,r,wordlist,predlist); } EST_Ngrammar(int o, representation_t r, EST_Discrete &v) { default_values(); init(o,r,v); } ~EST_Ngrammar(); void default_values(); void clear(); bool init(int o, representation_t r, const EST_StrList &wordlist); bool init(int o, representation_t r, const EST_StrList &wordlist, const EST_StrList &predlist); bool init(int o, representation_t r, EST_Discrete &v); bool init(int o, representation_t r, EST_Discrete &v,EST_Discrete &pv); // access int num_states(void) const { return p_num_states;} double samples(void) const { return p_num_samples;} int order() const { return p_order; } int get_vocab_length() const { return vocab?vocab->length():0; } EST_String get_vocab_word(int i) const; int get_vocab_word(const EST_String &s) const; int get_pred_vocab_length() const { return pred_vocab->length(); } EST_String get_pred_vocab_word(int i) const { return pred_vocab->name(i); } int get_pred_vocab_word(const EST_String &s) const { return pred_vocab->name(s); } int closed_vocab() const {return !allow_oov; } entry_t entry_type() const {return p_entry_type;} representation_t representation() const { return p_representation;} // build bool build(const EST_StrList &filenames, const EST_String &prev = SENTENCE_START_MARKER, const EST_String &prev_prev = SENTENCE_END_MARKER, const EST_String &last = SENTENCE_END_MARKER, const EST_String &input_format = "", const EST_String &oov_mode = "", const int mincount=1, const int maxcount=10); // Accumulate ngrams void accumulate(const EST_StrVector &words, const double count=1); //const int index=0); void accumulate(const EST_IVector &words, const double count=1); //const int index=0); // hack - fix enter/exit probs s.t. P(...,!ENTER)=P(!EXIT,...)=0, for all x void make_htk_compatible(); // I/O functions EST_read_status load(const EST_String &filename); EST_read_status load(const EST_String &filename, const EST_StrList &wordlist); EST_write_status save(const EST_String &filename, const EST_String type="cstr_ascii", const bool trace=false, double floor=0.0); int wordlist_index(const EST_String &word, const bool report=true) const; const EST_String &wordlist_index(int i) const; int predlist_index(const EST_String &word) const; const EST_String &predlist_index(int i) const; // set bool set_entry_type(entry_t new_type); bool set_representation(representation_t new_representation); // probability distributions // ------------------------- // flag 'force' forces computation of probs on-the-fly if necessary double probability(const EST_StrVector &words, bool force=false, const bool trace=false) const; double frequency(const EST_StrVector &words, bool force=false, const bool trace=false) const; const EST_String &predict(const EST_StrVector &words, double *prob,int *state) const; const EST_String &predict(const EST_StrVector &words) const {double p; int state; return predict(words,&p,&state); } const EST_String &predict(const EST_StrVector &words,double *prob) const {int state; return predict(words,prob,&state); } const EST_String &predict(const EST_IVector &words,double *prob,int *state) const; const EST_String &predict(const EST_IVector &words) const {double p; int state; return predict(words,&p,&state); } const EST_String &predict(const EST_IVector &words,double *prob) const {int state; return predict(words,prob,&state); } int find_state_id(const EST_StrVector &words) const; int find_state_id(const EST_IVector &words) const; int find_next_state_id(int state, int word) const; // fast versions for common N //const double probability(const EST_String w1); //const double probability(const EST_String w1,const EST_String w2); //const double probability(const EST_String w1,const EST_String w2, //const EST_String w2); // reverse - probability of words[0..order-2] given word[order-1] double reverse_probability(const EST_StrVector &words, bool force=false) const; double reverse_probability(const EST_IVector &words, bool force=false) const; // predict, where words has 'order' elements and the last one is "" or NULL const EST_DiscreteProbDistribution &prob_dist(const EST_StrVector &words) const; const EST_DiscreteProbDistribution &prob_dist(const EST_IVector &words) const; const EST_DiscreteProbDistribution &prob_dist(int state) const; // bool stats(const EST_String filename, // double &raw_entropy, double &count, // double &entropy, double &perplexity, // const EST_String &prev = SENTENCE_START_MARKER, // const EST_String &prev_prev = SENTENCE_END_MARKER, // const EST_String &last = SENTENCE_END_MARKER, // const EST_String &input_format = "") const; void fill_window_start(EST_IVector &window, const EST_String &prev, const EST_String &prev_prev) const; void fill_window_start(EST_StrVector &window, const EST_String &prev, const EST_String &prev_prev) const; // why anybody would want to do this .... //EST_Ngrammar &operator =(const EST_Ngrammar &a); bool ngram_exists(const EST_StrVector &words) const; bool ngram_exists(const EST_StrVector &words, const double threshold) const; const double get_backoff_weight(const EST_StrVector &words) const; bool set_backoff_weight(const EST_StrVector &words, const double w); void print_freqs(ostream &os,double floor=0.0); // i/o functions // ------------- friend ostream& operator<<(ostream& s, EST_Ngrammar &n); friend EST_read_status load_ngram_htk_ascii(const EST_String filename, EST_Ngrammar &n); friend EST_read_status load_ngram_htk_binary(const EST_String filename, EST_Ngrammar &n); friend EST_read_status load_ngram_arpa(const EST_String filename, EST_Ngrammar &n, const EST_StrList &vocab); friend EST_read_status load_ngram_cstr_ascii(const EST_String filename, EST_Ngrammar &n); friend EST_read_status load_ngram_cstr_bin(const EST_String filename, EST_Ngrammar &n); friend EST_write_status save_ngram_htk_ascii_sub(const EST_String &word, ostream *ost, EST_Ngrammar &n, double floor); friend EST_write_status save_ngram_htk_ascii(const EST_String filename, EST_Ngrammar &n, double floor=0.0); //friend EST_write_status save_ngram_htk_binary(const EST_String filename, // EST_Ngrammar &n); friend EST_write_status save_ngram_cstr_ascii(const EST_String filename, EST_Ngrammar &n, const bool trace=false, double floor=0.0); friend EST_write_status save_ngram_cstr_bin(const EST_String filename, EST_Ngrammar &n, const bool trace=false, double floor=0.0); friend EST_write_status save_ngram_arpa(const EST_String filename, EST_Ngrammar &n); friend EST_write_status save_ngram_arpa_sub(ostream *ost, EST_Ngrammar &n, const EST_StrVector &words); friend EST_write_status save_ngram_wfst(const EST_String filename, EST_Ngrammar &n); // Auxiliary functions // smoothing friend void frequency_of_frequencies(EST_DVector &ff, EST_Ngrammar &n,int this_order=0); friend void map_frequencies(EST_Ngrammar &n, const EST_DVector &map, const int this_order=0); friend bool Good_Turing_smooth(EST_Ngrammar &n, int maxcount, int mincount=0); friend void Good_Turing_discount(EST_Ngrammar &ngrammar, const int maxcount, const double default_discount=0.5); friend void fs_build_backoff_ngrams(EST_Ngrammar *backoff_ngrams, EST_Ngrammar &ngram); friend int fs_backoff_smooth(EST_Ngrammar *backoff_ngrams, EST_Ngrammar &ngram, int smooth_thresh); // frequencies below mincount get backed off // frequencies above maxcount are not smoothed(discounted) bool compute_backoff_weights(const int mincount=1, const int maxcount=10); bool merge(EST_Ngrammar &n,float weight); friend class EST_BackoffNgrammar; }; void Ngram_freqsmooth(EST_Ngrammar &ngram, int smooth_thresh1, int smooth_thresh2); // utils void slide(EST_IVector &i, const int l); void slide(EST_StrVector &i, const int l); bool test_stats(EST_Ngrammar &ngram, const EST_String &filename, double &raw_entropy, double &count, double &entropy, double &perplexity, const EST_String &input_format, const EST_String &prev = SENTENCE_START_MARKER, const EST_String &prev_prev = SENTENCE_END_MARKER, const EST_String &last = SENTENCE_END_MARKER); VAL_REGISTER_CLASS_DCLS(ngrammar,EST_Ngrammar) #endif // __EST_NGRAMMAR_H__