/*************************************************************************/ /* */ /* Centre for Speech Technology Research */ /* University of Edinburgh, UK */ /* Copyright (c) 1996 */ /* All Rights Reserved. */ /* */ /* Permission is hereby granted, free of charge, to use and distribute */ /* this software and its documentation without restriction, including */ /* without limitation the rights to use, copy, modify, merge, publish, */ /* distribute, sublicense, and/or sell copies of this work, and to */ /* permit persons to whom this work is furnished to do so, subject to */ /* the following conditions: */ /* 1. The code must retain the above copyright notice, this list of */ /* conditions and the following disclaimer. */ /* 2. Any modifications must be clearly marked as such. */ /* 3. Original authors' names are not deleted. */ /* 4. The authors' names are not used to endorse or promote products */ /* derived from this software without specific prior written */ /* permission. */ /* */ /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */ /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */ /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */ /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */ /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */ /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */ /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */ /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */ /* THIS SOFTWARE. */ /* */ /*************************************************************************/ /* Author : Alan W Black */ /* Date : April 1996 */ /*-----------------------------------------------------------------------*/ /* */ /* A Tokenize class, both for Tokens (Strings plus alpha) */ /* EST_TokenStream for strings, FILE *, files, pipes etc */ /* */ /*=======================================================================*/ #include #include #include "EST_unix.h" #include #include #include #include "EST_math.h" #include "EST_Token.h" #include "EST_string_aux.h" #include "EST_cutils.h" #include "EST_error.h" const EST_String EST_Token_Default_WhiteSpaceChars = " \t\n\r"; const EST_String EST_Token_Default_SingleCharSymbols = "(){}[]"; const EST_String EST_Token_Default_PrePunctuationSymbols = "\"'`({["; const EST_String EST_Token_Default_PunctuationSymbols = "\"'`.,:;!?]})"; const EST_String Token_Origin_FD = "existing file descriptor"; const EST_String Token_Origin_Stream = "existing istream"; const EST_String Token_Origin_String = "existing string"; static EST_Regex RXanywhitespace("[ \t\n\r]"); static inline char *check_extend_str_in(char *str, int pos, int *max) { // Check we are not at the end of the string, if so get some more // and copy the old one into the new one char *newstuff; if (pos >= *max) { if (pos > *max) *max = 2 * pos; else *max *= 2; newstuff = new char[*max]; strncpy(newstuff,str,pos); delete [] str; return newstuff; } else return str; } #define check_extend_str(STR, POS, MAX) \ (((POS)>= *(MAX))?check_extend_str_in((STR),(POS),(MAX)):(STR)) ostream& operator<<(ostream& s, const EST_Token &p) { s << "[TOKEN " << p.pname << "]"; return s; } EST_Token &EST_Token::operator = (const EST_Token &a) { linenum = a.linenum; linepos = a.linepos; p_filepos = a.p_filepos; p_quoted = a.p_quoted; space = a.space; prepunc = a.prepunc; pname = a.pname; punc = a.punc; return *this; } const EST_String EST_Token::pos_description() const { return "line "+itoString(linenum)+" char "+itoString(linepos); } EST_Token &EST_Token::operator = (const EST_String &a) { pname = a; return *this; } EST_TokenStream::EST_TokenStream() { tok_wspacelen = 64; // will grow if necessary tok_wspace = new char[tok_wspacelen]; tok_stufflen = 512; // will grow if necessary tok_stuff = new char[tok_stufflen]; tok_prepuncslen = 32; // will grow if necessary tok_prepuncs = new char[tok_prepuncslen]; default_values(); } EST_TokenStream::EST_TokenStream(EST_TokenStream &s) { (void)s; cerr << "TokenStream: warning passing TokenStream not as reference" << endl; // You *really* shouldn't use this AT ALL unless you // fully understand its consequences, you'll be copying open // files and moving file pointers all over the place // basically *DON'T* do this, pass the stream by reference // Now there may be occasions when you do want to do this for example // when you need to do far look ahead or check point as you read // but they are obscure and I'm not sure how to do that for all // the file forms supported by the TokenStream. If you do // I can write a clone function that might do it. } void EST_TokenStream::default_values() { type = tst_none; peeked_tokp = FALSE; peeked_charp = FALSE; eof_flag = FALSE; quotes = FALSE; p_filepos = 0; linepos = 1; WhiteSpaceChars = EST_Token_Default_WhiteSpaceChars; SingleCharSymbols = EST_String::Empty; PrePunctuationSymbols = EST_String::Empty; PunctuationSymbols = EST_String::Empty; build_table(); close_at_end=TRUE; } EST_TokenStream::~EST_TokenStream() { if (type != tst_none) close(); delete [] tok_wspace; delete [] tok_stuff; delete [] tok_prepuncs; } ostream& operator<<(ostream& s, EST_TokenStream &p) { s << "[TOKENSTREAM "; switch (p.type) { case tst_none: cerr << "UNSET"; break; case tst_file: cerr << "FILE"; break; case tst_pipe: cerr << "PIPE"; break; case tst_istream: cerr << "ISTREAM"; break; case tst_string: cerr << "STRING"; break; default: cerr << "UNKNOWN" << endl; } s << "]"; return s; } int EST_TokenStream::open(const EST_String &filename) { if (type != tst_none) close(); default_values(); fp = fopen(filename,"rb"); if (fp == NULL) { cerr << "Cannot open file " << filename << " as tokenstream" << endl; return -1; } Origin = filename; type = tst_file; return 0; } int EST_TokenStream::open(FILE *ofp, int close_when_finished) { // absorb already open stream if (type != tst_none) close(); default_values(); fp = ofp; if (fp == NULL) { cerr << "Cannot absorb NULL filestream as tokenstream" << endl; return -1; } Origin = Token_Origin_FD; type = tst_file; close_at_end = close_when_finished; return 0; } int EST_TokenStream::open(istream &newis) { // absorb already open istream if (type != tst_none) close(); default_values(); is = &newis; Origin = Token_Origin_Stream; type = tst_istream; return 0; } int EST_TokenStream::open_string(const EST_String &newbuffer) { // Make a tokenstream from an internal existing string/buffer const char *buf; if (type != tst_none) close(); default_values(); buf = (const char *)newbuffer; buffer_length = newbuffer.length(); buffer = new char[buffer_length+1]; memmove(buffer,buf,buffer_length+1); pos = 0; Origin = Token_Origin_String; type = tst_string; return 0; } int EST_TokenStream::seek_end() { // This isn't actually useful but people expect it peeked_charp = FALSE; peeked_tokp = FALSE; switch (type) { case tst_none: cerr << "EST_TokenStream unset" << endl; return -1; break; case tst_file: fseek(fp,0,SEEK_END); p_filepos = ftell(fp); return p_filepos; case tst_pipe: cerr << "EST_TokenStream seek on pipe not supported" << endl; return -1; break; case tst_istream: cerr << "EST_TokenStream seek on istream not yet supported" << endl; return -1; break; case tst_string: pos = buffer_length; return pos; default: cerr << "EST_TokenStream: unknown type" << endl; return -1; } return -1; // can't get here } int EST_TokenStream::seek(int position) { peeked_charp = FALSE; peeked_tokp = FALSE; switch (type) { case tst_none: cerr << "EST_TokenStream unset" << endl; return -1; break; case tst_file: p_filepos = position; return fseek(fp,position,SEEK_SET); case tst_pipe: cerr << "EST_TokenStream seek on pipe not supported" << endl; return -1; break; case tst_istream: cerr << "EST_TokenStream seek on istream not yet supported" << endl; return -1; break; case tst_string: if (position >= pos) { pos = position; return -1; } else { pos = position; return 0; } break; default: cerr << "EST_TokenStream: unknown type" << endl; return -1; } return -1; // can't get here } static int stdio_fread(void *buff,int size,int nitems,FILE *fp) { // So it can find the stdio one rather than the TokenStream one return fread(buff,size,nitems,fp); } int EST_TokenStream::fread(void *buff, int size, int nitems) { // switching into binary mode for current position int items_read; // so we can continue to read afterwards if (peeked_tokp) { cerr << "ERROR " << pos_description() << " peeked into binary data" << endl; return 0; } peeked_charp = FALSE; peeked_tokp = FALSE; switch (type) { case tst_none: cerr << "EST_TokenStream unset" << endl; return 0; break; case tst_file: items_read = stdio_fread(buff,(size_t)size,(size_t)nitems,fp); p_filepos += items_read*size; return items_read; case tst_pipe: cerr << "EST_TokenStream fread pipe not yet supported" << endl; return 0; break; case tst_istream: cerr << "EST_TokenStream fread istream not yet supported" << endl; return 0; case tst_string: if ((buffer_length-pos)/size < nitems) items_read = (buffer_length-pos)/size; else items_read = nitems; memcpy(buff,&buffer[pos],items_read*size); pos += items_read*size; return items_read; default: cerr << "EST_TokenStream: unknown type" << endl; return EOF; } return 0; // can't get here } void EST_TokenStream::close(void) { // close any files (if they were used) switch (type) { case tst_none: break; case tst_file: if (close_at_end) fclose(fp); case tst_pipe: // close(fd); break; case tst_istream: break; case tst_string: delete [] buffer; buffer = 0; break; default: cerr << "EST_TokenStream: unknown type" << endl; break; } type = tst_none; peeked_charp = FALSE; peeked_tokp = FALSE; } int EST_TokenStream::restart(void) { // For paul, the only person I know who uses this switch (type) { case tst_none: break; case tst_file: fp = freopen(Origin,"rb",fp); p_filepos = 0; break; case tst_pipe: cerr << "EST_TokenStream: can't rewind pipe" << endl; return -1; break; case tst_istream: cerr << "EST_TokenStream: can't rewind istream" << endl; break; case tst_string: pos = 0; break; default: cerr << "EST_TokenStream: unknown type" << endl; break; } linepos = 1; peeked_charp = FALSE; peeked_tokp = FALSE; eof_flag = FALSE; return 0; } EST_TokenStream & EST_TokenStream::operator >>(EST_Token &p) { return get(p); } EST_TokenStream & EST_TokenStream::operator >>(EST_String &p) { EST_Token t; get(t); p = t.string(); return *this; } EST_TokenStream &EST_TokenStream::get(EST_Token &tok) { tok = get(); return *this; } EST_Token EST_TokenStream::get_upto(const EST_String &s) { // Returns a concatenated token form here to next symbol that matches s // including s (though not adding s on the result) // Not really for the purist but lots of times very handy // Note this is not very efficient EST_String result; EST_Token t; for (result=EST_String::Empty; (t=get()) != s; ) { result += t.whitespace() + t.prepunctuation() + t.string() + t.punctuation(); if (eof()) { cerr << "EST_TokenStream: end of file when looking for \"" << s << "\"" << endl; break; } } return EST_Token(result); } EST_Token EST_TokenStream::get_upto_eoln(void) { // Swallow the lot up to end of line // assumes \n is a whitespace character EST_String result(EST_String::Empty); while (!eoln()) { EST_Token &t=get(); result += t.whitespace() + t.prepunctuation(); if (quotes) result += quote_string(t.string()); else result += t.string(); result += t.punctuation(); if (eof()) { // cerr << "EST_TokenStream: end of file when looking for end of line" // << endl; break; } } // So that the next call works I have to step over the eoln condition // That involves removing the whitespace upto and including the next // \n in the peek token. char *w = wstrdup(peek().whitespace()); int i; for (i=0; w[i] != 0; i++) if (w[i] == '\n') // maybe not portable peek().set_whitespace(&w[i+1]); wfree(w); static EST_Token result_t; result_t.set_token(result); return result_t; } EST_Token &EST_TokenStream::must_get(EST_String expected, bool *ok) { EST_Token &tok = get(); if (tok != expected) { if (ok != NULL) { *ok=FALSE; return tok; } else EST_error("Expected '%s' got '%s' at %s", (const char *)expected, (const char *)(EST_String)tok, (const char *)pos_description()); } if (ok != NULL) *ok=TRUE; return tok; } void EST_TokenStream::build_table() { int i; const char *p; unsigned char c; for (i=0; i<256; ++i) p_table[i]=0; for (p=WhiteSpaceChars; *p; ++p) if (p_table[c=(unsigned char)*p]) EST_warning("Character '%c' has two classes, '%c' and '%c'", *p, c, ' '); else p_table[c] = ' '; for (p=SingleCharSymbols; *p; ++p) if (p_table[c=(unsigned char)*p]) EST_warning("Character '%c' has two classes, '%c' and '%c'", *p, p_table[c], '!'); else p_table[c] = '@'; for (p=PunctuationSymbols; *p; ++p) if (p_table[c=(unsigned char)*p] == '@') continue; else if (p_table[c]) EST_warning("Character '%c' has two classes, '%c' and '%c'", *p, p_table[c], '.'); else p_table[c] = '.'; for(p=PrePunctuationSymbols; *p; ++p) if (p_table[c=(unsigned char)*p] == '@') continue; else if (p_table[c] == '.') p_table[c] = '"'; else if (p_table[c]) EST_warning("Character '%c' has two classes, '%c' and '%c'", *p, p_table[c], '$'); else p_table[c] = '$'; p_table_wrong=0; } inline int EST_TokenStream::getpeeked_internal(void) { peeked_charp = FALSE; return peeked_char; } inline int EST_TokenStream::getch_internal() { // Return next character in stream if (EST_TokenStream::peeked_charp) { return getpeeked_internal(); } switch (type) { case tst_none: cerr << "EST_TokenStream unset" << endl; return EOF; break; case tst_file: p_filepos++; { char lc; if (stdio_fread(&lc,1,1,fp) == 0) return EOF; else return (int)lc; } /* return getc(fp); */ case tst_pipe: cerr << "EST_TokenStream pipe not yet supported" << endl; return EOF; break; case tst_istream: p_filepos++; return is->get(); case tst_string: if (pos < buffer_length) { p_filepos++; return buffer[pos++]; } else return EOF; default: cerr << "EST_TokenStream: unknown type" << endl; return EOF; } return EOF; // can't get here } int EST_TokenStream::getch(void) { return getch_internal(); } inline int EST_TokenStream::peekch_internal() { // Return next character in stream (without reading it) if (!peeked_charp) peeked_char = getch_internal(); peeked_charp = TRUE; return peeked_char; } int EST_TokenStream::peekch(void) { return peekch_internal(); } #define CLASS(C,CL) (p_table[(unsigned char)(C)]==(CL)) #define CLASS2(C,CL1,CL2) (p_table[(unsigned char)(C)]==(CL1)||p_table[(unsigned char)(C)]==(CL2)) EST_Token &EST_TokenStream::get(void) { if (peeked_tokp) { peeked_tokp = FALSE; return current_tok; } if (p_table_wrong) build_table(); char *word; int c,i,j; for (i=0; (CLASS(c=getch_internal(),' ') && ( c != EOF )); i++) { if (c == '\n') linepos++; tok_wspace = check_extend_str(tok_wspace,i,&tok_wspacelen); tok_wspace[i] = c; } tok_wspace[i] = '\0'; current_tok.init(); if (c != EOF) { current_tok.set_filepos(p_filepos-1); if ((quotes) && // quoted strings (with escapes) are allowed (c == quote)) { for (i=0; ((c = getch_internal()) != EOF) ;) { if (c == quote) break; tok_stuff = check_extend_str(tok_stuff,i,&tok_stufflen); if (c == escape) c = getch_internal(); tok_stuff[i++] = c; } current_tok.set_quoted(TRUE); } else // standard whitespace separated tokens { for (i=0,tok_stuff[i++]=c; ( !CLASS(c,'@') && !CLASS(c=peekch_internal(),' ') && !CLASS(c,'@') && ( c != EOF )) ;) { tok_stuff = check_extend_str(tok_stuff,i,&tok_stufflen); // note, we must have peeked to get here. tok_stuff[i++] = getpeeked_internal(); } } tok_stuff[i] = '\0'; // Are there any punctuation symbols at the start? for (j=0; ((j < i) && CLASS2(tok_stuff[j], '$', '"')); j++); if ((j > 0) && (j < i)) // there are { tok_prepuncs = check_extend_str(tok_prepuncs,j+1,&tok_prepuncslen); memmove(tok_prepuncs,tok_stuff,j); tok_prepuncs[j] = '\0'; current_tok.set_prepunctuation(tok_prepuncs); word=&tok_stuff[j]; i-=j; // reduce size by number of prepuncs } else { current_tok.set_prepunctuation(EST_String::Empty); word = tok_stuff; } // Are there any punctuation symbols at the end for (j=i-1; ((j > 0) && CLASS2(word[j],'.','"')); j--); if (word[j+1] != '\0') { current_tok.set_punctuation(&word[j+1]); word[j+1] = '\0'; } else current_tok.set_punctuation(EST_String::Empty); current_tok.set_token(word); if (tok_wspace[0] == '\0') // feature paths will have null whitespace current_tok.set_whitespace(EST_String::Empty); else current_tok.set_whitespace(tok_wspace); } else { current_tok.set_token(EST_String::Empty); current_tok.set_whitespace(tok_wspace); current_tok.set_punctuation(EST_String::Empty); current_tok.set_prepunctuation(EST_String::Empty); eof_flag = TRUE; } return current_tok; } int EST_TokenStream::eoln(void) { // This doesn't really work if there are blank lines (and you want // to know about them) if ((peek().whitespace().contains("\n")) || eof()) return TRUE; else return FALSE; } EST_String quote_string(const EST_String &s, const EST_String "e, const EST_String &escape, int force) { // Quotes s always if force true, or iff s contains whitespace, // quotes or escapes force is false // Note quote and escape are assumed to be string of length 1 EST_String quoted_form; if ((force) || (s.contains(quote)) || (s.contains(escape)) || (s.contains(RXanywhitespace)) || (s.length() == 0)) { // bigger than the quoted form could ever be int i,j; char *quoted = new char[s.length()*(quote.length()+escape.length())+ 1+quote.length()+quote.length()]; quoted[0] = quote(0); for (i=1,j=0; j < s.length(); j++,i++) { if (s(j) == quote(0)) quoted[i++] = escape(0); else if (s(j) == escape(0)) quoted[i++] = escape(0); quoted[i] = s(j); } quoted[i++] = quote(0); quoted[i] = '\0'; quoted_form = quoted; delete [] quoted; return quoted_form; } else return s; } const EST_String EST_TokenStream::pos_description() { return Origin+":"+itoString(linepos); }