/* xgettext Scheme backend. Copyright (C) 2004-2009 Free Software Foundation, Inc. This file was written by Bruno Haible , 2004-2005. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . */ #ifdef HAVE_CONFIG_H # include "config.h" #endif /* Specification. */ #include "x-scheme.h" #include #include #include #include #include #include "message.h" #include "xgettext.h" #include "error.h" #include "xalloc.h" #include "hash.h" #include "gettext.h" #define _(s) gettext(s) /* The Scheme syntax is described in R5RS. It is implemented in guile-1.6.4/libguile/read.c. Since we are interested only in strings and in forms similar to (gettext msgid ...) or (ngettext msgid msgid_plural ...) we make the following simplifications: - Assume the keywords and strings are in an ASCII compatible encoding. This means we can read the input file one byte at a time, instead of one character at a time. No need to worry about multibyte characters: If they occur as part of identifiers, they most probably act as constituent characters, and the byte based approach will do the same. - Assume the read-hash-procedures is in the default state. Non-standard reader extensions are mostly used to read data, not programs. The remaining syntax rules are: - The syntax code assigned to each character, and how tokens are built up from characters (single escape, multiple escape etc.). - Comment syntax: ';' and '#! ... \n!#\n'. - String syntax: "..." with single escapes. - Read macros and dispatch macro character '#'. Needed to be able to tell which is the n-th argument of a function call. */ /* ====================== Keyword set customization. ====================== */ /* If true extract all strings. */ static bool extract_all = false; static hash_table keywords; static bool default_keywords = true; void x_scheme_extract_all () { extract_all = true; } void x_scheme_keyword (const char *name) { if (name == NULL) default_keywords = false; else { const char *end; struct callshape shape; const char *colon; if (keywords.table == NULL) hash_init (&keywords, 100); split_keywordspec (name, &end, &shape); /* The characters between name and end should form a valid Lisp symbol. Extract the symbol name part. */ colon = strchr (name, ':'); if (colon != NULL && colon < end) { name = colon + 1; if (name < end && *name == ':') name++; colon = strchr (name, ':'); if (colon != NULL && colon < end) return; } insert_keyword_callshape (&keywords, name, end - name, &shape); } } /* Finish initializing the keywords hash table. Called after argument processing, before each file is processed. */ static void init_keywords () { if (default_keywords) { /* When adding new keywords here, also update the documentation in xgettext.texi! */ x_scheme_keyword ("gettext"); /* libguile/i18n.c */ x_scheme_keyword ("ngettext:1,2"); /* libguile/i18n.c */ x_scheme_keyword ("gettext-noop"); default_keywords = false; } } void init_flag_table_scheme () { xgettext_record_flag ("gettext:1:pass-scheme-format"); xgettext_record_flag ("ngettext:1:pass-scheme-format"); xgettext_record_flag ("ngettext:2:pass-scheme-format"); xgettext_record_flag ("gettext-noop:1:pass-scheme-format"); xgettext_record_flag ("format:2:scheme-format"); } /* ======================== Reading of characters. ======================== */ /* Real filename, used in error messages about the input file. */ static const char *real_file_name; /* Logical filename and line number, used to label the extracted messages. */ static char *logical_file_name; static int line_number; /* The input file stream. */ static FILE *fp; /* Fetch the next character from the input file. */ static int do_getc () { int c = getc (fp); if (c == EOF) { if (ferror (fp)) error (EXIT_FAILURE, errno, _("\ error while reading \"%s\""), real_file_name); } else if (c == '\n') line_number++; return c; } /* Put back the last fetched character, not EOF. */ static void do_ungetc (int c) { if (c == '\n') line_number--; ungetc (c, fp); } /* ========================== Reading of tokens. ========================== */ /* A token consists of a sequence of characters. */ struct token { int allocated; /* number of allocated 'token_char's */ int charcount; /* number of used 'token_char's */ char *chars; /* the token's constituents */ }; /* Initialize a 'struct token'. */ static inline void init_token (struct token *tp) { tp->allocated = 10; tp->chars = XNMALLOC (tp->allocated, char); tp->charcount = 0; } /* Free the memory pointed to by a 'struct token'. */ static inline void free_token (struct token *tp) { free (tp->chars); } /* Ensure there is enough room in the token for one more character. */ static inline void grow_token (struct token *tp) { if (tp->charcount == tp->allocated) { tp->allocated *= 2; tp->chars = (char *) xrealloc (tp->chars, tp->allocated * sizeof (char)); } } /* Read the next token. 'first' is the first character, which has already been read. */ static void read_token (struct token *tp, int first) { init_token (tp); grow_token (tp); tp->chars[tp->charcount++] = first; for (;;) { int c = do_getc (); if (c == EOF) break; if (c == ' ' || c == '\r' || c == '\f' || c == '\t' || c == '\n' || c == '"' || c == '(' || c == ')' || c == ';') { do_ungetc (c); break; } grow_token (tp); tp->chars[tp->charcount++] = c; } } /* Tests if a token represents an integer. Taken from guile-1.6.4/libguile/numbers.c:scm_istr2int(). */ static inline bool is_integer_syntax (const char *str, int len, int radix) { const char *p = str; const char *p_end = str + len; /* The accepted syntax is ['+'|'-'] DIGIT+ where DIGIT is a hexadecimal digit whose value is below radix. */ if (p == p_end) return false; if (*p == '+' || *p == '-') { p++; if (p == p_end) return false; } do { int c = *p++; if (c >= '0' && c <= '9') c = c - '0'; else if (c >= 'A' && c <= 'F') c = c - 'A' + 10; else if (c >= 'a' && c <= 'f') c = c - 'a' + 10; else return false; if (c >= radix) return false; } while (p < p_end); return true; } /* Tests if a token represents a rational, floating-point or complex number. If unconstrained is false, only real numbers are accepted; otherwise, complex numbers are accepted as well. Taken from guile-1.6.4/libguile/numbers.c:scm_istr2flo(). */ static inline bool is_other_number_syntax (const char *str, int len, int radix, bool unconstrained) { const char *p = str; const char *p_end = str + len; bool seen_sign; bool seen_digits; /* The accepted syntaxes are: for a floating-point number: ['+'|'-'] DIGIT+ [EXPONENT] ['+'|'-'] DIGIT* '.' DIGIT+ [EXPONENT] where EXPONENT ::= ['d'|'e'|'f'|'l'|'s'] DIGIT+ (Dot and exponent are allowed only if radix is 10.) for a rational number: ['+'|'-'] DIGIT+ '/' DIGIT+ for a complex number: REAL-NUMBER {'+'|'-'} REAL-NUMBER-WITHOUT-SIGN 'i' REAL-NUMBER {'+'|'-'} 'i' {'+'|'-'} REAL-NUMBER-WITHOUT-SIGN 'i' {'+'|'-'} 'i' REAL-NUMBER '@' REAL-NUMBER */ if (p == p_end) return false; /* Parse leading sign. */ seen_sign = false; if (*p == '+' || *p == '-') { p++; if (p == p_end) return false; seen_sign = true; /* Recognize complex number syntax: {'+'|'-'} 'i' */ if (unconstrained && (*p == 'I' || *p == 'i') && p + 1 == p_end) return true; } /* Parse digits before dot or exponent or slash. */ seen_digits = false; do { int c = *p; if (c >= '0' && c <= '9') c = c - '0'; else if (c >= 'A' && c <= 'F') { if (c >= 'D' && radix == 10) /* exponent? */ break; c = c - 'A' + 10; } else if (c >= 'a' && c <= 'f') { if (c >= 'd' && radix == 10) /* exponent? */ break; c = c - 'a' + 10; } else break; if (c >= radix) return false; seen_digits = true; p++; } while (p < p_end); /* If p == p_end, we know that seen_digits = true, and the number is an integer without exponent. */ if (p < p_end) { /* If we have no digits so far, we need a decimal point later. */ if (!seen_digits && !(*p == '.' && radix == 10)) return false; /* Trailing '#' signs are equivalent to zeroes. */ while (p < p_end && *p == '#') p++; if (p < p_end) { if (*p == '/') { /* Parse digits after the slash. */ bool all_zeroes = true; p++; for (; p < p_end; p++) { int c = *p; if (c >= '0' && c <= '9') c = c - '0'; else if (c >= 'A' && c <= 'F') c = c - 'A' + 10; else if (c >= 'a' && c <= 'f') c = c - 'a' + 10; else break; if (c >= radix) return false; if (c != 0) all_zeroes = false; } /* A zero denominator is not allowed. */ if (all_zeroes) return false; /* Trailing '#' signs are equivalent to zeroes. */ while (p < p_end && *p == '#') p++; } else { if (*p == '.') { /* Decimal point notation. */ if (radix != 10) return false; /* Parse digits after the decimal point. */ p++; for (; p < p_end; p++) { int c = *p; if (c >= '0' && c <= '9') seen_digits = true; else break; } /* Digits are required before or after the decimal point. */ if (!seen_digits) return false; /* Trailing '#' signs are equivalent to zeroes. */ while (p < p_end && *p == '#') p++; } if (p < p_end) { /* Parse exponent. */ switch (*p) { case 'D': case 'd': case 'E': case 'e': case 'F': case 'f': case 'L': case 'l': case 'S': case 's': if (radix != 10) return false; p++; if (p == p_end) return false; if (*p == '+' || *p == '-') { p++; if (p == p_end) return false; } if (!(*p >= '0' && *p <= '9')) return false; for (;;) { p++; if (p == p_end) break; if (!(*p >= '0' && *p <= '9')) break; } break; default: break; } } } } } if (p == p_end) return true; /* Recognize complex number syntax. */ if (unconstrained) { /* Recognize the syntax {'+'|'-'} REAL-NUMBER-WITHOUT-SIGN 'i' */ if (seen_sign && (*p == 'I' || *p == 'i') && p + 1 == p_end) return true; /* Recognize the syntaxes REAL-NUMBER {'+'|'-'} REAL-NUMBER-WITHOUT-SIGN 'i' REAL-NUMBER {'+'|'-'} 'i' */ if (*p == '+' || *p == '-') return (p_end[-1] == 'I' || p_end[-1] == 'i') && (p + 1 == p_end - 1 || is_other_number_syntax (p, p_end - 1 - p, radix, false)); /* Recognize the syntax REAL-NUMBER '@' REAL-NUMBER */ if (*p == '@') { p++; return is_other_number_syntax (p, p_end - p, radix, false); } } return false; } /* Tests if a token represents a number. Taken from guile-1.6.4/libguile/numbers.c:scm_istring2number(). */ static bool is_number (const struct token *tp) { const char *str = tp->chars; int len = tp->charcount; int radix = 10; enum { unknown, exact, inexact } exactness = unknown; bool seen_radix_prefix = false; bool seen_exactness_prefix = false; if (len == 1) if (*str == '+' || *str == '-') return false; while (len >= 2 && *str == '#') { switch (str[1]) { case 'B': case 'b': if (seen_radix_prefix) return false; radix = 2; seen_radix_prefix = true; break; case 'O': case 'o': if (seen_radix_prefix) return false; radix = 8; seen_radix_prefix = true; break; case 'D': case 'd': if (seen_radix_prefix) return false; radix = 10; seen_radix_prefix = true; break; case 'X': case 'x': if (seen_radix_prefix) return false; radix = 16; seen_radix_prefix = true; break; case 'E': case 'e': if (seen_exactness_prefix) return false; exactness = exact; seen_exactness_prefix = true; break; case 'I': case 'i': if (seen_exactness_prefix) return false; exactness = inexact; seen_exactness_prefix = true; break; default: return false; } str += 2; len -= 2; } if (exactness != inexact) { /* Try to parse an integer. */ if (is_integer_syntax (str, len, 10)) return true; /* FIXME: Other Scheme implementations support exact rational numbers or exact complex numbers. */ } if (exactness != exact) { /* Try to parse a rational, floating-point or complex number. */ if (is_other_number_syntax (str, len, 10, true)) return true; } return false; } /* ========================= Accumulating comments ========================= */ static char *buffer; static size_t bufmax; static size_t buflen; static inline void comment_start () { buflen = 0; } static inline void comment_add (int c) { if (buflen >= bufmax) { bufmax = 2 * bufmax + 10; buffer = xrealloc (buffer, bufmax); } buffer[buflen++] = c; } static inline void comment_line_end (size_t chars_to_remove) { buflen -= chars_to_remove; while (buflen >= 1 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t')) --buflen; if (chars_to_remove == 0 && buflen >= bufmax) { bufmax = 2 * bufmax + 10; buffer = xrealloc (buffer, bufmax); } buffer[buflen] = '\0'; savable_comment_add (buffer); } /* These are for tracking whether comments count as immediately before keyword. */ static int last_comment_line; static int last_non_comment_line; /* ========================= Accumulating messages ========================= */ static message_list_ty *mlp; /* ========================== Reading of objects. ========================= */ /* We are only interested in symbols (e.g. gettext or ngettext) and strings. Other objects need not to be represented precisely. */ enum object_type { t_symbol, /* symbol */ t_string, /* string */ t_other, /* other kind of real object */ t_dot, /* '.' pseudo object */ t_close, /* ')' pseudo object */ t_eof /* EOF marker */ }; struct object { enum object_type type; struct token *token; /* for t_symbol and t_string */ int line_number_at_start; /* for t_string */ }; /* Free the memory pointed to by a 'struct object'. */ static inline void free_object (struct object *op) { if (op->type == t_symbol || op->type == t_string) { free_token (op->token); free (op->token); } } /* Convert a t_symbol/t_string token to a char*. */ static char * string_of_object (const struct object *op) { char *str; int n; if (!(op->type == t_symbol || op->type == t_string)) abort (); n = op->token->charcount; str = XNMALLOC (n + 1, char); memcpy (str, op->token->chars, n); str[n] = '\0'; return str; } /* Context lookup table. */ static flag_context_list_table_ty *flag_context_list_table; /* Read the next object. */ static void read_object (struct object *op, flag_context_ty outer_context) { for (;;) { int c = do_getc (); switch (c) { case EOF: op->type = t_eof; return; case ' ': case '\r': case '\f': case '\t': continue; case '\n': /* Comments assumed to be grouped with a message must immediately precede it, with no non-whitespace token on a line between both. */ if (last_non_comment_line > last_comment_line) savable_comment_reset (); continue; case ';': { bool all_semicolons = true; last_comment_line = line_number; comment_start (); for (;;) { c = do_getc (); if (c == EOF || c == '\n') break; if (c != ';') all_semicolons = false; if (!all_semicolons) { /* We skip all leading white space, but not EOLs. */ if (!(buflen == 0 && (c == ' ' || c == '\t'))) comment_add (c); } } comment_line_end (0); continue; } case '(': { int arg = 0; /* Current argument number. */ flag_context_list_iterator_ty context_iter; const struct callshapes *shapes = NULL; struct arglist_parser *argparser = NULL; for (;; arg++) { struct object inner; flag_context_ty inner_context; if (arg == 0) inner_context = null_context; else inner_context = inherited_context (outer_context, flag_context_list_iterator_advance ( &context_iter)); read_object (&inner, inner_context); /* Recognize end of list. */ if (inner.type == t_close) { op->type = t_other; last_non_comment_line = line_number; if (argparser != NULL) arglist_parser_done (argparser, arg); return; } /* Dots are not allowed in every position. But be tolerant. */ /* EOF inside list is illegal. But be tolerant. */ if (inner.type == t_eof) break; if (arg == 0) { /* This is the function position. */ if (inner.type == t_symbol) { char *symbol_name = string_of_object (&inner); void *keyword_value; if (hash_find_entry (&keywords, symbol_name, strlen (symbol_name), &keyword_value) == 0) shapes = (const struct callshapes *) keyword_value; argparser = arglist_parser_alloc (mlp, shapes); context_iter = flag_context_list_iterator ( flag_context_list_table_lookup ( flag_context_list_table, symbol_name, strlen (symbol_name))); free (symbol_name); } else context_iter = null_context_list_iterator; } else { /* These are the argument positions. */ if (argparser != NULL && inner.type == t_string) arglist_parser_remember (argparser, arg, string_of_object (&inner), inner_context, logical_file_name, inner.line_number_at_start, savable_comment); } free_object (&inner); } if (argparser != NULL) arglist_parser_done (argparser, arg); } op->type = t_other; last_non_comment_line = line_number; return; case ')': /* Tell the caller about the end of list. Unmatched closing parenthesis is illegal. But be tolerant. */ op->type = t_close; last_non_comment_line = line_number; return; case ',': { int c = do_getc (); /* The ,@ handling inside lists is wrong anyway, because ,@form expands to an unknown number of elements. */ if (c != EOF && c != '@') do_ungetc (c); } /*FALLTHROUGH*/ case '\'': case '`': { struct object inner; read_object (&inner, null_context); /* Dots and EOF are not allowed here. But be tolerant. */ free_object (&inner); op->type = t_other; last_non_comment_line = line_number; return; } case '#': /* Dispatch macro handling. */ { c = do_getc (); if (c == EOF) /* Invalid input. Be tolerant, no error message. */ { op->type = t_other; return; } switch (c) { case '(': /* Vector */ do_ungetc (c); { struct object inner; read_object (&inner, null_context); /* Dots and EOF are not allowed here. But be tolerant. */ free_object (&inner); op->type = t_other; last_non_comment_line = line_number; return; } case 'T': case 't': /* Boolean true */ case 'F': case 'f': /* Boolean false */ op->type = t_other; last_non_comment_line = line_number; return; case 'B': case 'b': case 'O': case 'o': case 'D': case 'd': case 'X': case 'x': case 'E': case 'e': case 'I': case 'i': { struct token token; do_ungetc (c); read_token (&token, '#'); if (is_number (&token)) { /* A number. */ free_token (&token); op->type = t_other; last_non_comment_line = line_number; return; } else { if (token.charcount == 2 && (token.chars[1] == 'e' || token.chars[1] == 'i')) { c = do_getc (); if (c != EOF) do_ungetc (c); if (c == '(') /* Homogenous vector syntax, see arrays.scm. */ case 'a': /* Vectors of char */ case 'c': /* Vectors of complex */ /*case 'e':*/ /* Vectors of long */ case 'h': /* Vectors of short */ /*case 'i':*/ /* Vectors of double-float */ case 'l': /* Vectors of long long */ case 's': /* Vectors of single-float */ case 'u': /* Vectors of unsigned long */ case 'y': /* Vectors of byte */ { struct object inner; read_object (&inner, null_context); /* Dots and EOF are not allowed here. But be tolerant. */ free_token (&token); free_object (&inner); op->type = t_other; last_non_comment_line = line_number; return; } } /* Unknown # object. But be tolerant. */ free_token (&token); op->type = t_other; last_non_comment_line = line_number; return; } } case '!': /* Block comment '#! ... \n!#\n'. We don't extract it because it's only used to introduce scripts on Unix. */ { int last1 = 0; int last2 = 0; int last3 = 0; for (;;) { c = do_getc (); if (c == EOF) /* EOF is not allowed here. But be tolerant. */ break; if (last3 == '\n' && last2 == '!' && last1 == '#' && c == '\n') break; last3 = last2; last2 = last1; last1 = c; } continue; } case '*': /* Bit vector. */ { struct token token; read_token (&token, c); /* The token should consists only of '0' and '1', except for the initial '*'. But be tolerant. */ free_token (&token); op->type = t_other; last_non_comment_line = line_number; return; } case '{': /* Symbol with multiple escapes: #{...}# */ { op->token = XMALLOC (struct token); init_token (op->token); for (;;) { c = do_getc (); if (c == EOF) break; if (c == '\\') { c = do_getc (); if (c == EOF) break; } else if (c == '}') { c = do_getc (); if (c == '#') break; if (c != EOF) do_ungetc (c); c = '}'; } grow_token (op->token); op->token->chars[op->token->charcount++] = c; } op->type = t_symbol; last_non_comment_line = line_number; return; } case '\\': /* Character. */ { struct token token; c = do_getc (); if (c != EOF) { read_token (&token, c); free_token (&token); } op->type = t_other; last_non_comment_line = line_number; return; } case ':': /* Keyword. */ case '&': /* Deprecated keyword, installed in optargs.scm. */ { struct token token; read_token (&token, '-'); free_token (&token); op->type = t_other; last_non_comment_line = line_number; return; } /* The following are installed through read-hash-extend. */ /* arrays.scm */ case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': /* Multidimensional array syntax: #nx(...) where n ::= DIGIT+ x ::= {'a'|'b'|'c'|'e'|'i'|'s'|'u'} */ do c = do_getc (); while (c >= '0' && c <= '9'); /* c should be one of {'a'|'b'|'c'|'e'|'i'|'s'|'u'}. But be tolerant. */ /*FALLTHROUGH*/ case '\'': /* boot-9.scm */ case '.': /* boot-9.scm */ case ',': /* srfi-10.scm */ { struct object inner; read_object (&inner, null_context); /* Dots and EOF are not allowed here. But be tolerant. */ free_object (&inner); op->type = t_other; last_non_comment_line = line_number; return; } default: /* Unknown. */ op->type = t_other; last_non_comment_line = line_number; return; } /*NOTREACHED*/ abort (); } case '"': { op->token = XMALLOC (struct token); init_token (op->token); op->line_number_at_start = line_number; for (;;) { int c = do_getc (); if (c == EOF) /* Invalid input. Be tolerant, no error message. */ break; if (c == '"') break; if (c == '\\') { c = do_getc (); if (c == EOF) /* Invalid input. Be tolerant, no error message. */ break; switch (c) { case '\n': continue; case '0': c = '\0'; break; case 'a': c = '\a'; break; case 'f': c = '\f'; break; case 'n': c = '\n'; break; case 'r': c = '\r'; break; case 't': c = '\t'; break; case 'v': c = '\v'; break; default: break; } } grow_token (op->token); op->token->chars[op->token->charcount++] = c; } op->type = t_string; if (extract_all) { lex_pos_ty pos; pos.file_name = logical_file_name; pos.line_number = op->line_number_at_start; remember_a_message (mlp, NULL, string_of_object (op), null_context, &pos, NULL, savable_comment); } last_non_comment_line = line_number; return; } case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': case '+': case '-': case '.': /* Read a number or symbol token. */ op->token = XMALLOC (struct token); read_token (op->token, c); if (op->token->charcount == 1 && op->token->chars[0] == '.') { free_token (op->token); free (op->token); op->type = t_dot; } else if (is_number (op->token)) { /* A number. */ free_token (op->token); free (op->token); op->type = t_other; } else { /* A symbol. */ op->type = t_symbol; } last_non_comment_line = line_number; return; case ':': default: /* Read a symbol token. */ op->token = XMALLOC (struct token); read_token (op->token, c); op->type = t_symbol; last_non_comment_line = line_number; return; } } } void extract_scheme (FILE *f, const char *real_filename, const char *logical_filename, flag_context_list_table_ty *flag_table, msgdomain_list_ty *mdlp) { mlp = mdlp->item[0]->messages; fp = f; real_file_name = real_filename; logical_file_name = xstrdup (logical_filename); line_number = 1; last_comment_line = -1; last_non_comment_line = -1; flag_context_list_table = flag_table; init_keywords (); /* Eat tokens until eof is seen. When read_object returns due to an unbalanced closing parenthesis, just restart it. */ do { struct object toplevel_object; read_object (&toplevel_object, null_context); if (toplevel_object.type == t_eof) break; free_object (&toplevel_object); } while (!feof (fp)); /* Close scanner. */ fp = NULL; real_file_name = NULL; logical_file_name = NULL; line_number = 0; }