From: Bruno Haible Date: Tue, 11 Dec 2001 11:49:01 +0000 (+0000) Subject: New Common Lisp backend. X-Git-Tag: v0.11~205 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=b659383be704ef1649537adb69c5196af52742e7;p=thirdparty%2Fgettext.git New Common Lisp backend. --- diff --git a/NEWS b/NEWS index 9915f9d7b..95d79b517 100644 --- a/NEWS +++ b/NEWS @@ -13,7 +13,7 @@ Version 0.11 - XXX 2001 * msgfmt can create (and msgunfmt can dump) Java ResourceBundles. -* xgettext now also supports Java, ObjectPascal, YCP. +* xgettext now also supports Lisp, Java, ObjectPascal, YCP. * The tools now know about format strings in languages other than C. They recognize new message flags named lisp-format, smalltalk-format, diff --git a/doc/ChangeLog b/doc/ChangeLog index 92b7a491a..df08bec86 100644 --- a/doc/ChangeLog +++ b/doc/ChangeLog @@ -1,3 +1,7 @@ +2001-12-09 Bruno Haible + + * gettext.texi (Common Lisp): Update. + 2001-12-08 Bruno Haible * msgfilter.texi: Renamed from msgexec.texi. diff --git a/doc/gettext.texi b/doc/gettext.texi index 344624245..fe44508b5 100644 --- a/doc/gettext.texi +++ b/doc/gettext.texi @@ -5944,7 +5944,7 @@ automatic use @item Extractor -@code{clisp-xgettext} +@code{xgettext -k_ -kENGLISH} @item Formatting with positions @code{format "~1@@*~D ~0@@*~D"} diff --git a/src/ChangeLog b/src/ChangeLog index bee6f3911..7bb60dfb5 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,14 @@ +2001-12-09 Bruno Haible + + * x-lisp.h: New file. + * x-lisp.c: New file. + * xgettext.c: Include x-lisp.h. + (main): Call x_lisp_extract_all, x_lisp_keyword. + (language_to_scanner): Add Lisp rule. Remove preliminary Lisp rule. + (extension_to_language): Add Lisp rule. + * Makefile.am (noinst_HEADERS): Add x-lisp.h. + (xgettext_SOURCES): Add x-lisp.c. + 2001-12-09 Bruno Haible * xgettext.h (split_keywordspec): New declaration. diff --git a/src/Makefile.am b/src/Makefile.am index f5610bb79..39a88c02d 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -32,7 +32,7 @@ po.h open-po.h read-po.h str-list.h write-po.h dir-list.h file-list.h \ po-gram-gen.h po-hash-gen.h msgl-charset.h msgl-equal.h msgl-iconv.h \ msgl-ascii.h msgl-cat.h msgl-english.h msgfmt.h msgunfmt.h read-mo.h \ write-mo.h read-java.h write-java.h po-time.h format.h xgettext.h x-c.h \ -x-po.h x-java.h x-ycp.h x-rst.h +x-po.h x-lisp.h x-java.h x-ycp.h x-rst.h EXTRA_DIST = FILES project-id \ gnu/gettext/DumpResource.java gnu/gettext/GetURL.java @@ -82,7 +82,7 @@ msgcmp_SOURCES = msgcmp.c msgfmt_SOURCES = msgfmt.c write-mo.c write-java.c plural-eval.c msgmerge_SOURCES = msgmerge.c msgunfmt_SOURCES = msgunfmt.c read-mo.c read-java.c -xgettext_SOURCES = xgettext.c x-c.c x-po.c x-java.l x-ycp.c x-rst.c +xgettext_SOURCES = xgettext.c x-c.c x-po.c x-lisp.c x-java.l x-ycp.c x-rst.c msgattrib_SOURCES = msgattrib.c msgcat_SOURCES = msgcat.c msgcomm_SOURCES = msgcomm.c diff --git a/src/x-lisp.c b/src/x-lisp.c new file mode 100644 index 000000000..a8671dc3b --- /dev/null +++ b/src/x-lisp.c @@ -0,0 +1,1471 @@ +/* xgettext Lisp backend. + Copyright (C) 2001 Free Software Foundation, Inc. + + This file was written by Bruno Haible , 2001. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software Foundation, + Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include +#include +#include +#include +#include + +#include "message.h" +#include "x-lisp.h" +#include "xgettext.h" +#include "error.h" +#include "xmalloc.h" +#include "system.h" +#include "libgettext.h" + +#define _(s) gettext(s) + + +/* The Common Lisp syntax is described in the Common Lisp HyperSpec, chapter 2. + Since we are interested only in strings and in forms similar to + (gettext msgid ...) + or (ngettext msgid msgid_plural ...) + we make the following simplifications: + + - Assume the keywords and strings are in an ASCII compatible encoding. + This means we can read the input file one byte at a time, instead of + one character at a time. No need to worry about multibyte characters: + If they occur as part of identifiers, they most probably act as + constituent characters, and the byte based approach will do the same. + + - Assume the read table is the standard Common Lisp read table. + Non-standard read tables are mostly used to read data, not programs. + + - Assume the read table case is :UPCASE, and *READ-BASE* is 10. + + - Don't interpret #n= and #n#, they usually don't appear in programs. + + - Don't interpret #+, #-, they are unlikely to appear in a gettext form. + + The remaining syntax rules are: + + - The syntax code assigned to each character, and how tokens are built + up from characters (single escape, multiple escape etc.). + + - Comment syntax: ';' and '#| ... |#'. + + - String syntax: "..." with single escapes. + + - Read macros and dispatch macro character '#'. Needed to be able to + tell which is the n-th argument of a function call. + + */ + + +/* Prototypes for local functions. Needed to ensure compiler checking of + function argument counts despite of K&R C function definition syntax. */ +struct char_syntax; +struct token; +struct object; +static void init_keywords PARAMS ((void)); +static int do_getc PARAMS ((void)); +static void do_ungetc PARAMS ((int c)); +static enum syntax_code syntax_code_of PARAMS ((unsigned char c)); +static void read_char_syntax PARAMS ((struct char_syntax *p)); +static enum attribute attribute_of PARAMS ((unsigned char c)); +static inline void init_token PARAMS ((struct token *tp)); +static inline void free_token PARAMS ((struct token *tp)); +static inline void grow_token PARAMS ((struct token *tp)); +static void read_token PARAMS ((struct token *tp, + const struct char_syntax *first)); +static inline bool has_a_dot PARAMS ((const struct token *tp)); +static inline bool all_a_number PARAMS ((const struct token *tp)); +static inline void a_letter_to_digit PARAMS ((const struct token *tp, + int base)); +static inline bool has_a_digit PARAMS ((const struct token *tp)); +static inline bool has_adjacent_letters PARAMS ((const struct token *tp)); +static bool is_potential_number PARAMS ((const struct token *tp, int *basep)); +static enum number_type is_number PARAMS ((const struct token *tp, int *basep)); +static void upcase_token PARAMS ((struct token *tp)); +static void downcase_token PARAMS ((struct token *tp)); +static void case_convert_token PARAMS ((struct token *tp)); +static inline void comment_start PARAMS ((void)); +static inline void comment_add PARAMS ((int c)); +static inline void comment_line_end PARAMS ((size_t chars_to_remove)); +static inline void free_object PARAMS ((struct object *op)); +static char * string_of_object PARAMS ((const struct object *op)); +static void read_object PARAMS ((struct object *op)); + + +/* ========================= Lexer customization. ========================= */ + +/* 'readtable_case' is the case conversion that is applied to non-escaped + parts of symbol tokens. In Common Lisp: (readtable-case *readtable*). */ + +enum rtcase +{ + case_upcase, + case_downcase, + case_preserve, + case_invert +}; + +static enum rtcase readtable_case = case_upcase; + +/* 'read_base' is the assumed radix of integers and rational numbers. + In Common Lisp: *read-base*. */ +static int read_base = 10; + +/* 'read_preserve_whitespace' specifies whether a whitespace character + that terminates a token must be pushed back on the input stream. + We set it to true, because the special newline side effect in read_object() + requires that read_object() sees every newline not inside a token. */ +static bool read_preserve_whitespace = true; + + +/* ====================== Keyword set customization. ====================== */ + +/* If true extract all strings. */ +static bool extract_all = false; + +static hash_table keywords; +static bool default_keywords = true; + + +void +x_lisp_extract_all () +{ + extract_all = true; +} + + +void +x_lisp_keyword (name) + const char *name; +{ + if (name == NULL) + default_keywords = false; + else + { + const char *end; + int argnum1; + int argnum2; + const char *colon; + size_t len; + char *symname; + size_t i; + + if (keywords.table == NULL) + init_hash (&keywords, 100); + + split_keywordspec (name, &end, &argnum1, &argnum2); + + /* The characters between name and end should form a valid Lisp symbol. + Extract the symbol name part. */ + colon = strchr (name, ':'); + if (colon != NULL && colon < end) + { + name = colon + 1; + if (name < end && *name == ':') + name++; + colon = strchr (name, ':'); + if (colon != NULL && colon < end) + return; + } + + /* Uppercase it. */ + len = end - name; + symname = (char *) xmalloc (len); + for (i = 0; i < len; i++) + symname[i] = + (name[i] >= 'a' && name[i] <= 'z' ? name[i] - 'a' + 'A' : name[i]); + + if (argnum1 == 0) + argnum1 = 1; + insert_entry (&keywords, symname, len, + (void *) (long) (argnum1 + (argnum2 << 10))); + } +} + +/* Finish initializing the keywords hash table. + Called after argument processing, before each file is processed. */ +static void +init_keywords () +{ + if (default_keywords) + { + x_lisp_keyword ("gettext"); /* I18N:GETTEXT */ + x_lisp_keyword ("ngettext:1,2"); /* I18N:NGETTEXT */ + x_lisp_keyword ("gettext-noop"); + default_keywords = false; + } +} + + +/* ======================== Reading of characters. ======================== */ + +/* Real filename, used in error messages about the input file. */ +static const char *real_file_name; + +/* Logical filename and line number, used to label the extracted messages. */ +static char *logical_file_name; +static int line_number; + +/* The input file stream. */ +static FILE *fp; + + +/* Fetch the next character from the input file. */ +static int +do_getc () +{ + int c = getc (fp); + + if (c == EOF) + { + if (ferror (fp)) + error (EXIT_FAILURE, errno, _("\ +error while reading \"%s\""), real_file_name); + } + else if (c == '\n') + line_number++; + + return c; +} + +/* Put back the last fetched character, not EOF. */ +static void +do_ungetc (c) + int c; +{ + if (c == '\n') + line_number--; + ungetc (c, fp); +} + + +/* ========= Reading of tokens. See CLHS 2.2 "Reader Algorithm". ========= */ + + +/* Syntax code. See CLHS 2.1.4 "Character Syntax Types". */ + +enum syntax_code +{ + syntax_illegal, /* non-printable, except whitespace */ + syntax_single_esc, /* '\' (single escape) */ + syntax_multi_esc, /* '|' (multiple escape) */ + syntax_constituent, /* everything else (constituent) */ + syntax_whitespace, /* TAB,LF,FF,CR,' ' (whitespace) */ + syntax_eof, /* EOF */ + syntax_t_macro, /* '()'"' (terminating macro) */ + syntax_nt_macro /* '#' (non-terminating macro) */ +}; + +/* Returns the syntax code of a character. */ +static enum syntax_code +syntax_code_of (c) + unsigned char c; +{ + switch (c) + { + case '\\': + return syntax_single_esc; + case '|': + return syntax_multi_esc; + case '\t': case '\n': case '\f': case '\r': case ' ': + return syntax_whitespace; + case '(': case ')': case '\'': case '"': case ',': case ';': case '`': + return syntax_t_macro; + case '#': + return syntax_nt_macro; + default: + if (c < ' ' && c != '\b') + return syntax_illegal; + else + return syntax_constituent; + } +} + +struct char_syntax +{ + int ch; /* character */ + enum syntax_code scode; /* syntax code */ +}; + +/* Returns the next character and its syntax code. */ +static void +read_char_syntax (p) + struct char_syntax *p; +{ + int c = do_getc (); + + p->ch = c; + p->scode = (c == EOF ? syntax_eof : syntax_code_of (c)); +} + +/* Every character in a token has an attribute assigned. The attributes + help during interpretation of the token. See + CLHS 2.3 "Interpretation of Tokens" for the possible interpretations, + and CLHS 2.1.4.2 "Constituent Traits". */ + +enum attribute +{ + a_illg, /* invalid constituent */ + a_pack_m, /* ':' package marker */ + a_alpha, /* normal alphabetic */ + a_escaped, /* alphabetic but not subject to case conversion */ + a_ratio, /* '/' */ + a_dot, /* '.' */ + a_sign, /* '+-' */ + a_extens, /* '_^' extension characters */ + a_digit, /* '0123456789' */ + a_letterdigit,/* 'A'-'Z','a'-'z' below base, except 'esfdlESFDL' */ + a_expodigit, /* 'esfdlESFDL' below base */ + a_letter, /* 'A'-'Z','a'-'z', except 'esfdlESFDL' */ + a_expo /* 'esfdlESFDL' */ +}; + +#define is_letter_attribute(a) ((a) >= a_letter) +#define is_number_attribute(a) ((a) >= a_ratio) + +/* Returns the attribute of a character, assuming base 10. */ +static enum attribute +attribute_of (c) + unsigned char c; +{ + switch (c) + { + case ':': + return a_pack_m; + case '/': + return a_ratio; + case '.': + return a_dot; + case '+': case '-': + return a_sign; + case '_': case '^': + return a_extens; + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + return a_digit; + case 'a': case 'b': case 'c': case 'g': case 'h': case 'i': case 'j': + case 'k': case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': + case 't': case 'u': case 'v': case 'w': case 'x': case 'y': case 'z': + case 'A': case 'B': case 'C': case 'G': case 'H': case 'I': case 'J': + case 'K': case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': + case 'T': case 'U': case 'V': case 'W': case 'X': case 'Y': case 'Z': + return a_letter; + case 'e': case 's': case 'd': case 'f': case 'l': + case 'E': case 'S': case 'D': case 'F': case 'L': + return a_expo; + default: + /* Treat everything as valid. Never return a_illg. */ + return a_alpha; + } +} + +struct token_char +{ + unsigned char ch; /* character */ + unsigned char attribute; /* attribute */ +}; + +/* A token consists of a sequence of characters with associated attribute. */ +struct token +{ + int allocated; /* number of allocated 'token_char's */ + int charcount; /* number of used 'token_char's */ + struct token_char *chars; /* the token's constituents */ + bool with_escape; /* whether single-escape or multiple escape occurs */ +}; + +/* Initialize a 'struct token'. */ +static inline void +init_token (tp) + struct token *tp; +{ + tp->allocated = 10; + tp->chars = + (struct token_char *) xmalloc (tp->allocated * sizeof (struct token_char)); + tp->charcount = 0; +} + +/* Free the memory pointed to by a 'struct token'. */ +static inline void +free_token (tp) + struct token *tp; +{ + free (tp->chars); +} + +/* Ensure there is enough room in the token for one more character. */ +static inline void +grow_token (tp) + struct token *tp; +{ + if (tp->charcount == tp->allocated) + { + tp->allocated *= 2; + tp->chars = (struct token_char *) xrealloc (tp->chars, tp->allocated * sizeof (struct token_char)); + } +} + +/* Read the next token. If 'first' is given, it points to the first + character, which has already been read. + The algorithm follows CLHS 2.2 "Reader Algorithm". */ +static void +read_token (tp, first) + struct token *tp; + const struct char_syntax *first; +{ + bool multiple_escape_flag; + struct char_syntax curr; + + init_token (tp); + tp->with_escape = false; + + multiple_escape_flag = false; + if (first) + curr = *first; + else + read_char_syntax (&curr); + + for (;; read_char_syntax (&curr)) + { + switch (curr.scode) + { + case syntax_illegal: + /* Invalid input. Be tolerant, no error message. */ + do_ungetc (curr.ch); + return; + + case syntax_single_esc: + tp->with_escape = true; + read_char_syntax (&curr); + if (curr.scode == syntax_eof) + /* Invalid input. Be tolerant, no error message. */ + return; + grow_token (tp); + tp->chars[tp->charcount].ch = curr.ch; + tp->chars[tp->charcount].attribute = a_escaped; + tp->charcount++; + break; + + case syntax_multi_esc: + multiple_escape_flag = !multiple_escape_flag; + tp->with_escape = true; + break; + + case syntax_constituent: + case syntax_nt_macro: + grow_token (tp); + if (multiple_escape_flag) + { + tp->chars[tp->charcount].ch = curr.ch; + tp->chars[tp->charcount].attribute = a_escaped; + tp->charcount++; + } + else + { + tp->chars[tp->charcount].ch = curr.ch; + tp->chars[tp->charcount].attribute = attribute_of (curr.ch); + tp->charcount++; + } + break; + + case syntax_whitespace: + case syntax_t_macro: + if (multiple_escape_flag) + { + grow_token (tp); + tp->chars[tp->charcount].ch = curr.ch; + tp->chars[tp->charcount].attribute = a_escaped; + tp->charcount++; + } + else + { + if (curr.scode != syntax_whitespace || read_preserve_whitespace) + do_ungetc (curr.ch); + return; + } + break; + + case syntax_eof: + if (multiple_escape_flag) + /* Invalid input. Be tolerant, no error message. */ + ; + return; + } + } +} + +/* A potential number is a token which + 1. consists only of digits, '+','-','/','^','_','.' and number markers. + The base for digits is context dependent, but always 10 if a dot '.' + occurs. A number marker is a non-digit letter which is not adjacent + to a non-digit letter. + 2. has at least one digit. + 3. starts with a digit, '+','-','.','^' or '_'. + 4. does not end with '+' or '-'. + See CLHS 2.3.1.1 "Potential Numbers as Tokens". + */ + +static inline bool +has_a_dot (tp) + const struct token *tp; +{ + int n = tp->charcount; + int i; + + for (i = 0; i < n; i++) + if (tp->chars[i].attribute == a_dot) + return true; + return false; +} + +static inline bool +all_a_number (tp) + const struct token *tp; +{ + int n = tp->charcount; + int i; + + for (i = 0; i < n; i++) + if (!is_number_attribute (tp->chars[i].attribute)) + return false; + return true; +} + +static inline void +a_letter_to_digit (tp, base) + const struct token *tp; + int base; +{ + int n = tp->charcount; + int i; + + for (i = 0; i < n; i++) + if (is_letter_attribute (tp->chars[i].attribute)) + { + int c = tp->chars[i].ch; + + if (c >= 'a') + c -= 'a' - 'A'; + if (c - 'A' + 10 < base) + tp->chars[i].attribute -= 2; /* a_letter -> a_letterdigit, + a_expo -> a_expodigit */ + } +} + +static inline bool +has_a_digit (tp) + const struct token *tp; +{ + int n = tp->charcount; + int i; + + for (i = 0; i < n; i++) + if (tp->chars[i].attribute == a_digit + || tp->chars[i].attribute == a_letterdigit + || tp->chars[i].attribute == a_expodigit) + return true; + return false; +} + +static inline bool +has_adjacent_letters (tp) + const struct token *tp; +{ + int n = tp->charcount; + int i; + + for (i = 1; i < n; i++) + if (is_letter_attribute (tp->chars[i-1].attribute) + && is_letter_attribute (tp->chars[i].attribute)) + return true; + return false; +} + +static bool +is_potential_number (tp, basep) + const struct token *tp; + int *basep; +{ + /* CLHS 2.3.1.1.1: + "A potential number cannot contain any escape characters." */ + if (tp->with_escape) + return false; + + if (has_a_dot (tp)) + *basep = 10; + + if (!all_a_number (tp)) + return false; + + a_letter_to_digit (tp, *basep); + + if (!has_a_digit (tp)) + return false; + + if (has_adjacent_letters (tp)) + return false; + + if (!(tp->chars[0].attribute >= a_dot + && tp->chars[0].attribute <= a_expodigit)) + return false; + + if (tp->chars[tp->charcount - 1].attribute == a_sign) + return false; + + return true; +} + +/* A number is one integer, ratio, float. Each has a particular syntax. + See CLHS 2.3.1 "Numbers as Tokens". + But note a mistake: The exponent rule should read: + exponent ::= exponent-marker [sign] {decimal-digit}+ + (see 22.1.3.1.3 "Printing Floats"). */ + +enum number_type +{ + n_none, + n_integer, + n_ratio, + n_float +}; + +static enum number_type +is_number (tp, basep) + const struct token *tp; + int *basep; +{ + struct token_char *ptr_limit; + struct token_char *ptr1; + + if (!is_potential_number (tp, basep)) + return n_none; + + /* is_potential_number guarantees + - all attributes are >= a_ratio, + - there is at least one a_digit or a_letterdigit or a_expodigit, and + - if there is an a_dot, then *basep = 10. */ + + ptr1 = &tp->chars[0]; + ptr_limit = &tp->chars[tp->charcount]; + + if (ptr1->attribute == a_sign) + ptr1++; + + /* Test for syntax + * { a_sign | } + * { a_digit < base }+ { a_ratio { a_digit < base }+ | } + */ + { + bool seen_a_ratio = false; + bool seen_a_digit = false; /* seen a digit in last digit block? */ + struct token_char *ptr; + + for (ptr = ptr1;; ptr++) + { + if (ptr >= ptr_limit) + { + if (!seen_a_digit) + break; + if (seen_a_ratio) + return n_ratio; + else + return n_integer; + } + if (ptr->attribute == a_digit + || ptr->attribute == a_letterdigit + || ptr->attribute == a_expodigit) + { + int c = ptr->ch; + + c = (c < 'A' ? c - '0' : c < 'a' ? c - 'A' + 10 : c - 'a' + 10); + if (c >= *basep) + break; + seen_a_digit = true; + } + else if (ptr->attribute == a_ratio) + { + if (seen_a_ratio || !seen_a_digit) + break; + seen_a_ratio = true; + seen_a_digit = false; + } + else + break; + } + } + + /* Test for syntax + * { a_sign | } + * { a_digit }* { a_dot { a_digit }* | } + * { a_expo { a_sign | } { a_digit }+ | } + * + * If there is an exponent part, there must be digits before the dot or + * after the dot. The result is a float. + * If there is no exponen: + * If there is no dot, it would an integer in base 10, but is has already + * been verified to not be an integer in the current base. + * If there is a dot: + * If there are digits after the dot, it's a float. + * Otherwise, if there are digits before the dot, it's an integer. + */ + *basep = 10; + { + bool seen_a_dot = false; + bool seen_a_dot_with_leading_digits = false; + bool seen_a_digit = false; /* seen a digit in last digit block? */ + struct token_char *ptr; + + for (ptr = ptr1;; ptr++) + { + if (ptr >= ptr_limit) + { + /* no exponent */ + if (!seen_a_dot) + return n_none; + if (seen_a_digit) + return n_float; + if (seen_a_dot_with_leading_digits) + return n_integer; + else + return n_none; + } + if (ptr->attribute == a_digit) + { + seen_a_digit = true; + } + else if (ptr->attribute == a_dot) + { + if (seen_a_dot) + return n_none; + seen_a_dot = true; + if (seen_a_digit) + seen_a_dot_with_leading_digits = true; + seen_a_digit = false; + } + else if (ptr->attribute == a_expo || ptr->attribute == a_expodigit) + break; + else + return n_none; + } + ptr++; + if (!seen_a_dot_with_leading_digits || !seen_a_digit) + return n_none; + if (ptr >= ptr_limit) + return n_none; + if (ptr->attribute == a_sign) + ptr++; + seen_a_digit = false; + for (;; ptr++) + { + if (ptr >= ptr_limit) + break; + if (ptr->attribute != a_digit) + return n_none; + seen_a_digit = true; + } + if (!seen_a_digit) + return n_none; + return n_float; + } +} + +/* A token representing a symbol must be case converted. + For portability, we convert only ASCII characters here. */ + +static void +upcase_token (tp) + struct token *tp; +{ + int n = tp->charcount; + int i; + + for (i = 0; i < n; i++) + if (tp->chars[i].attribute != a_escaped) + { + unsigned char c = tp->chars[i].ch; + if (c >= 'a' && c <= 'z') + tp->chars[i].ch = c - 'a' + 'A'; + } +} + +static void +downcase_token (tp) + struct token *tp; +{ + int n = tp->charcount; + int i; + + for (i = 0; i < n; i++) + if (tp->chars[i].attribute != a_escaped) + { + unsigned char c = tp->chars[i].ch; + if (c >= 'A' && c <= 'Z') + tp->chars[i].ch = c - 'A' + 'a'; + } +} + +static void +case_convert_token (tp) + struct token *tp; +{ + int n = tp->charcount; + int i; + + switch (readtable_case) + { + case case_upcase: + upcase_token (tp); + break; + + case case_downcase: + downcase_token (tp); + break; + + case case_preserve: + break; + + case case_invert: + { + bool seen_uppercase = false; + bool seen_lowercase = false; + for (i = 0; i < n; i++) + if (tp->chars[i].attribute != a_escaped) + { + unsigned char c = tp->chars[i].ch; + if (c >= 'a' && c <= 'z') + seen_lowercase = true; + if (c >= 'A' && c <= 'Z') + seen_uppercase = true; + } + if (seen_uppercase) + { + if (!seen_lowercase) + downcase_token (tp); + } + else + { + if (seen_lowercase) + upcase_token (tp); + } + } + break; + } +} + + +/* ========================= Accumulating comments ========================= */ + + +static char *buffer; +static size_t bufmax; +size_t buflen; + +static inline void +comment_start () +{ + buflen = 0; +} + +static inline void +comment_add (c) + int c; +{ + if (buflen >= bufmax) + { + bufmax += 100; + buffer = xrealloc (buffer, bufmax); + } + buffer[buflen++] = c; +} + +static inline void +comment_line_end (chars_to_remove) + size_t chars_to_remove; +{ + buflen -= chars_to_remove; + while (buflen >= 1 + && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t')) + --buflen; + if (chars_to_remove == 0 && buflen >= bufmax) + { + bufmax += 100; + buffer = xrealloc (buffer, bufmax); + } + buffer[buflen] = '\0'; + xgettext_comment_add (buffer); +} + + +/* These are for tracking whether comments count as immediately before + keyword. */ +static int last_comment_line; +static int last_non_comment_line; + + +/* ========================= Accumulating messages ========================= */ + + +static message_list_ty *mlp; + + +/* ============== Reading of objects. See CLHS 2 "Syntax". ============== */ + + +/* We are only interested in symbols (e.g. GETTEXT or NGETTEXT) and strings. + Other objects need not to be represented precisely. */ +enum object_type +{ + t_symbol, /* symbol */ + t_string, /* string */ + t_other, /* other kind of real object */ + t_dot, /* '.' pseudo object */ + t_close, /* ')' pseudo object */ + t_eof /* EOF marker */ +}; + +struct object +{ + enum object_type type; + struct token *token; /* for t_symbol and t_string */ + int line_number_at_start; /* for t_string */ +}; + +/* Free the memory pointed to by a 'struct object'. */ +static inline void +free_object (op) + struct object *op; +{ + if (op->type == t_symbol || op->type == t_string) + { + free_token (op->token); + free (op->token); + } +} + +/* Convert a t_string token to a char*. */ +static char * +string_of_object (op) + const struct object *op; +{ + char *str; + const struct token_char *p; + char *q; + int n; + + if (!(op->type == t_symbol || op->type == t_string)) + abort (); + n = op->token->charcount; + str = (char *) xmalloc (n + 1); + q = str; + for (p = op->token->chars; n > 0; p++, n--) + *q++ = p->ch; + *q = '\0'; + return str; +} + +/* Read the next object. */ +static void +read_object (op) + struct object *op; +{ + for (;;) + { + struct char_syntax curr; + + read_char_syntax (&curr); + + switch (curr.scode) + { + case syntax_eof: + op->type = t_eof; + return; + + case syntax_whitespace: + if (curr.ch == '\n') + /* Comments assumed to be grouped with a message must immediately + precede it, with no non-whitespace token on a line between + both. */ + if (last_non_comment_line > last_comment_line) + xgettext_comment_reset (); + continue; + + case syntax_illegal: + op->type = t_other; + return; + + case syntax_single_esc: + case syntax_multi_esc: + case syntax_constituent: + /* Start reading a token. */ + op->token = (struct token *) xmalloc (sizeof (struct token)); + read_token (op->token, &curr); + last_non_comment_line = line_number; + + /* Interpret the token. */ + + /* Dots. */ + if (!op->token->with_escape + && op->token->charcount == 1 + && op->token->chars[0].attribute == a_dot) + { + free_token (op->token); + free (op->token); + op->type = t_dot; + return; + } + /* Tokens consisting entirely of dots are illegal, but be tolerant + here. */ + + /* Number. */ + { + int base = read_base; + + if (is_number (op->token, &base) != n_none) + { + free_token (op->token); + free (op->token); + op->type = t_other; + return; + } + } + + /* We interpret all other tokens as symbols (including 'reserved + tokens', i.e. potential numbers which are not numbers). */ + case_convert_token (op->token); + op->type = t_symbol; + return; + + case syntax_t_macro: + case syntax_nt_macro: + /* Read a macro. */ + switch (curr.ch) + { + case '(': + { + int arg = 0; /* Current argument number. */ + int argnum1 = 0; /* First string position. */ + int argnum2 = 0; /* Plural string position. */ + message_ty *plural_mp = NULL; /* Remember the msgid. */ + + for (;; arg++) + { + struct object inner; + + read_object (&inner); + + /* Recognize end of list. */ + if (inner.type == t_close) + { + op->type = t_other; + /* Don't bother converting "()" to "NIL". */ + last_non_comment_line = line_number; + return; + } + + /* Dots are not allowed in every position. + But be tolerant. */ + + /* EOF inside list is illegal. + But be tolerant. */ + if (inner.type == t_eof) + break; + + /* No need to bother if we extract all strings anyway. */ + if (!extract_all) + { + if (arg == 0) + { + /* This is the function position. */ + if (inner.type == t_symbol) + { + char *symbol_name = string_of_object (&inner); + int i; + int prefix_len; + void *keyword_value; + + /* Omit any package name. */ + i = inner.token->charcount; + while (i > 0 + && inner.token->chars[i-1].attribute != a_pack_m) + i--; + prefix_len = i; + + if (find_entry (&keywords, + symbol_name + prefix_len, + strlen (symbol_name + prefix_len), + &keyword_value) + == 0) + { + argnum1 = (int) (long) keyword_value & ((1 << 10) - 1); + argnum2 = (int) (long) keyword_value >> 10; + } + + free (symbol_name); + } + } + else + { + /* These are the argument positions. + Extract a string if we have reached the right + argument position. */ + if (arg == argnum1) + { + if (inner.type == t_string) + { + lex_pos_ty pos; + message_ty *mp; + + pos.file_name = logical_file_name; + pos.line_number = inner.line_number_at_start; + mp = remember_a_message (mlp, string_of_object (&inner), &pos); + if (argnum2 > 0) + plural_mp = mp; + } + } + else if (arg == argnum2) + { + if (inner.type == t_string && plural_mp != NULL) + { + lex_pos_ty pos; + + pos.file_name = logical_file_name; + pos.line_number = inner.line_number_at_start; + remember_a_message_plural (plural_mp, string_of_object (&inner), &pos); + } + } + } + } + + free_object (&inner); + } + } + op->type = t_other; + last_non_comment_line = line_number; + return; + + case ')': + /* Tell the caller about the end of list. + Unmatched closing parenthesis is illegal. + But be tolerant. */ + op->type = t_close; + last_non_comment_line = line_number; + return; + + case ',': + { + int c = do_getc (); + /* The ,@ handling inside lists is wrong anyway, because + ,@form expands to an unknown number of elements. */ + if (c != EOF && c != '@') + do_ungetc (c); + } + /*FALLTHROUGH*/ + case '\'': + case '`': + { + struct object inner; + + read_object (&inner); + + /* Dots and EOF are not allowed here. But be tolerant. */ + + free_object (&inner); + + op->type = t_other; + last_non_comment_line = line_number; + return; + } + + case ';': + { + bool all_semicolons = true; + + last_comment_line = line_number; + comment_start (); + for (;;) + { + int c = do_getc (); + if (c == EOF || c == '\n') + break; + if (c != ';') + all_semicolons = false; + if (!all_semicolons) + comment_add (c); + } + comment_line_end (0); + continue; + } + + case '"': + { + op->token = (struct token *) xmalloc (sizeof (struct token)); + init_token (op->token); + op->line_number_at_start = line_number; + for (;;) + { + int c = do_getc (); + if (c == EOF) + /* Invalid input. Be tolerant, no error message. */ + break; + if (c == '"') + break; + if (c == '\\') /* syntax_single_esc */ + { + c = do_getc (); + if (c == EOF) + /* Invalid input. Be tolerant, no error message. */ + break; + } + grow_token (op->token); + op->token->chars[op->token->charcount++].ch = c; + } + op->type = t_string; + + if (extract_all) + { + lex_pos_ty pos; + + pos.file_name = logical_file_name; + pos.line_number = op->line_number_at_start; + remember_a_message (mlp, string_of_object (op), &pos); + } + last_non_comment_line = line_number; + return; + } + + case '#': + /* Dispatch macro handling. */ + { + int c; + + for (;;) + { + c = do_getc (); + if (c == EOF) + /* Invalid input. Be tolerant, no error message. */ + { + op->type = t_other; + return; + } + if (!(c >= '0' && c <= '9')) + break; + } + + switch (c) + { + case '(': + case '"': + do_ungetc (c); + /*FALLTHROUGH*/ + case '\'': + case ':': + case '.': + case ',': + case 'A': case 'a': + case 'C': case 'c': + case 'P': case 'p': + case 'S': case 's': + { + struct object inner; + read_object (&inner); + /* Dots and EOF are not allowed here. + But be tolerant. */ + free_object (&inner); + op->type = t_other; + last_non_comment_line = line_number; + return; + } + + case '|': + { + int depth = 0; + int c; + + comment_start (); + c = do_getc (); + for (;;) + { + if (c == EOF) + break; + if (c == '|') + { + c = do_getc (); + if (c == EOF) + break; + if (c == '#') + { + if (depth == 0) + { + comment_line_end (0); + break; + } + depth--; + comment_add ('|'); + comment_add ('#'); + c = do_getc (); + } + else + comment_add ('|'); + } + else if (c == '#') + { + c = do_getc (); + if (c == EOF) + break; + comment_add ('#'); + if (c == '|') + { + depth++; + comment_add ('|'); + c = do_getc (); + } + } + else + { + /* We skip all leading white space. */ + if (!(buflen == 0 && (c == ' ' || c == '\t'))) + comment_add (c); + if (c == '\n') + { + comment_line_end (1); + comment_start (); + } + c = do_getc (); + } + } + if (c == EOF) + { + /* EOF not allowed here. But be tolerant. */ + op->type = t_eof; + return; + } + last_comment_line = line_number; + continue; + } + + case '\\': + { + struct token token; + struct char_syntax first; + first.ch = '\\'; + first.scode = syntax_single_esc; + read_token (&token, &first); + free_token (&token); + op->type = t_other; + last_non_comment_line = line_number; + return; + } + + case 'B': case 'b': + case 'O': case 'o': + case 'X': case 'x': + case 'R': case 'r': + case '*': + { + struct token token; + read_token (&token, NULL); + free_token (&token); + op->type = t_other; + last_non_comment_line = line_number; + return; + } + + case '=': + /* Ignore read labels. */ + continue; + + case '#': + /* Don't bother looking up the corresponding object. */ + op->type = t_other; + last_non_comment_line = line_number; + return; + + case '+': + case '-': + /* Simply assume every feature expression is true. */ + { + struct object inner; + read_object (&inner); + /* Dots and EOF are not allowed here. + But be tolerant. */ + free_object (&inner); + continue; + } + + default: + op->type = t_other; + last_non_comment_line = line_number; + return; + } + /*NOTREACHED*/ + abort (); + } + + default: + /*NOTREACHED*/ + abort (); + } + + default: + /*NOTREACHED*/ + abort (); + } + } +} + + +void +extract_lisp (f, real_filename, logical_filename, mdlp) + FILE *f; + const char *real_filename; + const char *logical_filename; + msgdomain_list_ty *mdlp; +{ + mlp = mdlp->item[0]->messages; + + fp = f; + real_file_name = real_filename; + logical_file_name = xstrdup (logical_filename); + line_number = 1; + + last_comment_line = -1; + last_non_comment_line = -1; + + init_keywords (); + + /* Eat tokens until eof is seen. When read_object returns + due to an unbalanced closing parenthesis, just restart it. */ + do + { + struct object toplevel_object; + + read_object (&toplevel_object); + + if (toplevel_object.type == t_eof) + break; + } + while (!feof (fp)); + + /* Close scanner. */ + fp = NULL; + real_file_name = NULL; + logical_file_name = NULL; + line_number = 0; +} diff --git a/src/x-lisp.h b/src/x-lisp.h new file mode 100644 index 000000000..1c6b02b61 --- /dev/null +++ b/src/x-lisp.h @@ -0,0 +1,35 @@ +/* xgettext Lisp backend. + Copyright (C) 2001 Free Software Foundation, Inc. + Written by Bruno Haible , 2001. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software Foundation, + Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ + + +#define EXTENSIONS_LISP \ + { "lisp", "Lisp" }, \ + +#define SCANNERS_LISP \ + { "Lisp", extract_lisp, &formatstring_lisp }, \ + +/* Scan a Lisp file and add its translatable strings to mdlp. */ +extern void extract_lisp PARAMS ((FILE *fp, const char *real_filename, + const char *logical_filename, + msgdomain_list_ty *mdlp)); + + +/* Handling of options specific to this language. */ + +extern void x_lisp_extract_all PARAMS ((void)); +extern void x_lisp_keyword PARAMS ((const char *name)); diff --git a/src/xgettext.c b/src/xgettext.c index dce737139..28b5735f9 100644 --- a/src/xgettext.c +++ b/src/xgettext.c @@ -65,6 +65,7 @@ struct passwd *getpwuid (); #include "x-c.h" #include "x-po.h" +#include "x-lisp.h" #include "x-java.h" #include "x-ycp.h" #include "x-rst.h" @@ -225,6 +226,7 @@ main (argc, argv) break; case 'a': x_c_extract_all (); + x_lisp_extract_all (); x_java_extract_all (); break; case 'c': @@ -276,6 +278,7 @@ main (argc, argv) if (optarg == NULL || *optarg != '\0') { x_c_keyword (optarg); + x_lisp_keyword (optarg); x_java_keyword (optarg); } break; @@ -1146,11 +1149,11 @@ language_to_extractor (name) { SCANNERS_C SCANNERS_PO + SCANNERS_LISP SCANNERS_JAVA SCANNERS_YCP SCANNERS_RST { "Python", extract_c, &formatstring_python }, - { "Lisp", extract_c, &formatstring_lisp }, /* Here will follow more languages and their scanners: awk, perl, etc... Make sure new scanners honor the --exclude-file option. */ }; @@ -1187,10 +1190,11 @@ extension_to_language (extension) { EXTENSIONS_C EXTENSIONS_PO + EXTENSIONS_LISP EXTENSIONS_JAVA EXTENSIONS_YCP EXTENSIONS_RST - /* Here will follow more file extensions: sh, pl, tcl, lisp ... */ + /* Here will follow more file extensions: sh, pl, tcl ... */ }; table_ty *tp; diff --git a/tests/ChangeLog b/tests/ChangeLog index a6aa895b7..7d0c5cf03 100644 --- a/tests/ChangeLog +++ b/tests/ChangeLog @@ -1,3 +1,9 @@ +2001-12-09 Bruno Haible + + * lang-clisp: New file. + * Makefile.am (TESTS): Add it. + * format-lisp-1: Use real Lisp syntax. + 2001-12-08 Bruno Haible * msgattrib-1: New file. diff --git a/tests/Makefile.am b/tests/Makefile.am index 044ef7983..a80860e37 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -52,7 +52,7 @@ TESTS = gettext-1 gettext-2 \ format-pascal-1 format-pascal-2 \ format-ycp-1 format-ycp-2 \ plural-1 plural-2 \ - lang-c lang-c++ lang-objc lang-java lang-pascal lang-ycp lang-po lang-rst + lang-c lang-c++ lang-objc lang-clisp lang-java lang-pascal lang-ycp lang-po lang-rst EXTRA_DIST = $(TESTS) test.mo xg-test1.ok.po msguniq-a.in msguniq-a.out diff --git a/tests/format-lisp-1 b/tests/format-lisp-1 index c1c037bff..9297055d3 100755 --- a/tests/format-lisp-1 +++ b/tests/format-lisp-1 @@ -260,7 +260,7 @@ while read comment; do n=`expr $n + 1` tmpfiles="$tmpfiles f-l-1-$n.in f-l-1-$n.po" cat < f-l-1-$n.in -gettext(${string}); +(gettext ${string}) EOF ${XGETTEXT} -L Lisp -o f-l-1-$n.po f-l-1-$n.in || exit 1 test -f f-l-1-$n.po || exit 1 diff --git a/tests/lang-clisp b/tests/lang-clisp new file mode 100755 index 000000000..c0fb73a59 --- /dev/null +++ b/tests/lang-clisp @@ -0,0 +1,97 @@ +#! /bin/sh + +# Test of gettext facilities in the CLISP language. +# Assumes an fr_FR locale is installed. +# Assumes the following packages are installed: clisp. + +tmpfiles="" +trap 'rm -fr $tmpfiles' 1 2 3 15 + +tmpfiles="$tmpfiles prog.lisp" +cat <<\EOF > prog.lisp +(setf (textdomain) "prog") +(setf (textdomaindir "prog") "./") + +(setq n (parse-integer (first *args*))) + +(format t "~A~%" (gettext "'Your command, please?', asked the waiter.")) + +(format t "~@?~%" (ngettext "a piece of cake" "~D pieces of cake" n) n) +EOF + +tmpfiles="$tmpfiles prog.pot" +: ${XGETTEXT=xgettext} +${XGETTEXT} -o prog.pot --omit-header --no-location prog.lisp + +tmpfiles="$tmpfiles prog.ok" +cat < prog.ok +msgid "'Your command, please?', asked the waiter." +msgstr "" + +msgid "a piece of cake" +msgid_plural "~D pieces of cake" +msgstr[0] "" +msgstr[1] "" +EOF + +: ${DIFF=diff} +${DIFF} prog.ok prog.pot || exit 1 + +tmpfiles="$tmpfiles fr.po" +cat <<\EOF > fr.po +msgid "" +msgstr "" +"Content-Type: text/plain; charset=ISO-8859-1\n" +"Plural-Forms: nplurals=2; plural=(n > 1);\n" + +msgid "'Your command, please?', asked the waiter." +msgstr "«Votre commande, s'il vous plait», dit le garçon." + +# Les gateaux allemands sont les meilleurs du monde. +msgid "a piece of cake" +msgid_plural "~D pieces of cake" +msgstr[0] "un morceau de gateau" +msgstr[1] "~D morceaux de gateau" +EOF + +tmpfiles="$tmpfiles fr.po.new" +: ${MSGMERGE=msgmerge} +${MSGMERGE} -q -o fr.po.new fr.po prog.pot + +: ${DIFF=diff} +${DIFF} fr.po fr.po.new || exit 1 + +tmpfiles="$tmpfiles fr" +test -d fr || mkdir fr +test -d fr/LC_MESSAGES || mkdir fr/LC_MESSAGES + +: ${MSGFMT=msgfmt} +${MSGFMT} -o fr/LC_MESSAGES/prog.mo fr.po + +tmpfiles="$tmpfiles prog.ok prog.out" +: ${DIFF=diff} +cat <<\EOF > prog.ok +«Votre commande, s'il vous plait», dit le garçon. +2 morceaux de gateau +EOF + +# Test for presence of clisp version 2.28 or newer. +# Use clisp for the comparison of the version numbers; neither 'expr' nor 'bc' +# can deal with floating-point numbers. +clisp --version >/dev/null 2>/dev/null \ + || { echo "SKIP: lang-clisp"; rm -fr $tmpfiles; exit 77; } +version=`clisp --version | sed -e 's/^[^0-9]*//'` +case $version in + 19* | 20*) # older than 2.25 + echo "SKIP: lang-clisp"; rm -fr $tmpfiles; exit 77;; +esac +version=`echo $version | sed -e 's/^\([0-9]*\.[0-9]*\).*/\1/'` +clisp -norc -x "(sys::exit (not (>= $version 2.28)))" >/dev/null \ + || { echo "SKIP: lang-clisp"; rm -fr $tmpfiles; exit 77; } + +CLISP_LANGUAGE= LANGUAGE= LC_ALL=fr_FR clisp prog.lisp 2 > prog.out || exit 1 +${DIFF} prog.ok prog.out || exit 1 + +rm -fr $tmpfiles + +exit 0