From: Bruno Haible Date: Wed, 6 Feb 2002 12:58:55 +0000 (+0000) Subject: New Python backend. X-Git-Tag: v0.11.1~88 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=f34964a1d70413169a0240abbf153ffa810951fb;p=thirdparty%2Fgettext.git New Python backend. --- diff --git a/doc/ChangeLog b/doc/ChangeLog index 53c1a3c35..fc8c11bc2 100644 --- a/doc/ChangeLog +++ b/doc/ChangeLog @@ -1,3 +1,7 @@ +2002-02-02 Bruno Haible + + * gettext.texi (Python): Update. + 2002-02-03 Eli Zaretskii Bruno Haible diff --git a/doc/gettext.texi b/doc/gettext.texi index c17105ebb..600bfa822 100644 --- a/doc/gettext.texi +++ b/doc/gettext.texi @@ -6566,7 +6566,7 @@ not used by the gettext emulation emulate. Bug: uses only the first found .mo file, not all of them @item Extractor -pygettext.py +@code{xgettext} @item Formatting with positions @code{'...%(ident)d...' % @{ 'ident': value @}} diff --git a/src/ChangeLog b/src/ChangeLog index 7eb816c82..4742b46cd 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,17 @@ +2002-02-02 Bruno Haible + + * x-python.h: New file. + * x-python.c: New file. + * xgettext.c: Include x-python.h. + (main): Call x_python_extract_all, x_python_keyword. + (language_to_scanner): Add Python rule. + (extension_to_language): Add Python rule. + * Makefile.am (noinst_HEADERS): Add x-python.h. + (INCLUDES): Add -I$(top_srcdir)/libuniname. + (LIBUNINAME): New variable. + (xgettext_SOURCES): Add x-python.c. + (xgettext_LDADD): Add $(LIBUNINAME). 
+ 2002-02-03 Bruno Haible * msgfmt.c (check_header_entry): Terminate the error strings with diff --git a/src/Makefile.am b/src/Makefile.am index c012b780d..cffbd9ee9 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -32,7 +32,8 @@ po.h open-po.h read-po.h str-list.h write-po.h dir-list.h file-list.h \ po-gram-gen.h po-hash-gen.h msgl-charset.h msgl-equal.h msgl-iconv.h \ msgl-ascii.h msgl-cat.h msgl-english.h msgfmt.h msgunfmt.h read-mo.h \ write-mo.h read-java.h write-java.h po-time.h plural-table.h format.h \ -xgettext.h x-c.h x-po.h x-lisp.h x-elisp.h x-librep.h x-java.h x-ycp.h x-rst.h +xgettext.h x-c.h x-po.h x-python.h x-lisp.h x-elisp.h x-librep.h x-java.h \ +x-ycp.h x-rst.h EXTRA_DIST = FILES project-id \ gnu/gettext/DumpResource.java gnu/gettext/GetURL.java @@ -41,8 +42,8 @@ localedir = $(datadir)/locale jardir = $(datadir)/gettext projectsdir = $(pkgdatadir)/projects -INCLUDES = -I. -I$(srcdir) -I.. -I../lib -I$(top_srcdir)/lib -I../intl \ --I$(top_srcdir)/intl +INCLUDES = -I. -I$(srcdir) -I.. -I$(top_srcdir)/libuniname \ +-I../lib -I$(top_srcdir)/lib -I../intl -I$(top_srcdir)/intl DEFS = -DLOCALEDIR=\"$(localedir)\" -DGETTEXTJAR=\"$(jardir)/gettext.jar\" \ -DLIBDIR=\"$(libdir)\" -DPROJECTSDIR=\"$(projectsdir)\" @DEFS@ LDADD = ../lib/libgettextlib.la @LTLIBINTL@ @@ -75,6 +76,9 @@ $(COMMON_SOURCE) read-po.c write-po.c msgl-ascii.c msgl-iconv.c msgl-equal.c \ msgl-cat.c msgl-english.c file-list.c msgl-charset.c po-time.c plural.c \ plural-table.c $(FORMAT_SOURCE) +# x-python needs table of Unicode character names. +LIBUNINAME = ../libuniname/libuniname.a + # Source dependencies. 
gettext_SOURCES = gettext.c ngettext_SOURCES = ngettext.c @@ -83,7 +87,8 @@ msgfmt_SOURCES = msgfmt.c write-mo.c write-java.c plural-eval.c msgmerge_SOURCES = msgmerge.c msgunfmt_SOURCES = msgunfmt.c read-mo.c read-java.c xgettext_SOURCES = xgettext.c \ - x-c.c x-po.c x-lisp.c x-elisp.c x-librep.c x-java.l x-ycp.c x-rst.c + x-c.c x-po.c x-python.c x-lisp.c x-elisp.c x-librep.c x-java.l x-ycp.c \ + x-rst.c msgattrib_SOURCES = msgattrib.c msgcat_SOURCES = msgcat.c msgcomm_SOURCES = msgcomm.c @@ -113,7 +118,7 @@ msgcmp_LDADD = libgettextsrc.la msgfmt_LDADD = libgettextsrc.la msgmerge_LDADD = libgettextsrc.la msgunfmt_LDADD = libgettextsrc.la -xgettext_LDADD = libgettextsrc.la +xgettext_LDADD = libgettextsrc.la $(LIBUNINAME) msgattrib_LDADD = libgettextsrc.la msgcat_LDADD = libgettextsrc.la msgcomm_LDADD = libgettextsrc.la diff --git a/src/x-python.c b/src/x-python.c new file mode 100644 index 000000000..6553bdd22 --- /dev/null +++ b/src/x-python.c @@ -0,0 +1,1191 @@ +/* xgettext Python backend. + Copyright (C) 2002 Free Software Foundation, Inc. + + This file was written by Bruno Haible , 2002. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software Foundation, + Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
*/ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include +#include +#include +#include +#include +#include + +#include "message.h" +#include "x-python.h" +#include "xgettext.h" +#include "error.h" +#include "progname.h" +#include "xmalloc.h" +#include "exit.h" +#include "po-charset.h" +#include "msgl-ascii.h" +#include "msgl-iconv.h" +#include "uniname.h" +#include "utf16-ucs4.h" +#include "ucs4-utf8.h" +#include "gettext.h" + +#define _(s) gettext(s) + +#if HAVE_C_BACKSLASH_A +# define ALERT_CHAR '\a' +#else +# define ALERT_CHAR '\7' +#endif + + +/* The Python syntax is defined in the Python Reference Manual + /usr/share/doc/packages/python/html/ref/index.html. + See also Python-2.0/Parser/tokenizer.c, Python-2.0/Python/compile.c, + Python-2.0/Objects/unicodeobject.c. */ + +enum token_type_ty +{ + token_type_eof, + token_type_lparen, /* ( */ + token_type_rparen, /* ) */ + token_type_comma, /* , */ + token_type_string, /* "abc", 'abc', """abc""", '''abc''' */ + token_type_symbol, /* symbol, number */ + token_type_other /* misc. operator */ +}; +typedef enum token_type_ty token_type_ty; + +typedef struct token_ty token_ty; +struct token_ty +{ + token_type_ty type; + char *string; /* for token_type_string, token_type_symbol */ + int line_number; +}; + + +/* Prototypes for local functions. Needed to ensure compiler checking of + function argument counts despite of K&R C function definition syntax. 
*/ +static void init_keywords PARAMS ((void)); +static int phase1_getc PARAMS ((void)); +static void phase1_ungetc PARAMS ((int c)); +static inline void comment_start PARAMS ((void)); +static inline void comment_add PARAMS ((int c)); +static inline void comment_line_end PARAMS ((void)); +static int phase2_getc PARAMS ((void)); +static void phase2_ungetc PARAMS ((int c)); +static int phase7_getuc PARAMS ((int quote_char, bool triple, + bool interpret_ansic, bool interpret_unicode, + unsigned int *backslash_counter)); +static void phase5_get PARAMS ((token_ty *tp)); +static void phase5_unget PARAMS ((token_ty *tp)); +static void x_python_lex PARAMS ((token_ty *tp)); +static bool extract_parenthesized PARAMS ((message_list_ty *mlp, + int commas_to_skip, + int plural_commas)); + + +/* ====================== Keyword set customization. ====================== */ + +/* If true extract all strings. */ +static bool extract_all = false; + +static hash_table keywords; +static bool default_keywords = true; + + +void +x_python_extract_all () +{ + extract_all = true; +} + + +void +x_python_keyword (name) + const char *name; +{ + if (name == NULL) + default_keywords = false; + else + { + const char *end; + int argnum1; + int argnum2; + const char *colon; + + if (keywords.table == NULL) + init_hash (&keywords, 100); + + split_keywordspec (name, &end, &argnum1, &argnum2); + + /* The characters between name and end should form a valid C identifier. + A colon means an invalid parse in split_keywordspec(). */ + colon = strchr (name, ':'); + if (colon == NULL || colon >= end) + { + if (argnum1 == 0) + argnum1 = 1; + insert_entry (&keywords, name, end - name, + (void *) (long) (argnum1 + (argnum2 << 10))); + } + } +} + +/* Finish initializing the keywords hash table. + Called after argument processing, before each file is processed. 
*/ +static void +init_keywords () +{ + if (default_keywords) + { + x_python_keyword ("gettext"); + x_python_keyword ("dgettext:2"); + x_python_keyword ("_"); + default_keywords = false; + } +} + + +/* ================== Reading of characters and tokens. =================== */ + +/* Real filename, used in error messages about the input file. */ +static const char *real_file_name; + +/* Logical filename and line number, used to label the extracted messages. */ +static char *logical_file_name; +static int line_number; + +/* The input file stream. */ +static FILE *fp; + +/* These are for tracking whether comments count as immediately before + keyword. */ +static int last_comment_line; +static int last_non_comment_line; + + +/* 1. line_number handling. Also allow a lookahead of up to UNINAME_MAX + 3 characters (needed to back out of a failed \N{...} escape). */ + +/* Maximum used guaranteed to be < UNINAME_MAX + 4. */ +static unsigned char phase1_pushback[UNINAME_MAX + 4]; +static int phase1_pushback_length; + +static int +phase1_getc () +{ + int c; + + if (phase1_pushback_length) + c = phase1_pushback[--phase1_pushback_length]; + else + { + c = getc (fp); + + if (c == EOF) + { + if (ferror (fp)) + error (EXIT_FAILURE, errno, _("error while reading \"%s\""), + real_file_name); + return EOF; + } + } + + if (c == '\n') + line_number++; + + return c; +} + +static void +phase1_ungetc (c) + int c; +{ + if (c != EOF) + { + if (c == '\n') + --line_number; + + phase1_pushback[phase1_pushback_length++] = c; + } +} + + +/* Accumulating comments. */ + +static char *buffer; +static size_t bufmax; +static size_t buflen; + +static inline void +comment_start () +{ + buflen = 0; +} + +static inline void +comment_add (c) + int c; +{ + /* We assume the program source is in ISO-8859-1 (for consistency with + Python's \ooo and \xnn syntax inside strings), but we produce a POT + file in UTF-8 encoding. */ + size_t len = ((unsigned char) c < 0x80 ? 
1 : 2); + if (buflen + len > bufmax) + { + bufmax += 100; + buffer = xrealloc (buffer, bufmax); + } + if ((unsigned char) c < 0x80) + buffer[buflen++] = c; + else + { + buffer[buflen++] = 0xc0 | ((unsigned char) c >> 6); + buffer[buflen++] = 0x80 | ((unsigned char) c & 0x3f); + } +} + +static inline void +comment_line_end () +{ + while (buflen >= 1 + && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t')) + --buflen; + if (buflen >= bufmax) + { + bufmax += 100; + buffer = xrealloc (buffer, bufmax); + } + buffer[buflen] = '\0'; + xgettext_comment_add (buffer); +} + + +/* 2. Outside strings, replace backslash-newline with nothing and a comment + with nothing. */ + +static int +phase2_getc () +{ + int c; + + for (;;) + { + c = phase1_getc (); + if (c == '\\') + { + c = phase1_getc (); + if (c != '\n') + { + phase1_ungetc (c); + /* This shouldn't happen usually, because "A backslash is + illegal elsewhere on a line outside a string literal." */ + return '\\'; + } + /* Eat backslash-newline. */ + } + else if (c == '#') + { + /* Eat a comment. */ + comment_start (); + for (;;) + { + c = phase1_getc (); + if (c == EOF || c == '\n') + break; + /* We skip all leading white space, but not EOLs. */ + if (!(buflen == 0 && (c == ' ' || c == '\t'))) + comment_add (c); + } + comment_line_end (); + return c; + } + else + return c; + } +} + +static void +phase2_ungetc (c) + int c; +{ + phase1_ungetc (c); +} + + +/* There are two different input syntaxes for strings, "abc" and r"abc", + and two different input syntaxes for Unicode strings, u"abc" and ur"abc". + Which escape sequences are understood, i.e. what is interpreted specially + after backslash? + "abc" \ \\ \' \" \a\b\f\n\r\t\v \ooo \xnn + r"abc" + u"abc" \ \\ \' \" \a\b\f\n\r\t\v \ooo \xnn \unnnn \Unnnnnnnn \N{...} + ur"abc" \unnnn + The \unnnn values are UTF-16 values; a single \Unnnnnnnn can expand to two + \unnnn items. The \ooo and \xnn values are ISO-8859-1 values: u"\xff" and + u"\u00ff" are the same. 
*/ + +#define P7_EOF (-1) +#define P7_STRING_END (-2) + +static int +phase7_getuc (quote_char, triple, interpret_ansic, interpret_unicode, backslash_counter) + int quote_char; + bool triple; + bool interpret_ansic; + bool interpret_unicode; + unsigned int *backslash_counter; +{ + int c; + + for (;;) + { + /* Use phase 1, because phase 2 elides comments. */ + c = phase1_getc (); + + if (c == EOF) + return P7_EOF; + + if (c == quote_char && (interpret_ansic || (*backslash_counter & 1) == 0)) + { + if (triple) + { + int c1 = phase1_getc (); + if (c1 == quote_char) + { + int c2 = phase1_getc (); + if (c2 == quote_char) + return P7_STRING_END; + phase1_ungetc (c2); + } + phase1_ungetc (c1); + return c; + } + else + return P7_STRING_END; + } + + if (c == '\n') + { + if (triple) + { + *backslash_counter = 0; + return '\n'; + } + /* In r"..." and ur"..." strings, newline is only allowed + immediately after an odd number of backslashes (although the + backslashes are not interpreted!). */ + if (!(interpret_ansic || (*backslash_counter & 1) == 0)) + { + *backslash_counter = 0; + return '\n'; + } + phase1_ungetc (c); + error_with_progname = false; + error (0, 0, _("%s:%d: warning: unterminated string"), + logical_file_name, line_number); + error_with_progname = true; + return P7_STRING_END; + } + + if (c != '\\') + { + *backslash_counter = 0; + return c; + } + + /* Backslash handling. */ + + if (!interpret_ansic && !interpret_unicode) + { + ++*backslash_counter; + return '\\'; + } + + /* Dispatch according to the character following the backslash. 
*/ + c = phase1_getc (); + if (c == EOF) + { + ++*backslash_counter; + return '\\'; + } + + if (interpret_ansic) + switch (c) + { + case '\n': + continue; + case '\\': + ++*backslash_counter; + return c; + case '\'': case '"': + *backslash_counter = 0; + return c; + case 'a': + *backslash_counter = 0; + return ALERT_CHAR; + case 'b': + *backslash_counter = 0; + return '\b'; + case 'f': + *backslash_counter = 0; + return '\f'; + case 'n': + *backslash_counter = 0; + return '\n'; + case 'r': + *backslash_counter = 0; + return '\r'; + case 't': + *backslash_counter = 0; + return '\t'; + case 'v': + *backslash_counter = 0; + return '\v'; + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': + { + int n = c - '0'; + + c = phase1_getc (); + if (c != EOF) + { + if (c >= '0' && c <= '7') + { + n = (n << 3) + (c - '0'); + c = phase1_getc (); + if (c != EOF) + { + if (c >= '0' && c <= '7') + n = (n << 3) + (c - '0'); + else + phase1_ungetc (c); + } + } + else + phase1_ungetc (c); + } + *backslash_counter = 0; + return (unsigned char) n; + } + case 'x': + { + int c1 = phase1_getc (); + int n1; + + if (c1 >= '0' && c1 <= '9') + n1 = c1 - '0'; + else if (c1 >= 'A' && c1 <= 'F') + n1 = c1 - 'A' + 10; + else if (c1 >= 'a' && c1 <= 'f') + n1 = c1 - 'a' + 10; + else + n1 = -1; + + if (n1 >= 0) + { + int c2 = phase1_getc (); + int n2; + + if (c2 >= '0' && c2 <= '9') + n2 = c2 - '0'; + else if (c2 >= 'A' && c2 <= 'F') + n2 = c2 - 'A' + 10; + else if (c2 >= 'a' && c2 <= 'f') + n2 = c2 - 'a' + 10; + else + n2 = -1; + + if (n2 >= 0) + { + *backslash_counter = 0; + return (unsigned char) ((n1 << 4) + n2); + } + + phase1_ungetc (c2); + } + phase1_ungetc (c1); + phase1_ungetc (c); + ++*backslash_counter; + return '\\'; + } + } + + if (interpret_unicode) + { + if (c == 'u') + { + char buf[4]; + unsigned int n = 0; + int i; + + for (i = 0; i < 4; i++) + { + int c1 = phase1_getc (); + + if (c1 >= '0' && c1 <= '9') + n = (n << 4) + (c1 - '0'); + else if (c1 >= 'A' 
&& c1 <= 'F') + n = (n << 4) + (c1 - 'A' + 10); + else if (c1 >= 'a' && c1 <= 'f') + n = (n << 4) + (c1 - 'a' + 10); + else + { + phase1_ungetc (c1); + while (--i >= 0) + phase1_ungetc (buf[i]); + phase1_ungetc (c); + ++*backslash_counter; + return '\\'; + } + + buf[i] = c1; + } + *backslash_counter = 0; + return n; + } + + if (interpret_ansic) + { + if (c == 'U') + { + char buf[8]; + unsigned int n = 0; + int i; + + for (i = 0; i < 8; i++) + { + int c1 = phase1_getc (); + + if (c1 >= '0' && c1 <= '9') + n = (n << 4) + (c1 - '0'); + else if (c1 >= 'A' && c1 <= 'F') + n = (n << 4) + (c1 - 'A' + 10); + else if (c1 >= 'a' && c1 <= 'f') + n = (n << 4) + (c1 - 'a' + 10); + else + { + phase1_ungetc (c1); + while (--i >= 0) + phase1_ungetc (buf[i]); + phase1_ungetc (c); + ++*backslash_counter; + return '\\'; + } + + buf[i] = c1; + } + if (n < 0x110000) + { + *backslash_counter = 0; + return n; + } + + error_with_progname = false; + error (0, 0, _("%s:%d: warning: invalid Unicode character"), + logical_file_name, line_number); + error_with_progname = true; + + while (--i >= 0) + phase1_ungetc (buf[i]); + phase1_ungetc (c); + ++*backslash_counter; + return '\\'; + } + + if (c == 'N') + { + int c1 = phase1_getc (); + if (c1 == '{') + { + char buf[UNINAME_MAX + 1]; + int i; + unsigned int n; + + for (i = 0; i < UNINAME_MAX; i++) + { + int c2 = phase1_getc (); + if (!(c2 >= ' ' && c2 <= '~')) + { + phase1_ungetc (c2); + while (--i >= 0) + phase1_ungetc (buf[i]); + phase1_ungetc (c1); + phase1_ungetc (c); + ++*backslash_counter; + return '\\'; + } + if (c2 == '}') + break; + buf[i] = c2; + } + buf[i] = '\0'; + + n = unicode_name_character (buf); + if (n != UNINAME_INVALID) + { + *backslash_counter = 0; + return n; + } + + phase1_ungetc ('}'); + while (--i >= 0) + phase1_ungetc (buf[i]); + } + phase1_ungetc (c1); + phase1_ungetc (c); + ++*backslash_counter; + return '\\'; + } + } + } + + phase1_ungetc (c); + ++*backslash_counter; + return '\\'; + } +} + + +/* Combine characters 
into tokens. Discard whitespace except newlines at + the end of logical lines. */ + +/* Number of pending open parentheses/braces/brackets. */ +static int open_pbb; + +/* Maximum used guaranteed to be < 2. */ +static token_ty phase5_pushback[2]; +static int phase5_pushback_length; + +static void +phase5_get (tp) + token_ty *tp; +{ + int c; + + if (phase5_pushback_length) + { + *tp = phase5_pushback[--phase5_pushback_length]; + return; + } + + for (;;) + { + tp->line_number = line_number; + c = phase2_getc (); + + switch (c) + { + case EOF: + tp->type = token_type_eof; + return; + + case ' ': + case '\t': + case '\f': + /* Ignore whitespace and comments. */ + continue; + + case '\n': + if (last_non_comment_line > last_comment_line) + xgettext_comment_reset (); + /* Ignore newline if and only if it is used for implicit line + joining. */ + if (open_pbb > 0) + continue; + tp->type = token_type_other; + return; + + case '.': + { + int c1 = phase2_getc (); + phase2_ungetc (c1); + if (!(c1 >= '0' && c1 <= '9')) + { + + tp->type = token_type_other; + return; + } + } + /* FALLTHROUGH */ + case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': + case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': + case 'M': case 'N': case 'O': case 'P': case 'Q': + case 'S': case 'T': case 'V': case 'W': case 'X': + case 'Y': case 'Z': + case '_': + case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': + case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': + case 'm': case 'n': case 'o': case 'p': case 'q': + case 's': case 't': case 'v': case 'w': case 'x': + case 'y': case 'z': + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + symbol: + /* Symbol, or part of a number. 
*/ + { + static char *buffer; + static int bufmax; + int bufpos; + + bufpos = 0; + for (;;) + { + if (bufpos >= bufmax) + { + bufmax += 100; + buffer = xrealloc (buffer, bufmax); + } + buffer[bufpos++] = c; + c = phase2_getc (); + switch (c) + { + case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': + case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': + case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': + case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': + case 'Y': case 'Z': + case '_': + case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': + case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': + case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': + case 's': case 't': case 'u': case 'v': case 'w': case 'x': + case 'y': case 'z': + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + continue; + default: + phase2_ungetc (c); + break; + } + break; + } + if (bufpos >= bufmax) + { + bufmax += 100; + buffer = xrealloc (buffer, bufmax); + } + buffer[bufpos] = '\0'; + tp->string = xstrdup (buffer); + tp->type = token_type_symbol; + return; + } + + /* Strings. 
*/ + { + static unsigned short *buffer; + static int bufmax; + int bufpos; + int quote_char; + bool interpret_ansic; + bool interpret_unicode; + bool triple; + unsigned int backslash_counter; + + case 'R': case 'r': + { + int c1 = phase1_getc (); + if (c1 == '"' || c1 == '\'') + { + quote_char = c1; + interpret_ansic = false; + interpret_unicode = false; + goto string; + } + phase1_ungetc (c1); + goto symbol; + } + + case 'U': case 'u': + { + int c1 = phase1_getc (); + if (c1 == '"' || c1 == '\'') + { + quote_char = c1; + interpret_ansic = true; + interpret_unicode = true; + goto string; + } + if (c1 == 'R' || c1 == 'r') + { + int c2 = phase1_getc (); + if (c2 == '"' || c2 == '\'') + { + quote_char = c2; + interpret_ansic = false; + interpret_unicode = true; + goto string; + } + phase1_ungetc (c2); + } + phase1_ungetc (c1); + goto symbol; + } + + case '"': case '\'': + quote_char = c; + interpret_ansic = true; + interpret_unicode = false; + string: + triple = false; + { + int c1 = phase1_getc (); + if (c1 == quote_char) + { + int c2 = phase1_getc (); + if (c2 == quote_char) + triple = true; + else + { + phase1_ungetc (c2); + phase1_ungetc (c1); + } + } + else + phase1_ungetc (c1); + } + backslash_counter = 0; + /* Start accumulating the string. We store the string in + UTF-16 before converting it to UTF-8. Why not converting + every character directly to UTF-8? Because a string can + contain surrogates like u"\uD800\uDF00", and we must + combine them to a single UTF-8 character. */ + bufpos = 0; + for (;;) + { + int uc = phase7_getuc (quote_char, triple, interpret_ansic, + interpret_unicode, &backslash_counter); + unsigned int len; + + if (uc == P7_EOF || uc == P7_STRING_END) + break; + + assert (uc >= 0 && uc < 0x110000); + len = (uc < 0x10000 ? 
1 : 2); + if (bufpos + len > bufmax) + { + bufmax += 100; + buffer = + xrealloc (buffer, bufmax * sizeof (unsigned short)); + } + if (uc < 0x10000) + buffer[bufpos++] = uc; + else + { + buffer[bufpos++] = 0xd800 + ((uc - 0x10000) >> 10); + buffer[bufpos++] = 0xdc00 + ((uc - 0x10000) & 0x3ff); + } + } + /* Now convert from UTF-16 to UTF-8. */ + { + int pos; + unsigned char *utf8_string; + unsigned char *q; + + /* Each UTF-16 word needs 3 bytes at worst. */ + utf8_string = (unsigned char *) xmalloc (3 * bufpos + 1); + for (pos = 0, q = utf8_string; pos < bufpos; ) + { + unsigned int uc; + int n; + + pos += u16_mbtouc (&uc, buffer + pos, bufpos - pos); + n = u8_uctomb (q, uc, 6); + assert (n > 0); + q += n; + } + *q = '\0'; + assert (q - utf8_string <= 3 * bufpos); + tp->string = (char *) utf8_string; + } + tp->type = token_type_string; + return; + } + + case '(': + open_pbb++; + tp->type = token_type_lparen; + return; + + case ')': + if (open_pbb > 0) + open_pbb--; + tp->type = token_type_rparen; + return; + + case ',': + tp->type = token_type_comma; + return; + + case '[': case '{': + open_pbb++; + tp->type = token_type_other; + return; + + case ']': case '}': + if (open_pbb > 0) + open_pbb--; + tp->type = token_type_other; + return; + + default: + /* We could carefully recognize each of the 2 and 3 character + operators, but it is not necessary, as we only need to recognize + gettext invocations. Don't bother. */ + tp->type = token_type_other; + return; + } + } +} + +static void +phase5_unget (tp) + token_ty *tp; +{ + if (tp->type != token_type_eof) + phase5_pushback[phase5_pushback_length++] = *tp; +} + + +/* Combine adjacent strings to form a single string. Note that the end + of a logical line appears as a token of its own, therefore strings that + belong to different logical lines will not be concatenated. 
*/ + +static void +x_python_lex (tp) + token_ty *tp; +{ + phase5_get (tp); + if (tp->type != token_type_string) + return; + for (;;) + { + token_ty tmp; + size_t len; + + phase5_get (&tmp); + if (tmp.type != token_type_string) + { + phase5_unget (&tmp); + return; + } + len = strlen (tp->string); + tp->string = xrealloc (tp->string, len + strlen (tmp.string) + 1); + strcpy (tp->string + len, tmp.string); + free (tmp.string); + } +} + + +/* ========================= Extracting strings. ========================== */ + +/* The file is broken into tokens. Scan the token stream, looking for + a keyword, followed by a left paren, followed by a string. When we + see this sequence, we have something to remember. We assume we are + looking at a valid Python program, and leave the complaints about + the grammar to the Python interpreter. + + Normal handling: Look for + keyword ( ... msgid ... ) + Plural handling: Look for + keyword ( ... msgid ... msgid_plural ... ) + + We use recursion because the arguments before msgid or between msgid + and msgid_plural can contain subexpressions of the same form. */ + + +/* Extract messages until the next balanced closing parenthesis. + Extracted messages are added to MLP. + When a specific argument shall be extracted, COMMAS_TO_SKIP >= 0 and, + if also a plural argument shall be extracted, PLURAL_COMMAS > 0, + otherwise PLURAL_COMMAS = 0. + When no specific argument shall be extracted, COMMAS_TO_SKIP < 0. + Return true upon eof, false upon closing parenthesis. */ +static bool +extract_parenthesized (mlp, commas_to_skip, plural_commas) + message_list_ty *mlp; + int commas_to_skip; + int plural_commas; +{ + /* Remember the message containing the msgid, for msgid_plural. */ + message_ty *plural_mp = NULL; + + /* 0 when no keyword has been seen. 1 right after a keyword is seen. */ + int state; + /* Parameters of the keyword just seen. Defined only in state 1. */ + int next_commas_to_skip = -1; + int next_plural_commas = 0; + + /* Start state is 0. 
*/ + state = 0; + + while (1) + { + token_ty token; + + x_python_lex (&token); + switch (token.type) + { + case token_type_symbol: + /* No need to bother if we extract all strings anyway. */ + if (!extract_all) + { + void *keyword_value; + + if (find_entry (&keywords, token.string, strlen (token.string), + &keyword_value) + == 0) + { + int argnum1 = (int) (long) keyword_value & ((1 << 10) - 1); + int argnum2 = (int) (long) keyword_value >> 10; + + next_commas_to_skip = argnum1 - 1; + next_plural_commas = (argnum2 > argnum1 ? argnum2 - argnum1 : 0); + state = 1; + } + else + state = 0; + } + free (token.string); + continue; + + case token_type_lparen: + /* No need to recurse if we extract all strings anyway. */ + if (extract_all) + continue; + if (state + ? extract_parenthesized (mlp, next_commas_to_skip, + next_plural_commas) + : extract_parenthesized (mlp, -1, 0)) + return true; + state = 0; + continue; + + case token_type_rparen: + /* No need to return if we extract all strings anyway. */ + if (extract_all) + continue; + return false; + + case token_type_comma: + /* No need to bother if we extract all strings anyway. */ + if (extract_all) + continue; + if (commas_to_skip >= 0) + { + if (commas_to_skip > 0) + commas_to_skip--; + else + if (plural_mp != NULL && plural_commas > 0) + { + commas_to_skip = plural_commas - 1; + plural_commas = 0; + } + else + commas_to_skip = -1; + } + state = 0; + continue; + + case token_type_string: + { + lex_pos_ty pos; + pos.file_name = logical_file_name; + pos.line_number = token.line_number; + + if (extract_all) + remember_a_message (mlp, token.string, &pos); + else + { + if (commas_to_skip == 0) + { + if (plural_mp == NULL) + { + /* Seen an msgid. */ + message_ty *mp = remember_a_message (mlp, token.string, + &pos); + if (plural_commas > 0) + plural_mp = mp; + } + else + { + /* Seen an msgid_plural. 
*/ + remember_a_message_plural (plural_mp, token.string, + &pos); + plural_mp = NULL; + } + } + else + free (token.string); + state = 0; + } + continue; + } + + case token_type_eof: + return true; + + case token_type_other: + state = 0; + continue; + + default: + abort (); + } + } +} + + +void +extract_python (f, real_filename, logical_filename, mdlp) + FILE *f; + const char *real_filename; + const char *logical_filename; + msgdomain_list_ty *mdlp; +{ + message_list_ty *mlp = mdlp->item[0]->messages; + + fp = f; + real_file_name = real_filename; + logical_file_name = xstrdup (logical_filename); + line_number = 1; + + last_comment_line = -1; + last_non_comment_line = -1; + + open_pbb = 0; + + init_keywords (); + + /* Eat tokens until eof is seen. When extract_parenthesized returns + due to an unbalanced closing parenthesis, just restart it. */ + while (!extract_parenthesized (mlp, -1, 0)) + ; + + /* We converted our strings to UTF-8 encoding. If not all the strings + were plain ASCII, set the charset in the header to UTF-8. */ + if (!is_ascii_message_list (mlp)) + { + const char *canon_utf_8 = po_charset_canonicalize ("UTF-8"); + iconv_message_list (mlp, canon_utf_8, canon_utf_8); + } + + fp = NULL; + real_file_name = NULL; + logical_file_name = NULL; + line_number = 0; +} diff --git a/src/x-python.h b/src/x-python.h new file mode 100644 index 000000000..26b55ac93 --- /dev/null +++ b/src/x-python.h @@ -0,0 +1,32 @@ +/* xgettext Python backend. + Copyright (C) 2002 Free Software Foundation, Inc. + Written by Bruno Haible , 2002. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software Foundation, + Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ + + +#define EXTENSIONS_PYTHON \ + { "py", "Python" }, \ + +#define SCANNERS_PYTHON \ + { "Python", extract_python, &formatstring_python }, \ + +/* Scan a Python file and add its translatable strings to mdlp. */ +extern void extract_python PARAMS ((FILE *fp, const char *real_filename, + const char *logical_filename, + msgdomain_list_ty *mdlp)); + +extern void x_python_keyword PARAMS ((const char *keyword)); +extern void x_python_extract_all PARAMS ((void)); diff --git a/src/xgettext.c b/src/xgettext.c index a973f5607..d288a30c8 100644 --- a/src/xgettext.c +++ b/src/xgettext.c @@ -59,6 +59,7 @@ #include "x-c.h" #include "x-po.h" +#include "x-python.h" #include "x-lisp.h" #include "x-elisp.h" #include "x-librep.h" @@ -223,6 +224,7 @@ main (argc, argv) break; case 'a': x_c_extract_all (); + x_python_extract_all (); x_lisp_extract_all (); x_elisp_extract_all (); x_librep_extract_all (); @@ -277,6 +279,7 @@ main (argc, argv) if (optarg == NULL || *optarg != '\0') { x_c_keyword (optarg); + x_python_keyword (optarg); x_lisp_keyword (optarg); x_elisp_keyword (optarg); x_librep_keyword (optarg); @@ -1244,13 +1247,13 @@ language_to_extractor (name) { SCANNERS_C SCANNERS_PO + SCANNERS_PYTHON SCANNERS_LISP SCANNERS_ELISP SCANNERS_LIBREP SCANNERS_JAVA SCANNERS_YCP SCANNERS_RST - { "Python", extract_c, &formatstring_python }, /* Here will follow more languages and their scanners: awk, perl, etc... Make sure new scanners honor the --exclude-file option. 
*/ }; @@ -1287,6 +1290,7 @@ extension_to_language (extension) { EXTENSIONS_C EXTENSIONS_PO + EXTENSIONS_PYTHON EXTENSIONS_LISP EXTENSIONS_ELISP EXTENSIONS_LIBREP diff --git a/tests/ChangeLog b/tests/ChangeLog index 309b0d917..d58af35fe 100644 --- a/tests/ChangeLog +++ b/tests/ChangeLog @@ -1,3 +1,9 @@ +2002-02-02 Bruno Haible + + * xgettext-18: New file. + * lang-python: New file. + * Makefile.am (TESTS): Add xgettext-18, lang-python. + 2002-02-02 Bruno Haible * Makefile.am (INCLUDES): Add -I../lib. Needed for builds with diff --git a/tests/Makefile.am b/tests/Makefile.am index e4c299fc0..e7e7e335c 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -45,6 +45,7 @@ TESTS = gettext-1 gettext-2 \ xgettext-1 xgettext-2 xgettext-3 xgettext-4 xgettext-5 xgettext-6 \ xgettext-7 xgettext-8 xgettext-9 xgettext-10 xgettext-11 xgettext-12 \ xgettext-13 xgettext-14 xgettext-15 xgettext-16 xgettext-17 \ + xgettext-18 \ format-c-1 format-c-2 \ format-elisp-1 format-elisp-2 \ format-java-1 format-java-2 \ @@ -54,7 +55,7 @@ TESTS = gettext-1 gettext-2 \ format-pascal-1 format-pascal-2 \ format-ycp-1 format-ycp-2 \ plural-1 plural-2 \ - lang-c lang-c++ lang-objc lang-clisp lang-elisp lang-librep lang-java lang-pascal lang-ycp lang-po lang-rst \ + lang-c lang-c++ lang-objc lang-python lang-clisp lang-elisp lang-librep lang-java lang-pascal lang-ycp lang-po lang-rst \ rpath-1a rpath-1b \ rpath-2aaa rpath-2aab rpath-2aac rpath-2aad \ rpath-2aba rpath-2abb rpath-2abc rpath-2abd \ diff --git a/tests/lang-python b/tests/lang-python new file mode 100755 index 000000000..5d1fd0c27 --- /dev/null +++ b/tests/lang-python @@ -0,0 +1,87 @@ +#! /bin/sh + +# Test of gettext facilities in the Python language. 
+ +tmpfiles="" +trap 'rm -fr $tmpfiles' 1 2 3 15 + +tmpfiles="$tmpfiles prog.py" +cat <<\EOF > prog.py +import gettext + +gettext.textdomain('prog') +gettext.bindtextdomain('prog', '.') + +print gettext.gettext("'Your command, please?', asked the waiter.") +print gettext.gettext("%(oldCurrency)s is replaced by %(newCurrency)s.") \ + % { 'oldCurrency': "FF", 'newCurrency' : "EUR" } +EOF + +tmpfiles="$tmpfiles prog.pot" +: ${XGETTEXT=xgettext} +${XGETTEXT} -o prog.pot --omit-header --no-location prog.py + +tmpfiles="$tmpfiles prog.ok" +cat <<EOF > prog.ok +msgid "'Your command, please?', asked the waiter." +msgstr "" + +#, python-format +msgid "%(oldCurrency)s is replaced by %(newCurrency)s." +msgstr "" +EOF + +: ${DIFF=diff} +${DIFF} prog.ok prog.pot || exit 1 + +tmpfiles="$tmpfiles fr.po" +cat <<\EOF > fr.po +msgid "" +msgstr "" +"Content-Type: text/plain; charset=ISO-8859-1\n" +"Plural-Forms: nplurals=2; plural=(n > 1);\n" + +msgid "'Your command, please?', asked the waiter." +msgstr "«Votre commande, s'il vous plait», dit le garçon." + +# Reverse the arguments. +#, python-format +msgid "%(oldCurrency)s is replaced by %(newCurrency)s." +msgstr "%(newCurrency)s remplace %(oldCurrency)s." +EOF + +tmpfiles="$tmpfiles fr.po.new" +: ${MSGMERGE=msgmerge} +${MSGMERGE} -q -o fr.po.new fr.po prog.pot + +: ${DIFF=diff} +${DIFF} fr.po fr.po.new || exit 1 + +tmpfiles="$tmpfiles fr" +test -d fr || mkdir fr +test -d fr/LC_MESSAGES || mkdir fr/LC_MESSAGES + +: ${MSGFMT=msgfmt} +${MSGFMT} -o fr/LC_MESSAGES/prog.mo fr.po + +tmpfiles="$tmpfiles prog.ok prog.out" +: ${DIFF=diff} +cat <<\EOF > prog.ok +«Votre commande, s'il vous plait», dit le garçon. +EUR remplace FF. +EOF + +# Test for presence of python version 2.0 or newer.
+(python -V) >/dev/null 2>/dev/null \ + || { echo "SKIP: lang-python"; rm -fr $tmpfiles; exit 77; } +case `python -c 'import sys; print sys.hexversion >= 0x20000F0'` in + 1) ;; + *) echo "SKIP: lang-python"; rm -fr $tmpfiles; exit 77;; +esac + +LANGUAGE= LC_ALL=fr_FR python prog.py > prog.out || exit 1 +${DIFF} prog.ok prog.out || exit 1 + +rm -fr $tmpfiles + +exit 0 diff --git a/tests/xgettext-18 b/tests/xgettext-18 new file mode 100755 index 000000000..f285adaf0 --- /dev/null +++ b/tests/xgettext-18 @@ -0,0 +1,84 @@ +#!/bin/sh + +# Test of Python support. + +tmpfiles="" +trap 'rm -fr $tmpfiles' 1 2 3 15 + +tmpfiles="$tmpfiles xg-test18.py" +cat <<\EOF > xg-test18.py +# interpret_ansic = true, interpret_unicode = false +_("abc\ +\\def\'ghi\"jkl\a\b\f\n\r\t\v x\040x\x7ey\u0142\U00010123\N{LATIN SMALL LETTER Z}"); + +# interpret_ansic = false, interpret_unicode = false +_(r"abc\ +\\def\'ghi\"jkl\a\b\f\n\r\t\v x\040x\x7ey\u0142\U00010123\N{LATIN SMALL LETTER Z}"); + +# interpret_ansic = true, interpret_unicode = true +_(u"abc\ +\\def\'ghi\"jkl\a\b\f\n\r\t\v x\040x\x7ey\u0142\U00010123\N{LATIN SMALL LETTER Z}"); + +# interpret_ansic = false, interpret_unicode = true +_(ur"abc\ +\\def\'ghi\"jkl\a\b\f\n\r\t\v x\040x\x7ey\u0142\U00010123\N{LATIN SMALL LETTER Z}"); +EOF + +tmpfiles="$tmpfiles xg-test18.err xg-test18.tmp xg-test18.pot" +: ${XGETTEXT=xgettext} +${XGETTEXT} --add-comments --no-location -o xg-test18.tmp xg-test18.py 2>xg-test18.err +test $? = 0 || { cat xg-test18.err; rm -fr $tmpfiles; exit 1; } +grep -v 'POT-Creation-Date' < xg-test18.tmp > xg-test18.pot + +tmpfiles="$tmpfiles xg-test18.ok" +cat <<\EOF > xg-test18.ok +# SOME DESCRIPTIVE TITLE. +# Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER +# This file is distributed under the same license as the PACKAGE package. +# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: PACKAGE VERSION\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n" +"Language-Team: LANGUAGE <LL@li.org>\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=UTF-8\n" +"Content-Transfer-Encoding: 8bit\n" + +#. interpret_ansic = true, interpret_unicode = false +msgid "" +"abc\\def'ghi\"jkl\b\f\n" +"\r\t x x~y\\u0142\\U00010123\\N{LATIN SMALL LETTER Z}" +msgstr "" + +#. interpret_ansic = false, interpret_unicode = false +msgid "" +"abc\\\n" +"\\\\def\\'ghi\\\"jkl\\a\\b\\f\\n\\r\\t\\v x\\040x\\x7ey\\u0142\\U00010123\\N" +"{LATIN SMALL LETTER Z}" +msgstr "" + +#. interpret_ansic = true, interpret_unicode = true +msgid "" +"abc\\def'ghi\"jkl\b\f\n" +"\r\t x x~ył𐄣z" +msgstr "" + +#. interpret_ansic = false, interpret_unicode = true +msgid "" +"abc\\\n" +"\\\\def\\'ghi\\\"jkl\\a\\b\\f\\n\\r\\t\\v x\\040x\\x7eył\\U00010123\\N{LATIN " +"SMALL LETTER Z}" +msgstr "" +EOF + +: ${DIFF=diff} +${DIFF} xg-test18.ok xg-test18.pot +result=$? + +rm -fr $tmpfiles + +exit $result