From: Bruno Haible Date: Mon, 19 Aug 2002 11:02:03 +0000 (+0000) Subject: Support for PHP. X-Git-Tag: v0.12~1288 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=012803fbf11d258e87f66b17b390c9bd7ee60ccf;p=thirdparty%2Fgettext.git Support for PHP. --- diff --git a/doc/ChangeLog b/doc/ChangeLog index 3786a4664..4a62043a0 100644 --- a/doc/ChangeLog +++ b/doc/ChangeLog @@ -1,3 +1,13 @@ +2002-08-18 Bruno Haible + + * gettext.texi (php-format): New subsection. + (PHP): Update. + * xgettext.texi: Mention language PHP. + +2002-08-06 Bruno Haible + + * gettext-0.11.5 released. + 2002-07-16 Bruno Haible * matrix.texi: Update. diff --git a/doc/gettext.texi b/doc/gettext.texi index 1ac67b263..85d835ca9 100644 --- a/doc/gettext.texi +++ b/doc/gettext.texi @@ -318,6 +318,7 @@ The Translator's View * object-pascal-format:: Object Pascal Format Strings * ycp-format:: YCP Format Strings * tcl-format:: Tcl Format Strings +* php-format:: PHP Format Strings Individual Programming Languages @@ -6722,6 +6723,7 @@ strings. * object-pascal-format:: Object Pascal Format Strings * ycp-format:: YCP Format Strings * tcl-format:: Tcl Format Strings +* php-format:: PHP Format Strings @end menu @node c-format, python-format, Translators for other Languages, Translators for other Languages @@ -6808,12 +6810,18 @@ YCP sformat strings are described in the libycp documentation In summary, a directive starts with @samp{%} and is followed by @samp{%} or a nonzero digit (@samp{1} to @samp{9}). -@node tcl-format, , ycp-format, Translators for other Languages +@node tcl-format, php-format, ycp-format, Translators for other Languages @subsection Tcl Format Strings Tcl format strings are described in the @file{format.n} manual page, @uref{http://www.scriptics.com/man/tcl8.3/TclCmd/format.htm}. 
+@node php-format, , tcl-format, Translators for other Languages +@subsection PHP Format Strings + +PHP format strings are described in the documentation of the PHP function +@code{sprintf}, in @file{phpdoc/manual/function.sprintf.html}. + @node Maintainers for other Languages, List of Programming Languages, Translators for other Languages, Programming Languages @section The Maintainer's View @@ -6849,7 +6857,7 @@ that language, and to combine the resulting files using @code{msgcat}. @c Perl 1911 @c C++ 1379 * @c Java 1200 * -@c PHP 1051 +@c PHP 1051 * @c Python 613 * @c Unix Shell 357 @c Tcl 266 * @@ -7792,13 +7800,13 @@ use @table @asis @item RPMs -mod_php4, phplib, phpdoc +mod_php4, mod_php4-core, phplib, phpdoc @item File extension @code{php}, @code{php3}, @code{php4} @item String syntax -@code{"abc"} +@code{"abc"}, @code{'abc'} @item gettext shorthand @code{_("abc")} @@ -7813,7 +7821,7 @@ mod_php4, phplib, phpdoc @code{bindtextdomain} function @item setlocale -@code{setlocale} function +Programmer must call @code{setlocale (LC_ALL, "")} @item Prerequisite --- @@ -7822,10 +7830,10 @@ mod_php4, phplib, phpdoc use @item Extractor ---- +@code{xgettext} @item Formatting with positions ---- +@code{printf "%2\$d %1\$d"} @item Portability On platforms without gettext, the functions are not available. diff --git a/doc/xgettext.texi b/doc/xgettext.texi index 9143679f8..49e3c2083 100644 --- a/doc/xgettext.texi +++ b/doc/xgettext.texi @@ -71,7 +71,7 @@ is written to standard output. Specifies the language of the input files. The supported languages are @code{C}, @code{C++}, @code{ObjectiveC}, @code{PO}, @code{Python}, @code{Lisp}, @code{EmacsLisp}, @code{librep}, @code{Java}, @code{awk}, -@code{YCP}, @code{Tcl}, @code{RST}, @code{Glade}. +@code{YCP}, @code{Tcl}, @code{PHP}, @code{RST}, @code{Glade}. 
@item -C @itemx --c++ diff --git a/src/ChangeLog b/src/ChangeLog index 325e22160..326e23ab1 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,22 @@ +2002-08-18 Bruno Haible + + * message.h (enum format_type): New enum value 'format_php'. + (NFORMATS): Increment. + * message.c (format_language, format_language_pretty): Add entry + for php. + * format.h (formatstring_php): New declaration. + * format-php.c: New file. + * format.c (formatstring_parsers): Add entry for php. + * x-php.h: New file. + * x-php.c: New file. + * xgettext.c: Include x-php.c. + (main): Call x_php_extract_all, x_php_keyword. + (language_to_scanner): Add PHP rule. + (extension_to_language): Add PHP rule. + * Makefile.am (noinst_HEADERS): Add x-php.h. + (FORMAT_SOURCE): Add format-php.c. + (xgettext_SOURCES): Add x-php.c. + 2002-08-17 Bruno Haible * urlget.c (fetch): Also try invoking the 'curl' program. diff --git a/src/Makefile.am b/src/Makefile.am index 613b5865f..1ad2ad059 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -35,7 +35,7 @@ po-gram-gen.h po-hash-gen.h msgl-charset.h msgl-equal.h msgl-iconv.h \ msgl-ascii.h msgl-cat.h msgl-english.h msgfmt.h msgunfmt.h read-mo.h \ write-mo.h read-java.h write-java.h read-tcl.h write-tcl.h po-time.h \ plural-table.h format.h xgettext.h x-c.h x-po.h x-python.h x-lisp.h \ -x-elisp.h x-librep.h x-java.h x-awk.h x-ycp.h x-tcl.h x-rst.h x-glade.h +x-elisp.h x-librep.h x-java.h x-awk.h x-ycp.h x-tcl.h x-php.h x-rst.h x-glade.h EXTRA_DIST = FILES project-id msgunfmt.tcl \ gnu/gettext/DumpResource.java gnu/gettext/GetURL.java @@ -74,7 +74,8 @@ open-po.c dir-list.c str-list.c # xgettext and msgfmt deal with format strings. 
FORMAT_SOURCE = format.c \ format-c.c format-python.c format-lisp.c format-elisp.c format-librep.c \ -format-java.c format-awk.c format-pascal.c format-ycp.c format-tcl.c +format-java.c format-awk.c format-pascal.c format-ycp.c format-tcl.c \ +format-php.c # libgettextsrc contains all code that is needed by at least two programs. libgettextsrc_la_SOURCES = \ @@ -94,7 +95,7 @@ msgmerge_SOURCES = msgmerge.c msgunfmt_SOURCES = msgunfmt.c read-mo.c read-java.c read-tcl.c xgettext_SOURCES = xgettext.c \ x-c.c x-po.c x-python.c x-lisp.c x-elisp.c x-librep.c x-java.l x-awk.c \ - x-ycp.c x-tcl.c x-rst.c x-glade.c + x-ycp.c x-tcl.c x-php.c x-rst.c x-glade.c msgattrib_SOURCES = msgattrib.c msgcat_SOURCES = msgcat.c msgcomm_SOURCES = msgcomm.c diff --git a/src/format-php.c b/src/format-php.c new file mode 100644 index 000000000..9ca6ef0c8 --- /dev/null +++ b/src/format-php.c @@ -0,0 +1,497 @@ +/* PHP format strings. + Copyright (C) 2001-2002 Free Software Foundation, Inc. + Written by Bruno Haible , 2002. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software Foundation, + Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
*/ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include + +#include "format.h" +#include "xmalloc.h" +#include "error.h" +#include "progname.h" +#include "gettext.h" + +#define _(str) gettext (str) + +/* PHP format strings are described in phpdoc-4.0.6, file + phpdoc/manual/function.sprintf.html, and are implemented in + php-4.1.0/ext/standard/formatted_print.c. + A directive + - starts with '%' or '%m$' where m is a positive integer, + - is optionally followed by any of the characters '0', '-', ' ', or + "'", each of which acts as a flag, + - is optionally followed by a width specification: a nonempty digit + sequence, + - is optionally followed by '.' and a precision specification: a nonempty + digit sequence, + - is optionally followed by a size specifier 'l', which is ignored, + - is finished by a specifier + - 's', that needs a string argument, + - 'b', 'd', 'u', 'o', 'x', 'X', that need an integer argument, + - 'e', 'f', that need a floating-point argument, + - 'c', that needs a character argument. + Additionally there is the directive '%%', which takes no argument. + Numbered and unnumbered argument specifications can be used in the same + string. Numbered argument specifications have no influence on the + "current argument index", that is incremented each time an argument is read. + */ + +enum format_arg_type +{ + FAT_INTEGER, + FAT_FLOAT, + FAT_CHARACTER, + FAT_STRING +}; + +struct numbered_arg +{ + unsigned int number; + enum format_arg_type type; +}; + +struct spec +{ + unsigned int directives; + unsigned int numbered_arg_count; + unsigned int allocated; + struct numbered_arg *numbered; +}; + +/* Locale independent test for a decimal digit. + Argument can be 'char' or 'unsigned char'. (Whereas the argument of + isdigit must be an 'unsigned char'.) */ +#undef isdigit +#define isdigit(c) ((unsigned int) ((c) - '0') < 10) + + +/* Prototypes for local functions. 
Needed to ensure compiler checking of + function argument counts despite of K&R C function definition syntax. */ +static int numbered_arg_compare PARAMS ((const void *p1, const void *p2)); +static void *format_parse PARAMS ((const char *format)); +static void format_free PARAMS ((void *descr)); +static int format_get_number_of_directives PARAMS ((void *descr)); +static bool format_check PARAMS ((const lex_pos_ty *pos, + void *msgid_descr, void *msgstr_descr, + bool equality, + bool noisy, const char *pretty_msgstr)); + + +static int +numbered_arg_compare (p1, p2) + const void *p1; + const void *p2; +{ + unsigned int n1 = ((const struct numbered_arg *) p1)->number; + unsigned int n2 = ((const struct numbered_arg *) p2)->number; + + return (n1 > n2 ? 1 : n1 < n2 ? -1 : 0); +} + +static void * +format_parse (format) + const char *format; +{ + unsigned int directives; + unsigned int numbered_arg_count; + unsigned int allocated; + struct numbered_arg *numbered; + unsigned int unnumbered_arg_count; + struct spec *result; + + directives = 0; + numbered_arg_count = 0; + allocated = 0; + numbered = NULL; + unnumbered_arg_count = 0; + + for (; *format != '\0';) + if (*format++ == '%') + { + /* A directive. */ + directives++; + + if (*format != '%') + { + /* A complex directive. */ + unsigned int number; + enum format_arg_type type; + + number = ++unnumbered_arg_count; + if (isdigit (*format)) + { + const char *f = format; + unsigned int m = 0; + + do + { + m = 10 * m + (*f - '0'); + f++; + } + while (isdigit (*f)); + + if (*f == '$') + { + if (m == 0) + goto bad_format; + number = m; + format = ++f; + --unnumbered_arg_count; + } + } + + /* Parse flags. */ + for (;;) + { + if (*format == '0' || *format == '-' || *format == ' ') + format++; + else if (*format == '\'') + { + format++; + if (*format == '\0') + goto bad_format; + format++; + } + else + break; + } + + /* Parse width. 
*/ + if (isdigit (*format)) + { + do + format++; + while (isdigit (*format)); + } + + /* Parse precision. */ + if (*format == '.') + { + format++; + + if (isdigit (*format)) + { + do + format++; + while (isdigit (*format)); + } + else + --format; /* will jump to bad_format */ + } + + /* Parse size. */ + if (*format == 'l') + format++; + + switch (*format) + { + case 'b': case 'd': case 'u': case 'o': case 'x': case 'X': + type = FAT_INTEGER; + break; + case 'e': case 'f': + type = FAT_FLOAT; + break; + case 'c': + type = FAT_CHARACTER; + break; + case 's': + type = FAT_STRING; + break; + default: + goto bad_format; + } + + if (allocated == numbered_arg_count) + { + allocated = 2 * allocated + 1; + numbered = (struct numbered_arg *) xrealloc (numbered, allocated * sizeof (struct numbered_arg)); + } + numbered[numbered_arg_count].number = number; + numbered[numbered_arg_count].type = type; + numbered_arg_count++; + } + + format++; + } + + /* Sort the numbered argument array, and eliminate duplicates. */ + if (numbered_arg_count > 1) + { + unsigned int i, j; + bool err; + + qsort (numbered, numbered_arg_count, + sizeof (struct numbered_arg), numbered_arg_compare); + + /* Remove duplicates: Copy from i to j, keeping 0 <= j <= i. */ + err = false; + for (i = j = 0; i < numbered_arg_count; i++) + if (j > 0 && numbered[i].number == numbered[j-1].number) + { + enum format_arg_type type1 = numbered[i].type; + enum format_arg_type type2 = numbered[j-1].type; + enum format_arg_type type_both; + + if (type1 == type2) + type_both = type1; + else + /* Incompatible types. 
*/ + type_both = type1, err = true; + + numbered[j-1].type = type_both; + } + else + { + if (j < i) + { + numbered[j].number = numbered[i].number; + numbered[j].type = numbered[i].type; + } + j++; + } + numbered_arg_count = j; + if (err) + goto bad_format; + } + + result = (struct spec *) xmalloc (sizeof (struct spec)); + result->directives = directives; + result->numbered_arg_count = numbered_arg_count; + result->allocated = allocated; + result->numbered = numbered; + return result; + + bad_format: + if (numbered != NULL) + free (numbered); + return NULL; +} + +static void +format_free (descr) + void *descr; +{ + struct spec *spec = (struct spec *) descr; + + if (spec->numbered != NULL) + free (spec->numbered); + free (spec); +} + +static int +format_get_number_of_directives (descr) + void *descr; +{ + struct spec *spec = (struct spec *) descr; + + return spec->directives; +} + +static bool +format_check (pos, msgid_descr, msgstr_descr, equality, noisy, pretty_msgstr) + const lex_pos_ty *pos; + void *msgid_descr; + void *msgstr_descr; + bool equality; + bool noisy; + const char *pretty_msgstr; +{ + struct spec *spec1 = (struct spec *) msgid_descr; + struct spec *spec2 = (struct spec *) msgstr_descr; + bool err = false; + + if (spec1->numbered_arg_count + spec2->numbered_arg_count > 0) + { + unsigned int i, j; + unsigned int n1 = spec1->numbered_arg_count; + unsigned int n2 = spec2->numbered_arg_count; + + /* Check the argument names are the same. + Both arrays are sorted. We search for the first difference. */ + for (i = 0, j = 0; i < n1 || j < n2; ) + { + int cmp = (i >= n1 ? 1 : + j >= n2 ? -1 : + spec1->numbered[i].number > spec2->numbered[j].number ? 1 : + spec1->numbered[i].number < spec2->numbered[j].number ? 
-1 : + 0); + + if (cmp > 0) + { + if (noisy) + { + error_with_progname = false; + error_at_line (0, 0, pos->file_name, pos->line_number, + _("a format specification for argument %u, as in '%s', doesn't exist in 'msgid'"), + spec2->numbered[j].number, pretty_msgstr); + error_with_progname = true; + } + err = true; + break; + } + else if (cmp < 0) + { + if (equality) + { + if (noisy) + { + error_with_progname = false; + error_at_line (0, 0, pos->file_name, pos->line_number, + _("a format specification for argument %u doesn't exist in '%s'"), + spec1->numbered[i].number, pretty_msgstr); + error_with_progname = true; + } + err = true; + break; + } + else + i++; + } + else + j++, i++; + } + /* Check the argument types are the same. */ + if (!err) + for (i = 0, j = 0; j < n2; ) + { + if (spec1->numbered[i].number == spec2->numbered[j].number) + { + if (spec1->numbered[i].type != spec2->numbered[j].type) + { + if (noisy) + { + error_with_progname = false; + error_at_line (0, 0, pos->file_name, pos->line_number, + _("format specifications in 'msgid' and '%s' for argument %u are not the same"), + pretty_msgstr, + spec2->numbered[j].number); + error_with_progname = true; + } + err = true; + break; + } + j++, i++; + } + else + i++; + } + } + + return err; +} + + +struct formatstring_parser formatstring_php = +{ + format_parse, + format_free, + format_get_number_of_directives, + format_check +}; + + +#ifdef TEST + +/* Test program: Print the argument list specification returned by + format_parse for strings read from standard input. 
*/ + +#include +#include "getline.h" + +static void +format_print (descr) + void *descr; +{ + struct spec *spec = (struct spec *) descr; + unsigned int last; + unsigned int i; + + if (spec == NULL) + { + printf ("INVALID"); + return; + } + + printf ("("); + last = 1; + for (i = 0; i < spec->numbered_arg_count; i++) + { + unsigned int number = spec->numbered[i].number; + + if (i > 0) + printf (" "); + if (number < last) + abort (); + for (; last < number; last++) + printf ("_ "); + switch (spec->numbered[i].type) + { + case FAT_INTEGER: + printf ("i"); + break; + case FAT_FLOAT: + printf ("f"); + break; + case FAT_CHARACTER: + printf ("c"); + break; + case FAT_STRING: + printf ("s"); + break; + default: + abort (); + } + last = number + 1; + } + printf (")"); +} + +int +main () +{ + for (;;) + { + char *line = NULL; + size_t line_len = 0; + void *descr; + + if (getline (&line, &line_len, stdin) < 0) + break; + + descr = format_parse (line); + + format_print (descr); + printf ("\n"); + + free (line); + } + + return 0; +} + +/* + * For Emacs M-x compile + * Local Variables: + * compile-command: "/bin/sh ../libtool --mode=link gcc -o a.out -static -O -g -Wall -I.. 
-I../lib -I../intl -DHAVE_CONFIG_H -DTEST format-php.c ../lib/libgettextlib.la" + * End: + */ + +#endif /* TEST */ diff --git a/src/format.c b/src/format.c index a981eb787..00660a780 100644 --- a/src/format.c +++ b/src/format.c @@ -36,5 +36,6 @@ struct formatstring_parser *formatstring_parsers[NFORMATS] = /* format_awk */ &formatstring_awk, /* format_pascal */ &formatstring_pascal, /* format_ycp */ &formatstring_ycp, - /* format_tcl */ &formatstring_tcl + /* format_tcl */ &formatstring_tcl, + /* format_php */ &formatstring_php }; diff --git a/src/format.h b/src/format.h index f19634f5c..682744a53 100644 --- a/src/format.h +++ b/src/format.h @@ -65,6 +65,7 @@ extern struct formatstring_parser formatstring_awk; extern struct formatstring_parser formatstring_pascal; extern struct formatstring_parser formatstring_ycp; extern struct formatstring_parser formatstring_tcl; +extern struct formatstring_parser formatstring_php; /* Table of all format string parsers. */ extern struct formatstring_parser *formatstring_parsers[NFORMATS]; diff --git a/src/message.c b/src/message.c index 9ffcf3031..bb0f84bea 100644 --- a/src/message.c +++ b/src/message.c @@ -50,7 +50,8 @@ const char *const format_language[NFORMATS] = /* format_awk */ "awk", /* format_pascal */ "object-pascal", /* format_ycp */ "ycp", - /* format_tcl */ "tcl" + /* format_tcl */ "tcl", + /* format_php */ "php" }; const char *const format_language_pretty[NFORMATS] = @@ -65,7 +66,8 @@ const char *const format_language_pretty[NFORMATS] = /* format_awk */ "awk", /* format_pascal */ "Object Pascal", /* format_ycp */ "YCP", - /* format_tcl */ "Tcl" + /* format_tcl */ "Tcl", + /* format_php */ "PHP" }; diff --git a/src/message.h b/src/message.h index abe361f7d..ffec3ee08 100644 --- a/src/message.h +++ b/src/message.h @@ -44,9 +44,10 @@ enum format_type format_awk, format_pascal, format_ycp, - format_tcl + format_tcl, + format_php }; -#define NFORMATS 11 /* Number of format_type enum values. 
*/ +#define NFORMATS 12 /* Number of format_type enum values. */ extern const char *const format_language[NFORMATS]; extern const char *const format_language_pretty[NFORMATS]; diff --git a/src/x-php.c b/src/x-php.c new file mode 100644 index 000000000..63a61078d --- /dev/null +++ b/src/x-php.c @@ -0,0 +1,1376 @@ +/* xgettext PHP backend. + Copyright (C) 2001-2002 Free Software Foundation, Inc. + + This file was written by Bruno Haible , 2002. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software Foundation, + Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include +#include +#include +#include + +#include "message.h" +#include "x-php.h" +#include "xgettext.h" +#include "error.h" +#include "xmalloc.h" +#include "exit.h" +#include "gettext.h" + +#define _(s) gettext(s) + + +/* The PHP syntax is defined in phpdoc/manual/langref.html. + See also php-4.1.0/Zend/zend_language_scanner.l. */ + +enum token_type_ty +{ + token_type_eof, + token_type_lparen, /* ( */ + token_type_rparen, /* ) */ + token_type_comma, /* , */ + token_type_string_literal, /* "abc" */ + token_type_symbol, /* symbol, number */ + token_type_other /* misc. 
operator */ +}; +typedef enum token_type_ty token_type_ty; + +typedef struct token_ty token_ty; +struct token_ty +{ + token_type_ty type; + char *string; /* for token_type_string_literal, token_type_symbol */ + int line_number; +}; + + +/* Prototypes for local functions. Needed to ensure compiler checking of + function argument counts despite of K&R C function definition syntax. */ +static void init_keywords PARAMS ((void)); +static int phase1_getc PARAMS ((void)); +static void phase1_ungetc PARAMS ((int c)); +static void skip_html PARAMS ((void)); +#if 0 +static int phase2_getc PARAMS ((void)); +static void phase2_ungetc PARAMS ((int c)); +#endif +static inline void comment_start PARAMS ((void)); +static inline void comment_add PARAMS ((int c)); +static inline void comment_line_end PARAMS ((size_t chars_to_remove)); +static int phase3_getc PARAMS ((void)); +static void phase3_ungetc PARAMS ((int c)); +static inline void free_token PARAMS ((token_ty *tp)); +static void x_php_lex PARAMS ((token_ty *tp)); +static bool extract_parenthesized PARAMS ((message_list_ty *mlp, + int commas_to_skip, + int plural_commas)); + + +/* ====================== Keyword set customization. ====================== */ + +/* If true extract all strings. */ +static bool extract_all = false; + +static hash_table keywords; +static bool default_keywords = true; + + +void +x_php_extract_all () +{ + extract_all = true; +} + + +void +x_php_keyword (name) + const char *name; +{ + if (name == NULL) + default_keywords = false; + else + { + const char *end; + int argnum1; + int argnum2; + const char *colon; + + if (keywords.table == NULL) + init_hash (&keywords, 100); + + split_keywordspec (name, &end, &argnum1, &argnum2); + + /* The characters between name and end should form a valid C identifier. + A colon means an invalid parse in split_keywordspec(). 
*/ + colon = strchr (name, ':'); + if (colon == NULL || colon >= end) + { + if (argnum1 == 0) + argnum1 = 1; + insert_entry (&keywords, name, end - name, + (void *) (long) (argnum1 + (argnum2 << 10))); + } + } +} + +/* Finish initializing the keywords hash table. + Called after argument processing, before each file is processed. */ +static void +init_keywords () +{ + if (default_keywords) + { + x_php_keyword ("_"); + x_php_keyword ("gettext"); + x_php_keyword ("dgettext:2"); + x_php_keyword ("dcgettext:2"); + default_keywords = false; + } +} + + +/* ======================== Reading of characters. ======================== */ + + +/* Real filename, used in error messages about the input file. */ +static const char *real_file_name; + +/* Logical filename and line number, used to label the extracted messages. */ +static char *logical_file_name; +static int line_number; + +/* The input file stream. */ +static FILE *fp; + + +/* 1. line_number handling. */ + +/* Maximum used guaranteed to be < 4. */ +static unsigned char phase1_pushback[4]; +static int phase1_pushback_length; + +static int +phase1_getc () +{ + int c; + + if (phase1_pushback_length) + c = phase1_pushback[--phase1_pushback_length]; + else + { + c = getc (fp); + + if (c == EOF) + { + if (ferror (fp)) + error (EXIT_FAILURE, errno, _("error while reading \"%s\""), + real_file_name); + return EOF; + } + } + + if (c == '\n') + line_number++; + + return c; +} + +static void +phase1_ungetc (c) + int c; +{ + if (c != EOF) + { + if (c == '\n') + --line_number; + + phase1_pushback[phase1_pushback_length++] = c; + } +} + + +/* 2. Ignore HTML sections. They are equivalent to PHP echo commands and + therefore don't contain translatable strings. 
*/ + +static void +skip_html () +{ + for (;;) + { + int c = phase1_getc (); + + if (c == EOF) + return; + + if (c == '<') + { + int c2 = phase1_getc (); + + if (c2 == EOF) + break; + + if (c2 == '?') + { + /* + < script language = "php" > + < script language = 'php' > + are always recognized. */ + while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r') + c2 = phase1_getc (); + if (c2 != 's' && c2 != 'S') + { + phase1_ungetc (c2); + continue; + } + c2 = phase1_getc (); + if (c2 != 'c' && c2 != 'C') + { + phase1_ungetc (c2); + continue; + } + c2 = phase1_getc (); + if (c2 != 'r' && c2 != 'R') + { + phase1_ungetc (c2); + continue; + } + c2 = phase1_getc (); + if (c2 != 'i' && c2 != 'I') + { + phase1_ungetc (c2); + continue; + } + c2 = phase1_getc (); + if (c2 != 'p' && c2 != 'P') + { + phase1_ungetc (c2); + continue; + } + c2 = phase1_getc (); + if (c2 != 't' && c2 != 'T') + { + phase1_ungetc (c2); + continue; + } + c2 = phase1_getc (); + if (!(c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r')) + { + phase1_ungetc (c2); + continue; + } + do + c2 = phase1_getc (); + while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r'); + if (c2 != 'l' && c2 != 'L') + { + phase1_ungetc (c2); + continue; + } + c2 = phase1_getc (); + if (c2 != 'a' && c2 != 'A') + { + phase1_ungetc (c2); + continue; + } + c2 = phase1_getc (); + if (c2 != 'n' && c2 != 'N') + { + phase1_ungetc (c2); + continue; + } + c2 = phase1_getc (); + if (c2 != 'g' && c2 != 'G') + { + phase1_ungetc (c2); + continue; + } + c2 = phase1_getc (); + if (c2 != 'u' && c2 != 'U') + { + phase1_ungetc (c2); + continue; + } + c2 = phase1_getc (); + if (c2 != 'a' && c2 != 'A') + { + phase1_ungetc (c2); + continue; + } + c2 = phase1_getc (); + if (c2 != 'g' && c2 != 'G') + { + phase1_ungetc (c2); + continue; + } + c2 = phase1_getc (); + if (c2 != 'e' && c2 != 'E') + { + phase1_ungetc (c2); + continue; + } + c2 = phase1_getc (); + while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r') + c2 = phase1_getc (); + if 
(c2 != '=') + { + phase1_ungetc (c2); + continue; + } + c2 = phase1_getc (); + while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r') + c2 = phase1_getc (); + if (c2 == '"') + { + c2 = phase1_getc (); + if (c2 != 'p') + { + phase1_ungetc (c2); + continue; + } + c2 = phase1_getc (); + if (c2 != 'h') + { + phase1_ungetc (c2); + continue; + } + c2 = phase1_getc (); + if (c2 != 'p') + { + phase1_ungetc (c2); + continue; + } + c2 = phase1_getc (); + if (c2 != '"') + { + phase1_ungetc (c2); + continue; + } + } + else if (c2 == '\'') + { + c2 = phase1_getc (); + if (c2 != 'p') + { + phase1_ungetc (c2); + continue; + } + c2 = phase1_getc (); + if (c2 != 'h') + { + phase1_ungetc (c2); + continue; + } + c2 = phase1_getc (); + if (c2 != 'p') + { + phase1_ungetc (c2); + continue; + } + c2 = phase1_getc (); + if (c2 != '\'') + { + phase1_ungetc (c2); + continue; + } + } + else + { + if (c2 != 'p') + { + phase1_ungetc (c2); + continue; + } + c2 = phase1_getc (); + if (c2 != 'h') + { + phase1_ungetc (c2); + continue; + } + c2 = phase1_getc (); + if (c2 != 'p') + { + phase1_ungetc (c2); + continue; + } + } + c2 = phase1_getc (); + while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r') + c2 = phase1_getc (); + if (c2 != '>') + { + phase1_ungetc (c2); + continue; + } + return; + } + } +} + +#if 0 + +static unsigned char phase2_pushback[1]; +static int phase2_pushback_length; + +static int +phase2_getc () +{ + int c; + + if (phase2_pushback_length) + return phase2_pushback[--phase2_pushback_length]; + + c = phase1_getc (); + switch (c) + { + case '?': + case '%': + { + int c2 = phase1_getc (); + if (c2 == '>') + { + /* ?> and %> terminate PHP mode and switch back to HTML mode. */ + skip_html (); + return ' '; + } + phase1_ungetc (c2); + } + break; + + case '<': + { + int c2 = phase1_getc (); + + /* < / script > terminates PHP mode and switches back to HTML mode. 
*/ + while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r') + c2 = phase1_getc (); + if (c2 == '/') + { + do + c2 = phase1_getc (); + while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r'); + if (c2 == 's' || c2 == 'S') + { + c2 = phase1_getc (); + if (c2 == 'c' || c2 == 'C') + { + c2 = phase1_getc (); + if (c2 == 'r' || c2 == 'R') + { + c2 = phase1_getc (); + if (c2 == 'i' || c2 == 'I') + { + c2 = phase1_getc (); + if (c2 == 'p' || c2 == 'P') + { + c2 = phase1_getc (); + if (c2 == 't' || c2 == 'T') + { + do + c2 = phase1_getc (); + while (c2 == ' ' || c2 == '\t' + || c2 == '\n' || c2 == '\r'); + if (c2 == '>') + { + skip_html (); + return ' '; + } + } + } + } + } + } + } + } + phase1_ungetc (c2); + } + break; + } + + return c; +} + +static void +phase2_ungetc (c) + int c; +{ + if (c != EOF) + phase2_pushback[phase2_pushback_length++] = c; +} + +#endif + + +/* Accumulating comments. */ + +static char *buffer; +static size_t bufmax; +static size_t buflen; + +static inline void +comment_start () +{ + buflen = 0; +} + +static inline void +comment_add (c) + int c; +{ + if (buflen >= bufmax) + { + bufmax += 100; + buffer = xrealloc (buffer, bufmax); + } + buffer[buflen++] = c; +} + +static inline void +comment_line_end (chars_to_remove) + size_t chars_to_remove; +{ + buflen -= chars_to_remove; + while (buflen >= 1 + && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t')) + --buflen; + if (chars_to_remove == 0 && buflen >= bufmax) + { + bufmax += 100; + buffer = xrealloc (buffer, bufmax); + } + buffer[buflen] = '\0'; + xgettext_comment_add (buffer); +} + + +/* 3. Replace each comment that is not inside a string literal with a + space character. We need to remember the comment for later, because + it may be attached to a keyword string. */ + +/* These are for tracking whether comments count as immediately before + keyword. 
*/ +static int last_comment_line; +static int last_non_comment_line; + +static unsigned char phase3_pushback[1]; +static int phase3_pushback_length; + +static int +phase3_getc () +{ + int lineno; + int c; + + if (phase3_pushback_length) + return phase3_pushback[--phase3_pushback_length]; + + c = phase1_getc (); + + if (c == '#') + { + /* sh comment. */ + bool last_was_qmark; + + comment_start (); + lineno = line_number; + for (;;) + { + c = phase1_getc (); + if (c == '\n' || c == EOF) + { + comment_line_end (0); + break; + } + if (last_was_qmark && c == '>') + { + comment_line_end (1); + skip_html (); + break; + } + /* We skip all leading white space, but not EOLs. */ + if (!(buflen == 0 && (c == ' ' || c == '\t'))) + comment_add (c); + last_was_qmark = (c == '?' || c == '%'); + } + last_comment_line = lineno; + return '\n'; + } + else if (c == '/') + { + c = phase1_getc (); + + switch (c) + { + default: + phase1_ungetc (c); + return '/'; + + case '*': + { + /* C comment. */ + bool last_was_star; + + comment_start (); + lineno = line_number; + last_was_star = false; + for (;;) + { + c = phase1_getc (); + if (c == EOF) + break; + /* We skip all leading white space, but not EOLs. */ + if (buflen == 0 && (c == ' ' || c == '\t')) + continue; + comment_add (c); + switch (c) + { + case '\n': + comment_line_end (1); + comment_start (); + lineno = line_number; + last_was_star = false; + continue; + + case '*': + last_was_star = true; + continue; + + case '/': + if (last_was_star) + { + comment_line_end (2); + break; + } + /* FALLTHROUGH */ + + default: + last_was_star = false; + continue; + } + break; + } + last_comment_line = lineno; + return ' '; + } + + case '/': + { + /* C++ comment. 
*/ + bool last_was_qmark; + + comment_start (); + lineno = line_number; + for (;;) + { + c = phase1_getc (); + if (c == '\n' || c == EOF) + { + comment_line_end (0); + break; + } + if (last_was_qmark && c == '>') + { + comment_line_end (1); + skip_html (); + break; + } + /* We skip all leading white space, but not EOLs. */ + if (!(buflen == 0 && (c == ' ' || c == '\t'))) + comment_add (c); + last_was_qmark = (c == '?' || c == '%'); + } + last_comment_line = lineno; + return '\n'; + } + } + } + else + return c; +} + +static void +phase3_ungetc (c) + int c; +{ + if (c != EOF) + phase3_pushback[phase3_pushback_length++] = c; +} + + +/* Free the memory pointed to by a 'struct token_ty'. */ +static inline void +free_token (tp) + token_ty *tp; +{ + if (tp->type == token_type_string_literal || tp->type == token_type_symbol) + free (tp->string); +} + + +/* 4. Combine characters into tokens. Discard whitespace. */ + +static void +x_php_lex (tp) + token_ty *tp; +{ + static char *buffer; + static int bufmax; + int bufpos; + int c; + + tp->string = NULL; + + for (;;) + { + tp->line_number = line_number; + c = phase3_getc (); + switch (c) + { + case EOF: + tp->type = token_type_eof; + return; + + case '\n': + if (last_non_comment_line > last_comment_line) + xgettext_comment_reset (); + /* FALLTHROUGH */ + case ' ': + case '\t': + case '\r': + /* Ignore whitespace. 
*/ + continue; + } + + last_non_comment_line = tp->line_number; + + switch (c) + { + case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': + case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': + case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U': + case 'V': case 'W': case 'X': case 'Y': case 'Z': + case '_': + case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': + case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': + case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u': + case 'v': case 'w': case 'x': case 'y': case 'z': + bufpos = 0; + for (;;) + { + if (bufpos >= bufmax) + { + bufmax += 100; + buffer = xrealloc (buffer, bufmax); + } + buffer[bufpos++] = c; + c = phase1_getc (); + switch (c) + { + case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': + case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': + case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': + case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': + case 'Y': case 'Z': + case '_': + case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': + case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': + case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': + case 's': case 't': case 'u': case 'v': case 'w': case 'x': + case 'y': case 'z': + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + continue; + + default: + phase1_ungetc (c); + break; + } + break; + } + if (bufpos >= bufmax) + { + bufmax += 100; + buffer = xrealloc (buffer, bufmax); + } + buffer[bufpos] = 0; + tp->string = xstrdup (buffer); + tp->type = token_type_symbol; + return; + + case '\'': + /* Single-quoted string literal. 
*/ + bufpos = 0; + for (;;) + { + c = phase1_getc (); + if (c == EOF || c == '\'') + break; + if (c == '\\') + { + c = phase1_getc (); + if (c != '\\' && c != '\'') + { + phase1_ungetc (c); + c = '\\'; + } + } + if (bufpos >= bufmax) + { + bufmax += 100; + buffer = xrealloc (buffer, bufmax); + } + buffer[bufpos++] = c; + } + if (bufpos >= bufmax) + { + bufmax += 100; + buffer = xrealloc (buffer, bufmax); + } + buffer[bufpos] = 0; + tp->type = token_type_string_literal; + tp->string = xstrdup (buffer); + return; + + case '"': + /* Double-quoted string literal. */ + tp->type = token_type_string_literal; + bufpos = 0; + for (;;) + { + c = phase1_getc (); + if (c == EOF || c == '"') + break; + if (c == '$') + { + c = phase1_getc (); + if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') + || c == '_' || c == '{' || c >= 0x7f) + { + /* String with variables. */ + tp->type = token_type_other; + continue; + } + phase1_ungetc (c); + c = '$'; + } + if (c == '{') + { + c = phase1_getc (); + if (c == '$') + { + /* String with expressions. 
*/ + tp->type = token_type_other; + continue; + } + phase1_ungetc (c); + c = '{'; + } + if (c == '\\') + { + int n, j; + + c = phase1_getc (); + switch (c) + { + case '"': + case '\\': + case '$': + break; + + case '0': case '1': case '2': case '3': + case '4': case '5': case '6': case '7': + n = 0; + for (j = 0; j < 3; ++j) + { + n = n * 8 + c - '0'; + c = phase1_getc (); + switch (c) + { + default: + break; + + case '0': case '1': case '2': case '3': + case '4': case '5': case '6': case '7': + continue; + } + break; + } + phase1_ungetc (c); + c = n; + break; + + case 'x': + n = 0; + for (j = 0; j < 2; ++j) + { + c = phase1_getc (); + switch (c) + { + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + n = n * 16 + c - '0'; + break; + case 'A': case 'B': case 'C': case 'D': case 'E': + case 'F': + n = n * 16 + 10 + c - 'A'; + break; + case 'a': case 'b': case 'c': case 'd': case 'e': + case 'f': + n = n * 16 + 10 + c - 'a'; + break; + default: + phase1_ungetc (c); + c = 0; + break; + } + if (c == 0) + break; + } + if (j == 0) + { + phase1_ungetc ('x'); + c = '\\'; + } + else + c = n; + break; + + case 'n': + c = '\n'; + break; + case 't': + c = '\t'; + break; + case 'r': + c = '\r'; + break; + + default: + phase1_ungetc (c); + c = '\\'; + break; + } + } + if (bufpos >= bufmax) + { + bufmax += 100; + buffer = xrealloc (buffer, bufmax); + } + buffer[bufpos++] = c; + } + if (bufpos >= bufmax) + { + bufmax += 100; + buffer = xrealloc (buffer, bufmax); + } + buffer[bufpos] = 0; + if (tp->type == token_type_string_literal) + tp->string = xstrdup (buffer); + return; + + case '?': + case '%': + { + int c2 = phase1_getc (); + if (c2 == '>') + { + /* ?> and %> terminate PHP mode and switch back to HTML + mode. 
*/ + skip_html (); + } + else + phase1_ungetc (c2); + tp->type = token_type_other; + return; + } + + case '(': + tp->type = token_type_lparen; + return; + + case ')': + tp->type = token_type_rparen; + return; + + case ',': + tp->type = token_type_comma; + return; + + case '<': + { + int c2 = phase1_getc (); + if (c2 == '<') + { + int c3 = phase1_getc (); + if (c3 == '<') + { + /* Start of here document. + Parse whitespace, then label, then newline. */ + do + c = phase3_getc (); + while (c == ' ' || c == '\t' || c == '\n' || c == '\r'); + + bufpos = 0; + do + { + if (bufpos >= bufmax) + { + bufmax += 100; + buffer = xrealloc (buffer, bufmax); + } + buffer[bufpos++] = c; + c = phase3_getc (); + } + while (c != EOF && c != '\n' && c != '\r'); + /* buffer[0..bufpos-1] now contains the label. */ + + /* Now skip the here document. */ + for (;;) + { + c = phase1_getc (); + if (c == EOF) + break; + if (c == '\n' || c == '\r') + { + int bufidx = 0; + + while (bufidx < bufpos) + { + c = phase1_getc (); + if (c == EOF) + break; + if (c != buffer[bufidx]) + { + phase1_ungetc (c); + break; + } + } + c = phase1_getc (); + if (c != ';') + phase1_ungetc (c); + c = phase1_getc (); + if (c == '\n' || c == '\r') + break; + } + } + + /* FIXME: Ideally we should turn the here document into a + string literal if it didn't contain $ substitution. And + we should also respect backslash escape sequences like + in double-quoted strings. */ + tp->type = token_type_other; + return; + } + phase1_ungetc (c3); + } + + /* < / script > terminates PHP mode and switches back to HTML + mode. 
*/ + while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r') + c2 = phase1_getc (); + if (c2 == '/') + { + do + c2 = phase1_getc (); + while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r'); + if (c2 == 's' || c2 == 'S') + { + c2 = phase1_getc (); + if (c2 == 'c' || c2 == 'C') + { + c2 = phase1_getc (); + if (c2 == 'r' || c2 == 'R') + { + c2 = phase1_getc (); + if (c2 == 'i' || c2 == 'I') + { + c2 = phase1_getc (); + if (c2 == 'p' || c2 == 'P') + { + c2 = phase1_getc (); + if (c2 == 't' || c2 == 'T') + { + do + c2 = phase1_getc (); + while (c2 == ' ' || c2 == '\t' + || c2 == '\n' || c2 == '\r'); + if (c2 == '>') + { + skip_html (); + } + else + phase1_ungetc (c2); + } + else + phase1_ungetc (c2); + } + else + phase1_ungetc (c2); + } + else + phase1_ungetc (c2); + } + else + phase1_ungetc (c2); + } + else + phase1_ungetc (c2); + } + else + phase1_ungetc (c2); + } + else + phase1_ungetc (c2); + + tp->type = token_type_other; + return; + } + + case '`': + /* Execution operator. */ + default: + /* We could carefully recognize each of the 2 and 3 character + operators, but it is not necessary, as we only need to recognize + gettext invocations. Don't bother. */ + tp->type = token_type_other; + return; + } + } +} + +/* ========================= Extracting strings. ========================== */ + +/* The file is broken into tokens. Scan the token stream, looking for + a keyword, followed by a left paren, followed by a string. When we + see this sequence, we have something to remember. We assume we are + looking at a valid C or C++ program, and leave the complaints about + the grammar to the compiler. + + Normal handling: Look for + keyword ( ... msgid ... ) + Plural handling: Look for + keyword ( ... msgid ... msgid_plural ... ) + + We use recursion because the arguments before msgid or between msgid + and msgid_plural can contain subexpressions of the same form. */ + + +/* Extract messages until the next balanced closing parenthesis. 
+ Extracted messages are added to MLP. + When a specific argument shall be extracted, COMMAS_TO_SKIP >= 0 and, + if also a plural argument shall be extracted, PLURAL_COMMAS > 0, + otherwise PLURAL_COMMAS = 0. + When no specific argument shall be extracted, COMMAS_TO_SKIP < 0. + Return true upon eof, false upon closing parenthesis. */ +static bool +extract_parenthesized (mlp, commas_to_skip, plural_commas) + message_list_ty *mlp; + int commas_to_skip; + int plural_commas; +{ + /* Remember the message containing the msgid, for msgid_plural. */ + message_ty *plural_mp = NULL; + + /* 0 when no keyword has been seen. 1 right after a keyword is seen. */ + int state; + /* Parameters of the keyword just seen. Defined only in state 1. */ + int next_commas_to_skip = -1; + int next_plural_commas = 0; + + /* Start state is 0. */ + state = 0; + + for (;;) + { + token_ty token; + + x_php_lex (&token); + switch (token.type) + { + case token_type_symbol: + /* No need to bother if we extract all strings anyway. */ + if (!extract_all) + { + void *keyword_value; + + if (find_entry (&keywords, token.string, strlen (token.string), + &keyword_value) + == 0) + { + int argnum1 = (int) (long) keyword_value & ((1 << 10) - 1); + int argnum2 = (int) (long) keyword_value >> 10; + + next_commas_to_skip = argnum1 - 1; + next_plural_commas = (argnum2 > argnum1 ? argnum2 - argnum1 : 0); + state = 1; + } + else + state = 0; + } + free (token.string); + continue; + + case token_type_lparen: + /* No need to recurse if we extract all strings anyway. */ + if (extract_all) + continue; + if (state + ? extract_parenthesized (mlp, next_commas_to_skip, + next_plural_commas) + : extract_parenthesized (mlp, -1, 0)) + return true; + state = 0; + continue; + + case token_type_rparen: + /* No need to return if we extract all strings anyway. */ + if (extract_all) + continue; + return false; + + case token_type_comma: + /* No need to bother if we extract all strings anyway. 
*/ + if (extract_all) + continue; + if (commas_to_skip >= 0) + { + if (commas_to_skip > 0) + commas_to_skip--; + else + if (plural_mp != NULL && plural_commas > 0) + { + commas_to_skip = plural_commas - 1; + plural_commas = 0; + } + else + commas_to_skip = -1; + } + state = 0; + continue; + + case token_type_string_literal: + { + lex_pos_ty pos; + pos.file_name = logical_file_name; + pos.line_number = token.line_number; + + if (extract_all) + remember_a_message (mlp, token.string, &pos); + else + { + if (commas_to_skip == 0) + { + if (plural_mp == NULL) + { + /* Seen an msgid. */ + message_ty *mp = remember_a_message (mlp, token.string, + &pos); + if (plural_commas > 0) + plural_mp = mp; + } + else + { + /* Seen an msgid_plural. */ + remember_a_message_plural (plural_mp, token.string, + &pos); + plural_mp = NULL; + } + } + else + free (token.string); + state = 0; + } + continue; + } + + case token_type_other: + state = 0; + continue; + + case token_type_eof: + return true; + + default: + abort (); + } + } +} + + +void +extract_php (f, real_filename, logical_filename, mdlp) + FILE *f; + const char *real_filename; + const char *logical_filename; + msgdomain_list_ty *mdlp; +{ + message_list_ty *mlp = mdlp->item[0]->messages; + + fp = f; + real_file_name = real_filename; + logical_file_name = xstrdup (logical_filename); + line_number = 1; + + last_comment_line = -1; + last_non_comment_line = -1; + + init_keywords (); + + /* Initial mode is HTML mode, not PHP mode. */ + skip_html (); + + /* Eat tokens until eof is seen. When extract_parenthesized returns + due to an unbalanced closing parenthesis, just restart it. */ + while (!extract_parenthesized (mlp, -1, 0)) + ; + + /* Close scanner. */ + fp = NULL; + real_file_name = NULL; + logical_file_name = NULL; + line_number = 0; +} diff --git a/src/x-php.h b/src/x-php.h new file mode 100644 index 000000000..8583356fc --- /dev/null +++ b/src/x-php.h @@ -0,0 +1,34 @@ +/* xgettext PHP backend. 
+ Copyright (C) 2002 Free Software Foundation, Inc. + Written by Bruno Haible , 2002. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software Foundation, + Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ + + +#define EXTENSIONS_PHP \ + { "php", "PHP" }, \ + { "php3", "PHP" }, \ + { "php4", "PHP" }, \ + +#define SCANNERS_PHP \ + { "PHP", extract_php, &formatstring_php }, \ + +/* Scan a PHP file and add its translatable strings to mdlp. 
*/ +extern void extract_php PARAMS ((FILE *fp, const char *real_filename, + const char *logical_filename, + msgdomain_list_ty *mdlp)); + +extern void x_php_keyword PARAMS ((const char *keyword)); +extern void x_php_extract_all PARAMS ((void)); diff --git a/src/xgettext.c b/src/xgettext.c index 33982848b..92b9dbf50 100644 --- a/src/xgettext.c +++ b/src/xgettext.c @@ -67,6 +67,7 @@ #include "x-awk.h" #include "x-ycp.h" #include "x-tcl.h" +#include "x-php.h" #include "x-rst.h" #include "x-glade.h" @@ -235,6 +236,7 @@ main (argc, argv) x_java_extract_all (); x_awk_extract_all (); x_tcl_extract_all (); + x_php_extract_all (); x_glade_extract_all (); break; case 'c': @@ -293,6 +295,7 @@ main (argc, argv) x_java_keyword (optarg); x_awk_keyword (optarg); x_tcl_keyword (optarg); + x_php_keyword (optarg); x_glade_keyword (optarg); } break; @@ -578,7 +581,7 @@ Choice of input file language:\n\ -L, --language=NAME recognise the specified language\n\ (C, C++, ObjectiveC, PO, Python, Lisp,\n\ EmacsLisp, librep, Java, awk, YCP, Tcl,\n\ - RST, Glade)\n\ + PHP, RST, Glade)\n\ -C, --c++ shorthand for --language=C++\n\ By default the language is guessed depending on the input file name extension.\n\ ")); @@ -1279,6 +1282,7 @@ language_to_extractor (name) SCANNERS_AWK SCANNERS_YCP SCANNERS_TCL + SCANNERS_PHP SCANNERS_RST SCANNERS_GLADE /* Here will follow more languages and their scanners: perl, etc... @@ -1325,6 +1329,7 @@ extension_to_language (extension) EXTENSIONS_AWK EXTENSIONS_YCP EXTENSIONS_TCL + EXTENSIONS_PHP EXTENSIONS_RST EXTENSIONS_GLADE /* Here will follow more file extensions: sh, pl ... */ diff --git a/tests/ChangeLog b/tests/ChangeLog index 4813e3b0a..12a0c5ada 100644 --- a/tests/ChangeLog +++ b/tests/ChangeLog @@ -1,3 +1,14 @@ +2002-08-18 Bruno Haible + + * format-php-1: New file. + * format-php-2: New file. + * lang-php: New file. + * Makefile.am (TESTS): Add them. + +2002-08-06 Bruno Haible + + * gettext-0.11.5 released. 
+ 2002-08-06 Bruno Haible * msgunfmt-2: Fix typo. diff --git a/tests/Makefile.am b/tests/Makefile.am index 925fe30ff..a7ed94a6a 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -52,12 +52,13 @@ TESTS = gettext-1 gettext-2 \ format-java-1 format-java-2 \ format-librep-1 format-librep-2 \ format-lisp-1 format-lisp-2 \ + format-php-1 format-php-2 \ format-python-1 format-python-2 \ format-pascal-1 format-pascal-2 \ format-tcl-1 format-tcl-2 \ format-ycp-1 format-ycp-2 \ plural-1 plural-2 \ - lang-c lang-c++ lang-objc lang-python lang-clisp lang-elisp lang-librep lang-java lang-gawk lang-pascal lang-ycp lang-tcl lang-po lang-rst \ + lang-c lang-c++ lang-objc lang-python lang-clisp lang-elisp lang-librep lang-java lang-gawk lang-pascal lang-ycp lang-tcl lang-php lang-po lang-rst \ rpath-1a rpath-1b \ rpath-2aaa rpath-2aab rpath-2aac rpath-2aad \ rpath-2aba rpath-2abb rpath-2abc rpath-2abd \ diff --git a/tests/format-php-1 b/tests/format-php-1 new file mode 100755 index 000000000..92389aa39 --- /dev/null +++ b/tests/format-php-1 @@ -0,0 +1,116 @@ +#! /bin/sh + +# Test recognition of PHP format strings. 
+
+tmpfiles=""
+trap 'rm -fr $tmpfiles' 1 2 3 15
+
+tmpfiles="$tmpfiles f-ph-1.data"
+cat <<\EOF > f-ph-1.data
+# Valid: no argument
+"abc%%"
+# Valid: one string argument
+"abc%s"
+# Valid: one integer argument
+"abc%b"
+# Valid: one integer argument
+"abc%d"
+# Valid: one integer argument
+"abc%u"
+# Valid: one integer argument
+"abc%o"
+# Valid: one integer argument
+"abc%x"
+# Valid: one integer argument
+"abc%X"
+# Valid: one floating-point argument
+"abc%e"
+# Valid: one floating-point argument
+"abc%f"
+# Valid: one character argument
+"abc%c"
+# Valid: one argument with flags
+"abc%-f"
+# Valid: one argument with padding flags
+"abc%'=f"
+# Valid: one argument with width
+"abc%2f"
+# Valid: one argument with precision
+"abc%.4f"
+# Valid: one argument with width and precision
+"abc%14.4f"
+# Invalid: unterminated
+"abc%"
+# Invalid: unknown format specifier
+"abc%y"
+# Invalid: unknown format specifier
+"abc%F"
+# Invalid: flags after width
+"abc%5-f"
+# Invalid: twice precision
+"abc%.4.2f"
+# Valid: three arguments
+"abc%d%x%x"
+# Valid: a numbered argument
+"abc%1$d"
+# Invalid: zero
+"abc%0$d"
+# Valid: two-digit numbered arguments
+"abc%11$def%10$dgh%9$dij%8$dkl%7$dmn%6$dop%5$dqr%4$dst%3$duv%2$dwx%1$dyz"
+# Invalid: unterminated number
+"abc%1"
+# Invalid: flags before number
+"abc%-1$d"
+# Valid: three arguments, two with same number
+"abc%1$4x,%2$c,%1$u"
+# Invalid: argument with conflicting types
+"abc%1$4x,%2$c,%1$s"
+# Valid: no conflict
+"abc%1$4x,%2$c,%1$u"
+# Valid: mixing of numbered and unnumbered arguments
+"abc%d%2$x"
+# Valid: numbered argument with constant precision
+"abc%1$.9x"
+# Valid: missing non-final argument
+"abc%2$x%3$s"
+# Valid: permutation
+"abc%2$ddef%1$d"
+# Valid: multiple uses of same argument
+"abc%2$xdef%1$sghi%2$x"
+EOF
+
+: ${XGETTEXT=xgettext}
+n=0
+while read comment; do
+  read string
+  n=`expr $n + 1`
+  tmpfiles="$tmpfiles f-ph-1-$n.in f-ph-1-$n.po"
+  echo "<?= gettext(${string}) ?>" | sed -e 's/\$/\\\$/g' > f-ph-1-$n.in
+  ${XGETTEXT} -L PHP -o f-ph-1-$n.po f-ph-1-$n.in || exit 1
+  test -f f-ph-1-$n.po || exit 1
+  fail=
+  if echo "$comment" | grep 'Valid:' > /dev/null; then
+    if grep php-format f-ph-1-$n.po > /dev/null; then
+      :
+    else
+      fail=yes
+    fi
+  else
+    if grep php-format f-ph-1-$n.po > /dev/null; then
+      fail=yes
+    else
+      :
+    fi
+  fi
+  if test -n "$fail"; then
+    echo "Format string recognition error:" 1>&2
+    cat f-ph-1-$n.in 1>&2
+    echo "Got:" 1>&2
+    cat f-ph-1-$n.po 1>&2
+    exit 1
+  fi
+done < f-ph-1.data
+
+rm -fr $tmpfiles
+
+exit 0
diff --git a/tests/format-php-2 b/tests/format-php-2
new file mode 100755
index 000000000..a7b96a12d
--- /dev/null
+++ b/tests/format-php-2
@@ -0,0 +1,123 @@
+#! /bin/sh
+
+# Test checking of PHP format strings.
+
+tmpfiles=""
+trap 'rm -fr $tmpfiles' 1 2 3 15
+
+tmpfiles="$tmpfiles f-ph-2.data"
+cat <<\EOF > f-ph-2.data
+# Valid: %% doesn't count
+msgid  "abc%%def"
+msgstr "xyz"
+# Invalid: invalid msgstr
+msgid  "abc%%def"
+msgstr "xyz%"
+# Valid: same arguments
+msgid  "abc%s%gdef"
+msgstr "xyz%s%g"
+# Valid: same arguments, with different widths
+msgid  "abc%2sdef"
+msgstr "xyz%3s"
+# Valid: same arguments but in numbered syntax
+msgid  "abc%s%gdef"
+msgstr "xyz%1$s%2$g"
+# Valid: permutation
+msgid  "abc%s%g%cdef"
+msgstr "xyz%3$c%2$g%1$s"
+# Invalid: too few arguments
+msgid  "abc%2$udef%1$s"
+msgstr "xyz%1$s"
+# Invalid: too few arguments
+msgid  "abc%sdef%u"
+msgstr "xyz%s"
+# Invalid: too many arguments
+msgid  "abc%udef"
+msgstr "xyz%uvw%c"
+# Valid: same numbered arguments, with different widths
+msgid  "abc%2$5s%1$4s"
+msgstr "xyz%2$4s%1$5s"
+# Invalid: missing argument
+msgid  "abc%2$sdef%1$u"
+msgstr "xyz%1$u"
+# Invalid: missing argument
+msgid  "abc%1$sdef%2$u"
+msgstr "xyz%2$u"
+# Invalid: added argument
+msgid  "abc%1$udef"
+msgstr "xyz%1$uvw%2$c"
+# Valid: type compatibility
+msgid  "abc%b"
+msgstr "xyz%d"
+# Valid: type compatibility
+msgid  "abc%u"
+msgstr "xyz%d"
+# Valid: type compatibility
+msgid  "abc%o"
+msgstr "xyz%d"
+# Valid: type compatibility
+msgid  "abc%x"
+msgstr "xyz%d"
+# Valid: type compatibility
+msgid  "abc%X"
+msgstr "xyz%d"
+# Valid: type compatibility
+msgid  "abc%e"
+msgstr "xyz%f"
+# Invalid: type incompatibility
+msgid  "abc%s"
+msgstr "xyz%d"
+# Invalid: type incompatibility
+msgid  "abc%s"
+msgstr "xyz%e"
+# Invalid: type incompatibility
+msgid  "abc%s"
+msgstr "xyz%c"
+# Invalid: type incompatibility
+msgid  "abc%d"
+msgstr "xyz%e"
+# Invalid: type incompatibility
+msgid  "abc%d"
+msgstr "xyz%c"
+# Invalid: type incompatibility
+msgid  "abc%e"
+msgstr "xyz%c"
+EOF
+
+: ${MSGFMT=msgfmt}
+n=0
+while read comment; do
+  read msgid_line
+  read msgstr_line
+  n=`expr $n + 1`
+  tmpfiles="$tmpfiles f-ph-2-$n.po f-ph-2-$n.mo"
+  cat <<EOF > f-ph-2-$n.po
+#, php-format
+${msgid_line}
+${msgstr_line}
+EOF
+  fail=
+  if echo "$comment" | grep 'Valid:' > /dev/null; then
+    if ${MSGFMT} --check-format -o f-ph-2-$n.mo f-ph-2-$n.po; then
+      :
+    else
+      fail=yes
+    fi
+  else
+    ${MSGFMT} --check-format -o f-ph-2-$n.mo f-ph-2-$n.po 2> /dev/null
+    if test $? = 1; then
+      :
+    else
+      fail=yes
+    fi
+  fi
+  if test -n "$fail"; then
+    echo "Format string checking error:" 1>&2
+    cat f-ph-2-$n.po 1>&2
+    exit 1
+  fi
+done < f-ph-2.data
+
+rm -fr $tmpfiles
+
+exit 0
diff --git a/tests/lang-php b/tests/lang-php
new file mode 100755
index 000000000..3fd8fbe48
--- /dev/null
+++ b/tests/lang-php
@@ -0,0 +1,86 @@
+#! /bin/sh
+
+# Test of gettext facilities in the PHP language.
+# Assumes an fr_FR locale is installed.
+# Assumes the following packages are installed: mod_php4-core.
+
+tmpfiles=""
+trap 'rm -fr $tmpfiles' 1 2 3 15
+
+tmpfiles="$tmpfiles prog.php"
+cat <<\EOF > prog.php
+<?
+  setlocale (LC_ALL, "");
+  textdomain ("prog");
+  bindtextdomain ("prog", ".");
+  echo _("'Your command, please?', asked the waiter.");
+  echo "\n";
+  printf(_("%s is replaced by %s."), "FF", "EUR");
+  echo "\n";
+?>
+EOF
+
+tmpfiles="$tmpfiles prog.pot"
+: ${XGETTEXT=xgettext}
+${XGETTEXT} -o prog.pot --omit-header --no-location prog.php
+
+tmpfiles="$tmpfiles prog.ok"
+cat <<EOF > prog.ok
+msgid "'Your command, please?', asked the waiter."
+msgstr ""
+
+#, php-format
+msgid "%s is replaced by %s."
+msgstr ""
+EOF
+
+: ${DIFF=diff}
+${DIFF} prog.ok prog.pot || exit 1
+
+tmpfiles="$tmpfiles fr.po"
+cat <<\EOF > fr.po
+msgid ""
+msgstr "Content-Type: text/plain; charset=ISO-8859-1\n"
+
+msgid "'Your command, please?', asked the waiter."
+msgstr "«Votre commande, s'il vous plait», dit le garçon."
+
+# Reverse the arguments.
+#, php-format
+msgid "%s is replaced by %s."
+msgstr "%2$s remplace %1$s."
+EOF
+
+tmpfiles="$tmpfiles fr.po.new"
+: ${MSGMERGE=msgmerge}
+${MSGMERGE} -q -o fr.po.new fr.po prog.pot
+
+: ${DIFF=diff}
+${DIFF} fr.po fr.po.new || exit 1
+
+tmpfiles="$tmpfiles fr"
+test -d fr || mkdir fr
+test -d fr/LC_MESSAGES || mkdir fr/LC_MESSAGES
+
+: ${MSGFMT=msgfmt}
+${MSGFMT} -o fr/LC_MESSAGES/prog.mo fr.po
+
+# Test for presence of php version 4.0 or newer.
+case `(php -v) 2>/dev/null` in
+  [4-9].*) ;;
+  *) echo "SKIP: lang-php"; rm -fr $tmpfiles; exit 77;;
+esac
+
+tmpfiles="$tmpfiles prog.ok prog.out"
+: ${DIFF=diff}
+cat <<\EOF > prog.ok
+«Votre commande, s'il vous plait», dit le garçon.
+EUR remplace FF.
+EOF
+
+LANGUAGE= LC_ALL=fr_FR php -q prog.php > prog.out || exit 1
+${DIFF} prog.ok prog.out || exit 1
+
+rm -fr $tmpfiles
+
+exit 0