From: Bruno Haible Date: Sun, 13 Oct 2024 16:47:47 +0000 (+0200) Subject: msgmerge, msginit: Reject POT files with non-ASCII characters, except in UTF-8. X-Git-Tag: v0.23~61 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=bd94293cc8ee0ba818cff4209ab4c4466c25189a;p=thirdparty%2Fgettext.git msgmerge, msginit: Reject POT files with non-ASCII characters, except in UTF-8. Reported by Martin Quinson at . * gettext-tools/src/msgl-charset.h (check_pot_charset): New declaration. * gettext-tools/src/msgl-charset.c: Include msgl-ascii.h. (check_pot_charset): New function. * gettext-tools/src/msginit.c: Include msgl-charset.h. (main): Invoke check_pot_charset. * gettext-tools/src/msgmerge.c: Include msgl-charset.h. (merge): Invoke check_pot_charset. * gettext-tools/src/FILES: Update. * gettext-tools/tests/testdata/nonascii.pot: New file. * gettext-tools/tests/msginit-5: New file. * gettext-tools/tests/msgmerge-charset-4: New file. * gettext-tools/tests/Makefile.am (TESTS): Add them. (EXTRA_DIST): Add testdata/nonascii.pot. --- diff --git a/gettext-tools/src/FILES b/gettext-tools/src/FILES index dfdbe83b3..9e696772d 100644 --- a/gettext-tools/src/FILES +++ b/gettext-tools/src/FILES @@ -134,6 +134,10 @@ msgl-cat.c msgcmp.c Main source for the 'msgcmp' program. +msgl-charset.h +msgl-charset.c + Checking the encoding of a list-of-messages. + +-------------- The 'msgmerge' program | msgl-equal.h | msgl-equal.c @@ -156,11 +160,6 @@ msgcat.c Main source for the 'msgcat' program. msgconv.c Main source for the 'msgconv' program. msguniq.c Main source for the 'msguniq' program. -msgl-charset.h -msgl-charset.c - Compare the encoding of a list-of-messages with the locale - encoding. - msgexec.c Main source for the 'msgexec' program. msgfilter.c Main source for the 'msgfilter' program. msggrep.c Main source for the 'msggrep' program. 
diff --git a/gettext-tools/src/msginit.c b/gettext-tools/src/msginit.c index 5a66f7fda..02adc5f63 100644 --- a/gettext-tools/src/msginit.c +++ b/gettext-tools/src/msginit.c @@ -63,6 +63,7 @@ #include "write-po.h" #include "write-properties.h" #include "write-stringtable.h" +#include "msgl-charset.h" #include "xerror-handler.h" #include "po-charset.h" #include "localcharset.h" @@ -327,6 +328,7 @@ the output .po file through the --output-file option.\n"), /* Read input file. */ result = read_catalog_file (input_file, input_syntax); + check_pot_charset (result, input_file); #if defined _WIN32 || defined __CYGWIN__ /* The function fill_header invokes, directly or indirectly, some programs diff --git a/gettext-tools/src/msgl-charset.c b/gettext-tools/src/msgl-charset.c index cbbe5b474..dcc8cea80 100644 --- a/gettext-tools/src/msgl-charset.c +++ b/gettext-tools/src/msgl-charset.c @@ -29,6 +29,7 @@ #include #include +#include "msgl-ascii.h" #include "po-charset.h" #include "localcharset.h" #include "progname.h" @@ -42,6 +43,66 @@ #define _(str) gettext (str) +/* Check whether the POT file's encoding is ASCII or UTF-8. Otherwise + fail with an error. + Rationale: A POT file is routinely copied by a translator to a PO file. + If a POT file contains non-ASCII messages (or comments) in an encoding + other than UTF-8, the translator will most likely encounter trouble adding + her own translations in the same encoding. A translator should not have + to convert the POT file to UTF-8 first; instead, the POT file should + already be prepared ready-to-use. 
*/ +void +check_pot_charset (const msgdomain_list_ty *mdlp, const char *filename) +{ + size_t j, k; + + for (k = 0; k < mdlp->nitems; k++) + { + const message_list_ty *mlp = mdlp->item[k]->messages; + + for (j = 0; j < mlp->nitems; j++) + if (is_header (mlp->item[j]) && !mlp->item[j]->obsolete) + { + const char *header = mlp->item[j]->msgstr; + + if (header != NULL) + { + const char *charsetstr = c_strstr (header, "charset="); + + if (charsetstr != NULL) + { + size_t len; + char *charset; + + charsetstr += strlen ("charset="); + len = strcspn (charsetstr, " \t\n"); + charset = (char *) xmalloca (len + 1); + memcpy (charset, charsetstr, len); + charset[len] = '\0'; + + const char *canon_charset = po_charset_canonicalize (charset); + + /* "CHARSET" is often used as a placeholder, equivalent + to "any" or "ASCII". */ + if (!(strcmp (charset, "CHARSET") == 0) + && canon_charset == NULL) + error (EXIT_FAILURE, 0, + _("%s: The present charset \"%s\" is not a portable encoding name."), + filename, charset); + if (!is_ascii_message_list (mlp) + && !(canon_charset == po_charset_ascii + || canon_charset == po_charset_utf8)) + error (EXIT_FAILURE, 0, + _("%s: The file contains non-ASCII characters but the present charset \"%s\" is not %s or %s."), + filename, charset, "ASCII", "UTF-8"); + + freea (charset); + } + } + } + } +} + void compare_po_locale_charsets (const msgdomain_list_ty *mdlp) { diff --git a/gettext-tools/src/msgl-charset.h b/gettext-tools/src/msgl-charset.h index 38c994c14..0c7189202 100644 --- a/gettext-tools/src/msgl-charset.h +++ b/gettext-tools/src/msgl-charset.h @@ -1,5 +1,5 @@ /* Message list charset and locale charset handling. - Copyright (C) 2001-2003 Free Software Foundation, Inc. + Copyright (C) 2001-2024 Free Software Foundation, Inc. Written by Bruno Haible , 2001. 
This program is free software: you can redistribute it and/or modify @@ -26,6 +26,9 @@ extern "C" { #endif +extern void + check_pot_charset (const msgdomain_list_ty *mdlp, const char *filename); + extern void compare_po_locale_charsets (const msgdomain_list_ty *mdlp); diff --git a/gettext-tools/src/msgmerge.c b/gettext-tools/src/msgmerge.c index 0e6d008cd..ea03b79ed 100644 --- a/gettext-tools/src/msgmerge.c +++ b/gettext-tools/src/msgmerge.c @@ -50,6 +50,7 @@ #include "write-po.h" #include "write-properties.h" #include "write-stringtable.h" +#include "msgl-charset.h" #include "format.h" #include "xalloc.h" #include "xmalloca.h" @@ -1798,6 +1799,7 @@ merge (const char *fn1, const char *fn2, catalog_input_format_ty input_syntax, /* This is the references file, created by groping the sources with the xgettext program. */ ref = read_catalog_file (fn2, input_syntax); + check_pot_charset (ref, fn2); /* Add a dummy header entry, if the references file contains none. */ for (k = 0; k < ref->nitems; k++) if (message_list_search (ref->item[k]->messages, NULL, "") == NULL) diff --git a/gettext-tools/tests/Makefile.am b/gettext-tools/tests/Makefile.am index 34ba7b78f..b782aaa95 100644 --- a/gettext-tools/tests/Makefile.am +++ b/gettext-tools/tests/Makefile.am @@ -60,11 +60,12 @@ TESTS = gettext-1 gettext-2 \ msgfmt-xml-1 msgfmt-xml-2 msgfmt-xml-3 msgfmt-xml-4 msgfmt-xml-5 \ msggrep-1 msggrep-2 msggrep-3 msggrep-4 msggrep-5 msggrep-6 msggrep-7 \ msggrep-8 msggrep-9 msggrep-10 msggrep-11 \ - msginit-1 msginit-2 msginit-3 msginit-4 \ + msginit-1 msginit-2 msginit-3 msginit-4 msginit-5 \ msgmerge-1 msgmerge-2 msgmerge-3 msgmerge-4 msgmerge-5 msgmerge-6 \ msgmerge-7 msgmerge-8 msgmerge-9 msgmerge-10 msgmerge-11 msgmerge-12 \ msgmerge-13 msgmerge-14 msgmerge-15 \ msgmerge-charset-1 msgmerge-charset-2 msgmerge-charset-3 \ + msgmerge-charset-4 \ msgmerge-compendium-1 msgmerge-compendium-2 msgmerge-compendium-3 \ msgmerge-compendium-4 msgmerge-compendium-5 msgmerge-compendium-6 \ 
msgmerge-domain-1 msgmerge-domain-2 \ @@ -259,6 +260,7 @@ EXTRA_DIST += init.sh init.cfg $(TESTS) \ xgettext-1 \ xgettext-c-1 xg-c-comment-6.c xg-c-escape-3.c xg-vala-2.vala \ common/supplemental/plurals.xml \ + testdata/nonascii.pot \ testdata/tcltest_pl.po testdata/tcltest_pl.msg \ testdata/tcltest_cs.po testdata/tcltest_cs.msg \ testdata/xg-el-so-3.el testdata/xg-el-so-4.el \ diff --git a/gettext-tools/tests/msginit-5 b/gettext-tools/tests/msginit-5 new file mode 100755 index 000000000..2495bb3b4 --- /dev/null +++ b/gettext-tools/tests/msginit-5 @@ -0,0 +1,8 @@ +#! /bin/sh +. "${srcdir=.}/init.sh"; path_prepend_ . ../src + +# Test validation of POT file with non-ASCII messages. + +: ${MSGINIT=msginit} +${MSGINIT} -i "$wabs_srcdir"/testdata/nonascii.pot -l fr --no-translator -o mi-test5.tmp +test $? = 1 || Exit 1 diff --git a/gettext-tools/tests/msgmerge-charset-4 b/gettext-tools/tests/msgmerge-charset-4 new file mode 100755 index 000000000..f8e2baa9e --- /dev/null +++ b/gettext-tools/tests/msgmerge-charset-4 @@ -0,0 +1,31 @@ +#! /bin/sh +. "${srcdir=.}/init.sh"; path_prepend_ . ../src + +# Test validation of POT file with non-ASCII messages. + +cat <<\EOF > mm-ch-4.po +# SOME DESCRIPTIVE TITLE. +# Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER +# This file is distributed under the same license as the PACKAGE package. +# Bruno Haible , 2024. +# +msgid "" +msgstr "" +"Project-Id-Version: PACKAGE VERSION\n" +"Report-Msgid-Bugs-To: \n" +"PO-Revision-Date: 2024-10-13 18:36+0200\n" +"Last-Translator: Bruno Haible \n" +"Language-Team: French \n" +"Language: fr\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=UTF-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Plural-Forms: nplurals=2; plural=(n > 1);\n" + +msgid "You can find me résumé at <%s>." +msgstr "Vous trouvez mon CV sous <%s>." +EOF + +: ${MSGMERGE=msgmerge} +${MSGMERGE} -q -o mm-ch-4.tmp.po mm-ch-4.po "$wabs_srcdir"/testdata/nonascii.pot +test $? 
= 1 || Exit 1 diff --git a/gettext-tools/tests/testdata/nonascii.pot b/gettext-tools/tests/testdata/nonascii.pot new file mode 100644 index 000000000..900dec788 --- /dev/null +++ b/gettext-tools/tests/testdata/nonascii.pot @@ -0,0 +1,20 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER +# This file is distributed under the same license as the PACKAGE package. +# FIRST AUTHOR , YEAR. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: PACKAGE VERSION\n" +"Report-Msgid-Bugs-To: \n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: LANGUAGE \n" +"Language: \n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=ISO-8859-1\n" +"Content-Transfer-Encoding: 8bit\n" + +msgid "You can find me résumé at <%s>." +msgstr ""