From: Bruno Haible Date: Wed, 14 Jun 2023 11:56:24 +0000 (+0200) Subject: Avoid the need for charset conversion at run time on musl libc. X-Git-Tag: v0.22~17 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=5412a4f79929004cb6db15d545e07dc953330e8d;p=thirdparty%2Fgettext.git Avoid the need for charset conversion at run time on musl libc. This fixes 1 system-tests failure (intl-2) and 3 tests failures (lang-c, lang-c++, lang-sh). * gettext-tools/src/write-mo.h (no_convert_to_utf8): New declaration. (msgdomain_write_mo): Add a 4th parameter. * gettext-tools/src/write-mo.c: Include po-charset.h, msgl-iconv.h. (no_convert_to_utf8): New variable. (msgdomain_write_mo): Add a 4th parameter. Convert the message list to UTF-8. * gettext-tools/src/msgfmt.c (long_options): Add --no-convert. (main): Handle the --no-convert option. Pass a 4th argument to msgdomain_write_mo. (usage): Document the --no-convert option. * gettext-tools/doc/msgfmt.texi: Document the --no-convert option. * gettext-tools/doc/gettext.texi (MO Files): Document the character encoding of strings in MO files. * gettext-tools/tests/msgfmt-5: Use msgfmt option --no-convert. * gettext-tools/tests/msgfmt-12: Likewise. * gettext-tools/tests/lang-pascal: Likewise. * NEWS: Mention the change. --- diff --git a/NEWS b/NEWS index 4839de097..d5bc0ba43 100644 --- a/NEWS +++ b/NEWS @@ -35,6 +35,13 @@ Version 0.21.2 - June 2023 - Tcl: xgettext now supports the \x, \u, and \U escapes as defined in Tcl 8.6. +* Portability: + - On systems with musl libc, the *gettext() functions in libc now work + with MO files generated from PO files with an encoding other than UTF-8. + To this effect, the msgfmt program now converts the messages to UTF-8 + encoding before storing them in a MO file. You can prevent this by + using the msgfmt --no-convert option. + * xgettext: - The xgettext option '--sorted-output' is now deprecated. 
- xgettext input files of type PO that are not all ASCII and not UTF-8 diff --git a/gettext-tools/doc/gettext.texi b/gettext-tools/doc/gettext.texi index 75f87291c..962dfbaf1 100644 --- a/gettext-tools/doc/gettext.texi +++ b/gettext-tools/doc/gettext.texi @@ -5995,6 +5995,14 @@ translation are all stored consecutively, separated through a @key{NUL} byte. Here also, the length in the string descriptor includes all of them. +@cindex encoding in MO files +The character encoding of the strings can be any standard ASCII-compatible +encoding, such as UTF-8, ISO-8859-1, EUC-JP, etc., as long as the +encoding's name is stated in the header entry (@pxref{Header Entry}). +Starting with GNU @code{gettext} version 0.22, the MO files produced by +@code{msgfmt} have them in UTF-8 encoding, unless the @code{msgfmt} +option @samp{--no-convert} is used. + Nothing prevents a MO file from having embedded @key{NUL}s in strings. However, the program interface currently used already presumes that strings are @key{NUL} terminated, so embedded @key{NUL}s are diff --git a/gettext-tools/doc/msgfmt.texi b/gettext-tools/doc/msgfmt.texi index b5fd97f13..5c637ca46 100644 --- a/gettext-tools/doc/msgfmt.texi +++ b/gettext-tools/doc/msgfmt.texi @@ -1,5 +1,5 @@ @c This file is part of the GNU gettext manual. -@c Copyright (C) 1995-2019 Free Software Foundation, Inc. +@c Copyright (C) 1995-2023 Free Software Foundation, Inc. @c See the file gettext.texi for copying conditions. @pindex msgfmt @@ -375,6 +375,13 @@ a human translator. @subsection Output details @table @samp +@item --no-convert +@opindex --no-convert@r{, @code{msgfmt} option} +Don't convert the messages to UTF-8 encoding. By default, messages are +converted to UTF-8 encoding before being stored in a MO file; this helps +avoid conversions at run time, since nowadays most locales use the +UTF-8 encoding. 
+ @item -a @var{number} @itemx --alignment=@var{number} @opindex -a@r{, @code{msgfmt} option} diff --git a/gettext-tools/src/msgfmt.c b/gettext-tools/src/msgfmt.c index c5a880f42..d93ba5250 100644 --- a/gettext-tools/src/msgfmt.c +++ b/gettext-tools/src/msgfmt.c @@ -195,6 +195,7 @@ static const struct option long_options[] = { "keyword", optional_argument, NULL, 'k' }, { "language", required_argument, NULL, 'L' }, { "locale", required_argument, NULL, 'l' }, + { "no-convert", no_argument, NULL, CHAR_MAX + 17 }, { "no-hash", no_argument, NULL, CHAR_MAX + 6 }, { "output-file", required_argument, NULL, 'o' }, { "properties-input", no_argument, NULL, 'P' }, @@ -422,6 +423,9 @@ main (int argc, char *argv[]) desktop_template_name = optarg; xml_template_name = optarg; break; + case CHAR_MAX + 17: /* --no-convert */ + no_convert_to_utf8 = true; + break; default: usage (EXIT_FAILURE); break; @@ -771,6 +775,20 @@ There is NO WARRANTY, to the extent permitted by law.\n\ } } + /* Compose the input file name(s). + This is used for statistics and error messages. */ + char *all_input_file_names; + { + string_list_ty input_file_names; + + string_list_init (&input_file_names);; + for (arg_i = optind; arg_i < argc; arg_i++) + string_list_append (&input_file_names, argv[arg_i]); + all_input_file_names = + string_list_join (&input_file_names, ", ", '\0', false); + string_list_destroy (&input_file_names); + } + /* Now write out all domains. */ for (domain = domain_list; domain != NULL; domain = domain->next) { @@ -829,7 +847,7 @@ There is NO WARRANTY, to the extent permitted by law.\n\ else { if (msgdomain_write_mo (domain->mlp, domain->domain_name, - domain->file_name)) + domain->file_name, all_input_file_names)) exit_status = EXIT_FAILURE; } @@ -843,23 +861,9 @@ There is NO WARRANTY, to the extent permitted by law.\n\ if (do_statistics + verbose >= 2 && optind < argc) { /* Print the input file name(s) in front of the statistics line. 
*/ - char *all_input_file_names; - - { - string_list_ty input_file_names; - - string_list_init (&input_file_names);; - for (arg_i = optind; arg_i < argc; arg_i++) - string_list_append (&input_file_names, argv[arg_i]); - all_input_file_names = - string_list_join (&input_file_names, ", ", '\0', false); - string_list_destroy (&input_file_names); - } - /* TRANSLATORS: The prefix before a statistics message. The argument is a file name or a comma separated list of file names. */ fprintf (stderr, _("%s: "), all_input_file_names); - free (all_input_file_names); } fprintf (stderr, ngettext ("%d translated message", "%d translated messages", @@ -1045,6 +1049,8 @@ Input file interpretation:\n")); printf (_("\ Output details:\n")); printf (_("\ + --no-convert don't convert the messages to UTF-8 encoding\n")); + printf (_("\ -a, --alignment=NUMBER align strings to NUMBER bytes (default: %d)\n"), DEFAULT_OUTPUT_ALIGNMENT); printf (_("\ --endianness=BYTEORDER write out 32-bit numbers in the given byte order\n\ diff --git a/gettext-tools/src/write-mo.c b/gettext-tools/src/write-mo.c index 5d7cc611f..d94f1863c 100644 --- a/gettext-tools/src/write-mo.c +++ b/gettext-tools/src/write-mo.c @@ -45,6 +45,8 @@ #include "xsize.h" #include "xalloc.h" #include "xmalloca.h" +#include "po-charset.h" +#include "msgl-iconv.h" #include "msgl-header.h" #include "binary-io.h" #include "supersede.h" @@ -66,6 +68,9 @@ #endif /* roundup */ +/* True if no conversion to UTF-8 is desired. */ +bool no_convert_to_utf8; + /* Alignment of strings in resulting .mo file. */ size_t alignment; @@ -828,11 +833,23 @@ write_table (FILE *output_file, message_list_ty *mlp) int msgdomain_write_mo (message_list_ty *mlp, const char *domain_name, - const char *file_name) + const char *file_name, + const char *input_file) { /* If no entry for this domain don't even create the file. */ if (mlp->nitems != 0) { + if (!no_convert_to_utf8) + { + /* Convert the messages to UTF-8. 
+ This is necessary because the *gettext functions in musl libc + assume that both the locale encoding and the .mo encoding are UTF-8. + It is also helpful for performance on glibc systems, since most + locales nowadays have UTF-8 as locale encoding, whereas some PO + files still are encoded in EUC-JP or so. */ + iconv_message_list (mlp, NULL, po_charset_utf8, input_file); + } + /* Support for "reproducible builds": Delete information that may vary between builds in the same conditions. */ message_list_delete_header_field (mlp, "POT-Creation-Date:"); diff --git a/gettext-tools/src/write-mo.h b/gettext-tools/src/write-mo.h index 608386f10..6e0e72b8d 100644 --- a/gettext-tools/src/write-mo.h +++ b/gettext-tools/src/write-mo.h @@ -1,6 +1,5 @@ /* Writing binary .mo files. - Copyright (C) 1995-1998, 2000-2003, 2005-2006 Free Software - Foundation, Inc. + Copyright (C) 1995-1998, 2000-2003, 2005-2006, 2023 Free Software Foundation, Inc. Written by Ulrich Drepper , April 1995. This program is free software: you can redistribute it and/or modify @@ -24,6 +23,9 @@ #include "message.h" +/* True if no conversion to UTF-8 is desired. */ +extern bool no_convert_to_utf8; + /* Alignment of strings in resulting .mo file. */ extern size_t alignment; @@ -35,10 +37,12 @@ extern bool no_hash_table; /* Write a GNU mo file. mlp is a list containing the messages to be output. domain_name is the domain name, file_name is the desired file name. + input_file is the name of the input file. Return 0 if ok, nonzero on error. 
*/ extern int msgdomain_write_mo (message_list_ty *mlp, const char *domain_name, - const char *file_name); + const char *file_name, + const char *input_file); #endif /* _WRITE_MO_H */ diff --git a/gettext-tools/tests/lang-pascal b/gettext-tools/tests/lang-pascal index 23c71ea1b..69e687557 100755 --- a/gettext-tools/tests/lang-pascal +++ b/gettext-tools/tests/lang-pascal @@ -83,7 +83,7 @@ test -d fr || mkdir fr test -d fr/LC_MESSAGES || mkdir fr/LC_MESSAGES : ${MSGFMT=msgfmt} -${MSGFMT} -o fr/LC_MESSAGES/pascalprog.mo fr.po +${MSGFMT} --no-convert -o fr/LC_MESSAGES/pascalprog.mo fr.po : ${DIFF=diff} cat <<\EOF > pascalprog.ok diff --git a/gettext-tools/tests/msgfmt-12 b/gettext-tools/tests/msgfmt-12 index 19f938337..95beb0906 100755 --- a/gettext-tools/tests/msgfmt-12 +++ b/gettext-tools/tests/msgfmt-12 @@ -20,7 +20,7 @@ msgstr " Fehler" EOF : ${MSGFMT=msgfmt} -${MSGFMT} -o mf-12.mo mf-12.po || Exit 1 +${MSGFMT} --no-convert -o mf-12.mo mf-12.po || Exit 1 : ${MSGUNFMT=msgunfmt} ${MSGUNFMT} -o mf-12.tmp mf-12.mo || Exit 1 diff --git a/gettext-tools/tests/msgfmt-5 b/gettext-tools/tests/msgfmt-5 index 2820ebcd8..94eeac52e 100755 --- a/gettext-tools/tests/msgfmt-5 +++ b/gettext-tools/tests/msgfmt-5 @@ -30,7 +30,7 @@ msgstr "A bient EOF : ${MSGFMT=msgfmt} -${MSGFMT} foo.po || Exit 1 +${MSGFMT} --no-convert foo.po || Exit 1 : ${MSGUNFMT=msgunfmt} ${MSGUNFMT} -o foo-de.tmp foo-de.mo || Exit 1