From 6934eec2d1dd16eeae032f0a46da8309d329fe41 Mon Sep 17 00:00:00 2001 From: Bruno Haible Date: Sun, 4 Apr 2021 18:26:57 +0200 Subject: [PATCH] msgconv: Handle protected file names with spaces. * gettext-tools/src/po-charset.h (po_lex_isolate_start, po_lex_isolate_end): New declarations. * gettext-tools/src/po-charset.c (po_lex_isolate_start, po_lex_isolate_end): New variables. (po_lex_charset_init, po_lex_charset_set, po_lex_charset_close): Initialize them. * gettext-tools/woe32dll/gettextsrc-exports.c: Export also po_lex_isolate_start, po_lex_isolate_end. * gettext-tools/src/read-catalog-abstract.c: Include <stdbool.h>, po-charset.h. (po_parse_comment_filepos): Parse the syntax of file names surrounded by control characters. * gettext-tools/src/msgl-iconv.c: Include msgl-ofn.h. (iconv_msgdomain_list): Signal error if the target encoding does not contain the control characters needed for escaping file names with spaces. * gettext-tools/tests/msgconv-8: New file. * gettext-tools/tests/Makefile.am (TESTS): Add it. --- gettext-tools/src/msgl-iconv.c | 12 ++- gettext-tools/src/po-charset.c | 31 +++++- gettext-tools/src/po-charset.h | 12 ++- gettext-tools/src/read-catalog-abstract.c | 103 +++++++++++++------- gettext-tools/tests/Makefile.am | 1 + gettext-tools/tests/msgconv-8 | 71 ++++++++++++++ gettext-tools/woe32dll/gettextsrc-exports.c | 4 +- 7 files changed, 195 insertions(+), 39 deletions(-) create mode 100755 gettext-tools/tests/msgconv-8 diff --git a/gettext-tools/src/msgl-iconv.c b/gettext-tools/src/msgl-iconv.c index 0b5ed107e..6bafd9506 100644 --- a/gettext-tools/src/msgl-iconv.c +++ b/gettext-tools/src/msgl-iconv.c @@ -1,5 +1,5 @@ /* Message list charset and locale charset handling. - Copyright (C) 2001-2003, 2005-2009, 2019-2020 Free Software Foundation, Inc. + Copyright (C) 2001-2003, 2005-2009, 2019-2021 Free Software Foundation, Inc. Written by Bruno Haible , 2001. 
This program is free software: you can redistribute it and/or modify @@ -40,6 +40,7 @@ #include "xstriconv.h" #include "xstriconveh.h" #include "msgl-ascii.h" +#include "msgl-ofn.h" #include "xalloc.h" #include "xmalloca.h" #include "c-strstr.h" @@ -363,6 +364,15 @@ iconv_msgdomain_list (msgdomain_list_ty *mdlp, xasprintf (_("target charset \"%s\" is not a portable encoding name."), to_code)); + /* Test whether the control characters required for escaping file names with + spaces are present in the target encoding. */ + if (msgdomain_list_has_filenames_with_spaces (mdlp) + && !(canon_to_code == po_charset_utf8 + || strcmp (canon_to_code, "GB18030") == 0)) + po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false, + xasprintf (_("Cannot write the control characters that protect file names with spaces in the %s encoding"), + canon_to_code)); + for (k = 0; k < mdlp->nitems; k++) iconv_message_list_internal (mdlp->item[k]->messages, mdlp->encoding, canon_to_code, update_header, diff --git a/gettext-tools/src/po-charset.c b/gettext-tools/src/po-charset.c index 2e92c69e2..db95cf040 100644 --- a/gettext-tools/src/po-charset.c +++ b/gettext-tools/src/po-charset.c @@ -1,5 +1,5 @@ /* Charset handling while reading PO files. - Copyright (C) 2001-2007, 2010, 2019-2020 Free Software Foundation, Inc. + Copyright (C) 2001-2007, 2010, 2019-2021 Free Software Foundation, Inc. Written by Bruno Haible , 2001. This program is free software: you can redistribute it and/or modify @@ -436,6 +436,13 @@ po_charset_character_iterator (const char *canon_charset) /* The PO file's encoding, as specified in the header entry. */ const char *po_lex_charset; +/* Representation of U+2068 FIRST STRONG ISOLATE (FSI) in the PO file's + encoding, or NULL if not available. */ +const char *po_lex_isolate_start; +/* Representation of U+2069 POP DIRECTIONAL ISOLATE (PDI) in the PO file's + encoding, or NULL if not available. 
*/ +const char *po_lex_isolate_end; + #if HAVE_ICONV /* Converter from the PO file's encoding to UTF-8. */ iconv_t po_lex_iconv; @@ -448,6 +455,8 @@ void po_lex_charset_init () { po_lex_charset = NULL; + po_lex_isolate_start = NULL; + po_lex_isolate_end = NULL; #if HAVE_ICONV po_lex_iconv = (iconv_t)(-1); #endif @@ -503,6 +512,24 @@ Message conversion to user's charset might not work.\n"), const char *envval; po_lex_charset = canon_charset; + + if (strcmp (canon_charset, "UTF-8") == 0) + { + po_lex_isolate_start = "\xE2\x81\xA8"; + po_lex_isolate_end = "\xE2\x81\xA9"; + } + else if (strcmp (canon_charset, "GB18030") == 0) + { + po_lex_isolate_start = "\x81\x36\xAC\x34"; + po_lex_isolate_end = "\x81\x36\xAC\x35"; + } + else + { + /* The other encodings don't contain U+2068, U+2069. */ + po_lex_isolate_start = NULL; + po_lex_isolate_end = NULL; + } + #if HAVE_ICONV if (po_lex_iconv != (iconv_t)(-1)) iconv_close (po_lex_iconv); @@ -666,6 +693,8 @@ void po_lex_charset_close () { po_lex_charset = NULL; + po_lex_isolate_start = NULL; + po_lex_isolate_end = NULL; #if HAVE_ICONV if (po_lex_iconv != (iconv_t)(-1)) { diff --git a/gettext-tools/src/po-charset.h b/gettext-tools/src/po-charset.h index 75769fc92..0ab49bd6d 100644 --- a/gettext-tools/src/po-charset.h +++ b/gettext-tools/src/po-charset.h @@ -1,5 +1,5 @@ /* Charset handling while reading PO files. - Copyright (C) 2001-2003, 2006 Free Software Foundation, Inc. + Copyright (C) 2001-2003, 2006, 2021 Free Software Foundation, Inc. Written by Bruno Haible , 2001. This program is free software: you can redistribute it and/or modify @@ -33,7 +33,8 @@ extern "C" { /* Canonicalize an encoding name. The results of this function are statically allocated and can be - compared using ==. */ + compared using ==. + Return NULL if CHARSET is not a valid encoding name. */ extern const char *po_charset_canonicalize (const char *charset); /* The canonicalized encoding name for ASCII. 
*/ @@ -66,6 +67,13 @@ extern character_iterator_t po_charset_character_iterator (const char *canon_cha /* The PO file's encoding, as specified in the header entry. */ extern DLL_VARIABLE const char *po_lex_charset; +/* Representation of U+2068 FIRST STRONG ISOLATE (FSI) in the PO file's + encoding, or NULL if not available. */ +extern DLL_VARIABLE const char *po_lex_isolate_start; +/* Representation of U+2069 POP DIRECTIONAL ISOLATE (PDI) in the PO file's + encoding, or NULL if not available. */ +extern DLL_VARIABLE const char *po_lex_isolate_end; + #if HAVE_ICONV /* Converter from the PO file's encoding to UTF-8. */ extern DLL_VARIABLE iconv_t po_lex_iconv; diff --git a/gettext-tools/src/read-catalog-abstract.c b/gettext-tools/src/read-catalog-abstract.c index 05436c6ba..aa17eddf8 100644 --- a/gettext-tools/src/read-catalog-abstract.c +++ b/gettext-tools/src/read-catalog-abstract.c @@ -1,5 +1,5 @@ /* Reading PO files, abstract class. - Copyright (C) 1995-1996, 1998, 2000-2009, 2013, 2015 Free Software + Copyright (C) 1995-1996, 1998, 2000-2009, 2013, 2015, 2021 Free Software Foundation, Inc. This file was written by Peter Miller @@ -26,9 +26,11 @@ #include "read-catalog-abstract.h" #include +#include #include #include +#include "po-charset.h" #include "xalloc.h" #include "xvasprintf.h" #include "po-xerror.h" @@ -453,6 +455,10 @@ po_parse_comment_special (const char *s, STRING The latter style, without line number, occurs in PO files converted e.g. from Pascal .rst files or from OpenOffice resource files. + The STRING is either + FILENAME + or + U+2068 FILENAME U+2069. Call po_callback_comment_filepos for each of them. 
*/ static void po_parse_comment_filepos (const char *s) @@ -463,11 +469,41 @@ po_parse_comment_filepos (const char *s) s++; if (*s != '\0') { - const char *string_start = s; + bool isolated_filename = + (po_lex_isolate_start != NULL + && strncmp (s, po_lex_isolate_start, + strlen (po_lex_isolate_start)) == 0); + if (isolated_filename) + s += strlen (po_lex_isolate_start); - do - s++; - while (!(*s == '\0' || *s == ' ' || *s == '\t' || *s == '\n')); + const char *filename_start = s; + const char *filename_end; + + if (isolated_filename) + { + for (;; s++) + { + if (*s == '\0' || *s == '\n') + { + filename_end = s; + break; + } + if (strncmp (s, po_lex_isolate_end, + strlen (po_lex_isolate_end)) == 0) + { + filename_end = s; + s += strlen (po_lex_isolate_end); + break; + } + } + } + else + { + do + s++; + while (!(*s == '\0' || *s == ' ' || *s == '\t' || *s == '\n')); + filename_end = s; + } /* See if there is a COLON and NUMBER after the STRING, separated through optional spaces. */ @@ -499,16 +535,15 @@ po_parse_comment_filepos (const char *s) if (*p == '\0' || *p == ' ' || *p == '\t' || *p == '\n') { /* Parsed a GNU style file comment with spaces. */ - const char *string_end = s; - size_t string_length = string_end - string_start; - char *string = XNMALLOC (string_length + 1, char); + size_t filename_length = filename_end - filename_start; + char *filename = XNMALLOC (filename_length + 1, char); - memcpy (string, string_start, string_length); - string[string_length] = '\0'; + memcpy (filename, filename_start, filename_length); + filename[filename_length] = '\0'; - po_callback_comment_filepos (string, n); + po_callback_comment_filepos (filename, n); - free (string); + free (filename); s = p; continue; @@ -541,16 +576,16 @@ po_parse_comment_filepos (const char *s) if (*p == '\0' || *p == ' ' || *p == '\t' || *p == '\n') { /* Parsed a GNU style file comment with spaces. 
*/ - const char *string_end = s - 1; - size_t string_length = string_end - string_start; - char *string = XNMALLOC (string_length + 1, char); + filename_end = s - 1; + size_t filename_length = filename_end - filename_start; + char *filename = XNMALLOC (filename_length + 1, char); - memcpy (string, string_start, string_length); - string[string_length] = '\0'; + memcpy (filename, filename_start, filename_length); + filename[filename_length] = '\0'; - po_callback_comment_filepos (string, n); + po_callback_comment_filepos (filename, n); - free (string); + free (filename); s = p; continue; @@ -563,7 +598,7 @@ po_parse_comment_filepos (const char *s) { const char *p = s; - while (p > string_start) + while (p > filename_start) { p--; if (!(*p >= '0' && *p <= '9')) @@ -577,7 +612,7 @@ po_parse_comment_filepos (const char *s) at the end of STRING. */ if (p < s - && p > string_start + 1 + && p > filename_start + 1 && p[-1] == ':') { /* Parsed a GNU style file comment without spaces. */ @@ -595,15 +630,16 @@ po_parse_comment_filepos (const char *s) while (p < s); { - size_t string_length = string_end - string_start; - char *string = XNMALLOC (string_length + 1, char); + filename_end = string_end; + size_t filename_length = filename_end - filename_start; + char *filename = XNMALLOC (filename_length + 1, char); - memcpy (string, string_start, string_length); - string[string_length] = '\0'; + memcpy (filename, filename_start, filename_length); + filename[filename_length] = '\0'; - po_callback_comment_filepos (string, n); + po_callback_comment_filepos (filename, n); - free (string); + free (filename); continue; } @@ -613,16 +649,15 @@ po_parse_comment_filepos (const char *s) /* Parsed a file comment without line number. 
*/ { - const char *string_end = s; - size_t string_length = string_end - string_start; - char *string = XNMALLOC (string_length + 1, char); + size_t filename_length = filename_end - filename_start; + char *filename = XNMALLOC (filename_length + 1, char); - memcpy (string, string_start, string_length); - string[string_length] = '\0'; + memcpy (filename, filename_start, filename_length); + filename[filename_length] = '\0'; - po_callback_comment_filepos (string, (size_t)(-1)); + po_callback_comment_filepos (filename, (size_t)(-1)); - free (string); + free (filename); } } } diff --git a/gettext-tools/tests/Makefile.am b/gettext-tools/tests/Makefile.am index 181288a0c..4cf3be2af 100644 --- a/gettext-tools/tests/Makefile.am +++ b/gettext-tools/tests/Makefile.am @@ -42,6 +42,7 @@ TESTS = gettext-1 gettext-2 \ msgcomm-20 msgcomm-21 msgcomm-22 msgcomm-23 msgcomm-24 msgcomm-25 \ msgcomm-26 msgcomm-27 msgcomm-28 \ msgconv-1 msgconv-2 msgconv-3 msgconv-4 msgconv-5 msgconv-6 msgconv-7 \ + msgconv-8 \ msgen-1 msgen-2 msgen-3 msgen-4 \ msgexec-1 msgexec-2 msgexec-3 msgexec-4 msgexec-5 msgexec-6 \ msgfilter-1 msgfilter-2 msgfilter-3 msgfilter-4 msgfilter-5 \ diff --git a/gettext-tools/tests/msgconv-8 b/gettext-tools/tests/msgconv-8 new file mode 100755 index 000000000..0d9837b30 --- /dev/null +++ b/gettext-tools/tests/msgconv-8 @@ -0,0 +1,71 @@ +#! /bin/sh +. "${srcdir=.}/init.sh"; path_prepend_ . ../src + +# Test file locations with file names that contain spaces. 
+ +cat <<\EOF > mco-test8.po +msgid "" +msgstr "" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=UTF-8\n" +"Content-Transfer-Encoding: 8bit\n" + +#: ⁨xg-test17 a.c⁩:1 ⁨xg-test17 x y.c⁩:1 +msgid "foo" +msgstr "" + +#: ⁨xg-test17 x y.c⁩:2 xg-test17z.c:1 +msgid "bar" +msgstr "" +EOF + +: ${MSGCONV=msgconv} +${MSGCONV} --to-code=UTF-8 \ + -o mco-test8.out1 mco-test8.po || Exit 1 + +: ${DIFF=diff} +${DIFF} mco-test8.po mco-test8.out1 || Exit 1 + +: ${MSGCONV=msgconv} +${MSGCONV} --to-code=GB18030 \ + -o mco-test8.2.po mco-test8.po || Exit 1 + +cat <<\EOF > mco-test8.ok +msgid "" +msgstr "" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=GB18030\n" +"Content-Transfer-Encoding: 8bit\n" + +#: 6¬4xg-test17 a.c6¬5:1 6¬4xg-test17 x y.c6¬5:1 +msgid "foo" +msgstr "" + +#: 6¬4xg-test17 x y.c6¬5:2 xg-test17z.c:1 +msgid "bar" +msgstr "" +EOF + +: ${DIFF=diff} +${DIFF} mco-test8.ok mco-test8.2.po || Exit 1 + +: ${MSGCONV=msgconv} +${MSGCONV} --to-code=GB18030 \ + -o mco-test8.out2 mco-test8.2.po || Exit 1 + +: ${DIFF=diff} +${DIFF} mco-test8.2.po mco-test8.out2 || Exit 1 + +: ${MSGCONV=msgconv} +${MSGCONV} --to-code=UTF-8 \ + -o mco-test8.out3 mco-test8.2.po || Exit 1 + +: ${DIFF=diff} +${DIFF} mco-test8.po mco-test8.out3 || Exit 1 + +: ${MSGCONV=msgconv} +${MSGCONV} --to-code=ISO-8859-1 \ + -o mco-test8.out4 mco-test8.po 2>/dev/null +test $? = 1 || Exit 1 + +exit 0 diff --git a/gettext-tools/woe32dll/gettextsrc-exports.c b/gettext-tools/woe32dll/gettextsrc-exports.c index 6d7608959..6e8ea9c0f 100644 --- a/gettext-tools/woe32dll/gettextsrc-exports.c +++ b/gettext-tools/woe32dll/gettextsrc-exports.c @@ -1,5 +1,5 @@ /* List of exported symbols of libgettextsrc on Cygwin. - Copyright (C) 2006-2007, 2009-2011, 2013-2015, 2019 Free Software Foundation, + Copyright (C) 2006-2007, 2009-2011, 2013-2015, 2019, 2021 Free Software Foundation, Inc. Written by Bruno Haible , 2006. 
@@ -77,6 +77,8 @@ VARIABLE(po_error) VARIABLE(po_error_at_line) VARIABLE(po_gram_lval) VARIABLE(po_lex_charset) +VARIABLE(po_lex_isolate_start) +VARIABLE(po_lex_isolate_end) #if HAVE_ICONV VARIABLE(po_lex_iconv) #endif -- 2.47.3