git.ipfire.org Git - thirdparty/gettext.git/commitdiff
xgettext: Fix abort when outputting a msgid that has an invalid UTF-8 character.
author: Bruno Haible <bruno@clisp.org>
Sun, 12 Mar 2023 00:39:57 +0000 (01:39 +0100)
committer: Bruno Haible <bruno@clisp.org>
Tue, 14 Mar 2023 01:57:28 +0000 (02:57 +0100)
* gettext-tools/src/xg-encoding.c: Include unistr.h.
(non_utf8_error_message): New function.
(from_current_source_encoding): When xgettext_current_source_encoding is
"UTF-8", check that the string is well-formed UTF-8.
* gettext-tools/tests/xgettext-c-8: New file.
* gettext-tools/tests/xgettext-python-5: New file.
* gettext-tools/tests/xgettext-elisp-3: New file.
* gettext-tools/tests/xgettext-librep-3: New file.
* gettext-tools/tests/xgettext-awk-3: New file.
* gettext-tools/tests/xgettext-lua-3: New file.
* gettext-tools/tests/xgettext-vala-4: New file.
* gettext-tools/tests/xgettext-php-5: New file.
* gettext-tools/tests/Makefile.am (TESTS): Add them.

gettext-tools/src/xg-encoding.c
gettext-tools/tests/Makefile.am
gettext-tools/tests/xgettext-awk-3 [new file with mode: 0755]
gettext-tools/tests/xgettext-c-8 [new file with mode: 0755]
gettext-tools/tests/xgettext-elisp-3 [new file with mode: 0755]
gettext-tools/tests/xgettext-librep-3 [new file with mode: 0755]
gettext-tools/tests/xgettext-lua-3 [new file with mode: 0755]
gettext-tools/tests/xgettext-php-5 [new file with mode: 0755]
gettext-tools/tests/xgettext-python-5 [new file with mode: 0755]
gettext-tools/tests/xgettext-vala-4 [new file with mode: 0755]

index 11793368e3f0e92cf3428dea35ace530fd26b171..d06587c8520357671276d196aa111056c940fe2a 100644 (file)
@@ -1,5 +1,5 @@
 /* Keeping track of the encoding of strings to be extracted.
-   Copyright (C) 2001-2019 Free Software Foundation, Inc.
+   Copyright (C) 2001-2023 Free Software Foundation, Inc.
 
    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -27,6 +27,7 @@
 #include "msgl-ascii.h"
 #include "msgl-iconv.h"
 #include "po-charset.h"
+#include "unistr.h"
 #include "xalloc.h"
 #include "xerror.h"
 #include "xvasprintf.h"
@@ -90,6 +91,42 @@ non_ascii_error_message (lexical_context_ty lcontext,
   return errmsg;
 }
 
+/* Build the "... at FILE_NAME[:LINE_NUMBER] is not UTF-8 encoded." message for LCONTEXT.  */
+static char *
+non_utf8_error_message (lexical_context_ty lcontext,
+                        const char *file_name, size_t line_number)
+{
+  char buffer[22];  /* ':' + up to 20 decimal digits (64-bit unsigned long) + NUL */
+  char *errmsg;
+
+  if (line_number == (size_t)(-1))  /* no line number given: omit the ":N" suffix */
+    buffer[0] = '\0';
+  else  /* unsigned and bounded: a huge size_t must neither print negative nor overflow */
+    snprintf (buffer, sizeof buffer, ":%lu", (unsigned long) line_number);
+
+  switch (lcontext)  /* the wording depends on where the offending bytes were found */
+    {
+    case lc_outside:
+      errmsg =
+        xasprintf (_("Character at %s%s is not UTF-8 encoded."),
+                   file_name, buffer);
+      break;
+    case lc_comment:
+      errmsg =
+        xasprintf (_("Comment at or before %s%s is not UTF-8 encoded."),
+                   file_name, buffer);
+      break;
+    case lc_string:
+      errmsg =
+        xasprintf (_("String at %s%s is not UTF-8 encoded."),
+                   file_name, buffer);
+      break;
+    default:  /* any other lexical context aborts the program */
+      abort ();
+    }
+  return errmsg;  /* result of xasprintf; presumably heap-allocated and owned by the caller -- confirm */
+}
+
 /* Convert the given string from xgettext_current_source_encoding to
    the output file encoding (i.e. ASCII or UTF-8).
    The resulting string is either the argument string, or freshly allocated.
@@ -112,7 +149,20 @@ from_current_source_encoding (const char *string,
           exit (EXIT_FAILURE);
         }
     }
-  else if (xgettext_current_source_encoding != po_charset_utf8)
+  else if (xgettext_current_source_encoding == po_charset_utf8)
+    {
+      if (u8_check ((uint8_t *) string, strlen (string)) != NULL)
+        {
+          multiline_error (xstrdup (""),
+                           xasprintf ("%s\n%s\n",
+                                      non_utf8_error_message (lcontext,
+                                                              file_name,
+                                                              line_number),
+                                      _("Please specify the source encoding through --from-code.")));
+          exit (EXIT_FAILURE);
+        }
+    }
+  else
     {
 #if HAVE_ICONV
       struct conversion_context context;
index a9595dfc37a5e1d5ac0a6c903c8c1cc37a936d02..b0d96c83e4a3a1a4641bb0f0ad37a170f560ca12 100644 (file)
@@ -82,10 +82,10 @@ TESTS = gettext-1 gettext-2 \
        xgettext-13 xgettext-14 xgettext-15 xgettext-16 xgettext-17 \
        xgettext-18 \
        xgettext-appdata-1 \
-       xgettext-awk-1 xgettext-awk-2 \
+       xgettext-awk-1 xgettext-awk-2 xgettext-awk-3 \
        xgettext-awk-stackovfl-1 xgettext-awk-stackovfl-2 \
        xgettext-c-2 xgettext-c-3 xgettext-c-4 xgettext-c-5 xgettext-c-6 \
-       xgettext-c-7 \
+       xgettext-c-7 xgettext-c-8 \
        xgettext-c-comment-1 xgettext-c-comment-2 xgettext-c-comment-3 \
        xgettext-c-comment-4 xgettext-c-comment-5 xgettext-c-comment-6 \
        xgettext-c-escape-1 xgettext-c-escape-2 xgettext-c-escape-3 \
@@ -100,7 +100,7 @@ TESTS = gettext-1 gettext-2 \
        xgettext-csharp-stackovfl-1 xgettext-csharp-stackovfl-2 \
        xgettext-csharp-stackovfl-3 xgettext-csharp-stackovfl-4 \
        xgettext-desktop-1 xgettext-desktop-2 \
-       xgettext-elisp-1 xgettext-elisp-2 \
+       xgettext-elisp-1 xgettext-elisp-2 xgettext-elisp-3 \
        xgettext-elisp-stackovfl-1 xgettext-elisp-stackovfl-2 \
        xgettext-elisp-stackovfl-3 xgettext-elisp-stackovfl-4 \
        xgettext-glade-1 xgettext-glade-2 xgettext-glade-3 xgettext-glade-4 \
@@ -118,11 +118,11 @@ TESTS = gettext-1 gettext-2 \
        xgettext-javascript-stackovfl-3 xgettext-javascript-stackovfl-4 \
        xgettext-javascript-stackovfl-5 xgettext-javascript-stackovfl-6 \
        xgettext-javascript-stackovfl-7 xgettext-javascript-stackovfl-8 \
-       xgettext-librep-1 xgettext-librep-2 \
+       xgettext-librep-1 xgettext-librep-2 xgettext-librep-3 \
        xgettext-librep-stackovfl-1 xgettext-librep-stackovfl-2 \
        xgettext-lisp-1 xgettext-lisp-2 \
        xgettext-lisp-stackovfl-1 xgettext-lisp-stackovfl-2 \
-       xgettext-lua-1 xgettext-lua-2 \
+       xgettext-lua-1 xgettext-lua-2 xgettext-lua-3 \
        xgettext-lua-stackovfl-1 xgettext-lua-stackovfl-2 \
        xgettext-lua-stackovfl-3 xgettext-lua-stackovfl-4 \
        xgettext-objc-1 xgettext-objc-2 \
@@ -131,6 +131,7 @@ TESTS = gettext-1 gettext-2 \
        xgettext-perl-stackovfl-1 xgettext-perl-stackovfl-2 \
        xgettext-perl-stackovfl-3 xgettext-perl-stackovfl-4 \
        xgettext-php-1 xgettext-php-2 xgettext-php-3 xgettext-php-4 \
+       xgettext-php-5 \
        xgettext-php-stackovfl-1 xgettext-php-stackovfl-2 \
        xgettext-php-stackovfl-3 xgettext-php-stackovfl-4 \
        xgettext-po-1 xgettext-po-2 xgettext-po-3 xgettext-po-4 \
@@ -138,7 +139,7 @@ TESTS = gettext-1 gettext-2 \
        xgettext-properties-4 \
        xgettext-rst-1 xgettext-rst-2 \
        xgettext-python-1 xgettext-python-2 xgettext-python-3 \
-       xgettext-python-4 \
+       xgettext-python-4 xgettext-python-5 \
        xgettext-python-stackovfl-1 xgettext-python-stackovfl-2 \
        xgettext-python-stackovfl-3 xgettext-python-stackovfl-4 \
        xgettext-ruby-1 \
@@ -155,7 +156,7 @@ TESTS = gettext-1 gettext-2 \
        xgettext-tcl-1 xgettext-tcl-2 xgettext-tcl-3 xgettext-tcl-4 \
        xgettext-tcl-stackovfl-1 xgettext-tcl-stackovfl-2 \
        xgettext-tcl-stackovfl-3 xgettext-tcl-stackovfl-4 \
-       xgettext-vala-1 xgettext-vala-2 xgettext-vala-3 \
+       xgettext-vala-1 xgettext-vala-2 xgettext-vala-3 xgettext-vala-4 \
        xgettext-vala-stackovfl-1 xgettext-vala-stackovfl-2 \
        xgettext-ycp-1 xgettext-ycp-2 xgettext-ycp-3 xgettext-ycp-4 \
        xgettext-ycp-stackovfl-1 xgettext-ycp-stackovfl-2 \
diff --git a/gettext-tools/tests/xgettext-awk-3 b/gettext-tools/tests/xgettext-awk-3
new file mode 100755 (executable)
index 0000000..c521517
--- /dev/null
@@ -0,0 +1,18 @@
+#! /bin/sh
+. "${srcdir=.}/init.sh"; path_prepend_ . ../src
+
+# Test awk support: strings with hexadecimal escape sequences that are
+# invalid UTF-8.
+
+cat <<\EOF > xg-a-3.awk  # quoted delimiter: the here-document body is taken literally
+_"\xE0"
+EOF
+
+: ${XGETTEXT=xgettext}  # default to "xgettext" unless XGETTEXT is already set
+LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location --from-code=UTF-8 -d xg-a-3.tmp xg-a-3.awk 2>xg-a-3.err
+result=$?  # save xgettext's exit status before 'cat' overwrites $?
+cat xg-a-3.err  # show the captured stderr
+test $result = 1 || Exit 1  # xgettext is expected to fail with exit status 1
+grep 'is not UTF-8 encoded' xg-a-3.err >/dev/null || Exit 1  # ...and to report the encoding error
+
+exit 0
diff --git a/gettext-tools/tests/xgettext-c-8 b/gettext-tools/tests/xgettext-c-8
new file mode 100755 (executable)
index 0000000..6a828b5
--- /dev/null
@@ -0,0 +1,18 @@
+#! /bin/sh
+. "${srcdir=.}/init.sh"; path_prepend_ . ../src
+
+# Test C support: strings with hexadecimal escape sequences that are
+# invalid UTF-8.
+
+cat <<\EOF > xg-c-8.c  # quoted delimiter: the here-document body is taken literally
+gettext("\xE0")
+EOF
+
+: ${XGETTEXT=xgettext}  # default to "xgettext" unless XGETTEXT is already set
+LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location --from-code=UTF-8 -d xg-c-8.tmp xg-c-8.c 2>xg-c-8.err
+result=$?  # save xgettext's exit status before 'cat' overwrites $?
+cat xg-c-8.err  # show the captured stderr
+test $result = 1 || Exit 1  # xgettext is expected to fail with exit status 1
+grep 'is not UTF-8 encoded' xg-c-8.err >/dev/null || Exit 1  # ...and to report the encoding error
+
+exit 0
diff --git a/gettext-tools/tests/xgettext-elisp-3 b/gettext-tools/tests/xgettext-elisp-3
new file mode 100755 (executable)
index 0000000..6d27e14
--- /dev/null
@@ -0,0 +1,18 @@
+#! /bin/sh
+. "${srcdir=.}/init.sh"; path_prepend_ . ../src
+
+# Test EmacsLisp support: strings with hexadecimal escape sequences that are
+# invalid UTF-8.
+
+cat <<\EOF > xg-el-3.el  # quoted delimiter: the here-document body is taken literally
+(_ "\xE0")
+EOF
+
+: ${XGETTEXT=xgettext}  # default to "xgettext" unless XGETTEXT is already set
+LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location --from-code=UTF-8 -d xg-el-3.tmp xg-el-3.el 2>xg-el-3.err
+result=$?  # save xgettext's exit status before 'cat' overwrites $?
+cat xg-el-3.err  # show the captured stderr
+test $result = 1 || Exit 1  # xgettext is expected to fail with exit status 1
+grep 'is not UTF-8 encoded' xg-el-3.err >/dev/null || Exit 1  # ...and to report the encoding error
+
+exit 0
diff --git a/gettext-tools/tests/xgettext-librep-3 b/gettext-tools/tests/xgettext-librep-3
new file mode 100755 (executable)
index 0000000..05d459f
--- /dev/null
@@ -0,0 +1,18 @@
+#! /bin/sh
+. "${srcdir=.}/init.sh"; path_prepend_ . ../src
+
+# Test librep support: strings with hexadecimal escape sequences that are
+# invalid UTF-8.
+
+cat <<\EOF > xg-lr-3.jl  # quoted delimiter: the here-document body is taken literally
+(_ "\xE0")
+EOF
+
+: ${XGETTEXT=xgettext}  # default to "xgettext" unless XGETTEXT is already set
+LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location --from-code=UTF-8 -d xg-lr-3.tmp xg-lr-3.jl 2>xg-lr-3.err
+result=$?  # save xgettext's exit status before 'cat' overwrites $?
+cat xg-lr-3.err  # show the captured stderr
+test $result = 1 || Exit 1  # xgettext is expected to fail with exit status 1
+grep 'is not UTF-8 encoded' xg-lr-3.err >/dev/null || Exit 1  # ...and to report the encoding error
+
+exit 0
diff --git a/gettext-tools/tests/xgettext-lua-3 b/gettext-tools/tests/xgettext-lua-3
new file mode 100755 (executable)
index 0000000..bd736ed
--- /dev/null
@@ -0,0 +1,18 @@
+#! /bin/sh
+. "${srcdir=.}/init.sh"; path_prepend_ . ../src
+
+# Test Lua support: strings with hexadecimal escape sequences that are
+# invalid UTF-8.
+
+cat <<\EOF > xg-lu-3.lua  # quoted delimiter: the here-document body is taken literally
+_("\xE0")
+EOF
+
+: ${XGETTEXT=xgettext}  # default to "xgettext" unless XGETTEXT is already set
+LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location --from-code=UTF-8 -d xg-lu-3.tmp xg-lu-3.lua 2>xg-lu-3.err
+result=$?  # save xgettext's exit status before 'cat' overwrites $?
+cat xg-lu-3.err  # show the captured stderr
+test $result = 1 || Exit 1  # xgettext is expected to fail with exit status 1
+grep 'is not UTF-8 encoded' xg-lu-3.err >/dev/null || Exit 1  # ...and to report the encoding error
+
+exit 0
diff --git a/gettext-tools/tests/xgettext-php-5 b/gettext-tools/tests/xgettext-php-5
new file mode 100755 (executable)
index 0000000..6c8a8e0
--- /dev/null
@@ -0,0 +1,20 @@
+#! /bin/sh
+. "${srcdir=.}/init.sh"; path_prepend_ . ../src
+
+# Test PHP support: strings with hexadecimal escape sequences that are
+# invalid UTF-8.
+
+cat <<\EOF > xg-ph-5.php  # quoted delimiter: the here-document body is taken literally
+<?
+_("\xE0")
+?>
+EOF
+
+: ${XGETTEXT=xgettext}  # default to "xgettext" unless XGETTEXT is already set
+LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location --from-code=UTF-8 -d xg-ph-5.tmp xg-ph-5.php 2>xg-ph-5.err
+result=$?  # save xgettext's exit status before 'cat' overwrites $?
+cat xg-ph-5.err  # show the captured stderr
+test $result = 1 || Exit 1  # xgettext is expected to fail with exit status 1
+grep 'is not UTF-8 encoded' xg-ph-5.err >/dev/null || Exit 1  # ...and to report the encoding error
+
+exit 0
diff --git a/gettext-tools/tests/xgettext-python-5 b/gettext-tools/tests/xgettext-python-5
new file mode 100755 (executable)
index 0000000..f480b4a
--- /dev/null
@@ -0,0 +1,18 @@
+#! /bin/sh
+. "${srcdir=.}/init.sh"; path_prepend_ . ../src
+
+# Test Python support: strings with hexadecimal escape sequences that are
+# invalid UTF-8.
+
+cat <<\EOF > xg-py-5.py  # quoted delimiter: the here-document body is taken literally
+_("\xE0")
+EOF
+
+: ${XGETTEXT=xgettext}  # default to "xgettext" unless XGETTEXT is already set
+LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location -d xg-py-5.tmp xg-py-5.py 2>xg-py-5.err  # NOTE(review): no --from-code here, unlike the sibling tests; confirm intended
+result=$?  # save xgettext's exit status before 'cat' overwrites $?
+cat xg-py-5.err  # show the captured stderr
+test $result = 1 || Exit 1  # xgettext is expected to fail with exit status 1
+grep 'is not UTF-8 encoded' xg-py-5.err >/dev/null || Exit 1  # ...and to report the encoding error
+
+exit 0
diff --git a/gettext-tools/tests/xgettext-vala-4 b/gettext-tools/tests/xgettext-vala-4
new file mode 100755 (executable)
index 0000000..be1910c
--- /dev/null
@@ -0,0 +1,18 @@
+#! /bin/sh
+. "${srcdir=.}/init.sh"; path_prepend_ . ../src
+
+# Test Vala support: strings with hexadecimal escape sequences that are
+# invalid UTF-8.
+
+cat <<\EOF > xg-vala-4.vala  # quoted delimiter: the here-document body is taken literally
+_("\xE0")
+EOF
+
+: ${XGETTEXT=xgettext}  # default to "xgettext" unless XGETTEXT is already set
+LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location --from-code=UTF-8 -d xg-vala-4.tmp xg-vala-4.vala 2>xg-vala-4.err
+result=$?  # save xgettext's exit status before 'cat' overwrites $?
+cat xg-vala-4.err  # show the captured stderr
+test $result = 1 || Exit 1  # xgettext is expected to fail with exit status 1
+grep 'is not UTF-8 encoded' xg-vala-4.err >/dev/null || Exit 1  # ...and to report the encoding error
+
+exit 0