* gettext-tools/src/xg-encoding.c: Include unistr.h.
(non_utf8_error_message): New function.
(from_current_source_encoding): When xgettext_current_source_encoding is
"UTF-8", check that the string is well-formed UTF-8.
* gettext-tools/tests/xgettext-c-8: New file.
* gettext-tools/tests/xgettext-python-5: New file.
* gettext-tools/tests/xgettext-elisp-3: New file.
* gettext-tools/tests/xgettext-librep-3: New file.
* gettext-tools/tests/xgettext-awk-3: New file.
* gettext-tools/tests/xgettext-lua-3: New file.
* gettext-tools/tests/xgettext-vala-4: New file.
* gettext-tools/tests/xgettext-php-5: New file.
* gettext-tools/tests/Makefile.am (TESTS): Add them.
/* Keeping track of the encoding of strings to be extracted.
- Copyright (C) 2001-2019 Free Software Foundation, Inc.
+ Copyright (C) 2001-2023 Free Software Foundation, Inc.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
#include "msgl-ascii.h"
#include "msgl-iconv.h"
#include "po-charset.h"
+#include "unistr.h"
#include "xalloc.h"
#include "xerror.h"
#include "xvasprintf.h"
return errmsg;
}
+/* Build the error message reporting that some text is not UTF-8 encoded.
+   LCONTEXT selects the wording (stray character, comment, or string);
+   FILE_NAME and LINE_NUMBER give the position.  A LINE_NUMBER of
+   (size_t)(-1) means that no line number is available, in which case only
+   FILE_NAME appears in the message.
+   Returns the string produced by xasprintf.  Aborts on any other LCONTEXT
+   value.  */
+static char *
+non_utf8_error_message (lexical_context_ty lcontext,
+                        const char *file_name, size_t line_number)
+{
+  /* Room for ':', the at most 20 decimal digits of a 64-bit unsigned long,
+     and the terminating NUL.  */
+  char buffer[1 + 20 + 1];
+  char *errmsg;
+
+  /* The line number is formatted as an unsigned value, so that a huge
+     number cannot be printed as a negative one, and the write is bounded
+     by the buffer size.  */
+  if (line_number == (size_t)(-1))
+    buffer[0] = '\0';
+  else
+    snprintf (buffer, sizeof (buffer), ":%lu", (unsigned long) line_number);
+
+  switch (lcontext)
+    {
+    case lc_outside:
+      errmsg =
+        xasprintf (_("Character at %s%s is not UTF-8 encoded."),
+                   file_name, buffer);
+      break;
+    case lc_comment:
+      errmsg =
+        xasprintf (_("Comment at or before %s%s is not UTF-8 encoded."),
+                   file_name, buffer);
+      break;
+    case lc_string:
+      errmsg =
+        xasprintf (_("String at %s%s is not UTF-8 encoded."),
+                   file_name, buffer);
+      break;
+    default:
+      /* Every lexical context known here is handled above.  */
+      abort ();
+    }
+  return errmsg;
+}
+
/* Convert the given string from xgettext_current_source_encoding to
the output file encoding (i.e. ASCII or UTF-8).
The resulting string is either the argument string, or freshly allocated.
exit (EXIT_FAILURE);
}
}
- else if (xgettext_current_source_encoding != po_charset_utf8)
+ else if (xgettext_current_source_encoding == po_charset_utf8)
+ {
+ if (u8_check ((uint8_t *) string, strlen (string)) != NULL)
+ {
+ multiline_error (xstrdup (""),
+ xasprintf ("%s\n%s\n",
+ non_utf8_error_message (lcontext,
+ file_name,
+ line_number),
+ _("Please specify the source encoding through --from-code.")));
+ exit (EXIT_FAILURE);
+ }
+ }
+ else
{
#if HAVE_ICONV
struct conversion_context context;
xgettext-13 xgettext-14 xgettext-15 xgettext-16 xgettext-17 \
xgettext-18 \
xgettext-appdata-1 \
- xgettext-awk-1 xgettext-awk-2 \
+ xgettext-awk-1 xgettext-awk-2 xgettext-awk-3 \
xgettext-awk-stackovfl-1 xgettext-awk-stackovfl-2 \
xgettext-c-2 xgettext-c-3 xgettext-c-4 xgettext-c-5 xgettext-c-6 \
- xgettext-c-7 \
+ xgettext-c-7 xgettext-c-8 \
xgettext-c-comment-1 xgettext-c-comment-2 xgettext-c-comment-3 \
xgettext-c-comment-4 xgettext-c-comment-5 xgettext-c-comment-6 \
xgettext-c-escape-1 xgettext-c-escape-2 xgettext-c-escape-3 \
xgettext-csharp-stackovfl-1 xgettext-csharp-stackovfl-2 \
xgettext-csharp-stackovfl-3 xgettext-csharp-stackovfl-4 \
xgettext-desktop-1 xgettext-desktop-2 \
- xgettext-elisp-1 xgettext-elisp-2 \
+ xgettext-elisp-1 xgettext-elisp-2 xgettext-elisp-3 \
xgettext-elisp-stackovfl-1 xgettext-elisp-stackovfl-2 \
xgettext-elisp-stackovfl-3 xgettext-elisp-stackovfl-4 \
xgettext-glade-1 xgettext-glade-2 xgettext-glade-3 xgettext-glade-4 \
xgettext-javascript-stackovfl-3 xgettext-javascript-stackovfl-4 \
xgettext-javascript-stackovfl-5 xgettext-javascript-stackovfl-6 \
xgettext-javascript-stackovfl-7 xgettext-javascript-stackovfl-8 \
- xgettext-librep-1 xgettext-librep-2 \
+ xgettext-librep-1 xgettext-librep-2 xgettext-librep-3 \
xgettext-librep-stackovfl-1 xgettext-librep-stackovfl-2 \
xgettext-lisp-1 xgettext-lisp-2 \
xgettext-lisp-stackovfl-1 xgettext-lisp-stackovfl-2 \
- xgettext-lua-1 xgettext-lua-2 \
+ xgettext-lua-1 xgettext-lua-2 xgettext-lua-3 \
xgettext-lua-stackovfl-1 xgettext-lua-stackovfl-2 \
xgettext-lua-stackovfl-3 xgettext-lua-stackovfl-4 \
xgettext-objc-1 xgettext-objc-2 \
xgettext-perl-stackovfl-1 xgettext-perl-stackovfl-2 \
xgettext-perl-stackovfl-3 xgettext-perl-stackovfl-4 \
xgettext-php-1 xgettext-php-2 xgettext-php-3 xgettext-php-4 \
+ xgettext-php-5 \
xgettext-php-stackovfl-1 xgettext-php-stackovfl-2 \
xgettext-php-stackovfl-3 xgettext-php-stackovfl-4 \
xgettext-po-1 xgettext-po-2 xgettext-po-3 xgettext-po-4 \
xgettext-properties-4 \
xgettext-rst-1 xgettext-rst-2 \
xgettext-python-1 xgettext-python-2 xgettext-python-3 \
- xgettext-python-4 \
+ xgettext-python-4 xgettext-python-5 \
xgettext-python-stackovfl-1 xgettext-python-stackovfl-2 \
xgettext-python-stackovfl-3 xgettext-python-stackovfl-4 \
xgettext-ruby-1 \
xgettext-tcl-1 xgettext-tcl-2 xgettext-tcl-3 xgettext-tcl-4 \
xgettext-tcl-stackovfl-1 xgettext-tcl-stackovfl-2 \
xgettext-tcl-stackovfl-3 xgettext-tcl-stackovfl-4 \
- xgettext-vala-1 xgettext-vala-2 xgettext-vala-3 \
+ xgettext-vala-1 xgettext-vala-2 xgettext-vala-3 xgettext-vala-4 \
xgettext-vala-stackovfl-1 xgettext-vala-stackovfl-2 \
xgettext-ycp-1 xgettext-ycp-2 xgettext-ycp-3 xgettext-ycp-4 \
xgettext-ycp-stackovfl-1 xgettext-ycp-stackovfl-2 \
--- /dev/null
+#! /bin/sh
+. "${srcdir=.}/init.sh"
+path_prepend_ . ../src
+
+# awk support: a string whose hexadecimal escape sequence is not valid
+# UTF-8 must make xgettext fail with a diagnostic.
+
+# Test input: a translatable string consisting of the escape \xE0 only.
+cat <<\EOF > xg-a-3.awk
+_"\xE0"
+EOF
+
+: ${XGETTEXT=xgettext}
+
+# Run the extraction with UTF-8 declared as source encoding; keep the
+# diagnostics in a file and remember the exit status.
+LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location --from-code=UTF-8 \
+  -d xg-a-3.tmp xg-a-3.awk 2>xg-a-3.err
+result=$?
+cat xg-a-3.err
+
+# The run must have failed with exit status 1 ...
+if test $result != 1; then
+  Exit 1
+fi
+# ... and the diagnostic must mention the encoding problem.
+if grep 'is not UTF-8 encoded' xg-a-3.err >/dev/null; then
+  :
+else
+  Exit 1
+fi
+
+exit 0
--- /dev/null
+#! /bin/sh
+. "${srcdir=.}/init.sh"
+path_prepend_ . ../src
+
+# C support: a string whose hexadecimal escape sequence is not valid
+# UTF-8 must make xgettext fail with a diagnostic.
+
+# Test input: a gettext call whose argument is the escape \xE0 only.
+cat <<\EOF > xg-c-8.c
+gettext("\xE0")
+EOF
+
+: ${XGETTEXT=xgettext}
+
+# Run the extraction with UTF-8 declared as source encoding; keep the
+# diagnostics in a file and remember the exit status.
+LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location --from-code=UTF-8 \
+  -d xg-c-8.tmp xg-c-8.c 2>xg-c-8.err
+result=$?
+cat xg-c-8.err
+
+# The run must have failed with exit status 1 ...
+if test $result != 1; then
+  Exit 1
+fi
+# ... and the diagnostic must mention the encoding problem.
+if grep 'is not UTF-8 encoded' xg-c-8.err >/dev/null; then
+  :
+else
+  Exit 1
+fi
+
+exit 0
--- /dev/null
+#! /bin/sh
+. "${srcdir=.}/init.sh"
+path_prepend_ . ../src
+
+# EmacsLisp support: a string whose hexadecimal escape sequence is not
+# valid UTF-8 must make xgettext fail with a diagnostic.
+
+# Test input: a call of _ whose argument is the escape \xE0 only.
+cat <<\EOF > xg-el-3.el
+(_ "\xE0")
+EOF
+
+: ${XGETTEXT=xgettext}
+
+# Run the extraction with UTF-8 declared as source encoding; keep the
+# diagnostics in a file and remember the exit status.
+LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location --from-code=UTF-8 \
+  -d xg-el-3.tmp xg-el-3.el 2>xg-el-3.err
+result=$?
+cat xg-el-3.err
+
+# The run must have failed with exit status 1 ...
+if test $result != 1; then
+  Exit 1
+fi
+# ... and the diagnostic must mention the encoding problem.
+if grep 'is not UTF-8 encoded' xg-el-3.err >/dev/null; then
+  :
+else
+  Exit 1
+fi
+
+exit 0
--- /dev/null
+#! /bin/sh
+. "${srcdir=.}/init.sh"
+path_prepend_ . ../src
+
+# librep support: a string whose hexadecimal escape sequence is not valid
+# UTF-8 must make xgettext fail with a diagnostic.
+
+# Test input: a call of _ whose argument is the escape \xE0 only.
+cat <<\EOF > xg-lr-3.jl
+(_ "\xE0")
+EOF
+
+: ${XGETTEXT=xgettext}
+
+# Run the extraction with UTF-8 declared as source encoding; keep the
+# diagnostics in a file and remember the exit status.
+LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location --from-code=UTF-8 \
+  -d xg-lr-3.tmp xg-lr-3.jl 2>xg-lr-3.err
+result=$?
+cat xg-lr-3.err
+
+# The run must have failed with exit status 1 ...
+if test $result != 1; then
+  Exit 1
+fi
+# ... and the diagnostic must mention the encoding problem.
+if grep 'is not UTF-8 encoded' xg-lr-3.err >/dev/null; then
+  :
+else
+  Exit 1
+fi
+
+exit 0
--- /dev/null
+#! /bin/sh
+. "${srcdir=.}/init.sh"
+path_prepend_ . ../src
+
+# Lua support: a string whose hexadecimal escape sequence is not valid
+# UTF-8 must make xgettext fail with a diagnostic.
+
+# Test input: a call of _ whose argument is the escape \xE0 only.
+cat <<\EOF > xg-lu-3.lua
+_("\xE0")
+EOF
+
+: ${XGETTEXT=xgettext}
+
+# Run the extraction with UTF-8 declared as source encoding; keep the
+# diagnostics in a file and remember the exit status.
+LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location --from-code=UTF-8 \
+  -d xg-lu-3.tmp xg-lu-3.lua 2>xg-lu-3.err
+result=$?
+cat xg-lu-3.err
+
+# The run must have failed with exit status 1 ...
+if test $result != 1; then
+  Exit 1
+fi
+# ... and the diagnostic must mention the encoding problem.
+if grep 'is not UTF-8 encoded' xg-lu-3.err >/dev/null; then
+  :
+else
+  Exit 1
+fi
+
+exit 0
--- /dev/null
+#! /bin/sh
+. "${srcdir=.}/init.sh"; path_prepend_ . ../src
+
+# Test PHP support: strings with hexadecimal escape sequences that are
+# invalid UTF-8.
+
+# The work files are named after this test (xg-ph-5.*), like in the
+# sibling tests, rather than after an unrelated test number.
+cat <<\EOF > xg-ph-5.php
+<?
+_("\xE0")
+?>
+EOF
+
+: ${XGETTEXT=xgettext}
+# Extract with UTF-8 declared as source encoding; the diagnostics go to
+# xg-ph-5.err so that they can be inspected below.
+LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location --from-code=UTF-8 -d xg-ph-5.tmp xg-ph-5.php 2>xg-ph-5.err
+result=$?
+cat xg-ph-5.err
+# xgettext must have failed with exit status 1 ...
+test $result = 1 || Exit 1
+# ... and must have reported the encoding problem.
+grep 'is not UTF-8 encoded' xg-ph-5.err >/dev/null || Exit 1
+
+exit 0
--- /dev/null
+#! /bin/sh
+. "${srcdir=.}/init.sh"
+path_prepend_ . ../src
+
+# Python support: a string whose hexadecimal escape sequence is not valid
+# UTF-8 must make xgettext fail with a diagnostic.
+
+# Test input: a call of _ whose argument is the escape \xE0 only.
+cat <<\EOF > xg-py-5.py
+_("\xE0")
+EOF
+
+: ${XGETTEXT=xgettext}
+
+# Run the extraction (no --from-code option is passed in this test); keep
+# the diagnostics in a file and remember the exit status.
+LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location \
+  -d xg-py-5.tmp xg-py-5.py 2>xg-py-5.err
+result=$?
+cat xg-py-5.err
+
+# The run must have failed with exit status 1 ...
+if test $result != 1; then
+  Exit 1
+fi
+# ... and the diagnostic must mention the encoding problem.
+if grep 'is not UTF-8 encoded' xg-py-5.err >/dev/null; then
+  :
+else
+  Exit 1
+fi
+
+exit 0
--- /dev/null
+#! /bin/sh
+. "${srcdir=.}/init.sh"
+path_prepend_ . ../src
+
+# Vala support: a string whose hexadecimal escape sequence is not valid
+# UTF-8 must make xgettext fail with a diagnostic.
+
+# Test input: a call of _ whose argument is the escape \xE0 only.
+cat <<\EOF > xg-vala-4.vala
+_("\xE0")
+EOF
+
+: ${XGETTEXT=xgettext}
+
+# Run the extraction with UTF-8 declared as source encoding; keep the
+# diagnostics in a file and remember the exit status.
+LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location --from-code=UTF-8 \
+  -d xg-vala-4.tmp xg-vala-4.vala 2>xg-vala-4.err
+result=$?
+cat xg-vala-4.err
+
+# The run must have failed with exit status 1 ...
+if test $result != 1; then
+  Exit 1
+fi
+# ... and the diagnostic must mention the encoding problem.
+if grep 'is not UTF-8 encoded' xg-vala-4.err >/dev/null; then
+  :
+else
+  Exit 1
+fi
+
+exit 0