From: Bruno Haible <bruno@clisp.org>
Date: Sun, 1 Dec 2024 07:40:52 +0000 (+0100)
Subject: xgettext: C: Revisit "hexadecimal escape sequence out of range" warnings.
X-Git-Tag: v0.23~6
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=f55873531e82adc985d87900a12c8c7fc2c71531;p=thirdparty%2Fgettext.git

xgettext: C: Revisit "hexadecimal escape sequence out of range" warnings.

Reported by Vaclav Slavik <vaclav@slavik.io>
at <https://savannah.gnu.org/bugs/?65053>.

* gettext-tools/src/x-c.c (get_string_element): Add context argument. Don't warn
for character literals. For wide strings, use a different warning and complain
about "out of range" only for values >= 0x110000.
(phase5_get): Distinguish normal string literals and wide string literals.
---

diff --git a/gettext-tools/src/x-c.c b/gettext-tools/src/x-c.c
index 34ebbec65..4137f8e99 100644
--- a/gettext-tools/src/x-c.c
+++ b/gettext-tools/src/x-c.c
@@ -1015,8 +1015,12 @@ struct token_ty
 #define UNICODE_VALUE(p7_result) ((p7_result) - 0x100)
 
 
+/* Parse an element of a character literal or string literal.
+   CONTEXT is -1 for a character literal (wide or not),
+              0 for a normal string literal,
+              1 for a wide string literal.  */
 static int
-get_string_element ()
+get_string_element (int context)
 {
   int c, j;
 
@@ -1100,7 +1104,12 @@ get_string_element ()
           break;
         }
       {
-        int n;
+        /* For the overflow detection:
+           - Valid character values in normal strings must be < 0x100.
+           - In wide strings, warn and assume the programmer meant Unicode code
+             points.  */
+        unsigned int n_limit = (context > 0 ? 0x110000 : 0x100);
+        unsigned int n;
         bool overflow;
 
         n = 0;
@@ -1112,29 +1121,40 @@ get_string_element ()
               {
               default:
                 phase3_ungetc (c);
-                if (overflow)
-                  if_error (IF_SEVERITY_WARNING,
-                            logical_file_name, line_number, (size_t)(-1), false,
-                            _("hexadecimal escape sequence out of range"));
-                return n;
+                /* Don't warn for character literals.  */
+                if (context >= 0)
+                  {
+                    if (context > 0 && n >= 0x80)
+                      /* Hexadecimal escape sequences outside the ASCII
+                         character range are platform and locale dependent.
+                         Cf. <https://savannah.gnu.org/bugs/?65053>.  */
+                      if_error (IF_SEVERITY_WARNING,
+                                logical_file_name, line_number, (size_t)(-1), false,
+                                _("hexadecimal escape sequence in wide string literal is unsupported; use \\u instead of \\x if you meant to designate a Unicode character"));
+                    if (overflow)
+                      if_error (IF_SEVERITY_WARNING,
+                                logical_file_name, line_number, (size_t)(-1), false,
+                                _("hexadecimal escape sequence out of range"));
+                  }
+                return (context > 0 ? UNICODE (n) : n);
 
               case '0': case '1': case '2': case '3': case '4':
               case '5': case '6': case '7': case '8': case '9':
-                if (n < 0x100 / 16)
+                if (n < n_limit / 16)
                   n = n * 16 + c - '0';
                 else
                   overflow = true;
                 break;
 
               case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
-                if (n < 0x100 / 16)
+                if (n < n_limit / 16)
                   n = n * 16 + 10 + c - 'A';
                 else
                   overflow = true;
                 break;
 
               case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
-                if (n < 0x100 / 16)
+                if (n < n_limit / 16)
                   n = n * 16 + 10 + c - 'a';
                 else
                   overflow = true;
@@ -1200,9 +1220,11 @@ get_string_element ()
         if (n < 0x110000)
           return UNICODE (n);
 
-        if_error (IF_SEVERITY_WARNING,
-                  logical_file_name, line_number, (size_t)(-1), false,
-                  _("invalid Unicode character"));
+        /* Don't warn for character literals.  */
+        if (context >= 0)
+          if_error (IF_SEVERITY_WARNING,
+                    logical_file_name, line_number, (size_t)(-1), false,
+                     _("invalid Unicode character"));
 
         while (--j >= 0)
           phase3_ungetc (buf[j]);
@@ -1332,13 +1354,17 @@ phase5_get (token_ty *tp)
                      Note: The programmer who passes an UTF-8 encoded string to
                      gettext() or similar API functions will have to have called
                      bind_textdomain_codeset (DOMAIN, "UTF-8") first.  */
-                  if ((buflen == 1
-                       && (buf[0] == 'u' || buf[0] == 'U' || buf[0] == 'L'))
+                  if ((buflen == 1 && (buf[0] == 'u' || buf[0] == 'U'))
                       || (buflen == 2 && buf[0] == 'u' && buf[1] == '8'))
                     {
                       sb_free (&buffer);
                       goto string_literal;
                     }
+                  if (buflen == 1 && buf[0] == 'L')
+                    {
+                      sb_free (&buffer);
+                      goto wide_string_literal;
+                    }
                   /* Recognize C++11 raw string literals.
                      See ISO C++ 11 section 2.14.5 [lex.string].
                      Here it is important to properly parse all cases according to
@@ -1669,7 +1695,7 @@ phase5_get (token_ty *tp)
          remember the character constant.  */
       for (;;)
         {
-          c = get_string_element ();
+          c = get_string_element (-1);
           if (c == SE_NEWLINE)
             {
               if_error (IF_SEVERITY_WARNING,
@@ -1693,6 +1719,9 @@ phase5_get (token_ty *tp)
          about the argument not matching the prototype.  Just pretend it
          won't happen.  */
       {
+        int wide = 0;
+        if (false)
+          wide_string_literal: wide = 1;
         struct mixed_string_buffer msb;
 
         /* Start accumulating the string.  */
@@ -1701,7 +1730,7 @@ phase5_get (token_ty *tp)
 
         for (;;)
           {
-            c = get_string_element ();
+            c = get_string_element (wide);
 
             /* Keep line_number in sync.  */
             msb.line_number = line_number;