gh-70278: Fix PyUnicode_FromFormat() with precision for %s and %V (GH-120365)

author Serhiy Storchaka <storchaka@gmail.com>

Mon, 24 Jun 2024 15:07:07 +0000 (18:07 +0300)

committer GitHub <noreply@github.com>

Mon, 24 Jun 2024 15:07:07 +0000 (18:07 +0300)
author Serhiy Storchaka <storchaka@gmail.com>
Mon, 24 Jun 2024 15:07:07 +0000 (18:07 +0300)
committer GitHub <noreply@github.com>
Mon, 24 Jun 2024 15:07:07 +0000 (18:07 +0300)
diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py

index 36106b0730dd26a6ba3fb3efc81e7d6642c7afa3..48a802c3f8bcb29cb5472779caa15926b319dd01 100644 (file)
--- a/Lib/test/test_capi/test_unicode.py
+++ b/Lib/test/test_capi/test_unicode.py
@@ -419,8 +419,29 @@ class CAPITest(unittest.TestCase):
          # truncated string
          check_format('abc',
                       b'%.3s', b'abcdef')
+        check_format('abc[',
+                     b'%.6s', 'abc[\u20ac]'.encode('utf8'))
+        check_format('abc[\u20ac',
+                     b'%.7s', 'abc[\u20ac]'.encode('utf8'))
          check_format('abc[\ufffd',
-                     b'%.5s', 'abc[\u20ac]'.encode('utf8'))
+                     b'%.5s', b'abc[\xff]')
+        check_format('abc[',
+                     b'%.6s', b'abc[\xe2\x82]')
+        check_format('abc[\ufffd]',
+                     b'%.7s', b'abc[\xe2\x82]')
+        check_format('abc[\ufffd',
+                     b'%.7s', b'abc[\xe2\x82\0')
+        check_format('      abc[',
+                     b'%10.6s', 'abc[\u20ac]'.encode('utf8'))
+        check_format('     abc[\u20ac',
+                     b'%10.7s', 'abc[\u20ac]'.encode('utf8'))
+        check_format('     abc[\ufffd',
+                     b'%10.5s', b'abc[\xff]')
+        check_format('      abc[',
+                     b'%10.6s', b'abc[\xe2\x82]')
+        check_format('    abc[\ufffd]',
+                     b'%10.7s', b'abc[\xe2\x82]')
+
          check_format("'\\u20acABC'",
                       b'%A', '\u20acABC')
          check_format("'\\u20",
@@ -433,10 +454,31 @@ class CAPITest(unittest.TestCase):
                       b'%.3S', '\u20acABCDEF')
          check_format('\u20acAB',
                       b'%.3U', '\u20acABCDEF')
+
          check_format('\u20acAB',
                       b'%.3V', '\u20acABCDEF', None)
+        check_format('abc[',
+                     b'%.6V', None, 'abc[\u20ac]'.encode('utf8'))
+        check_format('abc[\u20ac',
+                     b'%.7V', None, 'abc[\u20ac]'.encode('utf8'))
          check_format('abc[\ufffd',
-                     b'%.5V', None, 'abc[\u20ac]'.encode('utf8'))
+                     b'%.5V', None, b'abc[\xff]')
+        check_format('abc[',
+                     b'%.6V', None, b'abc[\xe2\x82]')
+        check_format('abc[\ufffd]',
+                     b'%.7V', None, b'abc[\xe2\x82]')
+        check_format('      abc[',
+                     b'%10.6V', None, 'abc[\u20ac]'.encode('utf8'))
+        check_format('     abc[\u20ac',
+                     b'%10.7V', None, 'abc[\u20ac]'.encode('utf8'))
+        check_format('     abc[\ufffd',
+                     b'%10.5V', None, b'abc[\xff]')
+        check_format('      abc[',
+                     b'%10.6V', None, b'abc[\xe2\x82]')
+        check_format('    abc[\ufffd]',
+                     b'%10.7V', None, b'abc[\xe2\x82]')
+        check_format('     abc[\ufffd',
+                     b'%10.7V', None, b'abc[\xe2\x82\0')
  
          # following tests comes from #7330
          # test width modifier and precision modifier with %S
diff --git a/Misc/NEWS.d/next/C API/2024-06-11-21-38-32.gh-issue-70278.WDE4zM.rst b/Misc/NEWS.d/next/C API/2024-06-11-21-38-32.gh-issue-70278.WDE4zM.rst

new file mode 100644 (file)

index 0000000..1eca36a
--- /dev/null
+++ b/Misc/NEWS.d/next/C API/2024-06-11-21-38-32.gh-issue-70278.WDE4zM.rst
@@ -0,0 +1,4 @@
+:c:func:`PyUnicode_FromFormat` no longer produces a trailing ``\ufffd``
+character for a truncated C string when a precision is used with ``%s`` or
+``%V``.  It now truncates the string before the start of a truncated
+multibyte sequence.
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c

index 279cdaa668e2914ae8df23f0a1d283c12f41a327..d11a9dca14b2805f561eb93c6a9d97210c9a3f1e 100644 (file)
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -2581,6 +2581,7 @@ unicode_fromformat_write_utf8(_PyUnicodeWriter *writer, const char *str,
                                Py_ssize_t width, Py_ssize_t precision, int flags)
  {
      /* UTF-8 */
+    Py_ssize_t *pconsumed = NULL;
      Py_ssize_t length;
      if (precision == -1) {
          length = strlen(str);
@@ -2590,15 +2591,23 @@ unicode_fromformat_write_utf8(_PyUnicodeWriter *writer, const char *str,
          while (length < precision && str[length]) {
              length++;
          }
+        if (length == precision) {
+            /* No NUL byte was found within the first 'precision' bytes, so
+             * the input may be cut mid-character.  If it ends with an
+             * incomplete UTF-8 sequence, truncate the string just before it.
+             * Incomplete sequences in the middle and sequences which cannot
+             * be valid prefixes are still errors, replaced with U+FFFD. */
+            pconsumed = &length;
+        }
      }
  
      if (width < 0) {
          return unicode_decode_utf8_writer(writer, str, length,
-                                          _Py_ERROR_REPLACE, "replace", NULL);
+                                          _Py_ERROR_REPLACE, "replace", pconsumed);
      }
  
      PyObject *unicode = PyUnicode_DecodeUTF8Stateful(str, length,
-                                                     "replace", NULL);
+                                                     "replace", pconsumed);
      if (unicode == NULL)
          return -1;
author	Serhiy Storchaka <storchaka@gmail.com>
	Mon, 24 Jun 2024 15:07:07 +0000 (18:07 +0300)
committer	GitHub <noreply@github.com>
	Mon, 24 Jun 2024 15:07:07 +0000 (18:07 +0300)
Lib/test/test_capi/test_unicode.py		patch \| blob \| blame \| history
Misc/NEWS.d/next/C API/2024-06-11-21-38-32.gh-issue-70278.WDE4zM.rst	[new file with mode: 0644]	patch \| blob
Objects/unicodeobject.c		patch \| blob \| blame \| history