From: Serhiy Storchaka Date: Mon, 24 Jun 2024 15:07:07 +0000 (+0300) Subject: gh-70278: Fix PyUnicode_FromFormat() with precision for %s and %V (GH-120365) X-Git-Tag: v3.14.0a1~1350 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=6eb23b1311e7eebf2459076703460ee7f8044f05;p=thirdparty%2FPython%2Fcpython.git gh-70278: Fix PyUnicode_FromFormat() with precision for %s and %V (GH-120365) PyUnicode_FromFormat() no longer produces the ending \ufffd character for truncated C string when use precision with %s and %V. It now truncates the string before the start of truncated multibyte sequences. --- diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index 36106b0730dd..48a802c3f8bc 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -419,8 +419,29 @@ class CAPITest(unittest.TestCase): # truncated string check_format('abc', b'%.3s', b'abcdef') + check_format('abc[', + b'%.6s', 'abc[\u20ac]'.encode('utf8')) + check_format('abc[\u20ac', + b'%.7s', 'abc[\u20ac]'.encode('utf8')) check_format('abc[\ufffd', - b'%.5s', 'abc[\u20ac]'.encode('utf8')) + b'%.5s', b'abc[\xff]') + check_format('abc[', + b'%.6s', b'abc[\xe2\x82]') + check_format('abc[\ufffd]', + b'%.7s', b'abc[\xe2\x82]') + check_format('abc[\ufffd', + b'%.7s', b'abc[\xe2\x82\0') + check_format(' abc[', + b'%10.6s', 'abc[\u20ac]'.encode('utf8')) + check_format(' abc[\u20ac', + b'%10.7s', 'abc[\u20ac]'.encode('utf8')) + check_format(' abc[\ufffd', + b'%10.5s', b'abc[\xff]') + check_format(' abc[', + b'%10.6s', b'abc[\xe2\x82]') + check_format(' abc[\ufffd]', + b'%10.7s', b'abc[\xe2\x82]') + check_format("'\\u20acABC'", b'%A', '\u20acABC') check_format("'\\u20", @@ -433,10 +454,31 @@ class CAPITest(unittest.TestCase): b'%.3S', '\u20acABCDEF') check_format('\u20acAB', b'%.3U', '\u20acABCDEF') + check_format('\u20acAB', b'%.3V', '\u20acABCDEF', None) + check_format('abc[', + b'%.6V', None, 'abc[\u20ac]'.encode('utf8')) + check_format('abc[\u20ac', + b'%.7V', None, 'abc[\u20ac]'.encode('utf8')) check_format('abc[\ufffd', - b'%.5V', None, 'abc[\u20ac]'.encode('utf8')) + b'%.5V', None, b'abc[\xff]') + check_format('abc[', + b'%.6V', None, b'abc[\xe2\x82]') + check_format('abc[\ufffd]', + b'%.7V', None, b'abc[\xe2\x82]') + check_format(' abc[', + b'%10.6V', None, 'abc[\u20ac]'.encode('utf8')) + check_format(' abc[\u20ac', + b'%10.7V', None, 'abc[\u20ac]'.encode('utf8')) + check_format(' abc[\ufffd', + b'%10.5V', None, b'abc[\xff]') + check_format(' abc[', + b'%10.6V', None, b'abc[\xe2\x82]') + check_format(' abc[\ufffd]', + b'%10.7V', None, b'abc[\xe2\x82]') + check_format(' abc[\ufffd', + b'%10.7V', None, b'abc[\xe2\x82\0') # following tests comes from #7330 # test width modifier and precision modifier with %S diff --git a/Misc/NEWS.d/next/C API/2024-06-11-21-38-32.gh-issue-70278.WDE4zM.rst b/Misc/NEWS.d/next/C API/2024-06-11-21-38-32.gh-issue-70278.WDE4zM.rst new file mode 100644 index 000000000000..1eca36a86bc9 --- /dev/null +++ b/Misc/NEWS.d/next/C API/2024-06-11-21-38-32.gh-issue-70278.WDE4zM.rst @@ -0,0 +1,4 @@ +:c:func:`PyUnicode_FromFormat` no longer produces the ending ``\ufffd`` +character for truncated C string when use precision with ``%s`` and ``%V``. +It now truncates the string before the start of truncated multibyte +sequences. diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 279cdaa668e2..d11a9dca14b2 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2581,6 +2581,7 @@ unicode_fromformat_write_utf8(_PyUnicodeWriter *writer, const char *str, Py_ssize_t width, Py_ssize_t precision, int flags) { /* UTF-8 */ + Py_ssize_t *pconsumed = NULL; Py_ssize_t length; if (precision == -1) { length = strlen(str); @@ -2590,15 +2591,23 @@ unicode_fromformat_write_utf8(_PyUnicodeWriter *writer, const char *str, while (length < precision && str[length]) { length++; } + if (length == precision) { + /* The input string is not NUL-terminated. If it ends with an + * incomplete UTF-8 sequence, truncate the string just before it. + * Incomplete sequences in the middle and sequences which cannot + * be valid prefixes are still treated as errors and replaced + * with \xfffd. */ + pconsumed = &length; + } } if (width < 0) { return unicode_decode_utf8_writer(writer, str, length, - _Py_ERROR_REPLACE, "replace", NULL); + _Py_ERROR_REPLACE, "replace", pconsumed); } PyObject *unicode = PyUnicode_DecodeUTF8Stateful(str, length, - "replace", NULL); + "replace", pconsumed); if (unicode == NULL) return -1;