From: Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com> Date: Tue, 17 Jan 2023 19:53:45 +0000 (-0800) Subject: gh-82052: Don't send partial UTF-8 sequences to the Windows API (GH-101103) X-Git-Tag: v3.10.10~38 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=940763140f7519a125229782ca7a095af01edda4;p=thirdparty%2FPython%2Fcpython.git gh-82052: Don't send partial UTF-8 sequences to the Windows API (GH-101103) Don't send partial UTF-8 sequences to the Windows API (cherry picked from commit f34176b77f222726d901595968a4b44456186da4) Co-authored-by: Paul Moore --- diff --git a/Misc/NEWS.d/next/Windows/2023-01-17-18-17-58.gh-issue-82052.mWyysT.rst b/Misc/NEWS.d/next/Windows/2023-01-17-18-17-58.gh-issue-82052.mWyysT.rst new file mode 100644 index 000000000000..4f7ab200b85c --- /dev/null +++ b/Misc/NEWS.d/next/Windows/2023-01-17-18-17-58.gh-issue-82052.mWyysT.rst @@ -0,0 +1 @@ +Fixed an issue where writing more than 32K of Unicode output to the console screen in one go can result in mojibake. diff --git a/Modules/_io/winconsoleio.c b/Modules/_io/winconsoleio.c index 460f2d3fa071..7d605d9f9034 100644 --- a/Modules/_io/winconsoleio.c +++ b/Modules/_io/winconsoleio.c @@ -956,7 +956,7 @@ _io__WindowsConsoleIO_write_impl(winconsoleio *self, Py_buffer *b) { BOOL res = TRUE; wchar_t *wbuf; - DWORD len, wlen, n = 0; + DWORD len, wlen, orig_len, n = 0; HANDLE handle; if (self->fd == -1) @@ -986,6 +986,21 @@ _io__WindowsConsoleIO_write_impl(winconsoleio *self, Py_buffer *b) have to reduce and recalculate. */ while (wlen > 32766 / sizeof(wchar_t)) { len /= 2; + orig_len = len; + /* Reduce the length until we hit the final byte of a UTF-8 sequence + * (top bit is unset). Fix for github issue 82052. + */ + while (len > 0 && (((char *)b->buf)[len-1] & 0x80) != 0) + --len; + /* If we hit a length of 0, something has gone wrong. 
This can + * happen when the halved buffer contains no ASCII byte at all, because + * every byte of a multi-byte UTF-8 sequence has its top bit set. + * No safe split point can be found in that case, so we just restore + * the original length and let the console API sort it out. + */ + if (len == 0) { + len = orig_len; + } wlen = MultiByteToWideChar(CP_UTF8, 0, b->buf, len, NULL, 0); } Py_END_ALLOW_THREADS