gh-82052: Don't send partial UTF-8 sequences to the Windows API (GH-101103)

author Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com>

Tue, 17 Jan 2023 19:52:50 +0000 (11:52 -0800)

committer GitHub <noreply@github.com>

Tue, 17 Jan 2023 19:52:50 +0000 (11:52 -0800)
author Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com>
Tue, 17 Jan 2023 19:52:50 +0000 (11:52 -0800)
committer GitHub <noreply@github.com>
Tue, 17 Jan 2023 19:52:50 +0000 (11:52 -0800)
diff --git a/Misc/NEWS.d/next/Windows/2023-01-17-18-17-58.gh-issue-82052.mWyysT.rst b/Misc/NEWS.d/next/Windows/2023-01-17-18-17-58.gh-issue-82052.mWyysT.rst

new file mode 100644 (file)

index 0000000..4f7ab20
--- /dev/null
+++ b/Misc/NEWS.d/next/Windows/2023-01-17-18-17-58.gh-issue-82052.mWyysT.rst
@@ -0,0 +1 @@
+Fixed an issue where writing more than 32K of Unicode output to the console screen in one go can result in mojibake.
diff --git a/Modules/_io/winconsoleio.c b/Modules/_io/winconsoleio.c

index 5c1a6dd86fc54f28edaececaa976655ba6118f6b..c8f3481e665abd4dc0114ac001bb1b3caac120b2 100644 (file)
--- a/Modules/_io/winconsoleio.c
+++ b/Modules/_io/winconsoleio.c
@@ -954,7 +954,7 @@ _io__WindowsConsoleIO_write_impl(winconsoleio *self, Py_buffer *b)
  {
      BOOL res = TRUE;
      wchar_t *wbuf;
-    DWORD len, wlen, n = 0;
+    DWORD len, wlen, orig_len, n = 0;
      HANDLE handle;
  
      if (self->fd == -1)
@@ -984,6 +984,21 @@ _io__WindowsConsoleIO_write_impl(winconsoleio *self, Py_buffer *b)
         have to reduce and recalculate. */
      while (wlen > 32766 / sizeof(wchar_t)) {
          len /= 2;
+        orig_len = len;
+        /* Reduce the length until the chunk ends on an ASCII byte (top bit
+         * unset), so no UTF-8 sequence is split. Fix for GitHub issue 82052.
+         */
+        while (len > 0 && (((char *)b->buf)[len-1] & 0x80) != 0)
+            --len;
+        /* A length of 0 means no byte with the top bit unset was found in
+         * this part of the buffer. That can happen for a long run of
+         * non-ASCII text, because UTF-8 lead bytes and continuation bytes
+         * both have the top bit set. In that case we just restore
+         * the original length and let the console API sort it out.
+         */
+        if (len == 0) {
+            len = orig_len;
+        }
          wlen = MultiByteToWideChar(CP_UTF8, 0, b->buf, len, NULL, 0);
      }
      Py_END_ALLOW_THREADS
author	Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com>
	Tue, 17 Jan 2023 19:52:50 +0000 (11:52 -0800)
committer	GitHub <noreply@github.com>
	Tue, 17 Jan 2023 19:52:50 +0000 (11:52 -0800)
Misc/NEWS.d/next/Windows/2023-01-17-18-17-58.gh-issue-82052.mWyysT.rst	[new file with mode: 0644]	patch \| blob
Modules/_io/winconsoleio.c		patch \| blob \| blame \| history