gh-110913: Fix WindowsConsoleIO chunking of UTF-8 text (GH-111007)

author Tamás Hegedűs <sorgloomer@users.noreply.github.com>

Fri, 20 Oct 2023 11:52:31 +0000 (13:52 +0200)

committer GitHub <noreply@github.com>

Fri, 20 Oct 2023 11:52:31 +0000 (12:52 +0100)
author Tamás Hegedűs <sorgloomer@users.noreply.github.com>
Fri, 20 Oct 2023 11:52:31 +0000 (13:52 +0200)
committer GitHub <noreply@github.com>
Fri, 20 Oct 2023 11:52:31 +0000 (12:52 +0100)
diff --git a/Misc/NEWS.d/next/Windows/2023-10-19-21-46-18.gh-issue-110913.CWlPfg.rst b/Misc/NEWS.d/next/Windows/2023-10-19-21-46-18.gh-issue-110913.CWlPfg.rst

new file mode 100644 (file)

index 0000000..d4c1b56
--- /dev/null
+++ b/Misc/NEWS.d/next/Windows/2023-10-19-21-46-18.gh-issue-110913.CWlPfg.rst
@@ -0,0 +1 @@
+WindowsConsoleIO now correctly chunks large buffers without splitting up UTF-8 sequences.
diff --git a/Modules/_io/winconsoleio.c b/Modules/_io/winconsoleio.c

index 50b8818aad410b05b37aaa65c74faaf5807bf94c..6680488b740cfc612c373dfc5989fceeea677879 100644 (file)
--- a/Modules/_io/winconsoleio.c
+++ b/Modules/_io/winconsoleio.c
@@ -134,6 +134,23 @@ char _PyIO_get_console_type(PyObject *path_or_fd) {
      return m;
  }
  
+static DWORD
+_find_last_utf8_boundary(const char *buf, DWORD len)
+{
+    /* Choose a length <= len at which the first `len` bytes of buf can be
+     * cut so that the cut does not fall inside a multi-byte UTF-8 sequence
+     * (this holds for well-formed input; see the fallbacks below).
+     *
+     * The last byte is inspected first, then up to two bytes before it:
+     *   - len == 0, or the last byte has its top bit clear (ASCII):
+     *     len is returned unchanged.
+     *   - otherwise the scan walks backwards until it meets a byte that is
+     *     not a 10xxxxxx continuation byte and returns that byte's index,
+     *     so the returned length stops just before it.  Note that this
+     *     also drops a *complete* 2- or 3-byte sequence that ends exactly
+     *     at len; the caller simply gets a slightly shorter chunk.
+     *   - if the last three bytes are all continuation bytes (in valid
+     *     UTF-8: the tail of a complete 4-byte sequence), or the scan
+     *     would reach index 0, len is returned unchanged.
+     *
+     * The result is 0 only when len itself is 0: the `count >= len` check
+     * runs before `len - count` can be returned, so for len > 0 the return
+     * value is always at least 1.
+     */
+    DWORD count = 1;
+    if (len == 0 || (buf[len - 1] & 0x80) == 0) {  /* empty, or ends in ASCII */
+        return len;
+    }
+    for (;; count++) {  /* count = distance of the tested byte from the end */
+        if (count > 3 || count >= len) {  /* give up: keep the original len */
+            return len;
+        }
+        if ((buf[len - count] & 0xc0) != 0x80) {  /* not a continuation byte */
+            return len - count;
+        }
+    }
+}
  
  /*[clinic input]
  module _io
@@ -975,7 +992,7 @@ _io__WindowsConsoleIO_write_impl(winconsoleio *self, PyTypeObject *cls,
  {
      BOOL res = TRUE;
      wchar_t *wbuf;
-    DWORD len, wlen, orig_len, n = 0;
+    DWORD len, wlen, n = 0;
      HANDLE handle;
  
      if (self->fd == -1)
@@ -1007,21 +1024,8 @@ _io__WindowsConsoleIO_write_impl(winconsoleio *self, PyTypeObject *cls,
         have to reduce and recalculate. */
      while (wlen > 32766 / sizeof(wchar_t)) {
          len /= 2;
-        orig_len = len;
-        /* Reduce the length until we hit the final byte of a UTF-8 sequence
-         * (top bit is unset). Fix for github issue 82052.
-         */
-        while (len > 0 && (((char *)b->buf)[len-1] & 0x80) != 0)
-            --len;
-        /* If we hit a length of 0, something has gone wrong. This shouldn't
-         * be possible, as valid UTF-8 can have at most 3 non-final bytes
-         * before a final one, and our buffer is way longer than that.
-         * But to be on the safe side, if we hit this issue we just restore
-         * the original length and let the console API sort it out.
-         */
-        if (len == 0) {
-            len = orig_len;
-        }
+        /* Fix for github issues gh-110913 and gh-82052. */
+        len = _find_last_utf8_boundary(b->buf, len);
          wlen = MultiByteToWideChar(CP_UTF8, 0, b->buf, len, NULL, 0);
      }
      Py_END_ALLOW_THREADS
author	Tamás Hegedűs <sorgloomer@users.noreply.github.com>
	Fri, 20 Oct 2023 11:52:31 +0000 (13:52 +0200)
committer	GitHub <noreply@github.com>
	Fri, 20 Oct 2023 11:52:31 +0000 (12:52 +0100)
Misc/NEWS.d/next/Windows/2023-10-19-21-46-18.gh-issue-110913.CWlPfg.rst	[new file with mode: 0644]	patch \| blob
Modules/_io/winconsoleio.c		patch \| blob \| blame \| history