git.ipfire.org Git - thirdparty/curl.git/commitdiff
tool_cb_wrt: fix invalid unicode for windows console
author: Jay Satiro <raysatiro@yahoo.com>
Tue, 4 Apr 2023 09:10:52 +0000 (05:10 -0400)
committer: Jay Satiro <raysatiro@yahoo.com>
Tue, 1 Aug 2023 07:28:12 +0000 (03:28 -0400)
- Suppress an incomplete UTF-8 sequence at the end of the buffer.

- Attempt to reconstruct incomplete UTF-8 sequence from prior call(s)
  in current call.

Prior to this change, in Windows console UTF-8 sequences split between
two or more calls to the write callback would cause invalid "replacement
characters" U+FFFD to be printed instead of the actual Unicode
character. This is because in Windows only UTF-16 encoded characters are
printed to the console, therefore we convert the UTF-8 contents to
UTF-16, which cannot be done with partial UTF-8 sequences.

Reported-by: Maksim Arhipov
Fixes https://github.com/curl/curl/issues/9841
Closes https://github.com/curl/curl/pull/10890

src/tool_cb_hdr.c
src/tool_cb_wrt.c
src/tool_operate.c
src/tool_sdecls.h

index dc6069f1abca8d3214d4300bb73d9cac2cbfdd3a..47780232407700a03b7af9dbd3da9145b251cb1e 100644 (file)
@@ -87,6 +87,12 @@ size_t tool_header_cb(char *ptr, size_t size, size_t nmemb, void *userdata)
   }
 #endif
 
+#ifdef WIN32
+  /* Discard incomplete UTF-8 sequence buffered from body */
+  if(outs->utf8seq[0])
+    memset(outs->utf8seq, 0, sizeof(outs->utf8seq));
+#endif
+
   /*
    * Write header data when curl option --dump-header (-D) is given.
    */
index 94d82cb84a870d97ffa2d834271af96743419dcd..2f8c6ac71eca5f29c95995465bd733e1cf581bab 100644 (file)
@@ -233,35 +233,132 @@ size_t tool_write_cb(char *buffer, size_t sz, size_t nmemb, void *userdata)
 
 #ifdef WIN32
   fhnd = _get_osfhandle(fileno(outs->stream));
+  /* if windows console then UTF-8 must be converted to UTF-16 */
   if(isatty(fileno(outs->stream)) &&
      GetConsoleScreenBufferInfo((HANDLE)fhnd, &console_info)) {
-    DWORD in_len = (DWORD)(sz * nmemb);
-    wchar_t* wc_buf;
+    wchar_t *wc_buf;
     DWORD wc_len;
+    unsigned char *rbuf = (unsigned char *)buffer;
+    DWORD rlen = (DWORD)bytes;
 
-    /* calculate buffer size for wide characters */
-    wc_len = MultiByteToWideChar(CP_UTF8, 0, buffer, in_len,  NULL, 0);
-    wc_buf = (wchar_t*) malloc(wc_len * sizeof(wchar_t));
-    if(!wc_buf)
-      return CURL_WRITEFUNC_ERROR;
+#define IS_TRAILING_BYTE(x) (0x80 <= (x) && (x) < 0xC0)
 
-    /* calculate buffer size for multi-byte characters */
-    wc_len = MultiByteToWideChar(CP_UTF8, 0, buffer, in_len, wc_buf, wc_len);
-    if(!wc_len) {
-      free(wc_buf);
-      return CURL_WRITEFUNC_ERROR;
+    /* attempt to complete an incomplete UTF-8 sequence from previous call.
+       the sequence does not have to be well-formed. */
+    if(outs->utf8seq[0] && rlen) {
+      bool complete = false;
+      /* two byte sequence (lead byte 110yyyyy) */
+      if(0xC0 <= outs->utf8seq[0] && outs->utf8seq[0] < 0xE0) {
+        outs->utf8seq[1] = *rbuf++;
+        --rlen;
+        complete = true;
+      }
+      /* three byte sequence (lead byte 1110zzzz) */
+      else if(0xE0 <= outs->utf8seq[0] && outs->utf8seq[0] < 0xF0) {
+        if(!outs->utf8seq[1]) {
+          outs->utf8seq[1] = *rbuf++;
+          --rlen;
+        }
+        if(rlen && !outs->utf8seq[2]) {
+          outs->utf8seq[2] = *rbuf++;
+          --rlen;
+          complete = true;
+        }
+      }
+      /* four byte sequence (lead byte 11110uuu) */
+      else if(0xF0 <= outs->utf8seq[0] && outs->utf8seq[0] < 0xF8) {
+        if(!outs->utf8seq[1]) {
+          outs->utf8seq[1] = *rbuf++;
+          --rlen;
+        }
+        if(rlen && !outs->utf8seq[2]) {
+          outs->utf8seq[2] = *rbuf++;
+          --rlen;
+        }
+        if(rlen && !outs->utf8seq[3]) {
+          outs->utf8seq[3] = *rbuf++;
+          --rlen;
+          complete = true;
+        }
+      }
+
+      if(complete) {
+        WCHAR prefix[3] = {0};  /* UTF-16 (1-2 WCHARs) + NUL */
+
+        if(MultiByteToWideChar(CP_UTF8, 0, (LPCSTR)outs->utf8seq, -1,
+                               prefix, sizeof(prefix)/sizeof(prefix[0]))) {
+          DEBUGASSERT(prefix[2] == L'\0'); /* last element stays NUL */
+          if(!WriteConsoleW(
+              (HANDLE) fhnd,
+              prefix,
+              prefix[1] ? 2 : 1,
+              NULL,
+              NULL)) {
+            return CURL_WRITEFUNC_ERROR;
+          }
+        }
+        /* else: UTF-8 input was not well formed and OS is pre-Vista which
+           drops invalid characters instead of writing U+FFFD to output.  */
+
+        memset(outs->utf8seq, 0, sizeof(outs->utf8seq));
+      }
     }
 
-    if(!WriteConsoleW(
-        (HANDLE) fhnd,
-        wc_buf,
-        wc_len,
-        &wc_len,
-        NULL)) {
+    /* suppress an incomplete utf-8 sequence at end of rbuf */
+    if(!outs->utf8seq[0] && rlen && (rbuf[rlen - 1] & 0x80)) {
+      /* check for lead byte from a two, three or four byte sequence */
+      if(0xC0 <= rbuf[rlen - 1] && rbuf[rlen - 1] < 0xF8) {
+        outs->utf8seq[0] = rbuf[rlen - 1];
+        rlen -= 1;
+      }
+      else if(rlen >= 2 && IS_TRAILING_BYTE(rbuf[rlen - 1])) {
+        /* check for lead byte from a three or four byte sequence */
+        if(0xE0 <= rbuf[rlen - 2] && rbuf[rlen - 2] < 0xF8) {
+          outs->utf8seq[0] = rbuf[rlen - 2];
+          outs->utf8seq[1] = rbuf[rlen - 1];
+          rlen -= 2;
+        }
+        else if(rlen >= 3 && IS_TRAILING_BYTE(rbuf[rlen - 2])) {
+          /* check for lead byte from a four byte sequence */
+          if(0xF0 <= rbuf[rlen - 3] && rbuf[rlen - 3] < 0xF8) {
+            outs->utf8seq[0] = rbuf[rlen - 3];
+            outs->utf8seq[1] = rbuf[rlen - 2];
+            outs->utf8seq[2] = rbuf[rlen - 1];
+            rlen -= 3;
+          }
+        }
+      }
+    }
+
+    if(rlen) {
+      /* calculate buffer size for wide characters */
+      wc_len = MultiByteToWideChar(CP_UTF8, 0, (LPCSTR)rbuf, rlen, NULL, 0);
+      if(!wc_len)
+        return CURL_WRITEFUNC_ERROR;
+
+      wc_buf = (wchar_t*) malloc(wc_len * sizeof(wchar_t));
+      if(!wc_buf)
+        return CURL_WRITEFUNC_ERROR;
+
+      wc_len = MultiByteToWideChar(CP_UTF8, 0, (LPCSTR)rbuf, rlen, wc_buf,
+                                   wc_len);
+      if(!wc_len) {
+        free(wc_buf);
+        return CURL_WRITEFUNC_ERROR;
+      }
+
+      if(!WriteConsoleW(
+          (HANDLE) fhnd,
+          wc_buf,
+          wc_len,
+          NULL,
+          NULL)) {
+        free(wc_buf);
+        return CURL_WRITEFUNC_ERROR;
+      }
       free(wc_buf);
-      return CURL_WRITEFUNC_ERROR;
     }
-    free(wc_buf);
+
     rc = bytes;
   }
   else
index c93888b18da219816b929d169a4073db62678195..ca1313eeb8c09b2c9c6a472402022e38f60c7d11 100644 (file)
@@ -464,6 +464,12 @@ static CURLcode post_per_transfer(struct GlobalConfig *global,
     }
   }
 
+#ifdef WIN32
+  /* Discard incomplete UTF-8 sequence buffered from body */
+  if(outs->utf8seq[0])
+    memset(outs->utf8seq, 0, sizeof(outs->utf8seq));
+#endif
+
   /* if retry-max-time is non-zero, make sure we haven't exceeded the
      time */
   if(per->retry_numretries &&
index 7c03a04a5766a4c87f455b49a9c2db93c7034973..7b2eb23381d5af9290adced1a1ead347e12968bf 100644 (file)
@@ -57,6 +57,9 @@
  * 'init' member holds original file size or offset at which truncation is
  * taking place. Always zero unless appending to a non-empty regular file.
  *
+ * [Windows]
+ * 'utf8seq' member holds an incomplete UTF-8 sequence destined for the console
+ * until it can be completed (1-4 bytes) + NUL.
  */
 
 struct OutStruct {
@@ -68,6 +71,9 @@ struct OutStruct {
   FILE *stream;
   curl_off_t bytes;
   curl_off_t init;
+#ifdef WIN32
+  unsigned char utf8seq[5];
+#endif
 };
 
 /*