From af3f4e419b9f339790de281c871640a773c391c0 Mon Sep 17 00:00:00 2001 From: Jay Satiro Date: Tue, 4 Apr 2023 05:10:52 -0400 Subject: [PATCH] tool_cb_wrt: fix invalid unicode for windows console - Suppress an incomplete UTF-8 sequence at the end of the buffer. - Attempt to reconstruct incomplete UTF-8 sequence from prior call(s) in current call. Prior to this change, in Windows console UTF-8 sequences split between two or more calls to the write callback would cause invalid "replacement characters" U+FFFD to be printed instead of the actual Unicode character. This is because in Windows only UTF-16 encoded characters are printed to the console, therefore we convert the UTF-8 contents to UTF-16, which cannot be done with partial UTF-8 sequences. Reported-by: Maksim Arhipov Fixes https://github.com/curl/curl/issues/9841 Closes https://github.com/curl/curl/pull/10890 --- src/tool_cb_hdr.c | 6 ++ src/tool_cb_wrt.c | 137 ++++++++++++++++++++++++++++++++++++++------- src/tool_operate.c | 6 ++ src/tool_sdecls.h | 6 ++ 4 files changed, 135 insertions(+), 20 deletions(-) diff --git a/src/tool_cb_hdr.c b/src/tool_cb_hdr.c index dc6069f1ab..4778023240 100644 --- a/src/tool_cb_hdr.c +++ b/src/tool_cb_hdr.c @@ -87,6 +87,12 @@ size_t tool_header_cb(char *ptr, size_t size, size_t nmemb, void *userdata) } #endif +#ifdef WIN32 + /* Discard incomplete UTF-8 sequence buffered from body */ + if(outs->utf8seq[0]) + memset(outs->utf8seq, 0, sizeof(outs->utf8seq)); +#endif + /* * Write header data when curl option --dump-header (-D) is given. 
*/ diff --git a/src/tool_cb_wrt.c b/src/tool_cb_wrt.c index 94d82cb84a..2f8c6ac71e 100644 --- a/src/tool_cb_wrt.c +++ b/src/tool_cb_wrt.c @@ -233,35 +233,132 @@ size_t tool_write_cb(char *buffer, size_t sz, size_t nmemb, void *userdata) #ifdef WIN32 fhnd = _get_osfhandle(fileno(outs->stream)); + /* if windows console then UTF-8 must be converted to UTF-16 */ if(isatty(fileno(outs->stream)) && GetConsoleScreenBufferInfo((HANDLE)fhnd, &console_info)) { - DWORD in_len = (DWORD)(sz * nmemb); - wchar_t* wc_buf; + wchar_t *wc_buf; DWORD wc_len; + unsigned char *rbuf = (unsigned char *)buffer; + DWORD rlen = (DWORD)bytes; - /* calculate buffer size for wide characters */ - wc_len = MultiByteToWideChar(CP_UTF8, 0, buffer, in_len, NULL, 0); - wc_buf = (wchar_t*) malloc(wc_len * sizeof(wchar_t)); - if(!wc_buf) - return CURL_WRITEFUNC_ERROR; +#define IS_TRAILING_BYTE(x) (0x80 <= (x) && (x) < 0xC0) - /* calculate buffer size for multi-byte characters */ - wc_len = MultiByteToWideChar(CP_UTF8, 0, buffer, in_len, wc_buf, wc_len); - if(!wc_len) { - free(wc_buf); - return CURL_WRITEFUNC_ERROR; + /* attempt to complete an incomplete UTF-8 sequence from previous call. + the sequence does not have to be well-formed. 
*/ + if(outs->utf8seq[0] && rlen) { + bool complete = false; + /* two byte sequence (lead byte 110yyyyy) */ + if(0xC0 <= outs->utf8seq[0] && outs->utf8seq[0] < 0xE0) { + outs->utf8seq[1] = *rbuf++; + --rlen; + complete = true; + } + /* three byte sequence (lead byte 1110zzzz) */ + else if(0xE0 <= outs->utf8seq[0] && outs->utf8seq[0] < 0xF0) { + if(!outs->utf8seq[1]) { + outs->utf8seq[1] = *rbuf++; + --rlen; + } + if(rlen && !outs->utf8seq[2]) { + outs->utf8seq[2] = *rbuf++; + --rlen; + complete = true; + } + } + /* four byte sequence (lead byte 11110uuu) */ + else if(0xF0 <= outs->utf8seq[0] && outs->utf8seq[0] < 0xF8) { + if(!outs->utf8seq[1]) { + outs->utf8seq[1] = *rbuf++; + --rlen; + } + if(rlen && !outs->utf8seq[2]) { + outs->utf8seq[2] = *rbuf++; + --rlen; + } + if(rlen && !outs->utf8seq[3]) { + outs->utf8seq[3] = *rbuf++; + --rlen; + complete = true; + } + } + + if(complete) { + WCHAR prefix[3] = {0}; /* UTF-16 (1-2 WCHARs) + NUL */ + + if(MultiByteToWideChar(CP_UTF8, 0, (LPCSTR)outs->utf8seq, -1, + prefix, sizeof(prefix)/sizeof(prefix[0]))) { + DEBUGASSERT(prefix[2] == L'\0'); + if(!WriteConsoleW( + (HANDLE) fhnd, + prefix, + prefix[1] ? 2 : 1, + NULL, + NULL)) { + return CURL_WRITEFUNC_ERROR; + } + } + /* else: UTF-8 input was not well formed and OS is pre-Vista which + drops invalid characters instead of writing U+FFFD to output. 
*/ + + memset(outs->utf8seq, 0, sizeof(outs->utf8seq)); + } } - if(!WriteConsoleW( - (HANDLE) fhnd, - wc_buf, - wc_len, - &wc_len, - NULL)) { + /* suppress an incomplete utf-8 sequence at end of rbuf */ + if(!outs->utf8seq[0] && rlen && (rbuf[rlen - 1] & 0x80)) { + /* check for lead byte from a two, three or four byte sequence */ + if(0xC0 <= rbuf[rlen - 1] && rbuf[rlen - 1] < 0xF8) { + outs->utf8seq[0] = rbuf[rlen - 1]; + rlen -= 1; + } + else if(rlen >= 2 && IS_TRAILING_BYTE(rbuf[rlen - 1])) { + /* check for lead byte from a three or four byte sequence */ + if(0xE0 <= rbuf[rlen - 2] && rbuf[rlen - 2] < 0xF8) { + outs->utf8seq[0] = rbuf[rlen - 2]; + outs->utf8seq[1] = rbuf[rlen - 1]; + rlen -= 2; + } + else if(rlen >= 3 && IS_TRAILING_BYTE(rbuf[rlen - 2])) { + /* check for lead byte from a four byte sequence */ + if(0xF0 <= rbuf[rlen - 3] && rbuf[rlen - 3] < 0xF8) { + outs->utf8seq[0] = rbuf[rlen - 3]; + outs->utf8seq[1] = rbuf[rlen - 2]; + outs->utf8seq[2] = rbuf[rlen - 1]; + rlen -= 3; + } + } + } + } + + if(rlen) { + /* calculate buffer size for wide characters */ + wc_len = MultiByteToWideChar(CP_UTF8, 0, (LPCSTR)rbuf, rlen, NULL, 0); + if(!wc_len) + return CURL_WRITEFUNC_ERROR; + + wc_buf = (wchar_t*) malloc(wc_len * sizeof(wchar_t)); + if(!wc_buf) + return CURL_WRITEFUNC_ERROR; + + wc_len = MultiByteToWideChar(CP_UTF8, 0, (LPCSTR)rbuf, rlen, wc_buf, + wc_len); + if(!wc_len) { + free(wc_buf); + return CURL_WRITEFUNC_ERROR; + } + + if(!WriteConsoleW( + (HANDLE) fhnd, + wc_buf, + wc_len, + NULL, + NULL)) { + free(wc_buf); + return CURL_WRITEFUNC_ERROR; + } free(wc_buf); - return CURL_WRITEFUNC_ERROR; } - free(wc_buf); + rc = bytes; } else diff --git a/src/tool_operate.c b/src/tool_operate.c index c93888b18d..ca1313eeb8 100644 --- a/src/tool_operate.c +++ b/src/tool_operate.c @@ -464,6 +464,12 @@ static CURLcode post_per_transfer(struct GlobalConfig *global, } } +#ifdef WIN32 + /* Discard incomplete UTF-8 sequence buffered from body */ + if(outs->utf8seq[0]) + 
memset(outs->utf8seq, 0, sizeof(outs->utf8seq)); +#endif + /* if retry-max-time is non-zero, make sure we haven't exceeded the time */ if(per->retry_numretries && diff --git a/src/tool_sdecls.h b/src/tool_sdecls.h index 7c03a04a57..7b2eb23381 100644 --- a/src/tool_sdecls.h +++ b/src/tool_sdecls.h @@ -57,6 +57,9 @@ * 'init' member holds original file size or offset at which truncation is * taking place. Always zero unless appending to a non-empty regular file. * + * [Windows] + * 'utf8seq' member holds an incomplete UTF-8 sequence destined for the console + * until it can be completed (1-4 bytes) + NUL. */ struct OutStruct { @@ -68,6 +71,9 @@ struct OutStruct { FILE *stream; curl_off_t bytes; curl_off_t init; +#ifdef WIN32 + unsigned char utf8seq[5]; +#endif }; /* -- 2.47.3