From f8b9b7439456c5ab8d95ad3c27f905ebd501685f Mon Sep 17 00:00:00 2001 From: =?utf8?q?Pali=20Roh=C3=A1r?= Date: Sun, 9 Feb 2020 13:04:34 +0100 Subject: [PATCH] libblkid: Fix UTF-16 support in function blkid_encode_to_utf8() Function blkid_encode_to_utf8() claims to support the BLKID_ENC_UTF16LE and BLKID_ENC_UTF16BE encodings, but this is not true: it supports only UCS-2 (and not full UTF-16). As all places where BLKID_ENC_UTF16LE and BLKID_ENC_UTF16BE are used expect UTF-16 and not UCS-2, this patch changes the implementation of the BLKID_ENC_UTF16LE and BLKID_ENC_UTF16BE encodings to support full UTF-16, including surrogate pairs, and not only UCS-2. --- libblkid/src/encode.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/libblkid/src/encode.c b/libblkid/src/encode.c index 33d349127e..36ad1c9569 100644 --- a/libblkid/src/encode.c +++ b/libblkid/src/encode.c @@ -237,7 +237,8 @@ size_t blkid_encode_to_utf8(int enc, unsigned char *dest, size_t len, const unsigned char *src, size_t count) { size_t i, j; - uint16_t c; + uint32_t c; + uint16_t c2; for (j = i = 0; i < count; i++) { if (enc == BLKID_ENC_UTF16LE) { @@ -255,6 +256,17 @@ size_t blkid_encode_to_utf8(int enc, unsigned char *dest, size_t len, } else { return 0; } + if ((enc == BLKID_ENC_UTF16LE || enc == BLKID_ENC_UTF16BE) && + c >= 0xD800 && c <= 0xDBFF && i+2 < count) { + if (enc == BLKID_ENC_UTF16LE) + c2 = (src[i+2] << 8) | src[i+1]; + else + c2 = (src[i+1] << 8) | src[i+2]; + if (c2 >= 0xDC00 && c2 <= 0xDFFF) { + c = 0x10000 + ((c - 0xD800) << 10) + (c2 - 0xDC00); + i += 2; + } + } if (c == 0) { dest[j] = '\0'; break; @@ -267,12 +279,19 @@ size_t blkid_encode_to_utf8(int enc, unsigned char *dest, size_t len, break; dest[j++] = (uint8_t) (0xc0 | (c >> 6)); dest[j++] = (uint8_t) (0x80 | (c & 0x3f)); - } else { + } else if (c < 0x10000) { if (j+3 >= len) break; dest[j++] = (uint8_t) (0xe0 | (c >> 12)); dest[j++] = (uint8_t) (0x80 | ((c >> 6) & 0x3f)); dest[j++] = (uint8_t) 
(0x80 | (c & 0x3f)); + } else { + if (j+4 >= len) + break; + dest[j++] = (uint8_t) (0xf0 | (c >> 18)); + dest[j++] = (uint8_t) (0x80 | ((c >> 12) & 0x3f)); + dest[j++] = (uint8_t) (0x80 | ((c >> 6) & 0x3f)); + dest[j++] = (uint8_t) (0x80 | (c & 0x3f)); } } dest[j] = '\0'; -- 2.47.2