From: John Wolfe Date: Mon, 9 Nov 2020 20:29:03 +0000 (-0800) Subject: JSON escape a UTF8 string, plus a general purpose routine. X-Git-Tag: stable-11.3.0~252 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=4b09058fb66e07b2545d779e5aee80e18c02e6ed;p=thirdparty%2Fopen-vm-tools.git JSON escape a UTF8 string, plus a general purpose routine. Provide a JSON escape routine working with UTF8 built on top of a general purpose escape routine. --- diff --git a/open-vm-tools/lib/include/codeset.h b/open-vm-tools/lib/include/codeset.h index 8ec689a16..dd1df88f5 100644 --- a/open-vm-tools/lib/include/codeset.h +++ b/open-vm-tools/lib/include/codeset.h @@ -394,6 +394,16 @@ Bool CodeSet_IsStringValidUTF8(const char *string); // IN: Bool CodeSet_IsValidUTF8String(const char *bufIn, // IN: size_t sizeIn); // IN: +typedef struct { + char c; + char *escape; +} CodeSetEscapeEntry; + +char *CodeSet_Utf8Escape(const char *utf8, // IN: + const CodeSetEscapeEntry *entries); // IN: + +char *CodeSet_JsonEscape(const char *utf8); // IN: + /* *----------------------------------------------------------------------------- * diff --git a/open-vm-tools/lib/misc/codesetBase.c b/open-vm-tools/lib/misc/codesetBase.c index bc5f68d94..2b8e5bcff 100644 --- a/open-vm-tools/lib/misc/codesetBase.c +++ b/open-vm-tools/lib/misc/codesetBase.c @@ -1,5 +1,5 @@ /********************************************************* - * Copyright (C) 2010-2017 VMware, Inc. All rights reserved. + * Copyright (C) 2010-2020 VMware, Inc. All rights reserved. * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as published @@ -56,6 +56,7 @@ CodeSet_GetUtf8(const char *string, // IN: string uint8 *e; uint32 c; int len; + ASSERT(string < end); c = *p; @@ -66,24 +67,24 @@ CodeSet_GetUtf8(const char *string, // IN: string goto out; } - if ((c < 0xc2) || (c > 0xf4)) { - // 0x81 to 0xbf are not valid first bytes - // 0xc0 and 0xc1 cannot appear in UTF-8, see below - // leading char can not be > 0xf4, illegal as well + if ((c < 0xC2) || (c > 0xF4)) { + // 0x81 to 0xBF are not valid first bytes + // 0xC0 and 0xC1 cannot appear in UTF-8, see below + // leading char cannot be > 0xF4, illegal as well return 0; } - if (c < 0xe0) { + if (c < 0xE0) { // U+0080 - U+07FF: 2 bytes of UTF-8. - c -= 0xc0; + c -= 0xC0; len = 2; - } else if (c < 0xf0) { + } else if (c < 0xF0) { // U+0800 - U+FFFF: 3 bytes of UTF-8. - c -= 0xe0; + c -= 0xE0; len = 3; } else { // U+10000 - U+10FFFF: 4 bytes of UTF-8. - c -= 0xf0; + c -= 0xF0; len = 4; } @@ -93,7 +94,7 @@ CodeSet_GetUtf8(const char *string, // IN: string } while (++p < e) { - if ((*p & 0xc0) != 0x80) { + if ((*p & 0xC0) != 0x80) { // bad trailing byte return 0; } @@ -109,7 +110,7 @@ CodeSet_GetUtf8(const char *string, // IN: string * termination. * * This test does not work for len == 2, but that case is handled - * by requiring the first byte to be 0xc2 or greater (see above). + * by requiring the first byte to be 0xC2 or greater (see above). */ if (c < 1U << (len * 5 - 4)) { @@ -152,7 +153,7 @@ CodeSet_LengthInCodePoints(const char *utf8) // IN: char *end; uint32 codePoints = 0; - ASSERT(utf8); + ASSERT(utf8 != NULL); p = (char *) utf8; end = p + strlen(utf8); @@ -190,13 +191,13 @@ CodeSet_LengthInCodePoints(const char *utf8) // IN: */ int -CodeSet_CodePointOffsetToByteOffset(const char *utf8, // IN - int codePointOffset) // IN +CodeSet_CodePointOffsetToByteOffset(const char *utf8, // IN: + int codePointOffset) // IN: { const char *p; const char *end; - ASSERT(utf8); + ASSERT(utf8 != NULL); p = utf8; end = p + strlen(utf8); @@ -248,7 +249,7 @@ CodeSet_UTF8ToUTF32(const char *utf8, // IN: uint32 *ptr; int codePoints; - ASSERT(utf32); + ASSERT(utf32 != NULL); if (utf8 == NULL) { // NULL is not an error *utf32 = NULL; @@ -311,7 +312,7 @@ CodeSet_UTF32ToUTF8(const char *utf32, // IN: uint8 bytes[4]; } value; - ASSERT(utf8); + ASSERT(utf8 != NULL); if (utf32 == NULL) { // NULL is not an error *utf8 = NULL; @@ -390,4 +391,3 @@ CodeSet_UTF32ToUTF8(const char *utf32, // IN: return TRUE; } -