From: Oliver Kurth Date: Mon, 4 May 2020 18:54:11 +0000 (-0700) Subject: CodeSet: Add CodeSet_IsValidUTF8String() and more comments X-Git-Tag: stable-11.2.0~233 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=ced22726ccf583d54ebeaa83e5736fb249b730be;p=thirdparty%2Fopen-vm-tools.git CodeSet: Add CodeSet_IsValidUTF8String() and more comments This change adds a new function CodeSet_IsValidUTF8String() to lib/misc/codesetUTF8.c, and adds comments for CodeSet_IsValidUTF8() and CodeSet_IsStringValidUTF8(). --- diff --git a/open-vm-tools/lib/include/codeset.h b/open-vm-tools/lib/include/codeset.h index 70cdf6009..0dc937c65 100644 --- a/open-vm-tools/lib/include/codeset.h +++ b/open-vm-tools/lib/include/codeset.h @@ -1,10 +1,13 @@ /* ********************************************************** - * Copyright (C) 2007-2017 VMware, Inc. All rights reserved. + * Copyright (C) 1998-2020 VMware, Inc. All rights reserved. -- VMware Confidential * **********************************************************/ /* * codeset.h -- * + * Character set and encoding conversion functions --hpreg + * + * * UTF-16 handling macros. Based on utf16.h from ICU 1.8.1. * * ICU 1.8.1 license follows: @@ -46,6 +49,7 @@ * to promote the sale, use or other dealings in this Software * without prior written authorization of the copyright holder. 
*/ + #ifndef __CODESET_H__ # define __CODESET_H__ @@ -391,6 +395,9 @@ Bool CodeSet_IsValidUTF8(const char *bufIn, // IN: Bool CodeSet_IsStringValidUTF8(const char *string); // IN: +Bool CodeSet_IsValidUTF8String(const char *bufIn, // IN: + size_t sizeIn); // IN: + /* *----------------------------------------------------------------------------- * diff --git a/open-vm-tools/lib/misc/codesetUTF8.c b/open-vm-tools/lib/misc/codesetUTF8.c index 128c5ecce..a6345d6cb 100644 --- a/open-vm-tools/lib/misc/codesetUTF8.c +++ b/open-vm-tools/lib/misc/codesetUTF8.c @@ -1,5 +1,5 @@ /* ********************************************************** - * Copyright (C) 2015-2016 VMware, Inc. All rights reserved. + * Copyright (C) 2015-2020 VMware, Inc. All rights reserved. * **********************************************************/ /* @@ -61,12 +61,29 @@ CodeSetDecode(uint32 *state, // IN: } +/* + *---------------------------------------------------------------------------- + * + * CodeSet_IsStringValidUTF8 -- + * + * Check if the given buffer contains a valid UTF-8 string. + * This function will stop at the first '\0' it sees. + * + * Results: + * TRUE if the given buffer contains a valid UTF-8 string, or FALSE. + * + * Side effects: + * None + * + *---------------------------------------------------------------------------- + */ + Bool CodeSet_IsStringValidUTF8(const char *bufIn) // IN: { uint32 state = UTF8_ACCEPT; - while (*bufIn) { + while (*bufIn != '\0') { CodeSetDecode(&state, (unsigned char) *bufIn++); } @@ -74,6 +91,23 @@ CodeSet_IsStringValidUTF8(const char *bufIn) // IN: } +/* + *---------------------------------------------------------------------------- + * + * CodeSet_IsValidUTF8 -- + * + * Check if the given buffer, with the given size, is UTF-8 encoded. + * This function will return TRUE even if there is a '\0' in the buffer. + * + * Results: + * TRUE if the buffer is UTF-8 encoded, or FALSE. 
+ * + * Side effects: + * None + * + *---------------------------------------------------------------------------- + */ + Bool CodeSet_IsValidUTF8(const char *bufIn, // IN: size_t sizeIn) // IN: @@ -87,3 +121,45 @@ CodeSet_IsValidUTF8(const char *bufIn, // IN: return state == UTF8_ACCEPT; } + + +/* + *---------------------------------------------------------------------------- + * + * CodeSet_IsValidUTF8String -- + * + * Check whether the first sizeIn bytes of bufIn are a valid UTF-8 string, + * i.e. they decode cleanly and none of them is a '\0' byte. + * + * Results: + * TRUE if no byte is '\0' and decoding ends in UTF8_ACCEPT, or FALSE. + * + * Side effects: + * None + * + *---------------------------------------------------------------------------- + */ + +Bool +CodeSet_IsValidUTF8String(const char *bufIn, // IN: buffer to check + size_t sizeIn) // IN: number of bytes to check +{ + size_t i; + uint32 state = UTF8_ACCEPT; + unsigned char c; + + for (i = 0; i < sizeIn; i++) { + c = (unsigned char) *bufIn++; + + if (UNLIKELY(c == '\0')) { + return FALSE; + } + + CodeSetDecode(&state, c); + } + + /* If everything went well we should have proper UTF8; otherwise the + * final state is not UTF8_ACCEPT, e.g. the data ended mid-codepoint. + */ + return state == UTF8_ACCEPT; +}