From: Oliver Kurth <okurth@vmware.com>
Date: Mon, 4 May 2020 18:54:11 +0000 (-0700)
Subject: CodeSet: Add CodeSet_IsValidUTF8String() and more comments
X-Git-Tag: stable-11.2.0~233
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=ced22726ccf583d54ebeaa83e5736fb249b730be;p=thirdparty%2Fopen-vm-tools.git

CodeSet: Add CodeSet_IsValidUTF8String() and more comments

This change adds a new function CodeSet_IsValidUTF8String() to
lib/misc/codesetUTF8.c, and adds comments for CodeSet_IsValidUTF8()
and CodeSet_IsStringValidUTF8().
---

diff --git a/open-vm-tools/lib/include/codeset.h b/open-vm-tools/lib/include/codeset.h
index 70cdf6009..0dc937c65 100644
--- a/open-vm-tools/lib/include/codeset.h
+++ b/open-vm-tools/lib/include/codeset.h
@@ -1,10 +1,13 @@
 /* **********************************************************
- * Copyright (C) 2007-2017 VMware, Inc.  All rights reserved.
+ * Copyright (C) 1998-2020 VMware, Inc.  All rights reserved. -- VMware Confidential
  * **********************************************************/
 
 /*
  * codeset.h --
  *
+ *    Character set and encoding conversion functions --hpreg
+ *
+ *
  *      UTF-16 handling macros. Based on utf16.h from ICU 1.8.1.
  *
  *      ICU 1.8.1 license follows:
@@ -46,6 +49,7 @@
  *      to promote the sale, use or other dealings in this Software
  *      without prior written authorization of the copyright holder.
  */
+
 #ifndef __CODESET_H__
 #   define __CODESET_H__
 
@@ -391,6 +395,9 @@ Bool CodeSet_IsValidUTF8(const char *bufIn,  // IN:
 
 Bool CodeSet_IsStringValidUTF8(const char *string);  // IN:
 
+Bool CodeSet_IsValidUTF8String(const char *bufIn,  // IN:
+                               size_t sizeIn);     // IN:
+
 /*
  *-----------------------------------------------------------------------------
  *
diff --git a/open-vm-tools/lib/misc/codesetUTF8.c b/open-vm-tools/lib/misc/codesetUTF8.c
index 128c5ecce..a6345d6cb 100644
--- a/open-vm-tools/lib/misc/codesetUTF8.c
+++ b/open-vm-tools/lib/misc/codesetUTF8.c
@@ -1,5 +1,5 @@
 /* **********************************************************
- * Copyright (C) 2015-2016 VMware, Inc.  All rights reserved.
+ * Copyright (C) 2015-2020 VMware, Inc.  All rights reserved.
  * **********************************************************/
 
 /*
@@ -61,12 +61,29 @@ CodeSetDecode(uint32 *state,  // IN:
 }
 
 
+/*
+ *----------------------------------------------------------------------------
+ *
+ * CodeSet_IsStringValidUTF8 --
+ *
+ *      Check if the given buffer contains a valid UTF-8 string.
+ *      This function stops at the first '\0' it sees.
+ *
+ * Results:
+ *      TRUE if the given buffer contains a valid UTF-8 string, or FALSE.
+ *
+ * Side effects:
+ *      None
+ *
+ *----------------------------------------------------------------------------
+ */
+
 Bool
 CodeSet_IsStringValidUTF8(const char *bufIn)  // IN:
 {
    uint32 state = UTF8_ACCEPT;
 
-   while (*bufIn) {
+   while (*bufIn != '\0') {
       CodeSetDecode(&state, (unsigned char) *bufIn++);
    }
 
@@ -74,6 +91,23 @@ CodeSet_IsStringValidUTF8(const char *bufIn)  // IN:
 }
 
 
+/*
+ *----------------------------------------------------------------------------
+ *
+ * CodeSet_IsValidUTF8 --
+ *
+ *      Check if the given buffer, of the given size, is UTF-8 encoded.
+ *      A '\0' in the buffer does not, by itself, make the check fail.
+ *
+ * Results:
+ *      TRUE if the buffer is UTF-8 encoded, or FALSE.
+ *
+ * Side effects:
+ *      None
+ *
+ *----------------------------------------------------------------------------
+ */
+
 Bool
 CodeSet_IsValidUTF8(const char *bufIn,  // IN:
                     size_t sizeIn)      // IN:
@@ -87,3 +121,45 @@ CodeSet_IsValidUTF8(const char *bufIn,  // IN:
 
    return state == UTF8_ACCEPT;
 }
+
+
+/*
+ *----------------------------------------------------------------------------
+ *
+ * CodeSet_IsValidUTF8String --
+ *
+ *      Check whether the first sizeIn bytes of bufIn are valid UTF-8 and
+ *      contain no '\0' byte. A zero-length buffer passes the check.
+ *
+ * Results:
+ *      TRUE if the buffer is valid UTF-8 with no '\0' in it, or FALSE.
+ *
+ * Side effects:
+ *      None
+ *
+ *----------------------------------------------------------------------------
+ */
+
+Bool
+CodeSet_IsValidUTF8String(const char *bufIn,  // IN: bytes to check
+                          size_t sizeIn)      // IN: number of bytes to check
+{
+   size_t i;
+   uint32 state = UTF8_ACCEPT;
+   unsigned char c;
+
+   for (i = 0; i < sizeIn; i++) {
+      c = (unsigned char) *bufIn++;
+
+      if (UNLIKELY(c == '\0')) {
+         return FALSE;
+      }
+
+      CodeSetDecode(&state, c);
+   }
+
+   /* Any state other than UTF8_ACCEPT at this point fails the check; for
+    * example, the data may have ended in the middle of a UTF-8 code point.
+    */
+   return state == UTF8_ACCEPT;
+}