CodeSet: Add CodeSet_IsValidUTF8String() and more comments

author Oliver Kurth <okurth@vmware.com>

Mon, 4 May 2020 18:54:11 +0000 (11:54 -0700)

committer Oliver Kurth <okurth@vmware.com>

Mon, 4 May 2020 18:54:11 +0000 (11:54 -0700)
author Oliver Kurth <okurth@vmware.com>
Mon, 4 May 2020 18:54:11 +0000 (11:54 -0700)
committer Oliver Kurth <okurth@vmware.com>
Mon, 4 May 2020 18:54:11 +0000 (11:54 -0700)
diff --git a/open-vm-tools/lib/include/codeset.h b/open-vm-tools/lib/include/codeset.h

index 70cdf6009ccf985fe9dab700e4856b6494f6d21f..0dc937c65851fac0a056de391526b76c85ef7e5b 100644 (file)
--- a/open-vm-tools/lib/include/codeset.h
+++ b/open-vm-tools/lib/include/codeset.h
@@ -1,10 +1,13 @@
  /* **********************************************************
- * Copyright (C) 2007-2017 VMware, Inc.  All rights reserved.
+ * Copyright (C) 1998-2020 VMware, Inc.  All rights reserved. -- VMware Confidential
   * **********************************************************/
  
  /*
   * codeset.h --
   *
+ *    Character set and encoding conversion functions --hpreg
+ *
+ *
   *      UTF-16 handling macros. Based on utf16.h from ICU 1.8.1.
   *
   *      ICU 1.8.1 license follows:
@@ -46,6 +49,7 @@
   *      to promote the sale, use or other dealings in this Software
   *      without prior written authorization of the copyright holder.
   */
+
  #ifndef __CODESET_H__
  #   define __CODESET_H__
  
@@ -391,6 +395,9 @@ Bool CodeSet_IsValidUTF8(const char *bufIn,  // IN:
  
  Bool CodeSet_IsStringValidUTF8(const char *string);  // IN:
  
+Bool CodeSet_IsValidUTF8String(const char *bufIn,  // IN:
+                               size_t sizeIn);     // IN:
+
  /*
   *-----------------------------------------------------------------------------
   *
diff --git a/open-vm-tools/lib/misc/codesetUTF8.c b/open-vm-tools/lib/misc/codesetUTF8.c

index 128c5ecceec2be2a51e5adffb04ad02ff26cbb79..a6345d6cbade041e18435b62bbb5c865ba3342d4 100644 (file)
--- a/open-vm-tools/lib/misc/codesetUTF8.c
+++ b/open-vm-tools/lib/misc/codesetUTF8.c
@@ -1,5 +1,5 @@
  /* **********************************************************
- * Copyright (C) 2015-2016 VMware, Inc.  All rights reserved.
+ * Copyright (C) 2015-2020 VMware, Inc.  All rights reserved.
   * **********************************************************/
  
  /*
@@ -61,12 +61,29 @@ CodeSetDecode(uint32 *state,  // IN:
  }
  
  
+/*
+ *----------------------------------------------------------------------------
+ *
+ * CodeSet_IsStringValidUTF8 --
+ *
+ *      Check if the given buffer contains a valid UTF-8 string.
+ *      This function will stop at the first '\0' it sees.
+ *
+ * Results:
+ *      TRUE if the given buffer contains a valid UTF-8 string, or FALSE.
+ *
+ * Side effects:
+ *      None
+ *
+ *----------------------------------------------------------------------------
+ */
+
  Bool
  CodeSet_IsStringValidUTF8(const char *bufIn)  // IN:
  {
     uint32 state = UTF8_ACCEPT;
  
-   while (*bufIn) {
+   while (*bufIn != '\0') {
        CodeSetDecode(&state, (unsigned char) *bufIn++);
     }
  
@@ -74,6 +91,23 @@ CodeSet_IsStringValidUTF8(const char *bufIn)  // IN:
  }
  
  
+/*
+ *----------------------------------------------------------------------------
+ *
+ * CodeSet_IsValidUTF8 --
+ *
+ *      Check if the given buffer of the given size is UTF-8 encoded.
+ *      A '\0' in the buffer does not, by itself, make this function fail.
+ *
+ * Results:
+ *      TRUE if the buffer is UTF-8 encoded, or FALSE.
+ *
+ * Side effects:
+ *      None
+ *
+ *----------------------------------------------------------------------------
+ */
+
  Bool
  CodeSet_IsValidUTF8(const char *bufIn,  // IN:
                      size_t sizeIn)      // IN:
@@ -87,3 +121,45 @@ CodeSet_IsValidUTF8(const char *bufIn,  // IN:
  
     return state == UTF8_ACCEPT;
  }
+
+
+/*
+ *----------------------------------------------------------------------------
+ *
+ * CodeSet_IsValidUTF8String --
+ *
+ *      Check that the first sizeIn bytes of bufIn form a valid UTF-8
+ *      string and contain no '\0' byte.
+ *
+ * Results:
+ *      TRUE if both conditions hold (also when sizeIn is 0), else FALSE.
+ *
+ * Side effects:
+ *      None
+ *
+ *----------------------------------------------------------------------------
+ */
+
+Bool
+CodeSet_IsValidUTF8String(const char *bufIn,  // IN:
+                          size_t sizeIn)      // IN:
+{
+   size_t i;
+   uint32 state = UTF8_ACCEPT;
+   unsigned char c;
+
+   for (i = 0; i < sizeIn; i++) {
+      c = (unsigned char) *bufIn++;
+
+      if (UNLIKELY(c == '\0')) {
+         return FALSE;  // Embedded NUL bytes are rejected.
+      }
+
+      CodeSetDecode(&state, c);
+   }
+
+   /* The buffer is accepted only if the decoder is back in UTF8_ACCEPT;
+    * otherwise the data may have ended in the middle of a UTF-8 code point.
+    */
+   return state == UTF8_ACCEPT;
+}
author	Oliver Kurth <okurth@vmware.com>
	Mon, 4 May 2020 18:54:11 +0000 (11:54 -0700)
committer	Oliver Kurth <okurth@vmware.com>
	Mon, 4 May 2020 18:54:11 +0000 (11:54 -0700)
open-vm-tools/lib/include/codeset.h		patch \| blob \| blame \| history
open-vm-tools/lib/misc/codesetUTF8.c		patch \| blob \| blame \| history