lib/unicode: Unicode_Substr is broken

author VMware, Inc <>

Mon, 20 Dec 2010 21:44:25 +0000 (13:44 -0800)

committer Marcelo Vanzin <mvanzin@vmware.com>

Mon, 20 Dec 2010 21:44:25 +0000 (13:44 -0800)
author VMware, Inc <>
Mon, 20 Dec 2010 21:44:25 +0000 (13:44 -0800)
committer Marcelo Vanzin <mvanzin@vmware.com>
Mon, 20 Dec 2010 21:44:25 +0000 (13:44 -0800)
diff --git a/open-vm-tools/lib/unicode/unicodeSimpleOperations.c b/open-vm-tools/lib/unicode/unicodeSimpleOperations.c

index 4b2ac68f4e2bd9fff7499f4fa941edd55aee0c9b..89750fa0d0ad6e83f704cfb0ef3955557bdc0c88 100644 (file)
--- a/open-vm-tools/lib/unicode/unicodeSimpleOperations.c
+++ b/open-vm-tools/lib/unicode/unicodeSimpleOperations.c
@@ -379,9 +379,12 @@ Unicode_FindLastSubstrInRange(ConstUnicode str,             // IN
   *      Indices and lengths that are out of bounds are pinned to the
   *      edges of the string.
   *
- *      Pass -1 for any length parameter to indicate "from start until
+ *      Pass -1 for the length parameter to indicate "from start until
   *      end of string".
   *
+ *      The start and length arguments are in code points - unicode
+ *      "characters" - not bytes!
+ *
   * Results:
   *      The newly-allocated substring of 'str' in the range [index,
   *      index + length). Caller must free with Unicode_Free.
@@ -393,13 +396,42 @@ Unicode_FindLastSubstrInRange(ConstUnicode str,             // IN
   */
  
  Unicode
-Unicode_Substr(ConstUnicode str,    // IN
-               UnicodeIndex start,  // IN
-               UnicodeIndex length) // IN
+Unicode_Substr(ConstUnicode str,     // IN:
+               UnicodeIndex start,   // IN:
+               UnicodeIndex length)  // IN:
  {
-   UnicodePinIndices(str, &start, &length);
+   char *substr;
+   uint32 *utf32;
+   uint32 codePointLen;
+
+   ASSERT(str);
+   ASSERT((start >= 0) || (start == -1));
+   ASSERT((length >= 0) || (length == -1));
+
+   if (!CodeSet_UTF8ToUTF32(str, (char **) &utf32)) {
+      Panic("%s: invalid UTF8 string @ %p\n", __FUNCTION__, str);
+   }
+
+   codePointLen = 0;
+   while (utf32[codePointLen] != 0) {
+      codePointLen++;
+   }
+
+   if ((start < 0) || (start > codePointLen)) {
+      start = codePointLen;
+   }
+
+   if ((length < 0) || ((start + length) > codePointLen)) {
+      length = codePointLen - start;
+   }
+
+   utf32[start + length] = 0;
+
+   CodeSet_UTF32ToUTF8((char *) &utf32[start], &substr);
+
+   free(utf32);
  
-   return Util_SafeStrndup(((const char *)str) + start, length);
+   return substr;
  }
  
  
@@ -453,13 +485,10 @@ Unicode_ReplaceRange(ConstUnicode destination,       // IN
     result = Util_SafeMalloc(resultLength + 1);
  
     // Start with the destination bytes before the substring to be replaced.
-   memcpy(result,
-          destination,
-          destinationStart);
+   memcpy(result, destination, destinationStart);
  
     // Insert the substring of source in place of the destination substring.
-   memcpy(result + destinationStart,
-          (const char *)source + sourceStart,
+   memcpy(result + destinationStart, (const char *) source + sourceStart,
            sourceLength);
  
     // Append the remaining bytes of destination after the replaced substring.
author	VMware, Inc <>
	Mon, 20 Dec 2010 21:44:25 +0000 (13:44 -0800)
committer	Marcelo Vanzin <mvanzin@vmware.com>
	Mon, 20 Dec 2010 21:44:25 +0000 (13:44 -0800)