From: VMware, Inc <>
Date: Mon, 20 Dec 2010 21:44:25 +0000 (-0800)
Subject: lib/unicode: Unicode_Substr is broken
X-Git-Tag: 2010.12.19-339835~58
X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=fd327a1c98b9dea1414888ee8a5de4cb944e95e9;p=thirdparty%2Fopen-vm-tools.git

lib/unicode: Unicode_Substr is broken

There is a confusion of code points and code units within lib/unicode
that needs to be cleaned up. Here I fix Unicode_Substr using the
technique I plan to use across most of the functions in
unicodeSimpleOperations.c.

More changes will ensue until the base PR is fixed.

Signed-off-by: Marcelo Vanzin <mvanzin@vmware.com>
---

diff --git a/open-vm-tools/lib/unicode/unicodeSimpleOperations.c b/open-vm-tools/lib/unicode/unicodeSimpleOperations.c
index 4b2ac68f4..89750fa0d 100644
--- a/open-vm-tools/lib/unicode/unicodeSimpleOperations.c
+++ b/open-vm-tools/lib/unicode/unicodeSimpleOperations.c
@@ -379,9 +379,12 @@ Unicode_FindLastSubstrInRange(ConstUnicode str,             // IN
  *      Indices and lengths that are out of bounds are pinned to the
  *      edges of the string.
  *
- *      Pass -1 for any length parameter to indicate "from start until
+ *      Pass -1 for the length parameter to indicate "from start until
  *      end of string".
  *
+ *      The start and length arguments are in code points - unicode
+ *      "characters" - not bytes!
+ *
  * Results:
  *      The newly-allocated substring of 'str' in the range [index,
  *      index + length). Caller must free with Unicode_Free.
@@ -393,13 +396,42 @@ Unicode_FindLastSubstrInRange(ConstUnicode str,             // IN
  */
 
 Unicode
-Unicode_Substr(ConstUnicode str,    // IN
-               UnicodeIndex start,  // IN
-               UnicodeIndex length) // IN
+Unicode_Substr(ConstUnicode str,     // IN:
+               UnicodeIndex start,   // IN:
+               UnicodeIndex length)  // IN:
 {
-   UnicodePinIndices(str, &start, &length);
+   char *substr;
+   uint32 *utf32;
+   uint32 codePointLen;
+
+   ASSERT(str);
+   ASSERT((start >= 0) || (start == -1));
+   ASSERT((length >= 0) || (length == -1));
+
+   if (!CodeSet_UTF8ToUTF32(str, (char **) &utf32)) {
+      Panic("%s: invalid UTF8 string @ %p\n", __FUNCTION__, str);
+   }
+
+   codePointLen = 0;
+   while (utf32[codePointLen] != 0) {
+      codePointLen++;
+   }
+
+   if ((start < 0) || (start > codePointLen)) {
+      start = codePointLen;
+   }
+
+   if ((length < 0) || ((start + length) > codePointLen)) {
+      length = codePointLen - start;
+   }
+
+   utf32[start + length] = 0;
+
+   CodeSet_UTF32ToUTF8((char *) &utf32[start], &substr);
+
+   free(utf32);
 
-   return Util_SafeStrndup(((const char *)str) + start, length);
+   return substr;
 }
 
 
@@ -453,13 +485,10 @@ Unicode_ReplaceRange(ConstUnicode destination,       // IN
    result = Util_SafeMalloc(resultLength + 1);
 
    // Start with the destination bytes before the substring to be replaced.
-   memcpy(result,
-          destination,
-          destinationStart);
+   memcpy(result, destination, destinationStart);
 
    // Insert the substring of source in place of the destination substring.
-   memcpy(result + destinationStart,
-          (const char *)source + sourceStart,
+   memcpy(result + destinationStart, (const char *) source + sourceStart,
           sourceLength);
 
    // Append the remaining bytes of destination after the replaced substring.