Replace the use of appInfo's internal json escape function with CodeSet_JsonEscape()

author John Wolfe <jwolfe@vmware.com>

Tue, 24 Aug 2021 03:13:37 +0000 (20:13 -0700)

committer John Wolfe <jwolfe@vmware.com>

Tue, 24 Aug 2021 03:13:37 +0000 (20:13 -0700)
author John Wolfe <jwolfe@vmware.com>
Tue, 24 Aug 2021 03:13:37 +0000 (20:13 -0700)
committer John Wolfe <jwolfe@vmware.com>
Tue, 24 Aug 2021 03:13:37 +0000 (20:13 -0700)
diff --git a/open-vm-tools/lib/misc/Makefile.am b/open-vm-tools/lib/misc/Makefile.am

index 8678f515bb51f5c1357614a953c93cdd525474df..d968f923571d2eb7e743de53b8c4faa4074fa3e9 100644 (file)
--- a/open-vm-tools/lib/misc/Makefile.am
+++ b/open-vm-tools/lib/misc/Makefile.am
@@ -1,5 +1,5 @@
  ################################################################################
-### Copyright (C) 2007-2017 VMware, Inc.  All rights reserved.
+### Copyright (C) 2007-2017,2021 VMware, Inc.  All rights reserved.
  ###
  ### This program is free software; you can redistribute it and/or modify
  ### it under the terms of version 2 of the GNU General Public License as
@@ -34,6 +34,7 @@ libMisc_la_SOURCES += hostname.c
  libMisc_la_SOURCES += hostType.c
  libMisc_la_SOURCES += idLinux.c
  libMisc_la_SOURCES += iovector.c
+libMisc_la_SOURCES += jsonUTF8.c
  libMisc_la_SOURCES += logFixed.c
  libMisc_la_SOURCES += machineID.c
  libMisc_la_SOURCES += miscSolaris.c
diff --git a/open-vm-tools/lib/misc/jsonUTF8.c b/open-vm-tools/lib/misc/jsonUTF8.c

new file mode 100644 (file)

index 0000000..816bca7
--- /dev/null
+++ b/open-vm-tools/lib/misc/jsonUTF8.c
@@ -0,0 +1,524 @@
+/*********************************************************
+ * Copyright (C) 2020-2021 VMware, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation version 2.1 and no later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the Lesser GNU General Public
+ * License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA.
+ *
+ *********************************************************/
+
+#include <stdlib.h>
+#include "vmware.h"
+#include "codeset.h"
+#include "vm_ctype.h"
+#include "dynbuf.h"
+#include "unicodeBase.h"
+
+
+/*
+ *-----------------------------------------------------------------------------
+ *
+ * CodeSetFindEscape --
+ *
+ *      Look up the escape table entry, if any, for a single character.
+ *
+ *      The table is scanned in order and must be terminated by a sentinel
+ *      entry with a 'c' of '\0' and an 'escape' of NULL.  Only the NULL
+ *      'escape' is tested when looking for the end of the table.
+ *
+ * Results:
+ *      NULL No entry matches the character.
+ *     !NULL Pointer to the first entry whose 'c' equals the character.
+ *
+ * Side effects:
+ *      None
+ *
+ *-----------------------------------------------------------------------------
+ */
+
+static const CodeSetEscapeEntry *
+CodeSetFindEscape(char c,                             // IN:
+                  const CodeSetEscapeEntry *entries)  // IN:
+{
+   const CodeSetEscapeEntry *cur = entries;
+
+   while (cur->escape != NULL && cur->c != c) {
+      cur++;
+   }
+
+   /* Stopping on the sentinel (NULL escape) means nothing matched. */
+   return (cur->escape != NULL) ? cur : NULL;
+}
+
+
+/*
+ *-----------------------------------------------------------------------------
+ *
+ * CodeSet_Utf8Escape --
+ *
+ *      Copy a UTF-8 string, replacing every ASCII character found in the
+ *      escape table with its escape text; all other input is copied as is.
+ *
+ *      The escape table must end with a sentinel entry that has a 'c' of
+ *      '\0' and an 'escape' of NULL.
+ *
+ * Results:
+ *      NULL Failure: utf8 is NULL, CodeSet_GetUtf8 rejects the input, or an
+ *                    append to the output buffer fails.
+ *     !NULL Success! The escaped string. The caller is responsible for
+ *                    freeing it.
+ *
+ * Side effects:
+ *      Memory is allocated
+ *
+ *-----------------------------------------------------------------------------
+ */
+
+char *
+CodeSet_Utf8Escape(const char *utf8,                   // IN:
+                   const CodeSetEscapeEntry *entries)  // IN:
+{
+   DynBuf b;
+   char *res = NULL;
+   const char *p;
+   const char *end;
+   Bool success = TRUE;
+
+   ASSERT(utf8 != NULL);
+   if (utf8 == NULL) {
+      return NULL;
+   }
+
+   DynBuf_Init(&b);
+
+   p = utf8;
+   end = p + strlen(utf8);
+
+   while (success && p < end) {
+      const CodeSetEscapeEntry *e = NULL;
+      uint32 len = CodeSet_GetUtf8(p, end, NULL);
+
+      if (len == 0) {
+         success = FALSE;
+         break;
+      }
+
+      /* Only one-byte (ASCII) code points are looked up in the table. */
+      if (len == 1) {
+         e = CodeSetFindEscape(*p, entries);
+      }
+
+      /* DynBuf_Append reports failure through its result; stop on it. */
+      if (e != NULL) {
+         success = DynBuf_Append(&b, e->escape, strlen(e->escape));
+      } else {
+         success = DynBuf_Append(&b, p, len);
+      }
+
+      p += len;
+   }
+
+   if (success) {
+      res = DynBuf_DetachString(&b);
+   }
+
+   DynBuf_Destroy(&b);
+
+   return res;
+}
+
+
+/*
+ *-----------------------------------------------------------------------------
+ *
+ * CodeSet_JsonEscape --
+ *
+ *      Escape a UTF-8 string following the JSON string rules (RFC 8259):
+ *      - Backspace, form feed, line feed, CR and tab become \b \f \n \r \t.
+ *      - Backslash and double quote are prefixed with a backslash.
+ *      - Every other control character below U+0020 becomes \u00XX.
+ *
+ * Results:
+ *      NULL Failure!
+ *     !NULL Success! The escaped string. The caller must free it.
+ *
+ * Side effects:
+ *      Memory is allocated
+ *
+ *-----------------------------------------------------------------------------
+ */
+
+char *
+CodeSet_JsonEscape(const char *utf8)  // IN:
+{
+   static const CodeSetEscapeEntry JsonEscapes[] = {
+      /* Characters with a two-character escape. */
+      { '\b', "\\b"  }, { '\f', "\\f"  }, { '\n', "\\n"  }, { '\r', "\\r"  },
+      { '\t', "\\t"  }, { '\\', "\\\\" }, { '\"', "\\\"" },
+      /* Control characters without a short form are written as "\u00XX". */
+      { '\x01', "\\u0001" }, { '\x02', "\\u0002" }, { '\x03', "\\u0003" },
+      { '\x04', "\\u0004" }, { '\x05', "\\u0005" }, { '\x06', "\\u0006" },
+      { '\x07', "\\u0007" }, { '\x0b', "\\u000b" }, { '\x0e', "\\u000e" },
+      { '\x0f', "\\u000f" }, { '\x10', "\\u0010" }, { '\x11', "\\u0011" },
+      { '\x12', "\\u0012" }, { '\x13', "\\u0013" }, { '\x14', "\\u0014" },
+      { '\x15', "\\u0015" }, { '\x16', "\\u0016" }, { '\x17', "\\u0017" },
+      { '\x18', "\\u0018" }, { '\x19', "\\u0019" }, { '\x1a', "\\u001a" },
+      { '\x1b', "\\u001b" }, { '\x1c', "\\u001c" }, { '\x1d', "\\u001d" },
+      { '\x1e', "\\u001e" }, { '\x1f', "\\u001f" },
+      { '\0', NULL   }   // MUST BE LAST
+   };
+
+   return CodeSet_Utf8Escape(utf8, JsonEscapes);
+}
+
+
+/* Constants used by the JSON unescape routines. */
+
+/* Number of hex digits that follow "\u" in a JSON escape sequence. */
+#define JSON_UESC_NDIGITS 4
+
+/*
+ * Maximum number of UTF-8 code units (bytes) per Unicode code point.
+ * From bora/lib/unicode/unicode/utf8.h; defined here only if still undefined.
+ */
+#ifndef U8_MAX_LENGTH
+#define U8_MAX_LENGTH 4
+#endif
+
+/*
+ *----------------------------------------------------------------------------
+ *
+ * CodeSet_JSonGetHex --
+ *
+ *      Retrieve and convert to an integer the four hex digits that are
+ *      part of the six character escape sequence that starts with "\u".
+ *
+ *      On entry, p points to the first code point following "\u".
+ *
+ * Results:
+ *      TRUE on success, with *value set to the integer value (0 - 0xFFFF).
+ *
+ *      FALSE, with *value untouched, if four hex digits are not found.
+ *
+ * Side effects:
+ *      None.
+ *
+ *----------------------------------------------------------------------------
+ */
+
+static Bool
+CodeSet_JSonGetHex(const char *p,       // IN:
+                   const char *end,     // IN:
+                   uint32 *value)       // OUT:
+{
+   char hexBuf[JSON_UESC_NDIGITS + 1];   /* +1 for NUL */
+   int numHexDigits = 0;
+
+   ASSERT(p <= end);
+
+   /*
+    * p sits on a code point boundary just past "\u".  A byte equal to an
+    * ASCII hex digit is a complete one-byte code point, so no call to
+    * CodeSet_GetUtf8 is needed and advancing by one char is safe.
+    */
+   while (numHexDigits < JSON_UESC_NDIGITS) {
+      if (p >= end || !CType_IsXDigit(*p)) {
+         return FALSE;
+      }
+      hexBuf[numHexDigits++] = *p++;
+   }
+
+   hexBuf[numHexDigits] = '\0';
+   /*
+    * hexBuf now holds exactly four validated hex digits, so the parsed
+    * value is at most 0xFFFF and always fits in *value.
+    */
+   *value = (uint32)strtoul(hexBuf, NULL, 16);
+   return TRUE;
+}
+
+
+/*
+ *----------------------------------------------------------------------------
+ *
+ * CodeSet_JsonUnescapeU --
+ *
+ *      Handle a JSON escape sequence beginning with "\u", consisting either
+ *      of:
+ *         (1) "\u" followed by four hex digits; or
+ *         (2) two such consecutive sequences encoding a character
+ *             outside the Basic Multilingual Plane as a UTF-16
+ *             surrogate pair (a lead surrogate followed by a trail one).
+ *
+ *      Note "\u0000" is not allowed and is considered an error if
+ *      encountered, as is a trail surrogate with no lead before it.
+ *
+ *      On entry to the routine, p must point at the backslash character
+ *      that starts the escape sequence, and the next character must be 'u'.
+ *
+ *      outBuf is the base of a char array of size >= U8_MAX_LENGTH + 1, i.e.,
+ *      large enough to hold a NUL-terminated UTF-8 encoding of any Unicode
+ *      code point.
+ *
+ * Results:
+ *      On success, the length in chars of the escape sequence (6, or 12 for
+ *      a surrogate pair); outBuf holds the NUL-terminated unescaped result.
+ *
+ *      0 on failure; outBuf is not written in that case.
+ *
+ * Side effects:
+ *      None.
+ *
+ *----------------------------------------------------------------------------
+ */
+
+static int
+CodeSet_JsonUnescapeU(const char *p,        // IN:
+                      const char *end,      // IN:
+                      char *outBuf)         // OUT:
+{
+   uint32 w;
+   uint32 utf32Buf[2];  /* code point value plus 0 terminator */
+   char *utf8String;
+   uint32 len;
+   const char *start = p;
+
+   /*
+    * The caller must only get here when the input starts with "\u".  No
+    * need to call CodeSet_GetUtf8 in these ASSERTs since they check for
+    * specific ASCII characters - see comment preceding ASSERT in
+    * CodeSet_JsonUnescapeOne below.
+    */
+   ASSERT(p < end && *p == '\\');
+   ASSERT(&p[1] < end && p[1] == 'u');
+
+   /* Fail on bad or missing hex digits; code point 0 ("\u0000") is refused. */
+   if (!CodeSet_JSonGetHex(&p[2], end, &w) || w == 0) {
+      return 0;
+   }
+
+   /* Advance p past "\u" and the hex digits that follow. */
+   p += 2 + JSON_UESC_NDIGITS;
+
+   /* A leading surrogate is only valid when a trailing one follows it. */
+   if (U16_IS_LEAD(w)) {
+      uint32 trail;
+
+      /*
+       * Check for '\', 'u', and four hex digits forming a trail surrogate.
+       * Each "*p++" runs only after its "p < end" test, and on success p is
+       * left just past "\u".  Only ASCII is matched: no CodeSet_GetUtf8 call.
+       */
+      if (p < end && *p++ == '\\' && p < end && *p++ == 'u' &&
+          CodeSet_JSonGetHex(p, end, &trail) && U16_IS_TRAIL(trail)) {
+         w = U16_GET_SUPPLEMENTARY(w, trail);
+
+         /* Advance p past the digits that follow "\u". */
+         p += JSON_UESC_NDIGITS;
+      } else {
+         return 0;
+      }
+   } else if (U16_IS_TRAIL(w)) {
+      return 0;
+   }
+
+   /*
+    * To get the UTF-8 for this code point, build a one-element UTF-32
+    * string (w plus a terminator) and convert that to UTF-8.
+    */
+   utf32Buf[0] = w;
+   utf32Buf[1] = 0;   /* needs a 4-byte 0 terminator */
+
+   if (!CodeSet_UTF32ToUTF8((char *)utf32Buf, &utf8String)) {
+      return 0;
+   }
+
+   len = strlen(utf8String);
+   ASSERT(Unicode_IsBufferValid(utf8String, len, STRING_ENCODING_UTF8));
+   ASSERT(len <= U8_MAX_LENGTH);
+   memcpy(outBuf, utf8String, len + 1);   /* len + 1 copies the NUL too */
+
+   free(utf8String);
+   return p - start;
+}
+
+
+/*
+ *----------------------------------------------------------------------------
+ *
+ * CodeSet_JsonUnescapeOne --
+ *
+ *      Decode the single JSON escape sequence that starts at p.
+ *
+ *      On entry p must point at the backslash that introduces the sequence.
+ *      The two-character escapes for '"', '\', '/', backspace, form feed,
+ *      line feed, carriage return and tab are decoded here; a sequence that
+ *      starts with "\u" is handed to CodeSet_JsonUnescapeU.  Anything else
+ *      is rejected.
+ *
+ *      outBuf is the base of a char array of size >= U8_MAX_LENGTH + 1, i.e.,
+ *      large enough to hold a NUL-terminated UTF-8 encoding of any Unicode
+ *      code point.
+ *
+ * Results:
+ *      On success, the number of input chars that make up the escape
+ *      sequence, with the unescaped result plus a NUL terminator in outBuf.
+ *
+ *      0 on failure: the backslash is the last char before end, or what
+ *      follows it is not a valid escape.
+ *
+ * Side effects:
+ *      None.
+ *
+ *----------------------------------------------------------------------------
+ */
+
+static int
+CodeSet_JsonUnescapeOne(const char *p,        // IN:
+                        const char *end,      // IN:
+                        char *outBuf)         // OUT:
+{
+   const char *next;
+   char decoded;
+
+   /*
+    * The caller guarantees that p is on a code point boundary and that the
+    * byte there is '\'.  Matching a specific ASCII byte on a boundary is
+    * enough to know the code point is one byte long, so CodeSet_GetUtf8 is
+    * not consulted.
+    */
+   ASSERT(p < end && *p == '\\');
+
+   /* A lone backslash at the very end of the input is not an escape. */
+   next = p + 1;
+   if (next >= end) {
+      return 0;
+   }
+
+   /*
+    * Terminate the one-character result up front.  The "\u" path rewrites
+    * outBuf itself when it succeeds.
+    */
+   outBuf[1] = '\0';
+
+   /*
+    * The byte after the backslash selects the escape.  The first byte of a
+    * multi-byte code point matches none of the cases and is rejected by the
+    * default case.
+    */
+   switch (*next) {
+      case 'u':
+         /* Hand the whole sequence, backslash included, to the \u decoder. */
+         return CodeSet_JsonUnescapeU(p, end, outBuf);
+      case '\"':
+      case '\\':
+      case '/':
+         decoded = *next;
+         break;
+      case 'b':
+         decoded = '\b';
+         break;
+      case 'f':
+         decoded = '\f';
+         break;
+      case 'n':
+         decoded = '\n';
+         break;
+      case 'r':
+         decoded = '\r';
+         break;
+      case 't':
+         decoded = '\t';
+         break;
+      default:
+         return 0;
+   }
+
+   /* Two input characters were consumed to produce one output character. */
+   outBuf[0] = decoded;
+   return 2;
+}
+
+
+/*
+ *-----------------------------------------------------------------------------
+ *
+ * CodeSet_JsonUnescape --
+ *
+ *      Copy a UTF-8 string, reverting any JSON escape sequences found in it
+ *      according to STD-90 (https://tools.ietf.org/html/std90).  The
+ *      sequences handled are the ones the jsmn parser allows, and the logic
+ *      generally follows jsmn's escape parsing.  Input has likely been
+ *      through jsmn already, which should have rejected invalid sequences,
+ *      but this routine and its helpers still detect them and fail rather
+ *      than assuming or asserting validity.
+ *
+ *      A general unescape routine is difficult to do, so the logic here is
+ *      specific to JSON (as opposed to CodeSet_JsonEscape, which relies on
+ *      the more general CodeSet_Utf8Escape).
+ *
+ * Results:
+ *      NULL Failure: utf8 is NULL, CodeSet_GetUtf8 rejects the input, an
+ *                    escape sequence is invalid, or an append to the output
+ *                    buffer fails.
+ *     !NULL Success! The un-escaped string. The caller is responsible for
+ *                    freeing it.
+ *
+ * Side effects:
+ *      Returns a dynamically allocated string that must be freed by the
+ *      caller.
+ *
+ *-----------------------------------------------------------------------------
+ */
+
+char *
+CodeSet_JsonUnescape(const char *utf8)   // IN:
+{
+   DynBuf b;
+   char *res = NULL;
+   const char *p;
+   const char *end;
+   Bool success = TRUE;
+
+   ASSERT(utf8 != NULL);
+   if (utf8 == NULL) {   // Same guard as CodeSet_Utf8Escape
+      return NULL;
+   }
+
+   DynBuf_Init(&b);
+   p = utf8;
+   end = p + strlen(p);
+
+   while (p < end && success) {
+      char unescaped[U8_MAX_LENGTH + 1];  /* +1 for NUL */
+      uint32 len = CodeSet_GetUtf8(p, end, NULL);
+
+      if (len == 0) {
+         success = FALSE;
+      } else if (len > 1 || *p != '\\') {
+         success = DynBuf_Append(&b, p, len);
+      } else if ((len = CodeSet_JsonUnescapeOne(p, end, unescaped)) != 0) {
+         success = DynBuf_Append(&b, unescaped, strlen(unescaped));
+      } else {
+         success = FALSE;
+      }
+      p += len;
+   }
+
+   if (success) {
+      res = DynBuf_DetachString(&b);
+   }
+
+   DynBuf_Destroy(&b);
+
+   return res;
+}
diff --git a/open-vm-tools/services/plugins/appInfo/appInfo.c b/open-vm-tools/services/plugins/appInfo/appInfo.c

index cab928bc340e9bdb7ce71768111824231093bc9d..0f70f4c44241ea7eb3a7d4a491907c684d2a7fae 100644 (file)
--- a/open-vm-tools/services/plugins/appInfo/appInfo.c
+++ b/open-vm-tools/services/plugins/appInfo/appInfo.c
@@ -30,6 +30,7 @@
  
  #include "appInfoInt.h"
  #include "vmware.h"
+#include "codeset.h"
  #include "conf.h"
  #include "dynbuf.h"
  #include "escape.h"
@@ -50,10 +51,6 @@
  VM_EMBED_VERSION(VMTOOLSD_VERSION_STRING);
  #endif
  
-#if defined(_WIN32)
-#include "codeset.h"
-#endif
-
  /**
   * Maximum size of the packet size that appInfo plugin should send
   * to the VMX. Currently, this is set to 62 KB.
@@ -102,51 +99,6 @@ static GSource *gAppInfoTimeoutSource = NULL;
  static void TweakGatherLoop(ToolsAppCtx *ctx, gboolean force);
  
  
-/*
- *****************************************************************************
- * EscapeJSONString --
- *
- * Escapes a string to be included in JSON content.
- *
- * @param[in] str The string to be escaped.
- *
- * @retval Pointer to a heap-allocated memory. This holds the escaped content
- *         of the string passed by the caller.
- *
- *****************************************************************************
- */
-
-static char *
-EscapeJSONString(const char *str)    // IN
-{
-   /*
-    * Escape '"' and '\' characters in the JSON string.
-    */
-
-   static const int bytesToEscape[] = {
-      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-      0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // "
-      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,   // '\'
-      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-   };
-
-   return Escape_DoString("\\u00", bytesToEscape, str, strlen(str),
-                          NULL);
-}
-
-
  /*
   *****************************************************************************
   * SetGuestInfo --
@@ -333,7 +285,7 @@ AppInfoGatherTask(ToolsAppCtx *ctx,    // IN
           goto next_entry;
        }
  
-      escapedCmd = EscapeJSONString(appInfo->appName);
+      escapedCmd = CodeSet_JsonEscape(appInfo->appName);
  
        if (NULL == escapedCmd) {
           g_warning("%s: Failed to escape the content of cmdName.\n",
@@ -341,7 +293,7 @@ AppInfoGatherTask(ToolsAppCtx *ctx,    // IN
           goto quit;
        }
  
-      escapedVersion = EscapeJSONString(appInfo->version);
+      escapedVersion = CodeSet_JsonEscape(appInfo->version);
        if (NULL == escapedVersion) {
           g_warning("%s: Failed to escape the content of version information.\n",
                     __FUNCTION__);
author	John Wolfe <jwolfe@vmware.com>
	Tue, 24 Aug 2021 03:13:37 +0000 (20:13 -0700)
committer	John Wolfe <jwolfe@vmware.com>
	Tue, 24 Aug 2021 03:13:37 +0000 (20:13 -0700)
open-vm-tools/lib/misc/Makefile.am		patch \| blob \| blame \| history
open-vm-tools/lib/misc/jsonUTF8.c	[new file with mode: 0644]	patch \| blob
open-vm-tools/services/plugins/appInfo/appInfo.c		patch \| blob \| blame \| history