From: John Wolfe <jwolfe@vmware.com>
Date: Tue, 12 Jul 2022 16:56:01 +0000 (-0700)
Subject: Escape all control characters in JSON.
X-Git-Tag: stable-12.1.0~33
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=e5e9a9ba31791d4f10e7dd75a8dbab179926511f;p=thirdparty%2Fopen-vm-tools.git

Escape all control characters in JSON.

Update CodeSet_JsonEscape to escape all control characters between
U+0000 and U+001F as required by the JSON standard.  Delete
CodeSet_Utf8Escape as it is no longer used.  Also update datasets
in-guest API tests accordingly.
---

diff --git a/open-vm-tools/lib/include/codeset.h b/open-vm-tools/lib/include/codeset.h
index af9f8b752..9a93f1fec 100644
--- a/open-vm-tools/lib/include/codeset.h
+++ b/open-vm-tools/lib/include/codeset.h
@@ -394,14 +394,6 @@ Bool CodeSet_IsStringValidUTF8(const char *string);  // IN:
 Bool CodeSet_IsValidUTF8String(const char *bufIn,  // IN:
                                size_t sizeIn);     // IN:
 
-typedef struct {
-   char c;
-   char *escape;
-} CodeSetEscapeEntry;
-
-char *CodeSet_Utf8Escape(const char *utf8,                    // IN:
-                         const CodeSetEscapeEntry *entries);  // IN:
-
 char *CodeSet_JsonEscape(const char *utf8);  // IN:
 
 char *CodeSet_JsonUnescape(const char *utf8);  // IN:
diff --git a/open-vm-tools/lib/misc/jsonUTF8.c b/open-vm-tools/lib/misc/jsonUTF8.c
index 816bca795..722897147 100644
--- a/open-vm-tools/lib/misc/jsonUTF8.c
+++ b/open-vm-tools/lib/misc/jsonUTF8.c
@@ -1,5 +1,5 @@
 /*********************************************************
- * Copyright (C) 2020-2021 VMware, Inc. All rights reserved.
+ * Copyright (C) 2020-2022 VMware, Inc. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU Lesser General Public License as published
@@ -21,55 +21,42 @@
 #include "codeset.h"
 #include "vm_ctype.h"
 #include "dynbuf.h"
+#include "strutil.h"
 #include "unicodeBase.h"
 
 
 /*
  *-----------------------------------------------------------------------------
  *
- * CodeSetFindEscape --
+ * CodeSet_JsonEscape --
  *
- *      Is there an escape for the specified character?
+ *      Escape a unicode string following JSON rules.
  *
- *      The last entry in the characters to be escaped entry must have
- *      a 'c' on '\0' and an 'escape' of NULL.
+ *      From https://www.rfc-editor.org/rfc/rfc8259.html#section-7:
  *
- * Results:
- *      NULL No.
- *     !NULL Yes. Pointer to the escape entry for the specified character.
- *
- * Side effects:
- *      None
+ *      ... All Unicode characters may be placed within the
+ *      quotation marks, except for the characters that MUST be escaped:
+ *      quotation mark, reverse solidus, and the control characters (U+0000
+ *      through U+001F).
  *
- *-----------------------------------------------------------------------------
- */
-
-static const CodeSetEscapeEntry *
-CodeSetFindEscape(char c,                             // IN:
-                  const CodeSetEscapeEntry *entries)  // IN:
-{
-   const CodeSetEscapeEntry *e;
-
-   for (e = entries; e->escape != NULL; e++) {
-      if (c == e->c) {
-         return e;
-      }
-   }
-
-   return NULL;
-}
-
-
-/*
- *-----------------------------------------------------------------------------
+ *      ... If the character is in the Basic
+ *      Multilingual Plane (U+0000 through U+FFFF), then it may be
+ *      represented as a six-character sequence: a reverse solidus, followed
+ *      by the lowercase letter u, followed by four hexadecimal digits that
+ *      encode the character's code point....
  *
- * CodeSet_Utf8Escape --
+ *      Alternatively, there are two-character sequence escape
+ *      representations of some popular characters.  So, for example, a
+ *      string containing only a single reverse solidus character may be
+ *      represented more compactly as "\\"
  *
- *      Escape the ASCII characters specified by the escape entries
- *      within a UTF8 string.
+ *      ...
  *
- *      The last entry in the characters to be escaped entry must have
- *      a 'c' on '\0' and an 'escape' of NULL.
+ *                  %x62 /          ; b    backspace       U+0008
+ *                  %x66 /          ; f    form feed       U+000C
+ *                  %x6E /          ; n    line feed       U+000A
+ *                  %x72 /          ; r    carriage return U+000D
+ *                  %x74 /          ; t    tab             U+0009
  *
  * Results:
  *      NULL Failure!
@@ -83,8 +70,7 @@ CodeSetFindEscape(char c,                             // IN:
  */
 
 char *
-CodeSet_Utf8Escape(const char *utf8,                   // IN:
-                   const CodeSetEscapeEntry *entries)  // IN:
+CodeSet_JsonEscape(const char *utf8)                   // IN:
 {
    DynBuf b;
    char *res;
@@ -110,18 +96,35 @@ CodeSet_Utf8Escape(const char *utf8,                   // IN:
          break;
       }
 
-      if (len == 1) {  // ASCII
-         const CodeSetEscapeEntry *e = CodeSetFindEscape(*p, entries);
-
-         if (e == NULL) {
-            DynBuf_Append(&b, p, len);
-         } else {
-            DynBuf_Append(&b, e->escape, strlen(e->escape));
+      if (len > 1 || (*p > 0x001F && *p != '"' && *p != '\\')) {
+         DynBuf_SafeAppend(&b, p, len);
+      } else {  // '"', '\\' or a control character: must be escaped
+         DynBuf_SafeAppend(&b, "\\", 1);
+         switch (*p) {
+         case '"':
+         case '\\':
+            DynBuf_SafeAppend(&b, p, 1);
+            break;
+         case '\b':
+            DynBuf_SafeAppend(&b, "b", 1);
+            break;
+         case '\f':
+            DynBuf_SafeAppend(&b, "f", 1);
+            break;
+         case '\n':
+            DynBuf_SafeAppend(&b, "n", 1);
+            break;
+         case '\r':
+            DynBuf_SafeAppend(&b, "r", 1);
+            break;
+         case '\t':
+            DynBuf_SafeAppend(&b, "t", 1);
+            break;
+         default:  // no short form for this character: emit \u00XX
+            StrUtil_SafeDynBufPrintf(&b, "u%04x", *p);
+            break;
          }
-      } else {  // All others
-         DynBuf_Append(&b, p, len);
       }
-
       p += len;
    }
 
@@ -137,50 +140,6 @@ CodeSet_Utf8Escape(const char *utf8,                   // IN:
 }
 
 
-/*
- *-----------------------------------------------------------------------------
- *
- * CodeSet_JsonEscape --
- *
- *      Escape a unicode string following JSON rules.
- *
- *      Backspace       (\b)
- *      Form Feed       (\f)
- *      Line Feed       (\n)
- *      Carriage Return (\r)
- *      Tab             (\t)
- *      Backslash       (\)
- *      Double Quote    (")
- *
- * Results:
- *      NULL Failure!
- *     !NULL Success! The escaped string. The caller is responsible to free
- *                    this.
- *
- * Side effects:
- *      Memory is allocated
- *
- *-----------------------------------------------------------------------------
- */
-
-char *
-CodeSet_JsonEscape(const char *utf8)  // IN:
-{
-   static const CodeSetEscapeEntry JsonEscapes[] = {
-      { '\b', "\\b"  },
-      { '\f', "\\f"  },
-      { '\n', "\\n"  },
-      { '\r', "\\r"  },
-      { '\t', "\\t"  },
-      { '\\', "\\\\" },
-      { '\"', "\\\"" },
-      { '\0', NULL   }   // MUST BE LAST
-   };
-
-   return CodeSet_Utf8Escape(utf8, JsonEscapes);
-}
-
-
 /* Constants used by json unescape routines. */
 
 /* Number of hex digits in a "\u" escape sequence. */
@@ -417,32 +376,32 @@ CodeSet_JsonUnescapeOne(const char *p,        // IN:
        * end up in the default case of the switch and fail.
        */
       switch (*p) {
-         case '\"':
-         case '\\':
-         case '/':
-            outBuf[0] = *p;
-            break;
-         case 'b':
-            outBuf[0] = '\b';
-            break;
-         case 'f':
-            outBuf[0] = '\f';
-            break;
-         case 'r':
-            outBuf[0] = '\r';
-            break;
-         case 'n':
-            outBuf[0] = '\n';
-            break;
-         case 't':
-            outBuf[0] = '\t';
-            break;
-         case 'u':
-            len = CodeSet_JsonUnescapeU(start, end, outBuf);
-            break;
-         default:
-            len = 0;
-            break;
+      case '\"':
+      case '\\':
+      case '/':
+         outBuf[0] = *p;
+         break;
+      case 'b':
+         outBuf[0] = '\b';
+         break;
+      case 'f':
+         outBuf[0] = '\f';
+         break;
+      case 'r':
+         outBuf[0] = '\r';
+         break;
+      case 'n':
+         outBuf[0] = '\n';
+         break;
+      case 't':
+         outBuf[0] = '\t';
+         break;
+      case 'u':
+         len = CodeSet_JsonUnescapeU(start, end, outBuf);
+         break;
+      default:
+         len = 0;
+         break;
       }
    }
    return len;
@@ -503,9 +462,9 @@ CodeSet_JsonUnescape(const char *utf8)   // IN:
       if (len == 0) {
          success = FALSE;
       } else if (len > 1 || *p != '\\') {
-         DynBuf_Append(&b, p, len);
+         DynBuf_SafeAppend(&b, p, len);
       } else if ((len = CodeSet_JsonUnescapeOne(p, end, unescaped)) != 0) {
-         DynBuf_Append(&b, unescaped, strlen(unescaped));
+         DynBuf_SafeAppend(&b, unescaped, strlen(unescaped));
       } else {
          success = FALSE;
       }