Pull request #3326: JSN: decode String.fromCodePoint() JavaScript function

author Mike Stepanek (mstepane) <mstepane@cisco.com>

Wed, 30 Mar 2022 16:03:01 +0000 (16:03 +0000)

committer Mike Stepanek (mstepane) <mstepane@cisco.com>

Wed, 30 Mar 2022 16:03:01 +0000 (16:03 +0000)
author Mike Stepanek (mstepane) <mstepane@cisco.com>
Wed, 30 Mar 2022 16:03:01 +0000 (16:03 +0000)
committer Mike Stepanek (mstepane) <mstepane@cisco.com>
Wed, 30 Mar 2022 16:03:01 +0000 (16:03 +0000)
diff --git a/doc/user/http_inspect.txt b/doc/user/http_inspect.txt

index 02bd93adfb5fa328ed779562bca90a825c1671d2..cf7dcaf2eb5f3c7cbc1e1ccc55aa07ad7b0cc7bc 100755 (executable)
--- a/doc/user/http_inspect.txt
+++ b/doc/user/http_inspect.txt
@@ -80,7 +80,8 @@ and identifiers normalizer. Normalizer concatenates string literals whenever
  it's possible to do. This also works with any other normalizations that result
  in string literals. All JavaScript identifier names, except those from
  the ignore list, will be substituted with unified names in the following
-format: var_0000 -> var_ffff. The Normalizer tries to expand an escaped text,
+format: var_0000 -> var_ffff. However, the names of unescape-like functions are removed
+from the normalized data. The Normalizer tries to expand an escaped text,
  so it will appear in a usual form in the output. Moreover, Normalizer validates
  the syntax concerning ECMA-262 Standard, including scope tracking and restrictions
  for script elements. For more information on how additionally configure
diff --git a/src/service_inspectors/http_inspect/dev_notes.txt b/src/service_inspectors/http_inspect/dev_notes.txt

index eb69bbe30d5ad75964235cc1e2d428e5d780b119..997d4afb502cd4660733aa133de97951c8804310 100755 (executable)
--- a/src/service_inspectors/http_inspect/dev_notes.txt
+++ b/src/service_inspectors/http_inspect/dev_notes.txt
@@ -258,19 +258,20 @@ For example:
      a("hello") // will be substituted to 'console.log("hello")'
  
  In addition to the scope tracking, JS Normalizer specifically tracks unicode unescape
-functions(unescape, decodeURI, decodeURIComponent, String.fromCharCode). This allows detection of
-unescape functions nested within other unescape functions, which is a potential
-indicator of a multilevel obfuscation. The definition of a function call depends on
+functions (unescape, decodeURI, decodeURIComponent, String.fromCharCode, String.fromCodePoint).
+This allows detection of unescape functions nested within other unescape functions, which is
+a potential indicator of a multilevel obfuscation. The definition of a function call depends on
  identifier substitution, so such identifiers must be included in the ignore list in
  order to use this feature. After determining the unescape sequence, it is decoded into the
-corresponding string.
+corresponding string, and the name of the unescape function will not be present in the output.
  
  For example:
  
-   unescape('\u0062\u0061\u0072')        -> 'bar'
-   decodeURI('%62%61%72')                -> 'bar'
-   decodeURIComponent('\x62\x61\x72')    -> 'bar'
-   String.fromCharCode(98, 0x0061, 0x72) -> 'bar'
+   unescape('\u0062\u0061\u0072')              -> 'bar'
+   decodeURI('%62%61%72')                      -> 'bar'
+   decodeURIComponent('\x62\x61\x72')          -> 'bar'
+   String.fromCharCode(98, 0x0061, 0x72)       -> 'bar'
+   String.fromCodePoint(65600, 65601, 0x10042) -> '𐁀𐁁𐁂'
  
  Supported formats follow
  
diff --git a/src/utils/js_tokenizer.h b/src/utils/js_tokenizer.h

index e7b84d7ed1c097c27d0783424a9cb4a0d0938ff1..21a1fd7258d6fdfb76505165b084c340ab5d75c7 100644 (file)
--- a/src/utils/js_tokenizer.h
+++ b/src/utils/js_tokenizer.h
@@ -364,12 +364,13 @@ private:
          FuncType type;
      };
  
-    const std::array<FunctionIdentifier, 4> function_identifiers
+    const std::array<FunctionIdentifier, 5> function_identifiers
      {{
-        {"unescape",            FuncType::UNESCAPE  },
-        {"decodeURI",           FuncType::UNESCAPE  },
-        {"decodeURIComponent",  FuncType::UNESCAPE  },
-        {"String.fromCharCode", FuncType::CHAR_CODE }
+        {"unescape",             FuncType::UNESCAPE },
+        {"decodeURI",            FuncType::UNESCAPE },
+        {"decodeURIComponent",   FuncType::UNESCAPE },
+        {"String.fromCharCode",  FuncType::CHAR_CODE},
+        {"String.fromCodePoint", FuncType::CHAR_CODE}
      }};
  
      const uint32_t max_bracket_depth;
diff --git a/src/utils/js_tokenizer.l b/src/utils/js_tokenizer.l

index da6c8bf1582c2857e6c87fea63600b80dd1f5f26..263a917860911a37d5467e4062775c2046e5e4b7 100644 (file)
--- a/src/utils/js_tokenizer.l
+++ b/src/utils/js_tokenizer.l
@@ -1237,8 +1237,15 @@ static std::string unicode_to_utf8(const unsigned int code)
          res += 0x80 | ((code >> 6) & 0x3f);
          res += 0x80 | (code & 0x3f);
      }
+    else if (code <= 0x1fffff)
+    {
+        res += 0xf0 | (code >> 18);
+        res += 0x80 | ((code >> 12) & 0x3f);
+        res += 0x80 | ((code >> 6) & 0x3f);
+        res += 0x80 | (code & 0x3f);
+    }
      else
-        res += "\uffff";
+        res += "\xf7\xbf\xbf\xbf";    // UTF-8 sequence for hex 0x1fffff
  
      return res;
  }
diff --git a/src/utils/test/js_unescape_test.cc b/src/utils/test/js_unescape_test.cc

index 3c8d29ccf957a681fb23d04c0192860f41e738e1..ee58b113774130c9bdc10211e3ce8e6570cdcb44 100644 (file)
--- a/src/utils/test/js_unescape_test.cc
+++ b/src/utils/test/js_unescape_test.cc
@@ -276,7 +276,11 @@ TEST_CASE("Sequence parsing", "[JSNormalizer]")
          );
          test_normalization(
              "String.fromCharCode(65536)",
-            "'\uffff'"
+            "'\xf0\x90\x80\x80'"
+        );
+        test_normalization(
+            "String.fromCodePoint(2097152)",
+            "'\xf7\xbf\xbf\xbf'"
          );
      }
  
@@ -315,12 +319,20 @@ TEST_CASE("Sequence parsing", "[JSNormalizer]")
              "'\ueEfF'"
          );
          test_normalization(
-            "String.fromCharCode(0x10000)",
-            "'\uffff'"
+            "String.fromCodePoint(0x10000)",
+            "'\xf0\x90\x80\x80'"
          );
          test_normalization(
              "String.fromCharCode(0X10000)",
-            "'\uffff'"
+            "'\xf0\x90\x80\x80'"
+        );
+        test_normalization(
+            "String.fromCodePoint(0x200000)",
+            "'\xf7\xbf\xbf\xbf'"
+        );
+        test_normalization(
+            "String.fromCodePoint(0X200000)",
+            "'\xf7\xbf\xbf\xbf'"
          );
      }
  }
@@ -825,6 +837,63 @@ TEST_CASE("String.fromCharCode()", "[JSNormalizer]")
      }
  }
  
+TEST_CASE("String.fromCodePoint()", "[JSNormalizer]")
+{
+    SECTION("decimal")
+    {
+        test_normalization(
+            "String.fromCodePoint(98, 97, 114)",
+            "'bar'"
+        );
+
+        test_normalization(
+            "String.fromCodePoint(65600, 65601, 65602)",
+            "'\xf0\x90\x81\x80\xf0\x90\x81\x81\xf0\x90\x81\x82'"
+        );
+    }
+
+    SECTION("hexadecimal")
+    {
+        test_normalization(
+            "String.fromCodePoint(0x62, 0x61, 0x72)",
+            "'bar'"
+        );
+
+        test_normalization(
+            "String.fromCodePoint(0x00000062, 0x00000061, 0x00000072)",
+            "'bar'"
+        );
+
+        test_normalization(
+            "String.fromCodePoint(0x10040, 0x10041, 0x10042)",
+            "'\xf0\x90\x81\x80\xf0\x90\x81\x81\xf0\x90\x81\x82'"
+        );
+    }
+
+    SECTION("mixed sequence")
+    {
+        test_normalization_mixed_encoding(
+            "String.fromCodePoint(98, 97, 0x72)",
+            "'bar'"
+        );
+
+        test_normalization_mixed_encoding(
+            "String.fromCodePoint(0x00000062, 97, 114)",
+            "'bar'"
+        );
+
+        test_normalization_mixed_encoding(
+            "String.fromCodePoint(65600, 0x10041, 65602)",
+            "'\xf0\x90\x81\x80\xf0\x90\x81\x81\xf0\x90\x81\x82'"
+        );
+
+        test_normalization_mixed_encoding(
+            "String.fromCodePoint(0x10040, 65601, 0x10042)",
+            "'\xf0\x90\x81\x80\xf0\x90\x81\x81\xf0\x90\x81\x82'"
+        );
+    }
+}
+
  TEST_CASE("Split", "[JSNormalizer]")
  {
      SECTION("unescape()")
@@ -1063,6 +1132,61 @@ TEST_CASE("Split", "[JSNormalizer]")
              { "114)", "'bar'" }
          });
      }
+
+    SECTION("String.fromCodePoint()")
+    {
+        test_normalization({
+            { "String.fromCodePoint(", "'" },
+            { ")", "''" }
+        });
+
+        test_normalization({
+            { "String.fromCodePoint(9", "'\u0009" },
+            { "8, 97, 114)", "'bar'" }
+        });
+
+        test_normalization({
+            { "String.fromCodePoint(98,", "'b" },
+            { "97, 114)", "'bar'" }
+        });
+
+        test_normalization({
+            { "String.fromCodePoint(98, 97", "'ba" },
+            { ",114)", "'bar'" }
+        });
+
+        test_normalization({
+            { "String.fromCodePoint(98, 97, 114", "'bar" },
+            { ")", "'bar'" }
+        });
+
+        test_normalization({
+            { "String.fromCodePoint(0x0062", "'b" },
+            { ",0x0061, 0x0072)", "'bar'" }
+        });
+
+        test_normalization({
+            { "String.fromCodePoint(0x00000062, 0x00000061", "'ba" },
+            { ", 0x0072)", "'bar'" }
+        });
+
+        test_normalization({
+            { "String.fromCodePoint(0x00000062, 0x00000061, 0x00000072", "'bar" },
+            { ")", "'bar'" }
+        });
+
+        test_normalization({
+            { "String.fromCodePoint(0x00000062,", "'b" },
+            { "0x00000061,", "'ba" },
+            { "0x72)",   "'bar'" }
+        });
+
+        test_normalization({
+            { "String.fromCodePoint(98,", "'b" },
+            { "97,", "'ba" },
+            { "114)", "'bar'" }
+        });
+    }
  }
  
  TEST_CASE("Mixed input", "[JSNormalizer]")
@@ -1109,6 +1233,10 @@ TEST_CASE("Mixed input", "[JSNormalizer]")
              "String.fromCharCode (114, 0x72, eval('123'), 114, 0x72) ;",
              "'rr' eval('123'),114,0x72;"
          );
+        test_normalization_mixed_encoding(
+            "String.fromCodePoint (114, 0x00000072, eval('123'), 114, 0x00000072) ;",
+            "'rr' eval('123'),114,0x00000072;"
+        );
      }
  
      SECTION("comment")
@@ -1125,6 +1253,18 @@ TEST_CASE("Mixed input", "[JSNormalizer]")
              "String.fromCharCode(0x62, \r 0x61, <!-- HTML comment \r 0x72) ;",
              "'bar';"
          );
+        test_normalization(
+            "String.fromCodePoint(0x00000062, \n 0x00000061, // comment \n 0x00000072) ;",
+            "'bar';"
+        );
+        test_normalization(
+            "String.fromCodePoint(0x00000062, \t 0x00000061, /* comment */ 0x00000072) ;",
+            "'bar';"
+        );
+        test_normalization(
+            "String.fromCodePoint(0x00000062, \r 0x00000061, <!-- HTML comment \r 0x00000072) ;",
+            "'bar';"
+        );
      }
  
      SECTION("nested")
@@ -1137,6 +1277,14 @@ TEST_CASE("Mixed input", "[JSNormalizer]")
              "document.write(unescape('%62%61%72')) ;",
              "document.write('bar');"
          );
+        test_normalization(
+            "String.fromCodePoint(0x0062, 0x0061, String.fromCharCode(0x0062, 0x0061, 0x0072));",
+            "'ba' 'bar';"
+        );
+        test_normalization(
+            "String.fromCharCode(0x0062, 0x0061, String.fromCodePoint(0x0062, 0x0061, 0x0072));",
+            "'ba' 'bar';"
+        );
      }
  }
author	Mike Stepanek (mstepane) <mstepane@cisco.com>
	Wed, 30 Mar 2022 16:03:01 +0000 (16:03 +0000)
committer	Mike Stepanek (mstepane) <mstepane@cisco.com>
	Wed, 30 Mar 2022 16:03:01 +0000 (16:03 +0000)
doc/user/http_inspect.txt		patch \| blob \| blame \| history
src/service_inspectors/http_inspect/dev_notes.txt		patch \| blob \| blame \| history
src/utils/js_tokenizer.h		patch \| blob \| blame \| history
src/utils/js_tokenizer.l		patch \| blob \| blame \| history
src/utils/test/js_unescape_test.cc		patch \| blob \| blame \| history