From: Mike Stepanek (mstepane) <mstepane@cisco.com>
Date: Wed, 30 Mar 2022 16:03:01 +0000 (+0000)
Subject: Pull request #3326: JSN: decode String.fromCodePoint() JavaScript function
X-Git-Tag: 3.1.27.0~9
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=93e521a6bfe382c160d0fc73c52fe27fbb803e2b;p=thirdparty%2Fsnort3.git

Pull request #3326: JSN: decode String.fromCodePoint() JavaScript function

Merge in SNORT/snort3 from ~OSERHIIE/snort3:js_from_code_point to master

Squashed commit of the following:

commit a4e3c6cad84181fb907ccafec6e4941e4611a927
Author: Oleksandr Serhiienko <oserhiie@cisco.com>
Date:   Mon Mar 28 13:34:04 2022 +0300

    http_inspect: decode String.fromCodePoint() JavaScript function

        * utils: add support for supplementary characters in JS Normalizer
        * utils: add tracking and decoding of String.fromCodePoint() JavaScript
        function in JS Normalizer
        * utils: add unit test coverage
        * http_inspect: update dev notes
        * doc: update user manual
---

diff --git a/doc/user/http_inspect.txt b/doc/user/http_inspect.txt
index 02bd93adf..cf7dcaf2e 100755
--- a/doc/user/http_inspect.txt
+++ b/doc/user/http_inspect.txt
@@ -80,7 +80,8 @@ and identifiers normalizer. Normalizer concatenates string literals whenever
 it's possible to do. This also works with any other normalizations that result
 in string literals. All JavaScript identifier names, except those from
 the ignore list, will be substituted with unified names in the following
-format: var_0000 -> var_ffff. The Normalizer tries to expand an escaped text,
+format: var_0000 -> var_ffff. However, the names of unescape-like functions are removed
+from the normalized data. The Normalizer tries to expand escaped text,
 so it will appear in a usual form in the output. Moreover, Normalizer validates
 the syntax concerning ECMA-262 Standard, including scope tracking and restrictions
 for script elements. For more information on how additionally configure
diff --git a/src/service_inspectors/http_inspect/dev_notes.txt b/src/service_inspectors/http_inspect/dev_notes.txt
index eb69bbe30..997d4afb5 100755
--- a/src/service_inspectors/http_inspect/dev_notes.txt
+++ b/src/service_inspectors/http_inspect/dev_notes.txt
@@ -258,19 +258,20 @@ For example:
     a("hello") // will be substituted to 'console.log("hello")'
 
 In addition to the scope tracking, JS Normalizer specifically tracks unicode unescape
-functions(unescape, decodeURI, decodeURIComponent, String.fromCharCode). This allows detection of
-unescape functions nested within other unescape functions, which is a potential
-indicator of a multilevel obfuscation. The definition of a function call depends on
+functions(unescape, decodeURI, decodeURIComponent, String.fromCharCode, String.fromCodePoint).
+This allows detection of unescape functions nested within other unescape functions, which is
+a potential indicator of a multilevel obfuscation. The definition of a function call depends on
 identifier substitution, so such identifiers must be included in the ignore list in
 order to use this feature. After determining the unescape sequence, it is decoded into the
-corresponding string.
+corresponding string, and the name of the unescape function does not appear in the output.
 
 For example:
 
-   unescape('\u0062\u0061\u0072')        -> 'bar'
-   decodeURI('%62%61%72')                -> 'bar'
-   decodeURIComponent('\x62\x61\x72')    -> 'bar'
-   String.fromCharCode(98, 0x0061, 0x72) -> 'bar'
+   unescape('\u0062\u0061\u0072')              -> 'bar'
+   decodeURI('%62%61%72')                      -> 'bar'
+   decodeURIComponent('\x62\x61\x72')          -> 'bar'
+   String.fromCharCode(98, 0x0061, 0x72)       -> 'bar'
+   String.fromCodePoint(65600, 65601, 0x10042) -> '𐁀𐁁𐁂'
 
 Supported formats follow
 
diff --git a/src/utils/js_tokenizer.h b/src/utils/js_tokenizer.h
index e7b84d7ed..21a1fd725 100644
--- a/src/utils/js_tokenizer.h
+++ b/src/utils/js_tokenizer.h
@@ -364,12 +364,13 @@ private:
         FuncType type;
     };
 
-    const std::array<FunctionIdentifier, 4> function_identifiers
+    const std::array<FunctionIdentifier, 5> function_identifiers
     {{
-        {"unescape",            FuncType::UNESCAPE  },
-        {"decodeURI",           FuncType::UNESCAPE  },
-        {"decodeURIComponent",  FuncType::UNESCAPE  },
-        {"String.fromCharCode", FuncType::CHAR_CODE }
+        {"unescape",             FuncType::UNESCAPE },
+        {"decodeURI",            FuncType::UNESCAPE },
+        {"decodeURIComponent",   FuncType::UNESCAPE },
+        {"String.fromCharCode",  FuncType::CHAR_CODE},
+        {"String.fromCodePoint", FuncType::CHAR_CODE}
     }};
 
     const uint32_t max_bracket_depth;
diff --git a/src/utils/js_tokenizer.l b/src/utils/js_tokenizer.l
index da6c8bf15..263a91786 100644
--- a/src/utils/js_tokenizer.l
+++ b/src/utils/js_tokenizer.l
@@ -1237,8 +1237,15 @@ static std::string unicode_to_utf8(const unsigned int code)
         res += 0x80 | ((code >> 6) & 0x3f);
         res += 0x80 | (code & 0x3f);
     }
+    else if (code <= 0x1fffff)
+    {
+        res += 0xf0 | (code >> 18);
+        res += 0x80 | ((code >> 12) & 0x3f);
+        res += 0x80 | ((code >> 6) & 0x3f);
+        res += 0x80 | (code & 0x3f);
+    }
     else
-        res += "\uffff";
+        res += "\xf7\xbf\xbf\xbf";    // UTF-8 sequence for hex 0x1fffff
 
     return res;
 }
diff --git a/src/utils/test/js_unescape_test.cc b/src/utils/test/js_unescape_test.cc
index 3c8d29ccf..ee58b1137 100644
--- a/src/utils/test/js_unescape_test.cc
+++ b/src/utils/test/js_unescape_test.cc
@@ -276,7 +276,11 @@ TEST_CASE("Sequence parsing", "[JSNormalizer]")
         );
         test_normalization(
             "String.fromCharCode(65536)",
-            "'\uffff'"
+            "'\xf0\x90\x80\x80'"
+        );
+        test_normalization(
+            "String.fromCodePoint(2097152)",
+            "'\xf7\xbf\xbf\xbf'"
         );
     }
 
@@ -315,12 +319,20 @@ TEST_CASE("Sequence parsing", "[JSNormalizer]")
             "'\ueEfF'"
         );
         test_normalization(
-            "String.fromCharCode(0x10000)",
-            "'\uffff'"
+            "String.fromCodePoint(0x10000)",
+            "'\xf0\x90\x80\x80'"
         );
         test_normalization(
             "String.fromCharCode(0X10000)",
-            "'\uffff'"
+            "'\xf0\x90\x80\x80'"
+        );
+        test_normalization(
+            "String.fromCodePoint(0x200000)",
+            "'\xf7\xbf\xbf\xbf'"
+        );
+        test_normalization(
+            "String.fromCodePoint(0X200000)",
+            "'\xf7\xbf\xbf\xbf'"
         );
     }
 }
@@ -825,6 +837,63 @@ TEST_CASE("String.fromCharCode()", "[JSNormalizer]")
     }
 }
 
+TEST_CASE("String.fromCodePoint()", "[JSNormalizer]")  // each String.fromCodePoint(...) call below is expected to normalize to one string literal
+{
+    SECTION("decimal")  // arguments written as decimal literals
+    {
+        test_normalization(
+            "String.fromCodePoint(98, 97, 114)",
+            "'bar'"
+        );
+
+        test_normalization(
+            "String.fromCodePoint(65600, 65601, 65602)",  // 65600 == 0x10040, i.e. above 0xffff
+            "'\xf0\x90\x81\x80\xf0\x90\x81\x81\xf0\x90\x81\x82'"  // three 4-byte UTF-8 sequences
+        );
+    }
+
+    SECTION("hexadecimal")  // arguments written as 0x-prefixed literals
+    {
+        test_normalization(
+            "String.fromCodePoint(0x62, 0x61, 0x72)",
+            "'bar'"
+        );
+
+        test_normalization(
+            "String.fromCodePoint(0x00000062, 0x00000061, 0x00000072)",  // leading zeros must not change the decoded value
+            "'bar'"
+        );
+
+        test_normalization(
+            "String.fromCodePoint(0x10040, 0x10041, 0x10042)",  // same code points as the decimal case above
+            "'\xf0\x90\x81\x80\xf0\x90\x81\x81\xf0\x90\x81\x82'"
+        );
+    }
+
+    SECTION("mixed sequence")  // decimal and hexadecimal arguments within the same call
+    {
+        test_normalization_mixed_encoding(
+            "String.fromCodePoint(98, 97, 0x72)",
+            "'bar'"
+        );
+
+        test_normalization_mixed_encoding(
+            "String.fromCodePoint(0x00000062, 97, 114)",
+            "'bar'"
+        );
+
+        test_normalization_mixed_encoding(
+            "String.fromCodePoint(65600, 0x10041, 65602)",
+            "'\xf0\x90\x81\x80\xf0\x90\x81\x81\xf0\x90\x81\x82'"
+        );
+
+        test_normalization_mixed_encoding(
+            "String.fromCodePoint(0x10040, 65601, 0x10042)",
+            "'\xf0\x90\x81\x80\xf0\x90\x81\x81\xf0\x90\x81\x82'"
+        );
+    }
+}
+
 TEST_CASE("Split", "[JSNormalizer]")
 {
     SECTION("unescape()")
@@ -1063,6 +1132,61 @@ TEST_CASE("Split", "[JSNormalizer]")
             { "114)", "'bar'" }
         });
     }
+
+    SECTION("String.fromCodePoint()")  // the call is fed in several consecutive pieces
+    {
+        test_normalization({  // NOTE(review): each pair looks like {input piece, expected output so far} - confirm against the helper
+            { "String.fromCodePoint(", "'" },
+            { ")", "''" }  // no arguments: an empty string literal is expected
+        });
+
+        test_normalization({
+            { "String.fromCodePoint(9", "'\u0009" },  // piece ends inside the number 98; code point 9 is expected at this point
+            { "8, 97, 114)", "'bar'" }  // the trailing digit completes 98, so the final result is 'bar'
+        });
+
+        test_normalization({
+            { "String.fromCodePoint(98,", "'b" },  // split right after a comma
+            { "97, 114)", "'bar'" }
+        });
+
+        test_normalization({
+            { "String.fromCodePoint(98, 97", "'ba" },  // split right before a comma
+            { ",114)", "'bar'" }
+        });
+
+        test_normalization({
+            { "String.fromCodePoint(98, 97, 114", "'bar" },  // split right before the closing parenthesis
+            { ")", "'bar'" }
+        });
+
+        test_normalization({
+            { "String.fromCodePoint(0x0062", "'b" },  // hexadecimal variants of the splits above
+            { ",0x0061, 0x0072)", "'bar'" }
+        });
+
+        test_normalization({
+            { "String.fromCodePoint(0x00000062, 0x00000061", "'ba" },
+            { ", 0x0072)", "'bar'" }
+        });
+
+        test_normalization({
+            { "String.fromCodePoint(0x00000062, 0x00000061, 0x00000072", "'bar" },
+            { ")", "'bar'" }
+        });
+
+        test_normalization({  // three pieces, hexadecimal arguments
+            { "String.fromCodePoint(0x00000062,", "'b" },
+            { "0x00000061,", "'ba" },
+            { "0x72)",   "'bar'" }
+        });
+
+        test_normalization({  // three pieces, decimal arguments
+            { "String.fromCodePoint(98,", "'b" },
+            { "97,", "'ba" },
+            { "114)", "'bar'" }
+        });
+    }
 }
 
 TEST_CASE("Mixed input", "[JSNormalizer]")
@@ -1109,6 +1233,10 @@ TEST_CASE("Mixed input", "[JSNormalizer]")
             "String.fromCharCode (114, 0x72, eval('123'), 114, 0x72) ;",
             "'rr' eval('123'),114,0x72;"
         );
+        test_normalization_mixed_encoding(
+            "String.fromCodePoint (114, 0x00000072, eval('123'), 114, 0x00000072) ;",
+            "'rr' eval('123'),114,0x00000072;"
+        );
     }
 
     SECTION("comment")
@@ -1125,6 +1253,18 @@ TEST_CASE("Mixed input", "[JSNormalizer]")
             "String.fromCharCode(0x62, \r 0x61, <!-- HTML comment \r 0x72) ;",
             "'bar';"
         );
+        test_normalization(
+            "String.fromCodePoint(0x00000062, \n 0x00000061, // comment \n 0x00000072) ;",
+            "'bar';"
+        );
+        test_normalization(
+            "String.fromCodePoint(0x00000062, \t 0x00000061, /* comment */ 0x00000072) ;",
+            "'bar';"
+        );
+        test_normalization(
+            "String.fromCodePoint(0x00000062, \r 0x00000061, <!-- HTML comment \r 0x00000072) ;",
+            "'bar';"
+        );
     }
 
     SECTION("nested")
@@ -1137,6 +1277,14 @@ TEST_CASE("Mixed input", "[JSNormalizer]")
             "document.write(unescape('%62%61%72')) ;",
             "document.write('bar');"
         );
+        test_normalization(
+            "String.fromCodePoint(0x0062, 0x0061, String.fromCharCode(0x0062, 0x0061, 0x0072));",
+            "'ba' 'bar';"
+        );
+        test_normalization(
+            "String.fromCharCode(0x0062, 0x0061, String.fromCodePoint(0x0062, 0x0061, 0x0072));",
+            "'ba' 'bar';"
+        );
     }
 }