From: Oleksii Shumeiko -X (oshumeik - SOFTSERVE INC at Cisco) Date: Thu, 4 Aug 2022 12:51:42 +0000 (+0000) Subject: Pull request #3537: JS Normalizer: Escaped JavaScript Identifiers X-Git-Tag: 3.1.39.0~5 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=6e538616fbafed6110ad7ad9d00742aa5d1c4cd4;p=thirdparty%2Fsnort3.git Pull request #3537: JS Normalizer: Escaped JavaScript Identifiers Merge in SNORT/snort3 from ~OSERHIIE/snort3:js_unescape_ident to master Squashed commit of the following: commit 2b192d53735b7f6b346c17581adc28c1ee395b56 Author: Oleksandr Serhiienko Date: Mon Aug 1 11:16:11 2022 +0300 utils: fix compilation warning [-Wcomma] commit ad2285d11ea0b1408937a7688179e7d65946031f Author: Oleksandr Serhiienko Date: Mon Aug 1 11:15:00 2022 +0300 utils: validate escaped JavaScript identifiers --- diff --git a/src/utils/js_identifier_ctx.cc b/src/utils/js_identifier_ctx.cc index b172d6494..a56bb05a0 100644 --- a/src/utils/js_identifier_ctx.cc +++ b/src/utils/js_identifier_ctx.cc @@ -75,7 +75,7 @@ static void init_norm_names() assert(sizeof(norm_names) == c - norm_names); } -static int _init_norm_names __attribute__((unused)) = (init_norm_names(), 0); +static int _init_norm_names __attribute__((unused)) = (static_cast<void>(init_norm_names()), 0); JSIdentifierCtx::JSIdentifierCtx(int32_t depth, uint32_t max_scope_depth, const std::unordered_set<std::string>& ignored_ids_list, diff --git a/src/utils/js_tokenizer.h b/src/utils/js_tokenizer.h index 18c8ce392..3bcb33cc4 100644 --- a/src/utils/js_tokenizer.h +++ b/src/utils/js_tokenizer.h @@ -331,6 +331,8 @@ private: JSIdentifierCtxBase& ident_ctx; size_t bytes_read; size_t tmp_bytes_read; + uint32_t tokens_read; + uint32_t tmp_tokens_read; bool ext_script; VStack regex_stack; diff --git a/src/utils/js_tokenizer.l b/src/utils/js_tokenizer.l index ca5821a12..61db2e741 100644 --- a/src/utils/js_tokenizer.l +++ b/src/utils/js_tokenizer.l @@ -1371,6 +1371,7 @@ JSTokenizer::JSTokenizer(std::istream& in, std::ostream&
out, max_template_nesting(max_template_nesting), ident_ctx(mapper), bytes_read(0), + tokens_read(0), tmp_buf(buf), tmp_buf_size(buf_size), tmp_cap_size(cap_size), @@ -1397,6 +1398,7 @@ void JSTokenizer::switch_to_temporal(const std::string& data) yy_switch_to_buffer((YY_BUFFER_STATE)tmp_buffer); tmp_bytes_read = bytes_read; + tmp_tokens_read = tokens_read; } void JSTokenizer::switch_to_initial() @@ -1406,6 +1408,7 @@ void JSTokenizer::switch_to_initial() tmp_buffer = nullptr; bytes_read = tmp_bytes_read; + tmp_tokens_read = tokens_read - tmp_tokens_read; } // A return value of this method uses to terminate the scanner @@ -1414,16 +1417,20 @@ void JSTokenizer::switch_to_initial() // The return value should be used to make a decision about yyterminate() call JSTokenizer::JSRet JSTokenizer::eval_eof() { - // If the temporal scan buffer reaches EOF, cleanup and - // continue with the initial one - if (tmp_buffer) - { - switch_to_initial(); - return EOS; - } + if (!tmp_buffer) + return SCRIPT_CONTINUE; + + switch_to_initial(); - // Normal termination - return SCRIPT_CONTINUE; + if (tmp_tokens_read != 1 or token != IDENTIFIER) + return BAD_TOKEN; + + // remove temporal buffer normalization state + memset((void*)(states + sp), 0, sizeof(states[0])); + --sp; + sp %= JSTOKENIZER_MAX_STATES; + + return EOS; } JSTokenizer::JSRet JSTokenizer::do_spacing(JSToken cur_token) @@ -1745,6 +1752,7 @@ void JSTokenizer::states_over() bool JSTokenizer::states_process() { bytes_read += yyleng; + ++tokens_read; // Fulfillment goes after this check only in case of split over several input scripts. // Otherwise, new state is pushed. 
@@ -2969,6 +2977,7 @@ JSTokenizer::JSRet JSTokenizer::process(size_t& bytes_in, bool external_script) bytes_in = std::max(bytes_read, bytes_in) - bytes_in; bytes_read = 0; + tokens_read = 0; return static_cast<JSRet>(r); } diff --git a/src/utils/test/js_unescape_test.cc b/src/utils/test/js_unescape_test.cc index 3b8cdb58a..ded5dbec3 100644 --- a/src/utils/test/js_unescape_test.cc +++ b/src/utils/test/js_unescape_test.cc @@ -798,6 +798,150 @@ TEST_CASE("String.fromCodePoint()", "[JSNormalizer]") } } +TEST_CASE("Identifiers", "[JSNormalizer]") +{ + SECTION("all patterns") + { + test_normalization( + "\\u0061", + "var_0000" + ); + test_normalization_bad( + "\\u0020", + "", + JSTokenizer::BAD_TOKEN + ); + + test_normalization( + "\\u{0061}", + "var_0000" + ); + test_normalization( + "\\u{061}", + "var_0000" + ); + test_normalization( + "\\u{61}", + "var_0000" + ); + test_normalization_bad( + "\\u{1}", + "\u0001", + JSTokenizer::BAD_TOKEN + ); + } + + SECTION("valid sequence") + { + test_normalization( + " \\u0061bc ;", + "var_0000;" + ); + test_normalization( + " a\\u0062c ;", + "var_0000;" + ); + test_normalization( + " ab\\u0063 ;", + "var_0000;" + ); + } + + SECTION("invalid sequence") + { + test_normalization_bad( + " \\u0020bc ;", + "var_0000", + JSTokenizer::BAD_TOKEN + ); + test_normalization_bad( + " a\\u0020c ;", + "var_0000 var_0001", + JSTokenizer::BAD_TOKEN + ); + test_normalization_bad( + " ab\\u0020 ;", + "var_0000", + JSTokenizer::BAD_TOKEN + ); + } + + SECTION("valid code point") + { + test_normalization( + " \\u{61}bc ;", + "var_0000;" + ); + test_normalization( + " a\\u{62}c ;", + "var_0000;" + ); + test_normalization( + " ab\\u{63} ;", + "var_0000;" + ); + } + + SECTION("invalid code point") + { + test_normalization_bad( + " \\u{20}bc ;", + "var_0000", + JSTokenizer::BAD_TOKEN + ); + test_normalization_bad( + " a\\u{20}c ;", + "var_0000 var_0001", + JSTokenizer::BAD_TOKEN + ); + test_normalization_bad( + " ab\\u{20} ;", + "var_0000", + 
JSTokenizer::BAD_TOKEN + ); + } + + SECTION("valid dot accessor") + { + test_normalization( + "\\u0066\\u006f\\u006f.\\u0062\\u0061\\u0072 ;", + "var_0000.var_0001;" + ); + test_normalization( + "console.\\u006c\\u006f\\u0067 ;", + "console.log;" + ); + test_normalization( + "\\u0066\\u006f\\u006f.\\u006a\\u006f\\u0069\\u006e ;", + "var_0000.join;" + ); + } + + SECTION("invalid dot accessor") + { + test_normalization_bad( + "\\u0066\\u006f\\u006f.\\u0020\\u0061\\u0072 ;", + "var_0000.var_0001", + JSTokenizer::BAD_TOKEN + ); + test_normalization_bad( + "\\u0066\\u0020\\u006f.\\u0062\\u0061\\u0072 ;", + "var_0000 var_0001", + JSTokenizer::BAD_TOKEN + ); + test_normalization_bad( + "console.\\u006c\\u0020\\u0067 ;", + "console.l var_0000", + JSTokenizer::BAD_TOKEN + ); + test_normalization_bad( + "\\u0066\\u0020\\u006f.\\u006a\\u006f\\u0069\\u006e ;", + "var_0000 var_0001", + JSTokenizer::BAD_TOKEN + ); + } +} + TEST_CASE("Split", "[JSNormalizer]") { SECTION("unescape()") @@ -1091,6 +1235,35 @@ TEST_CASE("Split", "[JSNormalizer]") { "114)", "'bar'" } }); } + + SECTION("identifier") + { + test_normalization({ + { "\\u0062", "var_0000" }, + { "\\u0061\\u0072", "var_0001" } + }); + test_normalization({ + { "\\u{62}", "var_0000" }, + { "\\u{61}\\u{72}", "var_0001" } + }); + test_normalization({ + { "\\u0062", "var_0000" }, + { "\\u{61}\\u{72}", "var_0001" } + }); + test_normalization({ + { "\\u{62}", "var_0000" }, + { "\\u0061\\u0072", "var_0001" } + }); + test_normalization({ + { "\\u{63}\\u{6f}\\u{6e}", "var_0000" }, + { "\\u{73}\\u{6f}\\u{6c}\\u{65}", "console" } + }); + test_normalization({ + { "\\u0062", "var_0000" }, + { "\\u0061", "var_0001" }, + { "\\u0072", "var_0002" } + }); + } } TEST_CASE("Mixed input", "[JSNormalizer]") @@ -1129,6 +1302,14 @@ TEST_CASE("Mixed input", "[JSNormalizer]") SECTION("identifier") { + test_normalization( + "\\u0062\\u{61}\\u0072", + "var_0000" + ); + test_normalization( + "\\u{62}\\u0061\\u{72}", + "var_0000" + ); test_normalization( 
"unescape ( f(\"A\\u20B\\u20C\"), eval(\"\\u66\\u6f\\u6f\"), \"\\u66\\u6f\\u6f\" ) ;", "var_0000(\"A\\u20B\\u20C\"),eval(\"\\u66\\u6f\\u6f\"),\"foo\";"