From: Oleksii Shumeiko -X (oshumeik - SOFTSERVE INC at Cisco) Date: Thu, 4 Aug 2022 12:51:42 +0000 (+0000) Subject: Pull request #3537: JS Normalizer: Escaped JavaScript Identifiers X-Git-Tag: 3.1.39.0~5 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=6e538616fbafed6110ad7ad9d00742aa5d1c4cd4;p=thirdparty%2Fsnort3.git Pull request #3537: JS Normalizer: Escaped JavaScript Identifiers Merge in SNORT/snort3 from ~OSERHIIE/snort3:js_unescape_ident to master Squashed commit of the following: commit 2b192d53735b7f6b346c17581adc28c1ee395b56 Author: Oleksandr Serhiienko Date: Mon Aug 1 11:16:11 2022 +0300 utils: fix compilation warning [-Wcomma] commit ad2285d11ea0b1408937a7688179e7d65946031f Author: Oleksandr Serhiienko Date: Mon Aug 1 11:15:00 2022 +0300 utils: validate escaped JavaScript identifiers --- diff --git a/src/utils/js_identifier_ctx.cc b/src/utils/js_identifier_ctx.cc index b172d6494..a56bb05a0 100644 --- a/src/utils/js_identifier_ctx.cc +++ b/src/utils/js_identifier_ctx.cc @@ -75,7 +75,7 @@ static void init_norm_names() assert(sizeof(norm_names) == c - norm_names); } -static int _init_norm_names __attribute__((unused)) = (init_norm_names(), 0); +static int _init_norm_names __attribute__((unused)) = (static_cast<void>(init_norm_names()), 0); JSIdentifierCtx::JSIdentifierCtx(int32_t depth, uint32_t max_scope_depth, const std::unordered_set<std::string>& ignored_ids_list, diff --git a/src/utils/js_tokenizer.h b/src/utils/js_tokenizer.h index 18c8ce392..3bcb33cc4 100644 --- a/src/utils/js_tokenizer.h +++ b/src/utils/js_tokenizer.h @@ -331,6 +331,8 @@ private: JSIdentifierCtxBase& ident_ctx; size_t bytes_read; size_t tmp_bytes_read; + uint32_t tokens_read; + uint32_t tmp_tokens_read; bool ext_script; VStack regex_stack; diff --git a/src/utils/js_tokenizer.l b/src/utils/js_tokenizer.l index ca5821a12..61db2e741 100644 --- a/src/utils/js_tokenizer.l +++ b/src/utils/js_tokenizer.l @@ -1371,6 +1371,7 @@ JSTokenizer::JSTokenizer(std::istream& in, std::ostream&
out, max_template_nesting(max_template_nesting), ident_ctx(mapper), bytes_read(0), + tokens_read(0), tmp_buf(buf), tmp_buf_size(buf_size), tmp_cap_size(cap_size), @@ -1397,6 +1398,7 @@ void JSTokenizer::switch_to_temporal(const std::string& data) yy_switch_to_buffer((YY_BUFFER_STATE)tmp_buffer); tmp_bytes_read = bytes_read; + tmp_tokens_read = tokens_read; } void JSTokenizer::switch_to_initial() @@ -1406,6 +1408,7 @@ void JSTokenizer::switch_to_initial() tmp_buffer = nullptr; bytes_read = tmp_bytes_read; + tmp_tokens_read = tokens_read - tmp_tokens_read; } // A return value of this method uses to terminate the scanner @@ -1414,16 +1417,20 @@ void JSTokenizer::switch_to_initial() // The return value should be used to make a decision about yyterminate() call JSTokenizer::JSRet JSTokenizer::eval_eof() { - // If the temporal scan buffer reaches EOF, cleanup and - // continue with the initial one - if (tmp_buffer) - { - switch_to_initial(); - return EOS; - } + if (!tmp_buffer) + return SCRIPT_CONTINUE; + + switch_to_initial(); - // Normal termination - return SCRIPT_CONTINUE; + if (tmp_tokens_read != 1 or token != IDENTIFIER) + return BAD_TOKEN; + + // remove temporal buffer normalization state + memset((void*)(states + sp), 0, sizeof(states[0])); + --sp; + sp %= JSTOKENIZER_MAX_STATES; + + return EOS; } JSTokenizer::JSRet JSTokenizer::do_spacing(JSToken cur_token) @@ -1745,6 +1752,7 @@ void JSTokenizer::states_over() bool JSTokenizer::states_process() { bytes_read += yyleng; + ++tokens_read; // Fulfillment goes after this check only in case of split over several input scripts. // Otherwise, new state is pushed. 
@@ -2969,6 +2977,7 @@ JSTokenizer::JSRet JSTokenizer::process(size_t& bytes_in, bool external_script) bytes_in = std::max(bytes_read, bytes_in) - bytes_in; bytes_read = 0; + tokens_read = 0; return static_cast<JSRet>(r); } diff --git a/src/utils/test/js_unescape_test.cc b/src/utils/test/js_unescape_test.cc index 3b8cdb58a..ded5dbec3 100644 --- a/src/utils/test/js_unescape_test.cc +++ b/src/utils/test/js_unescape_test.cc @@ -798,6 +798,150 @@ TEST_CASE("String.fromCodePoint()", "[JSNormalizer]") } } +TEST_CASE("Identifiers", "[JSNormalizer]") +{ + SECTION("all patterns") + { + test_normalization( + "\\u0061", + "var_0000" + ); + test_normalization_bad( + "\\u0020", + "", + JSTokenizer::BAD_TOKEN + ); + + test_normalization( + "\\u{0061}", + "var_0000" + ); + test_normalization( + "\\u{061}", + "var_0000" + ); + test_normalization( + "\\u{61}", + "var_0000" + ); + test_normalization_bad( + "\\u{1}", + "\u0001", + JSTokenizer::BAD_TOKEN + ); + } + + SECTION("valid sequence") + { + test_normalization( + " \\u0061bc ;", + "var_0000;" + ); + test_normalization( + " a\\u0062c ;", + "var_0000;" + ); + test_normalization( + " ab\\u0063 ;", + "var_0000;" + ); + } + + SECTION("invalid sequence") + { + test_normalization_bad( + " \\u0020bc ;", + "var_0000", + JSTokenizer::BAD_TOKEN + ); + test_normalization_bad( + " a\\u0020c ;", + "var_0000 var_0001", + JSTokenizer::BAD_TOKEN + ); + test_normalization_bad( + " ab\\u0020 ;", + "var_0000", + JSTokenizer::BAD_TOKEN + ); + } + + SECTION("valid code point") + { + test_normalization( + " \\u{61}bc ;", + "var_0000;" + ); + test_normalization( + " a\\u{62}c ;", + "var_0000;" + ); + test_normalization( + " ab\\u{63} ;", + "var_0000;" + ); + } + + SECTION("invalid code point") + { + test_normalization_bad( + " \\u{20}bc ;", + "var_0000", + JSTokenizer::BAD_TOKEN + ); + test_normalization_bad( + " a\\u{20}c ;", + "var_0000 var_0001", + JSTokenizer::BAD_TOKEN + ); + test_normalization_bad( + " ab\\u{20} ;", + "var_0000", + 
JSTokenizer::BAD_TOKEN + ); + } + + SECTION("valid dot accessor") + { + test_normalization( + "\\u0066\\u006f\\u006f.\\u0062\\u0061\\u0072 ;", + "var_0000.var_0001;" + ); + test_normalization( + "console.\\u006c\\u006f\\u0067 ;", + "console.log;" + ); + test_normalization( + "\\u0066\\u006f\\u006f.\\u006a\\u006f\\u0069\\u006e ;", + "var_0000.join;" + ); + } + + SECTION("invalid dot accessor") + { + test_normalization_bad( + "\\u0066\\u006f\\u006f.\\u0020\\u0061\\u0072 ;", + "var_0000.var_0001", + JSTokenizer::BAD_TOKEN + ); + test_normalization_bad( + "\\u0066\\u0020\\u006f.\\u0062\\u0061\\u0072 ;", + "var_0000 var_0001", + JSTokenizer::BAD_TOKEN + ); + test_normalization_bad( + "console.\\u006c\\u0020\\u0067 ;", + "console.l var_0000", + JSTokenizer::BAD_TOKEN + ); + test_normalization_bad( + "\\u0066\\u0020\\u006f.\\u006a\\u006f\\u0069\\u006e ;", + "var_0000 var_0001", + JSTokenizer::BAD_TOKEN + ); + } +} + TEST_CASE("Split", "[JSNormalizer]") { SECTION("unescape()") @@ -1091,6 +1235,35 @@ TEST_CASE("Split", "[JSNormalizer]") { "114)", "'bar'" } }); } + + SECTION("identifier") + { + test_normalization({ + { "\\u0062", "var_0000" }, + { "\\u0061\\u0072", "var_0001" } + }); + test_normalization({ + { "\\u{62}", "var_0000" }, + { "\\u{61}\\u{72}", "var_0001" } + }); + test_normalization({ + { "\\u0062", "var_0000" }, + { "\\u{61}\\u{72}", "var_0001" } + }); + test_normalization({ + { "\\u{62}", "var_0000" }, + { "\\u0061\\u0072", "var_0001" } + }); + test_normalization({ + { "\\u{63}\\u{6f}\\u{6e}", "var_0000" }, + { "\\u{73}\\u{6f}\\u{6c}\\u{65}", "console" } + }); + test_normalization({ + { "\\u0062", "var_0000" }, + { "\\u0061", "var_0001" }, + { "\\u0072", "var_0002" } + }); + } } TEST_CASE("Mixed input", "[JSNormalizer]") @@ -1129,6 +1302,14 @@ TEST_CASE("Mixed input", "[JSNormalizer]") SECTION("identifier") { + test_normalization( + "\\u0062\\u{61}\\u0072", + "var_0000" + ); + test_normalization( + "\\u{62}\\u0061\\u{72}", + "var_0000" + ); test_normalization( 
"unescape ( f(\"A\\u20B\\u20C\"), eval(\"\\u66\\u6f\\u6f\"), \"\\u66\\u6f\\u6f\" ) ;", "var_0000(\"A\\u20B\\u20C\"),eval(\"\\u66\\u6f\\u6f\"),\"foo\";"