From: Mike Stepanek (mstepane) Date: Tue, 8 Mar 2022 21:06:40 +0000 (+0000) Subject: Pull request #3300: JS Normalizer refactoring. X-Git-Tag: 3.1.25.0~2 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=496aa0d346044b08e38d27f49f81cb01219c2f8a;p=thirdparty%2Fsnort3.git Pull request #3300: JS Normalizer refactoring. Merge in SNORT/snort3 from ~OSHUMEIK/snort3:js_perf to master Squashed commit of the following: commit 45a6b666b8c8ae9a6e67ed8d098acee76dc7d406 Author: Andrii Serbeniuk Date: Tue Mar 8 15:30:20 2022 +0200 utils: improve Flex matching patterns Try to match as much as possible at a time. commit 88b1d71905cda27a2231b95e1dfafbe7a91aa1e2 Author: Oleksii Shumeiko Date: Sun Mar 6 18:50:56 2022 +0200 utils: combine ignore list with normalization map An ID name is looked once in a combined map (normalized names and ignored names). commit af84510fd2527b9b20cd3a3fd6e41e6651c0d436 Author: Oleksii Shumeiko Date: Sun Mar 6 10:59:00 2022 +0200 utils: wrap unordered set with a fast lookup table commit 23a81bb9f19c51f9f3c57fc39afb5b045622d392 Author: Oleksii Shumeiko Date: Sat Mar 5 22:03:43 2022 +0200 utils: check more likely branches at first commit a043edabcee24c5a0f167939581ab6202b3e491b Author: Oleksii Shumeiko Date: Sat Mar 5 21:09:48 2022 +0200 utils: pre-compute ID normalized names commit c1c644e47b8a7f0b04126fa4a6e7e68ca2e283b0 Author: Oleksii Shumeiko Date: Fri Mar 4 20:57:24 2022 +0200 utils: refactor the alias lookup One search in the map is performed per alias lookup. Loops removed. The scope_contains() test function removed, it is redundant. --- diff --git a/src/utils/js_identifier_ctx.cc b/src/utils/js_identifier_ctx.cc index 41a9b37c7..49274c834 100644 --- a/src/utils/js_identifier_ctx.cc +++ b/src/utils/js_identifier_ctx.cc @@ -24,6 +24,7 @@ #include "js_identifier_ctx.h" #include +#include #if !defined(CATCH_TEST_BUILD) && !defined(BENCHMARK_TEST) #include "service_inspectors/http_inspect/http_enum.h" @@ -44,50 +45,98 @@ public: }; #endif // CATCH_TEST_BUILD -#define MAX_LAST_NAME 65535 -#define HEX_DIGIT_MASK 15 +#define NORM_NAME_SIZE 9 // size of the normalized form plus null symbol +#define NORM_NAME_CNT 65536 -static const char hex_digits[] = -{ - '0', '1','2','3', '4', '5', '6', '7', '8','9', 'a', 'b', 'c', 'd', 'e', 'f' -}; +static char norm_names[NORM_NAME_SIZE * NORM_NAME_CNT]; -static inline std::string format_name(int32_t num) +static void init_norm_names() { - std::string name("var_"); - name.reserve(8); - name.push_back(hex_digits[(num >> 12) & HEX_DIGIT_MASK]); - name.push_back(hex_digits[(num >> 8) & HEX_DIGIT_MASK]); - name.push_back(hex_digits[(num >> 4) & HEX_DIGIT_MASK]); - name.push_back(hex_digits[num & HEX_DIGIT_MASK]); - - return name; + static bool once = false; + + if (once) + return; + + once = true; + + char* c = norm_names; + const char hex[] = "0123456789abcdef"; + + for (int i = 0; i < NORM_NAME_CNT; ++i) + { + *c++ = 'v'; + *c++ = 'a'; + *c++ = 'r'; + *c++ = '_'; + *c++ = hex[(i >> 12) & 0xf]; + *c++ = hex[(i >> 8) & 0xf]; + *c++ = hex[(i >> 4) & 0xf]; + *c++ = hex[(i >> 0) & 0xf]; + *c++ = '\0'; + } + + assert(sizeof(norm_names) == c - norm_names); } JSIdentifierCtx::JSIdentifierCtx(int32_t depth, uint32_t max_scope_depth, - const std::unordered_set& ignored_ids) - : ignored_ids(ignored_ids), depth(depth), max_scope_depth(max_scope_depth) + const std::unordered_set& ignore_list) + : ignore_list(ignore_list), max_scope_depth(max_scope_depth) { + init_norm_names(); + + memset(id_fast, 0, sizeof(id_fast)); + norm_name = norm_names; + norm_name_end = norm_names + NORM_NAME_SIZE * std::min(depth, NORM_NAME_CNT); scopes.emplace_back(JSProgramScopeType::GLOBAL); + + for (const auto& iid : ignore_list) + if (iid.length() == 1) + id_fast[(unsigned)iid[0]] = iid.c_str(); + else + id_names[iid] = iid.c_str(); } -const char* JSIdentifierCtx::substitute(const char* identifier) +const char* JSIdentifierCtx::substitute(unsigned char c) { - const auto it = ident_names.find(identifier); - if (it != ident_names.end()) - return it->second.c_str(); + auto p = id_fast[c]; + if (p) + return p; + + if (norm_name >= norm_name_end) + return nullptr; + + auto n = norm_name; + norm_name += NORM_NAME_SIZE; + HttpModule::increment_peg_counts(HttpEnums::PEG_JS_IDENTIFIER); + + return id_fast[c] = n; +} + +const char* JSIdentifierCtx::substitute(const char* id_name) +{ + assert(*id_name); + + if (id_name[1] == '\0') + return substitute(*id_name); + + const auto it = id_names.find(id_name); + if (it != id_names.end()) + return it->second; - if (ident_last_name >= depth || ident_last_name > MAX_LAST_NAME) + if (norm_name >= norm_name_end) return nullptr; - ident_names[identifier] = format_name(ident_last_name++); + auto n = norm_name; + norm_name += NORM_NAME_SIZE; HttpModule::increment_peg_counts(HttpEnums::PEG_JS_IDENTIFIER); - return ident_names[identifier].c_str(); + + return id_names[id_name] = n; } -bool JSIdentifierCtx::is_ignored(const char* identifier) const +bool JSIdentifierCtx::is_ignored(const char* id_name) const { - return ignored_ids.count(identifier); + return id_name < norm_names || + id_name >= norm_names + NORM_NAME_SIZE * NORM_NAME_CNT; } bool JSIdentifierCtx::scope_push(JSProgramScopeType t) @@ -115,47 +164,38 @@ bool JSIdentifierCtx::scope_pop(JSProgramScopeType t) void JSIdentifierCtx::reset() { - ident_last_name = 0; - - ident_names.clear(); + memset(id_fast, 0, sizeof(id_fast)); + norm_name = norm_names; + id_names.clear(); scopes.clear(); scopes.emplace_back(JSProgramScopeType::GLOBAL); + + for (const auto& iid : ignore_list) + if (iid.length() == 1) + id_fast[(unsigned)iid[0]] = iid.c_str(); + else + id_names[iid] = iid.c_str(); } void JSIdentifierCtx::add_alias(const char* alias, const std::string&& value) { assert(alias); assert(!scopes.empty()); - scopes.back().add_alias(alias, std::move(value)); -} -const char* JSIdentifierCtx::alias_lookup(const char* alias) const -{ - assert(alias); + auto& a = aliases[alias]; + a.emplace_back(std::move(value)); - for (auto it = scopes.rbegin(); it != scopes.rend(); ++it) - { - if (const char* value = it->get_alias_value(alias)) - return value; - } - return nullptr; + scopes.back().reference(a); } -void JSIdentifierCtx::ProgramScope::add_alias(const char* alias, const std::string&& value) +const char* JSIdentifierCtx::alias_lookup(const char* alias) const { assert(alias); - aliases[alias] = value; -} -const char* JSIdentifierCtx::ProgramScope::get_alias_value(const char* alias) const -{ - assert(alias); + const auto& i = aliases.find(alias); - const auto it = aliases.find(alias); - if (it != aliases.end()) - return it->second.c_str(); - else - return nullptr; + return i != aliases.end() && !i->second.empty() + ? i->second.back().c_str() : nullptr; } // advanced program scope access for testing @@ -182,21 +222,8 @@ const std::list JSIdentifierCtx::get_types() const for(const auto& scope:scopes) { return_list.push_back(scope.type()); - } - return return_list; -} - -bool JSIdentifierCtx::scope_contains(size_t pos, const char* alias) const -{ - size_t offset = 0; - for (auto it = scopes.begin(); it != scopes.end(); ++it, ++offset) - { - if (offset == pos) - return it->get_alias_value(alias); } - assert(false); - return false; + return return_list; } #endif // CATCH_TEST_BUILD - diff --git a/src/utils/js_identifier_ctx.h b/src/utils/js_identifier_ctx.h index ae83b8fbd..e0aa57d4f 100644 --- a/src/utils/js_identifier_ctx.h +++ b/src/utils/js_identifier_ctx.h @@ -24,6 +24,7 @@ #include #include #include +#include enum JSProgramScopeType : unsigned int { @@ -55,7 +56,7 @@ class JSIdentifierCtx : public JSIdentifierCtxBase { public: JSIdentifierCtx(int32_t depth, uint32_t max_scope_depth, - const std::unordered_set& ignored_ids); + const std::unordered_set& ignore_list); virtual const char* substitute(const char* identifier) override; virtual void add_alias(const char* alias, const std::string&& value) override; @@ -72,39 +73,54 @@ public: virtual size_t size() const override { return (sizeof(JSIdentifierCtx) + (sizeof(std::string) * 2 * 500) + (sizeof(ProgramScope) * 3)); } + private: + using Alias = std::vector; + using AliasRef = std::list; + using AliasMap = std::unordered_map; + using NameMap = std::unordered_map; + class ProgramScope { public: - ProgramScope(JSProgramScopeType t) : t(t) {} + ProgramScope(JSProgramScopeType t) : t(t) + {} - void add_alias(const char* alias, const std::string&& value); - const char* get_alias_value(const char* alias) const; + ~ProgramScope() + { for (auto a : to_remove) a->pop_back(); } + + void reference(Alias& a) + { to_remove.push_back(&a); } JSProgramScopeType type() const { return t; } + private: - std::unordered_map aliases; JSProgramScopeType t; + AliasRef to_remove{}; }; + inline const char* substitute(unsigned char c); + + // do not swap next two lines, the destructor frees them in the reverse order + AliasMap aliases; std::list scopes; - std::unordered_map ident_names; - const std::unordered_set& ignored_ids; - int32_t ident_last_name = 0; - int32_t depth; + const char* id_fast[256]; + NameMap id_names; + const std::unordered_set& ignore_list; + + const char* norm_name; + const char* norm_name_end; uint32_t max_scope_depth; // advanced program scope access for testing -#ifdef CATCH_TEST_BUILD +#if defined(CATCH_TEST_BUILD) || defined(BENCHMARK_TEST) public: // compare scope list with the passed pattern bool scope_check(const std::list& compare) const; const std::list get_types() const; - bool scope_contains(size_t pos, const char* alias) const; #endif // CATCH_TEST_BUILD }; #endif // JS_IDENTIFIER_CTX - diff --git a/src/utils/js_tokenizer.l b/src/utils/js_tokenizer.l index 4e3ff0a2c..b7f93df12 100644 --- a/src/utils/js_tokenizer.l +++ b/src/utils/js_tokenizer.l @@ -961,15 +961,15 @@ LITERAL_HEX_INTEGER 0x[0-9a-fA-F]*|0X[0-9a-fA-F]* LITERAL_DQ_STRING_START \" LITERAL_DQ_STRING_END \" LITERAL_DQ_STRING_SKIP \\\" -LITERAL_DQ_STRING_TEXT . +LITERAL_DQ_STRING_TEXT [^\"\\\xA\xD\{0x10}(\xE2\x80\xA8)(\xE2\x80\xA9)("<"+(?i:script))("<"+(?i:\/script>))]{1,32} LITERAL_SQ_STRING_START \' LITERAL_SQ_STRING_END \' LITERAL_SQ_STRING_SKIP \\\' -LITERAL_SQ_STRING_TEXT . +LITERAL_SQ_STRING_TEXT [^\'\\\xA\xD\{0x10}(\xE2\x80\xA8)(\xE2\x80\xA9)("<"+(?i:script))("<"+(?i:\/script>))]{1,32} LITERAL_TEMPLATE_START \` LITERAL_TEMPLATE_END \` LITERAL_TEMPLATE_SUBST_START \$\{ -LITERAL_TEMPLATE_OTHER . +LITERAL_TEMPLATE_OTHER [^\\\`(\$\{)("<"+(?i:\/script>))]{1,32} LITERAL_REGEX_START \/[^*\/] LITERAL_REGEX_END \/[gimsuy]* LITERAL_REGEX_SKIP \\\/ @@ -1373,46 +1373,55 @@ JSTokenizer::JSRet JSTokenizer::do_identifier_substitution(const char* lexeme, b yyout << lexeme; return EOS; } - else - set_ident_norm(true); - if (ident_ctx.is_ignored(lexeme) && !id_part) + set_ident_norm(true); + + const char* name = ident_ctx.substitute(lexeme); + + if (!name) { - ignored_id_pos = yyout.rdbuf()->pubseekoff(0, yyout.cur, std::ios_base::out); - set_ident_norm(false); - yyout << lexeme; - return EOS; + debug_logf(6, http_trace, TRACE_JS_DUMP, nullptr, + "'%s' => IDENTIFIER_OVERFLOW\n", lexeme); + return IDENTIFIER_OVERFLOW; + } + + if (ident_ctx.is_ignored(name)) + { + if (id_part) + { + std::string n(name); + n.push_back('+'); // any illegal symbol as a part of ID name + name = ident_ctx.substitute(n.c_str()); + } + else + { + ignored_id_pos = yyout.rdbuf()->pubseekoff(0, yyout.cur, std::ios_base::out); + set_ident_norm(false); + yyout << name; + return EOS; + } } - const char *ident = nullptr; - if (!id_part) - ident = ident_ctx.alias_lookup(lexeme); - if (ident) + const char* alias = id_part ? nullptr : ident_ctx.alias_lookup(lexeme); + + if (alias) { set_ident_norm(false); ignored_id_pos = yyout.rdbuf()->pubseekoff(0, yyout.cur, std::ios_base::out); last_dealiased = std::string(YYText()); dealias_stored = true; - } - else - { - ignored_id_pos = -1; - ident = ident_ctx.substitute(lexeme); - } + yyout << alias; - if (!ident) - { debug_logf(6, http_trace, TRACE_JS_DUMP, nullptr, - "'%s' => IDENTIFIER_OVERFLOW\n", lexeme); - - return IDENTIFIER_OVERFLOW; + "'%s' => '%s'\n", lexeme, alias); + return EOS; } - debug_logf(6, http_trace, TRACE_JS_DUMP, nullptr, - "'%s' => '%s'\n", lexeme, ident); - - yyout << ident; + ignored_id_pos = -1; + yyout << name; + debug_logf(6, http_trace, TRACE_JS_DUMP, nullptr, + "'%s' => '%s'\n", lexeme, name); return EOS; } @@ -1569,9 +1578,6 @@ void JSTokenizer::states_over() bool JSTokenizer::states_process() { - if (!yyleng) - return true; - bytes_read += yyleng; // Fulfillment goes after this check only in case of split over several input scripts. @@ -1592,9 +1598,25 @@ bool JSTokenizer::states_process() bytes_skip = bytes_skip - yyleng; + // Continue normalization from the last state without any changes + if (bytes_skip == 0) + { + token = eof_token; + yy_start = eof_sc; + } + // Update parsing state every match + else if (bytes_skip > 0) + { + do { ++sp; sp %= JSTOKENIZER_MAX_STATES; } + while (states[sp].sc == 0); + + auto& state = states[sp]; + token = state.token; + yy_start = state.sc; + } // Ignore normalization till all the already normalized bytes are skipped or mismatch found. // If mismatch found, adjust normalization state and renormalize from the mismatch point. - if (bytes_skip < 0) + else { bytes_skip = 0; states_adjust(); @@ -1611,22 +1633,6 @@ bool JSTokenizer::states_process() return true; } - // Otherwise, continue normalization from the last state without any changes - else if (bytes_skip == 0) - { - token = eof_token; - yy_start = eof_sc; - } - // Meanwhile, update parsing state every match - else - { - do { ++sp; sp %= JSTOKENIZER_MAX_STATES; } - while (states[sp].sc == 0); - - auto& state = states[sp]; - token = state.token; - yy_start = state.sc; - } return false; } @@ -1776,12 +1782,12 @@ JSTokenizer::FuncType JSTokenizer::detect_func_type() case IDENTIFIER: { FuncType ret = FuncType::GENERAL; - if (ignored_id_pos >= 0) + if (ignored_id_pos >= 0) { std::streambuf* pbuf = yyout.rdbuf(); std::streamsize size = pbuf->pubseekoff(0, yyout.cur, yyout.out) - ignored_id_pos; assert(size >= 0); - + char tail[256]; assert((size_t)size <= sizeof(tail)); size = std::min((size_t)size, sizeof(tail)); @@ -1926,39 +1932,37 @@ void JSTokenizer::dealias_increment() void JSTokenizer::dealias_identifier(bool id_part, bool assignment_start) { auto lexeme = YYText(); - switch(alias_state) + + switch (alias_state) { - case ALIAS_NONE: + case ALIAS_NONE: + if (assignment_start) { - if (assignment_start) - { - alias = std::string(YYText()); - aliased.clear(); - aliased.str(""); - alias_state = ALIAS_DEFINITION; - } - break; + alias = std::string(YYText()); + aliased.clear(); + aliased.str(""); + alias_state = ALIAS_DEFINITION; } - case ALIAS_PREFIX: - case ALIAS_DEFINITION: - { + break; + case ALIAS_PREFIX: + case ALIAS_DEFINITION: + dealias_reset(); + break; + case ALIAS_EQUALS: + alias_state = ALIAS_VALUE; + // fallthrough + case ALIAS_VALUE: + { + auto dealias = ident_ctx.alias_lookup(lexeme); + if ((!ident_norm() && id_part) || + (ident_ctx.is_ignored(ident_ctx.substitute(lexeme)) && !id_part)) + aliased << YYText(); + else if (dealias) + aliased << dealias; + else dealias_reset(); - break; - } - case ALIAS_EQUALS: - alias_state = ALIAS_VALUE; - // fallthrough - case ALIAS_VALUE: - { - auto dealias = ident_ctx.alias_lookup(lexeme); - if ((ident_ctx.is_ignored(lexeme) && !id_part) || (!ident_norm() && id_part)) - aliased << YYText(); - else if (dealias) - aliased << dealias; - else - dealias_reset(); - break; - } + break; + } } } @@ -2532,4 +2536,4 @@ JSTokenizer::JSRet JSTokenizer::process(size_t& bytes_in) bytes_read = 0; return static_cast(r); -} \ No newline at end of file +} diff --git a/src/utils/test/js_identifier_ctx_test.cc b/src/utils/test/js_identifier_ctx_test.cc index 0364f48a8..6f781174c 100644 --- a/src/utils/test/js_identifier_ctx_test.cc +++ b/src/utils/test/js_identifier_ctx_test.cc @@ -93,8 +93,11 @@ TEST_CASE("JSIdentifierCtx::is_ignored()", "[JSIdentifierCtx]") { JSIdentifierCtx ident_ctx(DEPTH, SCOPE_DEPTH, s_ignored_ids); - CHECK(ident_ctx.is_ignored("console") == true); - CHECK(ident_ctx.is_ignored("foo") == false); + auto v1 = ident_ctx.substitute("console"); + auto v2 = ident_ctx.substitute("foo"); + + CHECK(ident_ctx.is_ignored(v1) == true); + CHECK(ident_ctx.is_ignored(v2) == false); } TEST_CASE("JSIdentifierCtx::scopes", "[JSIdentifierCtx]") @@ -120,30 +123,22 @@ TEST_CASE("JSIdentifierCtx::scopes", "[JSIdentifierCtx]") { ident_ctx.add_alias("a", "console.log"); ident_ctx.add_alias("b", "document"); - CHECK(ident_ctx.scope_contains(0, "a")); - CHECK(ident_ctx.scope_contains(0, "b")); CHECK(!strcmp(ident_ctx.alias_lookup("a"), "console.log")); CHECK(!strcmp(ident_ctx.alias_lookup("b"), "document")); REQUIRE(ident_ctx.scope_push(JSProgramScopeType::FUNCTION)); ident_ctx.add_alias("a", "document"); - CHECK(ident_ctx.scope_contains(1, "a")); - CHECK(!ident_ctx.scope_contains(1, "b")); CHECK(!strcmp(ident_ctx.alias_lookup("a"), "document")); CHECK(!strcmp(ident_ctx.alias_lookup("b"), "document")); REQUIRE(ident_ctx.scope_push(JSProgramScopeType::BLOCK)); ident_ctx.add_alias("b", "console.log"); - CHECK(ident_ctx.scope_contains(2, "b")); - CHECK(!ident_ctx.scope_contains(2, "a")); CHECK(!strcmp(ident_ctx.alias_lookup("b"), "console.log")); CHECK(!strcmp(ident_ctx.alias_lookup("a"), "document")); REQUIRE(ident_ctx.scope_pop(JSProgramScopeType::BLOCK)); REQUIRE(ident_ctx.scope_pop(JSProgramScopeType::FUNCTION)); ident_ctx.add_alias("a", "eval"); - CHECK(ident_ctx.scope_contains(0, "a")); - CHECK(ident_ctx.scope_contains(0, "b")); CHECK(!strcmp(ident_ctx.alias_lookup("a"), "eval")); CHECK(!strcmp(ident_ctx.alias_lookup("b"), "document")); diff --git a/src/utils/test/js_normalizer_test.cc b/src/utils/test/js_normalizer_test.cc index dedae02ab..f3887aaed 100644 --- a/src/utils/test/js_normalizer_test.cc +++ b/src/utils/test/js_normalizer_test.cc @@ -4226,7 +4226,7 @@ TEST_CASE("Function call tracking - basic", "[JSNormalizer]") SECTION("ignored fake defined function identifier") { const std::unordered_set s_ignored_ids_fake {"fake_unescape"}; - JSTokenizerTester tester_fake(norm_depth, max_scope_depth, s_ignored_ids_fake, + JSTokenizerTester tester_fake(norm_depth, max_scope_depth, s_ignored_ids_fake, max_template_nesting, max_bracket_depth); tester_fake.test_function_scopes({ {"fake_unescape(", "fake_unescape(", {FuncType::NOT_FUNC, FuncType::GENERAL}}