From: Mike Stepanek (mstepane) Date: Fri, 19 Nov 2021 14:02:33 +0000 (+0000) Subject: Pull request #3169: Reset Normalizer's context when new script starts X-Git-Tag: 3.1.18.0~20 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=31ea616ecfd4f889e63477ae5d908d4457c4f0f3;p=thirdparty%2Fsnort3.git Pull request #3169: Reset Normalizer's context when new script starts Merge in SNORT/snort3 from ~OSHUMEIK/snort3:js_buffers_fix to master Squashed commit of the following: commit bdee3121765f854f41e2a46b9a2a557408314fab Author: Oleksii Shumeiko Date: Tue Nov 16 11:18:33 2021 +0200 utils: reset Normalizer context when new script starts Since the Normalizer context is no longer recreated for each new script, a method to reset its internal state was added. If a script continues in the next chunk, the context is not reset; instead it is prepared to process the new chunk as a continuation. --- diff --git a/src/utils/js_tokenizer.h b/src/utils/js_tokenizer.h index 5079e23d5..f7e6bc754 100644 --- a/src/utils/js_tokenizer.h +++ b/src/utils/js_tokenizer.h @@ -84,12 +84,12 @@ private: ASI_GROUP_4, // ] ASI_GROUP_5, // ) ASI_GROUP_6, // + - - ASI_GROUP_7, // this true false null identifier literal + ASI_GROUP_7, // this true false null identifier literal //IDENTIFIER + LITERAL + KEYWORD_LITERAL ASI_GROUP_8, // ++ -- ASI_GROUP_9, // continue break return debugger // same as KEYWORD_BA ASI_GROUP_10, // var function new delete void typeof if do while for with - // switch throw try ~ + + // switch throw try ~ + ASI_GROUP_MAX }; @@ -140,8 +140,9 @@ private: JSRet process_subst_open(); void states_push(); - void states_apply(); void states_correct(int); + void states_reset(); + void states_over(); // scope stack servicing JSRet scope_push(ScopeType); @@ -199,4 +200,3 @@ private: }; #endif // JS_TOKENIZER_H - diff --git a/src/utils/js_tokenizer.l b/src/utils/js_tokenizer.l index f2239747b..2f48242a8 100644 --- a/src/utils/js_tokenizer.l +++ b/src/utils/js_tokenizer.l 
@@ -26,6 +26,7 @@ %option c++ %{ + #ifdef HAVE_CONFIG_H #include "config.h" #endif @@ -48,9 +49,38 @@ states_push(); \ } -#define EXEC(f) { auto r = (f); if (r) { BEGIN(regst); return r; } } -#define EEOF(f) { auto r = (f); if (r) { if (r != SCRIPT_CONTINUE) BEGIN(regst); return r; } } +#define RETURN(r) \ + { \ + if ((r) == SCRIPT_CONTINUE) \ + states_over(); \ + else \ + states_reset(); \ + return (r); \ + } + +#define EXEC(f) \ + { \ + auto r = (f); \ + if (r) \ + { \ + BEGIN(regst); \ + RETURN(r) \ + } \ + } + +#define EEOF(f) \ + { \ + auto r = (f); \ + if (r) \ + { \ + if (r != SCRIPT_CONTINUE) \ + BEGIN(regst); \ + RETURN(r) \ + } \ + } + constexpr bool JSTokenizer::insert_semicolon[ASI_GROUP_MAX][ASI_GROUP_MAX]; + %} /* The following grammar was created based on ECMAScript specification */ @@ -963,71 +993,72 @@ ALL_UNICODE [\0-\x7F]|[\xC2-\xDF][\x80-\xBF]|(\xE0[\xA0-\xBF]|[\xE1-\xEF][\x8 %x regex %% + {WHITESPACES} { } {CHAR_ESCAPE_SEQUENCES} { } {LINE_TERMINATORS} { BEGIN(regst); newline_found = true; } -{HTML_TAG_SCRIPT_OPEN} { BEGIN(regst); return OPENING_TAG; } -{HTML_TAG_SCRIPT_CLOSE} { BEGIN(regst); if (!global_scope()) return ENDED_IN_INNER_SCOPE; else return SCRIPT_ENDED; } +{HTML_TAG_SCRIPT_OPEN} { BEGIN(regst); RETURN(OPENING_TAG) } +{HTML_TAG_SCRIPT_CLOSE} { BEGIN(regst); if (!global_scope()) RETURN(ENDED_IN_INNER_SCOPE) else RETURN(SCRIPT_ENDED) } {HTML_COMMENT_OPEN} { BEGIN(lcomm); } {LINE_COMMENT_START} { BEGIN(lcomm); } {LINE_COMMENT_END1} { BEGIN(regst); newline_found = true; } {LINE_COMMENT_END2} { BEGIN(regst); newline_found = true; } -{LINE_COMMENT_END3} { BEGIN(regst); return OPENING_TAG; } -{LINE_COMMENT_END4} { BEGIN(regst); return CLOSING_TAG; } +{LINE_COMMENT_END3} { BEGIN(regst); RETURN(OPENING_TAG) } +{LINE_COMMENT_END4} { BEGIN(regst); RETURN(CLOSING_TAG) } {LINE_COMMENT_SKIP} { } -<> { states_apply(); return SCRIPT_CONTINUE; } +<> { RETURN(SCRIPT_CONTINUE) } {BLOCK_COMMENT_START} { BEGIN(bcomm); } {BLOCK_COMMENT_END1} { 
BEGIN(regst); } -{BLOCK_COMMENT_END2} { BEGIN(regst); return OPENING_TAG; } -{BLOCK_COMMENT_END3} { BEGIN(regst); return CLOSING_TAG; } +{BLOCK_COMMENT_END2} { BEGIN(regst); RETURN(OPENING_TAG) } +{BLOCK_COMMENT_END3} { BEGIN(regst); RETURN(CLOSING_TAG) } {BLOCK_COMMENT_LINE1} | {BLOCK_COMMENT_LINE2} { newline_found = true;} {BLOCK_COMMENT_SKIP} { } -<> { states_apply(); return SCRIPT_CONTINUE; } +<> { RETURN(SCRIPT_CONTINUE) } {LITERAL_DQ_STRING_START} { do_semicolon_insertion(ASI_GROUP_7); EXEC(do_spacing(LITERAL)) ECHO; BEGIN(dqstr); set_ident_norm(true); } {LITERAL_DQ_STRING_END} { ECHO; BEGIN(divop); } -{HTML_TAG_SCRIPT_CLOSE} { BEGIN(regst); return CLOSING_TAG; } +{HTML_TAG_SCRIPT_CLOSE} { BEGIN(regst); RETURN(CLOSING_TAG) } \\{CR}{LF} { } \\{LF} { } \\{CR} { } -{LINE_TERMINATORS} { BEGIN(regst); return BAD_TOKEN; } +{LINE_TERMINATORS} { BEGIN(regst); RETURN(BAD_TOKEN) } {LITERAL_DQ_STRING_SKIP} { ECHO; } {LITERAL_DQ_STRING_TEXT} { ECHO; } -<> { states_apply(); return SCRIPT_CONTINUE; } +<> { RETURN(SCRIPT_CONTINUE) } {LITERAL_SQ_STRING_START} { do_semicolon_insertion(ASI_GROUP_7); EXEC(do_spacing(LITERAL)) ECHO; BEGIN(sqstr); set_ident_norm(true); } {LITERAL_SQ_STRING_END} { ECHO; BEGIN(divop); } -{HTML_TAG_SCRIPT_CLOSE} { BEGIN(regst); return CLOSING_TAG; } +{HTML_TAG_SCRIPT_CLOSE} { BEGIN(regst); RETURN(CLOSING_TAG) } \\{CR}{LF} { } \\{LF} { } \\{CR} { } -{LINE_TERMINATORS} { BEGIN(regst); return BAD_TOKEN; } +{LINE_TERMINATORS} { BEGIN(regst); RETURN(BAD_TOKEN) } {LITERAL_SQ_STRING_SKIP} { ECHO; } {LITERAL_SQ_STRING_TEXT} { ECHO; } -<> { states_apply(); return SCRIPT_CONTINUE; } +<> { RETURN(SCRIPT_CONTINUE) } {LITERAL_TEMPLATE_START} { do_semicolon_insertion(ASI_GROUP_7); EXEC(do_spacing(LITERAL)) ECHO; BEGIN(tmpll); set_ident_norm(true); } (\\\\)*{LITERAL_TEMPLATE_END} { ECHO; BEGIN(divop); } (\\\\)*{LITERAL_TEMPLATE_SUBST_START} { EXEC(process_subst_open()) } -{HTML_TAG_SCRIPT_CLOSE} { BEGIN(regst); return CLOSING_TAG; } +{HTML_TAG_SCRIPT_CLOSE} { 
BEGIN(regst); RETURN(CLOSING_TAG) } (\\\\)*\\{LITERAL_TEMPLATE_SUBST_START} | /* escaped template substitution */ (\\\\)*\\{LITERAL_TEMPLATE_END} | /* escaped backtick */ {LITERAL_TEMPLATE_OTHER} { ECHO; } -<> { return SCRIPT_CONTINUE; } +<> { RETURN(SCRIPT_CONTINUE) } {LITERAL_REGEX_START} { do_semicolon_insertion(ASI_GROUP_7); EXEC(do_spacing(LITERAL)) yyout << '/'; states_correct(1); yyless(1); BEGIN(regex); set_ident_norm(true); } {LITERAL_REGEX_END} { ECHO; BEGIN(divop); } -{HTML_TAG_SCRIPT_CLOSE} { BEGIN(regst); return CLOSING_TAG; } +{HTML_TAG_SCRIPT_CLOSE} { BEGIN(regst); RETURN(CLOSING_TAG) } {LITERAL_REGEX_SKIP} { ECHO; } \\{LF} | \\{CR} | -{LINE_TERMINATORS} { BEGIN(regst); return BAD_TOKEN; } +{LINE_TERMINATORS} { BEGIN(regst); RETURN(BAD_TOKEN) } [^<{LF}{CR}{LS}{PS}\\\/]+ { ECHO; } -<> { states_apply(); return SCRIPT_CONTINUE; } +<> { RETURN(SCRIPT_CONTINUE) } {DIV_OPERATOR} | {DIV_ASSIGNMENT_OPERATOR} { previous_group = ASI_OTHER; ECHO; token = PUNCTUATOR; BEGIN(INITIAL); set_ident_norm(true); } @@ -1197,7 +1228,6 @@ JSTokenizer::JSRet JSTokenizer::eval_eof() } // Normal termination - states_apply(); return SCRIPT_CONTINUE; } @@ -1349,6 +1379,29 @@ JSTokenizer::JSRet JSTokenizer::process_subst_open() return scope_push(BRACES); } +void JSTokenizer::states_reset() +{ + if (tmp_buffer) + switch_to_initial(); + + brace_depth = {}; + token = UNDEFINED; + previous_group = ASI_OTHER; + + memset(states, 0, sizeof(states)); + + delete[] tmp_buf; + tmp_buf = nullptr; + tmp_buf_size = 0; + + output_steps_back = 0; + newline_found = false; + scope_stack = {}; + + scope_push(GLOBAL); + BEGIN(regst); +} + void JSTokenizer::states_push() { assert(yyleng != 0); @@ -1369,7 +1422,7 @@ void JSTokenizer::states_correct(int take_off) state.orig_len -= yyleng - take_off; } -void JSTokenizer::states_apply() +void JSTokenizer::states_over() { int tail_size = 0; int outbuf_pos = yyout.tellp(); diff --git a/src/utils/test/js_normalizer_test.cc 
b/src/utils/test/js_normalizer_test.cc index 99d16f19b..ea46db876 100644 --- a/src/utils/test/js_normalizer_test.cc +++ b/src/utils/test/js_normalizer_test.cc @@ -2217,6 +2217,44 @@ TEST_CASE("split in keyword", "[JSNormalizer]") } } +TEST_CASE("split and continuation combined", "[JSNormalizer]") +{ + SECTION("PDU 1 [cont] PDU 2 [end end cont end]") + { + const char src1[] = "a b" ""; + const char src2[] = "c d" ""; + const char src3[] = "" ""; + const char src4[] = "\n" ""; + + const char exp1[] = "var_0000 var_0001"; + const char exp2[] = "var_0000 var_0002 var_0003"; + const char exp3[] = "var_0000 var_0002 var_0003"; + const char exp4[] = "var_0000 var_0002 var_0003"; + + char dst1[sizeof(exp1)]; + char dst2[sizeof(exp2)]; + char dst3[sizeof(exp3)]; + char dst4[sizeof(exp4)]; + + JSIdentifierCtx ident_ctx(DEPTH, s_ident_built_in); + JSNormalizer norm(ident_ctx, DEPTH, MAX_TEMPLATE_NESTING, MAX_SCOPE_DEPTH); + + DO(src1, sizeof(src1) - 1, dst1, sizeof(dst1) - 1); + CHECK(!memcmp(exp1, dst1, sizeof(exp1) - 1)); + + TRY(src2, sizeof(src2) - 1, dst2, sizeof(dst2) - 1, JSTokenizer::SCRIPT_ENDED); + CHECK(!memcmp(exp2, dst2, sizeof(exp2) - 1)); + + TRY(src3, sizeof(src3) - 1, dst3, sizeof(dst3) - 1, JSTokenizer::SCRIPT_ENDED); + CHECK(!memcmp(exp3, dst3, sizeof(exp3) - 1)); + + DO(src4, sizeof(src4) - 1, dst4, sizeof(dst4) - 1); + CHECK(!memcmp(exp4, dst4, sizeof(exp4) - 1)); + + CLOSE(); + } +} + TEST_CASE("memcap", "[JSNormalizer]") { SECTION("3 tokens") @@ -3292,7 +3330,7 @@ TEST_CASE("built-in identifiers split", "[JSNormalizer]") static constexpr const char* s_closing_tag = ""; static const std::string make_input(const char* begin, const char* mid, - const char* end, size_t len) + const char* end, size_t len) { std::string s(begin); int fill = (len - strlen(begin) - strlen(end) - strlen(s_closing_tag)) / strlen(mid);