From: Oleksandr Serhiienko Date: Thu, 4 Aug 2022 09:51:17 +0000 (+0300) Subject: utils: fix JS split to reflect tokens correction and re-normalization X-Git-Tag: 3.1.39.0~1 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=d45988e1632cfbc407989f336276af548518188a;p=thirdparty%2Fsnort3.git utils: fix JS split to reflect tokens correction and re-normalization --- diff --git a/src/utils/js_tokenizer.h b/src/utils/js_tokenizer.h index 3bcb33cc4..697b76d49 100644 --- a/src/utils/js_tokenizer.h +++ b/src/utils/js_tokenizer.h @@ -341,7 +341,8 @@ private: JSToken token = UNDEFINED; // the token before int orig_len = 0; // current token original length int norm_len = 0; // normalized length of previous tokens - int sc = 0; // current Starting Condition (0 means NOT_SET) + int sc = 0; // current Starting Condition (0 means NOT_SET) + int correction = 0; // correction length } states[JSTOKENIZER_MAX_STATES]; int sp = 0; // points to the top of states int eof_sp = 0; // points to the last state before the EOF diff --git a/src/utils/js_tokenizer.l b/src/utils/js_tokenizer.l index 61db2e741..2a6820968 100644 --- a/src/utils/js_tokenizer.l +++ b/src/utils/js_tokenizer.l @@ -1702,6 +1702,9 @@ void JSTokenizer::states_correct(int take_off) bytes_read -= delta; state.orig_len -= delta; + state.correction = take_off; + + yyless(take_off); } void JSTokenizer::states_over() @@ -1766,6 +1769,7 @@ bool JSTokenizer::states_process() state.orig_len = yyleng; state.norm_len = yyout.rdbuf()->pubseekoff(0, std::ios_base::cur, std::ios_base::out); state.sc = yy_start; + state.correction = 0; return true; } @@ -1781,6 +1785,16 @@ bool JSTokenizer::states_process() // Update parsing state every match else if (bytes_skip > 0) { + // if the state was corrected, reflect this during the parsing + if (auto correction = states[sp].correction) + { + auto delta = yyleng - correction; + bytes_skip += delta; + bytes_read -= delta; + + yyless(correction); + } + do { ++sp; sp %= JSTOKENIZER_MAX_STATES; } while (states[sp].sc == 0); @@ -1804,6 +1818,7 @@ bool JSTokenizer::states_process() state.orig_len = yyleng; state.norm_len = yyout.rdbuf()->pubseekoff(0, std::ios_base::cur, std::ios_base::out); state.sc = yy_start; + state.correction = 0; return true; } @@ -2316,7 +2331,6 @@ JSTokenizer::JSRet JSTokenizer::literal_regex_start() EXEC(do_spacing(LITERAL)) yyout << '/'; states_correct(1); - yyless(1); BEGIN(regex); set_ident_norm(true); regex_stack = VStack(); @@ -2957,7 +2971,6 @@ void JSTokenizer::explicit_otag() // discard match of the script tag and scan again without leading '<' states_correct(1); - yyless(1); // process leading '<' as a comparison operator operator_comparison(); diff --git a/src/utils/test/js_normalizer_test.cc b/src/utils/test/js_normalizer_test.cc index 7f6cd4218..c32c87e2a 100644 --- a/src/utils/test/js_normalizer_test.cc +++ b/src/utils/test/js_normalizer_test.cc @@ -2457,6 +2457,39 @@ TEST_CASE("split between tokens", "[JSNormalizer]") const char exp2[] = "