From: Mike Stepanek (mstepane) Date: Tue, 23 Nov 2021 19:10:01 +0000 (+0000) Subject: Pull request #3174: Switch FlexLexer to batch mode. X-Git-Tag: 3.1.18.0~16 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=1211f5dea38fecff07c77e48f85706fe413e1939;p=thirdparty%2Fsnort3.git Pull request #3174: Switch FlexLexer to batch mode. Merge in SNORT/snort3 from ~OSHUMEIK/snort3:flex_batch to master Squashed commit of the following: commit 4cb787d5a367bb775fee452a828d8cfc67c78b43 Author: Oleksii Shumeiko Date: Fri Nov 12 15:59:53 2021 +0200 utils: do output adjustment in case of carryover commit facc72c26fd8d001effa2970579eee9c5705dd23 Author: Oleksii Shumeiko Date: Mon Oct 11 17:13:06 2021 +0300 utils: enable batch mode for Flex New options engaged: -Caf -8 'batch' 'never-interactive' --- diff --git a/src/utils/CMakeLists.txt b/src/utils/CMakeLists.txt index 8910de6b5..4b3b61346 100644 --- a/src/utils/CMakeLists.txt +++ b/src/utils/CMakeLists.txt @@ -21,7 +21,7 @@ set( UTIL_INCLUDES FLEX_TARGET ( js_tokenizer ${CMAKE_CURRENT_SOURCE_DIR}/js_tokenizer.l ${CMAKE_CURRENT_BINARY_DIR}/js_tokenizer.cc - COMPILE_FLAGS -Ca + COMPILE_FLAGS "-Caf -8" ) add_library ( utils OBJECT diff --git a/src/utils/js_normalizer.cc b/src/utils/js_normalizer.cc index 781092fcf..639ee7b19 100644 --- a/src/utils/js_normalizer.cc +++ b/src/utils/js_normalizer.cc @@ -51,6 +51,14 @@ JSNormalizer::~JSNormalizer() JSTokenizer::JSRet JSNormalizer::normalize(const char* src, size_t src_len) { + assert(src); + + if (src_len == 0) + { + src_next = src; + return JSTokenizer::SCRIPT_CONTINUE; + } + if (rem_bytes == 0 && !unlim) { debug_log(5, http_trace, TRACE_JS_PROC, nullptr, @@ -71,12 +79,16 @@ JSTokenizer::JSRet JSNormalizer::normalize(const char* src, size_t src_len) ->pubsetbuf(const_cast(src), len); out_buf.reserve(src_len * BUFF_EXP_FACTOR); - tokenizer.pre_yylex(); + size_t t_bytes = in_buf.last_chunk_offset(); + tokenizer.pre_yylex(t_bytes != 0); + JSTokenizer::JSRet ret = static_cast(tokenizer.yylex()); in.clear(); out.clear(); - size_t r_bytes = in_buf.last_chunk_offset(); + size_t r_bytes = tokenizer.get_bytes_read(); + r_bytes = max(r_bytes, t_bytes) - t_bytes; + if (!unlim) rem_bytes -= r_bytes; src_next = src + r_bytes; diff --git a/src/utils/js_tokenizer.h b/src/utils/js_tokenizer.h index f7e6bc754..fbfb22173 100644 --- a/src/utils/js_tokenizer.h +++ b/src/utils/js_tokenizer.h @@ -117,11 +117,14 @@ public: ~JSTokenizer() override; // internal actions before calling main loop - void pre_yylex(); + void pre_yylex(bool adjust_output = false); // returns JSRet int yylex() override; + size_t get_bytes_read() + { auto r = bytes_read; bytes_read = 0; return r; } + protected: [[noreturn]] void LexerError(const char* msg) override { snort::FatalError("%s", msg); } @@ -164,6 +167,8 @@ private: JSToken token = UNDEFINED; ASIGroup previous_group = ASI_OTHER; JSIdentifierCtxBase& ident_ctx; + size_t bytes_read; + size_t tmp_bytes_read; struct { diff --git a/src/utils/js_tokenizer.l b/src/utils/js_tokenizer.l index 9ae385393..649f78abd 100644 --- a/src/utils/js_tokenizer.l +++ b/src/utils/js_tokenizer.l @@ -25,6 +25,10 @@ /* Generate C++ scanner */ %option c++ +%option batch + +%option never-interactive + %{ #ifdef HAVE_CONFIG_H @@ -1174,6 +1178,7 @@ JSTokenizer::JSTokenizer(std::istream& in, std::ostream& out, : yyFlexLexer(in, out), max_template_nesting(max_template_nesting), ident_ctx(mapper), + bytes_read(0), tmp_buf(buf), tmp_buf_size(buf_size), tmp_cap_size(cap_size), @@ -1192,10 +1197,14 @@ 
JSTokenizer::~JSTokenizer() tmp_buf_size = 0; } -void JSTokenizer::pre_yylex() +void JSTokenizer::pre_yylex(bool adjust_output) { assert(output_steps_back >= 0); - yyout.seekp(-output_steps_back, std::ios_base::cur); + + if (adjust_output) + yyout.seekp(-output_steps_back, std::ios_base::cur); + + yy_flush_buffer(YY_CURRENT_BUFFER); } void JSTokenizer::switch_to_temporal(const std::string& data) @@ -1204,6 +1213,8 @@ void JSTokenizer::switch_to_temporal(const std::string& data) cur_buffer = YY_CURRENT_BUFFER; tmp_buffer = yy_create_buffer(tmp, data.size()); yy_switch_to_buffer((YY_BUFFER_STATE)tmp_buffer); + + tmp_bytes_read = bytes_read; } void JSTokenizer::switch_to_initial() @@ -1211,6 +1222,8 @@ void JSTokenizer::switch_to_initial() yy_switch_to_buffer((YY_BUFFER_STATE)cur_buffer); yy_delete_buffer((YY_BUFFER_STATE)tmp_buffer); tmp_buffer = nullptr; + + bytes_read = tmp_bytes_read; } // A return value of this method uses to terminate the scanner @@ -1404,7 +1417,10 @@ void JSTokenizer::states_reset() void JSTokenizer::states_push() { - assert(yyleng != 0); + if (!yyleng) + return; + + bytes_read += yyleng; sp++; sp %= JSTOKENIZER_MAX_STATES; @@ -1418,8 +1434,11 @@ void JSTokenizer::states_push() void JSTokenizer::states_correct(int take_off) { + auto delta = yyleng - take_off; auto& state = states[sp]; - state.orig_len -= yyleng - take_off; + + bytes_read -= delta; + state.orig_len -= delta; } void JSTokenizer::states_over() diff --git a/src/utils/streambuf.cc b/src/utils/streambuf.cc index ff46f9653..66ac96868 100644 --- a/src/utils/streambuf.cc +++ b/src/utils/streambuf.cc @@ -42,9 +42,10 @@ istreambuf_glue::istreambuf_glue() : streamsize istreambuf_glue::last_chunk_offset() const { - auto c = gptr(); - auto b = eback(); - return last_chunk() ? 
c - b : 0; + if (chunks.empty()) + return 0; + + return get<2>(chunks.back()); } streambuf* istreambuf_glue::setbuf(char* s, streamsize n) diff --git a/src/utils/test/js_normalizer_test.cc b/src/utils/test/js_normalizer_test.cc index ea46db876..77a606f2c 100644 --- a/src/utils/test/js_normalizer_test.cc +++ b/src/utils/test/js_normalizer_test.cc @@ -3358,75 +3358,87 @@ static JSTokenizer::JSRet norm_ret(JSNormalizer& normalizer, const std::string& return normalizer.normalize(input.c_str(), input.size()); } -TEST_CASE("benchmarking - ::normalize() - literals", "[JSNormalizer]") +TEST_CASE("JS Normalizer, literals by 8 K", "[JSNormalizer]") { JSIdentifierCtxTest ident_ctx; JSNormalizer normalizer(ident_ctx, UNLIM_DEPTH, MAX_TEMPLATE_NESTING, MAX_SCOPE_DEPTH); char dst[DEPTH]; - auto whitespace = make_input("", " ", "", DEPTH); - auto block_comment = make_input("/*", " ", "*/", DEPTH); - auto double_quote = make_input("\"", " ", "\"", DEPTH); - BENCHMARK("memcpy - whitespaces - 65535 bytes") + constexpr size_t size = 1 << 13; + + auto data_pl = make_input("", ".", "", size); + auto data_ws = make_input("", " ", "", size); + auto data_bc = make_input("/*", " ", "*/", size); + auto data_dq = make_input("\"", " ", "\"", size); + + BENCHMARK("memcpy()") { - return memcpy(dst, whitespace.c_str(), whitespace.size()); + return memcpy(dst, data_pl.c_str(), data_pl.size()); }; - REQUIRE(norm_ret(normalizer, whitespace) == JSTokenizer::SCRIPT_ENDED); - BENCHMARK("whitespaces - 65535 bytes") + REQUIRE(norm_ret(normalizer, data_ws) == JSTokenizer::SCRIPT_ENDED); + BENCHMARK("whitespaces") { normalizer.rewind_output(); - return normalizer.normalize(whitespace.c_str(), whitespace.size()); + return normalizer.normalize(data_ws.c_str(), data_ws.size()); }; - REQUIRE(norm_ret(normalizer, block_comment) == JSTokenizer::SCRIPT_ENDED); - BENCHMARK("block comment - 65535 bytes") + REQUIRE(norm_ret(normalizer, data_bc) == JSTokenizer::SCRIPT_ENDED); + BENCHMARK("block comment") { normalizer.rewind_output(); - return normalizer.normalize(block_comment.c_str(), block_comment.size()); + return normalizer.normalize(data_bc.c_str(), data_bc.size()); }; - REQUIRE(norm_ret(normalizer, double_quote) == JSTokenizer::SCRIPT_ENDED); - BENCHMARK("double quotes string - 65535 bytes") + REQUIRE(norm_ret(normalizer, data_dq) == JSTokenizer::SCRIPT_ENDED); + BENCHMARK("double quotes string") { normalizer.rewind_output(); - return normalizer.normalize(double_quote.c_str(), double_quote.size()); + return normalizer.normalize(data_dq.c_str(), data_dq.size()); }; +} - constexpr size_t depth_8k = 8192; +TEST_CASE("JS Normalizer, literals by 64 K", "[JSNormalizer]") +{ + JSIdentifierCtxTest ident_ctx; + JSNormalizer normalizer(ident_ctx, UNLIM_DEPTH, MAX_TEMPLATE_NESTING, MAX_SCOPE_DEPTH); + char dst[DEPTH]; + + constexpr size_t size = 1 << 16; - auto whitespace_8k = make_input("", " ", "", depth_8k); - auto block_comment_8k = make_input("/*", " ", "*/", depth_8k); - auto double_quote_8k = make_input("\"", " ", "\"", depth_8k); + auto data_pl = make_input("", ".", "", size); + auto data_ws = make_input("", " ", "", size); + auto data_bc = make_input("/*", " ", "*/", size); + auto data_dq = make_input("\"", " ", "\"", size); - BENCHMARK("memcpy - whitespaces - 8192 bytes") + BENCHMARK("memcpy()") { - return memcpy(dst, whitespace_8k.c_str(), whitespace_8k.size()); + return memcpy(dst, data_pl.c_str(), data_pl.size()); }; - REQUIRE(norm_ret(normalizer, whitespace_8k) == JSTokenizer::SCRIPT_ENDED); - BENCHMARK("whitespaces - 8192 bytes") 
+ REQUIRE(norm_ret(normalizer, data_ws) == JSTokenizer::SCRIPT_ENDED); + BENCHMARK("whitespaces") { normalizer.rewind_output(); - return normalizer.normalize(whitespace_8k.c_str(), whitespace_8k.size()); + return normalizer.normalize(data_ws.c_str(), data_ws.size()); }; - REQUIRE(norm_ret(normalizer, block_comment_8k) == JSTokenizer::SCRIPT_ENDED); - BENCHMARK("block comment - 8192 bytes") + REQUIRE(norm_ret(normalizer, data_bc) == JSTokenizer::SCRIPT_ENDED); + BENCHMARK("block comment") { normalizer.rewind_output(); - return normalizer.normalize(block_comment_8k.c_str(), block_comment_8k.size()); + return normalizer.normalize(data_bc.c_str(), data_bc.size()); }; - REQUIRE(norm_ret(normalizer, double_quote_8k) == JSTokenizer::SCRIPT_ENDED); - BENCHMARK("double quotes string - 8192 bytes") + REQUIRE(norm_ret(normalizer, data_dq) == JSTokenizer::SCRIPT_ENDED); + BENCHMARK("double quotes string") { normalizer.rewind_output(); - return normalizer.normalize(double_quote_8k.c_str(), double_quote_8k.size()); + return normalizer.normalize(data_dq.c_str(), data_dq.size()); }; } -TEST_CASE("benchmarking - ::normalize() - identifiers", "[JSNormalizer]") +TEST_CASE("JS Normalizer, id normalization", "[JSNormalizer]") { // around 11 000 identifiers std::string input; @@ -3471,53 +3483,47 @@ TEST_CASE("benchmarking - ::normalize() - identifiers", "[JSNormalizer]") }; } -TEST_CASE("benchmarking - ::normalize() - scope", "[JSNormalizer]") +TEST_CASE("JS Normalizer, scope tracking", "[JSNormalizer]") { constexpr uint32_t depth = 65535; JSIdentifierCtxTest ident_ctx; JSNormalizer normalizer(ident_ctx, UNLIM_DEPTH, MAX_TEMPLATE_NESTING, depth); - char dst[depth]; auto src_ws = make_input("", " ", "", depth); auto src_brace_rep = make_input_repeat("{}", depth); auto src_paren_rep = make_input_repeat("()", depth); auto src_bracket_rep = make_input_repeat("[]", depth); - BENCHMARK("memcpy - ...{}{}{}... - 65535 bytes") - { - return memcpy(dst, src_brace_rep.c_str(), src_brace_rep.size()); - }; - REQUIRE(norm_ret(normalizer, src_ws) == JSTokenizer::SCRIPT_ENDED); - BENCHMARK("whitespaces - 65535 bytes") + BENCHMARK("whitespaces") { normalizer.rewind_output(); return normalizer.normalize(src_ws.c_str(), src_ws.size()); }; REQUIRE(norm_ret(normalizer, src_brace_rep) == JSTokenizer::SCRIPT_ENDED); - BENCHMARK("...{}{}{}... - 65535 bytes") + BENCHMARK("...{}{}{}...") { normalizer.rewind_output(); return normalizer.normalize(src_brace_rep.c_str(), src_brace_rep.size()); }; REQUIRE(norm_ret(normalizer, src_paren_rep) == JSTokenizer::SCRIPT_ENDED); - BENCHMARK("...()()()... - 65535 bytes") + BENCHMARK("...()()()...") { normalizer.rewind_output(); return normalizer.normalize(src_paren_rep.c_str(), src_paren_rep.size()); }; REQUIRE(norm_ret(normalizer, src_bracket_rep) == JSTokenizer::SCRIPT_ENDED); - BENCHMARK("...[][][]... 
- 65535 bytes") + BENCHMARK("...[][][]...") { normalizer.rewind_output(); return normalizer.normalize(src_bracket_rep.c_str(), src_bracket_rep.size()); }; } -TEST_CASE("benchmarking - ::normalize() - automatic semicolon insertion") +TEST_CASE("JS Normalizer, automatic semicolon", "[JSNormalizer]") { auto w_semicolons = make_input("", "a;\n", "", DEPTH); auto wo_semicolons = make_input("", "a \n", "", DEPTH); diff --git a/src/utils/test/streambuf_test.cc b/src/utils/test/streambuf_test.cc index d9285b83a..8dae9d794 100644 --- a/src/utils/test/streambuf_test.cc +++ b/src/utils/test/streambuf_test.cc @@ -132,6 +132,12 @@ using namespace std; CHECK(!memcmp((exp), act, (exp_len))); \ } +#define BYTES_READ(s, b) \ + ((s).clear(), \ + std::max((std::streamsize)(s).tellg(), \ + (std::streamsize)(b).last_chunk_offset()) \ + - (b).last_chunk_offset()) + TEST_CASE("input buffer - basic one source", "[Stream buffers]") { const char* exp = "Early bird gets a corn."; @@ -1349,16 +1355,20 @@ TEST_CASE("input stream - last chunk offset", "[Stream buffers]") CHECK(0 == b.last_chunk_offset()); s.read(act, 1); - CHECK(1 == b.last_chunk_offset()); + CHECK(0 == b.last_chunk_offset()); + CHECK(1 == BYTES_READ(s, b)); s.read(act, 2); - CHECK(3 == b.last_chunk_offset()); + CHECK(0 == b.last_chunk_offset()); + CHECK(3 == BYTES_READ(s, b)); s.read(act, 5); - CHECK(8 == b.last_chunk_offset()); + CHECK(0 == b.last_chunk_offset()); + CHECK(8 == BYTES_READ(s, b)); s.read(act, 1); - CHECK(8 == b.last_chunk_offset()); + CHECK(0 == b.last_chunk_offset()); + CHECK(8 == BYTES_READ(s, b)); } SECTION("two buffers") @@ -1373,22 +1383,28 @@ TEST_CASE("input stream - last chunk offset", "[Stream buffers]") CHECK(0 == b.last_chunk_offset()); b.pubsetbuf(dat1, strlen(dat1))->pubsetbuf(dat2, strlen(dat2)); - CHECK(0 == b.last_chunk_offset()); + CHECK(4 == b.last_chunk_offset()); + CHECK(0 == BYTES_READ(s, b)); s.read(act, 1); - CHECK(0 == b.last_chunk_offset()); + CHECK(4 == b.last_chunk_offset()); + CHECK(0 == BYTES_READ(s, b)); s.read(act, 1); - CHECK(0 == b.last_chunk_offset()); + CHECK(4 == b.last_chunk_offset()); + CHECK(0 == BYTES_READ(s, b)); s.read(act, 4); - CHECK(2 == b.last_chunk_offset()); + CHECK(4 == b.last_chunk_offset()); + CHECK(2 == BYTES_READ(s, b)); s.read(act, 2); CHECK(4 == b.last_chunk_offset()); + CHECK(4 == BYTES_READ(s, b)); s.read(act, 1); CHECK(4 == b.last_chunk_offset()); + CHECK(4 == BYTES_READ(s, b)); } SECTION("three buffers") @@ -1404,22 +1420,28 @@ TEST_CASE("input stream - last chunk offset", "[Stream buffers]") CHECK(0 == b.last_chunk_offset()); b.pubsetbuf(dat1, strlen(dat1))->pubsetbuf(dat2, strlen(dat2))->pubsetbuf(dat3, strlen(dat3)); - CHECK(0 == b.last_chunk_offset()); + CHECK(8 == b.last_chunk_offset()); + CHECK(0 == BYTES_READ(s, b)); s.read(act, 3); - CHECK(0 == b.last_chunk_offset()); + CHECK(8 == b.last_chunk_offset()); + CHECK(0 == BYTES_READ(s, b)); s.read(act, 3); - CHECK(0 == b.last_chunk_offset()); + CHECK(8 == b.last_chunk_offset()); + CHECK(0 == BYTES_READ(s, b)); s.read(act, 3); - CHECK(1 == b.last_chunk_offset()); + CHECK(8 == b.last_chunk_offset()); + CHECK(1 == BYTES_READ(s, b)); s.read(act, 3); - CHECK(4 == b.last_chunk_offset()); + CHECK(8 == b.last_chunk_offset()); + CHECK(4 == BYTES_READ(s, b)); s.read(act, 1); - CHECK(4 == b.last_chunk_offset()); + CHECK(8 == b.last_chunk_offset()); + CHECK(4 == BYTES_READ(s, b)); } }