From: Mike Stepanek (mstepane) Date: Tue, 23 Nov 2021 19:10:01 +0000 (+0000) Subject: Pull request #3174: Switch FlexLexer to batch mode. X-Git-Tag: 3.1.18.0~16 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=1211f5dea38fecff07c77e48f85706fe413e1939;p=thirdparty%2Fsnort3.git Pull request #3174: Switch FlexLexer to batch mode. Merge in SNORT/snort3 from ~OSHUMEIK/snort3:flex_batch to master Squashed commit of the following: commit 4cb787d5a367bb775fee452a828d8cfc67c78b43 Author: Oleksii Shumeiko Date: Fri Nov 12 15:59:53 2021 +0200 utils: do output adjustment in case of carryover commit facc72c26fd8d001effa2970579eee9c5705dd23 Author: Oleksii Shumeiko Date: Mon Oct 11 17:13:06 2021 +0300 utils: enable batch mode for Flex New options engaged: -Caf -8 'batch' 'never-interactive' --- diff --git a/src/utils/CMakeLists.txt b/src/utils/CMakeLists.txt index 8910de6b5..4b3b61346 100644 --- a/src/utils/CMakeLists.txt +++ b/src/utils/CMakeLists.txt @@ -21,7 +21,7 @@ set( UTIL_INCLUDES FLEX_TARGET ( js_tokenizer ${CMAKE_CURRENT_SOURCE_DIR}/js_tokenizer.l ${CMAKE_CURRENT_BINARY_DIR}/js_tokenizer.cc - COMPILE_FLAGS -Ca + COMPILE_FLAGS "-Caf -8" ) add_library ( utils OBJECT diff --git a/src/utils/js_normalizer.cc b/src/utils/js_normalizer.cc index 781092fcf..639ee7b19 100644 --- a/src/utils/js_normalizer.cc +++ b/src/utils/js_normalizer.cc @@ -51,6 +51,14 @@ JSNormalizer::~JSNormalizer() JSTokenizer::JSRet JSNormalizer::normalize(const char* src, size_t src_len) { + assert(src); + + if (src_len == 0) + { + src_next = src; + return JSTokenizer::SCRIPT_CONTINUE; + } + if (rem_bytes == 0 && !unlim) { debug_log(5, http_trace, TRACE_JS_PROC, nullptr, @@ -71,12 +79,16 @@ JSTokenizer::JSRet JSNormalizer::normalize(const char* src, size_t src_len) ->pubsetbuf(const_cast(src), len); out_buf.reserve(src_len * BUFF_EXP_FACTOR); - tokenizer.pre_yylex(); + size_t t_bytes = in_buf.last_chunk_offset(); + tokenizer.pre_yylex(t_bytes != 0); + JSTokenizer::JSRet ret = static_cast(tokenizer.yylex()); in.clear(); out.clear(); - size_t r_bytes = in_buf.last_chunk_offset(); + size_t r_bytes = tokenizer.get_bytes_read(); + r_bytes = max(r_bytes, t_bytes) - t_bytes; + if (!unlim) rem_bytes -= r_bytes; src_next = src + r_bytes; diff --git a/src/utils/js_tokenizer.h b/src/utils/js_tokenizer.h index f7e6bc754..fbfb22173 100644 --- a/src/utils/js_tokenizer.h +++ b/src/utils/js_tokenizer.h @@ -117,11 +117,14 @@ public: ~JSTokenizer() override; // internal actions before calling main loop - void pre_yylex(); + void pre_yylex(bool adjust_output = false); // returns JSRet int yylex() override; + size_t get_bytes_read() + { auto r = bytes_read; bytes_read = 0; return r; } + protected: [[noreturn]] void LexerError(const char* msg) override { snort::FatalError("%s", msg); } @@ -164,6 +167,8 @@ private: JSToken token = UNDEFINED; ASIGroup previous_group = ASI_OTHER; JSIdentifierCtxBase& ident_ctx; + size_t bytes_read; + size_t tmp_bytes_read; struct { diff --git a/src/utils/js_tokenizer.l b/src/utils/js_tokenizer.l index 9ae385393..649f78abd 100644 --- a/src/utils/js_tokenizer.l +++ b/src/utils/js_tokenizer.l @@ -25,6 +25,10 @@ /* Generate C++ scanner */ %option c++ +%option batch + +%option never-interactive + %{ #ifdef HAVE_CONFIG_H @@ -1174,6 +1178,7 @@ JSTokenizer::JSTokenizer(std::istream& in, std::ostream& out, : yyFlexLexer(in, out), max_template_nesting(max_template_nesting), ident_ctx(mapper), + bytes_read(0), tmp_buf(buf), tmp_buf_size(buf_size), tmp_cap_size(cap_size), @@ -1192,10 +1197,14 @@ 
JSTokenizer::~JSTokenizer() tmp_buf_size = 0; } -void JSTokenizer::pre_yylex() +void JSTokenizer::pre_yylex(bool adjust_output) { assert(output_steps_back >= 0); - yyout.seekp(-output_steps_back, std::ios_base::cur); + + if (adjust_output) + yyout.seekp(-output_steps_back, std::ios_base::cur); + + yy_flush_buffer(YY_CURRENT_BUFFER); } void JSTokenizer::switch_to_temporal(const std::string& data) @@ -1204,6 +1213,8 @@ void JSTokenizer::switch_to_temporal(const std::string& data) cur_buffer = YY_CURRENT_BUFFER; tmp_buffer = yy_create_buffer(tmp, data.size()); yy_switch_to_buffer((YY_BUFFER_STATE)tmp_buffer); + + tmp_bytes_read = bytes_read; } void JSTokenizer::switch_to_initial() @@ -1211,6 +1222,8 @@ void JSTokenizer::switch_to_initial() yy_switch_to_buffer((YY_BUFFER_STATE)cur_buffer); yy_delete_buffer((YY_BUFFER_STATE)tmp_buffer); tmp_buffer = nullptr; + + bytes_read = tmp_bytes_read; } // A return value of this method uses to terminate the scanner @@ -1404,7 +1417,10 @@ void JSTokenizer::states_reset() void JSTokenizer::states_push() { - assert(yyleng != 0); + if (!yyleng) + return; + + bytes_read += yyleng; sp++; sp %= JSTOKENIZER_MAX_STATES; @@ -1418,8 +1434,11 @@ void JSTokenizer::states_push() void JSTokenizer::states_correct(int take_off) { + auto delta = yyleng - take_off; auto& state = states[sp]; - state.orig_len -= yyleng - take_off; + + bytes_read -= delta; + state.orig_len -= delta; } void JSTokenizer::states_over() diff --git a/src/utils/streambuf.cc b/src/utils/streambuf.cc index ff46f9653..66ac96868 100644 --- a/src/utils/streambuf.cc +++ b/src/utils/streambuf.cc @@ -42,9 +42,10 @@ istreambuf_glue::istreambuf_glue() : streamsize istreambuf_glue::last_chunk_offset() const { - auto c = gptr(); - auto b = eback(); - return last_chunk() ? 
c - b : 0; + if (chunks.empty()) + return 0; + + return get<2>(chunks.back()); } streambuf* istreambuf_glue::setbuf(char* s, streamsize n) diff --git a/src/utils/test/js_normalizer_test.cc b/src/utils/test/js_normalizer_test.cc index ea46db876..77a606f2c 100644 --- a/src/utils/test/js_normalizer_test.cc +++ b/src/utils/test/js_normalizer_test.cc @@ -3358,75 +3358,87 @@ static JSTokenizer::JSRet norm_ret(JSNormalizer& normalizer, const std::string& return normalizer.normalize(input.c_str(), input.size()); } -TEST_CASE("benchmarking - ::normalize() - literals", "[JSNormalizer]") +TEST_CASE("JS Normalizer, literals by 8 K", "[JSNormalizer]") { JSIdentifierCtxTest ident_ctx; JSNormalizer normalizer(ident_ctx, UNLIM_DEPTH, MAX_TEMPLATE_NESTING, MAX_SCOPE_DEPTH); char dst[DEPTH]; - auto whitespace = make_input("", " ", "", DEPTH); - auto block_comment = make_input("/*", " ", "*/", DEPTH); - auto double_quote = make_input("\"", " ", "\"", DEPTH); - BENCHMARK("memcpy - whitespaces - 65535 bytes") + constexpr size_t size = 1 << 13; + + auto data_pl = make_input("", ".", "", size); + auto data_ws = make_input("", " ", "", size); + auto data_bc = make_input("/*", " ", "*/", size); + auto data_dq = make_input("\"", " ", "\"", size); + + BENCHMARK("memcpy()") { - return memcpy(dst, whitespace.c_str(), whitespace.size()); + return memcpy(dst, data_pl.c_str(), data_pl.size()); }; - REQUIRE(norm_ret(normalizer, whitespace) == JSTokenizer::SCRIPT_ENDED); - BENCHMARK("whitespaces - 65535 bytes") + REQUIRE(norm_ret(normalizer, data_ws) == JSTokenizer::SCRIPT_ENDED); + BENCHMARK("whitespaces") { normalizer.rewind_output(); - return normalizer.normalize(whitespace.c_str(), whitespace.size()); + return normalizer.normalize(data_ws.c_str(), data_ws.size()); }; - REQUIRE(norm_ret(normalizer, block_comment) == JSTokenizer::SCRIPT_ENDED); - BENCHMARK("block comment - 65535 bytes") + REQUIRE(norm_ret(normalizer, data_bc) == JSTokenizer::SCRIPT_ENDED); + BENCHMARK("block comment") { normalizer.rewind_output(); - return normalizer.normalize(block_comment.c_str(), block_comment.size()); + return normalizer.normalize(data_bc.c_str(), data_bc.size()); }; - REQUIRE(norm_ret(normalizer, double_quote) == JSTokenizer::SCRIPT_ENDED); - BENCHMARK("double quotes string - 65535 bytes") + REQUIRE(norm_ret(normalizer, data_dq) == JSTokenizer::SCRIPT_ENDED); + BENCHMARK("double quotes string") { normalizer.rewind_output(); - return normalizer.normalize(double_quote.c_str(), double_quote.size()); + return normalizer.normalize(data_dq.c_str(), data_dq.size()); }; +} - constexpr size_t depth_8k = 8192; +TEST_CASE("JS Normalizer, literals by 64 K", "[JSNormalizer]") +{ + JSIdentifierCtxTest ident_ctx; + JSNormalizer normalizer(ident_ctx, UNLIM_DEPTH, MAX_TEMPLATE_NESTING, MAX_SCOPE_DEPTH); + char dst[DEPTH]; + + constexpr size_t size = 1 << 16; - auto whitespace_8k = make_input("", " ", "", depth_8k); - auto block_comment_8k = make_input("/*", " ", "*/", depth_8k); - auto double_quote_8k = make_input("\"", " ", "\"", depth_8k); + auto data_pl = make_input("", ".", "", size); + auto data_ws = make_input("", " ", "", size); + auto data_bc = make_input("/*", " ", "*/", size); + auto data_dq = make_input("\"", " ", "\"", size); - BENCHMARK("memcpy - whitespaces - 8192 bytes") + BENCHMARK("memcpy()") { - return memcpy(dst, whitespace_8k.c_str(), whitespace_8k.size()); + return memcpy(dst, data_pl.c_str(), data_pl.size()); }; - REQUIRE(norm_ret(normalizer, whitespace_8k) == JSTokenizer::SCRIPT_ENDED); - BENCHMARK("whitespaces - 8192 bytes") 
+ REQUIRE(norm_ret(normalizer, data_ws) == JSTokenizer::SCRIPT_ENDED); + BENCHMARK("whitespaces") { normalizer.rewind_output(); - return normalizer.normalize(whitespace_8k.c_str(), whitespace_8k.size()); + return normalizer.normalize(data_ws.c_str(), data_ws.size()); }; - REQUIRE(norm_ret(normalizer, block_comment_8k) == JSTokenizer::SCRIPT_ENDED); - BENCHMARK("block comment - 8192 bytes") + REQUIRE(norm_ret(normalizer, data_bc) == JSTokenizer::SCRIPT_ENDED); + BENCHMARK("block comment") { normalizer.rewind_output(); - return normalizer.normalize(block_comment_8k.c_str(), block_comment_8k.size()); + return normalizer.normalize(data_bc.c_str(), data_bc.size()); }; - REQUIRE(norm_ret(normalizer, double_quote_8k) == JSTokenizer::SCRIPT_ENDED); - BENCHMARK("double quotes string - 8192 bytes") + REQUIRE(norm_ret(normalizer, data_dq) == JSTokenizer::SCRIPT_ENDED); + BENCHMARK("double quotes string") { normalizer.rewind_output(); - return normalizer.normalize(double_quote_8k.c_str(), double_quote_8k.size()); + return normalizer.normalize(data_dq.c_str(), data_dq.size()); }; } -TEST_CASE("benchmarking - ::normalize() - identifiers", "[JSNormalizer]") +TEST_CASE("JS Normalizer, id normalization", "[JSNormalizer]") { // around 11 000 identifiers std::string input; @@ -3471,53 +3483,47 @@ TEST_CASE("benchmarking - ::normalize() - identifiers", "[JSNormalizer]") }; } -TEST_CASE("benchmarking - ::normalize() - scope", "[JSNormalizer]") +TEST_CASE("JS Normalizer, scope tracking", "[JSNormalizer]") { constexpr uint32_t depth = 65535; JSIdentifierCtxTest ident_ctx; JSNormalizer normalizer(ident_ctx, UNLIM_DEPTH, MAX_TEMPLATE_NESTING, depth); - char dst[depth]; auto src_ws = make_input("", " ", "", depth); auto src_brace_rep = make_input_repeat("{}", depth); auto src_paren_rep = make_input_repeat("()", depth); auto src_bracket_rep = make_input_repeat("[]", depth); - BENCHMARK("memcpy - ...{}{}{}... - 65535 bytes") - { - return memcpy(dst, src_brace_rep.c_str(), src_brace_rep.size()); - }; - REQUIRE(norm_ret(normalizer, src_ws) == JSTokenizer::SCRIPT_ENDED); - BENCHMARK("whitespaces - 65535 bytes") + BENCHMARK("whitespaces") { normalizer.rewind_output(); return normalizer.normalize(src_ws.c_str(), src_ws.size()); }; REQUIRE(norm_ret(normalizer, src_brace_rep) == JSTokenizer::SCRIPT_ENDED); - BENCHMARK("...{}{}{}... - 65535 bytes") + BENCHMARK("...{}{}{}...") { normalizer.rewind_output(); return normalizer.normalize(src_brace_rep.c_str(), src_brace_rep.size()); }; REQUIRE(norm_ret(normalizer, src_paren_rep) == JSTokenizer::SCRIPT_ENDED); - BENCHMARK("...()()()... - 65535 bytes") + BENCHMARK("...()()()...") { normalizer.rewind_output(); return normalizer.normalize(src_paren_rep.c_str(), src_paren_rep.size()); }; REQUIRE(norm_ret(normalizer, src_bracket_rep) == JSTokenizer::SCRIPT_ENDED); - BENCHMARK("...[][][]... 
- 65535 bytes") + BENCHMARK("...[][][]...") { normalizer.rewind_output(); return normalizer.normalize(src_bracket_rep.c_str(), src_bracket_rep.size()); }; } -TEST_CASE("benchmarking - ::normalize() - automatic semicolon insertion") +TEST_CASE("JS Normalizer, automatic semicolon", "[JSNormalizer]") { auto w_semicolons = make_input("", "a;\n", "", DEPTH); auto wo_semicolons = make_input("", "a \n", "", DEPTH); diff --git a/src/utils/test/streambuf_test.cc b/src/utils/test/streambuf_test.cc index d9285b83a..8dae9d794 100644 --- a/src/utils/test/streambuf_test.cc +++ b/src/utils/test/streambuf_test.cc @@ -132,6 +132,12 @@ using namespace std; CHECK(!memcmp((exp), act, (exp_len))); \ } +#define BYTES_READ(s, b) \ + ((s).clear(), \ + std::max((std::streamsize)(s).tellg(), \ + (std::streamsize)(b).last_chunk_offset()) \ + - (b).last_chunk_offset()) + TEST_CASE("input buffer - basic one source", "[Stream buffers]") { const char* exp = "Early bird gets a corn."; @@ -1349,16 +1355,20 @@ TEST_CASE("input stream - last chunk offset", "[Stream buffers]") CHECK(0 == b.last_chunk_offset()); s.read(act, 1); - CHECK(1 == b.last_chunk_offset()); + CHECK(0 == b.last_chunk_offset()); + CHECK(1 == BYTES_READ(s, b)); s.read(act, 2); - CHECK(3 == b.last_chunk_offset()); + CHECK(0 == b.last_chunk_offset()); + CHECK(3 == BYTES_READ(s, b)); s.read(act, 5); - CHECK(8 == b.last_chunk_offset()); + CHECK(0 == b.last_chunk_offset()); + CHECK(8 == BYTES_READ(s, b)); s.read(act, 1); - CHECK(8 == b.last_chunk_offset()); + CHECK(0 == b.last_chunk_offset()); + CHECK(8 == BYTES_READ(s, b)); } SECTION("two buffers") @@ -1373,22 +1383,28 @@ TEST_CASE("input stream - last chunk offset", "[Stream buffers]") CHECK(0 == b.last_chunk_offset()); b.pubsetbuf(dat1, strlen(dat1))->pubsetbuf(dat2, strlen(dat2)); - CHECK(0 == b.last_chunk_offset()); + CHECK(4 == b.last_chunk_offset()); + CHECK(0 == BYTES_READ(s, b)); s.read(act, 1); - CHECK(0 == b.last_chunk_offset()); + CHECK(4 == b.last_chunk_offset()); + CHECK(0 == BYTES_READ(s, b)); s.read(act, 1); - CHECK(0 == b.last_chunk_offset()); + CHECK(4 == b.last_chunk_offset()); + CHECK(0 == BYTES_READ(s, b)); s.read(act, 4); - CHECK(2 == b.last_chunk_offset()); + CHECK(4 == b.last_chunk_offset()); + CHECK(2 == BYTES_READ(s, b)); s.read(act, 2); CHECK(4 == b.last_chunk_offset()); + CHECK(4 == BYTES_READ(s, b)); s.read(act, 1); CHECK(4 == b.last_chunk_offset()); + CHECK(4 == BYTES_READ(s, b)); } SECTION("three buffers") @@ -1404,22 +1420,28 @@ TEST_CASE("input stream - last chunk offset", "[Stream buffers]") CHECK(0 == b.last_chunk_offset()); b.pubsetbuf(dat1, strlen(dat1))->pubsetbuf(dat2, strlen(dat2))->pubsetbuf(dat3, strlen(dat3)); - CHECK(0 == b.last_chunk_offset()); + CHECK(8 == b.last_chunk_offset()); + CHECK(0 == BYTES_READ(s, b)); s.read(act, 3); - CHECK(0 == b.last_chunk_offset()); + CHECK(8 == b.last_chunk_offset()); + CHECK(0 == BYTES_READ(s, b)); s.read(act, 3); - CHECK(0 == b.last_chunk_offset()); + CHECK(8 == b.last_chunk_offset()); + CHECK(0 == BYTES_READ(s, b)); s.read(act, 3); - CHECK(1 == b.last_chunk_offset()); + CHECK(8 == b.last_chunk_offset()); + CHECK(1 == BYTES_READ(s, b)); s.read(act, 3); - CHECK(4 == b.last_chunk_offset()); + CHECK(8 == b.last_chunk_offset()); + CHECK(4 == BYTES_READ(s, b)); s.read(act, 1); - CHECK(4 == b.last_chunk_offset()); + CHECK(8 == b.last_chunk_offset()); + CHECK(4 == BYTES_READ(s, b)); } }