Pull request #3174: Switch FlexLexer to batch mode.

author Mike Stepanek (mstepane) <mstepane@cisco.com>

Tue, 23 Nov 2021 19:10:01 +0000 (19:10 +0000)

committer Mike Stepanek (mstepane) <mstepane@cisco.com>

Tue, 23 Nov 2021 19:10:01 +0000 (19:10 +0000)
author Mike Stepanek (mstepane) <mstepane@cisco.com>
Tue, 23 Nov 2021 19:10:01 +0000 (19:10 +0000)
committer Mike Stepanek (mstepane) <mstepane@cisco.com>
Tue, 23 Nov 2021 19:10:01 +0000 (19:10 +0000)
diff --git a/src/utils/CMakeLists.txt b/src/utils/CMakeLists.txt

index 8910de6b59de1b769b8430f2253e67b8d3f92bd6..4b3b61346496be922874b2bc31855f84c0ee109a 100644 (file)
--- a/src/utils/CMakeLists.txt
+++ b/src/utils/CMakeLists.txt
@@ -21,7 +21,7 @@ set( UTIL_INCLUDES
  
  FLEX_TARGET ( js_tokenizer ${CMAKE_CURRENT_SOURCE_DIR}/js_tokenizer.l
      ${CMAKE_CURRENT_BINARY_DIR}/js_tokenizer.cc
-    COMPILE_FLAGS -Ca
+    COMPILE_FLAGS "-Caf -8"
  )
  
  add_library ( utils OBJECT
diff --git a/src/utils/js_normalizer.cc b/src/utils/js_normalizer.cc

index 781092fcfcd60594c98f01fb865ee505aa074553..639ee7b1961f9cb8b274a7e00c13b7dd60c412a2 100644 (file)
--- a/src/utils/js_normalizer.cc
+++ b/src/utils/js_normalizer.cc
@@ -51,6 +51,14 @@ JSNormalizer::~JSNormalizer()
  
  JSTokenizer::JSRet JSNormalizer::normalize(const char* src, size_t src_len)
  {
+    assert(src);
+
+    if (src_len == 0)
+    {
+        src_next = src;
+        return JSTokenizer::SCRIPT_CONTINUE;
+    }
+
      if (rem_bytes == 0 && !unlim)
      {
          debug_log(5, http_trace, TRACE_JS_PROC, nullptr,
@@ -71,12 +79,16 @@ JSTokenizer::JSRet JSNormalizer::normalize(const char* src, size_t src_len)
          ->pubsetbuf(const_cast<char*>(src), len);
      out_buf.reserve(src_len * BUFF_EXP_FACTOR);
  
-    tokenizer.pre_yylex();
+    size_t t_bytes = in_buf.last_chunk_offset();
+    tokenizer.pre_yylex(t_bytes != 0);
+
      JSTokenizer::JSRet ret = static_cast<JSTokenizer::JSRet>(tokenizer.yylex());
      in.clear();
      out.clear();
  
-    size_t r_bytes = in_buf.last_chunk_offset();
+    size_t r_bytes = tokenizer.get_bytes_read();
+    r_bytes = max(r_bytes, t_bytes) - t_bytes;
+
      if (!unlim)
          rem_bytes -= r_bytes;
      src_next = src + r_bytes;
diff --git a/src/utils/js_tokenizer.h b/src/utils/js_tokenizer.h

index f7e6bc754aa71b723902471e9d280eab9e13d29b..fbfb22173b3e159e82f92961eb92772683d60b86 100644 (file)
--- a/src/utils/js_tokenizer.h
+++ b/src/utils/js_tokenizer.h
@@ -117,11 +117,14 @@ public:
      ~JSTokenizer() override;
  
      // internal actions before calling main loop
-    void pre_yylex();
+    void pre_yylex(bool adjust_output = false);
  
      // returns JSRet
      int yylex() override;
  
+    size_t get_bytes_read()  // read-and-reset: returns the counter and zeroes it, so a second call yields 0
+    { auto r = bytes_read; bytes_read = 0; return r; }
+
  protected:
      [[noreturn]] void LexerError(const char* msg) override
      { snort::FatalError("%s", msg); }
@@ -164,6 +167,8 @@ private:
      JSToken token = UNDEFINED;
      ASIGroup previous_group = ASI_OTHER;
      JSIdentifierCtxBase& ident_ctx;
+    size_t bytes_read;
+    size_t tmp_bytes_read;
  
      struct
      {
diff --git a/src/utils/js_tokenizer.l b/src/utils/js_tokenizer.l

index 9ae385393d69fb4febb81be55975a9f167956667..649f78abdc3191ac09d453deda5c2822260e19d2 100644 (file)
--- a/src/utils/js_tokenizer.l
+++ b/src/utils/js_tokenizer.l
@@ -25,6 +25,10 @@
  /* Generate C++ scanner */
  %option c++
  
+%option batch
+
+%option never-interactive
+
  %{
  
  #ifdef HAVE_CONFIG_H
@@ -1174,6 +1178,7 @@ JSTokenizer::JSTokenizer(std::istream& in, std::ostream& out,
      : yyFlexLexer(in, out),
        max_template_nesting(max_template_nesting),
        ident_ctx(mapper),
+      bytes_read(0),
        tmp_buf(buf),
        tmp_buf_size(buf_size),
        tmp_cap_size(cap_size),
@@ -1192,10 +1197,14 @@ JSTokenizer::~JSTokenizer()
      tmp_buf_size = 0;
  }
  
-void JSTokenizer::pre_yylex()
+void JSTokenizer::pre_yylex(bool adjust_output)
  {
      assert(output_steps_back >= 0);
-    yyout.seekp(-output_steps_back, std::ios_base::cur);
+
+    if (adjust_output)  // step the output position back only when the caller requests it
+        yyout.seekp(-output_steps_back, std::ios_base::cur);
+
+    yy_flush_buffer(YY_CURRENT_BUFFER);  // drop the scanner's buffered input before the next yylex() run
  }
  
  void JSTokenizer::switch_to_temporal(const std::string& data)
@@ -1204,6 +1213,8 @@ void JSTokenizer::switch_to_temporal(const std::string& data)
      cur_buffer = YY_CURRENT_BUFFER;
      tmp_buffer = yy_create_buffer(tmp, data.size());
      yy_switch_to_buffer((YY_BUFFER_STATE)tmp_buffer);
+
+    tmp_bytes_read = bytes_read;
  }
  
  void JSTokenizer::switch_to_initial()
@@ -1211,6 +1222,8 @@ void JSTokenizer::switch_to_initial()
      yy_switch_to_buffer((YY_BUFFER_STATE)cur_buffer);
      yy_delete_buffer((YY_BUFFER_STATE)tmp_buffer);
      tmp_buffer = nullptr;
+
+    bytes_read = tmp_bytes_read;
  }
  
  // A return value of this method uses to terminate the scanner
@@ -1404,7 +1417,10 @@ void JSTokenizer::states_reset()
  
  void JSTokenizer::states_push()
  {
-    assert(yyleng != 0);
+    if (!yyleng)
+        return;
+
+    bytes_read += yyleng;
  
      sp++;
      sp %= JSTOKENIZER_MAX_STATES;
@@ -1418,8 +1434,11 @@ void JSTokenizer::states_push()
  
  void JSTokenizer::states_correct(int take_off)
  {
+    auto delta = yyleng - take_off;  // length of the current match beyond take_off
      auto& state = states[sp];
-    state.orig_len -= yyleng - take_off;
+
+    bytes_read -= delta;  // same correction applied to the consumed-byte counter
+    state.orig_len -= delta;
  }
  
  void JSTokenizer::states_over()
diff --git a/src/utils/streambuf.cc b/src/utils/streambuf.cc

index ff46f9653871d2278e4ae8297b577e589b36031b..66ac96868ea5725683a146a66dc828ca6268e275 100644 (file)
--- a/src/utils/streambuf.cc
+++ b/src/utils/streambuf.cc
@@ -42,9 +42,10 @@ istreambuf_glue::istreambuf_glue() :
  
  streamsize istreambuf_glue::last_chunk_offset() const
  {
-    auto c = gptr();
-    auto b = eback();
-    return last_chunk() ? c - b : 0;
+    if (chunks.empty())  // no chunk registered yet: report 0
+        return 0;
+
+    return get<2>(chunks.back());  // third field of the newest chunk; presumably its start offset in the stream - TODO confirm
  }
  
  streambuf* istreambuf_glue::setbuf(char* s, streamsize n)
diff --git a/src/utils/test/js_normalizer_test.cc b/src/utils/test/js_normalizer_test.cc

index ea46db8767c65a47e28a8928f5a3364a2427b7ce..77a606f2c432ed9f6324e3995039fcb4ef06fe43 100644 (file)
--- a/src/utils/test/js_normalizer_test.cc
+++ b/src/utils/test/js_normalizer_test.cc
@@ -3358,75 +3358,87 @@ static JSTokenizer::JSRet norm_ret(JSNormalizer& normalizer, const std::string&
      return normalizer.normalize(input.c_str(), input.size());
  }
  
-TEST_CASE("benchmarking - ::normalize() - literals", "[JSNormalizer]")
+TEST_CASE("JS Normalizer, literals by 8 K", "[JSNormalizer]")
  {
      JSIdentifierCtxTest ident_ctx;
      JSNormalizer normalizer(ident_ctx, UNLIM_DEPTH, MAX_TEMPLATE_NESTING, MAX_SCOPE_DEPTH);
      char dst[DEPTH];
-    auto whitespace = make_input("", " ", "", DEPTH);
-    auto block_comment = make_input("/*", " ", "*/", DEPTH);
-    auto double_quote = make_input("\"", " ", "\"", DEPTH);
  
-    BENCHMARK("memcpy - whitespaces - 65535 bytes")
+    constexpr size_t size = 1 << 13;
+
+    auto data_pl = make_input("", ".", "", size);
+    auto data_ws = make_input("", " ", "", size);
+    auto data_bc = make_input("/*", " ", "*/", size);
+    auto data_dq = make_input("\"", " ", "\"", size);
+
+    BENCHMARK("memcpy()")
      {
-        return memcpy(dst, whitespace.c_str(), whitespace.size());
+        return memcpy(dst, data_pl.c_str(), data_pl.size());
      };
  
-    REQUIRE(norm_ret(normalizer, whitespace) == JSTokenizer::SCRIPT_ENDED);
-    BENCHMARK("whitespaces - 65535 bytes")
+    REQUIRE(norm_ret(normalizer, data_ws) == JSTokenizer::SCRIPT_ENDED);
+    BENCHMARK("whitespaces")
      {
          normalizer.rewind_output();
-        return normalizer.normalize(whitespace.c_str(), whitespace.size());
+        return normalizer.normalize(data_ws.c_str(), data_ws.size());
      };
  
-    REQUIRE(norm_ret(normalizer, block_comment) == JSTokenizer::SCRIPT_ENDED);
-    BENCHMARK("block comment - 65535 bytes")
+    REQUIRE(norm_ret(normalizer, data_bc) == JSTokenizer::SCRIPT_ENDED);
+    BENCHMARK("block comment")
      {
          normalizer.rewind_output();
-        return normalizer.normalize(block_comment.c_str(), block_comment.size());
+        return normalizer.normalize(data_bc.c_str(), data_bc.size());
      };
  
-    REQUIRE(norm_ret(normalizer, double_quote) == JSTokenizer::SCRIPT_ENDED);
-    BENCHMARK("double quotes string - 65535 bytes")
+    REQUIRE(norm_ret(normalizer, data_dq) == JSTokenizer::SCRIPT_ENDED);
+    BENCHMARK("double quotes string")
      {
          normalizer.rewind_output();
-        return normalizer.normalize(double_quote.c_str(), double_quote.size());
+        return normalizer.normalize(data_dq.c_str(), data_dq.size());
      };
+}
  
-    constexpr size_t depth_8k = 8192;
+TEST_CASE("JS Normalizer, literals by 64 K", "[JSNormalizer]")
+{
+    JSIdentifierCtxTest ident_ctx;
+    JSNormalizer normalizer(ident_ctx, UNLIM_DEPTH, MAX_TEMPLATE_NESTING, MAX_SCOPE_DEPTH);
+    constexpr size_t size = 1 << 16;
+    // dst is sized from `size`: DEPTH looks to be 65535 (old benchmark labels), one byte short of 64 KiB
+    char dst[size];
  
-    auto whitespace_8k = make_input("", " ", "", depth_8k);
-    auto block_comment_8k = make_input("/*", " ", "*/", depth_8k);
-    auto double_quote_8k = make_input("\"", " ", "\"", depth_8k);
+    auto data_pl = make_input("", ".", "", size);
+    auto data_ws = make_input("", " ", "", size);
+    auto data_bc = make_input("/*", " ", "*/", size);
+    auto data_dq = make_input("\"", " ", "\"", size);
  
-    BENCHMARK("memcpy - whitespaces - 8192 bytes")
+    BENCHMARK("memcpy()")
      {
-        return memcpy(dst, whitespace_8k.c_str(), whitespace_8k.size());
+        return memcpy(dst, data_pl.c_str(), data_pl.size());
      };
  
-    REQUIRE(norm_ret(normalizer, whitespace_8k) == JSTokenizer::SCRIPT_ENDED);
-    BENCHMARK("whitespaces - 8192 bytes")
+    REQUIRE(norm_ret(normalizer, data_ws) == JSTokenizer::SCRIPT_ENDED);
+    BENCHMARK("whitespaces")
      {
          normalizer.rewind_output();
-        return normalizer.normalize(whitespace_8k.c_str(), whitespace_8k.size());
+        return normalizer.normalize(data_ws.c_str(), data_ws.size());
      };
  
-    REQUIRE(norm_ret(normalizer, block_comment_8k) == JSTokenizer::SCRIPT_ENDED);
-    BENCHMARK("block comment - 8192 bytes")
+    REQUIRE(norm_ret(normalizer, data_bc) == JSTokenizer::SCRIPT_ENDED);
+    BENCHMARK("block comment")
      {
          normalizer.rewind_output();
-        return normalizer.normalize(block_comment_8k.c_str(), block_comment_8k.size());
+        return normalizer.normalize(data_bc.c_str(), data_bc.size());
      };
  
-    REQUIRE(norm_ret(normalizer, double_quote_8k) == JSTokenizer::SCRIPT_ENDED);
-    BENCHMARK("double quotes string - 8192 bytes")
+    REQUIRE(norm_ret(normalizer, data_dq) == JSTokenizer::SCRIPT_ENDED);
+    BENCHMARK("double quotes string")
      {
          normalizer.rewind_output();
-        return normalizer.normalize(double_quote_8k.c_str(), double_quote_8k.size());
+        return normalizer.normalize(data_dq.c_str(), data_dq.size());
      };
  }
  
-TEST_CASE("benchmarking - ::normalize() - identifiers", "[JSNormalizer]")
+TEST_CASE("JS Normalizer, id normalization", "[JSNormalizer]")
  {
      // around 11 000 identifiers
      std::string input;
@@ -3471,53 +3483,47 @@ TEST_CASE("benchmarking - ::normalize() - identifiers", "[JSNormalizer]")
      };
  }
  
-TEST_CASE("benchmarking - ::normalize() - scope", "[JSNormalizer]")
+TEST_CASE("JS Normalizer, scope tracking", "[JSNormalizer]")
  {
      constexpr uint32_t depth = 65535;
      JSIdentifierCtxTest ident_ctx;
      JSNormalizer normalizer(ident_ctx, UNLIM_DEPTH, MAX_TEMPLATE_NESTING, depth);
-    char dst[depth];
  
      auto src_ws = make_input("", " ", "", depth);
      auto src_brace_rep = make_input_repeat("{}", depth);
      auto src_paren_rep = make_input_repeat("()", depth);
      auto src_bracket_rep = make_input_repeat("[]", depth);
  
-    BENCHMARK("memcpy - ...{}{}{}... - 65535 bytes")
-    {
-        return memcpy(dst, src_brace_rep.c_str(), src_brace_rep.size());
-    };
-
      REQUIRE(norm_ret(normalizer, src_ws) == JSTokenizer::SCRIPT_ENDED);
-    BENCHMARK("whitespaces - 65535 bytes")
+    BENCHMARK("whitespaces")
      {
          normalizer.rewind_output();
          return normalizer.normalize(src_ws.c_str(), src_ws.size());
      };
  
      REQUIRE(norm_ret(normalizer, src_brace_rep) == JSTokenizer::SCRIPT_ENDED);
-    BENCHMARK("...{}{}{}... - 65535 bytes")
+    BENCHMARK("...{}{}{}...")
      {
          normalizer.rewind_output();
          return normalizer.normalize(src_brace_rep.c_str(), src_brace_rep.size());
      };
  
      REQUIRE(norm_ret(normalizer, src_paren_rep) == JSTokenizer::SCRIPT_ENDED);
-    BENCHMARK("...()()()... - 65535 bytes")
+    BENCHMARK("...()()()...")
      {
          normalizer.rewind_output();
          return normalizer.normalize(src_paren_rep.c_str(), src_paren_rep.size());
      };
  
      REQUIRE(norm_ret(normalizer, src_bracket_rep) == JSTokenizer::SCRIPT_ENDED);
-    BENCHMARK("...[][][]... - 65535 bytes")
+    BENCHMARK("...[][][]...")
      {
          normalizer.rewind_output();
          return normalizer.normalize(src_bracket_rep.c_str(), src_bracket_rep.size());
      };
  }
  
-TEST_CASE("benchmarking - ::normalize() - automatic semicolon insertion")
+TEST_CASE("JS Normalizer, automatic semicolon", "[JSNormalizer]")
  {
      auto w_semicolons = make_input("", "a;\n", "", DEPTH);
      auto wo_semicolons = make_input("", "a \n", "", DEPTH);
diff --git a/src/utils/test/streambuf_test.cc b/src/utils/test/streambuf_test.cc

index d9285b83ad77d1fa4c2c51437a035334e1b6387e..8dae9d794ebf31538fac34bc75975c6065108c6e 100644 (file)
--- a/src/utils/test/streambuf_test.cc
+++ b/src/utils/test/streambuf_test.cc
@@ -132,6 +132,12 @@ using namespace std;
          CHECK(!memcmp((exp), act, (exp_len)));                          \
      }
  
+#define BYTES_READ(s, b)                                           \
+    ((s).clear(),                                                  \
+     std::max((std::streamsize)(s).tellg(),                        \
+              (std::streamsize)(b).last_chunk_offset())            \
+     - (b).last_chunk_offset())
+
  TEST_CASE("input buffer - basic one source", "[Stream buffers]")
  {
      const char* exp = "Early bird gets a corn.";
@@ -1349,16 +1355,20 @@ TEST_CASE("input stream - last chunk offset", "[Stream buffers]")
          CHECK(0 == b.last_chunk_offset());
  
          s.read(act, 1);
-        CHECK(1 == b.last_chunk_offset());
+        CHECK(0 == b.last_chunk_offset());
+        CHECK(1 == BYTES_READ(s, b));
  
          s.read(act, 2);
-        CHECK(3 == b.last_chunk_offset());
+        CHECK(0 == b.last_chunk_offset());
+        CHECK(3 == BYTES_READ(s, b));
  
          s.read(act, 5);
-        CHECK(8 == b.last_chunk_offset());
+        CHECK(0 == b.last_chunk_offset());
+        CHECK(8 == BYTES_READ(s, b));
  
          s.read(act, 1);
-        CHECK(8 == b.last_chunk_offset());
+        CHECK(0 == b.last_chunk_offset());
+        CHECK(8 == BYTES_READ(s, b));
      }
  
      SECTION("two buffers")
@@ -1373,22 +1383,28 @@ TEST_CASE("input stream - last chunk offset", "[Stream buffers]")
          CHECK(0 == b.last_chunk_offset());
  
          b.pubsetbuf(dat1, strlen(dat1))->pubsetbuf(dat2, strlen(dat2));
-        CHECK(0 == b.last_chunk_offset());
+        CHECK(4 == b.last_chunk_offset());
+        CHECK(0 == BYTES_READ(s, b));
  
          s.read(act, 1);
-        CHECK(0 == b.last_chunk_offset());
+        CHECK(4 == b.last_chunk_offset());
+        CHECK(0 == BYTES_READ(s, b));
  
          s.read(act, 1);
-        CHECK(0 == b.last_chunk_offset());
+        CHECK(4 == b.last_chunk_offset());
+        CHECK(0 == BYTES_READ(s, b));
  
          s.read(act, 4);
-        CHECK(2 == b.last_chunk_offset());
+        CHECK(4 == b.last_chunk_offset());
+        CHECK(2 == BYTES_READ(s, b));
  
          s.read(act, 2);
          CHECK(4 == b.last_chunk_offset());
+        CHECK(4 == BYTES_READ(s, b));
  
          s.read(act, 1);
          CHECK(4 == b.last_chunk_offset());
+        CHECK(4 == BYTES_READ(s, b));
      }
  
      SECTION("three buffers")
@@ -1404,22 +1420,28 @@ TEST_CASE("input stream - last chunk offset", "[Stream buffers]")
          CHECK(0 == b.last_chunk_offset());
  
          b.pubsetbuf(dat1, strlen(dat1))->pubsetbuf(dat2, strlen(dat2))->pubsetbuf(dat3, strlen(dat3));
-        CHECK(0 == b.last_chunk_offset());
+        CHECK(8 == b.last_chunk_offset());
+        CHECK(0 == BYTES_READ(s, b));
  
          s.read(act, 3);
-        CHECK(0 == b.last_chunk_offset());
+        CHECK(8 == b.last_chunk_offset());
+        CHECK(0 == BYTES_READ(s, b));
  
          s.read(act, 3);
-        CHECK(0 == b.last_chunk_offset());
+        CHECK(8 == b.last_chunk_offset());
+        CHECK(0 == BYTES_READ(s, b));
  
          s.read(act, 3);
-        CHECK(1 == b.last_chunk_offset());
+        CHECK(8 == b.last_chunk_offset());
+        CHECK(1 == BYTES_READ(s, b));
  
          s.read(act, 3);
-        CHECK(4 == b.last_chunk_offset());
+        CHECK(8 == b.last_chunk_offset());
+        CHECK(4 == BYTES_READ(s, b));
  
          s.read(act, 1);
-        CHECK(4 == b.last_chunk_offset());
+        CHECK(8 == b.last_chunk_offset());
+        CHECK(4 == BYTES_READ(s, b));
      }
  }
author	Mike Stepanek (mstepane) <mstepane@cisco.com>
	Tue, 23 Nov 2021 19:10:01 +0000 (19:10 +0000)
committer	Mike Stepanek (mstepane) <mstepane@cisco.com>
	Tue, 23 Nov 2021 19:10:01 +0000 (19:10 +0000)
src/utils/CMakeLists.txt		patch \| blob \| blame \| history
src/utils/js_normalizer.cc		patch \| blob \| blame \| history
src/utils/js_tokenizer.h		patch \| blob \| blame \| history
src/utils/js_tokenizer.l		patch \| blob \| blame \| history
src/utils/streambuf.cc		patch \| blob \| blame \| history
src/utils/test/js_normalizer_test.cc		patch \| blob \| blame \| history
src/utils/test/streambuf_test.cc		patch \| blob \| blame \| history