Pull request #3722: Add benchmark tests for PDF parser.

author Oleksii Shumeiko -X (oshumeik - SOFTSERVE INC at Cisco) <oshumeik@cisco.com>

Tue, 10 Jan 2023 21:34:48 +0000 (21:34 +0000)

committer Oleksii Shumeiko -X (oshumeik - SOFTSERVE INC at Cisco) <oshumeik@cisco.com>

Tue, 10 Jan 2023 21:34:48 +0000 (21:34 +0000)
author Oleksii Shumeiko -X (oshumeik - SOFTSERVE INC at Cisco) <oshumeik@cisco.com>
Tue, 10 Jan 2023 21:34:48 +0000 (21:34 +0000)
committer Oleksii Shumeiko -X (oshumeik - SOFTSERVE INC at Cisco) <oshumeik@cisco.com>
Tue, 10 Jan 2023 21:34:48 +0000 (21:34 +0000)
diff --git a/src/js_norm/js_identifier_ctx.h b/src/js_norm/js_identifier_ctx.h

index 6092d1bea8fc96d1b62c4688ea54d32127a08adb..57d8003ab78619198a577ff7453cfa69eb96fd5e 100644 (file)
--- a/src/js_norm/js_identifier_ctx.h
+++ b/src/js_norm/js_identifier_ctx.h
@@ -51,8 +51,6 @@ public:
      virtual bool scope_pop(JSProgramScopeType) = 0;
  
      virtual void reset() = 0;
-
-    virtual size_t size() const = 0;
  };
  
  class JSIdentifierCtx : public JSIdentifier
@@ -72,12 +70,6 @@ public:
  
      virtual void reset() override;
  
-    // approximated to 500 unique mappings insertions
-    // approximated to 3 program scopes in the list
-    virtual size_t size() const override
-    { return (sizeof(JSIdentifierCtx) + (sizeof(std::string) * 2 * 500) +
-        (sizeof(ProgramScope) * 3)); }
-
  private:
  
      struct NormId
diff --git a/src/js_norm/pdf_tokenizer.l b/src/js_norm/pdf_tokenizer.l

index 6c13a9cdd1755557d53a204e26974b81143d19ad..b20a1ec3dfcb70951e04fed6d0badfcfe455a32c 100644 (file)
--- a/src/js_norm/pdf_tokenizer.l
+++ b/src/js_norm/pdf_tokenizer.l
@@ -85,8 +85,8 @@ COMMENT            %{GRP_NOT_NEWLINE}*{EOL_MARKER}
  OBJ_BOOLEAN        true|false
  
  /* 7.3.3 Numeric Objects */
-OBJ_INT_NUM        [+-]?[0-9]{1,64}
-OBJ_REL_NUM        [+-]?("."?[0-9]{1,64}|[0-9]{1,64}"."?|[0-9]{1,64}"."?[0-9]{1,64})
+OBJ_INT_NUM        [+-]?[0-9]{1,16}
+OBJ_REL_NUM        [+-]?("."?[0-9]{1,16}|[0-9]{1,16}"."?|[0-9]{1,16}"."?[0-9]{1,16})
  
  /* 7.3.4 String Objects */
  OBJ_LIT_STR_OPEN   "("
@@ -99,7 +99,7 @@ LIT_STR_ESC        \\[^0-7]
  LIT_STR_ESC_OCT    \\[0-7]{1}|\\[0-7]{2}|\\[0-7]{3}
  LIT_STR_ESC_EOL    \\[\x0d\x0a]|\\\x0d\x0a
  LIT_STR_EOL        [\x0d\x0a]|\x0d\x0a
-LIT_STR_BODY       [^\\\(\)]{1,64}
+LIT_STR_BODY       [^\\\(\)]{1,16}
  
  /* 7.9.2.2 Text String Type, UTF-16BE */
  /* RFC 2781: 4.3 Interpreting text labelled as UTF-16 */
@@ -109,8 +109,8 @@ LIT_STR_U16_UNESC  \\[(\)\\nrtbf]
  LIT_STR_U16_BODY   [^\\\(\)]{1,16}
  
  /* 7.3.4.3 Hexadecimal Strings */
-HEX_STR_BODY       [0-9A-Fa-f]{1,64}
-HEX_STR_SKIP       [^0-9A-Fa-f>]{1,64}
+HEX_STR_BODY       [0-9A-Fa-f]{1,16}
+HEX_STR_SKIP       [^0-9A-Fa-f>]{1,16}
  
  /* 7.3.5 Name Objects */
  OBJ_NAME           \/{GRP_REGULAR}{1,256}
@@ -128,7 +128,7 @@ OBJ_DICT_SKIP      .|{GRP_NEWLINE}
  /* 7.3.8 Stream Objects */
  OBJ_STREAM_OPEN    stream\r?\n
  OBJ_STREAM_CLOSE   {EOL_MARKER}endstream
-OBJ_STREAM_SKIP    {GRP_NOT_NEWLINE}{1,64}|{GRP_NEWLINE}
+OBJ_STREAM_SKIP    {GRP_NOT_NEWLINE}{1,16}|{GRP_NEWLINE}
  
  /* 7.3.9 Null Object */
  OBJ_NULL           null
@@ -142,8 +142,8 @@ OBJ_REFERENCE        {OBJ_INT_NUM}{GRP_WHITESPACE}+{OBJ_INT_NUM}{GRP_WHITESPACE}
  
  
  /* Not object start, not comments */
-SKIP              [^[:digit:]%]{1,64}|.
-WHITESPACE        {GRP_WHITESPACE}{1,64}
+SKIP              [^[:digit:]%]{1,16}|.
+WHITESPACE        {GRP_WHITESPACE}{1,16}
  
  /* Start conditions: INITIAL or inside dictionary, literal string, hexadecimal string, stream */
  %x indobj
diff --git a/src/js_norm/test/CMakeLists.txt b/src/js_norm/test/CMakeLists.txt

index 260c15a9c42fa59026f6939535854f9699f14f70..66d0ff0b6d1882d91e8338e71128a9b5c161e27c 100644 (file)
--- a/src/js_norm/test/CMakeLists.txt
+++ b/src/js_norm/test/CMakeLists.txt
@@ -20,20 +20,6 @@ add_catch_test( js_normalizer_test
          js_test_utils.cc
  )
  
-if (ENABLE_BENCHMARK_TESTS)
-    add_catch_test( js_norm_benchmark
-        SOURCES
-            ${js_tokenizer_OUTPUTS}
-            ../js_identifier_ctx.cc
-            ../js_normalizer.cc
-            ${CMAKE_SOURCE_DIR}/src/utils/streambuf.cc
-            ${CMAKE_SOURCE_DIR}/src/utils/util_cstring.cc
-            js_test_options.cc
-            js_test_stubs.cc
-            js_test_utils.cc
-    )
-endif(ENABLE_BENCHMARK_TESTS)
-
  add_catch_test( js_dealias_test
      SOURCES
          ${js_tokenizer_OUTPUTS}
@@ -79,3 +65,27 @@ add_catch_test( pdf_tokenizer_test
          ${pdf_tokenizer_OUTPUTS}
          js_test_stubs.cc
  )
+
+if (ENABLE_BENCHMARK_TESTS)  # benchmark targets below are only declared when this option is set
+
+    add_catch_test( js_norm_benchmark  # benchmark target built from the JS tokenizer/normalizer sources
+        SOURCES
+            ${js_tokenizer_OUTPUTS}
+            ../js_identifier_ctx.cc
+            ../js_normalizer.cc
+            ${CMAKE_SOURCE_DIR}/src/utils/streambuf.cc
+            ${CMAKE_SOURCE_DIR}/src/utils/util_cstring.cc
+            js_test_options.cc
+            js_test_stubs.cc
+            js_test_utils.cc
+    )
+
+    add_catch_test( pdf_tokenizer_benchmark  # NOTE(review): pdf_tokenizer_benchmark.cc is not listed; presumably add_catch_test adds <name>.cc itself - confirm
+        SOURCES
+            ${pdf_tokenizer_OUTPUTS}
+            ${CMAKE_SOURCE_DIR}/src/utils/streambuf.cc
+            ${CMAKE_SOURCE_DIR}/src/utils/util_cstring.cc
+            js_test_stubs.cc
+    )
+
+endif(ENABLE_BENCHMARK_TESTS)
diff --git a/src/js_norm/test/js_test_utils.h b/src/js_norm/test/js_test_utils.h

index 40be89ca04e2452020ad1353c6ed4449eeb6995e..1773e96097841b76b4a18c504182b9ad96df7bee 100644 (file)
--- a/src/js_norm/test/js_test_utils.h
+++ b/src/js_norm/test/js_test_utils.h
@@ -51,7 +51,6 @@ public:
      bool scope_push(JSProgramScopeType) override { return true; }
      bool scope_pop(JSProgramScopeType) override { return true; }
      void reset() override {}
-    size_t size() const override { return 0; }
  };
  
  class JSTestConfig;
diff --git a/src/js_norm/test/pdf_tokenizer_benchmark.cc b/src/js_norm/test/pdf_tokenizer_benchmark.cc

new file mode 100644 (file)

index 0000000..ab447eb
--- /dev/null
+++ b/src/js_norm/test/pdf_tokenizer_benchmark.cc
@@ -0,0 +1,208 @@
+//--------------------------------------------------------------------------
+// Copyright (C) 2022-2022 Cisco and/or its affiliates. All rights reserved.
+//
+// This program is free software; you can redistribute it and/or modify it
+// under the terms of the GNU General Public License Version 2 as published
+// by the Free Software Foundation.  You may not use, modify or distribute
+// this program under any other version of the GNU General Public License.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License along
+// with this program; if not, write to the Free Software Foundation, Inc.,
+// 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+//--------------------------------------------------------------------------
+// pdf_tokenizer_benchmark.cc author Cisco
+
+#ifdef BENCHMARK_TEST
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <cstring>
+#include <string>
+
+#include <FlexLexer.h>
+
+#include "catch/catch.hpp"
+#include "js_norm/pdf_tokenizer.h"
+#include "utils/streambuf.h"
+
+using namespace jsn;
+using namespace snort;
+using namespace std;
+
+static const string make_input(const char* begin, const char* mid, const char* end, size_t len)  // Returns begin + (mid repeated) + end, sized to approach len bytes.
+{
+    string str(begin);  // result starts with the prefix
+    int fill = (len - strlen(begin) - strlen(end)) / strlen(mid);  // whole copies of mid that fit between prefix and suffix
+
+    for (int i = 0; i < fill; ++i)  // NOTE(review): fill comes from unsigned math; it wraps if len < strlen(begin) + strlen(end)
+        str.append(mid);  // NOTE(review): an empty mid would divide by zero above; no caller in this file passes one
+    str.append(end);  // suffix is appended exactly once
+
+    return str;
+}
+
+static const string make_input_repeat(const char* pattern, int cnt)  // Returns pattern concatenated cnt times.
+{
+    string str;  // stays empty when cnt is 0
+
+    while (cnt--)  // NOTE(review): a negative cnt never reaches 0 by decrement; the caller in this file passes 2048
+        str.append(pattern);
+
+    return str;
+}
+
+#define set_input(input)  /* points buf_in at input's bytes and calls buf_out.reserve() with input's size */  \
+    buf_in.pubsetbuf(nullptr, 0)->pubsetbuf((char*)(input).c_str(), (input).size()); /* NOTE(review): C-style cast drops const from c_str(); the (nullptr, 0) call presumably clears the previous buffer - confirm */ \
+    buf_out.reserve((input).size())  /* NOTE(review): expands to two statements and relies on buf_in / buf_out existing at the call site */  \
+
+#define rewind()  /* seeks both stream buffers back to offset 0; NOTE(review): shares its name with C stdio rewind() */  \
+    buf_in.pubseekoff(0, ios_base::beg, ios_base::in);   /* input side: back to the first byte */  \
+    buf_out.pubseekoff(0, ios_base::beg, ios_base::out)  /* output side: back to the start; NOTE(review): two statements, not wrapped in do { } while (0) */  \
+
+TEST_CASE("PDF Tokenizer, literals by 8 K", "[PDFTokenizer]")  // Benchmarks parser.process() on inputs of about 8 KiB, one kind of content per input.
+{
+    constexpr size_t size = 1 << 13;  // 8192, the target length handed to make_input()
+
+    const auto data_cpy = make_input("",           " ", "",            size);  // `size` spaces, source for the memcpy baseline
+    const auto data_wsp = make_input("1 1 obj\n",  " ", "   \nendobj\n", size);  // object body filled with spaces
+    const auto data_com = make_input("1 2 obj\n%", "c", "\n \nendobj\n", size);  // '%' followed by a long run of 'c'
+    const auto data_str = make_input("1 3 obj\n(", "s", ")  \nendobj\n", size);  // parenthesised run of 's'
+    const auto data_hex = make_input("1 4 obj\n<", "0", ">  \nendobj\n", size);  // angle-bracketed run of '0'
+    const auto data_stm = make_input("1 5 obj\n<</Length 8192>>stream\n", ".",  // stream body filled with '.'
+        "\nendstream\nendobj\n", size);  // NOTE(review): /Length says 8192 but the '.' payload is size minus the wrapper text
+
+    char dst[size + 128];  // memcpy destination, 128 bytes larger than size
+    istreambuf_glue buf_in;  // input buffer, re-pointed by set_input()
+    ostreambuf_infl buf_out;  // output buffer the parser writes through `out`
+    istream in(&buf_in);
+    ostream out(&buf_out);
+    PDFTokenizer parser(in, out);  // one parser instance is reused by every benchmark below
+
+    BENCHMARK("memcpy()")  // baseline: plain copy of data_cpy
+    {
+        return memcpy(dst, data_cpy.c_str(), data_cpy.size());
+    };
+
+    set_input(data_wsp);  // done once, outside the measured body
+    BENCHMARK("whitespace")
+    {
+        rewind();  // seek both buffers to offset 0 so each run starts from the beginning
+        return parser.process();
+    };
+
+    set_input(data_com);
+    BENCHMARK("comment")
+    {
+        rewind();
+        return parser.process();
+    };
+
+    set_input(data_str);
+    BENCHMARK("literal string")
+    {
+        rewind();
+        return parser.process();
+    };
+
+    set_input(data_hex);
+    BENCHMARK("hexadecimal string")
+    {
+        rewind();
+        return parser.process();
+    };
+
+    set_input(data_stm);
+    BENCHMARK("stream")
+    {
+        rewind();
+        return parser.process();
+    };
+}
+
+TEST_CASE("PDF Parser, literals by 64 K", "[PDFTokenizer]")  // Same benchmarks on inputs of about 64 KiB. NOTE(review): titled "PDF Parser" while the other cases say "PDF Tokenizer"
+{
+    constexpr size_t size = 1 << 16;  // 65536, the target length handed to make_input()
+
+    const auto data_cpy = make_input("",           " ", "",            size);  // `size` spaces, source for the memcpy baseline
+    const auto data_wsp = make_input("1 1 obj\n",  " ", "   \nendobj\n", size);  // object body filled with spaces
+    const auto data_com = make_input("1 2 obj\n%", "c", "\n \nendobj\n", size);  // '%' followed by a long run of 'c'
+    const auto data_str = make_input("1 3 obj\n(", "s", ")  \nendobj\n", size);  // parenthesised run of 's'
+    const auto data_hex = make_input("1 4 obj\n<", "0", ">  \nendobj\n", size);  // angle-bracketed run of '0'
+    const auto data_stm = make_input("1 5 obj\n<</Length 65536>>stream\n", ".",  // stream body filled with '.'
+        "\nendstream\nendobj\n", size);  // NOTE(review): /Length says 65536 but the '.' payload is size minus the wrapper text
+
+    char dst[size + 128];  // memcpy destination, 128 bytes larger than size
+    istreambuf_glue buf_in;  // input buffer, re-pointed by set_input()
+    ostreambuf_infl buf_out;  // output buffer the parser writes through `out`
+    istream in(&buf_in);
+    ostream out(&buf_out);
+    PDFTokenizer parser(in, out);  // one parser instance is reused by every benchmark below
+
+    BENCHMARK("memcpy()")  // baseline: plain copy of data_cpy
+    {
+        return memcpy(dst, data_cpy.c_str(), data_cpy.size());
+    };
+
+    set_input(data_wsp);  // done once, outside the measured body
+    BENCHMARK("whitespace")
+    {
+        rewind();  // seek both buffers to offset 0 so each run starts from the beginning
+        return parser.process();
+    };
+
+    set_input(data_com);
+    BENCHMARK("comment")
+    {
+        rewind();
+        return parser.process();
+    };
+
+    set_input(data_str);
+    BENCHMARK("literal string")
+    {
+        rewind();
+        return parser.process();
+    };
+
+    set_input(data_hex);
+    BENCHMARK("hexadecimal string")
+    {
+        rewind();
+        return parser.process();
+    };
+
+    set_input(data_stm);
+    BENCHMARK("stream")
+    {
+        rewind();
+        return parser.process();
+    };
+}
+
+TEST_CASE("PDF Tokenizer, indirect objects", "[PDFTokenizer]")  // Benchmarks parser.process() on many small, identical indirect objects.
+{
+    // the pattern below is 30 bytes long: 30 * 2048 = 61440 bytes of input
+    const auto data = make_input_repeat("1 0 obj\n % any object\n endobj\n", 2048);
+
+    istreambuf_glue buf_in;  // input buffer, pointed at `data` by set_input()
+    ostreambuf_infl buf_out;  // output buffer the parser writes through `out`
+    istream in(&buf_in);
+    ostream out(&buf_out);
+    PDFTokenizer parser(in, out);
+
+    set_input(data);  // done once, outside the measured body
+    BENCHMARK("same object repeated")
+    {
+        rewind();  // seek both buffers to offset 0 so each run starts from the beginning
+        return parser.process();
+    };
+}
+
+#endif
author	Oleksii Shumeiko -X (oshumeik - SOFTSERVE INC at Cisco) <oshumeik@cisco.com>
	Tue, 10 Jan 2023 21:34:48 +0000 (21:34 +0000)
committer	Oleksii Shumeiko -X (oshumeik - SOFTSERVE INC at Cisco) <oshumeik@cisco.com>
	Tue, 10 Jan 2023 21:34:48 +0000 (21:34 +0000)
src/js_norm/js_identifier_ctx.h		patch \| blob \| blame \| history
src/js_norm/pdf_tokenizer.l		patch \| blob \| blame \| history
src/js_norm/test/CMakeLists.txt		patch \| blob \| blame \| history
src/js_norm/test/js_test_utils.h		patch \| blob \| blame \| history
src/js_norm/test/pdf_tokenizer_benchmark.cc	[new file with mode: 0644]	patch \| blob