From: Oleksii Shumeiko -X (oshumeik - SOFTSERVE INC at Cisco)
Date: Tue, 10 Jan 2023 21:34:48 +0000 (+0000)
Subject: Pull request #3722: Add benchmark tests for PDF parser.
X-Git-Tag: 3.1.51.0~2
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=3531aa40391beb9e4fa0315906a34fcd5d1d112c;p=thirdparty%2Fsnort3.git

Pull request #3722: Add benchmark tests for PDF parser.

Merge in SNORT/snort3 from ~OSHUMEIK/snort3:jsn_pdf_bench to master

Squashed commit of the following:

commit 53ece926c098ed146e9e8e284c506767dabf2c64
Author: Oleksii Shumeiko
Date:   Thu Dec 22 16:02:17 2022 +0200

    js_norm: delete unused method

commit f0c0270b07fa72676a91382cea44ea69baaf5d17
Author: Oleksii Shumeiko
Date:   Wed Dec 21 10:49:36 2022 +0200

    js_norm: tune PDF parser performance

    Decrease data chunk size.

commit 97a247bc3236a27a8a91c9b6067214c5fb9333c3
Author: Oleksii Shumeiko
Date:   Thu Dec 15 15:03:01 2022 +0200

    js_norm: add benchmark tests for PDF parser
---
diff --git a/src/js_norm/js_identifier_ctx.h b/src/js_norm/js_identifier_ctx.h
index 6092d1bea..57d8003ab 100644
--- a/src/js_norm/js_identifier_ctx.h
+++ b/src/js_norm/js_identifier_ctx.h
@@ -51,8 +51,6 @@ public:
     virtual bool scope_pop(JSProgramScopeType) = 0;
 
     virtual void reset() = 0;
-
-    virtual size_t size() const = 0;
 };
 
 class JSIdentifierCtx : public JSIdentifier
@@ -72,12 +70,6 @@ public:
 
     virtual void reset() override;
 
-    // approximated to 500 unique mappings insertions
-    // approximated to 3 program scopes in the list
-    virtual size_t size() const override
-    { return (sizeof(JSIdentifierCtx) + (sizeof(std::string) * 2 * 500) +
-        (sizeof(ProgramScope) * 3)); }
-
 private:
 
     struct NormId
diff --git a/src/js_norm/pdf_tokenizer.l b/src/js_norm/pdf_tokenizer.l
index 6c13a9cdd..b20a1ec3d 100644
--- a/src/js_norm/pdf_tokenizer.l
+++ b/src/js_norm/pdf_tokenizer.l
@@ -85,8 +85,8 @@ COMMENT %{GRP_NOT_NEWLINE}*{EOL_MARKER}
 OBJ_BOOLEAN true|false
 
 /* 7.3.3 Numeric Objects */
-OBJ_INT_NUM [+-]?[0-9]{1,64}
-OBJ_REL_NUM [+-]?("."?[0-9]{1,64}|[0-9]{1,64}"."?|[0-9]{1,64}"."?[0-9]{1,64})
+OBJ_INT_NUM [+-]?[0-9]{1,16}
+OBJ_REL_NUM [+-]?("."?[0-9]{1,16}|[0-9]{1,16}"."?|[0-9]{1,16}"."?[0-9]{1,16})
 
 /* 7.3.4 String Objects */
 OBJ_LIT_STR_OPEN "("
@@ -99,7 +99,7 @@ LIT_STR_ESC \\[^0-7]
 LIT_STR_ESC_OCT \\[0-7]{1}|\\[0-7]{2}|\\[0-7]{3}
 LIT_STR_ESC_EOL \\[\x0d\x0a]|\\\x0d\x0a
 LIT_STR_EOL [\x0d\x0a]|\x0d\x0a
-LIT_STR_BODY [^\\\(\)]{1,64}
+LIT_STR_BODY [^\\\(\)]{1,16}
 
 /* 7.9.2.2 Text String Type, UTF-16BE */
 /* RFC 2781: 4.3 Interpreting text labelled as UTF-16 */
@@ -109,8 +109,8 @@ LIT_STR_U16_UNESC \\[(\)\\nrtbf]
 LIT_STR_U16_BODY [^\\\(\)]{1,16}
 
 /* 7.3.4.3 Hexadecimal Strings */
-HEX_STR_BODY [0-9A-Fa-f]{1,64}
-HEX_STR_SKIP [^0-9A-Fa-f>]{1,64}
+HEX_STR_BODY [0-9A-Fa-f]{1,16}
+HEX_STR_SKIP [^0-9A-Fa-f>]{1,16}
 
 /* 7.3.5 Name Objects */
 OBJ_NAME \/{GRP_REGULAR}{1,256}
@@ -128,7 +128,7 @@ OBJ_DICT_SKIP .|{GRP_NEWLINE}
 /* 7.3.8 Stream Objects */
 OBJ_STREAM_OPEN stream\r?\n
 OBJ_STREAM_CLOSE {EOL_MARKER}endstream
-OBJ_STREAM_SKIP {GRP_NOT_NEWLINE}{1,64}|{GRP_NEWLINE}
+OBJ_STREAM_SKIP {GRP_NOT_NEWLINE}{1,16}|{GRP_NEWLINE}
 
 /* 7.3.9 Null Object */
 OBJ_NULL null
@@ -142,8 +142,8 @@ OBJ_REFERENCE {OBJ_INT_NUM}{GRP_WHITESPACE}+{OBJ_INT_NUM}{GRP_WHITESPACE}
 
 /* Not object start, not comments */
-SKIP [^[:digit:]%]{1,64}|.
-WHITESPACE {GRP_WHITESPACE}{1,64}
+SKIP [^[:digit:]%]{1,16}|.
+WHITESPACE {GRP_WHITESPACE}{1,16}
 
 /* Start conditions: INITIAL or inside dictionary, literal string, hexadecimal string, stream */
 
 %x indobj
diff --git a/src/js_norm/test/CMakeLists.txt b/src/js_norm/test/CMakeLists.txt
index 260c15a9c..66d0ff0b6 100644
--- a/src/js_norm/test/CMakeLists.txt
+++ b/src/js_norm/test/CMakeLists.txt
@@ -20,20 +20,6 @@ add_catch_test( js_normalizer_test
     js_test_utils.cc
 )
 
-if (ENABLE_BENCHMARK_TESTS)
-    add_catch_test( js_norm_benchmark
-        SOURCES
-        ${js_tokenizer_OUTPUTS}
-        ../js_identifier_ctx.cc
-        ../js_normalizer.cc
-        ${CMAKE_SOURCE_DIR}/src/utils/streambuf.cc
-        ${CMAKE_SOURCE_DIR}/src/utils/util_cstring.cc
-        js_test_options.cc
-        js_test_stubs.cc
-        js_test_utils.cc
-    )
-endif(ENABLE_BENCHMARK_TESTS)
-
 add_catch_test( js_dealias_test
     SOURCES
     ${js_tokenizer_OUTPUTS}
@@ -79,3 +65,27 @@ add_catch_test( pdf_tokenizer_test
     ${pdf_tokenizer_OUTPUTS}
     js_test_stubs.cc
 )
+
+if (ENABLE_BENCHMARK_TESTS)
+
+    add_catch_test( js_norm_benchmark
+        SOURCES
+        ${js_tokenizer_OUTPUTS}
+        ../js_identifier_ctx.cc
+        ../js_normalizer.cc
+        ${CMAKE_SOURCE_DIR}/src/utils/streambuf.cc
+        ${CMAKE_SOURCE_DIR}/src/utils/util_cstring.cc
+        js_test_options.cc
+        js_test_stubs.cc
+        js_test_utils.cc
+    )
+
+    add_catch_test( pdf_tokenizer_benchmark
+        SOURCES
+        ${pdf_tokenizer_OUTPUTS}
+        ${CMAKE_SOURCE_DIR}/src/utils/streambuf.cc
+        ${CMAKE_SOURCE_DIR}/src/utils/util_cstring.cc
+        js_test_stubs.cc
+    )
+
+endif(ENABLE_BENCHMARK_TESTS)
diff --git a/src/js_norm/test/js_test_utils.h b/src/js_norm/test/js_test_utils.h
index 40be89ca0..1773e9609 100644
--- a/src/js_norm/test/js_test_utils.h
+++ b/src/js_norm/test/js_test_utils.h
@@ -51,7 +51,6 @@ public:
     bool scope_push(JSProgramScopeType) override { return true; }
     bool scope_pop(JSProgramScopeType) override { return true; }
     void reset() override {}
-    size_t size() const override { return 0; }
 };
 
 class JSTestConfig;
diff --git a/src/js_norm/test/pdf_tokenizer_benchmark.cc b/src/js_norm/test/pdf_tokenizer_benchmark.cc
new file mode 100644
index 000000000..ab447eb75
--- /dev/null
+++ b/src/js_norm/test/pdf_tokenizer_benchmark.cc
@@ -0,0 +1,208 @@
+//--------------------------------------------------------------------------
+// Copyright (C) 2022-2022 Cisco and/or its affiliates. All rights reserved.
+//
+// This program is free software; you can redistribute it and/or modify it
+// under the terms of the GNU General Public License Version 2 as published
+// by the Free Software Foundation. You may not use, modify or distribute
+// this program under any other version of the GNU General Public License.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License along
+// with this program; if not, write to the Free Software Foundation, Inc.,
+// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+//--------------------------------------------------------------------------
+// pdf_tokenizer_benchmark.cc author Cisco
+
+#ifdef BENCHMARK_TEST
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <cstring>
+#include <string>
+
+#include <iostream>
+
+#include "catch/catch.hpp"
+#include "js_norm/pdf_tokenizer.h"
+#include "utils/streambuf.h"
+
+using namespace jsn;
+using namespace snort;
+using namespace std;
+
+static const string make_input(const char* begin, const char* mid, const char* end, size_t len)
+{
+    string str(begin);
+    int fill = (len - strlen(begin) - strlen(end)) / strlen(mid);
+
+    for (int i = 0; i < fill; ++i)
+        str.append(mid);
+    str.append(end);
+
+    return str;
+}
+
+static const string make_input_repeat(const char* pattern, int cnt)
+{
+    string str;
+
+    while (cnt--)
+        str.append(pattern);
+
+    return str;
+}
+
+#define set_input(input) \
+    buf_in.pubsetbuf(nullptr, 0)->pubsetbuf((char*)(input).c_str(), (input).size()); \
+    buf_out.reserve((input).size()) \
+
+#define rewind() \
+    buf_in.pubseekoff(0, ios_base::beg, ios_base::in); \
+    buf_out.pubseekoff(0, ios_base::beg, ios_base::out) \
+
+TEST_CASE("PDF Tokenizer, literals by 8 K", "[PDFTokenizer]")
+{
+    constexpr size_t size = 1 << 13;
+
+    const auto data_cpy = make_input("", " ", "", size);
+    const auto data_wsp = make_input("1 1 obj\n", " ", " \nendobj\n", size);
+    const auto data_com = make_input("1 2 obj\n%", "c", "\n \nendobj\n", size);
+    const auto data_str = make_input("1 3 obj\n(", "s", ") \nendobj\n", size);
+    const auto data_hex = make_input("1 4 obj\n<", "0", "> \nendobj\n", size);
+    const auto data_stm = make_input("1 5 obj\n<>stream\n", ".",
+        "\nendstream\nendobj\n", size);
+
+    char dst[size + 128];
+    istreambuf_glue buf_in;
+    ostreambuf_infl buf_out;
+    istream in(&buf_in);
+    ostream out(&buf_out);
+    PDFTokenizer parser(in, out);
+
+    BENCHMARK("memcpy()")
+    {
+        return memcpy(dst, data_cpy.c_str(), data_cpy.size());
+    };
+
+    set_input(data_wsp);
+    BENCHMARK("whitespace")
+    {
+        rewind();
+        return parser.process();
+    };
+
+    set_input(data_com);
+    BENCHMARK("comment")
+    {
+        rewind();
+        return parser.process();
+    };
+
+    set_input(data_str);
+    BENCHMARK("literal string")
+    {
+        rewind();
+        return parser.process();
+    };
+
+    set_input(data_hex);
+    BENCHMARK("hexadecimal string")
+    {
+        rewind();
+        return parser.process();
+    };
+
+    set_input(data_stm);
+    BENCHMARK("stream")
+    {
+        rewind();
+        return parser.process();
+    };
+}
+
+TEST_CASE("PDF Parser, literals by 64 K", "[PDFTokenizer]")
+{
+    constexpr size_t size = 1 << 16;
+
+    const auto data_cpy = make_input("", " ", "", size);
+    const auto data_wsp = make_input("1 1 obj\n", " ", " \nendobj\n", size);
+    const auto data_com = make_input("1 2 obj\n%", "c", "\n \nendobj\n", size);
+    const auto data_str = make_input("1 3 obj\n(", "s", ") \nendobj\n", size);
+    const auto data_hex = make_input("1 4 obj\n<", "0", "> \nendobj\n", size);
+    const auto data_stm = make_input("1 5 obj\n<>stream\n", ".",
+        "\nendstream\nendobj\n", size);
+
+    char dst[size + 128];
+    istreambuf_glue buf_in;
+    ostreambuf_infl buf_out;
+    istream in(&buf_in);
+    ostream out(&buf_out);
+    PDFTokenizer parser(in, out);
+
+    BENCHMARK("memcpy()")
+    {
+        return memcpy(dst, data_cpy.c_str(), data_cpy.size());
+    };
+
+    set_input(data_wsp);
+    BENCHMARK("whitespace")
+    {
+        rewind();
+        return parser.process();
+    };
+
+    set_input(data_com);
+    BENCHMARK("comment")
+    {
+        rewind();
+        return parser.process();
+    };
+
+    set_input(data_str);
+    BENCHMARK("literal string")
+    {
+        rewind();
+        return parser.process();
+    };
+
+    set_input(data_hex);
+    BENCHMARK("hexadecimal string")
+    {
+        rewind();
+        return parser.process();
+    };
+
+    set_input(data_stm);
+    BENCHMARK("stream")
+    {
+        rewind();
+        return parser.process();
+    };
+}
+
+TEST_CASE("PDF Tokenizer, indirect objects", "[PDFTokenizer]")
+{
+    // 30 * 2048 = 61440
+    const auto data = make_input_repeat("1 0 obj\n % any object\n endobj\n", 2048);
+
+    istreambuf_glue buf_in;
+    ostreambuf_infl buf_out;
+    istream in(&buf_in);
+    ostream out(&buf_out);
+    PDFTokenizer parser(in, out);
+
+    set_input(data);
+    BENCHMARK("same object repeated")
+    {
+        rewind();
+        return parser.process();
+    };
+}
+
+#endif