OBJ_BOOLEAN true|false
/* 7.3.3 Numeric Objects */
-OBJ_INT_NUM [+-]?[0-9]{1,64}
-OBJ_REL_NUM [+-]?("."?[0-9]{1,64}|[0-9]{1,64}"."?|[0-9]{1,64}"."?[0-9]{1,64})
+OBJ_INT_NUM [+-]?[0-9]{1,16}
+OBJ_REL_NUM [+-]?("."?[0-9]{1,16}|[0-9]{1,16}"."?|[0-9]{1,16}"."?[0-9]{1,16})
/* 7.3.4 String Objects */
OBJ_LIT_STR_OPEN "("
LIT_STR_ESC_OCT \\[0-7]{1}|\\[0-7]{2}|\\[0-7]{3}
LIT_STR_ESC_EOL \\[\x0d\x0a]|\\\x0d\x0a
LIT_STR_EOL [\x0d\x0a]|\x0d\x0a
-LIT_STR_BODY [^\\\(\)]{1,64}
+LIT_STR_BODY [^\\\(\)]{1,16}
/* 7.9.2.2 Text String Type, UTF-16BE */
/* RFC 2781: 4.3 Interpreting text labelled as UTF-16 */
LIT_STR_U16_BODY [^\\\(\)]{1,16}
/* 7.3.4.3 Hexadecimal Strings */
-HEX_STR_BODY [0-9A-Fa-f]{1,64}
-HEX_STR_SKIP [^0-9A-Fa-f>]{1,64}
+HEX_STR_BODY [0-9A-Fa-f]{1,16}
+HEX_STR_SKIP [^0-9A-Fa-f>]{1,16}
/* 7.3.5 Name Objects */
OBJ_NAME \/{GRP_REGULAR}{1,256}
/* 7.3.8 Stream Objects */
OBJ_STREAM_OPEN stream\r?\n
OBJ_STREAM_CLOSE {EOL_MARKER}endstream
-OBJ_STREAM_SKIP {GRP_NOT_NEWLINE}{1,64}|{GRP_NEWLINE}
+OBJ_STREAM_SKIP {GRP_NOT_NEWLINE}{1,16}|{GRP_NEWLINE}
/* 7.3.9 Null Object */
OBJ_NULL null
/* Not object start, not comments */
-SKIP [^[:digit:]%]{1,64}|.
-WHITESPACE {GRP_WHITESPACE}{1,64}
+SKIP [^[:digit:]%]{1,16}|.
+WHITESPACE {GRP_WHITESPACE}{1,16}
/* Start conditions: INITIAL or inside dictionary, literal string, hexadecimal string, stream */
%x indobj
--- /dev/null
+//--------------------------------------------------------------------------
+// Copyright (C) 2022-2022 Cisco and/or its affiliates. All rights reserved.
+//
+// This program is free software; you can redistribute it and/or modify it
+// under the terms of the GNU General Public License Version 2 as published
+// by the Free Software Foundation. You may not use, modify or distribute
+// this program under any other version of the GNU General Public License.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License along
+// with this program; if not, write to the Free Software Foundation, Inc.,
+// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+//--------------------------------------------------------------------------
+// pdf_tokenizer_benchmark.cc author Cisco
+
+#ifdef BENCHMARK_TEST
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <cstring>
+#include <string>
+
+#include <FlexLexer.h>
+
+#include "catch/catch.hpp"
+#include "js_norm/pdf_tokenizer.h"
+#include "utils/streambuf.h"
+
+using namespace jsn;
+using namespace snort;
+using namespace std;
+
+// Builds a benchmark input of at most `len` bytes: `begin`, followed by as many
+// whole copies of `mid` as fit into the room left by `begin` and `end`,
+// followed by `end`.
+// No filler is added when `mid` is empty or when `begin` plus `end` already
+// reach `len`; the result is then just `begin` + `end`.
+static const string make_input(const char* begin, const char* mid, const char* end, size_t len)
+{
+    const size_t frame_len = strlen(begin) + strlen(end);
+    const size_t mid_len = strlen(mid);
+
+    // Stay in size_t and guard both hazards of the naive formula:
+    // unsigned wrap-around of (len - frame_len) and division by zero
+    const size_t fill = (mid_len != 0 && len > frame_len) ? (len - frame_len) / mid_len : 0;
+
+    string str(begin);
+    str.reserve(frame_len + fill * mid_len);
+
+    for (size_t i = 0; i < fill; ++i)
+        str.append(mid, mid_len);
+    str.append(end);
+
+    return str;
+}
+
+// Returns `pattern` concatenated `cnt` times.
+// A zero or negative `cnt` yields an empty string.
+static const string make_input_repeat(const char* pattern, int cnt)
+{
+    string str;
+
+    // A negative count must not start a countdown that never reaches zero
+    if (cnt <= 0)
+        return str;
+
+    const size_t pattern_len = strlen(pattern);
+    str.reserve(pattern_len * static_cast<size_t>(cnt));
+
+    for (int i = 0; i < cnt; ++i)
+        str.append(pattern, pattern_len);
+
+    return str;
+}
+
+// Points the input stream buffer at the bytes of `input` and reserves the same
+// amount in the output buffer. Expects `buf_in` and `buf_out` to be in scope at
+// the point of use; `input` must provide c_str() and size().
+// Wrapped in do/while(0) so the expansion is a single statement, and written
+// without a trailing line continuation so the next source line is not spliced
+// into the macro.
+#define set_input(input) \
+    do \
+    { \
+        buf_in.pubsetbuf(nullptr, 0)->pubsetbuf(const_cast<char*>((input).c_str()), \
+            (input).size()); \
+        buf_out.reserve((input).size()); \
+    } while (0)
+
+// Seeks the input buffer (read position) and the output buffer (write position)
+// back to offset 0. Expects `buf_in` and `buf_out` to be in scope at the point
+// of use.
+// NOTE: as a function-like macro this name hides rewind(FILE*) from <cstdio>
+// for the rest of the translation unit.
+// Wrapped in do/while(0) so the expansion is a single statement, and written
+// without a trailing line continuation so the next source line is not spliced
+// into the macro.
+#define rewind() \
+    do \
+    { \
+        buf_in.pubseekoff(0, ios_base::beg, ios_base::in); \
+        buf_out.pubseekoff(0, ios_base::beg, ios_base::out); \
+    } while (0)
+
+// Benchmarks PDFTokenizer::process() on inputs of about 8 KiB, each one filled
+// with a single kind of content; a memcpy() of the same size is the baseline.
+TEST_CASE("PDF Tokenizer, literals by 8 K", "[PDFTokenizer]")
+{
+    constexpr size_t input_len = 1 << 13;
+
+    // Each input is a prefix, a one-character filler repeated, and a suffix
+    const string spaces_only = make_input("", " ", "", input_len);
+    const string whitespace_obj = make_input("1 1 obj\n", " ", " \nendobj\n", input_len);
+    const string comment_obj = make_input("1 2 obj\n%", "c", "\n \nendobj\n", input_len);
+    const string literal_str_obj = make_input("1 3 obj\n(", "s", ") \nendobj\n", input_len);
+    const string hex_str_obj = make_input("1 4 obj\n<", "0", "> \nendobj\n", input_len);
+    const string stream_obj = make_input("1 5 obj\n<</Length 8192>>stream\n", ".",
+        "\nendstream\nendobj\n", input_len);
+
+    // Destination of the baseline copy
+    char copy_dst[input_len + 128];
+
+    // The tokenizer reads from buf_in and writes to buf_out through these streams
+    istreambuf_glue buf_in;
+    ostreambuf_infl buf_out;
+    istream in(&buf_in);
+    ostream out(&buf_out);
+    PDFTokenizer parser(in, out);
+
+    BENCHMARK("memcpy()")
+    {
+        return memcpy(copy_dst, spaces_only.c_str(), spaces_only.size());
+    };
+
+    // For every case: attach the input once, then rewind both buffers per run
+    set_input(whitespace_obj);
+    BENCHMARK("whitespace")
+    {
+        rewind();
+        return parser.process();
+    };
+
+    set_input(comment_obj);
+    BENCHMARK("comment")
+    {
+        rewind();
+        return parser.process();
+    };
+
+    set_input(literal_str_obj);
+    BENCHMARK("literal string")
+    {
+        rewind();
+        return parser.process();
+    };
+
+    set_input(hex_str_obj);
+    BENCHMARK("hexadecimal string")
+    {
+        rewind();
+        return parser.process();
+    };
+
+    set_input(stream_obj);
+    BENCHMARK("stream")
+    {
+        rewind();
+        return parser.process();
+    };
+}
+
+// Benchmarks PDFTokenizer::process() on inputs of about 64 KiB, each one filled
+// with a single kind of content; a memcpy() of the same size is the baseline.
+// Named "PDF Tokenizer" to agree with the other test cases in this file.
+TEST_CASE("PDF Tokenizer, literals by 64 K", "[PDFTokenizer]")
+{
+    constexpr size_t size = 1 << 16;
+
+    // Each input is a prefix, a one-character filler repeated, and a suffix
+    const auto data_cpy = make_input("", " ", "", size);
+    const auto data_wsp = make_input("1 1 obj\n", " ", " \nendobj\n", size);
+    const auto data_com = make_input("1 2 obj\n%", "c", "\n \nendobj\n", size);
+    const auto data_str = make_input("1 3 obj\n(", "s", ") \nendobj\n", size);
+    const auto data_hex = make_input("1 4 obj\n<", "0", "> \nendobj\n", size);
+    const auto data_stm = make_input("1 5 obj\n<</Length 65536>>stream\n", ".",
+        "\nendstream\nendobj\n", size);
+
+    // Destination of the baseline copy
+    char dst[size + 128];
+
+    // The tokenizer reads from buf_in and writes to buf_out through these streams
+    istreambuf_glue buf_in;
+    ostreambuf_infl buf_out;
+    istream in(&buf_in);
+    ostream out(&buf_out);
+    PDFTokenizer parser(in, out);
+
+    BENCHMARK("memcpy()")
+    {
+        return memcpy(dst, data_cpy.c_str(), data_cpy.size());
+    };
+
+    // For every case: attach the input once, then rewind both buffers per run
+    set_input(data_wsp);
+    BENCHMARK("whitespace")
+    {
+        rewind();
+        return parser.process();
+    };
+
+    set_input(data_com);
+    BENCHMARK("comment")
+    {
+        rewind();
+        return parser.process();
+    };
+
+    set_input(data_str);
+    BENCHMARK("literal string")
+    {
+        rewind();
+        return parser.process();
+    };
+
+    set_input(data_hex);
+    BENCHMARK("hexadecimal string")
+    {
+        rewind();
+        return parser.process();
+    };
+
+    set_input(data_stm);
+    BENCHMARK("stream")
+    {
+        rewind();
+        return parser.process();
+    };
+}
+
+// Benchmarks PDFTokenizer::process() on one small indirect object repeated
+// back to back.
+TEST_CASE("PDF Tokenizer, indirect objects", "[PDFTokenizer]")
+{
+    // 2048 copies of a 30-byte object: 61440 bytes in total
+    const string repeated_objects =
+        make_input_repeat("1 0 obj\n % any object\n endobj\n", 2048);
+
+    // The tokenizer reads from buf_in and writes to buf_out through these streams
+    istreambuf_glue buf_in;
+    ostreambuf_infl buf_out;
+    istream in(&buf_in);
+    ostream out(&buf_out);
+    PDFTokenizer parser(in, out);
+
+    // Attach the input once, then rewind both buffers per run
+    set_input(repeated_objects);
+    BENCHMARK("same object repeated")
+    {
+        rewind();
+        return parser.process();
+    };
+}
+
+#endif