From: Danylo Kyrylov -X (dkyrylov - SOFTSERVE INC at Cisco) Date: Mon, 21 Oct 2024 20:40:57 +0000 (+0000) Subject: Pull request #4482: js_norm: add cross-PDU PDF token reassembly X-Git-Tag: 3.5.1.0~13 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=a79629b5eee24d7de996f499f7ba126215cc0234;p=thirdparty%2Fsnort3.git Pull request #4482: js_norm: add cross-PDU PDF token reassembly Merge in SNORT/snort3 from ~DKYRYLOV/snort3:js_pdf_token_split to master Squashed commit of the following: commit 9bb663ecbe181eec9401428277a80d0068a10801 Author: dkyrylov Date: Thu Oct 10 13:39:45 2024 +0300 js_norm: add cross-PDU PDF token reassembly --- diff --git a/src/js_norm/js_pdf_norm.cc b/src/js_norm/js_pdf_norm.cc index ad3d8df79..557a7a78c 100644 --- a/src/js_norm/js_pdf_norm.cc +++ b/src/js_norm/js_pdf_norm.cc @@ -50,6 +50,7 @@ bool PDFJSNorm::pre_proc() } buf_pdf_in.pubsetbuf(nullptr, 0) + ->pubsetbuf(state_buf, state_len) ->pubsetbuf(const_cast<char*>((const char*)src_ptr), src_end - src_ptr); pdf_out.clear(); delete[] buf_pdf_out.take_data(); diff --git a/src/js_norm/js_pdf_norm.h b/src/js_norm/js_pdf_norm.h index 47b379fde..4573ac786 100644 --- a/src/js_norm/js_pdf_norm.h +++ b/src/js_norm/js_pdf_norm.h @@ -47,14 +47,19 @@ public: PDFJSNorm(JSNormConfig* cfg, uint32_t gen_id) : JSNorm(cfg, false, gen_id), pdf_in(&buf_pdf_in), pdf_out(&buf_pdf_out), - extractor(pdf_in, pdf_out, cfg ? cfg->pdf_max_dictionary_depth : 0) + extractor(pdf_in, pdf_out, state_buf, state_len, cfg ? 
cfg->pdf_max_dictionary_depth : 0) { } + virtual ~PDFJSNorm() override + { delete[] state_buf; } + protected: bool pre_proc() override; bool post_proc(int) override; private: + char* state_buf = nullptr; + int state_len = 0; snort::istreambuf_glue buf_pdf_in; snort::ostreambuf_infl buf_pdf_out; std::istream pdf_in; diff --git a/src/js_norm/pdf_tokenizer.h b/src/js_norm/pdf_tokenizer.h index bb1972955..17223cec9 100644 --- a/src/js_norm/pdf_tokenizer.h +++ b/src/js_norm/pdf_tokenizer.h @@ -48,7 +48,7 @@ public: }; PDFTokenizer() = delete; - explicit PDFTokenizer(std::istream& in, std::ostream& out, int dictionaries_max_size); + explicit PDFTokenizer(std::istream& in, std::ostream& out, char*& state_buf, int& state_len, int dictionaries_max_size); ~PDFTokenizer() override; PDFRet process(); @@ -56,9 +56,15 @@ public: private: int yylex() override; + void state_add(int len); + void state_store(); + void state_clear(); + void state_act(); + PDFRet h_dict_open(); PDFRet h_dict_close(); PDFRet h_dict_name(); + PDFRet h_dict_number(); PDFRet h_dict_other(); inline bool h_lit_str(); inline bool h_hex_str(); @@ -70,9 +76,12 @@ private: PDFRet h_hex_hex2chr_u16(); PDFRet h_lit_u16(); PDFRet h_lit_u16_unescape(); - PDFRet h_stream_open(); - PDFRet h_stream(); PDFRet h_array_nesting(); + PDFRet h_stream_open(); + void h_stream(); + void h_stream_part_close(); + PDFRet h_stream_dump_remainder(); + PDFRet h_stream_dump_remainder_u16(); bool h_stream_close(); void h_stream_length(); void h_ref(); @@ -105,9 +114,10 @@ private: struct ObjectDictionary { void clear() - { key_value = true; array_level = 0; } + { key_value = true; consecutive_number = false; array_level = 0; } bool key_value = true; + bool consecutive_number = false; int array_level = 0; }; @@ -130,10 +140,15 @@ private: struct Stream { int rem_length = -1; + int endstream_part = 0; bool is_js = false; bool is_ref_len = false; }; + char*& state_buf; + int& state_len; + bool state_added = false; + ObjectString 
obj_string; ObjectArray obj_array; std::stack dictionaries; diff --git a/src/js_norm/pdf_tokenizer.l b/src/js_norm/pdf_tokenizer.l index ee2b34cda..df5207c33 100644 --- a/src/js_norm/pdf_tokenizer.l +++ b/src/js_norm/pdf_tokenizer.l @@ -52,6 +52,7 @@ using namespace jsn; #define YY_USER_ACTION \ { \ + state_act(); \ debug_logf(5, js_trace, TRACE_PDF_PROC, nullptr, \ "PDF pattern #%d, sc %d\n", yy_act, YY_START); \ debug_logf(5, js_trace, TRACE_PDF_DUMP, nullptr, \ @@ -87,6 +88,7 @@ COMMENT_END {EOL_MARKER} /* 7.3.2 Boolean Objects */ OBJ_BOOLEAN true|false +OBJ_PARTIAL_BOOL t|tr|tru|f|fa|fal|fals /* 7.3.3 Numeric Objects */ OBJ_INT_NUM [+-]?[0-9]{1,16} @@ -132,17 +134,22 @@ OBJ_DICT_CLOSE ">>" OBJ_DICT_SKIP .|{GRP_NEWLINE} /* 7.3.8 Stream Objects */ -OBJ_STREAM_OPEN stream\r?\n -OBJ_STREAM_CLOSE {EOL_MARKER}endstream -OBJ_STREAM_SKIP {GRP_NOT_NEWLINE}{1,16}|{GRP_NEWLINE} +OBJ_STREAM_OPEN stream\r?\n +OBJ_STREAM_PARTIAL_OPEN s|st|str|stre|strea|stream|stream\r +OBJ_STREAM_CLOSE endstream +OBJ_STREAM_PARTIAL_CLOSE e|en|end|ends|endst|endstr|endstre|endstrea +OBJ_STREAM_SKIP [^e]{1,16} /* 7.3.9 Null Object */ OBJ_NULL null +OBJ_PARTIAL_NULL n|nu|nul /* 7.3.10 Indirect Objects */ -INDIRECT_OBJ_OPEN {OBJ_INT_NUM}{GRP_WHITESPACE}+{OBJ_INT_NUM}{GRP_WHITESPACE}+obj +INDIRECT_OBJ_OPEN {OBJ_INT_NUM}{GRP_WHITESPACE}+{OBJ_INT_NUM}{GRP_WHITESPACE}+obj +INDIRECT_OBJ_PARTIAL_OPEN {OBJ_INT_NUM}{GRP_WHITESPACE}*{OBJ_INT_NUM}?{GRP_WHITESPACE}*(o|ob) INDIRECT_OBJ_CLOSE endobj +INDIRECT_OBJ_PARTIAL_CLOSE e|en|end|endo|endob OBJ_REFERENCE {OBJ_INT_NUM}{GRP_WHITESPACE}+{OBJ_INT_NUM}{GRP_WHITESPACE}+R @@ -180,8 +187,10 @@ WHITESPACE {GRP_WHITESPACE}{1,16} {COMMENT_END} { POP(); } {INDIRECT_OBJ_OPEN} { PUSH(indobj); h_ind_obj_open(); } +{INDIRECT_OBJ_PARTIAL_OPEN} { state_add(yyleng); } {WHITESPACE} { } {INDIRECT_OBJ_CLOSE} { POP(); h_ind_obj_close(); EXEC(h_array_nesting()); } +{INDIRECT_OBJ_PARTIAL_CLOSE} { state_add(yyleng); } {OBJ_ARRAY_OPEN} { PUSH(array); ++obj_array.nesting_level; 
} {OBJ_REFERENCE} { indirect_obj.ref_met = true; } {OBJ_BOOLEAN} { } @@ -200,35 +209,41 @@ WHITESPACE {GRP_WHITESPACE}{1,16} {OBJ_NULL} { } {OBJ_NAME} { } {OBJ_LIT_STR_OPEN} { if (h_lit_open()) PUSH(litstr); } -{OBJ_HEX_STR_OPEN} { PUSH(hexstr); } +{OBJ_HEX_STR_OPEN} { PUSH(hexstr); state_add(yyleng); } {OBJ_ARRAY_SKIP} { } {INDIRECT_OBJ_CLOSE} { return PDFRet::UNEXPECTED_SYMBOL; } {OBJ_STREAM_OPEN} { EXEC(h_stream_open()); PUSH(obj_stream.is_js ? u16 : stream); } -{OBJ_STREAM_SKIP} { EXEC(h_stream()); } -{OBJ_STREAM_SKIP} { EXEC(h_stream()); ECHO; } -{OBJ_STREAM_SKIP} { EXEC(h_stream()); EXEC(h_lit_u16()); } +{OBJ_STREAM_PARTIAL_OPEN} { state_add(yyleng); } {OBJ_STREAM_CLOSE} { if (h_stream_close()) POP(); } +{OBJ_STREAM_PARTIAL_CLOSE} { EXEC(h_stream_dump_remainder()); h_stream_part_close(); state_add(yyleng); } {OBJ_STREAM_CLOSE} { if (h_stream_close()) POP(); } +{OBJ_STREAM_PARTIAL_CLOSE} { EXEC(h_stream_dump_remainder()); h_stream_part_close(); state_add(yyleng); } {OBJ_STREAM_CLOSE} { if (h_stream_close()) POP(); } +{OBJ_STREAM_PARTIAL_CLOSE} { EXEC(h_stream_dump_remainder_u16()); h_stream_part_close(); state_add(yyleng); } +{OBJ_STREAM_SKIP} { EXEC(h_stream_dump_remainder()); h_stream(); } +{OBJ_STREAM_SKIP} { EXEC(h_stream_dump_remainder()); h_stream(); ECHO; } +{OBJ_STREAM_SKIP} { EXEC(h_stream_dump_remainder_u16()); h_stream(); EXEC(h_lit_u16()); } {OBJ_DICT_OPEN} { PUSH(dictnr); EXEC(h_dict_open()); } {OBJ_DICT_OPEN} { PUSH(dictnr); EXEC(h_dict_open()); } {OBJ_DICT_OPEN} { PUSH(dictnr); EXEC(h_dict_open()); } {OBJ_DICT_CLOSE} { return PDFRet::INCORRECT_BRACKETS_NESTING; } {OBJ_DICT_CLOSE} { POP(); EXEC(h_dict_close()); } -{WHITESPACE} { } -{OBJ_REFERENCE} { EXEC(h_dict_other()); h_ref(); } +{WHITESPACE} { state_add(yyleng); } +{OBJ_REFERENCE} { dictionaries.top().consecutive_number = false; EXEC(h_dict_other()); h_ref(); } {OBJ_BOOLEAN} { EXEC(h_dict_other()); } -{OBJ_INT_NUM} { EXEC(h_dict_other()); h_stream_length(); } -{OBJ_REL_NUM} { 
EXEC(h_dict_other()); } +{OBJ_PARTIAL_BOOL} { state_add(yyleng); } +{OBJ_INT_NUM} { EXEC(h_dict_number()); h_stream_length(); state_add(yyleng); } +{OBJ_REL_NUM} { EXEC(h_dict_number()); state_add(yyleng); } {OBJ_NULL} { EXEC(h_dict_other()); } -{OBJ_NAME} { EXEC(h_dict_name()); } +{OBJ_PARTIAL_NULL} { state_add(yyleng); } +{OBJ_NAME} { EXEC(h_dict_name()); state_add(yyleng); } {OBJ_ARRAY_OPEN} { PUSH(array); ++obj_array.nesting_level; EXEC(h_dict_other()); } {OBJ_ARRAY_CLOSE} { return PDFRet::INCORRECT_BRACKETS_NESTING; } {OBJ_LIT_STR_OPEN} { EXEC(h_dict_other()); if (h_lit_str()) PUSH(jslstr); else PUSH(litstr); yyless(0); } -{OBJ_HEX_STR_OPEN} { EXEC(h_dict_other()); if (h_hex_str()) PUSH(jshstr); else PUSH(hexstr); yyless(0); } -{OBJ_DICT_SKIP} { } +{OBJ_HEX_STR_OPEN} { EXEC(h_dict_other()); if (h_hex_str()) PUSH(jshstr); else PUSH(hexstr); yyless(0); state_add(yyleng); } +{OBJ_DICT_SKIP} { state_add(yyleng); } {INDIRECT_OBJ_CLOSE} { return PDFRet::UNEXPECTED_SYMBOL; } {OBJ_LIT_STR_OPEN} { if (h_lit_open()) PUSH(litstr); } @@ -240,7 +255,7 @@ WHITESPACE {GRP_WHITESPACE}{1,16} {LIT_STR_EOL} { } {LIT_STR_BODY} { } -{OBJ_HEX_STR_OPEN} { PUSH(hexstr); } +{OBJ_HEX_STR_OPEN} { state_add(yyleng); PUSH(hexstr); } {OBJ_HEX_STR_CLOSE} { POP(); } {HEX_STR_BODY} { } {HEX_STR_SKIP} { } @@ -270,9 +285,9 @@ WHITESPACE {GRP_WHITESPACE}{1,16} {HEX_STR_BODY} { EXEC(h_hex_hex2chr_u16()); } {HEX_STR_SKIP} { } -<*><> { return PDFRet::EOS; } +<*><> { state_store(); return PDFRet::EOS; } -{SKIP} { } +{SKIP} { state_add(yyleng); } <*>.|\n { return PDFRet::UNEXPECTED_SYMBOL; } %% @@ -314,6 +329,12 @@ PDFTokenizer::PDFRet PDFTokenizer::h_dict_other() if (dictionaries.top().array_level != obj_array.nesting_level) return PDFRet::EOS; + if (dictionaries.top().consecutive_number) + { + dictionaries.top().consecutive_number = false; + dictionaries.top().key_value = !dictionaries.top().key_value; + } + if (dictionaries.top().key_value) return PDFRet::NOT_NAME_IN_DICTIONARY_KEY; @@ -328,11 
+349,36 @@ PDFTokenizer::PDFRet PDFTokenizer::h_dict_other() return PDFRet::EOS; } +PDFTokenizer::PDFRet PDFTokenizer::h_dict_number() +{ + if(!dictionaries.top().consecutive_number) + state_clear(); + + if (dictionaries.top().key_value) + return PDFRet::NOT_NAME_IN_DICTIONARY_KEY; + + debug_logf(6, js_trace, TRACE_PDF_PROC, nullptr, + "dictionary token: number\n"); + + debug_logf(6, js_trace, TRACE_PDF_DUMP, nullptr, + "dictionary entry: %s, %s\n", obj_entry.key, yytext); + + dictionaries.top().consecutive_number = true; + + return PDFRet::EOS; +} + PDFTokenizer::PDFRet PDFTokenizer::h_dict_name() { if (dictionaries.top().array_level != obj_array.nesting_level) return PDFRet::EOS; + if (dictionaries.top().consecutive_number) + { + dictionaries.top().consecutive_number = false; + dictionaries.top().key_value = !dictionaries.top().key_value; + } + if (dictionaries.top().key_value) strncpy(obj_entry.key, yytext, sizeof(obj_entry.key) - 1); @@ -495,9 +541,36 @@ PDFTokenizer::PDFRet PDFTokenizer::h_stream_open() return PDFRet::EOS; } -PDFTokenizer::PDFRet PDFTokenizer::h_stream() +void PDFTokenizer::h_stream() { obj_stream.rem_length -= yyleng; +} + +void PDFTokenizer::h_stream_part_close() +{ + obj_stream.endstream_part = yyleng; +} + +static const char endstream_tag[] = "endstream"; + +PDFTokenizer::PDFRet PDFTokenizer::h_stream_dump_remainder() +{ + int part = obj_stream.endstream_part; + obj_stream.endstream_part = 0; + obj_stream.rem_length -= part; + if (YY_START == jsstream) + for(const char* c = endstream_tag; c < endstream_tag + part; c++) + yyout << *c; + return PDFRet::EOS; +} + +PDFTokenizer::PDFRet PDFTokenizer::h_stream_dump_remainder_u16() +{ + int part = obj_stream.endstream_part; + obj_stream.endstream_part = 0; + obj_stream.rem_length -= part; + for(const char* c = endstream_tag; c < endstream_tag + part; c++) + EXEC(u16_eval(*c)); return PDFRet::EOS; } @@ -506,11 +579,7 @@ bool PDFTokenizer::h_stream_close() obj_stream.rem_length -= yyleng; if 
(obj_stream.rem_length <= 0) - { - if (YY_START == jsstream) - yyout << '\n'; return true; - } if (YY_START == jsstream) ECHO; @@ -558,6 +627,7 @@ void PDFTokenizer::h_u16_start() default: assert(false); } + u16_state.cur_byte = 0; } void PDFTokenizer::h_u16_break() @@ -584,6 +654,7 @@ void PDFTokenizer::h_u16_hex_start() assert(YY_START == jshstr); POP(); PUSH(jshstru16); + u16_state.cur_byte = 0; } void PDFTokenizer::h_u16_hex_break() @@ -671,8 +742,8 @@ void PDFTokenizer::u16_to_u8(uint32_t code) yyout << out; } -PDFTokenizer::PDFTokenizer(std::istream& in, std::ostream& out, int dictionaries_max_size) - : yyFlexLexer(in, out), dictionaries_max_size(dictionaries_max_size) +PDFTokenizer::PDFTokenizer(std::istream& in, std::ostream& out, char*& state_buf, int& state_len, int dictionaries_max_size) + : yyFlexLexer(in, out), state_buf(state_buf), state_len(state_len), dictionaries_max_size(dictionaries_max_size) { dictionaries.push(ObjectDictionary()); } @@ -696,3 +767,50 @@ PDFTokenizer::PDFRet PDFTokenizer::process() return r; } + +void PDFTokenizer::state_add(int len) +{ + state_len += len; + state_added = true; +} + +void PDFTokenizer::state_store() +{ + state_act(); + + if (state_len == 0) + return; + + if (YY_START == hexstr) + POP(); + + if (!dictionaries.top().key_value and !dictionaries.top().consecutive_number) + dictionaries.top().key_value = true; + + obj_stream.endstream_part = 0; + + char* buf = new char[state_len]; + + yyin.seekg(-state_len, std::ios_base::end); + yyin.clear(); + yyin.read(buf, state_len); + + debug_logf(6, js_trace, TRACE_PDF_DUMP, nullptr, + "storing %d bytes for reassembly: \"%.*s\"\n",state_len,state_len,buf); + + delete[] state_buf; + state_buf = buf; +} + +void PDFTokenizer::state_clear() +{ + state_len = 0; +} + +void PDFTokenizer::state_act() +{ + if (state_added) + state_added = false; + else + state_clear(); +} diff --git a/src/js_norm/test/CMakeLists.txt b/src/js_norm/test/CMakeLists.txt index 5ba935362..79c410052 100644 
--- a/src/js_norm/test/CMakeLists.txt +++ b/src/js_norm/test/CMakeLists.txt @@ -63,6 +63,7 @@ add_catch_test( jsn_test add_catch_test( pdf_tokenizer_test SOURCES ${pdf_tokenizer_OUTPUTS} + ${CMAKE_SOURCE_DIR}/src/helpers/streambuf.cc js_test_stubs.cc ) diff --git a/src/js_norm/test/pdf_tokenizer_benchmark.cc b/src/js_norm/test/pdf_tokenizer_benchmark.cc index f484fe1bd..d0a26dec6 100644 --- a/src/js_norm/test/pdf_tokenizer_benchmark.cc +++ b/src/js_norm/test/pdf_tokenizer_benchmark.cc @@ -36,6 +36,8 @@ using namespace jsn; using namespace snort; using namespace std; +static constexpr int nesting_level = 10; + static const string make_input(const char* begin, const char* mid, const char* end, size_t len) { string str(begin); @@ -83,7 +85,9 @@ TEST_CASE("PDF Tokenizer, literals by 8 K", "[PDFTokenizer]") ostreambuf_infl buf_out; istream in(&buf_in); ostream out(&buf_out); - PDFTokenizer parser(in, out); + char* buf = nullptr; + int len; + PDFTokenizer parser(in, out, buf, len, nesting_level); BENCHMARK("memcpy()") { @@ -124,6 +128,8 @@ TEST_CASE("PDF Tokenizer, literals by 8 K", "[PDFTokenizer]") rewind(); return parser.process(); }; + + delete[] buf; } TEST_CASE("PDF Parser, literals by 64 K", "[PDFTokenizer]") @@ -143,7 +149,9 @@ TEST_CASE("PDF Parser, literals by 64 K", "[PDFTokenizer]") ostreambuf_infl buf_out; istream in(&buf_in); ostream out(&buf_out); - PDFTokenizer parser(in, out); + char* buf = nullptr; + int len; + PDFTokenizer parser(in, out, buf, len, nesting_level); BENCHMARK("memcpy()") { @@ -184,6 +192,8 @@ TEST_CASE("PDF Parser, literals by 64 K", "[PDFTokenizer]") rewind(); return parser.process(); }; + + delete[] buf; } TEST_CASE("PDF Tokenizer, indirect objects", "[PDFTokenizer]") @@ -195,7 +205,9 @@ TEST_CASE("PDF Tokenizer, indirect objects", "[PDFTokenizer]") ostreambuf_infl buf_out; istream in(&buf_in); ostream out(&buf_out); - PDFTokenizer parser(in, out); + char* buf = nullptr; + int len; + PDFTokenizer parser(in, out, buf, len, 
nesting_level); set_input(data); BENCHMARK("same object repeated") @@ -203,6 +215,8 @@ TEST_CASE("PDF Tokenizer, indirect objects", "[PDFTokenizer]") rewind(); return parser.process(); }; + + delete[] buf; } #endif diff --git a/src/js_norm/test/pdf_tokenizer_test.cc b/src/js_norm/test/pdf_tokenizer_test.cc index 8b22fe7fa..5f43140ff 100644 --- a/src/js_norm/test/pdf_tokenizer_test.cc +++ b/src/js_norm/test/pdf_tokenizer_test.cc @@ -27,6 +27,7 @@ #include #include "catch/catch.hpp" +#include "helpers/streambuf.h" #include "js_norm/pdf_tokenizer.h" using namespace jsn; @@ -41,26 +42,35 @@ static void test_pdf_proc(const string& source, const string& expected, { istringstream in(source); ostringstream out; - PDFTokenizer extractor(in, out, nesting_level); + char* buf = nullptr; + int len = 0; + PDFTokenizer extractor(in, out, buf, len, nesting_level); auto r = extractor.process(); + delete[] buf; + CHECK(ret == r); CHECK(expected == out.str()); } static void test_pdf_proc(const vector& chunks) { - istringstream in; + snort::istreambuf_glue in_buf; + istream in(&in_buf); ostringstream out; - PDFTokenizer extractor(in, out, nesting_level); + char* state_buf = nullptr; + int state_len = 0; + PDFTokenizer extractor(in, out, state_buf, state_len, nesting_level); for (const auto& chunk : chunks) { auto src = chunk.first; auto exp = chunk.second; - in.str(src); + in_buf.pubsetbuf(nullptr,0) + ->pubsetbuf(state_buf, state_len) + ->pubsetbuf(const_cast(src.c_str()), src.length()); out.str(""); auto r = extractor.process(); @@ -68,17 +78,12 @@ static void test_pdf_proc(const vector& chunks) CHECK(PDFTokenizer::PDFRet::EOS == r); CHECK(exp == out.str()); } + + delete[] state_buf; } TEST_CASE("basic", "[PDFTokenizer]") { - SECTION("no input") - { - test_pdf_proc( - "", - "" - ); - } SECTION("minimal PDF") { test_pdf_proc( @@ -355,6 +360,45 @@ TEST_CASE("basic", "[PDFTokenizer]") "", PDFTokenizer::PDFRet::NOT_NAME_IN_DICTIONARY_KEY ); } + SECTION("number as a key") + { + 
test_pdf_proc( + "1 0 obj" + "<<" + "/K1 /V1" + "1234 /V2" + "/JS (foo)" + ">>" + "endobj", + "", PDFTokenizer::PDFRet::NOT_NAME_IN_DICTIONARY_KEY + ); + } + SECTION("number as a key after a value") + { + test_pdf_proc( + "1 0 obj" + "<<" + "/K1 null " + "1234 (bar)" + "/JS (foo)" + ">>" + "endobj", + "", PDFTokenizer::PDFRet::NOT_NAME_IN_DICTIONARY_KEY + ); + } + SECTION("value as a key after a number") + { + test_pdf_proc( + "1 0 obj" + "<<" + "/K1 1234 " + "null (bar) " + "/JS (foo)" + ">>" + "endobj", + "", PDFTokenizer::PDFRet::NOT_NAME_IN_DICTIONARY_KEY + ); + } SECTION("token too long") { test_pdf_proc( @@ -779,14 +823,7 @@ TEST_CASE("JS processing", "[PDFTokenizer]") TEST_CASE("split", "[PDFTokenizer]") { - SECTION("no input") - { - test_pdf_proc({ - {"", ""}, - {"", ""}, - {"", ""} - }); - } + SECTION("minimal PDF") { test_pdf_proc({ @@ -814,6 +851,319 @@ TEST_CASE("split", "[PDFTokenizer]") {"<>\nendobj", "script 3"} }); } + + SECTION("split in indirect object index: first number") + { + test_pdf_proc({ + {"\n1", ""}, + {"23 0 obj\n", ""}, + {"<< /JS (a % b) >>\nendobj\n", "a % b"}, + }); + } + SECTION("split in indirect object index: between numbers") + { + test_pdf_proc({ + {"\n1", ""}, + {" 0 obj\n", ""}, + {"<< /JS (a % b) >>\nendobj\n", "a % b"}, + }); + } + SECTION("split in indirect object index: second number") + { + test_pdf_proc({ + {"\n1 12", ""}, + {"3 obj\n", ""}, + {"<< /JS (a % b) >>\nendobj\n", "a % b"}, + }); + } + SECTION("split in indirect object index: after numbers") + { + test_pdf_proc({ + {"\n1 0", ""}, + {" obj\n", ""}, + {"<< /JS (a % b) >>\nendobj\n", "a % b"}, + }); + } + SECTION("split in indirect object index: in keyword") + { + test_pdf_proc({ + {"\n1 0 ob", ""}, + {"j\n", ""}, + {"<< /JS (a % b) >>\nendobj\n", "a % b"}, + }); + } + SECTION("split in indirect object index: multi-PDU reassembly") + { + test_pdf_proc({ + {"\n1 ", ""}, + {"0 ob", ""}, + {"j\n<< /JS (a % b) >>\nendobj\n", "a % b"}, + }); + } + SECTION("split in 
indirect object close keyword") + { + test_pdf_proc({ + {"\n1 0 obj\n", ""}, + {"<< /K /V /JS (foo % bar) >>\nend", "foo % bar"}, + {"obj\n", ""}, + {"2 0 obj\n<>\nendobj\n","c % d"} + }); + } + SECTION("split in dictionary opening brackets") + { + test_pdf_proc({ + {"\n1 0 obj\n", ""}, + {"<", ""}, + {"< /JS (a % b) >>\nendobj\n", "a % b"}, + }); + } + SECTION("split in dictionary closing brackets") + { + test_pdf_proc({ + {"\n1 0 obj\n", ""}, + {"<< /K /V /JS (foo % bar) >", "foo % bar"}, + {">\nendobj\n", ""}, + {"2 0 obj\n<>\nendobj\n","c % d"} + }); + } + SECTION("split in name as dict-key") + { + test_pdf_proc({ + {"\n1 0 obj\n", ""}, + {"<>\nendobj\n", "a % b"} + }); + } + SECTION("split in name as dict-value") + { + test_pdf_proc({ + {"\n1 0 obj\n", ""}, + {"<>\nendobj\n", "a % b"} + }); + } + SECTION("split after name as dict-value") + { + test_pdf_proc({ + {"\n1 0 obj\n", ""}, + {"<< /K /V ", ""}, + {"/JS (a % b) >>\nendobj\n", "a % b"} + }); + } + SECTION("split in object reference value: first number") + { + test_pdf_proc({ + {"\n1 0 obj\n", ""}, + {"<>\nendobj\n", "a % b"} + }); + } + SECTION("split in object reference value: second number") + { + test_pdf_proc({ + {"\n1 0 obj\n", ""}, + {"<>\nendobj\n", "a % b"} + }); + } + SECTION("split in integer number value") + { + test_pdf_proc({ + {"\n1 0 obj\n", ""}, + {"<>\nendobj\n", "a % b"} + }); + } + SECTION("split in integer number value after sign") + { + test_pdf_proc({ + {"\n1 0 obj\n", ""}, + {"<>\nendobj\n", "a % b"} + }); + } + SECTION("split in real number value before the dot") + { + test_pdf_proc({ + {"\n1 0 obj\n", ""}, + {"<>\nendobj\n", "a % b"} + }); + } + SECTION("split in real number value after the dot") + { + test_pdf_proc({ + {"\n1 0 obj\n", ""}, + {"<>\nendobj\n", "a % b"} + }); + } + SECTION("split in boolean value") + { + test_pdf_proc({ + {"\n1 0 obj\n", ""}, + {"<>\nendobj\n", "a % b"} + }); + } + SECTION("split in null value") + { + test_pdf_proc({ + {"\n1 0 obj\n", ""}, + 
{"<>\nendobj\n", "a % b"} + }); + } + SECTION("split in object reference to stream") + { + test_pdf_proc({ + {"\n1 0 obj\n", ""}, + {"<>\nendobj\n", ""}, + { + "2 0 obj\n" + "<< /Length 20 >>\n" + "stream\n" + "JavaScript in stream\n" + "endstream\n" + "endobj\n", + "JavaScript in stream\n" + } + }); + } + SECTION("split in hex string") + { + test_pdf_proc({ + {"\n1 0 obj\n", ""}, + {"<< /JS <", ""}, + {"66 6F 6F 20 62 61 72> >>\nendobj\n", "foo bar"}, + }); + } + SECTION("split in stream: length key") + { + test_pdf_proc({ + { + "\n1 0 obj\n" + "<>\n" + "endobj\n" + "2 0 obj\n" + "<< /Leng", + "" + }, + { + "th 20 >>\n" + "stream\n" + "JavaScript in stream\n" + "endstream\n" + "endobj\n", + "JavaScript in stream\n" + } + }); + } + SECTION("split in stream: length value") + { + test_pdf_proc({ + { + "\n1 0 obj\n" + "<>\n" + "endobj\n" + "2 0 obj\n" + "<< /Length 2", + "" + }, + { + "0 >>\n" + "stream\n" + "JavaScript in stream\n" + "endstream\n" + "endobj\n", + "JavaScript in stream\n" + } + }); + } + SECTION("split in stream: stream keyword") + { + test_pdf_proc({ + { + "\n1 0 obj\n" + "<>\n" + "endobj\n" + "2 0 obj\n" + "<< /Length 20 >>\n st", + "" + }, + { + "ream\n" + "JavaScript in stream\n" + "endstream\n" + "endobj\n", + "JavaScript in stream\n" + } + }); + } + SECTION("split in stream: content") + { + test_pdf_proc({ + { + "\n1 0 obj\n" + "<>\n" + "endobj\n" + "2 0 obj\n" + "<< /Length 20 >>\n stream\n" + "JavaScript", + "JavaScript" + }, + { + " in stream\n" + "endstream\n" + "endobj\n", + " in stream\n" + } + }); + } + SECTION("split in stream: content that looks like endstream") + { + test_pdf_proc({ + { + "\n1 0 obj\n" + "<>\n" + "endobj\n" + "2 0 obj\n" + "<< /Length 23 >>\n stream\n" + "end", + "" + }, + { + "stream in JavaScript\n" + "endstream\n" + "endobj\n", + "endstream in JavaScript\n" + } + }); + } + SECTION("split in stream: endstream keyword") + { + test_pdf_proc({ + { + "\n1 0 obj\n" + "<>\n" + "endobj\n" + "2 0 obj\n" + "<< /Length 20 >>\n 
stream\n" + "JavaScript in stream\n" + "end", + "JavaScript in stream\n" + }, + { + "stream\n" + "endobj\n" + "\n3 0 obj\n" + "<>\n" + "endobj\n", + "foo" + } + }); + } } TEST_CASE("stream object", "[PDFTokenizer]") @@ -866,7 +1216,7 @@ TEST_CASE("stream object", "[PDFTokenizer]") "bar\r\n" "endstream\n" "endobj\n", - "bar\n" + "bar\r\n" ); } SECTION("reference as length") @@ -903,7 +1253,7 @@ TEST_CASE("stream object", "[PDFTokenizer]") "\nendstream\n \r\n" "endstream\n" "endobj\n", - "\nendstream\n \n" + "\nendstream\n \r\n" ); } SECTION("referenced JavaScript") @@ -1143,8 +1493,7 @@ TEST_CASE("stream object malformed", "[PDFTokenizer]") "foo" "endstream\n" "endobj\n", - "fooendstream\n" - "endobj\n", PDFTokenizer::PDFRet::EOS + "foo", PDFTokenizer::PDFRet::EOS ); } } @@ -1292,6 +1641,24 @@ TEST_CASE("UTF-16, basic", "[PDFTokenizer]") "foo"s ); } + SECTION("stream with 'endstream' content") + { + test_pdf_proc( + "1 0 obj\n" + "<<" + "/S /JavaScript" + "/JS 2 0 R" + ">>\n" + "endobj\n" + "2 0 obj\n" + "<>\n" + "stream\n" + "\xfe\xff\0e\0n\0d\0s\0t\0r\0e\0a\0m\n" + "endstream\n" + "endobj"s, + "endstream"s + ); + } SECTION("hexadecimal string") { test_pdf_proc( @@ -1424,6 +1791,17 @@ TEST_CASE("UTF-16, basic", "[PDFTokenizer]") PDFTokenizer::PDFRet::UNEXPECTED_SYMBOL ); } + SECTION("unfinished trailing symbol") + { + test_pdf_proc( + "1 0 obj\n" + "<<" + "/S /JavaScript" + "/JS (\xfe\xff\0f\0o\0o\xF0)" + "/JS (\xfe\xff\0b\0a\0r\xBA)"s, + "foobar"s + ); + } } TEST_CASE("UTF-16, cross-PDU", "[PDFTokenizer]") @@ -1524,4 +1902,48 @@ TEST_CASE("UTF-16, cross-PDU", "[PDFTokenizer]") } }); } + SECTION("split in stream that looks like endstream") + { + test_pdf_proc({ + { + "1 0 obj\n" + "<>\n" + "endobj\n" + "2 0 obj\n" + "<>\n" + "stream\n" + "\xfe\xff\0f\0o\0o\0e"s, + "foo"s + }, + { + "\0n\0d\n" + "endstream\n" + "endobj"s, + "end"s + } + }); + } + SECTION("split in endstream tag") + { + test_pdf_proc({ + { + "1 0 obj\n" + "<>\n" + "endobj\n" + "2 0 obj\n" + "<>\n" + 
"stream\n" + "\xfe\xff\0f\0o\0o\nend"s, + "foo"s + }, + { + "stream\n" + "endobj\n" + "3 0 obj\n" + "<>\n" + "endobj\n"s, + "foo"s + } + }); + } }