From: Oleksii Shumeiko -X (oshumeik - SOFTSERVE INC at Cisco) Date: Tue, 13 Dec 2022 18:42:24 +0000 (+0000) Subject: Pull request #3698: js_norm: add PDF stream processing X-Git-Tag: 3.1.49.0~4 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=7a61d3fb0487249a7080cf883c3356efe6ce4cf7;p=thirdparty%2Fsnort3.git Pull request #3698: js_norm: add PDF stream processing Merge in SNORT/snort3 from ~DKYRYLOV/snort3:js_pdf_stream to master Squashed commit of the following: commit e4712275b6662de60d9dca67031bf693cfcf896c Author: dkyrylov Date: Mon Dec 5 17:31:18 2022 +0200 js_norm: add PDF stream processing --- diff --git a/src/js_norm/dev_notes.txt b/src/js_norm/dev_notes.txt index 0e0a6b94c..c00432846 100644 --- a/src/js_norm/dev_notes.txt +++ b/src/js_norm/dev_notes.txt @@ -134,7 +134,11 @@ PDF parser follows "PDF 32000-1:2008 First Edition 2008-7-1 Document management Portable document format Part 1: PDF 1.7". Known limitations: * Nested dictionaries are not fully supported. Properties of the last object -are tracked. Once the nested object ends, it clears all info about the object -type. + are tracked. Once the nested object ends, it clears all info about the object + type. * Nested dictionaries are not allowed in JavaScript-type dictionary. -* Stream objects are ignored. +* JavaScript in streams is tracked only when a reference to that stream is found + earlier in that file. +* Compressed JavaScript streams are handled correctly only if PDF decompression is + enabled (http_inspect.decompress_pdf = true, and the same option for other inspectors) + diff --git a/src/js_norm/pdf_tokenizer.h b/src/js_norm/pdf_tokenizer.h index 9a31841c7..bd4c20b0b 100644 --- a/src/js_norm/pdf_tokenizer.h +++ b/src/js_norm/pdf_tokenizer.h @@ -20,11 +20,9 @@ #ifndef PDF_TOKENIZER_H #define PDF_TOKENIZER_H -#include #include #include -#include -#include +#include #include "main/snort_types.h" @@ -41,6 +39,8 @@ public: EOS = 0, NOT_NAME_IN_DICTIONARY_KEY, INCOMPLETE_ARRAY_IN_DICTIONARY, + STREAM_NO_LENGTH, + UNEXPECTED_SYMBOL, MAX }; @@ -64,6 +64,13 @@ private: PDFRet h_lit_unescape(); PDFRet h_lit_oct2chr(); PDFRet h_hex_hex2chr(); + PDFRet h_stream_open(); + PDFRet h_stream(); + bool h_stream_close(); + void h_stream_length(); + void h_ref(); + void h_ind_obj_open(); + inline void h_ind_obj_close(); struct ObjectString { @@ -98,10 +105,18 @@ private: char key[PDFTOKENIZER_NAME_MAX_SIZE] = {0}; }; + struct Stream + { + int rem_length = -1; + bool is_js = false; + }; + ObjectString obj_string; ObjectArray obj_array; ObjectDictionary obj_dictionary; DictionaryEntry obj_entry; + Stream obj_stream; + std::unordered_set js_stream_refs; }; bool PDFTokenizer::h_lit_str() @@ -124,6 +139,11 @@ bool PDFTokenizer::h_lit_close() return --obj_string.parenthesis_level == 0; } +void PDFTokenizer::h_ind_obj_close() +{ + obj_stream.is_js = false; +} + } #endif diff --git a/src/js_norm/pdf_tokenizer.l b/src/js_norm/pdf_tokenizer.l index 20d497ee6..6224c39bb 100644 --- a/src/js_norm/pdf_tokenizer.l +++ b/src/js_norm/pdf_tokenizer.l @@ -31,9 +31,7 @@ #endif -#include #include -#include #include "js_norm/js_enum.h" #include "js_norm/pdf_tokenizer.h" @@ -51,20 +49,19 @@ using namespace jsn; #define PUSH(x) yy_push_state(x) #define POP() yy_pop_state() -#define YY_USER_ACTION \ - { \ - debug_logf(5, js_trace, TRACE_PDF_PROC, nullptr, \ - "PDF pattern #%d, sc %d\n", yy_act, YY_START); \ - \ - debug_logf(5, js_trace, TRACE_PDF_DUMP, nullptr, \ - "PDF text '%s'\n", YYText()); \ +#define YY_USER_ACTION \ + { \ + debug_logf(5, js_trace, TRACE_PDF_PROC, nullptr, \ + "PDF pattern #%d, sc %d\n", yy_act, YY_START); \ + debug_logf(5, js_trace, TRACE_PDF_DUMP, nullptr, \ + "PDF text '%s'\n", YYText()); \ } -#define EXEC(f) \ - { \ - auto r = (f); \ - if (r) \ - return r; \ +#define EXEC(f) \ + { \ + auto r = (f); \ + if (r) \ + return r; \ } %} @@ -72,87 +69,109 @@ using namespace jsn; /* PDF 32000-1:2008 definitions follow */ /* 7.2.2 Character Set */ -CHARS_WHITESPACE \x00\x09\x0a\x0c\x0d\x20 -CHARS_DELIMITER \(\)\<\>\[\]\{\}\/\% -GRP_WHITESPACE [\x00\x09\x0a\x0c\x0d\x20] -GRP_DELIMITER [\(\)\<\>\[\]\{\}\/\%] -GRP_REGULAR [^\x00\x09\x0a\x0c\x0d\x20\(\)\<\>\[\]\{\}\/\%] +CHARS_WHITESPACE \x00\x09\x0a\x0c\x0d\x20 +CHARS_DELIMITER \(\)\<\>\[\]\{\}\/\% +GRP_WHITESPACE [\x00\x09\x0a\x0c\x0d\x20] +EOL_MARKER \r|\n|\r\n +GRP_NEWLINE [\x0d\x0a] +GRP_NOT_NEWLINE [^\x0d\x0a] +GRP_DELIMITER [\(\)\<\>\[\]\{\}\/\%] +GRP_REGULAR [^\x00\x09\x0a\x0c\x0d\x20\(\)\<\>\[\]\{\}\/\%] /* 7.2.3 Comments */ -COMMENT %.* +COMMENT %{GRP_NOT_NEWLINE}*{EOL_MARKER} /* 7.3.2 Boolean Objects */ -OBJ_BOOLEAN true|false +OBJ_BOOLEAN true|false /* 7.3.3 Numeric Objects */ -OBJ_INT_NUM [+-]?[0-9]{1,64} -OBJ_REL_NUM [+-]?("."?[0-9]{1,64}|[0-9]{1,64}"."?|[0-9]{1,64}"."?[0-9]{1,64}) +OBJ_INT_NUM [+-]?[0-9]{1,64} +OBJ_REL_NUM [+-]?("."?[0-9]{1,64}|[0-9]{1,64}"."?|[0-9]{1,64}"."?[0-9]{1,64}) /* 7.3.4 String Objects */ -OBJ_LIT_STR_OPEN "(" -OBJ_LIT_STR_CLOSE ")" -OBJ_HEX_STR_OPEN "<" -OBJ_HEX_STR_CLOSE ">" +OBJ_LIT_STR_OPEN "(" +OBJ_LIT_STR_CLOSE ")" +OBJ_HEX_STR_OPEN "<" +OBJ_HEX_STR_CLOSE ">" /* 7.3.4.2 Literal Strings */ -LIT_STR_ESC \\[^0-7] -LIT_STR_ESC_OCT \\[0-7]{1}|\\[0-7]{2}|\\[0-7]{3} -LIT_STR_ESC_EOL \\[\x0d\x0a]|\\\x0d\x0a -LIT_STR_EOL [\x0d\x0a]|\x0d\x0a -LIT_STR_BODY [^\\\(\)]{1,64} +LIT_STR_ESC \\[^0-7] +LIT_STR_ESC_OCT \\[0-7]{1}|\\[0-7]{2}|\\[0-7]{3} +LIT_STR_ESC_EOL \\[\x0d\x0a]|\\\x0d\x0a +LIT_STR_EOL [\x0d\x0a]|\x0d\x0a +LIT_STR_BODY [^\\\(\)]{1,64} /* 7.3.4.3 Hexadecimal Strings */ -HEX_STR_BODY [0-9A-Fa-f]{1,64} -HEX_STR_SKIP [^0-9A-Fa-f>]{1,64} +HEX_STR_BODY [0-9A-Fa-f]{1,64} +HEX_STR_SKIP [^0-9A-Fa-f>]{1,64} /* 7.3.5 Name Objects */ -OBJ_NAME \/{GRP_REGULAR}{1,256} +OBJ_NAME \/{GRP_REGULAR}{1,256} /* 7.3.6 Array Objects */ -OBJ_ARRAY_OPEN "[" -OBJ_ARRAY_CLOSE "]" +OBJ_ARRAY_OPEN "[" +OBJ_ARRAY_CLOSE "]" /* 7.3.7 Dictionary Objects */ -OBJ_DICT_OPEN "<<" -OBJ_DICT_CLOSE ">>" +OBJ_DICT_OPEN "<<" +OBJ_DICT_CLOSE ">>" -/* FIXIT: improve bytes consuming */ -OBJ_DICT_SKIP . +OBJ_DICT_SKIP .|{GRP_NEWLINE} /* 7.3.8 Stream Objects */ -OBJ_STREAM_OPEN stream$ -OBJ_STREAM_CLOSE ^endstream +OBJ_STREAM_OPEN stream\r?\n +OBJ_STREAM_CLOSE {EOL_MARKER}endstream +OBJ_STREAM_SKIP {GRP_NOT_NEWLINE}{1,64}|{GRP_NEWLINE} /* 7.3.9 Null Object */ -OBJ_NULL null +OBJ_NULL null /* 7.3.10 Indirect Objects */ -INDIRECT_OBJ {OBJ_INT_NUM}{GRP_WHITESPACE}+{OBJ_INT_NUM}{GRP_WHITESPACE}+obj -RECORD_OBJ {OBJ_INT_NUM}{GRP_WHITESPACE}+{OBJ_INT_NUM}{GRP_WHITESPACE}+R +INDIRECT_OBJ_OPEN {OBJ_INT_NUM}{GRP_WHITESPACE}+{OBJ_INT_NUM}{GRP_WHITESPACE}+obj -/* Not dictionary, not strings */ -SKIP [^<\(%]{1,64} +INDIRECT_OBJ_CLOSE endobj + +OBJ_REFERENCE {OBJ_INT_NUM}{GRP_WHITESPACE}+{OBJ_INT_NUM}{GRP_WHITESPACE}+R + + +/* Not object start, not comments */ +SKIP [^[:digit:]%]{1,64}|. WHITESPACE {GRP_WHITESPACE}{1,64} /* Start conditions: INITIAL or inside dictionary, literal string, hexadecimal string */ +%x indobj +%x stream %x dictnr %x litstr %x hexstr %x jslstr %x jshstr +%x jsstream %% {SKIP} { } {COMMENT} { } -{OBJ_DICT_OPEN} { PUSH(dictnr); EXEC(h_dict_open()) } +{INDIRECT_OBJ_OPEN} { PUSH(indobj); h_ind_obj_open(); } +{COMMENT} { } +{WHITESPACE} { } +{INDIRECT_OBJ_CLOSE} { POP(); h_ind_obj_close(); } + +{OBJ_STREAM_OPEN} { EXEC(h_stream_open()) PUSH(obj_stream.is_js ? jsstream : stream); } +{OBJ_STREAM_SKIP} { EXEC(h_stream()) } +{OBJ_STREAM_SKIP} { EXEC(h_stream()) ECHO; } +{OBJ_STREAM_CLOSE} { if (h_stream_close()) POP(); } +{OBJ_STREAM_CLOSE} { if (h_stream_close()) POP(); } + +{OBJ_DICT_OPEN} { PUSH(dictnr); EXEC(h_dict_open()) } +{OBJ_DICT_OPEN} { PUSH(dictnr); EXEC(h_dict_open()) } {OBJ_DICT_CLOSE} { POP(); EXEC(h_dict_close()) } {COMMENT} { } {WHITESPACE} { } -{RECORD_OBJ} { EXEC(h_dict_other()) } +{OBJ_REFERENCE} { EXEC(h_dict_other()) h_ref(); } {OBJ_BOOLEAN} { EXEC(h_dict_other()) } -{OBJ_INT_NUM} { EXEC(h_dict_other()) } +{OBJ_INT_NUM} { EXEC(h_dict_other()) h_stream_length(); } {OBJ_REL_NUM} { EXEC(h_dict_other()) } {OBJ_NULL} { EXEC(h_dict_other()) } {OBJ_NAME} { EXEC(h_dict_name()) } @@ -162,7 +181,7 @@ WHITESPACE {GRP_WHITESPACE}{1,64} {OBJ_HEX_STR_OPEN} { EXEC(h_dict_other()) if (h_hex_str()) PUSH(jshstr); else PUSH(hexstr); yyless(0); } {OBJ_DICT_SKIP} { } -{OBJ_LIT_STR_OPEN} { if (h_lit_open()) PUSH(litstr); } +{OBJ_LIT_STR_OPEN} { if (h_lit_open()) PUSH(litstr); } {OBJ_LIT_STR_OPEN} { h_lit_open(); } {OBJ_LIT_STR_CLOSE} { if (h_lit_close()) POP(); } {LIT_STR_ESC} { } @@ -171,7 +190,7 @@ WHITESPACE {GRP_WHITESPACE}{1,64} {LIT_STR_EOL} { } {LIT_STR_BODY} { } -{OBJ_HEX_STR_OPEN} { PUSH(hexstr); } +{OBJ_HEX_STR_OPEN} { PUSH(hexstr); } {OBJ_HEX_STR_CLOSE} { POP(); } {HEX_STR_BODY} { } {HEX_STR_SKIP} { } @@ -189,7 +208,9 @@ WHITESPACE {GRP_WHITESPACE}{1,64} {HEX_STR_BODY} { EXEC(h_hex_hex2chr()) } {HEX_STR_SKIP} { } -<> { return PDFRet::EOS; } +<*><> { return PDFRet::EOS; } + +<*>.|\n { return PDFRet::UNEXPECTED_SYMBOL; } %% @@ -228,6 +249,9 @@ PDFTokenizer::PDFRet PDFTokenizer::h_dict_other() debug_logf(6, js_trace, TRACE_PDF_PROC, nullptr, "dictionary token: other\n"); + debug_logf(6, js_trace, TRACE_PDF_DUMP, nullptr, + "dictionary entry: %s, %s\n", obj_entry.key, yytext); + obj_dictionary.key_value = !obj_dictionary.key_value; return PDFRet::EOS; @@ -320,6 +344,57 @@ PDFTokenizer::PDFRet PDFTokenizer::h_hex_hex2chr() return PDFRet::EOS; } +PDFTokenizer::PDFRet PDFTokenizer::h_stream_open() +{ + if (obj_stream.rem_length < 0) + return PDFRet::STREAM_NO_LENGTH; + + debug_logf(6, js_trace, TRACE_PDF_PROC, nullptr, + "Starting %s stream, length %d\n", obj_stream.is_js ? "JavaScript" : "skipping", obj_stream.rem_length); + + return PDFRet::EOS; +} + +PDFTokenizer::PDFRet PDFTokenizer::h_stream() +{ + obj_stream.rem_length -= yyleng; + return PDFRet::EOS; +} + +bool PDFTokenizer::h_stream_close() +{ + obj_stream.rem_length -= yyleng; + if (obj_stream.rem_length <= 0) + { + if (YY_START == jsstream) + yyout << '\n'; + return true; + } + + if (YY_START == jsstream) + ECHO; + return false; +} + +void PDFTokenizer::h_stream_length() +{ + if (!strcmp(obj_entry.key, "/Length")) + obj_stream.rem_length = atoi(yytext); +} + +void PDFTokenizer::h_ref() +{ + if (!strcmp(obj_entry.key, "/JS")) + js_stream_refs.insert(atoi(yytext)); +} + +void PDFTokenizer::h_ind_obj_open() +{ + int value = atoi(yytext); + if (js_stream_refs.count(value) > 0) + obj_stream.is_js = true; +} + PDFTokenizer::PDFTokenizer(std::istream& in, std::ostream& out) : yyFlexLexer(in, out) { @@ -332,6 +407,5 @@ PDFTokenizer::~PDFTokenizer() PDFTokenizer::PDFRet PDFTokenizer::process() { auto r = yylex(); - - return (PDFRet)r; + return static_cast(r); } diff --git a/src/js_norm/test/pdf_tokenizer_test.cc b/src/js_norm/test/pdf_tokenizer_test.cc index 13eb326ca..db1801336 100644 --- a/src/js_norm/test/pdf_tokenizer_test.cc +++ b/src/js_norm/test/pdf_tokenizer_test.cc @@ -156,13 +156,16 @@ TEST_CASE("basic", "[PDFTokenizer]") SECTION("comments") { test_pdf_proc( + "1 0 obj\n" "% comment 1\n" "<>" - "<>\n" - "(% not a comment)\n" + "<>" + "<>\n" "% comment 2\n" - "<>", - "a % b; script 2" + "<>" + "(% not a comment)\n" + "endobj\n", + "script 2; a % b; script 3" ); } SECTION("escapes in string") @@ -215,42 +218,50 @@ TEST_CASE("basic", "[PDFTokenizer]") SECTION("not name for key") { test_pdf_proc( + "1 0 obj" "<<" "/K1 /V1" "[/K2] /V2" "/K3 /V3" - ">>", + ">>" + "endobj", "", PDFTokenizer::PDFRet::NOT_NAME_IN_DICTIONARY_KEY ); } SECTION("literal string as a key") { test_pdf_proc( + "1 0 obj" "<<" "/K1 /V1" "(foo) /V2" "/K3 /V3" - ">>", + ">>" + "endobj", "", PDFTokenizer::PDFRet::NOT_NAME_IN_DICTIONARY_KEY ); } SECTION("hex string as a key") { test_pdf_proc( + "1 0 obj" "<<" "/K1 /V1" "<62617a> /V2" "/K3 /V3" - ">>", + ">>" + "endobj", "", PDFTokenizer::PDFRet::NOT_NAME_IN_DICTIONARY_KEY ); } SECTION("incomplete array") { test_pdf_proc( + "1 0 obj" "<<" "/K1 [ /V1 /V2 /V3 " - ">>", + ">>" + "endobj", "", PDFTokenizer::PDFRet::INCOMPLETE_ARRAY_IN_DICTIONARY ); } @@ -268,43 +279,43 @@ TEST_CASE("JS location", "[PDFTokenizer]") SECTION("no sub-type") { test_pdf_proc( - "<< /JS (script) >>", + "1 0 obj\n<< /JS (script) >>", "script" ); } SECTION("no sub-type checks") { test_pdf_proc( - "<< /JS (script) /S /JavaScript >>", + "1 0 obj\n<< /JS (script) /S /JavaScript >>", "script" ); } SECTION("no spaces") { test_pdf_proc( - "<>", + "1 0 obj\n<>", "script" ); } SECTION("as hex string") { test_pdf_proc( - "<< /JS <62617a> >>", + "1 0 obj\n<< /JS <62617a> >>", "baz" ); test_pdf_proc( - "<< /JS <70> >>", + "1 0 obj\n<< /JS <70> >>", "p" ); test_pdf_proc( - "<< /JS <7> >>", + "1 0 obj\n<< /JS <7> >>", "p" ); } SECTION("prepended with records") { test_pdf_proc( - "<>", + "1 0 obj\n<>", "script" ); } @@ -315,7 +326,7 @@ TEST_CASE("JS processing", "[PDFTokenizer]") SECTION("simple text") { test_pdf_proc( - "<>", "var _abc1 = 'Hello World!';" @@ -324,7 +335,7 @@ TEST_CASE("JS processing", "[PDFTokenizer]") SECTION("balanced parenthesis") { test_pdf_proc( - "<>", "function foo() { console.log(\"Hello world!\") }" @@ -333,7 +344,7 @@ TEST_CASE("JS processing", "[PDFTokenizer]") SECTION("with escapes") { test_pdf_proc( - "<>", "function bar(var x)\r{\r console.log(\"baz\")\r}" @@ -342,7 +353,7 @@ TEST_CASE("JS processing", "[PDFTokenizer]") SECTION("all escapes") { test_pdf_proc( - "<>", "() \n\r\t\b\f()\\ \123 ABC xyz" @@ -351,7 +362,7 @@ TEST_CASE("JS processing", "[PDFTokenizer]") SECTION("escaped new line") { test_pdf_proc( - "<>", "var str = 'Hello, world!';" @@ -386,14 +397,410 @@ TEST_CASE("split", "[PDFTokenizer]") { test_pdf_proc({ {"% comment", ""}, - {"\n", ""}, + {"\n1 0 obj\n", ""}, {"<>\n", "a % b"}, - {"(% not a", ""}, - {"comment)\n", ""}, + {"endobj\n2 0 obj\n(% not a", ""}, + {"comment)\nendobj\n3 0 obj\n", ""}, {"<>", ""}, - {"<>", "script 3"} + {">>\nendobj\n4 0 obj\n", ""}, + {"<>\nendobj", "script 3"} + }); + } +} + +TEST_CASE("stream object", "[PDFTokenizer]") +{ + SECTION("zero length") + { + test_pdf_proc( + "1 0 obj\n" + "<>\n" + "endobj\n" + "2 0 obj\n" + "<<" + "/Length 0" + ">>" + "stream\n" + "\n" + "endstream\n" + "endobj\n", + "\n" + ); + } + SECTION("exact length") + { + test_pdf_proc( + "1 0 obj\n" + "<>\n" + "endobj\n" + "2 0 obj\n" + "<<" + "/Length 6\n" + ">>\n" + "stream\n" + "foobar\n" + "endstream\n" + "endobj\n", + "foobar\n" + ); + } + SECTION("carriage return and line feed as EOL") + { + test_pdf_proc( + "1 0 obj\n" + "<>\n" + "endobj\n" + "2 0 obj\n" + "<<" + "/Length 3\n" + ">>" + "stream\r\n" + "bar\r\n" + "endstream\n" + "endobj\n", + "bar\n" + ); + } + SECTION("special symbols in a stream") + { + test_pdf_proc( + "1 0 obj\n" + "<>\n" + "endobj\n" + "2 0 obj\n" + "<<" + "/Length 13\n" + ">>" + "stream\n" + "\nendstream\n \r\n" + "endstream\n" + "endobj\n", + "\nendstream\n \n" + ); + } + SECTION("referenced JavaScript") + { + test_pdf_proc( + "1 0 obj\n" + "<>\n" + "endobj\n" + "2 0 obj\n" + "<<" + "/Length 9\n" + ">>" + "stream\n" + "var a = 0\n" + "endstream\n" + "endobj\n", + "var a = 0\n" + ); + } + SECTION("referenced JavaScript after another stream") + { + test_pdf_proc( + "1 0 obj\n" + "<>\n" + "endobj\n" + "3 0 obj\n" + "<>\n" + "stream\n" + " \n" + "endstream\n" + "endobj\n" + "2 0 obj\n" + "<<" + "/Length 9\n" + ">>" + "stream\n" + "var a = 0\n" + "endstream\n" + "endobj\n", + "var a = 0\n" + ); + } + SECTION("multiple revisions") + { + test_pdf_proc( + "1 0 obj\n" + "<>\n" + "endobj\n" + "2 0 obj\n" + "<>\n" + "stream\n" + "//revision 1\n\n" + "endstream\n" + "endobj\n" + "2 1 obj\n" + "<>\n" + "stream\n" + "//revision 2\n\n" + "endstream\n" + "endobj\n", + "//revision 1\n\n" + "//revision 2\n\n" + ); + } +} + +TEST_CASE("stream object malformed", "[PDFTokenizer]") +{ + SECTION("no dictionary") + { + test_pdf_proc( + "1 0 obj\n" + "<>\n" + "endobj\n" + "2 0 obj\n" + "stream\n" + "foo\n" + "endstream\n" + "endobj\n", + "", PDFTokenizer::PDFRet::STREAM_NO_LENGTH + ); + } + SECTION("a direct stream") + { + test_pdf_proc( + "1 0 obj\n" + "<>\n" + "endobj\n" + "<<" + "/Length 3" + ">>\n" + "stream\n" + "foo\n" + "endstream\n", + "", PDFTokenizer::PDFRet::EOS + ); + } + SECTION("an indirect dictionary") + { + test_pdf_proc( + "1 0 obj\n" + "<>\n" + "endobj\n" + "2 0 obj\n" + "<<" + "/Length 3" + ">>\n" + "endobj\n" + "3 0 obj\n" + "2 0 R\n" + "stream\n" + "foo\n" + "endstream\n" + "endobj\n", + "", PDFTokenizer::PDFRet::UNEXPECTED_SYMBOL + ); + } + SECTION("no length") + { + test_pdf_proc( + "1 0 obj\n" + "<>\n" + "endobj\n" + "2 0 obj\n" + "<<" + "/Creator (Acrobat Pro DC 22.1.20169)" + ">>\n" + "stream\n" + "foo\n" + "endstream\n" + "endobj\n", + "", PDFTokenizer::PDFRet::STREAM_NO_LENGTH + ); + } + SECTION("length less") + { + test_pdf_proc( + "1 0 obj\n" + "<>\n" + "endobj\n" + "2 0 obj\n" + "<<" + "/Length 2" + ">>\n" + "stream\n" + "foo\n" + "endstream\n" + "endobj\n", + "foo\n", PDFTokenizer::PDFRet::EOS + ); + } + SECTION("length greater within a few bytes") + { + test_pdf_proc( + "1 0 obj\n" + "<>\n" + "endobj\n" + "2 0 obj\n" + "<<" + "/Length 4" + ">>\n" + "stream\n" + "foo\n" + "endstream\n" + "endobj\n", + "foo\n", PDFTokenizer::PDFRet::EOS + // note that '\n' in expected is not extracted from source data. + // preprocessor does not extract exactly "/Length" bytes, and as long + // as length is greater by no more than a few bytes stream will be read + // correctly up to endstream marker. + ); + } + SECTION("length greater") + { + test_pdf_proc( + "1 0 obj\n" + "<>\n" + "endobj\n" + "2 0 obj\n" + "<<" + "/Length 100" + ">>\n" + "stream\n" + "foo\n" + "endstream\n" + "endobj\n", + "foo\n" + "endstream\n" + "endobj\n", PDFTokenizer::PDFRet::EOS + ); + } + SECTION("carriage return following the keyword stream") + { + test_pdf_proc( + "1 0 obj\n" + "<>\n" + "endobj\n" + "2 0 obj\n" + "<<" + "/Length 3" + ">>\n" + "stream\r" + "foo\r" + "endstream\n" + "endobj\n", + "", PDFTokenizer::PDFRet::UNEXPECTED_SYMBOL + ); + } + SECTION("no end-off-line marker specified") + { + test_pdf_proc( + "1 0 obj\n" + "<>\n" + "endobj\n" + "2 0 obj\n" + "<<" + "/Length 3" + ">>\n" + "stream" + "foo" + "endstream\n" + "endobj\n", + "", PDFTokenizer::PDFRet::UNEXPECTED_SYMBOL + ); + } + SECTION("no end-off-line marker in stream data") + { + test_pdf_proc( + "1 0 obj\n" + "<>\n" + "endobj\n" + "2 0 obj\n" + "<<" + "/Length 3" + ">>\n" + "stream\n" + "foo" + "endstream\n" + "endobj\n", + "fooendstream\n" + "endobj\n", PDFTokenizer::PDFRet::EOS + ); + } +} + +TEST_CASE("stream object over PDU", "[PDFTokenizer]") +{ + SECTION("split inside non-JS stream") + { + test_pdf_proc({ + { + "10 0 obj\n" + "<>\n" + "stream\n" + "foo", + "" + }, + { + "bar\n" + "endstream\n" + "endobj\n", + "" + } + }); + } + SECTION("split inside JavaScript stream") + { + test_pdf_proc({ + { + "1 0 obj\n" + "<>\n" + "endobj\n" + "10 0 obj\n" + "<>\n" + "stream\n" + "foo", + "foo" + }, + { + "bar\n" + "endstream\n" + "endobj\n", + "bar\n" + } + }); + } + SECTION("split between reference and stream obj") + { + test_pdf_proc({ + { + "1 0 obj\n" + "<>\n" + "endobj\n", + "" + }, + { + "10 0 obj\n" + "<>\n" + "stream\n" + "foobar\n" + "endstream\n" + "endobj\n", + "foobar\n" + } + }); + } + SECTION("split between dictionary and stream") + { + test_pdf_proc({ + { + "1 0 obj\n" + "<>\n" + "endobj\n" + "10 0 obj\n" + "<>\n", + "" + }, + { + "stream\n" + "foobar\n" + "endstream\n" + "endobj\n", + "foobar\n" + } }); } } diff --git a/src/service_inspectors/http_inspect/dev_notes_js_norm.txt b/src/service_inspectors/http_inspect/dev_notes_js_norm.txt index 5eb613c17..f60f51a6e 100644 --- a/src/service_inspectors/http_inspect/dev_notes_js_norm.txt +++ b/src/service_inspectors/http_inspect/dev_notes_js_norm.txt @@ -4,7 +4,7 @@ the Legacy Normalizer and the Enhanced Normalizer. In NHI, there are three JSNorm extensions: * HttpInlineJSNorm, processes content of HTML script tags. * HttpExternalJSNorm, processes payload with JavaScript MIME type. - * HttpPDFJSNorm, processes payload with PDF MIME type. + * HttpPDFJSNorm, processes payload with PDF MIME type and PDF files in MIME attachments. Normalization context is per transaction. It is created once js_data calls for normalized JS data, and is deleted once transaction ends. Partial inspections feed data incrementally to JS Normalizer, @@ -17,9 +17,9 @@ During message body analysis the Enhanced Normalizer does one of the following: 2. If it is an HTML-page, Normalizer searches for an opening tag and processes subsequent bytes in a stream mode, until it finds a closing tag. It proceeds and scans the entire message body for inline scripts. -3. If it is PDF file transferred as MIME attachment or as a message body then - Normalizer extracts strings marked with '/JS' keyword and normalizes their - content as JS text. +3. If it is a PDF file transferred as MIME attachment or as a message body then + Normalizer extracts strings assigned to '/JS' key, or streams referred from + the same value, and normalizes their content as JS text. Also, js_data IPS option's buffer is a part of NHI processing in order to start the normalization.