From: Yurii Chalov -X (ychalov - SOFTSERVE INC at Cisco) Date: Fri, 27 Sep 2024 14:25:12 +0000 (+0000) Subject: Pull request #4450: js_norm: allow processing complex nested PDF objects X-Git-Tag: 3.4.0.0~15 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=e0261e2fdfc83c2f9dd085c73e1f25f34ca38c87;p=thirdparty%2Fsnort3.git Pull request #4450: js_norm: allow processing complex nested PDF objects Merge in SNORT/snort3 from ~YCHALOV/snort3:pdf_tokenizer_improve to master Squashed commit of the following: commit a8a63adb802cc2dc3fa7d3c0eb112993e1845f11 Author: Yurii Chalov Date: Mon Sep 9 17:02:21 2024 +0200 js_norm: allow processing complex nested PDF objects --- diff --git a/src/js_norm/js_config.h b/src/js_norm/js_config.h index f8bbb7714..ea4589b3a 100644 --- a/src/js_norm/js_config.h +++ b/src/js_norm/js_config.h @@ -30,6 +30,7 @@ struct JSNormConfig uint8_t max_template_nesting = 32; uint32_t max_bracket_depth = 256; uint32_t max_scope_depth = 256; + uint32_t pdf_max_dictionary_depth = 32; std::unordered_set ignored_ids; std::unordered_set ignored_props; }; diff --git a/src/js_norm/js_norm_module.cc b/src/js_norm/js_norm_module.cc index 69166e17b..7b03dddd0 100644 --- a/src/js_norm/js_norm_module.cc +++ b/src/js_norm/js_norm_module.cc @@ -68,6 +68,9 @@ const Parameter JSNormModule::params[] = { "max_scope_depth", Parameter::PT_INT, "1:65535", "256", "maximum depth of scope nesting that enhanced JavaScript normalizer will process" }, + { "pdf_max_dictionary_depth", Parameter::PT_INT, "1:65535", "32", + "maximum depth of dictionary nesting that PDF parser will process" }, + { "ident_ignore", Parameter::PT_LIST, ident_ignore_param, nullptr, "list of JavaScript ignored identifiers which will not be normalized" }, @@ -163,6 +166,10 @@ bool JSNormModule::set(const char*, Value& v, SnortConfig*) { config->ignored_props.insert(v.get_string()); } + else if (v.is("pdf_max_dictionary_depth")) + { + config->pdf_max_dictionary_depth = v.get_uint32(); + } return true; } diff --git a/src/js_norm/js_pdf_norm.h b/src/js_norm/js_pdf_norm.h index 68cdbe5c2..47b379fde 100644 --- a/src/js_norm/js_pdf_norm.h +++ b/src/js_norm/js_pdf_norm.h @@ -46,7 +46,8 @@ public: PDFJSNorm(JSNormConfig* cfg, uint32_t gen_id) : JSNorm(cfg, false, gen_id), - pdf_in(&buf_pdf_in), pdf_out(&buf_pdf_out), extractor(pdf_in, pdf_out) + pdf_in(&buf_pdf_in), pdf_out(&buf_pdf_out), + extractor(pdf_in, pdf_out, cfg ? cfg->pdf_max_dictionary_depth : 0) { } protected: diff --git a/src/js_norm/pdf_tokenizer.h b/src/js_norm/pdf_tokenizer.h index 6d2b4c8d7..bb1972955 100644 --- a/src/js_norm/pdf_tokenizer.h +++ b/src/js_norm/pdf_tokenizer.h @@ -22,6 +22,7 @@ #include #include +#include #include #include "main/snort_types.h" @@ -38,15 +39,16 @@ public: { EOS = 0, NOT_NAME_IN_DICTIONARY_KEY, - INCOMPLETE_ARRAY_IN_DICTIONARY, + INCORRECT_BRACKETS_NESTING, STREAM_NO_LENGTH, UNEXPECTED_SYMBOL, TOKEN_TOO_LONG, + DICTIONARY_NESTING_OVERFLOW, MAX }; PDFTokenizer() = delete; - explicit PDFTokenizer(std::istream& in, std::ostream& out); + explicit PDFTokenizer(std::istream& in, std::ostream& out, int dictionaries_max_size); ~PDFTokenizer() override; PDFRet process(); @@ -134,11 +136,12 @@ private: ObjectString obj_string; ObjectArray obj_array; - ObjectDictionary obj_dictionary; + std::stack dictionaries; DictionaryEntry obj_entry; Stream obj_stream; IndirectObject indirect_obj; std::unordered_set js_stream_refs; + unsigned dictionaries_max_size; // represents UTF-16BE code point struct @@ -151,12 +154,12 @@ private: bool PDFTokenizer::h_lit_str() { - return obj_dictionary.array_level == obj_array.nesting_level and !strcmp(obj_entry.key, "/JS"); + return dictionaries.top().array_level == obj_array.nesting_level and !strcmp(obj_entry.key, "/JS"); } bool PDFTokenizer::h_hex_str() { - return obj_dictionary.array_level == obj_array.nesting_level and !strcmp(obj_entry.key, "/JS"); + return dictionaries.top().array_level == obj_array.nesting_level and !strcmp(obj_entry.key, "/JS"); } bool PDFTokenizer::h_lit_open() diff --git a/src/js_norm/pdf_tokenizer.l b/src/js_norm/pdf_tokenizer.l index 26878b24e..ee2b34cda 100644 --- a/src/js_norm/pdf_tokenizer.l +++ b/src/js_norm/pdf_tokenizer.l @@ -59,11 +59,12 @@ using namespace jsn; } #define EXEC(f) \ + do \ { \ auto r = (f); \ if (r) \ return r; \ - } + } while (0) %} @@ -122,6 +123,8 @@ OBJ_NAME \/{GRP_REGULAR}{1,256} OBJ_ARRAY_OPEN "[" OBJ_ARRAY_CLOSE "]" +OBJ_ARRAY_SKIP .|{GRP_NEWLINE} + /* 7.3.7 Dictionary Objects */ OBJ_DICT_OPEN "<<" OBJ_DICT_CLOSE ">>" @@ -152,6 +155,7 @@ WHITESPACE {GRP_WHITESPACE}{1,16} %x comment %x indobj %x dictnr +%x array /* Start conditions: literals: regular, hexadecimal, stream */ %x litstr @@ -171,15 +175,14 @@ WHITESPACE {GRP_WHITESPACE}{1,16} %% -{COMMENT_START} { PUSH(comment); } +{COMMENT_START} { PUSH(comment); } {COMMENT_CONTENT} { } {COMMENT_END} { POP(); } {INDIRECT_OBJ_OPEN} { PUSH(indobj); h_ind_obj_open(); } {WHITESPACE} { } -{INDIRECT_OBJ_CLOSE} { POP(); h_ind_obj_close(); EXEC(h_array_nesting()) } -{OBJ_ARRAY_OPEN} { ++obj_array.nesting_level; } -{OBJ_ARRAY_CLOSE} { --obj_array.nesting_level; } +{INDIRECT_OBJ_CLOSE} { POP(); h_ind_obj_close(); EXEC(h_array_nesting()); } +{OBJ_ARRAY_OPEN} { PUSH(array); ++obj_array.nesting_level; } {OBJ_REFERENCE} { indirect_obj.ref_met = true; } {OBJ_BOOLEAN} { } {OBJ_INT_NUM} { } @@ -187,28 +190,44 @@ WHITESPACE {GRP_WHITESPACE}{1,16} {OBJ_NULL} { } {OBJ_NAME} { } -{OBJ_STREAM_OPEN} { EXEC(h_stream_open()) PUSH(obj_stream.is_js ? u16 : stream); } -{OBJ_STREAM_SKIP} { EXEC(h_stream()) } -{OBJ_STREAM_SKIP} { EXEC(h_stream()) ECHO; } -{OBJ_STREAM_SKIP} { EXEC(h_stream()) EXEC(h_lit_u16()) } +{WHITESPACE} { } +{OBJ_ARRAY_OPEN} { PUSH(array); ++obj_array.nesting_level; } +{OBJ_ARRAY_CLOSE} { POP(); --obj_array.nesting_level; if (YY_START == dictnr) EXEC(h_dict_other()); } +{OBJ_REFERENCE} { indirect_obj.ref_met = true; } +{OBJ_BOOLEAN} { } +{OBJ_INT_NUM} { } +{OBJ_REL_NUM} { } +{OBJ_NULL} { } +{OBJ_NAME} { } +{OBJ_LIT_STR_OPEN} { if (h_lit_open()) PUSH(litstr); } +{OBJ_HEX_STR_OPEN} { PUSH(hexstr); } +{OBJ_ARRAY_SKIP} { } +{INDIRECT_OBJ_CLOSE} { return PDFRet::UNEXPECTED_SYMBOL; } + +{OBJ_STREAM_OPEN} { EXEC(h_stream_open()); PUSH(obj_stream.is_js ? u16 : stream); } +{OBJ_STREAM_SKIP} { EXEC(h_stream()); } +{OBJ_STREAM_SKIP} { EXEC(h_stream()); ECHO; } +{OBJ_STREAM_SKIP} { EXEC(h_stream()); EXEC(h_lit_u16()); } {OBJ_STREAM_CLOSE} { if (h_stream_close()) POP(); } {OBJ_STREAM_CLOSE} { if (h_stream_close()) POP(); } {OBJ_STREAM_CLOSE} { if (h_stream_close()) POP(); } -{OBJ_DICT_OPEN} { PUSH(dictnr); EXEC(h_dict_open()) } -{OBJ_DICT_OPEN} { PUSH(dictnr); EXEC(h_dict_open()) } -{OBJ_DICT_CLOSE} { POP(); EXEC(h_dict_close()) } +{OBJ_DICT_OPEN} { PUSH(dictnr); EXEC(h_dict_open()); } +{OBJ_DICT_OPEN} { PUSH(dictnr); EXEC(h_dict_open()); } +{OBJ_DICT_OPEN} { PUSH(dictnr); EXEC(h_dict_open()); } +{OBJ_DICT_CLOSE} { return PDFRet::INCORRECT_BRACKETS_NESTING; } +{OBJ_DICT_CLOSE} { POP(); EXEC(h_dict_close()); } {WHITESPACE} { } -{OBJ_REFERENCE} { EXEC(h_dict_other()) h_ref(); } -{OBJ_BOOLEAN} { EXEC(h_dict_other()) } -{OBJ_INT_NUM} { EXEC(h_dict_other()) h_stream_length(); } -{OBJ_REL_NUM} { EXEC(h_dict_other()) } -{OBJ_NULL} { EXEC(h_dict_other()) } -{OBJ_NAME} { EXEC(h_dict_name()) } -{OBJ_ARRAY_OPEN} { ++obj_array.nesting_level; EXEC(h_dict_other()) } -{OBJ_ARRAY_CLOSE} { --obj_array.nesting_level; EXEC(h_dict_other()) } -{OBJ_LIT_STR_OPEN} { EXEC(h_dict_other()) if (h_lit_str()) PUSH(jslstr); else PUSH(litstr); yyless(0); } -{OBJ_HEX_STR_OPEN} { EXEC(h_dict_other()) if (h_hex_str()) PUSH(jshstr); else PUSH(hexstr); yyless(0); } +{OBJ_REFERENCE} { EXEC(h_dict_other()); h_ref(); } +{OBJ_BOOLEAN} { EXEC(h_dict_other()); } +{OBJ_INT_NUM} { EXEC(h_dict_other()); h_stream_length(); } +{OBJ_REL_NUM} { EXEC(h_dict_other()); } +{OBJ_NULL} { EXEC(h_dict_other()); } +{OBJ_NAME} { EXEC(h_dict_name()); } +{OBJ_ARRAY_OPEN} { PUSH(array); ++obj_array.nesting_level; EXEC(h_dict_other()); } +{OBJ_ARRAY_CLOSE} { return PDFRet::INCORRECT_BRACKETS_NESTING; } +{OBJ_LIT_STR_OPEN} { EXEC(h_dict_other()); if (h_lit_str()) PUSH(jslstr); else PUSH(litstr); yyless(0); } +{OBJ_HEX_STR_OPEN} { EXEC(h_dict_other()); if (h_hex_str()) PUSH(jshstr); else PUSH(hexstr); yyless(0); } {OBJ_DICT_SKIP} { } {INDIRECT_OBJ_CLOSE} { return PDFRet::UNEXPECTED_SYMBOL; } @@ -228,8 +247,8 @@ WHITESPACE {GRP_WHITESPACE}{1,16} {OBJ_LIT_STR_OPEN} { if (!h_lit_open()) ECHO; else PUSH(u16); } {OBJ_LIT_STR_CLOSE} { if (h_lit_close()) POP(); else ECHO; } -{LIT_STR_ESC} { EXEC(h_lit_unescape()) } -{LIT_STR_ESC_OCT} { EXEC(h_lit_oct2chr()) } +{LIT_STR_ESC} { EXEC(h_lit_unescape()); } +{LIT_STR_ESC_OCT} { EXEC(h_lit_oct2chr()); } {LIT_STR_ESC_EOL}{WHITESPACE} { } {LIT_STR_EOL} { ECHO; } {LIT_STR_BODY} { ECHO; } @@ -239,16 +258,16 @@ WHITESPACE {GRP_WHITESPACE}{1,16} {OBJ_LIT_STR_CLOSE} { if (h_lit_close()) POP(); } {LIT_STR_ESC_EOL} { } -{LIT_STR_U16_UNESC} { EXEC(h_lit_u16_unescape()) } -{LIT_STR_U16_BODY} { EXEC(h_lit_u16()) } +{LIT_STR_U16_UNESC} { EXEC(h_lit_u16_unescape()); } +{LIT_STR_U16_BODY} { EXEC(h_lit_u16()); } {U16_BOM_HEX} { h_u16_hex_start(); } .|\n { h_u16_hex_break(); } {OBJ_HEX_STR_OPEN} { PUSH(u16hex); } {OBJ_HEX_STR_CLOSE} { POP(); } -{HEX_STR_BODY} { EXEC(h_hex_hex2chr()) } -{HEX_STR_BODY} { EXEC(h_hex_hex2chr_u16()) } +{HEX_STR_BODY} { EXEC(h_hex_hex2chr()); } +{HEX_STR_BODY} { EXEC(h_hex_hex2chr_u16()); } {HEX_STR_SKIP} { } <*><> { return PDFRet::EOS; } @@ -260,8 +279,11 @@ WHITESPACE {GRP_WHITESPACE}{1,16} PDFTokenizer::PDFRet PDFTokenizer::h_dict_open() { - obj_dictionary.clear(); - obj_dictionary.array_level = obj_array.nesting_level; + if (dictionaries.size() > dictionaries_max_size) + return PDFRet::DICTIONARY_NESTING_OVERFLOW; + dictionaries.push(ObjectDictionary()); + dictionaries.top().clear(); + dictionaries.top().array_level = obj_array.nesting_level; debug_logf(6, js_trace, TRACE_PDF_PROC, nullptr, "dictionary open, at array level %d\n", obj_array.nesting_level); @@ -274,21 +296,25 @@ PDFTokenizer::PDFRet PDFTokenizer::h_dict_close() debug_logf(6, js_trace, TRACE_PDF_PROC, nullptr, "dictionary close, at array level %d\n", obj_array.nesting_level); - auto dict_arr_lvl = obj_dictionary.array_level; - obj_dictionary.clear(); + auto dict_arr_lvl = dictionaries.top().array_level; if (dict_arr_lvl != obj_array.nesting_level) - return PDFRet::INCOMPLETE_ARRAY_IN_DICTIONARY; + return PDFRet::INCORRECT_BRACKETS_NESTING; + + dictionaries.pop(); + + if (YY_START == dictnr) + dictionaries.top().key_value = true; return PDFRet::EOS; } PDFTokenizer::PDFRet PDFTokenizer::h_dict_other() { - if (obj_dictionary.array_level != obj_array.nesting_level) + if (dictionaries.top().array_level != obj_array.nesting_level) return PDFRet::EOS; - if (obj_dictionary.key_value) + if (dictionaries.top().key_value) return PDFRet::NOT_NAME_IN_DICTIONARY_KEY; debug_logf(6, js_trace, TRACE_PDF_PROC, nullptr, @@ -297,26 +323,26 @@ PDFTokenizer::PDFRet PDFTokenizer::h_dict_other() debug_logf(6, js_trace, TRACE_PDF_DUMP, nullptr, "dictionary entry: %s, %s\n", obj_entry.key, yytext); - obj_dictionary.key_value = !obj_dictionary.key_value; + dictionaries.top().key_value = true; return PDFRet::EOS; } PDFTokenizer::PDFRet PDFTokenizer::h_dict_name() { - if (obj_dictionary.array_level != obj_array.nesting_level) + if (dictionaries.top().array_level != obj_array.nesting_level) return PDFRet::EOS; - if (obj_dictionary.key_value) + if (dictionaries.top().key_value) strncpy(obj_entry.key, yytext, sizeof(obj_entry.key) - 1); - obj_dictionary.key_value = !obj_dictionary.key_value; + dictionaries.top().key_value = !dictionaries.top().key_value; debug_logf(6, js_trace, TRACE_PDF_PROC, nullptr, - "dictionary token: name as %s\n", obj_dictionary.key_value ? "value" : "key"); + "dictionary token: name as %s\n", dictionaries.top().key_value ? "value" : "key"); debug_logf(6, js_trace, TRACE_PDF_DUMP, nullptr, - "dictionary entry: %s, %s\n", obj_entry.key, obj_dictionary.key_value ? yytext : "..."); + "dictionary entry: %s, %s\n", obj_entry.key, dictionaries.top().key_value ? yytext : "..."); return PDFRet::EOS; } @@ -397,7 +423,7 @@ PDFTokenizer::PDFRet PDFTokenizer::h_hex_hex2chr_u16() { unsigned v; sscanf(ptr, "%02x", &v); - EXEC(u16_eval((uint8_t)v)) + EXEC(u16_eval((uint8_t)v)); ptr += 2; } @@ -405,7 +431,7 @@ PDFTokenizer::PDFRet PDFTokenizer::h_hex_hex2chr_u16() { unsigned v; sscanf(ptr, "%01x", &v); - EXEC(u16_eval((uint8_t)(v << 4))) + EXEC(u16_eval((uint8_t)(v << 4))); } debug_logf(6, js_trace, TRACE_PDF_DUMP, nullptr, @@ -421,7 +447,7 @@ PDFTokenizer::PDFRet PDFTokenizer::h_lit_u16() while (ptr < end) { - EXEC(u16_eval(*ptr)) + EXEC(u16_eval(*ptr)); ++ptr; } @@ -436,7 +462,7 @@ PDFTokenizer::PDFRet PDFTokenizer::h_lit_u16_unescape() assert(yyleng == 2); // the reverse solidus behaves as a split point in this case and should be removed - EXEC(u16_eval(literal_unescape(yytext[1]))) + EXEC(u16_eval(literal_unescape(yytext[1]))); debug_logf(6, js_trace, TRACE_PDF_DUMP, nullptr, "string, in UTF-16BE, escaped: %s\n", yytext); @@ -447,7 +473,7 @@ PDFTokenizer::PDFRet PDFTokenizer::h_lit_u16_unescape() PDFTokenizer::PDFRet PDFTokenizer::h_array_nesting() { if (obj_array.nesting_level) - return PDFRet::UNEXPECTED_SYMBOL; + return PDFRet::INCORRECT_BRACKETS_NESTING; else return PDFRet::EOS; } @@ -645,9 +671,10 @@ void PDFTokenizer::u16_to_u8(uint32_t code) yyout << out; } -PDFTokenizer::PDFTokenizer(std::istream& in, std::ostream& out) - : yyFlexLexer(in, out) +PDFTokenizer::PDFTokenizer(std::istream& in, std::ostream& out, int dictionaries_max_size) + : yyFlexLexer(in, out), dictionaries_max_size(dictionaries_max_size) { + dictionaries.push(ObjectDictionary()); } PDFTokenizer::~PDFTokenizer() diff --git a/src/js_norm/test/pdf_tokenizer_test.cc b/src/js_norm/test/pdf_tokenizer_test.cc index f8f8bfdc6..8b22fe7fa 100644 --- a/src/js_norm/test/pdf_tokenizer_test.cc +++ b/src/js_norm/test/pdf_tokenizer_test.cc @@ -33,6 +33,7 @@ using namespace jsn; using namespace std; using namespace std::string_literals; +static constexpr int nesting_level = 10; typedef pair Chunk; static void test_pdf_proc(const string& source, const string& expected, @@ -40,7 +41,7 @@ static void test_pdf_proc(const string& source, const string& expected, { istringstream in(source); ostringstream out; - PDFTokenizer extractor(in, out); + PDFTokenizer extractor(in, out, nesting_level); auto r = extractor.process(); @@ -52,7 +53,7 @@ static void test_pdf_proc(const vector& chunks) { istringstream in; ostringstream out; - PDFTokenizer extractor(in, out); + PDFTokenizer extractor(in, out, nesting_level); for (const auto& chunk : chunks) { @@ -273,6 +274,15 @@ TEST_CASE("basic", "[PDFTokenizer]") "" ); } + SECTION("hex string in array") + { + test_pdf_proc( + "1 0 obj\n" + "[ <0001020304 05> ] \n" + "endobj\n", + "" + ); + } SECTION("key after literal string") { test_pdf_proc( @@ -354,6 +364,15 @@ TEST_CASE("basic", "[PDFTokenizer]") "", PDFTokenizer::PDFRet::TOKEN_TOO_LONG ); } + SECTION("dictionary nesting overflow") + { + test_pdf_proc( + "1 0 obj" + "<< << << << << << << << << << << << << >> >> >> >> >> >> >> >> >> >> >> >> >>" + "endobj", + "", PDFTokenizer::PDFRet::DICTIONARY_NESTING_OVERFLOW + ); + } } TEST_CASE("brackets balancing", "[PDFTokenizer]") @@ -457,7 +476,7 @@ TEST_CASE("brackets balancing", "[PDFTokenizer]") "<< /nested /dict " "]" "endobj", - "", PDFTokenizer::PDFRet::UNEXPECTED_SYMBOL + "", PDFTokenizer::PDFRet::INCORRECT_BRACKETS_NESTING ); } SECTION("redundant end") @@ -468,7 +487,7 @@ TEST_CASE("brackets balancing", "[PDFTokenizer]") "<< /nested /dict >> >>" "]" "endobj", - "", PDFTokenizer::PDFRet::UNEXPECTED_SYMBOL + "", PDFTokenizer::PDFRet::INCORRECT_BRACKETS_NESTING ); } } @@ -491,7 +510,7 @@ TEST_CASE("brackets balancing", "[PDFTokenizer]") "/K1 [ /V1 /V2 /V3 " ">>" "endobj", - "", PDFTokenizer::PDFRet::INCOMPLETE_ARRAY_IN_DICTIONARY + "", PDFTokenizer::PDFRet::INCORRECT_BRACKETS_NESTING ); } SECTION("redundant end") @@ -502,7 +521,7 @@ TEST_CASE("brackets balancing", "[PDFTokenizer]") "/K1 [ /V1 /V2 /V3 ]]" ">>" "endobj", - "", PDFTokenizer::PDFRet::INCOMPLETE_ARRAY_IN_DICTIONARY + "", PDFTokenizer::PDFRet::INCORRECT_BRACKETS_NESTING ); } } @@ -571,6 +590,51 @@ TEST_CASE("brackets balancing", "[PDFTokenizer]") } SECTION("multiple tokens inter-nesting") { + SECTION("array-dict-array") + { + test_pdf_proc( + "1 0 obj" + "[ << /key [] >> ]" + "endobj", + "" + ); + } + SECTION("array-dict-dict") + { + test_pdf_proc( + "1 0 obj" + "[ << /key << /key2 null >> >> ]" + "endobj", + "" + ); + } + SECTION("dict-array-array") + { + test_pdf_proc( + "1 0 obj" + "<< /key [ [ null ] ] >>" + "endobj", + "" + ); + } + SECTION("dict-array-dict") + { + test_pdf_proc( + "1 0 obj" + "<< /key [ << /key2 /value >> ] >>" + "endobj", + "" + ); + } + SECTION("complex-dict-array-nesting") + { + test_pdf_proc( + "1 0 obj" + "<< /key /value /key [ << /key2 /value >> << /key [ [ << /key [ << /key /value >> ] >> ] ] >> ] >>" + "endobj", + "" + ); + } SECTION("array-array-array") { test_pdf_proc(