From: Andrii Serbeniuk -X (aserbeni - SOFTSERVE INC at Cisco) Date: Mon, 22 Jul 2024 11:52:26 +0000 (+0000) Subject: Pull request #4373: js_norm: address pdf tokenizer issues X-Git-Tag: 3.3.2.0~7 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=19c5c0cc821f869906f7e33cfba131efd89b76a4;p=thirdparty%2Fsnort3.git Pull request #4373: js_norm: address pdf tokenizer issues Merge in SNORT/snort3 from ~ASERBENI/snort3:js_pdf_misses to master Squashed commit of the following: commit 44070c0661f54ab9fc8cfdd1bb79e887bd3d9ed3 Author: Andrii Serbeniuk Date: Mon Jun 17 12:40:40 2024 +0300 js_norm: address pdf tokenizer issues - implement support for missed types of indirect objects - allow stream length to be defined with a reference - improve array nesting checks --- diff --git a/src/js_norm/pdf_tokenizer.h b/src/js_norm/pdf_tokenizer.h index 4aca27c91..6d2b4c8d7 100644 --- a/src/js_norm/pdf_tokenizer.h +++ b/src/js_norm/pdf_tokenizer.h @@ -70,6 +70,7 @@ private: PDFRet h_lit_u16_unescape(); PDFRet h_stream_open(); PDFRet h_stream(); + PDFRet h_array_nesting(); bool h_stream_close(); void h_stream_length(); void h_ref(); @@ -116,10 +117,19 @@ private: char key[PDFTOKENIZER_NAME_MAX_SIZE] = {0}; }; + struct IndirectObject + { + void clear() + { ref_met = false; } + + bool ref_met = false; + }; + struct Stream { int rem_length = -1; bool is_js = false; + bool is_ref_len = false; }; ObjectString obj_string; @@ -127,6 +137,7 @@ private: ObjectDictionary obj_dictionary; DictionaryEntry obj_entry; Stream obj_stream; + IndirectObject indirect_obj; std::unordered_set js_stream_refs; // represents UTF-16BE code point @@ -160,7 +171,9 @@ bool PDFTokenizer::h_lit_close() void PDFTokenizer::h_ind_obj_close() { + indirect_obj.clear(); obj_stream.is_js = false; + obj_stream.is_ref_len = false; } } diff --git a/src/js_norm/pdf_tokenizer.l b/src/js_norm/pdf_tokenizer.l index 68bae2847..26878b24e 100644 --- a/src/js_norm/pdf_tokenizer.l +++ b/src/js_norm/pdf_tokenizer.l @@ -37,6 +37,7 @@ #include "js_norm/pdf_tokenizer.h" #include "log/messages.h" #include "trace/trace_api.h" +#include "utils/util_cstring.h" extern THREAD_LOCAL const snort::Trace* js_trace; @@ -140,12 +141,12 @@ INDIRECT_OBJ_OPEN {OBJ_INT_NUM}{GRP_WHITESPACE}+{OBJ_INT_NUM}{GRP_WHITESPACE}+o INDIRECT_OBJ_CLOSE endobj -OBJ_REFERENCE {OBJ_INT_NUM}{GRP_WHITESPACE}+{OBJ_INT_NUM}{GRP_WHITESPACE}+R +OBJ_REFERENCE {OBJ_INT_NUM}{GRP_WHITESPACE}+{OBJ_INT_NUM}{GRP_WHITESPACE}+R /* Not object start, not comments */ -SKIP [^[:digit:]%]{1,16}|. -WHITESPACE {GRP_WHITESPACE}{1,16} +SKIP [^[:digit:]%]{1,16}|. +WHITESPACE {GRP_WHITESPACE}{1,16} /* Start conditions: structures: comment, indirect object, dictionary or array */ %x comment @@ -176,7 +177,15 @@ WHITESPACE {GRP_WHITESPACE}{1,16} {INDIRECT_OBJ_OPEN} { PUSH(indobj); h_ind_obj_open(); } {WHITESPACE} { } -{INDIRECT_OBJ_CLOSE} { POP(); h_ind_obj_close(); } +{INDIRECT_OBJ_CLOSE} { POP(); h_ind_obj_close(); EXEC(h_array_nesting()) } +{OBJ_ARRAY_OPEN} { ++obj_array.nesting_level; } +{OBJ_ARRAY_CLOSE} { --obj_array.nesting_level; } +{OBJ_REFERENCE} { indirect_obj.ref_met = true; } +{OBJ_BOOLEAN} { } +{OBJ_INT_NUM} { } +{OBJ_REL_NUM} { } +{OBJ_NULL} { } +{OBJ_NAME} { } {OBJ_STREAM_OPEN} { EXEC(h_stream_open()) PUSH(obj_stream.is_js ? u16 : stream); } {OBJ_STREAM_SKIP} { EXEC(h_stream()) } @@ -201,6 +210,7 @@ WHITESPACE {GRP_WHITESPACE}{1,16} {OBJ_LIT_STR_OPEN} { EXEC(h_dict_other()) if (h_lit_str()) PUSH(jslstr); else PUSH(litstr); yyless(0); } {OBJ_HEX_STR_OPEN} { EXEC(h_dict_other()) if (h_hex_str()) PUSH(jshstr); else PUSH(hexstr); yyless(0); } {OBJ_DICT_SKIP} { } +{INDIRECT_OBJ_CLOSE} { return PDFRet::UNEXPECTED_SYMBOL; } {OBJ_LIT_STR_OPEN} { if (h_lit_open()) PUSH(litstr); } {OBJ_LIT_STR_OPEN} { h_lit_open(); } @@ -261,12 +271,13 @@ PDFTokenizer::PDFRet PDFTokenizer::h_dict_open() PDFTokenizer::PDFRet PDFTokenizer::h_dict_close() { - obj_dictionary.clear(); - debug_logf(6, js_trace, TRACE_PDF_PROC, nullptr, "dictionary close, at array level %d\n", obj_array.nesting_level); - if (obj_dictionary.array_level != obj_array.nesting_level) + auto dict_arr_lvl = obj_dictionary.array_level; + obj_dictionary.clear(); + + if (dict_arr_lvl != obj_array.nesting_level) return PDFRet::INCOMPLETE_ARRAY_IN_DICTIONARY; return PDFRet::EOS; @@ -433,11 +444,25 @@ PDFTokenizer::PDFRet PDFTokenizer::h_lit_u16_unescape() return PDFRet::EOS; } +PDFTokenizer::PDFRet PDFTokenizer::h_array_nesting() +{ + if (obj_array.nesting_level) + return PDFRet::UNEXPECTED_SYMBOL; + else + return PDFRet::EOS; +} + PDFTokenizer::PDFRet PDFTokenizer::h_stream_open() { - if (obj_stream.rem_length < 0) + if (obj_stream.rem_length < 0 and !obj_stream.is_ref_len) return PDFRet::STREAM_NO_LENGTH; + if (indirect_obj.ref_met) + { + indirect_obj.clear(); + return PDFRet::UNEXPECTED_SYMBOL; // indirect streams must have direct dictionaries + } + debug_logf(6, js_trace, TRACE_PDF_PROC, nullptr, "Starting %s stream, length %d\n", obj_stream.is_js ? "JavaScript" : "skipping", obj_stream.rem_length); @@ -453,6 +478,7 @@ PDFTokenizer::PDFRet PDFTokenizer::h_stream() bool PDFTokenizer::h_stream_close() { obj_stream.rem_length -= yyleng; + if (obj_stream.rem_length <= 0) { if (YY_START == jsstream) @@ -462,24 +488,29 @@ bool PDFTokenizer::h_stream_close() if (YY_START == jsstream) ECHO; - return false; + return obj_stream.is_ref_len; } void PDFTokenizer::h_stream_length() { if (!strcmp(obj_entry.key, "/Length")) - obj_stream.rem_length = atoi(yytext); + obj_stream.rem_length = snort::SnortStrtol(yytext, nullptr, 10); } void PDFTokenizer::h_ref() { if (!strcmp(obj_entry.key, "/JS")) - js_stream_refs.insert(atoi(yytext)); + js_stream_refs.insert(snort::SnortStrtoul(yytext, nullptr, 10)); + else if (!strcmp(obj_entry.key, "/Length")) + { + obj_stream.is_ref_len = true; + obj_stream.rem_length = -1; + } } void PDFTokenizer::h_ind_obj_open() { - int value = atoi(yytext); + unsigned int value = snort::SnortStrtoul(yytext, nullptr, 10); if (js_stream_refs.count(value) > 0) obj_stream.is_js = true; } diff --git a/src/js_norm/test/pdf_tokenizer_test.cc b/src/js_norm/test/pdf_tokenizer_test.cc index f6b53b874..f8f8bfdc6 100644 --- a/src/js_norm/test/pdf_tokenizer_test.cc +++ b/src/js_norm/test/pdf_tokenizer_test.cc @@ -101,7 +101,7 @@ TEST_CASE("basic", "[PDFTokenizer]") "" ); } - SECTION("indirect object") + SECTION("indirect dictionary") { test_pdf_proc( "19 0 obj" @@ -112,6 +112,83 @@ TEST_CASE("basic", "[PDFTokenizer]") "" ); } + + SECTION("indirect array") + { + test_pdf_proc( + "1 0 obj" + "[" + "null 1 2 3.14 (string) << /SubDict [/Sub /Array] >> true 2 0 R" + "]" + "endobj", + "" + ); + } + + SECTION("indirect imbalanced array") + { + test_pdf_proc( + "1 0 obj" + "[" + "1 2 3\n" + "endobj", + "", PDFTokenizer::PDFRet::UNEXPECTED_SYMBOL + ); + } + + SECTION("indirect number") + { + test_pdf_proc( + "1 0 obj\n" + "1\n" + "endobj\n" + "2 0 obj\n" + "3.14\n" + "endobj", + "" + ); + } + + SECTION("indirect ref") + { + test_pdf_proc( + "1 0 obj\n" + "2 0 R\n" + "endobj", + "" + ); + } + + SECTION("indirect bool") + { + test_pdf_proc( + "1 0 obj\n" + "false\n" + "endobj\n", + "" + ); + } + + SECTION("indirect name") + { + test_pdf_proc( + "1 0 obj\n" + "/name\n" + "endobj", + "" + ); + } + + SECTION("indirect null") + { + test_pdf_proc( + "1 0 obj\n" + "null\n" + "endobj\n", + "" + ); + } + SECTION("records") { test_pdf_proc( @@ -268,26 +345,269 @@ TEST_CASE("basic", "[PDFTokenizer]") "", PDFTokenizer::PDFRet::NOT_NAME_IN_DICTIONARY_KEY ); } - SECTION("incomplete array") + SECTION("token too long") + { + test_pdf_proc( + "1"s + std::string(16 * 1024,' ') + " 0 obj" + "<< >>" + "endobj"s, + "", PDFTokenizer::PDFRet::TOKEN_TOO_LONG + ); + } +} + +TEST_CASE("brackets balancing", "[PDFTokenizer]") +{ + SECTION("imbalanced array") + { + SECTION("missing end") + { + test_pdf_proc( + "1 0 obj" + "[ 0 " + "endobj", + "", PDFTokenizer::PDFRet::UNEXPECTED_SYMBOL + ); + } + SECTION("redundant end") + { + test_pdf_proc( + "1 0 obj" + "[ 0 ]]" + "endobj", + "", PDFTokenizer::PDFRet::UNEXPECTED_SYMBOL + ); + } + } + SECTION("imbalanced dictionary") + { + SECTION("missing end") + { + test_pdf_proc( + "1 0 obj" + "<< /dict " + "endobj", + "", PDFTokenizer::PDFRet::UNEXPECTED_SYMBOL + ); + } + SECTION("redundant end") + { + test_pdf_proc( + "1 0 obj" + "<< /dict >> >>" + "endobj", + "", PDFTokenizer::PDFRet::UNEXPECTED_SYMBOL + ); + } + } + SECTION("balanced array in array") { test_pdf_proc( "1 0 obj" - "<<" - "/K1 [ /V1 /V2 /V3 " - ">>" + "[" + "[ /nested /array ]" + "]" "endobj", - "", PDFTokenizer::PDFRet::INCOMPLETE_ARRAY_IN_DICTIONARY + "" ); } - SECTION("token too long") + SECTION("imbalanced array in array") + { + SECTION("missing end") + { + test_pdf_proc( + "1 0 obj" + "[" + "[ /nested /array " + "]" + "endobj", + "", PDFTokenizer::PDFRet::UNEXPECTED_SYMBOL + ); + } + SECTION("redundant end") + { + test_pdf_proc( + "1 0 obj" + "[" + "[ /nested /array ] ]" + "]" + "endobj", + "", PDFTokenizer::PDFRet::UNEXPECTED_SYMBOL + ); + } + } + SECTION("balanced dictionary in array") { test_pdf_proc( - "1"s + std::string(16 * 1024,' ') + " 0 obj" - "<< >>" - "endobj"s, - "", PDFTokenizer::PDFRet::TOKEN_TOO_LONG + "1 0 obj" + "[" + "<< /nested /dict >>" + "]" + "endobj", + "" + ); + } + SECTION("imbalanced dictionary in array") + { + SECTION("missing end") + { + test_pdf_proc( + "1 0 obj" + "[" + "<< /nested /dict " + "]" + "endobj", + "", PDFTokenizer::PDFRet::UNEXPECTED_SYMBOL + ); + } + SECTION("redundant end") + { + test_pdf_proc( + "1 0 obj" + "[" + "<< /nested /dict >> >>" + "]" + "endobj", + "", PDFTokenizer::PDFRet::UNEXPECTED_SYMBOL + ); + } + } + SECTION("balanced array in dictionary") + { + test_pdf_proc( + "1 0 obj" + "<< /array [] >>" + "endobj", + "" + ); + } + SECTION("imbalanced array in dictionary") + { + SECTION("missing end") + { + test_pdf_proc( + "1 0 obj" + "<<" + "/K1 [ /V1 /V2 /V3 " + ">>" + "endobj", + "", PDFTokenizer::PDFRet::INCOMPLETE_ARRAY_IN_DICTIONARY + ); + } + SECTION("redundant end") + { + test_pdf_proc( + "1 0 obj" + "<<" + "/K1 [ /V1 /V2 /V3 ]]" + ">>" + "endobj", + "", PDFTokenizer::PDFRet::INCOMPLETE_ARRAY_IN_DICTIONARY + ); + } + } + SECTION("balanced strings") + { + test_pdf_proc( + "1 0 obj" + "( a string with ( parentheses ) in it )" + "endobj", + "" + ); + } + SECTION("imbalanced strings") + { + SECTION("missing end") + { + // NOTE: such syntax doesn't generate an error, because it's possible + // to have a string continuation in next PDUs. Same holds true for + // hex strings too + test_pdf_proc( + "1 0 obj" + "( a string with ( parentheses in it )" + "endobj", + "" + ); + } + SECTION("redundant end") + { + test_pdf_proc( + "1 0 obj" + "( a string with ( parentheses in it )))" + "endobj", + "", PDFTokenizer::PDFRet::UNEXPECTED_SYMBOL + ); + } + } + SECTION("balanced hex strings") + { + test_pdf_proc( + "1 0 obj" + "" + "endobj", + "" ); } + SECTION("imbalanced hex strings") + { + SECTION("missing end") + { + test_pdf_proc( + "1 0 obj" + ">" + "endobj", + "", PDFTokenizer::PDFRet::UNEXPECTED_SYMBOL + ); + } + } + SECTION("multiple tokens inter-nesting") + { + SECTION("array-array-array") + { + test_pdf_proc( + "1 0 obj" + "[ [ [ null ] ] ]" + "endobj", + "" + ); + } + SECTION("array-array-dict") + { + test_pdf_proc( + "1 0 obj" + "[ [ << /key /value >> ] ]" + "endobj", + "" + ); + } + SECTION("dict-dict-array") + { + test_pdf_proc( + "1 0 obj" + "<< /key1 << /key2 [ null ] >> >>" + "endobj", + "" + ); + } + SECTION("dict-dict-dict") + { + test_pdf_proc( + "1 0 obj" + "<< /key1 << /key2 << /key3 /val3 >> >> >>" + "endobj", + "" + ); + } + } } TEST_CASE("JS location", "[PDFTokenizer]") @@ -485,6 +805,26 @@ TEST_CASE("stream object", "[PDFTokenizer]") "bar\n" ); } + SECTION("reference as length") + { + test_pdf_proc( + "1 0 obj\n" + "<>\n" + "endobj\n" + "2 0 obj\n" + "<<" + "/Length 3 0 R" + ">>\n" + "stream\n" + "foo\n" + "endstream\n" + "endobj\n" + "3 0 obj\n" + "3\n" + "endobj\n", + "foo\n", PDFTokenizer::PDFRet::EOS + ); + } SECTION("special symbols in a stream") { test_pdf_proc(