git.ipfire.org Git - thirdparty/snort3.git/commitdiff
Pull request #4373: js_norm: address pdf tokenizer issues
author Andrii Serbeniuk -X (aserbeni - SOFTSERVE INC at Cisco) <aserbeni@cisco.com>
Mon, 22 Jul 2024 11:52:26 +0000 (11:52 +0000)
committer Oleksii Shumeiko -X (oshumeik - SOFTSERVE INC at Cisco) <oshumeik@cisco.com>
Mon, 22 Jul 2024 11:52:26 +0000 (11:52 +0000)
Merge in SNORT/snort3 from ~ASERBENI/snort3:js_pdf_misses to master

Squashed commit of the following:

commit 44070c0661f54ab9fc8cfdd1bb79e887bd3d9ed3
Author: Andrii Serbeniuk <aserbeni@cisco.com>
Date:   Mon Jun 17 12:40:40 2024 +0300

    js_norm: address pdf tokenizer issues

    - implement support for missed types of indirect objects
    - allow stream length to be defined with a reference
    - improve array nesting checks

src/js_norm/pdf_tokenizer.h
src/js_norm/pdf_tokenizer.l
src/js_norm/test/pdf_tokenizer_test.cc

index 4aca27c914e554a2a3533deb9c368a6089bd361c..6d2b4c8d70cc055f3f64e33b1488d0a6a6117cd5 100644 (file)
@@ -70,6 +70,7 @@ private:
     PDFRet h_lit_u16_unescape();
     PDFRet h_stream_open();
     PDFRet h_stream();
+    PDFRet h_array_nesting();
     bool h_stream_close();
     void h_stream_length();
     void h_ref();
@@ -116,10 +117,19 @@ private:
         char key[PDFTOKENIZER_NAME_MAX_SIZE] = {0};
     };
 
+    struct IndirectObject
+    {
+        void clear()
+        { ref_met = false; }
+
+        bool ref_met = false;
+    };
+
     struct Stream
     {
         int rem_length = -1;
         bool is_js = false;
+        bool is_ref_len = false;
     };
 
     ObjectString obj_string;
@@ -127,6 +137,7 @@ private:
     ObjectDictionary obj_dictionary;
     DictionaryEntry obj_entry;
     Stream obj_stream;
+    IndirectObject indirect_obj;
     std::unordered_set<unsigned int> js_stream_refs;
 
     // represents UTF-16BE code point
@@ -160,7 +171,9 @@ bool PDFTokenizer::h_lit_close()
 
 void PDFTokenizer::h_ind_obj_close()
 {
+    indirect_obj.clear();
     obj_stream.is_js = false;
+    obj_stream.is_ref_len = false;
 }
 
 }
index 68bae284728606341f8d4ad663abde779c7efb51..26878b24e3e028fb6878c9f6f8152b89f0f948a5 100644 (file)
@@ -37,6 +37,7 @@
 #include "js_norm/pdf_tokenizer.h"
 #include "log/messages.h"
 #include "trace/trace_api.h"
+#include "utils/util_cstring.h"
 
 extern THREAD_LOCAL const snort::Trace* js_trace;
 
@@ -140,12 +141,12 @@ INDIRECT_OBJ_OPEN  {OBJ_INT_NUM}{GRP_WHITESPACE}+{OBJ_INT_NUM}{GRP_WHITESPACE}+o
 
 INDIRECT_OBJ_CLOSE endobj
 
-OBJ_REFERENCE        {OBJ_INT_NUM}{GRP_WHITESPACE}+{OBJ_INT_NUM}{GRP_WHITESPACE}+R
+OBJ_REFERENCE      {OBJ_INT_NUM}{GRP_WHITESPACE}+{OBJ_INT_NUM}{GRP_WHITESPACE}+R
 
 
 /* Not object start, not comments */
-SKIP              [^[:digit:]%]{1,16}|.
-WHITESPACE        {GRP_WHITESPACE}{1,16}
+SKIP               [^[:digit:]%]{1,16}|.
+WHITESPACE         {GRP_WHITESPACE}{1,16}
 
 /* Start conditions: structures: comment, indirect object, dictionary or array */
 %x comment
@@ -176,7 +177,15 @@ WHITESPACE        {GRP_WHITESPACE}{1,16}
 
 <INITIAL>{INDIRECT_OBJ_OPEN}                      { PUSH(indobj); h_ind_obj_open(); }
 <indobj>{WHITESPACE}                              { }
-<indobj>{INDIRECT_OBJ_CLOSE}                      { POP(); h_ind_obj_close(); }
+<indobj>{INDIRECT_OBJ_CLOSE}                      { POP(); h_ind_obj_close(); EXEC(h_array_nesting()) }
+<indobj>{OBJ_ARRAY_OPEN}                          { ++obj_array.nesting_level; }
+<indobj>{OBJ_ARRAY_CLOSE}                         { --obj_array.nesting_level; }
+<indobj>{OBJ_REFERENCE}                           { indirect_obj.ref_met = true; }
+<indobj>{OBJ_BOOLEAN}                             { }
+<indobj>{OBJ_INT_NUM}                             { }
+<indobj>{OBJ_REL_NUM}                             { }
+<indobj>{OBJ_NULL}                                { }
+<indobj>{OBJ_NAME}                                { }
 
 <indobj>{OBJ_STREAM_OPEN}                         { EXEC(h_stream_open()) PUSH(obj_stream.is_js ? u16 : stream); }
 <stream>{OBJ_STREAM_SKIP}                         { EXEC(h_stream()) }
@@ -201,6 +210,7 @@ WHITESPACE        {GRP_WHITESPACE}{1,16}
 <dictnr>{OBJ_LIT_STR_OPEN}                        { EXEC(h_dict_other()) if (h_lit_str()) PUSH(jslstr); else PUSH(litstr); yyless(0); }
 <dictnr>{OBJ_HEX_STR_OPEN}                        { EXEC(h_dict_other()) if (h_hex_str()) PUSH(jshstr); else PUSH(hexstr); yyless(0); }
 <dictnr>{OBJ_DICT_SKIP}                           { }
+<dictnr>{INDIRECT_OBJ_CLOSE}                      { return PDFRet::UNEXPECTED_SYMBOL; }
 
 <indobj>{OBJ_LIT_STR_OPEN}                        { if (h_lit_open()) PUSH(litstr); }
 <litstr>{OBJ_LIT_STR_OPEN}                        { h_lit_open(); }
@@ -261,12 +271,13 @@ PDFTokenizer::PDFRet PDFTokenizer::h_dict_open()
 
 PDFTokenizer::PDFRet PDFTokenizer::h_dict_close()
 {
-    obj_dictionary.clear();
-
     debug_logf(6, js_trace, TRACE_PDF_PROC, nullptr,
         "dictionary close, at array level %d\n", obj_array.nesting_level);
 
-    if (obj_dictionary.array_level != obj_array.nesting_level)
+    auto dict_arr_lvl = obj_dictionary.array_level;
+    obj_dictionary.clear();
+
+    if (dict_arr_lvl != obj_array.nesting_level)
         return PDFRet::INCOMPLETE_ARRAY_IN_DICTIONARY;
 
     return PDFRet::EOS;
@@ -433,11 +444,25 @@ PDFTokenizer::PDFRet PDFTokenizer::h_lit_u16_unescape()
     return PDFRet::EOS;
 }
 
+PDFTokenizer::PDFRet PDFTokenizer::h_array_nesting()
+{
+    if (obj_array.nesting_level)
+        return PDFRet::UNEXPECTED_SYMBOL;
+    else
+        return PDFRet::EOS;
+}
+
 PDFTokenizer::PDFRet PDFTokenizer::h_stream_open()
 {
-    if (obj_stream.rem_length < 0)
+    if (obj_stream.rem_length < 0 and !obj_stream.is_ref_len)
         return PDFRet::STREAM_NO_LENGTH;
 
+    if (indirect_obj.ref_met)
+    {
+        indirect_obj.clear();
+        return PDFRet::UNEXPECTED_SYMBOL; // indirect streams must have direct dictionaries
+    }
+
     debug_logf(6, js_trace, TRACE_PDF_PROC, nullptr,
         "Starting %s stream, length %d\n", obj_stream.is_js ? "JavaScript" : "skipping", obj_stream.rem_length);
 
@@ -453,6 +478,7 @@ PDFTokenizer::PDFRet PDFTokenizer::h_stream()
 bool PDFTokenizer::h_stream_close()
 {
     obj_stream.rem_length -= yyleng;
+
     if (obj_stream.rem_length <= 0)
     {
         if (YY_START == jsstream)
@@ -462,24 +488,29 @@ bool PDFTokenizer::h_stream_close()
 
     if (YY_START == jsstream)
         ECHO;
-    return false;
+    return obj_stream.is_ref_len;
 }
 
 void PDFTokenizer::h_stream_length()
 {
     if (!strcmp(obj_entry.key, "/Length"))
-        obj_stream.rem_length = atoi(yytext);
+        obj_stream.rem_length = snort::SnortStrtol(yytext, nullptr, 10);
 }
 
 void PDFTokenizer::h_ref()
 {
     if (!strcmp(obj_entry.key, "/JS"))
-        js_stream_refs.insert(atoi(yytext));
+        js_stream_refs.insert(snort::SnortStrtoul(yytext, nullptr, 10));
+    else if (!strcmp(obj_entry.key, "/Length"))
+    {
+        obj_stream.is_ref_len = true;
+        obj_stream.rem_length = -1;
+    }
 }
 
 void PDFTokenizer::h_ind_obj_open()
 {
-    int value = atoi(yytext);
+    unsigned int value = snort::SnortStrtoul(yytext, nullptr, 10);
     if (js_stream_refs.count(value) > 0)
         obj_stream.is_js = true;
 }
index f6b53b874bd4cf0a9db2fc5149647f2da2e6eb9c..f8f8bfdc6abcd2cac1e7eb0813f020550bca3b51 100644 (file)
@@ -101,7 +101,7 @@ TEST_CASE("basic", "[PDFTokenizer]")
             ""
         );
     }
-    SECTION("indirect object")
+    SECTION("indirect dictionary")
     {
         test_pdf_proc(
             "19 0 obj"
@@ -112,6 +112,83 @@ TEST_CASE("basic", "[PDFTokenizer]")
             ""
         );
     }
+
+    SECTION("indirect array")
+    {
+        test_pdf_proc(
+            "1 0 obj"
+            "["
+            "null 1 2 3.14 (string) << /SubDict [/Sub /Array] >> true 2 0 R"
+            "]"
+            "endobj",
+            ""
+        );
+    }
+
+    SECTION("indirect imbalanced array")
+    {
+        test_pdf_proc(
+            "1 0 obj"
+            "["
+            "1 2 3\n"
+            "endobj",
+            "", PDFTokenizer::PDFRet::UNEXPECTED_SYMBOL
+        );
+    }
+
+    SECTION("indirect number")
+    {
+        test_pdf_proc(
+            "1 0 obj\n"
+            "1\n"
+            "endobj\n"
+            "2 0 obj\n"
+            "3.14\n"
+            "endobj",
+            ""
+        );
+    }
+
+    SECTION("indirect ref")
+    {
+        test_pdf_proc(
+            "1 0 obj\n"
+            "2 0 R\n"
+            "endobj",
+            ""
+        );
+    }
+
+    SECTION("indirect bool")
+    {
+        test_pdf_proc(
+            "1 0 obj\n"
+            "false\n"
+            "endobj\n",
+            ""
+        );
+    }
+
+    SECTION("indirect name")
+    {
+        test_pdf_proc(
+            "1 0 obj\n"
+            "/name\n"
+            "endobj",
+            ""
+        );
+    }
+
+     SECTION("indirect null")
+    {
+        test_pdf_proc(
+            "1 0 obj\n"
+            "null\n"
+            "endobj\n",
+            ""
+        );
+    }
+
     SECTION("records")
     {
         test_pdf_proc(
@@ -268,26 +345,269 @@ TEST_CASE("basic", "[PDFTokenizer]")
             "",  PDFTokenizer::PDFRet::NOT_NAME_IN_DICTIONARY_KEY
         );
     }
-    SECTION("incomplete array")
+    SECTION("token too long")
+    {
+        test_pdf_proc(
+            "1"s + std::string(16 * 1024,' ') + " 0 obj"
+            "<< >>"
+            "endobj"s,
+            "",  PDFTokenizer::PDFRet::TOKEN_TOO_LONG
+        );
+    }
+}
+
+TEST_CASE("brackets balancing", "[PDFTokenizer]")
+{
+    SECTION("imbalanced array")
+    {
+        SECTION("missing end")
+        {
+            test_pdf_proc(
+                "1 0 obj"
+                "[ 0 "
+                "endobj",
+                "", PDFTokenizer::PDFRet::UNEXPECTED_SYMBOL
+            );
+        }
+        SECTION("redundant end")
+        {
+            test_pdf_proc(
+                "1 0 obj"
+                "[ 0 ]]"
+                "endobj",
+                "", PDFTokenizer::PDFRet::UNEXPECTED_SYMBOL
+            );
+        }
+    }
+    SECTION("imbalanced dictionary")
+    {
+        SECTION("missing end")
+        {
+            test_pdf_proc(
+                "1 0 obj"
+                "<< /dict "
+                "endobj",
+                "", PDFTokenizer::PDFRet::UNEXPECTED_SYMBOL
+            );
+        }
+        SECTION("redundant end")
+        {
+            test_pdf_proc(
+                "1 0 obj"
+                "<< /dict >> >>"
+                "endobj",
+                "", PDFTokenizer::PDFRet::UNEXPECTED_SYMBOL
+            );
+        }
+    }
+    SECTION("balanced array in array")
     {
         test_pdf_proc(
             "1 0 obj"
-            "<<"
-            "/K1 [ /V1 /V2 /V3 "
-            ">>"
+            "["
+            "[ /nested /array ]"
+            "]"
             "endobj",
-            "",  PDFTokenizer::PDFRet::INCOMPLETE_ARRAY_IN_DICTIONARY
+            ""
         );
     }
-    SECTION("token too long")
+    SECTION("imbalanced array in array")
+    {
+        SECTION("missing end")
+        {
+            test_pdf_proc(
+                "1 0 obj"
+                "["
+                "[ /nested /array "
+                "]"
+                "endobj",
+                "", PDFTokenizer::PDFRet::UNEXPECTED_SYMBOL
+            );
+        }
+        SECTION("redundant end")
+        {
+            test_pdf_proc(
+                "1 0 obj"
+                "["
+                "[ /nested /array ] ]"
+                "]"
+                "endobj",
+                "", PDFTokenizer::PDFRet::UNEXPECTED_SYMBOL
+            );
+        }
+    }
+    SECTION("balanced dictionary in array")
     {
         test_pdf_proc(
-            "1"s + std::string(16 * 1024,' ') + " 0 obj"
-            "<< >>"
-            "endobj"s,
-            "",  PDFTokenizer::PDFRet::TOKEN_TOO_LONG
+            "1 0 obj"
+            "["
+            "<< /nested /dict >>"
+            "]"
+            "endobj",
+            ""
+        );
+    }
+    SECTION("imbalanced dictionary in array")
+    {
+        SECTION("missing end")
+        {
+            test_pdf_proc(
+                "1 0 obj"
+                "["
+                "<< /nested /dict "
+                "]"
+                "endobj",
+                "", PDFTokenizer::PDFRet::UNEXPECTED_SYMBOL
+            );
+        }
+        SECTION("redundant end")
+        {
+            test_pdf_proc(
+                "1 0 obj"
+                "["
+                "<< /nested /dict >> >>"
+                "]"
+                "endobj",
+                "", PDFTokenizer::PDFRet::UNEXPECTED_SYMBOL
+            );
+        }
+    }
+    SECTION("balanced array in dictionary")
+    {
+        test_pdf_proc(
+                "1 0 obj"
+                "<< /array [] >>"
+                "endobj",
+                ""
+            );
+    }
+    SECTION("imbalanced array in dictionary")
+    {
+        SECTION("missing end")
+        {
+            test_pdf_proc(
+                "1 0 obj"
+                "<<"
+                "/K1 [ /V1 /V2 /V3 "
+                ">>"
+                "endobj",
+                "", PDFTokenizer::PDFRet::INCOMPLETE_ARRAY_IN_DICTIONARY
+            );
+        }
+        SECTION("redundant end")
+        {
+            test_pdf_proc(
+                "1 0 obj"
+                "<<"
+                "/K1 [ /V1 /V2 /V3 ]]"
+                ">>"
+                "endobj",
+                "", PDFTokenizer::PDFRet::INCOMPLETE_ARRAY_IN_DICTIONARY
+            );
+        }
+    }
+    SECTION("balanced strings")
+    {
+        test_pdf_proc(
+            "1 0 obj"
+            "( a string with ( parentheses ) in it )"
+            "endobj",
+            ""
+        );
+    }
+    SECTION("imbalanced strings")
+    {
+        SECTION("missing end")
+        {
+            // NOTE: such syntax doesn't generate an error, because it's possible
+            // to have a string continuation in next PDUs. Same holds true for
+            // hex strings too
+            test_pdf_proc(
+                "1 0 obj"
+                "( a string with ( parentheses  in it )"
+                "endobj",
+                ""
+            );
+        }
+        SECTION("redundant end")
+        {
+            test_pdf_proc(
+                "1 0 obj"
+                "( a string with ( parentheses  in it )))"
+                "endobj",
+                "", PDFTokenizer::PDFRet::UNEXPECTED_SYMBOL
+            );
+        }
+    }
+    SECTION("balanced hex strings")
+    {
+        test_pdf_proc(
+            "1 0 obj"
+            "<FE FF 00 66 006F 00 6F>"
+            "endobj",
+            ""
         );
     }
+    SECTION("imbalanced hex strings")
+    {
+        SECTION("missing end")
+        {
+            test_pdf_proc(
+                "1 0 obj"
+                "<FE FF 00 66 006F 00 6F "
+                "endobj",
+                ""
+            );
+        }
+        SECTION("redundant end")
+        {
+            test_pdf_proc(
+                "1 0 obj"
+                "<FE FF 00 66 006F 00 6F>>"
+                "endobj",
+                "", PDFTokenizer::PDFRet::UNEXPECTED_SYMBOL
+            );
+        }
+    }
+    SECTION("multiple tokens inter-nesting")
+    {
+        SECTION("array-array-array")
+        {
+            test_pdf_proc(
+                "1 0 obj"
+                "[ [ [ null ] ] ]"
+                "endobj",
+                ""
+            );
+        }
+        SECTION("array-array-dict")
+        {
+            test_pdf_proc(
+                "1 0 obj"
+                "[ [ << /key /value >> ] ]"
+                "endobj",
+                ""
+            );
+        }
+        SECTION("dict-dict-array")
+        {
+            test_pdf_proc(
+                "1 0 obj"
+                "<< /key1 << /key2 [ null ] >> >>"
+                "endobj",
+                ""
+            );
+        }
+        SECTION("dict-dict-dict")
+        {
+            test_pdf_proc(
+                "1 0 obj"
+                "<< /key1 << /key2 << /key3 /val3 >> >> >>"
+                "endobj",
+                ""
+            );
+        }
+    }
 }
 
 TEST_CASE("JS location", "[PDFTokenizer]")
@@ -485,6 +805,26 @@ TEST_CASE("stream object", "[PDFTokenizer]")
             "bar\n"
         );
     }
+    SECTION("reference as length")
+    {
+        test_pdf_proc(
+            "1 0 obj\n"
+            "<</S /JavaScript /JS 2 0 R>>\n"
+            "endobj\n"
+            "2 0 obj\n"
+            "<<"
+            "/Length 3 0 R"
+            ">>\n"
+            "stream\n"
+            "foo\n"
+            "endstream\n"
+            "endobj\n"
+            "3 0 obj\n"
+            "3\n"
+            "endobj\n",
+            "foo\n", PDFTokenizer::PDFRet::EOS
+        );
+    }
     SECTION("special symbols in a stream")
     {
         test_pdf_proc(