Pull request #3698: js_norm: add PDF stream processing

author Oleksii Shumeiko -X (oshumeik - SOFTSERVE INC at Cisco) <oshumeik@cisco.com>

Tue, 13 Dec 2022 18:42:24 +0000 (18:42 +0000)

committer Oleksii Shumeiko -X (oshumeik - SOFTSERVE INC at Cisco) <oshumeik@cisco.com>

Tue, 13 Dec 2022 18:42:24 +0000 (18:42 +0000)
author Oleksii Shumeiko -X (oshumeik - SOFTSERVE INC at Cisco) <oshumeik@cisco.com>
Tue, 13 Dec 2022 18:42:24 +0000 (18:42 +0000)
committer Oleksii Shumeiko -X (oshumeik - SOFTSERVE INC at Cisco) <oshumeik@cisco.com>
Tue, 13 Dec 2022 18:42:24 +0000 (18:42 +0000)
diff --git a/src/js_norm/dev_notes.txt b/src/js_norm/dev_notes.txt

index 0e0a6b94cd1f46ec9090f2914a2f4c06adbed8db..c004328460411c8f59ec7cc6c81830018d20516d 100644 (file)
--- a/src/js_norm/dev_notes.txt
+++ b/src/js_norm/dev_notes.txt
@@ -134,7 +134,11 @@ PDF parser follows "PDF 32000-1:2008 First Edition 2008-7-1 Document
  management Portable document format Part 1: PDF 1.7".
  Known limitations:
  * Nested dictionaries are not fully supported. Properties of the last object
-are tracked. Once the nested object ends, it clears all info about the object
-type.
+  are tracked. Once the nested object ends, it clears all info about the object
+  type.
  * Nested dictionaries are not allowed in JavaScript-type dictionary.
-* Stream objects are ignored.
+* JavaScript in a stream is tracked only when a reference to that stream appears
+  earlier in the same file.
+* Compressed JavaScript streams are handled correctly only if PDF decompression is
+  enabled (http_inspect.decompress_pdf = true, and the same option for other inspectors).
+
diff --git a/src/js_norm/pdf_tokenizer.h b/src/js_norm/pdf_tokenizer.h

index 9a31841c7e96f66171a7331b4d4ee8dcdbe70cf0..bd4c20b0b40aaba965953ba176ecbdeab4c4170a 100644 (file)
--- a/src/js_norm/pdf_tokenizer.h
+++ b/src/js_norm/pdf_tokenizer.h
@@ -20,11 +20,9 @@
  #ifndef PDF_TOKENIZER_H
  #define PDF_TOKENIZER_H
  
-#include <array>
  #include <cstring>
  #include <sstream>
-#include <stack>
-#include <vector>
+#include <unordered_set>
  
  #include "main/snort_types.h"
  
@@ -41,6 +39,8 @@ public:
          EOS = 0,
          NOT_NAME_IN_DICTIONARY_KEY,
          INCOMPLETE_ARRAY_IN_DICTIONARY,
+        STREAM_NO_LENGTH,
+        UNEXPECTED_SYMBOL,
          MAX
      };
  
@@ -64,6 +64,13 @@ private:
      PDFRet h_lit_unescape();
      PDFRet h_lit_oct2chr();
      PDFRet h_hex_hex2chr();
+    PDFRet h_stream_open();
+    PDFRet h_stream();
+    bool h_stream_close();
+    void h_stream_length();
+    void h_ref();
+    void h_ind_obj_open();
+    inline void h_ind_obj_close();
  
      struct ObjectString
      {
@@ -98,10 +105,18 @@ private:
          char key[PDFTOKENIZER_NAME_MAX_SIZE] = {0};
      };
  
+    // Tracking state for the stream of the indirect object being parsed.
+    struct Stream
+    {
+        // Bytes of stream data still expected; assigned from the /Length
+        // dictionary entry and decreased as stream text is matched.
+        // Starts at -1, meaning no /Length value has been recorded.
+        int rem_length = -1;
+        // Set when the current indirect object number was recorded from a /JS
+        // reference; cleared when the indirect object closes.
+        bool is_js = false;
+    };
+
      ObjectString obj_string;
      ObjectArray obj_array;
      ObjectDictionary obj_dictionary;
      DictionaryEntry obj_entry;
+    Stream obj_stream;
+    std::unordered_set<unsigned int> js_stream_refs;
  };
  
  bool PDFTokenizer::h_lit_str()
@@ -124,6 +139,11 @@ bool PDFTokenizer::h_lit_close()
      return --obj_string.parenthesis_level == 0;
  }
  
+// Handler for the 'endobj' keyword: drops all per-object stream state so that
+// nothing carries over into the next indirect object.
+void PDFTokenizer::h_ind_obj_close()
+{
+    obj_stream.is_js = false;
+    // Forget this object's /Length as well. Without the reset, a later stream
+    // that declares no /Length could slip past the STREAM_NO_LENGTH check in
+    // h_stream_open() using a value left over from an earlier object.
+    obj_stream.rem_length = -1;
+}
+
  }
  
  #endif
diff --git a/src/js_norm/pdf_tokenizer.l b/src/js_norm/pdf_tokenizer.l

index 20d497ee61f87fce360a15e2bfe5260e10bfac42..6224c39bbe1e7ba82bb52ce96a90f56dd95530d6 100644 (file)
--- a/src/js_norm/pdf_tokenizer.l
+++ b/src/js_norm/pdf_tokenizer.l
@@ -31,9 +31,7 @@
  #endif
  
  
-#include <algorithm>
  #include <cassert>
-#include <stdio.h>
  
  #include "js_norm/js_enum.h"
  #include "js_norm/pdf_tokenizer.h"
@@ -51,20 +49,19 @@ using namespace jsn;
  #define PUSH(x) yy_push_state(x)
  #define POP() yy_pop_state()
  
-#define YY_USER_ACTION                                      \
-    {                                                       \
-        debug_logf(5, js_trace, TRACE_PDF_PROC, nullptr,    \
-            "PDF pattern #%d, sc %d\n", yy_act, YY_START);  \
-                                                            \
-        debug_logf(5, js_trace, TRACE_PDF_DUMP, nullptr,    \
-            "PDF text '%s'\n", YYText());                   \
+#define YY_USER_ACTION                                                         \
+    {                                                                          \
+        debug_logf(5, js_trace, TRACE_PDF_PROC, nullptr,                       \
+            "PDF pattern #%d, sc %d\n", yy_act, YY_START);                     \
+        debug_logf(5, js_trace, TRACE_PDF_DUMP, nullptr,                       \
+            "PDF text '%s'\n", YYText());                                      \
      }
  
-#define EXEC(f)                                 \
-    {                                           \
-        auto r = (f);                           \
-        if (r)                                  \
-            return r;                           \
+#define EXEC(f)                                                                \
+    {                                                                          \
+        auto r = (f);                                                          \
+        if (r)                                                                 \
+            return r;                                                          \
      }
  
  %}
@@ -72,87 +69,109 @@ using namespace jsn;
  /* PDF 32000-1:2008 definitions follow */
  
  /* 7.2.2 Character Set */
-CHARS_WHITESPACE  \x00\x09\x0a\x0c\x0d\x20
-CHARS_DELIMITER   \(\)\<\>\[\]\{\}\/\%
-GRP_WHITESPACE    [\x00\x09\x0a\x0c\x0d\x20]
-GRP_DELIMITER     [\(\)\<\>\[\]\{\}\/\%]
-GRP_REGULAR       [^\x00\x09\x0a\x0c\x0d\x20\(\)\<\>\[\]\{\}\/\%]
+CHARS_WHITESPACE   \x00\x09\x0a\x0c\x0d\x20
+CHARS_DELIMITER    \(\)\<\>\[\]\{\}\/\%
+GRP_WHITESPACE     [\x00\x09\x0a\x0c\x0d\x20]
+EOL_MARKER         \r|\n|\r\n
+GRP_NEWLINE        [\x0d\x0a]
+GRP_NOT_NEWLINE    [^\x0d\x0a]
+GRP_DELIMITER      [\(\)\<\>\[\]\{\}\/\%]
+GRP_REGULAR        [^\x00\x09\x0a\x0c\x0d\x20\(\)\<\>\[\]\{\}\/\%]
  
  /* 7.2.3 Comments */
-COMMENT           %.*
+COMMENT            %{GRP_NOT_NEWLINE}*{EOL_MARKER}
  
  /* 7.3.2 Boolean Objects */
-OBJ_BOOLEAN       true|false
+OBJ_BOOLEAN        true|false
  
  /* 7.3.3 Numeric Objects */
-OBJ_INT_NUM       [+-]?[0-9]{1,64}
-OBJ_REL_NUM       [+-]?("."?[0-9]{1,64}|[0-9]{1,64}"."?|[0-9]{1,64}"."?[0-9]{1,64})
+OBJ_INT_NUM        [+-]?[0-9]{1,64}
+OBJ_REL_NUM        [+-]?("."?[0-9]{1,64}|[0-9]{1,64}"."?|[0-9]{1,64}"."?[0-9]{1,64})
  
  /* 7.3.4 String Objects */
-OBJ_LIT_STR_OPEN  "("
-OBJ_LIT_STR_CLOSE ")"
-OBJ_HEX_STR_OPEN  "<"
-OBJ_HEX_STR_CLOSE ">"
+OBJ_LIT_STR_OPEN   "("
+OBJ_LIT_STR_CLOSE  ")"
+OBJ_HEX_STR_OPEN   "<"
+OBJ_HEX_STR_CLOSE  ">"
  
  /* 7.3.4.2 Literal Strings */
-LIT_STR_ESC       \\[^0-7]
-LIT_STR_ESC_OCT   \\[0-7]{1}|\\[0-7]{2}|\\[0-7]{3}
-LIT_STR_ESC_EOL   \\[\x0d\x0a]|\\\x0d\x0a
-LIT_STR_EOL       [\x0d\x0a]|\x0d\x0a
-LIT_STR_BODY      [^\\\(\)]{1,64}
+LIT_STR_ESC        \\[^0-7]
+LIT_STR_ESC_OCT    \\[0-7]{1}|\\[0-7]{2}|\\[0-7]{3}
+LIT_STR_ESC_EOL    \\[\x0d\x0a]|\\\x0d\x0a
+LIT_STR_EOL        [\x0d\x0a]|\x0d\x0a
+LIT_STR_BODY       [^\\\(\)]{1,64}
  
  /* 7.3.4.3 Hexadecimal Strings */
-HEX_STR_BODY      [0-9A-Fa-f]{1,64}
-HEX_STR_SKIP      [^0-9A-Fa-f>]{1,64}
+HEX_STR_BODY       [0-9A-Fa-f]{1,64}
+HEX_STR_SKIP       [^0-9A-Fa-f>]{1,64}
  
  /* 7.3.5 Name Objects */
-OBJ_NAME          \/{GRP_REGULAR}{1,256}
+OBJ_NAME           \/{GRP_REGULAR}{1,256}
  
  /* 7.3.6 Array Objects */
-OBJ_ARRAY_OPEN    "["
-OBJ_ARRAY_CLOSE   "]"
+OBJ_ARRAY_OPEN     "["
+OBJ_ARRAY_CLOSE    "]"
  
  /* 7.3.7 Dictionary Objects */
-OBJ_DICT_OPEN     "<<"
-OBJ_DICT_CLOSE    ">>"
+OBJ_DICT_OPEN      "<<"
+OBJ_DICT_CLOSE     ">>"
  
-/* FIXIT: improve bytes consuming */
-OBJ_DICT_SKIP     .
+OBJ_DICT_SKIP      .|{GRP_NEWLINE}
  
  /* 7.3.8 Stream Objects */
-OBJ_STREAM_OPEN   stream$
-OBJ_STREAM_CLOSE  ^endstream
+OBJ_STREAM_OPEN    stream\r?\n
+OBJ_STREAM_CLOSE   {EOL_MARKER}endstream
+OBJ_STREAM_SKIP    {GRP_NOT_NEWLINE}{1,64}|{GRP_NEWLINE}
  
  /* 7.3.9 Null Object */
-OBJ_NULL          null
+OBJ_NULL           null
  
  /* 7.3.10 Indirect Objects */
-INDIRECT_OBJ      {OBJ_INT_NUM}{GRP_WHITESPACE}+{OBJ_INT_NUM}{GRP_WHITESPACE}+obj
-RECORD_OBJ        {OBJ_INT_NUM}{GRP_WHITESPACE}+{OBJ_INT_NUM}{GRP_WHITESPACE}+R
+INDIRECT_OBJ_OPEN  {OBJ_INT_NUM}{GRP_WHITESPACE}+{OBJ_INT_NUM}{GRP_WHITESPACE}+obj
  
-/* Not dictionary, not strings */
-SKIP              [^<\(%]{1,64}
+INDIRECT_OBJ_CLOSE endobj
+
+OBJ_REFERENCE        {OBJ_INT_NUM}{GRP_WHITESPACE}+{OBJ_INT_NUM}{GRP_WHITESPACE}+R
+
+
+/* Not object start, not comments */
+SKIP              [^[:digit:]%]{1,64}|.
  WHITESPACE        {GRP_WHITESPACE}{1,64}
  
  /* Start conditions: INITIAL or inside dictionary, literal string, hexadecimal string */
+%x indobj
+%x stream
  %x dictnr
  %x litstr
  %x hexstr
  %x jslstr
  %x jshstr
+%x jsstream
  
  %%
  
  {SKIP}                                            { }
  {COMMENT}                                         { }
  
-<INITIAL,dictnr>{OBJ_DICT_OPEN}                   { PUSH(dictnr); EXEC(h_dict_open())  }
+<INITIAL>{INDIRECT_OBJ_OPEN}                      { PUSH(indobj); h_ind_obj_open(); }
+<indobj>{COMMENT}                                 { }
+<indobj>{WHITESPACE}                              { }
+<indobj>{INDIRECT_OBJ_CLOSE}                      { POP(); h_ind_obj_close(); }
+
+<indobj>{OBJ_STREAM_OPEN}                         { EXEC(h_stream_open()) PUSH(obj_stream.is_js ? jsstream : stream); }
+<stream>{OBJ_STREAM_SKIP}                         { EXEC(h_stream()) }
+<jsstream>{OBJ_STREAM_SKIP}                       { EXEC(h_stream()) ECHO; }
+<stream>{OBJ_STREAM_CLOSE}                        { if (h_stream_close()) POP(); }
+<jsstream>{OBJ_STREAM_CLOSE}                      { if (h_stream_close()) POP(); }
+
+<dictnr>{OBJ_DICT_OPEN}                           { PUSH(dictnr); EXEC(h_dict_open()) }
+<indobj>{OBJ_DICT_OPEN}                           { PUSH(dictnr); EXEC(h_dict_open()) }
  <dictnr>{OBJ_DICT_CLOSE}                          { POP(); EXEC(h_dict_close()) }
  <dictnr>{COMMENT}                                 { }
  <dictnr>{WHITESPACE}                              { }
-<dictnr>{RECORD_OBJ}                              { EXEC(h_dict_other()) }
+<dictnr>{OBJ_REFERENCE}                           { EXEC(h_dict_other()) h_ref(); }
  <dictnr>{OBJ_BOOLEAN}                             { EXEC(h_dict_other()) }
-<dictnr>{OBJ_INT_NUM}                             { EXEC(h_dict_other()) }
+<dictnr>{OBJ_INT_NUM}                             { EXEC(h_dict_other()) h_stream_length(); }
  <dictnr>{OBJ_REL_NUM}                             { EXEC(h_dict_other()) }
  <dictnr>{OBJ_NULL}                                { EXEC(h_dict_other()) }
  <dictnr>{OBJ_NAME}                                { EXEC(h_dict_name()) }
@@ -162,7 +181,7 @@ WHITESPACE        {GRP_WHITESPACE}{1,64}
  <dictnr>{OBJ_HEX_STR_OPEN}                        { EXEC(h_dict_other()) if (h_hex_str()) PUSH(jshstr); else PUSH(hexstr); yyless(0); }
  <dictnr>{OBJ_DICT_SKIP}                           { }
  
-<INITIAL>{OBJ_LIT_STR_OPEN}                       { if (h_lit_open()) PUSH(litstr); }
+<indobj>{OBJ_LIT_STR_OPEN}                        { if (h_lit_open()) PUSH(litstr); }
  <litstr>{OBJ_LIT_STR_OPEN}                        { h_lit_open(); }
  <litstr>{OBJ_LIT_STR_CLOSE}                       { if (h_lit_close()) POP(); }
  <litstr>{LIT_STR_ESC}                             { }
@@ -171,7 +190,7 @@ WHITESPACE        {GRP_WHITESPACE}{1,64}
  <litstr>{LIT_STR_EOL}                             { }
  <litstr>{LIT_STR_BODY}                            { }
  
-<INITIAL>{OBJ_HEX_STR_OPEN}                       { PUSH(hexstr); }
+<indobj>{OBJ_HEX_STR_OPEN}                        { PUSH(hexstr); }
  <hexstr>{OBJ_HEX_STR_CLOSE}                       { POP(); }
  <hexstr>{HEX_STR_BODY}                            { }
  <hexstr>{HEX_STR_SKIP}                            { }
@@ -189,7 +208,9 @@ WHITESPACE        {GRP_WHITESPACE}{1,64}
  <jshstr>{HEX_STR_BODY}                            { EXEC(h_hex_hex2chr()) }
  <jshstr>{HEX_STR_SKIP}                            { }
  
-<INITIAL,dictnr,litstr,hexstr,jslstr,jshstr><<EOF>>    { return PDFRet::EOS; }
+<*><<EOF>>                                        { return PDFRet::EOS; }
+
+<*>.|\n                                           { return PDFRet::UNEXPECTED_SYMBOL; }
  
  %%
  
@@ -228,6 +249,9 @@ PDFTokenizer::PDFRet PDFTokenizer::h_dict_other()
      debug_logf(6, js_trace, TRACE_PDF_PROC, nullptr,
          "dictionary token: other\n");
  
+    debug_logf(6, js_trace, TRACE_PDF_DUMP, nullptr,
+        "dictionary entry: %s, %s\n", obj_entry.key, yytext);
+
      obj_dictionary.key_value = !obj_dictionary.key_value;
  
      return PDFRet::EOS;
@@ -320,6 +344,57 @@ PDFTokenizer::PDFRet PDFTokenizer::h_hex_hex2chr()
      return PDFRet::EOS;
  }
  
+// Handler for the 'stream' keyword inside an indirect object.
+// Returns STREAM_NO_LENGTH when no usable length has been recorded
+// (rem_length is negative); otherwise logs the stream kind and length and
+// returns EOS (zero), which the EXEC macro treats as "keep scanning".
+PDFTokenizer::PDFRet PDFTokenizer::h_stream_open()
+{
+    // rem_length is initialised to -1 and only assigned from a /Length entry
+    if (obj_stream.rem_length < 0)
+        return PDFRet::STREAM_NO_LENGTH;
+
+    debug_logf(6, js_trace, TRACE_PDF_PROC, nullptr,
+        "Starting %s stream, length %d\n", obj_stream.is_js ? "JavaScript" : "skipping", obj_stream.rem_length);
+
+    return PDFRet::EOS;
+}
+
+// Handler for a chunk of stream data: charges the length of the matched text
+// against the remaining stream length. Always returns EOS (zero).
+PDFTokenizer::PDFRet PDFTokenizer::h_stream()
+{
+    // the counter may become negative here; only h_stream_close() acts on that
+    obj_stream.rem_length -= yyleng;
+    return PDFRet::EOS;
+}
+
+// Handler for a candidate end of stream (an EOL marker followed by 'endstream').
+// The matched text is charged against the remaining length first.
+// Returns true when the declared length is used up, i.e. the stream really
+// ends here and the caller should leave the stream state; for a JavaScript
+// stream a single '\n' is written to the output in that case.
+// Returns false when more data is still expected, so the marker is treated as
+// stream content; for a JavaScript stream the matched text is echoed.
+bool PDFTokenizer::h_stream_close()
+{
+    obj_stream.rem_length -= yyleng;
+    if (obj_stream.rem_length <= 0)
+    {
+        // the emitted newline replaces the matched EOL + keyword, it is not copied from input
+        if (YY_START == jsstream)
+            yyout << '\n';
+        return true;
+    }
+
+    // 'endstream' seen before the declared length was consumed: keep it as data
+    if (YY_START == jsstream)
+        ECHO;
+    return false;
+}
+
+// Handler for an integer matched inside a dictionary: when the current entry
+// key is "/Length", stores the integer as the remaining stream length.
+// NOTE(review): the integer pattern accepts up to 64 digits and an optional
+// sign, while atoi() has no defined result for values outside the int range;
+// a negative value is later reported as STREAM_NO_LENGTH by h_stream_open().
+void PDFTokenizer::h_stream_length()
+{
+    if (!strcmp(obj_entry.key, "/Length"))
+        obj_stream.rem_length = atoi(yytext);
+}
+
+// Handler for an indirect reference ("N G R") matched inside a dictionary:
+// when the current entry key is "/JS", remembers the referenced object number
+// so that a stream in that object is later treated as JavaScript.
+void PDFTokenizer::h_ref()
+{
+    // atoi() stops at the first non-digit, so only the object number N is
+    // stored; the generation number G is not taken into account
+    if (!strcmp(obj_entry.key, "/JS"))
+        js_stream_refs.insert(atoi(yytext));
+}
+
+// Handler for the start of an indirect object ("N G obj"): flags the object's
+// stream as JavaScript when N was recorded earlier from a /JS reference.
+void PDFTokenizer::h_ind_obj_open()
+{
+    // only the leading object number is parsed; the generation number is ignored
+    int value = atoi(yytext);
+    // the flag is only ever set here; it is cleared in h_ind_obj_close()
+    if (js_stream_refs.count(value) > 0)
+        obj_stream.is_js = true;
+}
+
  PDFTokenizer::PDFTokenizer(std::istream& in, std::ostream& out)
      : yyFlexLexer(in, out)
  {
@@ -332,6 +407,5 @@ PDFTokenizer::~PDFTokenizer()
  PDFTokenizer::PDFRet PDFTokenizer::process()
  {
      auto r = yylex();
-
-    return (PDFRet)r;
+    return static_cast<PDFTokenizer::PDFRet>(r);
  }
diff --git a/src/js_norm/test/pdf_tokenizer_test.cc b/src/js_norm/test/pdf_tokenizer_test.cc

index 13eb326cac2cc7a4374ba1257e616dec0bbede58..db180133682a9393daa4e524c4894ebba130a7f5 100644 (file)
--- a/src/js_norm/test/pdf_tokenizer_test.cc
+++ b/src/js_norm/test/pdf_tokenizer_test.cc
@@ -156,13 +156,16 @@ TEST_CASE("basic", "[PDFTokenizer]")
      SECTION("comments")
      {
          test_pdf_proc(
+            "1 0 obj\n"
              "% comment 1\n"
              "<</K/V % comment /JS (script 1)\n>>"
-            "<</K/V /JS (a % b)>>\n"
-            "(% not a comment)\n"
+            "<</K/V % comment\r /JS (script 2; )\n>>"
+            "<</K/V /JS (a % b; )>>\n"
              "% comment 2\n"
-            "<</JS (; script 2) % comment 3\n>>",
-            "a % b; script 2"
+            "<</JS (script 3) % comment 3\n>>"
+            "(% not a comment)\n"
+            "endobj\n",
+            "script 2; a % b; script 3"
          );
      }
      SECTION("escapes in string")
@@ -215,42 +218,50 @@ TEST_CASE("basic", "[PDFTokenizer]")
      SECTION("not name for key")
      {
          test_pdf_proc(
+            "1 0 obj"
              "<<"
              "/K1 /V1"
              "[/K2] /V2"
              "/K3 /V3"
-            ">>",
+            ">>"
+            "endobj",
              "",  PDFTokenizer::PDFRet::NOT_NAME_IN_DICTIONARY_KEY
          );
      }
      SECTION("literal string as a key")
      {
          test_pdf_proc(
+            "1 0 obj"
              "<<"
              "/K1 /V1"
              "(foo) /V2"
              "/K3 /V3"
-            ">>",
+            ">>"
+            "endobj",
              "",  PDFTokenizer::PDFRet::NOT_NAME_IN_DICTIONARY_KEY
          );
      }
      SECTION("hex string as a key")
      {
          test_pdf_proc(
+            "1 0 obj"
              "<<"
              "/K1 /V1"
              "<62617a> /V2"
              "/K3 /V3"
-            ">>",
+            ">>"
+            "endobj",
              "",  PDFTokenizer::PDFRet::NOT_NAME_IN_DICTIONARY_KEY
          );
      }
      SECTION("incomplete array")
      {
          test_pdf_proc(
+            "1 0 obj"
              "<<"
              "/K1 [ /V1 /V2 /V3 "
-            ">>",
+            ">>"
+            "endobj",
              "",  PDFTokenizer::PDFRet::INCOMPLETE_ARRAY_IN_DICTIONARY
          );
      }
@@ -268,43 +279,43 @@ TEST_CASE("JS location", "[PDFTokenizer]")
      SECTION("no sub-type")
      {
          test_pdf_proc(
-            "<< /JS (script) >>",
+            "1 0 obj\n<< /JS (script) >>",
              "script"
          );
      }
      SECTION("no sub-type checks")
      {
          test_pdf_proc(
-            "<< /JS (script) /S /JavaScript >>",
+            "1 0 obj\n<< /JS (script) /S /JavaScript >>",
              "script"
          );
      }
      SECTION("no spaces")
      {
          test_pdf_proc(
-            "<</S/JavaScript/JS(script)>>",
+            "1 0 obj\n<</S/JavaScript/JS(script)>>",
              "script"
          );
      }
      SECTION("as hex string")
      {
          test_pdf_proc(
-            "<< /JS <62617a> >>",
+            "1 0 obj\n<< /JS <62617a> >>",
              "baz"
          );
          test_pdf_proc(
-            "<< /JS <70> >>",
+            "1 0 obj\n<< /JS <70> >>",
              "p"
          );
          test_pdf_proc(
-            "<< /JS <7> >>",
+            "1 0 obj\n<< /JS <7> >>",
              "p"
          );
      }
      SECTION("prepended with records")
      {
          test_pdf_proc(
-            "<</A 10 0 R /B 11 1 R/S/JavaScript/JS(script)>>",
+            "1 0 obj\n<</A 10 0 R /B 11 1 R/S/JavaScript/JS(script)>>",
              "script"
          );
      }
@@ -315,7 +326,7 @@ TEST_CASE("JS processing", "[PDFTokenizer]")
      SECTION("simple text")
      {
          test_pdf_proc(
-            "<</JS"
+            "1 0 obj\n<</JS"
              "(var _abc1 = 'Hello World!';)"
              ">>",
              "var _abc1 = 'Hello World!';"
@@ -324,7 +335,7 @@ TEST_CASE("JS processing", "[PDFTokenizer]")
      SECTION("balanced parenthesis")
      {
          test_pdf_proc(
-            "<</JS"
+            "1 0 obj\n<</JS"
              "(function foo() { console.log(\"Hello world!\") })"
              ">>",
              "function foo() { console.log(\"Hello world!\") }"
@@ -333,7 +344,7 @@ TEST_CASE("JS processing", "[PDFTokenizer]")
      SECTION("with escapes")
      {
          test_pdf_proc(
-            "<</JS"
+            "1 0 obj\n<</JS"
              "(function bar\\(var x\\)\\r{\\r    console.log\\(\"baz\"\\)\\r})"
              ">>",
              "function bar(var x)\r{\r    console.log(\"baz\")\r}"
@@ -342,7 +353,7 @@ TEST_CASE("JS processing", "[PDFTokenizer]")
      SECTION("all escapes")
      {
          test_pdf_proc(
-            "<</JS"
+            "1 0 obj\n<</JS"
              "(() \\n\\r\\t\\b\\f\\(\\)\\\\ \\123 \\A\\B\\C \\x\\y\\z)"
              ">>",
              "() \n\r\t\b\f()\\ \123 ABC xyz"
@@ -351,7 +362,7 @@ TEST_CASE("JS processing", "[PDFTokenizer]")
      SECTION("escaped new line")
      {
          test_pdf_proc(
-            "<</JS"
+            "1 0 obj\n<</JS"
              "(var str = 'Hello\\\n , \\\r    world\\\r\n\t!';)"
              ">>",
              "var str = 'Hello, world!';"
@@ -386,14 +397,410 @@ TEST_CASE("split", "[PDFTokenizer]")
      {
          test_pdf_proc({
              {"% comment", ""},
-            {"\n", ""},
+            {"\n1 0 obj\n", ""},
              {"<</K/V /JS (a % b)>>\n", "a % b"},
-            {"(% not a", ""},
-            {"comment)\n", ""},
+            {"endobj\n2 0 obj\n(% not a", ""},
+            {"comment)\nendobj\n3 0 obj\n", ""},
              {"<</JS (;", ";"},
              {"script 2)", "script 2"},
-            {">>", ""},
-            {"<</JS(script 3)>>", "script 3"}
+            {">>\nendobj\n4 0 obj\n", ""},
+            {"<</JS(script 3)>>\nendobj", "script 3"}
+        });
+    }
+}
+
+TEST_CASE("stream object", "[PDFTokenizer]")
+{
+    SECTION("zero length")
+    {
+        test_pdf_proc(
+            "1 0 obj\n"
+            "<</S /JavaScript /JS 2 0 R>>\n"
+            "endobj\n"
+            "2 0 obj\n"
+            "<<"
+            "/Length 0"
+            ">>"
+            "stream\n"
+            "\n"
+            "endstream\n"
+            "endobj\n",
+            "\n"
+        );
+    }
+    SECTION("exact length")
+    {
+        test_pdf_proc(
+            "1 0 obj\n"
+            "<</S /JavaScript /JS 2 0 R>>\n"
+            "endobj\n"
+            "2 0 obj\n"
+            "<<"
+            "/Length 6\n"
+            ">>\n"
+            "stream\n"
+            "foobar\n"
+            "endstream\n"
+            "endobj\n",
+            "foobar\n"
+        );
+    }
+    SECTION("carriage return and line feed as EOL")
+    {
+        test_pdf_proc(
+            "1 0 obj\n"
+            "<</S /JavaScript /JS 2 0 R>>\n"
+            "endobj\n"
+            "2 0 obj\n"
+            "<<"
+            "/Length 3\n"
+            ">>"
+            "stream\r\n"
+            "bar\r\n"
+            "endstream\n"
+            "endobj\n",
+            "bar\n"
+        );
+    }
+    SECTION("special symbols in a stream")
+    {
+        test_pdf_proc(
+            "1 0 obj\n"
+            "<</S /JavaScript /JS 2 0 R>>\n"
+            "endobj\n"
+            "2 0 obj\n"
+            "<<"
+            "/Length 13\n"
+            ">>"
+            "stream\n"
+            "\nendstream\n \r\n"
+            "endstream\n"
+            "endobj\n",
+            "\nendstream\n \n"
+        );
+    }
+    SECTION("referenced JavaScript")
+    {
+        test_pdf_proc(
+            "1 0 obj\n"
+            "<</S /JavaScript /JS 2 0 R>>\n"
+            "endobj\n"
+            "2 0 obj\n"
+            "<<"
+            "/Length 9\n"
+            ">>"
+            "stream\n"
+            "var a = 0\n"
+            "endstream\n"
+            "endobj\n",
+            "var a = 0\n"
+        );
+    }
+    SECTION("referenced JavaScript after another stream")
+    {
+        test_pdf_proc(
+           "1 0 obj\n"
+           "<</S /JavaScript /JS 2 0 R>>\n"
+           "endobj\n"
+           "3 0 obj\n"
+           "<</Length 1>>\n"
+           "stream\n"
+           " \n"
+           "endstream\n"
+           "endobj\n"
+           "2 0 obj\n"
+           "<<"
+           "/Length 9\n"
+           ">>"
+           "stream\n"
+           "var a = 0\n"
+           "endstream\n"
+           "endobj\n",
+           "var a = 0\n"
+        );
+    }
+    SECTION("multiple revisions")
+    {
+        test_pdf_proc(
+            "1 0 obj\n"
+            "<</S /JavaScript /JS 2 1 R>>\n"
+            "endobj\n"
+            "2 0 obj\n"
+            "<</Length 13>>\n"
+            "stream\n"
+            "//revision 1\n\n"
+            "endstream\n"
+            "endobj\n"
+            "2 1 obj\n"
+            "<</Length 13>>\n"
+            "stream\n"
+            "//revision 2\n\n"
+            "endstream\n"
+            "endobj\n",
+            "//revision 1\n\n"
+            "//revision 2\n\n"
+        );
+    }
+}
+
+TEST_CASE("stream object malformed", "[PDFTokenizer]")
+{
+    SECTION("no dictionary")
+    {
+        test_pdf_proc(
+            "1 0 obj\n"
+            "<</S /JavaScript /JS 2 0 R>>\n"
+            "endobj\n"
+            "2 0 obj\n"
+            "stream\n"
+            "foo\n"
+            "endstream\n"
+            "endobj\n",
+            "", PDFTokenizer::PDFRet::STREAM_NO_LENGTH
+        );
+    }
+    SECTION("a direct stream")
+    {
+        test_pdf_proc(
+            "1 0 obj\n"
+            "<</S /JavaScript /JS 2 0 R>>\n"
+            "endobj\n"
+            "<<"
+            "/Length 3"
+            ">>\n"
+            "stream\n"
+            "foo\n"
+            "endstream\n",
+            "", PDFTokenizer::PDFRet::EOS
+        );
+    }
+    SECTION("an indirect dictionary")
+    {
+        test_pdf_proc(
+            "1 0 obj\n"
+            "<</S /JavaScript /JS 3 0 R>>\n"
+            "endobj\n"
+            "2 0 obj\n"
+            "<<"
+            "/Length 3"
+            ">>\n"
+            "endobj\n"
+            "3 0 obj\n"
+            "2 0 R\n"
+            "stream\n"
+            "foo\n"
+            "endstream\n"
+            "endobj\n",
+            "", PDFTokenizer::PDFRet::UNEXPECTED_SYMBOL
+        );
+    }
+    SECTION("no length")
+    {
+        test_pdf_proc(
+            "1 0 obj\n"
+            "<</S /JavaScript /JS 2 0 R>>\n"
+            "endobj\n"
+            "2 0 obj\n"
+            "<<"
+            "/Creator (Acrobat Pro DC 22.1.20169)"
+            ">>\n"
+            "stream\n"
+            "foo\n"
+            "endstream\n"
+            "endobj\n",
+            "", PDFTokenizer::PDFRet::STREAM_NO_LENGTH
+        );
+    }
+    SECTION("length less")
+    {
+        test_pdf_proc(
+            "1 0 obj\n"
+            "<</S /JavaScript /JS 2 0 R>>\n"
+            "endobj\n"
+            "2 0 obj\n"
+            "<<"
+            "/Length 2"
+            ">>\n"
+            "stream\n"
+            "foo\n"
+            "endstream\n"
+            "endobj\n",
+            "foo\n", PDFTokenizer::PDFRet::EOS
+        );
+    }
+    SECTION("length greater within a few bytes")
+    {
+        test_pdf_proc(
+            "1 0 obj\n"
+            "<</S /JavaScript /JS 2 0 R>>\n"
+            "endobj\n"
+            "2 0 obj\n"
+            "<<"
+            "/Length 4"
+            ">>\n"
+            "stream\n"
+            "foo\n"
+            "endstream\n"
+            "endobj\n",
+            "foo\n", PDFTokenizer::PDFRet::EOS
+            // Note that the '\n' in the expected output is not extracted from the source data.
+            // The preprocessor does not extract exactly "/Length" bytes: as long as the
+            // declared length is greater by no more than a few bytes, the stream is
+            // still read correctly up to the endstream marker.
+        );
+    }
+    SECTION("length greater")
+    {
+        test_pdf_proc(
+            "1 0 obj\n"
+            "<</S /JavaScript /JS 2 0 R>>\n"
+            "endobj\n"
+            "2 0 obj\n"
+            "<<"
+            "/Length 100"
+            ">>\n"
+            "stream\n"
+            "foo\n"
+            "endstream\n"
+            "endobj\n",
+            "foo\n"
+            "endstream\n"
+            "endobj\n", PDFTokenizer::PDFRet::EOS
+        );
+    }
+    SECTION("carriage return following the keyword stream")
+    {
+        test_pdf_proc(
+            "1 0 obj\n"
+            "<</S /JavaScript /JS 2 0 R>>\n"
+            "endobj\n"
+            "2 0 obj\n"
+            "<<"
+            "/Length 3"
+            ">>\n"
+            "stream\r"
+            "foo\r"
+            "endstream\n"
+            "endobj\n",
+            "", PDFTokenizer::PDFRet::UNEXPECTED_SYMBOL
+        );
+    }
+    SECTION("no end-off-line marker specified")
+    {
+        test_pdf_proc(
+            "1 0 obj\n"
+            "<</S /JavaScript /JS 2 0 R>>\n"
+            "endobj\n"
+            "2 0 obj\n"
+            "<<"
+            "/Length 3"
+            ">>\n"
+            "stream"
+            "foo"
+            "endstream\n"
+            "endobj\n",
+            "", PDFTokenizer::PDFRet::UNEXPECTED_SYMBOL
+        );
+    }
+    SECTION("no end-off-line marker in stream data")
+    {
+        test_pdf_proc(
+            "1 0 obj\n"
+            "<</S /JavaScript /JS 2 0 R>>\n"
+            "endobj\n"
+            "2 0 obj\n"
+            "<<"
+            "/Length 3"
+            ">>\n"
+            "stream\n"
+            "foo"
+            "endstream\n"
+            "endobj\n",
+            "fooendstream\n"
+            "endobj\n", PDFTokenizer::PDFRet::EOS
+        );
+    }
+}
+
+TEST_CASE("stream object over PDU", "[PDFTokenizer]")
+{
+    SECTION("split inside non-JS stream")
+    {
+        test_pdf_proc({
+            {
+                "10 0 obj\n"
+                "<</Length 6>>\n"
+                "stream\n"
+                "foo",
+                ""
+            },
+            {
+                "bar\n"
+                "endstream\n"
+                "endobj\n",
+                ""
+            }
+        });
+    }
+    SECTION("split inside JavaScript stream")
+    {
+        test_pdf_proc({
+            {
+                "1 0 obj\n"
+                "<</JS 10 0 R>>\n"
+                "endobj\n"
+                "10 0 obj\n"
+                "<</Length 6>>\n"
+                "stream\n"
+                "foo",
+                "foo"
+            },
+            {
+                "bar\n"
+                "endstream\n"
+                "endobj\n",
+                "bar\n"
+            }
+        });
+    }
+    SECTION("split between reference and stream obj")
+    {
+        test_pdf_proc({
+            {
+                "1 0 obj\n"
+                "<</JS 10 0 R>>\n"
+                "endobj\n",
+                ""
+            },
+            {
+                "10 0 obj\n"
+                "<</Length 6>>\n"
+                "stream\n"
+                "foobar\n"
+                "endstream\n"
+                "endobj\n",
+                "foobar\n"
+            }
+        });
+    }
+    SECTION("split between dictionary and stream")
+    {
+        test_pdf_proc({
+            {
+                "1 0 obj\n"
+                "<</JS 10 0 R>>\n"
+                "endobj\n"
+                "10 0 obj\n"
+                "<</Length 6>>\n",
+                ""
+            },
+            {
+                "stream\n"
+                "foobar\n"
+                "endstream\n"
+                "endobj\n",
+                "foobar\n"
+            }
          });
      }
  }
diff --git a/src/service_inspectors/http_inspect/dev_notes_js_norm.txt b/src/service_inspectors/http_inspect/dev_notes_js_norm.txt

index 5eb613c1745ccf386e58f5e445ef722ee91233e7..f60f51a6e5c25666bd592dcb5d5b8095e01a97e2 100644 (file)
--- a/src/service_inspectors/http_inspect/dev_notes_js_norm.txt
+++ b/src/service_inspectors/http_inspect/dev_notes_js_norm.txt
@@ -4,7 +4,7 @@ the Legacy Normalizer and the Enhanced Normalizer.
  In NHI, there are three JSNorm extensions:
   * HttpInlineJSNorm, processes content of HTML script tags.
   * HttpExternalJSNorm, processes payload with JavaScript MIME type.
- * HttpPDFJSNorm, processes payload with PDF MIME type.
+ * HttpPDFJSNorm, processes payload with PDF MIME type and PDF files in MIME attachments.
  
  Normalization context is per transaction. It is created once js_data calls for normalized JS data,
  and is deleted once transaction ends. Partial inspections feed data incrementally to JS Normalizer,
@@ -17,9 +17,9 @@ During message body analysis the Enhanced Normalizer does one of the following:
  2. If it is an HTML-page, Normalizer searches for an opening tag and processes
     subsequent bytes in a stream mode, until it finds a closing tag.
     It proceeds and scans the entire message body for inline scripts.
-3. If it is PDF file transferred as MIME attachment or as a message body then
-   Normalizer extracts strings marked with '/JS' keyword and normalizes their
-   content as JS text.
+3. If it is a PDF file transferred as a MIME attachment or as a message body, then
+   Normalizer extracts strings assigned to the '/JS' key, or streams referenced by
+   that key's value, and normalizes their content as JS text.
  
  Also, js_data IPS option's buffer is a part of NHI processing in order to start the normalization.
author	Oleksii Shumeiko -X (oshumeik - SOFTSERVE INC at Cisco) <oshumeik@cisco.com>
	Tue, 13 Dec 2022 18:42:24 +0000 (18:42 +0000)
committer	Oleksii Shumeiko -X (oshumeik - SOFTSERVE INC at Cisco) <oshumeik@cisco.com>
	Tue, 13 Dec 2022 18:42:24 +0000 (18:42 +0000)
src/js_norm/dev_notes.txt		patch \| blob \| blame \| history
src/js_norm/pdf_tokenizer.h		patch \| blob \| blame \| history
src/js_norm/pdf_tokenizer.l		patch \| blob \| blame \| history
src/js_norm/test/pdf_tokenizer_test.cc		patch \| blob \| blame \| history
src/service_inspectors/http_inspect/dev_notes_js_norm.txt		patch \| blob \| blame \| history