Pull request #3681: js_norm: implement Enhanced JS Normalization for PDF

author Oleksii Shumeiko -X (oshumeik - SOFTSERVE INC at Cisco) <oshumeik@cisco.com>

Tue, 29 Nov 2022 13:56:17 +0000 (13:56 +0000)

committer Oleksii Shumeiko -X (oshumeik - SOFTSERVE INC at Cisco) <oshumeik@cisco.com>

Tue, 29 Nov 2022 13:56:17 +0000 (13:56 +0000)
author Oleksii Shumeiko -X (oshumeik - SOFTSERVE INC at Cisco) <oshumeik@cisco.com>
Tue, 29 Nov 2022 13:56:17 +0000 (13:56 +0000)
committer Oleksii Shumeiko -X (oshumeik - SOFTSERVE INC at Cisco) <oshumeik@cisco.com>
Tue, 29 Nov 2022 13:56:17 +0000 (13:56 +0000)
diff --git a/src/js_norm/CMakeLists.txt b/src/js_norm/CMakeLists.txt

index 7128c458b9144d2667eefc4335dd1806441199c5..15aa9c20c7c99896ca5220d2b115d1897a5976d8 100644 (file)
--- a/src/js_norm/CMakeLists.txt
+++ b/src/js_norm/CMakeLists.txt
@@ -1,9 +1,15 @@
+FLEX ( pdf_tokenizer
+    ${CMAKE_CURRENT_SOURCE_DIR}/pdf_tokenizer.l
+    ${CMAKE_CURRENT_BINARY_DIR}/pdf_tokenizer.cc
+)
+
  FLEX ( js_tokenizer
      ${CMAKE_CURRENT_SOURCE_DIR}/js_tokenizer.l
      ${CMAKE_CURRENT_BINARY_DIR}/js_tokenizer.cc
  )
  
  set ( JS_SOURCES
+    ${pdf_tokenizer_OUTPUTS}
      ${js_tokenizer_OUTPUTS}
      js_config.h
      js_enum.h
@@ -16,6 +22,7 @@ set ( JS_SOURCES
      js_normalizer.cc
      js_normalizer.h
      js_tokenizer.h
+    pdf_tokenizer.h
  )
  
  add_library(js_norm OBJECT ${JS_SOURCES})
diff --git a/src/js_norm/dev_notes.txt b/src/js_norm/dev_notes.txt

index 6196aabe6dfb289c3c925e06b6842f33eac0dc3a..0e0a6b94cd1f46ec9090f2914a2f4c06adbed8db 100644 (file)
--- a/src/js_norm/dev_notes.txt
+++ b/src/js_norm/dev_notes.txt
@@ -12,7 +12,7 @@ So, the number of unique identifiers available is 65536 names per transaction.
  If Normalizer overruns the configured limit, built-in alert is generated.
  
  A config option to set the limit manually:
- * js_norm.identifier_depth.
+ * js_norm.identifier_depth
  
  Identifiers from the ident_ignore list will be placed as is, without substitution. Starting with 
  the listed identifier, any chain of dot accessors, brackets and function calls will be kept
@@ -129,3 +129,12 @@ Verbosity levels:
  4. Temporary buffer (debug build only)
  5. Matched token (debug build only)
  6. Identifier substitution (debug build only)
+
+The PDF parser follows the specification "PDF 32000-1:2008, First Edition
+2008-7-1, Document management - Portable document format - Part 1: PDF 1.7".
+Known limitations:
+* Nested dictionaries are not fully supported. Only the properties of the most
+recently opened object are tracked. Once a nested object ends, all information
+about the object type is cleared.
+* Nested dictionaries are not allowed in a JavaScript-type dictionary.
+* Stream objects are ignored.
diff --git a/src/js_norm/js_enum.h b/src/js_norm/js_enum.h

index 8f25b66fff75cdc83e94e045e0ca74bdd5f87af7..8df38525c12329447dcca64b609e105a377ddecb 100644 (file)
--- a/src/js_norm/js_enum.h
+++ b/src/js_norm/js_enum.h
@@ -30,7 +30,9 @@ static constexpr unsigned js_gid = 154;
  enum
  {
      TRACE_PROC = 0,
-    TRACE_DUMP
+    TRACE_DUMP,
+    TRACE_PDF_PROC,
+    TRACE_PDF_DUMP
  };
  
  // This enum must be synchronized with JSNormModule::peg_names[] in js_norm_module.cc
diff --git a/src/js_norm/js_norm.cc b/src/js_norm/js_norm.cc

index e21ba8bb48efb9390f2b76da0221deaa421b235d..a7e023002eca76e2da781da5191b0851fb9ca473 100644 (file)
--- a/src/js_norm/js_norm.cc
+++ b/src/js_norm/js_norm.cc
@@ -23,9 +23,11 @@
  
  #include "js_norm.h"
  
+#include "log/messages.h"
+#include "trace/trace_api.h"
+
  #include "js_identifier_ctx.h"
  #include "js_normalizer.h"
-
  #include "js_norm_module.h"
  
  using namespace jsn;
@@ -110,14 +112,21 @@ void JSNorm::normalize(const void* in_data, size_t in_len, const void*& data, si
      }
      pdu_cnt = 0;
  
+    const Packet* packet = DetectionEngine::get_current_packet();
      src_ptr = (const uint8_t*)in_data;
      src_end = src_ptr + in_len;
  
      while (alive and pre_proc())
      {
+        trace_logf(3, js_trace, TRACE_DUMP, packet,
+            "original[%zu]: %.*s\n", src_end - src_ptr, (int)(src_end - src_ptr), src_ptr);
+
          auto ret = jsn_ctx->normalize((const char*)src_ptr, src_end - src_ptr, ext_script_type);
          const uint8_t* next = (const uint8_t*)jsn_ctx->get_src_next();
  
+        trace_logf(3, js_trace, TRACE_PROC, packet,
+            "normalizer returned with %d '%s'\n", ret, jsn::ret2str(ret));
+
          JSNormModule::increment_peg_counts(PEG_BYTES, next - src_ptr);
          src_ptr = next;
  
diff --git a/src/js_norm/js_norm_module.cc b/src/js_norm/js_norm_module.cc

index af573ba37df7d56d064ebbcf5b0aba62b6a6aba2..01d241e6bc547cce0b410516acfd236c1a67064d 100644 (file)
--- a/src/js_norm/js_norm_module.cc
+++ b/src/js_norm/js_norm_module.cc
@@ -79,8 +79,12 @@ const Parameter JSNormModule::params[] =
  
  static const TraceOption trace_options[] =
  {
-    { "proc",  TRACE_PROC,  "enable processing logging" },
-    { "dump",  TRACE_DUMP,  "enable data logging" },
+    { "proc", TRACE_PROC, "enable processing logging" },
+    { "dump", TRACE_DUMP, "enable data logging" },
+#ifdef DEBUG_MSGS
+    { "pdf_proc", TRACE_PDF_PROC, "enable processing logging for PDF extractor" },
+    { "pdf_dump", TRACE_PDF_DUMP, "enable data logging for PDF extractor" },
+#endif
      { nullptr, 0, nullptr }
  };
  
diff --git a/src/js_norm/js_normalizer.cc b/src/js_norm/js_normalizer.cc

index 5407ee2ba57b86ca5be111a16bcb564e0052cad0..72df7eb1042c58d3d6d8b32878d3a0ad2af71f46 100644 (file)
--- a/src/js_norm/js_normalizer.cc
+++ b/src/js_norm/js_normalizer.cc
@@ -24,6 +24,15 @@
  #include "js_normalizer.h"
  
  #include "js_norm/js_enum.h"
+#include "log/messages.h"
+#include "trace/trace_api.h"
+
+namespace snort
+{
+class Trace;
+}
+
+extern THREAD_LOCAL const snort::Trace* js_trace;
  
  #define BUFF_EXP_FACTOR 1.3
  
diff --git a/src/js_norm/js_tokenizer.h b/src/js_norm/js_tokenizer.h

index f5c97de0f09ca68fe67850f7b40c810da1428699..8eba086837490b917fb1682952c0f38b1b03afe5 100644 (file)
--- a/src/js_norm/js_tokenizer.h
+++ b/src/js_norm/js_tokenizer.h
@@ -25,11 +25,6 @@
  #include <stack>
  #include <vector>
  
-#include "log/messages.h"
-#include "trace/trace_api.h"
-
-extern THREAD_LOCAL const snort::Trace* js_trace;
-
  // The longest pattern has 9 characters " < / s c r i p t > ",
  // 8 of them can reside in 1st chunk
  // Each character in the identifier forms its own group (pattern matching case),
diff --git a/src/js_norm/js_tokenizer.l b/src/js_norm/js_tokenizer.l

index d368dabc694b4e51a34703fae190f6035da28a4d..77214d6ca3703e1f0102efa7b63b2b78a2eb0588 100644 (file)
--- a/src/js_norm/js_tokenizer.l
+++ b/src/js_norm/js_tokenizer.l
@@ -20,6 +20,7 @@
  
  %option c++
  %option yyclass="JSTokenizer"
+%option prefix="js"
  %option align full 8bit batch never-interactive
  %option noinput nounput noyywrap
  %option noyy_push_state noyy_pop_state noyy_top_state
@@ -37,8 +38,12 @@
  #include "js_norm/js_enum.h"
  #include "js_norm/js_identifier_ctx.h"
  #include "js_norm/js_tokenizer.h"
+#include "log/messages.h"
+#include "trace/trace_api.h"
  #include "utils/util_cstring.h"
  
+extern THREAD_LOCAL const snort::Trace* js_trace;
+
  using namespace jsn;
  
  #define YY_NO_UNPUT
diff --git a/src/js_norm/pdf_tokenizer.h b/src/js_norm/pdf_tokenizer.h

new file mode 100644 (file)

index 0000000..1b80be7
--- /dev/null
+++ b/src/js_norm/pdf_tokenizer.h
@@ -0,0 +1,127 @@
+//--------------------------------------------------------------------------
+// Copyright (C) 2022-2022 Cisco and/or its affiliates. All rights reserved.
+//
+// This program is free software; you can redistribute it and/or modify it
+// under the terms of the GNU General Public License Version 2 as published
+// by the Free Software Foundation.  You may not use, modify or distribute
+// this program under any other version of the GNU General Public License.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License along
+// with this program; if not, write to the Free Software Foundation, Inc.,
+// 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+//--------------------------------------------------------------------------
+// pdf_tokenizer.h author Cisco
+
+#ifndef PDF_TOKENIZER_H
+#define PDF_TOKENIZER_H
+
+#include <array>
+#include <cstring>
+#include <sstream>
+#include <stack>
+#include <vector>
+
+#define PDFTOKENIZER_NAME_MAX_SIZE 16
+
+namespace jsn
+{
+
+class PDFTokenizer : public yyFlexLexer // flex scanner that copies the text of /JS string values to the output stream
+{
+public:
+    enum PDFRet
+    {
+        EOS = 0, // end of input; also the "no error" result of the handlers
+        NOT_NAME_IN_DICTIONARY_KEY, // a non-name token was met where a dictionary key is expected
+        INCOMPLETE_ARRAY_IN_DICTIONARY, // a dictionary was closed while the array depth did not match
+        MAX
+    };
+
+    PDFTokenizer() = delete;
+    explicit PDFTokenizer(std::istream& in, std::ostream& out);
+    ~PDFTokenizer() override;
+
+    PDFRet process(); // runs yylex() and returns its result as PDFRet
+
+private:
+    int yylex() override;
+
+    PDFRet h_dict_open();
+    PDFRet h_dict_close();
+    PDFRet h_dict_name();
+    PDFRet h_dict_other();
+    inline bool h_lit_str();
+    inline bool h_hex_str();
+    inline bool h_lit_open();
+    inline bool h_lit_close();
+    PDFRet h_lit_unescape();
+    PDFRet h_lit_oct2chr();
+    PDFRet h_hex_hex2chr();
+
+    struct ObjectString
+    {
+        void clear()
+        { parenthesis_level = 0; }
+
+        int parenthesis_level = 0; // count of "(" of a literal string that are still open
+    };
+
+    struct ObjectArray
+    {
+        void clear()
+        { nesting_level = 0; }
+
+        int nesting_level = 0; // count of "[" met inside dictionaries that are still open
+    };
+
+    struct ObjectDictionary
+    {
+        void clear()
+        { key_value = true; array_level = 0; }
+
+        bool key_value = true; // true while the next entry token is expected to be a key
+        int array_level = 0; // array nesting level recorded when the dictionary was opened
+    };
+
+    struct DictionaryEntry
+    {
+        void clear()
+        { key[0] = '\0'; }
+
+        char key[PDFTOKENIZER_NAME_MAX_SIZE] = {0}; // last name seen in key position, truncated to fit
+    };
+
+    ObjectString obj_string;
+    ObjectArray obj_array;
+    ObjectDictionary obj_dictionary;
+    DictionaryEntry obj_entry;
+};
+
+bool PDFTokenizer::h_lit_str()
+{   // JS text: the current key is /JS and we are at the dictionary's own array depth
+    return !strcmp(obj_entry.key, "/JS") and obj_array.nesting_level == obj_dictionary.array_level;
+}
+
+bool PDFTokenizer::h_hex_str()
+{   // same test as for literal strings: value of /JS at the dictionary's own array depth
+    return obj_array.nesting_level == obj_dictionary.array_level and strcmp(obj_entry.key, "/JS") == 0;
+}
+
+bool PDFTokenizer::h_lit_open()
+{   // counts the "(" and reports whether it is the outermost one
+    return 1 == ++obj_string.parenthesis_level;
+}
+
+bool PDFTokenizer::h_lit_close()
+{   // counts the ")" and reports whether it closed the outermost "("
+    return 0 == --obj_string.parenthesis_level;
+}
+
+}
+
+#endif
diff --git a/src/js_norm/pdf_tokenizer.l b/src/js_norm/pdf_tokenizer.l

new file mode 100644 (file)

index 0000000..181c219
--- /dev/null
+++ b/src/js_norm/pdf_tokenizer.l
@@ -0,0 +1,337 @@
+/*--------------------------------------------------------------------------
+// Copyright (C) 2022-2022 Cisco and/or its affiliates. All rights reserved.
+//
+// This program is free software; you can redistribute it and/or modify it
+// under the terms of the GNU General Public License Version 2 as published
+// by the Free Software Foundation.  You may not use, modify or distribute
+// this program under any other version of the GNU General Public License.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License along
+// with this program; if not, write to the Free Software Foundation, Inc.,
+// 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+//--------------------------------------------------------------------------
+// pdf_tokenizer.l author Cisco
+*/
+
+%option c++
+%option yyclass="PDFTokenizer"
+%option prefix="pdf"
+%option align full 8bit batch never-interactive stack
+%option noinput nounput noyywrap noyy_top_state
+
+%{
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+
+#include <algorithm>
+#include <cassert>
+#include <stdio.h>
+
+#include "js_norm/js_enum.h"
+#include "js_norm/pdf_tokenizer.h"
+#include "log/messages.h"
+#include "trace/trace_api.h"
+
+extern THREAD_LOCAL const snort::Trace* js_trace;
+
+using namespace jsn;
+
+#define YY_NO_UNPUT
+
+#define YY_FATAL_ERROR(msg) { snort::FatalError("%s", msg); }
+
+#define PUSH(x) yy_push_state(x)
+#define POP() yy_pop_state()
+
+#define YY_USER_ACTION                                      \
+    {                                                       \
+        debug_logf(5, js_trace, TRACE_PDF_PROC, nullptr,    \
+            "PDF pattern #%d, sc %d\n", yy_act, YY_START);  \
+                                                            \
+        debug_logf(5, js_trace, TRACE_PDF_DUMP, nullptr,    \
+            "PDF text '%s'\n", YYText());                   \
+    }
+
+#define EXEC(f)                                 \
+    {                                           \
+        auto r = (f);                           \
+        if (r)                                  \
+            return r;                           \
+    }
+
+%}
+
+/* PDF 32000-1:2008 definitions follow */
+
+/* 7.2.2 Character Set */
+CHARS_WHITESPACE  \x00\x09\x0a\x0c\x0d\x20
+CHARS_DELIMITER   \(\)\<\>\[\]\{\}\/\%
+GRP_WHITESPACE    [\x00\x09\x0a\x0c\x0d\x20]
+GRP_DELIMITER     [\(\)\<\>\[\]\{\}\/\%]
+GRP_REGULAR       [^\x00\x09\x0a\x0c\x0d\x20\(\)\<\>\[\]\{\}\/\%]
+
+/* 7.2.3 Comments */
+COMMENT           %.*
+
+/* 7.3.2 Boolean Objects */
+OBJ_BOOLEAN       true|false
+
+/* 7.3.3 Numeric Objects */
+OBJ_INT_NUM       [+-]?[0-9]{1,64}
+OBJ_REL_NUM       [+-]?("."?[0-9]{1,64}|[0-9]{1,64}"."?|[0-9]{1,64}"."?[0-9]{1,64})
+
+/* 7.3.4 String Objects */
+OBJ_LIT_STR_OPEN  "("
+OBJ_LIT_STR_CLOSE ")"
+OBJ_HEX_STR_OPEN  "<"
+OBJ_HEX_STR_CLOSE ">"
+
+/* 7.3.4.2 Literal Strings */
+LIT_STR_ESC       \\[^0-7]
+LIT_STR_ESC_OCT   \\[0-7]{1}|\\[0-7]{2}|\\[0-7]{3}
+LIT_STR_ESC_EOL   \\[\x0d\x0a]|\\\x0d\x0a
+LIT_STR_EOL       [\x0d\x0a]|\x0d\x0a
+LIT_STR_BODY      [^\\\(\)]{1,64}
+
+/* 7.3.4.3 Hexadecimal Strings */
+HEX_STR_BODY      [0-9A-Fa-f]{1,64}
+HEX_STR_SKIP      [^0-9A-Fa-f>]{1,64}
+
+/* 7.3.5 Name Objects */
+OBJ_NAME          \/{GRP_REGULAR}{1,256}
+
+/* 7.3.6 Array Objects */
+OBJ_ARRAY_OPEN    "["
+OBJ_ARRAY_CLOSE   "]"
+
+/* 7.3.7 Dictionary Objects */
+OBJ_DICT_OPEN     "<<"
+OBJ_DICT_CLOSE    ">>"
+
+/* FIXIT: improve bytes consuming */
+OBJ_DICT_SKIP     .
+
+/* 7.3.8 Stream Objects */
+OBJ_STREAM_OPEN   stream$
+OBJ_STREAM_CLOSE  ^endstream
+
+/* 7.3.9 Null Object */
+OBJ_NULL          null
+
+/* 7.3.10 Indirect Objects */
+INDIRECT_OBJ      {OBJ_INT_NUM}{GRP_WHITESPACE}+{OBJ_INT_NUM}{GRP_WHITESPACE}+obj
+RECORD_OBJ        {OBJ_INT_NUM}{GRP_WHITESPACE}+{OBJ_INT_NUM}{GRP_WHITESPACE}+R
+
+/* Not dictionary, not strings */
+SKIP              [^<\(%]{1,64}
+WHITESPACE        {GRP_WHITESPACE}{1,64}
+
+/* Start conditions: INITIAL or inside dictionary, literal string, hexadecimal string */
+%x dictnr
+%x litstr
+%x hexstr
+%x jslstr
+%x jshstr
+
+%%
+
+{SKIP}                                            { }
+{COMMENT}                                         { }
+
+<INITIAL,dictnr>{OBJ_DICT_OPEN}                   { PUSH(dictnr); EXEC(h_dict_open())  }
+<dictnr>{OBJ_DICT_CLOSE}                          { POP(); EXEC(h_dict_close()) }
+<dictnr>{COMMENT}                                 { }
+<dictnr>{WHITESPACE}                              { }
+<dictnr>{RECORD_OBJ}                              { EXEC(h_dict_other()) }
+<dictnr>{OBJ_BOOLEAN}                             { EXEC(h_dict_other()) }
+<dictnr>{OBJ_INT_NUM}                             { EXEC(h_dict_other()) }
+<dictnr>{OBJ_REL_NUM}                             { EXEC(h_dict_other()) }
+<dictnr>{OBJ_NULL}                                { EXEC(h_dict_other()) }
+<dictnr>{OBJ_NAME}                                { EXEC(h_dict_name()) }
+<dictnr>{OBJ_ARRAY_OPEN}                          { ++obj_array.nesting_level; EXEC(h_dict_other()) }
+<dictnr>{OBJ_ARRAY_CLOSE}                         { --obj_array.nesting_level; EXEC(h_dict_other()) }
+<dictnr>{OBJ_LIT_STR_OPEN}                        { EXEC(h_dict_other()) if (h_lit_str()) PUSH(jslstr); else PUSH(litstr); yyless(0); }
+<dictnr>{OBJ_HEX_STR_OPEN}                        { EXEC(h_dict_other()) if (h_hex_str()) PUSH(jshstr); else PUSH(hexstr); yyless(0); }
+<dictnr>{OBJ_DICT_SKIP}                           { }
+
+<INITIAL>{OBJ_LIT_STR_OPEN}                       { if (h_lit_open()) PUSH(litstr); }
+<litstr>{OBJ_LIT_STR_OPEN}                        { h_lit_open(); }
+<litstr>{OBJ_LIT_STR_CLOSE}                       { if (h_lit_close()) POP(); }
+<litstr>{LIT_STR_ESC}                             { }
+<litstr>{LIT_STR_ESC_OCT}                         { }
+<litstr>{LIT_STR_ESC_EOL}                         { }
+<litstr>{LIT_STR_EOL}                             { }
+<litstr>{LIT_STR_BODY}                            { }
+
+<INITIAL>{OBJ_HEX_STR_OPEN}                       { PUSH(hexstr); }
+<hexstr>{OBJ_HEX_STR_CLOSE}                       { POP(); }
+<hexstr>{HEX_STR_BODY}                            { }
+<hexstr>{HEX_STR_SKIP}                            { }
+
+<jslstr>{OBJ_LIT_STR_OPEN}                        { if (!h_lit_open()) ECHO; }
+<jslstr>{OBJ_LIT_STR_CLOSE}                       { if (h_lit_close()) POP(); else ECHO; }
+<jslstr>{LIT_STR_ESC}                             { EXEC(h_lit_unescape()) }
+<jslstr>{LIT_STR_ESC_OCT}                         { EXEC(h_lit_oct2chr()) }
+<jslstr>{LIT_STR_ESC_EOL}{WHITESPACE}             { }
+<jslstr>{LIT_STR_EOL}                             { ECHO; }
+<jslstr>{LIT_STR_BODY}                            { ECHO; }
+
+<jshstr>{OBJ_HEX_STR_OPEN}                        { }
+<jshstr>{OBJ_HEX_STR_CLOSE}                       { POP(); }
+<jshstr>{HEX_STR_BODY}                            { EXEC(h_hex_hex2chr()) }
+<jshstr>{HEX_STR_SKIP}                            { }
+
+<<EOF>>                                           { return PDFRet::EOS; }
+
+%%
+
+PDFTokenizer::PDFRet PDFTokenizer::h_dict_open()
+{
+    // A freshly opened dictionary expects a key first and remembers the
+    // array depth it was opened at.
+    obj_dictionary.key_value = true;
+    obj_dictionary.array_level = obj_array.nesting_level;
+    debug_logf(6, js_trace, TRACE_PDF_PROC, nullptr,
+        "dictionary open, at array level %d\n", obj_array.nesting_level);
+    return PDFRet::EOS;
+}
+
+PDFTokenizer::PDFRet PDFTokenizer::h_dict_close()
+{
+    obj_dictionary.clear(); // back to defaults: key expected, array_level = 0
+
+    debug_logf(6, js_trace, TRACE_PDF_PROC, nullptr,
+        "dictionary close, at array level %d\n", obj_array.nesting_level);
+
+    if (obj_dictionary.array_level != obj_array.nesting_level) // NOTE(review): array_level was zeroed by clear() above, so this tests nesting_level != 0 - confirm that is intended
+        return PDFRet::INCOMPLETE_ARRAY_IN_DICTIONARY;
+
+    return PDFRet::EOS; // 0, so the EXEC() wrapper in the rule does not return from yylex()
+}
+
+PDFTokenizer::PDFRet PDFTokenizer::h_dict_other()
+{
+    // At another array depth than the dictionary's own (e.g. inside a nested array): not an entry.
+    const bool at_dict_level = obj_dictionary.array_level == obj_array.nesting_level;
+    if (!at_dict_level)
+        return PDFRet::EOS;
+
+    // Only a name may stand in the key position.
+    if (obj_dictionary.key_value)
+        return PDFRet::NOT_NAME_IN_DICTIONARY_KEY;
+
+    debug_logf(6, js_trace, TRACE_PDF_PROC, nullptr, "dictionary token: other\n");
+    obj_dictionary.key_value = true; // it was false here: the value is consumed, a key comes next
+    return PDFRet::EOS;
+}
+
+PDFTokenizer::PDFRet PDFTokenizer::h_dict_name()
+{
+    if (obj_dictionary.array_level != obj_array.nesting_level)
+        return PDFRet::EOS;
+
+    if (obj_dictionary.key_value)
+        strncpy(obj_entry.key, yytext, sizeof(obj_entry.key) - 1);
+
+    obj_dictionary.key_value = !obj_dictionary.key_value;
+
+    debug_logf(6, js_trace, TRACE_PDF_PROC, nullptr,
+        "dictionary token: name as %s\n", obj_dictionary.key_value ? "value" : "key");
+
+    debug_logf(6, js_trace, TRACE_PDF_DUMP, nullptr,
+        "dictionary entry: %s, %s\n", obj_entry.key, obj_dictionary.key_value ? yytext : "...");
+
+    return PDFRet::EOS;
+}
+
+PDFTokenizer::PDFRet PDFTokenizer::h_lit_unescape()
+{
+    assert(yyleng == 2);
+    assert(yytext[0] == '\\');
+
+    char c;
+
+    // 7.3.4.2 Literal Strings, Table 3 Escape sequences in literal strings
+    switch (yytext[1])
+    {
+    case 'n': c = '\n'; break;
+    case 'r': c = '\r'; break;
+    case 't': c = '\t'; break;
+    case 'b': c = '\b'; break;
+    case 'f': c = '\f'; break;
+    case '(': c = '('; break;
+    case ')': c = ')'; break;
+    case '\\': c = '\\'; break;
+    default: c = yytext[1];
+    }
+
+    yyout << c;
+
+    return PDFRet::EOS;
+}
+
+PDFTokenizer::PDFRet PDFTokenizer::h_lit_oct2chr()
+{
+    assert(0 < yyleng and yyleng < 5);
+    assert(yytext[0] == '\\');
+
+    // Skip the backslash and read the digits that follow as one octal number.
+    unsigned code;
+    sscanf(yytext + 1, "%o", &code);
+    yyout << (char)code;
+    debug_logf(6, js_trace, TRACE_PDF_DUMP, nullptr,
+        "literal string, %s to %c \n", yytext, code);
+
+    return PDFRet::EOS;
+}
+
+PDFTokenizer::PDFRet PDFTokenizer::h_hex_hex2chr()
+{
+    int len = yyleng & ~1;
+    const char* ptr = yytext;
+    const char* end = yytext + len;
+
+    while (ptr < end)
+    {
+        unsigned v;
+        sscanf(ptr, "%02x", &v);
+        yyout << (char)v;
+        ptr += 2;
+    }
+
+    if (len != yyleng)
+    {
+        unsigned v;
+        sscanf(ptr, "%01x", &v);
+        yyout << (char)(v << 4);
+    }
+
+    debug_logf(6, js_trace, TRACE_PDF_DUMP, nullptr,
+        "literal string, in hex: %s\n", yytext);
+
+    return PDFRet::EOS;
+}
+
+// Hands both streams to the generated base lexer; the rules write extracted
+// text through ECHO / yyout.
+PDFTokenizer::PDFTokenizer(std::istream& in, std::ostream& out)
+    : yyFlexLexer(in, out)
+{ }
+
+// The tokenizer's own members are plain counters and a fixed-size key buffer.
+PDFTokenizer::~PDFTokenizer() = default;
+
+PDFTokenizer::PDFRet PDFTokenizer::process()
+{
+    // The rules return PDFRet values through the int result of yylex();
+    // hand the code back to the caller as the enum.
+    return static_cast<PDFRet>(yylex());
+}
diff --git a/src/js_norm/test/CMakeLists.txt b/src/js_norm/test/CMakeLists.txt

index 4dce8615b0a95aaf64c0441a95b179629b504956..260c15a9c42fa59026f6939535854f9699f14f70 100644 (file)
--- a/src/js_norm/test/CMakeLists.txt
+++ b/src/js_norm/test/CMakeLists.txt
@@ -3,6 +3,11 @@ FLEX ( js_tokenizer
      ${CMAKE_CURRENT_BINARY_DIR}/../js_tokenizer.cc
  )
  
+FLEX ( pdf_tokenizer
+    ${CMAKE_CURRENT_SOURCE_DIR}/../pdf_tokenizer.l
+    ${CMAKE_CURRENT_BINARY_DIR}/../pdf_tokenizer.cc
+)
+
  add_catch_test( js_normalizer_test
      SOURCES
          ${js_tokenizer_OUTPUTS}
@@ -68,3 +73,9 @@ add_catch_test( jsn_test
          ${CMAKE_SOURCE_DIR}/src/utils/streambuf.cc
          js_test_stubs.cc
  )
+
+add_catch_test( pdf_tokenizer_test
+    SOURCES
+        ${pdf_tokenizer_OUTPUTS}
+        js_test_stubs.cc
+)
diff --git a/src/js_norm/test/js_test_options.cc b/src/js_norm/test/js_test_options.cc

index cd5c9dffaa2f19f665617f2042f77c6b81da2be7..5910ffeedb31b43aba872bdb537378c296ed234b 100644 (file)
--- a/src/js_norm/test/js_test_options.cc
+++ b/src/js_norm/test/js_test_options.cc
@@ -23,6 +23,8 @@
  
  #include "js_test_options.h"
  
+#include <assert.h>
+
  Config::Config(const Config& other) : type(other.type)
  {
      switch (other.type)
diff --git a/src/js_norm/test/js_test_stubs.cc b/src/js_norm/test/js_test_stubs.cc

index 645598b11fd5f3f8099e586a79399b6d32d4294e..58fb88a4724bbef43bdea06a970c3868df455a09 100644 (file)
--- a/src/js_norm/test/js_test_stubs.cc
+++ b/src/js_norm/test/js_test_stubs.cc
@@ -37,4 +37,5 @@ uint8_t TraceApi::get_constraints_generation() { return 0; }
  void TraceApi::filter(const Packet&) { }
  
  int DetectionEngine::queue_event(unsigned int, unsigned int) { return 0; }
+Packet* DetectionEngine::get_current_packet() { return nullptr; }
  }
diff --git a/src/js_norm/test/pdf_tokenizer_test.cc b/src/js_norm/test/pdf_tokenizer_test.cc

new file mode 100644 (file)

index 0000000..b298653
--- /dev/null
+++ b/src/js_norm/test/pdf_tokenizer_test.cc
@@ -0,0 +1,399 @@
+//--------------------------------------------------------------------------
+// Copyright (C) 2022-2022 Cisco and/or its affiliates. All rights reserved.
+//
+// This program is free software; you can redistribute it and/or modify it
+// under the terms of the GNU General Public License Version 2 as published
+// by the Free Software Foundation.  You may not use, modify or distribute
+// this program under any other version of the GNU General Public License.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License along
+// with this program; if not, write to the Free Software Foundation, Inc.,
+// 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+//--------------------------------------------------------------------------
+// pdf_tokenizer_test.cc author Cisco
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <string>
+#include <vector>
+
+#include <FlexLexer.h>
+
+#include "catch/catch.hpp"
+#include "js_norm/pdf_tokenizer.h"
+
+using namespace jsn;
+using namespace std;
+
+typedef pair<string, string> Chunk;
+
+// Scans 'source' in one pass and verifies the return code and the extracted text.
+static void test_pdf_proc(const string& source, const string& expected,
+    PDFTokenizer::PDFRet ret = PDFTokenizer::PDFRet::EOS)
+{
+    istringstream input(source);
+    ostringstream out;
+    PDFTokenizer tokenizer(input, out);
+    auto r = tokenizer.process();
+
+    CHECK(ret == r);
+    CHECK(expected == out.str());
+}
+
+static void test_pdf_proc(const vector<Chunk>& chunks)
+{
+    istringstream in;
+    ostringstream out;
+    PDFTokenizer extractor(in, out);
+
+    for (auto& chunk : chunks)
+    {
+        auto src = chunk.first;
+        auto exp = chunk.second;
+
+        in.rdbuf()->pubsetbuf((char*)src.c_str(), src.length());
+        out.str("");
+
+        auto r = extractor.process();
+
+        CHECK(PDFTokenizer::PDFRet::EOS == r);
+        CHECK(exp == out.str());
+    }
+}
+
+TEST_CASE("basic", "[PDFTokenizer]") // object parsing; text is expected only from string values of a /JS key
+{
+    SECTION("no input")
+    {
+        test_pdf_proc(
+            "",
+            ""
+        );
+    }
+    SECTION("minimal PDF")
+    {
+        test_pdf_proc(
+            "20 0 obj"
+            "<<"
+            "/Creator (Acrobat Pro DC 22.1.20169)"
+            "/ModDate (D:20220714154535+03'00')"
+            "/CreationDate (D:20220714153909+03'00')"
+            "/Producer (Acrobat Pro DC 22.1.20169)"
+            ">>"
+            "endobj",
+            ""
+        );
+    }
+    SECTION("direct object")
+    {
+        test_pdf_proc(
+            "<<"
+            "/S /JavaScript"
+            ">>",
+            ""
+        );
+    }
+    SECTION("indirect object")
+    {
+        test_pdf_proc(
+            "19 0 obj"
+            "<<"
+            "/S /JavaScript"
+            ">>"
+            "endobj",
+            ""
+        );
+    }
+    SECTION("records")
+    {
+        test_pdf_proc(
+            "1 0 R"
+            "<<"
+            "/T 2 0 R"
+            ">>",
+            ""
+        );
+    }
+    SECTION("sub array")
+    {
+        test_pdf_proc(
+            "<<"
+            "/K [ /name1 /name2 /name3 ]"
+            ">>",
+            ""
+        );
+    }
+    SECTION("sub dictionary")
+    {
+        test_pdf_proc(
+            "<<"
+            "/K << /k1 /v1 /k2 /v2 >> "
+            ">>",
+            ""
+        );
+    }
+    SECTION("more items")
+    {
+        test_pdf_proc(
+            "<00>"
+            "<< >>"
+            "<<"
+            "/K << /k1 /v1 /k2 [ /i1 /i2 /i3 /i4 ] /k3 /v3 /k4 <000102> /k5 (abc) >>"
+            ">>"
+            "["
+            "<</k1/v1/k2/v2/k3/v3>> <</k1[/i1/i2/i3[/j1/j2]]/k2<00>>> <</k1<</t1<00>>>>>"
+            "]",
+            ""
+        );
+    }
+    SECTION("comments") // a comment runs to the end of its line, so "script 1" is never seen as a /JS value
+    {
+        test_pdf_proc(
+            "% comment 1\n"
+            "<</K/V % comment /JS (script 1)\n>>"
+            "<</K/V /JS (a % b)>>\n"
+            "(% not a comment)\n"
+            "% comment 2\n"
+            "<</JS (; script 2) % comment 3\n>>",
+            "a % b; script 2"
+        );
+    }
+    SECTION("escapes in string") // a string outside of any dictionary is consumed without output
+    {
+        test_pdf_proc(
+            "(() \\n\\r\\t\\b\\f\\(\\)\\\\ \\123 \\A\\B\\C \\x\\y\\z)",
+            ""
+        );
+    }
+    SECTION("hex string")
+    {
+        test_pdf_proc(
+            "<000102030405>",
+            ""
+        );
+    }
+    SECTION("key after literal string")
+    {
+        test_pdf_proc(
+            "<<"
+            "/Lang (EN-GB)"
+            "/K [12 0 R]"
+            ">>",
+            ""
+        );
+    }
+    SECTION("key after hex string")
+    {
+        test_pdf_proc(
+            "<<"
+            "/Lang <62617a>"
+            "/K [12 0 R]"
+            ">>",
+            ""
+        );
+    }
+    SECTION("number values")
+    {
+        test_pdf_proc(
+            "<<"
+            "/N 10"
+            "/N 1.0"
+            "/N 1."
+            "/N .1"
+            "/N 1"
+            ">>",
+            ""
+        );
+    }
+    SECTION("not name for key") // an array stands where a key is expected
+    {
+        test_pdf_proc(
+            "<<"
+            "/K1 /V1"
+            "[/K2] /V2"
+            "/K3 /V3"
+            ">>",
+            "",  PDFTokenizer::PDFRet::NOT_NAME_IN_DICTIONARY_KEY
+        );
+    }
+    SECTION("literal string as a key")
+    {
+        test_pdf_proc(
+            "<<"
+            "/K1 /V1"
+            "(foo) /V2"
+            "/K3 /V3"
+            ">>",
+            "",  PDFTokenizer::PDFRet::NOT_NAME_IN_DICTIONARY_KEY
+        );
+    }
+    SECTION("hex string as a key")
+    {
+        test_pdf_proc(
+            "<<"
+            "/K1 /V1"
+            "<62617a> /V2"
+            "/K3 /V3"
+            ">>",
+            "",  PDFTokenizer::PDFRet::NOT_NAME_IN_DICTIONARY_KEY
+        );
+    }
+    SECTION("incomplete array") // the "[" is still open when ">>" arrives
+    {
+        test_pdf_proc(
+            "<<"
+            "/K1 [ /V1 /V2 /V3 "
+            ">>",
+            "",  PDFTokenizer::PDFRet::INCOMPLETE_ARRAY_IN_DICTIONARY
+        );
+    }
+}
+
+TEST_CASE("JS location", "[PDFTokenizer]") // which dictionary values are taken as JavaScript text
+{
+    SECTION("wrong type") // the /JS value is a name, not a string: nothing to extract
+    {
+        test_pdf_proc(
+            "<</S /JavaScript /JS /script >>",
+            ""
+        );
+    }
+    SECTION("no sub-type") // extraction does not require /S /JavaScript
+    {
+        test_pdf_proc(
+            "<< /JS (script) >>",
+            "script"
+        );
+    }
+    SECTION("no sub-type checks") // /S /JavaScript after the /JS entry changes nothing
+    {
+        test_pdf_proc(
+            "<< /JS (script) /S /JavaScript >>",
+            "script"
+        );
+    }
+    SECTION("no spaces")
+    {
+        test_pdf_proc(
+            "<</S/JavaScript/JS(script)>>",
+            "script"
+        );
+    }
+    SECTION("as hex string") // an odd trailing digit is the high nibble: <7> decodes like <70>
+    {
+        test_pdf_proc(
+            "<< /JS <62617a> >>",
+            "baz"
+        );
+        test_pdf_proc(
+            "<< /JS <70> >>",
+            "p"
+        );
+        test_pdf_proc(
+            "<< /JS <7> >>",
+            "p"
+        );
+    }
+    SECTION("prepended with records")
+    {
+        test_pdf_proc(
+            "<</A 10 0 R /B 11 1 R/S/JavaScript/JS(script)>>",
+            "script"
+        );
+    }
+}
+
+TEST_CASE("JS processing", "[PDFTokenizer]") // decoding of the literal string that holds the script
+{
+    SECTION("simple text")
+    {
+        test_pdf_proc(
+            "<</JS"
+            "(var _abc1 = 'Hello World!';)"
+            ">>",
+            "var _abc1 = 'Hello World!';"
+        );
+    }
+    SECTION("balanced parenthesis") // inner parentheses are kept, only the outermost pair is dropped
+    {
+        test_pdf_proc(
+            "<</JS"
+            "(function foo() { console.log(\"Hello world!\") })"
+            ">>",
+            "function foo() { console.log(\"Hello world!\") }"
+        );
+    }
+    SECTION("with escapes")
+    {
+        test_pdf_proc(
+            "<</JS"
+            "(function bar\\(var x\\)\\r{\\r    console.log\\(\"baz\"\\)\\r})"
+            ">>",
+            "function bar(var x)\r{\r    console.log(\"baz\")\r}"
+        );
+    }
+    SECTION("all escapes") // known escapes are decoded, octal 123 is 'S', unknown escapes keep only the character
+    {
+        test_pdf_proc(
+            "<</JS"
+            "(() \\n\\r\\t\\b\\f\\(\\)\\\\ \\123 \\A\\B\\C \\x\\y\\z)"
+            ">>",
+            "() \n\r\t\b\f()\\ \123 ABC xyz"
+        );
+    }
+    SECTION("escaped new line") // an escaped end-of-line and the whitespace after it are removed
+    {
+        test_pdf_proc(
+            "<</JS"
+            "(var str = 'Hello\\\n , \\\r    world\\\r\n\t!';)"
+            ">>",
+            "var str = 'Hello, world!';"
+        );
+    }
+}
+
+TEST_CASE("split", "[PDFTokenizer]") // one extractor fed chunk by chunk; each pair is {input chunk, text expected from that chunk}
+{
+    SECTION("no input")
+    {
+        test_pdf_proc({
+            {"", ""},
+            {"", ""},
+            {"", ""}
+        });
+    }
+    SECTION("minimal PDF")
+    {
+        test_pdf_proc({
+            {"20 0 obj", ""},
+            {"<<", ""},
+            {"/Creator (Acrobat Pro DC 22.1.20169)", ""},
+            {"/ModDate (D:20220714154535+03'00')", ""},
+            {"/CreationDate (D:20220714153909+03'00')", ""},
+            {"/Producer (Acrobat Pro DC 22.1.20169)", ""},
+            {">>", ""},
+            {"endobj", ""}
+        });
+    }
+    SECTION("script") // strings and comments may be cut at chunk borders
+    {
+        test_pdf_proc({
+            {"% comment", ""},
+            {"\n", ""},
+            {"<</K/V /JS (a % b)>>\n", "a % b"},
+            {"(% not a", ""},
+            {"comment)\n", ""},
+            {"<</JS (;", ";"},
+            {"script 2)", "script 2"},
+            {">>", ""},
+            {"<</JS(script 3)>>", "script 3"}
+        });
+    }
+}
diff --git a/src/service_inspectors/http_inspect/dev_notes_js_norm.txt b/src/service_inspectors/http_inspect/dev_notes_js_norm.txt

index 42a88673d964db87fa46eb79f1bd01be34ae2821..5eb613c1745ccf386e58f5e445ef722ee91233e7 100644 (file)
--- a/src/service_inspectors/http_inspect/dev_notes_js_norm.txt
+++ b/src/service_inspectors/http_inspect/dev_notes_js_norm.txt
@@ -1,9 +1,10 @@
  HttpJsNorm class serves as a script Normalizer, and currently has two implementations:
  the Legacy Normalizer and the Enhanced Normalizer.
  
-In NHI, there are two JSNorm extensions:
+In NHI, there are three JSNorm extensions:
   * HttpInlineJSNorm, processes content of HTML script tags.
   * HttpExternalJSNorm, processes payload with JavaScript MIME type.
+ * HttpPDFJSNorm, processes payload with PDF MIME type.
  
  Normalization context is per transaction. It is created once js_data calls for normalized JS data,
  and is deleted once transaction ends. Partial inspections feed data incrementally to JS Normalizer,
@@ -16,6 +17,9 @@ During message body analysis the Enhanced Normalizer does one of the following:
  2. If it is an HTML-page, Normalizer searches for an opening tag and processes
     subsequent bytes in a stream mode, until it finds a closing tag.
     It proceeds and scans the entire message body for inline scripts.
+3. If it is a PDF file transferred as a MIME attachment or as a message body,
+   Normalizer extracts strings marked with the '/JS' keyword and normalizes
+   their content as JS text.
  
  Also, js_data IPS option's buffer is a part of NHI processing in order to start the normalization.
  
diff --git a/src/service_inspectors/http_inspect/http_enum.h b/src/service_inspectors/http_inspect/http_enum.h

index 9b2032dc5c4d5a24dfb9ec6cf553f85226a19a75..8615bb5c84dc7262ac38838a0f42a0ea8be33331 100755 (executable)
--- a/src/service_inspectors/http_inspect/http_enum.h
+++ b/src/service_inspectors/http_inspect/http_enum.h
@@ -63,7 +63,7 @@ enum PEG_COUNT { PEG_FLOW = 0, PEG_SCAN, PEG_REASSEMBLE, PEG_INSPECT, PEG_REQUES
      PEG_CONCURRENT_SESSIONS, PEG_MAX_CONCURRENT_SESSIONS, PEG_SCRIPT_DETECTION,
      PEG_PARTIAL_INSPECT, PEG_EXCESS_PARAMS, PEG_PARAMS, PEG_CUTOVERS, PEG_SSL_SEARCH_ABND_EARLY,
      PEG_PIPELINED_FLOWS, PEG_PIPELINED_REQUESTS, PEG_TOTAL_BYTES, PEG_JS_INLINE, PEG_JS_EXTERNAL,
-    PEG_SKIP_MIME_ATTACH, PEG_COUNT_MAX };
+    PEG_JS_PDF, PEG_SKIP_MIME_ATTACH, PEG_COUNT_MAX };
  
  // Result of scanning by splitter
  enum ScanResult { SCAN_NOT_FOUND, SCAN_NOT_FOUND_ACCELERATE, SCAN_FOUND, SCAN_FOUND_PIECE,
@@ -121,11 +121,12 @@ enum Contentcoding { CONTENTCODE__OTHER=1, CONTENTCODE_GZIP, CONTENTCODE_DEFLATE
      CONTENTCODE_XPRESS, CONTENTCODE_XZ };
  
  // Content media-types (MIME types)
-enum ContentType { CT__OTHER=1, CT_APPLICATION_JAVASCRIPT, CT_APPLICATION_ECMASCRIPT,
-    CT_APPLICATION_X_JAVASCRIPT, CT_APPLICATION_X_ECMASCRIPT, CT_APPLICATION_XHTML_XML,
-    CT_TEXT_JAVASCRIPT, CT_TEXT_JAVASCRIPT_1_0, CT_TEXT_JAVASCRIPT_1_1, CT_TEXT_JAVASCRIPT_1_2,
-    CT_TEXT_JAVASCRIPT_1_3, CT_TEXT_JAVASCRIPT_1_4, CT_TEXT_JAVASCRIPT_1_5, CT_TEXT_ECMASCRIPT,
-    CT_TEXT_X_JAVASCRIPT, CT_TEXT_X_ECMASCRIPT, CT_TEXT_JSCRIPT, CT_TEXT_LIVESCRIPT, CT_TEXT_HTML };
+enum ContentType { CT__OTHER=1, CT_APPLICATION_PDF, CT_APPLICATION_OCTET_STREAM,
+    CT_APPLICATION_JAVASCRIPT, CT_APPLICATION_ECMASCRIPT, CT_APPLICATION_X_JAVASCRIPT,
+    CT_APPLICATION_X_ECMASCRIPT, CT_APPLICATION_XHTML_XML, CT_TEXT_JAVASCRIPT,
+    CT_TEXT_JAVASCRIPT_1_0, CT_TEXT_JAVASCRIPT_1_1, CT_TEXT_JAVASCRIPT_1_2, CT_TEXT_JAVASCRIPT_1_3,
+    CT_TEXT_JAVASCRIPT_1_4, CT_TEXT_JAVASCRIPT_1_5, CT_TEXT_ECMASCRIPT, CT_TEXT_X_JAVASCRIPT,
+    CT_TEXT_X_ECMASCRIPT, CT_TEXT_JSCRIPT, CT_TEXT_LIVESCRIPT, CT_TEXT_HTML };
  
  // Transfer-Encoding header values
  enum TransferEncoding { TE__OTHER=1, TE_CHUNKED, TE_IDENTITY };
diff --git a/src/service_inspectors/http_inspect/http_flow_data.cc b/src/service_inspectors/http_inspect/http_flow_data.cc

index c8c7a210d85323c0b4dfa33a325849ff1776ed96..1ca2c94d3fcb7941ecd16018892a27e8f9248bbf 100644 (file)
--- a/src/service_inspectors/http_inspect/http_flow_data.cc
+++ b/src/service_inspectors/http_inspect/http_flow_data.cc
@@ -116,6 +116,7 @@ HttpFlowData::~HttpFlowData()
          if (fd_state[k] != nullptr)
              File_Decomp_StopFree(fd_state[k]);
          delete js_ctx[k];
+        delete js_ctx_mime[k];
      }
  
      delete_pipeline();
diff --git a/src/service_inspectors/http_inspect/http_flow_data.h b/src/service_inspectors/http_inspect/http_flow_data.h

index 2ae92e7153ac8e23d9be217e2465f118a6dc816e..69dcb64989984bd81adf72e0a3383af7f5e02689 100644 (file)
--- a/src/service_inspectors/http_inspect/http_flow_data.h
+++ b/src/service_inspectors/http_inspect/http_flow_data.h
@@ -208,6 +208,7 @@ private:
      void delete_pipeline();
  
      HttpJSNorm* js_ctx[2] = { nullptr, nullptr };
+    HttpJSNorm* js_ctx_mime[2] = { nullptr, nullptr };
      bool cutover_on_clear = false;
      bool ssl_search_abandoned = false;
  
diff --git a/src/service_inspectors/http_inspect/http_js_norm.cc b/src/service_inspectors/http_inspect/http_js_norm.cc

index 58b66214807bbfad18d633b5121c2e275561d57c..6744a6905b0e988cc3118b30418b5cd6d980eb7e 100644 (file)
--- a/src/service_inspectors/http_inspect/http_js_norm.cc
+++ b/src/service_inspectors/http_inspect/http_js_norm.cc
@@ -381,17 +381,11 @@ bool HttpInlineJSNorm::pre_proc()
      ext_script_type = false;
      output_size = jsn_ctx->script_size();
  
-    trace_logf(3, js_trace, TRACE_DUMP, packet,
-        "original[%zu]: %.*s\n", src_end - src_ptr, (int)(src_end - src_ptr), src_ptr);
-
      return true;
  }
  
  bool HttpInlineJSNorm::post_proc(int ret)
  {
-    trace_logf(3, js_trace, TRACE_PROC, DetectionEngine::get_current_packet(),
-        "normalizer returned with %d '%s'\n", ret, jsn::ret2str(ret));
-
      assert(http_events);
      assert(infractions);
  
@@ -431,16 +425,63 @@ bool HttpExternalJSNorm::pre_proc()
              "script continues\n");
      }
  
-    trace_logf(3, js_trace, TRACE_DUMP, packet,
-        "original[%zu]: %.*s\n", src_end - src_ptr, (int)(src_end - src_ptr), src_ptr);
-
      return true;
  }
  
  bool HttpExternalJSNorm::post_proc(int ret)
  {
-    trace_logf(3, js_trace, TRACE_PROC, DetectionEngine::get_current_packet(),
-        "normalizer returned with %d '%s'\n", ret, jsn::ret2str(ret));
+    script_continue = ret == (int)jsn::JSTokenizer::SCRIPT_CONTINUE;
+
+    return JSNorm::post_proc(ret);
+}
+
+bool HttpPDFJSNorm::pre_proc()
+{
+    if (src_ptr >= src_end)
+        return false;
+
+    const Packet* packet = DetectionEngine::get_current_packet();
+
+    if (!ext_script_type)
+    {
+        HttpModule::increment_peg_counts(PEG_JS_PDF);
+        trace_logf(1, js_trace, TRACE_PROC, packet,
+            "PDF starts\n");
+        ext_script_type = true;
+    }
+    else
+    {
+        trace_logf(2, js_trace, TRACE_PROC, packet,
+            "PDF continues\n");
+    }
+
+    // an input stream should not write to its buffer
+    pdf_in.rdbuf()->pubsetbuf(const_cast<char*>((const char*)src_ptr), src_end - src_ptr);
+    pdf_out.clear();
+    delete[] buf_pdf_out.take_data();
+
+    auto r = extractor.process();
+
+    if (r != PDFTokenizer::PDFRet::EOS)
+    {
+        trace_logf(2, js_trace, TRACE_PROC, DetectionEngine::get_current_packet(),
+            "pdf processing failed: %d\n", (int)r);
+        return false;
+    }
+
+    src_ptr = (const uint8_t*)buf_pdf_out.data();
+    src_end = src_ptr + buf_pdf_out.data_len();
+
+    // script object not found
+    if (!src_ptr)
+        return false;
+
+    return true;
+}
+
+bool HttpPDFJSNorm::post_proc(int ret)
+{
+    src_ptr = src_end; // one time per PDU, even if JS Normalizer has not finished
  
      script_continue = ret == (int)jsn::JSTokenizer::SCRIPT_CONTINUE;
  
diff --git a/src/service_inspectors/http_inspect/http_js_norm.h b/src/service_inspectors/http_inspect/http_js_norm.h

index ee61c14e0b8c5d6914e8d6b04eea6cd386e26f10..1b750f0e17d40a7505413896493ed242ff3c4a3d 100644 (file)
--- a/src/service_inspectors/http_inspect/http_js_norm.h
+++ b/src/service_inspectors/http_inspect/http_js_norm.h
@@ -22,9 +22,12 @@
  #define HTTP_JS_NORM_H
  
  #include <cstring>
+#include <FlexLexer.h>
  
  #include "js_norm/js_norm.h"
+#include "js_norm/pdf_tokenizer.h"
  #include "search_engines/search_tool.h"
+#include "utils/streambuf.h"
  
  #include "http_field.h"
  #include "http_flow_data.h"
@@ -88,5 +91,30 @@ protected:
      bool post_proc(int) override;
  };
  
+class HttpPDFJSNorm : public HttpJSNorm
+{
+public:
+    static bool is_pdf(const void* data, size_t len)
+    {
+        constexpr char magic[] = "%PDF-1.";
+        constexpr int magic_len = sizeof(magic) - 1;
+        return magic_len < len and !strncmp((const char*)data, magic, magic_len);
+    }
+
+    HttpPDFJSNorm(JSNormConfig* jsn_config, uint64_t tid) :
+        HttpJSNorm(jsn_config), pdf_out(&buf_pdf_out), extractor(pdf_in, pdf_out)
+    { trans_num = tid; }
+
+protected:
+    bool pre_proc() override;
+    bool post_proc(int) override;
+
+private:
+    snort::ostreambuf_infl buf_pdf_out;
+    std::istringstream pdf_in;
+    std::ostream pdf_out;
+    jsn::PDFTokenizer extractor;
+};
+
  #endif
  
diff --git a/src/service_inspectors/http_inspect/http_msg_body.cc b/src/service_inspectors/http_inspect/http_msg_body.cc

index 6ab2b765b42b692c1507c2a913b9722f1e542e18..1102dcc6a8092dfdb1a393fff08d0aac2dd15728 100644 (file)
--- a/src/service_inspectors/http_inspect/http_msg_body.cc
+++ b/src/service_inspectors/http_inspect/http_msg_body.cc
@@ -504,13 +504,48 @@ HttpJSNorm* HttpMsgBody::acquire_js_ctx()
          js_ctx = new HttpInlineJSNorm(jsn_config, trans_num, params->js_norm_param.mpse_otag,
              params->js_norm_param.mpse_attr);
          break;
+
+    case CT_APPLICATION_PDF:
+        js_ctx = new HttpPDFJSNorm(jsn_config, trans_num);
+        break;
+
+    case CT_APPLICATION_OCTET_STREAM:
+        js_ctx = first_body and HttpPDFJSNorm::is_pdf(decompressed_file_body.start(), decompressed_file_body.length()) ?
+            new HttpPDFJSNorm(jsn_config, trans_num) : nullptr;
+        break;
      }
  
      session_data->js_ctx[source_id] = js_ctx;
+    return js_ctx;
+}
+
+HttpJSNorm* HttpMsgBody::acquire_js_ctx_mime() // returns the MIME JS context for source_id, or nullptr when the current file body does not pass is_pdf()
+{
+    HttpJSNorm* js_ctx = session_data->js_ctx_mime[source_id];
+
+    if (js_ctx)
+    {
+        if (js_ctx->get_trans_num() == trans_num)
+            return js_ctx; // stored context carries the current transaction number: reuse it
+
+        delete js_ctx; // stored context carries a different transaction number: discard it
+        js_ctx = nullptr;
+    }
  
+    JSNormConfig* jsn_config = get_inspection_policy()->jsn_config;
+    js_ctx = HttpPDFJSNorm::is_pdf(decompressed_file_body.start(), decompressed_file_body.length()) ? // only a PDF normalizer is ever created here
+        new HttpPDFJSNorm(jsn_config, trans_num) : nullptr;
+
+    session_data->js_ctx_mime[source_id] = js_ctx; // stored even when nullptr
      return js_ctx;
  }
  
+// Destroys the MIME JS context stored for source_id and leaves the slot empty.
+void HttpMsgBody::clear_js_ctx_mime()
+{
+    HttpJSNorm*& mime_slot = session_data->js_ctx_mime[source_id];
+
+    delete mime_slot;
+    mime_slot = nullptr;
+}
+
  void HttpMsgBody::do_file_processing(const Field& file_data)
  {
      // Using the trick that cutter is deleted when regular or chunked body is complete
@@ -582,19 +617,37 @@ bool HttpMsgBody::run_detection(snort::Packet* p)
          return false;
      if ((mime_bufs != nullptr) && !mime_bufs->empty())
      {
+        HttpJSNorm* js_ctx_tmp = nullptr;
          auto mb = mime_bufs->cbegin();
+        uint32_t mime_bufs_size = mime_bufs->size();
+
          for (uint32_t count = 0; (count < params->max_mime_attach) && (mb != mime_bufs->cend());
              ++count, ++mb)
          {
+            bool is_last_attachment = ((count + 1 == mime_bufs_size) ||
+                (count + 1 == params->max_mime_attach));
              const uint64_t idx = get_header(source_id)->get_multi_file_processing_id();
              set_file_data(mb->file.start(), mb->file.length(), idx,
                  count or mb->file.is_accumulated(),
                  std::next(mb) != mime_bufs->end() or last_attachment_complete);
              if (mb->vba.length() > 0)
                  ole_data.set(mb->vba.length(), mb->vba.start());
+            decompressed_file_body.reset();
+            decompressed_file_body.set(mb->file.length(), mb->file.start());
+
+            js_ctx_tmp = session_data->js_ctx[source_id];
+            session_data->js_ctx[source_id] = acquire_js_ctx_mime();
+
              DetectionEngine::detect(p);
+
+            if (!is_last_attachment || last_attachment_complete)
+                clear_js_ctx_mime();
+
+            session_data->js_ctx[source_id] = js_ctx_tmp;
+
              ole_data.reset();
              decompressed_vba_data.reset();
+            decompressed_file_body.reset();
          }
          if (mb != mime_bufs->cend())
          {
diff --git a/src/service_inspectors/http_inspect/http_msg_body.h b/src/service_inspectors/http_inspect/http_msg_body.h

index 10ceb6b17b9266c889a3b24370844908ec57a5eb..48ef219cb1d14512c74016fd1ddcd47e034dab64 100644 (file)
--- a/src/service_inspectors/http_inspect/http_msg_body.h
+++ b/src/service_inspectors/http_inspect/http_msg_body.h
@@ -73,6 +73,8 @@ private:
      void do_file_decompression(const Field& input, Field& output);
      void do_legacy_js_normalization(const Field& input, Field& output);
      HttpJSNorm* acquire_js_ctx();
+    HttpJSNorm* acquire_js_ctx_mime();
+    void clear_js_ctx_mime();
  
      void clean_partial(uint32_t& partial_inspected_octets, uint32_t& partial_detect_length,
          uint8_t*& partial_detect_buffer,  uint32_t& partial_js_detect_length);
diff --git a/src/service_inspectors/http_inspect/http_tables.cc b/src/service_inspectors/http_inspect/http_tables.cc

index 1db07a9e368f81f2819861fc611750d682b953e3..ab2bb07efb420c5459bd9406cae80a3ddbafa930 100755 (executable)
--- a/src/service_inspectors/http_inspect/http_tables.cc
+++ b/src/service_inspectors/http_inspect/http_tables.cc
@@ -168,6 +168,8 @@ const StrCode HttpMsgHeadShared::content_code_list[] =
  
  const StrCode HttpMsgHeadShared::content_type_list[] =
  {
+    { CT_APPLICATION_PDF,          "application/pdf" },
+    { CT_APPLICATION_OCTET_STREAM, "application/octet-stream" },
      { CT_APPLICATION_JAVASCRIPT,   "application/javascript" },
      { CT_APPLICATION_ECMASCRIPT,   "application/ecmascript" },
      { CT_APPLICATION_X_JAVASCRIPT, "application/x-javascript" },
@@ -383,6 +385,7 @@ const PegInfo HttpModule::peg_names[PEG_COUNT_MAX+1] =
      { CountType::SUM, "total_bytes", "total HTTP data bytes inspected" },
      { CountType::SUM, "js_inline_scripts", "total number of inline JavaScripts processed" },
      { CountType::SUM, "js_external_scripts", "total number of external JavaScripts processed" },
+    { CountType::SUM, "js_pdf_scripts", "total number of PDF JavaScripts processed" },
      { CountType::SUM, "skip_mime_attach", "total number of HTTP requests with too many MIME attachments to inspect" },
      { CountType::END, nullptr, nullptr }
  };
author	Oleksii Shumeiko -X (oshumeik - SOFTSERVE INC at Cisco) <oshumeik@cisco.com>
	Tue, 29 Nov 2022 13:56:17 +0000 (13:56 +0000)
committer	Oleksii Shumeiko -X (oshumeik - SOFTSERVE INC at Cisco) <oshumeik@cisco.com>
	Tue, 29 Nov 2022 13:56:17 +0000 (13:56 +0000)
src/js_norm/CMakeLists.txt		patch \| blob \| blame \| history
src/js_norm/dev_notes.txt		patch \| blob \| blame \| history
src/js_norm/js_enum.h		patch \| blob \| blame \| history
src/js_norm/js_norm.cc		patch \| blob \| blame \| history
src/js_norm/js_norm_module.cc		patch \| blob \| blame \| history
src/js_norm/js_normalizer.cc		patch \| blob \| blame \| history
src/js_norm/js_tokenizer.h		patch \| blob \| blame \| history
src/js_norm/js_tokenizer.l		patch \| blob \| blame \| history
src/js_norm/pdf_tokenizer.h	[new file with mode: 0644]	patch \| blob
src/js_norm/pdf_tokenizer.l	[new file with mode: 0644]	patch \| blob
src/js_norm/test/CMakeLists.txt		patch \| blob \| blame \| history
src/js_norm/test/js_test_options.cc		patch \| blob \| blame \| history
src/js_norm/test/js_test_stubs.cc		patch \| blob \| blame \| history
src/js_norm/test/pdf_tokenizer_test.cc	[new file with mode: 0644]	patch \| blob
src/service_inspectors/http_inspect/dev_notes_js_norm.txt		patch \| blob \| blame \| history
src/service_inspectors/http_inspect/http_enum.h		patch \| blob \| blame \| history
src/service_inspectors/http_inspect/http_flow_data.cc		patch \| blob \| blame \| history
src/service_inspectors/http_inspect/http_flow_data.h		patch \| blob \| blame \| history
src/service_inspectors/http_inspect/http_js_norm.cc		patch \| blob \| blame \| history
src/service_inspectors/http_inspect/http_js_norm.h		patch \| blob \| blame \| history
src/service_inspectors/http_inspect/http_msg_body.cc		patch \| blob \| blame \| history
src/service_inspectors/http_inspect/http_msg_body.h		patch \| blob \| blame \| history
src/service_inspectors/http_inspect/http_tables.cc		patch \| blob \| blame \| history