+FLEX ( pdf_tokenizer
+ ${CMAKE_CURRENT_SOURCE_DIR}/pdf_tokenizer.l
+ ${CMAKE_CURRENT_BINARY_DIR}/pdf_tokenizer.cc
+)
+
FLEX ( js_tokenizer
${CMAKE_CURRENT_SOURCE_DIR}/js_tokenizer.l
${CMAKE_CURRENT_BINARY_DIR}/js_tokenizer.cc
)
set ( JS_SOURCES
+ ${pdf_tokenizer_OUTPUTS}
${js_tokenizer_OUTPUTS}
js_config.h
js_enum.h
js_normalizer.cc
js_normalizer.h
js_tokenizer.h
+ pdf_tokenizer.h
)
add_library(js_norm OBJECT ${JS_SOURCES})
If Normalizer overruns the configured limit, built-in alert is generated.
A config option to set the limit manually:
- * js_norm.identifier_depth.
+ * js_norm.identifier_depth
Identifiers from the ident_ignore list will be placed as is, without substitution. Starting with
the listed identifier, any chain of dot accessors, brackets and function calls will be kept
4. Temporary buffer (debug build only)
5. Matched token (debug build only)
6. Identifier substitution (debug build only)
+
+PDF parser follows "PDF 32000-1:2008 First Edition 2008-7-1 Document
+management Portable document format Part 1: PDF 1.7".
+Known limitations:
+* Nested dictionaries are not fully supported. Properties of the last object
+are tracked. Once the nested object ends, it clears all info about the object
+type.
+* Nested dictionaries are not allowed in JavaScript-type dictionary.
+* Stream objects are ignored.
enum
{
TRACE_PROC = 0,
- TRACE_DUMP
+ TRACE_DUMP,
+ TRACE_PDF_PROC,
+ TRACE_PDF_DUMP
};
// This enum must be synchronized with JSNormModule::peg_names[] in js_norm_module.cc
#include "js_norm.h"
+#include "log/messages.h"
+#include "trace/trace_api.h"
+
#include "js_identifier_ctx.h"
#include "js_normalizer.h"
-
#include "js_norm_module.h"
using namespace jsn;
}
pdu_cnt = 0;
+ const Packet* packet = DetectionEngine::get_current_packet();
src_ptr = (const uint8_t*)in_data;
src_end = src_ptr + in_len;
while (alive and pre_proc())
{
+ trace_logf(3, js_trace, TRACE_DUMP, packet,
+ "original[%zu]: %.*s\n", src_end - src_ptr, (int)(src_end - src_ptr), src_ptr);
+
auto ret = jsn_ctx->normalize((const char*)src_ptr, src_end - src_ptr, ext_script_type);
const uint8_t* next = (const uint8_t*)jsn_ctx->get_src_next();
+ trace_logf(3, js_trace, TRACE_PROC, packet,
+ "normalizer returned with %d '%s'\n", ret, jsn::ret2str(ret));
+
JSNormModule::increment_peg_counts(PEG_BYTES, next - src_ptr);
src_ptr = next;
static const TraceOption trace_options[] =
{
- { "proc", TRACE_PROC, "enable processing logging" },
- { "dump", TRACE_DUMP, "enable data logging" },
+ { "proc", TRACE_PROC, "enable processing logging" },
+ { "dump", TRACE_DUMP, "enable data logging" },
+#ifdef DEBUG_MSGS
+ { "pdf_proc", TRACE_PDF_PROC, "enable processing logging for PDF extractor" },
+ { "pdf_dump", TRACE_PDF_DUMP, "enable data logging for PDF extractor" },
+#endif
{ nullptr, 0, nullptr }
};
#include "js_normalizer.h"
#include "js_norm/js_enum.h"
+#include "log/messages.h"
+#include "trace/trace_api.h"
+
+namespace snort
+{
+class Trace;
+}
+
+extern THREAD_LOCAL const snort::Trace* js_trace;
#define BUFF_EXP_FACTOR 1.3
#include <stack>
#include <vector>
-#include "log/messages.h"
-#include "trace/trace_api.h"
-
-extern THREAD_LOCAL const snort::Trace* js_trace;
-
// The longest pattern has 9 characters " < / s c r i p t > ",
// 8 of them can reside in 1st chunk
// Each character in the identifier forms its own group (pattern matching case),
%option c++
%option yyclass="JSTokenizer"
+%option prefix="js"
%option align full 8bit batch never-interactive
%option noinput nounput noyywrap
%option noyy_push_state noyy_pop_state noyy_top_state
#include "js_norm/js_enum.h"
#include "js_norm/js_identifier_ctx.h"
#include "js_norm/js_tokenizer.h"
+#include "log/messages.h"
+#include "trace/trace_api.h"
#include "utils/util_cstring.h"
+extern THREAD_LOCAL const snort::Trace* js_trace;
+
using namespace jsn;
#define YY_NO_UNPUT
--- /dev/null
+//--------------------------------------------------------------------------
+// Copyright (C) 2022-2022 Cisco and/or its affiliates. All rights reserved.
+//
+// This program is free software; you can redistribute it and/or modify it
+// under the terms of the GNU General Public License Version 2 as published
+// by the Free Software Foundation. You may not use, modify or distribute
+// this program under any other version of the GNU General Public License.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License along
+// with this program; if not, write to the Free Software Foundation, Inc.,
+// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+//--------------------------------------------------------------------------
+// pdf_tokenizer.h author Cisco
+
+#ifndef PDF_TOKENIZER_H
+#define PDF_TOKENIZER_H
+
+#include <array>
+#include <cstring>
+#include <sstream>
+#include <stack>
+#include <vector>
+
+#define PDFTOKENIZER_NAME_MAX_SIZE 16
+
+namespace jsn
+{
+
+// Extracts JavaScript source from PDF content: scans dictionary objects for
+// the /JS key and writes the decoded string value (literal or hexadecimal
+// form) to the output stream. Streams are supplied by the owner; scanner
+// state persists across process() calls, so input can be fed incrementally.
+class PDFTokenizer : public yyFlexLexer
+{
+public:
+    enum PDFRet
+    {
+        EOS = 0,                        // input fully consumed, no errors
+        NOT_NAME_IN_DICTIONARY_KEY,     // dictionary key is not a name object (7.3.7)
+        INCOMPLETE_ARRAY_IN_DICTIONARY, // '>>' seen before closing ']'
+        MAX
+    };
+
+    PDFTokenizer() = delete;
+    explicit PDFTokenizer(std::istream& in, std::ostream& out);
+    ~PDFTokenizer() override;
+
+    // Runs the scanner until end of input; returns EOS or the first error
+    PDFRet process();
+
+private:
+    int yylex() override;
+
+    // Handlers invoked from scanner actions
+    PDFRet h_dict_open();
+    PDFRet h_dict_close();
+    PDFRet h_dict_name();
+    PDFRet h_dict_other();
+    inline bool h_lit_str();
+    inline bool h_hex_str();
+    inline bool h_lit_open();
+    inline bool h_lit_close();
+    PDFRet h_lit_unescape();
+    PDFRet h_lit_oct2chr();
+    PDFRet h_hex_hex2chr();
+
+    // Literal string state: balanced-parenthesis depth inside ( ... )
+    struct ObjectString
+    {
+        void clear()
+        { parenthesis_level = 0; }
+
+        int parenthesis_level = 0;
+    };
+
+    // Array state: [ ... ] nesting depth while inside a dictionary
+    struct ObjectArray
+    {
+        void clear()
+        { nesting_level = 0; }
+
+        int nesting_level = 0;
+    };
+
+    // Dictionary state: key/value alternation flag and the array nesting
+    // level captured when the dictionary was opened
+    struct ObjectDictionary
+    {
+        void clear()
+        { key_value = true; array_level = 0; }
+
+        bool key_value = true;
+        int array_level = 0;
+    };
+
+    // Most recent dictionary key; longer names are truncated to fit
+    struct DictionaryEntry
+    {
+        void clear()
+        { key[0] = '\0'; }
+
+        char key[PDFTOKENIZER_NAME_MAX_SIZE] = {0};
+    };
+
+    ObjectString obj_string;
+    ObjectArray obj_array;
+    ObjectDictionary obj_dictionary;
+    DictionaryEntry obj_entry;
+};
+
+// True when the upcoming literal string is the value of a /JS entry at the
+// dictionary's own array level, i.e. script source to be extracted
+bool PDFTokenizer::h_lit_str()
+{
+    return obj_dictionary.array_level == obj_array.nesting_level and !strcmp(obj_entry.key, "/JS");
+}
+
+// Same check for a hexadecimal string value of a /JS entry
+bool PDFTokenizer::h_hex_str()
+{
+    return obj_dictionary.array_level == obj_array.nesting_level and !strcmp(obj_entry.key, "/JS");
+}
+
+// True only for the outermost '('; nested parentheses are string content
+bool PDFTokenizer::h_lit_open()
+{
+    return ++obj_string.parenthesis_level == 1;
+}
+
+// True when the matching outermost ')' closes the literal string
+bool PDFTokenizer::h_lit_close()
+{
+    return --obj_string.parenthesis_level == 0;
+}
+
+}
+
+#endif
--- /dev/null
+/*--------------------------------------------------------------------------
+// Copyright (C) 2022-2022 Cisco and/or its affiliates. All rights reserved.
+//
+// This program is free software; you can redistribute it and/or modify it
+// under the terms of the GNU General Public License Version 2 as published
+// by the Free Software Foundation. You may not use, modify or distribute
+// this program under any other version of the GNU General Public License.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License along
+// with this program; if not, write to the Free Software Foundation, Inc.,
+// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+//--------------------------------------------------------------------------
+// pdf_tokenizer.l author Cisco
+*/
+
+%option c++
+%option yyclass="PDFTokenizer"
+%option prefix="pdf"
+%option align full 8bit batch never-interactive stack
+%option noinput nounput noyywrap noyy_top_state
+
+%{
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+
+#include <algorithm>
+#include <cassert>
+#include <stdio.h>
+
+#include "js_norm/js_enum.h"
+#include "js_norm/pdf_tokenizer.h"
+#include "log/messages.h"
+#include "trace/trace_api.h"
+
+extern THREAD_LOCAL const snort::Trace* js_trace;
+
+using namespace jsn;
+
+#define YY_NO_UNPUT
+
+#define YY_FATAL_ERROR(msg) { snort::FatalError("%s", msg); }
+
+#define PUSH(x) yy_push_state(x)
+#define POP() yy_pop_state()
+
+#define YY_USER_ACTION \
+ { \
+ debug_logf(5, js_trace, TRACE_PDF_PROC, nullptr, \
+ "PDF pattern #%d, sc %d\n", yy_act, YY_START); \
+ \
+ debug_logf(5, js_trace, TRACE_PDF_DUMP, nullptr, \
+ "PDF text '%s'\n", YYText()); \
+ }
+
+#define EXEC(f) \
+ { \
+ auto r = (f); \
+ if (r) \
+ return r; \
+ }
+
+%}
+
+/* PDF 32000-1:2008 definitions follow */
+
+/* 7.2.2 Character Set */
+CHARS_WHITESPACE \x00\x09\x0a\x0c\x0d\x20
+CHARS_DELIMITER \(\)\<\>\[\]\{\}\/\%
+GRP_WHITESPACE [\x00\x09\x0a\x0c\x0d\x20]
+GRP_DELIMITER [\(\)\<\>\[\]\{\}\/\%]
+GRP_REGULAR [^\x00\x09\x0a\x0c\x0d\x20\(\)\<\>\[\]\{\}\/\%]
+
+/* 7.2.3 Comments */
+COMMENT %.*
+
+/* 7.3.2 Boolean Objects */
+OBJ_BOOLEAN true|false
+
+/* 7.3.3 Numeric Objects */
+OBJ_INT_NUM [+-]?[0-9]{1,64}
+OBJ_REL_NUM [+-]?("."?[0-9]{1,64}|[0-9]{1,64}"."?|[0-9]{1,64}"."?[0-9]{1,64})
+
+/* 7.3.4 String Objects */
+OBJ_LIT_STR_OPEN "("
+OBJ_LIT_STR_CLOSE ")"
+OBJ_HEX_STR_OPEN "<"
+OBJ_HEX_STR_CLOSE ">"
+
+/* 7.3.4.2 Literal Strings */
+LIT_STR_ESC \\[^0-7]
+LIT_STR_ESC_OCT \\[0-7]{1}|\\[0-7]{2}|\\[0-7]{3}
+LIT_STR_ESC_EOL \\[\x0d\x0a]|\\\x0d\x0a
+LIT_STR_EOL [\x0d\x0a]|\x0d\x0a
+LIT_STR_BODY [^\\\(\)]{1,64}
+
+/* 7.3.4.3 Hexadecimal Strings */
+HEX_STR_BODY [0-9A-Fa-f]{1,64}
+HEX_STR_SKIP [^0-9A-Fa-f>]{1,64}
+
+/* 7.3.5 Name Objects */
+OBJ_NAME \/{GRP_REGULAR}{1,256}
+
+/* 7.3.6 Array Objects */
+OBJ_ARRAY_OPEN "["
+OBJ_ARRAY_CLOSE "]"
+
+/* 7.3.7 Dictionary Objects */
+OBJ_DICT_OPEN "<<"
+OBJ_DICT_CLOSE ">>"
+
+/* FIXIT: improve byte consumption (avoid falling back to one byte at a time) */
+OBJ_DICT_SKIP .
+
+/* 7.3.8 Stream Objects */
+OBJ_STREAM_OPEN stream$
+OBJ_STREAM_CLOSE ^endstream
+
+/* 7.3.9 Null Object */
+OBJ_NULL null
+
+/* 7.3.10 Indirect Objects */
+INDIRECT_OBJ {OBJ_INT_NUM}{GRP_WHITESPACE}+{OBJ_INT_NUM}{GRP_WHITESPACE}+obj
+RECORD_OBJ {OBJ_INT_NUM}{GRP_WHITESPACE}+{OBJ_INT_NUM}{GRP_WHITESPACE}+R
+
+/* Not dictionary, not strings */
+SKIP [^<\(%]{1,64}
+WHITESPACE {GRP_WHITESPACE}{1,64}
+
+/* Start conditions: INITIAL or inside dictionary, literal string, hexadecimal string */
+%x dictnr
+%x litstr
+%x hexstr
+%x jslstr
+%x jshstr
+
+%%
+
+{SKIP} { }
+{COMMENT} { }
+
+<INITIAL,dictnr>{OBJ_DICT_OPEN} { PUSH(dictnr); EXEC(h_dict_open()) }
+<dictnr>{OBJ_DICT_CLOSE} { POP(); EXEC(h_dict_close()) }
+<dictnr>{COMMENT} { }
+<dictnr>{WHITESPACE} { }
+<dictnr>{RECORD_OBJ} { EXEC(h_dict_other()) }
+<dictnr>{OBJ_BOOLEAN} { EXEC(h_dict_other()) }
+<dictnr>{OBJ_INT_NUM} { EXEC(h_dict_other()) }
+<dictnr>{OBJ_REL_NUM} { EXEC(h_dict_other()) }
+<dictnr>{OBJ_NULL} { EXEC(h_dict_other()) }
+<dictnr>{OBJ_NAME} { EXEC(h_dict_name()) }
+<dictnr>{OBJ_ARRAY_OPEN} { ++obj_array.nesting_level; EXEC(h_dict_other()) }
+<dictnr>{OBJ_ARRAY_CLOSE} { --obj_array.nesting_level; EXEC(h_dict_other()) }
+<dictnr>{OBJ_LIT_STR_OPEN} { EXEC(h_dict_other()) if (h_lit_str()) PUSH(jslstr); else PUSH(litstr); yyless(0); }
+<dictnr>{OBJ_HEX_STR_OPEN} { EXEC(h_dict_other()) if (h_hex_str()) PUSH(jshstr); else PUSH(hexstr); yyless(0); }
+<dictnr>{OBJ_DICT_SKIP} { }
+
+<INITIAL>{OBJ_LIT_STR_OPEN} { if (h_lit_open()) PUSH(litstr); }
+<litstr>{OBJ_LIT_STR_OPEN} { h_lit_open(); }
+<litstr>{OBJ_LIT_STR_CLOSE} { if (h_lit_close()) POP(); }
+<litstr>{LIT_STR_ESC} { }
+<litstr>{LIT_STR_ESC_OCT} { }
+<litstr>{LIT_STR_ESC_EOL} { }
+<litstr>{LIT_STR_EOL} { }
+<litstr>{LIT_STR_BODY} { }
+
+<INITIAL>{OBJ_HEX_STR_OPEN} { PUSH(hexstr); }
+<hexstr>{OBJ_HEX_STR_CLOSE} { POP(); }
+<hexstr>{HEX_STR_BODY} { }
+<hexstr>{HEX_STR_SKIP} { }
+
+<jslstr>{OBJ_LIT_STR_OPEN} { if (!h_lit_open()) ECHO; }
+<jslstr>{OBJ_LIT_STR_CLOSE} { if (h_lit_close()) POP(); else ECHO; }
+<jslstr>{LIT_STR_ESC} { EXEC(h_lit_unescape()) }
+<jslstr>{LIT_STR_ESC_OCT} { EXEC(h_lit_oct2chr()) }
+<jslstr>{LIT_STR_ESC_EOL}{WHITESPACE} { }
+<jslstr>{LIT_STR_EOL} { ECHO; }
+<jslstr>{LIT_STR_BODY} { ECHO; }
+
+<jshstr>{OBJ_HEX_STR_OPEN} { }
+<jshstr>{OBJ_HEX_STR_CLOSE} { POP(); }
+<jshstr>{HEX_STR_BODY} { EXEC(h_hex_hex2chr()) }
+<jshstr>{HEX_STR_SKIP} { }
+
+<<EOF>> { return PDFRet::EOS; }
+
+%%
+
+// '<<' handler: resets per-dictionary state and records the array nesting
+// level at which the dictionary was opened (verified again at '>>')
+PDFTokenizer::PDFRet PDFTokenizer::h_dict_open()
+{
+    obj_dictionary.clear();
+    obj_dictionary.array_level = obj_array.nesting_level;
+
+    debug_logf(6, js_trace, TRACE_PDF_PROC, nullptr,
+        "dictionary open, at array level %d\n", obj_array.nesting_level);
+
+    return PDFRet::EOS;
+}
+
+// '>>' handler: verifies that every array opened inside this dictionary was
+// closed, then resets the per-dictionary state
+PDFTokenizer::PDFRet PDFTokenizer::h_dict_close()
+{
+    // Save the level recorded at '<<' before clear() resets it to 0;
+    // otherwise the comparison below always runs against 0 and a dictionary
+    // legitimately nested inside an array ("<</K [ <<...>> ] >>") is
+    // misreported as an incomplete array.
+    const int open_level = obj_dictionary.array_level;
+
+    obj_dictionary.clear();
+
+    debug_logf(6, js_trace, TRACE_PDF_PROC, nullptr,
+        "dictionary close, at array level %d\n", obj_array.nesting_level);
+
+    if (open_level != obj_array.nesting_level)
+        return PDFRet::INCOMPLETE_ARRAY_IN_DICTIONARY;
+
+    return PDFRet::EOS;
+}
+
+// Any non-name token in a dictionary: valid in value position only, since
+// 7.3.7 requires dictionary keys to be name objects
+PDFTokenizer::PDFRet PDFTokenizer::h_dict_other()
+{
+    // inside a sub-array, key/value alternation is not tracked
+    if (obj_dictionary.array_level != obj_array.nesting_level)
+        return PDFRet::EOS;
+
+    if (obj_dictionary.key_value)
+        return PDFRet::NOT_NAME_IN_DICTIONARY_KEY;
+
+    debug_logf(6, js_trace, TRACE_PDF_PROC, nullptr,
+        "dictionary token: other\n");
+
+    // the token was a value; a key is expected next
+    obj_dictionary.key_value = !obj_dictionary.key_value;
+
+    return PDFRet::EOS;
+}
+
+// Name object in a dictionary: stored as the current key when in key
+// position, otherwise consumed as a value; the alternation flag then flips
+PDFTokenizer::PDFRet PDFTokenizer::h_dict_name()
+{
+    // names inside a sub-array do not participate in key/value alternation
+    if (obj_dictionary.array_level != obj_array.nesting_level)
+        return PDFRet::EOS;
+
+    if (obj_dictionary.key_value)
+        // NOTE(review): names longer than PDFTOKENIZER_NAME_MAX_SIZE - 1 are
+        // silently truncated; the final byte stays '\0' from initialization
+        strncpy(obj_entry.key, yytext, sizeof(obj_entry.key) - 1);
+
+    obj_dictionary.key_value = !obj_dictionary.key_value;
+
+    // after the flip: key_value == true means the name above was a value
+    debug_logf(6, js_trace, TRACE_PDF_PROC, nullptr,
+        "dictionary token: name as %s\n", obj_dictionary.key_value ? "value" : "key");
+
+    debug_logf(6, js_trace, TRACE_PDF_DUMP, nullptr,
+        "dictionary entry: %s, %s\n", obj_entry.key, obj_dictionary.key_value ? yytext : "...");
+
+    return PDFRet::EOS;
+}
+
+// Decodes a two-character escape sequence inside a /JS literal string and
+// writes the resulting character to the output stream
+PDFTokenizer::PDFRet PDFTokenizer::h_lit_unescape()
+{
+    assert(yyleng == 2);
+    assert(yytext[0] == '\\');
+
+    char c;
+
+    // 7.3.4.2 Literal Strings, Table 3 Escape sequences in literal strings
+    switch (yytext[1])
+    {
+    case 'n': c = '\n'; break;
+    case 'r': c = '\r'; break;
+    case 't': c = '\t'; break;
+    case 'b': c = '\b'; break;
+    case 'f': c = '\f'; break;
+    case '(': c = '('; break;
+    case ')': c = ')'; break;
+    case '\\': c = '\\'; break;
+    // unknown escape: the backslash is dropped, the character is kept
+    default: c = yytext[1];
+    }
+
+    yyout << c;
+
+    return PDFRet::EOS;
+}
+
+// Decodes an octal escape ("\d", "\dd" or "\ddd") inside a /JS literal
+// string and writes the resulting character to the output stream
+PDFTokenizer::PDFRet PDFTokenizer::h_lit_oct2chr()
+{
+    assert(0 < yyleng and yyleng < 5);
+    assert(yytext[0] == '\\');
+
+    unsigned v = 0;
+    sscanf(yytext + 1, "%o", &v);
+
+    // 7.3.4.2: a three-digit code may exceed one byte (up to "\777" == 511);
+    // the high-order overflow shall be ignored, so keep the low byte only
+    // instead of relying on implementation-defined narrowing
+    const char c = (char)(v & 0xff);
+    yyout << c;
+
+    debug_logf(6, js_trace, TRACE_PDF_DUMP, nullptr,
+        "literal string, %s to %c \n", yytext, c);
+
+    return PDFRet::EOS;
+}
+
+// Converts a run of hexadecimal digits from a /JS hex string to raw bytes.
+// 7.3.4.3: with an odd number of digits the final digit behaves as if
+// followed by 0, hence the (v << 4) for the trailing nibble.
+PDFTokenizer::PDFRet PDFTokenizer::h_hex_hex2chr()
+{
+    // process complete digit pairs first
+    int len = yyleng & ~1;
+    const char* ptr = yytext;
+    const char* end = yytext + len;
+
+    while (ptr < end)
+    {
+        unsigned v;
+        sscanf(ptr, "%02x", &v);
+        yyout << (char)v;
+        ptr += 2;
+    }
+
+    // odd trailing digit becomes the high nibble of the last byte
+    if (len != yyleng)
+    {
+        unsigned v;
+        sscanf(ptr, "%01x", &v);
+        yyout << (char)(v << 4);
+    }
+
+    debug_logf(6, js_trace, TRACE_PDF_DUMP, nullptr,
+        "literal string, in hex: %s\n", yytext);
+
+    return PDFRet::EOS;
+}
+
+// in: PDF payload to scan; out: sink for the extracted JavaScript source
+PDFTokenizer::PDFTokenizer(std::istream& in, std::ostream& out)
+    : yyFlexLexer(in, out)
+{
+}
+
+PDFTokenizer::~PDFTokenizer()
+{
+}
+
+// Runs the scanner to completion; EOS means all input was consumed cleanly
+PDFTokenizer::PDFRet PDFTokenizer::process()
+{
+    auto r = yylex();
+
+    return (PDFRet)r;
+}
${CMAKE_CURRENT_BINARY_DIR}/../js_tokenizer.cc
)
+FLEX ( pdf_tokenizer
+ ${CMAKE_CURRENT_SOURCE_DIR}/../pdf_tokenizer.l
+ ${CMAKE_CURRENT_BINARY_DIR}/../pdf_tokenizer.cc
+)
+
add_catch_test( js_normalizer_test
SOURCES
${js_tokenizer_OUTPUTS}
${CMAKE_SOURCE_DIR}/src/utils/streambuf.cc
js_test_stubs.cc
)
+
+add_catch_test( pdf_tokenizer_test
+ SOURCES
+ ${pdf_tokenizer_OUTPUTS}
+ js_test_stubs.cc
+)
#include "js_test_options.h"
+#include <assert.h>
+
Config::Config(const Config& other) : type(other.type)
{
switch (other.type)
void TraceApi::filter(const Packet&) { }
int DetectionEngine::queue_event(unsigned int, unsigned int) { return 0; }
+Packet* DetectionEngine::get_current_packet() { return nullptr; }
}
--- /dev/null
+//--------------------------------------------------------------------------
+// Copyright (C) 2022-2022 Cisco and/or its affiliates. All rights reserved.
+//
+// This program is free software; you can redistribute it and/or modify it
+// under the terms of the GNU General Public License Version 2 as published
+// by the Free Software Foundation. You may not use, modify or distribute
+// this program under any other version of the GNU General Public License.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License along
+// with this program; if not, write to the Free Software Foundation, Inc.,
+// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+//--------------------------------------------------------------------------
+// pdf_tokenizer_test.cc author Cisco
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <string>
+#include <vector>
+
+#include <FlexLexer.h>
+
+#include "catch/catch.hpp"
+#include "js_norm/pdf_tokenizer.h"
+
+using namespace jsn;
+using namespace std;
+
+typedef pair<string, string> Chunk;
+
+// Feeds 'source' through a fresh PDFTokenizer in one shot and checks both
+// the return code and the extracted script text
+static void test_pdf_proc(const string& source, const string& expected,
+    PDFTokenizer::PDFRet ret = PDFTokenizer::PDFRet::EOS)
+{
+    istringstream in(source);
+    ostringstream out;
+    PDFTokenizer extractor(in, out);
+
+    auto r = extractor.process();
+
+    CHECK(ret == r);
+    CHECK(expected == out.str());
+}
+
+// Feeds chunks one by one through a single PDFTokenizer instance to verify
+// that scanner state survives across buffer boundaries.
+// NOTE(review): relies on istringstream's pubsetbuf() replacing the get
+// area, which is implementation-defined for std::stringbuf — confirm the
+// behavior on every supported toolchain.
+static void test_pdf_proc(const vector<Chunk>& chunks)
+{
+    istringstream in;
+    ostringstream out;
+    PDFTokenizer extractor(in, out);
+
+    for (auto& chunk : chunks)
+    {
+        auto src = chunk.first;
+        auto exp = chunk.second;
+
+        in.rdbuf()->pubsetbuf((char*)src.c_str(), src.length());
+        out.str("");
+
+        auto r = extractor.process();
+
+        CHECK(PDFTokenizer::PDFRet::EOS == r);
+        CHECK(exp == out.str());
+    }
+}
+
+// Structural parsing cases: dictionaries, arrays, strings, comments and the
+// error codes for malformed dictionaries (no /JS extraction expected unless
+// stated by the expected-output argument)
+TEST_CASE("basic", "[PDFTokenizer]")
+{
+    SECTION("no input")
+    {
+        test_pdf_proc(
+            "",
+            ""
+        );
+    }
+    SECTION("minimal PDF")
+    {
+        test_pdf_proc(
+            "20 0 obj"
+            "<<"
+            "/Creator (Acrobat Pro DC 22.1.20169)"
+            "/ModDate (D:20220714154535+03'00')"
+            "/CreationDate (D:20220714153909+03'00')"
+            "/Producer (Acrobat Pro DC 22.1.20169)"
+            ">>"
+            "endobj",
+            ""
+        );
+    }
+    SECTION("direct object")
+    {
+        test_pdf_proc(
+            "<<"
+            "/S /JavaScript"
+            ">>",
+            ""
+        );
+    }
+    SECTION("indirect object")
+    {
+        test_pdf_proc(
+            "19 0 obj"
+            "<<"
+            "/S /JavaScript"
+            ">>"
+            "endobj",
+            ""
+        );
+    }
+    SECTION("records")
+    {
+        test_pdf_proc(
+            "1 0 R"
+            "<<"
+            "/T 2 0 R"
+            ">>",
+            ""
+        );
+    }
+    SECTION("sub array")
+    {
+        test_pdf_proc(
+            "<<"
+            "/K [ /name1 /name2 /name3 ]"
+            ">>",
+            ""
+        );
+    }
+    SECTION("sub dictionary")
+    {
+        test_pdf_proc(
+            "<<"
+            "/K << /k1 /v1 /k2 /v2 >> "
+            ">>",
+            ""
+        );
+    }
+    SECTION("more items")
+    {
+        test_pdf_proc(
+            "<00>"
+            "<< >>"
+            "<<"
+            "/K << /k1 /v1 /k2 [ /i1 /i2 /i3 /i4 ] /k3 /v3 /k4 <000102> /k5 (abc) >>"
+            ">>"
+            "["
+            "<</k1/v1/k2/v2/k3/v3>> <</k1[/i1/i2/i3[/j1/j2]]/k2<00>>> <</k1<</t1<00>>>>>"
+            "]",
+            ""
+        );
+    }
+    SECTION("comments")
+    {
+        test_pdf_proc(
+            "% comment 1\n"
+            "<</K/V % comment /JS (script 1)\n>>"
+            "<</K/V /JS (a % b)>>\n"
+            "(% not a comment)\n"
+            "% comment 2\n"
+            "<</JS (; script 2) % comment 3\n>>",
+            "a % b; script 2"
+        );
+    }
+    SECTION("escapes in string")
+    {
+        test_pdf_proc(
+            "(() \\n\\r\\t\\b\\f\\(\\)\\\\ \\123 \\A\\B\\C \\x\\y\\z)",
+            ""
+        );
+    }
+    SECTION("hex string")
+    {
+        test_pdf_proc(
+            "<000102030405>",
+            ""
+        );
+    }
+    SECTION("key after literal string")
+    {
+        test_pdf_proc(
+            "<<"
+            "/Lang (EN-GB)"
+            "/K [12 0 R]"
+            ">>",
+            ""
+        );
+    }
+    SECTION("key after hex string")
+    {
+        test_pdf_proc(
+            "<<"
+            "/Lang <62617a>"
+            "/K [12 0 R]"
+            ">>",
+            ""
+        );
+    }
+    SECTION("number values")
+    {
+        test_pdf_proc(
+            "<<"
+            "/N 10"
+            "/N 1.0"
+            "/N 1."
+            "/N .1"
+            "/N 1"
+            ">>",
+            ""
+        );
+    }
+    SECTION("not name for key")
+    {
+        test_pdf_proc(
+            "<<"
+            "/K1 /V1"
+            "[/K2] /V2"
+            "/K3 /V3"
+            ">>",
+            "", PDFTokenizer::PDFRet::NOT_NAME_IN_DICTIONARY_KEY
+        );
+    }
+    SECTION("literal string as a key")
+    {
+        test_pdf_proc(
+            "<<"
+            "/K1 /V1"
+            "(foo) /V2"
+            "/K3 /V3"
+            ">>",
+            "", PDFTokenizer::PDFRet::NOT_NAME_IN_DICTIONARY_KEY
+        );
+    }
+    SECTION("hex string as a key")
+    {
+        test_pdf_proc(
+            "<<"
+            "/K1 /V1"
+            "<62617a> /V2"
+            "/K3 /V3"
+            ">>",
+            "", PDFTokenizer::PDFRet::NOT_NAME_IN_DICTIONARY_KEY
+        );
+    }
+    SECTION("incomplete array")
+    {
+        test_pdf_proc(
+            "<<"
+            "/K1 [ /V1 /V2 /V3 "
+            ">>",
+            "", PDFTokenizer::PDFRet::INCOMPLETE_ARRAY_IN_DICTIONARY
+        );
+    }
+}
+
+// Placement of the /JS entry: only string values (literal or hex) at the
+// dictionary's own level are extracted; the /S /JavaScript sub-type is not
+// required by the extractor
+TEST_CASE("JS location", "[PDFTokenizer]")
+{
+    SECTION("wrong type")
+    {
+        test_pdf_proc(
+            "<</S /JavaScript /JS /script >>",
+            ""
+        );
+    }
+    SECTION("no sub-type")
+    {
+        test_pdf_proc(
+            "<< /JS (script) >>",
+            "script"
+        );
+    }
+    SECTION("no sub-type checks")
+    {
+        test_pdf_proc(
+            "<< /JS (script) /S /JavaScript >>",
+            "script"
+        );
+    }
+    SECTION("no spaces")
+    {
+        test_pdf_proc(
+            "<</S/JavaScript/JS(script)>>",
+            "script"
+        );
+    }
+    SECTION("as hex string")
+    {
+        test_pdf_proc(
+            "<< /JS <62617a> >>",
+            "baz"
+        );
+        test_pdf_proc(
+            "<< /JS <70> >>",
+            "p"
+        );
+        test_pdf_proc(
+            "<< /JS <7> >>",
+            "p"
+        );
+    }
+    SECTION("prepended with records")
+    {
+        test_pdf_proc(
+            "<</A 10 0 R /B 11 1 R/S/JavaScript/JS(script)>>",
+            "script"
+        );
+    }
+}
+
+// Content of extracted /JS literal strings: balanced parentheses are kept,
+// escape sequences are decoded, escaped end-of-line sequences are removed
+TEST_CASE("JS processing", "[PDFTokenizer]")
+{
+    SECTION("simple text")
+    {
+        test_pdf_proc(
+            "<</JS"
+            "(var _abc1 = 'Hello World!';)"
+            ">>",
+            "var _abc1 = 'Hello World!';"
+        );
+    }
+    SECTION("balanced parenthesis")
+    {
+        test_pdf_proc(
+            "<</JS"
+            "(function foo() { console.log(\"Hello world!\") })"
+            ">>",
+            "function foo() { console.log(\"Hello world!\") }"
+        );
+    }
+    SECTION("with escapes")
+    {
+        test_pdf_proc(
+            "<</JS"
+            "(function bar\\(var x\\)\\r{\\r console.log\\(\"baz\"\\)\\r})"
+            ">>",
+            "function bar(var x)\r{\r console.log(\"baz\")\r}"
+        );
+    }
+    SECTION("all escapes")
+    {
+        test_pdf_proc(
+            "<</JS"
+            "(() \\n\\r\\t\\b\\f\\(\\)\\\\ \\123 \\A\\B\\C \\x\\y\\z)"
+            ">>",
+            "() \n\r\t\b\f()\\ \123 ABC xyz"
+        );
+    }
+    SECTION("escaped new line")
+    {
+        test_pdf_proc(
+            "<</JS"
+            "(var str = 'Hello\\\n , \\\r world\\\r\n\t!';)"
+            ">>",
+            "var str = 'Hello, world!';"
+        );
+    }
+}
+
+// Incremental feeding: scanner state (start condition, dictionary and
+// string tracking) must carry over between chunks of the same PDF
+TEST_CASE("split", "[PDFTokenizer]")
+{
+    SECTION("no input")
+    {
+        test_pdf_proc({
+            {"", ""},
+            {"", ""},
+            {"", ""}
+        });
+    }
+    SECTION("minimal PDF")
+    {
+        test_pdf_proc({
+            {"20 0 obj", ""},
+            {"<<", ""},
+            {"/Creator (Acrobat Pro DC 22.1.20169)", ""},
+            {"/ModDate (D:20220714154535+03'00')", ""},
+            {"/CreationDate (D:20220714153909+03'00')", ""},
+            {"/Producer (Acrobat Pro DC 22.1.20169)", ""},
+            {">>", ""},
+            {"endobj", ""}
+        });
+    }
+    SECTION("script")
+    {
+        test_pdf_proc({
+            {"% comment", ""},
+            {"\n", ""},
+            {"<</K/V /JS (a % b)>>\n", "a % b"},
+            {"(% not a", ""},
+            {"comment)\n", ""},
+            {"<</JS (;", ";"},
+            {"script 2)", "script 2"},
+            {">>", ""},
+            {"<</JS(script 3)>>", "script 3"}
+        });
+    }
+}
HttpJsNorm class serves as a script Normalizer, and currently has two implementations:
the Legacy Normalizer and the Enhanced Normalizer.
-In NHI, there are two JSNorm extensions:
+In NHI, there are three JSNorm extensions:
* HttpInlineJSNorm, processes content of HTML script tags.
* HttpExternalJSNorm, processes payload with JavaScript MIME type.
+ * HttpPDFJSNorm, processes payload with PDF MIME type.
Normalization context is per transaction. It is created once js_data calls for normalized JS data,
and is deleted once transaction ends. Partial inspections feed data incrementally to JS Normalizer,
2. If it is an HTML-page, Normalizer searches for an opening tag and processes
subsequent bytes in a stream mode, until it finds a closing tag.
It proceeds and scans the entire message body for inline scripts.
+3. If it is a PDF file transferred as a MIME attachment or as a message body,
+   then Normalizer extracts strings marked with the '/JS' keyword and
+   normalizes their content as JS text.
Also, js_data IPS option's buffer is a part of NHI processing in order to start the normalization.
PEG_CONCURRENT_SESSIONS, PEG_MAX_CONCURRENT_SESSIONS, PEG_SCRIPT_DETECTION,
PEG_PARTIAL_INSPECT, PEG_EXCESS_PARAMS, PEG_PARAMS, PEG_CUTOVERS, PEG_SSL_SEARCH_ABND_EARLY,
PEG_PIPELINED_FLOWS, PEG_PIPELINED_REQUESTS, PEG_TOTAL_BYTES, PEG_JS_INLINE, PEG_JS_EXTERNAL,
- PEG_SKIP_MIME_ATTACH, PEG_COUNT_MAX };
+ PEG_JS_PDF, PEG_SKIP_MIME_ATTACH, PEG_COUNT_MAX };
// Result of scanning by splitter
enum ScanResult { SCAN_NOT_FOUND, SCAN_NOT_FOUND_ACCELERATE, SCAN_FOUND, SCAN_FOUND_PIECE,
CONTENTCODE_XPRESS, CONTENTCODE_XZ };
// Content media-types (MIME types)
-enum ContentType { CT__OTHER=1, CT_APPLICATION_JAVASCRIPT, CT_APPLICATION_ECMASCRIPT,
- CT_APPLICATION_X_JAVASCRIPT, CT_APPLICATION_X_ECMASCRIPT, CT_APPLICATION_XHTML_XML,
- CT_TEXT_JAVASCRIPT, CT_TEXT_JAVASCRIPT_1_0, CT_TEXT_JAVASCRIPT_1_1, CT_TEXT_JAVASCRIPT_1_2,
- CT_TEXT_JAVASCRIPT_1_3, CT_TEXT_JAVASCRIPT_1_4, CT_TEXT_JAVASCRIPT_1_5, CT_TEXT_ECMASCRIPT,
- CT_TEXT_X_JAVASCRIPT, CT_TEXT_X_ECMASCRIPT, CT_TEXT_JSCRIPT, CT_TEXT_LIVESCRIPT, CT_TEXT_HTML };
+enum ContentType { CT__OTHER=1, CT_APPLICATION_PDF, CT_APPLICATION_OCTET_STREAM,
+ CT_APPLICATION_JAVASCRIPT, CT_APPLICATION_ECMASCRIPT, CT_APPLICATION_X_JAVASCRIPT,
+ CT_APPLICATION_X_ECMASCRIPT, CT_APPLICATION_XHTML_XML, CT_TEXT_JAVASCRIPT,
+ CT_TEXT_JAVASCRIPT_1_0, CT_TEXT_JAVASCRIPT_1_1, CT_TEXT_JAVASCRIPT_1_2, CT_TEXT_JAVASCRIPT_1_3,
+ CT_TEXT_JAVASCRIPT_1_4, CT_TEXT_JAVASCRIPT_1_5, CT_TEXT_ECMASCRIPT, CT_TEXT_X_JAVASCRIPT,
+ CT_TEXT_X_ECMASCRIPT, CT_TEXT_JSCRIPT, CT_TEXT_LIVESCRIPT, CT_TEXT_HTML };
// Transfer-Encoding header values
enum TransferEncoding { TE__OTHER=1, TE_CHUNKED, TE_IDENTITY };
if (fd_state[k] != nullptr)
File_Decomp_StopFree(fd_state[k]);
delete js_ctx[k];
+ delete js_ctx_mime[k];
}
delete_pipeline();
void delete_pipeline();
HttpJSNorm* js_ctx[2] = { nullptr, nullptr };
+ HttpJSNorm* js_ctx_mime[2] = { nullptr, nullptr };
bool cutover_on_clear = false;
bool ssl_search_abandoned = false;
ext_script_type = false;
output_size = jsn_ctx->script_size();
- trace_logf(3, js_trace, TRACE_DUMP, packet,
- "original[%zu]: %.*s\n", src_end - src_ptr, (int)(src_end - src_ptr), src_ptr);
-
return true;
}
bool HttpInlineJSNorm::post_proc(int ret)
{
- trace_logf(3, js_trace, TRACE_PROC, DetectionEngine::get_current_packet(),
- "normalizer returned with %d '%s'\n", ret, jsn::ret2str(ret));
-
assert(http_events);
assert(infractions);
"script continues\n");
}
- trace_logf(3, js_trace, TRACE_DUMP, packet,
- "original[%zu]: %.*s\n", src_end - src_ptr, (int)(src_end - src_ptr), src_ptr);
-
return true;
}
bool HttpExternalJSNorm::post_proc(int ret)
{
- trace_logf(3, js_trace, TRACE_PROC, DetectionEngine::get_current_packet(),
- "normalizer returned with %d '%s'\n", ret, jsn::ret2str(ret));
+ script_continue = ret == (int)jsn::JSTokenizer::SCRIPT_CONTINUE;
+
+ return JSNorm::post_proc(ret);
+}
+
+// Runs the PDF extractor over the current PDU and redirects src_ptr/src_end
+// to the extracted JavaScript, so the base JSNorm loop normalizes only the
+// script text. Returns false when there is no input, extraction fails, or
+// no script object was found.
+bool HttpPDFJSNorm::pre_proc()
+{
+    if (src_ptr >= src_end)
+        return false;
+
+    const Packet* packet = DetectionEngine::get_current_packet();
+
+    if (!ext_script_type)
+    {
+        HttpModule::increment_peg_counts(PEG_JS_PDF);
+        trace_logf(1, js_trace, TRACE_PROC, packet,
+            "PDF starts\n");
+        ext_script_type = true;
+    }
+    else
+    {
+        trace_logf(2, js_trace, TRACE_PROC, packet,
+            "PDF continues\n");
+    }
+
+    // an input stream should not write to its buffer
+    pdf_in.rdbuf()->pubsetbuf(const_cast<char*>((const char*)src_ptr), src_end - src_ptr);
+    pdf_out.clear();
+
+    // drop output accumulated for the previous PDU
+    delete[] buf_pdf_out.take_data();
+
+    auto r = extractor.process();
+
+    if (r != PDFTokenizer::PDFRet::EOS)
+    {
+        // reuse the packet fetched above instead of a redundant second call
+        // to DetectionEngine::get_current_packet()
+        trace_logf(2, js_trace, TRACE_PROC, packet,
+            "pdf processing failed: %d\n", (int)r);
+        return false;
+    }
+
+    src_ptr = (const uint8_t*)buf_pdf_out.data();
+    src_end = src_ptr + buf_pdf_out.data_len();
+
+    // script object not found
+    if (!src_ptr)
+        return false;
+
+    return true;
+}
+
+bool HttpPDFJSNorm::post_proc(int ret)
+{
+ src_ptr = src_end; // one time per PDU, even if JS Normalizer has not finished
script_continue = ret == (int)jsn::JSTokenizer::SCRIPT_CONTINUE;
#define HTTP_JS_NORM_H
#include <cstring>
+#include <FlexLexer.h>
#include "js_norm/js_norm.h"
+#include "js_norm/pdf_tokenizer.h"
#include "search_engines/search_tool.h"
+#include "utils/streambuf.h"
#include "http_field.h"
#include "http_flow_data.h"
bool post_proc(int) override;
};
+// JSNorm extension that extracts JavaScript from PDF payloads before
+// normalization: PDFTokenizer reads from pdf_in and writes found script
+// text to pdf_out, which is backed by the growing buffer buf_pdf_out
+class HttpPDFJSNorm : public HttpJSNorm
+{
+public:
+    // Cheap signature check for "%PDF-1." at the start of the body.
+    // NOTE(review): strict '<' requires at least one byte after the magic
+    // (a version digit is expected to follow) — confirm this is intended.
+    static bool is_pdf(const void* data, size_t len)
+    {
+        constexpr char magic[] = "%PDF-1.";
+        constexpr int magic_len = sizeof(magic) - 1;
+        return magic_len < len and !strncmp((const char*)data, magic, magic_len);
+    }
+
+    // tid: transaction number, used to detect stale contexts
+    HttpPDFJSNorm(JSNormConfig* jsn_config, uint64_t tid) :
+        HttpJSNorm(jsn_config), pdf_out(&buf_pdf_out), extractor(pdf_in, pdf_out)
+    { trans_num = tid; }
+
+protected:
+    bool pre_proc() override;
+    bool post_proc(int) override;
+
+private:
+    snort::ostreambuf_infl buf_pdf_out;  // owns the extracted-script bytes
+    std::istringstream pdf_in;           // view over the current PDU
+    std::ostream pdf_out;                // extractor output over buf_pdf_out
+    jsn::PDFTokenizer extractor;
+};
+
#endif
js_ctx = new HttpInlineJSNorm(jsn_config, trans_num, params->js_norm_param.mpse_otag,
params->js_norm_param.mpse_attr);
break;
+
+ case CT_APPLICATION_PDF:
+ js_ctx = new HttpPDFJSNorm(jsn_config, trans_num);
+ break;
+
+ case CT_APPLICATION_OCTET_STREAM:
+ js_ctx = first_body and HttpPDFJSNorm::is_pdf(decompressed_file_body.start(), decompressed_file_body.length()) ?
+ new HttpPDFJSNorm(jsn_config, trans_num) : nullptr;
+ break;
}
session_data->js_ctx[source_id] = js_ctx;
+ return js_ctx;
+}
+
+// Returns the per-source-id PDF normalization context used for MIME
+// attachments, recreating it when the transaction number changed; returns
+// nullptr when the current attachment does not look like a PDF
+HttpJSNorm* HttpMsgBody::acquire_js_ctx_mime()
+{
+    HttpJSNorm* js_ctx = session_data->js_ctx_mime[source_id];
+
+    if (js_ctx)
+    {
+        if (js_ctx->get_trans_num() == trans_num)
+            return js_ctx;
+
+        // stale context from a previous transaction
+        delete js_ctx;
+        js_ctx = nullptr;
+    }
+    JSNormConfig* jsn_config = get_inspection_policy()->jsn_config;
+    js_ctx = HttpPDFJSNorm::is_pdf(decompressed_file_body.start(), decompressed_file_body.length()) ?
+        new HttpPDFJSNorm(jsn_config, trans_num) : nullptr;
+
+    session_data->js_ctx_mime[source_id] = js_ctx;
+    return js_ctx;
+}
+// Releases the MIME PDF context once the attachment is fully processed
+void HttpMsgBody::clear_js_ctx_mime()
+{
+    delete session_data->js_ctx_mime[source_id];
+    session_data->js_ctx_mime[source_id] = nullptr;
+}
+
void HttpMsgBody::do_file_processing(const Field& file_data)
{
// Using the trick that cutter is deleted when regular or chunked body is complete
return false;
if ((mime_bufs != nullptr) && !mime_bufs->empty())
{
+ HttpJSNorm* js_ctx_tmp = nullptr;
auto mb = mime_bufs->cbegin();
+ uint32_t mime_bufs_size = mime_bufs->size();
+
for (uint32_t count = 0; (count < params->max_mime_attach) && (mb != mime_bufs->cend());
++count, ++mb)
{
+ bool is_last_attachment = ((count + 1 == mime_bufs_size) ||
+ (count + 1 == params->max_mime_attach));
const uint64_t idx = get_header(source_id)->get_multi_file_processing_id();
set_file_data(mb->file.start(), mb->file.length(), idx,
count or mb->file.is_accumulated(),
std::next(mb) != mime_bufs->end() or last_attachment_complete);
if (mb->vba.length() > 0)
ole_data.set(mb->vba.length(), mb->vba.start());
+ decompressed_file_body.reset();
+ decompressed_file_body.set(mb->file.length(), mb->file.start());
+
+ js_ctx_tmp = session_data->js_ctx[source_id];
+ session_data->js_ctx[source_id] = acquire_js_ctx_mime();
+
DetectionEngine::detect(p);
+
+ if (!is_last_attachment || last_attachment_complete)
+ clear_js_ctx_mime();
+
+ session_data->js_ctx[source_id] = js_ctx_tmp;
+
ole_data.reset();
decompressed_vba_data.reset();
+ decompressed_file_body.reset();
}
if (mb != mime_bufs->cend())
{
void do_file_decompression(const Field& input, Field& output);
void do_legacy_js_normalization(const Field& input, Field& output);
HttpJSNorm* acquire_js_ctx();
+ HttpJSNorm* acquire_js_ctx_mime();
+ void clear_js_ctx_mime();
void clean_partial(uint32_t& partial_inspected_octets, uint32_t& partial_detect_length,
uint8_t*& partial_detect_buffer, uint32_t& partial_js_detect_length);
const StrCode HttpMsgHeadShared::content_type_list[] =
{
+ { CT_APPLICATION_PDF, "application/pdf" },
+ { CT_APPLICATION_OCTET_STREAM, "application/octet-stream" },
{ CT_APPLICATION_JAVASCRIPT, "application/javascript" },
{ CT_APPLICATION_ECMASCRIPT, "application/ecmascript" },
{ CT_APPLICATION_X_JAVASCRIPT, "application/x-javascript" },
{ CountType::SUM, "total_bytes", "total HTTP data bytes inspected" },
{ CountType::SUM, "js_inline_scripts", "total number of inline JavaScripts processed" },
{ CountType::SUM, "js_external_scripts", "total number of external JavaScripts processed" },
+ { CountType::SUM, "js_pdf_scripts", "total number of PDF JavaScripts processed" },
{ CountType::SUM, "skip_mime_attach", "total number of HTTP requests with too many MIME attachments to inspect" },
{ CountType::END, nullptr, nullptr }
};