git.ipfire.org Git - thirdparty/snort3.git/commitdiff
Pull request #4482: js_norm: add cross-PDU PDF token reassembly
author: Danylo Kyrylov -X (dkyrylov - SOFTSERVE INC at Cisco) <dkyrylov@cisco.com>
Mon, 21 Oct 2024 20:40:57 +0000 (20:40 +0000)
committer: Oleksii Shumeiko -X (oshumeik - SOFTSERVE INC at Cisco) <oshumeik@cisco.com>
Mon, 21 Oct 2024 20:40:57 +0000 (20:40 +0000)
Merge in SNORT/snort3 from ~DKYRYLOV/snort3:js_pdf_token_split to master

Squashed commit of the following:

commit 9bb663ecbe181eec9401428277a80d0068a10801
Author: dkyrylov <dkyrylov@cisco.com>
Date:   Thu Oct 10 13:39:45 2024 +0300

    js_norm: add cross-PDU PDF token reassembly

src/js_norm/js_pdf_norm.cc
src/js_norm/js_pdf_norm.h
src/js_norm/pdf_tokenizer.h
src/js_norm/pdf_tokenizer.l
src/js_norm/test/CMakeLists.txt
src/js_norm/test/pdf_tokenizer_benchmark.cc
src/js_norm/test/pdf_tokenizer_test.cc

index ad3d8df79420fd1cb2ffcf2064e3762d03f663f8..557a7a78c7886f092707d304e38d0fd52d01b3c0 100644 (file)
@@ -50,6 +50,7 @@ bool PDFJSNorm::pre_proc()
     }
 
     buf_pdf_in.pubsetbuf(nullptr, 0)
+        ->pubsetbuf(state_buf, state_len)
         ->pubsetbuf(const_cast<char*>((const char*)src_ptr), src_end - src_ptr);
     pdf_out.clear();
     delete[] buf_pdf_out.take_data();
index 47b379fdeeb59b2baa703f74522305bbfb8ad4cf..4573ac7863b97ea20d0d69b3661f3ecb6be1b5f5 100644 (file)
@@ -47,14 +47,19 @@ public:
     PDFJSNorm(JSNormConfig* cfg, uint32_t gen_id) :
         JSNorm(cfg, false, gen_id),
         pdf_in(&buf_pdf_in), pdf_out(&buf_pdf_out),
-        extractor(pdf_in, pdf_out, cfg ? cfg->pdf_max_dictionary_depth : 0)
+        extractor(pdf_in, pdf_out, state_buf, state_len, cfg ? cfg->pdf_max_dictionary_depth : 0)
     { }
 
+    virtual ~PDFJSNorm() override
+    { delete[] state_buf; }
+
 protected:
     bool pre_proc() override;
     bool post_proc(int) override;
 
 private:
+    char* state_buf = nullptr;
+    int state_len = 0;
     snort::istreambuf_glue buf_pdf_in;
     snort::ostreambuf_infl buf_pdf_out;
     std::istream pdf_in;
index bb1972955b5932c963552dc65dcafb373fdd407d..17223cec9d93e66be15e49741a3ed654d686f4be 100644 (file)
@@ -48,7 +48,7 @@ public:
     };
 
     PDFTokenizer() = delete;
-    explicit PDFTokenizer(std::istream& in, std::ostream& out, int dictionaries_max_size);
+    explicit PDFTokenizer(std::istream& in, std::ostream& out, char*& state_buf, int& state_len, int dictionaries_max_size);
     ~PDFTokenizer() override;
 
     PDFRet process();
@@ -56,9 +56,15 @@ public:
 private:
     int yylex() override;
 
+    void state_add(int len);
+    void state_store();
+    void state_clear();
+    void state_act();
+
     PDFRet h_dict_open();
     PDFRet h_dict_close();
     PDFRet h_dict_name();
+    PDFRet h_dict_number();
     PDFRet h_dict_other();
     inline bool h_lit_str();
     inline bool h_hex_str();
@@ -70,9 +76,12 @@ private:
     PDFRet h_hex_hex2chr_u16();
     PDFRet h_lit_u16();
     PDFRet h_lit_u16_unescape();
-    PDFRet h_stream_open();
-    PDFRet h_stream();
     PDFRet h_array_nesting();
+    PDFRet h_stream_open();
+    void h_stream();
+    void h_stream_part_close();
+    PDFRet h_stream_dump_remainder();
+    PDFRet h_stream_dump_remainder_u16();
     bool h_stream_close();
     void h_stream_length();
     void h_ref();
@@ -105,9 +114,10 @@ private:
     struct ObjectDictionary
     {
         void clear()
-        { key_value = true; array_level = 0; }
+        { key_value = true; consecutive_number = false; array_level = 0; }
 
         bool key_value = true;
+        bool consecutive_number = false;
         int array_level = 0;
     };
 
@@ -130,10 +140,15 @@ private:
     struct Stream
     {
         int rem_length = -1;
+        int endstream_part = 0;
         bool is_js = false;
         bool is_ref_len = false;
     };
 
+    char*& state_buf;
+    int& state_len;
+    bool state_added = false;
+
     ObjectString obj_string;
     ObjectArray obj_array;
     std::stack<ObjectDictionary> dictionaries;
index ee2b34cda3a5c4b78906aed4cca23441f3ad80c0..df5207c336f9e62ee4daca79007787c132eefdc2 100644 (file)
@@ -52,6 +52,7 @@ using namespace jsn;
 
 #define YY_USER_ACTION                                                         \
     {                                                                          \
+        state_act();                                                           \
         debug_logf(5, js_trace, TRACE_PDF_PROC, nullptr,                       \
             "PDF pattern #%d, sc %d\n", yy_act, YY_START);                     \
         debug_logf(5, js_trace, TRACE_PDF_DUMP, nullptr,                       \
@@ -87,6 +88,7 @@ COMMENT_END        {EOL_MARKER}
 
 /* 7.3.2 Boolean Objects */
 OBJ_BOOLEAN        true|false
+OBJ_PARTIAL_BOOL   t|tr|tru|f|fa|fal|fals
 
 /* 7.3.3 Numeric Objects */
 OBJ_INT_NUM        [+-]?[0-9]{1,16}
@@ -132,17 +134,22 @@ OBJ_DICT_CLOSE     ">>"
 OBJ_DICT_SKIP      .|{GRP_NEWLINE}
 
 /* 7.3.8 Stream Objects */
-OBJ_STREAM_OPEN    stream\r?\n
-OBJ_STREAM_CLOSE   {EOL_MARKER}endstream
-OBJ_STREAM_SKIP    {GRP_NOT_NEWLINE}{1,16}|{GRP_NEWLINE}
+OBJ_STREAM_OPEN            stream\r?\n
+OBJ_STREAM_PARTIAL_OPEN    s|st|str|stre|strea|stream|stream\r
+OBJ_STREAM_CLOSE           endstream
+OBJ_STREAM_PARTIAL_CLOSE   e|en|end|ends|endst|endstr|endstre|endstrea
+OBJ_STREAM_SKIP            [^e]{1,16}
 
 /* 7.3.9 Null Object */
 OBJ_NULL           null
+OBJ_PARTIAL_NULL   n|nu|nul
 
 /* 7.3.10 Indirect Objects */
-INDIRECT_OBJ_OPEN  {OBJ_INT_NUM}{GRP_WHITESPACE}+{OBJ_INT_NUM}{GRP_WHITESPACE}+obj
+INDIRECT_OBJ_OPEN          {OBJ_INT_NUM}{GRP_WHITESPACE}+{OBJ_INT_NUM}{GRP_WHITESPACE}+obj
+INDIRECT_OBJ_PARTIAL_OPEN  {OBJ_INT_NUM}{GRP_WHITESPACE}*{OBJ_INT_NUM}?{GRP_WHITESPACE}*(o|ob)
 
 INDIRECT_OBJ_CLOSE endobj
+INDIRECT_OBJ_PARTIAL_CLOSE e|en|end|endo|endob
 
 OBJ_REFERENCE      {OBJ_INT_NUM}{GRP_WHITESPACE}+{OBJ_INT_NUM}{GRP_WHITESPACE}+R
 
@@ -180,8 +187,10 @@ WHITESPACE         {GRP_WHITESPACE}{1,16}
 <comment>{COMMENT_END}                            { POP(); }
 
 <INITIAL>{INDIRECT_OBJ_OPEN}                      { PUSH(indobj); h_ind_obj_open(); }
+<INITIAL>{INDIRECT_OBJ_PARTIAL_OPEN}              { state_add(yyleng); }
 <indobj>{WHITESPACE}                              { }
 <indobj>{INDIRECT_OBJ_CLOSE}                      { POP(); h_ind_obj_close(); EXEC(h_array_nesting()); }
+<indobj>{INDIRECT_OBJ_PARTIAL_CLOSE}              { state_add(yyleng); }
 <indobj>{OBJ_ARRAY_OPEN}                          { PUSH(array); ++obj_array.nesting_level; }
 <indobj>{OBJ_REFERENCE}                           { indirect_obj.ref_met = true; }
 <indobj>{OBJ_BOOLEAN}                             { }
@@ -200,35 +209,41 @@ WHITESPACE         {GRP_WHITESPACE}{1,16}
 <array>{OBJ_NULL}                                 { }
 <array>{OBJ_NAME}                                 { }
 <array>{OBJ_LIT_STR_OPEN}                         { if (h_lit_open()) PUSH(litstr); }
-<array>{OBJ_HEX_STR_OPEN}                         { PUSH(hexstr); }
+<array>{OBJ_HEX_STR_OPEN}                         { PUSH(hexstr); state_add(yyleng); }
 <array>{OBJ_ARRAY_SKIP}                           { }
 <array>{INDIRECT_OBJ_CLOSE}                       { return PDFRet::UNEXPECTED_SYMBOL; }
 
 <indobj>{OBJ_STREAM_OPEN}                         { EXEC(h_stream_open()); PUSH(obj_stream.is_js ? u16 : stream); }
-<stream>{OBJ_STREAM_SKIP}                         { EXEC(h_stream()); }
-<jsstream>{OBJ_STREAM_SKIP}                       { EXEC(h_stream()); ECHO; }
-<jsstreamu16>{OBJ_STREAM_SKIP}                    { EXEC(h_stream()); EXEC(h_lit_u16()); }
+<indobj>{OBJ_STREAM_PARTIAL_OPEN}                 { state_add(yyleng); }
 <stream>{OBJ_STREAM_CLOSE}                        { if (h_stream_close()) POP(); }
+<stream>{OBJ_STREAM_PARTIAL_CLOSE}                { EXEC(h_stream_dump_remainder()); h_stream_part_close(); state_add(yyleng); }
 <jsstream>{OBJ_STREAM_CLOSE}                      { if (h_stream_close()) POP(); }
+<jsstream>{OBJ_STREAM_PARTIAL_CLOSE}              { EXEC(h_stream_dump_remainder()); h_stream_part_close(); state_add(yyleng); }
 <jsstreamu16>{OBJ_STREAM_CLOSE}                   { if (h_stream_close()) POP(); }
+<jsstreamu16>{OBJ_STREAM_PARTIAL_CLOSE}           { EXEC(h_stream_dump_remainder_u16()); h_stream_part_close(); state_add(yyleng); }
+<stream>{OBJ_STREAM_SKIP}                         { EXEC(h_stream_dump_remainder()); h_stream(); }
+<jsstream>{OBJ_STREAM_SKIP}                       { EXEC(h_stream_dump_remainder()); h_stream(); ECHO; }
+<jsstreamu16>{OBJ_STREAM_SKIP}                    { EXEC(h_stream_dump_remainder_u16()); h_stream(); EXEC(h_lit_u16()); }
 
 <dictnr>{OBJ_DICT_OPEN}                           { PUSH(dictnr); EXEC(h_dict_open()); }
 <indobj>{OBJ_DICT_OPEN}                           { PUSH(dictnr); EXEC(h_dict_open()); }
 <array>{OBJ_DICT_OPEN}                            { PUSH(dictnr); EXEC(h_dict_open()); }
 <array>{OBJ_DICT_CLOSE}                           { return PDFRet::INCORRECT_BRACKETS_NESTING; }
 <dictnr>{OBJ_DICT_CLOSE}                          { POP(); EXEC(h_dict_close()); }
-<dictnr>{WHITESPACE}                              { }
-<dictnr>{OBJ_REFERENCE}                           { EXEC(h_dict_other()); h_ref(); }
+<dictnr>{WHITESPACE}                              { state_add(yyleng); }
+<dictnr>{OBJ_REFERENCE}                           { dictionaries.top().consecutive_number = false; EXEC(h_dict_other()); h_ref(); }
 <dictnr>{OBJ_BOOLEAN}                             { EXEC(h_dict_other()); }
-<dictnr>{OBJ_INT_NUM}                             { EXEC(h_dict_other()); h_stream_length(); }
-<dictnr>{OBJ_REL_NUM}                             { EXEC(h_dict_other()); }
+<dictnr>{OBJ_PARTIAL_BOOL}                        { state_add(yyleng); }
+<dictnr>{OBJ_INT_NUM}                             { EXEC(h_dict_number()); h_stream_length(); state_add(yyleng); }
+<dictnr>{OBJ_REL_NUM}                             { EXEC(h_dict_number()); state_add(yyleng); }
 <dictnr>{OBJ_NULL}                                { EXEC(h_dict_other()); }
-<dictnr>{OBJ_NAME}                                { EXEC(h_dict_name()); }
+<dictnr>{OBJ_PARTIAL_NULL}                        { state_add(yyleng); }
+<dictnr>{OBJ_NAME}                                { EXEC(h_dict_name()); state_add(yyleng); }
 <dictnr>{OBJ_ARRAY_OPEN}                          { PUSH(array); ++obj_array.nesting_level; EXEC(h_dict_other()); }
 <dictnr>{OBJ_ARRAY_CLOSE}                         { return PDFRet::INCORRECT_BRACKETS_NESTING; }
 <dictnr>{OBJ_LIT_STR_OPEN}                        { EXEC(h_dict_other()); if (h_lit_str()) PUSH(jslstr); else PUSH(litstr); yyless(0); }
-<dictnr>{OBJ_HEX_STR_OPEN}                        { EXEC(h_dict_other()); if (h_hex_str()) PUSH(jshstr); else PUSH(hexstr); yyless(0); }
-<dictnr>{OBJ_DICT_SKIP}                           { }
+<dictnr>{OBJ_HEX_STR_OPEN}                        { EXEC(h_dict_other()); if (h_hex_str()) PUSH(jshstr); else PUSH(hexstr); yyless(0); state_add(yyleng); }
+<dictnr>{OBJ_DICT_SKIP}                           { state_add(yyleng); }
 <dictnr>{INDIRECT_OBJ_CLOSE}                      { return PDFRet::UNEXPECTED_SYMBOL; }
 
 <indobj>{OBJ_LIT_STR_OPEN}                        { if (h_lit_open()) PUSH(litstr); }
@@ -240,7 +255,7 @@ WHITESPACE         {GRP_WHITESPACE}{1,16}
 <litstr>{LIT_STR_EOL}                             { }
 <litstr>{LIT_STR_BODY}                            { }
 
-<indobj>{OBJ_HEX_STR_OPEN}                        { PUSH(hexstr); }
+<indobj>{OBJ_HEX_STR_OPEN}                        { state_add(yyleng); PUSH(hexstr); }
 <hexstr>{OBJ_HEX_STR_CLOSE}                       { POP(); }
 <hexstr>{HEX_STR_BODY}                            { }
 <hexstr>{HEX_STR_SKIP}                            { }
@@ -270,9 +285,9 @@ WHITESPACE         {GRP_WHITESPACE}{1,16}
 <jshstru16>{HEX_STR_BODY}                         { EXEC(h_hex_hex2chr_u16()); }
 <jshstr,jshstru16>{HEX_STR_SKIP}                  { }
 
-<*><<EOF>>                                        { return PDFRet::EOS; }
+<*><<EOF>>                                        { state_store(); return PDFRet::EOS; }
 
-{SKIP}                                            { }
+{SKIP}                                            { state_add(yyleng); }
 <*>.|\n                                           { return PDFRet::UNEXPECTED_SYMBOL; }
 
 %%
@@ -314,6 +329,12 @@ PDFTokenizer::PDFRet PDFTokenizer::h_dict_other()
     if (dictionaries.top().array_level != obj_array.nesting_level)
         return PDFRet::EOS;
 
+    if (dictionaries.top().consecutive_number)
+    {
+        dictionaries.top().consecutive_number = false;
+        dictionaries.top().key_value = !dictionaries.top().key_value;
+    }
+
     if (dictionaries.top().key_value)
         return PDFRet::NOT_NAME_IN_DICTIONARY_KEY;
 
@@ -328,11 +349,36 @@ PDFTokenizer::PDFRet PDFTokenizer::h_dict_other()
     return PDFRet::EOS;
 }
 
+// Handles a numeric token (integer or real) matched inside a dictionary.
+//
+// A number that does not directly follow another number resets the bytes
+// counted for cross-PDU reassembly. A number in key position is rejected with
+// NOT_NAME_IN_DICTIONARY_KEY. Otherwise the token is logged and the dictionary
+// is flagged as being inside a run of numbers; key_value is not flipped here,
+// h_dict_name() and h_dict_other() flip it when they see the flag set.
+PDFTokenizer::PDFRet PDFTokenizer::h_dict_number()
+{
+    auto& dict = dictionaries.top();
+
+    // first number of a run: drop whatever was accumulated before it
+    if (!dict.consecutive_number)
+        state_clear();
+
+    // numbers are not accepted where a key is expected
+    if (dict.key_value)
+        return PDFRet::NOT_NAME_IN_DICTIONARY_KEY;
+
+    debug_logf(6, js_trace, TRACE_PDF_PROC, nullptr,
+        "dictionary token: number\n");
+
+    debug_logf(6, js_trace, TRACE_PDF_DUMP, nullptr,
+        "dictionary entry: %s, %s\n", obj_entry.key, yytext);
+
+    dict.consecutive_number = true;
+
+    return PDFRet::EOS;
+}
+
 PDFTokenizer::PDFRet PDFTokenizer::h_dict_name()
 {
     if (dictionaries.top().array_level != obj_array.nesting_level)
         return PDFRet::EOS;
 
+    if (dictionaries.top().consecutive_number)
+    {
+         dictionaries.top().consecutive_number = false;
+         dictionaries.top().key_value = !dictionaries.top().key_value;
+    }
+
     if (dictionaries.top().key_value)
         strncpy(obj_entry.key, yytext, sizeof(obj_entry.key) - 1);
 
@@ -495,9 +541,36 @@ PDFTokenizer::PDFRet PDFTokenizer::h_stream_open()
     return PDFRet::EOS;
 }
 
-PDFTokenizer::PDFRet PDFTokenizer::h_stream()
+void PDFTokenizer::h_stream()
 {
     obj_stream.rem_length -= yyleng;
+}
+
+void PDFTokenizer::h_stream_part_close() // Records the length of the partial "endstream" match just seen.
+{
+    obj_stream.endstream_part = yyleng; // consumed later by h_stream_dump_remainder*() or zeroed by state_store()
+}
+
+static const char endstream_tag[] = "endstream";
+
+// Flushes a previously remembered partial "endstream" match as stream content.
+//
+// The remembered length (set by h_stream_part_close()) is taken and zeroed,
+// and subtracted from the remaining stream length. Only in the jsstream state
+// are those leading characters of "endstream" written to the output.
+// Always returns PDFRet::EOS.
+PDFTokenizer::PDFRet PDFTokenizer::h_stream_dump_remainder()
+{
+    const int held = obj_stream.endstream_part;
+
+    obj_stream.endstream_part = 0;
+    obj_stream.rem_length -= held;
+
+    if (YY_START != jsstream)
+        return PDFRet::EOS;
+
+    for (int i = 0; i < held; ++i)
+        yyout << endstream_tag[i];
+
+    return PDFRet::EOS;
+}
+
+// UTF-16 counterpart of h_stream_dump_remainder().
+//
+// Takes and zeroes the remembered partial "endstream" length, subtracts it
+// from the remaining stream length, and feeds those leading characters of
+// "endstream" one by one through u16_eval().
+// NOTE(review): EXEC is defined outside this view; presumably it returns
+// early when the wrapped call does not yield PDFRet::EOS - confirm.
+PDFTokenizer::PDFRet PDFTokenizer::h_stream_dump_remainder_u16()
+{
+    const int held = obj_stream.endstream_part;
+
+    obj_stream.endstream_part = 0;
+    obj_stream.rem_length -= held;
+
+    for (int i = 0; i < held; ++i)
+        EXEC(u16_eval(endstream_tag[i]));
     return PDFRet::EOS;
 }
 
@@ -506,11 +579,7 @@ bool PDFTokenizer::h_stream_close()
     obj_stream.rem_length -= yyleng;
 
     if (obj_stream.rem_length <= 0)
-    {
-        if (YY_START == jsstream)
-            yyout << '\n';
         return true;
-    }
 
     if (YY_START == jsstream)
         ECHO;
@@ -558,6 +627,7 @@ void PDFTokenizer::h_u16_start()
     default:
         assert(false);
     }
+    u16_state.cur_byte = 0;
 }
 
 void PDFTokenizer::h_u16_break()
@@ -584,6 +654,7 @@ void PDFTokenizer::h_u16_hex_start()
     assert(YY_START == jshstr);
     POP();
     PUSH(jshstru16);
+    u16_state.cur_byte = 0;
 }
 
 void PDFTokenizer::h_u16_hex_break()
@@ -671,8 +742,8 @@ void PDFTokenizer::u16_to_u8(uint32_t code)
     yyout << out;
 }
 
-PDFTokenizer::PDFTokenizer(std::istream& in, std::ostream& out, int dictionaries_max_size)
-    : yyFlexLexer(in, out), dictionaries_max_size(dictionaries_max_size)
+PDFTokenizer::PDFTokenizer(std::istream& in, std::ostream& out, char*& state_buf, int& state_len, int dictionaries_max_size)
+    : yyFlexLexer(in, out), state_buf(state_buf), state_len(state_len), dictionaries_max_size(dictionaries_max_size)
 {
     dictionaries.push(ObjectDictionary());
 }
@@ -696,3 +767,50 @@ PDFTokenizer::PDFRet PDFTokenizer::process()
 
     return r;
 }
+
+// Extends the count of trailing input bytes kept for cross-PDU reassembly by
+// len, and marks that the current action contributed, so that the next
+// state_act() call keeps the count instead of resetting it.
+void PDFTokenizer::state_add(int len)
+{
+    state_added = true;
+    state_len += len;
+}
+
+void PDFTokenizer::state_store()
+{
+    state_act();
+
+    if (state_len == 0)
+        return;
+
+    if (YY_START == hexstr)
+        POP();
+
+    if (!dictionaries.top().key_value and !dictionaries.top().consecutive_number)
+        dictionaries.top().key_value = true;
+
+    obj_stream.endstream_part = 0;
+
+    char* buf = new char[state_len];
+
+    yyin.seekg(-state_len, std::ios_base::end);
+    yyin.clear();
+    yyin.read(buf, state_len);
+
+    debug_logf(6, js_trace, TRACE_PDF_DUMP, nullptr,
+        "storing %d bytes for reassembly: \"%.*s\"\n",state_len,state_len,buf);
+
+    delete[] state_buf;
+    state_buf = buf;
+}
+
+void PDFTokenizer::state_clear() // Forgets the bytes counted so far for cross-PDU reassembly.
+{
+    state_len = 0; // only the count is reset; state_buf is not touched here
+}
+
+// Bookkeeping run via YY_USER_ACTION and from state_store().
+//
+// If state_add() was called since the last call, the accumulated count is
+// kept and only the flag is reset. Otherwise the count is dropped via
+// state_clear().
+void PDFTokenizer::state_act()
+{
+    if (!state_added)
+    {
+        state_clear();
+        return;
+    }
+
+    state_added = false;
+}
index 5ba935362318a72db0e3f92c5f632a7f6cafee5c..79c410052373a6de3b67984a3cf1c571a391ef0f 100644 (file)
@@ -63,6 +63,7 @@ add_catch_test( jsn_test
 add_catch_test( pdf_tokenizer_test
     SOURCES
         ${pdf_tokenizer_OUTPUTS}
+        ${CMAKE_SOURCE_DIR}/src/helpers/streambuf.cc
         js_test_stubs.cc
 )
 
index f484fe1bd36c8724b1e4b4af340c29bc0dcc745e..d0a26dec69cce8145a1af62c845da6d85a654a5c 100644 (file)
@@ -36,6 +36,8 @@ using namespace jsn;
 using namespace snort;
 using namespace std;
 
+static constexpr int nesting_level = 10;
+
 static const string make_input(const char* begin, const char* mid, const char* end, size_t len)
 {
     string str(begin);
@@ -83,7 +85,9 @@ TEST_CASE("PDF Tokenizer, literals by 8 K", "[PDFTokenizer]")
     ostreambuf_infl buf_out;
     istream in(&buf_in);
     ostream out(&buf_out);
-    PDFTokenizer parser(in, out);
+    char* buf = nullptr;
+    // bound by reference into the tokenizer as its state length, so it must
+    // start from a defined value (the unit tests use 0 as well)
+    int len = 0;
+    PDFTokenizer parser(in, out, buf, len, nesting_level);
 
     BENCHMARK("memcpy()")
     {
@@ -124,6 +128,8 @@ TEST_CASE("PDF Tokenizer, literals by 8 K", "[PDFTokenizer]")
         rewind();
         return parser.process();
     };
+
+    delete[] buf;
 }
 
 TEST_CASE("PDF Parser, literals by 64 K", "[PDFTokenizer]")
@@ -143,7 +149,9 @@ TEST_CASE("PDF Parser, literals by 64 K", "[PDFTokenizer]")
     ostreambuf_infl buf_out;
     istream in(&buf_in);
     ostream out(&buf_out);
-    PDFTokenizer parser(in, out);
+    char* buf = nullptr;
+    // bound by reference into the tokenizer as its state length, so it must
+    // start from a defined value (the unit tests use 0 as well)
+    int len = 0;
+    PDFTokenizer parser(in, out, buf, len, nesting_level);
 
     BENCHMARK("memcpy()")
     {
@@ -184,6 +192,8 @@ TEST_CASE("PDF Parser, literals by 64 K", "[PDFTokenizer]")
         rewind();
         return parser.process();
     };
+
+    delete[] buf;
 }
 
 TEST_CASE("PDF Tokenizer, indirect objects", "[PDFTokenizer]")
@@ -195,7 +205,9 @@ TEST_CASE("PDF Tokenizer, indirect objects", "[PDFTokenizer]")
     ostreambuf_infl buf_out;
     istream in(&buf_in);
     ostream out(&buf_out);
-    PDFTokenizer parser(in, out);
+    char* buf = nullptr;
+    // bound by reference into the tokenizer as its state length, so it must
+    // start from a defined value (the unit tests use 0 as well)
+    int len = 0;
+    PDFTokenizer parser(in, out, buf, len, nesting_level);
 
     set_input(data);
     BENCHMARK("same object repeated")
@@ -203,6 +215,8 @@ TEST_CASE("PDF Tokenizer, indirect objects", "[PDFTokenizer]")
         rewind();
         return parser.process();
     };
+
+    delete[] buf;
 }
 
 #endif
index 8b22fe7fa307403d16f69ba673ce727b6c3dd0f5..5f43140ff27d1b6d91241a087ec89716cc50c6f8 100644 (file)
@@ -27,6 +27,7 @@
 #include <FlexLexer.h>
 
 #include "catch/catch.hpp"
+#include "helpers/streambuf.h"
 #include "js_norm/pdf_tokenizer.h"
 
 using namespace jsn;
@@ -41,26 +42,35 @@ static void test_pdf_proc(const string& source, const string& expected,
 {
     istringstream in(source);
     ostringstream out;
-    PDFTokenizer extractor(in, out, nesting_level);
+    char* buf = nullptr;
+    int len = 0;
+    PDFTokenizer extractor(in, out, buf, len, nesting_level);
 
     auto r = extractor.process();
 
+    delete[] buf;
+
     CHECK(ret == r);
     CHECK(expected == out.str());
 }
 
 static void test_pdf_proc(const vector<Chunk>& chunks)
 {
-    istringstream in;
+    snort::istreambuf_glue in_buf;
+    istream in(&in_buf);
     ostringstream out;
-    PDFTokenizer extractor(in, out, nesting_level);
+    char* state_buf = nullptr;
+    int state_len = 0;
+    PDFTokenizer extractor(in, out, state_buf, state_len, nesting_level);
 
     for (const auto& chunk : chunks)
     {
         auto src = chunk.first;
         auto exp = chunk.second;
 
-        in.str(src);
+        in_buf.pubsetbuf(nullptr,0)
+            ->pubsetbuf(state_buf, state_len)
+            ->pubsetbuf(const_cast<char*>(src.c_str()), src.length());
         out.str("");
 
         auto r = extractor.process();
@@ -68,17 +78,12 @@ static void test_pdf_proc(const vector<Chunk>& chunks)
         CHECK(PDFTokenizer::PDFRet::EOS == r);
         CHECK(exp == out.str());
     }
+
+    delete[] state_buf;
 }
 
 TEST_CASE("basic", "[PDFTokenizer]")
 {
-    SECTION("no input")
-    {
-        test_pdf_proc(
-            "",
-            ""
-        );
-    }
     SECTION("minimal PDF")
     {
         test_pdf_proc(
@@ -355,6 +360,45 @@ TEST_CASE("basic", "[PDFTokenizer]")
             "",  PDFTokenizer::PDFRet::NOT_NAME_IN_DICTIONARY_KEY
         );
     }
+    SECTION("number as a key")
+    {
+        test_pdf_proc(
+            "1 0 obj"
+            "<<"
+            "/K1 /V1"
+            "1234 /V2"
+            "/JS (foo)"
+            ">>"
+            "endobj",
+            "",  PDFTokenizer::PDFRet::NOT_NAME_IN_DICTIONARY_KEY
+        );
+    }
+    SECTION("number as a key after a value")
+    {
+        test_pdf_proc(
+            "1 0 obj"
+            "<<"
+            "/K1 null "
+            "1234 (bar)"
+            "/JS (foo)"
+            ">>"
+            "endobj",
+            "",  PDFTokenizer::PDFRet::NOT_NAME_IN_DICTIONARY_KEY
+        );
+    }
+    SECTION("value as a key after a number")
+    {
+        test_pdf_proc(
+            "1 0 obj"
+            "<<"
+            "/K1 1234 "
+            "null (bar) "
+            "/JS (foo)"
+            ">>"
+            "endobj",
+            "",  PDFTokenizer::PDFRet::NOT_NAME_IN_DICTIONARY_KEY
+        );
+    }
     SECTION("token too long")
     {
         test_pdf_proc(
@@ -779,14 +823,7 @@ TEST_CASE("JS processing", "[PDFTokenizer]")
 
 TEST_CASE("split", "[PDFTokenizer]")
 {
-    SECTION("no input")
-    {
-        test_pdf_proc({
-            {"", ""},
-            {"", ""},
-            {"", ""}
-        });
-    }
+
     SECTION("minimal PDF")
     {
         test_pdf_proc({
@@ -814,6 +851,319 @@ TEST_CASE("split", "[PDFTokenizer]")
             {"<</JS(script 3)>>\nendobj", "script 3"}
         });
     }
+
+    SECTION("split in indirect object index: first number")
+    {
+        test_pdf_proc({
+            {"\n1", ""},
+            {"23 0 obj\n", ""},
+            {"<< /JS (a % b) >>\nendobj\n", "a % b"},
+        });
+    }
+    SECTION("split in indirect object index: between numbers")
+    {
+        test_pdf_proc({
+            {"\n1", ""},
+            {" 0 obj\n", ""},
+            {"<< /JS (a % b) >>\nendobj\n", "a % b"},
+        });
+    }
+    SECTION("split in indirect object index: second number")
+    {
+        test_pdf_proc({
+            {"\n1 12", ""},
+            {"3 obj\n", ""},
+            {"<< /JS (a % b) >>\nendobj\n", "a % b"},
+        });
+    }
+    SECTION("split in indirect object index: after numbers")
+    {
+        test_pdf_proc({
+            {"\n1 0", ""},
+            {" obj\n", ""},
+            {"<< /JS (a % b) >>\nendobj\n", "a % b"},
+        });
+    }
+    SECTION("split in indirect object index: in keyword")
+    {
+        test_pdf_proc({
+            {"\n1 0 ob", ""},
+            {"j\n", ""},
+            {"<< /JS (a % b) >>\nendobj\n", "a % b"},
+        });
+    }
+    SECTION("split in indirect object index: multi-PDU reassembly")
+    {
+        test_pdf_proc({
+            {"\n1 ", ""},
+            {"0 ob", ""},
+            {"j\n<< /JS (a % b) >>\nendobj\n", "a % b"},
+        });
+    }
+    SECTION("split in indirect object close keyword")
+    {
+        test_pdf_proc({
+            {"\n1 0 obj\n", ""},
+            {"<< /K /V /JS (foo % bar) >>\nend", "foo % bar"},
+            {"obj\n", ""},
+            {"2 0 obj\n<</JS (c % d)>>\nendobj\n","c % d"}
+        });
+    }
+    SECTION("split in dictionary opening brackets")
+    {
+        test_pdf_proc({
+            {"\n1 0 obj\n", ""},
+            {"<", ""},
+            {"< /JS (a % b) >>\nendobj\n", "a % b"},
+        });
+    }
+    SECTION("split in dictionary closing brackets")
+    {
+        test_pdf_proc({
+            {"\n1 0 obj\n", ""},
+            {"<< /K /V /JS (foo % bar) >", "foo % bar"},
+            {">\nendobj\n", ""},
+            {"2 0 obj\n<</JS (c % d)>>\nendobj\n","c % d"}
+        });
+    }
+    SECTION("split in name as dict-key")
+    {
+        test_pdf_proc({
+            {"\n1 0 obj\n", ""},
+            {"<</K/V /J", ""},
+            {"S (a % b)>>\nendobj\n", "a % b"}
+        });
+    }
+    SECTION("split in name as dict-value")
+    {
+        test_pdf_proc({
+            {"\n1 0 obj\n", ""},
+            {"<</K/V /foo /foo", ""},
+            {"bar /JS (a % b)>>\nendobj\n", "a % b"}
+        });
+    }
+    SECTION("split after name as dict-value")
+    {
+        test_pdf_proc({
+            {"\n1 0 obj\n", ""},
+            {"<< /K /V ", ""},
+            {"/JS (a % b) >>\nendobj\n", "a % b"}
+        });
+    }
+    SECTION("split in object reference value: first number")
+    {
+        test_pdf_proc({
+            {"\n1 0 obj\n", ""},
+            {"<</K 12", ""},
+            {"3 0 R /JS (a % b)>>\nendobj\n", "a % b"}
+        });
+    }
+    SECTION("split in object reference value: second number")
+    {
+        test_pdf_proc({
+            {"\n1 0 obj\n", ""},
+            {"<</K 123 1", ""},
+            {"0 R /JS (a % b)>>\nendobj\n", "a % b"}
+        });
+    }
+    SECTION("split in integer number value")
+    {
+        test_pdf_proc({
+            {"\n1 0 obj\n", ""},
+            {"<</K 12", ""},
+            {"345 /JS (a % b)>>\nendobj\n", "a % b"}
+        });
+    }
+    SECTION("split in integer number value after sign")
+    {
+        test_pdf_proc({
+            {"\n1 0 obj\n", ""},
+            {"<</K +", ""},
+            {"1 /JS (a % b)>>\nendobj\n", "a % b"}
+        });
+    }
+    SECTION("split in real number value before the dot")
+    {
+        test_pdf_proc({
+            {"\n1 0 obj\n", ""},
+            {"<</K 123 0", ""},
+            {".5 /JS (a % b)>>\nendobj\n", "a % b"}
+        });
+    }
+    SECTION("split in real number value after the dot")
+    {
+        test_pdf_proc({
+            {"\n1 0 obj\n", ""},
+            {"<</K 123 0.", ""},
+            {"5 /JS (a % b)>>\nendobj\n", "a % b"}
+        });
+    }
+    SECTION("split in boolean value")
+    {
+        test_pdf_proc({
+            {"\n1 0 obj\n", ""},
+            {"<</K tr", ""},
+            {"ue /JS (a % b)>>\nendobj\n", "a % b"}
+        });
+    }
+    SECTION("split in null value")
+    {
+        test_pdf_proc({
+            {"\n1 0 obj\n", ""},
+            {"<</K nu", ""},
+            {"ll /JS (a % b)>>\nendobj\n", "a % b"}
+        });
+    }
+    SECTION("split in object reference to stream")
+    {
+        test_pdf_proc({
+            {"\n1 0 obj\n", ""},
+            {"<</K /V /JS 2 ", ""},
+            {" 0 R >>\nendobj\n", ""},
+            {
+                "2 0 obj\n"
+                "<< /Length 20 >>\n"
+                "stream\n"
+                "JavaScript in stream\n"
+                "endstream\n"
+                "endobj\n",
+                "JavaScript in stream\n"
+            }
+        });
+    }
+    SECTION("split in hex string")
+    {
+        test_pdf_proc({
+            {"\n1 0 obj\n", ""},
+            {"<< /JS <", ""},
+            {"66 6F 6F 20 62 61 72> >>\nendobj\n", "foo bar"},
+        });
+    }
+    SECTION("split in stream: length key")
+    {
+        test_pdf_proc({
+            {
+                "\n1 0 obj\n"
+                "<</K /V /JS 2 0 R>>\n"
+                "endobj\n"
+                "2 0 obj\n"
+                "<< /Leng",
+                ""
+            },
+            {
+                "th 20 >>\n"
+                "stream\n"
+                "JavaScript in stream\n"
+                "endstream\n"
+                "endobj\n",
+                "JavaScript in stream\n"
+            }
+        });
+    }
+    SECTION("split in stream: length value")
+    {
+        test_pdf_proc({
+            {
+                "\n1 0 obj\n"
+                "<</K /V /JS 2 0 R>>\n"
+                "endobj\n"
+                "2 0 obj\n"
+                "<< /Length 2",
+                ""
+            },
+            {
+                "0 >>\n"
+                "stream\n"
+                "JavaScript in stream\n"
+                "endstream\n"
+                "endobj\n",
+                "JavaScript in stream\n"
+            }
+        });
+    }
+    SECTION("split in stream: stream keyword")
+    {
+        test_pdf_proc({
+            {
+                "\n1 0 obj\n"
+                "<</K /V /JS 2 0 R>>\n"
+                "endobj\n"
+                "2 0 obj\n"
+                "<< /Length 20 >>\n st",
+                ""
+            },
+            {
+                "ream\n"
+                "JavaScript in stream\n"
+                "endstream\n"
+                "endobj\n",
+                "JavaScript in stream\n"
+            }
+        });
+    }
+    SECTION("split in stream: content")
+    {
+        test_pdf_proc({
+            {
+                "\n1 0 obj\n"
+                "<</K /V /JS 2 0 R>>\n"
+                "endobj\n"
+                "2 0 obj\n"
+                "<< /Length 20 >>\n stream\n"
+                "JavaScript",
+                "JavaScript"
+            },
+            {
+                " in stream\n"
+                "endstream\n"
+                "endobj\n",
+                " in stream\n"
+            }
+        });
+    }
+    SECTION("split in stream: content that looks like endstream")
+    {
+        test_pdf_proc({
+            {
+                "\n1 0 obj\n"
+                "<</K /V /JS 2 0 R>>\n"
+                "endobj\n"
+                "2 0 obj\n"
+                "<< /Length 23 >>\n stream\n"
+                "end",
+                ""
+            },
+            {
+                "stream in JavaScript\n"
+                "endstream\n"
+                "endobj\n",
+                "endstream in JavaScript\n"
+            }
+        });
+    }
+    SECTION("split in stream: endstream keyword")
+    {
+        test_pdf_proc({
+            {
+                "\n1 0 obj\n"
+                "<</K /V /JS 2 0 R>>\n"
+                "endobj\n"
+                "2 0 obj\n"
+                "<< /Length 20 >>\n stream\n"
+                "JavaScript in stream\n"
+                "end",
+                "JavaScript in stream\n"
+            },
+            {
+                "stream\n"
+                "endobj\n"
+                "\n3 0 obj\n"
+                "<</K /V /JS (foo)>>\n"
+                "endobj\n",
+                "foo"
+            }
+        });
+    }
 }
 
 TEST_CASE("stream object", "[PDFTokenizer]")
@@ -866,7 +1216,7 @@ TEST_CASE("stream object", "[PDFTokenizer]")
             "bar\r\n"
             "endstream\n"
             "endobj\n",
-            "bar\n"
+            "bar\r\n"
         );
     }
     SECTION("reference as length")
@@ -903,7 +1253,7 @@ TEST_CASE("stream object", "[PDFTokenizer]")
             "\nendstream\n \r\n"
             "endstream\n"
             "endobj\n",
-            "\nendstream\n \n"
+            "\nendstream\n \r\n"
         );
     }
     SECTION("referenced JavaScript")
@@ -1143,8 +1493,7 @@ TEST_CASE("stream object malformed", "[PDFTokenizer]")
             "foo"
             "endstream\n"
             "endobj\n",
-            "fooendstream\n"
-            "endobj\n", PDFTokenizer::PDFRet::EOS
+            "foo", PDFTokenizer::PDFRet::EOS
         );
     }
 }
@@ -1292,6 +1641,24 @@ TEST_CASE("UTF-16, basic", "[PDFTokenizer]")
             "foo"s
         );
     }
+    SECTION("stream with 'endstream' content")
+    {
+        test_pdf_proc(
+            "1 0 obj\n"
+            "<<"
+            "/S /JavaScript"
+            "/JS 2 0 R"
+            ">>\n"
+            "endobj\n"
+            "2 0 obj\n"
+            "<</Length 8>>\n"
+            "stream\n"
+            "\xfe\xff\0e\0n\0d\0s\0t\0r\0e\0a\0m\n"
+            "endstream\n"
+            "endobj"s,
+            "endstream"s
+        );
+    }
     SECTION("hexadecimal string")
     {
         test_pdf_proc(
@@ -1424,6 +1791,17 @@ TEST_CASE("UTF-16, basic", "[PDFTokenizer]")
             PDFTokenizer::PDFRet::UNEXPECTED_SYMBOL
         );
     }
+    SECTION("unfinished trailing symbol")
+    {
+        test_pdf_proc(
+            "1 0 obj\n"
+            "<<"
+            "/S /JavaScript"
+            "/JS (\xfe\xff\0f\0o\0o\xF0)"
+            "/JS (\xfe\xff\0b\0a\0r\xBA)"s,
+            "foobar"s
+        );
+    }
 }
 
 TEST_CASE("UTF-16, cross-PDU", "[PDFTokenizer]")
@@ -1524,4 +1902,48 @@ TEST_CASE("UTF-16, cross-PDU", "[PDFTokenizer]")
             }
         });
     }
+    SECTION("split in stream that looks like endstream")
+    {
+        test_pdf_proc({
+            {
+                "1 0 obj\n"
+                "<</S/JavaScript/JS 2 0 R>>\n"
+                "endobj\n"
+                "2 0 obj\n"
+                "<</Length 14>>\n"
+                "stream\n"
+                "\xfe\xff\0f\0o\0o\0e"s,
+                "foo"s
+            },
+            {
+                "\0n\0d\n"
+                "endstream\n"
+                "endobj"s,
+                "end"s
+            }
+        });
+    }
+    SECTION("split in endstream tag")
+    {
+        test_pdf_proc({
+            {
+                "1 0 obj\n"
+                "<</S/JavaScript/JS 2 0 R>>\n"
+                "endobj\n"
+                "2 0 obj\n"
+                "<</Length 14>>\n"
+                "stream\n"
+                "\xfe\xff\0f\0o\0o\nend"s,
+                "foo"s
+            },
+            {
+                "stream\n"
+                "endobj\n"
+                "3 0 obj\n"
+                "<</S/JavaScript/JS(foo)>>\n"
+                "endobj\n"s,
+                "foo"s
+            }
+        });
+    }
 }