git.ipfire.org Git - thirdparty/snort3.git/commitdiff
Merge pull request #3016 in SNORT/snort3 from ~OSHUMEIK/snort3:over_pdus to master
author Mike Stepanek (mstepane) <mstepane@cisco.com>
Thu, 19 Aug 2021 14:55:45 +0000 (14:55 +0000)
committer Mike Stepanek (mstepane) <mstepane@cisco.com>
Thu, 19 Aug 2021 14:55:45 +0000 (14:55 +0000)
Squashed commit of the following:

commit 2c30e5ef0968f45b98b9618342f5311b32146c97
Author: Oleksii Shumeiko <oshumeik@cisco.com>
Date:   Mon Jul 26 14:59:35 2021 +0300

    utils: support streamed processing of JS text

    Unit tests added.

    A custom streambuf is introduced so that the Normalizer sees the next chunk
    as a continuation of the previous one.

    The capacity of the tracking stack is set to 8, since the Lexer has a '.' pattern
    for identifiers, and a single character can form a complete token (in Lexer terms).
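
    For illustration only (not part of this commit), a minimal caller-side sketch
    of the streamed usage, assuming a JSIdentifierCtxBase implementation is
    available (the unit tests below provide one):

        #include "utils/js_identifier_ctx.h"
        #include "utils/js_normalizer.h"

        using namespace snort;

        static void normalize_two_chunks(JSIdentifierCtxBase& ident_ctx)
        {
            // depth and nesting limit as in the unit tests; the tail capacity
            // argument keeps its default (JSTOKENIZER_BUF_MAX_SIZE)
            JSNormalizer norm(ident_ctx, 65535, 4);

            const char chunk1[] = "var s = ";   // PDU ends mid-statement
            const char chunk2[] = "'string';";  // continuation in the next PDU
            char dst[512];

            // First chunk: the scan hits EOF mid-script, normalize() returns
            // JSTokenizer::SCRIPT_CONTINUE and saves the unfinished tail
            norm.normalize(chunk1, sizeof(chunk1) - 1, dst, sizeof(dst) - 1);

            // Second chunk: gluebuf splices the saved tail in front of the new
            // data, so scanning resumes as if the text had never been split
            norm.normalize(chunk2, sizeof(chunk2) - 1, dst, sizeof(dst) - 1);
        }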

commit 96f844e272943906c4373790c69f4236a8799be7
Author: Oleksii Shumeiko <oshumeik@cisco.com>
Date:   Mon Aug 9 14:52:08 2021 +0300

    utils: address compiler warning

commit 9511296dd877a85da574b146ef43689713369d41
Author: Oleksii Shumeiko <oshumeik@cisco.com>
Date:   Mon Jul 26 14:34:35 2021 +0300

    http_inspect: check if Normalizer has consumed input

    The inspector logic expects the Normalizer to consume input bytes. If it does not,
    normalization is aborted, since there is no other consumer available.

src/service_inspectors/http_inspect/http_js_norm.cc
src/utils/js_normalizer.cc
src/utils/js_normalizer.h
src/utils/js_tokenizer.h
src/utils/js_tokenizer.l
src/utils/test/js_normalizer_test.cc

src/service_inspectors/http_inspect/http_js_norm.cc
index 44f806d4bb74e7b7baee9fbc95682fd41a2e9814..e371d5f413a3ff5a7b1131879d88c9e4bb96feaf 100644 (file)
@@ -37,11 +37,14 @@ static inline JSTokenizer::JSRet js_normalize(JSNormalizer& ctx, const char* con
     const char* dst_end, const char*& ptr, char*& dst)
 {
     auto ret = ctx.normalize(ptr, end - ptr, dst, dst_end - dst);
-    
     auto next = ctx.get_src_next();
-    HttpModule::increment_peg_counts(PEG_JS_BYTES, next - ptr);
 
-    ptr = next;   
+    if (next > ptr)
+        HttpModule::increment_peg_counts(PEG_JS_BYTES, next - ptr);
+    else
+        next = end; // Normalizer has failed, thus aborting the remaining input
+
+    ptr = next;
     dst = ctx.get_dst_next();
 
     return ret;
src/utils/js_normalizer.cc
index 3687be6ced39afbc7c5a498124cecd49b82a0273..9e6067782f1a5e31f12b03732053efdd364e6814 100644 (file)
 #include "js_normalizer.h"
 
 using namespace snort;
+using namespace std;
 
 JSNormalizer::JSNormalizer(JSIdentifierCtxBase& js_ident_ctx, size_t norm_depth,
-    uint8_t max_template_nesting)
+    uint8_t max_template_nesting, int tmp_cap_size)
     : depth(norm_depth),
       rem_bytes(norm_depth),
-      unlim(norm_depth == (size_t) - 1),
+      unlim(norm_depth == static_cast<size_t>(-1)),
       src_next(nullptr),
       dst_next(nullptr),
-      tokenizer(in, out, js_ident_ctx, max_template_nesting)
+      tmp_buf(nullptr),
+      tmp_buf_size(0),
+      in(&in_buf),
+      out(&out_buf),
+      tokenizer(in, out, js_ident_ctx, max_template_nesting, tmp_buf, tmp_buf_size, tmp_cap_size)
 {
 }
 
+JSNormalizer::~JSNormalizer()
+{
+    delete[] tmp_buf;
+    tmp_buf = nullptr;
+    tmp_buf_size = 0;
+}
+
 JSTokenizer::JSRet JSNormalizer::normalize(const char* src, size_t src_len, char* dst, size_t dst_len)
 {
     if (rem_bytes == 0 && !unlim)
@@ -47,13 +59,14 @@ JSTokenizer::JSRet JSNormalizer::normalize(const char* src, size_t src_len, char
 
     size_t len = unlim ? src_len :
         src_len < rem_bytes ? src_len : rem_bytes;
-    in.rdbuf()->pubsetbuf(const_cast<char*>(src), len);
-    out.rdbuf()->pubsetbuf(dst, dst_len);
 
-    JSTokenizer::JSRet ret = (JSTokenizer::JSRet)tokenizer.yylex();
+    in_buf.pubsetbuf(tmp_buf, tmp_buf_size, const_cast<char*>(src), len);
+    out_buf.pubsetbuf(dst, dst_len);
+
+    JSTokenizer::JSRet ret = static_cast<JSTokenizer::JSRet>(tokenizer.yylex());
     in.clear();
     out.clear();
-    size_t r_bytes = in.tellg();
+    size_t r_bytes = in_buf.glued() ? static_cast<size_t>(in.tellg()) : 0;
     size_t w_bytes = out.tellp();
 
     if (!unlim)
src/utils/js_normalizer.h
index 84e58bc3f763bf249264cf2ad863a7581c6398c2..f0dd5896959ac6c605ff554f2eab0097cc14c41a 100644 (file)
 namespace snort
 {
 
+class gluebuf : public std::stringbuf
+{
+public:
+    gluebuf() :
+        std::stringbuf(), once(true),
+        src1(nullptr), len1(0), src2(nullptr), len2(0)
+    { }
+
+    std::streambuf* pubsetbuf(char* buf1, std::streamsize buf1_len,
+        char* buf2, std::streamsize buf2_len)
+    {
+        once = !(buf1 && buf1_len);
+
+        if (once)
+        {
+            setbuf(buf2, buf2_len);
+            current_src_len = buf2_len;
+        }
+        else
+        {
+            setbuf(buf1, buf1_len);
+            current_src_len = buf1_len;
+        }
+        src1 = buf1;
+        len1 = buf1_len;
+        src2 = buf2;
+        len2 = buf2_len;
+        return this;
+    }
+
+    bool glued() const
+    {
+        return once;
+    }
+
+protected:
+    virtual std::streampos seekoff(std::streamoff off,
+        std::ios_base::seekdir way, std::ios_base::openmode which) override
+    {
+        if (way != std::ios_base::end)
+            return std::stringbuf::seekoff(off, way, which);
+
+        if (current_src_len + off < 0 and once)
+        {
+            off += current_src_len;
+            once = false;
+            setbuf(src1, len1);
+            current_src_len = len1;
+        }
+
+        return std::stringbuf::seekoff(off, way, which);
+    }
+
+    virtual int underflow() override
+    {
+        if (once)
+            return EOF;
+
+        once = true;
+        setbuf(src2, len2);
+        current_src_len = len2;
+        return sgetc();
+    }
+
+private:
+    bool once;
+    std::streamsize current_src_len;
+    char* src1;
+    std::streamsize len1;
+    char* src2;
+    std::streamsize len2;
+};
+
 class JSNormalizer
 {
 public:
-    JSNormalizer(JSIdentifierCtxBase& js_ident_ctx, size_t depth, uint8_t max_template_nesting);
+    JSNormalizer(JSIdentifierCtxBase& js_ident_ctx, size_t depth,
+        uint8_t max_template_nesting, int tmp_cap_size = JSTOKENIZER_BUF_MAX_SIZE);
+    ~JSNormalizer();
 
     const char* get_src_next() const
     { return src_next; }
@@ -54,12 +129,16 @@ private:
     const char* src_next;
     char* dst_next;
 
-    std::stringstream in;
-    std::stringstream out;
+    char* tmp_buf;
+    size_t tmp_buf_size;
+
+    gluebuf in_buf;
+    std::stringbuf out_buf;
+    std::istream in;
+    std::ostream out;
     JSTokenizer tokenizer;
 };
 
 }
 
 #endif //JS_NORMALIZER_H
-
src/utils/js_tokenizer.h
index 3bb13a99fb01a998b5af388db7548cfba795dae2..c6c3bc1f0aea069e403bd78484acffe8ce51eae5 100644 (file)
 
 #include "log/messages.h"
 
+// The longest pattern has 9 characters " < / s c r i p t > ",
+// 8 of them can reside in 1st chunk
+// Each character in the identifier forms its own group (pattern matching case),
+// i.e. in the current implementation IDENTIFIER has " . " rule.
+#define JSTOKENIZER_MAX_STATES 8
+
+// To hold potentially long identifiers
+#define JSTOKENIZER_BUF_MAX_SIZE 256
+
 class JSIdentifierCtxBase;
 
 class JSTokenizer : public yyFlexLexer
@@ -55,7 +64,9 @@ public:
         TEMPLATE_NESTING_OVERFLOW
     };
 
-    JSTokenizer(std::istream& in, std::ostream& out, JSIdentifierCtxBase& ident_ctx, uint8_t max_template_nesting);
+    JSTokenizer(std::istream& in, std::ostream& out, JSIdentifierCtxBase& ident_ctx,
+        uint8_t max_template_nesting, char*& buf, size_t& buf_size,
+        int cap_size = JSTOKENIZER_BUF_MAX_SIZE);
     ~JSTokenizer() override;
 
     // returns JSRet
@@ -77,7 +88,10 @@ private:
     void process_closing_bracket();
     JSRet process_subst_open();
 
-private:
+    void states_push();
+    void states_apply();
+    void states_correct(int);
+
     void* cur_buffer;
     void* tmp_buffer = nullptr;
     std::stringstream tmp;
@@ -85,6 +99,18 @@ private:
     std::stack<uint16_t, std::vector<uint16_t>> bracket_depth;
     JSToken token = UNDEFINED;
     JSIdentifierCtxBase& ident_ctx;
+
+    struct
+    {
+        JSToken token = UNDEFINED;          // the token before
+        int length = 0;                     // current token length
+        int sc = 0;                         // current Starting Condition
+    } states[JSTOKENIZER_MAX_STATES];
+    int sp = 0;                             // points to the top of states
+
+    char*& tmp_buf;
+    size_t& tmp_buf_size;
+    const int tmp_cap_size;
 };
 
 #endif // JS_TOKENIZER_H
src/utils/js_tokenizer.l
index 11972f120157356bea7e47c98a0614708a062b97..d2cb3e04c294ce4b293dbdb44a9f0534047ccc05 100644 (file)
 %option c++
 
 %{
-    #ifdef HAVE_CONFIG_H
-    #include "config.h"
-    #endif
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
 
-    #include "utils/js_identifier_ctx.h"
-    #include "utils/js_tokenizer.h"
+#include "utils/js_tokenizer.h"
 
-    #include <cassert>
+#include <cassert>
 
-    #include "utils/util_cstring.h"
+#include "utils/js_identifier_ctx.h"
+#include "utils/util_cstring.h"
 
-    #define EXEC(f) { auto r = (f); if (r) { BEGIN(regst); return r; } }
+#define YY_USER_ACTION { states_push(); }
+#define EXEC(f) { auto r = (f); if (r) { BEGIN(regst); return r; } }
+#define EEOF(f) { auto r = (f); if (r) { if (r != SCRIPT_CONTINUE) BEGIN(regst); return r; } }
 %}
 
 /* The following grammar was created based on ECMAScript specification */
@@ -881,12 +883,15 @@ LITERAL_HEX_INTEGER           0x[0-9a-fA-F]*|0X[0-9a-fA-F]*
 LITERAL_DQ_STRING_START       \"
 LITERAL_DQ_STRING_END         \"
 LITERAL_DQ_STRING_SKIP        \\\"
+LITERAL_DQ_STRING_TEXT        .
 LITERAL_SQ_STRING_START       \'
 LITERAL_SQ_STRING_END         \'
 LITERAL_SQ_STRING_SKIP        \\\'
+LITERAL_SQ_STRING_TEXT        .
 LITERAL_TEMPLATE_START        \`
 LITERAL_TEMPLATE_END          \`
 LITERAL_TEMPLATE_SUBST_START  \$\{
+LITERAL_TEMPLATE_OTHER        .
 LITERAL_REGEX_START           \/[^*\/]
 LITERAL_REGEX_END             \/[gimsuy]*
 LITERAL_REGEX_SKIP            \\\/
@@ -946,14 +951,14 @@ ALL_UNICODE    [\0-\x7F]|[\xC2-\xDF][\x80-\xBF]|(\xE0[\xA0-\xBF]|[\xE1-\xEF][\x8
 <lcomm>{LINE_COMMENT_END3}          { BEGIN(regst); return OPENING_TAG; }
 <lcomm>{LINE_COMMENT_END4}          { BEGIN(regst); return CLOSING_TAG; }
 <lcomm>{LINE_COMMENT_SKIP}          { }
-<lcomm><<EOF>>                      { return SCRIPT_CONTINUE; }
+<lcomm><<EOF>>                      { states_apply(); return SCRIPT_CONTINUE; }
 
        {BLOCK_COMMENT_START}        { BEGIN(bcomm); }
 <bcomm>{BLOCK_COMMENT_END1}         { BEGIN(regst); }
 <bcomm>{BLOCK_COMMENT_END2}         { BEGIN(regst); return OPENING_TAG; }
 <bcomm>{BLOCK_COMMENT_END3}         { BEGIN(regst); return CLOSING_TAG; }
 <bcomm>{BLOCK_COMMENT_SKIP}         { }
-<bcomm><<EOF>>                      { return SCRIPT_CONTINUE; }
+<bcomm><<EOF>>                      { states_apply(); return SCRIPT_CONTINUE; }
 
        {LITERAL_DQ_STRING_START}    { EXEC(do_spacing(LITERAL)) ECHO; BEGIN(dqstr); }
 <dqstr>{LITERAL_DQ_STRING_END}      { ECHO; BEGIN(divop); }
@@ -963,8 +968,8 @@ ALL_UNICODE    [\0-\x7F]|[\xC2-\xDF][\x80-\xBF]|(\xE0[\xA0-\xBF]|[\xE1-\xEF][\x8
 <dqstr>\\{CR}                       { }
 <dqstr>{LINE_TERMINATORS}           { BEGIN(regst); return BAD_TOKEN; }
 <dqstr>{LITERAL_DQ_STRING_SKIP}     { ECHO; }
-<dqstr>.                            { ECHO; }
-<dqstr><<EOF>>                      { return SCRIPT_CONTINUE; }
+<dqstr>{LITERAL_DQ_STRING_TEXT}     { ECHO; }
+<dqstr><<EOF>>                      { states_apply(); return SCRIPT_CONTINUE; }
 
        {LITERAL_SQ_STRING_START}    { EXEC(do_spacing(LITERAL)) ECHO; BEGIN(sqstr); }
 <sqstr>{LITERAL_SQ_STRING_END}      { ECHO; BEGIN(divop); }
@@ -974,10 +979,10 @@ ALL_UNICODE    [\0-\x7F]|[\xC2-\xDF][\x80-\xBF]|(\xE0[\xA0-\xBF]|[\xE1-\xEF][\x8
 <sqstr>\\{CR}                       { }
 <sqstr>{LINE_TERMINATORS}           { BEGIN(regst); return BAD_TOKEN; }
 <sqstr>{LITERAL_SQ_STRING_SKIP}     { ECHO; }
-<sqstr>.                            { ECHO; }
-<sqstr><<EOF>>                      { return SCRIPT_CONTINUE; }
+<sqstr>{LITERAL_SQ_STRING_TEXT}     { ECHO; }
+<sqstr><<EOF>>                      { states_apply(); return SCRIPT_CONTINUE; }
 
-{OPEN_BRACKET}                      { if (not bracket_depth.empty()) bracket_depth.top()++; process_punctuator(); }
+{OPEN_BRACKET}                      { if (!bracket_depth.empty()) bracket_depth.top()++; process_punctuator(); }
 {CLOSE_BRACKET}                     { process_closing_bracket(); }
 
        {LITERAL_TEMPLATE_START}                  { EXEC(do_spacing(LITERAL)) ECHO; BEGIN(tmpll); }
@@ -986,10 +991,10 @@ ALL_UNICODE    [\0-\x7F]|[\xC2-\xDF][\x80-\xBF]|(\xE0[\xA0-\xBF]|[\xE1-\xEF][\x8
 <tmpll>{HTML_TAG_SCRIPT_CLOSE}                   { BEGIN(regst); return CLOSING_TAG; }
 <tmpll>(\\\\)*\\{LITERAL_TEMPLATE_SUBST_START}   | /* escaped template substitution */
 <tmpll>(\\\\)*\\{LITERAL_TEMPLATE_END}           | /* escaped backtick */
-<tmpll>.                                         { ECHO; }
+<tmpll>{LITERAL_TEMPLATE_OTHER}                  { ECHO; }
 <tmpll><<EOF>>                                   { return SCRIPT_CONTINUE; }
 
-<regst>{LITERAL_REGEX_START}        { EXEC(do_spacing(LITERAL)) yyout << '/'; yyless(1); BEGIN(regex); }
+<regst>{LITERAL_REGEX_START}        { EXEC(do_spacing(LITERAL)) yyout << '/'; states_correct(1); yyless(1); BEGIN(regex); }
 <regex>{LITERAL_REGEX_END}          { ECHO; BEGIN(divop); }
 <regex>{HTML_TAG_SCRIPT_CLOSE}      { BEGIN(regst); return CLOSING_TAG; }
 <regex>{LITERAL_REGEX_SKIP}         { ECHO; }
@@ -997,7 +1002,7 @@ ALL_UNICODE    [\0-\x7F]|[\xC2-\xDF][\x80-\xBF]|(\xE0[\xA0-\xBF]|[\xE1-\xEF][\x8
 <regex>\\{CR}                       |
 <regex>{LINE_TERMINATORS}           { BEGIN(regst); return BAD_TOKEN; }
 <regex>[^<{LF}{CR}{LS}{PS}\\\/]+    { ECHO; }
-<regex><<EOF>>                      { return SCRIPT_CONTINUE; }
+<regex><<EOF>>                      { states_apply(); return SCRIPT_CONTINUE; }
 
 <divop>{DIV_OPERATOR}               |
 <divop>{DIV_ASSIGNMENT_OPERATOR}    { ECHO; token = PUNCTUATOR; BEGIN(INITIAL); }
@@ -1013,7 +1018,7 @@ ALL_UNICODE    [\0-\x7F]|[\xC2-\xDF][\x80-\xBF]|(\xE0[\xA0-\xBF]|[\xE1-\xEF][\x8
 {IDENTIFIER}                        { if (unescape(YYText())) { EXEC(do_spacing(IDENTIFIER)) EXEC(do_identifier_substitution(YYText())) } BEGIN(divop); }
 
 .|{ALL_UNICODE}                     { ECHO; token = UNDEFINED; BEGIN(INITIAL); }
-<<EOF>>                             { EXEC(eval_eof()) }
+<<EOF>>                             { EEOF(eval_eof()) }
 
 %%
 
@@ -1023,18 +1028,18 @@ static std::string unicode_to_utf8(const unsigned int code)
 {
     std::string res;
 
-    if ( code <= 0x7f )
+    if (code <= 0x7f)
         res += (char)code;
-    else if ( code <= 0x7ff )
+    else if (code <= 0x7ff)
     {
-        res += ( 0xc0 | (code >> 6) );
-        res += ( 0x80 | (code & 0x3f) );
+        res += 0xc0 | (code >> 6);
+        res += 0x80 | (code & 0x3f);
     }
-    else if ( code <= 0xffff )
+    else if (code <= 0xffff)
     {
-        res += ( 0xe0 | (code >> 12) );
-        res += ( 0x80 | ((code >> 6) & 0x3f) );
-        res += ( 0x80 | (code & 0x3f) );
+        res += 0xe0 | (code >> 12);
+        res += 0x80 | ((code >> 6) & 0x3f);
+        res += 0x80 | (code & 0x3f);
     }
 
     return res;
@@ -1052,17 +1057,17 @@ static std::string unescape_unicode(const char* lexeme)
     short digits_left = 4;
     std::string unicode_str;
 
-    for ( const auto& ch : lex )
+    for (const auto& ch : lex)
     {
-        if ( ch == '\\' )
+        if (ch == '\\')
         {
             is_unescape = true;
             continue;
         }
 
-        if ( is_unescape )
+        if (is_unescape)
         {
-            if ( ch == 'u' )
+            if (ch == 'u')
             {
                 is_unicode = true;
                 continue;
@@ -1070,10 +1075,10 @@ static std::string unescape_unicode(const char* lexeme)
             is_unescape = false;
         }
 
-        if ( is_unicode )
+        if (is_unicode)
         {
             unicode_str += ch;
-            if ( !(--digits_left) )
+            if (!(--digits_left))
             {
                 const unsigned int unicode = std::stoi(unicode_str, nullptr, 16);
                 res += unicode_to_utf8(unicode);
@@ -1093,11 +1098,15 @@ static std::string unescape_unicode(const char* lexeme)
 
 // JSTokenizer members
 
-JSTokenizer::JSTokenizer(std::istream& in, std::ostream& out, JSIdentifierCtxBase& ident_ctx,
-    uint8_t max_template_nesting)
+JSTokenizer::JSTokenizer(std::istream& in, std::ostream& out,
+    JSIdentifierCtxBase& mapper, uint8_t max_template_nesting,
+    char*& buf, size_t& buf_size, int cap_size)
     : yyFlexLexer(in, out),
       max_template_nesting(max_template_nesting),
-      ident_ctx(ident_ctx)
+      ident_ctx(mapper),
+      tmp_buf(buf),
+      tmp_buf_size(buf_size),
+      tmp_cap_size(cap_size)
 {
     BEGIN(regst);
 }
@@ -1105,6 +1114,9 @@ JSTokenizer::JSTokenizer(std::istream& in, std::ostream& out, JSIdentifierCtxBas
 JSTokenizer::~JSTokenizer()
 {
     yy_delete_buffer((YY_BUFFER_STATE)tmp_buffer);
+    delete[] tmp_buf;
+    tmp_buf = nullptr;
+    tmp_buf_size = 0;
 }
 
 void JSTokenizer::switch_to_temporal(const std::string& data)
@@ -1130,13 +1142,15 @@ JSTokenizer::JSRet JSTokenizer::eval_eof()
 {
     // If the temporal scan buffer reaches EOF, cleanup and
     // continue with the initial one
-    if ( tmp_buffer )
+    if (tmp_buffer)
     {
         switch_to_initial();
         return EOS;
     }
 
     // Normal termination
+    states_apply();
+
     return SCRIPT_CONTINUE;
 }
 
@@ -1203,7 +1217,7 @@ JSTokenizer::JSRet JSTokenizer::do_identifier_substitution(const char* lexeme)
 
 bool JSTokenizer::unescape(const char* lexeme)
 {
-    if ( strstr(lexeme, "\\u") )
+    if (strstr(lexeme, "\\u"))
     {
         const std::string unescaped_lex = unescape_unicode(lexeme);
         switch_to_temporal(unescaped_lex);
@@ -1222,9 +1236,9 @@ void JSTokenizer::process_punctuator()
 
 void JSTokenizer::process_closing_bracket()
 {
-    if ( not bracket_depth.empty() ) 
+    if (!bracket_depth.empty())
     {
-        if ( bracket_depth.top() )
+        if (bracket_depth.top())
             bracket_depth.top()--;
         else
         {
@@ -1239,11 +1253,63 @@ void JSTokenizer::process_closing_bracket()
 
 JSTokenizer::JSRet JSTokenizer::process_subst_open()
 {
-    if ( bracket_depth.size() >= max_template_nesting )
+    if (bracket_depth.size() >= max_template_nesting)
         return TEMPLATE_NESTING_OVERFLOW;
     bracket_depth.push(0);
     token = PUNCTUATOR;
     ECHO;
-    BEGIN(divop); 
+    BEGIN(divop);
     return EOS;
-}
\ No newline at end of file
+}
+
+void JSTokenizer::states_push()
+{
+    assert(yyleng != 0);
+
+    sp++;
+    sp %= JSTOKENIZER_MAX_STATES;
+    auto& state = states[sp];
+
+    state.token = token;
+    state.length = yyleng;
+    state.sc = yy_start;
+}
+
+void JSTokenizer::states_correct(int take_off)
+{
+    auto& state = states[sp];
+    state.length -= yyleng - take_off;
+}
+
+void JSTokenizer::states_apply()
+{
+    int tail_size = 0;
+
+    for (int i = JSTOKENIZER_MAX_STATES; i > 0 && tail_size < tmp_cap_size; --i)
+    {
+        auto idx = sp + i;
+        idx %= JSTOKENIZER_MAX_STATES;
+        auto& state = states[idx];
+
+        if (state.length == 0)
+            continue;
+
+        token = state.token;
+        yy_start = state.sc;
+        tail_size += state.length;
+        tail_size = tail_size < tmp_cap_size ? tail_size : tmp_cap_size;
+    }
+
+    for (int i = 0; i < JSTOKENIZER_MAX_STATES; ++i)
+        states[i].length = 0;
+
+    char* buf = new char[tail_size];
+
+    yyin.seekg(-tail_size, std::ios_base::end);
+    yyin.clear();
+    yyin.read(buf, tail_size);
+
+    delete[] tmp_buf;
+    tmp_buf = buf;
+    tmp_buf_size = tail_size;
+}
src/utils/test/js_normalizer_test.cc
index 79fbb9278c211421aa1dec7a75c1ec134a34a6c3..00cfa6b16be9e087d3900af06f42db657ae2de28 100644 (file)
@@ -43,13 +43,14 @@ public:
     const char* substitute(const char* identifier) override
     { return identifier; }
     void reset() override {}
-    size_t size() const override {}
+    size_t size() const override { return 0; }
 };
 
 using namespace snort;
 
 #define DEPTH 65535
 #define MAX_TEMPLATE_NESTNIG 4
+#define DST_SIZE 512
 
 #define NORMALIZE(src, expected)                                   \
     char dst[sizeof(expected)];                                    \
@@ -80,6 +81,121 @@ using namespace snort;
         len = norm.get_dst_next() - dst;                              \
     }
 
+#define DO(src, slen, dst, dlen)                            \
+    {                                                       \
+        auto ret = norm.normalize(src, slen, dst, dlen);    \
+        CHECK(ret == JSTokenizer::SCRIPT_CONTINUE);         \
+        auto nsrc = norm.get_src_next();                    \
+        auto ndst = norm.get_dst_next();                    \
+        REQUIRE(nsrc - src == slen);                        \
+        REQUIRE(ndst - dst == dlen);                        \
+    }
+
+#define TRY(src, slen, dst, dlen, rexp)                     \
+    {                                                       \
+        auto ret = norm.normalize(src, slen, dst, dlen);    \
+        CHECK(ret == rexp);                                 \
+        auto ndst = norm.get_dst_next();                    \
+        REQUIRE(ndst - dst == dlen);                        \
+    }
+
+#define CLOSE()                                                         \
+    {                                                                   \
+        const char end[] = "</script>";                                 \
+        char dst[DST_SIZE];                                             \
+        auto ret = norm.normalize(end, sizeof(end) - 1, dst, sizeof(dst) - 1); \
+        CHECK(ret == JSTokenizer::SCRIPT_ENDED);                        \
+    }
+
+#define NORMALIZE_2(src1, src2, exp1, exp2)                         \
+    {                                                               \
+        char dst1[sizeof(exp1)];                                    \
+        char dst2[sizeof(exp2)];                                    \
+                                                                    \
+        JSIdentifierCtxTest ident_ctx;                              \
+        JSNormalizer norm(ident_ctx, DEPTH, MAX_TEMPLATE_NESTNIG);  \
+                                                                    \
+        DO(src1, sizeof(src1) - 1, dst1, sizeof(dst1) - 1);         \
+        CHECK(!memcmp(exp1, dst1, sizeof(exp1) - 1));               \
+                                                                    \
+        DO(src2, sizeof(src2) - 1, dst2, sizeof(dst2) - 1);         \
+        CHECK(!memcmp(exp2, dst2, sizeof(exp2) - 1));               \
+                                                                    \
+        CLOSE();                                                    \
+    }
+
+#define NORMALIZE_3(src1, src2, src3, exp1, exp2, exp3)             \
+    {                                                               \
+        char dst1[sizeof(exp1)];                                    \
+        char dst2[sizeof(exp2)];                                    \
+        char dst3[sizeof(exp3)];                                    \
+                                                                    \
+        JSIdentifierCtxTest ident_ctx;                              \
+        JSNormalizer norm(ident_ctx, DEPTH, MAX_TEMPLATE_NESTNIG);  \
+                                                                    \
+        DO(src1, sizeof(src1) - 1, dst1, sizeof(dst1) - 1);         \
+        CHECK(!memcmp(exp1, dst1, sizeof(exp1) - 1));               \
+                                                                    \
+        DO(src2, sizeof(src2) - 1, dst2, sizeof(dst2) - 1);         \
+        CHECK(!memcmp(exp2, dst2, sizeof(exp2) - 1));               \
+                                                                    \
+        DO(src3, sizeof(src3) - 1, dst3, sizeof(dst3) - 1);         \
+        CHECK(!memcmp(exp3, dst3, sizeof(exp3) - 1));               \
+                                                                    \
+        CLOSE();                                                    \
+    }
+
+#define NORM_BAD_2(src1, src2, exp1, exp2, code)                    \
+    {                                                               \
+        char dst1[sizeof(exp1)];                                    \
+        char dst2[sizeof(exp2)];                                    \
+                                                                    \
+        JSIdentifierCtxTest ident_ctx;                              \
+        JSNormalizer norm(ident_ctx, DEPTH, MAX_TEMPLATE_NESTNIG);  \
+                                                                    \
+        DO(src1, sizeof(src1) - 1, dst1, sizeof(dst1) - 1);         \
+        CHECK(!memcmp(exp1, dst1, sizeof(exp1) - 1));               \
+                                                                    \
+        TRY(src2, sizeof(src2) - 1, dst2, sizeof(dst2) - 1, code);  \
+        CHECK(!memcmp(exp2, dst2, sizeof(exp2) - 1));               \
+    }
+
+#define NORM_BAD_3(src1, src2, src3, exp1, exp2, exp3, code)        \
+    {                                                               \
+        char dst1[sizeof(exp1)];                                    \
+        char dst2[sizeof(exp2)];                                    \
+        char dst3[sizeof(exp3)];                                    \
+                                                                    \
+        JSIdentifierCtxTest ident_ctx;                              \
+        JSNormalizer norm(ident_ctx, DEPTH, MAX_TEMPLATE_NESTNIG);  \
+                                                                    \
+        DO(src1, sizeof(src1) - 1, dst1, sizeof(dst1) - 1);         \
+        CHECK(!memcmp(exp1, dst1, sizeof(exp1) - 1));               \
+                                                                    \
+        DO(src2, sizeof(src2) - 1, dst2, sizeof(dst2) - 1);         \
+        CHECK(!memcmp(exp2, dst2, sizeof(exp2) - 1));               \
+                                                                    \
+        TRY(src3, sizeof(src3) - 1, dst3, sizeof(dst3) - 1, code);  \
+        CHECK(!memcmp(exp3, dst3, sizeof(exp3) - 1));               \
+    }
+
+#define NORM_LIMITED(limit, src1, src2, exp1, exp2)                     \
+    {                                                                   \
+        char dst1[sizeof(exp1)];                                        \
+        char dst2[sizeof(exp2)];                                        \
+                                                                        \
+        JSIdentifierCtxTest ident_ctx;                                  \
+        JSNormalizer norm(ident_ctx, DEPTH, MAX_TEMPLATE_NESTNIG, limit); \
+                                                                        \
+        DO(src1, sizeof(src1) - 1, dst1, sizeof(dst1) - 1);             \
+        CHECK(!memcmp(exp1, dst1, sizeof(exp1) - 1));                   \
+                                                                        \
+        DO(src2, sizeof(src2) - 1, dst2, sizeof(dst2) - 1);             \
+        CHECK(!memcmp(exp2, dst2, sizeof(exp2) - 1));                   \
+                                                                        \
+        CLOSE();                                                        \
+    }
+
 // ClamAV test cases
 static const char clamav_buf0[] =
     "function foo(a, b) {\n"
@@ -1292,3 +1408,457 @@ TEST_CASE("nested script tags", "[JSNormalizer]")
     }
 }
 
+TEST_CASE("split between tokens", "[JSNormalizer]")
+{
+    SECTION("operator string")
+    {
+        const char dat1[] = "var s = ";
+        const char dat2[] = "'string';";
+        const char exp1[] = "var s=";
+        const char exp2[] = "var s='string';";
+
+        NORMALIZE_2(dat1, dat2, exp1, exp2);
+    }
+    SECTION("operator number")
+    {
+        const char dat1[] = "a = 5 +";
+        const char dat2[] = "b + c;";
+        const char exp1[] = "a=5+";
+        const char exp2[] = "a=5+b+c;";
+
+        NORMALIZE_2(dat1, dat2, exp1, exp2);
+    }
+    SECTION("comment function")
+    {
+        const char dat1[] = "// no comments\n";
+        const char dat2[] = "foo(bar, baz);";
+        const char exp1[] = "";
+        const char exp2[] = "foo(bar,baz);";
+
+        NORMALIZE_2(dat1, dat2, exp1, exp2);
+    }
+    SECTION("operator identifier")
+    {
+        const char dat1[] = "var ";
+        const char dat2[] = "a = ";
+        const char dat3[] = "b  ;";
+        const char exp1[] = "var";
+        const char exp2[] = "var a=";
+        const char exp3[] = "var a=b;";
+
+        NORMALIZE_3(dat1, dat2, dat3, exp1, exp2, exp3);
+    }
+}
+
+TEST_CASE("split in comments", "[JSNormalizer]")
+{
+    SECTION("/ /")
+    {
+        const char dat1[] = "/";
+        const char dat2[] = "/comment\n";
+        const char exp1[] = "/";
+        const char exp2[] = "";
+
+        NORMALIZE_2(dat1, dat2, exp1, exp2);
+    }
+    SECTION("/ / msg")
+    {
+        const char dat1[] = "//";
+        const char dat2[] = "comment\n";
+        const char exp1[] = "";
+        const char exp2[] = "";
+
+        NORMALIZE_2(dat1, dat2, exp1, exp2);
+    }
+    SECTION("/ / LF")
+    {
+        const char dat1[] = "//comment";
+        const char dat2[] = "\n";
+        const char exp1[] = "";
+        const char exp2[] = "";
+
+        NORMALIZE_2(dat1, dat2, exp1, exp2);
+    }
+
+    SECTION("/ *")
+    {
+        const char dat1[] = "/";
+        const char dat2[] = "* comment */";
+        const char exp1[] = "/";
+        const char exp2[] = "";
+
+        NORMALIZE_2(dat1, dat2, exp1, exp2);
+    }
+    SECTION("/ * msg")
+    {
+        const char dat1[] = "/* t";
+        const char dat2[] = "ext */";
+        const char exp1[] = "";
+        const char exp2[] = "";
+
+        NORMALIZE_2(dat1, dat2, exp1, exp2);
+    }
+    SECTION("* /")
+    {
+        const char dat1[] = "/* comment *";
+        const char dat2[] = "/";
+        const char exp1[] = "";
+        const char exp2[] = "";
+
+        NORMALIZE_2(dat1, dat2, exp1, exp2);
+    }
+    SECTION("/ * msg * /")
+    {
+        const char dat1[] = "/";
+        const char dat2[] = "* comment *";
+        const char dat3[] = "/";
+        const char exp1[] = "/";
+        const char exp2[] = "";
+        const char exp3[] = "";
+
+        NORMALIZE_3(dat1, dat2, dat3, exp1, exp2, exp3);
+    }
+
+    SECTION("< !--")
+    {
+        const char dat1[] = "<";
+        const char dat2[] = "!-- comment\n";
+        const char exp1[] = "<";
+        const char exp2[] = "";
+
+        NORMALIZE_2(dat1, dat2, exp1, exp2);
+    }
+    SECTION("<! --")
+    {
+        const char dat1[] = "<!";
+        const char dat2[] = "-- comment\n";
+        const char exp1[] = "<!";
+        const char exp2[] = "";
+
+        NORMALIZE_2(dat1, dat2, exp1, exp2);
+    }
+    SECTION("<!- -")
+    {
+        const char dat1[] = "<!-";
+        const char dat2[] = "- comment\n";
+        const char exp1[] = "<!-";
+        const char exp2[] = "";
+
+        NORMALIZE_2(dat1, dat2, exp1, exp2);
+    }
+    SECTION("<!-- msg")
+    {
+        const char dat1[] = "<!--";
+        const char dat2[] = "comment\n";
+        const char exp1[] = "";
+        const char exp2[] = "";
+
+        NORMALIZE_2(dat1, dat2, exp1, exp2);
+    }
+    SECTION("<! -- msg")
+    {
+        const char dat1[] = "<";
+        const char dat2[] = "!-";
+        const char dat3[] = "-comment\n";
+        const char exp1[] = "<";
+        const char exp2[] = "<!-";
+        const char exp3[] = "";
+
+        NORMALIZE_3(dat1, dat2, dat3, exp1, exp2, exp3);
+    }
+}
+
+TEST_CASE("split in opening tag", "[JSNormalizer]")
+{
+    SECTION("< script")
+    {
+        const char dat1[] = "<";
+        const char dat2[] = "script";
+        const char exp1[] = "<";
+        const char exp2[] = "";
+
+        NORM_BAD_2(dat1, dat2, exp1, exp2, JSTokenizer::OPENING_TAG);
+    }
+    SECTION("str='<s cript'")
+    {
+        const char dat1[] = "var str ='<s";
+        const char dat2[] = "cript';";
+        const char exp1[] = "var str='<s";
+        const char exp2[] = "var str='";
+
+        NORM_BAD_2(dat1, dat2, exp1, exp2, JSTokenizer::OPENING_TAG);
+    }
+    SECTION("str='<scrip t'")
+    {
+        const char dat1[] = "var str ='<scrip";
+        const char dat2[] = "t';";
+        const char exp1[] = "var str='<scrip";
+        const char exp2[] = "='";
+
+        NORM_BAD_2(dat1, dat2, exp1, exp2, JSTokenizer::OPENING_TAG);
+    }
+    SECTION("< scr ipt")
+    {
+        const char dat1[] = "<";
+        const char dat2[] = "scr";
+        const char dat3[] = "ipt";
+        const char exp1[] = "<";
+        const char exp2[] = "<scr";
+        const char exp3[] = "";
+
+        NORM_BAD_3(dat1, dat2, dat3, exp1, exp2, exp3, JSTokenizer::OPENING_TAG);
+    }
+    SECTION("str='<sc rip t'")
+    {
+        const char dat1[] = "var str =\"<sc";
+        const char dat2[] = "rip";
+        const char dat3[] = "t\";";
+        const char exp1[] = "var str=\"<sc";
+        const char exp2[] = " str=\"<scrip";
+        const char exp3[] = "=\"";
+
+        NORM_BAD_3(dat1, dat2, dat3, exp1, exp2, exp3, JSTokenizer::OPENING_TAG);
+    }
+}
+
+TEST_CASE("split in closing tag", "[JSNormalizer]")
+{
+    SECTION("< /script>")
+    {
+        const char dat1[] = "<";
+        const char dat2[] = "/script>";
+        const char exp1[] = "<";
+        const char exp2[] = "";
+
+        NORM_BAD_2(dat1, dat2, exp1, exp2, JSTokenizer::SCRIPT_ENDED);
+    }
+    SECTION("</script >")
+    {
+        const char dat1[] = "</script";
+        const char dat2[] = ">";
+        const char exp1[] = "</script";
+        const char exp2[] = "";
+
+        NORM_BAD_2(dat1, dat2, exp1, exp2, JSTokenizer::SCRIPT_ENDED);
+    }
+    SECTION("str='</ script>'")
+    {
+        const char dat1[] = "var str ='</";
+        const char dat2[] = "script>';";
+        const char exp1[] = "var str='</";
+        const char exp2[] = "var str='";
+
+        NORM_BAD_2(dat1, dat2, exp1, exp2, JSTokenizer::CLOSING_TAG);
+    }
+    SECTION("str='</scrip t>'")
+    {
+        const char dat1[] = "var str ='</scrip";
+        const char dat2[] = "t>';";
+        const char exp1[] = "var str='</scrip";
+        const char exp2[] = "'";
+
+        NORM_BAD_2(dat1, dat2, exp1, exp2, JSTokenizer::CLOSING_TAG);
+    }
+    SECTION("</ scr ipt>")
+    {
+        const char dat1[] = "</";
+        const char dat2[] = "scr";
+        const char dat3[] = "ipt>";
+        const char exp1[] = "</";
+        const char exp2[] = "</scr";
+        const char exp3[] = "";
+
+        NORM_BAD_3(dat1, dat2, dat3, exp1, exp2, exp3, JSTokenizer::SCRIPT_ENDED);
+    }
+    SECTION("str='</sc rip t>'")
+    {
+        const char dat1[] = "var str =\"</sc";
+        const char dat2[] = "rip";
+        const char dat3[] = "t>\";";
+        const char exp1[] = "var str=\"</sc";
+        const char exp2[] = " str=\"</scrip";
+        const char exp3[] = "\"";
+
+        NORM_BAD_3(dat1, dat2, dat3, exp1, exp2, exp3, JSTokenizer::CLOSING_TAG);
+    }
+}
+
+TEST_CASE("split in string literal", "[JSNormalizer]")
+{
+    SECTION("\\ LF")
+    {
+        const char dat1[] = "var str =\"any\\";
+        const char dat2[] = "\none\";";
+        const char exp1[] = "var str=\"any\\";
+        const char exp2[] = " str=\"anyone\";";
+
+        NORMALIZE_2(dat1, dat2, exp1, exp2);
+    }
+    SECTION("\\ CR")
+    {
+        const char dat1[] = "var str =\"any\\";
+        const char dat2[] = "\rone\";";
+        const char exp1[] = "var str=\"any\\";
+        const char exp2[] = " str=\"anyone\";";
+
+        NORMALIZE_2(dat1, dat2, exp1, exp2);
+    }
+    SECTION("\\CR LF")
+    {
+        const char dat1[] = "var str =\"any\\\r";
+        const char dat2[] = "\none\";";
+        const char exp1[] = "var str=\"any";
+        const char exp2[] = " str=\"anyone\";";
+
+        NORMALIZE_2(dat1, dat2, exp1, exp2);
+    }
+    SECTION("\\ CRLF")
+    {
+        const char dat1[] = "var str =\"any\\";
+        const char dat2[] = "\r\none\";";
+        const char exp1[] = "var str=\"any\\";
+        const char exp2[] = " str=\"anyone\";";
+
+        NORMALIZE_2(dat1, dat2, exp1, exp2);
+    }
+    SECTION("\\ \"")
+    {
+        const char dat1[] = "var str =\"any\\";
+        const char dat2[] = "\"one\";";
+        const char exp1[] = "var str=\"any\\";
+        const char exp2[] = " str=\"any\\\"one\";";
+
+        NORMALIZE_2(dat1, dat2, exp1, exp2);
+    }
+    SECTION("\\ \'")
+    {
+        const char dat1[] = "var str =\"any\\";
+        const char dat2[] = "\'one\";";
+        const char exp1[] = "var str=\"any\\";
+        const char exp2[] = " str=\"any\\\'one\";";
+
+        NORMALIZE_2(dat1, dat2, exp1, exp2);
+    }
+    SECTION("\\ u1234tx")
+    {
+        const char dat1[] = "var str =\"any\\";
+        const char dat2[] = "u1234tx\";";
+        const char exp1[] = "var str=\"any\\";
+        const char exp2[] = " str=\"any\\u1234tx\";";
+
+        NORMALIZE_2(dat1, dat2, exp1, exp2);
+    }
+    SECTION("\\u 1234tx")
+    {
+        const char dat1[] = "var str =\"any\\u";
+        const char dat2[] = "1234tx\";";
+        const char exp1[] = "var str=\"any\\u";
+        const char exp2[] = "=\"any\\u1234tx\";";
+
+        NORMALIZE_2(dat1, dat2, exp1, exp2);
+    }
+}
+
+TEST_CASE("split in identifier", "[JSNormalizer]")
+{
+    SECTION("abc def")
+    {
+        const char dat1[] = "var abc";
+        const char dat2[] = "def = 5";
+        const char exp1[] = "var abc";
+        const char exp2[] = "var abcdef=5";
+
+        NORMALIZE_2(dat1, dat2, exp1, exp2);
+    }
+    SECTION("abc def")
+    {
+        const char dat1[] = "var abc";
+        const char dat2[] = "def = 5";
+        const char exp1[] = "var abc";
+        const char exp2[] = "var abcdef=5";
+
+        NORMALIZE_2(dat1, dat2, exp1, exp2);
+    }
+    SECTION("long identifier")
+    {
+        const char dat1[] = "var res = something + long_id_starts_here";
+        const char dat2[] = "_long_id_ends_here;";
+        const char exp1[] = "var res=something+long_id_starts_here";
+        const char exp2[] = "=something+long_id_starts_here_long_id_ends_here;";
+
+        NORMALIZE_2(dat1, dat2, exp1, exp2);
+    }
+}
+
+TEST_CASE("split in keyword", "[JSNormalizer]")
+{
+    SECTION("finally")
+    {
+        const char dat1[] = "\nfin";
+        const char dat2[] = "ally;";
+        const char exp1[] = "fin";
+        const char exp2[] = "finally;";
+
+        NORMALIZE_2(dat1, dat2, exp1, exp2);
+    }
+    SECTION("in")
+    {
+        const char dat1[] = "i";
+        const char dat2[] = "n";
+        const char exp1[] = "i";
+        const char exp2[] = "in";
+
+        NORMALIZE_2(dat1, dat2, exp1, exp2);
+    }
+    SECTION("instanceof")
+    {
+        const char dat1[] = "in";
+        const char dat2[] = "stance";
+        const char dat3[] = "of";
+        const char exp1[] = "in";
+        const char exp2[] = "instance";
+        const char exp3[] = "instanceof";
+
+        NORMALIZE_3(dat1, dat2, dat3, exp1, exp2, exp3);
+    }
+}
+
+TEST_CASE("memcap", "[JSNormalizer]")
+{
+    SECTION("3 tokens")
+    {
+        const char dat1[] = "var abc=in";
+        const char dat2[] = "put;";
+        const char exp1[] = "var abc=in";
+        const char exp2[] = " abc=input;";
+
+        NORM_LIMITED(6, dat1, dat2, exp1, exp2);
+    }
+    SECTION("2 tokens and a half")
+    {
+        const char dat1[] = "var abc=in";
+        const char dat2[] = "put;";
+        const char exp1[] = "var abc=in";
+        const char exp2[] = " c=input;";
+
+        NORM_LIMITED(4, dat1, dat2, exp1, exp2);
+    }
+    SECTION("1 token")
+    {
+        const char dat1[] = "var abc=in";
+        const char dat2[] = "put;";
+        const char exp1[] = "var abc=in";
+        const char exp2[] = "input;";
+
+        NORM_LIMITED(2, dat1, dat2, exp1, exp2);
+    }
+    SECTION("a half")
+    {
+        const char dat1[] = "var abc=extract";
+        const char dat2[] = "// just a comment\n";
+        const char exp1[] = "var abc=extract";
+        const char exp2[] = "tract";
+
+        NORM_LIMITED(5, dat1, dat2, exp1, exp2);
+    }
+}