From: Mike Stepanek (mstepane)
Date: Thu, 19 Aug 2021 14:55:45 +0000 (+0000)
Subject: Merge pull request #3016 in SNORT/snort3 from ~OSHUMEIK/snort3:over_pdus to master
X-Git-Tag: 3.1.11.0~13
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=f8bc3e83452225f157185898c8e8bf8b71b6ec90;p=thirdparty%2Fsnort3.git

Merge pull request #3016 in SNORT/snort3 from ~OSHUMEIK/snort3:over_pdus to master

Squashed commit of the following:

commit 2c30e5ef0968f45b98b9618342f5311b32146c97
Author: Oleksii Shumeiko
Date:   Mon Jul 26 14:59:35 2021 +0300

    utils: support streamed processing of JS text

    Unit tests added. A custom streambuf is introduced so that the
    Normalizer sees the next chunk as a continuation of the previous one.
    The capacity of the tracking stack is set to 8, since the Lexer has
    the '.' pattern for identifiers, and a single character can form a
    complete token (in Lexer terms).

commit 96f844e272943906c4373790c69f4236a8799be7
Author: Oleksii Shumeiko
Date:   Mon Aug 9 14:52:08 2021 +0300

    utils: address a compiler warning

commit 9511296dd877a85da574b146ef43689713369d41
Author: Oleksii Shumeiko
Date:   Mon Jul 26 14:34:35 2021 +0300

    http_inspect: check if Normalizer has consumed input

    The inspector logic expects the Normalizer to consume input bytes.
    If it does not, normalization is aborted, since there is no other
    consumer available.
---

diff --git a/src/service_inspectors/http_inspect/http_js_norm.cc b/src/service_inspectors/http_inspect/http_js_norm.cc
index 44f806d4b..e371d5f41 100644
--- a/src/service_inspectors/http_inspect/http_js_norm.cc
+++ b/src/service_inspectors/http_inspect/http_js_norm.cc
@@ -37,11 +37,14 @@ static inline JSTokenizer::JSRet js_normalize(JSNormalizer& ctx, const char* con
     const char* dst_end, const char*& ptr, char*& dst)
 {
     auto ret = ctx.normalize(ptr, end - ptr, dst, dst_end - dst);
     auto next = ctx.get_src_next();
-    HttpModule::increment_peg_counts(PEG_JS_BYTES, next - ptr);
-    ptr = next;
+
+    if (next > ptr)
+        HttpModule::increment_peg_counts(PEG_JS_BYTES, next - ptr);
+    else
+        next = end; // Normalizer has failed, thus aborting the remaining input
+
+    ptr = next;
     dst = ctx.get_dst_next();

     return ret;

diff --git a/src/utils/js_normalizer.cc b/src/utils/js_normalizer.cc
index 3687be6ce..9e6067782 100644
--- a/src/utils/js_normalizer.cc
+++ b/src/utils/js_normalizer.cc
@@ -24,18 +24,30 @@
 #include "js_normalizer.h"

 using namespace snort;
+using namespace std;

 JSNormalizer::JSNormalizer(JSIdentifierCtxBase& js_ident_ctx, size_t norm_depth,
-    uint8_t max_template_nesting)
+    uint8_t max_template_nesting, int tmp_cap_size)
     : depth(norm_depth),
       rem_bytes(norm_depth),
-      unlim(norm_depth == (size_t) - 1),
+      unlim(norm_depth == static_cast<size_t>(-1)),
       src_next(nullptr),
       dst_next(nullptr),
-      tokenizer(in, out, js_ident_ctx, max_template_nesting)
+      tmp_buf(nullptr),
+      tmp_buf_size(0),
+      in(&in_buf),
+      out(&out_buf),
+      tokenizer(in, out, js_ident_ctx, max_template_nesting, tmp_buf, tmp_buf_size, tmp_cap_size)
 {
 }

+JSNormalizer::~JSNormalizer()
+{
+    delete[] tmp_buf;
+    tmp_buf = nullptr;
+    tmp_buf_size = 0;
+}
+
 JSTokenizer::JSRet JSNormalizer::normalize(const char* src, size_t src_len, char* dst, size_t dst_len)
 {
     if (rem_bytes == 0 && !unlim)
@@ -47,13 +59,14 @@ JSTokenizer::JSRet JSNormalizer::normalize(const char* src, size_t src_len, char
     size_t len = unlim ? src_len : src_len < rem_bytes ?
src_len : rem_bytes;

-    in.rdbuf()->pubsetbuf(const_cast<char*>(src), len);
-    out.rdbuf()->pubsetbuf(dst, dst_len);
-    JSTokenizer::JSRet ret = (JSTokenizer::JSRet)tokenizer.yylex();
+    in_buf.pubsetbuf(tmp_buf, tmp_buf_size, const_cast<char*>(src), len);
+    out_buf.pubsetbuf(dst, dst_len);
+
+    JSTokenizer::JSRet ret = static_cast<JSTokenizer::JSRet>(tokenizer.yylex());

     in.clear();
     out.clear();

-    size_t r_bytes = in.tellg();
+    size_t r_bytes = in_buf.glued() ? static_cast<size_t>(in.tellg()) : 0;
     size_t w_bytes = out.tellp();

     if (!unlim)

diff --git a/src/utils/js_normalizer.h b/src/utils/js_normalizer.h
index 84e58bc3f..f0dd58969 100644
--- a/src/utils/js_normalizer.h
+++ b/src/utils/js_normalizer.h
@@ -29,10 +29,85 @@ namespace snort
 {

+class gluebuf : public std::stringbuf
+{
+public:
+    gluebuf() :
+        std::stringbuf(), once(true),
+        src1(nullptr), len1(0), src2(nullptr), len2(0)
+    { }
+
+    std::streambuf* pubsetbuf(char* buf1, std::streamsize buf1_len,
+        char* buf2, std::streamsize buf2_len)
+    {
+        once = !(buf1 && buf1_len);
+
+        if (once)
+        {
+            setbuf(buf2, buf2_len);
+            current_src_len = buf2_len;
+        }
+        else
+        {
+            setbuf(buf1, buf1_len);
+            current_src_len = buf1_len;
+        }
+        src1 = buf1;
+        len1 = buf1_len;
+        src2 = buf2;
+        len2 = buf2_len;
+        return this;
+    }
+
+    bool glued() const
+    {
+        return once;
+    }
+
+protected:
+    virtual std::streampos seekoff(std::streamoff off,
+        std::ios_base::seekdir way, std::ios_base::openmode which) override
+    {
+        if (way != std::ios_base::end)
+            return std::stringbuf::seekoff(off, way, which);
+
+        if (current_src_len + off < 0 and once)
+        {
+            off += current_src_len;
+            once = false;
+            setbuf(src1, len1);
+            current_src_len = len1;
+        }
+
+        return std::stringbuf::seekoff(off, way, which);
+    }
+
+    virtual int underflow() override
+    {
+        if (once)
+            return EOF;
+
+        once = true;
+        setbuf(src2, len2);
+        current_src_len = len2;
+        return sgetc();
+    }
+
+private:
+    bool once;
+    std::streamsize current_src_len;
+    char* src1;
+    std::streamsize len1;
+    char* src2;
+    std::streamsize len2;
+};
+
 class JSNormalizer
 {
 public:
-    JSNormalizer(JSIdentifierCtxBase& js_ident_ctx, size_t depth, uint8_t max_template_nesting);
+    JSNormalizer(JSIdentifierCtxBase& js_ident_ctx, size_t depth,
+        uint8_t max_template_nesting, int tmp_cap_size = JSTOKENIZER_BUF_MAX_SIZE);
+    ~JSNormalizer();

     const char* get_src_next() const
     { return src_next; }
@@ -54,12 +129,16 @@ private:
     const char* src_next;
     char* dst_next;

-    std::stringstream in;
-    std::stringstream out;
+    char* tmp_buf;
+    size_t tmp_buf_size;
+
+    gluebuf in_buf;
+    std::stringbuf out_buf;
+    std::istream in;
+    std::ostream out;

     JSTokenizer tokenizer;
 };
 }

 #endif //JS_NORMALIZER_H
-
diff --git a/src/utils/js_tokenizer.h b/src/utils/js_tokenizer.h
index 3bb13a99f..c6c3bc1f0 100644
--- a/src/utils/js_tokenizer.h
+++ b/src/utils/js_tokenizer.h
@@ -26,6 +26,15 @@

 #include "log/messages.h"

+// The longest pattern has 9 characters "</script>",
+// 8 of them can reside in 1st chunk.
+// Each character in the identifier forms its own group (pattern matching case),
+// i.e. in the current implementation IDENTIFIER has '.' rule.
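+//
+// For example, when a chunk ends with the 8 characters "</script" and the
+// next one starts with ">", the last 8 pushed states cover the unfinished
+// tail; that tail is saved and re-scanned in front of the next chunk, so the
+// 9-character closing tag is still recognized across the chunk boundary.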
+#define JSTOKENIZER_MAX_STATES 8
+
+// To hold potentially long identifiers
+#define JSTOKENIZER_BUF_MAX_SIZE 256
+
 class JSIdentifierCtxBase;

 class JSTokenizer : public yyFlexLexer
@@ -55,7 +64,9 @@ public:
         TEMPLATE_NESTING_OVERFLOW
     };

-    JSTokenizer(std::istream& in, std::ostream& out, JSIdentifierCtxBase& ident_ctx, uint8_t max_template_nesting);
+    JSTokenizer(std::istream& in, std::ostream& out, JSIdentifierCtxBase& ident_ctx,
+        uint8_t max_template_nesting, char*& buf, size_t& buf_size,
+        int cap_size = JSTOKENIZER_BUF_MAX_SIZE);
     ~JSTokenizer() override;

     // returns JSRet
@@ -77,7 +88,10 @@ private:
     void process_closing_bracket();
     JSRet process_subst_open();

-private:
+    void states_push();
+    void states_apply();
+    void states_correct(int);
+
     void* cur_buffer;
     void* tmp_buffer = nullptr;
     std::stringstream tmp;

     std::stack<uint8_t, std::vector<uint8_t>> bracket_depth;
     JSToken token = UNDEFINED;
     JSIdentifierCtxBase& ident_ctx;
+
+    struct
+    {
+        JSToken token = UNDEFINED; // the token before
+        int length = 0;            // current token length
+        int sc = 0;                // current Starting Condition
+    } states[JSTOKENIZER_MAX_STATES];
+    int sp = 0; // points to the top of states
+
+    char*& tmp_buf;
+    size_t& tmp_buf_size;
+    const int tmp_cap_size;
 };

 #endif // JS_TOKENIZER_H

diff --git a/src/utils/js_tokenizer.l b/src/utils/js_tokenizer.l
index 11972f120..d2cb3e04c 100644
--- a/src/utils/js_tokenizer.l
+++ b/src/utils/js_tokenizer.l
@@ -26,18 +26,20 @@

 %option c++

 %{
-    #ifdef HAVE_CONFIG_H
-    #include "config.h"
-    #endif
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif

-    #include "utils/js_identifier_ctx.h"
-    #include "utils/js_tokenizer.h"
+#include "utils/js_tokenizer.h"

-    #include <cstring>
+#include <cstring>

-    #include "utils/util_cstring.h"
+#include "utils/js_identifier_ctx.h"
+#include "utils/util_cstring.h"

-    #define EXEC(f) { auto r = (f); if (r) { BEGIN(regst); return r; } }
+#define YY_USER_ACTION { states_push(); }
+#define EXEC(f) { auto r = (f); if (r) { BEGIN(regst); return r; } }
+#define EEOF(f) { auto r = (f); if (r) { if (r != SCRIPT_CONTINUE) BEGIN(regst); return r; } }
 %}

 /* The following grammar was created based on ECMAScript specification */
@@ -881,12 +883,15 @@ LITERAL_HEX_INTEGER 0x[0-9a-fA-F]*|0X[0-9a-fA-F]*
 LITERAL_DQ_STRING_START \"
 LITERAL_DQ_STRING_END \"
 LITERAL_DQ_STRING_SKIP \\\"
+LITERAL_DQ_STRING_TEXT .
 LITERAL_SQ_STRING_START \'
 LITERAL_SQ_STRING_END \'
 LITERAL_SQ_STRING_SKIP \\\'
+LITERAL_SQ_STRING_TEXT .
 LITERAL_TEMPLATE_START \`
 LITERAL_TEMPLATE_END \`
 LITERAL_TEMPLATE_SUBST_START \$\{
+LITERAL_TEMPLATE_OTHER .
LITERAL_REGEX_START \/[^*\/]
 LITERAL_REGEX_END \/[gimsuy]*
 LITERAL_REGEX_SKIP \\\/
@@ -946,14 +951,14 @@ ALL_UNICODE [\0-\x7F]|[\xC2-\xDF][\x80-\xBF]|(\xE0[\xA0-\xBF]|[\xE1-\xEF][\x8
 {LINE_COMMENT_END3} { BEGIN(regst); return OPENING_TAG; }
 {LINE_COMMENT_END4} { BEGIN(regst); return CLOSING_TAG; }
 {LINE_COMMENT_SKIP} { }
-<<EOF>> { return SCRIPT_CONTINUE; }
+<<EOF>> { states_apply(); return SCRIPT_CONTINUE; }

 {BLOCK_COMMENT_START} { BEGIN(bcomm); }
 {BLOCK_COMMENT_END1} { BEGIN(regst); }
 {BLOCK_COMMENT_END2} { BEGIN(regst); return OPENING_TAG; }
 {BLOCK_COMMENT_END3} { BEGIN(regst); return CLOSING_TAG; }
 {BLOCK_COMMENT_SKIP} { }
-<<EOF>> { return SCRIPT_CONTINUE; }
+<<EOF>> { states_apply(); return SCRIPT_CONTINUE; }

 {LITERAL_DQ_STRING_START} { EXEC(do_spacing(LITERAL)) ECHO; BEGIN(dqstr); }
 {LITERAL_DQ_STRING_END} { ECHO; BEGIN(divop); }
@@ -963,8 +968,8 @@ ALL_UNICODE [\0-\x7F]|[\xC2-\xDF][\x80-\xBF]|(\xE0[\xA0-\xBF]|[\xE1-\xEF][\x8
 \\{CR} { }
 {LINE_TERMINATORS} { BEGIN(regst); return BAD_TOKEN; }
 {LITERAL_DQ_STRING_SKIP} { ECHO; }
-. { ECHO; }
-<<EOF>> { return SCRIPT_CONTINUE; }
+{LITERAL_DQ_STRING_TEXT} { ECHO; }
+<<EOF>> { states_apply(); return SCRIPT_CONTINUE; }

 {LITERAL_SQ_STRING_START} { EXEC(do_spacing(LITERAL)) ECHO; BEGIN(sqstr); }
 {LITERAL_SQ_STRING_END} { ECHO; BEGIN(divop); }
@@ -974,10 +979,10 @@ ALL_UNICODE [\0-\x7F]|[\xC2-\xDF][\x80-\xBF]|(\xE0[\xA0-\xBF]|[\xE1-\xEF][\x8
 \\{CR} { }
 {LINE_TERMINATORS} { BEGIN(regst); return BAD_TOKEN; }
 {LITERAL_SQ_STRING_SKIP} { ECHO; }
-. { ECHO; }
-<<EOF>> { return SCRIPT_CONTINUE; }
+{LITERAL_SQ_STRING_TEXT} { ECHO; }
+<<EOF>> { states_apply(); return SCRIPT_CONTINUE; }

-{OPEN_BRACKET} { if (not bracket_depth.empty()) bracket_depth.top()++; process_punctuator(); }
+{OPEN_BRACKET} { if (!bracket_depth.empty()) bracket_depth.top()++; process_punctuator(); }
 {CLOSE_BRACKET} { process_closing_bracket(); }

 {LITERAL_TEMPLATE_START} { EXEC(do_spacing(LITERAL)) ECHO; BEGIN(tmpll); }
@@ -986,10 +991,10 @@ ALL_UNICODE [\0-\x7F]|[\xC2-\xDF][\x80-\xBF]|(\xE0[\xA0-\xBF]|[\xE1-\xEF][\x8
 {HTML_TAG_SCRIPT_CLOSE} { BEGIN(regst); return CLOSING_TAG; }
 (\\\\)*\\{LITERAL_TEMPLATE_SUBST_START} | /* escaped template substitution */
 (\\\\)*\\{LITERAL_TEMPLATE_END} | /* escaped backtick */
-.
{ ECHO; }
+{LITERAL_TEMPLATE_OTHER} { ECHO; }
 <<EOF>> { return SCRIPT_CONTINUE; }

-{LITERAL_REGEX_START} { EXEC(do_spacing(LITERAL)) yyout << '/'; yyless(1); BEGIN(regex); }
+{LITERAL_REGEX_START} { EXEC(do_spacing(LITERAL)) yyout << '/'; states_correct(1); yyless(1); BEGIN(regex); }
 {LITERAL_REGEX_END} { ECHO; BEGIN(divop); }
 {HTML_TAG_SCRIPT_CLOSE} { BEGIN(regst); return CLOSING_TAG; }
 {LITERAL_REGEX_SKIP} { ECHO; }
@@ -997,7 +1002,7 @@ ALL_UNICODE [\0-\x7F]|[\xC2-\xDF][\x80-\xBF]|(\xE0[\xA0-\xBF]|[\xE1-\xEF][\x8
 \\{CR} |
 {LINE_TERMINATORS} { BEGIN(regst); return BAD_TOKEN; }
 [^<{LF}{CR}{LS}{PS}\\\/]+ { ECHO; }
-<<EOF>> { return SCRIPT_CONTINUE; }
+<<EOF>> { states_apply(); return SCRIPT_CONTINUE; }

 {DIV_OPERATOR} |
 {DIV_ASSIGNMENT_OPERATOR} { ECHO; token = PUNCTUATOR; BEGIN(INITIAL); }
@@ -1013,7 +1018,7 @@ ALL_UNICODE [\0-\x7F]|[\xC2-\xDF][\x80-\xBF]|(\xE0[\xA0-\xBF]|[\xE1-\xEF][\x8
 {IDENTIFIER} { if (unescape(YYText())) { EXEC(do_spacing(IDENTIFIER)) EXEC(do_identifier_substitution(YYText())) } BEGIN(divop); }
 .|{ALL_UNICODE} { ECHO; token = UNDEFINED; BEGIN(INITIAL); }
-<<EOF>> { EXEC(eval_eof()) }
+<<EOF>> { EEOF(eval_eof()) }

 %%

@@ -1023,18 +1028,18 @@
 static std::string unicode_to_utf8(const unsigned int code)
 {
     std::string res;

-    if ( code <= 0x7f )
+    if (code <= 0x7f)
         res += (char)code;
-    else if ( code <= 0x7ff )
+    else if (code <= 0x7ff)
     {
-        res += ( 0xc0 | (code >> 6) );
-        res += ( 0x80 | (code & 0x3f) );
+        res += 0xc0 | (code >> 6);
+        res += 0x80 | (code & 0x3f);
     }
-    else if ( code <= 0xffff )
+    else if (code <= 0xffff)
     {
-        res += ( 0xe0 | (code >> 12) );
-        res += ( 0x80 | ((code >> 6) & 0x3f) );
-        res += ( 0x80 | (code & 0x3f) );
+        res += 0xe0 | (code >> 12);
+        res += 0x80 | ((code >> 6) & 0x3f);
+        res += 0x80 | (code & 0x3f);
     }

     return res;
@@ -1052,17 +1057,17 @@ static std::string unescape_unicode(const char* lexeme)
     short digits_left = 4;
     std::string unicode_str;

-    for ( const auto& ch : lex )
+    for (const auto& ch : lex)
     {
-        if ( ch == '\\' )
+        if (ch == '\\')
         {
             is_unescape = true;
             continue;
         }

-        if ( is_unescape )
+        if (is_unescape)
         {
-            if ( ch == 'u' )
+            if (ch == 'u')
             {
                 is_unicode = true;
                 continue;
@@ -1070,10 +1075,10 @@ static std::string unescape_unicode(const char* lexeme)
             is_unescape = false;
         }

-        if ( is_unicode )
+        if (is_unicode)
         {
             unicode_str += ch;
-            if ( !(--digits_left) )
+            if (!(--digits_left))
             {
                 const unsigned int unicode = std::stoi(unicode_str, nullptr, 16);
                 res += unicode_to_utf8(unicode);
@@ -1093,11 +1098,15 @@

 // JSTokenizer members

-JSTokenizer::JSTokenizer(std::istream& in, std::ostream& out, JSIdentifierCtxBase& ident_ctx,
-    uint8_t max_template_nesting)
+JSTokenizer::JSTokenizer(std::istream& in, std::ostream& out,
+    JSIdentifierCtxBase& mapper, uint8_t max_template_nesting,
+    char*& buf, size_t& buf_size, int cap_size)
     : yyFlexLexer(in, out),
       max_template_nesting(max_template_nesting),
-      ident_ctx(ident_ctx)
+      ident_ctx(mapper),
+      tmp_buf(buf),
+      tmp_buf_size(buf_size),
+      tmp_cap_size(cap_size)
 {
     BEGIN(regst);
 }
@@ -1105,6 +1114,9 @@ JSTokenizer::JSTokenizer(std::istream& in, std::ostream& out, JSIdentifierCtxBas
 JSTokenizer::~JSTokenizer()
 {
     yy_delete_buffer((YY_BUFFER_STATE)tmp_buffer);
+    delete[] tmp_buf;
+    tmp_buf = nullptr;
+    tmp_buf_size = 0;
 }

 void JSTokenizer::switch_to_temporal(const std::string& data)
@@ -1130,13 +1142,15 @@ JSTokenizer::JSRet JSTokenizer::eval_eof()
 {
     // If the temporal scan buffer reaches EOF, cleanup and
     // continue with the initial one
-    if ( tmp_buffer )
+    if (tmp_buffer)
     {
switch_to_initial();
         return EOS;
     }

     // Normal termination
+    states_apply();
+
     return SCRIPT_CONTINUE;
 }

@@ -1203,7 +1217,7 @@ JSTokenizer::JSRet JSTokenizer::do_identifier_substitution(const char* lexeme)

 bool JSTokenizer::unescape(const char* lexeme)
 {
-    if ( strstr(lexeme, "\\u") )
+    if (strstr(lexeme, "\\u"))
     {
         const std::string unescaped_lex = unescape_unicode(lexeme);
         switch_to_temporal(unescaped_lex);
@@ -1222,9 +1236,9 @@

 void JSTokenizer::process_closing_bracket()
 {
-    if ( not bracket_depth.empty() )
+    if (!bracket_depth.empty())
     {
-        if ( bracket_depth.top() )
+        if (bracket_depth.top())
             bracket_depth.top()--;
         else
         {
@@ -1239,11 +1253,63 @@ void JSTokenizer::process_closing_bracket()

 JSTokenizer::JSRet JSTokenizer::process_subst_open()
 {
-    if ( bracket_depth.size() >= max_template_nesting )
+    if (bracket_depth.size() >= max_template_nesting)
         return TEMPLATE_NESTING_OVERFLOW;

     bracket_depth.push(0);
     token = PUNCTUATOR;
     ECHO;
-    BEGIN(divop);
+    BEGIN(divop);
     return EOS;
-}
\ No newline at end of file
+}
+
+void JSTokenizer::states_push()
+{
+    assert(yyleng != 0);
+
+    sp++;
+    sp %= JSTOKENIZER_MAX_STATES;
+    auto& state = states[sp];
+
+    state.token = token;
+    state.length = yyleng;
+    state.sc = yy_start;
+}
+
+void JSTokenizer::states_correct(int take_off)
+{
+    auto& state = states[sp];
+    state.length -= yyleng - take_off;
+}
+
+void JSTokenizer::states_apply()
+{
+    int tail_size = 0;
+
+    for (int i = JSTOKENIZER_MAX_STATES; i > 0 && tail_size < tmp_cap_size; --i)
+    {
+        auto idx = sp + i;
+        idx %= JSTOKENIZER_MAX_STATES;
+        auto& state = states[idx];
+
+        if (state.length == 0)
+            continue;
+
+        token = state.token;
+        yy_start = state.sc;
+        tail_size += state.length;
+        tail_size = tail_size < tmp_cap_size ? tail_size : tmp_cap_size;
+    }
+
+    for (int i = 0; i < JSTOKENIZER_MAX_STATES; ++i)
+        states[i].length = 0;
+
+    char* buf = new char[tail_size];
+
+    yyin.seekg(-tail_size, std::ios_base::end);
+    yyin.clear();
+    yyin.read(buf, tail_size);
+
+    delete[] tmp_buf;
+    tmp_buf = buf;
+    tmp_buf_size = tail_size;
+}

diff --git a/src/utils/test/js_normalizer_test.cc b/src/utils/test/js_normalizer_test.cc
index 79fbb9278..00cfa6b16 100644
--- a/src/utils/test/js_normalizer_test.cc
+++ b/src/utils/test/js_normalizer_test.cc
@@ -43,13 +43,14 @@ public:
     const char* substitute(const char* identifier) override
     { return identifier; }
     void reset() override {}
-    size_t size() const override {}
+    size_t size() const override { return 0; }
 };

 using namespace snort;

 #define DEPTH 65535
 #define MAX_TEMPLATE_NESTNIG 4
+#define DST_SIZE 512

 #define NORMALIZE(src, expected) \
     char dst[sizeof(expected)]; \
@@ -80,6 +81,121 @@ using namespace snort;
         len = norm.get_dst_next() - dst; \
     }

+#define DO(src, slen, dst, dlen) \
+    { \
+        auto ret = norm.normalize(src, slen, dst, dlen); \
+        CHECK(ret == JSTokenizer::SCRIPT_CONTINUE); \
+        auto nsrc = norm.get_src_next(); \
+        auto ndst = norm.get_dst_next(); \
+        REQUIRE(nsrc - src == slen); \
+        REQUIRE(ndst - dst == dlen); \
+    }
+
+#define TRY(src, slen, dst, dlen, rexp) \
+    { \
+        auto ret = norm.normalize(src, slen, dst, dlen); \
+        CHECK(ret == rexp); \
+        auto ndst = norm.get_dst_next(); \
+        REQUIRE(ndst - dst == dlen); \
+    }
+
+#define CLOSE() \
+    { \
+        const char end[] = "</script>"; \
+        char dst[DST_SIZE]; \
+        auto ret = norm.normalize(end, sizeof(end) - 1, dst, sizeof(dst) - 1); \
+        CHECK(ret == JSTokenizer::SCRIPT_ENDED); \
+    }
+
+#define NORMALIZE_2(src1, src2, exp1, exp2) \
+    { \
+        char dst1[sizeof(exp1)]; \
+        char dst2[sizeof(exp2)]; \
+        \
+        
JSIdentifierCtxTest ident_ctx; \ + JSNormalizer norm(ident_ctx, DEPTH, MAX_TEMPLATE_NESTNIG); \ + \ + DO(src1, sizeof(src1) - 1, dst1, sizeof(dst1) - 1); \ + CHECK(!memcmp(exp1, dst1, sizeof(exp1) - 1)); \ + \ + DO(src2, sizeof(src2) - 1, dst2, sizeof(dst2) - 1); \ + CHECK(!memcmp(exp2, dst2, sizeof(exp2) - 1)); \ + \ + CLOSE(); \ + } + +#define NORMALIZE_3(src1, src2, src3, exp1, exp2, exp3) \ + { \ + char dst1[sizeof(exp1)]; \ + char dst2[sizeof(exp2)]; \ + char dst3[sizeof(exp3)]; \ + \ + JSIdentifierCtxTest ident_ctx; \ + JSNormalizer norm(ident_ctx, DEPTH, MAX_TEMPLATE_NESTNIG); \ + \ + DO(src1, sizeof(src1) - 1, dst1, sizeof(dst1) - 1); \ + CHECK(!memcmp(exp1, dst1, sizeof(exp1) - 1)); \ + \ + DO(src2, sizeof(src2) - 1, dst2, sizeof(dst2) - 1); \ + CHECK(!memcmp(exp2, dst2, sizeof(exp2) - 1)); \ + \ + DO(src3, sizeof(src3) - 1, dst3, sizeof(dst3) - 1); \ + CHECK(!memcmp(exp3, dst3, sizeof(exp3) - 1)); \ + \ + CLOSE(); \ + } + +#define NORM_BAD_2(src1, src2, exp1, exp2, code) \ + { \ + char dst1[sizeof(exp1)]; \ + char dst2[sizeof(exp2)]; \ + \ + JSIdentifierCtxTest ident_ctx; \ + JSNormalizer norm(ident_ctx, DEPTH, MAX_TEMPLATE_NESTNIG); \ + \ + DO(src1, sizeof(src1) - 1, dst1, sizeof(dst1) - 1); \ + CHECK(!memcmp(exp1, dst1, sizeof(exp1) - 1)); \ + \ + TRY(src2, sizeof(src2) - 1, dst2, sizeof(dst2) - 1, code); \ + CHECK(!memcmp(exp2, dst2, sizeof(exp2) - 1)); \ + } + +#define NORM_BAD_3(src1, src2, src3, exp1, exp2, exp3, code) \ + { \ + char dst1[sizeof(exp1)]; \ + char dst2[sizeof(exp2)]; \ + char dst3[sizeof(exp3)]; \ + \ + JSIdentifierCtxTest ident_ctx; \ + JSNormalizer norm(ident_ctx, DEPTH, MAX_TEMPLATE_NESTNIG); \ + \ + DO(src1, sizeof(src1) - 1, dst1, sizeof(dst1) - 1); \ + CHECK(!memcmp(exp1, dst1, sizeof(exp1) - 1)); \ + \ + DO(src2, sizeof(src2) - 1, dst2, sizeof(dst2) - 1); \ + CHECK(!memcmp(exp2, dst2, sizeof(exp2) - 1)); \ + \ + TRY(src3, sizeof(src3) - 1, dst3, sizeof(dst3) - 1, code); \ + CHECK(!memcmp(exp3, dst3, sizeof(exp3) - 1)); \ + } + +#define NORM_LIMITED(limit, src1, src2, exp1, exp2) \ + { \ + char dst1[sizeof(exp1)]; \ + char dst2[sizeof(exp2)]; \ + \ + JSIdentifierCtxTest ident_ctx; \ + JSNormalizer norm(ident_ctx, DEPTH, MAX_TEMPLATE_NESTNIG, limit); \ + \ + DO(src1, sizeof(src1) - 1, dst1, sizeof(dst1) - 1); \ + CHECK(!memcmp(exp1, dst1, sizeof(exp1) - 1)); \ + \ + DO(src2, sizeof(src2) - 1, dst2, sizeof(dst2) - 1); \ + CHECK(!memcmp(exp2, dst2, sizeof(exp2) - 1)); \ + \ + CLOSE(); \ + } + // ClamAV test cases static const char clamav_buf0[] = "function foo(a, b) {\n" @@ -1292,3 +1408,457 @@ TEST_CASE("nested script tags", "[JSNormalizer]") } } +TEST_CASE("split between tokens", "[JSNormalizer]") +{ + SECTION("operator string") + { + const char dat1[] = "var s = "; + const char dat2[] = "'string';"; + const char exp1[] = "var s="; + const char exp2[] = "var s='string';"; + + NORMALIZE_2(dat1, dat2, exp1, exp2); + } + SECTION("operator number") + { + const char dat1[] = "a = 5 +"; + const char dat2[] = "b + c;"; + const char exp1[] = "a=5+"; + const char exp2[] = "a=5+b+c;"; + + NORMALIZE_2(dat1, dat2, exp1, exp2); + } + SECTION("comment function") + { + const char dat1[] = "// no comments\n"; + const char dat2[] = "foo(bar, baz);"; + const char exp1[] = ""; + const char exp2[] = "foo(bar,baz);"; + + NORMALIZE_2(dat1, dat2, exp1, exp2); + } + SECTION("operator identifier") + { + const char dat1[] = "var "; + const char dat2[] = "a = "; + const char dat3[] = "b ;"; + const char exp1[] = "var"; + const char exp2[] = "var a="; + const char exp3[] = "var 
a=b;"; + + NORMALIZE_3(dat1, dat2, dat3, exp1, exp2, exp3); + } +} + +TEST_CASE("split in comments", "[JSNormalizer]") +{ + SECTION("/ /") + { + const char dat1[] = "/"; + const char dat2[] = "/comment\n"; + const char exp1[] = "/"; + const char exp2[] = ""; + + NORMALIZE_2(dat1, dat2, exp1, exp2); + } + SECTION("/ / msg") + { + const char dat1[] = "//"; + const char dat2[] = "comment\n"; + const char exp1[] = ""; + const char exp2[] = ""; + + NORMALIZE_2(dat1, dat2, exp1, exp2); + } + SECTION("/ / LF") + { + const char dat1[] = "//comment"; + const char dat2[] = "\n"; + const char exp1[] = ""; + const char exp2[] = ""; + + NORMALIZE_2(dat1, dat2, exp1, exp2); + } + + SECTION("/ *") + { + const char dat1[] = "/"; + const char dat2[] = "* comment */"; + const char exp1[] = "/"; + const char exp2[] = ""; + + NORMALIZE_2(dat1, dat2, exp1, exp2); + } + SECTION("/ * msg") + { + const char dat1[] = "/* t"; + const char dat2[] = "ext */"; + const char exp1[] = ""; + const char exp2[] = ""; + + NORMALIZE_2(dat1, dat2, exp1, exp2); + } + SECTION("* /") + { + const char dat1[] = "/* comment *"; + const char dat2[] = "/"; + const char exp1[] = ""; + const char exp2[] = ""; + + NORMALIZE_2(dat1, dat2, exp1, exp2); + } + SECTION("/ * msg * /") + { + const char dat1[] = "/"; + const char dat2[] = "* comment *"; + const char dat3[] = "/"; + const char exp1[] = "/"; + const char exp2[] = ""; + const char exp3[] = ""; + + NORMALIZE_3(dat1, dat2, dat3, exp1, exp2, exp3); + } + + SECTION("< !--") + { + const char dat1[] = "<"; + const char dat2[] = "!-- comment\n"; + const char exp1[] = "<"; + const char exp2[] = ""; + + NORMALIZE_2(dat1, dat2, exp1, exp2); + } + SECTION("") + { + const char dat1[] = "<"; + const char dat2[] = "/script>"; + const char exp1[] = "<"; + const char exp2[] = ""; + + NORM_BAD_2(dat1, dat2, exp1, exp2, JSTokenizer::SCRIPT_ENDED); + } + SECTION("") + { + const char dat1[] = "") + { + const char dat1[] = ""; + const char exp1[] = "'") + { + const char dat1[] = "var str =\"