From: Mike Stepanek (mstepane)
Date: Fri, 8 Apr 2022 13:06:17 +0000 (+0000)
Subject: Pull request #3366: An improvement for JS regex literals.
X-Git-Tag: 3.1.28.0~24
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=29740ffa83b441c6dc9c9f5e45bb6b54605cfa59;p=thirdparty%2Fsnort3.git

Pull request #3366: An improvement for JS regex literals.

Merge in SNORT/snort3 from ~OSHUMEIK/snort3:js_regex to master

Squashed commit of the following:

commit 4079a93365262390d6d77144b5ce8b2c29f4d8af
Author: dkyrylov
Date:   Sun Jul 25 16:13:30 2021 +0300

    utils: track groups and escaped symbols in JavaScript regex literals
---

diff --git a/src/utils/js_tokenizer.h b/src/utils/js_tokenizer.h
index c263a1a2c..15f70c900 100644
--- a/src/utils/js_tokenizer.h
+++ b/src/utils/js_tokenizer.h
@@ -255,6 +255,9 @@ private:
     JSRet literal_sq_string_start();
     JSRet literal_template_start();
     JSRet literal_regex_start();
+    JSRet literal_regex_end();
+    JSRet literal_regex_g_open();
+    JSRet literal_regex_g_close();
     void div_assignment_operator();
     JSRet open_brace();
     JSRet close_brace();
@@ -321,6 +324,7 @@ private:
     size_t bytes_read;
     size_t tmp_bytes_read;
     bool ext_script;
+    std::stack<char> regex_stack;

     struct
     {
diff --git a/src/utils/js_tokenizer.l b/src/utils/js_tokenizer.l
index d881ce2c7..cdfde3cd0 100644
--- a/src/utils/js_tokenizer.l
+++ b/src/utils/js_tokenizer.l
@@ -985,11 +985,11 @@ LITERAL_INTEGER [0-9]*
 LITERAL_HEX_INTEGER 0x[0-9a-fA-F]*|0X[0-9a-fA-F]*
 LITERAL_DQ_STRING_START \"
 LITERAL_DQ_STRING_END \"
-LITERAL_DQ_STRING_SKIP \\\"
+LITERAL_DQ_STRING_SKIP \\\"|\\\\
 LITERAL_DQ_STRING_TEXT [^\"\\%\xA\xD\{0x10}(\xE2\x80\xA8)(\xE2\x80\xA9)("<"+(?i:script))("<"+(?i:\/script>))]{1,32}
 LITERAL_SQ_STRING_START \'
 LITERAL_SQ_STRING_END \'
-LITERAL_SQ_STRING_SKIP \\\'
+LITERAL_SQ_STRING_SKIP \\\'|\\\\
 LITERAL_SQ_STRING_TEXT [^\'\\%\xA\xD\{0x10}(\xE2\x80\xA8)(\xE2\x80\xA9)("<"+(?i:script))("<"+(?i:\/script>))]{1,32}
 LITERAL_TEMPLATE_START \`
 LITERAL_TEMPLATE_END \`
@@ -997,7 +997,10 @@ LITERAL_TEMPLATE_SUBST_START \$\{
 LITERAL_TEMPLATE_OTHER [^\\%\`(\$\{)("<"+(?i:\/script>))]{1,32}
 LITERAL_REGEX_START \/[^*\/]
 LITERAL_REGEX_END \/[gimsuy]*
-LITERAL_REGEX_SKIP \\\/
+LITERAL_REGEX_SKIP \\\/|\\\\|\\\(|\\\)|\\\[|\\\]|\\\{|\\\}
+LITERAL_REGEX_TEXT [^<{LF}{CR}{LS}{PS}\\\/\(\[\{\)\]\}]+
+LITERAL_REGEX_G_OPEN \(|\[|\{
+LITERAL_REGEX_G_CLOSE \)|\]|\}
 /* extra literals */
 /* according to https://ecma-international.org/ecma-262/5.1/#sec-4.3 */
 LITERAL_UNDEFINED undefined
@@ -1149,13 +1152,15 @@ ALL_UNICODE [\0-\x7F]|[\xC2-\xDF][\x80-\xBF]|(\xE0[\xA0-\xBF]|[\xE1-\xEF][\x8
 {URL_ESCAPE_SEQUENCE} { set_encoding(IS_PERCENT); escaped_url_sequence(); }

 {LITERAL_REGEX_START} { EXEC(literal_regex_start()) }
-{LITERAL_REGEX_END} { ECHO; BEGIN(divop); }
+{LITERAL_REGEX_END} { EXEC(literal_regex_end()) }
 {HTML_TAG_SCRIPT_CLOSE} { BEGIN(regst); RETURN(CLOSING_TAG) }
-{LITERAL_REGEX_SKIP} { ECHO; }
 \\{LF} |
 \\{CR} |
 {LINE_TERMINATORS} { BEGIN(regst); RETURN(BAD_TOKEN) }
-[^<{LF}{CR}{LS}{PS}\\\/]+ { ECHO; }
+{LITERAL_REGEX_SKIP} { ECHO; }
+{LITERAL_REGEX_TEXT} { ECHO; }
+{LITERAL_REGEX_G_OPEN} { EXEC(literal_regex_g_open()) }
+{LITERAL_REGEX_G_CLOSE} { EXEC(literal_regex_g_close()) }
 {UNICODE_ESCAPE_SEQUENCE} |
 {HEX_ESCAPE_SEQUENCE} { escaped_unicode(); }
 <<EOF>> { RETURN(SCRIPT_CONTINUE) }
@@ -1666,6 +1671,7 @@ void JSTokenizer::states_reset()

     scope_stack = std::stack();
     scope_stack.emplace(GLOBAL);
+    BEGIN(regst);
 }

@@ -2276,6 +2282,56 @@ JSTokenizer::JSRet JSTokenizer::literal_regex_start()
     yyless(1);
     BEGIN(regex);
     set_ident_norm(true);
+    regex_stack = std::stack<char>();
+    return EOS;
+}
+
+JSTokenizer::JSRet JSTokenizer::literal_regex_end()
+{
+    if (regex_stack.empty())
+    {
+        ECHO;
+        BEGIN(divop);
+        return EOS;
+    }
+    else
+    {
+        ECHO;
+        return EOS;
+    }
+}
+
+JSTokenizer::JSRet JSTokenizer::literal_regex_g_open()
+{
+    regex_stack.push(yytext[0]);
+    ECHO;
+    return EOS;
+}
+
+JSTokenizer::JSRet JSTokenizer::literal_regex_g_close()
+{
+    if (regex_stack.empty())
+    {
+        debug_logf(5, http_trace, TRACE_JS_PROC, nullptr,
+            "no group to close, .. %c\n", yytext[0]);
+        return BAD_TOKEN;
+    }
+
+    char c = yytext[0];
+    char o = regex_stack.top();
+    char d = o == '(' ? 1 : 2;
+
+    regex_stack.pop();
+
+    if (o + d != c)
+    {
+        debug_logf(5, http_trace, TRACE_JS_PROC, nullptr,
+            "closing symbol mismatch, %c .. %c\n", o, c);
+        return BAD_TOKEN;
+    }
+
+    ECHO;
+    return EOS;
+}
diff --git a/src/utils/test/js_normalizer_test.cc b/src/utils/test/js_normalizer_test.cc
index 717a96769..b516f9b27 100644
--- a/src/utils/test/js_normalizer_test.cc
+++ b/src/utils/test/js_normalizer_test.cc
@@ -649,6 +649,36 @@ static const char all_patterns_buf6[] =
 static const char all_patterns_expected6[] =
     "tag ` template\n ${a+b} template`";

+static const char all_patterns_buf7[] =
+    "/Day: \\d{2}\\/Month: \\d{2}\\/Year: \\d{4}/;"
+    "/<\\d{3}>\\//g";
+
+static const char all_patterns_expected7[] =
+    "/Day: \\d{2}\\/Month: \\d{2}\\/Year: \\d{4}/;"
+    "/<\\d{3}>\\//g";
+
+static const char all_patterns_buf8[] =
+    "a = \" \\\" \\\\\\\" \\\\\";"
+    "b = ` \\` \\\\\\` \\\\`;"
+    "c = ' \\' \\\\\\' \\\\';"
+    "d = / \\/ \\\\\\/ \\\\/;"
+    "a + b;";
+
+static const char all_patterns_expected8[] =
+    "a=\" \\\" \\\\\\\" \\\\\";"
+    "b=` \\` \\\\\\` \\\\`;"
+    "c=' \\' \\\\\\' \\\\';"
+    "d=/ \\/ \\\\\\/ \\\\/;"
+    "a+b;";
+
+static const char all_patterns_buf9[] =
+    "var r = /^(?:(?:https?|mailto|ftp):|[^:/?#]*(?:[/?#]|$))/i;"
+    "new Lb(function(a){return /^[^:]*([/?#]|$)/.test(a)})";
+
+static const char all_patterns_expected9[] =
+    "var r=/^(?:(?:https?|mailto|ftp):|[^:/?#]*(?:[/?#]|$))/i;"
+    "new Lb(function(a){return /^[^:]*([/?#]|$)/.test(a)})";
+
 TEST_CASE("all patterns", "[JSNormalizer]")
 {
     SECTION("whitespaces and special characters")
@@ -739,6 +769,21 @@ TEST_CASE("all patterns", "[JSNormalizer]")
         NORMALIZE(all_patterns_buf6);
         VALIDATE(all_patterns_buf6, all_patterns_expected6);
     }
+    SECTION("regex literal with slashes")
+    {
+        NORMALIZE(all_patterns_buf7);
+        VALIDATE(all_patterns_buf7, all_patterns_expected7);
+    }
+    SECTION("multiple escaped slashes")
+    {
+        NORMALIZE(all_patterns_buf8);
+        VALIDATE(all_patterns_buf8, all_patterns_expected8);
+    }
+    SECTION("slashes and braces")
+    {
+        NORMALIZE(all_patterns_buf9);
+        VALIDATE(all_patterns_buf9, all_patterns_expected9);
+    }
 }

 // Test vectors for different syntax cases
@@ -3318,6 +3363,197 @@ TEST_CASE("scope tail handling", "[JSNormalizer]")
     }
 }

+TEST_CASE("scope regex groups", "[JSNormalizer]")
+{
+    SECTION("parentheses")
+    {
+        const char dat1[] = "a=/()/;";
+        const char dat2[] = "b=/()()()/;";
+        const char dat3[] = "c=/((()))/;";
+        const char exp1[] = "a=/()/;";
+        const char exp2[] = "b=/()()()/;";
+        const char exp3[] = "c=/((()))/;";
+
+        NORMALIZE_1(dat1, exp1);
+        NORMALIZE_1(dat2, exp2);
+        NORMALIZE_1(dat3, exp3);
+    }
+    SECTION("curly braces")
+    {
+        const char dat1[] = "a=/{}/;";
+        const char dat2[] = "b=/{}{}{}/;";
+        const char dat3[] = "c=/{{{}}}/;";
+        const char exp1[] = "a=/{}/;";
+        const char exp2[] = "b=/{}{}{}/;";
+        const char exp3[] = "c=/{{{}}}/;";
+
+        NORMALIZE_1(dat1, exp1);
+        NORMALIZE_1(dat2, exp2);
+        NORMALIZE_1(dat3, exp3);
+    }
+    SECTION("square brackets")
+    {
+        const char dat1[] = "a=/[]/;";
+        const char dat2[] = "b=/[][][]/;";
+        const char dat3[] = "c=/[[[]]]/;";
+        const char exp1[] = "a=/[]/;";
+        const char exp2[] = "b=/[][][]/;";
+        const char exp3[] = "c=/[[[]]]/;";
+
+        NORMALIZE_1(dat1, exp1);
+        NORMALIZE_1(dat2, exp2);
+        NORMALIZE_1(dat3, exp3);
+    }
+    SECTION("mix of brackets")
+    {
+        const char dat1[] = "a=/(){}[]/";
+        const char dat2[] = "b=/({})[]/";
+        const char dat3[] = "c=/(){[]}/";
+        const char exp1[] = "a=/(){}[]/";
+        const char exp2[] = "b=/({})[]/";
+        const char exp3[] = "c=/(){[]}/";
+
+        NORMALIZE_1(dat1, exp1);
+        NORMALIZE_1(dat2, exp2);
+        NORMALIZE_1(dat3, exp3);
+    }
+    SECTION("parentheses - wrong closing symbol")
+    {
+        const char dat1[] = "/({[ (} ]})/";
+        const char dat2[] = "/({[ (] ]})/";
+        const char exp1[] = "/({[ (";
+        const char exp2[] = "/({[ (";
+
+        NORM_BAD_1(dat1, exp1, JSTokenizer::BAD_TOKEN);
+        NORM_BAD_1(dat2, exp2, JSTokenizer::BAD_TOKEN);
+    }
+    SECTION("curly braces - wrong closing symbol")
+    {
+        const char dat1[] = "/({[ {) ]})/";
+        const char dat2[] = "/({[ {] ]})/";
+        const char exp1[] = "/({[ {";
+        const char exp2[] = "/({[ {";
+
+        NORM_BAD_1(dat1, exp1, JSTokenizer::BAD_TOKEN);
+        NORM_BAD_1(dat2, exp2, JSTokenizer::BAD_TOKEN);
+    }
+    SECTION("square brackets - wrong closing symbol")
+    {
+        const char dat1[] = "/([{ [) }])/";
+        const char dat2[] = "/([{ [} }])/";
+        const char exp1[] = "/([{ [";
+        const char exp2[] = "/([{ [";
+
+        NORM_BAD_1(dat1, exp1, JSTokenizer::BAD_TOKEN);
+        NORM_BAD_1(dat2, exp2, JSTokenizer::BAD_TOKEN);
+    }
+    SECTION("parentheses - mismatch")
+    {
+        const char dat1[] = "/)/";
+        const char dat2[] = "/())/";
+        const char dat3[] = "/({[ ()) ]})/";
+        const char exp1[] = "/";
+        const char exp2[] = "/()";
+        const char exp3[] = "/({[ ()";
+
+        NORM_BAD_1(dat1, exp1, JSTokenizer::BAD_TOKEN);
+        NORM_BAD_1(dat2, exp2, JSTokenizer::BAD_TOKEN);
+        NORM_BAD_1(dat3, exp3, JSTokenizer::BAD_TOKEN);
+    }
+    SECTION("curly braces - mismatch")
+    {
+        const char dat1[] = "/}/";
+        const char dat2[] = "/{}}/";
+        const char dat3[] = "/({[ {}} ]})/";
+        const char exp1[] = "/";
+        const char exp2[] = "/{}";
+        const char exp3[] = "/({[ {}";
+
+        NORM_BAD_1(dat1, exp1, JSTokenizer::BAD_TOKEN);
+        NORM_BAD_1(dat2, exp2, JSTokenizer::BAD_TOKEN);
+        NORM_BAD_1(dat3, exp3, JSTokenizer::BAD_TOKEN);
+    }
+    SECTION("square brackets - mismatch")
+    {
+        const char dat1[] = "/]/";
+        const char dat2[] = "/[]]/";
+        const char dat3[] = "/([{ []] }])/";
+        const char exp1[] = "/";
+        const char exp2[] = "/[]";
+        const char exp3[] = "/([{ []";
+
+        NORM_BAD_1(dat1, exp1, JSTokenizer::BAD_TOKEN);
+        NORM_BAD_1(dat2, exp2, JSTokenizer::BAD_TOKEN);
+        NORM_BAD_1(dat3, exp3, JSTokenizer::BAD_TOKEN);
+    }
+    SECTION("parentheses - continuation")
+    {
+        const char dat1[] = "/((";
+        const char dat2[] = "))/";
+        const char exp1[] = "/((";
+        const char exp2[] = "))/";
+        const char exp[] = "/(())/";
+
+        NORMALIZE_2(dat1, dat2, exp1, exp2);
+        NORM_COMBINED_2(dat1, dat2, exp);
+    }
+    SECTION("curly braces - continuation")
+    {
+        const char dat1[] = "/{{";
+        const char dat2[] = "}}/";
+        const char exp1[] = "/{{";
+        const char exp2[] = "}}/";
+        const char exp[] = "/{{}}/";
+
+        NORMALIZE_2(dat1, dat2, exp1, exp2);
+        NORM_COMBINED_2(dat1, dat2, exp);
+    }
+    SECTION("square brackets - continuation")
+    {
+        const char dat1[] = "/[[";
+        const char dat2[] = "]]/";
+        const char exp1[] = "/[[";
+        const char exp2[] = "]]/";
+        const char exp[] = "/[[]]/";
+
+        NORMALIZE_2(dat1, dat2, exp1, exp2);
+        NORM_COMBINED_2(dat1, dat2, exp);
+    }
+    SECTION("parentheses - mismatch in continuation")
+    {
+        const char dat1[] = "/(";
+        const char dat2[] = "))/";
+        const char exp1[] = "/(";
+        const char exp2[] = ")";
+        const char exp[] = "/()";
+
+        NORM_BAD_2(dat1, dat2, exp1, exp2, JSTokenizer::BAD_TOKEN);
+        NORM_COMBINED_BAD_2(dat1, dat2, exp, JSTokenizer::BAD_TOKEN);
+    }
+    SECTION("curly braces - mismatch in continuation")
+    {
+        const char dat1[] = "/{";
+        const char dat2[] = "}}/";
+        const char exp1[] = "/{";
+        const char exp2[] = "}";
+        const char exp[] = "/{}";
+
+        NORM_BAD_2(dat1, dat2, exp1, exp2, JSTokenizer::BAD_TOKEN);
+        NORM_COMBINED_BAD_2(dat1, dat2, exp, JSTokenizer::BAD_TOKEN);
+    }
+    SECTION("square brackets - mismatch in continuation")
+    {
+        const char dat1[] = "/[";
+        const char dat2[] = "]]/";
+        const char exp1[] = "/[";
+        const char exp2[] = "]";
+        const char exp[] = "/[]";
+
+        NORM_BAD_2(dat1, dat2, exp1, exp2, JSTokenizer::BAD_TOKEN);
+        NORM_COMBINED_BAD_2(dat1, dat2, exp, JSTokenizer::BAD_TOKEN);
+    }
+}
+
 TEST_CASE("ignored identifiers", "[JSNormalizer]")
 {
     // 'console' 'eval' 'document' are in the ignore list
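
Note (not part of the patch): the group tracking added to literal_regex_g_open()/literal_regex_g_close() is a plain stack check over the regex literal's (, [, { openers, with the +1/+2 offset standing in for an explicit pairing table (in ASCII, ')' is '(' + 1, while ']' and '}' are '[' + 2 and '{' + 2). Below is a minimal standalone C++ sketch of that check, outside the flex tokenizer; the helper name regex_groups_balanced is illustrative and does not exist in the patch.

    #include <iostream>
    #include <stack>
    #include <string>

    // Returns true when every (, [, { opened inside `pattern` is closed by the
    // matching symbol, skipping escaped characters -- the same rule the rules
    // LITERAL_REGEX_SKIP / LITERAL_REGEX_G_OPEN / LITERAL_REGEX_G_CLOSE enforce.
    static bool regex_groups_balanced(const std::string& pattern)
    {
        std::stack<char> groups;

        for (size_t i = 0; i < pattern.size(); ++i)
        {
            char c = pattern[i];

            if (c == '\\')              // escaped symbol never opens or closes a group
            {
                ++i;
                continue;
            }
            if (c == '(' || c == '[' || c == '{')
            {
                groups.push(c);
                continue;
            }
            if (c == ')' || c == ']' || c == '}')
            {
                if (groups.empty())     // "no group to close" -> BAD_TOKEN in the patch
                    return false;

                char o = groups.top();
                groups.pop();

                char d = o == '(' ? 1 : 2;  // ')'=='('+1, ']'=='['+2, '}'=='{'+2
                if (o + d != c)             // "closing symbol mismatch" -> BAD_TOKEN
                    return false;
            }
        }
        return groups.empty();
    }

    int main()
    {
        std::cout << regex_groups_balanced("^(?:(?:https?|mailto|ftp):|[^:/?#]*(?:[/?#]|$))")  // prints 1
                  << regex_groups_balanced("({[ (} ]})") << "\n";                               // prints 0
        return 0;
    }

In the tokenizer itself the stack is a member (regex_stack) reset in literal_regex_start(), so an opener left unclosed at the end of a chunk is not an error by itself; that is what allows the "continuation" tests above to split a literal such as /(( and ))/ across two scans without raising BAD_TOKEN.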