LITERAL_HEX_INTEGER 0x[0-9a-fA-F]*|0X[0-9a-fA-F]*
LITERAL_DQ_STRING_START \"
LITERAL_DQ_STRING_END \"
-LITERAL_DQ_STRING_SKIP \\\"
+LITERAL_DQ_STRING_SKIP \\\"|\\\\
LITERAL_DQ_STRING_TEXT [^\"\\%\xA\xD\{0x10}(\xE2\x80\xA8)(\xE2\x80\xA9)("<"+(?i:script))("<"+(?i:\/script>))]{1,32}
LITERAL_SQ_STRING_START \'
LITERAL_SQ_STRING_END \'
-LITERAL_SQ_STRING_SKIP \\\'
+LITERAL_SQ_STRING_SKIP \\\'|\\\\
LITERAL_SQ_STRING_TEXT [^\'\\%\xA\xD\{0x10}(\xE2\x80\xA8)(\xE2\x80\xA9)("<"+(?i:script))("<"+(?i:\/script>))]{1,32}
LITERAL_TEMPLATE_START \`
LITERAL_TEMPLATE_END \`
LITERAL_TEMPLATE_OTHER [^\\%\`(\$\{)("<"+(?i:\/script>))]{1,32}
/* Regex literal patterns.                                                   */
/* START: a slash followed by one char that is neither a star nor a slash.   */
/* END: a slash followed by any run of the flag letters g, i, m, s, u, y.    */
/* SKIP: a backslash-escaped slash, backslash or bracket.                    */
/* TEXT: a run free of the less-than sign, LF, CR, LS, PS, backslash, slash  */
/* and all six bracket characters.                                           */
/* G_OPEN and G_CLOSE: one opening or one closing bracket, respectively.     */
LITERAL_REGEX_START \/[^*\/]
LITERAL_REGEX_END \/[gimsuy]*
-LITERAL_REGEX_SKIP \\\/
+LITERAL_REGEX_SKIP \\\/|\\\\|\\\(|\\\)|\\\[|\\\]|\\\{|\\\}
+LITERAL_REGEX_TEXT [^<{LF}{CR}{LS}{PS}\\\/\(\[\{\)\]\}]+
+LITERAL_REGEX_G_OPEN \(|\[|\{
+LITERAL_REGEX_G_CLOSE \)|\]|\}
/* extra literals */
/* according to https://ecma-international.org/ecma-262/5.1/#sec-4.3 */
LITERAL_UNDEFINED undefined
<unesc_tmpll>{URL_ESCAPE_SEQUENCE} { set_encoding(IS_PERCENT); escaped_url_sequence(); }
<regst>{LITERAL_REGEX_START} { EXEC(literal_regex_start()) }
-<regex>{LITERAL_REGEX_END} { ECHO; BEGIN(divop); }
+<regex>{LITERAL_REGEX_END} { EXEC(literal_regex_end()) /* the handler leaves the regex state only while no bracket group is open */ }
<regex>{HTML_TAG_SCRIPT_CLOSE} { BEGIN(regst); RETURN(CLOSING_TAG) }
-<regex>{LITERAL_REGEX_SKIP} { ECHO; }
<regex>\\{LF} |
<regex>\\{CR} |
<regex>{LINE_TERMINATORS} { BEGIN(regst); RETURN(BAD_TOKEN) /* shared by the two backslash + LF/CR rules chained with | */ }
-<regex>[^<{LF}{CR}{LS}{PS}\\\/]+ { ECHO; }
+<regex>{LITERAL_REGEX_SKIP} { ECHO; /* escaped slash, backslash or bracket: copied as is, group stack untouched */ }
+<regex>{LITERAL_REGEX_TEXT} { ECHO; /* plain run without brackets, slashes or backslashes */ }
+<regex>{LITERAL_REGEX_G_OPEN} { EXEC(literal_regex_g_open()) /* pushes the opener on regex_stack */ }
+<regex>{LITERAL_REGEX_G_CLOSE} { EXEC(literal_regex_g_close()) /* BAD_TOKEN unless it pairs with the top of regex_stack */ }
<regex>{UNICODE_ESCAPE_SEQUENCE} |
<regex>{HEX_ESCAPE_SEQUENCE} { escaped_unicode(); }
<regex><<EOF>> { RETURN(SCRIPT_CONTINUE) /* input ran out while still inside the regex state */ }
scope_stack = std::stack<Scope>();
scope_stack.emplace(GLOBAL);
+
BEGIN(regst);
}
yyless(1);
BEGIN(regex);
set_ident_norm(true);
+ regex_stack = std::stack<char>();
+ return EOS;
+}
+
+// Handles a slash (with optional flag letters) matched while scanning a
+// regex literal. The matched text is always echoed. The scanner switches to
+// the divop state only when regex_stack holds no pending opener; otherwise
+// it stays in the current state. Always returns EOS.
+JSTokenizer::JSRet JSTokenizer::literal_regex_end()
+{
+    // Sample the stack first: an empty stack means the slash terminates the literal.
+    const bool no_open_group = regex_stack.empty();
+
+    ECHO;
+
+    if (no_open_group)
+    {
+        BEGIN(divop);
+    }
+
+    return EOS;
+}
+
+// Handles an opening bracket inside a regex literal: remembers the bracket
+// character on regex_stack so literal_regex_g_close() can compare it with
+// the closer, then echoes the matched text. Always returns EOS.
+JSTokenizer::JSRet JSTokenizer::literal_regex_g_open()
+{
+    const char opener = yytext[0];
+
+    regex_stack.push(opener);
+    ECHO;
+
+    return EOS;
+}
+
+// Handles a closing bracket inside a regex literal.
+// Returns BAD_TOKEN (after a debug trace) when regex_stack is empty or when
+// the closer does not pair with the opener on top of the stack; the opener
+// is popped in both the matching and the mismatching case. On a match the
+// closer is echoed and EOS is returned.
+JSTokenizer::JSRet JSTokenizer::literal_regex_g_close()
+{
+    if (regex_stack.empty())
+    {
+        debug_logf(5, http_trace, TRACE_JS_PROC, nullptr,
+            "no group to close, .. %c\n", yytext[0]);
+        return BAD_TOKEN;
+    }
+
+    const char closer = yytext[0];
+    const char opener = regex_stack.top();
+    regex_stack.pop();
+
+    // ASCII layout: ')' is one code point after '(', while ']' and '}' are
+    // two code points after '[' and '{'.
+    const int expected_closer = opener + (opener == '(' ? 1 : 2);
+    const bool paired = (expected_closer == closer);
+
+    if (!paired)
+    {
+        debug_logf(5, http_trace, TRACE_JS_PROC, nullptr,
+            "closing symbol mismatch, %c .. %c\n", opener, closer);
+        return BAD_TOKEN;
+    }
+
+    ECHO;
+
return EOS;
}
static const char all_patterns_expected6[] =
"tag ` template\n ${a+b} template`";
+static const char all_patterns_buf7[] = // regex literals with escaped slashes and {n} quantifiers
+ "/Day: \\d{2}\\/Month: \\d{2}\\/Year: \\d{4}/;"
+ "/<\\d{3}>\\//g";
+
+static const char all_patterns_expected7[] = // same text as all_patterns_buf7
+ "/Day: \\d{2}\\/Month: \\d{2}\\/Year: \\d{4}/;"
+ "/<\\d{3}>\\//g";
+
+static const char all_patterns_buf8[] = // escaped delimiters and escaped backslashes in double-quoted, template, single-quoted and regex literals
+ "a = \" \\\" \\\\\\\" \\\\\";"
+ "b = ` \\` \\\\\\` \\\\`;"
+ "c = ' \\' \\\\\\' \\\\';"
+ "d = / \\/ \\\\\\/ \\\\/;"
+ "a + b;";
+
+static const char all_patterns_expected8[] = // literals unchanged; only the spaces around = and + are gone
+ "a=\" \\\" \\\\\\\" \\\\\";"
+ "b=` \\` \\\\\\` \\\\`;"
+ "c=' \\' \\\\\\' \\\\';"
+ "d=/ \\/ \\\\\\/ \\\\/;"
+ "a+b;";
+
+static const char all_patterns_buf9[] = // unescaped slashes inside character classes and groups of a regex literal
+ "var r = /^(?:(?:https?|mailto|ftp):|[^:/?#]*(?:[/?#]|$))/i;"
+ "new Lb(function(a){return /^[^:]*([/?#]|$)/.test(a)})";
+
+static const char all_patterns_expected9[] = // regex literals unchanged; only the spaces around = are gone
+ "var r=/^(?:(?:https?|mailto|ftp):|[^:/?#]*(?:[/?#]|$))/i;"
+ "new Lb(function(a){return /^[^:]*([/?#]|$)/.test(a)})";
+
TEST_CASE("all patterns", "[JSNormalizer]")
{
SECTION("whitespaces and special characters")
NORMALIZE(all_patterns_buf6);
VALIDATE(all_patterns_buf6, all_patterns_expected6);
}
+ SECTION("regex literal with slashes")
+ {
+ NORMALIZE(all_patterns_buf7);
+ VALIDATE(all_patterns_buf7, all_patterns_expected7);
+ }
+ SECTION("multiple escaped slashes")
+ {
+ NORMALIZE(all_patterns_buf8);
+ VALIDATE(all_patterns_buf8, all_patterns_expected8);
+ }
+ SECTION("slashes and braces")
+ {
+ NORMALIZE(all_patterns_buf9);
+ VALIDATE(all_patterns_buf9, all_patterns_expected9);
+ }
}
// Test vectors for different syntax cases
}
}
+TEST_CASE("scope regex groups", "[JSNormalizer]") // bracket pairing inside regex literals
+{
+ SECTION("parentheses") // balanced groups: single, sequential, nested; expected text equals input
+ {
+ const char dat1[] = "a=/()/;";
+ const char dat2[] = "b=/()()()/;";
+ const char dat3[] = "c=/((()))/;";
+ const char exp1[] = "a=/()/;";
+ const char exp2[] = "b=/()()()/;";
+ const char exp3[] = "c=/((()))/;";
+
+ NORMALIZE_1(dat1, exp1);
+ NORMALIZE_1(dat2, exp2);
+ NORMALIZE_1(dat3, exp3);
+ }
+ SECTION("curly braces") // balanced groups: single, sequential, nested; expected text equals input
+ {
+ const char dat1[] = "a=/{}/;";
+ const char dat2[] = "b=/{}{}{}/;";
+ const char dat3[] = "c=/{{{}}}/;";
+ const char exp1[] = "a=/{}/;";
+ const char exp2[] = "b=/{}{}{}/;";
+ const char exp3[] = "c=/{{{}}}/;";
+
+ NORMALIZE_1(dat1, exp1);
+ NORMALIZE_1(dat2, exp2);
+ NORMALIZE_1(dat3, exp3);
+ }
+ SECTION("square brackets") // balanced groups: single, sequential, nested; expected text equals input
+ {
+ const char dat1[] = "a=/[]/;";
+ const char dat2[] = "b=/[][][]/;";
+ const char dat3[] = "c=/[[[]]]/;";
+ const char exp1[] = "a=/[]/;";
+ const char exp2[] = "b=/[][][]/;";
+ const char exp3[] = "c=/[[[]]]/;";
+
+ NORMALIZE_1(dat1, exp1);
+ NORMALIZE_1(dat2, exp2);
+ NORMALIZE_1(dat3, exp3);
+ }
+ SECTION("mix of brackets") // different bracket kinds side by side and nested; expected text equals input
+ {
+ const char dat1[] = "a=/(){}[]/";
+ const char dat2[] = "b=/({})[]/";
+ const char dat3[] = "c=/(){[]}/";
+ const char exp1[] = "a=/(){}[]/";
+ const char exp2[] = "b=/({})[]/";
+ const char exp3[] = "c=/(){[]}/";
+
+ NORMALIZE_1(dat1, exp1);
+ NORMALIZE_1(dat2, exp2);
+ NORMALIZE_1(dat3, exp3);
+ }
+ SECTION("parentheses - wrong closing symbol") // innermost ( closed by } or ]; expected text stops right after that opener
+ {
+ const char dat1[] = "/({[ (} ]})/";
+ const char dat2[] = "/({[ (] ]})/";
+ const char exp1[] = "/({[ (";
+ const char exp2[] = "/({[ (";
+
+ NORM_BAD_1(dat1, exp1, JSTokenizer::BAD_TOKEN);
+ NORM_BAD_1(dat2, exp2, JSTokenizer::BAD_TOKEN);
+ }
+ SECTION("curly braces - wrong closing symbol") // innermost { closed by ) or ]; expected text stops right after that opener
+ {
+ const char dat1[] = "/({[ {) ]})/";
+ const char dat2[] = "/({[ {] ]})/";
+ const char exp1[] = "/({[ {";
+ const char exp2[] = "/({[ {";
+
+ NORM_BAD_1(dat1, exp1, JSTokenizer::BAD_TOKEN);
+ NORM_BAD_1(dat2, exp2, JSTokenizer::BAD_TOKEN);
+ }
+ SECTION("square brackets - wrong closing symbol") // innermost [ closed by ) or }; expected text stops right after that opener
+ {
+ const char dat1[] = "/([{ [) }])/";
+ const char dat2[] = "/([{ [} }])/";
+ const char exp1[] = "/([{ [";
+ const char exp2[] = "/([{ [";
+
+ NORM_BAD_1(dat1, exp1, JSTokenizer::BAD_TOKEN);
+ NORM_BAD_1(dat2, exp2, JSTokenizer::BAD_TOKEN);
+ }
+ SECTION("parentheses - mismatch") // surplus ) with no ( left to pair; expected text stops just before it
+ {
+ const char dat1[] = "/)/";
+ const char dat2[] = "/())/";
+ const char dat3[] = "/({[ ()) ]})/";
+ const char exp1[] = "/";
+ const char exp2[] = "/()";
+ const char exp3[] = "/({[ ()";
+
+ NORM_BAD_1(dat1, exp1, JSTokenizer::BAD_TOKEN);
+ NORM_BAD_1(dat2, exp2, JSTokenizer::BAD_TOKEN);
+ NORM_BAD_1(dat3, exp3, JSTokenizer::BAD_TOKEN);
+ }
+ SECTION("curly braces - mismatch") // surplus } with no { left to pair; expected text stops just before it
+ {
+ const char dat1[] = "/}/";
+ const char dat2[] = "/{}}/";
+ const char dat3[] = "/({[ {}} ]})/";
+ const char exp1[] = "/";
+ const char exp2[] = "/{}";
+ const char exp3[] = "/({[ {}";
+
+ NORM_BAD_1(dat1, exp1, JSTokenizer::BAD_TOKEN);
+ NORM_BAD_1(dat2, exp2, JSTokenizer::BAD_TOKEN);
+ NORM_BAD_1(dat3, exp3, JSTokenizer::BAD_TOKEN);
+ }
+ SECTION("square brackets - mismatch") // surplus ] with no [ left to pair; expected text stops just before it
+ {
+ const char dat1[] = "/]/";
+ const char dat2[] = "/[]]/";
+ const char dat3[] = "/([{ []] }])/";
+ const char exp1[] = "/";
+ const char exp2[] = "/[]";
+ const char exp3[] = "/([{ []";
+
+ NORM_BAD_1(dat1, exp1, JSTokenizer::BAD_TOKEN);
+ NORM_BAD_1(dat2, exp2, JSTokenizer::BAD_TOKEN);
+ NORM_BAD_1(dat3, exp3, JSTokenizer::BAD_TOKEN);
+ }
+ SECTION("parentheses - continuation") // literal split over two inputs: groups opened in the first are closed in the second
+ {
+ const char dat1[] = "/((";
+ const char dat2[] = "))/";
+ const char exp1[] = "/((";
+ const char exp2[] = "))/";
+ const char exp[] = "/(())/";
+
+ NORMALIZE_2(dat1, dat2, exp1, exp2);
+ NORM_COMBINED_2(dat1, dat2, exp);
+ }
+ SECTION("curly braces - continuation") // literal split over two inputs: groups opened in the first are closed in the second
+ {
+ const char dat1[] = "/{{";
+ const char dat2[] = "}}/";
+ const char exp1[] = "/{{";
+ const char exp2[] = "}}/";
+ const char exp[] = "/{{}}/";
+
+ NORMALIZE_2(dat1, dat2, exp1, exp2);
+ NORM_COMBINED_2(dat1, dat2, exp);
+ }
+ SECTION("square brackets - continuation") // literal split over two inputs: groups opened in the first are closed in the second
+ {
+ const char dat1[] = "/[[";
+ const char dat2[] = "]]/";
+ const char exp1[] = "/[[";
+ const char exp2[] = "]]/";
+ const char exp[] = "/[[]]/";
+
+ NORMALIZE_2(dat1, dat2, exp1, exp2);
+ NORM_COMBINED_2(dat1, dat2, exp);
+ }
+ SECTION("parentheses - mismatch in continuation") // second input carries one ) too many; expected text stops after the first )
+ {
+ const char dat1[] = "/(";
+ const char dat2[] = "))/";
+ const char exp1[] = "/(";
+ const char exp2[] = ")";
+ const char exp[] = "/()";
+
+ NORM_BAD_2(dat1, dat2, exp1, exp2, JSTokenizer::BAD_TOKEN);
+ NORM_COMBINED_BAD_2(dat1, dat2, exp, JSTokenizer::BAD_TOKEN);
+ }
+ SECTION("curly braces - mismatch in continuation") // second input carries one } too many; expected text stops after the first }
+ {
+ const char dat1[] = "/{";
+ const char dat2[] = "}}/";
+ const char exp1[] = "/{";
+ const char exp2[] = "}";
+ const char exp[] = "/{}";
+
+ NORM_BAD_2(dat1, dat2, exp1, exp2, JSTokenizer::BAD_TOKEN);
+ NORM_COMBINED_BAD_2(dat1, dat2, exp, JSTokenizer::BAD_TOKEN);
+ }
+ SECTION("square brackets - mismatch in continuation") // second input carries one ] too many; expected text stops after the first ]
+ {
+ const char dat1[] = "/[";
+ const char dat2[] = "]]/";
+ const char exp1[] = "/[";
+ const char exp2[] = "]";
+ const char exp[] = "/[]";
+
+ NORM_BAD_2(dat1, dat2, exp1, exp2, JSTokenizer::BAD_TOKEN);
+ NORM_COMBINED_BAD_2(dat1, dat2, exp, JSTokenizer::BAD_TOKEN);
+ }
+}
+
TEST_CASE("ignored identifiers", "[JSNormalizer]")
{
// 'console' 'eval' 'document' are in the ignore list