From: Mike Stepanek (mstepane) Date: Mon, 11 Oct 2021 10:54:31 +0000 (+0000) Subject: Merge pull request #3089 in SNORT/snort3 from ~DKYRYLOV/snort3:js_norm_asi to master X-Git-Tag: 3.1.15.0~12 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=0ebba6cc35d38549221373ec0fc392b2baa48808;p=thirdparty%2Fsnort3.git Merge pull request #3089 in SNORT/snort3 from ~DKYRYLOV/snort3:js_norm_asi to master Squashed commit of the following: commit feeedee58a22544fb4788a2646af52c65f1dc8cf Author: dkyrylov Date: Mon Sep 20 14:48:53 2021 +0300 http_inspect: add automatic semicolon insertion --- diff --git a/src/utils/js_tokenizer.h b/src/utils/js_tokenizer.h index d7f73d054..4b9c0fe2b 100644 --- a/src/utils/js_tokenizer.h +++ b/src/utils/js_tokenizer.h @@ -55,6 +55,24 @@ private: DIRECTIVE }; + enum ASIGroup + { + ASI_OTHER = 0, + ASI_GROUP_1, // { + ASI_GROUP_2, // } + ASI_GROUP_3, // [ ( + ASI_GROUP_4, // ] + ASI_GROUP_5, // ) + ASI_GROUP_6, // + - + ASI_GROUP_7, // this true false null identifier literal + //IDENTIFIER + LITERAL + KEYWORD_LITERAL + ASI_GROUP_8, // ++ -- + ASI_GROUP_9, // continue break return debugger // same as KEYWORD_BA + ASI_GROUP_10, // var function new delete void typeof if do while for with + // switch throw try ~ + + ASI_GROUP_MAX + }; + public: enum JSRet { @@ -87,6 +105,7 @@ private: JSRet eval_eof(); JSRet do_spacing(JSToken cur_token); JSRet do_operator_spacing(JSToken cur_token); + void do_semicolon_insertion(ASIGroup current); JSRet do_identifier_substitution(const char* lexeme); bool unescape(const char* lexeme); void process_punctuator(); @@ -103,6 +122,7 @@ private: uint8_t max_template_nesting; std::stack> bracket_depth; JSToken token = UNDEFINED; + ASIGroup previous_group = ASI_OTHER; JSIdentifierCtxBase& ident_ctx; struct @@ -116,6 +136,21 @@ private: char*& tmp_buf; size_t& tmp_buf_size; const int tmp_cap_size; + bool newline_found = false; + constexpr static bool insert_semicolon[ASI_GROUP_MAX][ASI_GROUP_MAX] + { + 
{false, false, false, false, false, false, false, false, false, false, false,}, + {false, false, false, false, false, false, false, false, false, false, false,}, + {false, false, false, false, false, false, false, false, false, false, false,}, + {false, false, false, false, false, false, false, false, false, false, false,}, + {false, true, false, false, false, false, false, true, true, true, true, }, + {false, false, false, false, false, false, false, true, true, true, true, }, + {false, false, false, false, false, false, false, false, false, false, false,}, + {false, true, false, false, false, false, false, true, true, true, true, }, + {false, true, false, true, false, false, false, true, true, true, true, }, + {false, true, false, true, false, false, true, true, true, true, true, }, + {false, false, false, false, false, false, false, false, false, false, false,} + }; }; #endif // JS_TOKENIZER_H diff --git a/src/utils/js_tokenizer.l b/src/utils/js_tokenizer.l index ea8a350d5..81d8f30fe 100644 --- a/src/utils/js_tokenizer.l +++ b/src/utils/js_tokenizer.l @@ -50,69 +50,12 @@ #define EXEC(f) { auto r = (f); if (r) { BEGIN(regst); return r; } } #define EEOF(f) { auto r = (f); if (r) { if (r != SCRIPT_CONTINUE) BEGIN(regst); return r; } } +constexpr bool JSTokenizer::insert_semicolon[ASI_GROUP_MAX][ASI_GROUP_MAX]; %} /* The following grammar was created based on ECMAScript specification */ /* source https://ecma-international.org/ecma-262/5.1/ */ -/* whitespaces */ -/* according to https://ecma-international.org/ecma-262/5.1/#sec-7.2 */ -TAB \x9 -VT \xB -FF \xC -SP \x20 -NBSP \xA0 -BOM \xEF\xBB\xBF -WHITESPACES {TAB}|{VT}|{FF}|{SP}|{NBSP}|{BOM} - -/* single char escape sequences */ -/* according to https://ecma-international.org/ecma-262/5.1/#sec-7.8.4 */ -NUL \x0 -BS \x8 -HT \x9 -CHAR_ESCAPE_SEQUENCES {NUL}|{BS}|{HT} - -/* line terminators */ -/* according to https://ecma-international.org/ecma-262/5.1/#sec-7.3 */ -LF \xA -CR \xD -LS \xE2\x80\xA8 -PS \xE2\x80\xA9 
-LINE_TERMINATORS {LF}|{CR}|{LS}|{PS} - -/* comments */ -/* according to https://ecma-international.org/ecma-262/5.1/#sec-7.4 */ -LINE_COMMENT_START "//" -LINE_COMMENT_END1 [^<\xA\xD]*\xA -LINE_COMMENT_END2 [^<\xA\xD]*\xD -LINE_COMMENT_END3 [^<\xA\xD]*"<"+(?i:script) -LINE_COMMENT_END4 [^<\xA\xD]*"<"+(?i:\/script>) -LINE_COMMENT_SKIP [^<\xA\xD]*"<"? -BLOCK_COMMENT_START "/*" -BLOCK_COMMENT_END1 [^<*]*"*"+"/" -BLOCK_COMMENT_END2 [^<*]*"<"+(?i:script) -BLOCK_COMMENT_END3 [^<*]*"<"+(?i:\/script>) -BLOCK_COMMENT_SKIP [^<*]*[<*]? - -/* directives */ -/* according to https://ecma-international.org/ecma-262/5.1/#sec-14.1 */ -USE_STRICT_DIRECTIVE "\"use strict\""|"\'use strict\'" -USE_STRICT_DIRECTIVE_SC "\"use strict\"";*|"\'use strict\'";* - -/* keywords */ -/* according to https://ecma-international.org/ecma-262/5.1/#sec-7.6.1.1 */ -KEYWORD break|case|debugger|in|import|protected|do|else|function|try|implements|static|instanceof|new|this|class|let|typeof|var|with|enum|private|catch|continue|default|extends|public|finally|for|if|super|yield|return|switch|throw|const|interface|void|while|delete|export|package - -/* punctuators */ -/* according to https://ecma-international.org/ecma-262/5.1/#sec-7.7 */ -CLOSING_BRACES ")"|"]" -OPEN_BRACKET "{" -CLOSE_BRACKET "}" -PUNCTUATOR "("|"["|">="|"=="|"!="|"==="|"!=="|"."|";"|","|"<"|">"|"<="|"<<"|">>"|">>>"|"&"|"|"|"^"|"!"|"&&"|"||"|"?"|":"|"="|"+="|"-="|"*="|"%="|"<<="|">>="|">>>="|"&="|"|="|"^="|"~" -OPERATOR "+"|"-"|"*"|"++"|"--"|"%" -DIV_OPERATOR "/" -DIV_ASSIGNMENT_OPERATOR "/=" - /* Unicode letter ranges (categories Lu, Ll, Lt, Lm, Lo and Nl) */ /* generated with unicode_range_generator.l */ /* UTF-8 ranges generated with https://lists.gnu.org/archive/html/help-flex/2005-01/msg00043.html */ @@ -878,6 +821,76 @@ UNICODE_ZWJ \xE2\x80\x8D /* according to https://ecma-international.org/ecma-262/5.1/#sec-7.8.4 (escape sequence) */ UNICODE_ESCAPE_SEQUENCE \\u[0-9a-fA-F]{4} +/* whitespaces */ +/* according to 
https://ecma-international.org/ecma-262/5.1/#sec-7.2 */ +TAB \x9 +VT \xB +FF \xC +SP \x20 +NBSP \xA0 +BOM \xEF\xBB\xBF +WHITESPACES {TAB}|{VT}|{FF}|{SP}|{NBSP}|{BOM} + +/* single char escape sequences */ +/* according to https://ecma-international.org/ecma-262/5.1/#sec-7.8.4 */ +NUL \x0 +BS \x8 +HT \x9 +CHAR_ESCAPE_SEQUENCES {NUL}|{BS}|{HT} + +/* line terminators */ +/* according to https://ecma-international.org/ecma-262/5.1/#sec-7.3 */ +LF \xA +CR \xD +LS \xE2\x80\xA8 +PS \xE2\x80\xA9 +LINE_TERMINATORS {LF}|{CR}|{LS}|{PS} + +/* comments */ +/* according to https://ecma-international.org/ecma-262/5.1/#sec-7.4 */ +LINE_COMMENT_START "//" +LINE_COMMENT_END1 [^<\xA\xD]*\xA +LINE_COMMENT_END2 [^<\xA\xD]*\xD +LINE_COMMENT_END3 [^<\xA\xD]*"<"+(?i:script) +LINE_COMMENT_END4 [^<\xA\xD]*"<"+(?i:\/script>) +LINE_COMMENT_SKIP [^<\xA\xD]*"<"? +BLOCK_COMMENT_START "/*" +BLOCK_COMMENT_END1 [^<*\xA\xD]*"*"+"/" +BLOCK_COMMENT_END2 [^<*\xA\xD]*"<"+(?i:script) +BLOCK_COMMENT_END3 [^<*\xA\xD]*"<"+(?i:\/script>) +BLOCK_COMMENT_LINE1 [^<*\xA\xD]*\xA +BLOCK_COMMENT_LINE2 [^<*\xA\xD]*\xD +BLOCK_COMMENT_SKIP [^<*\xA\xD]*[<*]? 
+ +/* directives */ +/* according to https://ecma-international.org/ecma-262/5.1/#sec-14.1 */ +USE_STRICT_DIRECTIVE "\"use strict\""|"\'use strict\'" +USE_STRICT_DIRECTIVE_SC "\"use strict\"";*|"\'use strict\'";* + +/* keywords */ +/* according to https://ecma-international.org/ecma-262/5.1/#sec-7.6.1.1 */ +/* keywords that can appear at the beginning or the end of Statement*/ +KEYWORD_BA break|continue|debugger|return +/* keywords that can appear at the beginning of Statement*/ +KEYWORD_B delete|do|for|function|if|new|switch|throw|try|typeof|var|void|while|with +/* keywords that cannot appear at the beginning or the end of Statement*/ +KEYWORD_OTHER case|catch|class|const|default|else|enum|export|extends|finally|implements|import|in|instanceof|interface|let|package|private|protected|public|static|super|yield + +/* punctuators */ +/* according to https://ecma-international.org/ecma-262/5.1/#sec-7.7 */ +CLOSING_PAREN ")" +CLOSING_BRACE "]" +OPEN_BRACKET "{" +CLOSE_BRACKET "}" +PUNCTUATOR_PREFIX "~"|"!"
+OPEN_PAREN_BRACE "("|"[" +PUNCTUATOR ">="|"=="|"!="|"==="|"!=="|"."|";"|","|"<"|">"|"<="|"<<"|">>"|">>>"|"&"|"|"|"^"|"&&"|"||"|"?"|":"|"="|"+="|"-="|"*="|"%="|"<<="|">>="|">>>="|"&="|"|="|"^=" +OPERATOR_PREFIX "+"|"-" +OPERATOR_INCR_DECR "--"|"++" +OPERATOR "*"|"%" +DIV_OPERATOR "/" +DIV_ASSIGNMENT_OPERATOR "/=" + /* identifiers */ /* according to https://ecma-international.org/ecma-262/5.1/#sec-7.6 */ IDENTIFIER_START [_$]|({UNICODE_LETTER})|{UNICODE_ESCAPE_SEQUENCE} @@ -887,6 +900,7 @@ IDENTIFIER ({IDENTIFIER_START}{IDENTIFIER_PART})* /* literals */ /* according to https://ecma-international.org/ecma-262/5.1/#sec-7.8 */ LITERAL_NULL null +LITERAL_THIS this LITERAL_BOOLEAN true|false LITERAL_DECIMAL [.]?[0-9]+[\.]?[0-9]*[eE]?[0-9]* LITERAL_HEX_INTEGER 0x[0-9a-fA-F]*|0X[0-9a-fA-F]* @@ -910,7 +924,7 @@ LITERAL_REGEX_SKIP \\\/ LITERAL_UNDEFINED undefined LITERAL_INFINITY Infinity|\xE2\x88\x9E LITERAL_NAN NaN -LITERAL {LITERAL_NULL}|{LITERAL_BOOLEAN}|{LITERAL_DECIMAL}|{LITERAL_HEX_INTEGER}|{LITERAL_UNDEFINED}|{LITERAL_INFINITY}|{LITERAL_NAN} +LITERAL {LITERAL_NULL}|{LITERAL_THIS}|{LITERAL_BOOLEAN}|{LITERAL_DECIMAL}|{LITERAL_HEX_INTEGER}|{LITERAL_UNDEFINED}|{LITERAL_INFINITY}|{LITERAL_NAN} HTML_COMMENT_OPEN "<"+"!--" HTML_TAG_SCRIPT_OPEN "<"+(?i:script) @@ -949,15 +963,15 @@ ALL_UNICODE [\0-\x7F]|[\xC2-\xDF][\x80-\xBF]|(\xE0[\xA0-\xBF]|[\xE1-\xEF][\x8 %% {WHITESPACES} { } {CHAR_ESCAPE_SEQUENCES} { } -{LINE_TERMINATORS} { BEGIN(regst); } +{LINE_TERMINATORS} { BEGIN(regst); newline_found = true; } {HTML_TAG_SCRIPT_OPEN} { BEGIN(regst); return OPENING_TAG; } {HTML_TAG_SCRIPT_CLOSE} { BEGIN(regst); return SCRIPT_ENDED; } {HTML_COMMENT_OPEN} { BEGIN(lcomm); } {LINE_COMMENT_START} { BEGIN(lcomm); } -{LINE_COMMENT_END1} { BEGIN(regst); } -{LINE_COMMENT_END2} { BEGIN(regst); } +{LINE_COMMENT_END1} { BEGIN(regst); newline_found = true; } +{LINE_COMMENT_END2} { BEGIN(regst); newline_found = true; } {LINE_COMMENT_END3} { BEGIN(regst); return OPENING_TAG; } {LINE_COMMENT_END4} { 
BEGIN(regst); return CLOSING_TAG; } {LINE_COMMENT_SKIP} { } @@ -967,10 +981,12 @@ ALL_UNICODE [\0-\x7F]|[\xC2-\xDF][\x80-\xBF]|(\xE0[\xA0-\xBF]|[\xE1-\xEF][\x8 {BLOCK_COMMENT_END1} { BEGIN(regst); } {BLOCK_COMMENT_END2} { BEGIN(regst); return OPENING_TAG; } {BLOCK_COMMENT_END3} { BEGIN(regst); return CLOSING_TAG; } +{BLOCK_COMMENT_LINE1} | +{BLOCK_COMMENT_LINE2} { newline_found = true;} {BLOCK_COMMENT_SKIP} { } <> { states_apply(); return SCRIPT_CONTINUE; } - {LITERAL_DQ_STRING_START} { EXEC(do_spacing(LITERAL)) ECHO; BEGIN(dqstr); } + {LITERAL_DQ_STRING_START} { do_semicolon_insertion(ASI_GROUP_7); EXEC(do_spacing(LITERAL)) ECHO; BEGIN(dqstr); } {LITERAL_DQ_STRING_END} { ECHO; BEGIN(divop); } {HTML_TAG_SCRIPT_CLOSE} { BEGIN(regst); return CLOSING_TAG; } \\{CR}{LF} { } @@ -981,7 +997,7 @@ ALL_UNICODE [\0-\x7F]|[\xC2-\xDF][\x80-\xBF]|(\xE0[\xA0-\xBF]|[\xE1-\xEF][\x8 {LITERAL_DQ_STRING_TEXT} { ECHO; } <> { states_apply(); return SCRIPT_CONTINUE; } - {LITERAL_SQ_STRING_START} { EXEC(do_spacing(LITERAL)) ECHO; BEGIN(sqstr); } + {LITERAL_SQ_STRING_START} { do_semicolon_insertion(ASI_GROUP_7); EXEC(do_spacing(LITERAL)) ECHO; BEGIN(sqstr); } {LITERAL_SQ_STRING_END} { ECHO; BEGIN(divop); } {HTML_TAG_SCRIPT_CLOSE} { BEGIN(regst); return CLOSING_TAG; } \\{CR}{LF} { } @@ -992,10 +1008,10 @@ ALL_UNICODE [\0-\x7F]|[\xC2-\xDF][\x80-\xBF]|(\xE0[\xA0-\xBF]|[\xE1-\xEF][\x8 {LITERAL_SQ_STRING_TEXT} { ECHO; } <> { states_apply(); return SCRIPT_CONTINUE; } -{OPEN_BRACKET} { if (!bracket_depth.empty()) bracket_depth.top()++; process_punctuator(); } -{CLOSE_BRACKET} { process_closing_bracket(); } +{OPEN_BRACKET} { do_semicolon_insertion(ASI_GROUP_1); if (!bracket_depth.empty()) bracket_depth.top()++; process_punctuator(); } +{CLOSE_BRACKET} { do_semicolon_insertion(ASI_GROUP_2); process_closing_bracket(); } - {LITERAL_TEMPLATE_START} { EXEC(do_spacing(LITERAL)) ECHO; BEGIN(tmpll); } + {LITERAL_TEMPLATE_START} { do_semicolon_insertion(ASI_GROUP_7); EXEC(do_spacing(LITERAL)) ECHO; 
BEGIN(tmpll); } (\\\\)*{LITERAL_TEMPLATE_END} { ECHO; BEGIN(divop); } (\\\\)*{LITERAL_TEMPLATE_SUBST_START} { EXEC(process_subst_open()) } {HTML_TAG_SCRIPT_CLOSE} { BEGIN(regst); return CLOSING_TAG; } @@ -1004,7 +1020,7 @@ ALL_UNICODE [\0-\x7F]|[\xC2-\xDF][\x80-\xBF]|(\xE0[\xA0-\xBF]|[\xE1-\xEF][\x8 {LITERAL_TEMPLATE_OTHER} { ECHO; } <> { return SCRIPT_CONTINUE; } -{LITERAL_REGEX_START} { EXEC(do_spacing(LITERAL)) yyout << '/'; states_correct(1); yyless(1); BEGIN(regex); } +{LITERAL_REGEX_START} { do_semicolon_insertion(ASI_GROUP_7); EXEC(do_spacing(LITERAL)) yyout << '/'; states_correct(1); yyless(1); BEGIN(regex); } {LITERAL_REGEX_END} { ECHO; BEGIN(divop); } {HTML_TAG_SCRIPT_CLOSE} { BEGIN(regst); return CLOSING_TAG; } {LITERAL_REGEX_SKIP} { ECHO; } @@ -1015,19 +1031,28 @@ ALL_UNICODE [\0-\x7F]|[\xC2-\xDF][\x80-\xBF]|(\xE0[\xA0-\xBF]|[\xE1-\xEF][\x8 <> { states_apply(); return SCRIPT_CONTINUE; } {DIV_OPERATOR} | -{DIV_ASSIGNMENT_OPERATOR} { ECHO; token = PUNCTUATOR; BEGIN(INITIAL); } +{DIV_ASSIGNMENT_OPERATOR} { previous_group = ASI_OTHER; ECHO; token = PUNCTUATOR; BEGIN(INITIAL); } + +{CLOSING_PAREN} { do_semicolon_insertion(ASI_GROUP_5); ECHO; token = PUNCTUATOR; BEGIN(divop); } +{CLOSING_BRACE} { do_semicolon_insertion(ASI_GROUP_4); ECHO; token = PUNCTUATOR; BEGIN(divop); } +{PUNCTUATOR_PREFIX} { do_semicolon_insertion(ASI_GROUP_10); process_punctuator(); } +{OPEN_PAREN_BRACE} { do_semicolon_insertion(ASI_GROUP_3); process_punctuator(); } +{PUNCTUATOR} { previous_group = ASI_OTHER; process_punctuator(); } -{CLOSING_BRACES} { ECHO; token = PUNCTUATOR; BEGIN(divop); } -{PUNCTUATOR} { process_punctuator(); } +{USE_STRICT_DIRECTIVE} { previous_group = ASI_OTHER; EXEC(do_spacing(DIRECTIVE)) ECHO; BEGIN(INITIAL); yyout << ';'; } +{USE_STRICT_DIRECTIVE_SC} { previous_group = ASI_OTHER; EXEC(do_spacing(DIRECTIVE)) ECHO; BEGIN(INITIAL); } +{KEYWORD_B} { do_semicolon_insertion(ASI_GROUP_10); EXEC(do_spacing(KEYWORD)) ECHO; BEGIN(regst); } +{KEYWORD_BA} { 
do_semicolon_insertion(ASI_GROUP_9); EXEC(do_spacing(KEYWORD)) ECHO; BEGIN(regst); } +{KEYWORD_OTHER} { previous_group = ASI_OTHER; EXEC(do_spacing(KEYWORD)) ECHO; BEGIN(regst); } -{USE_STRICT_DIRECTIVE} { EXEC(do_spacing(DIRECTIVE)) ECHO; BEGIN(INITIAL); yyout << ';'; } -{USE_STRICT_DIRECTIVE_SC} { EXEC(do_spacing(DIRECTIVE)) ECHO; BEGIN(INITIAL); } -{KEYWORD} { EXEC(do_spacing(KEYWORD)) ECHO; BEGIN(regst); } -{OPERATOR} { EXEC(do_operator_spacing(OPERATOR)) ECHO; BEGIN(divop); } -{LITERAL} { EXEC(do_spacing(LITERAL)) ECHO; BEGIN(divop); } -{IDENTIFIER} { if (unescape(YYText())) { EXEC(do_spacing(IDENTIFIER)) EXEC(do_identifier_substitution(YYText())) } BEGIN(divop); } +{OPERATOR_PREFIX} { do_semicolon_insertion(ASI_GROUP_6); EXEC(do_operator_spacing(OPERATOR)) ECHO; BEGIN(divop); } +{OPERATOR_INCR_DECR} { do_semicolon_insertion(ASI_GROUP_8); EXEC(do_operator_spacing(OPERATOR)) ECHO; BEGIN(divop); } +{OPERATOR} { previous_group = ASI_OTHER; EXEC(do_operator_spacing(OPERATOR)) ECHO; BEGIN(divop); } -.|{ALL_UNICODE} { ECHO; token = UNDEFINED; BEGIN(INITIAL); } +{LITERAL} { do_semicolon_insertion(ASI_GROUP_7); EXEC(do_spacing(LITERAL)) ECHO; BEGIN(divop); } +{IDENTIFIER} { do_semicolon_insertion(ASI_GROUP_7); if (unescape(YYText())) { EXEC(do_spacing(IDENTIFIER)) EXEC(do_identifier_substitution(YYText())) } BEGIN(divop); } + +.|{ALL_UNICODE} { previous_group = ASI_OTHER; ECHO; token = UNDEFINED; BEGIN(INITIAL); } <> { EEOF(eval_eof()) } %% @@ -1231,6 +1256,23 @@ JSTokenizer::JSRet JSTokenizer::do_identifier_substitution(const char* lexeme) return IDENTIFIER_OVERFLOW; } +void JSTokenizer::do_semicolon_insertion(ASIGroup current) +{ + assert(current >= 0 and current < ASI_GROUP_MAX); + if (newline_found) + { + newline_found = false; + if (insert_semicolon[previous_group][current]) + { + yyout << ';'; + previous_group = ASI_OTHER; + token = PUNCTUATOR; + return; + } + } + previous_group = current; +} + bool JSTokenizer::unescape(const char* lexeme) { if (strstr(lexeme, 
"\\u")) diff --git a/src/utils/test/js_normalizer_test.cc b/src/utils/test/js_normalizer_test.cc index ae750328a..5b059466b 100644 --- a/src/utils/test/js_normalizer_test.cc +++ b/src/utils/test/js_normalizer_test.cc @@ -482,8 +482,8 @@ static const char all_patterns_buf5[] = "ab\xE2\x80\xA9ww ab\xEF\xBB\xBFww ab∞ww 2abc"; static const char all_patterns_expected5[] = - "$2abc _2abc abc $__$ 肖晗 XÆA12 \u0041abc \u00FBdef \u1234ghi ab ww " - "ab ww ab ww ab ∞ ww 2 abc"; + "$2abc _2abc abc $__$ 肖晗 XÆA12 \u0041abc \u00FBdef \u1234ghi ab;ww " + "ab;ww ab ww ab ∞ ww 2 abc"; static const char all_patterns_buf6[] = "tag` template\n ${ a + b } template`"; @@ -692,7 +692,7 @@ static const char syntax_cases_buf5[] = static const char syntax_cases_expected5[] = "var i=1;while(i<100){i*=2;document.write(i+\", \");}i=1;do{i*=2;" - "document.write(i+\", \");}while(i<100)for(var i=0;i<10;i++){if(i==5){break;}" + "document.write(i+\", \");}while(i<100);for(var i=0;i<10;i++){if(i==5){break;}" "document.write(i+\", \");}for(var i=0;i<10;i++){if(i==5){continue;}" "document.write(i+\", \");}"; @@ -787,7 +787,7 @@ static const char syntax_cases_buf10[] = "var a = 2\n/ab -cd/"; static const char syntax_cases_expected10[] = - "var a=2 /ab -cd/"; + "var a=2;/ab -cd/"; static const char syntax_cases_buf11[] = "var d_str1 = \"\\\\ \" ; var d_str2 = \"abc\\\"def\" ;" @@ -1033,6 +1033,333 @@ TEST_CASE("template literal overflow", "[JSNormalizer]") } } +static const char asi_cases_buf0[] = + "array[0]\n{}"; + +static const char asi_cases_expected0[] = + "array[0];{}"; + +static const char asi_cases_buf1[] = + "array[0]\ntrue"; + +static const char asi_cases_expected1[] = + "array[0];true"; + +static const char asi_cases_buf2[] = + "array[0]\n++"; + +static const char asi_cases_expected2[] = + "array[0];++"; + +static const char asi_cases_buf3[] = + "array[0]\ncontinue"; + +static const char asi_cases_expected3[] = + "array[0];continue"; + +static const char asi_cases_buf4[] = + "array[0]\nvar 
b;"; + +static const char asi_cases_expected4[] = + "array[0];var b;"; + +static const char asi_cases_buf5[] = + "func()\ntrue"; + +static const char asi_cases_expected5[] = + "func();true"; + +static const char asi_cases_buf6[] = + "func()\n++"; + +static const char asi_cases_expected6[] = + "func();++"; + +static const char asi_cases_buf7[] = + "func()\ncontinue"; + +static const char asi_cases_expected7[] = + "func();continue"; + +static const char asi_cases_buf8[] = + "func()\nvar b;"; + +static const char asi_cases_expected8[] = + "func();var b;"; + +static const char asi_cases_buf9[] = + "1024\n{}"; + +static const char asi_cases_expected9[] = + "1024;{}"; + +static const char asi_cases_buf10[] = + "1024\ntrue"; + +static const char asi_cases_expected10[] = + "1024;true"; + +static const char asi_cases_buf11[] = + "1024\n++"; + +static const char asi_cases_expected11[] = + "1024;++"; + +static const char asi_cases_buf12[] = + "1024\ncontinue"; + +static const char asi_cases_expected12[] = + "1024;continue"; + +static const char asi_cases_buf13[] = + "1024\nvar b;"; + +static const char asi_cases_expected13[] = + "1024;var b;"; + +static const char asi_cases_buf14[] = + "++\n{}"; + +static const char asi_cases_expected14[] = + "++;{}"; + +static const char asi_cases_buf15[] = + "++\n[1,2,3]"; + +static const char asi_cases_expected15[] = + "++;[1,2,3]"; + +static const char asi_cases_buf16[] = + "++\ntrue"; + +static const char asi_cases_expected16[] = + "++;true"; + +static const char asi_cases_buf17[] = + "++\n++"; + +static const char asi_cases_expected17[] = + "++;++"; + +static const char asi_cases_buf18[] = + "++\ncontinue"; + +static const char asi_cases_expected18[] = + "++;continue"; + +static const char asi_cases_buf19[] = + "++\nvar b;"; + +static const char asi_cases_expected19[] = + "++;var b;"; + +static const char asi_cases_buf20[] = + "return\n{}"; + +static const char asi_cases_expected20[] = + "return;{}"; + +static const char 
asi_cases_buf21[] = + "return\n[1,2,3]"; + +static const char asi_cases_expected21[] = + "return;[1,2,3]"; + +static const char asi_cases_buf22[] = + "return\n+a"; + +static const char asi_cases_expected22[] = + "return;+a"; + +static const char asi_cases_buf23[] = + "return\ntrue"; + +static const char asi_cases_expected23[] = + "return;true"; + +static const char asi_cases_buf24[] = + "return\n++"; + +static const char asi_cases_expected24[] = + "return;++"; + +static const char asi_cases_buf25[] = + "return\ncontinue"; + +static const char asi_cases_expected25[] = + "return;continue"; + +static const char asi_cases_buf26[] = + "return\nvar b;"; + +static const char asi_cases_expected26[] = + "return;var b;"; + +TEST_CASE("automatic semicolon insertion", "[JSNormalizer]") +{ + SECTION("group_4 to group_1") + { + NORMALIZE(asi_cases_buf0); + VALIDATE(asi_cases_buf0, asi_cases_expected0); + } + + SECTION("group_4 to group_7") + { + NORMALIZE(asi_cases_buf1); + VALIDATE(asi_cases_buf1, asi_cases_expected1); + } + + SECTION("group_4 to group_8") + { + NORMALIZE(asi_cases_buf2); + VALIDATE(asi_cases_buf2, asi_cases_expected2); + } + + SECTION("group_4 to group_9") + { + NORMALIZE(asi_cases_buf3); + VALIDATE(asi_cases_buf3, asi_cases_expected3); + } + + SECTION("group_4 to group_10") + { + NORMALIZE(asi_cases_buf4); + VALIDATE(asi_cases_buf4, asi_cases_expected4); + } + + SECTION("group_5 to group_7") + { + NORMALIZE(asi_cases_buf5); + VALIDATE(asi_cases_buf5, asi_cases_expected5); + } + + SECTION("group_5 to group_8") + { + NORMALIZE(asi_cases_buf6); + VALIDATE(asi_cases_buf6, asi_cases_expected6); + } + + SECTION("group_5 to group_9") + { + NORMALIZE(asi_cases_buf7); + VALIDATE(asi_cases_buf7, asi_cases_expected7); + } + + SECTION("group_5 to group_10") + { + NORMALIZE(asi_cases_buf8); + VALIDATE(asi_cases_buf8, asi_cases_expected8); + } + + SECTION("group_7 to group_1") + { + NORMALIZE(asi_cases_buf9); + VALIDATE(asi_cases_buf9, asi_cases_expected9); + } + + 
SECTION("group_7 to group_7") + { + NORMALIZE(asi_cases_buf10); + VALIDATE(asi_cases_buf10, asi_cases_expected10); + } + + SECTION("group_7 to group_8") + { + NORMALIZE(asi_cases_buf11); + VALIDATE(asi_cases_buf11, asi_cases_expected11); + } + + SECTION("group_7 to group_9") + { + NORMALIZE(asi_cases_buf12); + VALIDATE(asi_cases_buf12, asi_cases_expected12); + } + + SECTION("group_7 to group_10") + { + NORMALIZE(asi_cases_buf13); + VALIDATE(asi_cases_buf13, asi_cases_expected13); + } + + SECTION("group_8 to group_1") + { + NORMALIZE(asi_cases_buf14); + VALIDATE(asi_cases_buf14, asi_cases_expected14); + } + + SECTION("group_8 to group_3") + { + NORMALIZE(asi_cases_buf15); + VALIDATE(asi_cases_buf15, asi_cases_expected15); + } + + SECTION("group_8 to group_7") + { + NORMALIZE(asi_cases_buf16); + VALIDATE(asi_cases_buf16, asi_cases_expected16); + } + + SECTION("group_8 to group_8") + { + NORMALIZE(asi_cases_buf17); + VALIDATE(asi_cases_buf17, asi_cases_expected17); + } + + SECTION("group_8 to group_9") + { + NORMALIZE(asi_cases_buf18); + VALIDATE(asi_cases_buf18, asi_cases_expected18); + } + + SECTION("group_8 to group_10") + { + NORMALIZE(asi_cases_buf19); + VALIDATE(asi_cases_buf19, asi_cases_expected19); + } + + SECTION("group_9 to group_1") + { + NORMALIZE(asi_cases_buf20); + VALIDATE(asi_cases_buf20, asi_cases_expected20); + } + + SECTION("group_9 to group_3") + { + NORMALIZE(asi_cases_buf21); + VALIDATE(asi_cases_buf21, asi_cases_expected21); + } + + SECTION("group_9 to group_6") + { + NORMALIZE(asi_cases_buf22); + VALIDATE(asi_cases_buf22, asi_cases_expected22); + } + + SECTION("group_9 to group_7") + { + NORMALIZE(asi_cases_buf23); + VALIDATE(asi_cases_buf23, asi_cases_expected23); + } + + SECTION("group_9 to group_8") + { + NORMALIZE(asi_cases_buf24); + VALIDATE(asi_cases_buf24, asi_cases_expected24); + } + + SECTION("group_9 to group_9") + { + NORMALIZE(asi_cases_buf25); + VALIDATE(asi_cases_buf25, asi_cases_expected25); + } + + SECTION("group_9 to 
group_10") + { + NORMALIZE(asi_cases_buf26); + VALIDATE(asi_cases_buf26, asi_cases_expected26); + } +} + TEST_CASE("endings", "[JSNormalizer]") { SECTION("script closing tag is present", "[JSNormalizer]") @@ -1889,68 +2216,69 @@ TEST_CASE("memcap", "[JSNormalizer]") static constexpr const char* s_closing_tag = ""; -#define MAKE_INPUT(src, src_len, start, mid, end, depth) \ - std::string input_##src(start); \ - input_##src.append(depth - strlen(start) - strlen(end) - strlen(s_closing_tag), mid); \ - input_##src.append(end, strlen(end)); \ - input_##src.append(s_closing_tag, strlen(s_closing_tag)); \ - const char* src = input_##src.c_str(); \ - size_t src_len = input_##src.size() +static const std::string make_input(const char* begin, const char* mid, + const char* end, size_t len) +{ + std::string s(begin); + int fill = (len - strlen(begin) - strlen(end)) / strlen(mid); + for (int i = 0; i < fill; ++i) + s.append(mid); + s.append(end); + return s; +} TEST_CASE("benchmarking - ::normalize() - literals", "[JSNormalizer]") { JSIdentifierCtxTest ident_ctx; JSNormalizer normalizer(ident_ctx, UNLIM_DEPTH, MAX_TEMPLATE_NESTNIG); char dst[DEPTH]; - - MAKE_INPUT(src_ws, src_ws_len, "", ' ', "", DEPTH); - MAKE_INPUT(src_bcomm, src_bcomm_len, "/*", ' ', "*/", DEPTH); - MAKE_INPUT(src_dqstr, src_dqstr_len, "\"", ' ', "\"", DEPTH); - + auto whitespace = make_input("", " ", "", DEPTH); + auto block_comment = make_input("/*", " ", "*/", DEPTH); + auto double_quote = make_input("\"", " ", "\"", DEPTH); BENCHMARK("memcpy - whitespaces - 65535 bytes") { - return memcpy(dst, src_ws, src_ws_len); + return memcpy(dst, whitespace.c_str(), whitespace.size()); }; BENCHMARK("whitespaces - 65535 bytes") { normalizer.rewind_output(); - return normalizer.normalize(src_ws, src_ws_len); + return normalizer.normalize(whitespace.c_str(), whitespace.size()); }; BENCHMARK("block comment - 65535 bytes") { normalizer.rewind_output(); - return normalizer.normalize(src_bcomm, src_bcomm_len); + return 
normalizer.normalize(block_comment.c_str(), block_comment.size()); }; BENCHMARK("double quotes string - 65535 bytes") { normalizer.rewind_output(); - return normalizer.normalize(src_dqstr, src_dqstr_len); + return normalizer.normalize(double_quote.c_str(), double_quote.size()); }; constexpr size_t depth_8k = 8192; - MAKE_INPUT(src_ws_8k, src_ws_len_8k, "", ' ', "", depth_8k); - MAKE_INPUT(src_bcomm_8k, src_bcomm_len_8k, "/*", ' ', "*/", depth_8k); - MAKE_INPUT(src_dqstr_8k, src_dqstr_len_8k, "\"", ' ', "\"", depth_8k); + auto whitespace_8k = make_input("", " ", "", depth_8k); + auto block_comment_8k = make_input("/*", " ", "*/", depth_8k); + auto double_quote_8k = make_input("\"", " ", "\"", depth_8k); BENCHMARK("memcpy - whitespaces - 8192 bytes") { - return memcpy(dst, src_ws_8k, src_ws_len_8k); + return memcpy(dst, whitespace_8k.c_str(), whitespace_8k.size()); }; BENCHMARK("whitespaces - 8192 bytes") { normalizer.rewind_output(); - return normalizer.normalize(src_ws_8k, src_ws_len_8k); + return normalizer.normalize(whitespace_8k.c_str(), whitespace_8k.size()); }; BENCHMARK("block comment - 8192 bytes") { normalizer.rewind_output(); - return normalizer.normalize(src_bcomm_8k, src_bcomm_len_8k); + return normalizer.normalize(block_comment_8k.c_str(), block_comment_8k.size()); }; BENCHMARK("double quotes string - 8192 bytes") { normalizer.rewind_output(); - return normalizer.normalize(src_dqstr_8k, src_dqstr_len_8k); + return normalizer.normalize(double_quote_8k.c_str(), double_quote_8k.size()); }; } @@ -1985,4 +2313,25 @@ TEST_CASE("benchmarking - ::normalize() - identifiers") }; } +TEST_CASE("benchmarking - ::normalize() - automatic semicolon insertion") +{ + auto w_semicolons = make_input("", "a;\n", s_closing_tag, DEPTH); + auto wo_semicolons = make_input("", "a \n", s_closing_tag, DEPTH); + const char* src_w_semicolons = w_semicolons.c_str(); + const char* src_wo_semicolons = wo_semicolons.c_str(); + size_t src_len = w_semicolons.size(); + + 
JSIdentifierCtxTest ident_ctx_mock; + JSNormalizer normalizer_wo_ident(ident_ctx_mock, UNLIM_DEPTH, MAX_TEMPLATE_NESTNIG); + + BENCHMARK("without semicolon insertion") + { + return normalizer_wo_ident.normalize(src_w_semicolons, src_len); + }; + + BENCHMARK("with semicolon insertion") + { + return normalizer_wo_ident.normalize(src_wo_semicolons, src_len); + }; +} #endif // BENCHMARK_TEST