]> git.ipfire.org Git - thirdparty/snort3.git/commitdiff
Merge pull request #3089 in SNORT/snort3 from ~DKYRYLOV/snort3:js_norm_asi to master
authorMike Stepanek (mstepane) <mstepane@cisco.com>
Mon, 11 Oct 2021 10:54:31 +0000 (10:54 +0000)
committerMike Stepanek (mstepane) <mstepane@cisco.com>
Mon, 11 Oct 2021 10:54:31 +0000 (10:54 +0000)
Squashed commit of the following:

commit feeedee58a22544fb4788a2646af52c65f1dc8cf
Author: dkyrylov <dkyrylov@cisco.com>
Date:   Mon Sep 20 14:48:53 2021 +0300

    http_inspect: add automatic semicolon insertion

src/utils/js_tokenizer.h
src/utils/js_tokenizer.l
src/utils/test/js_normalizer_test.cc

index d7f73d054e9088263a4470369900e9d8da7abb2e..4b9c0fe2b5ee05bd54701ade2700c35e202353be 100644 (file)
@@ -55,6 +55,24 @@ private:
         DIRECTIVE
     };
 
+    enum ASIGroup   // token classes used as indices into insert_semicolon[previous][current]
+    {
+        ASI_OTHER = 0,  // any token not listed below; its table row and column are all false
+        ASI_GROUP_1,    // {
+        ASI_GROUP_2,    // }
+        ASI_GROUP_3,    // [ (
+        ASI_GROUP_4,    // ]
+        ASI_GROUP_5,    // )
+        ASI_GROUP_6,    // + -
+        ASI_GROUP_7,    // this true false null identifier literal
+                        // IDENTIFIER + LITERAL + KEYWORD_LITERAL (also string, template and regex literal starts)
+        ASI_GROUP_8,    // ++ --
+        ASI_GROUP_9,    // continue break return debugger // same as KEYWORD_BA
+        ASI_GROUP_10,   // var function new delete void typeof if do while for with
+                        // switch throw try ~ !  (KEYWORD_B and PUNCTUATOR_PREFIX)
+        ASI_GROUP_MAX   // number of groups; dimension of the lookup table
+    };
+
 public:
     enum JSRet
     {
@@ -87,6 +105,7 @@ private:
     JSRet eval_eof();
     JSRet do_spacing(JSToken cur_token);
     JSRet do_operator_spacing(JSToken cur_token);
+    void do_semicolon_insertion(ASIGroup current);
     JSRet do_identifier_substitution(const char* lexeme);
     bool unescape(const char* lexeme);
     void process_punctuator();
@@ -103,6 +122,7 @@ private:
     uint8_t max_template_nesting;
     std::stack<uint16_t, std::vector<uint16_t>> bracket_depth;
     JSToken token = UNDEFINED;
+    ASIGroup previous_group = ASI_OTHER;
     JSIdentifierCtxBase& ident_ctx;
 
     struct
@@ -116,6 +136,21 @@ private:
     char*& tmp_buf;
     size_t& tmp_buf_size;
     const int tmp_cap_size;
+    bool newline_found = false;
+    constexpr static bool insert_semicolon[ASI_GROUP_MAX][ASI_GROUP_MAX] // [previous group][current group]
+    {   // true => emit ';' when a line terminator was seen between the two tokens
+        {false, false, false, false, false, false, false, false, false, false, false,}, // previous: ASI_OTHER
+        {false, false, false, false, false, false, false, false, false, false, false,}, // previous: ASI_GROUP_1  {
+        {false, false, false, false, false, false, false, false, false, false, false,}, // previous: ASI_GROUP_2  }
+        {false, false, false, false, false, false, false, false, false, false, false,}, // previous: ASI_GROUP_3  [ (
+        {false, true,  false, false, false, false, false, true,  true,  true,  true, }, // previous: ASI_GROUP_4  ]
+        {false, false, false, false, false, false, false, true,  true,  true,  true, }, // previous: ASI_GROUP_5  )
+        {false, false, false, false, false, false, false, false, false, false, false,}, // previous: ASI_GROUP_6  + -
+        {false, true,  false, false, false, false, false, true,  true,  true,  true, }, // previous: ASI_GROUP_7  identifier/literal
+        {false, true,  false, true,  false, false, false, true,  true,  true,  true, }, // previous: ASI_GROUP_8  ++ --
+        {false, true,  false, true,  false, false, true,  true,  true,  true,  true, }, // previous: ASI_GROUP_9  continue break return debugger
+        {false, false, false, false, false, false, false, false, false, false, false,}  // previous: ASI_GROUP_10 statement-start keywords, ~ !
+    };
 };
 
 #endif // JS_TOKENIZER_H
index ea8a350d53c4bbad944f7f850fc3a00fbec563b4..81d8f30fec250d953159bdaf3f374be9f581d89e 100644 (file)
 
 #define EXEC(f) { auto r = (f); if (r) { BEGIN(regst); return r; } }
 #define EEOF(f) { auto r = (f); if (r) { if (r != SCRIPT_CONTINUE) BEGIN(regst); return r; } }
+constexpr bool JSTokenizer::insert_semicolon[ASI_GROUP_MAX][ASI_GROUP_MAX];
 %}
 
 /* The following grammar was created based on ECMAScript specification */
 /* source https://ecma-international.org/ecma-262/5.1/ */
 
-/* whitespaces */
-/* according to https://ecma-international.org/ecma-262/5.1/#sec-7.2 */
-TAB            \x9
-VT             \xB
-FF             \xC
-SP             \x20
-NBSP           \xA0
-BOM            \xEF\xBB\xBF
-WHITESPACES    {TAB}|{VT}|{FF}|{SP}|{NBSP}|{BOM}
-
-/* single char escape sequences */
-/* according to https://ecma-international.org/ecma-262/5.1/#sec-7.8.4 */
-NUL                      \x0
-BS                       \x8
-HT                       \x9
-CHAR_ESCAPE_SEQUENCES    {NUL}|{BS}|{HT}
-
-/* line terminators */
-/* according to https://ecma-international.org/ecma-262/5.1/#sec-7.3 */
-LF                  \xA
-CR                  \xD
-LS                  \xE2\x80\xA8
-PS                  \xE2\x80\xA9
-LINE_TERMINATORS    {LF}|{CR}|{LS}|{PS}
-
-/* comments */
-/* according to https://ecma-international.org/ecma-262/5.1/#sec-7.4 */
-LINE_COMMENT_START   "//"
-LINE_COMMENT_END1    [^<\xA\xD]*\xA
-LINE_COMMENT_END2    [^<\xA\xD]*\xD
-LINE_COMMENT_END3    [^<\xA\xD]*"<"+(?i:script)
-LINE_COMMENT_END4    [^<\xA\xD]*"<"+(?i:\/script>)
-LINE_COMMENT_SKIP    [^<\xA\xD]*"<"?
-BLOCK_COMMENT_START  "/*"
-BLOCK_COMMENT_END1   [^<*]*"*"+"/"
-BLOCK_COMMENT_END2   [^<*]*"<"+(?i:script)
-BLOCK_COMMENT_END3   [^<*]*"<"+(?i:\/script>)
-BLOCK_COMMENT_SKIP   [^<*]*[<*]?
-
-/* directives */
-/* according to https://ecma-international.org/ecma-262/5.1/#sec-14.1 */
-USE_STRICT_DIRECTIVE    "\"use strict\""|"\'use strict\'"
-USE_STRICT_DIRECTIVE_SC "\"use strict\"";*|"\'use strict\'";*
-
-/* keywords */
-/* according to https://ecma-international.org/ecma-262/5.1/#sec-7.6.1.1 */
-KEYWORD    break|case|debugger|in|import|protected|do|else|function|try|implements|static|instanceof|new|this|class|let|typeof|var|with|enum|private|catch|continue|default|extends|public|finally|for|if|super|yield|return|switch|throw|const|interface|void|while|delete|export|package
-
-/* punctuators */
-/* according to https://ecma-international.org/ecma-262/5.1/#sec-7.7 */
-CLOSING_BRACES             ")"|"]"
-OPEN_BRACKET               "{"
-CLOSE_BRACKET              "}"
-PUNCTUATOR                 "("|"["|">="|"=="|"!="|"==="|"!=="|"."|";"|","|"<"|">"|"<="|"<<"|">>"|">>>"|"&"|"|"|"^"|"!"|"&&"|"||"|"?"|":"|"="|"+="|"-="|"*="|"%="|"<<="|">>="|">>>="|"&="|"|="|"^="|"~"
-OPERATOR                   "+"|"-"|"*"|"++"|"--"|"%"
-DIV_OPERATOR               "/"
-DIV_ASSIGNMENT_OPERATOR    "/="
-
 /* Unicode letter ranges (categories Lu, Ll, Lt, Lm, Lo and Nl) */
 /* generated with unicode_range_generator.l */
 /* UTF-8 ranges generated with https://lists.gnu.org/archive/html/help-flex/2005-01/msg00043.html */
@@ -878,6 +821,76 @@ UNICODE_ZWJ     \xE2\x80\x8D
 /* according to https://ecma-international.org/ecma-262/5.1/#sec-7.8.4 (escape sequence) */
 UNICODE_ESCAPE_SEQUENCE    \\u[0-9a-fA-F]{4}
 
+/* whitespaces */
+/* according to https://ecma-international.org/ecma-262/5.1/#sec-7.2 */
+TAB            \x9
+VT             \xB
+FF             \xC
+SP             \x20
+NBSP           \xA0
+BOM            \xEF\xBB\xBF
+WHITESPACES    {TAB}|{VT}|{FF}|{SP}|{NBSP}|{BOM}
+
+/* single char escape sequences */
+/* according to https://ecma-international.org/ecma-262/5.1/#sec-7.8.4 */
+NUL                      \x0
+BS                       \x8
+HT                       \x9
+CHAR_ESCAPE_SEQUENCES    {NUL}|{BS}|{HT}
+
+/* line terminators */
+/* according to https://ecma-international.org/ecma-262/5.1/#sec-7.3 */
+LF                  \xA
+CR                  \xD
+LS                  \xE2\x80\xA8
+PS                  \xE2\x80\xA9
+LINE_TERMINATORS    {LF}|{CR}|{LS}|{PS}
+
+/* comments */
+/* according to https://ecma-international.org/ecma-262/5.1/#sec-7.4 */
+LINE_COMMENT_START   "//"
+LINE_COMMENT_END1    [^<\xA\xD]*\xA
+LINE_COMMENT_END2    [^<\xA\xD]*\xD
+LINE_COMMENT_END3    [^<\xA\xD]*"<"+(?i:script)
+LINE_COMMENT_END4    [^<\xA\xD]*"<"+(?i:\/script>)
+LINE_COMMENT_SKIP    [^<\xA\xD]*"<"?
+BLOCK_COMMENT_START  "/*"
+BLOCK_COMMENT_END1   [^<*\xA\xD]*"*"+"/"
+BLOCK_COMMENT_END2   [^<*\xA\xD]*"<"+(?i:script)
+BLOCK_COMMENT_END3   [^<*\xA\xD]*"<"+(?i:\/script>)
+BLOCK_COMMENT_LINE1  [^<*\xA\xD]*\xA
+BLOCK_COMMENT_LINE2  [^<*\xA\xD]*\xD
+BLOCK_COMMENT_SKIP   [^<*\xA\xD]*[<*]?
+
+/* directives */
+/* according to https://ecma-international.org/ecma-262/5.1/#sec-14.1 */
+USE_STRICT_DIRECTIVE    "\"use strict\""|"\'use strict\'"
+USE_STRICT_DIRECTIVE_SC "\"use strict\"";*|"\'use strict\'";*
+
+/* keywords */
+/* according to https://ecma-international.org/ecma-262/5.1/#sec-7.6.1.1 */
+/* keywords that can appear at the beginning or the end of Statement */
+KEYWORD_BA break|continue|debugger|return
+/* keywords that can appear at the beginning of Statement*/
+KEYWORD_B  delete|do|for|function|if|new|switch|throw|try|typeof|var|void|while|with
+/* keywords that can not appear at the beginning or the end of Statement*/
+KEYWORD_OTHER    case|catch|class|const|default|else|enum|export|extends|finally|implements|import|in|instanceof|interface|let|package|private|protected|public|static|super|yield
+
+/* punctuators */
+/* according to https://ecma-international.org/ecma-262/5.1/#sec-7.7 */
+CLOSING_PAREN              ")"
+CLOSING_BRACE              "]"
+OPEN_BRACKET               "{"
+CLOSE_BRACKET              "}"
+PUNCTUATOR_PREFIX          "~"|"!"
+OPEN_PAREN_BRACE           "("|"["
+PUNCTUATOR                 ">="|"=="|"!="|"==="|"!=="|"."|";"|","|"<"|">"|"<="|"<<"|">>"|">>>"|"&"|"|"|"^"|"&&"|"||"|"?"|":"|"="|"+="|"-="|"*="|"%="|"<<="|">>="|">>>="|"&="|"|="|"^="
+OPERATOR_PREFIX            "+"|"-"
+OPERATOR_INCR_DECR         "--"|"++"
+OPERATOR                   "*"|"%"
+DIV_OPERATOR               "/"
+DIV_ASSIGNMENT_OPERATOR    "/="
+
 /* identifiers */
 /* according to https://ecma-international.org/ecma-262/5.1/#sec-7.6 */
 IDENTIFIER_START    [_$]|({UNICODE_LETTER})|{UNICODE_ESCAPE_SEQUENCE}
@@ -887,6 +900,7 @@ IDENTIFIER          ({IDENTIFIER_START}{IDENTIFIER_PART})*
 /* literals */
 /* according to https://ecma-international.org/ecma-262/5.1/#sec-7.8 */
 LITERAL_NULL                  null
+LITERAL_THIS                  this
 LITERAL_BOOLEAN               true|false
 LITERAL_DECIMAL               [.]?[0-9]+[\.]?[0-9]*[eE]?[0-9]*
 LITERAL_HEX_INTEGER           0x[0-9a-fA-F]*|0X[0-9a-fA-F]*
@@ -910,7 +924,7 @@ LITERAL_REGEX_SKIP            \\\/
 LITERAL_UNDEFINED             undefined
 LITERAL_INFINITY              Infinity|\xE2\x88\x9E
 LITERAL_NAN                   NaN
-LITERAL                       {LITERAL_NULL}|{LITERAL_BOOLEAN}|{LITERAL_DECIMAL}|{LITERAL_HEX_INTEGER}|{LITERAL_UNDEFINED}|{LITERAL_INFINITY}|{LITERAL_NAN}
+LITERAL                       {LITERAL_NULL}|{LITERAL_THIS}|{LITERAL_BOOLEAN}|{LITERAL_DECIMAL}|{LITERAL_HEX_INTEGER}|{LITERAL_UNDEFINED}|{LITERAL_INFINITY}|{LITERAL_NAN}
 
 HTML_COMMENT_OPEN         "<"+"!--"
 HTML_TAG_SCRIPT_OPEN      "<"+(?i:script)
@@ -949,15 +963,15 @@ ALL_UNICODE    [\0-\x7F]|[\xC2-\xDF][\x80-\xBF]|(\xE0[\xA0-\xBF]|[\xE1-\xEF][\x8
 %%
 {WHITESPACES}                       { }
 {CHAR_ESCAPE_SEQUENCES}             { }
-{LINE_TERMINATORS}                  { BEGIN(regst); }
+{LINE_TERMINATORS}                  { BEGIN(regst); newline_found = true; }
 
 <INITIAL,regex,dqstr,regst,sqstr,divop>{HTML_TAG_SCRIPT_OPEN} { BEGIN(regst); return OPENING_TAG; }
 {HTML_TAG_SCRIPT_CLOSE}             { BEGIN(regst); return SCRIPT_ENDED; }
 
        {HTML_COMMENT_OPEN}          { BEGIN(lcomm); }
        {LINE_COMMENT_START}         { BEGIN(lcomm); }
-<lcomm>{LINE_COMMENT_END1}          { BEGIN(regst); }
-<lcomm>{LINE_COMMENT_END2}          { BEGIN(regst); }
+<lcomm>{LINE_COMMENT_END1}          { BEGIN(regst); newline_found = true; }
+<lcomm>{LINE_COMMENT_END2}          { BEGIN(regst); newline_found = true; }
 <lcomm>{LINE_COMMENT_END3}          { BEGIN(regst); return OPENING_TAG; }
 <lcomm>{LINE_COMMENT_END4}          { BEGIN(regst); return CLOSING_TAG; }
 <lcomm>{LINE_COMMENT_SKIP}          { }
@@ -967,10 +981,12 @@ ALL_UNICODE    [\0-\x7F]|[\xC2-\xDF][\x80-\xBF]|(\xE0[\xA0-\xBF]|[\xE1-\xEF][\x8
 <bcomm>{BLOCK_COMMENT_END1}         { BEGIN(regst); }
 <bcomm>{BLOCK_COMMENT_END2}         { BEGIN(regst); return OPENING_TAG; }
 <bcomm>{BLOCK_COMMENT_END3}         { BEGIN(regst); return CLOSING_TAG; }
+<bcomm>{BLOCK_COMMENT_LINE1}        |
+<bcomm>{BLOCK_COMMENT_LINE2}        { newline_found = true;}
 <bcomm>{BLOCK_COMMENT_SKIP}         { }
 <bcomm><<EOF>>                      { states_apply(); return SCRIPT_CONTINUE; }
 
-       {LITERAL_DQ_STRING_START}    { EXEC(do_spacing(LITERAL)) ECHO; BEGIN(dqstr); }
+       {LITERAL_DQ_STRING_START}    { do_semicolon_insertion(ASI_GROUP_7); EXEC(do_spacing(LITERAL)) ECHO; BEGIN(dqstr); }
 <dqstr>{LITERAL_DQ_STRING_END}      { ECHO; BEGIN(divop); }
 <dqstr>{HTML_TAG_SCRIPT_CLOSE}      { BEGIN(regst); return CLOSING_TAG; }
 <dqstr>\\{CR}{LF}                   { }
@@ -981,7 +997,7 @@ ALL_UNICODE    [\0-\x7F]|[\xC2-\xDF][\x80-\xBF]|(\xE0[\xA0-\xBF]|[\xE1-\xEF][\x8
 <dqstr>{LITERAL_DQ_STRING_TEXT}     { ECHO; }
 <dqstr><<EOF>>                      { states_apply(); return SCRIPT_CONTINUE; }
 
-       {LITERAL_SQ_STRING_START}    { EXEC(do_spacing(LITERAL)) ECHO; BEGIN(sqstr); }
+       {LITERAL_SQ_STRING_START}    { do_semicolon_insertion(ASI_GROUP_7); EXEC(do_spacing(LITERAL)) ECHO; BEGIN(sqstr); }
 <sqstr>{LITERAL_SQ_STRING_END}      { ECHO; BEGIN(divop); }
 <sqstr>{HTML_TAG_SCRIPT_CLOSE}      { BEGIN(regst); return CLOSING_TAG; }
 <sqstr>\\{CR}{LF}                   { }
@@ -992,10 +1008,10 @@ ALL_UNICODE    [\0-\x7F]|[\xC2-\xDF][\x80-\xBF]|(\xE0[\xA0-\xBF]|[\xE1-\xEF][\x8
 <sqstr>{LITERAL_SQ_STRING_TEXT}     { ECHO; }
 <sqstr><<EOF>>                      { states_apply(); return SCRIPT_CONTINUE; }
 
-{OPEN_BRACKET}                      { if (!bracket_depth.empty()) bracket_depth.top()++; process_punctuator(); }
-{CLOSE_BRACKET}                     { process_closing_bracket(); }
+{OPEN_BRACKET}                      { do_semicolon_insertion(ASI_GROUP_1); if (!bracket_depth.empty()) bracket_depth.top()++; process_punctuator(); }
+{CLOSE_BRACKET}                     { do_semicolon_insertion(ASI_GROUP_2); process_closing_bracket(); }
 
-       {LITERAL_TEMPLATE_START}                  { EXEC(do_spacing(LITERAL)) ECHO; BEGIN(tmpll); }
+       {LITERAL_TEMPLATE_START}                  { do_semicolon_insertion(ASI_GROUP_7); EXEC(do_spacing(LITERAL)) ECHO; BEGIN(tmpll); }
 <tmpll>(\\\\)*{LITERAL_TEMPLATE_END}             { ECHO; BEGIN(divop); }
 <tmpll>(\\\\)*{LITERAL_TEMPLATE_SUBST_START}     { EXEC(process_subst_open()) }
 <tmpll>{HTML_TAG_SCRIPT_CLOSE}                   { BEGIN(regst); return CLOSING_TAG; }
@@ -1004,7 +1020,7 @@ ALL_UNICODE    [\0-\x7F]|[\xC2-\xDF][\x80-\xBF]|(\xE0[\xA0-\xBF]|[\xE1-\xEF][\x8
 <tmpll>{LITERAL_TEMPLATE_OTHER}                  { ECHO; }
 <tmpll><<EOF>>                                   { return SCRIPT_CONTINUE; }
 
-<regst>{LITERAL_REGEX_START}        { EXEC(do_spacing(LITERAL)) yyout << '/'; states_correct(1); yyless(1); BEGIN(regex); }
+<regst>{LITERAL_REGEX_START}        { do_semicolon_insertion(ASI_GROUP_7); EXEC(do_spacing(LITERAL)) yyout << '/'; states_correct(1); yyless(1); BEGIN(regex); }
 <regex>{LITERAL_REGEX_END}          { ECHO; BEGIN(divop); }
 <regex>{HTML_TAG_SCRIPT_CLOSE}      { BEGIN(regst); return CLOSING_TAG; }
 <regex>{LITERAL_REGEX_SKIP}         { ECHO; }
@@ -1015,19 +1031,28 @@ ALL_UNICODE    [\0-\x7F]|[\xC2-\xDF][\x80-\xBF]|(\xE0[\xA0-\xBF]|[\xE1-\xEF][\x8
 <regex><<EOF>>                      { states_apply(); return SCRIPT_CONTINUE; }
 
 <divop>{DIV_OPERATOR}               |
-<divop>{DIV_ASSIGNMENT_OPERATOR}    { ECHO; token = PUNCTUATOR; BEGIN(INITIAL); }
+<divop>{DIV_ASSIGNMENT_OPERATOR}    { previous_group = ASI_OTHER; ECHO; token = PUNCTUATOR; BEGIN(INITIAL); }
+
+{CLOSING_PAREN}                     { do_semicolon_insertion(ASI_GROUP_5); ECHO; token = PUNCTUATOR; BEGIN(divop); }
+{CLOSING_BRACE}                     { do_semicolon_insertion(ASI_GROUP_4); ECHO; token = PUNCTUATOR; BEGIN(divop); }
+{PUNCTUATOR_PREFIX}                 { do_semicolon_insertion(ASI_GROUP_10); process_punctuator(); }
+{OPEN_PAREN_BRACE}                  { do_semicolon_insertion(ASI_GROUP_3); process_punctuator(); }
+{PUNCTUATOR}                        { previous_group = ASI_OTHER; process_punctuator(); }
 
-{CLOSING_BRACES}                    { ECHO; token = PUNCTUATOR; BEGIN(divop); }
-{PUNCTUATOR}                        { process_punctuator(); }
+{USE_STRICT_DIRECTIVE}              { previous_group = ASI_OTHER; EXEC(do_spacing(DIRECTIVE)) ECHO; BEGIN(INITIAL); yyout << ';'; }
+{USE_STRICT_DIRECTIVE_SC}           { previous_group = ASI_OTHER; EXEC(do_spacing(DIRECTIVE)) ECHO; BEGIN(INITIAL); }
+{KEYWORD_B}                         { do_semicolon_insertion(ASI_GROUP_10); EXEC(do_spacing(KEYWORD)) ECHO; BEGIN(regst); }
+{KEYWORD_BA}                        { do_semicolon_insertion(ASI_GROUP_9); EXEC(do_spacing(KEYWORD)) ECHO; BEGIN(regst); }
+{KEYWORD_OTHER}                     { previous_group = ASI_OTHER; EXEC(do_spacing(KEYWORD)) ECHO; BEGIN(regst); }
 
-{USE_STRICT_DIRECTIVE}              { EXEC(do_spacing(DIRECTIVE)) ECHO; BEGIN(INITIAL); yyout << ';'; }
-{USE_STRICT_DIRECTIVE_SC}           { EXEC(do_spacing(DIRECTIVE)) ECHO; BEGIN(INITIAL); }
-{KEYWORD}                           { EXEC(do_spacing(KEYWORD)) ECHO; BEGIN(regst); }
-{OPERATOR}                          { EXEC(do_operator_spacing(OPERATOR)) ECHO; BEGIN(divop); }
-{LITERAL}                           { EXEC(do_spacing(LITERAL)) ECHO; BEGIN(divop); }
-{IDENTIFIER}                        { if (unescape(YYText())) { EXEC(do_spacing(IDENTIFIER)) EXEC(do_identifier_substitution(YYText())) } BEGIN(divop); }
+{OPERATOR_PREFIX}                   { do_semicolon_insertion(ASI_GROUP_6); EXEC(do_operator_spacing(OPERATOR)) ECHO; BEGIN(divop); }
+{OPERATOR_INCR_DECR}                { do_semicolon_insertion(ASI_GROUP_8); EXEC(do_operator_spacing(OPERATOR)) ECHO; BEGIN(divop); }
+{OPERATOR}                          { previous_group = ASI_OTHER; EXEC(do_operator_spacing(OPERATOR)) ECHO; BEGIN(divop); }
 
-.|{ALL_UNICODE}                     { ECHO; token = UNDEFINED; BEGIN(INITIAL); }
+{LITERAL}                           { do_semicolon_insertion(ASI_GROUP_7); EXEC(do_spacing(LITERAL)) ECHO; BEGIN(divop); }
+{IDENTIFIER}                        { do_semicolon_insertion(ASI_GROUP_7); if (unescape(YYText())) { EXEC(do_spacing(IDENTIFIER)) EXEC(do_identifier_substitution(YYText())) } BEGIN(divop); }
+
+.|{ALL_UNICODE}                     { previous_group = ASI_OTHER; ECHO; token = UNDEFINED; BEGIN(INITIAL); }
 <<EOF>>                             { EEOF(eval_eof()) }
 
 %%
@@ -1231,6 +1256,23 @@ JSTokenizer::JSRet JSTokenizer::do_identifier_substitution(const char* lexeme)
     return IDENTIFIER_OVERFLOW;
 }
 
+void JSTokenizer::do_semicolon_insertion(ASIGroup current) // emits ';' before the token of group `current` when the table allows it
+{
+    assert(current >= 0 and current < ASI_GROUP_MAX);       // `current` is used as a table index below
+    if (newline_found)                                      // set by the line-terminator and comment rules
+    {
+        newline_found = false;                              // the pending newline is consumed by this token
+        if (insert_semicolon[previous_group][current])      // rows: previous token group, columns: current
+        {
+            yyout << ';';
+            previous_group = ASI_OTHER;                     // NOTE(review): `current` is not recorded here, so a second newline boundary right after this token (e.g. "a\nb\nc") looks up the ASI_OTHER row and inserts nothing - confirm intended
+            token = PUNCTUATOR;                             // the emitted ';' becomes the last output token
+            return;
+        }
+    }
+    previous_group = current;                               // remember this token's group for the next call
+}
+
 bool JSTokenizer::unescape(const char* lexeme)
 {
     if (strstr(lexeme, "\\u"))
index ae750328a9eafeb204b723b6a70f961068c9f94f..5b059466b89dad2f7c5484d6ac23b0425879fcb3 100644 (file)
@@ -482,8 +482,8 @@ static const char all_patterns_buf5[] =
     "ab\xE2\x80\xA9ww ab\xEF\xBB\xBFww ab∞ww 2abc";
 
 static const char all_patterns_expected5[] =
-    "$2abc _2abc abc $__$ 肖晗 XÆA12 \u0041abc \u00FBdef \u1234ghi ab ww "
-    "ab ww ab ww ab ∞ ww 2 abc";
+    "$2abc _2abc abc $__$ 肖晗 XÆA12 \u0041abc \u00FBdef \u1234ghi ab;ww "
+    "ab;ww ab ww ab ∞ ww 2 abc";
 
 static const char all_patterns_buf6[] =
     "tag` template\n   ${ a   +   b }   template`";
@@ -692,7 +692,7 @@ static const char syntax_cases_buf5[] =
 
 static const char syntax_cases_expected5[] =
     "var i=1;while(i<100){i*=2;document.write(i+\", \");}i=1;do{i*=2;"
-    "document.write(i+\", \");}while(i<100)for(var i=0;i<10;i++){if(i==5){break;}"
+    "document.write(i+\", \");}while(i<100);for(var i=0;i<10;i++){if(i==5){break;}"
     "document.write(i+\", \");}for(var i=0;i<10;i++){if(i==5){continue;}"
     "document.write(i+\", \");}";
 
@@ -787,7 +787,7 @@ static const char syntax_cases_buf10[] =
     "var a = 2\n/ab -cd/";
 
 static const char syntax_cases_expected10[] =
-    "var a=2 /ab -cd/";
+    "var a=2;/ab -cd/";
 
 static const char syntax_cases_buf11[] =
     "var d_str1 = \"\\\\ \" ; var d_str2 = \"abc\\\"def\" ;"
@@ -1033,6 +1033,333 @@ TEST_CASE("template literal overflow", "[JSNormalizer]")
     }
 }
 
+static const char asi_cases_buf0[] =
+    "array[0]\n{}";
+
+static const char asi_cases_expected0[] =
+    "array[0];{}";
+
+static const char asi_cases_buf1[] =
+    "array[0]\ntrue";
+
+static const char asi_cases_expected1[] =
+    "array[0];true";
+
+static const char asi_cases_buf2[] =
+    "array[0]\n++";
+
+static const char asi_cases_expected2[] =
+    "array[0];++";
+
+static const char asi_cases_buf3[] =
+    "array[0]\ncontinue";
+
+static const char asi_cases_expected3[] =
+    "array[0];continue";
+
+static const char asi_cases_buf4[] =
+    "array[0]\nvar b;";
+
+static const char asi_cases_expected4[] =
+    "array[0];var b;";
+
+static const char asi_cases_buf5[] =
+    "func()\ntrue";
+
+static const char asi_cases_expected5[] =
+    "func();true";
+
+static const char asi_cases_buf6[] =
+    "func()\n++";
+
+static const char asi_cases_expected6[] =
+    "func();++";
+
+static const char asi_cases_buf7[] =
+    "func()\ncontinue";
+
+static const char asi_cases_expected7[] =
+    "func();continue";
+
+static const char asi_cases_buf8[] =
+    "func()\nvar b;";
+
+static const char asi_cases_expected8[] =
+    "func();var b;";
+
+static const char asi_cases_buf9[] =
+    "1024\n{}";
+
+static const char asi_cases_expected9[] =
+    "1024;{}";
+
+static const char asi_cases_buf10[] =
+    "1024\ntrue";
+
+static const char asi_cases_expected10[] =
+    "1024;true";
+
+static const char asi_cases_buf11[] =
+    "1024\n++";
+
+static const char asi_cases_expected11[] =
+    "1024;++";
+
+static const char asi_cases_buf12[] =
+    "1024\ncontinue";
+
+static const char asi_cases_expected12[] =
+    "1024;continue";
+
+static const char asi_cases_buf13[] =
+    "1024\nvar b;";
+
+static const char asi_cases_expected13[] =
+    "1024;var b;";
+
+static const char asi_cases_buf14[] =
+    "++\n{}";
+
+static const char asi_cases_expected14[] =
+    "++;{}";
+
+static const char asi_cases_buf15[] =
+    "++\n[1,2,3]";
+
+static const char asi_cases_expected15[] =
+    "++;[1,2,3]";
+
+static const char asi_cases_buf16[] =
+    "++\ntrue";
+
+static const char asi_cases_expected16[] =
+    "++;true";
+
+static const char asi_cases_buf17[] =
+    "++\n++";
+
+static const char asi_cases_expected17[] =
+    "++;++";
+
+static const char asi_cases_buf18[] =
+    "++\ncontinue";
+
+static const char asi_cases_expected18[] =
+    "++;continue";
+
+static const char asi_cases_buf19[] =
+    "++\nvar b;";
+
+static const char asi_cases_expected19[] =
+    "++;var b;";
+
+static const char asi_cases_buf20[] =
+    "return\n{}";
+
+static const char asi_cases_expected20[] =
+    "return;{}";
+
+static const char asi_cases_buf21[] =
+    "return\n[1,2,3]";
+
+static const char asi_cases_expected21[] =
+    "return;[1,2,3]";
+
+static const char asi_cases_buf22[] =
+    "return\n+a";
+
+static const char asi_cases_expected22[] =
+    "return;+a";
+
+static const char asi_cases_buf23[] =
+    "return\ntrue";
+
+static const char asi_cases_expected23[] =
+    "return;true";
+
+static const char asi_cases_buf24[] =
+    "return\n++";
+
+static const char asi_cases_expected24[] =
+    "return;++";
+
+static const char asi_cases_buf25[] =
+    "return\ncontinue";
+
+static const char asi_cases_expected25[] =
+    "return;continue";
+
+static const char asi_cases_buf26[] =
+    "return\nvar b;";
+
+static const char asi_cases_expected26[] =
+    "return;var b;";
+
+TEST_CASE("automatic semicolon insertion", "[JSNormalizer]") // one section per tested (previous group, current group) pair
+{
+    SECTION("group_4 to group_1")   // input: "array[0]\n{}"
+    {
+        NORMALIZE(asi_cases_buf0);
+        VALIDATE(asi_cases_buf0, asi_cases_expected0);
+    }
+
+    SECTION("group_4 to group_7")   // input: "array[0]\ntrue"
+    {
+        NORMALIZE(asi_cases_buf1);
+        VALIDATE(asi_cases_buf1, asi_cases_expected1);
+    }
+
+    SECTION("group_4 to group_8")   // input: "array[0]\n++"
+    {
+        NORMALIZE(asi_cases_buf2);
+        VALIDATE(asi_cases_buf2, asi_cases_expected2);
+    }
+
+    SECTION("group_4 to group_9")   // input: "array[0]\ncontinue"
+    {
+        NORMALIZE(asi_cases_buf3);
+        VALIDATE(asi_cases_buf3, asi_cases_expected3);
+    }
+
+    SECTION("group_4 to group_10")  // input: "array[0]\nvar b;"
+    {
+        NORMALIZE(asi_cases_buf4);
+        VALIDATE(asi_cases_buf4, asi_cases_expected4);
+    }
+
+    SECTION("group_5 to group_7")   // input: "func()\ntrue"
+    {
+        NORMALIZE(asi_cases_buf5);
+        VALIDATE(asi_cases_buf5, asi_cases_expected5);
+    }
+
+    SECTION("group_5 to group_8")   // input: "func()\n++"
+    {
+        NORMALIZE(asi_cases_buf6);
+        VALIDATE(asi_cases_buf6, asi_cases_expected6);
+    }
+
+    SECTION("group_5 to group_9")   // input: "func()\ncontinue"
+    {
+        NORMALIZE(asi_cases_buf7);
+        VALIDATE(asi_cases_buf7, asi_cases_expected7);
+    }
+
+    SECTION("group_5 to group_10")  // input: "func()\nvar b;"
+    {
+        NORMALIZE(asi_cases_buf8);
+        VALIDATE(asi_cases_buf8, asi_cases_expected8);
+    }
+
+    SECTION("group_7 to group_1")   // input: "1024\n{}"
+    {
+        NORMALIZE(asi_cases_buf9);
+        VALIDATE(asi_cases_buf9, asi_cases_expected9);
+    }
+
+    SECTION("group_7 to group_7")   // input: "1024\ntrue"
+    {
+        NORMALIZE(asi_cases_buf10);
+        VALIDATE(asi_cases_buf10, asi_cases_expected10);
+    }
+
+    SECTION("group_7 to group_8")   // input: "1024\n++"
+    {
+        NORMALIZE(asi_cases_buf11);
+        VALIDATE(asi_cases_buf11, asi_cases_expected11);
+    }
+
+    SECTION("group_7 to group_9")   // input: "1024\ncontinue"
+    {
+        NORMALIZE(asi_cases_buf12);
+        VALIDATE(asi_cases_buf12, asi_cases_expected12);
+    }
+
+    SECTION("group_7 to group_10")  // input: "1024\nvar b;"
+    {
+        NORMALIZE(asi_cases_buf13);
+        VALIDATE(asi_cases_buf13, asi_cases_expected13);
+    }
+
+    SECTION("group_8 to group_1")   // input: "++\n{}"
+    {
+        NORMALIZE(asi_cases_buf14);
+        VALIDATE(asi_cases_buf14, asi_cases_expected14);
+    }
+
+    SECTION("group_8 to group_3")   // input: "++\n[1,2,3]"
+    {
+        NORMALIZE(asi_cases_buf15);
+        VALIDATE(asi_cases_buf15, asi_cases_expected15);
+    }
+
+    SECTION("group_8 to group_7")   // input: "++\ntrue"
+    {
+        NORMALIZE(asi_cases_buf16);
+        VALIDATE(asi_cases_buf16, asi_cases_expected16);
+    }
+
+    SECTION("group_8 to group_8")   // input: "++\n++"
+    {
+        NORMALIZE(asi_cases_buf17);
+        VALIDATE(asi_cases_buf17, asi_cases_expected17);
+    }
+
+    SECTION("group_8 to group_9")   // input: "++\ncontinue"
+    {
+        NORMALIZE(asi_cases_buf18);
+        VALIDATE(asi_cases_buf18, asi_cases_expected18);
+    }
+
+    SECTION("group_8 to group_10")  // input: "++\nvar b;"
+    {
+        NORMALIZE(asi_cases_buf19);
+        VALIDATE(asi_cases_buf19, asi_cases_expected19);
+    }
+
+    SECTION("group_9 to group_1")   // input: "return\n{}"
+    {
+        NORMALIZE(asi_cases_buf20);
+        VALIDATE(asi_cases_buf20, asi_cases_expected20);
+    }
+
+    SECTION("group_9 to group_3")   // input: "return\n[1,2,3]"
+    {
+        NORMALIZE(asi_cases_buf21);
+        VALIDATE(asi_cases_buf21, asi_cases_expected21);
+    }
+
+    SECTION("group_9 to group_6")   // input: "return\n+a"
+    {
+        NORMALIZE(asi_cases_buf22);
+        VALIDATE(asi_cases_buf22, asi_cases_expected22);
+    }
+
+    SECTION("group_9 to group_7")   // input: "return\ntrue"
+    {
+        NORMALIZE(asi_cases_buf23);
+        VALIDATE(asi_cases_buf23, asi_cases_expected23);
+    }
+
+    SECTION("group_9 to group_8")   // input: "return\n++"
+    {
+        NORMALIZE(asi_cases_buf24);
+        VALIDATE(asi_cases_buf24, asi_cases_expected24);
+    }
+
+    SECTION("group_9 to group_9")   // input: "return\ncontinue"
+    {
+        NORMALIZE(asi_cases_buf25);
+        VALIDATE(asi_cases_buf25, asi_cases_expected25);
+    }
+
+    SECTION("group_9 to group_10")  // input: "return\nvar b;"
+    {
+        NORMALIZE(asi_cases_buf26);
+        VALIDATE(asi_cases_buf26, asi_cases_expected26);
+    }
+}
+
 TEST_CASE("endings", "[JSNormalizer]")
 {
     SECTION("script closing tag is present", "[JSNormalizer]")
@@ -1889,68 +2216,69 @@ TEST_CASE("memcap", "[JSNormalizer]")
 
 static constexpr const char* s_closing_tag = "</script>";
 
-#define MAKE_INPUT(src, src_len, start, mid, end, depth) \
-    std::string input_##src(start); \
-    input_##src.append(depth - strlen(start) - strlen(end) - strlen(s_closing_tag), mid); \
-    input_##src.append(end, strlen(end)); \
-    input_##src.append(s_closing_tag, strlen(s_closing_tag)); \
-    const char* src = input_##src.c_str(); \
-    size_t src_len = input_##src.size()
+static const std::string make_input(const char* begin, const char* mid, // builds: begin + repeated mid + end
+                             const char* end, size_t len) // len: target total length of the result
+{
+    std::string s(begin);
+    int fill = (len - strlen(begin) - strlen(end)) / strlen(mid); // NOTE(review): divides by zero if mid is empty; the unsigned subtraction wraps if begin+end exceed len; result is narrowed to int
+    for (int i = 0; i < fill; ++i)
+        s.append(mid);                                            // whole copies only, so the result may be shorter than len
+    s.append(end);
+    return s;
+}
 
 TEST_CASE("benchmarking - ::normalize() - literals", "[JSNormalizer]")
 {
     JSIdentifierCtxTest ident_ctx;
     JSNormalizer normalizer(ident_ctx, UNLIM_DEPTH, MAX_TEMPLATE_NESTNIG);
     char dst[DEPTH];
-
-    MAKE_INPUT(src_ws, src_ws_len, "", ' ', "", DEPTH);
-    MAKE_INPUT(src_bcomm, src_bcomm_len, "/*", ' ', "*/", DEPTH);
-    MAKE_INPUT(src_dqstr, src_dqstr_len, "\"", ' ', "\"", DEPTH);
-
+    auto whitespace = make_input("", " ", "", DEPTH);
+    auto block_comment = make_input("/*", " ", "*/", DEPTH);
+    auto double_quote = make_input("\"", " ", "\"", DEPTH);
     BENCHMARK("memcpy - whitespaces - 65535 bytes")
     {
-        return memcpy(dst, src_ws, src_ws_len);
+        return memcpy(dst, whitespace.c_str(), whitespace.size());
     };
     BENCHMARK("whitespaces - 65535 bytes")
     {
         normalizer.rewind_output();
-        return normalizer.normalize(src_ws, src_ws_len);
+        return normalizer.normalize(whitespace.c_str(), whitespace.size());
     };
     BENCHMARK("block comment - 65535 bytes")
     {
         normalizer.rewind_output();
-        return normalizer.normalize(src_bcomm, src_bcomm_len);
+        return normalizer.normalize(block_comment.c_str(), block_comment.size());
     };
     BENCHMARK("double quotes string - 65535 bytes")
     {
         normalizer.rewind_output();
-        return normalizer.normalize(src_dqstr, src_dqstr_len);
+        return normalizer.normalize(double_quote.c_str(), double_quote.size());
     };
 
     constexpr size_t depth_8k = 8192;
 
-    MAKE_INPUT(src_ws_8k, src_ws_len_8k, "", ' ', "", depth_8k);
-    MAKE_INPUT(src_bcomm_8k, src_bcomm_len_8k, "/*", ' ', "*/", depth_8k);
-    MAKE_INPUT(src_dqstr_8k, src_dqstr_len_8k, "\"", ' ', "\"", depth_8k);
+    auto whitespace_8k = make_input("", " ", "", depth_8k);
+    auto block_comment_8k = make_input("/*", " ", "*/", depth_8k);
+    auto double_quote_8k = make_input("\"", " ", "\"", depth_8k);
 
     BENCHMARK("memcpy - whitespaces - 8192 bytes")
     {
-        return memcpy(dst, src_ws_8k, src_ws_len_8k);
+        return memcpy(dst, whitespace_8k.c_str(), whitespace_8k.size());
     };
     BENCHMARK("whitespaces - 8192 bytes")
     {
         normalizer.rewind_output();
-        return normalizer.normalize(src_ws_8k, src_ws_len_8k);
+        return normalizer.normalize(whitespace_8k.c_str(), whitespace_8k.size());
     };
     BENCHMARK("block comment - 8192 bytes")
     {
         normalizer.rewind_output();
-        return normalizer.normalize(src_bcomm_8k, src_bcomm_len_8k);
+        return normalizer.normalize(block_comment_8k.c_str(), block_comment_8k.size());
     };
     BENCHMARK("double quotes string - 8192 bytes")
     {
         normalizer.rewind_output();
-        return normalizer.normalize(src_dqstr_8k, src_dqstr_len_8k);
+        return normalizer.normalize(double_quote_8k.c_str(), double_quote_8k.size());
     };
 }
 
@@ -1985,4 +2313,25 @@ TEST_CASE("benchmarking - ::normalize() - identifiers")
     };
 }
 
+TEST_CASE("benchmarking - ::normalize() - automatic semicolon insertion") // compares inputs with explicit ';' against inputs separated only by newlines
+{
+    auto w_semicolons = make_input("", "a;\n", s_closing_tag, DEPTH); // every "a" is already followed by ';'
+    auto wo_semicolons = make_input("", "a \n", s_closing_tag, DEPTH); // identifiers separated only by space + newline
+    const char* src_w_semicolons = w_semicolons.c_str();
+    const char* src_wo_semicolons = wo_semicolons.c_str();
+    size_t src_len = w_semicolons.size(); // reused for both inputs: "a;\n" and "a \n" have the same length
+
+    JSIdentifierCtxTest ident_ctx_mock;
+    JSNormalizer normalizer_wo_ident(ident_ctx_mock, UNLIM_DEPTH, MAX_TEMPLATE_NESTNIG);
+
+    BENCHMARK("without semicolon insertion") // NOTE(review): no rewind_output() before normalize(), unlike the literal benchmarks in this file - confirm
+    {
+        return normalizer_wo_ident.normalize(src_w_semicolons, src_len);
+    };
+
+    BENCHMARK("with semicolon insertion")
+    {
+        return normalizer_wo_ident.normalize(src_wo_semicolons, src_len);
+    };
+}
 #endif // BENCHMARK_TEST