From: Mike Stepanek (mstepane) <mstepane@cisco.com>
Date: Mon, 28 Mar 2022 10:59:05 +0000 (+0000)
Subject: Pull request #3320: JSN: String literals concatenation
X-Git-Tag: 3.1.27.0~13
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=c60b70a2d566062ed98e341399d738a0343bbb07;p=thirdparty%2Fsnort3.git

Pull request #3320: JSN: String literals concatenation

Merge in SNORT/snort3 from ~ASERBENI/snort3:string_concat to master

Squashed commit of the following:

commit 34a89bea5e85a417f37bc26aaf859727e3148456
Author: Andrii Serbeniuk <aserbeni@cisco.com>
Date:   Fri Mar 11 12:54:48 2022 +0200

    utils: add string concatenation for Enhanced JS Normalizer
---

diff --git a/doc/user/http_inspect.txt b/doc/user/http_inspect.txt
index 2d583e0cd..02bd93adf 100755
--- a/doc/user/http_inspect.txt
+++ b/doc/user/http_inspect.txt
@@ -76,7 +76,9 @@ will be removed.
 Having ips option 'js_data' in the rules automatically enables Enhanced
 Normalizer. The Enhanced Normalizer can normalize inline/external scripts.
 It supports scripts over multiple PDUs. It is a stateful JavaScript whitespace
-and identifiers normalizer. All JavaScript identifier names, except those from
+and identifiers normalizer. The Normalizer concatenates string literals
+wherever possible. This also applies to any other normalizations that result
+in string literals. All JavaScript identifier names, except those from
 the ignore list, will be substituted with unified names in the following
 format: var_0000 -> var_ffff. The Normalizer tries to expand an escaped text,
 so it will appear in a usual form in the output. Moreover, Normalizer validates
diff --git a/src/service_inspectors/http_inspect/dev_notes.txt b/src/service_inspectors/http_inspect/dev_notes.txt
index 83c5970be..eb69bbe30 100755
--- a/src/service_inspectors/http_inspect/dev_notes.txt
+++ b/src/service_inspectors/http_inspect/dev_notes.txt
@@ -223,7 +223,9 @@ During message body analysis the Enhanced Normalizer does one of the following:
 Enhanced Normalizer is a stateful JavaScript whitespace and identifiers normalizer.
 Normalizer will remove all extraneous whitespace and newlines, keeping a single space where 
 syntactically necessary. Comments will be removed, but contents of string literals will
-be kept intact. Semicolons will be inserted, if not already present, according to ECMAScript
+be kept intact. String literals joined by the plus operator
+will be concatenated. This also works for function calls that are normalized into string
+literals. Semicolons will be inserted, if not already present, according to ECMAScript
 automatic semicolon insertion rules.
 All JavaScript identifier names, except those from the ignore list,
 will be substituted with unified names in the following format: var_0000 -> var_ffff.
diff --git a/src/utils/js_tokenizer.h b/src/utils/js_tokenizer.h
index 0747af6d3..be3011100 100644
--- a/src/utils/js_tokenizer.h
+++ b/src/utils/js_tokenizer.h
@@ -184,6 +184,7 @@ private:
     JSRet do_identifier_substitution(const char* lexeme, bool id_part);
     JSRet push_identifier(const char* ident);
     bool unescape(const char* lexeme);
+    bool concatenate();
     void process_punctuator(JSToken tok = PUNCTUATOR);
     void skip_punctuator();
     void process_closing_brace();
diff --git a/src/utils/js_tokenizer.l b/src/utils/js_tokenizer.l
index 074dd45e9..727b3746b 100644
--- a/src/utils/js_tokenizer.l
+++ b/src/utils/js_tokenizer.l
@@ -1573,6 +1573,28 @@ bool JSTokenizer::unescape(const char* lexeme)
     return true;
 }
 
+bool JSTokenizer::concatenate()  // returns true when the output ended with a quote followed by '+' and was rewound past both
+{
+    std::streambuf* pbuf = yyout.rdbuf();
+    std::streamsize size = pbuf->pubseekoff(0, yyout.cur, yyout.out);  // zero-offset seek: reports the current output position
+    // at least two characters must already be written to inspect the tail
+    if (size >= 2)
+    {
+        char tail[2];
+        pbuf->pubseekoff(-2, yyout.cur, yyout.out);  // step back over the last two written characters
+        pbuf->sgetn(tail, 2);  // NOTE(review): result is unchecked; presumably the buffer reads from the position just set - confirm
+        // tail[0] must be a quote character and tail[1] the plus operator
+        if (tail[1] == '+' and (tail[0] == '\'' or tail[0] == '"'))
+        {
+            pbuf->pubseekoff(-2, yyout.cur, yyout.out);  // second rewind by two: later output overwrites the quote and '+'
+            // callers in this patch skip emitting their own opening quote when this returns true
+            return true;
+        }
+    }
+    // output does not end with a quote followed by '+'
+    return false;
+}
+
 void JSTokenizer::process_punctuator(JSToken tok)
 {
     ECHO;
@@ -2159,8 +2181,8 @@ JSTokenizer::JSRet JSTokenizer::literal_dq_string_start()
     dealias_append();
     EXEC(do_semicolon_insertion(ASI_GROUP_7))
     EXEC(do_spacing(LITERAL))
-    ECHO;
-    BEGIN(dqstr);
+    if (!concatenate())
+        ECHO;
     set_ident_norm(true);
 
     switch (func_call_type())
@@ -2186,7 +2208,8 @@ JSTokenizer::JSRet JSTokenizer::literal_sq_string_start()
     dealias_append();
     EXEC(do_semicolon_insertion(ASI_GROUP_7))
     EXEC(do_spacing(LITERAL))
-    ECHO;
+    if (!concatenate())
+        ECHO;
     set_ident_norm(true);
 
     switch (func_call_type())
@@ -2305,7 +2328,8 @@ JSTokenizer::JSRet JSTokenizer::open_parenthesis()
         token = LITERAL;
         BEGIN(char_code);
         set_char_code_str(true);
-        yyout << '\'';
+        if (!concatenate())
+            yyout << '\'';
         break;
     case FuncType::UNESCAPE:
         skip_punctuator();
diff --git a/src/utils/test/js_normalizer_test.cc b/src/utils/test/js_normalizer_test.cc
index 0c30c01e7..1e6230c5d 100644
--- a/src/utils/test/js_normalizer_test.cc
+++ b/src/utils/test/js_normalizer_test.cc
@@ -399,7 +399,7 @@ static const char clamav_buf2[] =
     "function () { var tst=\"a\"+'bc'+     'd'; }";
 
 static const char clamav_expected2[] =
-    "function(){var tst=\"a\"+'bc'+'d';}";
+    "function(){var tst=\"abcd';}";
 
 static const char clamav_buf3[] =
     "dF('bmfsu%2639%2638x11u%2638%263%3A%264C1');";
@@ -751,7 +751,7 @@ static const char syntax_cases_buf0[] =
     "var esc = 'I don\\'t \\n know';\n";
 
 static const char syntax_cases_expected0[] =
-    "var a;var b=\"init this    stuff\";var c=\"Hi\"+\" \"+\"Joe\";"
+    "var a;var b=\"init this    stuff\";var c=\"Hi Joe\";"
     "var d=1+2+\"3\";var e=[2,3,5,8];var f=false;var g=/( i'm   a  .* regex )/;"
     "var h=function(){};const PI=3.14;var a=1,b=2,c=a+b;let z='zzz zz';var g=null;"
     "var name={first:\"Jane\",last:\"Doe\"};var esc='I don\\'t \\n know';";
@@ -4643,6 +4643,209 @@ TEST_CASE("Function call tracking - over multiple PDU", "[JSNormalizer]")
     }
 }
 
+TEST_CASE("String Concatenation - Basic", "[JSNormalizer]")
+{  // test_normalization(source, expected) is defined elsewhere; presumably it normalizes source and compares the result with expected
+    SECTION("Two strings")
+    {
+        SECTION("single quoted strings")
+            test_normalization("'foo' + 'bar'", "'foobar'");
+
+        SECTION("double quoted strings")
+            test_normalization("\"foo\" + \"bar\"", "\"foobar\"");
+
+        SECTION("double quoted string + single quoted string")
+            test_normalization("\"foo\" + 'bar'", "\"foobar'");  // merged literal keeps the first opening quote and the last closing quote
+
+        SECTION("single quoted string + double quoted string")
+            test_normalization("'foo' + \"bar\"", "'foobar\"");
+
+        SECTION("string + function call")
+            test_normalization("'foo' + general('bar')", "'foo'+var_0000('bar')");  // call to an ordinary function is not merged; '+' stays in the output
+
+        SECTION("function call + string")
+            test_normalization("general('bar') + 'foo'", "var_0000('bar')+'foo'");
+
+        SECTION("inside function call arguments")
+            test_normalization("general('foo' + 'bar')", "var_0000('foobar')");
+
+        SECTION("with concatenation inside")
+            test_normalization("'\"foo\"' + '+\"bar\"')", "'\"foo\"+\"bar\"'");  // NOTE(review): input ends with an unmatched ')' that the expected output omits - confirm intended
+
+        SECTION("terminated concatenation")
+            test_normalization("'foo' + '!</script>')", "'foo!");  // expected output stops before "</script>", leaving the merged literal unterminated
+    }
+    SECTION("Three strings")
+    {
+        SECTION("single quoted strings")
+            test_normalization("'foo' + 'bar' + 'baz'", "'foobarbaz'");
+
+        SECTION("double quoted strings")
+            test_normalization("\"foo\" + \"bar\" + \"baz\"", "\"foobarbaz\"");
+
+        SECTION("single quoted string + double quoted string + double quoted string")
+            test_normalization("'foo' + \"bar\" + \"baz\"", "'foobarbaz\"");
+
+        SECTION("double quoted string + double quoted string + single quoted string")
+            test_normalization("\"foo\" + \"bar\" + 'baz'", "\"foobarbaz'");
+
+        SECTION("double quoted string + single quoted string + double quoted string")
+            test_normalization("\"foo\" + 'bar' + \"baz\"", "\"foobarbaz\"");
+
+        SECTION("function call between literals")
+            test_normalization("'foo' + general('bar') + \"baz\"", "'foo'+var_0000('bar')+\"baz\"");
+    }
+    SECTION("multiline comment before the plus symbol")
+        test_normalization("'foo' /*comment*/ + 'bar'", "'foobar'");
+
+    SECTION("single line comment before the plus symbol")
+        test_normalization("'foo' //comment\n + 'bar'", "'foobar'");
+
+    SECTION("HTML comment before the plus symbol")
+        test_normalization("'foo' <!-- HTML comment\n + 'bar'", "'foobar'");
+
+    SECTION("tab after the plus symbol")
+        test_normalization("'foo' + \t 'bar'", "'foobar'");
+
+    SECTION("comment after the plus symbol")
+        test_normalization("'foo' + /*comment*/ 'bar'", "'foobar'");
+
+    SECTION("with a non-string literal in chain")
+        test_normalization("'foo' + 'bar' + 2", "'foobar'+2");  // only the adjacent string literals are merged
+
+    SECTION("with a non-string literal between strings")
+        test_normalization("'foo' + 2 + 'bar'", "'foo'+2+'bar'");
+
+    SECTION("with a template literal")
+        test_normalization("\"foo\" + `bar`", "\"foo\"+`bar`");  // template literals are expected to stay separate
+
+    SECTION("with a template literal substitution")
+        test_normalization("\"foo\" + `bar${a + 1}`", "\"foo\"+`bar${var_0000+1}`");
+
+    SECTION("inside a template literal substitution")
+        test_normalization("`literal${\"foo\" + \"bar\"}`", "`literal${\"foobar\"}`");
+
+    SECTION("automatic semicolon insertion after concatenation")
+        test_normalization("'foo' + 'bar'\nvar a = 5;", "'foobar';var var_0000=5;");
+}
+
+TEST_CASE("String Concatenation - With unescape", "[JSNormalizer]")
+{  // covers merging of plain literals with the literals produced from unescape() and String.fromCharCode() calls
+    SECTION("unescape")
+    {
+        SECTION("single quoted string + single quoted unescape")
+            test_normalization("'foo' + unescape('%62%61%72')", "'foobar'");
+
+        SECTION("double quoted string + single quoted unescape")
+            test_normalization("\"foo\" + unescape('%62%61%72')", "\"foobar'");
+
+        SECTION("single quoted unescape + single quoted string")
+            test_normalization("unescape('%66%6f%6f') + 'bar'", "'foobar'");
+
+        SECTION("double quoted unescape + double quoted string")
+            test_normalization("unescape(\"%66%6f%6f\") + \"bar\"", "\"foobar\"");
+
+        SECTION("string + unescape + string")
+            test_normalization("'foo' + unescape('%62%61%72') + 'baz'", "'foobarbaz'");
+
+        SECTION("unescape + unescape")
+            test_normalization("unescape('%66%6f%6f') + unescape('%62%61%72')", "'foobar'");
+
+        SECTION("inside function call arguments")
+            test_normalization("unescape('foo' + '%62' + '%61' + '%72')", "'foobar'");  // the argument pieces are both decoded and merged
+    }
+    SECTION("String.fromCharCode")
+    {
+        SECTION("single quoted string + String.fromCharCode")
+            test_normalization("'foo' + String.fromCharCode(98, 97, 114)", "'foobar'");
+
+        SECTION("double quoted string + String.fromCharCode")
+            test_normalization("\"foo\" + String.fromCharCode(98, 97, 114)", "\"foobar'");  // the fromCharCode result closes with a single quote
+
+        SECTION("String.fromCharCode + single quoted string")
+            test_normalization("String.fromCharCode(102, 111, 111) + 'bar'", "'foobar'");
+
+        SECTION("String.fromCharCode + double quoted string")
+            test_normalization("String.fromCharCode(102, 111, 111) + \"bar\"", "'foobar\"");
+        SECTION("Inside function call arguments")
+            test_normalization(" String.fromCharCode('foo' + 'bar')", "'' 'foobar'");  // string arguments: expected output is an empty literal, a space, then the merged argument
+    }
+}
+
+TEST_CASE("String Concatenation - Multiple PDU", "[JSNormalizer]")
+{  // each pair is {input chunk, expected output}; the expected strings grow chunk by chunk, so presumably they are the accumulated output
+    SECTION("Two single quoted strings")
+    {
+        test_normalization({
+            {"'",   "'"         },
+            {"foo", "'foo"      },
+            {"'",   "'foo'"     },
+            {" +",  "'foo'+"    },  // quote and '+' are still present at this point
+            {" '",  "'foo"      },  // start of the next literal removes the closing quote and '+'
+            {"bar", "'foobar"   },
+            {"'",   "'foobar'"  }
+        });
+    }
+    SECTION("Three double quoted strings")
+    {
+        test_normalization({
+            {"\"foo",       "\"foo"         },
+            {"\" + \"",     "\"foo"         },  // closing quote, '+' and opening quote in one chunk leave the output unchanged
+            {"bar\"",       "\"foobar\""    },
+            {"+ \"baz\"",   "\"foobarbaz\"" }
+
+        });
+    }
+    SECTION("single quoted string + double quoted string")
+    {
+        test_normalization({
+            {"'foo",    "'foo"      },
+            {"'",       "'foo'"     },
+            {" + \"",   "\'foo"     },
+            {"bar",     "'foobar"   },
+            {"\"",      "'foobar\"" }
+        });
+    }
+    SECTION("With a non-string literal between strings")
+    {
+        test_normalization({
+            {"\"fo",    "\"fo"                  },
+            {"o\"",     "\"foo\""               },
+            {" + i",    "\"foo\"+var_0000"      },
+            {"d + ",    "\"foo\"+var_0001+"     },  // identifier continued across chunks: expected substitute changes from var_0000 to var_0001
+            {"'ba",     "\"foo\"+var_0001+'ba"  },
+            {"r'",      "\"foo\"+var_0001+'bar'"}
+        });
+    }
+    SECTION("With unescape")
+    {
+        test_normalization({
+            {"'fo",         "'fo"               },
+            {"o'",          "'foo'"             },
+            {" + ",         "'foo'+"            },
+            {"unescape",    "'foo'+unescape"    },
+            {"(",           "'foo'+"            },  // the opening parenthesis removes the function name from the expected output
+            {"'%62%61%72",  "'foobar"           },
+            {"'+",          "'foobar'+"         },
+            {"'baz",        "'foobarbaz"        },
+            {"'",           "'foobarbaz'"       }
+        });
+    }
+    SECTION("With String.fromCharCode")
+    {
+        test_normalization({
+            {"'foo",            "'foo"                      },
+            {"' + ",            "'foo'+"                    },
+            {"String",          "'foo'+String"              },
+            {".fromCharCode",   "'foo'+String.fromCharCode" },
+            {"(",               "'foo"                      },  // the opening parenthesis removes the name, the '+' and the closing quote
+            {"98,97,114",       "'foobar"                   },
+            {")+",              "'foobar'+"                 },
+            {"'",               "'foobar"                   },
+            {"baz'",            "'foobarbaz'"               }
+        });
+    }
+}
+
 #endif // CATCH_TEST_BUILD
 
 // Benchmark tests
diff --git a/src/utils/test/js_unescape_test.cc b/src/utils/test/js_unescape_test.cc
index 6736935be..3c8d29ccf 100644
--- a/src/utils/test/js_unescape_test.cc
+++ b/src/utils/test/js_unescape_test.cc
@@ -1083,11 +1083,11 @@ TEST_CASE("Mixed input", "[JSNormalizer]")
         );
         test_normalization(
             "unescape ( '\\x62\\x61\\x72' + '\\x62\\x61\\x72' ) ;",
-            "'bar'+'bar';"
+            "'barbar';"
         );
         test_normalization_mixed_encoding(
             "unescape ( '\\x62\\x61\\x72' + '\\u62\\u61\\u72' ) ;",
-            "'bar'+'bar';"
+            "'barbar';"
         );
     }
 
@@ -1131,7 +1131,7 @@ TEST_CASE("Mixed input", "[JSNormalizer]")
     {
         test_normalization(
             "unescape('\\x62\\x61\\x72'+unescape('\\x62\\x61\\x72')+decodeURI('\\u62\\u61\\u72')) ;",
-            "'bar'+'bar'+'bar';"
+            "'barbarbar';"
         );
         test_normalization(
             "document.write(unescape('%62%61%72')) ;",