Pull request #3320: JSN: String literals concatenation

author Mike Stepanek (mstepane) <mstepane@cisco.com>

Mon, 28 Mar 2022 10:59:05 +0000 (10:59 +0000)

committer Mike Stepanek (mstepane) <mstepane@cisco.com>

Mon, 28 Mar 2022 10:59:05 +0000 (10:59 +0000)
author Mike Stepanek (mstepane) <mstepane@cisco.com>
Mon, 28 Mar 2022 10:59:05 +0000 (10:59 +0000)
committer Mike Stepanek (mstepane) <mstepane@cisco.com>
Mon, 28 Mar 2022 10:59:05 +0000 (10:59 +0000)
diff --git a/doc/user/http_inspect.txt b/doc/user/http_inspect.txt

index 2d583e0cdd68fc79078e562f691722f0347878c8..02bd93adfb5fa328ed779562bca90a825c1671d2 100755 (executable)
--- a/doc/user/http_inspect.txt
+++ b/doc/user/http_inspect.txt
@@ -76,7 +76,9 @@ will be removed.
  Having ips option 'js_data' in the rules automatically enables Enhanced
  Normalizer. The Enhanced Normalizer can normalize inline/external scripts.
  It supports scripts over multiple PDUs. It is a stateful JavaScript whitespace
-and identifiers normalizer. All JavaScript identifier names, except those from
+and identifiers normalizer. The Normalizer concatenates string literals whenever
+possible. This also works with any other normalizations that result
+in string literals. All JavaScript identifier names, except those from
  the ignore list, will be substituted with unified names in the following
  format: var_0000 -> var_ffff. The Normalizer tries to expand an escaped text,
  so it will appear in a usual form in the output. Moreover, Normalizer validates
diff --git a/src/service_inspectors/http_inspect/dev_notes.txt b/src/service_inspectors/http_inspect/dev_notes.txt

index 83c5970be967c5721391a23831a0f7bb06ad71dc..eb69bbe30d5ad75964235cc1e2d428e5d780b119 100755 (executable)
--- a/src/service_inspectors/http_inspect/dev_notes.txt
+++ b/src/service_inspectors/http_inspect/dev_notes.txt
@@ -223,7 +223,9 @@ During message body analysis the Enhanced Normalizer does one of the following:
  Enhanced Normalizer is a stateful JavaScript whitespace and identifiers normalizer.
  Normalizer will remove all extraneous whitespace and newlines, keeping a single space where 
  syntactically necessary. Comments will be removed, but contents of string literals will
-be kept intact. Semicolons will be inserted, if not already present, according to ECMAScript
+be kept intact. String literals joined by the plus operator
+will be concatenated. This also works for function calls that result in string
+literals. Semicolons will be inserted, if not already present, according to ECMAScript
  automatic semicolon insertion rules.
  All JavaScript identifier names, except those from the ignore list,
  will be substituted with unified names in the following format: var_0000 -> var_ffff.
diff --git a/src/utils/js_tokenizer.h b/src/utils/js_tokenizer.h

index 0747af6d3258c3e6817486729814c09ca9708ba9..be3011100806b1440cd7dd7ebfff5c4ad266ab07 100644 (file)
--- a/src/utils/js_tokenizer.h
+++ b/src/utils/js_tokenizer.h
@@ -184,6 +184,7 @@ private:
      JSRet do_identifier_substitution(const char* lexeme, bool id_part);
      JSRet push_identifier(const char* ident);
      bool unescape(const char* lexeme);
+    bool concatenate();
      void process_punctuator(JSToken tok = PUNCTUATOR);
      void skip_punctuator();
      void process_closing_brace();
diff --git a/src/utils/js_tokenizer.l b/src/utils/js_tokenizer.l

index 074dd45e98f35dd7657b41575020c61393f1a4eb..727b3746bcbe003e30e014bfc51c0d52158d2fa7 100644 (file)
--- a/src/utils/js_tokenizer.l
+++ b/src/utils/js_tokenizer.l
@@ -1573,6 +1573,28 @@ bool JSTokenizer::unescape(const char* lexeme)
      return true;
  }
  
+bool JSTokenizer::concatenate() // Returns true when the output written so far ends with <quote>+ and the output position has been moved back over those two characters.
+{
+    std::streambuf* pbuf = yyout.rdbuf();
+    std::streamsize size = pbuf->pubseekoff(0, yyout.cur, yyout.out); // zero-offset seek: only queries the current output position
+
+    if (size >= 2)
+    {
+        char tail[2];
+        pbuf->pubseekoff(-2, yyout.cur, yyout.out); // step back over the last two output characters
+        pbuf->sgetn(tail, 2); // NOTE(review): result is unchecked; assumes the buffer can read back what was written and that this read re-advances the position - confirm
+
+        if (tail[1] == '+' and (tail[0] == '\'' or tail[0] == '"')) // a closing quote directly followed by '+'
+        {
+            pbuf->pubseekoff(-2, yyout.cur, yyout.out); // step back again; presumably so the following output overwrites the quote and the '+'
+
+            return true;
+        }
+    }
+
+    return false; // no merge; the visible callers then emit the opening quote themselves
+}
+
  void JSTokenizer::process_punctuator(JSToken tok)
  {
      ECHO;
@@ -2159,8 +2181,8 @@ JSTokenizer::JSRet JSTokenizer::literal_dq_string_start()
      dealias_append();
      EXEC(do_semicolon_insertion(ASI_GROUP_7))
      EXEC(do_spacing(LITERAL))
-    ECHO;
-    BEGIN(dqstr);
+    if (!concatenate())
+        ECHO;
      set_ident_norm(true);
  
      switch (func_call_type())
@@ -2186,7 +2208,8 @@ JSTokenizer::JSRet JSTokenizer::literal_sq_string_start()
      dealias_append();
      EXEC(do_semicolon_insertion(ASI_GROUP_7))
      EXEC(do_spacing(LITERAL))
-    ECHO;
+    if (!concatenate())
+        ECHO;
      set_ident_norm(true);
  
      switch (func_call_type())
@@ -2305,7 +2328,8 @@ JSTokenizer::JSRet JSTokenizer::open_parenthesis()
          token = LITERAL;
          BEGIN(char_code);
          set_char_code_str(true);
-        yyout << '\'';
+        if (!concatenate())
+            yyout << '\'';
          break;
      case FuncType::UNESCAPE:
          skip_punctuator();
diff --git a/src/utils/test/js_normalizer_test.cc b/src/utils/test/js_normalizer_test.cc

index 0c30c01e71f478ab2c66b881edacb2a90410a569..1e6230c5df9f5f99f73e2098fa7d0430008f3588 100644 (file)
--- a/src/utils/test/js_normalizer_test.cc
+++ b/src/utils/test/js_normalizer_test.cc
@@ -399,7 +399,7 @@ static const char clamav_buf2[] =
      "function () { var tst=\"a\"+'bc'+     'd'; }";
  
  static const char clamav_expected2[] =
-    "function(){var tst=\"a\"+'bc'+'d';}";
+    "function(){var tst=\"abcd';}";
  
  static const char clamav_buf3[] =
      "dF('bmfsu%2639%2638x11u%2638%263%3A%264C1');";
@@ -751,7 +751,7 @@ static const char syntax_cases_buf0[] =
      "var esc = 'I don\\'t \\n know';\n";
  
  static const char syntax_cases_expected0[] =
-    "var a;var b=\"init this    stuff\";var c=\"Hi\"+\" \"+\"Joe\";"
+    "var a;var b=\"init this    stuff\";var c=\"Hi Joe\";"
      "var d=1+2+\"3\";var e=[2,3,5,8];var f=false;var g=/( i'm   a  .* regex )/;"
      "var h=function(){};const PI=3.14;var a=1,b=2,c=a+b;let z='zzz zz';var g=null;"
      "var name={first:\"Jane\",last:\"Doe\"};var esc='I don\\'t \\n know';";
@@ -4643,6 +4643,209 @@ TEST_CASE("Function call tracking - over multiple PDU", "[JSNormalizer]")
      }
  }
  
+TEST_CASE("String Concatenation - Basic", "[JSNormalizer]") // Literals joined by '+' are merged; the expectations keep the first literal's opening quote and the last literal's closing quote.
+{
+    SECTION("Two strings")
+    {
+        SECTION("single quoted strings")
+            test_normalization("'foo' + 'bar'", "'foobar'");
+
+        SECTION("double quoted strings")
+            test_normalization("\"foo\" + \"bar\"", "\"foobar\"");
+
+        SECTION("double quoted string + single quoted string") // quote styles are not unified in the expected output
+            test_normalization("\"foo\" + 'bar'", "\"foobar'");
+
+        SECTION("single quoted string + double quoted string")
+            test_normalization("'foo' + \"bar\"", "'foobar\"");
+
+        SECTION("string + function call") // an ordinary function call is expected to stay separate from the literal
+            test_normalization("'foo' + general('bar')", "'foo'+var_0000('bar')");
+
+        SECTION("function call + string")
+            test_normalization("general('bar') + 'foo'", "var_0000('bar')+'foo'");
+
+        SECTION("inside function call arguments")
+            test_normalization("general('foo' + 'bar')", "var_0000('foobar')");
+
+        SECTION("with concatenation inside") // NOTE(review): the input ends with an unmatched ')' that the expected output omits - confirm this is intended
+            test_normalization("'\"foo\"' + '+\"bar\"')", "'\"foo\"+\"bar\"'");
+
+        SECTION("terminated concatenation") // the expected output stops where "</script>" begins inside the second literal
+            test_normalization("'foo' + '!</script>')", "'foo!");
+    }
+    SECTION("Three strings")
+    {
+        SECTION("single quoted strings")
+            test_normalization("'foo' + 'bar' + 'baz'", "'foobarbaz'");
+
+        SECTION("double quoted strings")
+            test_normalization("\"foo\" + \"bar\" + \"baz\"", "\"foobarbaz\"");
+
+        SECTION("single quoted string + double quoted string + double quoted string")
+            test_normalization("'foo' + \"bar\" + \"baz\"", "'foobarbaz\"");
+
+        SECTION("double quoted string + double quoted string + single quoted string")
+            test_normalization("\"foo\" + \"bar\" + 'baz'", "\"foobarbaz'");
+
+        SECTION("double quoted string + single quoted string + double quoted string")
+            test_normalization("\"foo\" + 'bar' + \"baz\"", "\"foobarbaz\"");
+
+        SECTION("function call between literals")
+            test_normalization("'foo' + general('bar') + \"baz\"", "'foo'+var_0000('bar')+\"baz\"");
+    }
+    SECTION("multiline comment before the plus symbol") // comments and whitespace around '+' are expected not to prevent the merge
+        test_normalization("'foo' /*comment*/ + 'bar'", "'foobar'");
+
+    SECTION("single line comment before the plus symbol")
+        test_normalization("'foo' //comment\n + 'bar'", "'foobar'");
+
+    SECTION("HTML comment before the plus symbol")
+        test_normalization("'foo' <!-- HTML comment\n + 'bar'", "'foobar'");
+
+    SECTION("tab after the plus symbol")
+        test_normalization("'foo' + \t 'bar'", "'foobar'");
+
+    SECTION("comment after the plus symbol")
+        test_normalization("'foo' + /*comment*/ 'bar'", "'foobar'");
+
+    SECTION("with a non-string literal in chain") // only the string operands are merged; the numeric operand keeps its '+'
+        test_normalization("'foo' + 'bar' + 2", "'foobar'+2");
+
+    SECTION("with a non-string literal between strings")
+        test_normalization("'foo' + 2 + 'bar'", "'foo'+2+'bar'");
+
+    SECTION("with a template literal") // template literals are expected to stay separate from quoted literals
+        test_normalization("\"foo\" + `bar`", "\"foo\"+`bar`");
+
+    SECTION("with a template literal substitution")
+        test_normalization("\"foo\" + `bar${a + 1}`", "\"foo\"+`bar${var_0000+1}`");
+
+    SECTION("inside a template literal substitution")
+        test_normalization("`literal${\"foo\" + \"bar\"}`", "`literal${\"foobar\"}`");
+
+    SECTION("automatic semicolon insertion after concatenation")
+        test_normalization("'foo' + 'bar'\nvar a = 5;", "'foobar';var var_0000=5;");
+}
+
+TEST_CASE("String Concatenation - With unescape", "[JSNormalizer]") // String results of unescape() and String.fromCharCode() are expected to merge with neighbouring literals.
+{
+    SECTION("unescape")
+    {
+        SECTION("single quoted string + single quoted unescape")
+            test_normalization("'foo' + unescape('%62%61%72')", "'foobar'");
+
+        SECTION("double quoted string + single quoted unescape")
+            test_normalization("\"foo\" + unescape('%62%61%72')", "\"foobar'");
+
+        SECTION("single quoted unescape + single quoted string")
+            test_normalization("unescape('%66%6f%6f') + 'bar'", "'foobar'");
+
+        SECTION("double quoted unescape + double quoted string")
+            test_normalization("unescape(\"%66%6f%6f\") + \"bar\"", "\"foobar\"");
+
+        SECTION("string + unescape + string")
+            test_normalization("'foo' + unescape('%62%61%72') + 'baz'", "'foobarbaz'");
+
+        SECTION("unescape + unescape")
+            test_normalization("unescape('%66%6f%6f') + unescape('%62%61%72')", "'foobar'");
+
+        SECTION("inside function call arguments") // literals are merged and percent-escapes decoded within the unescape() argument
+            test_normalization("unescape('foo' + '%62' + '%61' + '%72')", "'foobar'");
+    }
+    SECTION("String.fromCharCode")
+    {
+        SECTION("single quoted string + String.fromCharCode") // the expectations show String.fromCharCode output as single-quoted
+            test_normalization("'foo' + String.fromCharCode(98, 97, 114)", "'foobar'");
+
+        SECTION("double quoted string + String.fromCharCode")
+            test_normalization("\"foo\" + String.fromCharCode(98, 97, 114)", "\"foobar'");
+
+        SECTION("String.fromCharCode + single quoted string")
+            test_normalization("String.fromCharCode(102, 111, 111) + 'bar'", "'foobar'");
+
+        SECTION("String.fromCharCode + double quoted string")
+            test_normalization("String.fromCharCode(102, 111, 111) + \"bar\"", "'foobar\"");
+        SECTION("Inside function call arguments") // NOTE(review): the expected "'' 'foobar'" is not obviously derivable from the input - confirm against the tokenizer
+            test_normalization(" String.fromCharCode('foo' + 'bar')", "'' 'foobar'");
+    }
+}
+
+TEST_CASE("String Concatenation - Multiple PDU", "[JSNormalizer]") // Each pair is presumably {PDU chunk, expected output so far} - confirm against the test_normalization helper.
+{
+    SECTION("Two single quoted strings")
+    {
+        test_normalization({
+            {"'",   "'"         },
+            {"foo", "'foo"      },
+            {"'",   "'foo'"     },
+            {" +",  "'foo'+"    },
+            {" '",  "'foo"      }, // the "'+" expected after the previous PDU is gone once the next literal opens
+            {"bar", "'foobar"   },
+            {"'",   "'foobar'"  }
+        });
+    }
+    SECTION("Three double quoted strings")
+    {
+        test_normalization({
+            {"\"foo",       "\"foo"         },
+            {"\" + \"",     "\"foo"         }, // closing quote, '+' and opening quote in one PDU leave the expected output unchanged
+            {"bar\"",       "\"foobar\""    },
+            {"+ \"baz\"",   "\"foobarbaz\"" }
+
+        });
+    }
+    SECTION("single quoted string + double quoted string")
+    {
+        test_normalization({
+            {"'foo",    "'foo"      },
+            {"'",       "'foo'"     },
+            {" + \"",   "\'foo"     },
+            {"bar",     "'foobar"   },
+            {"\"",      "'foobar\"" }
+        });
+    }
+    SECTION("With a non-string literal between strings")
+    {
+        test_normalization({
+            {"\"fo",    "\"fo"                  },
+            {"o\"",     "\"foo\""               },
+            {" + i",    "\"foo\"+var_0000"      },
+            {"d + ",    "\"foo\"+var_0001+"     }, // "i" continues as "id" in this PDU; the expected substituted name changes to var_0001
+            {"'ba",     "\"foo\"+var_0001+'ba"  },
+            {"r'",      "\"foo\"+var_0001+'bar'"}
+        });
+    }
+    SECTION("With unescape")
+    {
+        test_normalization({
+            {"'fo",         "'fo"               },
+            {"o'",          "'foo'"             },
+            {" + ",         "'foo'+"            },
+            {"unescape",    "'foo'+unescape"    },
+            {"(",           "'foo'+"            }, // "unescape" is expected to disappear from the output once its '(' arrives
+            {"'%62%61%72",  "'foobar"           },
+            {"'+",          "'foobar'+"         },
+            {"'baz",        "'foobarbaz"        },
+            {"'",           "'foobarbaz'"       }
+        });
+    }
+    SECTION("With String.fromCharCode")
+    {
+        test_normalization({
+            {"'foo",            "'foo"                      },
+            {"' + ",            "'foo'+"                    },
+            {"String",          "'foo'+String"              },
+            {".fromCharCode",   "'foo'+String.fromCharCode" },
+            {"(",               "'foo"                      }, // "'+String.fromCharCode" is expected to be replaced by the still-open literal
+            {"98,97,114",       "'foobar"                   },
+            {")+",              "'foobar'+"                 },
+            {"'",               "'foobar"                   },
+            {"baz'",            "'foobarbaz'"               }
+        });
+    }
+}
+
  #endif // CATCH_TEST_BUILD
  
  // Benchmark tests
diff --git a/src/utils/test/js_unescape_test.cc b/src/utils/test/js_unescape_test.cc

index 6736935befb4c68d97d684b7ed0e1a0a36cab65d..3c8d29ccf957a681fb23d04c0192860f41e738e1 100644 (file)
--- a/src/utils/test/js_unescape_test.cc
+++ b/src/utils/test/js_unescape_test.cc
@@ -1083,11 +1083,11 @@ TEST_CASE("Mixed input", "[JSNormalizer]")
          );
          test_normalization(
              "unescape ( '\\x62\\x61\\x72' + '\\x62\\x61\\x72' ) ;",
-            "'bar'+'bar';"
+            "'barbar';"
          );
          test_normalization_mixed_encoding(
              "unescape ( '\\x62\\x61\\x72' + '\\u62\\u61\\u72' ) ;",
-            "'bar'+'bar';"
+            "'barbar';"
          );
      }
  
@@ -1131,7 +1131,7 @@ TEST_CASE("Mixed input", "[JSNormalizer]")
      {
          test_normalization(
              "unescape('\\x62\\x61\\x72'+unescape('\\x62\\x61\\x72')+decodeURI('\\u62\\u61\\u72')) ;",
-            "'bar'+'bar'+'bar';"
+            "'barbarbar';"
          );
          test_normalization(
              "document.write(unescape('%62%61%72')) ;",
author	Mike Stepanek (mstepane) <mstepane@cisco.com>
	Mon, 28 Mar 2022 10:59:05 +0000 (10:59 +0000)
committer	Mike Stepanek (mstepane) <mstepane@cisco.com>
	Mon, 28 Mar 2022 10:59:05 +0000 (10:59 +0000)
doc/user/http_inspect.txt		patch \| blob \| blame \| history
src/service_inspectors/http_inspect/dev_notes.txt		patch \| blob \| blame \| history
src/utils/js_tokenizer.h		patch \| blob \| blame \| history
src/utils/js_tokenizer.l		patch \| blob \| blame \| history
src/utils/test/js_normalizer_test.cc		patch \| blob \| blame \| history
src/utils/test/js_unescape_test.cc		patch \| blob \| blame \| history