From: Mike Stepanek (mstepane) Date: Mon, 28 Mar 2022 10:59:05 +0000 (+0000) Subject: Pull request #3320: JSN: String literals concatenation X-Git-Tag: 3.1.27.0~13 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=c60b70a2d566062ed98e341399d738a0343bbb07;p=thirdparty%2Fsnort3.git Pull request #3320: JSN: String literals concatenation Merge in SNORT/snort3 from ~ASERBENI/snort3:string_concat to master Squashed commit of the following: commit 34a89bea5e85a417f37bc26aaf859727e3148456 Author: Andrii Serbeniuk Date: Fri Mar 11 12:54:48 2022 +0200 utils: add string concatenation for Enhanced JS Normalizer --- diff --git a/doc/user/http_inspect.txt b/doc/user/http_inspect.txt index 2d583e0cd..02bd93adf 100755 --- a/doc/user/http_inspect.txt +++ b/doc/user/http_inspect.txt @@ -76,7 +76,9 @@ will be removed. Having ips option 'js_data' in the rules automatically enables Enhanced Normalizer. The Enhanced Normalizer can normalize inline/external scripts. It supports scripts over multiple PDUs. It is a stateful JavaScript whitespace -and identifiers normalizer. All JavaScript identifier names, except those from +and identifiers normalizer. Normalizer concatenates string literals whenever +it's possible to do. This also works with any other normalizations that result +in string literals. All JavaScript identifier names, except those from the ignore list, will be substituted with unified names in the following format: var_0000 -> var_ffff. The Normalizer tries to expand an escaped text, so it will appear in a usual form in the output. 
Moreover, Normalizer validates diff --git a/src/service_inspectors/http_inspect/dev_notes.txt b/src/service_inspectors/http_inspect/dev_notes.txt index 83c5970be..eb69bbe30 100755 --- a/src/service_inspectors/http_inspect/dev_notes.txt +++ b/src/service_inspectors/http_inspect/dev_notes.txt @@ -223,7 +223,9 @@ During message body analysis the Enhanced Normalizer does one of the following: Enhanced Normalizer is a stateful JavaScript whitespace and identifiers normalizer. Normalizer will remove all extraneous whitespace and newlines, keeping a single space where syntactically necessary. Comments will be removed, but contents of string literals will -be kept intact. Semicolons will be inserted, if not already present, according to ECMAScript +be kept intact. Any string literals, added by the plus operator, +will be concatenated. This also works for functions that result in string +literals. Semicolons will be inserted, if not already present, according to ECMAScript automatic semicolon insertion rules. All JavaScript identifier names, except those from the ignore list, will be substituted with unified names in the following format: var_0000 -> var_ffff. 
diff --git a/src/utils/js_tokenizer.h b/src/utils/js_tokenizer.h index 0747af6d3..be3011100 100644 --- a/src/utils/js_tokenizer.h +++ b/src/utils/js_tokenizer.h @@ -184,6 +184,7 @@ private: JSRet do_identifier_substitution(const char* lexeme, bool id_part); JSRet push_identifier(const char* ident); bool unescape(const char* lexeme); + bool concatenate(); void process_punctuator(JSToken tok = PUNCTUATOR); void skip_punctuator(); void process_closing_brace(); diff --git a/src/utils/js_tokenizer.l b/src/utils/js_tokenizer.l index 074dd45e9..727b3746b 100644 --- a/src/utils/js_tokenizer.l +++ b/src/utils/js_tokenizer.l @@ -1573,6 +1573,28 @@ bool JSTokenizer::unescape(const char* lexeme) return true; } +bool JSTokenizer::concatenate() +{ + std::streambuf* pbuf = yyout.rdbuf(); + std::streamsize size = pbuf->pubseekoff(0, yyout.cur, yyout.out); + + if (size >= 2) + { + char tail[2]; + pbuf->pubseekoff(-2, yyout.cur, yyout.out); + pbuf->sgetn(tail, 2); + + if (tail[1] == '+' and (tail[0] == '\'' or tail[0] == '"')) + { + pbuf->pubseekoff(-2, yyout.cur, yyout.out); + + return true; + } + } + + return false; +} + void JSTokenizer::process_punctuator(JSToken tok) { ECHO; @@ -2159,8 +2181,8 @@ JSTokenizer::JSRet JSTokenizer::literal_dq_string_start() dealias_append(); EXEC(do_semicolon_insertion(ASI_GROUP_7)) EXEC(do_spacing(LITERAL)) - ECHO; - BEGIN(dqstr); + if (!concatenate()) + ECHO; set_ident_norm(true); switch (func_call_type()) @@ -2186,7 +2208,8 @@ JSTokenizer::JSRet JSTokenizer::literal_sq_string_start() dealias_append(); EXEC(do_semicolon_insertion(ASI_GROUP_7)) EXEC(do_spacing(LITERAL)) - ECHO; + if (!concatenate()) + ECHO; set_ident_norm(true); switch (func_call_type()) @@ -2305,7 +2328,8 @@ JSTokenizer::JSRet JSTokenizer::open_parenthesis() token = LITERAL; BEGIN(char_code); set_char_code_str(true); - yyout << '\''; + if (!concatenate()) + yyout << '\''; break; case FuncType::UNESCAPE: skip_punctuator(); diff --git a/src/utils/test/js_normalizer_test.cc 
b/src/utils/test/js_normalizer_test.cc index 0c30c01e7..1e6230c5d 100644 --- a/src/utils/test/js_normalizer_test.cc +++ b/src/utils/test/js_normalizer_test.cc @@ -399,7 +399,7 @@ static const char clamav_buf2[] = "function () { var tst=\"a\"+'bc'+ 'd'; }"; static const char clamav_expected2[] = - "function(){var tst=\"a\"+'bc'+'d';}"; + "function(){var tst=\"abcd';}"; static const char clamav_buf3[] = "dF('bmfsu%2639%2638x11u%2638%263%3A%264C1');"; @@ -751,7 +751,7 @@ static const char syntax_cases_buf0[] = "var esc = 'I don\\'t \\n know';\n"; static const char syntax_cases_expected0[] = - "var a;var b=\"init this stuff\";var c=\"Hi\"+\" \"+\"Joe\";" + "var a;var b=\"init this stuff\";var c=\"Hi Joe\";" "var d=1+2+\"3\";var e=[2,3,5,8];var f=false;var g=/( i'm a .* regex )/;" "var h=function(){};const PI=3.14;var a=1,b=2,c=a+b;let z='zzz zz';var g=null;" "var name={first:\"Jane\",last:\"Doe\"};var esc='I don\\'t \\n know';"; @@ -4643,6 +4643,209 @@ TEST_CASE("Function call tracking - over multiple PDU", "[JSNormalizer]") } } +TEST_CASE("String Concatenation - Basic", "[JSNormalizer]") +{ + SECTION("Two strings") + { + SECTION("single quoted strings") + test_normalization("'foo' + 'bar'", "'foobar'"); + + SECTION("double quoted strings") + test_normalization("\"foo\" + \"bar\"", "\"foobar\""); + + SECTION("double quoted string + single quoted string") + test_normalization("\"foo\" + 'bar'", "\"foobar'"); + + SECTION("single quoted string + double quoted string") + test_normalization("'foo' + \"bar\"", "'foobar\""); + + SECTION("string + function call") + test_normalization("'foo' + general('bar')", "'foo'+var_0000('bar')"); + + SECTION("function call + string") + test_normalization("general('bar') + 'foo'", "var_0000('bar')+'foo'"); + + SECTION("inside function call arguments") + test_normalization("general('foo' + 'bar')", "var_0000('foobar')"); + + SECTION("with concatenation inside") + test_normalization("'\"foo\"' + '+\"bar\"')", "'\"foo\"+\"bar\"'"); + + 
SECTION("terminated concatenation") + test_normalization("'foo' + '!')", "'foo!"); + } + SECTION("Three strings") + { + SECTION("single quoted strings") + test_normalization("'foo' + 'bar' + 'baz'", "'foobarbaz'"); + + SECTION("double quoted strings") + test_normalization("\"foo\" + \"bar\" + \"baz\"", "\"foobarbaz\""); + + SECTION("single quoted string + double quoted string + double quoted string") + test_normalization("'foo' + \"bar\" + \"baz\"", "'foobarbaz\""); + + SECTION("double quoted string + double quoted string + single quoted string") + test_normalization("\"foo\" + \"bar\" + 'baz'", "\"foobarbaz'"); + + SECTION("double quoted string + single quoted string + double quoted string") + test_normalization("\"foo\" + 'bar' + \"baz\"", "\"foobarbaz\""); + + SECTION("function call between literals") + test_normalization("'foo' + general('bar') + \"baz\"", "'foo'+var_0000('bar')+\"baz\""); + } + SECTION("multiline comment before the plus symbol") + test_normalization("'foo' /*comment*/ + 'bar'", "'foobar'"); + + SECTION("single line comment before the plus symbol") + test_normalization("'foo' //comment\n + 'bar'", "'foobar'"); + + SECTION("HTML comment before the plus symbol") + test_normalization("'foo'