From: Mike Stepanek (mstepane) Date: Wed, 30 Mar 2022 16:03:01 +0000 (+0000) Subject: Pull request #3326: JSN: decode String.fromCodePoint() JavaScript function X-Git-Tag: 3.1.27.0~9 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=93e521a6bfe382c160d0fc73c52fe27fbb803e2b;p=thirdparty%2Fsnort3.git Pull request #3326: JSN: decode String.fromCodePoint() JavaScript function Merge in SNORT/snort3 from ~OSERHIIE/snort3:js_from_code_point to master Squashed commit of the following: commit a4e3c6cad84181fb907ccafec6e4941e4611a927 Author: Oleksandr Serhiienko Date: Mon Mar 28 13:34:04 2022 +0300 http_inspect: decode String.fromCodePoint() JavaScript function * utils: add support for supplementary characters in JS Normalizer * utils: add tracking and decoding of String.fromCodePoint() JavaScript function in JS Normalizer * utils: add unit test coverage * http_inspect: update dev notes * doc: update user manual --- diff --git a/doc/user/http_inspect.txt b/doc/user/http_inspect.txt index 02bd93adf..cf7dcaf2e 100755 --- a/doc/user/http_inspect.txt +++ b/doc/user/http_inspect.txt @@ -80,7 +80,8 @@ and identifiers normalizer. Normalizer concatenates string literals whenever it's possible to do. This also works with any other normalizations that result in string literals. All JavaScript identifier names, except those from the ignore list, will be substituted with unified names in the following -format: var_0000 -> var_ffff. The Normalizer tries to expand an escaped text, +format: var_0000 -> var_ffff. Names of unescape-like functions, however, are removed +from the normalized data. The Normalizer tries to expand an escaped text, so it will appear in a usual form in the output. Moreover, Normalizer validates the syntax concerning ECMA-262 Standard, including scope tracking and restrictions for script elements. 
For more information on how additionally configure diff --git a/src/service_inspectors/http_inspect/dev_notes.txt b/src/service_inspectors/http_inspect/dev_notes.txt index eb69bbe30..997d4afb5 100755 --- a/src/service_inspectors/http_inspect/dev_notes.txt +++ b/src/service_inspectors/http_inspect/dev_notes.txt @@ -258,19 +258,20 @@ For example: a("hello") // will be substituted to 'console.log("hello")' In addition to the scope tracking, JS Normalizer specifically tracks unicode unescape -functions(unescape, decodeURI, decodeURIComponent, String.fromCharCode). This allows detection of -unescape functions nested within other unescape functions, which is a potential -indicator of a multilevel obfuscation. The definition of a function call depends on +functions (unescape, decodeURI, decodeURIComponent, String.fromCharCode, String.fromCodePoint). +This allows detection of unescape functions nested within other unescape functions, which is +a potential indicator of multilevel obfuscation. The definition of a function call depends on identifier substitution, so such identifiers must be included in the ignore list in order to use this feature. After determining the unescape sequence, it is decoded into the -corresponding string. +corresponding string, and the name of the unescape function will not be present in the output. 
For example: - unescape('\u0062\u0061\u0072') -> 'bar' - decodeURI('%62%61%72') -> 'bar' - decodeURIComponent('\x62\x61\x72') -> 'bar' - String.fromCharCode(98, 0x0061, 0x72) -> 'bar' + unescape('\u0062\u0061\u0072') -> 'bar' + decodeURI('%62%61%72') -> 'bar' + decodeURIComponent('\x62\x61\x72') -> 'bar' + String.fromCharCode(98, 0x0061, 0x72) -> 'bar' + String.fromCodePoint(65600, 65601, 0x10042) -> '𐁀𐁁𐁂' Supported formats follow diff --git a/src/utils/js_tokenizer.h b/src/utils/js_tokenizer.h index e7b84d7ed..21a1fd725 100644 --- a/src/utils/js_tokenizer.h +++ b/src/utils/js_tokenizer.h @@ -364,12 +364,13 @@ private: FuncType type; }; - const std::array function_identifiers + const std::array function_identifiers {{ - {"unescape", FuncType::UNESCAPE }, - {"decodeURI", FuncType::UNESCAPE }, - {"decodeURIComponent", FuncType::UNESCAPE }, - {"String.fromCharCode", FuncType::CHAR_CODE } + {"unescape", FuncType::UNESCAPE }, + {"decodeURI", FuncType::UNESCAPE }, + {"decodeURIComponent", FuncType::UNESCAPE }, + {"String.fromCharCode", FuncType::CHAR_CODE}, + {"String.fromCodePoint", FuncType::CHAR_CODE} }}; const uint32_t max_bracket_depth; diff --git a/src/utils/js_tokenizer.l b/src/utils/js_tokenizer.l index da6c8bf15..263a91786 100644 --- a/src/utils/js_tokenizer.l +++ b/src/utils/js_tokenizer.l @@ -1237,8 +1237,15 @@ static std::string unicode_to_utf8(const unsigned int code) res += 0x80 | ((code >> 6) & 0x3f); res += 0x80 | (code & 0x3f); } + else if (code <= 0x1fffff) + { + res += 0xf0 | (code >> 18); + res += 0x80 | ((code >> 12) & 0x3f); + res += 0x80 | ((code >> 6) & 0x3f); + res += 0x80 | (code & 0x3f); + } else - res += "\uffff"; + res += "\xf7\xbf\xbf\xbf"; // UTF-8 sequence for hex 0x1fffff return res; } diff --git a/src/utils/test/js_unescape_test.cc b/src/utils/test/js_unescape_test.cc index 3c8d29ccf..ee58b1137 100644 --- a/src/utils/test/js_unescape_test.cc +++ b/src/utils/test/js_unescape_test.cc @@ -276,7 +276,11 @@ TEST_CASE("Sequence parsing", 
"[JSNormalizer]") ); test_normalization( "String.fromCharCode(65536)", - "'\uffff'" + "'\xf0\x90\x80\x80'" + ); + test_normalization( + "String.fromCodePoint(2097152)", + "'\xf7\xbf\xbf\xbf'" ); } @@ -315,12 +319,20 @@ TEST_CASE("Sequence parsing", "[JSNormalizer]") "'\ueEfF'" ); test_normalization( - "String.fromCharCode(0x10000)", - "'\uffff'" + "String.fromCodePoint(0x10000)", + "'\xf0\x90\x80\x80'" ); test_normalization( "String.fromCharCode(0X10000)", - "'\uffff'" + "'\xf0\x90\x80\x80'" + ); + test_normalization( + "String.fromCodePoint(0x200000)", + "'\xf7\xbf\xbf\xbf'" + ); + test_normalization( + "String.fromCodePoint(0X200000)", + "'\xf7\xbf\xbf\xbf'" ); } } @@ -825,6 +837,63 @@ TEST_CASE("String.fromCharCode()", "[JSNormalizer]") } } +TEST_CASE("String.fromCodePoint()", "[JSNormalizer]") +{ + SECTION("decimal") + { + test_normalization( + "String.fromCodePoint(98, 97, 114)", + "'bar'" + ); + + test_normalization( + "String.fromCodePoint(65600, 65601, 65602)", + "'\xf0\x90\x81\x80\xf0\x90\x81\x81\xf0\x90\x81\x82'" + ); + } + + SECTION("hexadecimal") + { + test_normalization( + "String.fromCodePoint(0x62, 0x61, 0x72)", + "'bar'" + ); + + test_normalization( + "String.fromCodePoint(0x00000062, 0x00000061, 0x00000072)", + "'bar'" + ); + + test_normalization( + "String.fromCodePoint(0x10040, 0x10041, 0x10042)", + "'\xf0\x90\x81\x80\xf0\x90\x81\x81\xf0\x90\x81\x82'" + ); + } + + SECTION("mixed sequence") + { + test_normalization_mixed_encoding( + "String.fromCodePoint(98, 97, 0x72)", + "'bar'" + ); + + test_normalization_mixed_encoding( + "String.fromCodePoint(0x00000062, 97, 114)", + "'bar'" + ); + + test_normalization_mixed_encoding( + "String.fromCodePoint(65600, 0x10041, 65602)", + "'\xf0\x90\x81\x80\xf0\x90\x81\x81\xf0\x90\x81\x82'" + ); + + test_normalization_mixed_encoding( + "String.fromCodePoint(0x10040, 65601, 0x10042)", + "'\xf0\x90\x81\x80\xf0\x90\x81\x81\xf0\x90\x81\x82'" + ); + } +} + TEST_CASE("Split", "[JSNormalizer]") { SECTION("unescape()") @@ 
-1063,6 +1132,61 @@ TEST_CASE("Split", "[JSNormalizer]") { "114)", "'bar'" } }); } + + SECTION("String.fromCodePoint()") + { + test_normalization({ + { "String.fromCodePoint(", "'" }, + { ")", "''" } + }); + + test_normalization({ + { "String.fromCodePoint(9", "'\u0009" }, + { "8, 97, 114)", "'bar'" } + }); + + test_normalization({ + { "String.fromCodePoint(98,", "'b" }, + { "97, 114)", "'bar'" } + }); + + test_normalization({ + { "String.fromCodePoint(98, 97", "'ba" }, + { ",114)", "'bar'" } + }); + + test_normalization({ + { "String.fromCodePoint(98, 97, 114", "'bar" }, + { ")", "'bar'" } + }); + + test_normalization({ + { "String.fromCodePoint(0x0062", "'b" }, + { ",0x0061, 0x0072)", "'bar'" } + }); + + test_normalization({ + { "String.fromCodePoint(0x00000062, 0x00000061", "'ba" }, + { ", 0x0072)", "'bar'" } + }); + + test_normalization({ + { "String.fromCodePoint(0x00000062, 0x00000061, 0x00000072", "'bar" }, + { ")", "'bar'" } + }); + + test_normalization({ + { "String.fromCodePoint(0x00000062,", "'b" }, + { "0x00000061,", "'ba" }, + { "0x72)", "'bar'" } + }); + + test_normalization({ + { "String.fromCodePoint(98,", "'b" }, + { "97,", "'ba" }, + { "114)", "'bar'" } + }); + } } TEST_CASE("Mixed input", "[JSNormalizer]") @@ -1109,6 +1233,10 @@ TEST_CASE("Mixed input", "[JSNormalizer]") "String.fromCharCode (114, 0x72, eval('123'), 114, 0x72) ;", "'rr' eval('123'),114,0x72;" ); + test_normalization_mixed_encoding( + "String.fromCodePoint (114, 0x00000072, eval('123'), 114, 0x00000072) ;", + "'rr' eval('123'),114,0x00000072;" + ); } SECTION("comment") @@ -1125,6 +1253,18 @@ TEST_CASE("Mixed input", "[JSNormalizer]") "String.fromCharCode(0x62, \r 0x61,