From: Mike Stepanek (mstepane) Date: Fri, 27 May 2022 16:47:05 +0000 (+0000) Subject: Pull request #3431: http_inspect: add handling of binary and octal integers to JS... X-Git-Tag: 3.1.31.0~8 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=31f1ed7536b193b4d848fccbeaa2a6d3739b9717;p=thirdparty%2Fsnort3.git Pull request #3431: http_inspect: add handling of binary and octal integers to JS Normalizer Merge in SNORT/snort3 from ~VHORBATO/snort3:js_int_lit to master Squashed commit of the following: commit 2e3b8040edc18c5410c5a055eace0199a3135189 Author: Vitalii Date: Thu May 19 12:44:06 2022 +0300 http_inspect: add handling of binary, octal and big integers to JS Normalizer --- diff --git a/src/utils/js_tokenizer.h b/src/utils/js_tokenizer.h index 14b6a1229..d697887ab 100644 --- a/src/utils/js_tokenizer.h +++ b/src/utils/js_tokenizer.h @@ -303,8 +303,7 @@ private: void escaped_unicode_utf_8(); void escaped_code_point(); void escaped_url_sequence_latin_1(); - void dec_code_point(); - void hex_code_point(); + void lit_int_code_point(int base); void char_code_no_match(); static const char* p_scope_codes[]; diff --git a/src/utils/js_tokenizer.l b/src/utils/js_tokenizer.l index cdebb9eed..4a1ab9228 100644 --- a/src/utils/js_tokenizer.l +++ b/src/utils/js_tokenizer.l @@ -32,6 +32,7 @@ #include "utils/js_tokenizer.h" +#include <algorithm> #include #include "utils/js_identifier_ctx.h" @@ -86,14 +87,16 @@ constexpr bool JSTokenizer::insert_semicolon[ASI_GROUP_MAX][ASI_GROUP_MAX]; enum EncodingType { - IS_HEX = 1 << 0, // hex code unit: 0xXXXX - IS_DEC = 1 << 1, // dec code unit: XXXX - IS_XBACKSLASH = 1 << 2, // \xXX - IS_UBACKSLASH_1 = 1 << 3, // \uXX - IS_UBACKSLASH_2 = 1 << 4, // \uXXXX - IS_UPERCENT = 1 << 5, // %uXXXX - IS_PERCENT = 1 << 6, // %XX - IS_UCODEPOINT = 1 << 7 // \u{0xXXXX} + IS_BIN = 1 << 0, // bin code unit: 0bXXXX + IS_OCT = 1 << 1, // oct code unit: 0oXXXX + IS_DEC = 1 << 2, // dec code unit: XXXX + IS_HEX = 1 << 3, // hex code unit: 0xXXXX + 
IS_XBACKSLASH = 1 << 4, // \xXX + IS_UBACKSLASH_1 = 1 << 5, // \uXX + IS_UBACKSLASH_2 = 1 << 6, // \uXXXX + IS_UPERCENT = 1 << 7, // %uXXXX + IS_PERCENT = 1 << 8, // %XX + IS_UCODEPOINT = 1 << 9 // \u{0xXXXX} }; %} @@ -981,9 +984,12 @@ IDENTIFIER ({IDENTIFIER_START}{IDENTIFIER_PART})* LITERAL_NULL null LITERAL_THIS this LITERAL_BOOLEAN true|false -LITERAL_DECIMAL [.]?[0-9]+[\.]?[0-9]*[eE]?[0-9]* -LITERAL_INTEGER [0-9]* -LITERAL_HEX_INTEGER 0x[0-9a-fA-F]*|0X[0-9a-fA-F]* +LITERAL_BIN_INTEGER 0[bB][01](_?[01])* +LITERAL_OCT_INTEGER 0[oO]?[0-7](_?[0-7])* +LITERAL_DECIMAL [.]?(_?[0-9])+[\.]?(_?[0-9])*([eE](_?[0-9])+)? +LITERAL_INTEGER [0-9](_?[0-9])* +LITERAL_HEX_INTEGER 0[xX][a-fA-F0-9](_?[a-fA-F0-9])* +LITERAL_BIG_INTEGER ({LITERAL_DECIMAL}|{LITERAL_BIN_INTEGER}|{LITERAL_OCT_INTEGER}|{LITERAL_HEX_INTEGER})n LITERAL_DQ_STRING_START \" LITERAL_DQ_STRING_END \" @@ -1011,7 +1017,7 @@ LITERAL_REGEX_G_CLOSE \)|\]|\} LITERAL_UNDEFINED undefined LITERAL_INFINITY Infinity|\xE2\x88\x9E LITERAL_NAN NaN -LITERAL {LITERAL_NULL}|{LITERAL_THIS}|{LITERAL_BOOLEAN}|{LITERAL_DECIMAL}|{LITERAL_HEX_INTEGER}|{LITERAL_UNDEFINED}|{LITERAL_INFINITY}|{LITERAL_NAN} +LITERAL {LITERAL_NULL}|{LITERAL_THIS}|{LITERAL_BOOLEAN}|{LITERAL_DECIMAL}|{LITERAL_BIN_INTEGER}|{LITERAL_OCT_INTEGER}|{LITERAL_HEX_INTEGER}|{LITERAL_BIG_INTEGER}|{LITERAL_UNDEFINED}|{LITERAL_INFINITY}|{LITERAL_NAN} HTML_COMMENT_OPEN "<"+"!--" HTML_TAG_SCRIPT_OPEN "<"+(?i:script)[\x9\xA\xC\x20\x2f\x3e] @@ -1221,8 +1227,10 @@ ALL_UNICODE [\0-\x7F]|[\xC2-\xDF][\x80-\xBF]|(\xE0[\xA0-\xBF]|[\xE1-\xEF][\x8 {LITERAL} { EXEC(general_literal()) } {IDENTIFIER} { EXEC(general_identifier()) } -{LITERAL_INTEGER} { set_encoding(IS_DEC); dec_code_point(); } -{LITERAL_HEX_INTEGER} { set_encoding(IS_HEX); hex_code_point(); } +{LITERAL_BIN_INTEGER} { set_encoding(IS_BIN); lit_int_code_point(2); } +{LITERAL_OCT_INTEGER} { set_encoding(IS_OCT); lit_int_code_point(8); } +{LITERAL_INTEGER} { set_encoding(IS_DEC); lit_int_code_point(10); } 
+{LITERAL_HEX_INTEGER} { set_encoding(IS_HEX); lit_int_code_point(16); } .|{ALL_UNICODE} { general_unicode(); } @@ -2919,16 +2927,11 @@ void JSTokenizer::escaped_url_sequence_latin_1() yyout << (char)std::stoi(code, nullptr, 16); } -void JSTokenizer::dec_code_point() +void JSTokenizer::lit_int_code_point(int base) { - std::string code(YYText()); - yyout << unicode_to_utf8(std::stoi(code, nullptr, 10)); -} - -void JSTokenizer::hex_code_point() -{ - std::string code(YYText()); - yyout << unicode_to_utf8(std::stoi(code, nullptr, 16)); + std::string code(base != 10 && !isdigit(YYText()[1]) ? YYText() + 2 : YYText()); + code.erase(std::remove(code.begin(), code.end(), '_'), code.end()); + yyout << unicode_to_utf8(std::stoi(code, nullptr, base)); } void JSTokenizer::char_code_no_match() diff --git a/src/utils/test/js_normalizer_test.cc b/src/utils/test/js_normalizer_test.cc index 1d4138286..f0b6c4c01 100644 --- a/src/utils/test/js_normalizer_test.cc +++ b/src/utils/test/js_normalizer_test.cc @@ -624,16 +624,16 @@ static const char all_patterns_expected3[] = "interface void while delete export package"; static const char all_patterns_buf4[] = - "/regex/g undefined null true false 2 23 2.3 2.23 .2 .02 4. +2 -2 " - "+3.3 -3.3 +23 -32 2.3E45 3.E34 -2.3E45 -3.E34 +2.3E45 +3.E34 0x1234 0XFFFF Infinity " - "\xE2\x88\x9E NaN \"\" \"double string\" \"d\" '' 'single string' 's' x=/regex/gs " - "x=2/2/1 `\ntemplate\n`"; + "/regex/g undefined null true false 2 23 2_3 2.3 2.23 2.2_3 .2 .02 .0_2 4. 
+2 -2 " + "+3.3 -3.3 +23 -32 2.3E45 2.3E4_5 3.E34 -2.3E45 -3.E34 +2.3E45 +3.E34 0b101 0B111 0o357 0O777 " + "0373 0x1234 0XFFFF 123n 0b101n 0o123n 0xaffn Infinity \xE2\x88\x9E NaN \"\" \"double string\" " + "\"d\" '' 'single string' 's' x=/regex/gs x=2/2/1 `\ntemplate\n`"; static const char all_patterns_expected4[] = - "/regex/g undefined null true false 2 23 2.3 2.23 .2 .02 4.+2-2" - "+3.3-3.3+23-32 2.3E45 3.E34-2.3E45-3.E34+2.3E45+3.E34 0x1234 0XFFFF Infinity " - "\xE2\x88\x9E NaN \"\" \"double string\" \"d\" '' 'single string' 's' x=/regex/gs " - "x=2/2/1 `\ntemplate\n`"; + "/regex/g undefined null true false 2 23 2_3 2.3 2.23 2.2_3 .2 .02 .0_2 4.+2-2" + "+3.3-3.3+23-32 2.3E45 2.3E4_5 3.E34-2.3E45-3.E34+2.3E45+3.E34 0b101 0B111 0o357 0O777 0373 " + "0x1234 0XFFFF 123n 0b101n 0o123n 0xaffn Infinity \xE2\x88\x9E NaN \"\" \"double string\" " + "\"d\" '' 'single string' 's' x=/regex/gs x=2/2/1 `\ntemplate\n`"; static const char all_patterns_buf5[] = "$2abc _2abc abc $__$ 肖晗 XÆA12 \\u0041abc \\u00FBdef \\u1234ghi ab\xE2\x80\xA8ww " @@ -2763,6 +2763,115 @@ TEST_CASE("split in keyword", "[JSNormalizer]") } } +TEST_CASE("split in integer literal", "[JSNormalizer]") +{ + SECTION("1 2;") + { + const char dat1[] = "1"; + const char dat2[] = "2;"; + const char exp1[] = "1"; + const char exp2[] = "12;"; + const char exp[] = "12;"; + + NORMALIZE_2(dat1, dat2, exp1, exp2); + NORM_COMBINED_2(dat1, dat2, exp); + } + SECTION("0 b01;") + { + const char dat1[] = "0"; + const char dat2[] = "b01;"; + const char exp1[] = "0"; + const char exp2[] = "0b01;"; + const char exp[] = "0b01;"; + + NORMALIZE_2(dat1, dat2, exp1, exp2); + NORM_COMBINED_2(dat1, dat2, exp); + } + SECTION("0o 12;") + { + const char dat1[] = "0o"; + const char dat2[] = "12;"; + const char exp1[] = "0 o"; + const char exp2[] = "0o12;"; + const char exp[] = "0o12;"; + + NORMALIZE_2(dat1, dat2, exp1, exp2); + NORM_COMBINED_2(dat1, dat2, exp); + } + SECTION("0 12;") + { + const char dat1[] = "0"; + const char dat2[] = 
"12;"; + const char exp1[] = "0"; + const char exp2[] = "012;"; + const char exp[] = "012;"; + + NORMALIZE_2(dat1, dat2, exp1, exp2); + NORM_COMBINED_2(dat1, dat2, exp); + } + SECTION("01 9;") + { + const char dat1[] = "01"; + const char dat2[] = "9;"; + const char exp1[] = "01"; + const char exp2[] = "019;"; + const char exp[] = "019;"; + + NORMALIZE_2(dat1, dat2, exp1, exp2); + NORM_COMBINED_2(dat1, dat2, exp); + } + SECTION(". 12;") + { + const char dat1[] = "."; + const char dat2[] = "12;"; + const char exp1[] = "."; + const char exp2[] = ".12;"; + const char exp[] = ".12;"; + + NORMALIZE_2(dat1, dat2, exp1, exp2); + NORM_COMBINED_2(dat1, dat2, exp); + } + SECTION("0 x 12;") + { + const char dat1[] = "0"; + const char dat2[] = "x"; + const char dat3[] = "12;"; + const char exp1[] = "0"; + const char exp2[] = " x"; + const char exp3[] = "0x12;"; + const char exp[] = "0x12;"; + + NORMALIZE_3(dat1, dat2, dat3, exp1, exp2, exp3); + NORM_COMBINED_3(dat1, dat2, dat3, exp); + } + SECTION("1 _ 2;") + { + const char dat1[] = "1"; + const char dat2[] = "_"; + const char dat3[] = "2;"; + const char exp1[] = "1"; + const char exp2[] = " _"; + const char exp3[] = "1_2;"; + const char exp[] = "1_2;"; + + NORMALIZE_3(dat1, dat2, dat3, exp1, exp2, exp3); + NORM_COMBINED_3(dat1, dat2, dat3, exp); + } + SECTION("1 E 2;") + { + const char dat1[] = "1"; + const char dat2[] = "E"; + const char dat3[] = "2;"; + const char exp1[] = "1"; + const char exp2[] = " E"; + const char exp3[] = "1E2;"; + const char exp[] = "1E2;"; + + NORMALIZE_3(dat1, dat2, dat3, exp1, exp2, exp3); + NORM_COMBINED_3(dat1, dat2, dat3, exp); + } +} + TEST_CASE("split and continuation combined", "[JSNormalizer]") { SECTION("PDU 1 [cont] PDU 2 [end end cont end]") diff --git a/src/utils/test/js_unescape_test.cc b/src/utils/test/js_unescape_test.cc index 949e9840b..0df3e2c34 100644 --- a/src/utils/test/js_unescape_test.cc +++ b/src/utils/test/js_unescape_test.cc @@ -74,18 +74,74 @@ TEST_CASE("Sequence parsing", 
"[JSNormalizer]") "'\u0020 \u00EB \u0123 \u4567 \u89aA \ubBcC \u00dD \ueEfF'" ); + SECTION("binary") + { + test_normalization( + "String.fromCharCode(0b1, 0B1100 ,0b11101011, 0B000101011001, 0b0001101010000101)", + "'\u0001\u000c\u00EB\u0159\u1a85'" + ); + test_normalization( + "String.fromCharCode(0b10000000000000000)", + "'\xf0\x90\x80\x80'" + ); + test_normalization( + "String.fromCharCode(0B10000000000000000)", + "'\xf0\x90\x80\x80'" + ); + test_normalization( + "String.fromCodePoint(0b1000000000000000000000)", + "'\xf7\xbf\xbf\xbf'" + ); + test_normalization( + "String.fromCodePoint(0B1000000000000000000000)", + "'\xf7\xbf\xbf\xbf'" + ); + } + + SECTION("octal") + { + test_normalization( + "String.fromCharCode(0O1, 014 ,0o353, 0O531, 0o15205)", + "'\u0001\u000c\u00EB\u0159\u1a85'" + ); + test_normalization( + "String.fromCharCode(0o200000)", + "'\xf0\x90\x80\x80'" + ); + test_normalization( + "String.fromCharCode(0O200000)", + "'\xf0\x90\x80\x80'" + ); + test_normalization( + "String.fromCharCode(0200000)", + "'\xf0\x90\x80\x80'" + ); + test_normalization( + "String.fromCodePoint(0o10_000_000)", + "'\xf7\xbf\xbf\xbf'" + ); + test_normalization( + "String.fromCodePoint(0O10_000_000)", + "'\xf7\xbf\xbf\xbf'" + ); + test_normalization( + "String.fromCodePoint(010_000_000)", + "'\xf7\xbf\xbf\xbf'" + ); + } + SECTION("decimal") { test_normalization( - "String.fromCharCode(1, 12 ,235, 345, 6789, 1000, 0001)", - "'\u0001\u000c\u00EB\u0159\u1a85\u03e8\u0001'" + "String.fromCharCode(1, 12 ,235, 345, 6789, 10_00, 00_09)", + "'\u0001\u000c\u00EB\u0159\u1a85\u03e8\u0009'" ); test_normalization( "String.fromCharCode(65536)", "'\xf0\x90\x80\x80'" ); test_normalization( - "String.fromCodePoint(2097152)", + "String.fromCodePoint(209_715_2)", "'\xf7\xbf\xbf\xbf'" ); } @@ -93,7 +149,7 @@ TEST_CASE("Sequence parsing", "[JSNormalizer]") SECTION("hexadecimal") { test_normalization( - "String.fromCharCode(0x0001, 0X00EB, 0x0123, 0x4567, 0x89aA, 0xbBcC, 0x00dD, 0xeEfF)", + 
"String.fromCharCode(0x0001, 0X00EB, 0x0123, 0x45_67, 0x89aA, 0xbBcC, 0x00dD, 0xe_Ef_F)", "'\u0001\u00EB\u0123\u4567\u89aA\ubBcC\u00dD\ueEfF'" ); test_normalization( @@ -105,11 +161,11 @@ TEST_CASE("Sequence parsing", "[JSNormalizer]") "'\xf0\x90\x80\x80'" ); test_normalization( - "String.fromCodePoint(0x200000)", + "String.fromCodePoint(0x200_000)", "'\xf7\xbf\xbf\xbf'" ); test_normalization( - "String.fromCodePoint(0X200000)", + "String.fromCodePoint(0X200_000)", "'\xf7\xbf\xbf\xbf'" ); } @@ -584,6 +640,41 @@ TEST_CASE("decodeURIComponent()", "[JSNormalizer]") TEST_CASE("String.fromCharCode()", "[JSNormalizer]") { + SECTION("binary") + { + test_normalization( + "String.fromCharCode(0b1100010, 0B1100001, 0b1110010)", + "'bar'" + ); + + test_normalization( + "String.fromCharCode(0B001100010, 0b001100001, 0B001110010)", + "'bar'" + ); + + test_normalization( + "String.fromCharCode(0b11_00_010, 0b00_11_00_001, 0B11_10_010)", + "'bar'" + ); + } + + SECTION("octal") + { + test_normalization( + "String.fromCharCode(0o142, 0o141, 0o162)", + "'bar'" + ); + + test_normalization( + "String.fromCharCode(0o00142, 0o00141, 0o00162)", + "'bar'" + ); + test_normalization( + "String.fromCharCode(0o00_14_2, 0O0_0_1_4_1, 000_162)", + "'bar'" + ); + } + SECTION("decimal") { test_normalization( @@ -600,7 +691,7 @@ TEST_CASE("String.fromCharCode()", "[JSNormalizer]") ); test_normalization( - "String.fromCharCode(0x0062, 0x0061, 0x0072)", + "String.fromCharCode(0x0062, 0x00_61, 0x0072)", "'bar'" ); } @@ -608,19 +699,50 @@ TEST_CASE("String.fromCharCode()", "[JSNormalizer]") SECTION("mixed sequence") { test_normalization_mixed_encoding( - "String.fromCharCode(98, 97, 0x72)", - "'bar'" + "String.fromCharCode(0b11_00_110, 111, 0o157, 98, 0b1100001, 0x72)", + "'foobar'" ); test_normalization_mixed_encoding( - "String.fromCharCode(0x62, 97, 114)", - "'bar'" + "String.fromCharCode(102 ,0b110_1111, 0o157, 0x62, 97, 0O162)", + "'foobar'" ); } } TEST_CASE("String.fromCodePoint()", 
"[JSNormalizer]") { + SECTION("binary") + { + test_normalization( + "String.fromCodePoint(0b1100010, 0b1100001, 0b1110010)", + "'bar'" + ); + + test_normalization( + "String.fromCodePoint(0b10000000001000000, 0b10000000001000001, 0b10000000001000010)", + "'\xf0\x90\x81\x80\xf0\x90\x81\x81\xf0\x90\x81\x82'" + ); + + test_normalization( + "String.fromCodePoint(0b000_1_100_010, 0B1100001, 0B111_0010)", + "'bar'" + ); + } + + SECTION("octal") + { + test_normalization( + "String.fromCodePoint(0o142, 0O141, 0162)", + "'bar'" + ); + + test_normalization( + "String.fromCodePoint(0200_100, 0o200101, 0O200_102)", + "'\xf0\x90\x81\x80\xf0\x90\x81\x81\xf0\x90\x81\x82'" + ); + } + SECTION("decimal") { test_normalization( @@ -629,7 +751,7 @@ TEST_CASE("String.fromCodePoint()", "[JSNormalizer]") ); test_normalization( - "String.fromCodePoint(65600, 65601, 65602)", + "String.fromCodePoint(65600, 65_601, 65602)", "'\xf0\x90\x81\x80\xf0\x90\x81\x81\xf0\x90\x81\x82'" ); } @@ -642,7 +764,7 @@ TEST_CASE("String.fromCodePoint()", "[JSNormalizer]") ); test_normalization( - "String.fromCodePoint(0x00000062, 0x00000061, 0x00000072)", + "String.fromCodePoint(0x000_000_62, 0X00000061, 0x00000072)", "'bar'" ); @@ -655,23 +777,23 @@ TEST_CASE("String.fromCodePoint()", "[JSNormalizer]") SECTION("mixed sequence") { test_normalization_mixed_encoding( - "String.fromCodePoint(98, 97, 0x72)", - "'bar'" + "String.fromCodePoint(0b1100110, 111, 0o157, 98, 0b11_00_001, 0x72)", + "'foobar'" ); test_normalization_mixed_encoding( - "String.fromCodePoint(0x00000062, 97, 114)", - "'bar'" + "String.fromCodePoint(102 ,0b1101111, 0o157, 0X000_00062, 97, 0O162)", + "'foobar'" ); test_normalization_mixed_encoding( - "String.fromCodePoint(65600, 0x10041, 65602)", - "'\xf0\x90\x81\x80\xf0\x90\x81\x81\xf0\x90\x81\x82'" + "String.fromCodePoint(65600, 0x10041, 0o200102, 0B1000_0000_0010_00011)", + "'\xf0\x90\x81\x80\xf0\x90\x81\x81\xf0\x90\x81\x82\xf0\x90\x81\x83'" ); test_normalization_mixed_encoding( - 
"String.fromCodePoint(0x10040, 65601, 0x10042)", - "'\xf0\x90\x81\x80\xf0\x90\x81\x81\xf0\x90\x81\x82'" + "String.fromCodePoint(0200_100, 65601, 0B10000000001000010, 0x10043)", + "'\xf0\x90\x81\x80\xf0\x90\x81\x81\xf0\x90\x81\x82\xf0\x90\x81\x83'" ); } }