git.ipfire.org Git - thirdparty/snort3.git/commitdiff
Pull request #3431: http_inspect: add handling of binary and octal integers to JS Normalizer
author Mike Stepanek (mstepane) <mstepane@cisco.com>
Fri, 27 May 2022 16:47:05 +0000 (16:47 +0000)
committer Mike Stepanek (mstepane) <mstepane@cisco.com>
Fri, 27 May 2022 16:47:05 +0000 (16:47 +0000)
Merge in SNORT/snort3 from ~VHORBATO/snort3:js_int_lit to master

Squashed commit of the following:

commit 2e3b8040edc18c5410c5a055eace0199a3135189
Author: Vitalii <vhorbato@cisco.com>
Date:   Thu May 19 12:44:06 2022 +0300

    http_inspect: add handling of binary, octal and big integers to JS Normalizer

src/utils/js_tokenizer.h
src/utils/js_tokenizer.l
src/utils/test/js_normalizer_test.cc
src/utils/test/js_unescape_test.cc

index 14b6a12292e686d28fe23f8c5102fe5024372649..d697887ab00998c3e0ce45b9181ce1c4d5e5530f 100644 (file)
@@ -303,8 +303,7 @@ private:
     void escaped_unicode_utf_8();
     void escaped_code_point();
     void escaped_url_sequence_latin_1();
-    void dec_code_point();
-    void hex_code_point();
+    void lit_int_code_point(int base);
     void char_code_no_match();
 
     static const char* p_scope_codes[];
index cdebb9eed8e2be854e04257a6adfc7cc4988a0a3..4a1ab9228fdd75dafadaa8908931133b55465d40 100644 (file)
@@ -32,6 +32,7 @@
 
 #include "utils/js_tokenizer.h"
 
+#include <algorithm>
 #include <cassert>
 
 #include "utils/js_identifier_ctx.h"
@@ -86,14 +87,16 @@ constexpr bool JSTokenizer::insert_semicolon[ASI_GROUP_MAX][ASI_GROUP_MAX];
 
 enum EncodingType
 {
-    IS_HEX          = 1 << 0,   // hex code unit: 0xXXXX
-    IS_DEC          = 1 << 1,   // dec code unit: XXXX
-    IS_XBACKSLASH   = 1 << 2,   // \xXX
-    IS_UBACKSLASH_1 = 1 << 3,   // \uXX
-    IS_UBACKSLASH_2 = 1 << 4,   // \uXXXX
-    IS_UPERCENT     = 1 << 5,   // %uXXXX
-    IS_PERCENT      = 1 << 6,   // %XX
-    IS_UCODEPOINT   = 1 << 7    // \u{0xXXXX}
+    IS_BIN          = 1 << 0,   // bin code unit: 0bXXXX
+    IS_OCT          = 1 << 1,   // oct code unit: 0oXXXX
+    IS_DEC          = 1 << 2,   // dec code unit: XXXX
+    IS_HEX          = 1 << 3,   // hex code unit: 0xXXXX
+    IS_XBACKSLASH   = 1 << 4,   // \xXX
+    IS_UBACKSLASH_1 = 1 << 5,   // \uXX
+    IS_UBACKSLASH_2 = 1 << 6,   // \uXXXX
+    IS_UPERCENT     = 1 << 7,   // %uXXXX
+    IS_PERCENT      = 1 << 8,   // %XX
+    IS_UCODEPOINT   = 1 << 9    // \u{0xXXXX}
 };
 
 %}
@@ -981,9 +984,12 @@ IDENTIFIER          ({IDENTIFIER_START}{IDENTIFIER_PART})*
 LITERAL_NULL                  null
 LITERAL_THIS                  this
 LITERAL_BOOLEAN               true|false
-LITERAL_DECIMAL               [.]?[0-9]+[\.]?[0-9]*[eE]?[0-9]*
-LITERAL_INTEGER               [0-9]*
-LITERAL_HEX_INTEGER           0x[0-9a-fA-F]*|0X[0-9a-fA-F]*
+LITERAL_BIN_INTEGER           0[bB][01](_?[01])*
+LITERAL_OCT_INTEGER           0[oO]?[0-7](_?[0-7])*
+LITERAL_DECIMAL               [.]?(_?[0-9])+[\.]?(_?[0-9])*([eE](_?[0-9])+)?
+LITERAL_INTEGER               [0-9](_?[0-9])*
+LITERAL_HEX_INTEGER           0[xX][a-fA-F0-9](_?[a-fA-F0-9])*
+LITERAL_BIG_INTEGER           ({LITERAL_DECIMAL}|{LITERAL_BIN_INTEGER}|{LITERAL_OCT_INTEGER}|{LITERAL_HEX_INTEGER})n
 
 LITERAL_DQ_STRING_START       \"
 LITERAL_DQ_STRING_END         \"
@@ -1011,7 +1017,7 @@ LITERAL_REGEX_G_CLOSE         \)|\]|\}
 LITERAL_UNDEFINED             undefined
 LITERAL_INFINITY              Infinity|\xE2\x88\x9E
 LITERAL_NAN                   NaN
-LITERAL                       {LITERAL_NULL}|{LITERAL_THIS}|{LITERAL_BOOLEAN}|{LITERAL_DECIMAL}|{LITERAL_HEX_INTEGER}|{LITERAL_UNDEFINED}|{LITERAL_INFINITY}|{LITERAL_NAN}
+LITERAL                       {LITERAL_NULL}|{LITERAL_THIS}|{LITERAL_BOOLEAN}|{LITERAL_DECIMAL}|{LITERAL_BIN_INTEGER}|{LITERAL_OCT_INTEGER}|{LITERAL_HEX_INTEGER}|{LITERAL_BIG_INTEGER}|{LITERAL_UNDEFINED}|{LITERAL_INFINITY}|{LITERAL_NAN}
 
 HTML_COMMENT_OPEN         "<"+"!--"
 HTML_TAG_SCRIPT_OPEN      "<"+(?i:script)[\x9\xA\xC\x20\x2f\x3e]
@@ -1221,8 +1227,10 @@ ALL_UNICODE    [\0-\x7F]|[\xC2-\xDF][\x80-\xBF]|(\xE0[\xA0-\xBF]|[\xE1-\xEF][\x8
 {LITERAL}                           { EXEC(general_literal()) }
 {IDENTIFIER}                        { EXEC(general_identifier()) }
 
-<char_code>{LITERAL_INTEGER}       { set_encoding(IS_DEC); dec_code_point(); }
-<char_code>{LITERAL_HEX_INTEGER}   { set_encoding(IS_HEX); hex_code_point(); }
+<char_code>{LITERAL_BIN_INTEGER}   { set_encoding(IS_BIN); lit_int_code_point(2); }
+<char_code>{LITERAL_OCT_INTEGER}   { set_encoding(IS_OCT); lit_int_code_point(8); }
+<char_code>{LITERAL_INTEGER}       { set_encoding(IS_DEC); lit_int_code_point(10); }
+<char_code>{LITERAL_HEX_INTEGER}   { set_encoding(IS_HEX); lit_int_code_point(16); }
 
 .|{ALL_UNICODE}                     { general_unicode(); }
 
@@ -2919,16 +2927,11 @@ void JSTokenizer::escaped_url_sequence_latin_1()
     yyout << (char)std::stoi(code, nullptr, 16);
 }
 
-void JSTokenizer::dec_code_point()
+void JSTokenizer::lit_int_code_point(int base)
 {
-    std::string code(YYText());
-    yyout << unicode_to_utf8(std::stoi(code, nullptr, 10));
-}
-
-void JSTokenizer::hex_code_point()
-{
-    std::string code(YYText());
-    yyout << unicode_to_utf8(std::stoi(code, nullptr, 16));
+    std::string code(base != 10 && !isdigit(YYText()[1]) ? YYText() + 2 : YYText());
+    code.erase(std::remove(code.begin(), code.end(), '_'), code.end());
+    yyout << unicode_to_utf8(std::stoi(code, nullptr, base));
 }
 
 void JSTokenizer::char_code_no_match()
index 1d413828679cfef10da3a49d4a3c814d5b701240..f0b6c4c01cf89a33889a8db4657e3fa6c0ea11f0 100644 (file)
@@ -624,16 +624,16 @@ static const char all_patterns_expected3[] =
     "interface void while delete export package";
 
 static const char all_patterns_buf4[] =
-    "/regex/g undefined null true false 2 23 2.3 2.23 .2 .02 4. +2 -2 "
-    "+3.3 -3.3 +23 -32 2.3E45 3.E34 -2.3E45 -3.E34 +2.3E45 +3.E34 0x1234 0XFFFF Infinity "
-    "\xE2\x88\x9E NaN \"\" \"double string\" \"d\" '' 'single string' 's' x=/regex/gs "
-    "x=2/2/1 `\ntemplate\n`";
+    "/regex/g undefined null true false 2 23 2_3 2.3 2.23 2.2_3 .2 .02 .0_2 4. +2 -2 "
+    "+3.3 -3.3 +23 -32 2.3E45 2.3E4_5 3.E34 -2.3E45 -3.E34 +2.3E45 +3.E34 0b101 0B111 0o357 0O777 "
+    "0373 0x1234 0XFFFF 123n 0b101n 0o123n 0xaffn Infinity \xE2\x88\x9E NaN \"\" \"double string\" "
+    "\"d\" '' 'single string' 's' x=/regex/gs x=2/2/1 `\ntemplate\n`";
 
 static const char all_patterns_expected4[] =
-    "/regex/g undefined null true false 2 23 2.3 2.23 .2 .02 4.+2-2"
-    "+3.3-3.3+23-32 2.3E45 3.E34-2.3E45-3.E34+2.3E45+3.E34 0x1234 0XFFFF Infinity "
-    "\xE2\x88\x9E NaN \"\" \"double string\" \"d\" '' 'single string' 's' x=/regex/gs "
-    "x=2/2/1 `\ntemplate\n`";
+    "/regex/g undefined null true false 2 23 2_3 2.3 2.23 2.2_3 .2 .02 .0_2 4.+2-2"
+    "+3.3-3.3+23-32 2.3E45 2.3E4_5 3.E34-2.3E45-3.E34+2.3E45+3.E34 0b101 0B111 0o357 0O777 0373 "
+    "0x1234 0XFFFF 123n 0b101n 0o123n 0xaffn Infinity \xE2\x88\x9E NaN \"\" \"double string\" "
+    "\"d\" '' 'single string' 's' x=/regex/gs x=2/2/1 `\ntemplate\n`";
 
 static const char all_patterns_buf5[] =
     "$2abc _2abc abc $__$ 肖晗 XÆA12 \\u0041abc \\u00FBdef \\u1234ghi ab\xE2\x80\xA8ww "
@@ -2763,6 +2763,115 @@ TEST_CASE("split in keyword", "[JSNormalizer]")
     }
 }
 
+TEST_CASE("split in integer literal", "[JSNormalizer]")
+{
+    SECTION("1 2;")
+    {
+        const char dat1[] = "1";
+        const char dat2[] = "2;";
+        const char exp1[] = "1";
+        const char exp2[] = "12;";
+        const char exp[] = "12;";
+
+        NORMALIZE_2(dat1, dat2, exp1, exp2);
+        NORM_COMBINED_2(dat1, dat2, exp);
+    }
+    SECTION("0 b01;")
+    {
+        const char dat1[] = "0";
+        const char dat2[] = "b01;";
+        const char exp1[] = "0";
+        const char exp2[] = "0b01;";
+        const char exp[] = "0b01;";
+
+        NORMALIZE_2(dat1, dat2, exp1, exp2);
+        NORM_COMBINED_2(dat1, dat2, exp);
+    }
+    SECTION("0o 12;")
+    {
+        const char dat1[] = "0o";
+        const char dat2[] = "12;";
+        const char exp1[] = "0 o";
+        const char exp2[] = "0o12;";
+        const char exp[] = "0o12;";
+
+        NORMALIZE_2(dat1, dat2, exp1, exp2);
+        NORM_COMBINED_2(dat1, dat2, exp);
+    }
+    SECTION("0 12;")
+    {
+        const char dat1[] = "0";
+        const char dat2[] = "12;";
+        const char exp1[] = "0";
+        const char exp2[] = "012;";
+        const char exp[] = "012;";
+
+        NORMALIZE_2(dat1, dat2, exp1, exp2);
+        NORM_COMBINED_2(dat1, dat2, exp);
+    }
+    SECTION("01 9;")
+    {
+        const char dat1[] = "01";
+        const char dat2[] = "9;";
+        const char exp1[] = "01";
+        const char exp2[] = "019;";
+        const char exp[] = "019;";
+
+        NORMALIZE_2(dat1, dat2, exp1, exp2);
+        NORM_COMBINED_2(dat1, dat2, exp);
+    }
+    SECTION(". 12;")
+    {
+        const char dat1[] = ".";
+        const char dat2[] = "12;";
+        const char exp1[] = ".";
+        const char exp2[] = ".12;";
+        const char exp[] = ".12;";
+
+        NORMALIZE_2(dat1, dat2, exp1, exp2);
+        NORM_COMBINED_2(dat1, dat2, exp);
+    }
+    SECTION("0 x 12;")
+    {
+        const char dat1[] = "0";
+        const char dat2[] = "x";
+        const char dat3[] = "12;";
+        const char exp1[] = "0";
+        const char exp2[] = " x";
+        const char exp3[] = "0x12;";
+        const char exp[] = "0x12;";
+
+        NORMALIZE_3(dat1, dat2, dat3, exp1, exp2, exp3);
+        NORM_COMBINED_3(dat1, dat2, dat3, exp);
+    }
+    SECTION("1 _ 2;")
+    {
+        const char dat1[] = "1";
+        const char dat2[] = "_";
+        const char dat3[] = "2;";
+        const char exp1[] = "1";
+        const char exp2[] = " _";
+        const char exp3[] = "1_2;";
+        const char exp[] = "1_2;";
+
+        NORMALIZE_3(dat1, dat2, dat3, exp1, exp2, exp3);
+        NORM_COMBINED_3(dat1, dat2, dat3, exp);
+    }
+    SECTION("1 E 2;")
+    {
+        const char dat1[] = "1";
+        const char dat2[] = "E";
+        const char dat3[] = "2;";
+        const char exp1[] = "1";
+        const char exp2[] = " E";
+        const char exp3[] = "1E2;";
+        const char exp[] = "1E2;";
+
+        NORMALIZE_3(dat1, dat2, dat3, exp1, exp2, exp3);
+        NORM_COMBINED_3(dat1, dat2, dat3, exp);
+    }
+}
+
 TEST_CASE("split and continuation combined", "[JSNormalizer]")
 {
     SECTION("PDU 1 [cont] PDU 2 [end end cont end]")
index 949e9840b7a1d9174e73f5ba5a82ed0b679892cd..0df3e2c34262a472b0217b8386e5b7b4af2baf52 100644 (file)
@@ -74,18 +74,74 @@ TEST_CASE("Sequence parsing", "[JSNormalizer]")
             "'\u0020 \u00EB \u0123 \u4567 \u89aA \ubBcC \u00dD \ueEfF'"
         );
 
+    SECTION("binary")
+    {
+        test_normalization(
+            "String.fromCharCode(0b1, 0B1100 ,0b11101011, 0B000101011001, 0b0001101010000101)",
+            "'\u0001\u000c\u00EB\u0159\u1a85'"
+        );
+        test_normalization(
+            "String.fromCharCode(0b10000000000000000)",
+            "'\xf0\x90\x80\x80'"
+        );
+        test_normalization(
+            "String.fromCharCode(0B10000000000000000)",
+            "'\xf0\x90\x80\x80'"
+        );
+        test_normalization(
+            "String.fromCodePoint(0b1000000000000000000000)",
+            "'\xf7\xbf\xbf\xbf'"
+        );
+        test_normalization(
+            "String.fromCodePoint(0B1000000000000000000000)",
+            "'\xf7\xbf\xbf\xbf'"
+        );
+    }
+
+    SECTION("octal")
+    {
+        test_normalization(
+            "String.fromCharCode(0O1, 014 ,0o353, 0O531, 0o15205)",
+            "'\u0001\u000c\u00EB\u0159\u1a85'"
+        );
+        test_normalization(
+            "String.fromCharCode(0o200000)",
+            "'\xf0\x90\x80\x80'"
+        );
+        test_normalization(
+            "String.fromCharCode(0O200000)",
+            "'\xf0\x90\x80\x80'"
+        );
+        test_normalization(
+            "String.fromCharCode(0200000)",
+            "'\xf0\x90\x80\x80'"
+        );
+        test_normalization(
+            "String.fromCodePoint(0o10_000_000)",
+            "'\xf7\xbf\xbf\xbf'"
+        );
+        test_normalization(
+            "String.fromCodePoint(0O10_000_000)",
+            "'\xf7\xbf\xbf\xbf'"
+        );
+        test_normalization(
+            "String.fromCodePoint(010_000_000)",
+            "'\xf7\xbf\xbf\xbf'"
+        );
+    }
+
     SECTION("decimal")
     {
         test_normalization(
-            "String.fromCharCode(1, 12 ,235, 345, 6789, 1000, 0001)",
-            "'\u0001\u000c\u00EB\u0159\u1a85\u03e8\u0001'"
+            "String.fromCharCode(1, 12 ,235, 345, 6789, 10_00, 00_09)",
+            "'\u0001\u000c\u00EB\u0159\u1a85\u03e8\u0009'"
         );
         test_normalization(
             "String.fromCharCode(65536)",
             "'\xf0\x90\x80\x80'"
         );
         test_normalization(
-            "String.fromCodePoint(2097152)",
+            "String.fromCodePoint(209_715_2)",
             "'\xf7\xbf\xbf\xbf'"
         );
     }
@@ -93,7 +149,7 @@ TEST_CASE("Sequence parsing", "[JSNormalizer]")
     SECTION("hexadecimal")
     {
         test_normalization(
-            "String.fromCharCode(0x0001, 0X00EB, 0x0123, 0x4567, 0x89aA, 0xbBcC, 0x00dD, 0xeEfF)",
+            "String.fromCharCode(0x0001, 0X00EB, 0x0123, 0x45_67, 0x89aA, 0xbBcC, 0x00dD, 0xe_Ef_F)",
             "'\u0001\u00EB\u0123\u4567\u89aA\ubBcC\u00dD\ueEfF'"
         );
         test_normalization(
@@ -105,11 +161,11 @@ TEST_CASE("Sequence parsing", "[JSNormalizer]")
             "'\xf0\x90\x80\x80'"
         );
         test_normalization(
-            "String.fromCodePoint(0x200000)",
+            "String.fromCodePoint(0x200_000)",
             "'\xf7\xbf\xbf\xbf'"
         );
         test_normalization(
-            "String.fromCodePoint(0X200000)",
+            "String.fromCodePoint(0X200_000)",
             "'\xf7\xbf\xbf\xbf'"
         );
     }
@@ -584,6 +640,41 @@ TEST_CASE("decodeURIComponent()", "[JSNormalizer]")
 
 TEST_CASE("String.fromCharCode()", "[JSNormalizer]")
 {
+    SECTION("binary")
+    {
+        test_normalization(
+            "String.fromCharCode(0b1100010, 0B1100001, 0b1110010)",
+            "'bar'"
+        );
+
+        test_normalization(
+            "String.fromCharCode(0B001100010, 0b001100001, 0B001110010)",
+            "'bar'"
+        );
+
+        test_normalization(
+            "String.fromCharCode(0b11_00_010, 0b00_11_00_001, 0B11_10_010)",
+            "'bar'"
+        );
+    }
+
+    SECTION("octal")
+    {
+        test_normalization(
+            "String.fromCharCode(0o142, 0o141, 0o162)",
+            "'bar'"
+        );
+
+        test_normalization(
+            "String.fromCharCode(0o00142, 0o00141, 0o00162)",
+            "'bar'"
+        );
+        test_normalization(
+            "String.fromCharCode(0o00_14_2, 0O0_0_1_4_1, 000_162)",
+            "'bar'"
+        );
+    }
+
     SECTION("decimal")
     {
         test_normalization(
@@ -600,7 +691,7 @@ TEST_CASE("String.fromCharCode()", "[JSNormalizer]")
         );
 
         test_normalization(
-            "String.fromCharCode(0x0062, 0x0061, 0x0072)",
+            "String.fromCharCode(0x0062, 0x00_61, 0x0072)",
             "'bar'"
         );
     }
@@ -608,19 +699,50 @@ TEST_CASE("String.fromCharCode()", "[JSNormalizer]")
     SECTION("mixed sequence")
     {
         test_normalization_mixed_encoding(
-            "String.fromCharCode(98, 97, 0x72)",
-            "'bar'"
+            "String.fromCharCode(0b11_00_110, 111, 0o157, 98, 0b1100001, 0x72)",
+            "'foobar'"
         );
 
         test_normalization_mixed_encoding(
-            "String.fromCharCode(0x62, 97, 114)",
-            "'bar'"
+            "String.fromCharCode(102 ,0b110_1111, 0o157, 0x62, 97, 0O162)",
+            "'foobar'"
         );
     }
 }
 
 TEST_CASE("String.fromCodePoint()", "[JSNormalizer]")
 {
+    SECTION("binary")
+    {
+        test_normalization(
+            "String.fromCodePoint(0b1100010, 0b1100001, 0b1110010)",
+            "'bar'"
+        );
+
+        test_normalization(
+            "String.fromCodePoint(0b10000000001000000, 0b10000000001000001, 0b10000000001000010)",
+            "'\xf0\x90\x81\x80\xf0\x90\x81\x81\xf0\x90\x81\x82'"
+        );
+
+        test_normalization(
+            "String.fromCodePoint(0b000_1_100_010, 0B1100001, 0B111_0010)",
+            "'bar'"
+        );
+    }
+
+    SECTION("octal")
+    {
+        test_normalization(
+            "String.fromCodePoint(0o142, 0O141, 0162)",
+            "'bar'"
+        );
+
+        test_normalization(
+            "String.fromCodePoint(0200_100, 0o200101, 0O200_102)",
+            "'\xf0\x90\x81\x80\xf0\x90\x81\x81\xf0\x90\x81\x82'"
+        );
+    }
+
     SECTION("decimal")
     {
         test_normalization(
@@ -629,7 +751,7 @@ TEST_CASE("String.fromCodePoint()", "[JSNormalizer]")
         );
 
         test_normalization(
-            "String.fromCodePoint(65600, 65601, 65602)",
+            "String.fromCodePoint(65600, 65_601, 65602)",
             "'\xf0\x90\x81\x80\xf0\x90\x81\x81\xf0\x90\x81\x82'"
         );
     }
@@ -642,7 +764,7 @@ TEST_CASE("String.fromCodePoint()", "[JSNormalizer]")
         );
 
         test_normalization(
-            "String.fromCodePoint(0x00000062, 0x00000061, 0x00000072)",
+            "String.fromCodePoint(0x000_000_62, 0X00000061, 0x00000072)",
             "'bar'"
         );
 
@@ -655,23 +777,23 @@ TEST_CASE("String.fromCodePoint()", "[JSNormalizer]")
     SECTION("mixed sequence")
     {
         test_normalization_mixed_encoding(
-            "String.fromCodePoint(98, 97, 0x72)",
-            "'bar'"
+            "String.fromCodePoint(0b1100110, 111, 0o157, 98, 0b11_00_001, 0x72)",
+            "'foobar'"
         );
 
         test_normalization_mixed_encoding(
-            "String.fromCodePoint(0x00000062, 97, 114)",
-            "'bar'"
+            "String.fromCodePoint(102 ,0b1101111, 0o157, 0X000_00062, 97, 0O162)",
+            "'foobar'"
         );
 
         test_normalization_mixed_encoding(
-            "String.fromCodePoint(65600, 0x10041, 65602)",
-            "'\xf0\x90\x81\x80\xf0\x90\x81\x81\xf0\x90\x81\x82'"
+            "String.fromCodePoint(65600, 0x10041, 0o200102, 0B1000_0000_0010_00011)",
+            "'\xf0\x90\x81\x80\xf0\x90\x81\x81\xf0\x90\x81\x82\xf0\x90\x81\x83'"
         );
 
         test_normalization_mixed_encoding(
-            "String.fromCodePoint(0x10040, 65601, 0x10042)",
-            "'\xf0\x90\x81\x80\xf0\x90\x81\x81\xf0\x90\x81\x82'"
+            "String.fromCodePoint(0200_100, 65601, 0B10000000001000010, 0x10043)",
+            "'\xf0\x90\x81\x80\xf0\x90\x81\x81\xf0\x90\x81\x82\xf0\x90\x81\x83'"
         );
     }
 }