void escaped_unicode_utf_8();
void escaped_code_point();
void escaped_url_sequence_latin_1();
- void dec_code_point();
- void hex_code_point();
+ void lit_int_code_point(int base);
void char_code_no_match();
static const char* p_scope_codes[];
#include "utils/js_tokenizer.h"
+#include <algorithm>
#include <cassert>
+#include <cerrno>
+#include <climits>
+#include <cstdlib>
#include "utils/js_identifier_ctx.h"
enum EncodingType // every enumerator is a distinct single bit (1 << k)
{
- IS_HEX = 1 << 0, // hex code unit: 0xXXXX
- IS_DEC = 1 << 1, // dec code unit: XXXX
- IS_XBACKSLASH = 1 << 2, // \xXX
- IS_UBACKSLASH_1 = 1 << 3, // \uXX
- IS_UBACKSLASH_2 = 1 << 4, // \uXXXX
- IS_UPERCENT = 1 << 5, // %uXXXX
- IS_PERCENT = 1 << 6, // %XX
- IS_UCODEPOINT = 1 << 7 // \u{0xXXXX}
+ IS_BIN = 1 << 0, // bin code unit: 0bXXXX
+ IS_OCT = 1 << 1, // oct code unit: 0oXXXX, or legacy form 0XXXX (leading zero, no 'o')
+ IS_DEC = 1 << 2, // dec code unit: XXXX
+ IS_HEX = 1 << 3, // hex code unit: 0xXXXX
+ IS_XBACKSLASH = 1 << 4, // \xXX
+ IS_UBACKSLASH_1 = 1 << 5, // \uXX
+ IS_UBACKSLASH_2 = 1 << 6, // \uXXXX
+ IS_UPERCENT = 1 << 7, // %uXXXX
+ IS_PERCENT = 1 << 8, // %XX
+ IS_UCODEPOINT = 1 << 9 // \u{0xXXXX}
};
%}
LITERAL_NULL null
LITERAL_THIS this
LITERAL_BOOLEAN true|false
-LITERAL_DECIMAL [.]?[0-9]+[\.]?[0-9]*[eE]?[0-9]*
-LITERAL_INTEGER [0-9]*
-LITERAL_HEX_INTEGER 0x[0-9a-fA-F]*|0X[0-9a-fA-F]*
+/* Numeric literals; '_' is accepted as a digit separator. */
+/* Each pattern must begin with a real digit (after the optional '.' or radix prefix): */
+/* a token such as _1 is an identifier, and if LITERAL_DECIMAL also matched it the */
+/* equal-length tie would go to the {LITERAL} rule because it is listed before {IDENTIFIER}. */
+LITERAL_BIN_INTEGER 0[bB][01](_?[01])*
+LITERAL_OCT_INTEGER 0[oO]?[0-7](_?[0-7])*
+LITERAL_DECIMAL [.]?[0-9](_?[0-9])*[\.]?(_?[0-9])*([eE](_?[0-9])+)?
+LITERAL_INTEGER [0-9](_?[0-9])*
+LITERAL_HEX_INTEGER 0[xX][a-fA-F0-9](_?[a-fA-F0-9])*
+LITERAL_BIG_INTEGER ({LITERAL_DECIMAL}|{LITERAL_BIN_INTEGER}|{LITERAL_OCT_INTEGER}|{LITERAL_HEX_INTEGER})n
LITERAL_DQ_STRING_START \"
LITERAL_DQ_STRING_END \"
LITERAL_UNDEFINED undefined
LITERAL_INFINITY Infinity|\xE2\x88\x9E
LITERAL_NAN NaN
-LITERAL {LITERAL_NULL}|{LITERAL_THIS}|{LITERAL_BOOLEAN}|{LITERAL_DECIMAL}|{LITERAL_HEX_INTEGER}|{LITERAL_UNDEFINED}|{LITERAL_INFINITY}|{LITERAL_NAN}
+LITERAL {LITERAL_NULL}|{LITERAL_THIS}|{LITERAL_BOOLEAN}|{LITERAL_DECIMAL}|{LITERAL_BIN_INTEGER}|{LITERAL_OCT_INTEGER}|{LITERAL_HEX_INTEGER}|{LITERAL_BIG_INTEGER}|{LITERAL_UNDEFINED}|{LITERAL_INFINITY}|{LITERAL_NAN}
HTML_COMMENT_OPEN "<"+"!--"
HTML_TAG_SCRIPT_OPEN "<"+(?i:script)[\x9\xA\xC\x20\x2f\x3e]
{LITERAL} { EXEC(general_literal()) }
{IDENTIFIER} { EXEC(general_identifier()) }
-<char_code>{LITERAL_INTEGER} { set_encoding(IS_DEC); dec_code_point(); }
-<char_code>{LITERAL_HEX_INTEGER} { set_encoding(IS_HEX); hex_code_point(); }
+<char_code>{LITERAL_BIN_INTEGER} { set_encoding(IS_BIN); lit_int_code_point(2); }
+<char_code>{LITERAL_OCT_INTEGER} { set_encoding(IS_OCT); lit_int_code_point(8); }
+<char_code>{LITERAL_INTEGER} { set_encoding(IS_DEC); lit_int_code_point(10); }
+<char_code>{LITERAL_HEX_INTEGER} { set_encoding(IS_HEX); lit_int_code_point(16); }
.|{ALL_UNICODE} { general_unicode(); }
yyout << (char)std::stoi(code, nullptr, 16);
}
-void JSTokenizer::dec_code_point()
+// Converts the integer literal currently held in YYText() to a code point and writes
+// unicode_to_utf8() of that value to yyout.
+//   base: radix of the matched literal, as passed by the scanner rules (2, 8, 10 or 16).
+// The literal comes from script text, so its length is unbounded: a value that does not
+// fit in int is saturated to INT_MAX instead of letting std::stoi throw std::out_of_range.
+// NOTE(review): assumes unicode_to_utf8() tolerates INT_MAX the same way it tolerates other
+// values above the Unicode range (std::stoi could already return INT_MAX) - confirm.
+void JSTokenizer::lit_int_code_point(int base)
{
- std::string code(YYText());
- yyout << unicode_to_utf8(std::stoi(code, nullptr, 10));
-}
-
-void JSTokenizer::hex_code_point()
-{
- std::string code(YYText());
- yyout << unicode_to_utf8(std::stoi(code, nullptr, 16));
+    const char* text = YYText();
+
+    // Non-decimal literals carry a two-character radix prefix (0b, 0o, 0x) unless the
+    // second character is a digit, which is the legacy octal form (e.g. 014).
+    // The cast keeps isdigit() defined for bytes with the high bit set.
+    const bool has_radix_prefix =
+        base != 10 && !isdigit(static_cast<unsigned char>(text[1]));
+
+    std::string code(has_radix_prefix ? text + 2 : text);
+    // Digit separators are not understood by the conversion routine.
+    code.erase(std::remove(code.begin(), code.end(), '_'), code.end());
+
+    errno = 0;
+    const long parsed = std::strtol(code.c_str(), nullptr, base);
+    // ERANGE covers overflow of long; the second test covers platforms where long is wider than int.
+    const int code_point =
+        (errno == ERANGE || parsed > INT_MAX) ? INT_MAX : static_cast<int>(parsed);
+
+    yyout << unicode_to_utf8(code_point);
}
void JSTokenizer::char_code_no_match()
"interface void while delete export package";
static const char all_patterns_buf4[] = // literal samples: regex, keyword literals, numbers (separators, radix prefixes, BigInt 'n' suffix), strings, template
- "/regex/g undefined null true false 2 23 2.3 2.23 .2 .02 4. +2 -2 "
- "+3.3 -3.3 +23 -32 2.3E45 3.E34 -2.3E45 -3.E34 +2.3E45 +3.E34 0x1234 0XFFFF Infinity "
- "\xE2\x88\x9E NaN \"\" \"double string\" \"d\" '' 'single string' 's' x=/regex/gs "
- "x=2/2/1 `\ntemplate\n`";
+ "/regex/g undefined null true false 2 23 2_3 2.3 2.23 2.2_3 .2 .02 .0_2 4. +2 -2 "
+ "+3.3 -3.3 +23 -32 2.3E45 2.3E4_5 3.E34 -2.3E45 -3.E34 +2.3E45 +3.E34 0b101 0B111 0o357 0O777 "
+ "0373 0x1234 0XFFFF 123n 0b101n 0o123n 0xaffn Infinity \xE2\x88\x9E NaN \"\" \"double string\" "
+ "\"d\" '' 'single string' 's' x=/regex/gs x=2/2/1 `\ntemplate\n`";
static const char all_patterns_expected4[] = // presumably the expected output for all_patterns_buf4: same text, except the space before a '+'/'-' signed number is absent
- "/regex/g undefined null true false 2 23 2.3 2.23 .2 .02 4.+2-2"
- "+3.3-3.3+23-32 2.3E45 3.E34-2.3E45-3.E34+2.3E45+3.E34 0x1234 0XFFFF Infinity "
- "\xE2\x88\x9E NaN \"\" \"double string\" \"d\" '' 'single string' 's' x=/regex/gs "
- "x=2/2/1 `\ntemplate\n`";
+ "/regex/g undefined null true false 2 23 2_3 2.3 2.23 2.2_3 .2 .02 .0_2 4.+2-2"
+ "+3.3-3.3+23-32 2.3E45 2.3E4_5 3.E34-2.3E45-3.E34+2.3E45+3.E34 0b101 0B111 0o357 0O777 0373 "
+ "0x1234 0XFFFF 123n 0b101n 0o123n 0xaffn Infinity \xE2\x88\x9E NaN \"\" \"double string\" "
+ "\"d\" '' 'single string' 's' x=/regex/gs x=2/2/1 `\ntemplate\n`";
static const char all_patterns_buf5[] =
"$2abc _2abc abc $__$ 肖晗 XÆA12 \\u0041abc \\u00FBdef \\u1234ghi ab\xE2\x80\xA8ww "
}
}
+TEST_CASE("split in integer literal", "[JSNormalizer]") // each SECTION cuts one numeric literal into 2 or 3 pieces (dat*); exp1..expN look like the expected text per piece and exp the expected text for the whole input - NOTE(review): NORMALIZE_n / NORM_COMBINED_n are defined elsewhere, confirm
+{
+ SECTION("1 2;") // decimal digits split: the pieces are expected to join into "12"
+ {
+ const char dat1[] = "1";
+ const char dat2[] = "2;";
+ const char exp1[] = "1";
+ const char exp2[] = "12;";
+ const char exp[] = "12;";
+
+ NORMALIZE_2(dat1, dat2, exp1, exp2);
+ NORM_COMBINED_2(dat1, dat2, exp);
+ }
+ SECTION("0 b01;") // split between the leading 0 and the binary prefix letter
+ {
+ const char dat1[] = "0";
+ const char dat2[] = "b01;";
+ const char exp1[] = "0";
+ const char exp2[] = "0b01;";
+ const char exp[] = "0b01;";
+
+ NORMALIZE_2(dat1, dat2, exp1, exp2);
+ NORM_COMBINED_2(dat1, dat2, exp);
+ }
+ SECTION("0o 12;") // "0o" without digits is expected as "0 o"; once the digits arrive the expectation is "0o12"
+ {
+ const char dat1[] = "0o";
+ const char dat2[] = "12;";
+ const char exp1[] = "0 o";
+ const char exp2[] = "0o12;";
+ const char exp[] = "0o12;";
+
+ NORMALIZE_2(dat1, dat2, exp1, exp2);
+ NORM_COMBINED_2(dat1, dat2, exp);
+ }
+ SECTION("0 12;") // leading zero followed by octal digits (legacy octal form)
+ {
+ const char dat1[] = "0";
+ const char dat2[] = "12;";
+ const char exp1[] = "0";
+ const char exp2[] = "012;";
+ const char exp[] = "012;";
+
+ NORMALIZE_2(dat1, dat2, exp1, exp2);
+ NORM_COMBINED_2(dat1, dat2, exp);
+ }
+ SECTION("01 9;") // octal-looking "01" followed by the non-octal digit 9
+ {
+ const char dat1[] = "01";
+ const char dat2[] = "9;";
+ const char exp1[] = "01";
+ const char exp2[] = "019;";
+ const char exp[] = "019;";
+
+ NORMALIZE_2(dat1, dat2, exp1, exp2);
+ NORM_COMBINED_2(dat1, dat2, exp);
+ }
+ SECTION(". 12;") // split right after the leading decimal point
+ {
+ const char dat1[] = ".";
+ const char dat2[] = "12;";
+ const char exp1[] = ".";
+ const char exp2[] = ".12;";
+ const char exp[] = ".12;";
+
+ NORMALIZE_2(dat1, dat2, exp1, exp2);
+ NORM_COMBINED_2(dat1, dat2, exp);
+ }
+ SECTION("0 x 12;") // three pieces: the lone "x" is expected as " x", the last piece as the joined "0x12;"
+ {
+ const char dat1[] = "0";
+ const char dat2[] = "x";
+ const char dat3[] = "12;";
+ const char exp1[] = "0";
+ const char exp2[] = " x";
+ const char exp3[] = "0x12;";
+ const char exp[] = "0x12;";
+
+ NORMALIZE_3(dat1, dat2, dat3, exp1, exp2, exp3);
+ NORM_COMBINED_3(dat1, dat2, dat3, exp);
+ }
+ SECTION("1 _ 2;") // digit separator delivered as its own piece: expected as " _" until the next digit arrives
+ {
+ const char dat1[] = "1";
+ const char dat2[] = "_";
+ const char dat3[] = "2;";
+ const char exp1[] = "1";
+ const char exp2[] = " _";
+ const char exp3[] = "1_2;";
+ const char exp[] = "1_2;";
+
+ NORMALIZE_3(dat1, dat2, dat3, exp1, exp2, exp3);
+ NORM_COMBINED_3(dat1, dat2, dat3, exp);
+ }
+ SECTION("1 E 2;") // exponent marker delivered as its own piece: expected as " E" until the exponent digits arrive
+ {
+ const char dat1[] = "1";
+ const char dat2[] = "E";
+ const char dat3[] = "2;";
+ const char exp1[] = "1";
+ const char exp2[] = " E";
+ const char exp3[] = "1E2;";
+ const char exp[] = "1E2;";
+
+ NORMALIZE_3(dat1, dat2, dat3, exp1, exp2, exp3);
+ NORM_COMBINED_3(dat1, dat2, dat3, exp);
+ }
+}
+
TEST_CASE("split and continuation combined", "[JSNormalizer]")
{
SECTION("PDU 1 [cont] PDU 2 [end end cont end]")
"'\u0020 \u00EB \u0123 \u4567 \u89aA \ubBcC \u00dD \ueEfF'"
);
+ SECTION("binary")
+ {
+ test_normalization(
+ "String.fromCharCode(0b1, 0B1100 ,0b11101011, 0B000101011001, 0b0001101010000101)",
+ "'\u0001\u000c\u00EB\u0159\u1a85'"
+ );
+ test_normalization(
+ "String.fromCharCode(0b10000000000000000)",
+ "'\xf0\x90\x80\x80'"
+ );
+ test_normalization(
+ "String.fromCharCode(0B10000000000000000)",
+ "'\xf0\x90\x80\x80'"
+ );
+ test_normalization(
+ "String.fromCodePoint(0b1000000000000000000000)",
+ "'\xf7\xbf\xbf\xbf'"
+ );
+ test_normalization(
+ "String.fromCodePoint(0B1000000000000000000000)",
+ "'\xf7\xbf\xbf\xbf'"
+ );
+ }
+
+ SECTION("octal")
+ {
+ test_normalization(
+ "String.fromCharCode(0O1, 014 ,0o353, 0O531, 0o15205)",
+ "'\u0001\u000c\u00EB\u0159\u1a85'"
+ );
+ test_normalization(
+ "String.fromCharCode(0o200000)",
+ "'\xf0\x90\x80\x80'"
+ );
+ test_normalization(
+ "String.fromCharCode(0O200000)",
+ "'\xf0\x90\x80\x80'"
+ );
+ test_normalization(
+ "String.fromCharCode(0200000)",
+ "'\xf0\x90\x80\x80'"
+ );
+ test_normalization(
+ "String.fromCodePoint(0o10_000_000)",
+ "'\xf7\xbf\xbf\xbf'"
+ );
+ test_normalization(
+ "String.fromCodePoint(0O10_000_000)",
+ "'\xf7\xbf\xbf\xbf'"
+ );
+ test_normalization(
+ "String.fromCodePoint(010_000_000)",
+ "'\xf7\xbf\xbf\xbf'"
+ );
+ }
+
SECTION("decimal")
{
test_normalization(
- "String.fromCharCode(1, 12 ,235, 345, 6789, 1000, 0001)",
- "'\u0001\u000c\u00EB\u0159\u1a85\u03e8\u0001'"
+ "String.fromCharCode(1, 12 ,235, 345, 6789, 10_00, 00_09)",
+ "'\u0001\u000c\u00EB\u0159\u1a85\u03e8\u0009'"
);
test_normalization(
"String.fromCharCode(65536)",
"'\xf0\x90\x80\x80'"
);
test_normalization(
- "String.fromCodePoint(2097152)",
+ "String.fromCodePoint(209_715_2)",
"'\xf7\xbf\xbf\xbf'"
);
}
SECTION("hexadecimal")
{
test_normalization(
- "String.fromCharCode(0x0001, 0X00EB, 0x0123, 0x4567, 0x89aA, 0xbBcC, 0x00dD, 0xeEfF)",
+ "String.fromCharCode(0x0001, 0X00EB, 0x0123, 0x45_67, 0x89aA, 0xbBcC, 0x00dD, 0xe_Ef_F)",
"'\u0001\u00EB\u0123\u4567\u89aA\ubBcC\u00dD\ueEfF'"
);
test_normalization(
"'\xf0\x90\x80\x80'"
);
test_normalization(
- "String.fromCodePoint(0x200000)",
+ "String.fromCodePoint(0x200_000)",
"'\xf7\xbf\xbf\xbf'"
);
test_normalization(
- "String.fromCodePoint(0X200000)",
+ "String.fromCodePoint(0X200_000)",
"'\xf7\xbf\xbf\xbf'"
);
}
TEST_CASE("String.fromCharCode()", "[JSNormalizer]")
{
+ SECTION("binary")
+ {
+ test_normalization(
+ "String.fromCharCode(0b1100010, 0B1100001, 0b1110010)",
+ "'bar'"
+ );
+
+ test_normalization(
+ "String.fromCharCode(0B001100010, 0b001100001, 0B001110010)",
+ "'bar'"
+ );
+
+ test_normalization(
+ "String.fromCharCode(0b11_00_010, 0b00_11_00_001, 0B11_10_010)",
+ "'bar'"
+ );
+ }
+
+ SECTION("octal")
+ {
+ test_normalization(
+ "String.fromCharCode(0o142, 0o141, 0o162)",
+ "'bar'"
+ );
+
+ test_normalization(
+ "String.fromCharCode(0o00142, 0o00141, 0o00162)",
+ "'bar'"
+ );
+ test_normalization(
+ "String.fromCharCode(0o00_14_2, 0O0_0_1_4_1, 000_162)",
+ "'bar'"
+ );
+ }
+
SECTION("decimal")
{
test_normalization(
);
test_normalization(
- "String.fromCharCode(0x0062, 0x0061, 0x0072)",
+ "String.fromCharCode(0x0062, 0x00_61, 0x0072)",
"'bar'"
);
}
SECTION("mixed sequence")
{
test_normalization_mixed_encoding(
- "String.fromCharCode(98, 97, 0x72)",
- "'bar'"
+ "String.fromCharCode(0b11_00_110, 111, 0o157, 98, 0b1100001, 0x72)",
+ "'foobar'"
);
test_normalization_mixed_encoding(
- "String.fromCharCode(0x62, 97, 114)",
- "'bar'"
+ "String.fromCharCode(102 ,0b110_1111, 0o157, 0x62, 97, 0O162)",
+ "'foobar'"
);
}
}
TEST_CASE("String.fromCodePoint()", "[JSNormalizer]")
{
+ SECTION("binary")
+ {
+ test_normalization(
+ "String.fromCodePoint(0b1100010, 0b1100001, 0b1110010)",
+ "'bar'"
+ );
+
+ test_normalization(
+ "String.fromCodePoint(0b10000000001000000, 0b10000000001000001, 0b10000000001000010)",
+ "'\xf0\x90\x81\x80\xf0\x90\x81\x81\xf0\x90\x81\x82'"
+ );
+
+ test_normalization(
+ "String.fromCodePoint(0b000_1_100_010, 0B1100001, 0B111_0010)",
+ "'bar'"
+ );
+ }
+
+ SECTION("octal")
+ {
+ test_normalization(
+ "String.fromCodePoint(0o142, 0O141, 0162)",
+ "'bar'"
+ );
+
+ test_normalization(
+ "String.fromCodePoint(0200_100, 0o200101, 0O200_102)",
+ "'\xf0\x90\x81\x80\xf0\x90\x81\x81\xf0\x90\x81\x82'"
+ );
+ }
+
SECTION("decimal")
{
test_normalization(
);
test_normalization(
- "String.fromCodePoint(65600, 65601, 65602)",
+ "String.fromCodePoint(65600, 65_601, 65602)",
"'\xf0\x90\x81\x80\xf0\x90\x81\x81\xf0\x90\x81\x82'"
);
}
);
test_normalization(
- "String.fromCodePoint(0x00000062, 0x00000061, 0x00000072)",
+ "String.fromCodePoint(0x000_000_62, 0X00000061, 0x00000072)",
"'bar'"
);
SECTION("mixed sequence")
{
test_normalization_mixed_encoding(
- "String.fromCodePoint(98, 97, 0x72)",
- "'bar'"
+ "String.fromCodePoint(0b1100110, 111, 0o157, 98, 0b11_00_001, 0x72)",
+ "'foobar'"
);
test_normalization_mixed_encoding(
- "String.fromCodePoint(0x00000062, 97, 114)",
- "'bar'"
+ "String.fromCodePoint(102 ,0b1101111, 0o157, 0X000_00062, 97, 0O162)",
+ "'foobar'"
);
test_normalization_mixed_encoding(
- "String.fromCodePoint(65600, 0x10041, 65602)",
- "'\xf0\x90\x81\x80\xf0\x90\x81\x81\xf0\x90\x81\x82'"
+ "String.fromCodePoint(65600, 0x10041, 0o200102, 0B1000_0000_0010_00011)",
+ "'\xf0\x90\x81\x80\xf0\x90\x81\x81\xf0\x90\x81\x82\xf0\x90\x81\x83'"
);
test_normalization_mixed_encoding(
- "String.fromCodePoint(0x10040, 65601, 0x10042)",
- "'\xf0\x90\x81\x80\xf0\x90\x81\x81\xf0\x90\x81\x82'"
+ "String.fromCodePoint(0200_100, 65601, 0B10000000001000010, 0x10043)",
+ "'\xf0\x90\x81\x80\xf0\x90\x81\x81\xf0\x90\x81\x82\xf0\x90\x81\x83'"
);
}
}