max_template_nesting(max_template_nesting),
ident_ctx(mapper),
bytes_read(0),
+ tokens_read(0),
tmp_buf(buf),
tmp_buf_size(buf_size),
tmp_cap_size(cap_size),
yy_switch_to_buffer((YY_BUFFER_STATE)tmp_buffer);
tmp_bytes_read = bytes_read;
+ tmp_tokens_read = tokens_read;
}
void JSTokenizer::switch_to_initial()
tmp_buffer = nullptr;
bytes_read = tmp_bytes_read;
+ tmp_tokens_read = tokens_read - tmp_tokens_read;
}
// The return value of this method is used to terminate the scanner.
// It should be used to decide whether to call yyterminate().
JSTokenizer::JSRet JSTokenizer::eval_eof()
{
- // If the temporal scan buffer reaches EOF, cleanup and
- // continue with the initial one
- if (tmp_buffer)
- {
- switch_to_initial();
- return EOS;
- }
+ if (!tmp_buffer) // no temporary scan buffer is active: nothing to unwind
+ return SCRIPT_CONTINUE;
+
+ switch_to_initial(); // clears tmp_buffer; tmp_tokens_read becomes tokens_read minus the previously saved count
- // Normal termination
- return SCRIPT_CONTINUE;
+ if (tmp_tokens_read != 1 or token != IDENTIFIER) // the temporary scan is accepted only as exactly one IDENTIFIER token
+ return BAD_TOKEN;
+
+ // Remove the temporary buffer's normalization state: zero states[sp], then step sp back. NOTE(review): if sp is signed and 0 here, --sp followed by %= gives a negative index; confirm sp's type.
+ memset((void*)(states + sp), 0, sizeof(states[0]));
+ --sp;
+ sp %= JSTOKENIZER_MAX_STATES;
+
+ return EOS; // temporary buffer consumed successfully
}
JSTokenizer::JSRet JSTokenizer::do_spacing(JSToken cur_token)
bool JSTokenizer::states_process()
{
bytes_read += yyleng;
+ ++tokens_read;
// Fulfillment goes after this check only in case of split over several input scripts.
// Otherwise, new state is pushed.
bytes_in = std::max(bytes_read, bytes_in) - bytes_in;
bytes_read = 0;
+ tokens_read = 0;
return static_cast<JSTokenizer::JSRet>(r);
}
}
}
+TEST_CASE("Identifiers", "[JSNormalizer]") // identifiers written with \uXXXX and \u{X...} escapes
+{
+ SECTION("all patterns") // a lone escape in each spelling: \uXXXX and \u{...} with 4, 3, 2 and 1 hex digits
+ {
+ test_normalization(
+ "\\u0061", // U+0061 is 'a'
+ "var_0000"
+ );
+ test_normalization_bad(
+ "\\u0020", // U+0020 is a space; BAD_TOKEN is expected
+ "",
+ JSTokenizer::BAD_TOKEN
+ );
+
+ test_normalization(
+ "\\u{0061}", // 'a', four hex digits in braces
+ "var_0000"
+ );
+ test_normalization(
+ "\\u{061}", // 'a', three hex digits in braces
+ "var_0000"
+ );
+ test_normalization(
+ "\\u{61}", // 'a', two hex digits in braces
+ "var_0000"
+ );
+ test_normalization_bad(
+ "\\u{1}", // U+0001, a control character; BAD_TOKEN is expected
+ "\u0001",
+ JSTokenizer::BAD_TOKEN
+ );
+ }
+
+ SECTION("valid sequence") // "abc" with one letter written as \uXXXX at the start, middle and end
+ {
+ test_normalization(
+ " \\u0061bc ;", // \u0061 is 'a'
+ "var_0000;"
+ );
+ test_normalization(
+ " a\\u0062c ;", // \u0062 is 'b'
+ "var_0000;"
+ );
+ test_normalization(
+ " ab\\u0063 ;", // \u0063 is 'c'
+ "var_0000;"
+ );
+ }
+
+ SECTION("invalid sequence") // same three positions, but the escape is U+0020 (space)
+ {
+ test_normalization_bad(
+ " \\u0020bc ;", // escape at the start
+ "var_0000",
+ JSTokenizer::BAD_TOKEN
+ );
+ test_normalization_bad(
+ " a\\u0020c ;", // escape in the middle
+ "var_0000 var_0001",
+ JSTokenizer::BAD_TOKEN
+ );
+ test_normalization_bad(
+ " ab\\u0020 ;", // escape at the end
+ "var_0000",
+ JSTokenizer::BAD_TOKEN
+ );
+ }
+
+ SECTION("valid code point") // "abc" again, using the braced \u{XX} form
+ {
+ test_normalization(
+ " \\u{61}bc ;", // \u{61} is 'a'
+ "var_0000;"
+ );
+ test_normalization(
+ " a\\u{62}c ;", // \u{62} is 'b'
+ "var_0000;"
+ );
+ test_normalization(
+ " ab\\u{63} ;", // \u{63} is 'c'
+ "var_0000;"
+ );
+ }
+
+ SECTION("invalid code point") // braced form of U+0020 (space) at the start, middle and end
+ {
+ test_normalization_bad(
+ " \\u{20}bc ;", // escape at the start
+ "var_0000",
+ JSTokenizer::BAD_TOKEN
+ );
+ test_normalization_bad(
+ " a\\u{20}c ;", // escape in the middle
+ "var_0000 var_0001",
+ JSTokenizer::BAD_TOKEN
+ );
+ test_normalization_bad(
+ " ab\\u{20} ;", // escape at the end
+ "var_0000",
+ JSTokenizer::BAD_TOKEN
+ );
+ }
+
+ SECTION("valid dot accessor") // fully escaped names on either side of '.'
+ {
+ test_normalization(
+ "\\u0066\\u006f\\u006f.\\u0062\\u0061\\u0072 ;", // spells "foo.bar"
+ "var_0000.var_0001;"
+ );
+ test_normalization(
+ "console.\\u006c\\u006f\\u0067 ;", // spells "console.log"; expected output keeps both names
+ "console.log;"
+ );
+ test_normalization(
+ "\\u0066\\u006f\\u006f.\\u006a\\u006f\\u0069\\u006e ;", // spells "foo.join"; expected output keeps "join"
+ "var_0000.join;"
+ );
+ }
+
+ SECTION("invalid dot accessor") // as above, with one escape replaced by U+0020 (space)
+ {
+ test_normalization_bad(
+ "\\u0066\\u006f\\u006f.\\u0020\\u0061\\u0072 ;", // "foo." then a space escape in place of 'b'
+ "var_0000.var_0001",
+ JSTokenizer::BAD_TOKEN
+ );
+ test_normalization_bad(
+ "\\u0066\\u0020\\u006f.\\u0062\\u0061\\u0072 ;", // space escape inside the name before ".bar"
+ "var_0000 var_0001",
+ JSTokenizer::BAD_TOKEN
+ );
+ test_normalization_bad(
+ "console.\\u006c\\u0020\\u0067 ;", // "console.l", a space escape, then 'g'
+ "console.l var_0000",
+ JSTokenizer::BAD_TOKEN
+ );
+ test_normalization_bad(
+ "\\u0066\\u0020\\u006f.\\u006a\\u006f\\u0069\\u006e ;", // space escape inside the name before ".join"
+ "var_0000 var_0001",
+ JSTokenizer::BAD_TOKEN
+ );
+ }
+}
+
TEST_CASE("Split", "[JSNormalizer]")
{
SECTION("unescape()")
{ "114)", "'bar'" }
});
}
+
+ SECTION("identifier") // one escaped identifier delivered in several chunks; presumably each pair is {input chunk, expected output} (confirm against the helper)
+ {
+ test_normalization({
+ { "\\u0062", "var_0000" }, // 'b'
+ { "\\u0061\\u0072", "var_0001" } // "ar"; together the chunks spell "bar"
+ });
+ test_normalization({
+ { "\\u{62}", "var_0000" }, // 'b', braced form
+ { "\\u{61}\\u{72}", "var_0001" } // "ar", braced form
+ });
+ test_normalization({
+ { "\\u0062", "var_0000" }, // 'b', \uXXXX form
+ { "\\u{61}\\u{72}", "var_0001" } // "ar", braced form
+ });
+ test_normalization({
+ { "\\u{62}", "var_0000" }, // 'b', braced form
+ { "\\u0061\\u0072", "var_0001" } // "ar", \uXXXX form
+ });
+ test_normalization({
+ { "\\u{63}\\u{6f}\\u{6e}", "var_0000" }, // "con"
+ { "\\u{73}\\u{6f}\\u{6c}\\u{65}", "console" } // "sole"; together the chunks spell "console"
+ });
+ test_normalization({
+ { "\\u0062", "var_0000" }, // 'b'
+ { "\\u0061", "var_0001" }, // 'a'
+ { "\\u0072", "var_0002" } // 'r'; three chunks spelling "bar"
+ });
+ }
}
TEST_CASE("Mixed input", "[JSNormalizer]")
SECTION("identifier")
{
+ test_normalization(
+ "\\u0062\\u{61}\\u0072",
+ "var_0000"
+ );
+ test_normalization(
+ "\\u{62}\\u0061\\u{72}",
+ "var_0000"
+ );
test_normalization(
"unescape ( f(\"A\\u20B\\u20C\"), eval(\"\\u66\\u6f\\u6f\"), \"\\u66\\u6f\\u6f\" ) ;",
"var_0000(\"A\\u20B\\u20C\"),eval(\"\\u66\\u6f\\u6f\"),\"foo\";"