#include "js_identifier_ctx.h"
#include <cassert>
+#include <memory.h>
#if !defined(CATCH_TEST_BUILD) && !defined(BENCHMARK_TEST)
#include "service_inspectors/http_inspect/http_enum.h"
};
#endif // CATCH_TEST_BUILD
-#define MAX_LAST_NAME 65535
-#define HEX_DIGIT_MASK 15
+#define NORM_NAME_SIZE 9 // size of the normalized form plus null symbol
+#define NORM_NAME_CNT 65536
-static const char hex_digits[] =
-{
- '0', '1','2','3', '4', '5', '6', '7', '8','9', 'a', 'b', 'c', 'd', 'e', 'f'
-};
+static char norm_names[NORM_NAME_SIZE * NORM_NAME_CNT];
-static inline std::string format_name(int32_t num)
+static void init_norm_names() // fills norm_names with "var_0000" .. "var_ffff", one NUL-terminated entry per index; later calls return at once
{
- std::string name("var_");
- name.reserve(8);
- name.push_back(hex_digits[(num >> 12) & HEX_DIGIT_MASK]);
- name.push_back(hex_digits[(num >> 8) & HEX_DIGIT_MASK]);
- name.push_back(hex_digits[(num >> 4) & HEX_DIGIT_MASK]);
- name.push_back(hex_digits[num & HEX_DIGIT_MASK]);
-
- return name;
+ static bool once = false; // NOTE(review): plain bool guard with no synchronization - confirm the first call cannot race with another caller
+
+ if (once)
+ return;
+
+ once = true;
+
+ char* c = norm_names; // write cursor, advanced one byte per store below
+ const char hex[] = "0123456789abcdef";
+
+ for (int i = 0; i < NORM_NAME_CNT; ++i)
+ {
+ *c++ = 'v';
+ *c++ = 'a';
+ *c++ = 'r';
+ *c++ = '_';
+ *c++ = hex[(i >> 12) & 0xf]; // four lowercase hex digits of i, most significant first
+ *c++ = hex[(i >> 8) & 0xf];
+ *c++ = hex[(i >> 4) & 0xf];
+ *c++ = hex[(i >> 0) & 0xf];
+ *c++ = '\0'; // ninth byte of the entry, so every entry is a usable C string
+ }
+
+ assert(sizeof(norm_names) == c - norm_names); // the loop must have written the whole table, no more and no less
}
JSIdentifierCtx::JSIdentifierCtx(int32_t depth, uint32_t max_scope_depth,
- const std::unordered_set<std::string>& ignored_ids)
- : ignored_ids(ignored_ids), depth(depth), max_scope_depth(max_scope_depth)
+ const std::unordered_set<std::string>& ignore_list)
+ : ignore_list(ignore_list), max_scope_depth(max_scope_depth)
{
+ init_norm_names(); // the shared "var_xxxx" table must be filled before pointers into it are handed out
+
+ memset(id_fast, 0, sizeof(id_fast)); // no single-character identifier is mapped yet
+ norm_name = norm_names; // next unused normalized name
+ norm_name_end = norm_names + NORM_NAME_SIZE * std::min(depth, NORM_NAME_CNT); // at most `depth` names, capped by the table size
scopes.emplace_back(JSProgramScopeType::GLOBAL);
+ // Pre-map every ignored identifier to itself: the stored pointers refer to the strings owned by ignore_list
+ for (const auto& iid : ignore_list)
+ if (iid.length() == 1)
+ id_fast[static_cast<unsigned char>(iid[0])] = iid.c_str(); // index by byte value; a cast to unsigned would sign-extend a negative char past the 256-entry table
+ else
+ id_names[iid] = iid.c_str();
}
-const char* JSIdentifierCtx::substitute(const char* identifier)
+const char* JSIdentifierCtx::substitute(unsigned char c) // single-character identifiers: direct table lookup keyed by the byte value
{
- const auto it = ident_names.find(identifier);
- if (it != ident_names.end())
- return it->second.c_str();
+ auto p = id_fast[c];
+ if (p)
+ return p; // already mapped, either to an ignored name or to an earlier normalized name
+
+ if (norm_name >= norm_name_end)
+ return nullptr; // all names allowed for this context are used up
+
+ auto n = norm_name;
+ norm_name += NORM_NAME_SIZE; // step to the next fixed-size entry of the table
+ HttpModule::increment_peg_counts(HttpEnums::PEG_JS_IDENTIFIER);
+
+ return id_fast[c] = n; // remember the mapping and return it
+}
+
+const char* JSIdentifierCtx::substitute(const char* id_name) // returns the name to emit for id_name, or nullptr when no normalized name is left
+{
+ assert(*id_name); // an empty identifier is a caller error
+
+ if (id_name[1] == '\0')
+ return substitute(*id_name); // one-character names take the array-based overload above
+
+ const auto it = id_names.find(id_name);
+ if (it != id_names.end())
+ return it->second; // known identifier: ignored name or previously assigned normalized name
- if (ident_last_name >= depth || ident_last_name > MAX_LAST_NAME)
+ if (norm_name >= norm_name_end)
return nullptr;
- ident_names[identifier] = format_name(ident_last_name++);
+ auto n = norm_name;
+ norm_name += NORM_NAME_SIZE; // consume one entry of the shared table
HttpModule::increment_peg_counts(HttpEnums::PEG_JS_IDENTIFIER);
- return ident_names[identifier].c_str();
+
+ return id_names[id_name] = n; // record the new mapping and return the normalized name
}
-bool JSIdentifierCtx::is_ignored(const char* identifier) const
+bool JSIdentifierCtx::is_ignored(const char* id_name) const // expects a pointer returned by substitute(), not an arbitrary identifier string
{
- return ignored_ids.count(identifier);
+ return id_name < norm_names || // true when the pointer lies outside the norm_names table, i.e. it is not a generated "var_xxxx" name
+ id_name >= norm_names + NORM_NAME_SIZE * NORM_NAME_CNT; // NOTE(review): a null pointer presumably also reports true - confirm callers never pass substitute()'s overflow result
}
bool JSIdentifierCtx::scope_push(JSProgramScopeType t)
void JSIdentifierCtx::reset()
{
- ident_last_name = 0;
-
- ident_names.clear();
+ memset(id_fast, 0, sizeof(id_fast)); // forget every single-character mapping
+ norm_name = norm_names; // hand out normalized names from the start of the table again; norm_name_end is left as set by the constructor
+ id_names.clear();
scopes.clear();
scopes.emplace_back(JSProgramScopeType::GLOBAL);
+ // Re-map every ignored identifier to itself: the stored pointers refer to the strings owned by ignore_list
+ for (const auto& iid : ignore_list)
+ if (iid.length() == 1)
+ id_fast[static_cast<unsigned char>(iid[0])] = iid.c_str(); // index by byte value; a cast to unsigned would sign-extend a negative char past the 256-entry table
+ else
+ id_names[iid] = iid.c_str();
}
void JSIdentifierCtx::add_alias(const char* alias, const std::string&& value)
{
assert(alias);
assert(!scopes.empty());
- scopes.back().add_alias(alias, std::move(value));
-}
-const char* JSIdentifierCtx::alias_lookup(const char* alias) const
-{
- assert(alias);
+ auto& a = aliases[alias]; // per-alias stack of values; the entry is created on first use
+ a.emplace_back(std::move(value)); // the newest value goes on top and is what alias_lookup() reports
- for (auto it = scopes.rbegin(); it != scopes.rend(); ++it)
- {
- if (const char* value = it->get_alias_value(alias))
- return value;
- }
- return nullptr;
+ scopes.back().reference(a); // the innermost scope records the stack so its destructor pops this value again
}
-void JSIdentifierCtx::ProgramScope::add_alias(const char* alias, const std::string&& value)
+const char* JSIdentifierCtx::alias_lookup(const char* alias) const // returns the most recently added value for alias, or nullptr when there is none
{
assert(alias);
- aliases[alias] = value;
-}
-const char* JSIdentifierCtx::ProgramScope::get_alias_value(const char* alias) const
-{
- assert(alias);
+ const auto& i = aliases.find(alias); // find() does not insert, so unknown aliases leave the map untouched
- const auto it = aliases.find(alias);
- if (it != aliases.end())
- return it->second.c_str();
- else
- return nullptr;
+ return i != aliases.end() && !i->second.empty() // an entry whose value stack is empty counts as "no alias"
+ ? i->second.back().c_str() : nullptr; // top of the stack is the value added last
}
// advanced program scope access for testing
for(const auto& scope:scopes)
{
return_list.push_back(scope.type());
- }
- return return_list;
-}
-
-bool JSIdentifierCtx::scope_contains(size_t pos, const char* alias) const
-{
- size_t offset = 0;
- for (auto it = scopes.begin(); it != scopes.end(); ++it, ++offset)
- {
- if (offset == pos)
- return it->get_alias_value(alias);
}
- assert(false);
- return false;
+ return return_list;
}
#endif // CATCH_TEST_BUILD
-
#include <string>
#include <unordered_map>
#include <unordered_set>
+#include <vector>
enum JSProgramScopeType : unsigned int
{
{
public:
JSIdentifierCtx(int32_t depth, uint32_t max_scope_depth,
- const std::unordered_set<std::string>& ignored_ids);
+ const std::unordered_set<std::string>& ignore_list);
virtual const char* substitute(const char* identifier) override;
virtual void add_alias(const char* alias, const std::string&& value) override;
virtual size_t size() const override
{ return (sizeof(JSIdentifierCtx) + (sizeof(std::string) * 2 * 500) +
(sizeof(ProgramScope) * 3)); }
+
private:
+ using Alias = std::vector<std::string>;
+ using AliasRef = std::list<Alias*>;
+ using AliasMap = std::unordered_map<std::string, Alias>;
+ using NameMap = std::unordered_map<std::string, const char*>;
+
class ProgramScope
{
public:
- ProgramScope(JSProgramScopeType t) : t(t) {}
+ ProgramScope(JSProgramScopeType t) : t(t) // a scope starts with no alias values registered
+ {}
- void add_alias(const char* alias, const std::string&& value);
- const char* get_alias_value(const char* alias) const;
+ ~ProgramScope() // NOTE(review): copies would share to_remove and pop twice - confirm scopes are only ever constructed in place
+ { for (auto a : to_remove) a->pop_back(); } // one pop per reference() call made on this scope
+
+ void reference(Alias& a) // remember a value stack that received a value while this scope was the innermost one
+ { to_remove.push_back(&a); }
JSProgramScopeType type() const
{ return t; }
+
private:
- std::unordered_map<std::string, std::string> aliases;
JSProgramScopeType t;
+ AliasRef to_remove{}; // non-owning pointers to value stacks; the stacks themselves are passed in by reference
};
+ inline const char* substitute(unsigned char c);
+
+ // do not swap next two lines, the destructor frees them in the reverse order
+ AliasMap aliases;
std::list<ProgramScope> scopes;
- std::unordered_map<std::string, std::string> ident_names;
- const std::unordered_set<std::string>& ignored_ids;
- int32_t ident_last_name = 0;
- int32_t depth;
+ const char* id_fast[256];
+ NameMap id_names;
+ const std::unordered_set<std::string>& ignore_list;
+
+ const char* norm_name;
+ const char* norm_name_end;
uint32_t max_scope_depth;
// advanced program scope access for testing
-#ifdef CATCH_TEST_BUILD
+#if defined(CATCH_TEST_BUILD) || defined(BENCHMARK_TEST)
public:
// compare scope list with the passed pattern
bool scope_check(const std::list<JSProgramScopeType>& compare) const;
const std::list<JSProgramScopeType> get_types() const;
- bool scope_contains(size_t pos, const char* alias) const;
#endif // CATCH_TEST_BUILD
};
#endif // JS_IDENTIFIER_CTX
-
LITERAL_DQ_STRING_START \"
LITERAL_DQ_STRING_END \"
LITERAL_DQ_STRING_SKIP \\\"
-LITERAL_DQ_STRING_TEXT .
+LITERAL_DQ_STRING_TEXT [^\"\\\xA\xD\{0x10}(\xE2\x80\xA8)(\xE2\x80\xA9)("<"+(?i:script))("<"+(?i:\/script>))]{1,32}
LITERAL_SQ_STRING_START \'
LITERAL_SQ_STRING_END \'
LITERAL_SQ_STRING_SKIP \\\'
-LITERAL_SQ_STRING_TEXT .
+LITERAL_SQ_STRING_TEXT [^\'\\\xA\xD\{0x10}(\xE2\x80\xA8)(\xE2\x80\xA9)("<"+(?i:script))("<"+(?i:\/script>))]{1,32}
LITERAL_TEMPLATE_START \`
LITERAL_TEMPLATE_END \`
LITERAL_TEMPLATE_SUBST_START \$\{
-LITERAL_TEMPLATE_OTHER .
+LITERAL_TEMPLATE_OTHER [^\\\`(\$\{)("<"+(?i:\/script>))]{1,32}
LITERAL_REGEX_START \/[^*\/]
LITERAL_REGEX_END \/[gimsuy]*
LITERAL_REGEX_SKIP \\\/
yyout << lexeme;
return EOS;
}
- else
- set_ident_norm(true);
- if (ident_ctx.is_ignored(lexeme) && !id_part)
+ set_ident_norm(true);
+
+ const char* name = ident_ctx.substitute(lexeme);
+
+ if (!name)
{
- ignored_id_pos = yyout.rdbuf()->pubseekoff(0, yyout.cur, std::ios_base::out);
- set_ident_norm(false);
- yyout << lexeme;
- return EOS;
+ debug_logf(6, http_trace, TRACE_JS_DUMP, nullptr,
+ "'%s' => IDENTIFIER_OVERFLOW\n", lexeme);
+ return IDENTIFIER_OVERFLOW;
+ }
+
+ if (ident_ctx.is_ignored(name))
+ {
+ if (id_part)
+ {
+ std::string n(name);
+ n.push_back('+'); // any illegal symbol as a part of ID name
+ name = ident_ctx.substitute(n.c_str());
+ }
+ else
+ {
+ ignored_id_pos = yyout.rdbuf()->pubseekoff(0, yyout.cur, std::ios_base::out);
+ set_ident_norm(false);
+ yyout << name;
+ return EOS;
+ }
}
- const char *ident = nullptr;
- if (!id_part)
- ident = ident_ctx.alias_lookup(lexeme);
- if (ident)
+ const char* alias = id_part ? nullptr : ident_ctx.alias_lookup(lexeme);
+
+ if (alias)
{
set_ident_norm(false);
ignored_id_pos = yyout.rdbuf()->pubseekoff(0, yyout.cur, std::ios_base::out);
last_dealiased = std::string(YYText());
dealias_stored = true;
- }
- else
- {
- ignored_id_pos = -1;
- ident = ident_ctx.substitute(lexeme);
- }
+ yyout << alias;
- if (!ident)
- {
debug_logf(6, http_trace, TRACE_JS_DUMP, nullptr,
- "'%s' => IDENTIFIER_OVERFLOW\n", lexeme);
-
- return IDENTIFIER_OVERFLOW;
+ "'%s' => '%s'\n", lexeme, alias);
+ return EOS;
}
- debug_logf(6, http_trace, TRACE_JS_DUMP, nullptr,
- "'%s' => '%s'\n", lexeme, ident);
-
- yyout << ident;
+ ignored_id_pos = -1;
+ yyout << name;
+ debug_logf(6, http_trace, TRACE_JS_DUMP, nullptr,
+ "'%s' => '%s'\n", lexeme, name);
return EOS;
}
bool JSTokenizer::states_process()
{
- if (!yyleng)
- return true;
-
bytes_read += yyleng;
// Fulfillment goes after this check only in case of split over several input scripts.
bytes_skip = bytes_skip - yyleng;
+ // Continue normalization from the last state without any changes
+ if (bytes_skip == 0) // this match consumed exactly the remaining bytes to skip
+ {
+ token = eof_token; // restore the token and start condition kept in eof_token / eof_sc
+ yy_start = eof_sc;
+ }
+ // Update parsing state every match
+ else if (bytes_skip > 0) // still inside the bytes to skip: false is returned below
+ {
+ do { ++sp; sp %= JSTOKENIZER_MAX_STATES; } // advance circularly through the states array
+ while (states[sp].sc == 0); // slots whose sc is 0 are passed over
+
+ auto& state = states[sp];
+ token = state.token; // replay the token and start condition stored in that slot
+ yy_start = state.sc;
+ }
// Ignore normalization till all the already normalized bytes are skipped or mismatch found.
// If mismatch found, adjust normalization state and renormalize from the mismatch point.
- if (bytes_skip < 0)
+ else // bytes_skip < 0: the match ran past the bytes to skip; this is the only path that returns true
{
bytes_skip = 0;
states_adjust();
return true;
}
- // Otherwise, continue normalization from the last state without any changes
- else if (bytes_skip == 0)
- {
- token = eof_token;
- yy_start = eof_sc;
- }
- // Meanwhile, update parsing state every match
- else
- {
- do { ++sp; sp %= JSTOKENIZER_MAX_STATES; }
- while (states[sp].sc == 0);
-
- auto& state = states[sp];
- token = state.token;
- yy_start = state.sc;
- }
return false;
}
case IDENTIFIER:
{
FuncType ret = FuncType::GENERAL;
- if (ignored_id_pos >= 0)
+ if (ignored_id_pos >= 0)
{
std::streambuf* pbuf = yyout.rdbuf();
std::streamsize size = pbuf->pubseekoff(0, yyout.cur, yyout.out) - ignored_id_pos;
assert(size >= 0);
-
+
char tail[256];
assert((size_t)size <= sizeof(tail));
size = std::min((size_t)size, sizeof(tail));
void JSTokenizer::dealias_identifier(bool id_part, bool assignment_start)
{
auto lexeme = YYText();
- switch(alias_state)
+
+ switch (alias_state) // advances the alias-tracking state for the identifier just matched
{
- case ALIAS_NONE:
+ case ALIAS_NONE:
+ if (assignment_start) // only an identifier that starts an assignment opens a new alias definition
{
- if (assignment_start)
- {
- alias = std::string(YYText());
- aliased.clear();
- aliased.str("");
- alias_state = ALIAS_DEFINITION;
- }
- break;
+ alias = std::string(YYText()); // the identifier being assigned to
+ aliased.clear(); // presumably a string stream: clear() resets its state flags - TODO confirm
+ aliased.str(""); // drop any text collected for an earlier alias
+ alias_state = ALIAS_DEFINITION;
}
- case ALIAS_PREFIX:
- case ALIAS_DEFINITION:
- {
+ break;
+ case ALIAS_PREFIX:
+ case ALIAS_DEFINITION:
+ dealias_reset(); // an identifier in these states presumably abandons the pending alias - see dealias_reset()
+ break;
+ case ALIAS_EQUALS:
+ alias_state = ALIAS_VALUE; // first identifier after '=': it is handled as part of the value below
+ // fallthrough
+ case ALIAS_VALUE:
+ {
+ auto dealias = ident_ctx.alias_lookup(lexeme); // value of an existing alias with this name, or nullptr
+ if ((!ident_norm() && id_part) ||
+ (ident_ctx.is_ignored(ident_ctx.substitute(lexeme)) && !id_part)) // NOTE(review): substitute() can assign a new name or return nullptr on overflow - confirm this is intended for a pure check
+ aliased << YYText(); // kept verbatim in the collected value
+ else if (dealias)
+ aliased << dealias; // nested alias: append the value it stands for
+ else
dealias_reset();
- break;
- }
- case ALIAS_EQUALS:
- alias_state = ALIAS_VALUE;
- // fallthrough
- case ALIAS_VALUE:
- {
- auto dealias = ident_ctx.alias_lookup(lexeme);
- if ((ident_ctx.is_ignored(lexeme) && !id_part) || (!ident_norm() && id_part))
- aliased << YYText();
- else if (dealias)
- aliased << dealias;
- else
- dealias_reset();
- break;
- }
+ break;
+ }
}
}
bytes_read = 0;
return static_cast<JSTokenizer::JSRet>(r);
-}
\ No newline at end of file
+}
{
JSIdentifierCtx ident_ctx(DEPTH, SCOPE_DEPTH, s_ignored_ids);
- CHECK(ident_ctx.is_ignored("console") == true);
- CHECK(ident_ctx.is_ignored("foo") == false);
+ auto v1 = ident_ctx.substitute("console");
+ auto v2 = ident_ctx.substitute("foo");
+
+ CHECK(ident_ctx.is_ignored(v1) == true);
+ CHECK(ident_ctx.is_ignored(v2) == false);
}
TEST_CASE("JSIdentifierCtx::scopes", "[JSIdentifierCtx]")
{
ident_ctx.add_alias("a", "console.log");
ident_ctx.add_alias("b", "document");
- CHECK(ident_ctx.scope_contains(0, "a"));
- CHECK(ident_ctx.scope_contains(0, "b"));
CHECK(!strcmp(ident_ctx.alias_lookup("a"), "console.log"));
CHECK(!strcmp(ident_ctx.alias_lookup("b"), "document"));
REQUIRE(ident_ctx.scope_push(JSProgramScopeType::FUNCTION));
ident_ctx.add_alias("a", "document");
- CHECK(ident_ctx.scope_contains(1, "a"));
- CHECK(!ident_ctx.scope_contains(1, "b"));
CHECK(!strcmp(ident_ctx.alias_lookup("a"), "document"));
CHECK(!strcmp(ident_ctx.alias_lookup("b"), "document"));
REQUIRE(ident_ctx.scope_push(JSProgramScopeType::BLOCK));
ident_ctx.add_alias("b", "console.log");
- CHECK(ident_ctx.scope_contains(2, "b"));
- CHECK(!ident_ctx.scope_contains(2, "a"));
CHECK(!strcmp(ident_ctx.alias_lookup("b"), "console.log"));
CHECK(!strcmp(ident_ctx.alias_lookup("a"), "document"));
REQUIRE(ident_ctx.scope_pop(JSProgramScopeType::BLOCK));
REQUIRE(ident_ctx.scope_pop(JSProgramScopeType::FUNCTION));
ident_ctx.add_alias("a", "eval");
- CHECK(ident_ctx.scope_contains(0, "a"));
- CHECK(ident_ctx.scope_contains(0, "b"));
CHECK(!strcmp(ident_ctx.alias_lookup("a"), "eval"));
CHECK(!strcmp(ident_ctx.alias_lookup("b"), "document"));
SECTION("ignored fake defined function identifier")
{
const std::unordered_set<std::string> s_ignored_ids_fake {"fake_unescape"};
- JSTokenizerTester tester_fake(norm_depth, max_scope_depth, s_ignored_ids_fake,
+ JSTokenizerTester tester_fake(norm_depth, max_scope_depth, s_ignored_ids_fake,
max_template_nesting, max_bracket_depth);
tester_fake.test_function_scopes({
{"fake_unescape(", "fake_unescape(", {FuncType::NOT_FUNC, FuncType::GENERAL}}