From: Mike Stepanek (mstepane) <mstepane@cisco.com>
Date: Fri, 1 Oct 2021 16:57:06 +0000 (+0000)
Subject: Merge pull request #3081 in SNORT/snort3 from ~VHORBATO/snort3:rename_norm_ident... 
X-Git-Tag: 3.1.14.0~8
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=4c84efc6e58b938166d71389a3bf8d07b994eecf;p=thirdparty%2Fsnort3.git

Merge pull request #3081 in SNORT/snort3 from ~VHORBATO/snort3:rename_norm_ident to master

Squashed commit of the following:

commit 613865899894440d15e9cb49ba6a76b1cb790688
Author: Vitalii <vhorbato@cisco.com>
Date:   Mon Sep 27 09:49:16 2021 +0300

    http_inspect: change format of normalized JS identifiers

    utils: adjust output streambuffer expanding strategy and reserved memory
---

diff --git a/src/service_inspectors/http_inspect/dev_notes.txt b/src/service_inspectors/http_inspect/dev_notes.txt
index 707835373..c33a6e024 100755
--- a/src/service_inspectors/http_inspect/dev_notes.txt
+++ b/src/service_inspectors/http_inspect/dev_notes.txt
@@ -232,9 +232,9 @@ So, the following whitespace codes will be normalized:
  * Also including new-line and carriage-return line-break characters
 
 All JavaScript identifier names will be substituted to unified names with the
-following format: a0 -> z9999. So, the number of unique identifiers available
-is 260000 names per HTTP transaction. If Normalizer overruns the configured
-limit, built-in alert generated. Additionaly, there is a config option to
+following format: var_0000 -> var_ffff. So, the number of unique identifiers available
+is 65536 names per HTTP transaction. If the Normalizer exceeds the configured
+limit, a built-in alert is generated. Additionally, there is a config option to
 specify the limit manually:
  * http_inspect.js_norm_identifier_depth.
 
diff --git a/src/service_inspectors/http_inspect/http_module.cc b/src/service_inspectors/http_inspect/http_module.cc
index b090fbf64..e92461950 100755
--- a/src/service_inspectors/http_inspect/http_module.cc
+++ b/src/service_inspectors/http_inspect/http_module.cc
@@ -85,8 +85,8 @@ const Parameter HttpModule::http_params[] =
       "number of input JavaScript bytes to normalize (-1 unlimited) "
       "(experimental)" },
 
-    // range of accepted identifier names is (a0:z9999), so the max is 26 * 10000 = 260000
-    { "js_norm_identifier_depth", Parameter::PT_INT, "0:260000", "260000",
+    // range of accepted identifier names is (var_0000:var_ffff), so the max is 2^16 = 65536
+    { "js_norm_identifier_depth", Parameter::PT_INT, "0:65536", "65536",
       "max number of unique JavaScript identifiers to normalize" },
 
     { "js_norm_max_tmpl_nest", Parameter::PT_INT, "0:255", "32",
diff --git a/src/utils/dev_notes.txt b/src/utils/dev_notes.txt
index 7ecc6c546..5d5d1f1b7 100644
--- a/src/utils/dev_notes.txt
+++ b/src/utils/dev_notes.txt
@@ -44,5 +44,5 @@ which could be useful for final consumer.
 
 From performance perspective, ostreambuf_infl can reserve an amount of memory
 before actual operations. Also, memory extending is done by predefined
-portions of 2^8^, 2^9^, 2^10^, 2^13^, 2^16^, 2^16^, 2^16^...
+portions of 2^11^, 2^12^, 2^13^, 2^14^, 2^15^, 2^15^, 2^15^...
 This tries to minimize the number of memory reallocation.
diff --git a/src/utils/js_identifier_ctx.cc b/src/utils/js_identifier_ctx.cc
index 3b682b490..35b2b44dd 100644
--- a/src/utils/js_identifier_ctx.cc
+++ b/src/utils/js_identifier_ctx.cc
@@ -42,44 +42,43 @@ public:
 };
 #endif // CATCH_TEST_BUILD
 
-#define FIRST_NAME_SIZE   26
-#define LAST_NAME_SIZE  9999
+#define MAX_LAST_NAME     65535
+#define HEX_DIGIT_MASK   15
 
-static const char s_ident_first_names[FIRST_NAME_SIZE] =
+static const char hex_digits[] = 
 {
-    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
-    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'
+    '0', '1','2','3', '4', '5', '6', '7', '8','9', 'a', 'b', 'c', 'd', 'e', 'f'
 };
 
+static inline std::string format_name(int32_t num) // builds "var_" + 4 lowercase hex digits taken from the low 16 bits of num
+{
+    std::string name("var_");
+    name.reserve(8); // final length: 4 prefix chars + 4 hex digits
+    name.push_back(hex_digits[(num >> 12) & HEX_DIGIT_MASK]); // bits 15..12 (most significant digit)
+    name.push_back(hex_digits[(num >> 8) & HEX_DIGIT_MASK]); // bits 11..8
+    name.push_back(hex_digits[(num >> 4) & HEX_DIGIT_MASK]); // bits 7..4
+    name.push_back(hex_digits[num & HEX_DIGIT_MASK]); // bits 3..0 (least significant digit)
+
+    return name;
+}
+
 const char* JSIdentifierCtx::substitute(const char* identifier)
 {
     const auto it = ident_names.find(identifier);
     if (it != ident_names.end())
         return it->second.c_str();
 
-    if (++ident_last_name > LAST_NAME_SIZE)
-    {
-        if (++ident_first_name > FIRST_NAME_SIZE - 1)
-            return nullptr;
-
-        ident_last_name = 0;
-    }
-
-    if (++unique_ident_cnt > depth)
+    if (ident_last_name >= depth || ident_last_name > MAX_LAST_NAME) // no new names once depth is reached or the 16-bit range (var_ffff) is used up
         return nullptr;
 
-    ident_names[identifier] = s_ident_first_names[ident_first_name]
-        + std::to_string(ident_last_name);
-
+    ident_names[identifier] = format_name(ident_last_name++); // name is built from the current counter value; the counter then advances
     HttpModule::increment_peg_counts(HttpEnums::PEG_JS_IDENTIFIER);
     return ident_names[identifier].c_str();
 }
 
 void JSIdentifierCtx::reset()
 {
-    ident_first_name = 0;
-    ident_last_name = -1;
-    unique_ident_cnt = 0;
+    ident_last_name = 0; // the next new identifier is numbered from 0 again
     ident_names.clear();
 }
 
diff --git a/src/utils/js_identifier_ctx.h b/src/utils/js_identifier_ctx.h
index 6a5add294..b69ec8679 100644
--- a/src/utils/js_identifier_ctx.h
+++ b/src/utils/js_identifier_ctx.h
@@ -46,9 +46,7 @@ public:
     { return (sizeof(JSIdentifierCtx) + (sizeof(std::string) * 2 * 500)); }
 
 private:
-    int ident_first_name = 0;
-    int ident_last_name = -1;
-    int32_t unique_ident_cnt = 0;
+    int32_t ident_last_name = 0;
     int32_t depth;
 
     std::unordered_map<std::string, std::string> ident_names;
diff --git a/src/utils/js_normalizer.cc b/src/utils/js_normalizer.cc
index eff85e64e..cca5ed758 100644
--- a/src/utils/js_normalizer.cc
+++ b/src/utils/js_normalizer.cc
@@ -23,6 +23,8 @@
 
 #include "js_normalizer.h"
 
+#define BUFF_EXP_FACTOR 1.3
+
 using namespace snort;
 using namespace std;
 
@@ -67,7 +69,7 @@ JSTokenizer::JSRet JSNormalizer::normalize(const char* src, size_t src_len)
     in_buf.pubsetbuf(nullptr, 0)
         ->pubsetbuf(tmp_buf, tmp_buf_size)
         ->pubsetbuf(const_cast<char*>(src), len);
-    out_buf.reserve(src_len);
+    out_buf.reserve(src_len * BUFF_EXP_FACTOR);
 
     JSTokenizer::JSRet ret = static_cast<JSTokenizer::JSRet>(tokenizer.yylex());
     in.clear();
diff --git a/src/utils/streambuf.cc b/src/utils/streambuf.cc
index dee1b939a..1b5e1d1b0 100644
--- a/src/utils/streambuf.cc
+++ b/src/utils/streambuf.cc
@@ -222,11 +222,11 @@ int istreambuf_glue::underflow()
 
 const ostreambuf_infl::State ostreambuf_infl::states[] =
 {
-    {states + 1, 1 << 8},
-    {states + 2, 1 << 9},
-    {states + 3, 1 << 10},
-    {states + 4, 1 << 13},
-    {states + 4, 1 << 16}
+    {states + 1, 1 << 11}, // 2048; presumably {next state, growth portion in bytes} -- confirm against State
+    {states + 2, 1 << 12}, // 4096
+    {states + 3, 1 << 13}, // 8192
+    {states + 4, 1 << 14}, // 16384
+    {states + 4, 1 << 15} // 32768; this last entry (index 4) links to itself
 };
 
 ostreambuf_infl::ostreambuf_infl() :
@@ -335,7 +335,7 @@ streamsize ostreambuf_infl::xsputn(const char* s, streamsize n)
 
     auto c_avail = epptr() - pptr();
     if (n > c_avail)
-        enlarge(n - c_avail);
+        gen.n > (n - c_avail) ? enlarge() : enlarge(n - c_avail);
 
     auto n_avail = epptr() - pptr();
     n = min(n, n_avail);
diff --git a/src/utils/test/js_identifier_ctx_test.cc b/src/utils/test/js_identifier_ctx_test.cc
index 89f025250..618bb22b9 100644
--- a/src/utils/test/js_identifier_ctx_test.cc
+++ b/src/utils/test/js_identifier_ctx_test.cc
@@ -25,19 +25,11 @@
 
 #include <cstring>
 #include <vector>
+#include <iomanip>
 
 #include "utils/js_identifier_ctx.h"
 
-#define DEPTH 260000
-
-#define FIRST_NAME_SIZE   26
-#define LAST_NAME_SIZE  9999
-
-static const char s_ident_first_names[FIRST_NAME_SIZE] =
-{
-    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
-    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'
-};
+#define DEPTH 65536
 
 TEST_CASE("JSIdentifierCtx::substitute()", "[JSIdentifierCtx]")
 {
@@ -45,26 +37,26 @@ TEST_CASE("JSIdentifierCtx::substitute()", "[JSIdentifierCtx]")
     {
         JSIdentifierCtx ident_ctx(DEPTH);
 
-        CHECK(!strcmp(ident_ctx.substitute("a"), "a0"));
-        CHECK(!strcmp(ident_ctx.substitute("a"), "a0"));
+        CHECK(!strcmp(ident_ctx.substitute("a"), "var_0000"));
+        CHECK(!strcmp(ident_ctx.substitute("a"), "var_0000"));
     }
     SECTION("different names")
     {
         JSIdentifierCtx ident_ctx(DEPTH);
 
-        CHECK(!strcmp(ident_ctx.substitute("a"), "a0"));
-        CHECK(!strcmp(ident_ctx.substitute("b"), "a1"));
-        CHECK(!strcmp(ident_ctx.substitute("a"), "a0"));
+        CHECK(!strcmp(ident_ctx.substitute("a"), "var_0000"));
+        CHECK(!strcmp(ident_ctx.substitute("b"), "var_0001"));
+        CHECK(!strcmp(ident_ctx.substitute("a"), "var_0000"));
     }
     SECTION("depth reached")
     {
         JSIdentifierCtx ident_ctx(2);
 
-        CHECK(!strcmp(ident_ctx.substitute("a"), "a0"));
-        CHECK(!strcmp(ident_ctx.substitute("b"), "a1"));
+        CHECK(!strcmp(ident_ctx.substitute("a"), "var_0000"));
+        CHECK(!strcmp(ident_ctx.substitute("b"), "var_0001"));
         CHECK(ident_ctx.substitute("c") == nullptr);
         CHECK(ident_ctx.substitute("d") == nullptr);
-        CHECK(!strcmp(ident_ctx.substitute("a"), "a0"));
+        CHECK(!strcmp(ident_ctx.substitute("a"), "var_0000"));
     }
     SECTION("max names")
     {
@@ -77,10 +69,12 @@ TEST_CASE("JSIdentifierCtx::substitute()", "[JSIdentifierCtx]")
         for (int it = 0; it < DEPTH + 2; ++it)
             n.push_back("n" + std::to_string(it));
 
-        for (int it_first = 0; it_first < FIRST_NAME_SIZE; ++it_first)
+        for (int it_name = 0; it_name < DEPTH; ++it_name)
         {
-            for (int it_last = 0; it_last <= LAST_NAME_SIZE; ++it_last)
-                e.push_back(s_ident_first_names[it_first] + std::to_string(it_last));
+            std::stringstream stream;
+            stream << std::setfill ('0') << std::setw(4) 
+                << std::hex << it_name;
+            e.push_back("var_" + stream.str());
         }
 
         for (int it = 0; it < DEPTH; ++it)
diff --git a/src/utils/test/streambuf_test.cc b/src/utils/test/streambuf_test.cc
index 65778366e..f3246b4fc 100644
--- a/src/utils/test/streambuf_test.cc
+++ b/src/utils/test/streambuf_test.cc
@@ -1442,9 +1442,9 @@ TEST_CASE("output buffer - basic", "[Stream buffers]")
         CHECK(c == 'A');
         CHECK(off_b == 0);
         CHECK(off_c == 1);
-        CHECK(off_e == 256);
+        CHECK(off_e == 2048);
 
-        EXP_RES(b, exp, 1, 256);
+        EXP_RES(b, exp, 1, 2048);
     }
 
     SECTION("put two chars")
@@ -1463,9 +1463,9 @@ TEST_CASE("output buffer - basic", "[Stream buffers]")
         CHECK(off_b == 0);
         CHECK(off_1 == 1);
         CHECK(off_2 == 2);
-        CHECK(off_e == 256);
+        CHECK(off_e == 2048);
 
-        EXP_RES(b, exp, 2, 256);
+        EXP_RES(b, exp, 2, 2048);
     }
 
     SECTION("extend buffer")
@@ -1484,9 +1484,9 @@ TEST_CASE("output buffer - basic", "[Stream buffers]")
         CHECK(c2 == 'Z');
         CHECK(off_b == 0);
         CHECK(off_1 == 1);
-        CHECK(off_2 == 257);
-        CHECK(off_e == 256);
-        CHECK(off_z == 768);
+        CHECK(off_2 == 2049);
+        CHECK(off_e == 2048);
+        CHECK(off_z == 6144);
     }
 
     SECTION("put sequence of chars")
@@ -1524,7 +1524,7 @@ TEST_CASE("output buffer - basic", "[Stream buffers]")
         CHECK(c2 == 'Z');
         CHECK(off_b == 0);
         CHECK(off_c == len + 2);
-        CHECK(off_e == 4096 + 512);
+        CHECK(off_e == 4096 + 2048);
     }
 }