From: Oleksandr Serhiienko <oserhiie@cisco.com>
Date: Thu, 4 Aug 2022 09:51:17 +0000 (+0300)
Subject: utils: fix JS split to reflect tokens correction and re-normalization
X-Git-Tag: 3.1.39.0~1
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=d45988e1632cfbc407989f336276af548518188a;p=thirdparty%2Fsnort3.git

utils: fix JS split to reflect tokens correction and re-normalization
---

diff --git a/src/utils/js_tokenizer.h b/src/utils/js_tokenizer.h
index 3bcb33cc4..697b76d49 100644
--- a/src/utils/js_tokenizer.h
+++ b/src/utils/js_tokenizer.h
@@ -341,7 +341,8 @@ private:
         JSToken token = UNDEFINED;          // the token before
         int orig_len = 0;                   // current token original length
         int norm_len = 0;                   // normalized length of previous tokens
-        int sc = 0;                        // current Starting Condition (0 means NOT_SET)
+        int sc = 0;                         // current Starting Condition (0 means NOT_SET)
+        int correction = 0;                 // length passed to yyless() on correction (0 means none)
     } states[JSTOKENIZER_MAX_STATES];
     int sp = 0;                             // points to the top of states
     int eof_sp = 0;                         // points to the last state before the EOF
diff --git a/src/utils/js_tokenizer.l b/src/utils/js_tokenizer.l
index 61db2e741..2a6820968 100644
--- a/src/utils/js_tokenizer.l
+++ b/src/utils/js_tokenizer.l
@@ -1702,6 +1702,9 @@ void JSTokenizer::states_correct(int take_off)
 
     bytes_read -= delta;
     state.orig_len -= delta;
+    state.correction = take_off;
+
+    yyless(take_off);
 }
 
 void JSTokenizer::states_over()
@@ -1766,6 +1769,7 @@ bool JSTokenizer::states_process()
         state.orig_len = yyleng;
         state.norm_len = yyout.rdbuf()->pubseekoff(0, std::ios_base::cur, std::ios_base::out);
         state.sc = yy_start;
+        state.correction = 0;
 
         return true;
     }
@@ -1781,6 +1785,16 @@ bool JSTokenizer::states_process()
     // Update parsing state every match
     else if (bytes_skip > 0)
     {
+        // if the state was corrected, reflect this during the parsing
+        if (auto correction = states[sp].correction)
+        {
+            auto delta = yyleng - correction;
+            bytes_skip += delta;
+            bytes_read -= delta;
+
+            yyless(correction);
+        }
+
         do { ++sp; sp %= JSTOKENIZER_MAX_STATES; }
         while (states[sp].sc == 0);
 
@@ -1804,6 +1818,7 @@ bool JSTokenizer::states_process()
         state.orig_len = yyleng;
         state.norm_len = yyout.rdbuf()->pubseekoff(0, std::ios_base::cur, std::ios_base::out);
         state.sc = yy_start;
+        state.correction = 0;
 
         return true;
     }
@@ -2316,7 +2331,6 @@ JSTokenizer::JSRet JSTokenizer::literal_regex_start()
     EXEC(do_spacing(LITERAL))
     yyout << '/';
     states_correct(1);
-    yyless(1);
     BEGIN(regex);
     set_ident_norm(true);
     regex_stack = VStack<char>();
@@ -2957,7 +2971,6 @@ void JSTokenizer::explicit_otag()
 
     // discard match of the script tag and scan again without leading '<'
     states_correct(1);
-    yyless(1);
 
     // process leading '<' as a comparison operator
     operator_comparison();
diff --git a/src/utils/test/js_normalizer_test.cc b/src/utils/test/js_normalizer_test.cc
index 7f6cd4218..c32c87e2a 100644
--- a/src/utils/test/js_normalizer_test.cc
+++ b/src/utils/test/js_normalizer_test.cc
@@ -2457,6 +2457,39 @@ TEST_CASE("split between tokens", "[JSNormalizer]")
         const char exp2[] = "<script)";
         const char exp[] = "(a<script)";
 
+        NORMALIZE_2(dat1, dat2, exp1, exp2);
+        NORM_COMBINED_2(dat1, dat2, exp);
+    }
+    SECTION("complete regex (1 parsing group) - identifier")
+    {
+        const char dat1[] = "/ss/,";
+        const char dat2[] = " a ;";
+        const char exp1[] = "/ss/,";
+        const char exp2[] = "a;";
+        const char exp[] = "/ss/,a;";
+
+        NORMALIZE_2(dat1, dat2, exp1, exp2);
+        NORM_COMBINED_2(dat1, dat2, exp);
+    }
+    SECTION("complete regex (2 parsing groups) - identifier")
+    {
+        const char dat1[] = "/\\s/,";
+        const char dat2[] = " a ;";
+        const char exp1[] = "/\\s/,";
+        const char exp2[] = "a;";
+        const char exp[] = "/\\s/,a;";
+
+        NORMALIZE_2(dat1, dat2, exp1, exp2);
+        NORM_COMBINED_2(dat1, dat2, exp);
+    }
+    SECTION("complete regex (not the first) - identifier")
+    {
+        const char dat1[] = ",/\\s/,";
+        const char dat2[] = " a ;";
+        const char exp1[] = ",/\\s/,";
+        const char exp2[] = "a;";
+        const char exp[] = ",/\\s/,a;";
+
         NORMALIZE_2(dat1, dat2, exp1, exp2);
         NORM_COMBINED_2(dat1, dat2, exp);
     }
@@ -2745,7 +2778,7 @@ TEST_CASE("split in closing tag", "[JSNormalizer]")
         const char dat2[] = "ipt";
         const char dat3[] = ">";
         const char exp1[] = "::::</scr";
-        const char exp2[] = "cript";
+        const char exp2[] = "script";
         const char exp3[] = "";
         const char exp[] = "::::";