utils: fix JS split to reflect token correction and re-normalization

author Oleksandr Serhiienko <oserhiie@cisco.com>

Thu, 4 Aug 2022 09:51:17 +0000 (12:51 +0300)

committer Oleksandr Serhiienko <oserhiie@cisco.com>

Tue, 9 Aug 2022 13:44:30 +0000 (16:44 +0300)
author Oleksandr Serhiienko <oserhiie@cisco.com>
Thu, 4 Aug 2022 09:51:17 +0000 (12:51 +0300)
committer Oleksandr Serhiienko <oserhiie@cisco.com>
Tue, 9 Aug 2022 13:44:30 +0000 (16:44 +0300)
diff --git a/src/utils/js_tokenizer.h b/src/utils/js_tokenizer.h

index 3bcb33cc46f9b0b0116bbf9f128ff31c34e99e2a..697b76d4944c7188f35ce73ff8161bc48c8264e6 100644 (file)
--- a/src/utils/js_tokenizer.h
+++ b/src/utils/js_tokenizer.h
@@ -341,7 +341,8 @@ private:
          JSToken token = UNDEFINED;          // the token before
          int orig_len = 0;                   // current token original length
          int norm_len = 0;                   // normalized length of previous tokens
-        int sc = 0;                        // current Starting Condition (0 means NOT_SET)
+        int sc = 0;                         // current Starting Condition (0 means NOT_SET)
+        int correction = 0;                 // corrected token length passed to yyless() (0 means no correction)
      } states[JSTOKENIZER_MAX_STATES];
      int sp = 0;                             // points to the top of states
      int eof_sp = 0;                         // points to the last state before the EOF
diff --git a/src/utils/js_tokenizer.l b/src/utils/js_tokenizer.l

index 61db2e7416072f66f342d97c7103808740e42389..2a68209689b9bb8eaf653f28b7cdac575e83722b 100644 (file)
--- a/src/utils/js_tokenizer.l
+++ b/src/utils/js_tokenizer.l
@@ -1702,6 +1702,9 @@ void JSTokenizer::states_correct(int take_off)
  
      bytes_read -= delta;
      state.orig_len -= delta;
+    state.correction = take_off;
+
+    yyless(take_off);
  }
  
  void JSTokenizer::states_over()
@@ -1766,6 +1769,7 @@ bool JSTokenizer::states_process()
          state.orig_len = yyleng;
          state.norm_len = yyout.rdbuf()->pubseekoff(0, std::ios_base::cur, std::ios_base::out);
          state.sc = yy_start;
+        state.correction = 0;
  
          return true;
      }
@@ -1781,6 +1785,16 @@ bool JSTokenizer::states_process()
      // Update parsing state every match
      else if (bytes_skip > 0)
      {
+        // if the state was corrected, reflect this during the parsing
+        if (auto correction = states[sp].correction)
+        {
+            auto delta = yyleng - correction;
+            bytes_skip += delta;
+            bytes_read -= delta;
+
+            yyless(correction);
+        }
+
          do { ++sp; sp %= JSTOKENIZER_MAX_STATES; }
          while (states[sp].sc == 0);
  
@@ -1804,6 +1818,7 @@ bool JSTokenizer::states_process()
          state.orig_len = yyleng;
          state.norm_len = yyout.rdbuf()->pubseekoff(0, std::ios_base::cur, std::ios_base::out);
          state.sc = yy_start;
+        state.correction = 0;
  
          return true;
      }
@@ -2316,7 +2331,6 @@ JSTokenizer::JSRet JSTokenizer::literal_regex_start()
      EXEC(do_spacing(LITERAL))
      yyout << '/';
      states_correct(1);
-    yyless(1);
      BEGIN(regex);
      set_ident_norm(true);
      regex_stack = VStack<char>();
@@ -2957,7 +2971,6 @@ void JSTokenizer::explicit_otag()
  
      // discard match of the script tag and scan again without leading '<'
      states_correct(1);
-    yyless(1);
  
      // process leading '<' as a comparison operator
      operator_comparison();
diff --git a/src/utils/test/js_normalizer_test.cc b/src/utils/test/js_normalizer_test.cc

index 7f6cd4218600ceca85f476046afefc2acc429e18..c32c87e2a53f98b84715835a80987af9ef03c251 100644 (file)
--- a/src/utils/test/js_normalizer_test.cc
+++ b/src/utils/test/js_normalizer_test.cc
@@ -2457,6 +2457,39 @@ TEST_CASE("split between tokens", "[JSNormalizer]")
          const char exp2[] = "<script)";
          const char exp[] = "(a<script)";
  
+        NORMALIZE_2(dat1, dat2, exp1, exp2);
+        NORM_COMBINED_2(dat1, dat2, exp);
+    }
+    SECTION("complete regex (1 parsing group) - identifier")
+    {
+        const char dat1[] = "/ss/,";
+        const char dat2[] = " a ;";
+        const char exp1[] = "/ss/,";
+        const char exp2[] = "a;";
+        const char exp[] = "/ss/,a;";
+
+        NORMALIZE_2(dat1, dat2, exp1, exp2);
+        NORM_COMBINED_2(dat1, dat2, exp);
+    }
+    SECTION("complete regex (2 parsing groups) - identifier")
+    {
+        const char dat1[] = "/\\s/,";
+        const char dat2[] = " a ;";
+        const char exp1[] = "/\\s/,";
+        const char exp2[] = "a;";
+        const char exp[] = "/\\s/,a;";
+
+        NORMALIZE_2(dat1, dat2, exp1, exp2);
+        NORM_COMBINED_2(dat1, dat2, exp);
+    }
+    SECTION("complete regex (not the first) - identifier")
+    {
+        const char dat1[] = ",/\\s/,";
+        const char dat2[] = " a ;";
+        const char exp1[] = ",/\\s/,";
+        const char exp2[] = "a;";
+        const char exp[] = ",/\\s/,a;";
+
          NORMALIZE_2(dat1, dat2, exp1, exp2);
          NORM_COMBINED_2(dat1, dat2, exp);
      }
@@ -2745,7 +2778,7 @@ TEST_CASE("split in closing tag", "[JSNormalizer]")
          const char dat2[] = "ipt";
          const char dat3[] = ">";
          const char exp1[] = "::::</scr";
-        const char exp2[] = "cript";
+        const char exp2[] = "script";
          const char exp3[] = "";
          const char exp[] = "::::";
author	Oleksandr Serhiienko <oserhiie@cisco.com>
	Thu, 4 Aug 2022 09:51:17 +0000 (12:51 +0300)
committer	Oleksandr Serhiienko <oserhiie@cisco.com>
	Tue, 9 Aug 2022 13:44:30 +0000 (16:44 +0300)
src/utils/js_tokenizer.h		patch \| blob \| blame \| history
src/utils/js_tokenizer.l		patch \| blob \| blame \| history
src/utils/test/js_normalizer_test.cc		patch \| blob \| blame \| history