git.ipfire.org Git - thirdparty/vectorscan.git/commitdiff
character classes: handle \Q\E and utf8
author: Alex Coyte <a.coyte@intel.com>
Tue, 20 Jun 2017 00:19:32 +0000 (10:19 +1000)
committer: Matthew Barr <matthew.barr@intel.com>
Tue, 20 Jun 2017 22:43:44 +0000 (08:43 +1000)
src/parser/Parser.rl
unit/hyperscan/bad_patterns.txt

index 05a084bb0009fa3a9e1933cb3cbbc415be759ad7..ce9ca865b66f5a1f1a926757d2d900421579bfd4 100644 (file)
@@ -1184,6 +1184,11 @@ unichar readUtf8CodePoint4c(const char *s) {
                   currentSeq->addComponent(move(cc));
               };
 
+              hi_byte when is_utf8 => {
+                  assert(mode.utf8);
+                  throwInvalidUtf8();
+              };
+
               # Literal character
               any => {
                   addLiteral(currentSeq, *ts, mode);
@@ -1198,6 +1203,31 @@ unichar readUtf8CodePoint4c(const char *s) {
               '\\E' => {
                   fret;
               };
+
+              #unicode chars
+              utf8_2c when is_utf8 => {
+                  assert(mode.utf8);
+                  currentCls->add(readUtf8CodePoint2c(ts));
+                  inCharClassEarly = false;
+              };
+
+              utf8_3c when is_utf8 => {
+                  assert(mode.utf8);
+                  currentCls->add(readUtf8CodePoint3c(ts));
+                  inCharClassEarly = false;
+              };
+
+              utf8_4c when is_utf8 => {
+                  assert(mode.utf8);
+                  currentCls->add(readUtf8CodePoint4c(ts));
+                  inCharClassEarly = false;
+              };
+
+              hi_byte when is_utf8 => {
+                  assert(mode.utf8);
+                  throwInvalidUtf8();
+              };
+
               # Literal character
               any => {
                   currentCls->add(*ts);
index 3d6d9db909714f9cc98144ff5b03ae13b7171d45..3042dc8294252b80f4f4dd14225ac49ff8056294 100644 (file)
 145:/abc/8{edit_distance=1} #UTF-8 is disallowed for approximate matching.
 146:/(*UTF8)abc/{edit_distance=1} #UTF-8 is disallowed for approximate matching.
 147:/\b\BMYBt/s{edit_distance=1} #Pattern can never match.
+148:/\QÀ\Eaaaa/8 #Expression is not valid UTF-8.
+149:/[\QÀ\Eaaaa]/8 #Expression is not valid UTF-8.