git.ipfire.org Git - thirdparty/vectorscan.git/commitdiff
Treat characters between \Q \E as codepoints in UTF8 mode.
author Alex Coyte <a.coyte@intel.com>
Mon, 19 Jun 2017 01:03:05 +0000 (11:03 +1000)
committer Matthew Barr <matthew.barr@intel.com>
Tue, 20 Jun 2017 22:43:44 +0000 (08:43 +1000)
fixes github issue #57

src/parser/Parser.rl

index 52b3340c66e79973c9bbb52d07820571e15cddb4..05a084bb0009fa3a9e1933cb3cbbc415be759ad7 100644 (file)
@@ -1155,6 +1155,35 @@ unichar readUtf8CodePoint4c(const char *s) {
               '\\E' => {
                   fgoto main;
               };
+
+              # Multi-byte UTF-8 characters (only when is_utf8 holds)
+              utf8_2c when is_utf8 => {
+                  assert(mode.utf8);
+                  /* decode the token at ts as one code point; a one-member ComponentClass generates the vertices */
+                  auto cc = getComponentClass(mode);
+                  cc->add(readUtf8CodePoint2c(ts));
+                  cc->finalize();
+                  currentSeq->addComponent(std::move(cc));
+              };
+
+              utf8_3c when is_utf8 => {
+                  assert(mode.utf8);
+                  /* decode the token at ts as one code point; a one-member ComponentClass generates the vertices */
+                  auto cc = getComponentClass(mode);
+                  cc->add(readUtf8CodePoint3c(ts));
+                  cc->finalize();
+                  currentSeq->addComponent(std::move(cc));
+              };
+
+              utf8_4c when is_utf8 => {
+                  assert(mode.utf8);
+                  /* decode the token at ts as one code point; a one-member ComponentClass generates the vertices */
+                  auto cc = getComponentClass(mode);
+                  cc->add(readUtf8CodePoint4c(ts));
+                  cc->finalize();
+                  currentSeq->addComponent(std::move(cc));
+              };
+
               # Literal character
               any => {
                   addLiteral(currentSeq, *ts, mode);