Rework parser rejection for POSIX collating elems

author Justin Viiret <justin.viiret@intel.com>

Sun, 8 Nov 2015 23:37:20 +0000 (10:37 +1100)

committer Matthew Barr <matthew.barr@intel.com>

Tue, 10 Nov 2015 03:36:39 +0000 (14:36 +1100)
author Justin Viiret <justin.viiret@intel.com>
Sun, 8 Nov 2015 23:37:20 +0000 (10:37 +1100)
committer Matthew Barr <matthew.barr@intel.com>
Tue, 10 Nov 2015 03:36:39 +0000 (14:36 +1100)
diff --git a/src/parser/Parser.rl b/src/parser/Parser.rl

index 1481b7d8045773eeebed7334c21016454cb3c6d1..37beb7653f3cfbe29e32a8ba6b5de3ef3de45eaa 100644 (file)
--- a/src/parser/Parser.rl
+++ b/src/parser/Parser.rl
@@ -790,10 +790,12 @@ unichar readUtf8CodePoint4c(const u8 *ts) {
          any => { throw LocatedParseError("Unknown property"); };
                       *|;
      charClassGuts := |*
-              # We don't like POSIX collating elements (neither does PCRE or Perl).
-              '\[\.' [^\]]* '\.\]' | 
-              '\[=' [^\]]* '=\]' => {
-                  throw LocatedParseError("Unsupported POSIX collating element");
+              # We don't support POSIX collating elements (neither does PCRE
+              # or Perl). These look like [.ch.] or [=ch=].
+              '\[\.' ( '\\]' | [^\]] )* '\.\]' |
+              '\[=' ( '\\]' | [^\]] )* '=\]' => {
+                  throw LocatedParseError("Unsupported POSIX collating "
+                                          "element");
                };
                # Named sets
                # Adding these may cause the charclass to close, hence the
@@ -1090,23 +1092,6 @@ unichar readUtf8CodePoint4c(const u8 *ts) {
                    throwInvalidUtf8();
                };
  
-              # dot or equals at the end of a character class could be the end
-              # of a collating element, like [.blah.] or [=blah=].
-              [.=] ']' => {
-                  if (currentCls->getFirstChar() == *ts) {
-                      assert(currentClsBegin);
-                      ostringstream oss;
-                      oss << "Unsupported POSIX collating element at index "
-                          << currentClsBegin - ptr << ".";
-                      throw ParseError(oss.str());
-                  }
-                  currentCls->add(*ts);
-                  currentCls->finalize();
-                  currentSeq->addComponent(move(currentCls));
-                  inCharClass = false;
-                  fgoto main;
-              };
-
                # Literal character
                (any - ']') => {
                    if (currentCls->class_empty()) {
@@ -1232,6 +1217,13 @@ unichar readUtf8CodePoint4c(const u8 *ts) {
                    throw LocatedParseError("POSIX named classes are only "
                                            "supported inside a class");
                };
+              # We don't support POSIX collating elements (neither does PCRE
+              # or Perl). These look like [.ch.] or [=ch=].
+              '\[\.' ( '\\]' | [^\]] )* '\.\]' |
+              '\[=' ( '\\]' | [^\]] )* '=\]' => {
+                  throw LocatedParseError("Unsupported POSIX collating "
+                                          "element");
+              };
                # Begin eating characters for class
                '\[' => eatClass;
                # Begin quoted literal
diff --git a/unit/hyperscan/bad_patterns.txt b/unit/hyperscan/bad_patterns.txt

index 1ad445b363ec1da73b93c559f826735aaef58f93..837ba871fdb9cf31b56ec94bd104eb57a1dd8a06 100644 (file)
--- a/unit/hyperscan/bad_patterns.txt
+++ b/unit/hyperscan/bad_patterns.txt
@@ -128,3 +128,7 @@
  128:/(*UTF8)^fo?ob{ro|nax_off\Qt=10omnax+8Wnah/ñññññññññññññññññññññññññññ0}l.{1,60}Car*k|npanomnax+8Wnah/ #Expression is not valid UTF-8.
  129:/bignum \1111111111111111111/ #Number is too big at index 7.
  130:/foo|&{5555555,}/ #Bounded repeat is too large.
+131:/[a[..]]/ #Unsupported POSIX collating element at index 2.
+132:/[a[==]]/ #Unsupported POSIX collating element at index 2.
+133:/[a[.\].]]/ #Unsupported POSIX collating element at index 2.
+134:/[a[=\]=]]/ #Unsupported POSIX collating element at index 2.
author	Justin Viiret <justin.viiret@intel.com>
	Sun, 8 Nov 2015 23:37:20 +0000 (10:37 +1100)
committer	Matthew Barr <matthew.barr@intel.com>
	Tue, 10 Nov 2015 03:36:39 +0000 (14:36 +1100)
src/parser/Parser.rl		patch | blob | blame | history
unit/hyperscan/bad_patterns.txt		patch | blob | blame | history