]> git.ipfire.org Git - thirdparty/vectorscan.git/commitdiff
Unify handling of caseless flag in class parser
author Justin Viiret <justin.viiret@intel.com>
Tue, 17 Nov 2015 06:23:52 +0000 (17:23 +1100)
committer Matthew Barr <matthew.barr@intel.com>
Sun, 6 Dec 2015 22:07:37 +0000 (09:07 +1100)
Apply caselessness to each element added to a class, rather than all at
finalize time (which required separated ucp and non-ucp working data).

Unifies the behaviour of AsciiComponentClass and Utf8ComponentClass in
this respect.

src/parser/AsciiComponentClass.cpp
src/parser/AsciiComponentClass.h
src/parser/Utf8ComponentClass.cpp
src/parser/Utf8ComponentClass.h

index 44ecb5bb70205d66b30e8d5b2fb77f01b66bc512..7cfa6e11b3e5cf43be74ce410d371456ed970e54 100644 (file)
@@ -61,11 +61,15 @@ void AsciiComponentClass::createRange(unichar to) {
     unsigned char from = (u8)range_start;
     if (from > to) {
         throw LocatedParseError("Range out of order in character class");
-    } else {
-        in_cand_range = false;
-        cr.setRange(from, to);
-        range_start = INVALID_UNICODE;
     }
+
+    in_cand_range = false;
+    CharReach ncr(from, to);
+    if (mode.caseless) {
+        make_caseless(&ncr);
+    }
+    cr |= ncr;
+    range_start = INVALID_UNICODE;
 }
 
 void AsciiComponentClass::notePositions(GlushkovBuildState &bs) {
@@ -95,16 +99,13 @@ void AsciiComponentClass::add(PredefinedClass c, bool negative) {
         c = translateForUcpMode(c, mode);
     }
 
+    // Note: caselessness is handled by getPredefinedCharReach.
     CharReach pcr = getPredefinedCharReach(c, mode);
     if (negative) {
         pcr.flip();
     }
 
-    if (isUcp(c)) {
-        cr_ucp |= pcr;
-    } else {
-        cr |= pcr;
-    }
+    cr |= pcr;
     range_start = INVALID_UNICODE;
     in_cand_range = false;
 }
@@ -120,7 +121,12 @@ void AsciiComponentClass::add(unichar c) {
         return;
     }
 
-    cr.set(c);
+    CharReach ncr(c, c);
+    if (mode.caseless) {
+        make_caseless(&ncr);
+    }
+
+    cr |= ncr;
     range_start = c;
 }
 
@@ -136,12 +142,6 @@ void AsciiComponentClass::finalize() {
         in_cand_range = false;
     }
 
-    if (mode.caseless) {
-        make_caseless(&cr);
-    }
-
-    cr |= cr_ucp; /* characters from ucp props don't participate in caseless */
-
     if (m_negate) {
         cr.flip();
     }
index 2d5ef843435c69f4a3cde09152dca15a610083d5..925fa9bff4f9c58524eb4b79d2c458b28d36ea1c 100644 (file)
@@ -78,12 +78,10 @@ protected:
 private:
     Position position;
     CharReach cr;
-    CharReach cr_ucp;
 
     // Private copy ctor. Use clone instead.
     AsciiComponentClass(const AsciiComponentClass &other)
-        : ComponentClass(other), position(other.position), cr(other.cr),
-          cr_ucp(other.cr_ucp) {}
+        : ComponentClass(other), position(other.position), cr(other.cr) {}
 };
 
 } // namespace ue2
index 54f9edb94eaea1907e44d1def21db7c4799894e8..21707902a0e36852f03911caf95fe762f3dafdcf 100644 (file)
@@ -515,16 +515,16 @@ void UTF8ComponentClass::createRange(unichar to) {
     unichar from = range_start;
     if (from > to) {
         throw LocatedParseError("Range out of order in character class");
-    } else {
-        in_cand_range = false;
-        CodePointSet ncps;
-        ncps.setRange(from, to);
-        if (mode.caseless) {
-            make_caseless(&ncps);
-        }
-        cps |= ncps;
-        range_start = INVALID_UNICODE;
     }
+
+    in_cand_range = false;
+    CodePointSet ncps;
+    ncps.setRange(from, to);
+    if (mode.caseless) {
+        make_caseless(&ncps);
+    }
+    cps |= ncps;
+    range_start = INVALID_UNICODE;
 }
 
 void UTF8ComponentClass::add(PredefinedClass c, bool negative) {
@@ -543,11 +543,7 @@ void UTF8ComponentClass::add(PredefinedClass c, bool negative) {
         pcps.flip();
     }
 
-    if (isUcp(c)) {
-        cps_ucp |= pcps;
-    } else {
-        cps |= pcps;
-    }
+    cps |= pcps;
 
     range_start = INVALID_UNICODE;
     in_cand_range = false;
@@ -585,8 +581,6 @@ void UTF8ComponentClass::finalize() {
         in_cand_range = false;
     }
 
-    cps |= cps_ucp; /* characters from ucp props always case sensitive */
-
     if (m_negate) {
         cps.flip();
     }
@@ -594,31 +588,6 @@ void UTF8ComponentClass::finalize() {
     finalized = true;
 }
 
-bool isUcp(PredefinedClass c) {
-    switch (c) {
-    case CLASS_ALNUM:
-    case CLASS_ALPHA:
-    case CLASS_ANY:
-    case CLASS_ASCII:
-    case CLASS_BLANK:
-    case CLASS_CNTRL:
-    case CLASS_DIGIT:
-    case CLASS_GRAPH:
-    case CLASS_HORZ:
-    case CLASS_LOWER:
-    case CLASS_PRINT:
-    case CLASS_PUNCT:
-    case CLASS_SPACE:
-    case CLASS_UPPER:
-    case CLASS_VERT:
-    case CLASS_WORD:
-    case CLASS_XDIGIT:
-        return false;
-    default:
-        return true;
-    }
-}
-
 Position UTF8ComponentClass::getHead(NFABuilder &builder, u8 first_byte) {
     map<u8, Position>::const_iterator it = heads.find(first_byte);
     if (it != heads.end()) {
index 3d21a278cb4967b94772e660548ad0e0150ecbf0..f4e7ea328d378fba89690d61babb802a029a73c1 100644 (file)
@@ -93,7 +93,6 @@ private:
     void buildFourByte(GlushkovBuildState &bs);
 
     CodePointSet cps;
-    CodePointSet cps_ucp;
 
     std::map<u8, Position> heads;
     Position single_pos;
@@ -108,7 +107,6 @@ private:
 };
 
 PredefinedClass translateForUcpMode(PredefinedClass in, const ParseMode &mode);
-bool isUcp(PredefinedClass c);
 
 CodePointSet getPredefinedCodePointSet(PredefinedClass c,
                                        const ParseMode &mode);