git.ipfire.org Git - thirdparty/vectorscan.git/commitdiff
Fix defn of POSIX graph, print, punct classes
author Justin Viiret <justin.viiret@intel.com>
Mon, 16 Nov 2015 05:43:43 +0000 (16:43 +1100)
committer Matthew Barr <matthew.barr@intel.com>
Sun, 6 Dec 2015 22:06:23 +0000 (09:06 +1100)
The POSIX classes [:graph:], [:print:] and [:punct:] are handled
specially in UCP mode by PCRE. This change matches that behaviour.

src/parser/ComponentClass.cpp
src/parser/ComponentClass.h
src/parser/Utf8ComponentClass.cpp
src/parser/Utf8ComponentClass.h

index 43c05898f8523b77ffa5ce1406fb0fc98853e3d0..a91ae979ff1a5a2a13015a377adfdbc1a86d2285 100644 (file)
@@ -81,8 +81,9 @@ CharReach getPredefinedCharReach(PredefinedClass c, const ParseMode &mode) {
     case CLASS_DIGIT:
         return number;
     case CLASS_GRAPH:
-    case CLASS_XGRAPH:
         return CharReach(0x21, 0x7e);
+    case CLASS_XGRAPH:
+        return to_cr(getPredefinedCodePointSet(c, mode));
     case CLASS_HORZ:
         return CharReach("\x09\x20\xA0");
     case CLASS_LOWER:
@@ -93,11 +94,15 @@ CharReach getPredefinedCharReach(PredefinedClass c, const ParseMode &mode) {
         }
     case CLASS_PRINT:
         return CharReach(0x20, 0x7e);
+    case CLASS_XPRINT:
+        return to_cr(getPredefinedCodePointSet(c, mode));
     case CLASS_PUNCT:
         return CharReach(0x21, '0' - 1)
             | CharReach('9' + 1, 'A' - 1)
             | CharReach('Z' + 1, 'a' - 1)
             | CharReach('z' + 1, 126);
+    case CLASS_XPUNCT:
+        return to_cr(getPredefinedCodePointSet(c, mode));
     case CLASS_SPACE:
         return CharReach("\x09\x0a\x0c\x0b\x0d\x20");
     case CLASS_UPPER:
index 1cb1a7d0f44eea66c42d29cea1a4ed3d6a4027b3..040e6d786c0f9dc92ac1e332915c9a8d54208475 100644 (file)
@@ -63,7 +63,9 @@ enum PredefinedClass {
     CLASS_VERT,
     CLASS_WORD,
     CLASS_XDIGIT,
-    CLASS_XGRAPH,
+    CLASS_XGRAPH, /* [:graph:] in UCP mode */
+    CLASS_XPRINT, /* [:print:] in UCP mode */
+    CLASS_XPUNCT, /* [:punct:] in UCP mode */
     CLASS_UCP_C,
     CLASS_UCP_CC,
     CLASS_UCP_CF,
index 3a6a85a401c17d394819a2fc27b180e558211919..54f9edb94eaea1907e44d1def21db7c4799894e8 100644 (file)
@@ -75,6 +75,10 @@ PredefinedClass translateForUcpMode(PredefinedClass in, const ParseMode &mode) {
         } else {
             return CLASS_UCP_LL;
         }
+    case CLASS_PRINT:
+        return CLASS_XPRINT;
+    case CLASS_PUNCT:
+        return CLASS_XPUNCT;
     case CLASS_SPACE:
         return CLASS_UCP_XPS;
     case CLASS_UPPER:
@@ -90,7 +94,6 @@ PredefinedClass translateForUcpMode(PredefinedClass in, const ParseMode &mode) {
     }
 }
 
-static
 CodePointSet getPredefinedCodePointSet(PredefinedClass c,
                                        const ParseMode &mode) {
     /* TODO: support properly PCRE_UCP mode and non PCRE_UCP mode */
@@ -117,6 +120,25 @@ CodePointSet getPredefinedCodePointSet(PredefinedClass c,
         rv |= cf;
         return rv;
     }
+    case CLASS_XPRINT: {
+        // Same as graph, plus everything with the Zs property.
+        CodePointSet rv = getPredefinedCodePointSet(CLASS_XGRAPH, mode);
+        rv |= getUcpZs();
+        return rv;
+    }
+    case CLASS_XPUNCT: {
+        // Everything with the P (punctuation) property, plus code points in S
+        // (symbols) that are < 128.
+        // NOTE: PCRE versions 8.37 and earlier erroneously use 256 as the
+        // cut-off here, so we are compatible with that for now. PCRE bug #1718
+        // tracks this; once PCRE 8.38 is released we should correct this
+        // behaviour.
+        CodePointSet rv = getUcpP();
+        CodePointSet symbols = getUcpS();
+        symbols.unsetRange(256, MAX_UNICODE);
+        rv |= symbols;
+        return rv;
+    }
     case CLASS_HORZ: {
         CodePointSet rv;
         rv.set(0x0009); /* Horizontal tab */
index b2c402f988247dbe81a790bf6e78f4d8499b9589..3d21a278cb4967b94772e660548ad0e0150ecbf0 100644 (file)
@@ -110,6 +110,9 @@ private:
 PredefinedClass translateForUcpMode(PredefinedClass in, const ParseMode &mode);
 bool isUcp(PredefinedClass c);
 
+CodePointSet getPredefinedCodePointSet(PredefinedClass c,
+                                       const ParseMode &mode);
+
 } // namespace
 
 #endif // UTF8_COMPONENT_CLASS_H