case CLASS_DIGIT:
return number;
case CLASS_GRAPH:
- case CLASS_XGRAPH:
return CharReach(0x21, 0x7e);
+ case CLASS_XGRAPH:
+ return to_cr(getPredefinedCodePointSet(c, mode));
case CLASS_HORZ:
return CharReach("\x09\x20\xA0");
case CLASS_LOWER:
}
case CLASS_PRINT:
return CharReach(0x20, 0x7e);
+ case CLASS_XPRINT:
+ return to_cr(getPredefinedCodePointSet(c, mode));
case CLASS_PUNCT:
return CharReach(0x21, '0' - 1)
| CharReach('9' + 1, 'A' - 1)
| CharReach('Z' + 1, 'a' - 1)
| CharReach('z' + 1, 126);
+ case CLASS_XPUNCT:
+ return to_cr(getPredefinedCodePointSet(c, mode));
case CLASS_SPACE:
return CharReach("\x09\x0a\x0c\x0b\x0d\x20");
case CLASS_UPPER:
CLASS_VERT,
CLASS_WORD,
CLASS_XDIGIT,
- CLASS_XGRAPH,
+ CLASS_XGRAPH, /* [:graph:] in UCP mode */
+ CLASS_XPRINT, /* [:print:] in UCP mode */
+ CLASS_XPUNCT, /* [:punct:] in UCP mode */
CLASS_UCP_C,
CLASS_UCP_CC,
CLASS_UCP_CF,
} else {
return CLASS_UCP_LL;
}
+ case CLASS_PRINT:
+ return CLASS_XPRINT;
+ case CLASS_PUNCT:
+ return CLASS_XPUNCT;
case CLASS_SPACE:
return CLASS_UCP_XPS;
case CLASS_UPPER:
}
}
-static
CodePointSet getPredefinedCodePointSet(PredefinedClass c,
const ParseMode &mode) {
/* TODO: support properly PCRE_UCP mode and non PCRE_UCP mode */
rv |= cf;
return rv;
}
+ case CLASS_XPRINT: {
+ // The UCP-mode graph set (CLASS_XGRAPH) plus every code point with the Zs property.
+ CodePointSet rv = getPredefinedCodePointSet(CLASS_XGRAPH, mode);
+ rv |= getUcpZs();
+ return rv;
+ }
+ case CLASS_XPUNCT: {
+ // Everything with the P (punctuation) property, plus code points in S
+ // (symbols) that are < 128.
+ // NOTE: PCRE versions 8.37 and earlier erroneously use 256 as the
+ // cut-off, and the unsetRange(256, ...) below deliberately matches that
+ // for now. PCRE bug #1718 tracks this; once PCRE 8.38 is released the
+ // cut-off used here should be corrected to 128.
+ CodePointSet rv = getUcpP();
+ CodePointSet symbols = getUcpS();
+ symbols.unsetRange(256, MAX_UNICODE);
+ rv |= symbols;
+ return rv;
+ }
case CLASS_HORZ: {
CodePointSet rv;
rv.set(0x0009); /* Horizontal tab */
PredefinedClass translateForUcpMode(PredefinedClass in, const ParseMode &mode);
bool isUcp(PredefinedClass c);
+CodePointSet getPredefinedCodePointSet(PredefinedClass c,
+ const ParseMode &mode);
+
} // namespace
#endif // UTF8_COMPONENT_CLASS_H