Optimizations
=============
+re
+--
+
+* Character class escapes (``\d``, ``\D``, ``\s``, ``\S``, ``\w`` and ``\W``)
+ outside a character set are now compiled to a single ``CATEGORY`` opcode
+ instead of being wrapped in an ``IN`` block. This speeds up matching of
+ patterns such as ``\d+`` and reduces the size of the compiled bytecode.
+ (Contributed by Serhiy Storchaka in :gh:`152033`.)
+
module_name
-----------
_LITERAL_CODES = {LITERAL, NOT_LITERAL}
_SUCCESS_CODES = {SUCCESS, FAILURE}
_ASSERT_CODES = {ASSERT, ASSERT_NOT}
-_UNIT_CODES = _LITERAL_CODES | {ANY, IN}
+_UNIT_CODES = _LITERAL_CODES | {ANY, IN, CATEGORY}
_REPEATING_CODES = {
MIN_REPEAT: (REPEAT, MIN_UNTIL, MIN_REPEAT_ONE),
if iscased and iscased(av):
return None
return [(op, av)]
+ elif op is CATEGORY:
+ return [(op, av)]
elif op is BRANCH:
charset = []
charsetappend = charset.append
_REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT, POSSESSIVE_REPEAT})
_UNITCODES = frozenset({ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY})
+_SETITEMCODES = frozenset({LITERAL, CATEGORY})
ESCAPES = {
r"\a": (LITERAL, ord("\a")),
r"\A": (AT, AT_BEGINNING_STRING), # start of string
r"\b": (AT, AT_BOUNDARY),
r"\B": (AT, AT_NON_BOUNDARY),
- r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
- r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
- r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
- r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
- r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
- r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
+ r"\d": (CATEGORY, CATEGORY_DIGIT),
+ r"\D": (CATEGORY, CATEGORY_NOT_DIGIT),
+ r"\s": (CATEGORY, CATEGORY_SPACE),
+ r"\S": (CATEGORY, CATEGORY_NOT_SPACE),
+ r"\w": (CATEGORY, CATEGORY_WORD),
+ r"\W": (CATEGORY, CATEGORY_NOT_WORD),
r"\z": (AT, AT_END_STRING), # end of string
r"\Z": (AT, AT_END_STRING), # end of string (obsolete)
}
if code:
return code
code = CATEGORIES.get(escape)
- if code and code[0] is IN:
+ if code and code[0] is CATEGORY:
return code
try:
c = escape[1:2]
if len(item) != 1:
break
op, av = item[0]
- if op is LITERAL:
+ if op in _SETITEMCODES:
set.append((op, av))
elif op is IN and av[0][0] is not NEGATE:
set.extend(av)
raise source.error("unterminated character set",
source.tell() - here)
if that == "]":
- if code1[0] is IN:
- code1 = code1[1][0]
setappend(code1)
setappend((LITERAL, _ord("-")))
break
raise source.error(msg, len(this) + 1 + len(that))
setappend((RANGE, (lo, hi)))
else:
- if code1[0] is IN:
- code1 = code1[1][0]
setappend(code1)
set = _uniq(set)
--- /dev/null
+Optimize matching of character class escapes (``\d``, ``\D``, ``\s``,
+``\S``, ``\w`` and ``\W``) that occur outside a character set: they are now
+compiled to a single ``CATEGORY`` opcode instead of being wrapped in an
+``IN`` block. This speeds up patterns such as ``\d+`` and reduces the size
+of the compiled bytecode.
} while (0)
#define GET_SKIP GET_SKIP_ADJ(0)
+static int
+_validate_category(SRE_CODE arg)
+{ /* Return 1 if arg is one of the SRE_CATEGORY_* codes listed below, 0 otherwise. */
+ switch (arg) {
+ case SRE_CATEGORY_DIGIT:
+ case SRE_CATEGORY_NOT_DIGIT:
+ case SRE_CATEGORY_SPACE:
+ case SRE_CATEGORY_NOT_SPACE:
+ case SRE_CATEGORY_WORD:
+ case SRE_CATEGORY_NOT_WORD:
+ case SRE_CATEGORY_LINEBREAK:
+ case SRE_CATEGORY_NOT_LINEBREAK:
+ case SRE_CATEGORY_LOC_WORD:
+ case SRE_CATEGORY_LOC_NOT_WORD:
+ case SRE_CATEGORY_UNI_DIGIT:
+ case SRE_CATEGORY_UNI_NOT_DIGIT:
+ case SRE_CATEGORY_UNI_SPACE:
+ case SRE_CATEGORY_UNI_NOT_SPACE:
+ case SRE_CATEGORY_UNI_WORD:
+ case SRE_CATEGORY_UNI_NOT_WORD:
+ case SRE_CATEGORY_UNI_LINEBREAK:
+ case SRE_CATEGORY_UNI_NOT_LINEBREAK:
+ return 1; /* every case above falls through to here: recognised category */
+ default: /* any value not listed above is treated as invalid */
+ return 0;
+ }
+}
+
static int
_validate_charset(SRE_CODE *code, SRE_CODE *end)
{
case SRE_OP_CATEGORY:
GET_ARG;
- switch (arg) {
- case SRE_CATEGORY_DIGIT:
- case SRE_CATEGORY_NOT_DIGIT:
- case SRE_CATEGORY_SPACE:
- case SRE_CATEGORY_NOT_SPACE:
- case SRE_CATEGORY_WORD:
- case SRE_CATEGORY_NOT_WORD:
- case SRE_CATEGORY_LINEBREAK:
- case SRE_CATEGORY_NOT_LINEBREAK:
- case SRE_CATEGORY_LOC_WORD:
- case SRE_CATEGORY_LOC_NOT_WORD:
- case SRE_CATEGORY_UNI_DIGIT:
- case SRE_CATEGORY_UNI_NOT_DIGIT:
- case SRE_CATEGORY_UNI_SPACE:
- case SRE_CATEGORY_UNI_NOT_SPACE:
- case SRE_CATEGORY_UNI_WORD:
- case SRE_CATEGORY_UNI_NOT_WORD:
- case SRE_CATEGORY_UNI_LINEBREAK:
- case SRE_CATEGORY_UNI_NOT_LINEBREAK:
- break;
- default:
+ if (!_validate_category(arg)) {
FAIL;
}
break;
}
break;
+ case SRE_OP_CATEGORY:
+ GET_ARG;
+ if (!_validate_category(arg)) {
+ FAIL;
+ }
+ break;
+
case SRE_OP_ANY:
case SRE_OP_ANY_ALL:
/* These have no operands */
SRE(count)(SRE_STATE* state, const SRE_CODE* pattern, Py_ssize_t maxcount)
{
SRE_CODE chr;
+ SRE_CODE arg;
SRE_CHAR c;
const SRE_CHAR* ptr = (const SRE_CHAR *)state->ptr;
const SRE_CHAR* end = (const SRE_CHAR *)state->end;
ptr++;
break;
+ case SRE_OP_CATEGORY:
+ arg = pattern[1];
+ TRACE(("|%p|%p|COUNT CATEGORY %d\n", pattern, ptr, arg));
+ while (ptr < end && sre_category(arg, *ptr))
+ ptr++;
+ break;
+
default:
/* repeated single character pattern */
TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));