From f502c73012c931cd23bc1df6af7af0cc6410fe6c Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Wed, 13 Jun 2001 15:15:02 +0000 Subject: [PATCH] Bring SRE up to date with Python 2.1 --- Lib/sre.py | 81 ++++++++++--- Lib/sre_compile.py | 27 +++-- Lib/sre_constants.py | 36 +++++- Lib/sre_parse.py | 160 ++++++++++++++++--------- Misc/NEWS | 2 + Modules/_sre.c | 257 ++++++++++++++++++++++++++++------------ Modules/sre_constants.h | 17 ++- 7 files changed, 410 insertions(+), 170 deletions(-) diff --git a/Lib/sre.py b/Lib/sre.py index 6dea5c40456f..6706fac8692e 100644 --- a/Lib/sre.py +++ b/Lib/sre.py @@ -3,7 +3,7 @@ # # re-compatible interface for the sre matching engine # -# Copyright (c) 1998-2000 by Secret Labs AB. All rights reserved. +# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved. # # This version of the SRE library can be redistributed under CNRI's # Python 1.6 license. For any other use, please contact Secret Labs @@ -14,23 +14,31 @@ # other compatibility work. # -# FIXME: change all FIXME's to XXX ;-) - import sre_compile import sre_parse +# public symbols +__all__ = [ "match", "search", "sub", "subn", "split", "findall", + "compile", "purge", "template", "escape", "I", "L", "M", "S", "X", + "U", "IGNORECASE", "LOCALE", "MULTILINE", "DOTALL", "VERBOSE", + "UNICODE", "error" ] + +__version__ = "2.1b2" + +# this module works under 1.5.2 and later. 
don't use string methods import string # flags -I = IGNORECASE = sre_compile.SRE_FLAG_IGNORECASE -L = LOCALE = sre_compile.SRE_FLAG_LOCALE -M = MULTILINE = sre_compile.SRE_FLAG_MULTILINE -S = DOTALL = sre_compile.SRE_FLAG_DOTALL -X = VERBOSE = sre_compile.SRE_FLAG_VERBOSE +I = IGNORECASE = sre_compile.SRE_FLAG_IGNORECASE # ignore case +L = LOCALE = sre_compile.SRE_FLAG_LOCALE # assume current 8-bit locale +U = UNICODE = sre_compile.SRE_FLAG_UNICODE # assume unicode locale +M = MULTILINE = sre_compile.SRE_FLAG_MULTILINE # make anchors look for newline +S = DOTALL = sre_compile.SRE_FLAG_DOTALL # make dot match newline +X = VERBOSE = sre_compile.SRE_FLAG_VERBOSE # ignore whitespace and comments -# sre extensions (may or may not be in 1.6/2.0 final) -T = TEMPLATE = sre_compile.SRE_FLAG_TEMPLATE -U = UNICODE = sre_compile.SRE_FLAG_UNICODE +# sre extensions (experimental, don't rely on these) +T = TEMPLATE = sre_compile.SRE_FLAG_TEMPLATE # disable backtracking +DEBUG = sre_compile.SRE_FLAG_DEBUG # dump pattern after compilation # sre exception error = sre_compile.error @@ -38,36 +46,60 @@ error = sre_compile.error # -------------------------------------------------------------------- # public interface -# FIXME: add docstrings - def match(pattern, string, flags=0): + """Try to apply the pattern at the start of the string, returning + a match object, or None if no match was found.""" return _compile(pattern, flags).match(string) def search(pattern, string, flags=0): + """Scan through string looking for a match to the pattern, returning + a match object, or None if no match was found.""" return _compile(pattern, flags).search(string) def sub(pattern, repl, string, count=0): + """Return the string obtained by replacing the leftmost + non-overlapping occurrences of the pattern in string by the + replacement repl""" return _compile(pattern, 0).sub(repl, string, count) def subn(pattern, repl, string, count=0): + """Return a 2-tuple containing (new_string, number). 
+ new_string is the string obtained by replacing the leftmost + non-overlapping occurrences of the pattern in the source + string by the replacement repl. number is the number of + substitutions that were made.""" return _compile(pattern, 0).subn(repl, string, count) def split(pattern, string, maxsplit=0): + """Split the source string by the occurrences of the pattern, + returning a list containing the resulting substrings.""" return _compile(pattern, 0).split(string, maxsplit) def findall(pattern, string, maxsplit=0): + """Return a list of all non-overlapping matches in the string. + + If one or more groups are present in the pattern, return a + list of groups; this will be a list of tuples if the pattern + has more than one group. + + Empty matches are included in the result.""" return _compile(pattern, 0).findall(string, maxsplit) def compile(pattern, flags=0): + "Compile a regular expression pattern, returning a pattern object." return _compile(pattern, flags) def purge(): + "Clear the regular expression cache" _cache.clear() + _cache_repl.clear() def template(pattern, flags=0): + "Compile a template pattern, returning a pattern object" return _compile(pattern, flags|T) def escape(pattern): + "Escape all non-alphanumeric characters in pattern." 
s = list(pattern) for i in range(len(pattern)): c = pattern[i] @@ -82,6 +114,8 @@ def escape(pattern): # internals _cache = {} +_cache_repl = {} + _MAXCACHE = 100 def _join(seq, sep): @@ -105,6 +139,21 @@ def _compile(*key): _cache[key] = p return p +def _compile_repl(*key): + # internal: compile replacement pattern + p = _cache_repl.get(key) + if p is not None: + return p + repl, pattern = key + try: + p = sre_parse.parse_template(repl, pattern) + except error, v: + raise error, v # invalid expression + if len(_cache_repl) >= _MAXCACHE: + _cache_repl.clear() + _cache_repl[key] = p + return p + def _expand(pattern, match, template): # internal: match.expand implementation hook template = sre_parse.parse_template(template, pattern) @@ -119,7 +168,7 @@ def _subn(pattern, template, string, count=0): if callable(template): filter = template else: - template = sre_parse.parse_template(template, pattern) + template = _compile_repl(template, pattern) def filter(match, template=template): return sre_parse.expand_template(template, match) n = i = 0 @@ -158,7 +207,7 @@ def _split(pattern, string, maxsplit=0): continue append(string[i:b]) if g and b != e: - extend(m.groups()) + extend(list(m.groups())) i = e n = n + 1 append(string[i:]) @@ -204,7 +253,7 @@ class Scanner: break action = self.lexicon[m.lastindex][1] if callable(action): - self.match = match + self.match = m action = action(self, m.group()) if action is not None: append(action) diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py index dc508e57cdc4..44cb23e6a4a7 100644 --- a/Lib/sre_compile.py +++ b/Lib/sre_compile.py @@ -3,7 +3,7 @@ # # convert template to internal format # -# Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved. +# Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved. # # See the sre.py file for information on usage and redistribution. 
# @@ -12,6 +12,8 @@ import _sre from sre_constants import * +assert _sre.MAGIC == MAGIC, "SRE module mismatch" + MAXCODE = 65535 def _compile(code, pattern, flags): @@ -21,9 +23,10 @@ def _compile(code, pattern, flags): if op in (LITERAL, NOT_LITERAL): if flags & SRE_FLAG_IGNORECASE: emit(OPCODES[OP_IGNORE[op]]) + emit(_sre.getlower(av, flags)) else: emit(OPCODES[op]) - emit(av) + emit(av) elif op is IN: if flags & SRE_FLAG_IGNORECASE: emit(OPCODES[OP_IGNORE[op]]) @@ -102,9 +105,12 @@ def _compile(code, pattern, flags): elif op is AT: emit(OPCODES[op]) if flags & SRE_FLAG_MULTILINE: - emit(ATCODES[AT_MULTILINE.get(av, av)]) - else: - emit(ATCODES[av]) + av = AT_MULTILINE.get(av, av) + if flags & SRE_FLAG_LOCALE: + av = AT_LOCALE.get(av, av) + elif flags & SRE_FLAG_UNICODE: + av = AT_UNICODE.get(av, av) + emit(ATCODES[av]) elif op is BRANCH: emit(OPCODES[op]) tail = [] @@ -121,11 +127,10 @@ def _compile(code, pattern, flags): elif op is CATEGORY: emit(OPCODES[op]) if flags & SRE_FLAG_LOCALE: - emit(CHCODES[CH_LOCALE[av]]) + av = CH_LOCALE[av] elif flags & SRE_FLAG_UNICODE: - emit(CHCODES[CH_UNICODE[av]]) - else: - emit(CHCODES[av]) + av = CH_UNICODE[av] + emit(CHCODES[av]) elif op is GROUPREF: if flags & SRE_FLAG_IGNORECASE: emit(OPCODES[OP_IGNORE[op]]) @@ -176,7 +181,7 @@ def _optimize_charset(charset, fixup): for i in range(fixup(av[0]), fixup(av[1])+1): charmap[i] = 1 elif op is CATEGORY: - # FIXME: could append to charmap tail + # XXX: could append to charmap tail return charset # cannot compress except IndexError: # character set contains unicode characters @@ -364,7 +369,7 @@ def compile(p, flags=0): # print code - # FIXME: get rid of this limitation! + # XXX: get rid of this limitation! 
assert p.pattern.groups <= 100,\ "sorry, but this version only supports 100 named groups" diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py index ea649c048293..bbe7880a1d55 100644 --- a/Lib/sre_constants.py +++ b/Lib/sre_constants.py @@ -4,13 +4,20 @@ # various symbols used by the regular expression engine. # run this script to update the _sre include files! # -# Copyright (c) 1998-2000 by Secret Labs AB. All rights reserved. +# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved. # # See the sre.py file for information on usage and redistribution. # +# update when constants are added or removed + +MAGIC = 20010320 + +# max code word in this release + MAXREPEAT = 65535 +# SRE standard exception (access as sre.error) # should this really be here? class error(Exception): @@ -54,10 +61,16 @@ SUBPATTERN = "subpattern" # positions AT_BEGINNING = "at_beginning" AT_BEGINNING_LINE = "at_beginning_line" +AT_BEGINNING_STRING = "at_beginning_string" AT_BOUNDARY = "at_boundary" AT_NON_BOUNDARY = "at_non_boundary" AT_END = "at_end" AT_END_LINE = "at_end_line" +AT_END_STRING = "at_end_string" +AT_LOC_BOUNDARY = "at_loc_boundary" +AT_LOC_NON_BOUNDARY = "at_loc_non_boundary" +AT_UNI_BOUNDARY = "at_uni_boundary" +AT_UNI_NON_BOUNDARY = "at_uni_non_boundary" # categories CATEGORY_DIGIT = "category_digit" @@ -109,8 +122,10 @@ OPCODES = [ ] ATCODES = [ - AT_BEGINNING, AT_BEGINNING_LINE, AT_BOUNDARY, - AT_NON_BOUNDARY, AT_END, AT_END_LINE + AT_BEGINNING, AT_BEGINNING_LINE, AT_BEGINNING_STRING, AT_BOUNDARY, + AT_NON_BOUNDARY, AT_END, AT_END_LINE, AT_END_STRING, + AT_LOC_BOUNDARY, AT_LOC_NON_BOUNDARY, AT_UNI_BOUNDARY, + AT_UNI_NON_BOUNDARY ] CHCODES = [ @@ -148,6 +163,16 @@ AT_MULTILINE = { AT_END: AT_END_LINE } +AT_LOCALE = { + AT_BOUNDARY: AT_LOC_BOUNDARY, + AT_NON_BOUNDARY: AT_LOC_NON_BOUNDARY +} + +AT_UNICODE = { + AT_BOUNDARY: AT_UNI_BOUNDARY, + AT_NON_BOUNDARY: AT_UNI_NON_BOUNDARY +} + CH_LOCALE = { CATEGORY_DIGIT: CATEGORY_DIGIT, CATEGORY_NOT_DIGIT: 
CATEGORY_NOT_DIGIT, @@ -178,6 +203,7 @@ SRE_FLAG_MULTILINE = 8 # treat target as multiline string SRE_FLAG_DOTALL = 16 # treat target as a single string SRE_FLAG_UNICODE = 32 # use unicode locale SRE_FLAG_VERBOSE = 64 # ignore whitespace and comments +SRE_FLAG_DEBUG = 128 # debugging # flags for INFO primitive SRE_INFO_PREFIX = 1 # has prefix @@ -201,13 +227,15 @@ if __name__ == "__main__": * NOTE: This file is generated by sre_constants.py. If you need * to change anything in here, edit sre_constants.py and run it. * - * Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved. + * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved. * * See the _sre.c file for information on usage and redistribution. */ """) + f.write("#define SRE_MAGIC %d\n" % MAGIC) + dump(f, OPCODES, "SRE_OP") dump(f, ATCODES, "SRE") dump(f, CHCODES, "SRE") diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py index 7c36d4f2dcb2..44626bd5e82f 100644 --- a/Lib/sre_parse.py +++ b/Lib/sre_parse.py @@ -3,11 +3,14 @@ # # convert re-style regular expression to sre pattern # -# Copyright (c) 1998-2000 by Secret Labs AB. All rights reserved. +# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved. # # See the sre.py file for information on usage and redistribution. # +# XXX: show string offset and offending character for all errors + +# this module works under 1.5.2 and later. 
don't use string methods import string, sys from sre_constants import * @@ -23,18 +26,18 @@ HEXDIGITS = tuple("0123456789abcdefABCDEF") WHITESPACE = tuple(" \t\n\r\v\f") ESCAPES = { - r"\a": (LITERAL, 7), - r"\b": (LITERAL, 8), - r"\f": (LITERAL, 12), - r"\n": (LITERAL, 10), - r"\r": (LITERAL, 13), - r"\t": (LITERAL, 9), - r"\v": (LITERAL, 11), + r"\a": (LITERAL, ord("\a")), + r"\b": (LITERAL, ord("\b")), + r"\f": (LITERAL, ord("\f")), + r"\n": (LITERAL, ord("\n")), + r"\r": (LITERAL, ord("\r")), + r"\t": (LITERAL, ord("\t")), + r"\v": (LITERAL, ord("\v")), r"\\": (LITERAL, ord("\\")) } CATEGORIES = { - r"\A": (AT, AT_BEGINNING), # start of string + r"\A": (AT, AT_BEGINNING_STRING), # start of string r"\b": (AT, AT_BOUNDARY), r"\B": (AT, AT_NON_BOUNDARY), r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]), @@ -43,7 +46,7 @@ CATEGORIES = { r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]), r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]), r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]), - r"\Z": (AT, AT_END), # end of string + r"\Z": (AT, AT_END_STRING), # end of string } FLAGS = { @@ -58,18 +61,31 @@ FLAGS = { "u": SRE_FLAG_UNICODE, } +# figure out best way to convert hex/octal numbers to integers +try: + int("10", 8) + atoi = int # 2.0 and later +except TypeError: + atoi = string.atoi # 1.5.2 + class Pattern: # master pattern object. 
keeps track of global attributes def __init__(self): self.flags = 0 + self.open = [] self.groups = 1 self.groupdict = {} - def getgroup(self, name=None): + def opengroup(self, name=None): gid = self.groups self.groups = gid + 1 if name: self.groupdict[name] = gid + self.open.append(gid) return gid + def closegroup(self, gid): + self.open.remove(gid) + def checkgroup(self, gid): + return gid < self.groups and gid not in self.open class SubPattern: # a subpattern, in intermediate form @@ -208,7 +224,7 @@ def isname(name): def _group(escape, groups): # check if the escape string represents a valid group try: - gid = int(escape[1:]) + gid = atoi(escape[1:]) if gid and gid < groups: return gid except ValueError: @@ -231,13 +247,13 @@ def _class_escape(source, escape): escape = escape[2:] if len(escape) != 2: raise error, "bogus escape: %s" % repr("\\" + escape) - return LITERAL, int(escape, 16) & 0xff + return LITERAL, atoi(escape, 16) & 0xff elif str(escape[1:2]) in OCTDIGITS: # octal escape (up to three digits) while source.next in OCTDIGITS and len(escape) < 5: escape = escape + source.get() escape = escape[1:] - return LITERAL, int(escape, 8) & 0xff + return LITERAL, atoi(escape, 8) & 0xff if len(escape) == 2: return LITERAL, ord(escape[1]) except ValueError: @@ -259,12 +275,12 @@ def _escape(source, escape, state): escape = escape + source.get() if len(escape) != 4: raise ValueError - return LITERAL, int(escape[2:], 16) & 0xff + return LITERAL, atoi(escape[2:], 16) & 0xff elif escape[1:2] == "0": # octal escape while source.next in OCTDIGITS and len(escape) < 4: escape = escape + source.get() - return LITERAL, int(escape[1:], 8) & 0xff + return LITERAL, atoi(escape[1:], 8) & 0xff elif escape[1:2] in DIGITS: # octal escape *or* decimal group reference (sigh) here = source.tell() @@ -274,10 +290,12 @@ def _escape(source, escape, state): source.next in OCTDIGITS): # got three octal digits; this is an octal escape escape = escape + source.get() - return LITERAL, 
int(escape[1:], 8) & 0xff + return LITERAL, atoi(escape[1:], 8) & 0xff # got at least one decimal digit; this is a group reference group = _group(escape, state.groups) if group: + if not state.checkgroup(group): + raise error, "cannot refer to open group" return GROUPREF, group raise ValueError if len(escape) == 2: @@ -402,24 +420,24 @@ def _parse(source, state): else: code2 = LITERAL, ord(this) if code1[0] != LITERAL or code2[0] != LITERAL: - raise error, "illegal range" + raise error, "bad character range" lo = code1[1] hi = code2[1] if hi < lo: - raise error, "illegal range" + raise error, "bad character range" set.append((RANGE, (lo, hi))) else: if code1[0] is IN: code1 = code1[1][0] set.append(code1) - # FIXME: move set optimization to compiler! + # XXX: should move set optimization to compiler! if len(set)==1 and set[0][0] is LITERAL: subpattern.append(set[0]) # optimization elif len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL: subpattern.append((NOT_LITERAL, set[1][1])) # optimization else: - # FIXME: add charmap optimization + # XXX: should add charmap optimization here subpattern.append((IN, set)) elif this and this[0] in REPEAT_CHARS: @@ -428,6 +446,7 @@ def _parse(source, state): min, max = 0, 1 elif this == "*": min, max = 0, MAXREPEAT + elif this == "+": min, max = 1, MAXREPEAT elif this == "{": @@ -446,17 +465,22 @@ def _parse(source, state): source.seek(here) continue if lo: - min = int(lo) + min = atoi(lo) if hi: - max = int(hi) - # FIXME: check that hi >= lo! 
+ max = atoi(hi) + if max < min: + raise error, "bad repeat interval" else: raise error, "not supported" # figure out which item to repeat if subpattern: item = subpattern[-1:] else: + item = None + if not item or (len(item) == 1 and item[0][0] == AT): raise error, "nothing to repeat" + if item[0][0] in (MIN_REPEAT, MAX_REPEAT): + raise error, "multiple repeat" if source.match("?"): subpattern[-1] = (MIN_REPEAT, (min, max, item)) else: @@ -485,7 +509,7 @@ def _parse(source, state): name = name + char group = 1 if not isname(name): - raise error, "illegal character in group name" + raise error, "bad character in group name" elif source.match("="): # named backreference name = "" @@ -497,7 +521,7 @@ def _parse(source, state): break name = name + char if not isname(name): - raise error, "illegal character in group name" + raise error, "bad character in group name" gid = state.groupdict.get(name) if gid is None: raise error, "unknown group name" @@ -539,6 +563,8 @@ def _parse(source, state): continue else: # flags + if not FLAGS.has_key(source.next): + raise error, "unexpected end of pattern" while FLAGS.has_key(source.next): state.flags = state.flags | FLAGS[source.get()] if group: @@ -547,15 +573,19 @@ def _parse(source, state): # anonymous group group = None else: - group = state.getgroup(name) + group = state.opengroup(name) p = _parse_sub(source, state) if not source.match(")"): raise error, "unbalanced parenthesis" + if group is not None: + state.closegroup(group) subpattern.append((SUBPATTERN, (group, p))) else: while 1: char = source.get() - if char is None or char == ")": + if char is None: + raise error, "unexpected end of pattern" + if char == ")": break raise error, "unknown extension" @@ -582,6 +612,7 @@ def parse(str, flags=0, pattern=None): if pattern is None: pattern = Pattern() pattern.flags = flags + pattern.str = str p = _parse_sub(source, pattern, 0) @@ -591,7 +622,8 @@ def parse(str, flags=0, pattern=None): elif tail: raise error, "bogus characters 
at end of regular expression" - # p.dump() + if flags & SRE_FLAG_DEBUG: + p.dump() if not (flags & SRE_FLAG_VERBOSE) and p.pattern.flags & SRE_FLAG_VERBOSE: # the VERBOSE flag was switched on inside the pattern. to be @@ -606,6 +638,16 @@ def parse_template(source, pattern): s = Tokenizer(source) p = [] a = p.append + def literal(literal, p=p): + if p and p[-1][0] is LITERAL: + p[-1] = LITERAL, p[-1][1] + literal + else: + p.append((LITERAL, literal)) + sep = source[:0] + if type(sep) is type(""): + char = chr + else: + char = unichr while 1: this = s.get() if this is None: @@ -625,10 +667,10 @@ def parse_template(source, pattern): if not name: raise error, "bad group name" try: - index = int(name) + index = atoi(name) except ValueError: if not isname(name): - raise error, "illegal character in group name" + raise error, "bad character in group name" try: index = pattern.groupindex[name] except KeyError: @@ -641,7 +683,7 @@ def parse_template(source, pattern): if group: if (s.next not in DIGITS or not _group(this + s.next, pattern.groups+1)): - code = MARK, int(group) + code = MARK, group break elif s.next in OCTDIGITS: this = this + s.get() @@ -649,34 +691,42 @@ def parse_template(source, pattern): break if not code: this = this[1:] - code = LITERAL, int(this[-6:], 8) & 0xff - a(code) + code = LITERAL, char(atoi(this[-6:], 8) & 0xff) + if code[0] is LITERAL: + literal(code[1]) + else: + a(code) else: try: - a(ESCAPES[this]) + this = char(ESCAPES[this][1]) except KeyError: - for c in this: - a((LITERAL, ord(c))) + pass + literal(this) else: - a((LITERAL, ord(this))) - return p + literal(this) + # convert template to groups and literals lists + i = 0 + groups = [] + literals = [] + for c, s in p: + if c is MARK: + groups.append((i, s)) + literals.append(None) + else: + literals.append(s) + i = i + 1 + return groups, literals def expand_template(template, match): - # FIXME: this is sooooo slow. 
drop in the slicelist - # code instead - p = [] - a = p.append + g = match.group sep = match.string[:0] - if type(sep) is type(""): - char = chr - else: - char = unichr - for c, s in template: - if c is LITERAL: - a(char(s)) - elif c is MARK: - s = match.group(s) + groups, literals = template + literals = literals[:] + try: + for index, group in groups: + literals[index] = s = g(group) if s is None: - raise error, "empty group" - a(s) - return string.join(p, sep) + raise IndexError + except IndexError: + raise error, "empty group" + return string.join(literals, sep) diff --git a/Misc/NEWS b/Misc/NEWS index e0a85dde6a14..6a88fa778c30 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -13,6 +13,8 @@ http://sourceforge.net/bugs/?func=detailbug&bug_id=&group_id=5470 More recent bugs are accessed as http://sourceforge.net/tracker/index.php?func=detail&aid=&group_id=5470&atid=105470 +- Brought SRE up to date with Python 2.1 + - #117278, #117167: _tkinter - #116172, curses module fails to build on SGI, _curses diff --git a/Modules/_sre.c b/Modules/_sre.c index b72b8b2c1a6d..308b7260b57f 100644 --- a/Modules/_sre.c +++ b/Modules/_sre.c @@ -5,14 +5,14 @@ * * partial history: * 1999-10-24 fl created (based on existing template matcher code) - * 2000-03-06 fl first alpha, sort of (0.5) - * 2000-06-30 fl added fast search optimization (0.9.3) - * 2000-06-30 fl added assert (lookahead) primitives, etc (0.9.4) - * 2000-07-02 fl added charset optimizations, etc (0.9.5) + * 2000-03-06 fl first alpha, sort of + * 2000-06-30 fl added fast search optimization + * 2000-06-30 fl added assert (lookahead) primitives, etc + * 2000-07-02 fl added charset optimizations, etc * 2000-07-03 fl store code in pattern object, lookbehind, etc * 2000-07-08 fl added regs attribute - * 2000-07-21 fl reset lastindex in scanner methods (0.9.6) - * 2000-08-01 fl fixes for 1.6b1 (0.9.8) + * 2000-07-21 fl reset lastindex in scanner methods + * 2000-08-01 fl fixes for 1.6b1 * 2000-08-03 fl added recursion limit * 
2000-08-07 fl use PyOS_CheckStack() if available * 2000-08-08 fl changed findall to return empty strings instead of None @@ -21,8 +21,15 @@ * 2000-09-20 fl added expand method * 2000-09-21 fl don't use the buffer interface for unicode strings * 2000-10-03 fl fixed assert_not primitive; support keyword arguments + * 2000-10-24 fl really fixed assert_not; reset groups in findall + * 2000-12-21 fl fixed memory leak in groupdict + * 2001-01-02 fl properly reset pointer after failed assertion in MIN_UNTIL + * 2001-01-15 fl avoid recursion for MIN_UNTIL; fixed uppercase literal bug + * 2001-01-16 fl fixed memory leak in pattern destructor + * 2001-03-20 fl lots of fixes for 2.1b2 + * 2001-04-15 fl export copyright as Python attribute, not global * - * Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved. + * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved. * * This version of the SRE library can be redistributed under CNRI's * Python 1.6 license. For any other use, please contact Secret Labs @@ -35,7 +42,8 @@ #ifndef SRE_RECURSIVE -char copyright[] = " SRE 0.9.8 Copyright (c) 1997-2000 by Secret Labs AB "; +static char copyright[] = + " SRE 2.1b2 Copyright (c) 1997-2001 by Secret Labs AB "; #include "Python.h" @@ -44,7 +52,9 @@ char copyright[] = " SRE 0.9.8 Copyright (c) 1997-2000 by Secret Labs AB "; #include /* name of this module, minus the leading underscore */ -#define MODULE "sre" +#if !defined(SRE_MODULE) +#define SRE_MODULE "sre" +#endif /* defining this one enables tracing */ #undef VERBOSE @@ -76,6 +86,10 @@ char copyright[] = " SRE 0.9.8 Copyright (c) 1997-2000 by Secret Labs AB "; /* enables aggressive inlining (always on for Visual C) */ #undef USE_INLINE +#if PY_VERSION_HEX < 0x01060000 +#define PyObject_DEL(op) PyMem_DEL((op)) +#endif + /* -------------------------------------------------------------------- */ #if defined(_MSC_VER) @@ -130,11 +144,6 @@ static char sre_char_lower[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 106, 107, 
108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127 }; -static unsigned int sre_lower(unsigned int ch) -{ - return ((ch) < 128 ? sre_char_lower[ch] : ch); -} - #define SRE_IS_DIGIT(ch)\ ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0) #define SRE_IS_SPACE(ch)\ @@ -146,30 +155,39 @@ static unsigned int sre_lower(unsigned int ch) #define SRE_IS_WORD(ch)\ ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0) -/* locale-specific character predicates */ - -static unsigned int sre_lower_locale(unsigned int ch) +static unsigned int sre_lower(unsigned int ch) { - return ((ch) < 256 ? tolower((ch)) : ch); + return ((ch) < 128 ? sre_char_lower[ch] : ch); } + +/* locale-specific character predicates */ + #define SRE_LOC_IS_DIGIT(ch) ((ch) < 256 ? isdigit((ch)) : 0) #define SRE_LOC_IS_SPACE(ch) ((ch) < 256 ? isspace((ch)) : 0) #define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n') #define SRE_LOC_IS_ALNUM(ch) ((ch) < 256 ? isalnum((ch)) : 0) #define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_') +static unsigned int sre_lower_locale(unsigned int ch) +{ + return ((ch) < 256 ? 
tolower((ch)) : ch); +} + /* unicode-specific character predicates */ #if defined(HAVE_UNICODE) -static unsigned int sre_lower_unicode(unsigned int ch) -{ - return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch)); -} + #define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch)) #define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch)) #define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch)) #define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch)) #define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_') + +static unsigned int sre_lower_unicode(unsigned int ch) +{ + return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch)); +} + #endif LOCAL(int) @@ -216,6 +234,23 @@ sre_category(SRE_CODE category, unsigned int ch) return SRE_UNI_IS_LINEBREAK(ch); case SRE_CATEGORY_UNI_NOT_LINEBREAK: return !SRE_UNI_IS_LINEBREAK(ch); +#else + case SRE_CATEGORY_UNI_DIGIT: + return SRE_IS_DIGIT(ch); + case SRE_CATEGORY_UNI_NOT_DIGIT: + return !SRE_IS_DIGIT(ch); + case SRE_CATEGORY_UNI_SPACE: + return SRE_IS_SPACE(ch); + case SRE_CATEGORY_UNI_NOT_SPACE: + return !SRE_IS_SPACE(ch); + case SRE_CATEGORY_UNI_WORD: + return SRE_LOC_IS_WORD(ch); + case SRE_CATEGORY_UNI_NOT_WORD: + return !SRE_LOC_IS_WORD(ch); + case SRE_CATEGORY_UNI_LINEBREAK: + return SRE_IS_LINEBREAK(ch); + case SRE_CATEGORY_UNI_NOT_LINEBREAK: + return !SRE_IS_LINEBREAK(ch); #endif } return 0; @@ -354,6 +389,7 @@ SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at) switch (at) { case SRE_AT_BEGINNING: + case SRE_AT_BEGINNING_STRING: return ((void*) ptr == state->beginning); case SRE_AT_BEGINNING_LINE: @@ -369,6 +405,9 @@ SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at) return ((void*) ptr == state->end || SRE_IS_LINEBREAK((int) ptr[0])); + case SRE_AT_END_STRING: + return ((void*) ptr == state->end); + case SRE_AT_BOUNDARY: if (state->beginning == state->end) return 0; @@ -386,6 +425,42 @@ SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at) this = ((void*) ptr < 
state->end) ? SRE_IS_WORD((int) ptr[0]) : 0; return this == that; + + case SRE_AT_LOC_BOUNDARY: + if (state->beginning == state->end) + return 0; + that = ((void*) ptr > state->beginning) ? + SRE_LOC_IS_WORD((int) ptr[-1]) : 0; + this = ((void*) ptr < state->end) ? + SRE_LOC_IS_WORD((int) ptr[0]) : 0; + return this != that; + + case SRE_AT_LOC_NON_BOUNDARY: + if (state->beginning == state->end) + return 0; + that = ((void*) ptr > state->beginning) ? + SRE_LOC_IS_WORD((int) ptr[-1]) : 0; + this = ((void*) ptr < state->end) ? + SRE_LOC_IS_WORD((int) ptr[0]) : 0; + return this == that; + + case SRE_AT_UNI_BOUNDARY: + if (state->beginning == state->end) + return 0; + that = ((void*) ptr > state->beginning) ? + SRE_UNI_IS_WORD((int) ptr[-1]) : 0; + this = ((void*) ptr < state->end) ? + SRE_UNI_IS_WORD((int) ptr[0]) : 0; + return this != that; + + case SRE_AT_UNI_NON_BOUNDARY: + if (state->beginning == state->end) + return 0; + that = ((void*) ptr > state->beginning) ? + SRE_UNI_IS_WORD((int) ptr[-1]) : 0; + this = ((void*) ptr < state->end) ? + SRE_UNI_IS_WORD((int) ptr[0]) : 0; + return this == that; } return 0; @@ -783,13 +858,13 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level) /* */ TRACE(("|%p|%p|ASSERT_NOT %d\n", pattern, ptr, pattern[1])); state->ptr = ptr - pattern[1]; - if (state->ptr < state->beginning) - return 0; - i = SRE_MATCH(state, pattern + 2, level + 1); - if (i < 0) - return i; - if (i) - return 0; + if (state->ptr >= state->beginning) { + i = SRE_MATCH(state, pattern + 2, level + 1); + if (i < 0) + return i; + if (i) + return 0; + } pattern += pattern[0]; break; @@ -825,7 +900,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level) /* this operator only works if the repeated item is exactly one character wide, and we're not already collecting backtracking points. 
for other cases, - use the MAX_REPEAT operator instead */ + use the MAX_REPEAT operator */ /* <1=min> <2=max> item tail */ @@ -899,7 +974,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level) case SRE_OP_REPEAT: /* create repeat context. all the hard work is done - by the UNTIL operator */ + by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */ /* <1=min> <2=max> item tail */ TRACE(("|%p|%p|REPEAT %d %d\n", pattern, ptr, pattern[1], pattern[2])); @@ -973,6 +1048,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level) if (i) return i; state->repeat = rp; + state->ptr = ptr; return 0; case SRE_OP_MIN_UNTIL: @@ -985,7 +1061,8 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level) count = rp->count + 1; - TRACE(("|%p|%p|MIN_UNTIL %d\n", pattern, ptr, count)); + TRACE(("|%p|%p|MIN_UNTIL %d %p\n", pattern, ptr, count, + rp->pattern)); state->ptr = ptr; @@ -1003,11 +1080,23 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level) /* see if the tail matches */ state->repeat = rp->prev; - i = SRE_MATCH(state, pattern, level + 1); + /* FIXME: the following fix doesn't always work (#133283) */ + if (0 && rp->pattern[2] == 65535) { + /* unbounded repeat */ + for (;;) { + i = SRE_MATCH(state, pattern, level + 1); + if (i || ptr >= end) + break; + state->ptr = ++ptr; + } + } else + i = SRE_MATCH(state, pattern, level + 1); if (i) { /* free(rp); */ return i; } + + state->ptr = ptr; state->repeat = rp; if (count >= rp->pattern[2] && rp->pattern[2] != 65535) @@ -1019,6 +1108,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level) if (i) return i; rp->count = count - 1; + state->ptr = ptr; return 0; default: @@ -1185,35 +1275,26 @@ _compile(PyObject* self_, PyObject* args) int groups = 0; PyObject* groupindex = NULL; PyObject* indexgroup = NULL; - if (!PyArg_ParseTuple(args, "OiO|iOO", &pattern, &flags, &code, - &groups, &groupindex, &indexgroup)) - return NULL; - - code = PySequence_Fast(code, "code argument must be a sequence"); - if (!code) + if 
(!PyArg_ParseTuple(args, "OiO!|iOO", &pattern, &flags, + &PyList_Type, &code, &groups, + &groupindex, &indexgroup)) return NULL; -#if PY_VERSION_HEX >= 0x01060000 - n = PySequence_Size(code); -#else - n = PySequence_Length(code); -#endif + n = PyList_GET_SIZE(code); - self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, 100*n); - if (!self) { - Py_DECREF(code); + self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n); + if (!self) return NULL; - } for (i = 0; i < n; i++) { - PyObject *o = PySequence_Fast_GET_ITEM(code, i); + PyObject *o = PyList_GET_ITEM(code, i); self->code[i] = (SRE_CODE) PyInt_AsLong(o); } - Py_DECREF(code); - - if (PyErr_Occurred()) + if (PyErr_Occurred()) { + PyObject_DEL(self); return NULL; + } Py_INCREF(pattern); self->pattern = pattern; @@ -1245,9 +1326,11 @@ sre_getlower(PyObject* self, PyObject* args) return NULL; if (flags & SRE_FLAG_LOCALE) return Py_BuildValue("i", sre_lower_locale(character)); -#if defined(HAVE_UNICODE) if (flags & SRE_FLAG_UNICODE) +#if defined(HAVE_UNICODE) return Py_BuildValue("i", sre_lower_unicode(character)); +#else + return Py_BuildValue("i", sre_lower_locale(character)); #endif return Py_BuildValue("i", sre_lower(character)); } @@ -1355,9 +1438,11 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string, if (pattern->flags & SRE_FLAG_LOCALE) state->lower = sre_lower_locale; -#if defined(HAVE_UNICODE) else if (pattern->flags & SRE_FLAG_UNICODE) +#if defined(HAVE_UNICODE) state->lower = sre_lower_unicode; +#else + state->lower = sre_lower_locale; #endif else state->lower = sre_lower; @@ -1495,7 +1580,7 @@ pattern_scanner(PatternObject* pattern, PyObject* args) string = state_init(&self->state, pattern, string, start, end); if (!string) { - PyObject_Del(self); + PyObject_DEL(self); return NULL; } @@ -1510,6 +1595,7 @@ pattern_dealloc(PatternObject* self) { Py_XDECREF(self->pattern); Py_XDECREF(self->groupindex); + Py_XDECREF(self->indexgroup); PyObject_DEL(self); } @@ -1593,7 +1679,7 @@ 
call(char* function, PyObject* args) PyObject* func; PyObject* result; - name = PyString_FromString(MODULE); + name = PyString_FromString(SRE_MODULE); if (!name) return NULL; module = PyImport_Import(name); @@ -1680,6 +1766,8 @@ pattern_findall(PatternObject* self, PyObject* args, PyObject* kw) PyObject* item; + state_reset(&state); + state.ptr = state.start; if (state.charsize == 1) { @@ -1962,7 +2050,7 @@ match_groupdict(MatchObject* self, PyObject* args, PyObject* kw) PyObject* def = Py_None; static char* kwlist[] = { "default", NULL }; - if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def)) + if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def)) return NULL; result = PyDict_New(); @@ -1970,34 +2058,35 @@ match_groupdict(MatchObject* self, PyObject* args, PyObject* kw) return result; keys = PyMapping_Keys(self->pattern->groupindex); - if (!keys) { - Py_DECREF(result); - return NULL; - } + if (!keys) + goto failed; for (index = 0; index < PyList_GET_SIZE(keys); index++) { + int status; PyObject* key; - PyObject* item; + PyObject* value; key = PyList_GET_ITEM(keys, index); - if (!key) { - Py_DECREF(keys); - Py_DECREF(result); - return NULL; - } - item = match_getslice(self, key, def); - if (!item) { + if (!key) + goto failed; + value = match_getslice(self, key, def); + if (!value) { Py_DECREF(key); - Py_DECREF(keys); - Py_DECREF(result); - return NULL; + goto failed; } - /* FIXME: this can fail, right? 
*/ - PyDict_SetItem(result, key, item); + status = PyDict_SetItem(result, key, value); + Py_DECREF(value); + if (status < 0) + goto failed; } Py_DECREF(keys); return result; + +failed: + Py_DECREF(keys); + Py_DECREF(result); + return NULL; } static PyObject* @@ -2324,17 +2413,27 @@ static PyMethodDef _functions[] = { {NULL, NULL} }; -void -#if defined(WIN32) -__declspec(dllexport) -#endif +DL_EXPORT(void) init_sre(void) { + PyObject* m; + PyObject* d; + /* Patch object types */ Pattern_Type.ob_type = Match_Type.ob_type = Scanner_Type.ob_type = &PyType_Type; - Py_InitModule("_" MODULE, _functions); + m = Py_InitModule("_" SRE_MODULE, _functions); + d = PyModule_GetDict(m); + + PyDict_SetItemString( + d, "MAGIC", (PyObject*) PyInt_FromLong(SRE_MAGIC) + ); + + PyDict_SetItemString( + d, "copyright", (PyObject*) PyString_FromString(copyright) + ); + } #endif /* !defined(SRE_RECURSIVE) */ diff --git a/Modules/sre_constants.h b/Modules/sre_constants.h index 5c55c3dbd917..73bcb349711d 100644 --- a/Modules/sre_constants.h +++ b/Modules/sre_constants.h @@ -6,11 +6,12 @@ * NOTE: This file is generated by sre_constants.py. If you need * to change anything in here, edit sre_constants.py and run it. * - * Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved. + * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved. * * See the _sre.c file for information on usage and redistribution. 
*/ +#define SRE_MAGIC 20010320 #define SRE_OP_FAILURE 0 #define SRE_OP_SUCCESS 1 #define SRE_OP_ANY 2 @@ -42,10 +43,16 @@ #define SRE_OP_SUBPATTERN 28 #define SRE_AT_BEGINNING 0 #define SRE_AT_BEGINNING_LINE 1 -#define SRE_AT_BOUNDARY 2 -#define SRE_AT_NON_BOUNDARY 3 -#define SRE_AT_END 4 -#define SRE_AT_END_LINE 5 +#define SRE_AT_BEGINNING_STRING 2 +#define SRE_AT_BOUNDARY 3 +#define SRE_AT_NON_BOUNDARY 4 +#define SRE_AT_END 5 +#define SRE_AT_END_LINE 6 +#define SRE_AT_END_STRING 7 +#define SRE_AT_LOC_BOUNDARY 8 +#define SRE_AT_LOC_NON_BOUNDARY 9 +#define SRE_AT_UNI_BOUNDARY 10 +#define SRE_AT_UNI_NON_BOUNDARY 11 #define SRE_CATEGORY_DIGIT 0 #define SRE_CATEGORY_NOT_DIGIT 1 #define SRE_CATEGORY_SPACE 2 -- 2.47.3