#
# re-compatible interface for the sre matching engine
#
-# Copyright (c) 1998-2000 by Secret Labs AB. All rights reserved.
+# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
#
# This version of the SRE library can be redistributed under CNRI's
# Python 1.6 license. For any other use, please contact Secret Labs
# other compatibility work.
#
-# FIXME: change all FIXME's to XXX ;-)
-
import sre_compile
import sre_parse
+# public symbols
+__all__ = [ "match", "search", "sub", "subn", "split", "findall",
+ "compile", "purge", "template", "escape", "I", "L", "M", "S", "X",
+ "U", "IGNORECASE", "LOCALE", "MULTILINE", "DOTALL", "VERBOSE",
+ "UNICODE", "error" ]
+
+__version__ = "2.1b2"
+
+# this module works under 1.5.2 and later. don't use string methods
import string
# flags
-I = IGNORECASE = sre_compile.SRE_FLAG_IGNORECASE
-L = LOCALE = sre_compile.SRE_FLAG_LOCALE
-M = MULTILINE = sre_compile.SRE_FLAG_MULTILINE
-S = DOTALL = sre_compile.SRE_FLAG_DOTALL
-X = VERBOSE = sre_compile.SRE_FLAG_VERBOSE
+I = IGNORECASE = sre_compile.SRE_FLAG_IGNORECASE # ignore case
+L = LOCALE = sre_compile.SRE_FLAG_LOCALE # assume current 8-bit locale
+U = UNICODE = sre_compile.SRE_FLAG_UNICODE # assume unicode locale
+M = MULTILINE = sre_compile.SRE_FLAG_MULTILINE # make anchors look for newline
+S = DOTALL = sre_compile.SRE_FLAG_DOTALL # make dot match newline
+X = VERBOSE = sre_compile.SRE_FLAG_VERBOSE # ignore whitespace and comments
-# sre extensions (may or may not be in 1.6/2.0 final)
-T = TEMPLATE = sre_compile.SRE_FLAG_TEMPLATE
-U = UNICODE = sre_compile.SRE_FLAG_UNICODE
+# sre extensions (experimental, don't rely on these)
+T = TEMPLATE = sre_compile.SRE_FLAG_TEMPLATE # disable backtracking
+DEBUG = sre_compile.SRE_FLAG_DEBUG # dump pattern after compilation
# sre exception
error = sre_compile.error
# --------------------------------------------------------------------
# public interface
-# FIXME: add docstrings
-
def match(pattern, string, flags=0):
+ """Try to apply the pattern at the start of the string, returning
+ a match object, or None if no match was found."""
return _compile(pattern, flags).match(string)
def search(pattern, string, flags=0):
+ """Scan through string looking for a match to the pattern, returning
+ a match object, or None if no match was found."""
return _compile(pattern, flags).search(string)
def sub(pattern, repl, string, count=0):
+ """Return the string obtained by replacing the leftmost
+ non-overlapping occurrences of the pattern in string by the
+ replacement repl. count is passed through to the compiled
+ pattern's sub method. This function takes no flags argument;
+ the pattern is compiled with flags=0."""
return _compile(pattern, 0).sub(repl, string, count)
def subn(pattern, repl, string, count=0):
+ """Return a 2-tuple containing (new_string, number).
+ new_string is the string obtained by replacing the leftmost
+ non-overlapping occurrences of the pattern in the source
+ string by the replacement repl. number is the number of
+ substitutions that were made. count is passed through to the
+ compiled pattern's subn method; the pattern is compiled with
+ flags=0."""
return _compile(pattern, 0).subn(repl, string, count)
def split(pattern, string, maxsplit=0):
+ """Split the source string by the occurrences of the pattern,
+ returning a list containing the resulting substrings. maxsplit
+ is passed through to the compiled pattern's split method; the
+ pattern is compiled with flags=0."""
return _compile(pattern, 0).split(string, maxsplit)
def findall(pattern, string, maxsplit=0):
+ """Return a list of all non-overlapping matches in the string.
+
+ If one or more groups are present in the pattern, return a
+ list of groups; this will be a list of tuples if the pattern
+ has more than one group.
+
+ Empty matches are included in the result."""
+ # NOTE(review): maxsplit is forwarded as the second positional
+ # argument of the compiled pattern's findall method; the name looks
+ # copied from split() -- confirm what that argument means there.
return _compile(pattern, 0).findall(string, maxsplit)
def compile(pattern, flags=0):
+ "Compile a regular expression pattern, returning a pattern object."
return _compile(pattern, flags)
def purge():
+ "Clear the regular expression and replacement template caches"
_cache.clear()
+ _cache_repl.clear()
def template(pattern, flags=0):
+ "Compile a template pattern (flags with T added), returning a pattern object"
return _compile(pattern, flags|T)
def escape(pattern):
+ "Escape all non-alphanumeric characters in pattern."
s = list(pattern)
for i in range(len(pattern)):
c = pattern[i]
# internals
_cache = {}
+_cache_repl = {}
+
_MAXCACHE = 100
def _join(seq, sep):
_cache[key] = p
return p
+def _compile_repl(*key):
+ # internal: compile replacement pattern. key is the (repl, pattern)
+ # argument tuple and doubles as the _cache_repl dictionary key.
+ p = _cache_repl.get(key)
+ if p is not None:
+ return p # cache hit
+ repl, pattern = key
+ try:
+ p = sre_parse.parse_template(repl, pattern)
+ except error, v:
+ raise error, v # invalid expression
+ if len(_cache_repl) >= _MAXCACHE:
+ _cache_repl.clear() # cache is full: drop every entry and start over
+ _cache_repl[key] = p
+ return p
+
def _expand(pattern, match, template):
# internal: match.expand implementation hook
template = sre_parse.parse_template(template, pattern)
if callable(template):
filter = template
else:
- template = sre_parse.parse_template(template, pattern)
+ template = _compile_repl(template, pattern)
def filter(match, template=template):
return sre_parse.expand_template(template, match)
n = i = 0
continue
append(string[i:b])
if g and b != e:
- extend(m.groups())
+ extend(list(m.groups()))
i = e
n = n + 1
append(string[i:])
break
action = self.lexicon[m.lastindex][1]
if callable(action):
- self.match = match
+ self.match = m
action = action(self, m.group())
if action is not None:
append(action)
#
# convert template to internal format
#
-# Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved.
+# Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
#
# See the sre.py file for information on usage and redistribution.
#
from sre_constants import *
+assert _sre.MAGIC == MAGIC, "SRE module mismatch"
+
MAXCODE = 65535
def _compile(code, pattern, flags):
if op in (LITERAL, NOT_LITERAL):
if flags & SRE_FLAG_IGNORECASE:
emit(OPCODES[OP_IGNORE[op]])
+ emit(_sre.getlower(av, flags))
else:
emit(OPCODES[op])
- emit(av)
+ emit(av)
elif op is IN:
if flags & SRE_FLAG_IGNORECASE:
emit(OPCODES[OP_IGNORE[op]])
elif op is AT:
emit(OPCODES[op])
if flags & SRE_FLAG_MULTILINE:
- emit(ATCODES[AT_MULTILINE.get(av, av)])
- else:
- emit(ATCODES[av])
+ av = AT_MULTILINE.get(av, av)
+ if flags & SRE_FLAG_LOCALE:
+ av = AT_LOCALE.get(av, av)
+ elif flags & SRE_FLAG_UNICODE:
+ av = AT_UNICODE.get(av, av)
+ emit(ATCODES[av])
elif op is BRANCH:
emit(OPCODES[op])
tail = []
elif op is CATEGORY:
emit(OPCODES[op])
if flags & SRE_FLAG_LOCALE:
- emit(CHCODES[CH_LOCALE[av]])
+ av = CH_LOCALE[av]
elif flags & SRE_FLAG_UNICODE:
- emit(CHCODES[CH_UNICODE[av]])
- else:
- emit(CHCODES[av])
+ av = CH_UNICODE[av]
+ emit(CHCODES[av])
elif op is GROUPREF:
if flags & SRE_FLAG_IGNORECASE:
emit(OPCODES[OP_IGNORE[op]])
for i in range(fixup(av[0]), fixup(av[1])+1):
charmap[i] = 1
elif op is CATEGORY:
- # FIXME: could append to charmap tail
+ # XXX: could append to charmap tail
return charset # cannot compress
except IndexError:
# character set contains unicode characters
# print code
- # FIXME: <fl> get rid of this limitation!
+ # XXX: <fl> get rid of this limitation!
assert p.pattern.groups <= 100,\
"sorry, but this version only supports 100 named groups"
# various symbols used by the regular expression engine.
# run this script to update the _sre include files!
#
-# Copyright (c) 1998-2000 by Secret Labs AB. All rights reserved.
+# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
#
# See the sre.py file for information on usage and redistribution.
#
+# update when constants are added or removed
+
+MAGIC = 20010320
+
+# max code word in this release
+
MAXREPEAT = 65535
+# SRE standard exception (access as sre.error)
# should this really be here?
class error(Exception):
# positions
AT_BEGINNING = "at_beginning"
AT_BEGINNING_LINE = "at_beginning_line"
+AT_BEGINNING_STRING = "at_beginning_string"
AT_BOUNDARY = "at_boundary"
AT_NON_BOUNDARY = "at_non_boundary"
AT_END = "at_end"
AT_END_LINE = "at_end_line"
+AT_END_STRING = "at_end_string"
+AT_LOC_BOUNDARY = "at_loc_boundary"
+AT_LOC_NON_BOUNDARY = "at_loc_non_boundary"
+AT_UNI_BOUNDARY = "at_uni_boundary"
+AT_UNI_NON_BOUNDARY = "at_uni_non_boundary"
# categories
CATEGORY_DIGIT = "category_digit"
]
ATCODES = [
- AT_BEGINNING, AT_BEGINNING_LINE, AT_BOUNDARY,
- AT_NON_BOUNDARY, AT_END, AT_END_LINE
+ AT_BEGINNING, AT_BEGINNING_LINE, AT_BEGINNING_STRING, AT_BOUNDARY,
+ AT_NON_BOUNDARY, AT_END, AT_END_LINE, AT_END_STRING,
+ AT_LOC_BOUNDARY, AT_LOC_NON_BOUNDARY, AT_UNI_BOUNDARY,
+ AT_UNI_NON_BOUNDARY
]
CHCODES = [
AT_END: AT_END_LINE
}
+AT_LOCALE = {
+ AT_BOUNDARY: AT_LOC_BOUNDARY,
+ AT_NON_BOUNDARY: AT_LOC_NON_BOUNDARY
+}
+
+AT_UNICODE = {
+ AT_BOUNDARY: AT_UNI_BOUNDARY,
+ AT_NON_BOUNDARY: AT_UNI_NON_BOUNDARY
+}
+
CH_LOCALE = {
CATEGORY_DIGIT: CATEGORY_DIGIT,
CATEGORY_NOT_DIGIT: CATEGORY_NOT_DIGIT,
SRE_FLAG_DOTALL = 16 # treat target as a single string
SRE_FLAG_UNICODE = 32 # use unicode locale
SRE_FLAG_VERBOSE = 64 # ignore whitespace and comments
+SRE_FLAG_DEBUG = 128 # debugging
# flags for INFO primitive
SRE_INFO_PREFIX = 1 # has prefix
* NOTE: This file is generated by sre_constants.py. If you need
* to change anything in here, edit sre_constants.py and run it.
*
- * Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved.
+ * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
*
* See the _sre.c file for information on usage and redistribution.
*/
""")
+ f.write("#define SRE_MAGIC %d\n" % MAGIC)
+
dump(f, OPCODES, "SRE_OP")
dump(f, ATCODES, "SRE")
dump(f, CHCODES, "SRE")
#
# convert re-style regular expression to sre pattern
#
-# Copyright (c) 1998-2000 by Secret Labs AB. All rights reserved.
+# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
#
# See the sre.py file for information on usage and redistribution.
#
+# XXX: show string offset and offending character for all errors
+
+# this module works under 1.5.2 and later. don't use string methods
import string, sys
from sre_constants import *
WHITESPACE = tuple(" \t\n\r\v\f")
ESCAPES = {
- r"\a": (LITERAL, 7),
- r"\b": (LITERAL, 8),
- r"\f": (LITERAL, 12),
- r"\n": (LITERAL, 10),
- r"\r": (LITERAL, 13),
- r"\t": (LITERAL, 9),
- r"\v": (LITERAL, 11),
+ r"\a": (LITERAL, ord("\a")),
+ r"\b": (LITERAL, ord("\b")),
+ r"\f": (LITERAL, ord("\f")),
+ r"\n": (LITERAL, ord("\n")),
+ r"\r": (LITERAL, ord("\r")),
+ r"\t": (LITERAL, ord("\t")),
+ r"\v": (LITERAL, ord("\v")),
r"\\": (LITERAL, ord("\\"))
}
CATEGORIES = {
- r"\A": (AT, AT_BEGINNING), # start of string
+ r"\A": (AT, AT_BEGINNING_STRING), # start of string
r"\b": (AT, AT_BOUNDARY),
r"\B": (AT, AT_NON_BOUNDARY),
r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
- r"\Z": (AT, AT_END), # end of string
+ r"\Z": (AT, AT_END_STRING), # end of string
}
FLAGS = {
"u": SRE_FLAG_UNICODE,
}
+# figure out best way to convert hex/octal numbers to integers
+try:
+ int("10", 8)
+ atoi = int # 2.0 and later
+except TypeError:
+ atoi = string.atoi # 1.5.2
+
class Pattern:
# master pattern object. keeps track of global attributes
def __init__(self):
self.flags = 0
+ self.open = [] # ids of groups that have been opened but not yet closed
self.groups = 1
self.groupdict = {}
- def getgroup(self, name=None):
+ def opengroup(self, name=None): # allocate the next group id and mark it open
gid = self.groups
self.groups = gid + 1
if name:
self.groupdict[name] = gid
+ self.open.append(gid)
return gid
+ def closegroup(self, gid): # mark a previously opened group as closed
+ self.open.remove(gid)
+ def checkgroup(self, gid): # true if gid is an allocated group that is not open
+ return gid < self.groups and gid not in self.open
class SubPattern:
# a subpattern, in intermediate form
def _group(escape, groups):
# check if the escape string represents a valid group
try:
- gid = int(escape[1:])
+ gid = atoi(escape[1:])
if gid and gid < groups:
return gid
except ValueError:
escape = escape[2:]
if len(escape) != 2:
raise error, "bogus escape: %s" % repr("\\" + escape)
- return LITERAL, int(escape, 16) & 0xff
+ return LITERAL, atoi(escape, 16) & 0xff
elif str(escape[1:2]) in OCTDIGITS:
# octal escape (up to three digits)
while source.next in OCTDIGITS and len(escape) < 5:
escape = escape + source.get()
escape = escape[1:]
- return LITERAL, int(escape, 8) & 0xff
+ return LITERAL, atoi(escape, 8) & 0xff
if len(escape) == 2:
return LITERAL, ord(escape[1])
except ValueError:
escape = escape + source.get()
if len(escape) != 4:
raise ValueError
- return LITERAL, int(escape[2:], 16) & 0xff
+ return LITERAL, atoi(escape[2:], 16) & 0xff
elif escape[1:2] == "0":
# octal escape
while source.next in OCTDIGITS and len(escape) < 4:
escape = escape + source.get()
- return LITERAL, int(escape[1:], 8) & 0xff
+ return LITERAL, atoi(escape[1:], 8) & 0xff
elif escape[1:2] in DIGITS:
# octal escape *or* decimal group reference (sigh)
here = source.tell()
source.next in OCTDIGITS):
# got three octal digits; this is an octal escape
escape = escape + source.get()
- return LITERAL, int(escape[1:], 8) & 0xff
+ return LITERAL, atoi(escape[1:], 8) & 0xff
# got at least one decimal digit; this is a group reference
group = _group(escape, state.groups)
if group:
+ if not state.checkgroup(group):
+ raise error, "cannot refer to open group"
return GROUPREF, group
raise ValueError
if len(escape) == 2:
else:
code2 = LITERAL, ord(this)
if code1[0] != LITERAL or code2[0] != LITERAL:
- raise error, "illegal range"
+ raise error, "bad character range"
lo = code1[1]
hi = code2[1]
if hi < lo:
- raise error, "illegal range"
+ raise error, "bad character range"
set.append((RANGE, (lo, hi)))
else:
if code1[0] is IN:
code1 = code1[1][0]
set.append(code1)
- # FIXME: <fl> move set optimization to compiler!
+ # XXX: <fl> should move set optimization to compiler!
if len(set)==1 and set[0][0] is LITERAL:
subpattern.append(set[0]) # optimization
elif len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL:
subpattern.append((NOT_LITERAL, set[1][1])) # optimization
else:
- # FIXME: <fl> add charmap optimization
+ # XXX: <fl> should add charmap optimization here
subpattern.append((IN, set))
elif this and this[0] in REPEAT_CHARS:
min, max = 0, 1
elif this == "*":
min, max = 0, MAXREPEAT
+
elif this == "+":
min, max = 1, MAXREPEAT
elif this == "{":
source.seek(here)
continue
if lo:
- min = int(lo)
+ min = atoi(lo)
if hi:
- max = int(hi)
- # FIXME: <fl> check that hi >= lo!
+ max = atoi(hi)
+ if max < min:
+ raise error, "bad repeat interval"
else:
raise error, "not supported"
# figure out which item to repeat
if subpattern:
item = subpattern[-1:]
else:
+ item = None
+ if not item or (len(item) == 1 and item[0][0] == AT):
raise error, "nothing to repeat"
+ if item[0][0] in (MIN_REPEAT, MAX_REPEAT):
+ raise error, "multiple repeat"
if source.match("?"):
subpattern[-1] = (MIN_REPEAT, (min, max, item))
else:
name = name + char
group = 1
if not isname(name):
- raise error, "illegal character in group name"
+ raise error, "bad character in group name"
elif source.match("="):
# named backreference
name = ""
break
name = name + char
if not isname(name):
- raise error, "illegal character in group name"
+ raise error, "bad character in group name"
gid = state.groupdict.get(name)
if gid is None:
raise error, "unknown group name"
continue
else:
# flags
+ if not FLAGS.has_key(source.next):
+ raise error, "unexpected end of pattern"
while FLAGS.has_key(source.next):
state.flags = state.flags | FLAGS[source.get()]
if group:
# anonymous group
group = None
else:
- group = state.getgroup(name)
+ group = state.opengroup(name)
p = _parse_sub(source, state)
if not source.match(")"):
raise error, "unbalanced parenthesis"
+ if group is not None:
+ state.closegroup(group)
subpattern.append((SUBPATTERN, (group, p)))
else:
while 1:
char = source.get()
- if char is None or char == ")":
+ if char is None:
+ raise error, "unexpected end of pattern"
+ if char == ")":
break
raise error, "unknown extension"
if pattern is None:
pattern = Pattern()
pattern.flags = flags
+ pattern.str = str
p = _parse_sub(source, pattern, 0)
elif tail:
raise error, "bogus characters at end of regular expression"
- # p.dump()
+ if flags & SRE_FLAG_DEBUG:
+ p.dump()
if not (flags & SRE_FLAG_VERBOSE) and p.pattern.flags & SRE_FLAG_VERBOSE:
# the VERBOSE flag was switched on inside the pattern. to be
s = Tokenizer(source)
p = []
a = p.append
+ def literal(literal, p=p):
+ if p and p[-1][0] is LITERAL:
+ p[-1] = LITERAL, p[-1][1] + literal
+ else:
+ p.append((LITERAL, literal))
+ sep = source[:0]
+ if type(sep) is type(""):
+ char = chr
+ else:
+ char = unichr
while 1:
this = s.get()
if this is None:
if not name:
raise error, "bad group name"
try:
- index = int(name)
+ index = atoi(name)
except ValueError:
if not isname(name):
- raise error, "illegal character in group name"
+ raise error, "bad character in group name"
try:
index = pattern.groupindex[name]
except KeyError:
if group:
if (s.next not in DIGITS or
not _group(this + s.next, pattern.groups+1)):
- code = MARK, int(group)
+ code = MARK, group
break
elif s.next in OCTDIGITS:
this = this + s.get()
break
if not code:
this = this[1:]
- code = LITERAL, int(this[-6:], 8) & 0xff
- a(code)
+ code = LITERAL, char(atoi(this[-6:], 8) & 0xff)
+ if code[0] is LITERAL:
+ literal(code[1])
+ else:
+ a(code)
else:
try:
- a(ESCAPES[this])
+ this = char(ESCAPES[this][1])
except KeyError:
- for c in this:
- a((LITERAL, ord(c)))
+ pass
+ literal(this)
else:
- a((LITERAL, ord(this)))
- return p
+ literal(this)
+ # convert template to groups and literals lists
+ i = 0
+ groups = []
+ literals = []
+ for c, s in p:
+ if c is MARK:
+ groups.append((i, s))
+ literals.append(None)
+ else:
+ literals.append(s)
+ i = i + 1
+ return groups, literals
def expand_template(template, match):
- # FIXME: <fl> this is sooooo slow. drop in the slicelist
- # code instead
- p = []
- a = p.append
+ g = match.group
sep = match.string[:0]
- if type(sep) is type(""):
- char = chr
- else:
- char = unichr
- for c, s in template:
- if c is LITERAL:
- a(char(s))
- elif c is MARK:
- s = match.group(s)
+ groups, literals = template # (index, group) pairs plus the literal list with None placeholders
+ literals = literals[:] # work on a copy; the list passed in is left unmodified
+ try:
+ for index, group in groups:
+ literals[index] = s = g(group) # fill the placeholder with the group's text
if s is None:
- raise error, "empty group"
- a(s)
- return string.join(p, sep)
+ raise IndexError # funnel into the handler below
+ except IndexError:
+ raise error, "empty group" # NOTE(review): presumably also hit if match.group raises IndexError -- confirm
+ return string.join(literals, sep)
More recent bugs are accessed as
http://sourceforge.net/tracker/index.php?func=detail&aid=<id>&group_id=5470&atid=105470
+- Brought SRE up to date with Python 2.1
+
- #117278, #117167: _tkinter
- #116172, curses module fails to build on SGI, _curses
*
* partial history:
* 1999-10-24 fl created (based on existing template matcher code)
- * 2000-03-06 fl first alpha, sort of (0.5)
- * 2000-06-30 fl added fast search optimization (0.9.3)
- * 2000-06-30 fl added assert (lookahead) primitives, etc (0.9.4)
- * 2000-07-02 fl added charset optimizations, etc (0.9.5)
+ * 2000-03-06 fl first alpha, sort of
+ * 2000-06-30 fl added fast search optimization
+ * 2000-06-30 fl added assert (lookahead) primitives, etc
+ * 2000-07-02 fl added charset optimizations, etc
* 2000-07-03 fl store code in pattern object, lookbehind, etc
* 2000-07-08 fl added regs attribute
- * 2000-07-21 fl reset lastindex in scanner methods (0.9.6)
- * 2000-08-01 fl fixes for 1.6b1 (0.9.8)
+ * 2000-07-21 fl reset lastindex in scanner methods
+ * 2000-08-01 fl fixes for 1.6b1
* 2000-08-03 fl added recursion limit
* 2000-08-07 fl use PyOS_CheckStack() if available
* 2000-08-08 fl changed findall to return empty strings instead of None
* 2000-09-20 fl added expand method
* 2000-09-21 fl don't use the buffer interface for unicode strings
* 2000-10-03 fl fixed assert_not primitive; support keyword arguments
+ * 2000-10-24 fl really fixed assert_not; reset groups in findall
+ * 2000-12-21 fl fixed memory leak in groupdict
+ * 2001-01-02 fl properly reset pointer after failed assertion in MIN_UNTIL
+ * 2001-01-15 fl avoid recursion for MIN_UNTIL; fixed uppercase literal bug
+ * 2001-01-16 fl fixed memory leak in pattern destructor
+ * 2001-03-20 fl lots of fixes for 2.1b2
+ * 2001-04-15 fl export copyright as Python attribute, not global
*
- * Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved.
+ * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
*
* This version of the SRE library can be redistributed under CNRI's
* Python 1.6 license. For any other use, please contact Secret Labs
#ifndef SRE_RECURSIVE
-char copyright[] = " SRE 0.9.8 Copyright (c) 1997-2000 by Secret Labs AB ";
+static char copyright[] =
+ " SRE 2.1b2 Copyright (c) 1997-2001 by Secret Labs AB ";
#include "Python.h"
#include <ctype.h>
/* name of this module, minus the leading underscore */
-#define MODULE "sre"
+#if !defined(SRE_MODULE)
+#define SRE_MODULE "sre"
+#endif
/* defining this one enables tracing */
#undef VERBOSE
/* enables aggressive inlining (always on for Visual C) */
#undef USE_INLINE
+#if PY_VERSION_HEX < 0x01060000
+#define PyObject_DEL(op) PyMem_DEL((op))
+#endif
+
/* -------------------------------------------------------------------- */
#if defined(_MSC_VER)
106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
120, 121, 122, 123, 124, 125, 126, 127 };
-static unsigned int sre_lower(unsigned int ch)
-{
- return ((ch) < 128 ? sre_char_lower[ch] : ch);
-}
-
#define SRE_IS_DIGIT(ch)\
((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
#define SRE_IS_SPACE(ch)\
#define SRE_IS_WORD(ch)\
((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
-/* locale-specific character predicates */
-
-static unsigned int sre_lower_locale(unsigned int ch)
+static unsigned int sre_lower(unsigned int ch)
{
- return ((ch) < 256 ? tolower((ch)) : ch);
+ return ((ch) < 128 ? sre_char_lower[ch] : ch); /* table lookup for ch < 128; larger values pass through unchanged */
}
+
+/* locale-specific character predicates */
+
#define SRE_LOC_IS_DIGIT(ch) ((ch) < 256 ? isdigit((ch)) : 0)
#define SRE_LOC_IS_SPACE(ch) ((ch) < 256 ? isspace((ch)) : 0)
#define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
#define SRE_LOC_IS_ALNUM(ch) ((ch) < 256 ? isalnum((ch)) : 0)
#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
+static unsigned int sre_lower_locale(unsigned int ch)
+{
+ return ((ch) < 256 ? tolower((ch)) : ch); /* tolower() only for ch < 256; larger values pass through unchanged */
+}
+
/* unicode-specific character predicates */
#if defined(HAVE_UNICODE)
-static unsigned int sre_lower_unicode(unsigned int ch)
-{
- return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
-}
+
#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
#define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
+
+static unsigned int sre_lower_unicode(unsigned int ch)
+{
+ return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch)); /* ch is cast to Py_UNICODE before lowering */
+}
+
#endif
LOCAL(int)
return SRE_UNI_IS_LINEBREAK(ch);
case SRE_CATEGORY_UNI_NOT_LINEBREAK:
return !SRE_UNI_IS_LINEBREAK(ch);
+#else
+ case SRE_CATEGORY_UNI_DIGIT:
+ return SRE_IS_DIGIT(ch);
+ case SRE_CATEGORY_UNI_NOT_DIGIT:
+ return !SRE_IS_DIGIT(ch);
+ case SRE_CATEGORY_UNI_SPACE:
+ return SRE_IS_SPACE(ch);
+ case SRE_CATEGORY_UNI_NOT_SPACE:
+ return !SRE_IS_SPACE(ch);
+ case SRE_CATEGORY_UNI_WORD:
+ return SRE_LOC_IS_WORD(ch);
+ case SRE_CATEGORY_UNI_NOT_WORD:
+ return !SRE_LOC_IS_WORD(ch);
+ case SRE_CATEGORY_UNI_LINEBREAK:
+ return SRE_IS_LINEBREAK(ch);
+ case SRE_CATEGORY_UNI_NOT_LINEBREAK:
+ return !SRE_IS_LINEBREAK(ch);
#endif
}
return 0;
switch (at) {
case SRE_AT_BEGINNING:
+ case SRE_AT_BEGINNING_STRING:
return ((void*) ptr == state->beginning);
case SRE_AT_BEGINNING_LINE:
return ((void*) ptr == state->end ||
SRE_IS_LINEBREAK((int) ptr[0]));
+ case SRE_AT_END_STRING:
+ return ((void*) ptr == state->end);
+
case SRE_AT_BOUNDARY:
if (state->beginning == state->end)
return 0;
this = ((void*) ptr < state->end) ?
SRE_IS_WORD((int) ptr[0]) : 0;
return this == that;
+
+ case SRE_AT_LOC_BOUNDARY:
+ if (state->beginning == state->end)
+ return 0;
+ that = ((void*) ptr > state->beginning) ?
+ SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
+ this = ((void*) ptr < state->end) ?
+ SRE_LOC_IS_WORD((int) ptr[0]) : 0;
+ return this != that;
+
+ case SRE_AT_LOC_NON_BOUNDARY:
+ if (state->beginning == state->end)
+ return 0;
+ that = ((void*) ptr > state->beginning) ?
+ SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
+ this = ((void*) ptr < state->end) ?
+ SRE_LOC_IS_WORD((int) ptr[0]) : 0;
+ return this == that;
+
+ case SRE_AT_UNI_BOUNDARY:
+ if (state->beginning == state->end)
+ return 0;
+ that = ((void*) ptr > state->beginning) ?
+ SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
+ this = ((void*) ptr < state->end) ?
+ SRE_UNI_IS_WORD((int) ptr[0]) : 0;
+ return this != that;
+
+ case SRE_AT_UNI_NON_BOUNDARY:
+ if (state->beginning == state->end)
+ return 0;
+ that = ((void*) ptr > state->beginning) ?
+ SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
+ this = ((void*) ptr < state->end) ?
+ SRE_UNI_IS_WORD((int) ptr[0]) : 0;
+ return this == that;
}
return 0;
/* <ASSERT_NOT> <skip> <back> <pattern> */
TRACE(("|%p|%p|ASSERT_NOT %d\n", pattern, ptr, pattern[1]));
state->ptr = ptr - pattern[1];
- if (state->ptr < state->beginning)
- return 0;
- i = SRE_MATCH(state, pattern + 2, level + 1);
- if (i < 0)
- return i;
- if (i)
- return 0;
+ if (state->ptr >= state->beginning) {
+ i = SRE_MATCH(state, pattern + 2, level + 1);
+ if (i < 0)
+ return i;
+ if (i)
+ return 0;
+ }
pattern += pattern[0];
break;
/* this operator only works if the repeated item is
exactly one character wide, and we're not already
collecting backtracking points. for other cases,
- use the MAX_REPEAT operator instead */
+ use the MAX_REPEAT operator */
/* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
case SRE_OP_REPEAT:
/* create repeat context. all the hard work is done
- by the UNTIL operator */
+ by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
/* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
TRACE(("|%p|%p|REPEAT %d %d\n", pattern, ptr,
pattern[1], pattern[2]));
if (i)
return i;
state->repeat = rp;
+ state->ptr = ptr;
return 0;
case SRE_OP_MIN_UNTIL:
count = rp->count + 1;
- TRACE(("|%p|%p|MIN_UNTIL %d\n", pattern, ptr, count));
+ TRACE(("|%p|%p|MIN_UNTIL %d %p\n", pattern, ptr, count,
+ rp->pattern));
state->ptr = ptr;
/* see if the tail matches */
state->repeat = rp->prev;
- i = SRE_MATCH(state, pattern, level + 1);
+ /* FIXME: the following fix doesn't always work (#133283) */
+ if (0 && rp->pattern[2] == 65535) {
+ /* unbounded repeat */
+ for (;;) {
+ i = SRE_MATCH(state, pattern, level + 1);
+ if (i || ptr >= end)
+ break;
+ state->ptr = ++ptr;
+ }
+ } else
+ i = SRE_MATCH(state, pattern, level + 1);
if (i) {
/* free(rp); */
return i;
}
+
+ state->ptr = ptr;
state->repeat = rp;
if (count >= rp->pattern[2] && rp->pattern[2] != 65535)
if (i)
return i;
rp->count = count - 1;
+ state->ptr = ptr;
return 0;
default:
int groups = 0;
PyObject* groupindex = NULL;
PyObject* indexgroup = NULL;
- if (!PyArg_ParseTuple(args, "OiO|iOO", &pattern, &flags, &code,
- &groups, &groupindex, &indexgroup))
- return NULL;
-
- code = PySequence_Fast(code, "code argument must be a sequence");
- if (!code)
+ if (!PyArg_ParseTuple(args, "OiO!|iOO", &pattern, &flags,
+ &PyList_Type, &code, &groups,
+ &groupindex, &indexgroup))
return NULL;
-#if PY_VERSION_HEX >= 0x01060000
- n = PySequence_Size(code);
-#else
- n = PySequence_Length(code);
-#endif
+ n = PyList_GET_SIZE(code);
- self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, 100*n);
- if (!self) {
- Py_DECREF(code);
+ self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
+ if (!self)
return NULL;
- }
for (i = 0; i < n; i++) {
- PyObject *o = PySequence_Fast_GET_ITEM(code, i);
+ PyObject *o = PyList_GET_ITEM(code, i);
self->code[i] = (SRE_CODE) PyInt_AsLong(o);
}
- Py_DECREF(code);
-
- if (PyErr_Occurred())
+ if (PyErr_Occurred()) {
+ PyObject_DEL(self);
return NULL;
+ }
Py_INCREF(pattern);
self->pattern = pattern;
return NULL;
if (flags & SRE_FLAG_LOCALE)
return Py_BuildValue("i", sre_lower_locale(character));
-#if defined(HAVE_UNICODE)
if (flags & SRE_FLAG_UNICODE)
+#if defined(HAVE_UNICODE)
return Py_BuildValue("i", sre_lower_unicode(character));
+#else
+ return Py_BuildValue("i", sre_lower_locale(character));
#endif
return Py_BuildValue("i", sre_lower(character));
}
if (pattern->flags & SRE_FLAG_LOCALE)
state->lower = sre_lower_locale;
-#if defined(HAVE_UNICODE)
else if (pattern->flags & SRE_FLAG_UNICODE)
+#if defined(HAVE_UNICODE)
state->lower = sre_lower_unicode;
+#else
+ state->lower = sre_lower_locale;
#endif
else
state->lower = sre_lower;
string = state_init(&self->state, pattern, string, start, end);
if (!string) {
- PyObject_Del(self);
+ PyObject_DEL(self);
return NULL;
}
{
Py_XDECREF(self->pattern);
Py_XDECREF(self->groupindex);
+ Py_XDECREF(self->indexgroup);
PyObject_DEL(self);
}
PyObject* func;
PyObject* result;
- name = PyString_FromString(MODULE);
+ name = PyString_FromString(SRE_MODULE);
if (!name)
return NULL;
module = PyImport_Import(name);
PyObject* item;
+ state_reset(&state);
+
state.ptr = state.start;
if (state.charsize == 1) {
PyObject* def = Py_None;
static char* kwlist[] = { "default", NULL };
- if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
+ if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
return NULL;
result = PyDict_New();
return result;
keys = PyMapping_Keys(self->pattern->groupindex);
- if (!keys) {
- Py_DECREF(result);
- return NULL;
- }
+ if (!keys)
+ goto failed;
for (index = 0; index < PyList_GET_SIZE(keys); index++) {
+ int status;
PyObject* key;
- PyObject* item;
+ PyObject* value;
key = PyList_GET_ITEM(keys, index);
- if (!key) {
- Py_DECREF(keys);
- Py_DECREF(result);
- return NULL;
- }
- item = match_getslice(self, key, def);
- if (!item) {
+ if (!key)
+ goto failed;
+ value = match_getslice(self, key, def);
+ if (!value) {
Py_DECREF(key);
- Py_DECREF(keys);
- Py_DECREF(result);
- return NULL;
+ goto failed;
}
- /* FIXME: <fl> this can fail, right? */
- PyDict_SetItem(result, key, item);
+ status = PyDict_SetItem(result, key, value);
+ Py_DECREF(value);
+ if (status < 0)
+ goto failed;
}
Py_DECREF(keys);
return result;
+
+failed:
+ Py_DECREF(keys);
+ Py_DECREF(result);
+ return NULL;
}
static PyObject*
{NULL, NULL}
};
-void
-#if defined(WIN32)
-__declspec(dllexport)
-#endif
+/* Module initialisation: patches the object types, registers the
+   functions in _functions, and publishes the MAGIC and copyright
+   attributes in the module dictionary. */
+DL_EXPORT(void)
init_sre(void)
{
+    PyObject* m;
+    PyObject* d;
+    PyObject* x;
+
/* Patch object types */
Pattern_Type.ob_type = Match_Type.ob_type =
Scanner_Type.ob_type = &PyType_Type;
- Py_InitModule("_" MODULE, _functions);
+    m = Py_InitModule("_" SRE_MODULE, _functions);
+    if (!m)
+        return; /* module creation failed; nothing to populate */
+    d = PyModule_GetDict(m);
+
+    /* PyDict_SetItemString does not take over the caller's reference
+       to the value, so each temporary is released here.  The
+       constructors may also return NULL, which must not be stored. */
+    x = PyInt_FromLong(SRE_MAGIC);
+    if (x) {
+        PyDict_SetItemString(d, "MAGIC", x);
+        Py_DECREF(x);
+    }
+
+    x = PyString_FromString(copyright);
+    if (x) {
+        PyDict_SetItemString(d, "copyright", x);
+        Py_DECREF(x);
+    }
+
}
#endif /* !defined(SRE_RECURSIVE) */
* NOTE: This file is generated by sre_constants.py. If you need
* to change anything in here, edit sre_constants.py and run it.
*
- * Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved.
+ * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
*
* See the _sre.c file for information on usage and redistribution.
*/
+#define SRE_MAGIC 20010320
#define SRE_OP_FAILURE 0
#define SRE_OP_SUCCESS 1
#define SRE_OP_ANY 2
#define SRE_OP_SUBPATTERN 28
#define SRE_AT_BEGINNING 0
#define SRE_AT_BEGINNING_LINE 1
-#define SRE_AT_BOUNDARY 2
-#define SRE_AT_NON_BOUNDARY 3
-#define SRE_AT_END 4
-#define SRE_AT_END_LINE 5
+#define SRE_AT_BEGINNING_STRING 2
+#define SRE_AT_BOUNDARY 3
+#define SRE_AT_NON_BOUNDARY 4
+#define SRE_AT_END 5
+#define SRE_AT_END_LINE 6
+#define SRE_AT_END_STRING 7
+#define SRE_AT_LOC_BOUNDARY 8
+#define SRE_AT_LOC_NON_BOUNDARY 9
+#define SRE_AT_UNI_BOUNDARY 10
+#define SRE_AT_UNI_NON_BOUNDARY 11
#define SRE_CATEGORY_DIGIT 0
#define SRE_CATEGORY_NOT_DIGIT 1
#define SRE_CATEGORY_SPACE 2