git.ipfire.org Git - thirdparty/Python/cpython.git/commitdiff
the mad patcher strikes again:
author Fredrik Lundh <fredrik@pythonware.com>
Fri, 30 Jun 2000 13:55:15 +0000 (13:55 +0000)
committer Fredrik Lundh <fredrik@pythonware.com>
Fri, 30 Jun 2000 13:55:15 +0000 (13:55 +0000)
-- added pickling support (only works if sre is imported)

-- fixed wordsize problems in engine
   (instead of casting literals down to the character size,
   cast characters up to the literal size (same as the code
   word size).  this prevents false hits when you're matching
   a unicode pattern against an 8-bit string.  unfortunately,
   this broke another test, but I think the test should be
   changed in this case; more on that on python-dev)

-- added sre.purge function
   (unofficial, clears the cache)

Lib/sre.py
Lib/sre_compile.py
Lib/sre_parse.py
Lib/test/output/test_sre
Modules/_sre.c

index 97a5140e91687c9598dd2bce6395bb16fd1cf4c4..5e6aeeb8533633890bc3cbbdfe8de7574b40ca2f 100644 (file)
@@ -89,6 +89,10 @@ def _compile(pattern, flags=0):
     _cache[key] = p
     return p
 
+def purge():
+    # clear pattern cache
+    _cache.clear()
+
 def _sub(pattern, template, string, count=0):
     # internal: pattern.sub implementation hook
     return _subn(pattern, template, string, count)[0]
@@ -142,3 +146,12 @@ def _split(pattern, string, maxsplit=0):
         n = n + 1
     append(string[i:])
     return s
+
+# register myself for pickling
+
+import copy_reg
+
+def _pickle(p):
+    return _compile, (p.pattern, p.flags)
+
+copy_reg.pickle(type(_compile("")), _pickle, _compile)
index 0829c00e2796190f84ce32b17979968b87deeef2..e48a7eb9901dea43845b56e7b5037d263df1b1b1 100644 (file)
@@ -31,15 +31,15 @@ def _compile(code, pattern, flags):
                 emit(OPCODES[OP_IGNORE[op]])
             else:
                 emit(OPCODES[op])
-            emit(ord(av))
+            emit(av)
         elif op is IN:
             if flags & SRE_FLAG_IGNORECASE:
                 emit(OPCODES[OP_IGNORE[op]])
                 def fixup(literal, flags=flags):
-                    return _sre.getlower(ord(literal), flags)
+                    return _sre.getlower(literal, flags)
             else:
                 emit(OPCODES[op])
-                fixup = ord
+               fixup = lambda x: x
             skip = len(code); emit(0)
             for op, av in av:
                 emit(OPCODES[op])
@@ -165,7 +165,7 @@ def _compile_info(code, pattern, flags):
     if not (flags & SRE_FLAG_IGNORECASE):
         for op, av in pattern.data:
             if op is LITERAL:
-                prefix.append(ord(av))
+                prefix.append(av)
             else:
                 break
     # add an info block
index d3dbe00041e15166ac11332cfe7ca52e33b0450d..fb954e994c6e28576ea878c522abde3f2c72ce9a 100644 (file)
@@ -19,6 +19,9 @@ from sre_constants import *
 # FIXME: should be 65535, but the arraymodule is still broken
 MAXREPEAT = 32767
 
+# FIXME: same here
+CHARMASK = 0x7fff
+
 SPECIAL_CHARS = ".\\[{()*+?^$|"
 REPEAT_CHARS  = "*+?{"
 
@@ -30,14 +33,14 @@ HEXDIGITS = tuple("0123456789abcdefABCDEF")
 WHITESPACE = string.whitespace
 
 ESCAPES = {
-    r"\a": (LITERAL, chr(7)),
-    r"\b": (LITERAL, chr(8)),
-    r"\f": (LITERAL, chr(12)),
-    r"\n": (LITERAL, chr(10)),
-    r"\r": (LITERAL, chr(13)),
-    r"\t": (LITERAL, chr(9)),
-    r"\v": (LITERAL, chr(11)),
-    r"\\": (LITERAL, "\\")
+    r"\a": (LITERAL, 7),
+    r"\b": (LITERAL, 8),
+    r"\f": (LITERAL, 12),
+    r"\n": (LITERAL, 10),
+    r"\r": (LITERAL, 13),
+    r"\t": (LITERAL, 9),
+    r"\v": (LITERAL, 11),
+    r"\\": (LITERAL, ord("\\"))
 }
 
 CATEGORIES = {
@@ -176,9 +179,6 @@ def isdigit(char):
 
 def isname(name):
     # check that group name is a valid string
-    # FIXME: <fl> this code is really lame.  should use a regular
-    # expression instead, but I seem to have certain bootstrapping
-    # problems here ;-)
     if not isident(name[0]):
         return 0
     for char in name:
@@ -209,16 +209,14 @@ def _class_escape(source, escape):
             while source.next in HEXDIGITS:
                 escape = escape + source.get()
             escape = escape[2:]
-            # FIXME: support unicode characters!
-            return LITERAL, chr(int(escape[-4:], 16) & 0xff)
+            return LITERAL, int(escape[-4:], 16) & CHARMASK
         elif str(escape[1:2]) in OCTDIGITS:
             while source.next in OCTDIGITS:
                 escape = escape + source.get()
             escape = escape[1:]
-            # FIXME: support unicode characters!
-            return LITERAL, chr(int(escape[-6:], 8) & 0xff)
+            return LITERAL, int(escape[-6:], 8) & CHARMASK
         if len(escape) == 2:
-            return LITERAL, escape[1]
+            return LITERAL, ord(escape[1])
     except ValueError:
         pass
     raise error, "bogus escape: %s" % repr(escape)
@@ -236,8 +234,7 @@ def _escape(source, escape, state):
             while source.next in HEXDIGITS:
                 escape = escape + source.get()
             escape = escape[2:]
-            # FIXME: support unicode characters!
-            return LITERAL, chr(int(escape[-4:], 16) & 0xff)
+            return LITERAL, int(escape[-4:], 16) & CHARMASK
         elif escape[1:2] in DIGITS:
             while 1:
                 group = _group(escape, state.groups)
@@ -251,17 +248,14 @@ def _escape(source, escape, state):
                 else:
                     break
             escape = escape[1:]
-            # FIXME: support unicode characters!
-            return LITERAL, chr(int(escape[-6:], 8) & 0xff)
+            return LITERAL, int(escape[-6:], 8) & CHARMASK
         if len(escape) == 2:
-            return LITERAL, escape[1]
+            return LITERAL, ord(escape[1])
     except ValueError:
         pass
     raise error, "bogus escape: %s" % repr(escape)
 
-
 def _branch(pattern, items):
-
     # form a branch operator from a set of items
 
     subpattern = SubPattern(pattern)
@@ -327,7 +321,7 @@ def _parse(source, state, flags=0):
                 continue
 
         if this and this[0] not in SPECIAL_CHARS:
-            subpattern.append((LITERAL, this))
+            subpattern.append((LITERAL, ord(this)))
 
         elif this == "[":
             # character set
@@ -345,7 +339,7 @@ def _parse(source, state, flags=0):
                 elif this and this[0] == "\\":
                     code1 = _class_escape(source, this)
                 elif this:
-                    code1 = LITERAL, this
+                    code1 = LITERAL, ord(this)
                 else:
                     raise error, "unexpected end of regular expression"
                 if source.match("-"):
@@ -353,17 +347,15 @@ def _parse(source, state, flags=0):
                     this = source.get()
                     if this == "]":
                         set.append(code1)
-                        set.append((LITERAL, "-"))
+                        set.append((LITERAL, ord("-")))
                         break
                     else:
                         if this[0] == "\\":
                             code2 = _class_escape(source, this)
                         else:
-                            code2 = LITERAL, this
+                            code2 = LITERAL, ord(this)
                         if code1[0] != LITERAL or code2[0] != LITERAL:
                             raise error, "illegal range"
-                        if len(code1[1]) != 1 or len(code2[1]) != 1:
-                            raise error, "illegal range"
                         set.append((RANGE, (code1[1], code2[1])))
                 else:
                     if code1[0] is IN:
@@ -605,17 +597,16 @@ def parse_template(source, pattern):
                         break
                 if not code:
                     this = this[1:]
-                    # FIXME: support unicode characters!
-                    code = LITERAL, chr(int(this[-6:], 8) & 0xff)
+                    code = LITERAL, int(this[-6:], 8) & CHARMASK
                 a(code)
             else:
                 try:
                     a(ESCAPES[this])
                 except KeyError:
                     for c in this:
-                        a((LITERAL, c))
+                        a((LITERAL, ord(c)))
         else:
-            a((LITERAL, this))
+            a((LITERAL, ord(this)))
     return p
 
 def expand_template(template, match):
@@ -623,12 +614,17 @@ def expand_template(template, match):
     # code instead
     p = []
     a = p.append
+    sep = match.string[:0]
+    if type(sep) is type(""):
+       char = chr
+    else:
+       char = unichr
     for c, s in template:
         if c is LITERAL:
-            a(s)
+            a(char(s))
         elif c is MARK:
             s = match.group(s)
             if s is None:
                 raise error, "empty group"
             a(s)
-    return match.string[:0].join(p)
+    return sep.join(p)
index d3732b5214985ede6609e0c6031770ded70d3325..10de93dd2051a82304491b0f90daec9cdd217445 100644 (file)
@@ -1,6 +1,5 @@
 test_sre
-test_support -- test failed re module pickle
-test_support -- test failed re module cPickle
+=== Failed incorrectly ('\\x00ffffffffffffff', '\377', 0, 'found', '\377')
 === Failed incorrectly ('^(.+)?B', 'AB', 0, 'g1', 'A')
 === Failed incorrectly ('(a+)+\\1', 'aa', 0, 'found+"-"+g1', 'aa-a')
 === grouping error ('([^/]*/)*sub1/', 'd:msgs/tdir/sub1/trial/away.cpp', 0, 'found+"-"+g1', 'd:msgs/tdir/sub1/-tdir/') 'd:msgs/tdir/sub1/-trial/' should be 'd:msgs/tdir/sub1/-tdir/'
index 22b6c7347c5d2c1ba8540f5b1413add087eafef7..268c5dd82b9ca34e16147f9a489248f2be777c50 100644 (file)
@@ -20,7 +20,7 @@
  * 00-06-28 fl fixed findall (0.9.1)
  * 00-06-29 fl fixed split, added more scanner features (0.9.2)
  * 00-06-30 fl tuning, fast search (0.9.3)
- * 00-06-30 fl added assert (lookahead) primitives (0.9.4)
+ * 00-06-30 fl added assert (lookahead) primitives, etc (0.9.4)
  *
  * Copyright (c) 1997-2000 by Secret Labs AB.  All rights reserved.
  *
@@ -339,7 +339,7 @@ SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
 }
 
 LOCAL(int)
-SRE_MEMBER(SRE_CODE* set, SRE_CHAR ch)
+SRE_MEMBER(SRE_CODE* set, SRE_CODE ch)
 {
        /* check if character is a member of the given set */
 
@@ -356,13 +356,13 @@ SRE_MEMBER(SRE_CODE* set, SRE_CHAR ch)
                        return !ok;
 
                case SRE_OP_LITERAL:
-                       if (ch == (SRE_CHAR) set[0])
+                       if (ch == set[0])
                                return ok;
                        set++;
                        break;
 
                case SRE_OP_RANGE:
-                       if ((SRE_CHAR) set[0] <= ch && ch <= (SRE_CHAR) set[1])
+                       if (set[0] <= ch && ch <= set[1])
                                return ok;
                        set += 2;
                        break;
@@ -455,8 +455,8 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
                case SRE_OP_LITERAL:
                        /* match literal string */
                        /* args: <code> */
-                       TRACE(("%8d: literal %c\n", PTR(ptr), (SRE_CHAR) pattern[0]));
-                       if (ptr >= end || *ptr != (SRE_CHAR) pattern[0])
+                       TRACE(("%8d: literal %c\n", PTR(ptr), pattern[0]));
+                       if (ptr >= end || (SRE_CODE) ptr[0] != pattern[0])
                                goto failure;
                        pattern++;
                        ptr++;
@@ -465,8 +465,8 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
                case SRE_OP_NOT_LITERAL:
                        /* match anything that is not literal character */
                        /* args: <code> */
-                       TRACE(("%8d: literal not %c\n", PTR(ptr), (SRE_CHAR) pattern[0]));
-                       if (ptr >= end || *ptr == (SRE_CHAR) pattern[0])
+                       TRACE(("%8d: literal not %c\n", PTR(ptr), pattern[0]));
+                       if (ptr >= end || (SRE_CODE) ptr[0] == pattern[0])
                                goto failure;
                        pattern++;
                        ptr++;
@@ -528,7 +528,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
                        break;
 
                case SRE_OP_LITERAL_IGNORE:
-                       TRACE(("%8d: literal lower(%c)\n", PTR(ptr), (SRE_CHAR) *pattern));
+                       TRACE(("%8d: literal lower(%c)\n", PTR(ptr), pattern[0]));
                        if (ptr >= end ||
                 state->lower(*ptr) != state->lower(*pattern))
                                goto failure;
@@ -537,8 +537,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
                        break;
 
                case SRE_OP_NOT_LITERAL_IGNORE:
-                       TRACE(("%8d: literal not lower(%c)\n", PTR(ptr),
-                   (SRE_CHAR) *pattern));
+                       TRACE(("%8d: literal not lower(%c)\n", PTR(ptr), pattern[0]));
                        if (ptr >= end ||
                 state->lower(*ptr) == state->lower(*pattern))
                                goto failure;
@@ -549,7 +548,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
                case SRE_OP_IN_IGNORE:
                        TRACE(("%8d: set lower(%c)\n", PTR(ptr), *ptr));
                        if (ptr >= end
-                               || !SRE_MEMBER(pattern+1, (SRE_CHAR) state->lower(*ptr)))
+                               || !SRE_MEMBER(pattern+1, (SRE_CODE) state->lower(*ptr)))
                                goto failure;
                        pattern += pattern[0];
                        ptr++;
@@ -631,9 +630,9 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
 
                        } else if (pattern[3] == SRE_OP_LITERAL) {
                                /* repeated literal */
-                               SRE_CHAR chr = (SRE_CHAR) pattern[4];
+                               SRE_CODE chr = pattern[4];
                                while (count < (int) pattern[2]) {
-                                       if (ptr >= end || *ptr != chr)
+                                       if (ptr >= end || (SRE_CODE) ptr[0] != chr)
                                                break;
                                        ptr++;
                                        count++;
@@ -641,9 +640,9 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
 
                        } else if (pattern[3] == SRE_OP_LITERAL_IGNORE) {
                                /* repeated literal */
-                               SRE_CHAR chr = (SRE_CHAR) pattern[4];
+                               SRE_CODE chr = pattern[4];
                                while (count < (int) pattern[2]) {
-                                       if (ptr >= end || (SRE_CHAR) state->lower(*ptr) != chr)
+                                       if (ptr >= end || (SRE_CODE) state->lower(*ptr) != chr)
                                                break;
                                        ptr++;
                                        count++;
@@ -651,9 +650,9 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
 
                        } else if (pattern[3] == SRE_OP_NOT_LITERAL) {
                                /* repeated non-literal */
-                               SRE_CHAR chr = (SRE_CHAR) pattern[4];
+                               SRE_CODE chr = pattern[4];
                                while (count < (int) pattern[2]) {
-                                       if (ptr >= end || *ptr == chr)
+                                       if (ptr >= end || (SRE_CODE) ptr[0] == chr)
                                                break;
                                        ptr++;
                                        count++;
@@ -661,9 +660,9 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
 
                        } else if (pattern[3] == SRE_OP_NOT_LITERAL_IGNORE) {
                                /* repeated non-literal */
-                               SRE_CHAR chr = (SRE_CHAR) pattern[4];
+                               SRE_CODE chr = pattern[4];
                                while (count < (int) pattern[2]) {
-                                       if (ptr >= end || (SRE_CHAR) state->lower(*ptr) == chr)
+                                       if (ptr >= end || (SRE_CODE) state->lower(ptr[0]) == chr)
                                                break;
                                        ptr++;
                                        count++;
@@ -712,7 +711,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
                        } else if (pattern[pattern[0]] == SRE_OP_LITERAL) {
                                /* tail starts with a literal. skip positions where
                                   the rest of the pattern cannot possibly match */
-                               SRE_CHAR chr = (SRE_CHAR) pattern[pattern[0]+1];
+                               SRE_CODE chr = pattern[pattern[0]+1];
                                TRACE(("%8d: tail is literal %d\n", PTR(ptr), chr));
                                for (;;) {
                                        TRACE(("%8d: scan for tail match\n", PTR(ptr)));
@@ -868,7 +867,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
                        TRACE(("%8d: branch\n", PTR(ptr)));
                        while (*pattern) {
                                if (pattern[1] != SRE_OP_LITERAL ||
-                                       (ptr < end && *ptr == (SRE_CHAR) pattern[2])) {
+                                       (ptr < end && (SRE_CODE) ptr[0] == pattern[2])) {
                                        TRACE(("%8d: branch check\n", PTR(ptr)));
                                        state->ptr = ptr;
                                        i = SRE_MATCH(state, pattern + 1);
@@ -976,7 +975,7 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
         end = state->end;
         while (ptr < end) {
             for (;;) {
-                if (*ptr != (SRE_CHAR) prefix[i]) {
+                if ((SRE_CODE) ptr[0] != prefix[i]) {
                     if (!i)
                         break;
                     else
@@ -1008,9 +1007,9 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
        if (pattern[0] == SRE_OP_LITERAL) {
                /* pattern starts with a literal character.  this is used for
            short prefixes, and if fast search is disabled*/
-               SRE_CHAR chr = (SRE_CHAR) pattern[1];
+               SRE_CODE chr = pattern[1];
                for (;;) {
-                       while (ptr < end && *ptr != chr)
+                       while (ptr < end && (SRE_CODE) ptr[0] != chr)
                                ptr++;
                        if (ptr == end)
                                return 0;