git.ipfire.org Git - thirdparty/Python/cpython.git/commitdiff
- actually enabled charset anchors in the engine (still not
author Fredrik Lundh <fredrik@pythonware.com>
Sun, 2 Jul 2000 17:33:27 +0000 (17:33 +0000)
committer Fredrik Lundh <fredrik@pythonware.com>
Sun, 2 Jul 2000 17:33:27 +0000 (17:33 +0000)
  used by the code generator)

- changed max repeat value in engine (to match earlier array fix)

- added experimental "which part matched?" mechanism to sre; see
  http://hem.passagen.se/eff/2000_07_01_bot-archive.htm#416954
  or python-dev for details.

Lib/sre.py
Lib/sre_compile.py
Lib/sre_constants.py
Lib/sre_parse.py
Modules/_sre.c
Modules/sre.h
Modules/sre_constants.h

index a09184b2a00f40f16925d23c76102b57cc986378..79f12a1b0f45c37990bbfa69e3d940600336552d 100644 (file)
@@ -155,3 +155,51 @@ def _pickle(p):
     return _compile, (p.pattern, p.flags)
 
 copy_reg.pickle(type(_compile("")), _pickle, _compile)
+
+# --------------------------------------------------------------------
+# experimental stuff (see python-dev discussions for details)
+
+class Scanner:
+    """Tokenizer driven by a lexicon of (phrase, action) pairs.
+
+    Every phrase is a regular expression; all phrases are joined into
+    a single alternation, each branch tagged with an index marker so
+    that match.index identifies which lexicon entry matched.
+    """
+    def __init__(self, lexicon):
+        # lexicon: sequence of (phrase, action) pairs.  action is either
+        # a callable taking (scanner, matched_text), or a plain value
+        # (None means "drop this token").
+        self.lexicon = lexicon
+        p = []
+        for phrase, action in lexicon:
+            # (?P#n) records n as the index of the branch that matched
+            p.append("(?:%s)(?P#%d)" % (phrase, len(p)))
+        # use the module-level _compile helper; the name "sre" is not
+        # guaranteed to be bound inside this module
+        self.scanner = _compile("|".join(p))
+    def scan(self, string):
+        # Returns (results, remainder): the list of non-None action
+        # results, and the unscanned tail of the string.
+        result = []
+        append = result.append
+        match = self.scanner.match
+        i = 0
+        while 1:
+            m = match(string, i)
+            if not m:
+                break
+            j = m.end()
+            if i == j:
+                # empty match: no progress was made, so stop scanning
+                break
+            action = self.lexicon[m.index][1]
+            if callable(action):
+                # expose the current match object (not the bound match
+                # method) so the action can inspect groups/positions
+                self.match = m
+                action = action(self, m.group())
+            if action is not None:
+                append(action)
+            i = j
+        return result, string[i:]
index a593ee73f05d682c3f9b0fe3e86219af7421c144..e5c501edd111eca6c3a69fd486fe88ad7b6ef036 100644 (file)
@@ -208,7 +208,7 @@ def _compile(code, pattern, flags):
             else:
                 emit(OPCODES[op])
             emit(av-1)
-        elif op is MARK:
+        elif op in (MARK, INDEX):
             emit(OPCODES[op])
             emit(av)
         else:
index f0e45ea5105c5fc64f81c711941f85de3765ecbd..076637d86d768716d2f603aafaeb02726a2c456a 100644 (file)
@@ -33,6 +33,7 @@ GROUP = "group"
 GROUP_IGNORE = "group_ignore"
 IN = "in"
 IN_IGNORE = "in_ignore"
+INDEX = "index"
 INFO = "info"
 JUMP = "jump"
 LITERAL = "literal"
@@ -90,6 +91,7 @@ OPCODES = [
     CATEGORY,
     CHARSET,
     GROUP, GROUP_IGNORE,
+    INDEX,
     IN, IN_IGNORE,
     INFO,
     JUMP,
index b2632563c75026ca67fec922ef6ac1dad24c3d99..81ca217a470fbc3a839610a6e266636fab85d992 100644 (file)
@@ -451,6 +451,23 @@ def _parse(source, state):
                         if gid is None:
                             raise error, "unknown group name"
                         subpattern.append((GROUP, gid))
+                    elif source.match("#"):
+                        index = ""
+                        while 1:
+                            char = source.get()
+                            if char is None:
+                                raise error, "unterminated index"
+                            if char == ")":
+                                break
+                            index = index + char
+                        try:
+                            index = int(index)
+                            if index < 0 or index > MAXREPEAT:
+                                raise ValueError
+                        except ValueError:
+                            raise error, "illegal index"
+                        subpattern.append((INDEX, index))
+                        continue
                     else:
                         char = source.get()
                         if char is None:
index 3bc023789a285acd1d7a8dda3724d95c2f4d5c9a..e11a8923dca3e00a34e24fd59bfea0afe7ed3dd3 100644 (file)
@@ -21,6 +21,7 @@
  * 00-06-29 fl fixed split, added more scanner features (0.9.2)
  * 00-06-30 fl added fast search optimization (0.9.3)
  * 00-06-30 fl added assert (lookahead) primitives, etc (0.9.4)
+ * 00-07-02 fl added charset optimizations, etc (0.9.5)
  *
  * Copyright (c) 1997-2000 by Secret Labs AB.  All rights reserved.
  *
@@ -31,7 +32,7 @@
 
 #ifndef SRE_RECURSIVE
 
-char copyright[] = " SRE 0.9.4 Copyright (c) 1997-2000 by Secret Labs AB ";
+char copyright[] = " SRE 0.9.5 Copyright (c) 1997-2000 by Secret Labs AB ";
 
 #include "Python.h"
 
@@ -587,6 +588,14 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
                        pattern++;
                        break;
 
+               case SRE_OP_INDEX:
+                       /* set index */
+                       /* args: <index> */
+                       TRACE(("%8d: set index %d\n", PTR(ptr), pattern[0]));
+            state->index = pattern[0];
+                       pattern++;
+                       break;
+
                case SRE_OP_JUMP:
                case SRE_OP_INFO:
                        /* jump forward */
@@ -810,7 +819,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
             /* match maximum number of items, pushing alternate end
                points to the stack */
 
-            while (pattern[2] == 32767 || count < (int) pattern[2]) {
+            while (pattern[2] == 65535 || count < (int) pattern[2]) {
                                state->stackbase = stack;
                                i = SRE_MATCH(state, pattern + 3);
                                state->stackbase = stackbase; /* rewind */
@@ -980,10 +989,12 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
         }
 
         if (flags & SRE_INFO_PREFIX) {
+            /* pattern starts with a known prefix */
             prefix_len = pattern[5];
             prefix = pattern + 6;
             overlap = prefix + prefix_len - 1;
         } else if (flags & SRE_INFO_CHARSET)
+            /* pattern starts with a character from a known set */
             charset = pattern + 5;
 
         pattern += 1 + pattern[1];
@@ -1042,7 +1053,6 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
                        if (status != 0)
                                break;
                }
-#if 0
     } else if (charset) {
                /* pattern starts with a character from a known set */
                for (;;) {
@@ -1057,7 +1067,6 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
                        if (status != 0)
                                break;
         }
-#endif
        } else
                /* general case */
                while (ptr <= end) {
@@ -1204,6 +1213,8 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* args)
        for (i = 0; i < SRE_MARK_SIZE; i++)
                state->mark[i] = NULL;
 
+    state->index = -1;
+
        state->stack = NULL;
        state->stackbase = 0;
        state->stacksize = 0;
@@ -1286,6 +1297,8 @@ pattern_new_match(PatternObject* pattern, SRE_STATE* state,
                        } else
                                match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
 
+        match->index = state->index;
+
                return (PyObject*) match;
 
        } else if (status < 0) {
@@ -1887,6 +1900,15 @@ match_getattr(MatchObject* self, char* name)
        if (!strcmp(name, "endpos"))
                return Py_BuildValue("i", 0); /* FIXME */
 
+       if (!strcmp(name, "index")) {
+        /* experimental */
+        if (self->index < 0) {
+            Py_INCREF(Py_None);
+            return Py_None;
+        } else
+            return Py_BuildValue("i", self->index);
+    }
+
        PyErr_SetString(PyExc_AttributeError, name);
        return NULL;
 }
index 274f085541e59041c8c025565bda19539ee8fd9a..7e7d8356e53dc65a5416232a37ce0cb559911a15 100644 (file)
@@ -33,6 +33,7 @@ typedef struct {
     PyObject_HEAD
     PyObject* string; /* link to the target string */
     PatternObject* pattern; /* link to the regex (pattern) object */
+    int index; /* last index marker seen by the engine (-1 if none) */
     int groups; /* number of groups (start/end marks) */
     int mark[2];
 } MatchObject;
@@ -57,6 +58,7 @@ typedef struct {
     /* character size */
     int charsize;
     /* registers */
+    int index;
     int lastmark;
     void* mark[SRE_MARK_SIZE];
     /* backtracking stack */
index da25ec4bb35fd0ef3ecc607be293e14c06d49dcf..bffcddebf2a6ae34e02977b75b1fe8b838fcf706 100644 (file)
 #define SRE_OP_CHARSET 9
 #define SRE_OP_GROUP 10
 #define SRE_OP_GROUP_IGNORE 11
-#define SRE_OP_IN 12
-#define SRE_OP_IN_IGNORE 13
-#define SRE_OP_INFO 14
-#define SRE_OP_JUMP 15
-#define SRE_OP_LITERAL 16
-#define SRE_OP_LITERAL_IGNORE 17
-#define SRE_OP_MARK 18
-#define SRE_OP_MAX_REPEAT 19
-#define SRE_OP_MAX_REPEAT_ONE 20
-#define SRE_OP_MIN_REPEAT 21
-#define SRE_OP_NOT_LITERAL 22
-#define SRE_OP_NOT_LITERAL_IGNORE 23
-#define SRE_OP_NEGATE 24
-#define SRE_OP_RANGE 25
-#define SRE_OP_REPEAT 26
+#define SRE_OP_INDEX 12
+#define SRE_OP_IN 13
+#define SRE_OP_IN_IGNORE 14
+#define SRE_OP_INFO 15
+#define SRE_OP_JUMP 16
+#define SRE_OP_LITERAL 17
+#define SRE_OP_LITERAL_IGNORE 18
+#define SRE_OP_MARK 19
+#define SRE_OP_MAX_REPEAT 20
+#define SRE_OP_MAX_REPEAT_ONE 21
+#define SRE_OP_MIN_REPEAT 22
+#define SRE_OP_NOT_LITERAL 23
+#define SRE_OP_NOT_LITERAL_IGNORE 24
+#define SRE_OP_NEGATE 25
+#define SRE_OP_RANGE 26
+#define SRE_OP_REPEAT 27
 #define SRE_AT_BEGINNING 0
 #define SRE_AT_BEGINNING_LINE 1
 #define SRE_AT_BOUNDARY 2