git.ipfire.org Git - thirdparty/Python/cpython.git/commitdiff
[3.12] gh-67877: Fix memory leaks in terminated RE matching (GH-126840) (GH-126961)
author Serhiy Storchaka <storchaka@gmail.com>
Mon, 18 Nov 2024 11:24:13 +0000 (13:24 +0200)
committer GitHub <noreply@github.com>
Mon, 18 Nov 2024 11:24:13 +0000 (11:24 +0000)
If the SRE(match) function terminates abruptly, either because of a signal
or because memory allocation fails, the allocated SRE_REPEAT blocks might
never be released.

(cherry picked from commit 7538e7f5696408fa0aa02fce8a413a7dfac76a04)

Lib/test/test_re.py
Misc/NEWS.d/next/Library/2024-11-14-22-25-49.gh-issue-67877.G9hw0w.rst [new file with mode: 0644]
Modules/_sre/clinic/sre.c.h
Modules/_sre/sre.c
Modules/_sre/sre.h
Modules/_sre/sre_lib.h

index 756a7ccd506be43764d45497691f2500670f2282..130e1e1d01dec5401f8c53938d7126107aac1234 100644 (file)
@@ -2621,6 +2621,50 @@ class ReTests(unittest.TestCase):
                 p.terminate()
                 p.join()
 
+    def test_fail(self):
+        self.assertEqual(re.search(r'12(?!)|3', '123')[0], '3')
+
+    def test_character_set_any(self):
+        # The union of complementary character sets matches any character
+        # and is equivalent to "(?s:.)".
+        s = '1x\n'
+        for p in r'[\s\S]', r'[\d\D]', r'[\w\W]', r'[\S\s]', r'\s|\S':
+            with self.subTest(pattern=p):
+                self.assertEqual(re.findall(p, s), list(s))
+                self.assertEqual(re.fullmatch('(?:' + p + ')+', s).group(), s)
+
+    def test_character_set_none(self):
+        # Negation of the union of complementary character sets does not match
+        # any character.
+        s = '1x\n'
+        for p in r'[^\s\S]', r'[^\d\D]', r'[^\w\W]', r'[^\S\s]':
+            with self.subTest(pattern=p):
+                self.assertIsNone(re.search(p, s))
+                self.assertIsNone(re.search('(?s:.)' + p, s))
+
+    def check_interrupt(self, pattern, string, maxcount):
+        class Interrupt(Exception):
+            pass
+        p = re.compile(pattern)
+        for n in range(maxcount):
+            try:
+                p._fail_after(n, Interrupt)
+                p.match(string)
+                return n
+            except Interrupt:
+                pass
+            finally:
+                p._fail_after(-1, None)
+
+    @unittest.skipUnless(hasattr(re.Pattern, '_fail_after'), 'requires debug build')
+    def test_memory_leaks(self):
+        self.check_interrupt(r'(.)*:', 'abc:', 100)
+        self.check_interrupt(r'([^:])*?:', 'abc:', 100)
+        self.check_interrupt(r'([^:])*+:', 'abc:', 100)
+        self.check_interrupt(r'(.){2,4}:', 'abc:', 100)
+        self.check_interrupt(r'([^:]){2,4}?:', 'abc:', 100)
+        self.check_interrupt(r'([^:]){2,4}+:', 'abc:', 100)
+
 
 def get_debug_out(pat):
     with captured_stdout() as out:
diff --git a/Misc/NEWS.d/next/Library/2024-11-14-22-25-49.gh-issue-67877.G9hw0w.rst b/Misc/NEWS.d/next/Library/2024-11-14-22-25-49.gh-issue-67877.G9hw0w.rst
new file mode 100644 (file)
index 0000000..021b4ae
--- /dev/null
@@ -0,0 +1,2 @@
+Fix memory leaks when :mod:`regular expression <re>` matching terminates
+abruptly, either because of a signal or because memory allocation fails.
index 529c634e76d63c92240e5738d1ef48c3f4aaec6a..56a4e6048fa8ef4eb0127e1762740825d6a75b34 100644 (file)
@@ -975,6 +975,44 @@ PyDoc_STRVAR(_sre_SRE_Pattern___deepcopy____doc__,
 #define _SRE_SRE_PATTERN___DEEPCOPY___METHODDEF    \
     {"__deepcopy__", (PyCFunction)_sre_SRE_Pattern___deepcopy__, METH_O, _sre_SRE_Pattern___deepcopy____doc__},
 
+#if defined(Py_DEBUG)
+
+PyDoc_STRVAR(_sre_SRE_Pattern__fail_after__doc__,
+"_fail_after($self, count, exception, /)\n"
+"--\n"
+"\n"
+"For debugging.");
+
+#define _SRE_SRE_PATTERN__FAIL_AFTER_METHODDEF    \
+    {"_fail_after", _PyCFunction_CAST(_sre_SRE_Pattern__fail_after), METH_FASTCALL, _sre_SRE_Pattern__fail_after__doc__},
+
+static PyObject *
+_sre_SRE_Pattern__fail_after_impl(PatternObject *self, int count,
+                                  PyObject *exception);
+
+static PyObject *
+_sre_SRE_Pattern__fail_after(PatternObject *self, PyObject *const *args, Py_ssize_t nargs)
+{
+    PyObject *return_value = NULL;
+    int count;
+    PyObject *exception;
+
+    if (!_PyArg_CheckPositional("_fail_after", nargs, 2, 2)) {
+        goto exit;
+    }
+    count = _PyLong_AsInt(args[0]);
+    if (count == -1 && PyErr_Occurred()) {
+        goto exit;
+    }
+    exception = args[1];
+    return_value = _sre_SRE_Pattern__fail_after_impl(self, count, exception);
+
+exit:
+    return return_value;
+}
+
+#endif /* defined(Py_DEBUG) */
+
 PyDoc_STRVAR(_sre_compile__doc__,
 "compile($module, /, pattern, flags, code, groups, groupindex,\n"
 "        indexgroup)\n"
@@ -1460,4 +1498,8 @@ _sre_SRE_Scanner_search(ScannerObject *self, PyTypeObject *cls, PyObject *const
     }
     return _sre_SRE_Scanner_search_impl(self, cls);
 }
-/*[clinic end generated code: output=045de53cfe02dee0 input=a9049054013a1b77]*/
+
+#ifndef _SRE_SRE_PATTERN__FAIL_AFTER_METHODDEF
+    #define _SRE_SRE_PATTERN__FAIL_AFTER_METHODDEF
+#endif /* !defined(_SRE_SRE_PATTERN__FAIL_AFTER_METHODDEF) */
+/*[clinic end generated code: output=2165ecf43a7c20e8 input=a9049054013a1b77]*/
index 6d9843bb76d7918f6e23d4e521530affc4ea3d61..35c6058dac4eae02c27fcdf71ff0651ec170fdd5 100644 (file)
@@ -218,6 +218,85 @@ data_stack_grow(SRE_STATE* state, Py_ssize_t size)
     return 0;
 }
 
+/* Memory pool for SRE_REPEAT nodes; it prevents a memory leak when the
+   SRE(match) function terminates abruptly.
+   state->repeat_pool_used is a doubly-linked list, so that any node can
+   be unlinked from it.  state->repeat_pool_unused is a singly-linked
+   list; nodes are put and taken at its head.
+   repeat_pool_malloc() returns NULL if the allocation fails. */
+static SRE_REPEAT *
+repeat_pool_malloc(SRE_STATE *state)
+{
+    SRE_REPEAT *repeat;
+
+    if (state->repeat_pool_unused) {
+        /* reuse a node: pop the head of the unused pool (singly-linked) */
+        repeat = state->repeat_pool_unused;
+        state->repeat_pool_unused = repeat->pool_next;
+    }
+    else {
+        repeat = PyObject_Malloc(sizeof(SRE_REPEAT));
+        if (!repeat) {
+            return NULL;
+        }
+    }
+
+    /* push onto the head of the used pool (doubly-linked list) */
+    SRE_REPEAT *temp = state->repeat_pool_used;
+    if (temp) {
+        temp->pool_prev = repeat;
+    }
+    repeat->pool_prev = NULL;
+    repeat->pool_next = temp;
+    state->repeat_pool_used = repeat;
+
+    return repeat;
+}
+
+static void
+repeat_pool_free(SRE_STATE *state, SRE_REPEAT *repeat)
+{
+    SRE_REPEAT *prev = repeat->pool_prev;
+    SRE_REPEAT *next = repeat->pool_next;
+
+    /* unlink from used pool; no pool_prev means repeat is the list head */
+    if (prev) {
+        prev->pool_next = next;
+    }
+    else {
+        state->repeat_pool_used = next;
+    }
+    if (next) {
+        next->pool_prev = prev;
+    }
+
+    /* push onto the head of the unused pool for reuse (memory is kept) */
+    repeat->pool_next = state->repeat_pool_unused;
+    state->repeat_pool_unused = repeat;
+}
+
+static void
+repeat_pool_clear(SRE_STATE *state)
+{
+    /* free every node still in the used pool and reset its head to NULL */
+    SRE_REPEAT *next = state->repeat_pool_used;
+    state->repeat_pool_used = NULL;
+    while (next) {
+        SRE_REPEAT *temp = next;
+        next = temp->pool_next;
+        PyObject_Free(temp);
+    }
+
+    /* free every recycled node in the unused pool and reset its head */
+    next = state->repeat_pool_unused;
+    state->repeat_pool_unused = NULL;
+    while (next) {
+        SRE_REPEAT *temp = next;
+        next = temp->pool_next;
+        PyObject_Free(temp);
+    }
+}
+
 /* generate 8-bit version */
 
 #define SRE_CHAR Py_UCS1
@@ -463,6 +542,11 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
     state->pos = start;
     state->endpos = end;
 
+#ifdef Py_DEBUG
+    state->fail_after_count = pattern->fail_after_count;
+    state->fail_after_exc = pattern->fail_after_exc; // borrowed ref
+#endif
+
     return string;
   err:
     /* We add an explicit cast here because MSVC has a bug when
@@ -485,6 +569,8 @@ state_fini(SRE_STATE* state)
     /* See above PyMem_Del for why we explicitly cast here. */
     PyMem_Free((void*) state->mark);
     state->mark = NULL;
+    /* SRE_REPEAT pool */
+    repeat_pool_clear(state);
 }
 
 /* calculate offset from start of string */
@@ -571,6 +657,9 @@ pattern_traverse(PatternObject *self, visitproc visit, void *arg)
     Py_VISIT(self->groupindex);
     Py_VISIT(self->indexgroup);
     Py_VISIT(self->pattern);
+#ifdef Py_DEBUG
+    Py_VISIT(self->fail_after_exc);
+#endif
     return 0;
 }
 
@@ -580,6 +669,9 @@ pattern_clear(PatternObject *self)
     Py_CLEAR(self->groupindex);
     Py_CLEAR(self->indexgroup);
     Py_CLEAR(self->pattern);
+#ifdef Py_DEBUG
+    Py_CLEAR(self->fail_after_exc);
+#endif
     return 0;
 }
 
@@ -642,7 +734,7 @@ _sre_SRE_Pattern_match_impl(PatternObject *self, PyTypeObject *cls,
     Py_ssize_t status;
     PyObject *match;
 
-    if (!state_init(&state, (PatternObject *)self, string, pos, endpos))
+    if (!state_init(&state, self, string, pos, endpos))
         return NULL;
 
     state.ptr = state.start;
@@ -1330,6 +1422,29 @@ _sre_SRE_Pattern___deepcopy__(PatternObject *self, PyObject *memo)
     return Py_NewRef(self);
 }
 
+#ifdef Py_DEBUG
+/*[clinic input]
+_sre.SRE_Pattern._fail_after
+
+    count: int
+    exception: object
+    /
+
+For debugging.
+[clinic start generated code]*/
+
+static PyObject *
+_sre_SRE_Pattern__fail_after_impl(PatternObject *self, int count,
+                                  PyObject *exception)
+/*[clinic end generated code: output=9a6bf12135ac50c2 input=ef80a45c66c5499d]*/
+{
+    self->fail_after_count = count;
+    Py_INCREF(exception);
+    Py_XSETREF(self->fail_after_exc, exception);
+    Py_RETURN_NONE;
+}
+#endif /* Py_DEBUG */
+
 static PyObject *
 pattern_repr(PatternObject *obj)
 {
@@ -1456,6 +1571,10 @@ _sre_compile_impl(PyObject *module, PyObject *pattern, int flags,
     self->pattern = NULL;
     self->groupindex = NULL;
     self->indexgroup = NULL;
+#ifdef Py_DEBUG
+    self->fail_after_count = -1;
+    self->fail_after_exc = NULL;
+#endif
 
     self->codesize = n;
 
@@ -2552,7 +2671,8 @@ pattern_new_match(_sremodulestate* module_state,
         if (!match)
             return NULL;
 
-        match->pattern = (PatternObject*)Py_NewRef(pattern);
+        Py_INCREF(pattern);
+        match->pattern = pattern;
 
         match->string = Py_NewRef(state->string);
 
@@ -2688,7 +2808,7 @@ _sre_SRE_Scanner_match_impl(ScannerObject *self, PyTypeObject *cls)
         return NULL;
     }
 
-    match = pattern_new_match(module_state, (PatternObject*) self->pattern,
+    match = pattern_new_match(module_state, self->pattern,
                               state, status);
 
     if (status == 0)
@@ -2738,7 +2858,7 @@ _sre_SRE_Scanner_search_impl(ScannerObject *self, PyTypeObject *cls)
         return NULL;
     }
 
-    match = pattern_new_match(module_state, (PatternObject*) self->pattern,
+    match = pattern_new_match(module_state, self->pattern,
                               state, status);
 
     if (status == 0)
@@ -2774,7 +2894,8 @@ pattern_scanner(_sremodulestate *module_state,
         return NULL;
     }
 
-    scanner->pattern = Py_NewRef(self);
+    Py_INCREF(self);
+    scanner->pattern = self;
 
     PyObject_GC_Track(scanner);
     return (PyObject*) scanner;
@@ -2968,6 +3089,7 @@ static PyMethodDef pattern_methods[] = {
     _SRE_SRE_PATTERN_SCANNER_METHODDEF
     _SRE_SRE_PATTERN___COPY___METHODDEF
     _SRE_SRE_PATTERN___DEEPCOPY___METHODDEF
+    _SRE_SRE_PATTERN__FAIL_AFTER_METHODDEF
     {"__class_getitem__", Py_GenericAlias, METH_O|METH_CLASS,
      PyDoc_STR("See PEP 585")},
     {NULL, NULL}
index a0f235606e290e4fcf99c70c0f7aeb291f10602e..b8c6f8e3e660d1be5d967a9ce9f5c1941def81e4 100644 (file)
@@ -34,6 +34,11 @@ typedef struct {
     int flags; /* flags used when compiling pattern source */
     PyObject *weakreflist; /* List of weak references */
     int isbytes; /* pattern type (1 - bytes, 0 - string, -1 - None) */
+#ifdef Py_DEBUG
+    /* for simulation of user interruption */
+    int fail_after_count;
+    PyObject *fail_after_exc;
+#endif
     /* pattern code */
     Py_ssize_t codesize;
     SRE_CODE code[1];
@@ -68,6 +73,9 @@ typedef struct SRE_REPEAT_T {
     const SRE_CODE* pattern; /* points to REPEAT operator arguments */
     const void* last_ptr; /* helper to check for infinite loops */
     struct SRE_REPEAT_T *prev; /* points to previous repeat context */
+    /* for SRE_REPEAT pool */
+    struct SRE_REPEAT_T *pool_prev;
+    struct SRE_REPEAT_T *pool_next;
 } SRE_REPEAT;
 
 typedef struct {
@@ -94,12 +102,19 @@ typedef struct {
     size_t data_stack_base;
     /* current repeat context */
     SRE_REPEAT *repeat;
+    /* SRE_REPEAT pool */
+    SRE_REPEAT *repeat_pool_used;
+    SRE_REPEAT *repeat_pool_unused;
     unsigned int sigcount;
+#ifdef Py_DEBUG
+    int fail_after_count;
+    PyObject *fail_after_exc;
+#endif
 } SRE_STATE;
 
 typedef struct {
     PyObject_HEAD
-    PyObject* pattern;
+    PatternObject* pattern;
     SRE_STATE state;
     int executing;
 } ScannerObject;
index 95c1ada908d222e3e2dff07efe7744e4146ceadc..d82ba7aa3c8b83a533bbcbee37110c18f31b754a 100644 (file)
@@ -524,13 +524,28 @@ typedef struct {
     Py_ssize_t last_ctx_pos;
 } SRE(match_context);
 
-#define MAYBE_CHECK_SIGNALS                                        \
+#define _MAYBE_CHECK_SIGNALS                                       \
     do {                                                           \
         if ((0 == (++sigcount & 0xfff)) && PyErr_CheckSignals()) { \
             RETURN_ERROR(SRE_ERROR_INTERRUPTED);                   \
         }                                                          \
     } while (0)
 
+#ifdef Py_DEBUG
+# define MAYBE_CHECK_SIGNALS                                       \
+    do {                                                           \
+        _MAYBE_CHECK_SIGNALS;                                      \
+        if (state->fail_after_count >= 0) {                        \
+            if (state->fail_after_count-- == 0) {                  \
+                PyErr_SetNone(state->fail_after_exc);              \
+                RETURN_ERROR(SRE_ERROR_INTERRUPTED);               \
+            }                                                      \
+        }                                                          \
+    } while (0)
+#else
+# define MAYBE_CHECK_SIGNALS _MAYBE_CHECK_SIGNALS
+#endif /* Py_DEBUG */
+
 #ifdef HAVE_COMPUTED_GOTOS
     #ifndef USE_COMPUTED_GOTOS
     #define USE_COMPUTED_GOTOS 1
@@ -1083,12 +1098,9 @@ dispatch:
                    pattern[1], pattern[2]));
 
             /* install new repeat context */
-            /* TODO(https://github.com/python/cpython/issues/67877): Fix this
-             * potential memory leak. */
-            ctx->u.rep = (SRE_REPEAT*) PyObject_Malloc(sizeof(*ctx->u.rep));
+            ctx->u.rep = repeat_pool_malloc(state);
             if (!ctx->u.rep) {
-                PyErr_NoMemory();
-                RETURN_FAILURE;
+                RETURN_ERROR(SRE_ERROR_MEMORY);
             }
             ctx->u.rep->count = -1;
             ctx->u.rep->pattern = pattern;
@@ -1099,7 +1111,7 @@ dispatch:
             state->ptr = ptr;
             DO_JUMP(JUMP_REPEAT, jump_repeat, pattern+pattern[0]);
             state->repeat = ctx->u.rep->prev;
-            PyObject_Free(ctx->u.rep);
+            repeat_pool_free(state, ctx->u.rep);
 
             if (ret) {
                 RETURN_ON_ERROR(ret);