]> git.ipfire.org Git - thirdparty/Python/cpython.git/commitdiff
gh-67877: Fix memory leaks in terminated RE matching (GH-126840)
authorSerhiy Storchaka <storchaka@gmail.com>
Mon, 18 Nov 2024 09:53:45 +0000 (11:53 +0200)
committerGitHub <noreply@github.com>
Mon, 18 Nov 2024 09:53:45 +0000 (11:53 +0200)
If SRE(match) function terminates abruptly, either because of a signal
or because memory allocation fails, allocated SRE_REPEAT blocks might
be never released.

Co-authored-by: <wjssz@users.noreply.github.com>
Lib/test/test_re.py
Misc/NEWS.d/next/Library/2024-11-14-22-25-49.gh-issue-67877.G9hw0w.rst [new file with mode: 0644]
Modules/_sre/clinic/sre.c.h
Modules/_sre/sre.c
Modules/_sre/sre.h
Modules/_sre/sre_lib.h

index 7bc702ec89a4a7b1dc40c5a165dd88631e331164..1612fc7663e87e4d7366e6abc5828a6b224886f0 100644 (file)
@@ -2681,6 +2681,29 @@ class ReTests(unittest.TestCase):
                 self.assertIsNone(re.search(p, s))
                 self.assertIsNone(re.search('(?s:.)' + p, s))
 
+    def check_interrupt(self, pattern, string, maxcount):
+        class Interrupt(Exception):
+            pass
+        p = re.compile(pattern)
+        for n in range(maxcount):
+            try:
+                p._fail_after(n, Interrupt)
+                p.match(string)
+                return n
+            except Interrupt:
+                pass
+            finally:
+                p._fail_after(-1, None)
+
+    @unittest.skipUnless(hasattr(re.Pattern, '_fail_after'), 'requires debug build')
+    def test_memory_leaks(self):
+        self.check_interrupt(r'(.)*:', 'abc:', 100)
+        self.check_interrupt(r'([^:])*?:', 'abc:', 100)
+        self.check_interrupt(r'([^:])*+:', 'abc:', 100)
+        self.check_interrupt(r'(.){2,4}:', 'abc:', 100)
+        self.check_interrupt(r'([^:]){2,4}?:', 'abc:', 100)
+        self.check_interrupt(r'([^:]){2,4}+:', 'abc:', 100)
+
 
 def get_debug_out(pat):
     with captured_stdout() as out:
diff --git a/Misc/NEWS.d/next/Library/2024-11-14-22-25-49.gh-issue-67877.G9hw0w.rst b/Misc/NEWS.d/next/Library/2024-11-14-22-25-49.gh-issue-67877.G9hw0w.rst
new file mode 100644 (file)
index 0000000..021b4ae
--- /dev/null
@@ -0,0 +1,2 @@
+Fix memory leaks when :mod:`regular expression <re>` matching terminates
+abruptly, either because of a signal or because memory allocation fails.
index e287f3d5ad399156b185ac927851460a9459b678..87e4785a42846860e68ab2064a040d6f6cf4b16c 100644 (file)
@@ -985,6 +985,44 @@ PyDoc_STRVAR(_sre_SRE_Pattern___deepcopy____doc__,
 #define _SRE_SRE_PATTERN___DEEPCOPY___METHODDEF    \
     {"__deepcopy__", (PyCFunction)_sre_SRE_Pattern___deepcopy__, METH_O, _sre_SRE_Pattern___deepcopy____doc__},
 
+#if defined(Py_DEBUG)
+
+PyDoc_STRVAR(_sre_SRE_Pattern__fail_after__doc__,
+"_fail_after($self, count, exception, /)\n"
+"--\n"
+"\n"
+"For debugging.");
+
+#define _SRE_SRE_PATTERN__FAIL_AFTER_METHODDEF    \
+    {"_fail_after", _PyCFunction_CAST(_sre_SRE_Pattern__fail_after), METH_FASTCALL, _sre_SRE_Pattern__fail_after__doc__},
+
+static PyObject *
+_sre_SRE_Pattern__fail_after_impl(PatternObject *self, int count,
+                                  PyObject *exception);
+
+static PyObject *
+_sre_SRE_Pattern__fail_after(PatternObject *self, PyObject *const *args, Py_ssize_t nargs)
+{
+    PyObject *return_value = NULL;
+    int count;
+    PyObject *exception;
+
+    if (!_PyArg_CheckPositional("_fail_after", nargs, 2, 2)) {
+        goto exit;
+    }
+    count = PyLong_AsInt(args[0]);
+    if (count == -1 && PyErr_Occurred()) {
+        goto exit;
+    }
+    exception = args[1];
+    return_value = _sre_SRE_Pattern__fail_after_impl(self, count, exception);
+
+exit:
+    return return_value;
+}
+
+#endif /* defined(Py_DEBUG) */
+
 PyDoc_STRVAR(_sre_compile__doc__,
 "compile($module, /, pattern, flags, code, groups, groupindex,\n"
 "        indexgroup)\n"
@@ -1474,4 +1512,8 @@ _sre_SRE_Scanner_search(ScannerObject *self, PyTypeObject *cls, PyObject *const
     }
     return _sre_SRE_Scanner_search_impl(self, cls);
 }
-/*[clinic end generated code: output=afaa301d55957cb0 input=a9049054013a1b77]*/
+
+#ifndef _SRE_SRE_PATTERN__FAIL_AFTER_METHODDEF
+    #define _SRE_SRE_PATTERN__FAIL_AFTER_METHODDEF
+#endif /* !defined(_SRE_SRE_PATTERN__FAIL_AFTER_METHODDEF) */
+/*[clinic end generated code: output=f8cb77f2261f0b2e input=a9049054013a1b77]*/
index 2c86f8869d8e588d35fc672abe1055ad7cdc9456..36f542ddb4df2bd9cb51763cb5041d842e7352f1 100644 (file)
@@ -267,6 +267,85 @@ data_stack_grow(SRE_STATE* state, Py_ssize_t size)
     return 0;
 }
 
+/* memory pool functions for SRE_REPEAT, this can avoid memory
+   leak when SRE(match) function terminates abruptly.
+   state->repeat_pool_used is a doubly-linked list, so that we
+   can remove a SRE_REPEAT node from it.
+   state->repeat_pool_unused is a singly-linked list, we put/get
+   node at the head. */
+static SRE_REPEAT *
+repeat_pool_malloc(SRE_STATE *state)
+{
+    SRE_REPEAT *repeat;
+
+    if (state->repeat_pool_unused) {
+        /* remove from unused pool (singly-linked list) */
+        repeat = state->repeat_pool_unused;
+        state->repeat_pool_unused = repeat->pool_next;
+    }
+    else {
+        repeat = PyMem_Malloc(sizeof(SRE_REPEAT));
+        if (!repeat) {
+            return NULL;
+        }
+    }
+
+    /* add to used pool (doubly-linked list) */
+    SRE_REPEAT *temp = state->repeat_pool_used;
+    if (temp) {
+        temp->pool_prev = repeat;
+    }
+    repeat->pool_prev = NULL;
+    repeat->pool_next = temp;
+    state->repeat_pool_used = repeat;
+
+    return repeat;
+}
+
+static void
+repeat_pool_free(SRE_STATE *state, SRE_REPEAT *repeat)
+{
+    SRE_REPEAT *prev = repeat->pool_prev;
+    SRE_REPEAT *next = repeat->pool_next;
+
+    /* remove from used pool (doubly-linked list) */
+    if (prev) {
+        prev->pool_next = next;
+    }
+    else {
+        state->repeat_pool_used = next;
+    }
+    if (next) {
+        next->pool_prev = prev;
+    }
+
+    /* add to unused pool (singly-linked list) */
+    repeat->pool_next = state->repeat_pool_unused;
+    state->repeat_pool_unused = repeat;
+}
+
+static void
+repeat_pool_clear(SRE_STATE *state)
+{
+    /* clear used pool */
+    SRE_REPEAT *next = state->repeat_pool_used;
+    state->repeat_pool_used = NULL;
+    while (next) {
+        SRE_REPEAT *temp = next;
+        next = temp->pool_next;
+        PyMem_Free(temp);
+    }
+
+    /* clear unused pool */
+    next = state->repeat_pool_unused;
+    state->repeat_pool_unused = NULL;
+    while (next) {
+        SRE_REPEAT *temp = next;
+        next = temp->pool_next;
+        PyMem_Free(temp);
+    }
+}
+
 /* generate 8-bit version */
 
 #define SRE_CHAR Py_UCS1
@@ -511,6 +590,11 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
     state->pos = start;
     state->endpos = end;
 
+#ifdef Py_DEBUG
+    state->fail_after_count = pattern->fail_after_count;
+    state->fail_after_exc = pattern->fail_after_exc; // borrowed ref
+#endif
+
     return string;
   err:
     /* We add an explicit cast here because MSVC has a bug when
@@ -533,6 +617,8 @@ state_fini(SRE_STATE* state)
     /* See above PyMem_Free() for why we explicitly cast here. */
     PyMem_Free((void*) state->mark);
     state->mark = NULL;
+    /* SRE_REPEAT pool */
+    repeat_pool_clear(state);
 }
 
 /* calculate offset from start of string */
@@ -619,6 +705,9 @@ pattern_traverse(PatternObject *self, visitproc visit, void *arg)
     Py_VISIT(self->groupindex);
     Py_VISIT(self->indexgroup);
     Py_VISIT(self->pattern);
+#ifdef Py_DEBUG
+    Py_VISIT(self->fail_after_exc);
+#endif
     return 0;
 }
 
@@ -628,6 +717,9 @@ pattern_clear(PatternObject *self)
     Py_CLEAR(self->groupindex);
     Py_CLEAR(self->indexgroup);
     Py_CLEAR(self->pattern);
+#ifdef Py_DEBUG
+    Py_CLEAR(self->fail_after_exc);
+#endif
     return 0;
 }
 
@@ -690,7 +782,7 @@ _sre_SRE_Pattern_match_impl(PatternObject *self, PyTypeObject *cls,
     Py_ssize_t status;
     PyObject *match;
 
-    if (!state_init(&state, (PatternObject *)self, string, pos, endpos))
+    if (!state_init(&state, self, string, pos, endpos))
         return NULL;
 
     INIT_TRACE(&state);
@@ -1381,6 +1473,29 @@ _sre_SRE_Pattern___deepcopy__(PatternObject *self, PyObject *memo)
     return Py_NewRef(self);
 }
 
+#ifdef Py_DEBUG
+/*[clinic input]
+_sre.SRE_Pattern._fail_after
+
+    count: int
+    exception: object
+    /
+
+For debugging.
+[clinic start generated code]*/
+
+static PyObject *
+_sre_SRE_Pattern__fail_after_impl(PatternObject *self, int count,
+                                  PyObject *exception)
+/*[clinic end generated code: output=9a6bf12135ac50c2 input=ef80a45c66c5499d]*/
+{
+    self->fail_after_count = count;
+    Py_INCREF(exception);
+    Py_XSETREF(self->fail_after_exc, exception);
+    Py_RETURN_NONE;
+}
+#endif /* Py_DEBUG */
+
 static PyObject *
 pattern_repr(PatternObject *obj)
 {
@@ -1506,6 +1621,10 @@ _sre_compile_impl(PyObject *module, PyObject *pattern, int flags,
     self->pattern = NULL;
     self->groupindex = NULL;
     self->indexgroup = NULL;
+#ifdef Py_DEBUG
+    self->fail_after_count = -1;
+    self->fail_after_exc = NULL;
+#endif
 
     self->codesize = n;
 
@@ -2604,7 +2723,8 @@ pattern_new_match(_sremodulestate* module_state,
         if (!match)
             return NULL;
 
-        match->pattern = (PatternObject*)Py_NewRef(pattern);
+        Py_INCREF(pattern);
+        match->pattern = pattern;
 
         match->string = Py_NewRef(state->string);
 
@@ -2740,7 +2860,7 @@ _sre_SRE_Scanner_match_impl(ScannerObject *self, PyTypeObject *cls)
         return NULL;
     }
 
-    match = pattern_new_match(module_state, (PatternObject*) self->pattern,
+    match = pattern_new_match(module_state, self->pattern,
                               state, status);
 
     if (status == 0)
@@ -2790,7 +2910,7 @@ _sre_SRE_Scanner_search_impl(ScannerObject *self, PyTypeObject *cls)
         return NULL;
     }
 
-    match = pattern_new_match(module_state, (PatternObject*) self->pattern,
+    match = pattern_new_match(module_state, self->pattern,
                               state, status);
 
     if (status == 0)
@@ -2826,7 +2946,8 @@ pattern_scanner(_sremodulestate *module_state,
         return NULL;
     }
 
-    scanner->pattern = Py_NewRef(self);
+    Py_INCREF(self);
+    scanner->pattern = self;
 
     PyObject_GC_Track(scanner);
     return (PyObject*) scanner;
@@ -3020,6 +3141,7 @@ static PyMethodDef pattern_methods[] = {
     _SRE_SRE_PATTERN_SCANNER_METHODDEF
     _SRE_SRE_PATTERN___COPY___METHODDEF
     _SRE_SRE_PATTERN___DEEPCOPY___METHODDEF
+    _SRE_SRE_PATTERN__FAIL_AFTER_METHODDEF
     {"__class_getitem__", Py_GenericAlias, METH_O|METH_CLASS,
      PyDoc_STR("See PEP 585")},
     {NULL, NULL}
index 83d89d57b111997ee6163d0201084812c7638cf2..42681c2addf3c2ccb3d4e290c7573acf8e9d1c19 100644 (file)
@@ -34,6 +34,11 @@ typedef struct {
     int flags; /* flags used when compiling pattern source */
     PyObject *weakreflist; /* List of weak references */
     int isbytes; /* pattern type (1 - bytes, 0 - string, -1 - None) */
+#ifdef Py_DEBUG
+    /* for simulation of user interruption */
+    int fail_after_count;
+    PyObject *fail_after_exc;
+#endif
     /* pattern code */
     Py_ssize_t codesize;
     SRE_CODE code[1];
@@ -68,6 +73,9 @@ typedef struct SRE_REPEAT_T {
     const SRE_CODE* pattern; /* points to REPEAT operator arguments */
     const void* last_ptr; /* helper to check for infinite loops */
     struct SRE_REPEAT_T *prev; /* points to previous repeat context */
+    /* for SRE_REPEAT pool */
+    struct SRE_REPEAT_T *pool_prev;
+    struct SRE_REPEAT_T *pool_next;
 } SRE_REPEAT;
 
 typedef struct {
@@ -95,12 +103,19 @@ typedef struct {
     size_t data_stack_base;
     /* current repeat context */
     SRE_REPEAT *repeat;
+    /* SRE_REPEAT pool */
+    SRE_REPEAT *repeat_pool_used;
+    SRE_REPEAT *repeat_pool_unused;
     unsigned int sigcount;
+#ifdef Py_DEBUG
+    int fail_after_count;
+    PyObject *fail_after_exc;
+#endif
 } SRE_STATE;
 
 typedef struct {
     PyObject_HEAD
-    PyObject* pattern;
+    PatternObject* pattern;
     SRE_STATE state;
     int executing;
 } ScannerObject;
index 97fbb0a75e54b65fde36ccf226a45f1b7d562de8..0c93f5121103e8175c4256c237cebea544ea5030 100644 (file)
@@ -560,13 +560,28 @@ typedef struct {
     Py_ssize_t last_ctx_pos;
 } SRE(match_context);
 
-#define MAYBE_CHECK_SIGNALS                                        \
+#define _MAYBE_CHECK_SIGNALS                                       \
     do {                                                           \
         if ((0 == (++sigcount & 0xfff)) && PyErr_CheckSignals()) { \
             RETURN_ERROR(SRE_ERROR_INTERRUPTED);                   \
         }                                                          \
     } while (0)
 
+#ifdef Py_DEBUG
+# define MAYBE_CHECK_SIGNALS                                       \
+    do {                                                           \
+        _MAYBE_CHECK_SIGNALS;                                      \
+        if (state->fail_after_count >= 0) {                        \
+            if (state->fail_after_count-- == 0) {                  \
+                PyErr_SetNone(state->fail_after_exc);              \
+                RETURN_ERROR(SRE_ERROR_INTERRUPTED);               \
+            }                                                      \
+        }                                                          \
+    } while (0)
+#else
+# define MAYBE_CHECK_SIGNALS _MAYBE_CHECK_SIGNALS
+#endif /* Py_DEBUG */
+
 #ifdef HAVE_COMPUTED_GOTOS
     #ifndef USE_COMPUTED_GOTOS
     #define USE_COMPUTED_GOTOS 1
@@ -1120,12 +1135,9 @@ dispatch:
                    pattern[1], pattern[2]));
 
             /* install new repeat context */
-            /* TODO(https://github.com/python/cpython/issues/67877): Fix this
-             * potential memory leak. */
-            ctx->u.rep = (SRE_REPEAT*) PyMem_Malloc(sizeof(*ctx->u.rep));
+            ctx->u.rep = repeat_pool_malloc(state);
             if (!ctx->u.rep) {
-                PyErr_NoMemory();
-                RETURN_FAILURE;
+                RETURN_ERROR(SRE_ERROR_MEMORY);
             }
             ctx->u.rep->count = -1;
             ctx->u.rep->pattern = pattern;
@@ -1136,7 +1148,7 @@ dispatch:
             state->ptr = ptr;
             DO_JUMP(JUMP_REPEAT, jump_repeat, pattern+pattern[0]);
             state->repeat = ctx->u.rep->prev;
-            PyMem_Free(ctx->u.rep);
+            repeat_pool_free(state, ctx->u.rep);
 
             if (ret) {
                 RETURN_ON_ERROR(ret);