]> git.ipfire.org Git - thirdparty/Python/cpython.git/commitdiff
[3.10] gh-116773: Fix overlapped memory corruption crash (GH-116774) (GH-117079)
authorjkriegshauser <jkriegshauser@gmail.com>
Wed, 27 Mar 2024 15:24:34 +0000 (08:24 -0700)
committerGitHub <noreply@github.com>
Wed, 27 Mar 2024 15:24:34 +0000 (16:24 +0100)
Co-authored-by: Ɓukasz Langa <lukasz@langa.pl>
Lib/asyncio/windows_events.py
Lib/test/test_asyncio/test_windows_events.py
Misc/NEWS.d/next/Windows/2024-03-14-01-58-22.gh-issue-116773.H2UldY.rst [new file with mode: 0644]
Modules/overlapped.c

index 3204c7c9be96a4b32a67ba4ba1b752a65b9e6b09..5aed825588bc5baa5849d4f7cc8c8b5aa4c2ca8e 100644 (file)
@@ -323,13 +323,13 @@ class ProactorEventLoop(proactor_events.BaseProactorEventLoop):
             if self._self_reading_future is not None:
                 ov = self._self_reading_future._ov
                 self._self_reading_future.cancel()
-                # self_reading_future was just cancelled so if it hasn't been
-                # finished yet, it never will be (it's possible that it has
-                # already finished and its callback is waiting in the queue,
-                # where it could still happen if the event loop is restarted).
-                # Unregister it otherwise IocpProactor.close will wait for it
-                # forever
-                if ov is not None:
+                # self_reading_future always uses IOCP, so even though it's
+                # been cancelled, we need to make sure that the IOCP message
+                # is received so that the kernel is not holding on to the
+                # memory, possibly causing memory corruption later. Only
+                # unregister it if IO is complete in all respects. Otherwise
+                # we need another _poll() later to complete the IO.
+                if ov is not None and not ov.pending:
                     self._proactor._unregister(ov)
                 self._self_reading_future = None
 
index 46eb7ecf7c4a911d373b6c17ca4f4d92012b1ea2..daf0f1ec1b232708331f9ef0f241251a994907ba 100644 (file)
@@ -36,7 +36,23 @@ class UpperProto(asyncio.Protocol):
             self.trans.close()
 
 
-class ProactorLoopCtrlC(test_utils.TestCase):
+class WindowsEventsTestCase(test_utils.TestCase):
+    def _unraisablehook(self, unraisable):
+        # Storing unraisable.object can resurrect an object which is being
+        # finalized. Storing unraisable.exc_value creates a reference cycle.
+        self._unraisable = unraisable
+        print(unraisable)
+
+    def setUp(self):
+        self._prev_unraisablehook = sys.unraisablehook
+        self._unraisable = None
+        sys.unraisablehook = self._unraisablehook
+
+    def tearDown(self):
+        sys.unraisablehook = self._prev_unraisablehook
+        self.assertIsNone(self._unraisable)
+
+class ProactorLoopCtrlC(WindowsEventsTestCase):
 
     def test_ctrl_c(self):
 
@@ -58,7 +74,7 @@ class ProactorLoopCtrlC(test_utils.TestCase):
         thread.join()
 
 
-class ProactorMultithreading(test_utils.TestCase):
+class ProactorMultithreading(WindowsEventsTestCase):
     def test_run_from_nonmain_thread(self):
         finished = False
 
@@ -79,7 +95,7 @@ class ProactorMultithreading(test_utils.TestCase):
         self.assertTrue(finished)
 
 
-class ProactorTests(test_utils.TestCase):
+class ProactorTests(WindowsEventsTestCase):
 
     def setUp(self):
         super().setUp()
@@ -290,8 +306,32 @@ class ProactorTests(test_utils.TestCase):
 
         return "done"
 
-
-class WinPolicyTests(test_utils.TestCase):
+    def test_loop_restart(self):
+        # We're fishing for the "RuntimeError: <_overlapped.Overlapped object at XXX>
+        # still has pending operation at deallocation, the process may crash" error
+        stop = threading.Event()
+        def threadMain():
+            while not stop.is_set():
+                self.loop.call_soon_threadsafe(lambda: None)
+                time.sleep(0.01)
+        thr = threading.Thread(target=threadMain)
+
+        # In 10 60-second runs of this test prior to the fix:
+        # time in seconds until failure: (none), 15.0, 6.4, (none), 7.6, 8.3, 1.7, 22.2, 23.5, 8.3
+        # 10 seconds had a 50% failure rate but longer would be more costly
+        end_time = time.time() + 10 # Run for 10 seconds
+        self.loop.call_soon(thr.start)
+        while not self._unraisable: # Stop if we got an unraisable exc
+            self.loop.stop()
+            self.loop.run_forever()
+            if time.time() >= end_time:
+                break
+
+        stop.set()
+        thr.join()
+
+
+class WinPolicyTests(WindowsEventsTestCase):
 
     def test_selector_win_policy(self):
         async def main():
diff --git a/Misc/NEWS.d/next/Windows/2024-03-14-01-58-22.gh-issue-116773.H2UldY.rst b/Misc/NEWS.d/next/Windows/2024-03-14-01-58-22.gh-issue-116773.H2UldY.rst
new file mode 100644 (file)
index 0000000..8fc3fe8
--- /dev/null
@@ -0,0 +1 @@
+Fix instances of ``<_overlapped.Overlapped object at 0xXXX> still has pending operation at deallocation, the process may crash``.
index b9ca86cbd1f67e01dd3951083e7ae247a0facd26..ad8f5430eaf80e5d4b6795f6302c3c14d8c38891 100644 (file)
@@ -692,6 +692,24 @@ Overlapped_dealloc(OverlappedObject *self)
     if (!HasOverlappedIoCompleted(&self->overlapped) &&
         self->type != TYPE_NOT_STARTED)
     {
+        // NOTE: We should not get here, if we do then something is wrong in
+        // the IocpProactor or ProactorEventLoop. Since everything uses IOCP if
+        // the overlapped IO hasn't completed yet then we should not be
+        // deallocating!
+        //
+        // The problem is likely that this OverlappedObject was removed from
+        // the IocpProactor._cache before it was complete. The _cache holds a
+        // reference while IO is pending so that it does not get deallocated
+        // while the kernel has retained the OVERLAPPED structure.
+        //
+        // CancelIoEx (likely called from self.cancel()) may have successfully
+        // completed, but the OVERLAPPED is still in use until either
+        // HasOverlappedIoCompleted() is true or GetQueuedCompletionStatus has
+        // returned this OVERLAPPED object.
+        //
+        // NOTE: Waiting when IOCP is in use can hang indefinitely, but this
+        // CancelIoEx is superfluous in that self.cancel() was already called,
+        // so I've only ever seen this return FALSE with GLE=ERROR_NOT_FOUND
         if (Py_CancelIoEx && Py_CancelIoEx(self->handle, &self->overlapped))
             wait = TRUE;