]> git.ipfire.org Git - thirdparty/dnspython.git/commitdiff
Fix a race condition in trio quic shutdown.
authorBob Halley <halley@dnspython.org>
Fri, 27 Oct 2023 15:55:10 +0000 (08:55 -0700)
committerBob Halley <halley@dnspython.org>
Fri, 27 Oct 2023 15:55:10 +0000 (08:55 -0700)
It was possible to have a "lost wakeup" situation where we had stuff to
send but the trio worker was blocked indefinitely in the receive.

There is no test for this as the race is very race-y and I can't reproduce it
reliably in the test suite, though I was able to do reliable replication a different
way when debugging.

I also reordered event processing to happen after timer handling but before sending
in the trio and sync quic code.  The async code already worked this way due to its
different struture and needed no changes.

dns/quic/_sync.py
dns/quic/_trio.py

index 25dd6d6ed8d2cbff06359039dc06949f6b7e35fd..a71ac67c4b4e7121d1fd3b0a66b9b844f83597aa 100644 (file)
@@ -128,6 +128,8 @@ class SyncQuicConnection(BaseQuicConnection):
                     key.data()
                 with self._lock:
                     self._handle_timer(expiration)
+                self._handle_events()
+                with self._lock:
                     datagrams = self._connection.datagrams_to_send(time.time())
                 for datagram, _ in datagrams:
                     try:
@@ -135,7 +137,6 @@ class SyncQuicConnection(BaseQuicConnection):
                     except BlockingIOError:
                         # we let QUIC handle any lossage
                         pass
-                self._handle_events()
         finally:
             with self._lock:
                 self._done = True
index 1ee702b52bc20cd180f697b88457817b400170ea..0ff0497ec01b44dba3a2bf5e518a088a39847649 100644 (file)
@@ -81,12 +81,21 @@ class TrioQuicConnection(AsyncQuicConnection):
         self._handshake_complete = trio.Event()
         self._run_done = trio.Event()
         self._worker_scope = None
+        self._send_pending = False
 
     async def _worker(self):
         try:
             await self._socket.connect(self._peer)
             while not self._done:
                 (expiration, interval) = self._get_timer_values(False)
+                if self._send_pending:
+                    # Do not block forever if sends are pending.  Even though we
+                    # have a wake-up mechanism if we've already started the blocking
+                    # read, the possibility of context switching in send means that
+                    # more writes can happen while we have no wake up context, so
+                    # we need self._send_pending to avoid (effectively) a "lost wakeup"
+                    # race.
+                    interval = 0.0
                 with trio.CancelScope(
                     deadline=trio.current_time() + interval
                 ) as self._worker_scope:
@@ -94,10 +103,14 @@ class TrioQuicConnection(AsyncQuicConnection):
                     self._connection.receive_datagram(datagram, self._peer, time.time())
                 self._worker_scope = None
                 self._handle_timer(expiration)
+                await self._handle_events()
+                # We clear this now, before sending anything, as sending can cause
+                # context switches that do more sends.  We want to know if that
+                # happens so we don't block a long time on the recv() above.
+                self._send_pending = False
                 datagrams = self._connection.datagrams_to_send(time.time())
                 for datagram, _ in datagrams:
                     await self._socket.send(datagram)
-                await self._handle_events()
         finally:
             self._done = True
             self._handshake_complete.set()
@@ -129,6 +142,7 @@ class TrioQuicConnection(AsyncQuicConnection):
 
     async def write(self, stream, data, is_end=False):
         self._connection.send_stream_data(stream, data, is_end)
+        self._send_pending = True
         if self._worker_scope is not None:
             self._worker_scope.cancel()
 
@@ -159,6 +173,7 @@ class TrioQuicConnection(AsyncQuicConnection):
             self._manager.closed(self._peer[0], self._peer[1])
             self._closed = True
             self._connection.close()
+            self._send_pending = True
             if self._worker_scope is not None:
                 self._worker_scope.cancel()
             await self._run_done.wait()