aio: io_uring: Trigger async processing for large IOs

author Andres Freund <andres@anarazel.de>

Sun, 5 Apr 2026 04:43:54 +0000 (00:43 -0400)

committer Andres Freund <andres@anarazel.de>

Sun, 5 Apr 2026 04:43:54 +0000 (00:43 -0400)
author Andres Freund <andres@anarazel.de>
Sun, 5 Apr 2026 04:43:54 +0000 (00:43 -0400)
committer Andres Freund <andres@anarazel.de>
Sun, 5 Apr 2026 04:43:54 +0000 (00:43 -0400)
diff --git a/src/backend/storage/aio/method_io_uring.c b/src/backend/storage/aio/method_io_uring.c

index 39984df31b458ccc6fa90c32f2297e9eb7663610..9f76d2683c0c97d8c347829a4abd03f1167b8ccb 100644 (file)
--- a/src/backend/storage/aio/method_io_uring.c
+++ b/src/backend/storage/aio/method_io_uring.c
@@ -409,7 +409,6 @@ static int
  pgaio_uring_submit(uint16 num_staged_ios, PgAioHandle **staged_ios)
  {
         struct io_uring *uring_instance = &pgaio_my_uring_context->io_uring_ring;
-       int                     in_flight_before = dclist_count(&pgaio_my_backend->in_flight_ios);
  
         Assert(num_staged_ios <= PGAIO_SUBMIT_BATCH_SIZE);
  
@@ -425,27 +424,6 @@ pgaio_uring_submit(uint16 num_staged_ios, PgAioHandle **staged_ios)
  
                 pgaio_io_prepare_submit(ioh);
                 pgaio_uring_sq_from_io(ioh, sqe);
-
-               /*
-                * io_uring executes IO in process context if possible. That's
-                * generally good, as it reduces context switching. When performing a
-                * lot of buffered IO that means that copying between page cache and
-                * userspace memory happens in the foreground, as it can't be
-                * offloaded to DMA hardware as is possible when using direct IO. When
-                * executing a lot of buffered IO this causes io_uring to be slower
-                * than worker mode, as worker mode parallelizes the copying. io_uring
-                * can be told to offload work to worker threads instead.
-                *
-                * If an IO is buffered IO and we already have IOs in flight or
-                * multiple IOs are being submitted, we thus tell io_uring to execute
-                * the IO in the background. We don't do so for the first few IOs
-                * being submitted as executing in this process' context has lower
-                * latency.
-                */
-               if (in_flight_before > 4 && (ioh->flags & PGAIO_HF_BUFFERED))
-                       io_uring_sqe_set_flags(sqe, IOSQE_ASYNC);
-
-               in_flight_before++;
         }
  
         while (true)
@@ -701,10 +679,65 @@ pgaio_uring_check_one(PgAioHandle *ioh, uint64 ref_generation)
         LWLockRelease(&owner_context->completion_lock);
  }
  
+/*
+ * io_uring executes IO in process context if possible. That's generally good,
+ * as it reduces context switching. When performing a lot of buffered IO that
+ * means that copying between page cache and userspace memory happens in the
+ * foreground, as it can't be offloaded to DMA hardware as is possible when
+ * using direct IO. When executing a lot of buffered IO this causes io_uring
+ * to be slower than worker mode, as worker mode parallelizes the
+ * copying. io_uring can be told to offload work to worker threads instead.
+ *
+ * If the IOs are small, we only benefit from forcing things into the
+ * background if there is a lot of IO, as otherwise the overhead from context
+ * switching is higher than the gain.
+ *
+ * If IOs are large, there is benefit from asynchronous processing at lower
+ * queue depths, as IO latency is less of a crucial factor and parallelizing
+ * memory copies is more important.  In addition, it is important to trigger
+ * asynchronous processing even at low queue depth, as with foreground
+ * processing we might never actually reach deep enough IO depths to trigger
+ * asynchronous processing, which in turn would deprive readahead control
+ * logic of information about whether a deeper look-ahead distance would be
+ * advantageous.
+ *
+ * We have done some basic benchmarking to validate the thresholds used, but
+ * it's quite plausible that there are better values.  See
+ * https://postgr.es/m/3gkuvs3lz3u3skuaxfkxnsysfqslf2srigl6546vhesekve6v2%40va3r5esummvg
+ * for some details of this benchmarking.
+ */
+static bool
+pgaio_uring_should_use_async(PgAioHandle *ioh, size_t io_size)
+{
+       /*
+        * With DIO there's no benefit from forcing asynchronous processing, as
+        * io_uring will never execute direct IO synchronously during submission.
+        */
+       if (!(ioh->flags & PGAIO_HF_BUFFERED))
+               return false;
+
+       /*
+        * Once the IO queue depth is not that shallow anymore, the overhead of
+        * dispatching to the background is a less significant factor.
+        */
+       if (dclist_count(&pgaio_my_backend->in_flight_ios) > 4)
+               return true;
+
+       /*
+        * If the IO is larger, the gains from parallelizing the memory copy are
+        * larger and typically the impact of the latency is smaller.
+        */
+       if (io_size >= (BLCKSZ * 4))
+               return true;
+
+       return false;
+}
+
  static void
  pgaio_uring_sq_from_io(PgAioHandle *ioh, struct io_uring_sqe *sqe)
  {
         struct iovec *iov;
+       size_t          io_size = 0;
  
         switch ((PgAioOp) ioh->op)
         {
@@ -717,6 +750,8 @@ pgaio_uring_sq_from_io(PgAioHandle *ioh, struct io_uring_sqe *sqe)
                                                                    iov->iov_base,
                                                                    iov->iov_len,
                                                                    ioh->op_data.read.offset);
+
+                               io_size = iov->iov_len;
                         }
                         else
                         {
@@ -726,7 +761,13 @@ pgaio_uring_sq_from_io(PgAioHandle *ioh, struct io_uring_sqe *sqe)
                                                                         ioh->op_data.read.iov_length,
                                                                         ioh->op_data.read.offset);
  
+                               for (int i = 0; i < ioh->op_data.read.iov_length; i++, iov++)
+                                       io_size += iov->iov_len;
                         }
+
+                       if (pgaio_uring_should_use_async(ioh, io_size))
+                               io_uring_sqe_set_flags(sqe, IOSQE_ASYNC);
+
                         break;
  
                 case PGAIO_OP_WRITEV:
@@ -747,6 +788,12 @@ pgaio_uring_sq_from_io(PgAioHandle *ioh, struct io_uring_sqe *sqe)
                                                                          ioh->op_data.write.iov_length,
                                                                          ioh->op_data.write.offset);
                         }
+
+                       /*
+                        * For now don't trigger use of IOSQE_ASYNC for writes, it's not
+                        * clear there is a performance benefit in doing so.
+                        */
+
                         break;
  
                 case PGAIO_OP_INVALID:
author	Andres Freund <andres@anarazel.de>
	Sun, 5 Apr 2026 04:43:54 +0000 (00:43 -0400)
committer	Andres Freund <andres@anarazel.de>
	Sun, 5 Apr 2026 04:43:54 +0000 (00:43 -0400)