pgaio_uring_submit(uint16 num_staged_ios, PgAioHandle **staged_ios)
{
struct io_uring *uring_instance = &pgaio_my_uring_context->io_uring_ring;
- int in_flight_before = dclist_count(&pgaio_my_backend->in_flight_ios);
Assert(num_staged_ios <= PGAIO_SUBMIT_BATCH_SIZE);
pgaio_io_prepare_submit(ioh);
pgaio_uring_sq_from_io(ioh, sqe);
-
- /*
- * io_uring executes IO in process context if possible. That's
- * generally good, as it reduces context switching. When performing a
- * lot of buffered IO that means that copying between page cache and
- * userspace memory happens in the foreground, as it can't be
- * offloaded to DMA hardware as is possible when using direct IO. When
- * executing a lot of buffered IO this causes io_uring to be slower
- * than worker mode, as worker mode parallelizes the copying. io_uring
- * can be told to offload work to worker threads instead.
- *
- * If an IO is buffered IO and we already have IOs in flight or
- * multiple IOs are being submitted, we thus tell io_uring to execute
- * the IO in the background. We don't do so for the first few IOs
- * being submitted as executing in this process' context has lower
- * latency.
- */
- if (in_flight_before > 4 && (ioh->flags & PGAIO_HF_BUFFERED))
- io_uring_sqe_set_flags(sqe, IOSQE_ASYNC);
-
- in_flight_before++;
}
while (true)
LWLockRelease(&owner_context->completion_lock);
}
+/*
+ * io_uring executes IO in process context if possible. That's generally good,
+ * as it reduces context switching. When performing a lot of buffered IO that
+ * means that copying between page cache and userspace memory happens in the
+ * foreground, as it can't be offloaded to DMA hardware as is possible when
+ * using direct IO. When executing a lot of buffered IO this causes io_uring
+ * to be slower than worker mode, as worker mode parallelizes the
+ * copying. io_uring can be told to offload work to worker threads instead.
+ *
+ * If the IOs are small, we only benefit from forcing things into the
+ * background if there is a lot of IO, as otherwise the overhead from context
+ * switching is higher than the gain.
+ *
+ * If IOs are large, there is benefit from asynchronous processing at lower
+ * queue depths, as IO latency is less of a crucial factor and parallelizing
+ * memory copies is more important. In addition, it is important to trigger
+ * asynchronous processing even at low queue depth, as with foreground
+ * processing we might never actually reach deep enough IO depths to trigger
+ * asynchronous processing, which in turn would deprive readahead control
+ * logic of information about whether a deeper look-ahead distance would be
+ * advantageous.
+ *
+ * We have done some basic benchmarking to validate the thresholds used, but
+ * it's quite plausible that there are better values. See
+ * https://postgr.es/m/3gkuvs3lz3u3skuaxfkxnsysfqslf2srigl6546vhesekve6v2%40va3r5esummvg
+ * for some details of this benchmarking.
+ */
+static bool
+pgaio_uring_should_use_async(PgAioHandle *ioh, size_t io_size)
+{
+ /*
+ * With DIO there's no benefit from forcing asynchronous processing, as
+ * io_uring will never execute direct IO synchronously during submission.
+ */
+ if (!(ioh->flags & PGAIO_HF_BUFFERED))
+ return false;
+
+ /*
+ * Once the IO queue depth is not that shallow anymore, the overhead of
+ * dispatching to the background is a less significant factor.
+ */
+ if (dclist_count(&pgaio_my_backend->in_flight_ios) > 4)
+ return true;
+
+ /*
+ * If the IO is larger, the gains from parallelizing the memory copy are
+ * larger and typically the impact of the latency is smaller.
+ */
+ if (io_size >= (BLCKSZ * 4))
+ return true;
+
+ return false;
+}
+
static void
pgaio_uring_sq_from_io(PgAioHandle *ioh, struct io_uring_sqe *sqe)
{
struct iovec *iov;
+ size_t io_size = 0;
switch ((PgAioOp) ioh->op)
{
iov->iov_base,
iov->iov_len,
ioh->op_data.read.offset);
+
+ io_size = iov->iov_len;
}
else
{
ioh->op_data.read.iov_length,
ioh->op_data.read.offset);
+ for (int i = 0; i < ioh->op_data.read.iov_length; i++, iov++)
+ io_size += iov->iov_len;
}
+
+ if (pgaio_uring_should_use_async(ioh, io_size))
+ io_uring_sqe_set_flags(sqe, IOSQE_ASYNC);
+
break;
case PGAIO_OP_WRITEV:
ioh->op_data.write.iov_length,
ioh->op_data.write.offset);
}
+
+ /*
+ * For now don't trigger use of IOSQE_ASYNC for writes, it's not
+ * clear there is a performance benefit in doing so.
+ */
+
break;
case PGAIO_OP_INVALID: