From e8b6535533be4269e4b7bd23d4bb17dd976dc7a3 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 13 Oct 2022 14:59:01 -0400 Subject: [PATCH] block: add BDRV_REQ_REGISTERED_BUF request flag Block drivers may optimize I/O requests accessing buffers previously registered with bdrv_register_buf(). Checking whether all elements of a request's QEMUIOVector are within previously registered buffers is expensive, so we need a hint from the user to avoid costly checks. Add a BDRV_REQ_REGISTERED_BUF request flag to indicate that all QEMUIOVector elements in an I/O request are known to be within previously registered buffers. Always pass the flag through to driver read/write functions. There is little harm in passing the flag to a driver that does not use it. Passing the flag to drivers avoids changes across many block drivers. Filter drivers would need to explicitly support the flag and pass through to their children when the children support it. That's a lot of code changes and it's hard to remember to do that everywhere, leading to silent reduced performance when the flag is accidentally dropped. The only problematic scenario with the approach in this patch is when a driver passes the flag through to internal I/O requests that don't use the same I/O buffer. In that case the hint may be set when it should actually be clear. This is a rare case though so the risk is low. Some drivers have assert(!flags), which no longer works when BDRV_REQ_REGISTERED_BUF is passed in. These assertions aren't very useful anyway since the functions are called almost exclusively by bdrv_driver_preadv/pwritev() so if we get flags handling right there then the assertion is not needed. Signed-off-by: Stefan Hajnoczi Message-id: 20221013185908.1297568-7-stefanha@redhat.com Signed-off-by: Stefan Hajnoczi --- block.c | 14 +++++++++ block/blkverify.c | 4 +-- block/crypto.c | 4 +-- block/file-posix.c | 1 - block/gluster.c | 1 - block/io.c | 61 ++++++++++++++++++++++-------------- block/mirror.c | 2 ++ block/nbd.c | 1 - block/parallels.c | 1 - block/qcow.c | 2 -- block/qed.c | 1 - block/raw-format.c | 2 ++ block/replication.c | 1 - block/ssh.c | 1 - block/vhdx.c | 1 - include/block/block-common.h | 9 ++++++ 16 files changed, 69 insertions(+), 37 deletions(-) diff --git a/block.c b/block.c index 1fbf6b9e699..c69be2cfe3e 100644 --- a/block.c +++ b/block.c @@ -1641,6 +1641,20 @@ static int bdrv_open_driver(BlockDriverState *bs, BlockDriver *drv, goto open_failed; } + assert(!(bs->supported_read_flags & ~BDRV_REQ_MASK)); + assert(!(bs->supported_write_flags & ~BDRV_REQ_MASK)); + + /* + * Always allow the BDRV_REQ_REGISTERED_BUF optimization hint. This saves + * drivers that pass read/write requests through to a child the trouble of + * declaring support explicitly. + * + * Drivers must not propagate this flag accidentally when they initiate I/O + * to a bounce buffer. That case should be rare though. + */ + bs->supported_read_flags |= BDRV_REQ_REGISTERED_BUF; + bs->supported_write_flags |= BDRV_REQ_REGISTERED_BUF; + ret = refresh_total_sectors(bs, bs->total_sectors); if (ret < 0) { error_setg_errno(errp, -ret, "Could not refresh total sector count"); diff --git a/block/blkverify.c b/block/blkverify.c index 020b1ae7b60..f36fd6aeb25 100644 --- a/block/blkverify.c +++ b/block/blkverify.c @@ -235,8 +235,8 @@ blkverify_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes, qemu_iovec_init(&raw_qiov, qiov->niov); qemu_iovec_clone(&raw_qiov, qiov, buf); - ret = blkverify_co_prwv(bs, &r, offset, bytes, qiov, &raw_qiov, flags, - false); + ret = blkverify_co_prwv(bs, &r, offset, bytes, qiov, &raw_qiov, + flags & ~BDRV_REQ_REGISTERED_BUF, false); cmp_offset = qemu_iovec_compare(qiov, &raw_qiov); if (cmp_offset != -1) { diff --git a/block/crypto.c b/block/crypto.c index 7a57774b768..c7365598a7c 100644 --- a/block/crypto.c +++ b/block/crypto.c @@ -410,7 +410,6 @@ block_crypto_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes, uint64_t sector_size = qcrypto_block_get_sector_size(crypto->block); uint64_t payload_offset = qcrypto_block_get_payload_offset(crypto->block); - assert(!flags); assert(payload_offset < INT64_MAX); assert(QEMU_IS_ALIGNED(offset, sector_size)); assert(QEMU_IS_ALIGNED(bytes, sector_size)); @@ -473,7 +472,8 @@ block_crypto_co_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes, uint64_t sector_size = qcrypto_block_get_sector_size(crypto->block); uint64_t payload_offset = qcrypto_block_get_payload_offset(crypto->block); - assert(!(flags & ~BDRV_REQ_FUA)); + flags &= ~BDRV_REQ_REGISTERED_BUF; + assert(payload_offset < INT64_MAX); assert(QEMU_IS_ALIGNED(offset, sector_size)); assert(QEMU_IS_ALIGNED(bytes, sector_size)); diff --git a/block/file-posix.c b/block/file-posix.c index 23acffb9a45..b9647c5ffc2 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -2133,7 +2133,6 @@ static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes, QEMUIOVector *qiov, BdrvRequestFlags flags) { - assert(flags == 0); return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_WRITE); } diff --git a/block/gluster.c b/block/gluster.c index bb1144cf6a0..7c90f7ba4b8 100644 --- a/block/gluster.c +++ b/block/gluster.c @@ -1236,7 +1236,6 @@ static coroutine_fn int qemu_gluster_co_writev(BlockDriverState *bs, QEMUIOVector *qiov, int flags) { - assert(!flags); return qemu_gluster_co_rw(bs, sector_num, nb_sectors, qiov, 1); } diff --git a/block/io.c b/block/io.c index cca402bf7bc..4207648db68 100644 --- a/block/io.c +++ b/block/io.c @@ -1130,8 +1130,7 @@ static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs, int ret; bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort); - assert(!(flags & ~BDRV_REQ_MASK)); - assert(!(flags & BDRV_REQ_NO_FALLBACK)); + assert(!(flags & ~bs->supported_read_flags)); if (!drv) { return -ENOMEDIUM; @@ -1195,23 +1194,29 @@ static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs, BdrvRequestFlags flags) { BlockDriver *drv = bs->drv; + bool emulate_fua = false; int64_t sector_num; unsigned int nb_sectors; QEMUIOVector local_qiov; int ret; bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort); - assert(!(flags & ~BDRV_REQ_MASK)); - assert(!(flags & BDRV_REQ_NO_FALLBACK)); if (!drv) { return -ENOMEDIUM; } + if ((flags & BDRV_REQ_FUA) && + (~bs->supported_write_flags & BDRV_REQ_FUA)) { + flags &= ~BDRV_REQ_FUA; + emulate_fua = true; + } + + flags &= bs->supported_write_flags; + if (drv->bdrv_co_pwritev_part) { ret = drv->bdrv_co_pwritev_part(bs, offset, bytes, qiov, qiov_offset, - flags & bs->supported_write_flags); - flags &= ~bs->supported_write_flags; + flags); goto emulate_flags; } @@ -1221,9 +1226,7 @@ static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs, } if (drv->bdrv_co_pwritev) { - ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov, - flags & bs->supported_write_flags); - flags &= ~bs->supported_write_flags; + ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov, flags); goto emulate_flags; } @@ -1233,10 +1236,8 @@ static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs, .coroutine = qemu_coroutine_self(), }; - acb = drv->bdrv_aio_pwritev(bs, offset, bytes, qiov, - flags & bs->supported_write_flags, + acb = drv->bdrv_aio_pwritev(bs, offset, bytes, qiov, flags, bdrv_co_io_em_complete, &co); - flags &= ~bs->supported_write_flags; if (acb == NULL) { ret = -EIO; } else { @@ -1254,12 +1255,10 @@ static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs, assert(bytes <= BDRV_REQUEST_MAX_BYTES); assert(drv->bdrv_co_writev); - ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov, - flags & bs->supported_write_flags); - flags &= ~bs->supported_write_flags; + ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov, flags); emulate_flags: - if (ret == 0 && (flags & BDRV_REQ_FUA)) { + if (ret == 0 && emulate_fua) { ret = bdrv_co_flush(bs); } @@ -1487,11 +1486,14 @@ static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child, max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX), align); - /* TODO: We would need a per-BDS .supported_read_flags and + /* + * TODO: We would need a per-BDS .supported_read_flags and * potential fallback support, if we ever implement any read flags * to pass through to drivers. For now, there aren't any - * passthrough flags. */ - assert(!(flags & ~(BDRV_REQ_COPY_ON_READ | BDRV_REQ_PREFETCH))); + * passthrough flags except the BDRV_REQ_REGISTERED_BUF optimization hint. + */ + assert(!(flags & ~(BDRV_REQ_COPY_ON_READ | BDRV_REQ_PREFETCH | + BDRV_REQ_REGISTERED_BUF))); /* Handle Copy on Read and associated serialisation */ if (flags & BDRV_REQ_COPY_ON_READ) { @@ -1532,7 +1534,7 @@ static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child, goto out; } - assert(!(flags & ~bs->supported_read_flags)); + assert(!(flags & ~(bs->supported_read_flags | BDRV_REQ_REGISTERED_BUF))); max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align); if (bytes <= max_bytes && bytes <= max_transfer) { @@ -1721,7 +1723,8 @@ static void bdrv_padding_destroy(BdrvRequestPadding *pad) static int bdrv_pad_request(BlockDriverState *bs, QEMUIOVector **qiov, size_t *qiov_offset, int64_t *offset, int64_t *bytes, - BdrvRequestPadding *pad, bool *padded) + BdrvRequestPadding *pad, bool *padded, + BdrvRequestFlags *flags) { int ret; @@ -1749,6 +1752,10 @@ static int bdrv_pad_request(BlockDriverState *bs, if (padded) { *padded = true; } + if (flags) { + /* Can't use optimization hint with bounce buffer */ + *flags &= ~BDRV_REQ_REGISTERED_BUF; + } return 0; } @@ -1803,7 +1810,7 @@ int coroutine_fn bdrv_co_preadv_part(BdrvChild *child, } ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad, - NULL); + NULL, &flags); if (ret < 0) { goto fail; } @@ -1848,6 +1855,11 @@ static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, return -ENOTSUP; } + /* By definition there is no user buffer so this flag doesn't make sense */ + if (flags & BDRV_REQ_REGISTERED_BUF) { + return -EINVAL; + } + /* Invalidate the cached block-status data range if this write overlaps */ bdrv_bsc_invalidate_range(bs, offset, bytes); @@ -2133,6 +2145,9 @@ static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child, bool padding; BdrvRequestPadding pad; + /* This flag doesn't make sense for padding or zero writes */ + flags &= ~BDRV_REQ_REGISTERED_BUF; + padding = bdrv_init_padding(bs, offset, bytes, &pad); if (padding) { assert(!(flags & BDRV_REQ_NO_WAIT)); @@ -2250,7 +2265,7 @@ int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child, * alignment only if there is no ZERO flag. */ ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad, - &padded); + &padded, &flags); if (ret < 0) { return ret; } diff --git a/block/mirror.c b/block/mirror.c index 80c0109d396..bed089d2e07 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -1486,6 +1486,8 @@ static int coroutine_fn bdrv_mirror_top_pwritev(BlockDriverState *bs, qemu_iovec_init(&bounce_qiov, 1); qemu_iovec_add(&bounce_qiov, bounce_buf, bytes); qiov = &bounce_qiov; + + flags &= ~BDRV_REQ_REGISTERED_BUF; } ret = bdrv_mirror_top_do_write(bs, MIRROR_METHOD_COPY, offset, bytes, qiov, diff --git a/block/nbd.c b/block/nbd.c index 494b9d683ea..7d485c86d28 100644 --- a/block/nbd.c +++ b/block/nbd.c @@ -1222,7 +1222,6 @@ static int coroutine_fn nbd_client_co_preadv(BlockDriverState *bs, int64_t offse }; assert(bytes <= NBD_MAX_BUFFER_SIZE); - assert(!flags); if (!bytes) { return 0; diff --git a/block/parallels.c b/block/parallels.c index c1523e7dabd..dd15a441005 100644 --- a/block/parallels.c +++ b/block/parallels.c @@ -329,7 +329,6 @@ static coroutine_fn int parallels_co_writev(BlockDriverState *bs, QEMUIOVector hd_qiov; int ret = 0; - assert(!flags); qemu_iovec_init(&hd_qiov, qiov->niov); while (nb_sectors > 0) { diff --git a/block/qcow.c b/block/qcow.c index 311aaa87052..e9180c7b618 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -628,7 +628,6 @@ static coroutine_fn int qcow_co_preadv(BlockDriverState *bs, int64_t offset, uint8_t *buf; void *orig_buf; - assert(!flags); if (qiov->niov > 1) { buf = orig_buf = qemu_try_blockalign(bs, qiov->size); if (buf == NULL) { @@ -725,7 +724,6 @@ static coroutine_fn int qcow_co_pwritev(BlockDriverState *bs, int64_t offset, uint8_t *buf; void *orig_buf; - assert(!flags); s->cluster_cache_offset = -1; /* disable compressed cache */ /* We must always copy the iov when encrypting, so we diff --git a/block/qed.c b/block/qed.c index bda00e6257b..99a9ec9b57c 100644 --- a/block/qed.c +++ b/block/qed.c @@ -1395,7 +1395,6 @@ static int coroutine_fn bdrv_qed_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, int flags) { - assert(!flags); return qed_co_request(bs, sector_num, qiov, nb_sectors, QED_AIOCB_WRITE); } diff --git a/block/raw-format.c b/block/raw-format.c index f337ac75696..c8dc9bc850f 100644 --- a/block/raw-format.c +++ b/block/raw-format.c @@ -258,6 +258,8 @@ static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, int64_t offset, qemu_iovec_add(&local_qiov, buf, 512); qemu_iovec_concat(&local_qiov, qiov, 512, qiov->size - 512); qiov = &local_qiov; + + flags &= ~BDRV_REQ_REGISTERED_BUF; } ret = raw_adjust_offset(bs, &offset, bytes, true); diff --git a/block/replication.c b/block/replication.c index c67f931f376..13f1d39571b 100644 --- a/block/replication.c +++ b/block/replication.c @@ -261,7 +261,6 @@ static coroutine_fn int replication_co_writev(BlockDriverState *bs, int ret; int64_t n; - assert(!flags); ret = replication_get_io_status(s); if (ret < 0) { goto out; diff --git a/block/ssh.c b/block/ssh.c index a2dc6465369..a3cddc392c3 100644 --- a/block/ssh.c +++ b/block/ssh.c @@ -1196,7 +1196,6 @@ static coroutine_fn int ssh_co_writev(BlockDriverState *bs, BDRVSSHState *s = bs->opaque; int ret; - assert(!flags); qemu_co_mutex_lock(&s->lock); ret = ssh_write(s, bs, sector_num * BDRV_SECTOR_SIZE, nb_sectors * BDRV_SECTOR_SIZE, qiov); diff --git a/block/vhdx.c b/block/vhdx.c index e10e78ebfdf..e2344ee0b7c 100644 --- a/block/vhdx.c +++ b/block/vhdx.c @@ -1342,7 +1342,6 @@ static coroutine_fn int vhdx_co_writev(BlockDriverState *bs, int64_t sector_num, uint64_t bat_prior_offset = 0; bool bat_update = false; - assert(!flags); qemu_iovec_init(&hd_qiov, qiov->niov); qemu_co_mutex_lock(&s->lock); diff --git a/include/block/block-common.h b/include/block/block-common.h index fdb7306e78a..061606e867c 100644 --- a/include/block/block-common.h +++ b/include/block/block-common.h @@ -80,6 +80,15 @@ typedef enum { */ BDRV_REQ_MAY_UNMAP = 0x4, + /* + * An optimization hint when all QEMUIOVector elements are within + * previously registered bdrv_register_buf() memory ranges. + * + * Code that replaces the user's QEMUIOVector elements with bounce buffers + * must take care to clear this flag. + */ + BDRV_REQ_REGISTERED_BUF = 0x8, + BDRV_REQ_FUA = 0x10, BDRV_REQ_WRITE_COMPRESSED = 0x20, -- 2.47.3