--- /dev/null
+From a3145690c98e01d16cf5192141f69e6c59e46fe1 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 11 Sep 2023 12:57:04 +0100
+Subject: iommu/arm-smmu-v3: Avoid constructing invalid range commands
+
+From: Robin Murphy <robin.murphy@arm.com>
+
+[ Upstream commit eb6c97647be227822c7ce23655482b05e348fba5 ]
+
+Although io-pgtable's non-leaf invalidations are always for full tables,
+I missed that SVA also uses non-leaf invalidations, while being at the
+mercy of whatever range the MMU notifier throws at it. This means it
+definitely wants the previous TTL fix as well, since it also doesn't
+know exactly which leaf level(s) may need invalidating, but it can also
+give us less-aligned ranges wherein certain corners may lead to building
+an invalid command where TTL, Num and Scale are all 0. It should be fine
+to handle this by over-invalidating an extra page, since falling back to
+a non-range command opens up a whole can of errata-flavoured worms.
+
+Fixes: 6833b8f2e199 ("iommu/arm-smmu-v3: Set TTL invalidation hint better")
+Reported-by: Rui Zhu <zhurui3@huawei.com>
+Signed-off-by: Robin Murphy <robin.murphy@arm.com>
+Link: https://lore.kernel.org/r/b99cfe71af2bd93a8a2930f20967fb2a4f7748dd.1694432734.git.robin.murphy@arm.com
+Signed-off-by: Will Deacon <will@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 15 ++++++++++-----
+ 1 file changed, 10 insertions(+), 5 deletions(-)
+
+diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+index becf37c088772..8966f7d5aab61 100644
+--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
++++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+@@ -1886,18 +1886,23 @@ static void __arm_smmu_tlb_inv_range(struct arm_smmu_cmdq_ent *cmd,
+ /* Get the leaf page size */
+ tg = __ffs(smmu_domain->domain.pgsize_bitmap);
+
++ num_pages = size >> tg;
++
+ /* Convert page size of 12,14,16 (log2) to 1,2,3 */
+ cmd->tlbi.tg = (tg - 10) / 2;
+
+ /*
+- * Determine what level the granule is at. For non-leaf, io-pgtable
+- * assumes .tlb_flush_walk can invalidate multiple levels at once,
+- * so ignore the nominal last-level granule and leave TTL=0.
++ * Determine what level the granule is at. For non-leaf, both
++ * io-pgtable and SVA pass a nominal last-level granule because
++ * they don't know what level(s) actually apply, so ignore that
++ * and leave TTL=0. However for various errata reasons we still
++ * want to use a range command, so avoid the SVA corner case
++ * where both scale and num could be 0 as well.
+ */
+ if (cmd->tlbi.leaf)
+ cmd->tlbi.ttl = 4 - ((ilog2(granule) - 3) / (tg - 3));
+-
+- num_pages = size >> tg;
++ else if ((num_pages & CMDQ_TLBI_RANGE_NUM_MAX) == 1)
++ num_pages++;
+ }
+
+ cmds.num = 0;
+--
+2.40.1
+
--- /dev/null
+From 3c125eedc84784f2e10502f95ab4a9fa173f082d Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 1 Jun 2023 17:43:33 +0100
+Subject: iommu/arm-smmu-v3: Set TTL invalidation hint better
+
+From: Robin Murphy <robin.murphy@arm.com>
+
+[ Upstream commit 6833b8f2e19945a41e4d5efd8c6d9f4cae9a5b7d ]
+
+When io-pgtable unmaps a whole table, rather than waste time walking it
+to find the leaf entries to invalidate exactly, it simply expects
+.tlb_flush_walk with nominal last-level granularity to invalidate any
+leaf entries at higher intermediate levels as well. This works fine with
+page-based invalidation, but with range commands we need to be careful
+with the TTL hint - unconditionally setting it based on the given level
+3 granule means that an invalidation for a level 1 table would strictly
+not be required to affect level 2 block entries. It's easy to comply
+with the expected behaviour by simply not setting the TTL hint for
+non-leaf invalidations, so let's do that.
+
+Signed-off-by: Robin Murphy <robin.murphy@arm.com>
+Link: https://lore.kernel.org/r/b409d9a17c52dc0db51faee91d92737bb7975f5b.1685637456.git.robin.murphy@arm.com
+Signed-off-by: Will Deacon <will@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 9 +++++++--
+ 1 file changed, 7 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+index db33dc87f69ed..becf37c088772 100644
+--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
++++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+@@ -1889,8 +1889,13 @@ static void __arm_smmu_tlb_inv_range(struct arm_smmu_cmdq_ent *cmd,
+ /* Convert page size of 12,14,16 (log2) to 1,2,3 */
+ cmd->tlbi.tg = (tg - 10) / 2;
+
+- /* Determine what level the granule is at */
+- cmd->tlbi.ttl = 4 - ((ilog2(granule) - 3) / (tg - 3));
++ /*
++ * Determine what level the granule is at. For non-leaf, io-pgtable
++ * assumes .tlb_flush_walk can invalidate multiple levels at once,
++ * so ignore the nominal last-level granule and leave TTL=0.
++ */
++ if (cmd->tlbi.leaf)
++ cmd->tlbi.ttl = 4 - ((ilog2(granule) - 3) / (tg - 3));
+
+ num_pages = size >> tg;
+ }
+--
+2.40.1
+
--- /dev/null
+From 28fcba242a2f3bd6cfbf34d0865cb8c53437b749 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 5 Oct 2023 11:59:33 +0200
+Subject: rbd: decouple header read-in from updating rbd_dev->header
+
+From: Ilya Dryomov <idryomov@gmail.com>
+
+commit 510a7330c82a7754d5df0117a8589e8a539067c7 upstream.
+
+Make rbd_dev_header_info() populate a passed struct rbd_image_header
+instead of rbd_dev->header and introduce rbd_dev_update_header() for
+updating mutable fields in rbd_dev->header upon refresh. The initial
+read-in of both mutable and immutable fields in rbd_dev_image_probe()
+passes in rbd_dev->header so no update step is required there.
+
+rbd_init_layout() is now called directly from rbd_dev_image_probe()
+instead of individually in format 1 and format 2 implementations.
+
+Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
+Reviewed-by: Dongsheng Yang <dongsheng.yang@easystack.cn>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/block/rbd.c | 206 ++++++++++++++++++++++++--------------------
+ 1 file changed, 114 insertions(+), 92 deletions(-)
+
+diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
+index 762795430b4d8..b1c44c6338573 100644
+--- a/drivers/block/rbd.c
++++ b/drivers/block/rbd.c
+@@ -632,7 +632,8 @@ void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
+ static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);
+
+ static int rbd_dev_refresh(struct rbd_device *rbd_dev);
+-static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
++static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev,
++ struct rbd_image_header *header);
+ static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
+ u64 snap_id);
+ static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
+@@ -993,15 +994,24 @@ static void rbd_init_layout(struct rbd_device *rbd_dev)
+ RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
+ }
+
++static void rbd_image_header_cleanup(struct rbd_image_header *header)
++{
++ kfree(header->object_prefix);
++ ceph_put_snap_context(header->snapc);
++ kfree(header->snap_sizes);
++ kfree(header->snap_names);
++
++ memset(header, 0, sizeof(*header));
++}
++
+ /*
+ * Fill an rbd image header with information from the given format 1
+ * on-disk header.
+ */
+-static int rbd_header_from_disk(struct rbd_device *rbd_dev,
+- struct rbd_image_header_ondisk *ondisk)
++static int rbd_header_from_disk(struct rbd_image_header *header,
++ struct rbd_image_header_ondisk *ondisk,
++ bool first_time)
+ {
+- struct rbd_image_header *header = &rbd_dev->header;
+- bool first_time = header->object_prefix == NULL;
+ struct ceph_snap_context *snapc;
+ char *object_prefix = NULL;
+ char *snap_names = NULL;
+@@ -1068,11 +1078,6 @@ static int rbd_header_from_disk(struct rbd_device *rbd_dev,
+ if (first_time) {
+ header->object_prefix = object_prefix;
+ header->obj_order = ondisk->options.order;
+- rbd_init_layout(rbd_dev);
+- } else {
+- ceph_put_snap_context(header->snapc);
+- kfree(header->snap_names);
+- kfree(header->snap_sizes);
+ }
+
+ /* The remaining fields always get updated (when we refresh) */
+@@ -4858,7 +4863,9 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
+ * return, the rbd_dev->header field will contain up-to-date
+ * information about the image.
+ */
+-static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
++static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev,
++ struct rbd_image_header *header,
++ bool first_time)
+ {
+ struct rbd_image_header_ondisk *ondisk = NULL;
+ u32 snap_count = 0;
+@@ -4906,7 +4913,7 @@ static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
+ snap_count = le32_to_cpu(ondisk->snap_count);
+ } while (snap_count != want_count);
+
+- ret = rbd_header_from_disk(rbd_dev, ondisk);
++ ret = rbd_header_from_disk(header, ondisk, first_time);
+ out:
+ kfree(ondisk);
+
+@@ -5469,17 +5476,12 @@ static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
+ return 0;
+ }
+
+-static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
+-{
+- return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
+- &rbd_dev->header.obj_order,
+- &rbd_dev->header.image_size);
+-}
+-
+-static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
++static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev,
++ char **pobject_prefix)
+ {
+ size_t size;
+ void *reply_buf;
++ char *object_prefix;
+ int ret;
+ void *p;
+
+@@ -5497,16 +5499,16 @@ static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
+ goto out;
+
+ p = reply_buf;
+- rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
+- p + ret, NULL, GFP_NOIO);
++ object_prefix = ceph_extract_encoded_string(&p, p + ret, NULL,
++ GFP_NOIO);
++ if (IS_ERR(object_prefix)) {
++ ret = PTR_ERR(object_prefix);
++ goto out;
++ }
+ ret = 0;
+
+- if (IS_ERR(rbd_dev->header.object_prefix)) {
+- ret = PTR_ERR(rbd_dev->header.object_prefix);
+- rbd_dev->header.object_prefix = NULL;
+- } else {
+- dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
+- }
++ *pobject_prefix = object_prefix;
++ dout(" object_prefix = %s\n", object_prefix);
+ out:
+ kfree(reply_buf);
+
+@@ -5557,13 +5559,6 @@ static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
+ return 0;
+ }
+
+-static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
+-{
+- return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
+- rbd_is_ro(rbd_dev),
+- &rbd_dev->header.features);
+-}
+-
+ /*
+ * These are generic image flags, but since they are used only for
+ * object map, store them in rbd_dev->object_map_flags.
+@@ -5838,14 +5833,14 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
+ return ret;
+ }
+
+-static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
++static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev,
++ u64 *stripe_unit, u64 *stripe_count)
+ {
+ struct {
+ __le64 stripe_unit;
+ __le64 stripe_count;
+ } __attribute__ ((packed)) striping_info_buf = { 0 };
+ size_t size = sizeof (striping_info_buf);
+- void *p;
+ int ret;
+
+ ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
+@@ -5857,27 +5852,33 @@ static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
+ if (ret < size)
+ return -ERANGE;
+
+- p = &striping_info_buf;
+- rbd_dev->header.stripe_unit = ceph_decode_64(&p);
+- rbd_dev->header.stripe_count = ceph_decode_64(&p);
++ *stripe_unit = le64_to_cpu(striping_info_buf.stripe_unit);
++ *stripe_count = le64_to_cpu(striping_info_buf.stripe_count);
++ dout(" stripe_unit = %llu stripe_count = %llu\n", *stripe_unit,
++ *stripe_count);
++
+ return 0;
+ }
+
+-static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev)
++static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev, s64 *data_pool_id)
+ {
+- __le64 data_pool_id;
++ __le64 data_pool_buf;
+ int ret;
+
+ ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
+ &rbd_dev->header_oloc, "get_data_pool",
+- NULL, 0, &data_pool_id, sizeof(data_pool_id));
++ NULL, 0, &data_pool_buf,
++ sizeof(data_pool_buf));
++ dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
+ if (ret < 0)
+ return ret;
+- if (ret < sizeof(data_pool_id))
++ if (ret < sizeof(data_pool_buf))
+ return -EBADMSG;
+
+- rbd_dev->header.data_pool_id = le64_to_cpu(data_pool_id);
+- WARN_ON(rbd_dev->header.data_pool_id == CEPH_NOPOOL);
++ *data_pool_id = le64_to_cpu(data_pool_buf);
++ dout(" data_pool_id = %lld\n", *data_pool_id);
++ WARN_ON(*data_pool_id == CEPH_NOPOOL);
++
+ return 0;
+ }
+
+@@ -6069,7 +6070,8 @@ static int rbd_spec_fill_names(struct rbd_device *rbd_dev)
+ return ret;
+ }
+
+-static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
++static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev,
++ struct ceph_snap_context **psnapc)
+ {
+ size_t size;
+ int ret;
+@@ -6130,9 +6132,7 @@ static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
+ for (i = 0; i < snap_count; i++)
+ snapc->snaps[i] = ceph_decode_64(&p);
+
+- ceph_put_snap_context(rbd_dev->header.snapc);
+- rbd_dev->header.snapc = snapc;
+-
++ *psnapc = snapc;
+ dout(" snap context seq = %llu, snap_count = %u\n",
+ (unsigned long long)seq, (unsigned int)snap_count);
+ out:
+@@ -6181,38 +6181,42 @@ static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
+ return snap_name;
+ }
+
+-static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
++static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev,
++ struct rbd_image_header *header,
++ bool first_time)
+ {
+- bool first_time = rbd_dev->header.object_prefix == NULL;
+ int ret;
+
+- ret = rbd_dev_v2_image_size(rbd_dev);
++ ret = _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
++ first_time ? &header->obj_order : NULL,
++ &header->image_size);
+ if (ret)
+ return ret;
+
+ if (first_time) {
+- ret = rbd_dev_v2_header_onetime(rbd_dev);
++ ret = rbd_dev_v2_header_onetime(rbd_dev, header);
+ if (ret)
+ return ret;
+ }
+
+- ret = rbd_dev_v2_snap_context(rbd_dev);
+- if (ret && first_time) {
+- kfree(rbd_dev->header.object_prefix);
+- rbd_dev->header.object_prefix = NULL;
+- }
++ ret = rbd_dev_v2_snap_context(rbd_dev, &header->snapc);
++ if (ret)
++ return ret;
+
+- return ret;
++ return 0;
+ }
+
+-static int rbd_dev_header_info(struct rbd_device *rbd_dev)
++static int rbd_dev_header_info(struct rbd_device *rbd_dev,
++ struct rbd_image_header *header,
++ bool first_time)
+ {
+ rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
++ rbd_assert(!header->object_prefix && !header->snapc);
+
+ if (rbd_dev->image_format == 1)
+- return rbd_dev_v1_header_info(rbd_dev);
++ return rbd_dev_v1_header_info(rbd_dev, header, first_time);
+
+- return rbd_dev_v2_header_info(rbd_dev);
++ return rbd_dev_v2_header_info(rbd_dev, header, first_time);
+ }
+
+ /*
+@@ -6700,60 +6704,49 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev)
+ */
+ static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
+ {
+- struct rbd_image_header *header;
+-
+ rbd_dev_parent_put(rbd_dev);
+ rbd_object_map_free(rbd_dev);
+ rbd_dev_mapping_clear(rbd_dev);
+
+ /* Free dynamic fields from the header, then zero it out */
+
+- header = &rbd_dev->header;
+- ceph_put_snap_context(header->snapc);
+- kfree(header->snap_sizes);
+- kfree(header->snap_names);
+- kfree(header->object_prefix);
+- memset(header, 0, sizeof (*header));
++ rbd_image_header_cleanup(&rbd_dev->header);
+ }
+
+-static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
++static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev,
++ struct rbd_image_header *header)
+ {
+ int ret;
+
+- ret = rbd_dev_v2_object_prefix(rbd_dev);
++ ret = rbd_dev_v2_object_prefix(rbd_dev, &header->object_prefix);
+ if (ret)
+- goto out_err;
++ return ret;
+
+ /*
+ * Get the and check features for the image. Currently the
+ * features are assumed to never change.
+ */
+- ret = rbd_dev_v2_features(rbd_dev);
++ ret = _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
++ rbd_is_ro(rbd_dev), &header->features);
+ if (ret)
+- goto out_err;
++ return ret;
+
+ /* If the image supports fancy striping, get its parameters */
+
+- if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
+- ret = rbd_dev_v2_striping_info(rbd_dev);
+- if (ret < 0)
+- goto out_err;
++ if (header->features & RBD_FEATURE_STRIPINGV2) {
++ ret = rbd_dev_v2_striping_info(rbd_dev, &header->stripe_unit,
++ &header->stripe_count);
++ if (ret)
++ return ret;
+ }
+
+- if (rbd_dev->header.features & RBD_FEATURE_DATA_POOL) {
+- ret = rbd_dev_v2_data_pool(rbd_dev);
++ if (header->features & RBD_FEATURE_DATA_POOL) {
++ ret = rbd_dev_v2_data_pool(rbd_dev, &header->data_pool_id);
+ if (ret)
+- goto out_err;
++ return ret;
+ }
+
+- rbd_init_layout(rbd_dev);
+ return 0;
+-
+-out_err:
+- rbd_dev->header.features = 0;
+- kfree(rbd_dev->header.object_prefix);
+- rbd_dev->header.object_prefix = NULL;
+- return ret;
+ }
+
+ /*
+@@ -6948,13 +6941,15 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
+ if (!depth)
+ down_write(&rbd_dev->header_rwsem);
+
+- ret = rbd_dev_header_info(rbd_dev);
++ ret = rbd_dev_header_info(rbd_dev, &rbd_dev->header, true);
+ if (ret) {
+ if (ret == -ENOENT && !need_watch)
+ rbd_print_dne(rbd_dev, false);
+ goto err_out_probe;
+ }
+
++ rbd_init_layout(rbd_dev);
++
+ /*
+ * If this image is the one being mapped, we have pool name and
+ * id, image name and id, and snap name - need to fill snap id.
+@@ -7009,15 +7004,39 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
+ return ret;
+ }
+
++static void rbd_dev_update_header(struct rbd_device *rbd_dev,
++ struct rbd_image_header *header)
++{
++ rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
++ rbd_assert(rbd_dev->header.object_prefix); /* !first_time */
++
++ rbd_dev->header.image_size = header->image_size;
++
++ ceph_put_snap_context(rbd_dev->header.snapc);
++ rbd_dev->header.snapc = header->snapc;
++ header->snapc = NULL;
++
++ if (rbd_dev->image_format == 1) {
++ kfree(rbd_dev->header.snap_names);
++ rbd_dev->header.snap_names = header->snap_names;
++ header->snap_names = NULL;
++
++ kfree(rbd_dev->header.snap_sizes);
++ rbd_dev->header.snap_sizes = header->snap_sizes;
++ header->snap_sizes = NULL;
++ }
++}
++
+ static int rbd_dev_refresh(struct rbd_device *rbd_dev)
+ {
++ struct rbd_image_header header = { 0 };
+ u64 mapping_size;
+ int ret;
+
+ down_write(&rbd_dev->header_rwsem);
+ mapping_size = rbd_dev->mapping.size;
+
+- ret = rbd_dev_header_info(rbd_dev);
++ ret = rbd_dev_header_info(rbd_dev, &header, false);
+ if (ret)
+ goto out;
+
+@@ -7031,6 +7050,8 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev)
+ goto out;
+ }
+
++ rbd_dev_update_header(rbd_dev, &header);
++
+ rbd_assert(!rbd_is_snap(rbd_dev));
+ rbd_dev->mapping.size = rbd_dev->header.image_size;
+
+@@ -7039,6 +7060,7 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev)
+ if (!ret && mapping_size != rbd_dev->mapping.size)
+ rbd_dev_update_size(rbd_dev);
+
++ rbd_image_header_cleanup(&header);
+ return ret;
+ }
+
+--
+2.40.1
+
--- /dev/null
+From a278d5393b26a75b7ec4d1db54da1a6a04e9f9f1 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 5 Oct 2023 11:59:34 +0200
+Subject: rbd: decouple parent info read-in from updating rbd_dev
+
+From: Ilya Dryomov <idryomov@gmail.com>
+
+commit c10311776f0a8ddea2276df96e255625b07045a8 upstream.
+
+Unlike header read-in, parent info read-in is already decoupled in
+get_parent_info(), but it's buried in rbd_dev_v2_parent_info() along
+with the processing logic.
+
+Separate the initial read-in and update read-in logic into
+rbd_dev_setup_parent() and rbd_dev_update_parent() respectively and
+have rbd_dev_v2_parent_info() just populate struct parent_image_info
+(i.e. what get_parent_info() did). Some existing QoI issues, like
+flatten of a standalone clone being disregarded on refresh, remain.
+
+Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
+Reviewed-by: Dongsheng Yang <dongsheng.yang@easystack.cn>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/block/rbd.c | 142 +++++++++++++++++++++++++-------------------
+ 1 file changed, 80 insertions(+), 62 deletions(-)
+
+diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
+index b1c44c6338573..38c92b1b03466 100644
+--- a/drivers/block/rbd.c
++++ b/drivers/block/rbd.c
+@@ -5595,6 +5595,14 @@ struct parent_image_info {
+ u64 overlap;
+ };
+
++static void rbd_parent_info_cleanup(struct parent_image_info *pii)
++{
++ kfree(pii->pool_ns);
++ kfree(pii->image_id);
++
++ memset(pii, 0, sizeof(*pii));
++}
++
+ /*
+ * The caller is responsible for @pii.
+ */
+@@ -5664,6 +5672,9 @@ static int __get_parent_info(struct rbd_device *rbd_dev,
+ if (pii->has_overlap)
+ ceph_decode_64_safe(&p, end, pii->overlap, e_inval);
+
++ dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n",
++ __func__, pii->pool_id, pii->pool_ns, pii->image_id, pii->snap_id,
++ pii->has_overlap, pii->overlap);
+ return 0;
+
+ e_inval:
+@@ -5702,14 +5713,17 @@ static int __get_parent_info_legacy(struct rbd_device *rbd_dev,
+ pii->has_overlap = true;
+ ceph_decode_64_safe(&p, end, pii->overlap, e_inval);
+
++ dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n",
++ __func__, pii->pool_id, pii->pool_ns, pii->image_id, pii->snap_id,
++ pii->has_overlap, pii->overlap);
+ return 0;
+
+ e_inval:
+ return -EINVAL;
+ }
+
+-static int get_parent_info(struct rbd_device *rbd_dev,
+- struct parent_image_info *pii)
++static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev,
++ struct parent_image_info *pii)
+ {
+ struct page *req_page, *reply_page;
+ void *p;
+@@ -5737,7 +5751,7 @@ static int get_parent_info(struct rbd_device *rbd_dev,
+ return ret;
+ }
+
+-static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
++static int rbd_dev_setup_parent(struct rbd_device *rbd_dev)
+ {
+ struct rbd_spec *parent_spec;
+ struct parent_image_info pii = { 0 };
+@@ -5747,37 +5761,12 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
+ if (!parent_spec)
+ return -ENOMEM;
+
+- ret = get_parent_info(rbd_dev, &pii);
++ ret = rbd_dev_v2_parent_info(rbd_dev, &pii);
+ if (ret)
+ goto out_err;
+
+- dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n",
+- __func__, pii.pool_id, pii.pool_ns, pii.image_id, pii.snap_id,
+- pii.has_overlap, pii.overlap);
+-
+- if (pii.pool_id == CEPH_NOPOOL || !pii.has_overlap) {
+- /*
+- * Either the parent never existed, or we have
+- * record of it but the image got flattened so it no
+- * longer has a parent. When the parent of a
+- * layered image disappears we immediately set the
+- * overlap to 0. The effect of this is that all new
+- * requests will be treated as if the image had no
+- * parent.
+- *
+- * If !pii.has_overlap, the parent image spec is not
+- * applicable. It's there to avoid duplication in each
+- * snapshot record.
+- */
+- if (rbd_dev->parent_overlap) {
+- rbd_dev->parent_overlap = 0;
+- rbd_dev_parent_put(rbd_dev);
+- pr_info("%s: clone image has been flattened\n",
+- rbd_dev->disk->disk_name);
+- }
+-
++ if (pii.pool_id == CEPH_NOPOOL || !pii.has_overlap)
+ goto out; /* No parent? No problem. */
+- }
+
+ /* The ceph file layout needs to fit pool id in 32 bits */
+
+@@ -5789,46 +5778,34 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
+ }
+
+ /*
+- * The parent won't change (except when the clone is
+- * flattened, already handled that). So we only need to
+- * record the parent spec we have not already done so.
++ * The parent won't change except when the clone is flattened,
++ * so we only need to record the parent image spec once.
+ */
+- if (!rbd_dev->parent_spec) {
+- parent_spec->pool_id = pii.pool_id;
+- if (pii.pool_ns && *pii.pool_ns) {
+- parent_spec->pool_ns = pii.pool_ns;
+- pii.pool_ns = NULL;
+- }
+- parent_spec->image_id = pii.image_id;
+- pii.image_id = NULL;
+- parent_spec->snap_id = pii.snap_id;
+-
+- rbd_dev->parent_spec = parent_spec;
+- parent_spec = NULL; /* rbd_dev now owns this */
++ parent_spec->pool_id = pii.pool_id;
++ if (pii.pool_ns && *pii.pool_ns) {
++ parent_spec->pool_ns = pii.pool_ns;
++ pii.pool_ns = NULL;
+ }
++ parent_spec->image_id = pii.image_id;
++ pii.image_id = NULL;
++ parent_spec->snap_id = pii.snap_id;
++
++ rbd_assert(!rbd_dev->parent_spec);
++ rbd_dev->parent_spec = parent_spec;
++ parent_spec = NULL; /* rbd_dev now owns this */
+
+ /*
+- * We always update the parent overlap. If it's zero we issue
+- * a warning, as we will proceed as if there was no parent.
++ * Record the parent overlap. If it's zero, issue a warning as
++ * we will proceed as if there is no parent.
+ */
+- if (!pii.overlap) {
+- if (parent_spec) {
+- /* refresh, careful to warn just once */
+- if (rbd_dev->parent_overlap)
+- rbd_warn(rbd_dev,
+- "clone now standalone (overlap became 0)");
+- } else {
+- /* initial probe */
+- rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
+- }
+- }
++ if (!pii.overlap)
++ rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
+ rbd_dev->parent_overlap = pii.overlap;
+
+ out:
+ ret = 0;
+ out_err:
+- kfree(pii.pool_ns);
+- kfree(pii.image_id);
++ rbd_parent_info_cleanup(&pii);
+ rbd_spec_put(parent_spec);
+ return ret;
+ }
+@@ -6978,7 +6955,7 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
+ }
+
+ if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
+- ret = rbd_dev_v2_parent_info(rbd_dev);
++ ret = rbd_dev_setup_parent(rbd_dev);
+ if (ret)
+ goto err_out_probe;
+ }
+@@ -7027,9 +7004,47 @@ static void rbd_dev_update_header(struct rbd_device *rbd_dev,
+ }
+ }
+
++static void rbd_dev_update_parent(struct rbd_device *rbd_dev,
++ struct parent_image_info *pii)
++{
++ if (pii->pool_id == CEPH_NOPOOL || !pii->has_overlap) {
++ /*
++ * Either the parent never existed, or we have
++ * record of it but the image got flattened so it no
++ * longer has a parent. When the parent of a
++ * layered image disappears we immediately set the
++ * overlap to 0. The effect of this is that all new
++ * requests will be treated as if the image had no
++ * parent.
++ *
++ * If !pii.has_overlap, the parent image spec is not
++ * applicable. It's there to avoid duplication in each
++ * snapshot record.
++ */
++ if (rbd_dev->parent_overlap) {
++ rbd_dev->parent_overlap = 0;
++ rbd_dev_parent_put(rbd_dev);
++ pr_info("%s: clone has been flattened\n",
++ rbd_dev->disk->disk_name);
++ }
++ } else {
++ rbd_assert(rbd_dev->parent_spec);
++
++ /*
++ * Update the parent overlap. If it became zero, issue
++ * a warning as we will proceed as if there is no parent.
++ */
++ if (!pii->overlap && rbd_dev->parent_overlap)
++ rbd_warn(rbd_dev,
++ "clone has become standalone (overlap 0)");
++ rbd_dev->parent_overlap = pii->overlap;
++ }
++}
++
+ static int rbd_dev_refresh(struct rbd_device *rbd_dev)
+ {
+ struct rbd_image_header header = { 0 };
++ struct parent_image_info pii = { 0 };
+ u64 mapping_size;
+ int ret;
+
+@@ -7045,12 +7060,14 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev)
+ * mapped image getting flattened.
+ */
+ if (rbd_dev->parent) {
+- ret = rbd_dev_v2_parent_info(rbd_dev);
++ ret = rbd_dev_v2_parent_info(rbd_dev, &pii);
+ if (ret)
+ goto out;
+ }
+
+ rbd_dev_update_header(rbd_dev, &header);
++ if (rbd_dev->parent)
++ rbd_dev_update_parent(rbd_dev, &pii);
+
+ rbd_assert(!rbd_is_snap(rbd_dev));
+ rbd_dev->mapping.size = rbd_dev->header.image_size;
+@@ -7060,6 +7077,7 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev)
+ if (!ret && mapping_size != rbd_dev->mapping.size)
+ rbd_dev_update_size(rbd_dev);
+
++ rbd_parent_info_cleanup(&pii);
+ rbd_image_header_cleanup(&header);
+ return ret;
+ }
+--
+2.40.1
+
--- /dev/null
+From 5b47cfb7192acc6fc423bd841721d247b69b1a97 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 5 Oct 2023 11:59:32 +0200
+Subject: rbd: move rbd_dev_refresh() definition
+
+From: Ilya Dryomov <idryomov@gmail.com>
+
+commit 0b035401c57021fc6c300272cbb1c5a889d4fe45 upstream.
+
+Move rbd_dev_refresh() definition further down to avoid having to
+move struct parent_image_info definition in the next commit. This
+spares some forward declarations too.
+
+Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
+Reviewed-by: Dongsheng Yang <dongsheng.yang@easystack.cn>
+[idryomov@gmail.com: backport to 5.10-6.1: context]
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/block/rbd.c | 68 ++++++++++++++++++++++-----------------------
+ 1 file changed, 33 insertions(+), 35 deletions(-)
+
+diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
+index 74ef3da545361..762795430b4d8 100644
+--- a/drivers/block/rbd.c
++++ b/drivers/block/rbd.c
+@@ -633,8 +633,6 @@ static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);
+
+ static int rbd_dev_refresh(struct rbd_device *rbd_dev);
+ static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
+-static int rbd_dev_header_info(struct rbd_device *rbd_dev);
+-static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev);
+ static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
+ u64 snap_id);
+ static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
+@@ -4932,39 +4930,6 @@ static void rbd_dev_update_size(struct rbd_device *rbd_dev)
+ }
+ }
+
+-static int rbd_dev_refresh(struct rbd_device *rbd_dev)
+-{
+- u64 mapping_size;
+- int ret;
+-
+- down_write(&rbd_dev->header_rwsem);
+- mapping_size = rbd_dev->mapping.size;
+-
+- ret = rbd_dev_header_info(rbd_dev);
+- if (ret)
+- goto out;
+-
+- /*
+- * If there is a parent, see if it has disappeared due to the
+- * mapped image getting flattened.
+- */
+- if (rbd_dev->parent) {
+- ret = rbd_dev_v2_parent_info(rbd_dev);
+- if (ret)
+- goto out;
+- }
+-
+- rbd_assert(!rbd_is_snap(rbd_dev));
+- rbd_dev->mapping.size = rbd_dev->header.image_size;
+-
+-out:
+- up_write(&rbd_dev->header_rwsem);
+- if (!ret && mapping_size != rbd_dev->mapping.size)
+- rbd_dev_update_size(rbd_dev);
+-
+- return ret;
+-}
+-
+ static const struct blk_mq_ops rbd_mq_ops = {
+ .queue_rq = rbd_queue_rq,
+ };
+@@ -7044,6 +7009,39 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
+ return ret;
+ }
+
++static int rbd_dev_refresh(struct rbd_device *rbd_dev)
++{
++ u64 mapping_size;
++ int ret;
++
++ down_write(&rbd_dev->header_rwsem);
++ mapping_size = rbd_dev->mapping.size;
++
++ ret = rbd_dev_header_info(rbd_dev);
++ if (ret)
++ goto out;
++
++ /*
++ * If there is a parent, see if it has disappeared due to the
++ * mapped image getting flattened.
++ */
++ if (rbd_dev->parent) {
++ ret = rbd_dev_v2_parent_info(rbd_dev);
++ if (ret)
++ goto out;
++ }
++
++ rbd_assert(!rbd_is_snap(rbd_dev));
++ rbd_dev->mapping.size = rbd_dev->header.image_size;
++
++out:
++ up_write(&rbd_dev->header_rwsem);
++ if (!ret && mapping_size != rbd_dev->mapping.size)
++ rbd_dev_update_size(rbd_dev);
++
++ return ret;
++}
++
+ static ssize_t do_rbd_add(struct bus_type *bus,
+ const char *buf,
+ size_t count)
+--
+2.40.1
+
--- /dev/null
+From fc5da2f9eec35bc2941e978cf2bb860fbc24f56b Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 5 Oct 2023 11:59:35 +0200
+Subject: rbd: take header_rwsem in rbd_dev_refresh() only when updating
+
+From: Ilya Dryomov <idryomov@gmail.com>
+
+commit 0b207d02bd9ab8dcc31b262ca9f60dbc1822500d upstream.
+
+rbd_dev_refresh() has been holding header_rwsem across header and
+parent info read-in unnecessarily for ages. With commit 870611e4877e
+("rbd: get snapshot context after exclusive lock is ensured to be
+held"), the potential for deadlocks became much more real owing to
+a) header_rwsem now nesting inside lock_rwsem and b) rw_semaphores
+not allowing new readers after a writer is registered.
+
+For example, assuming that I/O request 1, I/O request 2 and header
+read-in request all target the same OSD:
+
+1. I/O request 1 comes in and gets submitted
+2. watch error occurs
+3. rbd_watch_errcb() takes lock_rwsem for write, clears owner_cid and
+ releases lock_rwsem
+4. after reestablishing the watch, rbd_reregister_watch() calls
+ rbd_dev_refresh() which takes header_rwsem for write and submits
+ a header read-in request
+5. I/O request 2 comes in: after taking lock_rwsem for read in
+ __rbd_img_handle_request(), it blocks trying to take header_rwsem
+ for read in rbd_img_object_requests()
+6. another watch error occurs
+7. rbd_watch_errcb() blocks trying to take lock_rwsem for write
+8. I/O request 1 completion is received by the messenger but can't be
+ processed because lock_rwsem won't be granted anymore
+9. header read-in request completion can't be received, let alone
+ processed, because the messenger is stranded
+
+Change rbd_dev_refresh() to take header_rwsem only for actually
+updating rbd_dev->header. Header and parent info read-in don't need
+any locking.
+
+Cc: stable@vger.kernel.org # 0b035401c570: rbd: move rbd_dev_refresh() definition
+Cc: stable@vger.kernel.org # 510a7330c82a: rbd: decouple header read-in from updating rbd_dev->header
+Cc: stable@vger.kernel.org # c10311776f0a: rbd: decouple parent info read-in from updating rbd_dev
+Cc: stable@vger.kernel.org
+Fixes: 870611e4877e ("rbd: get snapshot context after exclusive lock is ensured to be held")
+Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
+Reviewed-by: Dongsheng Yang <dongsheng.yang@easystack.cn>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/block/rbd.c | 22 +++++++++++-----------
+ 1 file changed, 11 insertions(+), 11 deletions(-)
+
+diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
+index 38c92b1b03466..afc92869cba42 100644
+--- a/drivers/block/rbd.c
++++ b/drivers/block/rbd.c
+@@ -6987,7 +6987,14 @@ static void rbd_dev_update_header(struct rbd_device *rbd_dev,
+ rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
+ rbd_assert(rbd_dev->header.object_prefix); /* !first_time */
+
+- rbd_dev->header.image_size = header->image_size;
++ if (rbd_dev->header.image_size != header->image_size) {
++ rbd_dev->header.image_size = header->image_size;
++
++ if (!rbd_is_snap(rbd_dev)) {
++ rbd_dev->mapping.size = header->image_size;
++ rbd_dev_update_size(rbd_dev);
++ }
++ }
+
+ ceph_put_snap_context(rbd_dev->header.snapc);
+ rbd_dev->header.snapc = header->snapc;
+@@ -7045,11 +7052,9 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev)
+ {
+ struct rbd_image_header header = { 0 };
+ struct parent_image_info pii = { 0 };
+- u64 mapping_size;
+ int ret;
+
+- down_write(&rbd_dev->header_rwsem);
+- mapping_size = rbd_dev->mapping.size;
++ dout("%s rbd_dev %p\n", __func__, rbd_dev);
+
+ ret = rbd_dev_header_info(rbd_dev, &header, false);
+ if (ret)
+@@ -7065,18 +7070,13 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev)
+ goto out;
+ }
+
++ down_write(&rbd_dev->header_rwsem);
+ rbd_dev_update_header(rbd_dev, &header);
+ if (rbd_dev->parent)
+ rbd_dev_update_parent(rbd_dev, &pii);
+-
+- rbd_assert(!rbd_is_snap(rbd_dev));
+- rbd_dev->mapping.size = rbd_dev->header.image_size;
+-
+-out:
+ up_write(&rbd_dev->header_rwsem);
+- if (!ret && mapping_size != rbd_dev->mapping.size)
+- rbd_dev_update_size(rbd_dev);
+
++out:
+ rbd_parent_info_cleanup(&pii);
+ rbd_image_header_cleanup(&header);
+ return ret;
+--
+2.40.1
+
btrfs-use-struct-fscrypt_str-instead-of-struct-qstr.patch
revert-nfsv4-retry-lock-on-old_stateid-during-delega.patch
arm64-avoid-repeated-aa64mmfr1_el1-register-read-on-.patch
+iommu-arm-smmu-v3-set-ttl-invalidation-hint-better.patch
+iommu-arm-smmu-v3-avoid-constructing-invalid-range-c.patch
+rbd-move-rbd_dev_refresh-definition.patch
+rbd-decouple-header-read-in-from-updating-rbd_dev-he.patch
+rbd-decouple-parent-info-read-in-from-updating-rbd_d.patch
+rbd-take-header_rwsem-in-rbd_dev_refresh-only-when-u.patch