From: Sasha Levin Date: Fri, 6 Oct 2023 14:12:48 +0000 (-0400) Subject: Fixes for 5.15 X-Git-Tag: v4.14.327~85 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=ce49096ad92986e2d1dc9ca5c9ab58da7cc62479;p=thirdparty%2Fkernel%2Fstable-queue.git Fixes for 5.15 Signed-off-by: Sasha Levin --- diff --git a/queue-5.15/iommu-arm-smmu-v3-avoid-constructing-invalid-range-c.patch b/queue-5.15/iommu-arm-smmu-v3-avoid-constructing-invalid-range-c.patch new file mode 100644 index 00000000000..90019021722 --- /dev/null +++ b/queue-5.15/iommu-arm-smmu-v3-avoid-constructing-invalid-range-c.patch @@ -0,0 +1,65 @@ +From 7906243c9ecda222f63d28cb501de583c1a44474 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 11 Sep 2023 12:57:04 +0100 +Subject: iommu/arm-smmu-v3: Avoid constructing invalid range commands + +From: Robin Murphy + +[ Upstream commit eb6c97647be227822c7ce23655482b05e348fba5 ] + +Although io-pgtable's non-leaf invalidations are always for full tables, +I missed that SVA also uses non-leaf invalidations, while being at the +mercy of whatever range the MMU notifier throws at it. This means it +definitely wants the previous TTL fix as well, since it also doesn't +know exactly which leaf level(s) may need invalidating, but it can also +give us less-aligned ranges wherein certain corners may lead to building +an invalid command where TTL, Num and Scale are all 0. It should be fine +to handle this by over-invalidating an extra page, since falling back to +a non-range command opens up a whole can of errata-flavoured worms. 
+ +Fixes: 6833b8f2e199 ("iommu/arm-smmu-v3: Set TTL invalidation hint better") +Reported-by: Rui Zhu +Signed-off-by: Robin Murphy +Link: https://lore.kernel.org/r/b99cfe71af2bd93a8a2930f20967fb2a4f7748dd.1694432734.git.robin.murphy@arm.com +Signed-off-by: Will Deacon +Signed-off-by: Sasha Levin +--- + drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 15 ++++++++++----- + 1 file changed, 10 insertions(+), 5 deletions(-) + +diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +index 67845f8e1df9f..761cb657f2561 100644 +--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c ++++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +@@ -1881,18 +1881,23 @@ static void __arm_smmu_tlb_inv_range(struct arm_smmu_cmdq_ent *cmd, + /* Get the leaf page size */ + tg = __ffs(smmu_domain->domain.pgsize_bitmap); + ++ num_pages = size >> tg; ++ + /* Convert page size of 12,14,16 (log2) to 1,2,3 */ + cmd->tlbi.tg = (tg - 10) / 2; + + /* +- * Determine what level the granule is at. For non-leaf, io-pgtable +- * assumes .tlb_flush_walk can invalidate multiple levels at once, +- * so ignore the nominal last-level granule and leave TTL=0. ++ * Determine what level the granule is at. For non-leaf, both ++ * io-pgtable and SVA pass a nominal last-level granule because ++ * they don't know what level(s) actually apply, so ignore that ++ * and leave TTL=0. However for various errata reasons we still ++ * want to use a range command, so avoid the SVA corner case ++ * where both scale and num could be 0 as well. 
+ */ + if (cmd->tlbi.leaf) + cmd->tlbi.ttl = 4 - ((ilog2(granule) - 3) / (tg - 3)); +- +- num_pages = size >> tg; ++ else if ((num_pages & CMDQ_TLBI_RANGE_NUM_MAX) == 1) ++ num_pages++; + } + + cmds.num = 0; +-- +2.40.1 + diff --git a/queue-5.15/iommu-arm-smmu-v3-set-ttl-invalidation-hint-better.patch b/queue-5.15/iommu-arm-smmu-v3-set-ttl-invalidation-hint-better.patch new file mode 100644 index 00000000000..a3b6ebfe54f --- /dev/null +++ b/queue-5.15/iommu-arm-smmu-v3-set-ttl-invalidation-hint-better.patch @@ -0,0 +1,51 @@ +From 0b631a6adf27bbb29731ad408df5b79ee534d030 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 1 Jun 2023 17:43:33 +0100 +Subject: iommu/arm-smmu-v3: Set TTL invalidation hint better + +From: Robin Murphy + +[ Upstream commit 6833b8f2e19945a41e4d5efd8c6d9f4cae9a5b7d ] + +When io-pgtable unmaps a whole table, rather than waste time walking it +to find the leaf entries to invalidate exactly, it simply expects +.tlb_flush_walk with nominal last-level granularity to invalidate any +leaf entries at higher intermediate levels as well. This works fine with +page-based invalidation, but with range commands we need to be careful +with the TTL hint - unconditionally setting it based on the given level +3 granule means that an invalidation for a level 1 table would strictly +not be required to affect level 2 block entries. It's easy to comply +with the expected behaviour by simply not setting the TTL hint for +non-leaf invalidations, so let's do that. 
+ +Signed-off-by: Robin Murphy +Link: https://lore.kernel.org/r/b409d9a17c52dc0db51faee91d92737bb7975f5b.1685637456.git.robin.murphy@arm.com +Signed-off-by: Will Deacon +Signed-off-by: Sasha Levin +--- + drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 9 +++++++-- + 1 file changed, 7 insertions(+), 2 deletions(-) + +diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +index 340ef116d574a..67845f8e1df9f 100644 +--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c ++++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +@@ -1884,8 +1884,13 @@ static void __arm_smmu_tlb_inv_range(struct arm_smmu_cmdq_ent *cmd, + /* Convert page size of 12,14,16 (log2) to 1,2,3 */ + cmd->tlbi.tg = (tg - 10) / 2; + +- /* Determine what level the granule is at */ +- cmd->tlbi.ttl = 4 - ((ilog2(granule) - 3) / (tg - 3)); ++ /* ++ * Determine what level the granule is at. For non-leaf, io-pgtable ++ * assumes .tlb_flush_walk can invalidate multiple levels at once, ++ * so ignore the nominal last-level granule and leave TTL=0. ++ */ ++ if (cmd->tlbi.leaf) ++ cmd->tlbi.ttl = 4 - ((ilog2(granule) - 3) / (tg - 3)); + + num_pages = size >> tg; + } +-- +2.40.1 + diff --git a/queue-5.15/rbd-decouple-header-read-in-from-updating-rbd_dev-he.patch b/queue-5.15/rbd-decouple-header-read-in-from-updating-rbd_dev-he.patch new file mode 100644 index 00000000000..2f6f57abe2f --- /dev/null +++ b/queue-5.15/rbd-decouple-header-read-in-from-updating-rbd_dev-he.patch @@ -0,0 +1,452 @@ +From 8e144494334afab81aa5aa1d8c15332bb4780421 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 5 Oct 2023 11:59:33 +0200 +Subject: rbd: decouple header read-in from updating rbd_dev->header + +From: Ilya Dryomov + +commit 510a7330c82a7754d5df0117a8589e8a539067c7 upstream. + +Make rbd_dev_header_info() populate a passed struct rbd_image_header +instead of rbd_dev->header and introduce rbd_dev_update_header() for +updating mutable fields in rbd_dev->header upon refresh. 
The initial +read-in of both mutable and immutable fields in rbd_dev_image_probe() +passes in rbd_dev->header so no update step is required there. + +rbd_init_layout() is now called directly from rbd_dev_image_probe() +instead of individually in format 1 and format 2 implementations. + +Signed-off-by: Ilya Dryomov +Reviewed-by: Dongsheng Yang +Signed-off-by: Sasha Levin +--- + drivers/block/rbd.c | 206 ++++++++++++++++++++++++-------------------- + 1 file changed, 114 insertions(+), 92 deletions(-) + +diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c +index 772e28d6c1384..37bbfcca8b624 100644 +--- a/drivers/block/rbd.c ++++ b/drivers/block/rbd.c +@@ -632,7 +632,8 @@ void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...) + static void rbd_dev_remove_parent(struct rbd_device *rbd_dev); + + static int rbd_dev_refresh(struct rbd_device *rbd_dev); +-static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev); ++static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev, ++ struct rbd_image_header *header); + static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, + u64 snap_id); + static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, +@@ -994,15 +995,24 @@ static void rbd_init_layout(struct rbd_device *rbd_dev) + RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL); + } + ++static void rbd_image_header_cleanup(struct rbd_image_header *header) ++{ ++ kfree(header->object_prefix); ++ ceph_put_snap_context(header->snapc); ++ kfree(header->snap_sizes); ++ kfree(header->snap_names); ++ ++ memset(header, 0, sizeof(*header)); ++} ++ + /* + * Fill an rbd image header with information from the given format 1 + * on-disk header. 
+ */ +-static int rbd_header_from_disk(struct rbd_device *rbd_dev, +- struct rbd_image_header_ondisk *ondisk) ++static int rbd_header_from_disk(struct rbd_image_header *header, ++ struct rbd_image_header_ondisk *ondisk, ++ bool first_time) + { +- struct rbd_image_header *header = &rbd_dev->header; +- bool first_time = header->object_prefix == NULL; + struct ceph_snap_context *snapc; + char *object_prefix = NULL; + char *snap_names = NULL; +@@ -1069,11 +1079,6 @@ static int rbd_header_from_disk(struct rbd_device *rbd_dev, + if (first_time) { + header->object_prefix = object_prefix; + header->obj_order = ondisk->options.order; +- rbd_init_layout(rbd_dev); +- } else { +- ceph_put_snap_context(header->snapc); +- kfree(header->snap_names); +- kfree(header->snap_sizes); + } + + /* The remaining fields always get updated (when we refresh) */ +@@ -4859,7 +4864,9 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev, + * return, the rbd_dev->header field will contain up-to-date + * information about the image. 
+ */ +-static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev) ++static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev, ++ struct rbd_image_header *header, ++ bool first_time) + { + struct rbd_image_header_ondisk *ondisk = NULL; + u32 snap_count = 0; +@@ -4907,7 +4914,7 @@ static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev) + snap_count = le32_to_cpu(ondisk->snap_count); + } while (snap_count != want_count); + +- ret = rbd_header_from_disk(rbd_dev, ondisk); ++ ret = rbd_header_from_disk(header, ondisk, first_time); + out: + kfree(ondisk); + +@@ -5473,17 +5480,12 @@ static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, + return 0; + } + +-static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev) +-{ +- return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP, +- &rbd_dev->header.obj_order, +- &rbd_dev->header.image_size); +-} +- +-static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) ++static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev, ++ char **pobject_prefix) + { + size_t size; + void *reply_buf; ++ char *object_prefix; + int ret; + void *p; + +@@ -5501,16 +5503,16 @@ static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) + goto out; + + p = reply_buf; +- rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p, +- p + ret, NULL, GFP_NOIO); ++ object_prefix = ceph_extract_encoded_string(&p, p + ret, NULL, ++ GFP_NOIO); ++ if (IS_ERR(object_prefix)) { ++ ret = PTR_ERR(object_prefix); ++ goto out; ++ } + ret = 0; + +- if (IS_ERR(rbd_dev->header.object_prefix)) { +- ret = PTR_ERR(rbd_dev->header.object_prefix); +- rbd_dev->header.object_prefix = NULL; +- } else { +- dout(" object_prefix = %s\n", rbd_dev->header.object_prefix); +- } ++ *pobject_prefix = object_prefix; ++ dout(" object_prefix = %s\n", object_prefix); + out: + kfree(reply_buf); + +@@ -5561,13 +5563,6 @@ static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, + return 0; + } + +-static int 
rbd_dev_v2_features(struct rbd_device *rbd_dev) +-{ +- return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP, +- rbd_is_ro(rbd_dev), +- &rbd_dev->header.features); +-} +- + /* + * These are generic image flags, but since they are used only for + * object map, store them in rbd_dev->object_map_flags. +@@ -5842,14 +5837,14 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) + return ret; + } + +-static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev) ++static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev, ++ u64 *stripe_unit, u64 *stripe_count) + { + struct { + __le64 stripe_unit; + __le64 stripe_count; + } __attribute__ ((packed)) striping_info_buf = { 0 }; + size_t size = sizeof (striping_info_buf); +- void *p; + int ret; + + ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, +@@ -5861,27 +5856,33 @@ static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev) + if (ret < size) + return -ERANGE; + +- p = &striping_info_buf; +- rbd_dev->header.stripe_unit = ceph_decode_64(&p); +- rbd_dev->header.stripe_count = ceph_decode_64(&p); ++ *stripe_unit = le64_to_cpu(striping_info_buf.stripe_unit); ++ *stripe_count = le64_to_cpu(striping_info_buf.stripe_count); ++ dout(" stripe_unit = %llu stripe_count = %llu\n", *stripe_unit, ++ *stripe_count); ++ + return 0; + } + +-static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev) ++static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev, s64 *data_pool_id) + { +- __le64 data_pool_id; ++ __le64 data_pool_buf; + int ret; + + ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, + &rbd_dev->header_oloc, "get_data_pool", +- NULL, 0, &data_pool_id, sizeof(data_pool_id)); ++ NULL, 0, &data_pool_buf, ++ sizeof(data_pool_buf)); ++ dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); + if (ret < 0) + return ret; +- if (ret < sizeof(data_pool_id)) ++ if (ret < sizeof(data_pool_buf)) + return -EBADMSG; + +- rbd_dev->header.data_pool_id = le64_to_cpu(data_pool_id); +- 
WARN_ON(rbd_dev->header.data_pool_id == CEPH_NOPOOL); ++ *data_pool_id = le64_to_cpu(data_pool_buf); ++ dout(" data_pool_id = %lld\n", *data_pool_id); ++ WARN_ON(*data_pool_id == CEPH_NOPOOL); ++ + return 0; + } + +@@ -6073,7 +6074,8 @@ static int rbd_spec_fill_names(struct rbd_device *rbd_dev) + return ret; + } + +-static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev) ++static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, ++ struct ceph_snap_context **psnapc) + { + size_t size; + int ret; +@@ -6134,9 +6136,7 @@ static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev) + for (i = 0; i < snap_count; i++) + snapc->snaps[i] = ceph_decode_64(&p); + +- ceph_put_snap_context(rbd_dev->header.snapc); +- rbd_dev->header.snapc = snapc; +- ++ *psnapc = snapc; + dout(" snap context seq = %llu, snap_count = %u\n", + (unsigned long long)seq, (unsigned int)snap_count); + out: +@@ -6185,38 +6185,42 @@ static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, + return snap_name; + } + +-static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev) ++static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev, ++ struct rbd_image_header *header, ++ bool first_time) + { +- bool first_time = rbd_dev->header.object_prefix == NULL; + int ret; + +- ret = rbd_dev_v2_image_size(rbd_dev); ++ ret = _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP, ++ first_time ? 
&header->obj_order : NULL, ++ &header->image_size); + if (ret) + return ret; + + if (first_time) { +- ret = rbd_dev_v2_header_onetime(rbd_dev); ++ ret = rbd_dev_v2_header_onetime(rbd_dev, header); + if (ret) + return ret; + } + +- ret = rbd_dev_v2_snap_context(rbd_dev); +- if (ret && first_time) { +- kfree(rbd_dev->header.object_prefix); +- rbd_dev->header.object_prefix = NULL; +- } ++ ret = rbd_dev_v2_snap_context(rbd_dev, &header->snapc); ++ if (ret) ++ return ret; + +- return ret; ++ return 0; + } + +-static int rbd_dev_header_info(struct rbd_device *rbd_dev) ++static int rbd_dev_header_info(struct rbd_device *rbd_dev, ++ struct rbd_image_header *header, ++ bool first_time) + { + rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); ++ rbd_assert(!header->object_prefix && !header->snapc); + + if (rbd_dev->image_format == 1) +- return rbd_dev_v1_header_info(rbd_dev); ++ return rbd_dev_v1_header_info(rbd_dev, header, first_time); + +- return rbd_dev_v2_header_info(rbd_dev); ++ return rbd_dev_v2_header_info(rbd_dev, header, first_time); + } + + /* +@@ -6703,60 +6707,49 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev) + */ + static void rbd_dev_unprobe(struct rbd_device *rbd_dev) + { +- struct rbd_image_header *header; +- + rbd_dev_parent_put(rbd_dev); + rbd_object_map_free(rbd_dev); + rbd_dev_mapping_clear(rbd_dev); + + /* Free dynamic fields from the header, then zero it out */ + +- header = &rbd_dev->header; +- ceph_put_snap_context(header->snapc); +- kfree(header->snap_sizes); +- kfree(header->snap_names); +- kfree(header->object_prefix); +- memset(header, 0, sizeof (*header)); ++ rbd_image_header_cleanup(&rbd_dev->header); + } + +-static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev) ++static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev, ++ struct rbd_image_header *header) + { + int ret; + +- ret = rbd_dev_v2_object_prefix(rbd_dev); ++ ret = rbd_dev_v2_object_prefix(rbd_dev, &header->object_prefix); + if (ret) +- goto 
out_err; ++ return ret; + + /* + * Get the and check features for the image. Currently the + * features are assumed to never change. + */ +- ret = rbd_dev_v2_features(rbd_dev); ++ ret = _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP, ++ rbd_is_ro(rbd_dev), &header->features); + if (ret) +- goto out_err; ++ return ret; + + /* If the image supports fancy striping, get its parameters */ + +- if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) { +- ret = rbd_dev_v2_striping_info(rbd_dev); +- if (ret < 0) +- goto out_err; ++ if (header->features & RBD_FEATURE_STRIPINGV2) { ++ ret = rbd_dev_v2_striping_info(rbd_dev, &header->stripe_unit, ++ &header->stripe_count); ++ if (ret) ++ return ret; + } + +- if (rbd_dev->header.features & RBD_FEATURE_DATA_POOL) { +- ret = rbd_dev_v2_data_pool(rbd_dev); ++ if (header->features & RBD_FEATURE_DATA_POOL) { ++ ret = rbd_dev_v2_data_pool(rbd_dev, &header->data_pool_id); + if (ret) +- goto out_err; ++ return ret; + } + +- rbd_init_layout(rbd_dev); + return 0; +- +-out_err: +- rbd_dev->header.features = 0; +- kfree(rbd_dev->header.object_prefix); +- rbd_dev->header.object_prefix = NULL; +- return ret; + } + + /* +@@ -6951,13 +6944,15 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth) + if (!depth) + down_write(&rbd_dev->header_rwsem); + +- ret = rbd_dev_header_info(rbd_dev); ++ ret = rbd_dev_header_info(rbd_dev, &rbd_dev->header, true); + if (ret) { + if (ret == -ENOENT && !need_watch) + rbd_print_dne(rbd_dev, false); + goto err_out_probe; + } + ++ rbd_init_layout(rbd_dev); ++ + /* + * If this image is the one being mapped, we have pool name and + * id, image name and id, and snap name - need to fill snap id. 
+@@ -7012,15 +7007,39 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth) + return ret; + } + ++static void rbd_dev_update_header(struct rbd_device *rbd_dev, ++ struct rbd_image_header *header) ++{ ++ rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); ++ rbd_assert(rbd_dev->header.object_prefix); /* !first_time */ ++ ++ rbd_dev->header.image_size = header->image_size; ++ ++ ceph_put_snap_context(rbd_dev->header.snapc); ++ rbd_dev->header.snapc = header->snapc; ++ header->snapc = NULL; ++ ++ if (rbd_dev->image_format == 1) { ++ kfree(rbd_dev->header.snap_names); ++ rbd_dev->header.snap_names = header->snap_names; ++ header->snap_names = NULL; ++ ++ kfree(rbd_dev->header.snap_sizes); ++ rbd_dev->header.snap_sizes = header->snap_sizes; ++ header->snap_sizes = NULL; ++ } ++} ++ + static int rbd_dev_refresh(struct rbd_device *rbd_dev) + { ++ struct rbd_image_header header = { 0 }; + u64 mapping_size; + int ret; + + down_write(&rbd_dev->header_rwsem); + mapping_size = rbd_dev->mapping.size; + +- ret = rbd_dev_header_info(rbd_dev); ++ ret = rbd_dev_header_info(rbd_dev, &header, false); + if (ret) + goto out; + +@@ -7034,6 +7053,8 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev) + goto out; + } + ++ rbd_dev_update_header(rbd_dev, &header); ++ + rbd_assert(!rbd_is_snap(rbd_dev)); + rbd_dev->mapping.size = rbd_dev->header.image_size; + +@@ -7042,6 +7063,7 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev) + if (!ret && mapping_size != rbd_dev->mapping.size) + rbd_dev_update_size(rbd_dev); + ++ rbd_image_header_cleanup(&header); + return ret; + } + +-- +2.40.1 + diff --git a/queue-5.15/rbd-decouple-parent-info-read-in-from-updating-rbd_d.patch b/queue-5.15/rbd-decouple-parent-info-read-in-from-updating-rbd_d.patch new file mode 100644 index 00000000000..51f6faca947 --- /dev/null +++ b/queue-5.15/rbd-decouple-parent-info-read-in-from-updating-rbd_d.patch @@ -0,0 +1,273 @@ +From ab225c3c2c8825e3ea1131a5e47234b6c6fe3afa Mon Sep 17 
00:00:00 2001 +From: Sasha Levin +Date: Thu, 5 Oct 2023 11:59:34 +0200 +Subject: rbd: decouple parent info read-in from updating rbd_dev + +From: Ilya Dryomov + +commit c10311776f0a8ddea2276df96e255625b07045a8 upstream. + +Unlike header read-in, parent info read-in is already decoupled in +get_parent_info(), but it's buried in rbd_dev_v2_parent_info() along +with the processing logic. + +Separate the initial read-in and update read-in logic into +rbd_dev_setup_parent() and rbd_dev_update_parent() respectively and +have rbd_dev_v2_parent_info() just populate struct parent_image_info +(i.e. what get_parent_info() did). Some existing QoI issues, like +flatten of a standalone clone being disregarded on refresh, remain. + +Signed-off-by: Ilya Dryomov +Reviewed-by: Dongsheng Yang +Signed-off-by: Sasha Levin +--- + drivers/block/rbd.c | 142 +++++++++++++++++++++++++------------------- + 1 file changed, 80 insertions(+), 62 deletions(-) + +diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c +index 37bbfcca8b624..7e1266d55457a 100644 +--- a/drivers/block/rbd.c ++++ b/drivers/block/rbd.c +@@ -5599,6 +5599,14 @@ struct parent_image_info { + u64 overlap; + }; + ++static void rbd_parent_info_cleanup(struct parent_image_info *pii) ++{ ++ kfree(pii->pool_ns); ++ kfree(pii->image_id); ++ ++ memset(pii, 0, sizeof(*pii)); ++} ++ + /* + * The caller is responsible for @pii. 
+ */ +@@ -5668,6 +5676,9 @@ static int __get_parent_info(struct rbd_device *rbd_dev, + if (pii->has_overlap) + ceph_decode_64_safe(&p, end, pii->overlap, e_inval); + ++ dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n", ++ __func__, pii->pool_id, pii->pool_ns, pii->image_id, pii->snap_id, ++ pii->has_overlap, pii->overlap); + return 0; + + e_inval: +@@ -5706,14 +5717,17 @@ static int __get_parent_info_legacy(struct rbd_device *rbd_dev, + pii->has_overlap = true; + ceph_decode_64_safe(&p, end, pii->overlap, e_inval); + ++ dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n", ++ __func__, pii->pool_id, pii->pool_ns, pii->image_id, pii->snap_id, ++ pii->has_overlap, pii->overlap); + return 0; + + e_inval: + return -EINVAL; + } + +-static int get_parent_info(struct rbd_device *rbd_dev, +- struct parent_image_info *pii) ++static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev, ++ struct parent_image_info *pii) + { + struct page *req_page, *reply_page; + void *p; +@@ -5741,7 +5755,7 @@ static int get_parent_info(struct rbd_device *rbd_dev, + return ret; + } + +-static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) ++static int rbd_dev_setup_parent(struct rbd_device *rbd_dev) + { + struct rbd_spec *parent_spec; + struct parent_image_info pii = { 0 }; +@@ -5751,37 +5765,12 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) + if (!parent_spec) + return -ENOMEM; + +- ret = get_parent_info(rbd_dev, &pii); ++ ret = rbd_dev_v2_parent_info(rbd_dev, &pii); + if (ret) + goto out_err; + +- dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n", +- __func__, pii.pool_id, pii.pool_ns, pii.image_id, pii.snap_id, +- pii.has_overlap, pii.overlap); +- +- if (pii.pool_id == CEPH_NOPOOL || !pii.has_overlap) { +- /* +- * Either the parent never existed, or we have +- * record of it but the image got flattened so it no +- * longer has a parent. 
When the parent of a +- * layered image disappears we immediately set the +- * overlap to 0. The effect of this is that all new +- * requests will be treated as if the image had no +- * parent. +- * +- * If !pii.has_overlap, the parent image spec is not +- * applicable. It's there to avoid duplication in each +- * snapshot record. +- */ +- if (rbd_dev->parent_overlap) { +- rbd_dev->parent_overlap = 0; +- rbd_dev_parent_put(rbd_dev); +- pr_info("%s: clone image has been flattened\n", +- rbd_dev->disk->disk_name); +- } +- ++ if (pii.pool_id == CEPH_NOPOOL || !pii.has_overlap) + goto out; /* No parent? No problem. */ +- } + + /* The ceph file layout needs to fit pool id in 32 bits */ + +@@ -5793,46 +5782,34 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) + } + + /* +- * The parent won't change (except when the clone is +- * flattened, already handled that). So we only need to +- * record the parent spec we have not already done so. ++ * The parent won't change except when the clone is flattened, ++ * so we only need to record the parent image spec once. + */ +- if (!rbd_dev->parent_spec) { +- parent_spec->pool_id = pii.pool_id; +- if (pii.pool_ns && *pii.pool_ns) { +- parent_spec->pool_ns = pii.pool_ns; +- pii.pool_ns = NULL; +- } +- parent_spec->image_id = pii.image_id; +- pii.image_id = NULL; +- parent_spec->snap_id = pii.snap_id; +- +- rbd_dev->parent_spec = parent_spec; +- parent_spec = NULL; /* rbd_dev now owns this */ ++ parent_spec->pool_id = pii.pool_id; ++ if (pii.pool_ns && *pii.pool_ns) { ++ parent_spec->pool_ns = pii.pool_ns; ++ pii.pool_ns = NULL; + } ++ parent_spec->image_id = pii.image_id; ++ pii.image_id = NULL; ++ parent_spec->snap_id = pii.snap_id; ++ ++ rbd_assert(!rbd_dev->parent_spec); ++ rbd_dev->parent_spec = parent_spec; ++ parent_spec = NULL; /* rbd_dev now owns this */ + + /* +- * We always update the parent overlap. If it's zero we issue +- * a warning, as we will proceed as if there was no parent. 
++ * Record the parent overlap. If it's zero, issue a warning as ++ * we will proceed as if there is no parent. + */ +- if (!pii.overlap) { +- if (parent_spec) { +- /* refresh, careful to warn just once */ +- if (rbd_dev->parent_overlap) +- rbd_warn(rbd_dev, +- "clone now standalone (overlap became 0)"); +- } else { +- /* initial probe */ +- rbd_warn(rbd_dev, "clone is standalone (overlap 0)"); +- } +- } ++ if (!pii.overlap) ++ rbd_warn(rbd_dev, "clone is standalone (overlap 0)"); + rbd_dev->parent_overlap = pii.overlap; + + out: + ret = 0; + out_err: +- kfree(pii.pool_ns); +- kfree(pii.image_id); ++ rbd_parent_info_cleanup(&pii); + rbd_spec_put(parent_spec); + return ret; + } +@@ -6981,7 +6958,7 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth) + } + + if (rbd_dev->header.features & RBD_FEATURE_LAYERING) { +- ret = rbd_dev_v2_parent_info(rbd_dev); ++ ret = rbd_dev_setup_parent(rbd_dev); + if (ret) + goto err_out_probe; + } +@@ -7030,9 +7007,47 @@ static void rbd_dev_update_header(struct rbd_device *rbd_dev, + } + } + ++static void rbd_dev_update_parent(struct rbd_device *rbd_dev, ++ struct parent_image_info *pii) ++{ ++ if (pii->pool_id == CEPH_NOPOOL || !pii->has_overlap) { ++ /* ++ * Either the parent never existed, or we have ++ * record of it but the image got flattened so it no ++ * longer has a parent. When the parent of a ++ * layered image disappears we immediately set the ++ * overlap to 0. The effect of this is that all new ++ * requests will be treated as if the image had no ++ * parent. ++ * ++ * If !pii.has_overlap, the parent image spec is not ++ * applicable. It's there to avoid duplication in each ++ * snapshot record. ++ */ ++ if (rbd_dev->parent_overlap) { ++ rbd_dev->parent_overlap = 0; ++ rbd_dev_parent_put(rbd_dev); ++ pr_info("%s: clone has been flattened\n", ++ rbd_dev->disk->disk_name); ++ } ++ } else { ++ rbd_assert(rbd_dev->parent_spec); ++ ++ /* ++ * Update the parent overlap. 
If it became zero, issue ++ * a warning as we will proceed as if there is no parent. ++ */ ++ if (!pii->overlap && rbd_dev->parent_overlap) ++ rbd_warn(rbd_dev, ++ "clone has become standalone (overlap 0)"); ++ rbd_dev->parent_overlap = pii->overlap; ++ } ++} ++ + static int rbd_dev_refresh(struct rbd_device *rbd_dev) + { + struct rbd_image_header header = { 0 }; ++ struct parent_image_info pii = { 0 }; + u64 mapping_size; + int ret; + +@@ -7048,12 +7063,14 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev) + * mapped image getting flattened. + */ + if (rbd_dev->parent) { +- ret = rbd_dev_v2_parent_info(rbd_dev); ++ ret = rbd_dev_v2_parent_info(rbd_dev, &pii); + if (ret) + goto out; + } + + rbd_dev_update_header(rbd_dev, &header); ++ if (rbd_dev->parent) ++ rbd_dev_update_parent(rbd_dev, &pii); + + rbd_assert(!rbd_is_snap(rbd_dev)); + rbd_dev->mapping.size = rbd_dev->header.image_size; +@@ -7063,6 +7080,7 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev) + if (!ret && mapping_size != rbd_dev->mapping.size) + rbd_dev_update_size(rbd_dev); + ++ rbd_parent_info_cleanup(&pii); + rbd_image_header_cleanup(&header); + return ret; + } +-- +2.40.1 + diff --git a/queue-5.15/rbd-move-rbd_dev_refresh-definition.patch b/queue-5.15/rbd-move-rbd_dev_refresh-definition.patch new file mode 100644 index 00000000000..706e3674e76 --- /dev/null +++ b/queue-5.15/rbd-move-rbd_dev_refresh-definition.patch @@ -0,0 +1,117 @@ +From ab7e11432cafc821453800e1f03faca5f1ed1119 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 5 Oct 2023 11:59:32 +0200 +Subject: rbd: move rbd_dev_refresh() definition + +From: Ilya Dryomov + +commit 0b035401c57021fc6c300272cbb1c5a889d4fe45 upstream. + +Move rbd_dev_refresh() definition further down to avoid having to +move struct parent_image_info definition in the next commit. This +spares some forward declarations too. 
+ +Signed-off-by: Ilya Dryomov +Reviewed-by: Dongsheng Yang +[idryomov@gmail.com: backport to 5.10-6.1: context] +Signed-off-by: Sasha Levin +--- + drivers/block/rbd.c | 68 ++++++++++++++++++++++----------------------- + 1 file changed, 33 insertions(+), 35 deletions(-) + +diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c +index fe8bdbf4616bc..772e28d6c1384 100644 +--- a/drivers/block/rbd.c ++++ b/drivers/block/rbd.c +@@ -633,8 +633,6 @@ static void rbd_dev_remove_parent(struct rbd_device *rbd_dev); + + static int rbd_dev_refresh(struct rbd_device *rbd_dev); + static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev); +-static int rbd_dev_header_info(struct rbd_device *rbd_dev); +-static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev); + static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, + u64 snap_id); + static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, +@@ -4933,39 +4931,6 @@ static void rbd_dev_update_size(struct rbd_device *rbd_dev) + } + } + +-static int rbd_dev_refresh(struct rbd_device *rbd_dev) +-{ +- u64 mapping_size; +- int ret; +- +- down_write(&rbd_dev->header_rwsem); +- mapping_size = rbd_dev->mapping.size; +- +- ret = rbd_dev_header_info(rbd_dev); +- if (ret) +- goto out; +- +- /* +- * If there is a parent, see if it has disappeared due to the +- * mapped image getting flattened. 
+- */ +- if (rbd_dev->parent) { +- ret = rbd_dev_v2_parent_info(rbd_dev); +- if (ret) +- goto out; +- } +- +- rbd_assert(!rbd_is_snap(rbd_dev)); +- rbd_dev->mapping.size = rbd_dev->header.image_size; +- +-out: +- up_write(&rbd_dev->header_rwsem); +- if (!ret && mapping_size != rbd_dev->mapping.size) +- rbd_dev_update_size(rbd_dev); +- +- return ret; +-} +- + static const struct blk_mq_ops rbd_mq_ops = { + .queue_rq = rbd_queue_rq, + }; +@@ -7047,6 +7012,39 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth) + return ret; + } + ++static int rbd_dev_refresh(struct rbd_device *rbd_dev) ++{ ++ u64 mapping_size; ++ int ret; ++ ++ down_write(&rbd_dev->header_rwsem); ++ mapping_size = rbd_dev->mapping.size; ++ ++ ret = rbd_dev_header_info(rbd_dev); ++ if (ret) ++ goto out; ++ ++ /* ++ * If there is a parent, see if it has disappeared due to the ++ * mapped image getting flattened. ++ */ ++ if (rbd_dev->parent) { ++ ret = rbd_dev_v2_parent_info(rbd_dev); ++ if (ret) ++ goto out; ++ } ++ ++ rbd_assert(!rbd_is_snap(rbd_dev)); ++ rbd_dev->mapping.size = rbd_dev->header.image_size; ++ ++out: ++ up_write(&rbd_dev->header_rwsem); ++ if (!ret && mapping_size != rbd_dev->mapping.size) ++ rbd_dev_update_size(rbd_dev); ++ ++ return ret; ++} ++ + static ssize_t do_rbd_add(struct bus_type *bus, + const char *buf, + size_t count) +-- +2.40.1 + diff --git a/queue-5.15/rbd-take-header_rwsem-in-rbd_dev_refresh-only-when-u.patch b/queue-5.15/rbd-take-header_rwsem-in-rbd_dev_refresh-only-when-u.patch new file mode 100644 index 00000000000..9c27d4a2e3c --- /dev/null +++ b/queue-5.15/rbd-take-header_rwsem-in-rbd_dev_refresh-only-when-u.patch @@ -0,0 +1,109 @@ +From c12ea6fa0e8a7a339ca432393de5f748e2acb487 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 5 Oct 2023 11:59:35 +0200 +Subject: rbd: take header_rwsem in rbd_dev_refresh() only when updating + +From: Ilya Dryomov + +commit 0b207d02bd9ab8dcc31b262ca9f60dbc1822500d upstream. 
+ +rbd_dev_refresh() has been holding header_rwsem across header and +parent info read-in unnecessarily for ages. With commit 870611e4877e +("rbd: get snapshot context after exclusive lock is ensured to be +held"), the potential for deadlocks became much more real owing to +a) header_rwsem now nesting inside lock_rwsem and b) rw_semaphores +not allowing new readers after a writer is registered. + +For example, assuming that I/O request 1, I/O request 2 and header +read-in request all target the same OSD: + +1. I/O request 1 comes in and gets submitted +2. watch error occurs +3. rbd_watch_errcb() takes lock_rwsem for write, clears owner_cid and + releases lock_rwsem +4. after reestablishing the watch, rbd_reregister_watch() calls + rbd_dev_refresh() which takes header_rwsem for write and submits + a header read-in request +5. I/O request 2 comes in: after taking lock_rwsem for read in + __rbd_img_handle_request(), it blocks trying to take header_rwsem + for read in rbd_img_object_requests() +6. another watch error occurs +7. rbd_watch_errcb() blocks trying to take lock_rwsem for write +8. I/O request 1 completion is received by the messenger but can't be + processed because lock_rwsem won't be granted anymore +9. header read-in request completion can't be received, let alone + processed, because the messenger is stranded + +Change rbd_dev_refresh() to take header_rwsem only for actually +updating rbd_dev->header. Header and parent info read-in don't need +any locking. 
+ +Cc: stable@vger.kernel.org # 0b035401c570: rbd: move rbd_dev_refresh() definition +Cc: stable@vger.kernel.org # 510a7330c82a: rbd: decouple header read-in from updating rbd_dev->header +Cc: stable@vger.kernel.org # c10311776f0a: rbd: decouple parent info read-in from updating rbd_dev +Cc: stable@vger.kernel.org +Fixes: 870611e4877e ("rbd: get snapshot context after exclusive lock is ensured to be held") +Signed-off-by: Ilya Dryomov +Reviewed-by: Dongsheng Yang +Signed-off-by: Sasha Levin +--- + drivers/block/rbd.c | 22 +++++++++++----------- + 1 file changed, 11 insertions(+), 11 deletions(-) + +diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c +index 7e1266d55457a..16744a79a1783 100644 +--- a/drivers/block/rbd.c ++++ b/drivers/block/rbd.c +@@ -6990,7 +6990,14 @@ static void rbd_dev_update_header(struct rbd_device *rbd_dev, + rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); + rbd_assert(rbd_dev->header.object_prefix); /* !first_time */ + +- rbd_dev->header.image_size = header->image_size; ++ if (rbd_dev->header.image_size != header->image_size) { ++ rbd_dev->header.image_size = header->image_size; ++ ++ if (!rbd_is_snap(rbd_dev)) { ++ rbd_dev->mapping.size = header->image_size; ++ rbd_dev_update_size(rbd_dev); ++ } ++ } + + ceph_put_snap_context(rbd_dev->header.snapc); + rbd_dev->header.snapc = header->snapc; +@@ -7048,11 +7055,9 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev) + { + struct rbd_image_header header = { 0 }; + struct parent_image_info pii = { 0 }; +- u64 mapping_size; + int ret; + +- down_write(&rbd_dev->header_rwsem); +- mapping_size = rbd_dev->mapping.size; ++ dout("%s rbd_dev %p\n", __func__, rbd_dev); + + ret = rbd_dev_header_info(rbd_dev, &header, false); + if (ret) +@@ -7068,18 +7073,13 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev) + goto out; + } + ++ down_write(&rbd_dev->header_rwsem); + rbd_dev_update_header(rbd_dev, &header); + if (rbd_dev->parent) + rbd_dev_update_parent(rbd_dev, &pii); +- +- 
rbd_assert(!rbd_is_snap(rbd_dev)); +- rbd_dev->mapping.size = rbd_dev->header.image_size; +- +-out: + up_write(&rbd_dev->header_rwsem); +- if (!ret && mapping_size != rbd_dev->mapping.size) +- rbd_dev_update_size(rbd_dev); + ++out: + rbd_parent_info_cleanup(&pii); + rbd_image_header_cleanup(&header); + return ret; +-- +2.40.1 + diff --git a/queue-5.15/series b/queue-5.15/series index 360433e59ab..fc10a11a589 100644 --- a/queue-5.15/series +++ b/queue-5.15/series @@ -8,3 +8,9 @@ nfsv4-fix-a-state-manager-thread-deadlock-regression.patch ring-buffer-remove-obsolete-comment-for-free_buffer_.patch ring-buffer-fix-bytes-info-in-per_cpu-buffer-stats.patch arm64-avoid-repeated-aa64mmfr1_el1-register-read-on-.patch +iommu-arm-smmu-v3-set-ttl-invalidation-hint-better.patch +iommu-arm-smmu-v3-avoid-constructing-invalid-range-c.patch +rbd-move-rbd_dev_refresh-definition.patch +rbd-decouple-header-read-in-from-updating-rbd_dev-he.patch +rbd-decouple-parent-info-read-in-from-updating-rbd_d.patch +rbd-take-header_rwsem-in-rbd_dev_refresh-only-when-u.patch