From: Greg Kroah-Hartman Date: Mon, 26 Feb 2024 12:07:34 +0000 (+0100) Subject: 6.7-stable patches X-Git-Tag: v4.19.308~59 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=e3be74a119eab9fb972778f4714a93b8f83723f8;p=thirdparty%2Fkernel%2Fstable-queue.git 6.7-stable patches added patches: accel-ivpu-don-t-enable-any-tiles-by-default-on-vpu40xx.patch ata-libata-core-do-not-call-ata_dev_power_set_standby-twice.patch ata-libata-core-do-not-try-to-set-sleeping-devices-to-standby.patch btrfs-defrag-avoid-unnecessary-defrag-caused-by-incorrect-extent-size.patch btrfs-fix-deadlock-with-fiemap-and-extent-locking.patch cachefiles-fix-memory-leak-in-cachefiles_add_cache.patch crypto-virtio-akcipher-fix-stack-overflow-on-memcpy.patch cxl-acpi-fix-load-failures-due-to-single-window-creation-failure.patch cxl-pci-fix-disabling-memory-if-dvsec-cxl-range-does-not-match-a-cfmws-window.patch cxl-pci-skip-to-handle-ras-errors-if-cxl.mem-device-is-detached.patch dm-crypt-don-t-modify-the-data-when-using-authenticated-encryption.patch dm-crypt-recheck-the-integrity-tag-after-a-failure.patch dm-integrity-recheck-the-integrity-tag-after-a-failure.patch dm-verity-recheck-the-hash-after-a-failure.patch docs-instruct-latex-to-cope-with-deeper-nesting.patch drm-amd-display-adjust-few-initialization-order-in-dm.patch drm-amd-display-only-allow-dig-mapping-to-pwrseq-in-new-asic.patch drm-amdgpu-fix-the-runtime-resume-failure-issue.patch drm-buddy-modify-duplicate-list_splice_tail-call.patch drm-meson-don-t-remove-bridges-which-are-created-by-other-drivers.patch drm-ttm-fix-an-invalid-freeing-on-already-freed-page-in-error-path.patch fs-aio-restrict-kiocb_set_cancel_fn-to-i-o-submitted-via-libaio.patch gtp-fix-use-after-free-and-null-ptr-deref-in-gtp_genl_dump_pdp.patch kvm-arm64-vgic-its-test-for-valid-irq-in-its_sync_lpi_pending_table.patch kvm-arm64-vgic-its-test-for-valid-irq-in-movall-handler.patch lib-kconfig.debug-test_iov_iter-depends-on-mmu.patch 
loongarch-call-early_init_fdt_scan_reserved_mem-earlier.patch loongarch-disable-irq-before-init_fn-for-nonboot-cpus.patch loongarch-update-cpu_sibling_map-when-disabling-nonboot-cpus.patch md-don-t-ignore-read-only-array-in-md_check_recovery.patch md-don-t-ignore-suspended-array-in-md_check_recovery.patch md-don-t-register-sync_thread-for-reshape-directly.patch md-don-t-suspend-the-array-for-interrupted-reshape.patch md-fix-missing-release-of-active_io-for-flush.patch md-make-sure-md_do_sync-will-set-md_recovery_done.patch mm-damon-core-check-apply-interval-in-damon_do_apply_schemes.patch mm-damon-lru_sort-fix-quota-status-loss-due-to-online-tunings.patch mm-damon-reclaim-fix-quota-stauts-loss-due-to-online-tunings.patch mm-memblock-add-memblock_rsrv_noinit-into-flagname-array.patch mm-memcontrol-clarify-swapaccount-0-deprecation-warning.patch mm-swap-fix-race-when-skipping-swapcache.patch platform-x86-intel-vbtn-stop-calling-vbdl-from-notify_handler.patch platform-x86-touchscreen_dmi-allow-partial-prefix-matches-for-acpi-names.patch platform-x86-x86-android-tablets-fix-keyboard-touchscreen-on-lenovo-yogabook1-x90.patch revert-parisc-only-list-existing-cpus-in-cpu_possible_mask.patch s390-cio-fix-invalid-ebusy-on-ccw_device_start.patch scsi-core-consult-supported-vpd-page-list-prior-to-fetching-page.patch scsi-sd-usb_storage-uas-access-media-prior-to-querying-device-properties.patch scsi-target-pscsi-fix-bio_put-for-error-case.patch selftests-mm-uffd-unit-test-check-if-huge-page-size-is-0.patch sparc-fix-undefined-reference-to-fb_is_primary_device.patch x86-bugs-add-asm-helpers-for-executing-verw.patch --- diff --git a/queue-6.7/accel-ivpu-don-t-enable-any-tiles-by-default-on-vpu40xx.patch b/queue-6.7/accel-ivpu-don-t-enable-any-tiles-by-default-on-vpu40xx.patch new file mode 100644 index 00000000000..37da4262ebe --- /dev/null +++ b/queue-6.7/accel-ivpu-don-t-enable-any-tiles-by-default-on-vpu40xx.patch @@ -0,0 +1,39 @@ +From 
eb0d253ff9c74dee30aa92fe460b825eb28acd73 Mon Sep 17 00:00:00 2001 +From: Andrzej Kacprowski +Date: Tue, 20 Feb 2024 14:16:24 +0100 +Subject: accel/ivpu: Don't enable any tiles by default on VPU40xx + +From: Andrzej Kacprowski + +commit eb0d253ff9c74dee30aa92fe460b825eb28acd73 upstream. + +There is no point in requesting 1 tile on VPU40xx as the FW will +probably need more tiles to run workloads, so it will have to +reconfigure PLL anyway. Don't enable any tiles and allow the FW to +perform initial tile configuration. + +This improves NPU boot stability as the tiles are always enabled only +by the FW from the same initial state. + +Fixes: 79cdc56c4a54 ("accel/ivpu: Add initial support for VPU 4") +Cc: stable@vger.kernel.org +Signed-off-by: Andrzej Kacprowski +Signed-off-by: Jacek Lawrynowicz +Reviewed-by: Jeffrey Hugo +Link: https://patchwork.freedesktop.org/patch/msgid/20240220131624.1447813-1-jacek.lawrynowicz@linux.intel.com +Signed-off-by: Greg Kroah-Hartman +--- + drivers/accel/ivpu/ivpu_hw_40xx.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/accel/ivpu/ivpu_hw_40xx.c ++++ b/drivers/accel/ivpu/ivpu_hw_40xx.c +@@ -24,7 +24,7 @@ + #define SKU_HW_ID_SHIFT 16u + #define SKU_HW_ID_MASK 0xffff0000u + +-#define PLL_CONFIG_DEFAULT 0x1 ++#define PLL_CONFIG_DEFAULT 0x0 + #define PLL_CDYN_DEFAULT 0x80 + #define PLL_EPP_DEFAULT 0x80 + #define PLL_REF_CLK_FREQ (50 * 1000000) diff --git a/queue-6.7/ata-libata-core-do-not-call-ata_dev_power_set_standby-twice.patch b/queue-6.7/ata-libata-core-do-not-call-ata_dev_power_set_standby-twice.patch new file mode 100644 index 00000000000..91718a46629 --- /dev/null +++ b/queue-6.7/ata-libata-core-do-not-call-ata_dev_power_set_standby-twice.patch @@ -0,0 +1,108 @@ +From 9cec467d0502b24660f413a0e8fc782903b46d5b Mon Sep 17 00:00:00 2001 +From: Damien Le Moal +Date: Mon, 19 Feb 2024 16:44:30 +0100 +Subject: ata: libata-core: Do not call ata_dev_power_set_standby() twice + +From: Damien Le Moal + +commit 
9cec467d0502b24660f413a0e8fc782903b46d5b upstream. + +For regular system shutdown, ata_dev_power_set_standby() will be +executed twice: once the scsi device is removed and another when +ata_pci_shutdown_one() executes and EH completes unloading the devices. + +Make the second call to ata_dev_power_set_standby() do nothing by using +ata_dev_power_is_active() and return if the device is already in +standby. + +Fixes: 2da4c5e24e86 ("ata: libata-core: Improve ata_dev_power_set_active()") +Cc: stable@vger.kernel.org +Signed-off-by: Damien Le Moal +Signed-off-by: Niklas Cassel +Signed-off-by: Greg Kroah-Hartman +--- + drivers/ata/libata-core.c | 59 +++++++++++++++++++++++----------------------- + 1 file changed, 30 insertions(+), 29 deletions(-) + +--- a/drivers/ata/libata-core.c ++++ b/drivers/ata/libata-core.c +@@ -2001,6 +2001,33 @@ bool ata_dev_power_init_tf(struct ata_de + return true; + } + ++static bool ata_dev_power_is_active(struct ata_device *dev) ++{ ++ struct ata_taskfile tf; ++ unsigned int err_mask; ++ ++ ata_tf_init(dev, &tf); ++ tf.flags |= ATA_TFLAG_DEVICE | ATA_TFLAG_ISADDR; ++ tf.protocol = ATA_PROT_NODATA; ++ tf.command = ATA_CMD_CHK_POWER; ++ ++ err_mask = ata_exec_internal(dev, &tf, NULL, DMA_NONE, NULL, 0, 0); ++ if (err_mask) { ++ ata_dev_err(dev, "Check power mode failed (err_mask=0x%x)\n", ++ err_mask); ++ /* ++ * Assume we are in standby mode so that we always force a ++ * spinup in ata_dev_power_set_active(). ++ */ ++ return false; ++ } ++ ++ ata_dev_dbg(dev, "Power mode: 0x%02x\n", tf.nsect); ++ ++ /* Active or idle */ ++ return tf.nsect == 0xff; ++} ++ + /** + * ata_dev_power_set_standby - Set a device power mode to standby + * @dev: target device +@@ -2017,8 +2044,9 @@ void ata_dev_power_set_standby(struct at + struct ata_taskfile tf; + unsigned int err_mask; + +- /* If the device is already sleeping, do nothing. */ +- if (dev->flags & ATA_DFLAG_SLEEPING) ++ /* If the device is already sleeping or in standby, do nothing. 
*/ ++ if ((dev->flags & ATA_DFLAG_SLEEPING) || ++ !ata_dev_power_is_active(dev)) + return; + + /* +@@ -2046,33 +2074,6 @@ void ata_dev_power_set_standby(struct at + err_mask); + } + +-static bool ata_dev_power_is_active(struct ata_device *dev) +-{ +- struct ata_taskfile tf; +- unsigned int err_mask; +- +- ata_tf_init(dev, &tf); +- tf.flags |= ATA_TFLAG_DEVICE | ATA_TFLAG_ISADDR; +- tf.protocol = ATA_PROT_NODATA; +- tf.command = ATA_CMD_CHK_POWER; +- +- err_mask = ata_exec_internal(dev, &tf, NULL, DMA_NONE, NULL, 0, 0); +- if (err_mask) { +- ata_dev_err(dev, "Check power mode failed (err_mask=0x%x)\n", +- err_mask); +- /* +- * Assume we are in standby mode so that we always force a +- * spinup in ata_dev_power_set_active(). +- */ +- return false; +- } +- +- ata_dev_dbg(dev, "Power mode: 0x%02x\n", tf.nsect); +- +- /* Active or idle */ +- return tf.nsect == 0xff; +-} +- + /** + * ata_dev_power_set_active - Set a device power mode to active + * @dev: target device diff --git a/queue-6.7/ata-libata-core-do-not-try-to-set-sleeping-devices-to-standby.patch b/queue-6.7/ata-libata-core-do-not-try-to-set-sleeping-devices-to-standby.patch new file mode 100644 index 00000000000..2e1d805f799 --- /dev/null +++ b/queue-6.7/ata-libata-core-do-not-try-to-set-sleeping-devices-to-standby.patch @@ -0,0 +1,34 @@ +From 4b085736e44dbbe69b5eea1a8a294f404678a1f4 Mon Sep 17 00:00:00 2001 +From: Damien Le Moal +Date: Thu, 11 Jan 2024 20:51:22 +0900 +Subject: ata: libata-core: Do not try to set sleeping devices to standby + +From: Damien Le Moal + +commit 4b085736e44dbbe69b5eea1a8a294f404678a1f4 upstream. + +In ata ata_dev_power_set_standby(), check that the target device is not +sleeping. If it is, there is no need to do anything. 
+ +Fixes: aa3998dbeb3a ("ata: libata-scsi: Disable scsi device manage_system_start_stop") +Cc: stable@vger.kernel.org +Signed-off-by: Damien Le Moal +Signed-off-by: Niklas Cassel +Signed-off-by: Greg Kroah-Hartman +--- + drivers/ata/libata-core.c | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/drivers/ata/libata-core.c ++++ b/drivers/ata/libata-core.c +@@ -2017,6 +2017,10 @@ void ata_dev_power_set_standby(struct at + struct ata_taskfile tf; + unsigned int err_mask; + ++ /* If the device is already sleeping, do nothing. */ ++ if (dev->flags & ATA_DFLAG_SLEEPING) ++ return; ++ + /* + * Some odd clown BIOSes issue spindown on power off (ACPI S4 or S5) + * causing some drives to spin up and down again. For these, do nothing diff --git a/queue-6.7/btrfs-defrag-avoid-unnecessary-defrag-caused-by-incorrect-extent-size.patch b/queue-6.7/btrfs-defrag-avoid-unnecessary-defrag-caused-by-incorrect-extent-size.patch new file mode 100644 index 00000000000..e8aa1b2fc8b --- /dev/null +++ b/queue-6.7/btrfs-defrag-avoid-unnecessary-defrag-caused-by-incorrect-extent-size.patch @@ -0,0 +1,111 @@ +From e42b9d8b9ea2672811285e6a7654887ff64d23f3 Mon Sep 17 00:00:00 2001 +From: Qu Wenruo +Date: Wed, 7 Feb 2024 10:00:42 +1030 +Subject: btrfs: defrag: avoid unnecessary defrag caused by incorrect extent size + +From: Qu Wenruo + +commit e42b9d8b9ea2672811285e6a7654887ff64d23f3 upstream. + +[BUG] +With the following file extent layout, defrag would do unnecessary IO +and result more on-disk space usage. 
+ + # mkfs.btrfs -f $dev + # mount $dev $mnt + # xfs_io -f -c "pwrite 0 40m" $mnt/foobar + # sync + # xfs_io -f -c "pwrite 40m 16k" $mnt/foobar + # sync + +Above command would lead to the following file extent layout: + + item 6 key (257 EXTENT_DATA 0) itemoff 15816 itemsize 53 + generation 7 type 1 (regular) + extent data disk byte 298844160 nr 41943040 + extent data offset 0 nr 41943040 ram 41943040 + extent compression 0 (none) + item 7 key (257 EXTENT_DATA 41943040) itemoff 15763 itemsize 53 + generation 8 type 1 (regular) + extent data disk byte 13631488 nr 16384 + extent data offset 0 nr 16384 ram 16384 + extent compression 0 (none) + +Which is mostly fine. We can allow the final 16K to be merged with the +previous 40M, but it's upon the end users' preference. + +But if we defrag the file using the default parameters, it would result +worse file layout: + + # btrfs filesystem defrag $mnt/foobar + # sync + + item 6 key (257 EXTENT_DATA 0) itemoff 15816 itemsize 53 + generation 7 type 1 (regular) + extent data disk byte 298844160 nr 41943040 + extent data offset 0 nr 8650752 ram 41943040 + extent compression 0 (none) + item 7 key (257 EXTENT_DATA 8650752) itemoff 15763 itemsize 53 + generation 9 type 1 (regular) + extent data disk byte 340787200 nr 33292288 + extent data offset 0 nr 33292288 ram 33292288 + extent compression 0 (none) + item 8 key (257 EXTENT_DATA 41943040) itemoff 15710 itemsize 53 + generation 8 type 1 (regular) + extent data disk byte 13631488 nr 16384 + extent data offset 0 nr 16384 ram 16384 + extent compression 0 (none) + +Note the original 40M extent is still there, but a new 32M extent is +created for no benefit at all. + +[CAUSE] +There is an existing check to make sure we won't defrag a large enough +extent (the threshold is by default 32M). 
+ +But the check is using the length to the end of the extent: + + range_len = em->len - (cur - em->start); + + /* Skip too large extent */ + if (range_len >= extent_thresh) + goto next; + +This means, for the first 8MiB of the extent, the range_len is always +smaller than the default threshold, and would not be defragged. +But after the first 8MiB, the remaining part would fit the requirement, +and be defragged. + +Such different behavior inside the same extent caused the above problem, +and we should avoid different defrag decision inside the same extent. + +[FIX] +Instead of using @range_len, just use @em->len, so that we have a +consistent decision among the same file extent. + +Now with this fix, we won't touch the extent, thus not making it any +worse. + +Reported-by: Filipe Manana +Fixes: 0cb5950f3f3b ("btrfs: fix deadlock when reserving space during defrag") +CC: stable@vger.kernel.org # 6.1+ +Reviewed-by: Boris Burkov +Reviewed-by: Filipe Manana +Signed-off-by: Qu Wenruo +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/defrag.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/btrfs/defrag.c ++++ b/fs/btrfs/defrag.c +@@ -1047,7 +1047,7 @@ static int defrag_collect_targets(struct + goto add; + + /* Skip too large extent */ +- if (range_len >= extent_thresh) ++ if (em->len >= extent_thresh) + goto next; + + /* diff --git a/queue-6.7/btrfs-fix-deadlock-with-fiemap-and-extent-locking.patch b/queue-6.7/btrfs-fix-deadlock-with-fiemap-and-extent-locking.patch new file mode 100644 index 00000000000..adef49e6dda --- /dev/null +++ b/queue-6.7/btrfs-fix-deadlock-with-fiemap-and-extent-locking.patch @@ -0,0 +1,241 @@ +From b0ad381fa7690244802aed119b478b4bdafc31dd Mon Sep 17 00:00:00 2001 +From: Josef Bacik +Date: Mon, 12 Feb 2024 11:56:02 -0500 +Subject: btrfs: fix deadlock with fiemap and extent locking + +From: Josef Bacik + +commit b0ad381fa7690244802aed119b478b4bdafc31dd upstream. 
+ +While working on the patchset to remove extent locking I got a lockdep +splat with fiemap and pagefaulting with my new extent lock replacement +lock. + +This deadlock exists with our normal code, we just don't have lockdep +annotations with the extent locking so we've never noticed it. + +Since we're copying the fiemap extent to user space on every iteration +we have the chance of pagefaulting. Because we hold the extent lock for +the entire range we could mkwrite into a range in the file that we have +mmap'ed. This would deadlock with the following stack trace + +[<0>] lock_extent+0x28d/0x2f0 +[<0>] btrfs_page_mkwrite+0x273/0x8a0 +[<0>] do_page_mkwrite+0x50/0xb0 +[<0>] do_fault+0xc1/0x7b0 +[<0>] __handle_mm_fault+0x2fa/0x460 +[<0>] handle_mm_fault+0xa4/0x330 +[<0>] do_user_addr_fault+0x1f4/0x800 +[<0>] exc_page_fault+0x7c/0x1e0 +[<0>] asm_exc_page_fault+0x26/0x30 +[<0>] rep_movs_alternative+0x33/0x70 +[<0>] _copy_to_user+0x49/0x70 +[<0>] fiemap_fill_next_extent+0xc8/0x120 +[<0>] emit_fiemap_extent+0x4d/0xa0 +[<0>] extent_fiemap+0x7f8/0xad0 +[<0>] btrfs_fiemap+0x49/0x80 +[<0>] __x64_sys_ioctl+0x3e1/0xb50 +[<0>] do_syscall_64+0x94/0x1a0 +[<0>] entry_SYSCALL_64_after_hwframe+0x6e/0x76 + +I wrote an fstest to reproduce this deadlock without my replacement lock +and verified that the deadlock exists with our existing locking. + +To fix this simply don't take the extent lock for the entire duration of +the fiemap. This is safe in general because we keep track of where we +are when we're searching the tree, so if an ordered extent updates in +the middle of our fiemap call we'll still emit the correct extents +because we know what offset we were on before. + +The only place we maintain the lock is searching delalloc. Since the +delalloc stuff can change during writeback we want to lock the extent +range so we have a consistent view of delalloc at the time we're +checking to see if we need to set the delalloc flag. 
+ +With this patch applied we no longer deadlock with my testcase. + +CC: stable@vger.kernel.org # 6.1+ +Reviewed-by: Filipe Manana +Signed-off-by: Josef Bacik +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/extent_io.c | 62 +++++++++++++++++++++++++++++++++++++-------------- + 1 file changed, 45 insertions(+), 17 deletions(-) + +--- a/fs/btrfs/extent_io.c ++++ b/fs/btrfs/extent_io.c +@@ -2645,16 +2645,34 @@ static int fiemap_process_hole(struct bt + * it beyond i_size. + */ + while (cur_offset < end && cur_offset < i_size) { ++ struct extent_state *cached_state = NULL; + u64 delalloc_start; + u64 delalloc_end; + u64 prealloc_start; ++ u64 lockstart; ++ u64 lockend; + u64 prealloc_len = 0; + bool delalloc; + ++ lockstart = round_down(cur_offset, inode->root->fs_info->sectorsize); ++ lockend = round_up(end, inode->root->fs_info->sectorsize); ++ ++ /* ++ * We are only locking for the delalloc range because that's the ++ * only thing that can change here. With fiemap we have a lock ++ * on the inode, so no buffered or direct writes can happen. ++ * ++ * However mmaps and normal page writeback will cause this to ++ * change arbitrarily. We have to lock the extent lock here to ++ * make sure that nobody messes with the tree while we're doing ++ * btrfs_find_delalloc_in_range. 
++ */ ++ lock_extent(&inode->io_tree, lockstart, lockend, &cached_state); + delalloc = btrfs_find_delalloc_in_range(inode, cur_offset, end, + delalloc_cached_state, + &delalloc_start, + &delalloc_end); ++ unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); + if (!delalloc) + break; + +@@ -2822,15 +2840,15 @@ int extent_fiemap(struct btrfs_inode *in + u64 start, u64 len) + { + const u64 ino = btrfs_ino(inode); +- struct extent_state *cached_state = NULL; + struct extent_state *delalloc_cached_state = NULL; + struct btrfs_path *path; + struct fiemap_cache cache = { 0 }; + struct btrfs_backref_share_check_ctx *backref_ctx; + u64 last_extent_end; + u64 prev_extent_end; +- u64 lockstart; +- u64 lockend; ++ u64 range_start; ++ u64 range_end; ++ const u64 sectorsize = inode->root->fs_info->sectorsize; + bool stopped = false; + int ret; + +@@ -2841,12 +2859,11 @@ int extent_fiemap(struct btrfs_inode *in + goto out; + } + +- lockstart = round_down(start, inode->root->fs_info->sectorsize); +- lockend = round_up(start + len, inode->root->fs_info->sectorsize); +- prev_extent_end = lockstart; ++ range_start = round_down(start, sectorsize); ++ range_end = round_up(start + len, sectorsize); ++ prev_extent_end = range_start; + + btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); +- lock_extent(&inode->io_tree, lockstart, lockend, &cached_state); + + ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end); + if (ret < 0) +@@ -2854,7 +2871,7 @@ int extent_fiemap(struct btrfs_inode *in + btrfs_release_path(path); + + path->reada = READA_FORWARD; +- ret = fiemap_search_slot(inode, path, lockstart); ++ ret = fiemap_search_slot(inode, path, range_start); + if (ret < 0) { + goto out_unlock; + } else if (ret > 0) { +@@ -2866,7 +2883,7 @@ int extent_fiemap(struct btrfs_inode *in + goto check_eof_delalloc; + } + +- while (prev_extent_end < lockend) { ++ while (prev_extent_end < range_end) { + struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_file_extent_item 
*ei; + struct btrfs_key key; +@@ -2889,19 +2906,19 @@ int extent_fiemap(struct btrfs_inode *in + * The first iteration can leave us at an extent item that ends + * before our range's start. Move to the next item. + */ +- if (extent_end <= lockstart) ++ if (extent_end <= range_start) + goto next_item; + + backref_ctx->curr_leaf_bytenr = leaf->start; + + /* We have in implicit hole (NO_HOLES feature enabled). */ + if (prev_extent_end < key.offset) { +- const u64 range_end = min(key.offset, lockend) - 1; ++ const u64 hole_end = min(key.offset, range_end) - 1; + + ret = fiemap_process_hole(inode, fieinfo, &cache, + &delalloc_cached_state, + backref_ctx, 0, 0, 0, +- prev_extent_end, range_end); ++ prev_extent_end, hole_end); + if (ret < 0) { + goto out_unlock; + } else if (ret > 0) { +@@ -2911,7 +2928,7 @@ int extent_fiemap(struct btrfs_inode *in + } + + /* We've reached the end of the fiemap range, stop. */ +- if (key.offset >= lockend) { ++ if (key.offset >= range_end) { + stopped = true; + break; + } +@@ -3005,29 +3022,41 @@ check_eof_delalloc: + btrfs_free_path(path); + path = NULL; + +- if (!stopped && prev_extent_end < lockend) { ++ if (!stopped && prev_extent_end < range_end) { + ret = fiemap_process_hole(inode, fieinfo, &cache, + &delalloc_cached_state, backref_ctx, +- 0, 0, 0, prev_extent_end, lockend - 1); ++ 0, 0, 0, prev_extent_end, range_end - 1); + if (ret < 0) + goto out_unlock; +- prev_extent_end = lockend; ++ prev_extent_end = range_end; + } + + if (cache.cached && cache.offset + cache.len >= last_extent_end) { + const u64 i_size = i_size_read(&inode->vfs_inode); + + if (prev_extent_end < i_size) { ++ struct extent_state *cached_state = NULL; + u64 delalloc_start; + u64 delalloc_end; ++ u64 lockstart; ++ u64 lockend; + bool delalloc; + ++ lockstart = round_down(prev_extent_end, sectorsize); ++ lockend = round_up(i_size, sectorsize); ++ ++ /* ++ * See the comment in fiemap_process_hole as to why ++ * we're doing the locking here. 
++ */ ++ lock_extent(&inode->io_tree, lockstart, lockend, &cached_state); + delalloc = btrfs_find_delalloc_in_range(inode, + prev_extent_end, + i_size - 1, + &delalloc_cached_state, + &delalloc_start, + &delalloc_end); ++ unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); + if (!delalloc) + cache.flags |= FIEMAP_EXTENT_LAST; + } else { +@@ -3038,7 +3067,6 @@ check_eof_delalloc: + ret = emit_last_fiemap_cache(fieinfo, &cache); + + out_unlock: +- unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); + out: + free_extent_state(delalloc_cached_state); diff --git a/queue-6.7/cachefiles-fix-memory-leak-in-cachefiles_add_cache.patch b/queue-6.7/cachefiles-fix-memory-leak-in-cachefiles_add_cache.patch new file mode 100644 index 00000000000..824ad33cc07 --- /dev/null +++ b/queue-6.7/cachefiles-fix-memory-leak-in-cachefiles_add_cache.patch @@ -0,0 +1,68 @@ +From e21a2f17566cbd64926fb8f16323972f7a064444 Mon Sep 17 00:00:00 2001 +From: Baokun Li +Date: Sat, 17 Feb 2024 16:14:31 +0800 +Subject: cachefiles: fix memory leak in cachefiles_add_cache() + +From: Baokun Li + +commit e21a2f17566cbd64926fb8f16323972f7a064444 upstream. + +The following memory leak was reported after unbinding /dev/cachefiles: + +================================================================== +unreferenced object 0xffff9b674176e3c0 (size 192): + comm "cachefilesd2", pid 680, jiffies 4294881224 + hex dump (first 32 bytes): + 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 
+ backtrace (crc ea38a44b): + [] kmem_cache_alloc+0x2d5/0x370 + [] prepare_creds+0x26/0x2e0 + [] cachefiles_determine_cache_security+0x1f/0x120 + [] cachefiles_add_cache+0x13c/0x3a0 + [] cachefiles_daemon_write+0x146/0x1c0 + [] vfs_write+0xcb/0x520 + [] ksys_write+0x69/0xf0 + [] do_syscall_64+0x72/0x140 + [] entry_SYSCALL_64_after_hwframe+0x6e/0x76 +================================================================== + +Put the reference count of cache_cred in cachefiles_daemon_unbind() to +fix the problem. And also put cache_cred in cachefiles_add_cache() error +branch to avoid memory leaks. + +Fixes: 9ae326a69004 ("CacheFiles: A cache that backs onto a mounted filesystem") +CC: stable@vger.kernel.org +Signed-off-by: Baokun Li +Link: https://lore.kernel.org/r/20240217081431.796809-1-libaokun1@huawei.com +Acked-by: David Howells +Reviewed-by: Jingbo Xu +Reviewed-by: Jeff Layton +Signed-off-by: Christian Brauner +Signed-off-by: Greg Kroah-Hartman +--- + fs/cachefiles/cache.c | 2 ++ + fs/cachefiles/daemon.c | 1 + + 2 files changed, 3 insertions(+) + +--- a/fs/cachefiles/cache.c ++++ b/fs/cachefiles/cache.c +@@ -168,6 +168,8 @@ error_unsupported: + dput(root); + error_open_root: + cachefiles_end_secure(cache, saved_cred); ++ put_cred(cache->cache_cred); ++ cache->cache_cred = NULL; + error_getsec: + fscache_relinquish_cache(cache_cookie); + cache->cache = NULL; +--- a/fs/cachefiles/daemon.c ++++ b/fs/cachefiles/daemon.c +@@ -805,6 +805,7 @@ static void cachefiles_daemon_unbind(str + cachefiles_put_directory(cache->graveyard); + cachefiles_put_directory(cache->store); + mntput(cache->mnt); ++ put_cred(cache->cache_cred); + + kfree(cache->rootdirname); + kfree(cache->secctx); diff --git a/queue-6.7/crypto-virtio-akcipher-fix-stack-overflow-on-memcpy.patch b/queue-6.7/crypto-virtio-akcipher-fix-stack-overflow-on-memcpy.patch new file mode 100644 index 00000000000..18bccb764ed --- /dev/null +++ b/queue-6.7/crypto-virtio-akcipher-fix-stack-overflow-on-memcpy.patch @@ -0,0 
+1,52 @@ +From c0ec2a712daf133d9996a8a1b7ee2d4996080363 Mon Sep 17 00:00:00 2001 +From: zhenwei pi +Date: Tue, 30 Jan 2024 19:27:40 +0800 +Subject: crypto: virtio/akcipher - Fix stack overflow on memcpy + +From: zhenwei pi + +commit c0ec2a712daf133d9996a8a1b7ee2d4996080363 upstream. + +sizeof(struct virtio_crypto_akcipher_session_para) is less than +sizeof(struct virtio_crypto_op_ctrl_req::u), so copying more bytes from +the stack variable leads to a stack overflow. Clang reports this issue with +the commands: +make -j CC=clang-14 mrproper >/dev/null 2>&1 +make -j O=/tmp/crypto-build CC=clang-14 allmodconfig >/dev/null 2>&1 +make -j O=/tmp/crypto-build W=1 CC=clang-14 drivers/crypto/virtio/ + virtio_crypto_akcipher_algs.o + +Fixes: 59ca6c93387d ("virtio-crypto: implement RSA algorithm") +Link: https://lore.kernel.org/all/0a194a79-e3a3-45e7-be98-83abd3e1cb7e@roeck-us.net/ +Cc: +Signed-off-by: zhenwei pi +Tested-by: Nathan Chancellor # build +Acked-by: Michael S. Tsirkin +Acked-by: Jason Wang +Signed-off-by: Herbert Xu +Signed-off-by: Greg Kroah-Hartman +--- + drivers/crypto/virtio/virtio_crypto_akcipher_algs.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +--- a/drivers/crypto/virtio/virtio_crypto_akcipher_algs.c ++++ b/drivers/crypto/virtio/virtio_crypto_akcipher_algs.c +@@ -104,7 +104,8 @@ static void virtio_crypto_dataq_akcipher + } + + static int virtio_crypto_alg_akcipher_init_session(struct virtio_crypto_akcipher_ctx *ctx, +- struct virtio_crypto_ctrl_header *header, void *para, ++ struct virtio_crypto_ctrl_header *header, ++ struct virtio_crypto_akcipher_session_para *para, + const uint8_t *key, unsigned int keylen) + { + struct scatterlist outhdr_sg, key_sg, inhdr_sg, *sgs[3]; +@@ -128,7 +129,7 @@ static int virtio_crypto_alg_akcipher_in + + ctrl = &vc_ctrl_req->ctrl; + memcpy(&ctrl->header, header, sizeof(ctrl->header)); +- memcpy(&ctrl->u, para, sizeof(ctrl->u)); ++ memcpy(&ctrl->u.akcipher_create_session.para, para, sizeof(*para)); + input = 
&vc_ctrl_req->input; + input->status = cpu_to_le32(VIRTIO_CRYPTO_ERR); + diff --git a/queue-6.7/cxl-acpi-fix-load-failures-due-to-single-window-creation-failure.patch b/queue-6.7/cxl-acpi-fix-load-failures-due-to-single-window-creation-failure.patch new file mode 100644 index 00000000000..7e82f0670e6 --- /dev/null +++ b/queue-6.7/cxl-acpi-fix-load-failures-due-to-single-window-creation-failure.patch @@ -0,0 +1,139 @@ +From 5c6224bfabbf7f3e491c51ab50fd2c6f92ba1141 Mon Sep 17 00:00:00 2001 +From: Dan Williams +Date: Fri, 16 Feb 2024 19:11:34 -0800 +Subject: cxl/acpi: Fix load failures due to single window creation failure + +From: Dan Williams + +commit 5c6224bfabbf7f3e491c51ab50fd2c6f92ba1141 upstream. + +The expectation is that cxl_parse_cfmws() continues in the face of +failure as evidenced by code like: + + cxlrd = cxl_root_decoder_alloc(root_port, ways, cxl_calc_hb); + if (IS_ERR(cxlrd)) + return 0; + +There are other error paths in that function which mistakenly follow +idiomatic expectations and return an error when they should not. Most of +those mistakes are innocuous checks that hardly ever fail in practice. +However, a recent change succeeded in making the implementation more +fragile by applying an idiomatic, but still wrong "fix" [1]. In this +failure case the kernel reports: + + cxl root0: Failed to populate active decoder targets + cxl_acpi ACPI0017:00: Failed to add decode range: [mem 0x00000000-0x7fffffff flags 0x200] + +...which is a real issue with that one window (to be fixed separately), +but ends up failing the entirety of cxl_acpi_probe(). + +Undo that recent breakage while also removing the confusion about +ignoring errors. Update all exit paths to return an error per typical +expectations and let an outer wrapper function handle dropping the +error. 
+ +Fixes: 91019b5bc7c2 ("cxl/acpi: Return 'rc' instead of '0' in cxl_parse_cfmws()") [1] +Cc: +Cc: Breno Leitao +Cc: Alison Schofield +Cc: Vishal Verma +Signed-off-by: Dan Williams +Signed-off-by: Greg Kroah-Hartman +--- + drivers/cxl/acpi.c | 46 ++++++++++++++++++++++++++++------------------ + 1 file changed, 28 insertions(+), 18 deletions(-) + +--- a/drivers/cxl/acpi.c ++++ b/drivers/cxl/acpi.c +@@ -194,31 +194,27 @@ struct cxl_cfmws_context { + int id; + }; + +-static int cxl_parse_cfmws(union acpi_subtable_headers *header, void *arg, +- const unsigned long end) ++static int __cxl_parse_cfmws(struct acpi_cedt_cfmws *cfmws, ++ struct cxl_cfmws_context *ctx) + { + int target_map[CXL_DECODER_MAX_INTERLEAVE]; +- struct cxl_cfmws_context *ctx = arg; + struct cxl_port *root_port = ctx->root_port; + struct resource *cxl_res = ctx->cxl_res; + struct cxl_cxims_context cxims_ctx; + struct cxl_root_decoder *cxlrd; + struct device *dev = ctx->dev; +- struct acpi_cedt_cfmws *cfmws; + cxl_calc_hb_fn cxl_calc_hb; + struct cxl_decoder *cxld; + unsigned int ways, i, ig; + struct resource *res; + int rc; + +- cfmws = (struct acpi_cedt_cfmws *) header; +- + rc = cxl_acpi_cfmws_verify(dev, cfmws); + if (rc) { + dev_err(dev, "CFMWS range %#llx-%#llx not registered\n", + cfmws->base_hpa, + cfmws->base_hpa + cfmws->window_size - 1); +- return 0; ++ return rc; + } + + rc = eiw_to_ways(cfmws->interleave_ways, &ways); +@@ -254,7 +250,7 @@ static int cxl_parse_cfmws(union acpi_su + + cxlrd = cxl_root_decoder_alloc(root_port, ways, cxl_calc_hb); + if (IS_ERR(cxlrd)) +- return 0; ++ return PTR_ERR(cxlrd); + + cxld = &cxlrd->cxlsd.cxld; + cxld->flags = cfmws_to_decoder_flags(cfmws->restrictions); +@@ -298,16 +294,7 @@ err_xormap: + put_device(&cxld->dev); + else + rc = cxl_decoder_autoremove(dev, cxld); +- if (rc) { +- dev_err(dev, "Failed to add decode range: %pr", res); +- return rc; +- } +- dev_dbg(dev, "add: %s node: %d range [%#llx - %#llx]\n", +- dev_name(&cxld->dev), +- 
phys_to_target_node(cxld->hpa_range.start), +- cxld->hpa_range.start, cxld->hpa_range.end); +- +- return 0; ++ return rc; + + err_insert: + kfree(res->name); +@@ -316,6 +303,29 @@ err_name: + return -ENOMEM; + } + ++static int cxl_parse_cfmws(union acpi_subtable_headers *header, void *arg, ++ const unsigned long end) ++{ ++ struct acpi_cedt_cfmws *cfmws = (struct acpi_cedt_cfmws *)header; ++ struct cxl_cfmws_context *ctx = arg; ++ struct device *dev = ctx->dev; ++ int rc; ++ ++ rc = __cxl_parse_cfmws(cfmws, ctx); ++ if (rc) ++ dev_err(dev, ++ "Failed to add decode range: [%#llx - %#llx] (%d)\n", ++ cfmws->base_hpa, ++ cfmws->base_hpa + cfmws->window_size - 1, rc); ++ else ++ dev_dbg(dev, "decode range: node: %d range [%#llx - %#llx]\n", ++ phys_to_target_node(cfmws->base_hpa), cfmws->base_hpa, ++ cfmws->base_hpa + cfmws->window_size - 1); ++ ++ /* never fail cxl_acpi load for a single window failure */ ++ return 0; ++} ++ + __mock struct acpi_device *to_cxl_host_bridge(struct device *host, + struct device *dev) + { diff --git a/queue-6.7/cxl-pci-fix-disabling-memory-if-dvsec-cxl-range-does-not-match-a-cfmws-window.patch b/queue-6.7/cxl-pci-fix-disabling-memory-if-dvsec-cxl-range-does-not-match-a-cfmws-window.patch new file mode 100644 index 00000000000..8110f95fc6e --- /dev/null +++ b/queue-6.7/cxl-pci-fix-disabling-memory-if-dvsec-cxl-range-does-not-match-a-cfmws-window.patch @@ -0,0 +1,56 @@ +From 0cab687205986491302cd2e440ef1d253031c221 Mon Sep 17 00:00:00 2001 +From: Robert Richter +Date: Fri, 16 Feb 2024 17:01:13 +0100 +Subject: cxl/pci: Fix disabling memory if DVSEC CXL Range does not match a CFMWS window + +From: Robert Richter + +commit 0cab687205986491302cd2e440ef1d253031c221 upstream. + +The Linux CXL subsystem is built on the assumption that HPA == SPA. +That is, the host physical address (HPA) the HDM decoder registers are +programmed with are system physical addresses (SPA). 
+ +During HDM decoder setup, the DVSEC CXL range registers (cxl-3.1, +8.1.3.8) are checked if the memory is enabled and the CXL range is in +a HPA window that is described in a CFMWS structure of the CXL host +bridge (cxl-3.1, 9.18.1.3). + +Now, if the HPA is not an SPA, the CXL range does not match a CFMWS +window and the CXL memory range will be disabled then. The HDM decoder +stops working which causes system memory being disabled and further a +system hang during HDM decoder initialization, typically when a CXL +enabled kernel boots. + +Prevent a system hang and do not disable the HDM decoder if the +decoder's CXL range is not found in a CFMWS window. + +Note the change only fixes a hardware hang, but does not implement +HPA/SPA translation. Support for this can be added in a follow on +patch series. + +Signed-off-by: Robert Richter +Fixes: 34e37b4c432c ("cxl/port: Enable HDM Capability after validating DVSEC Ranges") +Cc: +Link: https://lore.kernel.org/r/20240216160113.407141-1-rrichter@amd.com +Signed-off-by: Dan Williams +Signed-off-by: Greg Kroah-Hartman +--- + drivers/cxl/core/pci.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +--- a/drivers/cxl/core/pci.c ++++ b/drivers/cxl/core/pci.c +@@ -476,9 +476,9 @@ int cxl_hdm_decode_init(struct cxl_dev_s + allowed++; + } + +- if (!allowed) { +- cxl_set_mem_enable(cxlds, 0); +- info->mem_enabled = 0; ++ if (!allowed && info->mem_enabled) { ++ dev_err(dev, "Range register decodes outside platform defined CXL ranges.\n"); ++ return -ENXIO; + } + + /* diff --git a/queue-6.7/cxl-pci-skip-to-handle-ras-errors-if-cxl.mem-device-is-detached.patch b/queue-6.7/cxl-pci-skip-to-handle-ras-errors-if-cxl.mem-device-is-detached.patch new file mode 100644 index 00000000000..5b4e625b89d --- /dev/null +++ b/queue-6.7/cxl-pci-skip-to-handle-ras-errors-if-cxl.mem-device-is-detached.patch @@ -0,0 +1,121 @@ +From eef5c7b28dbecd6b141987a96db6c54e49828102 Mon Sep 17 00:00:00 2001 +From: Li Ming +Date: Mon, 29 Jan 2024 
13:18:56 +0000 +Subject: cxl/pci: Skip to handle RAS errors if CXL.mem device is detached + +From: Li Ming + +commit eef5c7b28dbecd6b141987a96db6c54e49828102 upstream. + +The PCI AER model is an awkward fit for CXL error handling. While the +expectation is that a PCI device can escalate to link reset to recover +from an AER event, the same reset on CXL amounts to a surprise memory +hotplug of massive amounts of memory. + +At present, the CXL error handler attempts some optimistic error +handling to unbind the device from the cxl_mem driver after reaping some +RAS register values. This results in a "hopeful" attempt to unplug the +memory, but there is no guarantee that will succeed. + +A subsequent AER notification after the memdev unbind event can no +longer assume the registers are mapped. Check for memdev bind before +reaping status register values to avoid crashes of the form: + + BUG: unable to handle page fault for address: ffa00000195e9100 + #PF: supervisor read access in kernel mode + #PF: error_code(0x0000) - not-present page + [...] + RIP: 0010:__cxl_handle_ras+0x30/0x110 [cxl_core] + [...] + Call Trace: + + ? __die+0x24/0x70 + ? page_fault_oops+0x82/0x160 + ? kernelmode_fixup_or_oops+0x84/0x110 + ? exc_page_fault+0x113/0x170 + ? asm_exc_page_fault+0x26/0x30 + ? __pfx_dpc_reset_link+0x10/0x10 + ? __cxl_handle_ras+0x30/0x110 [cxl_core] + ? find_cxl_port+0x59/0x80 [cxl_core] + cxl_handle_rp_ras+0xbc/0xd0 [cxl_core] + cxl_error_detected+0x6c/0xf0 [cxl_core] + report_error_detected+0xc7/0x1c0 + pci_walk_bus+0x73/0x90 + pcie_do_recovery+0x23f/0x330 + +Longer term, the unbind and PCI_ERS_RESULT_DISCONNECT behavior might +need to be replaced with a new PCI_ERS_RESULT_PANIC. 
+ +Fixes: 6ac07883dbb5 ("cxl/pci: Add RCH downstream port error logging") +Cc: stable@vger.kernel.org +Suggested-by: Dan Williams +Signed-off-by: Li Ming +Link: https://lore.kernel.org/r/20240129131856.2458980-1-ming4.li@intel.com +Signed-off-by: Dan Williams +Signed-off-by: Greg Kroah-Hartman +--- + drivers/cxl/core/pci.c | 43 +++++++++++++++++++++++++++++++------------ + 1 file changed, 31 insertions(+), 12 deletions(-) + +--- a/drivers/cxl/core/pci.c ++++ b/drivers/cxl/core/pci.c +@@ -931,11 +931,21 @@ static void cxl_handle_rdport_errors(str + void cxl_cor_error_detected(struct pci_dev *pdev) + { + struct cxl_dev_state *cxlds = pci_get_drvdata(pdev); ++ struct device *dev = &cxlds->cxlmd->dev; + +- if (cxlds->rcd) +- cxl_handle_rdport_errors(cxlds); ++ scoped_guard(device, dev) { ++ if (!dev->driver) { ++ dev_warn(&pdev->dev, ++ "%s: memdev disabled, abort error handling\n", ++ dev_name(dev)); ++ return; ++ } + +- cxl_handle_endpoint_cor_ras(cxlds); ++ if (cxlds->rcd) ++ cxl_handle_rdport_errors(cxlds); ++ ++ cxl_handle_endpoint_cor_ras(cxlds); ++ } + } + EXPORT_SYMBOL_NS_GPL(cxl_cor_error_detected, CXL); + +@@ -947,16 +957,25 @@ pci_ers_result_t cxl_error_detected(stru + struct device *dev = &cxlmd->dev; + bool ue; + +- if (cxlds->rcd) +- cxl_handle_rdport_errors(cxlds); ++ scoped_guard(device, dev) { ++ if (!dev->driver) { ++ dev_warn(&pdev->dev, ++ "%s: memdev disabled, abort error handling\n", ++ dev_name(dev)); ++ return PCI_ERS_RESULT_DISCONNECT; ++ } ++ ++ if (cxlds->rcd) ++ cxl_handle_rdport_errors(cxlds); ++ /* ++ * A frozen channel indicates an impending reset which is fatal to ++ * CXL.mem operation, and will likely crash the system. On the off ++ * chance the situation is recoverable dump the status of the RAS ++ * capability registers and bounce the active state of the memdev. 
++ */ ++ ue = cxl_handle_endpoint_ras(cxlds); ++ } + +- /* +- * A frozen channel indicates an impending reset which is fatal to +- * CXL.mem operation, and will likely crash the system. On the off +- * chance the situation is recoverable dump the status of the RAS +- * capability registers and bounce the active state of the memdev. +- */ +- ue = cxl_handle_endpoint_ras(cxlds); + + switch (state) { + case pci_channel_io_normal: diff --git a/queue-6.7/dm-crypt-don-t-modify-the-data-when-using-authenticated-encryption.patch b/queue-6.7/dm-crypt-don-t-modify-the-data-when-using-authenticated-encryption.patch new file mode 100644 index 00000000000..177fe88ee3c --- /dev/null +++ b/queue-6.7/dm-crypt-don-t-modify-the-data-when-using-authenticated-encryption.patch @@ -0,0 +1,43 @@ +From 50c70240097ce41fe6bce6478b80478281e4d0f7 Mon Sep 17 00:00:00 2001 +From: Mikulas Patocka +Date: Mon, 19 Feb 2024 21:30:10 +0100 +Subject: dm-crypt: don't modify the data when using authenticated encryption + +From: Mikulas Patocka + +commit 50c70240097ce41fe6bce6478b80478281e4d0f7 upstream. + +It was said that authenticated encryption could produce invalid tag when +the data that is being encrypted is modified [1]. So, fix this problem by +copying the data into the clone bio first and then encrypt them inside the +clone bio. + +This may reduce performance, but it is needed to prevent the user from +corrupting the device by writing data with O_DIRECT and modifying them at +the same time. 
+ +[1] https://lore.kernel.org/all/20240207004723.GA35324@sol.localdomain/T/ + +Signed-off-by: Mikulas Patocka +Cc: stable@vger.kernel.org +Signed-off-by: Mike Snitzer +Signed-off-by: Greg Kroah-Hartman +--- + drivers/md/dm-crypt.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +--- a/drivers/md/dm-crypt.c ++++ b/drivers/md/dm-crypt.c +@@ -2111,6 +2111,12 @@ static void kcryptd_crypt_write_convert( + io->ctx.bio_out = clone; + io->ctx.iter_out = clone->bi_iter; + ++ if (crypt_integrity_aead(cc)) { ++ bio_copy_data(clone, io->base_bio); ++ io->ctx.bio_in = clone; ++ io->ctx.iter_in = clone->bi_iter; ++ } ++ + sector += bio_sectors(clone); + + crypt_inc_pending(io); diff --git a/queue-6.7/dm-crypt-recheck-the-integrity-tag-after-a-failure.patch b/queue-6.7/dm-crypt-recheck-the-integrity-tag-after-a-failure.patch new file mode 100644 index 00000000000..19984f5d0ea --- /dev/null +++ b/queue-6.7/dm-crypt-recheck-the-integrity-tag-after-a-failure.patch @@ -0,0 +1,212 @@ +From 42e15d12070b4ff9af2b980f1b65774c2dab0507 Mon Sep 17 00:00:00 2001 +From: Mikulas Patocka +Date: Mon, 19 Feb 2024 21:31:11 +0100 +Subject: dm-crypt: recheck the integrity tag after a failure + +From: Mikulas Patocka + +commit 42e15d12070b4ff9af2b980f1b65774c2dab0507 upstream. + +If a userspace process reads (with O_DIRECT) multiple blocks into the same +buffer, dm-crypt reports an authentication error [1]. The error is +reported in a log and it may cause RAID leg being kicked out of the +array. + +This commit fixes dm-crypt, so that if integrity verification fails, the +data is read again into a kernel buffer (where userspace can't modify it) +and the integrity tag is rechecked. If the recheck succeeds, the content +of the kernel buffer is copied into the user buffer; if the recheck fails, +an integrity error is reported. 
+ +[1] https://people.redhat.com/~mpatocka/testcases/blk-auth-modify/read2.c + +Signed-off-by: Mikulas Patocka +Cc: stable@vger.kernel.org +Signed-off-by: Mike Snitzer +Signed-off-by: Greg Kroah-Hartman +--- + drivers/md/dm-crypt.c | 89 +++++++++++++++++++++++++++++++++++++++++--------- + 1 file changed, 73 insertions(+), 16 deletions(-) + +--- a/drivers/md/dm-crypt.c ++++ b/drivers/md/dm-crypt.c +@@ -62,6 +62,8 @@ struct convert_context { + struct skcipher_request *req; + struct aead_request *req_aead; + } r; ++ bool aead_recheck; ++ bool aead_failed; + + }; + +@@ -82,6 +84,8 @@ struct dm_crypt_io { + blk_status_t error; + sector_t sector; + ++ struct bvec_iter saved_bi_iter; ++ + struct rb_node rb_node; + } CRYPTO_MINALIGN_ATTR; + +@@ -1370,10 +1374,13 @@ static int crypt_convert_block_aead(stru + if (r == -EBADMSG) { + sector_t s = le64_to_cpu(*sector); + +- DMERR_LIMIT("%pg: INTEGRITY AEAD ERROR, sector %llu", +- ctx->bio_in->bi_bdev, s); +- dm_audit_log_bio(DM_MSG_PREFIX, "integrity-aead", +- ctx->bio_in, s, 0); ++ ctx->aead_failed = true; ++ if (ctx->aead_recheck) { ++ DMERR_LIMIT("%pg: INTEGRITY AEAD ERROR, sector %llu", ++ ctx->bio_in->bi_bdev, s); ++ dm_audit_log_bio(DM_MSG_PREFIX, "integrity-aead", ++ ctx->bio_in, s, 0); ++ } + } + + if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post) +@@ -1757,6 +1764,8 @@ static void crypt_io_init(struct dm_cryp + io->base_bio = bio; + io->sector = sector; + io->error = 0; ++ io->ctx.aead_recheck = false; ++ io->ctx.aead_failed = false; + io->ctx.r.req = NULL; + io->integrity_metadata = NULL; + io->integrity_metadata_from_pool = false; +@@ -1768,6 +1777,8 @@ static void crypt_inc_pending(struct dm_ + atomic_inc(&io->io_pending); + } + ++static void kcryptd_queue_read(struct dm_crypt_io *io); ++ + /* + * One of the bios was finished. Check for completion of + * the whole request and correctly clean up the buffer. 
+@@ -1781,6 +1792,15 @@ static void crypt_dec_pending(struct dm_ + if (!atomic_dec_and_test(&io->io_pending)) + return; + ++ if (likely(!io->ctx.aead_recheck) && unlikely(io->ctx.aead_failed) && ++ cc->on_disk_tag_size && bio_data_dir(base_bio) == READ) { ++ io->ctx.aead_recheck = true; ++ io->ctx.aead_failed = false; ++ io->error = 0; ++ kcryptd_queue_read(io); ++ return; ++ } ++ + if (io->ctx.r.req) + crypt_free_req(cc, io->ctx.r.req, base_bio); + +@@ -1816,15 +1836,19 @@ static void crypt_endio(struct bio *clon + struct dm_crypt_io *io = clone->bi_private; + struct crypt_config *cc = io->cc; + unsigned int rw = bio_data_dir(clone); +- blk_status_t error; ++ blk_status_t error = clone->bi_status; ++ ++ if (io->ctx.aead_recheck && !error) { ++ kcryptd_queue_crypt(io); ++ return; ++ } + + /* + * free the processed pages + */ +- if (rw == WRITE) ++ if (rw == WRITE || io->ctx.aead_recheck) + crypt_free_buffer_pages(cc, clone); + +- error = clone->bi_status; + bio_put(clone); + + if (rw == READ && !error) { +@@ -1845,6 +1869,22 @@ static int kcryptd_io_read(struct dm_cry + struct crypt_config *cc = io->cc; + struct bio *clone; + ++ if (io->ctx.aead_recheck) { ++ if (!(gfp & __GFP_DIRECT_RECLAIM)) ++ return 1; ++ crypt_inc_pending(io); ++ clone = crypt_alloc_buffer(io, io->base_bio->bi_iter.bi_size); ++ if (unlikely(!clone)) { ++ crypt_dec_pending(io); ++ return 1; ++ } ++ clone->bi_iter.bi_sector = cc->start + io->sector; ++ crypt_convert_init(cc, &io->ctx, clone, clone, io->sector); ++ io->saved_bi_iter = clone->bi_iter; ++ dm_submit_bio_remap(io->base_bio, clone); ++ return 0; ++ } ++ + /* + * We need the original biovec array in order to decrypt the whole bio + * data *afterwards* -- thanks to immutable biovecs we don't need to +@@ -2107,6 +2147,14 @@ dec: + + static void kcryptd_crypt_read_done(struct dm_crypt_io *io) + { ++ if (io->ctx.aead_recheck) { ++ if (!io->error) { ++ io->ctx.bio_in->bi_iter = io->saved_bi_iter; ++ bio_copy_data(io->base_bio, 
io->ctx.bio_in); ++ } ++ crypt_free_buffer_pages(io->cc, io->ctx.bio_in); ++ bio_put(io->ctx.bio_in); ++ } + crypt_dec_pending(io); + } + +@@ -2136,11 +2184,17 @@ static void kcryptd_crypt_read_convert(s + + crypt_inc_pending(io); + +- crypt_convert_init(cc, &io->ctx, io->base_bio, io->base_bio, +- io->sector); ++ if (io->ctx.aead_recheck) { ++ io->ctx.cc_sector = io->sector + cc->iv_offset; ++ r = crypt_convert(cc, &io->ctx, ++ test_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags), true); ++ } else { ++ crypt_convert_init(cc, &io->ctx, io->base_bio, io->base_bio, ++ io->sector); + +- r = crypt_convert(cc, &io->ctx, +- test_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags), true); ++ r = crypt_convert(cc, &io->ctx, ++ test_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags), true); ++ } + /* + * Crypto API backlogged the request, because its queue was full + * and we're in softirq context, so continue from a workqueue +@@ -2182,10 +2236,13 @@ static void kcryptd_async_done(void *dat + if (error == -EBADMSG) { + sector_t s = le64_to_cpu(*org_sector_of_dmreq(cc, dmreq)); + +- DMERR_LIMIT("%pg: INTEGRITY AEAD ERROR, sector %llu", +- ctx->bio_in->bi_bdev, s); +- dm_audit_log_bio(DM_MSG_PREFIX, "integrity-aead", +- ctx->bio_in, s, 0); ++ ctx->aead_failed = true; ++ if (ctx->aead_recheck) { ++ DMERR_LIMIT("%pg: INTEGRITY AEAD ERROR, sector %llu", ++ ctx->bio_in->bi_bdev, s); ++ dm_audit_log_bio(DM_MSG_PREFIX, "integrity-aead", ++ ctx->bio_in, s, 0); ++ } + io->error = BLK_STS_PROTECTION; + } else if (error < 0) + io->error = BLK_STS_IOERR; +@@ -3110,7 +3167,7 @@ static int crypt_ctr_optional(struct dm_ + sval = strchr(opt_string + strlen("integrity:"), ':') + 1; + if (!strcasecmp(sval, "aead")) { + set_bit(CRYPT_MODE_INTEGRITY_AEAD, &cc->cipher_flags); +- } else if (strcasecmp(sval, "none")) { ++ } else if (strcasecmp(sval, "none")) { + ti->error = "Unknown integrity profile"; + return -EINVAL; + } diff --git a/queue-6.7/dm-integrity-recheck-the-integrity-tag-after-a-failure.patch 
b/queue-6.7/dm-integrity-recheck-the-integrity-tag-after-a-failure.patch new file mode 100644 index 00000000000..9b48fd72d29 --- /dev/null +++ b/queue-6.7/dm-integrity-recheck-the-integrity-tag-after-a-failure.patch @@ -0,0 +1,159 @@ +From c88f5e553fe38b2ffc4c33d08654e5281b297677 Mon Sep 17 00:00:00 2001 +From: Mikulas Patocka +Date: Mon, 19 Feb 2024 21:27:39 +0100 +Subject: dm-integrity: recheck the integrity tag after a failure + +From: Mikulas Patocka + +commit c88f5e553fe38b2ffc4c33d08654e5281b297677 upstream. + +If a userspace process reads (with O_DIRECT) multiple blocks into the same +buffer, dm-integrity reports an error [1]. The error is reported in a log +and it may cause RAID leg being kicked out of the array. + +This commit fixes dm-integrity, so that if integrity verification fails, +the data is read again into a kernel buffer (where userspace can't modify +it) and the integrity tag is rechecked. If the recheck succeeds, the +content of the kernel buffer is copied into the user buffer; if the +recheck fails, an integrity error is reported. 
+ +[1] https://people.redhat.com/~mpatocka/testcases/blk-auth-modify/read2.c + +Signed-off-by: Mikulas Patocka +Cc: stable@vger.kernel.org +Signed-off-by: Mike Snitzer +Signed-off-by: Greg Kroah-Hartman +--- + drivers/md/dm-integrity.c | 93 +++++++++++++++++++++++++++++++++++++++++----- + 1 file changed, 84 insertions(+), 9 deletions(-) + +--- a/drivers/md/dm-integrity.c ++++ b/drivers/md/dm-integrity.c +@@ -278,6 +278,8 @@ struct dm_integrity_c { + + atomic64_t number_of_mismatches; + ++ mempool_t recheck_pool; ++ + struct notifier_block reboot_notifier; + }; + +@@ -1689,6 +1691,79 @@ failed: + get_random_bytes(result, ic->tag_size); + } + ++static void integrity_recheck(struct dm_integrity_io *dio) ++{ ++ struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); ++ struct dm_integrity_c *ic = dio->ic; ++ struct bvec_iter iter; ++ struct bio_vec bv; ++ sector_t sector, logical_sector, area, offset; ++ char checksum_onstack[max_t(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)]; ++ struct page *page; ++ void *buffer; ++ ++ get_area_and_offset(ic, dio->range.logical_sector, &area, &offset); ++ dio->metadata_block = get_metadata_sector_and_offset(ic, area, offset, ++ &dio->metadata_offset); ++ sector = get_data_sector(ic, area, offset); ++ logical_sector = dio->range.logical_sector; ++ ++ page = mempool_alloc(&ic->recheck_pool, GFP_NOIO); ++ buffer = page_to_virt(page); ++ ++ __bio_for_each_segment(bv, bio, iter, dio->bio_details.bi_iter) { ++ unsigned pos = 0; ++ ++ do { ++ char *mem; ++ int r; ++ struct dm_io_request io_req; ++ struct dm_io_region io_loc; ++ io_req.bi_opf = REQ_OP_READ; ++ io_req.mem.type = DM_IO_KMEM; ++ io_req.mem.ptr.addr = buffer; ++ io_req.notify.fn = NULL; ++ io_req.client = ic->io; ++ io_loc.bdev = ic->dev->bdev; ++ io_loc.sector = sector; ++ io_loc.count = ic->sectors_per_block; ++ ++ r = dm_io(&io_req, 1, &io_loc, NULL); ++ if (unlikely(r)) { ++ dio->bi_status = errno_to_blk_status(r); ++ goto free_ret; ++ } ++ ++ 
integrity_sector_checksum(ic, logical_sector, buffer, ++ checksum_onstack); ++ r = dm_integrity_rw_tag(ic, checksum_onstack, &dio->metadata_block, ++ &dio->metadata_offset, ic->tag_size, TAG_CMP); ++ if (r) { ++ if (r > 0) { ++ DMERR_LIMIT("%pg: Checksum failed at sector 0x%llx", ++ bio->bi_bdev, logical_sector); ++ atomic64_inc(&ic->number_of_mismatches); ++ dm_audit_log_bio(DM_MSG_PREFIX, "integrity-checksum", ++ bio, logical_sector, 0); ++ r = -EILSEQ; ++ } ++ dio->bi_status = errno_to_blk_status(r); ++ goto free_ret; ++ } ++ ++ mem = bvec_kmap_local(&bv); ++ memcpy(mem + pos, buffer, ic->sectors_per_block << SECTOR_SHIFT); ++ kunmap_local(mem); ++ ++ pos += ic->sectors_per_block << SECTOR_SHIFT; ++ sector += ic->sectors_per_block; ++ logical_sector += ic->sectors_per_block; ++ } while (pos < bv.bv_len); ++ } ++free_ret: ++ mempool_free(page, &ic->recheck_pool); ++} ++ + static void integrity_metadata(struct work_struct *w) + { + struct dm_integrity_io *dio = container_of(w, struct dm_integrity_io, work); +@@ -1776,15 +1851,8 @@ again: + checksums_ptr - checksums, dio->op == REQ_OP_READ ? 
TAG_CMP : TAG_WRITE); + if (unlikely(r)) { + if (r > 0) { +- sector_t s; +- +- s = sector - ((r + ic->tag_size - 1) / ic->tag_size); +- DMERR_LIMIT("%pg: Checksum failed at sector 0x%llx", +- bio->bi_bdev, s); +- r = -EILSEQ; +- atomic64_inc(&ic->number_of_mismatches); +- dm_audit_log_bio(DM_MSG_PREFIX, "integrity-checksum", +- bio, s, 0); ++ integrity_recheck(dio); ++ goto skip_io; + } + if (likely(checksums != checksums_onstack)) + kfree(checksums); +@@ -4261,6 +4329,12 @@ static int dm_integrity_ctr(struct dm_ta + goto bad; + } + ++ r = mempool_init_page_pool(&ic->recheck_pool, 1, 0); ++ if (r) { ++ ti->error = "Cannot allocate mempool"; ++ goto bad; ++ } ++ + ic->metadata_wq = alloc_workqueue("dm-integrity-metadata", + WQ_MEM_RECLAIM, METADATA_WORKQUEUE_MAX_ACTIVE); + if (!ic->metadata_wq) { +@@ -4609,6 +4683,7 @@ static void dm_integrity_dtr(struct dm_t + kvfree(ic->bbs); + if (ic->bufio) + dm_bufio_client_destroy(ic->bufio); ++ mempool_exit(&ic->recheck_pool); + mempool_exit(&ic->journal_io_mempool); + if (ic->io) + dm_io_client_destroy(ic->io); diff --git a/queue-6.7/dm-verity-recheck-the-hash-after-a-failure.patch b/queue-6.7/dm-verity-recheck-the-hash-after-a-failure.patch new file mode 100644 index 00000000000..93c0918c8c6 --- /dev/null +++ b/queue-6.7/dm-verity-recheck-the-hash-after-a-failure.patch @@ -0,0 +1,189 @@ +From 9177f3c0dea6143d05cac1bbd28668fd0e216d11 Mon Sep 17 00:00:00 2001 +From: Mikulas Patocka +Date: Mon, 19 Feb 2024 21:28:09 +0100 +Subject: dm-verity: recheck the hash after a failure + +From: Mikulas Patocka + +commit 9177f3c0dea6143d05cac1bbd28668fd0e216d11 upstream. + +If a userspace process reads (with O_DIRECT) multiple blocks into the same +buffer, dm-verity reports an error [1]. + +This commit fixes dm-verity, so that if hash verification fails, the data +is read again into a kernel buffer (where userspace can't modify it) and +the hash is rechecked. 
If the recheck succeeds, the content of the kernel +buffer is copied into the user buffer; if the recheck fails, an error is +reported. + +[1] https://people.redhat.com/~mpatocka/testcases/blk-auth-modify/read2.c + +Signed-off-by: Mikulas Patocka +Cc: stable@vger.kernel.org +Signed-off-by: Mike Snitzer +Signed-off-by: Greg Kroah-Hartman +--- + drivers/md/dm-verity-target.c | 86 +++++++++++++++++++++++++++++++++++++++--- + drivers/md/dm-verity.h | 6 ++ + 2 files changed, 86 insertions(+), 6 deletions(-) + +--- a/drivers/md/dm-verity-target.c ++++ b/drivers/md/dm-verity-target.c +@@ -482,6 +482,63 @@ int verity_for_bv_block(struct dm_verity + return 0; + } + ++static int verity_recheck_copy(struct dm_verity *v, struct dm_verity_io *io, ++ u8 *data, size_t len) ++{ ++ memcpy(data, io->recheck_buffer, len); ++ io->recheck_buffer += len; ++ ++ return 0; ++} ++ ++static int verity_recheck(struct dm_verity *v, struct dm_verity_io *io, ++ struct bvec_iter start, sector_t cur_block) ++{ ++ struct page *page; ++ void *buffer; ++ int r; ++ struct dm_io_request io_req; ++ struct dm_io_region io_loc; ++ ++ page = mempool_alloc(&v->recheck_pool, GFP_NOIO); ++ buffer = page_to_virt(page); ++ ++ io_req.bi_opf = REQ_OP_READ; ++ io_req.mem.type = DM_IO_KMEM; ++ io_req.mem.ptr.addr = buffer; ++ io_req.notify.fn = NULL; ++ io_req.client = v->io; ++ io_loc.bdev = v->data_dev->bdev; ++ io_loc.sector = cur_block << (v->data_dev_block_bits - SECTOR_SHIFT); ++ io_loc.count = 1 << (v->data_dev_block_bits - SECTOR_SHIFT); ++ r = dm_io(&io_req, 1, &io_loc, NULL); ++ if (unlikely(r)) ++ goto free_ret; ++ ++ r = verity_hash(v, verity_io_hash_req(v, io), buffer, ++ 1 << v->data_dev_block_bits, ++ verity_io_real_digest(v, io), true); ++ if (unlikely(r)) ++ goto free_ret; ++ ++ if (memcmp(verity_io_real_digest(v, io), ++ verity_io_want_digest(v, io), v->digest_size)) { ++ r = -EIO; ++ goto free_ret; ++ } ++ ++ io->recheck_buffer = buffer; ++ r = verity_for_bv_block(v, io, &start, 
verity_recheck_copy); ++ if (unlikely(r)) ++ goto free_ret; ++ ++ r = 0; ++free_ret: ++ mempool_free(page, &v->recheck_pool); ++ ++ return r; ++} ++ + static int verity_bv_zero(struct dm_verity *v, struct dm_verity_io *io, + u8 *data, size_t len) + { +@@ -508,9 +565,7 @@ static int verity_verify_io(struct dm_ve + { + bool is_zero; + struct dm_verity *v = io->v; +-#if defined(CONFIG_DM_VERITY_FEC) + struct bvec_iter start; +-#endif + struct bvec_iter iter_copy; + struct bvec_iter *iter; + struct crypto_wait wait; +@@ -561,10 +616,7 @@ static int verity_verify_io(struct dm_ve + if (unlikely(r < 0)) + return r; + +-#if defined(CONFIG_DM_VERITY_FEC) +- if (verity_fec_is_enabled(v)) +- start = *iter; +-#endif ++ start = *iter; + r = verity_for_io_block(v, io, iter, &wait); + if (unlikely(r < 0)) + return r; +@@ -586,6 +638,10 @@ static int verity_verify_io(struct dm_ve + * tasklet since it may sleep, so fallback to work-queue. + */ + return -EAGAIN; ++ } else if (verity_recheck(v, io, start, cur_block) == 0) { ++ if (v->validated_blocks) ++ set_bit(cur_block, v->validated_blocks); ++ continue; + #if defined(CONFIG_DM_VERITY_FEC) + } else if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_DATA, + cur_block, NULL, &start) == 0) { +@@ -941,6 +997,10 @@ static void verity_dtr(struct dm_target + if (v->verify_wq) + destroy_workqueue(v->verify_wq); + ++ mempool_exit(&v->recheck_pool); ++ if (v->io) ++ dm_io_client_destroy(v->io); ++ + if (v->bufio) + dm_bufio_client_destroy(v->bufio); + +@@ -1379,6 +1439,20 @@ static int verity_ctr(struct dm_target * + } + v->hash_blocks = hash_position; + ++ r = mempool_init_page_pool(&v->recheck_pool, 1, 0); ++ if (unlikely(r)) { ++ ti->error = "Cannot allocate mempool"; ++ goto bad; ++ } ++ ++ v->io = dm_io_client_create(); ++ if (IS_ERR(v->io)) { ++ r = PTR_ERR(v->io); ++ v->io = NULL; ++ ti->error = "Cannot allocate dm io"; ++ goto bad; ++ } ++ + v->bufio = dm_bufio_client_create(v->hash_dev->bdev, + 1 << v->hash_dev_block_bits, 1, 
sizeof(struct buffer_aux), + dm_bufio_alloc_callback, NULL, +--- a/drivers/md/dm-verity.h ++++ b/drivers/md/dm-verity.h +@@ -11,6 +11,7 @@ + #ifndef DM_VERITY_H + #define DM_VERITY_H + ++#include <linux/dm-io.h> + #include <linux/dm-bufio.h> + #include <linux/device-mapper.h> + #include <linux/interrupt.h> +@@ -68,6 +69,9 @@ struct dm_verity { + unsigned long *validated_blocks; /* bitset blocks validated */ + + char *signature_key_desc; /* signature keyring reference */ ++ ++ struct dm_io_client *io; ++ mempool_t recheck_pool; + }; + + struct dm_verity_io { +@@ -84,6 +88,8 @@ struct dm_verity_io { + + struct work_struct work; + ++ char *recheck_buffer; ++ + /* + * Three variably-size fields follow this struct: + * diff --git a/queue-6.7/docs-instruct-latex-to-cope-with-deeper-nesting.patch b/queue-6.7/docs-instruct-latex-to-cope-with-deeper-nesting.patch new file mode 100644 index 00000000000..af83ec7199f --- /dev/null +++ b/queue-6.7/docs-instruct-latex-to-cope-with-deeper-nesting.patch @@ -0,0 +1,43 @@ +From 0df8669f69a8638f04c6a3d1f3b7056c2c18f62c Mon Sep 17 00:00:00 2001 +From: Jonathan Corbet +Date: Mon, 19 Feb 2024 09:05:38 -0700 +Subject: docs: Instruct LaTeX to cope with deeper nesting + +From: Jonathan Corbet + +commit 0df8669f69a8638f04c6a3d1f3b7056c2c18f62c upstream. + +The addition of the XFS online fsck documentation starting with +commit a8f6c2e54ddc ("xfs: document the motivation for online fsck design") +added a deeper level of nesting than LaTeX is prepared to deal with. That +caused a pdfdocs build failure with the helpful "Too deeply nested" error +message buried deeply in Documentation/output/filesystems.log. + +Increase the "maxlistdepth" parameter to instruct LaTeX that it needs to +deal with the deeper nesting whether it wants to or not.
+ +Suggested-by: Akira Yokosawa +Tested-by: Akira Yokosawa +Cc: stable@vger.kernel.org # v6.4+ +Link: https://lore.kernel.org/linux-doc/67f6ac60-7957-4b92-9d72-a08fbad0e028@gmail.com/ +Signed-off-by: Jonathan Corbet +Signed-off-by: Greg Kroah-Hartman +--- + Documentation/conf.py | 6 ++++++ + 1 file changed, 6 insertions(+) + +--- a/Documentation/conf.py ++++ b/Documentation/conf.py +@@ -383,6 +383,12 @@ latex_elements = { + verbatimhintsturnover=false, + ''', + ++ # ++ # Some of our authors are fond of deep nesting; tell latex to ++ # cope. ++ # ++ 'maxlistdepth': '10', ++ + # For CJK One-half spacing, need to be in front of hyperref + 'extrapackages': r'\usepackage{setspace}', + diff --git a/queue-6.7/drm-amd-display-adjust-few-initialization-order-in-dm.patch b/queue-6.7/drm-amd-display-adjust-few-initialization-order-in-dm.patch new file mode 100644 index 00000000000..740e166ef9a --- /dev/null +++ b/queue-6.7/drm-amd-display-adjust-few-initialization-order-in-dm.patch @@ -0,0 +1,107 @@ +From 22e1dc4b2fec17af70f297a4295c5f19a0f3fbeb Mon Sep 17 00:00:00 2001 +From: Wayne Lin +Date: Fri, 2 Feb 2024 17:34:11 +0800 +Subject: drm/amd/display: adjust few initialization order in dm + +From: Wayne Lin + +commit 22e1dc4b2fec17af70f297a4295c5f19a0f3fbeb upstream. + +[Why] +Observe error message "Can't retrieve aconnector in hpd_rx_irq_offload_work" +when boot up with a mst tbt4 dock connected. After analyzing, there are few +parts needed to be adjusted: + +1. hpd_rx_offload_wq[].aconnector is not initialized before the dmub outbox +hpd_irq handler get registered which causes the error message. + +2. registration of hpd and hpd_rx_irq event for usb4 dp tunneling is not +aligned with legacy interface sequence + +[How] +Put DMUB_NOTIFICATION_HPD and DMUB_NOTIFICATION_HPD_IRQ handler +registration into register_hpd_handlers() to align other interfaces and +get hpd_rx_offload_wq[].aconnector initialized earlier than that.
+ +Leave DMUB_NOTIFICATION_AUX_REPLY registered as it was since we need that +while calling dc_link_detect(). USB4 connection status will be proactively +detected by dc_link_detect_connection_type() in amdgpu_dm_initialize_drm_device() + +Cc: Stable +Reviewed-by: Aurabindo Pillai +Acked-by: Rodrigo Siqueira +Tested-by: Daniel Wheeler +Signed-off-by: Wayne Lin +Signed-off-by: Alex Deucher +Signed-off-by: Greg Kroah-Hartman +--- + drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 37 ++++++++++------------ + 1 file changed, 18 insertions(+), 19 deletions(-) + +--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c ++++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +@@ -1810,21 +1810,12 @@ static int amdgpu_dm_init(struct amdgpu_ + DRM_ERROR("amdgpu: fail to register dmub aux callback"); + goto error; + } +- if (!register_dmub_notify_callback(adev, DMUB_NOTIFICATION_HPD, dmub_hpd_callback, true)) { +- DRM_ERROR("amdgpu: fail to register dmub hpd callback"); +- goto error; +- } +- if (!register_dmub_notify_callback(adev, DMUB_NOTIFICATION_HPD_IRQ, dmub_hpd_callback, true)) { +- DRM_ERROR("amdgpu: fail to register dmub hpd callback"); +- goto error; +- } +- } +- +- /* Enable outbox notification only after IRQ handlers are registered and DMUB is alive. +- * It is expected that DMUB will resend any pending notifications at this point, for +- * example HPD from DPIA. +- */ +- if (dc_is_dmub_outbox_supported(adev->dm.dc)) { ++ /* Enable outbox notification only after IRQ handlers are registered and DMUB is alive. ++ * It is expected that DMUB will resend any pending notifications at this point. Note ++ * that hpd and hpd_irq handler registration are deferred to register_hpd_handlers() to ++ * align legacy interface initialization sequence. Connection status will be proactivly ++ * detected once in the amdgpu_dm_initialize_drm_device. 
++ */ + dc_enable_dmub_outbox(adev->dm.dc); + + /* DPIA trace goes to dmesg logs only if outbox is enabled */ +@@ -3494,6 +3485,14 @@ static void register_hpd_handlers(struct + int_params.requested_polarity = INTERRUPT_POLARITY_DEFAULT; + int_params.current_polarity = INTERRUPT_POLARITY_DEFAULT; + ++ if (dc_is_dmub_outbox_supported(adev->dm.dc)) { ++ if (!register_dmub_notify_callback(adev, DMUB_NOTIFICATION_HPD, dmub_hpd_callback, true)) ++ DRM_ERROR("amdgpu: fail to register dmub hpd callback"); ++ ++ if (!register_dmub_notify_callback(adev, DMUB_NOTIFICATION_HPD_IRQ, dmub_hpd_callback, true)) ++ DRM_ERROR("amdgpu: fail to register dmub hpd callback"); ++ } ++ + list_for_each_entry(connector, + &dev->mode_config.connector_list, head) { + +@@ -3519,10 +3518,6 @@ static void register_hpd_handlers(struct + handle_hpd_rx_irq, + (void *) aconnector); + } +- +- if (adev->dm.hpd_rx_offload_wq) +- adev->dm.hpd_rx_offload_wq[connector->index].aconnector = +- aconnector; + } + } + +@@ -4493,6 +4488,10 @@ static int amdgpu_dm_initialize_drm_devi + + link = dc_get_link_at_index(dm->dc, i); + ++ if (dm->hpd_rx_offload_wq) ++ dm->hpd_rx_offload_wq[aconnector->base.index].aconnector = ++ aconnector; ++ + if (!dc_link_detect_connection_type(link, &new_connection_type)) + DRM_ERROR("KMS: Failed to detect connector\n"); + diff --git a/queue-6.7/drm-amd-display-only-allow-dig-mapping-to-pwrseq-in-new-asic.patch b/queue-6.7/drm-amd-display-only-allow-dig-mapping-to-pwrseq-in-new-asic.patch new file mode 100644 index 00000000000..833cb5e36fe --- /dev/null +++ b/queue-6.7/drm-amd-display-only-allow-dig-mapping-to-pwrseq-in-new-asic.patch @@ -0,0 +1,131 @@ +From 4e73826089ce899357580bbf6e0afe4e6f9900b7 Mon Sep 17 00:00:00 2001 +From: Lewis Huang +Date: Wed, 31 Jan 2024 17:20:17 +0800 +Subject: drm/amd/display: Only allow dig mapping to pwrseq in new asic + +From: Lewis Huang + +commit 4e73826089ce899357580bbf6e0afe4e6f9900b7 upstream. + +[Why] +The old asic only have 1 pwrseq hw. 
+We don't need to map the diginst to pwrseq inst in old asic. + +[How] +1. Only mapping dig to pwrseq for new asic. +2. Move mapping function into dcn specific panel control component + +Cc: Stable # v6.6+ +Cc: Mario Limonciello +Link: https://gitlab.freedesktop.org/drm/amd/-/issues/3122 +Reviewed-by: Anthony Koo +Acked-by: Rodrigo Siqueira +Tested-by: Daniel Wheeler +Signed-off-by: Lewis Huang +Signed-off-by: Alex Deucher +Signed-off-by: Greg Kroah-Hartman +--- + drivers/gpu/drm/amd/display/dc/dce/dce_panel_cntl.c | 1 + drivers/gpu/drm/amd/display/dc/dcn301/dcn301_panel_cntl.c | 1 + drivers/gpu/drm/amd/display/dc/dcn31/dcn31_panel_cntl.c | 18 +++++++++ + drivers/gpu/drm/amd/display/dc/inc/hw/panel_cntl.h | 2 - + drivers/gpu/drm/amd/display/dc/link/link_factory.c | 26 -------------- + 5 files changed, 21 insertions(+), 27 deletions(-) + +--- a/drivers/gpu/drm/amd/display/dc/dce/dce_panel_cntl.c ++++ b/drivers/gpu/drm/amd/display/dc/dce/dce_panel_cntl.c +@@ -290,4 +290,5 @@ void dce_panel_cntl_construct( + dce_panel_cntl->base.funcs = &dce_link_panel_cntl_funcs; + dce_panel_cntl->base.ctx = init_data->ctx; + dce_panel_cntl->base.inst = init_data->inst; ++ dce_panel_cntl->base.pwrseq_inst = 0; + } +--- a/drivers/gpu/drm/amd/display/dc/dcn301/dcn301_panel_cntl.c ++++ b/drivers/gpu/drm/amd/display/dc/dcn301/dcn301_panel_cntl.c +@@ -215,4 +215,5 @@ void dcn301_panel_cntl_construct( + dcn301_panel_cntl->base.funcs = &dcn301_link_panel_cntl_funcs; + dcn301_panel_cntl->base.ctx = init_data->ctx; + dcn301_panel_cntl->base.inst = init_data->inst; ++ dcn301_panel_cntl->base.pwrseq_inst = 0; + } +--- a/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_panel_cntl.c ++++ b/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_panel_cntl.c +@@ -154,8 +154,24 @@ void dcn31_panel_cntl_construct( + struct dcn31_panel_cntl *dcn31_panel_cntl, + const struct panel_cntl_init_data *init_data) + { ++ uint8_t pwrseq_inst = 0xF; ++ + dcn31_panel_cntl->base.funcs = &dcn31_link_panel_cntl_funcs; + 
dcn31_panel_cntl->base.ctx = init_data->ctx; + dcn31_panel_cntl->base.inst = init_data->inst; +- dcn31_panel_cntl->base.pwrseq_inst = init_data->pwrseq_inst; ++ ++ switch (init_data->eng_id) { ++ case ENGINE_ID_DIGA: ++ pwrseq_inst = 0; ++ break; ++ case ENGINE_ID_DIGB: ++ pwrseq_inst = 1; ++ break; ++ default: ++ DC_LOG_WARNING("Unsupported pwrseq engine id: %d!\n", init_data->eng_id); ++ ASSERT(false); ++ break; ++ } ++ ++ dcn31_panel_cntl->base.pwrseq_inst = pwrseq_inst; + } +--- a/drivers/gpu/drm/amd/display/dc/inc/hw/panel_cntl.h ++++ b/drivers/gpu/drm/amd/display/dc/inc/hw/panel_cntl.h +@@ -56,7 +56,7 @@ struct panel_cntl_funcs { + struct panel_cntl_init_data { + struct dc_context *ctx; + uint32_t inst; +- uint32_t pwrseq_inst; ++ uint32_t eng_id; + }; + + struct panel_cntl { +--- a/drivers/gpu/drm/amd/display/dc/link/link_factory.c ++++ b/drivers/gpu/drm/amd/display/dc/link/link_factory.c +@@ -368,30 +368,6 @@ static enum transmitter translate_encode + } + } + +-static uint8_t translate_dig_inst_to_pwrseq_inst(struct dc_link *link) +-{ +- uint8_t pwrseq_inst = 0xF; +- struct dc_context *dc_ctx = link->dc->ctx; +- +- DC_LOGGER_INIT(dc_ctx->logger); +- +- switch (link->eng_id) { +- case ENGINE_ID_DIGA: +- pwrseq_inst = 0; +- break; +- case ENGINE_ID_DIGB: +- pwrseq_inst = 1; +- break; +- default: +- DC_LOG_WARNING("Unsupported pwrseq engine id: %d!\n", link->eng_id); +- ASSERT(false); +- break; +- } +- +- return pwrseq_inst; +-} +- +- + static void link_destruct(struct dc_link *link) + { + int i; +@@ -655,7 +631,7 @@ static bool construct_phy(struct dc_link + link->link_id.id == CONNECTOR_ID_LVDS)) { + panel_cntl_init_data.ctx = dc_ctx; + panel_cntl_init_data.inst = panel_cntl_init_data.ctx->dc_edp_id_count; +- panel_cntl_init_data.pwrseq_inst = translate_dig_inst_to_pwrseq_inst(link); ++ panel_cntl_init_data.eng_id = link->eng_id; + link->panel_cntl = + link->dc->res_pool->funcs->panel_cntl_create( + &panel_cntl_init_data); diff --git 
a/queue-6.7/drm-amdgpu-fix-the-runtime-resume-failure-issue.patch b/queue-6.7/drm-amdgpu-fix-the-runtime-resume-failure-issue.patch new file mode 100644 index 00000000000..85c70e9fa20 --- /dev/null +++ b/queue-6.7/drm-amdgpu-fix-the-runtime-resume-failure-issue.patch @@ -0,0 +1,34 @@ +From bbfaf2aea7164db59739728d62d9cc91d64ff856 Mon Sep 17 00:00:00 2001 +From: Ma Jun +Date: Wed, 21 Feb 2024 17:16:49 +0800 +Subject: drm/amdgpu: Fix the runtime resume failure issue + +From: Ma Jun + +commit bbfaf2aea7164db59739728d62d9cc91d64ff856 upstream. + +Don't set power state flag when system enter runtime suspend, +or it may cause runtime resume failure issue. + +Fixes: 3a9626c816db ("drm/amd: Stop evicting resources on APUs in suspend") +Signed-off-by: Ma Jun +Reviewed-by: Mario Limonciello +Signed-off-by: Alex Deucher +Cc: stable@vger.kernel.org +Signed-off-by: Greg Kroah-Hartman +--- + drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c +@@ -1528,6 +1528,9 @@ bool amdgpu_acpi_is_s0ix_active(struct a + */ + void amdgpu_choose_low_power_state(struct amdgpu_device *adev) + { ++ if (adev->in_runpm) ++ return; ++ + if (amdgpu_acpi_is_s0ix_active(adev)) + adev->in_s0ix = true; + else if (amdgpu_acpi_is_s3_active(adev)) diff --git a/queue-6.7/drm-buddy-modify-duplicate-list_splice_tail-call.patch b/queue-6.7/drm-buddy-modify-duplicate-list_splice_tail-call.patch new file mode 100644 index 00000000000..15050b0cfed --- /dev/null +++ b/queue-6.7/drm-buddy-modify-duplicate-list_splice_tail-call.patch @@ -0,0 +1,50 @@ +From 02f76a9cd4494719600baf1ab278930df39431ab Mon Sep 17 00:00:00 2001 +From: Arunpravin Paneer Selvam +Date: Fri, 16 Feb 2024 15:30:48 +0530 +Subject: drm/buddy: Modify duplicate list_splice_tail call +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Arunpravin Paneer Selvam + +commit 
02f76a9cd4494719600baf1ab278930df39431ab upstream. + +Remove the duplicate list_splice_tail call when the +total_allocated < size condition is true. + +Cc: # 6.7+ +Fixes: 8746c6c9dfa3 ("drm/buddy: Fix alloc_range() error handling code") +Reported-by: Bert Karwatzki +Signed-off-by: Arunpravin Paneer Selvam +Reviewed-by: Matthew Auld +Link: https://patchwork.freedesktop.org/patch/msgid/20240216100048.4101-1-Arunpravin.PaneerSelvam@amd.com +Signed-off-by: Christian König +Signed-off-by: Greg Kroah-Hartman +--- + drivers/gpu/drm/drm_buddy.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c +index c1a99bf4dffd..c4222b886db7 100644 +--- a/drivers/gpu/drm/drm_buddy.c ++++ b/drivers/gpu/drm/drm_buddy.c +@@ -538,13 +538,13 @@ static int __alloc_range(struct drm_buddy *mm, + list_add(&block->left->tmp_link, dfs); + } while (1); + +- list_splice_tail(&allocated, blocks); +- + if (total_allocated < size) { + err = -ENOSPC; + goto err_free; + } + ++ list_splice_tail(&allocated, blocks); ++ + return 0; + + err_undo: +-- +2.44.0 + diff --git a/queue-6.7/drm-meson-don-t-remove-bridges-which-are-created-by-other-drivers.patch b/queue-6.7/drm-meson-don-t-remove-bridges-which-are-created-by-other-drivers.patch new file mode 100644 index 00000000000..fe2b7976d18 --- /dev/null +++ b/queue-6.7/drm-meson-don-t-remove-bridges-which-are-created-by-other-drivers.patch @@ -0,0 +1,66 @@ +From bd915ae73a2d78559b376ad2caf5e4ef51de2455 Mon Sep 17 00:00:00 2001 +From: Martin Blumenstingl +Date: Thu, 15 Feb 2024 23:04:42 +0100 +Subject: drm/meson: Don't remove bridges which are created by other drivers + +From: Martin Blumenstingl + +commit bd915ae73a2d78559b376ad2caf5e4ef51de2455 upstream. + +Stop calling drm_bridge_remove() for bridges allocated/managed by other +drivers in the remove paths of meson_encoder_{cvbs,dsi,hdmi}. +drm_bridge_remove() unregisters the bridge so it cannot be used +anymore. 
Doing so for bridges we don't own can lead to the video +pipeline not being able to come up after -EPROBE_DEFER of the VPU +because we're unregistering a bridge that's managed by another driver. +The other driver doesn't know that we have unregistered it's bridge +and on subsequent .probe() we're not able to find those bridges anymore +(since nobody re-creates them). + +This fixes probe errors on Meson8b boards with the CVBS outputs enabled. + +Fixes: 09847723c12f ("drm/meson: remove drm bridges at aggregate driver unbind time") +Fixes: 42dcf15f901c ("drm/meson: add DSI encoder") +Cc: +Reported-by: Steve Morvai +Signed-off-by: Martin Blumenstingl +Reviewed-by: Neil Armstrong +Tested-by: Steve Morvai +Link: https://lore.kernel.org/r/20240215220442.1343152-1-martin.blumenstingl@googlemail.com +Reviewed-by: Neil Armstrong +Signed-off-by: Neil Armstrong +Link: https://patchwork.freedesktop.org/patch/msgid/20240215220442.1343152-1-martin.blumenstingl@googlemail.com +Signed-off-by: Greg Kroah-Hartman +--- + drivers/gpu/drm/meson/meson_encoder_cvbs.c | 1 - + drivers/gpu/drm/meson/meson_encoder_dsi.c | 1 - + drivers/gpu/drm/meson/meson_encoder_hdmi.c | 1 - + 3 files changed, 3 deletions(-) + +--- a/drivers/gpu/drm/meson/meson_encoder_cvbs.c ++++ b/drivers/gpu/drm/meson/meson_encoder_cvbs.c +@@ -294,6 +294,5 @@ void meson_encoder_cvbs_remove(struct me + if (priv->encoders[MESON_ENC_CVBS]) { + meson_encoder_cvbs = priv->encoders[MESON_ENC_CVBS]; + drm_bridge_remove(&meson_encoder_cvbs->bridge); +- drm_bridge_remove(meson_encoder_cvbs->next_bridge); + } + } +--- a/drivers/gpu/drm/meson/meson_encoder_dsi.c ++++ b/drivers/gpu/drm/meson/meson_encoder_dsi.c +@@ -168,6 +168,5 @@ void meson_encoder_dsi_remove(struct mes + if (priv->encoders[MESON_ENC_DSI]) { + meson_encoder_dsi = priv->encoders[MESON_ENC_DSI]; + drm_bridge_remove(&meson_encoder_dsi->bridge); +- drm_bridge_remove(meson_encoder_dsi->next_bridge); + } + } +--- a/drivers/gpu/drm/meson/meson_encoder_hdmi.c ++++ 
b/drivers/gpu/drm/meson/meson_encoder_hdmi.c +@@ -474,6 +474,5 @@ void meson_encoder_hdmi_remove(struct me + if (priv->encoders[MESON_ENC_HDMI]) { + meson_encoder_hdmi = priv->encoders[MESON_ENC_HDMI]; + drm_bridge_remove(&meson_encoder_hdmi->bridge); +- drm_bridge_remove(meson_encoder_hdmi->next_bridge); + } + } diff --git a/queue-6.7/drm-ttm-fix-an-invalid-freeing-on-already-freed-page-in-error-path.patch b/queue-6.7/drm-ttm-fix-an-invalid-freeing-on-already-freed-page-in-error-path.patch new file mode 100644 index 00000000000..e8899af4ac0 --- /dev/null +++ b/queue-6.7/drm-ttm-fix-an-invalid-freeing-on-already-freed-page-in-error-path.patch @@ -0,0 +1,49 @@ +From 40510a941d27d405a82dc3320823d875f94625df Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Thomas=20Hellstr=C3=B6m?= +Date: Wed, 21 Feb 2024 08:33:24 +0100 +Subject: drm/ttm: Fix an invalid freeing on already freed page in error path +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Thomas Hellström + +commit 40510a941d27d405a82dc3320823d875f94625df upstream. + +If caching mode change fails due to, for example, OOM we +free the allocated pages in a two-step process. First the pages +for which the caching change has already succeeded. Secondly +the pages for which a caching change did not succeed. + +However the second step was incorrectly freeing the pages already +freed in the first step. + +Fix. 
+ +Signed-off-by: Thomas Hellström +Fixes: 379989e7cbdc ("drm/ttm/pool: Fix ttm_pool_alloc error path") +Cc: Christian König +Cc: Dave Airlie +Cc: Christian Koenig +Cc: Huang Rui +Cc: dri-devel@lists.freedesktop.org +Cc: # v6.4+ +Reviewed-by: Matthew Auld +Reviewed-by: Christian König +Link: https://patchwork.freedesktop.org/patch/msgid/20240221073324.3303-1-thomas.hellstrom@linux.intel.com +Signed-off-by: Greg Kroah-Hartman +--- + drivers/gpu/drm/ttm/ttm_pool.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/gpu/drm/ttm/ttm_pool.c ++++ b/drivers/gpu/drm/ttm/ttm_pool.c +@@ -387,7 +387,7 @@ static void ttm_pool_free_range(struct t + enum ttm_caching caching, + pgoff_t start_page, pgoff_t end_page) + { +- struct page **pages = tt->pages; ++ struct page **pages = &tt->pages[start_page]; + unsigned int order; + pgoff_t i, nr; + diff --git a/queue-6.7/fs-aio-restrict-kiocb_set_cancel_fn-to-i-o-submitted-via-libaio.patch b/queue-6.7/fs-aio-restrict-kiocb_set_cancel_fn-to-i-o-submitted-via-libaio.patch new file mode 100644 index 00000000000..2c0eda06e18 --- /dev/null +++ b/queue-6.7/fs-aio-restrict-kiocb_set_cancel_fn-to-i-o-submitted-via-libaio.patch @@ -0,0 +1,83 @@ +From b820de741ae48ccf50dd95e297889c286ff4f760 Mon Sep 17 00:00:00 2001 +From: Bart Van Assche +Date: Thu, 15 Feb 2024 12:47:38 -0800 +Subject: fs/aio: Restrict kiocb_set_cancel_fn() to I/O submitted via libaio + +From: Bart Van Assche + +commit b820de741ae48ccf50dd95e297889c286ff4f760 upstream. 
+ +If kiocb_set_cancel_fn() is called for I/O submitted via io_uring, the +following kernel warning appears: + +WARNING: CPU: 3 PID: 368 at fs/aio.c:598 kiocb_set_cancel_fn+0x9c/0xa8 +Call trace: + kiocb_set_cancel_fn+0x9c/0xa8 + ffs_epfile_read_iter+0x144/0x1d0 + io_read+0x19c/0x498 + io_issue_sqe+0x118/0x27c + io_submit_sqes+0x25c/0x5fc + __arm64_sys_io_uring_enter+0x104/0xab0 + invoke_syscall+0x58/0x11c + el0_svc_common+0xb4/0xf4 + do_el0_svc+0x2c/0xb0 + el0_svc+0x2c/0xa4 + el0t_64_sync_handler+0x68/0xb4 + el0t_64_sync+0x1a4/0x1a8 + +Fix this by setting the IOCB_AIO_RW flag for read and write I/O that is +submitted by libaio. + +Suggested-by: Jens Axboe +Cc: Christoph Hellwig +Cc: Avi Kivity +Cc: Sandeep Dhavale +Cc: Jens Axboe +Cc: Greg Kroah-Hartman +Cc: Kent Overstreet +Cc: stable@vger.kernel.org +Signed-off-by: Bart Van Assche +Link: https://lore.kernel.org/r/20240215204739.2677806-2-bvanassche@acm.org +Signed-off-by: Christian Brauner +Signed-off-by: Greg Kroah-Hartman +--- + fs/aio.c | 9 ++++++++- + include/linux/fs.h | 2 ++ + 2 files changed, 10 insertions(+), 1 deletion(-) + +--- a/fs/aio.c ++++ b/fs/aio.c +@@ -594,6 +594,13 @@ void kiocb_set_cancel_fn(struct kiocb *i + struct kioctx *ctx = req->ki_ctx; + unsigned long flags; + ++ /* ++ * kiocb didn't come from aio or is neither a read nor a write, hence ++ * ignore it. 
++ */ ++ if (!(iocb->ki_flags & IOCB_AIO_RW)) ++ return; ++ + if (WARN_ON_ONCE(!list_empty(&req->ki_list))) + return; + +@@ -1463,7 +1470,7 @@ static int aio_prep_rw(struct kiocb *req + req->ki_complete = aio_complete_rw; + req->private = NULL; + req->ki_pos = iocb->aio_offset; +- req->ki_flags = req->ki_filp->f_iocb_flags; ++ req->ki_flags = req->ki_filp->f_iocb_flags | IOCB_AIO_RW; + if (iocb->aio_flags & IOCB_FLAG_RESFD) + req->ki_flags |= IOCB_EVENTFD; + if (iocb->aio_flags & IOCB_FLAG_IOPRIO) { +--- a/include/linux/fs.h ++++ b/include/linux/fs.h +@@ -352,6 +352,8 @@ enum rw_hint { + * unrelated IO (like cache flushing, new IO generation, etc). + */ + #define IOCB_DIO_CALLER_COMP (1 << 22) ++/* kiocb is a read or write operation submitted by fs/aio.c. */ ++#define IOCB_AIO_RW (1 << 23) + + /* for use in trace events */ + #define TRACE_IOCB_STRINGS \ diff --git a/queue-6.7/gtp-fix-use-after-free-and-null-ptr-deref-in-gtp_genl_dump_pdp.patch b/queue-6.7/gtp-fix-use-after-free-and-null-ptr-deref-in-gtp_genl_dump_pdp.patch new file mode 100644 index 00000000000..f305f1ff177 --- /dev/null +++ b/queue-6.7/gtp-fix-use-after-free-and-null-ptr-deref-in-gtp_genl_dump_pdp.patch @@ -0,0 +1,97 @@ +From 136cfaca22567a03bbb3bf53a43d8cb5748b80ec Mon Sep 17 00:00:00 2001 +From: Vasiliy Kovalev +Date: Wed, 14 Feb 2024 19:27:33 +0300 +Subject: gtp: fix use-after-free and null-ptr-deref in gtp_genl_dump_pdp() + +From: Vasiliy Kovalev + +commit 136cfaca22567a03bbb3bf53a43d8cb5748b80ec upstream. + +The gtp_net_ops pernet operations structure for the subsystem must be +registered before registering the generic netlink family. 
+ +Syzkaller hit 'general protection fault in gtp_genl_dump_pdp' bug: + +general protection fault, probably for non-canonical address +0xdffffc0000000002: 0000 [#1] PREEMPT SMP KASAN NOPTI +KASAN: null-ptr-deref in range [0x0000000000000010-0x0000000000000017] +CPU: 1 PID: 5826 Comm: gtp Not tainted 6.8.0-rc3-std-def-alt1 #1 +Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.0-alt1 04/01/2014 +RIP: 0010:gtp_genl_dump_pdp+0x1be/0x800 [gtp] +Code: c6 89 c6 e8 64 e9 86 df 58 45 85 f6 0f 85 4e 04 00 00 e8 c5 ee 86 + df 48 8b 54 24 18 48 b8 00 00 00 00 00 fc ff df 48 c1 ea 03 <80> + 3c 02 00 0f 85 de 05 00 00 48 8b 44 24 18 4c 8b 30 4c 39 f0 74 +RSP: 0018:ffff888014107220 EFLAGS: 00010202 +RAX: dffffc0000000000 RBX: 0000000000000000 RCX: 0000000000000000 +RDX: 0000000000000002 RSI: 0000000000000000 RDI: 0000000000000000 +RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000 +R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 +R13: ffff88800fcda588 R14: 0000000000000001 R15: 0000000000000000 +FS: 00007f1be4eb05c0(0000) GS:ffff88806ce80000(0000) knlGS:0000000000000000 +CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +CR2: 00007f1be4e766cf CR3: 000000000c33e000 CR4: 0000000000750ef0 +PKRU: 55555554 +Call Trace: + + ? show_regs+0x90/0xa0 + ? die_addr+0x50/0xd0 + ? exc_general_protection+0x148/0x220 + ? asm_exc_general_protection+0x22/0x30 + ? gtp_genl_dump_pdp+0x1be/0x800 [gtp] + ? __alloc_skb+0x1dd/0x350 + ? __pfx___alloc_skb+0x10/0x10 + genl_dumpit+0x11d/0x230 + netlink_dump+0x5b9/0xce0 + ? lockdep_hardirqs_on_prepare+0x253/0x430 + ? __pfx_netlink_dump+0x10/0x10 + ? kasan_save_track+0x10/0x40 + ? __kasan_kmalloc+0x9b/0xa0 + ? genl_start+0x675/0x970 + __netlink_dump_start+0x6fc/0x9f0 + genl_family_rcv_msg_dumpit+0x1bb/0x2d0 + ? __pfx_genl_family_rcv_msg_dumpit+0x10/0x10 + ? genl_op_from_small+0x2a/0x440 + ? cap_capable+0x1d0/0x240 + ? __pfx_genl_start+0x10/0x10 + ? __pfx_genl_dumpit+0x10/0x10 + ? __pfx_genl_done+0x10/0x10 + ? 
security_capable+0x9d/0xe0 + +Cc: stable@vger.kernel.org +Signed-off-by: Vasiliy Kovalev +Fixes: 459aa660eb1d ("gtp: add initial driver for datapath of GPRS Tunneling Protocol (GTP-U)") +Link: https://lore.kernel.org/r/20240214162733.34214-1-kovalev@altlinux.org +Signed-off-by: Jakub Kicinski +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/gtp.c | 10 +++++----- + 1 file changed, 5 insertions(+), 5 deletions(-) + +--- a/drivers/net/gtp.c ++++ b/drivers/net/gtp.c +@@ -1907,20 +1907,20 @@ static int __init gtp_init(void) + if (err < 0) + goto error_out; + +- err = genl_register_family(&gtp_genl_family); ++ err = register_pernet_subsys(&gtp_net_ops); + if (err < 0) + goto unreg_rtnl_link; + +- err = register_pernet_subsys(&gtp_net_ops); ++ err = genl_register_family(&gtp_genl_family); + if (err < 0) +- goto unreg_genl_family; ++ goto unreg_pernet_subsys; + + pr_info("GTP module loaded (pdp ctx size %zd bytes)\n", + sizeof(struct pdp_ctx)); + return 0; + +-unreg_genl_family: +- genl_unregister_family(&gtp_genl_family); ++unreg_pernet_subsys: ++ unregister_pernet_subsys(&gtp_net_ops); + unreg_rtnl_link: + rtnl_link_unregister(&gtp_link_ops); + error_out: diff --git a/queue-6.7/kvm-arm64-vgic-its-test-for-valid-irq-in-its_sync_lpi_pending_table.patch b/queue-6.7/kvm-arm64-vgic-its-test-for-valid-irq-in-its_sync_lpi_pending_table.patch new file mode 100644 index 00000000000..3b11d5c393b --- /dev/null +++ b/queue-6.7/kvm-arm64-vgic-its-test-for-valid-irq-in-its_sync_lpi_pending_table.patch @@ -0,0 +1,36 @@ +From 8d3a7dfb801d157ac423261d7cd62c33e95375f8 Mon Sep 17 00:00:00 2001 +From: Oliver Upton +Date: Wed, 21 Feb 2024 09:27:31 +0000 +Subject: KVM: arm64: vgic-its: Test for valid IRQ in its_sync_lpi_pending_table() + +From: Oliver Upton + +commit 8d3a7dfb801d157ac423261d7cd62c33e95375f8 upstream. + +vgic_get_irq() may not return a valid descriptor if there is no ITS that +holds a valid translation for the specified INTID.
If that is the case, +it is safe to silently ignore it and continue processing the LPI pending +table. + +Cc: stable@vger.kernel.org +Fixes: 33d3bc9556a7 ("KVM: arm64: vgic-its: Read initial LPI pending table") +Signed-off-by: Oliver Upton +Link: https://lore.kernel.org/r/20240221092732.4126848-2-oliver.upton@linux.dev +Signed-off-by: Marc Zyngier +Signed-off-by: Greg Kroah-Hartman +--- + arch/arm64/kvm/vgic/vgic-its.c | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/arch/arm64/kvm/vgic/vgic-its.c ++++ b/arch/arm64/kvm/vgic/vgic-its.c +@@ -468,6 +468,9 @@ static int its_sync_lpi_pending_table(st + } + + irq = vgic_get_irq(vcpu->kvm, NULL, intids[i]); ++ if (!irq) ++ continue; ++ + raw_spin_lock_irqsave(&irq->irq_lock, flags); + irq->pending_latch = pendmask & (1U << bit_nr); + vgic_queue_irq_unlock(vcpu->kvm, irq, flags); diff --git a/queue-6.7/kvm-arm64-vgic-its-test-for-valid-irq-in-movall-handler.patch b/queue-6.7/kvm-arm64-vgic-its-test-for-valid-irq-in-movall-handler.patch new file mode 100644 index 00000000000..001fbc7ea75 --- /dev/null +++ b/queue-6.7/kvm-arm64-vgic-its-test-for-valid-irq-in-movall-handler.patch @@ -0,0 +1,35 @@ +From 85a71ee9a0700f6c18862ef3b0011ed9dad99aca Mon Sep 17 00:00:00 2001 +From: Oliver Upton +Date: Wed, 21 Feb 2024 09:27:32 +0000 +Subject: KVM: arm64: vgic-its: Test for valid IRQ in MOVALL handler + +From: Oliver Upton + +commit 85a71ee9a0700f6c18862ef3b0011ed9dad99aca upstream. + +It is possible that an LPI mapped in a different ITS gets unmapped while +handling the MOVALL command. If that is the case, there is no state that +can be migrated to the destination. Silently ignore it and continue +migrating other LPIs. 
+ +Cc: stable@vger.kernel.org +Fixes: ff9c114394aa ("KVM: arm/arm64: GICv4: Handle MOVALL applied to a vPE") +Signed-off-by: Oliver Upton +Link: https://lore.kernel.org/r/20240221092732.4126848-3-oliver.upton@linux.dev +Signed-off-by: Marc Zyngier +Signed-off-by: Greg Kroah-Hartman +--- + arch/arm64/kvm/vgic/vgic-its.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/arch/arm64/kvm/vgic/vgic-its.c ++++ b/arch/arm64/kvm/vgic/vgic-its.c +@@ -1432,6 +1432,8 @@ static int vgic_its_cmd_handle_movall(st + + for (i = 0; i < irq_count; i++) { + irq = vgic_get_irq(kvm, NULL, intids[i]); ++ if (!irq) ++ continue; + + update_affinity(irq, vcpu2); + diff --git a/queue-6.7/lib-kconfig.debug-test_iov_iter-depends-on-mmu.patch b/queue-6.7/lib-kconfig.debug-test_iov_iter-depends-on-mmu.patch new file mode 100644 index 00000000000..b6a55fe1983 --- /dev/null +++ b/queue-6.7/lib-kconfig.debug-test_iov_iter-depends-on-mmu.patch @@ -0,0 +1,44 @@ +From 1eb1e984379e2da04361763f66eec90dd75cf63e Mon Sep 17 00:00:00 2001 +From: Guenter Roeck +Date: Thu, 8 Feb 2024 07:30:10 -0800 +Subject: lib/Kconfig.debug: TEST_IOV_ITER depends on MMU + +From: Guenter Roeck + +commit 1eb1e984379e2da04361763f66eec90dd75cf63e upstream. + +Trying to run the iov_iter unit test on a nommu system such as the qemu +kc705-nommu emulation results in a crash. + + KTAP version 1 + # Subtest: iov_iter + # module: kunit_iov_iter + 1..9 +BUG: failure at mm/nommu.c:318/vmap()! +Kernel panic - not syncing: BUG! + +The test calls vmap() directly, but vmap() is not supported on nommu +systems, causing the crash. TEST_IOV_ITER therefore needs to depend on +MMU. 
+ +Link: https://lkml.kernel.org/r/20240208153010.1439753-1-linux@roeck-us.net +Fixes: 2d71340ff1d4 ("iov_iter: Kunit tests for copying to/from an iterator") +Signed-off-by: Guenter Roeck +Cc: David Howells +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + lib/Kconfig.debug | 1 + + 1 file changed, 1 insertion(+) + +--- a/lib/Kconfig.debug ++++ b/lib/Kconfig.debug +@@ -2234,6 +2234,7 @@ config TEST_DIV64 + config TEST_IOV_ITER + tristate "Test iov_iter operation" if !KUNIT_ALL_TESTS + depends on KUNIT ++ depends on MMU + default KUNIT_ALL_TESTS + help + Enable this to turn on testing of the operation of the I/O iterator diff --git a/queue-6.7/loongarch-call-early_init_fdt_scan_reserved_mem-earlier.patch b/queue-6.7/loongarch-call-early_init_fdt_scan_reserved_mem-earlier.patch new file mode 100644 index 00000000000..b99d22eb311 --- /dev/null +++ b/queue-6.7/loongarch-call-early_init_fdt_scan_reserved_mem-earlier.patch @@ -0,0 +1,49 @@ +From 9fa304b9f8ec440e614af6d35826110c633c4074 Mon Sep 17 00:00:00 2001 +From: Huacai Chen +Date: Fri, 23 Feb 2024 14:36:31 +0800 +Subject: LoongArch: Call early_init_fdt_scan_reserved_mem() earlier + +From: Huacai Chen + +commit 9fa304b9f8ec440e614af6d35826110c633c4074 upstream. + +The unflatten_and_copy_device_tree() function contains a call to +memblock_alloc(). This means that memblock is allocating memory before +any of the reserved memory regions are set aside in the arch_mem_init() +function which calls early_init_fdt_scan_reserved_mem(). Therefore, +there is a possibility for memblock to allocate from any of the +reserved memory regions. + +Hence, move the call to early_init_fdt_scan_reserved_mem() to be earlier +in the init sequence, so that the reserved memory regions are set aside +before any allocations are done using memblock. 
+ +Cc: stable@vger.kernel.org +Fixes: 88d4d957edc707e ("LoongArch: Add FDT booting support from efi system table") +Signed-off-by: Oreoluwa Babatunde +Signed-off-by: Huacai Chen +Signed-off-by: Greg Kroah-Hartman +--- + arch/loongarch/kernel/setup.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/arch/loongarch/kernel/setup.c ++++ b/arch/loongarch/kernel/setup.c +@@ -366,6 +366,8 @@ void __init platform_init(void) + acpi_gbl_use_default_register_widths = false; + acpi_boot_table_init(); + #endif ++ ++ early_init_fdt_scan_reserved_mem(); + unflatten_and_copy_device_tree(); + + #ifdef CONFIG_NUMA +@@ -399,8 +401,6 @@ static void __init arch_mem_init(char ** + + check_kernel_sections_mem(); + +- early_init_fdt_scan_reserved_mem(); +- + /* + * In order to reduce the possibility of kernel panic when failed to + * get IO TLB memory under CONFIG_SWIOTLB, it is better to allocate diff --git a/queue-6.7/loongarch-disable-irq-before-init_fn-for-nonboot-cpus.patch b/queue-6.7/loongarch-disable-irq-before-init_fn-for-nonboot-cpus.patch new file mode 100644 index 00000000000..4906d722505 --- /dev/null +++ b/queue-6.7/loongarch-disable-irq-before-init_fn-for-nonboot-cpus.patch @@ -0,0 +1,74 @@ +From 1001db6c42e4012b55e5ee19405490f23e033b5a Mon Sep 17 00:00:00 2001 +From: Huacai Chen +Date: Fri, 23 Feb 2024 14:36:31 +0800 +Subject: LoongArch: Disable IRQ before init_fn() for nonboot CPUs + +From: Huacai Chen + +commit 1001db6c42e4012b55e5ee19405490f23e033b5a upstream. 
+ +Disable IRQ before init_fn() for nonboot CPUs when hotplug, in order to +silence such warnings (and also avoid potential errors due to unexpected +interrupts): + +WARNING: CPU: 1 PID: 0 at kernel/rcu/tree.c:4503 rcu_cpu_starting+0x214/0x280 +CPU: 1 PID: 0 Comm: swapper/1 Not tainted 6.6.17+ #1198 +pc 90000000048e3334 ra 90000000047bd56c tp 900000010039c000 sp 900000010039fdd0 +a0 0000000000000001 a1 0000000000000006 a2 900000000802c040 a3 0000000000000000 +a4 0000000000000001 a5 0000000000000004 a6 0000000000000000 a7 90000000048e3f4c +t0 0000000000000001 t1 9000000005c70968 t2 0000000004000000 t3 000000000005e56e +t4 00000000000002e4 t5 0000000000001000 t6 ffffffff80000000 t7 0000000000040000 +t8 9000000007931638 u0 0000000000000006 s9 0000000000000004 s0 0000000000000001 +s1 9000000006356ac0 s2 9000000007244000 s3 0000000000000001 s4 0000000000000001 +s5 900000000636f000 s6 7fffffffffffffff s7 9000000002123940 s8 9000000001ca55f8 + ra: 90000000047bd56c tlb_init+0x24c/0x528 + ERA: 90000000048e3334 rcu_cpu_starting+0x214/0x280 + CRMD: 000000b0 (PLV0 -IE -DA +PG DACF=CC DACM=CC -WE) + PRMD: 00000000 (PPLV0 -PIE -PWE) + EUEN: 00000000 (-FPE -SXE -ASXE -BTE) + ECFG: 00071000 (LIE=12 VS=7) +ESTAT: 000c0000 [BRK] (IS= ECode=12 EsubCode=0) + PRID: 0014c010 (Loongson-64bit, Loongson-3A5000) +CPU: 1 PID: 0 Comm: swapper/1 Not tainted 6.6.17+ #1198 +Stack : 0000000000000000 9000000006375000 9000000005b61878 900000010039c000 + 900000010039fa30 0000000000000000 900000010039fa38 900000000619a140 + 9000000006456888 9000000006456880 900000010039f950 0000000000000001 + 0000000000000001 cb0cb028ec7e52e1 0000000002b90000 9000000100348700 + 0000000000000000 0000000000000001 ffffffff916d12f1 0000000000000003 + 0000000000040000 9000000007930370 0000000002b90000 0000000000000004 + 9000000006366000 900000000619a140 0000000000000000 0000000000000004 + 0000000000000000 0000000000000009 ffffffffffc681f2 9000000002123940 + 9000000001ca55f8 9000000006366000 90000000047a4828 
00007ffff057ded8 + 00000000000000b0 0000000000000000 0000000000000000 0000000000071000 + ... +Call Trace: +[<90000000047a4828>] show_stack+0x48/0x1a0 +[<9000000005b61874>] dump_stack_lvl+0x84/0xcc +[<90000000047f60ac>] __warn+0x8c/0x1e0 +[<9000000005b0ab34>] report_bug+0x1b4/0x280 +[<9000000005b63110>] do_bp+0x2d0/0x480 +[<90000000047a2e20>] handle_bp+0x120/0x1c0 +[<90000000048e3334>] rcu_cpu_starting+0x214/0x280 +[<90000000047bd568>] tlb_init+0x248/0x528 +[<90000000047a4c44>] per_cpu_trap_init+0x124/0x160 +[<90000000047a19f4>] cpu_probe+0x494/0xa00 +[<90000000047b551c>] start_secondary+0x3c/0xc0 +[<9000000005b66134>] smpboot_entry+0x50/0x58 + +Cc: stable@vger.kernel.org +Signed-off-by: Huacai Chen +Signed-off-by: Greg Kroah-Hartman +--- + arch/loongarch/kernel/smp.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/arch/loongarch/kernel/smp.c ++++ b/arch/loongarch/kernel/smp.c +@@ -334,6 +334,7 @@ void __noreturn arch_cpu_idle_dead(void) + addr = iocsr_read64(LOONGARCH_IOCSR_MBUF0); + } while (addr == 0); + ++ local_irq_disable(); + init_fn = (void *)TO_CACHE(addr); + iocsr_write32(0xffffffff, LOONGARCH_IOCSR_IPI_CLEAR); + diff --git a/queue-6.7/loongarch-update-cpu_sibling_map-when-disabling-nonboot-cpus.patch b/queue-6.7/loongarch-update-cpu_sibling_map-when-disabling-nonboot-cpus.patch new file mode 100644 index 00000000000..ebe4d2598f3 --- /dev/null +++ b/queue-6.7/loongarch-update-cpu_sibling_map-when-disabling-nonboot-cpus.patch @@ -0,0 +1,208 @@ +From 752cd08da320a667a833803a8fd6bb266114cce5 Mon Sep 17 00:00:00 2001 +From: Huacai Chen +Date: Fri, 23 Feb 2024 14:36:31 +0800 +Subject: LoongArch: Update cpu_sibling_map when disabling nonboot CPUs + +From: Huacai Chen + +commit 752cd08da320a667a833803a8fd6bb266114cce5 upstream. + +Update cpu_sibling_map when disabling nonboot CPUs by defining & calling +clear_cpu_sibling_map(), otherwise we get such errors on SMT systems: + +jump label: negative count! 
+WARNING: CPU: 6 PID: 45 at kernel/jump_label.c:263 __static_key_slow_dec_cpuslocked+0xec/0x100 +CPU: 6 PID: 45 Comm: cpuhp/6 Not tainted 6.8.0-rc5+ #1340 +pc 90000000004c302c ra 90000000004c302c tp 90000001005bc000 sp 90000001005bfd20 +a0 000000000000001b a1 900000000224c278 a2 90000001005bfb58 a3 900000000224c280 +a4 900000000224c278 a5 90000001005bfb50 a6 0000000000000001 a7 0000000000000001 +t0 ce87a4763eb5234a t1 ce87a4763eb5234a t2 0000000000000000 t3 0000000000000000 +t4 0000000000000006 t5 0000000000000000 t6 0000000000000064 t7 0000000000001964 +t8 000000000009ebf6 u0 9000000001f2a068 s9 0000000000000000 s0 900000000246a2d8 +s1 ffffffffffffffff s2 ffffffffffffffff s3 90000000021518c0 s4 0000000000000040 +s5 9000000002151058 s6 9000000009828e40 s7 00000000000000b4 s8 0000000000000006 + ra: 90000000004c302c __static_key_slow_dec_cpuslocked+0xec/0x100 + ERA: 90000000004c302c __static_key_slow_dec_cpuslocked+0xec/0x100 + CRMD: 000000b0 (PLV0 -IE -DA +PG DACF=CC DACM=CC -WE) + PRMD: 00000004 (PPLV0 +PIE -PWE) + EUEN: 00000000 (-FPE -SXE -ASXE -BTE) + ECFG: 00071c1c (LIE=2-4,10-12 VS=7) +ESTAT: 000c0000 [BRK] (IS= ECode=12 EsubCode=0) + PRID: 0014d000 (Loongson-64bit, Loongson-3A6000-HV) +CPU: 6 PID: 45 Comm: cpuhp/6 Not tainted 6.8.0-rc5+ #1340 +Stack : 0000000000000000 900000000203f258 900000000179afc8 90000001005bc000 + 90000001005bf980 0000000000000000 90000001005bf988 9000000001fe0be0 + 900000000224c280 900000000224c278 90000001005bf8c0 0000000000000001 + 0000000000000001 ce87a4763eb5234a 0000000007f38000 90000001003f8cc0 + 0000000000000000 0000000000000006 0000000000000000 4c206e6f73676e6f + 6f4c203a656d616e 000000000009ec99 0000000007f38000 0000000000000000 + 900000000214b000 9000000001fe0be0 0000000000000004 0000000000000000 + 0000000000000107 0000000000000009 ffffffffffafdabe 00000000000000b4 + 0000000000000006 90000000004c302c 9000000000224528 00005555939a0c7c + 00000000000000b0 0000000000000004 0000000000000000 0000000000071c1c + ... 
+Call Trace: +[<9000000000224528>] show_stack+0x48/0x1a0 +[<900000000179afc8>] dump_stack_lvl+0x78/0xa0 +[<9000000000263ed0>] __warn+0x90/0x1a0 +[<90000000017419b8>] report_bug+0x1b8/0x280 +[<900000000179c564>] do_bp+0x264/0x420 +[<90000000004c302c>] __static_key_slow_dec_cpuslocked+0xec/0x100 +[<90000000002b4d7c>] sched_cpu_deactivate+0x2fc/0x300 +[<9000000000266498>] cpuhp_invoke_callback+0x178/0x8a0 +[<9000000000267f70>] cpuhp_thread_fun+0xf0/0x240 +[<90000000002a117c>] smpboot_thread_fn+0x1dc/0x2e0 +[<900000000029a720>] kthread+0x140/0x160 +[<9000000000222288>] ret_from_kernel_thread+0xc/0xa4 + +Cc: stable@vger.kernel.org +Signed-off-by: Huacai Chen +Signed-off-by: Greg Kroah-Hartman +--- + arch/loongarch/kernel/smp.c | 121 ++++++++++++++++++++++++-------------------- + 1 file changed, 68 insertions(+), 53 deletions(-) + +--- a/arch/loongarch/kernel/smp.c ++++ b/arch/loongarch/kernel/smp.c +@@ -88,6 +88,73 @@ void show_ipi_list(struct seq_file *p, i + } + } + ++static inline void set_cpu_core_map(int cpu) ++{ ++ int i; ++ ++ cpumask_set_cpu(cpu, &cpu_core_setup_map); ++ ++ for_each_cpu(i, &cpu_core_setup_map) { ++ if (cpu_data[cpu].package == cpu_data[i].package) { ++ cpumask_set_cpu(i, &cpu_core_map[cpu]); ++ cpumask_set_cpu(cpu, &cpu_core_map[i]); ++ } ++ } ++} ++ ++static inline void set_cpu_sibling_map(int cpu) ++{ ++ int i; ++ ++ cpumask_set_cpu(cpu, &cpu_sibling_setup_map); ++ ++ for_each_cpu(i, &cpu_sibling_setup_map) { ++ if (cpus_are_siblings(cpu, i)) { ++ cpumask_set_cpu(i, &cpu_sibling_map[cpu]); ++ cpumask_set_cpu(cpu, &cpu_sibling_map[i]); ++ } ++ } ++} ++ ++static inline void clear_cpu_sibling_map(int cpu) ++{ ++ int i; ++ ++ for_each_cpu(i, &cpu_sibling_setup_map) { ++ if (cpus_are_siblings(cpu, i)) { ++ cpumask_clear_cpu(i, &cpu_sibling_map[cpu]); ++ cpumask_clear_cpu(cpu, &cpu_sibling_map[i]); ++ } ++ } ++ ++ cpumask_clear_cpu(cpu, &cpu_sibling_setup_map); ++} ++ ++/* ++ * Calculate a new cpu_foreign_map mask whenever a ++ * new cpu appears or 
disappears. ++ */ ++void calculate_cpu_foreign_map(void) ++{ ++ int i, k, core_present; ++ cpumask_t temp_foreign_map; ++ ++ /* Re-calculate the mask */ ++ cpumask_clear(&temp_foreign_map); ++ for_each_online_cpu(i) { ++ core_present = 0; ++ for_each_cpu(k, &temp_foreign_map) ++ if (cpus_are_siblings(i, k)) ++ core_present = 1; ++ if (!core_present) ++ cpumask_set_cpu(i, &temp_foreign_map); ++ } ++ ++ for_each_online_cpu(i) ++ cpumask_andnot(&cpu_foreign_map[i], ++ &temp_foreign_map, &cpu_sibling_map[i]); ++} ++ + /* Send mailbox buffer via Mail_Send */ + static void csr_mail_send(uint64_t data, int cpu, int mailbox) + { +@@ -300,6 +367,7 @@ int loongson_cpu_disable(void) + numa_remove_cpu(cpu); + #endif + set_cpu_online(cpu, false); ++ clear_cpu_sibling_map(cpu); + calculate_cpu_foreign_map(); + local_irq_save(flags); + irq_migrate_all_off_this_cpu(); +@@ -377,59 +445,6 @@ static int __init ipi_pm_init(void) + core_initcall(ipi_pm_init); + #endif + +-static inline void set_cpu_sibling_map(int cpu) +-{ +- int i; +- +- cpumask_set_cpu(cpu, &cpu_sibling_setup_map); +- +- for_each_cpu(i, &cpu_sibling_setup_map) { +- if (cpus_are_siblings(cpu, i)) { +- cpumask_set_cpu(i, &cpu_sibling_map[cpu]); +- cpumask_set_cpu(cpu, &cpu_sibling_map[i]); +- } +- } +-} +- +-static inline void set_cpu_core_map(int cpu) +-{ +- int i; +- +- cpumask_set_cpu(cpu, &cpu_core_setup_map); +- +- for_each_cpu(i, &cpu_core_setup_map) { +- if (cpu_data[cpu].package == cpu_data[i].package) { +- cpumask_set_cpu(i, &cpu_core_map[cpu]); +- cpumask_set_cpu(cpu, &cpu_core_map[i]); +- } +- } +-} +- +-/* +- * Calculate a new cpu_foreign_map mask whenever a +- * new cpu appears or disappears. 
+- */ +-void calculate_cpu_foreign_map(void) +-{ +- int i, k, core_present; +- cpumask_t temp_foreign_map; +- +- /* Re-calculate the mask */ +- cpumask_clear(&temp_foreign_map); +- for_each_online_cpu(i) { +- core_present = 0; +- for_each_cpu(k, &temp_foreign_map) +- if (cpus_are_siblings(i, k)) +- core_present = 1; +- if (!core_present) +- cpumask_set_cpu(i, &temp_foreign_map); +- } +- +- for_each_online_cpu(i) +- cpumask_andnot(&cpu_foreign_map[i], +- &temp_foreign_map, &cpu_sibling_map[i]); +-} +- + /* Preload SMP state for boot cpu */ + void smp_prepare_boot_cpu(void) + { diff --git a/queue-6.7/md-don-t-ignore-read-only-array-in-md_check_recovery.patch b/queue-6.7/md-don-t-ignore-read-only-array-in-md_check_recovery.patch new file mode 100644 index 00000000000..2c729214591 --- /dev/null +++ b/queue-6.7/md-don-t-ignore-read-only-array-in-md_check_recovery.patch @@ -0,0 +1,131 @@ +From 55a48ad2db64737f7ffc0407634218cc6e4c513b Mon Sep 17 00:00:00 2001 +From: Yu Kuai +Date: Thu, 1 Feb 2024 17:25:47 +0800 +Subject: md: Don't ignore read-only array in md_check_recovery() + +From: Yu Kuai + +commit 55a48ad2db64737f7ffc0407634218cc6e4c513b upstream. + +Usually if the array is not read-write, md_check_recovery() won't +register new sync_thread in the first place. And if the array is +read-write and sync_thread is registered, md_set_readonly() will +unregister sync_thread before setting the array read-only. md/raid +follow this behavior hence there is no problem. + +After commit f52f5c71f3d4 ("md: fix stopping sync thread"), following +hang can be triggered by test shell/integrity-caching.sh: + +1) array is read-only. dm-raid update super block: +rs_update_sbs + ro = mddev->ro + mddev->ro = 0 + -> set array read-write + md_update_sb + +2) register new sync thread concurrently. 
+ +3) dm-raid set array back to read-only: +rs_update_sbs + mddev->ro = ro + +4) stop the array: +raid_dtr + md_stop + stop_sync_thread + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + md_wakeup_thread_directly(mddev->sync_thread); + wait_event(..., !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + +5) sync thread done: + md_do_sync + set_bit(MD_RECOVERY_DONE, &mddev->recovery); + md_wakeup_thread(mddev->thread); + +6) daemon thread can't unregister sync thread: + md_check_recovery + if (!md_is_rdwr(mddev) && + !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) + return; + -> -> MD_RECOVERY_RUNNING can't be cleared, hence step 4 hang; + +The root cause is that dm-raid manipulates 'mddev->ro' by itself, +however, dm-raid really should stop sync thread before setting the +array read-only. Unfortunately, I need to read more code before I +can refactor the handler of 'mddev->ro' in dm-raid, hence let's fix +the problem the easy way for now to prevent dm-raid regression. + +Reported-by: Mikulas Patocka +Closes: https://lore.kernel.org/all/9801e40-8ac7-e225-6a71-309dcf9dc9aa@redhat.com/ +Fixes: ecbfb9f118bc ("dm raid: add raid level takeover support") +Fixes: f52f5c71f3d4 ("md: fix stopping sync thread") +Cc: stable@vger.kernel.org # v6.7+ +Signed-off-by: Yu Kuai +Signed-off-by: Song Liu +Link: https://lore.kernel.org/r/20240201092559.910982-3-yukuai1@huaweicloud.com +Signed-off-by: Greg Kroah-Hartman +--- + drivers/md/md.c | 31 ++++++++++++++++++------------- + 1 file changed, 18 insertions(+), 13 deletions(-) + +--- a/drivers/md/md.c ++++ b/drivers/md/md.c +@@ -9495,6 +9495,20 @@ not_running: + sysfs_notify_dirent_safe(mddev->sysfs_action); + } + ++static void unregister_sync_thread(struct mddev *mddev) ++{ ++ if (!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { ++ /* resync/recovery still happening */ ++ clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); ++ return; ++ } ++ ++ if (WARN_ON_ONCE(!mddev->sync_thread)) ++ return; ++ ++ md_reap_sync_thread(mddev); ++} ++ + /* 
+ * This routine is regularly called by all per-raid-array threads to + * deal with generic issues like resync and super-block update. +@@ -9532,7 +9546,8 @@ void md_check_recovery(struct mddev *mdd + } + + if (!md_is_rdwr(mddev) && +- !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) ++ !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) && ++ !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) + return; + if ( ! ( + (mddev->sb_flags & ~ (1<recovery)) { +- /* sync_work already queued. */ +- clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); ++ unregister_sync_thread(mddev); + goto unlock; + } + +@@ -9618,16 +9632,7 @@ void md_check_recovery(struct mddev *mdd + * still set. + */ + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { +- if (!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { +- /* resync/recovery still happening */ +- clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); +- goto unlock; +- } +- +- if (WARN_ON_ONCE(!mddev->sync_thread)) +- goto unlock; +- +- md_reap_sync_thread(mddev); ++ unregister_sync_thread(mddev); + goto unlock; + } + diff --git a/queue-6.7/md-don-t-ignore-suspended-array-in-md_check_recovery.patch b/queue-6.7/md-don-t-ignore-suspended-array-in-md_check_recovery.patch new file mode 100644 index 00000000000..674389fad32 --- /dev/null +++ b/queue-6.7/md-don-t-ignore-suspended-array-in-md_check_recovery.patch @@ -0,0 +1,71 @@ +From 1baae052cccd08daf9a9d64c3f959d8cdb689757 Mon Sep 17 00:00:00 2001 +From: Yu Kuai +Date: Thu, 1 Feb 2024 17:25:46 +0800 +Subject: md: Don't ignore suspended array in md_check_recovery() + +From: Yu Kuai + +commit 1baae052cccd08daf9a9d64c3f959d8cdb689757 upstream. + +mddev_suspend() never stop sync_thread, hence it doesn't make sense to +ignore suspended array in md_check_recovery(), which might cause +sync_thread can't be unregistered. 
+ +After commit f52f5c71f3d4 ("md: fix stopping sync thread"), following +hang can be triggered by test shell/integrity-caching.sh: + +1) suspend the array: +raid_postsuspend + mddev_suspend + +2) stop the array: +raid_dtr + md_stop + __md_stop_writes + stop_sync_thread + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + md_wakeup_thread_directly(mddev->sync_thread); + wait_event(..., !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + +3) sync thread done: +md_do_sync + set_bit(MD_RECOVERY_DONE, &mddev->recovery); + md_wakeup_thread(mddev->thread); + +4) daemon thread can't unregister sync thread: +md_check_recovery + if (mddev->suspended) + return; -> return directly + md_reap_sync_thread + clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + -> MD_RECOVERY_RUNNING can't be cleared, hence step 2 hang; + +This problem is not just related to dm-raid, fix it by not ignoring +suspended array in md_check_recovery(). And follow up patches will +improve dm-raid better to freeze sync thread during suspend. 
+ +Reported-by: Mikulas Patocka +Closes: https://lore.kernel.org/all/8fb335e-6d2c-dbb5-d7-ded8db5145a@redhat.com/ +Fixes: 68866e425be2 ("MD: no sync IO while suspended") +Fixes: f52f5c71f3d4 ("md: fix stopping sync thread") +Cc: stable@vger.kernel.org # v6.7+ +Signed-off-by: Yu Kuai +Signed-off-by: Song Liu +Link: https://lore.kernel.org/r/20240201092559.910982-2-yukuai1@huaweicloud.com +Signed-off-by: Greg Kroah-Hartman +--- + drivers/md/md.c | 3 --- + 1 file changed, 3 deletions(-) + +--- a/drivers/md/md.c ++++ b/drivers/md/md.c +@@ -9519,9 +9519,6 @@ not_running: + */ + void md_check_recovery(struct mddev *mddev) + { +- if (READ_ONCE(mddev->suspended)) +- return; +- + if (mddev->bitmap) + md_bitmap_daemon_work(mddev); + diff --git a/queue-6.7/md-don-t-register-sync_thread-for-reshape-directly.patch b/queue-6.7/md-don-t-register-sync_thread-for-reshape-directly.patch new file mode 100644 index 00000000000..fbcdc36e02f --- /dev/null +++ b/queue-6.7/md-don-t-register-sync_thread-for-reshape-directly.patch @@ -0,0 +1,158 @@ +From ad39c08186f8a0f221337985036ba86731d6aafe Mon Sep 17 00:00:00 2001 +From: Yu Kuai +Date: Thu, 1 Feb 2024 17:25:49 +0800 +Subject: md: Don't register sync_thread for reshape directly + +From: Yu Kuai + +commit ad39c08186f8a0f221337985036ba86731d6aafe upstream. + +Currently, if reshape is interrupted, then reassemble the array will +register sync_thread directly from pers->run(), in this case +'MD_RECOVERY_RUNNING' is set directly, however, there is no guarantee +that md_do_sync() will be executed, hence stop_sync_thread() will hang +because 'MD_RECOVERY_RUNNING' can't be cleared. 
+ +The last patch makes sure that md_do_sync() will set MD_RECOVERY_DONE, +however, the following hang can still be triggered by dm-raid test +shell/lvconvert-raid-reshape.sh occasionally: + +[root@fedora ~]# cat /proc/1982/stack +[<0>] stop_sync_thread+0x1ab/0x270 [md_mod] +[<0>] md_frozen_sync_thread+0x5c/0xa0 [md_mod] +[<0>] raid_presuspend+0x1e/0x70 [dm_raid] +[<0>] dm_table_presuspend_targets+0x40/0xb0 [dm_mod] +[<0>] __dm_destroy+0x2a5/0x310 [dm_mod] +[<0>] dm_destroy+0x16/0x30 [dm_mod] +[<0>] dev_remove+0x165/0x290 [dm_mod] +[<0>] ctl_ioctl+0x4bb/0x7b0 [dm_mod] +[<0>] dm_ctl_ioctl+0x11/0x20 [dm_mod] +[<0>] vfs_ioctl+0x21/0x60 +[<0>] __x64_sys_ioctl+0xb9/0xe0 +[<0>] do_syscall_64+0xc6/0x230 +[<0>] entry_SYSCALL_64_after_hwframe+0x6c/0x74 + +Meanwhile mddev->recovery is: +MD_RECOVERY_RUNNING | +MD_RECOVERY_INTR | +MD_RECOVERY_RESHAPE | +MD_RECOVERY_FROZEN + +Fix this problem by removing the code to register sync_thread directly +from raid10 and raid5. And let md_check_recovery() register +sync_thread. 
+ +Fixes: f67055780caa ("[PATCH] md: Checkpoint and allow restart of raid5 reshape") +Fixes: f52f5c71f3d4 ("md: fix stopping sync thread") +Cc: stable@vger.kernel.org # v6.7+ +Signed-off-by: Yu Kuai +Signed-off-by: Song Liu +Link: https://lore.kernel.org/r/20240201092559.910982-5-yukuai1@huaweicloud.com +Signed-off-by: Greg Kroah-Hartman +--- + drivers/md/md.c | 5 ++++- + drivers/md/raid10.c | 16 ++-------------- + drivers/md/raid5.c | 29 ++--------------------------- + 3 files changed, 8 insertions(+), 42 deletions(-) + +--- a/drivers/md/md.c ++++ b/drivers/md/md.c +@@ -9422,6 +9422,7 @@ static void md_start_sync(struct work_st + struct mddev *mddev = container_of(ws, struct mddev, sync_work); + int spares = 0; + bool suspend = false; ++ char *name; + + if (md_spares_need_change(mddev)) + suspend = true; +@@ -9454,8 +9455,10 @@ static void md_start_sync(struct work_st + if (spares) + md_bitmap_write_all(mddev->bitmap); + ++ name = test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ? ++ "reshape" : "resync"; + rcu_assign_pointer(mddev->sync_thread, +- md_register_thread(md_do_sync, mddev, "resync")); ++ md_register_thread(md_do_sync, mddev, name)); + if (!mddev->sync_thread) { + pr_warn("%s: could not start resync thread...\n", + mdname(mddev)); +--- a/drivers/md/raid10.c ++++ b/drivers/md/raid10.c +@@ -4307,11 +4307,7 @@ static int raid10_run(struct mddev *mdde + clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); + set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); +- set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); +- rcu_assign_pointer(mddev->sync_thread, +- md_register_thread(md_do_sync, mddev, "reshape")); +- if (!mddev->sync_thread) +- goto out_free_conf; ++ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + } + + return 0; +@@ -4707,16 +4703,8 @@ out: + clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); + clear_bit(MD_RECOVERY_DONE, &mddev->recovery); + set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); +- 
set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); +- +- rcu_assign_pointer(mddev->sync_thread, +- md_register_thread(md_do_sync, mddev, "reshape")); +- if (!mddev->sync_thread) { +- ret = -EAGAIN; +- goto abort; +- } ++ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + conf->reshape_checkpoint = jiffies; +- md_wakeup_thread(mddev->sync_thread); + md_new_event(); + return 0; + +--- a/drivers/md/raid5.c ++++ b/drivers/md/raid5.c +@@ -8002,11 +8002,7 @@ static int raid5_run(struct mddev *mddev + clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); + set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); +- set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); +- rcu_assign_pointer(mddev->sync_thread, +- md_register_thread(md_do_sync, mddev, "reshape")); +- if (!mddev->sync_thread) +- goto abort; ++ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + } + + /* Ok, everything is just fine now */ +@@ -8585,29 +8581,8 @@ static int raid5_start_reshape(struct md + clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); + clear_bit(MD_RECOVERY_DONE, &mddev->recovery); + set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); +- set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); +- rcu_assign_pointer(mddev->sync_thread, +- md_register_thread(md_do_sync, mddev, "reshape")); +- if (!mddev->sync_thread) { +- mddev->recovery = 0; +- spin_lock_irq(&conf->device_lock); +- write_seqcount_begin(&conf->gen_lock); +- mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; +- mddev->new_chunk_sectors = +- conf->chunk_sectors = conf->prev_chunk_sectors; +- mddev->new_layout = conf->algorithm = conf->prev_algo; +- rdev_for_each(rdev, mddev) +- rdev->new_data_offset = rdev->data_offset; +- smp_wmb(); +- conf->generation --; +- conf->reshape_progress = MaxSector; +- mddev->reshape_position = MaxSector; +- write_seqcount_end(&conf->gen_lock); +- spin_unlock_irq(&conf->device_lock); +- return -EAGAIN; +- } ++ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + 
conf->reshape_checkpoint = jiffies; +- md_wakeup_thread(mddev->sync_thread); + md_new_event(); + return 0; + } diff --git a/queue-6.7/md-don-t-suspend-the-array-for-interrupted-reshape.patch b/queue-6.7/md-don-t-suspend-the-array-for-interrupted-reshape.patch new file mode 100644 index 00000000000..aabcf8db695 --- /dev/null +++ b/queue-6.7/md-don-t-suspend-the-array-for-interrupted-reshape.patch @@ -0,0 +1,68 @@ +From 9e46c70e829bddc24e04f963471e9983a11598b7 Mon Sep 17 00:00:00 2001 +From: Yu Kuai +Date: Thu, 1 Feb 2024 17:25:50 +0800 +Subject: md: Don't suspend the array for interrupted reshape + +From: Yu Kuai + +commit 9e46c70e829bddc24e04f963471e9983a11598b7 upstream. + +md_start_sync() will suspend the array if there are spares that can be +added or removed from conf, however, if reshape is still in progress, +this won't happen at all or data will be corrupted(remove_and_add_spares +won't be called from md_choose_sync_action for reshape), hence there is +no need to suspend the array if reshape is not done yet. + +Meanwhile, there is a potential deadlock for raid456: + +1) reshape is interrupted; + +2) set one of the disk WantReplacement, and add a new disk to the array, + however, recovery won't start until the reshape is finished; + +3) then issue an IO across reshape position, this IO will wait for + reshape to make progress; + +4) continue to reshape, then md_start_sync() found there is a spare disk + that can be added to conf, mddev_suspend() is called; + +Step 4 and step 3 are waiting for each other, deadlock triggered. Noted +this problem is found by code review, and it's not reproduced yet. + +Fix this problem by not suspending the array for interrupted reshape, +this is safe because conf won't be changed until reshape is done. 
+ +Fixes: bc08041b32ab ("md: suspend array in md_start_sync() if array need reconfiguration") +Cc: stable@vger.kernel.org # v6.7+ +Signed-off-by: Yu Kuai +Signed-off-by: Song Liu +Link: https://lore.kernel.org/r/20240201092559.910982-6-yukuai1@huaweicloud.com +Signed-off-by: Greg Kroah-Hartman +--- + drivers/md/md.c | 13 +++++++++---- + 1 file changed, 9 insertions(+), 4 deletions(-) + +--- a/drivers/md/md.c ++++ b/drivers/md/md.c +@@ -9424,12 +9424,17 @@ static void md_start_sync(struct work_st + bool suspend = false; + char *name; + +- if (md_spares_need_change(mddev)) ++ /* ++ * If reshape is still in progress, spares won't be added or removed ++ * from conf until reshape is done. ++ */ ++ if (mddev->reshape_position == MaxSector && ++ md_spares_need_change(mddev)) { + suspend = true; ++ mddev_suspend(mddev, false); ++ } + +- suspend ? mddev_suspend_and_lock_nointr(mddev) : +- mddev_lock_nointr(mddev); +- ++ mddev_lock_nointr(mddev); + if (!md_is_rdwr(mddev)) { + /* + * On a read-only array we can: diff --git a/queue-6.7/md-fix-missing-release-of-active_io-for-flush.patch b/queue-6.7/md-fix-missing-release-of-active_io-for-flush.patch new file mode 100644 index 00000000000..3004708c5e9 --- /dev/null +++ b/queue-6.7/md-fix-missing-release-of-active_io-for-flush.patch @@ -0,0 +1,58 @@ +From 855678ed8534518e2b428bcbcec695de9ba248e8 Mon Sep 17 00:00:00 2001 +From: Yu Kuai +Date: Thu, 1 Feb 2024 17:25:51 +0800 +Subject: md: Fix missing release of 'active_io' for flush + +From: Yu Kuai + +commit 855678ed8534518e2b428bcbcec695de9ba248e8 upstream. 
+ +submit_flushes + atomic_set(&mddev->flush_pending, 1); + rdev_for_each_rcu(rdev, mddev) + atomic_inc(&mddev->flush_pending); + bi->bi_end_io = md_end_flush + submit_bio(bi); + /* flush io is done first */ + md_end_flush + if (atomic_dec_and_test(&mddev->flush_pending)) + percpu_ref_put(&mddev->active_io) + -> active_io is not released + + if (atomic_dec_and_test(&mddev->flush_pending)) + -> missing release of active_io + +For consequence, mddev_suspend() will wait for 'active_io' to be zero +forever. + +Fix this problem by releasing 'active_io' in submit_flushes() if +'flush_pending' is decreased to zero. + +Fixes: fa2bbff7b0b4 ("md: synchronize flush io with array reconfiguration") +Cc: stable@vger.kernel.org # v6.1+ +Reported-by: Blazej Kucman +Closes: https://lore.kernel.org/lkml/20240130172524.0000417b@linux.intel.com/ +Signed-off-by: Yu Kuai +Signed-off-by: Song Liu +Link: https://lore.kernel.org/r/20240201092559.910982-7-yukuai1@huaweicloud.com +Signed-off-by: Greg Kroah-Hartman +--- + drivers/md/md.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +--- a/drivers/md/md.c ++++ b/drivers/md/md.c +@@ -579,8 +579,12 @@ static void submit_flushes(struct work_s + rcu_read_lock(); + } + rcu_read_unlock(); +- if (atomic_dec_and_test(&mddev->flush_pending)) ++ if (atomic_dec_and_test(&mddev->flush_pending)) { ++ /* The pair is percpu_ref_get() from md_flush_request() */ ++ percpu_ref_put(&mddev->active_io); ++ + queue_work(md_wq, &mddev->flush_work); ++ } + } + + static void md_submit_flush_data(struct work_struct *ws) diff --git a/queue-6.7/md-make-sure-md_do_sync-will-set-md_recovery_done.patch b/queue-6.7/md-make-sure-md_do_sync-will-set-md_recovery_done.patch new file mode 100644 index 00000000000..f8f375fe2f7 --- /dev/null +++ b/queue-6.7/md-make-sure-md_do_sync-will-set-md_recovery_done.patch @@ -0,0 +1,62 @@ +From 82ec0ae59d02e89164b24c0cc8e4e50de78b5fd6 Mon Sep 17 00:00:00 2001 +From: Yu Kuai +Date: Thu, 1 Feb 2024 17:25:48 +0800 +Subject: md: 
Make sure md_do_sync() will set MD_RECOVERY_DONE + +From: Yu Kuai + +commit 82ec0ae59d02e89164b24c0cc8e4e50de78b5fd6 upstream. + +stop_sync_thread() will interrupt md_do_sync(), and md_do_sync() must +set MD_RECOVERY_DONE, so that follow up md_check_recovery() will +unregister sync_thread, clear MD_RECOVERY_RUNNING and wake up +stop_sync_thread(). + +If MD_RECOVERY_WAIT is set or the array is read-only, md_do_sync() will +return without setting MD_RECOVERY_DONE, and after commit f52f5c71f3d4 +("md: fix stopping sync thread"), dm-raid switch from +md_reap_sync_thread() to stop_sync_thread() to unregister sync_thread +from md_stop() and md_stop_writes(), causing the test +shell/lvconvert-raid-reshape.sh hang. + +We shouldn't switch back to md_reap_sync_thread() because it's +problematic in the first place. Fix the problem by making sure +md_do_sync() will set MD_RECOVERY_DONE. + +Reported-by: Mikulas Patocka +Closes: https://lore.kernel.org/all/ece2b06f-d647-6613-a534-ff4c9bec1142@redhat.com/ +Fixes: d5d885fd514f ("md: introduce new personality funciton start()") +Fixes: 5fd6c1dce06e ("[PATCH] md: allow checkpoint of recovery with version-1 superblock") +Fixes: f52f5c71f3d4 ("md: fix stopping sync thread") +Cc: stable@vger.kernel.org # v6.7+ +Signed-off-by: Yu Kuai +Signed-off-by: Song Liu +Link: https://lore.kernel.org/r/20240201092559.910982-4-yukuai1@huaweicloud.com +Signed-off-by: Greg Kroah-Hartman +--- + drivers/md/md.c | 12 ++++++++---- + 1 file changed, 8 insertions(+), 4 deletions(-) + +--- a/drivers/md/md.c ++++ b/drivers/md/md.c +@@ -8813,12 +8813,16 @@ void md_do_sync(struct md_thread *thread + int ret; + + /* just incase thread restarts... 
*/ +- if (test_bit(MD_RECOVERY_DONE, &mddev->recovery) || +- test_bit(MD_RECOVERY_WAIT, &mddev->recovery)) ++ if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) + return; +- if (!md_is_rdwr(mddev)) {/* never try to sync a read-only array */ ++ ++ if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) ++ goto skip; ++ ++ if (test_bit(MD_RECOVERY_WAIT, &mddev->recovery) || ++ !md_is_rdwr(mddev)) {/* never try to sync a read-only array */ + set_bit(MD_RECOVERY_INTR, &mddev->recovery); +- return; ++ goto skip; + } + + if (mddev_is_clustered(mddev)) { diff --git a/queue-6.7/mm-damon-core-check-apply-interval-in-damon_do_apply_schemes.patch b/queue-6.7/mm-damon-core-check-apply-interval-in-damon_do_apply_schemes.patch new file mode 100644 index 00000000000..24bd2799d93 --- /dev/null +++ b/queue-6.7/mm-damon-core-check-apply-interval-in-damon_do_apply_schemes.patch @@ -0,0 +1,64 @@ +From e9e3db69966d5e9e6f7e7d017b407c0025180fe5 Mon Sep 17 00:00:00 2001 +From: SeongJae Park +Date: Mon, 5 Feb 2024 12:13:06 -0800 +Subject: mm/damon/core: check apply interval in damon_do_apply_schemes() + +From: SeongJae Park + +commit e9e3db69966d5e9e6f7e7d017b407c0025180fe5 upstream. + +kdamond_apply_schemes() checks apply intervals of schemes and avoid +further applying any schemes if no scheme passed its apply interval. +However, the following schemes applying function, damon_do_apply_schemes() +iterates all schemes without the apply interval check. As a result, the +shortest apply interval is applied to all schemes. Fix the problem by +checking the apply interval in damon_do_apply_schemes(). 
+ +Link: https://lkml.kernel.org/r/20240205201306.88562-1-sj@kernel.org +Fixes: 42f994b71404 ("mm/damon/core: implement scheme-specific apply interval") +Signed-off-by: SeongJae Park +Cc: [6.7.x] +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + mm/damon/core.c | 15 +++++++++++---- + 1 file changed, 11 insertions(+), 4 deletions(-) + +--- a/mm/damon/core.c ++++ b/mm/damon/core.c +@@ -1026,6 +1026,9 @@ static void damon_do_apply_schemes(struc + damon_for_each_scheme(s, c) { + struct damos_quota *quota = &s->quota; + ++ if (c->passed_sample_intervals != s->next_apply_sis) ++ continue; ++ + if (!s->wmarks.activated) + continue; + +@@ -1126,10 +1129,6 @@ static void kdamond_apply_schemes(struct + if (c->passed_sample_intervals != s->next_apply_sis) + continue; + +- s->next_apply_sis += +- (s->apply_interval_us ? s->apply_interval_us : +- c->attrs.aggr_interval) / sample_interval; +- + if (!s->wmarks.activated) + continue; + +@@ -1145,6 +1144,14 @@ static void kdamond_apply_schemes(struct + damon_for_each_region_safe(r, next_r, t) + damon_do_apply_schemes(c, t, r); + } ++ ++ damon_for_each_scheme(s, c) { ++ if (c->passed_sample_intervals != s->next_apply_sis) ++ continue; ++ s->next_apply_sis += ++ (s->apply_interval_us ? s->apply_interval_us : ++ c->attrs.aggr_interval) / sample_interval; ++ } + } + + /* diff --git a/queue-6.7/mm-damon-lru_sort-fix-quota-status-loss-due-to-online-tunings.patch b/queue-6.7/mm-damon-lru_sort-fix-quota-status-loss-due-to-online-tunings.patch new file mode 100644 index 00000000000..646244f4b31 --- /dev/null +++ b/queue-6.7/mm-damon-lru_sort-fix-quota-status-loss-due-to-online-tunings.patch @@ -0,0 +1,101 @@ +From 13d0599ab3b2ff17f798353f24bcbef1659d3cfc Mon Sep 17 00:00:00 2001 +From: SeongJae Park +Date: Fri, 16 Feb 2024 11:40:25 -0800 +Subject: mm/damon/lru_sort: fix quota status loss due to online tunings + +From: SeongJae Park + +commit 13d0599ab3b2ff17f798353f24bcbef1659d3cfc upstream. 
+ +For online parameters change, DAMON_LRU_SORT creates new schemes based on +latest values of the parameters and replaces the old schemes with the new +one. When creating it, the internal status of the quotas of the old +schemes is not preserved. As a result, charging of the quota starts from +zero after the online tuning. The data that collected to estimate the +throughput of the scheme's action is also reset, and therefore the +estimation should start from the scratch again. Because the throughput +estimation is being used to convert the time quota to the effective size +quota, this could result in temporal time quota inaccuracy. It would be +recovered over time, though. In short, the quota accuracy could be +temporarily degraded after online parameters update. + +Fix the problem by checking the case and copying the internal fields for +the status. + +Link: https://lkml.kernel.org/r/20240216194025.9207-3-sj@kernel.org +Fixes: 40e983cca927 ("mm/damon: introduce DAMON-based LRU-lists Sorting") +Signed-off-by: SeongJae Park +Cc: [6.0+] +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + mm/damon/lru_sort.c | 43 ++++++++++++++++++++++++++++++++++++------- + 1 file changed, 36 insertions(+), 7 deletions(-) + +--- a/mm/damon/lru_sort.c ++++ b/mm/damon/lru_sort.c +@@ -185,9 +185,21 @@ static struct damos *damon_lru_sort_new_ + return damon_lru_sort_new_scheme(&pattern, DAMOS_LRU_DEPRIO); + } + ++static void damon_lru_sort_copy_quota_status(struct damos_quota *dst, ++ struct damos_quota *src) ++{ ++ dst->total_charged_sz = src->total_charged_sz; ++ dst->total_charged_ns = src->total_charged_ns; ++ dst->charged_sz = src->charged_sz; ++ dst->charged_from = src->charged_from; ++ dst->charge_target_from = src->charge_target_from; ++ dst->charge_addr_from = src->charge_addr_from; ++} ++ + static int damon_lru_sort_apply_parameters(void) + { +- struct damos *scheme; ++ struct damos *scheme, *hot_scheme, *cold_scheme; ++ struct damos *old_hot_scheme = NULL, 
*old_cold_scheme = NULL; + unsigned int hot_thres, cold_thres; + int err = 0; + +@@ -195,18 +207,35 @@ static int damon_lru_sort_apply_paramete + if (err) + return err; + ++ damon_for_each_scheme(scheme, ctx) { ++ if (!old_hot_scheme) { ++ old_hot_scheme = scheme; ++ continue; ++ } ++ old_cold_scheme = scheme; ++ } ++ + hot_thres = damon_max_nr_accesses(&damon_lru_sort_mon_attrs) * + hot_thres_access_freq / 1000; +- scheme = damon_lru_sort_new_hot_scheme(hot_thres); +- if (!scheme) ++ hot_scheme = damon_lru_sort_new_hot_scheme(hot_thres); ++ if (!hot_scheme) + return -ENOMEM; +- damon_set_schemes(ctx, &scheme, 1); ++ if (old_hot_scheme) ++ damon_lru_sort_copy_quota_status(&hot_scheme->quota, ++ &old_hot_scheme->quota); + + cold_thres = cold_min_age / damon_lru_sort_mon_attrs.aggr_interval; +- scheme = damon_lru_sort_new_cold_scheme(cold_thres); +- if (!scheme) ++ cold_scheme = damon_lru_sort_new_cold_scheme(cold_thres); ++ if (!cold_scheme) { ++ damon_destroy_scheme(hot_scheme); + return -ENOMEM; +- damon_add_scheme(ctx, scheme); ++ } ++ if (old_cold_scheme) ++ damon_lru_sort_copy_quota_status(&cold_scheme->quota, ++ &old_cold_scheme->quota); ++ ++ damon_set_schemes(ctx, &hot_scheme, 1); ++ damon_add_scheme(ctx, cold_scheme); + + return damon_set_region_biggest_system_ram_default(target, + &monitor_region_start, diff --git a/queue-6.7/mm-damon-reclaim-fix-quota-stauts-loss-due-to-online-tunings.patch b/queue-6.7/mm-damon-reclaim-fix-quota-stauts-loss-due-to-online-tunings.patch new file mode 100644 index 00000000000..1f5e1386dcf --- /dev/null +++ b/queue-6.7/mm-damon-reclaim-fix-quota-stauts-loss-due-to-online-tunings.patch @@ -0,0 +1,80 @@ +From 1b0ca4e4ff10a2c8402e2cf70132c683e1c772e4 Mon Sep 17 00:00:00 2001 +From: SeongJae Park +Date: Fri, 16 Feb 2024 11:40:24 -0800 +Subject: mm/damon/reclaim: fix quota stauts loss due to online tunings + +From: SeongJae Park + +commit 1b0ca4e4ff10a2c8402e2cf70132c683e1c772e4 upstream. 
+ +Patch series "mm/damon: fix quota status loss due to online tunings". + +DAMON_RECLAIM and DAMON_LRU_SORT are not preserving internal quota status +when applying new user parameters, and hence could cause temporary quota +accuracy degradation. Fix it by preserving the status. + + +This patch (of 2): + +For online parameters change, DAMON_RECLAIM creates a new scheme based on +latest values of the parameters and replaces the old scheme with the new +one. When creating it, the internal status of the quota of the old +scheme is not preserved. As a result, charging of the quota starts from +zero after the online tuning. The data that was collected to estimate the +throughput of the scheme's action is also reset, and therefore the +estimation should start from scratch again. Because the throughput +estimation is being used to convert the time quota to the effective size +quota, this could result in temporary time quota inaccuracy. It would be +recovered over time, though. In short, the quota accuracy could be +temporarily degraded after online parameters update. + +Fix the problem by checking the case and copying the internal fields for +the status. 
+ +Link: https://lkml.kernel.org/r/20240216194025.9207-1-sj@kernel.org +Link: https://lkml.kernel.org/r/20240216194025.9207-2-sj@kernel.org +Fixes: e035c280f6df ("mm/damon/reclaim: support online inputs update") +Signed-off-by: SeongJae Park +Cc: [5.19+] +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + mm/damon/reclaim.c | 18 +++++++++++++++++- + 1 file changed, 17 insertions(+), 1 deletion(-) + +--- a/mm/damon/reclaim.c ++++ b/mm/damon/reclaim.c +@@ -150,9 +150,20 @@ static struct damos *damon_reclaim_new_s + &damon_reclaim_wmarks); + } + ++static void damon_reclaim_copy_quota_status(struct damos_quota *dst, ++ struct damos_quota *src) ++{ ++ dst->total_charged_sz = src->total_charged_sz; ++ dst->total_charged_ns = src->total_charged_ns; ++ dst->charged_sz = src->charged_sz; ++ dst->charged_from = src->charged_from; ++ dst->charge_target_from = src->charge_target_from; ++ dst->charge_addr_from = src->charge_addr_from; ++} ++ + static int damon_reclaim_apply_parameters(void) + { +- struct damos *scheme; ++ struct damos *scheme, *old_scheme; + struct damos_filter *filter; + int err = 0; + +@@ -164,6 +175,11 @@ static int damon_reclaim_apply_parameter + scheme = damon_reclaim_new_scheme(); + if (!scheme) + return -ENOMEM; ++ if (!list_empty(&ctx->schemes)) { ++ damon_for_each_scheme(old_scheme, ctx) ++ damon_reclaim_copy_quota_status(&scheme->quota, ++ &old_scheme->quota); ++ } + if (skip_anon) { + filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true); + if (!filter) { diff --git a/queue-6.7/mm-memblock-add-memblock_rsrv_noinit-into-flagname-array.patch b/queue-6.7/mm-memblock-add-memblock_rsrv_noinit-into-flagname-array.patch new file mode 100644 index 00000000000..580b1503917 --- /dev/null +++ b/queue-6.7/mm-memblock-add-memblock_rsrv_noinit-into-flagname-array.patch @@ -0,0 +1,35 @@ +From 4f155af0ae4464134bfcfd9f043b6b727c84e947 Mon Sep 17 00:00:00 2001 +From: Anshuman Khandual +Date: Fri, 9 Feb 2024 08:39:12 +0530 +Subject: mm/memblock: 
add MEMBLOCK_RSRV_NOINIT into flagname[] array + +From: Anshuman Khandual + +commit 4f155af0ae4464134bfcfd9f043b6b727c84e947 upstream. + +The commit 77e6c43e137c ("memblock: introduce MEMBLOCK_RSRV_NOINIT flag") +skipped adding this newly introduced memblock flag into flagname[] array, +thus preventing a correct memblock flags output for applicable memblock +regions. + +Link: https://lkml.kernel.org/r/20240209030912.1382251-1-anshuman.khandual@arm.com +Fixes: 77e6c43e137c ("memblock: introduce MEMBLOCK_RSRV_NOINIT flag") +Signed-off-by: Anshuman Khandual +Reviewed-by: Mike Rapoport +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + mm/memblock.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/mm/memblock.c ++++ b/mm/memblock.c +@@ -2214,6 +2214,7 @@ static const char * const flagname[] = { + [ilog2(MEMBLOCK_MIRROR)] = "MIRROR", + [ilog2(MEMBLOCK_NOMAP)] = "NOMAP", + [ilog2(MEMBLOCK_DRIVER_MANAGED)] = "DRV_MNG", ++ [ilog2(MEMBLOCK_RSRV_NOINIT)] = "RSV_NIT", + }; + + static int memblock_debug_show(struct seq_file *m, void *private) diff --git a/queue-6.7/mm-memcontrol-clarify-swapaccount-0-deprecation-warning.patch b/queue-6.7/mm-memcontrol-clarify-swapaccount-0-deprecation-warning.patch new file mode 100644 index 00000000000..4bab18a6fe2 --- /dev/null +++ b/queue-6.7/mm-memcontrol-clarify-swapaccount-0-deprecation-warning.patch @@ -0,0 +1,59 @@ +From 118642d7f606fc9b9c92ee611275420320290ffb Mon Sep 17 00:00:00 2001 +From: Johannes Weiner +Date: Tue, 13 Feb 2024 03:16:34 -0500 +Subject: mm: memcontrol: clarify swapaccount=0 deprecation warning +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Johannes Weiner + +commit 118642d7f606fc9b9c92ee611275420320290ffb upstream. + +The swapaccount deprecation warning is throwing false positives. Since we +deprecated the knob and defaulted to enabling, the only reports we've been +getting are from folks that set swapaccount=1. 
While this is a nice +affirmation that always-enabling was the right choice, we certainly don't +want to warn when users request the supported mode. + +Only warn when disabling is requested, and clarify the warning. + +[colin.i.king@gmail.com: spelling: "commdandline" -> "commandline"] + Link: https://lkml.kernel.org/r/20240215090544.1649201-1-colin.i.king@gmail.com +Link: https://lkml.kernel.org/r/20240213081634.3652326-1-hannes@cmpxchg.org +Fixes: b25806dcd3d5 ("mm: memcontrol: deprecate swapaccounting=0 mode") +Signed-off-by: Colin Ian King +Reported-by: "Jonas Schäfer" +Reported-by: Narcis Garcia +Suggested-by: Yosry Ahmed +Signed-off-by: Johannes Weiner +Reviewed-by: Yosry Ahmed +Acked-by: Michal Hocko +Acked-by: Shakeel Butt +Cc: Roman Gushchin +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + mm/memcontrol.c | 10 +++++++--- + 1 file changed, 7 insertions(+), 3 deletions(-) + +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -7905,9 +7905,13 @@ bool mem_cgroup_swap_full(struct folio * + + static int __init setup_swap_account(char *s) + { +- pr_warn_once("The swapaccount= commandline option is deprecated. " +- "Please report your usecase to linux-mm@kvack.org if you " +- "depend on this functionality.\n"); ++ bool res; ++ ++ if (!kstrtobool(s, &res) && !res) ++ pr_warn_once("The swapaccount=0 commandline option is deprecated " ++ "in favor of configuring swap control via cgroupfs. 
" ++ "Please report your usecase to linux-mm@kvack.org if you " ++ "depend on this functionality.\n"); + return 1; + } + __setup("swapaccount=", setup_swap_account); diff --git a/queue-6.7/mm-swap-fix-race-when-skipping-swapcache.patch b/queue-6.7/mm-swap-fix-race-when-skipping-swapcache.patch new file mode 100644 index 00000000000..2f4167777e2 --- /dev/null +++ b/queue-6.7/mm-swap-fix-race-when-skipping-swapcache.patch @@ -0,0 +1,226 @@ +From 13ddaf26be324a7f951891ecd9ccd04466d27458 Mon Sep 17 00:00:00 2001 +From: Kairui Song +Date: Wed, 7 Feb 2024 02:25:59 +0800 +Subject: mm/swap: fix race when skipping swapcache + +From: Kairui Song + +commit 13ddaf26be324a7f951891ecd9ccd04466d27458 upstream. + +When skipping swapcache for SWP_SYNCHRONOUS_IO, if two or more threads +swapin the same entry at the same time, they get different pages (A, B). +Before one thread (T0) finishes the swapin and installs page (A) to the +PTE, another thread (T1) could finish swapin of page (B), swap_free the +entry, then swap out the possibly modified page reusing the same entry. +It breaks the pte_same check in (T0) because PTE value is unchanged, +causing ABA problem. Thread (T0) will install a stalled page (A) into the +PTE and cause data corruption. + +One possible callstack is like this: + +CPU0 CPU1 +---- ---- +do_swap_page() do_swap_page() with same entry + + +swap_read_folio() <- read to page A swap_read_folio() <- read to page B + +... set_pte_at() + swap_free() <- entry is free + + +pte_same() <- Check pass, PTE seems + unchanged, but page A + is stalled! +swap_free() <- page B content lost! +set_pte_at() <- staled page A installed! + +And besides, for ZRAM, swap_free() allows the swap device to discard the +entry content, so even if page (B) is not modified, if swap_read_folio() +on CPU0 happens later than swap_free() on CPU1, it may also cause data +loss. 
+ +To fix this, reuse swapcache_prepare which will pin the swap entry using +the cache flag, and allow only one thread to swap it in, also prevent any +parallel code from putting the entry in the cache. Release the pin after +PT unlocked. + +Racers just loop and wait since it's a rare and very short event. A +schedule_timeout_uninterruptible(1) call is added to avoid repeated page +faults wasting too much CPU, causing livelock or adding too much noise to +perf statistics. A similar livelock issue was described in commit +029c4628b2eb ("mm: swap: get rid of livelock in swapin readahead") + +Reproducer: + +This race issue can be triggered easily using a well constructed +reproducer and patched brd (with a delay in read path) [1]: + +With latest 6.8 mainline, race-caused data loss can be observed easily: +$ gcc -g -lpthread test-thread-swap-race.c && ./a.out + Polulating 32MB of memory region... + Keep swapping out... + Starting round 0... + Spawning 65536 workers... + 32746 workers spawned, wait for done... + Round 0: Error on 0x5aa00, expected 32746, got 32743, 3 data loss! + Round 0: Error on 0x395200, expected 32746, got 32743, 3 data loss! + Round 0: Error on 0x3fd000, expected 32746, got 32737, 9 data loss! + Round 0 Failed, 15 data loss! + +This reproducer spawns multiple threads sharing the same memory region +using a small swap device. Every two threads update mapped pages one by +one in opposite direction trying to create a race, with one dedicated +thread that keeps swapping the data out using madvise. + +The reproducer created a reproduce rate of about once every 5 minutes, so +the race should be totally possible in production. + +After this patch, I ran the reproducer for over a few hundred rounds and +no data loss was observed. 
+ +Performance overhead is minimal, microbenchmark swapin 10G from 32G +zram: + +Before: 10934698 us +After: 11157121 us +Cached: 13155355 us (Dropping SWP_SYNCHRONOUS_IO flag) + +[kasong@tencent.com: v4] + Link: https://lkml.kernel.org/r/20240219082040.7495-1-ryncsn@gmail.com +Link: https://lkml.kernel.org/r/20240206182559.32264-1-ryncsn@gmail.com +Fixes: 0bcac06f27d7 ("mm, swap: skip swapcache for swapin of synchronous device") +Reported-by: "Huang, Ying" +Closes: https://lore.kernel.org/lkml/87bk92gqpx.fsf_-_@yhuang6-desk2.ccr.corp.intel.com/ +Link: https://github.com/ryncsn/emm-test-project/tree/master/swap-stress-race [1] +Signed-off-by: Kairui Song +Reviewed-by: "Huang, Ying" +Acked-by: Yu Zhao +Acked-by: David Hildenbrand +Acked-by: Chris Li +Cc: Hugh Dickins +Cc: Johannes Weiner +Cc: Matthew Wilcox (Oracle) +Cc: Michal Hocko +Cc: Minchan Kim +Cc: Yosry Ahmed +Cc: Yu Zhao +Cc: Barry Song <21cnbao@gmail.com> +Cc: SeongJae Park +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/swap.h | 5 +++++ + mm/memory.c | 20 ++++++++++++++++++++ + mm/swap.h | 5 +++++ + mm/swapfile.c | 13 +++++++++++++ + 4 files changed, 43 insertions(+) + +--- a/include/linux/swap.h ++++ b/include/linux/swap.h +@@ -553,6 +553,11 @@ static inline int swap_duplicate(swp_ent + return 0; + } + ++static inline int swapcache_prepare(swp_entry_t swp) ++{ ++ return 0; ++} ++ + static inline void swap_free(swp_entry_t swp) + { + } +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -3785,6 +3785,7 @@ vm_fault_t do_swap_page(struct vm_fault + struct page *page; + struct swap_info_struct *si = NULL; + rmap_t rmap_flags = RMAP_NONE; ++ bool need_clear_cache = false; + bool exclusive = false; + swp_entry_t entry; + pte_t pte; +@@ -3853,6 +3854,20 @@ vm_fault_t do_swap_page(struct vm_fault + if (!folio) { + if (data_race(si->flags & SWP_SYNCHRONOUS_IO) && + __swap_count(entry) == 1) { ++ /* ++ * Prevent parallel swapin from proceeding with ++ * the cache flag. 
Otherwise, another thread may ++ * finish swapin first, free the entry, and swapout ++ * reusing the same entry. It's undetectable as ++ * pte_same() returns true due to entry reuse. ++ */ ++ if (swapcache_prepare(entry)) { ++ /* Relax a bit to prevent rapid repeated page faults */ ++ schedule_timeout_uninterruptible(1); ++ goto out; ++ } ++ need_clear_cache = true; ++ + /* skip swapcache */ + folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, + vma, vmf->address, false); +@@ -4099,6 +4114,9 @@ unlock: + if (vmf->pte) + pte_unmap_unlock(vmf->pte, vmf->ptl); + out: ++ /* Clear the swap cache pin for direct swapin after PTL unlock */ ++ if (need_clear_cache) ++ swapcache_clear(si, entry); + if (si) + put_swap_device(si); + return ret; +@@ -4113,6 +4131,8 @@ out_release: + folio_unlock(swapcache); + folio_put(swapcache); + } ++ if (need_clear_cache) ++ swapcache_clear(si, entry); + if (si) + put_swap_device(si); + return ret; +--- a/mm/swap.h ++++ b/mm/swap.h +@@ -40,6 +40,7 @@ void __delete_from_swap_cache(struct fol + void delete_from_swap_cache(struct folio *folio); + void clear_shadow_from_swap_cache(int type, unsigned long begin, + unsigned long end); ++void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry); + struct folio *swap_cache_get_folio(swp_entry_t entry, + struct vm_area_struct *vma, unsigned long addr); + struct folio *filemap_get_incore_folio(struct address_space *mapping, +@@ -97,6 +98,10 @@ static inline int swap_writepage(struct + return 0; + } + ++static inline void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry) ++{ ++} ++ + static inline struct folio *swap_cache_get_folio(swp_entry_t entry, + struct vm_area_struct *vma, unsigned long addr) + { +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -3363,6 +3363,19 @@ int swapcache_prepare(swp_entry_t entry) + return __swap_duplicate(entry, SWAP_HAS_CACHE); + } + ++void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry) ++{ ++ struct swap_cluster_info *ci; ++ 
unsigned long offset = swp_offset(entry); ++ unsigned char usage; ++ ++ ci = lock_cluster_or_swap_info(si, offset); ++ usage = __swap_entry_free_locked(si, offset, SWAP_HAS_CACHE); ++ unlock_cluster_or_swap_info(si, ci); ++ if (!usage) ++ free_swap_slot(entry); ++} ++ + struct swap_info_struct *swp_swap_info(swp_entry_t entry) + { + return swap_type_to_swap_info(swp_type(entry)); diff --git a/queue-6.7/platform-x86-intel-vbtn-stop-calling-vbdl-from-notify_handler.patch b/queue-6.7/platform-x86-intel-vbtn-stop-calling-vbdl-from-notify_handler.patch new file mode 100644 index 00000000000..f1625e4ae81 --- /dev/null +++ b/queue-6.7/platform-x86-intel-vbtn-stop-calling-vbdl-from-notify_handler.patch @@ -0,0 +1,50 @@ +From 84c16d01ff219bc0a5dca5219db6b8b86a6854fb Mon Sep 17 00:00:00 2001 +From: Hans de Goede +Date: Fri, 16 Feb 2024 21:33:00 +0100 +Subject: platform/x86: intel-vbtn: Stop calling "VBDL" from notify_handler + +From: Hans de Goede + +commit 84c16d01ff219bc0a5dca5219db6b8b86a6854fb upstream. + +Commit 14c200b7ca46 ("platform/x86: intel-vbtn: Fix missing +tablet-mode-switch events") causes 2 issues on the ThinkPad X1 Tablet Gen2: + +1. The ThinkPad will wake up immediately from suspend +2. When put in tablet mode SW_TABLET_MODE reverts to 0 after about 1 second + +Both these issues are caused by the "VBDL" ACPI method call added +at the end of the notify_handler. + +And it never became entirely clear if this call is even necessary to fix +the issue of missing tablet-mode-switch events on the Dell Inspiron 7352. + +Drop the "VBDL" ACPI method call again to fix the 2 issues this is +causing on the ThinkPad X1 Tablet Gen2. 
+ +Fixes: 14c200b7ca46 ("platform/x86: intel-vbtn: Fix missing tablet-mode-switch events") +Reported-by: Alexander Kobel +Closes: https://lore.kernel.org/platform-driver-x86/295984ce-bd4b-49bd-adc5-ffe7c898d7f0@a-kobel.de/ +Cc: regressions@lists.linux.dev +Cc: Arnold Gozum +Cc: stable@vger.kernel.org +Signed-off-by: Hans de Goede +Tested-by: Alexander Kobel +Link: https://lore.kernel.org/r/20240216203300.245826-1-hdegoede@redhat.com +Signed-off-by: Greg Kroah-Hartman +--- + drivers/platform/x86/intel/vbtn.c | 3 --- + 1 file changed, 3 deletions(-) + +--- a/drivers/platform/x86/intel/vbtn.c ++++ b/drivers/platform/x86/intel/vbtn.c +@@ -200,9 +200,6 @@ static void notify_handler(acpi_handle h + autorelease = val && (!ke_rel || ke_rel->type == KE_IGNORE); + + sparse_keymap_report_event(input_dev, event, val, autorelease); +- +- /* Some devices need this to report further events */ +- acpi_evaluate_object(handle, "VBDL", NULL, NULL); + } + + /* diff --git a/queue-6.7/platform-x86-touchscreen_dmi-allow-partial-prefix-matches-for-acpi-names.patch b/queue-6.7/platform-x86-touchscreen_dmi-allow-partial-prefix-matches-for-acpi-names.patch new file mode 100644 index 00000000000..cfee8ed2469 --- /dev/null +++ b/queue-6.7/platform-x86-touchscreen_dmi-allow-partial-prefix-matches-for-acpi-names.patch @@ -0,0 +1,58 @@ +From dbcbfd662a725641d118fb3ae5ffb7be4e3d0fb0 Mon Sep 17 00:00:00 2001 +From: Hans de Goede +Date: Mon, 12 Feb 2024 13:06:07 +0100 +Subject: platform/x86: touchscreen_dmi: Allow partial (prefix) matches for ACPI names + +From: Hans de Goede + +commit dbcbfd662a725641d118fb3ae5ffb7be4e3d0fb0 upstream. + +On some devices the ACPI name of the touchscreen is e.g. either +MSSL1680:00 or MSSL1680:01 depending on the BIOS version. + +This happens for example on the "Chuwi Hi8 Air" tablet where the initial +commit's ts_data uses "MSSL1680:00" but the tablets from the github issue +and linux-hardware.org probe linked below both use "MSSL1680:01". 
+ +Replace the strcmp() match on ts_data->acpi_name with a strstarts() +check to allow using a partial match on just the ACPI HID of "MSSL1680" +and change the ts_data->acpi_name for the "Chuwi Hi8 Air" accordingly +to fix the touchscreen not working on models where it is "MSSL1680:01". + +Note this drops the length check for I2C_NAME_SIZE. This never was +necessary since the ACPI names used are never more then 11 chars and +I2C_NAME_SIZE is 20 so the replaced strncmp() would always stop long +before reaching I2C_NAME_SIZE. + +Link: https://linux-hardware.org/?computer=AC4301C0542A +Fixes: bbb97d728f77 ("platform/x86: touchscreen_dmi: Add info for the Chuwi Hi8 Air tablet") +Closes: https://github.com/onitake/gsl-firmware/issues/91 +Cc: stable@vger.kernel.org +Reviewed-by: Kuppuswamy Sathyanarayanan +Signed-off-by: Hans de Goede +Link: https://lore.kernel.org/r/20240212120608.30469-1-hdegoede@redhat.com +Signed-off-by: Greg Kroah-Hartman +--- + drivers/platform/x86/touchscreen_dmi.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/drivers/platform/x86/touchscreen_dmi.c ++++ b/drivers/platform/x86/touchscreen_dmi.c +@@ -81,7 +81,7 @@ static const struct property_entry chuwi + }; + + static const struct ts_dmi_data chuwi_hi8_air_data = { +- .acpi_name = "MSSL1680:00", ++ .acpi_name = "MSSL1680", + .properties = chuwi_hi8_air_props, + }; + +@@ -1821,7 +1821,7 @@ static void ts_dmi_add_props(struct i2c_ + int error; + + if (has_acpi_companion(dev) && +- !strncmp(ts_data->acpi_name, client->name, I2C_NAME_SIZE)) { ++ strstarts(client->name, ts_data->acpi_name)) { + error = device_create_managed_software_node(dev, ts_data->properties, NULL); + if (error) + dev_err(dev, "failed to add properties: %d\n", error); diff --git a/queue-6.7/platform-x86-x86-android-tablets-fix-keyboard-touchscreen-on-lenovo-yogabook1-x90.patch b/queue-6.7/platform-x86-x86-android-tablets-fix-keyboard-touchscreen-on-lenovo-yogabook1-x90.patch new file mode 100644 index 
00000000000..5b820421168 --- /dev/null +++ b/queue-6.7/platform-x86-x86-android-tablets-fix-keyboard-touchscreen-on-lenovo-yogabook1-x90.patch @@ -0,0 +1,68 @@ +From bd8905d70944aae5063fd91c667e6f846ee92718 Mon Sep 17 00:00:00 2001 +From: Hans de Goede +Date: Fri, 16 Feb 2024 21:17:18 +0100 +Subject: platform/x86: x86-android-tablets: Fix keyboard touchscreen on Lenovo Yogabook1 X90 + +From: Hans de Goede + +commit bd8905d70944aae5063fd91c667e6f846ee92718 upstream. + +After commit 4014ae236b1d ("platform/x86: x86-android-tablets: Stop using +gpiolib private APIs") the touchscreen in the keyboard half of +the Lenovo Yogabook1 X90 stopped working with the following error: + + Goodix-TS i2c-goodix_ts: error -EBUSY: Failed to get irq GPIO + +The problem is that when getting the IRQ for instantiated i2c_client-s +from a GPIO (rather then using an IRQ directly from the IOAPIC), +x86_acpi_irq_helper_get() now properly requests the GPIO, which disallows +other drivers from requesting it. Normally this is a good thing, but +the goodix touchscreen also uses the IRQ as an output during reset +to select which of its 2 possible I2C addresses should be used. + +Add a new free_gpio flag to struct x86_acpi_irq_data to deal with this +and release the GPIO after getting the IRQ in this special case. 
+ +Fixes: 4014ae236b1d ("platform/x86: x86-android-tablets: Stop using gpiolib private APIs") +Cc: stable@vger.kernel.org +Signed-off-by: Hans de Goede +Link: https://lore.kernel.org/r/20240216201721.239791-2-hdegoede@redhat.com +Signed-off-by: Greg Kroah-Hartman +--- + drivers/platform/x86/x86-android-tablets/core.c | 3 +++ + drivers/platform/x86/x86-android-tablets/lenovo.c | 1 + + drivers/platform/x86/x86-android-tablets/x86-android-tablets.h | 1 + + 3 files changed, 5 insertions(+) + +--- a/drivers/platform/x86/x86-android-tablets/core.c ++++ b/drivers/platform/x86/x86-android-tablets/core.c +@@ -113,6 +113,9 @@ int x86_acpi_irq_helper_get(const struct + if (irq_type != IRQ_TYPE_NONE && irq_type != irq_get_trigger_type(irq)) + irq_set_irq_type(irq, irq_type); + ++ if (data->free_gpio) ++ devm_gpiod_put(&x86_android_tablet_device->dev, gpiod); ++ + return irq; + case X86_ACPI_IRQ_TYPE_PMIC: + status = acpi_get_handle(NULL, data->chip, &handle); +--- a/drivers/platform/x86/x86-android-tablets/lenovo.c ++++ b/drivers/platform/x86/x86-android-tablets/lenovo.c +@@ -96,6 +96,7 @@ static const struct x86_i2c_client_info + .trigger = ACPI_EDGE_SENSITIVE, + .polarity = ACPI_ACTIVE_LOW, + .con_id = "goodix_ts_irq", ++ .free_gpio = true, + }, + }, { + /* Wacom Digitizer in keyboard half */ +--- a/drivers/platform/x86/x86-android-tablets/x86-android-tablets.h ++++ b/drivers/platform/x86/x86-android-tablets/x86-android-tablets.h +@@ -38,6 +38,7 @@ struct x86_acpi_irq_data { + int index; + int trigger; /* ACPI_EDGE_SENSITIVE / ACPI_LEVEL_SENSITIVE */ + int polarity; /* ACPI_ACTIVE_HIGH / ACPI_ACTIVE_LOW / ACPI_ACTIVE_BOTH */ ++ bool free_gpio; /* Release GPIO after getting IRQ (for TYPE_GPIOINT) */ + const char *con_id; + }; + diff --git a/queue-6.7/revert-parisc-only-list-existing-cpus-in-cpu_possible_mask.patch b/queue-6.7/revert-parisc-only-list-existing-cpus-in-cpu_possible_mask.patch new file mode 100644 index 00000000000..77f45b4694f --- /dev/null +++ 
b/queue-6.7/revert-parisc-only-list-existing-cpus-in-cpu_possible_mask.patch @@ -0,0 +1,57 @@ +From 82b143aeb169b8b55798d7d2063032e1a6ceeeb0 Mon Sep 17 00:00:00 2001 +From: Helge Deller +Date: Mon, 5 Feb 2024 10:39:20 +0100 +Subject: Revert "parisc: Only list existing CPUs in cpu_possible_mask" + +From: Helge Deller + +commit 82b143aeb169b8b55798d7d2063032e1a6ceeeb0 upstream. + +This reverts commit 0921244f6f4f0d05698b953fe632a99b38907226. + +It broke CPU hotplugging because it modifies the __cpu_possible_mask +after bootup, so that it will be different than nr_cpu_ids, which +then effictively breaks the workqueue setup code and triggers crashes +when shutting down CPUs at runtime. + +Guenter was the first who noticed the wrong values in __cpu_possible_mask, +since the cpumask Kunit tests were failig. + +Reverting this commit fixes both issues, but sadly brings back this +uncritical runtime warning: +register_cpu_capacity_sysctl: too early to get CPU4 device! + +Signed-off-by: Helge Deller +Reported-by: Guenter Roeck +Link: https://lkml.org/lkml/2024/2/4/146 +Link: https://lore.kernel.org/lkml/Zb0mbHlIud_bqftx@slm.duckdns.org/t/ +Cc: stable@vger.kernel.org # 6.0+ +Signed-off-by: Greg Kroah-Hartman +--- + arch/parisc/kernel/processor.c | 8 -------- + 1 file changed, 8 deletions(-) + +--- a/arch/parisc/kernel/processor.c ++++ b/arch/parisc/kernel/processor.c +@@ -172,7 +172,6 @@ static int __init processor_probe(struct + p->cpu_num = cpu_info.cpu_num; + p->cpu_loc = cpu_info.cpu_loc; + +- set_cpu_possible(cpuid, true); + store_cpu_topology(cpuid); + + #ifdef CONFIG_SMP +@@ -474,13 +473,6 @@ static struct parisc_driver cpu_driver _ + */ + void __init processor_init(void) + { +- unsigned int cpu; +- + reset_cpu_topology(); +- +- /* reset possible mask. We will mark those which are possible. 
*/ +- for_each_possible_cpu(cpu) +- set_cpu_possible(cpu, false); +- + register_parisc_driver(&cpu_driver); + } diff --git a/queue-6.7/s390-cio-fix-invalid-ebusy-on-ccw_device_start.patch b/queue-6.7/s390-cio-fix-invalid-ebusy-on-ccw_device_start.patch new file mode 100644 index 00000000000..cfc5aca0108 --- /dev/null +++ b/queue-6.7/s390-cio-fix-invalid-ebusy-on-ccw_device_start.patch @@ -0,0 +1,99 @@ +From 5ef1dc40ffa6a6cb968b0fdc43c3a61727a9e950 Mon Sep 17 00:00:00 2001 +From: Peter Oberparleiter +Date: Wed, 14 Feb 2024 16:06:28 +0100 +Subject: s390/cio: fix invalid -EBUSY on ccw_device_start + +From: Peter Oberparleiter + +commit 5ef1dc40ffa6a6cb968b0fdc43c3a61727a9e950 upstream. + +The s390 common I/O layer (CIO) returns an unexpected -EBUSY return code +when drivers try to start I/O while a path-verification (PV) process is +pending. This can lead to failed device initialization attempts with +symptoms like broken network connectivity after boot. + +Fix this by replacing the -EBUSY return code with a deferred condition +code 1 reply to make path-verification handling consistent from a +driver's point of view. + +The problem can be reproduced semi-regularly using the following process, +while repeating steps 2-3 as necessary (example assumes an OSA device +with bus-IDs 0.0.a000-0.0.a002 on CHPID 0.02): + +1. echo 0.0.a000,0.0.a001,0.0.a002 >/sys/bus/ccwgroup/drivers/qeth/group +2. echo 0 > /sys/bus/ccwgroup/devices/0.0.a000/online +3. echo 1 > /sys/bus/ccwgroup/devices/0.0.a000/online ; \ + echo on > /sys/devices/css0/chp0.02/status + +Background information: + +The common I/O layer starts path-verification I/Os when it receives +indications about changes in a device path's availability. This occurs +for example when hardware events indicate a change in channel-path +status, or when a manual operation such as a CHPID vary or configure +operation is performed. 
+ +If a driver attempts to start I/O while a PV is running, CIO reports a +successful I/O start (ccw_device_start() return code 0). Then, after +completion of PV, CIO synthesizes an interrupt response that indicates +an asynchronous status condition that prevented the start of the I/O +(deferred condition code 1). + +If a PV indication arrives while a device is busy with driver-owned I/O, +PV is delayed until after I/O completion was reported to the driver's +interrupt handler. To ensure that PV can be started eventually, CIO +reports a device busy condition (ccw_device_start() return code -EBUSY) +if a driver tries to start another I/O while PV is pending. + +In some cases this -EBUSY return code causes device drivers to consider +a device not operational, resulting in failed device initialization. + +Note: The code that introduced the problem was added in 2003. Symptoms +started appearing with the following CIO commit that causes a PV +indication when a device is removed from the cio_ignore list after the +associated parent subchannel device was probed, but before online +processing of the CCW device has started: + +2297791c92d0 ("s390/cio: dont unregister subchannel from child-drivers") + +During boot, the cio_ignore list is modified by the cio_ignore dracut +module [1] as well as Linux vendor-specific systemd service scripts[2]. +When combined, this commit and boot scripts cause a frequent occurrence +of the problem during boot. 
+ +[1] https://github.com/dracutdevs/dracut/tree/master/modules.d/81cio_ignore +[2] https://github.com/SUSE/s390-tools/blob/master/cio_ignore.service + +Cc: stable@vger.kernel.org # v5.15+ +Fixes: 2297791c92d0 ("s390/cio: dont unregister subchannel from child-drivers") +Tested-By: Thorsten Winkler +Reviewed-by: Thorsten Winkler +Signed-off-by: Peter Oberparleiter +Signed-off-by: Heiko Carstens +Signed-off-by: Greg Kroah-Hartman +--- + drivers/s390/cio/device_ops.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +--- a/drivers/s390/cio/device_ops.c ++++ b/drivers/s390/cio/device_ops.c +@@ -202,7 +202,8 @@ int ccw_device_start_timeout_key(struct + return -EINVAL; + if (cdev->private->state == DEV_STATE_NOT_OPER) + return -ENODEV; +- if (cdev->private->state == DEV_STATE_VERIFY) { ++ if (cdev->private->state == DEV_STATE_VERIFY || ++ cdev->private->flags.doverify) { + /* Remember to fake irb when finished. */ + if (!cdev->private->flags.fake_irb) { + cdev->private->flags.fake_irb = FAKE_CMD_IRB; +@@ -214,8 +215,7 @@ int ccw_device_start_timeout_key(struct + } + if (cdev->private->state != DEV_STATE_ONLINE || + ((sch->schib.scsw.cmd.stctl & SCSW_STCTL_PRIM_STATUS) && +- !(sch->schib.scsw.cmd.stctl & SCSW_STCTL_SEC_STATUS)) || +- cdev->private->flags.doverify) ++ !(sch->schib.scsw.cmd.stctl & SCSW_STCTL_SEC_STATUS))) + return -EBUSY; + ret = cio_set_options (sch, flags); + if (ret) diff --git a/queue-6.7/scsi-core-consult-supported-vpd-page-list-prior-to-fetching-page.patch b/queue-6.7/scsi-core-consult-supported-vpd-page-list-prior-to-fetching-page.patch new file mode 100644 index 00000000000..000f83db795 --- /dev/null +++ b/queue-6.7/scsi-core-consult-supported-vpd-page-list-prior-to-fetching-page.patch @@ -0,0 +1,100 @@ +From b5fc07a5fb56216a49e6c1d0b172d5464d99a89b Mon Sep 17 00:00:00 2001 +From: "Martin K. Petersen" +Date: Wed, 14 Feb 2024 17:14:11 -0500 +Subject: scsi: core: Consult supported VPD page list prior to fetching page + +From: Martin K. 
Petersen + +commit b5fc07a5fb56216a49e6c1d0b172d5464d99a89b upstream. + +Commit c92a6b5d6335 ("scsi: core: Query VPD size before getting full +page") removed the logic which checks whether a VPD page is present on +the supported pages list before asking for the page itself. That was +done because SPC helpfully states "The Supported VPD Pages VPD page +list may or may not include all the VPD pages that are able to be +returned by the device server". Testing had revealed a few devices +that supported some of the 0xBn pages but didn't actually list them in +page 0. + +Julian Sikorski bisected a problem with his drive resetting during +discovery to the commit above. As it turns out, this particular drive +firmware will crash if we attempt to fetch page 0xB9. + +Various approaches were attempted to work around this. In the end, +reinstating the logic that consults VPD page 0 before fetching any +other page was the path of least resistance. A firmware update for the +devices which originally compelled us to remove the check has since +been released. + +Link: https://lore.kernel.org/r/20240214221411.2888112-1-martin.petersen@oracle.com +Fixes: c92a6b5d6335 ("scsi: core: Query VPD size before getting full page") +Cc: stable@vger.kernel.org +Cc: Bart Van Assche +Reported-by: Julian Sikorski +Tested-by: Julian Sikorski +Reviewed-by: Lee Duncan +Reviewed-by: Bart Van Assche +Signed-off-by: Martin K. 
Petersen +Signed-off-by: Greg Kroah-Hartman +--- + drivers/scsi/scsi.c | 22 ++++++++++++++++++++-- + include/scsi/scsi_device.h | 4 ---- + 2 files changed, 20 insertions(+), 6 deletions(-) + +--- a/drivers/scsi/scsi.c ++++ b/drivers/scsi/scsi.c +@@ -328,21 +328,39 @@ static int scsi_vpd_inquiry(struct scsi_ + return result + 4; + } + ++enum scsi_vpd_parameters { ++ SCSI_VPD_HEADER_SIZE = 4, ++ SCSI_VPD_LIST_SIZE = 36, ++}; ++ + static int scsi_get_vpd_size(struct scsi_device *sdev, u8 page) + { +- unsigned char vpd_header[SCSI_VPD_HEADER_SIZE] __aligned(4); ++ unsigned char vpd[SCSI_VPD_LIST_SIZE] __aligned(4); + int result; + + if (sdev->no_vpd_size) + return SCSI_DEFAULT_VPD_LEN; + + /* ++ * Fetch the supported pages VPD and validate that the requested page ++ * number is present. ++ */ ++ if (page != 0) { ++ result = scsi_vpd_inquiry(sdev, vpd, 0, sizeof(vpd)); ++ if (result < SCSI_VPD_HEADER_SIZE) ++ return 0; ++ ++ result -= SCSI_VPD_HEADER_SIZE; ++ if (!memchr(&vpd[SCSI_VPD_HEADER_SIZE], page, result)) ++ return 0; ++ } ++ /* + * Fetch the VPD page header to find out how big the page + * is. This is done to prevent problems on legacy devices + * which can not handle allocation lengths as large as + * potentially requested by the caller. 
+ */ +- result = scsi_vpd_inquiry(sdev, vpd_header, page, sizeof(vpd_header)); ++ result = scsi_vpd_inquiry(sdev, vpd, page, SCSI_VPD_HEADER_SIZE); + if (result < 0) + return 0; + +--- a/include/scsi/scsi_device.h ++++ b/include/scsi/scsi_device.h +@@ -100,10 +100,6 @@ struct scsi_vpd { + unsigned char data[]; + }; + +-enum scsi_vpd_parameters { +- SCSI_VPD_HEADER_SIZE = 4, +-}; +- + struct scsi_device { + struct Scsi_Host *host; + struct request_queue *request_queue; diff --git a/queue-6.7/scsi-sd-usb_storage-uas-access-media-prior-to-querying-device-properties.patch b/queue-6.7/scsi-sd-usb_storage-uas-access-media-prior-to-querying-device-properties.patch new file mode 100644 index 00000000000..bd86bac290a --- /dev/null +++ b/queue-6.7/scsi-sd-usb_storage-uas-access-media-prior-to-querying-device-properties.patch @@ -0,0 +1,144 @@ +From 321da3dc1f3c92a12e3c5da934090d2992a8814c Mon Sep 17 00:00:00 2001 +From: "Martin K. Petersen" +Date: Tue, 13 Feb 2024 09:33:06 -0500 +Subject: scsi: sd: usb_storage: uas: Access media prior to querying device properties + +From: Martin K. Petersen + +commit 321da3dc1f3c92a12e3c5da934090d2992a8814c upstream. + +It has been observed that some USB/UAS devices return generic properties +hardcoded in firmware for mode pages for a period of time after a device +has been discovered. The reported properties are either garbage or they do +not accurately reflect the characteristics of the physical storage device +attached in the case of a bridge. + +Prior to commit 1e029397d12f ("scsi: sd: Reorganize DIF/DIX code to +avoid calling revalidate twice") we would call revalidate several +times during device discovery. As a result, incorrect values would +eventually get replaced with ones accurately describing the attached +storage. When we did away with the redundant revalidate pass, several +cases were reported where devices reported nonsensical values or would +end up in write-protected state. 
+ +An initial attempt at addressing this issue involved introducing a +delayed second revalidate invocation. However, this approach still +left some devices reporting incorrect characteristics. + +Tasos Sahanidis debugged the problem further and identified that +introducing a READ operation prior to MODE SENSE fixed the problem and that +it wasn't a timing issue. Issuing a READ appears to cause the devices to +update their state to reflect the actual properties of the storage +media. Device properties like vendor, model, and storage capacity appear to +be correctly reported from the get-go. It is unclear why these devices +defer populating the remaining characteristics. + +Match the behavior of a well known commercial operating system and +trigger a READ operation prior to querying device characteristics to +force the device to populate the mode pages. + +The additional READ is triggered by a flag set in the USB storage and +UAS drivers. We avoid issuing the READ for other transport classes +since some storage devices identify Linux through our particular +discovery command sequence. + +Link: https://lore.kernel.org/r/20240213143306.2194237-1-martin.petersen@oracle.com +Fixes: 1e029397d12f ("scsi: sd: Reorganize DIF/DIX code to avoid calling revalidate twice") +Cc: stable@vger.kernel.org +Reported-by: Tasos Sahanidis +Reviewed-by: Ewan D. Milne +Reviewed-by: Bart Van Assche +Tested-by: Tasos Sahanidis +Signed-off-by: Martin K. 
Petersen +Signed-off-by: Greg Kroah-Hartman +--- + drivers/scsi/sd.c | 26 +++++++++++++++++++++++++- + drivers/usb/storage/scsiglue.c | 7 +++++++ + drivers/usb/storage/uas.c | 7 +++++++ + include/scsi/scsi_device.h | 1 + + 4 files changed, 40 insertions(+), 1 deletion(-) + +--- a/drivers/scsi/sd.c ++++ b/drivers/scsi/sd.c +@@ -3410,6 +3410,24 @@ static bool sd_validate_opt_xfer_size(st + return true; + } + ++static void sd_read_block_zero(struct scsi_disk *sdkp) ++{ ++ unsigned int buf_len = sdkp->device->sector_size; ++ char *buffer, cmd[10] = { }; ++ ++ buffer = kmalloc(buf_len, GFP_KERNEL); ++ if (!buffer) ++ return; ++ ++ cmd[0] = READ_10; ++ put_unaligned_be32(0, &cmd[2]); /* Logical block address 0 */ ++ put_unaligned_be16(1, &cmd[7]); /* Transfer 1 logical block */ ++ ++ scsi_execute_cmd(sdkp->device, cmd, REQ_OP_DRV_IN, buffer, buf_len, ++ SD_TIMEOUT, sdkp->max_retries, NULL); ++ kfree(buffer); ++} ++ + /** + * sd_revalidate_disk - called the first time a new disk is seen, + * performs disk spin up, read_capacity, etc. +@@ -3449,7 +3467,13 @@ static int sd_revalidate_disk(struct gen + */ + if (sdkp->media_present) { + sd_read_capacity(sdkp, buffer); +- ++ /* ++ * Some USB/UAS devices return generic values for mode pages ++ * until the media has been accessed. Trigger a READ operation ++ * to force the device to populate mode pages. ++ */ ++ if (sdp->read_before_ms) ++ sd_read_block_zero(sdkp); + /* + * set the default to rotational. All non-rotational devices + * support the block characteristics VPD page, which will +--- a/drivers/usb/storage/scsiglue.c ++++ b/drivers/usb/storage/scsiglue.c +@@ -180,6 +180,13 @@ static int slave_configure(struct scsi_d + sdev->use_192_bytes_for_3f = 1; + + /* ++ * Some devices report generic values until the media has been ++ * accessed. Force a READ(10) prior to querying device ++ * characteristics. 
++ */ ++ sdev->read_before_ms = 1; ++ ++ /* + * Some devices don't like MODE SENSE with page=0x3f, + * which is the command used for checking if a device + * is write-protected. Now that we tell the sd driver +--- a/drivers/usb/storage/uas.c ++++ b/drivers/usb/storage/uas.c +@@ -879,6 +879,13 @@ static int uas_slave_configure(struct sc + sdev->guess_capacity = 1; + + /* ++ * Some devices report generic values until the media has been ++ * accessed. Force a READ(10) prior to querying device ++ * characteristics. ++ */ ++ sdev->read_before_ms = 1; ++ ++ /* + * Some devices don't like MODE SENSE with page=0x3f, + * which is the command used for checking if a device + * is write-protected. Now that we tell the sd driver +--- a/include/scsi/scsi_device.h ++++ b/include/scsi/scsi_device.h +@@ -208,6 +208,7 @@ struct scsi_device { + unsigned use_10_for_rw:1; /* first try 10-byte read / write */ + unsigned use_10_for_ms:1; /* first try 10-byte mode sense/select */ + unsigned set_dbd_for_ms:1; /* Set "DBD" field in mode sense */ ++ unsigned read_before_ms:1; /* perform a READ before MODE SENSE */ + unsigned no_report_opcodes:1; /* no REPORT SUPPORTED OPERATION CODES */ + unsigned no_write_same:1; /* no WRITE SAME command */ + unsigned use_16_for_rw:1; /* Use read/write(16) over read/write(10) */ diff --git a/queue-6.7/scsi-target-pscsi-fix-bio_put-for-error-case.patch b/queue-6.7/scsi-target-pscsi-fix-bio_put-for-error-case.patch new file mode 100644 index 00000000000..cdf686af6bf --- /dev/null +++ b/queue-6.7/scsi-target-pscsi-fix-bio_put-for-error-case.patch @@ -0,0 +1,47 @@ +From de959094eb2197636f7c803af0943cb9d3b35804 Mon Sep 17 00:00:00 2001 +From: Naohiro Aota +Date: Wed, 14 Feb 2024 23:43:56 +0900 +Subject: scsi: target: pscsi: Fix bio_put() for error case + +From: Naohiro Aota + +commit de959094eb2197636f7c803af0943cb9d3b35804 upstream. 
+ +As of commit 066ff571011d ("block: turn bio_kmalloc into a simple kmalloc +wrapper"), a bio allocated by bio_kmalloc() must be freed by bio_uninit() +and kfree(). That is not done properly for the error case, hitting WARN and +NULL pointer dereference in bio_free(). + +Fixes: 066ff571011d ("block: turn bio_kmalloc into a simple kmalloc wrapper") +CC: stable@vger.kernel.org # 6.1+ +Signed-off-by: Naohiro Aota +Link: https://lore.kernel.org/r/20240214144356.101814-1-naohiro.aota@wdc.com +Reviewed-by: Christoph Hellwig +Reviewed-by: Johannes Thumshirn +Signed-off-by: Martin K. Petersen +Signed-off-by: Greg Kroah-Hartman +--- + drivers/target/target_core_pscsi.c | 9 ++++++--- + 1 file changed, 6 insertions(+), 3 deletions(-) + +--- a/drivers/target/target_core_pscsi.c ++++ b/drivers/target/target_core_pscsi.c +@@ -907,12 +907,15 @@ new_bio: + + return 0; + fail: +- if (bio) +- bio_put(bio); ++ if (bio) { ++ bio_uninit(bio); ++ kfree(bio); ++ } + while (req->bio) { + bio = req->bio; + req->bio = bio->bi_next; +- bio_put(bio); ++ bio_uninit(bio); ++ kfree(bio); + } + req->biotail = NULL; + return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; diff --git a/queue-6.7/selftests-mm-uffd-unit-test-check-if-huge-page-size-is-0.patch b/queue-6.7/selftests-mm-uffd-unit-test-check-if-huge-page-size-is-0.patch new file mode 100644 index 00000000000..f7e68bb80bc --- /dev/null +++ b/queue-6.7/selftests-mm-uffd-unit-test-check-if-huge-page-size-is-0.patch @@ -0,0 +1,41 @@ +From 7efa6f2c803366f84c3c362f01e822490669d72b Mon Sep 17 00:00:00 2001 +From: Terry Tritton +Date: Mon, 5 Feb 2024 14:50:56 +0000 +Subject: selftests/mm: uffd-unit-test check if huge page size is 0 + +From: Terry Tritton + +commit 7efa6f2c803366f84c3c362f01e822490669d72b upstream. + +If HUGETLBFS is not enabled then the default_huge_page_size function will +return 0 and cause a divide by 0 error. Add a check to see if the huge page +size is 0 and skip the hugetlb tests if it is. 
+ +Link: https://lkml.kernel.org/r/20240205145055.3545806-2-terry.tritton@linaro.org +Fixes: 16a45b57cbf2 ("selftests/mm: add framework for uffd-unit-test") +Signed-off-by: Terry Tritton +Cc: Peter Griffin +Cc: Shuah Khan +Cc: Peter Xu +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + tools/testing/selftests/mm/uffd-unit-tests.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +--- a/tools/testing/selftests/mm/uffd-unit-tests.c ++++ b/tools/testing/selftests/mm/uffd-unit-tests.c +@@ -1309,6 +1309,12 @@ int main(int argc, char *argv[]) + continue; + + uffd_test_start("%s on %s", test->name, mem_type->name); ++ if ((mem_type->mem_flag == MEM_HUGETLB || ++ mem_type->mem_flag == MEM_HUGETLB_PRIVATE) && ++ (default_huge_page_size() == 0)) { ++ uffd_test_skip("huge page size is 0, feature missing?"); ++ continue; ++ } + if (!uffd_feature_supported(test)) { + uffd_test_skip("feature missing"); + continue; diff --git a/queue-6.7/series b/queue-6.7/series index a2206837f71..c3b24c0773f 100644 --- a/queue-6.7/series +++ b/queue-6.7/series @@ -139,3 +139,55 @@ drm-amd-display-request-usb4-bw-for-mst-streams.patch drm-amd-display-fixed-integer-types-and-null-check-l.patch ib-hfi1-fix-sdma.h-tx-num_descs-off-by-one-error.patch kunit-add-a-macro-to-wrap-a-deferred-action-function.patch +x86-bugs-add-asm-helpers-for-executing-verw.patch +docs-instruct-latex-to-cope-with-deeper-nesting.patch +loongarch-call-early_init_fdt_scan_reserved_mem-earlier.patch +loongarch-disable-irq-before-init_fn-for-nonboot-cpus.patch +loongarch-update-cpu_sibling_map-when-disabling-nonboot-cpus.patch +btrfs-defrag-avoid-unnecessary-defrag-caused-by-incorrect-extent-size.patch +btrfs-fix-deadlock-with-fiemap-and-extent-locking.patch +drm-ttm-fix-an-invalid-freeing-on-already-freed-page-in-error-path.patch +drm-meson-don-t-remove-bridges-which-are-created-by-other-drivers.patch +drm-buddy-modify-duplicate-list_splice_tail-call.patch 
+drm-amd-display-only-allow-dig-mapping-to-pwrseq-in-new-asic.patch +drm-amd-display-adjust-few-initialization-order-in-dm.patch +drm-amdgpu-fix-the-runtime-resume-failure-issue.patch +s390-cio-fix-invalid-ebusy-on-ccw_device_start.patch +ata-libata-core-do-not-try-to-set-sleeping-devices-to-standby.patch +ata-libata-core-do-not-call-ata_dev_power_set_standby-twice.patch +fs-aio-restrict-kiocb_set_cancel_fn-to-i-o-submitted-via-libaio.patch +lib-kconfig.debug-test_iov_iter-depends-on-mmu.patch +dm-crypt-recheck-the-integrity-tag-after-a-failure.patch +revert-parisc-only-list-existing-cpus-in-cpu_possible_mask.patch +dm-integrity-recheck-the-integrity-tag-after-a-failure.patch +dm-crypt-don-t-modify-the-data-when-using-authenticated-encryption.patch +dm-verity-recheck-the-hash-after-a-failure.patch +cxl-acpi-fix-load-failures-due-to-single-window-creation-failure.patch +cxl-pci-skip-to-handle-ras-errors-if-cxl.mem-device-is-detached.patch +cxl-pci-fix-disabling-memory-if-dvsec-cxl-range-does-not-match-a-cfmws-window.patch +scsi-sd-usb_storage-uas-access-media-prior-to-querying-device-properties.patch +scsi-target-pscsi-fix-bio_put-for-error-case.patch +scsi-core-consult-supported-vpd-page-list-prior-to-fetching-page.patch +selftests-mm-uffd-unit-test-check-if-huge-page-size-is-0.patch +mm-swap-fix-race-when-skipping-swapcache.patch +mm-damon-lru_sort-fix-quota-status-loss-due-to-online-tunings.patch +mm-memcontrol-clarify-swapaccount-0-deprecation-warning.patch +mm-damon-core-check-apply-interval-in-damon_do_apply_schemes.patch +mm-damon-reclaim-fix-quota-stauts-loss-due-to-online-tunings.patch +mm-memblock-add-memblock_rsrv_noinit-into-flagname-array.patch +platform-x86-x86-android-tablets-fix-keyboard-touchscreen-on-lenovo-yogabook1-x90.patch +platform-x86-intel-vbtn-stop-calling-vbdl-from-notify_handler.patch +platform-x86-touchscreen_dmi-allow-partial-prefix-matches-for-acpi-names.patch +cachefiles-fix-memory-leak-in-cachefiles_add_cache.patch 
+sparc-fix-undefined-reference-to-fb_is_primary_device.patch +md-don-t-ignore-suspended-array-in-md_check_recovery.patch +md-don-t-ignore-read-only-array-in-md_check_recovery.patch +md-make-sure-md_do_sync-will-set-md_recovery_done.patch +md-don-t-register-sync_thread-for-reshape-directly.patch +md-don-t-suspend-the-array-for-interrupted-reshape.patch +md-fix-missing-release-of-active_io-for-flush.patch +kvm-arm64-vgic-its-test-for-valid-irq-in-movall-handler.patch +kvm-arm64-vgic-its-test-for-valid-irq-in-its_sync_lpi_pending_table.patch +accel-ivpu-don-t-enable-any-tiles-by-default-on-vpu40xx.patch +gtp-fix-use-after-free-and-null-ptr-deref-in-gtp_genl_dump_pdp.patch +crypto-virtio-akcipher-fix-stack-overflow-on-memcpy.patch diff --git a/queue-6.7/sparc-fix-undefined-reference-to-fb_is_primary_device.patch b/queue-6.7/sparc-fix-undefined-reference-to-fb_is_primary_device.patch new file mode 100644 index 00000000000..42f7697be4f --- /dev/null +++ b/queue-6.7/sparc-fix-undefined-reference-to-fb_is_primary_device.patch @@ -0,0 +1,53 @@ +From ed683b9bb91fc274383e222ba5873a9ee9033462 Mon Sep 17 00:00:00 2001 +From: Javier Martinez Canillas +Date: Tue, 20 Feb 2024 10:54:12 +0100 +Subject: sparc: Fix undefined reference to fb_is_primary_device + +From: Javier Martinez Canillas + +commit ed683b9bb91fc274383e222ba5873a9ee9033462 upstream. + +Commit 55bffc8170bb ("fbdev: Split frame buffer support in FB and FB_CORE +symbols") added a new FB_CORE Kconfig symbol, that can be enabled to only +have fbcon/VT and DRM fbdev emulation, but without support for any legacy +fbdev driver. 
+ +Unfortunately, it missed to change the CONFIG_FB in arch/sparc makefiles, +which leads to the following linking error in some sparc64 configurations: + + sparc64-linux-ld: drivers/video/fbdev/core/fbcon.o: in function `fbcon_fb_registered': +>> fbcon.c:(.text+0x4f60): undefined reference to `fb_is_primary_device' + +Fixes: 55bffc8170bb ("fbdev: Split frame buffer support in FB and FB_CORE symbols") +Reported-by: kernel test robot +Closes: https://lore.kernel.org/r/202401290306.IV8rhJ02-lkp@intel.com/ +Signed-off-by: Javier Martinez Canillas +Reviewed-by: Thomas Zimmermann +Acked-by: Arnd Bergmann +Cc: # v6.6+ +Signed-off-by: Thomas Zimmermann +Link: https://patchwork.freedesktop.org/patch/msgid/20240220095428.3341195-1-javierm@redhat.com +Signed-off-by: Greg Kroah-Hartman +--- + arch/sparc/Makefile | 2 +- + arch/sparc/video/Makefile | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) + +--- a/arch/sparc/Makefile ++++ b/arch/sparc/Makefile +@@ -60,7 +60,7 @@ libs-y += arch/sparc/pro + libs-y += arch/sparc/lib/ + + drivers-$(CONFIG_PM) += arch/sparc/power/ +-drivers-$(CONFIG_FB) += arch/sparc/video/ ++drivers-$(CONFIG_FB_CORE) += arch/sparc/video/ + + boot := arch/sparc/boot + +--- a/arch/sparc/video/Makefile ++++ b/arch/sparc/video/Makefile +@@ -1,3 +1,3 @@ + # SPDX-License-Identifier: GPL-2.0-only + +-obj-$(CONFIG_FB) += fbdev.o ++obj-$(CONFIG_FB_CORE) += fbdev.o diff --git a/queue-6.7/x86-bugs-add-asm-helpers-for-executing-verw.patch b/queue-6.7/x86-bugs-add-asm-helpers-for-executing-verw.patch new file mode 100644 index 00000000000..fb6aba1abe6 --- /dev/null +++ b/queue-6.7/x86-bugs-add-asm-helpers-for-executing-verw.patch @@ -0,0 +1,120 @@ +From baf8361e54550a48a7087b603313ad013cc13386 Mon Sep 17 00:00:00 2001 +From: Pawan Gupta +Date: Tue, 13 Feb 2024 18:21:35 -0800 +Subject: x86/bugs: Add asm helpers for executing VERW + +From: Pawan Gupta + +commit baf8361e54550a48a7087b603313ad013cc13386 upstream. 
+ +MDS mitigation requires clearing the CPU buffers before returning to +user. This needs to be done late in the exit-to-user path. Current +location of VERW leaves a possibility of kernel data ending up in CPU +buffers for memory accesses done after VERW such as: + + 1. Kernel data accessed by an NMI between VERW and return-to-user can + remain in CPU buffers since NMI returning to kernel does not + execute VERW to clear CPU buffers. + 2. Alyssa reported that after VERW is executed, + CONFIG_GCC_PLUGIN_STACKLEAK=y scrubs the stack used by a system + call. Memory accesses during stack scrubbing can move kernel stack + contents into CPU buffers. + 3. When caller saved registers are restored after a return from + function executing VERW, the kernel stack accesses can remain in + CPU buffers(since they occur after VERW). + +To fix this VERW needs to be moved very late in exit-to-user path. + +In preparation for moving VERW to entry/exit asm code, create macros +that can be used in asm. Also make VERW patching depend on a new feature +flag X86_FEATURE_CLEAR_CPU_BUF. + +Reported-by: Alyssa Milburn +Suggested-by: Andrew Cooper +Suggested-by: Peter Zijlstra +Signed-off-by: Pawan Gupta +Signed-off-by: Dave Hansen +Link: https://lore.kernel.org/all/20240213-delay-verw-v8-1-a6216d83edb7%40linux.intel.com +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/entry/entry.S | 23 +++++++++++++++++++++++ + arch/x86/include/asm/cpufeatures.h | 2 +- + arch/x86/include/asm/nospec-branch.h | 13 +++++++++++++ + 3 files changed, 37 insertions(+), 1 deletion(-) + +--- a/arch/x86/entry/entry.S ++++ b/arch/x86/entry/entry.S +@@ -6,6 +6,9 @@ + #include + #include + #include ++#include ++#include ++#include + + .pushsection .noinstr.text, "ax" + +@@ -20,3 +23,23 @@ SYM_FUNC_END(entry_ibpb) + EXPORT_SYMBOL_GPL(entry_ibpb); + + .popsection ++ ++/* ++ * Define the VERW operand that is disguised as entry code so that ++ * it can be referenced with KPTI enabled. 
This ensure VERW can be ++ * used late in exit-to-user path after page tables are switched. ++ */ ++.pushsection .entry.text, "ax" ++ ++.align L1_CACHE_BYTES, 0xcc ++SYM_CODE_START_NOALIGN(mds_verw_sel) ++ UNWIND_HINT_UNDEFINED ++ ANNOTATE_NOENDBR ++ .word __KERNEL_DS ++.align L1_CACHE_BYTES, 0xcc ++SYM_CODE_END(mds_verw_sel); ++/* For KVM */ ++EXPORT_SYMBOL_GPL(mds_verw_sel); ++ ++.popsection ++ +--- a/arch/x86/include/asm/cpufeatures.h ++++ b/arch/x86/include/asm/cpufeatures.h +@@ -97,7 +97,7 @@ + #define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in IA32 userspace */ + #define X86_FEATURE_REP_GOOD ( 3*32+16) /* REP microcode works well */ + #define X86_FEATURE_AMD_LBR_V2 ( 3*32+17) /* AMD Last Branch Record Extension Version 2 */ +-/* FREE, was #define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) "" LFENCE synchronizes RDTSC */ ++#define X86_FEATURE_CLEAR_CPU_BUF ( 3*32+18) /* "" Clear CPU buffers using VERW */ + #define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */ + #define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ + #define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */ +--- a/arch/x86/include/asm/nospec-branch.h ++++ b/arch/x86/include/asm/nospec-branch.h +@@ -315,6 +315,17 @@ + #endif + .endm + ++/* ++ * Macro to execute VERW instruction that mitigate transient data sampling ++ * attacks such as MDS. On affected systems a microcode update overloaded VERW ++ * instruction to also clear the CPU buffers. VERW clobbers CFLAGS.ZF. ++ * ++ * Note: Only the memory operand variant of VERW clears the CPU buffers. ++ */ ++.macro CLEAR_CPU_BUFFERS ++ ALTERNATIVE "", __stringify(verw _ASM_RIP(mds_verw_sel)), X86_FEATURE_CLEAR_CPU_BUF ++.endm ++ + #else /* __ASSEMBLY__ */ + + #define ANNOTATE_RETPOLINE_SAFE \ +@@ -536,6 +547,8 @@ DECLARE_STATIC_KEY_FALSE(switch_mm_cond_ + + DECLARE_STATIC_KEY_FALSE(mmio_stale_data_clear); + ++extern u16 mds_verw_sel; ++ + #include + + /**