From: Greg Kroah-Hartman Date: Sat, 25 Jun 2022 14:35:12 +0000 (+0200) Subject: 5.15-stable patches X-Git-Tag: v5.10.126~25 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=0403df6abac155f7d45c59891334c212da51361c;p=thirdparty%2Fkernel%2Fstable-queue.git 5.15-stable patches added patches: 9p-fix-fid-refcount-leak-in-v9fs_vfs_atomic_open_dotl.patch 9p-fix-fid-refcount-leak-in-v9fs_vfs_get_link.patch 9p-fix-refcounting-during-full-path-walks-for-fid-lookups.patch ata-libata-add-qc-flags-in-ata_qc_complete_template-tracepoint.patch btrfs-add-error-messages-to-all-unrecognized-mount-options.patch btrfs-fix-hang-during-unmount-when-block-group-reclaim-task-is-running.patch btrfs-prevent-remounting-to-v1-space-cache-for-subpage-mount.patch dm-era-commit-metadata-in-postsuspend-after-worker-stops.patch dm-mirror-log-clear-log-bits-up-to-bits_per_long-boundary.patch maintainers-add-new-iommu-development-mailing-list.patch mmc-mediatek-wait-dma-stop-bit-reset-to-0.patch mmc-sdhci-pci-o2micro-fix-card-detect-by-dealing-with-debouncing.patch mtd-rawnand-gpmi-fix-setting-busy-timeout-setting.patch net-openvswitch-fix-parsing-of-nw_proto-for-ipv6-fragments.patch scsi-ibmvfc-allocate-free-queue-resource-only-during-probe-remove.patch scsi-ibmvfc-store-vhost-pointer-during-subcrq-allocation.patch tracing-kprobes-check-whether-get_kretprobe-returns-null-in-kretprobe_dispatcher.patch xen-gntdev-avoid-blocking-in-unmap_grant_pages.patch --- diff --git a/queue-5.15/9p-fix-fid-refcount-leak-in-v9fs_vfs_atomic_open_dotl.patch b/queue-5.15/9p-fix-fid-refcount-leak-in-v9fs_vfs_atomic_open_dotl.patch new file mode 100644 index 00000000000..01406799879 --- /dev/null +++ b/queue-5.15/9p-fix-fid-refcount-leak-in-v9fs_vfs_atomic_open_dotl.patch @@ -0,0 +1,51 @@ +From beca774fc51a9ba8abbc869cf0c3d965ff17cd24 Mon Sep 17 00:00:00 2001 +From: Dominique Martinet +Date: Sun, 12 Jun 2022 16:00:05 +0900 +Subject: 9p: fix fid refcount leak in v9fs_vfs_atomic_open_dotl + +From: Dominique Martinet 
+ +commit beca774fc51a9ba8abbc869cf0c3d965ff17cd24 upstream. + +We need to release directory fid if we fail halfway through open + +This fixes fid leaking with xfstests generic 531 + +Link: https://lkml.kernel.org/r/20220612085330.1451496-2-asmadeus@codewreck.org +Fixes: 6636b6dcc3db ("9p: add refcount to p9_fid struct") +Cc: stable@vger.kernel.org +Reported-by: Tyler Hicks +Reviewed-by: Tyler Hicks +Reviewed-by: Christian Schoenebeck +Signed-off-by: Dominique Martinet +Signed-off-by: Greg Kroah-Hartman +--- + fs/9p/vfs_inode_dotl.c | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/fs/9p/vfs_inode_dotl.c ++++ b/fs/9p/vfs_inode_dotl.c +@@ -276,6 +276,7 @@ v9fs_vfs_atomic_open_dotl(struct inode * + if (IS_ERR(ofid)) { + err = PTR_ERR(ofid); + p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); ++ p9_client_clunk(dfid); + goto out; + } + +@@ -287,6 +288,7 @@ v9fs_vfs_atomic_open_dotl(struct inode * + if (err) { + p9_debug(P9_DEBUG_VFS, "Failed to get acl values in creat %d\n", + err); ++ p9_client_clunk(dfid); + goto error; + } + err = p9_client_create_dotl(ofid, name, v9fs_open_to_dotl_flags(flags), +@@ -294,6 +296,7 @@ v9fs_vfs_atomic_open_dotl(struct inode * + if (err < 0) { + p9_debug(P9_DEBUG_VFS, "p9_client_open_dotl failed in creat %d\n", + err); ++ p9_client_clunk(dfid); + goto error; + } + v9fs_invalidate_inode_attr(dir); diff --git a/queue-5.15/9p-fix-fid-refcount-leak-in-v9fs_vfs_get_link.patch b/queue-5.15/9p-fix-fid-refcount-leak-in-v9fs_vfs_get_link.patch new file mode 100644 index 00000000000..dafcd5ec017 --- /dev/null +++ b/queue-5.15/9p-fix-fid-refcount-leak-in-v9fs_vfs_get_link.patch @@ -0,0 +1,45 @@ +From e5690f263208c5abce7451370b7786eb25b405eb Mon Sep 17 00:00:00 2001 +From: Dominique Martinet +Date: Sun, 12 Jun 2022 17:14:55 +0900 +Subject: 9p: fix fid refcount leak in v9fs_vfs_get_link + +From: Dominique Martinet + +commit e5690f263208c5abce7451370b7786eb25b405eb upstream. 
+ +we check for protocol version later than required, after a fid has +been obtained. Just move the version check earlier. + +Link: https://lkml.kernel.org/r/20220612085330.1451496-3-asmadeus@codewreck.org +Fixes: 6636b6dcc3db ("9p: add refcount to p9_fid struct") +Cc: stable@vger.kernel.org +Reviewed-by: Tyler Hicks +Reviewed-by: Christian Schoenebeck +Signed-off-by: Dominique Martinet +Signed-off-by: Greg Kroah-Hartman +--- + fs/9p/vfs_inode.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +--- a/fs/9p/vfs_inode.c ++++ b/fs/9p/vfs_inode.c +@@ -1228,15 +1228,15 @@ static const char *v9fs_vfs_get_link(str + return ERR_PTR(-ECHILD); + + v9ses = v9fs_dentry2v9ses(dentry); +- fid = v9fs_fid_lookup(dentry); ++ if (!v9fs_proto_dotu(v9ses)) ++ return ERR_PTR(-EBADF); ++ + p9_debug(P9_DEBUG_VFS, "%pd\n", dentry); ++ fid = v9fs_fid_lookup(dentry); + + if (IS_ERR(fid)) + return ERR_CAST(fid); + +- if (!v9fs_proto_dotu(v9ses)) +- return ERR_PTR(-EBADF); +- + st = p9_client_stat(fid); + p9_client_clunk(fid); + if (IS_ERR(st)) diff --git a/queue-5.15/9p-fix-refcounting-during-full-path-walks-for-fid-lookups.patch b/queue-5.15/9p-fix-refcounting-during-full-path-walks-for-fid-lookups.patch new file mode 100644 index 00000000000..beedca3dbf3 --- /dev/null +++ b/queue-5.15/9p-fix-refcounting-during-full-path-walks-for-fid-lookups.patch @@ -0,0 +1,103 @@ +From 2a3dcbccd64ba35c045fac92272ff981c4cbef44 Mon Sep 17 00:00:00 2001 +From: Tyler Hicks +Date: Thu, 26 May 2022 18:59:59 -0500 +Subject: 9p: Fix refcounting during full path walks for fid lookups + +From: Tyler Hicks + +commit 2a3dcbccd64ba35c045fac92272ff981c4cbef44 upstream. + +Decrement the refcount of the parent dentry's fid after walking +each path component during a full path walk for a lookup. 
Failure to do +so can lead to fids that are not clunked until the filesystem is +unmounted, as indicated by this warning: + + 9pnet: found fid 3 not clunked + +The improper refcounting after walking resulted in open(2) returning +-EIO on any directories underneath the mount point when using the virtio +transport. When using the fd transport, there's no apparent issue until +the filesystem is unmounted and the warning above is emitted to the logs. + +In some cases, the user may not yet be attached to the filesystem and a +new root fid, associated with the user, is created and attached to the +root dentry before the full path walk is performed. Increment the new +root fid's refcount to two in that situation so that it can be safely +decremented to one after it is used for the walk operation. The new fid +will still be attached to the root dentry when +v9fs_fid_lookup_with_uid() returns so a final refcount of one is +correct/expected. + +Link: https://lkml.kernel.org/r/20220527000003.355812-2-tyhicks@linux.microsoft.com +Link: https://lkml.kernel.org/r/20220612085330.1451496-4-asmadeus@codewreck.org +Fixes: 6636b6dcc3db ("9p: add refcount to p9_fid struct") +Cc: stable@vger.kernel.org +Signed-off-by: Tyler Hicks +Reviewed-by: Christian Schoenebeck +[Dominique: fix clunking fid multiple times discussed in second link] +Signed-off-by: Dominique Martinet +Signed-off-by: Greg Kroah-Hartman +--- + fs/9p/fid.c | 22 +++++++++------------- + 1 file changed, 9 insertions(+), 13 deletions(-) + +--- a/fs/9p/fid.c ++++ b/fs/9p/fid.c +@@ -151,7 +151,7 @@ static struct p9_fid *v9fs_fid_lookup_wi + const unsigned char **wnames, *uname; + int i, n, l, clone, access; + struct v9fs_session_info *v9ses; +- struct p9_fid *fid, *old_fid = NULL; ++ struct p9_fid *fid, *old_fid; + + v9ses = v9fs_dentry2v9ses(dentry); + access = v9ses->flags & V9FS_ACCESS_MASK; +@@ -193,13 +193,12 @@ static struct p9_fid *v9fs_fid_lookup_wi + if (IS_ERR(fid)) + return fid; + ++ refcount_inc(&fid->count); + 
v9fs_fid_add(dentry->d_sb->s_root, fid); + } + /* If we are root ourself just return that */ +- if (dentry->d_sb->s_root == dentry) { +- refcount_inc(&fid->count); ++ if (dentry->d_sb->s_root == dentry) + return fid; +- } + /* + * Do a multipath walk with attached root. + * When walking parent we need to make sure we +@@ -211,6 +210,7 @@ static struct p9_fid *v9fs_fid_lookup_wi + fid = ERR_PTR(n); + goto err_out; + } ++ old_fid = fid; + clone = 1; + i = 0; + while (i < n) { +@@ -220,19 +220,15 @@ static struct p9_fid *v9fs_fid_lookup_wi + * walk to ensure none of the patch component change + */ + fid = p9_client_walk(fid, l, &wnames[i], clone); ++ /* non-cloning walk will return the same fid */ ++ if (fid != old_fid) { ++ p9_client_clunk(old_fid); ++ old_fid = fid; ++ } + if (IS_ERR(fid)) { +- if (old_fid) { +- /* +- * If we fail, clunk fid which are mapping +- * to path component and not the last component +- * of the path. +- */ +- p9_client_clunk(old_fid); +- } + kfree(wnames); + goto err_out; + } +- old_fid = fid; + i += l; + clone = 0; + } diff --git a/queue-5.15/ata-libata-add-qc-flags-in-ata_qc_complete_template-tracepoint.patch b/queue-5.15/ata-libata-add-qc-flags-in-ata_qc_complete_template-tracepoint.patch new file mode 100644 index 00000000000..f54460831af --- /dev/null +++ b/queue-5.15/ata-libata-add-qc-flags-in-ata_qc_complete_template-tracepoint.patch @@ -0,0 +1,30 @@ +From 540a92bfe6dab7310b9df2e488ba247d784d0163 Mon Sep 17 00:00:00 2001 +From: Edward Wu +Date: Fri, 17 Jun 2022 11:32:20 +0800 +Subject: ata: libata: add qc->flags in ata_qc_complete_template tracepoint + +From: Edward Wu + +commit 540a92bfe6dab7310b9df2e488ba247d784d0163 upstream. 
+ +Add flags value to check the result of ata completion + +Fixes: 255c03d15a29 ("libata: Add tracepoints") +Cc: stable@vger.kernel.org +Signed-off-by: Edward Wu +Signed-off-by: Damien Le Moal +Signed-off-by: Greg Kroah-Hartman +--- + include/trace/events/libata.h | 1 + + 1 file changed, 1 insertion(+) + +--- a/include/trace/events/libata.h ++++ b/include/trace/events/libata.h +@@ -249,6 +249,7 @@ DECLARE_EVENT_CLASS(ata_qc_complete_temp + __entry->hob_feature = qc->result_tf.hob_feature; + __entry->nsect = qc->result_tf.nsect; + __entry->hob_nsect = qc->result_tf.hob_nsect; ++ __entry->flags = qc->flags; + ), + + TP_printk("ata_port=%u ata_dev=%u tag=%d flags=%s status=%s " \ diff --git a/queue-5.15/btrfs-add-error-messages-to-all-unrecognized-mount-options.patch b/queue-5.15/btrfs-add-error-messages-to-all-unrecognized-mount-options.patch new file mode 100644 index 00000000000..e5d92dbe656 --- /dev/null +++ b/queue-5.15/btrfs-add-error-messages-to-all-unrecognized-mount-options.patch @@ -0,0 +1,150 @@ +From e3a4167c880cf889f66887a152799df4d609dd21 Mon Sep 17 00:00:00 2001 +From: David Sterba +Date: Thu, 2 Jun 2022 23:57:17 +0200 +Subject: btrfs: add error messages to all unrecognized mount options + +From: David Sterba + +commit e3a4167c880cf889f66887a152799df4d609dd21 upstream. + +Almost none of the errors stemming from a valid mount option but wrong +value prints a descriptive message which would help to identify why +mount failed. Like in the linked report: + + $ uname -r + v4.19 + $ mount -o compress=zstd /dev/sdb /mnt + mount: /mnt: wrong fs type, bad option, bad superblock on + /dev/sdb, missing codepage or helper program, or other error. + $ dmesg + ... + BTRFS error (device sdb): open_ctree failed + +Errors caused by memory allocation failures are left out as it's not a +user error so reporting that would be confusing. 
+ +Link: https://lore.kernel.org/linux-btrfs/9c3fec36-fc61-3a33-4977-a7e207c3fa4e@gmx.de/ +CC: stable@vger.kernel.org # 4.9+ +Reviewed-by: Qu Wenruo +Reviewed-by: Nikolay Borisov +Reviewed-by: Anand Jain +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/super.c | 39 ++++++++++++++++++++++++++++++++------- + 1 file changed, 32 insertions(+), 7 deletions(-) + +--- a/fs/btrfs/super.c ++++ b/fs/btrfs/super.c +@@ -712,6 +712,8 @@ int btrfs_parse_options(struct btrfs_fs_ + compress_force = false; + no_compress++; + } else { ++ btrfs_err(info, "unrecognized compression value %s", ++ args[0].from); + ret = -EINVAL; + goto out; + } +@@ -770,8 +772,11 @@ int btrfs_parse_options(struct btrfs_fs_ + case Opt_thread_pool: + ret = match_int(&args[0], &intarg); + if (ret) { ++ btrfs_err(info, "unrecognized thread_pool value %s", ++ args[0].from); + goto out; + } else if (intarg == 0) { ++ btrfs_err(info, "invalid value 0 for thread_pool"); + ret = -EINVAL; + goto out; + } +@@ -832,8 +837,11 @@ int btrfs_parse_options(struct btrfs_fs_ + break; + case Opt_ratio: + ret = match_int(&args[0], &intarg); +- if (ret) ++ if (ret) { ++ btrfs_err(info, "unrecognized metadata_ratio value %s", ++ args[0].from); + goto out; ++ } + info->metadata_ratio = intarg; + btrfs_info(info, "metadata ratio %u", + info->metadata_ratio); +@@ -850,6 +858,8 @@ int btrfs_parse_options(struct btrfs_fs_ + btrfs_set_and_info(info, DISCARD_ASYNC, + "turning on async discard"); + } else { ++ btrfs_err(info, "unrecognized discard mode value %s", ++ args[0].from); + ret = -EINVAL; + goto out; + } +@@ -874,6 +884,8 @@ int btrfs_parse_options(struct btrfs_fs_ + btrfs_set_and_info(info, FREE_SPACE_TREE, + "enabling free space tree"); + } else { ++ btrfs_err(info, "unrecognized space_cache value %s", ++ args[0].from); + ret = -EINVAL; + goto out; + } +@@ -943,8 +955,12 @@ int btrfs_parse_options(struct btrfs_fs_ + break; + case Opt_check_integrity_print_mask: + ret = match_int(&args[0], 
&intarg); +- if (ret) ++ if (ret) { ++ btrfs_err(info, ++ "unrecognized check_integrity_print_mask value %s", ++ args[0].from); + goto out; ++ } + info->check_integrity_print_mask = intarg; + btrfs_info(info, "check_integrity_print_mask 0x%x", + info->check_integrity_print_mask); +@@ -959,13 +975,15 @@ int btrfs_parse_options(struct btrfs_fs_ + goto out; + #endif + case Opt_fatal_errors: +- if (strcmp(args[0].from, "panic") == 0) ++ if (strcmp(args[0].from, "panic") == 0) { + btrfs_set_opt(info->mount_opt, + PANIC_ON_FATAL_ERROR); +- else if (strcmp(args[0].from, "bug") == 0) ++ } else if (strcmp(args[0].from, "bug") == 0) { + btrfs_clear_opt(info->mount_opt, + PANIC_ON_FATAL_ERROR); +- else { ++ } else { ++ btrfs_err(info, "unrecognized fatal_errors value %s", ++ args[0].from); + ret = -EINVAL; + goto out; + } +@@ -973,8 +991,12 @@ int btrfs_parse_options(struct btrfs_fs_ + case Opt_commit_interval: + intarg = 0; + ret = match_int(&args[0], &intarg); +- if (ret) ++ if (ret) { ++ btrfs_err(info, "unrecognized commit_interval value %s", ++ args[0].from); ++ ret = -EINVAL; + goto out; ++ } + if (intarg == 0) { + btrfs_info(info, + "using default commit interval %us", +@@ -988,8 +1010,11 @@ int btrfs_parse_options(struct btrfs_fs_ + break; + case Opt_rescue: + ret = parse_rescue_options(info, args[0].from); +- if (ret < 0) ++ if (ret < 0) { ++ btrfs_err(info, "unrecognized rescue value %s", ++ args[0].from); + goto out; ++ } + break; + #ifdef CONFIG_BTRFS_DEBUG + case Opt_fragment_all: diff --git a/queue-5.15/btrfs-fix-hang-during-unmount-when-block-group-reclaim-task-is-running.patch b/queue-5.15/btrfs-fix-hang-during-unmount-when-block-group-reclaim-task-is-running.patch new file mode 100644 index 00000000000..f0346dc1cfe --- /dev/null +++ b/queue-5.15/btrfs-fix-hang-during-unmount-when-block-group-reclaim-task-is-running.patch @@ -0,0 +1,142 @@ +From 31e70e527806c546a72262f2fc3d982ee23c42d3 Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Wed, 18 May 2022 
10:41:48 +0100 +Subject: btrfs: fix hang during unmount when block group reclaim task is running + +From: Filipe Manana + +commit 31e70e527806c546a72262f2fc3d982ee23c42d3 upstream. + +When we start an unmount, at close_ctree(), if we have the reclaim task +running and in the middle of a data block group relocation, we can trigger +a deadlock when stopping an async reclaim task, producing a trace like the +following: + +[629724.498185] task:kworker/u16:7 state:D stack: 0 pid:681170 ppid: 2 flags:0x00004000 +[629724.499760] Workqueue: events_unbound btrfs_async_reclaim_metadata_space [btrfs] +[629724.501267] Call Trace: +[629724.501759] +[629724.502174] __schedule+0x3cb/0xed0 +[629724.502842] schedule+0x4e/0xb0 +[629724.503447] btrfs_wait_on_delayed_iputs+0x7c/0xc0 [btrfs] +[629724.504534] ? prepare_to_wait_exclusive+0xc0/0xc0 +[629724.505442] flush_space+0x423/0x630 [btrfs] +[629724.506296] ? rcu_read_unlock_trace_special+0x20/0x50 +[629724.507259] ? lock_release+0x220/0x4a0 +[629724.507932] ? btrfs_get_alloc_profile+0xb3/0x290 [btrfs] +[629724.508940] ? do_raw_spin_unlock+0x4b/0xa0 +[629724.509688] btrfs_async_reclaim_metadata_space+0x139/0x320 [btrfs] +[629724.510922] process_one_work+0x252/0x5a0 +[629724.511694] ? process_one_work+0x5a0/0x5a0 +[629724.512508] worker_thread+0x52/0x3b0 +[629724.513220] ? process_one_work+0x5a0/0x5a0 +[629724.514021] kthread+0xf2/0x120 +[629724.514627] ? kthread_complete_and_exit+0x20/0x20 +[629724.515526] ret_from_fork+0x22/0x30 +[629724.516236] +[629724.516694] task:umount state:D stack: 0 pid:719055 ppid:695412 flags:0x00004000 +[629724.518269] Call Trace: +[629724.518746] +[629724.519160] __schedule+0x3cb/0xed0 +[629724.519835] schedule+0x4e/0xb0 +[629724.520467] schedule_timeout+0xed/0x130 +[629724.521221] ? lock_release+0x220/0x4a0 +[629724.521946] ? lock_acquired+0x19c/0x420 +[629724.522662] ? trace_hardirqs_on+0x1b/0xe0 +[629724.523411] __wait_for_common+0xaf/0x1f0 +[629724.524189] ? 
usleep_range_state+0xb0/0xb0 +[629724.524997] __flush_work+0x26d/0x530 +[629724.525698] ? flush_workqueue_prep_pwqs+0x140/0x140 +[629724.526580] ? lock_acquire+0x1a0/0x310 +[629724.527324] __cancel_work_timer+0x137/0x1c0 +[629724.528190] close_ctree+0xfd/0x531 [btrfs] +[629724.529000] ? evict_inodes+0x166/0x1c0 +[629724.529510] generic_shutdown_super+0x74/0x120 +[629724.530103] kill_anon_super+0x14/0x30 +[629724.530611] btrfs_kill_super+0x12/0x20 [btrfs] +[629724.531246] deactivate_locked_super+0x31/0xa0 +[629724.531817] cleanup_mnt+0x147/0x1c0 +[629724.532319] task_work_run+0x5c/0xa0 +[629724.532984] exit_to_user_mode_prepare+0x1a6/0x1b0 +[629724.533598] syscall_exit_to_user_mode+0x16/0x40 +[629724.534200] do_syscall_64+0x48/0x90 +[629724.534667] entry_SYSCALL_64_after_hwframe+0x44/0xae +[629724.535318] RIP: 0033:0x7fa2b90437a7 +[629724.535804] RSP: 002b:00007ffe0b7e4458 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6 +[629724.536912] RAX: 0000000000000000 RBX: 00007fa2b9182264 RCX: 00007fa2b90437a7 +[629724.538156] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000555d6cf20dd0 +[629724.539053] RBP: 0000555d6cf20ba0 R08: 0000000000000000 R09: 00007ffe0b7e3200 +[629724.539956] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 +[629724.540883] R13: 0000555d6cf20dd0 R14: 0000555d6cf20cb0 R15: 0000000000000000 +[629724.541796] + +This happens because: + +1) Before entering close_ctree() we have the async block group reclaim + task running and relocating a data block group; + +2) There's an async metadata (or data) space reclaim task running; + +3) We enter close_ctree() and park the cleaner kthread; + +4) The async space reclaim task is at flush_space() and runs all the + existing delayed iputs; + +5) Before the async space reclaim task calls + btrfs_wait_on_delayed_iputs(), the block group reclaim task which is + doing the data block group relocation, creates a delayed iput at + replace_file_extents() (called when COWing leaves that have file extent + 
items pointing to relocated data extents, during the merging phase + of relocation roots); + +6) The async space reclaim task blocks at + btrfs_wait_on_delayed_iputs(), since we have a new delayed iput; + +7) The task at close_ctree() then calls cancel_work_sync() to stop the + async space reclaim task, but it blocks since that task is waiting for + the delayed iput to be run; + +8) The delayed iput is never run because the cleaner kthread is parked, + and no one else runs delayed iputs, resulting in a hang. + +So fix this by stopping the async block group reclaim task before we +park the cleaner kthread. + +Fixes: 18bb8bbf13c183 ("btrfs: zoned: automatically reclaim zones") +CC: stable@vger.kernel.org # 5.15+ +Signed-off-by: Filipe Manana +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/disk-io.c | 13 +++++++++++-- + 1 file changed, 11 insertions(+), 2 deletions(-) + +--- a/fs/btrfs/disk-io.c ++++ b/fs/btrfs/disk-io.c +@@ -4360,6 +4360,17 @@ void __cold close_ctree(struct btrfs_fs_ + int ret; + + set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags); ++ ++ /* ++ * We may have the reclaim task running and relocating a data block group, ++ * in which case it may create delayed iputs. So stop it before we park ++ * the cleaner kthread otherwise we can get new delayed iputs after ++ * parking the cleaner, and that can make the async reclaim task to hang ++ * if it's waiting for delayed iputs to complete, since the cleaner is ++ * parked and can not run delayed iputs - this will make us hang when ++ * trying to stop the async reclaim task. ++ */ ++ cancel_work_sync(&fs_info->reclaim_bgs_work); + /* + * We don't want the cleaner to start new transactions, add more delayed + * iputs, etc. while we're closing.
We can't use kthread_stop() yet +@@ -4400,8 +4411,6 @@ void __cold close_ctree(struct btrfs_fs_ + cancel_work_sync(&fs_info->async_data_reclaim_work); + cancel_work_sync(&fs_info->preempt_reclaim_work); + +- cancel_work_sync(&fs_info->reclaim_bgs_work); +- + /* Cancel or finish ongoing discard work */ + btrfs_discard_cleanup(fs_info); + diff --git a/queue-5.15/btrfs-prevent-remounting-to-v1-space-cache-for-subpage-mount.patch b/queue-5.15/btrfs-prevent-remounting-to-v1-space-cache-for-subpage-mount.patch new file mode 100644 index 00000000000..d24f9232a2b --- /dev/null +++ b/queue-5.15/btrfs-prevent-remounting-to-v1-space-cache-for-subpage-mount.patch @@ -0,0 +1,49 @@ +From 0591f04036218d572d54349ea8c7914ad9c82b2b Mon Sep 17 00:00:00 2001 +From: Qu Wenruo +Date: Wed, 18 May 2022 13:03:09 +0800 +Subject: btrfs: prevent remounting to v1 space cache for subpage mount + +From: Qu Wenruo + +commit 0591f04036218d572d54349ea8c7914ad9c82b2b upstream. + +Upstream commit 9f73f1aef98b ("btrfs: force v2 space cache usage for +subpage mount") forces subpage mount to use v2 cache, to avoid +deprecated v1 cache which doesn't support subpage properly. + +But there is a loophole that user can still remount to v1 cache. + +The existing check will only give users a warning, but does not really +prevent to do the remount. + +Although remounting to v1 will not cause any problems since the v1 cache +will always be marked invalid when mounted with a different page size, +it's still better to prevent v1 cache at all for subpage mounts. 
+ +Fixes: 9f73f1aef98b ("btrfs: force v2 space cache usage for subpage mount") +CC: stable@vger.kernel.org # 5.15+ +Signed-off-by: Qu Wenruo +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/super.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +--- a/fs/btrfs/super.c ++++ b/fs/btrfs/super.c +@@ -1917,6 +1917,14 @@ static int btrfs_remount(struct super_bl + if (ret) + goto restore; + ++ /* V1 cache is not supported for subpage mount. */ ++ if (fs_info->sectorsize < PAGE_SIZE && btrfs_test_opt(fs_info, SPACE_CACHE)) { ++ btrfs_warn(fs_info, ++ "v1 space cache is not supported for page size %lu with sectorsize %u", ++ PAGE_SIZE, fs_info->sectorsize); ++ ret = -EINVAL; ++ goto restore; ++ } + btrfs_remount_begin(fs_info, old_opts, *flags); + btrfs_resize_thread_pool(fs_info, + fs_info->thread_pool_size, old_thread_pool_size); diff --git a/queue-5.15/dm-era-commit-metadata-in-postsuspend-after-worker-stops.patch b/queue-5.15/dm-era-commit-metadata-in-postsuspend-after-worker-stops.patch new file mode 100644 index 00000000000..c90a53fa46c --- /dev/null +++ b/queue-5.15/dm-era-commit-metadata-in-postsuspend-after-worker-stops.patch @@ -0,0 +1,91 @@ +From 9ae6e8b1c9bbf6874163d1243e393137313762b7 Mon Sep 17 00:00:00 2001 +From: Nikos Tsironis +Date: Tue, 21 Jun 2022 15:24:03 +0300 +Subject: dm era: commit metadata in postsuspend after worker stops + +From: Nikos Tsironis + +commit 9ae6e8b1c9bbf6874163d1243e393137313762b7 upstream. + +During postsuspend dm-era does the following: + +1. Archives the current era +2. Commits the metadata, as part of the RPC call for archiving the + current era +3. Stops the worker + +Until the worker stops, it might write to the metadata again. Moreover, +these writes are not flushed to disk immediately, but are cached by the +dm-bufio client, which writes them back asynchronously. 
+ +As a result, the committed metadata of a suspended dm-era device might +not be consistent with the in-core metadata. + +In some cases, this can result in the corruption of the on-disk +metadata. Suppose the following sequence of events: + +1. Load a new table, e.g. a snapshot-origin table, to a device with a + dm-era table +2. Suspend the device +3. dm-era commits its metadata, but the worker does a few more metadata + writes until it stops, as part of digesting an archived writeset +4. These writes are cached by the dm-bufio client +5. Load the dm-era table to another device. +6. The new instance of the dm-era target loads the committed, on-disk + metadata, which don't include the extra writes done by the worker + after the metadata commit. +7. Resume the new device +8. The new dm-era target instance starts using the metadata +9. Resume the original device +10. The destructor of the old dm-era target instance is called and + destroys the dm-bufio client, which results in flushing the cached + writes to disk +11. These writes might overwrite the writes done by the new dm-era + instance, hence corrupting its metadata. + +Fix this by committing the metadata after the worker stops running. + +stop_worker uses flush_workqueue to flush the current work. However, the +work item may re-queue itself and flush_workqueue doesn't wait for +re-queued works to finish. + +This could result in the worker changing the metadata after they have +been committed, or writing to the metadata concurrently with the commit +in the postsuspend thread. + +Use drain_workqueue instead, which waits until the work and all +re-queued works finish. 
+ +Fixes: eec40579d8487 ("dm: add era target") +Cc: stable@vger.kernel.org # v3.15+ +Signed-off-by: Nikos Tsironis +Signed-off-by: Mike Snitzer +Signed-off-by: Greg Kroah-Hartman +--- + drivers/md/dm-era-target.c | 8 +++++++- + 1 file changed, 7 insertions(+), 1 deletion(-) + +--- a/drivers/md/dm-era-target.c ++++ b/drivers/md/dm-era-target.c +@@ -1400,7 +1400,7 @@ static void start_worker(struct era *era + static void stop_worker(struct era *era) + { + atomic_set(&era->suspended, 1); +- flush_workqueue(era->wq); ++ drain_workqueue(era->wq); + } + + /*---------------------------------------------------------------- +@@ -1570,6 +1570,12 @@ static void era_postsuspend(struct dm_ta + } + + stop_worker(era); ++ ++ r = metadata_commit(era->md); ++ if (r) { ++ DMERR("%s: metadata_commit failed", __func__); ++ /* FIXME: fail mode */ ++ } + } + + static int era_preresume(struct dm_target *ti) diff --git a/queue-5.15/dm-mirror-log-clear-log-bits-up-to-bits_per_long-boundary.patch b/queue-5.15/dm-mirror-log-clear-log-bits-up-to-bits_per_long-boundary.patch new file mode 100644 index 00000000000..951b1125830 --- /dev/null +++ b/queue-5.15/dm-mirror-log-clear-log-bits-up-to-bits_per_long-boundary.patch @@ -0,0 +1,42 @@ +From 90736eb3232d208ee048493f371075e4272e0944 Mon Sep 17 00:00:00 2001 +From: Mikulas Patocka +Date: Thu, 23 Jun 2022 14:53:25 -0400 +Subject: dm mirror log: clear log bits up to BITS_PER_LONG boundary + +From: Mikulas Patocka + +commit 90736eb3232d208ee048493f371075e4272e0944 upstream. + +Commit 85e123c27d5c ("dm mirror log: round up region bitmap size to +BITS_PER_LONG") introduced a regression on 64-bit architectures in the +lvm testsuite tests: lvcreate-mirror, mirror-names and vgsplit-operation. + +If the device is shrunk, we need to clear log bits beyond the end of the +device. 
The code clears bits up to a 32-bit boundary and then calculates +lc->sync_count by summing set bits up to a 64-bit boundary (the commit +changed that; previously, this boundary was 32-bit too). So, it was using +some non-zeroed bits in the calculation and this caused misbehavior. + +Fix this regression by clearing bits up to BITS_PER_LONG boundary. + +Fixes: 85e123c27d5c ("dm mirror log: round up region bitmap size to BITS_PER_LONG") +Cc: stable@vger.kernel.org +Reported-by: Benjamin Marzinski +Signed-off-by: Mikulas Patocka +Signed-off-by: Mike Snitzer +Signed-off-by: Greg Kroah-Hartman +--- + drivers/md/dm-log.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/md/dm-log.c ++++ b/drivers/md/dm-log.c +@@ -615,7 +615,7 @@ static int disk_resume(struct dm_dirty_l + log_clear_bit(lc, lc->clean_bits, i); + + /* clear any old bits -- device has shrunk */ +- for (i = lc->region_count; i % (sizeof(*lc->clean_bits) << BYTE_SHIFT); i++) ++ for (i = lc->region_count; i % BITS_PER_LONG; i++) + log_clear_bit(lc, lc->clean_bits, i); + + /* copy clean across to sync */ diff --git a/queue-5.15/maintainers-add-new-iommu-development-mailing-list.patch b/queue-5.15/maintainers-add-new-iommu-development-mailing-list.patch new file mode 100644 index 00000000000..dc815d034c7 --- /dev/null +++ b/queue-5.15/maintainers-add-new-iommu-development-mailing-list.patch @@ -0,0 +1,113 @@ +From c242507c1b895646b4a25060df13b6214805759f Mon Sep 17 00:00:00 2001 +From: Joerg Roedel +Date: Fri, 24 Jun 2022 14:51:39 +0200 +Subject: MAINTAINERS: Add new IOMMU development mailing list + +From: Joerg Roedel + +commit c242507c1b895646b4a25060df13b6214805759f upstream. + +The IOMMU mailing list will move from lists.linux-foundation.org to +lists.linux.dev. The hard switch of the archive will happen on July +5th, but add the new list now already so that people start using the +list when sending patches. After July 5th the old list will disappear. 
+ +Cc: stable@vger.kernel.org +Signed-off-by: Joerg Roedel +Link: https://lore.kernel.org/r/20220624125139.412-1-joro@8bytes.org +Signed-off-by: Joerg Roedel +Signed-off-by: Greg Kroah-Hartman +--- + MAINTAINERS | 11 +++++++++++ + 1 file changed, 11 insertions(+) + +--- a/MAINTAINERS ++++ b/MAINTAINERS +@@ -434,6 +434,7 @@ ACPI VIOT DRIVER + M: Jean-Philippe Brucker + L: linux-acpi@vger.kernel.org + L: iommu@lists.linux-foundation.org ++L: iommu@lists.linux.dev + S: Maintained + F: drivers/acpi/viot.c + F: include/linux/acpi_viot.h +@@ -941,6 +942,7 @@ AMD IOMMU (AMD-VI) + M: Joerg Roedel + R: Suravee Suthikulpanit + L: iommu@lists.linux-foundation.org ++L: iommu@lists.linux.dev + S: Maintained + T: git git://git.kernel.org/pub/scm/linux/kernel/git/joro/iommu.git + F: drivers/iommu/amd/ +@@ -5602,6 +5604,7 @@ M: Christoph Hellwig + M: Marek Szyprowski + R: Robin Murphy + L: iommu@lists.linux-foundation.org ++L: iommu@lists.linux.dev + S: Supported + W: http://git.infradead.org/users/hch/dma-mapping.git + T: git git://git.infradead.org/users/hch/dma-mapping.git +@@ -5614,6 +5617,7 @@ F: kernel/dma/ + DMA MAPPING BENCHMARK + M: Barry Song + L: iommu@lists.linux-foundation.org ++L: iommu@lists.linux.dev + F: kernel/dma/map_benchmark.c + F: tools/testing/selftests/dma/ + +@@ -7115,6 +7119,7 @@ F: drivers/gpu/drm/exynos/exynos_dp* + EXYNOS SYSMMU (IOMMU) driver + M: Marek Szyprowski + L: iommu@lists.linux-foundation.org ++L: iommu@lists.linux.dev + S: Maintained + F: drivers/iommu/exynos-iommu.c + +@@ -9457,6 +9462,7 @@ INTEL IOMMU (VT-d) + M: David Woodhouse + M: Lu Baolu + L: iommu@lists.linux-foundation.org ++L: iommu@lists.linux.dev + S: Supported + T: git git://git.kernel.org/pub/scm/linux/kernel/git/joro/iommu.git + F: drivers/iommu/intel/ +@@ -9793,6 +9799,7 @@ IOMMU DRIVERS + M: Joerg Roedel + M: Will Deacon + L: iommu@lists.linux-foundation.org ++L: iommu@lists.linux.dev + S: Maintained + T: git git://git.kernel.org/pub/scm/linux/kernel/git/joro/iommu.git + F: 
Documentation/devicetree/bindings/iommu/ +@@ -11795,6 +11802,7 @@ F: drivers/i2c/busses/i2c-mt65xx.c + MEDIATEK IOMMU DRIVER + M: Yong Wu + L: iommu@lists.linux-foundation.org ++L: iommu@lists.linux.dev + L: linux-mediatek@lists.infradead.org (moderated for non-subscribers) + S: Supported + F: Documentation/devicetree/bindings/iommu/mediatek* +@@ -15554,6 +15562,7 @@ F: drivers/i2c/busses/i2c-qcom-cci.c + QUALCOMM IOMMU + M: Rob Clark + L: iommu@lists.linux-foundation.org ++L: iommu@lists.linux.dev + L: linux-arm-msm@vger.kernel.org + S: Maintained + F: drivers/iommu/arm/arm-smmu/qcom_iommu.c +@@ -17982,6 +17991,7 @@ F: arch/x86/boot/video* + SWIOTLB SUBSYSTEM + M: Christoph Hellwig + L: iommu@lists.linux-foundation.org ++L: iommu@lists.linux.dev + S: Supported + W: http://git.infradead.org/users/hch/dma-mapping.git + T: git git://git.infradead.org/users/hch/dma-mapping.git +@@ -20562,6 +20572,7 @@ M: Juergen Gross + M: Stefano Stabellini + L: xen-devel@lists.xenproject.org (moderated for non-subscribers) + L: iommu@lists.linux-foundation.org ++L: iommu@lists.linux.dev + S: Supported + F: arch/x86/xen/*swiotlb* + F: drivers/xen/*swiotlb* diff --git a/queue-5.15/mmc-mediatek-wait-dma-stop-bit-reset-to-0.patch b/queue-5.15/mmc-mediatek-wait-dma-stop-bit-reset-to-0.patch new file mode 100644 index 00000000000..96de112a5af --- /dev/null +++ b/queue-5.15/mmc-mediatek-wait-dma-stop-bit-reset-to-0.patch @@ -0,0 +1,88 @@ +From 89bcd9a64b849380ef57e3032b307574e48db524 Mon Sep 17 00:00:00 2001 +From: Mengqi Zhang +Date: Thu, 9 Jun 2022 19:22:39 +0800 +Subject: mmc: mediatek: wait dma stop bit reset to 0 + +From: Mengqi Zhang + +commit 89bcd9a64b849380ef57e3032b307574e48db524 upstream. + +MediaTek IP requires that after dma stop, it need to wait this dma stop +bit auto-reset to 0. When bus is in high loading state, it will take a +while for the dma stop complete. If there is no waiting operation here, +when program runs to clear fifo and reset, bus will hang. 
+ +In addition, there should be no return in msdc_data_xfer_next() if +there is data that needs to be transferred, because no matter what error occurs +here, it should continue to execute to the following mmc_request_done. +Otherwise the core layer may wait for completion forever. + +Signed-off-by: Mengqi Zhang +Cc: stable@vger.kernel.org +Link: https://lore.kernel.org/r/20220609112239.18911-1-mengqi.zhang@mediatek.com +Signed-off-by: Ulf Hansson +Signed-off-by: Greg Kroah-Hartman +--- + drivers/mmc/host/mtk-sd.c | 20 ++++++++++++-------- + 1 file changed, 12 insertions(+), 8 deletions(-) + +--- a/drivers/mmc/host/mtk-sd.c ++++ b/drivers/mmc/host/mtk-sd.c +@@ -1355,7 +1355,7 @@ static void msdc_data_xfer_next(struct m + msdc_request_done(host, mrq); + } + +-static bool msdc_data_xfer_done(struct msdc_host *host, u32 events, ++static void msdc_data_xfer_done(struct msdc_host *host, u32 events, + struct mmc_request *mrq, struct mmc_data *data) + { + struct mmc_command *stop; +@@ -1375,7 +1375,7 @@ static bool msdc_data_xfer_done(struct m + spin_unlock_irqrestore(&host->lock, flags); + + if (done) +- return true; ++ return; + stop = data->stop; + + if (check_data || (stop && stop->error)) { +@@ -1384,12 +1384,15 @@ static bool msdc_data_xfer_done(struct m + sdr_set_field(host->base + MSDC_DMA_CTRL, MSDC_DMA_CTRL_STOP, + 1); + ++ ret = readl_poll_timeout_atomic(host->base + MSDC_DMA_CTRL, val, ++ !(val & MSDC_DMA_CTRL_STOP), 1, 20000); ++ if (ret) ++ dev_dbg(host->dev, "DMA stop timed out\n"); ++ + ret = readl_poll_timeout_atomic(host->base + MSDC_DMA_CFG, val, + !(val & MSDC_DMA_CFG_STS), 1, 20000); +- if (ret) { +- dev_dbg(host->dev, "DMA stop timed out\n"); +- return false; +- } ++ if (ret) ++ dev_dbg(host->dev, "DMA inactive timed out\n"); + + sdr_clr_bits(host->base + MSDC_INTEN, data_ints_mask); + dev_dbg(host->dev, "DMA stop\n"); +@@ -1414,9 +1417,7 @@ static bool msdc_data_xfer_done(struct m + } + + msdc_data_xfer_next(host, mrq); +- done = true; + } +- return done; + } + + 
static void msdc_set_buswidth(struct msdc_host *host, u32 width) +@@ -2347,6 +2348,9 @@ static void msdc_cqe_disable(struct mmc_ + if (recovery) { + sdr_set_field(host->base + MSDC_DMA_CTRL, + MSDC_DMA_CTRL_STOP, 1); ++ if (WARN_ON(readl_poll_timeout(host->base + MSDC_DMA_CTRL, val, ++ !(val & MSDC_DMA_CTRL_STOP), 1, 3000))) ++ return; + if (WARN_ON(readl_poll_timeout(host->base + MSDC_DMA_CFG, val, + !(val & MSDC_DMA_CFG_STS), 1, 3000))) + return; diff --git a/queue-5.15/mmc-sdhci-pci-o2micro-fix-card-detect-by-dealing-with-debouncing.patch b/queue-5.15/mmc-sdhci-pci-o2micro-fix-card-detect-by-dealing-with-debouncing.patch new file mode 100644 index 00000000000..78ffc6934cb --- /dev/null +++ b/queue-5.15/mmc-sdhci-pci-o2micro-fix-card-detect-by-dealing-with-debouncing.patch @@ -0,0 +1,34 @@ +From e591fcf6b4e39335c9b128b17738fcd2fdd278ae Mon Sep 17 00:00:00 2001 +From: Chevron Li +Date: Thu, 2 Jun 2022 06:25:43 -0700 +Subject: mmc: sdhci-pci-o2micro: Fix card detect by dealing with debouncing + +From: Chevron Li + +commit e591fcf6b4e39335c9b128b17738fcd2fdd278ae upstream. + +The result from ->get_cd() may be incorrect as the card detect debouncing +isn't managed correctly. Let's fix it. 
+ +Signed-off-by: Chevron Li +Fixes: 7d44061704dd ("mmc: sdhci-pci-o2micro: Fix O2 Host data read/write DLL Lock phase shift issue") +Cc: stable@vger.kernel.org +Link: https://lore.kernel.org/r/20220602132543.596-1-chevron.li@bayhubtech.com +[Ulf: Updated the commit message] +Signed-off-by: Ulf Hansson +Signed-off-by: Greg Kroah-Hartman +--- + drivers/mmc/host/sdhci-pci-o2micro.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/drivers/mmc/host/sdhci-pci-o2micro.c ++++ b/drivers/mmc/host/sdhci-pci-o2micro.c +@@ -147,6 +147,8 @@ static int sdhci_o2_get_cd(struct mmc_ho + + if (!(sdhci_readw(host, O2_PLL_DLL_WDT_CONTROL1) & O2_PLL_LOCK_STATUS)) + sdhci_o2_enable_internal_clock(host); ++ else ++ sdhci_o2_wait_card_detect_stable(host); + + return !!(sdhci_readl(host, SDHCI_PRESENT_STATE) & SDHCI_CARD_PRESENT); + } diff --git a/queue-5.15/mtd-rawnand-gpmi-fix-setting-busy-timeout-setting.patch b/queue-5.15/mtd-rawnand-gpmi-fix-setting-busy-timeout-setting.patch new file mode 100644 index 00000000000..9f47770ae5a --- /dev/null +++ b/queue-5.15/mtd-rawnand-gpmi-fix-setting-busy-timeout-setting.patch @@ -0,0 +1,49 @@ +From 06781a5026350cde699d2d10c9914a25c1524f45 Mon Sep 17 00:00:00 2001 +From: Sascha Hauer +Date: Tue, 14 Jun 2022 10:31:38 +0200 +Subject: mtd: rawnand: gpmi: Fix setting busy timeout setting + +From: Sascha Hauer + +commit 06781a5026350cde699d2d10c9914a25c1524f45 upstream. + +The DEVICE_BUSY_TIMEOUT value is described in the Reference Manual as: + +| Timeout waiting for NAND Ready/Busy or ATA IRQ. Used in WAIT_FOR_READY +| mode. This value is the number of GPMI_CLK cycles multiplied by 4096. + +So instead of multiplying the value in cycles with 4096, we have to +divide it by that value. Use DIV_ROUND_UP to make sure we are on the +safe side, especially when the calculated value in cycles is smaller +than 4096 as typically the case. + +This bug likely never triggered because any timeout != 0 usually will +do. 
In my case the busy timeout in cycles was originally calculated as +2408, which multiplied with 4096 is 0x968000. The lower 16 bits were +taken for the 16 bit wide register field, so the register value was +0x8000. With 2970bf5a32f0 ("mtd: rawnand: gpmi: fix controller timings +setting") however the value in cycles became 2384, which multiplied +with 4096 is 0x950000. The lower 16 bit are 0x0 now resulting in an +intermediate timeout when reading from NAND. + +Fixes: b1206122069aa ("mtd: rawnand: gpmi: use core timings instead of an empirical derivation") +Cc: stable@vger.kernel.org +Signed-off-by: Sascha Hauer +Signed-off-by: Miquel Raynal +Link: https://lore.kernel.org/linux-mtd/20220614083138.3455683-1-s.hauer@pengutronix.de +Signed-off-by: Greg Kroah-Hartman +--- + drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.c ++++ b/drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.c +@@ -685,7 +685,7 @@ static void gpmi_nfc_compute_timings(str + hw->timing0 = BF_GPMI_TIMING0_ADDRESS_SETUP(addr_setup_cycles) | + BF_GPMI_TIMING0_DATA_HOLD(data_hold_cycles) | + BF_GPMI_TIMING0_DATA_SETUP(data_setup_cycles); +- hw->timing1 = BF_GPMI_TIMING1_BUSY_TIMEOUT(busy_timeout_cycles * 4096); ++ hw->timing1 = BF_GPMI_TIMING1_BUSY_TIMEOUT(DIV_ROUND_UP(busy_timeout_cycles, 4096)); + + /* + * Derive NFC ideal delay from {3}: diff --git a/queue-5.15/net-openvswitch-fix-parsing-of-nw_proto-for-ipv6-fragments.patch b/queue-5.15/net-openvswitch-fix-parsing-of-nw_proto-for-ipv6-fragments.patch new file mode 100644 index 00000000000..c68caae70cd --- /dev/null +++ b/queue-5.15/net-openvswitch-fix-parsing-of-nw_proto-for-ipv6-fragments.patch @@ -0,0 +1,63 @@ +From 12378a5a75e33f34f8586706eb61cca9e6d4690c Mon Sep 17 00:00:00 2001 +From: Rosemarie O'Riorden +Date: Tue, 21 Jun 2022 16:48:45 -0400 +Subject: net: openvswitch: fix parsing of nw_proto for IPv6 fragments + +From: Rosemarie O'Riorden + +commit 
12378a5a75e33f34f8586706eb61cca9e6d4690c upstream. + +When a packet enters the OVS datapath and does not match any existing +flows installed in the kernel flow cache, the packet will be sent to +userspace to be parsed, and a new flow will be created. The kernel and +OVS rely on each other to parse packet fields in the same way so that +packets will be handled properly. + +As per the design document linked below, OVS expects all later IPv6 +fragments to have nw_proto=44 in the flow key, so they can be correctly +matched on OpenFlow rules. OpenFlow controllers create pipelines based +on this design. + +This behavior was changed by the commit in the Fixes tag so that +nw_proto equals the next_header field of the last extension header. +However, there is no counterpart for this change in OVS userspace, +meaning that this field is parsed differently between OVS and the +kernel. This is a problem because OVS creates actions based on what is +parsed in userspace, but the kernel-provided flow key is used as a match +criteria, as described in Documentation/networking/openvswitch.rst. This +leads to issues such as packets incorrectly matching on a flow and thus +the wrong list of actions being applied to the packet. Such changes in +packet parsing cannot be implemented without breaking the userspace. + +The offending commit is partially reverted to restore the expected +behavior. + +The change technically made sense and there is a good reason that it was +implemented, but it does not comply with the original design of OVS. +If in the future someone wants to implement such a change, then it must +be user-configurable and disabled by default to preserve backwards +compatibility with existing OVS versions. 
+ +Cc: stable@vger.kernel.org +Fixes: fa642f08839b ("openvswitch: Derive IP protocol number for IPv6 later frags") +Link: https://docs.openvswitch.org/en/latest/topics/design/#fragments +Signed-off-by: Rosemarie O'Riorden +Acked-by: Eelco Chaudron +Link: https://lore.kernel.org/r/20220621204845.9721-1-roriorden@redhat.com +Signed-off-by: Paolo Abeni +Signed-off-by: Greg Kroah-Hartman +--- + net/openvswitch/flow.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/net/openvswitch/flow.c ++++ b/net/openvswitch/flow.c +@@ -266,7 +266,7 @@ static int parse_ipv6hdr(struct sk_buff + if (flags & IP6_FH_F_FRAG) { + if (frag_off) { + key->ip.frag = OVS_FRAG_TYPE_LATER; +- key->ip.proto = nexthdr; ++ key->ip.proto = NEXTHDR_FRAGMENT; + return 0; + } + key->ip.frag = OVS_FRAG_TYPE_FIRST; diff --git a/queue-5.15/scsi-ibmvfc-allocate-free-queue-resource-only-during-probe-remove.patch b/queue-5.15/scsi-ibmvfc-allocate-free-queue-resource-only-during-probe-remove.patch new file mode 100644 index 00000000000..007462ba09d --- /dev/null +++ b/queue-5.15/scsi-ibmvfc-allocate-free-queue-resource-only-during-probe-remove.patch @@ -0,0 +1,234 @@ +From 72ea7fe0db73d65c7d977208842d8ade9b823de9 Mon Sep 17 00:00:00 2001 +From: Tyrel Datwyler +Date: Thu, 16 Jun 2022 12:11:26 -0700 +Subject: scsi: ibmvfc: Allocate/free queue resource only during probe/remove + +From: Tyrel Datwyler + +commit 72ea7fe0db73d65c7d977208842d8ade9b823de9 upstream. + +Currently, the sub-queues and event pool resources are allocated/freed for +every CRQ connection event such as reset and LPM. This exposes the driver +to a couple of issues. First the inefficiency of freeing and reallocating +memory that can simply be reused after being sanitized. Further, a system +under memory pressure runs the risk of allocation failures that could result +in a crippled driver. 
Finally, there is a race window where command +submission/compeletion can try to pull/return elements from/to an event +pool that is being deleted or already has been deleted due to the lack of +host state around freeing/allocating resources. The following is an example +of list corruption following a live partition migration (LPM): + +Oops: Exception in kernel mode, sig: 5 [#1] +LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries +Modules linked in: vfat fat isofs cdrom ext4 mbcache jbd2 nft_counter nft_compat nf_tables nfnetlink rpadlpar_io rpaphp xsk_diag nfsv3 nfs_acl nfs lockd grace fscache netfs rfkill bonding tls sunrpc pseries_rng drm drm_panel_orientation_quirks xfs libcrc32c dm_service_time sd_mod t10_pi sg ibmvfc scsi_transport_fc ibmveth vmx_crypto dm_multipath dm_mirror dm_region_hash dm_log dm_mod ipmi_devintf ipmi_msghandler fuse +CPU: 0 PID: 2108 Comm: ibmvfc_0 Kdump: loaded Not tainted 5.14.0-70.9.1.el9_0.ppc64le #1 +NIP: c0000000007c4bb0 LR: c0000000007c4bac CTR: 00000000005b9a10 +REGS: c00000025c10b760 TRAP: 0700 Not tainted (5.14.0-70.9.1.el9_0.ppc64le) +MSR: 800000000282b033 CR: 2800028f XER: 0000000f +CFAR: c0000000001f55bc IRQMASK: 0 + GPR00: c0000000007c4bac c00000025c10ba00 c000000002a47c00 000000000000004e + GPR04: c0000031e3006f88 c0000031e308bd00 c00000025c10b768 0000000000000027 + GPR08: 0000000000000000 c0000031e3009dc0 00000031e0eb0000 0000000000000000 + GPR12: c0000031e2ffffa8 c000000002dd0000 c000000000187108 c00000020fcee2c0 + GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 + GPR20: 0000000000000000 0000000000000000 0000000000000000 c008000002f81300 + GPR24: 5deadbeef0000100 5deadbeef0000122 c000000263ba6910 c00000024cc88000 + GPR28: 000000000000003c c0000002430a0000 c0000002430ac300 000000000000c300 +NIP [c0000000007c4bb0] __list_del_entry_valid+0x90/0x100 +LR [c0000000007c4bac] __list_del_entry_valid+0x8c/0x100 +Call Trace: +[c00000025c10ba00] [c0000000007c4bac] __list_del_entry_valid+0x8c/0x100 
(unreliable) +[c00000025c10ba60] [c008000002f42284] ibmvfc_free_queue+0xec/0x210 [ibmvfc] +[c00000025c10bb10] [c008000002f4246c] ibmvfc_deregister_scsi_channel+0xc4/0x160 [ibmvfc] +[c00000025c10bba0] [c008000002f42580] ibmvfc_release_sub_crqs+0x78/0x130 [ibmvfc] +[c00000025c10bc20] [c008000002f4f6cc] ibmvfc_do_work+0x5c4/0xc70 [ibmvfc] +[c00000025c10bce0] [c008000002f4fdec] ibmvfc_work+0x74/0x1e8 [ibmvfc] +[c00000025c10bda0] [c0000000001872b8] kthread+0x1b8/0x1c0 +[c00000025c10be10] [c00000000000cd64] ret_from_kernel_thread+0x5c/0x64 +Instruction dump: +40820034 38600001 38210060 4e800020 7c0802a6 7c641b78 3c62fe7a 7d254b78 +3863b590 f8010070 4ba309cd 60000000 <0fe00000> 7c0802a6 3c62fe7a 3863b640 +---[ end trace 11a2b65a92f8b66c ]--- +ibmvfc 30000003: Send warning. Receive queue closed, will retry. + +Add registration/deregistration helpers that are called instead during +connection resets to sanitize and reconfigure the queues. + +Link: https://lore.kernel.org/r/20220616191126.1281259-3-tyreld@linux.ibm.com +Fixes: 3034ebe26389 ("scsi: ibmvfc: Add alloc/dealloc routines for SCSI Sub-CRQ Channels") +Cc: stable@vger.kernel.org +Reviewed-by: Brian King +Signed-off-by: Tyrel Datwyler +Signed-off-by: Martin K. 
Petersen +Signed-off-by: Greg Kroah-Hartman +--- + drivers/scsi/ibmvscsi/ibmvfc.c | 79 ++++++++++++++++++++++++++++++++--------- + 1 file changed, 62 insertions(+), 17 deletions(-) + +--- a/drivers/scsi/ibmvscsi/ibmvfc.c ++++ b/drivers/scsi/ibmvscsi/ibmvfc.c +@@ -160,8 +160,8 @@ static void ibmvfc_npiv_logout(struct ib + static void ibmvfc_tgt_implicit_logout_and_del(struct ibmvfc_target *); + static void ibmvfc_tgt_move_login(struct ibmvfc_target *); + +-static void ibmvfc_release_sub_crqs(struct ibmvfc_host *); +-static void ibmvfc_init_sub_crqs(struct ibmvfc_host *); ++static void ibmvfc_dereg_sub_crqs(struct ibmvfc_host *); ++static void ibmvfc_reg_sub_crqs(struct ibmvfc_host *); + + static const char *unknown_error = "unknown error"; + +@@ -917,7 +917,7 @@ static int ibmvfc_reenable_crq_queue(str + struct vio_dev *vdev = to_vio_dev(vhost->dev); + unsigned long flags; + +- ibmvfc_release_sub_crqs(vhost); ++ ibmvfc_dereg_sub_crqs(vhost); + + /* Re-enable the CRQ */ + do { +@@ -936,7 +936,7 @@ static int ibmvfc_reenable_crq_queue(str + spin_unlock(vhost->crq.q_lock); + spin_unlock_irqrestore(vhost->host->host_lock, flags); + +- ibmvfc_init_sub_crqs(vhost); ++ ibmvfc_reg_sub_crqs(vhost); + + return rc; + } +@@ -955,7 +955,7 @@ static int ibmvfc_reset_crq(struct ibmvf + struct vio_dev *vdev = to_vio_dev(vhost->dev); + struct ibmvfc_queue *crq = &vhost->crq; + +- ibmvfc_release_sub_crqs(vhost); ++ ibmvfc_dereg_sub_crqs(vhost); + + /* Close the CRQ */ + do { +@@ -988,7 +988,7 @@ static int ibmvfc_reset_crq(struct ibmvf + spin_unlock(vhost->crq.q_lock); + spin_unlock_irqrestore(vhost->host->host_lock, flags); + +- ibmvfc_init_sub_crqs(vhost); ++ ibmvfc_reg_sub_crqs(vhost); + + return rc; + } +@@ -5757,9 +5757,6 @@ static int ibmvfc_register_scsi_channel( + + ENTER; + +- if (ibmvfc_alloc_queue(vhost, scrq, IBMVFC_SUB_CRQ_FMT)) +- return -ENOMEM; +- + rc = h_reg_sub_crq(vdev->unit_address, scrq->msg_token, PAGE_SIZE, + &scrq->cookie, &scrq->hw_irq); + +@@ -5799,7 
+5796,6 @@ irq_failed: + rc = plpar_hcall_norets(H_FREE_SUB_CRQ, vdev->unit_address, scrq->cookie); + } while (rtas_busy_delay(rc)); + reg_failed: +- ibmvfc_free_queue(vhost, scrq); + LEAVE; + return rc; + } +@@ -5825,12 +5821,50 @@ static void ibmvfc_deregister_scsi_chann + if (rc) + dev_err(dev, "Failed to free sub-crq[%d]: rc=%ld\n", index, rc); + +- ibmvfc_free_queue(vhost, scrq); ++ /* Clean out the queue */ ++ memset(scrq->msgs.crq, 0, PAGE_SIZE); ++ scrq->cur = 0; ++ ++ LEAVE; ++} ++ ++static void ibmvfc_reg_sub_crqs(struct ibmvfc_host *vhost) ++{ ++ int i, j; ++ ++ ENTER; ++ if (!vhost->mq_enabled || !vhost->scsi_scrqs.scrqs) ++ return; ++ ++ for (i = 0; i < nr_scsi_hw_queues; i++) { ++ if (ibmvfc_register_scsi_channel(vhost, i)) { ++ for (j = i; j > 0; j--) ++ ibmvfc_deregister_scsi_channel(vhost, j - 1); ++ vhost->do_enquiry = 0; ++ return; ++ } ++ } ++ ++ LEAVE; ++} ++ ++static void ibmvfc_dereg_sub_crqs(struct ibmvfc_host *vhost) ++{ ++ int i; ++ ++ ENTER; ++ if (!vhost->mq_enabled || !vhost->scsi_scrqs.scrqs) ++ return; ++ ++ for (i = 0; i < nr_scsi_hw_queues; i++) ++ ibmvfc_deregister_scsi_channel(vhost, i); ++ + LEAVE; + } + + static void ibmvfc_init_sub_crqs(struct ibmvfc_host *vhost) + { ++ struct ibmvfc_queue *scrq; + int i, j; + + ENTER; +@@ -5846,30 +5880,41 @@ static void ibmvfc_init_sub_crqs(struct + } + + for (i = 0; i < nr_scsi_hw_queues; i++) { +- if (ibmvfc_register_scsi_channel(vhost, i)) { +- for (j = i; j > 0; j--) +- ibmvfc_deregister_scsi_channel(vhost, j - 1); ++ scrq = &vhost->scsi_scrqs.scrqs[i]; ++ if (ibmvfc_alloc_queue(vhost, scrq, IBMVFC_SUB_CRQ_FMT)) { ++ for (j = i; j > 0; j--) { ++ scrq = &vhost->scsi_scrqs.scrqs[j - 1]; ++ ibmvfc_free_queue(vhost, scrq); ++ } + kfree(vhost->scsi_scrqs.scrqs); + vhost->scsi_scrqs.scrqs = NULL; + vhost->scsi_scrqs.active_queues = 0; + vhost->do_enquiry = 0; +- break; ++ vhost->mq_enabled = 0; ++ return; + } + } + ++ ibmvfc_reg_sub_crqs(vhost); ++ + LEAVE; + } + + static void 
ibmvfc_release_sub_crqs(struct ibmvfc_host *vhost) + { ++ struct ibmvfc_queue *scrq; + int i; + + ENTER; + if (!vhost->scsi_scrqs.scrqs) + return; + +- for (i = 0; i < nr_scsi_hw_queues; i++) +- ibmvfc_deregister_scsi_channel(vhost, i); ++ ibmvfc_dereg_sub_crqs(vhost); ++ ++ for (i = 0; i < nr_scsi_hw_queues; i++) { ++ scrq = &vhost->scsi_scrqs.scrqs[i]; ++ ibmvfc_free_queue(vhost, scrq); ++ } + + kfree(vhost->scsi_scrqs.scrqs); + vhost->scsi_scrqs.scrqs = NULL; diff --git a/queue-5.15/scsi-ibmvfc-store-vhost-pointer-during-subcrq-allocation.patch b/queue-5.15/scsi-ibmvfc-store-vhost-pointer-during-subcrq-allocation.patch new file mode 100644 index 00000000000..3189ccf4143 --- /dev/null +++ b/queue-5.15/scsi-ibmvfc-store-vhost-pointer-during-subcrq-allocation.patch @@ -0,0 +1,96 @@ +From aeaadcde1a60138bceb65de3cdaeec78170b4459 Mon Sep 17 00:00:00 2001 +From: Tyrel Datwyler +Date: Thu, 16 Jun 2022 12:11:25 -0700 +Subject: scsi: ibmvfc: Store vhost pointer during subcrq allocation + +From: Tyrel Datwyler + +commit aeaadcde1a60138bceb65de3cdaeec78170b4459 upstream. + +Currently the back pointer from a queue to the vhost adapter isn't set +until after subcrq interrupt registration. The value is available when a +queue is first allocated and can/should be also set for primary and async +queues as well as subcrqs. + +This fixes a crash observed during kexec/kdump on Power 9 with legacy XICS +interrupt controller where a pending subcrq interrupt from the previous +kernel can be replayed immediately upon IRQ registration resulting in +dereference of a garbage backpointer in ibmvfc_interrupt_scsi(). + +Kernel attempted to read user page (58) - exploit attempt? (uid: 0) +BUG: Kernel NULL pointer dereference on read at 0x00000058 +Faulting instruction address: 0xc008000003216a08 +Oops: Kernel access of bad area, sig: 11 [#1] +... 
+NIP [c008000003216a08] ibmvfc_interrupt_scsi+0x40/0xb0 [ibmvfc] +LR [c0000000082079e8] __handle_irq_event_percpu+0x98/0x270 +Call Trace: +[c000000047fa3d80] [c0000000123e6180] 0xc0000000123e6180 (unreliable) +[c000000047fa3df0] [c0000000082079e8] __handle_irq_event_percpu+0x98/0x270 +[c000000047fa3ea0] [c000000008207d18] handle_irq_event+0x98/0x188 +[c000000047fa3ef0] [c00000000820f564] handle_fasteoi_irq+0xc4/0x310 +[c000000047fa3f40] [c000000008205c60] generic_handle_irq+0x50/0x80 +[c000000047fa3f60] [c000000008015c40] __do_irq+0x70/0x1a0 +[c000000047fa3f90] [c000000008016d7c] __do_IRQ+0x9c/0x130 +[c000000014622f60] [0000000020000000] 0x20000000 +[c000000014622ff0] [c000000008016e50] do_IRQ+0x40/0xa0 +[c000000014623020] [c000000008017044] replay_soft_interrupts+0x194/0x2f0 +[c000000014623210] [c0000000080172a8] arch_local_irq_restore+0x108/0x170 +[c000000014623240] [c000000008eb1008] _raw_spin_unlock_irqrestore+0x58/0xb0 +[c000000014623270] [c00000000820b12c] __setup_irq+0x49c/0x9f0 +[c000000014623310] [c00000000820b7c0] request_threaded_irq+0x140/0x230 +[c000000014623380] [c008000003212a50] ibmvfc_register_scsi_channel+0x1e8/0x2f0 [ibmvfc] +[c000000014623450] [c008000003213d1c] ibmvfc_init_sub_crqs+0xc4/0x1f0 [ibmvfc] +[c0000000146234d0] [c0080000032145a8] ibmvfc_reset_crq+0x150/0x210 [ibmvfc] +[c000000014623550] [c0080000032147c8] ibmvfc_init_crq+0x160/0x280 [ibmvfc] +[c0000000146235f0] [c00800000321a9cc] ibmvfc_probe+0x2a4/0x530 [ibmvfc] + +Link: https://lore.kernel.org/r/20220616191126.1281259-2-tyreld@linux.ibm.com +Fixes: 3034ebe26389 ("scsi: ibmvfc: Add alloc/dealloc routines for SCSI Sub-CRQ Channels") +Cc: stable@vger.kernel.org +Reviewed-by: Brian King +Signed-off-by: Tyrel Datwyler +Signed-off-by: Martin K. 
Petersen +Signed-off-by: Greg Kroah-Hartman +--- + drivers/scsi/ibmvscsi/ibmvfc.c | 3 ++- + drivers/scsi/ibmvscsi/ibmvfc.h | 2 +- + 2 files changed, 3 insertions(+), 2 deletions(-) + +--- a/drivers/scsi/ibmvscsi/ibmvfc.c ++++ b/drivers/scsi/ibmvscsi/ibmvfc.c +@@ -5680,6 +5680,8 @@ static int ibmvfc_alloc_queue(struct ibm + queue->cur = 0; + queue->fmt = fmt; + queue->size = PAGE_SIZE / fmt_size; ++ ++ queue->vhost = vhost; + return 0; + } + +@@ -5788,7 +5790,6 @@ static int ibmvfc_register_scsi_channel( + } + + scrq->hwq_id = index; +- scrq->vhost = vhost; + + LEAVE; + return 0; +--- a/drivers/scsi/ibmvscsi/ibmvfc.h ++++ b/drivers/scsi/ibmvscsi/ibmvfc.h +@@ -789,6 +789,7 @@ struct ibmvfc_queue { + spinlock_t _lock; + spinlock_t *q_lock; + ++ struct ibmvfc_host *vhost; + struct ibmvfc_event_pool evt_pool; + struct list_head sent; + struct list_head free; +@@ -797,7 +798,6 @@ struct ibmvfc_queue { + union ibmvfc_iu cancel_rsp; + + /* Sub-CRQ fields */ +- struct ibmvfc_host *vhost; + unsigned long cookie; + unsigned long vios_cookie; + unsigned long hw_irq; diff --git a/queue-5.15/series b/queue-5.15/series index 84f6f3950e7..eb55067c970 100644 --- a/queue-5.15/series +++ b/queue-5.15/series @@ -7,3 +7,21 @@ alsa-hda-realtek-alc897-headset-mic-no-sound.patch alsa-hda-realtek-apply-fixup-for-lenovo-yoga-duet-7-properly.patch alsa-hda-realtek-add-quirk-for-clevo-pd70pnt.patch alsa-hda-realtek-add-quirk-for-clevo-ns50pu.patch +net-openvswitch-fix-parsing-of-nw_proto-for-ipv6-fragments.patch +9p-fix-refcounting-during-full-path-walks-for-fid-lookups.patch +9p-fix-fid-refcount-leak-in-v9fs_vfs_atomic_open_dotl.patch +9p-fix-fid-refcount-leak-in-v9fs_vfs_get_link.patch +btrfs-fix-hang-during-unmount-when-block-group-reclaim-task-is-running.patch +btrfs-prevent-remounting-to-v1-space-cache-for-subpage-mount.patch +btrfs-add-error-messages-to-all-unrecognized-mount-options.patch +scsi-ibmvfc-store-vhost-pointer-during-subcrq-allocation.patch 
+scsi-ibmvfc-allocate-free-queue-resource-only-during-probe-remove.patch +mmc-sdhci-pci-o2micro-fix-card-detect-by-dealing-with-debouncing.patch +mmc-mediatek-wait-dma-stop-bit-reset-to-0.patch +xen-gntdev-avoid-blocking-in-unmap_grant_pages.patch +maintainers-add-new-iommu-development-mailing-list.patch +mtd-rawnand-gpmi-fix-setting-busy-timeout-setting.patch +ata-libata-add-qc-flags-in-ata_qc_complete_template-tracepoint.patch +dm-era-commit-metadata-in-postsuspend-after-worker-stops.patch +dm-mirror-log-clear-log-bits-up-to-bits_per_long-boundary.patch +tracing-kprobes-check-whether-get_kretprobe-returns-null-in-kretprobe_dispatcher.patch diff --git a/queue-5.15/tracing-kprobes-check-whether-get_kretprobe-returns-null-in-kretprobe_dispatcher.patch b/queue-5.15/tracing-kprobes-check-whether-get_kretprobe-returns-null-in-kretprobe_dispatcher.patch new file mode 100644 index 00000000000..60ee31b9722 --- /dev/null +++ b/queue-5.15/tracing-kprobes-check-whether-get_kretprobe-returns-null-in-kretprobe_dispatcher.patch @@ -0,0 +1,61 @@ +From cc72b72073ac982a954d3b43519ca1c28f03c27c Mon Sep 17 00:00:00 2001 +From: "Masami Hiramatsu (Google)" +Date: Sat, 28 May 2022 00:55:39 +0900 +Subject: tracing/kprobes: Check whether get_kretprobe() returns NULL in kretprobe_dispatcher() + +From: Masami Hiramatsu (Google) + +commit cc72b72073ac982a954d3b43519ca1c28f03c27c upstream. + +There is a small chance that get_kretprobe(ri) returns NULL in +kretprobe_dispatcher() when another CPU unregisters the kretprobe +right after __kretprobe_trampoline_handler(). + +To avoid this issue, kretprobe_dispatcher() checks the get_kretprobe() +return value again. And if it is NULL, it returns soon because that +kretprobe is under unregistering process. + +This issue has been introduced when the kretprobe is decoupled +from the struct kretprobe_instance by commit d741bf41d7c7 +("kprobes: Remove kretprobe hash"). 
Before that commit, the +struct kretprobe_instance::rp directly points to the kretprobe +and it is never NULL. + +Link: https://lkml.kernel.org/r/165366693881.797669.16926184644089588731.stgit@devnote2 + +Reported-by: Yonghong Song +Fixes: d741bf41d7c7 ("kprobes: Remove kretprobe hash") +Cc: Peter Zijlstra +Cc: Ingo Molnar +Cc: bpf +Cc: Kernel Team +Cc: stable@vger.kernel.org +Signed-off-by: Masami Hiramatsu (Google) +Acked-by: Jiri Olsa +Signed-off-by: Steven Rostedt (Google) +Signed-off-by: Greg Kroah-Hartman +--- + kernel/trace/trace_kprobe.c | 11 ++++++++++- + 1 file changed, 10 insertions(+), 1 deletion(-) + +--- a/kernel/trace/trace_kprobe.c ++++ b/kernel/trace/trace_kprobe.c +@@ -1733,8 +1733,17 @@ static int + kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) + { + struct kretprobe *rp = get_kretprobe(ri); +- struct trace_kprobe *tk = container_of(rp, struct trace_kprobe, rp); ++ struct trace_kprobe *tk; + ++ /* ++ * There is a small chance that get_kretprobe(ri) returns NULL when ++ * the kretprobe is unregister on another CPU between kretprobe's ++ * trampoline_handler and this function. ++ */ ++ if (unlikely(!rp)) ++ return 0; ++ ++ tk = container_of(rp, struct trace_kprobe, rp); + raw_cpu_inc(*tk->nhit); + + if (trace_probe_test_flag(&tk->tp, TP_FLAG_TRACE)) diff --git a/queue-5.15/xen-gntdev-avoid-blocking-in-unmap_grant_pages.patch b/queue-5.15/xen-gntdev-avoid-blocking-in-unmap_grant_pages.patch new file mode 100644 index 00000000000..ddcab30eb90 --- /dev/null +++ b/queue-5.15/xen-gntdev-avoid-blocking-in-unmap_grant_pages.patch @@ -0,0 +1,362 @@ +From dbe97cff7dd9f0f75c524afdd55ad46be3d15295 Mon Sep 17 00:00:00 2001 +From: Demi Marie Obenour +Date: Tue, 21 Jun 2022 22:27:26 -0400 +Subject: xen/gntdev: Avoid blocking in unmap_grant_pages() + +From: Demi Marie Obenour + +commit dbe97cff7dd9f0f75c524afdd55ad46be3d15295 upstream. + +unmap_grant_pages() currently waits for the pages to no longer be used. 
+In https://github.com/QubesOS/qubes-issues/issues/7481, this led to a +deadlock against i915: i915 was waiting for gntdev's MMU notifier to +finish, while gntdev was waiting for i915 to free its pages. I also +believe this is responsible for various deadlocks I have experienced in +the past. + +Avoid these problems by making unmap_grant_pages async. This requires +making it return void, as any errors will not be available when the +function returns. Fortunately, the only use of the return value is a +WARN_ON(), which can be replaced by a WARN_ON when the error is +detected. Additionally, a failed call will not prevent further calls +from being made, but this is harmless. + +Because unmap_grant_pages is now async, the grant handle will be set to +INVALID_GRANT_HANDLE too late to prevent multiple unmaps of the same +handle. Instead, a separate bool array is allocated for this purpose. +This wastes memory, but stuffing this information in padding bytes is +too fragile. Furthermore, it is necessary to grab a reference to the +map before making the asynchronous call, and release the reference when +the call returns. + +It is also necessary to guard against reentrancy in gntdev_put_map(), +and to handle the case where userspace tries to map a mapping whose +contents have not all been freed yet. 
+ +Fixes: 745282256c75 ("xen/gntdev: safely unmap grants in case they are still in use") +Cc: stable@vger.kernel.org +Signed-off-by: Demi Marie Obenour +Reviewed-by: Juergen Gross +Link: https://lore.kernel.org/r/20220622022726.2538-1-demi@invisiblethingslab.com +Signed-off-by: Juergen Gross +Signed-off-by: Greg Kroah-Hartman +--- + drivers/xen/gntdev-common.h | 7 + + drivers/xen/gntdev.c | 157 +++++++++++++++++++++++++++++--------------- + 2 files changed, 113 insertions(+), 51 deletions(-) + +--- a/drivers/xen/gntdev-common.h ++++ b/drivers/xen/gntdev-common.h +@@ -16,6 +16,7 @@ + #include + #include + #include ++#include + + struct gntdev_dmabuf_priv; + +@@ -56,6 +57,7 @@ struct gntdev_grant_map { + struct gnttab_unmap_grant_ref *unmap_ops; + struct gnttab_map_grant_ref *kmap_ops; + struct gnttab_unmap_grant_ref *kunmap_ops; ++ bool *being_removed; + struct page **pages; + unsigned long pages_vm_start; + +@@ -73,6 +75,11 @@ struct gntdev_grant_map { + /* Needed to avoid allocation in gnttab_dma_free_pages(). 
*/ + xen_pfn_t *frames; + #endif ++ ++ /* Number of live grants */ ++ atomic_t live_grants; ++ /* Needed to avoid allocation in __unmap_grant_pages */ ++ struct gntab_unmap_queue_data unmap_data; + }; + + struct gntdev_grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count, +--- a/drivers/xen/gntdev.c ++++ b/drivers/xen/gntdev.c +@@ -35,6 +35,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -60,10 +61,11 @@ module_param(limit, uint, 0644); + MODULE_PARM_DESC(limit, + "Maximum number of grants that may be mapped by one mapping request"); + ++/* True in PV mode, false otherwise */ + static int use_ptemod; + +-static int unmap_grant_pages(struct gntdev_grant_map *map, +- int offset, int pages); ++static void unmap_grant_pages(struct gntdev_grant_map *map, ++ int offset, int pages); + + static struct miscdevice gntdev_miscdev; + +@@ -120,6 +122,7 @@ static void gntdev_free_map(struct gntde + kvfree(map->unmap_ops); + kvfree(map->kmap_ops); + kvfree(map->kunmap_ops); ++ kvfree(map->being_removed); + kfree(map); + } + +@@ -140,10 +143,13 @@ struct gntdev_grant_map *gntdev_alloc_ma + add->unmap_ops = kvmalloc_array(count, sizeof(add->unmap_ops[0]), + GFP_KERNEL); + add->pages = kvcalloc(count, sizeof(add->pages[0]), GFP_KERNEL); ++ add->being_removed = ++ kvcalloc(count, sizeof(add->being_removed[0]), GFP_KERNEL); + if (NULL == add->grants || + NULL == add->map_ops || + NULL == add->unmap_ops || +- NULL == add->pages) ++ NULL == add->pages || ++ NULL == add->being_removed) + goto err; + if (use_ptemod) { + add->kmap_ops = kvmalloc_array(count, sizeof(add->kmap_ops[0]), +@@ -250,9 +256,36 @@ void gntdev_put_map(struct gntdev_priv * + if (!refcount_dec_and_test(&map->users)) + return; + +- if (map->pages && !use_ptemod) ++ if (map->pages && !use_ptemod) { ++ /* ++ * Increment the reference count. This ensures that the ++ * subsequent call to unmap_grant_pages() will not wind up ++ * re-entering itself. 
It *can* wind up calling ++ * gntdev_put_map() recursively, but such calls will be with a ++ * reference count greater than 1, so they will return before ++ * this code is reached. The recursion depth is thus limited to ++ * 1. Do NOT use refcount_inc() here, as it will detect that ++ * the reference count is zero and WARN(). ++ */ ++ refcount_set(&map->users, 1); ++ ++ /* ++ * Unmap the grants. This may or may not be asynchronous, so it ++ * is possible that the reference count is 1 on return, but it ++ * could also be greater than 1. ++ */ + unmap_grant_pages(map, 0, map->count); + ++ /* Check if the memory now needs to be freed */ ++ if (!refcount_dec_and_test(&map->users)) ++ return; ++ ++ /* ++ * All pages have been returned to the hypervisor, so free the ++ * map. ++ */ ++ } ++ + if (map->notify.flags & UNMAP_NOTIFY_SEND_EVENT) { + notify_remote_via_evtchn(map->notify.event); + evtchn_put(map->notify.event); +@@ -283,6 +316,7 @@ static int find_grant_ptes(pte_t *pte, u + + int gntdev_map_grant_pages(struct gntdev_grant_map *map) + { ++ size_t alloced = 0; + int i, err = 0; + + if (!use_ptemod) { +@@ -331,97 +365,116 @@ int gntdev_map_grant_pages(struct gntdev + map->count); + + for (i = 0; i < map->count; i++) { +- if (map->map_ops[i].status == GNTST_okay) ++ if (map->map_ops[i].status == GNTST_okay) { + map->unmap_ops[i].handle = map->map_ops[i].handle; +- else if (!err) ++ if (!use_ptemod) ++ alloced++; ++ } else if (!err) + err = -EINVAL; + + if (map->flags & GNTMAP_device_map) + map->unmap_ops[i].dev_bus_addr = map->map_ops[i].dev_bus_addr; + + if (use_ptemod) { +- if (map->kmap_ops[i].status == GNTST_okay) ++ if (map->kmap_ops[i].status == GNTST_okay) { ++ if (map->map_ops[i].status == GNTST_okay) ++ alloced++; + map->kunmap_ops[i].handle = map->kmap_ops[i].handle; +- else if (!err) ++ } else if (!err) + err = -EINVAL; + } + } ++ atomic_add(alloced, &map->live_grants); + return err; + } + +-static int __unmap_grant_pages(struct gntdev_grant_map *map, int 
offset, +- int pages) ++static void __unmap_grant_pages_done(int result, ++ struct gntab_unmap_queue_data *data) + { +- int i, err = 0; +- struct gntab_unmap_queue_data unmap_data; +- +- if (map->notify.flags & UNMAP_NOTIFY_CLEAR_BYTE) { +- int pgno = (map->notify.addr >> PAGE_SHIFT); +- if (pgno >= offset && pgno < offset + pages) { +- /* No need for kmap, pages are in lowmem */ +- uint8_t *tmp = pfn_to_kaddr(page_to_pfn(map->pages[pgno])); +- tmp[map->notify.addr & (PAGE_SIZE-1)] = 0; +- map->notify.flags &= ~UNMAP_NOTIFY_CLEAR_BYTE; +- } +- } +- +- unmap_data.unmap_ops = map->unmap_ops + offset; +- unmap_data.kunmap_ops = use_ptemod ? map->kunmap_ops + offset : NULL; +- unmap_data.pages = map->pages + offset; +- unmap_data.count = pages; +- +- err = gnttab_unmap_refs_sync(&unmap_data); +- if (err) +- return err; ++ unsigned int i; ++ struct gntdev_grant_map *map = data->data; ++ unsigned int offset = data->unmap_ops - map->unmap_ops; + +- for (i = 0; i < pages; i++) { +- if (map->unmap_ops[offset+i].status) +- err = -EINVAL; ++ for (i = 0; i < data->count; i++) { ++ WARN_ON(map->unmap_ops[offset+i].status); + pr_debug("unmap handle=%d st=%d\n", + map->unmap_ops[offset+i].handle, + map->unmap_ops[offset+i].status); + map->unmap_ops[offset+i].handle = INVALID_GRANT_HANDLE; + if (use_ptemod) { +- if (map->kunmap_ops[offset+i].status) +- err = -EINVAL; ++ WARN_ON(map->kunmap_ops[offset+i].status); + pr_debug("kunmap handle=%u st=%d\n", + map->kunmap_ops[offset+i].handle, + map->kunmap_ops[offset+i].status); + map->kunmap_ops[offset+i].handle = INVALID_GRANT_HANDLE; + } + } +- return err; ++ /* ++ * Decrease the live-grant counter. This must happen after the loop to ++ * prevent premature reuse of the grants by gnttab_mmap(). 
++ */ ++ atomic_sub(data->count, &map->live_grants); ++ ++ /* Release reference taken by __unmap_grant_pages */ ++ gntdev_put_map(NULL, map); + } + +-static int unmap_grant_pages(struct gntdev_grant_map *map, int offset, +- int pages) ++static void __unmap_grant_pages(struct gntdev_grant_map *map, int offset, ++ int pages) + { +- int range, err = 0; ++ if (map->notify.flags & UNMAP_NOTIFY_CLEAR_BYTE) { ++ int pgno = (map->notify.addr >> PAGE_SHIFT); ++ ++ if (pgno >= offset && pgno < offset + pages) { ++ /* No need for kmap, pages are in lowmem */ ++ uint8_t *tmp = pfn_to_kaddr(page_to_pfn(map->pages[pgno])); ++ ++ tmp[map->notify.addr & (PAGE_SIZE-1)] = 0; ++ map->notify.flags &= ~UNMAP_NOTIFY_CLEAR_BYTE; ++ } ++ } ++ ++ map->unmap_data.unmap_ops = map->unmap_ops + offset; ++ map->unmap_data.kunmap_ops = use_ptemod ? map->kunmap_ops + offset : NULL; ++ map->unmap_data.pages = map->pages + offset; ++ map->unmap_data.count = pages; ++ map->unmap_data.done = __unmap_grant_pages_done; ++ map->unmap_data.data = map; ++ refcount_inc(&map->users); /* to keep map alive during async call below */ ++ ++ gnttab_unmap_refs_async(&map->unmap_data); ++} ++ ++static void unmap_grant_pages(struct gntdev_grant_map *map, int offset, ++ int pages) ++{ ++ int range; ++ ++ if (atomic_read(&map->live_grants) == 0) ++ return; /* Nothing to do */ + + pr_debug("unmap %d+%d [%d+%d]\n", map->index, map->count, offset, pages); + + /* It is possible the requested range will have a "hole" where we + * already unmapped some of the grants. Only unmap valid ranges. 
+ */ +- while (pages && !err) { +- while (pages && +- map->unmap_ops[offset].handle == INVALID_GRANT_HANDLE) { ++ while (pages) { ++ while (pages && map->being_removed[offset]) { + offset++; + pages--; + } + range = 0; + while (range < pages) { +- if (map->unmap_ops[offset + range].handle == +- INVALID_GRANT_HANDLE) ++ if (map->being_removed[offset + range]) + break; ++ map->being_removed[offset + range] = true; + range++; + } +- err = __unmap_grant_pages(map, offset, range); ++ if (range) ++ __unmap_grant_pages(map, offset, range); + offset += range; + pages -= range; + } +- +- return err; + } + + /* ------------------------------------------------------------------ */ +@@ -473,7 +526,6 @@ static bool gntdev_invalidate(struct mmu + struct gntdev_grant_map *map = + container_of(mn, struct gntdev_grant_map, notifier); + unsigned long mstart, mend; +- int err; + + if (!mmu_notifier_range_blockable(range)) + return false; +@@ -494,10 +546,9 @@ static bool gntdev_invalidate(struct mmu + map->index, map->count, + map->vma->vm_start, map->vma->vm_end, + range->start, range->end, mstart, mend); +- err = unmap_grant_pages(map, ++ unmap_grant_pages(map, + (mstart - map->vma->vm_start) >> PAGE_SHIFT, + (mend - mstart) >> PAGE_SHIFT); +- WARN_ON(err); + + return true; + } +@@ -985,6 +1036,10 @@ static int gntdev_mmap(struct file *flip + goto unlock_out; + if (use_ptemod && map->vma) + goto unlock_out; ++ if (atomic_read(&map->live_grants)) { ++ err = -EAGAIN; ++ goto unlock_out; ++ } + refcount_inc(&map->users); + + vma->vm_ops = &gntdev_vmops;