From: Greg Kroah-Hartman Date: Mon, 18 Apr 2022 08:13:12 +0000 (+0200) Subject: 5.15-stable patches X-Git-Tag: v4.9.311~29 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=43ea1499d581afde79e60fb1fc746d74764207f8;p=thirdparty%2Fkernel%2Fstable-queue.git 5.15-stable patches added patches: cifs-verify-that-tcon-is-valid-before-dereference-in-cifs_kill_sb.patch gcc-plugins-latent_entropy-use-dev-urandom.patch kvm-don-t-create-vm-debugfs-files-outside-of-the-vm-directory.patch kvm-x86-mmu-resolve-nx_huge_pages-when-kvm.ko-is-loaded.patch memory-renesas-rpc-if-fix-platform-device-leak-in-error-path.patch mm-fix-unexpected-zeroed-page-mapping-with-zram-swap.patch mm-kmemleak-take-a-full-lowmem-check-in-kmemleak_-_phys.patch mm-page_alloc-fix-build_zonerefs_node.patch mm-secretmem-fix-panic-when-growing-a-memfd_secret.patch sunrpc-fix-nfsd-s-request-deferral-on-rdma-transports.patch --- diff --git a/queue-5.15/cifs-verify-that-tcon-is-valid-before-dereference-in-cifs_kill_sb.patch b/queue-5.15/cifs-verify-that-tcon-is-valid-before-dereference-in-cifs_kill_sb.patch new file mode 100644 index 00000000000..c404a5aff1d --- /dev/null +++ b/queue-5.15/cifs-verify-that-tcon-is-valid-before-dereference-in-cifs_kill_sb.patch @@ -0,0 +1,47 @@ +From 8b6c58458ee3206dde345fce327a4cb83e69caf9 Mon Sep 17 00:00:00 2001 +From: Ronnie Sahlberg +Date: Wed, 13 Apr 2022 10:02:17 +1000 +Subject: cifs: verify that tcon is valid before dereference in cifs_kill_sb + +From: Ronnie Sahlberg + +commit 8b6c58458ee3206dde345fce327a4cb83e69caf9 upstream. + +On umount, cifs_sb->tlink_tree might contain entries that do not represent +a valid tcon. +Check the tcon for error before we dereference it. + +Signed-off-by: Ronnie Sahlberg +Cc: stable@vger.kernel.org +Reviewed-by: Shyam Prasad N +Reported-by: Xiaoli Feng +Signed-off-by: Steve French +Signed-off-by: Greg Kroah-Hartman +--- + fs/cifs/cifsfs.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +--- a/fs/cifs/cifsfs.c ++++ b/fs/cifs/cifsfs.c +@@ -266,10 +266,11 @@ static void cifs_kill_sb(struct super_bl + * before we kill the sb. + */ + if (cifs_sb->root) { +- node = rb_first(root); +- while (node != NULL) { ++ for (node = rb_first(root); node; node = rb_next(node)) { + tlink = rb_entry(node, struct tcon_link, tl_rbnode); + tcon = tlink_tcon(tlink); ++ if (IS_ERR(tcon)) ++ continue; + cfid = &tcon->crfid; + mutex_lock(&cfid->fid_mutex); + if (cfid->dentry) { +@@ -277,7 +278,6 @@ static void cifs_kill_sb(struct super_bl + cfid->dentry = NULL; + } + mutex_unlock(&cfid->fid_mutex); +- node = rb_next(node); + } + + /* finally release root dentry */ diff --git a/queue-5.15/gcc-plugins-latent_entropy-use-dev-urandom.patch b/queue-5.15/gcc-plugins-latent_entropy-use-dev-urandom.patch new file mode 100644 index 00000000000..c40144ecf42 --- /dev/null +++ b/queue-5.15/gcc-plugins-latent_entropy-use-dev-urandom.patch @@ -0,0 +1,121 @@ +From c40160f2998c897231f8454bf797558d30a20375 Mon Sep 17 00:00:00 2001 +From: "Jason A. Donenfeld" +Date: Wed, 6 Apr 2022 00:28:15 +0200 +Subject: gcc-plugins: latent_entropy: use /dev/urandom + +From: Jason A. Donenfeld + +commit c40160f2998c897231f8454bf797558d30a20375 upstream. + +While the latent entropy plugin mostly doesn't derive entropy from +get_random_const() for measuring the call graph, when __latent_entropy is +applied to a constant, then it's initialized statically to output from +get_random_const(). 
In that case, this data is derived from a 64-bit +seed, which means a buffer of 512 bits doesn't really have that amount +of compile-time entropy. + +This patch fixes that shortcoming by just buffering chunks of +/dev/urandom output and doling it out as requested. + +At the same time, it's important that we don't break the use of +-frandom-seed, for people who want the runtime benefits of the latent +entropy plugin, while still having compile-time determinism. In that +case, we detect whether gcc's set_random_seed() has been called by +making a call to get_random_seed(noinit=true) in the plugin init +function, which is called after set_random_seed() is called but before +anything that calls get_random_seed(noinit=false), and seeing if it's +zero or not. If it's not zero, we're in deterministic mode, and so we +just generate numbers with a basic xorshift prng. + +Note that we don't detect if -frandom-seed is being used using the +documented local_tick variable, because it's assigned via: + local_tick = (unsigned) tv.tv_sec * 1000 + tv.tv_usec / 1000; +which may well overflow and become -1 on its own, and so isn't +reliable: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105171 + +[kees: The 256 byte rnd_buf size was chosen based on average (250), + median (64), and std deviation (575) bytes of used entropy for a + defconfig x86_64 build] + +Fixes: 38addce8b600 ("gcc-plugins: Add latent_entropy plugin") +Cc: stable@vger.kernel.org +Cc: PaX Team +Signed-off-by: Jason A. Donenfeld +Signed-off-by: Kees Cook +Link: https://lore.kernel.org/r/20220405222815.21155-1-Jason@zx2c4.com +Signed-off-by: Greg Kroah-Hartman +--- + scripts/gcc-plugins/latent_entropy_plugin.c | 44 +++++++++++++++++----------- + 1 file changed, 27 insertions(+), 17 deletions(-) + +--- a/scripts/gcc-plugins/latent_entropy_plugin.c ++++ b/scripts/gcc-plugins/latent_entropy_plugin.c +@@ -86,25 +86,31 @@ static struct plugin_info latent_entropy + .help = "disable\tturn off latent entropy instrumentation\n", + }; + +-static unsigned HOST_WIDE_INT seed; +-/* +- * get_random_seed() (this is a GCC function) generates the seed. +- * This is a simple random generator without any cryptographic security because +- * the entropy doesn't come from here. 
+- */ ++static unsigned HOST_WIDE_INT deterministic_seed; ++static unsigned HOST_WIDE_INT rnd_buf[32]; ++static size_t rnd_idx = ARRAY_SIZE(rnd_buf); ++static int urandom_fd = -1; ++ + static unsigned HOST_WIDE_INT get_random_const(void) + { +- unsigned int i; +- unsigned HOST_WIDE_INT ret = 0; +- +- for (i = 0; i < 8 * sizeof(ret); i++) { +- ret = (ret << 1) | (seed & 1); +- seed >>= 1; +- if (ret & 1) +- seed ^= 0xD800000000000000ULL; ++ if (deterministic_seed) { ++ unsigned HOST_WIDE_INT w = deterministic_seed; ++ w ^= w << 13; ++ w ^= w >> 7; ++ w ^= w << 17; ++ deterministic_seed = w; ++ return deterministic_seed; + } + +- return ret; ++ if (urandom_fd < 0) { ++ urandom_fd = open("/dev/urandom", O_RDONLY); ++ gcc_assert(urandom_fd >= 0); ++ } ++ if (rnd_idx >= ARRAY_SIZE(rnd_buf)) { ++ gcc_assert(read(urandom_fd, rnd_buf, sizeof(rnd_buf)) == sizeof(rnd_buf)); ++ rnd_idx = 0; ++ } ++ return rnd_buf[rnd_idx++]; + } + + static tree tree_get_random_const(tree type) +@@ -537,8 +543,6 @@ static void latent_entropy_start_unit(vo + tree type, id; + int quals; + +- seed = get_random_seed(false); +- + if (in_lto_p) + return; + +@@ -573,6 +577,12 @@ __visible int plugin_init(struct plugin_ + const struct plugin_argument * const argv = plugin_info->argv; + int i; + ++ /* ++ * Call get_random_seed() with noinit=true, so that this returns ++ * 0 in the case where no seed has been passed via -frandom-seed. ++ */ ++ deterministic_seed = get_random_seed(true); ++ + static const struct ggc_root_tab gt_ggc_r_gt_latent_entropy[] = { + { + .base = &latent_entropy_decl, diff --git a/queue-5.15/kvm-don-t-create-vm-debugfs-files-outside-of-the-vm-directory.patch b/queue-5.15/kvm-don-t-create-vm-debugfs-files-outside-of-the-vm-directory.patch new file mode 100644 index 00000000000..6349c7bc5e8 --- /dev/null +++ b/queue-5.15/kvm-don-t-create-vm-debugfs-files-outside-of-the-vm-directory.patch @@ -0,0 +1,68 @@ +From a44a4cc1c969afec97dbb2aedaf6f38eaa6253bb Mon Sep 17 00:00:00 2001 +From: Oliver Upton +Date: Wed, 6 Apr 2022 23:56:13 +0000 +Subject: KVM: Don't create VM debugfs files outside of the VM directory + +From: Oliver Upton + +commit a44a4cc1c969afec97dbb2aedaf6f38eaa6253bb upstream. + +Unfortunately, there is no guarantee that KVM was able to instantiate a +debugfs directory for a particular VM. To that end, KVM shouldn't even +attempt to create new debugfs files in this case. If the specified +parent dentry is NULL, debugfs_create_file() will instantiate files at +the root of debugfs. + +For arm64, it is possible to create the vgic-state file outside of a +VM directory, the file is not cleaned up when a VM is destroyed. +Nonetheless, the corresponding struct kvm is freed when the VM is +destroyed. + +Nip the problem in the bud for all possible errant debugfs file +creations by initializing kvm->debugfs_dentry to -ENOENT. In so doing, +debugfs_create_file() will fail instead of creating the file in the root +directory. 
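+
+As an illustration of the sentinel convention this relies on, here is a
+simplified user-space sketch of the ERR_PTR()/IS_ERR() helpers (not the
+actual kernel definitions):
+
+	#include <stdio.h>
+
+	#define MAX_ERRNO	4095
+	/* encode a small negative errno inside an invalid pointer value */
+	#define ERR_PTR(err)	((void *)(long)(err))
+	#define IS_ERR(ptr)	((unsigned long)(ptr) >= (unsigned long)-MAX_ERRNO)
+
+	int main(void)
+	{
+		void *debugfs_dentry = ERR_PTR(-2);	/* -ENOENT */
+
+		/* creation helpers bail out on an error-valued parent
+		 * instead of falling back to the debugfs root */
+		if (IS_ERR(debugfs_dentry))
+			printf("VM directory missing, skip file creation\n");
+		return 0;
+	}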
+ +Cc: stable@kernel.org +Fixes: 929f45e32499 ("kvm: no need to check return value of debugfs_create functions") +Signed-off-by: Oliver Upton +Signed-off-by: Marc Zyngier +Link: https://lore.kernel.org/r/20220406235615.1447180-2-oupton@google.com +Signed-off-by: Greg Kroah-Hartman +--- + virt/kvm/kvm_main.c | 10 ++++++++-- + 1 file changed, 8 insertions(+), 2 deletions(-) + +--- a/virt/kvm/kvm_main.c ++++ b/virt/kvm/kvm_main.c +@@ -911,7 +911,7 @@ static void kvm_destroy_vm_debugfs(struc + int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc + + kvm_vcpu_stats_header.num_desc; + +- if (!kvm->debugfs_dentry) ++ if (IS_ERR(kvm->debugfs_dentry)) + return; + + debugfs_remove_recursive(kvm->debugfs_dentry); +@@ -934,6 +934,12 @@ static int kvm_create_vm_debugfs(struct + int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc + + kvm_vcpu_stats_header.num_desc; + ++ /* ++ * Force subsequent debugfs file creations to fail if the VM directory ++ * is not created. ++ */ ++ kvm->debugfs_dentry = ERR_PTR(-ENOENT); ++ + if (!debugfs_initialized()) + return 0; + +@@ -5373,7 +5379,7 @@ static void kvm_uevent_notify_change(uns + } + add_uevent_var(env, "PID=%d", kvm->userspace_pid); + +- if (kvm->debugfs_dentry) { ++ if (!IS_ERR(kvm->debugfs_dentry)) { + char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT); + + if (p) { diff --git a/queue-5.15/kvm-x86-mmu-resolve-nx_huge_pages-when-kvm.ko-is-loaded.patch b/queue-5.15/kvm-x86-mmu-resolve-nx_huge_pages-when-kvm.ko-is-loaded.patch new file mode 100644 index 00000000000..8335c1ffab8 --- /dev/null +++ b/queue-5.15/kvm-x86-mmu-resolve-nx_huge_pages-when-kvm.ko-is-loaded.patch @@ -0,0 +1,156 @@ +From 1d0e84806047f38027d7572adb4702ef7c09b317 Mon Sep 17 00:00:00 2001 +From: Sean Christopherson +Date: Thu, 31 Mar 2022 22:13:59 +0000 +Subject: KVM: x86/mmu: Resolve nx_huge_pages when kvm.ko is loaded + +From: Sean Christopherson + +commit 1d0e84806047f38027d7572adb4702ef7c09b317 upstream. + +Resolve nx_huge_pages to true/false when kvm.ko is loaded, leaving it as +-1 is technically undefined behavior when its value is read out by +param_get_bool(), as boolean values are supposed to be '0' or '1'. + +Alternatively, KVM could define a custom getter for the param, but the +auto value doesn't depend on the vendor module in any way, and printing +"auto" would be unnecessarily unfriendly to the user. + +In addition to fixing the undefined behavior, resolving the auto value +also fixes the scenario where the auto value resolves to N and no vendor +module is loaded. Previously, -1 would result in Y being printed even +though KVM would ultimately disable the mitigation. + +Rename the existing MMU module init/exit helpers to clarify that they're +invoked with respect to the vendor module, and add comments to document +why KVM has two separate "module init" flows. 
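+
+The invalid load is easy to reproduce outside the kernel (an
+illustrative user-space analogue, not kernel code; build with
+"gcc -fsanitize=bool" to get the same diagnostic):
+
+	#include <stdbool.h>
+	#include <stdio.h>
+	#include <string.h>
+
+	int main(void)
+	{
+		int auto_val = -1;	/* the param's old "auto" encoding */
+		bool b;
+
+		/* param_get_bool() effectively reads the int as a bool */
+		memcpy(&b, &auto_val, sizeof(b));
+		printf("%d\n", b);	/* 0xff is not a valid _Bool: UB */
+		return 0;
+	}
+
+The in-kernel report looks like this: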
+ + ========================================================================= + UBSAN: invalid-load in kernel/params.c:320:33 + load of value 255 is not a valid value for type '_Bool' + CPU: 6 PID: 892 Comm: tail Not tainted 5.17.0-rc3+ #799 + Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 + Call Trace: + + dump_stack_lvl+0x34/0x44 + ubsan_epilogue+0x5/0x40 + __ubsan_handle_load_invalid_value.cold+0x43/0x48 + param_get_bool.cold+0xf/0x14 + param_attr_show+0x55/0x80 + module_attr_show+0x1c/0x30 + sysfs_kf_seq_show+0x93/0xc0 + seq_read_iter+0x11c/0x450 + new_sync_read+0x11b/0x1a0 + vfs_read+0xf0/0x190 + ksys_read+0x5f/0xe0 + do_syscall_64+0x3b/0xc0 + entry_SYSCALL_64_after_hwframe+0x44/0xae + + ========================================================================= + +Fixes: b8e8c8303ff2 ("kvm: mmu: ITLB_MULTIHIT mitigation") +Cc: stable@vger.kernel.org +Reported-by: Bruno Goncalves +Reported-by: Jan Stancek +Signed-off-by: Sean Christopherson +Message-Id: <20220331221359.3912754-1-seanjc@google.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/include/asm/kvm_host.h | 5 +++-- + arch/x86/kvm/mmu/mmu.c | 20 ++++++++++++++++---- + arch/x86/kvm/x86.c | 20 ++++++++++++++++++-- + 3 files changed, 37 insertions(+), 8 deletions(-) + +--- a/arch/x86/include/asm/kvm_host.h ++++ b/arch/x86/include/asm/kvm_host.h +@@ -1559,8 +1559,9 @@ static inline int kvm_arch_flush_remote_ + return -ENOTSUPP; + } + +-int kvm_mmu_module_init(void); +-void kvm_mmu_module_exit(void); ++void kvm_mmu_x86_module_init(void); ++int kvm_mmu_vendor_module_init(void); ++void kvm_mmu_vendor_module_exit(void); + + void kvm_mmu_destroy(struct kvm_vcpu *vcpu); + int kvm_mmu_create(struct kvm_vcpu *vcpu); +--- a/arch/x86/kvm/mmu/mmu.c ++++ b/arch/x86/kvm/mmu/mmu.c +@@ -6105,12 +6105,24 @@ static int set_nx_huge_pages(const char + return 0; + } + +-int kvm_mmu_module_init(void) ++/* ++ * nx_huge_pages needs to be resolved to true/false when kvm.ko is loaded, as ++ * its default value of -1 is technically undefined behavior for a boolean. ++ */ ++void kvm_mmu_x86_module_init(void) + { +- int ret = -ENOMEM; +- + if (nx_huge_pages == -1) + __set_nx_huge_pages(get_nx_auto_mode()); ++} ++ ++/* ++ * The bulk of the MMU initialization is deferred until the vendor module is ++ * loaded as many of the masks/values may be modified by VMX or SVM, i.e. need ++ * to be reset when a potentially different vendor module is loaded. 
++ */ ++int kvm_mmu_vendor_module_init(void) ++{ ++ int ret = -ENOMEM; + + /* + * MMU roles use union aliasing which is, generally speaking, an +@@ -6182,7 +6194,7 @@ void kvm_mmu_destroy(struct kvm_vcpu *vc + mmu_free_memory_caches(vcpu); + } + +-void kvm_mmu_module_exit(void) ++void kvm_mmu_vendor_module_exit(void) + { + mmu_destroy_caches(); + percpu_counter_destroy(&kvm_total_used_mmu_pages); +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -8562,7 +8562,7 @@ int kvm_arch_init(void *opaque) + } + kvm_nr_uret_msrs = 0; + +- r = kvm_mmu_module_init(); ++ r = kvm_mmu_vendor_module_init(); + if (r) + goto out_free_percpu; + +@@ -8612,7 +8612,7 @@ void kvm_arch_exit(void) + cancel_work_sync(&pvclock_gtod_work); + #endif + kvm_x86_ops.hardware_enable = NULL; +- kvm_mmu_module_exit(); ++ kvm_mmu_vendor_module_exit(); + free_percpu(user_return_msrs); + kmem_cache_destroy(x86_emulator_cache); + kmem_cache_destroy(x86_fpu_cache); +@@ -12618,3 +12618,19 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit + EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_exit); + EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_enter); + EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_exit); ++ ++static int __init kvm_x86_init(void) ++{ ++ kvm_mmu_x86_module_init(); ++ return 0; ++} ++module_init(kvm_x86_init); ++ ++static void __exit kvm_x86_exit(void) ++{ ++ /* ++ * If module_init() is implemented, module_exit() must also be ++ * implemented to allow module unload. ++ */ ++} ++module_exit(kvm_x86_exit); diff --git a/queue-5.15/memory-renesas-rpc-if-fix-platform-device-leak-in-error-path.patch b/queue-5.15/memory-renesas-rpc-if-fix-platform-device-leak-in-error-path.patch new file mode 100644 index 00000000000..6ed65a30f95 --- /dev/null +++ b/queue-5.15/memory-renesas-rpc-if-fix-platform-device-leak-in-error-path.patch @@ -0,0 +1,49 @@ +From b452dbf24d7d9a990d70118462925f6ee287d135 Mon Sep 17 00:00:00 2001 +From: Johan Hovold +Date: Thu, 3 Mar 2022 19:06:32 +0100 +Subject: memory: renesas-rpc-if: fix platform-device leak in error path + +From: Johan Hovold + +commit b452dbf24d7d9a990d70118462925f6ee287d135 upstream. + +Make sure to free the flash platform device in the event that +registration fails during probe. 
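+
+The fix follows the usual refcounting contract for platform devices:
+platform_device_alloc() returns a device holding a reference that
+platform_device_add() does not drop on failure, so the error path has
+to release it explicitly. In outline (a sketch of the pattern, not the
+verbatim driver code):
+
+	vdev = platform_device_alloc(name, pdev->id);
+	if (!vdev)
+		return -ENOMEM;
+	/* ... set up vdev ... */
+	ret = platform_device_add(vdev);
+	if (ret) {
+		/* drop the reference taken by platform_device_alloc() */
+		platform_device_put(vdev);
+		return ret;
+	}
+	return 0;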
+ +Fixes: ca7d8b980b67 ("memory: add Renesas RPC-IF driver") +Cc: stable@vger.kernel.org # 5.8 +Cc: Sergei Shtylyov +Signed-off-by: Johan Hovold +Link: https://lore.kernel.org/r/20220303180632.3194-1-johan@kernel.org +Signed-off-by: Krzysztof Kozlowski +Signed-off-by: Greg Kroah-Hartman +--- + drivers/memory/renesas-rpc-if.c | 10 +++++++++- + 1 file changed, 9 insertions(+), 1 deletion(-) + +--- a/drivers/memory/renesas-rpc-if.c ++++ b/drivers/memory/renesas-rpc-if.c +@@ -579,6 +579,7 @@ static int rpcif_probe(struct platform_d + struct platform_device *vdev; + struct device_node *flash; + const char *name; ++ int ret; + + flash = of_get_next_child(pdev->dev.of_node, NULL); + if (!flash) { +@@ -602,7 +603,14 @@ static int rpcif_probe(struct platform_d + return -ENOMEM; + vdev->dev.parent = &pdev->dev; + platform_set_drvdata(pdev, vdev); +- return platform_device_add(vdev); ++ ++ ret = platform_device_add(vdev); ++ if (ret) { ++ platform_device_put(vdev); ++ return ret; ++ } ++ ++ return 0; + } + + static int rpcif_remove(struct platform_device *pdev) diff --git a/queue-5.15/mm-fix-unexpected-zeroed-page-mapping-with-zram-swap.patch b/queue-5.15/mm-fix-unexpected-zeroed-page-mapping-with-zram-swap.patch new file mode 100644 index 00000000000..fc2e74eb654 --- /dev/null +++ b/queue-5.15/mm-fix-unexpected-zeroed-page-mapping-with-zram-swap.patch @@ -0,0 +1,156 @@ +From e914d8f00391520ecc4495dd0ca0124538ab7119 Mon Sep 17 00:00:00 2001 +From: Minchan Kim +Date: Thu, 14 Apr 2022 19:13:46 -0700 +Subject: mm: fix unexpected zeroed page mapping with zram swap + +From: Minchan Kim + +commit e914d8f00391520ecc4495dd0ca0124538ab7119 upstream. + +Two processes under CLONE_VM cloning, user process can be corrupted by +seeing zeroed page unexpectedly. + + CPU A CPU B + + do_swap_page do_swap_page + SWP_SYNCHRONOUS_IO path SWP_SYNCHRONOUS_IO path + swap_readpage valid data + swap_slot_free_notify + delete zram entry + swap_readpage zeroed(invalid) data + pte_lock + map the *zero data* to userspace + pte_unlock + pte_lock + if (!pte_same) + goto out_nomap; + pte_unlock + return and next refault will + read zeroed data + +The swap_slot_free_notify is bogus for CLONE_VM case since it doesn't +increase the refcount of swap slot at copy_mm so it couldn't catch up +whether it's safe or not to discard data from backing device. In the +case, only the lock it could rely on to synchronize swap slot freeing is +page table lock. Thus, this patch gets rid of the swap_slot_free_notify +function. With this patch, CPU A will see correct data. + + CPU A CPU B + + do_swap_page do_swap_page + SWP_SYNCHRONOUS_IO path SWP_SYNCHRONOUS_IO path + swap_readpage original data + pte_lock + map the original data + swap_free + swap_range_free + bd_disk->fops->swap_slot_free_notify + swap_readpage read zeroed data + pte_unlock + pte_lock + if (!pte_same) + goto out_nomap; + pte_unlock + return + on next refault will see mapped data by CPU B + +The concern of the patch would increase memory consumption since it +could keep wasted memory with compressed form in zram as well as +uncompressed form in address space. However, most of cases of zram uses +no readahead and do_swap_page is followed by swap_free so it will free +the compressed form from in zram quickly. 
+ +Link: https://lkml.kernel.org/r/YjTVVxIAsnKAXjTd@google.com +Fixes: 0bcac06f27d7 ("mm, swap: skip swapcache for swapin of synchronous device") +Reported-by: Ivan Babrou +Tested-by: Ivan Babrou +Signed-off-by: Minchan Kim +Cc: Nitin Gupta +Cc: Sergey Senozhatsky +Cc: Jens Axboe +Cc: David Hildenbrand +Cc: [4.14+] +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + mm/page_io.c | 54 ------------------------------------------------------ + 1 file changed, 54 deletions(-) + +--- a/mm/page_io.c ++++ b/mm/page_io.c +@@ -50,54 +50,6 @@ void end_swap_bio_write(struct bio *bio) + bio_put(bio); + } + +-static void swap_slot_free_notify(struct page *page) +-{ +- struct swap_info_struct *sis; +- struct gendisk *disk; +- swp_entry_t entry; +- +- /* +- * There is no guarantee that the page is in swap cache - the software +- * suspend code (at least) uses end_swap_bio_read() against a non- +- * swapcache page. So we must check PG_swapcache before proceeding with +- * this optimization. +- */ +- if (unlikely(!PageSwapCache(page))) +- return; +- +- sis = page_swap_info(page); +- if (data_race(!(sis->flags & SWP_BLKDEV))) +- return; +- +- /* +- * The swap subsystem performs lazy swap slot freeing, +- * expecting that the page will be swapped out again. +- * So we can avoid an unnecessary write if the page +- * isn't redirtied. +- * This is good for real swap storage because we can +- * reduce unnecessary I/O and enhance wear-leveling +- * if an SSD is used as the as swap device. +- * But if in-memory swap device (eg zram) is used, +- * this causes a duplicated copy between uncompressed +- * data in VM-owned memory and compressed data in +- * zram-owned memory. So let's free zram-owned memory +- * and make the VM-owned decompressed page *dirty*, +- * so the page should be swapped out somewhere again if +- * we again wish to reclaim it. +- */ +- disk = sis->bdev->bd_disk; +- entry.val = page_private(page); +- if (disk->fops->swap_slot_free_notify && __swap_count(entry) == 1) { +- unsigned long offset; +- +- offset = swp_offset(entry); +- +- SetPageDirty(page); +- disk->fops->swap_slot_free_notify(sis->bdev, +- offset); +- } +-} +- + static void end_swap_bio_read(struct bio *bio) + { + struct page *page = bio_first_page_all(bio); +@@ -113,7 +65,6 @@ static void end_swap_bio_read(struct bio + } + + SetPageUptodate(page); +- swap_slot_free_notify(page); + out: + unlock_page(page); + WRITE_ONCE(bio->bi_private, NULL); +@@ -392,11 +343,6 @@ int swap_readpage(struct page *page, boo + if (sis->flags & SWP_SYNCHRONOUS_IO) { + ret = bdev_read_page(sis->bdev, swap_page_sector(page), page); + if (!ret) { +- if (trylock_page(page)) { +- swap_slot_free_notify(page); +- unlock_page(page); +- } +- + count_vm_event(PSWPIN); + goto out; + } diff --git a/queue-5.15/mm-kmemleak-take-a-full-lowmem-check-in-kmemleak_-_phys.patch b/queue-5.15/mm-kmemleak-take-a-full-lowmem-check-in-kmemleak_-_phys.patch new file mode 100644 index 00000000000..509d20a2900 --- /dev/null +++ b/queue-5.15/mm-kmemleak-take-a-full-lowmem-check-in-kmemleak_-_phys.patch @@ -0,0 +1,96 @@ +From 23c2d497de21f25898fbea70aeb292ab8acc8c94 Mon Sep 17 00:00:00 2001 +From: Patrick Wang +Date: Thu, 14 Apr 2022 19:14:04 -0700 +Subject: mm: kmemleak: take a full lowmem check in kmemleak_*_phys() + +From: Patrick Wang + +commit 23c2d497de21f25898fbea70aeb292ab8acc8c94 upstream. 
+ +The kmemleak_*_phys() apis do not check the address for lowmem's min +boundary, while the caller may pass an address below lowmem, which will +trigger an oops: + + # echo scan > /sys/kernel/debug/kmemleak + Unable to handle kernel paging request at virtual address ff5fffffffe00000 + Oops [#1] + Modules linked in: + CPU: 2 PID: 134 Comm: bash Not tainted 5.18.0-rc1-next-20220407 #33 + Hardware name: riscv-virtio,qemu (DT) + epc : scan_block+0x74/0x15c + ra : scan_block+0x72/0x15c + epc : ffffffff801e5806 ra : ffffffff801e5804 sp : ff200000104abc30 + gp : ffffffff815cd4e8 tp : ff60000004cfa340 t0 : 0000000000000200 + t1 : 00aaaaaac23954cc t2 : 00000000000003ff s0 : ff200000104abc90 + s1 : ffffffff81b0ff28 a0 : 0000000000000000 a1 : ff5fffffffe01000 + a2 : ffffffff81b0ff28 a3 : 0000000000000002 a4 : 0000000000000001 + a5 : 0000000000000000 a6 : ff200000104abd7c a7 : 0000000000000005 + s2 : ff5fffffffe00ff9 s3 : ffffffff815cd998 s4 : ffffffff815d0e90 + s5 : ffffffff81b0ff28 s6 : 0000000000000020 s7 : ffffffff815d0eb0 + s8 : ffffffffffffffff s9 : ff5fffffffe00000 s10: ff5fffffffe01000 + s11: 0000000000000022 t3 : 00ffffffaa17db4c t4 : 000000000000000f + t5 : 0000000000000001 t6 : 0000000000000000 + status: 0000000000000100 badaddr: ff5fffffffe00000 cause: 000000000000000d + scan_gray_list+0x12e/0x1a6 + kmemleak_scan+0x2aa/0x57e + kmemleak_write+0x32a/0x40c + full_proxy_write+0x56/0x82 + vfs_write+0xa6/0x2a6 + ksys_write+0x6c/0xe2 + sys_write+0x22/0x2a + ret_from_syscall+0x0/0x2 + +The callers may not quite know the actual address they pass(e.g. from +devicetree). So the kmemleak_*_phys() apis should guarantee the address +they finally use is in lowmem range, so check the address for lowmem's +min boundary. + +Link: https://lkml.kernel.org/r/20220413122925.33856-1-patrick.wang.shcn@gmail.com +Signed-off-by: Patrick Wang +Acked-by: Catalin Marinas +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + mm/kmemleak.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +--- a/mm/kmemleak.c ++++ b/mm/kmemleak.c +@@ -1125,7 +1125,7 @@ EXPORT_SYMBOL(kmemleak_no_scan); + void __ref kmemleak_alloc_phys(phys_addr_t phys, size_t size, int min_count, + gfp_t gfp) + { +- if (!IS_ENABLED(CONFIG_HIGHMEM) || PHYS_PFN(phys) < max_low_pfn) ++ if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn) + kmemleak_alloc(__va(phys), size, min_count, gfp); + } + EXPORT_SYMBOL(kmemleak_alloc_phys); +@@ -1139,7 +1139,7 @@ EXPORT_SYMBOL(kmemleak_alloc_phys); + */ + void __ref kmemleak_free_part_phys(phys_addr_t phys, size_t size) + { +- if (!IS_ENABLED(CONFIG_HIGHMEM) || PHYS_PFN(phys) < max_low_pfn) ++ if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn) + kmemleak_free_part(__va(phys), size); + } + EXPORT_SYMBOL(kmemleak_free_part_phys); +@@ -1151,7 +1151,7 @@ EXPORT_SYMBOL(kmemleak_free_part_phys); + */ + void __ref kmemleak_not_leak_phys(phys_addr_t phys) + { +- if (!IS_ENABLED(CONFIG_HIGHMEM) || PHYS_PFN(phys) < max_low_pfn) ++ if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn) + kmemleak_not_leak(__va(phys)); + } + EXPORT_SYMBOL(kmemleak_not_leak_phys); +@@ -1163,7 +1163,7 @@ EXPORT_SYMBOL(kmemleak_not_leak_phys); + */ + void __ref kmemleak_ignore_phys(phys_addr_t phys) + { +- if (!IS_ENABLED(CONFIG_HIGHMEM) || PHYS_PFN(phys) < max_low_pfn) ++ if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn) + kmemleak_ignore(__va(phys)); + } + EXPORT_SYMBOL(kmemleak_ignore_phys); diff --git 
a/queue-5.15/mm-page_alloc-fix-build_zonerefs_node.patch b/queue-5.15/mm-page_alloc-fix-build_zonerefs_node.patch new file mode 100644 index 00000000000..4888f66ed18 --- /dev/null +++ b/queue-5.15/mm-page_alloc-fix-build_zonerefs_node.patch @@ -0,0 +1,69 @@ +From e553f62f10d93551eb883eca227ac54d1a4fad84 Mon Sep 17 00:00:00 2001 +From: Juergen Gross +Date: Thu, 14 Apr 2022 19:13:43 -0700 +Subject: mm, page_alloc: fix build_zonerefs_node() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Juergen Gross + +commit e553f62f10d93551eb883eca227ac54d1a4fad84 upstream. + +Since commit 6aa303defb74 ("mm, vmscan: only allocate and reclaim from +zones with pages managed by the buddy allocator") only zones with free +memory are included in a built zonelist. This is problematic when e.g. +all memory of a zone has been ballooned out when zonelists are being +rebuilt. + +The decision whether to rebuild the zonelists when onlining new memory +is done based on populated_zone() returning 0 for the zone the memory +will be added to. The new zone is added to the zonelists only, if it +has free memory pages (managed_zone() returns a non-zero value) after +the memory has been onlined. This implies, that onlining memory will +always free the added pages to the allocator immediately, but this is +not true in all cases: when e.g. running as a Xen guest the onlined new +memory will be added only to the ballooned memory list, it will be freed +only when the guest is being ballooned up afterwards. + +Another problem with using managed_zone() for the decision whether a +zone is being added to the zonelists is, that a zone with all memory +used will in fact be removed from all zonelists in case the zonelists +happen to be rebuilt. + +Use populated_zone() when building a zonelist as it has been done before +that commit. + +There was a report that QubesOS (based on Xen) is hitting this problem. +Xen has switched to use the zone device functionality in kernel 5.9 and +QubesOS wants to use memory hotplugging for guests in order to be able +to start a guest with minimal memory and expand it as needed. This was +the report leading to the patch. 
+ +Link: https://lkml.kernel.org/r/20220407120637.9035-1-jgross@suse.com +Fixes: 6aa303defb74 ("mm, vmscan: only allocate and reclaim from zones with pages managed by the buddy allocator") +Signed-off-by: Juergen Gross +Reported-by: Marek Marczykowski-Górecki +Acked-by: Michal Hocko +Acked-by: David Hildenbrand +Cc: Marek Marczykowski-Górecki +Reviewed-by: Wei Yang +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + mm/page_alloc.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -6092,7 +6092,7 @@ static int build_zonerefs_node(pg_data_t + do { + zone_type--; + zone = pgdat->node_zones + zone_type; +- if (managed_zone(zone)) { ++ if (populated_zone(zone)) { + zoneref_set_zone(zone, &zonerefs[nr_zones++]); + check_highest_zone(zone_type); + } diff --git a/queue-5.15/mm-secretmem-fix-panic-when-growing-a-memfd_secret.patch b/queue-5.15/mm-secretmem-fix-panic-when-growing-a-memfd_secret.patch new file mode 100644 index 00000000000..a36951da9ef --- /dev/null +++ b/queue-5.15/mm-secretmem-fix-panic-when-growing-a-memfd_secret.patch @@ -0,0 +1,130 @@ +From f9b141f93659e09a52e28791ccbaf69c273b8e92 Mon Sep 17 00:00:00 2001 +From: Axel Rasmussen +Date: Thu, 14 Apr 2022 19:13:31 -0700 +Subject: mm/secretmem: fix panic when growing a memfd_secret + +From: Axel Rasmussen + +commit f9b141f93659e09a52e28791ccbaf69c273b8e92 upstream. + +When one tries to grow an existing memfd_secret with ftruncate, one gets +a panic [1]. For example, doing the following reliably induces the +panic: + + fd = memfd_secret(); + + ftruncate(fd, 10); + ptr = mmap(NULL, 10, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + strcpy(ptr, "123456789"); + + munmap(ptr, 10); + ftruncate(fd, 20); + +The basic reason for this is, when we grow with ftruncate, we call down +into simple_setattr, and then truncate_inode_pages_range, and eventually +we try to zero part of the memory. The normal truncation code does this +via the direct map (i.e., it calls page_address() and hands that to +memset()). + +For memfd_secret though, we specifically don't map our pages via the +direct map (i.e. we call set_direct_map_invalid_noflush() on every +fault). So the address returned by page_address() isn't useful, and +when we try to memset() with it we panic. + +This patch avoids the panic by implementing a custom setattr for +memfd_secret, which detects resizes specifically (setting the size for +the first time works just fine, since there are no existing pages to try +to zero), and rejects them with EINVAL. + +One could argue growing should be supported, but I think that will +require a significantly more lengthy change. So, I propose a minimal +fix for the benefit of stable kernels, and then perhaps to extend +memfd_secret to support growing in a separate patch. 
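+
+For convenience, the snippet at the top of this message expands into
+the following complete reproducer (illustrative: memfd_secret() is
+invoked via syscall() since libc provides no wrapper, and some kernels
+must additionally be booted with secretmem.enable=1):
+
+	#define _GNU_SOURCE
+	#include <string.h>
+	#include <sys/mman.h>
+	#include <sys/syscall.h>
+	#include <unistd.h>
+
+	#ifndef SYS_memfd_secret
+	#define SYS_memfd_secret 447	/* x86-64; assumed for older headers */
+	#endif
+
+	int main(void)
+	{
+		char *ptr;
+		int fd = syscall(SYS_memfd_secret, 0);
+
+		if (fd < 0)
+			return 1;
+		ftruncate(fd, 10);
+		ptr = mmap(NULL, 10, PROT_READ | PROT_WRITE,
+			   MAP_SHARED, fd, 0);
+		if (ptr == MAP_FAILED)
+			return 1;
+		strcpy(ptr, "123456789");
+		munmap(ptr, 10);
+		/* growing panics unpatched kernels; -EINVAL with this fix */
+		ftruncate(fd, 20);
+		return 0;
+	}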
+ +[1]: + + BUG: unable to handle page fault for address: ffffa0a889277028 + #PF: supervisor write access in kernel mode + #PF: error_code(0x0002) - not-present page + PGD afa01067 P4D afa01067 PUD 83f909067 PMD 83f8bf067 PTE 800ffffef6d88060 + Oops: 0002 [#1] PREEMPT SMP DEBUG_PAGEALLOC PTI + CPU: 0 PID: 281 Comm: repro Not tainted 5.17.0-dbg-DEV #1 + Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 + RIP: 0010:memset_erms+0x9/0x10 + Code: c1 e9 03 40 0f b6 f6 48 b8 01 01 01 01 01 01 01 01 48 0f af c6 f3 48 ab 89 d1 f3 aa 4c 89 c8 c3 90 49 89 f9 40 88 f0 48 89 d1 aa 4c 89 c8 c3 90 49 89 fa 40 0f b6 ce 48 b8 01 01 01 01 01 01 + RSP: 0018:ffffb932c09afbf0 EFLAGS: 00010246 + RAX: 0000000000000000 RBX: ffffda63c4249dc0 RCX: 0000000000000fd8 + RDX: 0000000000000fd8 RSI: 0000000000000000 RDI: ffffa0a889277028 + RBP: ffffb932c09afc00 R08: 0000000000001000 R09: ffffa0a889277028 + R10: 0000000000020023 R11: 0000000000000000 R12: ffffda63c4249dc0 + R13: ffffa0a890d70d98 R14: 0000000000000028 R15: 0000000000000fd8 + FS: 00007f7294899580(0000) GS:ffffa0af9bc00000(0000) knlGS:0000000000000000 + CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 + CR2: ffffa0a889277028 CR3: 0000000107ef6006 CR4: 0000000000370ef0 + DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 + DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 + Call Trace: + ? zero_user_segments+0x82/0x190 + truncate_inode_partial_folio+0xd4/0x2a0 + truncate_inode_pages_range+0x380/0x830 + truncate_setsize+0x63/0x80 + simple_setattr+0x37/0x60 + notify_change+0x3d8/0x4d0 + do_sys_ftruncate+0x162/0x1d0 + __x64_sys_ftruncate+0x1c/0x20 + do_syscall_64+0x44/0xa0 + entry_SYSCALL_64_after_hwframe+0x44/0xae + Modules linked in: xhci_pci xhci_hcd virtio_net net_failover failover virtio_blk virtio_balloon uhci_hcd ohci_pci ohci_hcd evdev ehci_pci ehci_hcd 9pnet_virtio 9p netfs 9pnet + CR2: ffffa0a889277028 + +[lkp@intel.com: secretmem_iops can be static] + Signed-off-by: kernel test robot +[axelrasmussen@google.com: return EINVAL] + +Link: https://lkml.kernel.org/r/20220324210909.1843814-1-axelrasmussen@google.com +Link: https://lkml.kernel.org/r/20220412193023.279320-1-axelrasmussen@google.com +Signed-off-by: Axel Rasmussen +Cc: Mike Rapoport +Cc: Matthew Wilcox +Cc: +Cc: kernel test robot +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + mm/secretmem.c | 17 +++++++++++++++++ + 1 file changed, 17 insertions(+) + +--- a/mm/secretmem.c ++++ b/mm/secretmem.c +@@ -158,6 +158,22 @@ const struct address_space_operations se + .isolate_page = secretmem_isolate_page, + }; + ++static int secretmem_setattr(struct user_namespace *mnt_userns, ++ struct dentry *dentry, struct iattr *iattr) ++{ ++ struct inode *inode = d_inode(dentry); ++ unsigned int ia_valid = iattr->ia_valid; ++ ++ if ((ia_valid & ATTR_SIZE) && inode->i_size) ++ return -EINVAL; ++ ++ return simple_setattr(mnt_userns, dentry, iattr); ++} ++ ++static const struct inode_operations secretmem_iops = { ++ .setattr = secretmem_setattr, ++}; ++ + static struct vfsmount *secretmem_mnt; + + static struct file *secretmem_file_create(unsigned long flags) +@@ -177,6 +193,7 @@ static struct file *secretmem_file_creat + mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); + mapping_set_unevictable(inode->i_mapping); + ++ inode->i_op = &secretmem_iops; + inode->i_mapping->a_ops = &secretmem_aops; + + /* pretend we are a normal file with zero size */ diff --git a/queue-5.15/series b/queue-5.15/series 
index 4cae06a58da..e8b79b15011 100644 --- a/queue-5.15/series +++ b/queue-5.15/series @@ -141,3 +141,13 @@ drivers-net-slip-fix-npd-bug-in-sl_tx_timeout.patch io_uring-zero-tag-on-rsrc-removal.patch io_uring-use-nospec-annotation-for-more-indexes.patch perf-imx_ddr-fix-undefined-behavior-due-to-shift-ove.patch +mm-secretmem-fix-panic-when-growing-a-memfd_secret.patch +mm-page_alloc-fix-build_zonerefs_node.patch +mm-fix-unexpected-zeroed-page-mapping-with-zram-swap.patch +mm-kmemleak-take-a-full-lowmem-check-in-kmemleak_-_phys.patch +kvm-x86-mmu-resolve-nx_huge_pages-when-kvm.ko-is-loaded.patch +kvm-don-t-create-vm-debugfs-files-outside-of-the-vm-directory.patch +sunrpc-fix-nfsd-s-request-deferral-on-rdma-transports.patch +memory-renesas-rpc-if-fix-platform-device-leak-in-error-path.patch +gcc-plugins-latent_entropy-use-dev-urandom.patch +cifs-verify-that-tcon-is-valid-before-dereference-in-cifs_kill_sb.patch diff --git a/queue-5.15/sunrpc-fix-nfsd-s-request-deferral-on-rdma-transports.patch b/queue-5.15/sunrpc-fix-nfsd-s-request-deferral-on-rdma-transports.patch new file mode 100644 index 00000000000..9a128b71876 --- /dev/null +++ b/queue-5.15/sunrpc-fix-nfsd-s-request-deferral-on-rdma-transports.patch @@ -0,0 +1,83 @@ +From 773f91b2cf3f52df0d7508fdbf60f37567cdaee4 Mon Sep 17 00:00:00 2001 +From: Chuck Lever +Date: Fri, 1 Apr 2022 17:08:21 -0400 +Subject: SUNRPC: Fix NFSD's request deferral on RDMA transports + +From: Chuck Lever + +commit 773f91b2cf3f52df0d7508fdbf60f37567cdaee4 upstream. + +Trond Myklebust reports an NFSD crash in svc_rdma_sendto(). Further +investigation shows that the crash occurred while NFSD was handling +a deferred request. + +This patch addresses two inter-related issues that prevent request +deferral from working correctly for RPC/RDMA requests: + +1. Prevent the crash by ensuring that the original + svc_rqst::rq_xprt_ctxt value is available when the request is + revisited. Otherwise svc_rdma_sendto() does not have a Receive + context available with which to construct its reply. + +2. Possibly since before commit 71641d99ce03 ("svcrdma: Properly + compute .len and .buflen for received RPC Calls"), + svc_rdma_recvfrom() did not include the transport header in the + returned xdr_buf. There should have been no need for svc_defer() + and friends to save and restore that header, as of that commit. + This issue is addressed in a backport-friendly way by simply + having svc_rdma_recvfrom() set rq_xprt_hlen to zero + unconditionally, just as svc_tcp_recvfrom() does. This enables + svc_deferred_recv() to correctly reconstruct an RPC message + received via RPC/RDMA. 
+ +Reported-by: Trond Myklebust +Link: https://lore.kernel.org/linux-nfs/82662b7190f26fb304eb0ab1bb04279072439d4e.camel@hammerspace.com/ +Signed-off-by: Chuck Lever +Cc: +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/sunrpc/svc.h | 1 + + net/sunrpc/svc_xprt.c | 3 +++ + net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 2 +- + 3 files changed, 5 insertions(+), 1 deletion(-) + +--- a/include/linux/sunrpc/svc.h ++++ b/include/linux/sunrpc/svc.h +@@ -384,6 +384,7 @@ struct svc_deferred_req { + size_t addrlen; + struct sockaddr_storage daddr; /* where reply must come from */ + size_t daddrlen; ++ void *xprt_ctxt; + struct cache_deferred_req handle; + size_t xprt_hlen; + int argslen; +--- a/net/sunrpc/svc_xprt.c ++++ b/net/sunrpc/svc_xprt.c +@@ -1213,6 +1213,8 @@ static struct cache_deferred_req *svc_de + dr->daddr = rqstp->rq_daddr; + dr->argslen = rqstp->rq_arg.len >> 2; + dr->xprt_hlen = rqstp->rq_xprt_hlen; ++ dr->xprt_ctxt = rqstp->rq_xprt_ctxt; ++ rqstp->rq_xprt_ctxt = NULL; + + /* back up head to the start of the buffer and copy */ + skip = rqstp->rq_arg.len - rqstp->rq_arg.head[0].iov_len; +@@ -1251,6 +1253,7 @@ static noinline int svc_deferred_recv(st + rqstp->rq_xprt_hlen = dr->xprt_hlen; + rqstp->rq_daddr = dr->daddr; + rqstp->rq_respages = rqstp->rq_pages; ++ rqstp->rq_xprt_ctxt = dr->xprt_ctxt; + svc_xprt_received(rqstp->rq_xprt); + return (dr->argslen<<2) - dr->xprt_hlen; + } +--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c ++++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +@@ -826,7 +826,7 @@ int svc_rdma_recvfrom(struct svc_rqst *r + goto out_err; + if (ret == 0) + goto out_drop; +- rqstp->rq_xprt_hlen = ret; ++ rqstp->rq_xprt_hlen = 0; + + if (svc_rdma_is_reverse_direction_reply(xprt, ctxt)) + goto out_backchannel;
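
A footnote on the latent_entropy change above: the deterministic
-frandom-seed fallback it introduces is a plain xorshift64 step.
Extracted into a standalone program it looks like this (illustrative;
the constant stands in for whatever get_random_seed(true) returned):

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint64_t w = 0x123456789abcdef0ULL;	/* placeholder nonzero seed */

		for (int i = 0; i < 4; i++) {
			/* same update sequence as get_random_const() */
			w ^= w << 13;
			w ^= w >> 7;
			w ^= w << 17;
			printf("%016llx\n", (unsigned long long)w);
		}
		return 0;
	}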