git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
5.15-stable patches
author: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 18 Apr 2022 08:13:12 +0000 (10:13 +0200)
committer: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 18 Apr 2022 08:13:12 +0000 (10:13 +0200)
added patches:
cifs-verify-that-tcon-is-valid-before-dereference-in-cifs_kill_sb.patch
gcc-plugins-latent_entropy-use-dev-urandom.patch
kvm-don-t-create-vm-debugfs-files-outside-of-the-vm-directory.patch
kvm-x86-mmu-resolve-nx_huge_pages-when-kvm.ko-is-loaded.patch
memory-renesas-rpc-if-fix-platform-device-leak-in-error-path.patch
mm-fix-unexpected-zeroed-page-mapping-with-zram-swap.patch
mm-kmemleak-take-a-full-lowmem-check-in-kmemleak_-_phys.patch
mm-page_alloc-fix-build_zonerefs_node.patch
mm-secretmem-fix-panic-when-growing-a-memfd_secret.patch
sunrpc-fix-nfsd-s-request-deferral-on-rdma-transports.patch

queue-5.15/cifs-verify-that-tcon-is-valid-before-dereference-in-cifs_kill_sb.patch [new file with mode: 0644]
queue-5.15/gcc-plugins-latent_entropy-use-dev-urandom.patch [new file with mode: 0644]
queue-5.15/kvm-don-t-create-vm-debugfs-files-outside-of-the-vm-directory.patch [new file with mode: 0644]
queue-5.15/kvm-x86-mmu-resolve-nx_huge_pages-when-kvm.ko-is-loaded.patch [new file with mode: 0644]
queue-5.15/memory-renesas-rpc-if-fix-platform-device-leak-in-error-path.patch [new file with mode: 0644]
queue-5.15/mm-fix-unexpected-zeroed-page-mapping-with-zram-swap.patch [new file with mode: 0644]
queue-5.15/mm-kmemleak-take-a-full-lowmem-check-in-kmemleak_-_phys.patch [new file with mode: 0644]
queue-5.15/mm-page_alloc-fix-build_zonerefs_node.patch [new file with mode: 0644]
queue-5.15/mm-secretmem-fix-panic-when-growing-a-memfd_secret.patch [new file with mode: 0644]
queue-5.15/series
queue-5.15/sunrpc-fix-nfsd-s-request-deferral-on-rdma-transports.patch [new file with mode: 0644]

diff --git a/queue-5.15/cifs-verify-that-tcon-is-valid-before-dereference-in-cifs_kill_sb.patch b/queue-5.15/cifs-verify-that-tcon-is-valid-before-dereference-in-cifs_kill_sb.patch
new file mode 100644 (file)
index 0000000..c404a5a
--- /dev/null
@@ -0,0 +1,47 @@
+From 8b6c58458ee3206dde345fce327a4cb83e69caf9 Mon Sep 17 00:00:00 2001
+From: Ronnie Sahlberg <lsahlber@redhat.com>
+Date: Wed, 13 Apr 2022 10:02:17 +1000
+Subject: cifs: verify that tcon is valid before dereference in cifs_kill_sb
+
+From: Ronnie Sahlberg <lsahlber@redhat.com>
+
+commit 8b6c58458ee3206dde345fce327a4cb83e69caf9 upstream.
+
+On umount, cifs_sb->tlink_tree might contain entries that do not represent
+a valid tcon.
+Check the tcon for error before we dereference it.
+
+Signed-off-by: Ronnie Sahlberg <lsahlber@redhat.com>
+Cc: stable@vger.kernel.org
+Reviewed-by: Shyam Prasad N <sprasad@microsoft.com>
+Reported-by: Xiaoli Feng <xifeng@redhat.com>
+Signed-off-by: Steve French <stfrench@microsoft.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/cifs/cifsfs.c |    6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+--- a/fs/cifs/cifsfs.c
++++ b/fs/cifs/cifsfs.c
+@@ -266,10 +266,11 @@ static void cifs_kill_sb(struct super_bl
+        * before we kill the sb.
+        */
+       if (cifs_sb->root) {
+-              node = rb_first(root);
+-              while (node != NULL) {
++              for (node = rb_first(root); node; node = rb_next(node)) {
+                       tlink = rb_entry(node, struct tcon_link, tl_rbnode);
+                       tcon = tlink_tcon(tlink);
++                      if (IS_ERR(tcon))
++                              continue;
+                       cfid = &tcon->crfid;
+                       mutex_lock(&cfid->fid_mutex);
+                       if (cfid->dentry) {
+@@ -277,7 +278,6 @@ static void cifs_kill_sb(struct super_bl
+                               cfid->dentry = NULL;
+                       }
+                       mutex_unlock(&cfid->fid_mutex);
+-                      node = rb_next(node);
+               }
+               /* finally release root dentry */
diff --git a/queue-5.15/gcc-plugins-latent_entropy-use-dev-urandom.patch b/queue-5.15/gcc-plugins-latent_entropy-use-dev-urandom.patch
new file mode 100644 (file)
index 0000000..c40144e
--- /dev/null
@@ -0,0 +1,121 @@
+From c40160f2998c897231f8454bf797558d30a20375 Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Wed, 6 Apr 2022 00:28:15 +0200
+Subject: gcc-plugins: latent_entropy: use /dev/urandom
+
+From: Jason A. Donenfeld <Jason@zx2c4.com>
+
+commit c40160f2998c897231f8454bf797558d30a20375 upstream.
+
+While the latent entropy plugin mostly doesn't derive entropy from
+get_random_const() for measuring the call graph, when __latent_entropy is
+applied to a constant, then it's initialized statically to output from
+get_random_const(). In that case, this data is derived from a 64-bit
+seed, which means a buffer of 512 bits doesn't really have that amount
+of compile-time entropy.
+
+This patch fixes that shortcoming by just buffering chunks of
+/dev/urandom output and doling it out as requested.
+
+At the same time, it's important that we don't break the use of
+-frandom-seed, for people who want the runtime benefits of the latent
+entropy plugin, while still having compile-time determinism. In that
+case, we detect whether gcc's set_random_seed() has been called by
+making a call to get_random_seed(noinit=true) in the plugin init
+function, which is called after set_random_seed() is called but before
+anything that calls get_random_seed(noinit=false), and seeing if it's
+zero or not. If it's not zero, we're in deterministic mode, and so we
+just generate numbers with a basic xorshift prng.
+
+Note that we don't detect if -frandom-seed is being used using the
+documented local_tick variable, because it's assigned via:
+   local_tick = (unsigned) tv.tv_sec * 1000 + tv.tv_usec / 1000;
+which may well overflow and become -1 on its own, and so isn't
+reliable: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105171
+
+[kees: The 256 byte rnd_buf size was chosen based on average (250),
+ median (64), and std deviation (575) bytes of used entropy for a
+ defconfig x86_64 build]
+
+Fixes: 38addce8b600 ("gcc-plugins: Add latent_entropy plugin")
+Cc: stable@vger.kernel.org
+Cc: PaX Team <pageexec@freemail.hu>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: Kees Cook <keescook@chromium.org>
+Link: https://lore.kernel.org/r/20220405222815.21155-1-Jason@zx2c4.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ scripts/gcc-plugins/latent_entropy_plugin.c |   44 +++++++++++++++++-----------
+ 1 file changed, 27 insertions(+), 17 deletions(-)
+
+--- a/scripts/gcc-plugins/latent_entropy_plugin.c
++++ b/scripts/gcc-plugins/latent_entropy_plugin.c
+@@ -86,25 +86,31 @@ static struct plugin_info latent_entropy
+       .help           = "disable\tturn off latent entropy instrumentation\n",
+ };
+-static unsigned HOST_WIDE_INT seed;
+-/*
+- * get_random_seed() (this is a GCC function) generates the seed.
+- * This is a simple random generator without any cryptographic security because
+- * the entropy doesn't come from here.
+- */
++static unsigned HOST_WIDE_INT deterministic_seed;
++static unsigned HOST_WIDE_INT rnd_buf[32];
++static size_t rnd_idx = ARRAY_SIZE(rnd_buf);
++static int urandom_fd = -1;
++
+ static unsigned HOST_WIDE_INT get_random_const(void)
+ {
+-      unsigned int i;
+-      unsigned HOST_WIDE_INT ret = 0;
+-
+-      for (i = 0; i < 8 * sizeof(ret); i++) {
+-              ret = (ret << 1) | (seed & 1);
+-              seed >>= 1;
+-              if (ret & 1)
+-                      seed ^= 0xD800000000000000ULL;
++      if (deterministic_seed) {
++              unsigned HOST_WIDE_INT w = deterministic_seed;
++              w ^= w << 13;
++              w ^= w >> 7;
++              w ^= w << 17;
++              deterministic_seed = w;
++              return deterministic_seed;
+       }
+-      return ret;
++      if (urandom_fd < 0) {
++              urandom_fd = open("/dev/urandom", O_RDONLY);
++              gcc_assert(urandom_fd >= 0);
++      }
++      if (rnd_idx >= ARRAY_SIZE(rnd_buf)) {
++              gcc_assert(read(urandom_fd, rnd_buf, sizeof(rnd_buf)) == sizeof(rnd_buf));
++              rnd_idx = 0;
++      }
++      return rnd_buf[rnd_idx++];
+ }
+ static tree tree_get_random_const(tree type)
+@@ -537,8 +543,6 @@ static void latent_entropy_start_unit(vo
+       tree type, id;
+       int quals;
+-      seed = get_random_seed(false);
+-
+       if (in_lto_p)
+               return;
+@@ -573,6 +577,12 @@ __visible int plugin_init(struct plugin_
+       const struct plugin_argument * const argv = plugin_info->argv;
+       int i;
++      /*
++       * Call get_random_seed() with noinit=true, so that this returns
++       * 0 in the case where no seed has been passed via -frandom-seed.
++       */
++      deterministic_seed = get_random_seed(true);
++
+       static const struct ggc_root_tab gt_ggc_r_gt_latent_entropy[] = {
+               {
+                       .base = &latent_entropy_decl,
diff --git a/queue-5.15/kvm-don-t-create-vm-debugfs-files-outside-of-the-vm-directory.patch b/queue-5.15/kvm-don-t-create-vm-debugfs-files-outside-of-the-vm-directory.patch
new file mode 100644 (file)
index 0000000..6349c7b
--- /dev/null
@@ -0,0 +1,68 @@
+From a44a4cc1c969afec97dbb2aedaf6f38eaa6253bb Mon Sep 17 00:00:00 2001
+From: Oliver Upton <oupton@google.com>
+Date: Wed, 6 Apr 2022 23:56:13 +0000
+Subject: KVM: Don't create VM debugfs files outside of the VM directory
+
+From: Oliver Upton <oupton@google.com>
+
+commit a44a4cc1c969afec97dbb2aedaf6f38eaa6253bb upstream.
+
+Unfortunately, there is no guarantee that KVM was able to instantiate a
+debugfs directory for a particular VM. To that end, KVM shouldn't even
+attempt to create new debugfs files in this case. If the specified
+parent dentry is NULL, debugfs_create_file() will instantiate files at
+the root of debugfs.
+
+For arm64, it is possible to create the vgic-state file outside of a
+VM directory, the file is not cleaned up when a VM is destroyed.
+Nonetheless, the corresponding struct kvm is freed when the VM is
+destroyed.
+
+Nip the problem in the bud for all possible errant debugfs file
+creations by initializing kvm->debugfs_dentry to -ENOENT. In so doing,
+debugfs_create_file() will fail instead of creating the file in the root
+directory.
+
+Cc: stable@kernel.org
+Fixes: 929f45e32499 ("kvm: no need to check return value of debugfs_create functions")
+Signed-off-by: Oliver Upton <oupton@google.com>
+Signed-off-by: Marc Zyngier <maz@kernel.org>
+Link: https://lore.kernel.org/r/20220406235615.1447180-2-oupton@google.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ virt/kvm/kvm_main.c |   10 ++++++++--
+ 1 file changed, 8 insertions(+), 2 deletions(-)
+
+--- a/virt/kvm/kvm_main.c
++++ b/virt/kvm/kvm_main.c
+@@ -911,7 +911,7 @@ static void kvm_destroy_vm_debugfs(struc
+       int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
+                                     kvm_vcpu_stats_header.num_desc;
+-      if (!kvm->debugfs_dentry)
++      if (IS_ERR(kvm->debugfs_dentry))
+               return;
+       debugfs_remove_recursive(kvm->debugfs_dentry);
+@@ -934,6 +934,12 @@ static int kvm_create_vm_debugfs(struct
+       int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
+                                     kvm_vcpu_stats_header.num_desc;
++      /*
++       * Force subsequent debugfs file creations to fail if the VM directory
++       * is not created.
++       */
++      kvm->debugfs_dentry = ERR_PTR(-ENOENT);
++
+       if (!debugfs_initialized())
+               return 0;
+@@ -5373,7 +5379,7 @@ static void kvm_uevent_notify_change(uns
+       }
+       add_uevent_var(env, "PID=%d", kvm->userspace_pid);
+-      if (kvm->debugfs_dentry) {
++      if (!IS_ERR(kvm->debugfs_dentry)) {
+               char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT);
+               if (p) {
diff --git a/queue-5.15/kvm-x86-mmu-resolve-nx_huge_pages-when-kvm.ko-is-loaded.patch b/queue-5.15/kvm-x86-mmu-resolve-nx_huge_pages-when-kvm.ko-is-loaded.patch
new file mode 100644 (file)
index 0000000..8335c1f
--- /dev/null
@@ -0,0 +1,156 @@
+From 1d0e84806047f38027d7572adb4702ef7c09b317 Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <seanjc@google.com>
+Date: Thu, 31 Mar 2022 22:13:59 +0000
+Subject: KVM: x86/mmu: Resolve nx_huge_pages when kvm.ko is loaded
+
+From: Sean Christopherson <seanjc@google.com>
+
+commit 1d0e84806047f38027d7572adb4702ef7c09b317 upstream.
+
+Resolve nx_huge_pages to true/false when kvm.ko is loaded, leaving it as
+-1 is technically undefined behavior when its value is read out by
+param_get_bool(), as boolean values are supposed to be '0' or '1'.
+
+Alternatively, KVM could define a custom getter for the param, but the
+auto value doesn't depend on the vendor module in any way, and printing
+"auto" would be unnecessarily unfriendly to the user.
+
+In addition to fixing the undefined behavior, resolving the auto value
+also fixes the scenario where the auto value resolves to N and no vendor
+module is loaded.  Previously, -1 would result in Y being printed even
+though KVM would ultimately disable the mitigation.
+
+Rename the existing MMU module init/exit helpers to clarify that they're
+invoked with respect to the vendor module, and add comments to document
+why KVM has two separate "module init" flows.
+
+  =========================================================================
+  UBSAN: invalid-load in kernel/params.c:320:33
+  load of value 255 is not a valid value for type '_Bool'
+  CPU: 6 PID: 892 Comm: tail Not tainted 5.17.0-rc3+ #799
+  Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
+  Call Trace:
+   <TASK>
+   dump_stack_lvl+0x34/0x44
+   ubsan_epilogue+0x5/0x40
+   __ubsan_handle_load_invalid_value.cold+0x43/0x48
+   param_get_bool.cold+0xf/0x14
+   param_attr_show+0x55/0x80
+   module_attr_show+0x1c/0x30
+   sysfs_kf_seq_show+0x93/0xc0
+   seq_read_iter+0x11c/0x450
+   new_sync_read+0x11b/0x1a0
+   vfs_read+0xf0/0x190
+   ksys_read+0x5f/0xe0
+   do_syscall_64+0x3b/0xc0
+   entry_SYSCALL_64_after_hwframe+0x44/0xae
+   </TASK>
+  =========================================================================
+
+Fixes: b8e8c8303ff2 ("kvm: mmu: ITLB_MULTIHIT mitigation")
+Cc: stable@vger.kernel.org
+Reported-by: Bruno Goncalves <bgoncalv@redhat.com>
+Reported-by: Jan Stancek <jstancek@redhat.com>
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Message-Id: <20220331221359.3912754-1-seanjc@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/include/asm/kvm_host.h |    5 +++--
+ arch/x86/kvm/mmu/mmu.c          |   20 ++++++++++++++++----
+ arch/x86/kvm/x86.c              |   20 ++++++++++++++++++--
+ 3 files changed, 37 insertions(+), 8 deletions(-)
+
+--- a/arch/x86/include/asm/kvm_host.h
++++ b/arch/x86/include/asm/kvm_host.h
+@@ -1559,8 +1559,9 @@ static inline int kvm_arch_flush_remote_
+               return -ENOTSUPP;
+ }
+-int kvm_mmu_module_init(void);
+-void kvm_mmu_module_exit(void);
++void kvm_mmu_x86_module_init(void);
++int kvm_mmu_vendor_module_init(void);
++void kvm_mmu_vendor_module_exit(void);
+ void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
+ int kvm_mmu_create(struct kvm_vcpu *vcpu);
+--- a/arch/x86/kvm/mmu/mmu.c
++++ b/arch/x86/kvm/mmu/mmu.c
+@@ -6105,12 +6105,24 @@ static int set_nx_huge_pages(const char
+       return 0;
+ }
+-int kvm_mmu_module_init(void)
++/*
++ * nx_huge_pages needs to be resolved to true/false when kvm.ko is loaded, as
++ * its default value of -1 is technically undefined behavior for a boolean.
++ */
++void kvm_mmu_x86_module_init(void)
+ {
+-      int ret = -ENOMEM;
+-
+       if (nx_huge_pages == -1)
+               __set_nx_huge_pages(get_nx_auto_mode());
++}
++
++/*
++ * The bulk of the MMU initialization is deferred until the vendor module is
++ * loaded as many of the masks/values may be modified by VMX or SVM, i.e. need
++ * to be reset when a potentially different vendor module is loaded.
++ */
++int kvm_mmu_vendor_module_init(void)
++{
++      int ret = -ENOMEM;
+       /*
+        * MMU roles use union aliasing which is, generally speaking, an
+@@ -6182,7 +6194,7 @@ void kvm_mmu_destroy(struct kvm_vcpu *vc
+       mmu_free_memory_caches(vcpu);
+ }
+-void kvm_mmu_module_exit(void)
++void kvm_mmu_vendor_module_exit(void)
+ {
+       mmu_destroy_caches();
+       percpu_counter_destroy(&kvm_total_used_mmu_pages);
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -8562,7 +8562,7 @@ int kvm_arch_init(void *opaque)
+       }
+       kvm_nr_uret_msrs = 0;
+-      r = kvm_mmu_module_init();
++      r = kvm_mmu_vendor_module_init();
+       if (r)
+               goto out_free_percpu;
+@@ -8612,7 +8612,7 @@ void kvm_arch_exit(void)
+       cancel_work_sync(&pvclock_gtod_work);
+ #endif
+       kvm_x86_ops.hardware_enable = NULL;
+-      kvm_mmu_module_exit();
++      kvm_mmu_vendor_module_exit();
+       free_percpu(user_return_msrs);
+       kmem_cache_destroy(x86_emulator_cache);
+       kmem_cache_destroy(x86_fpu_cache);
+@@ -12618,3 +12618,19 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit
+ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_exit);
+ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_enter);
+ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_exit);
++
++static int __init kvm_x86_init(void)
++{
++      kvm_mmu_x86_module_init();
++      return 0;
++}
++module_init(kvm_x86_init);
++
++static void __exit kvm_x86_exit(void)
++{
++      /*
++       * If module_init() is implemented, module_exit() must also be
++       * implemented to allow module unload.
++       */
++}
++module_exit(kvm_x86_exit);
diff --git a/queue-5.15/memory-renesas-rpc-if-fix-platform-device-leak-in-error-path.patch b/queue-5.15/memory-renesas-rpc-if-fix-platform-device-leak-in-error-path.patch
new file mode 100644 (file)
index 0000000..6ed65a3
--- /dev/null
@@ -0,0 +1,49 @@
+From b452dbf24d7d9a990d70118462925f6ee287d135 Mon Sep 17 00:00:00 2001
+From: Johan Hovold <johan@kernel.org>
+Date: Thu, 3 Mar 2022 19:06:32 +0100
+Subject: memory: renesas-rpc-if: fix platform-device leak in error path
+
+From: Johan Hovold <johan@kernel.org>
+
+commit b452dbf24d7d9a990d70118462925f6ee287d135 upstream.
+
+Make sure to free the flash platform device in the event that
+registration fails during probe.
+
+Fixes: ca7d8b980b67 ("memory: add Renesas RPC-IF driver")
+Cc: stable@vger.kernel.org      # 5.8
+Cc: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
+Signed-off-by: Johan Hovold <johan@kernel.org>
+Link: https://lore.kernel.org/r/20220303180632.3194-1-johan@kernel.org
+Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/memory/renesas-rpc-if.c |   10 +++++++++-
+ 1 file changed, 9 insertions(+), 1 deletion(-)
+
+--- a/drivers/memory/renesas-rpc-if.c
++++ b/drivers/memory/renesas-rpc-if.c
+@@ -579,6 +579,7 @@ static int rpcif_probe(struct platform_d
+       struct platform_device *vdev;
+       struct device_node *flash;
+       const char *name;
++      int ret;
+       flash = of_get_next_child(pdev->dev.of_node, NULL);
+       if (!flash) {
+@@ -602,7 +603,14 @@ static int rpcif_probe(struct platform_d
+               return -ENOMEM;
+       vdev->dev.parent = &pdev->dev;
+       platform_set_drvdata(pdev, vdev);
+-      return platform_device_add(vdev);
++
++      ret = platform_device_add(vdev);
++      if (ret) {
++              platform_device_put(vdev);
++              return ret;
++      }
++
++      return 0;
+ }
+ static int rpcif_remove(struct platform_device *pdev)
diff --git a/queue-5.15/mm-fix-unexpected-zeroed-page-mapping-with-zram-swap.patch b/queue-5.15/mm-fix-unexpected-zeroed-page-mapping-with-zram-swap.patch
new file mode 100644 (file)
index 0000000..fc2e74e
--- /dev/null
@@ -0,0 +1,156 @@
+From e914d8f00391520ecc4495dd0ca0124538ab7119 Mon Sep 17 00:00:00 2001
+From: Minchan Kim <minchan@kernel.org>
+Date: Thu, 14 Apr 2022 19:13:46 -0700
+Subject: mm: fix unexpected zeroed page mapping with zram swap
+
+From: Minchan Kim <minchan@kernel.org>
+
+commit e914d8f00391520ecc4495dd0ca0124538ab7119 upstream.
+
+Two processes under CLONE_VM cloning, user process can be corrupted by
+seeing zeroed page unexpectedly.
+
+      CPU A                        CPU B
+
+  do_swap_page                do_swap_page
+  SWP_SYNCHRONOUS_IO path     SWP_SYNCHRONOUS_IO path
+  swap_readpage valid data
+    swap_slot_free_notify
+      delete zram entry
+                              swap_readpage zeroed(invalid) data
+                              pte_lock
+                              map the *zero data* to userspace
+                              pte_unlock
+  pte_lock
+  if (!pte_same)
+    goto out_nomap;
+  pte_unlock
+  return and next refault will
+  read zeroed data
+
+The swap_slot_free_notify is bogus for CLONE_VM case since it doesn't
+increase the refcount of swap slot at copy_mm so it couldn't catch up
+whether it's safe or not to discard data from backing device.  In the
+case, only the lock it could rely on to synchronize swap slot freeing is
+page table lock.  Thus, this patch gets rid of the swap_slot_free_notify
+function.  With this patch, CPU A will see correct data.
+
+      CPU A                        CPU B
+
+  do_swap_page                do_swap_page
+  SWP_SYNCHRONOUS_IO path     SWP_SYNCHRONOUS_IO path
+                              swap_readpage original data
+                              pte_lock
+                              map the original data
+                              swap_free
+                                swap_range_free
+                                  bd_disk->fops->swap_slot_free_notify
+  swap_readpage read zeroed data
+                              pte_unlock
+  pte_lock
+  if (!pte_same)
+    goto out_nomap;
+  pte_unlock
+  return
+  on next refault will see mapped data by CPU B
+
+The concern of the patch would increase memory consumption since it
+could keep wasted memory with compressed form in zram as well as
+uncompressed form in address space.  However, most of cases of zram uses
+no readahead and do_swap_page is followed by swap_free so it will free
+the compressed form from in zram quickly.
+
+Link: https://lkml.kernel.org/r/YjTVVxIAsnKAXjTd@google.com
+Fixes: 0bcac06f27d7 ("mm, swap: skip swapcache for swapin of synchronous device")
+Reported-by: Ivan Babrou <ivan@cloudflare.com>
+Tested-by: Ivan Babrou <ivan@cloudflare.com>
+Signed-off-by: Minchan Kim <minchan@kernel.org>
+Cc: Nitin Gupta <ngupta@vflare.org>
+Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
+Cc: Jens Axboe <axboe@kernel.dk>
+Cc: David Hildenbrand <david@redhat.com>
+Cc: <stable@vger.kernel.org>   [4.14+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/page_io.c |   54 ------------------------------------------------------
+ 1 file changed, 54 deletions(-)
+
+--- a/mm/page_io.c
++++ b/mm/page_io.c
+@@ -50,54 +50,6 @@ void end_swap_bio_write(struct bio *bio)
+       bio_put(bio);
+ }
+-static void swap_slot_free_notify(struct page *page)
+-{
+-      struct swap_info_struct *sis;
+-      struct gendisk *disk;
+-      swp_entry_t entry;
+-
+-      /*
+-       * There is no guarantee that the page is in swap cache - the software
+-       * suspend code (at least) uses end_swap_bio_read() against a non-
+-       * swapcache page.  So we must check PG_swapcache before proceeding with
+-       * this optimization.
+-       */
+-      if (unlikely(!PageSwapCache(page)))
+-              return;
+-
+-      sis = page_swap_info(page);
+-      if (data_race(!(sis->flags & SWP_BLKDEV)))
+-              return;
+-
+-      /*
+-       * The swap subsystem performs lazy swap slot freeing,
+-       * expecting that the page will be swapped out again.
+-       * So we can avoid an unnecessary write if the page
+-       * isn't redirtied.
+-       * This is good for real swap storage because we can
+-       * reduce unnecessary I/O and enhance wear-leveling
+-       * if an SSD is used as the as swap device.
+-       * But if in-memory swap device (eg zram) is used,
+-       * this causes a duplicated copy between uncompressed
+-       * data in VM-owned memory and compressed data in
+-       * zram-owned memory.  So let's free zram-owned memory
+-       * and make the VM-owned decompressed page *dirty*,
+-       * so the page should be swapped out somewhere again if
+-       * we again wish to reclaim it.
+-       */
+-      disk = sis->bdev->bd_disk;
+-      entry.val = page_private(page);
+-      if (disk->fops->swap_slot_free_notify && __swap_count(entry) == 1) {
+-              unsigned long offset;
+-
+-              offset = swp_offset(entry);
+-
+-              SetPageDirty(page);
+-              disk->fops->swap_slot_free_notify(sis->bdev,
+-                              offset);
+-      }
+-}
+-
+ static void end_swap_bio_read(struct bio *bio)
+ {
+       struct page *page = bio_first_page_all(bio);
+@@ -113,7 +65,6 @@ static void end_swap_bio_read(struct bio
+       }
+       SetPageUptodate(page);
+-      swap_slot_free_notify(page);
+ out:
+       unlock_page(page);
+       WRITE_ONCE(bio->bi_private, NULL);
+@@ -392,11 +343,6 @@ int swap_readpage(struct page *page, boo
+       if (sis->flags & SWP_SYNCHRONOUS_IO) {
+               ret = bdev_read_page(sis->bdev, swap_page_sector(page), page);
+               if (!ret) {
+-                      if (trylock_page(page)) {
+-                              swap_slot_free_notify(page);
+-                              unlock_page(page);
+-                      }
+-
+                       count_vm_event(PSWPIN);
+                       goto out;
+               }
diff --git a/queue-5.15/mm-kmemleak-take-a-full-lowmem-check-in-kmemleak_-_phys.patch b/queue-5.15/mm-kmemleak-take-a-full-lowmem-check-in-kmemleak_-_phys.patch
new file mode 100644 (file)
index 0000000..509d20a
--- /dev/null
@@ -0,0 +1,96 @@
+From 23c2d497de21f25898fbea70aeb292ab8acc8c94 Mon Sep 17 00:00:00 2001
+From: Patrick Wang <patrick.wang.shcn@gmail.com>
+Date: Thu, 14 Apr 2022 19:14:04 -0700
+Subject: mm: kmemleak: take a full lowmem check in kmemleak_*_phys()
+
+From: Patrick Wang <patrick.wang.shcn@gmail.com>
+
+commit 23c2d497de21f25898fbea70aeb292ab8acc8c94 upstream.
+
+The kmemleak_*_phys() apis do not check the address for lowmem's min
+boundary, while the caller may pass an address below lowmem, which will
+trigger an oops:
+
+  # echo scan > /sys/kernel/debug/kmemleak
+  Unable to handle kernel paging request at virtual address ff5fffffffe00000
+  Oops [#1]
+  Modules linked in:
+  CPU: 2 PID: 134 Comm: bash Not tainted 5.18.0-rc1-next-20220407 #33
+  Hardware name: riscv-virtio,qemu (DT)
+  epc : scan_block+0x74/0x15c
+   ra : scan_block+0x72/0x15c
+  epc : ffffffff801e5806 ra : ffffffff801e5804 sp : ff200000104abc30
+   gp : ffffffff815cd4e8 tp : ff60000004cfa340 t0 : 0000000000000200
+   t1 : 00aaaaaac23954cc t2 : 00000000000003ff s0 : ff200000104abc90
+   s1 : ffffffff81b0ff28 a0 : 0000000000000000 a1 : ff5fffffffe01000
+   a2 : ffffffff81b0ff28 a3 : 0000000000000002 a4 : 0000000000000001
+   a5 : 0000000000000000 a6 : ff200000104abd7c a7 : 0000000000000005
+   s2 : ff5fffffffe00ff9 s3 : ffffffff815cd998 s4 : ffffffff815d0e90
+   s5 : ffffffff81b0ff28 s6 : 0000000000000020 s7 : ffffffff815d0eb0
+   s8 : ffffffffffffffff s9 : ff5fffffffe00000 s10: ff5fffffffe01000
+   s11: 0000000000000022 t3 : 00ffffffaa17db4c t4 : 000000000000000f
+   t5 : 0000000000000001 t6 : 0000000000000000
+  status: 0000000000000100 badaddr: ff5fffffffe00000 cause: 000000000000000d
+    scan_gray_list+0x12e/0x1a6
+    kmemleak_scan+0x2aa/0x57e
+    kmemleak_write+0x32a/0x40c
+    full_proxy_write+0x56/0x82
+    vfs_write+0xa6/0x2a6
+    ksys_write+0x6c/0xe2
+    sys_write+0x22/0x2a
+    ret_from_syscall+0x0/0x2
+
+The callers may not quite know the actual address they pass(e.g. from
+devicetree).  So the kmemleak_*_phys() apis should guarantee the address
+they finally use is in lowmem range, so check the address for lowmem's
+min boundary.
+
+Link: https://lkml.kernel.org/r/20220413122925.33856-1-patrick.wang.shcn@gmail.com
+Signed-off-by: Patrick Wang <patrick.wang.shcn@gmail.com>
+Acked-by: Catalin Marinas <catalin.marinas@arm.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/kmemleak.c |    8 ++++----
+ 1 file changed, 4 insertions(+), 4 deletions(-)
+
+--- a/mm/kmemleak.c
++++ b/mm/kmemleak.c
+@@ -1125,7 +1125,7 @@ EXPORT_SYMBOL(kmemleak_no_scan);
+ void __ref kmemleak_alloc_phys(phys_addr_t phys, size_t size, int min_count,
+                              gfp_t gfp)
+ {
+-      if (!IS_ENABLED(CONFIG_HIGHMEM) || PHYS_PFN(phys) < max_low_pfn)
++      if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn)
+               kmemleak_alloc(__va(phys), size, min_count, gfp);
+ }
+ EXPORT_SYMBOL(kmemleak_alloc_phys);
+@@ -1139,7 +1139,7 @@ EXPORT_SYMBOL(kmemleak_alloc_phys);
+  */
+ void __ref kmemleak_free_part_phys(phys_addr_t phys, size_t size)
+ {
+-      if (!IS_ENABLED(CONFIG_HIGHMEM) || PHYS_PFN(phys) < max_low_pfn)
++      if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn)
+               kmemleak_free_part(__va(phys), size);
+ }
+ EXPORT_SYMBOL(kmemleak_free_part_phys);
+@@ -1151,7 +1151,7 @@ EXPORT_SYMBOL(kmemleak_free_part_phys);
+  */
+ void __ref kmemleak_not_leak_phys(phys_addr_t phys)
+ {
+-      if (!IS_ENABLED(CONFIG_HIGHMEM) || PHYS_PFN(phys) < max_low_pfn)
++      if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn)
+               kmemleak_not_leak(__va(phys));
+ }
+ EXPORT_SYMBOL(kmemleak_not_leak_phys);
+@@ -1163,7 +1163,7 @@ EXPORT_SYMBOL(kmemleak_not_leak_phys);
+  */
+ void __ref kmemleak_ignore_phys(phys_addr_t phys)
+ {
+-      if (!IS_ENABLED(CONFIG_HIGHMEM) || PHYS_PFN(phys) < max_low_pfn)
++      if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn)
+               kmemleak_ignore(__va(phys));
+ }
+ EXPORT_SYMBOL(kmemleak_ignore_phys);
diff --git a/queue-5.15/mm-page_alloc-fix-build_zonerefs_node.patch b/queue-5.15/mm-page_alloc-fix-build_zonerefs_node.patch
new file mode 100644 (file)
index 0000000..4888f66
--- /dev/null
@@ -0,0 +1,69 @@
+From e553f62f10d93551eb883eca227ac54d1a4fad84 Mon Sep 17 00:00:00 2001
+From: Juergen Gross <jgross@suse.com>
+Date: Thu, 14 Apr 2022 19:13:43 -0700
+Subject: mm, page_alloc: fix build_zonerefs_node()
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Juergen Gross <jgross@suse.com>
+
+commit e553f62f10d93551eb883eca227ac54d1a4fad84 upstream.
+
+Since commit 6aa303defb74 ("mm, vmscan: only allocate and reclaim from
+zones with pages managed by the buddy allocator") only zones with free
+memory are included in a built zonelist.  This is problematic when e.g.
+all memory of a zone has been ballooned out when zonelists are being
+rebuilt.
+
+The decision whether to rebuild the zonelists when onlining new memory
+is done based on populated_zone() returning 0 for the zone the memory
+will be added to.  The new zone is added to the zonelists only, if it
+has free memory pages (managed_zone() returns a non-zero value) after
+the memory has been onlined.  This implies, that onlining memory will
+always free the added pages to the allocator immediately, but this is
+not true in all cases: when e.g. running as a Xen guest the onlined new
+memory will be added only to the ballooned memory list, it will be freed
+only when the guest is being ballooned up afterwards.
+
+Another problem with using managed_zone() for the decision whether a
+zone is being added to the zonelists is, that a zone with all memory
+used will in fact be removed from all zonelists in case the zonelists
+happen to be rebuilt.
+
+Use populated_zone() when building a zonelist as it has been done before
+that commit.
+
+There was a report that QubesOS (based on Xen) is hitting this problem.
+Xen has switched to use the zone device functionality in kernel 5.9 and
+QubesOS wants to use memory hotplugging for guests in order to be able
+to start a guest with minimal memory and expand it as needed.  This was
+the report leading to the patch.
+
+Link: https://lkml.kernel.org/r/20220407120637.9035-1-jgross@suse.com
+Fixes: 6aa303defb74 ("mm, vmscan: only allocate and reclaim from zones with pages managed by the buddy allocator")
+Signed-off-by: Juergen Gross <jgross@suse.com>
+Reported-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Acked-by: David Hildenbrand <david@redhat.com>
+Cc: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com>
+Reviewed-by: Wei Yang <richard.weiyang@gmail.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/page_alloc.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -6092,7 +6092,7 @@ static int build_zonerefs_node(pg_data_t
+       do {
+               zone_type--;
+               zone = pgdat->node_zones + zone_type;
+-              if (managed_zone(zone)) {
++              if (populated_zone(zone)) {
+                       zoneref_set_zone(zone, &zonerefs[nr_zones++]);
+                       check_highest_zone(zone_type);
+               }
diff --git a/queue-5.15/mm-secretmem-fix-panic-when-growing-a-memfd_secret.patch b/queue-5.15/mm-secretmem-fix-panic-when-growing-a-memfd_secret.patch
new file mode 100644 (file)
index 0000000..a36951d
--- /dev/null
@@ -0,0 +1,130 @@
+From f9b141f93659e09a52e28791ccbaf69c273b8e92 Mon Sep 17 00:00:00 2001
+From: Axel Rasmussen <axelrasmussen@google.com>
+Date: Thu, 14 Apr 2022 19:13:31 -0700
+Subject: mm/secretmem: fix panic when growing a memfd_secret
+
+From: Axel Rasmussen <axelrasmussen@google.com>
+
+commit f9b141f93659e09a52e28791ccbaf69c273b8e92 upstream.
+
+When one tries to grow an existing memfd_secret with ftruncate, one gets
+a panic [1].  For example, doing the following reliably induces the
+panic:
+
+    fd = memfd_secret();
+
+    ftruncate(fd, 10);
+    ptr = mmap(NULL, 10, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+    strcpy(ptr, "123456789");
+
+    munmap(ptr, 10);
+    ftruncate(fd, 20);
+
+The basic reason for this is, when we grow with ftruncate, we call down
+into simple_setattr, and then truncate_inode_pages_range, and eventually
+we try to zero part of the memory.  The normal truncation code does this
+via the direct map (i.e., it calls page_address() and hands that to
+memset()).
+
+For memfd_secret though, we specifically don't map our pages via the
+direct map (i.e.  we call set_direct_map_invalid_noflush() on every
+fault).  So the address returned by page_address() isn't useful, and
+when we try to memset() with it we panic.
+
+This patch avoids the panic by implementing a custom setattr for
+memfd_secret, which detects resizes specifically (setting the size for
+the first time works just fine, since there are no existing pages to try
+to zero), and rejects them with EINVAL.
+
+One could argue growing should be supported, but I think that will
+require a significantly more lengthy change.  So, I propose a minimal
+fix for the benefit of stable kernels, and then perhaps to extend
+memfd_secret to support growing in a separate patch.
+
+[1]:
+
+  BUG: unable to handle page fault for address: ffffa0a889277028
+  #PF: supervisor write access in kernel mode
+  #PF: error_code(0x0002) - not-present page
+  PGD afa01067 P4D afa01067 PUD 83f909067 PMD 83f8bf067 PTE 800ffffef6d88060
+  Oops: 0002 [#1] PREEMPT SMP DEBUG_PAGEALLOC PTI
+  CPU: 0 PID: 281 Comm: repro Not tainted 5.17.0-dbg-DEV #1
+  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014
+  RIP: 0010:memset_erms+0x9/0x10
+  Code: c1 e9 03 40 0f b6 f6 48 b8 01 01 01 01 01 01 01 01 48 0f af c6 f3 48 ab 89 d1 f3 aa 4c 89 c8 c3 90 49 89 f9 40 88 f0 48 89 d1 <f3> aa 4c 89 c8 c3 90 49 89 fa 40 0f b6 ce 48 b8 01 01 01 01 01 01
+  RSP: 0018:ffffb932c09afbf0 EFLAGS: 00010246
+  RAX: 0000000000000000 RBX: ffffda63c4249dc0 RCX: 0000000000000fd8
+  RDX: 0000000000000fd8 RSI: 0000000000000000 RDI: ffffa0a889277028
+  RBP: ffffb932c09afc00 R08: 0000000000001000 R09: ffffa0a889277028
+  R10: 0000000000020023 R11: 0000000000000000 R12: ffffda63c4249dc0
+  R13: ffffa0a890d70d98 R14: 0000000000000028 R15: 0000000000000fd8
+  FS:  00007f7294899580(0000) GS:ffffa0af9bc00000(0000) knlGS:0000000000000000
+  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+  CR2: ffffa0a889277028 CR3: 0000000107ef6006 CR4: 0000000000370ef0
+  DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
+  DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
+  Call Trace:
+   ? zero_user_segments+0x82/0x190
+   truncate_inode_partial_folio+0xd4/0x2a0
+   truncate_inode_pages_range+0x380/0x830
+   truncate_setsize+0x63/0x80
+   simple_setattr+0x37/0x60
+   notify_change+0x3d8/0x4d0
+   do_sys_ftruncate+0x162/0x1d0
+   __x64_sys_ftruncate+0x1c/0x20
+   do_syscall_64+0x44/0xa0
+   entry_SYSCALL_64_after_hwframe+0x44/0xae
+  Modules linked in: xhci_pci xhci_hcd virtio_net net_failover failover virtio_blk virtio_balloon uhci_hcd ohci_pci ohci_hcd evdev ehci_pci ehci_hcd 9pnet_virtio 9p netfs 9pnet
+  CR2: ffffa0a889277028
+
+[lkp@intel.com: secretmem_iops can be static]
+  Signed-off-by: kernel test robot <lkp@intel.com>
+[axelrasmussen@google.com: return EINVAL]
+
+Link: https://lkml.kernel.org/r/20220324210909.1843814-1-axelrasmussen@google.com
+Link: https://lkml.kernel.org/r/20220412193023.279320-1-axelrasmussen@google.com
+Signed-off-by: Axel Rasmussen <axelrasmussen@google.com>
+Cc: Mike Rapoport <rppt@kernel.org>
+Cc: Matthew Wilcox <willy@infradead.org>
+Cc: <stable@vger.kernel.org>
+Cc: kernel test robot <lkp@intel.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/secretmem.c |   17 +++++++++++++++++
+ 1 file changed, 17 insertions(+)
+
+--- a/mm/secretmem.c
++++ b/mm/secretmem.c
+@@ -158,6 +158,22 @@ const struct address_space_operations se
+       .isolate_page   = secretmem_isolate_page,
+ };
++static int secretmem_setattr(struct user_namespace *mnt_userns,
++                           struct dentry *dentry, struct iattr *iattr)
++{
++      struct inode *inode = d_inode(dentry);
++      unsigned int ia_valid = iattr->ia_valid;
++
++      if ((ia_valid & ATTR_SIZE) && inode->i_size)
++              return -EINVAL;
++
++      return simple_setattr(mnt_userns, dentry, iattr);
++}
++
++static const struct inode_operations secretmem_iops = {
++      .setattr = secretmem_setattr,
++};
++
+ static struct vfsmount *secretmem_mnt;
+ static struct file *secretmem_file_create(unsigned long flags)
+@@ -177,6 +193,7 @@ static struct file *secretmem_file_creat
+       mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
+       mapping_set_unevictable(inode->i_mapping);
++      inode->i_op = &secretmem_iops;
+       inode->i_mapping->a_ops = &secretmem_aops;
+       /* pretend we are a normal file with zero size */
index 4cae06a58da10ce5178511be0c5cf74e902e5ac7..e8b79b1501166c4e2a70b0b640be401695807178 100644 (file)
@@ -141,3 +141,13 @@ drivers-net-slip-fix-npd-bug-in-sl_tx_timeout.patch
 io_uring-zero-tag-on-rsrc-removal.patch
 io_uring-use-nospec-annotation-for-more-indexes.patch
 perf-imx_ddr-fix-undefined-behavior-due-to-shift-ove.patch
+mm-secretmem-fix-panic-when-growing-a-memfd_secret.patch
+mm-page_alloc-fix-build_zonerefs_node.patch
+mm-fix-unexpected-zeroed-page-mapping-with-zram-swap.patch
+mm-kmemleak-take-a-full-lowmem-check-in-kmemleak_-_phys.patch
+kvm-x86-mmu-resolve-nx_huge_pages-when-kvm.ko-is-loaded.patch
+kvm-don-t-create-vm-debugfs-files-outside-of-the-vm-directory.patch
+sunrpc-fix-nfsd-s-request-deferral-on-rdma-transports.patch
+memory-renesas-rpc-if-fix-platform-device-leak-in-error-path.patch
+gcc-plugins-latent_entropy-use-dev-urandom.patch
+cifs-verify-that-tcon-is-valid-before-dereference-in-cifs_kill_sb.patch
diff --git a/queue-5.15/sunrpc-fix-nfsd-s-request-deferral-on-rdma-transports.patch b/queue-5.15/sunrpc-fix-nfsd-s-request-deferral-on-rdma-transports.patch
new file mode 100644 (file)
index 0000000..9a128b7
--- /dev/null
@@ -0,0 +1,83 @@
+From 773f91b2cf3f52df0d7508fdbf60f37567cdaee4 Mon Sep 17 00:00:00 2001
+From: Chuck Lever <chuck.lever@oracle.com>
+Date: Fri, 1 Apr 2022 17:08:21 -0400
+Subject: SUNRPC: Fix NFSD's request deferral on RDMA transports
+
+From: Chuck Lever <chuck.lever@oracle.com>
+
+commit 773f91b2cf3f52df0d7508fdbf60f37567cdaee4 upstream.
+
+Trond Myklebust reports an NFSD crash in svc_rdma_sendto(). Further
+investigation shows that the crash occurred while NFSD was handling
+a deferred request.
+
+This patch addresses two inter-related issues that prevent request
+deferral from working correctly for RPC/RDMA requests:
+
+1. Prevent the crash by ensuring that the original
+   svc_rqst::rq_xprt_ctxt value is available when the request is
+   revisited. Otherwise svc_rdma_sendto() does not have a Receive
+   context available with which to construct its reply.
+
+2. Possibly since before commit 71641d99ce03 ("svcrdma: Properly
+   compute .len and .buflen for received RPC Calls"),
+   svc_rdma_recvfrom() did not include the transport header in the
+   returned xdr_buf. There should have been no need for svc_defer()
+   and friends to save and restore that header, as of that commit.
+   This issue is addressed in a backport-friendly way by simply
+   having svc_rdma_recvfrom() set rq_xprt_hlen to zero
+   unconditionally, just as svc_tcp_recvfrom() does. This enables
+   svc_deferred_recv() to correctly reconstruct an RPC message
+   received via RPC/RDMA.
+
+Reported-by: Trond Myklebust <trondmy@hammerspace.com>
+Link: https://lore.kernel.org/linux-nfs/82662b7190f26fb304eb0ab1bb04279072439d4e.camel@hammerspace.com/
+Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/sunrpc/svc.h              |    1 +
+ net/sunrpc/svc_xprt.c                   |    3 +++
+ net/sunrpc/xprtrdma/svc_rdma_recvfrom.c |    2 +-
+ 3 files changed, 5 insertions(+), 1 deletion(-)
+
+--- a/include/linux/sunrpc/svc.h
++++ b/include/linux/sunrpc/svc.h
+@@ -384,6 +384,7 @@ struct svc_deferred_req {
+       size_t                  addrlen;
+       struct sockaddr_storage daddr;  /* where reply must come from */
+       size_t                  daddrlen;
++      void                    *xprt_ctxt;
+       struct cache_deferred_req handle;
+       size_t                  xprt_hlen;
+       int                     argslen;
+--- a/net/sunrpc/svc_xprt.c
++++ b/net/sunrpc/svc_xprt.c
+@@ -1213,6 +1213,8 @@ static struct cache_deferred_req *svc_de
+               dr->daddr = rqstp->rq_daddr;
+               dr->argslen = rqstp->rq_arg.len >> 2;
+               dr->xprt_hlen = rqstp->rq_xprt_hlen;
++              dr->xprt_ctxt = rqstp->rq_xprt_ctxt;
++              rqstp->rq_xprt_ctxt = NULL;
+               /* back up head to the start of the buffer and copy */
+               skip = rqstp->rq_arg.len - rqstp->rq_arg.head[0].iov_len;
+@@ -1251,6 +1253,7 @@ static noinline int svc_deferred_recv(st
+       rqstp->rq_xprt_hlen   = dr->xprt_hlen;
+       rqstp->rq_daddr       = dr->daddr;
+       rqstp->rq_respages    = rqstp->rq_pages;
++      rqstp->rq_xprt_ctxt   = dr->xprt_ctxt;
+       svc_xprt_received(rqstp->rq_xprt);
+       return (dr->argslen<<2) - dr->xprt_hlen;
+ }
+--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
++++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+@@ -826,7 +826,7 @@ int svc_rdma_recvfrom(struct svc_rqst *r
+               goto out_err;
+       if (ret == 0)
+               goto out_drop;
+-      rqstp->rq_xprt_hlen = ret;
++      rqstp->rq_xprt_hlen = 0;
+       if (svc_rdma_is_reverse_direction_reply(xprt, ctxt))
+               goto out_backchannel;