--- /dev/null
+From e8cde32f111f7f5681a7bad3ec747e9e697569a9 Mon Sep 17 00:00:00 2001
+From: Nianyao Tang <tangnianyao@huawei.com>
+Date: Tue, 11 Jun 2024 12:20:49 +0000
+Subject: arm64/cpufeatures/kvm: Add ARMv8.9 FEAT_ECBHB bits in ID_AA64MMFR1 register
+
+From: Nianyao Tang <tangnianyao@huawei.com>
+
+commit e8cde32f111f7f5681a7bad3ec747e9e697569a9 upstream.
+
+Enable ECBHB bits in ID_AA64MMFR1 register as per ARM DDI 0487K.a
+specification.
+
+When a guest OS reads ID_AA64MMFR1_EL1, KVM emulates this register using
+ftr_id_aa64mmfr1 and always returns ID_AA64MMFR1_EL1.ECBHB=0 to the guest.
+This results in guest syscalls jumping to the tramp ventry, which is not
+needed on implementations with ID_AA64MMFR1_EL1.ECBHB=1.
+Let's make guest syscall processing the same as the host's.
+
+Signed-off-by: Nianyao Tang <tangnianyao@huawei.com>
+Link: https://lore.kernel.org/r/20240611122049.2758600-1-tangnianyao@huawei.com
+Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+Signed-off-by: Patrick Roy <roypat@amazon.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/kernel/cpufeature.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/arch/arm64/kernel/cpufeature.c
++++ b/arch/arm64/kernel/cpufeature.c
+@@ -343,6 +343,7 @@ static const struct arm64_ftr_bits ftr_i
+ };
+
+ static const struct arm64_ftr_bits ftr_id_aa64mmfr1[] = {
++ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_ECBHB_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_TIDCP1_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_AFP_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_ETS_SHIFT, 4, 0),
--- /dev/null
+From stable+bounces-167094-greg=kroah.com@vger.kernel.org Tue Aug 12 04:16:01 2025
+From: Sumanth Gavini <sumanth.gavini@yahoo.com>
+Date: Mon, 11 Aug 2025 20:34:55 -0500
+Subject: Bluetooth: hci_sync: Fix UAF on hci_abort_conn_sync
+To: marcel@holtmann.org, johan.hedberg@gmail.com, luiz.dentz@gmail.com, davem@davemloft.net, edumazet@google.com, kuba@kernel.org, pabeni@redhat.com
+Cc: Sumanth Gavini <sumanth.gavini@yahoo.com>, linux-bluetooth@vger.kernel.org, netdev@vger.kernel.org, linux-kernel@vger.kernel.org, stable@vger.kernel.org, Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
+Message-ID: <20250812013457.425332-1-sumanth.gavini@yahoo.com>
+
+From: Sumanth Gavini <sumanth.gavini@yahoo.com>
+
+commit 5af1f84ed13a416297ab9ced7537f4d5ae7f329a upstream.
+
+Connections may be cleaned up while waiting for the commands to complete,
+so this attempts to check if the connection handle remains valid in case
+of errors that would lead to calling hci_conn_failed:
+
+BUG: KASAN: slab-use-after-free in hci_conn_failed+0x1f/0x160
+Read of size 8 at addr ffff888001376958 by task kworker/u3:0/52
+
+CPU: 0 PID: 52 Comm: kworker/u3:0 Not tainted
+6.5.0-rc1-00527-g2dfe76d58d3a #5615
+Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS
+1.16.2-1.fc38 04/01/2014
+Workqueue: hci0 hci_cmd_sync_work
+Call Trace:
+ <TASK>
+ dump_stack_lvl+0x1d/0x70
+ print_report+0xce/0x620
+ ? __virt_addr_valid+0xd4/0x150
+ ? hci_conn_failed+0x1f/0x160
+ kasan_report+0xd1/0x100
+ ? hci_conn_failed+0x1f/0x160
+ hci_conn_failed+0x1f/0x160
+ hci_abort_conn_sync+0x237/0x360
+
+Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
+Signed-off-by: Sumanth Gavini <sumanth.gavini@yahoo.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/bluetooth/hci_sync.c | 43 +++++++++++++++++++++++++++++--------------
+ 1 file changed, 29 insertions(+), 14 deletions(-)
+
+--- a/net/bluetooth/hci_sync.c
++++ b/net/bluetooth/hci_sync.c
+@@ -5525,31 +5525,46 @@ static int hci_reject_conn_sync(struct h
+
+ int hci_abort_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 reason)
+ {
+- int err;
++ int err = 0;
++ u16 handle = conn->handle;
+
+ switch (conn->state) {
+ case BT_CONNECTED:
+ case BT_CONFIG:
+- return hci_disconnect_sync(hdev, conn, reason);
++ err = hci_disconnect_sync(hdev, conn, reason);
++ break;
+ case BT_CONNECT:
+ err = hci_connect_cancel_sync(hdev, conn);
+- /* Cleanup hci_conn object if it cannot be cancelled as it
+- * likelly means the controller and host stack are out of sync.
+- */
+- if (err) {
+- hci_dev_lock(hdev);
+- hci_conn_failed(conn, err);
+- hci_dev_unlock(hdev);
+- }
+- return err;
++ break;
+ case BT_CONNECT2:
+- return hci_reject_conn_sync(hdev, conn, reason);
++ err = hci_reject_conn_sync(hdev, conn, reason);
++ break;
+ default:
+ conn->state = BT_CLOSED;
+- break;
++ return 0;
++ }
++
++ /* Cleanup hci_conn object if it cannot be cancelled as it
++ * likelly means the controller and host stack are out of sync
++ * or in case of LE it was still scanning so it can be cleanup
++ * safely.
++ */
++ if (err) {
++ struct hci_conn *c;
++
++ /* Check if the connection hasn't been cleanup while waiting
++ * commands to complete.
++ */
++ c = hci_conn_hash_lookup_handle(hdev, handle);
++ if (!c || c != conn)
++ return 0;
++
++ hci_dev_lock(hdev);
++ hci_conn_failed(conn, err);
++ hci_dev_unlock(hdev);
+ }
+
+- return 0;
++ return err;
+ }
+
+ static int hci_disconnect_all_sync(struct hci_dev *hdev, u8 reason)
--- /dev/null
+From a238487f7965d102794ed9f8aff0b667cd2ae886 Mon Sep 17 00:00:00 2001
+From: Giovanni Cabiddu <giovanni.cabiddu@intel.com>
+Date: Fri, 20 Oct 2023 15:49:23 +0200
+Subject: crypto: qat - fix ring to service map for QAT GEN4
+
+From: Giovanni Cabiddu <giovanni.cabiddu@intel.com>
+
+commit a238487f7965d102794ed9f8aff0b667cd2ae886 upstream.
+
+The 4xxx drivers hardcode the ring to service mapping. However, when
+additional configurations were added to the driver, the mappings were
+not updated. This implies that an incorrect mapping might be reported
+through pfvf for certain configurations.
+
+Add an algorithm that computes the correct ring to service mapping based
+on the firmware loaded on the device.
+
+Fixes: 0cec19c761e5 ("crypto: qat - add support for compression for 4xxx")
+Signed-off-by: Giovanni Cabiddu <giovanni.cabiddu@intel.com>
+Reviewed-by: Damian Muszynski <damian.muszynski@intel.com>
+Reviewed-by: Tero Kristo <tero.kristo@linux.intel.com>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+[Giovanni: backport to 6.1.y, conflict resolved simplifying the logic
+in the function get_ring_to_svc_map() as the QAT driver in v6.1 supports
+only limited configurations (crypto only and compression). Differs from
+upstream as the ring to service mapping is hardcoded rather than being
+dynamically computed.]
+Reviewed-by: Ahsan Atta <ahsan.atta@intel.com>
+Tested-by: Ahsan Atta <ahsan.atta@intel.com>
+Signed-off-by: Giovanni Cabiddu <giovanni.cabiddu@intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/crypto/qat/qat_4xxx/adf_4xxx_hw_data.c | 13 +++++++++++++
+ drivers/crypto/qat/qat_common/adf_accel_devices.h | 1 +
+ drivers/crypto/qat/qat_common/adf_gen4_hw_data.h | 6 ++++++
+ drivers/crypto/qat/qat_common/adf_init.c | 3 +++
+ 4 files changed, 23 insertions(+)
+
+--- a/drivers/crypto/qat/qat_4xxx/adf_4xxx_hw_data.c
++++ b/drivers/crypto/qat/qat_4xxx/adf_4xxx_hw_data.c
+@@ -297,6 +297,18 @@ static char *uof_get_name(struct adf_acc
+ return NULL;
+ }
+
++static u16 get_ring_to_svc_map(struct adf_accel_dev *accel_dev)
++{
++ switch (get_service_enabled(accel_dev)) {
++ case SVC_CY:
++ return ADF_GEN4_DEFAULT_RING_TO_SRV_MAP;
++ case SVC_DC:
++ return ADF_GEN4_DEFAULT_RING_TO_SRV_MAP_DC;
++ }
++
++ return 0;
++}
++
+ static u32 uof_get_ae_mask(struct adf_accel_dev *accel_dev, u32 obj_num)
+ {
+ switch (get_service_enabled(accel_dev)) {
+@@ -353,6 +365,7 @@ void adf_init_hw_data_4xxx(struct adf_hw
+ hw_data->uof_get_ae_mask = uof_get_ae_mask;
+ hw_data->set_msix_rttable = set_msix_default_rttable;
+ hw_data->set_ssm_wdtimer = adf_gen4_set_ssm_wdtimer;
++ hw_data->get_ring_to_svc_map = get_ring_to_svc_map;
+ hw_data->disable_iov = adf_disable_sriov;
+ hw_data->ring_pair_reset = adf_gen4_ring_pair_reset;
+ hw_data->enable_pm = adf_gen4_enable_pm;
+--- a/drivers/crypto/qat/qat_common/adf_accel_devices.h
++++ b/drivers/crypto/qat/qat_common/adf_accel_devices.h
+@@ -176,6 +176,7 @@ struct adf_hw_device_data {
+ void (*get_arb_info)(struct arb_info *arb_csrs_info);
+ void (*get_admin_info)(struct admin_info *admin_csrs_info);
+ enum dev_sku_info (*get_sku)(struct adf_hw_device_data *self);
++ u16 (*get_ring_to_svc_map)(struct adf_accel_dev *accel_dev);
+ int (*alloc_irq)(struct adf_accel_dev *accel_dev);
+ void (*free_irq)(struct adf_accel_dev *accel_dev);
+ void (*enable_error_correction)(struct adf_accel_dev *accel_dev);
+--- a/drivers/crypto/qat/qat_common/adf_gen4_hw_data.h
++++ b/drivers/crypto/qat/qat_common/adf_gen4_hw_data.h
+@@ -95,6 +95,12 @@ do { \
+ ADF_RING_BUNDLE_SIZE * (bank) + \
+ ADF_RING_CSR_RING_SRV_ARB_EN, (value))
+
++#define ADF_GEN4_DEFAULT_RING_TO_SRV_MAP_DC \
++ (COMP << ADF_CFG_SERV_RING_PAIR_0_SHIFT | \
++ COMP << ADF_CFG_SERV_RING_PAIR_1_SHIFT | \
++ COMP << ADF_CFG_SERV_RING_PAIR_2_SHIFT | \
++ COMP << ADF_CFG_SERV_RING_PAIR_3_SHIFT)
++
+ /* Default ring mapping */
+ #define ADF_GEN4_DEFAULT_RING_TO_SRV_MAP \
+ (ASYM << ADF_CFG_SERV_RING_PAIR_0_SHIFT | \
+--- a/drivers/crypto/qat/qat_common/adf_init.c
++++ b/drivers/crypto/qat/qat_common/adf_init.c
+@@ -95,6 +95,9 @@ int adf_dev_init(struct adf_accel_dev *a
+ return -EFAULT;
+ }
+
++ if (hw_data->get_ring_to_svc_map)
++ hw_data->ring_to_svc_map = hw_data->get_ring_to_svc_map(accel_dev);
++
+ if (adf_ae_init(accel_dev)) {
+ dev_err(&GET_DEV(accel_dev),
+ "Failed to initialise Acceleration Engine\n");
--- /dev/null
+From nathan@kernel.org Thu Aug 21 20:30:59 2025
+From: Nathan Chancellor <nathan@kernel.org>
+Date: Thu, 21 Aug 2025 11:30:51 -0700
+Subject: kbuild: userprogs: use correct linker when mixing clang and GNU ld
+To: gregkh@linuxfoundation.org, sashal@kernel.org
+Cc: stable@vger.kernel.org, nathan@kernel.org, thomas.weissschuh@linutronix.de
+Message-ID: <20250821183051.1259435-1-nathan@kernel.org>
+
+From: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
+
+commit 936599ca514973d44a766b7376c6bbdc96b6a8cc upstream.
+
+The userprogs infrastructure does not expect clang to be used with GNU ld
+and in that case uses /usr/bin/ld for linking, not the configured $(LD).
+This fallback is problematic as it will break when cross-compiling.
+Mixing clang and GNU ld is used for example when building for SPARC64,
+as ld.lld is not sufficient; see Documentation/kbuild/llvm.rst.
+
+Relax the check around --ld-path so it gets used for all linkers.
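+
+For example (illustration only, based on Documentation/kbuild/llvm.rst; the
+exact command line below is an assumption, not part of this change), a cross
+build such as
+
+	make ARCH=sparc64 CROSS_COMPILE=sparc64-linux-gnu- CC=clang
+
+now links userprogs through clang with --ld-path=$(LD), i.e. the configured
+cross GNU ld, instead of silently falling back to the host /usr/bin/ld.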
+
+Fixes: dfc1b168a8c4 ("kbuild: userprogs: use correct lld when linking through clang")
+Cc: stable@vger.kernel.org
+Signed-off-by: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
+Reviewed-by: Nathan Chancellor <nathan@kernel.org>
+Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
+[nathan: Work around wrapping '--ld-path' in cc-option in older stable
+ branches due to older minimum LLVM version]
+Signed-off-by: Nathan Chancellor <nathan@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ Makefile | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/Makefile
++++ b/Makefile
+@@ -1143,7 +1143,7 @@ KBUILD_USERCFLAGS += $(filter -m32 -m64
+ KBUILD_USERLDFLAGS += $(filter -m32 -m64 --target=%, $(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS))
+
+ # userspace programs are linked via the compiler, use the correct linker
+-ifeq ($(CONFIG_CC_IS_CLANG)$(CONFIG_LD_IS_LLD),yy)
++ifdef CONFIG_CC_IS_CLANG
+ KBUILD_USERLDFLAGS += $(call cc-option, --ld-path=$(LD))
+ endif
+
--- /dev/null
+From stable+bounces-164649-greg=kroah.com@vger.kernel.org Thu Jul 24 19:07:43 2025
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 24 Jul 2025 13:07:25 -0400
+Subject: KVM: VMX: Flush shadow VMCS on emergency reboot
+To: stable@vger.kernel.org
+Cc: Chao Gao <chao.gao@intel.com>, Kai Huang <kai.huang@intel.com>, Sean Christopherson <seanjc@google.com>, Sasha Levin <sashal@kernel.org>
+Message-ID: <20250724170725.1404455-3-sashal@kernel.org>
+
+From: Chao Gao <chao.gao@intel.com>
+
+[ Upstream commit a0ee1d5faff135e28810f29e0f06328c66f89852 ]
+
+Ensure the shadow VMCS cache is evicted during an emergency reboot to
+prevent potential memory corruption if the cache is evicted after reboot.
+
+This issue was identified through code inspection, as __loaded_vmcs_clear()
+flushes both the normal VMCS and the shadow VMCS.
+
+Avoid checking the "launched" state during an emergency reboot, unlike the
+behavior in __loaded_vmcs_clear(). This is important because reboot NMIs
+can interfere with operations like copy_shadow_to_vmcs12(), where shadow
+VMCSes are loaded directly using VMPTRLD. In such cases, if NMIs occur
+right after the VMCS load, the shadow VMCSes will be active but the
+"launched" state may not be set.
+
+Fixes: 16f5b9034b69 ("KVM: nVMX: Copy processor-specific shadow-vmcs to VMCS12")
+Cc: stable@vger.kernel.org
+Signed-off-by: Chao Gao <chao.gao@intel.com>
+Reviewed-by: Kai Huang <kai.huang@intel.com>
+Link: https://lore.kernel.org/r/20250324140849.2099723-1-chao.gao@intel.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/vmx/vmx.c | 5 ++++-
+ 1 file changed, 4 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -713,8 +713,11 @@ static void vmx_emergency_disable(void)
+ struct loaded_vmcs *v;
+
+ list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
+- loaded_vmcss_on_cpu_link)
++ loaded_vmcss_on_cpu_link) {
+ vmcs_clear(v->vmcs);
++ if (v->shadow_vmcs)
++ vmcs_clear(v->shadow_vmcs);
++ }
+
+ __cpu_emergency_vmxoff();
+ }
--- /dev/null
+From f1fb088d9cecde5c3066d8ff8846789667519b7d Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <seanjc@google.com>
+Date: Fri, 4 Apr 2025 12:38:19 -0700
+Subject: KVM: x86: Take irqfds.lock when adding/deleting IRQ bypass producer
+
+From: Sean Christopherson <seanjc@google.com>
+
+commit f1fb088d9cecde5c3066d8ff8846789667519b7d upstream.
+
+Take irqfds.lock when adding/deleting an IRQ bypass producer to ensure
+irqfd->producer isn't modified while kvm_irq_routing_update() is running.
+The only lock held when a producer is added/removed is irqbypass's mutex.
+
+Fixes: 872768800652 ("KVM: x86: select IRQ_BYPASS_MANAGER")
+Cc: stable@vger.kernel.org
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Message-ID: <20250404193923.1413163-5-seanjc@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+[sean: account for lack of kvm_x86_call()]
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/x86.c | 19 ++++++++++++++++---
+ 1 file changed, 16 insertions(+), 3 deletions(-)
+
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -13387,16 +13387,22 @@ int kvm_arch_irq_bypass_add_producer(str
+ {
+ struct kvm_kernel_irqfd *irqfd =
+ container_of(cons, struct kvm_kernel_irqfd, consumer);
++ struct kvm *kvm = irqfd->kvm;
+ int ret;
+
+- irqfd->producer = prod;
+ kvm_arch_start_assignment(irqfd->kvm);
++
++ spin_lock_irq(&kvm->irqfds.lock);
++ irqfd->producer = prod;
++
+ ret = static_call(kvm_x86_pi_update_irte)(irqfd->kvm,
+ prod->irq, irqfd->gsi, 1);
+-
+ if (ret)
+ kvm_arch_end_assignment(irqfd->kvm);
+
++ spin_unlock_irq(&kvm->irqfds.lock);
++
++
+ return ret;
+ }
+
+@@ -13406,9 +13412,9 @@ void kvm_arch_irq_bypass_del_producer(st
+ int ret;
+ struct kvm_kernel_irqfd *irqfd =
+ container_of(cons, struct kvm_kernel_irqfd, consumer);
++ struct kvm *kvm = irqfd->kvm;
+
+ WARN_ON(irqfd->producer != prod);
+- irqfd->producer = NULL;
+
+ /*
+ * When producer of consumer is unregistered, we change back to
+@@ -13416,11 +13422,18 @@ void kvm_arch_irq_bypass_del_producer(st
+ * when the irq is masked/disabled or the consumer side (KVM
+ * int this case doesn't want to receive the interrupts.
+ */
++ spin_lock_irq(&kvm->irqfds.lock);
++ irqfd->producer = NULL;
++
++
+ ret = static_call(kvm_x86_pi_update_irte)(irqfd->kvm, prod->irq, irqfd->gsi, 0);
+ if (ret)
+ printk(KERN_INFO "irq bypass consumer (token %p) unregistration"
+ " fails: %d\n", irqfd->consumer.token, ret);
+
++ spin_unlock_irq(&kvm->irqfds.lock);
++
++
+ kvm_arch_end_assignment(irqfd->kvm);
+ }
+
--- /dev/null
+From stable+bounces-165164-greg=kroah.com@vger.kernel.org Wed Jul 30 03:53:29 2025
+From: "Isaac J. Manjarres" <isaacmanjarres@google.com>
+Date: Tue, 29 Jul 2025 18:52:40 -0700
+Subject: mm: drop the assumption that VM_SHARED always implies writable
+To: lorenzo.stoakes@oracle.com, gregkh@linuxfoundation.org, Alexander Viro <viro@zeniv.linux.org.uk>, Christian Brauner <brauner@kernel.org>, Jan Kara <jack@suse.cz>, Andrew Morton <akpm@linux-foundation.org>, David Hildenbrand <david@redhat.com>, "Liam R. Howlett" <Liam.Howlett@oracle.com>, Vlastimil Babka <vbabka@suse.cz>, Mike Rapoport <rppt@kernel.org>, Suren Baghdasaryan <surenb@google.com>, Michal Hocko <mhocko@suse.com>, Kees Cook <kees@kernel.org>, Ingo Molnar <mingo@redhat.com>, Peter Zijlstra <peterz@infradead.org>, Juri Lelli <juri.lelli@redhat.com>, Vincent Guittot <vincent.guittot@linaro.org>, Dietmar Eggemann <dietmar.eggemann@arm.com>, Steven Rostedt <rostedt@goodmis.org>, Ben Segall <bsegall@google.com>, Mel Gorman <mgorman@suse.de>, Valentin Schneider <vschneid@redhat.com>, "Matthew Wilcox (Oracle)" <willy@infradead.org>, Jann Horn <jannh@google.com>, Pedro Falcato <pfalcato@suse.de>
+Cc: aliceryhl@google.com, stable@vger.kernel.org, "Isaac J. Manjarres" <isaacmanjarres@google.com>, kernel-team@android.com, Lorenzo Stoakes <lstoakes@gmail.com>, Andy Lutomirski <luto@kernel.org>, Hugh Dickins <hughd@google.com>, Mike Kravetz <mike.kravetz@oracle.com>, Muchun Song <muchun.song@linux.dev>, linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org, linux-mm@kvack.org
+Message-ID: <20250730015247.30827-2-isaacmanjarres@google.com>
+
+From: Lorenzo Stoakes <lstoakes@gmail.com>
+
+[ Upstream commit e8e17ee90eaf650c855adb0a3e5e965fd6692ff1 ]
+
+Patch series "permit write-sealed memfd read-only shared mappings", v4.
+
+The man page for fcntl() describing memfd file seals states the following
+about F_SEAL_WRITE:-
+
+ Furthermore, trying to create new shared, writable memory-mappings via
+ mmap(2) will also fail with EPERM.
+
+With emphasis on 'writable'. It turns out in fact that currently the
+kernel simply disallows all new shared memory mappings for a memfd with
+F_SEAL_WRITE applied, rendering this documentation inaccurate.
+
+This matters because users are therefore unable to obtain a shared mapping
+to a memfd after write sealing altogether, which limits their usefulness.
+This was reported in the discussion thread [1] originating from a bug
+report [2].
+
+This is a product of both using the struct address_space->i_mmap_writable
+atomic counter to determine whether writing may be permitted, and the
+kernel adjusting this counter when any VM_SHARED mapping is performed and
+more generally implicitly assuming VM_SHARED implies writable.
+
+It seems sensible that we should only update this counter if VM_MAYWRITE
+is specified, i.e. whether it is possible that this mapping could at any
+point be written to.
+
+If we do so then all we need to do to permit write seals to function as
+documented is to clear VM_MAYWRITE when mapping read-only. It turns out
+this functionality already exists for F_SEAL_FUTURE_WRITE - we can
+therefore simply adapt this logic to do the same for F_SEAL_WRITE.
+
+We then hit a chicken and egg situation in mmap_region() where the check
+for VM_MAYWRITE occurs before we are able to clear this flag. To work
+around this, perform this check after we invoke call_mmap(), with careful
+consideration of error paths.
+
+Thanks to Andy Lutomirski for the suggestion!
+
+[1]:https://lore.kernel.org/all/20230324133646.16101dfa666f253c4715d965@linux-foundation.org/
+[2]:https://bugzilla.kernel.org/show_bug.cgi?id=217238
+
+This patch (of 3):
+
+There is a general assumption that VMAs with the VM_SHARED flag set are
+writable. If the VM_MAYWRITE flag is not set, then this is simply not the
+case.
+
+Update those checks which affect the struct address_space->i_mmap_writable
+field to explicitly test for this by introducing
+[vma_]is_shared_maywrite() helper functions.
+
+This remains entirely conservative, as the lack of VM_MAYWRITE guarantees
+that the VMA cannot be written to.
+
+Link: https://lkml.kernel.org/r/cover.1697116581.git.lstoakes@gmail.com
+Link: https://lkml.kernel.org/r/d978aefefa83ec42d18dfa964ad180dbcde34795.1697116581.git.lstoakes@gmail.com
+Signed-off-by: Lorenzo Stoakes <lstoakes@gmail.com>
+Suggested-by: Andy Lutomirski <luto@kernel.org>
+Reviewed-by: Jan Kara <jack@suse.cz>
+Cc: Alexander Viro <viro@zeniv.linux.org.uk>
+Cc: Christian Brauner <brauner@kernel.org>
+Cc: Hugh Dickins <hughd@google.com>
+Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
+Cc: Mike Kravetz <mike.kravetz@oracle.com>
+Cc: Muchun Song <muchun.song@linux.dev>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Cc: stable@vger.kernel.org
+[isaacmanjarres: resolved merge conflicts due to
+due to refactoring that happened in upstream commit
+5de195060b2e ("mm: resolve faulty mmap_region() error path behaviour")]
+Signed-off-by: Isaac J. Manjarres <isaacmanjarres@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/fs.h | 4 ++--
+ include/linux/mm.h | 11 +++++++++++
+ kernel/fork.c | 2 +-
+ mm/filemap.c | 2 +-
+ mm/madvise.c | 2 +-
+ mm/mmap.c | 8 ++++----
+ 6 files changed, 20 insertions(+), 9 deletions(-)
+
+--- a/include/linux/fs.h
++++ b/include/linux/fs.h
+@@ -410,7 +410,7 @@ extern const struct address_space_operat
+ * It is also used to block modification of page cache contents through
+ * memory mappings.
+ * @gfp_mask: Memory allocation flags to use for allocating pages.
+- * @i_mmap_writable: Number of VM_SHARED mappings.
++ * @i_mmap_writable: Number of VM_SHARED, VM_MAYWRITE mappings.
+ * @nr_thps: Number of THPs in the pagecache (non-shmem only).
+ * @i_mmap: Tree of private and shared mappings.
+ * @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable.
+@@ -513,7 +513,7 @@ static inline int mapping_mapped(struct
+
+ /*
+ * Might pages of this file have been modified in userspace?
+- * Note that i_mmap_writable counts all VM_SHARED vmas: do_mmap
++ * Note that i_mmap_writable counts all VM_SHARED, VM_MAYWRITE vmas: do_mmap
+ * marks vma as VM_SHARED if it is shared, and the file was opened for
+ * writing i.e. vma may be mprotected writable even if now readonly.
+ *
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -673,6 +673,17 @@ static inline bool vma_is_accessible(str
+ return vma->vm_flags & VM_ACCESS_FLAGS;
+ }
+
++static inline bool is_shared_maywrite(vm_flags_t vm_flags)
++{
++ return (vm_flags & (VM_SHARED | VM_MAYWRITE)) ==
++ (VM_SHARED | VM_MAYWRITE);
++}
++
++static inline bool vma_is_shared_maywrite(struct vm_area_struct *vma)
++{
++ return is_shared_maywrite(vma->vm_flags);
++}
++
+ static inline
+ struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max)
+ {
+--- a/kernel/fork.c
++++ b/kernel/fork.c
+@@ -669,7 +669,7 @@ static __latent_entropy int dup_mmap(str
+
+ get_file(file);
+ i_mmap_lock_write(mapping);
+- if (tmp->vm_flags & VM_SHARED)
++ if (vma_is_shared_maywrite(tmp))
+ mapping_allow_writable(mapping);
+ flush_dcache_mmap_lock(mapping);
+ /* insert tmp into the share list, just after mpnt */
+--- a/mm/filemap.c
++++ b/mm/filemap.c
+@@ -3554,7 +3554,7 @@ int generic_file_mmap(struct file *file,
+ */
+ int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
+ {
+- if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
++ if (vma_is_shared_maywrite(vma))
+ return -EINVAL;
+ return generic_file_mmap(file, vma);
+ }
+--- a/mm/madvise.c
++++ b/mm/madvise.c
+@@ -980,7 +980,7 @@ static long madvise_remove(struct vm_are
+ return -EINVAL;
+ }
+
+- if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
++ if (!vma_is_shared_maywrite(vma))
+ return -EACCES;
+
+ offset = (loff_t)(start - vma->vm_start)
+--- a/mm/mmap.c
++++ b/mm/mmap.c
+@@ -106,7 +106,7 @@ void vma_set_page_prot(struct vm_area_st
+ static void __remove_shared_vm_struct(struct vm_area_struct *vma,
+ struct file *file, struct address_space *mapping)
+ {
+- if (vma->vm_flags & VM_SHARED)
++ if (vma_is_shared_maywrite(vma))
+ mapping_unmap_writable(mapping);
+
+ flush_dcache_mmap_lock(mapping);
+@@ -408,7 +408,7 @@ static unsigned long count_vma_pages_ran
+ static void __vma_link_file(struct vm_area_struct *vma,
+ struct address_space *mapping)
+ {
+- if (vma->vm_flags & VM_SHARED)
++ if (vma_is_shared_maywrite(vma))
+ mapping_allow_writable(mapping);
+
+ flush_dcache_mmap_lock(mapping);
+@@ -2827,7 +2827,7 @@ cannot_expand:
+ vma_mas_store(vma, &mas);
+ mm->map_count++;
+ if (vma->vm_file) {
+- if (vma->vm_flags & VM_SHARED)
++ if (vma_is_shared_maywrite(vma))
+ mapping_allow_writable(vma->vm_file->f_mapping);
+
+ flush_dcache_mmap_lock(vma->vm_file->f_mapping);
+@@ -2901,7 +2901,7 @@ unsigned long mmap_region(struct file *f
+ return -EINVAL;
+
+ /* Map writable and ensure this isn't a sealed memfd. */
+- if (file && (vm_flags & VM_SHARED)) {
++ if (file && is_shared_maywrite(vm_flags)) {
+ int error = mapping_map_writable(file->f_mapping);
+
+ if (error)
--- /dev/null
+From stable+bounces-165166-greg=kroah.com@vger.kernel.org Wed Jul 30 03:53:57 2025
+From: "Isaac J. Manjarres" <isaacmanjarres@google.com>
+Date: Tue, 29 Jul 2025 18:52:42 -0700
+Subject: mm: reinstate ability to map write-sealed memfd mappings read-only
+To: lorenzo.stoakes@oracle.com, gregkh@linuxfoundation.org, Hugh Dickins <hughd@google.com>, Baolin Wang <baolin.wang@linux.alibaba.com>, Andrew Morton <akpm@linux-foundation.org>, David Hildenbrand <david@redhat.com>, "Liam R. Howlett" <Liam.Howlett@oracle.com>, Vlastimil Babka <vbabka@suse.cz>, Mike Rapoport <rppt@kernel.org>, Suren Baghdasaryan <surenb@google.com>, Michal Hocko <mhocko@suse.com>, Jann Horn <jannh@google.com>, Pedro Falcato <pfalcato@suse.de>
+Cc: aliceryhl@google.com, stable@vger.kernel.org, "Isaac J. Manjarres" <isaacmanjarres@google.com>, kernel-team@android.com, Julian Orth <ju.orth@gmail.com>, "Liam R. Howlett" <Liam.Howlett@Oracle.com>, Linus Torvalds <torvalds@linux-foundation.org>, Shuah Khan <shuah@kernel.org>, linux-mm@kvack.org, linux-kernel@vger.kernel.org
+Message-ID: <20250730015247.30827-4-isaacmanjarres@google.com>
+
+From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
+
+[ Upstream commit 8ec396d05d1b737c87311fb7311f753b02c2a6b1 ]
+
+Patch series "mm: reinstate ability to map write-sealed memfd mappings
+read-only".
+
+In commit 158978945f31 ("mm: perform the mapping_map_writable() check
+after call_mmap()") (and preceding changes in the same series) it became
+possible to mmap() F_SEAL_WRITE sealed memfd mappings read-only.
+
+Commit 5de195060b2e ("mm: resolve faulty mmap_region() error path
+behaviour") unintentionally undid this logic by moving the
+mapping_map_writable() check before the shmem_mmap() hook is invoked,
+thereby regressing this change.
+
+This series reworks how we both permit write-sealed mappings being mapped
+read-only and disallow mprotect() from undoing the write-seal, fixing this
+regression.
+
+We also add a regression test to ensure that we do not accidentally
+regress this in future.
+
+Thanks to Julian Orth for reporting this regression.
+
+This patch (of 2):
+
+In commit 158978945f31 ("mm: perform the mapping_map_writable() check
+after call_mmap()") (and preceding changes in the same series) it became
+possible to mmap() F_SEAL_WRITE sealed memfd mappings read-only.
+
+This was previously unnecessarily disallowed, despite the man page
+documentation indicating that it would be, thereby limiting the usefulness
+of F_SEAL_WRITE logic.
+
+We fixed this by adapting logic that existed for the F_SEAL_FUTURE_WRITE
+seal (one which disallows future writes to the memfd) to also be used for
+F_SEAL_WRITE.
+
+For background - the F_SEAL_FUTURE_WRITE seal clears VM_MAYWRITE for a
+read-only mapping to disallow mprotect() from overriding the seal - an
+operation performed by seal_check_write(), invoked from shmem_mmap(), the
+f_op->mmap() hook used by shmem mappings.
+
+By extending this to F_SEAL_WRITE and critically - checking
+mapping_map_writable() to determine if we may map the memfd AFTER we
+invoke shmem_mmap() - the desired logic becomes possible. This is because
+mapping_map_writable() explicitly checks for VM_MAYWRITE, which we will
+have cleared.
+
+Commit 5de195060b2e ("mm: resolve faulty mmap_region() error path
+behaviour") unintentionally undid this logic by moving the
+mapping_map_writable() check before the shmem_mmap() hook is invoked,
+thereby regressing this change.
+
+We reinstate this functionality by moving the check out of shmem_mmap()
+and instead performing it in do_mmap() at the point at which VMA flags are
+being determined, which seems in any case to be a more appropriate place
+in which to make this determination.
+
+In order to achieve this we rework memfd seal logic to allow us access to
+this information using existing logic and eliminate the clearing of
+VM_MAYWRITE from seal_check_write() which we are performing in do_mmap()
+instead.
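+
+As an illustration only (not part of this patch, with error handling
+omitted), the user-visible behaviour being reinstated is roughly the
+following, assuming a kernel with this series applied:
+
+	#define _GNU_SOURCE
+	#include <fcntl.h>
+	#include <sys/mman.h>
+	#include <unistd.h>
+
+	int main(void)
+	{
+		int fd = memfd_create("example", MFD_ALLOW_SEALING);
+
+		ftruncate(fd, 4096);
+		fcntl(fd, F_ADD_SEALS, F_SEAL_WRITE);
+
+		/* A read-only shared mapping of the write-sealed memfd
+		 * succeeds again.
+		 */
+		void *ro = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 0);
+
+		/* A writable shared mapping still fails, and mprotect()
+		 * cannot later make the read-only mapping writable because
+		 * VM_MAYWRITE has been cleared in do_mmap().
+		 */
+		void *rw = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
+				MAP_SHARED, fd, 0);
+
+		return (ro != MAP_FAILED && rw == MAP_FAILED) ? 0 : 1;
+	}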
+
+Link: https://lkml.kernel.org/r/99fc35d2c62bd2e05571cf60d9f8b843c56069e0.1732804776.git.lorenzo.stoakes@oracle.com
+Fixes: 5de195060b2e ("mm: resolve faulty mmap_region() error path behaviour")
+Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
+Reported-by: Julian Orth <ju.orth@gmail.com>
+Closes: https://lore.kernel.org/all/CAHijbEUMhvJTN9Xw1GmbM266FXXv=U7s4L_Jem5x3AaPZxrYpQ@mail.gmail.com/
+Cc: Jann Horn <jannh@google.com>
+Cc: Liam R. Howlett <Liam.Howlett@Oracle.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Shuah Khan <shuah@kernel.org>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Isaac J. Manjarres <isaacmanjarres@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/memfd.h | 14 ++++++++++++
+ include/linux/mm.h | 58 ++++++++++++++++++++++++++++++++++----------------
+ mm/memfd.c | 2 -
+ mm/mmap.c | 4 +++
+ 4 files changed, 59 insertions(+), 19 deletions(-)
+
+--- a/include/linux/memfd.h
++++ b/include/linux/memfd.h
+@@ -6,11 +6,25 @@
+
+ #ifdef CONFIG_MEMFD_CREATE
+ extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg);
++unsigned int *memfd_file_seals_ptr(struct file *file);
+ #else
+ static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned long a)
+ {
+ return -EINVAL;
+ }
++
++static inline unsigned int *memfd_file_seals_ptr(struct file *file)
++{
++ return NULL;
++}
+ #endif
+
++/* Retrieve memfd seals associated with the file, if any. */
++static inline unsigned int memfd_file_seals(struct file *file)
++{
++ unsigned int *sealsp = memfd_file_seals_ptr(file);
++
++ return sealsp ? *sealsp : 0;
++}
++
+ #endif /* __LINUX_MEMFD_H */
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -3525,6 +3525,37 @@ void mem_dump_obj(void *object);
+ static inline void mem_dump_obj(void *object) {}
+ #endif
+
++static inline bool is_write_sealed(int seals)
++{
++ return seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE);
++}
++
++/**
++ * is_readonly_sealed - Checks whether write-sealed but mapped read-only,
++ *                       in which case writes should be disallowed moving
++ * forwards.
++ * @seals: the seals to check
++ * @vm_flags: the VMA flags to check
++ *
++ * Returns whether readonly sealed, in which case writes should be disallowed
++ * going forward.
++ */
++static inline bool is_readonly_sealed(int seals, vm_flags_t vm_flags)
++{
++ /*
++ * Since an F_SEAL_[FUTURE_]WRITE sealed memfd can be mapped as
++ * MAP_SHARED and read-only, take care to not allow mprotect to
++ * revert protections on such mappings. Do this only for shared
++ * mappings. For private mappings, don't need to mask
++ * VM_MAYWRITE as we still want them to be COW-writable.
++ */
++ if (is_write_sealed(seals) &&
++ ((vm_flags & (VM_SHARED | VM_WRITE)) == VM_SHARED))
++ return true;
++
++ return false;
++}
++
+ /**
+ * seal_check_write - Check for F_SEAL_WRITE or F_SEAL_FUTURE_WRITE flags and
+ * handle them.
+@@ -3536,24 +3567,15 @@ static inline void mem_dump_obj(void *ob
+ */
+ static inline int seal_check_write(int seals, struct vm_area_struct *vma)
+ {
+- if (seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
+- /*
+- * New PROT_WRITE and MAP_SHARED mmaps are not allowed when
+- * write seals are active.
+- */
+- if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
+- return -EPERM;
+-
+- /*
+- * Since an F_SEAL_[FUTURE_]WRITE sealed memfd can be mapped as
+- * MAP_SHARED and read-only, take care to not allow mprotect to
+- * revert protections on such mappings. Do this only for shared
+- * mappings. For private mappings, don't need to mask
+- * VM_MAYWRITE as we still want them to be COW-writable.
+- */
+- if (vma->vm_flags & VM_SHARED)
+- vma->vm_flags &= ~(VM_MAYWRITE);
+- }
++ if (!is_write_sealed(seals))
++ return 0;
++
++ /*
++ * New PROT_WRITE and MAP_SHARED mmaps are not allowed when
++ * write seals are active.
++ */
++ if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
++ return -EPERM;
+
+ return 0;
+ }
+--- a/mm/memfd.c
++++ b/mm/memfd.c
+@@ -133,7 +133,7 @@ static int memfd_wait_for_pins(struct ad
+ return error;
+ }
+
+-static unsigned int *memfd_file_seals_ptr(struct file *file)
++unsigned int *memfd_file_seals_ptr(struct file *file)
+ {
+ if (shmem_file(file))
+ return &SHMEM_I(file_inode(file))->seals;
+--- a/mm/mmap.c
++++ b/mm/mmap.c
+@@ -46,6 +46,7 @@
+ #include <linux/pkeys.h>
+ #include <linux/oom.h>
+ #include <linux/sched/mm.h>
++#include <linux/memfd.h>
+
+ #include <linux/uaccess.h>
+ #include <asm/cacheflush.h>
+@@ -1336,6 +1337,7 @@ unsigned long do_mmap(struct file *file,
+
+ if (file) {
+ struct inode *inode = file_inode(file);
++ unsigned int seals = memfd_file_seals(file);
+ unsigned long flags_mask;
+
+ if (!file_mmap_ok(file, inode, pgoff, len))
+@@ -1374,6 +1376,8 @@ unsigned long do_mmap(struct file *file,
+ vm_flags |= VM_SHARED | VM_MAYSHARE;
+ if (!(file->f_mode & FMODE_WRITE))
+ vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
++ else if (is_readonly_sealed(seals, vm_flags))
++ vm_flags &= ~VM_MAYWRITE;
+ fallthrough;
+ case MAP_PRIVATE:
+ if (!(file->f_mode & FMODE_READ))
--- /dev/null
+From stable+bounces-165165-greg=kroah.com@vger.kernel.org Wed Jul 30 03:53:44 2025
+From: "Isaac J. Manjarres" <isaacmanjarres@google.com>
+Date: Tue, 29 Jul 2025 18:52:41 -0700
+Subject: mm: update memfd seal write check to include F_SEAL_WRITE
+To: lorenzo.stoakes@oracle.com, gregkh@linuxfoundation.org, Muchun Song <muchun.song@linux.dev>, Oscar Salvador <osalvador@suse.de>, David Hildenbrand <david@redhat.com>, Andrew Morton <akpm@linux-foundation.org>, "Liam R. Howlett" <Liam.Howlett@oracle.com>, Vlastimil Babka <vbabka@suse.cz>, Mike Rapoport <rppt@kernel.org>, Suren Baghdasaryan <surenb@google.com>, Michal Hocko <mhocko@suse.com>, Hugh Dickins <hughd@google.com>, Baolin Wang <baolin.wang@linux.alibaba.com>
+Cc: aliceryhl@google.com, stable@vger.kernel.org, "Isaac J. Manjarres" <isaacmanjarres@google.com>, kernel-team@android.com, Lorenzo Stoakes <lstoakes@gmail.com>, Jan Kara <jack@suse.cz>, Alexander Viro <viro@zeniv.linux.org.uk>, Andy Lutomirski <luto@kernel.org>, Christian Brauner <brauner@kernel.org>, "Matthew Wilcox (Oracle)" <willy@infradead.org>, Mike Kravetz <mike.kravetz@oracle.com>, linux-mm@kvack.org, linux-kernel@vger.kernel.org
+Message-ID: <20250730015247.30827-3-isaacmanjarres@google.com>
+
+From: Lorenzo Stoakes <lstoakes@gmail.com>
+
+[ Upstream commit 28464bbb2ddc199433383994bcb9600c8034afa1 ]
+
+The seal_check_future_write() function is called by shmem_mmap() or
+hugetlbfs_file_mmap() to disallow any future writable mappings of a memfd
+sealed this way.
+
+The F_SEAL_WRITE flag is not checked here, as that is handled via the
+mapping->i_mmap_writable mechanism and so any attempt at a mapping would
+fail before this could be run.
+
+However we intend to change this, meaning this check can be performed for
+F_SEAL_WRITE mappings also.
+
+The logic here is equally applicable to both flags, so update this
+function to accommodate both and rename it accordingly.
+
+Link: https://lkml.kernel.org/r/913628168ce6cce77df7d13a63970bae06a526e0.1697116581.git.lstoakes@gmail.com
+Signed-off-by: Lorenzo Stoakes <lstoakes@gmail.com>
+Reviewed-by: Jan Kara <jack@suse.cz>
+Cc: Alexander Viro <viro@zeniv.linux.org.uk>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Christian Brauner <brauner@kernel.org>
+Cc: Hugh Dickins <hughd@google.com>
+Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
+Cc: Mike Kravetz <mike.kravetz@oracle.com>
+Cc: Muchun Song <muchun.song@linux.dev>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Cc: stable@vger.kernel.org
+Signed-off-by: Isaac J. Manjarres <isaacmanjarres@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/hugetlbfs/inode.c | 2 +-
+ include/linux/mm.h | 15 ++++++++-------
+ mm/shmem.c | 2 +-
+ 3 files changed, 10 insertions(+), 9 deletions(-)
+
+--- a/fs/hugetlbfs/inode.c
++++ b/fs/hugetlbfs/inode.c
+@@ -136,7 +136,7 @@ static int hugetlbfs_file_mmap(struct fi
+ vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND;
+ vma->vm_ops = &hugetlb_vm_ops;
+
+- ret = seal_check_future_write(info->seals, vma);
++ ret = seal_check_write(info->seals, vma);
+ if (ret)
+ return ret;
+
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -3526,25 +3526,26 @@ static inline void mem_dump_obj(void *ob
+ #endif
+
+ /**
+- * seal_check_future_write - Check for F_SEAL_FUTURE_WRITE flag and handle it
++ * seal_check_write - Check for F_SEAL_WRITE or F_SEAL_FUTURE_WRITE flags and
++ * handle them.
+ * @seals: the seals to check
+ * @vma: the vma to operate on
+ *
+- * Check whether F_SEAL_FUTURE_WRITE is set; if so, do proper check/handling on
+- * the vma flags. Return 0 if check pass, or <0 for errors.
++ * Check whether F_SEAL_WRITE or F_SEAL_FUTURE_WRITE are set; if so, do proper
++ * check/handling on the vma flags. Return 0 if check pass, or <0 for errors.
+ */
+-static inline int seal_check_future_write(int seals, struct vm_area_struct *vma)
++static inline int seal_check_write(int seals, struct vm_area_struct *vma)
+ {
+- if (seals & F_SEAL_FUTURE_WRITE) {
++ if (seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
+ /*
+ * New PROT_WRITE and MAP_SHARED mmaps are not allowed when
+- * "future write" seal active.
++ * write seals are active.
+ */
+ if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
+ return -EPERM;
+
+ /*
+- * Since an F_SEAL_FUTURE_WRITE sealed memfd can be mapped as
++ * Since an F_SEAL_[FUTURE_]WRITE sealed memfd can be mapped as
+ * MAP_SHARED and read-only, take care to not allow mprotect to
+ * revert protections on such mappings. Do this only for shared
+ * mappings. For private mappings, don't need to mask
+--- a/mm/shmem.c
++++ b/mm/shmem.c
+@@ -2302,7 +2302,7 @@ static int shmem_mmap(struct file *file,
+ struct shmem_inode_info *info = SHMEM_I(file_inode(file));
+ int ret;
+
+- ret = seal_check_future_write(info->seals, vma);
++ ret = seal_check_write(info->seals, vma);
+ if (ret)
+ return ret;
+
--- /dev/null
+From stable+bounces-164937-greg=kroah.com@vger.kernel.org Mon Jul 28 15:29:43 2025
+From: "Matthieu Baerts (NGI0)" <matttbe@kernel.org>
+Date: Mon, 28 Jul 2025 15:29:21 +0200
+Subject: mptcp: make fallback action and fallback decision atomic
+To: mptcp@lists.linux.dev, stable@vger.kernel.org, gregkh@linuxfoundation.org
+Cc: Paolo Abeni <pabeni@redhat.com>, sashal@kernel.org, Matthieu Baerts <matttbe@kernel.org>, syzbot+5cf807c20386d699b524@syzkaller.appspotmail.com, Jakub Kicinski <kuba@kernel.org>
+Message-ID: <20250728132919.3904847-6-matttbe@kernel.org>
+
+From: Paolo Abeni <pabeni@redhat.com>
+
+commit f8a1d9b18c5efc76784f5a326e905f641f839894 upstream.
+
+Syzkaller reported the following splat:
+
+ WARNING: CPU: 1 PID: 7704 at net/mptcp/protocol.h:1223 __mptcp_do_fallback net/mptcp/protocol.h:1223 [inline]
+ WARNING: CPU: 1 PID: 7704 at net/mptcp/protocol.h:1223 mptcp_do_fallback net/mptcp/protocol.h:1244 [inline]
+ WARNING: CPU: 1 PID: 7704 at net/mptcp/protocol.h:1223 check_fully_established net/mptcp/options.c:982 [inline]
+ WARNING: CPU: 1 PID: 7704 at net/mptcp/protocol.h:1223 mptcp_incoming_options+0x21a8/0x2510 net/mptcp/options.c:1153
+ Modules linked in:
+ CPU: 1 UID: 0 PID: 7704 Comm: syz.3.1419 Not tainted 6.16.0-rc3-gbd5ce2324dba #20 PREEMPT(voluntary)
+ Hardware name: QEMU Ubuntu 24.04 PC (i440FX + PIIX, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014
+ RIP: 0010:__mptcp_do_fallback net/mptcp/protocol.h:1223 [inline]
+ RIP: 0010:mptcp_do_fallback net/mptcp/protocol.h:1244 [inline]
+ RIP: 0010:check_fully_established net/mptcp/options.c:982 [inline]
+ RIP: 0010:mptcp_incoming_options+0x21a8/0x2510 net/mptcp/options.c:1153
+ Code: 24 18 e8 bb 2a 00 fd e9 1b df ff ff e8 b1 21 0f 00 e8 ec 5f c4 fc 44 0f b7 ac 24 b0 00 00 00 e9 54 f1 ff ff e8 d9 5f c4 fc 90 <0f> 0b 90 e9 b8 f4 ff ff e8 8b 2a 00 fd e9 8d e6 ff ff e8 81 2a 00
+ RSP: 0018:ffff8880a3f08448 EFLAGS: 00010246
+ RAX: 0000000000000000 RBX: ffff8880180a8000 RCX: ffffffff84afcf45
+ RDX: ffff888090223700 RSI: ffffffff84afdaa7 RDI: 0000000000000001
+ RBP: ffff888017955780 R08: 0000000000000001 R09: 0000000000000000
+ R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
+ R13: ffff8880180a8910 R14: ffff8880a3e9d058 R15: 0000000000000000
+ FS: 00005555791b8500(0000) GS:ffff88811c495000(0000) knlGS:0000000000000000
+ CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+ CR2: 000000110c2800b7 CR3: 0000000058e44000 CR4: 0000000000350ef0
+ Call Trace:
+ <IRQ>
+ tcp_reset+0x26f/0x2b0 net/ipv4/tcp_input.c:4432
+ tcp_validate_incoming+0x1057/0x1b60 net/ipv4/tcp_input.c:5975
+ tcp_rcv_established+0x5b5/0x21f0 net/ipv4/tcp_input.c:6166
+ tcp_v4_do_rcv+0x5dc/0xa70 net/ipv4/tcp_ipv4.c:1925
+ tcp_v4_rcv+0x3473/0x44a0 net/ipv4/tcp_ipv4.c:2363
+ ip_protocol_deliver_rcu+0xba/0x480 net/ipv4/ip_input.c:205
+ ip_local_deliver_finish+0x2f1/0x500 net/ipv4/ip_input.c:233
+ NF_HOOK include/linux/netfilter.h:317 [inline]
+ NF_HOOK include/linux/netfilter.h:311 [inline]
+ ip_local_deliver+0x1be/0x560 net/ipv4/ip_input.c:254
+ dst_input include/net/dst.h:469 [inline]
+ ip_rcv_finish net/ipv4/ip_input.c:447 [inline]
+ NF_HOOK include/linux/netfilter.h:317 [inline]
+ NF_HOOK include/linux/netfilter.h:311 [inline]
+ ip_rcv+0x514/0x810 net/ipv4/ip_input.c:567
+ __netif_receive_skb_one_core+0x197/0x1e0 net/core/dev.c:5975
+ __netif_receive_skb+0x1f/0x120 net/core/dev.c:6088
+ process_backlog+0x301/0x1360 net/core/dev.c:6440
+ __napi_poll.constprop.0+0xba/0x550 net/core/dev.c:7453
+ napi_poll net/core/dev.c:7517 [inline]
+ net_rx_action+0xb44/0x1010 net/core/dev.c:7644
+ handle_softirqs+0x1d0/0x770 kernel/softirq.c:579
+ do_softirq+0x3f/0x90 kernel/softirq.c:480
+ </IRQ>
+ <TASK>
+ __local_bh_enable_ip+0xed/0x110 kernel/softirq.c:407
+ local_bh_enable include/linux/bottom_half.h:33 [inline]
+ inet_csk_listen_stop+0x2c5/0x1070 net/ipv4/inet_connection_sock.c:1524
+ mptcp_check_listen_stop.part.0+0x1cc/0x220 net/mptcp/protocol.c:2985
+ mptcp_check_listen_stop net/mptcp/mib.h:118 [inline]
+ __mptcp_close+0x9b9/0xbd0 net/mptcp/protocol.c:3000
+ mptcp_close+0x2f/0x140 net/mptcp/protocol.c:3066
+ inet_release+0xed/0x200 net/ipv4/af_inet.c:435
+ inet6_release+0x4f/0x70 net/ipv6/af_inet6.c:487
+ __sock_release+0xb3/0x270 net/socket.c:649
+ sock_close+0x1c/0x30 net/socket.c:1439
+ __fput+0x402/0xb70 fs/file_table.c:465
+ task_work_run+0x150/0x240 kernel/task_work.c:227
+ resume_user_mode_work include/linux/resume_user_mode.h:50 [inline]
+ exit_to_user_mode_loop+0xd4/0xe0 kernel/entry/common.c:114
+ exit_to_user_mode_prepare include/linux/entry-common.h:330 [inline]
+ syscall_exit_to_user_mode_work include/linux/entry-common.h:414 [inline]
+ syscall_exit_to_user_mode include/linux/entry-common.h:449 [inline]
+ do_syscall_64+0x245/0x360 arch/x86/entry/syscall_64.c:100
+ entry_SYSCALL_64_after_hwframe+0x77/0x7f
+ RIP: 0033:0x7fc92f8a36ad
+ Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b0 ff ff ff f7 d8 64 89 01 48
+ RSP: 002b:00007ffcf52802d8 EFLAGS: 00000246 ORIG_RAX: 00000000000001b4
+ RAX: 0000000000000000 RBX: 00007ffcf52803a8 RCX: 00007fc92f8a36ad
+ RDX: 0000000000000000 RSI: 000000000000001e RDI: 0000000000000003
+ RBP: 00007fc92fae7ba0 R08: 0000000000000001 R09: 0000002800000000
+ R10: 00007fc92f700000 R11: 0000000000000246 R12: 00007fc92fae5fac
+ R13: 00007fc92fae5fa0 R14: 0000000000026d00 R15: 0000000000026c51
+ </TASK>
+ irq event stamp: 4068
+ hardirqs last enabled at (4076): [<ffffffff81544816>] __up_console_sem+0x76/0x80 kernel/printk/printk.c:344
+ hardirqs last disabled at (4085): [<ffffffff815447fb>] __up_console_sem+0x5b/0x80 kernel/printk/printk.c:342
+ softirqs last enabled at (3096): [<ffffffff840e1be0>] local_bh_enable include/linux/bottom_half.h:33 [inline]
+ softirqs last enabled at (3096): [<ffffffff840e1be0>] inet_csk_listen_stop+0x2c0/0x1070 net/ipv4/inet_connection_sock.c:1524
+ softirqs last disabled at (3097): [<ffffffff813b6b9f>] do_softirq+0x3f/0x90 kernel/softirq.c:480
+
+Since we need to track the 'fallback is possible' condition and the
+fallback status separately, there are a few possible races open between
+the check and the actual fallback action.
+
+Add a spinlock to protect the fallback related information and use it to
+close all the possible related races. While at it also remove the
+too-early clearing of allow_infinite_fallback in __mptcp_subflow_connect():
+the field will be correctly cleared by subflow_finish_connect() if/when
+the connection completes successfully.
+
+If fallback is not possible, as per RFC, reset the current subflow.
+
+Since the fallback operation can now fail and its return value should be
+checked, rename the helper accordingly.
+
+Fixes: 0530020a7c8f ("mptcp: track and update contiguous data status")
+Cc: stable@vger.kernel.org
+Reported-by: Matthieu Baerts <matttbe@kernel.org>
+Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/570
+Reported-by: syzbot+5cf807c20386d699b524@syzkaller.appspotmail.com
+Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/555
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Reviewed-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
+Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
+Link: https://patch.msgid.link/20250714-net-mptcp-fallback-races-v1-1-391aff963322@kernel.org
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+[ Conflicts in protocol.h, because commit 6ebf6f90ab4a ("mptcp: add
+ mptcpi_subflows_total counter") is not in this version, and this
+ causes conflicts in the context. Commit 65b02260a0e0 ("mptcp: export
+ mptcp_subflow_early_fallback()") is also not in this version, and
+ moves code from protocol.c to protocol.h, but the modification can
+ still apply there. Conflicts in protocol.c because commit ee2708aedad0
+ ("mptcp: use get_retrans wrapper") is not in this version and refactor
+ the code in __mptcp_retrans(), but the modification can still be
+ applied, just not at the same indentation level. There were other
+ conflicts in the context due to commit 8005184fd1ca ("mptcp: refactor
+ sndbuf auto-tuning"), commit b3ea6b272d79 ("mptcp: consolidate initial
+ ack seq generation"), and commit 013e3179dbd2 ("mptcp: fix rcv space
+ initialization") that are not in this version. ]
+Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/mptcp/options.c | 3 ++-
+ net/mptcp/protocol.c | 39 +++++++++++++++++++++++++++++++++------
+ net/mptcp/protocol.h | 24 ++++++++++++++++++------
+ net/mptcp/subflow.c | 11 +++++------
+ 4 files changed, 58 insertions(+), 19 deletions(-)
+
+--- a/net/mptcp/options.c
++++ b/net/mptcp/options.c
+@@ -973,8 +973,9 @@ static bool check_fully_established(stru
+ if (subflow->mp_join)
+ goto reset;
+ subflow->mp_capable = 0;
++ if (!mptcp_try_fallback(ssk))
++ goto reset;
+ pr_fallback(msk);
+- mptcp_do_fallback(ssk);
+ return false;
+ }
+
+--- a/net/mptcp/protocol.c
++++ b/net/mptcp/protocol.c
+@@ -633,10 +633,9 @@ static bool mptcp_check_data_fin(struct
+
+ static void mptcp_dss_corruption(struct mptcp_sock *msk, struct sock *ssk)
+ {
+- if (READ_ONCE(msk->allow_infinite_fallback)) {
++ if (mptcp_try_fallback(ssk)) {
+ MPTCP_INC_STATS(sock_net(ssk),
+ MPTCP_MIB_DSSCORRUPTIONFALLBACK);
+- mptcp_do_fallback(ssk);
+ } else {
+ MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSCORRUPTIONRESET);
+ mptcp_subflow_reset(ssk);
+@@ -897,6 +896,14 @@ static bool __mptcp_finish_join(struct m
+ if (sk->sk_state != TCP_ESTABLISHED)
+ return false;
+
++ spin_lock_bh(&msk->fallback_lock);
++ if (__mptcp_check_fallback(msk)) {
++ spin_unlock_bh(&msk->fallback_lock);
++ return false;
++ }
++ mptcp_subflow_joined(msk, ssk);
++ spin_unlock_bh(&msk->fallback_lock);
++
+ /* attach to msk socket only after we are sure we will deal with it
+ * at close time
+ */
+@@ -904,7 +911,6 @@ static bool __mptcp_finish_join(struct m
+ mptcp_sock_graft(ssk, sk->sk_socket);
+
+ mptcp_sockopt_sync_locked(msk, ssk);
+- mptcp_subflow_joined(msk, ssk);
+ mptcp_stop_tout_timer(sk);
+ return true;
+ }
+@@ -1288,10 +1294,14 @@ static void mptcp_update_infinite_map(st
+ mpext->infinite_map = 1;
+ mpext->data_len = 0;
+
++ if (!mptcp_try_fallback(ssk)) {
++ mptcp_subflow_reset(ssk);
++ return;
++ }
++
+ MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPTX);
+ mptcp_subflow_ctx(ssk)->send_infinite_map = 0;
+ pr_fallback(msk);
+- mptcp_do_fallback(ssk);
+ }
+
+ #define MPTCP_MAX_GSO_SIZE (GSO_LEGACY_MAX_SIZE - (MAX_TCP_HEADER + 1))
+@@ -2638,8 +2648,8 @@ static void mptcp_check_fastclose(struct
+
+ static void __mptcp_retrans(struct sock *sk)
+ {
++ struct mptcp_sendmsg_info info = { .data_lock_held = true, };
+ struct mptcp_sock *msk = mptcp_sk(sk);
+- struct mptcp_sendmsg_info info = {};
+ struct mptcp_data_frag *dfrag;
+ size_t copied = 0;
+ struct sock *ssk;
+@@ -2675,6 +2685,15 @@ static void __mptcp_retrans(struct sock
+ /* limit retransmission to the bytes already sent on some subflows */
+ info.sent = 0;
+ info.limit = READ_ONCE(msk->csum_enabled) ? dfrag->data_len : dfrag->already_sent;
++
++ /* make the whole retrans decision, xmit, disallow fallback atomic */
++ spin_lock_bh(&msk->fallback_lock);
++ if (__mptcp_check_fallback(msk)) {
++ spin_unlock_bh(&msk->fallback_lock);
++ release_sock(ssk);
++ return;
++ }
++
+ while (info.sent < info.limit) {
+ ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
+ if (ret <= 0)
+@@ -2690,6 +2709,7 @@ static void __mptcp_retrans(struct sock
+ info.size_goal);
+ WRITE_ONCE(msk->allow_infinite_fallback, false);
+ }
++ spin_unlock_bh(&msk->fallback_lock);
+
+ release_sock(ssk);
+
+@@ -2819,6 +2839,7 @@ static int __mptcp_init_sock(struct sock
+ msk->recovery = false;
+
+ mptcp_pm_data_init(msk);
++ spin_lock_init(&msk->fallback_lock);
+
+ /* re-use the csk retrans timer for MPTCP-level retrans */
+ timer_setup(&msk->sk.icsk_retransmit_timer, mptcp_retransmit_timer, 0);
+@@ -3651,7 +3672,13 @@ bool mptcp_finish_join(struct sock *ssk)
+
+ /* active subflow, already present inside the conn_list */
+ if (!list_empty(&subflow->node)) {
++ spin_lock_bh(&msk->fallback_lock);
++ if (__mptcp_check_fallback(msk)) {
++ spin_unlock_bh(&msk->fallback_lock);
++ return false;
++ }
+ mptcp_subflow_joined(msk, ssk);
++ spin_unlock_bh(&msk->fallback_lock);
+ return true;
+ }
+
+@@ -3764,7 +3791,7 @@ static void mptcp_subflow_early_fallback
+ struct mptcp_subflow_context *subflow)
+ {
+ subflow->request_mptcp = 0;
+- __mptcp_do_fallback(msk);
++ WARN_ON_ONCE(!__mptcp_try_fallback(msk));
+ }
+
+ static int mptcp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+--- a/net/mptcp/protocol.h
++++ b/net/mptcp/protocol.h
+@@ -317,6 +317,10 @@ struct mptcp_sock {
+
+ u32 setsockopt_seq;
+ char ca_name[TCP_CA_NAME_MAX];
++
++ spinlock_t fallback_lock; /* protects fallback and
++ * allow_infinite_fallback
++ */
+ };
+
+ #define mptcp_data_lock(sk) spin_lock_bh(&(sk)->sk_lock.slock)
+@@ -975,25 +979,32 @@ static inline bool mptcp_check_fallback(
+ return __mptcp_check_fallback(msk);
+ }
+
+-static inline void __mptcp_do_fallback(struct mptcp_sock *msk)
++static inline bool __mptcp_try_fallback(struct mptcp_sock *msk)
+ {
+ if (test_bit(MPTCP_FALLBACK_DONE, &msk->flags)) {
+ pr_debug("TCP fallback already done (msk=%p)\n", msk);
+- return;
++ return true;
+ }
+- if (WARN_ON_ONCE(!READ_ONCE(msk->allow_infinite_fallback)))
+- return;
++ spin_lock_bh(&msk->fallback_lock);
++ if (!msk->allow_infinite_fallback) {
++ spin_unlock_bh(&msk->fallback_lock);
++ return false;
++ }
++
+ set_bit(MPTCP_FALLBACK_DONE, &msk->flags);
++ spin_unlock_bh(&msk->fallback_lock);
++ return true;
+ }
+
+-static inline void mptcp_do_fallback(struct sock *ssk)
++static inline bool mptcp_try_fallback(struct sock *ssk)
+ {
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ struct sock *sk = subflow->conn;
+ struct mptcp_sock *msk;
+
+ msk = mptcp_sk(sk);
+- __mptcp_do_fallback(msk);
++ if (!__mptcp_try_fallback(msk))
++ return false;
+ if (READ_ONCE(msk->snd_data_fin_enable) && !(ssk->sk_shutdown & SEND_SHUTDOWN)) {
+ gfp_t saved_allocation = ssk->sk_allocation;
+
+@@ -1005,6 +1016,7 @@ static inline void mptcp_do_fallback(str
+ tcp_shutdown(ssk, SEND_SHUTDOWN);
+ ssk->sk_allocation = saved_allocation;
+ }
++ return true;
+ }
+
+ #define pr_fallback(a) pr_debug("%s:fallback to TCP (msk=%p)\n", __func__, a)
+--- a/net/mptcp/subflow.c
++++ b/net/mptcp/subflow.c
+@@ -431,9 +431,11 @@ static void subflow_finish_connect(struc
+ mptcp_get_options(skb, &mp_opt);
+ if (subflow->request_mptcp) {
+ if (!(mp_opt.suboptions & OPTION_MPTCP_MPC_SYNACK)) {
++ if (!mptcp_try_fallback(sk))
++ goto do_reset;
++
+ MPTCP_INC_STATS(sock_net(sk),
+ MPTCP_MIB_MPCAPABLEACTIVEFALLBACK);
+- mptcp_do_fallback(sk);
+ pr_fallback(mptcp_sk(subflow->conn));
+ goto fallback;
+ }
+@@ -1269,7 +1271,7 @@ fallback:
+ return true;
+ }
+
+- if (!READ_ONCE(msk->allow_infinite_fallback)) {
++ if (!mptcp_try_fallback(ssk)) {
+ /* fatal protocol error, close the socket.
+ * subflow_error_report() will introduce the appropriate barriers
+ */
+@@ -1285,8 +1287,6 @@ reset:
+ WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA);
+ return false;
+ }
+-
+- mptcp_do_fallback(ssk);
+ }
+
+ skb = skb_peek(&ssk->sk_receive_queue);
+@@ -1519,7 +1519,6 @@ int __mptcp_subflow_connect(struct sock
+ /* discard the subflow socket */
+ mptcp_sock_graft(ssk, sk->sk_socket);
+ iput(SOCK_INODE(sf));
+- WRITE_ONCE(msk->allow_infinite_fallback, false);
+ mptcp_stop_tout_timer(sk);
+ return 0;
+
+@@ -1690,7 +1689,7 @@ static void subflow_state_change(struct
+ msk = mptcp_sk(parent);
+ if (subflow_simultaneous_connect(sk)) {
+ mptcp_propagate_sndbuf(parent, sk);
+- mptcp_do_fallback(sk);
++ WARN_ON_ONCE(!mptcp_try_fallback(sk));
+ mptcp_rcv_space_init(msk, sk);
+ pr_fallback(msk);
+ subflow->conn_finished = 1;
--- /dev/null
+From stable+bounces-164936-greg=kroah.com@vger.kernel.org Mon Jul 28 15:29:45 2025
+From: "Matthieu Baerts (NGI0)" <matttbe@kernel.org>
+Date: Mon, 28 Jul 2025 15:29:22 +0200
+Subject: mptcp: plug races between subflow fail and subflow creation
+To: mptcp@lists.linux.dev, stable@vger.kernel.org, gregkh@linuxfoundation.org
+Cc: Paolo Abeni <pabeni@redhat.com>, sashal@kernel.org, "Matthieu Baerts (NGI0)" <matttbe@kernel.org>, Jakub Kicinski <kuba@kernel.org>
+Message-ID: <20250728132919.3904847-7-matttbe@kernel.org>
+
+From: Paolo Abeni <pabeni@redhat.com>
+
+commit def5b7b2643ebba696fc60ddf675dca13f073486 upstream.
+
+We have races similar to the one addressed by the previous patch between
+subflow failing and additional subflow creation. They are just harder to
+trigger.
+
+The solution is similar. Use a separate flag to track the condition
+'socket state prevents any additional subflow creation' protected by the
+fallback lock.
+
+The socket fallback makes such flag true, as does receiving or sending
+an MP_FAIL option.
+
+The field 'allow_infinite_fallback' is now always touched under the
+relevant lock, so we can drop the ONCE annotation on write.
+
+Fixes: 478d770008b0 ("mptcp: send out MP_FAIL when data checksum fails")
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Reviewed-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
+Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
+Link: https://patch.msgid.link/20250714-net-mptcp-fallback-races-v1-2-391aff963322@kernel.org
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+[ Conflicts in subflow.c, because commit f1f26512a9bf ("mptcp: use plain
+ bool instead of custom binary enum") and commit 46a5d3abedbe
+ ("mptcp: fix typos in comments") are not in this version. Both are
+ causing conflicts in the context, and the same modifications can still
+ be applied. Same in protocol.h with commit b8dc6d6ce931 ("mptcp: fix
+ rcv buffer auto-tuning"). Conflicts in protocol.c because commit
+ ee2708aedad0 ("mptcp: use get_retrans wrapper") is not in this version
+ and refactors the code in __mptcp_retrans(), but the modification can
+ still be applied, just not at the same indentation level. ]
+Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/mptcp/pm.c | 8 +++++++-
+ net/mptcp/protocol.c | 11 ++++++-----
+ net/mptcp/protocol.h | 7 +++++--
+ net/mptcp/subflow.c | 19 ++++++++++++++-----
+ 4 files changed, 32 insertions(+), 13 deletions(-)
+
+--- a/net/mptcp/pm.c
++++ b/net/mptcp/pm.c
+@@ -309,8 +309,14 @@ void mptcp_pm_mp_fail_received(struct so
+
+ pr_debug("fail_seq=%llu\n", fail_seq);
+
+- if (!READ_ONCE(msk->allow_infinite_fallback))
++ /* After accepting the fail, we can't create any other subflows */
++ spin_lock_bh(&msk->fallback_lock);
++ if (!msk->allow_infinite_fallback) {
++ spin_unlock_bh(&msk->fallback_lock);
+ return;
++ }
++ msk->allow_subflows = false;
++ spin_unlock_bh(&msk->fallback_lock);
+
+ if (!subflow->fail_tout) {
+ pr_debug("send MP_FAIL response and infinite map\n");
+--- a/net/mptcp/protocol.c
++++ b/net/mptcp/protocol.c
+@@ -885,7 +885,7 @@ void mptcp_data_ready(struct sock *sk, s
+ static void mptcp_subflow_joined(struct mptcp_sock *msk, struct sock *ssk)
+ {
+ mptcp_subflow_ctx(ssk)->map_seq = READ_ONCE(msk->ack_seq);
+- WRITE_ONCE(msk->allow_infinite_fallback, false);
++ msk->allow_infinite_fallback = false;
+ mptcp_event(MPTCP_EVENT_SUB_ESTABLISHED, msk, ssk, GFP_ATOMIC);
+ }
+
+@@ -897,7 +897,7 @@ static bool __mptcp_finish_join(struct m
+ return false;
+
+ spin_lock_bh(&msk->fallback_lock);
+- if (__mptcp_check_fallback(msk)) {
++ if (!msk->allow_subflows) {
+ spin_unlock_bh(&msk->fallback_lock);
+ return false;
+ }
+@@ -2707,7 +2707,7 @@ static void __mptcp_retrans(struct sock
+ dfrag->already_sent = max(dfrag->already_sent, info.sent);
+ tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
+ info.size_goal);
+- WRITE_ONCE(msk->allow_infinite_fallback, false);
++ msk->allow_infinite_fallback = false;
+ }
+ spin_unlock_bh(&msk->fallback_lock);
+
+@@ -2835,7 +2835,8 @@ static int __mptcp_init_sock(struct sock
+ WRITE_ONCE(msk->first, NULL);
+ inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss;
+ WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk)));
+- WRITE_ONCE(msk->allow_infinite_fallback, true);
++ msk->allow_infinite_fallback = true;
++ msk->allow_subflows = true;
+ msk->recovery = false;
+
+ mptcp_pm_data_init(msk);
+@@ -3673,7 +3674,7 @@ bool mptcp_finish_join(struct sock *ssk)
+ /* active subflow, already present inside the conn_list */
+ if (!list_empty(&subflow->node)) {
+ spin_lock_bh(&msk->fallback_lock);
+- if (__mptcp_check_fallback(msk)) {
++ if (!msk->allow_subflows) {
+ spin_unlock_bh(&msk->fallback_lock);
+ return false;
+ }
+--- a/net/mptcp/protocol.h
++++ b/net/mptcp/protocol.h
+@@ -314,12 +314,14 @@ struct mptcp_sock {
+ u64 time; /* start time of measurement window */
+ u64 rtt_us; /* last maximum rtt of subflows */
+ } rcvq_space;
++ bool allow_subflows;
+
+ u32 setsockopt_seq;
+ char ca_name[TCP_CA_NAME_MAX];
+
+- spinlock_t fallback_lock; /* protects fallback and
+- * allow_infinite_fallback
++ spinlock_t fallback_lock; /* protects fallback,
++ * allow_infinite_fallback and
++ * allow_join
+ */
+ };
+
+@@ -991,6 +993,7 @@ static inline bool __mptcp_try_fallback(
+ return false;
+ }
+
++ msk->allow_subflows = false;
+ set_bit(MPTCP_FALLBACK_DONE, &msk->flags);
+ spin_unlock_bh(&msk->fallback_lock);
+ return true;
+--- a/net/mptcp/subflow.c
++++ b/net/mptcp/subflow.c
+@@ -1168,20 +1168,29 @@ static void subflow_sched_work_if_closed
+ mptcp_schedule_work(sk);
+ }
+
+-static void mptcp_subflow_fail(struct mptcp_sock *msk, struct sock *ssk)
++static bool mptcp_subflow_fail(struct mptcp_sock *msk, struct sock *ssk)
+ {
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ unsigned long fail_tout;
+
++ /* we are really failing, prevent any later subflow join */
++ spin_lock_bh(&msk->fallback_lock);
++ if (!msk->allow_infinite_fallback) {
++ spin_unlock_bh(&msk->fallback_lock);
++ return false;
++ }
++ msk->allow_subflows = false;
++ spin_unlock_bh(&msk->fallback_lock);
++
+ /* greceful failure can happen only on the MPC subflow */
+ if (WARN_ON_ONCE(ssk != READ_ONCE(msk->first)))
+- return;
++ return false;
+
+ /* since the close timeout take precedence on the fail one,
+ * no need to start the latter when the first is already set
+ */
+ if (sock_flag((struct sock *)msk, SOCK_DEAD))
+- return;
++ return true;
+
+ /* we don't need extreme accuracy here, use a zero fail_tout as special
+ * value meaning no fail timeout at all;
+@@ -1193,6 +1202,7 @@ static void mptcp_subflow_fail(struct mp
+ tcp_send_ack(ssk);
+
+ mptcp_reset_tout_timer(msk, subflow->fail_tout);
++ return true;
+ }
+
+ static bool subflow_check_data_avail(struct sock *ssk)
+@@ -1261,12 +1271,11 @@ fallback:
+ (subflow->mp_join || subflow->valid_csum_seen)) {
+ subflow->send_mp_fail = 1;
+
+- if (!READ_ONCE(msk->allow_infinite_fallback)) {
++ if (!mptcp_subflow_fail(msk, ssk)) {
+ subflow->reset_transient = 0;
+ subflow->reset_reason = MPTCP_RST_EMIDDLEBOX;
+ goto reset;
+ }
+- mptcp_subflow_fail(msk, ssk);
+ WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_DATA_AVAIL);
+ return true;
+ }
--- /dev/null
+From stable+bounces-164938-greg=kroah.com@vger.kernel.org Mon Jul 28 15:29:45 2025
+From: "Matthieu Baerts (NGI0)" <matttbe@kernel.org>
+Date: Mon, 28 Jul 2025 15:29:23 +0200
+Subject: mptcp: reset fallback status gracefully at disconnect() time
+To: mptcp@lists.linux.dev, stable@vger.kernel.org, gregkh@linuxfoundation.org
+Cc: Paolo Abeni <pabeni@redhat.com>, sashal@kernel.org, "Matthieu Baerts (NGI0)" <matttbe@kernel.org>, Jakub Kicinski <kuba@kernel.org>
+Message-ID: <20250728132919.3904847-8-matttbe@kernel.org>
+
+From: Paolo Abeni <pabeni@redhat.com>
+
+commit da9b2fc7b73d147d88abe1922de5ab72d72d7756 upstream.
+
+mptcp_disconnect() clears the fallback bit unconditionally, without
+touching the associated flags.
+
+The bit clear is safe, as no fallback operation can race with that --
+all subflows are already in TCP_CLOSE status thanks to the previous
+FASTCLOSE -- but we need to consistently reset all the fallback-related
+status.
+
+Also acquire the relevant lock, to avoid fouling static analyzers.
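+
+As a stand-alone, purely illustrative user-space sketch of that reset
+(all names are made up, only the pattern matters): every fallback
+related field is re-initialized under the same lock that normally
+guards it, even though nothing can race here.
+
+	#include <pthread.h>
+	#include <stdbool.h>
+
+	struct fake_msk {
+		pthread_mutex_t fallback_lock;
+		bool allow_subflows;
+		bool allow_infinite_fallback;
+		unsigned long flags;
+	};
+
+	/* Disconnect-time reset: no concurrent writer can exist at this
+	 * point, but taking the lock keeps every write to these fields
+	 * uniformly serialized and easy to reason about. */
+	static void reset_on_disconnect(struct fake_msk *msk)
+	{
+		pthread_mutex_lock(&msk->fallback_lock);
+		msk->allow_subflows = true;
+		msk->allow_infinite_fallback = true;
+		msk->flags = 0;
+		pthread_mutex_unlock(&msk->fallback_lock);
+	}
+
+	int main(void)
+	{
+		struct fake_msk msk = {
+			.fallback_lock = PTHREAD_MUTEX_INITIALIZER,
+		};
+
+		reset_on_disconnect(&msk);
+		return 0;
+	}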
+
+Fixes: b29fcfb54cd7 ("mptcp: full disconnect implementation")
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Reviewed-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
+Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
+Link: https://patch.msgid.link/20250714-net-mptcp-fallback-races-v1-3-391aff963322@kernel.org
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+[ Conflicts in protocol.c, because commit ebc1e08f01eb ("mptcp: drop
+ last_snd and MPTCP_RESET_SCHEDULER") is not in this version and
+ changed the context. The same modification can still be applied at the
+ same place. ]
+Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/mptcp/protocol.c | 9 +++++++++
+ 1 file changed, 9 insertions(+)
+
+--- a/net/mptcp/protocol.c
++++ b/net/mptcp/protocol.c
+@@ -3204,7 +3204,16 @@ static int mptcp_disconnect(struct sock
+ */
+ mptcp_destroy_common(msk, MPTCP_CF_FASTCLOSE);
+ msk->last_snd = NULL;
++
++ /* The first subflow is already in TCP_CLOSE status, the following
++ * can't overlap with a fallback anymore
++ */
++ spin_lock_bh(&msk->fallback_lock);
++ msk->allow_subflows = true;
++ msk->allow_infinite_fallback = true;
+ WRITE_ONCE(msk->flags, 0);
++ spin_unlock_bh(&msk->fallback_lock);
++
+ msk->cb_flags = 0;
+ msk->recovery = false;
+ msk->can_ack = false;
--- /dev/null
+From stable+bounces-165167-greg=kroah.com@vger.kernel.org Wed Jul 30 03:54:17 2025
+From: "Isaac J. Manjarres" <isaacmanjarres@google.com>
+Date: Tue, 29 Jul 2025 18:52:43 -0700
+Subject: selftests/memfd: add test for mapping write-sealed memfd read-only
+To: lorenzo.stoakes@oracle.com, gregkh@linuxfoundation.org, Shuah Khan <shuah@kernel.org>
+Cc: aliceryhl@google.com, surenb@google.com, stable@vger.kernel.org, "Isaac J. Manjarres" <isaacmanjarres@google.com>, kernel-team@android.com, Jann Horn <jannh@google.com>, Julian Orth <ju.orth@gmail.com>, "Liam R. Howlett" <Liam.Howlett@Oracle.com>, Linus Torvalds <torvalds@linux-foundation.org>, Vlastimil Babka <vbabka@suse.cz>, Andrew Morton <akpm@linux-foundation.org>, linux-kselftest@vger.kernel.org, linux-kernel@vger.kernel.org
+Message-ID: <20250730015247.30827-5-isaacmanjarres@google.com>
+
+From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
+
+[ Upstream commit ea0916e01d0b0f2cce1369ac1494239a79827270 ]
+
+Now that we have reinstated the ability to map F_SEAL_WRITE mappings
+read-only, assert that we are able to do this in a test to ensure that
+we do not regress this again.
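+
+Outside the selftest harness, the behaviour being asserted can be
+reproduced with a short stand-alone user-space program (illustrative
+only, not part of the test; it needs a kernel carrying the reinstating
+fix for the mmap() to succeed):
+
+	#define _GNU_SOURCE
+	#include <fcntl.h>
+	#include <stdio.h>
+	#include <sys/mman.h>
+	#include <unistd.h>
+
+	int main(void)
+	{
+		int fd = memfd_create("demo", MFD_CLOEXEC | MFD_ALLOW_SEALING);
+		void *p;
+
+		if (fd < 0 || ftruncate(fd, 4096) < 0)
+			return 1;
+
+		/* Seal against any future write, then map read-only: the
+		 * MAP_SHARED mapping must still be allowed. */
+		if (fcntl(fd, F_ADD_SEALS, F_SEAL_WRITE) < 0)
+			return 1;
+
+		p = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 0);
+		printf("read-only MAP_SHARED mmap: %s\n",
+		       p == MAP_FAILED ? "failed" : "ok");
+		return p == MAP_FAILED;
+	}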
+
+Link: https://lkml.kernel.org/r/a6377ec470b14c0539b4600cf8fa24bf2e4858ae.1732804776.git.lorenzo.stoakes@oracle.com
+Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
+Cc: Jann Horn <jannh@google.com>
+Cc: Julian Orth <ju.orth@gmail.com>
+Cc: Liam R. Howlett <Liam.Howlett@Oracle.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Shuah Khan <shuah@kernel.org>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Cc: stable@vger.kernel.org
+Signed-off-by: Isaac J. Manjarres <isaacmanjarres@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ tools/testing/selftests/memfd/memfd_test.c | 43 +++++++++++++++++++++++++++++
+ 1 file changed, 43 insertions(+)
+
+--- a/tools/testing/selftests/memfd/memfd_test.c
++++ b/tools/testing/selftests/memfd/memfd_test.c
+@@ -186,6 +186,24 @@ static void *mfd_assert_mmap_shared(int
+ return p;
+ }
+
++static void *mfd_assert_mmap_read_shared(int fd)
++{
++ void *p;
++
++ p = mmap(NULL,
++ mfd_def_size,
++ PROT_READ,
++ MAP_SHARED,
++ fd,
++ 0);
++ if (p == MAP_FAILED) {
++ printf("mmap() failed: %m\n");
++ abort();
++ }
++
++ return p;
++}
++
+ static void *mfd_assert_mmap_private(int fd)
+ {
+ void *p;
+@@ -802,6 +820,30 @@ static void test_seal_future_write(void)
+ close(fd);
+ }
+
++static void test_seal_write_map_read_shared(void)
++{
++ int fd;
++ void *p;
++
++ printf("%s SEAL-WRITE-MAP-READ\n", memfd_str);
++
++ fd = mfd_assert_new("kern_memfd_seal_write_map_read",
++ mfd_def_size,
++ MFD_CLOEXEC | MFD_ALLOW_SEALING);
++
++ mfd_assert_add_seals(fd, F_SEAL_WRITE);
++ mfd_assert_has_seals(fd, F_SEAL_WRITE);
++
++ p = mfd_assert_mmap_read_shared(fd);
++
++ mfd_assert_read(fd);
++ mfd_assert_read_shared(fd);
++ mfd_fail_write(fd);
++
++ munmap(p, mfd_def_size);
++ close(fd);
++}
++
+ /*
+ * Test SEAL_SHRINK
+ * Test whether SEAL_SHRINK actually prevents shrinking
+@@ -1056,6 +1098,7 @@ int main(int argc, char **argv)
+
+ test_seal_write();
+ test_seal_future_write();
++ test_seal_write_map_read_shared();
+ test_seal_shrink();
+ test_seal_grow();
+ test_seal_resize();
usb-dwc3-imx8mp-fix-device-leak-at-unbind.patch
ata-fix-sata_mobile_lpm_policy-description-in-kconfig.patch
btrfs-populate-otime-when-logging-an-inode-item.patch
+tls-separate-no-async-decryption-request-handling-from-async.patch
+crypto-qat-fix-ring-to-service-map-for-qat-gen4.patch
+arm64-cpufeatures-kvm-add-armv8.9-feat_ecbhb-bits-in-id_aa64mmfr1-register.patch
+kvm-x86-take-irqfds.lock-when-adding-deleting-irq-bypass-producer.patch
+mptcp-make-fallback-action-and-fallback-decision-atomic.patch
+mptcp-plug-races-between-subflow-fail-and-subflow-creation.patch
+mptcp-reset-fallback-status-gracefully-at-disconnect-time.patch
+mm-drop-the-assumption-that-vm_shared-always-implies-writable.patch
+mm-update-memfd-seal-write-check-to-include-f_seal_write.patch
+mm-reinstate-ability-to-map-write-sealed-memfd-mappings-read-only.patch
+selftests-memfd-add-test-for-mapping-write-sealed-memfd-read-only.patch
+bluetooth-hci_sync-fix-uaf-on-hci_abort_conn_sync.patch
+kbuild-userprogs-use-correct-linker-when-mixing-clang-and-gnu-ld.patch
+x86-reboot-harden-virtualization-hooks-for-emergency-reboot.patch
+x86-reboot-kvm-handle-vmxoff-in-kvm-s-reboot-callback.patch
+kvm-vmx-flush-shadow-vmcs-on-emergency-reboot.patch
--- /dev/null
+From 41532b785e9d79636b3815a64ddf6a096647d011 Mon Sep 17 00:00:00 2001
+From: Sabrina Dubroca <sd@queasysnail.net>
+Date: Wed, 28 Feb 2024 23:43:59 +0100
+Subject: tls: separate no-async decryption request handling from async
+
+From: Sabrina Dubroca <sd@queasysnail.net>
+
+commit 41532b785e9d79636b3815a64ddf6a096647d011 upstream.
+
+If we're not doing async, the handling is much simpler. There's no
+reference counting; we just need to wait for the completion to wake us
+up and return its result.
+
+We should preferably also use a separate crypto_wait. I'm not seeing a
+UAF as I did in the past; I think aec7961916f3 ("tls: fix race between
+async notify and socket close") took care of it.
+
+This will make the next fix easier.
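+
+For reference, the on-stack wait pattern the patch switches to looks
+like this in isolation. This is only a sketch of the generic crypto API
+usage, built around a skcipher rather than the TLS AEAD path, so the
+algorithm, buffer sizes and module boilerplate are assumptions, not
+tls_sw code:
+
+	#include <crypto/skcipher.h>
+	#include <linux/module.h>
+	#include <linux/scatterlist.h>
+	#include <linux/slab.h>
+
+	static int demo_sync_encrypt(void)
+	{
+		struct crypto_skcipher *tfm;
+		struct skcipher_request *req = NULL;
+		DECLARE_CRYPTO_WAIT(wait);	/* private wait, not a shared one */
+		struct scatterlist sg;
+		u8 key[16] = {}, iv[16] = {};
+		u8 *buf = NULL;
+		int ret;
+
+		tfm = crypto_alloc_skcipher("cbc(aes)", 0, 0);
+		if (IS_ERR(tfm))
+			return PTR_ERR(tfm);
+
+		ret = crypto_skcipher_setkey(tfm, key, sizeof(key));
+		if (ret)
+			goto out;
+
+		req = skcipher_request_alloc(tfm, GFP_KERNEL);
+		buf = kzalloc(16, GFP_KERNEL);
+		if (!req || !buf) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		sg_init_one(&sg, buf, 16);
+		skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
+					      crypto_req_done, &wait);
+		skcipher_request_set_crypt(req, &sg, &sg, 16, iv);
+
+		/* crypto_wait_req() sleeps on -EINPROGRESS/-EBUSY until the
+		 * completion fires and then returns the final status. */
+		ret = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
+
+	out:
+		kfree(buf);
+		skcipher_request_free(req);
+		crypto_free_skcipher(tfm);
+		return ret;
+	}
+
+	static int __init demo_init(void)
+	{
+		pr_info("demo sync encrypt: %d\n", demo_sync_encrypt());
+		return 0;
+	}
+	module_init(demo_init);
+
+	static void __exit demo_exit(void)
+	{
+	}
+	module_exit(demo_exit);
+
+	MODULE_LICENSE("GPL");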
+
+Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
+Link: https://lore.kernel.org/r/47bde5f649707610eaef9f0d679519966fc31061.1709132643.git.sd@queasysnail.net
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+[ William: The original patch did not apply cleanly due to deletions of
+ non-existent lines in 6.1.y. The UAF the author stopped seeing can still
+ be reproduced on systems without AVX in conjunction with cryptd.
+ Also removed an extraneous statement after a return statement that is
+ adjacent to the diff. ]
+Link: https://lore.kernel.org/netdev/he2K1yz_u7bZ-CnYcTSQ4OxuLuHZXN6xZRgp6_ICSWnq8J5FpI_uD1i_1lTSf7WMrYb5ThiX1OR2GTOB2IltgT49Koy7Hhutr4du4KtLvyk=@willsroot.io/
+Signed-off-by: William Liu <will@willsroot.io>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/tls/tls_sw.c | 9 +++++++--
+ 1 file changed, 7 insertions(+), 2 deletions(-)
+
+--- a/net/tls/tls_sw.c
++++ b/net/tls/tls_sw.c
+@@ -274,9 +274,15 @@ static int tls_do_decryption(struct sock
+ DEBUG_NET_WARN_ON_ONCE(atomic_read(&ctx->decrypt_pending) < 1);
+ atomic_inc(&ctx->decrypt_pending);
+ } else {
++ DECLARE_CRYPTO_WAIT(wait);
++
+ aead_request_set_callback(aead_req,
+ CRYPTO_TFM_REQ_MAY_BACKLOG,
+- crypto_req_done, &ctx->async_wait);
++ crypto_req_done, &wait);
++ ret = crypto_aead_decrypt(aead_req);
++ if (ret == -EINPROGRESS || ret == -EBUSY)
++ ret = crypto_wait_req(ret, &wait);
++ return ret;
+ }
+
+ ret = crypto_aead_decrypt(aead_req);
+@@ -289,7 +295,6 @@ static int tls_do_decryption(struct sock
+ /* all completions have run, we're not doing async anymore */
+ darg->async = false;
+ return ret;
+- ret = ret ?: -EINPROGRESS;
+ }
+
+ atomic_dec(&ctx->decrypt_pending);
--- /dev/null
+From stable+bounces-164647-greg=kroah.com@vger.kernel.org Thu Jul 24 19:07:39 2025
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 24 Jul 2025 13:07:23 -0400
+Subject: x86/reboot: Harden virtualization hooks for emergency reboot
+To: stable@vger.kernel.org
+Cc: Sean Christopherson <seanjc@google.com>, Kai Huang <kai.huang@intel.com>, Sasha Levin <sashal@kernel.org>
+Message-ID: <20250724170725.1404455-1-sashal@kernel.org>
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit 5e408396c60cd0f0b53a43713016b6d6af8d69e0 ]
+
+Provide dedicated helpers to (un)register virt hooks used during an
+emergency crash/reboot, and WARN if there is an attempt to overwrite
+the registered callback, or an attempt to do an unpaired unregister.
+
+Opportunistically use rcu_assign_pointer() instead of RCU_INIT_POINTER(),
+mainly so that the set/unset paths are more symmetrical, but also because
+any performance gains from using RCU_INIT_POINTER() are meaningless for
+this code.
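+
+A hypothetical out-of-tree, GPL-licensed user of the new helpers would
+pair the calls like this; it is only a sketch of the intended usage,
+not code taken from the patch:
+
+	#include <linux/module.h>
+	#include <asm/reboot.h>
+
+	/* Runs from the emergency reboot/crash path, possibly from NMI
+	 * context, so it may only tear down per-CPU virt state. */
+	static void demo_emergency_disable(void)
+	{
+	}
+
+	static int __init demo_init(void)
+	{
+		/* WARNs and refuses if another callback is already set. */
+		cpu_emergency_register_virt_callback(demo_emergency_disable);
+		return 0;
+	}
+	module_init(demo_init);
+
+	static void __exit demo_exit(void)
+	{
+		/* Must pass the same pointer, or it WARNs and bails out. */
+		cpu_emergency_unregister_virt_callback(demo_emergency_disable);
+	}
+	module_exit(demo_exit);
+
+	MODULE_LICENSE("GPL");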
+
+Reviewed-by: Kai Huang <kai.huang@intel.com>
+Link: https://lore.kernel.org/r/20230721201859.2307736-3-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Stable-dep-of: a0ee1d5faff1 ("KVM: VMX: Flush shadow VMCS on emergency reboot")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/include/asm/reboot.h | 5 +++--
+ arch/x86/kernel/reboot.c | 30 ++++++++++++++++++++++++------
+ arch/x86/kvm/vmx/vmx.c | 6 ++----
+ 3 files changed, 29 insertions(+), 12 deletions(-)
+
+--- a/arch/x86/include/asm/reboot.h
++++ b/arch/x86/include/asm/reboot.h
+@@ -25,8 +25,9 @@ void __noreturn machine_real_restart(uns
+ #define MRR_BIOS 0
+ #define MRR_APM 1
+
+-typedef void crash_vmclear_fn(void);
+-extern crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss;
++typedef void (cpu_emergency_virt_cb)(void);
++void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback);
++void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback);
+ void cpu_emergency_disable_virtualization(void);
+
+ typedef void (*nmi_shootdown_cb)(int, struct pt_regs*);
+--- a/arch/x86/kernel/reboot.c
++++ b/arch/x86/kernel/reboot.c
+@@ -794,17 +794,35 @@ void machine_crash_shutdown(struct pt_re
+ *
+ * protected by rcu.
+ */
+-crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss;
+-EXPORT_SYMBOL_GPL(crash_vmclear_loaded_vmcss);
++static cpu_emergency_virt_cb __rcu *cpu_emergency_virt_callback;
++
++void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback)
++{
++ if (WARN_ON_ONCE(rcu_access_pointer(cpu_emergency_virt_callback)))
++ return;
++
++ rcu_assign_pointer(cpu_emergency_virt_callback, callback);
++}
++EXPORT_SYMBOL_GPL(cpu_emergency_register_virt_callback);
++
++void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback)
++{
++ if (WARN_ON_ONCE(rcu_access_pointer(cpu_emergency_virt_callback) != callback))
++ return;
++
++ rcu_assign_pointer(cpu_emergency_virt_callback, NULL);
++ synchronize_rcu();
++}
++EXPORT_SYMBOL_GPL(cpu_emergency_unregister_virt_callback);
+
+ static inline void cpu_crash_vmclear_loaded_vmcss(void)
+ {
+- crash_vmclear_fn *do_vmclear_operation = NULL;
++ cpu_emergency_virt_cb *callback;
+
+ rcu_read_lock();
+- do_vmclear_operation = rcu_dereference(crash_vmclear_loaded_vmcss);
+- if (do_vmclear_operation)
+- do_vmclear_operation();
++ callback = rcu_dereference(cpu_emergency_virt_callback);
++ if (callback)
++ callback();
+ rcu_read_unlock();
+ }
+
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -8602,8 +8602,7 @@ static void __vmx_exit(void)
+ {
+ allow_smaller_maxphyaddr = false;
+
+- RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
+- synchronize_rcu();
++ cpu_emergency_unregister_virt_callback(crash_vmclear_local_loaded_vmcss);
+
+ vmx_cleanup_l1d_flush();
+ }
+@@ -8677,8 +8676,7 @@ static int __init vmx_init(void)
+ pi_init_cpu(cpu);
+ }
+
+- rcu_assign_pointer(crash_vmclear_loaded_vmcss,
+- crash_vmclear_local_loaded_vmcss);
++ cpu_emergency_register_virt_callback(crash_vmclear_local_loaded_vmcss);
+
+ vmx_check_vmcs12_offsets();
+
--- /dev/null
+From stable+bounces-164648-greg=kroah.com@vger.kernel.org Thu Jul 24 19:07:43 2025
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 24 Jul 2025 13:07:24 -0400
+Subject: x86/reboot: KVM: Handle VMXOFF in KVM's reboot callback
+To: stable@vger.kernel.org
+Cc: Sean Christopherson <seanjc@google.com>, Kai Huang <kai.huang@intel.com>, Sasha Levin <sashal@kernel.org>
+Message-ID: <20250724170725.1404455-2-sashal@kernel.org>
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit 119b5cb4ffd0166f3e98e9ee042f5046f7744f28 ]
+
+Use KVM VMX's reboot/crash callback to do VMXOFF in an emergency instead
+of manually and blindly doing VMXOFF. There's no need to attempt VMXOFF
+if a hypervisor, i.e. KVM, isn't loaded/active, i.e. if the CPU can't
+possibly be post-VMXON.
+
+Reviewed-by: Kai Huang <kai.huang@intel.com>
+Link: https://lore.kernel.org/r/20230721201859.2307736-4-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Stable-dep-of: a0ee1d5faff1 ("KVM: VMX: Flush shadow VMCS on emergency reboot")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/include/asm/virtext.h | 10 ----------
+ arch/x86/kernel/reboot.c | 29 +++++++++--------------------
+ arch/x86/kvm/vmx/vmx.c | 8 +++++---
+ 3 files changed, 14 insertions(+), 33 deletions(-)
+
+--- a/arch/x86/include/asm/virtext.h
++++ b/arch/x86/include/asm/virtext.h
+@@ -70,16 +70,6 @@ static inline void __cpu_emergency_vmxof
+ cpu_vmxoff();
+ }
+
+-/** Disable VMX if it is supported and enabled on the current CPU
+- */
+-static inline void cpu_emergency_vmxoff(void)
+-{
+- if (cpu_has_vmx())
+- __cpu_emergency_vmxoff();
+-}
+-
+-
+-
+
+ /*
+ * SVM functions:
+--- a/arch/x86/kernel/reboot.c
++++ b/arch/x86/kernel/reboot.c
+@@ -787,13 +787,7 @@ void machine_crash_shutdown(struct pt_re
+ }
+ #endif
+
+-/*
+- * This is used to VMCLEAR all VMCSs loaded on the
+- * processor. And when loading kvm_intel module, the
+- * callback function pointer will be assigned.
+- *
+- * protected by rcu.
+- */
++/* RCU-protected callback to disable virtualization prior to reboot. */
+ static cpu_emergency_virt_cb __rcu *cpu_emergency_virt_callback;
+
+ void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback)
+@@ -815,17 +809,6 @@ void cpu_emergency_unregister_virt_callb
+ }
+ EXPORT_SYMBOL_GPL(cpu_emergency_unregister_virt_callback);
+
+-static inline void cpu_crash_vmclear_loaded_vmcss(void)
+-{
+- cpu_emergency_virt_cb *callback;
+-
+- rcu_read_lock();
+- callback = rcu_dereference(cpu_emergency_virt_callback);
+- if (callback)
+- callback();
+- rcu_read_unlock();
+-}
+-
+ /* This is the CPU performing the emergency shutdown work. */
+ int crashing_cpu = -1;
+
+@@ -836,9 +819,15 @@ int crashing_cpu = -1;
+ */
+ void cpu_emergency_disable_virtualization(void)
+ {
+- cpu_crash_vmclear_loaded_vmcss();
++ cpu_emergency_virt_cb *callback;
++
++ rcu_read_lock();
++ callback = rcu_dereference(cpu_emergency_virt_callback);
++ if (callback)
++ callback();
++ rcu_read_unlock();
+
+- cpu_emergency_vmxoff();
++ /* KVM_AMD doesn't yet utilize the common callback. */
+ cpu_emergency_svm_disable();
+ }
+
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -707,7 +707,7 @@ static int vmx_set_guest_uret_msr(struct
+ return ret;
+ }
+
+-static void crash_vmclear_local_loaded_vmcss(void)
++static void vmx_emergency_disable(void)
+ {
+ int cpu = raw_smp_processor_id();
+ struct loaded_vmcs *v;
+@@ -715,6 +715,8 @@ static void crash_vmclear_local_loaded_v
+ list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
+ loaded_vmcss_on_cpu_link)
+ vmcs_clear(v->vmcs);
++
++ __cpu_emergency_vmxoff();
+ }
+
+ static void __loaded_vmcs_clear(void *arg)
+@@ -8602,7 +8604,7 @@ static void __vmx_exit(void)
+ {
+ allow_smaller_maxphyaddr = false;
+
+- cpu_emergency_unregister_virt_callback(crash_vmclear_local_loaded_vmcss);
++ cpu_emergency_unregister_virt_callback(vmx_emergency_disable);
+
+ vmx_cleanup_l1d_flush();
+ }
+@@ -8676,7 +8678,7 @@ static int __init vmx_init(void)
+ pi_init_cpu(cpu);
+ }
+
+- cpu_emergency_register_virt_callback(crash_vmclear_local_loaded_vmcss);
++ cpu_emergency_register_virt_callback(vmx_emergency_disable);
+
+ vmx_check_vmcs12_offsets();
+