From: Greg Kroah-Hartman Date: Fri, 22 Aug 2025 13:10:58 +0000 (+0200) Subject: 6.1-stable patches X-Git-Tag: v6.16.3~46 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=0de94e2d5522171137800cab5816e920a149e9f5;p=thirdparty%2Fkernel%2Fstable-queue.git 6.1-stable patches added patches: arm64-cpufeatures-kvm-add-armv8.9-feat_ecbhb-bits-in-id_aa64mmfr1-register.patch bluetooth-hci_sync-fix-uaf-on-hci_abort_conn_sync.patch crypto-qat-fix-ring-to-service-map-for-qat-gen4.patch kbuild-userprogs-use-correct-linker-when-mixing-clang-and-gnu-ld.patch kvm-vmx-flush-shadow-vmcs-on-emergency-reboot.patch kvm-x86-take-irqfds.lock-when-adding-deleting-irq-bypass-producer.patch mm-drop-the-assumption-that-vm_shared-always-implies-writable.patch mm-reinstate-ability-to-map-write-sealed-memfd-mappings-read-only.patch mm-update-memfd-seal-write-check-to-include-f_seal_write.patch mptcp-make-fallback-action-and-fallback-decision-atomic.patch mptcp-plug-races-between-subflow-fail-and-subflow-creation.patch mptcp-reset-fallback-status-gracefully-at-disconnect-time.patch selftests-memfd-add-test-for-mapping-write-sealed-memfd-read-only.patch tls-separate-no-async-decryption-request-handling-from-async.patch x86-reboot-harden-virtualization-hooks-for-emergency-reboot.patch x86-reboot-kvm-handle-vmxoff-in-kvm-s-reboot-callback.patch --- diff --git a/queue-6.1/arm64-cpufeatures-kvm-add-armv8.9-feat_ecbhb-bits-in-id_aa64mmfr1-register.patch b/queue-6.1/arm64-cpufeatures-kvm-add-armv8.9-feat_ecbhb-bits-in-id_aa64mmfr1-register.patch new file mode 100644 index 0000000000..b4d2737ea0 --- /dev/null +++ b/queue-6.1/arm64-cpufeatures-kvm-add-armv8.9-feat_ecbhb-bits-in-id_aa64mmfr1-register.patch @@ -0,0 +1,37 @@ +From e8cde32f111f7f5681a7bad3ec747e9e697569a9 Mon Sep 17 00:00:00 2001 +From: Nianyao Tang +Date: Tue, 11 Jun 2024 12:20:49 +0000 +Subject: arm64/cpufeatures/kvm: Add ARMv8.9 FEAT_ECBHB bits in ID_AA64MMFR1 register + +From: Nianyao Tang + +commit 
e8cde32f111f7f5681a7bad3ec747e9e697569a9 upstream. + +Enable ECBHB bits in ID_AA64MMFR1 register as per ARM DDI 0487K.a +specification. + +When guest OS read ID_AA64MMFR1_EL1, kvm emulate this reg using +ftr_id_aa64mmfr1 and always return ID_AA64MMFR1_EL1.ECBHB=0 to guest. +It results in guest syscall jump to tramp ventry, which is not needed +in implementation with ID_AA64MMFR1_EL1.ECBHB=1. +Let's make the guest syscall process the same as the host. + +Signed-off-by: Nianyao Tang +Link: https://lore.kernel.org/r/20240611122049.2758600-1-tangnianyao@huawei.com +Signed-off-by: Catalin Marinas +Signed-off-by: Patrick Roy +Signed-off-by: Greg Kroah-Hartman +--- + arch/arm64/kernel/cpufeature.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/arch/arm64/kernel/cpufeature.c ++++ b/arch/arm64/kernel/cpufeature.c +@@ -343,6 +343,7 @@ static const struct arm64_ftr_bits ftr_i + }; + + static const struct arm64_ftr_bits ftr_id_aa64mmfr1[] = { ++ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_ECBHB_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_TIDCP1_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_AFP_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_ETS_SHIFT, 4, 0), diff --git a/queue-6.1/bluetooth-hci_sync-fix-uaf-on-hci_abort_conn_sync.patch b/queue-6.1/bluetooth-hci_sync-fix-uaf-on-hci_abort_conn_sync.patch new file mode 100644 index 0000000000..9a60af9abe --- /dev/null +++ b/queue-6.1/bluetooth-hci_sync-fix-uaf-on-hci_abort_conn_sync.patch @@ -0,0 +1,105 @@ +From stable+bounces-167094-greg=kroah.com@vger.kernel.org Tue Aug 12 04:16:01 2025 +From: Sumanth Gavini +Date: Mon, 11 Aug 2025 20:34:55 -0500 +Subject: Bluetooth: hci_sync: Fix UAF on hci_abort_conn_sync +To: marcel@holtmann.org, johan.hedberg@gmail.com, luiz.dentz@gmail.com, davem@davemloft.net, edumazet@google.com, kuba@kernel.org, pabeni@redhat.com +Cc: Sumanth 
Gavini , linux-bluetooth@vger.kernel.org, netdev@vger.kernel.org, linux-kernel@vger.kernel.org, stable@vger.kernel.org, Luiz Augusto von Dentz +Message-ID: <20250812013457.425332-1-sumanth.gavini@yahoo.com> + +From: Sumanth Gavini + +commit 5af1f84ed13a416297ab9ced7537f4d5ae7f329a upstream. + +Connections may be cleanup while waiting for the commands to complete so +this attempts to check if the connection handle remains valid in case of +errors that would lead to call hci_conn_failed: + +BUG: KASAN: slab-use-after-free in hci_conn_failed+0x1f/0x160 +Read of size 8 at addr ffff888001376958 by task kworker/u3:0/52 + +CPU: 0 PID: 52 Comm: kworker/u3:0 Not tainted +6.5.0-rc1-00527-g2dfe76d58d3a #5615 +Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS +1.16.2-1.fc38 04/01/2014 +Workqueue: hci0 hci_cmd_sync_work +Call Trace: + + dump_stack_lvl+0x1d/0x70 + print_report+0xce/0x620 + ? __virt_addr_valid+0xd4/0x150 + ? hci_conn_failed+0x1f/0x160 + kasan_report+0xd1/0x100 + ? hci_conn_failed+0x1f/0x160 + hci_conn_failed+0x1f/0x160 + hci_abort_conn_sync+0x237/0x360 + +Signed-off-by: Luiz Augusto von Dentz +Signed-off-by: Sumanth Gavini +Signed-off-by: Greg Kroah-Hartman +--- + net/bluetooth/hci_sync.c | 43 +++++++++++++++++++++++++++++-------------- + 1 file changed, 29 insertions(+), 14 deletions(-) + +--- a/net/bluetooth/hci_sync.c ++++ b/net/bluetooth/hci_sync.c +@@ -5525,31 +5525,46 @@ static int hci_reject_conn_sync(struct h + + int hci_abort_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 reason) + { +- int err; ++ int err = 0; ++ u16 handle = conn->handle; + + switch (conn->state) { + case BT_CONNECTED: + case BT_CONFIG: +- return hci_disconnect_sync(hdev, conn, reason); ++ err = hci_disconnect_sync(hdev, conn, reason); ++ break; + case BT_CONNECT: + err = hci_connect_cancel_sync(hdev, conn); +- /* Cleanup hci_conn object if it cannot be cancelled as it +- * likelly means the controller and host stack are out of sync. 
+- */ +- if (err) { +- hci_dev_lock(hdev); +- hci_conn_failed(conn, err); +- hci_dev_unlock(hdev); +- } +- return err; ++ break; + case BT_CONNECT2: +- return hci_reject_conn_sync(hdev, conn, reason); ++ err = hci_reject_conn_sync(hdev, conn, reason); ++ break; + default: + conn->state = BT_CLOSED; +- break; ++ return 0; ++ } ++ ++ /* Cleanup hci_conn object if it cannot be cancelled as it ++ * likelly means the controller and host stack are out of sync ++ * or in case of LE it was still scanning so it can be cleanup ++ * safely. ++ */ ++ if (err) { ++ struct hci_conn *c; ++ ++ /* Check if the connection hasn't been cleanup while waiting ++ * commands to complete. ++ */ ++ c = hci_conn_hash_lookup_handle(hdev, handle); ++ if (!c || c != conn) ++ return 0; ++ ++ hci_dev_lock(hdev); ++ hci_conn_failed(conn, err); ++ hci_dev_unlock(hdev); + } + +- return 0; ++ return err; + } + + static int hci_disconnect_all_sync(struct hci_dev *hdev, u8 reason) diff --git a/queue-6.1/crypto-qat-fix-ring-to-service-map-for-qat-gen4.patch b/queue-6.1/crypto-qat-fix-ring-to-service-map-for-qat-gen4.patch new file mode 100644 index 0000000000..c691b40f76 --- /dev/null +++ b/queue-6.1/crypto-qat-fix-ring-to-service-map-for-qat-gen4.patch @@ -0,0 +1,104 @@ +From a238487f7965d102794ed9f8aff0b667cd2ae886 Mon Sep 17 00:00:00 2001 +From: Giovanni Cabiddu +Date: Fri, 20 Oct 2023 15:49:23 +0200 +Subject: crypto: qat - fix ring to service map for QAT GEN4 + +From: Giovanni Cabiddu + +commit a238487f7965d102794ed9f8aff0b667cd2ae886 upstream. + +The 4xxx drivers hardcode the ring to service mapping. However, when +additional configurations where added to the driver, the mappings were +not updated. This implies that an incorrect mapping might be reported +through pfvf for certain configurations. + +Add an algorithm that computes the correct ring to service mapping based +on the firmware loaded on the device. 
+ +Fixes: 0cec19c761e5 ("crypto: qat - add support for compression for 4xxx") +Signed-off-by: Giovanni Cabiddu +Reviewed-by: Damian Muszynski +Reviewed-by: Tero Kristo +Signed-off-by: Herbert Xu +[Giovanni: backport to 6.1.y, conflict resolved simplifying the logic +in the function get_ring_to_svc_map() as the QAT driver in v6.1 supports +only limited configurations (crypto only and compression). Differs from +upstream as the ring to service mapping is hardcoded rather than being +dynamically computed.] +Reviewed-by: Ahsan Atta +Tested-by: Ahsan Atta +Signed-off-by: Giovanni Cabiddu +Signed-off-by: Greg Kroah-Hartman +--- + drivers/crypto/qat/qat_4xxx/adf_4xxx_hw_data.c | 13 +++++++++++++ + drivers/crypto/qat/qat_common/adf_accel_devices.h | 1 + + drivers/crypto/qat/qat_common/adf_gen4_hw_data.h | 6 ++++++ + drivers/crypto/qat/qat_common/adf_init.c | 3 +++ + 4 files changed, 23 insertions(+) + +--- a/drivers/crypto/qat/qat_4xxx/adf_4xxx_hw_data.c ++++ b/drivers/crypto/qat/qat_4xxx/adf_4xxx_hw_data.c +@@ -297,6 +297,18 @@ static char *uof_get_name(struct adf_acc + return NULL; + } + ++static u16 get_ring_to_svc_map(struct adf_accel_dev *accel_dev) ++{ ++ switch (get_service_enabled(accel_dev)) { ++ case SVC_CY: ++ return ADF_GEN4_DEFAULT_RING_TO_SRV_MAP; ++ case SVC_DC: ++ return ADF_GEN4_DEFAULT_RING_TO_SRV_MAP_DC; ++ } ++ ++ return 0; ++} ++ + static u32 uof_get_ae_mask(struct adf_accel_dev *accel_dev, u32 obj_num) + { + switch (get_service_enabled(accel_dev)) { +@@ -353,6 +365,7 @@ void adf_init_hw_data_4xxx(struct adf_hw + hw_data->uof_get_ae_mask = uof_get_ae_mask; + hw_data->set_msix_rttable = set_msix_default_rttable; + hw_data->set_ssm_wdtimer = adf_gen4_set_ssm_wdtimer; ++ hw_data->get_ring_to_svc_map = get_ring_to_svc_map; + hw_data->disable_iov = adf_disable_sriov; + hw_data->ring_pair_reset = adf_gen4_ring_pair_reset; + hw_data->enable_pm = adf_gen4_enable_pm; +--- a/drivers/crypto/qat/qat_common/adf_accel_devices.h ++++ 
b/drivers/crypto/qat/qat_common/adf_accel_devices.h +@@ -176,6 +176,7 @@ struct adf_hw_device_data { + void (*get_arb_info)(struct arb_info *arb_csrs_info); + void (*get_admin_info)(struct admin_info *admin_csrs_info); + enum dev_sku_info (*get_sku)(struct adf_hw_device_data *self); ++ u16 (*get_ring_to_svc_map)(struct adf_accel_dev *accel_dev); + int (*alloc_irq)(struct adf_accel_dev *accel_dev); + void (*free_irq)(struct adf_accel_dev *accel_dev); + void (*enable_error_correction)(struct adf_accel_dev *accel_dev); +--- a/drivers/crypto/qat/qat_common/adf_gen4_hw_data.h ++++ b/drivers/crypto/qat/qat_common/adf_gen4_hw_data.h +@@ -95,6 +95,12 @@ do { \ + ADF_RING_BUNDLE_SIZE * (bank) + \ + ADF_RING_CSR_RING_SRV_ARB_EN, (value)) + ++#define ADF_GEN4_DEFAULT_RING_TO_SRV_MAP_DC \ ++ (COMP << ADF_CFG_SERV_RING_PAIR_0_SHIFT | \ ++ COMP << ADF_CFG_SERV_RING_PAIR_1_SHIFT | \ ++ COMP << ADF_CFG_SERV_RING_PAIR_2_SHIFT | \ ++ COMP << ADF_CFG_SERV_RING_PAIR_3_SHIFT) ++ + /* Default ring mapping */ + #define ADF_GEN4_DEFAULT_RING_TO_SRV_MAP \ + (ASYM << ADF_CFG_SERV_RING_PAIR_0_SHIFT | \ +--- a/drivers/crypto/qat/qat_common/adf_init.c ++++ b/drivers/crypto/qat/qat_common/adf_init.c +@@ -95,6 +95,9 @@ int adf_dev_init(struct adf_accel_dev *a + return -EFAULT; + } + ++ if (hw_data->get_ring_to_svc_map) ++ hw_data->ring_to_svc_map = hw_data->get_ring_to_svc_map(accel_dev); ++ + if (adf_ae_init(accel_dev)) { + dev_err(&GET_DEV(accel_dev), + "Failed to initialise Acceleration Engine\n"); diff --git a/queue-6.1/kbuild-userprogs-use-correct-linker-when-mixing-clang-and-gnu-ld.patch b/queue-6.1/kbuild-userprogs-use-correct-linker-when-mixing-clang-and-gnu-ld.patch new file mode 100644 index 0000000000..ff4d3efb1f --- /dev/null +++ b/queue-6.1/kbuild-userprogs-use-correct-linker-when-mixing-clang-and-gnu-ld.patch @@ -0,0 +1,44 @@ +From nathan@kernel.org Thu Aug 21 20:30:59 2025 +From: Nathan Chancellor +Date: Thu, 21 Aug 2025 11:30:51 -0700 +Subject: kbuild: userprogs: use correct 
linker when mixing clang and GNU ld +To: gregkh@linuxfoundation.org, sashal@kernel.org +Cc: stable@vger.kernel.org, nathan@kernel.org, thomas.weissschuh@linutronix.de +Message-ID: <20250821183051.1259435-1-nathan@kernel.org> + +From: Thomas Weißschuh + +commit 936599ca514973d44a766b7376c6bbdc96b6a8cc upstream. + +The userprogs infrastructure does not expect clang being used with GNU ld +and in that case uses /usr/bin/ld for linking, not the configured $(LD). +This fallback is problematic as it will break when cross-compiling. +Mixing clang and GNU ld is used for example when building for SPARC64, +as ld.lld is not sufficient; see Documentation/kbuild/llvm.rst. + +Relax the check around --ld-path so it gets used for all linkers. + +Fixes: dfc1b168a8c4 ("kbuild: userprogs: use correct lld when linking through clang") +Cc: stable@vger.kernel.org +Signed-off-by: Thomas Weißschuh +Reviewed-by: Nathan Chancellor +Signed-off-by: Masahiro Yamada +[nathan: Work around wrapping '--ld-path' in cc-option in older stable + branches due to older minimum LLVM version] +Signed-off-by: Nathan Chancellor +Signed-off-by: Greg Kroah-Hartman +--- + Makefile | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/Makefile ++++ b/Makefile +@@ -1143,7 +1143,7 @@ KBUILD_USERCFLAGS += $(filter -m32 -m64 + KBUILD_USERLDFLAGS += $(filter -m32 -m64 --target=%, $(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS)) + + # userspace programs are linked via the compiler, use the correct linker +-ifeq ($(CONFIG_CC_IS_CLANG)$(CONFIG_LD_IS_LLD),yy) ++ifdef CONFIG_CC_IS_CLANG + KBUILD_USERLDFLAGS += $(call cc-option, --ld-path=$(LD)) + endif + diff --git a/queue-6.1/kvm-vmx-flush-shadow-vmcs-on-emergency-reboot.patch b/queue-6.1/kvm-vmx-flush-shadow-vmcs-on-emergency-reboot.patch new file mode 100644 index 0000000000..23b06971fa --- /dev/null +++ b/queue-6.1/kvm-vmx-flush-shadow-vmcs-on-emergency-reboot.patch @@ -0,0 +1,52 @@ +From stable+bounces-164649-greg=kroah.com@vger.kernel.org Thu Jul 24 19:07:43 2025 
+From: Sasha Levin +Date: Thu, 24 Jul 2025 13:07:25 -0400 +Subject: KVM: VMX: Flush shadow VMCS on emergency reboot +To: stable@vger.kernel.org +Cc: Chao Gao , Kai Huang , Sean Christopherson , Sasha Levin +Message-ID: <20250724170725.1404455-3-sashal@kernel.org> + +From: Chao Gao + +[ Upstream commit a0ee1d5faff135e28810f29e0f06328c66f89852 ] + +Ensure the shadow VMCS cache is evicted during an emergency reboot to +prevent potential memory corruption if the cache is evicted after reboot. + +This issue was identified through code inspection, as __loaded_vmcs_clear() +flushes both the normal VMCS and the shadow VMCS. + +Avoid checking the "launched" state during an emergency reboot, unlike the +behavior in __loaded_vmcs_clear(). This is important because reboot NMIs +can interfere with operations like copy_shadow_to_vmcs12(), where shadow +VMCSes are loaded directly using VMPTRLD. In such cases, if NMIs occur +right after the VMCS load, the shadow VMCSes will be active but the +"launched" state may not be set. 
+ +Fixes: 16f5b9034b69 ("KVM: nVMX: Copy processor-specific shadow-vmcs to VMCS12") +Cc: stable@vger.kernel.org +Signed-off-by: Chao Gao +Reviewed-by: Kai Huang +Link: https://lore.kernel.org/r/20250324140849.2099723-1-chao.gao@intel.com +Signed-off-by: Sean Christopherson +Signed-off-by: Sasha Levin +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/vmx/vmx.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +--- a/arch/x86/kvm/vmx/vmx.c ++++ b/arch/x86/kvm/vmx/vmx.c +@@ -713,8 +713,11 @@ static void vmx_emergency_disable(void) + struct loaded_vmcs *v; + + list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu), +- loaded_vmcss_on_cpu_link) ++ loaded_vmcss_on_cpu_link) { + vmcs_clear(v->vmcs); ++ if (v->shadow_vmcs) ++ vmcs_clear(v->shadow_vmcs); ++ } + + __cpu_emergency_vmxoff(); + } diff --git a/queue-6.1/kvm-x86-take-irqfds.lock-when-adding-deleting-irq-bypass-producer.patch b/queue-6.1/kvm-x86-take-irqfds.lock-when-adding-deleting-irq-bypass-producer.patch new file mode 100644 index 0000000000..d6e01fc67b --- /dev/null +++ b/queue-6.1/kvm-x86-take-irqfds.lock-when-adding-deleting-irq-bypass-producer.patch @@ -0,0 +1,82 @@ +From f1fb088d9cecde5c3066d8ff8846789667519b7d Mon Sep 17 00:00:00 2001 +From: Sean Christopherson +Date: Fri, 4 Apr 2025 12:38:19 -0700 +Subject: KVM: x86: Take irqfds.lock when adding/deleting IRQ bypass producer + +From: Sean Christopherson + +commit f1fb088d9cecde5c3066d8ff8846789667519b7d upstream. + +Take irqfds.lock when adding/deleting an IRQ bypass producer to ensure +irqfd->producer isn't modified while kvm_irq_routing_update() is running. +The only lock held when a producer is added/removed is irqbypass's mutex. 
+ +Fixes: 872768800652 ("KVM: x86: select IRQ_BYPASS_MANAGER") +Cc: stable@vger.kernel.org +Signed-off-by: Sean Christopherson +Message-ID: <20250404193923.1413163-5-seanjc@google.com> +Signed-off-by: Paolo Bonzini +[sean: account for lack of kvm_x86_call()] +Signed-off-by: Sean Christopherson +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/x86.c | 19 ++++++++++++++++--- + 1 file changed, 16 insertions(+), 3 deletions(-) + +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -13387,16 +13387,22 @@ int kvm_arch_irq_bypass_add_producer(str + { + struct kvm_kernel_irqfd *irqfd = + container_of(cons, struct kvm_kernel_irqfd, consumer); ++ struct kvm *kvm = irqfd->kvm; + int ret; + +- irqfd->producer = prod; + kvm_arch_start_assignment(irqfd->kvm); ++ ++ spin_lock_irq(&kvm->irqfds.lock); ++ irqfd->producer = prod; ++ + ret = static_call(kvm_x86_pi_update_irte)(irqfd->kvm, + prod->irq, irqfd->gsi, 1); +- + if (ret) + kvm_arch_end_assignment(irqfd->kvm); + ++ spin_unlock_irq(&kvm->irqfds.lock); ++ ++ + return ret; + } + +@@ -13406,9 +13412,9 @@ void kvm_arch_irq_bypass_del_producer(st + int ret; + struct kvm_kernel_irqfd *irqfd = + container_of(cons, struct kvm_kernel_irqfd, consumer); ++ struct kvm *kvm = irqfd->kvm; + + WARN_ON(irqfd->producer != prod); +- irqfd->producer = NULL; + + /* + * When producer of consumer is unregistered, we change back to +@@ -13416,11 +13422,18 @@ void kvm_arch_irq_bypass_del_producer(st + * when the irq is masked/disabled or the consumer side (KVM + * int this case doesn't want to receive the interrupts. 
+ */ ++ spin_lock_irq(&kvm->irqfds.lock); ++ irqfd->producer = NULL; ++ ++ + ret = static_call(kvm_x86_pi_update_irte)(irqfd->kvm, prod->irq, irqfd->gsi, 0); + if (ret) + printk(KERN_INFO "irq bypass consumer (token %p) unregistration" + " fails: %d\n", irqfd->consumer.token, ret); + ++ spin_unlock_irq(&kvm->irqfds.lock); ++ ++ + kvm_arch_end_assignment(irqfd->kvm); + } + diff --git a/queue-6.1/mm-drop-the-assumption-that-vm_shared-always-implies-writable.patch b/queue-6.1/mm-drop-the-assumption-that-vm_shared-always-implies-writable.patch new file mode 100644 index 0000000000..41c739777b --- /dev/null +++ b/queue-6.1/mm-drop-the-assumption-that-vm_shared-always-implies-writable.patch @@ -0,0 +1,204 @@ +From stable+bounces-165164-greg=kroah.com@vger.kernel.org Wed Jul 30 03:53:29 2025 +From: "Isaac J. Manjarres" +Date: Tue, 29 Jul 2025 18:52:40 -0700 +Subject: mm: drop the assumption that VM_SHARED always implies writable +To: lorenzo.stoakes@oracle.com, gregkh@linuxfoundation.org, Alexander Viro , Christian Brauner , Jan Kara , Andrew Morton , David Hildenbrand , "Liam R. Howlett" , Vlastimil Babka , Mike Rapoport , Suren Baghdasaryan , Michal Hocko , Kees Cook , Ingo Molnar , Peter Zijlstra , Juri Lelli , Vincent Guittot , Dietmar Eggemann , Steven Rostedt , Ben Segall , Mel Gorman , Valentin Schneider , "Matthew Wilcox (Oracle)" , Jann Horn , Pedro Falcato +Cc: aliceryhl@google.com, stable@vger.kernel.org, "Isaac J. Manjarres" , kernel-team@android.com, Lorenzo Stoakes , Andy Lutomirski , Hugh Dickins , Mike Kravetz , Muchun Song , linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org, linux-mm@kvack.org +Message-ID: <20250730015247.30827-2-isaacmanjarres@google.com> + +From: Lorenzo Stoakes + +[ Upstream commit e8e17ee90eaf650c855adb0a3e5e965fd6692ff1 ] + +Patch series "permit write-sealed memfd read-only shared mappings", v4. 
+ +The man page for fcntl() describing memfd file seals states the following +about F_SEAL_WRITE:- + + Furthermore, trying to create new shared, writable memory-mappings via + mmap(2) will also fail with EPERM. + +With emphasis on 'writable'. It turns out in fact that currently the +kernel simply disallows all new shared memory mappings for a memfd with +F_SEAL_WRITE applied, rendering this documentation inaccurate. + +This matters because users are therefore unable to obtain a shared mapping +to a memfd after write sealing altogether, which limits their usefulness. +This was reported in the discussion thread [1] originating from a bug +report [2]. + +This is a product of both using the struct address_space->i_mmap_writable +atomic counter to determine whether writing may be permitted, and the +kernel adjusting this counter when any VM_SHARED mapping is performed and +more generally implicitly assuming VM_SHARED implies writable. + +It seems sensible that we should only update this mapping if VM_MAYWRITE +is specified, i.e. whether it is possible that this mapping could at any +point be written to. + +If we do so then all we need to do to permit write seals to function as +documented is to clear VM_MAYWRITE when mapping read-only. It turns out +this functionality already exists for F_SEAL_FUTURE_WRITE - we can +therefore simply adapt this logic to do the same for F_SEAL_WRITE. + +We then hit a chicken and egg situation in mmap_region() where the check +for VM_MAYWRITE occurs before we are able to clear this flag. To work +around this, perform this check after we invoke call_mmap(), with careful +consideration of error paths. + +Thanks to Andy Lutomirski for the suggestion! + +[1]:https://lore.kernel.org/all/20230324133646.16101dfa666f253c4715d965@linux-foundation.org/ +[2]:https://bugzilla.kernel.org/show_bug.cgi?id=217238 + +This patch (of 3): + +There is a general assumption that VMAs with the VM_SHARED flag set are +writable. 
If the VM_MAYWRITE flag is not set, then this is simply not the +case. + +Update those checks which affect the struct address_space->i_mmap_writable +field to explicitly test for this by introducing +[vma_]is_shared_maywrite() helper functions. + +This remains entirely conservative, as the lack of VM_MAYWRITE guarantees +that the VMA cannot be written to. + +Link: https://lkml.kernel.org/r/cover.1697116581.git.lstoakes@gmail.com +Link: https://lkml.kernel.org/r/d978aefefa83ec42d18dfa964ad180dbcde34795.1697116581.git.lstoakes@gmail.com +Signed-off-by: Lorenzo Stoakes +Suggested-by: Andy Lutomirski +Reviewed-by: Jan Kara +Cc: Alexander Viro +Cc: Christian Brauner +Cc: Hugh Dickins +Cc: Matthew Wilcox (Oracle) +Cc: Mike Kravetz +Cc: Muchun Song +Signed-off-by: Andrew Morton +Cc: stable@vger.kernel.org +[isaacmanjarres: resolved merge conflicts due to +due to refactoring that happened in upstream commit +5de195060b2e ("mm: resolve faulty mmap_region() error path behaviour")] +Signed-off-by: Isaac J. Manjarres +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/fs.h | 4 ++-- + include/linux/mm.h | 11 +++++++++++ + kernel/fork.c | 2 +- + mm/filemap.c | 2 +- + mm/madvise.c | 2 +- + mm/mmap.c | 8 ++++---- + 6 files changed, 20 insertions(+), 9 deletions(-) + +--- a/include/linux/fs.h ++++ b/include/linux/fs.h +@@ -410,7 +410,7 @@ extern const struct address_space_operat + * It is also used to block modification of page cache contents through + * memory mappings. + * @gfp_mask: Memory allocation flags to use for allocating pages. +- * @i_mmap_writable: Number of VM_SHARED mappings. ++ * @i_mmap_writable: Number of VM_SHARED, VM_MAYWRITE mappings. + * @nr_thps: Number of THPs in the pagecache (non-shmem only). + * @i_mmap: Tree of private and shared mappings. + * @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable. +@@ -513,7 +513,7 @@ static inline int mapping_mapped(struct + + /* + * Might pages of this file have been modified in userspace? 
+- * Note that i_mmap_writable counts all VM_SHARED vmas: do_mmap ++ * Note that i_mmap_writable counts all VM_SHARED, VM_MAYWRITE vmas: do_mmap + * marks vma as VM_SHARED if it is shared, and the file was opened for + * writing i.e. vma may be mprotected writable even if now readonly. + * +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -673,6 +673,17 @@ static inline bool vma_is_accessible(str + return vma->vm_flags & VM_ACCESS_FLAGS; + } + ++static inline bool is_shared_maywrite(vm_flags_t vm_flags) ++{ ++ return (vm_flags & (VM_SHARED | VM_MAYWRITE)) == ++ (VM_SHARED | VM_MAYWRITE); ++} ++ ++static inline bool vma_is_shared_maywrite(struct vm_area_struct *vma) ++{ ++ return is_shared_maywrite(vma->vm_flags); ++} ++ + static inline + struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max) + { +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -669,7 +669,7 @@ static __latent_entropy int dup_mmap(str + + get_file(file); + i_mmap_lock_write(mapping); +- if (tmp->vm_flags & VM_SHARED) ++ if (vma_is_shared_maywrite(tmp)) + mapping_allow_writable(mapping); + flush_dcache_mmap_lock(mapping); + /* insert tmp into the share list, just after mpnt */ +--- a/mm/filemap.c ++++ b/mm/filemap.c +@@ -3554,7 +3554,7 @@ int generic_file_mmap(struct file *file, + */ + int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma) + { +- if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) ++ if (vma_is_shared_maywrite(vma)) + return -EINVAL; + return generic_file_mmap(file, vma); + } +--- a/mm/madvise.c ++++ b/mm/madvise.c +@@ -980,7 +980,7 @@ static long madvise_remove(struct vm_are + return -EINVAL; + } + +- if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE)) ++ if (!vma_is_shared_maywrite(vma)) + return -EACCES; + + offset = (loff_t)(start - vma->vm_start) +--- a/mm/mmap.c ++++ b/mm/mmap.c +@@ -106,7 +106,7 @@ void vma_set_page_prot(struct vm_area_st + static void __remove_shared_vm_struct(struct vm_area_struct 
*vma, + struct file *file, struct address_space *mapping) + { +- if (vma->vm_flags & VM_SHARED) ++ if (vma_is_shared_maywrite(vma)) + mapping_unmap_writable(mapping); + + flush_dcache_mmap_lock(mapping); +@@ -408,7 +408,7 @@ static unsigned long count_vma_pages_ran + static void __vma_link_file(struct vm_area_struct *vma, + struct address_space *mapping) + { +- if (vma->vm_flags & VM_SHARED) ++ if (vma_is_shared_maywrite(vma)) + mapping_allow_writable(mapping); + + flush_dcache_mmap_lock(mapping); +@@ -2827,7 +2827,7 @@ cannot_expand: + vma_mas_store(vma, &mas); + mm->map_count++; + if (vma->vm_file) { +- if (vma->vm_flags & VM_SHARED) ++ if (vma_is_shared_maywrite(vma)) + mapping_allow_writable(vma->vm_file->f_mapping); + + flush_dcache_mmap_lock(vma->vm_file->f_mapping); +@@ -2901,7 +2901,7 @@ unsigned long mmap_region(struct file *f + return -EINVAL; + + /* Map writable and ensure this isn't a sealed memfd. */ +- if (file && (vm_flags & VM_SHARED)) { ++ if (file && is_shared_maywrite(vm_flags)) { + int error = mapping_map_writable(file->f_mapping); + + if (error) diff --git a/queue-6.1/mm-reinstate-ability-to-map-write-sealed-memfd-mappings-read-only.patch b/queue-6.1/mm-reinstate-ability-to-map-write-sealed-memfd-mappings-read-only.patch new file mode 100644 index 0000000000..f5ecc1c27a --- /dev/null +++ b/queue-6.1/mm-reinstate-ability-to-map-write-sealed-memfd-mappings-read-only.patch @@ -0,0 +1,234 @@ +From stable+bounces-165166-greg=kroah.com@vger.kernel.org Wed Jul 30 03:53:57 2025 +From: "Isaac J. Manjarres" +Date: Tue, 29 Jul 2025 18:52:42 -0700 +Subject: mm: reinstate ability to map write-sealed memfd mappings read-only +To: lorenzo.stoakes@oracle.com, gregkh@linuxfoundation.org, Hugh Dickins , Baolin Wang , Andrew Morton , David Hildenbrand , "Liam R. Howlett" , Vlastimil Babka , Mike Rapoport , Suren Baghdasaryan , Michal Hocko , Jann Horn , Pedro Falcato +Cc: aliceryhl@google.com, stable@vger.kernel.org, "Isaac J. 
Manjarres" , kernel-team@android.com, Julian Orth , "Liam R. Howlett" , Linus Torvalds , Shuah Khan , linux-mm@kvack.org, linux-kernel@vger.kernel.org +Message-ID: <20250730015247.30827-4-isaacmanjarres@google.com> + +From: Lorenzo Stoakes + +[ Upstream commit 8ec396d05d1b737c87311fb7311f753b02c2a6b1 ] + +Patch series "mm: reinstate ability to map write-sealed memfd mappings +read-only". + +In commit 158978945f31 ("mm: perform the mapping_map_writable() check +after call_mmap()") (and preceding changes in the same series) it became +possible to mmap() F_SEAL_WRITE sealed memfd mappings read-only. + +Commit 5de195060b2e ("mm: resolve faulty mmap_region() error path +behaviour") unintentionally undid this logic by moving the +mapping_map_writable() check before the shmem_mmap() hook is invoked, +thereby regressing this change. + +This series reworks how we both permit write-sealed mappings being mapped +read-only and disallow mprotect() from undoing the write-seal, fixing this +regression. + +We also add a regression test to ensure that we do not accidentally +regress this in future. + +Thanks to Julian Orth for reporting this regression. + +This patch (of 2): + +In commit 158978945f31 ("mm: perform the mapping_map_writable() check +after call_mmap()") (and preceding changes in the same series) it became +possible to mmap() F_SEAL_WRITE sealed memfd mappings read-only. + +This was previously unnecessarily disallowed, despite the man page +documentation indicating that it would be, thereby limiting the usefulness +of F_SEAL_WRITE logic. + +We fixed this by adapting logic that existed for the F_SEAL_FUTURE_WRITE +seal (one which disallows future writes to the memfd) to also be used for +F_SEAL_WRITE. + +For background - the F_SEAL_FUTURE_WRITE seal clears VM_MAYWRITE for a +read-only mapping to disallow mprotect() from overriding the seal - an +operation performed by seal_check_write(), invoked from shmem_mmap(), the +f_op->mmap() hook used by shmem mappings. 
+ +By extending this to F_SEAL_WRITE and critically - checking +mapping_map_writable() to determine if we may map the memfd AFTER we +invoke shmem_mmap() - the desired logic becomes possible. This is because +mapping_map_writable() explicitly checks for VM_MAYWRITE, which we will +have cleared. + +Commit 5de195060b2e ("mm: resolve faulty mmap_region() error path +behaviour") unintentionally undid this logic by moving the +mapping_map_writable() check before the shmem_mmap() hook is invoked, +thereby regressing this change. + +We reinstate this functionality by moving the check out of shmem_mmap() +and instead performing it in do_mmap() at the point at which VMA flags are +being determined, which seems in any case to be a more appropriate place +in which to make this determination. + +In order to achieve this we rework memfd seal logic to allow us access to +this information using existing logic and eliminate the clearing of +VM_MAYWRITE from seal_check_write() which we are performing in do_mmap() +instead. + +Link: https://lkml.kernel.org/r/99fc35d2c62bd2e05571cf60d9f8b843c56069e0.1732804776.git.lorenzo.stoakes@oracle.com +Fixes: 5de195060b2e ("mm: resolve faulty mmap_region() error path behaviour") +Signed-off-by: Lorenzo Stoakes +Reported-by: Julian Orth +Closes: https://lore.kernel.org/all/CAHijbEUMhvJTN9Xw1GmbM266FXXv=U7s4L_Jem5x3AaPZxrYpQ@mail.gmail.com/ +Cc: Jann Horn +Cc: Liam R. Howlett +Cc: Linus Torvalds +Cc: Shuah Khan +Cc: Vlastimil Babka +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Isaac J. 
Manjarres +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/memfd.h | 14 ++++++++++++ + include/linux/mm.h | 58 ++++++++++++++++++++++++++++++++++---------------- + mm/memfd.c | 2 - + mm/mmap.c | 4 +++ + 4 files changed, 59 insertions(+), 19 deletions(-) + +--- a/include/linux/memfd.h ++++ b/include/linux/memfd.h +@@ -6,11 +6,25 @@ + + #ifdef CONFIG_MEMFD_CREATE + extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg); ++unsigned int *memfd_file_seals_ptr(struct file *file); + #else + static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned long a) + { + return -EINVAL; + } ++ ++static inline unsigned int *memfd_file_seals_ptr(struct file *file) ++{ ++ return NULL; ++} + #endif + ++/* Retrieve memfd seals associated with the file, if any. */ ++static inline unsigned int memfd_file_seals(struct file *file) ++{ ++ unsigned int *sealsp = memfd_file_seals_ptr(file); ++ ++ return sealsp ? *sealsp : 0; ++} ++ + #endif /* __LINUX_MEMFD_H */ +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -3525,6 +3525,37 @@ void mem_dump_obj(void *object); + static inline void mem_dump_obj(void *object) {} + #endif + ++static inline bool is_write_sealed(int seals) ++{ ++ return seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE); ++} ++ ++/** ++ * is_readonly_sealed - Checks whether write-sealed but mapped read-only, ++ * in which case writes should be disallowing moving ++ * forwards. ++ * @seals: the seals to check ++ * @vm_flags: the VMA flags to check ++ * ++ * Returns whether readonly sealed, in which case writess should be disallowed ++ * going forward. ++ */ ++static inline bool is_readonly_sealed(int seals, vm_flags_t vm_flags) ++{ ++ /* ++ * Since an F_SEAL_[FUTURE_]WRITE sealed memfd can be mapped as ++ * MAP_SHARED and read-only, take care to not allow mprotect to ++ * revert protections on such mappings. Do this only for shared ++ * mappings. 
For private mappings, don't need to mask ++ * VM_MAYWRITE as we still want them to be COW-writable. ++ */ ++ if (is_write_sealed(seals) && ++ ((vm_flags & (VM_SHARED | VM_WRITE)) == VM_SHARED)) ++ return true; ++ ++ return false; ++} ++ + /** + * seal_check_write - Check for F_SEAL_WRITE or F_SEAL_FUTURE_WRITE flags and + * handle them. +@@ -3536,24 +3567,15 @@ static inline void mem_dump_obj(void *ob + */ + static inline int seal_check_write(int seals, struct vm_area_struct *vma) + { +- if (seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) { +- /* +- * New PROT_WRITE and MAP_SHARED mmaps are not allowed when +- * write seals are active. +- */ +- if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) +- return -EPERM; +- +- /* +- * Since an F_SEAL_[FUTURE_]WRITE sealed memfd can be mapped as +- * MAP_SHARED and read-only, take care to not allow mprotect to +- * revert protections on such mappings. Do this only for shared +- * mappings. For private mappings, don't need to mask +- * VM_MAYWRITE as we still want them to be COW-writable. +- */ +- if (vma->vm_flags & VM_SHARED) +- vma->vm_flags &= ~(VM_MAYWRITE); +- } ++ if (!is_write_sealed(seals)) ++ return 0; ++ ++ /* ++ * New PROT_WRITE and MAP_SHARED mmaps are not allowed when ++ * write seals are active. 
++ */ ++ if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) ++ return -EPERM; + + return 0; + } +--- a/mm/memfd.c ++++ b/mm/memfd.c +@@ -133,7 +133,7 @@ static int memfd_wait_for_pins(struct ad + return error; + } + +-static unsigned int *memfd_file_seals_ptr(struct file *file) ++unsigned int *memfd_file_seals_ptr(struct file *file) + { + if (shmem_file(file)) + return &SHMEM_I(file_inode(file))->seals; +--- a/mm/mmap.c ++++ b/mm/mmap.c +@@ -46,6 +46,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -1336,6 +1337,7 @@ unsigned long do_mmap(struct file *file, + + if (file) { + struct inode *inode = file_inode(file); ++ unsigned int seals = memfd_file_seals(file); + unsigned long flags_mask; + + if (!file_mmap_ok(file, inode, pgoff, len)) +@@ -1374,6 +1376,8 @@ unsigned long do_mmap(struct file *file, + vm_flags |= VM_SHARED | VM_MAYSHARE; + if (!(file->f_mode & FMODE_WRITE)) + vm_flags &= ~(VM_MAYWRITE | VM_SHARED); ++ else if (is_readonly_sealed(seals, vm_flags)) ++ vm_flags &= ~VM_MAYWRITE; + fallthrough; + case MAP_PRIVATE: + if (!(file->f_mode & FMODE_READ)) diff --git a/queue-6.1/mm-update-memfd-seal-write-check-to-include-f_seal_write.patch b/queue-6.1/mm-update-memfd-seal-write-check-to-include-f_seal_write.patch new file mode 100644 index 0000000000..b2757f4519 --- /dev/null +++ b/queue-6.1/mm-update-memfd-seal-write-check-to-include-f_seal_write.patch @@ -0,0 +1,104 @@ +From stable+bounces-165165-greg=kroah.com@vger.kernel.org Wed Jul 30 03:53:44 2025 +From: "Isaac J. Manjarres" +Date: Tue, 29 Jul 2025 18:52:41 -0700 +Subject: mm: update memfd seal write check to include F_SEAL_WRITE +To: lorenzo.stoakes@oracle.com, gregkh@linuxfoundation.org, Muchun Song , Oscar Salvador , David Hildenbrand , Andrew Morton , "Liam R. Howlett" , Vlastimil Babka , Mike Rapoport , Suren Baghdasaryan , Michal Hocko , Hugh Dickins , Baolin Wang +Cc: aliceryhl@google.com, stable@vger.kernel.org, "Isaac J. 
Manjarres" , kernel-team@android.com, Lorenzo Stoakes , Jan Kara , Alexander Viro , Andy Lutomirski , Christian Brauner , "Matthew Wilcox (Oracle)" , Mike Kravetz , linux-mm@kvack.org, linux-kernel@vger.kernel.org +Message-ID: <20250730015247.30827-3-isaacmanjarres@google.com> + +From: Lorenzo Stoakes + +[ Upstream commit 28464bbb2ddc199433383994bcb9600c8034afa1 ] + +The seal_check_future_write() function is called by shmem_mmap() or +hugetlbfs_file_mmap() to disallow any future writable mappings of an memfd +sealed this way. + +The F_SEAL_WRITE flag is not checked here, as that is handled via the +mapping->i_mmap_writable mechanism and so any attempt at a mapping would +fail before this could be run. + +However we intend to change this, meaning this check can be performed for +F_SEAL_WRITE mappings also. + +The logic here is equally applicable to both flags, so update this +function to accommodate both and rename it accordingly. + +Link: https://lkml.kernel.org/r/913628168ce6cce77df7d13a63970bae06a526e0.1697116581.git.lstoakes@gmail.com +Signed-off-by: Lorenzo Stoakes +Reviewed-by: Jan Kara +Cc: Alexander Viro +Cc: Andy Lutomirski +Cc: Christian Brauner +Cc: Hugh Dickins +Cc: Matthew Wilcox (Oracle) +Cc: Mike Kravetz +Cc: Muchun Song +Signed-off-by: Andrew Morton +Cc: stable@vger.kernel.org +Signed-off-by: Isaac J. 
Manjarres +Signed-off-by: Greg Kroah-Hartman +--- + fs/hugetlbfs/inode.c | 2 +- + include/linux/mm.h | 15 ++++++++------- + mm/shmem.c | 2 +- + 3 files changed, 10 insertions(+), 9 deletions(-) + +--- a/fs/hugetlbfs/inode.c ++++ b/fs/hugetlbfs/inode.c +@@ -136,7 +136,7 @@ static int hugetlbfs_file_mmap(struct fi + vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND; + vma->vm_ops = &hugetlb_vm_ops; + +- ret = seal_check_future_write(info->seals, vma); ++ ret = seal_check_write(info->seals, vma); + if (ret) + return ret; + +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -3526,25 +3526,26 @@ static inline void mem_dump_obj(void *ob + #endif + + /** +- * seal_check_future_write - Check for F_SEAL_FUTURE_WRITE flag and handle it ++ * seal_check_write - Check for F_SEAL_WRITE or F_SEAL_FUTURE_WRITE flags and ++ * handle them. + * @seals: the seals to check + * @vma: the vma to operate on + * +- * Check whether F_SEAL_FUTURE_WRITE is set; if so, do proper check/handling on +- * the vma flags. Return 0 if check pass, or <0 for errors. ++ * Check whether F_SEAL_WRITE or F_SEAL_FUTURE_WRITE are set; if so, do proper ++ * check/handling on the vma flags. Return 0 if check pass, or <0 for errors. + */ +-static inline int seal_check_future_write(int seals, struct vm_area_struct *vma) ++static inline int seal_check_write(int seals, struct vm_area_struct *vma) + { +- if (seals & F_SEAL_FUTURE_WRITE) { ++ if (seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) { + /* + * New PROT_WRITE and MAP_SHARED mmaps are not allowed when +- * "future write" seal active. ++ * write seals are active. + */ + if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) + return -EPERM; + + /* +- * Since an F_SEAL_FUTURE_WRITE sealed memfd can be mapped as ++ * Since an F_SEAL_[FUTURE_]WRITE sealed memfd can be mapped as + * MAP_SHARED and read-only, take care to not allow mprotect to + * revert protections on such mappings. Do this only for shared + * mappings. 
For private mappings, don't need to mask +--- a/mm/shmem.c ++++ b/mm/shmem.c +@@ -2302,7 +2302,7 @@ static int shmem_mmap(struct file *file, + struct shmem_inode_info *info = SHMEM_I(file_inode(file)); + int ret; + +- ret = seal_check_future_write(info->seals, vma); ++ ret = seal_check_write(info->seals, vma); + if (ret) + return ret; + diff --git a/queue-6.1/mptcp-make-fallback-action-and-fallback-decision-atomic.patch b/queue-6.1/mptcp-make-fallback-action-and-fallback-decision-atomic.patch new file mode 100644 index 0000000000..d7861eec48 --- /dev/null +++ b/queue-6.1/mptcp-make-fallback-action-and-fallback-decision-atomic.patch @@ -0,0 +1,387 @@ +From stable+bounces-164937-greg=kroah.com@vger.kernel.org Mon Jul 28 15:29:43 2025 +From: "Matthieu Baerts (NGI0)" +Date: Mon, 28 Jul 2025 15:29:21 +0200 +Subject: mptcp: make fallback action and fallback decision atomic +To: mptcp@lists.linux.dev, stable@vger.kernel.org, gregkh@linuxfoundation.org +Cc: Paolo Abeni , sashal@kernel.org, Matthieu Baerts , syzbot+5cf807c20386d699b524@syzkaller.appspotmail.com, Jakub Kicinski +Message-ID: <20250728132919.3904847-6-matttbe@kernel.org> + +From: Paolo Abeni + +commit f8a1d9b18c5efc76784f5a326e905f641f839894 upstream. 
+ +Syzkaller reported the following splat: + + WARNING: CPU: 1 PID: 7704 at net/mptcp/protocol.h:1223 __mptcp_do_fallback net/mptcp/protocol.h:1223 [inline] + WARNING: CPU: 1 PID: 7704 at net/mptcp/protocol.h:1223 mptcp_do_fallback net/mptcp/protocol.h:1244 [inline] + WARNING: CPU: 1 PID: 7704 at net/mptcp/protocol.h:1223 check_fully_established net/mptcp/options.c:982 [inline] + WARNING: CPU: 1 PID: 7704 at net/mptcp/protocol.h:1223 mptcp_incoming_options+0x21a8/0x2510 net/mptcp/options.c:1153 + Modules linked in: + CPU: 1 UID: 0 PID: 7704 Comm: syz.3.1419 Not tainted 6.16.0-rc3-gbd5ce2324dba #20 PREEMPT(voluntary) + Hardware name: QEMU Ubuntu 24.04 PC (i440FX + PIIX, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 + RIP: 0010:__mptcp_do_fallback net/mptcp/protocol.h:1223 [inline] + RIP: 0010:mptcp_do_fallback net/mptcp/protocol.h:1244 [inline] + RIP: 0010:check_fully_established net/mptcp/options.c:982 [inline] + RIP: 0010:mptcp_incoming_options+0x21a8/0x2510 net/mptcp/options.c:1153 + Code: 24 18 e8 bb 2a 00 fd e9 1b df ff ff e8 b1 21 0f 00 e8 ec 5f c4 fc 44 0f b7 ac 24 b0 00 00 00 e9 54 f1 ff ff e8 d9 5f c4 fc 90 <0f> 0b 90 e9 b8 f4 ff ff e8 8b 2a 00 fd e9 8d e6 ff ff e8 81 2a 00 + RSP: 0018:ffff8880a3f08448 EFLAGS: 00010246 + RAX: 0000000000000000 RBX: ffff8880180a8000 RCX: ffffffff84afcf45 + RDX: ffff888090223700 RSI: ffffffff84afdaa7 RDI: 0000000000000001 + RBP: ffff888017955780 R08: 0000000000000001 R09: 0000000000000000 + R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 + R13: ffff8880180a8910 R14: ffff8880a3e9d058 R15: 0000000000000000 + FS: 00005555791b8500(0000) GS:ffff88811c495000(0000) knlGS:0000000000000000 + CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 + CR2: 000000110c2800b7 CR3: 0000000058e44000 CR4: 0000000000350ef0 + Call Trace: + + tcp_reset+0x26f/0x2b0 net/ipv4/tcp_input.c:4432 + tcp_validate_incoming+0x1057/0x1b60 net/ipv4/tcp_input.c:5975 + tcp_rcv_established+0x5b5/0x21f0 net/ipv4/tcp_input.c:6166 + 
tcp_v4_do_rcv+0x5dc/0xa70 net/ipv4/tcp_ipv4.c:1925 + tcp_v4_rcv+0x3473/0x44a0 net/ipv4/tcp_ipv4.c:2363 + ip_protocol_deliver_rcu+0xba/0x480 net/ipv4/ip_input.c:205 + ip_local_deliver_finish+0x2f1/0x500 net/ipv4/ip_input.c:233 + NF_HOOK include/linux/netfilter.h:317 [inline] + NF_HOOK include/linux/netfilter.h:311 [inline] + ip_local_deliver+0x1be/0x560 net/ipv4/ip_input.c:254 + dst_input include/net/dst.h:469 [inline] + ip_rcv_finish net/ipv4/ip_input.c:447 [inline] + NF_HOOK include/linux/netfilter.h:317 [inline] + NF_HOOK include/linux/netfilter.h:311 [inline] + ip_rcv+0x514/0x810 net/ipv4/ip_input.c:567 + __netif_receive_skb_one_core+0x197/0x1e0 net/core/dev.c:5975 + __netif_receive_skb+0x1f/0x120 net/core/dev.c:6088 + process_backlog+0x301/0x1360 net/core/dev.c:6440 + __napi_poll.constprop.0+0xba/0x550 net/core/dev.c:7453 + napi_poll net/core/dev.c:7517 [inline] + net_rx_action+0xb44/0x1010 net/core/dev.c:7644 + handle_softirqs+0x1d0/0x770 kernel/softirq.c:579 + do_softirq+0x3f/0x90 kernel/softirq.c:480 + + + __local_bh_enable_ip+0xed/0x110 kernel/softirq.c:407 + local_bh_enable include/linux/bottom_half.h:33 [inline] + inet_csk_listen_stop+0x2c5/0x1070 net/ipv4/inet_connection_sock.c:1524 + mptcp_check_listen_stop.part.0+0x1cc/0x220 net/mptcp/protocol.c:2985 + mptcp_check_listen_stop net/mptcp/mib.h:118 [inline] + __mptcp_close+0x9b9/0xbd0 net/mptcp/protocol.c:3000 + mptcp_close+0x2f/0x140 net/mptcp/protocol.c:3066 + inet_release+0xed/0x200 net/ipv4/af_inet.c:435 + inet6_release+0x4f/0x70 net/ipv6/af_inet6.c:487 + __sock_release+0xb3/0x270 net/socket.c:649 + sock_close+0x1c/0x30 net/socket.c:1439 + __fput+0x402/0xb70 fs/file_table.c:465 + task_work_run+0x150/0x240 kernel/task_work.c:227 + resume_user_mode_work include/linux/resume_user_mode.h:50 [inline] + exit_to_user_mode_loop+0xd4/0xe0 kernel/entry/common.c:114 + exit_to_user_mode_prepare include/linux/entry-common.h:330 [inline] + syscall_exit_to_user_mode_work include/linux/entry-common.h:414 [inline] + 
syscall_exit_to_user_mode include/linux/entry-common.h:449 [inline] + do_syscall_64+0x245/0x360 arch/x86/entry/syscall_64.c:100 + entry_SYSCALL_64_after_hwframe+0x77/0x7f + RIP: 0033:0x7fc92f8a36ad + Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b0 ff ff ff f7 d8 64 89 01 48 + RSP: 002b:00007ffcf52802d8 EFLAGS: 00000246 ORIG_RAX: 00000000000001b4 + RAX: 0000000000000000 RBX: 00007ffcf52803a8 RCX: 00007fc92f8a36ad + RDX: 0000000000000000 RSI: 000000000000001e RDI: 0000000000000003 + RBP: 00007fc92fae7ba0 R08: 0000000000000001 R09: 0000002800000000 + R10: 00007fc92f700000 R11: 0000000000000246 R12: 00007fc92fae5fac + R13: 00007fc92fae5fa0 R14: 0000000000026d00 R15: 0000000000026c51 + + irq event stamp: 4068 + hardirqs last enabled at (4076): [] __up_console_sem+0x76/0x80 kernel/printk/printk.c:344 + hardirqs last disabled at (4085): [] __up_console_sem+0x5b/0x80 kernel/printk/printk.c:342 + softirqs last enabled at (3096): [] local_bh_enable include/linux/bottom_half.h:33 [inline] + softirqs last enabled at (3096): [] inet_csk_listen_stop+0x2c0/0x1070 net/ipv4/inet_connection_sock.c:1524 + softirqs last disabled at (3097): [] do_softirq+0x3f/0x90 kernel/softirq.c:480 + +Since we need to track the 'fallback is possible' condition and the +fallback status separately, there are a few possible races open between +the check and the actual fallback action. + +Add a spinlock to protect the fallback related information and use it +to close all the possible related races. While at it also remove the +too-early clearing of allow_infinite_fallback in __mptcp_subflow_connect(): +the field will be correctly cleared by subflow_finish_connect() if/when +the connection will complete successfully. + +If fallback is not possible, as per RFC, reset the current subflow. 
+ +Since the fallback operation can now fail and its return value should be +checked, rename the helper accordingly. + +Fixes: 0530020a7c8f ("mptcp: track and update contiguous data status") +Cc: stable@vger.kernel.org +Reported-by: Matthieu Baerts +Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/570 +Reported-by: syzbot+5cf807c20386d699b524@syzkaller.appspotmail.com +Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/555 +Signed-off-by: Paolo Abeni +Reviewed-by: Matthieu Baerts (NGI0) +Signed-off-by: Matthieu Baerts (NGI0) +Link: https://patch.msgid.link/20250714-net-mptcp-fallback-races-v1-1-391aff963322@kernel.org +Signed-off-by: Jakub Kicinski +[ Conflicts in protocol.h, because commit 6ebf6f90ab4a ("mptcp: add + mptcpi_subflows_total counter") is not in this version, and this + causes conflicts in the context. Commit 65b02260a0e0 ("mptcp: export + mptcp_subflow_early_fallback()") is also not in this version, and + moves code from protocol.c to protocol.h, but the modification can + still apply there. Conflicts in protocol.c because commit ee2708aedad0 + ("mptcp: use get_retrans wrapper") is not in this version and refactors + the code in __mptcp_retrans(), but the modification can still be + applied, just not at the same indentation level. There were other + conflicts in the context due to commit 8005184fd1ca ("mptcp: refactor + sndbuf auto-tuning"), commit b3ea6b272d79 ("mptcp: consolidate initial + ack seq generation"), and commit 013e3179dbd2 ("mptcp: fix rcv space + initialization") that are not in this version. 
] +Signed-off-by: Matthieu Baerts (NGI0) +Signed-off-by: Greg Kroah-Hartman +--- + net/mptcp/options.c | 3 ++- + net/mptcp/protocol.c | 39 +++++++++++++++++++++++++++++++++------ + net/mptcp/protocol.h | 24 ++++++++++++++++++------ + net/mptcp/subflow.c | 11 +++++------ + 4 files changed, 58 insertions(+), 19 deletions(-) + +--- a/net/mptcp/options.c ++++ b/net/mptcp/options.c +@@ -973,8 +973,9 @@ static bool check_fully_established(stru + if (subflow->mp_join) + goto reset; + subflow->mp_capable = 0; ++ if (!mptcp_try_fallback(ssk)) ++ goto reset; + pr_fallback(msk); +- mptcp_do_fallback(ssk); + return false; + } + +--- a/net/mptcp/protocol.c ++++ b/net/mptcp/protocol.c +@@ -633,10 +633,9 @@ static bool mptcp_check_data_fin(struct + + static void mptcp_dss_corruption(struct mptcp_sock *msk, struct sock *ssk) + { +- if (READ_ONCE(msk->allow_infinite_fallback)) { ++ if (mptcp_try_fallback(ssk)) { + MPTCP_INC_STATS(sock_net(ssk), + MPTCP_MIB_DSSCORRUPTIONFALLBACK); +- mptcp_do_fallback(ssk); + } else { + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSCORRUPTIONRESET); + mptcp_subflow_reset(ssk); +@@ -897,6 +896,14 @@ static bool __mptcp_finish_join(struct m + if (sk->sk_state != TCP_ESTABLISHED) + return false; + ++ spin_lock_bh(&msk->fallback_lock); ++ if (__mptcp_check_fallback(msk)) { ++ spin_unlock_bh(&msk->fallback_lock); ++ return false; ++ } ++ mptcp_subflow_joined(msk, ssk); ++ spin_unlock_bh(&msk->fallback_lock); ++ + /* attach to msk socket only after we are sure we will deal with it + * at close time + */ +@@ -904,7 +911,6 @@ static bool __mptcp_finish_join(struct m + mptcp_sock_graft(ssk, sk->sk_socket); + + mptcp_sockopt_sync_locked(msk, ssk); +- mptcp_subflow_joined(msk, ssk); + mptcp_stop_tout_timer(sk); + return true; + } +@@ -1288,10 +1294,14 @@ static void mptcp_update_infinite_map(st + mpext->infinite_map = 1; + mpext->data_len = 0; + ++ if (!mptcp_try_fallback(ssk)) { ++ mptcp_subflow_reset(ssk); ++ return; ++ } ++ + MPTCP_INC_STATS(sock_net(ssk), 
MPTCP_MIB_INFINITEMAPTX); + mptcp_subflow_ctx(ssk)->send_infinite_map = 0; + pr_fallback(msk); +- mptcp_do_fallback(ssk); + } + + #define MPTCP_MAX_GSO_SIZE (GSO_LEGACY_MAX_SIZE - (MAX_TCP_HEADER + 1)) +@@ -2638,8 +2648,8 @@ static void mptcp_check_fastclose(struct + + static void __mptcp_retrans(struct sock *sk) + { ++ struct mptcp_sendmsg_info info = { .data_lock_held = true, }; + struct mptcp_sock *msk = mptcp_sk(sk); +- struct mptcp_sendmsg_info info = {}; + struct mptcp_data_frag *dfrag; + size_t copied = 0; + struct sock *ssk; +@@ -2675,6 +2685,15 @@ static void __mptcp_retrans(struct sock + /* limit retransmission to the bytes already sent on some subflows */ + info.sent = 0; + info.limit = READ_ONCE(msk->csum_enabled) ? dfrag->data_len : dfrag->already_sent; ++ ++ /* make the whole retrans decision, xmit, disallow fallback atomic */ ++ spin_lock_bh(&msk->fallback_lock); ++ if (__mptcp_check_fallback(msk)) { ++ spin_unlock_bh(&msk->fallback_lock); ++ release_sock(ssk); ++ return; ++ } ++ + while (info.sent < info.limit) { + ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); + if (ret <= 0) +@@ -2690,6 +2709,7 @@ static void __mptcp_retrans(struct sock + info.size_goal); + WRITE_ONCE(msk->allow_infinite_fallback, false); + } ++ spin_unlock_bh(&msk->fallback_lock); + + release_sock(ssk); + +@@ -2819,6 +2839,7 @@ static int __mptcp_init_sock(struct sock + msk->recovery = false; + + mptcp_pm_data_init(msk); ++ spin_lock_init(&msk->fallback_lock); + + /* re-use the csk retrans timer for MPTCP-level retrans */ + timer_setup(&msk->sk.icsk_retransmit_timer, mptcp_retransmit_timer, 0); +@@ -3651,7 +3672,13 @@ bool mptcp_finish_join(struct sock *ssk) + + /* active subflow, already present inside the conn_list */ + if (!list_empty(&subflow->node)) { ++ spin_lock_bh(&msk->fallback_lock); ++ if (__mptcp_check_fallback(msk)) { ++ spin_unlock_bh(&msk->fallback_lock); ++ return false; ++ } + mptcp_subflow_joined(msk, ssk); ++ spin_unlock_bh(&msk->fallback_lock); + return 
true; + } + +@@ -3764,7 +3791,7 @@ static void mptcp_subflow_early_fallback + struct mptcp_subflow_context *subflow) + { + subflow->request_mptcp = 0; +- __mptcp_do_fallback(msk); ++ WARN_ON_ONCE(!__mptcp_try_fallback(msk)); + } + + static int mptcp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +--- a/net/mptcp/protocol.h ++++ b/net/mptcp/protocol.h +@@ -317,6 +317,10 @@ struct mptcp_sock { + + u32 setsockopt_seq; + char ca_name[TCP_CA_NAME_MAX]; ++ ++ spinlock_t fallback_lock; /* protects fallback and ++ * allow_infinite_fallback ++ */ + }; + + #define mptcp_data_lock(sk) spin_lock_bh(&(sk)->sk_lock.slock) +@@ -975,25 +979,32 @@ static inline bool mptcp_check_fallback( + return __mptcp_check_fallback(msk); + } + +-static inline void __mptcp_do_fallback(struct mptcp_sock *msk) ++static inline bool __mptcp_try_fallback(struct mptcp_sock *msk) + { + if (test_bit(MPTCP_FALLBACK_DONE, &msk->flags)) { + pr_debug("TCP fallback already done (msk=%p)\n", msk); +- return; ++ return true; + } +- if (WARN_ON_ONCE(!READ_ONCE(msk->allow_infinite_fallback))) +- return; ++ spin_lock_bh(&msk->fallback_lock); ++ if (!msk->allow_infinite_fallback) { ++ spin_unlock_bh(&msk->fallback_lock); ++ return false; ++ } ++ + set_bit(MPTCP_FALLBACK_DONE, &msk->flags); ++ spin_unlock_bh(&msk->fallback_lock); ++ return true; + } + +-static inline void mptcp_do_fallback(struct sock *ssk) ++static inline bool mptcp_try_fallback(struct sock *ssk) + { + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + struct sock *sk = subflow->conn; + struct mptcp_sock *msk; + + msk = mptcp_sk(sk); +- __mptcp_do_fallback(msk); ++ if (!__mptcp_try_fallback(msk)) ++ return false; + if (READ_ONCE(msk->snd_data_fin_enable) && !(ssk->sk_shutdown & SEND_SHUTDOWN)) { + gfp_t saved_allocation = ssk->sk_allocation; + +@@ -1005,6 +1016,7 @@ static inline void mptcp_do_fallback(str + tcp_shutdown(ssk, SEND_SHUTDOWN); + ssk->sk_allocation = saved_allocation; + } ++ return true; + } + + #define 
pr_fallback(a) pr_debug("%s:fallback to TCP (msk=%p)\n", __func__, a) +--- a/net/mptcp/subflow.c ++++ b/net/mptcp/subflow.c +@@ -431,9 +431,11 @@ static void subflow_finish_connect(struc + mptcp_get_options(skb, &mp_opt); + if (subflow->request_mptcp) { + if (!(mp_opt.suboptions & OPTION_MPTCP_MPC_SYNACK)) { ++ if (!mptcp_try_fallback(sk)) ++ goto do_reset; ++ + MPTCP_INC_STATS(sock_net(sk), + MPTCP_MIB_MPCAPABLEACTIVEFALLBACK); +- mptcp_do_fallback(sk); + pr_fallback(mptcp_sk(subflow->conn)); + goto fallback; + } +@@ -1269,7 +1271,7 @@ fallback: + return true; + } + +- if (!READ_ONCE(msk->allow_infinite_fallback)) { ++ if (!mptcp_try_fallback(ssk)) { + /* fatal protocol error, close the socket. + * subflow_error_report() will introduce the appropriate barriers + */ +@@ -1285,8 +1287,6 @@ reset: + WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA); + return false; + } +- +- mptcp_do_fallback(ssk); + } + + skb = skb_peek(&ssk->sk_receive_queue); +@@ -1519,7 +1519,6 @@ int __mptcp_subflow_connect(struct sock + /* discard the subflow socket */ + mptcp_sock_graft(ssk, sk->sk_socket); + iput(SOCK_INODE(sf)); +- WRITE_ONCE(msk->allow_infinite_fallback, false); + mptcp_stop_tout_timer(sk); + return 0; + +@@ -1690,7 +1689,7 @@ static void subflow_state_change(struct + msk = mptcp_sk(parent); + if (subflow_simultaneous_connect(sk)) { + mptcp_propagate_sndbuf(parent, sk); +- mptcp_do_fallback(sk); ++ WARN_ON_ONCE(!mptcp_try_fallback(sk)); + mptcp_rcv_space_init(msk, sk); + pr_fallback(msk); + subflow->conn_finished = 1; diff --git a/queue-6.1/mptcp-plug-races-between-subflow-fail-and-subflow-creation.patch b/queue-6.1/mptcp-plug-races-between-subflow-fail-and-subflow-creation.patch new file mode 100644 index 0000000000..715ce44769 --- /dev/null +++ b/queue-6.1/mptcp-plug-races-between-subflow-fail-and-subflow-creation.patch @@ -0,0 +1,201 @@ +From stable+bounces-164936-greg=kroah.com@vger.kernel.org Mon Jul 28 15:29:45 2025 +From: "Matthieu Baerts (NGI0)" +Date: Mon, 28 
Jul 2025 15:29:22 +0200 +Subject: mptcp: plug races between subflow fail and subflow creation +To: mptcp@lists.linux.dev, stable@vger.kernel.org, gregkh@linuxfoundation.org +Cc: Paolo Abeni , sashal@kernel.org, "Matthieu Baerts (NGI0)" , Jakub Kicinski +Message-ID: <20250728132919.3904847-7-matttbe@kernel.org> + +From: Paolo Abeni + +commit def5b7b2643ebba696fc60ddf675dca13f073486 upstream. + +We have races similar to the one addressed by the previous patch between +subflow failing and additional subflow creation. They are just harder to +trigger. + +The solution is similar. Use a separate flag to track the condition +'socket state prevents any additional subflow creation' protected by the +fallback lock. + +The socket fallback makes such flag true, as does receiving or sending +an MP_FAIL option. + +The field 'allow_infinite_fallback' is now always touched under the +relevant lock, so we can drop the ONCE annotation on write. + +Fixes: 478d770008b0 ("mptcp: send out MP_FAIL when data checksum fails") +Cc: stable@vger.kernel.org +Signed-off-by: Paolo Abeni +Reviewed-by: Matthieu Baerts (NGI0) +Signed-off-by: Matthieu Baerts (NGI0) +Link: https://patch.msgid.link/20250714-net-mptcp-fallback-races-v1-2-391aff963322@kernel.org +Signed-off-by: Jakub Kicinski +[ Conflicts in subflow.c, because commit f1f26512a9bf ("mptcp: use plain + bool instead of custom binary enum") and commit 46a5d3abedbe + ("mptcp: fix typos in comments") are not in this version. Both are + causing conflicts in the context, and the same modifications can still + be applied. Same in protocol.h with commit b8dc6d6ce931 ("mptcp: fix + rcv buffer auto-tuning"). Conflicts in protocol.c because commit + ee2708aedad0 ("mptcp: use get_retrans wrapper") is not in this version + and refactors the code in __mptcp_retrans(), but the modification can + still be applied, just not at the same indentation level. 
] +Signed-off-by: Matthieu Baerts (NGI0) +Signed-off-by: Greg Kroah-Hartman +--- + net/mptcp/pm.c | 8 +++++++- + net/mptcp/protocol.c | 11 ++++++----- + net/mptcp/protocol.h | 7 +++++-- + net/mptcp/subflow.c | 19 ++++++++++++++----- + 4 files changed, 32 insertions(+), 13 deletions(-) + +--- a/net/mptcp/pm.c ++++ b/net/mptcp/pm.c +@@ -309,8 +309,14 @@ void mptcp_pm_mp_fail_received(struct so + + pr_debug("fail_seq=%llu\n", fail_seq); + +- if (!READ_ONCE(msk->allow_infinite_fallback)) ++ /* After accepting the fail, we can't create any other subflows */ ++ spin_lock_bh(&msk->fallback_lock); ++ if (!msk->allow_infinite_fallback) { ++ spin_unlock_bh(&msk->fallback_lock); + return; ++ } ++ msk->allow_subflows = false; ++ spin_unlock_bh(&msk->fallback_lock); + + if (!subflow->fail_tout) { + pr_debug("send MP_FAIL response and infinite map\n"); +--- a/net/mptcp/protocol.c ++++ b/net/mptcp/protocol.c +@@ -885,7 +885,7 @@ void mptcp_data_ready(struct sock *sk, s + static void mptcp_subflow_joined(struct mptcp_sock *msk, struct sock *ssk) + { + mptcp_subflow_ctx(ssk)->map_seq = READ_ONCE(msk->ack_seq); +- WRITE_ONCE(msk->allow_infinite_fallback, false); ++ msk->allow_infinite_fallback = false; + mptcp_event(MPTCP_EVENT_SUB_ESTABLISHED, msk, ssk, GFP_ATOMIC); + } + +@@ -897,7 +897,7 @@ static bool __mptcp_finish_join(struct m + return false; + + spin_lock_bh(&msk->fallback_lock); +- if (__mptcp_check_fallback(msk)) { ++ if (!msk->allow_subflows) { + spin_unlock_bh(&msk->fallback_lock); + return false; + } +@@ -2707,7 +2707,7 @@ static void __mptcp_retrans(struct sock + dfrag->already_sent = max(dfrag->already_sent, info.sent); + tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle, + info.size_goal); +- WRITE_ONCE(msk->allow_infinite_fallback, false); ++ msk->allow_infinite_fallback = false; + } + spin_unlock_bh(&msk->fallback_lock); + +@@ -2835,7 +2835,8 @@ static int __mptcp_init_sock(struct sock + WRITE_ONCE(msk->first, NULL); + inet_csk(sk)->icsk_sync_mss = 
mptcp_sync_mss; + WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk))); +- WRITE_ONCE(msk->allow_infinite_fallback, true); ++ msk->allow_infinite_fallback = true; ++ msk->allow_subflows = true; + msk->recovery = false; + + mptcp_pm_data_init(msk); +@@ -3673,7 +3674,7 @@ bool mptcp_finish_join(struct sock *ssk) + /* active subflow, already present inside the conn_list */ + if (!list_empty(&subflow->node)) { + spin_lock_bh(&msk->fallback_lock); +- if (__mptcp_check_fallback(msk)) { ++ if (!msk->allow_subflows) { + spin_unlock_bh(&msk->fallback_lock); + return false; + } +--- a/net/mptcp/protocol.h ++++ b/net/mptcp/protocol.h +@@ -314,12 +314,14 @@ struct mptcp_sock { + u64 time; /* start time of measurement window */ + u64 rtt_us; /* last maximum rtt of subflows */ + } rcvq_space; ++ bool allow_subflows; + + u32 setsockopt_seq; + char ca_name[TCP_CA_NAME_MAX]; + +- spinlock_t fallback_lock; /* protects fallback and +- * allow_infinite_fallback ++ spinlock_t fallback_lock; /* protects fallback, ++ * allow_infinite_fallback and ++ * allow_subflows + */ + }; + +@@ -991,6 +993,7 @@ static inline bool __mptcp_try_fallback( + return false; + } + ++ msk->allow_subflows = false; + set_bit(MPTCP_FALLBACK_DONE, &msk->flags); + spin_unlock_bh(&msk->fallback_lock); + return true; +--- a/net/mptcp/subflow.c ++++ b/net/mptcp/subflow.c +@@ -1168,20 +1168,29 @@ static void subflow_sched_work_if_closed + mptcp_schedule_work(sk); + } + +-static void mptcp_subflow_fail(struct mptcp_sock *msk, struct sock *ssk) ++static bool mptcp_subflow_fail(struct mptcp_sock *msk, struct sock *ssk) + { + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + unsigned long fail_tout; + ++ /* we are really failing, prevent any later subflow join */ ++ spin_lock_bh(&msk->fallback_lock); ++ if (!msk->allow_infinite_fallback) { ++ spin_unlock_bh(&msk->fallback_lock); ++ return false; ++ } ++ msk->allow_subflows = false; ++ spin_unlock_bh(&msk->fallback_lock); ++ + /* greceful 
failure can happen only on the MPC subflow */ + if (WARN_ON_ONCE(ssk != READ_ONCE(msk->first))) +- return; ++ return false; + + /* since the close timeout take precedence on the fail one, + * no need to start the latter when the first is already set + */ + if (sock_flag((struct sock *)msk, SOCK_DEAD)) +- return; ++ return true; + + /* we don't need extreme accuracy here, use a zero fail_tout as special + * value meaning no fail timeout at all; +@@ -1193,6 +1202,7 @@ static void mptcp_subflow_fail(struct mp + tcp_send_ack(ssk); + + mptcp_reset_tout_timer(msk, subflow->fail_tout); ++ return true; + } + + static bool subflow_check_data_avail(struct sock *ssk) +@@ -1261,12 +1271,11 @@ fallback: + (subflow->mp_join || subflow->valid_csum_seen)) { + subflow->send_mp_fail = 1; + +- if (!READ_ONCE(msk->allow_infinite_fallback)) { ++ if (!mptcp_subflow_fail(msk, ssk)) { + subflow->reset_transient = 0; + subflow->reset_reason = MPTCP_RST_EMIDDLEBOX; + goto reset; + } +- mptcp_subflow_fail(msk, ssk); + WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_DATA_AVAIL); + return true; + } diff --git a/queue-6.1/mptcp-reset-fallback-status-gracefully-at-disconnect-time.patch b/queue-6.1/mptcp-reset-fallback-status-gracefully-at-disconnect-time.patch new file mode 100644 index 0000000000..a7fea43e87 --- /dev/null +++ b/queue-6.1/mptcp-reset-fallback-status-gracefully-at-disconnect-time.patch @@ -0,0 +1,58 @@ +From stable+bounces-164938-greg=kroah.com@vger.kernel.org Mon Jul 28 15:29:45 2025 +From: "Matthieu Baerts (NGI0)" +Date: Mon, 28 Jul 2025 15:29:23 +0200 +Subject: mptcp: reset fallback status gracefully at disconnect() time +To: mptcp@lists.linux.dev, stable@vger.kernel.org, gregkh@linuxfoundation.org +Cc: Paolo Abeni , sashal@kernel.org, "Matthieu Baerts (NGI0)" , Jakub Kicinski +Message-ID: <20250728132919.3904847-8-matttbe@kernel.org> + +From: Paolo Abeni + +commit da9b2fc7b73d147d88abe1922de5ab72d72d7756 upstream. 
+ +mptcp_disconnect() clears the fallback bit unconditionally, without +touching the associated flags. + +The bit clear is safe, as no fallback operation can race with that -- +all subflows are already in TCP_CLOSE status thanks to the previous +FASTCLOSE -- but we need to consistently reset all the fallback related +status. + +Also acquire the relevant lock, to avoid fouling static analyzers. + +Fixes: b29fcfb54cd7 ("mptcp: full disconnect implementation") +Cc: stable@vger.kernel.org +Signed-off-by: Paolo Abeni +Reviewed-by: Matthieu Baerts (NGI0) +Signed-off-by: Matthieu Baerts (NGI0) +Link: https://patch.msgid.link/20250714-net-mptcp-fallback-races-v1-3-391aff963322@kernel.org +Signed-off-by: Jakub Kicinski +[ Conflicts in protocol.c, because commit ebc1e08f01eb ("mptcp: drop + last_snd and MPTCP_RESET_SCHEDULER") is not in this version and + changed the context. The same modification can still be applied at the + same place. ] +Signed-off-by: Matthieu Baerts (NGI0) +Signed-off-by: Greg Kroah-Hartman +--- + net/mptcp/protocol.c | 9 +++++++++ + 1 file changed, 9 insertions(+) + +--- a/net/mptcp/protocol.c ++++ b/net/mptcp/protocol.c +@@ -3204,7 +3204,16 @@ static int mptcp_disconnect(struct sock + */ + mptcp_destroy_common(msk, MPTCP_CF_FASTCLOSE); + msk->last_snd = NULL; ++ ++ /* The first subflow is already in TCP_CLOSE status, the following ++ * can't overlap with a fallback anymore ++ */ ++ spin_lock_bh(&msk->fallback_lock); ++ msk->allow_subflows = true; ++ msk->allow_infinite_fallback = true; + WRITE_ONCE(msk->flags, 0); ++ spin_unlock_bh(&msk->fallback_lock); ++ + msk->cb_flags = 0; + msk->recovery = false; + msk->can_ack = false; diff --git a/queue-6.1/selftests-memfd-add-test-for-mapping-write-sealed-memfd-read-only.patch b/queue-6.1/selftests-memfd-add-test-for-mapping-write-sealed-memfd-read-only.patch new file mode 100644 index 0000000000..1988aa9fa8 --- /dev/null +++ b/queue-6.1/selftests-memfd-add-test-for-mapping-write-sealed-memfd-read-only.patch 
@@ -0,0 +1,98 @@ +From stable+bounces-165167-greg=kroah.com@vger.kernel.org Wed Jul 30 03:54:17 2025 +From: "Isaac J. Manjarres" +Date: Tue, 29 Jul 2025 18:52:43 -0700 +Subject: selftests/memfd: add test for mapping write-sealed memfd read-only +To: lorenzo.stoakes@oracle.com, gregkh@linuxfoundation.org, Shuah Khan +Cc: aliceryhl@google.com, surenb@google.com, stable@vger.kernel.org, "Isaac J. Manjarres" , kernel-team@android.com, Jann Horn , Julian Orth , "Liam R. Howlett" , Linus Torvalds , Vlastimil Babka , Andrew Morton , linux-kselftest@vger.kernel.org, linux-kernel@vger.kernel.org +Message-ID: <20250730015247.30827-5-isaacmanjarres@google.com> + +From: Lorenzo Stoakes + +[ Upstream commit ea0916e01d0b0f2cce1369ac1494239a79827270 ] + +Now we have reinstated the ability to map F_SEAL_WRITE mappings read-only, +assert that we are able to do this in a test to ensure that we do not +regress this again. + +Link: https://lkml.kernel.org/r/a6377ec470b14c0539b4600cf8fa24bf2e4858ae.1732804776.git.lorenzo.stoakes@oracle.com +Signed-off-by: Lorenzo Stoakes +Cc: Jann Horn +Cc: Julian Orth +Cc: Liam R. Howlett +Cc: Linus Torvalds +Cc: Shuah Khan +Cc: Vlastimil Babka +Signed-off-by: Andrew Morton +Cc: stable@vger.kernel.org +Signed-off-by: Isaac J. 
Manjarres +Signed-off-by: Greg Kroah-Hartman +--- + tools/testing/selftests/memfd/memfd_test.c | 43 +++++++++++++++++++++++++++++ + 1 file changed, 43 insertions(+) + +--- a/tools/testing/selftests/memfd/memfd_test.c ++++ b/tools/testing/selftests/memfd/memfd_test.c +@@ -186,6 +186,24 @@ static void *mfd_assert_mmap_shared(int + return p; + } + ++static void *mfd_assert_mmap_read_shared(int fd) ++{ ++ void *p; ++ ++ p = mmap(NULL, ++ mfd_def_size, ++ PROT_READ, ++ MAP_SHARED, ++ fd, ++ 0); ++ if (p == MAP_FAILED) { ++ printf("mmap() failed: %m\n"); ++ abort(); ++ } ++ ++ return p; ++} ++ + static void *mfd_assert_mmap_private(int fd) + { + void *p; +@@ -802,6 +820,30 @@ static void test_seal_future_write(void) + close(fd); + } + ++static void test_seal_write_map_read_shared(void) ++{ ++ int fd; ++ void *p; ++ ++ printf("%s SEAL-WRITE-MAP-READ\n", memfd_str); ++ ++ fd = mfd_assert_new("kern_memfd_seal_write_map_read", ++ mfd_def_size, ++ MFD_CLOEXEC | MFD_ALLOW_SEALING); ++ ++ mfd_assert_add_seals(fd, F_SEAL_WRITE); ++ mfd_assert_has_seals(fd, F_SEAL_WRITE); ++ ++ p = mfd_assert_mmap_read_shared(fd); ++ ++ mfd_assert_read(fd); ++ mfd_assert_read_shared(fd); ++ mfd_fail_write(fd); ++ ++ munmap(p, mfd_def_size); ++ close(fd); ++} ++ + /* + * Test SEAL_SHRINK + * Test whether SEAL_SHRINK actually prevents shrinking +@@ -1056,6 +1098,7 @@ int main(int argc, char **argv) + + test_seal_write(); + test_seal_future_write(); ++ test_seal_write_map_read_shared(); + test_seal_shrink(); + test_seal_grow(); + test_seal_resize(); diff --git a/queue-6.1/series b/queue-6.1/series index 65cf4a8d28..c0add04357 100644 --- a/queue-6.1/series +++ b/queue-6.1/series @@ -375,3 +375,19 @@ platform-chrome-cros_ec-unregister-notifier-in-cros_ec_unregister.patch usb-dwc3-imx8mp-fix-device-leak-at-unbind.patch ata-fix-sata_mobile_lpm_policy-description-in-kconfig.patch btrfs-populate-otime-when-logging-an-inode-item.patch +tls-separate-no-async-decryption-request-handling-from-async.patch 
+crypto-qat-fix-ring-to-service-map-for-qat-gen4.patch +arm64-cpufeatures-kvm-add-armv8.9-feat_ecbhb-bits-in-id_aa64mmfr1-register.patch +kvm-x86-take-irqfds.lock-when-adding-deleting-irq-bypass-producer.patch +mptcp-make-fallback-action-and-fallback-decision-atomic.patch +mptcp-plug-races-between-subflow-fail-and-subflow-creation.patch +mptcp-reset-fallback-status-gracefully-at-disconnect-time.patch +mm-drop-the-assumption-that-vm_shared-always-implies-writable.patch +mm-update-memfd-seal-write-check-to-include-f_seal_write.patch +mm-reinstate-ability-to-map-write-sealed-memfd-mappings-read-only.patch +selftests-memfd-add-test-for-mapping-write-sealed-memfd-read-only.patch +bluetooth-hci_sync-fix-uaf-on-hci_abort_conn_sync.patch +kbuild-userprogs-use-correct-linker-when-mixing-clang-and-gnu-ld.patch +x86-reboot-harden-virtualization-hooks-for-emergency-reboot.patch +x86-reboot-kvm-handle-vmxoff-in-kvm-s-reboot-callback.patch +kvm-vmx-flush-shadow-vmcs-on-emergency-reboot.patch diff --git a/queue-6.1/tls-separate-no-async-decryption-request-handling-from-async.patch b/queue-6.1/tls-separate-no-async-decryption-request-handling-from-async.patch new file mode 100644 index 0000000000..0ac3508c3d --- /dev/null +++ b/queue-6.1/tls-separate-no-async-decryption-request-handling-from-async.patch @@ -0,0 +1,61 @@ +From 41532b785e9d79636b3815a64ddf6a096647d011 Mon Sep 17 00:00:00 2001 +From: Sabrina Dubroca +Date: Wed, 28 Feb 2024 23:43:59 +0100 +Subject: tls: separate no-async decryption request handling from async + +From: Sabrina Dubroca + +commit 41532b785e9d79636b3815a64ddf6a096647d011 upstream. + +If we're not doing async, the handling is much simpler. There's no +reference counting, we just need to wait for the completion to wake us +up and return its result. + +We should preferably also use a separate crypto_wait. I'm not seeing a +UAF as I did in the past, I think aec7961916f3 ("tls: fix race between +async notify and socket close") took care of it. 
+ +This will make the next fix easier. + +Signed-off-by: Sabrina Dubroca +Link: https://lore.kernel.org/r/47bde5f649707610eaef9f0d679519966fc31061.1709132643.git.sd@queasysnail.net +Signed-off-by: Jakub Kicinski +[ William: The original patch did not apply cleanly due to deletions of + non-existent lines in 6.1.y. The UAF the author stopped seeing can still + be reproduced on systems without AVX in conjunction with cryptd. + Also removed an extraneous statement after a return statement that is + adjacent to diff. ] +Link: https://lore.kernel.org/netdev/he2K1yz_u7bZ-CnYcTSQ4OxuLuHZXN6xZRgp6_ICSWnq8J5FpI_uD1i_1lTSf7WMrYb5ThiX1OR2GTOB2IltgT49Koy7Hhutr4du4KtLvyk=@willsroot.io/ +Signed-off-by: William Liu +Signed-off-by: Greg Kroah-Hartman +--- + net/tls/tls_sw.c | 9 +++++++-- + 1 file changed, 7 insertions(+), 2 deletions(-) + +--- a/net/tls/tls_sw.c ++++ b/net/tls/tls_sw.c +@@ -274,9 +274,15 @@ static int tls_do_decryption(struct sock + DEBUG_NET_WARN_ON_ONCE(atomic_read(&ctx->decrypt_pending) < 1); + atomic_inc(&ctx->decrypt_pending); + } else { ++ DECLARE_CRYPTO_WAIT(wait); ++ + aead_request_set_callback(aead_req, + CRYPTO_TFM_REQ_MAY_BACKLOG, +- crypto_req_done, &ctx->async_wait); ++ crypto_req_done, &wait); ++ ret = crypto_aead_decrypt(aead_req); ++ if (ret == -EINPROGRESS || ret == -EBUSY) ++ ret = crypto_wait_req(ret, &wait); ++ return ret; + } + + ret = crypto_aead_decrypt(aead_req); +@@ -289,7 +295,6 @@ static int tls_do_decryption(struct sock + /* all completions have run, we're not doing async anymore */ + darg->async = false; + return ret; +- ret = ret ?: -EINPROGRESS; + } + + atomic_dec(&ctx->decrypt_pending); diff --git a/queue-6.1/x86-reboot-harden-virtualization-hooks-for-emergency-reboot.patch b/queue-6.1/x86-reboot-harden-virtualization-hooks-for-emergency-reboot.patch new file mode 100644 index 0000000000..3dcf3fa135 --- /dev/null +++ b/queue-6.1/x86-reboot-harden-virtualization-hooks-for-emergency-reboot.patch @@ -0,0 +1,113 @@ +From 
stable+bounces-164647-greg=kroah.com@vger.kernel.org Thu Jul 24 19:07:39 2025 +From: Sasha Levin +Date: Thu, 24 Jul 2025 13:07:23 -0400 +Subject: x86/reboot: Harden virtualization hooks for emergency reboot +To: stable@vger.kernel.org +Cc: Sean Christopherson , Kai Huang , Sasha Levin +Message-ID: <20250724170725.1404455-1-sashal@kernel.org> + +From: Sean Christopherson + +[ Upstream commit 5e408396c60cd0f0b53a43713016b6d6af8d69e0 ] + +Provide dedicated helpers to (un)register virt hooks used during an +emergency crash/reboot, and WARN if there is an attempt to overwrite +the registered callback, or an attempt to do an unpaired unregister. + +Opportunistically use rcu_assign_pointer() instead of RCU_INIT_POINTER(), +mainly so that the set/unset paths are more symmetrical, but also because +any performance gains from using RCU_INIT_POINTER() are meaningless for +this code. + +Reviewed-by: Kai Huang +Link: https://lore.kernel.org/r/20230721201859.2307736-3-seanjc@google.com +Signed-off-by: Sean Christopherson +Stable-dep-of: a0ee1d5faff1 ("KVM: VMX: Flush shadow VMCS on emergency reboot") +Signed-off-by: Sasha Levin +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/include/asm/reboot.h | 5 +++-- + arch/x86/kernel/reboot.c | 30 ++++++++++++++++++++++++------ + arch/x86/kvm/vmx/vmx.c | 6 ++---- + 3 files changed, 29 insertions(+), 12 deletions(-) + +--- a/arch/x86/include/asm/reboot.h ++++ b/arch/x86/include/asm/reboot.h +@@ -25,8 +25,9 @@ void __noreturn machine_real_restart(uns + #define MRR_BIOS 0 + #define MRR_APM 1 + +-typedef void crash_vmclear_fn(void); +-extern crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss; ++typedef void (cpu_emergency_virt_cb)(void); ++void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback); ++void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback); + void cpu_emergency_disable_virtualization(void); + + typedef void (*nmi_shootdown_cb)(int, struct pt_regs*); +--- a/arch/x86/kernel/reboot.c ++++
b/arch/x86/kernel/reboot.c +@@ -794,17 +794,35 @@ void machine_crash_shutdown(struct pt_re + * + * protected by rcu. + */ +-crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss; +-EXPORT_SYMBOL_GPL(crash_vmclear_loaded_vmcss); ++static cpu_emergency_virt_cb __rcu *cpu_emergency_virt_callback; ++ ++void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback) ++{ ++ if (WARN_ON_ONCE(rcu_access_pointer(cpu_emergency_virt_callback))) ++ return; ++ ++ rcu_assign_pointer(cpu_emergency_virt_callback, callback); ++} ++EXPORT_SYMBOL_GPL(cpu_emergency_register_virt_callback); ++ ++void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback) ++{ ++ if (WARN_ON_ONCE(rcu_access_pointer(cpu_emergency_virt_callback) != callback)) ++ return; ++ ++ rcu_assign_pointer(cpu_emergency_virt_callback, NULL); ++ synchronize_rcu(); ++} ++EXPORT_SYMBOL_GPL(cpu_emergency_unregister_virt_callback); + + static inline void cpu_crash_vmclear_loaded_vmcss(void) + { +- crash_vmclear_fn *do_vmclear_operation = NULL; ++ cpu_emergency_virt_cb *callback; + + rcu_read_lock(); +- do_vmclear_operation = rcu_dereference(crash_vmclear_loaded_vmcss); +- if (do_vmclear_operation) +- do_vmclear_operation(); ++ callback = rcu_dereference(cpu_emergency_virt_callback); ++ if (callback) ++ callback(); + rcu_read_unlock(); + } + +--- a/arch/x86/kvm/vmx/vmx.c ++++ b/arch/x86/kvm/vmx/vmx.c +@@ -8602,8 +8602,7 @@ static void __vmx_exit(void) + { + allow_smaller_maxphyaddr = false; + +- RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL); +- synchronize_rcu(); ++ cpu_emergency_unregister_virt_callback(crash_vmclear_local_loaded_vmcss); + + vmx_cleanup_l1d_flush(); + } +@@ -8677,8 +8676,7 @@ static int __init vmx_init(void) + pi_init_cpu(cpu); + } + +- rcu_assign_pointer(crash_vmclear_loaded_vmcss, +- crash_vmclear_local_loaded_vmcss); ++ cpu_emergency_register_virt_callback(crash_vmclear_local_loaded_vmcss); + + vmx_check_vmcs12_offsets(); + diff --git 
a/queue-6.1/x86-reboot-kvm-handle-vmxoff-in-kvm-s-reboot-callback.patch b/queue-6.1/x86-reboot-kvm-handle-vmxoff-in-kvm-s-reboot-callback.patch new file mode 100644 index 0000000000..f1d5fdf69d --- /dev/null +++ b/queue-6.1/x86-reboot-kvm-handle-vmxoff-in-kvm-s-reboot-callback.patch @@ -0,0 +1,139 @@ +From stable+bounces-164648-greg=kroah.com@vger.kernel.org Thu Jul 24 19:07:43 2025 +From: Sasha Levin +Date: Thu, 24 Jul 2025 13:07:24 -0400 +Subject: x86/reboot: KVM: Handle VMXOFF in KVM's reboot callback +To: stable@vger.kernel.org +Cc: Sean Christopherson , Kai Huang , Sasha Levin +Message-ID: <20250724170725.1404455-2-sashal@kernel.org> + +From: Sean Christopherson + +[ Upstream commit 119b5cb4ffd0166f3e98e9ee042f5046f7744f28 ] + +Use KVM VMX's reboot/crash callback to do VMXOFF in an emergency instead +of manually and blindly doing VMXOFF. There's no need to attempt VMXOFF +if a hypervisor, i.e. KVM, isn't loaded/active, i.e. if the CPU can't +possibly be post-VMXON. + +Reviewed-by: Kai Huang +Link: https://lore.kernel.org/r/20230721201859.2307736-4-seanjc@google.com +Signed-off-by: Sean Christopherson +Stable-dep-of: a0ee1d5faff1 ("KVM: VMX: Flush shadow VMCS on emergency reboot") +Signed-off-by: Sasha Levin +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/include/asm/virtext.h | 10 ---------- + arch/x86/kernel/reboot.c | 29 +++++++++-------------------- + arch/x86/kvm/vmx/vmx.c | 8 +++++--- + 3 files changed, 14 insertions(+), 33 deletions(-) + +--- a/arch/x86/include/asm/virtext.h ++++ b/arch/x86/include/asm/virtext.h +@@ -70,16 +70,6 @@ static inline void __cpu_emergency_vmxof + cpu_vmxoff(); + } + +-/** Disable VMX if it is supported and enabled on the current CPU +- */ +-static inline void cpu_emergency_vmxoff(void) +-{ +- if (cpu_has_vmx()) +- __cpu_emergency_vmxoff(); +-} +- +- +- + + /* + * SVM functions: +--- a/arch/x86/kernel/reboot.c ++++ b/arch/x86/kernel/reboot.c +@@ -787,13 +787,7 @@ void machine_crash_shutdown(struct pt_re + } + #endif + +-/* 
+- * This is used to VMCLEAR all VMCSs loaded on the +- * processor. And when loading kvm_intel module, the +- * callback function pointer will be assigned. +- * +- * protected by rcu. +- */ ++/* RCU-protected callback to disable virtualization prior to reboot. */ + static cpu_emergency_virt_cb __rcu *cpu_emergency_virt_callback; + + void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback) +@@ -815,17 +809,6 @@ void cpu_emergency_unregister_virt_callb + } + EXPORT_SYMBOL_GPL(cpu_emergency_unregister_virt_callback); + +-static inline void cpu_crash_vmclear_loaded_vmcss(void) +-{ +- cpu_emergency_virt_cb *callback; +- +- rcu_read_lock(); +- callback = rcu_dereference(cpu_emergency_virt_callback); +- if (callback) +- callback(); +- rcu_read_unlock(); +-} +- + /* This is the CPU performing the emergency shutdown work. */ + int crashing_cpu = -1; + +@@ -836,9 +819,15 @@ int crashing_cpu = -1; + */ + void cpu_emergency_disable_virtualization(void) + { +- cpu_crash_vmclear_loaded_vmcss(); ++ cpu_emergency_virt_cb *callback; ++ ++ rcu_read_lock(); ++ callback = rcu_dereference(cpu_emergency_virt_callback); ++ if (callback) ++ callback(); ++ rcu_read_unlock(); + +- cpu_emergency_vmxoff(); ++ /* KVM_AMD doesn't yet utilize the common callback. 
*/ + cpu_emergency_svm_disable(); + } + +--- a/arch/x86/kvm/vmx/vmx.c ++++ b/arch/x86/kvm/vmx/vmx.c +@@ -707,7 +707,7 @@ static int vmx_set_guest_uret_msr(struct + return ret; + } + +-static void crash_vmclear_local_loaded_vmcss(void) ++static void vmx_emergency_disable(void) + { + int cpu = raw_smp_processor_id(); + struct loaded_vmcs *v; +@@ -715,6 +715,8 @@ static void crash_vmclear_local_loaded_v + list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu), + loaded_vmcss_on_cpu_link) + vmcs_clear(v->vmcs); ++ ++ __cpu_emergency_vmxoff(); + } + + static void __loaded_vmcs_clear(void *arg) +@@ -8602,7 +8604,7 @@ static void __vmx_exit(void) + { + allow_smaller_maxphyaddr = false; + +- cpu_emergency_unregister_virt_callback(crash_vmclear_local_loaded_vmcss); ++ cpu_emergency_unregister_virt_callback(vmx_emergency_disable); + + vmx_cleanup_l1d_flush(); + } +@@ -8676,7 +8678,7 @@ static int __init vmx_init(void) + pi_init_cpu(cpu); + } + +- cpu_emergency_register_virt_callback(crash_vmclear_local_loaded_vmcss); ++ cpu_emergency_register_virt_callback(vmx_emergency_disable); + + vmx_check_vmcs12_offsets(); +