git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
6.1-stable patches
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Fri, 22 Aug 2025 13:10:58 +0000 (15:10 +0200)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Fri, 22 Aug 2025 13:10:58 +0000 (15:10 +0200)
added patches:
arm64-cpufeatures-kvm-add-armv8.9-feat_ecbhb-bits-in-id_aa64mmfr1-register.patch
bluetooth-hci_sync-fix-uaf-on-hci_abort_conn_sync.patch
crypto-qat-fix-ring-to-service-map-for-qat-gen4.patch
kbuild-userprogs-use-correct-linker-when-mixing-clang-and-gnu-ld.patch
kvm-vmx-flush-shadow-vmcs-on-emergency-reboot.patch
kvm-x86-take-irqfds.lock-when-adding-deleting-irq-bypass-producer.patch
mm-drop-the-assumption-that-vm_shared-always-implies-writable.patch
mm-reinstate-ability-to-map-write-sealed-memfd-mappings-read-only.patch
mm-update-memfd-seal-write-check-to-include-f_seal_write.patch
mptcp-make-fallback-action-and-fallback-decision-atomic.patch
mptcp-plug-races-between-subflow-fail-and-subflow-creation.patch
mptcp-reset-fallback-status-gracefully-at-disconnect-time.patch
selftests-memfd-add-test-for-mapping-write-sealed-memfd-read-only.patch
tls-separate-no-async-decryption-request-handling-from-async.patch
x86-reboot-harden-virtualization-hooks-for-emergency-reboot.patch
x86-reboot-kvm-handle-vmxoff-in-kvm-s-reboot-callback.patch

17 files changed:
queue-6.1/arm64-cpufeatures-kvm-add-armv8.9-feat_ecbhb-bits-in-id_aa64mmfr1-register.patch [new file with mode: 0644]
queue-6.1/bluetooth-hci_sync-fix-uaf-on-hci_abort_conn_sync.patch [new file with mode: 0644]
queue-6.1/crypto-qat-fix-ring-to-service-map-for-qat-gen4.patch [new file with mode: 0644]
queue-6.1/kbuild-userprogs-use-correct-linker-when-mixing-clang-and-gnu-ld.patch [new file with mode: 0644]
queue-6.1/kvm-vmx-flush-shadow-vmcs-on-emergency-reboot.patch [new file with mode: 0644]
queue-6.1/kvm-x86-take-irqfds.lock-when-adding-deleting-irq-bypass-producer.patch [new file with mode: 0644]
queue-6.1/mm-drop-the-assumption-that-vm_shared-always-implies-writable.patch [new file with mode: 0644]
queue-6.1/mm-reinstate-ability-to-map-write-sealed-memfd-mappings-read-only.patch [new file with mode: 0644]
queue-6.1/mm-update-memfd-seal-write-check-to-include-f_seal_write.patch [new file with mode: 0644]
queue-6.1/mptcp-make-fallback-action-and-fallback-decision-atomic.patch [new file with mode: 0644]
queue-6.1/mptcp-plug-races-between-subflow-fail-and-subflow-creation.patch [new file with mode: 0644]
queue-6.1/mptcp-reset-fallback-status-gracefully-at-disconnect-time.patch [new file with mode: 0644]
queue-6.1/selftests-memfd-add-test-for-mapping-write-sealed-memfd-read-only.patch [new file with mode: 0644]
queue-6.1/series
queue-6.1/tls-separate-no-async-decryption-request-handling-from-async.patch [new file with mode: 0644]
queue-6.1/x86-reboot-harden-virtualization-hooks-for-emergency-reboot.patch [new file with mode: 0644]
queue-6.1/x86-reboot-kvm-handle-vmxoff-in-kvm-s-reboot-callback.patch [new file with mode: 0644]

diff --git a/queue-6.1/arm64-cpufeatures-kvm-add-armv8.9-feat_ecbhb-bits-in-id_aa64mmfr1-register.patch b/queue-6.1/arm64-cpufeatures-kvm-add-armv8.9-feat_ecbhb-bits-in-id_aa64mmfr1-register.patch
new file mode 100644 (file)
index 0000000..b4d2737
--- /dev/null
@@ -0,0 +1,37 @@
+From e8cde32f111f7f5681a7bad3ec747e9e697569a9 Mon Sep 17 00:00:00 2001
+From: Nianyao Tang <tangnianyao@huawei.com>
+Date: Tue, 11 Jun 2024 12:20:49 +0000
+Subject: arm64/cpufeatures/kvm: Add ARMv8.9 FEAT_ECBHB bits in ID_AA64MMFR1 register
+
+From: Nianyao Tang <tangnianyao@huawei.com>
+
+commit e8cde32f111f7f5681a7bad3ec747e9e697569a9 upstream.
+
+Enable ECBHB bits in ID_AA64MMFR1 register as per ARM DDI 0487K.a
+specification.
+
+When a guest OS reads ID_AA64MMFR1_EL1, KVM emulates this register using
+ftr_id_aa64mmfr1 and always returns ID_AA64MMFR1_EL1.ECBHB=0 to the guest.
+As a result, guest syscalls jump to the tramp ventry, which is not needed
+on implementations with ID_AA64MMFR1_EL1.ECBHB=1.
+Let's make the guest syscall path the same as the host's.
+
+Signed-off-by: Nianyao Tang <tangnianyao@huawei.com>
+Link: https://lore.kernel.org/r/20240611122049.2758600-1-tangnianyao@huawei.com
+Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+Signed-off-by: Patrick Roy <roypat@amazon.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/kernel/cpufeature.c |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/arch/arm64/kernel/cpufeature.c
++++ b/arch/arm64/kernel/cpufeature.c
+@@ -343,6 +343,7 @@ static const struct arm64_ftr_bits ftr_i
+ };
+ static const struct arm64_ftr_bits ftr_id_aa64mmfr1[] = {
++      ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_ECBHB_SHIFT, 4, 0),
+       ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_TIDCP1_SHIFT, 4, 0),
+       ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_AFP_SHIFT, 4, 0),
+       ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_ETS_SHIFT, 4, 0),
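With this entry in ftr_id_aa64mmfr1[], the sanitised ID_AA64MMFR1_EL1 value can carry the host's ECBHB field through to KVM guests instead of always reading as 0. A minimal standalone sketch of extracting the 4-bit field is below; the shift is an assumption (bits [63:60] per the ARM ARM) and the macro name is ours, not the kernel's, since the hunk above only shows the field width and safe value.

/* Minimal sketch, not kernel code: pull the 4-bit ECBHB field out of an
 * ID_AA64MMFR1_EL1 value. The shift (bits [63:60]) is an assumption. */
#include <stdint.h>
#include <stdio.h>

#define MMFR1_ECBHB_SHIFT 60	/* assumed field position */

static unsigned int mmfr1_ecbhb(uint64_t mmfr1)
{
	return (unsigned int)((mmfr1 >> MMFR1_ECBHB_SHIFT) & 0xf);
}

int main(void)
{
	uint64_t sanitised = (uint64_t)1 << MMFR1_ECBHB_SHIFT;	/* ECBHB=1 */

	/* Before the patch a guest reading the register always saw 0 here. */
	printf("ECBHB = %u\n", mmfr1_ecbhb(sanitised));
	return 0;
}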
diff --git a/queue-6.1/bluetooth-hci_sync-fix-uaf-on-hci_abort_conn_sync.patch b/queue-6.1/bluetooth-hci_sync-fix-uaf-on-hci_abort_conn_sync.patch
new file mode 100644 (file)
index 0000000..9a60af9
--- /dev/null
@@ -0,0 +1,105 @@
+From stable+bounces-167094-greg=kroah.com@vger.kernel.org Tue Aug 12 04:16:01 2025
+From: Sumanth Gavini <sumanth.gavini@yahoo.com>
+Date: Mon, 11 Aug 2025 20:34:55 -0500
+Subject: Bluetooth: hci_sync: Fix UAF on hci_abort_conn_sync
+To: marcel@holtmann.org, johan.hedberg@gmail.com, luiz.dentz@gmail.com, davem@davemloft.net, edumazet@google.com, kuba@kernel.org, pabeni@redhat.com
+Cc: Sumanth Gavini <sumanth.gavini@yahoo.com>, linux-bluetooth@vger.kernel.org, netdev@vger.kernel.org, linux-kernel@vger.kernel.org, stable@vger.kernel.org, Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
+Message-ID: <20250812013457.425332-1-sumanth.gavini@yahoo.com>
+
+From: Sumanth Gavini <sumanth.gavini@yahoo.com>
+
+commit 5af1f84ed13a416297ab9ced7537f4d5ae7f329a upstream.
+
+Connections may be cleaned up while waiting for the commands to complete,
+so check whether the connection handle remains valid in case of errors
+that would lead to calling hci_conn_failed():
+
+BUG: KASAN: slab-use-after-free in hci_conn_failed+0x1f/0x160
+Read of size 8 at addr ffff888001376958 by task kworker/u3:0/52
+
+CPU: 0 PID: 52 Comm: kworker/u3:0 Not tainted
+6.5.0-rc1-00527-g2dfe76d58d3a #5615
+Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS
+1.16.2-1.fc38 04/01/2014
+Workqueue: hci0 hci_cmd_sync_work
+Call Trace:
+ <TASK>
+ dump_stack_lvl+0x1d/0x70
+ print_report+0xce/0x620
+ ? __virt_addr_valid+0xd4/0x150
+ ? hci_conn_failed+0x1f/0x160
+ kasan_report+0xd1/0x100
+ ? hci_conn_failed+0x1f/0x160
+ hci_conn_failed+0x1f/0x160
+ hci_abort_conn_sync+0x237/0x360
+
+Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
+Signed-off-by: Sumanth Gavini <sumanth.gavini@yahoo.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/bluetooth/hci_sync.c |   43 +++++++++++++++++++++++++++++--------------
+ 1 file changed, 29 insertions(+), 14 deletions(-)
+
+--- a/net/bluetooth/hci_sync.c
++++ b/net/bluetooth/hci_sync.c
+@@ -5525,31 +5525,46 @@ static int hci_reject_conn_sync(struct h
+ int hci_abort_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 reason)
+ {
+-      int err;
++      int err = 0;
++      u16 handle = conn->handle;
+       switch (conn->state) {
+       case BT_CONNECTED:
+       case BT_CONFIG:
+-              return hci_disconnect_sync(hdev, conn, reason);
++              err = hci_disconnect_sync(hdev, conn, reason);
++              break;
+       case BT_CONNECT:
+               err = hci_connect_cancel_sync(hdev, conn);
+-              /* Cleanup hci_conn object if it cannot be cancelled as it
+-               * likelly means the controller and host stack are out of sync.
+-               */
+-              if (err) {
+-                      hci_dev_lock(hdev);
+-                      hci_conn_failed(conn, err);
+-                      hci_dev_unlock(hdev);
+-              }
+-              return err;
++              break;
+       case BT_CONNECT2:
+-              return hci_reject_conn_sync(hdev, conn, reason);
++              err = hci_reject_conn_sync(hdev, conn, reason);
++              break;
+       default:
+               conn->state = BT_CLOSED;
+-              break;
++              return 0;
++      }
++
++      /* Cleanup hci_conn object if it cannot be cancelled as it
++       * likelly means the controller and host stack are out of sync
++       * or in case of LE it was still scanning so it can be cleanup
++       * safely.
++       */
++      if (err) {
++              struct hci_conn *c;
++
++              /* Check if the connection hasn't been cleanup while waiting
++               * commands to complete.
++               */
++              c = hci_conn_hash_lookup_handle(hdev, handle);
++              if (!c || c != conn)
++                      return 0;
++
++              hci_dev_lock(hdev);
++              hci_conn_failed(conn, err);
++              hci_dev_unlock(hdev);
+       }
+-      return 0;
++      return err;
+ }
+ static int hci_disconnect_all_sync(struct hci_dev *hdev, u8 reason)
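The rework above follows a general use-after-free avoidance pattern: capture a stable identifier (the connection handle) before issuing commands, and on an error path look the object up again by that identifier before dereferencing the pointer. A hedged userspace analogy is sketched below; registry, lookup_conn() and abort_conn() are hypothetical stand-ins for the connection hash, hci_conn_hash_lookup_handle() and hci_abort_conn_sync(), not Bluetooth code.

#include <stddef.h>
#include <stdio.h>

struct conn {
	unsigned short handle;
};

static struct conn *registry[16];	/* stands in for the connection hash */

static struct conn *lookup_conn(unsigned short handle)
{
	for (size_t i = 0; i < 16; i++)
		if (registry[i] && registry[i]->handle == handle)
			return registry[i];
	return NULL;
}

/* Mirrors the reworked control flow: re-validate the handle before using conn. */
static int abort_conn(struct conn *conn, int err)
{
	unsigned short handle = conn->handle;	/* capture before waiting */

	/* ... commands run here; the connection may be cleaned up meanwhile ... */

	if (err) {
		struct conn *c = lookup_conn(handle);

		if (!c || c != conn)	/* gone or recycled: nothing to fail */
			return 0;
		printf("conn %u failed: %d\n", conn->handle, err);
	}
	return err;
}

int main(void)
{
	struct conn c = { .handle = 7 };

	registry[0] = &c;
	return abort_conn(&c, -1) ? 1 : 0;
}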
diff --git a/queue-6.1/crypto-qat-fix-ring-to-service-map-for-qat-gen4.patch b/queue-6.1/crypto-qat-fix-ring-to-service-map-for-qat-gen4.patch
new file mode 100644 (file)
index 0000000..c691b40
--- /dev/null
@@ -0,0 +1,104 @@
+From a238487f7965d102794ed9f8aff0b667cd2ae886 Mon Sep 17 00:00:00 2001
+From: Giovanni Cabiddu <giovanni.cabiddu@intel.com>
+Date: Fri, 20 Oct 2023 15:49:23 +0200
+Subject: crypto: qat - fix ring to service map for QAT GEN4
+
+From: Giovanni Cabiddu <giovanni.cabiddu@intel.com>
+
+commit a238487f7965d102794ed9f8aff0b667cd2ae886 upstream.
+
+The 4xxx drivers hardcode the ring to service mapping. However, when
+additional configurations were added to the driver, the mappings were
+not updated. This implies that an incorrect mapping might be reported
+through pfvf for certain configurations.
+
+Add an algorithm that computes the correct ring to service mapping based
+on the firmware loaded on the device.
+
+Fixes: 0cec19c761e5 ("crypto: qat - add support for compression for 4xxx")
+Signed-off-by: Giovanni Cabiddu <giovanni.cabiddu@intel.com>
+Reviewed-by: Damian Muszynski <damian.muszynski@intel.com>
+Reviewed-by: Tero Kristo <tero.kristo@linux.intel.com>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+[Giovanni: backport to 6.1.y, conflict resolved simplifying the logic
+in the function get_ring_to_svc_map() as the QAT driver in v6.1 supports
+only limited configurations (crypto only and compression).  Differs from
+upstream as the ring to service mapping is hardcoded rather than being
+dynamically computed.]
+Reviewed-by: Ahsan Atta <ahsan.atta@intel.com>
+Tested-by: Ahsan Atta <ahsan.atta@intel.com>
+Signed-off-by: Giovanni Cabiddu <giovanni.cabiddu@intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/crypto/qat/qat_4xxx/adf_4xxx_hw_data.c    |   13 +++++++++++++
+ drivers/crypto/qat/qat_common/adf_accel_devices.h |    1 +
+ drivers/crypto/qat/qat_common/adf_gen4_hw_data.h  |    6 ++++++
+ drivers/crypto/qat/qat_common/adf_init.c          |    3 +++
+ 4 files changed, 23 insertions(+)
+
+--- a/drivers/crypto/qat/qat_4xxx/adf_4xxx_hw_data.c
++++ b/drivers/crypto/qat/qat_4xxx/adf_4xxx_hw_data.c
+@@ -297,6 +297,18 @@ static char *uof_get_name(struct adf_acc
+       return NULL;
+ }
++static u16 get_ring_to_svc_map(struct adf_accel_dev *accel_dev)
++{
++      switch (get_service_enabled(accel_dev)) {
++      case SVC_CY:
++              return ADF_GEN4_DEFAULT_RING_TO_SRV_MAP;
++      case SVC_DC:
++              return ADF_GEN4_DEFAULT_RING_TO_SRV_MAP_DC;
++      }
++
++      return 0;
++}
++
+ static u32 uof_get_ae_mask(struct adf_accel_dev *accel_dev, u32 obj_num)
+ {
+       switch (get_service_enabled(accel_dev)) {
+@@ -353,6 +365,7 @@ void adf_init_hw_data_4xxx(struct adf_hw
+       hw_data->uof_get_ae_mask = uof_get_ae_mask;
+       hw_data->set_msix_rttable = set_msix_default_rttable;
+       hw_data->set_ssm_wdtimer = adf_gen4_set_ssm_wdtimer;
++      hw_data->get_ring_to_svc_map = get_ring_to_svc_map;
+       hw_data->disable_iov = adf_disable_sriov;
+       hw_data->ring_pair_reset = adf_gen4_ring_pair_reset;
+       hw_data->enable_pm = adf_gen4_enable_pm;
+--- a/drivers/crypto/qat/qat_common/adf_accel_devices.h
++++ b/drivers/crypto/qat/qat_common/adf_accel_devices.h
+@@ -176,6 +176,7 @@ struct adf_hw_device_data {
+       void (*get_arb_info)(struct arb_info *arb_csrs_info);
+       void (*get_admin_info)(struct admin_info *admin_csrs_info);
+       enum dev_sku_info (*get_sku)(struct adf_hw_device_data *self);
++      u16 (*get_ring_to_svc_map)(struct adf_accel_dev *accel_dev);
+       int (*alloc_irq)(struct adf_accel_dev *accel_dev);
+       void (*free_irq)(struct adf_accel_dev *accel_dev);
+       void (*enable_error_correction)(struct adf_accel_dev *accel_dev);
+--- a/drivers/crypto/qat/qat_common/adf_gen4_hw_data.h
++++ b/drivers/crypto/qat/qat_common/adf_gen4_hw_data.h
+@@ -95,6 +95,12 @@ do { \
+                  ADF_RING_BUNDLE_SIZE * (bank) + \
+                  ADF_RING_CSR_RING_SRV_ARB_EN, (value))
++#define ADF_GEN4_DEFAULT_RING_TO_SRV_MAP_DC \
++      (COMP << ADF_CFG_SERV_RING_PAIR_0_SHIFT | \
++       COMP << ADF_CFG_SERV_RING_PAIR_1_SHIFT | \
++       COMP << ADF_CFG_SERV_RING_PAIR_2_SHIFT | \
++       COMP << ADF_CFG_SERV_RING_PAIR_3_SHIFT)
++
+ /* Default ring mapping */
+ #define ADF_GEN4_DEFAULT_RING_TO_SRV_MAP \
+       (ASYM << ADF_CFG_SERV_RING_PAIR_0_SHIFT | \
+--- a/drivers/crypto/qat/qat_common/adf_init.c
++++ b/drivers/crypto/qat/qat_common/adf_init.c
+@@ -95,6 +95,9 @@ int adf_dev_init(struct adf_accel_dev *a
+               return -EFAULT;
+       }
++      if (hw_data->get_ring_to_svc_map)
++              hw_data->ring_to_svc_map = hw_data->get_ring_to_svc_map(accel_dev);
++
+       if (adf_ae_init(accel_dev)) {
+               dev_err(&GET_DEV(accel_dev),
+                       "Failed to initialise Acceleration Engine\n");
diff --git a/queue-6.1/kbuild-userprogs-use-correct-linker-when-mixing-clang-and-gnu-ld.patch b/queue-6.1/kbuild-userprogs-use-correct-linker-when-mixing-clang-and-gnu-ld.patch
new file mode 100644 (file)
index 0000000..ff4d3ef
--- /dev/null
@@ -0,0 +1,44 @@
+From nathan@kernel.org Thu Aug 21 20:30:59 2025
+From: Nathan Chancellor <nathan@kernel.org>
+Date: Thu, 21 Aug 2025 11:30:51 -0700
+Subject: kbuild: userprogs: use correct linker when mixing clang and GNU ld
+To: gregkh@linuxfoundation.org, sashal@kernel.org
+Cc: stable@vger.kernel.org, nathan@kernel.org, thomas.weissschuh@linutronix.de
+Message-ID: <20250821183051.1259435-1-nathan@kernel.org>
+
+From: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
+
+commit 936599ca514973d44a766b7376c6bbdc96b6a8cc upstream.
+
+The userprogs infrastructure does not expect clang being used with GNU ld
+and in that case uses /usr/bin/ld for linking, not the configured $(LD).
+This fallback is problematic as it will break when cross-compiling.
+Mixing clang and GNU ld is used for example when building for SPARC64,
+as ld.lld is not sufficient; see Documentation/kbuild/llvm.rst.
+
+Relax the check around --ld-path so it gets used for all linkers.
+
+Fixes: dfc1b168a8c4 ("kbuild: userprogs: use correct lld when linking through clang")
+Cc: stable@vger.kernel.org
+Signed-off-by: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
+Reviewed-by: Nathan Chancellor <nathan@kernel.org>
+Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
+[nathan: Work around wrapping '--ld-path' in cc-option in older stable
+         branches due to older minimum LLVM version]
+Signed-off-by: Nathan Chancellor <nathan@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ Makefile |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/Makefile
++++ b/Makefile
+@@ -1143,7 +1143,7 @@ KBUILD_USERCFLAGS  += $(filter -m32 -m64
+ KBUILD_USERLDFLAGS += $(filter -m32 -m64 --target=%, $(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS))
+ # userspace programs are linked via the compiler, use the correct linker
+-ifeq ($(CONFIG_CC_IS_CLANG)$(CONFIG_LD_IS_LLD),yy)
++ifdef CONFIG_CC_IS_CLANG
+ KBUILD_USERLDFLAGS += $(call cc-option, --ld-path=$(LD))
+ endif
diff --git a/queue-6.1/kvm-vmx-flush-shadow-vmcs-on-emergency-reboot.patch b/queue-6.1/kvm-vmx-flush-shadow-vmcs-on-emergency-reboot.patch
new file mode 100644 (file)
index 0000000..23b0697
--- /dev/null
@@ -0,0 +1,52 @@
+From stable+bounces-164649-greg=kroah.com@vger.kernel.org Thu Jul 24 19:07:43 2025
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 24 Jul 2025 13:07:25 -0400
+Subject: KVM: VMX: Flush shadow VMCS on emergency reboot
+To: stable@vger.kernel.org
+Cc: Chao Gao <chao.gao@intel.com>, Kai Huang <kai.huang@intel.com>, Sean Christopherson <seanjc@google.com>, Sasha Levin <sashal@kernel.org>
+Message-ID: <20250724170725.1404455-3-sashal@kernel.org>
+
+From: Chao Gao <chao.gao@intel.com>
+
+[ Upstream commit a0ee1d5faff135e28810f29e0f06328c66f89852 ]
+
+Ensure the shadow VMCS cache is evicted during an emergency reboot to
+prevent potential memory corruption if the cache is evicted after reboot.
+
+This issue was identified through code inspection, as __loaded_vmcs_clear()
+flushes both the normal VMCS and the shadow VMCS.
+
+Avoid checking the "launched" state during an emergency reboot, unlike the
+behavior in __loaded_vmcs_clear(). This is important because reboot NMIs
+can interfere with operations like copy_shadow_to_vmcs12(), where shadow
+VMCSes are loaded directly using VMPTRLD. In such cases, if NMIs occur
+right after the VMCS load, the shadow VMCSes will be active but the
+"launched" state may not be set.
+
+Fixes: 16f5b9034b69 ("KVM: nVMX: Copy processor-specific shadow-vmcs to VMCS12")
+Cc: stable@vger.kernel.org
+Signed-off-by: Chao Gao <chao.gao@intel.com>
+Reviewed-by: Kai Huang <kai.huang@intel.com>
+Link: https://lore.kernel.org/r/20250324140849.2099723-1-chao.gao@intel.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/vmx/vmx.c |    5 ++++-
+ 1 file changed, 4 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -713,8 +713,11 @@ static void vmx_emergency_disable(void)
+       struct loaded_vmcs *v;
+       list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
+-                          loaded_vmcss_on_cpu_link)
++                          loaded_vmcss_on_cpu_link) {
+               vmcs_clear(v->vmcs);
++              if (v->shadow_vmcs)
++                      vmcs_clear(v->shadow_vmcs);
++      }
+       __cpu_emergency_vmxoff();
+ }
diff --git a/queue-6.1/kvm-x86-take-irqfds.lock-when-adding-deleting-irq-bypass-producer.patch b/queue-6.1/kvm-x86-take-irqfds.lock-when-adding-deleting-irq-bypass-producer.patch
new file mode 100644 (file)
index 0000000..d6e01fc
--- /dev/null
@@ -0,0 +1,82 @@
+From f1fb088d9cecde5c3066d8ff8846789667519b7d Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <seanjc@google.com>
+Date: Fri, 4 Apr 2025 12:38:19 -0700
+Subject: KVM: x86: Take irqfds.lock when adding/deleting IRQ bypass producer
+
+From: Sean Christopherson <seanjc@google.com>
+
+commit f1fb088d9cecde5c3066d8ff8846789667519b7d upstream.
+
+Take irqfds.lock when adding/deleting an IRQ bypass producer to ensure
+irqfd->producer isn't modified while kvm_irq_routing_update() is running.
+The only lock held when a producer is added/removed is irqbypass's mutex.
+
+Fixes: 872768800652 ("KVM: x86: select IRQ_BYPASS_MANAGER")
+Cc: stable@vger.kernel.org
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Message-ID: <20250404193923.1413163-5-seanjc@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+[sean: account for lack of kvm_x86_call()]
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/x86.c |   19 ++++++++++++++++---
+ 1 file changed, 16 insertions(+), 3 deletions(-)
+
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -13387,16 +13387,22 @@ int kvm_arch_irq_bypass_add_producer(str
+ {
+       struct kvm_kernel_irqfd *irqfd =
+               container_of(cons, struct kvm_kernel_irqfd, consumer);
++      struct kvm *kvm = irqfd->kvm;
+       int ret;
+-      irqfd->producer = prod;
+       kvm_arch_start_assignment(irqfd->kvm);
++
++      spin_lock_irq(&kvm->irqfds.lock);
++      irqfd->producer = prod;
++
+       ret = static_call(kvm_x86_pi_update_irte)(irqfd->kvm,
+                                        prod->irq, irqfd->gsi, 1);
+-
+       if (ret)
+               kvm_arch_end_assignment(irqfd->kvm);
++      spin_unlock_irq(&kvm->irqfds.lock);
++
++
+       return ret;
+ }
+@@ -13406,9 +13412,9 @@ void kvm_arch_irq_bypass_del_producer(st
+       int ret;
+       struct kvm_kernel_irqfd *irqfd =
+               container_of(cons, struct kvm_kernel_irqfd, consumer);
++      struct kvm *kvm = irqfd->kvm;
+       WARN_ON(irqfd->producer != prod);
+-      irqfd->producer = NULL;
+       /*
+        * When producer of consumer is unregistered, we change back to
+@@ -13416,11 +13422,18 @@ void kvm_arch_irq_bypass_del_producer(st
+        * when the irq is masked/disabled or the consumer side (KVM
+        * int this case doesn't want to receive the interrupts.
+       */
++      spin_lock_irq(&kvm->irqfds.lock);
++      irqfd->producer = NULL;
++
++
+       ret = static_call(kvm_x86_pi_update_irte)(irqfd->kvm, prod->irq, irqfd->gsi, 0);
+       if (ret)
+               printk(KERN_INFO "irq bypass consumer (token %p) unregistration"
+                      " fails: %d\n", irqfd->consumer.token, ret);
++      spin_unlock_irq(&kvm->irqfds.lock);
++
++
+       kvm_arch_end_assignment(irqfd->kvm);
+ }
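The rule the patch enforces is that irqfd->producer only changes under the same irqfds.lock that kvm_irq_routing_update() holds while reading it. A generic pthread sketch of that discipline follows; it is illustrative only (a mutex standing in for the spinlock, hypothetical add_producer()/routing_update() helpers), not KVM code.

#include <pthread.h>
#include <stdio.h>

struct producer {
	int irq;
};

static pthread_mutex_t irqfds_lock = PTHREAD_MUTEX_INITIALIZER;
static struct producer *producer;	/* only read or written under irqfds_lock */

static void add_producer(struct producer *p)
{
	pthread_mutex_lock(&irqfds_lock);
	producer = p;			/* publish under the lock ... */
	/* ... and update interrupt routing while still holding it */
	pthread_mutex_unlock(&irqfds_lock);
}

static void routing_update(void)
{
	pthread_mutex_lock(&irqfds_lock);
	if (producer)			/* cannot change underneath us now */
		printf("reprogram irq %d\n", producer->irq);
	pthread_mutex_unlock(&irqfds_lock);
}

int main(void)
{
	struct producer p = { .irq = 42 };

	add_producer(&p);
	routing_update();
	return 0;
}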
diff --git a/queue-6.1/mm-drop-the-assumption-that-vm_shared-always-implies-writable.patch b/queue-6.1/mm-drop-the-assumption-that-vm_shared-always-implies-writable.patch
new file mode 100644 (file)
index 0000000..41c7397
--- /dev/null
@@ -0,0 +1,204 @@
+From stable+bounces-165164-greg=kroah.com@vger.kernel.org Wed Jul 30 03:53:29 2025
+From: "Isaac J. Manjarres" <isaacmanjarres@google.com>
+Date: Tue, 29 Jul 2025 18:52:40 -0700
+Subject: mm: drop the assumption that VM_SHARED always implies writable
+To: lorenzo.stoakes@oracle.com, gregkh@linuxfoundation.org,  Alexander Viro <viro@zeniv.linux.org.uk>, Christian Brauner <brauner@kernel.org>, Jan Kara <jack@suse.cz>,  Andrew Morton <akpm@linux-foundation.org>, David Hildenbrand <david@redhat.com>,  "Liam R. Howlett" <Liam.Howlett@oracle.com>, Vlastimil Babka <vbabka@suse.cz>,  Mike Rapoport <rppt@kernel.org>, Suren Baghdasaryan <surenb@google.com>, Michal Hocko <mhocko@suse.com>,  Kees Cook <kees@kernel.org>, Ingo Molnar <mingo@redhat.com>,  Peter Zijlstra <peterz@infradead.org>, Juri Lelli <juri.lelli@redhat.com>,  Vincent Guittot <vincent.guittot@linaro.org>, Dietmar Eggemann <dietmar.eggemann@arm.com>,  Steven Rostedt <rostedt@goodmis.org>, Ben Segall <bsegall@google.com>, Mel Gorman <mgorman@suse.de>,  Valentin Schneider <vschneid@redhat.com>, "Matthew Wilcox (Oracle)" <willy@infradead.org>, Jann Horn <jannh@google.com>,  Pedro Falcato <pfalcato@suse.de>
+Cc: aliceryhl@google.com, stable@vger.kernel.org,  "Isaac J. Manjarres" <isaacmanjarres@google.com>, kernel-team@android.com,  Lorenzo Stoakes <lstoakes@gmail.com>, Andy Lutomirski <luto@kernel.org>, Hugh Dickins <hughd@google.com>,  Mike Kravetz <mike.kravetz@oracle.com>, Muchun Song <muchun.song@linux.dev>,  linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org,  linux-mm@kvack.org
+Message-ID: <20250730015247.30827-2-isaacmanjarres@google.com>
+
+From: Lorenzo Stoakes <lstoakes@gmail.com>
+
+[ Upstream commit e8e17ee90eaf650c855adb0a3e5e965fd6692ff1 ]
+
+Patch series "permit write-sealed memfd read-only shared mappings", v4.
+
+The man page for fcntl() describing memfd file seals states the following
+about F_SEAL_WRITE:-
+
+    Furthermore, trying to create new shared, writable memory-mappings via
+    mmap(2) will also fail with EPERM.
+
+With emphasis on 'writable'.  It turns out, in fact, that currently the
+kernel simply disallows all new shared memory mappings for a memfd with
+F_SEAL_WRITE applied, rendering this documentation inaccurate.
+
+This matters because users are therefore unable to obtain a shared mapping
+to a memfd after write sealing altogether, which limits their usefulness.
+This was reported in the discussion thread [1] originating from a bug
+report [2].
+
+This is a product of both using the struct address_space->i_mmap_writable
+atomic counter to determine whether writing may be permitted, and the
+kernel adjusting this counter when any VM_SHARED mapping is performed and
+more generally implicitly assuming VM_SHARED implies writable.
+
+It seems sensible that we should only update this mapping if VM_MAYWRITE
+is specified, i.e.  whether it is possible that this mapping could at any
+point be written to.
+
+If we do so then all we need to do to permit write seals to function as
+documented is to clear VM_MAYWRITE when mapping read-only.  It turns out
+this functionality already exists for F_SEAL_FUTURE_WRITE - we can
+therefore simply adapt this logic to do the same for F_SEAL_WRITE.
+
+We then hit a chicken and egg situation in mmap_region() where the check
+for VM_MAYWRITE occurs before we are able to clear this flag.  To work
+around this, perform this check after we invoke call_mmap(), with careful
+consideration of error paths.
+
+Thanks to Andy Lutomirski for the suggestion!
+
+[1]:https://lore.kernel.org/all/20230324133646.16101dfa666f253c4715d965@linux-foundation.org/
+[2]:https://bugzilla.kernel.org/show_bug.cgi?id=217238
+
+This patch (of 3):
+
+There is a general assumption that VMAs with the VM_SHARED flag set are
+writable.  If the VM_MAYWRITE flag is not set, then this is simply not the
+case.
+
+Update those checks which affect the struct address_space->i_mmap_writable
+field to explicitly test for this by introducing
+[vma_]is_shared_maywrite() helper functions.
+
+This remains entirely conservative, as the lack of VM_MAYWRITE guarantees
+that the VMA cannot be written to.
+
+Link: https://lkml.kernel.org/r/cover.1697116581.git.lstoakes@gmail.com
+Link: https://lkml.kernel.org/r/d978aefefa83ec42d18dfa964ad180dbcde34795.1697116581.git.lstoakes@gmail.com
+Signed-off-by: Lorenzo Stoakes <lstoakes@gmail.com>
+Suggested-by: Andy Lutomirski <luto@kernel.org>
+Reviewed-by: Jan Kara <jack@suse.cz>
+Cc: Alexander Viro <viro@zeniv.linux.org.uk>
+Cc: Christian Brauner <brauner@kernel.org>
+Cc: Hugh Dickins <hughd@google.com>
+Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
+Cc: Mike Kravetz <mike.kravetz@oracle.com>
+Cc: Muchun Song <muchun.song@linux.dev>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Cc: stable@vger.kernel.org
+[isaacmanjarres: resolved merge conflicts due to
+due to refactoring that happened in upstream commit
+5de195060b2e ("mm: resolve faulty mmap_region() error path behaviour")]
+Signed-off-by: Isaac J. Manjarres <isaacmanjarres@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/fs.h |    4 ++--
+ include/linux/mm.h |   11 +++++++++++
+ kernel/fork.c      |    2 +-
+ mm/filemap.c       |    2 +-
+ mm/madvise.c       |    2 +-
+ mm/mmap.c          |    8 ++++----
+ 6 files changed, 20 insertions(+), 9 deletions(-)
+
+--- a/include/linux/fs.h
++++ b/include/linux/fs.h
+@@ -410,7 +410,7 @@ extern const struct address_space_operat
+  *   It is also used to block modification of page cache contents through
+  *   memory mappings.
+  * @gfp_mask: Memory allocation flags to use for allocating pages.
+- * @i_mmap_writable: Number of VM_SHARED mappings.
++ * @i_mmap_writable: Number of VM_SHARED, VM_MAYWRITE mappings.
+  * @nr_thps: Number of THPs in the pagecache (non-shmem only).
+  * @i_mmap: Tree of private and shared mappings.
+  * @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable.
+@@ -513,7 +513,7 @@ static inline int mapping_mapped(struct
+ /*
+  * Might pages of this file have been modified in userspace?
+- * Note that i_mmap_writable counts all VM_SHARED vmas: do_mmap
++ * Note that i_mmap_writable counts all VM_SHARED, VM_MAYWRITE vmas: do_mmap
+  * marks vma as VM_SHARED if it is shared, and the file was opened for
+  * writing i.e. vma may be mprotected writable even if now readonly.
+  *
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -673,6 +673,17 @@ static inline bool vma_is_accessible(str
+       return vma->vm_flags & VM_ACCESS_FLAGS;
+ }
++static inline bool is_shared_maywrite(vm_flags_t vm_flags)
++{
++      return (vm_flags & (VM_SHARED | VM_MAYWRITE)) ==
++              (VM_SHARED | VM_MAYWRITE);
++}
++
++static inline bool vma_is_shared_maywrite(struct vm_area_struct *vma)
++{
++      return is_shared_maywrite(vma->vm_flags);
++}
++
+ static inline
+ struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max)
+ {
+--- a/kernel/fork.c
++++ b/kernel/fork.c
+@@ -669,7 +669,7 @@ static __latent_entropy int dup_mmap(str
+                       get_file(file);
+                       i_mmap_lock_write(mapping);
+-                      if (tmp->vm_flags & VM_SHARED)
++                      if (vma_is_shared_maywrite(tmp))
+                               mapping_allow_writable(mapping);
+                       flush_dcache_mmap_lock(mapping);
+                       /* insert tmp into the share list, just after mpnt */
+--- a/mm/filemap.c
++++ b/mm/filemap.c
+@@ -3554,7 +3554,7 @@ int generic_file_mmap(struct file *file,
+  */
+ int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
+ {
+-      if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
++      if (vma_is_shared_maywrite(vma))
+               return -EINVAL;
+       return generic_file_mmap(file, vma);
+ }
+--- a/mm/madvise.c
++++ b/mm/madvise.c
+@@ -980,7 +980,7 @@ static long madvise_remove(struct vm_are
+                       return -EINVAL;
+       }
+-      if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
++      if (!vma_is_shared_maywrite(vma))
+               return -EACCES;
+       offset = (loff_t)(start - vma->vm_start)
+--- a/mm/mmap.c
++++ b/mm/mmap.c
+@@ -106,7 +106,7 @@ void vma_set_page_prot(struct vm_area_st
+ static void __remove_shared_vm_struct(struct vm_area_struct *vma,
+               struct file *file, struct address_space *mapping)
+ {
+-      if (vma->vm_flags & VM_SHARED)
++      if (vma_is_shared_maywrite(vma))
+               mapping_unmap_writable(mapping);
+       flush_dcache_mmap_lock(mapping);
+@@ -408,7 +408,7 @@ static unsigned long count_vma_pages_ran
+ static void __vma_link_file(struct vm_area_struct *vma,
+                           struct address_space *mapping)
+ {
+-      if (vma->vm_flags & VM_SHARED)
++      if (vma_is_shared_maywrite(vma))
+               mapping_allow_writable(mapping);
+       flush_dcache_mmap_lock(mapping);
+@@ -2827,7 +2827,7 @@ cannot_expand:
+       vma_mas_store(vma, &mas);
+       mm->map_count++;
+       if (vma->vm_file) {
+-              if (vma->vm_flags & VM_SHARED)
++              if (vma_is_shared_maywrite(vma))
+                       mapping_allow_writable(vma->vm_file->f_mapping);
+               flush_dcache_mmap_lock(vma->vm_file->f_mapping);
+@@ -2901,7 +2901,7 @@ unsigned long mmap_region(struct file *f
+               return -EINVAL;
+       /* Map writable and ensure this isn't a sealed memfd. */
+-      if (file && (vm_flags & VM_SHARED)) {
++      if (file && is_shared_maywrite(vm_flags)) {
+               int error = mapping_map_writable(file->f_mapping);
+               if (error)
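The behaviour this series restores is visible entirely from userspace. A small test program is sketched below, assuming a Linux system whose libc exposes memfd_create(2) and the F_ADD_SEALS fcntl: the writable MAP_SHARED mapping of a write-sealed memfd must fail with EPERM, while the read-only MAP_SHARED mapping should succeed once the whole series is applied.

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = memfd_create("sealed", MFD_ALLOW_SEALING);

	if (fd < 0 || ftruncate(fd, 4096) < 0)
		return 1;
	if (fcntl(fd, F_ADD_SEALS, F_SEAL_WRITE) < 0)
		return 1;

	/* Writable shared mapping: fails with EPERM, as fcntl(2) documents. */
	void *w = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	printf("rw MAP_SHARED: %s\n",
	       w == MAP_FAILED ? strerror(errno) : "mapped");

	/* Read-only shared mapping: should succeed with this series applied. */
	void *r = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 0);
	printf("ro MAP_SHARED: %s\n",
	       r == MAP_FAILED ? strerror(errno) : "mapped");
	return 0;
}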
diff --git a/queue-6.1/mm-reinstate-ability-to-map-write-sealed-memfd-mappings-read-only.patch b/queue-6.1/mm-reinstate-ability-to-map-write-sealed-memfd-mappings-read-only.patch
new file mode 100644 (file)
index 0000000..f5ecc1c
--- /dev/null
@@ -0,0 +1,234 @@
+From stable+bounces-165166-greg=kroah.com@vger.kernel.org Wed Jul 30 03:53:57 2025
+From: "Isaac J. Manjarres" <isaacmanjarres@google.com>
+Date: Tue, 29 Jul 2025 18:52:42 -0700
+Subject: mm: reinstate ability to map write-sealed memfd mappings read-only
+To: lorenzo.stoakes@oracle.com, gregkh@linuxfoundation.org,  Hugh Dickins <hughd@google.com>, Baolin Wang <baolin.wang@linux.alibaba.com>,  Andrew Morton <akpm@linux-foundation.org>, David Hildenbrand <david@redhat.com>,  "Liam R. Howlett" <Liam.Howlett@oracle.com>, Vlastimil Babka <vbabka@suse.cz>,  Mike Rapoport <rppt@kernel.org>, Suren Baghdasaryan <surenb@google.com>, Michal Hocko <mhocko@suse.com>,  Jann Horn <jannh@google.com>, Pedro Falcato <pfalcato@suse.de>
+Cc: aliceryhl@google.com, stable@vger.kernel.org,  "Isaac J. Manjarres" <isaacmanjarres@google.com>, kernel-team@android.com,  Julian Orth <ju.orth@gmail.com>, "Liam R. Howlett" <Liam.Howlett@Oracle.com>,  Linus Torvalds <torvalds@linux-foundation.org>, Shuah Khan <shuah@kernel.org>, linux-mm@kvack.org,  linux-kernel@vger.kernel.org
+Message-ID: <20250730015247.30827-4-isaacmanjarres@google.com>
+
+From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
+
+[ Upstream commit 8ec396d05d1b737c87311fb7311f753b02c2a6b1 ]
+
+Patch series "mm: reinstate ability to map write-sealed memfd mappings
+read-only".
+
+In commit 158978945f31 ("mm: perform the mapping_map_writable() check
+after call_mmap()") (and preceding changes in the same series) it became
+possible to mmap() F_SEAL_WRITE sealed memfd mappings read-only.
+
+Commit 5de195060b2e ("mm: resolve faulty mmap_region() error path
+behaviour") unintentionally undid this logic by moving the
+mapping_map_writable() check before the shmem_mmap() hook is invoked,
+thereby regressing this change.
+
+This series reworks how we both permit write-sealed mappings being mapped
+read-only and disallow mprotect() from undoing the write-seal, fixing this
+regression.
+
+We also add a regression test to ensure that we do not accidentally
+regress this in future.
+
+Thanks to Julian Orth for reporting this regression.
+
+This patch (of 2):
+
+In commit 158978945f31 ("mm: perform the mapping_map_writable() check
+after call_mmap()") (and preceding changes in the same series) it became
+possible to mmap() F_SEAL_WRITE sealed memfd mappings read-only.
+
+This was previously unnecessarily disallowed, despite the man page
+documentation indicating that it would be, thereby limiting the usefulness
+of F_SEAL_WRITE logic.
+
+We fixed this by adapting logic that existed for the F_SEAL_FUTURE_WRITE
+seal (one which disallows future writes to the memfd) to also be used for
+F_SEAL_WRITE.
+
+For background - the F_SEAL_FUTURE_WRITE seal clears VM_MAYWRITE for a
+read-only mapping to disallow mprotect() from overriding the seal - an
+operation performed by seal_check_write(), invoked from shmem_mmap(), the
+f_op->mmap() hook used by shmem mappings.
+
+By extending this to F_SEAL_WRITE and critically - checking
+mapping_map_writable() to determine if we may map the memfd AFTER we
+invoke shmem_mmap() - the desired logic becomes possible.  This is because
+mapping_map_writable() explicitly checks for VM_MAYWRITE, which we will
+have cleared.
+
+Commit 5de195060b2e ("mm: resolve faulty mmap_region() error path
+behaviour") unintentionally undid this logic by moving the
+mapping_map_writable() check before the shmem_mmap() hook is invoked,
+thereby regressing this change.
+
+We reinstate this functionality by moving the check out of shmem_mmap()
+and instead performing it in do_mmap() at the point at which VMA flags are
+being determined, which seems in any case to be a more appropriate place
+in which to make this determination.
+
+In order to achieve this we rework memfd seal logic to allow us access to
+this information using existing logic and eliminate the clearing of
+VM_MAYWRITE from seal_check_write() which we are performing in do_mmap()
+instead.
+
+Link: https://lkml.kernel.org/r/99fc35d2c62bd2e05571cf60d9f8b843c56069e0.1732804776.git.lorenzo.stoakes@oracle.com
+Fixes: 5de195060b2e ("mm: resolve faulty mmap_region() error path behaviour")
+Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
+Reported-by: Julian Orth <ju.orth@gmail.com>
+Closes: https://lore.kernel.org/all/CAHijbEUMhvJTN9Xw1GmbM266FXXv=U7s4L_Jem5x3AaPZxrYpQ@mail.gmail.com/
+Cc: Jann Horn <jannh@google.com>
+Cc: Liam R. Howlett <Liam.Howlett@Oracle.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Shuah Khan <shuah@kernel.org>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Isaac J. Manjarres <isaacmanjarres@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/memfd.h |   14 ++++++++++++
+ include/linux/mm.h    |   58 ++++++++++++++++++++++++++++++++++----------------
+ mm/memfd.c            |    2 -
+ mm/mmap.c             |    4 +++
+ 4 files changed, 59 insertions(+), 19 deletions(-)
+
+--- a/include/linux/memfd.h
++++ b/include/linux/memfd.h
+@@ -6,11 +6,25 @@
+ #ifdef CONFIG_MEMFD_CREATE
+ extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg);
++unsigned int *memfd_file_seals_ptr(struct file *file);
+ #else
+ static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned long a)
+ {
+       return -EINVAL;
+ }
++
++static inline unsigned int *memfd_file_seals_ptr(struct file *file)
++{
++      return NULL;
++}
+ #endif
++/* Retrieve memfd seals associated with the file, if any. */
++static inline unsigned int memfd_file_seals(struct file *file)
++{
++      unsigned int *sealsp = memfd_file_seals_ptr(file);
++
++      return sealsp ? *sealsp : 0;
++}
++
+ #endif /* __LINUX_MEMFD_H */
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -3525,6 +3525,37 @@ void mem_dump_obj(void *object);
+ static inline void mem_dump_obj(void *object) {}
+ #endif
++static inline bool is_write_sealed(int seals)
++{
++      return seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE);
++}
++
++/**
++ * is_readonly_sealed - Checks whether write-sealed but mapped read-only,
++ *                      in which case writes should be disallowing moving
++ *                      forwards.
++ * @seals: the seals to check
++ * @vm_flags: the VMA flags to check
++ *
++ * Returns whether readonly sealed, in which case writess should be disallowed
++ * going forward.
++ */
++static inline bool is_readonly_sealed(int seals, vm_flags_t vm_flags)
++{
++      /*
++       * Since an F_SEAL_[FUTURE_]WRITE sealed memfd can be mapped as
++       * MAP_SHARED and read-only, take care to not allow mprotect to
++       * revert protections on such mappings. Do this only for shared
++       * mappings. For private mappings, don't need to mask
++       * VM_MAYWRITE as we still want them to be COW-writable.
++       */
++      if (is_write_sealed(seals) &&
++          ((vm_flags & (VM_SHARED | VM_WRITE)) == VM_SHARED))
++              return true;
++
++      return false;
++}
++
+ /**
+  * seal_check_write - Check for F_SEAL_WRITE or F_SEAL_FUTURE_WRITE flags and
+  *                    handle them.
+@@ -3536,24 +3567,15 @@ static inline void mem_dump_obj(void *ob
+  */
+ static inline int seal_check_write(int seals, struct vm_area_struct *vma)
+ {
+-      if (seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
+-              /*
+-               * New PROT_WRITE and MAP_SHARED mmaps are not allowed when
+-               * write seals are active.
+-               */
+-              if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
+-                      return -EPERM;
+-
+-              /*
+-               * Since an F_SEAL_[FUTURE_]WRITE sealed memfd can be mapped as
+-               * MAP_SHARED and read-only, take care to not allow mprotect to
+-               * revert protections on such mappings. Do this only for shared
+-               * mappings. For private mappings, don't need to mask
+-               * VM_MAYWRITE as we still want them to be COW-writable.
+-               */
+-              if (vma->vm_flags & VM_SHARED)
+-                      vma->vm_flags &= ~(VM_MAYWRITE);
+-      }
++      if (!is_write_sealed(seals))
++              return 0;
++
++      /*
++       * New PROT_WRITE and MAP_SHARED mmaps are not allowed when
++       * write seals are active.
++       */
++      if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
++              return -EPERM;
+       return 0;
+ }
+--- a/mm/memfd.c
++++ b/mm/memfd.c
+@@ -133,7 +133,7 @@ static int memfd_wait_for_pins(struct ad
+       return error;
+ }
+-static unsigned int *memfd_file_seals_ptr(struct file *file)
++unsigned int *memfd_file_seals_ptr(struct file *file)
+ {
+       if (shmem_file(file))
+               return &SHMEM_I(file_inode(file))->seals;
+--- a/mm/mmap.c
++++ b/mm/mmap.c
+@@ -46,6 +46,7 @@
+ #include <linux/pkeys.h>
+ #include <linux/oom.h>
+ #include <linux/sched/mm.h>
++#include <linux/memfd.h>
+ #include <linux/uaccess.h>
+ #include <asm/cacheflush.h>
+@@ -1336,6 +1337,7 @@ unsigned long do_mmap(struct file *file,
+       if (file) {
+               struct inode *inode = file_inode(file);
++              unsigned int seals = memfd_file_seals(file);
+               unsigned long flags_mask;
+               if (!file_mmap_ok(file, inode, pgoff, len))
+@@ -1374,6 +1376,8 @@ unsigned long do_mmap(struct file *file,
+                       vm_flags |= VM_SHARED | VM_MAYSHARE;
+                       if (!(file->f_mode & FMODE_WRITE))
+                               vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
++                      else if (is_readonly_sealed(seals, vm_flags))
++                              vm_flags &= ~VM_MAYWRITE;
+                       fallthrough;
+               case MAP_PRIVATE:
+                       if (!(file->f_mode & FMODE_READ))
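As the comment retained in is_readonly_sealed() notes, only shared mappings lose VM_MAYWRITE; a MAP_PRIVATE mapping of the same write-sealed memfd stays copy-on-write writable. A short companion sketch, under the same assumptions as the example after the previous patch:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = memfd_create("sealed", MFD_ALLOW_SEALING);

	if (fd < 0 || ftruncate(fd, 4096) < 0)
		return 1;
	if (fcntl(fd, F_ADD_SEALS, F_SEAL_WRITE) < 0)
		return 1;

	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
	if (p == MAP_FAILED)
		return 1;

	p[0] = 'x';	/* the write goes to a private copy, never to the memfd */
	printf("private write ok: %c\n", p[0]);
	return 0;
}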
diff --git a/queue-6.1/mm-update-memfd-seal-write-check-to-include-f_seal_write.patch b/queue-6.1/mm-update-memfd-seal-write-check-to-include-f_seal_write.patch
new file mode 100644 (file)
index 0000000..b2757f4
--- /dev/null
@@ -0,0 +1,104 @@
+From stable+bounces-165165-greg=kroah.com@vger.kernel.org Wed Jul 30 03:53:44 2025
+From: "Isaac J. Manjarres" <isaacmanjarres@google.com>
+Date: Tue, 29 Jul 2025 18:52:41 -0700
+Subject: mm: update memfd seal write check to include F_SEAL_WRITE
+To: lorenzo.stoakes@oracle.com, gregkh@linuxfoundation.org,  Muchun Song <muchun.song@linux.dev>, Oscar Salvador <osalvador@suse.de>,  David Hildenbrand <david@redhat.com>, Andrew Morton <akpm@linux-foundation.org>,  "Liam R. Howlett" <Liam.Howlett@oracle.com>, Vlastimil Babka <vbabka@suse.cz>,  Mike Rapoport <rppt@kernel.org>, Suren Baghdasaryan <surenb@google.com>, Michal Hocko <mhocko@suse.com>,  Hugh Dickins <hughd@google.com>, Baolin Wang <baolin.wang@linux.alibaba.com>
+Cc: aliceryhl@google.com, stable@vger.kernel.org,  "Isaac J. Manjarres" <isaacmanjarres@google.com>, kernel-team@android.com,  Lorenzo Stoakes <lstoakes@gmail.com>, Jan Kara <jack@suse.cz>,  Alexander Viro <viro@zeniv.linux.org.uk>, Andy Lutomirski <luto@kernel.org>,  Christian Brauner <brauner@kernel.org>, "Matthew Wilcox (Oracle)" <willy@infradead.org>,  Mike Kravetz <mike.kravetz@oracle.com>, linux-mm@kvack.org, linux-kernel@vger.kernel.org
+Message-ID: <20250730015247.30827-3-isaacmanjarres@google.com>
+
+From: Lorenzo Stoakes <lstoakes@gmail.com>
+
+[ Upstream commit 28464bbb2ddc199433383994bcb9600c8034afa1 ]
+
+The seal_check_future_write() function is called by shmem_mmap() or
+hugetlbfs_file_mmap() to disallow any future writable mappings of a memfd
+sealed this way.
+
+The F_SEAL_WRITE flag is not checked here, as that is handled via the
+mapping->i_mmap_writable mechanism and so any attempt at a mapping would
+fail before this could be run.
+
+However we intend to change this, meaning this check can be performed for
+F_SEAL_WRITE mappings also.
+
+The logic here is equally applicable to both flags, so update this
+function to accommodate both and rename it accordingly.
+
+Link: https://lkml.kernel.org/r/913628168ce6cce77df7d13a63970bae06a526e0.1697116581.git.lstoakes@gmail.com
+Signed-off-by: Lorenzo Stoakes <lstoakes@gmail.com>
+Reviewed-by: Jan Kara <jack@suse.cz>
+Cc: Alexander Viro <viro@zeniv.linux.org.uk>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Christian Brauner <brauner@kernel.org>
+Cc: Hugh Dickins <hughd@google.com>
+Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
+Cc: Mike Kravetz <mike.kravetz@oracle.com>
+Cc: Muchun Song <muchun.song@linux.dev>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Cc: stable@vger.kernel.org
+Signed-off-by: Isaac J. Manjarres <isaacmanjarres@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/hugetlbfs/inode.c |    2 +-
+ include/linux/mm.h   |   15 ++++++++-------
+ mm/shmem.c           |    2 +-
+ 3 files changed, 10 insertions(+), 9 deletions(-)
+
+--- a/fs/hugetlbfs/inode.c
++++ b/fs/hugetlbfs/inode.c
+@@ -136,7 +136,7 @@ static int hugetlbfs_file_mmap(struct fi
+       vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND;
+       vma->vm_ops = &hugetlb_vm_ops;
+-      ret = seal_check_future_write(info->seals, vma);
++      ret = seal_check_write(info->seals, vma);
+       if (ret)
+               return ret;
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -3526,25 +3526,26 @@ static inline void mem_dump_obj(void *ob
+ #endif
+ /**
+- * seal_check_future_write - Check for F_SEAL_FUTURE_WRITE flag and handle it
++ * seal_check_write - Check for F_SEAL_WRITE or F_SEAL_FUTURE_WRITE flags and
++ *                    handle them.
+  * @seals: the seals to check
+  * @vma: the vma to operate on
+  *
+- * Check whether F_SEAL_FUTURE_WRITE is set; if so, do proper check/handling on
+- * the vma flags.  Return 0 if check pass, or <0 for errors.
++ * Check whether F_SEAL_WRITE or F_SEAL_FUTURE_WRITE are set; if so, do proper
++ * check/handling on the vma flags.  Return 0 if check pass, or <0 for errors.
+  */
+-static inline int seal_check_future_write(int seals, struct vm_area_struct *vma)
++static inline int seal_check_write(int seals, struct vm_area_struct *vma)
+ {
+-      if (seals & F_SEAL_FUTURE_WRITE) {
++      if (seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
+               /*
+                * New PROT_WRITE and MAP_SHARED mmaps are not allowed when
+-               * "future write" seal active.
++               * write seals are active.
+                */
+               if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
+                       return -EPERM;
+               /*
+-               * Since an F_SEAL_FUTURE_WRITE sealed memfd can be mapped as
++               * Since an F_SEAL_[FUTURE_]WRITE sealed memfd can be mapped as
+                * MAP_SHARED and read-only, take care to not allow mprotect to
+                * revert protections on such mappings. Do this only for shared
+                * mappings. For private mappings, don't need to mask
+--- a/mm/shmem.c
++++ b/mm/shmem.c
+@@ -2302,7 +2302,7 @@ static int shmem_mmap(struct file *file,
+       struct shmem_inode_info *info = SHMEM_I(file_inode(file));
+       int ret;
+-      ret = seal_check_future_write(info->seals, vma);
++      ret = seal_check_write(info->seals, vma);
+       if (ret)
+               return ret;
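For contrast, the F_SEAL_FUTURE_WRITE seal whose handling this patch generalises only refuses new writable shared mappings; mappings created before the seal keep working. A hedged userspace sketch, assuming Linux 5.1+ for F_SEAL_FUTURE_WRITE (with a fallback define in case the libc headers lack it):

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef F_SEAL_FUTURE_WRITE
#define F_SEAL_FUTURE_WRITE 0x0010	/* value from include/uapi/linux/fcntl.h */
#endif

int main(void)
{
	int fd = memfd_create("future", MFD_ALLOW_SEALING);

	if (fd < 0 || ftruncate(fd, 4096) < 0)
		return 1;

	/* A writable shared mapping taken before sealing ... */
	char *old = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (old == MAP_FAILED)
		return 1;

	if (fcntl(fd, F_ADD_SEALS, F_SEAL_FUTURE_WRITE) < 0)
		return 1;

	old[0] = 'y';	/* ... keeps working after the seal is applied ... */

	/* ... but a new writable shared mapping is refused. */
	void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	printf("new rw MAP_SHARED after F_SEAL_FUTURE_WRITE: %s\n",
	       p == MAP_FAILED ? strerror(errno) : "mapped");
	return 0;
}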
diff --git a/queue-6.1/mptcp-make-fallback-action-and-fallback-decision-atomic.patch b/queue-6.1/mptcp-make-fallback-action-and-fallback-decision-atomic.patch
new file mode 100644 (file)
index 0000000..d7861ee
--- /dev/null
@@ -0,0 +1,387 @@
+From stable+bounces-164937-greg=kroah.com@vger.kernel.org Mon Jul 28 15:29:43 2025
+From: "Matthieu Baerts (NGI0)" <matttbe@kernel.org>
+Date: Mon, 28 Jul 2025 15:29:21 +0200
+Subject: mptcp: make fallback action and fallback decision atomic
+To: mptcp@lists.linux.dev, stable@vger.kernel.org, gregkh@linuxfoundation.org
+Cc: Paolo Abeni <pabeni@redhat.com>, sashal@kernel.org, Matthieu Baerts <matttbe@kernel.org>, syzbot+5cf807c20386d699b524@syzkaller.appspotmail.com, Jakub Kicinski <kuba@kernel.org>
+Message-ID: <20250728132919.3904847-6-matttbe@kernel.org>
+
+From: Paolo Abeni <pabeni@redhat.com>
+
+commit f8a1d9b18c5efc76784f5a326e905f641f839894 upstream.
+
+Syzkaller reported the following splat:
+
+  WARNING: CPU: 1 PID: 7704 at net/mptcp/protocol.h:1223 __mptcp_do_fallback net/mptcp/protocol.h:1223 [inline]
+  WARNING: CPU: 1 PID: 7704 at net/mptcp/protocol.h:1223 mptcp_do_fallback net/mptcp/protocol.h:1244 [inline]
+  WARNING: CPU: 1 PID: 7704 at net/mptcp/protocol.h:1223 check_fully_established net/mptcp/options.c:982 [inline]
+  WARNING: CPU: 1 PID: 7704 at net/mptcp/protocol.h:1223 mptcp_incoming_options+0x21a8/0x2510 net/mptcp/options.c:1153
+  Modules linked in:
+  CPU: 1 UID: 0 PID: 7704 Comm: syz.3.1419 Not tainted 6.16.0-rc3-gbd5ce2324dba #20 PREEMPT(voluntary)
+  Hardware name: QEMU Ubuntu 24.04 PC (i440FX + PIIX, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014
+  RIP: 0010:__mptcp_do_fallback net/mptcp/protocol.h:1223 [inline]
+  RIP: 0010:mptcp_do_fallback net/mptcp/protocol.h:1244 [inline]
+  RIP: 0010:check_fully_established net/mptcp/options.c:982 [inline]
+  RIP: 0010:mptcp_incoming_options+0x21a8/0x2510 net/mptcp/options.c:1153
+  Code: 24 18 e8 bb 2a 00 fd e9 1b df ff ff e8 b1 21 0f 00 e8 ec 5f c4 fc 44 0f b7 ac 24 b0 00 00 00 e9 54 f1 ff ff e8 d9 5f c4 fc 90 <0f> 0b 90 e9 b8 f4 ff ff e8 8b 2a 00 fd e9 8d e6 ff ff e8 81 2a 00
+  RSP: 0018:ffff8880a3f08448 EFLAGS: 00010246
+  RAX: 0000000000000000 RBX: ffff8880180a8000 RCX: ffffffff84afcf45
+  RDX: ffff888090223700 RSI: ffffffff84afdaa7 RDI: 0000000000000001
+  RBP: ffff888017955780 R08: 0000000000000001 R09: 0000000000000000
+  R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
+  R13: ffff8880180a8910 R14: ffff8880a3e9d058 R15: 0000000000000000
+  FS:  00005555791b8500(0000) GS:ffff88811c495000(0000) knlGS:0000000000000000
+  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+  CR2: 000000110c2800b7 CR3: 0000000058e44000 CR4: 0000000000350ef0
+  Call Trace:
+   <IRQ>
+   tcp_reset+0x26f/0x2b0 net/ipv4/tcp_input.c:4432
+   tcp_validate_incoming+0x1057/0x1b60 net/ipv4/tcp_input.c:5975
+   tcp_rcv_established+0x5b5/0x21f0 net/ipv4/tcp_input.c:6166
+   tcp_v4_do_rcv+0x5dc/0xa70 net/ipv4/tcp_ipv4.c:1925
+   tcp_v4_rcv+0x3473/0x44a0 net/ipv4/tcp_ipv4.c:2363
+   ip_protocol_deliver_rcu+0xba/0x480 net/ipv4/ip_input.c:205
+   ip_local_deliver_finish+0x2f1/0x500 net/ipv4/ip_input.c:233
+   NF_HOOK include/linux/netfilter.h:317 [inline]
+   NF_HOOK include/linux/netfilter.h:311 [inline]
+   ip_local_deliver+0x1be/0x560 net/ipv4/ip_input.c:254
+   dst_input include/net/dst.h:469 [inline]
+   ip_rcv_finish net/ipv4/ip_input.c:447 [inline]
+   NF_HOOK include/linux/netfilter.h:317 [inline]
+   NF_HOOK include/linux/netfilter.h:311 [inline]
+   ip_rcv+0x514/0x810 net/ipv4/ip_input.c:567
+   __netif_receive_skb_one_core+0x197/0x1e0 net/core/dev.c:5975
+   __netif_receive_skb+0x1f/0x120 net/core/dev.c:6088
+   process_backlog+0x301/0x1360 net/core/dev.c:6440
+   __napi_poll.constprop.0+0xba/0x550 net/core/dev.c:7453
+   napi_poll net/core/dev.c:7517 [inline]
+   net_rx_action+0xb44/0x1010 net/core/dev.c:7644
+   handle_softirqs+0x1d0/0x770 kernel/softirq.c:579
+   do_softirq+0x3f/0x90 kernel/softirq.c:480
+   </IRQ>
+   <TASK>
+   __local_bh_enable_ip+0xed/0x110 kernel/softirq.c:407
+   local_bh_enable include/linux/bottom_half.h:33 [inline]
+   inet_csk_listen_stop+0x2c5/0x1070 net/ipv4/inet_connection_sock.c:1524
+   mptcp_check_listen_stop.part.0+0x1cc/0x220 net/mptcp/protocol.c:2985
+   mptcp_check_listen_stop net/mptcp/mib.h:118 [inline]
+   __mptcp_close+0x9b9/0xbd0 net/mptcp/protocol.c:3000
+   mptcp_close+0x2f/0x140 net/mptcp/protocol.c:3066
+   inet_release+0xed/0x200 net/ipv4/af_inet.c:435
+   inet6_release+0x4f/0x70 net/ipv6/af_inet6.c:487
+   __sock_release+0xb3/0x270 net/socket.c:649
+   sock_close+0x1c/0x30 net/socket.c:1439
+   __fput+0x402/0xb70 fs/file_table.c:465
+   task_work_run+0x150/0x240 kernel/task_work.c:227
+   resume_user_mode_work include/linux/resume_user_mode.h:50 [inline]
+   exit_to_user_mode_loop+0xd4/0xe0 kernel/entry/common.c:114
+   exit_to_user_mode_prepare include/linux/entry-common.h:330 [inline]
+   syscall_exit_to_user_mode_work include/linux/entry-common.h:414 [inline]
+   syscall_exit_to_user_mode include/linux/entry-common.h:449 [inline]
+   do_syscall_64+0x245/0x360 arch/x86/entry/syscall_64.c:100
+   entry_SYSCALL_64_after_hwframe+0x77/0x7f
+  RIP: 0033:0x7fc92f8a36ad
+  Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b0 ff ff ff f7 d8 64 89 01 48
+  RSP: 002b:00007ffcf52802d8 EFLAGS: 00000246 ORIG_RAX: 00000000000001b4
+  RAX: 0000000000000000 RBX: 00007ffcf52803a8 RCX: 00007fc92f8a36ad
+  RDX: 0000000000000000 RSI: 000000000000001e RDI: 0000000000000003
+  RBP: 00007fc92fae7ba0 R08: 0000000000000001 R09: 0000002800000000
+  R10: 00007fc92f700000 R11: 0000000000000246 R12: 00007fc92fae5fac
+  R13: 00007fc92fae5fa0 R14: 0000000000026d00 R15: 0000000000026c51
+   </TASK>
+  irq event stamp: 4068
+  hardirqs last  enabled at (4076): [<ffffffff81544816>] __up_console_sem+0x76/0x80 kernel/printk/printk.c:344
+  hardirqs last disabled at (4085): [<ffffffff815447fb>] __up_console_sem+0x5b/0x80 kernel/printk/printk.c:342
+  softirqs last  enabled at (3096): [<ffffffff840e1be0>] local_bh_enable include/linux/bottom_half.h:33 [inline]
+  softirqs last  enabled at (3096): [<ffffffff840e1be0>] inet_csk_listen_stop+0x2c0/0x1070 net/ipv4/inet_connection_sock.c:1524
+  softirqs last disabled at (3097): [<ffffffff813b6b9f>] do_softirq+0x3f/0x90 kernel/softirq.c:480
+
+Since we need to track the 'fallback is possible' condition and the
+fallback status separately, there are a few possible races open between
+the check and the actual fallback action.
+
+Add a spinlock to protect the fallback-related information and use it to
+close all the possible related races. While at it also remove the
+too-early clearing of allow_infinite_fallback in __mptcp_subflow_connect():
+the field will be correctly cleared by subflow_finish_connect() if/when
+the connection will complete successfully.
+
+If fallback is not possible, as per RFC, reset the current subflow.
+
+Since the fallback operation can now fail and its return value should be
+checked, rename the helper accordingly.
+
+Fixes: 0530020a7c8f ("mptcp: track and update contiguous data status")
+Cc: stable@vger.kernel.org
+Reported-by: Matthieu Baerts <matttbe@kernel.org>
+Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/570
+Reported-by: syzbot+5cf807c20386d699b524@syzkaller.appspotmail.com
+Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/555
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Reviewed-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
+Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
+Link: https://patch.msgid.link/20250714-net-mptcp-fallback-races-v1-1-391aff963322@kernel.org
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+[ Conflicts in protocol.h, because commit 6ebf6f90ab4a ("mptcp: add
+  mptcpi_subflows_total counter") is not in this version, and this
+  causes conflicts in the context. Commit 65b02260a0e0 ("mptcp: export
+  mptcp_subflow_early_fallback()") is also not in this version, and
+  moves code from protocol.c to protocol.h, but the modification can
+  still apply there. Conflicts in protocol.c because commit ee2708aedad0
+  ("mptcp: use get_retrans wrapper") is not in this version and refactor
+  the code in __mptcp_retrans(), but the modification can still be
+  applied, just not at the same indentation level. There were other
+  conflicts in the context due to commit 8005184fd1ca ("mptcp: refactor
+  sndbuf auto-tuning"), commit b3ea6b272d79 ("mptcp: consolidate initial
+  ack seq generation"), and commit 013e3179dbd2 ("mptcp: fix rcv space
+  initialization") that are not in this version. ]
+Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/mptcp/options.c  |    3 ++-
+ net/mptcp/protocol.c |   39 +++++++++++++++++++++++++++++++++------
+ net/mptcp/protocol.h |   24 ++++++++++++++++++------
+ net/mptcp/subflow.c  |   11 +++++------
+ 4 files changed, 58 insertions(+), 19 deletions(-)
+
+--- a/net/mptcp/options.c
++++ b/net/mptcp/options.c
+@@ -973,8 +973,9 @@ static bool check_fully_established(stru
+               if (subflow->mp_join)
+                       goto reset;
+               subflow->mp_capable = 0;
++              if (!mptcp_try_fallback(ssk))
++                      goto reset;
+               pr_fallback(msk);
+-              mptcp_do_fallback(ssk);
+               return false;
+       }
+--- a/net/mptcp/protocol.c
++++ b/net/mptcp/protocol.c
+@@ -633,10 +633,9 @@ static bool mptcp_check_data_fin(struct
+ static void mptcp_dss_corruption(struct mptcp_sock *msk, struct sock *ssk)
+ {
+-      if (READ_ONCE(msk->allow_infinite_fallback)) {
++      if (mptcp_try_fallback(ssk)) {
+               MPTCP_INC_STATS(sock_net(ssk),
+                               MPTCP_MIB_DSSCORRUPTIONFALLBACK);
+-              mptcp_do_fallback(ssk);
+       } else {
+               MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSCORRUPTIONRESET);
+               mptcp_subflow_reset(ssk);
+@@ -897,6 +896,14 @@ static bool __mptcp_finish_join(struct m
+       if (sk->sk_state != TCP_ESTABLISHED)
+               return false;
++      spin_lock_bh(&msk->fallback_lock);
++      if (__mptcp_check_fallback(msk)) {
++              spin_unlock_bh(&msk->fallback_lock);
++              return false;
++      }
++      mptcp_subflow_joined(msk, ssk);
++      spin_unlock_bh(&msk->fallback_lock);
++
+       /* attach to msk socket only after we are sure we will deal with it
+        * at close time
+        */
+@@ -904,7 +911,6 @@ static bool __mptcp_finish_join(struct m
+               mptcp_sock_graft(ssk, sk->sk_socket);
+       mptcp_sockopt_sync_locked(msk, ssk);
+-      mptcp_subflow_joined(msk, ssk);
+       mptcp_stop_tout_timer(sk);
+       return true;
+ }
+@@ -1288,10 +1294,14 @@ static void mptcp_update_infinite_map(st
+       mpext->infinite_map = 1;
+       mpext->data_len = 0;
++      if (!mptcp_try_fallback(ssk)) {
++              mptcp_subflow_reset(ssk);
++              return;
++      }
++
+       MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPTX);
+       mptcp_subflow_ctx(ssk)->send_infinite_map = 0;
+       pr_fallback(msk);
+-      mptcp_do_fallback(ssk);
+ }
+ #define MPTCP_MAX_GSO_SIZE (GSO_LEGACY_MAX_SIZE - (MAX_TCP_HEADER + 1))
+@@ -2638,8 +2648,8 @@ static void mptcp_check_fastclose(struct
+ static void __mptcp_retrans(struct sock *sk)
+ {
++      struct mptcp_sendmsg_info info = { .data_lock_held = true, };
+       struct mptcp_sock *msk = mptcp_sk(sk);
+-      struct mptcp_sendmsg_info info = {};
+       struct mptcp_data_frag *dfrag;
+       size_t copied = 0;
+       struct sock *ssk;
+@@ -2675,6 +2685,15 @@ static void __mptcp_retrans(struct sock
+       /* limit retransmission to the bytes already sent on some subflows */
+       info.sent = 0;
+       info.limit = READ_ONCE(msk->csum_enabled) ? dfrag->data_len : dfrag->already_sent;
++
++      /* make the whole retrans decision, xmit, disallow fallback atomic */
++      spin_lock_bh(&msk->fallback_lock);
++      if (__mptcp_check_fallback(msk)) {
++              spin_unlock_bh(&msk->fallback_lock);
++              release_sock(ssk);
++              return;
++      }
++
+       while (info.sent < info.limit) {
+               ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
+               if (ret <= 0)
+@@ -2690,6 +2709,7 @@ static void __mptcp_retrans(struct sock
+                        info.size_goal);
+               WRITE_ONCE(msk->allow_infinite_fallback, false);
+       }
++      spin_unlock_bh(&msk->fallback_lock);
+       release_sock(ssk);
+@@ -2819,6 +2839,7 @@ static int __mptcp_init_sock(struct sock
+       msk->recovery = false;
+       mptcp_pm_data_init(msk);
++      spin_lock_init(&msk->fallback_lock);
+       /* re-use the csk retrans timer for MPTCP-level retrans */
+       timer_setup(&msk->sk.icsk_retransmit_timer, mptcp_retransmit_timer, 0);
+@@ -3651,7 +3672,13 @@ bool mptcp_finish_join(struct sock *ssk)
+       /* active subflow, already present inside the conn_list */
+       if (!list_empty(&subflow->node)) {
++              spin_lock_bh(&msk->fallback_lock);
++              if (__mptcp_check_fallback(msk)) {
++                      spin_unlock_bh(&msk->fallback_lock);
++                      return false;
++              }
+               mptcp_subflow_joined(msk, ssk);
++              spin_unlock_bh(&msk->fallback_lock);
+               return true;
+       }
+@@ -3764,7 +3791,7 @@ static void mptcp_subflow_early_fallback
+                                        struct mptcp_subflow_context *subflow)
+ {
+       subflow->request_mptcp = 0;
+-      __mptcp_do_fallback(msk);
++      WARN_ON_ONCE(!__mptcp_try_fallback(msk));
+ }
+ static int mptcp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+--- a/net/mptcp/protocol.h
++++ b/net/mptcp/protocol.h
+@@ -317,6 +317,10 @@ struct mptcp_sock {
+       u32 setsockopt_seq;
+       char            ca_name[TCP_CA_NAME_MAX];
++
++      spinlock_t      fallback_lock;  /* protects fallback and
++                                       * allow_infinite_fallback
++                                       */
+ };
+ #define mptcp_data_lock(sk) spin_lock_bh(&(sk)->sk_lock.slock)
+@@ -975,25 +979,32 @@ static inline bool mptcp_check_fallback(
+       return __mptcp_check_fallback(msk);
+ }
+-static inline void __mptcp_do_fallback(struct mptcp_sock *msk)
++static inline bool __mptcp_try_fallback(struct mptcp_sock *msk)
+ {
+       if (test_bit(MPTCP_FALLBACK_DONE, &msk->flags)) {
+               pr_debug("TCP fallback already done (msk=%p)\n", msk);
+-              return;
++              return true;
+       }
+-      if (WARN_ON_ONCE(!READ_ONCE(msk->allow_infinite_fallback)))
+-              return;
++      spin_lock_bh(&msk->fallback_lock);
++      if (!msk->allow_infinite_fallback) {
++              spin_unlock_bh(&msk->fallback_lock);
++              return false;
++      }
++
+       set_bit(MPTCP_FALLBACK_DONE, &msk->flags);
++      spin_unlock_bh(&msk->fallback_lock);
++      return true;
+ }
+-static inline void mptcp_do_fallback(struct sock *ssk)
++static inline bool mptcp_try_fallback(struct sock *ssk)
+ {
+       struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+       struct sock *sk = subflow->conn;
+       struct mptcp_sock *msk;
+       msk = mptcp_sk(sk);
+-      __mptcp_do_fallback(msk);
++      if (!__mptcp_try_fallback(msk))
++              return false;
+       if (READ_ONCE(msk->snd_data_fin_enable) && !(ssk->sk_shutdown & SEND_SHUTDOWN)) {
+               gfp_t saved_allocation = ssk->sk_allocation;
+@@ -1005,6 +1016,7 @@ static inline void mptcp_do_fallback(str
+               tcp_shutdown(ssk, SEND_SHUTDOWN);
+               ssk->sk_allocation = saved_allocation;
+       }
++      return true;
+ }
+ #define pr_fallback(a) pr_debug("%s:fallback to TCP (msk=%p)\n", __func__, a)
+--- a/net/mptcp/subflow.c
++++ b/net/mptcp/subflow.c
+@@ -431,9 +431,11 @@ static void subflow_finish_connect(struc
+       mptcp_get_options(skb, &mp_opt);
+       if (subflow->request_mptcp) {
+               if (!(mp_opt.suboptions & OPTION_MPTCP_MPC_SYNACK)) {
++                      if (!mptcp_try_fallback(sk))
++                              goto do_reset;
++
+                       MPTCP_INC_STATS(sock_net(sk),
+                                       MPTCP_MIB_MPCAPABLEACTIVEFALLBACK);
+-                      mptcp_do_fallback(sk);
+                       pr_fallback(mptcp_sk(subflow->conn));
+                       goto fallback;
+               }
+@@ -1269,7 +1271,7 @@ fallback:
+                       return true;
+               }
+-              if (!READ_ONCE(msk->allow_infinite_fallback)) {
++              if (!mptcp_try_fallback(ssk)) {
+                       /* fatal protocol error, close the socket.
+                        * subflow_error_report() will introduce the appropriate barriers
+                        */
+@@ -1285,8 +1287,6 @@ reset:
+                       WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA);
+                       return false;
+               }
+-
+-              mptcp_do_fallback(ssk);
+       }
+       skb = skb_peek(&ssk->sk_receive_queue);
+@@ -1519,7 +1519,6 @@ int __mptcp_subflow_connect(struct sock
+       /* discard the subflow socket */
+       mptcp_sock_graft(ssk, sk->sk_socket);
+       iput(SOCK_INODE(sf));
+-      WRITE_ONCE(msk->allow_infinite_fallback, false);
+       mptcp_stop_tout_timer(sk);
+       return 0;
+@@ -1690,7 +1689,7 @@ static void subflow_state_change(struct
+       msk = mptcp_sk(parent);
+       if (subflow_simultaneous_connect(sk)) {
+               mptcp_propagate_sndbuf(parent, sk);
+-              mptcp_do_fallback(sk);
++              WARN_ON_ONCE(!mptcp_try_fallback(sk));
+               mptcp_rcv_space_init(msk, sk);
+               pr_fallback(msk);
+               subflow->conn_finished = 1;
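
The essence of the patch above is replacing an unlocked check-then-act pair with a
single try-style helper that both decides and performs the fallback under
msk->fallback_lock. As a rough userspace C sketch of that locking pattern only --
field and function names are hypothetical and merely mirror allow_infinite_fallback
and MPTCP_FALLBACK_DONE, this is not kernel code:

	/* Illustrative userspace sketch of the check-and-fallback-under-lock
	 * pattern. Build with: cc -pthread -o try_fallback try_fallback.c */
	#include <pthread.h>
	#include <stdbool.h>
	#include <stdio.h>

	struct fake_msk {
		pthread_mutex_t fallback_lock;
		bool allow_infinite_fallback;
		bool fallback_done;
	};

	/* Return true if the connection is (now) fallen back, false if fallback
	 * is no longer possible and the caller must reset the subflow instead. */
	static bool try_fallback(struct fake_msk *msk)
	{
		bool ok;

		pthread_mutex_lock(&msk->fallback_lock);
		if (msk->fallback_done)
			ok = true;		/* already done, nothing to do */
		else if (!msk->allow_infinite_fallback)
			ok = false;		/* too late, fallback not allowed */
		else
			ok = msk->fallback_done = true;	/* decision and action are atomic */
		pthread_mutex_unlock(&msk->fallback_lock);
		return ok;
	}

	int main(void)
	{
		struct fake_msk msk = {
			.fallback_lock = PTHREAD_MUTEX_INITIALIZER,
			.allow_infinite_fallback = true,
		};

		printf("first try: %d\n", try_fallback(&msk));	/* 1: falls back */
		printf("second try: %d\n", try_fallback(&msk));	/* 1: already done */
		return 0;
	}
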
diff --git a/queue-6.1/mptcp-plug-races-between-subflow-fail-and-subflow-creation.patch b/queue-6.1/mptcp-plug-races-between-subflow-fail-and-subflow-creation.patch
new file mode 100644 (file)
index 0000000..715ce44
--- /dev/null
@@ -0,0 +1,201 @@
+From stable+bounces-164936-greg=kroah.com@vger.kernel.org Mon Jul 28 15:29:45 2025
+From: "Matthieu Baerts (NGI0)" <matttbe@kernel.org>
+Date: Mon, 28 Jul 2025 15:29:22 +0200
+Subject: mptcp: plug races between subflow fail and subflow creation
+To: mptcp@lists.linux.dev, stable@vger.kernel.org, gregkh@linuxfoundation.org
+Cc: Paolo Abeni <pabeni@redhat.com>, sashal@kernel.org, "Matthieu Baerts (NGI0)" <matttbe@kernel.org>, Jakub Kicinski <kuba@kernel.org>
+Message-ID: <20250728132919.3904847-7-matttbe@kernel.org>
+
+From: Paolo Abeni <pabeni@redhat.com>
+
+commit def5b7b2643ebba696fc60ddf675dca13f073486 upstream.
+
+We have races similar to the one addressed by the previous patch between
+subflow failing and additional subflow creation. They are just harder to
+trigger.
+
+The solution is similar. Use a separate flag to track the condition
+'socket state prevents any additional subflow creation', protected by the
+fallback lock.
+
+The socket fallback sets this flag to true, as does receiving or sending
+an MP_FAIL option.
+
+The field 'allow_infinite_fallback' is now always touched under the
+relevant lock, so we can drop the ONCE annotation on write.
+
+Fixes: 478d770008b0 ("mptcp: send out MP_FAIL when data checksum fails")
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Reviewed-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
+Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
+Link: https://patch.msgid.link/20250714-net-mptcp-fallback-races-v1-2-391aff963322@kernel.org
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+[ Conflicts in subflow.c, because commit f1f26512a9bf ("mptcp: use plain
+  bool instead of custom binary enum") and commit 46a5d3abedbe
+  ("mptcp: fix typos in comments") are not in this version. Both are
+  causing conflicts in the context, and the same modifications can still
+  be applied. Same in protocol.h with commit b8dc6d6ce931 ("mptcp: fix
+  rcv buffer auto-tuning"). Conflicts in protocol.c because commit
+  ee2708aedad0 ("mptcp: use get_retrans wrapper") is not in this version
+  and refactors the code in __mptcp_retrans(), but the modification can
+  still be applied, just not at the same indentation level. ]
+Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/mptcp/pm.c       |    8 +++++++-
+ net/mptcp/protocol.c |   11 ++++++-----
+ net/mptcp/protocol.h |    7 +++++--
+ net/mptcp/subflow.c  |   19 ++++++++++++++-----
+ 4 files changed, 32 insertions(+), 13 deletions(-)
+
+--- a/net/mptcp/pm.c
++++ b/net/mptcp/pm.c
+@@ -309,8 +309,14 @@ void mptcp_pm_mp_fail_received(struct so
+       pr_debug("fail_seq=%llu\n", fail_seq);
+-      if (!READ_ONCE(msk->allow_infinite_fallback))
++      /* After accepting the fail, we can't create any other subflows */
++      spin_lock_bh(&msk->fallback_lock);
++      if (!msk->allow_infinite_fallback) {
++              spin_unlock_bh(&msk->fallback_lock);
+               return;
++      }
++      msk->allow_subflows = false;
++      spin_unlock_bh(&msk->fallback_lock);
+       if (!subflow->fail_tout) {
+               pr_debug("send MP_FAIL response and infinite map\n");
+--- a/net/mptcp/protocol.c
++++ b/net/mptcp/protocol.c
+@@ -885,7 +885,7 @@ void mptcp_data_ready(struct sock *sk, s
+ static void mptcp_subflow_joined(struct mptcp_sock *msk, struct sock *ssk)
+ {
+       mptcp_subflow_ctx(ssk)->map_seq = READ_ONCE(msk->ack_seq);
+-      WRITE_ONCE(msk->allow_infinite_fallback, false);
++      msk->allow_infinite_fallback = false;
+       mptcp_event(MPTCP_EVENT_SUB_ESTABLISHED, msk, ssk, GFP_ATOMIC);
+ }
+@@ -897,7 +897,7 @@ static bool __mptcp_finish_join(struct m
+               return false;
+       spin_lock_bh(&msk->fallback_lock);
+-      if (__mptcp_check_fallback(msk)) {
++      if (!msk->allow_subflows) {
+               spin_unlock_bh(&msk->fallback_lock);
+               return false;
+       }
+@@ -2707,7 +2707,7 @@ static void __mptcp_retrans(struct sock
+               dfrag->already_sent = max(dfrag->already_sent, info.sent);
+               tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
+                        info.size_goal);
+-              WRITE_ONCE(msk->allow_infinite_fallback, false);
++              msk->allow_infinite_fallback = false;
+       }
+       spin_unlock_bh(&msk->fallback_lock);
+@@ -2835,7 +2835,8 @@ static int __mptcp_init_sock(struct sock
+       WRITE_ONCE(msk->first, NULL);
+       inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss;
+       WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk)));
+-      WRITE_ONCE(msk->allow_infinite_fallback, true);
++      msk->allow_infinite_fallback = true;
++      msk->allow_subflows = true;
+       msk->recovery = false;
+       mptcp_pm_data_init(msk);
+@@ -3673,7 +3674,7 @@ bool mptcp_finish_join(struct sock *ssk)
+       /* active subflow, already present inside the conn_list */
+       if (!list_empty(&subflow->node)) {
+               spin_lock_bh(&msk->fallback_lock);
+-              if (__mptcp_check_fallback(msk)) {
++              if (!msk->allow_subflows) {
+                       spin_unlock_bh(&msk->fallback_lock);
+                       return false;
+               }
+--- a/net/mptcp/protocol.h
++++ b/net/mptcp/protocol.h
+@@ -314,12 +314,14 @@ struct mptcp_sock {
+               u64     time;   /* start time of measurement window */
+               u64     rtt_us; /* last maximum rtt of subflows */
+       } rcvq_space;
++      bool            allow_subflows;
+       u32 setsockopt_seq;
+       char            ca_name[TCP_CA_NAME_MAX];
+-      spinlock_t      fallback_lock;  /* protects fallback and
+-                                       * allow_infinite_fallback
++      spinlock_t      fallback_lock;  /* protects fallback,
++                                       * allow_infinite_fallback and
++                                       * allow_join
+                                        */
+ };
+@@ -991,6 +993,7 @@ static inline bool __mptcp_try_fallback(
+               return false;
+       }
++      msk->allow_subflows = false;
+       set_bit(MPTCP_FALLBACK_DONE, &msk->flags);
+       spin_unlock_bh(&msk->fallback_lock);
+       return true;
+--- a/net/mptcp/subflow.c
++++ b/net/mptcp/subflow.c
+@@ -1168,20 +1168,29 @@ static void subflow_sched_work_if_closed
+               mptcp_schedule_work(sk);
+ }
+-static void mptcp_subflow_fail(struct mptcp_sock *msk, struct sock *ssk)
++static bool mptcp_subflow_fail(struct mptcp_sock *msk, struct sock *ssk)
+ {
+       struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+       unsigned long fail_tout;
++      /* we are really failing, prevent any later subflow join */
++      spin_lock_bh(&msk->fallback_lock);
++      if (!msk->allow_infinite_fallback) {
++              spin_unlock_bh(&msk->fallback_lock);
++              return false;
++      }
++      msk->allow_subflows = false;
++      spin_unlock_bh(&msk->fallback_lock);
++
+       /* greceful failure can happen only on the MPC subflow */
+       if (WARN_ON_ONCE(ssk != READ_ONCE(msk->first)))
+-              return;
++              return false;
+       /* since the close timeout take precedence on the fail one,
+        * no need to start the latter when the first is already set
+        */
+       if (sock_flag((struct sock *)msk, SOCK_DEAD))
+-              return;
++              return true;
+       /* we don't need extreme accuracy here, use a zero fail_tout as special
+        * value meaning no fail timeout at all;
+@@ -1193,6 +1202,7 @@ static void mptcp_subflow_fail(struct mp
+       tcp_send_ack(ssk);
+       mptcp_reset_tout_timer(msk, subflow->fail_tout);
++      return true;
+ }
+ static bool subflow_check_data_avail(struct sock *ssk)
+@@ -1261,12 +1271,11 @@ fallback:
+                   (subflow->mp_join || subflow->valid_csum_seen)) {
+                       subflow->send_mp_fail = 1;
+-                      if (!READ_ONCE(msk->allow_infinite_fallback)) {
++                      if (!mptcp_subflow_fail(msk, ssk)) {
+                               subflow->reset_transient = 0;
+                               subflow->reset_reason = MPTCP_RST_EMIDDLEBOX;
+                               goto reset;
+                       }
+-                      mptcp_subflow_fail(msk, ssk);
+                       WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_DATA_AVAIL);
+                       return true;
+               }
diff --git a/queue-6.1/mptcp-reset-fallback-status-gracefully-at-disconnect-time.patch b/queue-6.1/mptcp-reset-fallback-status-gracefully-at-disconnect-time.patch
new file mode 100644 (file)
index 0000000..a7fea43
--- /dev/null
@@ -0,0 +1,58 @@
+From stable+bounces-164938-greg=kroah.com@vger.kernel.org Mon Jul 28 15:29:45 2025
+From: "Matthieu Baerts (NGI0)" <matttbe@kernel.org>
+Date: Mon, 28 Jul 2025 15:29:23 +0200
+Subject: mptcp: reset fallback status gracefully at disconnect() time
+To: mptcp@lists.linux.dev, stable@vger.kernel.org, gregkh@linuxfoundation.org
+Cc: Paolo Abeni <pabeni@redhat.com>, sashal@kernel.org, "Matthieu Baerts (NGI0)" <matttbe@kernel.org>, Jakub Kicinski <kuba@kernel.org>
+Message-ID: <20250728132919.3904847-8-matttbe@kernel.org>
+
+From: Paolo Abeni <pabeni@redhat.com>
+
+commit da9b2fc7b73d147d88abe1922de5ab72d72d7756 upstream.
+
+mptcp_disconnect() clears the fallback bit unconditionally, without
+touching the associated flags.
+
+The bit clear is safe, as no fallback operation can race with that --
+all subflows are already in TCP_CLOSE status thanks to the previous
+FASTCLOSE -- but we need to consistently reset all the fallback related
+status.
+
+Also acquire the relevant lock, to avoid fouling static analyzers.
+
+Fixes: b29fcfb54cd7 ("mptcp: full disconnect implementation")
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Reviewed-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
+Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
+Link: https://patch.msgid.link/20250714-net-mptcp-fallback-races-v1-3-391aff963322@kernel.org
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+[ Conflicts in protocol.c, because commit ebc1e08f01eb ("mptcp: drop
+  last_snd and MPTCP_RESET_SCHEDULER") is not in this version and
+  changed the context. The same modification can still be applied at the
+  same place. ]
+Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/mptcp/protocol.c |    9 +++++++++
+ 1 file changed, 9 insertions(+)
+
+--- a/net/mptcp/protocol.c
++++ b/net/mptcp/protocol.c
+@@ -3204,7 +3204,16 @@ static int mptcp_disconnect(struct sock
+        */
+       mptcp_destroy_common(msk, MPTCP_CF_FASTCLOSE);
+       msk->last_snd = NULL;
++
++      /* The first subflow is already in TCP_CLOSE status, the following
++       * can't overlap with a fallback anymore
++       */
++      spin_lock_bh(&msk->fallback_lock);
++      msk->allow_subflows = true;
++      msk->allow_infinite_fallback = true;
+       WRITE_ONCE(msk->flags, 0);
++      spin_unlock_bh(&msk->fallback_lock);
++
+       msk->cb_flags = 0;
+       msk->recovery = false;
+       msk->can_ack = false;
diff --git a/queue-6.1/selftests-memfd-add-test-for-mapping-write-sealed-memfd-read-only.patch b/queue-6.1/selftests-memfd-add-test-for-mapping-write-sealed-memfd-read-only.patch
new file mode 100644 (file)
index 0000000..1988aa9
--- /dev/null
@@ -0,0 +1,98 @@
+From stable+bounces-165167-greg=kroah.com@vger.kernel.org Wed Jul 30 03:54:17 2025
+From: "Isaac J. Manjarres" <isaacmanjarres@google.com>
+Date: Tue, 29 Jul 2025 18:52:43 -0700
+Subject: selftests/memfd: add test for mapping write-sealed memfd read-only
+To: lorenzo.stoakes@oracle.com, gregkh@linuxfoundation.org,  Shuah Khan <shuah@kernel.org>
+Cc: aliceryhl@google.com, surenb@google.com, stable@vger.kernel.org,  "Isaac J. Manjarres" <isaacmanjarres@google.com>, kernel-team@android.com,  Jann Horn <jannh@google.com>, Julian Orth <ju.orth@gmail.com>,  "Liam R. Howlett" <Liam.Howlett@Oracle.com>, Linus Torvalds <torvalds@linux-foundation.org>,  Vlastimil Babka <vbabka@suse.cz>, Andrew Morton <akpm@linux-foundation.org>,  linux-kselftest@vger.kernel.org, linux-kernel@vger.kernel.org
+Message-ID: <20250730015247.30827-5-isaacmanjarres@google.com>
+
+From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
+
+[ Upstream commit ea0916e01d0b0f2cce1369ac1494239a79827270 ]
+
+Now that we have reinstated the ability to map F_SEAL_WRITE mappings read-only,
+assert that we are able to do this in a test to ensure that we do not
+regress this again.
+
+Link: https://lkml.kernel.org/r/a6377ec470b14c0539b4600cf8fa24bf2e4858ae.1732804776.git.lorenzo.stoakes@oracle.com
+Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
+Cc: Jann Horn <jannh@google.com>
+Cc: Julian Orth <ju.orth@gmail.com>
+Cc: Liam R. Howlett <Liam.Howlett@Oracle.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Shuah Khan <shuah@kernel.org>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Cc: stable@vger.kernel.org
+Signed-off-by: Isaac J. Manjarres <isaacmanjarres@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ tools/testing/selftests/memfd/memfd_test.c |   43 +++++++++++++++++++++++++++++
+ 1 file changed, 43 insertions(+)
+
+--- a/tools/testing/selftests/memfd/memfd_test.c
++++ b/tools/testing/selftests/memfd/memfd_test.c
+@@ -186,6 +186,24 @@ static void *mfd_assert_mmap_shared(int
+       return p;
+ }
++static void *mfd_assert_mmap_read_shared(int fd)
++{
++      void *p;
++
++      p = mmap(NULL,
++               mfd_def_size,
++               PROT_READ,
++               MAP_SHARED,
++               fd,
++               0);
++      if (p == MAP_FAILED) {
++              printf("mmap() failed: %m\n");
++              abort();
++      }
++
++      return p;
++}
++
+ static void *mfd_assert_mmap_private(int fd)
+ {
+       void *p;
+@@ -802,6 +820,30 @@ static void test_seal_future_write(void)
+       close(fd);
+ }
++static void test_seal_write_map_read_shared(void)
++{
++      int fd;
++      void *p;
++
++      printf("%s SEAL-WRITE-MAP-READ\n", memfd_str);
++
++      fd = mfd_assert_new("kern_memfd_seal_write_map_read",
++                          mfd_def_size,
++                          MFD_CLOEXEC | MFD_ALLOW_SEALING);
++
++      mfd_assert_add_seals(fd, F_SEAL_WRITE);
++      mfd_assert_has_seals(fd, F_SEAL_WRITE);
++
++      p = mfd_assert_mmap_read_shared(fd);
++
++      mfd_assert_read(fd);
++      mfd_assert_read_shared(fd);
++      mfd_fail_write(fd);
++
++      munmap(p, mfd_def_size);
++      close(fd);
++}
++
+ /*
+  * Test SEAL_SHRINK
+  * Test whether SEAL_SHRINK actually prevents shrinking
+@@ -1056,6 +1098,7 @@ int main(int argc, char **argv)
+       test_seal_write();
+       test_seal_future_write();
++      test_seal_write_map_read_shared();
+       test_seal_shrink();
+       test_seal_grow();
+       test_seal_resize();
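
The new selftest above boils down to: create a memfd, seal it with F_SEAL_WRITE,
and verify that a PROT_READ MAP_SHARED mapping succeeds while writable mappings
keep failing. A minimal standalone sketch of the same check -- a hypothetical
demo, not part of the selftest, assuming a glibc with memfd_create() and a kernel
carrying the mm fixes in this series:

	/* Build with: cc -o seal_ro seal_ro.c */
	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/mman.h>
	#include <unistd.h>

	int main(void)
	{
		const size_t size = 4096;
		void *ro, *rw;
		int fd;

		fd = memfd_create("seal_ro_demo", MFD_CLOEXEC | MFD_ALLOW_SEALING);
		if (fd < 0 || ftruncate(fd, size) < 0) {
			perror("memfd_create/ftruncate");
			return 1;
		}

		if (fcntl(fd, F_ADD_SEALS, F_SEAL_WRITE) < 0) {
			perror("F_ADD_SEALS");
			return 1;
		}

		/* With the mm fixes above, a shared read-only mapping must succeed. */
		ro = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0);
		if (ro == MAP_FAILED) {
			perror("read-only MAP_SHARED mmap");
			return 1;
		}

		/* A writable shared mapping must still be refused. */
		rw = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
		printf("read-only map ok, writable map %s\n",
		       rw == MAP_FAILED ? "rejected as expected" : "unexpectedly allowed");

		if (rw != MAP_FAILED)
			munmap(rw, size);
		munmap(ro, size);
		close(fd);
		return 0;
	}
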
index 65cf4a8d28906e28d299de358dcad13552006b7c..c0add043570549bfd45ff581093323ef67f7d0ab 100644 (file)
@@ -375,3 +375,19 @@ platform-chrome-cros_ec-unregister-notifier-in-cros_ec_unregister.patch
 usb-dwc3-imx8mp-fix-device-leak-at-unbind.patch
 ata-fix-sata_mobile_lpm_policy-description-in-kconfig.patch
 btrfs-populate-otime-when-logging-an-inode-item.patch
+tls-separate-no-async-decryption-request-handling-from-async.patch
+crypto-qat-fix-ring-to-service-map-for-qat-gen4.patch
+arm64-cpufeatures-kvm-add-armv8.9-feat_ecbhb-bits-in-id_aa64mmfr1-register.patch
+kvm-x86-take-irqfds.lock-when-adding-deleting-irq-bypass-producer.patch
+mptcp-make-fallback-action-and-fallback-decision-atomic.patch
+mptcp-plug-races-between-subflow-fail-and-subflow-creation.patch
+mptcp-reset-fallback-status-gracefully-at-disconnect-time.patch
+mm-drop-the-assumption-that-vm_shared-always-implies-writable.patch
+mm-update-memfd-seal-write-check-to-include-f_seal_write.patch
+mm-reinstate-ability-to-map-write-sealed-memfd-mappings-read-only.patch
+selftests-memfd-add-test-for-mapping-write-sealed-memfd-read-only.patch
+bluetooth-hci_sync-fix-uaf-on-hci_abort_conn_sync.patch
+kbuild-userprogs-use-correct-linker-when-mixing-clang-and-gnu-ld.patch
+x86-reboot-harden-virtualization-hooks-for-emergency-reboot.patch
+x86-reboot-kvm-handle-vmxoff-in-kvm-s-reboot-callback.patch
+kvm-vmx-flush-shadow-vmcs-on-emergency-reboot.patch
diff --git a/queue-6.1/tls-separate-no-async-decryption-request-handling-from-async.patch b/queue-6.1/tls-separate-no-async-decryption-request-handling-from-async.patch
new file mode 100644 (file)
index 0000000..0ac3508
--- /dev/null
@@ -0,0 +1,61 @@
+From 41532b785e9d79636b3815a64ddf6a096647d011 Mon Sep 17 00:00:00 2001
+From: Sabrina Dubroca <sd@queasysnail.net>
+Date: Wed, 28 Feb 2024 23:43:59 +0100
+Subject: tls: separate no-async decryption request handling from async
+
+From: Sabrina Dubroca <sd@queasysnail.net>
+
+commit 41532b785e9d79636b3815a64ddf6a096647d011 upstream.
+
+If we're not doing async, the handling is much simpler. There's no
+reference counting; we just need to wait for the completion to wake us
+up and return its result.
+
+We should preferably also use a separate crypto_wait. I'm not seeing a
+UAF as I did in the past; I think aec7961916f3 ("tls: fix race between
+async notify and socket close") took care of it.
+
+This will make the next fix easier.
+
+Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
+Link: https://lore.kernel.org/r/47bde5f649707610eaef9f0d679519966fc31061.1709132643.git.sd@queasysnail.net
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+[ William: The original patch did not apply cleanly due to deletions of
+  non-existent lines in 6.1.y. The UAF the author stopped seeing can still
+  be reproduced on systems without AVX in conjunction with cryptd.
+  Also removed an extraneous statement after a return statement that is
+  adjacent to the diff. ]
+Link: https://lore.kernel.org/netdev/he2K1yz_u7bZ-CnYcTSQ4OxuLuHZXN6xZRgp6_ICSWnq8J5FpI_uD1i_1lTSf7WMrYb5ThiX1OR2GTOB2IltgT49Koy7Hhutr4du4KtLvyk=@willsroot.io/
+Signed-off-by: William Liu <will@willsroot.io>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/tls/tls_sw.c |    9 +++++++--
+ 1 file changed, 7 insertions(+), 2 deletions(-)
+
+--- a/net/tls/tls_sw.c
++++ b/net/tls/tls_sw.c
+@@ -274,9 +274,15 @@ static int tls_do_decryption(struct sock
+               DEBUG_NET_WARN_ON_ONCE(atomic_read(&ctx->decrypt_pending) < 1);
+               atomic_inc(&ctx->decrypt_pending);
+       } else {
++              DECLARE_CRYPTO_WAIT(wait);
++
+               aead_request_set_callback(aead_req,
+                                         CRYPTO_TFM_REQ_MAY_BACKLOG,
+-                                        crypto_req_done, &ctx->async_wait);
++                                        crypto_req_done, &wait);
++              ret = crypto_aead_decrypt(aead_req);
++              if (ret == -EINPROGRESS || ret == -EBUSY)
++                      ret = crypto_wait_req(ret, &wait);
++              return ret;
+       }
+       ret = crypto_aead_decrypt(aead_req);
+@@ -289,7 +295,6 @@ static int tls_do_decryption(struct sock
+               /* all completions have run, we're not doing async anymore */
+               darg->async = false;
+               return ret;
+-              ret = ret ?: -EINPROGRESS;
+       }
+       atomic_dec(&ctx->decrypt_pending);
diff --git a/queue-6.1/x86-reboot-harden-virtualization-hooks-for-emergency-reboot.patch b/queue-6.1/x86-reboot-harden-virtualization-hooks-for-emergency-reboot.patch
new file mode 100644 (file)
index 0000000..3dcf3fa
--- /dev/null
@@ -0,0 +1,113 @@
+From stable+bounces-164647-greg=kroah.com@vger.kernel.org Thu Jul 24 19:07:39 2025
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 24 Jul 2025 13:07:23 -0400
+Subject: x86/reboot: Harden virtualization hooks for emergency reboot
+To: stable@vger.kernel.org
+Cc: Sean Christopherson <seanjc@google.com>, Kai Huang <kai.huang@intel.com>, Sasha Levin <sashal@kernel.org>
+Message-ID: <20250724170725.1404455-1-sashal@kernel.org>
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit 5e408396c60cd0f0b53a43713016b6d6af8d69e0 ]
+
+Provide dedicated helpers to (un)register virt hooks used during an
+emergency crash/reboot, and WARN if there is an attempt to overwrite
+the registered callback, or an attempt to do an unpaired unregister.
+
+Opportunistically use rcu_assign_pointer() instead of RCU_INIT_POINTER(),
+mainly so that the set/unset paths are more symmetrical, but also because
+any performance gains from using RCU_INIT_POINTER() are meaningless for
+this code.
+
+Reviewed-by: Kai Huang <kai.huang@intel.com>
+Link: https://lore.kernel.org/r/20230721201859.2307736-3-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Stable-dep-of: a0ee1d5faff1 ("KVM: VMX: Flush shadow VMCS on emergency reboot")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/include/asm/reboot.h |    5 +++--
+ arch/x86/kernel/reboot.c      |   30 ++++++++++++++++++++++++------
+ arch/x86/kvm/vmx/vmx.c        |    6 ++----
+ 3 files changed, 29 insertions(+), 12 deletions(-)
+
+--- a/arch/x86/include/asm/reboot.h
++++ b/arch/x86/include/asm/reboot.h
+@@ -25,8 +25,9 @@ void __noreturn machine_real_restart(uns
+ #define MRR_BIOS      0
+ #define MRR_APM               1
+-typedef void crash_vmclear_fn(void);
+-extern crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss;
++typedef void (cpu_emergency_virt_cb)(void);
++void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback);
++void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback);
+ void cpu_emergency_disable_virtualization(void);
+ typedef void (*nmi_shootdown_cb)(int, struct pt_regs*);
+--- a/arch/x86/kernel/reboot.c
++++ b/arch/x86/kernel/reboot.c
+@@ -794,17 +794,35 @@ void machine_crash_shutdown(struct pt_re
+  *
+  * protected by rcu.
+  */
+-crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss;
+-EXPORT_SYMBOL_GPL(crash_vmclear_loaded_vmcss);
++static cpu_emergency_virt_cb __rcu *cpu_emergency_virt_callback;
++
++void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback)
++{
++      if (WARN_ON_ONCE(rcu_access_pointer(cpu_emergency_virt_callback)))
++              return;
++
++      rcu_assign_pointer(cpu_emergency_virt_callback, callback);
++}
++EXPORT_SYMBOL_GPL(cpu_emergency_register_virt_callback);
++
++void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback)
++{
++      if (WARN_ON_ONCE(rcu_access_pointer(cpu_emergency_virt_callback) != callback))
++              return;
++
++      rcu_assign_pointer(cpu_emergency_virt_callback, NULL);
++      synchronize_rcu();
++}
++EXPORT_SYMBOL_GPL(cpu_emergency_unregister_virt_callback);
+ static inline void cpu_crash_vmclear_loaded_vmcss(void)
+ {
+-      crash_vmclear_fn *do_vmclear_operation = NULL;
++      cpu_emergency_virt_cb *callback;
+       rcu_read_lock();
+-      do_vmclear_operation = rcu_dereference(crash_vmclear_loaded_vmcss);
+-      if (do_vmclear_operation)
+-              do_vmclear_operation();
++      callback = rcu_dereference(cpu_emergency_virt_callback);
++      if (callback)
++              callback();
+       rcu_read_unlock();
+ }
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -8602,8 +8602,7 @@ static void __vmx_exit(void)
+ {
+       allow_smaller_maxphyaddr = false;
+-      RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
+-      synchronize_rcu();
++      cpu_emergency_unregister_virt_callback(crash_vmclear_local_loaded_vmcss);
+       vmx_cleanup_l1d_flush();
+ }
+@@ -8677,8 +8676,7 @@ static int __init vmx_init(void)
+               pi_init_cpu(cpu);
+       }
+-      rcu_assign_pointer(crash_vmclear_loaded_vmcss,
+-                         crash_vmclear_local_loaded_vmcss);
++      cpu_emergency_register_virt_callback(crash_vmclear_local_loaded_vmcss);
+       vmx_check_vmcs12_offsets();
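
The hardening above amounts to a simple contract: at most one emergency
virtualization callback may be registered, overwriting a live callback is a bug,
and unregistering anything other than the registered callback is a bug. A plain
userspace C analogy of that contract -- assertions standing in for WARN_ON_ONCE()
and an ordinary pointer standing in for the RCU-protected one, not the kernel
implementation:

	#include <assert.h>
	#include <stddef.h>
	#include <stdio.h>

	typedef void (emergency_virt_cb)(void);

	static emergency_virt_cb *virt_callback;

	static void register_virt_callback(emergency_virt_cb *cb)
	{
		assert(virt_callback == NULL);	/* overwriting a live callback is a bug */
		virt_callback = cb;
	}

	static void unregister_virt_callback(emergency_virt_cb *cb)
	{
		assert(virt_callback == cb);	/* unpaired unregister is a bug */
		virt_callback = NULL;
	}

	static void emergency_disable_virtualization(void)
	{
		if (virt_callback)
			virt_callback();	/* e.g. VMCLEAR loaded VMCSs, then VMXOFF */
	}

	static void fake_vmx_emergency_disable(void)
	{
		puts("vmclear loaded vmcss, then vmxoff");
	}

	int main(void)
	{
		register_virt_callback(fake_vmx_emergency_disable);
		emergency_disable_virtualization();
		unregister_virt_callback(fake_vmx_emergency_disable);
		emergency_disable_virtualization();	/* no-op once unregistered */
		return 0;
	}
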
diff --git a/queue-6.1/x86-reboot-kvm-handle-vmxoff-in-kvm-s-reboot-callback.patch b/queue-6.1/x86-reboot-kvm-handle-vmxoff-in-kvm-s-reboot-callback.patch
new file mode 100644 (file)
index 0000000..f1d5fdf
--- /dev/null
@@ -0,0 +1,139 @@
+From stable+bounces-164648-greg=kroah.com@vger.kernel.org Thu Jul 24 19:07:43 2025
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 24 Jul 2025 13:07:24 -0400
+Subject: x86/reboot: KVM: Handle VMXOFF in KVM's reboot callback
+To: stable@vger.kernel.org
+Cc: Sean Christopherson <seanjc@google.com>, Kai Huang <kai.huang@intel.com>, Sasha Levin <sashal@kernel.org>
+Message-ID: <20250724170725.1404455-2-sashal@kernel.org>
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit 119b5cb4ffd0166f3e98e9ee042f5046f7744f28 ]
+
+Use KVM VMX's reboot/crash callback to do VMXOFF in an emergency instead
+of manually and blindly doing VMXOFF.  There's no need to attempt VMXOFF
+if a hypervisor, i.e. KVM, isn't loaded/active, i.e. if the CPU can't
+possibly be post-VMXON.
+
+Reviewed-by: Kai Huang <kai.huang@intel.com>
+Link: https://lore.kernel.org/r/20230721201859.2307736-4-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Stable-dep-of: a0ee1d5faff1 ("KVM: VMX: Flush shadow VMCS on emergency reboot")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/include/asm/virtext.h |   10 ----------
+ arch/x86/kernel/reboot.c       |   29 +++++++++--------------------
+ arch/x86/kvm/vmx/vmx.c         |    8 +++++---
+ 3 files changed, 14 insertions(+), 33 deletions(-)
+
+--- a/arch/x86/include/asm/virtext.h
++++ b/arch/x86/include/asm/virtext.h
+@@ -70,16 +70,6 @@ static inline void __cpu_emergency_vmxof
+               cpu_vmxoff();
+ }
+-/** Disable VMX if it is supported and enabled on the current CPU
+- */
+-static inline void cpu_emergency_vmxoff(void)
+-{
+-      if (cpu_has_vmx())
+-              __cpu_emergency_vmxoff();
+-}
+-
+-
+-
+ /*
+  * SVM functions:
+--- a/arch/x86/kernel/reboot.c
++++ b/arch/x86/kernel/reboot.c
+@@ -787,13 +787,7 @@ void machine_crash_shutdown(struct pt_re
+ }
+ #endif
+-/*
+- * This is used to VMCLEAR all VMCSs loaded on the
+- * processor. And when loading kvm_intel module, the
+- * callback function pointer will be assigned.
+- *
+- * protected by rcu.
+- */
++/* RCU-protected callback to disable virtualization prior to reboot. */
+ static cpu_emergency_virt_cb __rcu *cpu_emergency_virt_callback;
+ void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback)
+@@ -815,17 +809,6 @@ void cpu_emergency_unregister_virt_callb
+ }
+ EXPORT_SYMBOL_GPL(cpu_emergency_unregister_virt_callback);
+-static inline void cpu_crash_vmclear_loaded_vmcss(void)
+-{
+-      cpu_emergency_virt_cb *callback;
+-
+-      rcu_read_lock();
+-      callback = rcu_dereference(cpu_emergency_virt_callback);
+-      if (callback)
+-              callback();
+-      rcu_read_unlock();
+-}
+-
+ /* This is the CPU performing the emergency shutdown work. */
+ int crashing_cpu = -1;
+@@ -836,9 +819,15 @@ int crashing_cpu = -1;
+  */
+ void cpu_emergency_disable_virtualization(void)
+ {
+-      cpu_crash_vmclear_loaded_vmcss();
++      cpu_emergency_virt_cb *callback;
++
++      rcu_read_lock();
++      callback = rcu_dereference(cpu_emergency_virt_callback);
++      if (callback)
++              callback();
++      rcu_read_unlock();
+-      cpu_emergency_vmxoff();
++      /* KVM_AMD doesn't yet utilize the common callback. */
+       cpu_emergency_svm_disable();
+ }
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -707,7 +707,7 @@ static int vmx_set_guest_uret_msr(struct
+       return ret;
+ }
+-static void crash_vmclear_local_loaded_vmcss(void)
++static void vmx_emergency_disable(void)
+ {
+       int cpu = raw_smp_processor_id();
+       struct loaded_vmcs *v;
+@@ -715,6 +715,8 @@ static void crash_vmclear_local_loaded_v
+       list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
+                           loaded_vmcss_on_cpu_link)
+               vmcs_clear(v->vmcs);
++
++      __cpu_emergency_vmxoff();
+ }
+ static void __loaded_vmcs_clear(void *arg)
+@@ -8602,7 +8604,7 @@ static void __vmx_exit(void)
+ {
+       allow_smaller_maxphyaddr = false;
+-      cpu_emergency_unregister_virt_callback(crash_vmclear_local_loaded_vmcss);
++      cpu_emergency_unregister_virt_callback(vmx_emergency_disable);
+       vmx_cleanup_l1d_flush();
+ }
+@@ -8676,7 +8678,7 @@ static int __init vmx_init(void)
+               pi_init_cpu(cpu);
+       }
+-      cpu_emergency_register_virt_callback(crash_vmclear_local_loaded_vmcss);
++      cpu_emergency_register_virt_callback(vmx_emergency_disable);
+       vmx_check_vmcs12_offsets();