--- /dev/null
+From 767e22001dfce64cc03b7def1562338591ab6031 Mon Sep 17 00:00:00 2001
+From: Nicolin Chen <nicolinc@nvidia.com>
+Date: Mon, 7 Apr 2025 13:19:08 -0700
+Subject: iommu/tegra241-cmdqv: Fix warnings due to dmam_free_coherent()
+
+From: Nicolin Chen <nicolinc@nvidia.com>
+
+commit 767e22001dfce64cc03b7def1562338591ab6031 upstream.
+
+Two WARNINGs are observed when SMMU driver rolls back upon failure:
+ arm-smmu-v3.9.auto: Failed to register iommu
+ arm-smmu-v3.9.auto: probe with driver arm-smmu-v3 failed with error -22
+ ------------[ cut here ]------------
+ WARNING: CPU: 5 PID: 1 at kernel/dma/mapping.c:74 dmam_free_coherent+0xc0/0xd8
+ Call trace:
+ dmam_free_coherent+0xc0/0xd8 (P)
+ tegra241_vintf_free_lvcmdq+0x74/0x188
+ tegra241_cmdqv_remove_vintf+0x60/0x148
+ tegra241_cmdqv_remove+0x48/0xc8
+ arm_smmu_impl_remove+0x28/0x60
+ devm_action_release+0x1c/0x40
+ ------------[ cut here ]------------
+ 128 pages are still in use!
+ WARNING: CPU: 16 PID: 1 at mm/page_alloc.c:6902 free_contig_range+0x18c/0x1c8
+ Call trace:
+ free_contig_range+0x18c/0x1c8 (P)
+ cma_release+0x154/0x2f0
+ dma_free_contiguous+0x38/0xa0
+ dma_direct_free+0x10c/0x248
+ dma_free_attrs+0x100/0x290
+ dmam_free_coherent+0x78/0xd8
+ tegra241_vintf_free_lvcmdq+0x74/0x160
+ tegra241_cmdqv_remove+0x98/0x198
+ arm_smmu_impl_remove+0x28/0x60
+ devm_action_release+0x1c/0x40
+
+This is because the LVCMDQ queue memory is managed by devres, while
+dmam_free_coherent() is called in the context of devm_action_release().
+
+Jason pointed out that "arm_smmu_impl_probe() has mis-ordered the devres
+callbacks if ops->device_remove() is going to be manually freeing things
+that probe allocated":
+https://lore.kernel.org/linux-iommu/20250407174408.GB1722458@nvidia.com/
+
+In fact, tegra241_cmdqv_init_structures() only allocates memory resources,
+which means any failure that it generates would be similar to -ENOMEM, so
+there is no point in having that "falling back to standard SMMU" routine,
+as the standard SMMU would likely fail to allocate memory too.
+
+Remove the unwind part in tegra241_cmdqv_init_structures(), and return a
+proper error code to ask the SMMU driver to call tegra241_cmdqv_remove()
+via impl_ops->device_remove(). Then, drop tegra241_vcmdq_free_smmu_cmdq()
+since devres will take care of that queue memory.
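+
+As a rough sketch of the resulting ownership model (illustrative only, not
+the exact driver code): the queue memory comes from dmam_alloc_coherent(),
+so devres itself releases it when the device is unbound, and no explicit
+free is issued on the remove path:
+
+    /* devres owns this allocation and frees it on unbind */
+    q->base = dmam_alloc_coherent(smmu->dev, qsz, &q->base_dma, GFP_KERNEL);
+    if (!q->base)
+        return -ENOMEM;
+    /* no dmam_free_coherent() anywhere on the remove path anymore */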
+
+Fixes: 483e0bd8883a ("iommu/tegra241-cmdqv: Do not allocate vcmdq until dma_set_mask_and_coherent")
+Cc: stable@vger.kernel.org
+Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
+Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
+Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
+Link: https://lore.kernel.org/r/20250407201908.172225-1-nicolinc@nvidia.com
+Signed-off-by: Joerg Roedel <jroedel@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ .../iommu/arm/arm-smmu-v3/tegra241-cmdqv.c | 32 +++----------------
+ 1 file changed, 5 insertions(+), 27 deletions(-)
+
+diff --git a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c
+index d525ab43a4ae..dd7d030d2e89 100644
+--- a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c
++++ b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c
+@@ -487,17 +487,6 @@ static int tegra241_cmdqv_hw_reset(struct arm_smmu_device *smmu)
+
+ /* VCMDQ Resource Helpers */
+
+-static void tegra241_vcmdq_free_smmu_cmdq(struct tegra241_vcmdq *vcmdq)
+-{
+- struct arm_smmu_queue *q = &vcmdq->cmdq.q;
+- size_t nents = 1 << q->llq.max_n_shift;
+- size_t qsz = nents << CMDQ_ENT_SZ_SHIFT;
+-
+- if (!q->base)
+- return;
+- dmam_free_coherent(vcmdq->cmdqv->smmu.dev, qsz, q->base, q->base_dma);
+-}
+-
+ static int tegra241_vcmdq_alloc_smmu_cmdq(struct tegra241_vcmdq *vcmdq)
+ {
+ struct arm_smmu_device *smmu = &vcmdq->cmdqv->smmu;
+@@ -560,7 +549,8 @@ static void tegra241_vintf_free_lvcmdq(struct tegra241_vintf *vintf, u16 lidx)
+ struct tegra241_vcmdq *vcmdq = vintf->lvcmdqs[lidx];
+ char header[64];
+
+- tegra241_vcmdq_free_smmu_cmdq(vcmdq);
++ /* Note that the lvcmdq queue memory space is managed by devres */
++
+ tegra241_vintf_deinit_lvcmdq(vintf, lidx);
+
+ dev_dbg(vintf->cmdqv->dev,
+@@ -768,13 +758,13 @@ static int tegra241_cmdqv_init_structures(struct arm_smmu_device *smmu)
+
+ vintf = kzalloc(sizeof(*vintf), GFP_KERNEL);
+ if (!vintf)
+- goto out_fallback;
++ return -ENOMEM;
+
+ /* Init VINTF0 for in-kernel use */
+ ret = tegra241_cmdqv_init_vintf(cmdqv, 0, vintf);
+ if (ret) {
+ dev_err(cmdqv->dev, "failed to init vintf0: %d\n", ret);
+- goto free_vintf;
++ return ret;
+ }
+
+ /* Preallocate logical VCMDQs to VINTF0 */
+@@ -783,24 +773,12 @@ static int tegra241_cmdqv_init_structures(struct arm_smmu_device *smmu)
+
+ vcmdq = tegra241_vintf_alloc_lvcmdq(vintf, lidx);
+ if (IS_ERR(vcmdq))
+- goto free_lvcmdq;
++ return PTR_ERR(vcmdq);
+ }
+
+ /* Now, we are ready to run all the impl ops */
+ smmu->impl_ops = &tegra241_cmdqv_impl_ops;
+ return 0;
+-
+-free_lvcmdq:
+- for (lidx--; lidx >= 0; lidx--)
+- tegra241_vintf_free_lvcmdq(vintf, lidx);
+- tegra241_cmdqv_deinit_vintf(cmdqv, vintf->idx);
+-free_vintf:
+- kfree(vintf);
+-out_fallback:
+- dev_info(smmu->impl_dev, "Falling back to standard SMMU CMDQ\n");
+- smmu->options &= ~ARM_SMMU_OPT_TEGRA241_CMDQV;
+- tegra241_cmdqv_remove(smmu);
+- return 0;
+ }
+
+ #ifdef CONFIG_IOMMU_DEBUGFS
+--
+2.49.0
+
--- /dev/null
+From 688124cc541f60d26a7547f45637b23dada4e527 Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <seanjc@google.com>
+Date: Wed, 19 Mar 2025 10:21:00 +0800
+Subject: iommu/vt-d: Don't clobber posted vCPU IRTE when host IRQ affinity changes
+
+From: Sean Christopherson <seanjc@google.com>
+
+commit 688124cc541f60d26a7547f45637b23dada4e527 upstream.
+
+Don't overwrite an IRTE that is posting IRQs to a vCPU with a posted MSI
+entry if the host IRQ affinity happens to change. If/when the IRTE is
+reverted back to "host mode", it will be reconfigured as a posted MSI or
+remapped entry as appropriate.
+
+Drop the "mode" field, which doesn't differentiate between posted MSIs and
+posted vCPUs, in favor of a dedicated posted_vcpu flag. Note! The two
+posted_{msi,vcpu} flags are intentionally not mutually exclusive; an IRTE
+can transition between posted MSI and posted vCPU.
+
+Fixes: ed1e48ea4370 ("iommu/vt-d: Enable posted mode for device MSIs")
+Cc: stable@vger.kernel.org
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: Jacob Pan <jacob.jun.pan@linux.intel.com>
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Link: https://lore.kernel.org/r/20250315025135.2365846-3-seanjc@google.com
+Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
+Signed-off-by: Joerg Roedel <jroedel@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/iommu/intel/irq_remapping.c | 25 +++++++++++++++----------
+ 1 file changed, 15 insertions(+), 10 deletions(-)
+
+--- a/drivers/iommu/intel/irq_remapping.c
++++ b/drivers/iommu/intel/irq_remapping.c
+@@ -26,11 +26,6 @@
+ #include "../iommu-pages.h"
+ #include "cap_audit.h"
+
+-enum irq_mode {
+- IRQ_REMAPPING,
+- IRQ_POSTING,
+-};
+-
+ struct ioapic_scope {
+ struct intel_iommu *iommu;
+ unsigned int id;
+@@ -50,8 +45,8 @@ struct irq_2_iommu {
+ u16 irte_index;
+ u16 sub_handle;
+ u8 irte_mask;
+- enum irq_mode mode;
+ bool posted_msi;
++ bool posted_vcpu;
+ };
+
+ struct intel_ir_data {
+@@ -139,7 +134,6 @@ static int alloc_irte(struct intel_iommu
+ irq_iommu->irte_index = index;
+ irq_iommu->sub_handle = 0;
+ irq_iommu->irte_mask = mask;
+- irq_iommu->mode = IRQ_REMAPPING;
+ }
+ raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags);
+
+@@ -194,8 +188,6 @@ static int modify_irte(struct irq_2_iomm
+
+ rc = qi_flush_iec(iommu, index, 0);
+
+- /* Update iommu mode according to the IRTE mode */
+- irq_iommu->mode = irte->pst ? IRQ_POSTING : IRQ_REMAPPING;
+ raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags);
+
+ return rc;
+@@ -1177,9 +1169,18 @@ static void __intel_ir_reconfigure_irte(
+ {
+ struct intel_ir_data *ir_data = irqd->chip_data;
+
++ /*
++ * Don't modify IRTEs for IRQs that are being posted to vCPUs if the
++ * host CPU affinity changes.
++ */
++ if (ir_data->irq_2_iommu.posted_vcpu && !force_host)
++ return;
++
++ ir_data->irq_2_iommu.posted_vcpu = false;
++
+ if (ir_data->irq_2_iommu.posted_msi)
+ intel_ir_reconfigure_irte_posted(irqd);
+- else if (force_host || ir_data->irq_2_iommu.mode == IRQ_REMAPPING)
++ else
+ modify_irte(&ir_data->irq_2_iommu, &ir_data->irte_entry);
+ }
+
+@@ -1274,6 +1275,7 @@ static int intel_ir_set_vcpu_affinity(st
+ irte_pi.pda_h = (vcpu_pi_info->pi_desc_addr >> 32) &
+ ~(-1UL << PDA_HIGH_BIT);
+
++ ir_data->irq_2_iommu.posted_vcpu = true;
+ modify_irte(&ir_data->irq_2_iommu, &irte_pi);
+ }
+
+@@ -1501,6 +1503,9 @@ static void intel_irq_remapping_deactiva
+ struct intel_ir_data *data = irq_data->chip_data;
+ struct irte entry;
+
++ WARN_ON_ONCE(data->irq_2_iommu.posted_vcpu);
++ data->irq_2_iommu.posted_vcpu = false;
++
+ memset(&entry, 0, sizeof(entry));
+ modify_irte(&data->irq_2_iommu, &entry);
+ }
--- /dev/null
+From 93ae6e68b6d6b62d92b3a89d1c253d4a1721a1d3 Mon Sep 17 00:00:00 2001
+From: Lu Baolu <baolu.lu@linux.intel.com>
+Date: Wed, 19 Mar 2025 10:21:01 +0800
+Subject: iommu/vt-d: Fix possible circular locking dependency
+
+From: Lu Baolu <baolu.lu@linux.intel.com>
+
+commit 93ae6e68b6d6b62d92b3a89d1c253d4a1721a1d3 upstream.
+
+We have recently seen reports of lockdep circular lock dependency warnings
+on platforms like Skylake and Kabylake:
+
+ ======================================================
+ WARNING: possible circular locking dependency detected
+ 6.14.0-rc6-CI_DRM_16276-gca2c04fe76e8+ #1 Not tainted
+ ------------------------------------------------------
+ swapper/0/1 is trying to acquire lock:
+ ffffffff8360ee48 (iommu_probe_device_lock){+.+.}-{3:3},
+ at: iommu_probe_device+0x1d/0x70
+
+ but task is already holding lock:
+ ffff888102c7efa8 (&device->physical_node_lock){+.+.}-{3:3},
+ at: intel_iommu_init+0xe75/0x11f0
+
+ which lock already depends on the new lock.
+
+ the existing dependency chain (in reverse order) is:
+
+ -> #6 (&device->physical_node_lock){+.+.}-{3:3}:
+ __mutex_lock+0xb4/0xe40
+ mutex_lock_nested+0x1b/0x30
+ intel_iommu_init+0xe75/0x11f0
+ pci_iommu_init+0x13/0x70
+ do_one_initcall+0x62/0x3f0
+ kernel_init_freeable+0x3da/0x6a0
+ kernel_init+0x1b/0x200
+ ret_from_fork+0x44/0x70
+ ret_from_fork_asm+0x1a/0x30
+
+ -> #5 (dmar_global_lock){++++}-{3:3}:
+ down_read+0x43/0x1d0
+ enable_drhd_fault_handling+0x21/0x110
+ cpuhp_invoke_callback+0x4c6/0x870
+ cpuhp_issue_call+0xbf/0x1f0
+ __cpuhp_setup_state_cpuslocked+0x111/0x320
+ __cpuhp_setup_state+0xb0/0x220
+ irq_remap_enable_fault_handling+0x3f/0xa0
+ apic_intr_mode_init+0x5c/0x110
+ x86_late_time_init+0x24/0x40
+ start_kernel+0x895/0xbd0
+ x86_64_start_reservations+0x18/0x30
+ x86_64_start_kernel+0xbf/0x110
+ common_startup_64+0x13e/0x141
+
+ -> #4 (cpuhp_state_mutex){+.+.}-{3:3}:
+ __mutex_lock+0xb4/0xe40
+ mutex_lock_nested+0x1b/0x30
+ __cpuhp_setup_state_cpuslocked+0x67/0x320
+ __cpuhp_setup_state+0xb0/0x220
+ page_alloc_init_cpuhp+0x2d/0x60
+ mm_core_init+0x18/0x2c0
+ start_kernel+0x576/0xbd0
+ x86_64_start_reservations+0x18/0x30
+ x86_64_start_kernel+0xbf/0x110
+ common_startup_64+0x13e/0x141
+
+ -> #3 (cpu_hotplug_lock){++++}-{0:0}:
+ __cpuhp_state_add_instance+0x4f/0x220
+ iova_domain_init_rcaches+0x214/0x280
+ iommu_setup_dma_ops+0x1a4/0x710
+ iommu_device_register+0x17d/0x260
+ intel_iommu_init+0xda4/0x11f0
+ pci_iommu_init+0x13/0x70
+ do_one_initcall+0x62/0x3f0
+ kernel_init_freeable+0x3da/0x6a0
+ kernel_init+0x1b/0x200
+ ret_from_fork+0x44/0x70
+ ret_from_fork_asm+0x1a/0x30
+
+ -> #2 (&domain->iova_cookie->mutex){+.+.}-{3:3}:
+ __mutex_lock+0xb4/0xe40
+ mutex_lock_nested+0x1b/0x30
+ iommu_setup_dma_ops+0x16b/0x710
+ iommu_device_register+0x17d/0x260
+ intel_iommu_init+0xda4/0x11f0
+ pci_iommu_init+0x13/0x70
+ do_one_initcall+0x62/0x3f0
+ kernel_init_freeable+0x3da/0x6a0
+ kernel_init+0x1b/0x200
+ ret_from_fork+0x44/0x70
+ ret_from_fork_asm+0x1a/0x30
+
+ -> #1 (&group->mutex){+.+.}-{3:3}:
+ __mutex_lock+0xb4/0xe40
+ mutex_lock_nested+0x1b/0x30
+ __iommu_probe_device+0x24c/0x4e0
+ probe_iommu_group+0x2b/0x50
+ bus_for_each_dev+0x7d/0xe0
+ iommu_device_register+0xe1/0x260
+ intel_iommu_init+0xda4/0x11f0
+ pci_iommu_init+0x13/0x70
+ do_one_initcall+0x62/0x3f0
+ kernel_init_freeable+0x3da/0x6a0
+ kernel_init+0x1b/0x200
+ ret_from_fork+0x44/0x70
+ ret_from_fork_asm+0x1a/0x30
+
+ -> #0 (iommu_probe_device_lock){+.+.}-{3:3}:
+ __lock_acquire+0x1637/0x2810
+ lock_acquire+0xc9/0x300
+ __mutex_lock+0xb4/0xe40
+ mutex_lock_nested+0x1b/0x30
+ iommu_probe_device+0x1d/0x70
+ intel_iommu_init+0xe90/0x11f0
+ pci_iommu_init+0x13/0x70
+ do_one_initcall+0x62/0x3f0
+ kernel_init_freeable+0x3da/0x6a0
+ kernel_init+0x1b/0x200
+ ret_from_fork+0x44/0x70
+ ret_from_fork_asm+0x1a/0x30
+
+ other info that might help us debug this:
+
+ Chain exists of:
+ iommu_probe_device_lock --> dmar_global_lock -->
+ &device->physical_node_lock
+
+ Possible unsafe locking scenario:
+
+ CPU0 CPU1
+ ---- ----
+ lock(&device->physical_node_lock);
+ lock(dmar_global_lock);
+ lock(&device->physical_node_lock);
+ lock(iommu_probe_device_lock);
+
+ *** DEADLOCK ***
+
+This driver uses a global lock to protect the list of enumerated DMA
+remapping units. It is necessary due to the driver's support for dynamic
+addition and removal of remapping units at runtime.
+
+Two distinct code paths require iteration over this remapping unit list:
+
+- Device registration and probing: the driver iterates the list to
+ register each remapping unit with the upper layer IOMMU framework
+ and subsequently probe the devices managed by that unit.
+- Global configuration: Upper layer components may also iterate the list
+ to apply configuration changes.
+
+The lock acquisition order between these two code paths was reversed. This
+caused lockdep warnings, indicating a risk of deadlock. Fix this warning
+by releasing the global lock before invoking upper layer interfaces for
+device registration.
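+
+The resulting pattern in probe_acpi_namespace_devices() is roughly the
+following (condensed sketch of the hunk below; the global lock is already
+read-held when this code runs):
+
+    up_read(&dmar_global_lock);
+    mutex_lock(&adev->physical_node_lock);
+    /* device registration may take iommu_probe_device_lock here */
+    mutex_unlock(&adev->physical_node_lock);
+    down_read(&dmar_global_lock);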
+
+Fixes: b150654f74bf ("iommu/vt-d: Fix suspicious RCU usage")
+Closes: https://lore.kernel.org/linux-iommu/SJ1PR11MB612953431F94F18C954C4A9CB9D32@SJ1PR11MB6129.namprd11.prod.outlook.com/
+Tested-by: Chaitanya Kumar Borah <chaitanya.kumar.borah@intel.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
+Link: https://lore.kernel.org/r/20250317035714.1041549-1-baolu.lu@linux.intel.com
+Signed-off-by: Joerg Roedel <jroedel@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/iommu/intel/iommu.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/drivers/iommu/intel/iommu.c
++++ b/drivers/iommu/intel/iommu.c
+@@ -3174,6 +3174,7 @@ static int __init probe_acpi_namespace_d
+ if (dev->bus != &acpi_bus_type)
+ continue;
+
++ up_read(&dmar_global_lock);
+ adev = to_acpi_device(dev);
+ mutex_lock(&adev->physical_node_lock);
+ list_for_each_entry(pn,
+@@ -3183,6 +3184,7 @@ static int __init probe_acpi_namespace_d
+ break;
+ }
+ mutex_unlock(&adev->physical_node_lock);
++ down_read(&dmar_global_lock);
+
+ if (ret)
+ return ret;
--- /dev/null
+From 2454823e97a63d85a6b215905f71e5a06324eab7 Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <seanjc@google.com>
+Date: Wed, 19 Mar 2025 10:20:59 +0800
+Subject: iommu/vt-d: Put IRTE back into posted MSI mode if vCPU posting is disabled
+
+From: Sean Christopherson <seanjc@google.com>
+
+commit 2454823e97a63d85a6b215905f71e5a06324eab7 upstream.
+
+Add a helper to take care of reconfiguring an IRTE to deliver IRQs to the
+host, i.e. not to a vCPU, and use the helper when an IRTE's vCPU affinity
+is nullified, i.e. when KVM puts an IRTE back into "host" mode. Because
+posted MSIs use an ephemeral IRTE, using modify_irte() puts the IRTE into
+full remapped mode, i.e. unintentionally disables posted MSIs on the IRQ.
+
+Fixes: ed1e48ea4370 ("iommu/vt-d: Enable posted mode for device MSIs")
+Cc: stable@vger.kernel.org
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: Jacob Pan <jacob.jun.pan@linux.intel.com>
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Link: https://lore.kernel.org/r/20250315025135.2365846-2-seanjc@google.com
+Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
+Signed-off-by: Joerg Roedel <jroedel@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/iommu/intel/irq_remapping.c | 19 +++++++++++++------
+ 1 file changed, 13 insertions(+), 6 deletions(-)
+
+--- a/drivers/iommu/intel/irq_remapping.c
++++ b/drivers/iommu/intel/irq_remapping.c
+@@ -1173,7 +1173,17 @@ static void intel_ir_reconfigure_irte_po
+ static inline void intel_ir_reconfigure_irte_posted(struct irq_data *irqd) {}
+ #endif
+
+-static void intel_ir_reconfigure_irte(struct irq_data *irqd, bool force)
++static void __intel_ir_reconfigure_irte(struct irq_data *irqd, bool force_host)
++{
++ struct intel_ir_data *ir_data = irqd->chip_data;
++
++ if (ir_data->irq_2_iommu.posted_msi)
++ intel_ir_reconfigure_irte_posted(irqd);
++ else if (force_host || ir_data->irq_2_iommu.mode == IRQ_REMAPPING)
++ modify_irte(&ir_data->irq_2_iommu, &ir_data->irte_entry);
++}
++
++static void intel_ir_reconfigure_irte(struct irq_data *irqd, bool force_host)
+ {
+ struct intel_ir_data *ir_data = irqd->chip_data;
+ struct irte *irte = &ir_data->irte_entry;
+@@ -1186,10 +1196,7 @@ static void intel_ir_reconfigure_irte(st
+ irte->vector = cfg->vector;
+ irte->dest_id = IRTE_DEST(cfg->dest_apicid);
+
+- if (ir_data->irq_2_iommu.posted_msi)
+- intel_ir_reconfigure_irte_posted(irqd);
+- else if (force || ir_data->irq_2_iommu.mode == IRQ_REMAPPING)
+- modify_irte(&ir_data->irq_2_iommu, irte);
++ __intel_ir_reconfigure_irte(irqd, force_host);
+ }
+
+ /*
+@@ -1244,7 +1251,7 @@ static int intel_ir_set_vcpu_affinity(st
+
+ /* stop posting interrupts, back to the default mode */
+ if (!vcpu_pi_info) {
+- modify_irte(&ir_data->irq_2_iommu, &ir_data->irte_entry);
++ __intel_ir_reconfigure_irte(data, true);
+ } else {
+ struct irte irte_pi;
+
--- /dev/null
+From 548183ea388c12b6d76d6982f3d72df3887af0da Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <seanjc@google.com>
+Date: Thu, 10 Apr 2025 15:32:46 +0800
+Subject: iommu/vt-d: Wire up irq_ack() to irq_move_irq() for posted MSIs
+
+From: Sean Christopherson <seanjc@google.com>
+
+commit 548183ea388c12b6d76d6982f3d72df3887af0da upstream.
+
+Set the posted MSI irq_chip's irq_ack() hook to irq_move_irq() instead of
+a dummy/empty callback so that posted MSIs process pending changes to the
+IRQ's SMP affinity. Failure to honor a pending set-affinity results in
+userspace being unable to change the effective affinity of the IRQ, as
+IRQD_SETAFFINITY_PENDING is never cleared and so irq_set_affinity_locked()
+always defers moving the IRQ.
+
+The issue is most easily reproducible by setting /proc/irq/xx/smp_affinity
+multiple times in quick succession, as only the first update is likely to
+be handled in process context.
+
+Fixes: ed1e48ea4370 ("iommu/vt-d: Enable posted mode for device MSIs")
+Cc: Robert Lippert <rlippert@google.com>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Reported-by: Wentao Yang <wentaoyang@google.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
+Link: https://lore.kernel.org/r/20250321194249.1217961-1-seanjc@google.com
+Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
+Signed-off-by: Joerg Roedel <jroedel@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/iommu/intel/irq_remapping.c | 29 +++++++++++++++--------------
+ 1 file changed, 15 insertions(+), 14 deletions(-)
+
+--- a/drivers/iommu/intel/irq_remapping.c
++++ b/drivers/iommu/intel/irq_remapping.c
+@@ -1291,43 +1291,44 @@ static struct irq_chip intel_ir_chip = {
+ };
+
+ /*
+- * With posted MSIs, all vectors are multiplexed into a single notification
+- * vector. Devices MSIs are then dispatched in a demux loop where
+- * EOIs can be coalesced as well.
++ * With posted MSIs, the MSI vectors are multiplexed into a single notification
++ * vector, and only the notification vector is sent to the APIC IRR. Device
++ * MSIs are then dispatched in a demux loop that harvests the MSIs from the
++ * CPU's Posted Interrupt Request bitmap. I.e. Posted MSIs never get sent to
++ * the APIC IRR, and thus do not need an EOI. The notification handler instead
++ * performs a single EOI after processing the PIR.
+ *
+- * "INTEL-IR-POST" IRQ chip does not do EOI on ACK, thus the dummy irq_ack()
+- * function. Instead EOI is performed by the posted interrupt notification
+- * handler.
++ * Note! Pending SMP/CPU affinity changes, which are per MSI, must still be
++ * honored, only the APIC EOI is omitted.
+ *
+ * For the example below, 3 MSIs are coalesced into one CPU notification. Only
+- * one apic_eoi() is needed.
++ * one apic_eoi() is needed, but each MSI needs to process pending changes to
++ * its CPU affinity.
+ *
+ * __sysvec_posted_msi_notification()
+ * irq_enter();
+ * handle_edge_irq()
+ * irq_chip_ack_parent()
+- * dummy(); // No EOI
++ * irq_move_irq(); // No EOI
+ * handle_irq_event()
+ * driver_handler()
+ * handle_edge_irq()
+ * irq_chip_ack_parent()
+- * dummy(); // No EOI
++ * irq_move_irq(); // No EOI
+ * handle_irq_event()
+ * driver_handler()
+ * handle_edge_irq()
+ * irq_chip_ack_parent()
+- * dummy(); // No EOI
++ * irq_move_irq(); // No EOI
+ * handle_irq_event()
+ * driver_handler()
+ * apic_eoi()
+ * irq_exit()
++ *
+ */
+-
+-static void dummy_ack(struct irq_data *d) { }
+-
+ static struct irq_chip intel_ir_chip_post_msi = {
+ .name = "INTEL-IR-POST",
+- .irq_ack = dummy_ack,
++ .irq_ack = irq_move_irq,
+ .irq_set_affinity = intel_ir_set_affinity,
+ .irq_compose_msi_msg = intel_ir_compose_msi_msg,
+ .irq_set_vcpu_affinity = intel_ir_set_vcpu_affinity,
--- /dev/null
+From a05df03a88bc1088be8e9d958f208d6484691e43 Mon Sep 17 00:00:00 2001
+From: Nicolin Chen <nicolinc@nvidia.com>
+Date: Thu, 27 Feb 2025 12:07:29 -0800
+Subject: iommufd: Fix uninitialized rc in iommufd_access_rw()
+
+From: Nicolin Chen <nicolinc@nvidia.com>
+
+commit a05df03a88bc1088be8e9d958f208d6484691e43 upstream.
+
+Reported by smatch:
+drivers/iommu/iommufd/device.c:1392 iommufd_access_rw() error: uninitialized symbol 'rc'.
+
+Fixes: 8d40205f6093 ("iommufd: Add kAPI toward external drivers for kernel access")
+Link: https://patch.msgid.link/r/20250227200729.85030-1-nicolinc@nvidia.com
+Cc: stable@vger.kernel.org
+Reported-by: kernel test robot <lkp@intel.com>
+Reported-by: Dan Carpenter <error27@gmail.com>
+Closes: https://lore.kernel.org/r/202502271339.a2nWr9UA-lkp@intel.com/
+[nicolinc: can't find an original report but only in "old smatch warnings"]
+Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
+Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/iommu/iommufd/device.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/iommu/iommufd/device.c
++++ b/drivers/iommu/iommufd/device.c
+@@ -1127,7 +1127,7 @@ int iommufd_access_rw(struct iommufd_acc
+ struct io_pagetable *iopt;
+ struct iopt_area *area;
+ unsigned long last_iova;
+- int rc;
++ int rc = -EINVAL;
+
+ if (!length)
+ return -EINVAL;
--- /dev/null
+From c0ebbb3841e07c4493e6fe351698806b09a87a37 Mon Sep 17 00:00:00 2001
+From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+Date: Wed, 12 Mar 2025 10:10:13 -0400
+Subject: mm: add missing release barrier on PGDAT_RECLAIM_LOCKED unlock
+
+From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+
+commit c0ebbb3841e07c4493e6fe351698806b09a87a37 upstream.
+
+The PGDAT_RECLAIM_LOCKED bit is used to provide mutual exclusion of node
+reclaim for struct pglist_data using a single bit.
+
+It is "locked" with a test_and_set_bit (similarly to a try lock) which
+provides full ordering with respect to loads and stores done within
+__node_reclaim().
+
+It is "unlocked" with clear_bit(), which does not provide any ordering
+with respect to loads and stores done before clearing the bit.
+
+The lack of clear_bit() memory ordering with respect to stores within
+__node_reclaim() can cause a subsequent CPU to fail to observe stores from
+a prior node reclaim. This is not an issue in practice on TSO (e.g.
+x86), but it is an issue on weakly-ordered architectures (e.g. arm64).
+
+Fix this by using clear_bit_unlock rather than clear_bit to clear
+PGDAT_RECLAIM_LOCKED with a release memory ordering semantic.
+
+This provides stronger memory ordering (release rather than relaxed).
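+
+A simplified sketch of the resulting try-lock pattern in node_reclaim()
+(surrounding code omitted):
+
+    if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
+        return NODE_RECLAIM_NOSCAN;    /* another reclaimer holds the bit */
+
+    ret = __node_reclaim(pgdat, gfp_mask, order);
+    /* release: stores done in __node_reclaim() are ordered before the bit
+     * is observed clear by the next test_and_set_bit() */
+    clear_bit_unlock(PGDAT_RECLAIM_LOCKED, &pgdat->flags);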
+
+Link: https://lkml.kernel.org/r/20250312141014.129725-1-mathieu.desnoyers@efficios.com
+Fixes: d773ed6b856a ("mm: test and set zone reclaim lock before starting reclaim")
+Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
+Cc: Matthew Wilcox <willy@infradead.org>
+Cc: Alan Stern <stern@rowland.harvard.edu>
+Cc: Andrea Parri <parri.andrea@gmail.com>
+Cc: Will Deacon <will@kernel.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Boqun Feng <boqun.feng@gmail.com>
+Cc: Nicholas Piggin <npiggin@gmail.com>
+Cc: David Howells <dhowells@redhat.com>
+Cc: Jade Alglave <j.alglave@ucl.ac.uk>
+Cc: Luc Maranget <luc.maranget@inria.fr>
+Cc: "Paul E. McKenney" <paulmck@kernel.org>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/vmscan.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -7557,7 +7557,7 @@ int node_reclaim(struct pglist_data *pgd
+ return NODE_RECLAIM_NOSCAN;
+
+ ret = __node_reclaim(pgdat, gfp_mask, order);
+- clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
++ clear_bit_unlock(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
+
+ if (ret)
+ count_vm_event(PGSCAN_ZONE_RECLAIM_SUCCESS);
--- /dev/null
+From 3a06696305e757f652dd0dcf4dfa2272eda39434 Mon Sep 17 00:00:00 2001
+From: Usama Arif <usamaarif642@gmail.com>
+Date: Fri, 7 Feb 2025 13:20:32 -0800
+Subject: mm/damon/ops: have damon_get_folio return folio even for tail pages
+
+From: Usama Arif <usamaarif642@gmail.com>
+
+commit 3a06696305e757f652dd0dcf4dfa2272eda39434 upstream.
+
+Patch series "mm/damon/paddr: fix large folios access and schemes handling".
+
+The DAMON operations set for the physical address space, namely 'paddr',
+always treats tail pages as unaccessed. It can also apply a DAMOS action to
+a large folio multiple times within a single DAMOS regions walk. As a
+result, the monitoring output has poor quality and DAMOS works in
+unexpected ways when large folios are being used. Fix those.
+
+The patches were originally parts of Usama's hugepage_size DAMOS filter
+patch series[1]. The first fix was collected from there with a slight
+commit message change for the subject prefix. The second fix was re-written
+by SJ and posted as an RFC[2] before this series; it also got a slight
+commit message change for the subject prefix.
+
+[1] https://lore.kernel.org/20250203225604.44742-1-usamaarif642@gmail.com
+[2] https://lore.kernel.org/20250206231103.38298-1-sj@kernel.org
+
+
+This patch (of 2):
+
+This effectively adds support for large folios in damon for paddr, as
+damon_pa_mkold/young won't get a null folio from this function and won't
+ignore it, hence access will be checked and reported. This also means
+that larger folios will be considered for different DAMOS actions like
+pageout, prioritization and migration. As these DAMOS actions will
+consider larger folios, iterate through the region at folio_size and not
+PAGE_SIZE intervals. This should not have an effect on vaddr, as
+damon_young_pmd_entry considers pmd entries.
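+
+The common iteration pattern shared by the converted paddr handlers below
+can be sketched as:
+
+    addr = r->ar.start;
+    while (addr < r->ar.end) {
+        struct folio *folio = damon_get_folio(PHYS_PFN(addr));
+
+        if (!folio) {
+            addr += PAGE_SIZE;    /* no folio here, try the next page */
+            continue;
+        }
+        /* ... apply the DAMOS action to the whole folio ... */
+        addr += folio_size(folio);    /* step over head and tail pages */
+        folio_put(folio);
+    }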
+
+Link: https://lkml.kernel.org/r/20250207212033.45269-1-sj@kernel.org
+Link: https://lkml.kernel.org/r/20250207212033.45269-2-sj@kernel.org
+Fixes: a28397beb55b ("mm/damon: implement primitives for physical address space monitoring")
+Signed-off-by: Usama Arif <usamaarif642@gmail.com>
+Signed-off-by: SeongJae Park <sj@kernel.org>
+Reviewed-by: SeongJae Park <sj@kernel.org>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/damon/ops-common.c | 2 +-
+ mm/damon/paddr.c | 24 ++++++++++++++++++------
+ 2 files changed, 19 insertions(+), 7 deletions(-)
+
+--- a/mm/damon/ops-common.c
++++ b/mm/damon/ops-common.c
+@@ -24,7 +24,7 @@ struct folio *damon_get_folio(unsigned l
+ struct page *page = pfn_to_online_page(pfn);
+ struct folio *folio;
+
+- if (!page || PageTail(page))
++ if (!page)
+ return NULL;
+
+ folio = page_folio(page);
+--- a/mm/damon/paddr.c
++++ b/mm/damon/paddr.c
+@@ -264,11 +264,14 @@ static unsigned long damon_pa_pageout(st
+ damos_add_filter(s, filter);
+ }
+
+- for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) {
++ addr = r->ar.start;
++ while (addr < r->ar.end) {
+ struct folio *folio = damon_get_folio(PHYS_PFN(addr));
+
+- if (!folio)
++ if (!folio) {
++ addr += PAGE_SIZE;
+ continue;
++ }
+
+ if (damos_pa_filter_out(s, folio))
+ goto put_folio;
+@@ -282,6 +285,7 @@ static unsigned long damon_pa_pageout(st
+ else
+ list_add(&folio->lru, &folio_list);
+ put_folio:
++ addr += folio_size(folio);
+ folio_put(folio);
+ }
+ if (install_young_filter)
+@@ -296,11 +300,14 @@ static inline unsigned long damon_pa_mar
+ {
+ unsigned long addr, applied = 0;
+
+- for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) {
++ addr = r->ar.start;
++ while (addr < r->ar.end) {
+ struct folio *folio = damon_get_folio(PHYS_PFN(addr));
+
+- if (!folio)
++ if (!folio) {
++ addr += PAGE_SIZE;
+ continue;
++ }
+
+ if (damos_pa_filter_out(s, folio))
+ goto put_folio;
+@@ -311,6 +318,7 @@ static inline unsigned long damon_pa_mar
+ folio_deactivate(folio);
+ applied += folio_nr_pages(folio);
+ put_folio:
++ addr += folio_size(folio);
+ folio_put(folio);
+ }
+ return applied * PAGE_SIZE;
+@@ -454,11 +462,14 @@ static unsigned long damon_pa_migrate(st
+ unsigned long addr, applied;
+ LIST_HEAD(folio_list);
+
+- for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) {
++ addr = r->ar.start;
++ while (addr < r->ar.end) {
+ struct folio *folio = damon_get_folio(PHYS_PFN(addr));
+
+- if (!folio)
++ if (!folio) {
++ addr += PAGE_SIZE;
+ continue;
++ }
+
+ if (damos_pa_filter_out(s, folio))
+ goto put_folio;
+@@ -467,6 +478,7 @@ static unsigned long damon_pa_migrate(st
+ goto put_folio;
+ list_add(&folio->lru, &folio_list);
+ put_folio:
++ addr += folio_size(folio);
+ folio_put(folio);
+ }
+ applied = damon_pa_migrate_pages(&folio_list, s->target_nid);
--- /dev/null
+From 691ee97e1a9de0cdb3efb893c1f180e3f4a35e32 Mon Sep 17 00:00:00 2001
+From: Ryan Roberts <ryan.roberts@arm.com>
+Date: Mon, 3 Mar 2025 14:15:35 +0000
+Subject: mm: fix lazy mmu docs and usage
+
+From: Ryan Roberts <ryan.roberts@arm.com>
+
+commit 691ee97e1a9de0cdb3efb893c1f180e3f4a35e32 upstream.
+
+Patch series "Fix lazy mmu mode", v2.
+
+I'm planning to implement lazy mmu mode for arm64 to optimize vmalloc. As
+part of that, I will extend lazy mmu mode to cover kernel mappings in
+vmalloc table walkers. While lazy mmu mode is already used for kernel
+mappings in a few places, this will extend its use significantly.
+
+Having reviewed the existing lazy mmu implementations in powerpc, sparc
+and x86, it looks like there are a bunch of bugs, some of which may be
+more likely to trigger once I extend the use of lazy mmu. So this series
+attempts to clarify the requirements and fix all the bugs in advance of
+that series. See patch #1 commit log for all the details.
+
+
+This patch (of 5):
+
+The docs, implementations and use of arch_[enter|leave]_lazy_mmu_mode() are
+a bit of a mess (to put it politely). There are a number of issues
+related to nesting of lazy mmu regions and confusion over whether the
+task, when in a lazy mmu region, is preemptible or not. Fix all the
+issues relating to the core-mm. Follow up commits will fix the
+arch-specific implementations. Three arches implement lazy mmu: powerpc,
+sparc and x86.
+
+When arch_[enter|leave]_lazy_mmu_mode() was first introduced by commit
+6606c3e0da53 ("[PATCH] paravirt: lazy mmu mode hooks.patch"), it was
+expected that lazy mmu regions would never nest and that the appropriate
+page table lock(s) would be held while in the region, thus ensuring the
+region is non-preemptible. Additionally lazy mmu regions were only used
+during manipulation of user mappings.
+
+Commit 38e0edb15bd0 ("mm/apply_to_range: call pte function with lazy
+updates") started invoking the lazy mmu mode in apply_to_pte_range(),
+which is used for both user and kernel mappings. For kernel mappings the
+region is no longer protected by any lock so there is no longer any
+guarantee about non-preemptibility. Additionally, for RT configs, the
+holding the PTL only implies no CPU migration, it doesn't prevent
+preemption.
+
+Commit bcc6cc832573 ("mm: add default definition of set_ptes()") added
+arch_[enter|leave]_lazy_mmu_mode() to the default implementation of
+set_ptes(), used by x86. So after this commit, lazy mmu regions can be
+nested. Additionally commit 1a10a44dfc1d ("sparc64: implement the new
+page table range API") and commit 9fee28baa601 ("powerpc: implement the
+new page table range API") did the same for the sparc and powerpc
+set_ptes() overrides.
+
+powerpc couldn't deal with preemption, so it avoids it in commit b9ef323ea168
+("powerpc/64s: Disable preemption in hash lazy mmu mode"), which
+explicitly disables preemption for the whole region in its implementation.
+x86 can support preemption (or at least it could until it tried to add
+support for nesting; more on this below). Sparc looks to be totally broken
+in the face of preemption, as far as I can tell.
+
+powerpc can't deal with nesting, so it avoids it in commit 47b8def9358c
+("powerpc/mm: Avoid calling arch_enter/leave_lazy_mmu() in set_ptes"),
+which removes the lazy mmu calls from its implementation of set_ptes().
+x86 attempted to support nesting in commit 49147beb0ccb ("x86/xen: allow
+nesting of same lazy mode") but as far as I can tell, this breaks its
+support for preemption.
+
+In short, it's all a mess; the semantics for
+arch_[enter|leave]_lazy_mmu_mode() are not clearly defined and as a result
+the implementations all have different expectations, sticking plasters and
+bugs.
+
+arm64 is aiming to start using these hooks, so let's clean everything up
+before adding an arm64 implementation. Update the documentation to state
+that lazy mmu regions can never be nested, must not be entered from
+interrupt context, and that preemption may or may not be enabled for the
+duration of the region. And fix the generic implementation of set_ptes()
+to avoid nesting.
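+
+Under the clarified rules, batching becomes the caller's responsibility; a
+minimal illustrative usage sketch:
+
+    arch_enter_lazy_mmu_mode();         /* never nested, never in IRQ context */
+    set_ptes(mm, addr, ptep, pte, nr);  /* PTE writes may be batched */
+    arch_leave_lazy_mmu_mode();         /* batched updates are flushed here */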
+
+arch-specific fixes to conform to the new spec will follow this one.
+
+These issues were spotted by code review and I have no evidence of issues
+being reported in the wild.
+
+Link: https://lkml.kernel.org/r/20250303141542.3371656-1-ryan.roberts@arm.com
+Link: https://lkml.kernel.org/r/20250303141542.3371656-2-ryan.roberts@arm.com
+Fixes: bcc6cc832573 ("mm: add default definition of set_ptes()")
+Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
+Acked-by: David Hildenbrand <david@redhat.com>
+Acked-by: Juergen Gross <jgross@suse.com>
+Cc: Andreas Larsson <andreas@gaisler.com>
+Cc: Borislav Betkov <bp@alien8.de>
+Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: Catalin Marinas <catalin.marinas@arm.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: David S. Miller <davem@davemloft.net>
+Cc: "H. Peter Anvin" <hpa@zytor.com>
+Cc: Ingo Molnar <mingo@redhat.com>
+Cc: Juegren Gross <jgross@suse.com>
+Cc: Matthew Wilcow (Oracle) <willy@infradead.org>
+Cc: Thomas Gleinxer <tglx@linutronix.de>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/pgtable.h | 14 ++++++++------
+ 1 file changed, 8 insertions(+), 6 deletions(-)
+
+--- a/include/linux/pgtable.h
++++ b/include/linux/pgtable.h
+@@ -201,10 +201,14 @@ static inline int pmd_dirty(pmd_t pmd)
+ * hazard could result in the direct mode hypervisor case, since the actual
+ * write to the page tables may not yet have taken place, so reads though
+ * a raw PTE pointer after it has been modified are not guaranteed to be
+- * up to date. This mode can only be entered and left under the protection of
+- * the page table locks for all page tables which may be modified. In the UP
+- * case, this is required so that preemption is disabled, and in the SMP case,
+- * it must synchronize the delayed page table writes properly on other CPUs.
++ * up to date.
++ *
++ * In the general case, no lock is guaranteed to be held between entry and exit
++ * of the lazy mode. So the implementation must assume preemption may be enabled
++ * and cpu migration is possible; it must take steps to be robust against this.
++ * (In practice, for user PTE updates, the appropriate page table lock(s) are
++ * held, but for kernel PTE updates, no lock is held). Nesting is not permitted
++ * and the mode cannot be used in interrupt context.
+ */
+ #ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE
+ #define arch_enter_lazy_mmu_mode() do {} while (0)
+@@ -266,7 +270,6 @@ static inline void set_ptes(struct mm_st
+ {
+ page_table_check_ptes_set(mm, ptep, pte, nr);
+
+- arch_enter_lazy_mmu_mode();
+ for (;;) {
+ set_pte(ptep, pte);
+ if (--nr == 0)
+@@ -274,7 +277,6 @@ static inline void set_ptes(struct mm_st
+ ptep++;
+ pte = pte_next_pfn(pte);
+ }
+- arch_leave_lazy_mmu_mode();
+ }
+ #endif
+ #define set_pte_at(mm, addr, ptep, pte) set_ptes(mm, addr, ptep, pte, 1)
--- /dev/null
+From 1ca77ff1837249701053a7fcbdedabc41f4ae67c Mon Sep 17 00:00:00 2001
+From: Marc Herbert <Marc.Herbert@linux.intel.com>
+Date: Wed, 19 Mar 2025 06:00:30 +0000
+Subject: mm/hugetlb: move hugetlb_sysctl_init() to the __init section
+
+From: Marc Herbert <Marc.Herbert@linux.intel.com>
+
+commit 1ca77ff1837249701053a7fcbdedabc41f4ae67c upstream.
+
+hugetlb_sysctl_init() is only invoked once by an __init function and is
+merely a wrapper around another __init function, so there is no reason to
+keep it.
+
+Fixes the following warning when toning down some GCC inline options:
+
+ WARNING: modpost: vmlinux: section mismatch in reference:
+ hugetlb_sysctl_init+0x1b (section: .text) ->
+ __register_sysctl_init (section: .init.text)
+
+Link: https://lkml.kernel.org/r/20250319060041.2737320-1-marc.herbert@linux.intel.com
+Signed-off-by: Marc Herbert <Marc.Herbert@linux.intel.com>
+Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
+Reviewed-by: Muchun Song <muchun.song@linux.dev>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/hugetlb.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/mm/hugetlb.c
++++ b/mm/hugetlb.c
+@@ -4863,7 +4863,7 @@ static struct ctl_table hugetlb_table[]
+ },
+ };
+
+-static void hugetlb_sysctl_init(void)
++static void __init hugetlb_sysctl_init(void)
+ {
+ register_sysctl_init("vm", hugetlb_table);
+ }
--- /dev/null
+From aaf99ac2ceb7c974f758a635723eeaf48596388e Mon Sep 17 00:00:00 2001
+From: Shuai Xue <xueshuai@linux.alibaba.com>
+Date: Wed, 12 Mar 2025 19:28:51 +0800
+Subject: mm/hwpoison: do not send SIGBUS to processes with recovered clean pages
+
+From: Shuai Xue <xueshuai@linux.alibaba.com>
+
+commit aaf99ac2ceb7c974f758a635723eeaf48596388e upstream.
+
+When an uncorrected memory error is consumed, there is a race between the
+CMCI from the memory controller reporting an uncorrected error with a UCNA
+signature, and the core reporting an SRAR-signature machine check when
+the data is about to be consumed.
+
+- Background: why *UN*corrected errors are tied to *C*MCI on Intel platforms [1]
+
+Prior to Icelake, memory controllers reported patrol scrub events that
+detected a previously unseen uncorrected error in memory by signaling a
+broadcast machine check with an SRAO (Software Recoverable Action
+Optional) signature in the machine check bank. This was overkill because
+it's not an urgent problem: no core is on the verge of consuming that
+bad data. It was also found that multiple SRAO UCEs may cause nested MCE
+interrupts and finally become an IERR.
+
+Hence, Intel downgraded the machine check bank signature of patrol scrub
+from SRAO to UCNA (Uncorrected, No Action required) and changed the signal
+to #CMCI. Just to add to the confusion, Linux does take an action (in
+uc_decode_notifier()) to try to offline the page despite the UC*NA*
+signature name.
+
+- Background: why #CMCI and #MCE race when poison is being consumed on Intel platforms [1]
+
+Having decided that CMCI/UCNA is the best action for patrol scrub errors,
+the memory controller uses it for reads too. But the memory controller is
+executing asynchronously from the core, and can't tell the difference
+between a "real" read and a speculative read. So it will do CMCI/UCNA if
+an error is found in any read.
+
+Thus:
+
+1) The core is clever, thinks address A is needed soon, and issues a speculative read.
+2) The core finds it is going to use address A soon after sending the read request.
+3) The CMCI from the memory controller is in a race with the MCE from the core
+ that will soon try to retire the load from address A.
+
+Quite often (because speculation has got better) the CMCI from the memory
+controller is delivered before the core is committed to the instruction
+reading address A, so the interrupt is taken, and Linux offlines the page
+(marking it as poison).
+
+- Why the user process is killed for the instr case
+
+Commit 046545a661af ("mm/hwpoison: fix error page recovered but reported
+"not recovered"") tries to fix the noisy "Memory error not recovered"
+message and skips duplicate SIGBUSs due to the race. But it also introduced
+a bug: kill_accessing_process() returns -EHWPOISON for the instr case, and
+as a result kill_me_maybe() sends a SIGBUS to the user process.
+
+If the CMCI wins that race, the page is marked poisoned when
+uc_decode_notifier() calls memory_failure(). For dirty pages,
+memory_failure() invokes try_to_unmap() with the TTU_HWPOISON flag,
+converting the PTE to a hwpoison entry. As a result,
+kill_accessing_process():
+
+- call walk_page_range() and return 1 regardless of whether
+ try_to_unmap() succeeds or fails,
+- call kill_proc() to make sure a SIGBUS is sent
+- return -EHWPOISON to indicate that SIGBUS is already sent to the
+ process and kill_me_maybe() doesn't have to send it again.
+
+However, for clean pages, the TTU_HWPOISON flag is cleared, leaving the
+PTE unchanged and not converted to a hwpoison entry. Consequently, for
+clean pages whose PTE entries are not marked as hwpoison,
+kill_accessing_process() returns -EFAULT, causing kill_me_maybe() to send
+a SIGBUS.
+
+Console log looks like this:
+
+ Memory failure: 0x827ca68: corrupted page was clean: dropped without side effects
+ Memory failure: 0x827ca68: recovery action for clean LRU page: Recovered
+ Memory failure: 0x827ca68: already hardware poisoned
+ mce: Memory error not recovered
+
+To fix it, return 0 for "corrupted page was clean", preventing an
+unnecessary SIGBUS to the user process.
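+
+The contract seen by the MCE handler can then be sketched as follows
+(illustrative only, not the exact kill_me_maybe() code):
+
+    ret = memory_failure(pfn, flags);   /* flags include MF_ACTION_REQUIRED */
+    if (ret == -EHWPOISON)
+        return;     /* SIGBUS already sent by kill_accessing_process() */
+    if (!ret)
+        return;     /* clean page dropped, no SIGBUS needed */
+    /* any other error: report "Memory error not recovered", send SIGBUS */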
+
+[1] https://lore.kernel.org/lkml/20250217063335.22257-1-xueshuai@linux.alibaba.com/T/#mba94f1305b3009dd340ce4114d3221fe810d1871
+Link: https://lkml.kernel.org/r/20250312112852.82415-3-xueshuai@linux.alibaba.com
+Fixes: 046545a661af ("mm/hwpoison: fix error page recovered but reported "not recovered"")
+Signed-off-by: Shuai Xue <xueshuai@linux.alibaba.com>
+Tested-by: Tony Luck <tony.luck@intel.com>
+Acked-by: Miaohe Lin <linmiaohe@huawei.com>
+Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
+Cc: Borislav Betkov <bp@alien8.de>
+Cc: Catalin Marinas <catalin.marinas@arm.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: "H. Peter Anvin" <hpa@zytor.com>
+Cc: Ingo Molnar <mingo@redhat.com>
+Cc: Jane Chu <jane.chu@oracle.com>
+Cc: Jarkko Sakkinen <jarkko@kernel.org>
+Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
+Cc: Josh Poimboeuf <jpoimboe@kernel.org>
+Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Ruidong Tian <tianruidong@linux.alibaba.com>
+Cc: Thomas Gleinxer <tglx@linutronix.de>
+Cc: Yazen Ghannam <yazen.ghannam@amd.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/memory-failure.c | 11 ++++++++---
+ 1 file changed, 8 insertions(+), 3 deletions(-)
+
+--- a/mm/memory-failure.c
++++ b/mm/memory-failure.c
+@@ -879,12 +879,17 @@ static int kill_accessing_process(struct
+ mmap_read_lock(p->mm);
+ ret = walk_page_range(p->mm, 0, TASK_SIZE, &hwpoison_walk_ops,
+ (void *)&priv);
++ /*
++ * ret = 1 when CMCI wins, regardless of whether try_to_unmap()
++ * succeeds or fails, then kill the process with SIGBUS.
++ * ret = 0 when poison page is a clean page and it's dropped, no
++ * SIGBUS is needed.
++ */
+ if (ret == 1 && priv.tk.addr)
+ kill_proc(&priv.tk, pfn, flags);
+- else
+- ret = 0;
+ mmap_read_unlock(p->mm);
+- return ret > 0 ? -EHWPOISON : -EFAULT;
++
++ return ret > 0 ? -EHWPOISON : 0;
+ }
+
+ /*
--- /dev/null
+From 5f5ee52d4f58605330b09851273d6e56aaadd29e Mon Sep 17 00:00:00 2001
+From: Jinjiang Tu <tujinjiang@huawei.com>
+Date: Tue, 18 Mar 2025 16:39:38 +0800
+Subject: mm/hwpoison: introduce folio_contain_hwpoisoned_page() helper
+
+From: Jinjiang Tu <tujinjiang@huawei.com>
+
+commit 5f5ee52d4f58605330b09851273d6e56aaadd29e upstream.
+
+Patch series "mm/vmscan: don't try to reclaim hwpoison folio".
+
+Fix a bug during memory reclaim if folio is hwpoisoned.
+
+
+This patch (of 2):
+
+Introduce the helper folio_contain_hwpoisoned_page() to check whether the
+entire folio is hwpoisoned or contains hwpoisoned pages.
+
+Link: https://lkml.kernel.org/r/20250318083939.987651-1-tujinjiang@huawei.com
+Link: https://lkml.kernel.org/r/20250318083939.987651-2-tujinjiang@huawei.com
+Signed-off-by: Jinjiang Tu <tujinjiang@huawei.com>
+Acked-by: Miaohe Lin <linmiaohe@huawei.com>
+Cc: David Hildenbrand <david@redhat.com>
+Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
+Cc: Nanyong Sun <sunnanyong@huawei.com>
+Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/page-flags.h | 6 ++++++
+ mm/memory_hotplug.c | 3 +--
+ mm/shmem.c | 3 +--
+ 3 files changed, 8 insertions(+), 4 deletions(-)
+
+--- a/include/linux/page-flags.h
++++ b/include/linux/page-flags.h
+@@ -1111,6 +1111,12 @@ static inline bool is_page_hwpoison(cons
+ return folio_test_hugetlb(folio) && PageHWPoison(&folio->page);
+ }
+
++static inline bool folio_contain_hwpoisoned_page(struct folio *folio)
++{
++ return folio_test_hwpoison(folio) ||
++ (folio_test_large(folio) && folio_test_has_hwpoisoned(folio));
++}
++
+ bool is_free_buddy_page(const struct page *page);
+
+ PAGEFLAG(Isolated, isolated, PF_ANY);
+--- a/mm/memory_hotplug.c
++++ b/mm/memory_hotplug.c
+@@ -1801,8 +1801,7 @@ static void do_migrate_range(unsigned lo
+ if (unlikely(page_folio(page) != folio))
+ goto put_folio;
+
+- if (folio_test_hwpoison(folio) ||
+- (folio_test_large(folio) && folio_test_has_hwpoisoned(folio))) {
++ if (folio_contain_hwpoisoned_page(folio)) {
+ if (WARN_ON(folio_test_lru(folio)))
+ folio_isolate_lru(folio);
+ if (folio_mapped(folio)) {
+--- a/mm/shmem.c
++++ b/mm/shmem.c
+@@ -3042,8 +3042,7 @@ shmem_write_begin(struct file *file, str
+ if (ret)
+ return ret;
+
+- if (folio_test_hwpoison(folio) ||
+- (folio_test_large(folio) && folio_test_has_hwpoisoned(folio))) {
++ if (folio_contain_hwpoisoned_page(folio)) {
+ folio_unlock(folio);
+ folio_put(folio);
+ return -EIO;
--- /dev/null
+From 442b1eca223b4860cc85ef970ae602d125aec5a4 Mon Sep 17 00:00:00 2001
+From: Jane Chu <jane.chu@oracle.com>
+Date: Mon, 24 Feb 2025 14:14:45 -0700
+Subject: mm: make page_mapped_in_vma() hugetlb walk aware
+
+From: Jane Chu <jane.chu@oracle.com>
+
+commit 442b1eca223b4860cc85ef970ae602d125aec5a4 upstream.
+
+When a process consumes a UE in a page, the memory failure handler
+attempts to collect information for a potential SIGBUS. If the page is an
+anonymous page, page_mapped_in_vma(page, vma) is invoked in order to
+
+ 1. retrieve the vaddr from the process' address space,
+
+ 2. verify that the vaddr is indeed mapped to the poisoned page,
+ where 'page' is the precise small page with UE.
+
+It's been observed that when injecting poison into a non-head subpage of an
+anonymous hugetlb page, no SIGBUS shows up, while injecting into the head
+page produces a SIGBUS. The cause is that, although hugetlb_walk() returns
+a valid pmd entry (on x86), check_pte() detects a mismatch between the
+head page per the pmd and the input subpage. Thus the vaddr is considered
+not mapped to the subpage and the process is not collected for SIGBUS
+purposes. This is the calling stack:
+
+ collect_procs_anon
+ page_mapped_in_vma
+ page_vma_mapped_walk
+ hugetlb_walk
+ huge_pte_lock
+ check_pte
+
+The check_pte() header says that it will
+"check if [pvmw->pfn, @pvmw->pfn + @pvmw->nr_pages) is mapped at the @pvmw->pte",
+but in practice this only works if pvmw->pfn is the head page pfn at pvmw->pte.
+In hindsight, some pvmw->pte entries can point to a hugepage of some sort,
+so it makes sense to make check_pte() work for hugepages.
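+
+A worked example of the new overlap check (hypothetical pfns): a 2MB
+hugetlb page maps pfns [0x1000, 0x1200) at pvmw->pte, so pte_nr = 512,
+while poison was injected into subpage 0x1005, so pvmw->pfn = 0x1005 and
+pvmw->nr_pages = 1:
+
+    /* old: (pfn - pvmw->pfn) < nr_pages
+     *      (0x1000 - 0x1005) underflows and is never < 1 -> "not mapped"
+     * new: pfn + pte_nr - 1 >= pvmw->pfn   (0x11ff >= 0x1005)
+     *      pfn <= pvmw->pfn + nr_pages - 1 (0x1000 <= 0x1005)
+     *      both hold                       -> "mapped", task is collected
+     */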
+
+Link: https://lkml.kernel.org/r/20250224211445.2663312-1-jane.chu@oracle.com
+Signed-off-by: Jane Chu <jane.chu@oracle.com>
+Cc: Hugh Dickins <hughd@google.com>
+Cc: Kirill A. Shuemov <kirill.shutemov@linux.intel.com>
+Cc: linmiaohe <linmiaohe@huawei.com>
+Cc: Matthew Wilcow (Oracle) <willy@infradead.org>
+Cc: Peter Xu <peterx@redhat.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/page_vma_mapped.c | 13 +++++++++----
+ 1 file changed, 9 insertions(+), 4 deletions(-)
+
+--- a/mm/page_vma_mapped.c
++++ b/mm/page_vma_mapped.c
+@@ -77,6 +77,7 @@ static bool map_pte(struct page_vma_mapp
+ * mapped at the @pvmw->pte
+ * @pvmw: page_vma_mapped_walk struct, includes a pair pte and pfn range
+ * for checking
++ * @pte_nr: the number of small pages described by @pvmw->pte.
+ *
+ * page_vma_mapped_walk() found a place where pfn range is *potentially*
+ * mapped. check_pte() has to validate this.
+@@ -93,7 +94,7 @@ static bool map_pte(struct page_vma_mapp
+ * Otherwise, return false.
+ *
+ */
+-static bool check_pte(struct page_vma_mapped_walk *pvmw)
++static bool check_pte(struct page_vma_mapped_walk *pvmw, unsigned long pte_nr)
+ {
+ unsigned long pfn;
+ pte_t ptent = ptep_get(pvmw->pte);
+@@ -126,7 +127,11 @@ static bool check_pte(struct page_vma_ma
+ pfn = pte_pfn(ptent);
+ }
+
+- return (pfn - pvmw->pfn) < pvmw->nr_pages;
++ if ((pfn + pte_nr - 1) < pvmw->pfn)
++ return false;
++ if (pfn > (pvmw->pfn + pvmw->nr_pages - 1))
++ return false;
++ return true;
+ }
+
+ /* Returns true if the two ranges overlap. Careful to not overflow. */
+@@ -201,7 +206,7 @@ bool page_vma_mapped_walk(struct page_vm
+ return false;
+
+ pvmw->ptl = huge_pte_lock(hstate, mm, pvmw->pte);
+- if (!check_pte(pvmw))
++ if (!check_pte(pvmw, pages_per_huge_page(hstate)))
+ return not_found(pvmw);
+ return true;
+ }
+@@ -284,7 +289,7 @@ restart:
+ goto next_pte;
+ }
+ this_pte:
+- if (check_pte(pvmw))
++ if (check_pte(pvmw, 1))
+ return true;
+ next_pte:
+ do {
--- /dev/null
+From 937582ee8e8d227c30ec147629a0179131feaa80 Mon Sep 17 00:00:00 2001
+From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
+Date: Mon, 10 Mar 2025 20:50:34 +0000
+Subject: mm/mremap: correctly handle partial mremap() of VMA starting at 0
+
+From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
+
+commit 937582ee8e8d227c30ec147629a0179131feaa80 upstream.
+
+Patch series "refactor mremap and fix bug", v3.
+
+The existing mremap() logic has grown organically over a very long period
+of time, resulting in code that is in many parts, very difficult to follow
+and full of subtleties and sources of confusion.
+
+In addition, it is difficult to thread state through the operation
+correctly, as function arguments have expanded, some parameters are
+expected to be temporarily altered during the operation, others are
+intended to remain static and some can be overridden.
+
+This series completely refactors the mremap implementation, sensibly
+separating functions, adding comments to explain the more subtle aspects
+of the implementation and making use of small structs to thread state
+through everything.
+
+The reason for doing so is to lay the groundwork for planned future
+changes to the mremap logic, changes which require the ability to easily
+pass around state.
+
+Additionally, it would be unhelpful to add yet more logic to code that is
+already difficult to follow without first refactoring it like this.
+
+The first patch in this series additionally fixes a bug when a VMA with
+start address zero is partially remapped.
+
+Tested on real hardware under heavy workload and all self tests are
+passing.
+
+
+This patch (of 3):
+
+Consider the case of a partial mremap() (that results in a VMA split) of
+an accountable VMA (i.e. which has the VM_ACCOUNT flag set) whose start
+address is zero, with the MREMAP_MAYMOVE flag specified and a scenario
+where a move does in fact occur:
+
+ addr end
+ | |
+ v v
+ |-------------|
+ | vma |
+ |-------------|
+ 0
+
+This move is effected by unmapping the range [addr, end). In order to
+prevent an incorrect decrement of accounted memory which has already been
+determined, the mremap() code in move_vma() clears VM_ACCOUNT from the VMA
+prior to doing so, before reestablishing it in each of the VMAs
+post-split:
+
+ addr end
+ | |
+ v v
+ |---| |---|
+ | A | | B |
+ |---| |---|
+
+Commit 6b73cff239e5 ("mm: change munmap splitting order and move_vma()")
+changed this logic to determine whether there is a need to do so by
+establishing account_start and account_end and, in the instance where
+such an operation is required, assigning vma->vm_start and vma->vm_end
+to them.
+
+Later the code checks if the operation is required for 'A' referenced
+above thusly:
+
+ if (account_start) {
+ ...
+ }
+
+However, if the VMA described above has vma->vm_start == 0, which is now
+assigned to account_start, this branch will not be executed.
+
+As a result, the VMA 'A' above will remain stripped of its VM_ACCOUNT
+flag, incorrectly.
+
+The fix is to simply convert these variables to booleans and set them as
+required.
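+
+In code form (condensed from the hunks below), the difference is:
+
+    /* before: account_start = vma->vm_start; is 0 for a VMA starting at 0,
+     * so the later "if (account_start)" is skipped and VM_ACCOUNT is never
+     * restored on 'A'. After the fix: */
+    if (vma->vm_start < old_addr)
+        account_start = true;   /* true even when vm_start == 0 */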
+
+Link: https://lkml.kernel.org/r/cover.1741639347.git.lorenzo.stoakes@oracle.com
+Link: https://lkml.kernel.org/r/dc55cb6db25d97c3d9e460de4986a323fa959676.1741639347.git.lorenzo.stoakes@oracle.com
+Fixes: 6b73cff239e5 ("mm: change munmap splitting order and move_vma()")
+Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
+Reviewed-by: Harry Yoo <harry.yoo@oracle.com>
+Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
+Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/mremap.c | 10 +++++-----
+ 1 file changed, 5 insertions(+), 5 deletions(-)
+
+--- a/mm/mremap.c
++++ b/mm/mremap.c
+@@ -696,8 +696,8 @@ static unsigned long move_vma(struct vm_
+ unsigned long vm_flags = vma->vm_flags;
+ unsigned long new_pgoff;
+ unsigned long moved_len;
+- unsigned long account_start = 0;
+- unsigned long account_end = 0;
++ bool account_start = false;
++ bool account_end = false;
+ unsigned long hiwater_vm;
+ int err = 0;
+ bool need_rmap_locks;
+@@ -781,9 +781,9 @@ static unsigned long move_vma(struct vm_
+ if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) {
+ vm_flags_clear(vma, VM_ACCOUNT);
+ if (vma->vm_start < old_addr)
+- account_start = vma->vm_start;
++ account_start = true;
+ if (vma->vm_end > old_addr + old_len)
+- account_end = vma->vm_end;
++ account_end = true;
+ }
+
+ /*
+@@ -823,7 +823,7 @@ static unsigned long move_vma(struct vm_
+ /* OOM: unable to split vma, just get accounts right */
+ if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP))
+ vm_acct_memory(old_len >> PAGE_SHIFT);
+- account_start = account_end = 0;
++ account_start = account_end = false;
+ }
+
+ if (vm_flags & VM_LOCKED) {
--- /dev/null
+From bc3fe6805cf09a25a086573a17d40e525208c5d8 Mon Sep 17 00:00:00 2001
+From: David Hildenbrand <david@redhat.com>
+Date: Mon, 10 Feb 2025 20:37:44 +0100
+Subject: mm/rmap: reject hugetlb folios in folio_make_device_exclusive()
+
+From: David Hildenbrand <david@redhat.com>
+
+commit bc3fe6805cf09a25a086573a17d40e525208c5d8 upstream.
+
+Even though FOLL_SPLIT_PMD on hugetlb now always fails with -EOPNOTSUPP,
+let's add a safety net in case FOLL_SPLIT_PMD usage is ever reworked.
+
+In particular, before commit 9cb28da54643 ("mm/gup: handle hugetlb in the
+generic follow_page_mask code"), GUP(FOLL_SPLIT_PMD) would simply have
+returned a page. Back then, hugetlb folios that are not PMD-sized would
+never have been prone to FOLL_SPLIT_PMD.
+
+hugetlb folios can be anonymous, and page_make_device_exclusive_one() is
+not really prepared to handle them at all, so let's spell that out.
+
+Link: https://lkml.kernel.org/r/20250210193801.781278-3-david@redhat.com
+Fixes: b756a3b5e7ea ("mm: device exclusive memory access")
+Signed-off-by: David Hildenbrand <david@redhat.com>
+Reviewed-by: Alistair Popple <apopple@nvidia.com>
+Tested-by: Alistair Popple <apopple@nvidia.com>
+Cc: Alex Shi <alexs@kernel.org>
+Cc: Danilo Krummrich <dakr@kernel.org>
+Cc: Dave Airlie <airlied@gmail.com>
+Cc: Jann Horn <jannh@google.com>
+Cc: Jason Gunthorpe <jgg@nvidia.com>
+Cc: Jerome Glisse <jglisse@redhat.com>
+Cc: John Hubbard <jhubbard@nvidia.com>
+Cc: Jonathan Corbet <corbet@lwn.net>
+Cc: Karol Herbst <kherbst@redhat.com>
+Cc: Liam Howlett <liam.howlett@oracle.com>
+Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
+Cc: Lyude <lyude@redhat.com>
+Cc: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
+Cc: Oleg Nesterov <oleg@redhat.com>
+Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
+Cc: Peter Xu <peterx@redhat.com>
+Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: SeongJae Park <sj@kernel.org>
+Cc: Simona Vetter <simona.vetter@ffwll.ch>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Cc: Yanteng Si <si.yanteng@linux.dev>
+Cc: Barry Song <v-songbaohua@oppo.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/rmap.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/mm/rmap.c
++++ b/mm/rmap.c
+@@ -2488,7 +2488,7 @@ static bool folio_make_device_exclusive(
+ * Restrict to anonymous folios for now to avoid potential writeback
+ * issues.
+ */
+- if (!folio_test_anon(folio))
++ if (!folio_test_anon(folio) || folio_test_hugetlb(folio))
+ return false;
+
+ rmap_walk(folio, &rwc);
--- /dev/null
+From fe4cdc2c4e248f48de23bc778870fd71e772a274 Mon Sep 17 00:00:00 2001
+From: Peter Xu <peterx@redhat.com>
+Date: Wed, 12 Mar 2025 10:51:31 -0400
+Subject: mm/userfaultfd: fix release hang over concurrent GUP
+
+From: Peter Xu <peterx@redhat.com>
+
+commit fe4cdc2c4e248f48de23bc778870fd71e772a274 upstream.
+
+This patch should fix a possible userfaultfd release() hang during
+concurrent GUP.
+
+This problem was initially reported by Dimitris Siakavaras in July 2023
+[1] in a firecracker use case. Firecracker has a separate process
+handling page faults remotely, and when the process releases the
+userfaultfd it can race with a concurrent GUP from KVM trying to fault in
+a guest page during the secondary MMU page fault process.
+
+A similar problem was reported recently again by Jinjiang Tu in March 2025
+[2], even though the race happened this time with a mlockall() operation,
+which does GUP in a similar fashion.
+
+In 2017, commit 656710a60e36 ("userfaultfd: non-cooperative: closing the
+uffd without triggering SIGBUS") tried to fix this issue. AFAIU, it
+covers the fault paths well but does not yet work for GUP. In GUP, the
+issue is that NOPAGE is treated almost the same as "page fault resolved"
+in faultin_page(), so GUP will follow the page again, see it still
+missing, and keep retrying, entering the live lock situation reported.
+
+This change makes core mm return RETRY instead of NOPAGE for both the GUP
+and fault paths, proactively releasing the mmap read lock. This should
+guarantee that the release thread makes progress on taking the write
+lock, avoiding the live lock even for GUP.
+
+While at it, rearrange the comments to make sure they are up to date.
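+
+As a self-contained userspace sketch of that difference (all names below
+are simplified stand-ins for illustration; this is not the kernel code),
+NOPAGE lets the GUP loop spin forever, while RETRY forces the mmap read
+lock to be dropped so the releasing thread can take the write lock:
+
+ #include <stdbool.h>
+ #include <stdio.h>
+
+ enum fault_ret { FAULT_NOPAGE, FAULT_RETRY };
+
+ static bool mmap_read_locked;
+
+ /* The uffd context was already released by the other thread. */
+ static enum fault_ret fake_handle_userfault(enum fault_ret policy)
+ {
+         if (policy == FAULT_RETRY)
+                 mmap_read_locked = false;     /* release_fault_lock() */
+         return policy;
+ }
+
+ static bool fake_gup_one_page(enum fault_ret policy, int max_spins)
+ {
+         mmap_read_locked = true;
+         for (int i = 0; i < max_spins; i++) {
+                 if (fake_handle_userfault(policy) == FAULT_RETRY &&
+                     !mmap_read_locked)
+                         return true;  /* lock dropped: release() can run */
+                 /* NOPAGE looks like "resolved": follow the page again,
+                  * find it still missing, and fault once more. */
+         }
+         return false;                 /* spun without releasing: livelock */
+ }
+
+ int main(void)
+ {
+         printf("NOPAGE policy makes progress: %s\n",
+                fake_gup_one_page(FAULT_NOPAGE, 1000) ? "yes" : "no (livelock)");
+         printf("RETRY policy makes progress:  %s\n",
+                fake_gup_one_page(FAULT_RETRY, 1000) ? "yes" : "no");
+         return 0;
+ }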
+
+[1] https://lore.kernel.org/r/79375b71-db2e-3e66-346b-254c90d915e2@cslab.ece.ntua.gr
+[2] https://lore.kernel.org/r/20250307072133.3522652-1-tujinjiang@huawei.com
+
+Link: https://lkml.kernel.org/r/20250312145131.1143062-1-peterx@redhat.com
+Signed-off-by: Peter Xu <peterx@redhat.com>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Cc: Mike Rapoport (IBM) <rppt@kernel.org>
+Cc: Axel Rasmussen <axelrasmussen@google.com>
+Cc: Jinjiang Tu <tujinjiang@huawei.com>
+Cc: Dimitris Siakavaras <jimsiak@cslab.ece.ntua.gr>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/userfaultfd.c | 51 +++++++++++++++++++++++++--------------------------
+ 1 file changed, 25 insertions(+), 26 deletions(-)
+
+--- a/fs/userfaultfd.c
++++ b/fs/userfaultfd.c
+@@ -396,32 +396,6 @@ vm_fault_t handle_userfault(struct vm_fa
+ goto out;
+
+ /*
+- * If it's already released don't get it. This avoids to loop
+- * in __get_user_pages if userfaultfd_release waits on the
+- * caller of handle_userfault to release the mmap_lock.
+- */
+- if (unlikely(READ_ONCE(ctx->released))) {
+- /*
+- * Don't return VM_FAULT_SIGBUS in this case, so a non
+- * cooperative manager can close the uffd after the
+- * last UFFDIO_COPY, without risking to trigger an
+- * involuntary SIGBUS if the process was starting the
+- * userfaultfd while the userfaultfd was still armed
+- * (but after the last UFFDIO_COPY). If the uffd
+- * wasn't already closed when the userfault reached
+- * this point, that would normally be solved by
+- * userfaultfd_must_wait returning 'false'.
+- *
+- * If we were to return VM_FAULT_SIGBUS here, the non
+- * cooperative manager would be instead forced to
+- * always call UFFDIO_UNREGISTER before it can safely
+- * close the uffd.
+- */
+- ret = VM_FAULT_NOPAGE;
+- goto out;
+- }
+-
+- /*
+ * Check that we can return VM_FAULT_RETRY.
+ *
+ * NOTE: it should become possible to return VM_FAULT_RETRY
+@@ -457,6 +431,31 @@ vm_fault_t handle_userfault(struct vm_fa
+ if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
+ goto out;
+
++ if (unlikely(READ_ONCE(ctx->released))) {
++ /*
++ * If a concurrent release is detected, do not return
++ * VM_FAULT_SIGBUS or VM_FAULT_NOPAGE, but instead always
++ * return VM_FAULT_RETRY with lock released proactively.
++ *
++ * If we were to return VM_FAULT_SIGBUS here, the non
++ * cooperative manager would be instead forced to
++ * always call UFFDIO_UNREGISTER before it can safely
++ * close the uffd, to avoid involuntary SIGBUS triggered.
++ *
++ * If we were to return VM_FAULT_NOPAGE, it would work for
++ * the fault path, in which the lock will be released
++ * later. However for GUP, faultin_page() does nothing
++ * special on NOPAGE, so GUP would spin retrying without
++ * releasing the mmap read lock, causing possible livelock.
++ *
++ * Here only VM_FAULT_RETRY would make sure the mmap lock
++ * be released immediately, so that the thread concurrently
++ * releasing the userfault would always make progress.
++ */
++ release_fault_lock(vmf);
++ goto out;
++ }
++
+ /* take the reference before dropping the mmap_lock */
+ userfaultfd_ctx_get(ctx);
+
--- /dev/null
+From 0bb2f7a1ad1f11d861f58e5ee5051c8974ff9569 Mon Sep 17 00:00:00 2001
+From: Kuniyuki Iwashima <kuniyu@amazon.com>
+Date: Mon, 7 Apr 2025 09:33:11 -0700
+Subject: net: Fix null-ptr-deref by sock_lock_init_class_and_name() and rmmod.
+
+From: Kuniyuki Iwashima <kuniyu@amazon.com>
+
+commit 0bb2f7a1ad1f11d861f58e5ee5051c8974ff9569 upstream.
+
+When I ran the repro [0] and waited a few seconds, I observed two
+LOCKDEP splats: a warning immediately followed by a null-ptr-deref. [1]
+
+Reproduction Steps:
+
+ 1) Mount CIFS
+ 2) Add an iptables rule to drop incoming FIN packets for CIFS
+ 3) Unmount CIFS
+ 4) Unload the CIFS module
+ 5) Remove the iptables rule
+
+At step 3), the CIFS module calls sock_release() for the underlying
+TCP socket, and it returns quickly. However, the socket remains in
+FIN_WAIT_1 because incoming FIN packets are dropped.
+
+At this point, the module's refcnt is 0 while the socket is still
+alive, so the following rmmod command succeeds.
+
+ # ss -tan
+ State Recv-Q Send-Q Local Address:Port Peer Address:Port
+ FIN-WAIT-1 0 477 10.0.2.15:51062 10.0.0.137:445
+
+ # lsmod | grep cifs
+ cifs 1159168 0
+
+This highlights a discrepancy between the lifetime of the CIFS module
+and the underlying TCP socket. Even after CIFS calls sock_release()
+and it returns, the TCP socket stays alive for a while in order to
+close the connection gracefully.
+
+While this is generally fine, it causes an issue with LOCKDEP because
+CIFS assigns a different lock class to the TCP socket's sk->sk_lock
+using sock_lock_init_class_and_name().
+
+Once an incoming packet is processed for the socket or a timer fires,
+sk->sk_lock is acquired.
+
+Then, LOCKDEP checks the lock context in check_wait_context(), where
+hlock_class() is called to retrieve the lock class. However, since
+the module has already been unloaded, hlock_class() logs a warning
+and returns NULL, triggering the null-ptr-deref.
+
+If LOCKDEP is enabled, we must ensure that a module calling
+sock_lock_init_class_and_name() (CIFS, NFS, etc) cannot be unloaded
+while such a socket is still alive to prevent this issue.
+
+Let's hold the module reference in sock_lock_init_class_and_name()
+and release it when the socket is freed in sk_prot_free().
+
+Note that sock_lock_init() clears sk->sk_owner: svc_create_socket()
+calls sock_lock_init_class_and_name() for a listening socket, and its
+child sockets are cloned by sk_clone_lock() without GFP_ZERO, so a clone
+would otherwise carry over the parent's sk_owner without holding its own
+module reference.
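+
+The lifetime rule the fix enforces can be sketched in plain userspace C
+(the structs and helpers below are invented for illustration and are not
+the kernel APIs): an object pins the module that created it until the
+object itself is freed, so the module cannot be unloaded while the
+object may still dereference the module's lock class.
+
+ #include <stdio.h>
+
+ struct fake_module { const char *name; int refcnt; };
+ struct fake_sock { struct fake_module *sk_owner; };
+
+ static void module_get(struct fake_module *m) { m->refcnt++; }
+ static void module_put(struct fake_module *m) { m->refcnt--; }
+
+ static void fake_sock_lock_init_owner(struct fake_sock *sk,
+                                       struct fake_module *owner)
+ {
+         module_get(owner);      /* pin the module ...                */
+         sk->sk_owner = owner;   /* ... for as long as the sock lives */
+ }
+
+ static void fake_sk_free(struct fake_sock *sk)
+ {
+         module_put(sk->sk_owner);  /* dropped only when the sock dies */
+ }
+
+ int main(void)
+ {
+         struct fake_module cifs = { .name = "cifs", .refcnt = 0 };
+         struct fake_sock sk;
+
+         fake_sock_lock_init_owner(&sk, &cifs);
+         printf("rmmod %s allowed while socket alive? %s\n",
+                cifs.name, cifs.refcnt ? "no" : "yes");
+         fake_sk_free(&sk);
+         printf("rmmod %s allowed after socket freed? %s\n",
+                cifs.name, cifs.refcnt ? "no" : "yes");
+         return 0;
+ }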
+
+[0]:
+CIFS_SERVER="10.0.0.137"
+CIFS_PATH="//${CIFS_SERVER}/Users/Administrator/Desktop/CIFS_TEST"
+DEV="enp0s3"
+CRED="/root/WindowsCredential.txt"
+
+MNT=$(mktemp -d /tmp/XXXXXX)
+mount -t cifs ${CIFS_PATH} ${MNT} -o vers=3.0,credentials=${CRED},cache=none,echo_interval=1
+
+iptables -A INPUT -s ${CIFS_SERVER} -j DROP
+
+for i in $(seq 10);
+do
+ umount ${MNT}
+ rmmod cifs
+ sleep 1
+done
+
+rm -r ${MNT}
+
+iptables -D INPUT -s ${CIFS_SERVER} -j DROP
+
+[1]:
+DEBUG_LOCKS_WARN_ON(1)
+WARNING: CPU: 10 PID: 0 at kernel/locking/lockdep.c:234 hlock_class (kernel/locking/lockdep.c:234 kernel/locking/lockdep.c:223)
+Modules linked in: cifs_arc4 nls_ucs2_utils cifs_md4 [last unloaded: cifs]
+CPU: 10 UID: 0 PID: 0 Comm: swapper/10 Not tainted 6.14.0 #36
+Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014
+RIP: 0010:hlock_class (kernel/locking/lockdep.c:234 kernel/locking/lockdep.c:223)
+...
+Call Trace:
+ <IRQ>
+ __lock_acquire (kernel/locking/lockdep.c:4853 kernel/locking/lockdep.c:5178)
+ lock_acquire (kernel/locking/lockdep.c:469 kernel/locking/lockdep.c:5853 kernel/locking/lockdep.c:5816)
+ _raw_spin_lock_nested (kernel/locking/spinlock.c:379)
+ tcp_v4_rcv (./include/linux/skbuff.h:1678 ./include/net/tcp.h:2547 net/ipv4/tcp_ipv4.c:2350)
+...
+
+BUG: kernel NULL pointer dereference, address: 00000000000000c4
+ PF: supervisor read access in kernel mode
+ PF: error_code(0x0000) - not-present page
+PGD 0
+Oops: Oops: 0000 [#1] PREEMPT SMP NOPTI
+CPU: 10 UID: 0 PID: 0 Comm: swapper/10 Tainted: G W 6.14.0 #36
+Tainted: [W]=WARN
+Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014
+RIP: 0010:__lock_acquire (kernel/locking/lockdep.c:4852 kernel/locking/lockdep.c:5178)
+Code: 15 41 09 c7 41 8b 44 24 20 25 ff 1f 00 00 41 09 c7 8b 84 24 a0 00 00 00 45 89 7c 24 20 41 89 44 24 24 e8 e1 bc ff ff 4c 89 e7 <44> 0f b6 b8 c4 00 00 00 e8 d1 bc ff ff 0f b6 80 c5 00 00 00 88 44
+RSP: 0018:ffa0000000468a10 EFLAGS: 00010046
+RAX: 0000000000000000 RBX: ff1100010091cc38 RCX: 0000000000000027
+RDX: ff1100081f09ca48 RSI: 0000000000000001 RDI: ff1100010091cc88
+RBP: ff1100010091c200 R08: ff1100083fe6e228 R09: 00000000ffffbfff
+R10: ff1100081eca0000 R11: ff1100083fe10dc0 R12: ff1100010091cc88
+R13: 0000000000000001 R14: 0000000000000000 R15: 00000000000424b1
+FS: 0000000000000000(0000) GS:ff1100081f080000(0000) knlGS:0000000000000000
+CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+CR2: 00000000000000c4 CR3: 0000000002c4a003 CR4: 0000000000771ef0
+DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
+DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7: 0000000000000400
+PKRU: 55555554
+Call Trace:
+ <IRQ>
+ lock_acquire (kernel/locking/lockdep.c:469 kernel/locking/lockdep.c:5853 kernel/locking/lockdep.c:5816)
+ _raw_spin_lock_nested (kernel/locking/spinlock.c:379)
+ tcp_v4_rcv (./include/linux/skbuff.h:1678 ./include/net/tcp.h:2547 net/ipv4/tcp_ipv4.c:2350)
+ ip_protocol_deliver_rcu (net/ipv4/ip_input.c:205 (discriminator 1))
+ ip_local_deliver_finish (./include/linux/rcupdate.h:878 net/ipv4/ip_input.c:234)
+ ip_sublist_rcv_finish (net/ipv4/ip_input.c:576)
+ ip_list_rcv_finish (net/ipv4/ip_input.c:628)
+ ip_list_rcv (net/ipv4/ip_input.c:670)
+ __netif_receive_skb_list_core (net/core/dev.c:5939 net/core/dev.c:5986)
+ netif_receive_skb_list_internal (net/core/dev.c:6040 net/core/dev.c:6129)
+ napi_complete_done (./include/linux/list.h:37 ./include/net/gro.h:519 ./include/net/gro.h:514 net/core/dev.c:6496)
+ e1000_clean (drivers/net/ethernet/intel/e1000/e1000_main.c:3815)
+ __napi_poll.constprop.0 (net/core/dev.c:7191)
+ net_rx_action (net/core/dev.c:7262 net/core/dev.c:7382)
+ handle_softirqs (kernel/softirq.c:561)
+ __irq_exit_rcu (kernel/softirq.c:596 kernel/softirq.c:435 kernel/softirq.c:662)
+ irq_exit_rcu (kernel/softirq.c:680)
+ common_interrupt (arch/x86/kernel/irq.c:280 (discriminator 14))
+ </IRQ>
+ <TASK>
+ asm_common_interrupt (./arch/x86/include/asm/idtentry.h:693)
+RIP: 0010:default_idle (./arch/x86/include/asm/irqflags.h:37 ./arch/x86/include/asm/irqflags.h:92 arch/x86/kernel/process.c:744)
+Code: 4c 01 c7 4c 29 c2 e9 72 ff ff ff 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 f3 0f 1e fa eb 07 0f 00 2d c3 2b 15 00 fb f4 <fa> c3 cc cc cc cc 66 66 2e 0f 1f 84 00 00 00 00 00 90 90 90 90 90
+RSP: 0018:ffa00000000ffee8 EFLAGS: 00000202
+RAX: 000000000000640b RBX: ff1100010091c200 RCX: 0000000000061aa4
+RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffffff812f30c5
+RBP: 000000000000000a R08: 0000000000000001 R09: 0000000000000000
+R10: 0000000000000001 R11: 0000000000000002 R12: 0000000000000000
+R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
+ ? do_idle (kernel/sched/idle.c:186 kernel/sched/idle.c:325)
+ default_idle_call (./include/linux/cpuidle.h:143 kernel/sched/idle.c:118)
+ do_idle (kernel/sched/idle.c:186 kernel/sched/idle.c:325)
+ cpu_startup_entry (kernel/sched/idle.c:422 (discriminator 1))
+ start_secondary (arch/x86/kernel/smpboot.c:315)
+ common_startup_64 (arch/x86/kernel/head_64.S:421)
+ </TASK>
+Modules linked in: cifs_arc4 nls_ucs2_utils cifs_md4 [last unloaded: cifs]
+CR2: 00000000000000c4
+
+Fixes: ed07536ed673 ("[PATCH] lockdep: annotate nfs/nfsd in-kernel sockets")
+Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Cc: stable@vger.kernel.org
+Link: https://patch.msgid.link/20250407163313.22682-1-kuniyu@amazon.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/sock.h | 40 ++++++++++++++++++++++++++++++++++++++--
+ net/core/sock.c | 5 +++++
+ 2 files changed, 43 insertions(+), 2 deletions(-)
+
+--- a/include/net/sock.h
++++ b/include/net/sock.h
+@@ -338,6 +338,8 @@ struct sk_filter;
+ * @sk_txtime_unused: unused txtime flags
+ * @ns_tracker: tracker for netns reference
+ * @sk_user_frags: xarray of pages the user is holding a reference on.
++ * @sk_owner: reference to the real owner of the socket that calls
++ * sock_lock_init_class_and_name().
+ */
+ struct sock {
+ /*
+@@ -544,6 +546,10 @@ struct sock {
+ struct rcu_head sk_rcu;
+ netns_tracker ns_tracker;
+ struct xarray sk_user_frags;
++
++#if IS_ENABLED(CONFIG_PROVE_LOCKING) && IS_ENABLED(CONFIG_MODULES)
++ struct module *sk_owner;
++#endif
+ };
+
+ struct sock_bh_locked {
+@@ -1585,6 +1591,35 @@ static inline void sk_mem_uncharge(struc
+ sk_mem_reclaim(sk);
+ }
+
++#if IS_ENABLED(CONFIG_PROVE_LOCKING) && IS_ENABLED(CONFIG_MODULES)
++static inline void sk_owner_set(struct sock *sk, struct module *owner)
++{
++ __module_get(owner);
++ sk->sk_owner = owner;
++}
++
++static inline void sk_owner_clear(struct sock *sk)
++{
++ sk->sk_owner = NULL;
++}
++
++static inline void sk_owner_put(struct sock *sk)
++{
++ module_put(sk->sk_owner);
++}
++#else
++static inline void sk_owner_set(struct sock *sk, struct module *owner)
++{
++}
++
++static inline void sk_owner_clear(struct sock *sk)
++{
++}
++
++static inline void sk_owner_put(struct sock *sk)
++{
++}
++#endif
+ /*
+ * Macro so as to not evaluate some arguments when
+ * lockdep is not enabled.
+@@ -1594,13 +1629,14 @@ static inline void sk_mem_uncharge(struc
+ */
+ #define sock_lock_init_class_and_name(sk, sname, skey, name, key) \
+ do { \
++ sk_owner_set(sk, THIS_MODULE); \
+ sk->sk_lock.owned = 0; \
+ init_waitqueue_head(&sk->sk_lock.wq); \
+ spin_lock_init(&(sk)->sk_lock.slock); \
+ debug_check_no_locks_freed((void *)&(sk)->sk_lock, \
+- sizeof((sk)->sk_lock)); \
++ sizeof((sk)->sk_lock)); \
+ lockdep_set_class_and_name(&(sk)->sk_lock.slock, \
+- (skey), (sname)); \
++ (skey), (sname)); \
+ lockdep_init_map(&(sk)->sk_lock.dep_map, (name), (key), 0); \
+ } while (0)
+
+--- a/net/core/sock.c
++++ b/net/core/sock.c
+@@ -2107,6 +2107,8 @@ lenout:
+ */
+ static inline void sock_lock_init(struct sock *sk)
+ {
++ sk_owner_clear(sk);
++
+ if (sk->sk_kern_sock)
+ sock_lock_init_class_and_name(
+ sk,
+@@ -2203,6 +2205,9 @@ static void sk_prot_free(struct proto *p
+ cgroup_sk_free(&sk->sk_cgrp_data);
+ mem_cgroup_sk_free(sk);
+ security_sk_free(sk);
++
++ sk_owner_put(sk);
++
+ if (slab != NULL)
+ kmem_cache_free(slab, sk);
+ else
--- /dev/null
+From f1a69a940de58b16e8249dff26f74c8cc59b32be Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Ricardo=20Ca=C3=B1uelo=20Navarro?= <rcn@igalia.com>
+Date: Fri, 4 Apr 2025 16:53:21 +0200
+Subject: sctp: detect and prevent references to a freed transport in sendmsg
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Ricardo Cañuelo Navarro <rcn@igalia.com>
+
+commit f1a69a940de58b16e8249dff26f74c8cc59b32be upstream.
+
+sctp_sendmsg() re-uses associations and transports when possible by
+doing a lookup based on the socket endpoint and the message destination
+address, and then sctp_sendmsg_to_asoc() sets the selected transport in
+all the message chunks to be sent.
+
+There's a possible race condition if another thread triggers the removal
+of that selected transport, for instance, by explicitly unbinding an
+address with setsockopt(SCTP_SOCKOPT_BINDX_REM), after the chunks have
+been set up and before the message is sent. This can happen if the send
+buffer is full, during the period when the sender thread temporarily
+releases the socket lock in sctp_wait_for_sndbuf().
+
+As a result, when the association outqueue is flushed, the access to the
+transport data in sctp_outq_select_transport() becomes a use-after-free
+read.
+
+This change avoids this scenario by having sctp_transport_free() signal
+the freeing of the transport, tagging it as "dead". In order to do this,
+the patch restores the "dead" bit in struct sctp_transport, which was
+removed in
+commit 47faa1e4c50e ("sctp: remove the dead field of sctp_transport").
+
+Then, in the scenario where the sender thread has released the socket
+lock in sctp_wait_for_sndbuf(), the bit is checked again after
+re-acquiring the socket lock to detect the deletion. This is done while
+holding a reference to the transport to prevent it from being freed in
+the process.
+
+If the transport was deleted while the socket lock was relinquished,
+sctp_sendmsg_to_asoc() will return -EAGAIN to let userspace retry the
+send.
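+
+The resulting wait pattern can be sketched in plain userspace C (every
+name below is invented for illustration; this is not the SCTP code):
+take a reference on the object before dropping the lock to sleep, and
+recheck its dead flag after reacquiring the lock.
+
+ #include <pthread.h>
+ #include <stdatomic.h>
+ #include <stdbool.h>
+ #include <stdio.h>
+ #include <stdlib.h>
+ #include <unistd.h>
+
+ struct fake_transport { atomic_int refcnt; bool dead; };
+
+ static pthread_mutex_t sock_lock = PTHREAD_MUTEX_INITIALIZER;
+
+ static void hold(struct fake_transport *t) { atomic_fetch_add(&t->refcnt, 1); }
+ static void put(struct fake_transport *t)
+ {
+         if (atomic_fetch_sub(&t->refcnt, 1) == 1)
+                 free(t);
+ }
+
+ static void *remover(void *arg)          /* the BINDX_REM side */
+ {
+         struct fake_transport *t = arg;
+
+         pthread_mutex_lock(&sock_lock);
+         t->dead = true;                  /* like sctp_transport_free() */
+         pthread_mutex_unlock(&sock_lock);
+         put(t);                          /* drop the owner's reference */
+         return NULL;
+ }
+
+ int main(void)                           /* the sender side */
+ {
+         struct fake_transport *t = calloc(1, sizeof(*t));
+         pthread_t thr;
+
+         atomic_store(&t->refcnt, 1);
+
+         pthread_mutex_lock(&sock_lock);
+         hold(t);                          /* pin it before sleeping */
+         pthread_mutex_unlock(&sock_lock); /* the wait drops the lock */
+
+         pthread_create(&thr, NULL, remover, t);
+         usleep(10000);                    /* "wait for sndbuf space" */
+
+         pthread_mutex_lock(&sock_lock);
+         if (t->dead)
+                 puts("transport removed while we slept: return -EAGAIN");
+         pthread_mutex_unlock(&sock_lock);
+
+         put(t);                           /* drop our pin */
+         pthread_join(thr, NULL);
+         return 0;
+ }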
+
+The bug was found by a private syzbot instance (see the error report [1]
+and the C reproducer that triggers it [2]).
+
+Link: https://people.igalia.com/rcn/kernel_logs/20250402__KASAN_slab-use-after-free_Read_in_sctp_outq_select_transport.txt [1]
+Link: https://people.igalia.com/rcn/kernel_logs/20250402__KASAN_slab-use-after-free_Read_in_sctp_outq_select_transport__repro.c [2]
+Cc: stable@vger.kernel.org
+Fixes: df132eff4638 ("sctp: clear the transport of some out_chunk_list chunks in sctp_assoc_rm_peer")
+Suggested-by: Xin Long <lucien.xin@gmail.com>
+Signed-off-by: Ricardo Cañuelo Navarro <rcn@igalia.com>
+Acked-by: Xin Long <lucien.xin@gmail.com>
+Link: https://patch.msgid.link/20250404-kasan_slab-use-after-free_read_in_sctp_outq_select_transport__20250404-v1-1-5ce4a0b78ef2@igalia.com
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/sctp/structs.h | 3 ++-
+ net/sctp/socket.c | 22 ++++++++++++++--------
+ net/sctp/transport.c | 2 ++
+ 3 files changed, 18 insertions(+), 9 deletions(-)
+
+--- a/include/net/sctp/structs.h
++++ b/include/net/sctp/structs.h
+@@ -775,6 +775,7 @@ struct sctp_transport {
+
+ /* Reference counting. */
+ refcount_t refcnt;
++ __u32 dead:1,
+ /* RTO-Pending : A flag used to track if one of the DATA
+ * chunks sent to this address is currently being
+ * used to compute a RTT. If this flag is 0,
+@@ -784,7 +785,7 @@ struct sctp_transport {
+ * calculation completes (i.e. the DATA chunk
+ * is SACK'd) clear this flag.
+ */
+- __u32 rto_pending:1,
++ rto_pending:1,
+
+ /*
+ * hb_sent : a flag that signals that we have a pending
+--- a/net/sctp/socket.c
++++ b/net/sctp/socket.c
+@@ -72,8 +72,9 @@
+ /* Forward declarations for internal helper functions. */
+ static bool sctp_writeable(const struct sock *sk);
+ static void sctp_wfree(struct sk_buff *skb);
+-static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p,
+- size_t msg_len);
++static int sctp_wait_for_sndbuf(struct sctp_association *asoc,
++ struct sctp_transport *transport,
++ long *timeo_p, size_t msg_len);
+ static int sctp_wait_for_packet(struct sock *sk, int *err, long *timeo_p);
+ static int sctp_wait_for_connect(struct sctp_association *, long *timeo_p);
+ static int sctp_wait_for_accept(struct sock *sk, long timeo);
+@@ -1828,7 +1829,7 @@ static int sctp_sendmsg_to_asoc(struct s
+
+ if (sctp_wspace(asoc) <= 0 || !sk_wmem_schedule(sk, msg_len)) {
+ timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
+- err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len);
++ err = sctp_wait_for_sndbuf(asoc, transport, &timeo, msg_len);
+ if (err)
+ goto err;
+ if (unlikely(sinfo->sinfo_stream >= asoc->stream.outcnt)) {
+@@ -9214,8 +9215,9 @@ void sctp_sock_rfree(struct sk_buff *skb
+
+
+ /* Helper function to wait for space in the sndbuf. */
+-static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p,
+- size_t msg_len)
++static int sctp_wait_for_sndbuf(struct sctp_association *asoc,
++ struct sctp_transport *transport,
++ long *timeo_p, size_t msg_len)
+ {
+ struct sock *sk = asoc->base.sk;
+ long current_timeo = *timeo_p;
+@@ -9225,7 +9227,9 @@ static int sctp_wait_for_sndbuf(struct s
+ pr_debug("%s: asoc:%p, timeo:%ld, msg_len:%zu\n", __func__, asoc,
+ *timeo_p, msg_len);
+
+- /* Increment the association's refcnt. */
++ /* Increment the transport and association's refcnt. */
++ if (transport)
++ sctp_transport_hold(transport);
+ sctp_association_hold(asoc);
+
+ /* Wait on the association specific sndbuf space. */
+@@ -9234,7 +9238,7 @@ static int sctp_wait_for_sndbuf(struct s
+ TASK_INTERRUPTIBLE);
+ if (asoc->base.dead)
+ goto do_dead;
+- if (!*timeo_p)
++ if ((!*timeo_p) || (transport && transport->dead))
+ goto do_nonblock;
+ if (sk->sk_err || asoc->state >= SCTP_STATE_SHUTDOWN_PENDING)
+ goto do_error;
+@@ -9259,7 +9263,9 @@ static int sctp_wait_for_sndbuf(struct s
+ out:
+ finish_wait(&asoc->wait, &wait);
+
+- /* Release the association's refcnt. */
++ /* Release the transport and association's refcnt. */
++ if (transport)
++ sctp_transport_put(transport);
+ sctp_association_put(asoc);
+
+ return err;
+--- a/net/sctp/transport.c
++++ b/net/sctp/transport.c
+@@ -117,6 +117,8 @@ fail:
+ */
+ void sctp_transport_free(struct sctp_transport *transport)
+ {
++ transport->dead = 1;
++
+ /* Try to delete the heartbeat timer. */
+ if (del_timer(&transport->hb_timer))
+ sctp_transport_put(transport);
btrfs-tests-fix-chunk-map-leak-after-failure-to-add-it-to-the-tree.patch
btrfs-zoned-fix-zone-activation-with-missing-devices.patch
btrfs-zoned-fix-zone-finishing-with-missing-devices.patch
+iommufd-fix-uninitialized-rc-in-iommufd_access_rw.patch
+iommu-tegra241-cmdqv-fix-warnings-due-to-dmam_free_coherent.patch
+iommu-vt-d-put-irte-back-into-posted-msi-mode-if-vcpu-posting-is-disabled.patch
+iommu-vt-d-don-t-clobber-posted-vcpu-irte-when-host-irq-affinity-changes.patch
+iommu-vt-d-fix-possible-circular-locking-dependency.patch
+iommu-vt-d-wire-up-irq_ack-to-irq_move_irq-for-posted-msis.patch
+sparc-mm-disable-preemption-in-lazy-mmu-mode.patch
+sparc-mm-avoid-calling-arch_enter-leave_lazy_mmu-in-set_ptes.patch
+net-fix-null-ptr-deref-by-sock_lock_init_class_and_name-and-rmmod.patch
+mm-damon-ops-have-damon_get_folio-return-folio-even-for-tail-pages.patch
+mm-rmap-reject-hugetlb-folios-in-folio_make_device_exclusive.patch
+mm-make-page_mapped_in_vma-hugetlb-walk-aware.patch
+mm-fix-lazy-mmu-docs-and-usage.patch
+mm-mremap-correctly-handle-partial-mremap-of-vma-starting-at-0.patch
+mm-add-missing-release-barrier-on-pgdat_reclaim_locked-unlock.patch
+mm-userfaultfd-fix-release-hang-over-concurrent-gup.patch
+mm-hwpoison-do-not-send-sigbus-to-processes-with-recovered-clean-pages.patch
+mm-hugetlb-move-hugetlb_sysctl_init-to-the-__init-section.patch
+mm-hwpoison-introduce-folio_contain_hwpoisoned_page-helper.patch
+sctp-detect-and-prevent-references-to-a-freed-transport-in-sendmsg.patch
--- /dev/null
+From eb61ad14c459b54f71f76331ca35d12fa3eb8f98 Mon Sep 17 00:00:00 2001
+From: Ryan Roberts <ryan.roberts@arm.com>
+Date: Mon, 3 Mar 2025 14:15:38 +0000
+Subject: sparc/mm: avoid calling arch_enter/leave_lazy_mmu() in set_ptes
+
+From: Ryan Roberts <ryan.roberts@arm.com>
+
+commit eb61ad14c459b54f71f76331ca35d12fa3eb8f98 upstream.
+
+With commit 1a10a44dfc1d ("sparc64: implement the new page table range
+API") set_ptes was added to the sparc architecture. The implementation
+included calls to arch_enter/leave_lazy_mmu().
+
+This patch removes those calls, since they imply nesting of lazy mmu
+regions, which is not supported. Without this fix, lazy mmu mode is
+effectively disabled because we exit the mode after the first set_ptes:
+
+remap_pte_range()
+ -> arch_enter_lazy_mmu()
+ -> set_ptes()
+ -> arch_enter_lazy_mmu()
+ -> arch_leave_lazy_mmu()
+ -> arch_leave_lazy_mmu()
+
+Powerpc suffered the same problem and fixed it in a corresponding way with
+commit 47b8def9358c ("powerpc/mm: Avoid calling
+arch_enter/leave_lazy_mmu() in set_ptes").
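+
+A minimal sketch of the breakage (plain C, not the sparc code; the flag
+below stands in for the per-cpu tlb_batch state): the nested leave turns
+lazy mode off even though the outer region is still in progress.
+
+ #include <stdbool.h>
+ #include <stdio.h>
+
+ static bool lazy_active;
+
+ static void arch_enter_lazy(void) { lazy_active = true; }
+ static void arch_leave_lazy(void) { lazy_active = false; /* flush here */ }
+
+ static void buggy_set_ptes(void)
+ {
+         arch_enter_lazy();      /* nested enter: flag is already set */
+         /* ... install a batch of PTEs ... */
+         arch_leave_lazy();      /* nested leave: clears the flag too early */
+ }
+
+ int main(void)
+ {
+         arch_enter_lazy();      /* remap_pte_range() opens the region */
+         buggy_set_ptes();       /* first set_ptes() call */
+         printf("lazy mode still active for the rest of the region? %s\n",
+                lazy_active ? "yes" : "no (batching lost)");
+         arch_leave_lazy();
+         return 0;
+ }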
+
+Link: https://lkml.kernel.org/r/20250303141542.3371656-5-ryan.roberts@arm.com
+Fixes: 1a10a44dfc1d ("sparc64: implement the new page table range API")
+Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
+Acked-by: David Hildenbrand <david@redhat.com>
+Acked-by: Andreas Larsson <andreas@gaisler.com>
+Acked-by: Juergen Gross <jgross@suse.com>
+Cc: Borislav Betkov <bp@alien8.de>
+Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: Catalin Marinas <catalin.marinas@arm.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: David S. Miller <davem@davemloft.net>
+Cc: "H. Peter Anvin" <hpa@zytor.com>
+Cc: Ingo Molnar <mingo@redhat.com>
+Cc: Juegren Gross <jgross@suse.com>
+Cc: Matthew Wilcow (Oracle) <willy@infradead.org>
+Cc: Thomas Gleinxer <tglx@linutronix.de>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/sparc/include/asm/pgtable_64.h | 2 --
+ 1 file changed, 2 deletions(-)
+
+--- a/arch/sparc/include/asm/pgtable_64.h
++++ b/arch/sparc/include/asm/pgtable_64.h
+@@ -936,7 +936,6 @@ static inline void __set_pte_at(struct m
+ static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte, unsigned int nr)
+ {
+- arch_enter_lazy_mmu_mode();
+ for (;;) {
+ __set_pte_at(mm, addr, ptep, pte, 0);
+ if (--nr == 0)
+@@ -945,7 +944,6 @@ static inline void set_ptes(struct mm_st
+ pte_val(pte) += PAGE_SIZE;
+ addr += PAGE_SIZE;
+ }
+- arch_leave_lazy_mmu_mode();
+ }
+ #define set_ptes set_ptes
+
--- /dev/null
+From a1d416bf9faf4f4871cb5a943614a07f80a7d70f Mon Sep 17 00:00:00 2001
+From: Ryan Roberts <ryan.roberts@arm.com>
+Date: Mon, 3 Mar 2025 14:15:37 +0000
+Subject: sparc/mm: disable preemption in lazy mmu mode
+
+From: Ryan Roberts <ryan.roberts@arm.com>
+
+commit a1d416bf9faf4f4871cb5a943614a07f80a7d70f upstream.
+
+Since commit 38e0edb15bd0 ("mm/apply_to_range: call pte function with lazy
+updates") it's been possible for arch_[enter|leave]_lazy_mmu_mode() to be
+called without holding a page table lock (for the kernel mappings case),
+and therefore it is possible that preemption may occur while in the lazy
+mmu mode. The Sparc lazy mmu implementation is not robust to preemption
+since it stores the lazy mode state in a per-cpu structure and does not
+attempt to manage that state on task switch.
+
+Powerpc had the same issue and fixed it by explicitly disabling preemption
+in arch_enter_lazy_mmu_mode() and re-enabling in
+arch_leave_lazy_mmu_mode(). See commit b9ef323ea168 ("powerpc/64s:
+Disable preemption in hash lazy mmu mode").
+
+Given Sparc's lazy mmu mode is based on powerpc's, let's fix it in the
+same way here.
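+
+A toy model of the hazard (plain C with a simulated migration; none of
+this is the sparc code): the lazy state lives in a per-cpu slot, so being
+preempted and migrated between enter and leave leaves stale state behind
+on the original CPU, while disabling preemption keeps the task on one CPU
+for the whole region.
+
+ #include <stdbool.h>
+ #include <stdio.h>
+
+ #define NR_CPUS 2
+
+ static bool tlb_batch_active[NR_CPUS];
+ static int cur_cpu;
+ static bool preempt_disabled;
+
+ static void maybe_preempt(void)
+ {
+         if (!preempt_disabled)
+                 cur_cpu = (cur_cpu + 1) % NR_CPUS;  /* simulated migration */
+ }
+
+ static void enter_lazy(void) { tlb_batch_active[cur_cpu] = true; }
+ static void leave_lazy(void) { tlb_batch_active[cur_cpu] = false; }
+
+ static bool lazy_region(bool disable_preemption)
+ {
+         preempt_disabled = disable_preemption;
+         cur_cpu = 0;
+         tlb_batch_active[0] = tlb_batch_active[1] = false;
+
+         enter_lazy();
+         maybe_preempt();        /* no page table lock held, so this can happen */
+         leave_lazy();
+
+         preempt_disabled = false;
+         return tlb_batch_active[0];   /* true: stale state left on CPU 0 */
+ }
+
+ int main(void)
+ {
+         printf("without preempt_disable(): stale per-cpu state? %s\n",
+                lazy_region(false) ? "yes (bug)" : "no");
+         printf("with preempt_disable():    stale per-cpu state? %s\n",
+                lazy_region(true) ? "yes (bug)" : "no");
+         return 0;
+ }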
+
+Link: https://lkml.kernel.org/r/20250303141542.3371656-4-ryan.roberts@arm.com
+Fixes: 38e0edb15bd0 ("mm/apply_to_range: call pte function with lazy updates")
+Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
+Acked-by: David Hildenbrand <david@redhat.com>
+Acked-by: Andreas Larsson <andreas@gaisler.com>
+Acked-by: Juergen Gross <jgross@suse.com>
+Cc: Borislav Betkov <bp@alien8.de>
+Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: Catalin Marinas <catalin.marinas@arm.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: David S. Miller <davem@davemloft.net>
+Cc: "H. Peter Anvin" <hpa@zytor.com>
+Cc: Ingo Molnar <mingo@redhat.com>
+Cc: Juegren Gross <jgross@suse.com>
+Cc: Matthew Wilcow (Oracle) <willy@infradead.org>
+Cc: Thomas Gleinxer <tglx@linutronix.de>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/sparc/mm/tlb.c | 5 ++++-
+ 1 file changed, 4 insertions(+), 1 deletion(-)
+
+--- a/arch/sparc/mm/tlb.c
++++ b/arch/sparc/mm/tlb.c
+@@ -52,8 +52,10 @@ out:
+
+ void arch_enter_lazy_mmu_mode(void)
+ {
+- struct tlb_batch *tb = this_cpu_ptr(&tlb_batch);
++ struct tlb_batch *tb;
+
++ preempt_disable();
++ tb = this_cpu_ptr(&tlb_batch);
+ tb->active = 1;
+ }
+
+@@ -64,6 +66,7 @@ void arch_leave_lazy_mmu_mode(void)
+ if (tb->tlb_nr)
+ flush_tlb_pending();
+ tb->active = 0;
++ preempt_enable();
+ }
+
+ static void tlb_batch_add_one(struct mm_struct *mm, unsigned long vaddr,