--- /dev/null
+From e9e28ebb2dbe7298a40d52a59b47479992a16bce Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 4 Apr 2025 14:14:38 +0800
+Subject: ata: pata_pxa: Fix potential NULL pointer dereference in
+ pxa_ata_probe()
+
+From: Henry Martin <bsdhenrymartin@gmail.com>
+
+[ Upstream commit ad320e408a8c95a282ab9c05cdf0c9b95e317985 ]
+
+devm_ioremap() returns NULL on error. Currently, pxa_ata_probe() does
+not check for this case, which can result in a NULL pointer dereference.
+
+Add NULL checks after the devm_ioremap() calls to prevent this issue.
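+
+For illustration, the checked-remap pattern looks like this (a minimal
+sketch with generic resource/variable names, not the exact driver code):
+
+  void __iomem *addr;
+
+  addr = devm_ioremap(&pdev->dev, res->start, resource_size(res));
+  if (!addr)
+          return -ENOMEM;     /* devm_ioremap() returned NULL */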
+
+Fixes: 2dc6c6f15da9 ("[ARM] pata_pxa: DMA-capable PATA driver")
+Signed-off-by: Henry Martin <bsdhenrymartin@gmail.com>
+Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/ata/pata_pxa.c | 6 ++++++
+ 1 file changed, 6 insertions(+)
+
+diff --git a/drivers/ata/pata_pxa.c b/drivers/ata/pata_pxa.c
+index 538bd3423d859..1bdcd6ee741d3 100644
+--- a/drivers/ata/pata_pxa.c
++++ b/drivers/ata/pata_pxa.c
+@@ -223,10 +223,16 @@ static int pxa_ata_probe(struct platform_device *pdev)
+
+ ap->ioaddr.cmd_addr = devm_ioremap(&pdev->dev, cmd_res->start,
+ resource_size(cmd_res));
++ if (!ap->ioaddr.cmd_addr)
++ return -ENOMEM;
+ ap->ioaddr.ctl_addr = devm_ioremap(&pdev->dev, ctl_res->start,
+ resource_size(ctl_res));
++ if (!ap->ioaddr.ctl_addr)
++ return -ENOMEM;
+ ap->ioaddr.bmdma_addr = devm_ioremap(&pdev->dev, dma_res->start,
+ resource_size(dma_res));
++ if (!ap->ioaddr.bmdma_addr)
++ return -ENOMEM;
+
+ /*
+ * Adjust register offsets
+--
+2.39.5
+
--- /dev/null
+From 6c4c35b408bebe76f47805f186257f9723e59224 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 8 Apr 2025 15:30:01 +0800
+Subject: ata: sata_sx4: Add error handling in pdc20621_i2c_read()
+
+From: Wentao Liang <vulab@iscas.ac.cn>
+
+[ Upstream commit 8d46a27085039158eb5e253ab8a35a0e33b5e864 ]
+
+The function pdc20621_prog_dimm0() calls pdc20621_i2c_read() but does
+not handle the case where the read fails, which could lead to processing
+invalid data. A proper implementation can be found in
+pdc20621_prog_dimm_global() in drivers/ata/sata_sx4.c. As noted in
+commit bb44e154e25125bef31fa956785e90fccd24610b, the variable spd0 might
+be used uninitialized when pdc20621_i2c_read() fails.
+
+Add error handling for the pdc20621_i2c_read() call. If a read operation
+fails, log an error message via dev_err() and return a negative error
+code.
+
+Add error handling to pdc20621_prog_dimm0() in pdc20621_dimm_init(), and
+return a negative error code if pdc20621_prog_dimm0() fails.
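+
+Annotated sketch of the checking pattern applied in the hunk below (a
+zero return from pdc20621_i2c_read() is treated as failure):
+
+  if (!pdc20621_i2c_read(host, PDC_DIMM0_SPD_DEV_ADDRESS,
+                         pdc_i2c_read_data[i].reg,
+                         &spd0[pdc_i2c_read_data[i].ofs])) {
+          /* log which device/register failed, then bail out */
+          dev_err(host->dev,
+                  "Failed in i2c read at index %d: device=%#x, reg=%#x\n",
+                  i, PDC_DIMM0_SPD_DEV_ADDRESS, pdc_i2c_read_data[i].reg);
+          return -EIO;
+  }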
+
+Fixes: 4447d3515616 ("libata: convert the remaining SATA drivers to new init model")
+Signed-off-by: Wentao Liang <vulab@iscas.ac.cn>
+Reviewed-by: Niklas Cassel <cassel@kernel.org>
+Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/ata/sata_sx4.c | 13 ++++++++++---
+ 1 file changed, 10 insertions(+), 3 deletions(-)
+
+diff --git a/drivers/ata/sata_sx4.c b/drivers/ata/sata_sx4.c
+index a482741eb181f..c3042eca6332d 100644
+--- a/drivers/ata/sata_sx4.c
++++ b/drivers/ata/sata_sx4.c
+@@ -1117,9 +1117,14 @@ static int pdc20621_prog_dimm0(struct ata_host *host)
+ mmio += PDC_CHIP0_OFS;
+
+ for (i = 0; i < ARRAY_SIZE(pdc_i2c_read_data); i++)
+- pdc20621_i2c_read(host, PDC_DIMM0_SPD_DEV_ADDRESS,
+- pdc_i2c_read_data[i].reg,
+- &spd0[pdc_i2c_read_data[i].ofs]);
++ if (!pdc20621_i2c_read(host, PDC_DIMM0_SPD_DEV_ADDRESS,
++ pdc_i2c_read_data[i].reg,
++ &spd0[pdc_i2c_read_data[i].ofs])) {
++ dev_err(host->dev,
++ "Failed in i2c read at index %d: device=%#x, reg=%#x\n",
++ i, PDC_DIMM0_SPD_DEV_ADDRESS, pdc_i2c_read_data[i].reg);
++ return -EIO;
++ }
+
+ data |= (spd0[4] - 8) | ((spd0[21] != 0) << 3) | ((spd0[3]-11) << 4);
+ data |= ((spd0[17] / 4) << 6) | ((spd0[5] / 2) << 7) |
+@@ -1284,6 +1289,8 @@ static unsigned int pdc20621_dimm_init(struct ata_host *host)
+
+ /* Programming DIMM0 Module Control Register (index_CID0:80h) */
+ size = pdc20621_prog_dimm0(host);
++ if (size < 0)
++ return size;
+ dev_dbg(host->dev, "Local DIMM Size = %dMB\n", size);
+
+ /* Programming DIMM Module Global Control Register (index_CID0:88h) */
+--
+2.39.5
+
--- /dev/null
+From dfec9762be414d6cb96d9dd919239b0b584355ac Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sat, 9 Nov 2024 21:50:22 -0500
+Subject: cgroup/cpuset: Enforce at most one rebuild_sched_domains_locked()
+ call per operation
+
+From: Waiman Long <longman@redhat.com>
+
+[ Upstream commit a040c351283e3ac75422621ea205b1d8d687e108 ]
+
+Since commit ff0ce721ec21 ("cgroup/cpuset: Eliminate unncessary
+sched domains rebuilds in hotplug"), there is only one
+rebuild_sched_domains_locked() call per hotplug operation. However,
+writing to the various cpuset control files may still cause more than
+one rebuild_sched_domains_locked() call to happen in some cases.
+
+Juri found that two rebuild_sched_domains_locked() calls in
+update_prstate(), one from update_cpumasks_hier() and another from
+update_partition_sd_lb(), could cause a cpuset partition to be created
+with a null total_bw for DL tasks. IOW, DL tasks may not be scheduled
+correctly in such a partition.
+
+A sample command sequence that can reproduce null total_bw is as
+follows.
+
+ # echo Y >/sys/kernel/debug/sched/verbose
+ # echo +cpuset >/sys/fs/cgroup/cgroup.subtree_control
+ # mkdir /sys/fs/cgroup/test
+ # echo 0-7 > /sys/fs/cgroup/test/cpuset.cpus
+ # echo 6-7 > /sys/fs/cgroup/test/cpuset.cpus.exclusive
+ # echo root >/sys/fs/cgroup/test/cpuset.cpus.partition
+
+Fix this double rebuild_sched_domains_locked() call problem by
+replacing the existing calls with cpuset_force_rebuild(), except for
+the rebuild_sched_domains_cpuslocked() call at the end of
+cpuset_handle_hotplug(). The force_sd_rebuild flag is now checked at
+the end of cpuset_write_resmask() and update_prstate() to determine
+whether rebuild_sched_domains_locked() should be called or not.
+
+The cpuset v1 code can still call rebuild_sched_domains_locked()
+directly, as double rebuild_sched_domains_locked() calls are not
+possible there.
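+
+In outline, the deferral pattern is (simplified sketch, mirroring the
+hunks below):
+
+  /* intermediate steps only record that a rebuild is needed */
+  if (rebuild_domains)
+          cpuset_force_rebuild();         /* sets force_sd_rebuild */
+
+  /* ... */
+
+  /* a single check at the end of the operation does the real work */
+  if (force_sd_rebuild)
+          rebuild_sched_domains_locked(); /* clears force_sd_rebuild */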
+
+Reported-by: Juri Lelli <juri.lelli@redhat.com>
+Closes: https://lore.kernel.org/lkml/ZyuUcJDPBln1BK1Y@jlelli-thinkpadt14gen4.remote.csb/
+Signed-off-by: Waiman Long <longman@redhat.com>
+Tested-by: Juri Lelli <juri.lelli@redhat.com>
+Signed-off-by: Tejun Heo <tj@kernel.org>
+Stable-dep-of: a22b3d54de94 ("cgroup/cpuset: Fix race between newly created partition and dying one")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/cgroup/cpuset.c | 49 ++++++++++++++++++++++++++++--------------
+ 1 file changed, 33 insertions(+), 16 deletions(-)
+
+diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
+index 0012c34bb8601..7ac2a634128b3 100644
+--- a/kernel/cgroup/cpuset.c
++++ b/kernel/cgroup/cpuset.c
+@@ -84,9 +84,19 @@ static bool have_boot_isolcpus;
+ static struct list_head remote_children;
+
+ /*
+- * A flag to force sched domain rebuild at the end of an operation while
+- * inhibiting it in the intermediate stages when set. Currently it is only
+- * set in hotplug code.
++ * A flag to force sched domain rebuild at the end of an operation.
++ * It can be set in
++ * - update_partition_sd_lb()
++ * - remote_partition_check()
++ * - update_cpumasks_hier()
++ * - cpuset_update_flag()
++ * - cpuset_hotplug_update_tasks()
++ * - cpuset_handle_hotplug()
++ *
++ * Protected by cpuset_mutex (with cpus_read_lock held) or cpus_write_lock.
++ *
++ * Note that update_relax_domain_level() in cpuset-v1.c can still call
++ * rebuild_sched_domains_locked() directly without using this flag.
+ */
+ static bool force_sd_rebuild;
+
+@@ -998,6 +1008,7 @@ void rebuild_sched_domains_locked(void)
+
+ lockdep_assert_cpus_held();
+ lockdep_assert_held(&cpuset_mutex);
++ force_sd_rebuild = false;
+
+ /*
+ * If we have raced with CPU hotplug, return early to avoid
+@@ -1172,8 +1183,8 @@ static void update_partition_sd_lb(struct cpuset *cs, int old_prs)
+ clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
+ }
+
+- if (rebuild_domains && !force_sd_rebuild)
+- rebuild_sched_domains_locked();
++ if (rebuild_domains)
++ cpuset_force_rebuild();
+ }
+
+ /*
+@@ -1530,8 +1541,8 @@ static void remote_partition_check(struct cpuset *cs, struct cpumask *newmask,
+ remote_partition_disable(child, tmp);
+ disable_cnt++;
+ }
+- if (disable_cnt && !force_sd_rebuild)
+- rebuild_sched_domains_locked();
++ if (disable_cnt)
++ cpuset_force_rebuild();
+ }
+
+ /*
+@@ -2124,8 +2135,8 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
+ }
+ rcu_read_unlock();
+
+- if (need_rebuild_sched_domains && !force_sd_rebuild)
+- rebuild_sched_domains_locked();
++ if (need_rebuild_sched_domains)
++ cpuset_force_rebuild();
+ }
+
+ /**
+@@ -2744,9 +2755,13 @@ int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
+ cs->flags = trialcs->flags;
+ spin_unlock_irq(&callback_lock);
+
+- if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed &&
+- !force_sd_rebuild)
+- rebuild_sched_domains_locked();
++ if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) {
++ if (!IS_ENABLED(CONFIG_CPUSETS_V1) ||
++ cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
++ cpuset_force_rebuild();
++ else
++ rebuild_sched_domains_locked();
++ }
+
+ if (spread_flag_changed)
+ cpuset1_update_tasks_flags(cs);
+@@ -2866,6 +2881,8 @@ static int update_prstate(struct cpuset *cs, int new_prs)
+ update_partition_sd_lb(cs, old_prs);
+
+ notify_partition_change(cs, old_prs);
++ if (force_sd_rebuild)
++ rebuild_sched_domains_locked();
+ free_cpumasks(NULL, &tmpmask);
+ return 0;
+ }
+@@ -3136,6 +3153,8 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
+ }
+
+ free_cpuset(trialcs);
++ if (force_sd_rebuild)
++ rebuild_sched_domains_locked();
+ out_unlock:
+ mutex_unlock(&cpuset_mutex);
+ cpus_read_unlock();
+@@ -3879,11 +3898,9 @@ static void cpuset_handle_hotplug(void)
+ rcu_read_unlock();
+ }
+
+- /* rebuild sched domains if cpus_allowed has changed */
+- if (force_sd_rebuild) {
+- force_sd_rebuild = false;
++ /* rebuild sched domains if necessary */
++ if (force_sd_rebuild)
+ rebuild_sched_domains_cpuslocked();
+- }
+
+ free_cpumasks(NULL, ptmp);
+ }
+--
+2.39.5
+
--- /dev/null
+From 404218837e9d61a1e8080c32dd86273693626a86 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sun, 30 Mar 2025 17:52:41 -0400
+Subject: cgroup/cpuset: Fix error handling in remote_partition_disable()
+
+From: Waiman Long <longman@redhat.com>
+
+[ Upstream commit 8bf450f3aec3d1bbd725d179502c64b8992588e4 ]
+
+When remote_partition_disable() is called to disable a remote partition,
+it always sets the partition to an invalid partition state. It should
+only do so if an error code (prs_err) has been set. Correct that and
+set a proper error code in the places where remote_partition_disable()
+is called due to an error.
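+
+For example, in remote_cpus_update() the error reason is now recorded
+before jumping to the invalidate path (sketch of the hunk below):
+
+  if (cpumask_empty(newmask)) {
+          cs->prs_err = PERR_CPUSEMPTY;   /* record why before disabling */
+          goto invalidate;
+  }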
+
+Fixes: 181c8e091aae ("cgroup/cpuset: Introduce remote partition")
+Signed-off-by: Waiman Long <longman@redhat.com>
+Signed-off-by: Tejun Heo <tj@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/cgroup/cpuset.c | 29 ++++++++++++++++++++---------
+ 1 file changed, 20 insertions(+), 9 deletions(-)
+
+diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
+index f7ad5651c93db..70fac05123c6d 100644
+--- a/kernel/cgroup/cpuset.c
++++ b/kernel/cgroup/cpuset.c
+@@ -1383,6 +1383,7 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs,
+ list_add(&cs->remote_sibling, &remote_children);
+ spin_unlock_irq(&callback_lock);
+ update_unbound_workqueue_cpumask(isolcpus_updated);
++ cs->prs_err = 0;
+
+ /*
+ * Proprogate changes in top_cpuset's effective_cpus down the hierarchy.
+@@ -1413,9 +1414,11 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp)
+ list_del_init(&cs->remote_sibling);
+ isolcpus_updated = partition_xcpus_del(cs->partition_root_state,
+ NULL, tmp->new_cpus);
+- cs->partition_root_state = -cs->partition_root_state;
+- if (!cs->prs_err)
+- cs->prs_err = PERR_INVCPUS;
++ if (cs->prs_err)
++ cs->partition_root_state = -cs->partition_root_state;
++ else
++ cs->partition_root_state = PRS_MEMBER;
++
+ reset_partition_data(cs);
+ spin_unlock_irq(&callback_lock);
+ update_unbound_workqueue_cpumask(isolcpus_updated);
+@@ -1448,8 +1451,10 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask,
+
+ WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus));
+
+- if (cpumask_empty(newmask))
++ if (cpumask_empty(newmask)) {
++ cs->prs_err = PERR_CPUSEMPTY;
+ goto invalidate;
++ }
+
+ adding = cpumask_andnot(tmp->addmask, newmask, cs->effective_xcpus);
+ deleting = cpumask_andnot(tmp->delmask, cs->effective_xcpus, newmask);
+@@ -1459,10 +1464,15 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask,
+ * not allocated to other partitions and there are effective_cpus
+ * left in the top cpuset.
+ */
+- if (adding && (!capable(CAP_SYS_ADMIN) ||
+- cpumask_intersects(tmp->addmask, subpartitions_cpus) ||
+- cpumask_subset(top_cpuset.effective_cpus, tmp->addmask)))
+- goto invalidate;
++ if (adding) {
++ if (!capable(CAP_SYS_ADMIN))
++ cs->prs_err = PERR_ACCESS;
++ else if (cpumask_intersects(tmp->addmask, subpartitions_cpus) ||
++ cpumask_subset(top_cpuset.effective_cpus, tmp->addmask))
++ cs->prs_err = PERR_NOCPUS;
++ if (cs->prs_err)
++ goto invalidate;
++ }
+
+ spin_lock_irq(&callback_lock);
+ if (adding)
+@@ -1578,7 +1588,7 @@ static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus)
+ * The partcmd_update command is used by update_cpumasks_hier() with newmask
+ * NULL and update_cpumask() with newmask set. The partcmd_invalidate is used
+ * by update_cpumask() with NULL newmask. In both cases, the callers won't
+- * check for error and so partition_root_state and prs_error will be updated
++ * check for error and so partition_root_state and prs_err will be updated
+ * directly.
+ */
+ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
+@@ -3726,6 +3736,7 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
+
+ if (remote && cpumask_empty(&new_cpus) &&
+ partition_is_populated(cs, NULL)) {
++ cs->prs_err = PERR_HOTPLUG;
+ remote_partition_disable(cs, tmp);
+ compute_effective_cpumask(&new_cpus, cs, parent);
+ remote = false;
+--
+2.39.5
+
--- /dev/null
+From b81b1f0815c00515c983706d1a70a44cff8c1972 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sun, 30 Mar 2025 17:52:40 -0400
+Subject: cgroup/cpuset: Fix incorrect isolated_cpus update in
+ update_parent_effective_cpumask()
+
+From: Waiman Long <longman@redhat.com>
+
+[ Upstream commit 668e041662e92ab3ebcb9eb606d3ec01884546ab ]
+
+Before commit f0af1bfc27b5 ("cgroup/cpuset: Relax constraints to
+partition & cpus changes"), a cpuset partition could not be enabled if
+not all the requested CPUs could be granted from the parent cpuset.
+After that commit, a cpuset partition can be created even if the
+requested exclusive CPUs contain CPUs not allowed in its parent. The
+delmask containing the exclusive CPUs to be removed from the parent
+wasn't adjusted accordingly.
+
+That is not a problem until the introduction of a new isolated_cpus
+mask in commit 11e5f407b64a ("cgroup/cpuset: Keep track of CPUs in
+isolated partitions") as the CPUs in the delmask may be added directly
+into isolated_cpus.
+
+As a result, isolated_cpus may incorrectly contain CPUs that are not
+isolated, leading to incorrect data reporting. Fix this by adjusting
+the delmask to reflect the actual exclusive CPUs for the creation of
+the partition.
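+
+Concretely, the delmask is now intersected with the parent's
+effective_xcpus (sketch of the hunk below):
+
+  /* only remove CPUs that the parent actually owns */
+  deleting = cpumask_and(tmp->delmask, xcpus, parent->effective_xcpus);
+  if (deleting)
+          subparts_delta++;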
+
+Fixes: 11e5f407b64a ("cgroup/cpuset: Keep track of CPUs in isolated partitions")
+Signed-off-by: Waiman Long <longman@redhat.com>
+Signed-off-by: Tejun Heo <tj@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/cgroup/cpuset.c | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
+index 24ece85fd3b12..f7ad5651c93db 100644
+--- a/kernel/cgroup/cpuset.c
++++ b/kernel/cgroup/cpuset.c
+@@ -1656,9 +1656,9 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
+ if (nocpu)
+ return PERR_NOCPUS;
+
+- cpumask_copy(tmp->delmask, xcpus);
+- deleting = true;
+- subparts_delta++;
++ deleting = cpumask_and(tmp->delmask, xcpus, parent->effective_xcpus);
++ if (deleting)
++ subparts_delta++;
+ new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED;
+ } else if (cmd == partcmd_disable) {
+ /*
+--
+2.39.5
+
--- /dev/null
+From bcd649e716bee3dbdaf99ca56a639b8e639def7a Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sun, 30 Mar 2025 17:52:39 -0400
+Subject: cgroup/cpuset: Fix race between newly created partition and dying one
+
+From: Waiman Long <longman@redhat.com>
+
+[ Upstream commit a22b3d54de94f82ca057cc2ebf9496fa91ebf698 ]
+
+There is a possible race between removing a cgroup directory that is
+a partition root and the creation of a new partition. The partition
+to be removed can be dying but still online; it does not currently
+participate in checking for exclusive CPU conflicts, but its exclusive
+CPUs are still present in subpartitions_cpus and isolated_cpus. These
+two cpumasks are global states that affect the operation of cpuset
+partitions. The exclusive CPUs in dying cpusets will only be removed
+when the cpuset_css_offline() function is called after an RCU delay.
+
+As a result, it is possible that a new partition can be created with
+exclusive CPUs that overlap with those of a dying one. When that dying
+partition is finally offlined, it removes those overlapping exclusive
+CPUs from subpartitions_cpus and possibly isolated_cpus, resulting in
+an incorrect CPU configuration.
+
+This bug was found when a warning was triggered in
+remote_partition_disable() during testing because the subpartitions_cpus
+mask was empty.
+
+One possible way to fix this is to iterate the dying cpusets as well and
+avoid using the exclusive CPUs in those dying cpusets. However, this
+can still cause random partition creation failures or other anomalies
+due to racing. A better way to fix this race is to reset the partition
+state at the moment when a cpuset is being killed.
+
+Introduce a new css_killed() CSS function pointer and call it, if
+defined, before setting CSS_DYING flag in kill_css(). Also update the
+css_is_dying() helper to use the CSS_DYING flag introduced by commit
+33c35aa48178 ("cgroup: Prevent kill_css() from being called more than
+once") for proper synchronization.
+
+Add a new cpuset_css_killed() function to reset the partition state of
+a valid partition root if it is being killed.
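+
+In outline, the new hook is invoked from kill_css() just before the
+CSS_DYING flag is set (simplified sketch of the cgroup.c hunk below):
+
+  static void kill_css(struct cgroup_subsys_state *css)
+  {
+          if (css->flags & CSS_DYING)
+                  return;
+
+          /* let the subsystem (e.g. cpuset) reset partition state first */
+          if (css->ss->css_killed)
+                  css->ss->css_killed(css);
+
+          css->flags |= CSS_DYING;
+          /* ... existing css teardown continues ... */
+  }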
+
+Fixes: ee8dde0cd2ce ("cpuset: Add new v2 cpuset.sched.partition flag")
+Signed-off-by: Waiman Long <longman@redhat.com>
+Signed-off-by: Tejun Heo <tj@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/linux/cgroup-defs.h | 1 +
+ include/linux/cgroup.h | 2 +-
+ kernel/cgroup/cgroup.c | 6 ++++++
+ kernel/cgroup/cpuset.c | 20 +++++++++++++++++---
+ 4 files changed, 25 insertions(+), 4 deletions(-)
+
+diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
+index 38b2af336e4a0..252eed781a6e9 100644
+--- a/include/linux/cgroup-defs.h
++++ b/include/linux/cgroup-defs.h
+@@ -711,6 +711,7 @@ struct cgroup_subsys {
+ void (*css_released)(struct cgroup_subsys_state *css);
+ void (*css_free)(struct cgroup_subsys_state *css);
+ void (*css_reset)(struct cgroup_subsys_state *css);
++ void (*css_killed)(struct cgroup_subsys_state *css);
+ void (*css_rstat_flush)(struct cgroup_subsys_state *css, int cpu);
+ int (*css_extra_stat_show)(struct seq_file *seq,
+ struct cgroup_subsys_state *css);
+diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
+index f8ef47f8a634d..fc1324ed597d6 100644
+--- a/include/linux/cgroup.h
++++ b/include/linux/cgroup.h
+@@ -343,7 +343,7 @@ static inline u64 cgroup_id(const struct cgroup *cgrp)
+ */
+ static inline bool css_is_dying(struct cgroup_subsys_state *css)
+ {
+- return !(css->flags & CSS_NO_REF) && percpu_ref_is_dying(&css->refcnt);
++ return css->flags & CSS_DYING;
+ }
+
+ static inline void cgroup_get(struct cgroup *cgrp)
+diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
+index 216535e055e11..4378f3eff25d2 100644
+--- a/kernel/cgroup/cgroup.c
++++ b/kernel/cgroup/cgroup.c
+@@ -5909,6 +5909,12 @@ static void kill_css(struct cgroup_subsys_state *css)
+ if (css->flags & CSS_DYING)
+ return;
+
++ /*
++ * Call css_killed(), if defined, before setting the CSS_DYING flag
++ */
++ if (css->ss->css_killed)
++ css->ss->css_killed(css);
++
+ css->flags |= CSS_DYING;
+
+ /*
+diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
+index 07ea3a563150b..839f88ba17f7d 100644
+--- a/kernel/cgroup/cpuset.c
++++ b/kernel/cgroup/cpuset.c
+@@ -3479,9 +3479,6 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
+ cpus_read_lock();
+ mutex_lock(&cpuset_mutex);
+
+- if (is_partition_valid(cs))
+- update_prstate(cs, 0);
+-
+ if (!cpuset_v2() && is_sched_load_balance(cs))
+ cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
+
+@@ -3492,6 +3489,22 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
+ cpus_read_unlock();
+ }
+
++static void cpuset_css_killed(struct cgroup_subsys_state *css)
++{
++ struct cpuset *cs = css_cs(css);
++
++ cpus_read_lock();
++ mutex_lock(&cpuset_mutex);
++
++ /* Reset valid partition back to member */
++ if (is_partition_valid(cs))
++ update_prstate(cs, PRS_MEMBER);
++
++ mutex_unlock(&cpuset_mutex);
++ cpus_read_unlock();
++
++}
++
+ static void cpuset_css_free(struct cgroup_subsys_state *css)
+ {
+ struct cpuset *cs = css_cs(css);
+@@ -3613,6 +3626,7 @@ struct cgroup_subsys cpuset_cgrp_subsys = {
+ .css_alloc = cpuset_css_alloc,
+ .css_online = cpuset_css_online,
+ .css_offline = cpuset_css_offline,
++ .css_killed = cpuset_css_killed,
+ .css_free = cpuset_css_free,
+ .can_attach = cpuset_can_attach,
+ .cancel_attach = cpuset_cancel_attach,
+--
+2.39.5
+
--- /dev/null
+From afe6105bbd1857ec636b7b285c874a46e81edde0 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sat, 9 Nov 2024 21:50:23 -0500
+Subject: cgroup/cpuset: Further optimize code if CONFIG_CPUSETS_V1 not set
+
+From: Waiman Long <longman@redhat.com>
+
+[ Upstream commit c4c9cebe2fb9cdc73e55513de7af7a4f50260e88 ]
+
+Currently the cpuset code uses cgroup_subsys_on_dfl() to check if we
+are running with cgroup v2. If CONFIG_CPUSETS_V1 isn't set, there is
+really no need to do this check and we can optimize out some of the
+unneeded v1-specific code paths. Introduce a new cpuset_v2() helper and
+use it to replace the cgroup_subsys_on_dfl() checks to further optimize
+the code.
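+
+The helper added below boils down to a compile-time check (annotated
+copy for clarity):
+
+  static inline bool cpuset_v2(void)
+  {
+          /* constant-folds to true when CONFIG_CPUSETS_V1 is not set,
+           * letting the compiler drop the v1-only branches entirely
+           */
+          return !IS_ENABLED(CONFIG_CPUSETS_V1) ||
+                  cgroup_subsys_on_dfl(cpuset_cgrp_subsys);
+  }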
+
+Signed-off-by: Waiman Long <longman@redhat.com>
+Signed-off-by: Tejun Heo <tj@kernel.org>
+Stable-dep-of: a22b3d54de94 ("cgroup/cpuset: Fix race between newly created partition and dying one")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/cgroup/cpuset.c | 39 +++++++++++++++++++--------------------
+ 1 file changed, 19 insertions(+), 20 deletions(-)
+
+diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
+index 7ac2a634128b3..07ea3a563150b 100644
+--- a/kernel/cgroup/cpuset.c
++++ b/kernel/cgroup/cpuset.c
+@@ -293,6 +293,12 @@ static inline void dec_attach_in_progress(struct cpuset *cs)
+ mutex_unlock(&cpuset_mutex);
+ }
+
++static inline bool cpuset_v2(void)
++{
++ return !IS_ENABLED(CONFIG_CPUSETS_V1) ||
++ cgroup_subsys_on_dfl(cpuset_cgrp_subsys);
++}
++
+ /*
+ * Cgroup v2 behavior is used on the "cpus" and "mems" control files when
+ * on default hierarchy or when the cpuset_v2_mode flag is set by mounting
+@@ -303,7 +309,7 @@ static inline void dec_attach_in_progress(struct cpuset *cs)
+ */
+ static inline bool is_in_v2_mode(void)
+ {
+- return cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
++ return cpuset_v2() ||
+ (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
+ }
+
+@@ -738,7 +744,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
+ int nslot; /* next empty doms[] struct cpumask slot */
+ struct cgroup_subsys_state *pos_css;
+ bool root_load_balance = is_sched_load_balance(&top_cpuset);
+- bool cgrpv2 = cgroup_subsys_on_dfl(cpuset_cgrp_subsys);
++ bool cgrpv2 = cpuset_v2();
+ int nslot_update;
+
+ doms = NULL;
+@@ -1206,7 +1212,7 @@ static void reset_partition_data(struct cpuset *cs)
+ {
+ struct cpuset *parent = parent_cs(cs);
+
+- if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
++ if (!cpuset_v2())
+ return;
+
+ lockdep_assert_held(&callback_lock);
+@@ -2035,7 +2041,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
+ */
+ if (!cp->partition_root_state && !force &&
+ cpumask_equal(tmp->new_cpus, cp->effective_cpus) &&
+- (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
++ (!cpuset_v2() ||
+ (is_sched_load_balance(parent) == is_sched_load_balance(cp)))) {
+ pos_css = css_rightmost_descendant(pos_css);
+ continue;
+@@ -2109,8 +2115,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
+ * from parent if current cpuset isn't a valid partition root
+ * and their load balance states differ.
+ */
+- if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+- !is_partition_valid(cp) &&
++ if (cpuset_v2() && !is_partition_valid(cp) &&
+ (is_sched_load_balance(parent) != is_sched_load_balance(cp))) {
+ if (is_sched_load_balance(parent))
+ set_bit(CS_SCHED_LOAD_BALANCE, &cp->flags);
+@@ -2126,8 +2131,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
+ */
+ if (!cpumask_empty(cp->cpus_allowed) &&
+ is_sched_load_balance(cp) &&
+- (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
+- is_partition_valid(cp)))
++ (!cpuset_v2() || is_partition_valid(cp)))
+ need_rebuild_sched_domains = true;
+
+ rcu_read_lock();
+@@ -2264,7 +2268,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
+
+ retval = validate_change(cs, trialcs);
+
+- if ((retval == -EINVAL) && cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
++ if ((retval == -EINVAL) && cpuset_v2()) {
+ struct cgroup_subsys_state *css;
+ struct cpuset *cp;
+
+@@ -2756,8 +2760,7 @@ int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
+ spin_unlock_irq(&callback_lock);
+
+ if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) {
+- if (!IS_ENABLED(CONFIG_CPUSETS_V1) ||
+- cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
++ if (cpuset_v2())
+ cpuset_force_rebuild();
+ else
+ rebuild_sched_domains_locked();
+@@ -2943,8 +2946,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
+ * migration permission derives from hierarchy ownership in
+ * cgroup_procs_write_permission()).
+ */
+- if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
+- (cpus_updated || mems_updated)) {
++ if (!cpuset_v2() || (cpus_updated || mems_updated)) {
+ ret = security_task_setscheduler(task);
+ if (ret)
+ goto out_unlock;
+@@ -3058,8 +3060,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
+ * in effective cpus and mems. In that case, we can optimize out
+ * by skipping the task iteration and update.
+ */
+- if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+- !cpus_updated && !mems_updated) {
++ if (cpuset_v2() && !cpus_updated && !mems_updated) {
+ cpuset_attach_nodemask_to = cs->effective_mems;
+ goto out;
+ }
+@@ -3384,7 +3385,7 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
+ INIT_LIST_HEAD(&cs->remote_sibling);
+
+ /* Set CS_MEMORY_MIGRATE for default hierarchy */
+- if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
++ if (cpuset_v2())
+ __set_bit(CS_MEMORY_MIGRATE, &cs->flags);
+
+ return &cs->css;
+@@ -3411,8 +3412,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
+ /*
+ * For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated
+ */
+- if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+- !is_sched_load_balance(parent))
++ if (cpuset_v2() && !is_sched_load_balance(parent))
+ clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
+
+ cpuset_inc();
+@@ -3482,8 +3482,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
+ if (is_partition_valid(cs))
+ update_prstate(cs, 0);
+
+- if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+- is_sched_load_balance(cs))
++ if (!cpuset_v2() && is_sched_load_balance(cs))
+ cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
+
+ cpuset_dec();
+--
+2.39.5
+
--- /dev/null
+From 9504cb2ff3491769f43d10a648dae93bce2cd795 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sat, 9 Nov 2024 21:50:21 -0500
+Subject: cgroup/cpuset: Revert "Allow suppression of sched domain rebuild in
+ update_cpumasks_hier()"
+
+From: Waiman Long <longman@redhat.com>
+
+[ Upstream commit bcd7012afd7bcd45fcd7a0e2f48e57b273702317 ]
+
+Revert commit 3ae0b773211e ("cgroup/cpuset: Allow suppression of sched
+domain rebuild in update_cpumasks_hier()") to allow for an alternative
+way to suppress unnecessary rebuild_sched_domains_locked() calls in
+update_cpumasks_hier() and elsewhere in a following commit.
+
+Signed-off-by: Waiman Long <longman@redhat.com>
+Signed-off-by: Tejun Heo <tj@kernel.org>
+Stable-dep-of: a22b3d54de94 ("cgroup/cpuset: Fix race between newly created partition and dying one")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/cgroup/cpuset.c | 39 ++++++++++++++-------------------------
+ 1 file changed, 14 insertions(+), 25 deletions(-)
+
+diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
+index 70fac05123c6d..0012c34bb8601 100644
+--- a/kernel/cgroup/cpuset.c
++++ b/kernel/cgroup/cpuset.c
+@@ -1940,12 +1940,6 @@ static void compute_partition_effective_cpumask(struct cpuset *cs,
+ rcu_read_unlock();
+ }
+
+-/*
+- * update_cpumasks_hier() flags
+- */
+-#define HIER_CHECKALL 0x01 /* Check all cpusets with no skipping */
+-#define HIER_NO_SD_REBUILD 0x02 /* Don't rebuild sched domains */
+-
+ /*
+ * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
+ * @cs: the cpuset to consider
+@@ -1960,7 +1954,7 @@ static void compute_partition_effective_cpumask(struct cpuset *cs,
+ * Called with cpuset_mutex held
+ */
+ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
+- int flags)
++ bool force)
+ {
+ struct cpuset *cp;
+ struct cgroup_subsys_state *pos_css;
+@@ -2025,10 +2019,10 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
+ * Skip the whole subtree if
+ * 1) the cpumask remains the same,
+ * 2) has no partition root state,
+- * 3) HIER_CHECKALL flag not set, and
++ * 3) force flag not set, and
+ * 4) for v2 load balance state same as its parent.
+ */
+- if (!cp->partition_root_state && !(flags & HIER_CHECKALL) &&
++ if (!cp->partition_root_state && !force &&
+ cpumask_equal(tmp->new_cpus, cp->effective_cpus) &&
+ (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
+ (is_sched_load_balance(parent) == is_sched_load_balance(cp)))) {
+@@ -2130,8 +2124,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
+ }
+ rcu_read_unlock();
+
+- if (need_rebuild_sched_domains && !(flags & HIER_NO_SD_REBUILD) &&
+- !force_sd_rebuild)
++ if (need_rebuild_sched_domains && !force_sd_rebuild)
+ rebuild_sched_domains_locked();
+ }
+
+@@ -2159,9 +2152,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
+ * directly.
+ *
+ * The update_cpumasks_hier() function may sleep. So we have to
+- * release the RCU read lock before calling it. HIER_NO_SD_REBUILD
+- * flag is used to suppress rebuild of sched domains as the callers
+- * will take care of that.
++ * release the RCU read lock before calling it.
+ */
+ rcu_read_lock();
+ cpuset_for_each_child(sibling, pos_css, parent) {
+@@ -2177,7 +2168,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
+ continue;
+
+ rcu_read_unlock();
+- update_cpumasks_hier(sibling, tmp, HIER_NO_SD_REBUILD);
++ update_cpumasks_hier(sibling, tmp, false);
+ rcu_read_lock();
+ css_put(&sibling->css);
+ }
+@@ -2197,7 +2188,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
+ struct tmpmasks tmp;
+ struct cpuset *parent = parent_cs(cs);
+ bool invalidate = false;
+- int hier_flags = 0;
++ bool force = false;
+ int old_prs = cs->partition_root_state;
+
+ /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
+@@ -2258,8 +2249,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
+ * Check all the descendants in update_cpumasks_hier() if
+ * effective_xcpus is to be changed.
+ */
+- if (!cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus))
+- hier_flags = HIER_CHECKALL;
++ force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus);
+
+ retval = validate_change(cs, trialcs);
+
+@@ -2327,7 +2317,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
+ spin_unlock_irq(&callback_lock);
+
+ /* effective_cpus/effective_xcpus will be updated here */
+- update_cpumasks_hier(cs, &tmp, hier_flags);
++ update_cpumasks_hier(cs, &tmp, force);
+
+ /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */
+ if (cs->partition_root_state)
+@@ -2352,7 +2342,7 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
+ struct tmpmasks tmp;
+ struct cpuset *parent = parent_cs(cs);
+ bool invalidate = false;
+- int hier_flags = 0;
++ bool force = false;
+ int old_prs = cs->partition_root_state;
+
+ if (!*buf) {
+@@ -2375,8 +2365,7 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
+ * Check all the descendants in update_cpumasks_hier() if
+ * effective_xcpus is to be changed.
+ */
+- if (!cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus))
+- hier_flags = HIER_CHECKALL;
++ force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus);
+
+ retval = validate_change(cs, trialcs);
+ if (retval)
+@@ -2429,8 +2418,8 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
+ * of the subtree when it is a valid partition root or effective_xcpus
+ * is updated.
+ */
+- if (is_partition_valid(cs) || hier_flags)
+- update_cpumasks_hier(cs, &tmp, hier_flags);
++ if (is_partition_valid(cs) || force)
++ update_cpumasks_hier(cs, &tmp, force);
+
+ /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */
+ if (cs->partition_root_state)
+@@ -2871,7 +2860,7 @@ static int update_prstate(struct cpuset *cs, int new_prs)
+ update_unbound_workqueue_cpumask(new_xcpus_state);
+
+ /* Force update if switching back to member */
+- update_cpumasks_hier(cs, &tmpmask, !new_prs ? HIER_CHECKALL : 0);
++ update_cpumasks_hier(cs, &tmpmask, !new_prs);
+
+ /* Update sched domains and load balance flag */
+ update_partition_sd_lb(cs, old_prs);
+--
+2.39.5
+
--- /dev/null
+From 988f1acdfd23857c1b73b1e6358b88adbe7b8c92 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 3 Apr 2025 14:16:31 -0700
+Subject: codel: remove sch->q.qlen check before qdisc_tree_reduce_backlog()
+
+From: Cong Wang <xiyou.wangcong@gmail.com>
+
+[ Upstream commit 342debc12183b51773b3345ba267e9263bdfaaef ]
+
+After making all ->qlen_notify() callbacks idempotent, it is now safe to
+remove the qlen != 0 check from both fq_codel_dequeue() and
+codel_qdisc_dequeue().
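+
+The call then no longer depends on the queue length (sketch of the
+resulting pattern; safe now that ->qlen_notify() callbacks are
+idempotent):
+
+  if (q->stats.drop_count) {
+          qdisc_tree_reduce_backlog(sch, q->stats.drop_count,
+                                    q->stats.drop_len);
+          q->stats.drop_count = 0;
+          q->stats.drop_len = 0;
+  }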
+
+Reported-by: Gerrard Tai <gerrard.tai@starlabs.sg>
+Fixes: 4b549a2ef4be ("fq_codel: Fair Queue Codel AQM")
+Fixes: 76e3cc126bb2 ("codel: Controlled Delay AQM")
+Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Link: https://patch.msgid.link/20250403211636.166257-1-xiyou.wangcong@gmail.com
+Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/sched/sch_codel.c | 5 +----
+ net/sched/sch_fq_codel.c | 6 ++----
+ 2 files changed, 3 insertions(+), 8 deletions(-)
+
+diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c
+index 3e8d4fe4d91e3..e1f6e7618debd 100644
+--- a/net/sched/sch_codel.c
++++ b/net/sched/sch_codel.c
+@@ -65,10 +65,7 @@ static struct sk_buff *codel_qdisc_dequeue(struct Qdisc *sch)
+ &q->stats, qdisc_pkt_len, codel_get_enqueue_time,
+ drop_func, dequeue_func);
+
+- /* We cant call qdisc_tree_reduce_backlog() if our qlen is 0,
+- * or HTB crashes. Defer it for next round.
+- */
+- if (q->stats.drop_count && sch->q.qlen) {
++ if (q->stats.drop_count) {
+ qdisc_tree_reduce_backlog(sch, q->stats.drop_count, q->stats.drop_len);
+ q->stats.drop_count = 0;
+ q->stats.drop_len = 0;
+diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
+index 4f908c11ba952..778f6e5966be8 100644
+--- a/net/sched/sch_fq_codel.c
++++ b/net/sched/sch_fq_codel.c
+@@ -314,10 +314,8 @@ static struct sk_buff *fq_codel_dequeue(struct Qdisc *sch)
+ }
+ qdisc_bstats_update(sch, skb);
+ flow->deficit -= qdisc_pkt_len(skb);
+- /* We cant call qdisc_tree_reduce_backlog() if our qlen is 0,
+- * or HTB crashes. Defer it for next round.
+- */
+- if (q->cstats.drop_count && sch->q.qlen) {
++
++ if (q->cstats.drop_count) {
+ qdisc_tree_reduce_backlog(sch, q->cstats.drop_count,
+ q->cstats.drop_len);
+ q->cstats.drop_count = 0;
+--
+2.39.5
+
--- /dev/null
+From d7645673374a4accc1b7510c61aa9f8f6de67b82 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 10 Mar 2025 20:58:21 +0530
+Subject: drm/i915: Disable RPG during live selftest
+
+From: Badal Nilawar <badal.nilawar@intel.com>
+
+[ Upstream commit 9d3d9776bd3bd9c32d460dfe6c3363134de578bc ]
+
+The Forcewake timeout issue has been observed on Gen 12.0 and above.
+To address this, disable Render Power-Gating (RPG) during live self-tests
+for these generations. The temporary workaround 'drm/i915/mtl: do not
+enable render power-gating on MTL' disables RPG globally, which is
+unnecessary since the issues were only seen during self-tests.
+
+v2: take runtime pm wakeref
+
+Closes: https://gitlab.freedesktop.org/drm/i915/kernel/-/issues/9413
+Fixes: 25e7976db86b ("drm/i915/mtl: do not enable render power-gating on MTL")
+Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
+Cc: Andi Shyti <andi.shyti@intel.com>
+Cc: Andrzej Hajda <andrzej.hajda@intel.com>
+Signed-off-by: Badal Nilawar <badal.nilawar@intel.com>
+Signed-off-by: Sk Anirban <sk.anirban@intel.com>
+Reviewed-by: Karthik Poosa <karthik.poosa@intel.com>
+Signed-off-by: Anshuman Gupta <anshuman.gupta@intel.com>
+Link: https://patchwork.freedesktop.org/patch/msgid/20250310152821.2931678-1-sk.anirban@intel.com
+(cherry picked from commit 0a4ae87706c6d15d14648e428c3a76351f823e48)
+Signed-off-by: Jani Nikula <jani.nikula@intel.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/gpu/drm/i915/gt/intel_rc6.c | 19 ++++---------------
+ .../gpu/drm/i915/selftests/i915_selftest.c | 18 ++++++++++++++++++
+ 2 files changed, 22 insertions(+), 15 deletions(-)
+
+diff --git a/drivers/gpu/drm/i915/gt/intel_rc6.c b/drivers/gpu/drm/i915/gt/intel_rc6.c
+index 9378d5901c493..9ca42589da4da 100644
+--- a/drivers/gpu/drm/i915/gt/intel_rc6.c
++++ b/drivers/gpu/drm/i915/gt/intel_rc6.c
+@@ -117,21 +117,10 @@ static void gen11_rc6_enable(struct intel_rc6 *rc6)
+ GEN6_RC_CTL_RC6_ENABLE |
+ GEN6_RC_CTL_EI_MODE(1);
+
+- /*
+- * BSpec 52698 - Render powergating must be off.
+- * FIXME BSpec is outdated, disabling powergating for MTL is just
+- * temporary wa and should be removed after fixing real cause
+- * of forcewake timeouts.
+- */
+- if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 74)))
+- pg_enable =
+- GEN9_MEDIA_PG_ENABLE |
+- GEN11_MEDIA_SAMPLER_PG_ENABLE;
+- else
+- pg_enable =
+- GEN9_RENDER_PG_ENABLE |
+- GEN9_MEDIA_PG_ENABLE |
+- GEN11_MEDIA_SAMPLER_PG_ENABLE;
++ pg_enable =
++ GEN9_RENDER_PG_ENABLE |
++ GEN9_MEDIA_PG_ENABLE |
++ GEN11_MEDIA_SAMPLER_PG_ENABLE;
+
+ if (GRAPHICS_VER(gt->i915) >= 12 && !IS_DG1(gt->i915)) {
+ for (i = 0; i < I915_MAX_VCS; i++)
+diff --git a/drivers/gpu/drm/i915/selftests/i915_selftest.c b/drivers/gpu/drm/i915/selftests/i915_selftest.c
+index fee76c1d2f450..889281819c5b1 100644
+--- a/drivers/gpu/drm/i915/selftests/i915_selftest.c
++++ b/drivers/gpu/drm/i915/selftests/i915_selftest.c
+@@ -23,7 +23,9 @@
+
+ #include <linux/random.h>
+
++#include "gt/intel_gt.h"
+ #include "gt/intel_gt_pm.h"
++#include "gt/intel_gt_regs.h"
+ #include "gt/uc/intel_gsc_fw.h"
+
+ #include "i915_driver.h"
+@@ -253,11 +255,27 @@ int i915_mock_selftests(void)
+ int i915_live_selftests(struct pci_dev *pdev)
+ {
+ struct drm_i915_private *i915 = pdev_to_i915(pdev);
++ struct intel_uncore *uncore = &i915->uncore;
+ int err;
++ u32 pg_enable;
++ intel_wakeref_t wakeref;
+
+ if (!i915_selftest.live)
+ return 0;
+
++ /*
++ * FIXME Disable render powergating, this is temporary wa and should be removed
++ * after fixing real cause of forcewake timeouts.
++ */
++ with_intel_runtime_pm(uncore->rpm, wakeref) {
++ if (IS_GFX_GT_IP_RANGE(to_gt(i915), IP_VER(12, 00), IP_VER(12, 74))) {
++ pg_enable = intel_uncore_read(uncore, GEN9_PG_ENABLE);
++ if (pg_enable & GEN9_RENDER_PG_ENABLE)
++ intel_uncore_write_fw(uncore, GEN9_PG_ENABLE,
++ pg_enable & ~GEN9_RENDER_PG_ENABLE);
++ }
++ }
++
+ __wait_gsc_proxy_completed(i915);
+ __wait_gsc_huc_load_completed(i915);
+
+--
+2.39.5
+
--- /dev/null
+From d2bd289e3163073ab1f9ea60ba2257c040014692 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 2 Apr 2025 19:20:57 +0200
+Subject: drm/i915/huc: Fix fence not released on early probe errors
+
+From: Janusz Krzysztofik <janusz.krzysztofik@linux.intel.com>
+
+[ Upstream commit e3ea2eae70692a455e256787e4f54153fb739b90 ]
+
+The HuC delayed-load fence, introduced with commit 27536e03271da
+("drm/i915/huc: track delayed HuC load with a fence"), is registered with
+the object tracker early on driver probe but unregistered only on driver
+remove, which is not called on early probe errors. Since its memory is
+allocated under devres and then released anyway, it may be allocated to
+the fence again and reused on future driver probes, resulting in kernel
+warnings that taint the kernel:
+
+<4> [309.731371] ------------[ cut here ]------------
+<3> [309.731373] ODEBUG: init destroyed (active state 0) object: ffff88813d7dd2e0 object type: i915_sw_fence hint: sw_fence_dummy_notify+0x0/0x20 [i915]
+<4> [309.731575] WARNING: CPU: 2 PID: 3161 at lib/debugobjects.c:612 debug_print_object+0x93/0xf0
+...
+<4> [309.731693] CPU: 2 UID: 0 PID: 3161 Comm: i915_module_loa Tainted: G U 6.14.0-CI_DRM_16362-gf0fd77956987+ #1
+...
+<4> [309.731700] RIP: 0010:debug_print_object+0x93/0xf0
+...
+<4> [309.731728] Call Trace:
+<4> [309.731730] <TASK>
+...
+<4> [309.731949] __debug_object_init+0x17b/0x1c0
+<4> [309.731957] debug_object_init+0x34/0x50
+<4> [309.732126] __i915_sw_fence_init+0x34/0x60 [i915]
+<4> [309.732256] intel_huc_init_early+0x4b/0x1d0 [i915]
+<4> [309.732468] intel_uc_init_early+0x61/0x680 [i915]
+<4> [309.732667] intel_gt_common_init_early+0x105/0x130 [i915]
+<4> [309.732804] intel_root_gt_init_early+0x63/0x80 [i915]
+<4> [309.732938] i915_driver_probe+0x1fa/0xeb0 [i915]
+<4> [309.733075] i915_pci_probe+0xe6/0x220 [i915]
+<4> [309.733198] local_pci_probe+0x44/0xb0
+<4> [309.733203] pci_device_probe+0xf4/0x270
+<4> [309.733209] really_probe+0xee/0x3c0
+<4> [309.733215] __driver_probe_device+0x8c/0x180
+<4> [309.733219] driver_probe_device+0x24/0xd0
+<4> [309.733223] __driver_attach+0x10f/0x220
+<4> [309.733230] bus_for_each_dev+0x7d/0xe0
+<4> [309.733236] driver_attach+0x1e/0x30
+<4> [309.733239] bus_add_driver+0x151/0x290
+<4> [309.733244] driver_register+0x5e/0x130
+<4> [309.733247] __pci_register_driver+0x7d/0x90
+<4> [309.733251] i915_pci_register_driver+0x23/0x30 [i915]
+<4> [309.733413] i915_init+0x34/0x120 [i915]
+<4> [309.733655] do_one_initcall+0x62/0x3f0
+<4> [309.733667] do_init_module+0x97/0x2a0
+<4> [309.733671] load_module+0x25ff/0x2890
+<4> [309.733688] init_module_from_file+0x97/0xe0
+<4> [309.733701] idempotent_init_module+0x118/0x330
+<4> [309.733711] __x64_sys_finit_module+0x77/0x100
+<4> [309.733715] x64_sys_call+0x1f37/0x2650
+<4> [309.733719] do_syscall_64+0x91/0x180
+<4> [309.733763] entry_SYSCALL_64_after_hwframe+0x76/0x7e
+<4> [309.733792] </TASK>
+...
+<4> [309.733806] ---[ end trace 0000000000000000 ]---
+
+That scenario is most easily reproducible with
+igt@i915_module_load@reload-with-fault-injection.
+
+Fix the issue by moving the cleanup step to the driver release path.
+
+Fixes: 27536e03271da ("drm/i915/huc: track delayed HuC load with a fence")
+Closes: https://gitlab.freedesktop.org/drm/i915/kernel/-/issues/13592
+Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
+Cc: Alan Previn <alan.previn.teres.alexis@intel.com>
+Signed-off-by: Janusz Krzysztofik <janusz.krzysztofik@linux.intel.com>
+Reviewed-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
+Reviewed-by: Krzysztof Karas <krzysztof.karas@intel.com>
+Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
+Link: https://lore.kernel.org/r/20250402172057.209924-2-janusz.krzysztofik@linux.intel.com
+(cherry picked from commit 795dbde92fe5c6996a02a5b579481de73035e7bf)
+Signed-off-by: Jani Nikula <jani.nikula@intel.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/gpu/drm/i915/gt/uc/intel_huc.c | 11 +++++------
+ drivers/gpu/drm/i915/gt/uc/intel_huc.h | 1 +
+ drivers/gpu/drm/i915/gt/uc/intel_uc.c | 1 +
+ 3 files changed, 7 insertions(+), 6 deletions(-)
+
+diff --git a/drivers/gpu/drm/i915/gt/uc/intel_huc.c b/drivers/gpu/drm/i915/gt/uc/intel_huc.c
+index 2d9152eb72825..24fdce844d9e3 100644
+--- a/drivers/gpu/drm/i915/gt/uc/intel_huc.c
++++ b/drivers/gpu/drm/i915/gt/uc/intel_huc.c
+@@ -317,6 +317,11 @@ void intel_huc_init_early(struct intel_huc *huc)
+ }
+ }
+
++void intel_huc_fini_late(struct intel_huc *huc)
++{
++ delayed_huc_load_fini(huc);
++}
++
+ #define HUC_LOAD_MODE_STRING(x) (x ? "GSC" : "legacy")
+ static int check_huc_loading_mode(struct intel_huc *huc)
+ {
+@@ -414,12 +419,6 @@ int intel_huc_init(struct intel_huc *huc)
+
+ void intel_huc_fini(struct intel_huc *huc)
+ {
+- /*
+- * the fence is initialized in init_early, so we need to clean it up
+- * even if HuC loading is off.
+- */
+- delayed_huc_load_fini(huc);
+-
+ if (huc->heci_pkt)
+ i915_vma_unpin_and_release(&huc->heci_pkt, 0);
+
+diff --git a/drivers/gpu/drm/i915/gt/uc/intel_huc.h b/drivers/gpu/drm/i915/gt/uc/intel_huc.h
+index ba5cb08e9e7bf..09aff3148f7dd 100644
+--- a/drivers/gpu/drm/i915/gt/uc/intel_huc.h
++++ b/drivers/gpu/drm/i915/gt/uc/intel_huc.h
+@@ -55,6 +55,7 @@ struct intel_huc {
+
+ int intel_huc_sanitize(struct intel_huc *huc);
+ void intel_huc_init_early(struct intel_huc *huc);
++void intel_huc_fini_late(struct intel_huc *huc);
+ int intel_huc_init(struct intel_huc *huc);
+ void intel_huc_fini(struct intel_huc *huc);
+ void intel_huc_suspend(struct intel_huc *huc);
+diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.c b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
+index 5b8080ec5315b..4f751ce74214d 100644
+--- a/drivers/gpu/drm/i915/gt/uc/intel_uc.c
++++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
+@@ -136,6 +136,7 @@ void intel_uc_init_late(struct intel_uc *uc)
+
+ void intel_uc_driver_late_release(struct intel_uc *uc)
+ {
++ intel_huc_fini_late(&uc->huc);
+ }
+
+ /**
+--
+2.39.5
+
--- /dev/null
+From 04f2ceaecd9aba9f68bdea16ce349b6e412b7703 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 8 Apr 2025 09:34:10 +0200
+Subject: drm/tests: cmdline: Fix drm_display_mode memory leak
+
+From: Maxime Ripard <mripard@kernel.org>
+
+[ Upstream commit 70f29ca3117a8796cd6bde7612a3ded96d0f2dde ]
+
+drm_analog_tv_mode() and its variants return a drm_display_mode that
+needs to be destroyed later on. The drm_test_cmdline_tv_options() test
+never does so, however, which leads to a memory leak.
+
+Let's make sure it's freed.
+
+Reported-by: Philipp Stanner <phasta@mailbox.org>
+Closes: https://lore.kernel.org/dri-devel/a7655158a6367ac46194d57f4b7433ef0772a73e.camel@mailbox.org/
+Fixes: e691c9992ae1 ("drm/modes: Introduce the tv_mode property as a command-line option")
+Reviewed-by: Thomas Zimmermann <tzimmermann@suse.de>
+Link: https://lore.kernel.org/r/20250408-drm-kunit-drm-display-mode-memleak-v1-4-996305a2e75a@kernel.org
+Signed-off-by: Maxime Ripard <mripard@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/gpu/drm/tests/drm_cmdline_parser_test.c | 10 +++++++++-
+ 1 file changed, 9 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/gpu/drm/tests/drm_cmdline_parser_test.c b/drivers/gpu/drm/tests/drm_cmdline_parser_test.c
+index 59c8408c453c2..1cfcb597b088b 100644
+--- a/drivers/gpu/drm/tests/drm_cmdline_parser_test.c
++++ b/drivers/gpu/drm/tests/drm_cmdline_parser_test.c
+@@ -7,6 +7,7 @@
+ #include <kunit/test.h>
+
+ #include <drm/drm_connector.h>
++#include <drm/drm_kunit_helpers.h>
+ #include <drm/drm_modes.h>
+
+ static const struct drm_connector no_connector = {};
+@@ -955,8 +956,15 @@ struct drm_cmdline_tv_option_test {
+ static void drm_test_cmdline_tv_options(struct kunit *test)
+ {
+ const struct drm_cmdline_tv_option_test *params = test->param_value;
+- const struct drm_display_mode *expected_mode = params->mode_fn(NULL);
++ struct drm_display_mode *expected_mode;
+ struct drm_cmdline_mode mode = { };
++ int ret;
++
++ expected_mode = params->mode_fn(NULL);
++ KUNIT_ASSERT_NOT_NULL(test, expected_mode);
++
++ ret = drm_kunit_add_mode_destroy_action(test, expected_mode);
++ KUNIT_ASSERT_EQ(test, ret, 0);
+
+ KUNIT_EXPECT_TRUE(test, drm_mode_parse_command_line_for_connector(params->cmdline,
+ &no_connector, &mode));
+--
+2.39.5
+
--- /dev/null
+From 4fa7c6c9a0102c7364c28013c8d9d5dc613cbb81 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 8 Apr 2025 09:34:07 +0200
+Subject: drm/tests: helpers: Create kunit helper to destroy a drm_display_mode
+
+From: Maxime Ripard <mripard@kernel.org>
+
+[ Upstream commit 13c1d5f3a7fa7b55a26e73bb9e95342374a489b2 ]
+
+A number of test suites call functions that expect the returned
+drm_display_mode to be destroyed eventually.
+
+However, none of the tests called drm_mode_destroy, which results in a
+memory leak.
+
+Since drm_mode_destroy() takes two pointers as arguments, we can't use a
+kunit wrapper. Let's just create a helper every test suite can use.
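+
+A typical call site then looks like this (sketch based on the tests
+fixed later in this series):
+
+  mode = drm_mode_find_dmt(priv->drm, 1920, 1080, 60, false);
+  KUNIT_ASSERT_NOT_NULL(test, mode);
+
+  ret = drm_kunit_add_mode_destroy_action(test, mode);
+  KUNIT_ASSERT_EQ(test, ret, 0);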
+
+Reviewed-by: Thomas Zimmermann <tzimmermann@suse.de>
+Link: https://lore.kernel.org/r/20250408-drm-kunit-drm-display-mode-memleak-v1-1-996305a2e75a@kernel.org
+Signed-off-by: Maxime Ripard <mripard@kernel.org>
+Stable-dep-of: 70f29ca3117a ("drm/tests: cmdline: Fix drm_display_mode memory leak")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/gpu/drm/tests/drm_kunit_helpers.c | 22 ++++++++++++++++++++++
+ include/drm/drm_kunit_helpers.h | 3 +++
+ 2 files changed, 25 insertions(+)
+
+diff --git a/drivers/gpu/drm/tests/drm_kunit_helpers.c b/drivers/gpu/drm/tests/drm_kunit_helpers.c
+index 3c0b7824c0be3..922c4b6ed1dc9 100644
+--- a/drivers/gpu/drm/tests/drm_kunit_helpers.c
++++ b/drivers/gpu/drm/tests/drm_kunit_helpers.c
+@@ -319,6 +319,28 @@ static void kunit_action_drm_mode_destroy(void *ptr)
+ drm_mode_destroy(NULL, mode);
+ }
+
++/**
++ * drm_kunit_add_mode_destroy_action() - Add a drm_destroy_mode kunit action
++ * @test: The test context object
++ * @mode: The drm_display_mode to destroy eventually
++ *
++ * Registers a kunit action that will destroy the drm_display_mode at
++ * the end of the test.
++ *
++ * If an error occurs, the drm_display_mode will be destroyed.
++ *
++ * Returns:
++ * 0 on success, an error code otherwise.
++ */
++int drm_kunit_add_mode_destroy_action(struct kunit *test,
++ struct drm_display_mode *mode)
++{
++ return kunit_add_action_or_reset(test,
++ kunit_action_drm_mode_destroy,
++ mode);
++}
++EXPORT_SYMBOL_GPL(drm_kunit_add_mode_destroy_action);
++
+ /**
+ * drm_kunit_display_mode_from_cea_vic() - return a mode for CEA VIC for a KUnit test
+ * @test: The test context object
+diff --git a/include/drm/drm_kunit_helpers.h b/include/drm/drm_kunit_helpers.h
+index afdd46ef04f70..c835f113055dc 100644
+--- a/include/drm/drm_kunit_helpers.h
++++ b/include/drm/drm_kunit_helpers.h
+@@ -120,6 +120,9 @@ drm_kunit_helper_create_crtc(struct kunit *test,
+ const struct drm_crtc_funcs *funcs,
+ const struct drm_crtc_helper_funcs *helper_funcs);
+
++int drm_kunit_add_mode_destroy_action(struct kunit *test,
++ struct drm_display_mode *mode);
++
+ struct drm_display_mode *
+ drm_kunit_display_mode_from_cea_vic(struct kunit *test, struct drm_device *dev,
+ u8 video_code);
+--
+2.39.5
+
--- /dev/null
+From 9cae88f438ed0f71a796bdfd72f6b4cd51d649c5 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 8 Apr 2025 09:34:11 +0200
+Subject: drm/tests: modes: Fix drm_display_mode memory leak
+
+From: Maxime Ripard <mripard@kernel.org>
+
+[ Upstream commit d34146340f95cd9bf06d4ce71cca72127dc0b7cd ]
+
+drm_analog_tv_mode() and its variants return a drm_display_mode that
+needs to be destroyed later on. The drm_modes_analog_tv tests never
+do so, however, which leads to a memory leak.
+
+Let's make sure it's freed.
+
+Reported-by: Philipp Stanner <phasta@mailbox.org>
+Closes: https://lore.kernel.org/dri-devel/a7655158a6367ac46194d57f4b7433ef0772a73e.camel@mailbox.org/
+Fixes: 4fcd238560ee ("drm/modes: Add a function to generate analog display modes")
+Reviewed-by: Thomas Zimmermann <tzimmermann@suse.de>
+Link: https://lore.kernel.org/r/20250408-drm-kunit-drm-display-mode-memleak-v1-5-996305a2e75a@kernel.org
+Signed-off-by: Maxime Ripard <mripard@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/gpu/drm/tests/drm_modes_test.c | 22 ++++++++++++++++++++++
+ 1 file changed, 22 insertions(+)
+
+diff --git a/drivers/gpu/drm/tests/drm_modes_test.c b/drivers/gpu/drm/tests/drm_modes_test.c
+index 6ed51f99e133c..7ba646d87856f 100644
+--- a/drivers/gpu/drm/tests/drm_modes_test.c
++++ b/drivers/gpu/drm/tests/drm_modes_test.c
+@@ -40,6 +40,7 @@ static void drm_test_modes_analog_tv_ntsc_480i(struct kunit *test)
+ {
+ struct drm_test_modes_priv *priv = test->priv;
+ struct drm_display_mode *mode;
++ int ret;
+
+ mode = drm_analog_tv_mode(priv->drm,
+ DRM_MODE_TV_MODE_NTSC,
+@@ -47,6 +48,9 @@ static void drm_test_modes_analog_tv_ntsc_480i(struct kunit *test)
+ true);
+ KUNIT_ASSERT_NOT_NULL(test, mode);
+
++ ret = drm_kunit_add_mode_destroy_action(test, mode);
++ KUNIT_ASSERT_EQ(test, ret, 0);
++
+ KUNIT_EXPECT_EQ(test, drm_mode_vrefresh(mode), 60);
+ KUNIT_EXPECT_EQ(test, mode->hdisplay, 720);
+
+@@ -70,6 +74,7 @@ static void drm_test_modes_analog_tv_ntsc_480i_inlined(struct kunit *test)
+ {
+ struct drm_test_modes_priv *priv = test->priv;
+ struct drm_display_mode *expected, *mode;
++ int ret;
+
+ expected = drm_analog_tv_mode(priv->drm,
+ DRM_MODE_TV_MODE_NTSC,
+@@ -77,9 +82,15 @@ static void drm_test_modes_analog_tv_ntsc_480i_inlined(struct kunit *test)
+ true);
+ KUNIT_ASSERT_NOT_NULL(test, expected);
+
++ ret = drm_kunit_add_mode_destroy_action(test, expected);
++ KUNIT_ASSERT_EQ(test, ret, 0);
++
+ mode = drm_mode_analog_ntsc_480i(priv->drm);
+ KUNIT_ASSERT_NOT_NULL(test, mode);
+
++ ret = drm_kunit_add_mode_destroy_action(test, mode);
++ KUNIT_ASSERT_EQ(test, ret, 0);
++
+ KUNIT_EXPECT_TRUE(test, drm_mode_equal(expected, mode));
+ }
+
+@@ -87,6 +98,7 @@ static void drm_test_modes_analog_tv_pal_576i(struct kunit *test)
+ {
+ struct drm_test_modes_priv *priv = test->priv;
+ struct drm_display_mode *mode;
++ int ret;
+
+ mode = drm_analog_tv_mode(priv->drm,
+ DRM_MODE_TV_MODE_PAL,
+@@ -94,6 +106,9 @@ static void drm_test_modes_analog_tv_pal_576i(struct kunit *test)
+ true);
+ KUNIT_ASSERT_NOT_NULL(test, mode);
+
++ ret = drm_kunit_add_mode_destroy_action(test, mode);
++ KUNIT_ASSERT_EQ(test, ret, 0);
++
+ KUNIT_EXPECT_EQ(test, drm_mode_vrefresh(mode), 50);
+ KUNIT_EXPECT_EQ(test, mode->hdisplay, 720);
+
+@@ -117,6 +132,7 @@ static void drm_test_modes_analog_tv_pal_576i_inlined(struct kunit *test)
+ {
+ struct drm_test_modes_priv *priv = test->priv;
+ struct drm_display_mode *expected, *mode;
++ int ret;
+
+ expected = drm_analog_tv_mode(priv->drm,
+ DRM_MODE_TV_MODE_PAL,
+@@ -124,9 +140,15 @@ static void drm_test_modes_analog_tv_pal_576i_inlined(struct kunit *test)
+ true);
+ KUNIT_ASSERT_NOT_NULL(test, expected);
+
++ ret = drm_kunit_add_mode_destroy_action(test, expected);
++ KUNIT_ASSERT_EQ(test, ret, 0);
++
+ mode = drm_mode_analog_pal_576i(priv->drm);
+ KUNIT_ASSERT_NOT_NULL(test, mode);
+
++ ret = drm_kunit_add_mode_destroy_action(test, mode);
++ KUNIT_ASSERT_EQ(test, ret, 0);
++
+ KUNIT_EXPECT_TRUE(test, drm_mode_equal(expected, mode));
+ }
+
+--
+2.39.5
+
--- /dev/null
+From 7e26f2a6191fd119378fbea6d5a983d0ba2f994f Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 8 Apr 2025 09:34:08 +0200
+Subject: drm/tests: modeset: Fix drm_display_mode memory leak
+
+From: Maxime Ripard <mripard@kernel.org>
+
+[ Upstream commit dacafdcc7789cfeb0f0552716db56f210238225d ]
+
+drm_mode_find_dmt() returns a drm_display_mode that needs to be
+destroyed later on. The drm_test_pick_cmdline_res_1920_1080_60() test
+never does so, however, which leads to a memory leak.
+
+Let's make sure it's freed.
+
+Reported-by: Philipp Stanner <phasta@mailbox.org>
+Closes: https://lore.kernel.org/dri-devel/a7655158a6367ac46194d57f4b7433ef0772a73e.camel@mailbox.org/
+Fixes: 8fc0380f6ba7 ("drm/client: Add some tests for drm_connector_pick_cmdline_mode()")
+Reviewed-by: Thomas Zimmermann <tzimmermann@suse.de>
+Link: https://lore.kernel.org/r/20250408-drm-kunit-drm-display-mode-memleak-v1-2-996305a2e75a@kernel.org
+Signed-off-by: Maxime Ripard <mripard@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/gpu/drm/tests/drm_client_modeset_test.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+diff --git a/drivers/gpu/drm/tests/drm_client_modeset_test.c b/drivers/gpu/drm/tests/drm_client_modeset_test.c
+index 7516f6cb36e4e..3e9518d7b8b7e 100644
+--- a/drivers/gpu/drm/tests/drm_client_modeset_test.c
++++ b/drivers/gpu/drm/tests/drm_client_modeset_test.c
+@@ -95,6 +95,9 @@ static void drm_test_pick_cmdline_res_1920_1080_60(struct kunit *test)
+ expected_mode = drm_mode_find_dmt(priv->drm, 1920, 1080, 60, false);
+ KUNIT_ASSERT_NOT_NULL(test, expected_mode);
+
++ ret = drm_kunit_add_mode_destroy_action(test, expected_mode);
++ KUNIT_ASSERT_EQ(test, ret, 0);
++
+ KUNIT_ASSERT_TRUE(test,
+ drm_mode_parse_command_line_for_connector(cmdline,
+ connector,
+--
+2.39.5
+
--- /dev/null
+From 246e68c896d3c096eab5604c89e7e3b585b05969 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 8 Apr 2025 09:34:13 +0200
+Subject: drm/tests: probe-helper: Fix drm_display_mode memory leak
+
+From: Maxime Ripard <mripard@kernel.org>
+
+[ Upstream commit 8b6f2e28431b2f9f84073bff50353aeaf25559d0 ]
+
+drm_analog_tv_mode() and its variants return a drm_display_mode that
+needs to be destroyed later on. The
+drm_test_connector_helper_tv_get_modes_check() test never does so,
+however, which leads to a memory leak.
+
+Let's make sure it's freed.
+
+Reported-by: Philipp Stanner <phasta@mailbox.org>
+Closes: https://lore.kernel.org/dri-devel/a7655158a6367ac46194d57f4b7433ef0772a73e.camel@mailbox.org/
+Fixes: 1e4a91db109f ("drm/probe-helper: Provide a TV get_modes helper")
+Reviewed-by: Thomas Zimmermann <tzimmermann@suse.de>
+Link: https://lore.kernel.org/r/20250408-drm-kunit-drm-display-mode-memleak-v1-7-996305a2e75a@kernel.org
+Signed-off-by: Maxime Ripard <mripard@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/gpu/drm/tests/drm_probe_helper_test.c | 8 +++++++-
+ 1 file changed, 7 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/gpu/drm/tests/drm_probe_helper_test.c b/drivers/gpu/drm/tests/drm_probe_helper_test.c
+index bc09ff38aca18..db0e4f5df275e 100644
+--- a/drivers/gpu/drm/tests/drm_probe_helper_test.c
++++ b/drivers/gpu/drm/tests/drm_probe_helper_test.c
+@@ -98,7 +98,7 @@ drm_test_connector_helper_tv_get_modes_check(struct kunit *test)
+ struct drm_connector *connector = &priv->connector;
+ struct drm_cmdline_mode *cmdline = &connector->cmdline_mode;
+ struct drm_display_mode *mode;
+- const struct drm_display_mode *expected;
++ struct drm_display_mode *expected;
+ size_t len;
+ int ret;
+
+@@ -134,6 +134,9 @@ drm_test_connector_helper_tv_get_modes_check(struct kunit *test)
+
+ KUNIT_EXPECT_TRUE(test, drm_mode_equal(mode, expected));
+ KUNIT_EXPECT_TRUE(test, mode->type & DRM_MODE_TYPE_PREFERRED);
++
++ ret = drm_kunit_add_mode_destroy_action(test, expected);
++ KUNIT_ASSERT_EQ(test, ret, 0);
+ }
+
+ if (params->num_expected_modes >= 2) {
+@@ -145,6 +148,9 @@ drm_test_connector_helper_tv_get_modes_check(struct kunit *test)
+
+ KUNIT_EXPECT_TRUE(test, drm_mode_equal(mode, expected));
+ KUNIT_EXPECT_FALSE(test, mode->type & DRM_MODE_TYPE_PREFERRED);
++
++ ret = drm_kunit_add_mode_destroy_action(test, expected);
++ KUNIT_ASSERT_EQ(test, ret, 0);
+ }
+
+ mutex_unlock(&priv->drm->mode_config.mutex);
+--
+2.39.5
+
--- /dev/null
+From 72563a7af0e14879d1ac9cfc48b7f0469a496bf1 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 27 Mar 2025 17:56:47 +0530
+Subject: drm/xe/hw_engine: define sysfs_ops on all directories
+
+From: Tejas Upadhyay <tejas.upadhyay@intel.com>
+
+[ Upstream commit a5c71fd5b69b9da77e5e0b268e69e256932ba49c ]
+
+sysfs_ops needs to be defined on all directories that can have attr
+files with set/get methods. Add sysfs_ops even to those directories
+which are currently empty but will have attr files with set/get
+methods in the future. Leave .defaults with the default sysfs_ops as
+it will never have a setter method.
+
+V2 (Himal/Rodrigo):
+ - use a single sysfs_ops for all dirs and attrs with set/get methods
+ - add default ops for .defaults as it does not need runtime PM at all
+
+Fixes: 3f0e14651ab0 ("drm/xe: Runtime PM wake on every sysfs call")
+Reviewed-by: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>
+Link: https://patchwork.freedesktop.org/patch/msgid/20250327122647.886637-1-tejas.upadhyay@intel.com
+Signed-off-by: Tejas Upadhyay <tejas.upadhyay@intel.com>
+(cherry picked from commit 40780b9760b561e093508d07b8b9b06c94ab201e)
+Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/gpu/drm/xe/xe_hw_engine_class_sysfs.c | 108 +++++++++---------
+ 1 file changed, 52 insertions(+), 56 deletions(-)
+
+diff --git a/drivers/gpu/drm/xe/xe_hw_engine_class_sysfs.c b/drivers/gpu/drm/xe/xe_hw_engine_class_sysfs.c
+index b53e8d2accdbd..a440442b4d727 100644
+--- a/drivers/gpu/drm/xe/xe_hw_engine_class_sysfs.c
++++ b/drivers/gpu/drm/xe/xe_hw_engine_class_sysfs.c
+@@ -32,14 +32,61 @@ bool xe_hw_engine_timeout_in_range(u64 timeout, u64 min, u64 max)
+ return timeout >= min && timeout <= max;
+ }
+
+-static void kobj_xe_hw_engine_release(struct kobject *kobj)
++static void xe_hw_engine_sysfs_kobj_release(struct kobject *kobj)
+ {
+ kfree(kobj);
+ }
+
++static ssize_t xe_hw_engine_class_sysfs_attr_show(struct kobject *kobj,
++ struct attribute *attr,
++ char *buf)
++{
++ struct xe_device *xe = kobj_to_xe(kobj);
++ struct kobj_attribute *kattr;
++ ssize_t ret = -EIO;
++
++ kattr = container_of(attr, struct kobj_attribute, attr);
++ if (kattr->show) {
++ xe_pm_runtime_get(xe);
++ ret = kattr->show(kobj, kattr, buf);
++ xe_pm_runtime_put(xe);
++ }
++
++ return ret;
++}
++
++static ssize_t xe_hw_engine_class_sysfs_attr_store(struct kobject *kobj,
++ struct attribute *attr,
++ const char *buf,
++ size_t count)
++{
++ struct xe_device *xe = kobj_to_xe(kobj);
++ struct kobj_attribute *kattr;
++ ssize_t ret = -EIO;
++
++ kattr = container_of(attr, struct kobj_attribute, attr);
++ if (kattr->store) {
++ xe_pm_runtime_get(xe);
++ ret = kattr->store(kobj, kattr, buf, count);
++ xe_pm_runtime_put(xe);
++ }
++
++ return ret;
++}
++
++static const struct sysfs_ops xe_hw_engine_class_sysfs_ops = {
++ .show = xe_hw_engine_class_sysfs_attr_show,
++ .store = xe_hw_engine_class_sysfs_attr_store,
++};
++
+ static const struct kobj_type kobj_xe_hw_engine_type = {
+- .release = kobj_xe_hw_engine_release,
+- .sysfs_ops = &kobj_sysfs_ops
++ .release = xe_hw_engine_sysfs_kobj_release,
++ .sysfs_ops = &xe_hw_engine_class_sysfs_ops,
++};
++
++static const struct kobj_type kobj_xe_hw_engine_type_def = {
++ .release = xe_hw_engine_sysfs_kobj_release,
++ .sysfs_ops = &kobj_sysfs_ops,
+ };
+
+ static ssize_t job_timeout_max_store(struct kobject *kobj,
+@@ -543,7 +590,7 @@ static int xe_add_hw_engine_class_defaults(struct xe_device *xe,
+ if (!kobj)
+ return -ENOMEM;
+
+- kobject_init(kobj, &kobj_xe_hw_engine_type);
++ kobject_init(kobj, &kobj_xe_hw_engine_type_def);
+ err = kobject_add(kobj, parent, "%s", ".defaults");
+ if (err)
+ goto err_object;
+@@ -559,57 +606,6 @@ static int xe_add_hw_engine_class_defaults(struct xe_device *xe,
+ return err;
+ }
+
+-static void xe_hw_engine_sysfs_kobj_release(struct kobject *kobj)
+-{
+- kfree(kobj);
+-}
+-
+-static ssize_t xe_hw_engine_class_sysfs_attr_show(struct kobject *kobj,
+- struct attribute *attr,
+- char *buf)
+-{
+- struct xe_device *xe = kobj_to_xe(kobj);
+- struct kobj_attribute *kattr;
+- ssize_t ret = -EIO;
+-
+- kattr = container_of(attr, struct kobj_attribute, attr);
+- if (kattr->show) {
+- xe_pm_runtime_get(xe);
+- ret = kattr->show(kobj, kattr, buf);
+- xe_pm_runtime_put(xe);
+- }
+-
+- return ret;
+-}
+-
+-static ssize_t xe_hw_engine_class_sysfs_attr_store(struct kobject *kobj,
+- struct attribute *attr,
+- const char *buf,
+- size_t count)
+-{
+- struct xe_device *xe = kobj_to_xe(kobj);
+- struct kobj_attribute *kattr;
+- ssize_t ret = -EIO;
+-
+- kattr = container_of(attr, struct kobj_attribute, attr);
+- if (kattr->store) {
+- xe_pm_runtime_get(xe);
+- ret = kattr->store(kobj, kattr, buf, count);
+- xe_pm_runtime_put(xe);
+- }
+-
+- return ret;
+-}
+-
+-static const struct sysfs_ops xe_hw_engine_class_sysfs_ops = {
+- .show = xe_hw_engine_class_sysfs_attr_show,
+- .store = xe_hw_engine_class_sysfs_attr_store,
+-};
+-
+-static const struct kobj_type xe_hw_engine_sysfs_kobj_type = {
+- .release = xe_hw_engine_sysfs_kobj_release,
+- .sysfs_ops = &xe_hw_engine_class_sysfs_ops,
+-};
+
+ static void hw_engine_class_sysfs_fini(void *arg)
+ {
+@@ -640,7 +636,7 @@ int xe_hw_engine_class_sysfs_init(struct xe_gt *gt)
+ if (!kobj)
+ return -ENOMEM;
+
+- kobject_init(kobj, &xe_hw_engine_sysfs_kobj_type);
++ kobject_init(kobj, &kobj_xe_hw_engine_type);
+
+ err = kobject_add(kobj, gt->sysfs, "engines");
+ if (err)
+--
+2.39.5
+
--- /dev/null
+From 20a9f0b448136b981645e1c66231b457079df6f7 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 2 Apr 2025 15:20:00 +0300
+Subject: gpiolib: of: Fix the choice for Ingenic NAND quirk
+
+From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
+
+[ Upstream commit 2b9c536430126c233552cdcd6ec9d5077454ece4 ]
+
+The Ingenic NAND quirk was added under the CONFIG_LCD_HX8357 ifdeffery,
+which sounds quite wrong. Fix the placement of the Ingenic NAND quirk by
+wrapping it in its own ifdeffery related to the respective driver.
+
+Fixes: 3a7fd473bd5d ("mtd: rawnand: ingenic: move the GPIO quirk to gpiolib-of.c")
+Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
+Link: https://lore.kernel.org/r/20250402122058.1517393-2-andriy.shevchenko@linux.intel.com
+Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/gpio/gpiolib-of.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git a/drivers/gpio/gpiolib-of.c b/drivers/gpio/gpiolib-of.c
+index 880f1efcaca53..e543129d36050 100644
+--- a/drivers/gpio/gpiolib-of.c
++++ b/drivers/gpio/gpiolib-of.c
+@@ -193,6 +193,8 @@ static void of_gpio_try_fixup_polarity(const struct device_node *np,
+ */
+ { "himax,hx8357", "gpios-reset", false },
+ { "himax,hx8369", "gpios-reset", false },
++#endif
++#if IS_ENABLED(CONFIG_MTD_NAND_JZ4780)
+ /*
+ * The rb-gpios semantics was undocumented and qi,lb60 (along with
+ * the ingenic driver) got it wrong. The active state encodes the
+--
+2.39.5
+
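For clarity, the two added lines close the Himax-only block and open a new
guard for the Ingenic NAND driver, so the quirk table in
of_gpio_try_fixup_polarity() ends up grouped per driver. A rough sketch of
the resulting shape (field names and the qi,lb60 entry follow the existing
table and are shown here only for illustration):

	static const struct {
		const char *compatible;
		const char *propname;
		bool active_high;
	} gpios[] = {
	#if IS_ENABLED(CONFIG_LCD_HX8357)
		/* Himax LCD controllers: reset line polarity fixups */
		{ "himax,hx8357", "gpios-reset", false },
		{ "himax,hx8369", "gpios-reset", false },
	#endif
	#if IS_ENABLED(CONFIG_MTD_NAND_JZ4780)
		/* Ingenic NAND ready/busy quirk, now under its own guard */
		{ "qi,lb60", "rb-gpios", true },
	#endif
	};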
--- /dev/null
+From 358c19a01093095e43bafe1f32ea72c829d46b83 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 1 Apr 2025 22:27:31 +0200
+Subject: iommu/exynos: Fix suspend/resume with IDENTITY domain
+
+From: Marek Szyprowski <m.szyprowski@samsung.com>
+
+[ Upstream commit 99deffc409b69000ac4877486e69ec6516becd53 ]
+
+Commit bcb81ac6ae3c ("iommu: Get DT/ACPI parsing into the proper probe
+path") changed the sequence of probing the SYSMMU controller devices and
+calls to arm_iommu_attach_device(), what results in resuming SYSMMU
+controller earlier, when it is still set to IDENTITY mapping. Such change
+revealed the bug in IDENTITY handling in the exynos-iommu driver. When
+SYSMMU controller is set to IDENTITY mapping, data->domain is NULL, so
+adjust checks in suspend & resume callbacks to handle this case
+correctly.
+
+Fixes: b3d14960e629 ("iommu/exynos: Implement an IDENTITY domain")
+Signed-off-by: Marek Szyprowski <m.szyprowski@samsung.com>
+Link: https://lore.kernel.org/r/20250401202731.2810474-1-m.szyprowski@samsung.com
+Signed-off-by: Joerg Roedel <jroedel@suse.de>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/iommu/exynos-iommu.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/iommu/exynos-iommu.c b/drivers/iommu/exynos-iommu.c
+index c666ecab955d2..7465dbb6fa80c 100644
+--- a/drivers/iommu/exynos-iommu.c
++++ b/drivers/iommu/exynos-iommu.c
+@@ -832,7 +832,7 @@ static int __maybe_unused exynos_sysmmu_suspend(struct device *dev)
+ struct exynos_iommu_owner *owner = dev_iommu_priv_get(master);
+
+ mutex_lock(&owner->rpm_lock);
+- if (&data->domain->domain != &exynos_identity_domain) {
++ if (data->domain) {
+ dev_dbg(data->sysmmu, "saving state\n");
+ __sysmmu_disable(data);
+ }
+@@ -850,7 +850,7 @@ static int __maybe_unused exynos_sysmmu_resume(struct device *dev)
+ struct exynos_iommu_owner *owner = dev_iommu_priv_get(master);
+
+ mutex_lock(&owner->rpm_lock);
+- if (&data->domain->domain != &exynos_identity_domain) {
++ if (data->domain) {
+ dev_dbg(data->sysmmu, "restoring state\n");
+ __sysmmu_enable(data);
+ }
+--
+2.39.5
+
--- /dev/null
+From 0ff66a9484c481648869d65615bce93bd3f36c57 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 3 Apr 2025 12:22:12 +0200
+Subject: iommu/mediatek: Fix NULL pointer deference in mtk_iommu_device_group
+
+From: Louis-Alexis Eyraud <louisalexis.eyraud@collabora.com>
+
+[ Upstream commit 38e8844005e6068f336a3ad45451a562a0040ca1 ]
+
+Currently, mtk_iommu calls iommu_device_register() during probe before
+the hw_list in the driver data is initialized. Since the iommu probing
+fix, this leads to a NULL pointer dereference in mtk_iommu_device_group()
+when hw_list is accessed with list_first_entry() (which is not NULL
+safe).
+
+So, change the call order to ensure iommu_device_register() is called
+after the driver data is initialized.
+
+Fixes: 9e3a2a643653 ("iommu/mediatek: Adapt sharing and non-sharing pgtable case")
+Fixes: bcb81ac6ae3c ("iommu: Get DT/ACPI parsing into the proper probe path")
+Reviewed-by: Yong Wu <yong.wu@mediatek.com>
+Tested-by: Chen-Yu Tsai <wenst@chromium.org> # MT8183 Juniper, MT8186 Tentacruel
+Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
+Tested-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
+Signed-off-by: Louis-Alexis Eyraud <louisalexis.eyraud@collabora.com>
+Link: https://lore.kernel.org/r/20250403-fix-mtk-iommu-error-v2-1-fe8b18f8b0a8@collabora.com
+Signed-off-by: Joerg Roedel <jroedel@suse.de>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/iommu/mtk_iommu.c | 26 +++++++++++++-------------
+ 1 file changed, 13 insertions(+), 13 deletions(-)
+
+diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c
+index 6a2707fe7a78c..32deab732209e 100644
+--- a/drivers/iommu/mtk_iommu.c
++++ b/drivers/iommu/mtk_iommu.c
+@@ -1371,15 +1371,6 @@ static int mtk_iommu_probe(struct platform_device *pdev)
+ platform_set_drvdata(pdev, data);
+ mutex_init(&data->mutex);
+
+- ret = iommu_device_sysfs_add(&data->iommu, dev, NULL,
+- "mtk-iommu.%pa", &ioaddr);
+- if (ret)
+- goto out_link_remove;
+-
+- ret = iommu_device_register(&data->iommu, &mtk_iommu_ops, dev);
+- if (ret)
+- goto out_sysfs_remove;
+-
+ if (MTK_IOMMU_HAS_FLAG(data->plat_data, SHARE_PGTABLE)) {
+ list_add_tail(&data->list, data->plat_data->hw_list);
+ data->hw_list = data->plat_data->hw_list;
+@@ -1389,19 +1380,28 @@ static int mtk_iommu_probe(struct platform_device *pdev)
+ data->hw_list = &data->hw_list_head;
+ }
+
++ ret = iommu_device_sysfs_add(&data->iommu, dev, NULL,
++ "mtk-iommu.%pa", &ioaddr);
++ if (ret)
++ goto out_list_del;
++
++ ret = iommu_device_register(&data->iommu, &mtk_iommu_ops, dev);
++ if (ret)
++ goto out_sysfs_remove;
++
+ if (MTK_IOMMU_IS_TYPE(data->plat_data, MTK_IOMMU_TYPE_MM)) {
+ ret = component_master_add_with_match(dev, &mtk_iommu_com_ops, match);
+ if (ret)
+- goto out_list_del;
++ goto out_device_unregister;
+ }
+ return ret;
+
+-out_list_del:
+- list_del(&data->list);
++out_device_unregister:
+ iommu_device_unregister(&data->iommu);
+ out_sysfs_remove:
+ iommu_device_sysfs_remove(&data->iommu);
+-out_link_remove:
++out_list_del:
++ list_del(&data->list);
+ if (MTK_IOMMU_IS_TYPE(data->plat_data, MTK_IOMMU_TYPE_MM))
+ device_link_remove(data->smicomm_dev, dev);
+ out_runtime_disable:
+--
+2.39.5
+
--- /dev/null
+From 797d38f57fc28f362a483ce5e394b0d3e3df8f70 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 8 Apr 2025 11:43:16 +0300
+Subject: ipv6: Align behavior across nexthops during path selection
+
+From: Ido Schimmel <idosch@nvidia.com>
+
+[ Upstream commit 6933cd4714861eea6848f18396a119d741f25fc3 ]
+
+A nexthop is only chosen when the calculated multipath hash falls in the
+nexthop's hash region (i.e., the hash is smaller than the nexthop's hash
+threshold) and when the nexthop is assigned a non-negative score by
+rt6_score_route().
+
+Commit 4d0ab3a6885e ("ipv6: Start path selection from the first
+nexthop") introduced an unintentional difference between the first
+nexthop and the rest when the score is negative.
+
+When the first nexthop matches, but has a negative score, the code will
+currently evaluate subsequent nexthops until one is found with a
+non-negative score. On the other hand, when a different nexthop matches,
+but has a negative score, the code will fall back to the nexthop with
+which the selection started ('match').
+
+Align the behavior across all nexthops and fall back to 'match' when the
+first nexthop matches, but has a negative score.
+
+Fixes: 3d709f69a3e7 ("ipv6: Use hash-threshold instead of modulo-N")
+Fixes: 4d0ab3a6885e ("ipv6: Start path selection from the first nexthop")
+Reported-by: Willem de Bruijn <willemdebruijn.kernel@gmail.com>
+Closes: https://lore.kernel.org/netdev/67efef607bc41_1ddca82948c@willemb.c.googlers.com.notmuch/
+Signed-off-by: Ido Schimmel <idosch@nvidia.com>
+Reviewed-by: Willem de Bruijn <willemb@google.com>
+Reviewed-by: David Ahern <dsahern@kernel.org>
+Link: https://patch.msgid.link/20250408084316.243559-1-idosch@nvidia.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/ipv6/route.c | 8 ++++----
+ 1 file changed, 4 insertions(+), 4 deletions(-)
+
+diff --git a/net/ipv6/route.c b/net/ipv6/route.c
+index 987492dcb07ca..bae8ece3e881e 100644
+--- a/net/ipv6/route.c
++++ b/net/ipv6/route.c
+@@ -470,10 +470,10 @@ void fib6_select_path(const struct net *net, struct fib6_result *res,
+ goto out;
+
+ hash = fl6->mp_hash;
+- if (hash <= atomic_read(&first->fib6_nh->fib_nh_upper_bound) &&
+- rt6_score_route(first->fib6_nh, first->fib6_flags, oif,
+- strict) >= 0) {
+- match = first;
++ if (hash <= atomic_read(&first->fib6_nh->fib_nh_upper_bound)) {
++ if (rt6_score_route(first->fib6_nh, first->fib6_flags, oif,
++ strict) >= 0)
++ match = first;
+ goto out;
+ }
+
+--
+2.39.5
+
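As background for the fix above, hash-threshold multipath selection walks
the sibling nexthops and picks the one whose hash region covers the flow
hash, falling back to 'match' when that nexthop's score is negative. A
minimal illustrative sketch of the aligned behavior (not the kernel code;
names and types are simplified):

	struct nh {
		int upper_bound;	/* hash threshold of this nexthop */
		int score;		/* result of rt6_score_route() */
	};

	/* Returns the index of the chosen nexthop, or 'match' as fallback. */
	static int select_path(const struct nh *nhs, int n, int hash, int match)
	{
		int i;

		for (i = 0; i < n; i++) {
			if (hash > nhs[i].upper_bound)
				continue;	/* hash not in this region */
			if (nhs[i].score >= 0)
				return i;	/* usable nexthop */
			return match;		/* negative score: fall back */
		}
		return match;
	}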
--- /dev/null
+From e331ea7d583b0f77e7454003fb9f9010d0258ed2 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 7 Apr 2025 15:05:10 +0200
+Subject: net: ethtool: Don't call .cleanup_data when prepare_data fails
+
+From: Maxime Chevallier <maxime.chevallier@bootlin.com>
+
+[ Upstream commit 4f038a6a02d20859a3479293cbf172b0f14cbdd6 ]
+
+There's a consistent pattern where the .cleanup_data() callback is
+called when .prepare_data() fails, when it should really be called to
+clean up after a successful .prepare_data(), as per the documentation.
+
+Rewrite the error-handling paths to make sure we don't clean up
+un-prepared data.
+
+Fixes: c781ff12a2f3 ("ethtool: Allow network drivers to dump arbitrary EEPROM data")
+Reviewed-by: Kory Maincent <kory.maincent@bootlin.com>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Reviewed-by: Michal Kubecek <mkubecek@suse.cz>
+Signed-off-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
+Link: https://patch.msgid.link/20250407130511.75621-1-maxime.chevallier@bootlin.com
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/ethtool/netlink.c | 8 +++++---
+ 1 file changed, 5 insertions(+), 3 deletions(-)
+
+diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c
+index e233dfc8ca4be..a52be67139d0a 100644
+--- a/net/ethtool/netlink.c
++++ b/net/ethtool/netlink.c
+@@ -490,7 +490,7 @@ static int ethnl_default_doit(struct sk_buff *skb, struct genl_info *info)
+ ret = ops->prepare_data(req_info, reply_data, info);
+ rtnl_unlock();
+ if (ret < 0)
+- goto err_cleanup;
++ goto err_dev;
+ ret = ops->reply_size(req_info, reply_data);
+ if (ret < 0)
+ goto err_cleanup;
+@@ -548,7 +548,7 @@ static int ethnl_default_dump_one(struct sk_buff *skb, struct net_device *dev,
+ ret = ctx->ops->prepare_data(ctx->req_info, ctx->reply_data, info);
+ rtnl_unlock();
+ if (ret < 0)
+- goto out;
++ goto out_cancel;
+ ret = ethnl_fill_reply_header(skb, dev, ctx->ops->hdr_attr);
+ if (ret < 0)
+ goto out;
+@@ -557,6 +557,7 @@ static int ethnl_default_dump_one(struct sk_buff *skb, struct net_device *dev,
+ out:
+ if (ctx->ops->cleanup_data)
+ ctx->ops->cleanup_data(ctx->reply_data);
++out_cancel:
+ ctx->reply_data->dev = NULL;
+ if (ret < 0)
+ genlmsg_cancel(skb, ehdr);
+@@ -760,7 +761,7 @@ static void ethnl_default_notify(struct net_device *dev, unsigned int cmd,
+ ethnl_init_reply_data(reply_data, ops, dev);
+ ret = ops->prepare_data(req_info, reply_data, &info);
+ if (ret < 0)
+- goto err_cleanup;
++ goto err_rep;
+ ret = ops->reply_size(req_info, reply_data);
+ if (ret < 0)
+ goto err_cleanup;
+@@ -795,6 +796,7 @@ static void ethnl_default_notify(struct net_device *dev, unsigned int cmd,
+ err_cleanup:
+ if (ops->cleanup_data)
+ ops->cleanup_data(reply_data);
++err_rep:
+ kfree(reply_data);
+ kfree(req_info);
+ return;
+--
+2.39.5
+
--- /dev/null
+From 8acf93575795d035b9db106a9083490f7daefb5a Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 7 Apr 2025 13:49:52 -0500
+Subject: net: libwx: handle page_pool_dev_alloc_pages error
+
+From: Chenyuan Yang <chenyuan0y@gmail.com>
+
+[ Upstream commit 7f1ff1b38a7c8b872382b796023419d87d78c47e ]
+
+page_pool_dev_alloc_pages() could return NULL. There was a WARN_ON(!page)
+but it would still proceed to use the NULL pointer and then crash.
+
+This is similar to commit 001ba0902046
+("net: fec: handle page_pool_dev_alloc_pages error").
+
+This was found by our static analysis tool, KNighter.
+
+Signed-off-by: Chenyuan Yang <chenyuan0y@gmail.com>
+Fixes: 3c47e8ae113a ("net: libwx: Support to receive packets in NAPI")
+Reviewed-by: Joe Damato <jdamato@fastly.com>
+Link: https://patch.msgid.link/20250407184952.2111299-1-chenyuan0y@gmail.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/wangxun/libwx/wx_lib.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/net/ethernet/wangxun/libwx/wx_lib.c b/drivers/net/ethernet/wangxun/libwx/wx_lib.c
+index 2b3d6586f44a5..71c891d14fb62 100644
+--- a/drivers/net/ethernet/wangxun/libwx/wx_lib.c
++++ b/drivers/net/ethernet/wangxun/libwx/wx_lib.c
+@@ -309,7 +309,8 @@ static bool wx_alloc_mapped_page(struct wx_ring *rx_ring,
+ return true;
+
+ page = page_pool_dev_alloc_pages(rx_ring->page_pool);
+- WARN_ON(!page);
++ if (unlikely(!page))
++ return false;
+ dma = page_pool_get_dma_addr(page);
+
+ bi->page_dma = dma;
+--
+2.39.5
+
--- /dev/null
+From a7e1926fe6383cde19231b413580d47b7139a2f5 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 7 Apr 2025 12:40:42 +0300
+Subject: net: phy: allow MDIO bus PM ops to start/stop state machine for
+ phylink-controlled PHY
+
+From: Vladimir Oltean <vladimir.oltean@nxp.com>
+
+[ Upstream commit fc75ea20ffb452652f0d4033f38fe88d7cfdae35 ]
+
+DSA has 2 kinds of drivers:
+
+1. Those who call dsa_switch_suspend() and dsa_switch_resume() from
+ their device PM ops: qca8k-8xxx, bcm_sf2, microchip ksz
+2. Those who don't: all others. The above methods should be optional.
+
+For type 1, dsa_switch_suspend() calls dsa_user_suspend() -> phylink_stop(),
+and dsa_switch_resume() calls dsa_user_resume() -> phylink_start().
+These seem good candidates for setting mac_managed_pm = true because
+that is essentially its definition [1], but that does not seem to be the
+biggest problem for now, and is not what this change focuses on.
+
+Talking strictly about the 2nd category of DSA drivers here (which
+do not have MAC managed PM, meaning that for their attached PHYs,
+mdio_bus_phy_suspend() and mdio_bus_phy_resume() should run in full),
+I have noticed that the following warning from mdio_bus_phy_resume() is
+triggered:
+
+ WARN_ON(phydev->state != PHY_HALTED && phydev->state != PHY_READY &&
+ phydev->state != PHY_UP);
+
+because the PHY state machine is running.
+
+It's running as a result of a previous dsa_user_open() -> ... ->
+phylink_start() -> phy_start() having been initiated by the user.
+
+The previous mdio_bus_phy_suspend() was supposed to have called
+phy_stop_machine(), but it didn't. So this is why the PHY is in state
+PHY_NOLINK by the time mdio_bus_phy_resume() runs.
+
+mdio_bus_phy_suspend() did not call phy_stop_machine() because for
+phylink, the phydev->adjust_link function pointer is NULL. This seems to be
+a technicality introduced by commit fddd91016d16 ("phylib: fix PAL state
+machine restart on resume"). That commit was written before phylink
+existed, and was intended to avoid crashing with consumer drivers which
+don't use the PHY state machine - phylink always does, when using a PHY.
+But phylink itself has historically not been developed with
+suspend/resume in mind, and apparently not tested too much in that
+scenario, allowing this bug to exist unnoticed for so long. Plus, prior
+to the WARN_ON(), it would have likely been invisible.
+
+This issue is not in fact restricted to type 2 DSA drivers (according to
+the above ad-hoc classification), but can be extrapolated to any MAC
+driver with phylink and MDIO-bus-managed PHY PM ops. DSA is just where
+the issue was reported. Assuming mac_managed_pm is set correctly, a
+quick search indicates the following other drivers might be affected:
+
+$ grep -Zlr PHYLINK_NETDEV drivers/ | xargs -0 grep -L mac_managed_pm
+drivers/net/ethernet/atheros/ag71xx.c
+drivers/net/ethernet/microchip/sparx5/sparx5_main.c
+drivers/net/ethernet/microchip/lan966x/lan966x_main.c
+drivers/net/ethernet/freescale/dpaa2/dpaa2-mac.c
+drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c
+drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
+drivers/net/ethernet/freescale/ucc_geth.c
+drivers/net/ethernet/freescale/enetc/enetc_pf_common.c
+drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
+drivers/net/ethernet/marvell/mvneta.c
+drivers/net/ethernet/marvell/prestera/prestera_main.c
+drivers/net/ethernet/mediatek/mtk_eth_soc.c
+drivers/net/ethernet/altera/altera_tse_main.c
+drivers/net/ethernet/wangxun/txgbe/txgbe_phy.c
+drivers/net/ethernet/meta/fbnic/fbnic_phylink.c
+drivers/net/ethernet/tehuti/tn40_phy.c
+drivers/net/ethernet/mscc/ocelot_net.c
+
+Make the existing conditions dependent on the PHY device having a
+phydev->phy_link_change() implementation equal to the default
+phy_link_change() provided by phylib. Otherwise, we implicitly know that
+the phydev has the phylink-provided phylink_phy_change() callback, and
+when phylink is used, the PHY state machine always needs to be stopped/
+started on the suspend/resume path. The code is structured such that
+if phydev->phy_link_change() is absent, it is only a matter of time until
+the kernel crashes - no need to further complicate the test.
+
+Thus, for the situation where the PM is not managed by the MAC, we will
+make the MDIO bus PM ops treat phylink-controlled PHYs identically to
+phylib-controlled PHYs where an adjust_link() callback is supplied. In
+both cases, the MDIO bus PM ops should stop and restart the
+PHY state machine.
+
+[1] https://lore.kernel.org/netdev/Z-1tiW9zjcoFkhwc@shell.armlinux.org.uk/
+
+Fixes: 744d23c71af3 ("net: phy: Warn about incorrect mdio_bus_phy_resume() state")
+Reported-by: Wei Fang <wei.fang@nxp.com>
+Tested-by: Wei Fang <wei.fang@nxp.com>
+Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
+Link: https://patch.msgid.link/20250407094042.2155633-1-vladimir.oltean@nxp.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/phy/phy_device.c | 31 +++++++++++++++++++++++++++++--
+ 1 file changed, 29 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
+index 44aa67fd544dc..8af44224480f1 100644
+--- a/drivers/net/phy/phy_device.c
++++ b/drivers/net/phy/phy_device.c
+@@ -302,6 +302,33 @@ static void phy_link_change(struct phy_device *phydev, bool up)
+ phydev->mii_ts->link_state(phydev->mii_ts, phydev);
+ }
+
++/**
++ * phy_uses_state_machine - test whether consumer driver uses PAL state machine
++ * @phydev: the target PHY device structure
++ *
++ * Ultimately, this aims to indirectly determine whether the PHY is attached
++ * to a consumer which uses the state machine by calling phy_start() and
++ * phy_stop().
++ *
++ * When the PHY driver consumer uses phylib, it must have previously called
++ * phy_connect_direct() or one of its derivatives, so that phy_prepare_link()
++ * has set up a hook for monitoring state changes.
++ *
++ * When the PHY driver is used by the MAC driver consumer through phylink (the
++ * only other provider of a phy_link_change() method), using the PHY state
++ * machine is not optional.
++ *
++ * Return: true if consumer calls phy_start() and phy_stop(), false otherwise.
++ */
++static bool phy_uses_state_machine(struct phy_device *phydev)
++{
++ if (phydev->phy_link_change == phy_link_change)
++ return phydev->attached_dev && phydev->adjust_link;
++
++ /* phydev->phy_link_change is implicitly phylink_phy_change() */
++ return true;
++}
++
+ static bool mdio_bus_phy_may_suspend(struct phy_device *phydev)
+ {
+ struct device_driver *drv = phydev->mdio.dev.driver;
+@@ -368,7 +395,7 @@ static __maybe_unused int mdio_bus_phy_suspend(struct device *dev)
+ * may call phy routines that try to grab the same lock, and that may
+ * lead to a deadlock.
+ */
+- if (phydev->attached_dev && phydev->adjust_link)
++ if (phy_uses_state_machine(phydev))
+ phy_stop_machine(phydev);
+
+ if (!mdio_bus_phy_may_suspend(phydev))
+@@ -422,7 +449,7 @@ static __maybe_unused int mdio_bus_phy_resume(struct device *dev)
+ }
+ }
+
+- if (phydev->attached_dev && phydev->adjust_link)
++ if (phy_uses_state_machine(phydev))
+ phy_start_machine(phydev);
+
+ return 0;
+--
+2.39.5
+
--- /dev/null
+From f270095ff9e571d3e8fa4125c90a5dc9486efe50 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 7 Apr 2025 12:38:59 +0300
+Subject: net: phy: move phy_link_change() prior to mdio_bus_phy_may_suspend()
+
+From: Vladimir Oltean <vladimir.oltean@nxp.com>
+
+[ Upstream commit f40a673d6b4a128fe95dd9b8c3ed02da50a6a862 ]
+
+In an upcoming change, mdio_bus_phy_may_suspend() will need to
+distinguish a phylib-based PHY client from a phylink PHY client.
+For that, it will need to compare the phydev->phy_link_change() function
+pointer with the eponymous phy_link_change() provided by phylib.
+
+To avoid forward function declarations, the default PHY link state
+change method should be moved upwards. There is no functional change
+associated with this patch; it only exists to reduce the noise in a real
+bug fix.
+
+Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
+Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
+Link: https://patch.msgid.link/20250407093900.2155112-1-vladimir.oltean@nxp.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Stable-dep-of: fc75ea20ffb4 ("net: phy: allow MDIO bus PM ops to start/stop state machine for phylink-controlled PHY")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/phy/phy_device.c | 26 +++++++++++++-------------
+ 1 file changed, 13 insertions(+), 13 deletions(-)
+
+diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
+index 119dfa2d6643a..44aa67fd544dc 100644
+--- a/drivers/net/phy/phy_device.c
++++ b/drivers/net/phy/phy_device.c
+@@ -289,6 +289,19 @@ static bool phy_drv_wol_enabled(struct phy_device *phydev)
+ return wol.wolopts != 0;
+ }
+
++static void phy_link_change(struct phy_device *phydev, bool up)
++{
++ struct net_device *netdev = phydev->attached_dev;
++
++ if (up)
++ netif_carrier_on(netdev);
++ else
++ netif_carrier_off(netdev);
++ phydev->adjust_link(netdev);
++ if (phydev->mii_ts && phydev->mii_ts->link_state)
++ phydev->mii_ts->link_state(phydev->mii_ts, phydev);
++}
++
+ static bool mdio_bus_phy_may_suspend(struct phy_device *phydev)
+ {
+ struct device_driver *drv = phydev->mdio.dev.driver;
+@@ -1101,19 +1114,6 @@ struct phy_device *phy_find_first(struct mii_bus *bus)
+ }
+ EXPORT_SYMBOL(phy_find_first);
+
+-static void phy_link_change(struct phy_device *phydev, bool up)
+-{
+- struct net_device *netdev = phydev->attached_dev;
+-
+- if (up)
+- netif_carrier_on(netdev);
+- else
+- netif_carrier_off(netdev);
+- phydev->adjust_link(netdev);
+- if (phydev->mii_ts && phydev->mii_ts->link_state)
+- phydev->mii_ts->link_state(phydev->mii_ts, phydev);
+-}
+-
+ /**
+ * phy_prepare_link - prepares the PHY layer to monitor link status
+ * @phydev: target phy_device struct
+--
+2.39.5
+
--- /dev/null
+From a70e76abebc12b71419d34a228784e4bb8151991 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 8 Apr 2025 17:55:08 +0200
+Subject: net: ppp: Add bound checking for skb data on ppp_sync_txmung
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Arnaud Lecomte <contact@arnaud-lcm.com>
+
+[ Upstream commit aabc6596ffb377c4c9c8f335124b92ea282c9821 ]
+
+Ensure we have enough data in the skb's linear buffer before accessing
+its initial bytes. This prevents potential out-of-bounds accesses
+when processing short packets.
+
+When ppp_sync_txmunge() receives an incoming packet with an empty
+payload:
+(remote) gef➤ p *(struct pppoe_hdr *) (skb->head + skb->network_header)
+$18 = {
+ type = 0x1,
+ ver = 0x1,
+ code = 0x0,
+ sid = 0x2,
+ length = 0x0,
+ tag = 0xffff8880371cdb96
+}
+
+from the skb struct (trimmed)
+ tail = 0x16,
+ end = 0x140,
+ head = 0xffff88803346f400 "4",
+ data = 0xffff88803346f416 ":\377",
+ truesize = 0x380,
+ len = 0x0,
+ data_len = 0x0,
+ mac_len = 0xe,
+ hdr_len = 0x0,
+
+it is not safe to access data[2].
+
+Reported-by: syzbot+29fc8991b0ecb186cf40@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=29fc8991b0ecb186cf40
+Tested-by: syzbot+29fc8991b0ecb186cf40@syzkaller.appspotmail.com
+Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
+Signed-off-by: Arnaud Lecomte <contact@arnaud-lcm.com>
+Link: https://patch.msgid.link/20250408-bound-checking-ppp_txmung-v2-1-94bb6e1b92d0@arnaud-lcm.com
+[pabeni@redhat.com: fixed subj typo]
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ppp/ppp_synctty.c | 5 +++++
+ 1 file changed, 5 insertions(+)
+
+diff --git a/drivers/net/ppp/ppp_synctty.c b/drivers/net/ppp/ppp_synctty.c
+index 644e99fc3623f..9c4932198931f 100644
+--- a/drivers/net/ppp/ppp_synctty.c
++++ b/drivers/net/ppp/ppp_synctty.c
+@@ -506,6 +506,11 @@ ppp_sync_txmunge(struct syncppp *ap, struct sk_buff *skb)
+ unsigned char *data;
+ int islcp;
+
++ /* Ensure we can safely access protocol field and LCP code */
++ if (!pskb_may_pull(skb, 3)) {
++ kfree_skb(skb);
++ return NULL;
++ }
+ data = skb->data;
+ proto = get_unaligned_be16(data);
+
+--
+2.39.5
+
--- /dev/null
+From ad6c6cd42fdb10155cfa852a97eb6653c6b5891f Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 4 Apr 2025 11:03:33 -0700
+Subject: net: tls: explicitly disallow disconnect
+
+From: Jakub Kicinski <kuba@kernel.org>
+
+[ Upstream commit 5071a1e606b30c0c11278d3c6620cd6a24724cf6 ]
+
+syzbot discovered that it can disconnect a TLS socket and then
+run into all sort of unexpected corner cases. I have a vague
+recollection of Eric pointing this out to us a long time ago.
+Supporting disconnect is really hard, for one thing if offload
+is enabled we'd need to wait for all packets to be _acked_.
+Disconnect is not commonly used, disallow it.
+
+The immediate problem syzbot run into is the warning in the strp,
+but that's just the easiest bug to trigger:
+
+ WARNING: CPU: 0 PID: 5834 at net/tls/tls_strp.c:486 tls_strp_msg_load+0x72e/0xa80 net/tls/tls_strp.c:486
+ RIP: 0010:tls_strp_msg_load+0x72e/0xa80 net/tls/tls_strp.c:486
+ Call Trace:
+ <TASK>
+ tls_rx_rec_wait+0x280/0xa60 net/tls/tls_sw.c:1363
+ tls_sw_recvmsg+0x85c/0x1c30 net/tls/tls_sw.c:2043
+ inet6_recvmsg+0x2c9/0x730 net/ipv6/af_inet6.c:678
+ sock_recvmsg_nosec net/socket.c:1023 [inline]
+ sock_recvmsg+0x109/0x280 net/socket.c:1045
+ __sys_recvfrom+0x202/0x380 net/socket.c:2237
+
+Fixes: 3c4d7559159b ("tls: kernel TLS support")
+Reported-by: syzbot+b4cd76826045a1eb93c1@syzkaller.appspotmail.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Reviewed-by: Eric Dumazet <edumazet@google.com>
+Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
+Link: https://patch.msgid.link/20250404180334.3224206-1-kuba@kernel.org
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/tls/tls_main.c | 6 ++++++
+ 1 file changed, 6 insertions(+)
+
+diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
+index 6b4b9f2749a6f..0acf313deb01f 100644
+--- a/net/tls/tls_main.c
++++ b/net/tls/tls_main.c
+@@ -809,6 +809,11 @@ static int tls_setsockopt(struct sock *sk, int level, int optname,
+ return do_tls_setsockopt(sk, optname, optval, optlen);
+ }
+
++static int tls_disconnect(struct sock *sk, int flags)
++{
++ return -EOPNOTSUPP;
++}
++
+ struct tls_context *tls_ctx_create(struct sock *sk)
+ {
+ struct inet_connection_sock *icsk = inet_csk(sk);
+@@ -904,6 +909,7 @@ static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG],
+ prot[TLS_BASE][TLS_BASE] = *base;
+ prot[TLS_BASE][TLS_BASE].setsockopt = tls_setsockopt;
+ prot[TLS_BASE][TLS_BASE].getsockopt = tls_getsockopt;
++ prot[TLS_BASE][TLS_BASE].disconnect = tls_disconnect;
+ prot[TLS_BASE][TLS_BASE].close = tls_sk_proto_close;
+
+ prot[TLS_SW][TLS_BASE] = prot[TLS_BASE][TLS_BASE];
+--
+2.39.5
+
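For context, userspace reaches the protocol's ->disconnect() handler by
calling connect() with an AF_UNSPEC address on a connected TCP socket;
with this patch, doing that on a socket with the TLS ULP attached is
expected to fail with EOPNOTSUPP instead of leaving the TLS state in an
undefined condition. A hedged userspace sketch (error handling trimmed):

	#include <sys/socket.h>
	#include <string.h>
	#include <errno.h>
	#include <stdio.h>

	/* 'fd' is assumed to be a connected TCP socket with TLS already
	 * set up via setsockopt(SOL_TCP, TCP_ULP, "tls") and TLS_TX/TLS_RX.
	 */
	static void try_tls_disconnect(int fd)
	{
		struct sockaddr sa;

		memset(&sa, 0, sizeof(sa));
		sa.sa_family = AF_UNSPEC;	/* requests a disconnect */

		if (connect(fd, &sa, sizeof(sa)) < 0 && errno == EOPNOTSUPP)
			fprintf(stderr, "TLS disconnect rejected, as expected\n");
	}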
--- /dev/null
+From 53379af57776b65e79b2bc87e3d3993ed71da68c Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 7 Apr 2025 13:24:08 -0700
+Subject: net_sched: sch_sfq: move the limit validation
+
+From: Octavian Purdila <tavip@google.com>
+
+[ Upstream commit b3bf8f63e6179076b57c9de660c9f80b5abefe70 ]
+
+It is not sufficient to validate the limit directly on the data that
+the user passes, as it can be updated based on how the other parameters
+are changed.
+
+Move the check to the end of the configuration update process to also
+catch scenarios where the limit is indirectly updated, for example
+with the following configurations:
+
+tc qdisc add dev dummy0 handle 1: root sfq limit 2 flows 1 depth 1
+tc qdisc add dev dummy0 handle 1: root sfq limit 2 flows 1 divisor 1
+
+This fixes the following syzkaller reported crash:
+
+------------[ cut here ]------------
+UBSAN: array-index-out-of-bounds in net/sched/sch_sfq.c:203:6
+index 65535 is out of range for type 'struct sfq_head[128]'
+CPU: 1 UID: 0 PID: 3037 Comm: syz.2.16 Not tainted 6.14.0-rc2-syzkaller #0
+Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 12/27/2024
+Call Trace:
+ <TASK>
+ __dump_stack lib/dump_stack.c:94 [inline]
+ dump_stack_lvl+0x201/0x300 lib/dump_stack.c:120
+ ubsan_epilogue lib/ubsan.c:231 [inline]
+ __ubsan_handle_out_of_bounds+0xf5/0x120 lib/ubsan.c:429
+ sfq_link net/sched/sch_sfq.c:203 [inline]
+ sfq_dec+0x53c/0x610 net/sched/sch_sfq.c:231
+ sfq_dequeue+0x34e/0x8c0 net/sched/sch_sfq.c:493
+ sfq_reset+0x17/0x60 net/sched/sch_sfq.c:518
+ qdisc_reset+0x12e/0x600 net/sched/sch_generic.c:1035
+ tbf_reset+0x41/0x110 net/sched/sch_tbf.c:339
+ qdisc_reset+0x12e/0x600 net/sched/sch_generic.c:1035
+ dev_reset_queue+0x100/0x1b0 net/sched/sch_generic.c:1311
+ netdev_for_each_tx_queue include/linux/netdevice.h:2590 [inline]
+ dev_deactivate_many+0x7e5/0xe70 net/sched/sch_generic.c:1375
+
+Reported-by: syzbot <syzkaller@googlegroups.com>
+Fixes: 10685681bafc ("net_sched: sch_sfq: don't allow 1 packet limit")
+Signed-off-by: Octavian Purdila <tavip@google.com>
+Acked-by: Cong Wang <xiyou.wangcong@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/sched/sch_sfq.c | 10 ++++++----
+ 1 file changed, 6 insertions(+), 4 deletions(-)
+
+diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
+index 7714ae94e0521..58b42dcf8f201 100644
+--- a/net/sched/sch_sfq.c
++++ b/net/sched/sch_sfq.c
+@@ -661,10 +661,6 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt,
+ if (!p)
+ return -ENOMEM;
+ }
+- if (ctl->limit == 1) {
+- NL_SET_ERR_MSG_MOD(extack, "invalid limit");
+- return -EINVAL;
+- }
+
+ sch_tree_lock(sch);
+
+@@ -705,6 +701,12 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt,
+ limit = min_t(u32, ctl->limit, maxdepth * maxflows);
+ maxflows = min_t(u32, maxflows, limit);
+ }
++ if (limit == 1) {
++ sch_tree_unlock(sch);
++ kfree(p);
++ NL_SET_ERR_MSG_MOD(extack, "invalid limit");
++ return -EINVAL;
++ }
+
+ /* commit configuration */
+ q->limit = limit;
+--
+2.39.5
+
--- /dev/null
+From 9c3fd40dd511f0c20dd105609fbc09f935622f4e Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 7 Apr 2025 13:24:07 -0700
+Subject: net_sched: sch_sfq: use a temporary work area for validating
+ configuration
+
+From: Octavian Purdila <tavip@google.com>
+
+[ Upstream commit 8c0cea59d40cf6dd13c2950437631dd614fbade6 ]
+
+Many configuration parameters influence others (e.g. divisor
+-> flows -> limit, depth -> limit), so it is difficult to correctly
+do all of the validation before applying the configuration. And if a
+validation error is detected late, it is difficult to roll back a
+partially applied configuration.
+
+To avoid these issues, use a temporary work area to update and validate
+the configuration, and only then apply it to the internal state.
+
+Signed-off-by: Octavian Purdila <tavip@google.com>
+Acked-by: Cong Wang <xiyou.wangcong@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Stable-dep-of: b3bf8f63e617 ("net_sched: sch_sfq: move the limit validation")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/sched/sch_sfq.c | 56 +++++++++++++++++++++++++++++++++++----------
+ 1 file changed, 44 insertions(+), 12 deletions(-)
+
+diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
+index 65d5b59da5830..7714ae94e0521 100644
+--- a/net/sched/sch_sfq.c
++++ b/net/sched/sch_sfq.c
+@@ -631,6 +631,15 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt,
+ struct red_parms *p = NULL;
+ struct sk_buff *to_free = NULL;
+ struct sk_buff *tail = NULL;
++ unsigned int maxflows;
++ unsigned int quantum;
++ unsigned int divisor;
++ int perturb_period;
++ u8 headdrop;
++ u8 maxdepth;
++ int limit;
++ u8 flags;
++
+
+ if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
+ return -EINVAL;
+@@ -656,36 +665,59 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt,
+ NL_SET_ERR_MSG_MOD(extack, "invalid limit");
+ return -EINVAL;
+ }
++
+ sch_tree_lock(sch);
++
++ limit = q->limit;
++ divisor = q->divisor;
++ headdrop = q->headdrop;
++ maxdepth = q->maxdepth;
++ maxflows = q->maxflows;
++ perturb_period = q->perturb_period;
++ quantum = q->quantum;
++ flags = q->flags;
++
++ /* update and validate configuration */
+ if (ctl->quantum)
+- q->quantum = ctl->quantum;
+- WRITE_ONCE(q->perturb_period, ctl->perturb_period * HZ);
++ quantum = ctl->quantum;
++ perturb_period = ctl->perturb_period * HZ;
+ if (ctl->flows)
+- q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
++ maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
+ if (ctl->divisor) {
+- q->divisor = ctl->divisor;
+- q->maxflows = min_t(u32, q->maxflows, q->divisor);
++ divisor = ctl->divisor;
++ maxflows = min_t(u32, maxflows, divisor);
+ }
+ if (ctl_v1) {
+ if (ctl_v1->depth)
+- q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
++ maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
+ if (p) {
+- swap(q->red_parms, p);
+- red_set_parms(q->red_parms,
++ red_set_parms(p,
+ ctl_v1->qth_min, ctl_v1->qth_max,
+ ctl_v1->Wlog,
+ ctl_v1->Plog, ctl_v1->Scell_log,
+ NULL,
+ ctl_v1->max_P);
+ }
+- q->flags = ctl_v1->flags;
+- q->headdrop = ctl_v1->headdrop;
++ flags = ctl_v1->flags;
++ headdrop = ctl_v1->headdrop;
+ }
+ if (ctl->limit) {
+- q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows);
+- q->maxflows = min_t(u32, q->maxflows, q->limit);
++ limit = min_t(u32, ctl->limit, maxdepth * maxflows);
++ maxflows = min_t(u32, maxflows, limit);
+ }
+
++ /* commit configuration */
++ q->limit = limit;
++ q->divisor = divisor;
++ q->headdrop = headdrop;
++ q->maxdepth = maxdepth;
++ q->maxflows = maxflows;
++ WRITE_ONCE(q->perturb_period, perturb_period);
++ q->quantum = quantum;
++ q->flags = flags;
++ if (p)
++ swap(q->red_parms, p);
++
+ qlen = sch->q.qlen;
+ while (sch->q.qlen > q->limit) {
+ dropped += sfq_drop(sch, &to_free);
+--
+2.39.5
+
--- /dev/null
+From 5bac21ef679581de6ad46ebd929720c6dec12e8f Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 7 Apr 2025 19:40:18 +0200
+Subject: nft_set_pipapo: fix incorrect avx2 match of 5th field octet
+
+From: Florian Westphal <fw@strlen.de>
+
+[ Upstream commit e042ed950d4e176379ba4c0722146cd96fb38aa2 ]
+
+Given a set element like:
+
+ icmpv6 . dead:beef:00ff::1
+
+The value of 'ff' is irrelevant; any address will be matched
+as long as the other octets are the same.
+
+This is because of too-early register clobbering:
+ymm7 is reloaded with new packet data (pkt[9]) but it still holds data
+of an earlier load that wasn't processed yet.
+
+The existing nft_concat_range.sh selftests do exercise this code
+path, but do not trigger incorrect matching due to the network prefix
+limitation.
+
+Fixes: 7400b063969b ("nft_set_pipapo: Introduce AVX2-based lookup implementation")
+Reported-by: sontu mazumdar <sontu21@gmail.com>
+Closes: https://lore.kernel.org/netfilter/CANgxkqwnMH7fXra+VUfODT-8+qFLgskq3set1cAzqqJaV4iEZg@mail.gmail.com/T/#t
+Reviewed-by: Stefano Brivio <sbrivio@redhat.com>
+Signed-off-by: Florian Westphal <fw@strlen.de>
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/netfilter/nft_set_pipapo_avx2.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c
+index b8d3c3213efee..c15db28c5ebc4 100644
+--- a/net/netfilter/nft_set_pipapo_avx2.c
++++ b/net/netfilter/nft_set_pipapo_avx2.c
+@@ -994,8 +994,9 @@ static int nft_pipapo_avx2_lookup_8b_16(unsigned long *map, unsigned long *fill,
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(5, lt, 8, pkt[8], bsize);
+
+ NFT_PIPAPO_AVX2_AND(6, 2, 3);
++ NFT_PIPAPO_AVX2_AND(3, 4, 7);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt, 9, pkt[9], bsize);
+- NFT_PIPAPO_AVX2_AND(0, 4, 5);
++ NFT_PIPAPO_AVX2_AND(0, 3, 5);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 10, pkt[10], bsize);
+ NFT_PIPAPO_AVX2_AND(2, 6, 7);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 11, pkt[11], bsize);
+--
+2.39.5
+
--- /dev/null
+From 5c96383b2bca8bb9b22ff8a36ca14a441e27b4de Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 8 Apr 2025 17:29:03 +0200
+Subject: nvmet-fcloop: swap list_add_tail arguments
+
+From: Daniel Wagner <wagi@kernel.org>
+
+[ Upstream commit 2b5f0c5bc819af2b0759a8fcddc1b39102735c0f ]
+
+The newly element to be added to the list is the first argument of
+list_add_tail. This fix is missing dcfad4ab4d67 ("nvmet-fcloop: swap
+the list_add_tail arguments").
+
+Fixes: 437c0b824dbd ("nvme-fcloop: add target to host LS request support")
+Signed-off-by: Daniel Wagner <wagi@kernel.org>
+Reviewed-by: Hannes Reinecke <hare@suse.de>
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/nvme/target/fcloop.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c
+index e1abb27927ff7..da195d61a9664 100644
+--- a/drivers/nvme/target/fcloop.c
++++ b/drivers/nvme/target/fcloop.c
+@@ -478,7 +478,7 @@ fcloop_t2h_xmt_ls_rsp(struct nvme_fc_local_port *localport,
+ if (targetport) {
+ tport = targetport->private;
+ spin_lock(&tport->lock);
+- list_add_tail(&tport->ls_list, &tls_req->ls_list);
++ list_add_tail(&tls_req->ls_list, &tport->ls_list);
+ spin_unlock(&tport->lock);
+ queue_work(nvmet_wq, &tport->ls_work);
+ }
+--
+2.39.5
+
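As a reminder of the list API contract involved: list_add_tail() takes the
new entry as its first argument and the list head as its second, so the
original call threaded the port's list head onto the request instead of
queueing the request on the port. A minimal sketch of the corrected usage
(the struct name here is illustrative):

	#include <linux/list.h>

	struct fcloop_lsreq_sketch {
		struct list_head ls_list;	/* links the request into a port list */
	};

	static void queue_ls_req(struct fcloop_lsreq_sketch *req,
				 struct list_head *port_ls_list)
	{
		/* void list_add_tail(struct list_head *new, struct list_head *head)
		 * 'new' is the element being inserted, 'head' is the list it joins.
		 */
		list_add_tail(&req->ls_list, port_ls_list);
	}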
--- /dev/null
+From 5da3d7e8191dfcc760bf8991c745dda3f89b32eb Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 8 Apr 2025 00:02:13 -0700
+Subject: objtool: Fix INSN_CONTEXT_SWITCH handling in validate_unret()
+
+From: Josh Poimboeuf <jpoimboe@kernel.org>
+
+[ Upstream commit a8df7d0ef92eca28c610206c6748daf537ac0586 ]
+
+The !CONFIG_IA32_EMULATION version of xen_entry_SYSCALL_compat() ends
+with a SYSCALL instruction which is classified by objtool as
+INSN_CONTEXT_SWITCH.
+
+Unlike validate_branch(), validate_unret() doesn't consider
+INSN_CONTEXT_SWITCH in a non-function to be a dead end, so it keeps
+going past the end of xen_entry_SYSCALL_compat(), resulting in the
+following warning:
+
+ vmlinux.o: warning: objtool: xen_reschedule_interrupt+0x2a: RET before UNTRAIN
+
+Fix that by adding INSN_CONTEXT_SWITCH handling to validate_unret() to
+match what validate_branch() is already doing.
+
+Fixes: a09a6e2399ba ("objtool: Add entry UNRET validation")
+Reported-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Link: https://lore.kernel.org/r/f5eda46fd09f15b1f5cde3d9ae3b92b958342add.1744095216.git.jpoimboe@kernel.org
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ tools/objtool/check.c | 5 +++++
+ 1 file changed, 5 insertions(+)
+
+diff --git a/tools/objtool/check.c b/tools/objtool/check.c
+index 286a2c0af02aa..127862fa05c61 100644
+--- a/tools/objtool/check.c
++++ b/tools/objtool/check.c
+@@ -3990,6 +3990,11 @@ static int validate_unret(struct objtool_file *file, struct instruction *insn)
+ WARN_INSN(insn, "RET before UNTRAIN");
+ return 1;
+
++ case INSN_CONTEXT_SWITCH:
++ if (insn_func(insn))
++ break;
++ return 0;
++
+ case INSN_NOP:
+ if (insn->retpoline_safe)
+ return 0;
+--
+2.39.5
+
--- /dev/null
+From 8de7e6196270f78cc45e5e8032b0314ab14388c0 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 7 Apr 2025 12:33:41 +0530
+Subject: octeontx2-pf: qos: fix VF root node parent queue index
+
+From: Hariprasad Kelam <hkelam@marvell.com>
+
+[ Upstream commit b7db94734e785e380b0db0f9295e07024f4d42a0 ]
+
+The current code configures the Physical Function (PF) root node at TL1
+and the Virtual Function (VF) root node at TL2.
+
+This ensures that, at any given point in time, PF traffic gets higher priority.
+
+ PF root node
+ TL1
+ / \
+ TL2 TL2 VF root node
+ / \
+ TL3 TL3
+ / \
+ TL4 TL4
+ / \
+ SMQ SMQ
+
+Due to a bug in the current code, the TL2 parent queue index on the
+VF interface is not being configured, leading to 'SMQ Flush' errors.
+
+Fixes: 5e6808b4c68d ("octeontx2-pf: Add support for HTB offload")
+Signed-off-by: Hariprasad Kelam <hkelam@marvell.com>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Link: https://patch.msgid.link/20250407070341.2765426-1-hkelam@marvell.com
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/marvell/octeontx2/nic/qos.c | 5 +++++
+ 1 file changed, 5 insertions(+)
+
+diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/qos.c b/drivers/net/ethernet/marvell/octeontx2/nic/qos.c
+index 0f844c14485a0..35acc07bd9648 100644
+--- a/drivers/net/ethernet/marvell/octeontx2/nic/qos.c
++++ b/drivers/net/ethernet/marvell/octeontx2/nic/qos.c
+@@ -165,6 +165,11 @@ static void __otx2_qos_txschq_cfg(struct otx2_nic *pfvf,
+
+ otx2_config_sched_shaping(pfvf, node, cfg, &num_regs);
+ } else if (level == NIX_TXSCH_LVL_TL2) {
++ /* configure parent txschq */
++ cfg->reg[num_regs] = NIX_AF_TL2X_PARENT(node->schq);
++ cfg->regval[num_regs] = (u64)hw->tx_link << 16;
++ num_regs++;
++
+ /* configure link cfg */
+ if (level == pfvf->qos.link_cfg_lvl) {
+ cfg->reg[num_regs] = NIX_AF_TL3_TL2X_LINKX_CFG(node->schq, hw->tx_link);
+--
+2.39.5
+
--- /dev/null
+From e8cab07dfefee17286ace53dc41dbac24c4ed0ab Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 22 Oct 2024 18:59:08 +0300
+Subject: perf/core: Add aux_pause, aux_resume, aux_start_paused
+
+From: Adrian Hunter <adrian.hunter@intel.com>
+
+[ Upstream commit 18d92bb57c39504d9da11c6ef604f58eb1d5a117 ]
+
+Hardware traces, such as instruction traces, can produce a vast amount of
+trace data, so being able to reduce tracing to more specific circumstances
+can be useful.
+
+The ability to pause or resume tracing when another event happens can do
+that.
+
+Add the ability for an event to "pause" or "resume" AUX area tracing.
+
+Add aux_pause bit to perf_event_attr to indicate that, if the event
+happens, the associated AUX area tracing should be paused. Ditto
+aux_resume. Do not allow aux_pause and aux_resume to be set together.
+
+Add aux_start_paused bit to perf_event_attr to indicate to an AUX area
+event that it should start in a "paused" state.
+
+Add aux_paused to struct hw_perf_event for AUX area events to keep track of
+the "paused" state. aux_paused is initialized to aux_start_paused.
+
+Add PERF_EF_PAUSE and PERF_EF_RESUME modes for ->stop() and ->start()
+callbacks. Call as needed, during __perf_event_output(). Add
+aux_in_pause_resume to struct perf_buffer to prevent races with the NMI
+handler. Pause/resume in NMI context will miss out if it coincides with
+another pause/resume.
+
+To use aux_pause or aux_resume, an event must be in a group with the AUX
+area event as the group leader.
+
+Example (requires Intel PT and tools patches also):
+
+ $ perf record --kcore -e intel_pt/aux-action=start-paused/k,syscalls:sys_enter_newuname/aux-action=resume/,syscalls:sys_exit_newuname/aux-action=pause/ uname
+ Linux
+ [ perf record: Woken up 1 times to write data ]
+ [ perf record: Captured and wrote 0.043 MB perf.data ]
+ $ perf script --call-trace
+ uname 30805 [000] 24001.058782799: name: 0x7ffc9c1865b0
+ uname 30805 [000] 24001.058784424: psb offs: 0
+ uname 30805 [000] 24001.058784424: cbr: 39 freq: 3904 MHz (139%)
+ uname 30805 [000] 24001.058784629: ([kernel.kallsyms]) debug_smp_processor_id
+ uname 30805 [000] 24001.058784629: ([kernel.kallsyms]) __x64_sys_newuname
+ uname 30805 [000] 24001.058784629: ([kernel.kallsyms]) down_read
+ uname 30805 [000] 24001.058784629: ([kernel.kallsyms]) __cond_resched
+ uname 30805 [000] 24001.058784629: ([kernel.kallsyms]) preempt_count_add
+ uname 30805 [000] 24001.058784629: ([kernel.kallsyms]) in_lock_functions
+ uname 30805 [000] 24001.058784629: ([kernel.kallsyms]) preempt_count_sub
+ uname 30805 [000] 24001.058784629: ([kernel.kallsyms]) up_read
+ uname 30805 [000] 24001.058784629: ([kernel.kallsyms]) preempt_count_add
+ uname 30805 [000] 24001.058784838: ([kernel.kallsyms]) in_lock_functions
+ uname 30805 [000] 24001.058784838: ([kernel.kallsyms]) preempt_count_sub
+ uname 30805 [000] 24001.058784838: ([kernel.kallsyms]) _copy_to_user
+ uname 30805 [000] 24001.058784838: ([kernel.kallsyms]) syscall_exit_to_user_mode
+ uname 30805 [000] 24001.058784838: ([kernel.kallsyms]) syscall_exit_work
+ uname 30805 [000] 24001.058784838: ([kernel.kallsyms]) perf_syscall_exit
+ uname 30805 [000] 24001.058784838: ([kernel.kallsyms]) debug_smp_processor_id
+ uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) perf_trace_buf_alloc
+ uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) perf_swevent_get_recursion_context
+ uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) debug_smp_processor_id
+ uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) debug_smp_processor_id
+ uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) perf_tp_event
+ uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) perf_trace_buf_update
+ uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) tracing_gen_ctx_irq_test
+ uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) perf_swevent_event
+ uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) __perf_event_account_interrupt
+ uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) __this_cpu_preempt_check
+ uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) perf_event_output_forward
+ uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) perf_event_aux_pause
+ uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) ring_buffer_get
+ uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) __rcu_read_lock
+ uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) __rcu_read_unlock
+ uname 30805 [000] 24001.058785254: ([kernel.kallsyms]) pt_event_stop
+ uname 30805 [000] 24001.058785254: ([kernel.kallsyms]) debug_smp_processor_id
+ uname 30805 [000] 24001.058785254: ([kernel.kallsyms]) debug_smp_processor_id
+ uname 30805 [000] 24001.058785254: ([kernel.kallsyms]) native_write_msr
+ uname 30805 [000] 24001.058785463: ([kernel.kallsyms]) native_write_msr
+ uname 30805 [000] 24001.058785639: 0x0
+
+Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Acked-by: James Clark <james.clark@arm.com>
+Link: https://lkml.kernel.org/r/20241022155920.17511-3-adrian.hunter@intel.com
+Stable-dep-of: 56799bc03565 ("perf: Fix hang while freeing sigtrap event")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/linux/perf_event.h | 28 ++++++++++++
+ include/uapi/linux/perf_event.h | 11 ++++-
+ kernel/events/core.c | 75 +++++++++++++++++++++++++++++++--
+ kernel/events/internal.h | 1 +
+ 4 files changed, 110 insertions(+), 5 deletions(-)
+
+diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
+index 347901525a46a..19551d664bce2 100644
+--- a/include/linux/perf_event.h
++++ b/include/linux/perf_event.h
+@@ -170,6 +170,12 @@ struct hw_perf_event {
+ };
+ struct { /* aux / Intel-PT */
+ u64 aux_config;
++ /*
++ * For AUX area events, aux_paused cannot be a state
++ * flag because it can be updated asynchronously to
++ * state.
++ */
++ unsigned int aux_paused;
+ };
+ struct { /* software */
+ struct hrtimer hrtimer;
+@@ -294,6 +300,7 @@ struct perf_event_pmu_context;
+ #define PERF_PMU_CAP_NO_EXCLUDE 0x0040
+ #define PERF_PMU_CAP_AUX_OUTPUT 0x0080
+ #define PERF_PMU_CAP_EXTENDED_HW_TYPE 0x0100
++#define PERF_PMU_CAP_AUX_PAUSE 0x0200
+
+ /**
+ * pmu::scope
+@@ -384,6 +391,8 @@ struct pmu {
+ #define PERF_EF_START 0x01 /* start the counter when adding */
+ #define PERF_EF_RELOAD 0x02 /* reload the counter when starting */
+ #define PERF_EF_UPDATE 0x04 /* update the counter when stopping */
++#define PERF_EF_PAUSE 0x08 /* AUX area event, pause tracing */
++#define PERF_EF_RESUME 0x10 /* AUX area event, resume tracing */
+
+ /*
+ * Adds/Removes a counter to/from the PMU, can be done inside a
+@@ -423,6 +432,18 @@ struct pmu {
+ *
+ * ->start() with PERF_EF_RELOAD will reprogram the counter
+ * value, must be preceded by a ->stop() with PERF_EF_UPDATE.
++ *
++ * ->stop() with PERF_EF_PAUSE will stop as simply as possible. Will not
++ * overlap another ->stop() with PERF_EF_PAUSE nor ->start() with
++ * PERF_EF_RESUME.
++ *
++ * ->start() with PERF_EF_RESUME will start as simply as possible but
++ * only if the counter is not otherwise stopped. Will not overlap
++ * another ->start() with PERF_EF_RESUME nor ->stop() with
++ * PERF_EF_PAUSE.
++ *
++ * Notably, PERF_EF_PAUSE/PERF_EF_RESUME *can* be concurrent with other
++ * ->stop()/->start() invocations, just not itself.
+ */
+ void (*start) (struct perf_event *event, int flags);
+ void (*stop) (struct perf_event *event, int flags);
+@@ -1685,6 +1706,13 @@ static inline bool has_aux(struct perf_event *event)
+ return event->pmu->setup_aux;
+ }
+
++static inline bool has_aux_action(struct perf_event *event)
++{
++ return event->attr.aux_sample_size ||
++ event->attr.aux_pause ||
++ event->attr.aux_resume;
++}
++
+ static inline bool is_write_backward(struct perf_event *event)
+ {
+ return !!event->attr.write_backward;
+diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
+index 4842c36fdf801..0524d541d4e3d 100644
+--- a/include/uapi/linux/perf_event.h
++++ b/include/uapi/linux/perf_event.h
+@@ -511,7 +511,16 @@ struct perf_event_attr {
+ __u16 sample_max_stack;
+ __u16 __reserved_2;
+ __u32 aux_sample_size;
+- __u32 __reserved_3;
++
++ union {
++ __u32 aux_action;
++ struct {
++ __u32 aux_start_paused : 1, /* start AUX area tracing paused */
++ aux_pause : 1, /* on overflow, pause AUX area tracing */
++ aux_resume : 1, /* on overflow, resume AUX area tracing */
++ __reserved_3 : 29;
++ };
++ };
+
+ /*
+ * User provided data if sigtrap=1, passed back to user via
+diff --git a/kernel/events/core.c b/kernel/events/core.c
+index b5ccf52bb71ba..bee6f88d0556b 100644
+--- a/kernel/events/core.c
++++ b/kernel/events/core.c
+@@ -2146,7 +2146,7 @@ static void perf_put_aux_event(struct perf_event *event)
+
+ static bool perf_need_aux_event(struct perf_event *event)
+ {
+- return !!event->attr.aux_output || !!event->attr.aux_sample_size;
++ return event->attr.aux_output || has_aux_action(event);
+ }
+
+ static int perf_get_aux_event(struct perf_event *event,
+@@ -2171,6 +2171,10 @@ static int perf_get_aux_event(struct perf_event *event,
+ !perf_aux_output_match(event, group_leader))
+ return 0;
+
++ if ((event->attr.aux_pause || event->attr.aux_resume) &&
++ !(group_leader->pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE))
++ return 0;
++
+ if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux)
+ return 0;
+
+@@ -8029,6 +8033,49 @@ void perf_prepare_header(struct perf_event_header *header,
+ WARN_ON_ONCE(header->size & 7);
+ }
+
++static void __perf_event_aux_pause(struct perf_event *event, bool pause)
++{
++ if (pause) {
++ if (!event->hw.aux_paused) {
++ event->hw.aux_paused = 1;
++ event->pmu->stop(event, PERF_EF_PAUSE);
++ }
++ } else {
++ if (event->hw.aux_paused) {
++ event->hw.aux_paused = 0;
++ event->pmu->start(event, PERF_EF_RESUME);
++ }
++ }
++}
++
++static void perf_event_aux_pause(struct perf_event *event, bool pause)
++{
++ struct perf_buffer *rb;
++
++ if (WARN_ON_ONCE(!event))
++ return;
++
++ rb = ring_buffer_get(event);
++ if (!rb)
++ return;
++
++ scoped_guard (irqsave) {
++ /*
++ * Guard against self-recursion here. Another event could trip
++ * this same from NMI context.
++ */
++ if (READ_ONCE(rb->aux_in_pause_resume))
++ break;
++
++ WRITE_ONCE(rb->aux_in_pause_resume, 1);
++ barrier();
++ __perf_event_aux_pause(event, pause);
++ barrier();
++ WRITE_ONCE(rb->aux_in_pause_resume, 0);
++ }
++ ring_buffer_put(rb);
++}
++
+ static __always_inline int
+ __perf_event_output(struct perf_event *event,
+ struct perf_sample_data *data,
+@@ -9832,9 +9879,12 @@ static int __perf_event_overflow(struct perf_event *event,
+
+ ret = __perf_event_account_interrupt(event, throttle);
+
++ if (event->attr.aux_pause)
++ perf_event_aux_pause(event->aux_event, true);
++
+ if (event->prog && event->prog->type == BPF_PROG_TYPE_PERF_EVENT &&
+ !bpf_overflow_handler(event, data, regs))
+- return ret;
++ goto out;
+
+ /*
+ * XXX event_limit might not quite work as expected on inherited
+@@ -9896,6 +9946,9 @@ static int __perf_event_overflow(struct perf_event *event,
+ event->pending_wakeup = 1;
+ irq_work_queue(&event->pending_irq);
+ }
++out:
++ if (event->attr.aux_resume)
++ perf_event_aux_pause(event->aux_event, false);
+
+ return ret;
+ }
+@@ -12312,11 +12365,25 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
+ }
+
+ if (event->attr.aux_output &&
+- !(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT)) {
++ (!(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT) ||
++ event->attr.aux_pause || event->attr.aux_resume)) {
+ err = -EOPNOTSUPP;
+ goto err_pmu;
+ }
+
++ if (event->attr.aux_pause && event->attr.aux_resume) {
++ err = -EINVAL;
++ goto err_pmu;
++ }
++
++ if (event->attr.aux_start_paused) {
++ if (!(pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE)) {
++ err = -EOPNOTSUPP;
++ goto err_pmu;
++ }
++ event->hw.aux_paused = 1;
++ }
++
+ if (cgroup_fd != -1) {
+ err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
+ if (err)
+@@ -13112,7 +13179,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
+ * Grouping is not supported for kernel events, neither is 'AUX',
+ * make sure the caller's intentions are adjusted.
+ */
+- if (attr->aux_output)
++ if (attr->aux_output || attr->aux_action)
+ return ERR_PTR(-EINVAL);
+
+ event = perf_event_alloc(attr, cpu, task, NULL, NULL,
+diff --git a/kernel/events/internal.h b/kernel/events/internal.h
+index e072d995d670f..249288d82b8dc 100644
+--- a/kernel/events/internal.h
++++ b/kernel/events/internal.h
+@@ -52,6 +52,7 @@ struct perf_buffer {
+ void (*free_aux)(void *);
+ refcount_t aux_refcount;
+ int aux_in_sampling;
++ int aux_in_pause_resume;
+ void **aux_pages;
+ void *aux_priv;
+
+--
+2.39.5
+
--- /dev/null
+From ce9c49bfdf59d19fbfd78cc899cf08c158f93847 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 4 Nov 2024 14:39:13 +0100
+Subject: perf/core: Simplify the perf_event_alloc() error path
+
+From: Peter Zijlstra <peterz@infradead.org>
+
+[ Upstream commit c70ca298036c58a88686ff388d3d367e9d21acf0 ]
+
+The error cleanup sequence in perf_event_alloc() is a subset of the
+existing _free_event() function (it must of course be).
+
+Split this out into __free_event() and simplify the error path.
+
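+The result follows a common idiom: record what has been set up in
+attach_state-style flags and have a single teardown routine undo only
+those pieces, so every error site can simply 'goto err'. A standalone
+sketch of that idiom (plain C, not the kernel code):
+
+   #include <stdlib.h>
+
+   struct obj { void *a; void *b; unsigned int state; };
+   enum { HAS_A = 0x1, HAS_B = 0x2 };
+
+   static void obj_free(struct obj *o)  /* plays the role of __free_event() */
+   {
+           if (o->state & HAS_B)
+                   free(o->b);
+           if (o->state & HAS_A)
+                   free(o->a);
+           free(o);
+   }
+
+   static struct obj *obj_alloc(void)
+   {
+           struct obj *o = calloc(1, sizeof(*o));
+
+           if (!o)
+                   return NULL;
+           o->a = malloc(16);
+           if (!o->a)
+                   goto err;
+           o->state |= HAS_A;
+           o->b = malloc(16);
+           if (!o->b)
+                   goto err;
+           o->state |= HAS_B;
+           return o;
+   err:
+           obj_free(o);               /* one exit path for every failure */
+           return NULL;
+   }
+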
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Reviewed-by: Ravi Bangoria <ravi.bangoria@amd.com>
+Link: https://lore.kernel.org/r/20241104135517.967889521@infradead.org
+Stable-dep-of: 56799bc03565 ("perf: Fix hang while freeing sigtrap event")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/linux/perf_event.h | 16 +++--
+ kernel/events/core.c | 138 ++++++++++++++++++-------------------
+ 2 files changed, 78 insertions(+), 76 deletions(-)
+
+diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
+index 19551d664bce2..db6d281644447 100644
+--- a/include/linux/perf_event.h
++++ b/include/linux/perf_event.h
+@@ -673,13 +673,15 @@ struct swevent_hlist {
+ struct rcu_head rcu_head;
+ };
+
+-#define PERF_ATTACH_CONTEXT 0x01
+-#define PERF_ATTACH_GROUP 0x02
+-#define PERF_ATTACH_TASK 0x04
+-#define PERF_ATTACH_TASK_DATA 0x08
+-#define PERF_ATTACH_ITRACE 0x10
+-#define PERF_ATTACH_SCHED_CB 0x20
+-#define PERF_ATTACH_CHILD 0x40
++#define PERF_ATTACH_CONTEXT 0x0001
++#define PERF_ATTACH_GROUP 0x0002
++#define PERF_ATTACH_TASK 0x0004
++#define PERF_ATTACH_TASK_DATA 0x0008
++#define PERF_ATTACH_ITRACE 0x0010
++#define PERF_ATTACH_SCHED_CB 0x0020
++#define PERF_ATTACH_CHILD 0x0040
++#define PERF_ATTACH_EXCLUSIVE 0x0080
++#define PERF_ATTACH_CALLCHAIN 0x0100
+
+ struct bpf_prog;
+ struct perf_cgroup;
+diff --git a/kernel/events/core.c b/kernel/events/core.c
+index bee6f88d0556b..255bae926f10a 100644
+--- a/kernel/events/core.c
++++ b/kernel/events/core.c
+@@ -5262,6 +5262,8 @@ static int exclusive_event_init(struct perf_event *event)
+ return -EBUSY;
+ }
+
++ event->attach_state |= PERF_ATTACH_EXCLUSIVE;
++
+ return 0;
+ }
+
+@@ -5269,14 +5271,13 @@ static void exclusive_event_destroy(struct perf_event *event)
+ {
+ struct pmu *pmu = event->pmu;
+
+- if (!is_exclusive_pmu(pmu))
+- return;
+-
+ /* see comment in exclusive_event_init() */
+ if (event->attach_state & PERF_ATTACH_TASK)
+ atomic_dec(&pmu->exclusive_cnt);
+ else
+ atomic_inc(&pmu->exclusive_cnt);
++
++ event->attach_state &= ~PERF_ATTACH_EXCLUSIVE;
+ }
+
+ static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
+@@ -5335,40 +5336,20 @@ static void perf_pending_task_sync(struct perf_event *event)
+ rcuwait_wait_event(&event->pending_work_wait, !event->pending_work, TASK_UNINTERRUPTIBLE);
+ }
+
+-static void _free_event(struct perf_event *event)
++/* vs perf_event_alloc() error */
++static void __free_event(struct perf_event *event)
+ {
+- irq_work_sync(&event->pending_irq);
+- irq_work_sync(&event->pending_disable_irq);
+- perf_pending_task_sync(event);
++ if (event->attach_state & PERF_ATTACH_CALLCHAIN)
++ put_callchain_buffers();
+
+- unaccount_event(event);
++ kfree(event->addr_filter_ranges);
+
+- security_perf_event_free(event);
+-
+- if (event->rb) {
+- /*
+- * Can happen when we close an event with re-directed output.
+- *
+- * Since we have a 0 refcount, perf_mmap_close() will skip
+- * over us; possibly making our ring_buffer_put() the last.
+- */
+- mutex_lock(&event->mmap_mutex);
+- ring_buffer_attach(event, NULL);
+- mutex_unlock(&event->mmap_mutex);
+- }
++ if (event->attach_state & PERF_ATTACH_EXCLUSIVE)
++ exclusive_event_destroy(event);
+
+ if (is_cgroup_event(event))
+ perf_detach_cgroup(event);
+
+- if (!event->parent) {
+- if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
+- put_callchain_buffers();
+- }
+-
+- perf_event_free_bpf_prog(event);
+- perf_addr_filters_splice(event, NULL);
+- kfree(event->addr_filter_ranges);
+-
+ if (event->destroy)
+ event->destroy(event);
+
+@@ -5379,22 +5360,58 @@ static void _free_event(struct perf_event *event)
+ if (event->hw.target)
+ put_task_struct(event->hw.target);
+
+- if (event->pmu_ctx)
++ if (event->pmu_ctx) {
++ /*
++ * put_pmu_ctx() needs an event->ctx reference, because of
++ * epc->ctx.
++ */
++ WARN_ON_ONCE(!event->ctx);
++ WARN_ON_ONCE(event->pmu_ctx->ctx != event->ctx);
+ put_pmu_ctx(event->pmu_ctx);
++ }
+
+ /*
+- * perf_event_free_task() relies on put_ctx() being 'last', in particular
+- * all task references must be cleaned up.
++ * perf_event_free_task() relies on put_ctx() being 'last', in
++ * particular all task references must be cleaned up.
+ */
+ if (event->ctx)
+ put_ctx(event->ctx);
+
+- exclusive_event_destroy(event);
+- module_put(event->pmu->module);
++ if (event->pmu)
++ module_put(event->pmu->module);
+
+ call_rcu(&event->rcu_head, free_event_rcu);
+ }
+
++/* vs perf_event_alloc() success */
++static void _free_event(struct perf_event *event)
++{
++ irq_work_sync(&event->pending_irq);
++ irq_work_sync(&event->pending_disable_irq);
++ perf_pending_task_sync(event);
++
++ unaccount_event(event);
++
++ security_perf_event_free(event);
++
++ if (event->rb) {
++ /*
++ * Can happen when we close an event with re-directed output.
++ *
++ * Since we have a 0 refcount, perf_mmap_close() will skip
++ * over us; possibly making our ring_buffer_put() the last.
++ */
++ mutex_lock(&event->mmap_mutex);
++ ring_buffer_attach(event, NULL);
++ mutex_unlock(&event->mmap_mutex);
++ }
++
++ perf_event_free_bpf_prog(event);
++ perf_addr_filters_splice(event, NULL);
++
++ __free_event(event);
++}
++
+ /*
+ * Used to free events which have a known refcount of 1, such as in error paths
+ * where the event isn't exposed yet and inherited events.
+@@ -12014,8 +12031,10 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
+ event->destroy(event);
+ }
+
+- if (ret)
++ if (ret) {
++ event->pmu = NULL;
+ module_put(pmu->module);
++ }
+
+ return ret;
+ }
+@@ -12343,7 +12362,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
+ * See perf_output_read().
+ */
+ if (has_inherit_and_sample_read(attr) && !(attr->sample_type & PERF_SAMPLE_TID))
+- goto err_ns;
++ goto err;
+
+ if (!has_branch_stack(event))
+ event->attr.branch_sample_type = 0;
+@@ -12351,7 +12370,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
+ pmu = perf_init_event(event);
+ if (IS_ERR(pmu)) {
+ err = PTR_ERR(pmu);
+- goto err_ns;
++ goto err;
+ }
+
+ /*
+@@ -12361,25 +12380,25 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
+ */
+ if (pmu->task_ctx_nr == perf_invalid_context && (task || cgroup_fd != -1)) {
+ err = -EINVAL;
+- goto err_pmu;
++ goto err;
+ }
+
+ if (event->attr.aux_output &&
+ (!(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT) ||
+ event->attr.aux_pause || event->attr.aux_resume)) {
+ err = -EOPNOTSUPP;
+- goto err_pmu;
++ goto err;
+ }
+
+ if (event->attr.aux_pause && event->attr.aux_resume) {
+ err = -EINVAL;
+- goto err_pmu;
++ goto err;
+ }
+
+ if (event->attr.aux_start_paused) {
+ if (!(pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE)) {
+ err = -EOPNOTSUPP;
+- goto err_pmu;
++ goto err;
+ }
+ event->hw.aux_paused = 1;
+ }
+@@ -12387,12 +12406,12 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
+ if (cgroup_fd != -1) {
+ err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
+ if (err)
+- goto err_pmu;
++ goto err;
+ }
+
+ err = exclusive_event_init(event);
+ if (err)
+- goto err_pmu;
++ goto err;
+
+ if (has_addr_filter(event)) {
+ event->addr_filter_ranges = kcalloc(pmu->nr_addr_filters,
+@@ -12400,7 +12419,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
+ GFP_KERNEL);
+ if (!event->addr_filter_ranges) {
+ err = -ENOMEM;
+- goto err_per_task;
++ goto err;
+ }
+
+ /*
+@@ -12425,41 +12444,22 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
+ if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
+ err = get_callchain_buffers(attr->sample_max_stack);
+ if (err)
+- goto err_addr_filters;
++ goto err;
++ event->attach_state |= PERF_ATTACH_CALLCHAIN;
+ }
+ }
+
+ err = security_perf_event_alloc(event);
+ if (err)
+- goto err_callchain_buffer;
++ goto err;
+
+ /* symmetric to unaccount_event() in _free_event() */
+ account_event(event);
+
+ return event;
+
+-err_callchain_buffer:
+- if (!event->parent) {
+- if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
+- put_callchain_buffers();
+- }
+-err_addr_filters:
+- kfree(event->addr_filter_ranges);
+-
+-err_per_task:
+- exclusive_event_destroy(event);
+-
+-err_pmu:
+- if (is_cgroup_event(event))
+- perf_detach_cgroup(event);
+- if (event->destroy)
+- event->destroy(event);
+- module_put(pmu->module);
+-err_ns:
+- if (event->hw.target)
+- put_task_struct(event->hw.target);
+- call_rcu(&event->rcu_head, free_event_rcu);
+-
++err:
++ __free_event(event);
+ return ERR_PTR(err);
+ }
+
+--
+2.39.5
+
--- /dev/null
+From 4cec7ae4bf7ba81b02ad3cfc0bcab5e8ba1b295f Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 4 Mar 2025 14:54:46 +0100
+Subject: perf: Fix hang while freeing sigtrap event
+
+From: Frederic Weisbecker <frederic@kernel.org>
+
+[ Upstream commit 56799bc035658738f362acec3e7647bb84e68933 ]
+
+Perf can hang while freeing a sigtrap event if a related deferred
+signal hadn't managed to be sent before the file got closed:
+
+perf_event_overflow()
+ task_work_add(perf_pending_task)
+
+fput()
+ task_work_add(____fput())
+
+task_work_run()
+ ____fput()
+ perf_release()
+ perf_event_release_kernel()
+ _free_event()
+ perf_pending_task_sync()
+ task_work_cancel() -> FAILED
+ rcuwait_wait_event()
+
+Once task_work_run() is running, the list of pending callbacks is
+removed from the task_struct and from this point on task_work_cancel()
+can't remove any pending and not yet started work items, hence the
+task_work_cancel() failure and the hang on rcuwait_wait_event().
+
+Task work could be changed to remove one work at a time, so a work
+running on the current task can always cancel a pending one; however,
+the wait / wake design is still subject to inverted dependencies when
+remote targets are involved, as pictured by Oleg:
+
+T1 T2
+
+fd = perf_event_open(pid => T2->pid); fd = perf_event_open(pid => T1->pid);
+close(fd) close(fd)
+ <IRQ> <IRQ>
+ perf_event_overflow() perf_event_overflow()
+ task_work_add(perf_pending_task) task_work_add(perf_pending_task)
+ </IRQ> </IRQ>
+ fput() fput()
+ task_work_add(____fput()) task_work_add(____fput())
+
+ task_work_run() task_work_run()
+ ____fput() ____fput()
+ perf_release() perf_release()
+ perf_event_release_kernel() perf_event_release_kernel()
+ _free_event() _free_event()
+ perf_pending_task_sync() perf_pending_task_sync()
+ rcuwait_wait_event() rcuwait_wait_event()
+
+Therefore the only option left is to acquire the event reference count
+upon queueing the perf task work and release it from the task work, just
+like it was done before 3a5465418f5f ("perf: Fix event leak upon exec and file release")
+but without the leaks it fixed.
+
+Some adjustments are necessary to make it work:
+
+* A child event might dereference its parent upon freeing. Care must be
+ taken to release the parent last.
+
+* Some places assuming the event doesn't have any reference held and
+ therefore can be freed right away must instead put the reference and
+ let the reference counting do its job.
+
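+At its core this is the 'deferred work owns a reference' pattern. A
+standalone model of it (C11 atomics and hypothetical helpers, not the
+kernel implementation):
+
+   #include <stdatomic.h>
+   #include <stdlib.h>
+
+   struct event { atomic_long refcount; /* ... */ };
+
+   static void put_event(struct event *e)
+   {
+           if (atomic_fetch_sub(&e->refcount, 1) == 1)
+                   free(e);                   /* stands in for _free_event() */
+   }
+
+   static void queue_pending_work(struct event *e)
+   {
+           atomic_fetch_add(&e->refcount, 1); /* the task work now owns a ref */
+           /* task_work_add(current, &event->pending_task, ...) goes here */
+   }
+
+   static void pending_work_cb(struct event *e)
+   {
+           /* deliver the deferred signal ... */
+           put_event(e);     /* may be the last reference; frees safely here */
+   }
+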
+Reported-by: "Yi Lai" <yi1.lai@linux.intel.com>
+Closes: https://lore.kernel.org/all/Zx9Losv4YcJowaP%2F@ly-workstation/
+Reported-by: syzbot+3c4321e10eea460eb606@syzkaller.appspotmail.com
+Closes: https://lore.kernel.org/all/673adf75.050a0220.87769.0024.GAE@google.com/
+Fixes: 3a5465418f5f ("perf: Fix event leak upon exec and file release")
+Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Link: https://lkml.kernel.org/r/20250304135446.18905-1-frederic@kernel.org
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/linux/perf_event.h | 1 -
+ kernel/events/core.c | 64 +++++++++++---------------------------
+ 2 files changed, 18 insertions(+), 47 deletions(-)
+
+diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
+index db6d281644447..0997077bcc52a 100644
+--- a/include/linux/perf_event.h
++++ b/include/linux/perf_event.h
+@@ -833,7 +833,6 @@ struct perf_event {
+ struct irq_work pending_disable_irq;
+ struct callback_head pending_task;
+ unsigned int pending_work;
+- struct rcuwait pending_work_wait;
+
+ atomic_t event_limit;
+
+diff --git a/kernel/events/core.c b/kernel/events/core.c
+index 255bae926f10a..97af53c43608e 100644
+--- a/kernel/events/core.c
++++ b/kernel/events/core.c
+@@ -5312,30 +5312,6 @@ static bool exclusive_event_installable(struct perf_event *event,
+ static void perf_addr_filters_splice(struct perf_event *event,
+ struct list_head *head);
+
+-static void perf_pending_task_sync(struct perf_event *event)
+-{
+- struct callback_head *head = &event->pending_task;
+-
+- if (!event->pending_work)
+- return;
+- /*
+- * If the task is queued to the current task's queue, we
+- * obviously can't wait for it to complete. Simply cancel it.
+- */
+- if (task_work_cancel(current, head)) {
+- event->pending_work = 0;
+- local_dec(&event->ctx->nr_no_switch_fast);
+- return;
+- }
+-
+- /*
+- * All accesses related to the event are within the same RCU section in
+- * perf_pending_task(). The RCU grace period before the event is freed
+- * will make sure all those accesses are complete by then.
+- */
+- rcuwait_wait_event(&event->pending_work_wait, !event->pending_work, TASK_UNINTERRUPTIBLE);
+-}
+-
+ /* vs perf_event_alloc() error */
+ static void __free_event(struct perf_event *event)
+ {
+@@ -5388,7 +5364,6 @@ static void _free_event(struct perf_event *event)
+ {
+ irq_work_sync(&event->pending_irq);
+ irq_work_sync(&event->pending_disable_irq);
+- perf_pending_task_sync(event);
+
+ unaccount_event(event);
+
+@@ -5481,10 +5456,17 @@ static void perf_remove_from_owner(struct perf_event *event)
+
+ static void put_event(struct perf_event *event)
+ {
++ struct perf_event *parent;
++
+ if (!atomic_long_dec_and_test(&event->refcount))
+ return;
+
++ parent = event->parent;
+ _free_event(event);
++
++ /* Matches the refcount bump in inherit_event() */
++ if (parent)
++ put_event(parent);
+ }
+
+ /*
+@@ -5568,11 +5550,6 @@ int perf_event_release_kernel(struct perf_event *event)
+ if (tmp == child) {
+ perf_remove_from_context(child, DETACH_GROUP);
+ list_move(&child->child_list, &free_list);
+- /*
+- * This matches the refcount bump in inherit_event();
+- * this can't be the last reference.
+- */
+- put_event(event);
+ } else {
+ var = &ctx->refcount;
+ }
+@@ -5598,7 +5575,8 @@ int perf_event_release_kernel(struct perf_event *event)
+ void *var = &child->ctx->refcount;
+
+ list_del(&child->child_list);
+- free_event(child);
++ /* Last reference unless ->pending_task work is pending */
++ put_event(child);
+
+ /*
+ * Wake any perf_event_free_task() waiting for this event to be
+@@ -5609,7 +5587,11 @@ int perf_event_release_kernel(struct perf_event *event)
+ }
+
+ no_ctx:
+- put_event(event); /* Must be the 'last' reference */
++ /*
++ * Last reference unless ->pending_task work is pending on this event
++ * or any of its children.
++ */
++ put_event(event);
+ return 0;
+ }
+ EXPORT_SYMBOL_GPL(perf_event_release_kernel);
+@@ -6994,12 +6976,6 @@ static void perf_pending_task(struct callback_head *head)
+ struct perf_event *event = container_of(head, struct perf_event, pending_task);
+ int rctx;
+
+- /*
+- * All accesses to the event must belong to the same implicit RCU read-side
+- * critical section as the ->pending_work reset. See comment in
+- * perf_pending_task_sync().
+- */
+- rcu_read_lock();
+ /*
+ * If we 'fail' here, that's OK, it means recursion is already disabled
+ * and we won't recurse 'further'.
+@@ -7010,9 +6986,8 @@ static void perf_pending_task(struct callback_head *head)
+ event->pending_work = 0;
+ perf_sigtrap(event);
+ local_dec(&event->ctx->nr_no_switch_fast);
+- rcuwait_wake_up(&event->pending_work_wait);
+ }
+- rcu_read_unlock();
++ put_event(event);
+
+ if (rctx >= 0)
+ perf_swevent_put_recursion_context(rctx);
+@@ -9935,6 +9910,7 @@ static int __perf_event_overflow(struct perf_event *event,
+ !task_work_add(current, &event->pending_task, notify_mode)) {
+ event->pending_work = pending_id;
+ local_inc(&event->ctx->nr_no_switch_fast);
++ WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount));
+
+ event->pending_addr = 0;
+ if (valid_sample && (data->sample_flags & PERF_SAMPLE_ADDR))
+@@ -12283,7 +12259,6 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
+ init_irq_work(&event->pending_irq, perf_pending_irq);
+ event->pending_disable_irq = IRQ_WORK_INIT_HARD(perf_pending_disable);
+ init_task_work(&event->pending_task, perf_pending_task);
+- rcuwait_init(&event->pending_work_wait);
+
+ mutex_init(&event->mmap_mutex);
+ raw_spin_lock_init(&event->addr_filters.lock);
+@@ -13426,8 +13401,7 @@ perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
+ * Kick perf_poll() for is_event_hup();
+ */
+ perf_event_wakeup(parent_event);
+- free_event(event);
+- put_event(parent_event);
++ put_event(event);
+ return;
+ }
+
+@@ -13545,13 +13519,11 @@ static void perf_free_event(struct perf_event *event,
+ list_del_init(&event->child_list);
+ mutex_unlock(&parent->child_mutex);
+
+- put_event(parent);
+-
+ raw_spin_lock_irq(&ctx->lock);
+ perf_group_detach(event);
+ list_del_event(event, ctx);
+ raw_spin_unlock_irq(&ctx->lock);
+- free_event(event);
++ put_event(event);
+ }
+
+ /*
+--
+2.39.5
+
--- /dev/null
+From 0ddbdb68cfc6faece946f50915074a9e390d96d2 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 4 Apr 2025 22:12:20 +0000
+Subject: selftests/futex: futex_waitv wouldblock test should fail
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Edward Liaw <edliaw@google.com>
+
+[ Upstream commit 7d50e00fef2832e98d7e06bbfc85c1d66ee110ca ]
+
+The testcase should fail if -EWOULDBLOCK is not returned when the expected
+value differs from the actual value seen by the waiter.
+
+Link: https://lore.kernel.org/r/20250404221225.1596324-1-edliaw@google.com
+Fixes: 9d57f7c79748920636f8293d2f01192d702fe390 ("selftests: futex: Test sys_futex_waitv() wouldblock")
+Signed-off-by: Edward Liaw <edliaw@google.com>
+Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: André Almeida <andrealmeid@igalia.com>
+Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ .../testing/selftests/futex/functional/futex_wait_wouldblock.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c b/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c
+index 7d7a6a06cdb75..2d8230da90642 100644
+--- a/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c
++++ b/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c
+@@ -98,7 +98,7 @@ int main(int argc, char *argv[])
+ info("Calling futex_waitv on f1: %u @ %p with val=%u\n", f1, &f1, f1+1);
+ res = futex_waitv(&waitv, 1, 0, &to, CLOCK_MONOTONIC);
+ if (!res || errno != EWOULDBLOCK) {
+- ksft_test_result_pass("futex_waitv returned: %d %s\n",
++ ksft_test_result_fail("futex_waitv returned: %d %s\n",
+ res ? errno : res,
+ res ? strerror(errno) : "");
+ ret = RET_FAIL;
+--
+2.39.5
+
asoc-intel-adl-add-2xrt1316-audio-configuration.patch
+cgroup-cpuset-fix-incorrect-isolated_cpus-update-in-.patch
+cgroup-cpuset-fix-error-handling-in-remote_partition.patch
+cgroup-cpuset-revert-allow-suppression-of-sched-doma.patch
+cgroup-cpuset-enforce-at-most-one-rebuild_sched_doma.patch
+cgroup-cpuset-further-optimize-code-if-config_cpuset.patch
+cgroup-cpuset-fix-race-between-newly-created-partiti.patch
+gpiolib-of-fix-the-choice-for-ingenic-nand-quirk.patch
+selftests-futex-futex_waitv-wouldblock-test-should-f.patch
+ublk-refactor-recovery-configuration-flag-helpers.patch
+ublk-fix-handling-recovery-reissue-in-ublk_abort_que.patch
+drm-i915-disable-rpg-during-live-selftest.patch
+x86-acpi-don-t-limit-cpus-to-1-for-xen-pv-guests-due.patch
+drm-xe-hw_engine-define-sysfs_ops-on-all-directories.patch
+ata-pata_pxa-fix-potential-null-pointer-dereference-.patch
+objtool-fix-insn_context_switch-handling-in-validate.patch
+tipc-fix-memory-leak-in-tipc_link_xmit.patch
+codel-remove-sch-q.qlen-check-before-qdisc_tree_redu.patch
+net-tls-explicitly-disallow-disconnect.patch
+octeontx2-pf-qos-fix-vf-root-node-parent-queue-index.patch
+tc-ensure-we-have-enough-buffer-space-when-sending-f.patch
+net-ethtool-don-t-call-.cleanup_data-when-prepare_da.patch
+drm-tests-modeset-fix-drm_display_mode-memory-leak.patch
+drm-tests-helpers-create-kunit-helper-to-destroy-a-d.patch
+drm-tests-cmdline-fix-drm_display_mode-memory-leak.patch
+drm-tests-modes-fix-drm_display_mode-memory-leak.patch
+drm-tests-probe-helper-fix-drm_display_mode-memory-l.patch
+net-libwx-handle-page_pool_dev_alloc_pages-error.patch
+ata-sata_sx4-add-error-handling-in-pdc20621_i2c_read.patch
+drm-i915-huc-fix-fence-not-released-on-early-probe-e.patch
+nvmet-fcloop-swap-list_add_tail-arguments.patch
+net_sched-sch_sfq-use-a-temporary-work-area-for-vali.patch
+net_sched-sch_sfq-move-the-limit-validation.patch
+smb-client-fix-uaf-in-decryption-with-multichannel.patch
+net-phy-move-phy_link_change-prior-to-mdio_bus_phy_m.patch
+net-phy-allow-mdio-bus-pm-ops-to-start-stop-state-ma.patch
+ipv6-align-behavior-across-nexthops-during-path-sele.patch
+net-ppp-add-bound-checking-for-skb-data-on-ppp_sync_.patch
+nft_set_pipapo-fix-incorrect-avx2-match-of-5th-field.patch
+iommu-exynos-fix-suspend-resume-with-identity-domain.patch
+iommu-mediatek-fix-null-pointer-deference-in-mtk_iom.patch
+perf-core-add-aux_pause-aux_resume-aux_start_paused.patch
+perf-core-simplify-the-perf_event_alloc-error-path.patch
+perf-fix-hang-while-freeing-sigtrap-event.patch
--- /dev/null
+From 66dc2f6bcc583b14cb6c928dbd13f7ef08ddd693 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 9 Apr 2025 11:14:21 -0300
+Subject: smb: client: fix UAF in decryption with multichannel
+
+From: Paulo Alcantara <pc@manguebit.com>
+
+[ Upstream commit 9502dd5c7029902f4a425bf959917a5a9e7c0e50 ]
+
+After commit f7025d861694 ("smb: client: allocate crypto only for
+primary server") and commit b0abcd65ec54 ("smb: client: fix UAF in
+async decryption"), the channels started reusing AEAD TFM from primary
+channel to perform synchronous decryption, but that can't be done as
+there could be multiple cifsd threads (one per channel) simultaneously
+accessing it to perform decryption.
+
+This fixes the following KASAN splat when running fstest generic/249
+with 'vers=3.1.1,multichannel,max_channels=4,seal' against Windows
+Server 2022:
+
+BUG: KASAN: slab-use-after-free in gf128mul_4k_lle+0xba/0x110
+Read of size 8 at addr ffff8881046c18a0 by task cifsd/986
+CPU: 3 UID: 0 PID: 986 Comm: cifsd Not tainted 6.15.0-rc1 #1
+PREEMPT(voluntary)
+Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-3.fc41
+04/01/2014
+Call Trace:
+ <TASK>
+ dump_stack_lvl+0x5d/0x80
+ print_report+0x156/0x528
+ ? gf128mul_4k_lle+0xba/0x110
+ ? __virt_addr_valid+0x145/0x300
+ ? __phys_addr+0x46/0x90
+ ? gf128mul_4k_lle+0xba/0x110
+ kasan_report+0xdf/0x1a0
+ ? gf128mul_4k_lle+0xba/0x110
+ gf128mul_4k_lle+0xba/0x110
+ ghash_update+0x189/0x210
+ shash_ahash_update+0x295/0x370
+ ? __pfx_shash_ahash_update+0x10/0x10
+ ? __pfx_shash_ahash_update+0x10/0x10
+ ? __pfx_extract_iter_to_sg+0x10/0x10
+ ? ___kmalloc_large_node+0x10e/0x180
+ ? __asan_memset+0x23/0x50
+ crypto_ahash_update+0x3c/0xc0
+ gcm_hash_assoc_remain_continue+0x93/0xc0
+ crypt_message+0xe09/0xec0 [cifs]
+ ? __pfx_crypt_message+0x10/0x10 [cifs]
+ ? _raw_spin_unlock+0x23/0x40
+ ? __pfx_cifs_readv_from_socket+0x10/0x10 [cifs]
+ decrypt_raw_data+0x229/0x380 [cifs]
+ ? __pfx_decrypt_raw_data+0x10/0x10 [cifs]
+ ? __pfx_cifs_read_iter_from_socket+0x10/0x10 [cifs]
+ smb3_receive_transform+0x837/0xc80 [cifs]
+ ? __pfx_smb3_receive_transform+0x10/0x10 [cifs]
+ ? __pfx___might_resched+0x10/0x10
+ ? __pfx_smb3_is_transform_hdr+0x10/0x10 [cifs]
+ cifs_demultiplex_thread+0x692/0x1570 [cifs]
+ ? __pfx_cifs_demultiplex_thread+0x10/0x10 [cifs]
+ ? rcu_is_watching+0x20/0x50
+ ? rcu_lockdep_current_cpu_online+0x62/0xb0
+ ? find_held_lock+0x32/0x90
+ ? kvm_sched_clock_read+0x11/0x20
+ ? local_clock_noinstr+0xd/0xd0
+ ? trace_irq_enable.constprop.0+0xa8/0xe0
+ ? __pfx_cifs_demultiplex_thread+0x10/0x10 [cifs]
+ kthread+0x1fe/0x380
+ ? kthread+0x10f/0x380
+ ? __pfx_kthread+0x10/0x10
+ ? local_clock_noinstr+0xd/0xd0
+ ? ret_from_fork+0x1b/0x60
+ ? local_clock+0x15/0x30
+ ? lock_release+0x29b/0x390
+ ? rcu_is_watching+0x20/0x50
+ ? __pfx_kthread+0x10/0x10
+ ret_from_fork+0x31/0x60
+ ? __pfx_kthread+0x10/0x10
+ ret_from_fork_asm+0x1a/0x30
+ </TASK>
+
+Tested-by: David Howells <dhowells@redhat.com>
+Reported-by: Steve French <stfrench@microsoft.com>
+Closes: https://lore.kernel.org/r/CAH2r5mu6Yc0-RJXM3kFyBYUB09XmXBrNodOiCVR4EDrmxq5Szg@mail.gmail.com
+Fixes: f7025d861694 ("smb: client: allocate crypto only for primary server")
+Fixes: b0abcd65ec54 ("smb: client: fix UAF in async decryption")
+Signed-off-by: Paulo Alcantara (Red Hat) <pc@manguebit.com>
+Signed-off-by: Steve French <stfrench@microsoft.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/smb/client/cifsencrypt.c | 16 +++++-----------
+ fs/smb/client/smb2ops.c | 6 +++---
+ fs/smb/client/smb2pdu.c | 11 ++---------
+ 3 files changed, 10 insertions(+), 23 deletions(-)
+
+diff --git a/fs/smb/client/cifsencrypt.c b/fs/smb/client/cifsencrypt.c
+index 7a43daacc8159..7c61c1e944c7a 100644
+--- a/fs/smb/client/cifsencrypt.c
++++ b/fs/smb/client/cifsencrypt.c
+@@ -702,18 +702,12 @@ cifs_crypto_secmech_release(struct TCP_Server_Info *server)
+ cifs_free_hash(&server->secmech.md5);
+ cifs_free_hash(&server->secmech.sha512);
+
+- if (!SERVER_IS_CHAN(server)) {
+- if (server->secmech.enc) {
+- crypto_free_aead(server->secmech.enc);
+- server->secmech.enc = NULL;
+- }
+-
+- if (server->secmech.dec) {
+- crypto_free_aead(server->secmech.dec);
+- server->secmech.dec = NULL;
+- }
+- } else {
++ if (server->secmech.enc) {
++ crypto_free_aead(server->secmech.enc);
+ server->secmech.enc = NULL;
++ }
++ if (server->secmech.dec) {
++ crypto_free_aead(server->secmech.dec);
+ server->secmech.dec = NULL;
+ }
+ }
+diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c
+index 516be8c0b2a9b..590b70d71694b 100644
+--- a/fs/smb/client/smb2ops.c
++++ b/fs/smb/client/smb2ops.c
+@@ -4576,9 +4576,9 @@ decrypt_raw_data(struct TCP_Server_Info *server, char *buf,
+ return rc;
+ }
+ } else {
+- if (unlikely(!server->secmech.dec))
+- return -EIO;
+-
++ rc = smb3_crypto_aead_allocate(server);
++ if (unlikely(rc))
++ return rc;
+ tfm = server->secmech.dec;
+ }
+
+diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c
+index 75b13175a2e78..1a7b82664255a 100644
+--- a/fs/smb/client/smb2pdu.c
++++ b/fs/smb/client/smb2pdu.c
+@@ -1269,15 +1269,8 @@ SMB2_negotiate(const unsigned int xid,
+ cifs_server_dbg(VFS, "Missing expected negotiate contexts\n");
+ }
+
+- if (server->cipher_type && !rc) {
+- if (!SERVER_IS_CHAN(server)) {
+- rc = smb3_crypto_aead_allocate(server);
+- } else {
+- /* For channels, just reuse the primary server crypto secmech. */
+- server->secmech.enc = server->primary_server->secmech.enc;
+- server->secmech.dec = server->primary_server->secmech.dec;
+- }
+- }
++ if (server->cipher_type && !rc)
++ rc = smb3_crypto_aead_allocate(server);
+ neg_exit:
+ free_rsp_buf(resp_buftype, rsp);
+ return rc;
+--
+2.39.5
+
--- /dev/null
+From fd9378d998f2601c51d6855ccf4497ac7f9dbe4e Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 7 Apr 2025 12:55:34 +0200
+Subject: tc: Ensure we have enough buffer space when sending filter netlink
+ notifications
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Toke Høiland-Jørgensen <toke@redhat.com>
+
+[ Upstream commit 369609fc6272c2f6ad666ba4fd913f3baf32908f ]
+
+The tfilter_notify() and tfilter_del_notify() functions assume that
+NLMSG_GOODSIZE is always enough to dump the filter chain. This is not
+always the case, which can lead to silent notify failures (because the
+return code of tfilter_notify() is not always checked). In particular,
+this can lead to NLM_F_ECHO not being honoured even though an action
+succeeds, which forces userspace to create workarounds[0].
+
+Fix this by increasing the message size if dumping the filter chain into
+the allocated skb fails. Use the size of the incoming skb as a size hint
+if set, so we can start at a larger value when appropriate.
+
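+The allocation then becomes a plain grow-and-retry loop. A standalone
+sketch of the pattern (fill_message() standing in for tcf_fill_node(),
+which reports -EMSGSIZE when the buffer is too small):
+
+   #include <errno.h>
+   #include <stdlib.h>
+
+   #define CHUNK 4096   /* stands in for NLMSG_GOODSIZE */
+
+   static void *build_notify(size_t hint, int (*fill_message)(void *, size_t))
+   {
+           size_t size = hint > CHUNK ? hint : CHUNK; /* size hint from request */
+           void *buf;
+
+           for (;;) {
+                   buf = malloc(size);
+                   if (!buf)
+                           return NULL;
+                   if (fill_message(buf, size) != -EMSGSIZE)
+                           return buf;
+                   free(buf);
+                   size += CHUNK;    /* too small: grow and try again */
+           }
+   }
+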
+To trigger this, run the following commands:
+
+ # ip link add type veth
+ # tc qdisc replace dev veth0 root handle 1: fq_codel
+ # tc -echo filter add dev veth0 parent 1: u32 match u32 0 0 $(for i in $(seq 32); do echo action pedit munge ip dport set 22; done)
+
+Before this fix, tc just returns:
+
+Not a filter(cmd 2)
+
+After the fix, we get the correct echo:
+
+added filter dev veth0 parent 1: protocol all pref 49152 u32 chain 0 fh 800::800 order 2048 key ht 800 bkt 0 terminal flowid not_in_hw
+ match 00000000/00000000 at 0
+ action order 1: pedit action pass keys 1
+ index 1 ref 1 bind 1
+ key #0 at 20: val 00000016 mask ffff0000
+[repeated 32 times]
+
+[0] https://github.com/openvswitch/ovs/commit/106ef21860c935e5e0017a88bf42b94025c4e511
+
+Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
+Reported-by: Frode Nordahl <frode.nordahl@canonical.com>
+Closes: https://bugs.launchpad.net/ubuntu/+source/openvswitch/+bug/2018500
+Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
+Reviewed-by: Jiri Pirko <jiri@nvidia.com>
+Link: https://patch.msgid.link/20250407105542.16601-1-toke@redhat.com
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/sched/cls_api.c | 66 ++++++++++++++++++++++++++++++---------------
+ 1 file changed, 45 insertions(+), 21 deletions(-)
+
+diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
+index 998ea3b5badfc..a3bab5e27e71b 100644
+--- a/net/sched/cls_api.c
++++ b/net/sched/cls_api.c
+@@ -2051,6 +2051,7 @@ static int tcf_fill_node(struct net *net, struct sk_buff *skb,
+ struct tcmsg *tcm;
+ struct nlmsghdr *nlh;
+ unsigned char *b = skb_tail_pointer(skb);
++ int ret = -EMSGSIZE;
+
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
+ if (!nlh)
+@@ -2095,11 +2096,45 @@ static int tcf_fill_node(struct net *net, struct sk_buff *skb,
+
+ return skb->len;
+
++cls_op_not_supp:
++ ret = -EOPNOTSUPP;
+ out_nlmsg_trim:
+ nla_put_failure:
+-cls_op_not_supp:
+ nlmsg_trim(skb, b);
+- return -1;
++ return ret;
++}
++
++static struct sk_buff *tfilter_notify_prep(struct net *net,
++ struct sk_buff *oskb,
++ struct nlmsghdr *n,
++ struct tcf_proto *tp,
++ struct tcf_block *block,
++ struct Qdisc *q, u32 parent,
++ void *fh, int event,
++ u32 portid, bool rtnl_held,
++ struct netlink_ext_ack *extack)
++{
++ unsigned int size = oskb ? max(NLMSG_GOODSIZE, oskb->len) : NLMSG_GOODSIZE;
++ struct sk_buff *skb;
++ int ret;
++
++retry:
++ skb = alloc_skb(size, GFP_KERNEL);
++ if (!skb)
++ return ERR_PTR(-ENOBUFS);
++
++ ret = tcf_fill_node(net, skb, tp, block, q, parent, fh, portid,
++ n->nlmsg_seq, n->nlmsg_flags, event, false,
++ rtnl_held, extack);
++ if (ret <= 0) {
++ kfree_skb(skb);
++ if (ret == -EMSGSIZE) {
++ size += NLMSG_GOODSIZE;
++ goto retry;
++ }
++ return ERR_PTR(-EINVAL);
++ }
++ return skb;
+ }
+
+ static int tfilter_notify(struct net *net, struct sk_buff *oskb,
+@@ -2115,16 +2150,10 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb,
+ if (!unicast && !rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC))
+ return 0;
+
+- skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+- if (!skb)
+- return -ENOBUFS;
+-
+- if (tcf_fill_node(net, skb, tp, block, q, parent, fh, portid,
+- n->nlmsg_seq, n->nlmsg_flags, event,
+- false, rtnl_held, extack) <= 0) {
+- kfree_skb(skb);
+- return -EINVAL;
+- }
++ skb = tfilter_notify_prep(net, oskb, n, tp, block, q, parent, fh, event,
++ portid, rtnl_held, extack);
++ if (IS_ERR(skb))
++ return PTR_ERR(skb);
+
+ if (unicast)
+ err = rtnl_unicast(skb, net, portid);
+@@ -2147,16 +2176,11 @@ static int tfilter_del_notify(struct net *net, struct sk_buff *oskb,
+ if (!rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC))
+ return tp->ops->delete(tp, fh, last, rtnl_held, extack);
+
+- skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+- if (!skb)
+- return -ENOBUFS;
+-
+- if (tcf_fill_node(net, skb, tp, block, q, parent, fh, portid,
+- n->nlmsg_seq, n->nlmsg_flags, RTM_DELTFILTER,
+- false, rtnl_held, extack) <= 0) {
++ skb = tfilter_notify_prep(net, oskb, n, tp, block, q, parent, fh,
++ RTM_DELTFILTER, portid, rtnl_held, extack);
++ if (IS_ERR(skb)) {
+ NL_SET_ERR_MSG(extack, "Failed to build del event notification");
+- kfree_skb(skb);
+- return -EINVAL;
++ return PTR_ERR(skb);
+ }
+
+ err = tp->ops->delete(tp, fh, last, rtnl_held, extack);
+--
+2.39.5
+
--- /dev/null
+From fb80ff152a6b29870c5fdf14efa9ebae2579894d Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 3 Apr 2025 09:24:31 +0000
+Subject: tipc: fix memory leak in tipc_link_xmit
+
+From: Tung Nguyen <tung.quang.nguyen@est.tech>
+
+[ Upstream commit 69ae94725f4fc9e75219d2d69022029c5b24bc9a ]
+
+In case the backlog transmit queue for system-importance messages is overloaded,
+tipc_link_xmit() returns -ENOBUFS but the skb list is not purged. This leads to
+a memory leak and to failures when a new skb is allocated.
+
+This commit fixes this issue by purging the skb list before tipc_link_xmit()
+returns.
+
+Fixes: 365ad353c256 ("tipc: reduce risk of user starvation during link congestion")
+Signed-off-by: Tung Nguyen <tung.quang.nguyen@est.tech>
+Link: https://patch.msgid.link/20250403092431.514063-1-tung.quang.nguyen@est.tech
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/tipc/link.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/net/tipc/link.c b/net/tipc/link.c
+index 5c2088a469cea..5689e1f485479 100644
+--- a/net/tipc/link.c
++++ b/net/tipc/link.c
+@@ -1046,6 +1046,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list,
+ if (unlikely(l->backlog[imp].len >= l->backlog[imp].limit)) {
+ if (imp == TIPC_SYSTEM_IMPORTANCE) {
+ pr_warn("%s<%s>, link overflow", link_rst_msg, l->name);
++ __skb_queue_purge(list);
+ return -ENOBUFS;
+ }
+ rc = link_schedule_user(l, hdr);
+--
+2.39.5
+
--- /dev/null
+From 6250772c615286c89bd110e3fe5b0f58244d23ff Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 9 Apr 2025 09:14:41 +0800
+Subject: ublk: fix handling recovery & reissue in ublk_abort_queue()
+
+From: Ming Lei <ming.lei@redhat.com>
+
+[ Upstream commit 6ee6bd5d4fce502a5b5a2ea805e9ff16e6aa890f ]
+
+Commit 8284066946e6 ("ublk: grab request reference when the request is handled
+by userspace") doesn't grab request reference in case of recovery reissue.
+Then the request can be requeued & re-dispatch & failed when canceling
+uring command.
+
+If it is a zc request, the request can be freed before io_uring
+returns the zc buffer back, which then causes a kernel panic:
+
+[ 126.773061] BUG: kernel NULL pointer dereference, address: 00000000000000c8
+[ 126.773657] #PF: supervisor read access in kernel mode
+[ 126.774052] #PF: error_code(0x0000) - not-present page
+[ 126.774455] PGD 0 P4D 0
+[ 126.774698] Oops: Oops: 0000 [#1] SMP NOPTI
+[ 126.775034] CPU: 13 UID: 0 PID: 1612 Comm: kworker/u64:55 Not tainted 6.14.0_blk+ #182 PREEMPT(full)
+[ 126.775676] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-1.fc39 04/01/2014
+[ 126.776275] Workqueue: iou_exit io_ring_exit_work
+[ 126.776651] RIP: 0010:ublk_io_release+0x14/0x130 [ublk_drv]
+
+Fix this by always grabbing a request reference when aborting the request.
+
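+Concretely, the abort path now routes through the same reference count as
+every other user of the request, and the requeue/complete decision is made
+by whoever drops the last reference. A standalone model of the
+kref_put()-with-release-callback pattern (not the ublk code itself):
+
+   #include <stdatomic.h>
+
+   struct ref { atomic_int count; };
+
+   /* decrement; call release() only when the count reaches zero */
+   static void ref_put(struct ref *r, void (*release)(struct ref *))
+   {
+           if (atomic_fetch_sub(&r->count, 1) == 1)
+                   release(r);
+   }
+
+   static void fail_rq(struct ref *r)
+   {
+           /* last owner: requeue if recovery reissue is configured,
+            * otherwise complete the request with an error */
+   }
+
+   static void abort_rq(struct ref *r)
+   {
+           ref_put(r, fail_rq); /* io_uring's zc buffer ref may still be live */
+   }
+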
+Reported-by: Caleb Sander Mateos <csander@purestorage.com>
+Closes: https://lore.kernel.org/linux-block/CADUfDZodKfOGUeWrnAxcZiLT+puaZX8jDHoj_sfHZCOZwhzz6A@mail.gmail.com/
+Fixes: 8284066946e6 ("ublk: grab request reference when the request is handled by userspace")
+Signed-off-by: Ming Lei <ming.lei@redhat.com>
+Link: https://lore.kernel.org/r/20250409011444.2142010-2-ming.lei@redhat.com
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/block/ublk_drv.c | 30 ++++++++++++++++++++++++++----
+ 1 file changed, 26 insertions(+), 4 deletions(-)
+
+diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
+index dd328d40c7de5..38b9e485e520d 100644
+--- a/drivers/block/ublk_drv.c
++++ b/drivers/block/ublk_drv.c
+@@ -1081,6 +1081,25 @@ static void ublk_complete_rq(struct kref *ref)
+ __ublk_complete_rq(req);
+ }
+
++static void ublk_do_fail_rq(struct request *req)
++{
++ struct ublk_queue *ubq = req->mq_hctx->driver_data;
++
++ if (ublk_nosrv_should_reissue_outstanding(ubq->dev))
++ blk_mq_requeue_request(req, false);
++ else
++ __ublk_complete_rq(req);
++}
++
++static void ublk_fail_rq_fn(struct kref *ref)
++{
++ struct ublk_rq_data *data = container_of(ref, struct ublk_rq_data,
++ ref);
++ struct request *req = blk_mq_rq_from_pdu(data);
++
++ ublk_do_fail_rq(req);
++}
++
+ /*
+ * Since __ublk_rq_task_work always fails requests immediately during
+ * exiting, __ublk_fail_req() is only called from abort context during
+@@ -1094,10 +1113,13 @@ static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io,
+ {
+ WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE);
+
+- if (ublk_nosrv_should_reissue_outstanding(ubq->dev))
+- blk_mq_requeue_request(req, false);
+- else
+- ublk_put_req_ref(ubq, req);
++ if (ublk_need_req_ref(ubq)) {
++ struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
++
++ kref_put(&data->ref, ublk_fail_rq_fn);
++ } else {
++ ublk_do_fail_rq(req);
++ }
+ }
+
+ static void ubq_complete_io_cmd(struct ublk_io *io, int res,
+--
+2.39.5
+
--- /dev/null
+From 6b548abc33fd25278d5394f234f6573d8152f44a Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 7 Oct 2024 12:24:15 -0600
+Subject: ublk: refactor recovery configuration flag helpers
+
+From: Uday Shankar <ushankar@purestorage.com>
+
+[ Upstream commit 3b939b8f715e014adcc48f7827fe9417252f0833 ]
+
+ublk currently supports the following behaviors on ublk server exit:
+
+A: outstanding I/Os get errors, subsequently issued I/Os get errors
+B: outstanding I/Os get errors, subsequently issued I/Os queue
+C: outstanding I/Os get reissued, subsequently issued I/Os queue
+
+and the following behaviors for recovery of preexisting block devices by
+a future incarnation of the ublk server:
+
+1: ublk devices stopped on ublk server exit (no recovery possible)
+2: ublk devices are recoverable using start/end_recovery commands
+
+The userspace interface allows selection of combinations of these
+behaviors using flags specified at device creation time, namely:
+
+default behavior: A + 1
+UBLK_F_USER_RECOVERY: B + 2
+UBLK_F_USER_RECOVERY|UBLK_F_USER_RECOVERY_REISSUE: C + 2
+
+We can't easily change the userspace interface to allow independent
+selection of one of {A, B, C} and one of {1, 2}, but we can refactor the
+internal helpers which test for the flags. Replace the existing helpers
+with the following set:
+
+ublk_nosrv_should_reissue_outstanding: tests for behavior C
+ublk_nosrv_[dev_]should_queue_io: tests for behavior B
+ublk_nosrv_should_stop_dev: tests for behavior 1
+
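+In terms of the creation-time flags, the new helpers evaluate as in this
+sketch (illustration only; the real helpers are in the diff below):
+
+   /* sketch of the mapping, not the actual helpers: */
+   reissue_outstanding = (flags & UBLK_F_USER_RECOVERY) &&
+                         (flags & UBLK_F_USER_RECOVERY_REISSUE); /* behavior C */
+   should_queue_io     =  flags & UBLK_F_USER_RECOVERY;          /* behavior B */
+   should_stop_dev     = !(flags & UBLK_F_USER_RECOVERY);        /* behavior 1 */
+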
+Signed-off-by: Uday Shankar <ushankar@purestorage.com>
+Reviewed-by: Ming Lei <ming.lei@redhat.com>
+Link: https://lore.kernel.org/r/20241007182419.3263186-3-ushankar@purestorage.com
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Stable-dep-of: 6ee6bd5d4fce ("ublk: fix handling recovery & reissue in ublk_abort_queue()")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/block/ublk_drv.c | 62 +++++++++++++++++++++++++++-------------
+ 1 file changed, 42 insertions(+), 20 deletions(-)
+
+diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
+index 79b7bd8bfd458..dd328d40c7de5 100644
+--- a/drivers/block/ublk_drv.c
++++ b/drivers/block/ublk_drv.c
+@@ -681,22 +681,44 @@ static int ublk_max_cmd_buf_size(void)
+ return __ublk_queue_cmd_buf_size(UBLK_MAX_QUEUE_DEPTH);
+ }
+
+-static inline bool ublk_queue_can_use_recovery_reissue(
+- struct ublk_queue *ubq)
++/*
++ * Should I/O outstanding to the ublk server when it exits be reissued?
++ * If not, outstanding I/O will get errors.
++ */
++static inline bool ublk_nosrv_should_reissue_outstanding(struct ublk_device *ub)
+ {
+- return (ubq->flags & UBLK_F_USER_RECOVERY) &&
+- (ubq->flags & UBLK_F_USER_RECOVERY_REISSUE);
++ return (ub->dev_info.flags & UBLK_F_USER_RECOVERY) &&
++ (ub->dev_info.flags & UBLK_F_USER_RECOVERY_REISSUE);
+ }
+
+-static inline bool ublk_queue_can_use_recovery(
+- struct ublk_queue *ubq)
++/*
++ * Should I/O issued while there is no ublk server queue? If not, I/O
++ * issued while there is no ublk server will get errors.
++ */
++static inline bool ublk_nosrv_dev_should_queue_io(struct ublk_device *ub)
++{
++ return ub->dev_info.flags & UBLK_F_USER_RECOVERY;
++}
++
++/*
++ * Same as ublk_nosrv_dev_should_queue_io, but uses a queue-local copy
++ * of the device flags for smaller cache footprint - better for fast
++ * paths.
++ */
++static inline bool ublk_nosrv_should_queue_io(struct ublk_queue *ubq)
+ {
+ return ubq->flags & UBLK_F_USER_RECOVERY;
+ }
+
+-static inline bool ublk_can_use_recovery(struct ublk_device *ub)
++/*
++ * Should ublk devices be stopped (i.e. no recovery possible) when the
++ * ublk server exits? If not, devices can be used again by a future
++ * incarnation of a ublk server via the start_recovery/end_recovery
++ * commands.
++ */
++static inline bool ublk_nosrv_should_stop_dev(struct ublk_device *ub)
+ {
+- return ub->dev_info.flags & UBLK_F_USER_RECOVERY;
++ return !(ub->dev_info.flags & UBLK_F_USER_RECOVERY);
+ }
+
+ static void ublk_free_disk(struct gendisk *disk)
+@@ -1072,7 +1094,7 @@ static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io,
+ {
+ WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE);
+
+- if (ublk_queue_can_use_recovery_reissue(ubq))
++ if (ublk_nosrv_should_reissue_outstanding(ubq->dev))
+ blk_mq_requeue_request(req, false);
+ else
+ ublk_put_req_ref(ubq, req);
+@@ -1100,7 +1122,7 @@ static inline void __ublk_abort_rq(struct ublk_queue *ubq,
+ struct request *rq)
+ {
+ /* We cannot process this rq so just requeue it. */
+- if (ublk_queue_can_use_recovery(ubq))
++ if (ublk_nosrv_dev_should_queue_io(ubq->dev))
+ blk_mq_requeue_request(rq, false);
+ else
+ blk_mq_end_request(rq, BLK_STS_IOERR);
+@@ -1245,10 +1267,10 @@ static enum blk_eh_timer_return ublk_timeout(struct request *rq)
+ struct ublk_device *ub = ubq->dev;
+
+ if (ublk_abort_requests(ub, ubq)) {
+- if (ublk_can_use_recovery(ub))
+- schedule_work(&ub->quiesce_work);
+- else
++ if (ublk_nosrv_should_stop_dev(ub))
+ schedule_work(&ub->stop_work);
++ else
++ schedule_work(&ub->quiesce_work);
+ }
+ return BLK_EH_DONE;
+ }
+@@ -1277,7 +1299,7 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
+ * Note: force_abort is guaranteed to be seen because it is set
+ * before request queue is unqiuesced.
+ */
+- if (ublk_queue_can_use_recovery(ubq) && unlikely(ubq->force_abort))
++ if (ublk_nosrv_should_queue_io(ubq) && unlikely(ubq->force_abort))
+ return BLK_STS_IOERR;
+
+ if (unlikely(ubq->canceling)) {
+@@ -1517,10 +1539,10 @@ static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd,
+ ublk_cancel_cmd(ubq, io, issue_flags);
+
+ if (need_schedule) {
+- if (ublk_can_use_recovery(ub))
+- schedule_work(&ub->quiesce_work);
+- else
++ if (ublk_nosrv_should_stop_dev(ub))
+ schedule_work(&ub->stop_work);
++ else
++ schedule_work(&ub->quiesce_work);
+ }
+ }
+
+@@ -1640,7 +1662,7 @@ static void ublk_stop_dev(struct ublk_device *ub)
+ mutex_lock(&ub->mutex);
+ if (ub->dev_info.state == UBLK_S_DEV_DEAD)
+ goto unlock;
+- if (ublk_can_use_recovery(ub)) {
++ if (ublk_nosrv_dev_should_queue_io(ub)) {
+ if (ub->dev_info.state == UBLK_S_DEV_LIVE)
+ __ublk_quiesce_dev(ub);
+ ublk_unquiesce_dev(ub);
+@@ -2738,7 +2760,7 @@ static int ublk_ctrl_start_recovery(struct ublk_device *ub,
+ int i;
+
+ mutex_lock(&ub->mutex);
+- if (!ublk_can_use_recovery(ub))
++ if (ublk_nosrv_should_stop_dev(ub))
+ goto out_unlock;
+ if (!ub->nr_queues_ready)
+ goto out_unlock;
+@@ -2791,7 +2813,7 @@ static int ublk_ctrl_end_recovery(struct ublk_device *ub,
+ __func__, ub->dev_info.nr_hw_queues, header->dev_id);
+
+ mutex_lock(&ub->mutex);
+- if (!ublk_can_use_recovery(ub))
++ if (ublk_nosrv_should_stop_dev(ub))
+ goto out_unlock;
+
+ if (ub->dev_info.state != UBLK_S_DEV_QUIESCED) {
+--
+2.39.5
+
--- /dev/null
+From 1b5c7f79a08f8e345a7f4b56daa396991f28d3d9 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 7 Apr 2025 15:24:27 +0200
+Subject: x86/acpi: Don't limit CPUs to 1 for Xen PV guests due to disabled
+ ACPI
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Petr Vaněk <arkamar@atlas.cz>
+
+[ Upstream commit 8b37357a78d7fa13d88ea822b35b40137da1c85e ]
+
+Xen disables ACPI for PV guests in DomU, which causes acpi_mps_check() to
+return 1 when CONFIG_X86_MPPARSE is not set. As a result, the local APIC is
+disabled and the guest is later limited to a single vCPU, despite being
+configured with more.
+
+This regression was introduced in version 6.9 in commit 7c0edad3643f
+("x86/cpu/topology: Rework possible CPU management"), which added an
+early check that limits CPUs to 1 if apic_is_disabled.
+
+Update the acpi_mps_check() logic to return 0 early when running as a Xen
+PV guest in DomU, preventing APIC from being disabled in this specific case
+and restoring correct multi-vCPU behaviour.
+
+Fixes: 7c0edad3643f ("x86/cpu/topology: Rework possible CPU management")
+Signed-off-by: Petr Vaněk <arkamar@atlas.cz>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Link: https://lore.kernel.org/all/20250407132445.6732-2-arkamar@atlas.cz
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kernel/acpi/boot.c | 11 +++++++++++
+ 1 file changed, 11 insertions(+)
+
+diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
+index c70b86f1f2954..63adda8a143f9 100644
+--- a/arch/x86/kernel/acpi/boot.c
++++ b/arch/x86/kernel/acpi/boot.c
+@@ -23,6 +23,8 @@
+ #include <linux/serial_core.h>
+ #include <linux/pgtable.h>
+
++#include <xen/xen.h>
++
+ #include <asm/e820/api.h>
+ #include <asm/irqdomain.h>
+ #include <asm/pci_x86.h>
+@@ -1730,6 +1732,15 @@ int __init acpi_mps_check(void)
+ {
+ #if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_X86_MPPARSE)
+ /* mptable code is not built-in*/
++
++ /*
++ * Xen disables ACPI in PV DomU guests but it still emulates APIC and
++ * supports SMP. Returning early here ensures that APIC is not disabled
++ * unnecessarily and the guest is not limited to a single vCPU.
++ */
++ if (xen_pv_domain() && !xen_initial_domain())
++ return 0;
++
+ if (acpi_disabled || acpi_noirq) {
+ pr_warn("MPS support code is not built-in, using acpi=off or acpi=noirq or pci=noacpi may have problem\n");
+ return 1;
+--
+2.39.5
+