git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
Fixes for 6.12
author Sasha Levin <sashal@kernel.org>
Sat, 11 Jan 2025 19:12:10 +0000 (14:12 -0500)
committer Sasha Levin <sashal@kernel.org>
Sat, 11 Jan 2025 19:12:10 +0000 (14:12 -0500)
Signed-off-by: Sasha Levin <sashal@kernel.org>
queue-6.12/cgroup-cpuset-prevent-leakage-of-isolated-cpus-into-.patch [new file with mode: 0644]
queue-6.12/cgroup-cpuset-remove-kernfs-active-break.patch [new file with mode: 0644]
queue-6.12/sched_ext-idle-refresh-idle-masks-during-idle-to-idl.patch [new file with mode: 0644]
queue-6.12/sched_ext-replace-rq_lock-to-raw_spin_rq_lock-in-scx.patch [new file with mode: 0644]
queue-6.12/sched_ext-switch-class-when-preempted-by-higher-prio.patch [new file with mode: 0644]
queue-6.12/series
queue-6.12/thermal-of-fix-of-node-leak-in-of_thermal_zone_find.patch [new file with mode: 0644]

diff --git a/queue-6.12/cgroup-cpuset-prevent-leakage-of-isolated-cpus-into-.patch b/queue-6.12/cgroup-cpuset-prevent-leakage-of-isolated-cpus-into-.patch
new file mode 100644 (file)
index 0000000..cc1f5e5
--- /dev/null
@@ -0,0 +1,132 @@
+From 440a89783fa04b31bcdb72c38faaf40f112f7047 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 5 Dec 2024 14:51:01 -0500
+Subject: cgroup/cpuset: Prevent leakage of isolated CPUs into sched domains
+
+From: Waiman Long <longman@redhat.com>
+
+[ Upstream commit 9b496a8bbed9cc292b0dfd796f38ec58b6d0375f ]
+
+Isolated CPUs are not allowed to be used in a non-isolated partition.
+The only exception is the top cpuset which is allowed to contain boot
+time isolated CPUs.
+
+Commit ccac8e8de99c ("cgroup/cpuset: Fix remote root partition creation
+problem") introduces a simplified scheme of including only partition
+roots in sched domain generation. However, it does not properly account
+for this exception case. This can result in leakage of isolated CPUs
+into a sched domain.
+
+Fix it by making sure that isolated CPUs are excluded from the top
+cpuset before generating sched domains.
+
+Also update the way the boot time isolated CPUs are handled in
+test_cpuset_prs.sh to make sure that those isolated CPUs are really
+isolated instead of just skipping them in the tests.
+
+Fixes: ccac8e8de99c ("cgroup/cpuset: Fix remote root partition creation problem")
+Signed-off-by: Waiman Long <longman@redhat.com>
+Signed-off-by: Tejun Heo <tj@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/cgroup/cpuset.c                        | 10 +++++-
+ .../selftests/cgroup/test_cpuset_prs.sh       | 33 +++++++++++--------
+ 2 files changed, 28 insertions(+), 15 deletions(-)
+
+diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
+index a4dd285cdf39..c431c50512bd 100644
+--- a/kernel/cgroup/cpuset.c
++++ b/kernel/cgroup/cpuset.c
+@@ -862,7 +862,15 @@ static int generate_sched_domains(cpumask_var_t **domains,
+        */
+       if (cgrpv2) {
+               for (i = 0; i < ndoms; i++) {
+-                      cpumask_copy(doms[i], csa[i]->effective_cpus);
++                      /*
++                       * The top cpuset may contain some boot time isolated
++                       * CPUs that need to be excluded from the sched domain.
++                       */
++                      if (csa[i] == &top_cpuset)
++                              cpumask_and(doms[i], csa[i]->effective_cpus,
++                                          housekeeping_cpumask(HK_TYPE_DOMAIN));
++                      else
++                              cpumask_copy(doms[i], csa[i]->effective_cpus);
+                       if (dattr)
+                               dattr[i] = SD_ATTR_INIT;
+               }
+diff --git a/tools/testing/selftests/cgroup/test_cpuset_prs.sh b/tools/testing/selftests/cgroup/test_cpuset_prs.sh
+index 03c1bdaed2c3..400a696a0d21 100755
+--- a/tools/testing/selftests/cgroup/test_cpuset_prs.sh
++++ b/tools/testing/selftests/cgroup/test_cpuset_prs.sh
+@@ -86,15 +86,15 @@ echo "" > test/cpuset.cpus
+ #
+ # If isolated CPUs have been reserved at boot time (as shown in
+-# cpuset.cpus.isolated), these isolated CPUs should be outside of CPUs 0-7
++# cpuset.cpus.isolated), these isolated CPUs should be outside of CPUs 0-8
+ # that will be used by this script for testing purpose. If not, some of
+-# the tests may fail incorrectly. These isolated CPUs will also be removed
+-# before being compared with the expected results.
++# the tests may fail incorrectly. These pre-isolated CPUs should stay in
++# an isolated state throughout the testing process for now.
+ #
+ BOOT_ISOLCPUS=$(cat $CGROUP2/cpuset.cpus.isolated)
+ if [[ -n "$BOOT_ISOLCPUS" ]]
+ then
+-      [[ $(echo $BOOT_ISOLCPUS | sed -e "s/[,-].*//") -le 7 ]] &&
++      [[ $(echo $BOOT_ISOLCPUS | sed -e "s/[,-].*//") -le 8 ]] &&
+               skip_test "Pre-isolated CPUs ($BOOT_ISOLCPUS) overlap CPUs to be tested"
+       echo "Pre-isolated CPUs: $BOOT_ISOLCPUS"
+ fi
+@@ -683,15 +683,19 @@ check_isolcpus()
+               EXPECT_VAL2=$EXPECT_VAL
+       fi
++      #
++      # Appending pre-isolated CPUs
++      # Even though CPU #8 isn't used for testing, it can't be pre-isolated
++      # to make appending those CPUs easier.
++      #
++      [[ -n "$BOOT_ISOLCPUS" ]] && {
++              EXPECT_VAL=${EXPECT_VAL:+${EXPECT_VAL},}${BOOT_ISOLCPUS}
++              EXPECT_VAL2=${EXPECT_VAL2:+${EXPECT_VAL2},}${BOOT_ISOLCPUS}
++      }
++
+       #
+       # Check cpuset.cpus.isolated cpumask
+       #
+-      if [[ -z "$BOOT_ISOLCPUS" ]]
+-      then
+-              ISOLCPUS=$(cat $ISCPUS)
+-      else
+-              ISOLCPUS=$(cat $ISCPUS | sed -e "s/,*$BOOT_ISOLCPUS//")
+-      fi
+       [[ "$EXPECT_VAL2" != "$ISOLCPUS" ]] && {
+               # Take a 50ms pause and try again
+               pause 0.05
+@@ -731,8 +735,6 @@ check_isolcpus()
+               fi
+       done
+       [[ "$ISOLCPUS" = *- ]] && ISOLCPUS=${ISOLCPUS}$LASTISOLCPU
+-      [[ -n "BOOT_ISOLCPUS" ]] &&
+-              ISOLCPUS=$(echo $ISOLCPUS | sed -e "s/,*$BOOT_ISOLCPUS//")
+       [[ "$EXPECT_VAL" = "$ISOLCPUS" ]]
+ }
+@@ -836,8 +838,11 @@ run_state_test()
+               # if available
+               [[ -n "$ICPUS" ]] && {
+                       check_isolcpus $ICPUS
+-                      [[ $? -ne 0 ]] && test_fail $I "isolated CPU" \
+-                              "Expect $ICPUS, get $ISOLCPUS instead"
++                      [[ $? -ne 0 ]] && {
++                              [[ -n "$BOOT_ISOLCPUS" ]] && ICPUS=${ICPUS},${BOOT_ISOLCPUS}
++                              test_fail $I "isolated CPU" \
++                                      "Expect $ICPUS, get $ISOLCPUS instead"
++                      }
+               }
+               reset_cgroup_states
+               #
+-- 
+2.39.5
+
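The fix above boils down to intersecting the top cpuset's effective CPUs with the housekeeping (non-isolated) mask before a sched domain is generated for it. A minimal standalone userspace sketch of that mask arithmetic, not part of the queued patch: plain integer bitmasks stand in for the kernel's struct cpumask, and the CPU numbers are made up for illustration.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t effective_cpus = 0xff;           /* top cpuset: CPUs 0-7 */
        uint64_t isolated_cpus  = 0x30;           /* boot-time isolcpus: CPUs 4-5 */
        uint64_t housekeeping   = ~isolated_cpus; /* HK_TYPE_DOMAIN analogue */

        /* Before the fix: the domain simply copied effective_cpus, so the
         * isolated CPUs 4-5 leaked into the generated sched domain. */
        uint64_t dom_before = effective_cpus;

        /* After the fix: the top cpuset's domain excludes isolated CPUs,
         * mirroring cpumask_and(doms[i], effective_cpus, housekeeping). */
        uint64_t dom_after = effective_cpus & housekeeping;

        printf("before: 0x%llx  after: 0x%llx\n",
               (unsigned long long)dom_before, (unsigned long long)dom_after);
        return 0;
}

Running it prints "before: 0xff  after: 0xcf": CPUs 4-5 drop out of the domain mask, which is what the added cpumask_and() achieves for the top cpuset.
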
diff --git a/queue-6.12/cgroup-cpuset-remove-kernfs-active-break.patch b/queue-6.12/cgroup-cpuset-remove-kernfs-active-break.patch
new file mode 100644 (file)
index 0000000..c77f0b5
--- /dev/null
@@ -0,0 +1,132 @@
+From 968e8c09385638f21f4875b71591b49e7d6a1676 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 6 Jan 2025 08:19:04 +0000
+Subject: cgroup/cpuset: remove kernfs active break
+
+From: Chen Ridong <chenridong@huawei.com>
+
+[ Upstream commit 3cb97a927fffe443e1e7e8eddbfebfdb062e86ed ]
+
+A warning was found:
+
+WARNING: CPU: 10 PID: 3486953 at fs/kernfs/file.c:828
+CPU: 10 PID: 3486953 Comm: rmdir Kdump: loaded Tainted: G
+RIP: 0010:kernfs_should_drain_open_files+0x1a1/0x1b0
+RSP: 0018:ffff8881107ef9e0 EFLAGS: 00010202
+RAX: 0000000080000002 RBX: ffff888154738c00 RCX: dffffc0000000000
+RDX: 0000000000000007 RSI: 0000000000000004 RDI: ffff888154738c04
+RBP: ffff888154738c04 R08: ffffffffaf27fa15 R09: ffffed102a8e7180
+R10: ffff888154738c07 R11: 0000000000000000 R12: ffff888154738c08
+R13: ffff888750f8c000 R14: ffff888750f8c0e8 R15: ffff888154738ca0
+FS:  00007f84cd0be740(0000) GS:ffff8887ddc00000(0000) knlGS:0000000000000000
+CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+CR2: 0000555f9fbe00c8 CR3: 0000000153eec001 CR4: 0000000000370ee0
+DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
+DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
+Call Trace:
+ kernfs_drain+0x15e/0x2f0
+ __kernfs_remove+0x165/0x300
+ kernfs_remove_by_name_ns+0x7b/0xc0
+ cgroup_rm_file+0x154/0x1c0
+ cgroup_addrm_files+0x1c2/0x1f0
+ css_clear_dir+0x77/0x110
+ kill_css+0x4c/0x1b0
+ cgroup_destroy_locked+0x194/0x380
+ cgroup_rmdir+0x2a/0x140
+
+It can be explained by:
+rmdir                          echo 1 > cpuset.cpus
+                               kernfs_fop_write_iter // active=0
+cgroup_rm_file
+kernfs_remove_by_name_ns       kernfs_get_active // active=1
+__kernfs_remove                                          // active=0x80000002
+kernfs_drain                   cpuset_write_resmask
+wait_event
+//waiting (active == 0x80000001)
+                               kernfs_break_active_protection
+                               // active = 0x80000001
+// continue
+                               kernfs_unbreak_active_protection
+                               // active = 0x80000002
+...
+kernfs_should_drain_open_files
+// warning occurs
+                               kernfs_put_active
+
+This warning is caused by 'kernfs_break_active_protection' when a write to
+cpuset.cpus races with concurrent removal of the cgroup.
+
+Commit 3a5a6d0c2b03 ("cpuset: don't nest cgroup_mutex inside
+get_online_cpus()") made cpuset_hotplug_workfn() asynchronous. That change
+involves calling flush_work(), which can create a circular locking
+dependency across multiple processes involving cgroup_mutex, potentially
+leading to a deadlock. To avoid that deadlock, commit 76bb5ab8f6e3 ("cpuset:
+break kernfs active protection in cpuset_write_resmask()") added a
+'kernfs_break_active_protection' call to cpuset_write_resmask(). That call
+is what makes this warning possible.
+
+After commit 2125c0034c5d ("cgroup/cpuset: Make cpuset hotplug
+processing synchronous"), cpuset_write_resmask() no longer needs to wait
+for hotplug processing to finish, so concurrent hotplug and cpuset
+operations are no longer possible. The deadlock therefore no longer exists
+and there is no need to 'break active protection' anymore. To fix the
+warning, simply remove the kernfs_break_active_protection() operation from
+cpuset_write_resmask().
+
+Fixes: bdb2fd7fc56e ("kernfs: Skip kernfs_drain_open_files() more aggressively")
+Fixes: 76bb5ab8f6e3 ("cpuset: break kernfs active protection in cpuset_write_resmask()")
+Reported-by: Ji Fa <jifa@huawei.com>
+Signed-off-by: Chen Ridong <chenridong@huawei.com>
+Acked-by: Waiman Long <longman@redhat.com>
+Signed-off-by: Tejun Heo <tj@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/cgroup/cpuset.c | 25 -------------------------
+ 1 file changed, 25 deletions(-)
+
+diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
+index c431c50512bd..24ece85fd3b1 100644
+--- a/kernel/cgroup/cpuset.c
++++ b/kernel/cgroup/cpuset.c
+@@ -3110,29 +3110,6 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
+       int retval = -ENODEV;
+       buf = strstrip(buf);
+-
+-      /*
+-       * CPU or memory hotunplug may leave @cs w/o any execution
+-       * resources, in which case the hotplug code asynchronously updates
+-       * configuration and transfers all tasks to the nearest ancestor
+-       * which can execute.
+-       *
+-       * As writes to "cpus" or "mems" may restore @cs's execution
+-       * resources, wait for the previously scheduled operations before
+-       * proceeding, so that we don't end up keep removing tasks added
+-       * after execution capability is restored.
+-       *
+-       * cpuset_handle_hotplug may call back into cgroup core asynchronously
+-       * via cgroup_transfer_tasks() and waiting for it from a cgroupfs
+-       * operation like this one can lead to a deadlock through kernfs
+-       * active_ref protection.  Let's break the protection.  Losing the
+-       * protection is okay as we check whether @cs is online after
+-       * grabbing cpuset_mutex anyway.  This only happens on the legacy
+-       * hierarchies.
+-       */
+-      css_get(&cs->css);
+-      kernfs_break_active_protection(of->kn);
+-
+       cpus_read_lock();
+       mutex_lock(&cpuset_mutex);
+       if (!is_cpuset_online(cs))
+@@ -3163,8 +3140,6 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
+ out_unlock:
+       mutex_unlock(&cpuset_mutex);
+       cpus_read_unlock();
+-      kernfs_unbreak_active_protection(of->kn);
+-      css_put(&cs->css);
+       flush_workqueue(cpuset_migrate_mm_wq);
+       return retval ?: nbytes;
+ }
+-- 
+2.39.5
+
diff --git a/queue-6.12/sched_ext-idle-refresh-idle-masks-during-idle-to-idl.patch b/queue-6.12/sched_ext-idle-refresh-idle-masks-during-idle-to-idl.patch
new file mode 100644 (file)
index 0000000..20a93d1
--- /dev/null
@@ -0,0 +1,202 @@
+From 84f5aa317f74c6c22d042df68b069aa3d6611bcb Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 10 Jan 2025 23:16:31 +0100
+Subject: sched_ext: idle: Refresh idle masks during idle-to-idle transitions
+
+From: Andrea Righi <arighi@nvidia.com>
+
+[ Upstream commit a2a3374c47c428c0edb0bbc693638d4783f81e31 ]
+
+With the consolidation of put_prev_task/set_next_task(), see
+commit 436f3eed5c69 ("sched: Combine the last put_prev_task() and the
+first set_next_task()"), we are now skipping the transition between
+these two functions when the previous and the next tasks are the same.
+
+As a result, the scx idle state of a CPU is updated only when
+transitioning to or from the idle thread. While this is generally
+correct, it can lead to uneven and inefficient core utilization in
+certain scenarios [1].
+
+A typical scenario involves proactive wake-ups: scx_bpf_pick_idle_cpu()
+selects and marks an idle CPU as busy, followed by a wake-up via
+scx_bpf_kick_cpu(), without dispatching any tasks. In this case, the CPU
+continues running the idle thread, returns to idle, but remains marked
+as busy, preventing it from being selected again as an idle CPU (until a
+task eventually runs on it and releases the CPU).
+
+For example, running a workload that uses 20% of each CPU, combined with
+an scx scheduler using proactive wake-ups, results in the following core
+utilization:
+
+ CPU 0: 25.7%
+ CPU 1: 29.3%
+ CPU 2: 26.5%
+ CPU 3: 25.5%
+ CPU 4:  0.0%
+ CPU 5: 25.5%
+ CPU 6:  0.0%
+ CPU 7: 10.5%
+
+To address this, refresh the idle state also in pick_task_idle(), during
+idle-to-idle transitions, but only trigger ops.update_idle() on actual
+state changes to prevent unnecessary updates to the scx scheduler and
+maintain balanced state transitions.
+
+With this change in place, the core utilization in the previous example
+becomes the following:
+
+ CPU 0: 18.8%
+ CPU 1: 19.4%
+ CPU 2: 18.0%
+ CPU 3: 18.7%
+ CPU 4: 19.3%
+ CPU 5: 18.9%
+ CPU 6: 18.7%
+ CPU 7: 19.3%
+
+[1] https://github.com/sched-ext/scx/pull/1139
+
+Fixes: 7c65ae81ea86 ("sched_ext: Don't call put_prev_task_scx() before picking the next task")
+Signed-off-by: Andrea Righi <arighi@nvidia.com>
+Signed-off-by: Tejun Heo <tj@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/sched/ext.c  | 61 ++++++++++++++++++++++++++++++++++++++-------
+ kernel/sched/ext.h  |  8 +++---
+ kernel/sched/idle.c |  5 ++--
+ 3 files changed, 59 insertions(+), 15 deletions(-)
+
+diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
+index f3ca1a88375c..f928a67a07d2 100644
+--- a/kernel/sched/ext.c
++++ b/kernel/sched/ext.c
+@@ -3240,16 +3240,8 @@ static void reset_idle_masks(void)
+       cpumask_copy(idle_masks.smt, cpu_online_mask);
+ }
+-void __scx_update_idle(struct rq *rq, bool idle)
++static void update_builtin_idle(int cpu, bool idle)
+ {
+-      int cpu = cpu_of(rq);
+-
+-      if (SCX_HAS_OP(update_idle) && !scx_rq_bypassing(rq)) {
+-              SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle);
+-              if (!static_branch_unlikely(&scx_builtin_idle_enabled))
+-                      return;
+-      }
+-
+       if (idle)
+               cpumask_set_cpu(cpu, idle_masks.cpu);
+       else
+@@ -3276,6 +3268,57 @@ void __scx_update_idle(struct rq *rq, bool idle)
+ #endif
+ }
++/*
++ * Update the idle state of a CPU to @idle.
++ *
++ * If @do_notify is true, ops.update_idle() is invoked to notify the scx
++ * scheduler of an actual idle state transition (idle to busy or vice
++ * versa). If @do_notify is false, only the idle state in the idle masks is
++ * refreshed without invoking ops.update_idle().
++ *
++ * This distinction is necessary, because an idle CPU can be "reserved" and
++ * awakened via scx_bpf_pick_idle_cpu() + scx_bpf_kick_cpu(), marking it as
++ * busy even if no tasks are dispatched. In this case, the CPU may return
++ * to idle without a true state transition. Refreshing the idle masks
++ * without invoking ops.update_idle() ensures accurate idle state tracking
++ * while avoiding unnecessary updates and maintaining balanced state
++ * transitions.
++ */
++void __scx_update_idle(struct rq *rq, bool idle, bool do_notify)
++{
++      int cpu = cpu_of(rq);
++
++      lockdep_assert_rq_held(rq);
++
++      /*
++       * Trigger ops.update_idle() only when transitioning from a task to
++       * the idle thread and vice versa.
++       *
++       * Idle transitions are indicated by do_notify being set to true,
++       * managed by put_prev_task_idle()/set_next_task_idle().
++       */
++      if (SCX_HAS_OP(update_idle) && do_notify && !scx_rq_bypassing(rq))
++              SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle);
++
++      /*
++       * Update the idle masks:
++       * - for real idle transitions (do_notify == true)
++       * - for idle-to-idle transitions (indicated by the previous task
++       *   being the idle thread, managed by pick_task_idle())
++       *
++       * Skip updating idle masks if the previous task is not the idle
++       * thread, since set_next_task_idle() has already handled it when
++       * transitioning from a task to the idle thread (calling this
++       * function with do_notify == true).
++       *
++       * In this way we can avoid updating the idle masks twice,
++       * unnecessarily.
++       */
++      if (static_branch_likely(&scx_builtin_idle_enabled))
++              if (do_notify || is_idle_task(rq->curr))
++                      update_builtin_idle(cpu, idle);
++}
++
+ static void handle_hotplug(struct rq *rq, bool online)
+ {
+       int cpu = cpu_of(rq);
+diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h
+index b1675bb59fc4..4d022d17ac7d 100644
+--- a/kernel/sched/ext.h
++++ b/kernel/sched/ext.h
+@@ -57,15 +57,15 @@ static inline void init_sched_ext_class(void) {}
+ #endif        /* CONFIG_SCHED_CLASS_EXT */
+ #if defined(CONFIG_SCHED_CLASS_EXT) && defined(CONFIG_SMP)
+-void __scx_update_idle(struct rq *rq, bool idle);
++void __scx_update_idle(struct rq *rq, bool idle, bool do_notify);
+-static inline void scx_update_idle(struct rq *rq, bool idle)
++static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify)
+ {
+       if (scx_enabled())
+-              __scx_update_idle(rq, idle);
++              __scx_update_idle(rq, idle, do_notify);
+ }
+ #else
+-static inline void scx_update_idle(struct rq *rq, bool idle) {}
++static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify) {}
+ #endif
+ #ifdef CONFIG_CGROUP_SCHED
+diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
+index d2f096bb274c..53bb9193c537 100644
+--- a/kernel/sched/idle.c
++++ b/kernel/sched/idle.c
+@@ -453,19 +453,20 @@ static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags)
+ static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct task_struct *next)
+ {
+       dl_server_update_idle_time(rq, prev);
+-      scx_update_idle(rq, false);
++      scx_update_idle(rq, false, true);
+ }
+ static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first)
+ {
+       update_idle_core(rq);
+-      scx_update_idle(rq, true);
++      scx_update_idle(rq, true, true);
+       schedstat_inc(rq->sched_goidle);
+       next->se.exec_start = rq_clock_task(rq);
+ }
+ struct task_struct *pick_task_idle(struct rq *rq)
+ {
++      scx_update_idle(rq, true, false);
+       return rq->idle;
+ }
+-- 
+2.39.5
+
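The proactive wake-up scenario described in the patch above can be modelled with a small standalone C program, not part of the queued patch: a bitmask stands in for the scx idle mask, pick_idle_cpu() plays the role of scx_bpf_pick_idle_cpu() reserving a CPU, and the two return_to_idle_*() helpers contrast the old behaviour (an idle-to-idle transition leaves the mask untouched, so the CPU stays marked busy) with the fixed behaviour (pick_task_idle() refreshes the mask). All names and numbers are illustrative, not kernel APIs.

#include <stdio.h>

static unsigned long idle_mask = 0xff;  /* CPUs 0-7 start out idle */

/* Reserve an idle CPU: clear its bit so nobody else picks it, analogous to
 * what the builtin idle tracking does on scx_bpf_pick_idle_cpu(). */
static int pick_idle_cpu(void)
{
        for (int cpu = 0; cpu < 8; cpu++) {
                if (idle_mask & (1UL << cpu)) {
                        idle_mask &= ~(1UL << cpu);
                        return cpu;
                }
        }
        return -1;
}

/* Old behaviour: the CPU was kicked but no task was dispatched, so it goes
 * from idle back to idle and the mask is never refreshed. */
static void return_to_idle_old(int cpu) { (void)cpu; }

/* Fixed behaviour: pick_task_idle() refreshes the idle mask on the
 * idle-to-idle transition, making the CPU selectable again. */
static void return_to_idle_new(int cpu) { idle_mask |= 1UL << cpu; }

int main(void)
{
        int cpu = pick_idle_cpu();        /* reserve + kick, dispatch nothing */

        return_to_idle_old(cpu);
        printf("old: CPU %d idle bit = %lu\n", cpu, (idle_mask >> cpu) & 1UL);

        return_to_idle_new(cpu);
        printf("new: CPU %d idle bit = %lu\n", cpu, (idle_mask >> cpu) & 1UL);
        return 0;
}

With the old behaviour the reserved CPU keeps its idle bit cleared even though it is running nothing, which is how the uneven core utilization in the example above arises.
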
diff --git a/queue-6.12/sched_ext-replace-rq_lock-to-raw_spin_rq_lock-in-scx.patch b/queue-6.12/sched_ext-replace-rq_lock-to-raw_spin_rq_lock-in-scx.patch
new file mode 100644 (file)
index 0000000..f60599a
--- /dev/null
@@ -0,0 +1,77 @@
+From cd27aa29a0476bae712bb54202cc3e91f1d1ef8b Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 9 Jan 2025 00:08:06 +0900
+Subject: sched_ext: Replace rq_lock() to raw_spin_rq_lock() in
+ scx_ops_bypass()
+
+From: Changwoo Min <changwoo@igalia.com>
+
+[ Upstream commit 6268d5bc10354fc2ab8d44a0cd3b042d49a0417e ]
+
+scx_ops_bypass() iterates all CPUs to re-enqueue all the scx tasks.
+For each CPU, it acquires a lock using rq_lock() regardless of whether
+a CPU is offline or the CPU is currently running a task in a higher
+scheduler class (e.g., deadline). The rq_lock() is supposed to be used
+for online CPUs, and the use of rq_lock() may trigger an unnecessary
+warning in rq_pin_lock(). Therefore, replace rq_lock() to
+raw_spin_rq_lock() in scx_ops_bypass().
+
+Without this change, we observe the following warning:
+
+===== START =====
+[    6.615205] rq->balance_callback && rq->balance_callback != &balance_push_callback
+[    6.615208] WARNING: CPU: 2 PID: 0 at kernel/sched/sched.h:1730 __schedule+0x1130/0x1c90
+=====  END  =====
+
+Fixes: 0e7ffff1b811 ("scx: Fix raciness in scx_ops_bypass()")
+Signed-off-by: Changwoo Min <changwoo@igalia.com>
+Acked-by: Andrea Righi <arighi@nvidia.com>
+Signed-off-by: Tejun Heo <tj@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/sched/ext.c | 12 ++++++------
+ 1 file changed, 6 insertions(+), 6 deletions(-)
+
+diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
+index 40f915f893e2..81235942555a 100644
+--- a/kernel/sched/ext.c
++++ b/kernel/sched/ext.c
+@@ -4348,10 +4348,9 @@ static void scx_ops_bypass(bool bypass)
+        */
+       for_each_possible_cpu(cpu) {
+               struct rq *rq = cpu_rq(cpu);
+-              struct rq_flags rf;
+               struct task_struct *p, *n;
+-              rq_lock(rq, &rf);
++              raw_spin_rq_lock(rq);
+               if (bypass) {
+                       WARN_ON_ONCE(rq->scx.flags & SCX_RQ_BYPASSING);
+@@ -4367,7 +4366,7 @@ static void scx_ops_bypass(bool bypass)
+                * sees scx_rq_bypassing() before moving tasks to SCX.
+                */
+               if (!scx_enabled()) {
+-                      rq_unlock(rq, &rf);
++                      raw_spin_rq_unlock(rq);
+                       continue;
+               }
+@@ -4387,10 +4386,11 @@ static void scx_ops_bypass(bool bypass)
+                       sched_enq_and_set_task(&ctx);
+               }
+-              rq_unlock(rq, &rf);
+-
+               /* resched to restore ticks and idle state */
+-              resched_cpu(cpu);
++              if (cpu_online(cpu) || cpu == smp_processor_id())
++                      resched_curr(rq);
++
++              raw_spin_rq_unlock(rq);
+       }
+ unlock:
+       raw_spin_unlock_irqrestore(&__scx_ops_bypass_lock, flags);
+-- 
+2.39.5
+
diff --git a/queue-6.12/sched_ext-switch-class-when-preempted-by-higher-prio.patch b/queue-6.12/sched_ext-switch-class-when-preempted-by-higher-prio.patch
new file mode 100644 (file)
index 0000000..8c38e59
--- /dev/null
@@ -0,0 +1,47 @@
+From 58e0563c10d0a205b16b906447f6a0b9146ebf08 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 8 Jan 2025 10:33:28 +0800
+Subject: sched_ext: switch class when preempted by higher priority scheduler
+
+From: Honglei Wang <jameshongleiwang@126.com>
+
+[ Upstream commit 68e449d849fd50bd5e61d8bd32b3458dbd3a3df6 ]
+
+The ops.cpu_release() callback, if defined, must be invoked when a CPU is
+preempted by a task from a higher priority scheduler class. This case was
+missed in
+commit f422316d7466 ("sched_ext: Remove switch_class_scx()"). Let's fix
+it.
+
+Fixes: f422316d7466 ("sched_ext: Remove switch_class_scx()")
+Signed-off-by: Honglei Wang <jameshongleiwang@126.com>
+Acked-by: Andrea Righi <arighi@nvidia.com>
+Signed-off-by: Tejun Heo <tj@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/sched/ext.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
+index 81235942555a..f3ca1a88375c 100644
+--- a/kernel/sched/ext.c
++++ b/kernel/sched/ext.c
+@@ -2917,7 +2917,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
+                */
+               if (p->scx.slice && !scx_rq_bypassing(rq)) {
+                       dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD);
+-                      return;
++                      goto switch_class;
+               }
+               /*
+@@ -2934,6 +2934,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
+               }
+       }
++switch_class:
+       if (next && next->sched_class != &ext_sched_class)
+               switch_class(rq, next);
+ }
+-- 
+2.39.5
+
diff --git a/queue-6.12/series b/queue-6.12/series
index c3e2712d7968556d112d1c3d17df921b44408a37..a7e4b70921af09bc7997eb35955856a1a52c8317 100644 (file)
--- a/queue-6.12/series
@@ -103,3 +103,9 @@ btrfs-zlib-fix-avail_in-bytes-for-s390-zlib-hw-compression-path.patch
 revert-drm-mediatek-dsi-correct-calculation-formula-of-phy-timing.patch
 drm-amd-display-remove-unnecessary-amdgpu_irq_get-put.patch
 drm-amd-display-add-check-for-granularity-in-dml-ceil-floor-helpers.patch
+cgroup-cpuset-prevent-leakage-of-isolated-cpus-into-.patch
+thermal-of-fix-of-node-leak-in-of_thermal_zone_find.patch
+sched_ext-replace-rq_lock-to-raw_spin_rq_lock-in-scx.patch
+sched_ext-switch-class-when-preempted-by-higher-prio.patch
+cgroup-cpuset-remove-kernfs-active-break.patch
+sched_ext-idle-refresh-idle-masks-during-idle-to-idl.patch
diff --git a/queue-6.12/thermal-of-fix-of-node-leak-in-of_thermal_zone_find.patch b/queue-6.12/thermal-of-fix-of-node-leak-in-of_thermal_zone_find.patch
new file mode 100644 (file)
index 0000000..c011827
--- /dev/null
@@ -0,0 +1,39 @@
+From 68f73315316c6db979ed157934afde15ef581a68 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 24 Dec 2024 12:18:09 +0900
+Subject: thermal: of: fix OF node leak in of_thermal_zone_find()
+
+From: Joe Hattori <joe@pf.is.s.u-tokyo.ac.jp>
+
+[ Upstream commit 9164e0912af206a72ddac4915f7784e470a04ace ]
+
+of_thermal_zone_find() calls of_parse_phandle_with_args(), but does not
+release the OF node reference obtained by it.
+
+Add an of_node_put() call when the parse call is successful.
+
+Fixes: 3fd6d6e2b4e8 ("thermal/of: Rework the thermal device tree initialization")
+Signed-off-by: Joe Hattori <joe@pf.is.s.u-tokyo.ac.jp>
+Link: https://patch.msgid.link/20241224031809.950461-1-joe@pf.is.s.u-tokyo.ac.jp
+[ rjw: Changelog edit ]
+Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/thermal/thermal_of.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/drivers/thermal/thermal_of.c b/drivers/thermal/thermal_of.c
+index 07e09897165f..5d3d8ce672cd 100644
+--- a/drivers/thermal/thermal_of.c
++++ b/drivers/thermal/thermal_of.c
+@@ -176,6 +176,7 @@ static struct device_node *of_thermal_zone_find(struct device_node *sensor, int
+                               goto out;
+                       }
++                      of_node_put(sensor_specs.np);
+                       if ((sensor == sensor_specs.np) && id == (sensor_specs.args_count ?
+                                                                 sensor_specs.args[0] : 0)) {
+                               pr_debug("sensor %pOFn id=%d belongs to %pOFn\n", sensor, id, child);
+-- 
+2.39.5
+
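The bug class fixed above is a missed put on a reference-counted device-tree node: of_parse_phandle_with_args() returns the node in sensor_specs.np with its reference count elevated, and the caller must drop that reference with of_node_put() on every path, including the successful lookup. A self-contained mock of that get/put contract, using stub types and helpers rather than the real OF API:

#include <stdio.h>

struct device_node { int refcount; };

static struct device_node sensor_node = { .refcount = 1 };

/* Stub standing in for of_parse_phandle_with_args(): the returned node
 * carries an extra reference that the caller now owns. */
static struct device_node *parse_phandle_stub(void)
{
        sensor_node.refcount++;
        return &sensor_node;
}

/* Stub standing in for of_node_put(): drops one reference. */
static void node_put_stub(struct device_node *np)
{
        np->refcount--;
}

int main(void)
{
        struct device_node *np = parse_phandle_stub();

        /* ... compare np against the sensor being looked up ... */

        node_put_stub(np);  /* without this, one reference leaks per lookup */
        printf("refcount after lookup: %d\n", sensor_node.refcount);
        return 0;
}

The queued one-line fix adds the missing of_node_put() right after the phandle lookup succeeds, restoring exactly this balance.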