From d117d5ad842c013e48d0243a20fea0815a372954 Mon Sep 17 00:00:00 2001
From: Sasha Levin
Date: Sat, 11 Jan 2025 14:12:10 -0500
Subject: [PATCH] Fixes for 6.12

Signed-off-by: Sasha Levin
---
 ...event-leakage-of-isolated-cpus-into-.patch | 132 ++++++++++++
 ...up-cpuset-remove-kernfs-active-break.patch | 132 ++++++++++++
 ...efresh-idle-masks-during-idle-to-idl.patch | 202 ++++++++++++++++++
 ...e-rq_lock-to-raw_spin_rq_lock-in-scx.patch | 77 +++++++
 ...-class-when-preempted-by-higher-prio.patch | 47 ++++
 queue-6.12/series | 6 +
 ...of-node-leak-in-of_thermal_zone_find.patch | 39 ++++
 7 files changed, 635 insertions(+)
 create mode 100644 queue-6.12/cgroup-cpuset-prevent-leakage-of-isolated-cpus-into-.patch
 create mode 100644 queue-6.12/cgroup-cpuset-remove-kernfs-active-break.patch
 create mode 100644 queue-6.12/sched_ext-idle-refresh-idle-masks-during-idle-to-idl.patch
 create mode 100644 queue-6.12/sched_ext-replace-rq_lock-to-raw_spin_rq_lock-in-scx.patch
 create mode 100644 queue-6.12/sched_ext-switch-class-when-preempted-by-higher-prio.patch
 create mode 100644 queue-6.12/thermal-of-fix-of-node-leak-in-of_thermal_zone_find.patch

diff --git a/queue-6.12/cgroup-cpuset-prevent-leakage-of-isolated-cpus-into-.patch b/queue-6.12/cgroup-cpuset-prevent-leakage-of-isolated-cpus-into-.patch
new file mode 100644
index 00000000000..cc1f5e51fdf
--- /dev/null
+++ b/queue-6.12/cgroup-cpuset-prevent-leakage-of-isolated-cpus-into-.patch
@@ -0,0 +1,132 @@
+From 440a89783fa04b31bcdb72c38faaf40f112f7047 Mon Sep 17 00:00:00 2001
+From: Sasha Levin
+Date: Thu, 5 Dec 2024 14:51:01 -0500
+Subject: cgroup/cpuset: Prevent leakage of isolated CPUs into sched domains
+
+From: Waiman Long
+
+[ Upstream commit 9b496a8bbed9cc292b0dfd796f38ec58b6d0375f ]
+
+Isolated CPUs are not allowed to be used in a non-isolated partition.
+The only exception is the top cpuset which is allowed to contain boot
+time isolated CPUs.
+
+Commit ccac8e8de99c ("cgroup/cpuset: Fix remote root partition creation
+problem") introduces a simplified scheme of including only partition
+roots in sched domain generation. However, it does not properly account
+for this exception case. This can result in leakage of isolated CPUs
+into a sched domain.
+
+Fix it by making sure that isolated CPUs are excluded from the top
+cpuset before generating sched domains.
+
+Also update the way the boot time isolated CPUs are handled in
+test_cpuset_prs.sh to make sure that those isolated CPUs are really
+isolated instead of just skipping them in the tests.
+
+Fixes: ccac8e8de99c ("cgroup/cpuset: Fix remote root partition creation problem")
+Signed-off-by: Waiman Long
+Signed-off-by: Tejun Heo
+Signed-off-by: Sasha Levin
+---
+ kernel/cgroup/cpuset.c | 10 +++++-
+ .../selftests/cgroup/test_cpuset_prs.sh | 33 +++++++++++--------
+ 2 files changed, 28 insertions(+), 15 deletions(-)
+
+diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
+index a4dd285cdf39..c431c50512bd 100644
+--- a/kernel/cgroup/cpuset.c
++++ b/kernel/cgroup/cpuset.c
+@@ -862,7 +862,15 @@ static int generate_sched_domains(cpumask_var_t **domains,
+ */
+ if (cgrpv2) {
+ for (i = 0; i < ndoms; i++) {
+- cpumask_copy(doms[i], csa[i]->effective_cpus);
++ /*
++ * The top cpuset may contain some boot time isolated
++ * CPUs that need to be excluded from the sched domain.
++ */
++ if (csa[i] == &top_cpuset)
++ cpumask_and(doms[i], csa[i]->effective_cpus,
++ housekeeping_cpumask(HK_TYPE_DOMAIN));
++ else
++ cpumask_copy(doms[i], csa[i]->effective_cpus);
+ if (dattr)
+ dattr[i] = SD_ATTR_INIT;
+ }
+diff --git a/tools/testing/selftests/cgroup/test_cpuset_prs.sh b/tools/testing/selftests/cgroup/test_cpuset_prs.sh
+index 03c1bdaed2c3..400a696a0d21 100755
+--- a/tools/testing/selftests/cgroup/test_cpuset_prs.sh
++++ b/tools/testing/selftests/cgroup/test_cpuset_prs.sh
+@@ -86,15 +86,15 @@ echo "" > test/cpuset.cpus
+
+ #
+ # If isolated CPUs have been reserved at boot time (as shown in
+-# cpuset.cpus.isolated), these isolated CPUs should be outside of CPUs 0-7
++# cpuset.cpus.isolated), these isolated CPUs should be outside of CPUs 0-8
+ # that will be used by this script for testing purpose. If not, some of
+-# the tests may fail incorrectly. These isolated CPUs will also be removed
+-# before being compared with the expected results.
++# the tests may fail incorrectly. These pre-isolated CPUs should stay in
++# an isolated state throughout the testing process for now.
+ #
+ BOOT_ISOLCPUS=$(cat $CGROUP2/cpuset.cpus.isolated)
+ if [[ -n "$BOOT_ISOLCPUS" ]]
+ then
+- [[ $(echo $BOOT_ISOLCPUS | sed -e "s/[,-].*//") -le 7 ]] &&
++ [[ $(echo $BOOT_ISOLCPUS | sed -e "s/[,-].*//") -le 8 ]] &&
+ skip_test "Pre-isolated CPUs ($BOOT_ISOLCPUS) overlap CPUs to be tested"
+ echo "Pre-isolated CPUs: $BOOT_ISOLCPUS"
+ fi
+@@ -683,15 +683,19 @@ check_isolcpus()
+ EXPECT_VAL2=$EXPECT_VAL
+ fi
+
++ #
++ # Appending pre-isolated CPUs
++ # Even though CPU #8 isn't used for testing, it can't be pre-isolated
++ # to make appending those CPUs easier.
++ #
++ [[ -n "$BOOT_ISOLCPUS" ]] && {
++ EXPECT_VAL=${EXPECT_VAL:+${EXPECT_VAL},}${BOOT_ISOLCPUS}
++ EXPECT_VAL2=${EXPECT_VAL2:+${EXPECT_VAL2},}${BOOT_ISOLCPUS}
++ }
++
+ #
+ # Check cpuset.cpus.isolated cpumask
+ #
+- if [[ -z "$BOOT_ISOLCPUS" ]]
+- then
+- ISOLCPUS=$(cat $ISCPUS)
+- else
+- ISOLCPUS=$(cat $ISCPUS | sed -e "s/,*$BOOT_ISOLCPUS//")
+- fi
+ [[ "$EXPECT_VAL2" != "$ISOLCPUS" ]] && {
+ # Take a 50ms pause and try again
+ pause 0.05
+@@ -731,8 +735,6 @@ check_isolcpus()
+ fi
+ done
+ [[ "$ISOLCPUS" = *- ]] && ISOLCPUS=${ISOLCPUS}$LASTISOLCPU
+- [[ -n "BOOT_ISOLCPUS" ]] &&
+- ISOLCPUS=$(echo $ISOLCPUS | sed -e "s/,*$BOOT_ISOLCPUS//")
+
+ [[ "$EXPECT_VAL" = "$ISOLCPUS" ]]
+ }
+@@ -836,8 +838,11 @@ run_state_test()
+ # if available
+ [[ -n "$ICPUS" ]] && {
+ check_isolcpus $ICPUS
+- [[ $? -ne 0 ]] && test_fail $I "isolated CPU" \
+- "Expect $ICPUS, get $ISOLCPUS instead"
++ [[ $? -ne 0 ]] && {
++ [[ -n "$BOOT_ISOLCPUS" ]] && ICPUS=${ICPUS},${BOOT_ISOLCPUS}
++ test_fail $I "isolated CPU" \
++ "Expect $ICPUS, get $ISOLCPUS instead"
++ }
+ }
+ reset_cgroup_states
+ #
+--
+2.39.5
+
diff --git a/queue-6.12/cgroup-cpuset-remove-kernfs-active-break.patch b/queue-6.12/cgroup-cpuset-remove-kernfs-active-break.patch
new file mode 100644
index 00000000000..c77f0b51d2a
--- /dev/null
+++ b/queue-6.12/cgroup-cpuset-remove-kernfs-active-break.patch
@@ -0,0 +1,132 @@
+From 968e8c09385638f21f4875b71591b49e7d6a1676 Mon Sep 17 00:00:00 2001
+From: Sasha Levin
+Date: Mon, 6 Jan 2025 08:19:04 +0000
+Subject: cgroup/cpuset: remove kernfs active break
+
+From: Chen Ridong
+
+[ Upstream commit 3cb97a927fffe443e1e7e8eddbfebfdb062e86ed ]
+
+A warning was found:
+
+WARNING: CPU: 10 PID: 3486953 at fs/kernfs/file.c:828
+CPU: 10 PID: 3486953 Comm: rmdir Kdump: loaded Tainted: G
+RIP: 0010:kernfs_should_drain_open_files+0x1a1/0x1b0
+RSP: 0018:ffff8881107ef9e0 EFLAGS: 00010202
+RAX: 0000000080000002 RBX: ffff888154738c00 RCX: dffffc0000000000
+RDX: 0000000000000007 RSI: 0000000000000004 RDI: ffff888154738c04
+RBP: ffff888154738c04 R08: ffffffffaf27fa15 R09: ffffed102a8e7180
+R10: ffff888154738c07 R11: 0000000000000000 R12: ffff888154738c08
+R13: ffff888750f8c000 R14: ffff888750f8c0e8 R15: ffff888154738ca0
+FS: 00007f84cd0be740(0000) GS:ffff8887ddc00000(0000) knlGS:0000000000000000
+CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+CR2: 0000555f9fbe00c8 CR3: 0000000153eec001 CR4: 0000000000370ee0
+DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
+DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
+Call Trace:
+ kernfs_drain+0x15e/0x2f0
+ __kernfs_remove+0x165/0x300
+ kernfs_remove_by_name_ns+0x7b/0xc0
+ cgroup_rm_file+0x154/0x1c0
+ cgroup_addrm_files+0x1c2/0x1f0
+ css_clear_dir+0x77/0x110
+ kill_css+0x4c/0x1b0
+ cgroup_destroy_locked+0x194/0x380
+ cgroup_rmdir+0x2a/0x140
+
+It can be explained by:
+rmdir echo 1 > cpuset.cpus
+ kernfs_fop_write_iter // active=0
+cgroup_rm_file
+kernfs_remove_by_name_ns kernfs_get_active // active=1
+__kernfs_remove // active=0x80000002
+kernfs_drain cpuset_write_resmask
+wait_event
+//waiting (active == 0x80000001)
+ kernfs_break_active_protection
+ // active = 0x80000001
+// continue
+ kernfs_unbreak_active_protection
+ // active = 0x80000002
+...
+kernfs_should_drain_open_files
+// warning occurs
+ kernfs_put_active
+
+This warning is caused by 'kernfs_break_active_protection' when it is
+writing to cpuset.cpus, and the cgroup is removed concurrently.
+
+The commit 3a5a6d0c2b03 ("cpuset: don't nest cgroup_mutex inside
+get_online_cpus()") made cpuset_hotplug_workfn asynchronous, This change
+involves calling flush_work(), which can create a multiple processes
+circular locking dependency that involve cgroup_mutex, potentially leading
+to a deadlock. To avoid deadlock. the commit 76bb5ab8f6e3 ("cpuset: break
+kernfs active protection in cpuset_write_resmask()") added
+'kernfs_break_active_protection' in the cpuset_write_resmask. This could
+lead to this warning.
+
+After the commit 2125c0034c5d ("cgroup/cpuset: Make cpuset hotplug
+processing synchronous"), the cpuset_write_resmask no longer needs to
+wait the hotplug to finish, which means that concurrent hotplug and cpuset
+operations are no longer possible. Therefore, the deadlock doesn't exist
+anymore and it does not have to 'break active protection' now. To fix this
+warning, just remove kernfs_break_active_protection operation in the
+'cpuset_write_resmask'.
+
+Fixes: bdb2fd7fc56e ("kernfs: Skip kernfs_drain_open_files() more aggressively")
+Fixes: 76bb5ab8f6e3 ("cpuset: break kernfs active protection in cpuset_write_resmask()")
+Reported-by: Ji Fa
+Signed-off-by: Chen Ridong
+Acked-by: Waiman Long
+Signed-off-by: Tejun Heo
+Signed-off-by: Sasha Levin
+---
+ kernel/cgroup/cpuset.c | 25 -------------------------
+ 1 file changed, 25 deletions(-)
+
+diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
+index c431c50512bd..24ece85fd3b1 100644
+--- a/kernel/cgroup/cpuset.c
++++ b/kernel/cgroup/cpuset.c
+@@ -3110,29 +3110,6 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
+ int retval = -ENODEV;
+
+ buf = strstrip(buf);
+-
+- /*
+- * CPU or memory hotunplug may leave @cs w/o any execution
+- * resources, in which case the hotplug code asynchronously updates
+- * configuration and transfers all tasks to the nearest ancestor
+- * which can execute.
+- *
+- * As writes to "cpus" or "mems" may restore @cs's execution
+- * resources, wait for the previously scheduled operations before
+- * proceeding, so that we don't end up keep removing tasks added
+- * after execution capability is restored.
+- *
+- * cpuset_handle_hotplug may call back into cgroup core asynchronously
+- * via cgroup_transfer_tasks() and waiting for it from a cgroupfs
+- * operation like this one can lead to a deadlock through kernfs
+- * active_ref protection. Let's break the protection. Losing the
+- * protection is okay as we check whether @cs is online after
+- * grabbing cpuset_mutex anyway. This only happens on the legacy
+- * hierarchies.
+- */
+- css_get(&cs->css);
+- kernfs_break_active_protection(of->kn);
+-
+ cpus_read_lock();
+ mutex_lock(&cpuset_mutex);
+ if (!is_cpuset_online(cs))
+@@ -3163,8 +3140,6 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
+ out_unlock:
+ mutex_unlock(&cpuset_mutex);
+ cpus_read_unlock();
+- kernfs_unbreak_active_protection(of->kn);
+- css_put(&cs->css);
+ flush_workqueue(cpuset_migrate_mm_wq);
+ return retval ?: nbytes;
+ }
+--
+2.39.5
+
diff --git a/queue-6.12/sched_ext-idle-refresh-idle-masks-during-idle-to-idl.patch b/queue-6.12/sched_ext-idle-refresh-idle-masks-during-idle-to-idl.patch
new file mode 100644
index 00000000000..20a93d181fe
--- /dev/null
+++ b/queue-6.12/sched_ext-idle-refresh-idle-masks-during-idle-to-idl.patch
@@ -0,0 +1,202 @@
+From 84f5aa317f74c6c22d042df68b069aa3d6611bcb Mon Sep 17 00:00:00 2001
+From: Sasha Levin
+Date: Fri, 10 Jan 2025 23:16:31 +0100
+Subject: sched_ext: idle: Refresh idle masks during idle-to-idle transitions
+
+From: Andrea Righi
+
+[ Upstream commit a2a3374c47c428c0edb0bbc693638d4783f81e31 ]
+
+With the consolidation of put_prev_task/set_next_task(), see
+commit 436f3eed5c69 ("sched: Combine the last put_prev_task() and the
+first set_next_task()"), we are now skipping the transition between
+these two functions when the previous and the next tasks are the same.
+
+As a result, the scx idle state of a CPU is updated only when
+transitioning to or from the idle thread. While this is generally
+correct, it can lead to uneven and inefficient core utilization in
+certain scenarios [1].
+
+A typical scenario involves proactive wake-ups: scx_bpf_pick_idle_cpu()
+selects and marks an idle CPU as busy, followed by a wake-up via
+scx_bpf_kick_cpu(), without dispatching any tasks. In this case, the CPU
+continues running the idle thread, returns to idle, but remains marked
+as busy, preventing it from being selected again as an idle CPU (until a
+task eventually runs on it and releases the CPU).
+
+For example, running a workload that uses 20% of each CPU, combined with
+an scx scheduler using proactive wake-ups, results in the following core
+utilization:
+
+ CPU 0: 25.7%
+ CPU 1: 29.3%
+ CPU 2: 26.5%
+ CPU 3: 25.5%
+ CPU 4: 0.0%
+ CPU 5: 25.5%
+ CPU 6: 0.0%
+ CPU 7: 10.5%
+
+To address this, refresh the idle state also in pick_task_idle(), during
+idle-to-idle transitions, but only trigger ops.update_idle() on actual
+state changes to prevent unnecessary updates to the scx scheduler and
+maintain balanced state transitions.
+
+With this change in place, the core utilization in the previous example
+becomes the following:
+
+ CPU 0: 18.8%
+ CPU 1: 19.4%
+ CPU 2: 18.0%
+ CPU 3: 18.7%
+ CPU 4: 19.3%
+ CPU 5: 18.9%
+ CPU 6: 18.7%
+ CPU 7: 19.3%
+
+[1] https://github.com/sched-ext/scx/pull/1139
+
+Fixes: 7c65ae81ea86 ("sched_ext: Don't call put_prev_task_scx() before picking the next task")
+Signed-off-by: Andrea Righi
+Signed-off-by: Tejun Heo
+Signed-off-by: Sasha Levin
+---
+ kernel/sched/ext.c | 61 ++++++++++++++++++++++++++++++++++++++-------
+ kernel/sched/ext.h | 8 +++---
+ kernel/sched/idle.c | 5 ++--
+ 3 files changed, 59 insertions(+), 15 deletions(-)
+
+diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
+index f3ca1a88375c..f928a67a07d2 100644
+--- a/kernel/sched/ext.c
++++ b/kernel/sched/ext.c
+@@ -3240,16 +3240,8 @@ static void reset_idle_masks(void)
+ cpumask_copy(idle_masks.smt, cpu_online_mask);
+ }
+
+-void __scx_update_idle(struct rq *rq, bool idle)
++static void update_builtin_idle(int cpu, bool idle)
+ {
+- int cpu = cpu_of(rq);
+-
+- if (SCX_HAS_OP(update_idle) && !scx_rq_bypassing(rq)) {
+- SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle);
+- if (!static_branch_unlikely(&scx_builtin_idle_enabled))
+- return;
+- }
+-
+ if (idle)
+ cpumask_set_cpu(cpu, idle_masks.cpu);
+ else
+@@ -3276,6 +3268,57 @@ void __scx_update_idle(struct rq *rq, bool idle)
+ #endif
+ }
+
++/*
++ * Update the idle state of a CPU to @idle.
++ *
++ * If @do_notify is true, ops.update_idle() is invoked to notify the scx
++ * scheduler of an actual idle state transition (idle to busy or vice
++ * versa). If @do_notify is false, only the idle state in the idle masks is
++ * refreshed without invoking ops.update_idle().
++ *
++ * This distinction is necessary, because an idle CPU can be "reserved" and
++ * awakened via scx_bpf_pick_idle_cpu() + scx_bpf_kick_cpu(), marking it as
++ * busy even if no tasks are dispatched. In this case, the CPU may return
++ * to idle without a true state transition. Refreshing the idle masks
++ * without invoking ops.update_idle() ensures accurate idle state tracking
++ * while avoiding unnecessary updates and maintaining balanced state
++ * transitions.
++ */
++void __scx_update_idle(struct rq *rq, bool idle, bool do_notify)
++{
++ int cpu = cpu_of(rq);
++
++ lockdep_assert_rq_held(rq);
++
++ /*
++ * Trigger ops.update_idle() only when transitioning from a task to
++ * the idle thread and vice versa.
++ *
++ * Idle transitions are indicated by do_notify being set to true,
++ * managed by put_prev_task_idle()/set_next_task_idle().
++ */
++ if (SCX_HAS_OP(update_idle) && do_notify && !scx_rq_bypassing(rq))
++ SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle);
++
++ /*
++ * Update the idle masks:
++ * - for real idle transitions (do_notify == true)
++ * - for idle-to-idle transitions (indicated by the previous task
++ *   being the idle thread, managed by pick_task_idle())
++ *
++ * Skip updating idle masks if the previous task is not the idle
++ * thread, since set_next_task_idle() has already handled it when
++ * transitioning from a task to the idle thread (calling this
++ * function with do_notify == true).
++ *
++ * In this way we can avoid updating the idle masks twice,
++ * unnecessarily.
++ */
++ if (static_branch_likely(&scx_builtin_idle_enabled))
++ if (do_notify || is_idle_task(rq->curr))
++ update_builtin_idle(cpu, idle);
++}
++
+ static void handle_hotplug(struct rq *rq, bool online)
+ {
+ int cpu = cpu_of(rq);
+diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h
+index b1675bb59fc4..4d022d17ac7d 100644
+--- a/kernel/sched/ext.h
++++ b/kernel/sched/ext.h
+@@ -57,15 +57,15 @@ static inline void init_sched_ext_class(void) {}
+ #endif /* CONFIG_SCHED_CLASS_EXT */
+
+ #if defined(CONFIG_SCHED_CLASS_EXT) && defined(CONFIG_SMP)
+-void __scx_update_idle(struct rq *rq, bool idle);
++void __scx_update_idle(struct rq *rq, bool idle, bool do_notify);
+
+-static inline void scx_update_idle(struct rq *rq, bool idle)
++static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify)
+ {
+ if (scx_enabled())
+- __scx_update_idle(rq, idle);
++ __scx_update_idle(rq, idle, do_notify);
+ }
+ #else
+-static inline void scx_update_idle(struct rq *rq, bool idle) {}
++static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify) {}
+ #endif
+
+ #ifdef CONFIG_CGROUP_SCHED
+diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
+index d2f096bb274c..53bb9193c537 100644
+--- a/kernel/sched/idle.c
++++ b/kernel/sched/idle.c
+@@ -453,19 +453,20 @@ static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags)
+ static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct task_struct *next)
+ {
+ dl_server_update_idle_time(rq, prev);
+- scx_update_idle(rq, false);
++ scx_update_idle(rq, false, true);
+ }
+
+ static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first)
+ {
+ update_idle_core(rq);
+- scx_update_idle(rq, true);
++ scx_update_idle(rq, true, true);
+ schedstat_inc(rq->sched_goidle);
+ next->se.exec_start = rq_clock_task(rq);
+ }
+
+ struct task_struct *pick_task_idle(struct rq *rq)
+ {
++ scx_update_idle(rq, true, false);
+ return rq->idle;
+ }
+
+--
+2.39.5
+
diff --git a/queue-6.12/sched_ext-replace-rq_lock-to-raw_spin_rq_lock-in-scx.patch b/queue-6.12/sched_ext-replace-rq_lock-to-raw_spin_rq_lock-in-scx.patch
new file mode 100644
index 00000000000..f60599a63b7
--- /dev/null
+++ b/queue-6.12/sched_ext-replace-rq_lock-to-raw_spin_rq_lock-in-scx.patch
@@ -0,0 +1,77 @@
+From cd27aa29a0476bae712bb54202cc3e91f1d1ef8b Mon Sep 17 00:00:00 2001
+From: Sasha Levin
+Date: Thu, 9 Jan 2025 00:08:06 +0900
+Subject: sched_ext: Replace rq_lock() to raw_spin_rq_lock() in
+ scx_ops_bypass()
+
+From: Changwoo Min
+
+[ Upstream commit 6268d5bc10354fc2ab8d44a0cd3b042d49a0417e ]
+
+scx_ops_bypass() iterates all CPUs to re-enqueue all the scx tasks.
+For each CPU, it acquires a lock using rq_lock() regardless of whether
+a CPU is offline or the CPU is currently running a task in a higher
+scheduler class (e.g., deadline). The rq_lock() is supposed to be used
+for online CPUs, and the use of rq_lock() may trigger an unnecessary
+warning in rq_pin_lock(). Therefore, replace rq_lock() to
+raw_spin_rq_lock() in scx_ops_bypass().
+
+Without this change, we observe the following warning:
+
+===== START =====
+[ 6.615205] rq->balance_callback && rq->balance_callback != &balance_push_callback
+[ 6.615208] WARNING: CPU: 2 PID: 0 at kernel/sched/sched.h:1730 __schedule+0x1130/0x1c90
+===== END =====
+
+Fixes: 0e7ffff1b811 ("scx: Fix raciness in scx_ops_bypass()")
+Signed-off-by: Changwoo Min
+Acked-by: Andrea Righi
+Signed-off-by: Tejun Heo
+Signed-off-by: Sasha Levin
+---
+ kernel/sched/ext.c | 12 ++++++------
+ 1 file changed, 6 insertions(+), 6 deletions(-)
+
+diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
+index 40f915f893e2..81235942555a 100644
+--- a/kernel/sched/ext.c
++++ b/kernel/sched/ext.c
+@@ -4348,10 +4348,9 @@ static void scx_ops_bypass(bool bypass)
+ */
+ for_each_possible_cpu(cpu) {
+ struct rq *rq = cpu_rq(cpu);
+- struct rq_flags rf;
+ struct task_struct *p, *n;
+
+- rq_lock(rq, &rf);
++ raw_spin_rq_lock(rq);
+
+ if (bypass) {
+ WARN_ON_ONCE(rq->scx.flags & SCX_RQ_BYPASSING);
+@@ -4367,7 +4366,7 @@ static void scx_ops_bypass(bool bypass)
+ * sees scx_rq_bypassing() before moving tasks to SCX.
+ */
+ if (!scx_enabled()) {
+- rq_unlock(rq, &rf);
++ raw_spin_rq_unlock(rq);
+ continue;
+ }
+
+@@ -4387,10 +4386,11 @@ static void scx_ops_bypass(bool bypass)
+ sched_enq_and_set_task(&ctx);
+ }
+
+- rq_unlock(rq, &rf);
+-
+ /* resched to restore ticks and idle state */
+- resched_cpu(cpu);
++ if (cpu_online(cpu) || cpu == smp_processor_id())
++ resched_curr(rq);
++
++ raw_spin_rq_unlock(rq);
+ }
+ unlock:
+ raw_spin_unlock_irqrestore(&__scx_ops_bypass_lock, flags);
+--
+2.39.5
+
diff --git a/queue-6.12/sched_ext-switch-class-when-preempted-by-higher-prio.patch b/queue-6.12/sched_ext-switch-class-when-preempted-by-higher-prio.patch
new file mode 100644
index 00000000000..8c38e594306
--- /dev/null
+++ b/queue-6.12/sched_ext-switch-class-when-preempted-by-higher-prio.patch
@@ -0,0 +1,47 @@
+From 58e0563c10d0a205b16b906447f6a0b9146ebf08 Mon Sep 17 00:00:00 2001
+From: Sasha Levin
+Date: Wed, 8 Jan 2025 10:33:28 +0800
+Subject: sched_ext: switch class when preempted by higher priority scheduler
+
+From: Honglei Wang
+
+[ Upstream commit 68e449d849fd50bd5e61d8bd32b3458dbd3a3df6 ]
+
+ops.cpu_release() function, if defined, must be invoked when preempted by
+a higher priority scheduler class task. This scenario was skipped in
+commit f422316d7466 ("sched_ext: Remove switch_class_scx()"). Let's fix
+it.
+
+Fixes: f422316d7466 ("sched_ext: Remove switch_class_scx()")
+Signed-off-by: Honglei Wang
+Acked-by: Andrea Righi
+Signed-off-by: Tejun Heo
+Signed-off-by: Sasha Levin
+---
+ kernel/sched/ext.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
+index 81235942555a..f3ca1a88375c 100644
+--- a/kernel/sched/ext.c
++++ b/kernel/sched/ext.c
+@@ -2917,7 +2917,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
+ */
+ if (p->scx.slice && !scx_rq_bypassing(rq)) {
+ dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD);
+- return;
++ goto switch_class;
+ }
+
+ /*
+@@ -2934,6 +2934,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
+ }
+ }
+
++switch_class:
+ if (next && next->sched_class != &ext_sched_class)
+ switch_class(rq, next);
+ }
+--
+2.39.5
+
diff --git a/queue-6.12/series b/queue-6.12/series
index c3e2712d796..a7e4b70921a 100644
--- a/queue-6.12/series
+++ b/queue-6.12/series
@@ -103,3 +103,9 @@ btrfs-zlib-fix-avail_in-bytes-for-s390-zlib-hw-compression-path.patch
 revert-drm-mediatek-dsi-correct-calculation-formula-of-phy-timing.patch
 drm-amd-display-remove-unnecessary-amdgpu_irq_get-put.patch
 drm-amd-display-add-check-for-granularity-in-dml-ceil-floor-helpers.patch
+cgroup-cpuset-prevent-leakage-of-isolated-cpus-into-.patch
+thermal-of-fix-of-node-leak-in-of_thermal_zone_find.patch
+sched_ext-replace-rq_lock-to-raw_spin_rq_lock-in-scx.patch
+sched_ext-switch-class-when-preempted-by-higher-prio.patch
+cgroup-cpuset-remove-kernfs-active-break.patch
+sched_ext-idle-refresh-idle-masks-during-idle-to-idl.patch
diff --git a/queue-6.12/thermal-of-fix-of-node-leak-in-of_thermal_zone_find.patch b/queue-6.12/thermal-of-fix-of-node-leak-in-of_thermal_zone_find.patch
new file mode 100644
index 00000000000..c01182761d4
--- /dev/null
+++ b/queue-6.12/thermal-of-fix-of-node-leak-in-of_thermal_zone_find.patch
@@ -0,0 +1,39 @@
+From 68f73315316c6db979ed157934afde15ef581a68 Mon Sep 17 00:00:00 2001
+From: Sasha Levin
+Date: Tue, 24 Dec 2024 12:18:09 +0900
+Subject: thermal: of: fix OF node leak in of_thermal_zone_find()
+
+From: Joe Hattori
+
+[ Upstream commit 9164e0912af206a72ddac4915f7784e470a04ace ]
+
+of_thermal_zone_find() calls of_parse_phandle_with_args(), but does not
+release the OF node reference obtained by it.
+
+Add a of_node_put() call when the call is successful.
+
+Fixes: 3fd6d6e2b4e8 ("thermal/of: Rework the thermal device tree initialization")
+Signed-off-by: Joe Hattori
+Link: https://patch.msgid.link/20241224031809.950461-1-joe@pf.is.s.u-tokyo.ac.jp
+[ rjw: Changelog edit ]
+Signed-off-by: Rafael J. Wysocki
+Signed-off-by: Sasha Levin
+---
+ drivers/thermal/thermal_of.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/drivers/thermal/thermal_of.c b/drivers/thermal/thermal_of.c
+index 07e09897165f..5d3d8ce672cd 100644
+--- a/drivers/thermal/thermal_of.c
++++ b/drivers/thermal/thermal_of.c
+@@ -176,6 +176,7 @@ static struct device_node *of_thermal_zone_find(struct device_node *sensor, int
+ goto out;
+ }
+
++ of_node_put(sensor_specs.np);
+ if ((sensor == sensor_specs.np) && id == (sensor_specs.args_count ?
+ sensor_specs.args[0] : 0)) {
+ pr_debug("sensor %pOFn id=%d belongs to %pOFn\n", sensor, id, child);
+--
+2.39.5
+
-- 
2.47.3