git.ipfire.org Git - thirdparty/linux.git/commitdiff
cpufreq: Pass the policy to cpufreq_driver->adjust_perf()
author K Prateek Nayak <kprateek.nayak@amd.com>
Mon, 16 Mar 2026 08:18:49 +0000 (08:18 +0000)
committer Mario Limonciello (AMD) <superm1@kernel.org>
Thu, 2 Apr 2026 16:30:24 +0000 (11:30 -0500)
cpufreq_cpu_get() can sleep on PREEMPT_RT in presence of concurrent
writer(s), however amd-pstate depends on fetching the cpudata via the
policy's driver data which necessitates grabbing the reference.

Since the schedutil governor can call "cpufreq_driver->adjust_perf()"
during sched_tick/enqueue/dequeue with rq_lock held and IRQs disabled,
fetching the policy object using the cpufreq_cpu_get() helper in the
scheduler fast-path leads to "BUG: scheduling while atomic" on
PREEMPT_RT [1].

Pass the cached cpufreq policy object in sg_policy to the adjust_perf()
callback instead of just the CPU. The CPU can be inferred using "policy->cpu".

The lifetime of cpufreq_policy object outlasts that of the governor and
the cpufreq driver (allocated when the CPU is onlined and only reclaimed
when the CPU is offlined / the CPU device is removed) which makes it
safe to be referenced throughout the governor's lifetime.

Closes: https://lore.kernel.org/all/20250731092316.3191-1-spasswolf@web.de/ [1]

Fixes: 1d215f0319c2 ("cpufreq: amd-pstate: Add fast switch function for AMD P-State")
Reported-by: Bert Karwatzki <spasswolf@web.de>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
Acked-by: Gary Guo <gary@garyguo.net> # Rust
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Reviewed-by: Zhongqiu Han <zhongqiu.han@oss.qualcomm.com>
Link: https://lore.kernel.org/r/20260316081849.19368-3-kprateek.nayak@amd.com
Signed-off-by: Mario Limonciello (AMD) <superm1@kernel.org>
drivers/cpufreq/amd-pstate.c
drivers/cpufreq/cpufreq.c
drivers/cpufreq/intel_pstate.c
include/linux/cpufreq.h
kernel/sched/cpufreq_schedutil.c
rust/kernel/cpufreq.rs

index 2ea4d27fe020c2aee5b397eedfe10f71a424d254..c825fab0bf5c13db470285e214eda3d13bb4954d 100644 (file)
@@ -788,13 +788,12 @@ static unsigned int amd_pstate_fast_switch(struct cpufreq_policy *policy,
        return policy->cur;
 }
 
-static void amd_pstate_adjust_perf(unsigned int cpu,
+static void amd_pstate_adjust_perf(struct cpufreq_policy *policy,
                                   unsigned long _min_perf,
                                   unsigned long target_perf,
                                   unsigned long capacity)
 {
        u8 max_perf, min_perf, des_perf, cap_perf;
-       struct cpufreq_policy *policy __free(put_cpufreq_policy) = cpufreq_cpu_get(cpu);
        struct amd_cpudata *cpudata;
        union perf_cached perf;
 
index 277884d91913c84616104ce49c1960a4d3496680..90e939069cde4fc87ac63225b313546cb2a98a92 100644 (file)
@@ -2231,7 +2231,7 @@ EXPORT_SYMBOL_GPL(cpufreq_driver_fast_switch);
 
 /**
  * cpufreq_driver_adjust_perf - Adjust CPU performance level in one go.
- * @cpu: Target CPU.
+ * @policy: cpufreq policy object of the target CPU.
  * @min_perf: Minimum (required) performance level (units of @capacity).
  * @target_perf: Target (desired) performance level (units of @capacity).
  * @capacity: Capacity of the target CPU.
@@ -2250,12 +2250,12 @@ EXPORT_SYMBOL_GPL(cpufreq_driver_fast_switch);
  * parallel with either ->target() or ->target_index() or ->fast_switch() for
  * the same CPU.
  */
-void cpufreq_driver_adjust_perf(unsigned int cpu,
+void cpufreq_driver_adjust_perf(struct cpufreq_policy *policy,
                                 unsigned long min_perf,
                                 unsigned long target_perf,
                                 unsigned long capacity)
 {
-       cpufreq_driver->adjust_perf(cpu, min_perf, target_perf, capacity);
+       cpufreq_driver->adjust_perf(policy, min_perf, target_perf, capacity);
 }
 
 /**
index 11c58af41900645154938618caf9bc66d8bb5a23..0f50034e4b6808acd60e86e741aa52e5600a0c82 100644 (file)
@@ -3239,12 +3239,12 @@ static unsigned int intel_cpufreq_fast_switch(struct cpufreq_policy *policy,
        return target_pstate * cpu->pstate.scaling;
 }
 
-static void intel_cpufreq_adjust_perf(unsigned int cpunum,
+static void intel_cpufreq_adjust_perf(struct cpufreq_policy *policy,
                                      unsigned long min_perf,
                                      unsigned long target_perf,
                                      unsigned long capacity)
 {
-       struct cpudata *cpu = all_cpu_data[cpunum];
+       struct cpudata *cpu = all_cpu_data[policy->cpu];
        u64 hwp_cap = READ_ONCE(cpu->hwp_cap_cached);
        int old_pstate = cpu->pstate.current_pstate;
        int cap_pstate, min_pstate, max_pstate, target_pstate;
index cc894fc389710521152f5884e543e03bd194958a..4317c5a312bd1a4f11d83625bd1251f8196e6668 100644 (file)
@@ -372,7 +372,7 @@ struct cpufreq_driver {
         * conditions) scale invariance can be disabled, which causes the
         * schedutil governor to fall back to the latter.
         */
-       void            (*adjust_perf)(unsigned int cpu,
+       void            (*adjust_perf)(struct cpufreq_policy *policy,
                                       unsigned long min_perf,
                                       unsigned long target_perf,
                                       unsigned long capacity);
@@ -617,7 +617,7 @@ struct cpufreq_governor {
 /* Pass a target to the cpufreq driver */
 unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy,
                                        unsigned int target_freq);
-void cpufreq_driver_adjust_perf(unsigned int cpu,
+void cpufreq_driver_adjust_perf(struct cpufreq_policy *policy,
                                unsigned long min_perf,
                                unsigned long target_perf,
                                unsigned long capacity);
index 153232dd8276ae645ea513643b807b8911985763..ae9fd211cec1f3efb2a1d7ce41a2f1b9efa7b7ce 100644 (file)
@@ -461,6 +461,7 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time,
                                     unsigned int flags)
 {
        struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
+       struct sugov_policy *sg_policy = sg_cpu->sg_policy;
        unsigned long prev_util = sg_cpu->util;
        unsigned long max_cap;
 
@@ -482,10 +483,10 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time,
        if (sugov_hold_freq(sg_cpu) && sg_cpu->util < prev_util)
                sg_cpu->util = prev_util;
 
-       cpufreq_driver_adjust_perf(sg_cpu->cpu, sg_cpu->bw_min,
+       cpufreq_driver_adjust_perf(sg_policy->policy, sg_cpu->bw_min,
                                   sg_cpu->util, max_cap);
 
-       sg_cpu->sg_policy->last_freq_update_time = time;
+       sg_policy->last_freq_update_time = time;
 }
 
 static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
index f5adee48d40cb83d9e326f593db88f150bb1a79c..d8d26870bea2eb81ca77a4483aa412f7a7109a0d 100644 (file)
@@ -1257,18 +1257,17 @@ impl<T: Driver> Registration<T> {
     /// # Safety
     ///
     /// - This function may only be called from the cpufreq C infrastructure.
+    /// - The pointer arguments must be valid pointers.
     unsafe extern "C" fn adjust_perf_callback(
-        cpu: c_uint,
+        ptr: *mut bindings::cpufreq_policy,
         min_perf: c_ulong,
         target_perf: c_ulong,
         capacity: c_ulong,
     ) {
-        // SAFETY: The C API guarantees that `cpu` refers to a valid CPU number.
-        let cpu_id = unsafe { CpuId::from_u32_unchecked(cpu) };
-
-        if let Ok(mut policy) = PolicyCpu::from_cpu(cpu_id) {
-            T::adjust_perf(&mut policy, min_perf, target_perf, capacity);
-        }
+        // SAFETY: The `ptr` is guaranteed to be valid by the contract with the C code for the
+        // lifetime of `policy`.
+        let policy = unsafe { Policy::from_raw_mut(ptr) };
+        T::adjust_perf(policy, min_perf, target_perf, capacity);
     }
 
     /// Driver's `get_intermediate` callback.