From: Frederic Weisbecker Date: Wed, 28 May 2025 16:05:32 +0000 (+0200) Subject: cpuset: Update HK_TYPE_DOMAIN cpumask from cpuset X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=03ff7351016983653bab5b38e58529d0e38a28cf;p=thirdparty%2Flinux.git cpuset: Update HK_TYPE_DOMAIN cpumask from cpuset Until now, HK_TYPE_DOMAIN used to only include boot defined isolated CPUs passed through isolcpus= boot option. Users interested in also knowing the runtime defined isolated CPUs through cpuset must use different APIs: cpuset_cpu_is_isolated(), cpu_is_isolated(), etc... There are many drawbacks to that approach: 1) Most interested subsystems want to know about all isolated CPUs, not just those defined on boot time. 2) cpuset_cpu_is_isolated() / cpu_is_isolated() are not synchronized with concurrent cpuset changes. 3) Further cpuset modifications are not propagated to subsystems Solve 1) and 2) and centralize all isolated CPUs within the HK_TYPE_DOMAIN housekeeping cpumask. Subsystems can rely on RCU to synchronize against concurrent changes. The propagation mentioned in 3) will be handled in further patches. [Chen Ridong: Fix cpu_hotplug_lock deadlock and use correct static branch API] Signed-off-by: Frederic Weisbecker Reviewed-by: Waiman Long Reviewed-by: Chen Ridong Signed-off-by: Chen Ridong Cc: "Michal Koutný" Cc: Ingo Molnar Cc: Johannes Weiner Cc: Marco Crivellari Cc: Michal Hocko Cc: Peter Zijlstra Cc: Tejun Heo Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Waiman Long Cc: cgroups@vger.kernel.org --- diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h index c7cf6934489cd..d8d9baf445169 100644 --- a/include/linux/sched/isolation.h +++ b/include/linux/sched/isolation.h @@ -9,6 +9,11 @@ enum hk_type { /* Inverse of boot-time isolcpus= argument */ HK_TYPE_DOMAIN_BOOT, + /* + * Same as HK_TYPE_DOMAIN_BOOT but also includes the + * inverse of cpuset isolated partitions. As such it + * is always a subset of HK_TYPE_DOMAIN_BOOT. + */ HK_TYPE_DOMAIN, /* Inverse of boot-time isolcpus=managed_irq argument */ HK_TYPE_MANAGED_IRQ, @@ -35,6 +40,7 @@ extern const struct cpumask *housekeeping_cpumask(enum hk_type type); extern bool housekeeping_enabled(enum hk_type type); extern void housekeeping_affine(struct task_struct *t, enum hk_type type); extern bool housekeeping_test_cpu(int cpu, enum hk_type type); +extern int housekeeping_update(struct cpumask *isol_mask); extern void __init housekeeping_init(void); #else @@ -62,6 +68,7 @@ static inline bool housekeeping_test_cpu(int cpu, enum hk_type type) return true; } +static inline int housekeeping_update(struct cpumask *isol_mask) { return 0; } static inline void housekeeping_init(void) { } #endif /* CONFIG_CPU_ISOLATION */ diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 5e2e3514c22e9..e146e1f34bf97 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1482,14 +1482,15 @@ static void update_isolation_cpumasks(void) if (!isolated_cpus_updating) return; - lockdep_assert_cpus_held(); - ret = workqueue_unbound_exclude_cpumask(isolated_cpus); WARN_ON_ONCE(ret < 0); ret = tmigr_isolated_exclude_cpumask(isolated_cpus); WARN_ON_ONCE(ret < 0); + ret = housekeeping_update(isolated_cpus); + WARN_ON_ONCE(ret < 0); + isolated_cpus_updating = false; } diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 6f77289c14c35..674a02891b380 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -29,18 +29,48 @@ static struct housekeeping housekeeping; bool housekeeping_enabled(enum hk_type type) { - return !!(housekeeping.flags & BIT(type)); + return !!(READ_ONCE(housekeeping.flags) & BIT(type)); } EXPORT_SYMBOL_GPL(housekeeping_enabled); +static bool housekeeping_dereference_check(enum hk_type type) +{ + if (IS_ENABLED(CONFIG_LOCKDEP) && type == HK_TYPE_DOMAIN) { + /* Cpuset isn't even writable yet? */ + if (system_state <= SYSTEM_SCHEDULING) + return true; + + /* CPU hotplug write locked, so cpuset partition can't be overwritten */ + if (IS_ENABLED(CONFIG_HOTPLUG_CPU) && lockdep_is_cpus_write_held()) + return true; + + /* Cpuset lock held, partitions not writable */ + if (IS_ENABLED(CONFIG_CPUSETS) && lockdep_is_cpuset_held()) + return true; + + return false; + } + + return true; +} + +static inline struct cpumask *housekeeping_cpumask_dereference(enum hk_type type) +{ + return rcu_dereference_all_check(housekeeping.cpumasks[type], + housekeeping_dereference_check(type)); +} + const struct cpumask *housekeeping_cpumask(enum hk_type type) { + const struct cpumask *mask = NULL; + if (static_branch_unlikely(&housekeeping_overridden)) { - if (housekeeping.flags & BIT(type)) { - return rcu_dereference_check(housekeeping.cpumasks[type], 1); - } + if (READ_ONCE(housekeeping.flags) & BIT(type)) + mask = housekeeping_cpumask_dereference(type); } - return cpu_possible_mask; + if (!mask) + mask = cpu_possible_mask; + return mask; } EXPORT_SYMBOL_GPL(housekeeping_cpumask); @@ -80,12 +110,45 @@ EXPORT_SYMBOL_GPL(housekeeping_affine); bool housekeeping_test_cpu(int cpu, enum hk_type type) { - if (static_branch_unlikely(&housekeeping_overridden) && housekeeping.flags & BIT(type)) + if (static_branch_unlikely(&housekeeping_overridden) && + READ_ONCE(housekeeping.flags) & BIT(type)) return cpumask_test_cpu(cpu, housekeeping_cpumask(type)); return true; } EXPORT_SYMBOL_GPL(housekeeping_test_cpu); +int housekeeping_update(struct cpumask *isol_mask) +{ + struct cpumask *trial, *old = NULL; + + lockdep_assert_cpus_held(); + + trial = kmalloc(cpumask_size(), GFP_KERNEL); + if (!trial) + return -ENOMEM; + + cpumask_andnot(trial, housekeeping_cpumask(HK_TYPE_DOMAIN_BOOT), isol_mask); + if (!cpumask_intersects(trial, cpu_online_mask)) { + kfree(trial); + return -EINVAL; + } + + if (!housekeeping.flags) + static_branch_enable_cpuslocked(&housekeeping_overridden); + + if (housekeeping.flags & HK_FLAG_DOMAIN) + old = housekeeping_cpumask_dereference(HK_TYPE_DOMAIN); + else + WRITE_ONCE(housekeeping.flags, housekeeping.flags | HK_FLAG_DOMAIN); + rcu_assign_pointer(housekeeping.cpumasks[HK_TYPE_DOMAIN], trial); + + synchronize_rcu(); + + kfree(old); + + return 0; +} + void __init housekeeping_init(void) { enum hk_type type; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 475bdab3b8db2..653e898a996a4 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include