Commit | Line | Data |
---|---|---|
1da177e4 LT |
1 | /* |
2 | * kernel/cpuset.c | |
3 | * | |
4 | * Processor and Memory placement constraints for sets of tasks. | |
5 | * | |
6 | * Copyright (C) 2003 BULL SA. | |
029190c5 | 7 | * Copyright (C) 2004-2007 Silicon Graphics, Inc. |
8793d854 | 8 | * Copyright (C) 2006 Google, Inc |
1da177e4 LT |
9 | * |
10 | * Portions derived from Patrick Mochel's sysfs code. | |
11 | * sysfs is Copyright (c) 2001-3 Patrick Mochel | |
1da177e4 | 12 | * |
825a46af | 13 | * 2003-10-10 Written by Simon Derr. |
1da177e4 | 14 | * 2003-10-22 Updates by Stephen Hemminger. |
825a46af | 15 | * 2004 May-July Rework by Paul Jackson. |
8793d854 | 16 | * 2006 Rework by Paul Menage to use generic cgroups |
cf417141 MK |
17 | * 2008 Rework of the scheduler domains and CPU hotplug handling |
18 | * by Max Krasnyansky | |
1da177e4 LT |
19 | * |
20 | * This file is subject to the terms and conditions of the GNU General Public | |
21 | * License. See the file COPYING in the main directory of the Linux | |
22 | * distribution for more details. | |
23 | */ | |
24 | ||
1da177e4 LT |
25 | #include <linux/cpu.h> |
26 | #include <linux/cpumask.h> | |
27 | #include <linux/cpuset.h> | |
72c6303a | 28 | #include <linux/delay.h> |
1da177e4 LT |
29 | #include <linux/init.h> |
30 | #include <linux/interrupt.h> | |
31 | #include <linux/kernel.h> | |
68860ec1 | 32 | #include <linux/mempolicy.h> |
1da177e4 | 33 | #include <linux/mm.h> |
f481891f | 34 | #include <linux/memory.h> |
9984de1a | 35 | #include <linux/export.h> |
6b9c2603 | 36 | #include <linux/rcupdate.h> |
1da177e4 | 37 | #include <linux/sched.h> |
f9a25f77 | 38 | #include <linux/sched/deadline.h> |
6e84f315 | 39 | #include <linux/sched/mm.h> |
f719ff9b | 40 | #include <linux/sched/task.h> |
22fb52dd | 41 | #include <linux/security.h> |
1da177e4 | 42 | #include <linux/spinlock.h> |
da99ecf1 | 43 | #include <linux/oom.h> |
edb93821 | 44 | #include <linux/sched/isolation.h> |
956db3ca | 45 | #include <linux/cgroup.h> |
e44193d3 | 46 | #include <linux/wait.h> |
72c6303a | 47 | #include <linux/workqueue.h> |
1da177e4 | 48 | |
89affbf5 | 49 | DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key); |
002f2906 | 50 | DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key); |
202f72d5 | 51 | |
8ca1b5a4 FT |
52 | /* |
53 | * There could be abnormal cpuset configurations for cpu or memory | |
f9da322e | 54 | * node binding; add this key to provide a quick, low-cost judgment |
8ca1b5a4 FT |
55 | * of the situation. |
56 | */ | |
57 | DEFINE_STATIC_KEY_FALSE(cpusets_insane_config_key); | |
58 | ||
3e0d98b9 PJ |
59 | /* See "Frequency meter" comments, below. */ |
60 | ||
61 | struct fmeter { | |
62 | int cnt; /* unprocessed events count */ | |
63 | int val; /* most recent output value */ | |
d2b43658 | 64 | time64_t time; /* clock (secs) when val computed */ |
3e0d98b9 PJ |
65 | spinlock_t lock; /* guards read or write of above */ |
66 | }; | |
67 | ||
7476a636 WL |
68 | /* |
69 | * Invalid partition error code | |
70 | */ | |
71 | enum prs_errcode { | |
72 | PERR_NONE = 0, | |
73 | PERR_INVCPUS, | |
74 | PERR_INVPARENT, | |
75 | PERR_NOTPART, | |
76 | PERR_NOTEXCL, | |
77 | PERR_NOCPUS, | |
78 | PERR_HOTPLUG, | |
79 | PERR_CPUSEMPTY, | |
4a74e418 | 80 | PERR_HKEEPING, |
7476a636 WL |
81 | }; |
82 | ||
83 | static const char * const perr_strings[] = { | |
0c7f293e | 84 | [PERR_INVCPUS] = "Invalid cpu list in cpuset.cpus.exclusive", |
7476a636 WL |
85 | [PERR_INVPARENT] = "Parent is an invalid partition root", |
86 | [PERR_NOTPART] = "Parent is not a partition root", | |
87 | [PERR_NOTEXCL] = "Cpu list in cpuset.cpus not exclusive", | |
88 | [PERR_NOCPUS] = "Parent unable to distribute cpu downstream", | |
89 | [PERR_HOTPLUG] = "No cpu available due to hotplug", | |
90 | [PERR_CPUSEMPTY] = "cpuset.cpus is empty", | |
4a74e418 | 91 | [PERR_HKEEPING] = "partition config conflicts with housekeeping setup", |
7476a636 WL |
92 | }; |
93 | ||
1da177e4 | 94 | struct cpuset { |
8793d854 PM |
95 | struct cgroup_subsys_state css; |
96 | ||
1da177e4 | 97 | unsigned long flags; /* "unsigned long" so bitops work */ |
e2b9a3d7 | 98 | |
7e88291b LZ |
99 | /* |
100 | * On default hierarchy: | |
101 | * | |
102 | * The user-configured masks can only be changed by writing to | |
103 | * cpuset.cpus and cpuset.mems, and won't be limited by the | |
104 | * parent masks. | |
105 | * | |
106 | * The effective masks are the real masks that apply to the tasks | |
107 | * in the cpuset. They may be changed if the configured masks are | |
108 | * changed or hotplug happens. | |
109 | * | |
110 | * effective_mask == configured_mask & parent's effective_mask, | |
111 | * and if it ends up empty, it will inherit the parent's mask. | |
112 | * | |
113 | * | |
415de5fd | 114 | * On legacy hierarchy: |
7e88291b LZ |
115 | * |
116 | * The user-configured masks are always the same as the effective masks. | |
117 | */ | |
118 | ||
e2b9a3d7 LZ |
119 | /* user-configured CPUs and Memory Nodes allowed to tasks */ |
120 | cpumask_var_t cpus_allowed; | |
121 | nodemask_t mems_allowed; | |
122 | ||
123 | /* effective CPUs and Memory Nodes allowed to tasks */ | |
124 | cpumask_var_t effective_cpus; | |
125 | nodemask_t effective_mems; | |
1da177e4 | 126 | |
58b74842 | 127 | /* |
0c7f293e | 128 | * Exclusive CPUs dedicated to current cgroup (default hierarchy only) |
4b842da2 | 129 | * |
0c7f293e WL |
130 | * These exclusive CPUs must be a subset of cpus_allowed. A parent |
131 | * cgroup can only grant exclusive CPUs to one of its children. | |
132 | * | |
133 | * When the cgroup becomes a valid partition root, effective_xcpus | |
134 | * defaults to cpus_allowed if not set. The effective_cpus of a valid | |
135 | * partition root comes solely from its effective_xcpus and some of the | |
136 | * effective_xcpus may be distributed to sub-partitions below & hence | |
137 | * excluded from its effective_cpus. | |
58b74842 | 138 | */ |
0c7f293e | 139 | cpumask_var_t effective_xcpus; |
58b74842 | 140 | |
e2ffe502 WL |
141 | /* |
142 | * Exclusive CPUs as requested by the user (default hierarchy only) | |
143 | */ | |
144 | cpumask_var_t exclusive_cpus; | |
145 | ||
33ad801d LZ |
146 | /* |
147 | * These are the old Memory Nodes that tasks took on. | |
148 | * | |
149 | * - top_cpuset.old_mems_allowed is initialized to mems_allowed. | |
150 | * - A new cpuset's old_mems_allowed is initialized when some | |
151 | * task is moved into it. | |
152 | * - old_mems_allowed is used in cpuset_migrate_mm() when we change | |
153 | * cpuset.mems_allowed and have tasks' nodemask updated, and | |
154 | * then old_mems_allowed is updated to mems_allowed. | |
155 | */ | |
156 | nodemask_t old_mems_allowed; | |
157 | ||
3e0d98b9 | 158 | struct fmeter fmeter; /* memory_pressure filter */ |
029190c5 | 159 | |
452477fa TH |
160 | /* |
161 | * Tasks are being attached to this cpuset. Used to prevent | |
162 | * zeroing cpus/mems_allowed between ->can_attach() and ->attach(). | |
163 | */ | |
164 | int attach_in_progress; | |
165 | ||
029190c5 PJ |
166 | /* partition number for rebuild_sched_domains() */ |
167 | int pn; | |
956db3ca | 168 | |
1d3504fc HS |
169 | /* for custom sched domain */ |
170 | int relax_domain_level; | |
58b74842 | 171 | |
0c7f293e WL |
172 | /* number of valid sub-partitions */ |
173 | int nr_subparts; | |
58b74842 WL |
174 | |
175 | /* partition root state */ | |
176 | int partition_root_state; | |
4716909c WL |
177 | |
178 | /* | |
179 | * Default hierarchy only: | |
180 | * use_parent_ecpus - set if using parent's effective_cpus | |
181 | * child_ecpus_count - # of children with use_parent_ecpus set | |
182 | */ | |
183 | int use_parent_ecpus; | |
184 | int child_ecpus_count; | |
e7cc9888 | 185 | |
6c24849f JL |
186 | /* |
187 | * number of SCHED_DEADLINE tasks attached to this cpuset, so that we | |
188 | * know when to rebuild associated root domain bandwidth information. | |
189 | */ | |
190 | int nr_deadline_tasks; | |
2ef269ef DE |
191 | int nr_migrate_dl_tasks; |
192 | u64 sum_migrate_dl_bw; | |
6c24849f | 193 | |
7476a636 WL |
194 | /* Invalid partition error code, not lock protected */ |
195 | enum prs_errcode prs_err; | |
196 | ||
e7cc9888 WL |
197 | /* Handle for cpuset.cpus.partition */ |
198 | struct cgroup_file partition_file; | |
181c8e09 WL |
199 | |
200 | /* Remote partition sibling list anchored at remote_children */ |
201 | struct list_head remote_sibling; | |
58b74842 WL |
202 | }; |
203 | ||
2125c003 WL |
204 | /* |
205 | * Legacy hierarchy call to cgroup_transfer_tasks() is handled asynchronously | |
206 | */ | |
207 | struct cpuset_remove_tasks_struct { | |
208 | struct work_struct work; | |
209 | struct cpuset *cs; | |
210 | }; | |
211 | ||
0c7f293e WL |
212 | /* |
213 | * Exclusive CPUs distributed out to sub-partitions of top_cpuset | |
214 | */ | |
215 | static cpumask_var_t subpartitions_cpus; | |
216 | ||
11e5f407 WL |
217 | /* |
218 | * Exclusive CPUs in isolated partitions | |
219 | */ | |
220 | static cpumask_var_t isolated_cpus; | |
221 | ||
181c8e09 WL |
222 | /* List of remote partition root children */ |
223 | static struct list_head remote_children; | |
224 | ||
58b74842 WL |
225 | /* |
226 | * Partition root states: | |
227 | * | |
18065ebe | 228 | * 0 - member (not a partition root) |
58b74842 | 229 | * 1 - partition root |
f28e2244 | 230 | * 2 - partition root without load balancing (isolated) |
3881b861 | 231 | * -1 - invalid partition root |
f28e2244 | 232 | * -2 - invalid isolated partition root |
58b74842 | 233 | */ |
18065ebe WL |
234 | #define PRS_MEMBER 0 |
235 | #define PRS_ROOT 1 | |
f28e2244 | 236 | #define PRS_ISOLATED 2 |
18065ebe | 237 | #define PRS_INVALID_ROOT -1 |
f28e2244 | 238 | #define PRS_INVALID_ISOLATED -2 |
18065ebe WL |
239 | |
240 | static inline bool is_prs_invalid(int prs_state) | |
241 | { | |
242 | return prs_state < 0; | |
243 | } | |
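As a quick illustration of the sign-based encoding above (a hypothetical helper, not part of cpuset.c): invalidating a partition keeps its kind but negates its state, so a validity check is just a sign test and every state maps to a distinct label.

```c
/* Hypothetical helper for illustration only; names are not from cpuset.c. */
static const char *example_prs_name(int prs)
{
	switch (prs) {
	case PRS_MEMBER:		return "member";
	case PRS_ROOT:			return "root";
	case PRS_ISOLATED:		return "isolated";
	case PRS_INVALID_ROOT:		return "root invalid";
	case PRS_INVALID_ISOLATED:	return "isolated invalid";
	}
	return "unknown";
}
```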
58b74842 WL |
244 | |
245 | /* | |
246 | * Temporary cpumasks for working with partitions that are passed among | |
247 | * functions to avoid memory allocation in inner functions. | |
248 | */ | |
249 | struct tmpmasks { | |
250 | cpumask_var_t addmask, delmask; /* For partition root */ | |
251 | cpumask_var_t new_cpus; /* For update_cpumasks_hier() */ | |
1da177e4 LT |
252 | }; |
253 | ||
a7c6d554 | 254 | static inline struct cpuset *css_cs(struct cgroup_subsys_state *css) |
8793d854 | 255 | { |
a7c6d554 | 256 | return css ? container_of(css, struct cpuset, css) : NULL; |
8793d854 PM |
257 | } |
258 | ||
259 | /* Retrieve the cpuset for a task */ | |
260 | static inline struct cpuset *task_cs(struct task_struct *task) | |
261 | { | |
073219e9 | 262 | return css_cs(task_css(task, cpuset_cgrp_id)); |
8793d854 | 263 | } |
8793d854 | 264 | |
c9710d80 | 265 | static inline struct cpuset *parent_cs(struct cpuset *cs) |
c431069f | 266 | { |
5c9d535b | 267 | return css_cs(cs->css.parent); |
c431069f TH |
268 | } |
269 | ||
6c24849f JL |
270 | void inc_dl_tasks_cs(struct task_struct *p) |
271 | { | |
272 | struct cpuset *cs = task_cs(p); | |
273 | ||
274 | cs->nr_deadline_tasks++; | |
275 | } | |
276 | ||
277 | void dec_dl_tasks_cs(struct task_struct *p) | |
278 | { | |
279 | struct cpuset *cs = task_cs(p); | |
280 | ||
281 | cs->nr_deadline_tasks--; | |
282 | } | |
283 | ||
1da177e4 LT |
284 | /* bits in struct cpuset flags field */ |
285 | typedef enum { | |
efeb77b2 | 286 | CS_ONLINE, |
1da177e4 LT |
287 | CS_CPU_EXCLUSIVE, |
288 | CS_MEM_EXCLUSIVE, | |
78608366 | 289 | CS_MEM_HARDWALL, |
45b07ef3 | 290 | CS_MEMORY_MIGRATE, |
029190c5 | 291 | CS_SCHED_LOAD_BALANCE, |
825a46af PJ |
292 | CS_SPREAD_PAGE, |
293 | CS_SPREAD_SLAB, | |
1da177e4 LT |
294 | } cpuset_flagbits_t; |
295 | ||
296 | /* convenient tests for these bits */ | |
41c25707 | 297 | static inline bool is_cpuset_online(struct cpuset *cs) |
efeb77b2 | 298 | { |
41c25707 | 299 | return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css); |
efeb77b2 TH |
300 | } |
301 | ||
1da177e4 LT |
302 | static inline int is_cpu_exclusive(const struct cpuset *cs) |
303 | { | |
7b5b9ef0 | 304 | return test_bit(CS_CPU_EXCLUSIVE, &cs->flags); |
1da177e4 LT |
305 | } |
306 | ||
307 | static inline int is_mem_exclusive(const struct cpuset *cs) | |
308 | { | |
7b5b9ef0 | 309 | return test_bit(CS_MEM_EXCLUSIVE, &cs->flags); |
1da177e4 LT |
310 | } |
311 | ||
78608366 PM |
312 | static inline int is_mem_hardwall(const struct cpuset *cs) |
313 | { | |
314 | return test_bit(CS_MEM_HARDWALL, &cs->flags); | |
315 | } | |
316 | ||
029190c5 PJ |
317 | static inline int is_sched_load_balance(const struct cpuset *cs) |
318 | { | |
319 | return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); | |
320 | } | |
321 | ||
45b07ef3 PJ |
322 | static inline int is_memory_migrate(const struct cpuset *cs) |
323 | { | |
7b5b9ef0 | 324 | return test_bit(CS_MEMORY_MIGRATE, &cs->flags); |
45b07ef3 PJ |
325 | } |
326 | ||
825a46af PJ |
327 | static inline int is_spread_page(const struct cpuset *cs) |
328 | { | |
329 | return test_bit(CS_SPREAD_PAGE, &cs->flags); | |
330 | } | |
331 | ||
332 | static inline int is_spread_slab(const struct cpuset *cs) | |
333 | { | |
334 | return test_bit(CS_SPREAD_SLAB, &cs->flags); | |
335 | } | |
336 | ||
18065ebe | 337 | static inline int is_partition_valid(const struct cpuset *cs) |
58b74842 | 338 | { |
3881b861 | 339 | return cs->partition_root_state > 0; |
58b74842 WL |
340 | } |
341 | ||
18065ebe WL |
342 | static inline int is_partition_invalid(const struct cpuset *cs) |
343 | { | |
344 | return cs->partition_root_state < 0; | |
345 | } | |
346 | ||
347 | /* | |
348 | * Callers should hold callback_lock to modify partition_root_state. | |
349 | */ | |
350 | static inline void make_partition_invalid(struct cpuset *cs) | |
351 | { | |
0c7f293e | 352 | if (cs->partition_root_state > 0) |
f28e2244 | 353 | cs->partition_root_state = -cs->partition_root_state; |
18065ebe WL |
354 | } |
355 | ||
e7cc9888 WL |
356 | /* |
357 | * Send a notification event whenever partition_root_state changes. |
358 | */ | |
18065ebe | 359 | static inline void notify_partition_change(struct cpuset *cs, int old_prs) |
e7cc9888 | 360 | { |
18065ebe WL |
361 | if (old_prs == cs->partition_root_state) |
362 | return; | |
363 | cgroup_file_notify(&cs->partition_file); | |
7476a636 WL |
364 | |
365 | /* Reset prs_err if not invalid */ | |
366 | if (is_partition_valid(cs)) | |
367 | WRITE_ONCE(cs->prs_err, PERR_NONE); | |
e7cc9888 WL |
368 | } |
369 | ||
1da177e4 | 370 | static struct cpuset top_cpuset = { |
04d63da4 WL |
371 | .flags = BIT(CS_ONLINE) | BIT(CS_CPU_EXCLUSIVE) | |
372 | BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE), | |
18065ebe | 373 | .partition_root_state = PRS_ROOT, |
8996f93f | 374 | .relax_domain_level = -1, |
181c8e09 | 375 | .remote_sibling = LIST_HEAD_INIT(top_cpuset.remote_sibling), |
1da177e4 LT |
376 | }; |
377 | ||
ae8086ce TH |
378 | /** |
379 | * cpuset_for_each_child - traverse online children of a cpuset | |
380 | * @child_cs: loop cursor pointing to the current child | |
492eb21b | 381 | * @pos_css: used for iteration |
ae8086ce TH |
382 | * @parent_cs: target cpuset to walk children of |
383 | * | |
384 | * Walk @child_cs through the online children of @parent_cs. Must be used | |
385 | * with RCU read locked. | |
386 | */ | |
492eb21b TH |
387 | #define cpuset_for_each_child(child_cs, pos_css, parent_cs) \ |
388 | css_for_each_child((pos_css), &(parent_cs)->css) \ | |
389 | if (is_cpuset_online(((child_cs) = css_cs((pos_css))))) | |
ae8086ce | 390 | |
fc560a26 TH |
391 | /** |
392 | * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants | |
393 | * @des_cs: loop cursor pointing to the current descendant | |
492eb21b | 394 | * @pos_css: used for iteration |
fc560a26 TH |
395 | * @root_cs: target cpuset to walk descendants of |
396 | * | |
397 | * Walk @des_cs through the online descendants of @root_cs. Must be used | |
492eb21b | 398 | * with RCU read locked. The caller may modify @pos_css by calling |
bd8815a6 TH |
399 | * css_rightmost_descendant() to skip subtree. @root_cs is included in the |
400 | * iteration and is the first node to be visited. |
fc560a26 | 401 | */ |
492eb21b TH |
402 | #define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs) \ |
403 | css_for_each_descendant_pre((pos_css), &(root_cs)->css) \ | |
404 | if (is_cpuset_online(((des_cs) = css_cs((pos_css))))) | |
fc560a26 | 405 | |
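A minimal usage sketch of these iterators (a hypothetical walker, not part of cpuset.c): both macros must run under the RCU read lock, and css_rightmost_descendant() prunes a subtree, much like update_domain_attr_tree() further below.

```c
static int example_count_balanced(struct cpuset *root_cs)
{
	struct cgroup_subsys_state *pos_css;
	struct cpuset *cp;
	int n = 0;

	rcu_read_lock();		/* required by the iteration macros */
	cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
		if (cpumask_empty(cp->cpus_allowed)) {
			/* nothing of interest below @cp - skip its subtree */
			pos_css = css_rightmost_descendant(pos_css);
			continue;
		}
		if (is_sched_load_balance(cp))
			n++;
	}
	rcu_read_unlock();
	return n;
}
```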
1da177e4 | 406 | /* |
111cd11b | 407 | * There are two global locks guarding cpuset structures - cpuset_mutex and |
8447a0fe VD |
408 | * callback_lock. We also require taking task_lock() when dereferencing a |
409 | * task's cpuset pointer. See "The task_lock() exception", at the end of this | |
111cd11b JL |
410 | * comment. The cpuset code uses only cpuset_mutex. Other kernel subsystems |
411 | * can use cpuset_lock()/cpuset_unlock() to prevent change to cpuset | |
412 | * structures. Note that cpuset_mutex needs to be a mutex as it is used in | |
413 | * paths that rely on priority inheritance (e.g. scheduler - on RT) for | |
414 | * correctness. | |
5d21cc2d | 415 | * |
8447a0fe | 416 | * A task must hold both locks to modify cpusets. If a task holds |
111cd11b JL |
417 | * cpuset_mutex, it blocks others, ensuring that it is the only task able to |
418 | * also acquire callback_lock and be able to modify cpusets. It can perform | |
419 | * various checks on the cpuset structure first, knowing nothing will change. | |
420 | * It can also allocate memory while just holding cpuset_mutex. While it is | |
421 | * performing these checks, various callback routines can briefly acquire | |
422 | * callback_lock to query cpusets. Once it is ready to make the changes, it | |
423 | * takes callback_lock, blocking everyone else. | |
053199ed PJ |
424 | * |
425 | * Calls to the kernel memory allocator can not be made while holding | |
8447a0fe | 426 | * callback_lock, as that would risk double tripping on callback_lock |
053199ed PJ |
427 | * from one of the callbacks into the cpuset code from within |
428 | * __alloc_pages(). | |
429 | * | |
8447a0fe | 430 | * If a task is only holding callback_lock, then it has read-only |
053199ed PJ |
431 | * access to cpusets. |
432 | * | |
58568d2a MX |
433 | * Now, the task_struct fields mems_allowed and mempolicy may be changed |
434 | * by another task; we use alloc_lock in the task_struct to protect |
435 | * them. | |
053199ed | 436 | * |
8447a0fe | 437 | * The cpuset_common_file_read() handlers only hold callback_lock across |
053199ed PJ |
438 | * small pieces of code, such as when reading out possibly multi-word |
439 | * cpumasks and nodemasks. | |
440 | * | |
2df167a3 PM |
441 | * Accessing a task's cpuset should be done in accordance with the |
442 | * guidelines for accessing subsystem state in kernel/cgroup.c | |
1da177e4 LT |
443 | */ |
444 | ||
111cd11b | 445 | static DEFINE_MUTEX(cpuset_mutex); |
710da3c8 | 446 | |
111cd11b | 447 | void cpuset_lock(void) |
710da3c8 | 448 | { |
111cd11b | 449 | mutex_lock(&cpuset_mutex); |
710da3c8 JL |
450 | } |
451 | ||
111cd11b | 452 | void cpuset_unlock(void) |
710da3c8 | 453 | { |
111cd11b | 454 | mutex_unlock(&cpuset_mutex); |
710da3c8 JL |
455 | } |
456 | ||
8447a0fe | 457 | static DEFINE_SPINLOCK(callback_lock); |
4247bdc6 | 458 | |
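A minimal sketch (hypothetical, not part of cpuset.c) of the write-side pattern described in the locking comment above: checks and allocations happen while holding only cpuset_mutex, and callback_lock is taken just for the brief window in which the change is published to readers.

```c
static void example_publish_mems(struct cpuset *cs, const nodemask_t *new_mems)
{
	mutex_lock(&cpuset_mutex);	/* serialize against other writers */

	/* ... validation and any memory allocation happen here ... */

	spin_lock_irq(&callback_lock);	/* readers only take this lock */
	cs->mems_allowed = *new_mems;
	spin_unlock_irq(&callback_lock);

	mutex_unlock(&cpuset_mutex);
}
```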
e93ad19d TH |
459 | static struct workqueue_struct *cpuset_migrate_mm_wq; |
460 | ||
e44193d3 LZ |
461 | static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq); |
462 | ||
8ca1b5a4 FT |
463 | static inline void check_insane_mems_config(nodemask_t *nodes) |
464 | { | |
465 | if (!cpusets_insane_config() && | |
466 | movable_only_nodes(nodes)) { | |
467 | static_branch_enable(&cpusets_insane_config_key); | |
468 | pr_info("Unsupported (movable nodes only) cpuset configuration detected (nmask=%*pbl)!\n" | |
469 | "Cpuset allocations might fail even with a lot of memory available.\n", | |
470 | nodemask_pr_args(nodes)); | |
471 | } | |
472 | } | |
473 | ||
b8d1b8ee | 474 | /* |
0c05b9bd WL |
475 | * Cgroup v2 behavior is used on the "cpus" and "mems" control files when |
476 | * on default hierarchy or when the cpuset_v2_mode flag is set by mounting | |
477 | * the v1 cpuset cgroup filesystem with the "cpuset_v2_mode" mount option. | |
478 | * With v2 behavior, "cpus" and "mems" are always what the users have | |
479 | * requested and won't be changed by hotplug events. Only the effective | |
480 | * cpus or mems will be affected. | |
b8d1b8ee WL |
481 | */ |
482 | static inline bool is_in_v2_mode(void) | |
483 | { | |
484 | return cgroup_subsys_on_dfl(cpuset_cgrp_subsys) || | |
485 | (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE); | |
486 | } | |
487 | ||
e2d59900 WL |
488 | /** |
489 | * partition_is_populated - check if partition has tasks | |
490 | * @cs: partition root to be checked | |
491 | * @excluded_child: a child cpuset to be excluded in task checking | |
492 | * Return: true if there are tasks, false otherwise | |
493 | * | |
494 | * It is assumed that @cs is a valid partition root. @excluded_child should | |
495 | * be non-NULL when this cpuset is going to become a partition itself. | |
496 | */ | |
497 | static inline bool partition_is_populated(struct cpuset *cs, | |
498 | struct cpuset *excluded_child) | |
499 | { | |
500 | struct cgroup_subsys_state *css; | |
501 | struct cpuset *child; | |
502 | ||
503 | if (cs->css.cgroup->nr_populated_csets) | |
504 | return true; | |
0c7f293e | 505 | if (!excluded_child && !cs->nr_subparts) |
e2d59900 WL |
506 | return cgroup_is_populated(cs->css.cgroup); |
507 | ||
508 | rcu_read_lock(); | |
509 | cpuset_for_each_child(child, css, cs) { | |
510 | if (child == excluded_child) | |
511 | continue; | |
512 | if (is_partition_valid(child)) | |
513 | continue; | |
514 | if (cgroup_is_populated(child->css.cgroup)) { | |
515 | rcu_read_unlock(); | |
516 | return true; | |
517 | } | |
518 | } | |
519 | rcu_read_unlock(); | |
520 | return false; | |
521 | } | |
522 | ||
1da177e4 | 523 | /* |
431c69fa WD |
524 | * Return in pmask the portion of a task's cpuset's cpus_allowed that |
525 | * are online and are capable of running the task. If none are found, | |
526 | * walk up the cpuset hierarchy until we find one that does have some | |
527 | * appropriate cpus. | |
1da177e4 LT |
528 | * |
529 | * One way or another, we guarantee to return some non-empty subset | |
5f054e31 | 530 | * of cpu_online_mask. |
1da177e4 | 531 | * |
111cd11b | 532 | * Call with callback_lock or cpuset_mutex held. |
1da177e4 | 533 | */ |
431c69fa WD |
534 | static void guarantee_online_cpus(struct task_struct *tsk, |
535 | struct cpumask *pmask) | |
1da177e4 | 536 | { |
431c69fa WD |
537 | const struct cpumask *possible_mask = task_cpu_possible_mask(tsk); |
538 | struct cpuset *cs; | |
539 | ||
540 | if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_online_mask))) | |
541 | cpumask_copy(pmask, cpu_online_mask); | |
542 | ||
543 | rcu_read_lock(); | |
544 | cs = task_cs(tsk); | |
545 | ||
2125c003 | 546 | while (!cpumask_intersects(cs->effective_cpus, pmask)) |
c431069f | 547 | cs = parent_cs(cs); |
431c69fa | 548 | |
2125c003 | 549 | cpumask_and(pmask, pmask, cs->effective_cpus); |
431c69fa | 550 | rcu_read_unlock(); |
1da177e4 LT |
551 | } |
552 | ||
553 | /* | |
554 | * Return in *pmask the portion of a cpuset's mems_allowed that |
0e1e7c7a CL |
555 | * are online, with memory. If none are online with memory, walk |
556 | * up the cpuset hierarchy until we find one that does have some | |
40df2deb | 557 | * online mems. The top cpuset always has some mems online. |
1da177e4 LT |
558 | * |
559 | * One way or another, we guarantee to return some non-empty subset | |
38d7bee9 | 560 | * of node_states[N_MEMORY]. |
1da177e4 | 561 | * |
111cd11b | 562 | * Call with callback_lock or cpuset_mutex held. |
1da177e4 | 563 | */ |
c9710d80 | 564 | static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) |
1da177e4 | 565 | { |
ae1c8023 | 566 | while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY])) |
c431069f | 567 | cs = parent_cs(cs); |
ae1c8023 | 568 | nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]); |
1da177e4 LT |
569 | } |
570 | ||
f3b39d47 MX |
571 | /* |
572 | * update task's spread flag if cpuset's page/slab spread flag is set | |
573 | * | |
111cd11b | 574 | * Call with callback_lock or cpuset_mutex held. The check can be skipped |
18f9a4d4 | 575 | * if on default hierarchy. |
f3b39d47 | 576 | */ |
18f9a4d4 | 577 | static void cpuset_update_task_spread_flags(struct cpuset *cs, |
f3b39d47 MX |
578 | struct task_struct *tsk) |
579 | { | |
18f9a4d4 WL |
580 | if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) |
581 | return; | |
582 | ||
f3b39d47 | 583 | if (is_spread_page(cs)) |
2ad654bc | 584 | task_set_spread_page(tsk); |
f3b39d47 | 585 | else |
2ad654bc ZL |
586 | task_clear_spread_page(tsk); |
587 | ||
f3b39d47 | 588 | if (is_spread_slab(cs)) |
2ad654bc | 589 | task_set_spread_slab(tsk); |
f3b39d47 | 590 | else |
2ad654bc | 591 | task_clear_spread_slab(tsk); |
f3b39d47 MX |
592 | } |
593 | ||
1da177e4 LT |
594 | /* |
595 | * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q? | |
596 | * | |
597 | * One cpuset is a subset of another if all its allowed CPUs and | |
598 | * Memory Nodes are a subset of the other, and its exclusive flags | |
111cd11b | 599 | * are only set if the other's are set. Call holding cpuset_mutex. |
1da177e4 LT |
600 | */ |
601 | ||
602 | static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) | |
603 | { | |
300ed6cb | 604 | return cpumask_subset(p->cpus_allowed, q->cpus_allowed) && |
1da177e4 LT |
605 | nodes_subset(p->mems_allowed, q->mems_allowed) && |
606 | is_cpu_exclusive(p) <= is_cpu_exclusive(q) && | |
607 | is_mem_exclusive(p) <= is_mem_exclusive(q); | |
608 | } | |
609 | ||
bf92370c WL |
610 | /** |
611 | * alloc_cpumasks - allocate cpumasks for a cpuset or a tmpmasks structure |
612 | * @cs: the cpuset that has cpumasks to be allocated. |
613 | * @tmp: the tmpmasks structure pointer | |
614 | * Return: 0 if successful, -ENOMEM otherwise. | |
615 | * | |
616 | * Only one of the two input arguments should be non-NULL. | |
617 | */ | |
618 | static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) | |
619 | { | |
e2ffe502 | 620 | cpumask_var_t *pmask1, *pmask2, *pmask3, *pmask4; |
bf92370c WL |
621 | |
622 | if (cs) { | |
623 | pmask1 = &cs->cpus_allowed; | |
624 | pmask2 = &cs->effective_cpus; | |
0c7f293e | 625 | pmask3 = &cs->effective_xcpus; |
e2ffe502 | 626 | pmask4 = &cs->exclusive_cpus; |
bf92370c WL |
627 | } else { |
628 | pmask1 = &tmp->new_cpus; | |
629 | pmask2 = &tmp->addmask; | |
630 | pmask3 = &tmp->delmask; | |
e2ffe502 | 631 | pmask4 = NULL; |
bf92370c WL |
632 | } |
633 | ||
634 | if (!zalloc_cpumask_var(pmask1, GFP_KERNEL)) | |
635 | return -ENOMEM; | |
636 | ||
637 | if (!zalloc_cpumask_var(pmask2, GFP_KERNEL)) | |
638 | goto free_one; | |
639 | ||
640 | if (!zalloc_cpumask_var(pmask3, GFP_KERNEL)) | |
641 | goto free_two; | |
642 | ||
e2ffe502 WL |
643 | if (pmask4 && !zalloc_cpumask_var(pmask4, GFP_KERNEL)) |
644 | goto free_three; | |
645 | ||
646 | ||
bf92370c WL |
647 | return 0; |
648 | ||
e2ffe502 WL |
649 | free_three: |
650 | free_cpumask_var(*pmask3); | |
bf92370c WL |
651 | free_two: |
652 | free_cpumask_var(*pmask2); | |
653 | free_one: | |
654 | free_cpumask_var(*pmask1); | |
655 | return -ENOMEM; | |
656 | } | |
657 | ||
658 | /** | |
659 | * free_cpumasks - free cpumasks in a cpuset and/or a tmpmasks structure |
660 | * @cs: the cpuset that has cpumasks to be freed. |
661 | * @tmp: the tmpmasks structure pointer | |
662 | */ | |
663 | static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) | |
664 | { | |
665 | if (cs) { | |
666 | free_cpumask_var(cs->cpus_allowed); | |
667 | free_cpumask_var(cs->effective_cpus); | |
0c7f293e | 668 | free_cpumask_var(cs->effective_xcpus); |
e2ffe502 | 669 | free_cpumask_var(cs->exclusive_cpus); |
bf92370c WL |
670 | } |
671 | if (tmp) { | |
672 | free_cpumask_var(tmp->new_cpus); | |
673 | free_cpumask_var(tmp->addmask); | |
674 | free_cpumask_var(tmp->delmask); | |
675 | } | |
676 | } | |
677 | ||
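A usage sketch of the tmpmasks pattern (hypothetical, not part of cpuset.c): a top-level update path allocates the scratch masks once via alloc_cpumasks(NULL, &tmp), hands the structure to inner helpers, and frees it on the way out, so no inner function has to allocate.

```c
static int example_with_tmpmasks(struct cpuset *cs)
{
	struct tmpmasks tmp;

	if (alloc_cpumasks(NULL, &tmp))		/* new_cpus, addmask, delmask */
		return -ENOMEM;

	/* ... inner helpers are free to scribble on tmp.addmask/tmp.delmask ... */
	cpumask_copy(tmp.new_cpus, cs->effective_cpus);

	free_cpumasks(NULL, &tmp);
	return 0;
}
```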
645fcc9d LZ |
678 | /** |
679 | * alloc_trial_cpuset - allocate a trial cpuset | |
680 | * @cs: the cpuset that the trial cpuset duplicates | |
681 | */ | |
c9710d80 | 682 | static struct cpuset *alloc_trial_cpuset(struct cpuset *cs) |
645fcc9d | 683 | { |
300ed6cb LZ |
684 | struct cpuset *trial; |
685 | ||
686 | trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL); | |
687 | if (!trial) | |
688 | return NULL; | |
689 | ||
bf92370c WL |
690 | if (alloc_cpumasks(trial, NULL)) { |
691 | kfree(trial); | |
692 | return NULL; | |
693 | } | |
300ed6cb | 694 | |
e2b9a3d7 LZ |
695 | cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); |
696 | cpumask_copy(trial->effective_cpus, cs->effective_cpus); | |
0c7f293e | 697 | cpumask_copy(trial->effective_xcpus, cs->effective_xcpus); |
e2ffe502 | 698 | cpumask_copy(trial->exclusive_cpus, cs->exclusive_cpus); |
300ed6cb | 699 | return trial; |
645fcc9d LZ |
700 | } |
701 | ||
702 | /** | |
bf92370c WL |
703 | * free_cpuset - free the cpuset |
704 | * @cs: the cpuset to be freed | |
645fcc9d | 705 | */ |
bf92370c | 706 | static inline void free_cpuset(struct cpuset *cs) |
645fcc9d | 707 | { |
bf92370c WL |
708 | free_cpumasks(cs, NULL); |
709 | kfree(cs); | |
645fcc9d LZ |
710 | } |
711 | ||
e2ffe502 WL |
712 | static inline struct cpumask *fetch_xcpus(struct cpuset *cs) |
713 | { | |
714 | return !cpumask_empty(cs->exclusive_cpus) ? cs->exclusive_cpus : | |
715 | cpumask_empty(cs->effective_xcpus) ? cs->cpus_allowed | |
716 | : cs->effective_xcpus; | |
717 | } | |
718 | ||
0c7f293e | 719 | /* |
783a8334 | 720 | * cpusets_are_exclusive() - check if two cpusets are exclusive |
0c7f293e | 721 | * |
783a8334 | 722 | * Return true if exclusive, false if not |
0c7f293e | 723 | */ |
783a8334 | 724 | static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2) |
0c7f293e | 725 | { |
e2ffe502 WL |
726 | struct cpumask *xcpus1 = fetch_xcpus(cs1); |
727 | struct cpumask *xcpus2 = fetch_xcpus(cs2); | |
0c7f293e | 728 | |
e2ffe502 | 729 | if (cpumask_intersects(xcpus1, xcpus2)) |
783a8334 HM |
730 | return false; |
731 | return true; | |
0c7f293e WL |
732 | } |
733 | ||
d068eebb MK |
734 | /* |
735 | * validate_change_legacy() - Validate conditions specific to legacy (v1) | |
736 | * behavior. | |
737 | */ | |
738 | static int validate_change_legacy(struct cpuset *cur, struct cpuset *trial) | |
739 | { | |
740 | struct cgroup_subsys_state *css; | |
741 | struct cpuset *c, *par; | |
742 | int ret; | |
743 | ||
744 | WARN_ON_ONCE(!rcu_read_lock_held()); | |
745 | ||
746 | /* Each of our child cpusets must be a subset of us */ | |
747 | ret = -EBUSY; | |
748 | cpuset_for_each_child(c, css, cur) | |
749 | if (!is_cpuset_subset(c, trial)) | |
750 | goto out; | |
751 | ||
752 | /* On legacy hierarchy, we must be a subset of our parent cpuset. */ | |
753 | ret = -EACCES; | |
754 | par = parent_cs(cur); | |
755 | if (par && !is_cpuset_subset(trial, par)) | |
756 | goto out; | |
757 | ||
758 | ret = 0; | |
759 | out: | |
760 | return ret; | |
761 | } | |
762 | ||
1da177e4 LT |
763 | /* |
764 | * validate_change() - Used to validate that any proposed cpuset change | |
765 | * follows the structural rules for cpusets. | |
766 | * | |
767 | * If we replaced the flag and mask values of the current cpuset | |
768 | * (cur) with those values in the trial cpuset (trial), would | |
769 | * our various subset and exclusive rules still be valid? Presumes | |
111cd11b | 770 | * cpuset_mutex held. |
1da177e4 LT |
771 | * |
772 | * 'cur' is the address of an actual, in-use cpuset. Operations | |
773 | * such as list traversal that depend on the actual address of the | |
774 | * cpuset in the list must use cur below, not trial. | |
775 | * | |
776 | * 'trial' is the address of bulk structure copy of cur, with | |
777 | * perhaps one or more of the fields cpus_allowed, mems_allowed, | |
778 | * or flags changed to new, trial values. | |
779 | * | |
780 | * Return 0 if valid, -errno if not. | |
781 | */ | |
782 | ||
c9710d80 | 783 | static int validate_change(struct cpuset *cur, struct cpuset *trial) |
1da177e4 | 784 | { |
492eb21b | 785 | struct cgroup_subsys_state *css; |
1da177e4 | 786 | struct cpuset *c, *par; |
d068eebb | 787 | int ret = 0; |
1da177e4 | 788 | |
1f1562fc | 789 | rcu_read_lock(); |
69604067 | 790 | |
d068eebb MK |
791 | if (!is_in_v2_mode()) |
792 | ret = validate_change_legacy(cur, trial); | |
793 | if (ret) | |
ae8086ce | 794 | goto out; |
1da177e4 | 795 | |
d068eebb MK |
796 | /* Remaining checks don't apply to root cpuset */ |
797 | if (cur == &top_cpuset) | |
ae8086ce | 798 | goto out; |
1da177e4 | 799 | |
d068eebb MK |
800 | par = parent_cs(cur); |
801 | ||
452477fa TH |
802 | /* |
803 | * Cpusets with tasks - existing or newly being attached - can't | |
1c09b195 | 804 | * be changed to have empty cpus_allowed or mems_allowed. |
452477fa | 805 | */ |
ae8086ce | 806 | ret = -ENOSPC; |
27bd4dbb | 807 | if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) { |
1c09b195 LZ |
808 | if (!cpumask_empty(cur->cpus_allowed) && |
809 | cpumask_empty(trial->cpus_allowed)) | |
810 | goto out; | |
811 | if (!nodes_empty(cur->mems_allowed) && | |
812 | nodes_empty(trial->mems_allowed)) | |
813 | goto out; | |
814 | } | |
020958b6 | 815 | |
f82f8042 JL |
816 | /* |
817 | * We can't shrink if we won't have enough room for SCHED_DEADLINE | |
818 | * tasks. | |
819 | */ | |
820 | ret = -EBUSY; | |
821 | if (is_cpu_exclusive(cur) && | |
822 | !cpuset_cpumask_can_shrink(cur->cpus_allowed, | |
823 | trial->cpus_allowed)) | |
824 | goto out; | |
825 | ||
74027a65 WL |
826 | /* |
827 | * If either I or some sibling (!= me) is exclusive, we can't | |
828 | * overlap | |
829 | */ | |
830 | ret = -EINVAL; | |
831 | cpuset_for_each_child(c, css, par) { | |
832 | if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && | |
0c7f293e | 833 | c != cur) { |
783a8334 | 834 | if (!cpusets_are_exclusive(trial, c)) |
0c7f293e WL |
835 | goto out; |
836 | } | |
74027a65 WL |
837 | if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && |
838 | c != cur && | |
839 | nodes_intersects(trial->mems_allowed, c->mems_allowed)) | |
840 | goto out; | |
841 | } | |
842 | ||
ae8086ce TH |
843 | ret = 0; |
844 | out: | |
845 | rcu_read_unlock(); | |
846 | return ret; | |
1da177e4 LT |
847 | } |
848 | ||
db7f47cf | 849 | #ifdef CONFIG_SMP |
029190c5 | 850 | /* |
cf417141 | 851 | * Helper routine for generate_sched_domains(). |
8b5f1c52 | 852 | * Do cpusets a, b have overlapping effective cpus_allowed masks? |
029190c5 | 853 | */ |
029190c5 PJ |
854 | static int cpusets_overlap(struct cpuset *a, struct cpuset *b) |
855 | { | |
8b5f1c52 | 856 | return cpumask_intersects(a->effective_cpus, b->effective_cpus); |
029190c5 PJ |
857 | } |
858 | ||
1d3504fc HS |
859 | static void |
860 | update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) | |
861 | { | |
1d3504fc HS |
862 | if (dattr->relax_domain_level < c->relax_domain_level) |
863 | dattr->relax_domain_level = c->relax_domain_level; | |
864 | return; | |
865 | } | |
866 | ||
fc560a26 TH |
867 | static void update_domain_attr_tree(struct sched_domain_attr *dattr, |
868 | struct cpuset *root_cs) | |
f5393693 | 869 | { |
fc560a26 | 870 | struct cpuset *cp; |
492eb21b | 871 | struct cgroup_subsys_state *pos_css; |
f5393693 | 872 | |
fc560a26 | 873 | rcu_read_lock(); |
492eb21b | 874 | cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { |
fc560a26 TH |
875 | /* skip the whole subtree if @cp doesn't have any CPU */ |
876 | if (cpumask_empty(cp->cpus_allowed)) { | |
492eb21b | 877 | pos_css = css_rightmost_descendant(pos_css); |
f5393693 | 878 | continue; |
fc560a26 | 879 | } |
f5393693 LJ |
880 | |
881 | if (is_sched_load_balance(cp)) | |
882 | update_domain_attr(dattr, cp); | |
f5393693 | 883 | } |
fc560a26 | 884 | rcu_read_unlock(); |
f5393693 LJ |
885 | } |
886 | ||
111cd11b | 887 | /* Must be called with cpuset_mutex held. */ |
be040bea PB |
888 | static inline int nr_cpusets(void) |
889 | { | |
890 | /* jump label reference count + the top-level cpuset */ | |
891 | return static_key_count(&cpusets_enabled_key.key) + 1; | |
892 | } | |
893 | ||
029190c5 | 894 | /* |
cf417141 MK |
895 | * generate_sched_domains() |
896 | * | |
897 | * This function builds a partial partition of the system's CPUs. |
898 | * A 'partial partition' is a set of non-overlapping subsets whose | |
899 | * union is a subset of that set. | |
0a0fca9d | 900 | * The output of this function needs to be passed to kernel/sched/core.c |
cf417141 MK |
901 | * partition_sched_domains() routine, which will rebuild the scheduler's |
902 | * load balancing domains (sched domains) as specified by that partial | |
903 | * partition. | |
029190c5 | 904 | * |
da82c92f | 905 | * See "What is sched_load_balance" in Documentation/admin-guide/cgroup-v1/cpusets.rst |
029190c5 PJ |
906 | * for a background explanation of this. |
907 | * | |
908 | * Does not return errors, on the theory that the callers of this | |
909 | * routine would rather not worry about failures to rebuild sched | |
910 | * domains when operating in the severe memory shortage situations | |
911 | * that could cause allocation failures below. | |
912 | * | |
111cd11b | 913 | * Must be called with cpuset_mutex held. |
029190c5 PJ |
914 | * |
915 | * The three key local variables below are: | |
b6fbbf31 JL |
916 | * cp - cpuset pointer, used (together with pos_css) to perform a |
917 | * top-down scan of all cpusets. For our purposes, rebuilding | |
918 | * the schedulers sched domains, we can ignore !is_sched_load_ | |
919 | * balance cpusets. | |
029190c5 PJ |
920 | * csa - (for CpuSet Array) Array of pointers to all the cpusets |
921 | * that need to be load balanced, for convenient iterative | |
922 | * access by the subsequent code that finds the best partition, | |
923 | * i.e the set of domains (subsets) of CPUs such that the | |
924 | * cpus_allowed of every cpuset marked is_sched_load_balance | |
925 | * is a subset of one of these domains, while there are as | |
926 | * many such domains as possible, each as small as possible. | |
927 | * doms - Conversion of 'csa' to an array of cpumasks, for passing to | |
0a0fca9d | 928 | * the kernel/sched/core.c routine partition_sched_domains() in a |
029190c5 PJ |
929 | * convenient format, that can be easily compared to the prior |
930 | * value to determine what partition elements (sched domains) | |
931 | * were changed (added or removed.) | |
932 | * | |
933 | * Finding the best partition (set of domains): | |
934 | * The triple nested loops below over i, j, k scan over the | |
935 | * load balanced cpusets (using the array of cpuset pointers in | |
936 | * csa[]) looking for pairs of cpusets that have overlapping | |
937 | * cpus_allowed, but which don't have the same 'pn' partition | |
938 | * number, and puts them in the same partition number. It keeps |
939 | * looping on the 'restart' label until it can no longer find | |
940 | * any such pairs. | |
941 | * | |
942 | * The union of the cpus_allowed masks from the set of | |
943 | * all cpusets having the same 'pn' value then form the one | |
944 | * element of the partition (one sched domain) to be passed to | |
945 | * partition_sched_domains(). | |
946 | */ | |
acc3f5d7 | 947 | static int generate_sched_domains(cpumask_var_t **domains, |
cf417141 | 948 | struct sched_domain_attr **attributes) |
029190c5 | 949 | { |
b6fbbf31 | 950 | struct cpuset *cp; /* top-down scan of cpusets */ |
029190c5 PJ |
951 | struct cpuset **csa; /* array of all cpuset ptrs */ |
952 | int csn; /* how many cpuset ptrs in csa so far */ | |
953 | int i, j, k; /* indices for partition finding loops */ | |
acc3f5d7 | 954 | cpumask_var_t *doms; /* resulting partition; i.e. sched domains */ |
1d3504fc | 955 | struct sched_domain_attr *dattr; /* attributes for custom domains */ |
1583715d | 956 | int ndoms = 0; /* number of sched domains in result */ |
6af866af | 957 | int nslot; /* next empty doms[] struct cpumask slot */ |
492eb21b | 958 | struct cgroup_subsys_state *pos_css; |
0ccea8fe | 959 | bool root_load_balance = is_sched_load_balance(&top_cpuset); |
029190c5 | 960 | |
029190c5 | 961 | doms = NULL; |
1d3504fc | 962 | dattr = NULL; |
cf417141 | 963 | csa = NULL; |
029190c5 PJ |
964 | |
965 | /* Special case for the 99% of systems with one, full, sched domain */ | |
0c7f293e | 966 | if (root_load_balance && !top_cpuset.nr_subparts) { |
acc3f5d7 RR |
967 | ndoms = 1; |
968 | doms = alloc_sched_domains(ndoms); | |
029190c5 | 969 | if (!doms) |
cf417141 MK |
970 | goto done; |
971 | ||
1d3504fc HS |
972 | dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); |
973 | if (dattr) { | |
974 | *dattr = SD_ATTR_INIT; | |
93a65575 | 975 | update_domain_attr_tree(dattr, &top_cpuset); |
1d3504fc | 976 | } |
47b8ea71 | 977 | cpumask_and(doms[0], top_cpuset.effective_cpus, |
04d4e665 | 978 | housekeeping_cpumask(HK_TYPE_DOMAIN)); |
cf417141 | 979 | |
cf417141 | 980 | goto done; |
029190c5 PJ |
981 | } |
982 | ||
6da2ec56 | 983 | csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL); |
029190c5 PJ |
984 | if (!csa) |
985 | goto done; | |
986 | csn = 0; | |
987 | ||
fc560a26 | 988 | rcu_read_lock(); |
0ccea8fe WL |
989 | if (root_load_balance) |
990 | csa[csn++] = &top_cpuset; | |
492eb21b | 991 | cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) { |
bd8815a6 TH |
992 | if (cp == &top_cpuset) |
993 | continue; | |
f5393693 | 994 | /* |
fc560a26 TH |
995 | * Continue traversing beyond @cp iff @cp has some CPUs and |
996 | * isn't load balancing. The former is obvious. The | |
997 | * latter: All child cpusets contain a subset of the | |
998 | * parent's cpus, so just skip them, and then we call | |
999 | * update_domain_attr_tree() to calc relax_domain_level of | |
1000 | * the corresponding sched domain. | |
0ccea8fe WL |
1001 | * |
1002 | * If root is load-balancing, we can skip @cp if it | |
1003 | * is a subset of the root's effective_cpus. | |
f5393693 | 1004 | */ |
fc560a26 | 1005 | if (!cpumask_empty(cp->cpus_allowed) && |
47b8ea71 | 1006 | !(is_sched_load_balance(cp) && |
edb93821 | 1007 | cpumask_intersects(cp->cpus_allowed, |
04d4e665 | 1008 | housekeeping_cpumask(HK_TYPE_DOMAIN)))) |
f5393693 | 1009 | continue; |
489a5393 | 1010 | |
0ccea8fe WL |
1011 | if (root_load_balance && |
1012 | cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus)) | |
1013 | continue; | |
1014 | ||
cd1cb335 VS |
1015 | if (is_sched_load_balance(cp) && |
1016 | !cpumask_empty(cp->effective_cpus)) | |
fc560a26 TH |
1017 | csa[csn++] = cp; |
1018 | ||
0ccea8fe | 1019 | /* skip @cp's subtree if not a partition root */ |
18065ebe | 1020 | if (!is_partition_valid(cp)) |
0ccea8fe | 1021 | pos_css = css_rightmost_descendant(pos_css); |
fc560a26 TH |
1022 | } |
1023 | rcu_read_unlock(); | |
029190c5 PJ |
1024 | |
1025 | for (i = 0; i < csn; i++) | |
1026 | csa[i]->pn = i; | |
1027 | ndoms = csn; | |
1028 | ||
1029 | restart: | |
1030 | /* Find the best partition (set of sched domains) */ | |
1031 | for (i = 0; i < csn; i++) { | |
1032 | struct cpuset *a = csa[i]; | |
1033 | int apn = a->pn; | |
1034 | ||
1035 | for (j = 0; j < csn; j++) { | |
1036 | struct cpuset *b = csa[j]; | |
1037 | int bpn = b->pn; | |
1038 | ||
1039 | if (apn != bpn && cpusets_overlap(a, b)) { | |
1040 | for (k = 0; k < csn; k++) { | |
1041 | struct cpuset *c = csa[k]; | |
1042 | ||
1043 | if (c->pn == bpn) | |
1044 | c->pn = apn; | |
1045 | } | |
1046 | ndoms--; /* one less element */ | |
1047 | goto restart; | |
1048 | } | |
1049 | } | |
1050 | } | |
1051 | ||
cf417141 MK |
1052 | /* |
1053 | * Now we know how many domains to create. | |
1054 | * Convert <csn, csa> to <ndoms, doms> and populate cpu masks. | |
1055 | */ | |
acc3f5d7 | 1056 | doms = alloc_sched_domains(ndoms); |
700018e0 | 1057 | if (!doms) |
cf417141 | 1058 | goto done; |
cf417141 MK |
1059 | |
1060 | /* | |
1061 | * The rest of the code, including the scheduler, can deal with | |
1062 | * dattr==NULL case. No need to abort if alloc fails. | |
1063 | */ | |
6da2ec56 KC |
1064 | dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr), |
1065 | GFP_KERNEL); | |
029190c5 PJ |
1066 | |
1067 | for (nslot = 0, i = 0; i < csn; i++) { | |
1068 | struct cpuset *a = csa[i]; | |
6af866af | 1069 | struct cpumask *dp; |
029190c5 PJ |
1070 | int apn = a->pn; |
1071 | ||
cf417141 MK |
1072 | if (apn < 0) { |
1073 | /* Skip completed partitions */ | |
1074 | continue; | |
1075 | } | |
1076 | ||
acc3f5d7 | 1077 | dp = doms[nslot]; |
cf417141 MK |
1078 | |
1079 | if (nslot == ndoms) { | |
1080 | static int warnings = 10; | |
1081 | if (warnings) { | |
12d3089c FF |
1082 | pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n", |
1083 | nslot, ndoms, csn, i, apn); | |
cf417141 | 1084 | warnings--; |
029190c5 | 1085 | } |
cf417141 MK |
1086 | continue; |
1087 | } | |
029190c5 | 1088 | |
6af866af | 1089 | cpumask_clear(dp); |
cf417141 MK |
1090 | if (dattr) |
1091 | *(dattr + nslot) = SD_ATTR_INIT; | |
1092 | for (j = i; j < csn; j++) { | |
1093 | struct cpuset *b = csa[j]; | |
1094 | ||
1095 | if (apn == b->pn) { | |
8b5f1c52 | 1096 | cpumask_or(dp, dp, b->effective_cpus); |
04d4e665 | 1097 | cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN)); |
cf417141 MK |
1098 | if (dattr) |
1099 | update_domain_attr_tree(dattr + nslot, b); | |
1100 | ||
1101 | /* Done with this partition */ | |
1102 | b->pn = -1; | |
029190c5 | 1103 | } |
029190c5 | 1104 | } |
cf417141 | 1105 | nslot++; |
029190c5 PJ |
1106 | } |
1107 | BUG_ON(nslot != ndoms); | |
1108 | ||
cf417141 MK |
1109 | done: |
1110 | kfree(csa); | |
1111 | ||
700018e0 LZ |
1112 | /* |
1113 | * Fallback to the default domain if kmalloc() failed. | |
1114 | * See comments in partition_sched_domains(). | |
1115 | */ | |
1116 | if (doms == NULL) | |
1117 | ndoms = 1; | |
1118 | ||
cf417141 MK |
1119 | *domains = doms; |
1120 | *attributes = dattr; | |
1121 | return ndoms; | |
1122 | } | |
1123 | ||
ad3a557d | 1124 | static void dl_update_tasks_root_domain(struct cpuset *cs) |
f9a25f77 MP |
1125 | { |
1126 | struct css_task_iter it; | |
1127 | struct task_struct *task; | |
1128 | ||
c0f78fd5 JL |
1129 | if (cs->nr_deadline_tasks == 0) |
1130 | return; | |
1131 | ||
f9a25f77 MP |
1132 | css_task_iter_start(&cs->css, 0, &it); |
1133 | ||
1134 | while ((task = css_task_iter_next(&it))) | |
1135 | dl_add_task_root_domain(task); | |
1136 | ||
1137 | css_task_iter_end(&it); | |
1138 | } | |
1139 | ||
ad3a557d | 1140 | static void dl_rebuild_rd_accounting(void) |
f9a25f77 MP |
1141 | { |
1142 | struct cpuset *cs = NULL; | |
1143 | struct cgroup_subsys_state *pos_css; | |
1144 | ||
111cd11b | 1145 | lockdep_assert_held(&cpuset_mutex); |
f9a25f77 MP |
1146 | lockdep_assert_cpus_held(); |
1147 | lockdep_assert_held(&sched_domains_mutex); | |
1148 | ||
f9a25f77 MP |
1149 | rcu_read_lock(); |
1150 | ||
1151 | /* | |
1152 | * Clear default root domain DL accounting, it will be computed again | |
1153 | * if a task belongs to it. | |
1154 | */ | |
1155 | dl_clear_root_domain(&def_root_domain); | |
1156 | ||
1157 | cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { | |
1158 | ||
1159 | if (cpumask_empty(cs->effective_cpus)) { | |
1160 | pos_css = css_rightmost_descendant(pos_css); | |
1161 | continue; | |
1162 | } | |
1163 | ||
1164 | css_get(&cs->css); | |
1165 | ||
1166 | rcu_read_unlock(); | |
1167 | ||
ad3a557d | 1168 | dl_update_tasks_root_domain(cs); |
f9a25f77 MP |
1169 | |
1170 | rcu_read_lock(); | |
1171 | css_put(&cs->css); | |
1172 | } | |
1173 | rcu_read_unlock(); | |
1174 | } | |
1175 | ||
1176 | static void | |
1177 | partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[], | |
1178 | struct sched_domain_attr *dattr_new) | |
1179 | { | |
1180 | mutex_lock(&sched_domains_mutex); | |
1181 | partition_sched_domains_locked(ndoms_new, doms_new, dattr_new); | |
ad3a557d | 1182 | dl_rebuild_rd_accounting(); |
f9a25f77 MP |
1183 | mutex_unlock(&sched_domains_mutex); |
1184 | } | |
1185 | ||
cf417141 MK |
1186 | /* |
1187 | * Rebuild scheduler domains. | |
1188 | * | |
699140ba TH |
1189 | * If the flag 'sched_load_balance' of any cpuset with non-empty |
1190 | * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset | |
1191 | * which has that flag enabled, or if any cpuset with a non-empty | |
1192 | * 'cpus' is removed, then call this routine to rebuild the | |
1193 | * scheduler's dynamic sched domains. | |
cf417141 | 1194 | * |
111cd11b | 1195 | * Call with cpuset_mutex held. Takes cpus_read_lock(). |
cf417141 | 1196 | */ |
699140ba | 1197 | static void rebuild_sched_domains_locked(void) |
cf417141 | 1198 | { |
406100f3 | 1199 | struct cgroup_subsys_state *pos_css; |
cf417141 | 1200 | struct sched_domain_attr *attr; |
acc3f5d7 | 1201 | cpumask_var_t *doms; |
406100f3 | 1202 | struct cpuset *cs; |
cf417141 MK |
1203 | int ndoms; |
1204 | ||
d74b27d6 | 1205 | lockdep_assert_cpus_held(); |
111cd11b | 1206 | lockdep_assert_held(&cpuset_mutex); |
cf417141 | 1207 | |
5b16c2a4 | 1208 | /* |
406100f3 | 1209 | * If we have raced with CPU hotplug, return early to avoid |
5b16c2a4 | 1210 | * passing doms with offlined cpu to partition_sched_domains(). |
2125c003 | 1211 | * Anyway, cpuset_handle_hotplug() will rebuild sched domains. |
406100f3 DJ |
1212 | * |
1213 | * With no CPUs in any subpartitions, top_cpuset's effective CPUs | |
1214 | * should be the same as the active CPUs, so checking only top_cpuset | |
1215 | * is enough to detect racing CPU offlines. | |
5b16c2a4 | 1216 | */ |
0c7f293e | 1217 | if (cpumask_empty(subpartitions_cpus) && |
0ccea8fe | 1218 | !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask)) |
d74b27d6 | 1219 | return; |
0ccea8fe | 1220 | |
406100f3 DJ |
1221 | /* |
1222 | * With subpartition CPUs, however, the effective CPUs of a partition | |
1223 | * root should be only a subset of the active CPUs. Since a CPU in any | |
1224 | * partition root could be offlined, all must be checked. | |
1225 | */ | |
0c7f293e | 1226 | if (top_cpuset.nr_subparts) { |
406100f3 DJ |
1227 | rcu_read_lock(); |
1228 | cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { | |
18065ebe | 1229 | if (!is_partition_valid(cs)) { |
406100f3 DJ |
1230 | pos_css = css_rightmost_descendant(pos_css); |
1231 | continue; | |
1232 | } | |
1233 | if (!cpumask_subset(cs->effective_cpus, | |
1234 | cpu_active_mask)) { | |
1235 | rcu_read_unlock(); | |
1236 | return; | |
1237 | } | |
1238 | } | |
1239 | rcu_read_unlock(); | |
1240 | } | |
5b16c2a4 | 1241 | |
cf417141 | 1242 | /* Generate domain masks and attrs */ |
cf417141 | 1243 | ndoms = generate_sched_domains(&doms, &attr); |
cf417141 MK |
1244 | |
1245 | /* Have scheduler rebuild the domains */ | |
f9a25f77 | 1246 | partition_and_rebuild_sched_domains(ndoms, doms, attr); |
cf417141 | 1247 | } |
db7f47cf | 1248 | #else /* !CONFIG_SMP */ |
699140ba | 1249 | static void rebuild_sched_domains_locked(void) |
db7f47cf PM |
1250 | { |
1251 | } | |
db7f47cf | 1252 | #endif /* CONFIG_SMP */ |
029190c5 | 1253 | |
2125c003 | 1254 | static void rebuild_sched_domains_cpuslocked(void) |
cf417141 | 1255 | { |
111cd11b | 1256 | mutex_lock(&cpuset_mutex); |
699140ba | 1257 | rebuild_sched_domains_locked(); |
111cd11b | 1258 | mutex_unlock(&cpuset_mutex); |
2125c003 WL |
1259 | } |
1260 | ||
1261 | void rebuild_sched_domains(void) | |
1262 | { | |
1263 | cpus_read_lock(); | |
1264 | rebuild_sched_domains_cpuslocked(); | |
c5c63b9a | 1265 | cpus_read_unlock(); |
029190c5 PJ |
1266 | } |
1267 | ||
0b2f630a MX |
1268 | /** |
1269 | * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. | |
1270 | * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed | |
7a2127e6 | 1271 | * @new_cpus: the temp variable for the new effective_cpus mask |
0b2f630a | 1272 | * |
d66393e5 | 1273 | * Iterate through each task of @cs updating its cpus_allowed to the |
111cd11b | 1274 | * effective cpuset's. As this function is called with cpuset_mutex held, |
6667439f WL |
1275 | * cpuset membership stays stable. For top_cpuset, task_cpu_possible_mask() |
1276 | * is used instead of effective_cpus to make sure all offline CPUs are also | |
1277 | * included as hotplug code won't update cpumasks for tasks in top_cpuset. | |
0b2f630a | 1278 | */ |
7a2127e6 | 1279 | static void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus) |
0b2f630a | 1280 | { |
d66393e5 TH |
1281 | struct css_task_iter it; |
1282 | struct task_struct *task; | |
ec5fbdfb | 1283 | bool top_cs = cs == &top_cpuset; |
d66393e5 | 1284 | |
bc2fb7ed | 1285 | css_task_iter_start(&cs->css, 0, &it); |
ec5fbdfb | 1286 | while ((task = css_task_iter_next(&it))) { |
6667439f | 1287 | const struct cpumask *possible_mask = task_cpu_possible_mask(task); |
7a2127e6 | 1288 | |
6667439f WL |
1289 | if (top_cs) { |
1290 | /* | |
1291 | * Percpu kthreads in top_cpuset are ignored | |
1292 | */ | |
a453be97 | 1293 | if (kthread_is_per_cpu(task)) |
6667439f | 1294 | continue; |
0c7f293e | 1295 | cpumask_andnot(new_cpus, possible_mask, subpartitions_cpus); |
6667439f WL |
1296 | } else { |
1297 | cpumask_and(new_cpus, possible_mask, cs->effective_cpus); | |
1298 | } | |
7a2127e6 | 1299 | set_cpus_allowed_ptr(task, new_cpus); |
ec5fbdfb | 1300 | } |
d66393e5 | 1301 | css_task_iter_end(&it); |
0b2f630a MX |
1302 | } |
1303 | ||
ee8dde0c WL |
1304 | /** |
1305 | * compute_effective_cpumask - Compute the effective cpumask of the cpuset | |
1306 | * @new_cpus: the temp variable for the new effective_cpus mask | |
1307 | * @cs: the cpuset that needs to recompute the new effective_cpus mask |
1308 | * @parent: the parent cpuset | |
1309 | * | |
0c7f293e | 1310 | * The result is valid only if the given cpuset isn't a partition root. |
ee8dde0c WL |
1311 | */ |
1312 | static void compute_effective_cpumask(struct cpumask *new_cpus, | |
1313 | struct cpuset *cs, struct cpuset *parent) | |
1314 | { | |
0c7f293e | 1315 | cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus); |
ee8dde0c WL |
1316 | } |
1317 | ||
1318 | /* | |
0c7f293e | 1319 | * Commands for update_parent_effective_cpumask |
ee8dde0c | 1320 | */ |
0c7f293e WL |
1321 | enum partition_cmd { |
1322 | partcmd_enable, /* Enable partition root */ | |
11e5f407 | 1323 | partcmd_enablei, /* Enable isolated partition root */ |
0c7f293e WL |
1324 | partcmd_disable, /* Disable partition root */ |
1325 | partcmd_update, /* Update parent's effective_cpus */ | |
1326 | partcmd_invalidate, /* Make partition invalid */ | |
ee8dde0c WL |
1327 | }; |
1328 | ||
f0af1bfc WL |
1329 | static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, |
1330 | int turning_on); | |
99fe36ba WL |
1331 | static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, |
1332 | struct tmpmasks *tmp); | |
a86ce680 WL |
1333 | |
1334 | /* | |
1335 | * Update partition exclusive flag | |
1336 | * | |
1337 | * Return: 0 if successful, an error code otherwise | |
1338 | */ | |
1339 | static int update_partition_exclusive(struct cpuset *cs, int new_prs) | |
1340 | { | |
1341 | bool exclusive = (new_prs > 0); | |
1342 | ||
1343 | if (exclusive && !is_cpu_exclusive(cs)) { | |
1344 | if (update_flag(CS_CPU_EXCLUSIVE, cs, 1)) | |
1345 | return PERR_NOTEXCL; | |
1346 | } else if (!exclusive && is_cpu_exclusive(cs)) { | |
1347 | /* Turning off CS_CPU_EXCLUSIVE will not return error */ | |
1348 | update_flag(CS_CPU_EXCLUSIVE, cs, 0); | |
1349 | } | |
1350 | return 0; | |
1351 | } | |
1352 | ||
1353 | /* | |
1354 | * Update partition load balance flag and/or rebuild sched domain | |
1355 | * | |
1356 | * Changing load balance flag will automatically call | |
1357 | * rebuild_sched_domains_locked(). | |
6fcdb018 | 1358 | * This function is for cgroup v2 only. |
a86ce680 WL |
1359 | */ |
1360 | static void update_partition_sd_lb(struct cpuset *cs, int old_prs) | |
1361 | { | |
1362 | int new_prs = cs->partition_root_state; | |
a86ce680 | 1363 | bool rebuild_domains = (new_prs > 0) || (old_prs > 0); |
6fcdb018 | 1364 | bool new_lb; |
a86ce680 | 1365 | |
6fcdb018 WL |
1366 | /* |
1367 | * If cs is not a valid partition root, the load balance state | |
1368 | * will follow its parent. | |
1369 | */ | |
1370 | if (new_prs > 0) { | |
1371 | new_lb = (new_prs != PRS_ISOLATED); | |
1372 | } else { | |
1373 | new_lb = is_sched_load_balance(parent_cs(cs)); | |
1374 | } | |
a86ce680 WL |
1375 | if (new_lb != !!is_sched_load_balance(cs)) { |
1376 | rebuild_domains = true; | |
1377 | if (new_lb) | |
1378 | set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); | |
1379 | else | |
1380 | clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); | |
1381 | } | |
1382 | ||
1383 | if (rebuild_domains) | |
1384 | rebuild_sched_domains_locked(); | |
1385 | } | |
1386 | ||
0c7f293e WL |
1387 | /* |
1388 | * tasks_nocpu_error - Return true if tasks will have no effective_cpus | |
1389 | */ | |
1390 | static bool tasks_nocpu_error(struct cpuset *parent, struct cpuset *cs, | |
1391 | struct cpumask *xcpus) | |
1392 | { | |
1393 | /* | |
1394 | * A populated partition (cs or parent) can't have empty effective_cpus | |
1395 | */ | |
1396 | return (cpumask_subset(parent->effective_cpus, xcpus) && | |
1397 | partition_is_populated(parent, cs)) || | |
1398 | (!cpumask_intersects(xcpus, cpu_active_mask) && | |
1399 | partition_is_populated(cs, NULL)); | |
1400 | } | |
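/*
 * Example (assumed values): with parent->effective_cpus = 0-3 and
 * xcpus = 0-3, granting all of 0-3 to the prospective partition would
 * leave a populated parent with no effective CPU, so true is returned.
 * Likewise, if xcpus contains no active CPU while cs itself has tasks,
 * the request is an error.
 */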
1401 | ||
e2ffe502 WL |
1402 | static void reset_partition_data(struct cpuset *cs) |
1403 | { | |
1404 | struct cpuset *parent = parent_cs(cs); | |
1405 | ||
1406 | if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) | |
1407 | return; | |
1408 | ||
1409 | lockdep_assert_held(&callback_lock); | |
1410 | ||
1411 | cs->nr_subparts = 0; | |
1412 | if (cpumask_empty(cs->exclusive_cpus)) { | |
1413 | cpumask_clear(cs->effective_xcpus); | |
1414 | if (is_cpu_exclusive(cs)) | |
1415 | clear_bit(CS_CPU_EXCLUSIVE, &cs->flags); | |
1416 | } | |
1417 | if (!cpumask_and(cs->effective_cpus, | |
1418 | parent->effective_cpus, cs->cpus_allowed)) { | |
1419 | cs->use_parent_ecpus = true; | |
1420 | parent->child_ecpus_count++; | |
1421 | cpumask_copy(cs->effective_cpus, parent->effective_cpus); | |
1422 | } | |
1423 | } | |
1424 | ||
11e5f407 WL |
1425 | /* |
1426 | * partition_xcpus_newstate - Exclusive CPUs state change | |
1427 | * @old_prs: old partition_root_state | |
1428 | * @new_prs: new partition_root_state | |
1429 | * @xcpus: exclusive CPUs with state change | |
1430 | */ | |
1431 | static void partition_xcpus_newstate(int old_prs, int new_prs, struct cpumask *xcpus) | |
1432 | { | |
1433 | WARN_ON_ONCE(old_prs == new_prs); | |
1434 | if (new_prs == PRS_ISOLATED) | |
1435 | cpumask_or(isolated_cpus, isolated_cpus, xcpus); | |
1436 | else | |
1437 | cpumask_andnot(isolated_cpus, isolated_cpus, xcpus); | |
1438 | } | |
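/*
 * For example, switching exclusive CPUs 2-3 from a load-balanced root
 * partition to an isolated one (old_prs == PRS_ROOT, new_prs ==
 * PRS_ISOLATED) ORs CPUs 2-3 into isolated_cpus; the reverse
 * transition clears them again.
 */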
1439 | ||
1440 | /* | |
1441 | * partition_xcpus_add - Add new exclusive CPUs to partition | |
1442 | * @new_prs: new partition_root_state | |
1443 | * @parent: parent cpuset | |
1444 | * @xcpus: exclusive CPUs to be added | |
72c6303a | 1445 | * Return: true if isolated_cpus modified, false otherwise |
11e5f407 WL |
1446 | * |
1447 | * Remote partition if parent == NULL | |
1448 | */ | |
72c6303a | 1449 | static bool partition_xcpus_add(int new_prs, struct cpuset *parent, |
11e5f407 WL |
1450 | struct cpumask *xcpus) |
1451 | { | |
72c6303a WL |
1452 | bool isolcpus_updated; |
1453 | ||
11e5f407 WL |
1454 | WARN_ON_ONCE(new_prs < 0); |
1455 | lockdep_assert_held(&callback_lock); | |
1456 | if (!parent) | |
1457 | parent = &top_cpuset; | |
1458 | ||
72c6303a | 1459 | |
11e5f407 WL |
1460 | if (parent == &top_cpuset) |
1461 | cpumask_or(subpartitions_cpus, subpartitions_cpus, xcpus); | |
1462 | ||
72c6303a WL |
1463 | isolcpus_updated = (new_prs != parent->partition_root_state); |
1464 | if (isolcpus_updated) | |
11e5f407 WL |
1465 | partition_xcpus_newstate(parent->partition_root_state, new_prs, |
1466 | xcpus); | |
1467 | ||
1468 | cpumask_andnot(parent->effective_cpus, parent->effective_cpus, xcpus); | |
72c6303a | 1469 | return isolcpus_updated; |
11e5f407 WL |
1470 | } |
1471 | ||
1472 | /* | |
1473 | * partition_xcpus_del - Remove exclusive CPUs from partition | |
1474 | * @old_prs: old partition_root_state | |
1475 | * @parent: parent cpuset | |
1476 | * @xcpus: exclusive CPUs to be removed | |
72c6303a | 1477 | * Return: true if isolated_cpus modified, false otherwise |
11e5f407 WL |
1478 | * |
1479 | * Remote partition if parent == NULL | |
1480 | */ | |
72c6303a | 1481 | static bool partition_xcpus_del(int old_prs, struct cpuset *parent, |
11e5f407 WL |
1482 | struct cpumask *xcpus) |
1483 | { | |
72c6303a WL |
1484 | bool isolcpus_updated; |
1485 | ||
11e5f407 WL |
1486 | WARN_ON_ONCE(old_prs < 0); |
1487 | lockdep_assert_held(&callback_lock); | |
1488 | if (!parent) | |
1489 | parent = &top_cpuset; | |
1490 | ||
1491 | if (parent == &top_cpuset) | |
1492 | cpumask_andnot(subpartitions_cpus, subpartitions_cpus, xcpus); | |
1493 | ||
72c6303a WL |
1494 | isolcpus_updated = (old_prs != parent->partition_root_state); |
1495 | if (isolcpus_updated) | |
11e5f407 WL |
1496 | partition_xcpus_newstate(old_prs, parent->partition_root_state, |
1497 | xcpus); | |
1498 | ||
1499 | cpumask_and(xcpus, xcpus, cpu_active_mask); | |
1500 | cpumask_or(parent->effective_cpus, parent->effective_cpus, xcpus); | |
72c6303a WL |
1501 | return isolcpus_updated; |
1502 | } | |
1503 | ||
1504 | static void update_unbound_workqueue_cpumask(bool isolcpus_updated) | |
1505 | { | |
1506 | int ret; | |
1507 | ||
1508 | lockdep_assert_cpus_held(); | |
1509 | ||
1510 | if (!isolcpus_updated) | |
1511 | return; | |
1512 | ||
1513 | ret = workqueue_unbound_exclude_cpumask(isolated_cpus); | |
1514 | WARN_ON_ONCE(ret < 0); | |
11e5f407 WL |
1515 | } |
1516 | ||
3232e7aa WL |
1517 | /** |
1518 | * cpuset_cpu_is_isolated - Check if the given CPU is isolated | |
1519 | * @cpu: the CPU number to be checked | |
1520 | * Return: true if CPU is used in an isolated partition, false otherwise | |
1521 | */ | |
1522 | bool cpuset_cpu_is_isolated(int cpu) | |
1523 | { | |
1524 | return cpumask_test_cpu(cpu, isolated_cpus); | |
1525 | } | |
1526 | EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated); | |
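/*
 * A minimal usage sketch (hypothetical caller, not taken from the kernel
 * tree): a subsystem spreading background work across CPUs could consult
 * cpuset_cpu_is_isolated() to leave isolated partitions alone.
 */
#if 0	/* illustrative only */
static void spread_background_work(void)
{
	int cpu;

	for_each_online_cpu(cpu) {
		if (cpuset_cpu_is_isolated(cpu))
			continue;	/* skip CPUs in isolated partitions */
		/* ... queue or run per-CPU work on @cpu ... */
	}
}
#endif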
1527 | ||
e2ffe502 WL |
1528 | /* |
1529 | * compute_effective_exclusive_cpumask - compute effective exclusive CPUs | |
1530 | * @cs: cpuset | |
1531 | * @xcpus: effective exclusive CPUs value to be set | |
1532 | * Return: true if xcpus is not empty, false otherwise. | |
1533 | * | |
1534 | * Starting with exclusive_cpus (cpus_allowed if exclusive_cpus is not set), | |
1535 | * it must be a subset of cpus_allowed and parent's effective_xcpus. | |
1536 | */ | |
1537 | static bool compute_effective_exclusive_cpumask(struct cpuset *cs, | |
1538 | struct cpumask *xcpus) | |
1539 | { | |
1540 | struct cpuset *parent = parent_cs(cs); | |
1541 | ||
1542 | if (!xcpus) | |
1543 | xcpus = cs->effective_xcpus; | |
1544 | ||
1545 | if (!cpumask_empty(cs->exclusive_cpus)) | |
1546 | cpumask_and(xcpus, cs->exclusive_cpus, cs->cpus_allowed); | |
1547 | else | |
1548 | cpumask_copy(xcpus, cs->cpus_allowed); | |
1549 | ||
1550 | return cpumask_and(xcpus, xcpus, parent->effective_xcpus); | |
1551 | } | |
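/*
 * Worked example (assumed values): with exclusive_cpus = 0-7,
 * cpus_allowed = 0-3 and parent->effective_xcpus = 2-5, the result is
 * (0-7 & 0-3) & 2-5 = 2-3 and true is returned.  With exclusive_cpus
 * empty, cpus_allowed = 6-7 and parent->effective_xcpus = 0-3, the
 * intersection is empty and false is returned.
 */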
1552 | ||
181c8e09 WL |
1553 | static inline bool is_remote_partition(struct cpuset *cs) |
1554 | { | |
1555 | return !list_empty(&cs->remote_sibling); | |
1556 | } | |
1557 | ||
1558 | static inline bool is_local_partition(struct cpuset *cs) | |
1559 | { | |
1560 | return is_partition_valid(cs) && !is_remote_partition(cs); | |
1561 | } | |
1562 | ||
1563 | /* | |
1564 | * remote_partition_enable - Enable current cpuset as a remote partition root | |
1565 | * @cs: the cpuset to update | |
11e5f407 | 1566 | * @new_prs: new partition_root_state |
181c8e09 WL |
1567 | * @tmp: temporary masks | |
1568 | * Return: 1 if successful, 0 if error | |
1569 | * | |
1570 | * Enable the current cpuset to become a remote partition root taking CPUs | |
1571 | * directly from the top cpuset. cpuset_mutex must be held by the caller. | |
1572 | */ | |
11e5f407 WL |
1573 | static int remote_partition_enable(struct cpuset *cs, int new_prs, |
1574 | struct tmpmasks *tmp) | |
181c8e09 | 1575 | { |
72c6303a WL |
1576 | bool isolcpus_updated; |
1577 | ||
181c8e09 WL |
1578 | /* |
1579 | * The user must have sysadmin privilege. | |
1580 | */ | |
1581 | if (!capable(CAP_SYS_ADMIN)) | |
1582 | return 0; | |
1583 | ||
1584 | /* | |
1585 | * The requested exclusive_cpus must not be allocated to other | |
1586 | * partitions and it can't use up all the root's effective_cpus. | |
1587 | * | |
1588 | * Note that if there is any local partition root above it or | |
1589 | * remote partition root underneath it, its exclusive_cpus must | |
1590 | * have overlapped with subpartitions_cpus. | |
1591 | */ | |
1592 | compute_effective_exclusive_cpumask(cs, tmp->new_cpus); | |
1593 | if (cpumask_empty(tmp->new_cpus) || | |
1594 | cpumask_intersects(tmp->new_cpus, subpartitions_cpus) || | |
1595 | cpumask_subset(top_cpuset.effective_cpus, tmp->new_cpus)) | |
1596 | return 0; | |
1597 | ||
1598 | spin_lock_irq(&callback_lock); | |
72c6303a | 1599 | isolcpus_updated = partition_xcpus_add(new_prs, NULL, tmp->new_cpus); |
11e5f407 | 1600 | list_add(&cs->remote_sibling, &remote_children); |
181c8e09 WL |
1601 | if (cs->use_parent_ecpus) { |
1602 | struct cpuset *parent = parent_cs(cs); | |
1603 | ||
1604 | cs->use_parent_ecpus = false; | |
1605 | parent->child_ecpus_count--; | |
1606 | } | |
181c8e09 | 1607 | spin_unlock_irq(&callback_lock); |
72c6303a | 1608 | update_unbound_workqueue_cpumask(isolcpus_updated); |
181c8e09 WL |
1609 | |
1610 | /* | |
1611 | * Propagate changes in top_cpuset's effective_cpus down the hierarchy. | |
1612 | */ | |
1613 | update_tasks_cpumask(&top_cpuset, tmp->new_cpus); | |
1614 | update_sibling_cpumasks(&top_cpuset, NULL, tmp); | |
181c8e09 WL |
1615 | return 1; |
1616 | } | |
1617 | ||
1618 | /* | |
1619 | * remote_partition_disable - Remove current cpuset from remote partition list | |
1620 | * @cs: the cpuset to update | |
1621 | * @tmp: temporary masks | |
1622 | * | |
1623 | * The effective_cpus is also updated. | |
1624 | * | |
1625 | * cpuset_mutex must be held by the caller. | |
1626 | */ | |
1627 | static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp) | |
1628 | { | |
72c6303a WL |
1629 | bool isolcpus_updated; |
1630 | ||
181c8e09 WL |
1631 | compute_effective_exclusive_cpumask(cs, tmp->new_cpus); |
1632 | WARN_ON_ONCE(!is_remote_partition(cs)); | |
1633 | WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, subpartitions_cpus)); | |
1634 | ||
1635 | spin_lock_irq(&callback_lock); | |
181c8e09 | 1636 | list_del_init(&cs->remote_sibling); |
72c6303a WL |
1637 | isolcpus_updated = partition_xcpus_del(cs->partition_root_state, |
1638 | NULL, tmp->new_cpus); | |
181c8e09 WL |
1639 | cs->partition_root_state = -cs->partition_root_state; |
1640 | if (!cs->prs_err) | |
1641 | cs->prs_err = PERR_INVCPUS; | |
1642 | reset_partition_data(cs); | |
1643 | spin_unlock_irq(&callback_lock); | |
72c6303a | 1644 | update_unbound_workqueue_cpumask(isolcpus_updated); |
181c8e09 WL |
1645 | |
1646 | /* | |
1647 | * Propagate changes in top_cpuset's effective_cpus down the hierarchy. | |
1648 | */ | |
1649 | update_tasks_cpumask(&top_cpuset, tmp->new_cpus); | |
1650 | update_sibling_cpumasks(&top_cpuset, NULL, tmp); | |
1651 | } | |
1652 | ||
1653 | /* | |
1654 | * remote_cpus_update - cpus_exclusive change of remote partition | |
1655 | * @cs: the cpuset to be updated | |
1656 | * @newmask: the new effective_xcpus mask | |
1657 | * @tmp: temporary masks | |
1658 | * | |
1659 | * top_cpuset and subpartitions_cpus will be updated or partition can be | |
1660 | * invalidated. | |
1661 | */ | |
1662 | static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask, | |
1663 | struct tmpmasks *tmp) | |
1664 | { | |
1665 | bool adding, deleting; | |
11e5f407 | 1666 | int prs = cs->partition_root_state; |
72c6303a | 1667 | int isolcpus_updated = 0; |
181c8e09 WL |
1668 | |
1669 | if (WARN_ON_ONCE(!is_remote_partition(cs))) | |
1670 | return; | |
1671 | ||
1672 | WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus)); | |
1673 | ||
1674 | if (cpumask_empty(newmask)) | |
1675 | goto invalidate; | |
1676 | ||
1677 | adding = cpumask_andnot(tmp->addmask, newmask, cs->effective_xcpus); | |
1678 | deleting = cpumask_andnot(tmp->delmask, cs->effective_xcpus, newmask); | |
1679 | ||
1680 | /* | |
1681 | * Additions of remote CPUs are only allowed if those CPUs are | |
1682 | * not allocated to other partitions and there are effective_cpus | |
1683 | * left in the top cpuset. | |
1684 | */ | |
1685 | if (adding && (!capable(CAP_SYS_ADMIN) || | |
1686 | cpumask_intersects(tmp->addmask, subpartitions_cpus) || | |
1687 | cpumask_subset(top_cpuset.effective_cpus, tmp->addmask))) | |
1688 | goto invalidate; | |
1689 | ||
1690 | spin_lock_irq(&callback_lock); | |
11e5f407 | 1691 | if (adding) |
72c6303a | 1692 | isolcpus_updated += partition_xcpus_add(prs, NULL, tmp->addmask); |
11e5f407 | 1693 | if (deleting) |
72c6303a | 1694 | isolcpus_updated += partition_xcpus_del(prs, NULL, tmp->delmask); |
181c8e09 | 1695 | spin_unlock_irq(&callback_lock); |
72c6303a | 1696 | update_unbound_workqueue_cpumask(isolcpus_updated); |
181c8e09 WL |
1697 | |
1698 | /* | |
1699 | * Propagate changes in top_cpuset's effective_cpus down the hierarchy. | |
1700 | */ | |
1701 | update_tasks_cpumask(&top_cpuset, tmp->new_cpus); | |
1702 | update_sibling_cpumasks(&top_cpuset, NULL, tmp); | |
1703 | return; | |
1704 | ||
1705 | invalidate: | |
1706 | remote_partition_disable(cs, tmp); | |
1707 | } | |
1708 | ||
1709 | /* | |
1710 | * remote_partition_check - check if a child remote partition needs update | |
1711 | * @cs: the cpuset to be updated | |
1712 | * @newmask: the new effective_xcpus mask | |
1713 | * @delmask: temporary mask for deletion (not in tmp) | |
1714 | * @tmp: temporary masks | |
1715 | * | |
1716 | * This should be called before the given cs has updated its cpus_allowed | |
1717 | * and/or effective_xcpus. | |
1718 | */ | |
1719 | static void remote_partition_check(struct cpuset *cs, struct cpumask *newmask, | |
1720 | struct cpumask *delmask, struct tmpmasks *tmp) | |
1721 | { | |
1722 | struct cpuset *child, *next; | |
1723 | int disable_cnt = 0; | |
1724 | ||
1725 | /* | |
1726 | * Compute the effective exclusive CPUs that will be deleted. | |
1727 | */ | |
1728 | if (!cpumask_andnot(delmask, cs->effective_xcpus, newmask) || | |
1729 | !cpumask_intersects(delmask, subpartitions_cpus)) | |
1730 | return; /* No deletion of exclusive CPUs in partitions */ | |
1731 | ||
1732 | /* | |
1733 | * Search the remote children list for those that will be | |
1734 | * impacted by the deletion of exclusive CPUs. | |
1735 | * | |
1736 | * Since a cpuset must be removed from the remote children list | |
1737 | * before it can go offline, and holding cpuset_mutex prevents | |
1738 | * any change in cpuset status, an RCU read lock isn't needed. | |
1739 | */ | |
1740 | lockdep_assert_held(&cpuset_mutex); | |
1741 | list_for_each_entry_safe(child, next, &remote_children, remote_sibling) | |
1742 | if (cpumask_intersects(child->effective_cpus, delmask)) { | |
1743 | remote_partition_disable(child, tmp); | |
1744 | disable_cnt++; | |
1745 | } | |
1746 | if (disable_cnt) | |
1747 | rebuild_sched_domains_locked(); | |
1748 | } | |
1749 | ||
4a74e418 WL |
1750 | /* |
1751 | * prstate_housekeeping_conflict - check for partition & housekeeping conflicts | |
1752 | * @prstate: partition root state to be checked | |
1753 | * @new_cpus: cpu mask | |
1754 | * Return: true if there is conflict, false otherwise | |
1755 | * | |
1756 | * CPUs outside of housekeeping_cpumask(HK_TYPE_DOMAIN) can only be used in | |
1757 | * an isolated partition. | |
1758 | */ | |
1759 | static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus) | |
1760 | { | |
1761 | const struct cpumask *hk_domain = housekeeping_cpumask(HK_TYPE_DOMAIN); | |
1762 | bool all_in_hk = cpumask_subset(new_cpus, hk_domain); | |
1763 | ||
1764 | if (!all_in_hk && (prstate != PRS_ISOLATED)) | |
1765 | return true; | |
1766 | ||
1767 | return false; | |
1768 | } | |
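/*
 * For example, on a system assumed to be booted with
 * "isolcpus=domain,4-7", CPUs 4-7 are outside HK_TYPE_DOMAIN.  Asking
 * for a "root" partition that includes CPU 5 conflicts (returns true),
 * whereas an "isolated" partition with the same CPUs does not.
 */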
1769 | ||
ee8dde0c | 1770 | /** |
0c7f293e | 1771 | * update_parent_effective_cpumask - update effective_cpus mask of parent cpuset |
32a47817 | 1772 | * @cs: The cpuset that requests change in partition root state |
ee8dde0c WL |
1773 | * @cmd: Partition root state change command |
1774 | * @newmask: Optional new cpumask for partcmd_update | |
1775 | * @tmp: Temporary addmask and delmask | |
7476a636 | 1776 | * Return: 0 or a partition root state error code |
ee8dde0c | 1777 | * |
11e5f407 WL |
1778 | * For partcmd_enable*, the cpuset is being transformed from a non-partition |
1779 | * root to a partition root. The effective_xcpus (cpus_allowed if | |
1780 | * effective_xcpus not set) mask of the given cpuset will be taken away from | |
1781 | * parent's effective_cpus. The function returns 0 if all the CPUs listed | |
1782 | * in effective_xcpus can be granted, or an error code otherwise. | |
ee8dde0c | 1783 | * |
f9da322e | 1784 | * For partcmd_disable, the cpuset is being transformed from a partition |
0c7f293e WL |
1785 | * root back to a non-partition root. Any CPUs in effective_xcpus will be |
1786 | * given back to parent's effective_cpus. 0 will always be returned. | |
ee8dde0c | 1787 | * |
f0af1bfc | 1788 | * For partcmd_update, if the optional newmask is specified, the cpu list is |
0c7f293e | 1789 | * to be changed from effective_xcpus to newmask. Otherwise, effective_xcpus is |
f0af1bfc WL |
1790 | * assumed to remain the same. The cpuset should either be a valid or invalid |
1791 | * partition root. The partition root state may change from valid to invalid | |
0c7f293e | 1792 | * or vice versa. An error code will be returned if transitioning from |
f0af1bfc | 1793 | * invalid to valid violates the exclusivity rule. |
ee8dde0c | 1794 | * |
d7c8142d WL |
1795 | * For partcmd_invalidate, the current partition will be made invalid. |
1796 | * | |
11e5f407 | 1797 | * The partcmd_enable* and partcmd_disable commands are used by |
f0af1bfc WL |
1798 | * update_prstate(). An error code may be returned and the caller will check |
1799 | * for error. | |
ee8dde0c | 1800 | * |
f0af1bfc | 1801 | * The partcmd_update command is used by update_cpumasks_hier() with newmask |
d7c8142d WL |
1802 | * NULL and update_cpumask() with newmask set. The partcmd_invalidate is used |
1803 | * by update_cpumask() with NULL newmask. In both cases, the callers won't | |
1804 | * check for error and so partition_root_state and prs_error will be updated | |
1805 | * directly. | |
ee8dde0c | 1806 | */ |
0c7f293e WL |
1807 | static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, |
1808 | struct cpumask *newmask, | |
1809 | struct tmpmasks *tmp) | |
ee8dde0c | 1810 | { |
18065ebe | 1811 | struct cpuset *parent = parent_cs(cs); |
0c7f293e WL |
1812 | int adding; /* Adding cpus to parent's effective_cpus */ |
1813 | int deleting; /* Deleting cpus from parent's effective_cpus */ | |
e7cc9888 | 1814 | int old_prs, new_prs; |
7476a636 | 1815 | int part_error = PERR_NONE; /* Partition error? */ |
0c7f293e WL |
1816 | int subparts_delta = 0; |
1817 | struct cpumask *xcpus; /* cs effective_xcpus */ | |
72c6303a | 1818 | int isolcpus_updated = 0; |
0c7f293e | 1819 | bool nocpu; |
ee8dde0c | 1820 | |
111cd11b | 1821 | lockdep_assert_held(&cpuset_mutex); |
ee8dde0c | 1822 | |
0c7f293e WL |
1823 | /* |
1824 | * new_prs will only be changed for the partcmd_update and | |
1825 | * partcmd_invalidate commands. | |
1826 | */ | |
1827 | adding = deleting = false; | |
1828 | old_prs = new_prs = cs->partition_root_state; | |
e2ffe502 | 1829 | xcpus = !cpumask_empty(cs->exclusive_cpus) |
0c7f293e WL |
1830 | ? cs->effective_xcpus : cs->cpus_allowed; |
1831 | ||
1832 | if (cmd == partcmd_invalidate) { | |
1833 | if (is_prs_invalid(old_prs)) | |
1834 | return 0; | |
1835 | ||
1836 | /* | |
1837 | * Make the current partition invalid. | |
1838 | */ | |
1839 | if (is_partition_valid(parent)) | |
1840 | adding = cpumask_and(tmp->addmask, | |
1841 | xcpus, parent->effective_xcpus); | |
1842 | if (old_prs > 0) { | |
1843 | new_prs = -old_prs; | |
1844 | subparts_delta--; | |
1845 | } | |
1846 | goto write_error; | |
1847 | } | |
1848 | ||
ee8dde0c WL |
1849 | /* |
1850 | * The parent must be a partition root. | |
1851 | * The new cpumask, if present, or the current cpus_allowed must | |
1852 | * not be empty. | |
1853 | */ | |
7476a636 WL |
1854 | if (!is_partition_valid(parent)) { |
1855 | return is_partition_invalid(parent) | |
1856 | ? PERR_INVPARENT : PERR_NOTPART; | |
1857 | } | |
a86ce680 | 1858 | if (!newmask && cpumask_empty(cs->cpus_allowed)) |
7476a636 | 1859 | return PERR_CPUSEMPTY; |
ee8dde0c | 1860 | |
0c7f293e WL |
1861 | nocpu = tasks_nocpu_error(parent, cs, xcpus); |
1862 | ||
11e5f407 | 1863 | if ((cmd == partcmd_enable) || (cmd == partcmd_enablei)) { |
e2d59900 | 1864 | /* |
0c7f293e WL |
1865 | * Enabling partition root is not allowed if its |
1866 | * effective_xcpus is empty or doesn't overlap with | |
1867 | * parent's effective_xcpus. | |
e2d59900 | 1868 | */ |
0c7f293e WL |
1869 | if (cpumask_empty(xcpus) || |
1870 | !cpumask_intersects(xcpus, parent->effective_xcpus)) | |
7476a636 | 1871 | return PERR_INVCPUS; |
e2d59900 | 1872 | |
4a74e418 WL |
1873 | if (prstate_housekeeping_conflict(new_prs, xcpus)) |
1874 | return PERR_HKEEPING; | |
1875 | ||
e2d59900 WL |
1876 | /* |
1877 | * A parent can be left with no CPU as long as there is no | |
f0af1bfc | 1878 | * task directly associated with the parent partition. |
e2d59900 | 1879 | */ |
0c7f293e | 1880 | if (nocpu) |
7476a636 | 1881 | return PERR_NOCPUS; |
e2d59900 | 1882 | |
0c7f293e WL |
1883 | cpumask_copy(tmp->delmask, xcpus); |
1884 | deleting = true; | |
1885 | subparts_delta++; | |
11e5f407 | 1886 | new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED; |
ee8dde0c | 1887 | } else if (cmd == partcmd_disable) { |
f0af1bfc | 1888 | /* |
181c8e09 | 1889 | * May need to add cpus to parent's effective_cpus for |
0c7f293e | 1890 | * valid partition root. |
f0af1bfc | 1891 | */ |
0c7f293e WL |
1892 | adding = !is_prs_invalid(old_prs) && |
1893 | cpumask_and(tmp->addmask, xcpus, parent->effective_xcpus); | |
1894 | if (adding) | |
1895 | subparts_delta--; | |
11e5f407 | 1896 | new_prs = PRS_MEMBER; |
0c7f293e | 1897 | } else if (newmask) { |
d7c8142d | 1898 | /* |
0c7f293e | 1899 | * Empty cpumask is not allowed |
d7c8142d | 1900 | */ |
0c7f293e WL |
1901 | if (cpumask_empty(newmask)) { |
1902 | part_error = PERR_CPUSEMPTY; | |
1903 | goto write_error; | |
d7c8142d | 1904 | } |
0c7f293e | 1905 | |
ee8dde0c WL |
1906 | /* |
1907 | * partcmd_update with newmask: | |
1908 | * | |
0c7f293e | 1909 | * Compute add/delete mask to/from effective_cpus |
f0af1bfc | 1910 | * |
46c521ba WL |
1911 | * For valid partition: |
1912 | * addmask = exclusive_cpus & ~newmask | |
1913 | * & parent->effective_xcpus | |
1914 | * delmask = newmask & ~exclusive_cpus | |
1915 | * & parent->effective_xcpus | |
1916 | * | |
1917 | * For invalid partition: | |
1918 | * delmask = newmask & parent->effective_xcpus | |
ee8dde0c | 1919 | */ |
46c521ba WL |
1920 | if (is_prs_invalid(old_prs)) { |
1921 | adding = false; | |
1922 | deleting = cpumask_and(tmp->delmask, | |
1923 | newmask, parent->effective_xcpus); | |
1924 | } else { | |
1925 | cpumask_andnot(tmp->addmask, xcpus, newmask); | |
1926 | adding = cpumask_and(tmp->addmask, tmp->addmask, | |
1927 | parent->effective_xcpus); | |
ee8dde0c | 1928 | |
46c521ba WL |
1929 | cpumask_andnot(tmp->delmask, newmask, xcpus); |
1930 | deleting = cpumask_and(tmp->delmask, tmp->delmask, | |
1931 | parent->effective_xcpus); | |
1932 | } | |
ee8dde0c | 1933 | /* |
f0af1bfc WL |
1934 | * Make partition invalid if parent's effective_cpus could |
1935 | * become empty and there are tasks in the parent. | |
ee8dde0c | 1936 | */ |
0c7f293e WL |
1937 | if (nocpu && (!adding || |
1938 | !cpumask_intersects(tmp->addmask, cpu_active_mask))) { | |
7476a636 | 1939 | part_error = PERR_NOCPUS; |
0c7f293e WL |
1940 | deleting = false; |
1941 | adding = cpumask_and(tmp->addmask, | |
1942 | xcpus, parent->effective_xcpus); | |
4b842da2 | 1943 | } |
ee8dde0c WL |
1944 | } else { |
1945 | /* | |
0c7f293e | 1946 | * partcmd_update w/o newmask |
ee8dde0c | 1947 | * |
0c7f293e | 1948 | * delmask = effective_xcpus & parent->effective_cpus |
ee8dde0c | 1949 | * |
0c7f293e WL |
1950 | * This can be called from: |
1951 | * 1) update_cpumasks_hier() | |
1952 | * 2) cpuset_hotplug_update_tasks() | |
1953 | * | |
1954 | * Check to see if it can be transitioned from valid to | |
1955 | * invalid partition or vice versa. | |
1956 | * | |
1957 | * A partition error happens when parent has tasks and all | |
1958 | * its effective CPUs will have to be distributed out. | |
ee8dde0c | 1959 | */ |
0c7f293e WL |
1960 | WARN_ON_ONCE(!is_partition_valid(parent)); |
1961 | if (nocpu) { | |
7476a636 | 1962 | part_error = PERR_NOCPUS; |
0c7f293e WL |
1963 | if (is_partition_valid(cs)) |
1964 | adding = cpumask_and(tmp->addmask, | |
1965 | xcpus, parent->effective_xcpus); | |
1966 | } else if (is_partition_invalid(cs) && | |
1967 | cpumask_subset(xcpus, parent->effective_xcpus)) { | |
1968 | struct cgroup_subsys_state *css; | |
1969 | struct cpuset *child; | |
1970 | bool exclusive = true; | |
f0af1bfc | 1971 | |
0c7f293e WL |
1972 | /* |
1973 | * Converting an invalid partition to a valid one | |
1974 | * has to pass the cpu exclusivity test. | |
1975 | */ | |
1976 | rcu_read_lock(); | |
1977 | cpuset_for_each_child(child, css, parent) { | |
1978 | if (child == cs) | |
1979 | continue; | |
783a8334 | 1980 | if (!cpusets_are_exclusive(cs, child)) { |
0c7f293e WL |
1981 | exclusive = false; |
1982 | break; | |
1983 | } | |
1984 | } | |
1985 | rcu_read_unlock(); | |
1986 | if (exclusive) | |
1987 | deleting = cpumask_and(tmp->delmask, | |
1988 | xcpus, parent->effective_cpus); | |
1989 | else | |
1990 | part_error = PERR_NOTEXCL; | |
1991 | } | |
3881b861 | 1992 | } |
0c7f293e WL |
1993 | |
1994 | write_error: | |
7476a636 WL |
1995 | if (part_error) |
1996 | WRITE_ONCE(cs->prs_err, part_error); | |
3881b861 WL |
1997 | |
1998 | if (cmd == partcmd_update) { | |
3881b861 | 1999 | /* |
f28e2244 WL |
2000 | * Check for possible transition between valid and invalid |
2001 | * partition root. | |
3881b861 | 2002 | */ |
18065ebe WL |
2003 | switch (cs->partition_root_state) { |
2004 | case PRS_ROOT: | |
f28e2244 | 2005 | case PRS_ISOLATED: |
0c7f293e | 2006 | if (part_error) { |
f28e2244 | 2007 | new_prs = -old_prs; |
0c7f293e WL |
2008 | subparts_delta--; |
2009 | } | |
3881b861 | 2010 | break; |
18065ebe | 2011 | case PRS_INVALID_ROOT: |
f28e2244 | 2012 | case PRS_INVALID_ISOLATED: |
0c7f293e | 2013 | if (!part_error) { |
f28e2244 | 2014 | new_prs = -old_prs; |
0c7f293e WL |
2015 | subparts_delta++; |
2016 | } | |
3881b861 WL |
2017 | break; |
2018 | } | |
ee8dde0c WL |
2019 | } |
2020 | ||
e7cc9888 | 2021 | if (!adding && !deleting && (new_prs == old_prs)) |
ee8dde0c WL |
2022 | return 0; |
2023 | ||
f0af1bfc WL |
2024 | /* |
2025 | * Transitioning from invalid to valid or vice versa may require | |
46c521ba WL |
2026 | * changing CS_CPU_EXCLUSIVE. In the case of partcmd_update, |
2027 | * validate_change() has already been successfully called and | |
2028 | * CPU lists in cs haven't been updated yet. So defer it to later. | |
f0af1bfc | 2029 | */ |
46c521ba | 2030 | if ((old_prs != new_prs) && (cmd != partcmd_update)) { |
a86ce680 WL |
2031 | int err = update_partition_exclusive(cs, new_prs); |
2032 | ||
2033 | if (err) | |
2034 | return err; | |
f0af1bfc WL |
2035 | } |
2036 | ||
ee8dde0c | 2037 | /* |
0c7f293e WL |
2038 | * Change the parent's effective_cpus & effective_xcpus (top cpuset |
2039 | * only). | |
2040 | * | |
ee8dde0c WL |
2041 | * Newly added CPUs will be removed from effective_cpus and |
2042 | * newly deleted ones will be added back to effective_cpus. | |
2043 | */ | |
2044 | spin_lock_irq(&callback_lock); | |
0c7f293e | 2045 | if (old_prs != new_prs) { |
18065ebe | 2046 | cs->partition_root_state = new_prs; |
0c7f293e WL |
2047 | if (new_prs <= 0) |
2048 | cs->nr_subparts = 0; | |
2049 | } | |
11e5f407 WL |
2050 | /* |
2051 | * Adding to parent's effective_cpus means deleting CPUs from cs | |
2052 | * and vice versa. | |
2053 | */ | |
2054 | if (adding) | |
72c6303a WL |
2055 | isolcpus_updated += partition_xcpus_del(old_prs, parent, |
2056 | tmp->addmask); | |
11e5f407 | 2057 | if (deleting) |
72c6303a WL |
2058 | isolcpus_updated += partition_xcpus_add(new_prs, parent, |
2059 | tmp->delmask); | |
e7cc9888 | 2060 | |
11e5f407 WL |
2061 | if (is_partition_valid(parent)) { |
2062 | parent->nr_subparts += subparts_delta; | |
2063 | WARN_ON_ONCE(parent->nr_subparts < 0); | |
2064 | } | |
ee8dde0c | 2065 | spin_unlock_irq(&callback_lock); |
72c6303a | 2066 | update_unbound_workqueue_cpumask(isolcpus_updated); |
f0af1bfc | 2067 | |
46c521ba WL |
2068 | if ((old_prs != new_prs) && (cmd == partcmd_update)) |
2069 | update_partition_exclusive(cs, new_prs); | |
2070 | ||
99fe36ba | 2071 | if (adding || deleting) { |
292fd843 | 2072 | update_tasks_cpumask(parent, tmp->addmask); |
e2ffe502 | 2073 | update_sibling_cpumasks(parent, cs, tmp); |
99fe36ba | 2074 | } |
f0af1bfc | 2075 | |
f28e2244 | 2076 | /* |
a86ce680 | 2077 | * For partcmd_update without newmask, it is being called from |
2125c003 WL |
2078 | * cpuset_handle_hotplug(). Update the load balance flag and |
2079 | * scheduling domain accordingly. | |
f28e2244 | 2080 | */ |
2125c003 | 2081 | if ((cmd == partcmd_update) && !newmask) |
a86ce680 | 2082 | update_partition_sd_lb(cs, old_prs); |
a86ce680 | 2083 | |
18065ebe | 2084 | notify_partition_change(cs, old_prs); |
f0af1bfc | 2085 | return 0; |
ee8dde0c WL |
2086 | } |
2087 | ||
0c7f293e WL |
2088 | /** |
2089 | * compute_partition_effective_cpumask - compute effective_cpus for partition | |
2090 | * @cs: partition root cpuset | |
2091 | * @new_ecpus: previously computed effective_cpus to be updated | |
2092 | * | |
2093 | * Compute the effective_cpus of a partition root by scanning effective_xcpus | |
181c8e09 | 2094 | * of child partition roots and excluding their effective_xcpus. |
0c7f293e WL |
2095 | * |
2096 | * This has the side effect of invalidating valid child partition roots, | |
2097 | * if necessary. Since it is called from either cpuset_hotplug_update_tasks() | |
2098 | * or update_cpumasks_hier() where parent and children are modified | |
2099 | * successively, we don't need to call update_parent_effective_cpumask() | |
2100 | * and the child's effective_cpus will be updated in later iterations. | |
2101 | * | |
2102 | * Note that rcu_read_lock() is assumed to be held. | |
2103 | */ | |
2104 | static void compute_partition_effective_cpumask(struct cpuset *cs, | |
2105 | struct cpumask *new_ecpus) | |
2106 | { | |
2107 | struct cgroup_subsys_state *css; | |
2108 | struct cpuset *child; | |
2109 | bool populated = partition_is_populated(cs, NULL); | |
2110 | ||
2111 | /* | |
2112 | * Check child partition roots to see if they should be | |
2113 | * invalidated when | |
2114 | * 1) child effective_xcpus not a subset of new | |
2115 | * exclusive_cpus | |
2116 | * 2) All the effective_cpus will be used up and cp | |
2117 | * has tasks | |
2118 | */ | |
e2ffe502 WL |
2119 | compute_effective_exclusive_cpumask(cs, new_ecpus); |
2120 | cpumask_and(new_ecpus, new_ecpus, cpu_active_mask); | |
2121 | ||
0c7f293e WL |
2122 | rcu_read_lock(); |
2123 | cpuset_for_each_child(child, css, cs) { | |
2124 | if (!is_partition_valid(child)) | |
2125 | continue; | |
2126 | ||
2127 | child->prs_err = 0; | |
2128 | if (!cpumask_subset(child->effective_xcpus, | |
2129 | cs->effective_xcpus)) | |
2130 | child->prs_err = PERR_INVCPUS; | |
2131 | else if (populated && | |
2132 | cpumask_subset(new_ecpus, child->effective_xcpus)) | |
2133 | child->prs_err = PERR_NOCPUS; | |
2134 | ||
2135 | if (child->prs_err) { | |
2136 | int old_prs = child->partition_root_state; | |
2137 | ||
2138 | /* | |
2139 | * Invalidate child partition | |
2140 | */ | |
2141 | spin_lock_irq(&callback_lock); | |
2142 | make_partition_invalid(child); | |
2143 | cs->nr_subparts--; | |
2144 | child->nr_subparts = 0; | |
2145 | spin_unlock_irq(&callback_lock); | |
2146 | notify_partition_change(child, old_prs); | |
2147 | continue; | |
2148 | } | |
2149 | cpumask_andnot(new_ecpus, new_ecpus, | |
2150 | child->effective_xcpus); | |
2151 | } | |
2152 | rcu_read_unlock(); | |
2153 | } | |
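/*
 * Worked example (assumed values): if cs has effective exclusive CPUs
 * 0-7 and two valid child partitions own effective_xcpus 2-3 and 6-7,
 * the resulting new_ecpus is 0-1,4-5.  A child whose effective_xcpus
 * is no longer a subset of cs's, or that would consume every remaining
 * CPU of a populated cs, is invalidated instead.
 */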
2154 | ||
3ae0b773 WL |
2155 | /* |
2156 | * update_cpumasks_hier() flags | |
2157 | */ | |
2158 | #define HIER_CHECKALL 0x01 /* Check all cpusets with no skipping */ | |
2159 | #define HIER_NO_SD_REBUILD 0x02 /* Don't rebuild sched domains */ | |
2160 | ||
5c5cc623 | 2161 | /* |
734d4513 | 2162 | * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree |
ee8dde0c WL |
2163 | * @cs: the cpuset to consider |
2164 | * @tmp: temp variables for calculating effective_cpus & partition setup | |
f0af1bfc | 2165 | * @force: don't skip any descendant cpusets if set |
734d4513 | 2166 | * |
415de5fd | 2167 | * When configured cpumask is changed, the effective cpumasks of this cpuset |
734d4513 | 2168 | * and all its descendants need to be updated. |
5c5cc623 | 2169 | * |
415de5fd | 2170 | * On legacy hierarchy, effective_cpus will be the same with cpu_allowed. |
5c5cc623 | 2171 | * |
111cd11b | 2172 | * Called with cpuset_mutex held |
5c5cc623 | 2173 | */ |
f0af1bfc | 2174 | static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, |
3ae0b773 | 2175 | int flags) |
5c5cc623 LZ |
2176 | { |
2177 | struct cpuset *cp; | |
492eb21b | 2178 | struct cgroup_subsys_state *pos_css; |
8b5f1c52 | 2179 | bool need_rebuild_sched_domains = false; |
e7cc9888 | 2180 | int old_prs, new_prs; |
5c5cc623 LZ |
2181 | |
2182 | rcu_read_lock(); | |
734d4513 LZ |
2183 | cpuset_for_each_descendant_pre(cp, pos_css, cs) { |
2184 | struct cpuset *parent = parent_cs(cp); | |
181c8e09 | 2185 | bool remote = is_remote_partition(cp); |
f0af1bfc | 2186 | bool update_parent = false; |
734d4513 | 2187 | |
181c8e09 WL |
2188 | /* |
2189 | * Skip a descendant remote partition that acquires CPUs | |
2190 | * directly from top cpuset unless it is cs. | |
2191 | */ | |
2192 | if (remote && (cp != cs)) { | |
2193 | pos_css = css_rightmost_descendant(pos_css); | |
2194 | continue; | |
2195 | } | |
734d4513 | 2196 | |
e2ffe502 WL |
2197 | /* |
2198 | * Update effective_xcpus if exclusive_cpus set. | |
2199 | * The case when exclusive_cpus isn't set is handled later. | |
2200 | */ | |
2201 | if (!cpumask_empty(cp->exclusive_cpus) && (cp != cs)) { | |
2202 | spin_lock_irq(&callback_lock); | |
2203 | compute_effective_exclusive_cpumask(cp, NULL); | |
2204 | spin_unlock_irq(&callback_lock); | |
2205 | } | |
2206 | ||
181c8e09 WL |
2207 | old_prs = new_prs = cp->partition_root_state; |
2208 | if (remote || (is_partition_valid(parent) && | |
2209 | is_partition_valid(cp))) | |
0c7f293e | 2210 | compute_partition_effective_cpumask(cp, tmp->new_cpus); |
181c8e09 WL |
2211 | else |
2212 | compute_effective_cpumask(tmp->new_cpus, cp, parent); | |
0c7f293e WL |
2213 | |
2214 | /* | |
2215 | * A partition with no effective_cpus is allowed as long as | |
2216 | * there is no task associated with it. Call | |
2217 | * update_parent_effective_cpumask() to check it. | |
2218 | */ | |
2219 | if (is_partition_valid(cp) && cpumask_empty(tmp->new_cpus)) { | |
2220 | update_parent = true; | |
2221 | goto update_parent_effective; | |
2222 | } | |
2223 | ||
554b0d1c LZ |
2224 | /* |
2225 | * If it becomes empty, inherit the effective mask of the | |
e2d59900 WL |
2226 | * parent, which is guaranteed to have some CPUs unless |
2227 | * it is a partition root that has explicitly distributed | |
2228 | * out all its CPUs. | |
554b0d1c | 2229 | */ |
181c8e09 | 2230 | if (is_in_v2_mode() && !remote && cpumask_empty(tmp->new_cpus)) { |
ee8dde0c | 2231 | cpumask_copy(tmp->new_cpus, parent->effective_cpus); |
4716909c WL |
2232 | if (!cp->use_parent_ecpus) { |
2233 | cp->use_parent_ecpus = true; | |
2234 | parent->child_ecpus_count++; | |
2235 | } | |
2236 | } else if (cp->use_parent_ecpus) { | |
2237 | cp->use_parent_ecpus = false; | |
2238 | WARN_ON_ONCE(!parent->child_ecpus_count); | |
2239 | parent->child_ecpus_count--; | |
2240 | } | |
554b0d1c | 2241 | |
181c8e09 WL |
2242 | if (remote) |
2243 | goto get_css; | |
2244 | ||
ee8dde0c | 2245 | /* |
c8c92620 WL |
2246 | * Skip the whole subtree if |
2247 | * 1) the cpumask remains the same, | |
2248 | * 2) has no partition root state, | |
3ae0b773 | 2249 | * 3) HIER_CHECKALL flag not set, and |
c8c92620 | 2250 | * 4) for v2 load balance state same as its parent. |
ee8dde0c | 2251 | */ |
3ae0b773 | 2252 | if (!cp->partition_root_state && !(flags & HIER_CHECKALL) && |
c8c92620 WL |
2253 | cpumask_equal(tmp->new_cpus, cp->effective_cpus) && |
2254 | (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) || | |
2255 | (is_sched_load_balance(parent) == is_sched_load_balance(cp)))) { | |
734d4513 LZ |
2256 | pos_css = css_rightmost_descendant(pos_css); |
2257 | continue; | |
5c5cc623 | 2258 | } |
734d4513 | 2259 | |
0c7f293e | 2260 | update_parent_effective: |
ee8dde0c | 2261 | /* |
0c7f293e | 2262 | * update_parent_effective_cpumask() should have been called |
ee8dde0c WL |
2263 | * for cs already in update_cpumask(). We should also call |
2264 | * update_tasks_cpumask() again for tasks in the parent | |
0c7f293e | 2265 | * cpuset if the parent's effective_cpus changes. |
ee8dde0c | 2266 | */ |
e7cc9888 | 2267 | if ((cp != cs) && old_prs) { |
3881b861 | 2268 | switch (parent->partition_root_state) { |
18065ebe | 2269 | case PRS_ROOT: |
f28e2244 | 2270 | case PRS_ISOLATED: |
f0af1bfc | 2271 | update_parent = true; |
3881b861 WL |
2272 | break; |
2273 | ||
f0af1bfc | 2274 | default: |
3881b861 | 2275 | /* |
f0af1bfc WL |
2276 | * When parent is not a partition root or is |
2277 | * invalid, child partition roots become | |
2278 | * invalid too. | |
3881b861 | 2279 | */ |
f28e2244 WL |
2280 | if (is_partition_valid(cp)) |
2281 | new_prs = -cp->partition_root_state; | |
7476a636 WL |
2282 | WRITE_ONCE(cp->prs_err, |
2283 | is_partition_invalid(parent) | |
2284 | ? PERR_INVPARENT : PERR_NOTPART); | |
3881b861 WL |
2285 | break; |
2286 | } | |
ee8dde0c | 2287 | } |
181c8e09 | 2288 | get_css: |
ec903c0c | 2289 | if (!css_tryget_online(&cp->css)) |
5c5cc623 LZ |
2290 | continue; |
2291 | rcu_read_unlock(); | |
2292 | ||
f0af1bfc | 2293 | if (update_parent) { |
0c7f293e | 2294 | update_parent_effective_cpumask(cp, partcmd_update, NULL, tmp); |
f0af1bfc WL |
2295 | /* |
2296 | * The cpuset partition_root_state may become | |
2297 | * invalid. Capture it. | |
2298 | */ | |
2299 | new_prs = cp->partition_root_state; | |
2300 | } | |
2301 | ||
8447a0fe | 2302 | spin_lock_irq(&callback_lock); |
f0af1bfc | 2303 | cpumask_copy(cp->effective_cpus, tmp->new_cpus); |
f0af1bfc | 2304 | cp->partition_root_state = new_prs; |
e2ffe502 WL |
2305 | /* |
2306 | * Make sure effective_xcpus is properly set for a valid | |
2307 | * partition root. | |
2308 | */ | |
2309 | if ((new_prs > 0) && cpumask_empty(cp->exclusive_cpus)) | |
0c7f293e WL |
2310 | cpumask_and(cp->effective_xcpus, |
2311 | cp->cpus_allowed, parent->effective_xcpus); | |
181c8e09 WL |
2312 | else if (new_prs < 0) |
2313 | reset_partition_data(cp); | |
8447a0fe | 2314 | spin_unlock_irq(&callback_lock); |
f0af1bfc | 2315 | |
18065ebe | 2316 | notify_partition_change(cp, old_prs); |
734d4513 | 2317 | |
b8d1b8ee | 2318 | WARN_ON(!is_in_v2_mode() && |
734d4513 LZ |
2319 | !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); |
2320 | ||
e2ffe502 | 2321 | update_tasks_cpumask(cp, cp->effective_cpus); |
5c5cc623 | 2322 | |
c8c92620 WL |
2323 | /* |
2324 | * On default hierarchy, inherit the CS_SCHED_LOAD_BALANCE | |
2325 | * from parent if current cpuset isn't a valid partition root | |
2326 | * and their load balance states differ. | |
2327 | */ | |
2328 | if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && | |
2329 | !is_partition_valid(cp) && | |
2330 | (is_sched_load_balance(parent) != is_sched_load_balance(cp))) { | |
2331 | if (is_sched_load_balance(parent)) | |
2332 | set_bit(CS_SCHED_LOAD_BALANCE, &cp->flags); | |
2333 | else | |
2334 | clear_bit(CS_SCHED_LOAD_BALANCE, &cp->flags); | |
2335 | } | |
2336 | ||
8b5f1c52 | 2337 | /* |
0ccea8fe WL |
2338 | * On legacy hierarchy, if the effective cpumask of any non- |
2339 | * empty cpuset is changed, we need to rebuild sched domains. | |
2340 | * On default hierarchy, the cpuset needs to be a partition | |
2341 | * root as well. | |
8b5f1c52 LZ |
2342 | */ |
2343 | if (!cpumask_empty(cp->cpus_allowed) && | |
0ccea8fe WL |
2344 | is_sched_load_balance(cp) && |
2345 | (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) || | |
18065ebe | 2346 | is_partition_valid(cp))) |
8b5f1c52 LZ |
2347 | need_rebuild_sched_domains = true; |
2348 | ||
5c5cc623 LZ |
2349 | rcu_read_lock(); |
2350 | css_put(&cp->css); | |
2351 | } | |
2352 | rcu_read_unlock(); | |
8b5f1c52 | 2353 | |
3ae0b773 | 2354 | if (need_rebuild_sched_domains && !(flags & HIER_NO_SD_REBUILD)) |
8b5f1c52 | 2355 | rebuild_sched_domains_locked(); |
5c5cc623 LZ |
2356 | } |
2357 | ||
4716909c WL |
2358 | /** |
2359 | * update_sibling_cpumasks - Update siblings cpumasks | |
2360 | * @parent: Parent cpuset | |
2361 | * @cs: Current cpuset | |
2362 | * @tmp: Temp variables | |
2363 | */ | |
2364 | static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, | |
2365 | struct tmpmasks *tmp) | |
2366 | { | |
2367 | struct cpuset *sibling; | |
2368 | struct cgroup_subsys_state *pos_css; | |
2369 | ||
111cd11b | 2370 | lockdep_assert_held(&cpuset_mutex); |
2bdfd282 | 2371 | |
4716909c WL |
2372 | /* |
2373 | * Check all its siblings and call update_cpumasks_hier() | |
e2ffe502 WL |
2374 | * if their effective_cpus will need to be changed. |
2375 | * | |
2376 | * With the addition of effective_xcpus, which is a subset of | |
2377 | * cpus_allowed, it is possible that a change in parent's effective_cpus | |
2378 | * due to a change in a child partition's effective_xcpus will impact | |
2379 | * its siblings even if they do not inherit parent's effective_cpus | |
2380 | * directly. | |
2bdfd282 WL |
2381 | * |
2382 | * The update_cpumasks_hier() function may sleep. So we have to | |
3ae0b773 WL |
2383 | * release the RCU read lock before calling it. HIER_NO_SD_REBUILD |
2384 | * flag is used to suppress rebuild of sched domains as the callers | |
2385 | * will take care of that. | |
4716909c WL |
2386 | */ |
2387 | rcu_read_lock(); | |
2388 | cpuset_for_each_child(sibling, pos_css, parent) { | |
2389 | if (sibling == cs) | |
2390 | continue; | |
e2ffe502 WL |
2391 | if (!sibling->use_parent_ecpus && |
2392 | !is_partition_valid(sibling)) { | |
2393 | compute_effective_cpumask(tmp->new_cpus, sibling, | |
2394 | parent); | |
2395 | if (cpumask_equal(tmp->new_cpus, sibling->effective_cpus)) | |
2396 | continue; | |
2397 | } | |
2bdfd282 WL |
2398 | if (!css_tryget_online(&sibling->css)) |
2399 | continue; | |
4716909c | 2400 | |
2bdfd282 | 2401 | rcu_read_unlock(); |
3ae0b773 | 2402 | update_cpumasks_hier(sibling, tmp, HIER_NO_SD_REBUILD); |
2bdfd282 WL |
2403 | rcu_read_lock(); |
2404 | css_put(&sibling->css); | |
4716909c WL |
2405 | } |
2406 | rcu_read_unlock(); | |
2407 | } | |
2408 | ||
58f4790b CW |
2409 | /** |
2410 | * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it | |
2411 | * @cs: the cpuset to consider | |
fc34ac1d | 2412 | * @trialcs: trial cpuset |
58f4790b CW |
2413 | * @buf: buffer of cpu numbers written to this cpuset |
2414 | */ | |
645fcc9d LZ |
2415 | static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, |
2416 | const char *buf) | |
1da177e4 | 2417 | { |
58f4790b | 2418 | int retval; |
ee8dde0c | 2419 | struct tmpmasks tmp; |
0c7f293e | 2420 | struct cpuset *parent = parent_cs(cs); |
d7c8142d | 2421 | bool invalidate = false; |
e2ffe502 | 2422 | int hier_flags = 0; |
a86ce680 | 2423 | int old_prs = cs->partition_root_state; |
1da177e4 | 2424 | |
5f054e31 | 2425 | /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */ |
4c4d50f7 PJ |
2426 | if (cs == &top_cpuset) |
2427 | return -EACCES; | |
2428 | ||
6f7f02e7 | 2429 | /* |
c8d9c90c | 2430 | * An empty cpus_allowed is ok only if the cpuset has no tasks. |
020958b6 PJ |
2431 | * Since cpulist_parse() fails on an empty mask, we special case |
2432 | * that parsing. The validate_change() call ensures that cpusets | |
2433 | * with tasks have cpus. | |
6f7f02e7 | 2434 | */ |
020958b6 | 2435 | if (!*buf) { |
300ed6cb | 2436 | cpumask_clear(trialcs->cpus_allowed); |
0c7f293e | 2437 | cpumask_clear(trialcs->effective_xcpus); |
6f7f02e7 | 2438 | } else { |
300ed6cb | 2439 | retval = cpulist_parse(buf, trialcs->cpus_allowed); |
6f7f02e7 DR |
2440 | if (retval < 0) |
2441 | return retval; | |
37340746 | 2442 | |
5d8ba82c LZ |
2443 | if (!cpumask_subset(trialcs->cpus_allowed, |
2444 | top_cpuset.cpus_allowed)) | |
37340746 | 2445 | return -EINVAL; |
0c7f293e WL |
2446 | |
2447 | /* | |
e2ffe502 WL |
2448 | * When exclusive_cpus isn't explicitly set, it is constrained | |
2449 | * by cpus_allowed and parent's effective_xcpus. Otherwise, | |
2450 | * trialcs->effective_xcpus is used as a temporary cpumask | |
2451 | * for checking validity of the partition root. | |
0c7f293e | 2452 | */ |
e2ffe502 WL |
2453 | if (!cpumask_empty(trialcs->exclusive_cpus) || is_partition_valid(cs)) |
2454 | compute_effective_exclusive_cpumask(trialcs, NULL); | |
6f7f02e7 | 2455 | } |
029190c5 | 2456 | |
8707d8b8 | 2457 | /* Nothing to do if the cpus didn't change */ |
300ed6cb | 2458 | if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) |
8707d8b8 | 2459 | return 0; |
58f4790b | 2460 | |
99fe36ba WL |
2461 | if (alloc_cpumasks(NULL, &tmp)) |
2462 | return -ENOMEM; | |
ee8dde0c | 2463 | |
46c521ba WL |
2464 | if (old_prs) { |
2465 | if (is_partition_valid(cs) && | |
2466 | cpumask_empty(trialcs->effective_xcpus)) { | |
0c7f293e WL |
2467 | invalidate = true; |
2468 | cs->prs_err = PERR_INVCPUS; | |
4a74e418 WL |
2469 | } else if (prstate_housekeeping_conflict(old_prs, trialcs->effective_xcpus)) { |
2470 | invalidate = true; | |
2471 | cs->prs_err = PERR_HKEEPING; | |
0c7f293e WL |
2472 | } else if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) { |
2473 | invalidate = true; | |
2474 | cs->prs_err = PERR_NOCPUS; | |
2475 | } | |
2476 | } | |
2477 | ||
e2ffe502 WL |
2478 | /* |
2479 | * Check all the descendants in update_cpumasks_hier() if | |
2480 | * effective_xcpus is to be changed. | |
2481 | */ | |
2482 | if (!cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus)) | |
2483 | hier_flags = HIER_CHECKALL; | |
2484 | ||
d7c8142d WL |
2485 | retval = validate_change(cs, trialcs); |
2486 | ||
2487 | if ((retval == -EINVAL) && cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) { | |
d7c8142d | 2488 | struct cgroup_subsys_state *css; |
0c7f293e | 2489 | struct cpuset *cp; |
d7c8142d WL |
2490 | |
2491 | /* | |
2492 | * The -EINVAL error code indicates that the partition sibling | |
2493 | * CPU exclusivity rule has been violated. We still allow | |
2494 | * the cpumask change to proceed while invalidating the | |
2495 | * partition. However, any conflicting sibling partitions | |
2496 | * have to be marked as invalid too. | |
2497 | */ | |
2498 | invalidate = true; | |
2499 | rcu_read_lock(); | |
46c521ba WL |
2500 | cpuset_for_each_child(cp, css, parent) { |
2501 | struct cpumask *xcpus = fetch_xcpus(trialcs); | |
2502 | ||
d7c8142d | 2503 | if (is_partition_valid(cp) && |
46c521ba | 2504 | cpumask_intersects(xcpus, cp->effective_xcpus)) { |
d7c8142d | 2505 | rcu_read_unlock(); |
0c7f293e | 2506 | update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, &tmp); |
d7c8142d WL |
2507 | rcu_read_lock(); |
2508 | } | |
46c521ba | 2509 | } |
d7c8142d WL |
2510 | rcu_read_unlock(); |
2511 | retval = 0; | |
2512 | } | |
0c7f293e | 2513 | |
d7c8142d | 2514 | if (retval < 0) |
99fe36ba | 2515 | goto out_free; |
d7c8142d | 2516 | |
46c521ba WL |
2517 | if (is_partition_valid(cs) || |
2518 | (is_partition_invalid(cs) && !invalidate)) { | |
2519 | struct cpumask *xcpus = trialcs->effective_xcpus; | |
2520 | ||
2521 | if (cpumask_empty(xcpus) && is_partition_invalid(cs)) | |
2522 | xcpus = trialcs->cpus_allowed; | |
2523 | ||
181c8e09 WL |
2524 | /* |
2525 | * Call remote_cpus_update() to handle valid remote partition | |
2526 | */ | |
2527 | if (is_remote_partition(cs)) | |
46c521ba | 2528 | remote_cpus_update(cs, xcpus, &tmp); |
181c8e09 | 2529 | else if (invalidate) |
0c7f293e WL |
2530 | update_parent_effective_cpumask(cs, partcmd_invalidate, |
2531 | NULL, &tmp); | |
d7c8142d | 2532 | else |
0c7f293e | 2533 | update_parent_effective_cpumask(cs, partcmd_update, |
46c521ba | 2534 | xcpus, &tmp); |
181c8e09 WL |
2535 | } else if (!cpumask_empty(cs->exclusive_cpus)) { |
2536 | /* | |
2537 | * Use trialcs->effective_cpus as a temp cpumask | |
2538 | */ | |
2539 | remote_partition_check(cs, trialcs->effective_xcpus, | |
2540 | trialcs->effective_cpus, &tmp); | |
ee8dde0c WL |
2541 | } |
2542 | ||
8447a0fe | 2543 | spin_lock_irq(&callback_lock); |
300ed6cb | 2544 | cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); |
e2ffe502 WL |
2545 | cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus); |
2546 | if ((old_prs > 0) && !is_partition_valid(cs)) | |
2547 | reset_partition_data(cs); | |
8447a0fe | 2548 | spin_unlock_irq(&callback_lock); |
029190c5 | 2549 | |
e2ffe502 WL |
2550 | /* effective_cpus/effective_xcpus will be updated here */ |
2551 | update_cpumasks_hier(cs, &tmp, hier_flags); | |
4716909c | 2552 | |
0c7f293e WL |
2553 | /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */ |
2554 | if (cs->partition_root_state) | |
a86ce680 | 2555 | update_partition_sd_lb(cs, old_prs); |
99fe36ba WL |
2556 | out_free: |
2557 | free_cpumasks(NULL, &tmp); | |
25125a47 | 2558 | return retval; |
1da177e4 LT |
2559 | } |
2560 | ||
e2ffe502 WL |
2561 | /** |
2562 | * update_exclusive_cpumask - update the exclusive_cpus mask of a cpuset | |
2563 | * @cs: the cpuset to consider | |
2564 | * @trialcs: trial cpuset | |
2565 | * @buf: buffer of cpu numbers written to this cpuset | |
2566 | * | |
2567 | * The tasks' cpumask will be updated if cs is a valid partition root. | |
2568 | */ | |
2569 | static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, | |
2570 | const char *buf) | |
2571 | { | |
2572 | int retval; | |
2573 | struct tmpmasks tmp; | |
2574 | struct cpuset *parent = parent_cs(cs); | |
2575 | bool invalidate = false; | |
2576 | int hier_flags = 0; | |
2577 | int old_prs = cs->partition_root_state; | |
2578 | ||
2579 | if (!*buf) { | |
2580 | cpumask_clear(trialcs->exclusive_cpus); | |
181c8e09 | 2581 | cpumask_clear(trialcs->effective_xcpus); |
e2ffe502 WL |
2582 | } else { |
2583 | retval = cpulist_parse(buf, trialcs->exclusive_cpus); | |
2584 | if (retval < 0) | |
2585 | return retval; | |
2586 | if (!is_cpu_exclusive(cs)) | |
2587 | set_bit(CS_CPU_EXCLUSIVE, &trialcs->flags); | |
2588 | } | |
2589 | ||
2590 | /* Nothing to do if the CPUs didn't change */ | |
2591 | if (cpumask_equal(cs->exclusive_cpus, trialcs->exclusive_cpus)) | |
2592 | return 0; | |
2593 | ||
181c8e09 WL |
2594 | if (*buf) |
2595 | compute_effective_exclusive_cpumask(trialcs, NULL); | |
e2ffe502 WL |
2596 | |
2597 | /* | |
2598 | * Check all the descendants in update_cpumasks_hier() if | |
2599 | * effective_xcpus is to be changed. | |
2600 | */ | |
2601 | if (!cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus)) | |
2602 | hier_flags = HIER_CHECKALL; | |
2603 | ||
2604 | retval = validate_change(cs, trialcs); | |
2605 | if (retval) | |
2606 | return retval; | |
2607 | ||
66f40b92 WL |
2608 | if (alloc_cpumasks(NULL, &tmp)) |
2609 | return -ENOMEM; | |
2610 | ||
46c521ba | 2611 | if (old_prs) { |
e2ffe502 WL |
2612 | if (cpumask_empty(trialcs->effective_xcpus)) { |
2613 | invalidate = true; | |
2614 | cs->prs_err = PERR_INVCPUS; | |
4a74e418 WL |
2615 | } else if (prstate_housekeeping_conflict(old_prs, trialcs->effective_xcpus)) { |
2616 | invalidate = true; | |
2617 | cs->prs_err = PERR_HKEEPING; | |
e2ffe502 WL |
2618 | } else if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) { |
2619 | invalidate = true; | |
2620 | cs->prs_err = PERR_NOCPUS; | |
2621 | } | |
2622 | ||
181c8e09 WL |
2623 | if (is_remote_partition(cs)) { |
2624 | if (invalidate) | |
2625 | remote_partition_disable(cs, &tmp); | |
2626 | else | |
2627 | remote_cpus_update(cs, trialcs->effective_xcpus, | |
2628 | &tmp); | |
2629 | } else if (invalidate) { | |
e2ffe502 WL |
2630 | update_parent_effective_cpumask(cs, partcmd_invalidate, |
2631 | NULL, &tmp); | |
181c8e09 | 2632 | } else { |
e2ffe502 WL |
2633 | update_parent_effective_cpumask(cs, partcmd_update, |
2634 | trialcs->effective_xcpus, &tmp); | |
181c8e09 WL |
2635 | } |
2636 | } else if (!cpumask_empty(trialcs->exclusive_cpus)) { | |
2637 | /* | |
2638 | * Use trialcs->effective_cpus as a temp cpumask | |
2639 | */ | |
2640 | remote_partition_check(cs, trialcs->effective_xcpus, | |
2641 | trialcs->effective_cpus, &tmp); | |
e2ffe502 | 2642 | } |
e2ffe502 WL |
2643 | spin_lock_irq(&callback_lock); |
2644 | cpumask_copy(cs->exclusive_cpus, trialcs->exclusive_cpus); | |
2645 | cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus); | |
2646 | if ((old_prs > 0) && !is_partition_valid(cs)) | |
2647 | reset_partition_data(cs); | |
2648 | spin_unlock_irq(&callback_lock); | |
2649 | ||
2650 | /* | |
2651 | * Call update_cpumasks_hier() to update effective_cpus/effective_xcpus | |
2652 | * of the subtree when it is a valid partition root or effective_xcpus | |
2653 | * is updated. | |
2654 | */ | |
2655 | if (is_partition_valid(cs) || hier_flags) | |
2656 | update_cpumasks_hier(cs, &tmp, hier_flags); | |
2657 | ||
2658 | /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */ | |
2659 | if (cs->partition_root_state) | |
2660 | update_partition_sd_lb(cs, old_prs); | |
2661 | ||
2662 | free_cpumasks(NULL, &tmp); | |
2663 | return 0; | |
2664 | } | |
2665 | ||
e4e364e8 | 2666 | /* |
e93ad19d TH |
2667 | * Migrate memory region from one set of nodes to another. This is |
2668 | * performed asynchronously as it can be called from process migration path | |
2669 | * holding locks involved in process management. All mm migrations are | |
2670 | * performed in the queued order and can be waited for by flushing | |
2671 | * cpuset_migrate_mm_wq. | |
e4e364e8 PJ |
2672 | */ |
2673 | ||
e93ad19d TH |
2674 | struct cpuset_migrate_mm_work { |
2675 | struct work_struct work; | |
2676 | struct mm_struct *mm; | |
2677 | nodemask_t from; | |
2678 | nodemask_t to; | |
2679 | }; | |
2680 | ||
2681 | static void cpuset_migrate_mm_workfn(struct work_struct *work) | |
2682 | { | |
2683 | struct cpuset_migrate_mm_work *mwork = | |
2684 | container_of(work, struct cpuset_migrate_mm_work, work); | |
2685 | ||
2686 | /* on a wq worker, no need to worry about %current's mems_allowed */ | |
2687 | do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL); | |
2688 | mmput(mwork->mm); | |
2689 | kfree(mwork); | |
2690 | } | |
2691 | ||
e4e364e8 PJ |
2692 | static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, |
2693 | const nodemask_t *to) | |
2694 | { | |
e93ad19d | 2695 | struct cpuset_migrate_mm_work *mwork; |
e4e364e8 | 2696 | |
9f72daf7 NSJ |
2697 | if (nodes_equal(*from, *to)) { |
2698 | mmput(mm); | |
2699 | return; | |
2700 | } | |
2701 | ||
e93ad19d TH |
2702 | mwork = kzalloc(sizeof(*mwork), GFP_KERNEL); |
2703 | if (mwork) { | |
2704 | mwork->mm = mm; | |
2705 | mwork->from = *from; | |
2706 | mwork->to = *to; | |
2707 | INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn); | |
2708 | queue_work(cpuset_migrate_mm_wq, &mwork->work); | |
2709 | } else { | |
2710 | mmput(mm); | |
2711 | } | |
2712 | } | |
e4e364e8 | 2713 | |
5cf1cacb | 2714 | static void cpuset_post_attach(void) |
e93ad19d TH |
2715 | { |
2716 | flush_workqueue(cpuset_migrate_mm_wq); | |
e4e364e8 PJ |
2717 | } |
2718 | ||
3b6766fe | 2719 | /* |
58568d2a MX |
2720 | * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy |
2721 | * @tsk: the task to change | |
2722 | * @newmems: new nodes that the task will be set | |
2723 | * | |
5f155f27 VB |
2724 | * We use the mems_allowed_seq seqlock to safely update both tsk->mems_allowed |
2725 | * and to rebind the task's mempolicy, if any. If the task is allocating in | |
2726 | * parallel, it might temporarily see an empty intersection, which results in | |
2727 | * a seqlock check and retry before OOM or allocation failure. | |
58568d2a MX |
2728 | */ |
2729 | static void cpuset_change_task_nodemask(struct task_struct *tsk, | |
2730 | nodemask_t *newmems) | |
2731 | { | |
c0ff7453 | 2732 | task_lock(tsk); |
c0ff7453 | 2733 | |
5f155f27 VB |
2734 | local_irq_disable(); |
2735 | write_seqcount_begin(&tsk->mems_allowed_seq); | |
c0ff7453 | 2736 | |
cc9a6c87 | 2737 | nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); |
213980c0 | 2738 | mpol_rebind_task(tsk, newmems); |
58568d2a | 2739 | tsk->mems_allowed = *newmems; |
cc9a6c87 | 2740 | |
5f155f27 VB |
2741 | write_seqcount_end(&tsk->mems_allowed_seq); |
2742 | local_irq_enable(); | |
cc9a6c87 | 2743 | |
c0ff7453 | 2744 | task_unlock(tsk); |
58568d2a MX |
2745 | } |
2746 | ||
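/*
 * Illustration only (not part of this file): a minimal sketch of the
 * reader side that pairs with the mems_allowed_seq update above, assuming
 * the read_mems_allowed_begin()/read_mems_allowed_retry() helpers from
 * include/linux/cpuset.h.  A reader racing with the writer simply retries
 * instead of acting on a half-updated nodemask; the helper name below is
 * hypothetical.
 */
static inline bool mems_allowed_intersects_stable(const nodemask_t *mask)
{
	unsigned int seq;
	bool intersects;

	do {
		seq = read_mems_allowed_begin();
		intersects = nodes_intersects(current->mems_allowed, *mask);
	} while (read_mems_allowed_retry(seq));

	return intersects;
}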
8793d854 PM |
2747 | static void *cpuset_being_rebound; |
2748 | ||
0b2f630a MX |
2749 | /** |
2750 | * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. | |
2751 | * @cs: the cpuset in which each task's mems_allowed mask needs to be changed | |
0b2f630a | 2752 | * |
d66393e5 | 2753 | * Iterate through each task of @cs updating its mems_allowed to the |
111cd11b | 2754 | * effective cpuset's. As this function is called with cpuset_mutex held, |
d66393e5 | 2755 | * cpuset membership stays stable. |
0b2f630a | 2756 | */ |
d66393e5 | 2757 | static void update_tasks_nodemask(struct cpuset *cs) |
1da177e4 | 2758 | { |
111cd11b | 2759 | static nodemask_t newmems; /* protected by cpuset_mutex */ |
d66393e5 TH |
2760 | struct css_task_iter it; |
2761 | struct task_struct *task; | |
59dac16f | 2762 | |
846a16bf | 2763 | cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ |
4225399a | 2764 | |
ae1c8023 | 2765 | guarantee_online_mems(cs, &newmems); |
33ad801d | 2766 | |
4225399a | 2767 | /* |
c1e8d7c6 | 2768 | * The mpol_rebind_mm() call takes mmap_lock, which we couldn't |
3b6766fe LZ |
2769 | * take while holding tasklist_lock. Forks can happen - the |
2770 | * mpol_dup() cpuset_being_rebound check will catch such forks, | |
2771 | * and rebind their vma mempolicies too. Because we still hold | |
111cd11b | 2772 | * the global cpuset_mutex, we know that no other rebind effort |
3b6766fe | 2773 | * will be contending for the global variable cpuset_being_rebound. |
4225399a | 2774 | * It's ok if we rebind the same mm twice; mpol_rebind_mm() |
04c19fa6 | 2775 | * is idempotent. Also migrate pages in each mm to new nodes. |
4225399a | 2776 | */ |
bc2fb7ed | 2777 | css_task_iter_start(&cs->css, 0, &it); |
d66393e5 TH |
2778 | while ((task = css_task_iter_next(&it))) { |
2779 | struct mm_struct *mm; | |
2780 | bool migrate; | |
2781 | ||
2782 | cpuset_change_task_nodemask(task, &newmems); | |
2783 | ||
2784 | mm = get_task_mm(task); | |
2785 | if (!mm) | |
2786 | continue; | |
2787 | ||
2788 | migrate = is_memory_migrate(cs); | |
2789 | ||
2790 | mpol_rebind_mm(mm, &cs->mems_allowed); | |
2791 | if (migrate) | |
2792 | cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems); | |
e93ad19d TH |
2793 | else |
2794 | mmput(mm); | |
d66393e5 TH |
2795 | } |
2796 | css_task_iter_end(&it); | |
4225399a | 2797 | |
33ad801d LZ |
2798 | /* |
2799 | * All the tasks' nodemasks have been updated, update | |
2800 | * cs->old_mems_allowed. | |
2801 | */ | |
2802 | cs->old_mems_allowed = newmems; | |
2803 | ||
2df167a3 | 2804 | /* We're done rebinding vmas to this cpuset's new mems_allowed. */ |
8793d854 | 2805 | cpuset_being_rebound = NULL; |
1da177e4 LT |
2806 | } |
2807 | ||
5c5cc623 | 2808 | /* |
734d4513 LZ |
2809 | * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree |
2810 | * @cs: the cpuset to consider | |
2811 | * @new_mems: a temp variable for calculating new effective_mems | |
5c5cc623 | 2812 | * |
734d4513 LZ |
2813 | * When configured nodemask is changed, the effective nodemasks of this cpuset |
2814 | * and all its descendants need to be updated. | |
5c5cc623 | 2815 | * |
d95af61d | 2816 | * On legacy hierarchy, effective_mems will be the same with mems_allowed. |
5c5cc623 | 2817 | * |
111cd11b | 2818 | * Called with cpuset_mutex held |
5c5cc623 | 2819 | */ |
734d4513 | 2820 | static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) |
5c5cc623 LZ |
2821 | { |
2822 | struct cpuset *cp; | |
492eb21b | 2823 | struct cgroup_subsys_state *pos_css; |
5c5cc623 LZ |
2824 | |
2825 | rcu_read_lock(); | |
734d4513 LZ |
2826 | cpuset_for_each_descendant_pre(cp, pos_css, cs) { |
2827 | struct cpuset *parent = parent_cs(cp); | |
2828 | ||
2829 | nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems); | |
2830 | ||
554b0d1c LZ |
2831 | /* |
2832 | * If it becomes empty, inherit the effective mask of the | |
2833 | * parent, which is guaranteed to have some MEMs. | |
2834 | */ | |
b8d1b8ee | 2835 | if (is_in_v2_mode() && nodes_empty(*new_mems)) |
554b0d1c LZ |
2836 | *new_mems = parent->effective_mems; |
2837 | ||
734d4513 LZ |
2838 | /* Skip the whole subtree if the nodemask remains the same. */ |
2839 | if (nodes_equal(*new_mems, cp->effective_mems)) { | |
2840 | pos_css = css_rightmost_descendant(pos_css); | |
2841 | continue; | |
5c5cc623 | 2842 | } |
734d4513 | 2843 | |
ec903c0c | 2844 | if (!css_tryget_online(&cp->css)) |
5c5cc623 LZ |
2845 | continue; |
2846 | rcu_read_unlock(); | |
2847 | ||
8447a0fe | 2848 | spin_lock_irq(&callback_lock); |
734d4513 | 2849 | cp->effective_mems = *new_mems; |
8447a0fe | 2850 | spin_unlock_irq(&callback_lock); |
734d4513 | 2851 | |
b8d1b8ee | 2852 | WARN_ON(!is_in_v2_mode() && |
a1381268 | 2853 | !nodes_equal(cp->mems_allowed, cp->effective_mems)); |
734d4513 | 2854 | |
d66393e5 | 2855 | update_tasks_nodemask(cp); |
5c5cc623 LZ |
2856 | |
2857 | rcu_read_lock(); | |
2858 | css_put(&cp->css); | |
2859 | } | |
2860 | rcu_read_unlock(); | |
2861 | } | |
2862 | ||
0b2f630a MX |
2863 | /* |
2864 | * Handle user request to change the 'mems' memory placement | |
2865 | * of a cpuset. Needs to validate the request, update the | |
58568d2a MX |
2866 | * cpusets mems_allowed, and for each task in the cpuset, |
2867 | * update mems_allowed and rebind task's mempolicy and any vma | |
2868 | * mempolicies and if the cpuset is marked 'memory_migrate', | |
2869 | * migrate the tasks pages to the new memory. | |
0b2f630a | 2870 | * |
111cd11b | 2871 | * Call with cpuset_mutex held. May take callback_lock during call. |
0b2f630a | 2872 | * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, |
c1e8d7c6 | 2873 | * lock each such tasks mm->mmap_lock, scan its vma's and rebind |
0b2f630a MX |
2874 | * their mempolicies to the cpusets new mems_allowed. |
2875 | */ | |
645fcc9d LZ |
2876 | static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, |
2877 | const char *buf) | |
0b2f630a | 2878 | { |
0b2f630a MX |
2879 | int retval; |
2880 | ||
2881 | /* | |
38d7bee9 | 2882 | * top_cpuset.mems_allowed tracks node_states[N_MEMORY]; | |
0b2f630a MX |
2883 | * it's read-only |
2884 | */ | |
53feb297 MX |
2885 | if (cs == &top_cpuset) { |
2886 | retval = -EACCES; | |
2887 | goto done; | |
2888 | } | |
0b2f630a | 2889 | |
0b2f630a MX |
2890 | /* |
2891 | * An empty mems_allowed is ok iff there are no tasks in the cpuset. | |
2892 | * Since nodelist_parse() fails on an empty mask, we special case | |
2893 | * that parsing. The validate_change() call ensures that cpusets | |
2894 | * with tasks have memory. | |
2895 | */ | |
2896 | if (!*buf) { | |
645fcc9d | 2897 | nodes_clear(trialcs->mems_allowed); |
0b2f630a | 2898 | } else { |
645fcc9d | 2899 | retval = nodelist_parse(buf, trialcs->mems_allowed); |
0b2f630a MX |
2900 | if (retval < 0) |
2901 | goto done; | |
2902 | ||
645fcc9d | 2903 | if (!nodes_subset(trialcs->mems_allowed, |
5d8ba82c LZ |
2904 | top_cpuset.mems_allowed)) { |
2905 | retval = -EINVAL; | |
53feb297 MX |
2906 | goto done; |
2907 | } | |
0b2f630a | 2908 | } |
33ad801d LZ |
2909 | |
2910 | if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) { | |
0b2f630a MX |
2911 | retval = 0; /* Too easy - nothing to do */ |
2912 | goto done; | |
2913 | } | |
645fcc9d | 2914 | retval = validate_change(cs, trialcs); |
0b2f630a MX |
2915 | if (retval < 0) |
2916 | goto done; | |
2917 | ||
8ca1b5a4 FT |
2918 | check_insane_mems_config(&trialcs->mems_allowed); |
2919 | ||
8447a0fe | 2920 | spin_lock_irq(&callback_lock); |
645fcc9d | 2921 | cs->mems_allowed = trialcs->mems_allowed; |
8447a0fe | 2922 | spin_unlock_irq(&callback_lock); |
0b2f630a | 2923 | |
734d4513 | 2924 | /* use trialcs->mems_allowed as a temp variable */ |
24ee3cf8 | 2925 | update_nodemasks_hier(cs, &trialcs->mems_allowed); |
0b2f630a MX |
2926 | done: |
2927 | return retval; | |
2928 | } | |
2929 | ||
77ef80c6 | 2930 | bool current_cpuset_is_being_rebound(void) |
8793d854 | 2931 | { |
77ef80c6 | 2932 | bool ret; |
391acf97 GZ |
2933 | |
2934 | rcu_read_lock(); | |
2935 | ret = task_cs(current) == cpuset_being_rebound; | |
2936 | rcu_read_unlock(); | |
2937 | ||
2938 | return ret; | |
8793d854 PM |
2939 | } |
2940 | ||
5be7a479 | 2941 | static int update_relax_domain_level(struct cpuset *cs, s64 val) |
1d3504fc | 2942 | { |
db7f47cf | 2943 | #ifdef CONFIG_SMP |
a1fd0b9d | 2944 | if (val < -1 || val > sched_domain_level_max + 1) |
30e0e178 | 2945 | return -EINVAL; |
db7f47cf | 2946 | #endif |
1d3504fc HS |
2947 | |
2948 | if (val != cs->relax_domain_level) { | |
2949 | cs->relax_domain_level = val; | |
300ed6cb LZ |
2950 | if (!cpumask_empty(cs->cpus_allowed) && |
2951 | is_sched_load_balance(cs)) | |
699140ba | 2952 | rebuild_sched_domains_locked(); |
1d3504fc HS |
2953 | } |
2954 | ||
2955 | return 0; | |
2956 | } | |
2957 | ||
72ec7029 | 2958 | /** |
950592f7 MX |
2959 | * update_tasks_flags - update the spread flags of tasks in the cpuset. |
2960 | * @cs: the cpuset in which each task's spread flags needs to be changed | |
950592f7 | 2961 | * |
d66393e5 | 2962 | * Iterate through each task of @cs updating its spread flags. As this |
111cd11b | 2963 | * function is called with cpuset_mutex held, cpuset membership stays |
d66393e5 | 2964 | * stable. |
950592f7 | 2965 | */ |
d66393e5 | 2966 | static void update_tasks_flags(struct cpuset *cs) |
950592f7 | 2967 | { |
d66393e5 TH |
2968 | struct css_task_iter it; |
2969 | struct task_struct *task; | |
2970 | ||
bc2fb7ed | 2971 | css_task_iter_start(&cs->css, 0, &it); |
d66393e5 | 2972 | while ((task = css_task_iter_next(&it))) |
18f9a4d4 | 2973 | cpuset_update_task_spread_flags(cs, task); |
d66393e5 | 2974 | css_task_iter_end(&it); |
950592f7 MX |
2975 | } |
2976 | ||
1da177e4 LT |
2977 | /* |
2978 | * update_flag - read a 0 or a 1 in a file and update associated flag | |
78608366 PM |
2979 | * bit: the bit to update (see cpuset_flagbits_t) |
2980 | * cs: the cpuset to update | |
2981 | * turning_on: whether the flag is being set or cleared | |
053199ed | 2982 | * |
111cd11b | 2983 | * Call with cpuset_mutex held. |
1da177e4 LT |
2984 | */ |
2985 | ||
700fe1ab PM |
2986 | static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, |
2987 | int turning_on) | |
1da177e4 | 2988 | { |
645fcc9d | 2989 | struct cpuset *trialcs; |
40b6a762 | 2990 | int balance_flag_changed; |
950592f7 | 2991 | int spread_flag_changed; |
950592f7 | 2992 | int err; |
1da177e4 | 2993 | |
645fcc9d LZ |
2994 | trialcs = alloc_trial_cpuset(cs); |
2995 | if (!trialcs) | |
2996 | return -ENOMEM; | |
2997 | ||
1da177e4 | 2998 | if (turning_on) |
645fcc9d | 2999 | set_bit(bit, &trialcs->flags); |
1da177e4 | 3000 | else |
645fcc9d | 3001 | clear_bit(bit, &trialcs->flags); |
1da177e4 | 3002 | |
645fcc9d | 3003 | err = validate_change(cs, trialcs); |
85d7b949 | 3004 | if (err < 0) |
645fcc9d | 3005 | goto out; |
029190c5 | 3006 | |
029190c5 | 3007 | balance_flag_changed = (is_sched_load_balance(cs) != |
645fcc9d | 3008 | is_sched_load_balance(trialcs)); |
029190c5 | 3009 | |
950592f7 MX |
3010 | spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs)) |
3011 | || (is_spread_page(cs) != is_spread_page(trialcs))); | |
3012 | ||
8447a0fe | 3013 | spin_lock_irq(&callback_lock); |
645fcc9d | 3014 | cs->flags = trialcs->flags; |
8447a0fe | 3015 | spin_unlock_irq(&callback_lock); |
85d7b949 | 3016 | |
300ed6cb | 3017 | if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) |
699140ba | 3018 | rebuild_sched_domains_locked(); |
029190c5 | 3019 | |
950592f7 | 3020 | if (spread_flag_changed) |
d66393e5 | 3021 | update_tasks_flags(cs); |
645fcc9d | 3022 | out: |
bf92370c | 3023 | free_cpuset(trialcs); |
645fcc9d | 3024 | return err; |
1da177e4 LT |
3025 | } |
3026 | ||
18065ebe | 3027 | /** |
f9da322e | 3028 | * update_prstate - update partition_root_state |
18065ebe WL |
3029 | * @cs: the cpuset to update |
3030 | * @new_prs: new partition root state | |
7476a636 | 3031 | * Return: 0 if successful, != 0 if error |
ee8dde0c | 3032 | * |
111cd11b | 3033 | * Call with cpuset_mutex held. |
ee8dde0c | 3034 | */ |
0f3adb8a | 3035 | static int update_prstate(struct cpuset *cs, int new_prs) |
ee8dde0c | 3036 | { |
7476a636 | 3037 | int err = PERR_NONE, old_prs = cs->partition_root_state; |
181c8e09 | 3038 | struct cpuset *parent = parent_cs(cs); |
0f3adb8a | 3039 | struct tmpmasks tmpmask; |
11e5f407 | 3040 | bool new_xcpus_state = false; |
ee8dde0c | 3041 | |
6ba34d3c | 3042 | if (old_prs == new_prs) |
ee8dde0c WL |
3043 | return 0; |
3044 | ||
3045 | /* | |
46c521ba | 3046 | * Treat a previously invalid partition root as if it is a "member". |
ee8dde0c | 3047 | */ |
46c521ba WL |
3048 | if (new_prs && is_prs_invalid(old_prs)) |
3049 | old_prs = PRS_MEMBER; | |
ee8dde0c | 3050 | |
0f3adb8a | 3051 | if (alloc_cpumasks(NULL, &tmpmask)) |
ee8dde0c WL |
3052 | return -ENOMEM; |
3053 | ||
0c7f293e | 3054 | /* |
e2ffe502 WL |
3055 | * Set up effective_xcpus if it is not properly set yet; it will be | |
3056 | * cleared later if the partition becomes invalid. | |
0c7f293e | 3057 | */ |
e2ffe502 | 3058 | if ((new_prs > 0) && cpumask_empty(cs->exclusive_cpus)) { |
0c7f293e WL |
3059 | spin_lock_irq(&callback_lock); |
3060 | cpumask_and(cs->effective_xcpus, | |
3061 | cs->cpus_allowed, parent->effective_xcpus); | |
3062 | spin_unlock_irq(&callback_lock); | |
3063 | } | |
3064 | ||
a86ce680 WL |
3065 | err = update_partition_exclusive(cs, new_prs); |
3066 | if (err) | |
3067 | goto out; | |
3068 | ||
6ba34d3c | 3069 | if (!old_prs) { |
11e5f407 WL |
3070 | enum partition_cmd cmd = (new_prs == PRS_ROOT) |
3071 | ? partcmd_enable : partcmd_enablei; | |
3072 | ||
ee8dde0c | 3073 | /* |
a86ce680 | 3074 | * cpus_allowed cannot be empty. |
ee8dde0c | 3075 | */ |
f0af1bfc | 3076 | if (cpumask_empty(cs->cpus_allowed)) { |
7476a636 | 3077 | err = PERR_CPUSEMPTY; |
ee8dde0c | 3078 | goto out; |
f0af1bfc | 3079 | } |
ee8dde0c | 3080 | |
11e5f407 | 3081 | err = update_parent_effective_cpumask(cs, cmd, NULL, &tmpmask); |
181c8e09 WL |
3082 | /* |
3083 | * If an attempt to become local partition root fails, | |
3084 | * try to become a remote partition root instead. | |
3085 | */ | |
11e5f407 | 3086 | if (err && remote_partition_enable(cs, new_prs, &tmpmask)) |
181c8e09 | 3087 | err = 0; |
f28e2244 WL |
3088 | } else if (old_prs && new_prs) { |
3089 | /* | |
3090 | * A change in load balance state only, no change in cpumasks. | |
3091 | */ | |
11e5f407 | 3092 | new_xcpus_state = true; |
ee8dde0c | 3093 | } else { |
3881b861 | 3094 | /* |
f0af1bfc WL |
3095 | * Switching back to member is always allowed even if it |
3096 | * disables child partitions. | |
3881b861 | 3097 | */ |
181c8e09 WL |
3098 | if (is_remote_partition(cs)) |
3099 | remote_partition_disable(cs, &tmpmask); | |
3100 | else | |
3101 | update_parent_effective_cpumask(cs, partcmd_disable, | |
3102 | NULL, &tmpmask); | |
3881b861 | 3103 | |
f0af1bfc | 3104 | /* |
0c7f293e WL |
3105 | * Invalidation of child partitions will be done in |
3106 | * update_cpumasks_hier(). | |
f0af1bfc | 3107 | */ |
ee8dde0c | 3108 | } |
ee8dde0c | 3109 | out: |
f0af1bfc | 3110 | /* |
a86ce680 WL |
3111 | * Make partition invalid & disable CS_CPU_EXCLUSIVE if an error |
3112 | * happens. | |
f0af1bfc | 3113 | */ |
a86ce680 | 3114 | if (err) { |
f28e2244 | 3115 | new_prs = -new_prs; |
a86ce680 WL |
3116 | update_partition_exclusive(cs, new_prs); |
3117 | } | |
3118 | ||
f0af1bfc WL |
3119 | spin_lock_irq(&callback_lock); |
3120 | cs->partition_root_state = new_prs; | |
e5ae8803 | 3121 | WRITE_ONCE(cs->prs_err, err); |
0c7f293e | 3122 | if (!is_partition_valid(cs)) |
e2ffe502 | 3123 | reset_partition_data(cs); |
11e5f407 WL |
3124 | else if (new_xcpus_state) |
3125 | partition_xcpus_newstate(old_prs, new_prs, cs->effective_xcpus); | |
f0af1bfc | 3126 | spin_unlock_irq(&callback_lock); |
72c6303a | 3127 | update_unbound_workqueue_cpumask(new_xcpus_state); |
a86ce680 | 3128 | |
0c7f293e WL |
3129 | /* Force update if switching back to member */ |
3130 | update_cpumasks_hier(cs, &tmpmask, !new_prs ? HIER_CHECKALL : 0); | |
6ba34d3c | 3131 | |
a86ce680 WL |
3132 | /* Update sched domains and load balance flag */ |
3133 | update_partition_sd_lb(cs, old_prs); | |
3134 | ||
f0af1bfc | 3135 | notify_partition_change(cs, old_prs); |
0f3adb8a | 3136 | free_cpumasks(NULL, &tmpmask); |
f0af1bfc | 3137 | return 0; |
1da177e4 LT |
3138 | } |
3139 | ||
3e0d98b9 | 3140 | /* |
80f7228b | 3141 | * Frequency meter - How fast is some event occurring? |
3e0d98b9 PJ |
3142 | * |
3143 | * These routines manage a digitally filtered, constant time based, | |
3144 | * event frequency meter. There are four routines: | |
3145 | * fmeter_init() - initialize a frequency meter. | |
3146 | * fmeter_markevent() - called each time the event happens. | |
3147 | * fmeter_getrate() - returns the recent rate of such events. | |
3148 | * fmeter_update() - internal routine used to update fmeter. | |
3149 | * | |
3150 | * A common data structure is passed to each of these routines, | |
3151 | * which is used to keep track of the state required to manage the | |
3152 | * frequency meter and its digital filter. | |
3153 | * | |
3154 | * The filter works on the number of events marked per unit time. | |
3155 | * The filter is single-pole low-pass recursive (IIR). The time unit | |
3156 | * is 1 second. Arithmetic is done using 32-bit integers scaled to | |
3157 | * simulate 3 decimal digits of precision (multiplied by 1000). | |
3158 | * | |
3159 | * With an FM_COEF of 933, and a time base of 1 second, the filter | |
3160 | * has a half-life of 10 seconds, meaning that if the events quit | |
3161 | * happening, then the rate returned from the fmeter_getrate() | |
3162 | * will be cut in half each 10 seconds, until it converges to zero. | |
3163 | * | |
3164 | * It is not worth doing a real infinitely recursive filter. If more | |
3165 | * than FM_MAXTICKS ticks have elapsed since the last filter event, | |
3166 | * just compute FM_MAXTICKS ticks worth, by which point the level | |
3167 | * will be stable. | |
3168 | * | |
3169 | * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid | |
3170 | * arithmetic overflow in the fmeter_update() routine. | |
3171 | * | |
3172 | * Given the simple 32 bit integer arithmetic used, this meter works | |
3173 | * best for reporting rates between one per millisecond (msec) and | |
3174 | * one per 32 (approx) seconds. At constant rates faster than one | |
3175 | * per msec it maxes out at values just under 1,000,000. At constant | |
3176 | * rates between one per msec, and one per second it will stabilize | |
3177 | * to a value N*1000, where N is the rate of events per second. | |
3178 | * At constant rates between one per second and one per 32 seconds, | |
3179 | * it will be choppy, moving up on the seconds that have an event, | |
3180 | * and then decaying until the next event. At rates slower than | |
3181 | * about one in 32 seconds, it decays all the way back to zero between | |
3182 | * each event. | |
3183 | */ | |
3184 | ||
3185 | #define FM_COEF 933 /* coefficient for half-life of 10 secs */ | |
d2b43658 | 3186 | #define FM_MAXTICKS ((u32)99) /* useless computing more ticks than this */ |
3e0d98b9 PJ |
3187 | #define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */ |
3188 | #define FM_SCALE 1000 /* faux fixed point scale */ | |
3189 | ||
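/*
 * Worked example of the arithmetic above (illustration only): with
 * FM_COEF = 933 and FM_SCALE = 1000, each elapsed second multiplies the
 * stored value by 0.933, and 0.933^10 ~= 0.50 - hence the quoted
 * half-life of 10 seconds.  A steady rate of N events per second feeds
 * in cnt = N * FM_SCALE each tick, so the fixed point of
 *
 *	val = (FM_COEF * val)/FM_SCALE + ((FM_SCALE - FM_COEF) * cnt)/FM_SCALE
 *
 * is val = N * 1000, matching the "N*1000" steady-state value described
 * in the comment block above.
 */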
3190 | /* Initialize a frequency meter */ | |
3191 | static void fmeter_init(struct fmeter *fmp) | |
3192 | { | |
3193 | fmp->cnt = 0; | |
3194 | fmp->val = 0; | |
3195 | fmp->time = 0; | |
3196 | spin_lock_init(&fmp->lock); | |
3197 | } | |
3198 | ||
3199 | /* Internal meter update - process cnt events and update value */ | |
3200 | static void fmeter_update(struct fmeter *fmp) | |
3201 | { | |
d2b43658 AB |
3202 | time64_t now; |
3203 | u32 ticks; | |
3204 | ||
3205 | now = ktime_get_seconds(); | |
3206 | ticks = now - fmp->time; | |
3e0d98b9 PJ |
3207 | |
3208 | if (ticks == 0) | |
3209 | return; | |
3210 | ||
3211 | ticks = min(FM_MAXTICKS, ticks); | |
3212 | while (ticks-- > 0) | |
3213 | fmp->val = (FM_COEF * fmp->val) / FM_SCALE; | |
3214 | fmp->time = now; | |
3215 | ||
3216 | fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE; | |
3217 | fmp->cnt = 0; | |
3218 | } | |
3219 | ||
3220 | /* Process any previous ticks, then bump cnt by one (times scale). */ | |
3221 | static void fmeter_markevent(struct fmeter *fmp) | |
3222 | { | |
3223 | spin_lock(&fmp->lock); | |
3224 | fmeter_update(fmp); | |
3225 | fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE); | |
3226 | spin_unlock(&fmp->lock); | |
3227 | } | |
3228 | ||
3229 | /* Process any previous ticks, then return current value. */ | |
3230 | static int fmeter_getrate(struct fmeter *fmp) | |
3231 | { | |
3232 | int val; | |
3233 | ||
3234 | spin_lock(&fmp->lock); | |
3235 | fmeter_update(fmp); | |
3236 | val = fmp->val; | |
3237 | spin_unlock(&fmp->lock); | |
3238 | return val; | |
3239 | } | |
3240 | ||
57fce0a6 TH |
3241 | static struct cpuset *cpuset_attach_old_cs; |
3242 | ||
eee87853 WL |
3243 | /* |
3244 | * Check to see if a cpuset can accept a new task | |
3245 | * For v1, cpus_allowed and mems_allowed can't be empty. | |
3246 | * For v2, effective_cpus can't be empty. | |
3247 | * Note that in v1, effective_cpus = cpus_allowed. | |
3248 | */ | |
3249 | static int cpuset_can_attach_check(struct cpuset *cs) | |
3250 | { | |
3251 | if (cpumask_empty(cs->effective_cpus) || | |
3252 | (!is_in_v2_mode() && nodes_empty(cs->mems_allowed))) | |
3253 | return -ENOSPC; | |
3254 | return 0; | |
3255 | } | |
3256 | ||
2ef269ef DE |
3257 | static void reset_migrate_dl_data(struct cpuset *cs) |
3258 | { | |
3259 | cs->nr_migrate_dl_tasks = 0; | |
3260 | cs->sum_migrate_dl_bw = 0; | |
3261 | } | |
3262 | ||
111cd11b | 3263 | /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ |
1f7dd3e5 | 3264 | static int cpuset_can_attach(struct cgroup_taskset *tset) |
f780bdb7 | 3265 | { |
1f7dd3e5 | 3266 | struct cgroup_subsys_state *css; |
2ef269ef | 3267 | struct cpuset *cs, *oldcs; |
bb9d97b6 | 3268 | struct task_struct *task; |
0a67b847 | 3269 | bool cpus_updated, mems_updated; |
bb9d97b6 | 3270 | int ret; |
1da177e4 | 3271 | |
57fce0a6 | 3272 | /* used later by cpuset_attach() */ |
1f7dd3e5 | 3273 | cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css)); |
2ef269ef | 3274 | oldcs = cpuset_attach_old_cs; |
1f7dd3e5 | 3275 | cs = css_cs(css); |
57fce0a6 | 3276 | |
111cd11b | 3277 | mutex_lock(&cpuset_mutex); |
5d21cc2d | 3278 | |
eee87853 WL |
3279 | /* Check to see if task is allowed in the cpuset */ |
3280 | ret = cpuset_can_attach_check(cs); | |
3281 | if (ret) | |
e2d59900 WL |
3282 | goto out_unlock; |
3283 | ||
0a67b847 MK |
3284 | cpus_updated = !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus); |
3285 | mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems); | |
3286 | ||
1f7dd3e5 | 3287 | cgroup_taskset_for_each(task, css, tset) { |
2ef269ef | 3288 | ret = task_can_attach(task); |
7f51412a | 3289 | if (ret) |
5d21cc2d | 3290 | goto out_unlock; |
0a67b847 MK |
3291 | |
3292 | /* | |
3293 | * Skip the per-task permission check in v2 when nothing changes; | |
3294 | * migration permission derives from hierarchy ownership in | |
3295 | * cgroup_procs_write_permission(). | |
3296 | */ | |
3297 | if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) || | |
3298 | (cpus_updated || mems_updated)) { | |
3299 | ret = security_task_setscheduler(task); | |
3300 | if (ret) | |
3301 | goto out_unlock; | |
3302 | } | |
6c24849f JL |
3303 | |
3304 | if (dl_task(task)) { | |
2ef269ef DE |
3305 | cs->nr_migrate_dl_tasks++; |
3306 | cs->sum_migrate_dl_bw += task->dl.dl_bw; | |
6c24849f | 3307 | } |
bb9d97b6 | 3308 | } |
f780bdb7 | 3309 | |
2ef269ef DE |
3310 | if (!cs->nr_migrate_dl_tasks) |
3311 | goto out_success; | |
3312 | ||
3313 | if (!cpumask_intersects(oldcs->effective_cpus, cs->effective_cpus)) { | |
3314 | int cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus); | |
3315 | ||
3316 | if (unlikely(cpu >= nr_cpu_ids)) { | |
3317 | reset_migrate_dl_data(cs); | |
3318 | ret = -EINVAL; | |
3319 | goto out_unlock; | |
3320 | } | |
3321 | ||
3322 | ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw); | |
3323 | if (ret) { | |
3324 | reset_migrate_dl_data(cs); | |
3325 | goto out_unlock; | |
3326 | } | |
3327 | } | |
3328 | ||
3329 | out_success: | |
452477fa TH |
3330 | /* |
3331 | * Mark attach is in progress. This makes validate_change() fail | |
3332 | * changes which zero cpus/mems_allowed. | |
3333 | */ | |
3334 | cs->attach_in_progress++; | |
5d21cc2d | 3335 | out_unlock: |
111cd11b | 3336 | mutex_unlock(&cpuset_mutex); |
5d21cc2d | 3337 | return ret; |
8793d854 | 3338 | } |
f780bdb7 | 3339 | |
1f7dd3e5 | 3340 | static void cpuset_cancel_attach(struct cgroup_taskset *tset) |
452477fa | 3341 | { |
1f7dd3e5 | 3342 | struct cgroup_subsys_state *css; |
ba9182a8 | 3343 | struct cpuset *cs; |
1f7dd3e5 TH |
3344 | |
3345 | cgroup_taskset_first(tset, &css); | |
ba9182a8 | 3346 | cs = css_cs(css); |
1f7dd3e5 | 3347 | |
111cd11b | 3348 | mutex_lock(&cpuset_mutex); |
ba9182a8 WL |
3349 | cs->attach_in_progress--; |
3350 | if (!cs->attach_in_progress) | |
3351 | wake_up(&cpuset_attach_wq); | |
2ef269ef DE |
3352 | |
3353 | if (cs->nr_migrate_dl_tasks) { | |
3354 | int cpu = cpumask_any(cs->effective_cpus); | |
3355 | ||
3356 | dl_bw_free(cpu, cs->sum_migrate_dl_bw); | |
3357 | reset_migrate_dl_data(cs); | |
3358 | } | |
3359 | ||
111cd11b | 3360 | mutex_unlock(&cpuset_mutex); |
8793d854 | 3361 | } |
1da177e4 | 3362 | |
4e4c9a14 | 3363 | /* |
111cd11b | 3364 | * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach_task() |
4e4c9a14 TH |
3365 | * but we can't allocate it dynamically there. Define it as a global | |
3366 | * variable and allocate it in cpuset_init(). | |
3367 | */ | |
3368 | static cpumask_var_t cpus_attach; | |
42a11bf5 WL |
3369 | static nodemask_t cpuset_attach_nodemask_to; |
3370 | ||
3371 | static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task) | |
3372 | { | |
111cd11b | 3373 | lockdep_assert_held(&cpuset_mutex); |
42a11bf5 WL |
3374 | |
3375 | if (cs != &top_cpuset) | |
3376 | guarantee_online_cpus(task, cpus_attach); | |
3377 | else | |
7e27cb6a | 3378 | cpumask_andnot(cpus_attach, task_cpu_possible_mask(task), |
0c7f293e | 3379 | subpartitions_cpus); |
42a11bf5 WL |
3380 | /* |
3381 | * can_attach beforehand should guarantee that this doesn't | |
3382 | * fail. TODO: have a better way to handle failure here | |
3383 | */ | |
3384 | WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach)); | |
3385 | ||
3386 | cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to); | |
3387 | cpuset_update_task_spread_flags(cs, task); | |
3388 | } | |
4e4c9a14 | 3389 | |
1f7dd3e5 | 3390 | static void cpuset_attach(struct cgroup_taskset *tset) |
8793d854 | 3391 | { |
bb9d97b6 | 3392 | struct task_struct *task; |
4530eddb | 3393 | struct task_struct *leader; |
1f7dd3e5 TH |
3394 | struct cgroup_subsys_state *css; |
3395 | struct cpuset *cs; | |
57fce0a6 | 3396 | struct cpuset *oldcs = cpuset_attach_old_cs; |
7fd4da9c | 3397 | bool cpus_updated, mems_updated; |
22fb52dd | 3398 | |
1f7dd3e5 TH |
3399 | cgroup_taskset_first(tset, &css); |
3400 | cs = css_cs(css); | |
3401 | ||
4f7e7236 | 3402 | lockdep_assert_cpus_held(); /* see cgroup_attach_lock() */ |
111cd11b | 3403 | mutex_lock(&cpuset_mutex); |
7fd4da9c WL |
3404 | cpus_updated = !cpumask_equal(cs->effective_cpus, |
3405 | oldcs->effective_cpus); | |
3406 | mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems); | |
3407 | ||
3408 | /* | |
3409 | * In the default hierarchy, enabling cpuset in the child cgroups | |
3410 | * will trigger a number of cpuset_attach() calls with no change | |
3411 | * in effective cpus and mems. In that case, we can optimize out | |
3412 | * by skipping the task iteration and update. | |
3413 | */ | |
3414 | if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && | |
3415 | !cpus_updated && !mems_updated) { | |
3416 | cpuset_attach_nodemask_to = cs->effective_mems; | |
3417 | goto out; | |
3418 | } | |
5d21cc2d | 3419 | |
ae1c8023 | 3420 | guarantee_online_mems(cs, &cpuset_attach_nodemask_to); |
4e4c9a14 | 3421 | |
42a11bf5 WL |
3422 | cgroup_taskset_for_each(task, css, tset) |
3423 | cpuset_attach_task(cs, task); | |
22fb52dd | 3424 | |
f780bdb7 | 3425 | /* |
4530eddb | 3426 | * Change mm for all threadgroup leaders. This is expensive and may |
7fd4da9c WL |
3427 | * sleep and should be moved outside migration path proper. Skip it |
3428 | * if there is no change in effective_mems and CS_MEMORY_MIGRATE is | |
3429 | * not set. | |
f780bdb7 | 3430 | */ |
ae1c8023 | 3431 | cpuset_attach_nodemask_to = cs->effective_mems; |
7fd4da9c WL |
3432 | if (!is_memory_migrate(cs) && !mems_updated) |
3433 | goto out; | |
3434 | ||
1f7dd3e5 | 3435 | cgroup_taskset_for_each_leader(leader, css, tset) { |
3df9ca0a TH |
3436 | struct mm_struct *mm = get_task_mm(leader); |
3437 | ||
3438 | if (mm) { | |
3439 | mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); | |
3440 | ||
3441 | /* | |
3442 | * old_mems_allowed is the same with mems_allowed | |
3443 | * here, except if this task is being moved | |
3444 | * automatically due to hotplug. In that case | |
3445 | * @mems_allowed has been updated and is empty, so | |
3446 | * @old_mems_allowed is the right nodesets that we | |
3447 | * migrate mm from. | |
3448 | */ | |
e93ad19d | 3449 | if (is_memory_migrate(cs)) |
3df9ca0a TH |
3450 | cpuset_migrate_mm(mm, &oldcs->old_mems_allowed, |
3451 | &cpuset_attach_nodemask_to); | |
e93ad19d TH |
3452 | else |
3453 | mmput(mm); | |
f047cecf | 3454 | } |
4225399a | 3455 | } |
452477fa | 3456 | |
7fd4da9c | 3457 | out: |
33ad801d | 3458 | cs->old_mems_allowed = cpuset_attach_nodemask_to; |
02bb5863 | 3459 | |
2ef269ef DE |
3460 | if (cs->nr_migrate_dl_tasks) { |
3461 | cs->nr_deadline_tasks += cs->nr_migrate_dl_tasks; | |
3462 | oldcs->nr_deadline_tasks -= cs->nr_migrate_dl_tasks; | |
3463 | reset_migrate_dl_data(cs); | |
3464 | } | |
3465 | ||
452477fa | 3466 | cs->attach_in_progress--; |
e44193d3 LZ |
3467 | if (!cs->attach_in_progress) |
3468 | wake_up(&cpuset_attach_wq); | |
5d21cc2d | 3469 | |
111cd11b | 3470 | mutex_unlock(&cpuset_mutex); |
1da177e4 LT |
3471 | } |
3472 | ||
3473 | /* The various types of files and directories in a cpuset file system */ | |
3474 | ||
3475 | typedef enum { | |
45b07ef3 | 3476 | FILE_MEMORY_MIGRATE, |
1da177e4 LT |
3477 | FILE_CPULIST, |
3478 | FILE_MEMLIST, | |
afd1a8b3 LZ |
3479 | FILE_EFFECTIVE_CPULIST, |
3480 | FILE_EFFECTIVE_MEMLIST, | |
5cf8114d | 3481 | FILE_SUBPARTS_CPULIST, |
e2ffe502 | 3482 | FILE_EXCLUSIVE_CPULIST, |
0c7f293e | 3483 | FILE_EFFECTIVE_XCPULIST, |
11e5f407 | 3484 | FILE_ISOLATED_CPULIST, |
1da177e4 LT |
3485 | FILE_CPU_EXCLUSIVE, |
3486 | FILE_MEM_EXCLUSIVE, | |
78608366 | 3487 | FILE_MEM_HARDWALL, |
029190c5 | 3488 | FILE_SCHED_LOAD_BALANCE, |
ee8dde0c | 3489 | FILE_PARTITION_ROOT, |
1d3504fc | 3490 | FILE_SCHED_RELAX_DOMAIN_LEVEL, |
3e0d98b9 PJ |
3491 | FILE_MEMORY_PRESSURE_ENABLED, |
3492 | FILE_MEMORY_PRESSURE, | |
825a46af PJ |
3493 | FILE_SPREAD_PAGE, |
3494 | FILE_SPREAD_SLAB, | |
1da177e4 LT |
3495 | } cpuset_filetype_t; |
3496 | ||
182446d0 TH |
3497 | static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, |
3498 | u64 val) | |
700fe1ab | 3499 | { |
182446d0 | 3500 | struct cpuset *cs = css_cs(css); |
700fe1ab | 3501 | cpuset_filetype_t type = cft->private; |
a903f086 | 3502 | int retval = 0; |
700fe1ab | 3503 | |
c5c63b9a | 3504 | cpus_read_lock(); |
111cd11b | 3505 | mutex_lock(&cpuset_mutex); |
a903f086 LZ |
3506 | if (!is_cpuset_online(cs)) { |
3507 | retval = -ENODEV; | |
5d21cc2d | 3508 | goto out_unlock; |
a903f086 | 3509 | } |
700fe1ab PM |
3510 | |
3511 | switch (type) { | |
1da177e4 | 3512 | case FILE_CPU_EXCLUSIVE: |
700fe1ab | 3513 | retval = update_flag(CS_CPU_EXCLUSIVE, cs, val); |
1da177e4 LT |
3514 | break; |
3515 | case FILE_MEM_EXCLUSIVE: | |
700fe1ab | 3516 | retval = update_flag(CS_MEM_EXCLUSIVE, cs, val); |
1da177e4 | 3517 | break; |
78608366 PM |
3518 | case FILE_MEM_HARDWALL: |
3519 | retval = update_flag(CS_MEM_HARDWALL, cs, val); | |
3520 | break; | |
029190c5 | 3521 | case FILE_SCHED_LOAD_BALANCE: |
700fe1ab | 3522 | retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val); |
1d3504fc | 3523 | break; |
45b07ef3 | 3524 | case FILE_MEMORY_MIGRATE: |
700fe1ab | 3525 | retval = update_flag(CS_MEMORY_MIGRATE, cs, val); |
45b07ef3 | 3526 | break; |
3e0d98b9 | 3527 | case FILE_MEMORY_PRESSURE_ENABLED: |
700fe1ab | 3528 | cpuset_memory_pressure_enabled = !!val; |
3e0d98b9 | 3529 | break; |
825a46af | 3530 | case FILE_SPREAD_PAGE: |
700fe1ab | 3531 | retval = update_flag(CS_SPREAD_PAGE, cs, val); |
825a46af PJ |
3532 | break; |
3533 | case FILE_SPREAD_SLAB: | |
700fe1ab | 3534 | retval = update_flag(CS_SPREAD_SLAB, cs, val); |
825a46af | 3535 | break; |
1da177e4 LT |
3536 | default: |
3537 | retval = -EINVAL; | |
700fe1ab | 3538 | break; |
1da177e4 | 3539 | } |
5d21cc2d | 3540 | out_unlock: |
111cd11b | 3541 | mutex_unlock(&cpuset_mutex); |
c5c63b9a | 3542 | cpus_read_unlock(); |
1da177e4 LT |
3543 | return retval; |
3544 | } | |
3545 | ||
182446d0 TH |
3546 | static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, |
3547 | s64 val) | |
5be7a479 | 3548 | { |
182446d0 | 3549 | struct cpuset *cs = css_cs(css); |
5be7a479 | 3550 | cpuset_filetype_t type = cft->private; |
5d21cc2d | 3551 | int retval = -ENODEV; |
5be7a479 | 3552 | |
c5c63b9a | 3553 | cpus_read_lock(); |
111cd11b | 3554 | mutex_lock(&cpuset_mutex); |
5d21cc2d TH |
3555 | if (!is_cpuset_online(cs)) |
3556 | goto out_unlock; | |
e3712395 | 3557 | |
5be7a479 PM |
3558 | switch (type) { |
3559 | case FILE_SCHED_RELAX_DOMAIN_LEVEL: | |
3560 | retval = update_relax_domain_level(cs, val); | |
3561 | break; | |
3562 | default: | |
3563 | retval = -EINVAL; | |
3564 | break; | |
3565 | } | |
5d21cc2d | 3566 | out_unlock: |
111cd11b | 3567 | mutex_unlock(&cpuset_mutex); |
c5c63b9a | 3568 | cpus_read_unlock(); |
5be7a479 PM |
3569 | return retval; |
3570 | } | |
3571 | ||
e3712395 PM |
3572 | /* |
3573 | * Common handling for a write to a "cpus" or "mems" file. | |
3574 | */ | |
451af504 TH |
3575 | static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, |
3576 | char *buf, size_t nbytes, loff_t off) | |
e3712395 | 3577 | { |
451af504 | 3578 | struct cpuset *cs = css_cs(of_css(of)); |
645fcc9d | 3579 | struct cpuset *trialcs; |
5d21cc2d | 3580 | int retval = -ENODEV; |
e3712395 | 3581 | |
451af504 TH |
3582 | buf = strstrip(buf); |
3583 | ||
3a5a6d0c TH |
3584 | /* |
3585 | * CPU or memory hotunplug may leave @cs w/o any execution | |
3586 | * resources, in which case the hotplug code asynchronously updates | |
3587 | * configuration and transfers all tasks to the nearest ancestor | |
3588 | * which can execute. | |
3589 | * | |
3590 | * As writes to "cpus" or "mems" may restore @cs's execution | |
3591 | * resources, wait for the previously scheduled operations before | |
3592 | * proceeding, so that we don't end up repeatedly removing tasks added | |
3593 | * after execution capability is restored. | |
76bb5ab8 | 3594 | * |
2125c003 WL |
3595 | * cpuset_handle_hotplug may call back into cgroup core asynchronously |
3596 | * via cgroup_transfer_tasks() and waiting for it from a cgroupfs | |
76bb5ab8 TH |
3597 | * operation like this one can lead to a deadlock through kernfs |
3598 | * active_ref protection. Let's break the protection. Losing the | |
3599 | * protection is okay as we check whether @cs is online after | |
111cd11b | 3600 | * grabbing cpuset_mutex anyway. This only happens on the legacy |
76bb5ab8 | 3601 | * hierarchies. |
3a5a6d0c | 3602 | */ |
76bb5ab8 TH |
3603 | css_get(&cs->css); |
3604 | kernfs_break_active_protection(of->kn); | |
3a5a6d0c | 3605 | |
c5c63b9a | 3606 | cpus_read_lock(); |
111cd11b | 3607 | mutex_lock(&cpuset_mutex); |
5d21cc2d TH |
3608 | if (!is_cpuset_online(cs)) |
3609 | goto out_unlock; | |
e3712395 | 3610 | |
645fcc9d | 3611 | trialcs = alloc_trial_cpuset(cs); |
b75f38d6 LZ |
3612 | if (!trialcs) { |
3613 | retval = -ENOMEM; | |
5d21cc2d | 3614 | goto out_unlock; |
b75f38d6 | 3615 | } |
645fcc9d | 3616 | |
451af504 | 3617 | switch (of_cft(of)->private) { |
e3712395 | 3618 | case FILE_CPULIST: |
645fcc9d | 3619 | retval = update_cpumask(cs, trialcs, buf); |
e3712395 | 3620 | break; |
e2ffe502 WL |
3621 | case FILE_EXCLUSIVE_CPULIST: |
3622 | retval = update_exclusive_cpumask(cs, trialcs, buf); | |
3623 | break; | |
e3712395 | 3624 | case FILE_MEMLIST: |
645fcc9d | 3625 | retval = update_nodemask(cs, trialcs, buf); |
e3712395 PM |
3626 | break; |
3627 | default: | |
3628 | retval = -EINVAL; | |
3629 | break; | |
3630 | } | |
645fcc9d | 3631 | |
bf92370c | 3632 | free_cpuset(trialcs); |
5d21cc2d | 3633 | out_unlock: |
111cd11b | 3634 | mutex_unlock(&cpuset_mutex); |
c5c63b9a | 3635 | cpus_read_unlock(); |
76bb5ab8 TH |
3636 | kernfs_unbreak_active_protection(of->kn); |
3637 | css_put(&cs->css); | |
e93ad19d | 3638 | flush_workqueue(cpuset_migrate_mm_wq); |
451af504 | 3639 | return retval ?: nbytes; |
e3712395 PM |
3640 | } |
3641 | ||
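/*
 * Illustration only (userspace, not part of this file): exercising the
 * "cpus"/"mems" write path above via cgroup v2.  The cgroup2 mount point
 * (/sys/fs/cgroup) and the child group name ("demo") are assumptions for
 * the example.  Note that write() only returns after cpuset_migrate_mm_wq
 * has been flushed, so any page migrations queued by the update have
 * finished by then.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int write_cpuset_file(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);
	ssize_t ret;

	if (fd < 0)
		return -1;
	ret = write(fd, val, strlen(val));	/* lands in cpuset_write_resmask() */
	close(fd);
	return ret < 0 ? -1 : 0;
}

int main(void)
{
	if (write_cpuset_file("/sys/fs/cgroup/demo/cpuset.cpus", "0-3"))
		perror("cpuset.cpus");
	if (write_cpuset_file("/sys/fs/cgroup/demo/cpuset.mems", "0"))
		perror("cpuset.mems");
	return 0;
}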
1da177e4 LT |
3642 | /* |
3643 | * These ascii lists should be read in a single call, by using a user | |
3644 | * buffer large enough to hold the entire map. If read in smaller | |
3645 | * chunks, there is no guarantee of atomicity. Since the display format | |
3646 | * used, list of ranges of sequential numbers, is variable length, | |
3647 | * and since these maps can change value dynamically, one could read | |
3648 | * gibberish by doing partial reads while a list was changing. | |
1da177e4 | 3649 | */ |
2da8ca82 | 3650 | static int cpuset_common_seq_show(struct seq_file *sf, void *v) |
1da177e4 | 3651 | { |
2da8ca82 TH |
3652 | struct cpuset *cs = css_cs(seq_css(sf)); |
3653 | cpuset_filetype_t type = seq_cft(sf)->private; | |
51ffe411 | 3654 | int ret = 0; |
1da177e4 | 3655 | |
8447a0fe | 3656 | spin_lock_irq(&callback_lock); |
1da177e4 LT |
3657 | |
3658 | switch (type) { | |
3659 | case FILE_CPULIST: | |
e8e6d97c | 3660 | seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed)); |
1da177e4 LT |
3661 | break; |
3662 | case FILE_MEMLIST: | |
e8e6d97c | 3663 | seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed)); |
1da177e4 | 3664 | break; |
afd1a8b3 | 3665 | case FILE_EFFECTIVE_CPULIST: |
e8e6d97c | 3666 | seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_cpus)); |
afd1a8b3 LZ |
3667 | break; |
3668 | case FILE_EFFECTIVE_MEMLIST: | |
e8e6d97c | 3669 | seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems)); |
afd1a8b3 | 3670 | break; |
e2ffe502 WL |
3671 | case FILE_EXCLUSIVE_CPULIST: |
3672 | seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->exclusive_cpus)); | |
3673 | break; | |
0c7f293e WL |
3674 | case FILE_EFFECTIVE_XCPULIST: |
3675 | seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_xcpus)); | |
3676 | break; | |
5cf8114d | 3677 | case FILE_SUBPARTS_CPULIST: |
0c7f293e | 3678 | seq_printf(sf, "%*pbl\n", cpumask_pr_args(subpartitions_cpus)); |
5cf8114d | 3679 | break; |
11e5f407 WL |
3680 | case FILE_ISOLATED_CPULIST: |
3681 | seq_printf(sf, "%*pbl\n", cpumask_pr_args(isolated_cpus)); | |
3682 | break; | |
1da177e4 | 3683 | default: |
51ffe411 | 3684 | ret = -EINVAL; |
1da177e4 | 3685 | } |
1da177e4 | 3686 | |
8447a0fe | 3687 | spin_unlock_irq(&callback_lock); |
51ffe411 | 3688 | return ret; |
1da177e4 LT |
3689 | } |
3690 | ||
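/*
 * Illustration only (userspace): per the comment above
 * cpuset_common_seq_show(), read the whole list in a single call with a
 * buffer large enough for the full map, so a concurrent update cannot be
 * observed half-way through.  The path is an assumption for the example.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

ssize_t read_effective_cpus(char *buf, size_t len)
{
	int fd = open("/sys/fs/cgroup/demo/cpuset.cpus.effective", O_RDONLY);
	ssize_t n;

	if (fd < 0)
		return -1;
	n = read(fd, buf, len - 1);	/* one read(), not several short ones */
	close(fd);
	if (n >= 0)
		buf[n] = '\0';
	return n;
}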
182446d0 | 3691 | static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) |
700fe1ab | 3692 | { |
182446d0 | 3693 | struct cpuset *cs = css_cs(css); |
700fe1ab PM |
3694 | cpuset_filetype_t type = cft->private; |
3695 | switch (type) { | |
3696 | case FILE_CPU_EXCLUSIVE: | |
3697 | return is_cpu_exclusive(cs); | |
3698 | case FILE_MEM_EXCLUSIVE: | |
3699 | return is_mem_exclusive(cs); | |
78608366 PM |
3700 | case FILE_MEM_HARDWALL: |
3701 | return is_mem_hardwall(cs); | |
700fe1ab PM |
3702 | case FILE_SCHED_LOAD_BALANCE: |
3703 | return is_sched_load_balance(cs); | |
3704 | case FILE_MEMORY_MIGRATE: | |
3705 | return is_memory_migrate(cs); | |
3706 | case FILE_MEMORY_PRESSURE_ENABLED: | |
3707 | return cpuset_memory_pressure_enabled; | |
3708 | case FILE_MEMORY_PRESSURE: | |
3709 | return fmeter_getrate(&cs->fmeter); | |
3710 | case FILE_SPREAD_PAGE: | |
3711 | return is_spread_page(cs); | |
3712 | case FILE_SPREAD_SLAB: | |
3713 | return is_spread_slab(cs); | |
3714 | default: | |
3715 | BUG(); | |
3716 | } | |
cf417141 MK |
3717 | |
3718 | /* Unreachable but makes gcc happy */ | |
3719 | return 0; | |
700fe1ab | 3720 | } |
1da177e4 | 3721 | |
182446d0 | 3722 | static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) |
5be7a479 | 3723 | { |
182446d0 | 3724 | struct cpuset *cs = css_cs(css); |
5be7a479 PM |
3725 | cpuset_filetype_t type = cft->private; |
3726 | switch (type) { | |
3727 | case FILE_SCHED_RELAX_DOMAIN_LEVEL: | |
3728 | return cs->relax_domain_level; | |
3729 | default: | |
3730 | BUG(); | |
3731 | } | |
cf417141 | 3732 | |
d95af61d | 3733 | /* Unreachable but makes gcc happy */ |
cf417141 | 3734 | return 0; |
5be7a479 PM |
3735 | } |
3736 | ||
bb5b553c WL |
3737 | static int sched_partition_show(struct seq_file *seq, void *v) |
3738 | { | |
3739 | struct cpuset *cs = css_cs(seq_css(seq)); | |
7476a636 | 3740 | const char *err, *type = NULL; |
bb5b553c WL |
3741 | |
3742 | switch (cs->partition_root_state) { | |
18065ebe | 3743 | case PRS_ROOT: |
bb5b553c WL |
3744 | seq_puts(seq, "root\n"); |
3745 | break; | |
f28e2244 WL |
3746 | case PRS_ISOLATED: |
3747 | seq_puts(seq, "isolated\n"); | |
3748 | break; | |
18065ebe | 3749 | case PRS_MEMBER: |
bb5b553c WL |
3750 | seq_puts(seq, "member\n"); |
3751 | break; | |
18065ebe | 3752 | case PRS_INVALID_ROOT: |
7476a636 WL |
3753 | type = "root"; |
3754 | fallthrough; | |
f28e2244 | 3755 | case PRS_INVALID_ISOLATED: |
7476a636 WL |
3756 | if (!type) |
3757 | type = "isolated"; | |
3758 | err = perr_strings[READ_ONCE(cs->prs_err)]; | |
3759 | if (err) | |
3760 | seq_printf(seq, "%s invalid (%s)\n", type, err); | |
3761 | else | |
3762 | seq_printf(seq, "%s invalid\n", type); | |
3763 | break; | |
bb5b553c WL |
3764 | } |
3765 | return 0; | |
3766 | } | |
3767 | ||
3768 | static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf, | |
3769 | size_t nbytes, loff_t off) | |
3770 | { | |
3771 | struct cpuset *cs = css_cs(of_css(of)); | |
3772 | int val; | |
3773 | int retval = -ENODEV; | |
3774 | ||
3775 | buf = strstrip(buf); | |
3776 | ||
b1e3aeb1 | 3777 | if (!strcmp(buf, "root")) |
18065ebe | 3778 | val = PRS_ROOT; |
b1e3aeb1 | 3779 | else if (!strcmp(buf, "member")) |
18065ebe | 3780 | val = PRS_MEMBER; |
f28e2244 WL |
3781 | else if (!strcmp(buf, "isolated")) |
3782 | val = PRS_ISOLATED; | |
bb5b553c WL |
3783 | else |
3784 | return -EINVAL; | |
3785 | ||
3786 | css_get(&cs->css); | |
c5c63b9a | 3787 | cpus_read_lock(); |
111cd11b | 3788 | mutex_lock(&cpuset_mutex); |
bb5b553c WL |
3789 | if (!is_cpuset_online(cs)) |
3790 | goto out_unlock; | |
3791 | ||
3792 | retval = update_prstate(cs, val); | |
3793 | out_unlock: | |
111cd11b | 3794 | mutex_unlock(&cpuset_mutex); |
c5c63b9a | 3795 | cpus_read_unlock(); |
bb5b553c WL |
3796 | css_put(&cs->css); |
3797 | return retval ?: nbytes; | |
3798 | } | |
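/*
 * Illustration only (userspace, not part of this file): driving the
 * handler above.  Writing "root" requests a partition; reading the file
 * back may report e.g. "root invalid (Parent is not a partition root)"
 * when the request cannot be honoured, using the perr_strings[] reasons.
 * The cgroup2 mount point and the "demo" group are assumptions.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char state[128] = "";
	int fd;

	fd = open("/sys/fs/cgroup/demo/cpuset.cpus.partition", O_WRONLY);
	if (fd >= 0) {
		if (write(fd, "root", 4) < 0)	/* handled by sched_partition_write() */
			perror("write cpus.partition");
		close(fd);
	}

	fd = open("/sys/fs/cgroup/demo/cpuset.cpus.partition", O_RDONLY);
	if (fd >= 0) {
		if (read(fd, state, sizeof(state) - 1) > 0)	/* formatted by sched_partition_show() */
			printf("partition state: %s", state);
		close(fd);
	}
	return 0;
}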
1da177e4 LT |
3799 | |
3800 | /* | |
3801 | * for the common functions, 'private' gives the type of file | |
3802 | */ | |
3803 | ||
4ec22e9c | 3804 | static struct cftype legacy_files[] = { |
addf2c73 PM |
3805 | { |
3806 | .name = "cpus", | |
2da8ca82 | 3807 | .seq_show = cpuset_common_seq_show, |
451af504 | 3808 | .write = cpuset_write_resmask, |
e3712395 | 3809 | .max_write_len = (100U + 6 * NR_CPUS), |
addf2c73 PM |
3810 | .private = FILE_CPULIST, |
3811 | }, | |
3812 | ||
3813 | { | |
3814 | .name = "mems", | |
2da8ca82 | 3815 | .seq_show = cpuset_common_seq_show, |
451af504 | 3816 | .write = cpuset_write_resmask, |
e3712395 | 3817 | .max_write_len = (100U + 6 * MAX_NUMNODES), |
addf2c73 PM |
3818 | .private = FILE_MEMLIST, |
3819 | }, | |
3820 | ||
afd1a8b3 LZ |
3821 | { |
3822 | .name = "effective_cpus", | |
3823 | .seq_show = cpuset_common_seq_show, | |
3824 | .private = FILE_EFFECTIVE_CPULIST, | |
3825 | }, | |
3826 | ||
3827 | { | |
3828 | .name = "effective_mems", | |
3829 | .seq_show = cpuset_common_seq_show, | |
3830 | .private = FILE_EFFECTIVE_MEMLIST, | |
3831 | }, | |
3832 | ||
addf2c73 PM |
3833 | { |
3834 | .name = "cpu_exclusive", | |
3835 | .read_u64 = cpuset_read_u64, | |
3836 | .write_u64 = cpuset_write_u64, | |
3837 | .private = FILE_CPU_EXCLUSIVE, | |
3838 | }, | |
3839 | ||
3840 | { | |
3841 | .name = "mem_exclusive", | |
3842 | .read_u64 = cpuset_read_u64, | |
3843 | .write_u64 = cpuset_write_u64, | |
3844 | .private = FILE_MEM_EXCLUSIVE, | |
3845 | }, | |
3846 | ||
78608366 PM |
3847 | { |
3848 | .name = "mem_hardwall", | |
3849 | .read_u64 = cpuset_read_u64, | |
3850 | .write_u64 = cpuset_write_u64, | |
3851 | .private = FILE_MEM_HARDWALL, | |
3852 | }, | |
3853 | ||
addf2c73 PM |
3854 | { |
3855 | .name = "sched_load_balance", | |
3856 | .read_u64 = cpuset_read_u64, | |
3857 | .write_u64 = cpuset_write_u64, | |
3858 | .private = FILE_SCHED_LOAD_BALANCE, | |
3859 | }, | |
3860 | ||
3861 | { | |
3862 | .name = "sched_relax_domain_level", | |
5be7a479 PM |
3863 | .read_s64 = cpuset_read_s64, |
3864 | .write_s64 = cpuset_write_s64, | |
addf2c73 PM |
3865 | .private = FILE_SCHED_RELAX_DOMAIN_LEVEL, |
3866 | }, | |
3867 | ||
3868 | { | |
3869 | .name = "memory_migrate", | |
3870 | .read_u64 = cpuset_read_u64, | |
3871 | .write_u64 = cpuset_write_u64, | |
3872 | .private = FILE_MEMORY_MIGRATE, | |
3873 | }, | |
3874 | ||
3875 | { | |
3876 | .name = "memory_pressure", | |
3877 | .read_u64 = cpuset_read_u64, | |
1c08c22c | 3878 | .private = FILE_MEMORY_PRESSURE, |
addf2c73 PM |
3879 | }, |
3880 | ||
3881 | { | |
3882 | .name = "memory_spread_page", | |
3883 | .read_u64 = cpuset_read_u64, | |
3884 | .write_u64 = cpuset_write_u64, | |
3885 | .private = FILE_SPREAD_PAGE, | |
3886 | }, | |
3887 | ||
3888 | { | |
3ab67a9c | 3889 | /* obsolete, may be removed in the future */ |
addf2c73 PM |
3890 | .name = "memory_spread_slab", |
3891 | .read_u64 = cpuset_read_u64, | |
3892 | .write_u64 = cpuset_write_u64, | |
3893 | .private = FILE_SPREAD_SLAB, | |
3894 | }, | |
3e0d98b9 | 3895 | |
4baf6e33 TH |
3896 | { |
3897 | .name = "memory_pressure_enabled", | |
3898 | .flags = CFTYPE_ONLY_ON_ROOT, | |
3899 | .read_u64 = cpuset_read_u64, | |
3900 | .write_u64 = cpuset_write_u64, | |
3901 | .private = FILE_MEMORY_PRESSURE_ENABLED, | |
3902 | }, | |
1da177e4 | 3903 | |
4baf6e33 TH |
3904 | { } /* terminate */ |
3905 | }; | |
1da177e4 | 3906 | |
4ec22e9c WL |
3907 | /* |
3908 | * This is currently a minimal set for the default hierarchy. It can be | |
3909 | * expanded later on by migrating more features and control files from v1. | |
3910 | */ | |
3911 | static struct cftype dfl_files[] = { | |
3912 | { | |
3913 | .name = "cpus", | |
3914 | .seq_show = cpuset_common_seq_show, | |
3915 | .write = cpuset_write_resmask, | |
3916 | .max_write_len = (100U + 6 * NR_CPUS), | |
3917 | .private = FILE_CPULIST, | |
3918 | .flags = CFTYPE_NOT_ON_ROOT, | |
3919 | }, | |
3920 | ||
3921 | { | |
3922 | .name = "mems", | |
3923 | .seq_show = cpuset_common_seq_show, | |
3924 | .write = cpuset_write_resmask, | |
3925 | .max_write_len = (100U + 6 * MAX_NUMNODES), | |
3926 | .private = FILE_MEMLIST, | |
3927 | .flags = CFTYPE_NOT_ON_ROOT, | |
3928 | }, | |
3929 | ||
3930 | { | |
3931 | .name = "cpus.effective", | |
3932 | .seq_show = cpuset_common_seq_show, | |
3933 | .private = FILE_EFFECTIVE_CPULIST, | |
4ec22e9c WL |
3934 | }, |
3935 | ||
3936 | { | |
3937 | .name = "mems.effective", | |
3938 | .seq_show = cpuset_common_seq_show, | |
3939 | .private = FILE_EFFECTIVE_MEMLIST, | |
4ec22e9c WL |
3940 | }, |
3941 | ||
ee8dde0c | 3942 | { |
b1e3aeb1 | 3943 | .name = "cpus.partition", |
bb5b553c WL |
3944 | .seq_show = sched_partition_show, |
3945 | .write = sched_partition_write, | |
ee8dde0c WL |
3946 | .private = FILE_PARTITION_ROOT, |
3947 | .flags = CFTYPE_NOT_ON_ROOT, | |
e7cc9888 | 3948 | .file_offset = offsetof(struct cpuset, partition_file), |
ee8dde0c WL |
3949 | }, |
3950 | ||
e2ffe502 WL |
3951 | { |
3952 | .name = "cpus.exclusive", | |
3953 | .seq_show = cpuset_common_seq_show, | |
3954 | .write = cpuset_write_resmask, | |
3955 | .max_write_len = (100U + 6 * NR_CPUS), | |
3956 | .private = FILE_EXCLUSIVE_CPULIST, | |
3957 | .flags = CFTYPE_NOT_ON_ROOT, | |
3958 | }, | |
3959 | ||
0c7f293e WL |
3960 | { |
3961 | .name = "cpus.exclusive.effective", | |
3962 | .seq_show = cpuset_common_seq_show, | |
3963 | .private = FILE_EFFECTIVE_XCPULIST, | |
3964 | .flags = CFTYPE_NOT_ON_ROOT, | |
3965 | }, | |
3966 | ||
5cf8114d WL |
3967 | { |
3968 | .name = "cpus.subpartitions", | |
3969 | .seq_show = cpuset_common_seq_show, | |
3970 | .private = FILE_SUBPARTS_CPULIST, | |
0c7f293e | 3971 | .flags = CFTYPE_ONLY_ON_ROOT | CFTYPE_DEBUG, |
5cf8114d WL |
3972 | }, |
3973 | ||
11e5f407 WL |
3974 | { |
3975 | .name = "cpus.isolated", | |
3976 | .seq_show = cpuset_common_seq_show, | |
3977 | .private = FILE_ISOLATED_CPULIST, | |
877c737d | 3978 | .flags = CFTYPE_ONLY_ON_ROOT, |
11e5f407 WL |
3979 | }, |
3980 | ||
4ec22e9c WL |
3981 | { } /* terminate */ |
3982 | }; | |
3983 | ||
3984 | ||
0a2cafe6 KB |
3985 | /** |
3986 | * cpuset_css_alloc - Allocate a cpuset css | |
3987 | * @parent_css: Parent css of the control group that the new cpuset will be | |
3988 | * part of | |
3989 | * Return: cpuset css on success, -ENOMEM on failure. | |
3990 | * | |
3991 | * Allocate and initialize a new cpuset css, for non-NULL @parent_css, return | |
3992 | * top cpuset css otherwise. | |
1da177e4 | 3993 | */ |
eb95419b TH |
3994 | static struct cgroup_subsys_state * |
3995 | cpuset_css_alloc(struct cgroup_subsys_state *parent_css) | |
1da177e4 | 3996 | { |
c8f699bb | 3997 | struct cpuset *cs; |
1da177e4 | 3998 | |
eb95419b | 3999 | if (!parent_css) |
8793d854 | 4000 | return &top_cpuset.css; |
033fa1c5 | 4001 | |
c8f699bb | 4002 | cs = kzalloc(sizeof(*cs), GFP_KERNEL); |
1da177e4 | 4003 | if (!cs) |
8793d854 | 4004 | return ERR_PTR(-ENOMEM); |
bf92370c WL |
4005 | |
4006 | if (alloc_cpumasks(cs, NULL)) { | |
4007 | kfree(cs); | |
4008 | return ERR_PTR(-ENOMEM); | |
4009 | } | |
1da177e4 | 4010 | |
ee9707e8 | 4011 | __set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); |
f9a86fcb | 4012 | nodes_clear(cs->mems_allowed); |
e2b9a3d7 | 4013 | nodes_clear(cs->effective_mems); |
3e0d98b9 | 4014 | fmeter_init(&cs->fmeter); |
1d3504fc | 4015 | cs->relax_domain_level = -1; |
181c8e09 | 4016 | INIT_LIST_HEAD(&cs->remote_sibling); |
1da177e4 | 4017 | |
ee9707e8 WL |
4018 | /* Set CS_MEMORY_MIGRATE for default hierarchy */ |
4019 | if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) | |
4020 | __set_bit(CS_MEMORY_MIGRATE, &cs->flags); | |
4021 | ||
c8f699bb TH |
4022 | return &cs->css; |
4023 | } | |
4024 | ||
eb95419b | 4025 | static int cpuset_css_online(struct cgroup_subsys_state *css) |
c8f699bb | 4026 | { |
eb95419b | 4027 | struct cpuset *cs = css_cs(css); |
c431069f | 4028 | struct cpuset *parent = parent_cs(cs); |
ae8086ce | 4029 | struct cpuset *tmp_cs; |
492eb21b | 4030 | struct cgroup_subsys_state *pos_css; |
c8f699bb TH |
4031 | |
4032 | if (!parent) | |
4033 | return 0; | |
4034 | ||
c5c63b9a | 4035 | cpus_read_lock(); |
111cd11b | 4036 | mutex_lock(&cpuset_mutex); |
5d21cc2d | 4037 | |
efeb77b2 | 4038 | set_bit(CS_ONLINE, &cs->flags); |
c8f699bb TH |
4039 | if (is_spread_page(parent)) |
4040 | set_bit(CS_SPREAD_PAGE, &cs->flags); | |
4041 | if (is_spread_slab(parent)) | |
4042 | set_bit(CS_SPREAD_SLAB, &cs->flags); | |
1da177e4 | 4043 | |
664eedde | 4044 | cpuset_inc(); |
033fa1c5 | 4045 | |
8447a0fe | 4046 | spin_lock_irq(&callback_lock); |
b8d1b8ee | 4047 | if (is_in_v2_mode()) { |
e2b9a3d7 LZ |
4048 | cpumask_copy(cs->effective_cpus, parent->effective_cpus); |
4049 | cs->effective_mems = parent->effective_mems; | |
4716909c WL |
4050 | cs->use_parent_ecpus = true; |
4051 | parent->child_ecpus_count++; | |
e2b9a3d7 | 4052 | } |
c8c92620 WL |
4053 | |
4054 | /* | |
4055 | * For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated | |
4056 | */ | |
4057 | if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && | |
4058 | !is_sched_load_balance(parent)) | |
4059 | clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); | |
4060 | ||
8447a0fe | 4061 | spin_unlock_irq(&callback_lock); |
e2b9a3d7 | 4062 | |
eb95419b | 4063 | if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) |
5d21cc2d | 4064 | goto out_unlock; |
033fa1c5 TH |
4065 | |
4066 | /* | |
4067 | * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is | |
4068 | * set. This flag handling is implemented in cgroup core for | |
f9da322e | 4069 | * historical reasons - the flag may be specified during mount. |
033fa1c5 TH |
4070 | * |
4071 | * Currently, if any sibling cpusets have exclusive cpus or mem, we | |
4072 | * refuse to clone the configuration - thereby refusing the task to | |
4073 | * be entered, and as a result refusing the sys_unshare() or | |
4074 | * clone() which initiated it. If this becomes a problem for some | |
4075 | * users who wish to allow that scenario, then this could be | |
4076 | * changed to grant parent->cpus_allowed-sibling_cpus_exclusive | |
4077 | * (and likewise for mems) to the new cgroup. | |
4078 | */ | |
ae8086ce | 4079 | rcu_read_lock(); |
492eb21b | 4080 | cpuset_for_each_child(tmp_cs, pos_css, parent) { |
ae8086ce TH |
4081 | if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) { |
4082 | rcu_read_unlock(); | |
5d21cc2d | 4083 | goto out_unlock; |
ae8086ce | 4084 | } |
033fa1c5 | 4085 | } |
ae8086ce | 4086 | rcu_read_unlock(); |
033fa1c5 | 4087 | |
8447a0fe | 4088 | spin_lock_irq(&callback_lock); |
033fa1c5 | 4089 | cs->mems_allowed = parent->mems_allowed; |
790317e1 | 4090 | cs->effective_mems = parent->mems_allowed; |
033fa1c5 | 4091 | cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); |
790317e1 | 4092 | cpumask_copy(cs->effective_cpus, parent->cpus_allowed); |
cea74465 | 4093 | spin_unlock_irq(&callback_lock); |
5d21cc2d | 4094 | out_unlock: |
111cd11b | 4095 | mutex_unlock(&cpuset_mutex); |
c5c63b9a | 4096 | cpus_read_unlock(); |
c8f699bb TH |
4097 | return 0; |
4098 | } | |
4099 | ||
0b9e6965 ZH |
4100 | /* |
4101 | * If the cpuset being removed has its flag 'sched_load_balance' | |
4102 | * enabled, then simulate turning sched_load_balance off, which | |
ee8dde0c WL |
4103 | * will call rebuild_sched_domains_locked(). That is not needed |
4104 | * in the default hierarchy where only changes in partition | |
4105 | * will cause repartitioning. | |
4106 | * | |
4107 | * If the cpuset has the 'sched.partition' flag enabled, simulate | |
4108 | * turning 'sched.partition' off. | |
0b9e6965 ZH |
4109 | */ |
4110 | ||
eb95419b | 4111 | static void cpuset_css_offline(struct cgroup_subsys_state *css) |
c8f699bb | 4112 | { |
eb95419b | 4113 | struct cpuset *cs = css_cs(css); |
c8f699bb | 4114 | |
c5c63b9a | 4115 | cpus_read_lock(); |
111cd11b | 4116 | mutex_lock(&cpuset_mutex); |
c8f699bb | 4117 | |
18065ebe | 4118 | if (is_partition_valid(cs)) |
ee8dde0c WL |
4119 | update_prstate(cs, 0); |
4120 | ||
4121 | if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && | |
4122 | is_sched_load_balance(cs)) | |
c8f699bb TH |
4123 | update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); |
4124 | ||
4716909c WL |
4125 | if (cs->use_parent_ecpus) { |
4126 | struct cpuset *parent = parent_cs(cs); | |
4127 | ||
4128 | cs->use_parent_ecpus = false; | |
4129 | parent->child_ecpus_count--; | |
4130 | } | |
4131 | ||
664eedde | 4132 | cpuset_dec(); |
efeb77b2 | 4133 | clear_bit(CS_ONLINE, &cs->flags); |
c8f699bb | 4134 | |
111cd11b | 4135 | mutex_unlock(&cpuset_mutex); |
c5c63b9a | 4136 | cpus_read_unlock(); |
1da177e4 LT |
4137 | } |
4138 | ||
eb95419b | 4139 | static void cpuset_css_free(struct cgroup_subsys_state *css) |
1da177e4 | 4140 | { |
eb95419b | 4141 | struct cpuset *cs = css_cs(css); |
1da177e4 | 4142 | |
bf92370c | 4143 | free_cpuset(cs); |
1da177e4 LT |
4144 | } |
4145 | ||
39bd0d15 LZ |
4146 | static void cpuset_bind(struct cgroup_subsys_state *root_css) |
4147 | { | |
111cd11b | 4148 | mutex_lock(&cpuset_mutex); |
8447a0fe | 4149 | spin_lock_irq(&callback_lock); |
39bd0d15 | 4150 | |
b8d1b8ee | 4151 | if (is_in_v2_mode()) { |
39bd0d15 | 4152 | cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask); |
0c7f293e | 4153 | cpumask_copy(top_cpuset.effective_xcpus, cpu_possible_mask); |
39bd0d15 LZ |
4154 | top_cpuset.mems_allowed = node_possible_map; |
4155 | } else { | |
4156 | cpumask_copy(top_cpuset.cpus_allowed, | |
4157 | top_cpuset.effective_cpus); | |
4158 | top_cpuset.mems_allowed = top_cpuset.effective_mems; | |
4159 | } | |
4160 | ||
8447a0fe | 4161 | spin_unlock_irq(&callback_lock); |
111cd11b | 4162 | mutex_unlock(&cpuset_mutex); |
39bd0d15 LZ |
4163 | } |
4164 | ||
eee87853 WL |
4165 | /* |
4166 | * In case the child is cloned into a cpuset different from its parent, | |
4167 | * additional checks are done to see if the move is allowed. | |
4168 | */ | |
4169 | static int cpuset_can_fork(struct task_struct *task, struct css_set *cset) | |
4170 | { | |
4171 | struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]); | |
4172 | bool same_cs; | |
4173 | int ret; | |
4174 | ||
4175 | rcu_read_lock(); | |
4176 | same_cs = (cs == task_cs(current)); | |
4177 | rcu_read_unlock(); | |
4178 | ||
4179 | if (same_cs) | |
4180 | return 0; | |
4181 | ||
4182 | lockdep_assert_held(&cgroup_mutex); | |
111cd11b | 4183 | mutex_lock(&cpuset_mutex); |
eee87853 WL |
4184 | |
4185 | /* Check to see if task is allowed in the cpuset */ | |
4186 | ret = cpuset_can_attach_check(cs); | |
4187 | if (ret) | |
4188 | goto out_unlock; | |
4189 | ||
2ef269ef | 4190 | ret = task_can_attach(task); |
eee87853 WL |
4191 | if (ret) |
4192 | goto out_unlock; | |
4193 | ||
4194 | ret = security_task_setscheduler(task); | |
4195 | if (ret) | |
4196 | goto out_unlock; | |
4197 | ||
4198 | /* | |
4199 | * Mark attach is in progress. This makes validate_change() fail | |
4200 | * changes which zero cpus/mems_allowed. | |
4201 | */ | |
4202 | cs->attach_in_progress++; | |
4203 | out_unlock: | |
111cd11b | 4204 | mutex_unlock(&cpuset_mutex); |
eee87853 WL |
4205 | return ret; |
4206 | } | |
4207 | ||
4208 | static void cpuset_cancel_fork(struct task_struct *task, struct css_set *cset) | |
4209 | { | |
4210 | struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]); | |
4211 | bool same_cs; | |
4212 | ||
4213 | rcu_read_lock(); | |
4214 | same_cs = (cs == task_cs(current)); | |
4215 | rcu_read_unlock(); | |
4216 | ||
4217 | if (same_cs) | |
4218 | return; | |
4219 | ||
111cd11b | 4220 | mutex_lock(&cpuset_mutex); |
eee87853 WL |
4221 | cs->attach_in_progress--; |
4222 | if (!cs->attach_in_progress) | |
4223 | wake_up(&cpuset_attach_wq); | |
111cd11b | 4224 | mutex_unlock(&cpuset_mutex); |
eee87853 WL |
4225 | } |
4226 | ||
06f4e948 ZL |
4227 | /* |
4228 | * Make sure the new task conforms to the current state of its parent, | |
4229 | * which could have been changed by cpuset just after it inherits the | |
4230 | * state from the parent and before it sits on the cgroup's task list. | |
4231 | */ | |
8a15b817 | 4232 | static void cpuset_fork(struct task_struct *task) |
06f4e948 | 4233 | { |
42a11bf5 WL |
4234 | struct cpuset *cs; |
4235 | bool same_cs; | |
4236 | ||
4237 | rcu_read_lock(); | |
4238 | cs = task_cs(task); | |
4239 | same_cs = (cs == task_cs(current)); | |
4240 | rcu_read_unlock(); | |
4241 | ||
4242 | if (same_cs) { | |
4243 | if (cs == &top_cpuset) | |
4244 | return; | |
4245 | ||
4246 | set_cpus_allowed_ptr(task, current->cpus_ptr); | |
4247 | task->mems_allowed = current->mems_allowed; | |
06f4e948 | 4248 | return; |
42a11bf5 | 4249 | } |
06f4e948 | 4250 | |
42a11bf5 | 4251 | /* CLONE_INTO_CGROUP */ |
111cd11b | 4252 | mutex_lock(&cpuset_mutex); |
42a11bf5 WL |
4253 | guarantee_online_mems(cs, &cpuset_attach_nodemask_to); |
4254 | cpuset_attach_task(cs, task); | |
eee87853 WL |
4255 | |
4256 | cs->attach_in_progress--; | |
4257 | if (!cs->attach_in_progress) | |
4258 | wake_up(&cpuset_attach_wq); | |
4259 | ||
111cd11b | 4260 | mutex_unlock(&cpuset_mutex); |
06f4e948 ZL |
4261 | } |
4262 | ||
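The can_fork/cancel_fork/fork callbacks registered just below are exercised when a child is created directly inside a target cgroup via clone3() with CLONE_INTO_CGROUP. A minimal userspace sketch of that path (not part of this file); the cgroup path /sys/fs/cgroup/mypart and the availability of SYS_clone3 and struct clone_args in the system headers are assumptions:

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/sched.h>	/* struct clone_args, CLONE_INTO_CGROUP */
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	/* Assumed pre-configured cpuset cgroup (cgroup v2, cpuset enabled). */
	int cgfd = open("/sys/fs/cgroup/mypart", O_RDONLY | O_DIRECTORY);
	struct clone_args args;
	long pid;

	if (cgfd < 0) {
		perror("open cgroup dir");
		return 1;
	}
	memset(&args, 0, sizeof(args));
	args.flags = CLONE_INTO_CGROUP;
	args.exit_signal = SIGCHLD;
	args.cgroup = cgfd;

	pid = syscall(SYS_clone3, &args, sizeof(args));
	if (pid < 0) {
		perror("clone3");	/* e.g. the fork was rejected on the cpuset side */
		return 1;
	}
	if (pid == 0) {
		/* Child starts life already attached to the target cpuset. */
		execlp("cat", "cat", "/proc/self/cpuset", (char *)NULL);
		_exit(127);
	}
	return 0;
}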
073219e9 | 4263 | struct cgroup_subsys cpuset_cgrp_subsys = { |
39bd0d15 LZ |
4264 | .css_alloc = cpuset_css_alloc, |
4265 | .css_online = cpuset_css_online, | |
4266 | .css_offline = cpuset_css_offline, | |
4267 | .css_free = cpuset_css_free, | |
4268 | .can_attach = cpuset_can_attach, | |
4269 | .cancel_attach = cpuset_cancel_attach, | |
4270 | .attach = cpuset_attach, | |
5cf1cacb | 4271 | .post_attach = cpuset_post_attach, |
39bd0d15 | 4272 | .bind = cpuset_bind, |
eee87853 WL |
4273 | .can_fork = cpuset_can_fork, |
4274 | .cancel_fork = cpuset_cancel_fork, | |
06f4e948 | 4275 | .fork = cpuset_fork, |
4ec22e9c WL |
4276 | .legacy_cftypes = legacy_files, |
4277 | .dfl_cftypes = dfl_files, | |
b38e42e9 | 4278 | .early_init = true, |
4ec22e9c | 4279 | .threaded = true, |
8793d854 PM |
4280 | }; |
4281 | ||
1da177e4 LT |
4282 | /** |
4283 | * cpuset_init - initialize cpusets at system boot | |
4284 | * | |
d5f68d33 | 4285 | * Description: Initialize top_cpuset |
1da177e4 LT |
4286 | **/ |
4287 | ||
4288 | int __init cpuset_init(void) | |
4289 | { | |
75fa8e5d NMG |
4290 | BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)); |
4291 | BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)); | |
0c7f293e | 4292 | BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_xcpus, GFP_KERNEL)); |
e2ffe502 | 4293 | BUG_ON(!alloc_cpumask_var(&top_cpuset.exclusive_cpus, GFP_KERNEL)); |
0c7f293e | 4294 | BUG_ON(!zalloc_cpumask_var(&subpartitions_cpus, GFP_KERNEL)); |
11e5f407 | 4295 | BUG_ON(!zalloc_cpumask_var(&isolated_cpus, GFP_KERNEL)); |
58568d2a | 4296 | |
300ed6cb | 4297 | cpumask_setall(top_cpuset.cpus_allowed); |
f9a86fcb | 4298 | nodes_setall(top_cpuset.mems_allowed); |
e2b9a3d7 | 4299 | cpumask_setall(top_cpuset.effective_cpus); |
0c7f293e | 4300 | cpumask_setall(top_cpuset.effective_xcpus); |
e2ffe502 | 4301 | cpumask_setall(top_cpuset.exclusive_cpus); |
e2b9a3d7 | 4302 | nodes_setall(top_cpuset.effective_mems); |
1da177e4 | 4303 | |
3e0d98b9 | 4304 | fmeter_init(&top_cpuset.fmeter); |
181c8e09 | 4305 | INIT_LIST_HEAD(&remote_children); |
1da177e4 | 4306 | |
75fa8e5d | 4307 | BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)); |
2341d1b6 | 4308 | |
8793d854 | 4309 | return 0; |
1da177e4 LT |
4310 | } |
4311 | ||
b1aac8bb | 4312 | /* |
cf417141 | 4313 | * If CPU and/or memory hotplug handlers, below, unplug any CPUs |
b1aac8bb PJ |
4314 | * or memory nodes, we need to walk over the cpuset hierarchy, |
4315 | * removing that CPU or node from all cpusets. If this removes the | |
956db3ca CW |
4316 | * last CPU or node from a cpuset, then move the tasks in the empty |
4317 | * cpuset to its next-highest non-empty parent. | |
b1aac8bb | 4318 | */ |
956db3ca CW |
4319 | static void remove_tasks_in_empty_cpuset(struct cpuset *cs) |
4320 | { | |
4321 | struct cpuset *parent; | |
4322 | ||
956db3ca CW |
4323 | /* |
4324 | * Find its next-highest non-empty parent, (top cpuset | |
4325 | * has online cpus, so can't be empty). | |
4326 | */ | |
c431069f | 4327 | parent = parent_cs(cs); |
300ed6cb | 4328 | while (cpumask_empty(parent->cpus_allowed) || |
b4501295 | 4329 | nodes_empty(parent->mems_allowed)) |
c431069f | 4330 | parent = parent_cs(parent); |
956db3ca | 4331 | |
8cc99345 | 4332 | if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) { |
12d3089c | 4333 | pr_err("cpuset: failed to transfer tasks out of empty cpuset "); |
e61734c5 TH |
4334 | pr_cont_cgroup_name(cs->css.cgroup); |
4335 | pr_cont("\n"); | |
8cc99345 | 4336 | } |
956db3ca CW |
4337 | } |
4338 | ||
2125c003 WL |
4339 | static void cpuset_migrate_tasks_workfn(struct work_struct *work) |
4340 | { | |
4341 | struct cpuset_remove_tasks_struct *s; | |
4342 | ||
4343 | s = container_of(work, struct cpuset_remove_tasks_struct, work); | |
4344 | remove_tasks_in_empty_cpuset(s->cs); | |
4345 | css_put(&s->cs->css); | |
4346 | kfree(s); | |
4347 | } | |
4348 | ||
be4c9dd7 LZ |
4349 | static void |
4350 | hotplug_update_tasks_legacy(struct cpuset *cs, | |
4351 | struct cpumask *new_cpus, nodemask_t *new_mems, | |
4352 | bool cpus_updated, bool mems_updated) | |
390a36aa LZ |
4353 | { |
4354 | bool is_empty; | |
4355 | ||
8447a0fe | 4356 | spin_lock_irq(&callback_lock); |
be4c9dd7 LZ |
4357 | cpumask_copy(cs->cpus_allowed, new_cpus); |
4358 | cpumask_copy(cs->effective_cpus, new_cpus); | |
4359 | cs->mems_allowed = *new_mems; | |
4360 | cs->effective_mems = *new_mems; | |
8447a0fe | 4361 | spin_unlock_irq(&callback_lock); |
390a36aa LZ |
4362 | |
4363 | /* | |
4364 | * Don't call update_tasks_cpumask() if the cpuset becomes empty, | |
f9da322e | 4365 | * as the tasks will be migrated to an ancestor. |
390a36aa | 4366 | */ |
be4c9dd7 | 4367 | if (cpus_updated && !cpumask_empty(cs->cpus_allowed)) |
7a2127e6 | 4368 | update_tasks_cpumask(cs, new_cpus); |
be4c9dd7 | 4369 | if (mems_updated && !nodes_empty(cs->mems_allowed)) |
390a36aa LZ |
4370 | update_tasks_nodemask(cs); |
4371 | ||
4372 | is_empty = cpumask_empty(cs->cpus_allowed) || | |
4373 | nodes_empty(cs->mems_allowed); | |
4374 | ||
390a36aa LZ |
4375 | /* |
4376 | * Move tasks to the nearest ancestor with execution resources. | |
4377 | * This is a full cgroup operation which will also call back into | |
2125c003 | 4378 | * cpuset. Execute it asynchronously using workqueue. |
390a36aa | 4379 | */ |
2125c003 WL |
4380 | if (is_empty && cs->css.cgroup->nr_populated_csets && |
4381 | css_tryget_online(&cs->css)) { | |
4382 | struct cpuset_remove_tasks_struct *s; | |
4383 | ||
4384 | s = kzalloc(sizeof(*s), GFP_KERNEL); | |
4385 | if (WARN_ON_ONCE(!s)) { | |
4386 | css_put(&cs->css); | |
4387 | return; | |
4388 | } | |
4389 | ||
4390 | s->cs = cs; | |
4391 | INIT_WORK(&s->work, cpuset_migrate_tasks_workfn); | |
4392 | schedule_work(&s->work); | |
48f07456 | 4393 | } |
390a36aa LZ |
4394 | } |
4395 | ||
be4c9dd7 LZ |
4396 | static void |
4397 | hotplug_update_tasks(struct cpuset *cs, | |
4398 | struct cpumask *new_cpus, nodemask_t *new_mems, | |
4399 | bool cpus_updated, bool mems_updated) | |
390a36aa | 4400 | { |
e2d59900 WL |
4401 | /* A partition root is allowed to have empty effective cpus */ |
4402 | if (cpumask_empty(new_cpus) && !is_partition_valid(cs)) | |
be4c9dd7 LZ |
4403 | cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus); |
4404 | if (nodes_empty(*new_mems)) | |
4405 | *new_mems = parent_cs(cs)->effective_mems; | |
4406 | ||
8447a0fe | 4407 | spin_lock_irq(&callback_lock); |
be4c9dd7 LZ |
4408 | cpumask_copy(cs->effective_cpus, new_cpus); |
4409 | cs->effective_mems = *new_mems; | |
8447a0fe | 4410 | spin_unlock_irq(&callback_lock); |
390a36aa | 4411 | |
be4c9dd7 | 4412 | if (cpus_updated) |
7a2127e6 | 4413 | update_tasks_cpumask(cs, new_cpus); |
be4c9dd7 | 4414 | if (mems_updated) |
390a36aa LZ |
4415 | update_tasks_nodemask(cs); |
4416 | } | |
4417 | ||
4b842da2 WL |
4418 | static bool force_rebuild; |
4419 | ||
4420 | void cpuset_force_rebuild(void) | |
4421 | { | |
4422 | force_rebuild = true; | |
4423 | } | |
4424 | ||
deb7aa30 | 4425 | /** |
388afd85 | 4426 | * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug |
deb7aa30 | 4427 | * @cs: cpuset in interest |
4b842da2 | 4428 | * @tmp: the tmpmasks structure pointer |
956db3ca | 4429 | * |
deb7aa30 TH |
4430 | * Compare @cs's cpu and mem masks against top_cpuset and if some have gone |
4431 | * offline, update @cs accordingly. If @cs ends up with no CPU or memory, | |
4432 | * all its tasks are moved to the nearest ancestor with both resources. | |
80d1fa64 | 4433 | */ |
4b842da2 | 4434 | static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp) |
80d1fa64 | 4435 | { |
be4c9dd7 LZ |
4436 | static cpumask_t new_cpus; |
4437 | static nodemask_t new_mems; | |
4438 | bool cpus_updated; | |
4439 | bool mems_updated; | |
181c8e09 | 4440 | bool remote; |
72c6303a | 4441 | int partcmd = -1; |
4b842da2 | 4442 | struct cpuset *parent; |
e44193d3 LZ |
4443 | retry: |
4444 | wait_event(cpuset_attach_wq, cs->attach_in_progress == 0); | |
80d1fa64 | 4445 | |
111cd11b | 4446 | mutex_lock(&cpuset_mutex); |
7ddf96b0 | 4447 | |
e44193d3 LZ |
4448 | /* |
4449 | * We have raced with task attaching. We wait until attaching | |
4450 | * is finished, so we won't attach a task to an empty cpuset. | |
4451 | */ | |
4452 | if (cs->attach_in_progress) { | |
111cd11b | 4453 | mutex_unlock(&cpuset_mutex); |
e44193d3 LZ |
4454 | goto retry; |
4455 | } | |
4456 | ||
0f3adb8a | 4457 | parent = parent_cs(cs); |
4b842da2 WL |
4458 | compute_effective_cpumask(&new_cpus, cs, parent); |
4459 | nodes_and(new_mems, cs->mems_allowed, parent->effective_mems); | |
4460 | ||
4b842da2 WL |
4461 | if (!tmp || !cs->partition_root_state) |
4462 | goto update_tasks; | |
80d1fa64 | 4463 | |
4b842da2 | 4464 | /* |
0c7f293e WL |
4465 | * Compute effective_cpus for valid partition root, may invalidate |
4466 | * child partition roots if necessary. | |
4b842da2 | 4467 | */ |
181c8e09 WL |
4468 | remote = is_remote_partition(cs); |
4469 | if (remote || (is_partition_valid(cs) && is_partition_valid(parent))) | |
0c7f293e | 4470 | compute_partition_effective_cpumask(cs, &new_cpus); |
f0af1bfc | 4471 | |
181c8e09 | 4472 | if (remote && cpumask_empty(&new_cpus) && |
2125c003 | 4473 | partition_is_populated(cs, NULL)) { |
181c8e09 WL |
4474 | remote_partition_disable(cs, tmp); |
4475 | compute_effective_cpumask(&new_cpus, cs, parent); | |
4476 | remote = false; | |
4477 | cpuset_force_rebuild(); | |
4478 | } | |
4479 | ||
f0af1bfc WL |
4480 | /* |
4481 | * Force the partition to become invalid if either one of | |
4482 | * the following conditions hold: | |
4483 | * 1) empty effective cpus but not valid empty partition. | |
4484 | * 2) parent is invalid or doesn't grant any cpus to child | |
4485 | * partitions. | |
4486 | */ | |
181c8e09 | 4487 | if (is_local_partition(cs) && (!is_partition_valid(parent) || |
72c6303a WL |
4488 | tasks_nocpu_error(parent, cs, &new_cpus))) |
4489 | partcmd = partcmd_invalidate; | |
4b842da2 | 4490 | /* |
18065ebe | 4491 | * On the other hand, an invalid partition root may be transitioned |
f0af1bfc | 4492 | * back to a regular one. |
4b842da2 | 4493 | */ |
72c6303a WL |
4494 | else if (is_partition_valid(parent) && is_partition_invalid(cs)) |
4495 | partcmd = partcmd_update; | |
4496 | ||
72c6303a | 4497 | if (partcmd >= 0) { |
72c6303a | 4498 | update_parent_effective_cpumask(cs, partcmd, NULL, tmp); |
72c6303a | 4499 | if ((partcmd == partcmd_invalidate) || is_partition_valid(cs)) { |
0c7f293e | 4500 | compute_partition_effective_cpumask(cs, &new_cpus); |
f0af1bfc | 4501 | cpuset_force_rebuild(); |
0c7f293e | 4502 | } |
f0af1bfc | 4503 | } |
4b842da2 WL |
4504 | |
4505 | update_tasks: | |
be4c9dd7 LZ |
4506 | cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus); |
4507 | mems_updated = !nodes_equal(new_mems, cs->effective_mems); | |
df59b72c WL |
4508 | if (!cpus_updated && !mems_updated) |
4509 | goto unlock; /* Hotplug doesn't affect this cpuset */ | |
deb7aa30 | 4510 | |
8ca1b5a4 FT |
4511 | if (mems_updated) |
4512 | check_insane_mems_config(&new_mems); | |
4513 | ||
b8d1b8ee | 4514 | if (is_in_v2_mode()) |
be4c9dd7 LZ |
4515 | hotplug_update_tasks(cs, &new_cpus, &new_mems, |
4516 | cpus_updated, mems_updated); | |
390a36aa | 4517 | else |
be4c9dd7 LZ |
4518 | hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems, |
4519 | cpus_updated, mems_updated); | |
8d033948 | 4520 | |
df59b72c | 4521 | unlock: |
111cd11b | 4522 | mutex_unlock(&cpuset_mutex); |
b1aac8bb PJ |
4523 | } |
4524 | ||
deb7aa30 | 4525 | /** |
2125c003 | 4526 | * cpuset_handle_hotplug - handle CPU/memory hot{,un}plug for a cpuset |
956db3ca | 4527 | * |
deb7aa30 TH |
4528 | * This function is called after either CPU or memory configuration has |
4529 | * changed and updates cpuset accordingly. The top_cpuset is always | |
4530 | * synchronized to cpu_active_mask and N_MEMORY, which is necessary in | |
4531 | * order to make cpusets transparent (of no effect) on systems that are | |
4532 | * actively using CPU hotplug but making no active use of cpusets. | |
956db3ca | 4533 | * |
deb7aa30 | 4534 | * Non-root cpusets are only affected by offlining. If any CPUs or memory |
388afd85 LZ |
4535 | * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on |
4536 | * all descendants. | |
956db3ca | 4537 | * |
deb7aa30 TH |
4538 | * Note that CPU offlining during suspend is ignored. We don't modify |
4539 | * cpusets across suspend/resume cycles at all. | |
2125c003 WL |
4540 | * |
4541 | * CPU / memory hotplug is handled synchronously. | |
956db3ca | 4542 | */ |
2125c003 | 4543 | static void cpuset_handle_hotplug(void) |
b1aac8bb | 4544 | { |
5c5cc623 LZ |
4545 | static cpumask_t new_cpus; |
4546 | static nodemask_t new_mems; | |
deb7aa30 | 4547 | bool cpus_updated, mems_updated; |
b8d1b8ee | 4548 | bool on_dfl = is_in_v2_mode(); |
4b842da2 WL |
4549 | struct tmpmasks tmp, *ptmp = NULL; |
4550 | ||
4551 | if (on_dfl && !alloc_cpumasks(NULL, &tmp)) | |
4552 | ptmp = &tmp; | |
b1aac8bb | 4553 | |
2125c003 | 4554 | lockdep_assert_cpus_held(); |
111cd11b | 4555 | mutex_lock(&cpuset_mutex); |
956db3ca | 4556 | |
deb7aa30 TH |
4557 | /* fetch the available cpus/mems and find out which changed how */ |
4558 | cpumask_copy(&new_cpus, cpu_active_mask); | |
4559 | new_mems = node_states[N_MEMORY]; | |
7ddf96b0 | 4560 | |
4b842da2 | 4561 | /* |
0c7f293e WL |
4562 | * If subpartitions_cpus is populated, it is likely that the check |
4563 | * below will produce a false positive on cpus_updated when the cpu | |
4564 | * list isn't changed. It is extra work, but it is better to be safe. | |
4b842da2 | 4565 | */ |
0c7f293e WL |
4566 | cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus) || |
4567 | !cpumask_empty(subpartitions_cpus); | |
7e88291b | 4568 | mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems); |
7ddf96b0 | 4569 | |
15d428e6 | 4570 | /* |
0c7f293e WL |
4571 | * In the rare case that hotplug removes all the cpus in |
4572 | * subpartitions_cpus, we assume that cpus are updated. | |
15d428e6 | 4573 | */ |
0c7f293e | 4574 | if (!cpus_updated && top_cpuset.nr_subparts) |
15d428e6 WL |
4575 | cpus_updated = true; |
4576 | ||
0c7f293e | 4577 | /* For v1, synchronize cpus_allowed to cpu_active_mask */ |
deb7aa30 | 4578 | if (cpus_updated) { |
8447a0fe | 4579 | spin_lock_irq(&callback_lock); |
7e88291b LZ |
4580 | if (!on_dfl) |
4581 | cpumask_copy(top_cpuset.cpus_allowed, &new_cpus); | |
4b842da2 WL |
4582 | /* |
4583 | * Make sure that CPUs allocated to child partitions | |
4584 | * do not show up in effective_cpus. If no CPU is left, | |
0c7f293e | 4585 | * we clear the subpartitions_cpus & let the child partitions |
4b842da2 WL |
4586 | * fight for the CPUs again. |
4587 | */ | |
0c7f293e WL |
4588 | if (!cpumask_empty(subpartitions_cpus)) { |
4589 | if (cpumask_subset(&new_cpus, subpartitions_cpus)) { | |
4590 | top_cpuset.nr_subparts = 0; | |
4591 | cpumask_clear(subpartitions_cpus); | |
4b842da2 WL |
4592 | } else { |
4593 | cpumask_andnot(&new_cpus, &new_cpus, | |
0c7f293e | 4594 | subpartitions_cpus); |
4b842da2 WL |
4595 | } |
4596 | } | |
1344ab9c | 4597 | cpumask_copy(top_cpuset.effective_cpus, &new_cpus); |
8447a0fe | 4598 | spin_unlock_irq(&callback_lock); |
deb7aa30 TH |
4599 | /* we don't mess with cpumasks of tasks in top_cpuset */ |
4600 | } | |
b4501295 | 4601 | |
deb7aa30 TH |
4602 | /* synchronize mems_allowed to N_MEMORY */ |
4603 | if (mems_updated) { | |
8447a0fe | 4604 | spin_lock_irq(&callback_lock); |
7e88291b LZ |
4605 | if (!on_dfl) |
4606 | top_cpuset.mems_allowed = new_mems; | |
1344ab9c | 4607 | top_cpuset.effective_mems = new_mems; |
8447a0fe | 4608 | spin_unlock_irq(&callback_lock); |
d66393e5 | 4609 | update_tasks_nodemask(&top_cpuset); |
deb7aa30 | 4610 | } |
b4501295 | 4611 | |
111cd11b | 4612 | mutex_unlock(&cpuset_mutex); |
388afd85 | 4613 | |
5c5cc623 LZ |
4614 | /* if cpus or mems changed, we need to propagate to descendants */ |
4615 | if (cpus_updated || mems_updated) { | |
deb7aa30 | 4616 | struct cpuset *cs; |
492eb21b | 4617 | struct cgroup_subsys_state *pos_css; |
f9b4fb8d | 4618 | |
fc560a26 | 4619 | rcu_read_lock(); |
492eb21b | 4620 | cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { |
ec903c0c | 4621 | if (cs == &top_cpuset || !css_tryget_online(&cs->css)) |
388afd85 LZ |
4622 | continue; |
4623 | rcu_read_unlock(); | |
7ddf96b0 | 4624 | |
4b842da2 | 4625 | cpuset_hotplug_update_tasks(cs, ptmp); |
b4501295 | 4626 | |
388afd85 LZ |
4627 | rcu_read_lock(); |
4628 | css_put(&cs->css); | |
4629 | } | |
4630 | rcu_read_unlock(); | |
4631 | } | |
8d033948 | 4632 | |
deb7aa30 | 4633 | /* rebuild sched domains if cpus_allowed has changed */ |
50e76632 PZ |
4634 | if (cpus_updated || force_rebuild) { |
4635 | force_rebuild = false; | |
2125c003 | 4636 | rebuild_sched_domains_cpuslocked(); |
50e76632 | 4637 | } |
4b842da2 WL |
4638 | |
4639 | free_cpumasks(NULL, ptmp); | |
b1aac8bb PJ |
4640 | } |
4641 | ||
2b729fe7 | 4642 | void cpuset_update_active_cpus(void) |
4c4d50f7 | 4643 | { |
2b729fe7 TH |
4644 | /* |
4645 | * We're inside the cpu hotplug critical region which usually nests | |
4646 | * inside cgroup synchronization. The hotplug event is handled | |
4647 | * synchronously here by cpuset_handle_hotplug(). | |
4648 | */ | |
2125c003 | 4649 | cpuset_handle_hotplug(); |
50e76632 PZ |
4650 | } |
4651 | ||
38837fc7 | 4652 | /* |
38d7bee9 LJ |
4653 | * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY]. |
4654 | * Call this routine anytime after node_states[N_MEMORY] changes. | |
a1cd2b13 | 4655 | * See cpuset_update_active_cpus() for CPU hotplug handling. |
38837fc7 | 4656 | */ |
f481891f MX |
4657 | static int cpuset_track_online_nodes(struct notifier_block *self, |
4658 | unsigned long action, void *arg) | |
38837fc7 | 4659 | { |
2125c003 | 4660 | cpuset_handle_hotplug(); |
f481891f | 4661 | return NOTIFY_OK; |
38837fc7 | 4662 | } |
d8f10cb3 | 4663 | |
1da177e4 LT |
4664 | /** |
4665 | * cpuset_init_smp - initialize cpus_allowed | |
4666 | * | |
4667 | * Description: Finish top cpuset after cpu, node maps are initialized | |
d8f10cb3 | 4668 | */ |
1da177e4 LT |
4669 | void __init cpuset_init_smp(void) |
4670 | { | |
2685027f WL |
4671 | /* |
4672 | * cpus_allowed/mems_allowed set to v2 values in the initial | |
4673 | * cpuset_bind() call will be reset to v1 values in another | |
4674 | * cpuset_bind() call when v1 cpuset is mounted. | |
4675 | */ | |
33ad801d | 4676 | top_cpuset.old_mems_allowed = top_cpuset.mems_allowed; |
4c4d50f7 | 4677 | |
e2b9a3d7 LZ |
4678 | cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask); |
4679 | top_cpuset.effective_mems = node_states[N_MEMORY]; | |
4680 | ||
1eeaa4fd | 4681 | hotplug_memory_notifier(cpuset_track_online_nodes, CPUSET_CALLBACK_PRI); |
e93ad19d TH |
4682 | |
4683 | cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0); | |
4684 | BUG_ON(!cpuset_migrate_mm_wq); | |
1da177e4 LT |
4685 | } |
4686 | ||
4687 | /** | |
1da177e4 LT |
4688 | * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset. |
4689 | * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. | |
6af866af | 4690 | * @pmask: pointer to struct cpumask variable to receive cpus_allowed set. |
1da177e4 | 4691 | * |
300ed6cb | 4692 | * Description: Returns the cpumask_var_t cpus_allowed of the cpuset |
1da177e4 | 4693 | * attached to the specified @tsk. Guaranteed to return some non-empty |
5f054e31 | 4694 | * subset of cpu_online_mask, even if this means going outside the |
3fb906e7 | 4695 | * tasks cpuset, except when the task is in the top cpuset. |
1da177e4 LT |
4696 | **/ |
4697 | ||
6af866af | 4698 | void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) |
1da177e4 | 4699 | { |
8447a0fe | 4700 | unsigned long flags; |
3fb906e7 | 4701 | struct cpuset *cs; |
8447a0fe VD |
4702 | |
4703 | spin_lock_irqsave(&callback_lock, flags); | |
3fb906e7 WL |
4704 | rcu_read_lock(); |
4705 | ||
4706 | cs = task_cs(tsk); | |
4707 | if (cs != &top_cpuset) | |
4708 | guarantee_online_cpus(tsk, pmask); | |
4709 | /* | |
4710 | * Tasks in the top cpuset won't get updates to their cpumasks | |
4711 | * when a hotplug online/offline event happens. So we include all | |
4712 | * offline cpus in the allowed cpu list. | |
4713 | */ | |
4714 | if ((cs == &top_cpuset) || cpumask_empty(pmask)) { | |
4715 | const struct cpumask *possible_mask = task_cpu_possible_mask(tsk); | |
4716 | ||
4717 | /* | |
4718 | * We first exclude cpus allocated to partitions. If there is no | |
4719 | * allowable online cpu left, we fall back to all possible cpus. | |
4720 | */ | |
0c7f293e | 4721 | cpumask_andnot(pmask, possible_mask, subpartitions_cpus); |
3fb906e7 WL |
4722 | if (!cpumask_intersects(pmask, cpu_online_mask)) |
4723 | cpumask_copy(pmask, possible_mask); | |
4724 | } | |
4725 | ||
4726 | rcu_read_unlock(); | |
8447a0fe | 4727 | spin_unlock_irqrestore(&callback_lock, flags); |
1da177e4 LT |
4728 | } |
4729 | ||
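On the userspace side, the mask cpuset_cpus_allowed() feeds into affinity handling is what a task observes through sched_getaffinity(); the reported mask is the task's current affinity, which its cpuset constrains. A minimal sketch (not part of this file):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	cpu_set_t set;
	int cpu;

	CPU_ZERO(&set);
	if (sched_getaffinity(0, sizeof(set), &set)) {	/* 0 == current task */
		perror("sched_getaffinity");
		return 1;
	}
	for (cpu = 0; cpu < CPU_SETSIZE; cpu++)
		if (CPU_ISSET(cpu, &set))
			printf("allowed cpu %d\n", cpu);
	return 0;
}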
d477f8c2 JS |
4730 | /** |
4731 | * cpuset_cpus_allowed_fallback - final fallback before complete catastrophe. | |
4732 | * @tsk: pointer to task_struct with which the scheduler is struggling | |
4733 | * | |
4734 | * Description: In the case that the scheduler cannot find an allowed cpu in | |
4735 | * tsk->cpus_allowed, we fall back to task_cs(tsk)->cpus_allowed. In legacy | |
4736 | * mode however, this value is the same as task_cs(tsk)->effective_cpus, | |
4737 | * which will not contain a sane cpumask during cases such as cpu hotplugging. | |
4738 | * This is the absolute last resort for the scheduler and it is only used if | |
4739 | * _every_ other avenue has been traveled. | |
97c0054d WD |
4740 | * |
4741 | * Returns true if the affinity of @tsk was changed, false otherwise. | |
d477f8c2 JS |
4742 | **/ |
4743 | ||
97c0054d | 4744 | bool cpuset_cpus_allowed_fallback(struct task_struct *tsk) |
9084bb82 | 4745 | { |
d4b96fb9 WD |
4746 | const struct cpumask *possible_mask = task_cpu_possible_mask(tsk); |
4747 | const struct cpumask *cs_mask; | |
97c0054d | 4748 | bool changed = false; |
d4b96fb9 | 4749 | |
9084bb82 | 4750 | rcu_read_lock(); |
d4b96fb9 | 4751 | cs_mask = task_cs(tsk)->cpus_allowed; |
97c0054d | 4752 | if (is_in_v2_mode() && cpumask_subset(cs_mask, possible_mask)) { |
d4b96fb9 | 4753 | do_set_cpus_allowed(tsk, cs_mask); |
97c0054d WD |
4754 | changed = true; |
4755 | } | |
9084bb82 ON |
4756 | rcu_read_unlock(); |
4757 | ||
4758 | /* | |
4759 | * We own tsk->cpus_allowed, nobody can change it under us. | |
4760 | * | |
4761 | * But we used cs && cs->cpus_allowed lockless and thus can | |
4762 | * race with cgroup_attach_task() or update_cpumask() and get | |
4763 | * the wrong tsk->cpus_allowed. However, both cases imply the | |
4764 | * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr() | |
4765 | * which takes task_rq_lock(). | |
4766 | * | |
4767 | * If we are called after it dropped the lock we must see all | |
4768 | * changes in task_cs()->cpus_allowed. Otherwise we can temporarily | |
4769 | * set any mask even if it is not right from task_cs() pov, | |
4770 | * the pending set_cpus_allowed_ptr() will fix things. | |
2baab4e9 PZ |
4771 | * |
4772 | * select_fallback_rq() will fix things up and set cpu_possible_mask | |
4773 | * if required. | |
9084bb82 | 4774 | */ |
97c0054d | 4775 | return changed; |
9084bb82 ON |
4776 | } |
4777 | ||
8f4ab07f | 4778 | void __init cpuset_init_current_mems_allowed(void) |
1da177e4 | 4779 | { |
f9a86fcb | 4780 | nodes_setall(current->mems_allowed); |
1da177e4 LT |
4781 | } |
4782 | ||
909d75a3 PJ |
4783 | /** |
4784 | * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset. | |
4785 | * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed. | |
4786 | * | |
4787 | * Description: Returns the nodemask_t mems_allowed of the cpuset | |
4788 | * attached to the specified @tsk. Guaranteed to return some non-empty | |
38d7bee9 | 4789 | * subset of node_states[N_MEMORY], even if this means going outside the |
909d75a3 PJ |
4790 | * tasks cpuset. |
4791 | **/ | |
4792 | ||
4793 | nodemask_t cpuset_mems_allowed(struct task_struct *tsk) | |
4794 | { | |
4795 | nodemask_t mask; | |
8447a0fe | 4796 | unsigned long flags; |
909d75a3 | 4797 | |
8447a0fe | 4798 | spin_lock_irqsave(&callback_lock, flags); |
b8dadcb5 | 4799 | rcu_read_lock(); |
ae1c8023 | 4800 | guarantee_online_mems(task_cs(tsk), &mask); |
b8dadcb5 | 4801 | rcu_read_unlock(); |
8447a0fe | 4802 | spin_unlock_irqrestore(&callback_lock, flags); |
909d75a3 PJ |
4803 | |
4804 | return mask; | |
4805 | } | |
4806 | ||
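Userspace can ask for the same information with get_mempolicy(MPOL_F_MEMS_ALLOWED), which reports the memory nodes the calling thread may allocate from (its mems_allowed). A minimal sketch, assuming the libnuma <numaif.h> wrapper is available (link with -lnuma):

#include <numaif.h>
#include <stdio.h>

int main(void)
{
	unsigned long mask[16] = { 0 };		/* room for 1024 nodes */
	unsigned long maxnode = 8 * sizeof(mask);
	unsigned long node;
	int mode = 0;

	if (get_mempolicy(&mode, mask, maxnode, NULL, MPOL_F_MEMS_ALLOWED)) {
		perror("get_mempolicy");
		return 1;
	}
	for (node = 0; node < maxnode; node++)
		if (mask[node / (8 * sizeof(long))] &
		    (1UL << (node % (8 * sizeof(long)))))
			printf("allowed mem node %lu\n", node);
	return 0;
}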
d9fd8a6d | 4807 | /** |
08b2b6fd | 4808 | * cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed |
19770b32 | 4809 | * @nodemask: the nodemask to be checked |
d9fd8a6d | 4810 | * |
19770b32 | 4811 | * Are any of the nodes in the nodemask allowed in current->mems_allowed? |
1da177e4 | 4812 | */ |
19770b32 | 4813 | int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) |
1da177e4 | 4814 | { |
19770b32 | 4815 | return nodes_intersects(*nodemask, current->mems_allowed); |
1da177e4 LT |
4816 | } |
4817 | ||
9bf2229f | 4818 | /* |
78608366 PM |
4819 | * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or |
4820 | * mem_hardwall ancestor to the specified cpuset. Call holding | |
8447a0fe | 4821 | * callback_lock. If no ancestor is mem_exclusive or mem_hardwall |
78608366 | 4822 | * (an unusual configuration), then returns the root cpuset. |
9bf2229f | 4823 | */ |
c9710d80 | 4824 | static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) |
9bf2229f | 4825 | { |
c431069f TH |
4826 | while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs)) |
4827 | cs = parent_cs(cs); | |
9bf2229f PJ |
4828 | return cs; |
4829 | } | |
4830 | ||
c70cd039 | 4831 | /* |
8e464522 | 4832 | * cpuset_node_allowed - Can we allocate on a memory node? |
a1bc5a4e | 4833 | * @node: is this an allowed node? |
02a0e53d | 4834 | * @gfp_mask: memory allocation flags |
d9fd8a6d | 4835 | * |
6e276d2a DR |
4836 | * If we're in interrupt, yes, we can always allocate. If @node is set in |
4837 | * current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this | |
4838 | * node is set in the nearest hardwalled cpuset ancestor to current's cpuset, | |
da99ecf1 | 4839 | * yes. If current has access to memory reserves as an oom victim, yes. |
9bf2229f PJ |
4840 | * Otherwise, no. |
4841 | * | |
4842 | * GFP_USER allocations are marked with the __GFP_HARDWALL bit, | |
c596d9f3 | 4843 | * and do not allow allocations outside the current tasks cpuset |
da99ecf1 | 4844 | * unless the task has been OOM killed. |
9bf2229f | 4845 | * GFP_KERNEL allocations are not so marked, so can escape to the |
78608366 | 4846 | * nearest enclosing hardwalled ancestor cpuset. |
9bf2229f | 4847 | * |
8447a0fe | 4848 | * Scanning up parent cpusets requires callback_lock. The |
02a0e53d PJ |
4849 | * __alloc_pages() routine only calls here with __GFP_HARDWALL bit |
4850 | * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the | |
4851 | * current tasks mems_allowed came up empty on the first pass over | |
4852 | * the zonelist. So only GFP_KERNEL allocations, if all nodes in the | |
8447a0fe | 4853 | * cpuset are short of memory, might require taking the callback_lock. |
9bf2229f | 4854 | * |
36be57ff | 4855 | * The first call here from mm/page_alloc:get_page_from_freelist() |
02a0e53d PJ |
4856 | * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets, |
4857 | * so no allocation on a node outside the cpuset is allowed (unless | |
4858 | * in interrupt, of course). | |
36be57ff PJ |
4859 | * |
4860 | * The second pass through get_page_from_freelist() doesn't even call | |
4861 | * here for GFP_ATOMIC calls. For those calls, the __alloc_pages() | |
4862 | * variable 'wait' is not set, and the bit ALLOC_CPUSET is not set | |
4863 | * in alloc_flags. That logic and the checks below have the combined | |
4864 | * effect that: | |
9bf2229f PJ |
4865 | * in_interrupt - any node ok (current task context irrelevant) |
4866 | * GFP_ATOMIC - any node ok | |
da99ecf1 | 4867 | * tsk_is_oom_victim - any node ok |
78608366 | 4868 | * GFP_KERNEL - any node in enclosing hardwalled cpuset ok |
9bf2229f | 4869 | * GFP_USER - only nodes in current tasks mems allowed ok. |
02a0e53d | 4870 | */ |
8e464522 | 4871 | bool cpuset_node_allowed(int node, gfp_t gfp_mask) |
1da177e4 | 4872 | { |
c9710d80 | 4873 | struct cpuset *cs; /* current cpuset ancestors */ |
d4296fae | 4874 | bool allowed; /* is allocation in zone z allowed? */ |
8447a0fe | 4875 | unsigned long flags; |
9bf2229f | 4876 | |
6e276d2a | 4877 | if (in_interrupt()) |
002f2906 | 4878 | return true; |
9bf2229f | 4879 | if (node_isset(node, current->mems_allowed)) |
002f2906 | 4880 | return true; |
c596d9f3 DR |
4881 | /* |
4882 | * Allow tasks that have access to memory reserves because they have | |
4883 | * been OOM killed to get memory anywhere. | |
4884 | */ | |
da99ecf1 | 4885 | if (unlikely(tsk_is_oom_victim(current))) |
002f2906 | 4886 | return true; |
9bf2229f | 4887 | if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */ |
002f2906 | 4888 | return false; |
9bf2229f | 4889 | |
5563e770 | 4890 | if (current->flags & PF_EXITING) /* Let dying task have memory */ |
002f2906 | 4891 | return true; |
5563e770 | 4892 | |
9bf2229f | 4893 | /* Not hardwall and node outside mems_allowed: scan up cpusets */ |
8447a0fe | 4894 | spin_lock_irqsave(&callback_lock, flags); |
053199ed | 4895 | |
b8dadcb5 | 4896 | rcu_read_lock(); |
78608366 | 4897 | cs = nearest_hardwall_ancestor(task_cs(current)); |
99afb0fd | 4898 | allowed = node_isset(node, cs->mems_allowed); |
b8dadcb5 | 4899 | rcu_read_unlock(); |
053199ed | 4900 | |
8447a0fe | 4901 | spin_unlock_irqrestore(&callback_lock, flags); |
9bf2229f | 4902 | return allowed; |
1da177e4 LT |
4903 | } |
4904 | ||
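In the legacy (v1) interface the hardwall behaviour described above is switched per cpuset through the cpuset.mem_hardwall file. A minimal sketch that enables it for one cpuset; the mount point and the cpuset name "mypart" are assumptions:

#include <stdio.h>

int main(void)
{
	/* Assumed legacy (v1) cpuset hierarchy mounted at this path. */
	const char *path = "/sys/fs/cgroup/cpuset/mypart/cpuset.mem_hardwall";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return 1;
	}
	/* Once set, kernel allocations on behalf of tasks in this cpuset
	 * are confined to its mems, as cpuset_node_allowed() describes. */
	fputs("1\n", f);
	return fclose(f) ? 1 : 0;
}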
825a46af | 4905 | /** |
32a47817 | 4906 | * cpuset_spread_node() - On which node to begin search for a page |
05f76ae9 | 4907 | * @rotor: round robin rotor |
825a46af PJ |
4908 | * |
4909 | * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for | |
4910 | * tasks in a cpuset with is_spread_page or is_spread_slab set), | |
4911 | * and if the memory allocation used cpuset_mem_spread_node() | |
4912 | * to determine on which node to start looking, as it will for | |
4913 | * certain page cache or slab cache pages such as used for file | |
4914 | * system buffers and inode caches, then instead of starting on the | |
4915 | * local node to look for a free page, rather spread the starting | |
4916 | * node around the tasks mems_allowed nodes. | |
4917 | * | |
4918 | * We don't have to worry about the returned node being offline | |
4919 | * because "it can't happen", and even if it did, it would be ok. | |
4920 | * | |
4921 | * The routines calling guarantee_online_mems() are careful to | |
4922 | * only set nodes in task->mems_allowed that are online. So it | |
4923 | * should not be possible for the following code to return an | |
4924 | * offline node. But if it did, that would be ok, as this routine | |
4925 | * is not returning the node where the allocation must be, only | |
4926 | * the node where the search should start. The zonelist passed to | |
4927 | * __alloc_pages() will include all nodes. If the slab allocator | |
4928 | * is passed an offline node, it will fall back to the local node. | |
4929 | * See kmem_cache_alloc_node(). | |
4930 | */ | |
6adef3eb | 4931 | static int cpuset_spread_node(int *rotor) |
825a46af | 4932 | { |
0edaf86c | 4933 | return *rotor = next_node_in(*rotor, current->mems_allowed); |
825a46af | 4934 | } |
6adef3eb | 4935 | |
32a47817 RD |
4936 | /** |
4937 | * cpuset_mem_spread_node() - On which node to begin search for a file page | |
4938 | */ | |
6adef3eb JS |
4939 | int cpuset_mem_spread_node(void) |
4940 | { | |
778d3b0f MH |
4941 | if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE) |
4942 | current->cpuset_mem_spread_rotor = | |
4943 | node_random(¤t->mems_allowed); | |
4944 | ||
6adef3eb JS |
4945 | return cpuset_spread_node(¤t->cpuset_mem_spread_rotor); |
4946 | } | |
4947 | ||
32a47817 RD |
4948 | /** |
4949 | * cpuset_slab_spread_node() - On which node to begin search for a slab page | |
4950 | */ | |
6adef3eb JS |
4951 | int cpuset_slab_spread_node(void) |
4952 | { | |
778d3b0f MH |
4953 | if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE) |
4954 | current->cpuset_slab_spread_rotor = | |
4955 | node_random(¤t->mems_allowed); | |
4956 | ||
6adef3eb JS |
4957 | return cpuset_spread_node(¤t->cpuset_slab_spread_rotor); |
4958 | } | |
825a46af PJ |
4959 | EXPORT_SYMBOL_GPL(cpuset_mem_spread_node); |
4960 | ||
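Spreading is opt-in per cpuset through the legacy (v1) files cpuset.memory_spread_page and cpuset.memory_spread_slab. A minimal sketch that turns both on for one cpuset; the mount point and cpuset name are assumptions:

#include <stdio.h>

int main(void)
{
	/* Assumed legacy (v1) cpuset hierarchy and cpuset name. */
	static const char * const knobs[] = {
		"/sys/fs/cgroup/cpuset/mypart/cpuset.memory_spread_page",
		"/sys/fs/cgroup/cpuset/mypart/cpuset.memory_spread_slab",
	};

	for (int i = 0; i < 2; i++) {
		FILE *f = fopen(knobs[i], "w");

		if (!f) {
			perror(knobs[i]);
			return 1;
		}
		/* Marks member tasks PF_SPREAD_PAGE / PF_SPREAD_SLAB, so their
		 * page cache and slab allocations start from the spread rotor
		 * instead of the local node. */
		fputs("1\n", f);
		fclose(f);
	}
	return 0;
}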
ef08e3b4 | 4961 | /** |
bbe373f2 DR |
4962 | * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's? |
4963 | * @tsk1: pointer to task_struct of some task. | |
4964 | * @tsk2: pointer to task_struct of some other task. | |
4965 | * | |
4966 | * Description: Return true if @tsk1's mems_allowed intersects the | |
4967 | * mems_allowed of @tsk2. Used by the OOM killer to determine if | |
4968 | * one of the task's memory usage might impact the memory available | |
4969 | * to the other. | |
ef08e3b4 PJ |
4970 | **/ |
4971 | ||
bbe373f2 DR |
4972 | int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, |
4973 | const struct task_struct *tsk2) | |
ef08e3b4 | 4974 | { |
bbe373f2 | 4975 | return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed); |
ef08e3b4 PJ |
4976 | } |
4977 | ||
75aa1994 | 4978 | /** |
da39da3a | 4979 | * cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed |
75aa1994 | 4980 | * |
da39da3a | 4981 | * Description: Prints current's name, cpuset name, and cached copy of its |
b8dadcb5 | 4982 | * mems_allowed to the kernel log. |
75aa1994 | 4983 | */ |
da39da3a | 4984 | void cpuset_print_current_mems_allowed(void) |
75aa1994 | 4985 | { |
b8dadcb5 | 4986 | struct cgroup *cgrp; |
75aa1994 | 4987 | |
b8dadcb5 | 4988 | rcu_read_lock(); |
63f43f55 | 4989 | |
da39da3a | 4990 | cgrp = task_cs(current)->css.cgroup; |
ef8444ea | 4991 | pr_cont(",cpuset="); |
e61734c5 | 4992 | pr_cont_cgroup_name(cgrp); |
ef8444ea | 4993 | pr_cont(",mems_allowed=%*pbl", |
da39da3a | 4994 | nodemask_pr_args(¤t->mems_allowed)); |
f440d98f | 4995 | |
cfb5966b | 4996 | rcu_read_unlock(); |
75aa1994 DR |
4997 | } |
4998 | ||
3e0d98b9 PJ |
4999 | /* |
5000 | * Collection of memory_pressure is suppressed unless | |
5001 | * this flag is enabled by writing "1" to the special | |
5002 | * cpuset file 'memory_pressure_enabled' in the root cpuset. | |
5003 | */ | |
5004 | ||
c5b2aff8 | 5005 | int cpuset_memory_pressure_enabled __read_mostly; |
3e0d98b9 | 5006 | |
c70cd039 JC |
5007 | /* |
5008 | * __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims. | |
3e0d98b9 PJ |
5009 | * |
5010 | * Keep a running average of the rate of synchronous (direct) | |
5011 | * page reclaim efforts initiated by tasks in each cpuset. | |
5012 | * | |
5013 | * This represents the rate at which some task in the cpuset | |
5014 | * ran low on memory on all nodes it was allowed to use, and | |
5015 | * had to enter the kernels page reclaim code in an effort to | |
5016 | * create more free memory by tossing clean pages or swapping | |
5017 | * or writing dirty pages. | |
5018 | * | |
5019 | * Display to user space in the per-cpuset read-only file | |
5020 | * "memory_pressure". Value displayed is an integer | |
5021 | * representing the recent rate of entry into the synchronous | |
5022 | * (direct) page reclaim by any task attached to the cpuset. | |
c70cd039 | 5023 | */ |
3e0d98b9 PJ |
5024 | |
5025 | void __cpuset_memory_pressure_bump(void) | |
5026 | { | |
b8dadcb5 | 5027 | rcu_read_lock(); |
8793d854 | 5028 | fmeter_markevent(&task_cs(current)->fmeter); |
b8dadcb5 | 5029 | rcu_read_unlock(); |
3e0d98b9 PJ |
5030 | } |
5031 | ||
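Both memory_pressure_enabled (root cpuset only) and the per-cpuset memory_pressure readout are legacy (v1) cpuset files. A minimal sketch that enables collection and samples one cpuset's recent direct-reclaim rate; the paths are assumptions:

#include <stdio.h>

int main(void)
{
	/* Assumed legacy (v1) cpuset mount point and cpuset name. */
	FILE *f = fopen("/sys/fs/cgroup/cpuset/cpuset.memory_pressure_enabled", "w");
	char buf[64];

	if (f) {
		fputs("1\n", f);	/* start collecting in all cpusets */
		fclose(f);
	}
	f = fopen("/sys/fs/cgroup/cpuset/mypart/cpuset.memory_pressure", "r");
	if (!f) {
		perror("cpuset.memory_pressure");
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("recent direct-reclaim rate: %s", buf);
	fclose(f);
	return 0;
}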
8793d854 | 5032 | #ifdef CONFIG_PROC_PID_CPUSET |
1da177e4 LT |
5033 | /* |
5034 | * proc_cpuset_show() | |
5035 | * - Print tasks cpuset path into seq_file. | |
5036 | * - Used for /proc/<pid>/cpuset. | |
053199ed PJ |
5037 | * - No need to task_lock(tsk) on this tsk->cpuset reference, as it |
5038 | * doesn't really matter if tsk->cpuset changes after we read it, | |
111cd11b | 5039 | * and we take cpuset_mutex, keeping cpuset_attach() from changing it |
2df167a3 | 5040 | * anyway. |
1da177e4 | 5041 | */ |
52de4779 ZL |
5042 | int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, |
5043 | struct pid *pid, struct task_struct *tsk) | |
1da177e4 | 5044 | { |
4c737b41 | 5045 | char *buf; |
8793d854 | 5046 | struct cgroup_subsys_state *css; |
99f89551 | 5047 | int retval; |
1da177e4 | 5048 | |
99f89551 | 5049 | retval = -ENOMEM; |
e61734c5 | 5050 | buf = kmalloc(PATH_MAX, GFP_KERNEL); |
1da177e4 | 5051 | if (!buf) |
99f89551 EB |
5052 | goto out; |
5053 | ||
a79a908f | 5054 | css = task_get_css(tsk, cpuset_cgrp_id); |
4c737b41 TH |
5055 | retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX, |
5056 | current->nsproxy->cgroup_ns); | |
a79a908f | 5057 | css_put(css); |
ff6d413b | 5058 | if (retval == -E2BIG) |
679a5e3f TH |
5059 | retval = -ENAMETOOLONG; |
5060 | if (retval < 0) | |
52de4779 | 5061 | goto out_free; |
4c737b41 | 5062 | seq_puts(m, buf); |
1da177e4 | 5063 | seq_putc(m, '\n'); |
e61734c5 | 5064 | retval = 0; |
99f89551 | 5065 | out_free: |
1da177e4 | 5066 | kfree(buf); |
99f89551 | 5067 | out: |
1da177e4 LT |
5068 | return retval; |
5069 | } | |
8793d854 | 5070 | #endif /* CONFIG_PROC_PID_CPUSET */ |
1da177e4 | 5071 | |
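What proc_cpuset_show() emits is visible to userspace as /proc/<pid>/cpuset. A minimal reader for the current task:

#include <stdio.h>

int main(void)
{
	char line[4096];
	FILE *f = fopen("/proc/self/cpuset", "r");

	if (!f) {
		perror("/proc/self/cpuset");
		return 1;
	}
	if (fgets(line, sizeof(line), f))
		printf("cpuset path: %s", line);	/* e.g. "/mypart" */
	fclose(f);
	return 0;
}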
d01d4827 | 5072 | /* Display task mems_allowed in /proc/<pid>/status file. */ |
df5f8314 EB |
5073 | void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) |
5074 | { | |
e8e6d97c TH |
5075 | seq_printf(m, "Mems_allowed:\t%*pb\n", |
5076 | nodemask_pr_args(&task->mems_allowed)); | |
5077 | seq_printf(m, "Mems_allowed_list:\t%*pbl\n", | |
5078 | nodemask_pr_args(&task->mems_allowed)); | |
1da177e4 | 5079 | } |
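The two lines printed above appear in /proc/<pid>/status. A minimal sketch that extracts Mems_allowed_list for the current task:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/self/status", "r");

	if (!f) {
		perror("/proc/self/status");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		if (!strncmp(line, "Mems_allowed_list:", 18))
			fputs(line, stdout);	/* e.g. "Mems_allowed_list:	0-1" */
	fclose(f);
	return 0;
}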