git.ipfire.org Git - thirdparty/kernel/linux.git/blob

1 /*

2 * Generic process-grouping system.

3 *

4 * Based originally on the cpuset system, extracted by Paul Menage

6 *

7 * Notifications support

9 * Author: Kirill A. Shutemov

10 *

11 * Copyright notices from the original cpuset code:

12 * --------------------------------------------------

15 *

16 * Portions derived from Patrick Mochel's sysfs code.

18 *

19 * 2003-10-10 Written by Simon Derr.

20 * 2003-10-22 Updates by Stephen Hemminger.

21 * 2004 May-July Rework by Paul Jackson.

22 * ---------------------------------------------------

23 *

24 * This file is subject to the terms and conditions of the GNU General Public

25 * License. See the file COPYING in the main directory of the Linux

26 * distribution for more details.

27 */

29 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

31 #include "cgroup-internal.h"

33 #include <linux/bpf-cgroup.h>

34 #include <linux/cred.h>

35 #include <linux/errno.h>

36 #include <linux/init_task.h>

37 #include <linux/kernel.h>

38 #include <linux/magic.h>

39 #include <linux/mutex.h>

40 #include <linux/mount.h>

41 #include <linux/pagemap.h>

42 #include <linux/proc_fs.h>

43 #include <linux/rcupdate.h>

44 #include <linux/sched.h>

45 #include <linux/sched/task.h>

46 #include <linux/slab.h>

47 #include <linux/spinlock.h>

48 #include <linux/percpu-rwsem.h>

49 #include <linux/string.h>

50 #include <linux/hashtable.h>

51 #include <linux/idr.h>

52 #include <linux/kthread.h>

53 #include <linux/atomic.h>

54 #include <linux/cpuset.h>

55 #include <linux/proc_ns.h>

56 #include <linux/nsproxy.h>

57 #include <linux/file.h>

58 #include <linux/fs_parser.h>

59 #include <linux/sched/cputime.h>

60 #include <linux/psi.h>

61 #include <net/sock.h>

63 #define CREATE_TRACE_POINTS

64 #include <trace/events/cgroup.h>

/*
 * Size limit for a cgroup file name: the type-name limit plus the cftype
 * name limit plus two extra bytes.
 */
#define CGROUP_FILE_NAME_MAX						\
	(MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2)

/* rate-limit file notifications to at most 100 per second */
#define CGROUP_FILE_NOTIFY_MIN_INTV	DIV_ROUND_UP(HZ, 100)

/*
 * Compile-time constant that is false when no cgroup subsystem is
 * configured (CGROUP_SUBSYS_COUNT == 0).  Guarding accesses to the
 * per-subsystem arrays with it keeps the compiler from seeing (and warning
 * about) code that would index a potentially zero-element array.
 */
#define CGROUP_HAS_SUBSYS_CONFIG	(CGROUP_SUBSYS_COUNT > 0)

79 /*

80 * cgroup_mutex is the master lock. Any modification to cgroup or its

81 * hierarchy must be performed while holding it.

82 *

83 * css_set_lock protects task->cgroups pointer, the list of css_set

84 * objects, and the chain of tasks off each css_set.

85 *

86 * These locks are exported if CONFIG_PROVE_RCU so that accessors in

87 * cgroup.h can use them for lockdep annotations.

88 */

89 DEFINE_MUTEX(cgroup_mutex);

90 DEFINE_SPINLOCK(css_set_lock);

92 #ifdef CONFIG_PROVE_RCU

93 EXPORT_SYMBOL_GPL(cgroup_mutex);

94 EXPORT_SYMBOL_GPL(css_set_lock);

95 #endif

97 DEFINE_SPINLOCK(trace_cgroup_path_lock);

98 char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];

99 static bool cgroup_debug __read_mostly;

100

101 /*

102 * Protects cgroup_idr and css_idr so that IDs can be released without

103 * grabbing cgroup_mutex.

104 */

105 static DEFINE_SPINLOCK(cgroup_idr_lock);

106

107 /*

108 * Protects cgroup_file->kn for !self csses. It synchronizes notifications

109 * against file removal/re-creation across css hiding.

110 */

111 static DEFINE_SPINLOCK(cgroup_file_kn_lock);

112

113 DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem);

114

/*
 * Lockdep assertion that the caller holds either the RCU read lock or
 * cgroup_mutex.
 *
 * The expansion intentionally carries no trailing semicolon: the ';'
 * written at the call site terminates the statement.  That way the macro
 * behaves like a single statement and stays usable as the unbraced arm of
 * an if/else.
 */
#define cgroup_assert_mutex_or_rcu_locked()				\
	RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&			\
			   !lockdep_is_held(&cgroup_mutex),		\
			   "cgroup_mutex or RCU read lock required")

119

/*
 * Dedicated workqueue for cgroup destruction.  Destruction relies heavily
 * on work items and many destructions may run concurrently; keeping them
 * off system_wq prevents them from exhausting its max_active, which could
 * otherwise end in a deadlock.
 */
static struct workqueue_struct *cgroup_destroy_wq;

127

128 /* generate an array of cgroup subsystem pointers */

129 #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,

130 struct cgroup_subsys *cgroup_subsys[] = {

131 #include <linux/cgroup_subsys.h>

132 };

133 #undef SUBSYS

134

135 /* array of cgroup subsystem names */

136 #define SUBSYS(_x) [_x ## _cgrp_id] = #_x,

137 static const char *cgroup_subsys_name[] = {

138 #include <linux/cgroup_subsys.h>

139 };

140 #undef SUBSYS

141

142 /* array of static_keys for cgroup_subsys_enabled() and cgroup_subsys_on_dfl() */

143 #define SUBSYS(_x) \

144 DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_enabled_key); \

145 DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_on_dfl_key); \

146 EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_enabled_key); \

147 EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_on_dfl_key);

148 #include <linux/cgroup_subsys.h>

149 #undef SUBSYS

150

151 #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_enabled_key,

152 static struct static_key_true *cgroup_subsys_enabled_key[] = {

153 #include <linux/cgroup_subsys.h>

154 };

155 #undef SUBSYS

156

157 #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_on_dfl_key,

158 static struct static_key_true *cgroup_subsys_on_dfl_key[] = {

159 #include <linux/cgroup_subsys.h>

160 };

161 #undef SUBSYS

162

163 static DEFINE_PER_CPU(struct cgroup_rstat_cpu, cgrp_dfl_root_rstat_cpu);

164

165 /* the default hierarchy */

166 struct cgroup_root cgrp_dfl_root = { .cgrp.rstat_cpu = &cgrp_dfl_root_rstat_cpu };

167 EXPORT_SYMBOL_GPL(cgrp_dfl_root);

168

169 /*

170 * The default hierarchy always exists but is hidden until mounted for the

171 * first time. This is for backward compatibility.

172 */

173 static bool cgrp_dfl_visible;

174

175 /* some controllers are not supported in the default hierarchy */

176 static u16 cgrp_dfl_inhibit_ss_mask;

177

178 /* some controllers are implicitly enabled on the default hierarchy */

179 static u16 cgrp_dfl_implicit_ss_mask;

180

181 /* some controllers can be threaded on the default hierarchy */

182 static u16 cgrp_dfl_threaded_ss_mask;

183

184 /* The list of hierarchy roots */

185 LIST_HEAD(cgroup_roots);

186 static int cgroup_root_count;

187

188 /* hierarchy ID allocation and mapping, protected by cgroup_mutex */

189 static DEFINE_IDR(cgroup_hierarchy_idr);

190

191 /*

192 * Assign a monotonically increasing serial number to csses. It guarantees

193 * cgroups with bigger numbers are newer than those with smaller numbers.

194 * Also, as csses are always appended to the parent's ->children list, it

195 * guarantees that sibling csses are always sorted in the ascending serial

196 * number order on the list. Protected by cgroup_mutex.

197 */

198 static u64 css_serial_nr_next = 1;

199

200 /*

201 * These bitmasks identify subsystems with specific features to avoid

202 * having to do iterative checks repeatedly.

203 */

204 static u16 have_fork_callback __read_mostly;

205 static u16 have_exit_callback __read_mostly;

206 static u16 have_release_callback __read_mostly;

207 static u16 have_canfork_callback __read_mostly;

208

209 /* cgroup namespace for init task */

210 struct cgroup_namespace init_cgroup_ns = {

211 .ns.count = REFCOUNT_INIT(2),

212 .user_ns = &init_user_ns,

213 .ns.ops = &cgroupns_operations,

214 .ns.inum = PROC_CGROUP_INIT_INO,

215 .root_cset = &init_css_set,

216 };

217

218 static struct file_system_type cgroup2_fs_type;

219 static struct cftype cgroup_base_files[];

220 static struct cftype cgroup_psi_files[];

221

222 /* cgroup optional features */

223 enum cgroup_opt_features {

224 #ifdef CONFIG_PSI

225 OPT_FEATURE_PRESSURE,

226 #endif

227 OPT_FEATURE_COUNT

228 };

229

230 static const char *cgroup_opt_feature_names[OPT_FEATURE_COUNT] = {

231 #ifdef CONFIG_PSI

232 "pressure",

233 #endif

234 };

235

236 static u16 cgroup_feature_disable_mask __read_mostly;

237

238 static int cgroup_apply_control(struct cgroup *cgrp);

239 static void cgroup_finalize_control(struct cgroup *cgrp, int ret);

240 static void css_task_iter_skip(struct css_task_iter *it,

241 struct task_struct *task);

242 static int cgroup_destroy_locked(struct cgroup *cgrp);

243 static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,

244 struct cgroup_subsys *ss);

245 static void css_release(struct percpu_ref *ref);

246 static void kill_css(struct cgroup_subsys_state *css);

247 static int cgroup_addrm_files(struct cgroup_subsys_state *css,

248 struct cgroup *cgrp, struct cftype cfts[],

249 bool is_add);

250

251 /**

252 * cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID

253 * @ssid: subsys ID of interest

254 *

255 * cgroup_subsys_enabled() can only be used with literal subsys names which

256 * is fine for individual subsystems but unsuitable for cgroup core. This

257 * is slower static_key_enabled() based test indexed by @ssid.

258 */

259 bool cgroup_ssid_enabled(int ssid)

260 {

261 if (!CGROUP_HAS_SUBSYS_CONFIG)

262 return false;

263

264 return static_key_enabled(cgroup_subsys_enabled_key[ssid]);

265 }

266

267 /**

268 * cgroup_on_dfl - test whether a cgroup is on the default hierarchy

269 * @cgrp: the cgroup of interest

270 *

271 * The default hierarchy is the v2 interface of cgroup and this function

272 * can be used to test whether a cgroup is on the default hierarchy for

273 * cases where a subsystem should behave differently depending on the

274 * interface version.

275 *

276 * List of changed behaviors:

277 *

278 * - Mount options "noprefix", "xattr", "clone_children", "release_agent"

279 * and "name" are disallowed.

280 *

281 * - When mounting an existing superblock, mount options should match.

282 *

283 * - rename(2) is disallowed.

284 *

285 * - "tasks" is removed. Everything should be at process granularity. Use

286 * "cgroup.procs" instead.

287 *

288 * - "cgroup.procs" is not sorted. pids will be unique unless they got

289 * recycled in-between reads.

290 *

291 * - "release_agent" and "notify_on_release" are removed. Replacement

292 * notification mechanism will be implemented.

293 *

294 * - "cgroup.clone_children" is removed.

295 *

296 * - "cgroup.subtree_populated" is available. Its value is 0 if the cgroup

297 * and its descendants contain no task; otherwise, 1. The file also

298 * generates kernfs notification which can be monitored through poll and

299 * [di]notify when the value of the file changes.

300 *

301 * - cpuset: tasks will be kept in empty cpusets when hotplug happens and

302 * take masks of ancestors with non-empty cpus/mems, instead of being

303 * moved to an ancestor.

304 *

305 * - cpuset: a task can be moved into an empty cpuset, and again it takes

306 * masks of ancestors.

307 *

308 * - blkcg: blk-throttle becomes properly hierarchical.

309 *

310 * - debug: disallowed on the default hierarchy.

311 */

312 bool cgroup_on_dfl(const struct cgroup *cgrp)

313 {

314 return cgrp->root == &cgrp_dfl_root;

315 }

316

317 /* IDR wrappers which synchronize using cgroup_idr_lock */

318 static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,

319 gfp_t gfp_mask)

320 {

321 int ret;

322

323 idr_preload(gfp_mask);

324 spin_lock_bh(&cgroup_idr_lock);

325 ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_DIRECT_RECLAIM);

326 spin_unlock_bh(&cgroup_idr_lock);

327 idr_preload_end();

328 return ret;

329 }

330

331 static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id)

332 {

333 void *ret;

334

335 spin_lock_bh(&cgroup_idr_lock);

336 ret = idr_replace(idr, ptr, id);

337 spin_unlock_bh(&cgroup_idr_lock);

338 return ret;

339 }

340

341 static void cgroup_idr_remove(struct idr *idr, int id)

342 {

343 spin_lock_bh(&cgroup_idr_lock);

344 idr_remove(idr, id);

345 spin_unlock_bh(&cgroup_idr_lock);

346 }

347

348 static bool cgroup_has_tasks(struct cgroup *cgrp)

349 {

350 return cgrp->nr_populated_csets;

351 }

352

353 bool cgroup_is_threaded(struct cgroup *cgrp)

354 {

355 return cgrp->dom_cgrp != cgrp;

356 }

357

358 /* can @cgrp host both domain and threaded children? */

359 static bool cgroup_is_mixable(struct cgroup *cgrp)

360 {

361 /*

362 * Root isn't under domain level resource control exempting it from

363 * the no-internal-process constraint, so it can serve as a thread

364 * root and a parent of resource domains at the same time.

365 */

366 return !cgroup_parent(cgrp);

367 }

368

369 /* can @cgrp become a thread root? Should always be true for a thread root */

370 static bool cgroup_can_be_thread_root(struct cgroup *cgrp)

371 {

372 /* mixables don't care */

373 if (cgroup_is_mixable(cgrp))

374 return true;

375

376 /* domain roots can't be nested under threaded */

377 if (cgroup_is_threaded(cgrp))

378 return false;

379

380 /* can only have either domain or threaded children */

381 if (cgrp->nr_populated_domain_children)

382 return false;

383

384 /* and no domain controllers can be enabled */

385 if (cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask)

386 return false;

387

388 return true;

389 }

390

391 /* is @cgrp root of a threaded subtree? */

392 bool cgroup_is_thread_root(struct cgroup *cgrp)

393 {

394 /* thread root should be a domain */

395 if (cgroup_is_threaded(cgrp))

396 return false;

397

398 /* a domain w/ threaded children is a thread root */

399 if (cgrp->nr_threaded_children)

400 return true;

401

402 /*

403 * A domain which has tasks and explicit threaded controllers

404 * enabled is a thread root.

405 */

406 if (cgroup_has_tasks(cgrp) &&

407 (cgrp->subtree_control & cgrp_dfl_threaded_ss_mask))

408 return true;

409

410 return false;

411 }

412

413 /* a domain which isn't connected to the root w/o brekage can't be used */

414 static bool cgroup_is_valid_domain(struct cgroup *cgrp)

415 {

416 /* the cgroup itself can be a thread root */

417 if (cgroup_is_threaded(cgrp))

418 return false;

419

420 /* but the ancestors can't be unless mixable */

421 while ((cgrp = cgroup_parent(cgrp))) {

422 if (!cgroup_is_mixable(cgrp) && cgroup_is_thread_root(cgrp))

423 return false;

424 if (cgroup_is_threaded(cgrp))

425 return false;

426 }

427

428 return true;

429 }

430

431 /* subsystems visibly enabled on a cgroup */

432 static u16 cgroup_control(struct cgroup *cgrp)

433 {

434 struct cgroup *parent = cgroup_parent(cgrp);

435 u16 root_ss_mask = cgrp->root->subsys_mask;

436

437 if (parent) {

438 u16 ss_mask = parent->subtree_control;

439

440 /* threaded cgroups can only have threaded controllers */

441 if (cgroup_is_threaded(cgrp))

442 ss_mask &= cgrp_dfl_threaded_ss_mask;

443 return ss_mask;

444 }

445

446 if (cgroup_on_dfl(cgrp))

447 root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask |

448 cgrp_dfl_implicit_ss_mask);

449 return root_ss_mask;

450 }

451

452 /* subsystems enabled on a cgroup */

453 static u16 cgroup_ss_mask(struct cgroup *cgrp)

454 {

455 struct cgroup *parent = cgroup_parent(cgrp);

456

457 if (parent) {

458 u16 ss_mask = parent->subtree_ss_mask;

459

460 /* threaded cgroups can only have threaded controllers */

461 if (cgroup_is_threaded(cgrp))

462 ss_mask &= cgrp_dfl_threaded_ss_mask;

463 return ss_mask;

464 }

465

466 return cgrp->root->subsys_mask;

467 }

468

469 /**

470 * cgroup_css - obtain a cgroup's css for the specified subsystem

471 * @cgrp: the cgroup of interest

472 * @ss: the subsystem of interest (%NULL returns @cgrp->self)

473 *

474 * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This

475 * function must be called either under cgroup_mutex or rcu_read_lock() and

476 * the caller is responsible for pinning the returned css if it wants to

477 * keep accessing it outside the said locks. This function may return

478 * %NULL if @cgrp doesn't have @subsys_id enabled.

479 */

480 static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,

481 struct cgroup_subsys *ss)

482 {

483 if (CGROUP_HAS_SUBSYS_CONFIG && ss)

484 return rcu_dereference_check(cgrp->subsys[ss->id],

485 lockdep_is_held(&cgroup_mutex));

486 else

487 return &cgrp->self;

488 }

489

490 /**

491 * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem

492 * @cgrp: the cgroup of interest

493 * @ss: the subsystem of interest

494 *

495 * Find and get @cgrp's css associated with @ss. If the css doesn't exist

496 * or is offline, %NULL is returned.

497 */

498 static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,

499 struct cgroup_subsys *ss)

500 {

501 struct cgroup_subsys_state *css;

502

503 rcu_read_lock();

504 css = cgroup_css(cgrp, ss);

505 if (css && !css_tryget_online(css))

506 css = NULL;

507 rcu_read_unlock();

508

509 return css;

510 }

511

512 /**

513 * cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss

514 * @cgrp: the cgroup of interest

515 * @ss: the subsystem of interest (%NULL returns @cgrp->self)

516 *

517 * Similar to cgroup_css() but returns the effective css, which is defined

518 * as the matching css of the nearest ancestor including self which has @ss

519 * enabled. If @ss is associated with the hierarchy @cgrp is on, this

520 * function is guaranteed to return non-NULL css.

521 */

522 static struct cgroup_subsys_state *cgroup_e_css_by_mask(struct cgroup *cgrp,

523 struct cgroup_subsys *ss)

524 {

525 lockdep_assert_held(&cgroup_mutex);

526

527 if (!ss)

528 return &cgrp->self;

529

530 /*

531 * This function is used while updating css associations and thus

532 * can't test the csses directly. Test ss_mask.

533 */

534 while (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) {

535 cgrp = cgroup_parent(cgrp);

536 if (!cgrp)

537 return NULL;

538 }

539

540 return cgroup_css(cgrp, ss);

541 }

542

543 /**

544 * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem

545 * @cgrp: the cgroup of interest

546 * @ss: the subsystem of interest

547 *

548 * Find and get the effective css of @cgrp for @ss. The effective css is

549 * defined as the matching css of the nearest ancestor including self which

550 * has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on,

551 * the root css is returned, so this function always returns a valid css.

552 *

553 * The returned css is not guaranteed to be online, and therefore it is the

554 * callers responsibility to try get a reference for it.

555 */

556 struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,

557 struct cgroup_subsys *ss)

558 {

559 struct cgroup_subsys_state *css;

560

561 if (!CGROUP_HAS_SUBSYS_CONFIG)

562 return NULL;

563

564 do {

565 css = cgroup_css(cgrp, ss);

566

567 if (css)

568 return css;

569 cgrp = cgroup_parent(cgrp);

570 } while (cgrp);

571

572 return init_css_set.subsys[ss->id];

573 }

574

575 /**

576 * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem

577 * @cgrp: the cgroup of interest

578 * @ss: the subsystem of interest

579 *

580 * Find and get the effective css of @cgrp for @ss. The effective css is

581 * defined as the matching css of the nearest ancestor including self which

582 * has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on,

583 * the root css is returned, so this function always returns a valid css.

584 * The returned css must be put using css_put().

585 */

586 struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp,

587 struct cgroup_subsys *ss)

588 {

589 struct cgroup_subsys_state *css;

590

591 if (!CGROUP_HAS_SUBSYS_CONFIG)

592 return NULL;

593

594 rcu_read_lock();

595

596 do {

597 css = cgroup_css(cgrp, ss);

598

599 if (css && css_tryget_online(css))

600 goto out_unlock;

601 cgrp = cgroup_parent(cgrp);

602 } while (cgrp);

603

604 css = init_css_set.subsys[ss->id];

605 css_get(css);

606 out_unlock:

607 rcu_read_unlock();

608 return css;

609 }

610 EXPORT_SYMBOL_GPL(cgroup_get_e_css);

611

612 static void cgroup_get_live(struct cgroup *cgrp)

613 {

614 WARN_ON_ONCE(cgroup_is_dead(cgrp));

615 css_get(&cgrp->self);

616 }

617

618 /**

619 * __cgroup_task_count - count the number of tasks in a cgroup. The caller

620 * is responsible for taking the css_set_lock.

621 * @cgrp: the cgroup in question

622 */

623 int __cgroup_task_count(const struct cgroup *cgrp)

624 {

625 int count = 0;

626 struct cgrp_cset_link *link;

627

628 lockdep_assert_held(&css_set_lock);

629

630 list_for_each_entry(link, &cgrp->cset_links, cset_link)

631 count += link->cset->nr_tasks;

632

633 return count;

634 }

635

636 /**

637 * cgroup_task_count - count the number of tasks in a cgroup.

638 * @cgrp: the cgroup in question

639 */

640 int cgroup_task_count(const struct cgroup *cgrp)

641 {

642 int count;

643

644 spin_lock_irq(&css_set_lock);

645 count = __cgroup_task_count(cgrp);

646 spin_unlock_irq(&css_set_lock);

647

648 return count;

649 }

650

651 struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)

652 {

653 struct cgroup *cgrp = of->kn->parent->priv;

654 struct cftype *cft = of_cft(of);

655

656 /*

657 * This is open and unprotected implementation of cgroup_css().

658 * seq_css() is only called from a kernfs file operation which has

659 * an active reference on the file. Because all the subsystem

660 * files are drained before a css is disassociated with a cgroup,

661 * the matching css from the cgroup's subsys table is guaranteed to

662 * be and stay valid until the enclosing operation is complete.

663 */

664 if (CGROUP_HAS_SUBSYS_CONFIG && cft->ss)

665 return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);

666 else

667 return &cgrp->self;

668 }

669 EXPORT_SYMBOL_GPL(of_css);

670

/**
 * for_each_css - iterate all css's of a cgroup
 * @css: the iteration cursor
 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
 * @cgrp: the target cgroup to iterate css's of
 *
 * Subsystems for which @cgrp has no css are skipped.  Each css is read
 * with rcu_dereference_check() using cgroup_mutex as the lockdep
 * condition, so this should be called with cgroup_mutex (or the RCU read
 * lock) held.
 */
#define for_each_css(css, ssid, cgrp)					\
	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	\
		if (!((css) = rcu_dereference_check(			\
				(cgrp)->subsys[(ssid)],			\
				lockdep_is_held(&cgroup_mutex)))) {	\
			/* no css for this subsystem, skip it */	\
		} else

685

/**
 * for_each_e_css - iterate all effective css's of a cgroup
 * @css: the iteration cursor
 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
 * @cgrp: the target cgroup to iterate css's of
 *
 * Subsystems for which cgroup_e_css_by_mask() yields %NULL are skipped.
 * Must be called with cgroup_mutex held, as cgroup_e_css_by_mask()
 * asserts it.
 */
#define for_each_e_css(css, ssid, cgrp)					\
	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	\
		if (!((css) = cgroup_e_css_by_mask(cgrp,		\
				cgroup_subsys[(ssid)]))) {		\
			/* no effective css, skip this subsystem */	\
		} else

700

/**
 * do_each_subsys_mask - filter for_each_subsys with a bitmask
 * @ss: the iteration cursor
 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
 * @ss_mask: the bitmask
 *
 * Runs the enclosed block once for every subsystem whose bit
 * (1 << ssid) is set in @ss_mask.  When no subsystem is configured the
 * block never runs and @ssid is set to 0.
 *
 * The block opened here must be closed with while_each_subsys_mask().
 */
#define do_each_subsys_mask(ss, ssid, ss_mask) do {			\
	unsigned long __ss_mask = (ss_mask);				\
	if (!CGROUP_HAS_SUBSYS_CONFIG) {				\
		(ssid) = 0;						\
		break;							\
	}								\
	for_each_set_bit(ssid, &__ss_mask, CGROUP_SUBSYS_COUNT) {	\
		(ss) = cgroup_subsys[ssid];				\
		{

/* closes the scopes opened by do_each_subsys_mask() */
#define while_each_subsys_mask()					\
		}							\
	}								\
} while (false)

724

/*
 * Iterate over the children of @cgrp, skipping dead ones.  cgroup_mutex
 * must be held throughout the iteration; this is asserted on every step.
 */
#define cgroup_for_each_live_child(child, cgrp)				\
	list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
		if (({ lockdep_assert_held(&cgroup_mutex);		\
		       cgroup_is_dead(child); })) {			\
			/* dead child, skip */				\
		} else

732

/*
 * Pre-order walk over the descendants of @cgrp's self css, skipping
 * dead cgroups.  @d_css is the css cursor, @dsct receives the matching
 * cgroup.  Asserts that cgroup_mutex is held on every step.
 */
#define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp)		\
	css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL))	\
		if (({ lockdep_assert_held(&cgroup_mutex);		\
		       (dsct) = (d_css)->cgroup;			\
		       cgroup_is_dead(dsct); })) {			\
			/* dead descendant, skip */			\
		} else

741

/*
 * Post-order walk over the descendants of @cgrp's self css, skipping
 * dead cgroups.  @d_css is the css cursor, @dsct receives the matching
 * cgroup.  Asserts that cgroup_mutex is held on every step.
 */
#define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp)		\
	css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL))	\
		if (({ lockdep_assert_held(&cgroup_mutex);		\
		       (dsct) = (d_css)->cgroup;			\
		       cgroup_is_dead(dsct); })) {			\
			/* dead descendant, skip */			\
		} else

750

751 /*

752 * The default css_set - used by init and its children prior to any

753 * hierarchies being mounted. It contains a pointer to the root state

754 * for each subsystem. Also used to anchor the list of css_sets. Not

755 * reference-counted, to improve performance when child cgroups

756 * haven't been created.

757 */

758 struct css_set init_css_set = {

759 .refcount = REFCOUNT_INIT(1),

760 .dom_cset = &init_css_set,

761 .tasks = LIST_HEAD_INIT(init_css_set.tasks),

762 .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),

763 .dying_tasks = LIST_HEAD_INIT(init_css_set.dying_tasks),

764 .task_iters = LIST_HEAD_INIT(init_css_set.task_iters),

765 .threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets),

766 .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),

767 .mg_src_preload_node = LIST_HEAD_INIT(init_css_set.mg_src_preload_node),

768 .mg_dst_preload_node = LIST_HEAD_INIT(init_css_set.mg_dst_preload_node),

769 .mg_node = LIST_HEAD_INIT(init_css_set.mg_node),

770

771 /*

772 * The following field is re-initialized when this cset gets linked

773 * in cgroup_init(). However, let's initialize the field

774 * statically too so that the default cgroup can be accessed safely

775 * early during boot.

776 */

777 .dfl_cgrp = &cgrp_dfl_root.cgrp,

778 };

779

780 static int css_set_count = 1; /* 1 for init_css_set */

781

782 static bool css_set_threaded(struct css_set *cset)

783 {

784 return cset->dom_cset != cset;

785 }

786

787 /**

788 * css_set_populated - does a css_set contain any tasks?

789 * @cset: target css_set

790 *

791 * css_set_populated() should be the same as !!cset->nr_tasks at steady

792 * state. However, css_set_populated() can be called while a task is being

793 * added to or removed from the linked list before the nr_tasks is

794 * properly updated. Hence, we can't just look at ->nr_tasks here.

795 */

796 static bool css_set_populated(struct css_set *cset)

797 {

798 lockdep_assert_held(&css_set_lock);

799

800 return !list_empty(&cset->tasks) || !list_empty(&cset->mg_tasks);

801 }

802

803 /**

804 * cgroup_update_populated - update the populated count of a cgroup

805 * @cgrp: the target cgroup

806 * @populated: inc or dec populated count

807 *

808 * One of the css_sets associated with @cgrp is either getting its first

809 * task or losing the last. Update @cgrp->nr_populated_* accordingly. The

810 * count is propagated towards root so that a given cgroup's

811 * nr_populated_children is zero iff none of its descendants contain any

812 * tasks.

813 *

814 * @cgrp's interface file "cgroup.populated" is zero if both

815 * @cgrp->nr_populated_csets and @cgrp->nr_populated_children are zero and

816 * 1 otherwise. When the sum changes from or to zero, userland is notified

817 * that the content of the interface file has changed. This can be used to

818 * detect when @cgrp and its descendants become populated or empty.

819 */

820 static void cgroup_update_populated(struct cgroup *cgrp, bool populated)

821 {

822 struct cgroup *child = NULL;

823 int adj = populated ? 1 : -1;

824

825 lockdep_assert_held(&css_set_lock);

826

827 do {

828 bool was_populated = cgroup_is_populated(cgrp);

829

830 if (!child) {

831 cgrp->nr_populated_csets += adj;

832 } else {

833 if (cgroup_is_threaded(child))

834 cgrp->nr_populated_threaded_children += adj;

835 else

836 cgrp->nr_populated_domain_children += adj;

837 }

838

839 if (was_populated == cgroup_is_populated(cgrp))

840 break;

841

842 cgroup1_check_for_release(cgrp);

843 TRACE_CGROUP_PATH(notify_populated, cgrp,

844 cgroup_is_populated(cgrp));

845 cgroup_file_notify(&cgrp->events_file);

846

847 child = cgrp;

848 cgrp = cgroup_parent(cgrp);

849 } while (cgrp);

850 }

851

852 /**

853 * css_set_update_populated - update populated state of a css_set

854 * @cset: target css_set

855 * @populated: whether @cset is populated or depopulated

856 *

857 * @cset is either getting the first task or losing the last. Update the

858 * populated counters of all associated cgroups accordingly.

859 */

860 static void css_set_update_populated(struct css_set *cset, bool populated)

861 {

862 struct cgrp_cset_link *link;

863

864 lockdep_assert_held(&css_set_lock);

865

866 list_for_each_entry(link, &cset->cgrp_links, cgrp_link)

867 cgroup_update_populated(link->cgrp, populated);

868 }

869

870 /*

871 * @task is leaving, advance task iterators which are pointing to it so

872 * that they can resume at the next position. Advancing an iterator might

873 * remove it from the list, use safe walk. See css_task_iter_skip() for

874 * details.

875 */

876 static void css_set_skip_task_iters(struct css_set *cset,

877 struct task_struct *task)

878 {

879 struct css_task_iter *it, *pos;

880

881 list_for_each_entry_safe(it, pos, &cset->task_iters, iters_node)

882 css_task_iter_skip(it, task);

883 }

884

885 /**

886 * css_set_move_task - move a task from one css_set to another

887 * @task: task being moved

888 * @from_cset: css_set @task currently belongs to (may be NULL)

889 * @to_cset: new css_set @task is being moved to (may be NULL)

890 * @use_mg_tasks: move to @to_cset->mg_tasks instead of ->tasks

891 *

892 * Move @task from @from_cset to @to_cset. If @task didn't belong to any

893 * css_set, @from_cset can be NULL. If @task is being disassociated

894 * instead of moved, @to_cset can be NULL.

895 *

896 * This function automatically handles populated counter updates and

897 * css_task_iter adjustments but the caller is responsible for managing

898 * @from_cset and @to_cset's reference counts.

899 */

900 static void css_set_move_task(struct task_struct *task,

901 struct css_set *from_cset, struct css_set *to_cset,

902 bool use_mg_tasks)

903 {

904 lockdep_assert_held(&css_set_lock);

905

906 if (to_cset && !css_set_populated(to_cset))

907 css_set_update_populated(to_cset, true);

908

909 if (from_cset) {

910 WARN_ON_ONCE(list_empty(&task->cg_list));

911

912 css_set_skip_task_iters(from_cset, task);

913 list_del_init(&task->cg_list);

914 if (!css_set_populated(from_cset))

915 css_set_update_populated(from_cset, false);

916 } else {

917 WARN_ON_ONCE(!list_empty(&task->cg_list));

918 }

919

920 if (to_cset) {

921 /*

922 * We are synchronized through cgroup_threadgroup_rwsem

923 * against PF_EXITING setting such that we can't race

924 * against cgroup_exit()/cgroup_free() dropping the css_set.

925 */

926 WARN_ON_ONCE(task->flags & PF_EXITING);

927

928 cgroup_move_task(task, to_cset);

929 list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :

930 &to_cset->tasks);

931 }

932 }

933

934 /*

935 * hash table for cgroup groups. This improves the performance to find

936 * an existing css_set. This hash doesn't (currently) take into

937 * account cgroups in empty hierarchies.

938 */

939 #define CSS_SET_HASH_BITS 7

940 static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);

941

942 static unsigned long css_set_hash(struct cgroup_subsys_state *css[])

943 {

944 unsigned long key = 0UL;

945 struct cgroup_subsys *ss;

946 int i;

947

948 for_each_subsys(ss, i)

949 key += (unsigned long)css[i];

950 key = (key >> 16) ^ key;

951

952 return key;

953 }

954

955 void put_css_set_locked(struct css_set *cset)

956 {

957 struct cgrp_cset_link *link, *tmp_link;

958 struct cgroup_subsys *ss;

959 int ssid;

960

961 lockdep_assert_held(&css_set_lock);

962

963 if (!refcount_dec_and_test(&cset->refcount))

964 return;

965

966 WARN_ON_ONCE(!list_empty(&cset->threaded_csets));

967

968 /* This css_set is dead. Unlink it and release cgroup and css refs */

969 for_each_subsys(ss, ssid) {

970 list_del(&cset->e_cset_node[ssid]);

971 css_put(cset->subsys[ssid]);

972 }

973 hash_del(&cset->hlist);

974 css_set_count--;

975

976 list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {

977 list_del(&link->cset_link);

978 list_del(&link->cgrp_link);

979 if (cgroup_parent(link->cgrp))

980 cgroup_put(link->cgrp);

981 kfree(link);

982 }

983

984 if (css_set_threaded(cset)) {

985 list_del(&cset->threaded_csets_node);

986 put_css_set_locked(cset->dom_cset);

987 }

988

989 kfree_rcu(cset, rcu_head);

990 }

991

992 /**

993 * compare_css_sets - helper function for find_existing_css_set().

994 * @cset: candidate css_set being tested

995 * @old_cset: existing css_set for a task

996 * @new_cgrp: cgroup that's being entered by the task

997 * @template: desired set of css pointers in css_set (pre-calculated)

998 *

999 * Returns true if "cset" matches "old_cset" except for the hierarchy

1000 * which "new_cgrp" belongs to, for which it should match "new_cgrp".

1001 */

1002 static bool compare_css_sets(struct css_set *cset,

1003 struct css_set *old_cset,

1004 struct cgroup *new_cgrp,

1005 struct cgroup_subsys_state *template[])

1006 {

1007 struct cgroup *new_dfl_cgrp;

1008 struct list_head *l1, *l2;

1009

1010 /*

1011 * On the default hierarchy, there can be csets which are

1012 * associated with the same set of cgroups but different csses.

1013 * Let's first ensure that csses match.

1014 */

1015 if (memcmp(template, cset->subsys, sizeof(cset->subsys)))

1016 return false;

1017

1018

1019 /* @cset's domain should match the default cgroup's */

1020 if (cgroup_on_dfl(new_cgrp))

1021 new_dfl_cgrp = new_cgrp;

1022 else

1023 new_dfl_cgrp = old_cset->dfl_cgrp;

1024

1025 if (new_dfl_cgrp->dom_cgrp != cset->dom_cset->dfl_cgrp)

1026 return false;

1027

1028 /*

1029 * Compare cgroup pointers in order to distinguish between

1030 * different cgroups in hierarchies. As different cgroups may

1031 * share the same effective css, this comparison is always

1032 * necessary.

1033 */

1034 l1 = &cset->cgrp_links;

1035 l2 = &old_cset->cgrp_links;

1036 while (1) {

1037 struct cgrp_cset_link *link1, *link2;

1038 struct cgroup *cgrp1, *cgrp2;

1039

1040 l1 = l1->next;

1041 l2 = l2->next;

1042 /* See if we reached the end - both lists are equal length. */

1043 if (l1 == &cset->cgrp_links) {

1044 BUG_ON(l2 != &old_cset->cgrp_links);

1045 break;

1046 } else {

1047 BUG_ON(l2 == &old_cset->cgrp_links);

1048 }

1049 /* Locate the cgroups associated with these links. */

1050 link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);

1051 link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);

1052 cgrp1 = link1->cgrp;

1053 cgrp2 = link2->cgrp;

1054 /* Hierarchies should be linked in the same order. */

1055 BUG_ON(cgrp1->root != cgrp2->root);

1056

1057 /*

1058 * If this hierarchy is the hierarchy of the cgroup

1059 * that's changing, then we need to check that this

1060 * css_set points to the new cgroup; if it's any other

1061 * hierarchy, then this css_set should point to the

1062 * same cgroup as the old css_set.

1063 */

1064 if (cgrp1->root == new_cgrp->root) {

1065 if (cgrp1 != new_cgrp)

1066 return false;

1067 } else {

1068 if (cgrp1 != cgrp2)

1069 return false;

1070 }

1071 }

1072 return true;

1073 }

1074

1075 /**

1076 * find_existing_css_set - init css array and find the matching css_set

1077 * @old_cset: the css_set that we're using before the cgroup transition

1078 * @cgrp: the cgroup that we're moving into

1079 * @template: out param for the new set of csses, should be clear on entry

1080 */

1081 static struct css_set *find_existing_css_set(struct css_set *old_cset,

1082 struct cgroup *cgrp,

1083 struct cgroup_subsys_state *template[])

1084 {

1085 struct cgroup_root *root = cgrp->root;

1086 struct cgroup_subsys *ss;

1087 struct css_set *cset;

1088 unsigned long key;

1089 int i;

1090

1091 /*

1092 * Build the set of subsystem state objects that we want to see in the

1093 * new css_set. While subsystems can change globally, the entries here

1094 * won't change, so no need for locking.

1095 */

1096 for_each_subsys(ss, i) {

1097 if (root->subsys_mask & (1UL << i)) {

1098 /*

1099 * @ss is in this hierarchy, so we want the

1100 * effective css from @cgrp.

1101 */

1102 template[i] = cgroup_e_css_by_mask(cgrp, ss);

1103 } else {

1104 /*

1105 * @ss is not in this hierarchy, so we don't want

1106 * to change the css.

1107 */

1108 template[i] = old_cset->subsys[i];

1109 }

1110 }

1111

1112 key = css_set_hash(template);

1113 hash_for_each_possible(css_set_table, cset, hlist, key) {

1114 if (!compare_css_sets(cset, old_cset, cgrp, template))

1115 continue;

1116

1117 /* This css_set matches what we need */

1118 return cset;

1119 }

1120

1121 /* No existing cgroup group matched */

1122 return NULL;

1123 }

1124

1125 static void free_cgrp_cset_links(struct list_head *links_to_free)

1126 {

1127 struct cgrp_cset_link *link, *tmp_link;

1128

1129 list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {

1130 list_del(&link->cset_link);

1131 kfree(link);

1132 }

1133 }

1134

1135 /**

1136 * allocate_cgrp_cset_links - allocate cgrp_cset_links

1137 * @count: the number of links to allocate

1138 * @tmp_links: list_head the allocated links are put on

1139 *

1140 * Allocate @count cgrp_cset_link structures and chain them on @tmp_links

1141 * through ->cset_link. Returns 0 on success or -errno.

1142 */

1143 static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)

1144 {

1145 struct cgrp_cset_link *link;

1146 int i;

1147

1148 INIT_LIST_HEAD(tmp_links);

1149

1150 for (i = 0; i < count; i++) {

1151 link = kzalloc(sizeof(*link), GFP_KERNEL);

1152 if (!link) {

1153 free_cgrp_cset_links(tmp_links);

1154 return -ENOMEM;

1155 }

1156 list_add(&link->cset_link, tmp_links);

1157 }

1158 return 0;

1159 }

1160

1161 /**

1162 * link_css_set - a helper function to link a css_set to a cgroup

1163 * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()

1164 * @cset: the css_set to be linked

1165 * @cgrp: the destination cgroup

1166 */

1167 static void link_css_set(struct list_head *tmp_links, struct css_set *cset,

1168 struct cgroup *cgrp)

1169 {

1170 struct cgrp_cset_link *link;

1171

1172 BUG_ON(list_empty(tmp_links));

1173

1174 if (cgroup_on_dfl(cgrp))

1175 cset->dfl_cgrp = cgrp;

1176

1177 link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);

1178 link->cset = cset;

1179 link->cgrp = cgrp;

1180

1181 /*

1182 * Always add links to the tail of the lists so that the lists are

1183 * in chronological order.

1184 */

1185 list_move_tail(&link->cset_link, &cgrp->cset_links);

1186 list_add_tail(&link->cgrp_link, &cset->cgrp_links);

1187

1188 if (cgroup_parent(cgrp))

1189 cgroup_get_live(cgrp);

1190 }

1191

/**
 * find_css_set - return a new css_set with one cgroup updated
 * @old_cset: the baseline css_set
 * @cgrp: the cgroup to be updated
 *
 * Return a css_set that's equivalent to @old_cset, but with @cgrp
 * substituted into the appropriate hierarchy.  A matching css_set that
 * already exists is reused with an extra reference taken; otherwise a new
 * one is allocated with its refcount set to 1, linked to its cgroups and
 * added to css_set_table.
 *
 * Must be called with cgroup_mutex held.  Returns %NULL if any of the
 * allocations fails.
 */
static struct css_set *find_css_set(struct css_set *old_cset,
				    struct cgroup *cgrp)
{
	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
	struct css_set *cset;
	struct list_head tmp_links;
	struct cgrp_cset_link *link;
	struct cgroup_subsys *ss;
	unsigned long key;
	int ssid;

	lockdep_assert_held(&cgroup_mutex);

	/*
	 * First see if we already have a css_set that matches the desired
	 * set.  The lookup also fills in @template, which is reused below
	 * when a new css_set has to be created.
	 */
	spin_lock_irq(&css_set_lock);
	cset = find_existing_css_set(old_cset, cgrp, template);
	if (cset)
		get_css_set(cset);
	spin_unlock_irq(&css_set_lock);

	if (cset)
		return cset;

	/* No match; build a new css_set.  Allocate outside css_set_lock. */
	cset = kzalloc(sizeof(*cset), GFP_KERNEL);
	if (!cset)
		return NULL;

	/* Allocate all the cgrp_cset_link objects that we'll need */
	if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
		kfree(cset);
		return NULL;
	}

	refcount_set(&cset->refcount, 1);
	/* a css_set is its own domain unless linked to another one below */
	cset->dom_cset = cset;
	INIT_LIST_HEAD(&cset->tasks);
	INIT_LIST_HEAD(&cset->mg_tasks);
	INIT_LIST_HEAD(&cset->dying_tasks);
	INIT_LIST_HEAD(&cset->task_iters);
	INIT_LIST_HEAD(&cset->threaded_csets);
	INIT_HLIST_NODE(&cset->hlist);
	INIT_LIST_HEAD(&cset->cgrp_links);
	INIT_LIST_HEAD(&cset->mg_src_preload_node);
	INIT_LIST_HEAD(&cset->mg_dst_preload_node);
	INIT_LIST_HEAD(&cset->mg_node);

	/*
	 * Copy the set of subsystem state objects generated in
	 * find_existing_css_set()
	 */
	memcpy(cset->subsys, template, sizeof(cset->subsys));

	spin_lock_irq(&css_set_lock);
	/*
	 * Add reference counts and links from the new css_set: one link per
	 * cgroup @old_cset is linked to, with @cgrp substituted for the
	 * cgroup in @cgrp's hierarchy.
	 */
	list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
		struct cgroup *c = link->cgrp;

		if (c->root == cgrp->root)
			c = cgrp;
		link_css_set(&tmp_links, cset, c);
	}

	/* every preallocated link is expected to have been consumed */
	BUG_ON(!list_empty(&tmp_links));

	css_set_count++;

	/* Add @cset to the hash table */
	key = css_set_hash(cset->subsys);
	hash_add(css_set_table, &cset->hlist, key);

	/* put @cset on each css's cgroup's e_csets list and pin the css */
	for_each_subsys(ss, ssid) {
		struct cgroup_subsys_state *css = cset->subsys[ssid];

		list_add_tail(&cset->e_cset_node[ssid],
			      &css->cgroup->e_csets[ssid]);
		css_get(css);
	}

	spin_unlock_irq(&css_set_lock);

	/*
	 * If @cset should be threaded, look up the matching dom_cset and
	 * link them up.  We first fully initialize @cset then look for the
	 * dom_cset.  It's simpler this way and safe as @cset is guaranteed
	 * to stay empty until we return.
	 */
	if (cgroup_is_threaded(cset->dfl_cgrp)) {
		struct css_set *dcset;

		/* recursive lookup; the result stays referenced via ->dom_cset */
		dcset = find_css_set(cset, cset->dfl_cgrp->dom_cgrp);
		if (!dcset) {
			put_css_set(cset);
			return NULL;
		}

		spin_lock_irq(&css_set_lock);
		cset->dom_cset = dcset;
		list_add_tail(&cset->threaded_csets_node,
			      &dcset->threaded_csets);
		spin_unlock_irq(&css_set_lock);
	}

	return cset;
}

1303

1304 struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)

1305 {

1306 struct cgroup *root_cgrp = kernfs_root_to_node(kf_root)->priv;

1307

1308 return root_cgrp->root;

1309 }

1310

1311 void cgroup_favor_dynmods(struct cgroup_root *root, bool favor)

1312 {

1313 bool favoring = root->flags & CGRP_ROOT_FAVOR_DYNMODS;

1314

1315 /* see the comment above CGRP_ROOT_FAVOR_DYNMODS definition */

1316 if (favor && !favoring) {

1317 rcu_sync_enter(&cgroup_threadgroup_rwsem.rss);

1318 root->flags |= CGRP_ROOT_FAVOR_DYNMODS;

1319 } else if (!favor && favoring) {

1320 rcu_sync_exit(&cgroup_threadgroup_rwsem.rss);

1321 root->flags &= ~CGRP_ROOT_FAVOR_DYNMODS;

1322 }

1323 }

1324

1325 static int cgroup_init_root_id(struct cgroup_root *root)

1326 {

1327 int id;

1328

1329 lockdep_assert_held(&cgroup_mutex);

1330

1331 id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL);

1332 if (id < 0)

1333 return id;

1334

1335 root->hierarchy_id = id;

1336 return 0;

1337 }

1338

1339 static void cgroup_exit_root_id(struct cgroup_root *root)

1340 {

1341 lockdep_assert_held(&cgroup_mutex);

1342

1343 idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);

1344 }

1345

/* Release the memory of @root with kfree(). */
void cgroup_free_root(struct cgroup_root *root)
{
	kfree(root);
}

1350

1351 static void cgroup_destroy_root(struct cgroup_root *root)

1352 {

1353 struct cgroup *cgrp = &root->cgrp;

1354 struct cgrp_cset_link *link, *tmp_link;

1355

1356 trace_cgroup_destroy_root(root);

1357

1358 cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);

1359

1360 BUG_ON(atomic_read(&root->nr_cgrps));

1361 BUG_ON(!list_empty(&cgrp->self.children));

1362

1363 /* Rebind all subsystems back to the default hierarchy */

1364 WARN_ON(rebind_subsystems(&cgrp_dfl_root, root->subsys_mask));

1365

1366 /*

1367 * Release all the links from cset_links to this hierarchy's

1368 * root cgroup

1369 */

1370 spin_lock_irq(&css_set_lock);

1371

1372 list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {

1373 list_del(&link->cset_link);

1374 list_del(&link->cgrp_link);

1375 kfree(link);

1376 }

1377

1378 spin_unlock_irq(&css_set_lock);

1379

1380 if (!list_empty(&root->root_list)) {

1381 list_del(&root->root_list);

1382 cgroup_root_count--;

1383 }

1384

1385 cgroup_favor_dynmods(root, false);

1386 cgroup_exit_root_id(root);

1387

1388 mutex_unlock(&cgroup_mutex);

1389

1390 cgroup_rstat_exit(cgrp);

1391 kernfs_destroy_root(root->kf_root);

1392 cgroup_free_root(root);

1393 }

1394

1395 static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset,

1396 struct cgroup_root *root)

1397 {

1398 struct cgroup *res_cgroup = NULL;

1399

1400 if (cset == &init_css_set) {

1401 res_cgroup = &root->cgrp;

1402 } else if (root == &cgrp_dfl_root) {

1403 res_cgroup = cset->dfl_cgrp;

1404 } else {

1405 struct cgrp_cset_link *link;

1406

1407 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {

1408 struct cgroup *c = link->cgrp;

1409

1410 if (c->root == root) {

1411 res_cgroup = c;

1412 break;

1413 }

1414 }

1415 }

1416

1417 return res_cgroup;

1418 }

1419

1420 /*

1421 * look up cgroup associated with current task's cgroup namespace on the

1422 * specified hierarchy

1423 */

1424 static struct cgroup *

1425 current_cgns_cgroup_from_root(struct cgroup_root *root)

1426 {

1427 struct cgroup *res = NULL;

1428 struct css_set *cset;

1429

1430 lockdep_assert_held(&css_set_lock);

1431

1432 rcu_read_lock();

1433

1434 cset = current->nsproxy->cgroup_ns->root_cset;

1435 res = __cset_cgroup_from_root(cset, root);

1436

1437 rcu_read_unlock();

1438

1439 BUG_ON(!res);

1440 return res;

1441 }

1442

1443 /* look up cgroup associated with given css_set on the specified hierarchy */

1444 static struct cgroup *cset_cgroup_from_root(struct css_set *cset,

1445 struct cgroup_root *root)

1446 {

1447 struct cgroup *res = NULL;

1448

1449 lockdep_assert_held(&cgroup_mutex);

1450 lockdep_assert_held(&css_set_lock);

1451

1452 res = __cset_cgroup_from_root(cset, root);

1453

1454 BUG_ON(!res);

1455 return res;

1456 }

1457

/*
 * Return the cgroup @task belongs to in hierarchy @root.  The caller
 * must hold both cgroup_mutex and css_set_lock.
 */
struct cgroup *task_cgroup_from_root(struct task_struct *task,
				     struct cgroup_root *root)
{
	/*
	 * @task doesn't need to be locked: it can't change groups while
	 * we hold css_set_lock.
	 */
	struct css_set *cset = task_css_set(task);

	return cset_cgroup_from_root(cset, root);
}

1471

1472 /*

1473 * A task must hold cgroup_mutex to modify cgroups.

1474 *

1475 * Any task can increment and decrement the count field without lock.

1476 * So in general, code holding cgroup_mutex can't rely on the count

1477 * field not changing. However, if the count goes to zero, then only

1478 * cgroup_attach_task() can increment it again. Because a count of zero

1479 * means that no tasks are currently attached, therefore there is no

1480 * way a task attached to that cgroup can fork (the other way to

1481 * increment the count). So code holding cgroup_mutex can safely

1482 * assume that if the count is zero, it will stay zero. Similarly, if

1483 * a task holds cgroup_mutex on a cgroup with zero count, it

1484 * knows that the cgroup won't be removed, as cgroup_rmdir()

1485 * needs that mutex.

1486 *

1487 * A cgroup can only be deleted if both its 'count' of using tasks

1488 * is zero, and its list of 'children' cgroups is empty. Since all

1489 * tasks in the system use _some_ cgroup, and since there is always at

1490 * least one task in the system (init, pid == 1), therefore, root cgroup

1491 * always has either children cgroups and/or using tasks. So we don't

1492 * need a special hack to ensure that root cgroup cannot be deleted.

1493 *

1494 * P.S. One more locking exception. RCU is used to guard the

1495 * update of a task's cgroup pointer by cgroup_attach_task()

1496 */

1497

/*
 * Forward declaration so that cgroup_setup_root() can reference the ops;
 * the initialized definition presumably follows later in this file.
 */
static struct kernfs_syscall_ops cgroup_kf_syscall_ops;

1499

1500 static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,

1501 char *buf)

1502 {

1503 struct cgroup_subsys *ss = cft->ss;

1504

1505 if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&

1506 !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {

1507 const char *dbg = (cft->flags & CFTYPE_DEBUG) ? ".__DEBUG__." : "";

1508

1509 snprintf(buf, CGROUP_FILE_NAME_MAX, "%s%s.%s",

1510 dbg, cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,

1511 cft->name);

1512 } else {

1513 strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX);

1514 }

1515 return buf;

1516 }

1517

1518 /**

1519 * cgroup_file_mode - deduce file mode of a control file

1520 * @cft: the control file in question

1521 *

1522 * S_IRUGO for read, S_IWUSR for write.

1523 */

1524 static umode_t cgroup_file_mode(const struct cftype *cft)

1525 {

1526 umode_t mode = 0;

1527

1528 if (cft->read_u64 || cft->read_s64 || cft->seq_show)

1529 mode |= S_IRUGO;

1530

1531 if (cft->write_u64 || cft->write_s64 || cft->write) {

1532 if (cft->flags & CFTYPE_WORLD_WRITABLE)

1533 mode |= S_IWUGO;

1534 else

1535 mode |= S_IWUSR;

1536 }

1537

1538 return mode;

1539 }

1540

1541 /**

1542 * cgroup_calc_subtree_ss_mask - calculate subtree_ss_mask

1543 * @subtree_control: the new subtree_control mask to consider

1544 * @this_ss_mask: available subsystems

1545 *

1546 * On the default hierarchy, a subsystem may request other subsystems to be

1547 * enabled together through its ->depends_on mask. In such cases, more

1548 * subsystems than specified in "cgroup.subtree_control" may be enabled.

1549 *

1550 * This function calculates which subsystems need to be enabled if

1551 * @subtree_control is to be applied while restricted to @this_ss_mask.

1552 */

1553 static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask)

1554 {

1555 u16 cur_ss_mask = subtree_control;

1556 struct cgroup_subsys *ss;

1557 int ssid;

1558

1559 lockdep_assert_held(&cgroup_mutex);

1560

1561 cur_ss_mask |= cgrp_dfl_implicit_ss_mask;

1562

1563 while (true) {

1564 u16 new_ss_mask = cur_ss_mask;

1565

1566 do_each_subsys_mask(ss, ssid, cur_ss_mask) {

1567 new_ss_mask |= ss->depends_on;

1568 } while_each_subsys_mask();

1569

1570 /*

1571 * Mask out subsystems which aren't available. This can

1572 * happen only if some depended-upon subsystems were bound

1573 * to non-default hierarchies.

1574 */

1575 new_ss_mask &= this_ss_mask;

1576

1577 if (new_ss_mask == cur_ss_mask)

1578 break;

1579 cur_ss_mask = new_ss_mask;

1580 }

1581

1582 return cur_ss_mask;

1583 }

1584

1585 /**

1586 * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods

1587 * @kn: the kernfs_node being serviced

1588 *

1589 * This helper undoes cgroup_kn_lock_live() and should be invoked before

1590 * the method finishes if locking succeeded. Note that once this function

1591 * returns the cgroup returned by cgroup_kn_lock_live() may become

1592 * inaccessible any time. If the caller intends to continue to access the

1593 * cgroup, it should pin it before invoking this function.

1594 */

1595 void cgroup_kn_unlock(struct kernfs_node *kn)

1596 {

1597 struct cgroup *cgrp;

1598

1599 if (kernfs_type(kn) == KERNFS_DIR)

1600 cgrp = kn->priv;

1601 else

1602 cgrp = kn->parent->priv;

1603

1604 mutex_unlock(&cgroup_mutex);

1605

1606 kernfs_unbreak_active_protection(kn);

1607 cgroup_put(cgrp);

1608 }

1609

1610 /**

1611 * cgroup_kn_lock_live - locking helper for cgroup kernfs methods

1612 * @kn: the kernfs_node being serviced

1613 * @drain_offline: perform offline draining on the cgroup

1614 *

1615 * This helper is to be used by a cgroup kernfs method currently servicing

1616 * @kn. It breaks the active protection, performs cgroup locking and

1617 * verifies that the associated cgroup is alive. Returns the cgroup if

1618 * alive; otherwise, %NULL. A successful return should be undone by a

1619 * matching cgroup_kn_unlock() invocation. If @drain_offline is %true, the

1620 * cgroup is drained of offlining csses before return.

1621 *

1622 * Any cgroup kernfs method implementation which requires locking the

1623 * associated cgroup should use this helper. It avoids nesting cgroup

1624 * locking under kernfs active protection and allows all kernfs operations

1625 * including self-removal.

1626 */

1627 struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline)

1628 {

1629 struct cgroup *cgrp;

1630

1631 if (kernfs_type(kn) == KERNFS_DIR)

1632 cgrp = kn->priv;

1633 else

1634 cgrp = kn->parent->priv;

1635

1636 /*

1637 * We're gonna grab cgroup_mutex which nests outside kernfs

1638 * active_ref. cgroup liveliness check alone provides enough

1639 * protection against removal. Ensure @cgrp stays accessible and

1640 * break the active_ref protection.

1641 */

1642 if (!cgroup_tryget(cgrp))

1643 return NULL;

1644 kernfs_break_active_protection(kn);

1645

1646 if (drain_offline)

1647 cgroup_lock_and_drain_offline(cgrp);

1648 else

1649 mutex_lock(&cgroup_mutex);

1650

1651 if (!cgroup_is_dead(cgrp))

1652 return cgrp;

1653

1654 cgroup_kn_unlock(kn);

1655 return NULL;

1656 }

1657

1658 static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)

1659 {

1660 char name[CGROUP_FILE_NAME_MAX];

1661

1662 lockdep_assert_held(&cgroup_mutex);

1663

1664 if (cft->file_offset) {

1665 struct cgroup_subsys_state *css = cgroup_css(cgrp, cft->ss);

1666 struct cgroup_file *cfile = (void *)css + cft->file_offset;

1667

1668 spin_lock_irq(&cgroup_file_kn_lock);

1669 cfile->kn = NULL;

1670 spin_unlock_irq(&cgroup_file_kn_lock);

1671

1672 del_timer_sync(&cfile->notify_timer);

1673 }

1674

1675 kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));

1676 }

1677

1678 /**

1679 * css_clear_dir - remove subsys files in a cgroup directory

1680 * @css: target css

1681 */

1682 static void css_clear_dir(struct cgroup_subsys_state *css)

1683 {

1684 struct cgroup *cgrp = css->cgroup;

1685 struct cftype *cfts;

1686

1687 if (!(css->flags & CSS_VISIBLE))

1688 return;

1689

1690 css->flags &= ~CSS_VISIBLE;

1691

1692 if (!css->ss) {

1693 if (cgroup_on_dfl(cgrp)) {

1694 cgroup_addrm_files(css, cgrp,

1695 cgroup_base_files, false);

1696 if (cgroup_psi_enabled())

1697 cgroup_addrm_files(css, cgrp,

1698 cgroup_psi_files, false);

1699 } else {

1700 cgroup_addrm_files(css, cgrp,

1701 cgroup1_base_files, false);

1702 }

1703 } else {

1704 list_for_each_entry(cfts, &css->ss->cfts, node)

1705 cgroup_addrm_files(css, cgrp, cfts, false);

1706 }

1707 }

1708

1709 /**

1710 * css_populate_dir - create subsys files in a cgroup directory

1711 * @css: target css

1712 *

1713 * On failure, no file is added.

1714 */

1715 static int css_populate_dir(struct cgroup_subsys_state *css)

1716 {

1717 struct cgroup *cgrp = css->cgroup;

1718 struct cftype *cfts, *failed_cfts;

1719 int ret;

1720

1721 if ((css->flags & CSS_VISIBLE) || !cgrp->kn)

1722 return 0;

1723

1724 if (!css->ss) {

1725 if (cgroup_on_dfl(cgrp)) {

1726 ret = cgroup_addrm_files(&cgrp->self, cgrp,

1727 cgroup_base_files, true);

1728 if (ret < 0)

1729 return ret;

1730

1731 if (cgroup_psi_enabled()) {

1732 ret = cgroup_addrm_files(&cgrp->self, cgrp,

1733 cgroup_psi_files, true);

1734 if (ret < 0)

1735 return ret;

1736 }

1737 } else {

1738 cgroup_addrm_files(css, cgrp,

1739 cgroup1_base_files, true);

1740 }

1741 } else {

1742 list_for_each_entry(cfts, &css->ss->cfts, node) {

1743 ret = cgroup_addrm_files(css, cgrp, cfts, true);

1744 if (ret < 0) {

1745 failed_cfts = cfts;

1746 goto err;

1747 }

1748 }

1749 }

1750

1751 css->flags |= CSS_VISIBLE;

1752

1753 return 0;

1754 err:

1755 list_for_each_entry(cfts, &css->ss->cfts, node) {

1756 if (cfts == failed_cfts)

1757 break;

1758 cgroup_addrm_files(css, cgrp, cfts, false);

1759 }

1760 return ret;

1761 }

1762

/**
 * rebind_subsystems - move subsystems to a different hierarchy
 * @dst_root: the hierarchy the subsystems are moved to
 * @ss_mask: bitmask of the IDs of the subsystems to move
 *
 * Must be called with cgroup_mutex held.  All checks that can fail are
 * done before anything is modified.  Returns 0 on success, or -EBUSY if a
 * subsystem in @ss_mask can't be moved.  A failure to apply control on
 * @dst_root after a subsystem has been moved is only warned about.
 */
int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
{
	struct cgroup *dcgrp = &dst_root->cgrp;
	struct cgroup_subsys *ss;
	int ssid, i, ret;
	u16 dfl_disable_ss_mask = 0;

	lockdep_assert_held(&cgroup_mutex);

	/* Pass 1: validation only, nothing is changed yet. */
	do_each_subsys_mask(ss, ssid, ss_mask) {
		/*
		 * If @ss has non-root csses attached to it, can't move.
		 * If @ss is an implicit controller, it is exempt from this
		 * rule and can be stolen.
		 */
		if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)) &&
		    !ss->implicit_on_dfl)
			return -EBUSY;

		/* can't move between two non-dummy roots either */
		if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
			return -EBUSY;

		/*
		 * Collect ssid's that need to be disabled from default
		 * hierarchy.
		 */
		if (ss->root == &cgrp_dfl_root)
			dfl_disable_ss_mask |= 1 << ssid;

	} while_each_subsys_mask();

	if (dfl_disable_ss_mask) {
		struct cgroup *scgrp = &cgrp_dfl_root.cgrp;

		/*
		 * Controllers from default hierarchy that need to be rebound
		 * are all disabled together in one go.
		 */
		cgrp_dfl_root.subsys_mask &= ~dfl_disable_ss_mask;
		WARN_ON(cgroup_apply_control(scgrp));
		cgroup_finalize_control(scgrp, 0);
	}

	/* Pass 2: move the subsystems over, one at a time. */
	do_each_subsys_mask(ss, ssid, ss_mask) {
		struct cgroup_root *src_root = ss->root;
		struct cgroup *scgrp = &src_root->cgrp;
		struct cgroup_subsys_state *css = cgroup_css(scgrp, ss);
		struct css_set *cset;

		/* the source must have a css for @ss, the destination must not */
		WARN_ON(!css || cgroup_css(dcgrp, ss));

		/* default-hierarchy sources were already disabled above */
		if (src_root != &cgrp_dfl_root) {
			/* disable from the source */
			src_root->subsys_mask &= ~(1 << ssid);
			WARN_ON(cgroup_apply_control(scgrp));
			cgroup_finalize_control(scgrp, 0);
		}

		/* rebind: hand the root css over from @scgrp to @dcgrp */
		RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
		rcu_assign_pointer(dcgrp->subsys[ssid], css);
		ss->root = dst_root;
		css->cgroup = dcgrp;

		/* move every css_set onto the destination's e_csets list for @ss */
		spin_lock_irq(&css_set_lock);
		hash_for_each(css_set_table, i, cset, hlist)
			list_move_tail(&cset->e_cset_node[ss->id],
				       &dcgrp->e_csets[ss->id]);
		spin_unlock_irq(&css_set_lock);

		if (ss->css_rstat_flush) {
			/*
			 * Remove the css from the rstat list, wait for an RCU
			 * grace period, then add it to the destination's list.
			 */
			list_del_rcu(&css->rstat_css_node);
			synchronize_rcu();
			list_add_rcu(&css->rstat_css_node,
				     &dcgrp->rstat_css_list);
		}

		/* default hierarchy doesn't enable controllers by default */
		dst_root->subsys_mask |= 1 << ssid;
		if (dst_root == &cgrp_dfl_root) {
			static_branch_enable(cgroup_subsys_on_dfl_key[ssid]);
		} else {
			dcgrp->subtree_control |= 1 << ssid;
			static_branch_disable(cgroup_subsys_on_dfl_key[ssid]);
		}

		/* the subsystem has already moved, so a failure is only logged */
		ret = cgroup_apply_control(dcgrp);
		if (ret)
			pr_warn("partial failure to rebind %s controller (err=%d)\n",
				ss->name, ret);

		/* optional per-subsystem callback, invoked after the move */
		if (ss->bind)
			ss->bind(css);
	} while_each_subsys_mask();

	kernfs_activate(dcgrp->kn);
	return 0;
}

1862

1863 int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,

1864 struct kernfs_root *kf_root)

1865 {

1866 int len = 0;

1867 char *buf = NULL;

1868 struct cgroup_root *kf_cgroot = cgroup_root_from_kf(kf_root);

1869 struct cgroup *ns_cgroup;

1870

1871 buf = kmalloc(PATH_MAX, GFP_KERNEL);

1872 if (!buf)

1873 return -ENOMEM;

1874

1875 spin_lock_irq(&css_set_lock);

1876 ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot);

1877 len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX);

1878 spin_unlock_irq(&css_set_lock);

1879

1880 if (len >= PATH_MAX)

1881 len = -ERANGE;

1882 else if (len > 0) {

1883 seq_escape(sf, buf, " \t\n\\");

1884 len = 0;

1885 }

1886 kfree(buf);

1887 return len;

1888 }

1889

/*
 * Identifiers of the cgroup2 mount options.  cgroup2_fs_parameters[] maps
 * option names to these values and cgroup2_parse_param() switches on
 * them.  nr__cgroup2_params comes last and so equals the number of
 * options.
 */
enum cgroup2_param {
	Opt_nsdelegate,
	Opt_favordynmods,
	Opt_memory_localevents,
	Opt_memory_recursiveprot,
	nr__cgroup2_params
};

1897

/*
 * Mount options understood by cgroup2, all of them plain flags.  The
 * table is handed to fs_parse() by cgroup2_parse_param() and is
 * terminated by an empty entry.
 */
static const struct fs_parameter_spec cgroup2_fs_parameters[] = {
	fsparam_flag("nsdelegate",		Opt_nsdelegate),
	fsparam_flag("favordynmods",		Opt_favordynmods),
	fsparam_flag("memory_localevents",	Opt_memory_localevents),
	fsparam_flag("memory_recursiveprot",	Opt_memory_recursiveprot),
	{}
};

1905

1906 static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param)

1907 {

1908 struct cgroup_fs_context *ctx = cgroup_fc2context(fc);

1909 struct fs_parse_result result;

1910 int opt;

1911

1912 opt = fs_parse(fc, cgroup2_fs_parameters, param, &result);

1913 if (opt < 0)

1914 return opt;

1915

1916 switch (opt) {

1917 case Opt_nsdelegate:

1918 ctx->flags |= CGRP_ROOT_NS_DELEGATE;

1919 return 0;

1920 case Opt_favordynmods:

1921 ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS;

1922 return 0;

1923 case Opt_memory_localevents:

1924 ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;

1925 return 0;

1926 case Opt_memory_recursiveprot:

1927 ctx->flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;

1928 return 0;

1929 }

1930 return -EINVAL;

1931 }

1932

1933 static void apply_cgroup_root_flags(unsigned int root_flags)

1934 {

1935 if (current->nsproxy->cgroup_ns == &init_cgroup_ns) {

1936 if (root_flags & CGRP_ROOT_NS_DELEGATE)

1937 cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE;

1938 else

1939 cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE;

1940

1941 cgroup_favor_dynmods(&cgrp_dfl_root,

1942 root_flags & CGRP_ROOT_FAVOR_DYNMODS);

1943

1944 if (root_flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)

1945 cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;

1946 else

1947 cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_LOCAL_EVENTS;

1948

1949 if (root_flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)

1950 cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;

1951 else

1952 cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_RECURSIVE_PROT;

1953 }

1954 }

1955

1956 static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root)

1957 {

1958 if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)

1959 seq_puts(seq, ",nsdelegate");

1960 if (cgrp_dfl_root.flags & CGRP_ROOT_FAVOR_DYNMODS)

1961 seq_puts(seq, ",favordynmods");

1962 if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)

1963 seq_puts(seq, ",memory_localevents");

1964 if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)

1965 seq_puts(seq, ",memory_recursiveprot");

1966 return 0;

1967 }

1968

1969 static int cgroup_reconfigure(struct fs_context *fc)

1970 {

1971 struct cgroup_fs_context *ctx = cgroup_fc2context(fc);

1972

1973 apply_cgroup_root_flags(ctx->flags);

1974 return 0;

1975 }

1976

1977 static void init_cgroup_housekeeping(struct cgroup *cgrp)

1978 {

1979 struct cgroup_subsys *ss;

1980 int ssid;

1981

1982 INIT_LIST_HEAD(&cgrp->self.sibling);

1983 INIT_LIST_HEAD(&cgrp->self.children);

1984 INIT_LIST_HEAD(&cgrp->cset_links);

1985 INIT_LIST_HEAD(&cgrp->pidlists);

1986 mutex_init(&cgrp->pidlist_mutex);

1987 cgrp->self.cgroup = cgrp;

1988 cgrp->self.flags |= CSS_ONLINE;

1989 cgrp->dom_cgrp = cgrp;

1990 cgrp->max_descendants = INT_MAX;

1991 cgrp->max_depth = INT_MAX;

1992 INIT_LIST_HEAD(&cgrp->rstat_css_list);

1993 prev_cputime_init(&cgrp->prev_cputime);

1994

1995 for_each_subsys(ss, ssid)

1996 INIT_LIST_HEAD(&cgrp->e_csets[ssid]);

1997

1998 init_waitqueue_head(&cgrp->offline_waitq);

1999 INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);

2000 }

2001

2002 void init_cgroup_root(struct cgroup_fs_context *ctx)

2003 {

2004 struct cgroup_root *root = ctx->root;

2005 struct cgroup *cgrp = &root->cgrp;

2006

2007 INIT_LIST_HEAD(&root->root_list);

2008 atomic_set(&root->nr_cgrps, 1);

2009 cgrp->root = root;

2010 init_cgroup_housekeeping(cgrp);

2011

2012 /* DYNMODS must be modified through cgroup_favor_dynmods() */

2013 root->flags = ctx->flags & ~CGRP_ROOT_FAVOR_DYNMODS;

2014 if (ctx->release_agent)

2015 strscpy(root->release_agent_path, ctx->release_agent, PATH_MAX);

2016 if (ctx->name)

2017 strscpy(root->name, ctx->name, MAX_CGROUP_ROOT_NAMELEN);

2018 if (ctx->cpuset_clone_children)

2019 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);

2020 }

2021

2022 int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)

2023 {

2024 LIST_HEAD(tmp_links);

2025 struct cgroup *root_cgrp = &root->cgrp;

2026 struct kernfs_syscall_ops *kf_sops;

2027 struct css_set *cset;

2028 int i, ret;

2029

2030 lockdep_assert_held(&cgroup_mutex);

2031

2032 ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release,

2033 0, GFP_KERNEL);

2034 if (ret)

2035 goto out;

2036

2037 /*

2038 * We're accessing css_set_count without locking css_set_lock here,

2039 * but that's OK - it can only be increased by someone holding

2040 * cgroup_lock, and that's us. Later rebinding may disable

2041 * controllers on the default hierarchy and thus create new csets,

2042 * which can't be more than the existing ones. Allocate 2x.

2043 */

2044 ret = allocate_cgrp_cset_links(2 * css_set_count, &tmp_links);

2045 if (ret)

2046 goto cancel_ref;

2047

2048 ret = cgroup_init_root_id(root);

2049 if (ret)

2050 goto cancel_ref;

2051

2052 kf_sops = root == &cgrp_dfl_root ?

2053 &cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops;

2054

2055 root->kf_root = kernfs_create_root(kf_sops,

2056 KERNFS_ROOT_CREATE_DEACTIVATED |

2057 KERNFS_ROOT_SUPPORT_EXPORTOP |

2058 KERNFS_ROOT_SUPPORT_USER_XATTR,

2059 root_cgrp);

2060 if (IS_ERR(root->kf_root)) {

2061 ret = PTR_ERR(root->kf_root);

2062 goto exit_root_id;

2063 }

2064 root_cgrp->kn = kernfs_root_to_node(root->kf_root);

2065 WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1);

2066 root_cgrp->ancestors[0] = root_cgrp;

2067

2068 ret = css_populate_dir(&root_cgrp->self);

2069 if (ret)

2070 goto destroy_root;

2071

2072 ret = cgroup_rstat_init(root_cgrp);

2073 if (ret)

2074 goto destroy_root;

2075

2076 ret = rebind_subsystems(root, ss_mask);

2077 if (ret)

2078 goto exit_stats;

2079

2080 ret = cgroup_bpf_inherit(root_cgrp);

2081 WARN_ON_ONCE(ret);

2082

2083 trace_cgroup_setup_root(root);

2084

2085 /*

2086 * There must be no failure case after here, since rebinding takes

2087 * care of subsystems' refcounts, which are explicitly dropped in

2088 * the failure exit path.

2089 */

2090 list_add(&root->root_list, &cgroup_roots);

2091 cgroup_root_count++;

2092

2093 /*

2094 * Link the root cgroup in this hierarchy into all the css_set

2095 * objects.

2096 */

2097 spin_lock_irq(&css_set_lock);

2098 hash_for_each(css_set_table, i, cset, hlist) {

2099 link_css_set(&tmp_links, cset, root_cgrp);

2100 if (css_set_populated(cset))

2101 cgroup_update_populated(root_cgrp, true);

2102 }

2103 spin_unlock_irq(&css_set_lock);

2104

2105 BUG_ON(!list_empty(&root_cgrp->self.children));

2106 BUG_ON(atomic_read(&root->nr_cgrps) != 1);

2107

2108 ret = 0;

2109 goto out;

2110

2111 exit_stats:

2112 cgroup_rstat_exit(root_cgrp);

2113 destroy_root:

2114 kernfs_destroy_root(root->kf_root);

2115 root->kf_root = NULL;

2116 exit_root_id:

2117 cgroup_exit_root_id(root);

2118 cancel_ref:

2119 percpu_ref_exit(&root_cgrp->self.refcnt);

2120 out:

2121 free_cgrp_cset_links(&tmp_links);

2122 return ret;

2123 }

2124

2125 int cgroup_do_get_tree(struct fs_context *fc)

2126 {

2127 struct cgroup_fs_context *ctx = cgroup_fc2context(fc);

2128 int ret;

2129

2130 ctx->kfc.root = ctx->root->kf_root;

2131 if (fc->fs_type == &cgroup2_fs_type)

2132 ctx->kfc.magic = CGROUP2_SUPER_MAGIC;

2133 else

2134 ctx->kfc.magic = CGROUP_SUPER_MAGIC;

2135 ret = kernfs_get_tree(fc);

2136

2137 /*

2138 * In non-init cgroup namespace, instead of root cgroup's dentry,

2139 * we return the dentry corresponding to the cgroupns->root_cgrp.

2140 */

2141 if (!ret && ctx->ns != &init_cgroup_ns) {

2142 struct dentry *nsdentry;

2143 struct super_block *sb = fc->root->d_sb;

2144 struct cgroup *cgrp;

2145

2146 mutex_lock(&cgroup_mutex);

2147 spin_lock_irq(&css_set_lock);

2148

2149 cgrp = cset_cgroup_from_root(ctx->ns->root_cset, ctx->root);

2150

2151 spin_unlock_irq(&css_set_lock);

2152 mutex_unlock(&cgroup_mutex);

2153

2154 nsdentry = kernfs_node_dentry(cgrp->kn, sb);

2155 dput(fc->root);

2156 if (IS_ERR(nsdentry)) {

2157 deactivate_locked_super(sb);

2158 ret = PTR_ERR(nsdentry);

2159 nsdentry = NULL;

2160 }

2161 fc->root = nsdentry;

2162 }

2163

2164 if (!ctx->kfc.new_sb_created)

2165 cgroup_put(&ctx->root->cgrp);

2166

2167 return ret;

2168 }

2169

2170 /*

2171 * Destroy a cgroup filesystem context.

2172 */

2173 static void cgroup_fs_context_free(struct fs_context *fc)

2174 {

2175 struct cgroup_fs_context *ctx = cgroup_fc2context(fc);

2176

2177 kfree(ctx->name);

2178 kfree(ctx->release_agent);

2179 put_cgroup_ns(ctx->ns);

2180 kernfs_free_fs_context(fc);

2181 kfree(ctx);

2182 }

2183

2184 static int cgroup_get_tree(struct fs_context *fc)

2185 {

2186 struct cgroup_fs_context *ctx = cgroup_fc2context(fc);

2187 int ret;

2188

2189 WRITE_ONCE(cgrp_dfl_visible, true);

2190 cgroup_get_live(&cgrp_dfl_root.cgrp);

2191 ctx->root = &cgrp_dfl_root;

2192

2193 ret = cgroup_do_get_tree(fc);

2194 if (!ret)

2195 apply_cgroup_root_flags(ctx->flags);

2196 return ret;

2197 }

2198

2199 static const struct fs_context_operations cgroup_fs_context_ops = {

2200 .free = cgroup_fs_context_free,

2201 .parse_param = cgroup2_parse_param,

2202 .get_tree = cgroup_get_tree,

2203 .reconfigure = cgroup_reconfigure,

2204 };

2205

2206 static const struct fs_context_operations cgroup1_fs_context_ops = {

2207 .free = cgroup_fs_context_free,

2208 .parse_param = cgroup1_parse_param,

2209 .get_tree = cgroup1_get_tree,

2210 .reconfigure = cgroup1_reconfigure,

2211 };

2212

2213 /*

2214 * Initialise the cgroup filesystem creation/reconfiguration context. Notably,

2215 * we select the namespace we're going to use.

2216 */

2217 static int cgroup_init_fs_context(struct fs_context *fc)

2218 {

2219 struct cgroup_fs_context *ctx;

2220

2221 ctx = kzalloc(sizeof(struct cgroup_fs_context), GFP_KERNEL);

2222 if (!ctx)

2223 return -ENOMEM;

2224

2225 ctx->ns = current->nsproxy->cgroup_ns;

2226 get_cgroup_ns(ctx->ns);

2227 fc->fs_private = &ctx->kfc;

2228 if (fc->fs_type == &cgroup2_fs_type)

2229 fc->ops = &cgroup_fs_context_ops;

2230 else

2231 fc->ops = &cgroup1_fs_context_ops;

2232 put_user_ns(fc->user_ns);

2233 fc->user_ns = get_user_ns(ctx->ns->user_ns);

2234 fc->global = true;

2235

2236 #ifdef CONFIG_CGROUP_FAVOR_DYNMODS

2237 ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS;

2238 #endif

2239 return 0;

2240 }

2241

2242 static void cgroup_kill_sb(struct super_block *sb)

2243 {

2244 struct kernfs_root *kf_root = kernfs_root_from_sb(sb);

2245 struct cgroup_root *root = cgroup_root_from_kf(kf_root);

2246

2247 /*

2248 * If @root doesn't have any children, start killing it.

2249 * This prevents new mounts by disabling percpu_ref_tryget_live().

2250 *

2251 * And don't kill the default root.

2252 */

2253 if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root &&

2254 !percpu_ref_is_dying(&root->cgrp.self.refcnt)) {

2255 cgroup_bpf_offline(&root->cgrp);

2256 percpu_ref_kill(&root->cgrp.self.refcnt);

2257 }

2258 cgroup_put(&root->cgrp);

2259 kernfs_kill_sb(sb);

2260 }

2261

2262 struct file_system_type cgroup_fs_type = {

2263 .name = "cgroup",

2264 .init_fs_context = cgroup_init_fs_context,

2265 .parameters = cgroup1_fs_parameters,

2266 .kill_sb = cgroup_kill_sb,

2267 .fs_flags = FS_USERNS_MOUNT,

2268 };

2269

2270 static struct file_system_type cgroup2_fs_type = {

2271 .name = "cgroup2",

2272 .init_fs_context = cgroup_init_fs_context,

2273 .parameters = cgroup2_fs_parameters,

2274 .kill_sb = cgroup_kill_sb,

2275 .fs_flags = FS_USERNS_MOUNT,

2276 };

2277

2278 #ifdef CONFIG_CPUSETS

2279 static const struct fs_context_operations cpuset_fs_context_ops = {

2280 .get_tree = cgroup1_get_tree,

2281 .free = cgroup_fs_context_free,

2282 };

2283

2284 /*

2285 * This is ugly, but preserves the userspace API for existing cpuset

2286 * users. If someone tries to mount the "cpuset" filesystem, we

2287 * silently switch it to mount "cgroup" instead

2288 */

2289 static int cpuset_init_fs_context(struct fs_context *fc)

2290 {

2291 char *agent = kstrdup("/sbin/cpuset_release_agent", GFP_USER);

2292 struct cgroup_fs_context *ctx;

2293 int err;

2294

2295 err = cgroup_init_fs_context(fc);

2296 if (err) {

2297 kfree(agent);

2298 return err;

2299 }

2300

2301 fc->ops = &cpuset_fs_context_ops;

2302

2303 ctx = cgroup_fc2context(fc);

2304 ctx->subsys_mask = 1 << cpuset_cgrp_id;

2305 ctx->flags |= CGRP_ROOT_NOPREFIX;

2306 ctx->release_agent = agent;

2307

2308 get_filesystem(&cgroup_fs_type);

2309 put_filesystem(fc->fs_type);

2310 fc->fs_type = &cgroup_fs_type;

2311

2312 return 0;

2313 }

2314

2315 static struct file_system_type cpuset_fs_type = {

2316 .name = "cpuset",

2317 .init_fs_context = cpuset_init_fs_context,

2318 .fs_flags = FS_USERNS_MOUNT,

2319 };

2320 #endif

2321

2322 int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,

2323 struct cgroup_namespace *ns)

2324 {

2325 struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root);

2326

2327 return kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen);

2328 }

2329

2330 int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,

2331 struct cgroup_namespace *ns)

2332 {

2333 int ret;

2334

2335 mutex_lock(&cgroup_mutex);

2336 spin_lock_irq(&css_set_lock);

2337

2338 ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns);

2339

2340 spin_unlock_irq(&css_set_lock);

2341 mutex_unlock(&cgroup_mutex);

2342

2343 return ret;

2344 }

2345 EXPORT_SYMBOL_GPL(cgroup_path_ns);

2346

2347 /**

2348 * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy

2349 * @task: target task

2350 * @buf: the buffer to write the path into

2351 * @buflen: the length of the buffer

2352 *

2353 * Determine @task's cgroup on the first (the one with the lowest non-zero

2354 * hierarchy_id) cgroup hierarchy and copy its path into @buf. This

2355 * function grabs cgroup_mutex and shouldn't be used inside locks used by

2356 * cgroup controller callbacks.

2357 *

2358 * Return value is the same as kernfs_path().

2359 */

2360 int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)

2361 {

2362 struct cgroup_root *root;

2363 struct cgroup *cgrp;

2364 int hierarchy_id = 1;

2365 int ret;

2366

2367 mutex_lock(&cgroup_mutex);

2368 spin_lock_irq(&css_set_lock);

2369

2370 root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);

2371

2372 if (root) {

2373 cgrp = task_cgroup_from_root(task, root);

2374 ret = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns);

2375 } else {

2376 /* if no hierarchy exists, everyone is in "/" */

2377 ret = strscpy(buf, "/", buflen);

2378 }

2379

2380 spin_unlock_irq(&css_set_lock);

2381 mutex_unlock(&cgroup_mutex);

2382 return ret;

2383 }

2384 EXPORT_SYMBOL_GPL(task_cgroup_path);

2385

2386 /**

2387 * cgroup_attach_lock - Lock for ->attach()

2388 * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem

2389 *

2390 * cgroup migration sometimes needs to stabilize threadgroups against forks and

2391 * exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach()

2392 * implementations (e.g. cpuset), also need to disable CPU hotplug.

2393 * Unfortunately, letting ->attach() operations acquire cpus_read_lock() can

2394 * lead to deadlocks.

2395 *

2396 * Bringing up a CPU may involve creating and destroying tasks which requires

2397 * read-locking threadgroup_rwsem, so threadgroup_rwsem nests inside

2398 * cpus_read_lock(). If we call an ->attach() which acquires the cpus lock while

2399 * write-locking threadgroup_rwsem, the locking order is reversed and we end up

2400 * waiting for an on-going CPU hotplug operation which in turn is waiting for

2401 * the threadgroup_rwsem to be released to create new tasks. For more details:

2402 *

2403 * http://lkml.kernel.org/r/20220711174629.uehfmqegcwn2lqzu@wubuntu

2404 *

2405 * Resolve the situation by always acquiring cpus_read_lock() before optionally

2406 * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that

2407 * CPU hotplug is disabled on entry.

2408 */

2409 void cgroup_attach_lock(bool lock_threadgroup)

2410 {

2411 cpus_read_lock();

2412 if (lock_threadgroup)

2413 percpu_down_write(&cgroup_threadgroup_rwsem);

2414 }

2415

2416 /**

2417 * cgroup_attach_unlock - Undo cgroup_attach_lock()

2418 * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem

2419 */

2420 void cgroup_attach_unlock(bool lock_threadgroup)

2421 {

2422 if (lock_threadgroup)

2423 percpu_up_write(&cgroup_threadgroup_rwsem);

2424 cpus_read_unlock();

2425 }

2426

2427 /**

2428 * cgroup_migrate_add_task - add a migration target task to a migration context

2429 * @task: target task

2430 * @mgctx: target migration context

2431 *

2432 * Add @task, which is a migration target, to @mgctx->tset. This function

2433 * becomes noop if @task doesn't need to be migrated. @task's css_set

2434 * should have been added as a migration source and @task->cg_list will be

2435 * moved from the css_set's tasks list to mg_tasks one.

2436 */

2437 static void cgroup_migrate_add_task(struct task_struct *task,

2438 struct cgroup_mgctx *mgctx)

2439 {

2440 struct css_set *cset;

2441

2442 lockdep_assert_held(&css_set_lock);

2443

2444 /* @task either already exited or can't exit until the end */

2445 if (task->flags & PF_EXITING)

2446 return;

2447

2448 /* cgroup_threadgroup_rwsem protects racing against forks */

2449 WARN_ON_ONCE(list_empty(&task->cg_list));

2450

2451 cset = task_css_set(task);

2452 if (!cset->mg_src_cgrp)

2453 return;

2454

2455 mgctx->tset.nr_tasks++;

2456

2457 list_move_tail(&task->cg_list, &cset->mg_tasks);

2458 if (list_empty(&cset->mg_node))

2459 list_add_tail(&cset->mg_node,

2460 &mgctx->tset.src_csets);

2461 if (list_empty(&cset->mg_dst_cset->mg_node))

2462 list_add_tail(&cset->mg_dst_cset->mg_node,

2463 &mgctx->tset.dst_csets);

2464 }

2465

2466 /**

2467 * cgroup_taskset_first - reset taskset and return the first task

2468 * @tset: taskset of interest

2469 * @dst_cssp: output variable for the destination css

2470 *

2471 * @tset iteration is initialized and the first task is returned.

2472 */

2473 struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset,

2474 struct cgroup_subsys_state **dst_cssp)

2475 {

2476 tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);

2477 tset->cur_task = NULL;

2478

2479 return cgroup_taskset_next(tset, dst_cssp);

2480 }

2481

2482 /**

2483 * cgroup_taskset_next - iterate to the next task in taskset

2484 * @tset: taskset of interest

2485 * @dst_cssp: output variable for the destination css

2486 *

2487 * Return the next task in @tset. Iteration must have been initialized

2488 * with cgroup_taskset_first().

2489 */

2490 struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,

2491 struct cgroup_subsys_state **dst_cssp)

2492 {

2493 struct css_set *cset = tset->cur_cset;

2494 struct task_struct *task = tset->cur_task;

2495

2496 while (CGROUP_HAS_SUBSYS_CONFIG && &cset->mg_node != tset->csets) {

2497 if (!task)

2498 task = list_first_entry(&cset->mg_tasks,

2499 struct task_struct, cg_list);

2500 else

2501 task = list_next_entry(task, cg_list);

2502

2503 if (&task->cg_list != &cset->mg_tasks) {

2504 tset->cur_cset = cset;

2505 tset->cur_task = task;

2506

2507 /*

2508 * This function may be called both before and

2509 * after cgroup_taskset_migrate(). The two cases

2510 * can be distinguished by looking at whether @cset

2511 * has its ->mg_dst_cset set.

2512 */

2513 if (cset->mg_dst_cset)

2514 *dst_cssp = cset->mg_dst_cset->subsys[tset->ssid];

2515 else

2516 *dst_cssp = cset->subsys[tset->ssid];

2517

2518 return task;

2519 }

2520

2521 cset = list_next_entry(cset, mg_node);

2522 task = NULL;

2523 }

2524

2525 return NULL;

2526 }

2527

2528 /**

2529 * cgroup_migrate_execute - migrate a taskset

2530 * @mgctx: migration context

2531 *

2532 * Migrate tasks in @mgctx as setup by migration preparation functions.

2533 * This function fails iff one of the ->can_attach callbacks fails and

2534 * guarantees that either all or none of the tasks in @mgctx are migrated.

2535 * @mgctx is consumed regardless of success.

2536 */

2537 static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)

2538 {

2539 struct cgroup_taskset *tset = &mgctx->tset;

2540 struct cgroup_subsys *ss;

2541 struct task_struct *task, *tmp_task;

2542 struct css_set *cset, *tmp_cset;

2543 int ssid, failed_ssid, ret;

2544

2545 /* check that we can legitimately attach to the cgroup */

2546 if (tset->nr_tasks) {

2547 do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {

2548 if (ss->can_attach) {

2549 tset->ssid = ssid;

2550 ret = ss->can_attach(tset);

2551 if (ret) {

2552 failed_ssid = ssid;

2553 goto out_cancel_attach;

2554 }

2555 }

2556 } while_each_subsys_mask();

2557 }

2558

2559 /*

2560 * Now that we're guaranteed success, proceed to move all tasks to

2561 * the new cgroup. There are no failure cases after here, so this

2562 * is the commit point.

2563 */

2564 spin_lock_irq(&css_set_lock);

2565 list_for_each_entry(cset, &tset->src_csets, mg_node) {

2566 list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) {

2567 struct css_set *from_cset = task_css_set(task);

2568 struct css_set *to_cset = cset->mg_dst_cset;

2569

2570 get_css_set(to_cset);

2571 to_cset->nr_tasks++;

2572 css_set_move_task(task, from_cset, to_cset, true);

2573 from_cset->nr_tasks--;

2574 /*

2575 * If the source or destination cgroup is frozen,

2576 * the task might require to change its state.

2577 */

2578 cgroup_freezer_migrate_task(task, from_cset->dfl_cgrp,

2579 to_cset->dfl_cgrp);

2580 put_css_set_locked(from_cset);

2581

2582 }

2583 }

2584 spin_unlock_irq(&css_set_lock);

2585

2586 /*

2587 * Migration is committed, all target tasks are now on dst_csets.

2588 * Nothing is sensitive to fork() after this point. Notify

2589 * controllers that migration is complete.

2590 */

2591 tset->csets = &tset->dst_csets;

2592

2593 if (tset->nr_tasks) {

2594 do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {

2595 if (ss->attach) {

2596 tset->ssid = ssid;

2597 ss->attach(tset);

2598 }

2599 } while_each_subsys_mask();

2600 }

2601

2602 ret = 0;

2603 goto out_release_tset;

2604

2605 out_cancel_attach:

2606 if (tset->nr_tasks) {

2607 do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {

2608 if (ssid == failed_ssid)

2609 break;

2610 if (ss->cancel_attach) {

2611 tset->ssid = ssid;

2612 ss->cancel_attach(tset);

2613 }

2614 } while_each_subsys_mask();

2615 }

2616 out_release_tset:

2617 spin_lock_irq(&css_set_lock);

2618 list_splice_init(&tset->dst_csets, &tset->src_csets);

2619 list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) {

2620 list_splice_tail_init(&cset->mg_tasks, &cset->tasks);

2621 list_del_init(&cset->mg_node);

2622 }

2623 spin_unlock_irq(&css_set_lock);

2624

2625 /*

2626 * Re-initialize the cgroup_taskset structure in case it is reused

2627 * again in another cgroup_migrate_add_task()/cgroup_migrate_execute()

2628 * iteration.

2629 */

2630 tset->nr_tasks = 0;

2631 tset->csets = &tset->src_csets;

2632 return ret;

2633 }

2634

2635 /**

2636 * cgroup_migrate_vet_dst - verify whether a cgroup can be migration destination

2637 * @dst_cgrp: destination cgroup to test

2638 *

2639 * On the default hierarchy, except for the mixable, (possible) thread root

2640 * and threaded cgroups, subtree_control must be zero for migration

2641 * destination cgroups with tasks so that child cgroups don't compete

2642 * against tasks.

2643 */

2644 int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp)

2645 {

2646 /* v1 doesn't have any restriction */

2647 if (!cgroup_on_dfl(dst_cgrp))

2648 return 0;

2649

2650 /* verify @dst_cgrp can host resources */

2651 if (!cgroup_is_valid_domain(dst_cgrp->dom_cgrp))

2652 return -EOPNOTSUPP;

2653

2654 /*

2655 * If @dst_cgrp is already or can become a thread root or is

2656 * threaded, it doesn't matter.

2657 */

2658 if (cgroup_can_be_thread_root(dst_cgrp) || cgroup_is_threaded(dst_cgrp))

2659 return 0;

2660

2661 /* apply no-internal-process constraint */

2662 if (dst_cgrp->subtree_control)

2663 return -EBUSY;

2664

2665 return 0;

2666 }

2667

2668 /**

2669 * cgroup_migrate_finish - cleanup after attach

2670 * @mgctx: migration context

2671 *

2672 * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst(). See

2673 * those functions for details.

2674 */

2675 void cgroup_migrate_finish(struct cgroup_mgctx *mgctx)

2676 {

2677 struct css_set *cset, *tmp_cset;

2678

2679 lockdep_assert_held(&cgroup_mutex);

2680

2681 spin_lock_irq(&css_set_lock);

2682

2683 list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_src_csets,

2684 mg_src_preload_node) {

2685 cset->mg_src_cgrp = NULL;

2686 cset->mg_dst_cgrp = NULL;

2687 cset->mg_dst_cset = NULL;

2688 list_del_init(&cset->mg_src_preload_node);

2689 put_css_set_locked(cset);

2690 }

2691

2692 list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_dst_csets,

2693 mg_dst_preload_node) {

2694 cset->mg_src_cgrp = NULL;

2695 cset->mg_dst_cgrp = NULL;

2696 cset->mg_dst_cset = NULL;

2697 list_del_init(&cset->mg_dst_preload_node);

2698 put_css_set_locked(cset);

2699 }

2700

2701 spin_unlock_irq(&css_set_lock);

2702 }

2703

2704 /**

2705 * cgroup_migrate_add_src - add a migration source css_set

2706 * @src_cset: the source css_set to add

2707 * @dst_cgrp: the destination cgroup

2708 * @mgctx: migration context

2709 *

2710 * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp. Pin

2711 * @src_cset and add it to @mgctx->src_csets, which should later be cleaned

2712 * up by cgroup_migrate_finish().

2713 *

2714 * This function may be called without holding cgroup_threadgroup_rwsem

2715 * even if the target is a process. Threads may be created and destroyed

2716 * but as long as cgroup_mutex is not dropped, no new css_set can be put

2717 * into play and the preloaded css_sets are guaranteed to cover all

2718 * migrations.

2719 */

2720 void cgroup_migrate_add_src(struct css_set *src_cset,

2721 struct cgroup *dst_cgrp,

2722 struct cgroup_mgctx *mgctx)

2723 {

2724 struct cgroup *src_cgrp;

2725

2726 lockdep_assert_held(&cgroup_mutex);

2727 lockdep_assert_held(&css_set_lock);

2728

2729 /*

2730 * If ->dead, @src_set is associated with one or more dead cgroups

2731 * and doesn't contain any migratable tasks. Ignore it early so

2732 * that the rest of migration path doesn't get confused by it.

2733 */

2734 if (src_cset->dead)

2735 return;

2736

2737 if (!list_empty(&src_cset->mg_src_preload_node))

2738 return;

2739

2740 src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);

2741

2742 WARN_ON(src_cset->mg_src_cgrp);

2743 WARN_ON(src_cset->mg_dst_cgrp);

2744 WARN_ON(!list_empty(&src_cset->mg_tasks));

2745 WARN_ON(!list_empty(&src_cset->mg_node));

2746

2747 src_cset->mg_src_cgrp = src_cgrp;

2748 src_cset->mg_dst_cgrp = dst_cgrp;

2749 get_css_set(src_cset);

2750 list_add_tail(&src_cset->mg_src_preload_node, &mgctx->preloaded_src_csets);

2751 }

2752

2753 /**

2754 * cgroup_migrate_prepare_dst - prepare destination css_sets for migration

2755 * @mgctx: migration context

2756 *

2757 * Tasks are about to be moved and all the source css_sets have been

2758 * preloaded to @mgctx->preloaded_src_csets. This function looks up and

2759 * pins all destination css_sets, links each to its source, and append them

2760 * to @mgctx->preloaded_dst_csets.

2761 *

2762 * This function must be called after cgroup_migrate_add_src() has been

2763 * called on each migration source css_set. After migration is performed

2764 * using cgroup_migrate(), cgroup_migrate_finish() must be called on

2765 * @mgctx.

2766 */

2767 int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)

2768 {

2769 struct css_set *src_cset, *tmp_cset;

2770

2771 lockdep_assert_held(&cgroup_mutex);

2772

2773 /* look up the dst cset for each src cset and link it to src */

2774 list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets,

2775 mg_src_preload_node) {

2776 struct css_set *dst_cset;

2777 struct cgroup_subsys *ss;

2778 int ssid;

2779

2780 dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp);

2781 if (!dst_cset)

2782 return -ENOMEM;

2783

2784 WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);

2785

2786 /*

2787 * If src cset equals dst, it's noop. Drop the src.

2788 * cgroup_migrate() will skip the cset too. Note that we

2789 * can't handle src == dst as some nodes are used by both.

2790 */

2791 if (src_cset == dst_cset) {

2792 src_cset->mg_src_cgrp = NULL;

2793 src_cset->mg_dst_cgrp = NULL;

2794 list_del_init(&src_cset->mg_src_preload_node);

2795 put_css_set(src_cset);

2796 put_css_set(dst_cset);

2797 continue;

2798 }

2799

2800 src_cset->mg_dst_cset = dst_cset;

2801

2802 if (list_empty(&dst_cset->mg_dst_preload_node))

2803 list_add_tail(&dst_cset->mg_dst_preload_node,

2804 &mgctx->preloaded_dst_csets);

2805 else

2806 put_css_set(dst_cset);

2807

2808 for_each_subsys(ss, ssid)

2809 if (src_cset->subsys[ssid] != dst_cset->subsys[ssid])

2810 mgctx->ss_mask |= 1 << ssid;

2811 }

2812

2813 return 0;

2814 }

2815

2816 /**

2817 * cgroup_migrate - migrate a process or task to a cgroup

2818 * @leader: the leader of the process or the task to migrate

2819 * @threadgroup: whether @leader points to the whole process or a single task

2820 * @mgctx: migration context

2821 *

2822 * Migrate a process or task denoted by @leader. If migrating a process,

2823 * the caller must be holding cgroup_threadgroup_rwsem. The caller is also

2824 * responsible for invoking cgroup_migrate_add_src() and

2825 * cgroup_migrate_prepare_dst() on the targets before invoking this

2826 * function and following up with cgroup_migrate_finish().

2827 *

2828 * As long as a controller's ->can_attach() doesn't fail, this function is

2829 * guaranteed to succeed. This means that, excluding ->can_attach()

2830 * failure, when migrating multiple targets, the success or failure can be

2831 * decided for all targets by invoking group_migrate_prepare_dst() before

2832 * actually starting migrating.

2833 */

2834 int cgroup_migrate(struct task_struct *leader, bool threadgroup,

2835 struct cgroup_mgctx *mgctx)

2836 {

2837 struct task_struct *task;

2838

2839 /*

2840 * Prevent freeing of tasks while we take a snapshot. Tasks that are

2841 * already PF_EXITING could be freed from underneath us unless we

2842 * take an rcu_read_lock.

2843 */

2844 spin_lock_irq(&css_set_lock);

2845 rcu_read_lock();

2846 task = leader;

2847 do {

2848 cgroup_migrate_add_task(task, mgctx);

2849 if (!threadgroup)

2850 break;

2851 } while_each_thread(leader, task);

2852 rcu_read_unlock();

2853 spin_unlock_irq(&css_set_lock);

2854

2855 return cgroup_migrate_execute(mgctx);

2856 }

2857

2858 /**

2859 * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup

2860 * @dst_cgrp: the cgroup to attach to

2861 * @leader: the task or the leader of the threadgroup to be attached

2862 * @threadgroup: attach the whole threadgroup?

2863 *

2864 * Call holding cgroup_mutex and cgroup_threadgroup_rwsem.

2865 */

2866 int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,

2867 bool threadgroup)

2868 {

2869 DEFINE_CGROUP_MGCTX(mgctx);

2870 struct task_struct *task;

2871 int ret = 0;

2872

2873 /* look up all src csets */

2874 spin_lock_irq(&css_set_lock);

2875 rcu_read_lock();

2876 task = leader;

2877 do {

2878 cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx);

2879 if (!threadgroup)

2880 break;

2881 } while_each_thread(leader, task);

2882 rcu_read_unlock();

2883 spin_unlock_irq(&css_set_lock);

2884

2885 /* prepare dst csets and commit */

2886 ret = cgroup_migrate_prepare_dst(&mgctx);

2887 if (!ret)

2888 ret = cgroup_migrate(leader, threadgroup, &mgctx);

2889

2890 cgroup_migrate_finish(&mgctx);

2891

2892 if (!ret)

2893 TRACE_CGROUP_PATH(attach_task, dst_cgrp, leader, threadgroup);

2894

2895 return ret;

2896 }

2897

2898 struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,

2899 bool *threadgroup_locked)

2900 {

2901 struct task_struct *tsk;

2902 pid_t pid;

2903

2904 if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)

2905 return ERR_PTR(-EINVAL);

2906

2907 /*

2908 * If we migrate a single thread, we don't care about threadgroup

2909 * stability. If the thread is `current`, it won't exit(2) under our

2910 * hands or change PID through exec(2). We exclude

2911 * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write

2912 * callers by cgroup_mutex.

2913 * Therefore, we can skip the global lock.

2914 */

2915 lockdep_assert_held(&cgroup_mutex);

2916 *threadgroup_locked = pid || threadgroup;

2917 cgroup_attach_lock(*threadgroup_locked);

2918

2919 rcu_read_lock();

2920 if (pid) {

2921 tsk = find_task_by_vpid(pid);

2922 if (!tsk) {

2923 tsk = ERR_PTR(-ESRCH);

2924 goto out_unlock_threadgroup;

2925 }

2926 } else {

2927 tsk = current;

2928 }

2929

2930 if (threadgroup)

2931 tsk = tsk->group_leader;

2932

2933 /*

2934 * kthreads may acquire PF_NO_SETAFFINITY during initialization.

2935 * If userland migrates such a kthread to a non-root cgroup, it can

2936 * become trapped in a cpuset, or RT kthread may be born in a

2937 * cgroup with no rt_runtime allocated. Just say no.

2938 */

2939 if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {

2940 tsk = ERR_PTR(-EINVAL);

2941 goto out_unlock_threadgroup;

2942 }

2943

2944 get_task_struct(tsk);

2945 goto out_unlock_rcu;

2946

2947 out_unlock_threadgroup:

2948 cgroup_attach_unlock(*threadgroup_locked);

2949 *threadgroup_locked = false;

2950 out_unlock_rcu:

2951 rcu_read_unlock();

2952 return tsk;

2953 }

2954

2955 void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked)

2956 {

2957 struct cgroup_subsys *ss;

2958 int ssid;

2959

2960 /* release reference from cgroup_procs_write_start() */

2961 put_task_struct(task);

2962

2963 cgroup_attach_unlock(threadgroup_locked);

2964

2965 for_each_subsys(ss, ssid)

2966 if (ss->post_attach)

2967 ss->post_attach();

2968 }

2969

2970 static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)

2971 {

2972 struct cgroup_subsys *ss;

2973 bool printed = false;

2974 int ssid;

2975

2976 do_each_subsys_mask(ss, ssid, ss_mask) {

2977 if (printed)

2978 seq_putc(seq, ' ');

2979 seq_puts(seq, ss->name);

2980 printed = true;

2981 } while_each_subsys_mask();

2982 if (printed)

2983 seq_putc(seq, '\n');

2984 }

2985

2986 /* show controllers which are enabled from the parent */

2987 static int cgroup_controllers_show(struct seq_file *seq, void *v)

2988 {

2989 struct cgroup *cgrp = seq_css(seq)->cgroup;

2990

2991 cgroup_print_ss_mask(seq, cgroup_control(cgrp));

2992 return 0;

2993 }

2994

2995 /* show controllers which are enabled for a given cgroup's children */

2996 static int cgroup_subtree_control_show(struct seq_file *seq, void *v)

2997 {

2998 struct cgroup *cgrp = seq_css(seq)->cgroup;

2999

3000 cgroup_print_ss_mask(seq, cgrp->subtree_control);

3001 return 0;

3002 }

3003

3004 /**

3005 * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy

3006 * @cgrp: root of the subtree to update csses for

3007 *

3008 * @cgrp's control masks have changed and its subtree's css associations

3009 * need to be updated accordingly. This function looks up all css_sets

3010 * which are attached to the subtree, creates the matching updated css_sets

3011 * and migrates the tasks to the new ones.

3012 */

3013 static int cgroup_update_dfl_csses(struct cgroup *cgrp)

3014 {

3015 DEFINE_CGROUP_MGCTX(mgctx);

3016 struct cgroup_subsys_state *d_css;

3017 struct cgroup *dsct;

3018 struct css_set *src_cset;

3019 bool has_tasks;

3020 int ret;

3021

3022 lockdep_assert_held(&cgroup_mutex);

3023

3024 /* look up all csses currently attached to @cgrp's subtree */

3025 spin_lock_irq(&css_set_lock);

3026 cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {

3027 struct cgrp_cset_link *link;

3028

3029 /*

3030 * As cgroup_update_dfl_csses() is only called by

3031 * cgroup_apply_control(). The csses associated with the

3032 * given cgrp will not be affected by changes made to

3033 * its subtree_control file. We can skip them.

3034 */

3035 if (dsct == cgrp)

3036 continue;

3037

3038 list_for_each_entry(link, &dsct->cset_links, cset_link)

3039 cgroup_migrate_add_src(link->cset, dsct, &mgctx);

3040 }

3041 spin_unlock_irq(&css_set_lock);

3042

3043 /*

3044 * We need to write-lock threadgroup_rwsem while migrating tasks.

3045 * However, if there are no source csets for @cgrp, changing its

3046 * controllers isn't gonna produce any task migrations and the

3047 * write-locking can be skipped safely.

3048 */

3049 has_tasks = !list_empty(&mgctx.preloaded_src_csets);

3050 cgroup_attach_lock(has_tasks);

3051

3052 /* NULL dst indicates self on default hierarchy */

3053 ret = cgroup_migrate_prepare_dst(&mgctx);

3054 if (ret)

3055 goto out_finish;

3056

3057 spin_lock_irq(&css_set_lock);

3058 list_for_each_entry(src_cset, &mgctx.preloaded_src_csets,

3059 mg_src_preload_node) {

3060 struct task_struct *task, *ntask;

3061

3062 /* all tasks in src_csets need to be migrated */

3063 list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)

3064 cgroup_migrate_add_task(task, &mgctx);

3065 }

3066 spin_unlock_irq(&css_set_lock);

3067

3068 ret = cgroup_migrate_execute(&mgctx);

3069 out_finish:

3070 cgroup_migrate_finish(&mgctx);

3071 cgroup_attach_unlock(has_tasks);

3072 return ret;

3073 }

3074

3075 /**

3076 * cgroup_lock_and_drain_offline - lock cgroup_mutex and drain offlined csses

3077 * @cgrp: root of the target subtree

3078 *

3079 * Because css offlining is asynchronous, userland may try to re-enable a

3080 * controller while the previous css is still around. This function grabs

3081 * cgroup_mutex and drains the previous css instances of @cgrp's subtree.

3082 */

3083 void cgroup_lock_and_drain_offline(struct cgroup *cgrp)

3084 __acquires(&cgroup_mutex)

3085 {

3086 struct cgroup *dsct;

3087 struct cgroup_subsys_state *d_css;

3088 struct cgroup_subsys *ss;

3089 int ssid;

3090

3091 restart:

3092 mutex_lock(&cgroup_mutex);

3093

3094 cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {

3095 for_each_subsys(ss, ssid) {

3096 struct cgroup_subsys_state *css = cgroup_css(dsct, ss);

3097 DEFINE_WAIT(wait);

3098

3099 if (!css || !percpu_ref_is_dying(&css->refcnt))

3100 continue;

3101

3102 cgroup_get_live(dsct);

3103 prepare_to_wait(&dsct->offline_waitq, &wait,

3104 TASK_UNINTERRUPTIBLE);

3105

3106 mutex_unlock(&cgroup_mutex);

3107 schedule();

3108 finish_wait(&dsct->offline_waitq, &wait);

3109

3110 cgroup_put(dsct);

3111 goto restart;

3112 }

3113 }

3114 }

3115

3116 /**

3117 * cgroup_save_control - save control masks and dom_cgrp of a subtree

3118 * @cgrp: root of the target subtree

3119 *

3120 * Save ->subtree_control, ->subtree_ss_mask and ->dom_cgrp to the

3121 * respective old_ prefixed fields for @cgrp's subtree including @cgrp

3122 * itself.

3123 */

3124 static void cgroup_save_control(struct cgroup *cgrp)

3125 {

3126 struct cgroup *dsct;

3127 struct cgroup_subsys_state *d_css;

3128

3129 cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {

3130 dsct->old_subtree_control = dsct->subtree_control;

3131 dsct->old_subtree_ss_mask = dsct->subtree_ss_mask;

3132 dsct->old_dom_cgrp = dsct->dom_cgrp;

3133 }

3134 }

3135

3136 /**

3137 * cgroup_propagate_control - refresh control masks of a subtree

3138 * @cgrp: root of the target subtree

3139 *

3140 * For @cgrp and its subtree, ensure ->subtree_ss_mask matches

3141 * ->subtree_control and propagate controller availability through the

3142 * subtree so that descendants don't have unavailable controllers enabled.

3143 */

3144 static void cgroup_propagate_control(struct cgroup *cgrp)

3145 {

3146 struct cgroup *dsct;

3147 struct cgroup_subsys_state *d_css;

3148

3149 cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {

3150 dsct->subtree_control &= cgroup_control(dsct);

3151 dsct->subtree_ss_mask =

3152 cgroup_calc_subtree_ss_mask(dsct->subtree_control,

3153 cgroup_ss_mask(dsct));

3154 }

3155 }

3156

3157 /**

3158 * cgroup_restore_control - restore control masks and dom_cgrp of a subtree

3159 * @cgrp: root of the target subtree

3160 *

3161 * Restore ->subtree_control, ->subtree_ss_mask and ->dom_cgrp from the

3162 * respective old_ prefixed fields for @cgrp's subtree including @cgrp

3163 * itself.

3164 */

3165 static void cgroup_restore_control(struct cgroup *cgrp)

3166 {

3167 struct cgroup *dsct;

3168 struct cgroup_subsys_state *d_css;

3169

3170 cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {

3171 dsct->subtree_control = dsct->old_subtree_control;

3172 dsct->subtree_ss_mask = dsct->old_subtree_ss_mask;

3173 dsct->dom_cgrp = dsct->old_dom_cgrp;

3174 }

3175 }

3176

3177 static bool css_visible(struct cgroup_subsys_state *css)

3178 {

3179 struct cgroup_subsys *ss = css->ss;

3180 struct cgroup *cgrp = css->cgroup;

3181

3182 if (cgroup_control(cgrp) & (1 << ss->id))

3183 return true;

3184 if (!(cgroup_ss_mask(cgrp) & (1 << ss->id)))

3185 return false;

3186 return cgroup_on_dfl(cgrp) && ss->implicit_on_dfl;

3187 }

3188

3189 /**

3190 * cgroup_apply_control_enable - enable or show csses according to control

3191 * @cgrp: root of the target subtree

3192 *

3193 * Walk @cgrp's subtree and create new csses or make the existing ones

3194 * visible. A css is created invisible if it's being implicitly enabled

3195 * through dependency. An invisible css is made visible when the userland

3196 * explicitly enables it.

3197 *

3198 * Returns 0 on success, -errno on failure. On failure, csses which have

3199 * been processed already aren't cleaned up. The caller is responsible for

3200 * cleaning up with cgroup_apply_control_disable().

3201 */

3202 static int cgroup_apply_control_enable(struct cgroup *cgrp)

3203 {

3204 struct cgroup *dsct;

3205 struct cgroup_subsys_state *d_css;

3206 struct cgroup_subsys *ss;

3207 int ssid, ret;

3208

3209 cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {

3210 for_each_subsys(ss, ssid) {

3211 struct cgroup_subsys_state *css = cgroup_css(dsct, ss);

3212

3213 if (!(cgroup_ss_mask(dsct) & (1 << ss->id)))

3214 continue;

3215

3216 if (!css) {

3217 css = css_create(dsct, ss);

3218 if (IS_ERR(css))

3219 return PTR_ERR(css);

3220 }

3221

3222 WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt));

3223

3224 if (css_visible(css)) {

3225 ret = css_populate_dir(css);

3226 if (ret)

3227 return ret;

3228 }

3229 }

3230 }

3231

3232 return 0;

3233 }

3234

3235 /**

3236 * cgroup_apply_control_disable - kill or hide csses according to control

3237 * @cgrp: root of the target subtree

3238 *

3239 * Walk @cgrp's subtree and kill and hide csses so that they match

3240 * cgroup_ss_mask() and cgroup_visible_mask().

3241 *

3242 * A css is hidden when the userland requests it to be disabled while other

3243 * subsystems are still depending on it. The css must not actively control

3244 * resources and be in the vanilla state if it's made visible again later.

3245 * Controllers which may be depended upon should provide ->css_reset() for

3246 * this purpose.

3247 */

3248 static void cgroup_apply_control_disable(struct cgroup *cgrp)

3249 {

3250 struct cgroup *dsct;

3251 struct cgroup_subsys_state *d_css;

3252 struct cgroup_subsys *ss;

3253 int ssid;

3254

3255 cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {

3256 for_each_subsys(ss, ssid) {

3257 struct cgroup_subsys_state *css = cgroup_css(dsct, ss);

3258

3259 if (!css)

3260 continue;

3261

3262 WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt));

3263

3264 if (css->parent &&

3265 !(cgroup_ss_mask(dsct) & (1 << ss->id))) {

3266 kill_css(css);

3267 } else if (!css_visible(css)) {

3268 css_clear_dir(css);

3269 if (ss->css_reset)

3270 ss->css_reset(css);

3271 }

3272 }

3273 }

3274 }

3275

/**
 * cgroup_apply_control - apply control mask updates to the subtree
 * @cgrp: root of the target subtree
 *
 * Subsystems can be enabled and disabled in a subtree with these steps:
 *
 * 1. Call cgroup_save_control() to stash the current state.
 * 2. Update ->subtree_control masks in the subtree as desired.
 * 3. Call cgroup_apply_control() to apply the changes.
 * 4. Optionally perform other related operations.
 * 5. Call cgroup_finalize_control() to finish up.
 *
 * This is step 3: propagate the mask changes through @cgrp's subtree,
 * update csses accordingly and perform process migrations.
 *
 * Returns 0 on success, -errno on failure.
 */
static int cgroup_apply_control(struct cgroup *cgrp)
{
	int err;

	cgroup_propagate_control(cgrp);

	err = cgroup_apply_control_enable(cgrp);

	/*
	 * Once enabling succeeded, cgroup_e_css_by_mask() results reflect
	 * the new csses, so cgroup_update_dfl_csses() properly updates the
	 * css associations of all tasks in the subtree.
	 */
	if (!err)
		err = cgroup_update_dfl_csses(cgrp);

	return err;
}

3310

3311 /**

3312 * cgroup_finalize_control - finalize control mask update

3313 * @cgrp: root of the target subtree

3314 * @ret: the result of the update

3315 *

3316 * Finalize control mask update. See cgroup_apply_control() for more info.

3317 */

3318 static void cgroup_finalize_control(struct cgroup *cgrp, int ret)

3319 {

3320 if (ret) {

3321 cgroup_restore_control(cgrp);

3322 cgroup_propagate_control(cgrp);

3323 }

3324

3325 cgroup_apply_control_disable(cgrp);

3326 }

3327

3328 static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u16 enable)

3329 {

3330 u16 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask;

3331

3332 /* if nothing is getting enabled, nothing to worry about */

3333 if (!enable)

3334 return 0;

3335

3336 /* can @cgrp host any resources? */

3337 if (!cgroup_is_valid_domain(cgrp->dom_cgrp))

3338 return -EOPNOTSUPP;

3339

3340 /* mixables don't care */

3341 if (cgroup_is_mixable(cgrp))

3342 return 0;

3343

3344 if (domain_enable) {

3345 /* can't enable domain controllers inside a thread subtree */

3346 if (cgroup_is_thread_root(cgrp) || cgroup_is_threaded(cgrp))

3347 return -EOPNOTSUPP;

3348 } else {

3349 /*

3350 * Threaded controllers can handle internal competitions

3351 * and are always allowed inside a (prospective) thread

3352 * subtree.

3353 */

3354 if (cgroup_can_be_thread_root(cgrp) || cgroup_is_threaded(cgrp))

3355 return 0;

3356 }

3357

3358 /*

3359 * Controllers can't be enabled for a cgroup with tasks to avoid

3360 * child cgroups competing against tasks.

3361 */

3362 if (cgroup_has_tasks(cgrp))

3363 return -EBUSY;

3364

3365 return 0;

3366 }

3367

3368 /* change the enabled child controllers for a cgroup in the default hierarchy */

3369 static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,

3370 char *buf, size_t nbytes,

3371 loff_t off)

3372 {

3373 u16 enable = 0, disable = 0;

3374 struct cgroup *cgrp, *child;

3375 struct cgroup_subsys *ss;

3376 char *tok;

3377 int ssid, ret;

3378

3379 /*

3380 * Parse input - space separated list of subsystem names prefixed

3381 * with either + or -.

3382 */

3383 buf = strstrip(buf);

3384 while ((tok = strsep(&buf, " "))) {

3385 if (tok[0] == '\0')

3386 continue;

3387 do_each_subsys_mask(ss, ssid, ~cgrp_dfl_inhibit_ss_mask) {

3388 if (!cgroup_ssid_enabled(ssid) ||

3389 strcmp(tok + 1, ss->name))

3390 continue;

3391

3392 if (*tok == '+') {

3393 enable |= 1 << ssid;

3394 disable &= ~(1 << ssid);

3395 } else if (*tok == '-') {

3396 disable |= 1 << ssid;

3397 enable &= ~(1 << ssid);

3398 } else {

3399 return -EINVAL;

3400 }

3401 break;

3402 } while_each_subsys_mask();

3403 if (ssid == CGROUP_SUBSYS_COUNT)

3404 return -EINVAL;

3405 }

3406

3407 cgrp = cgroup_kn_lock_live(of->kn, true);

3408 if (!cgrp)

3409 return -ENODEV;

3410

3411 for_each_subsys(ss, ssid) {

3412 if (enable & (1 << ssid)) {

3413 if (cgrp->subtree_control & (1 << ssid)) {

3414 enable &= ~(1 << ssid);

3415 continue;

3416 }

3417

3418 if (!(cgroup_control(cgrp) & (1 << ssid))) {

3419 ret = -ENOENT;

3420 goto out_unlock;

3421 }

3422 } else if (disable & (1 << ssid)) {

3423 if (!(cgrp->subtree_control & (1 << ssid))) {

3424 disable &= ~(1 << ssid);

3425 continue;

3426 }

3427

3428 /* a child has it enabled? */

3429 cgroup_for_each_live_child(child, cgrp) {

3430 if (child->subtree_control & (1 << ssid)) {

3431 ret = -EBUSY;

3432 goto out_unlock;

3433 }

3434 }

3435 }

3436 }

3437

3438 if (!enable && !disable) {

3439 ret = 0;

3440 goto out_unlock;

3441 }

3442

3443 ret = cgroup_vet_subtree_control_enable(cgrp, enable);

3444 if (ret)

3445 goto out_unlock;

3446

3447 /* save and update control masks and prepare csses */

3448 cgroup_save_control(cgrp);

3449

3450 cgrp->subtree_control |= enable;

3451 cgrp->subtree_control &= ~disable;

3452

3453 ret = cgroup_apply_control(cgrp);

3454 cgroup_finalize_control(cgrp, ret);

3455 if (ret)

3456 goto out_unlock;

3457

3458 kernfs_activate(cgrp->kn);

3459 out_unlock:

3460 cgroup_kn_unlock(of->kn);

3461 return ret ?: nbytes;

3462 }

3463

3464 /**

3465 * cgroup_enable_threaded - make @cgrp threaded

3466 * @cgrp: the target cgroup

3467 *

3468 * Called when "threaded" is written to the cgroup.type interface file and

3469 * tries to make @cgrp threaded and join the parent's resource domain.

3470 * This function is never called on the root cgroup as cgroup.type doesn't

3471 * exist on it.

3472 */

3473 static int cgroup_enable_threaded(struct cgroup *cgrp)

3474 {

3475 struct cgroup *parent = cgroup_parent(cgrp);

3476 struct cgroup *dom_cgrp = parent->dom_cgrp;

3477 struct cgroup *dsct;

3478 struct cgroup_subsys_state *d_css;

3479 int ret;

3480

3481 lockdep_assert_held(&cgroup_mutex);

3482

3483 /* noop if already threaded */

3484 if (cgroup_is_threaded(cgrp))

3485 return 0;

3486

3487 /*

3488 * If @cgroup is populated or has domain controllers enabled, it

3489 * can't be switched. While the below cgroup_can_be_thread_root()

3490 * test can catch the same conditions, that's only when @parent is

3491 * not mixable, so let's check it explicitly.

3492 */

3493 if (cgroup_is_populated(cgrp) ||

3494 cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask)

3495 return -EOPNOTSUPP;

3496

3497 /* we're joining the parent's domain, ensure its validity */

3498 if (!cgroup_is_valid_domain(dom_cgrp) ||

3499 !cgroup_can_be_thread_root(dom_cgrp))

3500 return -EOPNOTSUPP;

3501

3502 /*

3503 * The following shouldn't cause actual migrations and should

3504 * always succeed.

3505 */

3506 cgroup_save_control(cgrp);

3507

3508 cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp)

3509 if (dsct == cgrp || cgroup_is_threaded(dsct))

3510 dsct->dom_cgrp = dom_cgrp;

3511

3512 ret = cgroup_apply_control(cgrp);

3513 if (!ret)

3514 parent->nr_threaded_children++;

3515

3516 cgroup_finalize_control(cgrp, ret);

3517 return ret;

3518 }

3519

3520 static int cgroup_type_show(struct seq_file *seq, void *v)

3521 {

3522 struct cgroup *cgrp = seq_css(seq)->cgroup;

3523

3524 if (cgroup_is_threaded(cgrp))

3525 seq_puts(seq, "threaded\n");

3526 else if (!cgroup_is_valid_domain(cgrp))

3527 seq_puts(seq, "domain invalid\n");

3528 else if (cgroup_is_thread_root(cgrp))

3529 seq_puts(seq, "domain threaded\n");

3530 else

3531 seq_puts(seq, "domain\n");

3532

3533 return 0;

3534 }

3535

3536 static ssize_t cgroup_type_write(struct kernfs_open_file *of, char *buf,

3537 size_t nbytes, loff_t off)

3538 {

3539 struct cgroup *cgrp;

3540 int ret;

3541

3542 /* only switching to threaded mode is supported */

3543 if (strcmp(strstrip(buf), "threaded"))

3544 return -EINVAL;

3545

3546 /* drain dying csses before we re-apply (threaded) subtree control */

3547 cgrp = cgroup_kn_lock_live(of->kn, true);

3548 if (!cgrp)

3549 return -ENOENT;

3550

3551 /* threaded can only be enabled */

3552 ret = cgroup_enable_threaded(cgrp);

3553

3554 cgroup_kn_unlock(of->kn);

3555 return ret ?: nbytes;

3556 }

3557

3558 static int cgroup_max_descendants_show(struct seq_file *seq, void *v)

3559 {

3560 struct cgroup *cgrp = seq_css(seq)->cgroup;

3561 int descendants = READ_ONCE(cgrp->max_descendants);

3562

3563 if (descendants == INT_MAX)

3564 seq_puts(seq, "max\n");

3565 else

3566 seq_printf(seq, "%d\n", descendants);

3567

3568 return 0;

3569 }

3570

3571 static ssize_t cgroup_max_descendants_write(struct kernfs_open_file *of,

3572 char *buf, size_t nbytes, loff_t off)

3573 {

3574 struct cgroup *cgrp;

3575 int descendants;

3576 ssize_t ret;

3577

3578 buf = strstrip(buf);

3579 if (!strcmp(buf, "max")) {

3580 descendants = INT_MAX;

3581 } else {

3582 ret = kstrtoint(buf, 0, &descendants);

3583 if (ret)

3584 return ret;

3585 }

3586

3587 if (descendants < 0)

3588 return -ERANGE;

3589

3590 cgrp = cgroup_kn_lock_live(of->kn, false);

3591 if (!cgrp)

3592 return -ENOENT;

3593

3594 cgrp->max_descendants = descendants;

3595

3596 cgroup_kn_unlock(of->kn);

3597

3598 return nbytes;

3599 }

3600

3601 static int cgroup_max_depth_show(struct seq_file *seq, void *v)

3602 {

3603 struct cgroup *cgrp = seq_css(seq)->cgroup;

3604 int depth = READ_ONCE(cgrp->max_depth);

3605

3606 if (depth == INT_MAX)

3607 seq_puts(seq, "max\n");

3608 else

3609 seq_printf(seq, "%d\n", depth);

3610

3611 return 0;

3612 }

3613

3614 static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of,

3615 char *buf, size_t nbytes, loff_t off)

3616 {

3617 struct cgroup *cgrp;

3618 ssize_t ret;

3619 int depth;

3620

3621 buf = strstrip(buf);

3622 if (!strcmp(buf, "max")) {

3623 depth = INT_MAX;

3624 } else {

3625 ret = kstrtoint(buf, 0, &depth);

3626 if (ret)

3627 return ret;

3628 }

3629

3630 if (depth < 0)

3631 return -ERANGE;

3632

3633 cgrp = cgroup_kn_lock_live(of->kn, false);

3634 if (!cgrp)

3635 return -ENOENT;

3636

3637 cgrp->max_depth = depth;

3638

3639 cgroup_kn_unlock(of->kn);

3640

3641 return nbytes;

3642 }

3643

3644 static int cgroup_events_show(struct seq_file *seq, void *v)

3645 {

3646 struct cgroup *cgrp = seq_css(seq)->cgroup;

3647

3648 seq_printf(seq, "populated %d\n", cgroup_is_populated(cgrp));

3649 seq_printf(seq, "frozen %d\n", test_bit(CGRP_FROZEN, &cgrp->flags));

3650

3651 return 0;

3652 }

3653

3654 static int cgroup_stat_show(struct seq_file *seq, void *v)

3655 {

3656 struct cgroup *cgroup = seq_css(seq)->cgroup;

3657

3658 seq_printf(seq, "nr_descendants %d\n",

3659 cgroup->nr_descendants);

3660 seq_printf(seq, "nr_dying_descendants %d\n",

3661 cgroup->nr_dying_descendants);

3662

3663 return 0;

3664 }

3665

3666 static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq,

3667 struct cgroup *cgrp, int ssid)

3668 {

3669 struct cgroup_subsys *ss = cgroup_subsys[ssid];

3670 struct cgroup_subsys_state *css;

3671 int ret;

3672

3673 if (!ss->css_extra_stat_show)

3674 return 0;

3675

3676 css = cgroup_tryget_css(cgrp, ss);

3677 if (!css)

3678 return 0;

3679

3680 ret = ss->css_extra_stat_show(seq, css);

3681 css_put(css);

3682 return ret;

3683 }

3684

3685 static int cpu_stat_show(struct seq_file *seq, void *v)

3686 {

3687 struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup;

3688 int ret = 0;

3689

3690 cgroup_base_stat_cputime_show(seq);

3691 #ifdef CONFIG_CGROUP_SCHED

3692 ret = cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id);

3693 #endif

3694 return ret;

3695 }

3696

3697 #ifdef CONFIG_PSI

3698 static int cgroup_io_pressure_show(struct seq_file *seq, void *v)

3699 {

3700 struct cgroup *cgrp = seq_css(seq)->cgroup;

3701 struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;

3702

3703 return psi_show(seq, psi, PSI_IO);

3704 }

3705 static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)

3706 {

3707 struct cgroup *cgrp = seq_css(seq)->cgroup;

3708 struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;

3709

3710 return psi_show(seq, psi, PSI_MEM);

3711 }

3712 static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)

3713 {

3714 struct cgroup *cgrp = seq_css(seq)->cgroup;

3715 struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;

3716

3717 return psi_show(seq, psi, PSI_CPU);

3718 }

3719

3720 static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,

3721 size_t nbytes, enum psi_res res)

3722 {

3723 struct cgroup_file_ctx *ctx = of->priv;

3724 struct psi_trigger *new;

3725 struct cgroup *cgrp;

3726 struct psi_group *psi;

3727

3728 cgrp = cgroup_kn_lock_live(of->kn, false);

3729 if (!cgrp)

3730 return -ENODEV;

3731

3732 cgroup_get(cgrp);

3733 cgroup_kn_unlock(of->kn);

3734

3735 /* Allow only one trigger per file descriptor */

3736 if (ctx->psi.trigger) {

3737 cgroup_put(cgrp);

3738 return -EBUSY;

3739 }

3740

3741 psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;

3742 new = psi_trigger_create(psi, buf, res);

3743 if (IS_ERR(new)) {

3744 cgroup_put(cgrp);

3745 return PTR_ERR(new);

3746 }

3747

3748 smp_store_release(&ctx->psi.trigger, new);

3749 cgroup_put(cgrp);

3750

3751 return nbytes;

3752 }

3753

3754 static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of,

3755 char *buf, size_t nbytes,

3756 loff_t off)

3757 {

3758 return cgroup_pressure_write(of, buf, nbytes, PSI_IO);

3759 }

3760

3761 static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of,

3762 char *buf, size_t nbytes,

3763 loff_t off)

3764 {

3765 return cgroup_pressure_write(of, buf, nbytes, PSI_MEM);

3766 }

3767

3768 static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,

3769 char *buf, size_t nbytes,

3770 loff_t off)

3771 {

3772 return cgroup_pressure_write(of, buf, nbytes, PSI_CPU);

3773 }

3774

3775 static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,

3776 poll_table *pt)

3777 {

3778 struct cgroup_file_ctx *ctx = of->priv;

3779

3780 return psi_trigger_poll(&ctx->psi.trigger, of->file, pt);

3781 }

3782

3783 static void cgroup_pressure_release(struct kernfs_open_file *of)

3784 {

3785 struct cgroup_file_ctx *ctx = of->priv;

3786

3787 psi_trigger_destroy(ctx->psi.trigger);

3788 }

3789

3790 bool cgroup_psi_enabled(void)

3791 {

3792 return (cgroup_feature_disable_mask & (1 << OPT_FEATURE_PRESSURE)) == 0;

3793 }

3794

3795 #else /* CONFIG_PSI */

/* !CONFIG_PSI variant: always reports false */
bool cgroup_psi_enabled(void)
{
	return false;
}

3800

3801 #endif /* CONFIG_PSI */

3802

3803 static int cgroup_freeze_show(struct seq_file *seq, void *v)

3804 {

3805 struct cgroup *cgrp = seq_css(seq)->cgroup;

3806

3807 seq_printf(seq, "%d\n", cgrp->freezer.freeze);

3808

3809 return 0;

3810 }

3811

3812 static ssize_t cgroup_freeze_write(struct kernfs_open_file *of,

3813 char *buf, size_t nbytes, loff_t off)

3814 {

3815 struct cgroup *cgrp;

3816 ssize_t ret;

3817 int freeze;

3818

3819 ret = kstrtoint(strstrip(buf), 0, &freeze);

3820 if (ret)

3821 return ret;

3822

3823 if (freeze < 0 || freeze > 1)

3824 return -ERANGE;

3825

3826 cgrp = cgroup_kn_lock_live(of->kn, false);

3827 if (!cgrp)

3828 return -ENOENT;

3829

3830 cgroup_freeze(cgrp, freeze);

3831

3832 cgroup_kn_unlock(of->kn);

3833

3834 return nbytes;

3835 }

3836

3837 static void __cgroup_kill(struct cgroup *cgrp)

3838 {

3839 struct css_task_iter it;

3840 struct task_struct *task;

3841

3842 lockdep_assert_held(&cgroup_mutex);

3843

3844 spin_lock_irq(&css_set_lock);

3845 set_bit(CGRP_KILL, &cgrp->flags);

3846 spin_unlock_irq(&css_set_lock);

3847

3848 css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED, &it);

3849 while ((task = css_task_iter_next(&it))) {

3850 /* Ignore kernel threads here. */

3851 if (task->flags & PF_KTHREAD)

3852 continue;

3853

3854 /* Skip tasks that are already dying. */

3855 if (__fatal_signal_pending(task))

3856 continue;

3857

3858 send_sig(SIGKILL, task, 0);

3859 }

3860 css_task_iter_end(&it);

3861

3862 spin_lock_irq(&css_set_lock);

3863 clear_bit(CGRP_KILL, &cgrp->flags);

3864 spin_unlock_irq(&css_set_lock);

3865 }

3866

3867 static void cgroup_kill(struct cgroup *cgrp)

3868 {

3869 struct cgroup_subsys_state *css;

3870 struct cgroup *dsct;

3871

3872 lockdep_assert_held(&cgroup_mutex);

3873

3874 cgroup_for_each_live_descendant_pre(dsct, css, cgrp)

3875 __cgroup_kill(dsct);

3876 }

3877

3878 static ssize_t cgroup_kill_write(struct kernfs_open_file *of, char *buf,

3879 size_t nbytes, loff_t off)

3880 {

3881 ssize_t ret = 0;

3882 int kill;

3883 struct cgroup *cgrp;

3884

3885 ret = kstrtoint(strstrip(buf), 0, &kill);

3886 if (ret)

3887 return ret;

3888

3889 if (kill != 1)

3890 return -ERANGE;

3891

3892 cgrp = cgroup_kn_lock_live(of->kn, false);

3893 if (!cgrp)

3894 return -ENOENT;

3895

3896 /*

3897 * Killing is a process directed operation, i.e. the whole thread-group

3898 * is taken down so act like we do for cgroup.procs and only make this

3899 * writable in non-threaded cgroups.

3900 */

3901 if (cgroup_is_threaded(cgrp))

3902 ret = -EOPNOTSUPP;

3903 else

3904 cgroup_kill(cgrp);

3905

3906 cgroup_kn_unlock(of->kn);

3907

3908 return ret ?: nbytes;

3909 }

3910

3911 static int cgroup_file_open(struct kernfs_open_file *of)

3912 {

3913 struct cftype *cft = of_cft(of);

3914 struct cgroup_file_ctx *ctx;

3915 int ret;

3916

3917 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);

3918 if (!ctx)

3919 return -ENOMEM;

3920

3921 ctx->ns = current->nsproxy->cgroup_ns;

3922 get_cgroup_ns(ctx->ns);

3923 of->priv = ctx;

3924

3925 if (!cft->open)

3926 return 0;

3927

3928 ret = cft->open(of);

3929 if (ret) {

3930 put_cgroup_ns(ctx->ns);

3931 kfree(ctx);

3932 }

3933 return ret;

3934 }

3935

3936 static void cgroup_file_release(struct kernfs_open_file *of)

3937 {

3938 struct cftype *cft = of_cft(of);

3939 struct cgroup_file_ctx *ctx = of->priv;

3940

3941 if (cft->release)

3942 cft->release(of);

3943 put_cgroup_ns(ctx->ns);

3944 kfree(ctx);

3945 }

3946

3947 static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,

3948 size_t nbytes, loff_t off)

3949 {

3950 struct cgroup_file_ctx *ctx = of->priv;

3951 struct cgroup *cgrp = of->kn->parent->priv;

3952 struct cftype *cft = of_cft(of);

3953 struct cgroup_subsys_state *css;

3954 int ret;

3955

3956 if (!nbytes)

3957 return 0;

3958

3959 /*

3960 * If namespaces are delegation boundaries, disallow writes to

3961 * files in an non-init namespace root from inside the namespace

3962 * except for the files explicitly marked delegatable -

3963 * cgroup.procs and cgroup.subtree_control.

3964 */

3965 if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) &&

3966 !(cft->flags & CFTYPE_NS_DELEGATABLE) &&

3967 ctx->ns != &init_cgroup_ns && ctx->ns->root_cset->dfl_cgrp == cgrp)

3968 return -EPERM;

3969

3970 if (cft->write)

3971 return cft->write(of, buf, nbytes, off);

3972

3973 /*

3974 * kernfs guarantees that a file isn't deleted with operations in

3975 * flight, which means that the matching css is and stays alive and

3976 * doesn't need to be pinned. The RCU locking is not necessary

3977 * either. It's just for the convenience of using cgroup_css().

3978 */

3979 rcu_read_lock();

3980 css = cgroup_css(cgrp, cft->ss);

3981 rcu_read_unlock();

3982

3983 if (cft->write_u64) {

3984 unsigned long long v;

3985 ret = kstrtoull(buf, 0, &v);

3986 if (!ret)

3987 ret = cft->write_u64(css, cft, v);

3988 } else if (cft->write_s64) {

3989 long long v;

3990 ret = kstrtoll(buf, 0, &v);

3991 if (!ret)

3992 ret = cft->write_s64(css, cft, v);

3993 } else {

3994 ret = -EINVAL;

3995 }

3996

3997 return ret ?: nbytes;

3998 }

3999

4000 static __poll_t cgroup_file_poll(struct kernfs_open_file *of, poll_table *pt)

4001 {

4002 struct cftype *cft = of_cft(of);

4003

4004 if (cft->poll)

4005 return cft->poll(of, pt);

4006

4007 return kernfs_generic_poll(of, pt);

4008 }

4009

4010 static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)

4011 {

4012 return seq_cft(seq)->seq_start(seq, ppos);

4013 }

4014

4015 static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)

4016 {

4017 return seq_cft(seq)->seq_next(seq, v, ppos);

4018 }

4019

4020 static void cgroup_seqfile_stop(struct seq_file *seq, void *v)

4021 {

4022 if (seq_cft(seq)->seq_stop)

4023 seq_cft(seq)->seq_stop(seq, v);

4024 }

4025

4026 static int cgroup_seqfile_show(struct seq_file *m, void *arg)

4027 {

4028 struct cftype *cft = seq_cft(m);

4029 struct cgroup_subsys_state *css = seq_css(m);

4030

4031 if (cft->seq_show)

4032 return cft->seq_show(m, arg);

4033

4034 if (cft->read_u64)

4035 seq_printf(m, "%llu\n", cft->read_u64(css, cft));

4036 else if (cft->read_s64)

4037 seq_printf(m, "%lld\n", cft->read_s64(css, cft));

4038 else

4039 return -EINVAL;

4040 return 0;

4041 }

4042

4043 static struct kernfs_ops cgroup_kf_single_ops = {

4044 .atomic_write_len = PAGE_SIZE,

4045 .open = cgroup_file_open,

4046 .release = cgroup_file_release,

4047 .write = cgroup_file_write,

4048 .poll = cgroup_file_poll,

4049 .seq_show = cgroup_seqfile_show,

4050 };

4051

4052 static struct kernfs_ops cgroup_kf_ops = {

4053 .atomic_write_len = PAGE_SIZE,

4054 .open = cgroup_file_open,

4055 .release = cgroup_file_release,

4056 .write = cgroup_file_write,

4057 .poll = cgroup_file_poll,

4058 .seq_start = cgroup_seqfile_start,

4059 .seq_next = cgroup_seqfile_next,

4060 .seq_stop = cgroup_seqfile_stop,

4061 .seq_show = cgroup_seqfile_show,

4062 };

4063

4064 /* set uid and gid of cgroup dirs and files to that of the creator */

4065 static int cgroup_kn_set_ugid(struct kernfs_node *kn)

4066 {

4067 struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,

4068 .ia_uid = current_fsuid(),

4069 .ia_gid = current_fsgid(), };

4070

4071 if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&

4072 gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))

4073 return 0;

4074

4075 return kernfs_setattr(kn, &iattr);

4076 }

4077

4078 static void cgroup_file_notify_timer(struct timer_list *timer)

4079 {

4080 cgroup_file_notify(container_of(timer, struct cgroup_file,

4081 notify_timer));

4082 }

4083

4084 static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,

4085 struct cftype *cft)

4086 {

4087 char name[CGROUP_FILE_NAME_MAX];

4088 struct kernfs_node *kn;

4089 struct lock_class_key *key = NULL;

4090 int ret;

4091

4092 #ifdef CONFIG_DEBUG_LOCK_ALLOC

4093 key = &cft->lockdep_key;

4094 #endif

4095 kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),

4096 cgroup_file_mode(cft),

4097 GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,

4098 0, cft->kf_ops, cft,

4099 NULL, key);

4100 if (IS_ERR(kn))

4101 return PTR_ERR(kn);

4102

4103 ret = cgroup_kn_set_ugid(kn);

4104 if (ret) {

4105 kernfs_remove(kn);

4106 return ret;

4107 }

4108

4109 if (cft->file_offset) {

4110 struct cgroup_file *cfile = (void *)css + cft->file_offset;

4111

4112 timer_setup(&cfile->notify_timer, cgroup_file_notify_timer, 0);

4113

4114 spin_lock_irq(&cgroup_file_kn_lock);

4115 cfile->kn = kn;

4116 spin_unlock_irq(&cgroup_file_kn_lock);

4117 }

4118

4119 return 0;

4120 }

4121

4122 /**

4123 * cgroup_addrm_files - add or remove files to a cgroup directory

4124 * @css: the target css

4125 * @cgrp: the target cgroup (usually css->cgroup)

4126 * @cfts: array of cftypes to be added

4127 * @is_add: whether to add or remove

4128 *

4129 * Depending on @is_add, add or remove files defined by @cfts on @cgrp.

4130 * For removals, this function never fails.

4131 */

4132 static int cgroup_addrm_files(struct cgroup_subsys_state *css,

4133 struct cgroup *cgrp, struct cftype cfts[],

4134 bool is_add)

4135 {

4136 struct cftype *cft, *cft_end = NULL;

4137 int ret = 0;

4138

4139 lockdep_assert_held(&cgroup_mutex);

4140

4141 restart:

4142 for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) {

4143 /* does cft->flags tell us to skip this file on @cgrp? */

4144 if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))

4145 continue;

4146 if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp))

4147 continue;

4148 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp))

4149 continue;

4150 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp))

4151 continue;

4152 if ((cft->flags & CFTYPE_DEBUG) && !cgroup_debug)

4153 continue;

4154 if (is_add) {

4155 ret = cgroup_add_file(css, cgrp, cft);

4156 if (ret) {

4157 pr_warn("%s: failed to add %s, err=%d\n",

4158 __func__, cft->name, ret);

4159 cft_end = cft;

4160 is_add = false;

4161 goto restart;

4162 }

4163 } else {

4164 cgroup_rm_file(cgrp, cft);

4165 }

4166 }

4167 return ret;

4168 }

4169

4170 static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)

4171 {

4172 struct cgroup_subsys *ss = cfts[0].ss;

4173 struct cgroup *root = &ss->root->cgrp;

4174 struct cgroup_subsys_state *css;

4175 int ret = 0;

4176

4177 lockdep_assert_held(&cgroup_mutex);

4178

4179 /* add/rm files for all cgroups created before */

4180 css_for_each_descendant_pre(css, cgroup_css(root, ss)) {

4181 struct cgroup *cgrp = css->cgroup;

4182

4183 if (!(css->flags & CSS_VISIBLE))

4184 continue;

4185

4186 ret = cgroup_addrm_files(css, cgrp, cfts, is_add);

4187 if (ret)

4188 break;

4189 }

4190

4191 if (is_add && !ret)

4192 kernfs_activate(root->kn);

4193 return ret;

4194 }

4195

4196 static void cgroup_exit_cftypes(struct cftype *cfts)

4197 {

4198 struct cftype *cft;

4199

4200 for (cft = cfts; cft->name[0] != '\0'; cft++) {

4201 /* free copy for custom atomic_write_len, see init_cftypes() */

4202 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE)

4203 kfree(cft->kf_ops);

4204 cft->kf_ops = NULL;

4205 cft->ss = NULL;

4206

4207 /* revert flags set by cgroup core while adding @cfts */

4208 cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL |

4209 __CFTYPE_ADDED);

4210 }

4211 }

4212

4213 static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)

4214 {

4215 struct cftype *cft;

4216 int ret = 0;

4217

4218 for (cft = cfts; cft->name[0] != '\0'; cft++) {

4219 struct kernfs_ops *kf_ops;

4220

4221 WARN_ON(cft->ss || cft->kf_ops);

4222

4223 if (cft->flags & __CFTYPE_ADDED) {

4224 ret = -EBUSY;

4225 break;

4226 }

4227

4228 if (cft->seq_start)

4229 kf_ops = &cgroup_kf_ops;

4230 else

4231 kf_ops = &cgroup_kf_single_ops;

4232

4233 /*

4234 * Ugh... if @cft wants a custom max_write_len, we need to

4235 * make a copy of kf_ops to set its atomic_write_len.

4236 */

4237 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {

4238 kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);

4239 if (!kf_ops) {

4240 ret = -ENOMEM;

4241 break;

4242 }

4243 kf_ops->atomic_write_len = cft->max_write_len;

4244 }

4245

4246 cft->kf_ops = kf_ops;

4247 cft->ss = ss;

4248 cft->flags |= __CFTYPE_ADDED;

4249 }

4250

4251 if (ret)

4252 cgroup_exit_cftypes(cfts);

4253 return ret;

4254 }

4255

4256 static int cgroup_rm_cftypes_locked(struct cftype *cfts)

4257 {

4258 lockdep_assert_held(&cgroup_mutex);

4259

4260 list_del(&cfts->node);

4261 cgroup_apply_cftypes(cfts, false);

4262 cgroup_exit_cftypes(cfts);

4263 return 0;

4264 }

4265

4266 /**

4267 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem

4268 * @cfts: zero-length name terminated array of cftypes

4269 *

4270 * Unregister @cfts. Files described by @cfts are removed from all

4271 * existing cgroups and all future cgroups won't have them either. This

4272 * function can be called anytime whether @cfts' subsys is attached or not.

4273 *

4274 * Returns 0 on successful unregistration, -ENOENT if @cfts is not

4275 * registered.

4276 */

4277 int cgroup_rm_cftypes(struct cftype *cfts)

4278 {

4279 int ret;

4280

4281 if (!cfts || cfts[0].name[0] == '\0')

4282 return 0;

4283

4284 if (!(cfts[0].flags & __CFTYPE_ADDED))

4285 return -ENOENT;

4286

4287 mutex_lock(&cgroup_mutex);

4288 ret = cgroup_rm_cftypes_locked(cfts);

4289 mutex_unlock(&cgroup_mutex);

4290 return ret;

4291 }

4292

4293 /**

4294 * cgroup_add_cftypes - add an array of cftypes to a subsystem

4295 * @ss: target cgroup subsystem

4296 * @cfts: zero-length name terminated array of cftypes

4297 *

4298 * Register @cfts to @ss. Files described by @cfts are created for all

4299 * existing cgroups to which @ss is attached and all future cgroups will

4300 * have them too. This function can be called anytime whether @ss is

4301 * attached or not.

4302 *

4303 * Returns 0 on successful registration, -errno on failure. Note that this

4304 * function currently returns 0 as long as @cfts registration is successful

4305 * even if some file creation attempts on existing cgroups fail.

4306 */

4307 static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)

4308 {

4309 int ret;

4310

4311 if (!cgroup_ssid_enabled(ss->id))

4312 return 0;

4313

4314 if (!cfts || cfts[0].name[0] == '\0')

4315 return 0;

4316

4317 ret = cgroup_init_cftypes(ss, cfts);

4318 if (ret)

4319 return ret;

4320

4321 mutex_lock(&cgroup_mutex);

4322

4323 list_add_tail(&cfts->node, &ss->cfts);

4324 ret = cgroup_apply_cftypes(cfts, true);

4325 if (ret)

4326 cgroup_rm_cftypes_locked(cfts);

4327

4328 mutex_unlock(&cgroup_mutex);

4329 return ret;

4330 }

4331

4332 /**

4333 * cgroup_add_dfl_cftypes - add an array of cftypes for default hierarchy

4334 * @ss: target cgroup subsystem

4335 * @cfts: zero-length name terminated array of cftypes

4336 *

4337 * Similar to cgroup_add_cftypes() but the added files are only used for

4338 * the default hierarchy.

4339 */

4340 int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)

4341 {

4342 struct cftype *cft;

4343

4344 for (cft = cfts; cft && cft->name[0] != '\0'; cft++)

4345 cft->flags |= __CFTYPE_ONLY_ON_DFL;

4346 return cgroup_add_cftypes(ss, cfts);

4347 }

4348

4349 /**

4350 * cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies

4351 * @ss: target cgroup subsystem

4352 * @cfts: zero-length name terminated array of cftypes

4353 *

4354 * Similar to cgroup_add_cftypes() but the added files are only used for

4355 * the legacy hierarchies.

4356 */

4357 int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)

4358 {

4359 struct cftype *cft;

4360

4361 for (cft = cfts; cft && cft->name[0] != '\0'; cft++)

4362 cft->flags |= __CFTYPE_NOT_ON_DFL;

4363 return cgroup_add_cftypes(ss, cfts);

4364 }

4365

4366 /**

4367 * cgroup_file_notify - generate a file modified event for a cgroup_file

4368 * @cfile: target cgroup_file

4369 *

4370 * @cfile must have been obtained by setting cftype->file_offset.

4371 */

4372 void cgroup_file_notify(struct cgroup_file *cfile)

4373 {

4374 unsigned long flags;

4375

4376 spin_lock_irqsave(&cgroup_file_kn_lock, flags);

4377 if (cfile->kn) {

4378 unsigned long last = cfile->notified_at;

4379 unsigned long next = last + CGROUP_FILE_NOTIFY_MIN_INTV;

4380

4381 if (time_in_range(jiffies, last, next)) {

4382 timer_reduce(&cfile->notify_timer, next);

4383 } else {

4384 kernfs_notify(cfile->kn);

4385 cfile->notified_at = jiffies;

4386 }

4387 }

4388 spin_unlock_irqrestore(&cgroup_file_kn_lock, flags);

4389 }

4390

4391 /**

4392 * cgroup_file_show - show or hide a hidden cgroup file

4393 * @cfile: target cgroup_file obtained by setting cftype->file_offset

4394 * @show: whether to show or hide

4395 */

4396 void cgroup_file_show(struct cgroup_file *cfile, bool show)

4397 {

4398 struct kernfs_node *kn;

4399

4400 spin_lock_irq(&cgroup_file_kn_lock);

4401 kn = cfile->kn;

4402 kernfs_get(kn);

4403 spin_unlock_irq(&cgroup_file_kn_lock);

4404

4405 if (kn)

4406 kernfs_show(kn, show);

4407

4408 kernfs_put(kn);

4409 }

4410

4411 /**

4412 * css_next_child - find the next child of a given css

4413 * @pos: the current position (%NULL to initiate traversal)

4414 * @parent: css whose children to walk

4415 *

4416 * This function returns the next child of @parent and should be called

4417 * under either cgroup_mutex or RCU read lock. The only requirement is

4418 * that @parent and @pos are accessible. The next sibling is guaranteed to

4419 * be returned regardless of their states.

4420 *

4421 * If a subsystem synchronizes ->css_online() and the start of iteration, a

4422 * css which finished ->css_online() is guaranteed to be visible in the

4423 * future iterations and will stay visible until the last reference is put.

4424 * A css which hasn't finished ->css_online() or already finished

4425 * ->css_offline() may show up during traversal. It's each subsystem's

4426 * responsibility to synchronize against on/offlining.

4427 */

4428 struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,

4429 struct cgroup_subsys_state *parent)

4430 {

4431 struct cgroup_subsys_state *next;

4432

4433 cgroup_assert_mutex_or_rcu_locked();

4434

4435 /*

4436 * @pos could already have been unlinked from the sibling list.

4437 * Once a cgroup is removed, its ->sibling.next is no longer

4438 * updated when its next sibling changes. CSS_RELEASED is set when

4439 * @pos is taken off list, at which time its next pointer is valid,

4440 * and, as releases are serialized, the one pointed to by the next

4441 * pointer is guaranteed to not have started release yet. This

4442 * implies that if we observe !CSS_RELEASED on @pos in this RCU

4443 * critical section, the one pointed to by its next pointer is

4444 * guaranteed to not have finished its RCU grace period even if we

4445 * have dropped rcu_read_lock() in-between iterations.

4446 *

4447 * If @pos has CSS_RELEASED set, its next pointer can't be

4448 * dereferenced; however, as each css is given a monotonically

4449 * increasing unique serial number and always appended to the

4450 * sibling list, the next one can be found by walking the parent's

4451 * children until the first css with higher serial number than

4452 * @pos's. While this path can be slower, it happens iff iteration

4453 * races against release and the race window is very small.

4454 */

4455 if (!pos) {

4456 next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling);

4457 } else if (likely(!(pos->flags & CSS_RELEASED))) {

4458 next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling);

4459 } else {

4460 list_for_each_entry_rcu(next, &parent->children, sibling,

4461 lockdep_is_held(&cgroup_mutex))

4462 if (next->serial_nr > pos->serial_nr)

4463 break;

4464 }

4465

4466 /*

4467 * @next, if not pointing to the head, can be dereferenced and is

4468 * the next sibling.

4469 */

4470 if (&next->sibling != &parent->children)

4471 return next;

4472 return NULL;

4473 }

4474

4475 /**

4476 * css_next_descendant_pre - find the next descendant for pre-order walk

4477 * @pos: the current position (%NULL to initiate traversal)

4478 * @root: css whose descendants to walk

4479 *

4480 * To be used by css_for_each_descendant_pre(). Find the next descendant

4481 * to visit for pre-order traversal of @root's descendants. @root is

4482 * included in the iteration and the first node to be visited.

4483 *

4484 * While this function requires cgroup_mutex or RCU read locking, it

4485 * doesn't require the whole traversal to be contained in a single critical

4486 * section. This function will return the correct next descendant as long

4487 * as both @pos and @root are accessible and @pos is a descendant of @root.

4488 *

4489 * If a subsystem synchronizes ->css_online() and the start of iteration, a

4490 * css which finished ->css_online() is guaranteed to be visible in the

4491 * future iterations and will stay visible until the last reference is put.

4492 * A css which hasn't finished ->css_online() or already finished

4493 * ->css_offline() may show up during traversal. It's each subsystem's

4494 * responsibility to synchronize against on/offlining.

4495 */

4496 struct cgroup_subsys_state *

4497 css_next_descendant_pre(struct cgroup_subsys_state *pos,

4498 struct cgroup_subsys_state *root)

4499 {

4500 struct cgroup_subsys_state *next;

4501

4502 cgroup_assert_mutex_or_rcu_locked();

4503

4504 /* if first iteration, visit @root */

4505 if (!pos)

4506 return root;

4507

4508 /* visit the first child if exists */

4509 next = css_next_child(NULL, pos);

4510 if (next)

4511 return next;

4512

4513 /* no child, visit my or the closest ancestor's next sibling */

4514 while (pos != root) {

4515 next = css_next_child(pos, pos->parent);

4516 if (next)

4517 return next;

4518 pos = pos->parent;

4519 }

4520

4521 return NULL;

4522 }

4523 EXPORT_SYMBOL_GPL(css_next_descendant_pre);

4524

4525 /**

4526 * css_rightmost_descendant - return the rightmost descendant of a css

4527 * @pos: css of interest

4528 *

4529 * Return the rightmost descendant of @pos. If there's no descendant, @pos

4530 * is returned. This can be used during pre-order traversal to skip

4531 * subtree of @pos.

4532 *

4533 * While this function requires cgroup_mutex or RCU read locking, it

4534 * doesn't require the whole traversal to be contained in a single critical

4535 * section. This function will return the correct rightmost descendant as

4536 * long as @pos is accessible.

4537 */

4538 struct cgroup_subsys_state *

4539 css_rightmost_descendant(struct cgroup_subsys_state *pos)

4540 {

4541 struct cgroup_subsys_state *last, *tmp;

4542

4543 cgroup_assert_mutex_or_rcu_locked();

4544

4545 do {

4546 last = pos;

4547 /* ->prev isn't RCU safe, walk ->next till the end */

4548 pos = NULL;

4549 css_for_each_child(tmp, last)

4550 pos = tmp;

4551 } while (pos);

4552

4553 return last;

4554 }

4555

4556 static struct cgroup_subsys_state *

4557 css_leftmost_descendant(struct cgroup_subsys_state *pos)

4558 {

4559 struct cgroup_subsys_state *last;

4560

4561 do {

4562 last = pos;

4563 pos = css_next_child(NULL, pos);

4564 } while (pos);

4565

4566 return last;

4567 }

4568

4569 /**

4570 * css_next_descendant_post - find the next descendant for post-order walk

4571 * @pos: the current position (%NULL to initiate traversal)

4572 * @root: css whose descendants to walk

4573 *

4574 * To be used by css_for_each_descendant_post(). Find the next descendant

4575 * to visit for post-order traversal of @root's descendants. @root is

4576 * included in the iteration and the last node to be visited.

4577 *

4578 * While this function requires cgroup_mutex or RCU read locking, it

4579 * doesn't require the whole traversal to be contained in a single critical

4580 * section. This function will return the correct next descendant as long

4581 * as both @pos and @cgroup are accessible and @pos is a descendant of

4582 * @cgroup.

4583 *

4584 * If a subsystem synchronizes ->css_online() and the start of iteration, a

4585 * css which finished ->css_online() is guaranteed to be visible in the

4586 * future iterations and will stay visible until the last reference is put.

4587 * A css which hasn't finished ->css_online() or already finished

4588 * ->css_offline() may show up during traversal. It's each subsystem's

4589 * responsibility to synchronize against on/offlining.

4590 */

4591 struct cgroup_subsys_state *

4592 css_next_descendant_post(struct cgroup_subsys_state *pos,

4593 struct cgroup_subsys_state *root)

4594 {

4595 struct cgroup_subsys_state *next;

4596

4597 cgroup_assert_mutex_or_rcu_locked();

4598

4599 /* if first iteration, visit leftmost descendant which may be @root */

4600 if (!pos)

4601 return css_leftmost_descendant(root);

4602

4603 /* if we visited @root, we're done */

4604 if (pos == root)

4605 return NULL;

4606

4607 /* if there's an unvisited sibling, visit its leftmost descendant */

4608 next = css_next_child(pos, pos->parent);

4609 if (next)

4610 return css_leftmost_descendant(next);

4611

4612 /* no sibling left, visit parent */

4613 return pos->parent;

4614 }

4615

4616 /**

4617 * css_has_online_children - does a css have online children

4618 * @css: the target css

4619 *

4620 * Returns %true if @css has any online children; otherwise, %false. This

4621 * function can be called from any context but the caller is responsible

4622 * for synchronizing against on/offlining as necessary.

4623 */

4624 bool css_has_online_children(struct cgroup_subsys_state *css)

4625 {

4626 struct cgroup_subsys_state *child;

4627 bool ret = false;

4628

4629 rcu_read_lock();

4630 css_for_each_child(child, css) {

4631 if (child->flags & CSS_ONLINE) {

4632 ret = true;

4633 break;

4634 }

4635 }

4636 rcu_read_unlock();

4637 return ret;

4638 }

4639

4640 static struct css_set *css_task_iter_next_css_set(struct css_task_iter *it)

4641 {

4642 struct list_head *l;

4643 struct cgrp_cset_link *link;

4644 struct css_set *cset;

4645

4646 lockdep_assert_held(&css_set_lock);

4647

4648 /* find the next threaded cset */

4649 if (it->tcset_pos) {

4650 l = it->tcset_pos->next;

4651

4652 if (l != it->tcset_head) {

4653 it->tcset_pos = l;

4654 return container_of(l, struct css_set,

4655 threaded_csets_node);

4656 }

4657

4658 it->tcset_pos = NULL;

4659 }

4660

4661 /* find the next cset */

4662 l = it->cset_pos;

4663 l = l->next;

4664 if (l == it->cset_head) {

4665 it->cset_pos = NULL;

4666 return NULL;

4667 }

4668

4669 if (it->ss) {

4670 cset = container_of(l, struct css_set, e_cset_node[it->ss->id]);

4671 } else {

4672 link = list_entry(l, struct cgrp_cset_link, cset_link);

4673 cset = link->cset;

4674 }

4675

4676 it->cset_pos = l;

4677

4678 /* initialize threaded css_set walking */

4679 if (it->flags & CSS_TASK_ITER_THREADED) {

4680 if (it->cur_dcset)

4681 put_css_set_locked(it->cur_dcset);

4682 it->cur_dcset = cset;

4683 get_css_set(cset);

4684

4685 it->tcset_head = &cset->threaded_csets;

4686 it->tcset_pos = &cset->threaded_csets;

4687 }

4688

4689 return cset;

4690 }

4691

4692 /**

4693 * css_task_iter_advance_css_set - advance a task iterator to the next css_set

4694 * @it: the iterator to advance

4695 *

4696 * Advance @it to the next css_set to walk.

4697 */

4698 static void css_task_iter_advance_css_set(struct css_task_iter *it)

4699 {

4700 struct css_set *cset;

4701

4702 lockdep_assert_held(&css_set_lock);

4703

4704 /* Advance to the next non-empty css_set and find first non-empty tasks list*/

4705 while ((cset = css_task_iter_next_css_set(it))) {

4706 if (!list_empty(&cset->tasks)) {

4707 it->cur_tasks_head = &cset->tasks;

4708 break;

4709 } else if (!list_empty(&cset->mg_tasks)) {

4710 it->cur_tasks_head = &cset->mg_tasks;

4711 break;

4712 } else if (!list_empty(&cset->dying_tasks)) {

4713 it->cur_tasks_head = &cset->dying_tasks;

4714 break;

4715 }

4716 }

4717 if (!cset) {

4718 it->task_pos = NULL;

4719 return;

4720 }

4721 it->task_pos = it->cur_tasks_head->next;

4722

4723 /*

4724 * We don't keep css_sets locked across iteration steps and thus

4725 * need to take steps to ensure that iteration can be resumed after

4726 * the lock is re-acquired. Iteration is performed at two levels -

4727 * css_sets and tasks in them.

4728 *

4729 * Once created, a css_set never leaves its cgroup lists, so a

4730 * pinned css_set is guaranteed to stay put and we can resume

4731 * iteration afterwards.

4732 *

4733 * Tasks may leave @cset across iteration steps. This is resolved

4734 * by registering each iterator with the css_set currently being

4735 * walked and making css_set_move_task() advance iterators whose

4736 * next task is leaving.

4737 */

4738 if (it->cur_cset) {

4739 list_del(&it->iters_node);

4740 put_css_set_locked(it->cur_cset);

4741 }

4742 get_css_set(cset);

4743 it->cur_cset = cset;

4744 list_add(&it->iters_node, &cset->task_iters);

4745 }

4746

4747 static void css_task_iter_skip(struct css_task_iter *it,

4748 struct task_struct *task)

4749 {

4750 lockdep_assert_held(&css_set_lock);

4751

4752 if (it->task_pos == &task->cg_list) {

4753 it->task_pos = it->task_pos->next;

4754 it->flags |= CSS_TASK_ITER_SKIPPED;

4755 }

4756 }

4757

4758 static void css_task_iter_advance(struct css_task_iter *it)

4759 {

4760 struct task_struct *task;

4761

4762 lockdep_assert_held(&css_set_lock);

4763 repeat:

4764 if (it->task_pos) {

4765 /*

4766 * Advance iterator to find next entry. We go through cset

4767 * tasks, mg_tasks and dying_tasks, when consumed we move onto

4768 * the next cset.

4769 */

4770 if (it->flags & CSS_TASK_ITER_SKIPPED)

4771 it->flags &= ~CSS_TASK_ITER_SKIPPED;

4772 else

4773 it->task_pos = it->task_pos->next;

4774

4775 if (it->task_pos == &it->cur_cset->tasks) {

4776 it->cur_tasks_head = &it->cur_cset->mg_tasks;

4777 it->task_pos = it->cur_tasks_head->next;

4778 }

4779 if (it->task_pos == &it->cur_cset->mg_tasks) {

4780 it->cur_tasks_head = &it->cur_cset->dying_tasks;

4781 it->task_pos = it->cur_tasks_head->next;

4782 }

4783 if (it->task_pos == &it->cur_cset->dying_tasks)

4784 css_task_iter_advance_css_set(it);

4785 } else {

4786 /* called from start, proceed to the first cset */

4787 css_task_iter_advance_css_set(it);

4788 }

4789

4790 if (!it->task_pos)

4791 return;

4792

4793 task = list_entry(it->task_pos, struct task_struct, cg_list);

4794

4795 if (it->flags & CSS_TASK_ITER_PROCS) {

4796 /* if PROCS, skip over tasks which aren't group leaders */

4797 if (!thread_group_leader(task))

4798 goto repeat;

4799

4800 /* and dying leaders w/o live member threads */

4801 if (it->cur_tasks_head == &it->cur_cset->dying_tasks &&

4802 !atomic_read(&task->signal->live))

4803 goto repeat;

4804 } else {

4805 /* skip all dying ones */

4806 if (it->cur_tasks_head == &it->cur_cset->dying_tasks)

4807 goto repeat;

4808 }

4809 }

4810

4811 /**

4812 * css_task_iter_start - initiate task iteration

4813 * @css: the css to walk tasks of

4814 * @flags: CSS_TASK_ITER_* flags

4815 * @it: the task iterator to use

4816 *

4817 * Initiate iteration through the tasks of @css. The caller can call

4818 * css_task_iter_next() to walk through the tasks until the function

4819 * returns NULL. On completion of iteration, css_task_iter_end() must be

4820 * called.

4821 */

4822 void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,

4823 struct css_task_iter *it)

4824 {

4825 memset(it, 0, sizeof(*it));

4826

4827 spin_lock_irq(&css_set_lock);

4828

4829 it->ss = css->ss;

4830 it->flags = flags;

4831

4832 if (CGROUP_HAS_SUBSYS_CONFIG && it->ss)

4833 it->cset_pos = &css->cgroup->e_csets[css->ss->id];

4834 else

4835 it->cset_pos = &css->cgroup->cset_links;

4836

4837 it->cset_head = it->cset_pos;

4838

4839 css_task_iter_advance(it);

4840

4841 spin_unlock_irq(&css_set_lock);

4842 }

4843

4844 /**

4845 * css_task_iter_next - return the next task for the iterator

4846 * @it: the task iterator being iterated

4847 *

4848 * The "next" function for task iteration. @it should have been

4849 * initialized via css_task_iter_start(). Returns NULL when the iteration

4850 * reaches the end.

4851 */

4852 struct task_struct *css_task_iter_next(struct css_task_iter *it)

4853 {

4854 if (it->cur_task) {

4855 put_task_struct(it->cur_task);

4856 it->cur_task = NULL;

4857 }

4858

4859 spin_lock_irq(&css_set_lock);

4860

4861 /* @it may be half-advanced by skips, finish advancing */

4862 if (it->flags & CSS_TASK_ITER_SKIPPED)

4863 css_task_iter_advance(it);

4864

4865 if (it->task_pos) {

4866 it->cur_task = list_entry(it->task_pos, struct task_struct,

4867 cg_list);

4868 get_task_struct(it->cur_task);

4869 css_task_iter_advance(it);

4870 }

4871

4872 spin_unlock_irq(&css_set_lock);

4873

4874 return it->cur_task;

4875 }

4876

4877 /**

4878 * css_task_iter_end - finish task iteration

4879 * @it: the task iterator to finish

4880 *

4881 * Finish task iteration started by css_task_iter_start().

4882 */

4883 void css_task_iter_end(struct css_task_iter *it)

4884 {

4885 if (it->cur_cset) {

4886 spin_lock_irq(&css_set_lock);

4887 list_del(&it->iters_node);

4888 put_css_set_locked(it->cur_cset);

4889 spin_unlock_irq(&css_set_lock);

4890 }

4891

4892 if (it->cur_dcset)

4893 put_css_set(it->cur_dcset);

4894

4895 if (it->cur_task)

4896 put_task_struct(it->cur_task);

4897 }

4898

4899 static void cgroup_procs_release(struct kernfs_open_file *of)

4900 {

4901 struct cgroup_file_ctx *ctx = of->priv;

4902

4903 if (ctx->procs.started)

4904 css_task_iter_end(&ctx->procs.iter);

4905 }

4906

4907 static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos)

4908 {

4909 struct kernfs_open_file *of = s->private;

4910 struct cgroup_file_ctx *ctx = of->priv;

4911

4912 if (pos)

4913 (*pos)++;

4914

4915 return css_task_iter_next(&ctx->procs.iter);

4916 }

4917

4918 static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos,

4919 unsigned int iter_flags)

4920 {

4921 struct kernfs_open_file *of = s->private;

4922 struct cgroup *cgrp = seq_css(s)->cgroup;

4923 struct cgroup_file_ctx *ctx = of->priv;

4924 struct css_task_iter *it = &ctx->procs.iter;

4925

4926 /*

4927 * When a seq_file is seeked, it's always traversed sequentially

4928 * from position 0, so we can simply keep iterating on !0 *pos.

4929 */

4930 if (!ctx->procs.started) {

4931 if (WARN_ON_ONCE((*pos)))

4932 return ERR_PTR(-EINVAL);

4933 css_task_iter_start(&cgrp->self, iter_flags, it);

4934 ctx->procs.started = true;

4935 } else if (!(*pos)) {

4936 css_task_iter_end(it);

4937 css_task_iter_start(&cgrp->self, iter_flags, it);

4938 } else

4939 return it->cur_task;

4940

4941 return cgroup_procs_next(s, NULL, NULL);

4942 }

4943

4944 static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)

4945 {

4946 struct cgroup *cgrp = seq_css(s)->cgroup;

4947

4948 /*

4949 * All processes of a threaded subtree belong to the domain cgroup

4950 * of the subtree. Only threads can be distributed across the

4951 * subtree. Reject reads on cgroup.procs in the subtree proper.

4952 * They're always empty anyway.

4953 */

4954 if (cgroup_is_threaded(cgrp))

4955 return ERR_PTR(-EOPNOTSUPP);

4956

4957 return __cgroup_procs_start(s, pos, CSS_TASK_ITER_PROCS |

4958 CSS_TASK_ITER_THREADED);

4959 }

4960

/* ->seq_show(): print task_pid_vnr() of the task in @v on its own line */
static int cgroup_procs_show(struct seq_file *s, void *v)
{
	struct task_struct *task = v;

	seq_printf(s, "%d\n", task_pid_vnr(task));

	return 0;
}

4966

4967 static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb)

4968 {

4969 int ret;

4970 struct inode *inode;

4971

4972 lockdep_assert_held(&cgroup_mutex);

4973

4974 inode = kernfs_get_inode(sb, cgrp->procs_file.kn);

4975 if (!inode)

4976 return -ENOMEM;

4977

4978 ret = inode_permission(&init_user_ns, inode, MAY_WRITE);

4979 iput(inode);

4980 return ret;

4981 }

4982

4983 static int cgroup_procs_write_permission(struct cgroup *src_cgrp,

4984 struct cgroup *dst_cgrp,

4985 struct super_block *sb,

4986 struct cgroup_namespace *ns)

4987 {

4988 struct cgroup *com_cgrp = src_cgrp;

4989 int ret;

4990

4991 lockdep_assert_held(&cgroup_mutex);

4992

4993 /* find the common ancestor */

4994 while (!cgroup_is_descendant(dst_cgrp, com_cgrp))

4995 com_cgrp = cgroup_parent(com_cgrp);

4996

4997 /* %current should be authorized to migrate to the common ancestor */

4998 ret = cgroup_may_write(com_cgrp, sb);

4999 if (ret)

5000 return ret;

5001

5002 /*

5003 * If namespaces are delegation boundaries, %current must be able

5004 * to see both source and destination cgroups from its namespace.

5005 */

5006 if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) &&

5007 (!cgroup_is_descendant(src_cgrp, ns->root_cset->dfl_cgrp) ||

5008 !cgroup_is_descendant(dst_cgrp, ns->root_cset->dfl_cgrp)))

5009 return -ENOENT;

5010

5011 return 0;

5012 }

5013

5014 static int cgroup_attach_permissions(struct cgroup *src_cgrp,

5015 struct cgroup *dst_cgrp,

5016 struct super_block *sb, bool threadgroup,

5017 struct cgroup_namespace *ns)

5018 {

5019 int ret = 0;

5020

5021 ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb, ns);

5022 if (ret)

5023 return ret;

5024

5025 ret = cgroup_migrate_vet_dst(dst_cgrp);

5026 if (ret)

5027 return ret;

5028

5029 if (!threadgroup && (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp))

5030 ret = -EOPNOTSUPP;

5031

5032 return ret;

5033 }

5034

5035 static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,

5036 bool threadgroup)

5037 {

5038 struct cgroup_file_ctx *ctx = of->priv;

5039 struct cgroup *src_cgrp, *dst_cgrp;

5040 struct task_struct *task;

5041 const struct cred *saved_cred;

5042 ssize_t ret;

5043 bool threadgroup_locked;

5044

5045 dst_cgrp = cgroup_kn_lock_live(of->kn, false);

5046 if (!dst_cgrp)

5047 return -ENODEV;

5048

5049 task = cgroup_procs_write_start(buf, threadgroup, &threadgroup_locked);

5050 ret = PTR_ERR_OR_ZERO(task);

5051 if (ret)

5052 goto out_unlock;

5053

5054 /* find the source cgroup */

5055 spin_lock_irq(&css_set_lock);

5056 src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);

5057 spin_unlock_irq(&css_set_lock);

5058

5059 /*

5060 * Process and thread migrations follow same delegation rule. Check

5061 * permissions using the credentials from file open to protect against

5062 * inherited fd attacks.

5063 */

5064 saved_cred = override_creds(of->file->f_cred);

5065 ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,

5066 of->file->f_path.dentry->d_sb,

5067 threadgroup, ctx->ns);

5068 revert_creds(saved_cred);

5069 if (ret)

5070 goto out_finish;

5071

5072 ret = cgroup_attach_task(dst_cgrp, task, threadgroup);

5073

5074 out_finish:

5075 cgroup_procs_write_finish(task, threadgroup_locked);

5076 out_unlock:

5077 cgroup_kn_unlock(of->kn);

5078

5079 return ret;

5080 }

5081

5082 static ssize_t cgroup_procs_write(struct kernfs_open_file *of,

5083 char *buf, size_t nbytes, loff_t off)

5084 {

5085 return __cgroup_procs_write(of, buf, true) ?: nbytes;

5086 }

5087

5088 static void *cgroup_threads_start(struct seq_file *s, loff_t *pos)

5089 {

5090 return __cgroup_procs_start(s, pos, 0);

5091 }

5092

5093 static ssize_t cgroup_threads_write(struct kernfs_open_file *of,

5094 char *buf, size_t nbytes, loff_t off)

5095 {

5096 return __cgroup_procs_write(of, buf, false) ?: nbytes;

5097 }

5098

5099 /* cgroup core interface files for the default hierarchy */

5100 static struct cftype cgroup_base_files[] = {

5101 {

5102 .name = "cgroup.type",

5103 .flags = CFTYPE_NOT_ON_ROOT,

5104 .seq_show = cgroup_type_show,

5105 .write = cgroup_type_write,

5106 },

5107 {

5108 .name = "cgroup.procs",

5109 .flags = CFTYPE_NS_DELEGATABLE,

5110 .file_offset = offsetof(struct cgroup, procs_file),

5111 .release = cgroup_procs_release,

5112 .seq_start = cgroup_procs_start,

5113 .seq_next = cgroup_procs_next,

5114 .seq_show = cgroup_procs_show,

5115 .write = cgroup_procs_write,

5116 },

5117 {

5118 .name = "cgroup.threads",

5119 .flags = CFTYPE_NS_DELEGATABLE,

5120 .release = cgroup_procs_release,

5121 .seq_start = cgroup_threads_start,

5122 .seq_next = cgroup_procs_next,

5123 .seq_show = cgroup_procs_show,

5124 .write = cgroup_threads_write,

5125 },

5126 {

5127 .name = "cgroup.controllers",

5128 .seq_show = cgroup_controllers_show,

5129 },

5130 {

5131 .name = "cgroup.subtree_control",

5132 .flags = CFTYPE_NS_DELEGATABLE,

5133 .seq_show = cgroup_subtree_control_show,

5134 .write = cgroup_subtree_control_write,

5135 },

5136 {

5137 .name = "cgroup.events",

5138 .flags = CFTYPE_NOT_ON_ROOT,

5139 .file_offset = offsetof(struct cgroup, events_file),

5140 .seq_show = cgroup_events_show,

5141 },

5142 {

5143 .name = "cgroup.max.descendants",

5144 .seq_show = cgroup_max_descendants_show,

5145 .write = cgroup_max_descendants_write,

5146 },

5147 {

5148 .name = "cgroup.max.depth",

5149 .seq_show = cgroup_max_depth_show,

5150 .write = cgroup_max_depth_write,

5151 },

5152 {

5153 .name = "cgroup.stat",

5154 .seq_show = cgroup_stat_show,

5155 },

5156 {

5157 .name = "cgroup.freeze",

5158 .flags = CFTYPE_NOT_ON_ROOT,

5159 .seq_show = cgroup_freeze_show,

5160 .write = cgroup_freeze_write,

5161 },

5162 {

5163 .name = "cgroup.kill",

5164 .flags = CFTYPE_NOT_ON_ROOT,

5165 .write = cgroup_kill_write,

5166 },

5167 {

5168 .name = "cpu.stat",

5169 .seq_show = cpu_stat_show,

5170 },

5171 { } /* terminate */

5172 };

5173

5174 static struct cftype cgroup_psi_files[] = {

5175 #ifdef CONFIG_PSI

5176 {

5177 .name = "io.pressure",

5178 .seq_show = cgroup_io_pressure_show,

5179 .write = cgroup_io_pressure_write,

5180 .poll = cgroup_pressure_poll,

5181 .release = cgroup_pressure_release,

5182 },

5183 {

5184 .name = "memory.pressure",

5185 .seq_show = cgroup_memory_pressure_show,

5186 .write = cgroup_memory_pressure_write,

5187 .poll = cgroup_pressure_poll,

5188 .release = cgroup_pressure_release,

5189 },

5190 {

5191 .name = "cpu.pressure",

5192 .seq_show = cgroup_cpu_pressure_show,

5193 .write = cgroup_cpu_pressure_write,

5194 .poll = cgroup_pressure_poll,

5195 .release = cgroup_pressure_release,

5196 },

5197 #endif /* CONFIG_PSI */

5198 { } /* terminate */

5199 };

5200

5201 /*

5202 * css destruction is four-stage process.

5203 *

5204 * 1. Destruction starts. Killing of the percpu_ref is initiated.

5205 * Implemented in kill_css().

5206 *

5207 * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs

5208 * and thus css_tryget_online() is guaranteed to fail, the css can be

5209 * offlined by invoking offline_css(). After offlining, the base ref is

5210 * put. Implemented in css_killed_work_fn().

5211 *

5212 * 3. When the percpu_ref reaches zero, the only possible remaining

5213 * accessors are inside RCU read sections. css_release() schedules the

5214 * RCU callback.

5215 *

5216 * 4. After the grace period, the css can be freed. Implemented in

5217 * css_free_work_fn().

5218 *

5219 * It is actually hairier because both step 2 and 4 require process context

5220 * and thus involve punting to css->destroy_work adding two additional

5221 * steps to the already complex sequence.

5222 */

5223 static void css_free_rwork_fn(struct work_struct *work)

5224 {

5225 struct cgroup_subsys_state *css = container_of(to_rcu_work(work),

5226 struct cgroup_subsys_state, destroy_rwork);

5227 struct cgroup_subsys *ss = css->ss;

5228 struct cgroup *cgrp = css->cgroup;

5229

5230 percpu_ref_exit(&css->refcnt);

5231

5232 if (ss) {

5233 /* css free path */

5234 struct cgroup_subsys_state *parent = css->parent;

5235 int id = css->id;

5236

5237 ss->css_free(css);

5238 cgroup_idr_remove(&ss->css_idr, id);

5239 cgroup_put(cgrp);

5240

5241 if (parent)

5242 css_put(parent);

5243 } else {

5244 /* cgroup free path */

5245 atomic_dec(&cgrp->root->nr_cgrps);

5246 cgroup1_pidlist_destroy_all(cgrp);

5247 cancel_work_sync(&cgrp->release_agent_work);

5248 bpf_cgrp_storage_free(cgrp);

5249

5250 if (cgroup_parent(cgrp)) {

5251 /*

5252 * We get a ref to the parent, and put the ref when

5253 * this cgroup is being freed, so it's guaranteed

5254 * that the parent won't be destroyed before its

5255 * children.

5256 */

5257 cgroup_put(cgroup_parent(cgrp));

5258 kernfs_put(cgrp->kn);

5259 psi_cgroup_free(cgrp);

5260 cgroup_rstat_exit(cgrp);

5261 kfree(cgrp);

5262 } else {

5263 /*

5264 * This is root cgroup's refcnt reaching zero,

5265 * which indicates that the root should be

5266 * released.

5267 */

5268 cgroup_destroy_root(cgrp->root);

5269 }

5270 }

5271 }

5272

5273 static void css_release_work_fn(struct work_struct *work)

5274 {

5275 struct cgroup_subsys_state *css =

5276 container_of(work, struct cgroup_subsys_state, destroy_work);

5277 struct cgroup_subsys *ss = css->ss;

5278 struct cgroup *cgrp = css->cgroup;

5279

5280 mutex_lock(&cgroup_mutex);

5281

5282 css->flags |= CSS_RELEASED;

5283 list_del_rcu(&css->sibling);

5284

5285 if (ss) {

5286 /* css release path */

5287 if (!list_empty(&css->rstat_css_node)) {

5288 cgroup_rstat_flush(cgrp);

5289 list_del_rcu(&css->rstat_css_node);

5290 }

5291

5292 cgroup_idr_replace(&ss->css_idr, NULL, css->id);

5293 if (ss->css_released)

5294 ss->css_released(css);

5295 } else {

5296 struct cgroup *tcgrp;

5297

5298 /* cgroup release path */

5299 TRACE_CGROUP_PATH(release, cgrp);

5300

5301 cgroup_rstat_flush(cgrp);

5302

5303 spin_lock_irq(&css_set_lock);

5304 for (tcgrp = cgroup_parent(cgrp); tcgrp;

5305 tcgrp = cgroup_parent(tcgrp))

5306 tcgrp->nr_dying_descendants--;

5307 spin_unlock_irq(&css_set_lock);

5308

5309 /*

5310 * There are two control paths which try to determine

5311 * cgroup from dentry without going through kernfs -

5312 * cgroupstats_build() and css_tryget_online_from_dir().

5313 * Those are supported by RCU protecting clearing of

5314 * cgrp->kn->priv backpointer.

5315 */

5316 if (cgrp->kn)

5317 RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv,

5318 NULL);

5319 }

5320

5321 mutex_unlock(&cgroup_mutex);

5322

5323 INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);

5324 queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);

5325 }

5326

5327 static void css_release(struct percpu_ref *ref)

5328 {

5329 struct cgroup_subsys_state *css =

5330 container_of(ref, struct cgroup_subsys_state, refcnt);

5331

5332 INIT_WORK(&css->destroy_work, css_release_work_fn);

5333 queue_work(cgroup_destroy_wq, &css->destroy_work);

5334 }

5335

5336 static void init_and_link_css(struct cgroup_subsys_state *css,

5337 struct cgroup_subsys *ss, struct cgroup *cgrp)

5338 {

5339 lockdep_assert_held(&cgroup_mutex);

5340

5341 cgroup_get_live(cgrp);

5342

5343 memset(css, 0, sizeof(*css));

5344 css->cgroup = cgrp;

5345 css->ss = ss;

5346 css->id = -1;

5347 INIT_LIST_HEAD(&css->sibling);

5348 INIT_LIST_HEAD(&css->children);

5349 INIT_LIST_HEAD(&css->rstat_css_node);

5350 css->serial_nr = css_serial_nr_next++;

5351 atomic_set(&css->online_cnt, 0);

5352

5353 if (cgroup_parent(cgrp)) {

5354 css->parent = cgroup_css(cgroup_parent(cgrp), ss);

5355 css_get(css->parent);

5356 }

5357

5358 if (ss->css_rstat_flush)

5359 list_add_rcu(&css->rstat_css_node, &cgrp->rstat_css_list);

5360

5361 BUG_ON(cgroup_css(cgrp, ss));

5362 }

5363

5364 /* invoke ->css_online() on a new CSS and mark it online if successful */

5365 static int online_css(struct cgroup_subsys_state *css)

5366 {

5367 struct cgroup_subsys *ss = css->ss;

5368 int ret = 0;

5369

5370 lockdep_assert_held(&cgroup_mutex);

5371

5372 if (ss->css_online)

5373 ret = ss->css_online(css);

5374 if (!ret) {

5375 css->flags |= CSS_ONLINE;

5376 rcu_assign_pointer(css->cgroup->subsys[ss->id], css);

5377

5378 atomic_inc(&css->online_cnt);

5379 if (css->parent)

5380 atomic_inc(&css->parent->online_cnt);

5381 }

5382 return ret;

5383 }

5384

5385 /* if the CSS is online, invoke ->css_offline() on it and mark it offline */

5386 static void offline_css(struct cgroup_subsys_state *css)

5387 {

5388 struct cgroup_subsys *ss = css->ss;

5389

5390 lockdep_assert_held(&cgroup_mutex);

5391

5392 if (!(css->flags & CSS_ONLINE))

5393 return;

5394

5395 if (ss->css_offline)

5396 ss->css_offline(css);

5397

5398 css->flags &= ~CSS_ONLINE;

5399 RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);

5400

5401 wake_up_all(&css->cgroup->offline_waitq);

5402 }

5403

5404 /**

5405 * css_create - create a cgroup_subsys_state

5406 * @cgrp: the cgroup new css will be associated with

5407 * @ss: the subsys of new css

5408 *

5409 * Create a new css associated with @cgrp - @ss pair. On success, the new

5410 * css is online and installed in @cgrp. This function doesn't create the

5411 * interface files. Returns 0 on success, -errno on failure.

5412 */

5413 static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,

5414 struct cgroup_subsys *ss)

5415 {

5416 struct cgroup *parent = cgroup_parent(cgrp);

5417 struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss);

5418 struct cgroup_subsys_state *css;

5419 int err;

5420

5421 lockdep_assert_held(&cgroup_mutex);

5422

5423 css = ss->css_alloc(parent_css);

5424 if (!css)

5425 css = ERR_PTR(-ENOMEM);

5426 if (IS_ERR(css))

5427 return css;

5428

5429 init_and_link_css(css, ss, cgrp);

5430

5431 err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL);

5432 if (err)

5433 goto err_free_css;

5434

5435 err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL);

5436 if (err < 0)

5437 goto err_free_css;

5438 css->id = err;

5439

5440 /* @css is ready to be brought online now, make it visible */

5441 list_add_tail_rcu(&css->sibling, &parent_css->children);

5442 cgroup_idr_replace(&ss->css_idr, css, css->id);

5443

5444 err = online_css(css);

5445 if (err)

5446 goto err_list_del;

5447

5448 return css;

5449

5450 err_list_del:

5451 list_del_rcu(&css->sibling);

5452 err_free_css:

5453 list_del_rcu(&css->rstat_css_node);

5454 INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);

5455 queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);

5456 return ERR_PTR(err);

5457 }

5458

5459 /*

5460 * The returned cgroup is fully initialized including its control mask, but

5461 * it isn't associated with its kernfs_node and doesn't have the control

5462 * mask applied.

5463 */

5464 static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,

5465 umode_t mode)

5466 {

5467 struct cgroup_root *root = parent->root;

5468 struct cgroup *cgrp, *tcgrp;

5469 struct kernfs_node *kn;

5470 int level = parent->level + 1;

5471 int ret;

5472

5473 /* allocate the cgroup and its ID, 0 is reserved for the root */

5474 cgrp = kzalloc(struct_size(cgrp, ancestors, (level + 1)), GFP_KERNEL);

5475 if (!cgrp)

5476 return ERR_PTR(-ENOMEM);

5477

5478 ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL);

5479 if (ret)

5480 goto out_free_cgrp;

5481

5482 ret = cgroup_rstat_init(cgrp);

5483 if (ret)

5484 goto out_cancel_ref;

5485

5486 /* create the directory */

5487 kn = kernfs_create_dir(parent->kn, name, mode, cgrp);

5488 if (IS_ERR(kn)) {

5489 ret = PTR_ERR(kn);

5490 goto out_stat_exit;

5491 }

5492 cgrp->kn = kn;

5493

5494 init_cgroup_housekeeping(cgrp);

5495

5496 cgrp->self.parent = &parent->self;

5497 cgrp->root = root;

5498 cgrp->level = level;

5499

5500 ret = psi_cgroup_alloc(cgrp);

5501 if (ret)

5502 goto out_kernfs_remove;

5503

5504 ret = cgroup_bpf_inherit(cgrp);

5505 if (ret)

5506 goto out_psi_free;

5507

5508 /*

5509 * New cgroup inherits effective freeze counter, and

5510 * if the parent has to be frozen, the child has too.

5511 */

5512 cgrp->freezer.e_freeze = parent->freezer.e_freeze;

5513 if (cgrp->freezer.e_freeze) {

5514 /*

5515 * Set the CGRP_FREEZE flag, so when a process will be

5516 * attached to the child cgroup, it will become frozen.

5517 * At this point the new cgroup is unpopulated, so we can

5518 * consider it frozen immediately.

5519 */

5520 set_bit(CGRP_FREEZE, &cgrp->flags);

5521 set_bit(CGRP_FROZEN, &cgrp->flags);

5522 }

5523

5524 spin_lock_irq(&css_set_lock);

5525 for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {

5526 cgrp->ancestors[tcgrp->level] = tcgrp;

5527

5528 if (tcgrp != cgrp) {

5529 tcgrp->nr_descendants++;

5530

5531 /*

5532 * If the new cgroup is frozen, all ancestor cgroups

5533 * get a new frozen descendant, but their state can't

5534 * change because of this.

5535 */

5536 if (cgrp->freezer.e_freeze)

5537 tcgrp->freezer.nr_frozen_descendants++;

5538 }

5539 }

5540 spin_unlock_irq(&css_set_lock);

5541

5542 if (notify_on_release(parent))

5543 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);

5544

5545 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))

5546 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);

5547

5548 cgrp->self.serial_nr = css_serial_nr_next++;

5549

5550 /* allocation complete, commit to creation */

5551 list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children);

5552 atomic_inc(&root->nr_cgrps);

5553 cgroup_get_live(parent);

5554

5555 /*

5556 * On the default hierarchy, a child doesn't automatically inherit

5557 * subtree_control from the parent. Each is configured manually.

5558 */

5559 if (!cgroup_on_dfl(cgrp))

5560 cgrp->subtree_control = cgroup_control(cgrp);

5561

5562 cgroup_propagate_control(cgrp);

5563

5564 return cgrp;

5565

5566 out_psi_free:

5567 psi_cgroup_free(cgrp);

5568 out_kernfs_remove:

5569 kernfs_remove(cgrp->kn);

5570 out_stat_exit:

5571 cgroup_rstat_exit(cgrp);

5572 out_cancel_ref:

5573 percpu_ref_exit(&cgrp->self.refcnt);

5574 out_free_cgrp:

5575 kfree(cgrp);

5576 return ERR_PTR(ret);

5577 }

5578

5579 static bool cgroup_check_hierarchy_limits(struct cgroup *parent)

5580 {

5581 struct cgroup *cgroup;

5582 int ret = false;

5583 int level = 1;

5584

5585 lockdep_assert_held(&cgroup_mutex);

5586

5587 for (cgroup = parent; cgroup; cgroup = cgroup_parent(cgroup)) {

5588 if (cgroup->nr_descendants >= cgroup->max_descendants)

5589 goto fail;

5590

5591 if (level > cgroup->max_depth)

5592 goto fail;

5593

5594 level++;

5595 }

5596

5597 ret = true;

5598 fail:

5599 return ret;

5600 }

5601

5602 int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)

5603 {

5604 struct cgroup *parent, *cgrp;

5605 int ret;

5606

5607 /* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */

5608 if (strchr(name, '\n'))

5609 return -EINVAL;

5610

5611 parent = cgroup_kn_lock_live(parent_kn, false);

5612 if (!parent)

5613 return -ENODEV;

5614

5615 if (!cgroup_check_hierarchy_limits(parent)) {

5616 ret = -EAGAIN;

5617 goto out_unlock;

5618 }

5619

5620 cgrp = cgroup_create(parent, name, mode);

5621 if (IS_ERR(cgrp)) {

5622 ret = PTR_ERR(cgrp);

5623 goto out_unlock;

5624 }

5625

5626 /*

5627 * This extra ref will be put in cgroup_free_fn() and guarantees

5628 * that @cgrp->kn is always accessible.

5629 */

5630 kernfs_get(cgrp->kn);

5631

5632 ret = cgroup_kn_set_ugid(cgrp->kn);

5633 if (ret)

5634 goto out_destroy;

5635

5636 ret = css_populate_dir(&cgrp->self);

5637 if (ret)

5638 goto out_destroy;

5639

5640 ret = cgroup_apply_control_enable(cgrp);

5641 if (ret)

5642 goto out_destroy;

5643

5644 TRACE_CGROUP_PATH(mkdir, cgrp);

5645

5646 /* let's create and online css's */

5647 kernfs_activate(cgrp->kn);

5648

5649 ret = 0;

5650 goto out_unlock;

5651

5652 out_destroy:

5653 cgroup_destroy_locked(cgrp);

5654 out_unlock:

5655 cgroup_kn_unlock(parent_kn);

5656 return ret;

5657 }

5658

5659 /*

5660 * This is called when the refcnt of a css is confirmed to be killed.

5661 * css_tryget_online() is now guaranteed to fail. Tell the subsystem to

5662 * initiate destruction and put the css ref from kill_css().

5663 */

5664 static void css_killed_work_fn(struct work_struct *work)

5665 {

5666 struct cgroup_subsys_state *css =

5667 container_of(work, struct cgroup_subsys_state, destroy_work);

5668

5669 mutex_lock(&cgroup_mutex);

5670

5671 do {

5672 offline_css(css);

5673 css_put(css);

5674 /* @css can't go away while we're holding cgroup_mutex */

5675 css = css->parent;

5676 } while (css && atomic_dec_and_test(&css->online_cnt));

5677

5678 mutex_unlock(&cgroup_mutex);

5679 }

5680

5681 /* css kill confirmation processing requires process context, bounce */

5682 static void css_killed_ref_fn(struct percpu_ref *ref)

5683 {

5684 struct cgroup_subsys_state *css =

5685 container_of(ref, struct cgroup_subsys_state, refcnt);

5686

5687 if (atomic_dec_and_test(&css->online_cnt)) {

5688 INIT_WORK(&css->destroy_work, css_killed_work_fn);

5689 queue_work(cgroup_destroy_wq, &css->destroy_work);

5690 }

5691 }

5692

5693 /**

5694 * kill_css - destroy a css

5695 * @css: css to destroy

5696 *

5697 * This function initiates destruction of @css by removing cgroup interface

5698 * files and putting its base reference. ->css_offline() will be invoked

5699 * asynchronously once css_tryget_online() is guaranteed to fail and when

5700 * the reference count reaches zero, @css will be released.

5701 */

5702 static void kill_css(struct cgroup_subsys_state *css)

5703 {

5704 lockdep_assert_held(&cgroup_mutex);

5705

5706 if (css->flags & CSS_DYING)

5707 return;

5708

5709 css->flags |= CSS_DYING;

5710

5711 /*

5712 * This must happen before css is disassociated with its cgroup.

5713 * See seq_css() for details.

5714 */

5715 css_clear_dir(css);

5716

5717 /*

5718 * Killing would put the base ref, but we need to keep it alive

5719 * until after ->css_offline().

5720 */

5721 css_get(css);

5722

5723 /*

5724 * cgroup core guarantees that, by the time ->css_offline() is

5725 * invoked, no new css reference will be given out via

5726 * css_tryget_online(). We can't simply call percpu_ref_kill() and

5727 * proceed to offlining css's because percpu_ref_kill() doesn't

5728 * guarantee that the ref is seen as killed on all CPUs on return.

5729 *

5730 * Use percpu_ref_kill_and_confirm() to get notifications as each

5731 * css is confirmed to be seen as killed on all CPUs.

5732 */

5733 percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);

5734 }

5735

5736 /**

5737 * cgroup_destroy_locked - the first stage of cgroup destruction

5738 * @cgrp: cgroup to be destroyed

5739 *

5740 * css's make use of percpu refcnts whose killing latency shouldn't be

5741 * exposed to userland and are RCU protected. Also, cgroup core needs to

5742 * guarantee that css_tryget_online() won't succeed by the time

5743 * ->css_offline() is invoked. To satisfy all the requirements,

5744 * destruction is implemented in the following two steps.

5745 *

5746 * s1. Verify @cgrp can be destroyed and mark it dying. Remove all

5747 * userland visible parts and start killing the percpu refcnts of

5748 * css's. Set up so that the next stage will be kicked off once all

5749 * the percpu refcnts are confirmed to be killed.

5750 *

5751 * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the

5752 * rest of destruction. Once all cgroup references are gone, the

5753 * cgroup is RCU-freed.

5754 *

5755 * This function implements s1. After this step, @cgrp is gone as far as

5756 * the userland is concerned and a new cgroup with the same name may be

5757 * created. As cgroup doesn't care about the names internally, this

5758 * doesn't cause any problem.

5759 */

5760 static int cgroup_destroy_locked(struct cgroup *cgrp)

5761 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)

5762 {

5763 struct cgroup *tcgrp, *parent = cgroup_parent(cgrp);

5764 struct cgroup_subsys_state *css;

5765 struct cgrp_cset_link *link;

5766 int ssid;

5767

5768 lockdep_assert_held(&cgroup_mutex);

5769

5770 /*

5771 * Only migration can raise populated from zero and we're already

5772 * holding cgroup_mutex.

5773 */

5774 if (cgroup_is_populated(cgrp))

5775 return -EBUSY;

5776

5777 /*

5778 * Make sure there's no live children. We can't test emptiness of

5779 * ->self.children as dead children linger on it while being

5780 * drained; otherwise, "rmdir parent/child parent" may fail.

5781 */

5782 if (css_has_online_children(&cgrp->self))

5783 return -EBUSY;

5784

5785 /*

5786 * Mark @cgrp and the associated csets dead. The former prevents

5787 * further task migration and child creation by disabling

5788 * cgroup_lock_live_group(). The latter makes the csets ignored by

5789 * the migration path.

5790 */

5791 cgrp->self.flags &= ~CSS_ONLINE;

5792

5793 spin_lock_irq(&css_set_lock);

5794 list_for_each_entry(link, &cgrp->cset_links, cset_link)

5795 link->cset->dead = true;

5796 spin_unlock_irq(&css_set_lock);

5797

5798 /* initiate massacre of all css's */

5799 for_each_css(css, ssid, cgrp)

5800 kill_css(css);

5801

5802 /* clear and remove @cgrp dir, @cgrp has an extra ref on its kn */

5803 css_clear_dir(&cgrp->self);

5804 kernfs_remove(cgrp->kn);

5805

5806 if (cgroup_is_threaded(cgrp))

5807 parent->nr_threaded_children--;

5808

5809 spin_lock_irq(&css_set_lock);

5810 for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) {

5811 tcgrp->nr_descendants--;

5812 tcgrp->nr_dying_descendants++;

5813 /*

5814 * If the dying cgroup is frozen, decrease frozen descendants

5815 * counters of ancestor cgroups.

5816 */

5817 if (test_bit(CGRP_FROZEN, &cgrp->flags))

5818 tcgrp->freezer.nr_frozen_descendants--;

5819 }

5820 spin_unlock_irq(&css_set_lock);

5821

5822 cgroup1_check_for_release(parent);

5823

5824 cgroup_bpf_offline(cgrp);

5825

5826 /* put the base reference */

5827 percpu_ref_kill(&cgrp->self.refcnt);

5828

5829 return 0;

5830 };

5831

5832 int cgroup_rmdir(struct kernfs_node *kn)

5833 {

5834 struct cgroup *cgrp;

5835 int ret = 0;

5836

5837 cgrp = cgroup_kn_lock_live(kn, false);

5838 if (!cgrp)

5839 return 0;

5840

5841 ret = cgroup_destroy_locked(cgrp);

5842 if (!ret)

5843 TRACE_CGROUP_PATH(rmdir, cgrp);

5844

5845 cgroup_kn_unlock(kn);

5846 return ret;

5847 }

5848

5849 static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {

5850 .show_options = cgroup_show_options,

5851 .mkdir = cgroup_mkdir,

5852 .rmdir = cgroup_rmdir,

5853 .show_path = cgroup_show_path,

5854 };

5855

5856 static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)

5857 {

5858 struct cgroup_subsys_state *css;

5859

5860 pr_debug("Initializing cgroup subsys %s\n", ss->name);

5861

5862 mutex_lock(&cgroup_mutex);

5863

5864 idr_init(&ss->css_idr);

5865 INIT_LIST_HEAD(&ss->cfts);

5866

5867 /* Create the root cgroup state for this subsystem */

5868 ss->root = &cgrp_dfl_root;

5869 css = ss->css_alloc(NULL);

5870 /* We don't handle early failures gracefully */

5871 BUG_ON(IS_ERR(css));

5872 init_and_link_css(css, ss, &cgrp_dfl_root.cgrp);

5873

5874 /*

5875 * Root csses are never destroyed and we can't initialize

5876 * percpu_ref during early init. Disable refcnting.

5877 */

5878 css->flags |= CSS_NO_REF;

5879

5880 if (early) {

5881 /* allocation can't be done safely during early init */

5882 css->id = 1;

5883 } else {

5884 css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL);

5885 BUG_ON(css->id < 0);

5886 }

5887

5888 /* Update the init_css_set to contain a subsys

5889 * pointer to this state - since the subsystem is

5890 * newly registered, all tasks and hence the

5891 * init_css_set is in the subsystem's root cgroup. */

5892 init_css_set.subsys[ss->id] = css;

5893

5894 have_fork_callback |= (bool)ss->fork << ss->id;

5895 have_exit_callback |= (bool)ss->exit << ss->id;

5896 have_release_callback |= (bool)ss->release << ss->id;

5897 have_canfork_callback |= (bool)ss->can_fork << ss->id;

5898

5899 /* At system boot, before all subsystems have been

5900 * registered, no tasks have been forked, so we don't

5901 * need to invoke fork callbacks here. */

5902 BUG_ON(!list_empty(&init_task.tasks));

5903

5904 BUG_ON(online_css(css));

5905

5906 mutex_unlock(&cgroup_mutex);

5907 }

5908

5909 /**

5910 * cgroup_init_early - cgroup initialization at system boot

5911 *

5912 * Initialize cgroups at system boot, and initialize any

5913 * subsystems that request early init.

5914 */

5915 int __init cgroup_init_early(void)

5916 {

5917 static struct cgroup_fs_context __initdata ctx;

5918 struct cgroup_subsys *ss;

5919 int i;

5920

5921 ctx.root = &cgrp_dfl_root;

5922 init_cgroup_root(&ctx);

5923 cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF;

5924

5925 RCU_INIT_POINTER(init_task.cgroups, &init_css_set);

5926

5927 for_each_subsys(ss, i) {

5928 WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id,

5929 "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p id:name=%d:%s\n",

5930 i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free,

5931 ss->id, ss->name);

5932 WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,

5933 "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]);

5934

5935 ss->id = i;

5936 ss->name = cgroup_subsys_name[i];

5937 if (!ss->legacy_name)

5938 ss->legacy_name = cgroup_subsys_name[i];

5939

5940 if (ss->early_init)

5941 cgroup_init_subsys(ss, true);

5942 }

5943 return 0;

5944 }

5945

5946 /**

5947 * cgroup_init - cgroup initialization

5948 *

5949 * Register cgroup filesystem and /proc file, and initialize

5950 * any subsystems that didn't request early init.

5951 */

5952 int __init cgroup_init(void)

5953 {

5954 struct cgroup_subsys *ss;

5955 int ssid;

5956

5957 BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);

5958 BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));

5959 BUG_ON(cgroup_init_cftypes(NULL, cgroup_psi_files));

5960 BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));

5961

5962 cgroup_rstat_boot();

5963

5964 get_user_ns(init_cgroup_ns.user_ns);

5965

5966 mutex_lock(&cgroup_mutex);

5967

5968 /*

5969 * Add init_css_set to the hash table so that dfl_root can link to

5970 * it during init.

5971 */

5972 hash_add(css_set_table, &init_css_set.hlist,

5973 css_set_hash(init_css_set.subsys));

5974

5975 BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));

5976

5977 mutex_unlock(&cgroup_mutex);

5978

5979 for_each_subsys(ss, ssid) {

5980 if (ss->early_init) {

5981 struct cgroup_subsys_state *css =

5982 init_css_set.subsys[ss->id];

5983

5984 css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2,

5985 GFP_KERNEL);

5986 BUG_ON(css->id < 0);

5987 } else {

5988 cgroup_init_subsys(ss, false);

5989 }

5990

5991 list_add_tail(&init_css_set.e_cset_node[ssid],

5992 &cgrp_dfl_root.cgrp.e_csets[ssid]);

5993

5994 /*

5995 * Setting dfl_root subsys_mask needs to consider the

5996 * disabled flag and cftype registration needs kmalloc,

5997 * both of which aren't available during early_init.

5998 */

5999 if (!cgroup_ssid_enabled(ssid))

6000 continue;

6001

6002 if (cgroup1_ssid_disabled(ssid))

6003 printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n",

6004 ss->name);

6005

6006 cgrp_dfl_root.subsys_mask |= 1 << ss->id;

6007

6008 /* implicit controllers must be threaded too */

6009 WARN_ON(ss->implicit_on_dfl && !ss->threaded);

6010

6011 if (ss->implicit_on_dfl)

6012 cgrp_dfl_implicit_ss_mask |= 1 << ss->id;

6013 else if (!ss->dfl_cftypes)

6014 cgrp_dfl_inhibit_ss_mask |= 1 << ss->id;

6015

6016 if (ss->threaded)

6017 cgrp_dfl_threaded_ss_mask |= 1 << ss->id;

6018

6019 if (ss->dfl_cftypes == ss->legacy_cftypes) {

6020 WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));

6021 } else {

6022 WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes));

6023 WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes));

6024 }

6025

6026 if (ss->bind)

6027 ss->bind(init_css_set.subsys[ssid]);

6028

6029 mutex_lock(&cgroup_mutex);

6030 css_populate_dir(init_css_set.subsys[ssid]);

6031 mutex_unlock(&cgroup_mutex);

6032 }

6033

6034 /* init_css_set.subsys[] has been updated, re-hash */

6035 hash_del(&init_css_set.hlist);

6036 hash_add(css_set_table, &init_css_set.hlist,

6037 css_set_hash(init_css_set.subsys));

6038

6039 WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup"));

6040 WARN_ON(register_filesystem(&cgroup_fs_type));

6041 WARN_ON(register_filesystem(&cgroup2_fs_type));

6042 WARN_ON(!proc_create_single("cgroups", 0, NULL, proc_cgroupstats_show));

6043 #ifdef CONFIG_CPUSETS

6044 WARN_ON(register_filesystem(&cpuset_fs_type));

6045 #endif

6046

6047 return 0;

6048 }

6049

6050 static int __init cgroup_wq_init(void)

6051 {

6052 /*

6053 * There isn't much point in executing destruction path in

6054 * parallel. Good chunk is serialized with cgroup_mutex anyway.

6055 * Use 1 for @max_active.

6056 *

6057 * We would prefer to do this in cgroup_init() above, but that

6058 * is called before init_workqueues(): so leave this until after.

6059 */

6060 cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);

6061 BUG_ON(!cgroup_destroy_wq);

6062 return 0;

6063 }

6064 core_initcall(cgroup_wq_init);

6065

6066 void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)

6067 {

6068 struct kernfs_node *kn;

6069

6070 kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id);

6071 if (!kn)

6072 return;

6073 kernfs_path(kn, buf, buflen);

6074 kernfs_put(kn);

6075 }

6076

6077 /*

6078 * cgroup_get_from_id : get the cgroup associated with cgroup id

6079 * @id: cgroup id

6080 * On success return the cgrp or ERR_PTR on failure

6081 * Only cgroups within current task's cgroup NS are valid.

6082 */

6083 struct cgroup *cgroup_get_from_id(u64 id)

6084 {

6085 struct kernfs_node *kn;

6086 struct cgroup *cgrp, *root_cgrp;

6087

6088 kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id);

6089 if (!kn)

6090 return ERR_PTR(-ENOENT);

6091

6092 if (kernfs_type(kn) != KERNFS_DIR) {

6093 kernfs_put(kn);

6094 return ERR_PTR(-ENOENT);

6095 }

6096

6097 rcu_read_lock();

6098

6099 cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);

6100 if (cgrp && !cgroup_tryget(cgrp))

6101 cgrp = NULL;

6102

6103 rcu_read_unlock();

6104 kernfs_put(kn);

6105

6106 if (!cgrp)

6107 return ERR_PTR(-ENOENT);

6108

6109 spin_lock_irq(&css_set_lock);

6110 root_cgrp = current_cgns_cgroup_from_root(&cgrp_dfl_root);

6111 spin_unlock_irq(&css_set_lock);

6112 if (!cgroup_is_descendant(cgrp, root_cgrp)) {

6113 cgroup_put(cgrp);

6114 return ERR_PTR(-ENOENT);

6115 }

6116

6117 return cgrp;

6118 }

6119 EXPORT_SYMBOL_GPL(cgroup_get_from_id);

6120

6121 /*

6122 * proc_cgroup_show()

6123 * - Print task's cgroup paths into seq_file, one line for each hierarchy

6124 * - Used for /proc/<pid>/cgroup.

6125 */

6126 int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,

6127 struct pid *pid, struct task_struct *tsk)

6128 {

6129 char *buf;

6130 int retval;

6131 struct cgroup_root *root;

6132

6133 retval = -ENOMEM;

6134 buf = kmalloc(PATH_MAX, GFP_KERNEL);

6135 if (!buf)

6136 goto out;

6137

6138 mutex_lock(&cgroup_mutex);

6139 spin_lock_irq(&css_set_lock);

6140

6141 for_each_root(root) {

6142 struct cgroup_subsys *ss;

6143 struct cgroup *cgrp;

6144 int ssid, count = 0;

6145

6146 if (root == &cgrp_dfl_root && !READ_ONCE(cgrp_dfl_visible))

6147 continue;

6148

6149 seq_printf(m, "%d:", root->hierarchy_id);

6150 if (root != &cgrp_dfl_root)

6151 for_each_subsys(ss, ssid)

6152 if (root->subsys_mask & (1 << ssid))

6153 seq_printf(m, "%s%s", count++ ? "," : "",

6154 ss->legacy_name);

6155 if (strlen(root->name))

6156 seq_printf(m, "%sname=%s", count ? "," : "",

6157 root->name);

6158 seq_putc(m, ':');

6159

6160 cgrp = task_cgroup_from_root(tsk, root);

6161

6162 /*

6163 * On traditional hierarchies, all zombie tasks show up as

6164 * belonging to the root cgroup. On the default hierarchy,

6165 * while a zombie doesn't show up in "cgroup.procs" and

6166 * thus can't be migrated, its /proc/PID/cgroup keeps

6167 * reporting the cgroup it belonged to before exiting. If

6168 * the cgroup is removed before the zombie is reaped,

6169 * " (deleted)" is appended to the cgroup path.

6170 */

6171 if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {

6172 retval = cgroup_path_ns_locked(cgrp, buf, PATH_MAX,

6173 current->nsproxy->cgroup_ns);

6174 if (retval >= PATH_MAX)

6175 retval = -ENAMETOOLONG;

6176 if (retval < 0)

6177 goto out_unlock;

6178

6179 seq_puts(m, buf);

6180 } else {

6181 seq_puts(m, "/");

6182 }

6183

6184 if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp))

6185 seq_puts(m, " (deleted)\n");

6186 else

6187 seq_putc(m, '\n');

6188 }

6189

6190 retval = 0;

6191 out_unlock:

6192 spin_unlock_irq(&css_set_lock);

6193 mutex_unlock(&cgroup_mutex);

6194 kfree(buf);

6195 out:

6196 return retval;

6197 }

6198

6199 /**

6200 * cgroup_fork - initialize cgroup related fields during copy_process()

6201 * @child: pointer to task_struct of forking parent process.

6202 *

6203 * A task is associated with the init_css_set until cgroup_post_fork()

6204 * attaches it to the target css_set.

6205 */

6206 void cgroup_fork(struct task_struct *child)

6207 {

6208 RCU_INIT_POINTER(child->cgroups, &init_css_set);

6209 INIT_LIST_HEAD(&child->cg_list);

6210 }

6211

6212 static struct cgroup *cgroup_get_from_file(struct file *f)

6213 {

6214 struct cgroup_subsys_state *css;

6215 struct cgroup *cgrp;

6216

6217 css = css_tryget_online_from_dir(f->f_path.dentry, NULL);

6218 if (IS_ERR(css))

6219 return ERR_CAST(css);

6220

6221 cgrp = css->cgroup;

6222 return cgrp;

6223 }

6224

/**
 * cgroup_css_set_fork - find or create a css_set for a child process
 * @kargs: the arguments passed to create the child process
 *
 * This function finds or creates a new css_set which the child
 * process will be attached to in cgroup_post_fork(). By default,
 * the child process will be given the same css_set as its parent.
 *
 * If CLONE_INTO_CGROUP is specified this function will try to find an
 * existing css_set which includes the requested cgroup and if not create
 * a new css_set that the child will be attached to later. If this function
 * succeeds it will hold cgroup_threadgroup_rwsem on return. If
 * CLONE_INTO_CGROUP is requested this function will grab cgroup mutex
 * before grabbing cgroup_threadgroup_rwsem and will hold a reference
 * to the target cgroup.
 *
 * On success 0 is returned, @kargs->cset holds a referenced css_set and,
 * for CLONE_INTO_CGROUP, @kargs->cgrp holds the target cgroup. On failure
 * a negative errno is returned and every lock and reference taken here has
 * been dropped again.
 */
static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
	__acquires(&cgroup_mutex) __acquires(&cgroup_threadgroup_rwsem)
{
	int ret;
	struct cgroup *dst_cgrp = NULL;
	struct css_set *cset;
	struct super_block *sb;
	struct file *f;

	/* cgroup_mutex is only needed when a target cgroup was requested. */
	if (kargs->flags & CLONE_INTO_CGROUP)
		mutex_lock(&cgroup_mutex);

	cgroup_threadgroup_change_begin(current);

	/* Pin the parent's current css_set. */
	spin_lock_irq(&css_set_lock);
	cset = task_css_set(current);
	get_css_set(cset);
	spin_unlock_irq(&css_set_lock);

	/* Plain fork: the child simply inherits the parent's css_set. */
	if (!(kargs->flags & CLONE_INTO_CGROUP)) {
		kargs->cset = cset;
		return 0;
	}

	/* Every "goto err" below runs with CLONE_INTO_CGROUP set. */
	f = fget_raw(kargs->cgroup);
	if (!f) {
		ret = -EBADF;
		goto err;
	}
	sb = f->f_path.dentry->d_sb;

	dst_cgrp = cgroup_get_from_file(f);
	if (IS_ERR(dst_cgrp)) {
		ret = PTR_ERR(dst_cgrp);
		/* Reset so that the error path does not cgroup_put() an ERR_PTR. */
		dst_cgrp = NULL;
		goto err;
	}

	if (cgroup_is_dead(dst_cgrp)) {
		ret = -ENODEV;
		goto err;
	}

	/*
	 * Verify that the target cgroup is writable for us. This is
	 * usually done by the vfs layer but since we're not going through
	 * the vfs layer here we need to do it "manually".
	 */
	ret = cgroup_may_write(dst_cgrp, sb);
	if (ret)
		goto err;

	/*
	 * Spawning a task directly into a cgroup works by passing a file
	 * descriptor to the target cgroup directory. This can even be an O_PATH
	 * file descriptor. But it can never be a cgroup.procs file descriptor.
	 * This was done on purpose so spawning into a cgroup could be
	 * conceptualized as an atomic
	 *
	 *   fd = openat(dfd_cgroup, "cgroup.procs", ...);
	 *   write(fd, <child-pid>, ...);
	 *
	 * sequence, i.e. it's a shorthand for the caller opening and writing
	 * cgroup.procs of the cgroup indicated by @dfd_cgroup. This allows us
	 * to always use the caller's credentials.
	 */
	ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb,
					!(kargs->flags & CLONE_THREAD),
					current->nsproxy->cgroup_ns);
	if (ret)
		goto err;

	kargs->cset = find_css_set(cset, dst_cgrp);
	if (!kargs->cset) {
		ret = -ENOMEM;
		goto err;
	}

	/*
	 * Success: the parent's css_set and the file are no longer needed;
	 * the reference on dst_cgrp is handed over to @kargs->cgrp.
	 */
	put_css_set(cset);
	fput(f);
	kargs->cgrp = dst_cgrp;
	return ret;

err:
	/* Undo everything in reverse: locks first, then references. */
	cgroup_threadgroup_change_end(current);
	mutex_unlock(&cgroup_mutex);
	if (f)
		fput(f);
	if (dst_cgrp)
		cgroup_put(dst_cgrp);
	put_css_set(cset);
	if (kargs->cset)
		put_css_set(kargs->cset);
	return ret;
}

6336

6337 /**

6338 * cgroup_css_set_put_fork - drop references we took during fork

6339 * @kargs: the arguments passed to create the child process

6340 *

6341 * Drop references to the prepared css_set and target cgroup if

6342 * CLONE_INTO_CGROUP was requested.

6343 */

6344 static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs)

6345 __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)

6346 {

6347 cgroup_threadgroup_change_end(current);

6348

6349 if (kargs->flags & CLONE_INTO_CGROUP) {

6350 struct cgroup *cgrp = kargs->cgrp;

6351 struct css_set *cset = kargs->cset;

6352

6353 mutex_unlock(&cgroup_mutex);

6354

6355 if (cset) {

6356 put_css_set(cset);

6357 kargs->cset = NULL;

6358 }

6359

6360 if (cgrp) {

6361 cgroup_put(cgrp);

6362 kargs->cgrp = NULL;

6363 }

6364 }

6365 }

6366

6367 /**

6368 * cgroup_can_fork - called on a new task before the process is exposed

6369 * @child: the child process

6370 * @kargs: the arguments passed to create the child process

6371 *

6372 * This prepares a new css_set for the child process which the child will

6373 * be attached to in cgroup_post_fork().

6374 * This calls the subsystem can_fork() callbacks. If the cgroup_can_fork()

6375 * callback returns an error, the fork aborts with that error code. This

6376 * allows for a cgroup subsystem to conditionally allow or deny new forks.

6377 */

6378 int cgroup_can_fork(struct task_struct *child, struct kernel_clone_args *kargs)

6379 {

6380 struct cgroup_subsys *ss;

6381 int i, j, ret;

6382

6383 ret = cgroup_css_set_fork(kargs);

6384 if (ret)

6385 return ret;

6386

6387 do_each_subsys_mask(ss, i, have_canfork_callback) {

6388 ret = ss->can_fork(child, kargs->cset);

6389 if (ret)

6390 goto out_revert;

6391 } while_each_subsys_mask();

6392

6393 return 0;

6394

6395 out_revert:

6396 for_each_subsys(ss, j) {

6397 if (j >= i)

6398 break;

6399 if (ss->cancel_fork)

6400 ss->cancel_fork(child, kargs->cset);

6401 }

6402

6403 cgroup_css_set_put_fork(kargs);

6404

6405 return ret;

6406 }

6407

6408 /**

6409 * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()

6410 * @child: the child process

6411 * @kargs: the arguments passed to create the child process

6412 *

6413 * This calls the cancel_fork() callbacks if a fork failed *after*

6414 * cgroup_can_fork() succeeded and cleans up references we took to

6415 * prepare a new css_set for the child process in cgroup_can_fork().

6416 */

6417 void cgroup_cancel_fork(struct task_struct *child,

6418 struct kernel_clone_args *kargs)

6419 {

6420 struct cgroup_subsys *ss;

6421 int i;

6422

6423 for_each_subsys(ss, i)

6424 if (ss->cancel_fork)

6425 ss->cancel_fork(child, kargs->cset);

6426

6427 cgroup_css_set_put_fork(kargs);

6428 }

6429

/**
 * cgroup_post_fork - finalize cgroup setup for the child process
 * @child: the child process
 * @kargs: the arguments passed to create the child process
 *
 * Attach the child process to its css_set calling the subsystem fork()
 * callbacks. Also arms the freezer trap or sends SIGKILL to the child when
 * the cgroup it lands in has CGRP_FREEZE or CGRP_KILL set, and finally
 * releases what cgroup_can_fork() left held via cgroup_css_set_put_fork().
 */
void cgroup_post_fork(struct task_struct *child,
		      struct kernel_clone_args *kargs)
	__releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
{
	unsigned long cgrp_flags = 0;
	bool kill = false;
	struct cgroup_subsys *ss;
	struct css_set *cset;
	int i;

	/*
	 * Take over the prepared css_set; clearing the field keeps
	 * cgroup_css_set_put_fork() below from dropping it again.
	 */
	cset = kargs->cset;
	kargs->cset = NULL;

	spin_lock_irq(&css_set_lock);

	/* init tasks are special, only link regular threads */
	if (likely(child->pid)) {
		/* Snapshot the flags of the cgroup the child ends up in. */
		if (kargs->cgrp)
			cgrp_flags = kargs->cgrp->flags;
		else
			cgrp_flags = cset->dfl_cgrp->flags;

		WARN_ON_ONCE(!list_empty(&child->cg_list));
		cset->nr_tasks++;
		css_set_move_task(child, NULL, cset, false);
	} else {
		put_css_set(cset);
		cset = NULL;
	}

	if (!(child->flags & PF_KTHREAD)) {
		if (unlikely(test_bit(CGRP_FREEZE, &cgrp_flags))) {
			/*
			 * If the cgroup has to be frozen, the new task has
			 * too. Let's set the JOBCTL_TRAP_FREEZE jobctl bit to
			 * get the task into the frozen state.
			 */
			spin_lock(&child->sighand->siglock);
			WARN_ON_ONCE(child->frozen);
			child->jobctl |= JOBCTL_TRAP_FREEZE;
			spin_unlock(&child->sighand->siglock);

			/*
			 * Calling cgroup_update_frozen() isn't required here,
			 * because it will be called anyway a bit later from
			 * do_freezer_trap(). So we avoid cgroup's transient
			 * switch from the frozen state and back.
			 */
		}

		/*
		 * If the cgroup is to be killed notice it now and take the
		 * child down right after we finished preparing it for
		 * userspace.
		 */
		kill = test_bit(CGRP_KILL, &cgrp_flags);
	}

	spin_unlock_irq(&css_set_lock);

	/*
	 * Call ss->fork().  This must happen after @child is linked on
	 * css_set; otherwise, @child might change state between ->fork()
	 * and addition to css_set.
	 */
	do_each_subsys_mask(ss, i, have_fork_callback) {
		ss->fork(child);
	} while_each_subsys_mask();

	/* Make the new cset the root_cset of the new cgroup namespace. */
	if (kargs->flags & CLONE_NEWCGROUP) {
		struct css_set *rcset = child->nsproxy->cgroup_ns->root_cset;

		/* Take the new reference before dropping the old root_cset. */
		get_css_set(cset);
		child->nsproxy->cgroup_ns->root_cset = cset;
		put_css_set(rcset);
	}

	/* Cgroup has to be killed so take down child immediately. */
	if (unlikely(kill))
		do_send_sig_info(SIGKILL, SEND_SIG_NOINFO, child, PIDTYPE_TGID);

	cgroup_css_set_put_fork(kargs);
}

6522

6523 /**

6524 * cgroup_exit - detach cgroup from exiting task

6525 * @tsk: pointer to task_struct of exiting process

6526 *

6527 * Description: Detach cgroup from @tsk.

6528 *

6529 */

6530 void cgroup_exit(struct task_struct *tsk)

6531 {

6532 struct cgroup_subsys *ss;

6533 struct css_set *cset;

6534 int i;

6535

6536 spin_lock_irq(&css_set_lock);

6537

6538 WARN_ON_ONCE(list_empty(&tsk->cg_list));

6539 cset = task_css_set(tsk);

6540 css_set_move_task(tsk, cset, NULL, false);

6541 list_add_tail(&tsk->cg_list, &cset->dying_tasks);

6542 cset->nr_tasks--;

6543

6544 WARN_ON_ONCE(cgroup_task_frozen(tsk));

6545 if (unlikely(!(tsk->flags & PF_KTHREAD) &&

6546 test_bit(CGRP_FREEZE, &task_dfl_cgroup(tsk)->flags)))

6547 cgroup_update_frozen(task_dfl_cgroup(tsk));

6548

6549 spin_unlock_irq(&css_set_lock);

6550

6551 /* see cgroup_post_fork() for details */

6552 do_each_subsys_mask(ss, i, have_exit_callback) {

6553 ss->exit(tsk);

6554 } while_each_subsys_mask();

6555 }

6556

6557 void cgroup_release(struct task_struct *task)

6558 {

6559 struct cgroup_subsys *ss;

6560 int ssid;

6561

6562 do_each_subsys_mask(ss, ssid, have_release_callback) {

6563 ss->release(task);

6564 } while_each_subsys_mask();

6565

6566 spin_lock_irq(&css_set_lock);

6567 css_set_skip_task_iters(task_css_set(task), task);

6568 list_del_init(&task->cg_list);

6569 spin_unlock_irq(&css_set_lock);

6570 }

6571

/*
 * cgroup_free - drop @task's reference on its css_set
 * @task: the task being freed
 */
void cgroup_free(struct task_struct *task)
{
	put_css_set(task_css_set(task));
}

6577

6578 static int __init cgroup_disable(char *str)

6579 {

6580 struct cgroup_subsys *ss;

6581 char *token;

6582 int i;

6583

6584 while ((token = strsep(&str, ",")) != NULL) {

6585 if (!*token)

6586 continue;

6587

6588 for_each_subsys(ss, i) {

6589 if (strcmp(token, ss->name) &&

6590 strcmp(token, ss->legacy_name))

6591 continue;

6592

6593 static_branch_disable(cgroup_subsys_enabled_key[i]);

6594 pr_info("Disabling %s control group subsystem\n",

6595 ss->name);

6596 }

6597

6598 for (i = 0; i < OPT_FEATURE_COUNT; i++) {

6599 if (strcmp(token, cgroup_opt_feature_names[i]))

6600 continue;

6601 cgroup_feature_disable_mask |= 1 << i;

6602 pr_info("Disabling %s control group feature\n",

6603 cgroup_opt_feature_names[i]);

6604 break;

6605 }

6606 }

6607 return 1;

6608 }

6609 __setup("cgroup_disable=", cgroup_disable);

6610

/*
 * Weak, empty default for the hook called by enable_cgroup_debug() below.
 * NOTE(review): presumably overridden by a non-weak definition elsewhere
 * when the debug controller is built - confirm.
 */
void __init __weak enable_debug_cgroup(void) { }

/*
 * Handler for the "cgroup_debug" boot parameter: sets the cgroup_debug
 * flag and calls enable_debug_cgroup().  Always returns 1.
 */
static int __init enable_cgroup_debug(char *str)
{
	cgroup_debug = true;
	enable_debug_cgroup();
	return 1;
}
__setup("cgroup_debug", enable_cgroup_debug);

6620

6621 /**

6622 * css_tryget_online_from_dir - get corresponding css from a cgroup dentry

6623 * @dentry: directory dentry of interest

6624 * @ss: subsystem of interest

6625 *

6626 * If @dentry is a directory for a cgroup which has @ss enabled on it, try

6627 * to get the corresponding css and return it. If such css doesn't exist

6628 * or can't be pinned, an ERR_PTR value is returned.

6629 */

6630 struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,

6631 struct cgroup_subsys *ss)

6632 {

6633 struct kernfs_node *kn = kernfs_node_from_dentry(dentry);

6634 struct file_system_type *s_type = dentry->d_sb->s_type;

6635 struct cgroup_subsys_state *css = NULL;

6636 struct cgroup *cgrp;

6637

6638 /* is @dentry a cgroup dir? */

6639 if ((s_type != &cgroup_fs_type && s_type != &cgroup2_fs_type) ||

6640 !kn || kernfs_type(kn) != KERNFS_DIR)

6641 return ERR_PTR(-EBADF);

6642

6643 rcu_read_lock();

6644

6645 /*

6646 * This path doesn't originate from kernfs and @kn could already

6647 * have been or be removed at any point. @kn->priv is RCU

6648 * protected for this access. See css_release_work_fn() for details.

6649 */

6650 cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);

6651 if (cgrp)

6652 css = cgroup_css(cgrp, ss);

6653

6654 if (!css || !css_tryget_online(css))

6655 css = ERR_PTR(-ENOENT);

6656

6657 rcu_read_unlock();

6658 return css;

6659 }

6660

6661 /**

6662 * css_from_id - lookup css by id

6663 * @id: the cgroup id

6664 * @ss: cgroup subsys to be looked into

6665 *

6666 * Returns the css if there's valid one with @id, otherwise returns NULL.

6667 * Should be called under rcu_read_lock().

6668 */

6669 struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)

6670 {

6671 WARN_ON_ONCE(!rcu_read_lock_held());

6672 return idr_find(&ss->css_idr, id);

6673 }

6674

6675 /**

6676 * cgroup_get_from_path - lookup and get a cgroup from its default hierarchy path

6677 * @path: path on the default hierarchy

6678 *

6679 * Find the cgroup at @path on the default hierarchy, increment its

6680 * reference count and return it. Returns pointer to the found cgroup on

6681 * success, ERR_PTR(-ENOENT) if @path doesn't exist or if the cgroup has already

6682 * been released and ERR_PTR(-ENOTDIR) if @path points to a non-directory.

6683 */

6684 struct cgroup *cgroup_get_from_path(const char *path)

6685 {

6686 struct kernfs_node *kn;

6687 struct cgroup *cgrp = ERR_PTR(-ENOENT);

6688 struct cgroup *root_cgrp;

6689

6690 spin_lock_irq(&css_set_lock);

6691 root_cgrp = current_cgns_cgroup_from_root(&cgrp_dfl_root);

6692 kn = kernfs_walk_and_get(root_cgrp->kn, path);

6693 spin_unlock_irq(&css_set_lock);

6694 if (!kn)

6695 goto out;

6696

6697 if (kernfs_type(kn) != KERNFS_DIR) {

6698 cgrp = ERR_PTR(-ENOTDIR);

6699 goto out_kernfs;

6700 }

6701

6702 rcu_read_lock();

6703

6704 cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);

6705 if (!cgrp || !cgroup_tryget(cgrp))

6706 cgrp = ERR_PTR(-ENOENT);

6707

6708 rcu_read_unlock();

6709

6710 out_kernfs:

6711 kernfs_put(kn);

6712 out:

6713 return cgrp;

6714 }

6715 EXPORT_SYMBOL_GPL(cgroup_get_from_path);

6716

6717 /**

6718 * cgroup_get_from_fd - get a cgroup pointer from a fd

6719 * @fd: fd obtained by open(cgroup2_dir)

6720 *

6721 * Find the cgroup from a fd which should be obtained

6722 * by opening a cgroup directory. Returns a pointer to the

6723 * cgroup on success. ERR_PTR is returned if the cgroup

6724 * cannot be found.

6725 */

6726 struct cgroup *cgroup_get_from_fd(int fd)

6727 {

6728 struct cgroup *cgrp;

6729 struct file *f;

6730

6731 f = fget_raw(fd);

6732 if (!f)

6733 return ERR_PTR(-EBADF);

6734

6735 cgrp = cgroup_get_from_file(f);

6736 fput(f);

6737 return cgrp;

6738 }

6739 EXPORT_SYMBOL_GPL(cgroup_get_from_fd);

6740

/*
 * power_of_ten - compute 10^@power as a u64
 * @power: exponent
 *
 * Returns 1 for @power <= 0.  A negative exponent previously made the
 * "while (power--)" loop run until the counter wrapped; it is now treated
 * like zero.  The result wraps modulo 2^64 once 10^@power no longer fits
 * in a u64 (@power > 19).
 */
static u64 power_of_ten(int power)
{
	u64 v = 1;

	while (power-- > 0)
		v *= 10;
	return v;
}

6748

6749 /**

6750 * cgroup_parse_float - parse a floating number

6751 * @input: input string

6752 * @dec_shift: number of decimal digits to shift

6753 * @v: output

6754 *

6755 * Parse a decimal floating point number in @input and store the result in

6756 * @v with decimal point right shifted @dec_shift times. For example, if

6757 * @input is "12.3456" and @dec_shift is 3, *@v will be set to 12345.

6758 * Returns 0 on success, -errno otherwise.

6759 *

6760 * There's nothing cgroup specific about this function except that it's

6761 * currently the only user.

6762 */

6763 int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v)

6764 {

6765 s64 whole, frac = 0;

6766 int fstart = 0, fend = 0, flen;

6767

6768 if (!sscanf(input, "%lld.%n%lld%n", &whole, &fstart, &frac, &fend))

6769 return -EINVAL;

6770 if (frac < 0)

6771 return -EINVAL;

6772

6773 flen = fend > fstart ? fend - fstart : 0;

6774 if (flen < dec_shift)

6775 frac *= power_of_ten(dec_shift - flen);

6776 else

6777 frac = DIV_ROUND_CLOSEST_ULL(frac, power_of_ten(flen - dec_shift));

6778

6779 *v = whole * power_of_ten(dec_shift) + frac;

6780 return 0;

6781 }

6782

6783 /*

6784 * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data

6785 * definition in cgroup-defs.h.

6786 */

6787 #ifdef CONFIG_SOCK_CGROUP_DATA

6788

6789 void cgroup_sk_alloc(struct sock_cgroup_data *skcd)

6790 {

6791 struct cgroup *cgroup;

6792

6793 rcu_read_lock();

6794 /* Don't associate the sock with unrelated interrupted task's cgroup. */

6795 if (in_interrupt()) {

6796 cgroup = &cgrp_dfl_root.cgrp;

6797 cgroup_get(cgroup);

6798 goto out;

6799 }

6800

6801 while (true) {

6802 struct css_set *cset;

6803

6804 cset = task_css_set(current);

6805 if (likely(cgroup_tryget(cset->dfl_cgrp))) {

6806 cgroup = cset->dfl_cgrp;

6807 break;

6808 }

6809 cpu_relax();

6810 }

6811 out:

6812 skcd->cgroup = cgroup;

6813 cgroup_bpf_get(cgroup);

6814 rcu_read_unlock();

6815 }

6816

/*
 * cgroup_sk_clone - take the cgroup references for a cloned socket
 * @skcd: cgroup data of the socket being cloned
 *
 * Takes one cgroup reference and one cgroup_bpf reference on the cgroup
 * recorded in @skcd.
 */
void cgroup_sk_clone(struct sock_cgroup_data *skcd)
{
	struct cgroup *sk_cgrp = sock_cgroup_ptr(skcd);

	/*
	 * We might be cloning a socket which is left in an empty
	 * cgroup and the cgroup might have already been rmdir'd.
	 * Don't use cgroup_get_live().
	 */
	cgroup_get(sk_cgrp);
	cgroup_bpf_get(sk_cgrp);
}

6829

/*
 * cgroup_sk_free - drop the cgroup references held by a socket
 * @skcd: cgroup data of the socket being freed
 *
 * Drops the cgroup_bpf reference first and the cgroup reference second.
 */
void cgroup_sk_free(struct sock_cgroup_data *skcd)
{
	struct cgroup *sk_cgrp = sock_cgroup_ptr(skcd);

	cgroup_bpf_put(sk_cgrp);
	cgroup_put(sk_cgrp);
}

6837

6838 #endif /* CONFIG_SOCK_CGROUP_DATA */

6839

6840 #ifdef CONFIG_SYSFS

6841 static ssize_t show_delegatable_files(struct cftype *files, char *buf,

6842 ssize_t size, const char *prefix)

6843 {

6844 struct cftype *cft;

6845 ssize_t ret = 0;

6846

6847 for (cft = files; cft && cft->name[0] != '\0'; cft++) {

6848 if (!(cft->flags & CFTYPE_NS_DELEGATABLE))

6849 continue;

6850

6851 if (prefix)

6852 ret += snprintf(buf + ret, size - ret, "%s.", prefix);

6853

6854 ret += snprintf(buf + ret, size - ret, "%s\n", cft->name);

6855

6856 if (WARN_ON(ret >= size))

6857 break;

6858 }

6859

6860 return ret;

6861 }

6862

6863 static ssize_t delegate_show(struct kobject *kobj, struct kobj_attribute *attr,

6864 char *buf)

6865 {

6866 struct cgroup_subsys *ss;

6867 int ssid;

6868 ssize_t ret = 0;

6869

6870 ret = show_delegatable_files(cgroup_base_files, buf + ret,

6871 PAGE_SIZE - ret, NULL);

6872 if (cgroup_psi_enabled())

6873 ret += show_delegatable_files(cgroup_psi_files, buf + ret,

6874 PAGE_SIZE - ret, NULL);

6875

6876 for_each_subsys(ss, ssid)

6877 ret += show_delegatable_files(ss->dfl_cftypes, buf + ret,

6878 PAGE_SIZE - ret,

6879 cgroup_subsys_name[ssid]);

6880

6881 return ret;

6882 }

6883 static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate);

6884

6885 static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,

6886 char *buf)

6887 {

6888 return snprintf(buf, PAGE_SIZE,

6889 "nsdelegate\n"

6890 "favordynmods\n"

6891 "memory_localevents\n"

6892 "memory_recursiveprot\n");

6893 }

6894 static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);

6895

/* NULL-terminated list of the attributes placed in the "cgroup" group. */
static struct attribute *cgroup_sysfs_attrs[] = {
	&cgroup_delegate_attr.attr,
	&cgroup_features_attr.attr,
	NULL,
};

/* Attribute group named "cgroup" holding the attributes above. */
static const struct attribute_group cgroup_sysfs_attr_group = {
	.attrs = cgroup_sysfs_attrs,
	.name = "cgroup",
};

/*
 * Create the "cgroup" attribute group under kernel_kobj.
 * Returns the result of sysfs_create_group().
 */
static int __init cgroup_sysfs_init(void)
{
	return sysfs_create_group(kernel_kobj, &cgroup_sysfs_attr_group);
}
subsys_initcall(cgroup_sysfs_init);

6912

6913 #endif /* CONFIG_SYSFS */