]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/core/cgroup.c
util: remove path_get_parent(), in favour of dirname_malloc()
[thirdparty/systemd.git] / src / core / cgroup.c
CommitLineData
d6c9574f 1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
8e274523
LP
2
3/***
4 This file is part of systemd.
5
4ad49000 6 Copyright 2013 Lennart Poettering
8e274523
LP
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
8e274523
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
8e274523 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
8e274523
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
c6c18be3 22#include <fcntl.h>
e41969e3 23#include <fnmatch.h>
8c6db833 24
03a7b521 25#include "cgroup-util.h"
3ffd4af2
LP
26#include "cgroup.h"
27#include "fd-util.h"
6bedfcbb 28#include "parse-util.h"
9eb977db 29#include "path-util.h"
03a7b521 30#include "process-util.h"
9444b1f2 31#include "special.h"
07630cea 32#include "string-util.h"
8e274523 33
9a054909
LP
34#define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
35
4ad49000
LP
36void cgroup_context_init(CGroupContext *c) {
37 assert(c);
38
39 /* Initialize everything to the kernel defaults, assuming the
40 * structure is preinitialized to 0 */
41
d53d9474
LP
42 c->cpu_shares = CGROUP_CPU_SHARES_INVALID;
43 c->startup_cpu_shares = CGROUP_CPU_SHARES_INVALID;
44 c->cpu_quota_per_sec_usec = USEC_INFINITY;
45
ddca82ac 46 c->memory_limit = (uint64_t) -1;
b2f8b02e 47
d53d9474
LP
48 c->blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
49 c->startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
50
51 c->tasks_max = (uint64_t) -1;
32ee7d33
DM
52
53 c->netclass_type = CGROUP_NETCLASS_TYPE_NONE;
4ad49000 54}
8e274523 55
4ad49000
LP
56void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
57 assert(c);
58 assert(a);
59
71fda00f 60 LIST_REMOVE(device_allow, c->device_allow, a);
4ad49000
LP
61 free(a->path);
62 free(a);
63}
64
65void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
66 assert(c);
67 assert(w);
68
71fda00f 69 LIST_REMOVE(device_weights, c->blockio_device_weights, w);
4ad49000
LP
70 free(w->path);
71 free(w);
72}
73
74void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
75 assert(c);
8e274523 76 assert(b);
8e274523 77
71fda00f 78 LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
4ad49000
LP
79 free(b->path);
80 free(b);
81}
82
83void cgroup_context_done(CGroupContext *c) {
84 assert(c);
85
86 while (c->blockio_device_weights)
87 cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
88
89 while (c->blockio_device_bandwidths)
90 cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);
91
92 while (c->device_allow)
93 cgroup_context_free_device_allow(c, c->device_allow);
94}
95
96void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
97 CGroupBlockIODeviceBandwidth *b;
98 CGroupBlockIODeviceWeight *w;
99 CGroupDeviceAllow *a;
9a054909 100 char u[FORMAT_TIMESPAN_MAX];
4ad49000
LP
101
102 assert(c);
103 assert(f);
104
105 prefix = strempty(prefix);
106
107 fprintf(f,
108 "%sCPUAccounting=%s\n"
109 "%sBlockIOAccounting=%s\n"
110 "%sMemoryAccounting=%s\n"
d53d9474
LP
111 "%sTasksAccounting=%s\n"
112 "%sCPUShares=%" PRIu64 "\n"
113 "%sStartupCPUShares=%" PRIu64 "\n"
b2f8b02e 114 "%sCPUQuotaPerSecSec=%s\n"
d53d9474
LP
115 "%sBlockIOWeight=%" PRIu64 "\n"
116 "%sStartupBlockIOWeight=%" PRIu64 "\n"
4ad49000 117 "%sMemoryLimit=%" PRIu64 "\n"
03a7b521 118 "%sTasksMax=%" PRIu64 "\n"
a931ad47
LP
119 "%sDevicePolicy=%s\n"
120 "%sDelegate=%s\n",
4ad49000
LP
121 prefix, yes_no(c->cpu_accounting),
122 prefix, yes_no(c->blockio_accounting),
123 prefix, yes_no(c->memory_accounting),
d53d9474 124 prefix, yes_no(c->tasks_accounting),
4ad49000 125 prefix, c->cpu_shares,
95ae05c0 126 prefix, c->startup_cpu_shares,
b1d6dcf5 127 prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
4ad49000 128 prefix, c->blockio_weight,
95ae05c0 129 prefix, c->startup_blockio_weight,
4ad49000 130 prefix, c->memory_limit,
03a7b521 131 prefix, c->tasks_max,
a931ad47
LP
132 prefix, cgroup_device_policy_to_string(c->device_policy),
133 prefix, yes_no(c->delegate));
4ad49000
LP
134
135 LIST_FOREACH(device_allow, a, c->device_allow)
136 fprintf(f,
137 "%sDeviceAllow=%s %s%s%s\n",
138 prefix,
139 a->path,
140 a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
141
142 LIST_FOREACH(device_weights, w, c->blockio_device_weights)
143 fprintf(f,
d53d9474 144 "%sBlockIODeviceWeight=%s %" PRIu64,
4ad49000
LP
145 prefix,
146 w->path,
147 w->weight);
148
149 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
150 char buf[FORMAT_BYTES_MAX];
151
152 fprintf(f,
153 "%s%s=%s %s\n",
154 prefix,
155 b->read ? "BlockIOReadBandwidth" : "BlockIOWriteBandwidth",
156 b->path,
157 format_bytes(buf, sizeof(buf), b->bandwidth));
158 }
159}
160
/* Resolve path p to the block device dev_t to use for blkio attributes.
 * If p is a block device node, its rdev is used directly; if it is a
 * regular file on a local file system, the backing (whole-disk) device is
 * used instead. Returns 0 on success, negative errno on failure. */
static int lookup_blkio_device(const char *p, dev_t *dev) {
        struct stat st;

        assert(p);
        assert(dev);

        if (stat(p, &st) < 0)
                return log_warning_errno(errno, "Couldn't stat device %s: %m", p);

        if (S_ISBLK(st.st_mode)) {
                *dev = st.st_rdev;
                return 0;
        }

        if (major(st.st_dev) == 0) {
                log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
                return -ENODEV;
        }

        /* Not a device node: use the block device the file lives on, and if
         * that is a partition, try to resolve the originating whole disk. */
        *dev = st.st_dev;
        block_get_whole_disk(*dev, dev);

        return 0;
}
189
4ad49000
LP
190static int whitelist_device(const char *path, const char *node, const char *acc) {
191 char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
192 struct stat st;
8c6db833 193 int r;
8e274523 194
4ad49000
LP
195 assert(path);
196 assert(acc);
8e274523 197
4ad49000
LP
198 if (stat(node, &st) < 0) {
199 log_warning("Couldn't stat device %s", node);
200 return -errno;
201 }
202
203 if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
204 log_warning("%s is not a device.", node);
205 return -ENODEV;
206 }
207
208 sprintf(buf,
209 "%c %u:%u %s",
210 S_ISCHR(st.st_mode) ? 'c' : 'b',
211 major(st.st_rdev), minor(st.st_rdev),
212 acc);
213
214 r = cg_set_attribute("devices", path, "devices.allow", buf);
1aeab12b 215 if (r < 0)
714e2e1d
LP
216 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
217 "Failed to set devices.allow on %s: %m", path);
4ad49000
LP
218
219 return r;
8e274523
LP
220}
221
90060676
LP
222static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
223 _cleanup_fclose_ FILE *f = NULL;
224 char line[LINE_MAX];
225 bool good = false;
226 int r;
227
228 assert(path);
229 assert(acc);
230 assert(type == 'b' || type == 'c');
231
232 f = fopen("/proc/devices", "re");
4a62c710
MS
233 if (!f)
234 return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);
90060676
LP
235
236 FOREACH_LINE(line, f, goto fail) {
237 char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
238 unsigned maj;
239
240 truncate_nl(line);
241
242 if (type == 'c' && streq(line, "Character devices:")) {
243 good = true;
244 continue;
245 }
246
247 if (type == 'b' && streq(line, "Block devices:")) {
248 good = true;
249 continue;
250 }
251
252 if (isempty(line)) {
253 good = false;
254 continue;
255 }
256
257 if (!good)
258 continue;
259
260 p = strstrip(line);
261
262 w = strpbrk(p, WHITESPACE);
263 if (!w)
264 continue;
265 *w = 0;
266
267 r = safe_atou(p, &maj);
268 if (r < 0)
269 continue;
270 if (maj <= 0)
271 continue;
272
273 w++;
274 w += strspn(w, WHITESPACE);
e41969e3
LP
275
276 if (fnmatch(name, w, 0) != 0)
90060676
LP
277 continue;
278
279 sprintf(buf,
280 "%c %u:* %s",
281 type,
282 maj,
283 acc);
284
285 r = cg_set_attribute("devices", path, "devices.allow", buf);
1aeab12b 286 if (r < 0)
714e2e1d
LP
287 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
288 "Failed to set devices.allow on %s: %m", path);
90060676
LP
289 }
290
291 return 0;
292
293fail:
56f64d95 294 log_warning_errno(errno, "Failed to read /proc/devices: %m");
90060676
LP
295 return -errno;
296}
297
32ee7d33 298void cgroup_context_apply(CGroupContext *c, CGroupMask mask, const char *path, uint32_t netclass, ManagerState state) {
01efdf13 299 bool is_root;
4ad49000
LP
300 int r;
301
302 assert(c);
303 assert(path);
8e274523 304
4ad49000
LP
305 if (mask == 0)
306 return;
8e274523 307
71c26873 308 /* Some cgroup attributes are not supported on the root cgroup,
01efdf13
LP
309 * hence silently ignore */
310 is_root = isempty(path) || path_equal(path, "/");
6da13913
ZJS
311 if (is_root)
312 /* Make sure we don't try to display messages with an empty path. */
313 path = "/";
01efdf13 314
714e2e1d
LP
315 /* We generally ignore errors caused by read-only mounted
316 * cgroup trees (assuming we are running in a container then),
317 * and missing cgroups, i.e. EROFS and ENOENT. */
318
efdb0237 319 if ((mask & CGROUP_MASK_CPU) && !is_root) {
d53d9474 320 char buf[MAX(DECIMAL_STR_MAX(uint64_t), DECIMAL_STR_MAX(usec_t)) + 1];
8e274523 321
d53d9474
LP
322 sprintf(buf, "%" PRIu64 "\n",
323 IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID ? c->startup_cpu_shares :
324 c->cpu_shares != CGROUP_CPU_SHARES_INVALID ? c->cpu_shares : CGROUP_CPU_SHARES_DEFAULT);
4ad49000 325 r = cg_set_attribute("cpu", path, "cpu.shares", buf);
1aeab12b 326 if (r < 0)
714e2e1d
LP
327 log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
328 "Failed to set cpu.shares on %s: %m", path);
b2f8b02e 329
9a054909 330 sprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
b2f8b02e 331 r = cg_set_attribute("cpu", path, "cpu.cfs_period_us", buf);
1aeab12b 332 if (r < 0)
714e2e1d
LP
333 log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
334 "Failed to set cpu.cfs_period_us on %s: %m", path);
b2f8b02e 335
3a43da28 336 if (c->cpu_quota_per_sec_usec != USEC_INFINITY) {
9a054909 337 sprintf(buf, USEC_FMT "\n", c->cpu_quota_per_sec_usec * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
b2f8b02e
LP
338 r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", buf);
339 } else
340 r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", "-1");
1aeab12b 341 if (r < 0)
714e2e1d
LP
342 log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
343 "Failed to set cpu.cfs_quota_us on %s: %m", path);
4ad49000
LP
344 }
345
efdb0237 346 if (mask & CGROUP_MASK_BLKIO) {
d53d9474
LP
347 char buf[MAX(DECIMAL_STR_MAX(uint64_t)+1,
348 DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
4ad49000
LP
349 CGroupBlockIODeviceWeight *w;
350 CGroupBlockIODeviceBandwidth *b;
351
01efdf13 352 if (!is_root) {
d53d9474
LP
353 sprintf(buf, "%" PRIu64 "\n",
354 IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ? c->startup_blockio_weight :
355 c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ? c->blockio_weight : CGROUP_BLKIO_WEIGHT_DEFAULT);
01efdf13 356 r = cg_set_attribute("blkio", path, "blkio.weight", buf);
1aeab12b 357 if (r < 0)
714e2e1d
LP
358 log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
359 "Failed to set blkio.weight on %s: %m", path);
4ad49000 360
01efdf13
LP
361 /* FIXME: no way to reset this list */
362 LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
363 dev_t dev;
4ad49000 364
01efdf13
LP
365 r = lookup_blkio_device(w->path, &dev);
366 if (r < 0)
367 continue;
8e274523 368
d53d9474 369 sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), w->weight);
01efdf13 370 r = cg_set_attribute("blkio", path, "blkio.weight_device", buf);
1aeab12b 371 if (r < 0)
714e2e1d
LP
372 log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
373 "Failed to set blkio.weight_device on %s: %m", path);
01efdf13 374 }
4ad49000
LP
375 }
376
377 /* FIXME: no way to reset this list */
378 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
379 const char *a;
380 dev_t dev;
381
382 r = lookup_blkio_device(b->path, &dev);
383 if (r < 0)
384 continue;
385
386 a = b->read ? "blkio.throttle.read_bps_device" : "blkio.throttle.write_bps_device";
387
388 sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), b->bandwidth);
389 r = cg_set_attribute("blkio", path, a, buf);
1aeab12b 390 if (r < 0)
714e2e1d
LP
391 log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
392 "Failed to set %s on %s: %m", a, path);
d686d8a9 393 }
8e274523
LP
394 }
395
efdb0237 396 if ((mask & CGROUP_MASK_MEMORY) && !is_root) {
6a94f2e9 397 if (c->memory_limit != (uint64_t) -1) {
e58cec11
LP
398 char buf[DECIMAL_STR_MAX(uint64_t) + 1];
399
6a94f2e9 400 sprintf(buf, "%" PRIu64 "\n", c->memory_limit);
efdb0237
LP
401
402 if (cg_unified() <= 0)
403 r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
404 else
405 r = cg_set_attribute("memory", path, "memory.max", buf);
406
407 } else {
408 if (cg_unified() <= 0)
409 r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");
410 else
411 r = cg_set_attribute("memory", path, "memory.max", "max");
412 }
8e274523 413
1aeab12b 414 if (r < 0)
714e2e1d 415 log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
efdb0237 416 "Failed to set memory.limit_in_bytes/memory.max on %s: %m", path);
4ad49000 417 }
8e274523 418
3905f127 419 if ((mask & CGROUP_MASK_DEVICES) && !is_root) {
4ad49000 420 CGroupDeviceAllow *a;
8e274523 421
714e2e1d
LP
422 /* Changing the devices list of a populated cgroup
423 * might result in EINVAL, hence ignore EINVAL
424 * here. */
425
4ad49000
LP
426 if (c->device_allow || c->device_policy != CGROUP_AUTO)
427 r = cg_set_attribute("devices", path, "devices.deny", "a");
428 else
429 r = cg_set_attribute("devices", path, "devices.allow", "a");
1aeab12b 430 if (r < 0)
714e2e1d
LP
431 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
432 "Failed to reset devices.list on %s: %m", path);
fb385181 433
4ad49000
LP
434 if (c->device_policy == CGROUP_CLOSED ||
435 (c->device_policy == CGROUP_AUTO && c->device_allow)) {
436 static const char auto_devices[] =
7d711efb
LP
437 "/dev/null\0" "rwm\0"
438 "/dev/zero\0" "rwm\0"
439 "/dev/full\0" "rwm\0"
440 "/dev/random\0" "rwm\0"
441 "/dev/urandom\0" "rwm\0"
442 "/dev/tty\0" "rwm\0"
443 "/dev/pts/ptmx\0" "rw\0"; /* /dev/pts/ptmx may not be duplicated, but accessed */
4ad49000
LP
444
445 const char *x, *y;
446
447 NULSTR_FOREACH_PAIR(x, y, auto_devices)
448 whitelist_device(path, x, y);
7d711efb
LP
449
450 whitelist_major(path, "pts", 'c', "rw");
451 whitelist_major(path, "kdbus", 'c', "rw");
452 whitelist_major(path, "kdbus/*", 'c', "rw");
4ad49000
LP
453 }
454
455 LIST_FOREACH(device_allow, a, c->device_allow) {
456 char acc[4];
457 unsigned k = 0;
458
459 if (a->r)
460 acc[k++] = 'r';
461 if (a->w)
462 acc[k++] = 'w';
463 if (a->m)
464 acc[k++] = 'm';
fb385181 465
4ad49000
LP
466 if (k == 0)
467 continue;
fb385181 468
4ad49000 469 acc[k++] = 0;
90060676
LP
470
471 if (startswith(a->path, "/dev/"))
472 whitelist_device(path, a->path, acc);
473 else if (startswith(a->path, "block-"))
474 whitelist_major(path, a->path + 6, 'b', acc);
475 else if (startswith(a->path, "char-"))
476 whitelist_major(path, a->path + 5, 'c', acc);
477 else
478 log_debug("Ignoring device %s while writing cgroup attribute.", a->path);
4ad49000
LP
479 }
480 }
03a7b521
LP
481
482 if ((mask & CGROUP_MASK_PIDS) && !is_root) {
483
484 if (c->tasks_max != (uint64_t) -1) {
485 char buf[DECIMAL_STR_MAX(uint64_t) + 2];
486
487 sprintf(buf, "%" PRIu64 "\n", c->tasks_max);
488 r = cg_set_attribute("pids", path, "pids.max", buf);
489 } else
490 r = cg_set_attribute("pids", path, "pids.max", "max");
491
492 if (r < 0)
493 log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
494 "Failed to set pids.max on %s: %m", path);
495 }
32ee7d33
DM
496
497 if (mask & CGROUP_MASK_NET_CLS) {
498 char buf[DECIMAL_STR_MAX(uint32_t)];
499
500 sprintf(buf, "%" PRIu32, netclass);
501
502 r = cg_set_attribute("net_cls", path, "net_cls.classid", buf);
503 if (r < 0)
504 log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
505 "Failed to set net_cls.classid on %s: %m", path);
506 }
fb385181
LP
507}
508
efdb0237
LP
509CGroupMask cgroup_context_get_mask(CGroupContext *c) {
510 CGroupMask mask = 0;
8e274523 511
4ad49000 512 /* Figure out which controllers we need */
8e274523 513
b2f8b02e 514 if (c->cpu_accounting ||
d53d9474
LP
515 c->cpu_shares != CGROUP_CPU_SHARES_INVALID ||
516 c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID ||
3a43da28 517 c->cpu_quota_per_sec_usec != USEC_INFINITY)
efdb0237 518 mask |= CGROUP_MASK_CPUACCT | CGROUP_MASK_CPU;
ecedd90f 519
4ad49000 520 if (c->blockio_accounting ||
d53d9474
LP
521 c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
522 c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
4ad49000 523 c->blockio_device_weights ||
db785129 524 c->blockio_device_bandwidths)
efdb0237 525 mask |= CGROUP_MASK_BLKIO;
ecedd90f 526
4ad49000 527 if (c->memory_accounting ||
ddca82ac 528 c->memory_limit != (uint64_t) -1)
efdb0237 529 mask |= CGROUP_MASK_MEMORY;
8e274523 530
a931ad47
LP
531 if (c->device_allow ||
532 c->device_policy != CGROUP_AUTO)
3905f127 533 mask |= CGROUP_MASK_DEVICES;
4ad49000 534
03a7b521
LP
535 if (c->tasks_accounting ||
536 c->tasks_max != (uint64_t) -1)
537 mask |= CGROUP_MASK_PIDS;
538
32ee7d33
DM
539 if (c->netclass_type != CGROUP_NETCLASS_TYPE_NONE)
540 mask |= CGROUP_MASK_NET_CLS;
541
4ad49000 542 return mask;
8e274523
LP
543}
544
efdb0237 545CGroupMask unit_get_own_mask(Unit *u) {
4ad49000 546 CGroupContext *c;
8e274523 547
efdb0237
LP
548 /* Returns the mask of controllers the unit needs for itself */
549
4ad49000
LP
550 c = unit_get_cgroup_context(u);
551 if (!c)
552 return 0;
8e274523 553
a931ad47 554 /* If delegation is turned on, then turn on all cgroups,
19af675e
LP
555 * unless we are on the legacy hierarchy and the process we
556 * fork into it is known to drop privileges, and hence
557 * shouldn't get access to the controllers.
558 *
559 * Note that on the unified hierarchy it is safe to delegate
560 * controllers to unprivileged services. */
a931ad47
LP
561
562 if (c->delegate) {
563 ExecContext *e;
564
565 e = unit_get_exec_context(u);
19af675e
LP
566 if (!e ||
567 exec_context_maintains_privileges(e) ||
568 cg_unified() > 0)
efdb0237 569 return _CGROUP_MASK_ALL;
a931ad47
LP
570 }
571
db785129 572 return cgroup_context_get_mask(c);
8e274523
LP
573}
574
efdb0237 575CGroupMask unit_get_members_mask(Unit *u) {
4ad49000 576 assert(u);
bc432dc7 577
efdb0237
LP
578 /* Returns the mask of controllers all of the unit's children
579 * require, merged */
580
bc432dc7
LP
581 if (u->cgroup_members_mask_valid)
582 return u->cgroup_members_mask;
583
584 u->cgroup_members_mask = 0;
585
586 if (u->type == UNIT_SLICE) {
587 Unit *member;
588 Iterator i;
589
590 SET_FOREACH(member, u->dependencies[UNIT_BEFORE], i) {
591
592 if (member == u)
593 continue;
594
d4fdc205 595 if (UNIT_DEREF(member->slice) != u)
bc432dc7
LP
596 continue;
597
598 u->cgroup_members_mask |=
efdb0237 599 unit_get_own_mask(member) |
bc432dc7
LP
600 unit_get_members_mask(member);
601 }
602 }
603
604 u->cgroup_members_mask_valid = true;
6414b7c9 605 return u->cgroup_members_mask;
246aa6dd
LP
606}
607
efdb0237 608CGroupMask unit_get_siblings_mask(Unit *u) {
4ad49000 609 assert(u);
246aa6dd 610
efdb0237
LP
611 /* Returns the mask of controllers all of the unit's siblings
612 * require, i.e. the members mask of the unit's parent slice
613 * if there is one. */
614
bc432dc7 615 if (UNIT_ISSET(u->slice))
637f421e 616 return unit_get_members_mask(UNIT_DEREF(u->slice));
4ad49000 617
efdb0237 618 return unit_get_own_mask(u) | unit_get_members_mask(u);
246aa6dd
LP
619}
620
efdb0237
LP
621CGroupMask unit_get_subtree_mask(Unit *u) {
622
623 /* Returns the mask of this subtree, meaning of the group
624 * itself and its children. */
625
626 return unit_get_own_mask(u) | unit_get_members_mask(u);
627}
628
629CGroupMask unit_get_target_mask(Unit *u) {
630 CGroupMask mask;
631
632 /* This returns the cgroup mask of all controllers to enable
633 * for a specific cgroup, i.e. everything it needs itself,
634 * plus all that its children need, plus all that its siblings
635 * need. This is primarily useful on the legacy cgroup
636 * hierarchy, where we need to duplicate each cgroup in each
637 * hierarchy that shall be enabled for it. */
6414b7c9 638
efdb0237
LP
639 mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
640 mask &= u->manager->cgroup_supported;
641
642 return mask;
643}
644
645CGroupMask unit_get_enable_mask(Unit *u) {
646 CGroupMask mask;
647
648 /* This returns the cgroup mask of all controllers to enable
649 * for the children of a specific cgroup. This is primarily
650 * useful for the unified cgroup hierarchy, where each cgroup
651 * controls which controllers are enabled for its children. */
652
653 mask = unit_get_members_mask(u);
6414b7c9
DS
654 mask &= u->manager->cgroup_supported;
655
656 return mask;
657}
658
659/* Recurse from a unit up through its containing slices, propagating
660 * mask bits upward. A unit is also member of itself. */
bc432dc7 661void unit_update_cgroup_members_masks(Unit *u) {
efdb0237 662 CGroupMask m;
bc432dc7
LP
663 bool more;
664
665 assert(u);
666
667 /* Calculate subtree mask */
efdb0237 668 m = unit_get_subtree_mask(u);
bc432dc7
LP
669
670 /* See if anything changed from the previous invocation. If
671 * not, we're done. */
672 if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
673 return;
674
675 more =
676 u->cgroup_subtree_mask_valid &&
677 ((m & ~u->cgroup_subtree_mask) != 0) &&
678 ((~m & u->cgroup_subtree_mask) == 0);
679
680 u->cgroup_subtree_mask = m;
681 u->cgroup_subtree_mask_valid = true;
682
6414b7c9
DS
683 if (UNIT_ISSET(u->slice)) {
684 Unit *s = UNIT_DEREF(u->slice);
bc432dc7
LP
685
686 if (more)
687 /* There's more set now than before. We
688 * propagate the new mask to the parent's mask
689 * (not caring if it actually was valid or
690 * not). */
691
692 s->cgroup_members_mask |= m;
693
694 else
695 /* There's less set now than before (or we
696 * don't know), we need to recalculate
697 * everything, so let's invalidate the
698 * parent's members mask */
699
700 s->cgroup_members_mask_valid = false;
701
702 /* And now make sure that this change also hits our
703 * grandparents */
704 unit_update_cgroup_members_masks(s);
6414b7c9
DS
705 }
706}
707
efdb0237 708static const char *migrate_callback(CGroupMask mask, void *userdata) {
03b90d4b
LP
709 Unit *u = userdata;
710
711 assert(mask != 0);
712 assert(u);
713
714 while (u) {
715 if (u->cgroup_path &&
716 u->cgroup_realized &&
717 (u->cgroup_realized_mask & mask) == mask)
718 return u->cgroup_path;
719
720 u = UNIT_DEREF(u->slice);
721 }
722
723 return NULL;
724}
725
efdb0237
LP
726char *unit_default_cgroup_path(Unit *u) {
727 _cleanup_free_ char *escaped = NULL, *slice = NULL;
728 int r;
729
730 assert(u);
731
732 if (unit_has_name(u, SPECIAL_ROOT_SLICE))
733 return strdup(u->manager->cgroup_root);
734
735 if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) {
736 r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice);
737 if (r < 0)
738 return NULL;
739 }
740
741 escaped = cg_escape(u->id);
742 if (!escaped)
743 return NULL;
744
745 if (slice)
746 return strjoin(u->manager->cgroup_root, "/", slice, "/", escaped, NULL);
747 else
748 return strjoin(u->manager->cgroup_root, "/", escaped, NULL);
749}
750
751int unit_set_cgroup_path(Unit *u, const char *path) {
752 _cleanup_free_ char *p = NULL;
753 int r;
754
755 assert(u);
756
757 if (path) {
758 p = strdup(path);
759 if (!p)
760 return -ENOMEM;
761 } else
762 p = NULL;
763
764 if (streq_ptr(u->cgroup_path, p))
765 return 0;
766
767 if (p) {
768 r = hashmap_put(u->manager->cgroup_unit, p, u);
769 if (r < 0)
770 return r;
771 }
772
773 unit_release_cgroup(u);
774
775 u->cgroup_path = p;
776 p = NULL;
777
778 return 1;
779}
780
781int unit_watch_cgroup(Unit *u) {
782 _cleanup_free_ char *populated = NULL;
783 int r;
784
785 assert(u);
786
787 if (!u->cgroup_path)
788 return 0;
789
790 if (u->cgroup_inotify_wd >= 0)
791 return 0;
792
793 /* Only applies to the unified hierarchy */
794 r = cg_unified();
795 if (r < 0)
796 return log_unit_error_errno(u, r, "Failed detect wether the unified hierarchy is used: %m");
797 if (r == 0)
798 return 0;
799
800 /* Don't watch the root slice, it's pointless. */
801 if (unit_has_name(u, SPECIAL_ROOT_SLICE))
802 return 0;
803
804 r = hashmap_ensure_allocated(&u->manager->cgroup_inotify_wd_unit, &trivial_hash_ops);
805 if (r < 0)
806 return log_oom();
807
808 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.populated", &populated);
809 if (r < 0)
810 return log_oom();
811
812 u->cgroup_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, populated, IN_MODIFY);
813 if (u->cgroup_inotify_wd < 0) {
814
815 if (errno == ENOENT) /* If the directory is already
816 * gone we don't need to track
817 * it, so this is not an error */
818 return 0;
819
820 return log_unit_error_errno(u, errno, "Failed to add inotify watch descriptor for control group %s: %m", u->cgroup_path);
821 }
822
823 r = hashmap_put(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd), u);
824 if (r < 0)
825 return log_unit_error_errno(u, r, "Failed to add inotify watch descriptor to hash map: %m");
826
827 return 0;
828}
829
830static int unit_create_cgroup(
831 Unit *u,
832 CGroupMask target_mask,
833 CGroupMask enable_mask) {
834
0cd385d3 835 CGroupContext *c;
bc432dc7 836 int r;
64747e2d 837
4ad49000 838 assert(u);
64747e2d 839
0cd385d3
LP
840 c = unit_get_cgroup_context(u);
841 if (!c)
842 return 0;
843
7b3fd631
LP
844 if (!u->cgroup_path) {
845 _cleanup_free_ char *path = NULL;
64747e2d 846
7b3fd631
LP
847 path = unit_default_cgroup_path(u);
848 if (!path)
849 return log_oom();
850
efdb0237
LP
851 r = unit_set_cgroup_path(u, path);
852 if (r == -EEXIST)
853 return log_unit_error_errno(u, r, "Control group %s exists already.", path);
854 if (r < 0)
855 return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", path);
b58b8e11
HH
856 }
857
03b90d4b 858 /* First, create our own group */
efdb0237 859 r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
23bbb0de 860 if (r < 0)
efdb0237
LP
861 return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", u->cgroup_path);
862
863 /* Start watching it */
864 (void) unit_watch_cgroup(u);
865
866 /* Enable all controllers we need */
867 r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path);
868 if (r < 0)
869 log_unit_warning_errno(u, r, "Failed to enable controllers on cgroup %s, ignoring: %m", u->cgroup_path);
03b90d4b
LP
870
871 /* Keep track that this is now realized */
4ad49000 872 u->cgroup_realized = true;
efdb0237 873 u->cgroup_realized_mask = target_mask;
4ad49000 874
0cd385d3
LP
875 if (u->type != UNIT_SLICE && !c->delegate) {
876
877 /* Then, possibly move things over, but not if
878 * subgroups may contain processes, which is the case
879 * for slice and delegation units. */
880 r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
881 if (r < 0)
efdb0237 882 log_unit_warning_errno(u, r, "Failed to migrate cgroup from to %s, ignoring: %m", u->cgroup_path);
0cd385d3 883 }
03b90d4b 884
64747e2d
LP
885 return 0;
886}
887
7b3fd631
LP
888int unit_attach_pids_to_cgroup(Unit *u) {
889 int r;
890 assert(u);
891
892 r = unit_realize_cgroup(u);
893 if (r < 0)
894 return r;
895
896 r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->pids, migrate_callback, u);
897 if (r < 0)
898 return r;
899
900 return 0;
901}
902
efdb0237 903static bool unit_has_mask_realized(Unit *u, CGroupMask target_mask) {
bc432dc7
LP
904 assert(u);
905
efdb0237 906 return u->cgroup_realized && u->cgroup_realized_mask == target_mask;
6414b7c9
DS
907}
908
32ee7d33
DM
909static int unit_find_free_netclass_cgroup(Unit *u, uint32_t *ret) {
910
911 uint32_t start, i;
912 Manager *m;
913
914 assert(u);
915
916 m = u->manager;
917
918 i = start = m->cgroup_netclass_registry_last;
919
920 do {
921 i++;
922
923 if (!hashmap_get(m->cgroup_netclass_registry, UINT_TO_PTR(i))) {
924 m->cgroup_netclass_registry_last = i;
925 *ret = i;
926 return 0;
927 }
928
929 if (i == UINT32_MAX)
930 i = CGROUP_NETCLASS_FIXED_MAX;
931
932 } while (i != start);
933
934 return -ENOBUFS;
935}
936
937int unit_add_to_netclass_cgroup(Unit *u) {
938
939 CGroupContext *cc;
940 Unit *first;
941 void *key;
942 int r;
943
944 assert(u);
945
946 cc = unit_get_cgroup_context(u);
947 if (!cc)
948 return 0;
949
950 switch (cc->netclass_type) {
951 case CGROUP_NETCLASS_TYPE_NONE:
952 return 0;
953
954 case CGROUP_NETCLASS_TYPE_FIXED:
955 u->cgroup_netclass_id = cc->netclass_id;
956 break;
957
958 case CGROUP_NETCLASS_TYPE_AUTO:
959 /* Allocate a new ID in case it was requested and not done yet */
960 if (u->cgroup_netclass_id == 0) {
961 r = unit_find_free_netclass_cgroup(u, &u->cgroup_netclass_id);
962 if (r < 0)
963 return r;
964
965 log_debug("Dynamically assigned netclass cgroup id %" PRIu32 " to %s", u->cgroup_netclass_id, u->id);
966 }
967
968 break;
969 }
970
971 r = hashmap_ensure_allocated(&u->manager->cgroup_netclass_registry, &trivial_hash_ops);
972 if (r < 0)
973 return r;
974
975 key = UINT32_TO_PTR(u->cgroup_netclass_id);
976 first = hashmap_get(u->manager->cgroup_netclass_registry, key);
977
978 if (first) {
979 LIST_PREPEND(cgroup_netclass, first, u);
980 return hashmap_replace(u->manager->cgroup_netclass_registry, key, u);
981 }
982
983 return hashmap_put(u->manager->cgroup_netclass_registry, key, u);
984}
985
986int unit_remove_from_netclass_cgroup(Unit *u) {
987
988 Unit *head;
989 void *key;
990
991 assert(u);
992
993 key = UINT32_TO_PTR(u->cgroup_netclass_id);
994
995 LIST_FIND_HEAD(cgroup_netclass, u, head);
996 LIST_REMOVE(cgroup_netclass, head, u);
997
998 if (head)
999 return hashmap_replace(u->manager->cgroup_netclass_registry, key, head);
1000
1001 hashmap_remove(u->manager->cgroup_netclass_registry, key);
1002
1003 return 0;
1004}
1005
6414b7c9
DS
/* Check if necessary controllers and attributes for a unit are in place.
 *
 * If so, do nothing.
 * If not, create paths, move processes over, and set attributes.
 *
 * Returns 0 on success and < 0 on failure. */
static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
        CGroupMask target_mask, enable_mask;
        int r;

        assert(u);

        /* We are realizing this unit right now, so drop it from the
         * deferred-realization queue if it is queued. */
        if (u->in_cgroup_queue) {
                LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
                u->in_cgroup_queue = false;
        }

        /* Nothing to do if the cgroup already matches the controller mask
         * this unit needs. */
        target_mask = unit_get_target_mask(u);
        if (unit_has_mask_realized(u, target_mask))
                return 0;

        /* First, realize parents */
        if (UNIT_ISSET(u->slice)) {
                r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
                if (r < 0)
                        return r;
        }

        /* And then do the real work */
        enable_mask = unit_get_enable_mask(u);
        r = unit_create_cgroup(u, target_mask, enable_mask);
        if (r < 0)
                return r;

        /* Finally, apply the necessary attributes. */
        cgroup_context_apply(unit_get_cgroup_context(u), target_mask, u->cgroup_path, u->cgroup_netclass_id, state);

        return 0;
}
1045
4ad49000 1046static void unit_add_to_cgroup_queue(Unit *u) {
ecedd90f 1047
4ad49000
LP
1048 if (u->in_cgroup_queue)
1049 return;
8e274523 1050
71fda00f 1051 LIST_PREPEND(cgroup_queue, u->manager->cgroup_queue, u);
4ad49000
LP
1052 u->in_cgroup_queue = true;
1053}
8c6db833 1054
4ad49000 1055unsigned manager_dispatch_cgroup_queue(Manager *m) {
db785129 1056 ManagerState state;
4ad49000 1057 unsigned n = 0;
db785129 1058 Unit *i;
6414b7c9 1059 int r;
ecedd90f 1060
db785129
LP
1061 state = manager_state(m);
1062
4ad49000
LP
1063 while ((i = m->cgroup_queue)) {
1064 assert(i->in_cgroup_queue);
ecedd90f 1065
db785129 1066 r = unit_realize_cgroup_now(i, state);
6414b7c9 1067 if (r < 0)
efdb0237 1068 log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);
0a1eb06d 1069
4ad49000
LP
1070 n++;
1071 }
ecedd90f 1072
4ad49000 1073 return n;
8e274523
LP
1074}
1075
4ad49000
LP
static void unit_queue_siblings(Unit *u) {
        Unit *slice;

        /* This adds the siblings of the specified unit and the
         * siblings of all parent units to the cgroup queue. (But
         * neither the specified unit itself nor the parents.) */

        while ((slice = UNIT_DEREF(u->slice))) {
                Iterator i;
                Unit *m;

                /* The slice's UNIT_BEFORE dependency set contains all units
                 * ordered inside it, i.e. the candidates for siblings of 'u'. */
                SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
                        if (m == u)
                                continue;

                        /* Skip units that have a dependency on the slice
                         * but aren't actually in it. */
                        if (UNIT_DEREF(m->slice) != slice)
                                continue;

                        /* No point in doing cgroup application for units
                         * without active processes. */
                        if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
                                continue;

                        /* If the unit doesn't need any new controllers
                         * and has current ones realized, it doesn't need
                         * any changes. */
                        if (unit_has_mask_realized(m, unit_get_target_mask(m)))
                                continue;

                        unit_add_to_cgroup_queue(m);
                }

                /* Walk up one level and repeat for the parent slice's
                 * siblings. */
                u = slice;
        }
}
1113
int unit_realize_cgroup(Unit *u) {
        assert(u);

        /* Units without a cgroup context (e.g. device units) have nothing
         * to realize. */
        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return 0;

        /* So, here's the deal: when realizing the cgroups for this
         * unit, we need to first create all parents, but there's more
         * actually: for the weight-based controllers we also need to
         * make sure that all our siblings (i.e. units that are in the
         * same slice as we are) have cgroups, too. Otherwise, things
         * would become very uneven as each of their processes would
         * get as much resources as all our group together. This call
         * will synchronously create the parent cgroups, but will
         * defer work on the siblings to the next event loop
         * iteration. */

        /* Add all sibling slices to the cgroup queue. */
        unit_queue_siblings(u);

        /* And realize this one now (and apply the values) */
        return unit_realize_cgroup_now(u, manager_state(u->manager));
}
1137
efdb0237
LP
1138void unit_release_cgroup(Unit *u) {
1139 assert(u);
1140
1141 /* Forgets all cgroup details for this cgroup */
1142
1143 if (u->cgroup_path) {
1144 (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
1145 u->cgroup_path = mfree(u->cgroup_path);
1146 }
1147
1148 if (u->cgroup_inotify_wd >= 0) {
1149 if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_inotify_wd) < 0)
1150 log_unit_debug_errno(u, errno, "Failed to remove cgroup inotify watch %i for %s, ignoring", u->cgroup_inotify_wd, u->id);
1151
1152 (void) hashmap_remove(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd));
1153 u->cgroup_inotify_wd = -1;
1154 }
1155}
1156
void unit_prune_cgroup(Unit *u) {
        int r;
        bool is_root_slice;

        assert(u);

        /* Removes the cgroup, if empty and possible, and stops watching it. */

        if (!u->cgroup_path)
                return;

        /* The root slice's cgroup is only trimmed, never deleted, since
         * we ourselves live inside it. */
        is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);

        r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
        if (r < 0) {
                log_debug_errno(r, "Failed to destroy cgroup %s, ignoring: %m", u->cgroup_path);
                return;
        }

        if (is_root_slice)
                return;

        /* Forget the path, hashmap entry and inotify watch for the cgroup. */
        unit_release_cgroup(u);

        u->cgroup_realized = false;
        u->cgroup_realized_mask = 0;
}
1184
int unit_search_main_pid(Unit *u, pid_t *ret) {
        _cleanup_fclose_ FILE *f = NULL;
        pid_t pid = 0, npid, mypid;
        int r;

        assert(u);
        assert(ret);

        /* Tries to determine the unit's main PID by scanning its cgroup.
         * Succeeds only if at most one of our direct children lives there.
         * Returns -ENXIO if the unit has no cgroup, -ENODATA if the result
         * is ambiguous. On success *ret may be 0 if no candidate was found. */

        if (!u->cgroup_path)
                return -ENXIO;

        r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f);
        if (r < 0)
                return r;

        mypid = getpid();
        while (cg_read_pid(f, &npid) > 0) {
                pid_t ppid;

                /* Same PID listed again, skip. */
                if (npid == pid)
                        continue;

                /* Ignore processes that aren't our kids */
                if (get_parent_of_pid(npid, &ppid) >= 0 && ppid != mypid)
                        continue;

                if (pid != 0)
                        /* Dang, there's more than one daemonized PID
                        in this group, so we don't know what process
                        is the main process. */

                        return -ENODATA;

                pid = npid;
        }

        *ret = pid;
        return 0;
}
1224
static int unit_watch_pids_in_path(Unit *u, const char *path) {
        _cleanup_closedir_ DIR *d = NULL;
        _cleanup_fclose_ FILE *f = NULL;
        int ret = 0, r;

        assert(u);
        assert(path);

        /* Recursively adds every PID found at or below 'path' to the
         * unit's watched-PID set. The first error encountered is stored
         * in 'ret' but the walk continues, so that as many PIDs as
         * possible end up watched. */

        r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
        if (r < 0)
                ret = r;
        else {
                pid_t pid;

                while ((r = cg_read_pid(f, &pid)) > 0) {
                        r = unit_watch_pid(u, pid);
                        if (r < 0 && ret >= 0)
                                ret = r;
                }

                /* Record a read error from the final cg_read_pid(), too. */
                if (r < 0 && ret >= 0)
                        ret = r;
        }

        /* Then, recurse into all child cgroups. */
        r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
        if (r < 0) {
                if (ret >= 0)
                        ret = r;
        } else {
                char *fn;

                while ((r = cg_read_subgroup(d, &fn)) > 0) {
                        _cleanup_free_ char *p = NULL;

                        p = strjoin(path, "/", fn, NULL);
                        free(fn);

                        /* OOM is the one error we propagate immediately. */
                        if (!p)
                                return -ENOMEM;

                        r = unit_watch_pids_in_path(u, p);
                        if (r < 0 && ret >= 0)
                                ret = r;
                }

                if (r < 0 && ret >= 0)
                        ret = r;
        }

        return ret;
}
1276
int unit_watch_all_pids(Unit *u) {
        assert(u);

        /* Adds all PIDs from our cgroup to the set of PIDs we
         * watch. This is a fallback logic for cases where we do not
         * get reliable cgroup empty notifications: we try to use
         * SIGCHLD as replacement. */

        if (!u->cgroup_path)
                return -ENOENT;

        if (cg_unified() > 0) /* On unified we can use proper notifications */
                return 0;

        return unit_watch_pids_in_path(u, u->cgroup_path);
}
1293
1294int unit_notify_cgroup_empty(Unit *u) {
1295 int r;
1296
1297 assert(u);
1298
1299 if (!u->cgroup_path)
1300 return 0;
1301
1302 r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
1303 if (r <= 0)
1304 return r;
1305
1306 unit_add_to_gc_queue(u);
1307
1308 if (UNIT_VTABLE(u)->notify_cgroup_empty)
1309 UNIT_VTABLE(u)->notify_cgroup_empty(u);
1310
1311 return 0;
1312}
1313
static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
        Manager *m = userdata;

        assert(s);
        assert(fd >= 0);
        assert(m);

        /* Dispatches inotify events from the unified hierarchy: maps each
         * watch descriptor back to its unit and runs the cgroup-empty
         * logic for it. */

        for (;;) {
                union inotify_event_buffer buffer;
                struct inotify_event *e;
                ssize_t l;

                l = read(fd, &buffer, sizeof(buffer));
                if (l < 0) {
                        /* Queue drained (EAGAIN) or interrupted: done for now. */
                        if (errno == EINTR || errno == EAGAIN)
                                return 0;

                        return log_error_errno(errno, "Failed to read control group inotify events: %m");
                }

                FOREACH_INOTIFY_EVENT(e, buffer, l) {
                        Unit *u;

                        if (e->wd < 0)
                                /* Queue overflow has no watch descriptor */
                                continue;

                        if (e->mask & IN_IGNORED)
                                /* The watch was just removed */
                                continue;

                        u = hashmap_get(m->cgroup_inotify_wd_unit, INT_TO_PTR(e->wd));
                        if (!u) /* Note that inotify might deliver
                                 * events for a watch even after it
                                 * was removed, because it was queued
                                 * before the removal. Let's ignore
                                 * this here safely. */
                                continue;

                        (void) unit_notify_cgroup_empty(u);
                }
        }
}
1357
8e274523 1358int manager_setup_cgroup(Manager *m) {
9444b1f2 1359 _cleanup_free_ char *path = NULL;
efdb0237
LP
1360 CGroupController c;
1361 int r, unified;
1362 char *e;
8e274523
LP
1363
1364 assert(m);
1365
35d2e7ec 1366 /* 1. Determine hierarchy */
efdb0237 1367 m->cgroup_root = mfree(m->cgroup_root);
9444b1f2 1368 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
23bbb0de
MS
1369 if (r < 0)
1370 return log_error_errno(r, "Cannot determine cgroup we are running in: %m");
8e274523 1371
efdb0237
LP
1372 /* Chop off the init scope, if we are already located in it */
1373 e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
0d8c31ff 1374
efdb0237
LP
1375 /* LEGACY: Also chop off the system slice if we are in
1376 * it. This is to support live upgrades from older systemd
1377 * versions where PID 1 was moved there. Also see
1378 * cg_get_root_path(). */
1379 if (!e && m->running_as == MANAGER_SYSTEM) {
9444b1f2 1380 e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
15c60e99 1381 if (!e)
efdb0237 1382 e = endswith(m->cgroup_root, "/system"); /* even more legacy */
0baf24dd 1383 }
efdb0237
LP
1384 if (e)
1385 *e = 0;
7ccfb64a 1386
9444b1f2
LP
1387 /* And make sure to store away the root value without trailing
1388 * slash, even for the root dir, so that we can easily prepend
1389 * it everywhere. */
efdb0237
LP
1390 while ((e = endswith(m->cgroup_root, "/")))
1391 *e = 0;
8e274523 1392
35d2e7ec 1393 /* 2. Show data */
9444b1f2 1394 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
23bbb0de
MS
1395 if (r < 0)
1396 return log_error_errno(r, "Cannot find cgroup mount point: %m");
8e274523 1397
efdb0237
LP
1398 unified = cg_unified();
1399 if (unified < 0)
1400 return log_error_errno(r, "Couldn't determine if we are running in the unified hierarchy: %m");
1401 if (unified > 0)
1402 log_debug("Unified cgroup hierarchy is located at %s.", path);
1403 else
1404 log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);
1405
0d8c31ff 1406 if (!m->test_run) {
efdb0237 1407 const char *scope_path;
c6c18be3 1408
0d8c31ff 1409 /* 3. Install agent */
efdb0237
LP
1410 if (unified) {
1411
1412 /* In the unified hierarchy we can can get
1413 * cgroup empty notifications via inotify. */
1414
1415 m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
1416 safe_close(m->cgroup_inotify_fd);
1417
1418 m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
1419 if (m->cgroup_inotify_fd < 0)
1420 return log_error_errno(errno, "Failed to create control group inotify object: %m");
1421
1422 r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
1423 if (r < 0)
1424 return log_error_errno(r, "Failed to watch control group inotify object: %m");
1425
1426 r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_IDLE - 5);
1427 if (r < 0)
1428 return log_error_errno(r, "Failed to set priority of inotify event source: %m");
1429
1430 (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");
1431
1432 } else if (m->running_as == MANAGER_SYSTEM) {
1433
1434 /* On the legacy hierarchy we only get
1435 * notifications via cgroup agents. (Which
1436 * isn't really reliable, since it does not
1437 * generate events when control groups with
1438 * children run empty. */
1439
0d8c31ff
ZJS
1440 r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
1441 if (r < 0)
da927ba9 1442 log_warning_errno(r, "Failed to install release agent, ignoring: %m");
0d8c31ff
ZJS
1443 else if (r > 0)
1444 log_debug("Installed release agent.");
efdb0237 1445 else if (r == 0)
0d8c31ff
ZJS
1446 log_debug("Release agent already installed.");
1447 }
8e274523 1448
efdb0237
LP
1449 /* 4. Make sure we are in the special "init.scope" unit in the root slice. */
1450 scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
1451 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
23bbb0de 1452 if (r < 0)
efdb0237
LP
1453 return log_error_errno(r, "Failed to create %s control group: %m", scope_path);
1454
1455 /* also, move all other userspace processes remaining
1456 * in the root cgroup into that scope. */
1457 r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, false);
1458 if (r < 0)
1459 log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");
c6c18be3 1460
0d8c31ff
ZJS
1461 /* 5. And pin it, so that it cannot be unmounted */
1462 safe_close(m->pin_cgroupfs_fd);
0d8c31ff 1463 m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
4a62c710
MS
1464 if (m->pin_cgroupfs_fd < 0)
1465 return log_error_errno(errno, "Failed to open pin file: %m");
0d8c31ff 1466
cc98b302 1467 /* 6. Always enable hierarchical support if it exists... */
efdb0237
LP
1468 if (!unified)
1469 (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
c6c18be3
LP
1470 }
1471
0d8c31ff 1472 /* 7. Figure out which controllers are supported */
efdb0237
LP
1473 r = cg_mask_supported(&m->cgroup_supported);
1474 if (r < 0)
1475 return log_error_errno(r, "Failed to determine supported controllers: %m");
1476
1477 for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
1478 log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & c));
9156e799 1479
a32360f1 1480 return 0;
8e274523
LP
1481}
1482
void manager_shutdown_cgroup(Manager *m, bool delete) {
        assert(m);

        /* We can't really delete the group, since we are in it. But
         * let's trim it. */
        if (delete && m->cgroup_root)
                (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);

        /* Tear down the inotify-based empty-cgroup notification machinery. */
        m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit);

        m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
        m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);

        /* Release the fd that pinned the cgroup file system mount. */
        m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);

        m->cgroup_root = mfree(m->cgroup_root);
}
1500
Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
        char *p;
        Unit *u;

        assert(m);
        assert(cgroup);

        /* Maps a cgroup path to the innermost unit owning it: tries the
         * path itself first, then walks upwards one path component at a
         * time, falling back to the root slice at the top. */

        u = hashmap_get(m->cgroup_unit, cgroup);
        if (u)
                return u;

        /* Work on a stack copy, since we truncate it in place below. */
        p = strdupa(cgroup);
        for (;;) {
                char *e;

                e = strrchr(p, '/');
                if (!e || e == p)
                        /* Reached the top of the tree. */
                        return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);

                *e = 0;

                u = hashmap_get(m->cgroup_unit, p);
                if (u)
                        return u;
        }
}
1527
b3ac818b 1528Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) {
4ad49000 1529 _cleanup_free_ char *cgroup = NULL;
acb14d31 1530 int r;
8e274523 1531
8c47c732
LP
1532 assert(m);
1533
b3ac818b
LP
1534 if (pid <= 0)
1535 return NULL;
1536
1537 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
1538 if (r < 0)
1539 return NULL;
1540
1541 return manager_get_unit_by_cgroup(m, cgroup);
1542}
1543
1544Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
1545 Unit *u;
1546
1547 assert(m);
1548
efdb0237 1549 if (pid <= 0)
8c47c732
LP
1550 return NULL;
1551
efdb0237
LP
1552 if (pid == 1)
1553 return hashmap_get(m->units, SPECIAL_INIT_SCOPE);
1554
fea72cc0 1555 u = hashmap_get(m->watch_pids1, PID_TO_PTR(pid));
5fe8876b
LP
1556 if (u)
1557 return u;
1558
fea72cc0 1559 u = hashmap_get(m->watch_pids2, PID_TO_PTR(pid));
5fe8876b
LP
1560 if (u)
1561 return u;
1562
b3ac818b 1563 return manager_get_unit_by_pid_cgroup(m, pid);
6dde1f33 1564}
4fbf50b3 1565
4ad49000
LP
1566int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
1567 Unit *u;
4fbf50b3 1568
4ad49000
LP
1569 assert(m);
1570 assert(cgroup);
4fbf50b3 1571
4ad49000 1572 u = manager_get_unit_by_cgroup(m, cgroup);
5ad096b3
LP
1573 if (!u)
1574 return 0;
b56c28c3 1575
efdb0237 1576 return unit_notify_cgroup_empty(u);
5ad096b3
LP
1577}
1578
1579int unit_get_memory_current(Unit *u, uint64_t *ret) {
1580 _cleanup_free_ char *v = NULL;
1581 int r;
1582
1583 assert(u);
1584 assert(ret);
1585
1586 if (!u->cgroup_path)
1587 return -ENODATA;
1588
efdb0237 1589 if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
5ad096b3
LP
1590 return -ENODATA;
1591
efdb0237
LP
1592 if (cg_unified() <= 0)
1593 r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
1594 else
1595 r = cg_get_attribute("memory", u->cgroup_path, "memory.current", &v);
5ad096b3
LP
1596 if (r == -ENOENT)
1597 return -ENODATA;
1598 if (r < 0)
1599 return r;
1600
1601 return safe_atou64(v, ret);
1602}
1603
03a7b521
LP
1604int unit_get_tasks_current(Unit *u, uint64_t *ret) {
1605 _cleanup_free_ char *v = NULL;
1606 int r;
1607
1608 assert(u);
1609 assert(ret);
1610
1611 if (!u->cgroup_path)
1612 return -ENODATA;
1613
1614 if ((u->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0)
1615 return -ENODATA;
1616
1617 r = cg_get_attribute("pids", u->cgroup_path, "pids.current", &v);
1618 if (r == -ENOENT)
1619 return -ENODATA;
1620 if (r < 0)
1621 return r;
1622
1623 return safe_atou64(v, ret);
1624}
1625
5ad096b3
LP
1626static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
1627 _cleanup_free_ char *v = NULL;
1628 uint64_t ns;
1629 int r;
1630
1631 assert(u);
1632 assert(ret);
1633
1634 if (!u->cgroup_path)
1635 return -ENODATA;
1636
efdb0237 1637 if ((u->cgroup_realized_mask & CGROUP_MASK_CPUACCT) == 0)
5ad096b3
LP
1638 return -ENODATA;
1639
1640 r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);
1641 if (r == -ENOENT)
1642 return -ENODATA;
1643 if (r < 0)
1644 return r;
1645
1646 r = safe_atou64(v, &ns);
1647 if (r < 0)
1648 return r;
1649
1650 *ret = ns;
1651 return 0;
1652}
1653
1654int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
1655 nsec_t ns;
1656 int r;
1657
1658 r = unit_get_cpu_usage_raw(u, &ns);
1659 if (r < 0)
1660 return r;
1661
1662 if (ns > u->cpuacct_usage_base)
1663 ns -= u->cpuacct_usage_base;
1664 else
1665 ns = 0;
1666
1667 *ret = ns;
1668 return 0;
1669}
1670
1671int unit_reset_cpu_usage(Unit *u) {
1672 nsec_t ns;
1673 int r;
1674
1675 assert(u);
1676
1677 r = unit_get_cpu_usage_raw(u, &ns);
1678 if (r < 0) {
1679 u->cpuacct_usage_base = 0;
1680 return r;
b56c28c3 1681 }
2633eb83 1682
5ad096b3 1683 u->cpuacct_usage_base = ns;
4ad49000 1684 return 0;
4fbf50b3
LP
1685}
1686
e9db43d5
LP
1687bool unit_cgroup_delegate(Unit *u) {
1688 CGroupContext *c;
1689
1690 assert(u);
1691
1692 c = unit_get_cgroup_context(u);
1693 if (!c)
1694 return false;
1695
1696 return c->delegate;
1697}
1698
e7ab4d1a
LP
1699void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
1700 assert(u);
1701
1702 if (!UNIT_HAS_CGROUP_CONTEXT(u))
1703 return;
1704
1705 if (m == 0)
1706 return;
1707
1708 if ((u->cgroup_realized_mask & m) == 0)
1709 return;
1710
1711 u->cgroup_realized_mask &= ~m;
1712 unit_add_to_cgroup_queue(u);
1713}
1714
void manager_invalidate_startup_units(Manager *m) {
        Iterator i;
        Unit *u;

        assert(m);

        /* Called on manager state transitions: units with Startup*=
         * settings need their CPU and block-IO cgroup attributes
         * re-applied, so invalidate those controllers for all of them. */
        SET_FOREACH(u, m->startup_units, i)
                unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_BLKIO);
}
1724
4ad49000
LP
/* String names for CGroupDevicePolicy values, used when (de)serializing
 * the DevicePolicy= setting via the lookup functions generated below. */
static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
        [CGROUP_AUTO] = "auto",
        [CGROUP_CLOSED] = "closed",
        [CGROUP_STRICT] = "strict",
};

DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);