]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/core/cgroup.c
sd-*.h: clean up exported (or to-be-exported) header files
[thirdparty/systemd.git] / src / core / cgroup.c
CommitLineData
d6c9574f 1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
8e274523
LP
2
3/***
4 This file is part of systemd.
5
4ad49000 6 Copyright 2013 Lennart Poettering
8e274523
LP
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
8e274523
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
8e274523 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
8e274523
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
c6c18be3 22#include <fcntl.h>
e41969e3 23#include <fnmatch.h>
8c6db833 24
03a7b521 25#include "cgroup-util.h"
9eb977db 26#include "path-util.h"
03a7b521 27#include "process-util.h"
9444b1f2 28#include "special.h"
07630cea 29#include "string-util.h"
4ad49000 30#include "cgroup.h"
8e274523 31
9a054909
LP
32#define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
33
4ad49000
LP
34void cgroup_context_init(CGroupContext *c) {
35 assert(c);
36
37 /* Initialize everything to the kernel defaults, assuming the
38 * structure is preinitialized to 0 */
39
d53d9474
LP
40 c->cpu_shares = CGROUP_CPU_SHARES_INVALID;
41 c->startup_cpu_shares = CGROUP_CPU_SHARES_INVALID;
42 c->cpu_quota_per_sec_usec = USEC_INFINITY;
43
ddca82ac 44 c->memory_limit = (uint64_t) -1;
b2f8b02e 45
d53d9474
LP
46 c->blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
47 c->startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
48
49 c->tasks_max = (uint64_t) -1;
32ee7d33
DM
50
51 c->netclass_type = CGROUP_NETCLASS_TYPE_NONE;
4ad49000 52}
8e274523 53
4ad49000
LP
54void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
55 assert(c);
56 assert(a);
57
71fda00f 58 LIST_REMOVE(device_allow, c->device_allow, a);
4ad49000
LP
59 free(a->path);
60 free(a);
61}
62
63void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
64 assert(c);
65 assert(w);
66
71fda00f 67 LIST_REMOVE(device_weights, c->blockio_device_weights, w);
4ad49000
LP
68 free(w->path);
69 free(w);
70}
71
72void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
73 assert(c);
8e274523 74 assert(b);
8e274523 75
71fda00f 76 LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
4ad49000
LP
77 free(b->path);
78 free(b);
79}
80
81void cgroup_context_done(CGroupContext *c) {
82 assert(c);
83
84 while (c->blockio_device_weights)
85 cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
86
87 while (c->blockio_device_bandwidths)
88 cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);
89
90 while (c->device_allow)
91 cgroup_context_free_device_allow(c, c->device_allow);
92}
93
94void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
95 CGroupBlockIODeviceBandwidth *b;
96 CGroupBlockIODeviceWeight *w;
97 CGroupDeviceAllow *a;
9a054909 98 char u[FORMAT_TIMESPAN_MAX];
4ad49000
LP
99
100 assert(c);
101 assert(f);
102
103 prefix = strempty(prefix);
104
105 fprintf(f,
106 "%sCPUAccounting=%s\n"
107 "%sBlockIOAccounting=%s\n"
108 "%sMemoryAccounting=%s\n"
d53d9474
LP
109 "%sTasksAccounting=%s\n"
110 "%sCPUShares=%" PRIu64 "\n"
111 "%sStartupCPUShares=%" PRIu64 "\n"
b2f8b02e 112 "%sCPUQuotaPerSecSec=%s\n"
d53d9474
LP
113 "%sBlockIOWeight=%" PRIu64 "\n"
114 "%sStartupBlockIOWeight=%" PRIu64 "\n"
4ad49000 115 "%sMemoryLimit=%" PRIu64 "\n"
03a7b521 116 "%sTasksMax=%" PRIu64 "\n"
a931ad47
LP
117 "%sDevicePolicy=%s\n"
118 "%sDelegate=%s\n",
4ad49000
LP
119 prefix, yes_no(c->cpu_accounting),
120 prefix, yes_no(c->blockio_accounting),
121 prefix, yes_no(c->memory_accounting),
d53d9474 122 prefix, yes_no(c->tasks_accounting),
4ad49000 123 prefix, c->cpu_shares,
95ae05c0 124 prefix, c->startup_cpu_shares,
b1d6dcf5 125 prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
4ad49000 126 prefix, c->blockio_weight,
95ae05c0 127 prefix, c->startup_blockio_weight,
4ad49000 128 prefix, c->memory_limit,
03a7b521 129 prefix, c->tasks_max,
a931ad47
LP
130 prefix, cgroup_device_policy_to_string(c->device_policy),
131 prefix, yes_no(c->delegate));
4ad49000
LP
132
133 LIST_FOREACH(device_allow, a, c->device_allow)
134 fprintf(f,
135 "%sDeviceAllow=%s %s%s%s\n",
136 prefix,
137 a->path,
138 a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
139
140 LIST_FOREACH(device_weights, w, c->blockio_device_weights)
141 fprintf(f,
d53d9474 142 "%sBlockIODeviceWeight=%s %" PRIu64,
4ad49000
LP
143 prefix,
144 w->path,
145 w->weight);
146
147 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
148 char buf[FORMAT_BYTES_MAX];
149
150 fprintf(f,
151 "%s%s=%s %s\n",
152 prefix,
153 b->read ? "BlockIOReadBandwidth" : "BlockIOWriteBandwidth",
154 b->path,
155 format_bytes(buf, sizeof(buf), b->bandwidth));
156 }
157}
158
/* Resolve a path to the dev_t of the block device the blkio controller
 * should be configured for: the device itself if p is a block device node,
 * otherwise the (whole-disk) device backing the file system p lives on.
 * Returns 0 on success, negative errno-style error otherwise. */
static int lookup_blkio_device(const char *p, dev_t *dev) {
        struct stat st;

        assert(p);
        assert(dev);

        if (stat(p, &st) < 0)
                return log_warning_errno(errno, "Couldn't stat device %s: %m", p);

        if (S_ISBLK(st.st_mode)) {
                *dev = st.st_rdev;
                return 0;
        }

        if (major(st.st_dev) == 0) {
                /* st_dev with major 0 means a virtual file system; there is
                 * no meaningful backing block device to configure. */
                log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
                return -ENODEV;
        }

        /* Not a device node: use the device the file is stored on, and if
         * that is a partition, try to map it to the whole disk. */
        *dev = st.st_dev;
        block_get_whole_disk(*dev, dev);

        return 0;
}
187
4ad49000
LP
188static int whitelist_device(const char *path, const char *node, const char *acc) {
189 char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
190 struct stat st;
8c6db833 191 int r;
8e274523 192
4ad49000
LP
193 assert(path);
194 assert(acc);
8e274523 195
4ad49000
LP
196 if (stat(node, &st) < 0) {
197 log_warning("Couldn't stat device %s", node);
198 return -errno;
199 }
200
201 if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
202 log_warning("%s is not a device.", node);
203 return -ENODEV;
204 }
205
206 sprintf(buf,
207 "%c %u:%u %s",
208 S_ISCHR(st.st_mode) ? 'c' : 'b',
209 major(st.st_rdev), minor(st.st_rdev),
210 acc);
211
212 r = cg_set_attribute("devices", path, "devices.allow", buf);
1aeab12b 213 if (r < 0)
714e2e1d
LP
214 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
215 "Failed to set devices.allow on %s: %m", path);
4ad49000
LP
216
217 return r;
8e274523
LP
218}
219
90060676
LP
220static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
221 _cleanup_fclose_ FILE *f = NULL;
222 char line[LINE_MAX];
223 bool good = false;
224 int r;
225
226 assert(path);
227 assert(acc);
228 assert(type == 'b' || type == 'c');
229
230 f = fopen("/proc/devices", "re");
4a62c710
MS
231 if (!f)
232 return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);
90060676
LP
233
234 FOREACH_LINE(line, f, goto fail) {
235 char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
236 unsigned maj;
237
238 truncate_nl(line);
239
240 if (type == 'c' && streq(line, "Character devices:")) {
241 good = true;
242 continue;
243 }
244
245 if (type == 'b' && streq(line, "Block devices:")) {
246 good = true;
247 continue;
248 }
249
250 if (isempty(line)) {
251 good = false;
252 continue;
253 }
254
255 if (!good)
256 continue;
257
258 p = strstrip(line);
259
260 w = strpbrk(p, WHITESPACE);
261 if (!w)
262 continue;
263 *w = 0;
264
265 r = safe_atou(p, &maj);
266 if (r < 0)
267 continue;
268 if (maj <= 0)
269 continue;
270
271 w++;
272 w += strspn(w, WHITESPACE);
e41969e3
LP
273
274 if (fnmatch(name, w, 0) != 0)
90060676
LP
275 continue;
276
277 sprintf(buf,
278 "%c %u:* %s",
279 type,
280 maj,
281 acc);
282
283 r = cg_set_attribute("devices", path, "devices.allow", buf);
1aeab12b 284 if (r < 0)
714e2e1d
LP
285 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
286 "Failed to set devices.allow on %s: %m", path);
90060676
LP
287 }
288
289 return 0;
290
291fail:
56f64d95 292 log_warning_errno(errno, "Failed to read /proc/devices: %m");
90060676
LP
293 return -errno;
294}
295
32ee7d33 296void cgroup_context_apply(CGroupContext *c, CGroupMask mask, const char *path, uint32_t netclass, ManagerState state) {
01efdf13 297 bool is_root;
4ad49000
LP
298 int r;
299
300 assert(c);
301 assert(path);
8e274523 302
4ad49000
LP
303 if (mask == 0)
304 return;
8e274523 305
71c26873 306 /* Some cgroup attributes are not supported on the root cgroup,
01efdf13
LP
307 * hence silently ignore */
308 is_root = isempty(path) || path_equal(path, "/");
6da13913
ZJS
309 if (is_root)
310 /* Make sure we don't try to display messages with an empty path. */
311 path = "/";
01efdf13 312
714e2e1d
LP
313 /* We generally ignore errors caused by read-only mounted
314 * cgroup trees (assuming we are running in a container then),
315 * and missing cgroups, i.e. EROFS and ENOENT. */
316
efdb0237 317 if ((mask & CGROUP_MASK_CPU) && !is_root) {
d53d9474 318 char buf[MAX(DECIMAL_STR_MAX(uint64_t), DECIMAL_STR_MAX(usec_t)) + 1];
8e274523 319
d53d9474
LP
320 sprintf(buf, "%" PRIu64 "\n",
321 IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID ? c->startup_cpu_shares :
322 c->cpu_shares != CGROUP_CPU_SHARES_INVALID ? c->cpu_shares : CGROUP_CPU_SHARES_DEFAULT);
4ad49000 323 r = cg_set_attribute("cpu", path, "cpu.shares", buf);
1aeab12b 324 if (r < 0)
714e2e1d
LP
325 log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
326 "Failed to set cpu.shares on %s: %m", path);
b2f8b02e 327
9a054909 328 sprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
b2f8b02e 329 r = cg_set_attribute("cpu", path, "cpu.cfs_period_us", buf);
1aeab12b 330 if (r < 0)
714e2e1d
LP
331 log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
332 "Failed to set cpu.cfs_period_us on %s: %m", path);
b2f8b02e 333
3a43da28 334 if (c->cpu_quota_per_sec_usec != USEC_INFINITY) {
9a054909 335 sprintf(buf, USEC_FMT "\n", c->cpu_quota_per_sec_usec * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
b2f8b02e
LP
336 r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", buf);
337 } else
338 r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", "-1");
1aeab12b 339 if (r < 0)
714e2e1d
LP
340 log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
341 "Failed to set cpu.cfs_quota_us on %s: %m", path);
4ad49000
LP
342 }
343
efdb0237 344 if (mask & CGROUP_MASK_BLKIO) {
d53d9474
LP
345 char buf[MAX(DECIMAL_STR_MAX(uint64_t)+1,
346 DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
4ad49000
LP
347 CGroupBlockIODeviceWeight *w;
348 CGroupBlockIODeviceBandwidth *b;
349
01efdf13 350 if (!is_root) {
d53d9474
LP
351 sprintf(buf, "%" PRIu64 "\n",
352 IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ? c->startup_blockio_weight :
353 c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ? c->blockio_weight : CGROUP_BLKIO_WEIGHT_DEFAULT);
01efdf13 354 r = cg_set_attribute("blkio", path, "blkio.weight", buf);
1aeab12b 355 if (r < 0)
714e2e1d
LP
356 log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
357 "Failed to set blkio.weight on %s: %m", path);
4ad49000 358
01efdf13
LP
359 /* FIXME: no way to reset this list */
360 LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
361 dev_t dev;
4ad49000 362
01efdf13
LP
363 r = lookup_blkio_device(w->path, &dev);
364 if (r < 0)
365 continue;
8e274523 366
d53d9474 367 sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), w->weight);
01efdf13 368 r = cg_set_attribute("blkio", path, "blkio.weight_device", buf);
1aeab12b 369 if (r < 0)
714e2e1d
LP
370 log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
371 "Failed to set blkio.weight_device on %s: %m", path);
01efdf13 372 }
4ad49000
LP
373 }
374
375 /* FIXME: no way to reset this list */
376 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
377 const char *a;
378 dev_t dev;
379
380 r = lookup_blkio_device(b->path, &dev);
381 if (r < 0)
382 continue;
383
384 a = b->read ? "blkio.throttle.read_bps_device" : "blkio.throttle.write_bps_device";
385
386 sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), b->bandwidth);
387 r = cg_set_attribute("blkio", path, a, buf);
1aeab12b 388 if (r < 0)
714e2e1d
LP
389 log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
390 "Failed to set %s on %s: %m", a, path);
d686d8a9 391 }
8e274523
LP
392 }
393
efdb0237 394 if ((mask & CGROUP_MASK_MEMORY) && !is_root) {
6a94f2e9 395 if (c->memory_limit != (uint64_t) -1) {
e58cec11
LP
396 char buf[DECIMAL_STR_MAX(uint64_t) + 1];
397
6a94f2e9 398 sprintf(buf, "%" PRIu64 "\n", c->memory_limit);
efdb0237
LP
399
400 if (cg_unified() <= 0)
401 r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
402 else
403 r = cg_set_attribute("memory", path, "memory.max", buf);
404
405 } else {
406 if (cg_unified() <= 0)
407 r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");
408 else
409 r = cg_set_attribute("memory", path, "memory.max", "max");
410 }
8e274523 411
1aeab12b 412 if (r < 0)
714e2e1d 413 log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
efdb0237 414 "Failed to set memory.limit_in_bytes/memory.max on %s: %m", path);
4ad49000 415 }
8e274523 416
3905f127 417 if ((mask & CGROUP_MASK_DEVICES) && !is_root) {
4ad49000 418 CGroupDeviceAllow *a;
8e274523 419
714e2e1d
LP
420 /* Changing the devices list of a populated cgroup
421 * might result in EINVAL, hence ignore EINVAL
422 * here. */
423
4ad49000
LP
424 if (c->device_allow || c->device_policy != CGROUP_AUTO)
425 r = cg_set_attribute("devices", path, "devices.deny", "a");
426 else
427 r = cg_set_attribute("devices", path, "devices.allow", "a");
1aeab12b 428 if (r < 0)
714e2e1d
LP
429 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
430 "Failed to reset devices.list on %s: %m", path);
fb385181 431
4ad49000
LP
432 if (c->device_policy == CGROUP_CLOSED ||
433 (c->device_policy == CGROUP_AUTO && c->device_allow)) {
434 static const char auto_devices[] =
7d711efb
LP
435 "/dev/null\0" "rwm\0"
436 "/dev/zero\0" "rwm\0"
437 "/dev/full\0" "rwm\0"
438 "/dev/random\0" "rwm\0"
439 "/dev/urandom\0" "rwm\0"
440 "/dev/tty\0" "rwm\0"
441 "/dev/pts/ptmx\0" "rw\0"; /* /dev/pts/ptmx may not be duplicated, but accessed */
4ad49000
LP
442
443 const char *x, *y;
444
445 NULSTR_FOREACH_PAIR(x, y, auto_devices)
446 whitelist_device(path, x, y);
7d711efb
LP
447
448 whitelist_major(path, "pts", 'c', "rw");
449 whitelist_major(path, "kdbus", 'c', "rw");
450 whitelist_major(path, "kdbus/*", 'c', "rw");
4ad49000
LP
451 }
452
453 LIST_FOREACH(device_allow, a, c->device_allow) {
454 char acc[4];
455 unsigned k = 0;
456
457 if (a->r)
458 acc[k++] = 'r';
459 if (a->w)
460 acc[k++] = 'w';
461 if (a->m)
462 acc[k++] = 'm';
fb385181 463
4ad49000
LP
464 if (k == 0)
465 continue;
fb385181 466
4ad49000 467 acc[k++] = 0;
90060676
LP
468
469 if (startswith(a->path, "/dev/"))
470 whitelist_device(path, a->path, acc);
471 else if (startswith(a->path, "block-"))
472 whitelist_major(path, a->path + 6, 'b', acc);
473 else if (startswith(a->path, "char-"))
474 whitelist_major(path, a->path + 5, 'c', acc);
475 else
476 log_debug("Ignoring device %s while writing cgroup attribute.", a->path);
4ad49000
LP
477 }
478 }
03a7b521
LP
479
480 if ((mask & CGROUP_MASK_PIDS) && !is_root) {
481
482 if (c->tasks_max != (uint64_t) -1) {
483 char buf[DECIMAL_STR_MAX(uint64_t) + 2];
484
485 sprintf(buf, "%" PRIu64 "\n", c->tasks_max);
486 r = cg_set_attribute("pids", path, "pids.max", buf);
487 } else
488 r = cg_set_attribute("pids", path, "pids.max", "max");
489
490 if (r < 0)
491 log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
492 "Failed to set pids.max on %s: %m", path);
493 }
32ee7d33
DM
494
495 if (mask & CGROUP_MASK_NET_CLS) {
496 char buf[DECIMAL_STR_MAX(uint32_t)];
497
498 sprintf(buf, "%" PRIu32, netclass);
499
500 r = cg_set_attribute("net_cls", path, "net_cls.classid", buf);
501 if (r < 0)
502 log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
503 "Failed to set net_cls.classid on %s: %m", path);
504 }
fb385181
LP
505}
506
efdb0237
LP
507CGroupMask cgroup_context_get_mask(CGroupContext *c) {
508 CGroupMask mask = 0;
8e274523 509
4ad49000 510 /* Figure out which controllers we need */
8e274523 511
b2f8b02e 512 if (c->cpu_accounting ||
d53d9474
LP
513 c->cpu_shares != CGROUP_CPU_SHARES_INVALID ||
514 c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID ||
3a43da28 515 c->cpu_quota_per_sec_usec != USEC_INFINITY)
efdb0237 516 mask |= CGROUP_MASK_CPUACCT | CGROUP_MASK_CPU;
ecedd90f 517
4ad49000 518 if (c->blockio_accounting ||
d53d9474
LP
519 c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
520 c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
4ad49000 521 c->blockio_device_weights ||
db785129 522 c->blockio_device_bandwidths)
efdb0237 523 mask |= CGROUP_MASK_BLKIO;
ecedd90f 524
4ad49000 525 if (c->memory_accounting ||
ddca82ac 526 c->memory_limit != (uint64_t) -1)
efdb0237 527 mask |= CGROUP_MASK_MEMORY;
8e274523 528
a931ad47
LP
529 if (c->device_allow ||
530 c->device_policy != CGROUP_AUTO)
3905f127 531 mask |= CGROUP_MASK_DEVICES;
4ad49000 532
03a7b521
LP
533 if (c->tasks_accounting ||
534 c->tasks_max != (uint64_t) -1)
535 mask |= CGROUP_MASK_PIDS;
536
32ee7d33
DM
537 if (c->netclass_type != CGROUP_NETCLASS_TYPE_NONE)
538 mask |= CGROUP_MASK_NET_CLS;
539
4ad49000 540 return mask;
8e274523
LP
541}
542
efdb0237 543CGroupMask unit_get_own_mask(Unit *u) {
4ad49000 544 CGroupContext *c;
8e274523 545
efdb0237
LP
546 /* Returns the mask of controllers the unit needs for itself */
547
4ad49000
LP
548 c = unit_get_cgroup_context(u);
549 if (!c)
550 return 0;
8e274523 551
a931ad47 552 /* If delegation is turned on, then turn on all cgroups,
19af675e
LP
553 * unless we are on the legacy hierarchy and the process we
554 * fork into it is known to drop privileges, and hence
555 * shouldn't get access to the controllers.
556 *
557 * Note that on the unified hierarchy it is safe to delegate
558 * controllers to unprivileged services. */
a931ad47
LP
559
560 if (c->delegate) {
561 ExecContext *e;
562
563 e = unit_get_exec_context(u);
19af675e
LP
564 if (!e ||
565 exec_context_maintains_privileges(e) ||
566 cg_unified() > 0)
efdb0237 567 return _CGROUP_MASK_ALL;
a931ad47
LP
568 }
569
db785129 570 return cgroup_context_get_mask(c);
8e274523
LP
571}
572
efdb0237 573CGroupMask unit_get_members_mask(Unit *u) {
4ad49000 574 assert(u);
bc432dc7 575
efdb0237
LP
576 /* Returns the mask of controllers all of the unit's children
577 * require, merged */
578
bc432dc7
LP
579 if (u->cgroup_members_mask_valid)
580 return u->cgroup_members_mask;
581
582 u->cgroup_members_mask = 0;
583
584 if (u->type == UNIT_SLICE) {
585 Unit *member;
586 Iterator i;
587
588 SET_FOREACH(member, u->dependencies[UNIT_BEFORE], i) {
589
590 if (member == u)
591 continue;
592
d4fdc205 593 if (UNIT_DEREF(member->slice) != u)
bc432dc7
LP
594 continue;
595
596 u->cgroup_members_mask |=
efdb0237 597 unit_get_own_mask(member) |
bc432dc7
LP
598 unit_get_members_mask(member);
599 }
600 }
601
602 u->cgroup_members_mask_valid = true;
6414b7c9 603 return u->cgroup_members_mask;
246aa6dd
LP
604}
605
efdb0237 606CGroupMask unit_get_siblings_mask(Unit *u) {
4ad49000 607 assert(u);
246aa6dd 608
efdb0237
LP
609 /* Returns the mask of controllers all of the unit's siblings
610 * require, i.e. the members mask of the unit's parent slice
611 * if there is one. */
612
bc432dc7 613 if (UNIT_ISSET(u->slice))
637f421e 614 return unit_get_members_mask(UNIT_DEREF(u->slice));
4ad49000 615
efdb0237 616 return unit_get_own_mask(u) | unit_get_members_mask(u);
246aa6dd
LP
617}
618
efdb0237
LP
619CGroupMask unit_get_subtree_mask(Unit *u) {
620
621 /* Returns the mask of this subtree, meaning of the group
622 * itself and its children. */
623
624 return unit_get_own_mask(u) | unit_get_members_mask(u);
625}
626
627CGroupMask unit_get_target_mask(Unit *u) {
628 CGroupMask mask;
629
630 /* This returns the cgroup mask of all controllers to enable
631 * for a specific cgroup, i.e. everything it needs itself,
632 * plus all that its children need, plus all that its siblings
633 * need. This is primarily useful on the legacy cgroup
634 * hierarchy, where we need to duplicate each cgroup in each
635 * hierarchy that shall be enabled for it. */
6414b7c9 636
efdb0237
LP
637 mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
638 mask &= u->manager->cgroup_supported;
639
640 return mask;
641}
642
643CGroupMask unit_get_enable_mask(Unit *u) {
644 CGroupMask mask;
645
646 /* This returns the cgroup mask of all controllers to enable
647 * for the children of a specific cgroup. This is primarily
648 * useful for the unified cgroup hierarchy, where each cgroup
649 * controls which controllers are enabled for its children. */
650
651 mask = unit_get_members_mask(u);
6414b7c9
DS
652 mask &= u->manager->cgroup_supported;
653
654 return mask;
655}
656
657/* Recurse from a unit up through its containing slices, propagating
658 * mask bits upward. A unit is also member of itself. */
bc432dc7 659void unit_update_cgroup_members_masks(Unit *u) {
efdb0237 660 CGroupMask m;
bc432dc7
LP
661 bool more;
662
663 assert(u);
664
665 /* Calculate subtree mask */
efdb0237 666 m = unit_get_subtree_mask(u);
bc432dc7
LP
667
668 /* See if anything changed from the previous invocation. If
669 * not, we're done. */
670 if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
671 return;
672
673 more =
674 u->cgroup_subtree_mask_valid &&
675 ((m & ~u->cgroup_subtree_mask) != 0) &&
676 ((~m & u->cgroup_subtree_mask) == 0);
677
678 u->cgroup_subtree_mask = m;
679 u->cgroup_subtree_mask_valid = true;
680
6414b7c9
DS
681 if (UNIT_ISSET(u->slice)) {
682 Unit *s = UNIT_DEREF(u->slice);
bc432dc7
LP
683
684 if (more)
685 /* There's more set now than before. We
686 * propagate the new mask to the parent's mask
687 * (not caring if it actually was valid or
688 * not). */
689
690 s->cgroup_members_mask |= m;
691
692 else
693 /* There's less set now than before (or we
694 * don't know), we need to recalculate
695 * everything, so let's invalidate the
696 * parent's members mask */
697
698 s->cgroup_members_mask_valid = false;
699
700 /* And now make sure that this change also hits our
701 * grandparents */
702 unit_update_cgroup_members_masks(s);
6414b7c9
DS
703 }
704}
705
efdb0237 706static const char *migrate_callback(CGroupMask mask, void *userdata) {
03b90d4b
LP
707 Unit *u = userdata;
708
709 assert(mask != 0);
710 assert(u);
711
712 while (u) {
713 if (u->cgroup_path &&
714 u->cgroup_realized &&
715 (u->cgroup_realized_mask & mask) == mask)
716 return u->cgroup_path;
717
718 u = UNIT_DEREF(u->slice);
719 }
720
721 return NULL;
722}
723
efdb0237
LP
724char *unit_default_cgroup_path(Unit *u) {
725 _cleanup_free_ char *escaped = NULL, *slice = NULL;
726 int r;
727
728 assert(u);
729
730 if (unit_has_name(u, SPECIAL_ROOT_SLICE))
731 return strdup(u->manager->cgroup_root);
732
733 if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) {
734 r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice);
735 if (r < 0)
736 return NULL;
737 }
738
739 escaped = cg_escape(u->id);
740 if (!escaped)
741 return NULL;
742
743 if (slice)
744 return strjoin(u->manager->cgroup_root, "/", slice, "/", escaped, NULL);
745 else
746 return strjoin(u->manager->cgroup_root, "/", escaped, NULL);
747}
748
749int unit_set_cgroup_path(Unit *u, const char *path) {
750 _cleanup_free_ char *p = NULL;
751 int r;
752
753 assert(u);
754
755 if (path) {
756 p = strdup(path);
757 if (!p)
758 return -ENOMEM;
759 } else
760 p = NULL;
761
762 if (streq_ptr(u->cgroup_path, p))
763 return 0;
764
765 if (p) {
766 r = hashmap_put(u->manager->cgroup_unit, p, u);
767 if (r < 0)
768 return r;
769 }
770
771 unit_release_cgroup(u);
772
773 u->cgroup_path = p;
774 p = NULL;
775
776 return 1;
777}
778
779int unit_watch_cgroup(Unit *u) {
780 _cleanup_free_ char *populated = NULL;
781 int r;
782
783 assert(u);
784
785 if (!u->cgroup_path)
786 return 0;
787
788 if (u->cgroup_inotify_wd >= 0)
789 return 0;
790
791 /* Only applies to the unified hierarchy */
792 r = cg_unified();
793 if (r < 0)
794 return log_unit_error_errno(u, r, "Failed detect wether the unified hierarchy is used: %m");
795 if (r == 0)
796 return 0;
797
798 /* Don't watch the root slice, it's pointless. */
799 if (unit_has_name(u, SPECIAL_ROOT_SLICE))
800 return 0;
801
802 r = hashmap_ensure_allocated(&u->manager->cgroup_inotify_wd_unit, &trivial_hash_ops);
803 if (r < 0)
804 return log_oom();
805
806 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.populated", &populated);
807 if (r < 0)
808 return log_oom();
809
810 u->cgroup_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, populated, IN_MODIFY);
811 if (u->cgroup_inotify_wd < 0) {
812
813 if (errno == ENOENT) /* If the directory is already
814 * gone we don't need to track
815 * it, so this is not an error */
816 return 0;
817
818 return log_unit_error_errno(u, errno, "Failed to add inotify watch descriptor for control group %s: %m", u->cgroup_path);
819 }
820
821 r = hashmap_put(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd), u);
822 if (r < 0)
823 return log_unit_error_errno(u, r, "Failed to add inotify watch descriptor to hash map: %m");
824
825 return 0;
826}
827
828static int unit_create_cgroup(
829 Unit *u,
830 CGroupMask target_mask,
831 CGroupMask enable_mask) {
832
0cd385d3 833 CGroupContext *c;
bc432dc7 834 int r;
64747e2d 835
4ad49000 836 assert(u);
64747e2d 837
0cd385d3
LP
838 c = unit_get_cgroup_context(u);
839 if (!c)
840 return 0;
841
7b3fd631
LP
842 if (!u->cgroup_path) {
843 _cleanup_free_ char *path = NULL;
64747e2d 844
7b3fd631
LP
845 path = unit_default_cgroup_path(u);
846 if (!path)
847 return log_oom();
848
efdb0237
LP
849 r = unit_set_cgroup_path(u, path);
850 if (r == -EEXIST)
851 return log_unit_error_errno(u, r, "Control group %s exists already.", path);
852 if (r < 0)
853 return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", path);
b58b8e11
HH
854 }
855
03b90d4b 856 /* First, create our own group */
efdb0237 857 r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
23bbb0de 858 if (r < 0)
efdb0237
LP
859 return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", u->cgroup_path);
860
861 /* Start watching it */
862 (void) unit_watch_cgroup(u);
863
864 /* Enable all controllers we need */
865 r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path);
866 if (r < 0)
867 log_unit_warning_errno(u, r, "Failed to enable controllers on cgroup %s, ignoring: %m", u->cgroup_path);
03b90d4b
LP
868
869 /* Keep track that this is now realized */
4ad49000 870 u->cgroup_realized = true;
efdb0237 871 u->cgroup_realized_mask = target_mask;
4ad49000 872
0cd385d3
LP
873 if (u->type != UNIT_SLICE && !c->delegate) {
874
875 /* Then, possibly move things over, but not if
876 * subgroups may contain processes, which is the case
877 * for slice and delegation units. */
878 r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
879 if (r < 0)
efdb0237 880 log_unit_warning_errno(u, r, "Failed to migrate cgroup from to %s, ignoring: %m", u->cgroup_path);
0cd385d3 881 }
03b90d4b 882
64747e2d
LP
883 return 0;
884}
885
7b3fd631
LP
886int unit_attach_pids_to_cgroup(Unit *u) {
887 int r;
888 assert(u);
889
890 r = unit_realize_cgroup(u);
891 if (r < 0)
892 return r;
893
894 r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->pids, migrate_callback, u);
895 if (r < 0)
896 return r;
897
898 return 0;
899}
900
efdb0237 901static bool unit_has_mask_realized(Unit *u, CGroupMask target_mask) {
bc432dc7
LP
902 assert(u);
903
efdb0237 904 return u->cgroup_realized && u->cgroup_realized_mask == target_mask;
6414b7c9
DS
905}
906
32ee7d33
DM
907static int unit_find_free_netclass_cgroup(Unit *u, uint32_t *ret) {
908
909 uint32_t start, i;
910 Manager *m;
911
912 assert(u);
913
914 m = u->manager;
915
916 i = start = m->cgroup_netclass_registry_last;
917
918 do {
919 i++;
920
921 if (!hashmap_get(m->cgroup_netclass_registry, UINT_TO_PTR(i))) {
922 m->cgroup_netclass_registry_last = i;
923 *ret = i;
924 return 0;
925 }
926
927 if (i == UINT32_MAX)
928 i = CGROUP_NETCLASS_FIXED_MAX;
929
930 } while (i != start);
931
932 return -ENOBUFS;
933}
934
935int unit_add_to_netclass_cgroup(Unit *u) {
936
937 CGroupContext *cc;
938 Unit *first;
939 void *key;
940 int r;
941
942 assert(u);
943
944 cc = unit_get_cgroup_context(u);
945 if (!cc)
946 return 0;
947
948 switch (cc->netclass_type) {
949 case CGROUP_NETCLASS_TYPE_NONE:
950 return 0;
951
952 case CGROUP_NETCLASS_TYPE_FIXED:
953 u->cgroup_netclass_id = cc->netclass_id;
954 break;
955
956 case CGROUP_NETCLASS_TYPE_AUTO:
957 /* Allocate a new ID in case it was requested and not done yet */
958 if (u->cgroup_netclass_id == 0) {
959 r = unit_find_free_netclass_cgroup(u, &u->cgroup_netclass_id);
960 if (r < 0)
961 return r;
962
963 log_debug("Dynamically assigned netclass cgroup id %" PRIu32 " to %s", u->cgroup_netclass_id, u->id);
964 }
965
966 break;
967 }
968
969 r = hashmap_ensure_allocated(&u->manager->cgroup_netclass_registry, &trivial_hash_ops);
970 if (r < 0)
971 return r;
972
973 key = UINT32_TO_PTR(u->cgroup_netclass_id);
974 first = hashmap_get(u->manager->cgroup_netclass_registry, key);
975
976 if (first) {
977 LIST_PREPEND(cgroup_netclass, first, u);
978 return hashmap_replace(u->manager->cgroup_netclass_registry, key, u);
979 }
980
981 return hashmap_put(u->manager->cgroup_netclass_registry, key, u);
982}
983
984int unit_remove_from_netclass_cgroup(Unit *u) {
985
986 Unit *head;
987 void *key;
988
989 assert(u);
990
991 key = UINT32_TO_PTR(u->cgroup_netclass_id);
992
993 LIST_FIND_HEAD(cgroup_netclass, u, head);
994 LIST_REMOVE(cgroup_netclass, head, u);
995
996 if (head)
997 return hashmap_replace(u->manager->cgroup_netclass_registry, key, head);
998
999 hashmap_remove(u->manager->cgroup_netclass_registry, key);
1000
1001 return 0;
1002}
1003
6414b7c9
DS
1004/* Check if necessary controllers and attributes for a unit are in place.
1005 *
1006 * If so, do nothing.
1007 * If not, create paths, move processes over, and set attributes.
1008 *
1009 * Returns 0 on success and < 0 on failure. */
db785129 1010static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
efdb0237 1011 CGroupMask target_mask, enable_mask;
6414b7c9 1012 int r;
64747e2d 1013
4ad49000 1014 assert(u);
64747e2d 1015
4ad49000 1016 if (u->in_cgroup_queue) {
71fda00f 1017 LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
4ad49000
LP
1018 u->in_cgroup_queue = false;
1019 }
64747e2d 1020
efdb0237
LP
1021 target_mask = unit_get_target_mask(u);
1022 if (unit_has_mask_realized(u, target_mask))
0a1eb06d 1023 return 0;
64747e2d 1024
4ad49000 1025 /* First, realize parents */
6414b7c9 1026 if (UNIT_ISSET(u->slice)) {
db785129 1027 r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
6414b7c9
DS
1028 if (r < 0)
1029 return r;
1030 }
4ad49000
LP
1031
1032 /* And then do the real work */
efdb0237
LP
1033 enable_mask = unit_get_enable_mask(u);
1034 r = unit_create_cgroup(u, target_mask, enable_mask);
6414b7c9
DS
1035 if (r < 0)
1036 return r;
1037
1038 /* Finally, apply the necessary attributes. */
32ee7d33 1039 cgroup_context_apply(unit_get_cgroup_context(u), target_mask, u->cgroup_path, u->cgroup_netclass_id, state);
6414b7c9
DS
1040
1041 return 0;
64747e2d
LP
1042}
1043
4ad49000 1044static void unit_add_to_cgroup_queue(Unit *u) {
ecedd90f 1045
4ad49000
LP
1046 if (u->in_cgroup_queue)
1047 return;
8e274523 1048
71fda00f 1049 LIST_PREPEND(cgroup_queue, u->manager->cgroup_queue, u);
4ad49000
LP
1050 u->in_cgroup_queue = true;
1051}
8c6db833 1052
4ad49000 1053unsigned manager_dispatch_cgroup_queue(Manager *m) {
db785129 1054 ManagerState state;
4ad49000 1055 unsigned n = 0;
db785129 1056 Unit *i;
6414b7c9 1057 int r;
ecedd90f 1058
db785129
LP
1059 state = manager_state(m);
1060
4ad49000
LP
1061 while ((i = m->cgroup_queue)) {
1062 assert(i->in_cgroup_queue);
ecedd90f 1063
db785129 1064 r = unit_realize_cgroup_now(i, state);
6414b7c9 1065 if (r < 0)
efdb0237 1066 log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);
0a1eb06d 1067
4ad49000
LP
1068 n++;
1069 }
ecedd90f 1070
4ad49000 1071 return n;
8e274523
LP
1072}
1073
4ad49000
LP
1074static void unit_queue_siblings(Unit *u) {
1075 Unit *slice;
ca949c9d 1076
4ad49000
LP
1077 /* This adds the siblings of the specified unit and the
1078 * siblings of all parent units to the cgroup queue. (But
1079 * neither the specified unit itself nor the parents.) */
1080
1081 while ((slice = UNIT_DEREF(u->slice))) {
1082 Iterator i;
1083 Unit *m;
8f53a7b8 1084
4ad49000
LP
1085 SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
1086 if (m == u)
1087 continue;
8e274523 1088
6414b7c9
DS
1089 /* Skip units that have a dependency on the slice
1090 * but aren't actually in it. */
4ad49000 1091 if (UNIT_DEREF(m->slice) != slice)
50159e6a 1092 continue;
8e274523 1093
6414b7c9
DS
1094 /* No point in doing cgroup application for units
1095 * without active processes. */
1096 if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
1097 continue;
1098
1099 /* If the unit doesn't need any new controllers
1100 * and has current ones realized, it doesn't need
1101 * any changes. */
1102 if (unit_has_mask_realized(m, unit_get_target_mask(m)))
1103 continue;
1104
4ad49000 1105 unit_add_to_cgroup_queue(m);
50159e6a
LP
1106 }
1107
4ad49000 1108 u = slice;
8e274523 1109 }
4ad49000
LP
1110}
1111
0a1eb06d 1112int unit_realize_cgroup(Unit *u) {
4ad49000
LP
1113 assert(u);
1114
35b7ff80 1115 if (!UNIT_HAS_CGROUP_CONTEXT(u))
0a1eb06d 1116 return 0;
8e274523 1117
4ad49000
LP
1118 /* So, here's the deal: when realizing the cgroups for this
1119 * unit, we need to first create all parents, but there's more
1120 * actually: for the weight-based controllers we also need to
1121 * make sure that all our siblings (i.e. units that are in the
73e231ab 1122 * same slice as we are) have cgroups, too. Otherwise, things
4ad49000
LP
1123 * would become very uneven as each of their processes would
1124 * get as much resources as all our group together. This call
1125 * will synchronously create the parent cgroups, but will
1126 * defer work on the siblings to the next event loop
1127 * iteration. */
ca949c9d 1128
4ad49000
LP
1129 /* Add all sibling slices to the cgroup queue. */
1130 unit_queue_siblings(u);
1131
6414b7c9 1132 /* And realize this one now (and apply the values) */
db785129 1133 return unit_realize_cgroup_now(u, manager_state(u->manager));
8e274523
LP
1134}
1135
efdb0237
LP
1136void unit_release_cgroup(Unit *u) {
1137 assert(u);
1138
1139 /* Forgets all cgroup details for this cgroup */
1140
1141 if (u->cgroup_path) {
1142 (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
1143 u->cgroup_path = mfree(u->cgroup_path);
1144 }
1145
1146 if (u->cgroup_inotify_wd >= 0) {
1147 if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_inotify_wd) < 0)
1148 log_unit_debug_errno(u, errno, "Failed to remove cgroup inotify watch %i for %s, ignoring", u->cgroup_inotify_wd, u->id);
1149
1150 (void) hashmap_remove(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd));
1151 u->cgroup_inotify_wd = -1;
1152 }
1153}
1154
1155void unit_prune_cgroup(Unit *u) {
8e274523 1156 int r;
efdb0237 1157 bool is_root_slice;
8e274523 1158
4ad49000 1159 assert(u);
8e274523 1160
efdb0237
LP
1161 /* Removes the cgroup, if empty and possible, and stops watching it. */
1162
4ad49000
LP
1163 if (!u->cgroup_path)
1164 return;
8e274523 1165
efdb0237
LP
1166 is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);
1167
1168 r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
dab5bf85 1169 if (r < 0) {
efdb0237 1170 log_debug_errno(r, "Failed to destroy cgroup %s, ignoring: %m", u->cgroup_path);
dab5bf85
RL
1171 return;
1172 }
8e274523 1173
efdb0237
LP
1174 if (is_root_slice)
1175 return;
1176
1177 unit_release_cgroup(u);
0a1eb06d 1178
4ad49000 1179 u->cgroup_realized = false;
bc432dc7 1180 u->cgroup_realized_mask = 0;
8e274523
LP
1181}
1182
efdb0237 1183int unit_search_main_pid(Unit *u, pid_t *ret) {
4ad49000
LP
1184 _cleanup_fclose_ FILE *f = NULL;
1185 pid_t pid = 0, npid, mypid;
efdb0237 1186 int r;
4ad49000
LP
1187
1188 assert(u);
efdb0237 1189 assert(ret);
4ad49000
LP
1190
1191 if (!u->cgroup_path)
efdb0237 1192 return -ENXIO;
4ad49000 1193
efdb0237
LP
1194 r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f);
1195 if (r < 0)
1196 return r;
4ad49000
LP
1197
1198 mypid = getpid();
1199 while (cg_read_pid(f, &npid) > 0) {
1200 pid_t ppid;
1201
1202 if (npid == pid)
1203 continue;
8e274523 1204
4ad49000
LP
1205 /* Ignore processes that aren't our kids */
1206 if (get_parent_of_pid(npid, &ppid) >= 0 && ppid != mypid)
1207 continue;
8e274523 1208
efdb0237 1209 if (pid != 0)
4ad49000
LP
1210 /* Dang, there's more than one daemonized PID
1211 in this group, so we don't know what process
1212 is the main process. */
efdb0237
LP
1213
1214 return -ENODATA;
8e274523 1215
4ad49000 1216 pid = npid;
8e274523
LP
1217 }
1218
efdb0237
LP
1219 *ret = pid;
1220 return 0;
1221}
1222
/* Recursively adds every PID below the given cgroup path to the unit's
 * watch set. Errors are accumulated: the first failure is remembered in
 * 'ret' while the walk continues, so one unreadable subgroup does not
 * abort the whole traversal (except -ENOMEM, which returns immediately). */
static int unit_watch_pids_in_path(Unit *u, const char *path) {
        _cleanup_closedir_ DIR *d = NULL;
        _cleanup_fclose_ FILE *f = NULL;
        int ret = 0, r;

        assert(u);
        assert(path);

        /* First, watch the processes directly in this cgroup. */
        r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
        if (r < 0)
                ret = r;
        else {
                pid_t pid;

                while ((r = cg_read_pid(f, &pid)) > 0) {
                        r = unit_watch_pid(u, pid);
                        if (r < 0 && ret >= 0)
                                ret = r;
                }

                /* r < 0 here means the PID enumeration itself failed. */
                if (r < 0 && ret >= 0)
                        ret = r;
        }

        /* Then recurse into each child cgroup. */
        r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
        if (r < 0) {
                if (ret >= 0)
                        ret = r;
        } else {
                char *fn;

                while ((r = cg_read_subgroup(d, &fn)) > 0) {
                        _cleanup_free_ char *p = NULL;

                        /* cg_read_subgroup() hands us ownership of fn. */
                        p = strjoin(path, "/", fn, NULL);
                        free(fn);

                        if (!p)
                                return -ENOMEM;

                        r = unit_watch_pids_in_path(u, p);
                        if (r < 0 && ret >= 0)
                                ret = r;
                }

                if (r < 0 && ret >= 0)
                        ret = r;
        }

        return ret;
}
1274
1275int unit_watch_all_pids(Unit *u) {
1276 assert(u);
1277
1278 /* Adds all PIDs from our cgroup to the set of PIDs we
1279 * watch. This is a fallback logic for cases where we do not
1280 * get reliable cgroup empty notifications: we try to use
1281 * SIGCHLD as replacement. */
1282
1283 if (!u->cgroup_path)
1284 return -ENOENT;
1285
1286 if (cg_unified() > 0) /* On unified we can use proper notifications */
1287 return 0;
1288
1289 return unit_watch_pids_in_path(u, u->cgroup_path);
1290}
1291
1292int unit_notify_cgroup_empty(Unit *u) {
1293 int r;
1294
1295 assert(u);
1296
1297 if (!u->cgroup_path)
1298 return 0;
1299
1300 r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
1301 if (r <= 0)
1302 return r;
1303
1304 unit_add_to_gc_queue(u);
1305
1306 if (UNIT_VTABLE(u)->notify_cgroup_empty)
1307 UNIT_VTABLE(u)->notify_cgroup_empty(u);
1308
1309 return 0;
1310}
1311
/* sd-event I/O callback for the cgroup inotify fd (unified hierarchy):
 * drains all pending inotify events and forwards an emptiness check to
 * the unit owning each watched cgroup. */
static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
        Manager *m = userdata;

        assert(s);
        assert(fd >= 0);
        assert(m);

        for (;;) {
                union inotify_event_buffer buffer;
                struct inotify_event *e;
                ssize_t l;

                l = read(fd, &buffer, sizeof(buffer));
                if (l < 0) {
                        /* EAGAIN: queue drained; EINTR: we'll be woken again. */
                        if (errno == EINTR || errno == EAGAIN)
                                return 0;

                        return log_error_errno(errno, "Failed to read control group inotify events: %m");
                }

                FOREACH_INOTIFY_EVENT(e, buffer, l) {
                        Unit *u;

                        if (e->wd < 0)
                                /* Queue overflow has no watch descriptor */
                                continue;

                        if (e->mask & IN_IGNORED)
                                /* The watch was just removed */
                                continue;

                        u = hashmap_get(m->cgroup_inotify_wd_unit, INT_TO_PTR(e->wd));
                        if (!u) /* Note that inotify might deliver
                                 * events for a watch even after it
                                 * was removed, because it was queued
                                 * before the removal. Let's ignore
                                 * this here safely. */
                                continue;

                        (void) unit_notify_cgroup_empty(u);
                }
        }
}
1355
8e274523 1356int manager_setup_cgroup(Manager *m) {
9444b1f2 1357 _cleanup_free_ char *path = NULL;
efdb0237
LP
1358 CGroupController c;
1359 int r, unified;
1360 char *e;
8e274523
LP
1361
1362 assert(m);
1363
35d2e7ec 1364 /* 1. Determine hierarchy */
efdb0237 1365 m->cgroup_root = mfree(m->cgroup_root);
9444b1f2 1366 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
23bbb0de
MS
1367 if (r < 0)
1368 return log_error_errno(r, "Cannot determine cgroup we are running in: %m");
8e274523 1369
efdb0237
LP
1370 /* Chop off the init scope, if we are already located in it */
1371 e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
0d8c31ff 1372
efdb0237
LP
1373 /* LEGACY: Also chop off the system slice if we are in
1374 * it. This is to support live upgrades from older systemd
1375 * versions where PID 1 was moved there. Also see
1376 * cg_get_root_path(). */
1377 if (!e && m->running_as == MANAGER_SYSTEM) {
9444b1f2 1378 e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
15c60e99 1379 if (!e)
efdb0237 1380 e = endswith(m->cgroup_root, "/system"); /* even more legacy */
0baf24dd 1381 }
efdb0237
LP
1382 if (e)
1383 *e = 0;
7ccfb64a 1384
9444b1f2
LP
1385 /* And make sure to store away the root value without trailing
1386 * slash, even for the root dir, so that we can easily prepend
1387 * it everywhere. */
efdb0237
LP
1388 while ((e = endswith(m->cgroup_root, "/")))
1389 *e = 0;
8e274523 1390
35d2e7ec 1391 /* 2. Show data */
9444b1f2 1392 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
23bbb0de
MS
1393 if (r < 0)
1394 return log_error_errno(r, "Cannot find cgroup mount point: %m");
8e274523 1395
efdb0237
LP
1396 unified = cg_unified();
1397 if (unified < 0)
1398 return log_error_errno(r, "Couldn't determine if we are running in the unified hierarchy: %m");
1399 if (unified > 0)
1400 log_debug("Unified cgroup hierarchy is located at %s.", path);
1401 else
1402 log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);
1403
0d8c31ff 1404 if (!m->test_run) {
efdb0237 1405 const char *scope_path;
c6c18be3 1406
0d8c31ff 1407 /* 3. Install agent */
efdb0237
LP
1408 if (unified) {
1409
1410 /* In the unified hierarchy we can can get
1411 * cgroup empty notifications via inotify. */
1412
1413 m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
1414 safe_close(m->cgroup_inotify_fd);
1415
1416 m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
1417 if (m->cgroup_inotify_fd < 0)
1418 return log_error_errno(errno, "Failed to create control group inotify object: %m");
1419
1420 r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
1421 if (r < 0)
1422 return log_error_errno(r, "Failed to watch control group inotify object: %m");
1423
1424 r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_IDLE - 5);
1425 if (r < 0)
1426 return log_error_errno(r, "Failed to set priority of inotify event source: %m");
1427
1428 (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");
1429
1430 } else if (m->running_as == MANAGER_SYSTEM) {
1431
1432 /* On the legacy hierarchy we only get
1433 * notifications via cgroup agents. (Which
1434 * isn't really reliable, since it does not
1435 * generate events when control groups with
1436 * children run empty. */
1437
0d8c31ff
ZJS
1438 r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
1439 if (r < 0)
da927ba9 1440 log_warning_errno(r, "Failed to install release agent, ignoring: %m");
0d8c31ff
ZJS
1441 else if (r > 0)
1442 log_debug("Installed release agent.");
efdb0237 1443 else if (r == 0)
0d8c31ff
ZJS
1444 log_debug("Release agent already installed.");
1445 }
8e274523 1446
efdb0237
LP
1447 /* 4. Make sure we are in the special "init.scope" unit in the root slice. */
1448 scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
1449 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
23bbb0de 1450 if (r < 0)
efdb0237
LP
1451 return log_error_errno(r, "Failed to create %s control group: %m", scope_path);
1452
1453 /* also, move all other userspace processes remaining
1454 * in the root cgroup into that scope. */
1455 r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, false);
1456 if (r < 0)
1457 log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");
c6c18be3 1458
0d8c31ff
ZJS
1459 /* 5. And pin it, so that it cannot be unmounted */
1460 safe_close(m->pin_cgroupfs_fd);
0d8c31ff 1461 m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
4a62c710
MS
1462 if (m->pin_cgroupfs_fd < 0)
1463 return log_error_errno(errno, "Failed to open pin file: %m");
0d8c31ff 1464
cc98b302 1465 /* 6. Always enable hierarchical support if it exists... */
efdb0237
LP
1466 if (!unified)
1467 (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
c6c18be3
LP
1468 }
1469
0d8c31ff 1470 /* 7. Figure out which controllers are supported */
efdb0237
LP
1471 r = cg_mask_supported(&m->cgroup_supported);
1472 if (r < 0)
1473 return log_error_errno(r, "Failed to determine supported controllers: %m");
1474
1475 for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
1476 log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & c));
9156e799 1477
a32360f1 1478 return 0;
8e274523
LP
1479}
1480
c6c18be3 1481void manager_shutdown_cgroup(Manager *m, bool delete) {
8e274523
LP
1482 assert(m);
1483
9444b1f2
LP
1484 /* We can't really delete the group, since we are in it. But
1485 * let's trim it. */
1486 if (delete && m->cgroup_root)
efdb0237
LP
1487 (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
1488
1489 m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit);
1490
1491 m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
1492 m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);
8e274523 1493
03e334a1 1494 m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);
c6c18be3 1495
efdb0237 1496 m->cgroup_root = mfree(m->cgroup_root);
8e274523
LP
1497}
1498
4ad49000 1499Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
acb14d31 1500 char *p;
4ad49000 1501 Unit *u;
acb14d31
LP
1502
1503 assert(m);
1504 assert(cgroup);
acb14d31 1505
4ad49000
LP
1506 u = hashmap_get(m->cgroup_unit, cgroup);
1507 if (u)
1508 return u;
acb14d31 1509
8e70580b 1510 p = strdupa(cgroup);
acb14d31
LP
1511 for (;;) {
1512 char *e;
1513
1514 e = strrchr(p, '/');
efdb0237
LP
1515 if (!e || e == p)
1516 return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);
acb14d31
LP
1517
1518 *e = 0;
1519
4ad49000
LP
1520 u = hashmap_get(m->cgroup_unit, p);
1521 if (u)
1522 return u;
acb14d31
LP
1523 }
1524}
1525
b3ac818b 1526Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) {
4ad49000 1527 _cleanup_free_ char *cgroup = NULL;
acb14d31 1528 int r;
8e274523 1529
8c47c732
LP
1530 assert(m);
1531
b3ac818b
LP
1532 if (pid <= 0)
1533 return NULL;
1534
1535 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
1536 if (r < 0)
1537 return NULL;
1538
1539 return manager_get_unit_by_cgroup(m, cgroup);
1540}
1541
1542Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
1543 Unit *u;
1544
1545 assert(m);
1546
efdb0237 1547 if (pid <= 0)
8c47c732
LP
1548 return NULL;
1549
efdb0237
LP
1550 if (pid == 1)
1551 return hashmap_get(m->units, SPECIAL_INIT_SCOPE);
1552
fea72cc0 1553 u = hashmap_get(m->watch_pids1, PID_TO_PTR(pid));
5fe8876b
LP
1554 if (u)
1555 return u;
1556
fea72cc0 1557 u = hashmap_get(m->watch_pids2, PID_TO_PTR(pid));
5fe8876b
LP
1558 if (u)
1559 return u;
1560
b3ac818b 1561 return manager_get_unit_by_pid_cgroup(m, pid);
6dde1f33 1562}
4fbf50b3 1563
4ad49000
LP
1564int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
1565 Unit *u;
4fbf50b3 1566
4ad49000
LP
1567 assert(m);
1568 assert(cgroup);
4fbf50b3 1569
4ad49000 1570 u = manager_get_unit_by_cgroup(m, cgroup);
5ad096b3
LP
1571 if (!u)
1572 return 0;
b56c28c3 1573
efdb0237 1574 return unit_notify_cgroup_empty(u);
5ad096b3
LP
1575}
1576
1577int unit_get_memory_current(Unit *u, uint64_t *ret) {
1578 _cleanup_free_ char *v = NULL;
1579 int r;
1580
1581 assert(u);
1582 assert(ret);
1583
1584 if (!u->cgroup_path)
1585 return -ENODATA;
1586
efdb0237 1587 if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
5ad096b3
LP
1588 return -ENODATA;
1589
efdb0237
LP
1590 if (cg_unified() <= 0)
1591 r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
1592 else
1593 r = cg_get_attribute("memory", u->cgroup_path, "memory.current", &v);
5ad096b3
LP
1594 if (r == -ENOENT)
1595 return -ENODATA;
1596 if (r < 0)
1597 return r;
1598
1599 return safe_atou64(v, ret);
1600}
1601
03a7b521
LP
1602int unit_get_tasks_current(Unit *u, uint64_t *ret) {
1603 _cleanup_free_ char *v = NULL;
1604 int r;
1605
1606 assert(u);
1607 assert(ret);
1608
1609 if (!u->cgroup_path)
1610 return -ENODATA;
1611
1612 if ((u->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0)
1613 return -ENODATA;
1614
1615 r = cg_get_attribute("pids", u->cgroup_path, "pids.current", &v);
1616 if (r == -ENOENT)
1617 return -ENODATA;
1618 if (r < 0)
1619 return r;
1620
1621 return safe_atou64(v, ret);
1622}
1623
5ad096b3
LP
1624static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
1625 _cleanup_free_ char *v = NULL;
1626 uint64_t ns;
1627 int r;
1628
1629 assert(u);
1630 assert(ret);
1631
1632 if (!u->cgroup_path)
1633 return -ENODATA;
1634
efdb0237 1635 if ((u->cgroup_realized_mask & CGROUP_MASK_CPUACCT) == 0)
5ad096b3
LP
1636 return -ENODATA;
1637
1638 r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);
1639 if (r == -ENOENT)
1640 return -ENODATA;
1641 if (r < 0)
1642 return r;
1643
1644 r = safe_atou64(v, &ns);
1645 if (r < 0)
1646 return r;
1647
1648 *ret = ns;
1649 return 0;
1650}
1651
1652int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
1653 nsec_t ns;
1654 int r;
1655
1656 r = unit_get_cpu_usage_raw(u, &ns);
1657 if (r < 0)
1658 return r;
1659
1660 if (ns > u->cpuacct_usage_base)
1661 ns -= u->cpuacct_usage_base;
1662 else
1663 ns = 0;
1664
1665 *ret = ns;
1666 return 0;
1667}
1668
1669int unit_reset_cpu_usage(Unit *u) {
1670 nsec_t ns;
1671 int r;
1672
1673 assert(u);
1674
1675 r = unit_get_cpu_usage_raw(u, &ns);
1676 if (r < 0) {
1677 u->cpuacct_usage_base = 0;
1678 return r;
b56c28c3 1679 }
2633eb83 1680
5ad096b3 1681 u->cpuacct_usage_base = ns;
4ad49000 1682 return 0;
4fbf50b3
LP
1683}
1684
e9db43d5
LP
1685bool unit_cgroup_delegate(Unit *u) {
1686 CGroupContext *c;
1687
1688 assert(u);
1689
1690 c = unit_get_cgroup_context(u);
1691 if (!c)
1692 return false;
1693
1694 return c->delegate;
1695}
1696
e7ab4d1a
LP
1697void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
1698 assert(u);
1699
1700 if (!UNIT_HAS_CGROUP_CONTEXT(u))
1701 return;
1702
1703 if (m == 0)
1704 return;
1705
1706 if ((u->cgroup_realized_mask & m) == 0)
1707 return;
1708
1709 u->cgroup_realized_mask &= ~m;
1710 unit_add_to_cgroup_queue(u);
1711}
1712
1713void manager_invalidate_startup_units(Manager *m) {
1714 Iterator i;
1715 Unit *u;
1716
1717 assert(m);
1718
1719 SET_FOREACH(u, m->startup_units, i)
1720 unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_BLKIO);
1721}
1722
4ad49000
LP
/* String names for the CGroupDevicePolicy enum; the macro below generates
 * cgroup_device_policy_to_string()/_from_string() from this table. */
static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
        [CGROUP_AUTO] = "auto",
        [CGROUP_CLOSED] = "closed",
        [CGROUP_STRICT] = "strict",
};

DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);