]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/core/cgroup.c
util-lib: wrap personality() to fix up broken glibc error handling (#6766)
[thirdparty/systemd.git] / src / core / cgroup.c
CommitLineData
8e274523
LP
1/***
2 This file is part of systemd.
3
4ad49000 4 Copyright 2013 Lennart Poettering
8e274523
LP
5
6 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
8e274523
LP
9 (at your option) any later version.
10
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 14 Lesser General Public License for more details.
8e274523 15
5430f7f2 16 You should have received a copy of the GNU Lesser General Public License
8e274523
LP
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
18***/
19
c6c18be3 20#include <fcntl.h>
e41969e3 21#include <fnmatch.h>
8c6db833 22
b5efdb8a 23#include "alloc-util.h"
03a7b521 24#include "cgroup-util.h"
3ffd4af2
LP
25#include "cgroup.h"
26#include "fd-util.h"
0d39fa9c 27#include "fileio.h"
77601719 28#include "fs-util.h"
6bedfcbb 29#include "parse-util.h"
9eb977db 30#include "path-util.h"
03a7b521 31#include "process-util.h"
9444b1f2 32#include "special.h"
8b43440b 33#include "string-table.h"
07630cea 34#include "string-util.h"
13c31542 35#include "stdio-util.h"
8e274523 36
9a054909
LP
37#define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
38
/* Emit, at most once per manager run, a notice that translation between
 * legacy and unified cgroup hierarchy settings is in effect; the per-unit
 * specifics go to the debug log via log_cgroup_compat(). */
static void cgroup_compat_warn(void) {
        static bool warned = false;

        if (warned)
                return;

        warned = true;
        log_warning("cgroup compatibility translation between legacy and unified hierarchy settings activated. See cgroup-compat debug messages for details.");
}
48
/* Log one cgroup v1/v2 compatibility translation: warn once globally
 * (cgroup_compat_warn()), then record the unit-specific detail at debug
 * level. */
#define log_cgroup_compat(unit, fmt, ...) do { \
                cgroup_compat_warn(); \
                log_unit_debug(unit, "cgroup-compat: " fmt, ##__VA_ARGS__); \
        } while (false)
4ad49000
LP
54void cgroup_context_init(CGroupContext *c) {
55 assert(c);
56
57 /* Initialize everything to the kernel defaults, assuming the
58 * structure is preinitialized to 0 */
59
66ebf6c0
TH
60 c->cpu_weight = CGROUP_WEIGHT_INVALID;
61 c->startup_cpu_weight = CGROUP_WEIGHT_INVALID;
62 c->cpu_quota_per_sec_usec = USEC_INFINITY;
63
d53d9474
LP
64 c->cpu_shares = CGROUP_CPU_SHARES_INVALID;
65 c->startup_cpu_shares = CGROUP_CPU_SHARES_INVALID;
d53d9474 66
da4d897e
TH
67 c->memory_high = CGROUP_LIMIT_MAX;
68 c->memory_max = CGROUP_LIMIT_MAX;
96e131ea 69 c->memory_swap_max = CGROUP_LIMIT_MAX;
da4d897e
TH
70
71 c->memory_limit = CGROUP_LIMIT_MAX;
b2f8b02e 72
13c31542
TH
73 c->io_weight = CGROUP_WEIGHT_INVALID;
74 c->startup_io_weight = CGROUP_WEIGHT_INVALID;
75
d53d9474
LP
76 c->blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
77 c->startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
78
79 c->tasks_max = (uint64_t) -1;
4ad49000 80}
8e274523 81
4ad49000
LP
82void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
83 assert(c);
84 assert(a);
85
71fda00f 86 LIST_REMOVE(device_allow, c->device_allow, a);
4ad49000
LP
87 free(a->path);
88 free(a);
89}
90
13c31542
TH
91void cgroup_context_free_io_device_weight(CGroupContext *c, CGroupIODeviceWeight *w) {
92 assert(c);
93 assert(w);
94
95 LIST_REMOVE(device_weights, c->io_device_weights, w);
96 free(w->path);
97 free(w);
98}
99
100void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit *l) {
101 assert(c);
102 assert(l);
103
104 LIST_REMOVE(device_limits, c->io_device_limits, l);
105 free(l->path);
106 free(l);
107}
108
4ad49000
LP
109void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
110 assert(c);
111 assert(w);
112
71fda00f 113 LIST_REMOVE(device_weights, c->blockio_device_weights, w);
4ad49000
LP
114 free(w->path);
115 free(w);
116}
117
118void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
119 assert(c);
8e274523 120 assert(b);
8e274523 121
71fda00f 122 LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
4ad49000
LP
123 free(b->path);
124 free(b);
125}
126
127void cgroup_context_done(CGroupContext *c) {
128 assert(c);
129
13c31542
TH
130 while (c->io_device_weights)
131 cgroup_context_free_io_device_weight(c, c->io_device_weights);
132
133 while (c->io_device_limits)
134 cgroup_context_free_io_device_limit(c, c->io_device_limits);
135
4ad49000
LP
136 while (c->blockio_device_weights)
137 cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
138
139 while (c->blockio_device_bandwidths)
140 cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);
141
142 while (c->device_allow)
143 cgroup_context_free_device_allow(c, c->device_allow);
144}
145
146void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
13c31542
TH
147 CGroupIODeviceLimit *il;
148 CGroupIODeviceWeight *iw;
4ad49000
LP
149 CGroupBlockIODeviceBandwidth *b;
150 CGroupBlockIODeviceWeight *w;
151 CGroupDeviceAllow *a;
9a054909 152 char u[FORMAT_TIMESPAN_MAX];
4ad49000
LP
153
154 assert(c);
155 assert(f);
156
157 prefix = strempty(prefix);
158
159 fprintf(f,
160 "%sCPUAccounting=%s\n"
13c31542 161 "%sIOAccounting=%s\n"
4ad49000
LP
162 "%sBlockIOAccounting=%s\n"
163 "%sMemoryAccounting=%s\n"
d53d9474 164 "%sTasksAccounting=%s\n"
66ebf6c0
TH
165 "%sCPUWeight=%" PRIu64 "\n"
166 "%sStartupCPUWeight=%" PRIu64 "\n"
d53d9474
LP
167 "%sCPUShares=%" PRIu64 "\n"
168 "%sStartupCPUShares=%" PRIu64 "\n"
b2f8b02e 169 "%sCPUQuotaPerSecSec=%s\n"
13c31542
TH
170 "%sIOWeight=%" PRIu64 "\n"
171 "%sStartupIOWeight=%" PRIu64 "\n"
d53d9474
LP
172 "%sBlockIOWeight=%" PRIu64 "\n"
173 "%sStartupBlockIOWeight=%" PRIu64 "\n"
da4d897e
TH
174 "%sMemoryLow=%" PRIu64 "\n"
175 "%sMemoryHigh=%" PRIu64 "\n"
176 "%sMemoryMax=%" PRIu64 "\n"
96e131ea 177 "%sMemorySwapMax=%" PRIu64 "\n"
4ad49000 178 "%sMemoryLimit=%" PRIu64 "\n"
03a7b521 179 "%sTasksMax=%" PRIu64 "\n"
a931ad47
LP
180 "%sDevicePolicy=%s\n"
181 "%sDelegate=%s\n",
4ad49000 182 prefix, yes_no(c->cpu_accounting),
13c31542 183 prefix, yes_no(c->io_accounting),
4ad49000
LP
184 prefix, yes_no(c->blockio_accounting),
185 prefix, yes_no(c->memory_accounting),
d53d9474 186 prefix, yes_no(c->tasks_accounting),
66ebf6c0
TH
187 prefix, c->cpu_weight,
188 prefix, c->startup_cpu_weight,
4ad49000 189 prefix, c->cpu_shares,
95ae05c0 190 prefix, c->startup_cpu_shares,
b1d6dcf5 191 prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
13c31542
TH
192 prefix, c->io_weight,
193 prefix, c->startup_io_weight,
4ad49000 194 prefix, c->blockio_weight,
95ae05c0 195 prefix, c->startup_blockio_weight,
da4d897e
TH
196 prefix, c->memory_low,
197 prefix, c->memory_high,
198 prefix, c->memory_max,
96e131ea 199 prefix, c->memory_swap_max,
4ad49000 200 prefix, c->memory_limit,
03a7b521 201 prefix, c->tasks_max,
a931ad47
LP
202 prefix, cgroup_device_policy_to_string(c->device_policy),
203 prefix, yes_no(c->delegate));
4ad49000
LP
204
205 LIST_FOREACH(device_allow, a, c->device_allow)
206 fprintf(f,
207 "%sDeviceAllow=%s %s%s%s\n",
208 prefix,
209 a->path,
210 a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
211
13c31542
TH
212 LIST_FOREACH(device_weights, iw, c->io_device_weights)
213 fprintf(f,
214 "%sIODeviceWeight=%s %" PRIu64,
215 prefix,
216 iw->path,
217 iw->weight);
218
219 LIST_FOREACH(device_limits, il, c->io_device_limits) {
220 char buf[FORMAT_BYTES_MAX];
9be57249
TH
221 CGroupIOLimitType type;
222
223 for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
224 if (il->limits[type] != cgroup_io_limit_defaults[type])
225 fprintf(f,
226 "%s%s=%s %s\n",
227 prefix,
228 cgroup_io_limit_type_to_string(type),
229 il->path,
230 format_bytes(buf, sizeof(buf), il->limits[type]));
13c31542
TH
231 }
232
4ad49000
LP
233 LIST_FOREACH(device_weights, w, c->blockio_device_weights)
234 fprintf(f,
d53d9474 235 "%sBlockIODeviceWeight=%s %" PRIu64,
4ad49000
LP
236 prefix,
237 w->path,
238 w->weight);
239
240 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
241 char buf[FORMAT_BYTES_MAX];
242
979d0311
TH
243 if (b->rbps != CGROUP_LIMIT_MAX)
244 fprintf(f,
245 "%sBlockIOReadBandwidth=%s %s\n",
246 prefix,
247 b->path,
248 format_bytes(buf, sizeof(buf), b->rbps));
249 if (b->wbps != CGROUP_LIMIT_MAX)
250 fprintf(f,
251 "%sBlockIOWriteBandwidth=%s %s\n",
252 prefix,
253 b->path,
254 format_bytes(buf, sizeof(buf), b->wbps));
4ad49000
LP
255 }
256}
257
/* Resolve @p to the block device (major/minor) backing it. A block device
 * node resolves to itself; any other path resolves to the device of the
 * file system it lives on, upgraded to the whole disk if that is a
 * partition. Returns 0 on success, negative errno on failure. */
static int lookup_block_device(const char *p, dev_t *dev) {
        struct stat st;

        assert(p);
        assert(dev);

        if (stat(p, &st) < 0)
                return log_warning_errno(errno, "Couldn't stat device %s: %m", p);

        if (S_ISBLK(st.st_mode)) {
                *dev = st.st_rdev;
                return 0;
        }

        if (major(st.st_dev) == 0) {
                /* Virtual file system (major 0): no usable backing device. */
                log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
                return -ENODEV;
        }

        /* Not a device node, hence use the block device the file is stored on. */
        *dev = st.st_dev;

        /* If this is a partition, try to get the originating block device;
         * best effort, the partition device is kept on failure. */
        block_get_whole_disk(*dev, dev);

        return 0;
}
286
4ad49000
LP
287static int whitelist_device(const char *path, const char *node, const char *acc) {
288 char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
289 struct stat st;
b200489b 290 bool ignore_notfound;
8c6db833 291 int r;
8e274523 292
4ad49000
LP
293 assert(path);
294 assert(acc);
8e274523 295
b200489b
DR
296 if (node[0] == '-') {
297 /* Non-existent paths starting with "-" must be silently ignored */
298 node++;
299 ignore_notfound = true;
300 } else
301 ignore_notfound = false;
302
4ad49000 303 if (stat(node, &st) < 0) {
b200489b 304 if (errno == ENOENT && ignore_notfound)
e7330dfe
DP
305 return 0;
306
307 return log_warning_errno(errno, "Couldn't stat device %s: %m", node);
4ad49000
LP
308 }
309
310 if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
311 log_warning("%s is not a device.", node);
312 return -ENODEV;
313 }
314
315 sprintf(buf,
316 "%c %u:%u %s",
317 S_ISCHR(st.st_mode) ? 'c' : 'b',
318 major(st.st_rdev), minor(st.st_rdev),
319 acc);
320
321 r = cg_set_attribute("devices", path, "devices.allow", buf);
1aeab12b 322 if (r < 0)
077ba06e 323 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
714e2e1d 324 "Failed to set devices.allow on %s: %m", path);
4ad49000
LP
325
326 return r;
8e274523
LP
327}
328
90060676
LP
329static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
330 _cleanup_fclose_ FILE *f = NULL;
331 char line[LINE_MAX];
332 bool good = false;
333 int r;
334
335 assert(path);
336 assert(acc);
337 assert(type == 'b' || type == 'c');
338
339 f = fopen("/proc/devices", "re");
4a62c710
MS
340 if (!f)
341 return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);
90060676
LP
342
343 FOREACH_LINE(line, f, goto fail) {
344 char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
345 unsigned maj;
346
347 truncate_nl(line);
348
349 if (type == 'c' && streq(line, "Character devices:")) {
350 good = true;
351 continue;
352 }
353
354 if (type == 'b' && streq(line, "Block devices:")) {
355 good = true;
356 continue;
357 }
358
359 if (isempty(line)) {
360 good = false;
361 continue;
362 }
363
364 if (!good)
365 continue;
366
367 p = strstrip(line);
368
369 w = strpbrk(p, WHITESPACE);
370 if (!w)
371 continue;
372 *w = 0;
373
374 r = safe_atou(p, &maj);
375 if (r < 0)
376 continue;
377 if (maj <= 0)
378 continue;
379
380 w++;
381 w += strspn(w, WHITESPACE);
e41969e3
LP
382
383 if (fnmatch(name, w, 0) != 0)
90060676
LP
384 continue;
385
386 sprintf(buf,
387 "%c %u:* %s",
388 type,
389 maj,
390 acc);
391
392 r = cg_set_attribute("devices", path, "devices.allow", buf);
1aeab12b 393 if (r < 0)
077ba06e 394 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
714e2e1d 395 "Failed to set devices.allow on %s: %m", path);
90060676
LP
396 }
397
398 return 0;
399
400fail:
25f027c5 401 return log_warning_errno(errno, "Failed to read /proc/devices: %m");
90060676
LP
402}
403
66ebf6c0
TH
404static bool cgroup_context_has_cpu_weight(CGroupContext *c) {
405 return c->cpu_weight != CGROUP_WEIGHT_INVALID ||
406 c->startup_cpu_weight != CGROUP_WEIGHT_INVALID;
407}
408
409static bool cgroup_context_has_cpu_shares(CGroupContext *c) {
410 return c->cpu_shares != CGROUP_CPU_SHARES_INVALID ||
411 c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID;
412}
413
414static uint64_t cgroup_context_cpu_weight(CGroupContext *c, ManagerState state) {
415 if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
416 c->startup_cpu_weight != CGROUP_WEIGHT_INVALID)
417 return c->startup_cpu_weight;
418 else if (c->cpu_weight != CGROUP_WEIGHT_INVALID)
419 return c->cpu_weight;
420 else
421 return CGROUP_WEIGHT_DEFAULT;
422}
423
424static uint64_t cgroup_context_cpu_shares(CGroupContext *c, ManagerState state) {
425 if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
426 c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID)
427 return c->startup_cpu_shares;
428 else if (c->cpu_shares != CGROUP_CPU_SHARES_INVALID)
429 return c->cpu_shares;
430 else
431 return CGROUP_CPU_SHARES_DEFAULT;
432}
433
434static void cgroup_apply_unified_cpu_config(Unit *u, uint64_t weight, uint64_t quota) {
435 char buf[MAX(DECIMAL_STR_MAX(uint64_t) + 1, (DECIMAL_STR_MAX(usec_t) + 1) * 2)];
436 int r;
437
438 xsprintf(buf, "%" PRIu64 "\n", weight);
439 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.weight", buf);
440 if (r < 0)
441 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
442 "Failed to set cpu.weight: %m");
443
444 if (quota != USEC_INFINITY)
445 xsprintf(buf, USEC_FMT " " USEC_FMT "\n",
446 quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC, CGROUP_CPU_QUOTA_PERIOD_USEC);
447 else
448 xsprintf(buf, "max " USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
449
450 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.max", buf);
451
452 if (r < 0)
453 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
454 "Failed to set cpu.max: %m");
455}
456
457static void cgroup_apply_legacy_cpu_config(Unit *u, uint64_t shares, uint64_t quota) {
458 char buf[MAX(DECIMAL_STR_MAX(uint64_t), DECIMAL_STR_MAX(usec_t)) + 1];
459 int r;
460
461 xsprintf(buf, "%" PRIu64 "\n", shares);
462 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.shares", buf);
463 if (r < 0)
464 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
465 "Failed to set cpu.shares: %m");
466
467 xsprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
468 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_period_us", buf);
469 if (r < 0)
470 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
471 "Failed to set cpu.cfs_period_us: %m");
472
473 if (quota != USEC_INFINITY) {
474 xsprintf(buf, USEC_FMT "\n", quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
475 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_quota_us", buf);
476 } else
477 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_quota_us", "-1");
478 if (r < 0)
479 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
480 "Failed to set cpu.cfs_quota_us: %m");
481}
482
483static uint64_t cgroup_cpu_shares_to_weight(uint64_t shares) {
484 return CLAMP(shares * CGROUP_WEIGHT_DEFAULT / CGROUP_CPU_SHARES_DEFAULT,
485 CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
486}
487
488static uint64_t cgroup_cpu_weight_to_shares(uint64_t weight) {
489 return CLAMP(weight * CGROUP_CPU_SHARES_DEFAULT / CGROUP_WEIGHT_DEFAULT,
490 CGROUP_CPU_SHARES_MIN, CGROUP_CPU_SHARES_MAX);
491}
492
508c45da 493static bool cgroup_context_has_io_config(CGroupContext *c) {
538b4852
TH
494 return c->io_accounting ||
495 c->io_weight != CGROUP_WEIGHT_INVALID ||
496 c->startup_io_weight != CGROUP_WEIGHT_INVALID ||
497 c->io_device_weights ||
498 c->io_device_limits;
499}
500
508c45da 501static bool cgroup_context_has_blockio_config(CGroupContext *c) {
538b4852
TH
502 return c->blockio_accounting ||
503 c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
504 c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
505 c->blockio_device_weights ||
506 c->blockio_device_bandwidths;
507}
508
508c45da 509static uint64_t cgroup_context_io_weight(CGroupContext *c, ManagerState state) {
64faf04c
TH
510 if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
511 c->startup_io_weight != CGROUP_WEIGHT_INVALID)
512 return c->startup_io_weight;
513 else if (c->io_weight != CGROUP_WEIGHT_INVALID)
514 return c->io_weight;
515 else
516 return CGROUP_WEIGHT_DEFAULT;
517}
518
508c45da 519static uint64_t cgroup_context_blkio_weight(CGroupContext *c, ManagerState state) {
64faf04c
TH
520 if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
521 c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
522 return c->startup_blockio_weight;
523 else if (c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
524 return c->blockio_weight;
525 else
526 return CGROUP_BLKIO_WEIGHT_DEFAULT;
527}
528
508c45da 529static uint64_t cgroup_weight_blkio_to_io(uint64_t blkio_weight) {
538b4852
TH
530 return CLAMP(blkio_weight * CGROUP_WEIGHT_DEFAULT / CGROUP_BLKIO_WEIGHT_DEFAULT,
531 CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
532}
533
508c45da 534static uint64_t cgroup_weight_io_to_blkio(uint64_t io_weight) {
538b4852
TH
535 return CLAMP(io_weight * CGROUP_BLKIO_WEIGHT_DEFAULT / CGROUP_WEIGHT_DEFAULT,
536 CGROUP_BLKIO_WEIGHT_MIN, CGROUP_BLKIO_WEIGHT_MAX);
537}
538
f29ff115 539static void cgroup_apply_io_device_weight(Unit *u, const char *dev_path, uint64_t io_weight) {
64faf04c
TH
540 char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
541 dev_t dev;
542 int r;
543
544 r = lookup_block_device(dev_path, &dev);
545 if (r < 0)
546 return;
547
548 xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), io_weight);
f29ff115 549 r = cg_set_attribute("io", u->cgroup_path, "io.weight", buf);
64faf04c 550 if (r < 0)
f29ff115
TH
551 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
552 "Failed to set io.weight: %m");
64faf04c
TH
553}
554
f29ff115 555static void cgroup_apply_blkio_device_weight(Unit *u, const char *dev_path, uint64_t blkio_weight) {
64faf04c
TH
556 char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
557 dev_t dev;
558 int r;
559
560 r = lookup_block_device(dev_path, &dev);
561 if (r < 0)
562 return;
563
564 xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), blkio_weight);
f29ff115 565 r = cg_set_attribute("blkio", u->cgroup_path, "blkio.weight_device", buf);
64faf04c 566 if (r < 0)
f29ff115
TH
567 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
568 "Failed to set blkio.weight_device: %m");
64faf04c
TH
569}
570
f29ff115 571static unsigned cgroup_apply_io_device_limit(Unit *u, const char *dev_path, uint64_t *limits) {
64faf04c
TH
572 char limit_bufs[_CGROUP_IO_LIMIT_TYPE_MAX][DECIMAL_STR_MAX(uint64_t)];
573 char buf[DECIMAL_STR_MAX(dev_t)*2+2+(6+DECIMAL_STR_MAX(uint64_t)+1)*4];
574 CGroupIOLimitType type;
575 dev_t dev;
576 unsigned n = 0;
577 int r;
578
579 r = lookup_block_device(dev_path, &dev);
580 if (r < 0)
581 return 0;
582
583 for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++) {
584 if (limits[type] != cgroup_io_limit_defaults[type]) {
585 xsprintf(limit_bufs[type], "%" PRIu64, limits[type]);
586 n++;
587 } else {
588 xsprintf(limit_bufs[type], "%s", limits[type] == CGROUP_LIMIT_MAX ? "max" : "0");
589 }
590 }
591
592 xsprintf(buf, "%u:%u rbps=%s wbps=%s riops=%s wiops=%s\n", major(dev), minor(dev),
593 limit_bufs[CGROUP_IO_RBPS_MAX], limit_bufs[CGROUP_IO_WBPS_MAX],
594 limit_bufs[CGROUP_IO_RIOPS_MAX], limit_bufs[CGROUP_IO_WIOPS_MAX]);
f29ff115 595 r = cg_set_attribute("io", u->cgroup_path, "io.max", buf);
64faf04c 596 if (r < 0)
f29ff115
TH
597 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
598 "Failed to set io.max: %m");
64faf04c
TH
599 return n;
600}
601
f29ff115 602static unsigned cgroup_apply_blkio_device_limit(Unit *u, const char *dev_path, uint64_t rbps, uint64_t wbps) {
64faf04c
TH
603 char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
604 dev_t dev;
605 unsigned n = 0;
606 int r;
607
608 r = lookup_block_device(dev_path, &dev);
609 if (r < 0)
610 return 0;
611
612 if (rbps != CGROUP_LIMIT_MAX)
613 n++;
614 sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), rbps);
f29ff115 615 r = cg_set_attribute("blkio", u->cgroup_path, "blkio.throttle.read_bps_device", buf);
64faf04c 616 if (r < 0)
f29ff115
TH
617 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
618 "Failed to set blkio.throttle.read_bps_device: %m");
64faf04c
TH
619
620 if (wbps != CGROUP_LIMIT_MAX)
621 n++;
622 sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), wbps);
f29ff115 623 r = cg_set_attribute("blkio", u->cgroup_path, "blkio.throttle.write_bps_device", buf);
64faf04c 624 if (r < 0)
f29ff115
TH
625 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
626 "Failed to set blkio.throttle.write_bps_device: %m");
64faf04c
TH
627
628 return n;
629}
630
da4d897e 631static bool cgroup_context_has_unified_memory_config(CGroupContext *c) {
96e131ea 632 return c->memory_low > 0 || c->memory_high != CGROUP_LIMIT_MAX || c->memory_max != CGROUP_LIMIT_MAX || c->memory_swap_max != CGROUP_LIMIT_MAX;
da4d897e
TH
633}
634
f29ff115 635static void cgroup_apply_unified_memory_limit(Unit *u, const char *file, uint64_t v) {
da4d897e
TH
636 char buf[DECIMAL_STR_MAX(uint64_t) + 1] = "max";
637 int r;
638
639 if (v != CGROUP_LIMIT_MAX)
640 xsprintf(buf, "%" PRIu64 "\n", v);
641
f29ff115 642 r = cg_set_attribute("memory", u->cgroup_path, file, buf);
da4d897e 643 if (r < 0)
f29ff115
TH
644 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
645 "Failed to set %s: %m", file);
da4d897e
TH
646}
647
f29ff115
TH
648static void cgroup_context_apply(Unit *u, CGroupMask mask, ManagerState state) {
649 const char *path;
650 CGroupContext *c;
01efdf13 651 bool is_root;
4ad49000
LP
652 int r;
653
f29ff115
TH
654 assert(u);
655
656 c = unit_get_cgroup_context(u);
657 path = u->cgroup_path;
658
4ad49000
LP
659 assert(c);
660 assert(path);
8e274523 661
4ad49000
LP
662 if (mask == 0)
663 return;
8e274523 664
71c26873 665 /* Some cgroup attributes are not supported on the root cgroup,
01efdf13
LP
666 * hence silently ignore */
667 is_root = isempty(path) || path_equal(path, "/");
6da13913
ZJS
668 if (is_root)
669 /* Make sure we don't try to display messages with an empty path. */
670 path = "/";
01efdf13 671
714e2e1d
LP
672 /* We generally ignore errors caused by read-only mounted
673 * cgroup trees (assuming we are running in a container then),
674 * and missing cgroups, i.e. EROFS and ENOENT. */
675
efdb0237 676 if ((mask & CGROUP_MASK_CPU) && !is_root) {
66ebf6c0
TH
677 bool has_weight = cgroup_context_has_cpu_weight(c);
678 bool has_shares = cgroup_context_has_cpu_shares(c);
8e274523 679
b4cccbc1 680 if (cg_all_unified() > 0) {
66ebf6c0 681 uint64_t weight;
b2f8b02e 682
66ebf6c0
TH
683 if (has_weight)
684 weight = cgroup_context_cpu_weight(c, state);
685 else if (has_shares) {
686 uint64_t shares = cgroup_context_cpu_shares(c, state);
b2f8b02e 687
66ebf6c0
TH
688 weight = cgroup_cpu_shares_to_weight(shares);
689
690 log_cgroup_compat(u, "Applying [Startup]CpuShares %" PRIu64 " as [Startup]CpuWeight %" PRIu64 " on %s",
691 shares, weight, path);
692 } else
693 weight = CGROUP_WEIGHT_DEFAULT;
694
695 cgroup_apply_unified_cpu_config(u, weight, c->cpu_quota_per_sec_usec);
696 } else {
697 uint64_t shares;
698
7d862ab8 699 if (has_weight) {
66ebf6c0
TH
700 uint64_t weight = cgroup_context_cpu_weight(c, state);
701
702 shares = cgroup_cpu_weight_to_shares(weight);
703
704 log_cgroup_compat(u, "Applying [Startup]CpuWeight %" PRIu64 " as [Startup]CpuShares %" PRIu64 " on %s",
705 weight, shares, path);
7d862ab8
TH
706 } else if (has_shares)
707 shares = cgroup_context_cpu_shares(c, state);
708 else
66ebf6c0
TH
709 shares = CGROUP_CPU_SHARES_DEFAULT;
710
711 cgroup_apply_legacy_cpu_config(u, shares, c->cpu_quota_per_sec_usec);
712 }
4ad49000
LP
713 }
714
13c31542 715 if (mask & CGROUP_MASK_IO) {
538b4852
TH
716 bool has_io = cgroup_context_has_io_config(c);
717 bool has_blockio = cgroup_context_has_blockio_config(c);
13c31542
TH
718
719 if (!is_root) {
64faf04c
TH
720 char buf[8+DECIMAL_STR_MAX(uint64_t)+1];
721 uint64_t weight;
13c31542 722
538b4852
TH
723 if (has_io)
724 weight = cgroup_context_io_weight(c, state);
128fadc9
TH
725 else if (has_blockio) {
726 uint64_t blkio_weight = cgroup_context_blkio_weight(c, state);
727
728 weight = cgroup_weight_blkio_to_io(blkio_weight);
729
730 log_cgroup_compat(u, "Applying [Startup]BlockIOWeight %" PRIu64 " as [Startup]IOWeight %" PRIu64,
731 blkio_weight, weight);
732 } else
538b4852 733 weight = CGROUP_WEIGHT_DEFAULT;
13c31542
TH
734
735 xsprintf(buf, "default %" PRIu64 "\n", weight);
736 r = cg_set_attribute("io", path, "io.weight", buf);
737 if (r < 0)
f29ff115
TH
738 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
739 "Failed to set io.weight: %m");
13c31542 740
538b4852
TH
741 if (has_io) {
742 CGroupIODeviceWeight *w;
743
744 /* FIXME: no way to reset this list */
745 LIST_FOREACH(device_weights, w, c->io_device_weights)
f29ff115 746 cgroup_apply_io_device_weight(u, w->path, w->weight);
538b4852
TH
747 } else if (has_blockio) {
748 CGroupBlockIODeviceWeight *w;
749
750 /* FIXME: no way to reset this list */
128fadc9
TH
751 LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
752 weight = cgroup_weight_blkio_to_io(w->weight);
753
754 log_cgroup_compat(u, "Applying BlockIODeviceWeight %" PRIu64 " as IODeviceWeight %" PRIu64 " for %s",
755 w->weight, weight, w->path);
756
757 cgroup_apply_io_device_weight(u, w->path, weight);
758 }
538b4852 759 }
13c31542
TH
760 }
761
64faf04c 762 /* Apply limits and free ones without config. */
538b4852
TH
763 if (has_io) {
764 CGroupIODeviceLimit *l, *next;
765
766 LIST_FOREACH_SAFE(device_limits, l, next, c->io_device_limits) {
f29ff115 767 if (!cgroup_apply_io_device_limit(u, l->path, l->limits))
538b4852
TH
768 cgroup_context_free_io_device_limit(c, l);
769 }
770 } else if (has_blockio) {
771 CGroupBlockIODeviceBandwidth *b, *next;
772
773 LIST_FOREACH_SAFE(device_bandwidths, b, next, c->blockio_device_bandwidths) {
774 uint64_t limits[_CGROUP_IO_LIMIT_TYPE_MAX];
775 CGroupIOLimitType type;
776
777 for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
778 limits[type] = cgroup_io_limit_defaults[type];
779
780 limits[CGROUP_IO_RBPS_MAX] = b->rbps;
781 limits[CGROUP_IO_WBPS_MAX] = b->wbps;
782
128fadc9
TH
783 log_cgroup_compat(u, "Applying BlockIO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as IO{Read|Write}BandwidthMax for %s",
784 b->rbps, b->wbps, b->path);
785
f29ff115 786 if (!cgroup_apply_io_device_limit(u, b->path, limits))
538b4852
TH
787 cgroup_context_free_blockio_device_bandwidth(c, b);
788 }
13c31542
TH
789 }
790 }
791
efdb0237 792 if (mask & CGROUP_MASK_BLKIO) {
538b4852
TH
793 bool has_io = cgroup_context_has_io_config(c);
794 bool has_blockio = cgroup_context_has_blockio_config(c);
4ad49000 795
01efdf13 796 if (!is_root) {
64faf04c
TH
797 char buf[DECIMAL_STR_MAX(uint64_t)+1];
798 uint64_t weight;
64faf04c 799
7d862ab8 800 if (has_io) {
128fadc9
TH
801 uint64_t io_weight = cgroup_context_io_weight(c, state);
802
538b4852 803 weight = cgroup_weight_io_to_blkio(cgroup_context_io_weight(c, state));
128fadc9
TH
804
805 log_cgroup_compat(u, "Applying [Startup]IOWeight %" PRIu64 " as [Startup]BlockIOWeight %" PRIu64,
806 io_weight, weight);
7d862ab8
TH
807 } else if (has_blockio)
808 weight = cgroup_context_blkio_weight(c, state);
809 else
538b4852 810 weight = CGROUP_BLKIO_WEIGHT_DEFAULT;
64faf04c
TH
811
812 xsprintf(buf, "%" PRIu64 "\n", weight);
01efdf13 813 r = cg_set_attribute("blkio", path, "blkio.weight", buf);
1aeab12b 814 if (r < 0)
f29ff115
TH
815 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
816 "Failed to set blkio.weight: %m");
4ad49000 817
7d862ab8 818 if (has_io) {
538b4852
TH
819 CGroupIODeviceWeight *w;
820
821 /* FIXME: no way to reset this list */
128fadc9
TH
822 LIST_FOREACH(device_weights, w, c->io_device_weights) {
823 weight = cgroup_weight_io_to_blkio(w->weight);
824
825 log_cgroup_compat(u, "Applying IODeviceWeight %" PRIu64 " as BlockIODeviceWeight %" PRIu64 " for %s",
826 w->weight, weight, w->path);
827
828 cgroup_apply_blkio_device_weight(u, w->path, weight);
829 }
7d862ab8
TH
830 } else if (has_blockio) {
831 CGroupBlockIODeviceWeight *w;
832
833 /* FIXME: no way to reset this list */
834 LIST_FOREACH(device_weights, w, c->blockio_device_weights)
835 cgroup_apply_blkio_device_weight(u, w->path, w->weight);
538b4852 836 }
4ad49000
LP
837 }
838
64faf04c 839 /* Apply limits and free ones without config. */
7d862ab8 840 if (has_io) {
538b4852
TH
841 CGroupIODeviceLimit *l, *next;
842
843 LIST_FOREACH_SAFE(device_limits, l, next, c->io_device_limits) {
128fadc9
TH
844 log_cgroup_compat(u, "Applying IO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as BlockIO{Read|Write}BandwidthMax for %s",
845 l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX], l->path);
846
f29ff115 847 if (!cgroup_apply_blkio_device_limit(u, l->path, l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX]))
538b4852
TH
848 cgroup_context_free_io_device_limit(c, l);
849 }
7d862ab8
TH
850 } else if (has_blockio) {
851 CGroupBlockIODeviceBandwidth *b, *next;
852
853 LIST_FOREACH_SAFE(device_bandwidths, b, next, c->blockio_device_bandwidths)
854 if (!cgroup_apply_blkio_device_limit(u, b->path, b->rbps, b->wbps))
855 cgroup_context_free_blockio_device_bandwidth(c, b);
d686d8a9 856 }
8e274523
LP
857 }
858
efdb0237 859 if ((mask & CGROUP_MASK_MEMORY) && !is_root) {
b4cccbc1
LP
860 if (cg_all_unified() > 0) {
861 uint64_t max, swap_max = CGROUP_LIMIT_MAX;
efdb0237 862
96e131ea 863 if (cgroup_context_has_unified_memory_config(c)) {
da4d897e 864 max = c->memory_max;
96e131ea
WC
865 swap_max = c->memory_swap_max;
866 } else {
da4d897e 867 max = c->memory_limit;
efdb0237 868
128fadc9
TH
869 if (max != CGROUP_LIMIT_MAX)
870 log_cgroup_compat(u, "Applying MemoryLimit %" PRIu64 " as MemoryMax", max);
871 }
872
f29ff115
TH
873 cgroup_apply_unified_memory_limit(u, "memory.low", c->memory_low);
874 cgroup_apply_unified_memory_limit(u, "memory.high", c->memory_high);
875 cgroup_apply_unified_memory_limit(u, "memory.max", max);
96e131ea 876 cgroup_apply_unified_memory_limit(u, "memory.swap.max", swap_max);
efdb0237 877 } else {
da4d897e 878 char buf[DECIMAL_STR_MAX(uint64_t) + 1];
7d862ab8 879 uint64_t val;
da4d897e 880
7d862ab8 881 if (cgroup_context_has_unified_memory_config(c)) {
78a4ee59 882 val = c->memory_max;
7d862ab8
TH
883 log_cgroup_compat(u, "Applying MemoryMax %" PRIi64 " as MemoryLimit", val);
884 } else
885 val = c->memory_limit;
128fadc9 886
78a4ee59
DM
887 if (val == CGROUP_LIMIT_MAX)
888 strncpy(buf, "-1\n", sizeof(buf));
889 else
890 xsprintf(buf, "%" PRIu64 "\n", val);
891
da4d897e
TH
892 r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
893 if (r < 0)
f29ff115
TH
894 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
895 "Failed to set memory.limit_in_bytes: %m");
da4d897e 896 }
4ad49000 897 }
8e274523 898
3905f127 899 if ((mask & CGROUP_MASK_DEVICES) && !is_root) {
4ad49000 900 CGroupDeviceAllow *a;
8e274523 901
714e2e1d
LP
902 /* Changing the devices list of a populated cgroup
903 * might result in EINVAL, hence ignore EINVAL
904 * here. */
905
4ad49000
LP
906 if (c->device_allow || c->device_policy != CGROUP_AUTO)
907 r = cg_set_attribute("devices", path, "devices.deny", "a");
908 else
909 r = cg_set_attribute("devices", path, "devices.allow", "a");
1aeab12b 910 if (r < 0)
f29ff115
TH
911 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
912 "Failed to reset devices.list: %m");
fb385181 913
4ad49000
LP
914 if (c->device_policy == CGROUP_CLOSED ||
915 (c->device_policy == CGROUP_AUTO && c->device_allow)) {
916 static const char auto_devices[] =
7d711efb
LP
917 "/dev/null\0" "rwm\0"
918 "/dev/zero\0" "rwm\0"
919 "/dev/full\0" "rwm\0"
920 "/dev/random\0" "rwm\0"
921 "/dev/urandom\0" "rwm\0"
922 "/dev/tty\0" "rwm\0"
0d9e7991
AP
923 "/dev/pts/ptmx\0" "rw\0" /* /dev/pts/ptmx may not be duplicated, but accessed */
924 /* Allow /run/systemd/inaccessible/{chr,blk} devices for mapping InaccessiblePaths */
e7330dfe
DP
925 "-/run/systemd/inaccessible/chr\0" "rwm\0"
926 "-/run/systemd/inaccessible/blk\0" "rwm\0";
4ad49000
LP
927
928 const char *x, *y;
929
930 NULSTR_FOREACH_PAIR(x, y, auto_devices)
931 whitelist_device(path, x, y);
7d711efb
LP
932
933 whitelist_major(path, "pts", 'c', "rw");
4ad49000
LP
934 }
935
936 LIST_FOREACH(device_allow, a, c->device_allow) {
fb4650aa 937 char acc[4], *val;
4ad49000
LP
938 unsigned k = 0;
939
940 if (a->r)
941 acc[k++] = 'r';
942 if (a->w)
943 acc[k++] = 'w';
944 if (a->m)
945 acc[k++] = 'm';
fb385181 946
4ad49000
LP
947 if (k == 0)
948 continue;
fb385181 949
4ad49000 950 acc[k++] = 0;
90060676 951
27458ed6 952 if (path_startswith(a->path, "/dev/"))
90060676 953 whitelist_device(path, a->path, acc);
fb4650aa
ZJS
954 else if ((val = startswith(a->path, "block-")))
955 whitelist_major(path, val, 'b', acc);
956 else if ((val = startswith(a->path, "char-")))
957 whitelist_major(path, val, 'c', acc);
90060676 958 else
f29ff115 959 log_unit_debug(u, "Ignoring device %s while writing cgroup attribute.", a->path);
4ad49000
LP
960 }
961 }
03a7b521
LP
962
963 if ((mask & CGROUP_MASK_PIDS) && !is_root) {
964
f5058264 965 if (c->tasks_max != CGROUP_LIMIT_MAX) {
03a7b521
LP
966 char buf[DECIMAL_STR_MAX(uint64_t) + 2];
967
968 sprintf(buf, "%" PRIu64 "\n", c->tasks_max);
969 r = cg_set_attribute("pids", path, "pids.max", buf);
970 } else
971 r = cg_set_attribute("pids", path, "pids.max", "max");
972
973 if (r < 0)
f29ff115
TH
974 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
975 "Failed to set pids.max: %m");
03a7b521 976 }
fb385181
LP
977}
978
efdb0237
LP
979CGroupMask cgroup_context_get_mask(CGroupContext *c) {
980 CGroupMask mask = 0;
8e274523 981
4ad49000 982 /* Figure out which controllers we need */
8e274523 983
b2f8b02e 984 if (c->cpu_accounting ||
66ebf6c0
TH
985 cgroup_context_has_cpu_weight(c) ||
986 cgroup_context_has_cpu_shares(c) ||
3a43da28 987 c->cpu_quota_per_sec_usec != USEC_INFINITY)
efdb0237 988 mask |= CGROUP_MASK_CPUACCT | CGROUP_MASK_CPU;
ecedd90f 989
538b4852
TH
990 if (cgroup_context_has_io_config(c) || cgroup_context_has_blockio_config(c))
991 mask |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
ecedd90f 992
4ad49000 993 if (c->memory_accounting ||
da4d897e
TH
994 c->memory_limit != CGROUP_LIMIT_MAX ||
995 cgroup_context_has_unified_memory_config(c))
efdb0237 996 mask |= CGROUP_MASK_MEMORY;
8e274523 997
a931ad47
LP
998 if (c->device_allow ||
999 c->device_policy != CGROUP_AUTO)
3905f127 1000 mask |= CGROUP_MASK_DEVICES;
4ad49000 1001
03a7b521
LP
1002 if (c->tasks_accounting ||
1003 c->tasks_max != (uint64_t) -1)
1004 mask |= CGROUP_MASK_PIDS;
1005
4ad49000 1006 return mask;
8e274523
LP
1007}
1008
efdb0237 1009CGroupMask unit_get_own_mask(Unit *u) {
4ad49000 1010 CGroupContext *c;
8e274523 1011
efdb0237
LP
1012 /* Returns the mask of controllers the unit needs for itself */
1013
4ad49000
LP
1014 c = unit_get_cgroup_context(u);
1015 if (!c)
1016 return 0;
8e274523 1017
a931ad47 1018 /* If delegation is turned on, then turn on all cgroups,
19af675e
LP
1019 * unless we are on the legacy hierarchy and the process we
1020 * fork into it is known to drop privileges, and hence
1021 * shouldn't get access to the controllers.
1022 *
1023 * Note that on the unified hierarchy it is safe to delegate
1024 * controllers to unprivileged services. */
a931ad47
LP
1025
1026 if (c->delegate) {
1027 ExecContext *e;
1028
1029 e = unit_get_exec_context(u);
19af675e
LP
1030 if (!e ||
1031 exec_context_maintains_privileges(e) ||
b4cccbc1 1032 cg_all_unified() > 0)
efdb0237 1033 return _CGROUP_MASK_ALL;
a931ad47
LP
1034 }
1035
db785129 1036 return cgroup_context_get_mask(c);
8e274523
LP
1037}
1038
efdb0237 1039CGroupMask unit_get_members_mask(Unit *u) {
4ad49000 1040 assert(u);
bc432dc7 1041
efdb0237
LP
1042 /* Returns the mask of controllers all of the unit's children
1043 * require, merged */
1044
bc432dc7
LP
1045 if (u->cgroup_members_mask_valid)
1046 return u->cgroup_members_mask;
1047
1048 u->cgroup_members_mask = 0;
1049
1050 if (u->type == UNIT_SLICE) {
1051 Unit *member;
1052 Iterator i;
1053
1054 SET_FOREACH(member, u->dependencies[UNIT_BEFORE], i) {
1055
1056 if (member == u)
1057 continue;
1058
d4fdc205 1059 if (UNIT_DEREF(member->slice) != u)
bc432dc7
LP
1060 continue;
1061
1062 u->cgroup_members_mask |=
efdb0237 1063 unit_get_own_mask(member) |
bc432dc7
LP
1064 unit_get_members_mask(member);
1065 }
1066 }
1067
1068 u->cgroup_members_mask_valid = true;
6414b7c9 1069 return u->cgroup_members_mask;
246aa6dd
LP
1070}
1071
efdb0237 1072CGroupMask unit_get_siblings_mask(Unit *u) {
4ad49000 1073 assert(u);
246aa6dd 1074
efdb0237
LP
1075 /* Returns the mask of controllers all of the unit's siblings
1076 * require, i.e. the members mask of the unit's parent slice
1077 * if there is one. */
1078
bc432dc7 1079 if (UNIT_ISSET(u->slice))
637f421e 1080 return unit_get_members_mask(UNIT_DEREF(u->slice));
4ad49000 1081
efdb0237 1082 return unit_get_own_mask(u) | unit_get_members_mask(u);
246aa6dd
LP
1083}
1084
efdb0237
LP
1085CGroupMask unit_get_subtree_mask(Unit *u) {
1086
1087 /* Returns the mask of this subtree, meaning of the group
1088 * itself and its children. */
1089
1090 return unit_get_own_mask(u) | unit_get_members_mask(u);
1091}
1092
1093CGroupMask unit_get_target_mask(Unit *u) {
1094 CGroupMask mask;
1095
1096 /* This returns the cgroup mask of all controllers to enable
1097 * for a specific cgroup, i.e. everything it needs itself,
1098 * plus all that its children need, plus all that its siblings
1099 * need. This is primarily useful on the legacy cgroup
1100 * hierarchy, where we need to duplicate each cgroup in each
1101 * hierarchy that shall be enabled for it. */
6414b7c9 1102
efdb0237
LP
1103 mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
1104 mask &= u->manager->cgroup_supported;
1105
1106 return mask;
1107}
1108
1109CGroupMask unit_get_enable_mask(Unit *u) {
1110 CGroupMask mask;
1111
1112 /* This returns the cgroup mask of all controllers to enable
1113 * for the children of a specific cgroup. This is primarily
1114 * useful for the unified cgroup hierarchy, where each cgroup
1115 * controls which controllers are enabled for its children. */
1116
1117 mask = unit_get_members_mask(u);
6414b7c9
DS
1118 mask &= u->manager->cgroup_supported;
1119
1120 return mask;
1121}
1122
1123/* Recurse from a unit up through its containing slices, propagating
1124 * mask bits upward. A unit is also member of itself. */
bc432dc7 1125void unit_update_cgroup_members_masks(Unit *u) {
efdb0237 1126 CGroupMask m;
bc432dc7
LP
1127 bool more;
1128
1129 assert(u);
1130
1131 /* Calculate subtree mask */
efdb0237 1132 m = unit_get_subtree_mask(u);
bc432dc7
LP
1133
1134 /* See if anything changed from the previous invocation. If
1135 * not, we're done. */
1136 if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
1137 return;
1138
1139 more =
1140 u->cgroup_subtree_mask_valid &&
1141 ((m & ~u->cgroup_subtree_mask) != 0) &&
1142 ((~m & u->cgroup_subtree_mask) == 0);
1143
1144 u->cgroup_subtree_mask = m;
1145 u->cgroup_subtree_mask_valid = true;
1146
6414b7c9
DS
1147 if (UNIT_ISSET(u->slice)) {
1148 Unit *s = UNIT_DEREF(u->slice);
bc432dc7
LP
1149
1150 if (more)
1151 /* There's more set now than before. We
1152 * propagate the new mask to the parent's mask
1153 * (not caring if it actually was valid or
1154 * not). */
1155
1156 s->cgroup_members_mask |= m;
1157
1158 else
1159 /* There's less set now than before (or we
1160 * don't know), we need to recalculate
1161 * everything, so let's invalidate the
1162 * parent's members mask */
1163
1164 s->cgroup_members_mask_valid = false;
1165
1166 /* And now make sure that this change also hits our
1167 * grandparents */
1168 unit_update_cgroup_members_masks(s);
6414b7c9
DS
1169 }
1170}
1171
efdb0237 1172static const char *migrate_callback(CGroupMask mask, void *userdata) {
03b90d4b
LP
1173 Unit *u = userdata;
1174
1175 assert(mask != 0);
1176 assert(u);
1177
1178 while (u) {
1179 if (u->cgroup_path &&
1180 u->cgroup_realized &&
1181 (u->cgroup_realized_mask & mask) == mask)
1182 return u->cgroup_path;
1183
1184 u = UNIT_DEREF(u->slice);
1185 }
1186
1187 return NULL;
1188}
1189
efdb0237
LP
1190char *unit_default_cgroup_path(Unit *u) {
1191 _cleanup_free_ char *escaped = NULL, *slice = NULL;
1192 int r;
1193
1194 assert(u);
1195
1196 if (unit_has_name(u, SPECIAL_ROOT_SLICE))
1197 return strdup(u->manager->cgroup_root);
1198
1199 if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) {
1200 r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice);
1201 if (r < 0)
1202 return NULL;
1203 }
1204
1205 escaped = cg_escape(u->id);
1206 if (!escaped)
1207 return NULL;
1208
1209 if (slice)
605405c6
ZJS
1210 return strjoin(u->manager->cgroup_root, "/", slice, "/",
1211 escaped);
efdb0237 1212 else
605405c6 1213 return strjoin(u->manager->cgroup_root, "/", escaped);
efdb0237
LP
1214}
1215
1216int unit_set_cgroup_path(Unit *u, const char *path) {
1217 _cleanup_free_ char *p = NULL;
1218 int r;
1219
1220 assert(u);
1221
1222 if (path) {
1223 p = strdup(path);
1224 if (!p)
1225 return -ENOMEM;
1226 } else
1227 p = NULL;
1228
1229 if (streq_ptr(u->cgroup_path, p))
1230 return 0;
1231
1232 if (p) {
1233 r = hashmap_put(u->manager->cgroup_unit, p, u);
1234 if (r < 0)
1235 return r;
1236 }
1237
1238 unit_release_cgroup(u);
1239
1240 u->cgroup_path = p;
1241 p = NULL;
1242
1243 return 1;
1244}
1245
1246int unit_watch_cgroup(Unit *u) {
ab2c3861 1247 _cleanup_free_ char *events = NULL;
efdb0237
LP
1248 int r;
1249
1250 assert(u);
1251
1252 if (!u->cgroup_path)
1253 return 0;
1254
1255 if (u->cgroup_inotify_wd >= 0)
1256 return 0;
1257
1258 /* Only applies to the unified hierarchy */
c22800e4 1259 r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
b4cccbc1
LP
1260 if (r < 0)
1261 return log_error_errno(r, "Failed to determine whether the name=systemd hierarchy is unified: %m");
1262 if (r == 0)
efdb0237
LP
1263 return 0;
1264
1265 /* Don't watch the root slice, it's pointless. */
1266 if (unit_has_name(u, SPECIAL_ROOT_SLICE))
1267 return 0;
1268
1269 r = hashmap_ensure_allocated(&u->manager->cgroup_inotify_wd_unit, &trivial_hash_ops);
1270 if (r < 0)
1271 return log_oom();
1272
ab2c3861 1273 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events", &events);
efdb0237
LP
1274 if (r < 0)
1275 return log_oom();
1276
ab2c3861 1277 u->cgroup_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY);
efdb0237
LP
1278 if (u->cgroup_inotify_wd < 0) {
1279
1280 if (errno == ENOENT) /* If the directory is already
1281 * gone we don't need to track
1282 * it, so this is not an error */
1283 return 0;
1284
1285 return log_unit_error_errno(u, errno, "Failed to add inotify watch descriptor for control group %s: %m", u->cgroup_path);
1286 }
1287
1288 r = hashmap_put(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd), u);
1289 if (r < 0)
1290 return log_unit_error_errno(u, r, "Failed to add inotify watch descriptor to hash map: %m");
1291
1292 return 0;
1293}
1294
1295static int unit_create_cgroup(
1296 Unit *u,
1297 CGroupMask target_mask,
1298 CGroupMask enable_mask) {
1299
0cd385d3 1300 CGroupContext *c;
bc432dc7 1301 int r;
64747e2d 1302
4ad49000 1303 assert(u);
64747e2d 1304
0cd385d3
LP
1305 c = unit_get_cgroup_context(u);
1306 if (!c)
1307 return 0;
1308
7b3fd631
LP
1309 if (!u->cgroup_path) {
1310 _cleanup_free_ char *path = NULL;
64747e2d 1311
7b3fd631
LP
1312 path = unit_default_cgroup_path(u);
1313 if (!path)
1314 return log_oom();
1315
efdb0237
LP
1316 r = unit_set_cgroup_path(u, path);
1317 if (r == -EEXIST)
1318 return log_unit_error_errno(u, r, "Control group %s exists already.", path);
1319 if (r < 0)
1320 return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", path);
b58b8e11
HH
1321 }
1322
03b90d4b 1323 /* First, create our own group */
efdb0237 1324 r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
23bbb0de 1325 if (r < 0)
efdb0237
LP
1326 return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", u->cgroup_path);
1327
1328 /* Start watching it */
1329 (void) unit_watch_cgroup(u);
1330
1331 /* Enable all controllers we need */
1332 r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path);
1333 if (r < 0)
1334 log_unit_warning_errno(u, r, "Failed to enable controllers on cgroup %s, ignoring: %m", u->cgroup_path);
03b90d4b
LP
1335
1336 /* Keep track that this is now realized */
4ad49000 1337 u->cgroup_realized = true;
efdb0237 1338 u->cgroup_realized_mask = target_mask;
ccf78df1 1339 u->cgroup_enabled_mask = enable_mask;
4ad49000 1340
0cd385d3
LP
1341 if (u->type != UNIT_SLICE && !c->delegate) {
1342
1343 /* Then, possibly move things over, but not if
1344 * subgroups may contain processes, which is the case
1345 * for slice and delegation units. */
1346 r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
1347 if (r < 0)
efdb0237 1348 log_unit_warning_errno(u, r, "Failed to migrate cgroup from to %s, ignoring: %m", u->cgroup_path);
0cd385d3 1349 }
03b90d4b 1350
64747e2d
LP
1351 return 0;
1352}
1353
7b3fd631
LP
1354int unit_attach_pids_to_cgroup(Unit *u) {
1355 int r;
1356 assert(u);
1357
1358 r = unit_realize_cgroup(u);
1359 if (r < 0)
1360 return r;
1361
1362 r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->pids, migrate_callback, u);
1363 if (r < 0)
1364 return r;
1365
1366 return 0;
1367}
1368
4b58153d
LP
1369static void cgroup_xattr_apply(Unit *u) {
1370 char ids[SD_ID128_STRING_MAX];
1371 int r;
1372
1373 assert(u);
1374
1375 if (!MANAGER_IS_SYSTEM(u->manager))
1376 return;
1377
1378 if (sd_id128_is_null(u->invocation_id))
1379 return;
1380
1381 r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path,
1382 "trusted.invocation_id",
1383 sd_id128_to_string(u->invocation_id, ids), 32,
1384 0);
1385 if (r < 0)
1386 log_unit_warning_errno(u, r, "Failed to set invocation ID on control group %s, ignoring: %m", u->cgroup_path);
1387}
1388
ccf78df1 1389static bool unit_has_mask_realized(Unit *u, CGroupMask target_mask, CGroupMask enable_mask) {
bc432dc7
LP
1390 assert(u);
1391
ccf78df1 1392 return u->cgroup_realized && u->cgroup_realized_mask == target_mask && u->cgroup_enabled_mask == enable_mask;
6414b7c9
DS
1393}
1394
1395/* Check if necessary controllers and attributes for a unit are in place.
1396 *
1397 * If so, do nothing.
1398 * If not, create paths, move processes over, and set attributes.
1399 *
1400 * Returns 0 on success and < 0 on failure. */
db785129 1401static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
efdb0237 1402 CGroupMask target_mask, enable_mask;
6414b7c9 1403 int r;
64747e2d 1404
4ad49000 1405 assert(u);
64747e2d 1406
4ad49000 1407 if (u->in_cgroup_queue) {
71fda00f 1408 LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
4ad49000
LP
1409 u->in_cgroup_queue = false;
1410 }
64747e2d 1411
efdb0237 1412 target_mask = unit_get_target_mask(u);
ccf78df1
TH
1413 enable_mask = unit_get_enable_mask(u);
1414
1415 if (unit_has_mask_realized(u, target_mask, enable_mask))
0a1eb06d 1416 return 0;
64747e2d 1417
4ad49000 1418 /* First, realize parents */
6414b7c9 1419 if (UNIT_ISSET(u->slice)) {
db785129 1420 r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
6414b7c9
DS
1421 if (r < 0)
1422 return r;
1423 }
4ad49000
LP
1424
1425 /* And then do the real work */
efdb0237 1426 r = unit_create_cgroup(u, target_mask, enable_mask);
6414b7c9
DS
1427 if (r < 0)
1428 return r;
1429
1430 /* Finally, apply the necessary attributes. */
f29ff115 1431 cgroup_context_apply(u, target_mask, state);
4b58153d 1432 cgroup_xattr_apply(u);
6414b7c9
DS
1433
1434 return 0;
64747e2d
LP
1435}
1436
4ad49000 1437static void unit_add_to_cgroup_queue(Unit *u) {
ecedd90f 1438
4ad49000
LP
1439 if (u->in_cgroup_queue)
1440 return;
8e274523 1441
71fda00f 1442 LIST_PREPEND(cgroup_queue, u->manager->cgroup_queue, u);
4ad49000
LP
1443 u->in_cgroup_queue = true;
1444}
8c6db833 1445
4ad49000 1446unsigned manager_dispatch_cgroup_queue(Manager *m) {
db785129 1447 ManagerState state;
4ad49000 1448 unsigned n = 0;
db785129 1449 Unit *i;
6414b7c9 1450 int r;
ecedd90f 1451
db785129
LP
1452 state = manager_state(m);
1453
4ad49000
LP
1454 while ((i = m->cgroup_queue)) {
1455 assert(i->in_cgroup_queue);
ecedd90f 1456
db785129 1457 r = unit_realize_cgroup_now(i, state);
6414b7c9 1458 if (r < 0)
efdb0237 1459 log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);
0a1eb06d 1460
4ad49000
LP
1461 n++;
1462 }
ecedd90f 1463
4ad49000 1464 return n;
8e274523
LP
1465}
1466
4ad49000
LP
1467static void unit_queue_siblings(Unit *u) {
1468 Unit *slice;
ca949c9d 1469
4ad49000
LP
1470 /* This adds the siblings of the specified unit and the
1471 * siblings of all parent units to the cgroup queue. (But
1472 * neither the specified unit itself nor the parents.) */
1473
1474 while ((slice = UNIT_DEREF(u->slice))) {
1475 Iterator i;
1476 Unit *m;
8f53a7b8 1477
4ad49000
LP
1478 SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
1479 if (m == u)
1480 continue;
8e274523 1481
6414b7c9
DS
1482 /* Skip units that have a dependency on the slice
1483 * but aren't actually in it. */
4ad49000 1484 if (UNIT_DEREF(m->slice) != slice)
50159e6a 1485 continue;
8e274523 1486
6414b7c9
DS
1487 /* No point in doing cgroup application for units
1488 * without active processes. */
1489 if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
1490 continue;
1491
1492 /* If the unit doesn't need any new controllers
1493 * and has current ones realized, it doesn't need
1494 * any changes. */
ccf78df1 1495 if (unit_has_mask_realized(m, unit_get_target_mask(m), unit_get_enable_mask(m)))
6414b7c9
DS
1496 continue;
1497
4ad49000 1498 unit_add_to_cgroup_queue(m);
50159e6a
LP
1499 }
1500
4ad49000 1501 u = slice;
8e274523 1502 }
4ad49000
LP
1503}
1504
0a1eb06d 1505int unit_realize_cgroup(Unit *u) {
4ad49000
LP
1506 assert(u);
1507
35b7ff80 1508 if (!UNIT_HAS_CGROUP_CONTEXT(u))
0a1eb06d 1509 return 0;
8e274523 1510
4ad49000
LP
1511 /* So, here's the deal: when realizing the cgroups for this
1512 * unit, we need to first create all parents, but there's more
1513 * actually: for the weight-based controllers we also need to
1514 * make sure that all our siblings (i.e. units that are in the
73e231ab 1515 * same slice as we are) have cgroups, too. Otherwise, things
4ad49000
LP
1516 * would become very uneven as each of their processes would
1517 * get as much resources as all our group together. This call
1518 * will synchronously create the parent cgroups, but will
1519 * defer work on the siblings to the next event loop
1520 * iteration. */
ca949c9d 1521
4ad49000
LP
1522 /* Add all sibling slices to the cgroup queue. */
1523 unit_queue_siblings(u);
1524
6414b7c9 1525 /* And realize this one now (and apply the values) */
db785129 1526 return unit_realize_cgroup_now(u, manager_state(u->manager));
8e274523
LP
1527}
1528
efdb0237
LP
1529void unit_release_cgroup(Unit *u) {
1530 assert(u);
1531
1532 /* Forgets all cgroup details for this cgroup */
1533
1534 if (u->cgroup_path) {
1535 (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
1536 u->cgroup_path = mfree(u->cgroup_path);
1537 }
1538
1539 if (u->cgroup_inotify_wd >= 0) {
1540 if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_inotify_wd) < 0)
1541 log_unit_debug_errno(u, errno, "Failed to remove cgroup inotify watch %i for %s, ignoring", u->cgroup_inotify_wd, u->id);
1542
1543 (void) hashmap_remove(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd));
1544 u->cgroup_inotify_wd = -1;
1545 }
1546}
1547
1548void unit_prune_cgroup(Unit *u) {
8e274523 1549 int r;
efdb0237 1550 bool is_root_slice;
8e274523 1551
4ad49000 1552 assert(u);
8e274523 1553
efdb0237
LP
1554 /* Removes the cgroup, if empty and possible, and stops watching it. */
1555
4ad49000
LP
1556 if (!u->cgroup_path)
1557 return;
8e274523 1558
fe700f46
LP
1559 (void) unit_get_cpu_usage(u, NULL); /* Cache the last CPU usage value before we destroy the cgroup */
1560
efdb0237
LP
1561 is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);
1562
1563 r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
dab5bf85 1564 if (r < 0) {
f29ff115 1565 log_unit_debug_errno(u, r, "Failed to destroy cgroup %s, ignoring: %m", u->cgroup_path);
dab5bf85
RL
1566 return;
1567 }
8e274523 1568
efdb0237
LP
1569 if (is_root_slice)
1570 return;
1571
1572 unit_release_cgroup(u);
0a1eb06d 1573
4ad49000 1574 u->cgroup_realized = false;
bc432dc7 1575 u->cgroup_realized_mask = 0;
ccf78df1 1576 u->cgroup_enabled_mask = 0;
8e274523
LP
1577}
1578
efdb0237 1579int unit_search_main_pid(Unit *u, pid_t *ret) {
4ad49000
LP
1580 _cleanup_fclose_ FILE *f = NULL;
1581 pid_t pid = 0, npid, mypid;
efdb0237 1582 int r;
4ad49000
LP
1583
1584 assert(u);
efdb0237 1585 assert(ret);
4ad49000
LP
1586
1587 if (!u->cgroup_path)
efdb0237 1588 return -ENXIO;
4ad49000 1589
efdb0237
LP
1590 r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f);
1591 if (r < 0)
1592 return r;
4ad49000 1593
df0ff127 1594 mypid = getpid_cached();
4ad49000
LP
1595 while (cg_read_pid(f, &npid) > 0) {
1596 pid_t ppid;
1597
1598 if (npid == pid)
1599 continue;
8e274523 1600
4ad49000 1601 /* Ignore processes that aren't our kids */
6bc73acb 1602 if (get_process_ppid(npid, &ppid) >= 0 && ppid != mypid)
4ad49000 1603 continue;
8e274523 1604
efdb0237 1605 if (pid != 0)
4ad49000
LP
1606 /* Dang, there's more than one daemonized PID
1607 in this group, so we don't know what process
1608 is the main process. */
efdb0237
LP
1609
1610 return -ENODATA;
8e274523 1611
4ad49000 1612 pid = npid;
8e274523
LP
1613 }
1614
efdb0237
LP
1615 *ret = pid;
1616 return 0;
1617}
1618
1619static int unit_watch_pids_in_path(Unit *u, const char *path) {
b3c5bad3 1620 _cleanup_closedir_ DIR *d = NULL;
efdb0237
LP
1621 _cleanup_fclose_ FILE *f = NULL;
1622 int ret = 0, r;
1623
1624 assert(u);
1625 assert(path);
1626
1627 r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
1628 if (r < 0)
1629 ret = r;
1630 else {
1631 pid_t pid;
1632
1633 while ((r = cg_read_pid(f, &pid)) > 0) {
1634 r = unit_watch_pid(u, pid);
1635 if (r < 0 && ret >= 0)
1636 ret = r;
1637 }
1638
1639 if (r < 0 && ret >= 0)
1640 ret = r;
1641 }
1642
1643 r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
1644 if (r < 0) {
1645 if (ret >= 0)
1646 ret = r;
1647 } else {
1648 char *fn;
1649
1650 while ((r = cg_read_subgroup(d, &fn)) > 0) {
1651 _cleanup_free_ char *p = NULL;
1652
605405c6 1653 p = strjoin(path, "/", fn);
efdb0237
LP
1654 free(fn);
1655
1656 if (!p)
1657 return -ENOMEM;
1658
1659 r = unit_watch_pids_in_path(u, p);
1660 if (r < 0 && ret >= 0)
1661 ret = r;
1662 }
1663
1664 if (r < 0 && ret >= 0)
1665 ret = r;
1666 }
1667
1668 return ret;
1669}
1670
1671int unit_watch_all_pids(Unit *u) {
b4cccbc1
LP
1672 int r;
1673
efdb0237
LP
1674 assert(u);
1675
1676 /* Adds all PIDs from our cgroup to the set of PIDs we
1677 * watch. This is a fallback logic for cases where we do not
1678 * get reliable cgroup empty notifications: we try to use
1679 * SIGCHLD as replacement. */
1680
1681 if (!u->cgroup_path)
1682 return -ENOENT;
1683
c22800e4 1684 r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
b4cccbc1
LP
1685 if (r < 0)
1686 return r;
1687 if (r > 0) /* On unified we can use proper notifications */
efdb0237
LP
1688 return 0;
1689
1690 return unit_watch_pids_in_path(u, u->cgroup_path);
1691}
1692
1693int unit_notify_cgroup_empty(Unit *u) {
1694 int r;
1695
1696 assert(u);
1697
1698 if (!u->cgroup_path)
1699 return 0;
1700
1701 r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
1702 if (r <= 0)
1703 return r;
1704
1705 unit_add_to_gc_queue(u);
1706
1707 if (UNIT_VTABLE(u)->notify_cgroup_empty)
1708 UNIT_VTABLE(u)->notify_cgroup_empty(u);
1709
1710 return 0;
1711}
1712
1713static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
1714 Manager *m = userdata;
1715
1716 assert(s);
1717 assert(fd >= 0);
1718 assert(m);
1719
1720 for (;;) {
1721 union inotify_event_buffer buffer;
1722 struct inotify_event *e;
1723 ssize_t l;
1724
1725 l = read(fd, &buffer, sizeof(buffer));
1726 if (l < 0) {
1727 if (errno == EINTR || errno == EAGAIN)
1728 return 0;
1729
1730 return log_error_errno(errno, "Failed to read control group inotify events: %m");
1731 }
1732
1733 FOREACH_INOTIFY_EVENT(e, buffer, l) {
1734 Unit *u;
1735
1736 if (e->wd < 0)
1737 /* Queue overflow has no watch descriptor */
1738 continue;
1739
1740 if (e->mask & IN_IGNORED)
1741 /* The watch was just removed */
1742 continue;
1743
1744 u = hashmap_get(m->cgroup_inotify_wd_unit, INT_TO_PTR(e->wd));
1745 if (!u) /* Not that inotify might deliver
1746 * events for a watch even after it
1747 * was removed, because it was queued
1748 * before the removal. Let's ignore
1749 * this here safely. */
1750 continue;
1751
1752 (void) unit_notify_cgroup_empty(u);
1753 }
1754 }
8e274523
LP
1755}
1756
8e274523 1757int manager_setup_cgroup(Manager *m) {
9444b1f2 1758 _cleanup_free_ char *path = NULL;
efdb0237 1759 CGroupController c;
b4cccbc1 1760 int r, all_unified;
efdb0237 1761 char *e;
8e274523
LP
1762
1763 assert(m);
1764
35d2e7ec 1765 /* 1. Determine hierarchy */
efdb0237 1766 m->cgroup_root = mfree(m->cgroup_root);
9444b1f2 1767 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
23bbb0de
MS
1768 if (r < 0)
1769 return log_error_errno(r, "Cannot determine cgroup we are running in: %m");
8e274523 1770
efdb0237
LP
1771 /* Chop off the init scope, if we are already located in it */
1772 e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
0d8c31ff 1773
efdb0237
LP
1774 /* LEGACY: Also chop off the system slice if we are in
1775 * it. This is to support live upgrades from older systemd
1776 * versions where PID 1 was moved there. Also see
1777 * cg_get_root_path(). */
463d0d15 1778 if (!e && MANAGER_IS_SYSTEM(m)) {
9444b1f2 1779 e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
15c60e99 1780 if (!e)
efdb0237 1781 e = endswith(m->cgroup_root, "/system"); /* even more legacy */
0baf24dd 1782 }
efdb0237
LP
1783 if (e)
1784 *e = 0;
7ccfb64a 1785
9444b1f2
LP
1786 /* And make sure to store away the root value without trailing
1787 * slash, even for the root dir, so that we can easily prepend
1788 * it everywhere. */
efdb0237
LP
1789 while ((e = endswith(m->cgroup_root, "/")))
1790 *e = 0;
8e274523 1791
35d2e7ec 1792 /* 2. Show data */
9444b1f2 1793 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
23bbb0de
MS
1794 if (r < 0)
1795 return log_error_errno(r, "Cannot find cgroup mount point: %m");
8e274523 1796
415fc41c
TH
1797 r = cg_unified_flush();
1798 if (r < 0)
1799 return log_error_errno(r, "Couldn't determine if we are running in the unified hierarchy: %m");
5da38d07 1800
b4cccbc1
LP
1801 all_unified = cg_all_unified();
1802 if (r < 0)
1803 return log_error_errno(r, "Couldn't determine whether we are in all unified mode: %m");
1804 if (r > 0)
efdb0237 1805 log_debug("Unified cgroup hierarchy is located at %s.", path);
b4cccbc1 1806 else {
c22800e4 1807 r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
b4cccbc1
LP
1808 if (r < 0)
1809 return log_error_errno(r, "Failed to determine whether systemd's own controller is in unified mode: %m");
1810 if (r > 0)
1811 log_debug("Unified cgroup hierarchy is located at %s. Controllers are on legacy hierarchies.", path);
1812 else
1813 log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER_LEGACY ". File system hierarchy is at %s.", path);
1814 }
efdb0237 1815
0d8c31ff 1816 if (!m->test_run) {
efdb0237 1817 const char *scope_path;
c6c18be3 1818
0d8c31ff 1819 /* 3. Install agent */
c22800e4 1820 if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
efdb0237 1821
61233823 1822 /* In the unified hierarchy we can get
efdb0237
LP
1823 * cgroup empty notifications via inotify. */
1824
1825 m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
1826 safe_close(m->cgroup_inotify_fd);
1827
1828 m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
1829 if (m->cgroup_inotify_fd < 0)
1830 return log_error_errno(errno, "Failed to create control group inotify object: %m");
1831
1832 r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
1833 if (r < 0)
1834 return log_error_errno(r, "Failed to watch control group inotify object: %m");
1835
d8fdc620
LP
1836 /* Process cgroup empty notifications early, but after service notifications and SIGCHLD. Also
1837 * see handling of cgroup agent notifications, for the classic cgroup hierarchy support. */
1838 r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-5);
efdb0237
LP
1839 if (r < 0)
1840 return log_error_errno(r, "Failed to set priority of inotify event source: %m");
1841
1842 (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");
1843
463d0d15 1844 } else if (MANAGER_IS_SYSTEM(m)) {
efdb0237
LP
1845
1846 /* On the legacy hierarchy we only get
1847 * notifications via cgroup agents. (Which
1848 * isn't really reliable, since it does not
1849 * generate events when control groups with
1850 * children run empty. */
1851
0d8c31ff
ZJS
1852 r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
1853 if (r < 0)
da927ba9 1854 log_warning_errno(r, "Failed to install release agent, ignoring: %m");
0d8c31ff
ZJS
1855 else if (r > 0)
1856 log_debug("Installed release agent.");
efdb0237 1857 else if (r == 0)
0d8c31ff
ZJS
1858 log_debug("Release agent already installed.");
1859 }
8e274523 1860
efdb0237
LP
1861 /* 4. Make sure we are in the special "init.scope" unit in the root slice. */
1862 scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
1863 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
23bbb0de 1864 if (r < 0)
efdb0237
LP
1865 return log_error_errno(r, "Failed to create %s control group: %m", scope_path);
1866
1867 /* also, move all other userspace processes remaining
1868 * in the root cgroup into that scope. */
1d98fef1 1869 r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
efdb0237
LP
1870 if (r < 0)
1871 log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");
c6c18be3 1872
0d8c31ff
ZJS
1873 /* 5. And pin it, so that it cannot be unmounted */
1874 safe_close(m->pin_cgroupfs_fd);
0d8c31ff 1875 m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
4a62c710
MS
1876 if (m->pin_cgroupfs_fd < 0)
1877 return log_error_errno(errno, "Failed to open pin file: %m");
0d8c31ff 1878
cc98b302 1879 /* 6. Always enable hierarchical support if it exists... */
b4cccbc1 1880 if (!all_unified)
efdb0237 1881 (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
c6c18be3
LP
1882 }
1883
0d8c31ff 1884 /* 7. Figure out which controllers are supported */
efdb0237
LP
1885 r = cg_mask_supported(&m->cgroup_supported);
1886 if (r < 0)
1887 return log_error_errno(r, "Failed to determine supported controllers: %m");
1888
1889 for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
eee0a1e4 1890 log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c)));
9156e799 1891
a32360f1 1892 return 0;
8e274523
LP
1893}
1894
c6c18be3 1895void manager_shutdown_cgroup(Manager *m, bool delete) {
8e274523
LP
1896 assert(m);
1897
9444b1f2
LP
1898 /* We can't really delete the group, since we are in it. But
1899 * let's trim it. */
1900 if (delete && m->cgroup_root)
efdb0237
LP
1901 (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
1902
1903 m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit);
1904
1905 m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
1906 m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);
8e274523 1907
03e334a1 1908 m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);
c6c18be3 1909
efdb0237 1910 m->cgroup_root = mfree(m->cgroup_root);
8e274523
LP
1911}
1912
4ad49000 1913Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
acb14d31 1914 char *p;
4ad49000 1915 Unit *u;
acb14d31
LP
1916
1917 assert(m);
1918 assert(cgroup);
acb14d31 1919
4ad49000
LP
1920 u = hashmap_get(m->cgroup_unit, cgroup);
1921 if (u)
1922 return u;
acb14d31 1923
8e70580b 1924 p = strdupa(cgroup);
acb14d31
LP
1925 for (;;) {
1926 char *e;
1927
1928 e = strrchr(p, '/');
efdb0237
LP
1929 if (!e || e == p)
1930 return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);
acb14d31
LP
1931
1932 *e = 0;
1933
4ad49000
LP
1934 u = hashmap_get(m->cgroup_unit, p);
1935 if (u)
1936 return u;
acb14d31
LP
1937 }
1938}
1939
b3ac818b 1940Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) {
4ad49000 1941 _cleanup_free_ char *cgroup = NULL;
acb14d31 1942 int r;
8e274523 1943
8c47c732
LP
1944 assert(m);
1945
b3ac818b
LP
1946 if (pid <= 0)
1947 return NULL;
1948
1949 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
1950 if (r < 0)
1951 return NULL;
1952
1953 return manager_get_unit_by_cgroup(m, cgroup);
1954}
1955
1956Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
1957 Unit *u;
1958
1959 assert(m);
1960
efdb0237 1961 if (pid <= 0)
8c47c732
LP
1962 return NULL;
1963
efdb0237
LP
1964 if (pid == 1)
1965 return hashmap_get(m->units, SPECIAL_INIT_SCOPE);
1966
fea72cc0 1967 u = hashmap_get(m->watch_pids1, PID_TO_PTR(pid));
5fe8876b
LP
1968 if (u)
1969 return u;
1970
fea72cc0 1971 u = hashmap_get(m->watch_pids2, PID_TO_PTR(pid));
5fe8876b
LP
1972 if (u)
1973 return u;
1974
b3ac818b 1975 return manager_get_unit_by_pid_cgroup(m, pid);
6dde1f33 1976}
4fbf50b3 1977
4ad49000
LP
1978int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
1979 Unit *u;
4fbf50b3 1980
4ad49000
LP
1981 assert(m);
1982 assert(cgroup);
4fbf50b3 1983
d8fdc620
LP
1984 log_debug("Got cgroup empty notification for: %s", cgroup);
1985
4ad49000 1986 u = manager_get_unit_by_cgroup(m, cgroup);
5ad096b3
LP
1987 if (!u)
1988 return 0;
b56c28c3 1989
efdb0237 1990 return unit_notify_cgroup_empty(u);
5ad096b3
LP
1991}
1992
1993int unit_get_memory_current(Unit *u, uint64_t *ret) {
1994 _cleanup_free_ char *v = NULL;
1995 int r;
1996
1997 assert(u);
1998 assert(ret);
1999
2000 if (!u->cgroup_path)
2001 return -ENODATA;
2002
efdb0237 2003 if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
5ad096b3
LP
2004 return -ENODATA;
2005
b4cccbc1
LP
2006 r = cg_all_unified();
2007 if (r < 0)
2008 return r;
2009 if (r > 0)
efdb0237 2010 r = cg_get_attribute("memory", u->cgroup_path, "memory.current", &v);
b4cccbc1
LP
2011 else
2012 r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
5ad096b3
LP
2013 if (r == -ENOENT)
2014 return -ENODATA;
2015 if (r < 0)
2016 return r;
2017
2018 return safe_atou64(v, ret);
2019}
2020
03a7b521
LP
2021int unit_get_tasks_current(Unit *u, uint64_t *ret) {
2022 _cleanup_free_ char *v = NULL;
2023 int r;
2024
2025 assert(u);
2026 assert(ret);
2027
2028 if (!u->cgroup_path)
2029 return -ENODATA;
2030
2031 if ((u->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0)
2032 return -ENODATA;
2033
2034 r = cg_get_attribute("pids", u->cgroup_path, "pids.current", &v);
2035 if (r == -ENOENT)
2036 return -ENODATA;
2037 if (r < 0)
2038 return r;
2039
2040 return safe_atou64(v, ret);
2041}
2042
5ad096b3
LP
2043static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
2044 _cleanup_free_ char *v = NULL;
2045 uint64_t ns;
2046 int r;
2047
2048 assert(u);
2049 assert(ret);
2050
2051 if (!u->cgroup_path)
2052 return -ENODATA;
2053
b4cccbc1
LP
2054 r = cg_all_unified();
2055 if (r < 0)
2056 return r;
2057 if (r > 0) {
66ebf6c0
TH
2058 const char *keys[] = { "usage_usec", NULL };
2059 _cleanup_free_ char *val = NULL;
2060 uint64_t us;
5ad096b3 2061
66ebf6c0
TH
2062 if ((u->cgroup_realized_mask & CGROUP_MASK_CPU) == 0)
2063 return -ENODATA;
5ad096b3 2064
66ebf6c0
TH
2065 r = cg_get_keyed_attribute("cpu", u->cgroup_path, "cpu.stat", keys, &val);
2066 if (r < 0)
2067 return r;
2068
2069 r = safe_atou64(val, &us);
2070 if (r < 0)
2071 return r;
2072
2073 ns = us * NSEC_PER_USEC;
2074 } else {
2075 if ((u->cgroup_realized_mask & CGROUP_MASK_CPUACCT) == 0)
2076 return -ENODATA;
2077
2078 r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);
2079 if (r == -ENOENT)
2080 return -ENODATA;
2081 if (r < 0)
2082 return r;
2083
2084 r = safe_atou64(v, &ns);
2085 if (r < 0)
2086 return r;
2087 }
5ad096b3
LP
2088
2089 *ret = ns;
2090 return 0;
2091}
2092
2093int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
2094 nsec_t ns;
2095 int r;
2096
fe700f46
LP
2097 assert(u);
2098
2099 /* Retrieve the current CPU usage counter. This will subtract the CPU counter taken when the unit was
2100 * started. If the cgroup has been removed already, returns the last cached value. To cache the value, simply
2101 * call this function with a NULL return value. */
2102
5ad096b3 2103 r = unit_get_cpu_usage_raw(u, &ns);
fe700f46
LP
2104 if (r == -ENODATA && u->cpu_usage_last != NSEC_INFINITY) {
2105 /* If we can't get the CPU usage anymore (because the cgroup was already removed, for example), use our
2106 * cached value. */
2107
2108 if (ret)
2109 *ret = u->cpu_usage_last;
2110 return 0;
2111 }
5ad096b3
LP
2112 if (r < 0)
2113 return r;
2114
66ebf6c0
TH
2115 if (ns > u->cpu_usage_base)
2116 ns -= u->cpu_usage_base;
5ad096b3
LP
2117 else
2118 ns = 0;
2119
fe700f46
LP
2120 u->cpu_usage_last = ns;
2121 if (ret)
2122 *ret = ns;
2123
5ad096b3
LP
2124 return 0;
2125}
2126
2127int unit_reset_cpu_usage(Unit *u) {
2128 nsec_t ns;
2129 int r;
2130
2131 assert(u);
2132
fe700f46
LP
2133 u->cpu_usage_last = NSEC_INFINITY;
2134
5ad096b3
LP
2135 r = unit_get_cpu_usage_raw(u, &ns);
2136 if (r < 0) {
66ebf6c0 2137 u->cpu_usage_base = 0;
5ad096b3 2138 return r;
b56c28c3 2139 }
2633eb83 2140
66ebf6c0 2141 u->cpu_usage_base = ns;
4ad49000 2142 return 0;
4fbf50b3
LP
2143}
2144
e9db43d5
LP
2145bool unit_cgroup_delegate(Unit *u) {
2146 CGroupContext *c;
2147
2148 assert(u);
2149
2150 c = unit_get_cgroup_context(u);
2151 if (!c)
2152 return false;
2153
2154 return c->delegate;
2155}
2156
e7ab4d1a
LP
2157void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
2158 assert(u);
2159
2160 if (!UNIT_HAS_CGROUP_CONTEXT(u))
2161 return;
2162
2163 if (m == 0)
2164 return;
2165
538b4852
TH
2166 /* always invalidate compat pairs together */
2167 if (m & (CGROUP_MASK_IO | CGROUP_MASK_BLKIO))
2168 m |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
2169
e7ab4d1a
LP
2170 if ((u->cgroup_realized_mask & m) == 0)
2171 return;
2172
2173 u->cgroup_realized_mask &= ~m;
2174 unit_add_to_cgroup_queue(u);
2175}
2176
2177void manager_invalidate_startup_units(Manager *m) {
2178 Iterator i;
2179 Unit *u;
2180
2181 assert(m);
2182
2183 SET_FOREACH(u, m->startup_units, i)
13c31542 2184 unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_IO|CGROUP_MASK_BLKIO);
e7ab4d1a
LP
2185}
2186
4ad49000
LP
2187static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
2188 [CGROUP_AUTO] = "auto",
2189 [CGROUP_CLOSED] = "closed",
2190 [CGROUP_STRICT] = "strict",
2191};
4fbf50b3 2192
4ad49000 2193DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);