src/core/cgroup.c (systemd, commit b3cd12000e0f5385b1fc28ae8fd82d41774c51a5)
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2 /***
3 This file is part of systemd.
4
5 Copyright 2013 Lennart Poettering
6
7 systemd is free software; you can redistribute it and/or modify it
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
10 (at your option) any later version.
11
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
16
17 You should have received a copy of the GNU Lesser General Public License
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
19 ***/
20
21 #include <fcntl.h>
22 #include <fnmatch.h>
23
24 #include "alloc-util.h"
25 #include "blockdev-util.h"
26 #include "bpf-firewall.h"
27 #include "cgroup-util.h"
28 #include "cgroup.h"
29 #include "fd-util.h"
30 #include "fileio.h"
31 #include "fs-util.h"
32 #include "parse-util.h"
33 #include "path-util.h"
34 #include "process-util.h"
35 #include "special.h"
36 #include "stdio-util.h"
37 #include "string-table.h"
38 #include "string-util.h"
39
40 #define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
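/* All CPU quota below is programmed against this fixed 100ms CFS period. As a rough worked example
 * (assuming the usual CPUQuota= semantics): CPUQuota=50% stores cpu_quota_per_sec_usec == 500000,
 * which the apply functions further down turn into 500000 * 100000 / 1000000 == 50000us of runtime
 * per 100000us period, i.e. half a CPU's worth of time. */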
41
42 static void cgroup_compat_warn(void) {
43 static bool cgroup_compat_warned = false;
44
45 if (cgroup_compat_warned)
46 return;
47
48 log_warning("cgroup compatibility translation between legacy and unified hierarchy settings activated. See cgroup-compat debug messages for details.");
49 cgroup_compat_warned = true;
50 }
51
52 #define log_cgroup_compat(unit, fmt, ...) do { \
53 cgroup_compat_warn(); \
54 log_unit_debug(unit, "cgroup-compat: " fmt, ##__VA_ARGS__); \
55 } while (false)
56
57 void cgroup_context_init(CGroupContext *c) {
58 assert(c);
59
60 /* Initialize everything to the kernel defaults, assuming the
61 * structure is preinitialized to 0 */
62
63 c->cpu_weight = CGROUP_WEIGHT_INVALID;
64 c->startup_cpu_weight = CGROUP_WEIGHT_INVALID;
65 c->cpu_quota_per_sec_usec = USEC_INFINITY;
66
67 c->cpu_shares = CGROUP_CPU_SHARES_INVALID;
68 c->startup_cpu_shares = CGROUP_CPU_SHARES_INVALID;
69
70 c->memory_high = CGROUP_LIMIT_MAX;
71 c->memory_max = CGROUP_LIMIT_MAX;
72 c->memory_swap_max = CGROUP_LIMIT_MAX;
73
74 c->memory_limit = CGROUP_LIMIT_MAX;
75
76 c->io_weight = CGROUP_WEIGHT_INVALID;
77 c->startup_io_weight = CGROUP_WEIGHT_INVALID;
78
79 c->blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
80 c->startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
81
82 c->tasks_max = (uint64_t) -1;
83 }
84
85 void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
86 assert(c);
87 assert(a);
88
89 LIST_REMOVE(device_allow, c->device_allow, a);
90 free(a->path);
91 free(a);
92 }
93
94 void cgroup_context_free_io_device_weight(CGroupContext *c, CGroupIODeviceWeight *w) {
95 assert(c);
96 assert(w);
97
98 LIST_REMOVE(device_weights, c->io_device_weights, w);
99 free(w->path);
100 free(w);
101 }
102
103 void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit *l) {
104 assert(c);
105 assert(l);
106
107 LIST_REMOVE(device_limits, c->io_device_limits, l);
108 free(l->path);
109 free(l);
110 }
111
112 void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
113 assert(c);
114 assert(w);
115
116 LIST_REMOVE(device_weights, c->blockio_device_weights, w);
117 free(w->path);
118 free(w);
119 }
120
121 void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
122 assert(c);
123 assert(b);
124
125 LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
126 free(b->path);
127 free(b);
128 }
129
130 void cgroup_context_done(CGroupContext *c) {
131 assert(c);
132
133 while (c->io_device_weights)
134 cgroup_context_free_io_device_weight(c, c->io_device_weights);
135
136 while (c->io_device_limits)
137 cgroup_context_free_io_device_limit(c, c->io_device_limits);
138
139 while (c->blockio_device_weights)
140 cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
141
142 while (c->blockio_device_bandwidths)
143 cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);
144
145 while (c->device_allow)
146 cgroup_context_free_device_allow(c, c->device_allow);
147
148 c->ip_address_allow = ip_address_access_free_all(c->ip_address_allow);
149 c->ip_address_deny = ip_address_access_free_all(c->ip_address_deny);
150 }
151
152 void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
153 CGroupIODeviceLimit *il;
154 CGroupIODeviceWeight *iw;
155 CGroupBlockIODeviceBandwidth *b;
156 CGroupBlockIODeviceWeight *w;
157 CGroupDeviceAllow *a;
158 IPAddressAccessItem *iaai;
159 char u[FORMAT_TIMESPAN_MAX];
160
161 assert(c);
162 assert(f);
163
164 prefix = strempty(prefix);
165
166 fprintf(f,
167 "%sCPUAccounting=%s\n"
168 "%sIOAccounting=%s\n"
169 "%sBlockIOAccounting=%s\n"
170 "%sMemoryAccounting=%s\n"
171 "%sTasksAccounting=%s\n"
172 "%sIPAccounting=%s\n"
173 "%sCPUWeight=%" PRIu64 "\n"
174 "%sStartupCPUWeight=%" PRIu64 "\n"
175 "%sCPUShares=%" PRIu64 "\n"
176 "%sStartupCPUShares=%" PRIu64 "\n"
177 "%sCPUQuotaPerSecSec=%s\n"
178 "%sIOWeight=%" PRIu64 "\n"
179 "%sStartupIOWeight=%" PRIu64 "\n"
180 "%sBlockIOWeight=%" PRIu64 "\n"
181 "%sStartupBlockIOWeight=%" PRIu64 "\n"
182 "%sMemoryLow=%" PRIu64 "\n"
183 "%sMemoryHigh=%" PRIu64 "\n"
184 "%sMemoryMax=%" PRIu64 "\n"
185 "%sMemorySwapMax=%" PRIu64 "\n"
186 "%sMemoryLimit=%" PRIu64 "\n"
187 "%sTasksMax=%" PRIu64 "\n"
188 "%sDevicePolicy=%s\n"
189 "%sDelegate=%s\n",
190 prefix, yes_no(c->cpu_accounting),
191 prefix, yes_no(c->io_accounting),
192 prefix, yes_no(c->blockio_accounting),
193 prefix, yes_no(c->memory_accounting),
194 prefix, yes_no(c->tasks_accounting),
195 prefix, yes_no(c->ip_accounting),
196 prefix, c->cpu_weight,
197 prefix, c->startup_cpu_weight,
198 prefix, c->cpu_shares,
199 prefix, c->startup_cpu_shares,
200 prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
201 prefix, c->io_weight,
202 prefix, c->startup_io_weight,
203 prefix, c->blockio_weight,
204 prefix, c->startup_blockio_weight,
205 prefix, c->memory_low,
206 prefix, c->memory_high,
207 prefix, c->memory_max,
208 prefix, c->memory_swap_max,
209 prefix, c->memory_limit,
210 prefix, c->tasks_max,
211 prefix, cgroup_device_policy_to_string(c->device_policy),
212 prefix, yes_no(c->delegate));
213
214 if (c->delegate) {
215 _cleanup_free_ char *t = NULL;
216
217 (void) cg_mask_to_string(c->delegate_controllers, &t);
218
219 fprintf(f, "%sDelegateControllers=%s\n",
220 prefix,
221 strempty(t));
222 }
223
224 LIST_FOREACH(device_allow, a, c->device_allow)
225 fprintf(f,
226 "%sDeviceAllow=%s %s%s%s\n",
227 prefix,
228 a->path,
229 a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
230
231 LIST_FOREACH(device_weights, iw, c->io_device_weights)
232 fprintf(f,
233 "%sIODeviceWeight=%s %" PRIu64 "\n",
234 prefix,
235 iw->path,
236 iw->weight);
237
238 LIST_FOREACH(device_limits, il, c->io_device_limits) {
239 char buf[FORMAT_BYTES_MAX];
240 CGroupIOLimitType type;
241
242 for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
243 if (il->limits[type] != cgroup_io_limit_defaults[type])
244 fprintf(f,
245 "%s%s=%s %s\n",
246 prefix,
247 cgroup_io_limit_type_to_string(type),
248 il->path,
249 format_bytes(buf, sizeof(buf), il->limits[type]));
250 }
251
252 LIST_FOREACH(device_weights, w, c->blockio_device_weights)
253 fprintf(f,
254 "%sBlockIODeviceWeight=%s %" PRIu64 "\n",
255 prefix,
256 w->path,
257 w->weight);
258
259 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
260 char buf[FORMAT_BYTES_MAX];
261
262 if (b->rbps != CGROUP_LIMIT_MAX)
263 fprintf(f,
264 "%sBlockIOReadBandwidth=%s %s\n",
265 prefix,
266 b->path,
267 format_bytes(buf, sizeof(buf), b->rbps));
268 if (b->wbps != CGROUP_LIMIT_MAX)
269 fprintf(f,
270 "%sBlockIOWriteBandwidth=%s %s\n",
271 prefix,
272 b->path,
273 format_bytes(buf, sizeof(buf), b->wbps));
274 }
275
276 LIST_FOREACH(items, iaai, c->ip_address_allow) {
277 _cleanup_free_ char *k = NULL;
278
279 (void) in_addr_to_string(iaai->family, &iaai->address, &k);
280 fprintf(f, "%sIPAddressAllow=%s/%u\n", prefix, strnull(k), iaai->prefixlen);
281 }
282
283 LIST_FOREACH(items, iaai, c->ip_address_deny) {
284 _cleanup_free_ char *k = NULL;
285
286 (void) in_addr_to_string(iaai->family, &iaai->address, &k);
287 fprintf(f, "%sIPAddressDeny=%s/%u\n", prefix, strnull(k), iaai->prefixlen);
288 }
289 }
290
291 static int lookup_block_device(const char *p, dev_t *dev) {
292 struct stat st;
293 int r;
294
295 assert(p);
296 assert(dev);
297
298 r = stat(p, &st);
299 if (r < 0)
300 return log_warning_errno(errno, "Couldn't stat device %s: %m", p);
301
302 if (S_ISBLK(st.st_mode))
303 *dev = st.st_rdev;
304 else if (major(st.st_dev) != 0) {
305 /* If this is not a device node then find the block
306 * device this file is stored on */
307 *dev = st.st_dev;
308
309 /* If this is a partition, try to get the originating
310 * block device */
311 (void) block_get_whole_disk(*dev, dev);
312 } else {
313 log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
314 return -ENODEV;
315 }
316
317 return 0;
318 }
319
320 static int whitelist_device(const char *path, const char *node, const char *acc) {
321 char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
322 struct stat st;
323 bool ignore_notfound;
324 int r;
325
326 assert(path);
327 assert(acc);
328
329 if (node[0] == '-') {
330 /* Non-existent paths starting with "-" must be silently ignored */
331 node++;
332 ignore_notfound = true;
333 } else
334 ignore_notfound = false;
335
336 if (stat(node, &st) < 0) {
337 if (errno == ENOENT && ignore_notfound)
338 return 0;
339
340 return log_warning_errno(errno, "Couldn't stat device %s: %m", node);
341 }
342
343 if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
344 log_warning("%s is not a device.", node);
345 return -ENODEV;
346 }
347
348 sprintf(buf,
349 "%c %u:%u %s",
350 S_ISCHR(st.st_mode) ? 'c' : 'b',
351 major(st.st_rdev), minor(st.st_rdev),
352 acc);
353
354 r = cg_set_attribute("devices", path, "devices.allow", buf);
355 if (r < 0)
356 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
357 "Failed to set devices.allow on %s: %m", path);
358
359 return r;
360 }
361
362 static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
363 _cleanup_fclose_ FILE *f = NULL;
364 char line[LINE_MAX];
365 bool good = false;
366 int r;
367
368 assert(path);
369 assert(acc);
370 assert(IN_SET(type, 'b', 'c'));
371
372 f = fopen("/proc/devices", "re");
373 if (!f)
374 return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);
375
376 FOREACH_LINE(line, f, goto fail) {
377 char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
378 unsigned maj;
379
380 truncate_nl(line);
381
382 if (type == 'c' && streq(line, "Character devices:")) {
383 good = true;
384 continue;
385 }
386
387 if (type == 'b' && streq(line, "Block devices:")) {
388 good = true;
389 continue;
390 }
391
392 if (isempty(line)) {
393 good = false;
394 continue;
395 }
396
397 if (!good)
398 continue;
399
400 p = strstrip(line);
401
402 w = strpbrk(p, WHITESPACE);
403 if (!w)
404 continue;
405 *w = 0;
406
407 r = safe_atou(p, &maj);
408 if (r < 0)
409 continue;
410 if (maj == 0)
411 continue;
412
413 w++;
414 w += strspn(w, WHITESPACE);
415
416 if (fnmatch(name, w, 0) != 0)
417 continue;
418
419 sprintf(buf,
420 "%c %u:* %s",
421 type,
422 maj,
423 acc);
424
425 r = cg_set_attribute("devices", path, "devices.allow", buf);
426 if (r < 0)
427 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
428 "Failed to set devices.allow on %s: %m", path);
429 }
430
431 return 0;
432
433 fail:
434 return log_warning_errno(errno, "Failed to read /proc/devices: %m");
435 }
436
437 static bool cgroup_context_has_cpu_weight(CGroupContext *c) {
438 return c->cpu_weight != CGROUP_WEIGHT_INVALID ||
439 c->startup_cpu_weight != CGROUP_WEIGHT_INVALID;
440 }
441
442 static bool cgroup_context_has_cpu_shares(CGroupContext *c) {
443 return c->cpu_shares != CGROUP_CPU_SHARES_INVALID ||
444 c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID;
445 }
446
447 static uint64_t cgroup_context_cpu_weight(CGroupContext *c, ManagerState state) {
448 if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
449 c->startup_cpu_weight != CGROUP_WEIGHT_INVALID)
450 return c->startup_cpu_weight;
451 else if (c->cpu_weight != CGROUP_WEIGHT_INVALID)
452 return c->cpu_weight;
453 else
454 return CGROUP_WEIGHT_DEFAULT;
455 }
456
457 static uint64_t cgroup_context_cpu_shares(CGroupContext *c, ManagerState state) {
458 if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
459 c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID)
460 return c->startup_cpu_shares;
461 else if (c->cpu_shares != CGROUP_CPU_SHARES_INVALID)
462 return c->cpu_shares;
463 else
464 return CGROUP_CPU_SHARES_DEFAULT;
465 }
466
467 static void cgroup_apply_unified_cpu_config(Unit *u, uint64_t weight, uint64_t quota) {
468 char buf[MAX(DECIMAL_STR_MAX(uint64_t) + 1, (DECIMAL_STR_MAX(usec_t) + 1) * 2)];
469 int r;
470
471 xsprintf(buf, "%" PRIu64 "\n", weight);
472 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.weight", buf);
473 if (r < 0)
474 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
475 "Failed to set cpu.weight: %m");
476
477 if (quota != USEC_INFINITY)
478 xsprintf(buf, USEC_FMT " " USEC_FMT "\n",
479 quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC, CGROUP_CPU_QUOTA_PERIOD_USEC);
480 else
481 xsprintf(buf, "max " USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
482
483 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.max", buf);
484
485 if (r < 0)
486 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
487 "Failed to set cpu.max: %m");
488 }
489
490 static void cgroup_apply_legacy_cpu_config(Unit *u, uint64_t shares, uint64_t quota) {
491 char buf[MAX(DECIMAL_STR_MAX(uint64_t), DECIMAL_STR_MAX(usec_t)) + 1];
492 int r;
493
494 xsprintf(buf, "%" PRIu64 "\n", shares);
495 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.shares", buf);
496 if (r < 0)
497 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
498 "Failed to set cpu.shares: %m");
499
500 xsprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
501 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_period_us", buf);
502 if (r < 0)
503 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
504 "Failed to set cpu.cfs_period_us: %m");
505
506 if (quota != USEC_INFINITY) {
507 xsprintf(buf, USEC_FMT "\n", quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
508 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_quota_us", buf);
509 } else
510 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_quota_us", "-1");
511 if (r < 0)
512 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
513 "Failed to set cpu.cfs_quota_us: %m");
514 }
515
516 static uint64_t cgroup_cpu_shares_to_weight(uint64_t shares) {
517 return CLAMP(shares * CGROUP_WEIGHT_DEFAULT / CGROUP_CPU_SHARES_DEFAULT,
518 CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
519 }
520
521 static uint64_t cgroup_cpu_weight_to_shares(uint64_t weight) {
522 return CLAMP(weight * CGROUP_CPU_SHARES_DEFAULT / CGROUP_WEIGHT_DEFAULT,
523 CGROUP_CPU_SHARES_MIN, CGROUP_CPU_SHARES_MAX);
524 }
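/* The two helpers above just rescale linearly between the legacy cpu.shares range and the unified
 * cpu.weight range, clamping to the valid bounds. Assuming the usual defaults of 1024 shares and
 * weight 100, CPUShares=2048 maps to CPUWeight=200, and CPUWeight=50 maps back to CPUShares=512. */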
525
526 static bool cgroup_context_has_io_config(CGroupContext *c) {
527 return c->io_accounting ||
528 c->io_weight != CGROUP_WEIGHT_INVALID ||
529 c->startup_io_weight != CGROUP_WEIGHT_INVALID ||
530 c->io_device_weights ||
531 c->io_device_limits;
532 }
533
534 static bool cgroup_context_has_blockio_config(CGroupContext *c) {
535 return c->blockio_accounting ||
536 c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
537 c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
538 c->blockio_device_weights ||
539 c->blockio_device_bandwidths;
540 }
541
542 static uint64_t cgroup_context_io_weight(CGroupContext *c, ManagerState state) {
543 if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
544 c->startup_io_weight != CGROUP_WEIGHT_INVALID)
545 return c->startup_io_weight;
546 else if (c->io_weight != CGROUP_WEIGHT_INVALID)
547 return c->io_weight;
548 else
549 return CGROUP_WEIGHT_DEFAULT;
550 }
551
552 static uint64_t cgroup_context_blkio_weight(CGroupContext *c, ManagerState state) {
553 if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
554 c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
555 return c->startup_blockio_weight;
556 else if (c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
557 return c->blockio_weight;
558 else
559 return CGROUP_BLKIO_WEIGHT_DEFAULT;
560 }
561
562 static uint64_t cgroup_weight_blkio_to_io(uint64_t blkio_weight) {
563 return CLAMP(blkio_weight * CGROUP_WEIGHT_DEFAULT / CGROUP_BLKIO_WEIGHT_DEFAULT,
564 CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
565 }
566
567 static uint64_t cgroup_weight_io_to_blkio(uint64_t io_weight) {
568 return CLAMP(io_weight * CGROUP_BLKIO_WEIGHT_DEFAULT / CGROUP_WEIGHT_DEFAULT,
569 CGROUP_BLKIO_WEIGHT_MIN, CGROUP_BLKIO_WEIGHT_MAX);
570 }
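/* Same rescaling idea as for CPU above, but between the legacy blkio.weight range and the unified
 * io.weight range. Assuming the usual defaults of blkio weight 500 and io weight 100,
 * BlockIOWeight=1000 becomes IOWeight=200, while IOWeight=500 would map to 2500 and is then
 * clamped down to the legacy maximum of 1000. */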
571
572 static void cgroup_apply_io_device_weight(Unit *u, const char *dev_path, uint64_t io_weight) {
573 char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
574 dev_t dev;
575 int r;
576
577 r = lookup_block_device(dev_path, &dev);
578 if (r < 0)
579 return;
580
581 xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), io_weight);
582 r = cg_set_attribute("io", u->cgroup_path, "io.weight", buf);
583 if (r < 0)
584 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
585 "Failed to set io.weight: %m");
586 }
587
588 static void cgroup_apply_blkio_device_weight(Unit *u, const char *dev_path, uint64_t blkio_weight) {
589 char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
590 dev_t dev;
591 int r;
592
593 r = lookup_block_device(dev_path, &dev);
594 if (r < 0)
595 return;
596
597 xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), blkio_weight);
598 r = cg_set_attribute("blkio", u->cgroup_path, "blkio.weight_device", buf);
599 if (r < 0)
600 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
601 "Failed to set blkio.weight_device: %m");
602 }
603
604 static unsigned cgroup_apply_io_device_limit(Unit *u, const char *dev_path, uint64_t *limits) {
605 char limit_bufs[_CGROUP_IO_LIMIT_TYPE_MAX][DECIMAL_STR_MAX(uint64_t)];
606 char buf[DECIMAL_STR_MAX(dev_t)*2+2+(6+DECIMAL_STR_MAX(uint64_t)+1)*4];
607 CGroupIOLimitType type;
608 dev_t dev;
609 unsigned n = 0;
610 int r;
611
612 r = lookup_block_device(dev_path, &dev);
613 if (r < 0)
614 return 0;
615
616 for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++) {
617 if (limits[type] != cgroup_io_limit_defaults[type]) {
618 xsprintf(limit_bufs[type], "%" PRIu64, limits[type]);
619 n++;
620 } else {
621 xsprintf(limit_bufs[type], "%s", limits[type] == CGROUP_LIMIT_MAX ? "max" : "0");
622 }
623 }
624
625 xsprintf(buf, "%u:%u rbps=%s wbps=%s riops=%s wiops=%s\n", major(dev), minor(dev),
626 limit_bufs[CGROUP_IO_RBPS_MAX], limit_bufs[CGROUP_IO_WBPS_MAX],
627 limit_bufs[CGROUP_IO_RIOPS_MAX], limit_bufs[CGROUP_IO_WIOPS_MAX]);
628 r = cg_set_attribute("io", u->cgroup_path, "io.max", buf);
629 if (r < 0)
630 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
631 "Failed to set io.max: %m");
632 return n;
633 }
634
635 static unsigned cgroup_apply_blkio_device_limit(Unit *u, const char *dev_path, uint64_t rbps, uint64_t wbps) {
636 char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
637 dev_t dev;
638 unsigned n = 0;
639 int r;
640
641 r = lookup_block_device(dev_path, &dev);
642 if (r < 0)
643 return 0;
644
645 if (rbps != CGROUP_LIMIT_MAX)
646 n++;
647 sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), rbps);
648 r = cg_set_attribute("blkio", u->cgroup_path, "blkio.throttle.read_bps_device", buf);
649 if (r < 0)
650 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
651 "Failed to set blkio.throttle.read_bps_device: %m");
652
653 if (wbps != CGROUP_LIMIT_MAX)
654 n++;
655 sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), wbps);
656 r = cg_set_attribute("blkio", u->cgroup_path, "blkio.throttle.write_bps_device", buf);
657 if (r < 0)
658 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
659 "Failed to set blkio.throttle.write_bps_device: %m");
660
661 return n;
662 }
663
664 static bool cgroup_context_has_unified_memory_config(CGroupContext *c) {
665 return c->memory_low > 0 || c->memory_high != CGROUP_LIMIT_MAX || c->memory_max != CGROUP_LIMIT_MAX || c->memory_swap_max != CGROUP_LIMIT_MAX;
666 }
667
668 static void cgroup_apply_unified_memory_limit(Unit *u, const char *file, uint64_t v) {
669 char buf[DECIMAL_STR_MAX(uint64_t) + 1] = "max";
670 int r;
671
672 if (v != CGROUP_LIMIT_MAX)
673 xsprintf(buf, "%" PRIu64 "\n", v);
674
675 r = cg_set_attribute("memory", u->cgroup_path, file, buf);
676 if (r < 0)
677 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
678 "Failed to set %s: %m", file);
679 }
680
681 static void cgroup_apply_firewall(Unit *u) {
682 int r;
683
684 assert(u);
685
686 if (u->type == UNIT_SLICE) /* Skip this for slice units, they are inner cgroup nodes, and since bpf/cgroup is
687 * not recursive we don't ever touch the bpf on them */
688 return;
689
690 r = bpf_firewall_compile(u);
691 if (r < 0)
692 return;
693
694 (void) bpf_firewall_install(u);
695 return;
696 }
697
698 static void cgroup_context_apply(
699 Unit *u,
700 CGroupMask apply_mask,
701 bool apply_bpf,
702 ManagerState state) {
703
704 const char *path;
705 CGroupContext *c;
706 bool is_root;
707 int r;
708
709 assert(u);
710
711 c = unit_get_cgroup_context(u);
712 path = u->cgroup_path;
713
714 assert(c);
715 assert(path);
716
717 /* Nothing to do? Exit early! */
718 if (apply_mask == 0 && !apply_bpf)
719 return;
720
721 /* Some cgroup attributes are not supported on the root cgroup,
722 * hence silently ignore */
723 is_root = isempty(path) || path_equal(path, "/");
724 if (is_root)
725 /* Make sure we don't try to display messages with an empty path. */
726 path = "/";
727
728 /* We generally ignore errors caused by read-only mounted
729 * cgroup trees (assuming we are running in a container then),
730 * and missing cgroups, i.e. EROFS and ENOENT. */
731
732 if ((apply_mask & CGROUP_MASK_CPU) && !is_root) {
733 bool has_weight, has_shares;
734
735 has_weight = cgroup_context_has_cpu_weight(c);
736 has_shares = cgroup_context_has_cpu_shares(c);
737
738 if (cg_all_unified() > 0) {
739 uint64_t weight;
740
741 if (has_weight)
742 weight = cgroup_context_cpu_weight(c, state);
743 else if (has_shares) {
744 uint64_t shares = cgroup_context_cpu_shares(c, state);
745
746 weight = cgroup_cpu_shares_to_weight(shares);
747
748 log_cgroup_compat(u, "Applying [Startup]CPUShares %" PRIu64 " as [Startup]CPUWeight %" PRIu64 " on %s",
749 shares, weight, path);
750 } else
751 weight = CGROUP_WEIGHT_DEFAULT;
752
753 cgroup_apply_unified_cpu_config(u, weight, c->cpu_quota_per_sec_usec);
754 } else {
755 uint64_t shares;
756
757 if (has_weight) {
758 uint64_t weight = cgroup_context_cpu_weight(c, state);
759
760 shares = cgroup_cpu_weight_to_shares(weight);
761
762 log_cgroup_compat(u, "Applying [Startup]CPUWeight %" PRIu64 " as [Startup]CPUShares %" PRIu64 " on %s",
763 weight, shares, path);
764 } else if (has_shares)
765 shares = cgroup_context_cpu_shares(c, state);
766 else
767 shares = CGROUP_CPU_SHARES_DEFAULT;
768
769 cgroup_apply_legacy_cpu_config(u, shares, c->cpu_quota_per_sec_usec);
770 }
771 }
772
773 if (apply_mask & CGROUP_MASK_IO) {
774 bool has_io = cgroup_context_has_io_config(c);
775 bool has_blockio = cgroup_context_has_blockio_config(c);
776
777 if (!is_root) {
778 char buf[8+DECIMAL_STR_MAX(uint64_t)+1];
779 uint64_t weight;
780
781 if (has_io)
782 weight = cgroup_context_io_weight(c, state);
783 else if (has_blockio) {
784 uint64_t blkio_weight = cgroup_context_blkio_weight(c, state);
785
786 weight = cgroup_weight_blkio_to_io(blkio_weight);
787
788 log_cgroup_compat(u, "Applying [Startup]BlockIOWeight %" PRIu64 " as [Startup]IOWeight %" PRIu64,
789 blkio_weight, weight);
790 } else
791 weight = CGROUP_WEIGHT_DEFAULT;
792
793 xsprintf(buf, "default %" PRIu64 "\n", weight);
794 r = cg_set_attribute("io", path, "io.weight", buf);
795 if (r < 0)
796 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
797 "Failed to set io.weight: %m");
798
799 if (has_io) {
800 CGroupIODeviceWeight *w;
801
802 /* FIXME: no way to reset this list */
803 LIST_FOREACH(device_weights, w, c->io_device_weights)
804 cgroup_apply_io_device_weight(u, w->path, w->weight);
805 } else if (has_blockio) {
806 CGroupBlockIODeviceWeight *w;
807
808 /* FIXME: no way to reset this list */
809 LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
810 weight = cgroup_weight_blkio_to_io(w->weight);
811
812 log_cgroup_compat(u, "Applying BlockIODeviceWeight %" PRIu64 " as IODeviceWeight %" PRIu64 " for %s",
813 w->weight, weight, w->path);
814
815 cgroup_apply_io_device_weight(u, w->path, weight);
816 }
817 }
818 }
819
820 /* Apply limits and free ones without config. */
821 if (has_io) {
822 CGroupIODeviceLimit *l, *next;
823
824 LIST_FOREACH_SAFE(device_limits, l, next, c->io_device_limits) {
825 if (!cgroup_apply_io_device_limit(u, l->path, l->limits))
826 cgroup_context_free_io_device_limit(c, l);
827 }
828 } else if (has_blockio) {
829 CGroupBlockIODeviceBandwidth *b, *next;
830
831 LIST_FOREACH_SAFE(device_bandwidths, b, next, c->blockio_device_bandwidths) {
832 uint64_t limits[_CGROUP_IO_LIMIT_TYPE_MAX];
833 CGroupIOLimitType type;
834
835 for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
836 limits[type] = cgroup_io_limit_defaults[type];
837
838 limits[CGROUP_IO_RBPS_MAX] = b->rbps;
839 limits[CGROUP_IO_WBPS_MAX] = b->wbps;
840
841 log_cgroup_compat(u, "Applying BlockIO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as IO{Read|Write}BandwidthMax for %s",
842 b->rbps, b->wbps, b->path);
843
844 if (!cgroup_apply_io_device_limit(u, b->path, limits))
845 cgroup_context_free_blockio_device_bandwidth(c, b);
846 }
847 }
848 }
849
850 if (apply_mask & CGROUP_MASK_BLKIO) {
851 bool has_io = cgroup_context_has_io_config(c);
852 bool has_blockio = cgroup_context_has_blockio_config(c);
853
854 if (!is_root) {
855 char buf[DECIMAL_STR_MAX(uint64_t)+1];
856 uint64_t weight;
857
858 if (has_io) {
859 uint64_t io_weight = cgroup_context_io_weight(c, state);
860
861 weight = cgroup_weight_io_to_blkio(io_weight);
862
863 log_cgroup_compat(u, "Applying [Startup]IOWeight %" PRIu64 " as [Startup]BlockIOWeight %" PRIu64,
864 io_weight, weight);
865 } else if (has_blockio)
866 weight = cgroup_context_blkio_weight(c, state);
867 else
868 weight = CGROUP_BLKIO_WEIGHT_DEFAULT;
869
870 xsprintf(buf, "%" PRIu64 "\n", weight);
871 r = cg_set_attribute("blkio", path, "blkio.weight", buf);
872 if (r < 0)
873 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
874 "Failed to set blkio.weight: %m");
875
876 if (has_io) {
877 CGroupIODeviceWeight *w;
878
879 /* FIXME: no way to reset this list */
880 LIST_FOREACH(device_weights, w, c->io_device_weights) {
881 weight = cgroup_weight_io_to_blkio(w->weight);
882
883 log_cgroup_compat(u, "Applying IODeviceWeight %" PRIu64 " as BlockIODeviceWeight %" PRIu64 " for %s",
884 w->weight, weight, w->path);
885
886 cgroup_apply_blkio_device_weight(u, w->path, weight);
887 }
888 } else if (has_blockio) {
889 CGroupBlockIODeviceWeight *w;
890
891 /* FIXME: no way to reset this list */
892 LIST_FOREACH(device_weights, w, c->blockio_device_weights)
893 cgroup_apply_blkio_device_weight(u, w->path, w->weight);
894 }
895 }
896
897 /* Apply limits and free ones without config. */
898 if (has_io) {
899 CGroupIODeviceLimit *l, *next;
900
901 LIST_FOREACH_SAFE(device_limits, l, next, c->io_device_limits) {
902 log_cgroup_compat(u, "Applying IO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as BlockIO{Read|Write}BandwidthMax for %s",
903 l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX], l->path);
904
905 if (!cgroup_apply_blkio_device_limit(u, l->path, l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX]))
906 cgroup_context_free_io_device_limit(c, l);
907 }
908 } else if (has_blockio) {
909 CGroupBlockIODeviceBandwidth *b, *next;
910
911 LIST_FOREACH_SAFE(device_bandwidths, b, next, c->blockio_device_bandwidths)
912 if (!cgroup_apply_blkio_device_limit(u, b->path, b->rbps, b->wbps))
913 cgroup_context_free_blockio_device_bandwidth(c, b);
914 }
915 }
916
917 if ((apply_mask & CGROUP_MASK_MEMORY) && !is_root) {
918 if (cg_all_unified() > 0) {
919 uint64_t max, swap_max = CGROUP_LIMIT_MAX;
920
921 if (cgroup_context_has_unified_memory_config(c)) {
922 max = c->memory_max;
923 swap_max = c->memory_swap_max;
924 } else {
925 max = c->memory_limit;
926
927 if (max != CGROUP_LIMIT_MAX)
928 log_cgroup_compat(u, "Applying MemoryLimit %" PRIu64 " as MemoryMax", max);
929 }
930
931 cgroup_apply_unified_memory_limit(u, "memory.low", c->memory_low);
932 cgroup_apply_unified_memory_limit(u, "memory.high", c->memory_high);
933 cgroup_apply_unified_memory_limit(u, "memory.max", max);
934 cgroup_apply_unified_memory_limit(u, "memory.swap.max", swap_max);
935 } else {
936 char buf[DECIMAL_STR_MAX(uint64_t) + 1];
937 uint64_t val;
938
939 if (cgroup_context_has_unified_memory_config(c)) {
940 val = c->memory_max;
941 log_cgroup_compat(u, "Applying MemoryMax %" PRIu64 " as MemoryLimit", val);
942 } else
943 val = c->memory_limit;
944
945 if (val == CGROUP_LIMIT_MAX)
946 strncpy(buf, "-1\n", sizeof(buf));
947 else
948 xsprintf(buf, "%" PRIu64 "\n", val);
949
950 r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
951 if (r < 0)
952 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
953 "Failed to set memory.limit_in_bytes: %m");
954 }
955 }
956
957 if ((apply_mask & CGROUP_MASK_DEVICES) && !is_root) {
958 CGroupDeviceAllow *a;
959
960 /* Changing the devices list of a populated cgroup
961 * might result in EINVAL, hence ignore EINVAL
962 * here. */
963
964 if (c->device_allow || c->device_policy != CGROUP_AUTO)
965 r = cg_set_attribute("devices", path, "devices.deny", "a");
966 else
967 r = cg_set_attribute("devices", path, "devices.allow", "a");
968 if (r < 0)
969 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
970 "Failed to reset devices.list: %m");
971
972 if (c->device_policy == CGROUP_CLOSED ||
973 (c->device_policy == CGROUP_AUTO && c->device_allow)) {
974 static const char auto_devices[] =
975 "/dev/null\0" "rwm\0"
976 "/dev/zero\0" "rwm\0"
977 "/dev/full\0" "rwm\0"
978 "/dev/random\0" "rwm\0"
979 "/dev/urandom\0" "rwm\0"
980 "/dev/tty\0" "rwm\0"
981 "/dev/ptmx\0" "rwm\0"
982 /* Allow /run/systemd/inaccessible/{chr,blk} devices for mapping InaccessiblePaths */
983 "-/run/systemd/inaccessible/chr\0" "rwm\0"
984 "-/run/systemd/inaccessible/blk\0" "rwm\0";
985
986 const char *x, *y;
987
988 NULSTR_FOREACH_PAIR(x, y, auto_devices)
989 whitelist_device(path, x, y);
990
991 /* PTS (/dev/pts) devices may not be duplicated, but accessed */
992 whitelist_major(path, "pts", 'c', "rw");
993 }
994
995 LIST_FOREACH(device_allow, a, c->device_allow) {
996 char acc[4], *val;
997 unsigned k = 0;
998
999 if (a->r)
1000 acc[k++] = 'r';
1001 if (a->w)
1002 acc[k++] = 'w';
1003 if (a->m)
1004 acc[k++] = 'm';
1005
1006 if (k == 0)
1007 continue;
1008
1009 acc[k++] = 0;
1010
1011 if (path_startswith(a->path, "/dev/"))
1012 whitelist_device(path, a->path, acc);
1013 else if ((val = startswith(a->path, "block-")))
1014 whitelist_major(path, val, 'b', acc);
1015 else if ((val = startswith(a->path, "char-")))
1016 whitelist_major(path, val, 'c', acc);
1017 else
1018 log_unit_debug(u, "Ignoring device %s while writing cgroup attribute.", a->path);
1019 }
1020 }
1021
1022 if ((apply_mask & CGROUP_MASK_PIDS) && !is_root) {
1023
1024 if (c->tasks_max != CGROUP_LIMIT_MAX) {
1025 char buf[DECIMAL_STR_MAX(uint64_t) + 2];
1026
1027 sprintf(buf, "%" PRIu64 "\n", c->tasks_max);
1028 r = cg_set_attribute("pids", path, "pids.max", buf);
1029 } else
1030 r = cg_set_attribute("pids", path, "pids.max", "max");
1031
1032 if (r < 0)
1033 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
1034 "Failed to set pids.max: %m");
1035 }
1036
1037 if (apply_bpf)
1038 cgroup_apply_firewall(u);
1039 }
1040
1041 CGroupMask cgroup_context_get_mask(CGroupContext *c) {
1042 CGroupMask mask = 0;
1043
1044 /* Figure out which controllers we need */
1045
1046 if (c->cpu_accounting ||
1047 cgroup_context_has_cpu_weight(c) ||
1048 cgroup_context_has_cpu_shares(c) ||
1049 c->cpu_quota_per_sec_usec != USEC_INFINITY)
1050 mask |= CGROUP_MASK_CPUACCT | CGROUP_MASK_CPU;
1051
1052 if (cgroup_context_has_io_config(c) || cgroup_context_has_blockio_config(c))
1053 mask |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
1054
1055 if (c->memory_accounting ||
1056 c->memory_limit != CGROUP_LIMIT_MAX ||
1057 cgroup_context_has_unified_memory_config(c))
1058 mask |= CGROUP_MASK_MEMORY;
1059
1060 if (c->device_allow ||
1061 c->device_policy != CGROUP_AUTO)
1062 mask |= CGROUP_MASK_DEVICES;
1063
1064 if (c->tasks_accounting ||
1065 c->tasks_max != (uint64_t) -1)
1066 mask |= CGROUP_MASK_PIDS;
1067
1068 return mask;
1069 }
1070
1071 CGroupMask unit_get_own_mask(Unit *u) {
1072 CGroupContext *c;
1073
1074 /* Returns the mask of controllers the unit needs for itself */
1075
1076 c = unit_get_cgroup_context(u);
1077 if (!c)
1078 return 0;
1079
1080 return cgroup_context_get_mask(c) | unit_get_delegate_mask(u);
1081 }
1082
1083 CGroupMask unit_get_delegate_mask(Unit *u) {
1084 CGroupContext *c;
1085
1086 /* If delegation is turned on, then turn on selected controllers, unless we are on the legacy hierarchy and the
1087 * process we fork into is known to drop privileges, and hence shouldn't get access to the controllers.
1088 *
1089 * Note that on the unified hierarchy it is safe to delegate controllers to unprivileged services. */
1090
1091 if (u->type == UNIT_SLICE)
1092 return 0;
1093
1094 c = unit_get_cgroup_context(u);
1095 if (!c)
1096 return 0;
1097
1098 if (!c->delegate)
1099 return 0;
1100
1101 if (cg_all_unified() <= 0) {
1102 ExecContext *e;
1103
1104 e = unit_get_exec_context(u);
1105 if (e && !exec_context_maintains_privileges(e))
1106 return 0;
1107 }
1108
1109 return c->delegate_controllers;
1110 }
1111
1112 CGroupMask unit_get_members_mask(Unit *u) {
1113 assert(u);
1114
1115 /* Returns the mask of controllers all of the unit's children require, merged */
1116
1117 if (u->cgroup_members_mask_valid)
1118 return u->cgroup_members_mask;
1119
1120 u->cgroup_members_mask = 0;
1121
1122 if (u->type == UNIT_SLICE) {
1123 void *v;
1124 Unit *member;
1125 Iterator i;
1126
1127 HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) {
1128
1129 if (member == u)
1130 continue;
1131
1132 if (UNIT_DEREF(member->slice) != u)
1133 continue;
1134
1135 u->cgroup_members_mask |= unit_get_subtree_mask(member); /* note that this calls ourselves again, for the children */
1136 }
1137 }
1138
1139 u->cgroup_members_mask_valid = true;
1140 return u->cgroup_members_mask;
1141 }
1142
1143 CGroupMask unit_get_siblings_mask(Unit *u) {
1144 assert(u);
1145
1146 /* Returns the mask of controllers all of the unit's siblings
1147 * require, i.e. the members mask of the unit's parent slice
1148 * if there is one. */
1149
1150 if (UNIT_ISSET(u->slice))
1151 return unit_get_members_mask(UNIT_DEREF(u->slice));
1152
1153 return unit_get_subtree_mask(u); /* we are the top-level slice */
1154 }
1155
1156 CGroupMask unit_get_subtree_mask(Unit *u) {
1157
1158 /* Returns the mask of this subtree, meaning of the group
1159 * itself and its children. */
1160
1161 return unit_get_own_mask(u) | unit_get_members_mask(u);
1162 }
1163
1164 CGroupMask unit_get_target_mask(Unit *u) {
1165 CGroupMask mask;
1166
1167 /* This returns the cgroup mask of all controllers to enable
1168 * for a specific cgroup, i.e. everything it needs itself,
1169 * plus all that its children need, plus all that its siblings
1170 * need. This is primarily useful on the legacy cgroup
1171 * hierarchy, where we need to duplicate each cgroup in each
1172 * hierarchy that shall be enabled for it. */
1173
1174 mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
1175 mask &= u->manager->cgroup_supported;
1176
1177 return mask;
1178 }
1179
1180 CGroupMask unit_get_enable_mask(Unit *u) {
1181 CGroupMask mask;
1182
1183 /* This returns the cgroup mask of all controllers to enable
1184 * for the children of a specific cgroup. This is primarily
1185 * useful for the unified cgroup hierarchy, where each cgroup
1186 * controls which controllers are enabled for its children. */
1187
1188 mask = unit_get_members_mask(u);
1189 mask &= u->manager->cgroup_supported;
1190
1191 return mask;
1192 }
1193
1194 bool unit_get_needs_bpf(Unit *u) {
1195 CGroupContext *c;
1196 Unit *p;
1197 assert(u);
1198
1199 /* We never attach BPF to slice units, as they are inner cgroup nodes and cgroup/BPF is not recursive at the
1200 * moment. */
1201 if (u->type == UNIT_SLICE)
1202 return false;
1203
1204 c = unit_get_cgroup_context(u);
1205 if (!c)
1206 return false;
1207
1208 if (c->ip_accounting ||
1209 c->ip_address_allow ||
1210 c->ip_address_deny)
1211 return true;
1212
1213 /* If any parent slice has an IP access list defined, it applies too */
1214 for (p = UNIT_DEREF(u->slice); p; p = UNIT_DEREF(p->slice)) {
1215 c = unit_get_cgroup_context(p);
1216 if (!c)
1217 return false;
1218
1219 if (c->ip_address_allow ||
1220 c->ip_address_deny)
1221 return true;
1222 }
1223
1224 return false;
1225 }
1226
1227 /* Recurse from a unit up through its containing slices, propagating
1228 * mask bits upward. A unit is also a member of itself. */
1229 void unit_update_cgroup_members_masks(Unit *u) {
1230 CGroupMask m;
1231 bool more;
1232
1233 assert(u);
1234
1235 /* Calculate subtree mask */
1236 m = unit_get_subtree_mask(u);
1237
1238 /* See if anything changed from the previous invocation. If
1239 * not, we're done. */
1240 if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
1241 return;
1242
1243 more =
1244 u->cgroup_subtree_mask_valid &&
1245 ((m & ~u->cgroup_subtree_mask) != 0) &&
1246 ((~m & u->cgroup_subtree_mask) == 0);
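/* In other words: "more" is true only if the new subtree mask is a strict superset of the
 * previous one, i.e. bits were only added. Only then can we get away with simply ORing the new
 * bits into the parent's members mask below; otherwise the parent has to recalculate from scratch. */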
1247
1248 u->cgroup_subtree_mask = m;
1249 u->cgroup_subtree_mask_valid = true;
1250
1251 if (UNIT_ISSET(u->slice)) {
1252 Unit *s = UNIT_DEREF(u->slice);
1253
1254 if (more)
1255 /* There's more set now than before. We
1256 * propagate the new mask to the parent's mask
1257 * (not caring if it actually was valid or
1258 * not). */
1259
1260 s->cgroup_members_mask |= m;
1261
1262 else
1263 /* There's less set now than before (or we
1264 * don't know), we need to recalculate
1265 * everything, so let's invalidate the
1266 * parent's members mask */
1267
1268 s->cgroup_members_mask_valid = false;
1269
1270 /* And now make sure that this change also hits our
1271 * grandparents */
1272 unit_update_cgroup_members_masks(s);
1273 }
1274 }
1275
1276 static const char *migrate_callback(CGroupMask mask, void *userdata) {
1277 Unit *u = userdata;
1278
1279 assert(mask != 0);
1280 assert(u);
1281
1282 while (u) {
1283 if (u->cgroup_path &&
1284 u->cgroup_realized &&
1285 (u->cgroup_realized_mask & mask) == mask)
1286 return u->cgroup_path;
1287
1288 u = UNIT_DEREF(u->slice);
1289 }
1290
1291 return NULL;
1292 }
1293
1294 char *unit_default_cgroup_path(Unit *u) {
1295 _cleanup_free_ char *escaped = NULL, *slice = NULL;
1296 int r;
1297
1298 assert(u);
1299
1300 if (unit_has_name(u, SPECIAL_ROOT_SLICE))
1301 return strdup(u->manager->cgroup_root);
1302
1303 if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) {
1304 r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice);
1305 if (r < 0)
1306 return NULL;
1307 }
1308
1309 escaped = cg_escape(u->id);
1310 if (!escaped)
1311 return NULL;
1312
1313 if (slice)
1314 return strjoin(u->manager->cgroup_root, "/", slice, "/",
1315 escaped);
1316 else
1317 return strjoin(u->manager->cgroup_root, "/", escaped);
1318 }
1319
1320 int unit_set_cgroup_path(Unit *u, const char *path) {
1321 _cleanup_free_ char *p = NULL;
1322 int r;
1323
1324 assert(u);
1325
1326 if (path) {
1327 p = strdup(path);
1328 if (!p)
1329 return -ENOMEM;
1330 } else
1331 p = NULL;
1332
1333 if (streq_ptr(u->cgroup_path, p))
1334 return 0;
1335
1336 if (p) {
1337 r = hashmap_put(u->manager->cgroup_unit, p, u);
1338 if (r < 0)
1339 return r;
1340 }
1341
1342 unit_release_cgroup(u);
1343
1344 u->cgroup_path = p;
1345 p = NULL;
1346
1347 return 1;
1348 }
1349
1350 int unit_watch_cgroup(Unit *u) {
1351 _cleanup_free_ char *events = NULL;
1352 int r;
1353
1354 assert(u);
1355
1356 if (!u->cgroup_path)
1357 return 0;
1358
1359 if (u->cgroup_inotify_wd >= 0)
1360 return 0;
1361
1362 /* Only applies to the unified hierarchy */
1363 r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
1364 if (r < 0)
1365 return log_error_errno(r, "Failed to determine whether the name=systemd hierarchy is unified: %m");
1366 if (r == 0)
1367 return 0;
1368
1369 /* Don't watch the root slice, it's pointless. */
1370 if (unit_has_name(u, SPECIAL_ROOT_SLICE))
1371 return 0;
1372
1373 r = hashmap_ensure_allocated(&u->manager->cgroup_inotify_wd_unit, &trivial_hash_ops);
1374 if (r < 0)
1375 return log_oom();
1376
1377 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events", &events);
1378 if (r < 0)
1379 return log_oom();
1380
1381 u->cgroup_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY);
1382 if (u->cgroup_inotify_wd < 0) {
1383
1384 if (errno == ENOENT) /* If the directory is already
1385 * gone we don't need to track
1386 * it, so this is not an error */
1387 return 0;
1388
1389 return log_unit_error_errno(u, errno, "Failed to add inotify watch descriptor for control group %s: %m", u->cgroup_path);
1390 }
1391
1392 r = hashmap_put(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd), u);
1393 if (r < 0)
1394 return log_unit_error_errno(u, r, "Failed to add inotify watch descriptor to hash map: %m");
1395
1396 return 0;
1397 }
1398
1399 int unit_pick_cgroup_path(Unit *u) {
1400 _cleanup_free_ char *path = NULL;
1401 int r;
1402
1403 assert(u);
1404
1405 if (u->cgroup_path)
1406 return 0;
1407
1408 if (!UNIT_HAS_CGROUP_CONTEXT(u))
1409 return -EINVAL;
1410
1411 path = unit_default_cgroup_path(u);
1412 if (!path)
1413 return log_oom();
1414
1415 r = unit_set_cgroup_path(u, path);
1416 if (r == -EEXIST)
1417 return log_unit_error_errno(u, r, "Control group %s exists already.", path);
1418 if (r < 0)
1419 return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", path);
1420
1421 return 0;
1422 }
1423
1424 static int unit_create_cgroup(
1425 Unit *u,
1426 CGroupMask target_mask,
1427 CGroupMask enable_mask,
1428 bool needs_bpf) {
1429
1430 CGroupContext *c;
1431 int r;
1432
1433 assert(u);
1434
1435 c = unit_get_cgroup_context(u);
1436 if (!c)
1437 return 0;
1438
1439 /* Figure out our cgroup path */
1440 r = unit_pick_cgroup_path(u);
1441 if (r < 0)
1442 return r;
1443
1444 /* First, create our own group */
1445 r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
1446 if (r < 0)
1447 return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", u->cgroup_path);
1448
1449 /* Start watching it */
1450 (void) unit_watch_cgroup(u);
1451
1452 /* Enable all controllers we need */
1453 r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path);
1454 if (r < 0)
1455 log_unit_warning_errno(u, r, "Failed to enable controllers on cgroup %s, ignoring: %m", u->cgroup_path);
1456
1457 /* Keep track that this is now realized */
1458 u->cgroup_realized = true;
1459 u->cgroup_realized_mask = target_mask;
1460 u->cgroup_enabled_mask = enable_mask;
1461 u->cgroup_bpf_state = needs_bpf ? UNIT_CGROUP_BPF_ON : UNIT_CGROUP_BPF_OFF;
1462
1463 if (u->type != UNIT_SLICE && !c->delegate) {
1464
1465 /* Then, possibly move things over, but not if
1466 * subgroups may contain processes, which is the case
1467 * for slice and delegation units. */
1468 r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
1469 if (r < 0)
1470 log_unit_warning_errno(u, r, "Failed to migrate cgroup to %s, ignoring: %m", u->cgroup_path);
1471 }
1472
1473 return 0;
1474 }
1475
1476 int unit_attach_pids_to_cgroup(Unit *u) {
1477 int r;
1478 assert(u);
1479
1480 r = unit_realize_cgroup(u);
1481 if (r < 0)
1482 return r;
1483
1484 r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->pids, migrate_callback, u);
1485 if (r < 0)
1486 return r;
1487
1488 return 0;
1489 }
1490
1491 static void cgroup_xattr_apply(Unit *u) {
1492 char ids[SD_ID128_STRING_MAX];
1493 int r;
1494
1495 assert(u);
1496
1497 if (!MANAGER_IS_SYSTEM(u->manager))
1498 return;
1499
1500 if (sd_id128_is_null(u->invocation_id))
1501 return;
1502
1503 r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path,
1504 "trusted.invocation_id",
1505 sd_id128_to_string(u->invocation_id, ids), 32,
1506 0);
1507 if (r < 0)
1508 log_unit_debug_errno(u, r, "Failed to set invocation ID on control group %s, ignoring: %m", u->cgroup_path);
1509 }
1510
1511 static bool unit_has_mask_realized(
1512 Unit *u,
1513 CGroupMask target_mask,
1514 CGroupMask enable_mask,
1515 bool needs_bpf) {
1516
1517 assert(u);
1518
1519 return u->cgroup_realized &&
1520 u->cgroup_realized_mask == target_mask &&
1521 u->cgroup_enabled_mask == enable_mask &&
1522 ((needs_bpf && u->cgroup_bpf_state == UNIT_CGROUP_BPF_ON) ||
1523 (!needs_bpf && u->cgroup_bpf_state == UNIT_CGROUP_BPF_OFF));
1524 }
1525
1526 static void unit_add_to_cgroup_realize_queue(Unit *u) {
1527 assert(u);
1528
1529 if (u->in_cgroup_realize_queue)
1530 return;
1531
1532 LIST_PREPEND(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
1533 u->in_cgroup_realize_queue = true;
1534 }
1535
1536 static void unit_remove_from_cgroup_realize_queue(Unit *u) {
1537 assert(u);
1538
1539 if (!u->in_cgroup_realize_queue)
1540 return;
1541
1542 LIST_REMOVE(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
1543 u->in_cgroup_realize_queue = false;
1544 }
1545
1546
1547 /* Check if necessary controllers and attributes for a unit are in place.
1548 *
1549 * If so, do nothing.
1550 * If not, create paths, move processes over, and set attributes.
1551 *
1552 * Returns 0 on success and < 0 on failure. */
1553 static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
1554 CGroupMask target_mask, enable_mask;
1555 bool needs_bpf, apply_bpf;
1556 int r;
1557
1558 assert(u);
1559
1560 unit_remove_from_cgroup_realize_queue(u);
1561
1562 target_mask = unit_get_target_mask(u);
1563 enable_mask = unit_get_enable_mask(u);
1564 needs_bpf = unit_get_needs_bpf(u);
1565
1566 if (unit_has_mask_realized(u, target_mask, enable_mask, needs_bpf))
1567 return 0;
1568
1569 /* Make sure we apply the BPF filters either when one is configured, or if none is configured but previously
1570 * the state was anything but off. This way, if a unit with a BPF filter applied is reconfigured to lose it
1571 * this will trickle down properly to cgroupfs. */
1572 apply_bpf = needs_bpf || u->cgroup_bpf_state != UNIT_CGROUP_BPF_OFF;
1573
1574 /* First, realize parents */
1575 if (UNIT_ISSET(u->slice)) {
1576 r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
1577 if (r < 0)
1578 return r;
1579 }
1580
1581 /* And then do the real work */
1582 r = unit_create_cgroup(u, target_mask, enable_mask, needs_bpf);
1583 if (r < 0)
1584 return r;
1585
1586 /* Finally, apply the necessary attributes. */
1587 cgroup_context_apply(u, target_mask, apply_bpf, state);
1588 cgroup_xattr_apply(u);
1589
1590 return 0;
1591 }
1592
1593 unsigned manager_dispatch_cgroup_realize_queue(Manager *m) {
1594 ManagerState state;
1595 unsigned n = 0;
1596 Unit *i;
1597 int r;
1598
1599 assert(m);
1600
1601 state = manager_state(m);
1602
1603 while ((i = m->cgroup_realize_queue)) {
1604 assert(i->in_cgroup_realize_queue);
1605
1606 if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(i))) {
1607 /* Maybe things changed, and the unit is not actually active anymore? */
1608 unit_remove_from_cgroup_realize_queue(i);
1609 continue;
1610 }
1611
1612 r = unit_realize_cgroup_now(i, state);
1613 if (r < 0)
1614 log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);
1615
1616 n++;
1617 }
1618
1619 return n;
1620 }
1621
1622 static void unit_add_siblings_to_cgroup_realize_queue(Unit *u) {
1623 Unit *slice;
1624
1625 /* This adds the siblings of the specified unit and the
1626 * siblings of all parent units to the cgroup queue. (But
1627 * neither the specified unit itself nor the parents.) */
1628
1629 while ((slice = UNIT_DEREF(u->slice))) {
1630 Iterator i;
1631 Unit *m;
1632 void *v;
1633
1634 HASHMAP_FOREACH_KEY(v, m, u->dependencies[UNIT_BEFORE], i) {
1635 if (m == u)
1636 continue;
1637
1638 /* Skip units that have a dependency on the slice
1639 * but aren't actually in it. */
1640 if (UNIT_DEREF(m->slice) != slice)
1641 continue;
1642
1643 /* No point in doing cgroup application for units
1644 * without active processes. */
1645 if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
1646 continue;
1647
1648 /* If the unit doesn't need any new controllers
1649 * and has current ones realized, it doesn't need
1650 * any changes. */
1651 if (unit_has_mask_realized(m,
1652 unit_get_target_mask(m),
1653 unit_get_enable_mask(m),
1654 unit_get_needs_bpf(m)))
1655 continue;
1656
1657 unit_add_to_cgroup_realize_queue(m);
1658 }
1659
1660 u = slice;
1661 }
1662 }
1663
1664 int unit_realize_cgroup(Unit *u) {
1665 assert(u);
1666
1667 if (!UNIT_HAS_CGROUP_CONTEXT(u))
1668 return 0;
1669
1670 /* So, here's the deal: when realizing the cgroups for this
1671 * unit, we need to first create all parents, but there's more
1672 * actually: for the weight-based controllers we also need to
1673 * make sure that all our siblings (i.e. units that are in the
1674 * same slice as we are) have cgroups, too. Otherwise, things
1675 * would become very uneven as each of their processes would
1676 * get as much resources as all our group together. This call
1677 * will synchronously create the parent cgroups, but will
1678 * defer work on the siblings to the next event loop
1679 * iteration. */
1680
1681 /* Add all sibling slices to the cgroup queue. */
1682 unit_add_siblings_to_cgroup_realize_queue(u);
1683
1684 /* And realize this one now (and apply the values) */
1685 return unit_realize_cgroup_now(u, manager_state(u->manager));
1686 }
1687
1688 void unit_release_cgroup(Unit *u) {
1689 assert(u);
1690
1691 /* Forgets all cgroup details for this cgroup */
1692
1693 if (u->cgroup_path) {
1694 (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
1695 u->cgroup_path = mfree(u->cgroup_path);
1696 }
1697
1698 if (u->cgroup_inotify_wd >= 0) {
1699 if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_inotify_wd) < 0)
1700 log_unit_debug_errno(u, errno, "Failed to remove cgroup inotify watch %i for %s, ignoring", u->cgroup_inotify_wd, u->id);
1701
1702 (void) hashmap_remove(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd));
1703 u->cgroup_inotify_wd = -1;
1704 }
1705 }
1706
1707 void unit_prune_cgroup(Unit *u) {
1708 int r;
1709 bool is_root_slice;
1710
1711 assert(u);
1712
1713 /* Removes the cgroup, if empty and possible, and stops watching it. */
1714
1715 if (!u->cgroup_path)
1716 return;
1717
1718 (void) unit_get_cpu_usage(u, NULL); /* Cache the last CPU usage value before we destroy the cgroup */
1719
1720 is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);
1721
1722 r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
1723 if (r < 0) {
1724 log_unit_debug_errno(u, r, "Failed to destroy cgroup %s, ignoring: %m", u->cgroup_path);
1725 return;
1726 }
1727
1728 if (is_root_slice)
1729 return;
1730
1731 unit_release_cgroup(u);
1732
1733 u->cgroup_realized = false;
1734 u->cgroup_realized_mask = 0;
1735 u->cgroup_enabled_mask = 0;
1736 }
1737
1738 int unit_search_main_pid(Unit *u, pid_t *ret) {
1739 _cleanup_fclose_ FILE *f = NULL;
1740 pid_t pid = 0, npid, mypid;
1741 int r;
1742
1743 assert(u);
1744 assert(ret);
1745
1746 if (!u->cgroup_path)
1747 return -ENXIO;
1748
1749 r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f);
1750 if (r < 0)
1751 return r;
1752
1753 mypid = getpid_cached();
1754 while (cg_read_pid(f, &npid) > 0) {
1755 pid_t ppid;
1756
1757 if (npid == pid)
1758 continue;
1759
1760 /* Ignore processes that aren't our kids */
1761 if (get_process_ppid(npid, &ppid) >= 0 && ppid != mypid)
1762 continue;
1763
1764 if (pid != 0)
1765 /* Dang, there's more than one daemonized PID
1766 in this group, so we don't know what process
1767 is the main process. */
1768
1769 return -ENODATA;
1770
1771 pid = npid;
1772 }
1773
1774 *ret = pid;
1775 return 0;
1776 }
1777
1778 static int unit_watch_pids_in_path(Unit *u, const char *path) {
1779 _cleanup_closedir_ DIR *d = NULL;
1780 _cleanup_fclose_ FILE *f = NULL;
1781 int ret = 0, r;
1782
1783 assert(u);
1784 assert(path);
1785
1786 r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
1787 if (r < 0)
1788 ret = r;
1789 else {
1790 pid_t pid;
1791
1792 while ((r = cg_read_pid(f, &pid)) > 0) {
1793 r = unit_watch_pid(u, pid);
1794 if (r < 0 && ret >= 0)
1795 ret = r;
1796 }
1797
1798 if (r < 0 && ret >= 0)
1799 ret = r;
1800 }
1801
1802 r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
1803 if (r < 0) {
1804 if (ret >= 0)
1805 ret = r;
1806 } else {
1807 char *fn;
1808
1809 while ((r = cg_read_subgroup(d, &fn)) > 0) {
1810 _cleanup_free_ char *p = NULL;
1811
1812 p = strjoin(path, "/", fn);
1813 free(fn);
1814
1815 if (!p)
1816 return -ENOMEM;
1817
1818 r = unit_watch_pids_in_path(u, p);
1819 if (r < 0 && ret >= 0)
1820 ret = r;
1821 }
1822
1823 if (r < 0 && ret >= 0)
1824 ret = r;
1825 }
1826
1827 return ret;
1828 }
1829
1830 int unit_watch_all_pids(Unit *u) {
1831 int r;
1832
1833 assert(u);
1834
1835 /* Adds all PIDs from our cgroup to the set of PIDs we
1836 * watch. This is a fallback logic for cases where we do not
1837 * get reliable cgroup empty notifications: we try to use
1838 * SIGCHLD as replacement. */
1839
1840 if (!u->cgroup_path)
1841 return -ENOENT;
1842
1843 r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
1844 if (r < 0)
1845 return r;
1846 if (r > 0) /* On unified we can use proper notifications */
1847 return 0;
1848
1849 return unit_watch_pids_in_path(u, u->cgroup_path);
1850 }
1851
1852 static int on_cgroup_empty_event(sd_event_source *s, void *userdata) {
1853 Manager *m = userdata;
1854 Unit *u;
1855 int r;
1856
1857 assert(s);
1858 assert(m);
1859
1860 u = m->cgroup_empty_queue;
1861 if (!u)
1862 return 0;
1863
1864 assert(u->in_cgroup_empty_queue);
1865 u->in_cgroup_empty_queue = false;
1866 LIST_REMOVE(cgroup_empty_queue, m->cgroup_empty_queue, u);
1867
1868 if (m->cgroup_empty_queue) {
1869 /* More stuff queued, let's make sure we remain enabled */
1870 r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
1871 if (r < 0)
1872 log_debug_errno(r, "Failed to reenable cgroup empty event source: %m");
1873 }
1874
1875 unit_add_to_gc_queue(u);
1876
1877 if (UNIT_VTABLE(u)->notify_cgroup_empty)
1878 UNIT_VTABLE(u)->notify_cgroup_empty(u);
1879
1880 return 0;
1881 }
1882
1883 void unit_add_to_cgroup_empty_queue(Unit *u) {
1884 int r;
1885
1886 assert(u);
1887
1888 /* Note that there are four different ways how cgroup empty events reach us:
1889 *
1890 * 1. On the unified hierarchy we get an inotify event on the cgroup
1891 *
1892 * 2. On the legacy hierarchy, when running in system mode, we get a datagram on the cgroup agent socket
1893 *
1894 * 3. On the legacy hierarchy, when running in user mode, we get a D-Bus signal on the system bus
1895 *
1896 * 4. On the legacy hierarchy, in service units we start watching all processes of the cgroup for SIGCHLD as
1897 * soon as we get one SIGCHLD, to deal with unreliable cgroup notifications.
1898 *
1899 * Regardless which way we got the notification, we'll verify it here, and then add it to a separate
1900 * queue. This queue will be dispatched at a lower priority than the SIGCHLD handler, so that we always use
1901 * SIGCHLD if we can get it first, and only use the cgroup empty notifications if there's no SIGCHLD pending
1902          * (which might happen if the cgroup doesn't contain processes that are our own children, which is typically the
1903 * case for scope units). */
1904
1905 if (u->in_cgroup_empty_queue)
1906 return;
1907
1908 /* Let's verify that the cgroup is really empty */
1909 if (!u->cgroup_path)
1910 return;
1911 r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
1912 if (r < 0) {
1913 log_unit_debug_errno(u, r, "Failed to determine whether cgroup %s is empty: %m", u->cgroup_path);
1914 return;
1915 }
1916 if (r == 0)
1917 return;
1918
1919 LIST_PREPEND(cgroup_empty_queue, u->manager->cgroup_empty_queue, u);
1920 u->in_cgroup_empty_queue = true;
1921
1922 /* Trigger the defer event */
1923 r = sd_event_source_set_enabled(u->manager->cgroup_empty_event_source, SD_EVENT_ONESHOT);
1924 if (r < 0)
1925 log_debug_errno(r, "Failed to enable cgroup empty event source: %m");
1926 }
1927
1928 static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
1929 Manager *m = userdata;
1930
1931 assert(s);
1932 assert(fd >= 0);
1933 assert(m);
1934
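                 /* Drain the inotify fd completely: keep reading event batches until read() reports EAGAIN (or is interrupted). */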
1935 for (;;) {
1936 union inotify_event_buffer buffer;
1937 struct inotify_event *e;
1938 ssize_t l;
1939
1940 l = read(fd, &buffer, sizeof(buffer));
1941 if (l < 0) {
1942 if (IN_SET(errno, EINTR, EAGAIN))
1943 return 0;
1944
1945 return log_error_errno(errno, "Failed to read control group inotify events: %m");
1946 }
1947
1948 FOREACH_INOTIFY_EVENT(e, buffer, l) {
1949 Unit *u;
1950
1951 if (e->wd < 0)
1952 /* Queue overflow has no watch descriptor */
1953 continue;
1954
1955 if (e->mask & IN_IGNORED)
1956 /* The watch was just removed */
1957 continue;
1958
1959 u = hashmap_get(m->cgroup_inotify_wd_unit, INT_TO_PTR(e->wd));
1960                         if (!u) /* Note that inotify might deliver
1961 * events for a watch even after it
1962 * was removed, because it was queued
1963 * before the removal. Let's ignore
1964 * this here safely. */
1965 continue;
1966
1967 unit_add_to_cgroup_empty_queue(u);
1968 }
1969 }
1970 }
1971
1972 int manager_setup_cgroup(Manager *m) {
1973 _cleanup_free_ char *path = NULL;
1974 const char *scope_path;
1975 CGroupController c;
1976 int r, all_unified;
1977 char *e;
1978
1979 assert(m);
1980
1981 /* 1. Determine hierarchy */
1982 m->cgroup_root = mfree(m->cgroup_root);
1983 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
1984 if (r < 0)
1985 return log_error_errno(r, "Cannot determine cgroup we are running in: %m");
1986
1987 /* Chop off the init scope, if we are already located in it */
1988 e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
1989
1990 /* LEGACY: Also chop off the system slice if we are in
1991 * it. This is to support live upgrades from older systemd
1992 * versions where PID 1 was moved there. Also see
1993 * cg_get_root_path(). */
1994 if (!e && MANAGER_IS_SYSTEM(m)) {
1995 e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
1996 if (!e)
1997 e = endswith(m->cgroup_root, "/system"); /* even more legacy */
1998 }
1999 if (e)
2000 *e = 0;
2001
2002 /* And make sure to store away the root value without trailing slash, even for the root dir, so that we can
2003 * easily prepend it everywhere. */
2004 delete_trailing_chars(m->cgroup_root, "/");
2005
2006 /* 2. Show data */
2007 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
2008 if (r < 0)
2009 return log_error_errno(r, "Cannot find cgroup mount point: %m");
2010
2011 r = cg_unified_flush();
2012 if (r < 0)
2013 return log_error_errno(r, "Couldn't determine if we are running in the unified hierarchy: %m");
2014
2015 all_unified = cg_all_unified();
2016 if (all_unified < 0)
2017 return log_error_errno(all_unified, "Couldn't determine whether we are in all unified mode: %m");
2018 if (all_unified > 0)
2019 log_debug("Unified cgroup hierarchy is located at %s.", path);
2020 else {
2021 r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
2022 if (r < 0)
2023 return log_error_errno(r, "Failed to determine whether systemd's own controller is in unified mode: %m");
2024 if (r > 0)
2025 log_debug("Unified cgroup hierarchy is located at %s. Controllers are on legacy hierarchies.", path);
2026 else
2027 log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER_LEGACY ". File system hierarchy is at %s.", path);
2028 }
2029
2030 /* 3. Allocate cgroup empty defer event source */
2031 m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);
2032 r = sd_event_add_defer(m->event, &m->cgroup_empty_event_source, on_cgroup_empty_event, m);
2033 if (r < 0)
2034 return log_error_errno(r, "Failed to create cgroup empty event source: %m");
2035
2036 r = sd_event_source_set_priority(m->cgroup_empty_event_source, SD_EVENT_PRIORITY_NORMAL-5);
2037 if (r < 0)
2038 return log_error_errno(r, "Failed to set priority of cgroup empty event source: %m");
2039
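         /* Leave the defer source disabled for now; unit_add_to_cgroup_empty_queue() switches it to ONESHOT whenever work gets queued. */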
2040 r = sd_event_source_set_enabled(m->cgroup_empty_event_source, SD_EVENT_OFF);
2041 if (r < 0)
2042 return log_error_errno(r, "Failed to disable cgroup empty event source: %m");
2043
2044 (void) sd_event_source_set_description(m->cgroup_empty_event_source, "cgroup-empty");
2045
2046 /* 4. Install notifier inotify object, or agent */
2047 if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
2048
2049 /* In the unified hierarchy we can get cgroup empty notifications via inotify. */
2050
2051 m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
2052 safe_close(m->cgroup_inotify_fd);
2053
2054 m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
2055 if (m->cgroup_inotify_fd < 0)
2056 return log_error_errno(errno, "Failed to create control group inotify object: %m");
2057
2058 r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
2059 if (r < 0)
2060 return log_error_errno(r, "Failed to watch control group inotify object: %m");
2061
2062 /* Process cgroup empty notifications early, but after service notifications and SIGCHLD. Also
2063 * see handling of cgroup agent notifications, for the classic cgroup hierarchy support. */
2064 r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-4);
2065 if (r < 0)
2066 return log_error_errno(r, "Failed to set priority of inotify event source: %m");
2067
2068 (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");
2069
2070 } else if (MANAGER_IS_SYSTEM(m) && m->test_run_flags == 0) {
2071
2072 /* On the legacy hierarchy we only get notifications via cgroup agents. (Which isn't really reliable,
2073                  * since it does not generate events when control groups with children run empty.) */
2074
2075 r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
2076 if (r < 0)
2077 log_warning_errno(r, "Failed to install release agent, ignoring: %m");
2078 else if (r > 0)
2079 log_debug("Installed release agent.");
2080 else if (r == 0)
2081 log_debug("Release agent already installed.");
2082 }
2083
2084 /* 5. Make sure we are in the special "init.scope" unit in the root slice. */
2085 scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
2086 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
2087 if (r < 0)
2088 return log_error_errno(r, "Failed to create %s control group: %m", scope_path);
2089
2090 /* Also, move all other userspace processes remaining in the root cgroup into that scope. */
2091 r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
2092 if (r < 0)
2093 log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");
2094
2095 /* 6. And pin it, so that it cannot be unmounted */
2096 safe_close(m->pin_cgroupfs_fd);
2097 m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
2098 if (m->pin_cgroupfs_fd < 0)
2099 return log_error_errno(errno, "Failed to open pin file: %m");
2100
2101 /* 7. Always enable hierarchical support if it exists... */
2102 if (!all_unified && m->test_run_flags == 0)
2103 (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
2104
2105 /* 8. Figure out which controllers are supported, and log about it */
2106 r = cg_mask_supported(&m->cgroup_supported);
2107 if (r < 0)
2108 return log_error_errno(r, "Failed to determine supported controllers: %m");
2109 for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
2110 log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c)));
2111
2112 return 0;
2113 }
2114
2115 void manager_shutdown_cgroup(Manager *m, bool delete) {
2116 assert(m);
2117
2118 /* We can't really delete the group, since we are in it. But
2119 * let's trim it. */
2120 if (delete && m->cgroup_root)
2121 (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
2122
2123 m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);
2124
2125 m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit);
2126
2127 m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
2128 m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);
2129
2130 m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);
2131
2132 m->cgroup_root = mfree(m->cgroup_root);
2133 }
2134
2135 Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
2136 char *p;
2137 Unit *u;
2138
2139 assert(m);
2140 assert(cgroup);
2141
2142 u = hashmap_get(m->cgroup_unit, cgroup);
2143 if (u)
2144 return u;
2145
2146 p = strdupa(cgroup);
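                 /* No exact match, hence walk up the cgroup path one component at a time until some unit owns the prefix, falling back to the root slice. */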
2147 for (;;) {
2148 char *e;
2149
2150 e = strrchr(p, '/');
2151 if (!e || e == p)
2152 return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);
2153
2154 *e = 0;
2155
2156 u = hashmap_get(m->cgroup_unit, p);
2157 if (u)
2158 return u;
2159 }
2160 }
2161
2162 Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) {
2163 _cleanup_free_ char *cgroup = NULL;
2164 int r;
2165
2166 assert(m);
2167
2168 if (pid <= 0)
2169 return NULL;
2170
2171 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
2172 if (r < 0)
2173 return NULL;
2174
2175 return manager_get_unit_by_cgroup(m, cgroup);
2176 }
2177
2178 Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
2179 Unit *u;
2180
2181 assert(m);
2182
2183 if (pid <= 0)
2184 return NULL;
2185
2186 if (pid == getpid_cached())
2187 return hashmap_get(m->units, SPECIAL_INIT_SCOPE);
2188
2189 u = hashmap_get(m->watch_pids1, PID_TO_PTR(pid));
2190 if (u)
2191 return u;
2192
2193 u = hashmap_get(m->watch_pids2, PID_TO_PTR(pid));
2194 if (u)
2195 return u;
2196
2197 return manager_get_unit_by_pid_cgroup(m, pid);
2198 }
2199
2200 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
2201 Unit *u;
2202
2203 assert(m);
2204 assert(cgroup);
2205
2206 /* Called on the legacy hierarchy whenever we get an explicit cgroup notification from the cgroup agent process
2207 * or from the --system instance */
2208
2209 log_debug("Got cgroup empty notification for: %s", cgroup);
2210
2211 u = manager_get_unit_by_cgroup(m, cgroup);
2212 if (!u)
2213 return 0;
2214
2215 unit_add_to_cgroup_empty_queue(u);
2216 return 1;
2217 }
2218
2219 int unit_get_memory_current(Unit *u, uint64_t *ret) {
2220 _cleanup_free_ char *v = NULL;
2221 int r;
2222
2223 assert(u);
2224 assert(ret);
2225
2226 if (!UNIT_CGROUP_BOOL(u, memory_accounting))
2227 return -ENODATA;
2228
2229 if (!u->cgroup_path)
2230 return -ENODATA;
2231
2232 if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
2233 return -ENODATA;
2234
2235 r = cg_all_unified();
2236 if (r < 0)
2237 return r;
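                 /* The unified hierarchy exposes this counter as memory.current, the legacy memory controller as memory.usage_in_bytes. */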
2238 if (r > 0)
2239 r = cg_get_attribute("memory", u->cgroup_path, "memory.current", &v);
2240 else
2241 r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
2242 if (r == -ENOENT)
2243 return -ENODATA;
2244 if (r < 0)
2245 return r;
2246
2247 return safe_atou64(v, ret);
2248 }
2249
2250 int unit_get_tasks_current(Unit *u, uint64_t *ret) {
2251 _cleanup_free_ char *v = NULL;
2252 int r;
2253
2254 assert(u);
2255 assert(ret);
2256
2257 if (!UNIT_CGROUP_BOOL(u, tasks_accounting))
2258 return -ENODATA;
2259
2260 if (!u->cgroup_path)
2261 return -ENODATA;
2262
2263 if ((u->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0)
2264 return -ENODATA;
2265
2266 r = cg_get_attribute("pids", u->cgroup_path, "pids.current", &v);
2267 if (r == -ENOENT)
2268 return -ENODATA;
2269 if (r < 0)
2270 return r;
2271
2272 return safe_atou64(v, ret);
2273 }
2274
2275 static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
2276 _cleanup_free_ char *v = NULL;
2277 uint64_t ns;
2278 int r;
2279
2280 assert(u);
2281 assert(ret);
2282
2283 if (!u->cgroup_path)
2284 return -ENODATA;
2285
2286 r = cg_all_unified();
2287 if (r < 0)
2288 return r;
2289 if (r > 0) {
2290 const char *keys[] = { "usage_usec", NULL };
2291 _cleanup_free_ char *val = NULL;
2292 uint64_t us;
2293
2294 if ((u->cgroup_realized_mask & CGROUP_MASK_CPU) == 0)
2295 return -ENODATA;
2296
2297 r = cg_get_keyed_attribute("cpu", u->cgroup_path, "cpu.stat", keys, &val);
2298 if (r < 0)
2299 return r;
2300
2301 r = safe_atou64(val, &us);
2302 if (r < 0)
2303 return r;
2304
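                         /* cpu.stat reports usage_usec in microseconds, hence convert to nanoseconds to match the legacy cpuacct.usage value read below. */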
2305 ns = us * NSEC_PER_USEC;
2306 } else {
2307 if ((u->cgroup_realized_mask & CGROUP_MASK_CPUACCT) == 0)
2308 return -ENODATA;
2309
2310 r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);
2311 if (r == -ENOENT)
2312 return -ENODATA;
2313 if (r < 0)
2314 return r;
2315
2316 r = safe_atou64(v, &ns);
2317 if (r < 0)
2318 return r;
2319 }
2320
2321 *ret = ns;
2322 return 0;
2323 }
2324
2325 int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
2326 nsec_t ns;
2327 int r;
2328
2329 assert(u);
2330
2331 /* Retrieve the current CPU usage counter. This will subtract the CPU counter taken when the unit was
2332 * started. If the cgroup has been removed already, returns the last cached value. To cache the value, simply
2333 * call this function with a NULL return value. */
2334
2335 if (!UNIT_CGROUP_BOOL(u, cpu_accounting))
2336 return -ENODATA;
2337
2338 r = unit_get_cpu_usage_raw(u, &ns);
2339 if (r == -ENODATA && u->cpu_usage_last != NSEC_INFINITY) {
2340 /* If we can't get the CPU usage anymore (because the cgroup was already removed, for example), use our
2341 * cached value. */
2342
2343 if (ret)
2344 *ret = u->cpu_usage_last;
2345 return 0;
2346 }
2347 if (r < 0)
2348 return r;
2349
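         /* Subtract the base counter sampled when accounting was last reset, clamping at zero so the unsigned subtraction cannot wrap around. */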
2350 if (ns > u->cpu_usage_base)
2351 ns -= u->cpu_usage_base;
2352 else
2353 ns = 0;
2354
2355 u->cpu_usage_last = ns;
2356 if (ret)
2357 *ret = ns;
2358
2359 return 0;
2360 }
2361
2362 int unit_get_ip_accounting(
2363 Unit *u,
2364 CGroupIPAccountingMetric metric,
2365 uint64_t *ret) {
2366
2367 uint64_t value;
2368 int fd, r;
2369
2370 assert(u);
2371 assert(metric >= 0);
2372 assert(metric < _CGROUP_IP_ACCOUNTING_METRIC_MAX);
2373 assert(ret);
2374
2375 /* IP accounting is currently not recursive, and hence we refuse to return any data for slice nodes. Slices are
2376          * inner cgroup nodes with no processes directly attached, so their counters would be zero
2377          * anyway. And if we refuse this now, we can still open it up later, should the kernel learn recursive BPF cgroup
2378 * filters. */
2379 if (u->type == UNIT_SLICE)
2380 return -ENODATA;
2381
2382 if (!UNIT_CGROUP_BOOL(u, ip_accounting))
2383 return -ENODATA;
2384
2385 fd = IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_INGRESS_PACKETS) ?
2386 u->ip_accounting_ingress_map_fd :
2387 u->ip_accounting_egress_map_fd;
2388 if (fd < 0)
2389 return -ENODATA;
2390
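         /* bpf_firewall_read_accounting() returns the byte counter via its second argument and the packet counter via its third, hence pick whichever matches the requested metric. */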
2391 if (IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_EGRESS_BYTES))
2392 r = bpf_firewall_read_accounting(fd, &value, NULL);
2393 else
2394 r = bpf_firewall_read_accounting(fd, NULL, &value);
2395 if (r < 0)
2396 return r;
2397
2398 /* Add in additional metrics from a previous runtime. Note that when reexecing/reloading the daemon we compile
2399 * all BPF programs and maps anew, but serialize the old counters. When deserializing we store them in the
2400 * ip_accounting_extra[] field, and add them in here transparently. */
2401
2402 *ret = value + u->ip_accounting_extra[metric];
2403
2404 return r;
2405 }
2406
2407 int unit_reset_cpu_accounting(Unit *u) {
2408 nsec_t ns;
2409 int r;
2410
2411 assert(u);
2412
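         /* NSEC_INFINITY marks the cached value as invalid; unit_get_cpu_usage() falls back to u->cpu_usage_last only when it is set to something else. */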
2413 u->cpu_usage_last = NSEC_INFINITY;
2414
2415 r = unit_get_cpu_usage_raw(u, &ns);
2416 if (r < 0) {
2417 u->cpu_usage_base = 0;
2418 return r;
2419 }
2420
2421 u->cpu_usage_base = ns;
2422 return 0;
2423 }
2424
2425 int unit_reset_ip_accounting(Unit *u) {
2426 int r = 0, q = 0;
2427
2428 assert(u);
2429
2430 if (u->ip_accounting_ingress_map_fd >= 0)
2431 r = bpf_firewall_reset_accounting(u->ip_accounting_ingress_map_fd);
2432
2433 if (u->ip_accounting_egress_map_fd >= 0)
2434 q = bpf_firewall_reset_accounting(u->ip_accounting_egress_map_fd);
2435
2436 zero(u->ip_accounting_extra);
2437
2438 return r < 0 ? r : q;
2439 }
2440
2441 void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
2442 assert(u);
2443
2444 if (!UNIT_HAS_CGROUP_CONTEXT(u))
2445 return;
2446
2447 if (m == 0)
2448 return;
2449
2450 /* always invalidate compat pairs together */
2451 if (m & (CGROUP_MASK_IO | CGROUP_MASK_BLKIO))
2452 m |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
2453
2454 if (m & (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT))
2455 m |= CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT;
2456
2457 if ((u->cgroup_realized_mask & m) == 0) /* NOP? */
2458 return;
2459
2460 u->cgroup_realized_mask &= ~m;
2461 unit_add_to_cgroup_realize_queue(u);
2462 }
2463
2464 void unit_invalidate_cgroup_bpf(Unit *u) {
2465 assert(u);
2466
2467 if (!UNIT_HAS_CGROUP_CONTEXT(u))
2468 return;
2469
2470 if (u->cgroup_bpf_state == UNIT_CGROUP_BPF_INVALIDATED) /* NOP? */
2471 return;
2472
2473 u->cgroup_bpf_state = UNIT_CGROUP_BPF_INVALIDATED;
2474 unit_add_to_cgroup_realize_queue(u);
2475
2476                 /* If we are a slice unit, we also need to compile a new BPF program for all our children, as the IP access
2477 * list of our children includes our own. */
2478 if (u->type == UNIT_SLICE) {
2479 Unit *member;
2480 Iterator i;
2481 void *v;
2482
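                         /* Our member units are ordered after us, hence scan the UNIT_BEFORE dependencies and recurse into every unit whose slice is this one. */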
2483 HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) {
2484 if (member == u)
2485 continue;
2486
2487 if (UNIT_DEREF(member->slice) != u)
2488 continue;
2489
2490 unit_invalidate_cgroup_bpf(member);
2491 }
2492 }
2493 }
2494
2495 void manager_invalidate_startup_units(Manager *m) {
2496 Iterator i;
2497 Unit *u;
2498
2499 assert(m);
2500
2501 SET_FOREACH(u, m->startup_units, i)
2502 unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_IO|CGROUP_MASK_BLKIO);
2503 }
2504
2505 static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
2506 [CGROUP_AUTO] = "auto",
2507 [CGROUP_CLOSED] = "closed",
2508 [CGROUP_STRICT] = "strict",
2509 };
2510
2511 DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);