1 /* SPDX-License-Identifier: LGPL-2.1+ */
2 /***
3 This file is part of systemd.
4
5 Copyright 2013 Lennart Poettering
6
7 systemd is free software; you can redistribute it and/or modify it
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
10 (at your option) any later version.
11
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
16
17 You should have received a copy of the GNU Lesser General Public License
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
19 ***/
20
21 #include <fcntl.h>
22 #include <fnmatch.h>
23
24 #include "alloc-util.h"
25 #include "blockdev-util.h"
26 #include "bpf-firewall.h"
27 #include "cgroup-util.h"
28 #include "cgroup.h"
29 #include "fd-util.h"
30 #include "fileio.h"
31 #include "fs-util.h"
32 #include "parse-util.h"
33 #include "path-util.h"
34 #include "process-util.h"
35 #include "special.h"
36 #include "stdio-util.h"
37 #include "string-table.h"
38 #include "string-util.h"
39
40 #define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
41
42 static void cgroup_compat_warn(void) {
43 static bool cgroup_compat_warned = false;
44
45 if (cgroup_compat_warned)
46 return;
47
48 log_warning("cgroup compatibility translation between legacy and unified hierarchy settings activated. See cgroup-compat debug messages for details.");
49 cgroup_compat_warned = true;
50 }
51
52 #define log_cgroup_compat(unit, fmt, ...) do { \
53 cgroup_compat_warn(); \
54 log_unit_debug(unit, "cgroup-compat: " fmt, ##__VA_ARGS__); \
55 } while (false)
56
57 void cgroup_context_init(CGroupContext *c) {
58 assert(c);
59
60 /* Initialize everything to the kernel defaults, assuming the
61 * structure is preinitialized to 0 */
62
63 c->cpu_weight = CGROUP_WEIGHT_INVALID;
64 c->startup_cpu_weight = CGROUP_WEIGHT_INVALID;
65 c->cpu_quota_per_sec_usec = USEC_INFINITY;
66
67 c->cpu_shares = CGROUP_CPU_SHARES_INVALID;
68 c->startup_cpu_shares = CGROUP_CPU_SHARES_INVALID;
69
70 c->memory_high = CGROUP_LIMIT_MAX;
71 c->memory_max = CGROUP_LIMIT_MAX;
72 c->memory_swap_max = CGROUP_LIMIT_MAX;
73
74 c->memory_limit = CGROUP_LIMIT_MAX;
75
76 c->io_weight = CGROUP_WEIGHT_INVALID;
77 c->startup_io_weight = CGROUP_WEIGHT_INVALID;
78
79 c->blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
80 c->startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
81
82 c->tasks_max = (uint64_t) -1;
83 }
84
85 void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
86 assert(c);
87 assert(a);
88
89 LIST_REMOVE(device_allow, c->device_allow, a);
90 free(a->path);
91 free(a);
92 }
93
94 void cgroup_context_free_io_device_weight(CGroupContext *c, CGroupIODeviceWeight *w) {
95 assert(c);
96 assert(w);
97
98 LIST_REMOVE(device_weights, c->io_device_weights, w);
99 free(w->path);
100 free(w);
101 }
102
103 void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit *l) {
104 assert(c);
105 assert(l);
106
107 LIST_REMOVE(device_limits, c->io_device_limits, l);
108 free(l->path);
109 free(l);
110 }
111
112 void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
113 assert(c);
114 assert(w);
115
116 LIST_REMOVE(device_weights, c->blockio_device_weights, w);
117 free(w->path);
118 free(w);
119 }
120
121 void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
122 assert(c);
123 assert(b);
124
125 LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
126 free(b->path);
127 free(b);
128 }
129
130 void cgroup_context_done(CGroupContext *c) {
131 assert(c);
132
133 while (c->io_device_weights)
134 cgroup_context_free_io_device_weight(c, c->io_device_weights);
135
136 while (c->io_device_limits)
137 cgroup_context_free_io_device_limit(c, c->io_device_limits);
138
139 while (c->blockio_device_weights)
140 cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
141
142 while (c->blockio_device_bandwidths)
143 cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);
144
145 while (c->device_allow)
146 cgroup_context_free_device_allow(c, c->device_allow);
147
148 c->ip_address_allow = ip_address_access_free_all(c->ip_address_allow);
149 c->ip_address_deny = ip_address_access_free_all(c->ip_address_deny);
150 }
151
152 void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
153 CGroupIODeviceLimit *il;
154 CGroupIODeviceWeight *iw;
155 CGroupBlockIODeviceBandwidth *b;
156 CGroupBlockIODeviceWeight *w;
157 CGroupDeviceAllow *a;
158 IPAddressAccessItem *iaai;
159 char u[FORMAT_TIMESPAN_MAX];
160
161 assert(c);
162 assert(f);
163
164 prefix = strempty(prefix);
165
166 fprintf(f,
167 "%sCPUAccounting=%s\n"
168 "%sIOAccounting=%s\n"
169 "%sBlockIOAccounting=%s\n"
170 "%sMemoryAccounting=%s\n"
171 "%sTasksAccounting=%s\n"
172 "%sIPAccounting=%s\n"
173 "%sCPUWeight=%" PRIu64 "\n"
174 "%sStartupCPUWeight=%" PRIu64 "\n"
175 "%sCPUShares=%" PRIu64 "\n"
176 "%sStartupCPUShares=%" PRIu64 "\n"
177 "%sCPUQuotaPerSecSec=%s\n"
178 "%sIOWeight=%" PRIu64 "\n"
179 "%sStartupIOWeight=%" PRIu64 "\n"
180 "%sBlockIOWeight=%" PRIu64 "\n"
181 "%sStartupBlockIOWeight=%" PRIu64 "\n"
182 "%sMemoryLow=%" PRIu64 "\n"
183 "%sMemoryHigh=%" PRIu64 "\n"
184 "%sMemoryMax=%" PRIu64 "\n"
185 "%sMemorySwapMax=%" PRIu64 "\n"
186 "%sMemoryLimit=%" PRIu64 "\n"
187 "%sTasksMax=%" PRIu64 "\n"
188 "%sDevicePolicy=%s\n"
189 "%sDelegate=%s\n",
190 prefix, yes_no(c->cpu_accounting),
191 prefix, yes_no(c->io_accounting),
192 prefix, yes_no(c->blockio_accounting),
193 prefix, yes_no(c->memory_accounting),
194 prefix, yes_no(c->tasks_accounting),
195 prefix, yes_no(c->ip_accounting),
196 prefix, c->cpu_weight,
197 prefix, c->startup_cpu_weight,
198 prefix, c->cpu_shares,
199 prefix, c->startup_cpu_shares,
200 prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
201 prefix, c->io_weight,
202 prefix, c->startup_io_weight,
203 prefix, c->blockio_weight,
204 prefix, c->startup_blockio_weight,
205 prefix, c->memory_low,
206 prefix, c->memory_high,
207 prefix, c->memory_max,
208 prefix, c->memory_swap_max,
209 prefix, c->memory_limit,
210 prefix, c->tasks_max,
211 prefix, cgroup_device_policy_to_string(c->device_policy),
212 prefix, yes_no(c->delegate));
213
214 if (c->delegate) {
215 _cleanup_free_ char *t = NULL;
216
217 (void) cg_mask_to_string(c->delegate_controllers, &t);
218
219 fprintf(f, "%sDelegateControllers=%s\n",
220 prefix,
221 strempty(t));
222 }
223
224 LIST_FOREACH(device_allow, a, c->device_allow)
225 fprintf(f,
226 "%sDeviceAllow=%s %s%s%s\n",
227 prefix,
228 a->path,
229 a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
230
231 LIST_FOREACH(device_weights, iw, c->io_device_weights)
232 fprintf(f,
233 "%sIODeviceWeight=%s %" PRIu64,
234 prefix,
235 iw->path,
236 iw->weight);
237
238 LIST_FOREACH(device_limits, il, c->io_device_limits) {
239 char buf[FORMAT_BYTES_MAX];
240 CGroupIOLimitType type;
241
242 for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
243 if (il->limits[type] != cgroup_io_limit_defaults[type])
244 fprintf(f,
245 "%s%s=%s %s\n",
246 prefix,
247 cgroup_io_limit_type_to_string(type),
248 il->path,
249 format_bytes(buf, sizeof(buf), il->limits[type]));
250 }
251
252 LIST_FOREACH(device_weights, w, c->blockio_device_weights)
253 fprintf(f,
254 "%sBlockIODeviceWeight=%s %" PRIu64,
255 prefix,
256 w->path,
257 w->weight);
258
259 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
260 char buf[FORMAT_BYTES_MAX];
261
262 if (b->rbps != CGROUP_LIMIT_MAX)
263 fprintf(f,
264 "%sBlockIOReadBandwidth=%s %s\n",
265 prefix,
266 b->path,
267 format_bytes(buf, sizeof(buf), b->rbps));
268 if (b->wbps != CGROUP_LIMIT_MAX)
269 fprintf(f,
270 "%sBlockIOWriteBandwidth=%s %s\n",
271 prefix,
272 b->path,
273 format_bytes(buf, sizeof(buf), b->wbps));
274 }
275
276 LIST_FOREACH(items, iaai, c->ip_address_allow) {
277 _cleanup_free_ char *k = NULL;
278
279 (void) in_addr_to_string(iaai->family, &iaai->address, &k);
280 fprintf(f, "%sIPAddressAllow=%s/%u\n", prefix, strnull(k), iaai->prefixlen);
281 }
282
283 LIST_FOREACH(items, iaai, c->ip_address_deny) {
284 _cleanup_free_ char *k = NULL;
285
286 (void) in_addr_to_string(iaai->family, &iaai->address, &k);
287 fprintf(f, "%sIPAddressDeny=%s/%u\n", prefix, strnull(k), iaai->prefixlen);
288 }
289 }
290
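/* Resolve a path to the block device (dev_t) backing it: the node itself if it is a block device,
 * otherwise the device of the file system it lives on, preferring the whole disk over a partition. */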
291 static int lookup_block_device(const char *p, dev_t *dev) {
292 struct stat st;
293 int r;
294
295 assert(p);
296 assert(dev);
297
298 r = stat(p, &st);
299 if (r < 0)
300 return log_warning_errno(errno, "Couldn't stat device %s: %m", p);
301
302 if (S_ISBLK(st.st_mode))
303 *dev = st.st_rdev;
304 else if (major(st.st_dev) != 0) {
305 /* If this is not a device node then find the block
306 * device this file is stored on */
307 *dev = st.st_dev;
308
309 /* If this is a partition, try to get the originating
310 * block device */
311 (void) block_get_whole_disk(*dev, dev);
312 } else {
313 log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
314 return -ENODEV;
315 }
316
317 return 0;
318 }
319
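/* Add a single device node to the legacy "devices" controller whitelist by writing a "<c|b> <major>:<minor> <acc>"
 * entry to devices.allow. A leading '-' in the node path means a missing node is silently skipped. */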
320 static int whitelist_device(const char *path, const char *node, const char *acc) {
321 char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
322 struct stat st;
323 bool ignore_notfound;
324 int r;
325
326 assert(path);
327 assert(acc);
328
329 if (node[0] == '-') {
330 /* Non-existent paths starting with "-" must be silently ignored */
331 node++;
332 ignore_notfound = true;
333 } else
334 ignore_notfound = false;
335
336 if (stat(node, &st) < 0) {
337 if (errno == ENOENT && ignore_notfound)
338 return 0;
339
340 return log_warning_errno(errno, "Couldn't stat device %s: %m", node);
341 }
342
343 if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
344 log_warning("%s is not a device.", node);
345 return -ENODEV;
346 }
347
348 sprintf(buf,
349 "%c %u:%u %s",
350 S_ISCHR(st.st_mode) ? 'c' : 'b',
351 major(st.st_rdev), minor(st.st_rdev),
352 acc);
353
354 r = cg_set_attribute("devices", path, "devices.allow", buf);
355 if (r < 0)
356 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
357 "Failed to set devices.allow on %s: %m", path);
358
359 return r;
360 }
361
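/* Whitelist all minors of every major whose driver name matches the given glob, by parsing
 * /proc/devices and writing "<type> <major>:* <acc>" entries to devices.allow. */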
362 static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
363 _cleanup_fclose_ FILE *f = NULL;
364 char line[LINE_MAX];
365 bool good = false;
366 int r;
367
368 assert(path);
369 assert(acc);
370 assert(IN_SET(type, 'b', 'c'));
371
372 f = fopen("/proc/devices", "re");
373 if (!f)
374 return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);
375
376 FOREACH_LINE(line, f, goto fail) {
377 char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
378 unsigned maj;
379
380 truncate_nl(line);
381
382 if (type == 'c' && streq(line, "Character devices:")) {
383 good = true;
384 continue;
385 }
386
387 if (type == 'b' && streq(line, "Block devices:")) {
388 good = true;
389 continue;
390 }
391
392 if (isempty(line)) {
393 good = false;
394 continue;
395 }
396
397 if (!good)
398 continue;
399
400 p = strstrip(line);
401
402 w = strpbrk(p, WHITESPACE);
403 if (!w)
404 continue;
405 *w = 0;
406
407 r = safe_atou(p, &maj);
408 if (r < 0)
409 continue;
410 if (maj <= 0)
411 continue;
412
413 w++;
414 w += strspn(w, WHITESPACE);
415
416 if (fnmatch(name, w, 0) != 0)
417 continue;
418
419 sprintf(buf,
420 "%c %u:* %s",
421 type,
422 maj,
423 acc);
424
425 r = cg_set_attribute("devices", path, "devices.allow", buf);
426 if (r < 0)
427 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
428 "Failed to set devices.allow on %s: %m", path);
429 }
430
431 return 0;
432
433 fail:
434 return log_warning_errno(errno, "Failed to read /proc/devices: %m");
435 }
436
437 static bool cgroup_context_has_cpu_weight(CGroupContext *c) {
438 return c->cpu_weight != CGROUP_WEIGHT_INVALID ||
439 c->startup_cpu_weight != CGROUP_WEIGHT_INVALID;
440 }
441
442 static bool cgroup_context_has_cpu_shares(CGroupContext *c) {
443 return c->cpu_shares != CGROUP_CPU_SHARES_INVALID ||
444 c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID;
445 }
446
447 static uint64_t cgroup_context_cpu_weight(CGroupContext *c, ManagerState state) {
448 if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
449 c->startup_cpu_weight != CGROUP_WEIGHT_INVALID)
450 return c->startup_cpu_weight;
451 else if (c->cpu_weight != CGROUP_WEIGHT_INVALID)
452 return c->cpu_weight;
453 else
454 return CGROUP_WEIGHT_DEFAULT;
455 }
456
457 static uint64_t cgroup_context_cpu_shares(CGroupContext *c, ManagerState state) {
458 if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
459 c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID)
460 return c->startup_cpu_shares;
461 else if (c->cpu_shares != CGROUP_CPU_SHARES_INVALID)
462 return c->cpu_shares;
463 else
464 return CGROUP_CPU_SHARES_DEFAULT;
465 }
466
467 static void cgroup_apply_unified_cpu_config(Unit *u, uint64_t weight, uint64_t quota) {
468 char buf[MAX(DECIMAL_STR_MAX(uint64_t) + 1, (DECIMAL_STR_MAX(usec_t) + 1) * 2)];
469 int r;
470
471 xsprintf(buf, "%" PRIu64 "\n", weight);
472 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.weight", buf);
473 if (r < 0)
474 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
475 "Failed to set cpu.weight: %m");
476
477 if (quota != USEC_INFINITY)
478 xsprintf(buf, USEC_FMT " " USEC_FMT "\n",
479 quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC, CGROUP_CPU_QUOTA_PERIOD_USEC);
480 else
481 xsprintf(buf, "max " USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
482
483 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.max", buf);
484
485 if (r < 0)
486 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
487 "Failed to set cpu.max: %m");
488 }
489
490 static void cgroup_apply_legacy_cpu_config(Unit *u, uint64_t shares, uint64_t quota) {
491 char buf[MAX(DECIMAL_STR_MAX(uint64_t), DECIMAL_STR_MAX(usec_t)) + 1];
492 int r;
493
494 xsprintf(buf, "%" PRIu64 "\n", shares);
495 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.shares", buf);
496 if (r < 0)
497 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
498 "Failed to set cpu.shares: %m");
499
500 xsprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
501 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_period_us", buf);
502 if (r < 0)
503 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
504 "Failed to set cpu.cfs_period_us: %m");
505
506 if (quota != USEC_INFINITY) {
507 xsprintf(buf, USEC_FMT "\n", quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
508 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_quota_us", buf);
509 } else
510 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_quota_us", "-1");
511 if (r < 0)
512 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
513 "Failed to set cpu.cfs_quota_us: %m");
514 }
515
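/* Conversions between legacy cpu.shares and unified cpu.weight values: both scale linearly
 * around their respective defaults and are clamped to the valid range. */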
516 static uint64_t cgroup_cpu_shares_to_weight(uint64_t shares) {
517 return CLAMP(shares * CGROUP_WEIGHT_DEFAULT / CGROUP_CPU_SHARES_DEFAULT,
518 CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
519 }
520
521 static uint64_t cgroup_cpu_weight_to_shares(uint64_t weight) {
522 return CLAMP(weight * CGROUP_CPU_SHARES_DEFAULT / CGROUP_WEIGHT_DEFAULT,
523 CGROUP_CPU_SHARES_MIN, CGROUP_CPU_SHARES_MAX);
524 }
525
526 static bool cgroup_context_has_io_config(CGroupContext *c) {
527 return c->io_accounting ||
528 c->io_weight != CGROUP_WEIGHT_INVALID ||
529 c->startup_io_weight != CGROUP_WEIGHT_INVALID ||
530 c->io_device_weights ||
531 c->io_device_limits;
532 }
533
534 static bool cgroup_context_has_blockio_config(CGroupContext *c) {
535 return c->blockio_accounting ||
536 c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
537 c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
538 c->blockio_device_weights ||
539 c->blockio_device_bandwidths;
540 }
541
542 static uint64_t cgroup_context_io_weight(CGroupContext *c, ManagerState state) {
543 if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
544 c->startup_io_weight != CGROUP_WEIGHT_INVALID)
545 return c->startup_io_weight;
546 else if (c->io_weight != CGROUP_WEIGHT_INVALID)
547 return c->io_weight;
548 else
549 return CGROUP_WEIGHT_DEFAULT;
550 }
551
552 static uint64_t cgroup_context_blkio_weight(CGroupContext *c, ManagerState state) {
553 if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
554 c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
555 return c->startup_blockio_weight;
556 else if (c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
557 return c->blockio_weight;
558 else
559 return CGROUP_BLKIO_WEIGHT_DEFAULT;
560 }
561
562 static uint64_t cgroup_weight_blkio_to_io(uint64_t blkio_weight) {
563 return CLAMP(blkio_weight * CGROUP_WEIGHT_DEFAULT / CGROUP_BLKIO_WEIGHT_DEFAULT,
564 CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
565 }
566
567 static uint64_t cgroup_weight_io_to_blkio(uint64_t io_weight) {
568 return CLAMP(io_weight * CGROUP_BLKIO_WEIGHT_DEFAULT / CGROUP_WEIGHT_DEFAULT,
569 CGROUP_BLKIO_WEIGHT_MIN, CGROUP_BLKIO_WEIGHT_MAX);
570 }
571
572 static void cgroup_apply_io_device_weight(Unit *u, const char *dev_path, uint64_t io_weight) {
573 char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
574 dev_t dev;
575 int r;
576
577 r = lookup_block_device(dev_path, &dev);
578 if (r < 0)
579 return;
580
581 xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), io_weight);
582 r = cg_set_attribute("io", u->cgroup_path, "io.weight", buf);
583 if (r < 0)
584 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
585 "Failed to set io.weight: %m");
586 }
587
588 static void cgroup_apply_blkio_device_weight(Unit *u, const char *dev_path, uint64_t blkio_weight) {
589 char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
590 dev_t dev;
591 int r;
592
593 r = lookup_block_device(dev_path, &dev);
594 if (r < 0)
595 return;
596
597 xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), blkio_weight);
598 r = cg_set_attribute("blkio", u->cgroup_path, "blkio.weight_device", buf);
599 if (r < 0)
600 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
601 "Failed to set blkio.weight_device: %m");
602 }
603
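/* Write a per-device io.max line (rbps/wbps/riops/wiops) on the unified hierarchy. Returns the number
 * of limits that deviate from the defaults, so that callers can free entries that configure nothing. */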
604 static unsigned cgroup_apply_io_device_limit(Unit *u, const char *dev_path, uint64_t *limits) {
605 char limit_bufs[_CGROUP_IO_LIMIT_TYPE_MAX][DECIMAL_STR_MAX(uint64_t)];
606 char buf[DECIMAL_STR_MAX(dev_t)*2+2+(6+DECIMAL_STR_MAX(uint64_t)+1)*4];
607 CGroupIOLimitType type;
608 dev_t dev;
609 unsigned n = 0;
610 int r;
611
612 r = lookup_block_device(dev_path, &dev);
613 if (r < 0)
614 return 0;
615
616 for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++) {
617 if (limits[type] != cgroup_io_limit_defaults[type]) {
618 xsprintf(limit_bufs[type], "%" PRIu64, limits[type]);
619 n++;
620 } else {
621 xsprintf(limit_bufs[type], "%s", limits[type] == CGROUP_LIMIT_MAX ? "max" : "0");
622 }
623 }
624
625 xsprintf(buf, "%u:%u rbps=%s wbps=%s riops=%s wiops=%s\n", major(dev), minor(dev),
626 limit_bufs[CGROUP_IO_RBPS_MAX], limit_bufs[CGROUP_IO_WBPS_MAX],
627 limit_bufs[CGROUP_IO_RIOPS_MAX], limit_bufs[CGROUP_IO_WIOPS_MAX]);
628 r = cg_set_attribute("io", u->cgroup_path, "io.max", buf);
629 if (r < 0)
630 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
631 "Failed to set io.max: %m");
632 return n;
633 }
634
635 static unsigned cgroup_apply_blkio_device_limit(Unit *u, const char *dev_path, uint64_t rbps, uint64_t wbps) {
636 char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
637 dev_t dev;
638 unsigned n = 0;
639 int r;
640
641 r = lookup_block_device(dev_path, &dev);
642 if (r < 0)
643 return 0;
644
645 if (rbps != CGROUP_LIMIT_MAX)
646 n++;
647 sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), rbps);
648 r = cg_set_attribute("blkio", u->cgroup_path, "blkio.throttle.read_bps_device", buf);
649 if (r < 0)
650 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
651 "Failed to set blkio.throttle.read_bps_device: %m");
652
653 if (wbps != CGROUP_LIMIT_MAX)
654 n++;
655 sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), wbps);
656 r = cg_set_attribute("blkio", u->cgroup_path, "blkio.throttle.write_bps_device", buf);
657 if (r < 0)
658 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
659 "Failed to set blkio.throttle.write_bps_device: %m");
660
661 return n;
662 }
663
664 static bool cgroup_context_has_unified_memory_config(CGroupContext *c) {
665 return c->memory_low > 0 || c->memory_high != CGROUP_LIMIT_MAX || c->memory_max != CGROUP_LIMIT_MAX || c->memory_swap_max != CGROUP_LIMIT_MAX;
666 }
667
668 static void cgroup_apply_unified_memory_limit(Unit *u, const char *file, uint64_t v) {
669 char buf[DECIMAL_STR_MAX(uint64_t) + 1] = "max";
670 int r;
671
672 if (v != CGROUP_LIMIT_MAX)
673 xsprintf(buf, "%" PRIu64 "\n", v);
674
675 r = cg_set_attribute("memory", u->cgroup_path, file, buf);
676 if (r < 0)
677 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
678 "Failed to set %s: %m", file);
679 }
680
681 static void cgroup_apply_firewall(Unit *u) {
682 int r;
683
684 assert(u);
685
686 if (u->type == UNIT_SLICE) /* Skip this for slice units: they are inner cgroup nodes, and since bpf/cgroup is
687 * not recursive, we don't ever touch the bpf on them */
688 return;
689
690 r = bpf_firewall_compile(u);
691 if (r < 0)
692 return;
693
694 (void) bpf_firewall_install(u);
695 return;
696 }
697
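/* Apply the unit's CGroupContext to the kernel for all controllers selected in apply_mask, translating
 * between unified and legacy attributes as necessary, and optionally (re)install the BPF firewall. */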
698 static void cgroup_context_apply(
699 Unit *u,
700 CGroupMask apply_mask,
701 bool apply_bpf,
702 ManagerState state) {
703
704 const char *path;
705 CGroupContext *c;
706 bool is_root;
707 int r;
708
709 assert(u);
710
711 c = unit_get_cgroup_context(u);
712 path = u->cgroup_path;
713
714 assert(c);
715 assert(path);
716
717 /* Nothing to do? Exit early! */
718 if (apply_mask == 0 && !apply_bpf)
719 return;
720
721 /* Some cgroup attributes are not supported on the root cgroup,
722 * hence silently ignore */
723 is_root = isempty(path) || path_equal(path, "/");
724 if (is_root)
725 /* Make sure we don't try to display messages with an empty path. */
726 path = "/";
727
728 /* We generally ignore errors caused by read-only mounted
729 * cgroup trees (assuming we are running in a container then),
730 * and missing cgroups, i.e. EROFS and ENOENT. */
731
732 if ((apply_mask & CGROUP_MASK_CPU) && !is_root) {
733 bool has_weight, has_shares;
734
735 has_weight = cgroup_context_has_cpu_weight(c);
736 has_shares = cgroup_context_has_cpu_shares(c);
737
738 if (cg_all_unified() > 0) {
739 uint64_t weight;
740
741 if (has_weight)
742 weight = cgroup_context_cpu_weight(c, state);
743 else if (has_shares) {
744 uint64_t shares = cgroup_context_cpu_shares(c, state);
745
746 weight = cgroup_cpu_shares_to_weight(shares);
747
748 log_cgroup_compat(u, "Applying [Startup]CpuShares %" PRIu64 " as [Startup]CpuWeight %" PRIu64 " on %s",
749 shares, weight, path);
750 } else
751 weight = CGROUP_WEIGHT_DEFAULT;
752
753 cgroup_apply_unified_cpu_config(u, weight, c->cpu_quota_per_sec_usec);
754 } else {
755 uint64_t shares;
756
757 if (has_weight) {
758 uint64_t weight = cgroup_context_cpu_weight(c, state);
759
760 shares = cgroup_cpu_weight_to_shares(weight);
761
762 log_cgroup_compat(u, "Applying [Startup]CpuWeight %" PRIu64 " as [Startup]CpuShares %" PRIu64 " on %s",
763 weight, shares, path);
764 } else if (has_shares)
765 shares = cgroup_context_cpu_shares(c, state);
766 else
767 shares = CGROUP_CPU_SHARES_DEFAULT;
768
769 cgroup_apply_legacy_cpu_config(u, shares, c->cpu_quota_per_sec_usec);
770 }
771 }
772
773 if (apply_mask & CGROUP_MASK_IO) {
774 bool has_io = cgroup_context_has_io_config(c);
775 bool has_blockio = cgroup_context_has_blockio_config(c);
776
777 if (!is_root) {
778 char buf[8+DECIMAL_STR_MAX(uint64_t)+1];
779 uint64_t weight;
780
781 if (has_io)
782 weight = cgroup_context_io_weight(c, state);
783 else if (has_blockio) {
784 uint64_t blkio_weight = cgroup_context_blkio_weight(c, state);
785
786 weight = cgroup_weight_blkio_to_io(blkio_weight);
787
788 log_cgroup_compat(u, "Applying [Startup]BlockIOWeight %" PRIu64 " as [Startup]IOWeight %" PRIu64,
789 blkio_weight, weight);
790 } else
791 weight = CGROUP_WEIGHT_DEFAULT;
792
793 xsprintf(buf, "default %" PRIu64 "\n", weight);
794 r = cg_set_attribute("io", path, "io.weight", buf);
795 if (r < 0)
796 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
797 "Failed to set io.weight: %m");
798
799 if (has_io) {
800 CGroupIODeviceWeight *w;
801
802 /* FIXME: no way to reset this list */
803 LIST_FOREACH(device_weights, w, c->io_device_weights)
804 cgroup_apply_io_device_weight(u, w->path, w->weight);
805 } else if (has_blockio) {
806 CGroupBlockIODeviceWeight *w;
807
808 /* FIXME: no way to reset this list */
809 LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
810 weight = cgroup_weight_blkio_to_io(w->weight);
811
812 log_cgroup_compat(u, "Applying BlockIODeviceWeight %" PRIu64 " as IODeviceWeight %" PRIu64 " for %s",
813 w->weight, weight, w->path);
814
815 cgroup_apply_io_device_weight(u, w->path, weight);
816 }
817 }
818 }
819
820 /* Apply limits and free ones without config. */
821 if (has_io) {
822 CGroupIODeviceLimit *l, *next;
823
824 LIST_FOREACH_SAFE(device_limits, l, next, c->io_device_limits) {
825 if (!cgroup_apply_io_device_limit(u, l->path, l->limits))
826 cgroup_context_free_io_device_limit(c, l);
827 }
828 } else if (has_blockio) {
829 CGroupBlockIODeviceBandwidth *b, *next;
830
831 LIST_FOREACH_SAFE(device_bandwidths, b, next, c->blockio_device_bandwidths) {
832 uint64_t limits[_CGROUP_IO_LIMIT_TYPE_MAX];
833 CGroupIOLimitType type;
834
835 for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
836 limits[type] = cgroup_io_limit_defaults[type];
837
838 limits[CGROUP_IO_RBPS_MAX] = b->rbps;
839 limits[CGROUP_IO_WBPS_MAX] = b->wbps;
840
841 log_cgroup_compat(u, "Applying BlockIO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as IO{Read|Write}BandwidthMax for %s",
842 b->rbps, b->wbps, b->path);
843
844 if (!cgroup_apply_io_device_limit(u, b->path, limits))
845 cgroup_context_free_blockio_device_bandwidth(c, b);
846 }
847 }
848 }
849
850 if (apply_mask & CGROUP_MASK_BLKIO) {
851 bool has_io = cgroup_context_has_io_config(c);
852 bool has_blockio = cgroup_context_has_blockio_config(c);
853
854 if (!is_root) {
855 char buf[DECIMAL_STR_MAX(uint64_t)+1];
856 uint64_t weight;
857
858 if (has_io) {
859 uint64_t io_weight = cgroup_context_io_weight(c, state);
860
861 weight = cgroup_weight_io_to_blkio(io_weight);
862
863 log_cgroup_compat(u, "Applying [Startup]IOWeight %" PRIu64 " as [Startup]BlockIOWeight %" PRIu64,
864 io_weight, weight);
865 } else if (has_blockio)
866 weight = cgroup_context_blkio_weight(c, state);
867 else
868 weight = CGROUP_BLKIO_WEIGHT_DEFAULT;
869
870 xsprintf(buf, "%" PRIu64 "\n", weight);
871 r = cg_set_attribute("blkio", path, "blkio.weight", buf);
872 if (r < 0)
873 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
874 "Failed to set blkio.weight: %m");
875
876 if (has_io) {
877 CGroupIODeviceWeight *w;
878
879 /* FIXME: no way to reset this list */
880 LIST_FOREACH(device_weights, w, c->io_device_weights) {
881 weight = cgroup_weight_io_to_blkio(w->weight);
882
883 log_cgroup_compat(u, "Applying IODeviceWeight %" PRIu64 " as BlockIODeviceWeight %" PRIu64 " for %s",
884 w->weight, weight, w->path);
885
886 cgroup_apply_blkio_device_weight(u, w->path, weight);
887 }
888 } else if (has_blockio) {
889 CGroupBlockIODeviceWeight *w;
890
891 /* FIXME: no way to reset this list */
892 LIST_FOREACH(device_weights, w, c->blockio_device_weights)
893 cgroup_apply_blkio_device_weight(u, w->path, w->weight);
894 }
895 }
896
897 /* Apply limits and free ones without config. */
898 if (has_io) {
899 CGroupIODeviceLimit *l, *next;
900
901 LIST_FOREACH_SAFE(device_limits, l, next, c->io_device_limits) {
902 log_cgroup_compat(u, "Applying IO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as BlockIO{Read|Write}BandwidthMax for %s",
903 l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX], l->path);
904
905 if (!cgroup_apply_blkio_device_limit(u, l->path, l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX]))
906 cgroup_context_free_io_device_limit(c, l);
907 }
908 } else if (has_blockio) {
909 CGroupBlockIODeviceBandwidth *b, *next;
910
911 LIST_FOREACH_SAFE(device_bandwidths, b, next, c->blockio_device_bandwidths)
912 if (!cgroup_apply_blkio_device_limit(u, b->path, b->rbps, b->wbps))
913 cgroup_context_free_blockio_device_bandwidth(c, b);
914 }
915 }
916
917 if ((apply_mask & CGROUP_MASK_MEMORY) && !is_root) {
918 if (cg_all_unified() > 0) {
919 uint64_t max, swap_max = CGROUP_LIMIT_MAX;
920
921 if (cgroup_context_has_unified_memory_config(c)) {
922 max = c->memory_max;
923 swap_max = c->memory_swap_max;
924 } else {
925 max = c->memory_limit;
926
927 if (max != CGROUP_LIMIT_MAX)
928 log_cgroup_compat(u, "Applying MemoryLimit %" PRIu64 " as MemoryMax", max);
929 }
930
931 cgroup_apply_unified_memory_limit(u, "memory.low", c->memory_low);
932 cgroup_apply_unified_memory_limit(u, "memory.high", c->memory_high);
933 cgroup_apply_unified_memory_limit(u, "memory.max", max);
934 cgroup_apply_unified_memory_limit(u, "memory.swap.max", swap_max);
935 } else {
936 char buf[DECIMAL_STR_MAX(uint64_t) + 1];
937 uint64_t val;
938
939 if (cgroup_context_has_unified_memory_config(c)) {
940 val = c->memory_max;
941 log_cgroup_compat(u, "Applying MemoryMax %" PRIi64 " as MemoryLimit", val);
942 } else
943 val = c->memory_limit;
944
945 if (val == CGROUP_LIMIT_MAX)
946 strncpy(buf, "-1\n", sizeof(buf));
947 else
948 xsprintf(buf, "%" PRIu64 "\n", val);
949
950 r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
951 if (r < 0)
952 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
953 "Failed to set memory.limit_in_bytes: %m");
954 }
955 }
956
957 if ((apply_mask & CGROUP_MASK_DEVICES) && !is_root) {
958 CGroupDeviceAllow *a;
959
960 /* Changing the devices list of a populated cgroup
961 * might result in EINVAL, hence ignore EINVAL
962 * here. */
963
964 if (c->device_allow || c->device_policy != CGROUP_AUTO)
965 r = cg_set_attribute("devices", path, "devices.deny", "a");
966 else
967 r = cg_set_attribute("devices", path, "devices.allow", "a");
968 if (r < 0)
969 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
970 "Failed to reset devices.list: %m");
971
972 if (c->device_policy == CGROUP_CLOSED ||
973 (c->device_policy == CGROUP_AUTO && c->device_allow)) {
974 static const char auto_devices[] =
975 "/dev/null\0" "rwm\0"
976 "/dev/zero\0" "rwm\0"
977 "/dev/full\0" "rwm\0"
978 "/dev/random\0" "rwm\0"
979 "/dev/urandom\0" "rwm\0"
980 "/dev/tty\0" "rwm\0"
981 "/dev/ptmx\0" "rwm\0"
982 /* Allow /run/systemd/inaccessible/{chr,blk} devices for mapping InaccessiblePaths */
983 "-/run/systemd/inaccessible/chr\0" "rwm\0"
984 "-/run/systemd/inaccessible/blk\0" "rwm\0";
985
986 const char *x, *y;
987
988 NULSTR_FOREACH_PAIR(x, y, auto_devices)
989 whitelist_device(path, x, y);
990
991 /* PTS (/dev/pts) devices may not be duplicated, but accessed */
992 whitelist_major(path, "pts", 'c', "rw");
993 }
994
995 LIST_FOREACH(device_allow, a, c->device_allow) {
996 char acc[4], *val;
997 unsigned k = 0;
998
999 if (a->r)
1000 acc[k++] = 'r';
1001 if (a->w)
1002 acc[k++] = 'w';
1003 if (a->m)
1004 acc[k++] = 'm';
1005
1006 if (k == 0)
1007 continue;
1008
1009 acc[k++] = 0;
1010
1011 if (path_startswith(a->path, "/dev/"))
1012 whitelist_device(path, a->path, acc);
1013 else if ((val = startswith(a->path, "block-")))
1014 whitelist_major(path, val, 'b', acc);
1015 else if ((val = startswith(a->path, "char-")))
1016 whitelist_major(path, val, 'c', acc);
1017 else
1018 log_unit_debug(u, "Ignoring device %s while writing cgroup attribute.", a->path);
1019 }
1020 }
1021
1022 if ((apply_mask & CGROUP_MASK_PIDS) && !is_root) {
1023
1024 if (c->tasks_max != CGROUP_LIMIT_MAX) {
1025 char buf[DECIMAL_STR_MAX(uint64_t) + 2];
1026
1027 sprintf(buf, "%" PRIu64 "\n", c->tasks_max);
1028 r = cg_set_attribute("pids", path, "pids.max", buf);
1029 } else
1030 r = cg_set_attribute("pids", path, "pids.max", "max");
1031
1032 if (r < 0)
1033 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
1034 "Failed to set pids.max: %m");
1035 }
1036
1037 if (apply_bpf)
1038 cgroup_apply_firewall(u);
1039 }
1040
1041 CGroupMask cgroup_context_get_mask(CGroupContext *c) {
1042 CGroupMask mask = 0;
1043
1044 /* Figure out which controllers we need */
1045
1046 if (c->cpu_accounting ||
1047 cgroup_context_has_cpu_weight(c) ||
1048 cgroup_context_has_cpu_shares(c) ||
1049 c->cpu_quota_per_sec_usec != USEC_INFINITY)
1050 mask |= CGROUP_MASK_CPUACCT | CGROUP_MASK_CPU;
1051
1052 if (cgroup_context_has_io_config(c) || cgroup_context_has_blockio_config(c))
1053 mask |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
1054
1055 if (c->memory_accounting ||
1056 c->memory_limit != CGROUP_LIMIT_MAX ||
1057 cgroup_context_has_unified_memory_config(c))
1058 mask |= CGROUP_MASK_MEMORY;
1059
1060 if (c->device_allow ||
1061 c->device_policy != CGROUP_AUTO)
1062 mask |= CGROUP_MASK_DEVICES;
1063
1064 if (c->tasks_accounting ||
1065 c->tasks_max != (uint64_t) -1)
1066 mask |= CGROUP_MASK_PIDS;
1067
1068 return mask;
1069 }
1070
1071 CGroupMask unit_get_own_mask(Unit *u) {
1072 CGroupContext *c;
1073
1074 /* Returns the mask of controllers the unit needs for itself */
1075
1076 c = unit_get_cgroup_context(u);
1077 if (!c)
1078 return 0;
1079
1080 return cgroup_context_get_mask(c) | unit_get_delegate_mask(u);
1081 }
1082
1083 CGroupMask unit_get_delegate_mask(Unit *u) {
1084 CGroupContext *c;
1085
1086 /* If delegation is turned on, then turn on selected controllers, unless we are on the legacy hierarchy and the
1087 * process we fork into is known to drop privileges, and hence shouldn't get access to the controllers.
1088 *
1089 * Note that on the unified hierarchy it is safe to delegate controllers to unprivileged services. */
1090
1091 if (u->type == UNIT_SLICE)
1092 return 0;
1093
1094 c = unit_get_cgroup_context(u);
1095 if (!c)
1096 return 0;
1097
1098 if (!c->delegate)
1099 return 0;
1100
1101 if (cg_all_unified() <= 0) {
1102 ExecContext *e;
1103
1104 e = unit_get_exec_context(u);
1105 if (e && !exec_context_maintains_privileges(e))
1106 return 0;
1107 }
1108
1109 return c->delegate_controllers;
1110 }
1111
1112 CGroupMask unit_get_members_mask(Unit *u) {
1113 assert(u);
1114
1115 /* Returns the mask of controllers all of the unit's children require, merged */
1116
1117 if (u->cgroup_members_mask_valid)
1118 return u->cgroup_members_mask;
1119
1120 u->cgroup_members_mask = 0;
1121
1122 if (u->type == UNIT_SLICE) {
1123 void *v;
1124 Unit *member;
1125 Iterator i;
1126
1127 HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) {
1128
1129 if (member == u)
1130 continue;
1131
1132 if (UNIT_DEREF(member->slice) != u)
1133 continue;
1134
1135 u->cgroup_members_mask |= unit_get_subtree_mask(member); /* note that this calls ourselves again, for the children */
1136 }
1137 }
1138
1139 u->cgroup_members_mask_valid = true;
1140 return u->cgroup_members_mask;
1141 }
1142
1143 CGroupMask unit_get_siblings_mask(Unit *u) {
1144 assert(u);
1145
1146 /* Returns the mask of controllers all of the unit's siblings
1147 * require, i.e. the members mask of the unit's parent slice
1148 * if there is one. */
1149
1150 if (UNIT_ISSET(u->slice))
1151 return unit_get_members_mask(UNIT_DEREF(u->slice));
1152
1153 return unit_get_subtree_mask(u); /* we are the top-level slice */
1154 }
1155
1156 CGroupMask unit_get_subtree_mask(Unit *u) {
1157
1158 /* Returns the mask of this subtree, meaning of the group
1159 * itself and its children. */
1160
1161 return unit_get_own_mask(u) | unit_get_members_mask(u);
1162 }
1163
1164 CGroupMask unit_get_target_mask(Unit *u) {
1165 CGroupMask mask;
1166
1167 /* This returns the cgroup mask of all controllers to enable
1168 * for a specific cgroup, i.e. everything it needs itself,
1169 * plus all that its children need, plus all that its siblings
1170 * need. This is primarily useful on the legacy cgroup
1171 * hierarchy, where we need to duplicate each cgroup in each
1172 * hierarchy that shall be enabled for it. */
1173
1174 mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
1175 mask &= u->manager->cgroup_supported;
1176
1177 return mask;
1178 }
1179
1180 CGroupMask unit_get_enable_mask(Unit *u) {
1181 CGroupMask mask;
1182
1183 /* This returns the cgroup mask of all controllers to enable
1184 * for the children of a specific cgroup. This is primarily
1185 * useful for the unified cgroup hierarchy, where each cgroup
1186 * controls which controllers are enabled for its children. */
1187
1188 mask = unit_get_members_mask(u);
1189 mask &= u->manager->cgroup_supported;
1190
1191 return mask;
1192 }
1193
1194 bool unit_get_needs_bpf(Unit *u) {
1195 CGroupContext *c;
1196 Unit *p;
1197 assert(u);
1198
1199 /* We never attach BPF to slice units, as they are inner cgroup nodes and cgroup/BPF is not recursive at the
1200 * moment. */
1201 if (u->type == UNIT_SLICE)
1202 return false;
1203
1204 c = unit_get_cgroup_context(u);
1205 if (!c)
1206 return false;
1207
1208 if (c->ip_accounting ||
1209 c->ip_address_allow ||
1210 c->ip_address_deny)
1211 return true;
1212
1213 /* If any parent slice has an IP access list defined, it applies too */
1214 for (p = UNIT_DEREF(u->slice); p; p = UNIT_DEREF(p->slice)) {
1215 c = unit_get_cgroup_context(p);
1216 if (!c)
1217 return false;
1218
1219 if (c->ip_address_allow ||
1220 c->ip_address_deny)
1221 return true;
1222 }
1223
1224 return false;
1225 }
1226
1227 /* Recurse from a unit up through its containing slices, propagating
1228 * mask bits upward. A unit is also member of itself. */
1229 void unit_update_cgroup_members_masks(Unit *u) {
1230 CGroupMask m;
1231 bool more;
1232
1233 assert(u);
1234
1235 /* Calculate subtree mask */
1236 m = unit_get_subtree_mask(u);
1237
1238 /* See if anything changed from the previous invocation. If
1239 * not, we're done. */
1240 if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
1241 return;
1242
1243 more =
1244 u->cgroup_subtree_mask_valid &&
1245 ((m & ~u->cgroup_subtree_mask) != 0) &&
1246 ((~m & u->cgroup_subtree_mask) == 0);
1247
1248 u->cgroup_subtree_mask = m;
1249 u->cgroup_subtree_mask_valid = true;
1250
1251 if (UNIT_ISSET(u->slice)) {
1252 Unit *s = UNIT_DEREF(u->slice);
1253
1254 if (more)
1255 /* There's more set now than before. We
1256 * propagate the new mask to the parent's mask
1257 * (not caring if it actually was valid or
1258 * not). */
1259
1260 s->cgroup_members_mask |= m;
1261
1262 else
1263 /* There's less set now than before (or we
1264 * don't know), we need to recalculate
1265 * everything, so let's invalidate the
1266 * parent's members mask */
1267
1268 s->cgroup_members_mask_valid = false;
1269
1270 /* And now make sure that this change also hits our
1271 * grandparents */
1272 unit_update_cgroup_members_masks(s);
1273 }
1274 }
1275
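/* Migration callback: walk up the slice tree and return the closest cgroup path that is realized
 * for all controllers in the given mask. */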
1276 static const char *migrate_callback(CGroupMask mask, void *userdata) {
1277 Unit *u = userdata;
1278
1279 assert(mask != 0);
1280 assert(u);
1281
1282 while (u) {
1283 if (u->cgroup_path &&
1284 u->cgroup_realized &&
1285 (u->cgroup_realized_mask & mask) == mask)
1286 return u->cgroup_path;
1287
1288 u = UNIT_DEREF(u->slice);
1289 }
1290
1291 return NULL;
1292 }
1293
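/* Compute the default cgroup path of a unit below the manager's cgroup root, escaping the unit name
 * and prepending the parent slice path, if any. */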
1294 char *unit_default_cgroup_path(Unit *u) {
1295 _cleanup_free_ char *escaped = NULL, *slice = NULL;
1296 int r;
1297
1298 assert(u);
1299
1300 if (unit_has_name(u, SPECIAL_ROOT_SLICE))
1301 return strdup(u->manager->cgroup_root);
1302
1303 if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) {
1304 r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice);
1305 if (r < 0)
1306 return NULL;
1307 }
1308
1309 escaped = cg_escape(u->id);
1310 if (!escaped)
1311 return NULL;
1312
1313 if (slice)
1314 return strjoin(u->manager->cgroup_root, "/", slice, "/",
1315 escaped);
1316 else
1317 return strjoin(u->manager->cgroup_root, "/", escaped);
1318 }
1319
1320 int unit_set_cgroup_path(Unit *u, const char *path) {
1321 _cleanup_free_ char *p = NULL;
1322 int r;
1323
1324 assert(u);
1325
1326 if (path) {
1327 p = strdup(path);
1328 if (!p)
1329 return -ENOMEM;
1330 } else
1331 p = NULL;
1332
1333 if (streq_ptr(u->cgroup_path, p))
1334 return 0;
1335
1336 if (p) {
1337 r = hashmap_put(u->manager->cgroup_unit, p, u);
1338 if (r < 0)
1339 return r;
1340 }
1341
1342 unit_release_cgroup(u);
1343
1344 u->cgroup_path = p;
1345 p = NULL;
1346
1347 return 1;
1348 }
1349
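/* On the unified hierarchy, add an inotify watch on the unit's cgroup.events file so that we are
 * notified when the cgroup becomes empty. */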
1350 int unit_watch_cgroup(Unit *u) {
1351 _cleanup_free_ char *events = NULL;
1352 int r;
1353
1354 assert(u);
1355
1356 if (!u->cgroup_path)
1357 return 0;
1358
1359 if (u->cgroup_inotify_wd >= 0)
1360 return 0;
1361
1362 /* Only applies to the unified hierarchy */
1363 r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
1364 if (r < 0)
1365 return log_error_errno(r, "Failed to determine whether the name=systemd hierarchy is unified: %m");
1366 if (r == 0)
1367 return 0;
1368
1369 /* Don't watch the root slice, it's pointless. */
1370 if (unit_has_name(u, SPECIAL_ROOT_SLICE))
1371 return 0;
1372
1373 r = hashmap_ensure_allocated(&u->manager->cgroup_inotify_wd_unit, &trivial_hash_ops);
1374 if (r < 0)
1375 return log_oom();
1376
1377 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events", &events);
1378 if (r < 0)
1379 return log_oom();
1380
1381 u->cgroup_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY);
1382 if (u->cgroup_inotify_wd < 0) {
1383
1384 if (errno == ENOENT) /* If the directory is already
1385 * gone we don't need to track
1386 * it, so this is not an error */
1387 return 0;
1388
1389 return log_unit_error_errno(u, errno, "Failed to add inotify watch descriptor for control group %s: %m", u->cgroup_path);
1390 }
1391
1392 r = hashmap_put(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd), u);
1393 if (r < 0)
1394 return log_unit_error_errno(u, r, "Failed to add inotify watch descriptor to hash map: %m");
1395
1396 return 0;
1397 }
1398
1399 int unit_pick_cgroup_path(Unit *u) {
1400 _cleanup_free_ char *path = NULL;
1401 int r;
1402
1403 assert(u);
1404
1405 if (u->cgroup_path)
1406 return 0;
1407
1408 if (!UNIT_HAS_CGROUP_CONTEXT(u))
1409 return -EINVAL;
1410
1411 path = unit_default_cgroup_path(u);
1412 if (!path)
1413 return log_oom();
1414
1415 r = unit_set_cgroup_path(u, path);
1416 if (r == -EEXIST)
1417 return log_unit_error_errno(u, r, "Control group %s exists already.", path);
1418 if (r < 0)
1419 return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", path);
1420
1421 return 0;
1422 }
1423
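/* Create the unit's cgroup in all hierarchies, enable the requested controllers, and migrate our
 * processes into it, unless the unit is a slice or uses delegation, in which case subgroups may own the processes. */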
1424 static int unit_create_cgroup(
1425 Unit *u,
1426 CGroupMask target_mask,
1427 CGroupMask enable_mask,
1428 bool needs_bpf) {
1429
1430 CGroupContext *c;
1431 int r;
1432
1433 assert(u);
1434
1435 c = unit_get_cgroup_context(u);
1436 if (!c)
1437 return 0;
1438
1439 /* Figure out our cgroup path */
1440 r = unit_pick_cgroup_path(u);
1441 if (r < 0)
1442 return r;
1443
1444 /* First, create our own group */
1445 r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
1446 if (r < 0)
1447 return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", u->cgroup_path);
1448
1449 /* Start watching it */
1450 (void) unit_watch_cgroup(u);
1451
1452 /* Enable all controllers we need */
1453 r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path);
1454 if (r < 0)
1455 log_unit_warning_errno(u, r, "Failed to enable controllers on cgroup %s, ignoring: %m", u->cgroup_path);
1456
1457 /* Keep track that this is now realized */
1458 u->cgroup_realized = true;
1459 u->cgroup_realized_mask = target_mask;
1460 u->cgroup_enabled_mask = enable_mask;
1461 u->cgroup_bpf_state = needs_bpf ? UNIT_CGROUP_BPF_ON : UNIT_CGROUP_BPF_OFF;
1462
1463 if (u->type != UNIT_SLICE && !c->delegate) {
1464
1465 /* Then, possibly move things over, but not if
1466 * subgroups may contain processes, which is the case
1467 * for slice and delegation units. */
1468 r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
1469 if (r < 0)
1470 log_unit_warning_errno(u, r, "Failed to migrate cgroup from to %s, ignoring: %m", u->cgroup_path);
1471 }
1472
1473 return 0;
1474 }
1475
1476 int unit_attach_pids_to_cgroup(Unit *u) {
1477 int r;
1478 assert(u);
1479
1480 r = unit_realize_cgroup(u);
1481 if (r < 0)
1482 return r;
1483
1484 r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->pids, migrate_callback, u);
1485 if (r < 0)
1486 return r;
1487
1488 return 0;
1489 }
1490
1491 static void cgroup_xattr_apply(Unit *u) {
1492 char ids[SD_ID128_STRING_MAX];
1493 int r;
1494
1495 assert(u);
1496
1497 if (!MANAGER_IS_SYSTEM(u->manager))
1498 return;
1499
1500 if (sd_id128_is_null(u->invocation_id))
1501 return;
1502
1503 r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path,
1504 "trusted.invocation_id",
1505 sd_id128_to_string(u->invocation_id, ids), 32,
1506 0);
1507 if (r < 0)
1508 log_unit_debug_errno(u, r, "Failed to set invocation ID on control group %s, ignoring: %m", u->cgroup_path);
1509 }
1510
1511 static bool unit_has_mask_realized(
1512 Unit *u,
1513 CGroupMask target_mask,
1514 CGroupMask enable_mask,
1515 bool needs_bpf) {
1516
1517 assert(u);
1518
1519 return u->cgroup_realized &&
1520 u->cgroup_realized_mask == target_mask &&
1521 u->cgroup_enabled_mask == enable_mask &&
1522 ((needs_bpf && u->cgroup_bpf_state == UNIT_CGROUP_BPF_ON) ||
1523 (!needs_bpf && u->cgroup_bpf_state == UNIT_CGROUP_BPF_OFF));
1524 }
1525
1526 static void unit_add_to_cgroup_realize_queue(Unit *u) {
1527 assert(u);
1528
1529 if (u->in_cgroup_realize_queue)
1530 return;
1531
1532 LIST_PREPEND(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
1533 u->in_cgroup_realize_queue = true;
1534 }
1535
1536 static void unit_remove_from_cgroup_realize_queue(Unit *u) {
1537 assert(u);
1538
1539 if (!u->in_cgroup_realize_queue)
1540 return;
1541
1542 LIST_REMOVE(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
1543 u->in_cgroup_realize_queue = false;
1544 }
1545
1546
1547 /* Check if necessary controllers and attributes for a unit are in place.
1548 *
1549 * If so, do nothing.
1550 * If not, create paths, move processes over, and set attributes.
1551 *
1552 * Returns 0 on success and < 0 on failure. */
1553 static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
1554 CGroupMask target_mask, enable_mask;
1555 bool needs_bpf, apply_bpf;
1556 int r;
1557
1558 assert(u);
1559
1560 unit_remove_from_cgroup_realize_queue(u);
1561
1562 target_mask = unit_get_target_mask(u);
1563 enable_mask = unit_get_enable_mask(u);
1564 needs_bpf = unit_get_needs_bpf(u);
1565
1566 if (unit_has_mask_realized(u, target_mask, enable_mask, needs_bpf))
1567 return 0;
1568
1569 /* Make sure we apply the BPF filters either when one is configured, or if none is configured but previously
1570 * the state was anything but off. This way, if a unit with a BPF filter applied is reconfigured to lose it
1571 * this will trickle down properly to cgroupfs. */
1572 apply_bpf = needs_bpf || u->cgroup_bpf_state != UNIT_CGROUP_BPF_OFF;
1573
1574 /* First, realize parents */
1575 if (UNIT_ISSET(u->slice)) {
1576 r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
1577 if (r < 0)
1578 return r;
1579 }
1580
1581 /* And then do the real work */
1582 r = unit_create_cgroup(u, target_mask, enable_mask, needs_bpf);
1583 if (r < 0)
1584 return r;
1585
1586 /* Finally, apply the necessary attributes. */
1587 cgroup_context_apply(u, target_mask, apply_bpf, state);
1588 cgroup_xattr_apply(u);
1589
1590 return 0;
1591 }
1592
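/* Drain the cgroup realize queue: realize the cgroup of every queued unit that is still active,
 * and return the number of units for which realization was attempted. */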
1593 unsigned manager_dispatch_cgroup_realize_queue(Manager *m) {
1594 ManagerState state;
1595 unsigned n = 0;
1596 Unit *i;
1597 int r;
1598
1599 assert(m);
1600
1601 state = manager_state(m);
1602
1603 while ((i = m->cgroup_realize_queue)) {
1604 assert(i->in_cgroup_realize_queue);
1605
1606 if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(i))) {
1607 /* Maybe things changed, and the unit is not actually active anymore? */
1608 unit_remove_from_cgroup_realize_queue(i);
1609 continue;
1610 }
1611
1612 r = unit_realize_cgroup_now(i, state);
1613 if (r < 0)
1614 log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);
1615
1616 n++;
1617 }
1618
1619 return n;
1620 }
1621
1622 static void unit_add_siblings_to_cgroup_realize_queue(Unit *u) {
1623 Unit *slice;
1624
1625 /* This adds the siblings of the specified unit and the
1626 * siblings of all parent units to the cgroup queue. (But
1627 * neither the specified unit itself nor the parents.) */
1628
1629 while ((slice = UNIT_DEREF(u->slice))) {
1630 Iterator i;
1631 Unit *m;
1632 void *v;
1633
1634 HASHMAP_FOREACH_KEY(v, m, u->dependencies[UNIT_BEFORE], i) {
1635 if (m == u)
1636 continue;
1637
1638 /* Skip units that have a dependency on the slice
1639 * but aren't actually in it. */
1640 if (UNIT_DEREF(m->slice) != slice)
1641 continue;
1642
1643 /* No point in doing cgroup application for units
1644 * without active processes. */
1645 if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
1646 continue;
1647
1648 /* If the unit doesn't need any new controllers
1649 * and has current ones realized, it doesn't need
1650 * any changes. */
1651 if (unit_has_mask_realized(m,
1652 unit_get_target_mask(m),
1653 unit_get_enable_mask(m),
1654 unit_get_needs_bpf(m)))
1655 continue;
1656
1657 unit_add_to_cgroup_realize_queue(m);
1658 }
1659
1660 u = slice;
1661 }
1662 }
1663
1664 int unit_realize_cgroup(Unit *u) {
1665 assert(u);
1666
1667 if (!UNIT_HAS_CGROUP_CONTEXT(u))
1668 return 0;
1669
1670 /* So, here's the deal: when realizing the cgroups for this
1671 * unit, we need to first create all parents, but there's more
1672 * actually: for the weight-based controllers we also need to
1673 * make sure that all our siblings (i.e. units that are in the
1674 * same slice as we are) have cgroups, too. Otherwise, things
1675 * would become very uneven as each of their processes would
1676 * get as much resources as all our group together. This call
1677 * will synchronously create the parent cgroups, but will
1678 * defer work on the siblings to the next event loop
1679 * iteration. */
1680
1681 /* Add all sibling slices to the cgroup queue. */
1682 unit_add_siblings_to_cgroup_realize_queue(u);
1683
1684 /* And realize this one now (and apply the values) */
1685 return unit_realize_cgroup_now(u, manager_state(u->manager));
1686 }
1687
1688 void unit_release_cgroup(Unit *u) {
1689 assert(u);
1690
1691 /* Forgets all cgroup details for this cgroup */
1692
1693 if (u->cgroup_path) {
1694 (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
1695 u->cgroup_path = mfree(u->cgroup_path);
1696 }
1697
1698 if (u->cgroup_inotify_wd >= 0) {
1699 if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_inotify_wd) < 0)
1700 log_unit_debug_errno(u, errno, "Failed to remove cgroup inotify watch %i for %s, ignoring", u->cgroup_inotify_wd, u->id);
1701
1702 (void) hashmap_remove(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd));
1703 u->cgroup_inotify_wd = -1;
1704 }
1705 }
1706
1707 void unit_prune_cgroup(Unit *u) {
1708 int r;
1709 bool is_root_slice;
1710
1711 assert(u);
1712
1713 /* Removes the cgroup, if empty and possible, and stops watching it. */
1714
1715 if (!u->cgroup_path)
1716 return;
1717
1718 (void) unit_get_cpu_usage(u, NULL); /* Cache the last CPU usage value before we destroy the cgroup */
1719
1720 is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);
1721
1722 r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
1723 if (r < 0) {
1724 log_unit_debug_errno(u, r, "Failed to destroy cgroup %s, ignoring: %m", u->cgroup_path);
1725 return;
1726 }
1727
1728 if (is_root_slice)
1729 return;
1730
1731 unit_release_cgroup(u);
1732
1733 u->cgroup_realized = false;
1734 u->cgroup_realized_mask = 0;
1735 u->cgroup_enabled_mask = 0;
1736 }
1737
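/* Try to determine the main PID of a unit by scanning its cgroup for processes whose parent is the
 * manager; fails with -ENODATA if there is more than one candidate. */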
1738 int unit_search_main_pid(Unit *u, pid_t *ret) {
1739 _cleanup_fclose_ FILE *f = NULL;
1740 pid_t pid = 0, npid, mypid;
1741 int r;
1742
1743 assert(u);
1744 assert(ret);
1745
1746 if (!u->cgroup_path)
1747 return -ENXIO;
1748
1749 r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f);
1750 if (r < 0)
1751 return r;
1752
1753 mypid = getpid_cached();
1754 while (cg_read_pid(f, &npid) > 0) {
1755 pid_t ppid;
1756
1757 if (npid == pid)
1758 continue;
1759
1760 /* Ignore processes that aren't our kids */
1761 if (get_process_ppid(npid, &ppid) >= 0 && ppid != mypid)
1762 continue;
1763
1764 if (pid != 0)
1765 /* Dang, there's more than one daemonized PID
1766 in this group, so we don't know what process
1767 is the main process. */
1768
1769 return -ENODATA;
1770
1771 pid = npid;
1772 }
1773
1774 *ret = pid;
1775 return 0;
1776 }
1777
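/* Recursively add all PIDs found in the given cgroup path and its subgroups to the set of PIDs
 * the unit watches. */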
1778 static int unit_watch_pids_in_path(Unit *u, const char *path) {
1779 _cleanup_closedir_ DIR *d = NULL;
1780 _cleanup_fclose_ FILE *f = NULL;
1781 int ret = 0, r;
1782
1783 assert(u);
1784 assert(path);
1785
1786 r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
1787 if (r < 0)
1788 ret = r;
1789 else {
1790 pid_t pid;
1791
1792 while ((r = cg_read_pid(f, &pid)) > 0) {
1793 r = unit_watch_pid(u, pid);
1794 if (r < 0 && ret >= 0)
1795 ret = r;
1796 }
1797
1798 if (r < 0 && ret >= 0)
1799 ret = r;
1800 }
1801
1802 r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
1803 if (r < 0) {
1804 if (ret >= 0)
1805 ret = r;
1806 } else {
1807 char *fn;
1808
1809 while ((r = cg_read_subgroup(d, &fn)) > 0) {
1810 _cleanup_free_ char *p = NULL;
1811
1812 p = strjoin(path, "/", fn);
1813 free(fn);
1814
1815 if (!p)
1816 return -ENOMEM;
1817
1818 r = unit_watch_pids_in_path(u, p);
1819 if (r < 0 && ret >= 0)
1820 ret = r;
1821 }
1822
1823 if (r < 0 && ret >= 0)
1824 ret = r;
1825 }
1826
1827 return ret;
1828 }
1829
1830 int unit_synthesize_cgroup_empty_event(Unit *u) {
1831 int r;
1832
1833 assert(u);
1834
1835 /* Enqueue a synthetic cgroup empty event if this unit doesn't watch any PIDs anymore. This is compatibility
1836 * support for non-unified systems where notifications aren't reliable, and hence we need to take whatever
1837 * we can get as a notification source as soon as we stop having any useful PIDs to watch for. */
1838
1839 if (!u->cgroup_path)
1840 return -ENOENT;
1841
1842 r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
1843 if (r < 0)
1844 return r;
1845 if (r > 0) /* On unified we have reliable notifications, and don't need this */
1846 return 0;
1847
1848 if (!set_isempty(u->pids))
1849 return 0;
1850
1851 unit_add_to_cgroup_empty_queue(u);
1852 return 0;
1853 }
1854
1855 int unit_watch_all_pids(Unit *u) {
1856 int r;
1857
1858 assert(u);
1859
1860 /* Adds all PIDs from our cgroup to the set of PIDs we
1861 * watch. This is a fallback logic for cases where we do not
1862 * get reliable cgroup empty notifications: we try to use
1863 * SIGCHLD as replacement. */
1864
1865 if (!u->cgroup_path)
1866 return -ENOENT;
1867
1868 r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
1869 if (r < 0)
1870 return r;
1871 if (r > 0) /* On unified we can use proper notifications */
1872 return 0;
1873
1874 return unit_watch_pids_in_path(u, u->cgroup_path);
1875 }
1876
1877 static int on_cgroup_empty_event(sd_event_source *s, void *userdata) {
1878 Manager *m = userdata;
1879 Unit *u;
1880 int r;
1881
1882 assert(s);
1883 assert(m);
1884
1885 u = m->cgroup_empty_queue;
1886 if (!u)
1887 return 0;
1888
1889 assert(u->in_cgroup_empty_queue);
1890 u->in_cgroup_empty_queue = false;
1891 LIST_REMOVE(cgroup_empty_queue, m->cgroup_empty_queue, u);
1892
1893 if (m->cgroup_empty_queue) {
1894 /* More stuff queued, let's make sure we remain enabled */
1895 r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
1896 if (r < 0)
1897 log_debug_errno(r, "Failed to reenable cgroup empty event source: %m");
1898 }
1899
1900 unit_add_to_gc_queue(u);
1901
1902 if (UNIT_VTABLE(u)->notify_cgroup_empty)
1903 UNIT_VTABLE(u)->notify_cgroup_empty(u);
1904
1905 return 0;
1906 }
1907
1908 void unit_add_to_cgroup_empty_queue(Unit *u) {
1909 int r;
1910
1911 assert(u);
1912
1913 /* Note that there are four different ways cgroup empty events can reach us:
1914 *
1915 * 1. On the unified hierarchy we get an inotify event on the cgroup
1916 *
1917 * 2. On the legacy hierarchy, when running in system mode, we get a datagram on the cgroup agent socket
1918 *
1919 * 3. On the legacy hierarchy, when running in user mode, we get a D-Bus signal on the system bus
1920 *
1921 * 4. On the legacy hierarchy, in service units we start watching all processes of the cgroup for SIGCHLD as
1922 * soon as we get one SIGCHLD, to deal with unreliable cgroup notifications.
1923 *
1924 * Regardless of which way we got the notification, we'll verify it here, and then add the unit to a separate
1925 * queue. This queue will be dispatched at a lower priority than the SIGCHLD handler, so that we always use
1926 * SIGCHLD if we can get it first, and only use the cgroup empty notifications if there's no SIGCHLD pending
1927 * (which might happen if the cgroup doesn't contain processes that are our own child, which is typically the
1928 * case for scope units). */
1929
1930 if (u->in_cgroup_empty_queue)
1931 return;
1932
1933 /* Let's verify that the cgroup is really empty */
1934 if (!u->cgroup_path)
1935 return;
1936 r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
1937 if (r < 0) {
1938 log_unit_debug_errno(u, r, "Failed to determine whether cgroup %s is empty: %m", u->cgroup_path);
1939 return;
1940 }
1941 if (r == 0)
1942 return;
1943
1944 LIST_PREPEND(cgroup_empty_queue, u->manager->cgroup_empty_queue, u);
1945 u->in_cgroup_empty_queue = true;
1946
1947 /* Trigger the defer event */
1948 r = sd_event_source_set_enabled(u->manager->cgroup_empty_event_source, SD_EVENT_ONESHOT);
1949 if (r < 0)
1950 log_debug_errno(r, "Failed to enable cgroup empty event source: %m");
1951 }
1952
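/* IO callback for the cgroup inotify fd used on the unified hierarchy: drain all pending inotify events,
 * map each watch descriptor back to its unit via the cgroup_inotify_wd_unit hashmap, and queue a cgroup
 * empty check for that unit. */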
1953 static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
1954 Manager *m = userdata;
1955
1956 assert(s);
1957 assert(fd >= 0);
1958 assert(m);
1959
1960 for (;;) {
1961 union inotify_event_buffer buffer;
1962 struct inotify_event *e;
1963 ssize_t l;
1964
1965 l = read(fd, &buffer, sizeof(buffer));
1966 if (l < 0) {
1967 if (IN_SET(errno, EINTR, EAGAIN))
1968 return 0;
1969
1970 return log_error_errno(errno, "Failed to read control group inotify events: %m");
1971 }
1972
1973 FOREACH_INOTIFY_EVENT(e, buffer, l) {
1974 Unit *u;
1975
1976 if (e->wd < 0)
1977 /* Queue overflow has no watch descriptor */
1978 continue;
1979
1980 if (e->mask & IN_IGNORED)
1981 /* The watch was just removed */
1982 continue;
1983
1984 u = hashmap_get(m->cgroup_inotify_wd_unit, INT_TO_PTR(e->wd));
1985 if (!u) /* Note that inotify might deliver
1986 * events for a watch even after it
1987 * was removed, because it was queued
1988 * before the removal. Let's ignore
1989 * this here safely. */
1990 continue;
1991
1992 unit_add_to_cgroup_empty_queue(u);
1993 }
1994 }
1995 }
1996
1997 int manager_setup_cgroup(Manager *m) {
1998 _cleanup_free_ char *path = NULL;
1999 const char *scope_path;
2000 CGroupController c;
2001 int r, all_unified;
2002 char *e;
2003
2004 assert(m);
2005
2006 /* 1. Determine hierarchy */
2007 m->cgroup_root = mfree(m->cgroup_root);
2008 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
2009 if (r < 0)
2010 return log_error_errno(r, "Cannot determine cgroup we are running in: %m");
2011
2012 /* Chop off the init scope, if we are already located in it */
2013 e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
2014
2015 /* LEGACY: Also chop off the system slice if we are in
2016 * it. This is to support live upgrades from older systemd
2017 * versions where PID 1 was moved there. Also see
2018 * cg_get_root_path(). */
2019 if (!e && MANAGER_IS_SYSTEM(m)) {
2020 e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
2021 if (!e)
2022 e = endswith(m->cgroup_root, "/system"); /* even more legacy */
2023 }
2024 if (e)
2025 *e = 0;
2026
2027 /* And make sure to store away the root value without trailing slash, even for the root dir, so that we can
2028 * easily prepend it everywhere. */
2029 delete_trailing_chars(m->cgroup_root, "/");
2030
2031 /* 2. Show data */
2032 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
2033 if (r < 0)
2034 return log_error_errno(r, "Cannot find cgroup mount point: %m");
2035
2036 r = cg_unified_flush();
2037 if (r < 0)
2038 return log_error_errno(r, "Couldn't determine if we are running in the unified hierarchy: %m");
2039
2040 all_unified = cg_all_unified();
2041 if (all_unified < 0)
2042 return log_error_errno(all_unified, "Couldn't determine whether we are in all unified mode: %m");
2043 if (all_unified > 0)
2044 log_debug("Unified cgroup hierarchy is located at %s.", path);
2045 else {
2046 r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
2047 if (r < 0)
2048 return log_error_errno(r, "Failed to determine whether systemd's own controller is in unified mode: %m");
2049 if (r > 0)
2050 log_debug("Unified cgroup hierarchy is located at %s. Controllers are on legacy hierarchies.", path);
2051 else
2052 log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER_LEGACY ". File system hierarchy is at %s.", path);
2053 }
2054
2055 /* 3. Allocate cgroup empty defer event source */
2056 m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);
2057 r = sd_event_add_defer(m->event, &m->cgroup_empty_event_source, on_cgroup_empty_event, m);
2058 if (r < 0)
2059 return log_error_errno(r, "Failed to create cgroup empty event source: %m");
2060
2061 r = sd_event_source_set_priority(m->cgroup_empty_event_source, SD_EVENT_PRIORITY_NORMAL-5);
2062 if (r < 0)
2063 return log_error_errno(r, "Failed to set priority of cgroup empty event source: %m");
2064
2065 r = sd_event_source_set_enabled(m->cgroup_empty_event_source, SD_EVENT_OFF);
2066 if (r < 0)
2067 return log_error_errno(r, "Failed to disable cgroup empty event source: %m");
2068
2069 (void) sd_event_source_set_description(m->cgroup_empty_event_source, "cgroup-empty");
2070
2071 /* 4. Install notifier inotify object, or agent */
2072 if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
2073
2074 /* In the unified hierarchy we can get cgroup empty notifications via inotify. */
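/* The fd allocated here only hooks inotify into the event loop; the per-unit watches that feed the
 * cgroup_inotify_wd_unit hashmap used in on_cgroup_inotify_event() are registered separately, as the
 * individual unit cgroups get set up. */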
2075
2076 m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
2077 safe_close(m->cgroup_inotify_fd);
2078
2079 m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
2080 if (m->cgroup_inotify_fd < 0)
2081 return log_error_errno(errno, "Failed to create control group inotify object: %m");
2082
2083 r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
2084 if (r < 0)
2085 return log_error_errno(r, "Failed to watch control group inotify object: %m");
2086
2087 /* Process cgroup empty notifications early, but after service notifications and SIGCHLD. Also
2088 * see handling of cgroup agent notifications, for the classic cgroup hierarchy support. */
2089 r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-4);
2090 if (r < 0)
2091 return log_error_errno(r, "Failed to set priority of inotify event source: %m");
2092
2093 (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");
2094
2095 } else if (MANAGER_IS_SYSTEM(m) && m->test_run_flags == 0) {
2096
2097 /* On the legacy hierarchy we only get notifications via cgroup agents. (Which isn't really reliable,
2098 * since it does not generate events when control groups with children run empty.) */
2099
2100 r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
2101 if (r < 0)
2102 log_warning_errno(r, "Failed to install release agent, ignoring: %m");
2103 else if (r > 0)
2104 log_debug("Installed release agent.");
2105 else if (r == 0)
2106 log_debug("Release agent already installed.");
2107 }
2108
2109 /* 5. Make sure we are in the special "init.scope" unit in the root slice. */
2110 scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
2111 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
2112 if (r < 0)
2113 return log_error_errno(r, "Failed to create %s control group: %m", scope_path);
2114
2115 /* Also, move all other userspace processes remaining in the root cgroup into that scope. */
2116 r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
2117 if (r < 0)
2118 log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");
2119
2120 /* 6. And pin it, so that it cannot be unmounted */
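/* Keeping an open directory fd on the cgroupfs mount point keeps the mount busy, so a regular (non-lazy)
 * unmount of it will fail with EBUSY for as long as we hold on to the fd. */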
2121 safe_close(m->pin_cgroupfs_fd);
2122 m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
2123 if (m->pin_cgroupfs_fd < 0)
2124 return log_error_errno(errno, "Failed to open pin file: %m");
2125
2126 /* 7. Always enable hierarchical support if it exists... */
2127 if (!all_unified && m->test_run_flags == 0)
2128 (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
2129
2130 /* 8. Figure out which controllers are supported, and log about it */
2131 r = cg_mask_supported(&m->cgroup_supported);
2132 if (r < 0)
2133 return log_error_errno(r, "Failed to determine supported controllers: %m");
2134 for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
2135 log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c)));
2136
2137 return 0;
2138 }
2139
2140 void manager_shutdown_cgroup(Manager *m, bool delete) {
2141 assert(m);
2142
2143 /* We can't really delete the group, since we are in it. But
2144 * let's trim it. */
2145 if (delete && m->cgroup_root)
2146 (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
2147
2148 m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);
2149
2150 m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit);
2151
2152 m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
2153 m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);
2154
2155 m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);
2156
2157 m->cgroup_root = mfree(m->cgroup_root);
2158 }
2159
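/* Map a cgroup path to the unit it belongs to: try the full path first, then walk upwards one path
 * component at a time. Purely illustrative example (hypothetical unit): for
 * "/system.slice/foo.service/control" we would try the full path, then "/system.slice/foo.service", then
 * "/system.slice", and finally fall back to the root slice. */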
2160 Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
2161 char *p;
2162 Unit *u;
2163
2164 assert(m);
2165 assert(cgroup);
2166
2167 u = hashmap_get(m->cgroup_unit, cgroup);
2168 if (u)
2169 return u;
2170
2171 p = strdupa(cgroup);
2172 for (;;) {
2173 char *e;
2174
2175 e = strrchr(p, '/');
2176 if (!e || e == p)
2177 return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);
2178
2179 *e = 0;
2180
2181 u = hashmap_get(m->cgroup_unit, p);
2182 if (u)
2183 return u;
2184 }
2185 }
2186
2187 Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) {
2188 _cleanup_free_ char *cgroup = NULL;
2189 int r;
2190
2191 assert(m);
2192
2193 if (pid <= 0)
2194 return NULL;
2195
2196 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
2197 if (r < 0)
2198 return NULL;
2199
2200 return manager_get_unit_by_cgroup(m, cgroup);
2201 }
2202
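/* Resolve a PID to its unit: the manager's own PID maps to the init.scope unit; otherwise consult the two
 * watched-PID maps first and only then fall back to the slower cgroup-path-based lookup above. */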
2203 Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
2204 Unit *u;
2205
2206 assert(m);
2207
2208 if (pid <= 0)
2209 return NULL;
2210
2211 if (pid == getpid_cached())
2212 return hashmap_get(m->units, SPECIAL_INIT_SCOPE);
2213
2214 u = hashmap_get(m->watch_pids1, PID_TO_PTR(pid));
2215 if (u)
2216 return u;
2217
2218 u = hashmap_get(m->watch_pids2, PID_TO_PTR(pid));
2219 if (u)
2220 return u;
2221
2222 return manager_get_unit_by_pid_cgroup(m, pid);
2223 }
2224
2225 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
2226 Unit *u;
2227
2228 assert(m);
2229 assert(cgroup);
2230
2231 /* Called on the legacy hierarchy whenever we get an explicit cgroup notification from the cgroup agent process
2232 * or from the --system instance */
2233
2234 log_debug("Got cgroup empty notification for: %s", cgroup);
2235
2236 u = manager_get_unit_by_cgroup(m, cgroup);
2237 if (!u)
2238 return 0;
2239
2240 unit_add_to_cgroup_empty_queue(u);
2241 return 1;
2242 }
2243
2244 int unit_get_memory_current(Unit *u, uint64_t *ret) {
2245 _cleanup_free_ char *v = NULL;
2246 int r;
2247
2248 assert(u);
2249 assert(ret);
2250
2251 if (!UNIT_CGROUP_BOOL(u, memory_accounting))
2252 return -ENODATA;
2253
2254 if (!u->cgroup_path)
2255 return -ENODATA;
2256
2257 if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
2258 return -ENODATA;
2259
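/* The attribute name differs between hierarchies: "memory.current" on the unified hierarchy,
 * "memory.usage_in_bytes" on the legacy "memory" controller. Both report current usage in bytes. */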
2260 r = cg_all_unified();
2261 if (r < 0)
2262 return r;
2263 if (r > 0)
2264 r = cg_get_attribute("memory", u->cgroup_path, "memory.current", &v);
2265 else
2266 r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
2267 if (r == -ENOENT)
2268 return -ENODATA;
2269 if (r < 0)
2270 return r;
2271
2272 return safe_atou64(v, ret);
2273 }
2274
2275 int unit_get_tasks_current(Unit *u, uint64_t *ret) {
2276 _cleanup_free_ char *v = NULL;
2277 int r;
2278
2279 assert(u);
2280 assert(ret);
2281
2282 if (!UNIT_CGROUP_BOOL(u, tasks_accounting))
2283 return -ENODATA;
2284
2285 if (!u->cgroup_path)
2286 return -ENODATA;
2287
2288 if ((u->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0)
2289 return -ENODATA;
2290
2291 r = cg_get_attribute("pids", u->cgroup_path, "pids.current", &v);
2292 if (r == -ENOENT)
2293 return -ENODATA;
2294 if (r < 0)
2295 return r;
2296
2297 return safe_atou64(v, ret);
2298 }
2299
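/* Read the raw, unadjusted CPU time consumed by the unit's cgroup, in nanoseconds. On the unified
 * hierarchy this comes from the "usage_usec" field of cpu.stat (microseconds, converted below); on the
 * legacy hierarchy it comes from cpuacct.usage, which is already in nanoseconds. */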
2300 static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
2301 _cleanup_free_ char *v = NULL;
2302 uint64_t ns;
2303 int r;
2304
2305 assert(u);
2306 assert(ret);
2307
2308 if (!u->cgroup_path)
2309 return -ENODATA;
2310
2311 r = cg_all_unified();
2312 if (r < 0)
2313 return r;
2314 if (r > 0) {
2315 const char *keys[] = { "usage_usec", NULL };
2316 _cleanup_free_ char *val = NULL;
2317 uint64_t us;
2318
2319 if ((u->cgroup_realized_mask & CGROUP_MASK_CPU) == 0)
2320 return -ENODATA;
2321
2322 r = cg_get_keyed_attribute("cpu", u->cgroup_path, "cpu.stat", keys, &val);
2323 if (r < 0)
2324 return r;
2325
2326 r = safe_atou64(val, &us);
2327 if (r < 0)
2328 return r;
2329
2330 ns = us * NSEC_PER_USEC;
2331 } else {
2332 if ((u->cgroup_realized_mask & CGROUP_MASK_CPUACCT) == 0)
2333 return -ENODATA;
2334
2335 r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);
2336 if (r == -ENOENT)
2337 return -ENODATA;
2338 if (r < 0)
2339 return r;
2340
2341 r = safe_atou64(v, &ns);
2342 if (r < 0)
2343 return r;
2344 }
2345
2346 *ret = ns;
2347 return 0;
2348 }
2349
2350 int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
2351 nsec_t ns;
2352 int r;
2353
2354 assert(u);
2355
2356 /* Retrieve the current CPU usage counter. This will subtract the CPU counter taken when the unit was
2357  * started. If the cgroup has been removed already, this returns the last cached value. To refresh the cached
2358  * value without retrieving it, simply call this function with NULL as the return parameter. */
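/* Purely illustrative example (made-up numbers): if cpu_usage_base was sampled as 1000000000 ns when the
 * unit started and the raw counter now reads 3500000000 ns, this function returns 2500000000 ns, i.e. 2.5s
 * of CPU time consumed since the unit was started. */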
2359
2360 if (!UNIT_CGROUP_BOOL(u, cpu_accounting))
2361 return -ENODATA;
2362
2363 r = unit_get_cpu_usage_raw(u, &ns);
2364 if (r == -ENODATA && u->cpu_usage_last != NSEC_INFINITY) {
2365 /* If we can't get the CPU usage anymore (because the cgroup was already removed, for example), use our
2366 * cached value. */
2367
2368 if (ret)
2369 *ret = u->cpu_usage_last;
2370 return 0;
2371 }
2372 if (r < 0)
2373 return r;
2374
2375 if (ns > u->cpu_usage_base)
2376 ns -= u->cpu_usage_base;
2377 else
2378 ns = 0;
2379
2380 u->cpu_usage_last = ns;
2381 if (ret)
2382 *ret = ns;
2383
2384 return 0;
2385 }
2386
2387 int unit_get_ip_accounting(
2388 Unit *u,
2389 CGroupIPAccountingMetric metric,
2390 uint64_t *ret) {
2391
2392 uint64_t value;
2393 int fd, r;
2394
2395 assert(u);
2396 assert(metric >= 0);
2397 assert(metric < _CGROUP_IP_ACCOUNTING_METRIC_MAX);
2398 assert(ret);
2399
2400 /* IP accounting is currently not recursive, and hence we refuse to return any data for slice nodes. Slices are
2401  * inner cgroup nodes and have no processes attached directly, so their counters would be zero
2402  * anyway. And if we refuse this now, we can open it up later, should the kernel learn recursive BPF cgroup
2403  * filters. */
2404 if (u->type == UNIT_SLICE)
2405 return -ENODATA;
2406
2407 if (!UNIT_CGROUP_BOOL(u, ip_accounting))
2408 return -ENODATA;
2409
2410 fd = IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_INGRESS_PACKETS) ?
2411 u->ip_accounting_ingress_map_fd :
2412 u->ip_accounting_egress_map_fd;
2413 if (fd < 0)
2414 return -ENODATA;
2415
2416 if (IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_EGRESS_BYTES))
2417 r = bpf_firewall_read_accounting(fd, &value, NULL);
2418 else
2419 r = bpf_firewall_read_accounting(fd, NULL, &value);
2420 if (r < 0)
2421 return r;
2422
2423 /* Add in additional metrics from a previous runtime. Note that when reexecing/reloading the daemon we compile
2424 * all BPF programs and maps anew, but serialize the old counters. When deserializing we store them in the
2425 * ip_accounting_extra[] field, and add them in here transparently. */
2426
2427 *ret = value + u->ip_accounting_extra[metric];
2428
2429 return r;
2430 }
2431
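/* Restart CPU accounting at the current counter value: drop the cached last value, sample the raw counter
 * and store it as the new base. If the counter cannot be read, the base is reset to 0 so that the next
 * successful reading is counted from scratch. */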
2432 int unit_reset_cpu_accounting(Unit *u) {
2433 nsec_t ns;
2434 int r;
2435
2436 assert(u);
2437
2438 u->cpu_usage_last = NSEC_INFINITY;
2439
2440 r = unit_get_cpu_usage_raw(u, &ns);
2441 if (r < 0) {
2442 u->cpu_usage_base = 0;
2443 return r;
2444 }
2445
2446 u->cpu_usage_base = ns;
2447 return 0;
2448 }
2449
2450 int unit_reset_ip_accounting(Unit *u) {
2451 int r = 0, q = 0;
2452
2453 assert(u);
2454
2455 if (u->ip_accounting_ingress_map_fd >= 0)
2456 r = bpf_firewall_reset_accounting(u->ip_accounting_ingress_map_fd);
2457
2458 if (u->ip_accounting_egress_map_fd >= 0)
2459 q = bpf_firewall_reset_accounting(u->ip_accounting_egress_map_fd);
2460
2461 zero(u->ip_accounting_extra);
2462
2463 return r < 0 ? r : q;
2464 }
2465
2466 void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
2467 assert(u);
2468
2469 if (!UNIT_HAS_CGROUP_CONTEXT(u))
2470 return;
2471
2472 if (m == 0)
2473 return;
2474
2475 /* always invalidate compat pairs together */
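/* (io/blkio and cpu/cpuacct are the unified and legacy counterparts backing the same high-level settings,
 * hence they are always realized together and must be invalidated together.) */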
2476 if (m & (CGROUP_MASK_IO | CGROUP_MASK_BLKIO))
2477 m |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
2478
2479 if (m & (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT))
2480 m |= CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT;
2481
2482 if ((u->cgroup_realized_mask & m) == 0) /* NOP? */
2483 return;
2484
2485 u->cgroup_realized_mask &= ~m;
2486 unit_add_to_cgroup_realize_queue(u);
2487 }
2488
2489 void unit_invalidate_cgroup_bpf(Unit *u) {
2490 assert(u);
2491
2492 if (!UNIT_HAS_CGROUP_CONTEXT(u))
2493 return;
2494
2495 if (u->cgroup_bpf_state == UNIT_CGROUP_BPF_INVALIDATED) /* NOP? */
2496 return;
2497
2498 u->cgroup_bpf_state = UNIT_CGROUP_BPF_INVALIDATED;
2499 unit_add_to_cgroup_realize_queue(u);
2500
2501 /* If we are a slice unit, we also need to compile a new BPF program for all our children, as the IP access
2502  * list of our children includes our own. */
2503 if (u->type == UNIT_SLICE) {
2504 Unit *member;
2505 Iterator i;
2506 void *v;
2507
2508 HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) {
2509 if (member == u)
2510 continue;
2511
2512 if (UNIT_DEREF(member->slice) != u)
2513 continue;
2514
2515 unit_invalidate_cgroup_bpf(member);
2516 }
2517 }
2518 }
2519
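/* Queue a cgroup re-realization of the CPU, IO and block IO settings for all units in the manager's
 * startup set, so that the switch between Startup*= and regular weights gets applied. */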
2520 void manager_invalidate_startup_units(Manager *m) {
2521 Iterator i;
2522 Unit *u;
2523
2524 assert(m);
2525
2526 SET_FOREACH(u, m->startup_units, i)
2527 unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_IO|CGROUP_MASK_BLKIO);
2528 }
2529
2530 static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
2531 [CGROUP_AUTO] = "auto",
2532 [CGROUP_CLOSED] = "closed",
2533 [CGROUP_STRICT] = "strict",
2534 };
2535
2536 DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);