1 /* SPDX-License-Identifier: LGPL-2.1+ */
2 /***
3 This file is part of systemd.
4
5 Copyright 2013 Lennart Poettering
6
7 systemd is free software; you can redistribute it and/or modify it
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
10 (at your option) any later version.
11
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
16
17 You should have received a copy of the GNU Lesser General Public License
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
19 ***/
20
21 #include <fcntl.h>
22 #include <fnmatch.h>
23
24 #include "alloc-util.h"
25 #include "blockdev-util.h"
26 #include "bpf-firewall.h"
27 #include "bus-error.h"
28 #include "cgroup-util.h"
29 #include "cgroup.h"
30 #include "fd-util.h"
31 #include "fileio.h"
32 #include "fs-util.h"
33 #include "parse-util.h"
34 #include "path-util.h"
35 #include "process-util.h"
36 #include "procfs-util.h"
37 #include "special.h"
38 #include "stdio-util.h"
39 #include "string-table.h"
40 #include "string-util.h"
41 #include "virt.h"
42
43 #define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
44
45 bool manager_owns_root_cgroup(Manager *m) {
46 assert(m);
47
48 /* Returns true if we are managing the root cgroup. Note that it isn't sufficient to just check whether the
49 * group root path equals "/" since that will also be the case if CLONE_NEWCGROUP is in the mix. Since there
50 * appears to be no nice way to detect whether we are in a CLONE_NEWCGROUP namespace, we instead just check if
51 * we run in any kind of container virtualization. */
52
53 if (detect_container() > 0)
54 return false;
55
56 return isempty(m->cgroup_root) || path_equal(m->cgroup_root, "/");
57 }
58
59 bool unit_has_root_cgroup(Unit *u) {
60 assert(u);
61
62 /* Returns whether this unit manages the root cgroup. This will return true if this unit is the root slice and
63 * the manager manages the root cgroup. */
64
65 if (!manager_owns_root_cgroup(u->manager))
66 return false;
67
68 return unit_has_name(u, SPECIAL_ROOT_SLICE);
69 }
70
71 static void cgroup_compat_warn(void) {
72 static bool cgroup_compat_warned = false;
73
74 if (cgroup_compat_warned)
75 return;
76
77 log_warning("cgroup compatibility translation between legacy and unified hierarchy settings activated. "
78 "See cgroup-compat debug messages for details.");
79
80 cgroup_compat_warned = true;
81 }
82
83 #define log_cgroup_compat(unit, fmt, ...) do { \
84 cgroup_compat_warn(); \
85 log_unit_debug(unit, "cgroup-compat: " fmt, ##__VA_ARGS__); \
86 } while (false)
87
88 void cgroup_context_init(CGroupContext *c) {
89 assert(c);
90
91 /* Initialize everything to the kernel defaults, assuming the
92 * structure is preinitialized to 0 */
93
94 c->cpu_weight = CGROUP_WEIGHT_INVALID;
95 c->startup_cpu_weight = CGROUP_WEIGHT_INVALID;
96 c->cpu_quota_per_sec_usec = USEC_INFINITY;
97
98 c->cpu_shares = CGROUP_CPU_SHARES_INVALID;
99 c->startup_cpu_shares = CGROUP_CPU_SHARES_INVALID;
100
101 c->memory_high = CGROUP_LIMIT_MAX;
102 c->memory_max = CGROUP_LIMIT_MAX;
103 c->memory_swap_max = CGROUP_LIMIT_MAX;
104
105 c->memory_limit = CGROUP_LIMIT_MAX;
106
107 c->io_weight = CGROUP_WEIGHT_INVALID;
108 c->startup_io_weight = CGROUP_WEIGHT_INVALID;
109
110 c->blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
111 c->startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
112
113 c->tasks_max = (uint64_t) -1;
114 }
115
116 void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
117 assert(c);
118 assert(a);
119
120 LIST_REMOVE(device_allow, c->device_allow, a);
121 free(a->path);
122 free(a);
123 }
124
125 void cgroup_context_free_io_device_weight(CGroupContext *c, CGroupIODeviceWeight *w) {
126 assert(c);
127 assert(w);
128
129 LIST_REMOVE(device_weights, c->io_device_weights, w);
130 free(w->path);
131 free(w);
132 }
133
134 void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit *l) {
135 assert(c);
136 assert(l);
137
138 LIST_REMOVE(device_limits, c->io_device_limits, l);
139 free(l->path);
140 free(l);
141 }
142
143 void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
144 assert(c);
145 assert(w);
146
147 LIST_REMOVE(device_weights, c->blockio_device_weights, w);
148 free(w->path);
149 free(w);
150 }
151
152 void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
153 assert(c);
154 assert(b);
155
156 LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
157 free(b->path);
158 free(b);
159 }
160
161 void cgroup_context_done(CGroupContext *c) {
162 assert(c);
163
164 while (c->io_device_weights)
165 cgroup_context_free_io_device_weight(c, c->io_device_weights);
166
167 while (c->io_device_limits)
168 cgroup_context_free_io_device_limit(c, c->io_device_limits);
169
170 while (c->blockio_device_weights)
171 cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
172
173 while (c->blockio_device_bandwidths)
174 cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);
175
176 while (c->device_allow)
177 cgroup_context_free_device_allow(c, c->device_allow);
178
179 c->ip_address_allow = ip_address_access_free_all(c->ip_address_allow);
180 c->ip_address_deny = ip_address_access_free_all(c->ip_address_deny);
181 }
182
183 void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
184 CGroupIODeviceLimit *il;
185 CGroupIODeviceWeight *iw;
186 CGroupBlockIODeviceBandwidth *b;
187 CGroupBlockIODeviceWeight *w;
188 CGroupDeviceAllow *a;
189 IPAddressAccessItem *iaai;
190 char u[FORMAT_TIMESPAN_MAX];
191
192 assert(c);
193 assert(f);
194
195 prefix = strempty(prefix);
196
197 fprintf(f,
198 "%sCPUAccounting=%s\n"
199 "%sIOAccounting=%s\n"
200 "%sBlockIOAccounting=%s\n"
201 "%sMemoryAccounting=%s\n"
202 "%sTasksAccounting=%s\n"
203 "%sIPAccounting=%s\n"
204 "%sCPUWeight=%" PRIu64 "\n"
205 "%sStartupCPUWeight=%" PRIu64 "\n"
206 "%sCPUShares=%" PRIu64 "\n"
207 "%sStartupCPUShares=%" PRIu64 "\n"
208 "%sCPUQuotaPerSecSec=%s\n"
209 "%sIOWeight=%" PRIu64 "\n"
210 "%sStartupIOWeight=%" PRIu64 "\n"
211 "%sBlockIOWeight=%" PRIu64 "\n"
212 "%sStartupBlockIOWeight=%" PRIu64 "\n"
213 "%sMemoryLow=%" PRIu64 "\n"
214 "%sMemoryHigh=%" PRIu64 "\n"
215 "%sMemoryMax=%" PRIu64 "\n"
216 "%sMemorySwapMax=%" PRIu64 "\n"
217 "%sMemoryLimit=%" PRIu64 "\n"
218 "%sTasksMax=%" PRIu64 "\n"
219 "%sDevicePolicy=%s\n"
220 "%sDelegate=%s\n",
221 prefix, yes_no(c->cpu_accounting),
222 prefix, yes_no(c->io_accounting),
223 prefix, yes_no(c->blockio_accounting),
224 prefix, yes_no(c->memory_accounting),
225 prefix, yes_no(c->tasks_accounting),
226 prefix, yes_no(c->ip_accounting),
227 prefix, c->cpu_weight,
228 prefix, c->startup_cpu_weight,
229 prefix, c->cpu_shares,
230 prefix, c->startup_cpu_shares,
231 prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
232 prefix, c->io_weight,
233 prefix, c->startup_io_weight,
234 prefix, c->blockio_weight,
235 prefix, c->startup_blockio_weight,
236 prefix, c->memory_low,
237 prefix, c->memory_high,
238 prefix, c->memory_max,
239 prefix, c->memory_swap_max,
240 prefix, c->memory_limit,
241 prefix, c->tasks_max,
242 prefix, cgroup_device_policy_to_string(c->device_policy),
243 prefix, yes_no(c->delegate));
244
245 if (c->delegate) {
246 _cleanup_free_ char *t = NULL;
247
248 (void) cg_mask_to_string(c->delegate_controllers, &t);
249
250 fprintf(f, "%sDelegateControllers=%s\n",
251 prefix,
252 strempty(t));
253 }
254
255 LIST_FOREACH(device_allow, a, c->device_allow)
256 fprintf(f,
257 "%sDeviceAllow=%s %s%s%s\n",
258 prefix,
259 a->path,
260 a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
261
262 LIST_FOREACH(device_weights, iw, c->io_device_weights)
263 fprintf(f,
264 "%sIODeviceWeight=%s %" PRIu64,
265 prefix,
266 iw->path,
267 iw->weight);
268
269 LIST_FOREACH(device_limits, il, c->io_device_limits) {
270 char buf[FORMAT_BYTES_MAX];
271 CGroupIOLimitType type;
272
273 for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
274 if (il->limits[type] != cgroup_io_limit_defaults[type])
275 fprintf(f,
276 "%s%s=%s %s\n",
277 prefix,
278 cgroup_io_limit_type_to_string(type),
279 il->path,
280 format_bytes(buf, sizeof(buf), il->limits[type]));
281 }
282
283 LIST_FOREACH(device_weights, w, c->blockio_device_weights)
284 fprintf(f,
285 "%sBlockIODeviceWeight=%s %" PRIu64,
286 prefix,
287 w->path,
288 w->weight);
289
290 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
291 char buf[FORMAT_BYTES_MAX];
292
293 if (b->rbps != CGROUP_LIMIT_MAX)
294 fprintf(f,
295 "%sBlockIOReadBandwidth=%s %s\n",
296 prefix,
297 b->path,
298 format_bytes(buf, sizeof(buf), b->rbps));
299 if (b->wbps != CGROUP_LIMIT_MAX)
300 fprintf(f,
301 "%sBlockIOWriteBandwidth=%s %s\n",
302 prefix,
303 b->path,
304 format_bytes(buf, sizeof(buf), b->wbps));
305 }
306
307 LIST_FOREACH(items, iaai, c->ip_address_allow) {
308 _cleanup_free_ char *k = NULL;
309
310 (void) in_addr_to_string(iaai->family, &iaai->address, &k);
311 fprintf(f, "%sIPAddressAllow=%s/%u\n", prefix, strnull(k), iaai->prefixlen);
312 }
313
314 LIST_FOREACH(items, iaai, c->ip_address_deny) {
315 _cleanup_free_ char *k = NULL;
316
317 (void) in_addr_to_string(iaai->family, &iaai->address, &k);
318 fprintf(f, "%sIPAddressDeny=%s/%u\n", prefix, strnull(k), iaai->prefixlen);
319 }
320 }
321
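/* Resolves a path to the dev_t used for per-device I/O settings: block device nodes are used directly,
 * other paths fall back to the block device backing their file system (partitions are resolved to the
 * whole disk where possible). */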
322 static int lookup_block_device(const char *p, dev_t *dev) {
323 struct stat st;
324 int r;
325
326 assert(p);
327 assert(dev);
328
329 r = stat(p, &st);
330 if (r < 0)
331 return log_warning_errno(errno, "Couldn't stat device %s: %m", p);
332
333 if (S_ISBLK(st.st_mode))
334 *dev = st.st_rdev;
335 else if (major(st.st_dev) != 0) {
336 /* If this is not a device node then find the block
337 * device this file is stored on */
338 *dev = st.st_dev;
339
340 /* If this is a partition, try to get the originating
341 * block device */
342 (void) block_get_whole_disk(*dev, dev);
343 } else {
344 log_warning("%s is not a block device, and the block device of its file system cannot be determined or is not local.", p);
345 return -ENODEV;
346 }
347
348 return 0;
349 }
350
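/* Adds a single device node to the legacy "devices" controller whitelist by writing a
 * "<c|b> <major>:<minor> <acc>" entry to devices.allow. A leading "-" in the node path means a
 * missing device is silently ignored. */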
351 static int whitelist_device(const char *path, const char *node, const char *acc) {
352 char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
353 struct stat st;
354 bool ignore_notfound;
355 int r;
356
357 assert(path);
358 assert(acc);
359
360 if (node[0] == '-') {
361 /* Non-existent paths starting with "-" must be silently ignored */
362 node++;
363 ignore_notfound = true;
364 } else
365 ignore_notfound = false;
366
367 if (stat(node, &st) < 0) {
368 if (errno == ENOENT && ignore_notfound)
369 return 0;
370
371 return log_warning_errno(errno, "Couldn't stat device %s: %m", node);
372 }
373
374 if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
375 log_warning("%s is not a device.", node);
376 return -ENODEV;
377 }
378
379 sprintf(buf,
380 "%c %u:%u %s",
381 S_ISCHR(st.st_mode) ? 'c' : 'b',
382 major(st.st_rdev), minor(st.st_rdev),
383 acc);
384
385 r = cg_set_attribute("devices", path, "devices.allow", buf);
386 if (r < 0)
387 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
388 "Failed to set devices.allow on %s: %m", path);
389
390 return r;
391 }
392
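/* Whitelists all devices with a given major number: the driver name is matched (via fnmatch()) against
 * /proc/devices and a "<type> <major>:* <acc>" entry is written to devices.allow. */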
393 static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
394 _cleanup_fclose_ FILE *f = NULL;
395 char line[LINE_MAX];
396 bool good = false;
397 int r;
398
399 assert(path);
400 assert(acc);
401 assert(IN_SET(type, 'b', 'c'));
402
403 f = fopen("/proc/devices", "re");
404 if (!f)
405 return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);
406
407 FOREACH_LINE(line, f, goto fail) {
408 char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
409 unsigned maj;
410
411 truncate_nl(line);
412
413 if (type == 'c' && streq(line, "Character devices:")) {
414 good = true;
415 continue;
416 }
417
418 if (type == 'b' && streq(line, "Block devices:")) {
419 good = true;
420 continue;
421 }
422
423 if (isempty(line)) {
424 good = false;
425 continue;
426 }
427
428 if (!good)
429 continue;
430
431 p = strstrip(line);
432
433 w = strpbrk(p, WHITESPACE);
434 if (!w)
435 continue;
436 *w = 0;
437
438 r = safe_atou(p, &maj);
439 if (r < 0)
440 continue;
441 if (maj <= 0)
442 continue;
443
444 w++;
445 w += strspn(w, WHITESPACE);
446
447 if (fnmatch(name, w, 0) != 0)
448 continue;
449
450 sprintf(buf,
451 "%c %u:* %s",
452 type,
453 maj,
454 acc);
455
456 r = cg_set_attribute("devices", path, "devices.allow", buf);
457 if (r < 0)
458 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
459 "Failed to set devices.allow on %s: %m", path);
460 }
461
462 return 0;
463
464 fail:
465 return log_warning_errno(errno, "Failed to read /proc/devices: %m");
466 }
467
468 static bool cgroup_context_has_cpu_weight(CGroupContext *c) {
469 return c->cpu_weight != CGROUP_WEIGHT_INVALID ||
470 c->startup_cpu_weight != CGROUP_WEIGHT_INVALID;
471 }
472
473 static bool cgroup_context_has_cpu_shares(CGroupContext *c) {
474 return c->cpu_shares != CGROUP_CPU_SHARES_INVALID ||
475 c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID;
476 }
477
478 static uint64_t cgroup_context_cpu_weight(CGroupContext *c, ManagerState state) {
479 if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
480 c->startup_cpu_weight != CGROUP_WEIGHT_INVALID)
481 return c->startup_cpu_weight;
482 else if (c->cpu_weight != CGROUP_WEIGHT_INVALID)
483 return c->cpu_weight;
484 else
485 return CGROUP_WEIGHT_DEFAULT;
486 }
487
488 static uint64_t cgroup_context_cpu_shares(CGroupContext *c, ManagerState state) {
489 if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
490 c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID)
491 return c->startup_cpu_shares;
492 else if (c->cpu_shares != CGROUP_CPU_SHARES_INVALID)
493 return c->cpu_shares;
494 else
495 return CGROUP_CPU_SHARES_DEFAULT;
496 }
497
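/* Writes the unified-hierarchy CPU attributes: cpu.weight, and cpu.max with the per-second quota scaled
 * to CGROUP_CPU_QUOTA_PERIOD_USEC (or "max" if no quota is configured). */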
498 static void cgroup_apply_unified_cpu_config(Unit *u, uint64_t weight, uint64_t quota) {
499 char buf[MAX(DECIMAL_STR_MAX(uint64_t) + 1, (DECIMAL_STR_MAX(usec_t) + 1) * 2)];
500 int r;
501
502 xsprintf(buf, "%" PRIu64 "\n", weight);
503 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.weight", buf);
504 if (r < 0)
505 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
506 "Failed to set cpu.weight: %m");
507
508 if (quota != USEC_INFINITY)
509 xsprintf(buf, USEC_FMT " " USEC_FMT "\n",
510 quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC, CGROUP_CPU_QUOTA_PERIOD_USEC);
511 else
512 xsprintf(buf, "max " USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
513
514 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.max", buf);
515
516 if (r < 0)
517 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
518 "Failed to set cpu.max: %m");
519 }
520
521 static void cgroup_apply_legacy_cpu_config(Unit *u, uint64_t shares, uint64_t quota) {
522 char buf[MAX(DECIMAL_STR_MAX(uint64_t), DECIMAL_STR_MAX(usec_t)) + 1];
523 int r;
524
525 xsprintf(buf, "%" PRIu64 "\n", shares);
526 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.shares", buf);
527 if (r < 0)
528 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
529 "Failed to set cpu.shares: %m");
530
531 xsprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
532 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_period_us", buf);
533 if (r < 0)
534 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
535 "Failed to set cpu.cfs_period_us: %m");
536
537 if (quota != USEC_INFINITY) {
538 xsprintf(buf, USEC_FMT "\n", quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
539 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_quota_us", buf);
540 } else
541 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_quota_us", "-1");
542 if (r < 0)
543 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
544 "Failed to set cpu.cfs_quota_us: %m");
545 }
546
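/* The following two helpers convert between legacy cpu.shares and unified cpu.weight values, scaling
 * linearly so that the respective defaults map onto each other and clamping to the valid range. */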
547 static uint64_t cgroup_cpu_shares_to_weight(uint64_t shares) {
548 return CLAMP(shares * CGROUP_WEIGHT_DEFAULT / CGROUP_CPU_SHARES_DEFAULT,
549 CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
550 }
551
552 static uint64_t cgroup_cpu_weight_to_shares(uint64_t weight) {
553 return CLAMP(weight * CGROUP_CPU_SHARES_DEFAULT / CGROUP_WEIGHT_DEFAULT,
554 CGROUP_CPU_SHARES_MIN, CGROUP_CPU_SHARES_MAX);
555 }
556
557 static bool cgroup_context_has_io_config(CGroupContext *c) {
558 return c->io_accounting ||
559 c->io_weight != CGROUP_WEIGHT_INVALID ||
560 c->startup_io_weight != CGROUP_WEIGHT_INVALID ||
561 c->io_device_weights ||
562 c->io_device_limits;
563 }
564
565 static bool cgroup_context_has_blockio_config(CGroupContext *c) {
566 return c->blockio_accounting ||
567 c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
568 c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
569 c->blockio_device_weights ||
570 c->blockio_device_bandwidths;
571 }
572
573 static uint64_t cgroup_context_io_weight(CGroupContext *c, ManagerState state) {
574 if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
575 c->startup_io_weight != CGROUP_WEIGHT_INVALID)
576 return c->startup_io_weight;
577 else if (c->io_weight != CGROUP_WEIGHT_INVALID)
578 return c->io_weight;
579 else
580 return CGROUP_WEIGHT_DEFAULT;
581 }
582
583 static uint64_t cgroup_context_blkio_weight(CGroupContext *c, ManagerState state) {
584 if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
585 c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
586 return c->startup_blockio_weight;
587 else if (c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
588 return c->blockio_weight;
589 else
590 return CGROUP_BLKIO_WEIGHT_DEFAULT;
591 }
592
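/* Likewise for the I/O controllers: convert between legacy blkio weights and unified io weights. */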
593 static uint64_t cgroup_weight_blkio_to_io(uint64_t blkio_weight) {
594 return CLAMP(blkio_weight * CGROUP_WEIGHT_DEFAULT / CGROUP_BLKIO_WEIGHT_DEFAULT,
595 CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
596 }
597
598 static uint64_t cgroup_weight_io_to_blkio(uint64_t io_weight) {
599 return CLAMP(io_weight * CGROUP_BLKIO_WEIGHT_DEFAULT / CGROUP_WEIGHT_DEFAULT,
600 CGROUP_BLKIO_WEIGHT_MIN, CGROUP_BLKIO_WEIGHT_MAX);
601 }
602
603 static void cgroup_apply_io_device_weight(Unit *u, const char *dev_path, uint64_t io_weight) {
604 char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
605 dev_t dev;
606 int r;
607
608 r = lookup_block_device(dev_path, &dev);
609 if (r < 0)
610 return;
611
612 xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), io_weight);
613 r = cg_set_attribute("io", u->cgroup_path, "io.weight", buf);
614 if (r < 0)
615 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
616 "Failed to set io.weight: %m");
617 }
618
619 static void cgroup_apply_blkio_device_weight(Unit *u, const char *dev_path, uint64_t blkio_weight) {
620 char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
621 dev_t dev;
622 int r;
623
624 r = lookup_block_device(dev_path, &dev);
625 if (r < 0)
626 return;
627
628 xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), blkio_weight);
629 r = cg_set_attribute("blkio", u->cgroup_path, "blkio.weight_device", buf);
630 if (r < 0)
631 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
632 "Failed to set blkio.weight_device: %m");
633 }
634
635 static unsigned cgroup_apply_io_device_limit(Unit *u, const char *dev_path, uint64_t *limits) {
636 char limit_bufs[_CGROUP_IO_LIMIT_TYPE_MAX][DECIMAL_STR_MAX(uint64_t)];
637 char buf[DECIMAL_STR_MAX(dev_t)*2+2+(6+DECIMAL_STR_MAX(uint64_t)+1)*4];
638 CGroupIOLimitType type;
639 dev_t dev;
640 unsigned n = 0;
641 int r;
642
643 r = lookup_block_device(dev_path, &dev);
644 if (r < 0)
645 return 0;
646
647 for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++) {
648 if (limits[type] != cgroup_io_limit_defaults[type]) {
649 xsprintf(limit_bufs[type], "%" PRIu64, limits[type]);
650 n++;
651 } else {
652 xsprintf(limit_bufs[type], "%s", limits[type] == CGROUP_LIMIT_MAX ? "max" : "0");
653 }
654 }
655
656 xsprintf(buf, "%u:%u rbps=%s wbps=%s riops=%s wiops=%s\n", major(dev), minor(dev),
657 limit_bufs[CGROUP_IO_RBPS_MAX], limit_bufs[CGROUP_IO_WBPS_MAX],
658 limit_bufs[CGROUP_IO_RIOPS_MAX], limit_bufs[CGROUP_IO_WIOPS_MAX]);
659 r = cg_set_attribute("io", u->cgroup_path, "io.max", buf);
660 if (r < 0)
661 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
662 "Failed to set io.max: %m");
663 return n;
664 }
665
666 static unsigned cgroup_apply_blkio_device_limit(Unit *u, const char *dev_path, uint64_t rbps, uint64_t wbps) {
667 char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
668 dev_t dev;
669 unsigned n = 0;
670 int r;
671
672 r = lookup_block_device(dev_path, &dev);
673 if (r < 0)
674 return 0;
675
676 if (rbps != CGROUP_LIMIT_MAX)
677 n++;
678 sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), rbps);
679 r = cg_set_attribute("blkio", u->cgroup_path, "blkio.throttle.read_bps_device", buf);
680 if (r < 0)
681 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
682 "Failed to set blkio.throttle.read_bps_device: %m");
683
684 if (wbps != CGROUP_LIMIT_MAX)
685 n++;
686 sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), wbps);
687 r = cg_set_attribute("blkio", u->cgroup_path, "blkio.throttle.write_bps_device", buf);
688 if (r < 0)
689 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
690 "Failed to set blkio.throttle.write_bps_device: %m");
691
692 return n;
693 }
694
695 static bool cgroup_context_has_unified_memory_config(CGroupContext *c) {
696 return c->memory_low > 0 || c->memory_high != CGROUP_LIMIT_MAX || c->memory_max != CGROUP_LIMIT_MAX || c->memory_swap_max != CGROUP_LIMIT_MAX;
697 }
698
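/* Writes a single unified-hierarchy memory attribute (memory.low/high/max/swap.max), mapping
 * CGROUP_LIMIT_MAX to the special value "max". */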
699 static void cgroup_apply_unified_memory_limit(Unit *u, const char *file, uint64_t v) {
700 char buf[DECIMAL_STR_MAX(uint64_t) + 1] = "max";
701 int r;
702
703 if (v != CGROUP_LIMIT_MAX)
704 xsprintf(buf, "%" PRIu64 "\n", v);
705
706 r = cg_set_attribute("memory", u->cgroup_path, file, buf);
707 if (r < 0)
708 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
709 "Failed to set %s: %m", file);
710 }
711
712 static void cgroup_apply_firewall(Unit *u) {
713 assert(u);
714
715 /* Best-effort: let's apply IP firewalling and/or accounting if that's enabled */
716
717 if (bpf_firewall_compile(u) < 0)
718 return;
719
720 (void) bpf_firewall_install(u);
721 }
722
723 static void cgroup_context_apply(
724 Unit *u,
725 CGroupMask apply_mask,
726 bool apply_bpf,
727 ManagerState state) {
728
729 const char *path;
730 CGroupContext *c;
731 bool is_root;
732 int r;
733
734 assert(u);
735
736 /* Nothing to do? Exit early! */
737 if (apply_mask == 0 && !apply_bpf)
738 return;
739
740 /* Some cgroup attributes are not supported on the root cgroup, hence silently ignore */
741 is_root = unit_has_root_cgroup(u);
742
743 assert_se(c = unit_get_cgroup_context(u));
744 assert_se(path = u->cgroup_path);
745
746 if (is_root) /* Make sure we don't try to display messages with an empty path. */
747 path = "/";
748
749 /* We generally ignore errors caused by read-only mounted
750 * cgroup trees (assuming we are running in a container then),
751 * and missing cgroups, i.e. EROFS and ENOENT. */
752
753 if ((apply_mask & CGROUP_MASK_CPU) && !is_root) {
754 bool has_weight, has_shares;
755
756 has_weight = cgroup_context_has_cpu_weight(c);
757 has_shares = cgroup_context_has_cpu_shares(c);
758
759 if (cg_all_unified() > 0) {
760 uint64_t weight;
761
762 if (has_weight)
763 weight = cgroup_context_cpu_weight(c, state);
764 else if (has_shares) {
765 uint64_t shares = cgroup_context_cpu_shares(c, state);
766
767 weight = cgroup_cpu_shares_to_weight(shares);
768
769 log_cgroup_compat(u, "Applying [Startup]CpuShares %" PRIu64 " as [Startup]CpuWeight %" PRIu64 " on %s",
770 shares, weight, path);
771 } else
772 weight = CGROUP_WEIGHT_DEFAULT;
773
774 cgroup_apply_unified_cpu_config(u, weight, c->cpu_quota_per_sec_usec);
775 } else {
776 uint64_t shares;
777
778 if (has_weight) {
779 uint64_t weight = cgroup_context_cpu_weight(c, state);
780
781 shares = cgroup_cpu_weight_to_shares(weight);
782
783 log_cgroup_compat(u, "Applying [Startup]CpuWeight %" PRIu64 " as [Startup]CpuShares %" PRIu64 " on %s",
784 weight, shares, path);
785 } else if (has_shares)
786 shares = cgroup_context_cpu_shares(c, state);
787 else
788 shares = CGROUP_CPU_SHARES_DEFAULT;
789
790 cgroup_apply_legacy_cpu_config(u, shares, c->cpu_quota_per_sec_usec);
791 }
792 }
793
794 if (apply_mask & CGROUP_MASK_IO) {
795 bool has_io = cgroup_context_has_io_config(c);
796 bool has_blockio = cgroup_context_has_blockio_config(c);
797
798 if (!is_root) {
799 char buf[8+DECIMAL_STR_MAX(uint64_t)+1];
800 uint64_t weight;
801
802 if (has_io)
803 weight = cgroup_context_io_weight(c, state);
804 else if (has_blockio) {
805 uint64_t blkio_weight = cgroup_context_blkio_weight(c, state);
806
807 weight = cgroup_weight_blkio_to_io(blkio_weight);
808
809 log_cgroup_compat(u, "Applying [Startup]BlockIOWeight %" PRIu64 " as [Startup]IOWeight %" PRIu64,
810 blkio_weight, weight);
811 } else
812 weight = CGROUP_WEIGHT_DEFAULT;
813
814 xsprintf(buf, "default %" PRIu64 "\n", weight);
815 r = cg_set_attribute("io", path, "io.weight", buf);
816 if (r < 0)
817 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
818 "Failed to set io.weight: %m");
819
820 if (has_io) {
821 CGroupIODeviceWeight *w;
822
823 /* FIXME: no way to reset this list */
824 LIST_FOREACH(device_weights, w, c->io_device_weights)
825 cgroup_apply_io_device_weight(u, w->path, w->weight);
826 } else if (has_blockio) {
827 CGroupBlockIODeviceWeight *w;
828
829 /* FIXME: no way to reset this list */
830 LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
831 weight = cgroup_weight_blkio_to_io(w->weight);
832
833 log_cgroup_compat(u, "Applying BlockIODeviceWeight %" PRIu64 " as IODeviceWeight %" PRIu64 " for %s",
834 w->weight, weight, w->path);
835
836 cgroup_apply_io_device_weight(u, w->path, weight);
837 }
838 }
839 }
840
841 /* Apply limits and free ones without config. */
842 if (has_io) {
843 CGroupIODeviceLimit *l, *next;
844
845 LIST_FOREACH_SAFE(device_limits, l, next, c->io_device_limits) {
846 if (!cgroup_apply_io_device_limit(u, l->path, l->limits))
847 cgroup_context_free_io_device_limit(c, l);
848 }
849 } else if (has_blockio) {
850 CGroupBlockIODeviceBandwidth *b, *next;
851
852 LIST_FOREACH_SAFE(device_bandwidths, b, next, c->blockio_device_bandwidths) {
853 uint64_t limits[_CGROUP_IO_LIMIT_TYPE_MAX];
854 CGroupIOLimitType type;
855
856 for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
857 limits[type] = cgroup_io_limit_defaults[type];
858
859 limits[CGROUP_IO_RBPS_MAX] = b->rbps;
860 limits[CGROUP_IO_WBPS_MAX] = b->wbps;
861
862 log_cgroup_compat(u, "Applying BlockIO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as IO{Read|Write}BandwidthMax for %s",
863 b->rbps, b->wbps, b->path);
864
865 if (!cgroup_apply_io_device_limit(u, b->path, limits))
866 cgroup_context_free_blockio_device_bandwidth(c, b);
867 }
868 }
869 }
870
871 if (apply_mask & CGROUP_MASK_BLKIO) {
872 bool has_io = cgroup_context_has_io_config(c);
873 bool has_blockio = cgroup_context_has_blockio_config(c);
874
875 if (!is_root) {
876 char buf[DECIMAL_STR_MAX(uint64_t)+1];
877 uint64_t weight;
878
879 if (has_io) {
880 uint64_t io_weight = cgroup_context_io_weight(c, state);
881
882 weight = cgroup_weight_io_to_blkio(cgroup_context_io_weight(c, state));
883
884 log_cgroup_compat(u, "Applying [Startup]IOWeight %" PRIu64 " as [Startup]BlockIOWeight %" PRIu64,
885 io_weight, weight);
886 } else if (has_blockio)
887 weight = cgroup_context_blkio_weight(c, state);
888 else
889 weight = CGROUP_BLKIO_WEIGHT_DEFAULT;
890
891 xsprintf(buf, "%" PRIu64 "\n", weight);
892 r = cg_set_attribute("blkio", path, "blkio.weight", buf);
893 if (r < 0)
894 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
895 "Failed to set blkio.weight: %m");
896
897 if (has_io) {
898 CGroupIODeviceWeight *w;
899
900 /* FIXME: no way to reset this list */
901 LIST_FOREACH(device_weights, w, c->io_device_weights) {
902 weight = cgroup_weight_io_to_blkio(w->weight);
903
904 log_cgroup_compat(u, "Applying IODeviceWeight %" PRIu64 " as BlockIODeviceWeight %" PRIu64 " for %s",
905 w->weight, weight, w->path);
906
907 cgroup_apply_blkio_device_weight(u, w->path, weight);
908 }
909 } else if (has_blockio) {
910 CGroupBlockIODeviceWeight *w;
911
912 /* FIXME: no way to reset this list */
913 LIST_FOREACH(device_weights, w, c->blockio_device_weights)
914 cgroup_apply_blkio_device_weight(u, w->path, w->weight);
915 }
916 }
917
918 /* Apply limits and free ones without config. */
919 if (has_io) {
920 CGroupIODeviceLimit *l, *next;
921
922 LIST_FOREACH_SAFE(device_limits, l, next, c->io_device_limits) {
923 log_cgroup_compat(u, "Applying IO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as BlockIO{Read|Write}BandwidthMax for %s",
924 l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX], l->path);
925
926 if (!cgroup_apply_blkio_device_limit(u, l->path, l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX]))
927 cgroup_context_free_io_device_limit(c, l);
928 }
929 } else if (has_blockio) {
930 CGroupBlockIODeviceBandwidth *b, *next;
931
932 LIST_FOREACH_SAFE(device_bandwidths, b, next, c->blockio_device_bandwidths)
933 if (!cgroup_apply_blkio_device_limit(u, b->path, b->rbps, b->wbps))
934 cgroup_context_free_blockio_device_bandwidth(c, b);
935 }
936 }
937
938 if ((apply_mask & CGROUP_MASK_MEMORY) && !is_root) {
939 if (cg_all_unified() > 0) {
940 uint64_t max, swap_max = CGROUP_LIMIT_MAX;
941
942 if (cgroup_context_has_unified_memory_config(c)) {
943 max = c->memory_max;
944 swap_max = c->memory_swap_max;
945 } else {
946 max = c->memory_limit;
947
948 if (max != CGROUP_LIMIT_MAX)
949 log_cgroup_compat(u, "Applying MemoryLimit %" PRIu64 " as MemoryMax", max);
950 }
951
952 cgroup_apply_unified_memory_limit(u, "memory.low", c->memory_low);
953 cgroup_apply_unified_memory_limit(u, "memory.high", c->memory_high);
954 cgroup_apply_unified_memory_limit(u, "memory.max", max);
955 cgroup_apply_unified_memory_limit(u, "memory.swap.max", swap_max);
956 } else {
957 char buf[DECIMAL_STR_MAX(uint64_t) + 1];
958 uint64_t val;
959
960 if (cgroup_context_has_unified_memory_config(c)) {
961 val = c->memory_max;
962 log_cgroup_compat(u, "Applying MemoryMax %" PRIu64 " as MemoryLimit", val);
963 } else
964 val = c->memory_limit;
965
966 if (val == CGROUP_LIMIT_MAX)
967 strncpy(buf, "-1\n", sizeof(buf));
968 else
969 xsprintf(buf, "%" PRIu64 "\n", val);
970
971 r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
972 if (r < 0)
973 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
974 "Failed to set memory.limit_in_bytes: %m");
975 }
976 }
977
978 if ((apply_mask & CGROUP_MASK_DEVICES) && !is_root) {
979 CGroupDeviceAllow *a;
980
981 /* Changing the devices list of a populated cgroup
982 * might result in EINVAL, hence ignore EINVAL
983 * here. */
984
985 if (c->device_allow || c->device_policy != CGROUP_AUTO)
986 r = cg_set_attribute("devices", path, "devices.deny", "a");
987 else
988 r = cg_set_attribute("devices", path, "devices.allow", "a");
989 if (r < 0)
990 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
991 "Failed to reset devices.list: %m");
992
993 if (c->device_policy == CGROUP_CLOSED ||
994 (c->device_policy == CGROUP_AUTO && c->device_allow)) {
995 static const char auto_devices[] =
996 "/dev/null\0" "rwm\0"
997 "/dev/zero\0" "rwm\0"
998 "/dev/full\0" "rwm\0"
999 "/dev/random\0" "rwm\0"
1000 "/dev/urandom\0" "rwm\0"
1001 "/dev/tty\0" "rwm\0"
1002 "/dev/ptmx\0" "rwm\0"
1003 /* Allow /run/systemd/inaccessible/{chr,blk} devices for mapping InaccessiblePaths */
1004 "-/run/systemd/inaccessible/chr\0" "rwm\0"
1005 "-/run/systemd/inaccessible/blk\0" "rwm\0";
1006
1007 const char *x, *y;
1008
1009 NULSTR_FOREACH_PAIR(x, y, auto_devices)
1010 whitelist_device(path, x, y);
1011
1012 /* PTS (/dev/pts) devices may not be duplicated, but accessed */
1013 whitelist_major(path, "pts", 'c', "rw");
1014 }
1015
1016 LIST_FOREACH(device_allow, a, c->device_allow) {
1017 char acc[4], *val;
1018 unsigned k = 0;
1019
1020 if (a->r)
1021 acc[k++] = 'r';
1022 if (a->w)
1023 acc[k++] = 'w';
1024 if (a->m)
1025 acc[k++] = 'm';
1026
1027 if (k == 0)
1028 continue;
1029
1030 acc[k++] = 0;
1031
1032 if (path_startswith(a->path, "/dev/"))
1033 whitelist_device(path, a->path, acc);
1034 else if ((val = startswith(a->path, "block-")))
1035 whitelist_major(path, val, 'b', acc);
1036 else if ((val = startswith(a->path, "char-")))
1037 whitelist_major(path, val, 'c', acc);
1038 else
1039 log_unit_debug(u, "Ignoring device %s while writing cgroup attribute.", a->path);
1040 }
1041 }
1042
1043 if (apply_mask & CGROUP_MASK_PIDS) {
1044
1045 if (is_root) {
1046 /* So, the "pids" controller does not expose anything on the root cgroup, in order not to
1047 * replicate knobs exposed elsewhere needlessly. We abstract this away here however, and when
1048 * the knobs of the root cgroup are modified propagate this to the relevant sysctls. There's a
1049 * non-obvious asymmetry however: unlike the cgroup properties we don't really want to take
1050 * exclusive ownership of the sysctls, but we still want to honour things if the user sets
1051 * limits. Hence we employ sort of a one-way strategy: when the user sets a bounded limit
1052 * through us it counts. When the user afterwards unsets it again (i.e. sets it to unbounded)
1053 * it also counts. But if the user never set a limit through us (i.e. we are the default of
1054 * "unbounded") we leave things unmodified. For this we manage a global boolean that we turn on
1055 * the first time we set a limit. Note that this boolean is flushed out on manager reload,
1056 * which is desirable so that there's an official way to release control of the sysctl from
1057 * systemd: set the limit to unbounded and reload. */
1058
1059 if (c->tasks_max != CGROUP_LIMIT_MAX) {
1060 u->manager->sysctl_pid_max_changed = true;
1061 r = procfs_tasks_set_limit(c->tasks_max);
1062 } else if (u->manager->sysctl_pid_max_changed)
1063 r = procfs_tasks_set_limit(TASKS_MAX);
1064 else
1065 r = 0;
1066
1067 if (r < 0)
1068 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
1069 "Failed to write to tasks limit sysctls: %m");
1070
1071 } else {
1072 if (c->tasks_max != CGROUP_LIMIT_MAX) {
1073 char buf[DECIMAL_STR_MAX(uint64_t) + 2];
1074
1075 sprintf(buf, "%" PRIu64 "\n", c->tasks_max);
1076 r = cg_set_attribute("pids", path, "pids.max", buf);
1077 } else
1078 r = cg_set_attribute("pids", path, "pids.max", "max");
1079 if (r < 0)
1080 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
1081 "Failed to set pids.max: %m");
1082 }
1083 }
1084
1085 if (apply_bpf)
1086 cgroup_apply_firewall(u);
1087 }
1088
1089 CGroupMask cgroup_context_get_mask(CGroupContext *c) {
1090 CGroupMask mask = 0;
1091
1092 /* Figure out which controllers we need */
1093
1094 if (c->cpu_accounting ||
1095 cgroup_context_has_cpu_weight(c) ||
1096 cgroup_context_has_cpu_shares(c) ||
1097 c->cpu_quota_per_sec_usec != USEC_INFINITY)
1098 mask |= CGROUP_MASK_CPUACCT | CGROUP_MASK_CPU;
1099
1100 if (cgroup_context_has_io_config(c) || cgroup_context_has_blockio_config(c))
1101 mask |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
1102
1103 if (c->memory_accounting ||
1104 c->memory_limit != CGROUP_LIMIT_MAX ||
1105 cgroup_context_has_unified_memory_config(c))
1106 mask |= CGROUP_MASK_MEMORY;
1107
1108 if (c->device_allow ||
1109 c->device_policy != CGROUP_AUTO)
1110 mask |= CGROUP_MASK_DEVICES;
1111
1112 if (c->tasks_accounting ||
1113 c->tasks_max != CGROUP_LIMIT_MAX)
1114 mask |= CGROUP_MASK_PIDS;
1115
1116 return mask;
1117 }
1118
1119 CGroupMask unit_get_own_mask(Unit *u) {
1120 CGroupContext *c;
1121
1122 /* Returns the mask of controllers the unit needs for itself */
1123
1124 c = unit_get_cgroup_context(u);
1125 if (!c)
1126 return 0;
1127
1128 return cgroup_context_get_mask(c) | unit_get_delegate_mask(u);
1129 }
1130
1131 CGroupMask unit_get_delegate_mask(Unit *u) {
1132 CGroupContext *c;
1133
1134 /* If delegation is turned on, then turn on selected controllers, unless we are on the legacy hierarchy and the
1135 * process we fork into is known to drop privileges, and hence shouldn't get access to the controllers.
1136 *
1137 * Note that on the unified hierarchy it is safe to delegate controllers to unprivileged services. */
1138
1139 if (!unit_cgroup_delegate(u))
1140 return 0;
1141
1142 if (cg_all_unified() <= 0) {
1143 ExecContext *e;
1144
1145 e = unit_get_exec_context(u);
1146 if (e && !exec_context_maintains_privileges(e))
1147 return 0;
1148 }
1149
1150 assert_se(c = unit_get_cgroup_context(u));
1151 return c->delegate_controllers;
1152 }
1153
1154 CGroupMask unit_get_members_mask(Unit *u) {
1155 assert(u);
1156
1157 /* Returns the mask of controllers all of the unit's children require, merged */
1158
1159 if (u->cgroup_members_mask_valid)
1160 return u->cgroup_members_mask;
1161
1162 u->cgroup_members_mask = 0;
1163
1164 if (u->type == UNIT_SLICE) {
1165 void *v;
1166 Unit *member;
1167 Iterator i;
1168
1169 HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) {
1170
1171 if (member == u)
1172 continue;
1173
1174 if (UNIT_DEREF(member->slice) != u)
1175 continue;
1176
1177 u->cgroup_members_mask |= unit_get_subtree_mask(member); /* note that this calls ourselves again, for the children */
1178 }
1179 }
1180
1181 u->cgroup_members_mask_valid = true;
1182 return u->cgroup_members_mask;
1183 }
1184
1185 CGroupMask unit_get_siblings_mask(Unit *u) {
1186 assert(u);
1187
1188 /* Returns the mask of controllers all of the unit's siblings
1189 * require, i.e. the members mask of the unit's parent slice
1190 * if there is one. */
1191
1192 if (UNIT_ISSET(u->slice))
1193 return unit_get_members_mask(UNIT_DEREF(u->slice));
1194
1195 return unit_get_subtree_mask(u); /* we are the top-level slice */
1196 }
1197
1198 CGroupMask unit_get_subtree_mask(Unit *u) {
1199
1200 /* Returns the mask of this subtree, meaning of the group
1201 * itself and its children. */
1202
1203 return unit_get_own_mask(u) | unit_get_members_mask(u);
1204 }
1205
1206 CGroupMask unit_get_target_mask(Unit *u) {
1207 CGroupMask mask;
1208
1209 /* This returns the cgroup mask of all controllers to enable
1210 * for a specific cgroup, i.e. everything it needs itself,
1211 * plus all that its children need, plus all that its siblings
1212 * need. This is primarily useful on the legacy cgroup
1213 * hierarchy, where we need to duplicate each cgroup in each
1214 * hierarchy that shall be enabled for it. */
1215
1216 mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
1217 mask &= u->manager->cgroup_supported;
1218
1219 return mask;
1220 }
1221
1222 CGroupMask unit_get_enable_mask(Unit *u) {
1223 CGroupMask mask;
1224
1225 /* This returns the cgroup mask of all controllers to enable
1226 * for the children of a specific cgroup. This is primarily
1227 * useful for the unified cgroup hierarchy, where each cgroup
1228 * controls which controllers are enabled for its children. */
1229
1230 mask = unit_get_members_mask(u);
1231 mask &= u->manager->cgroup_supported;
1232
1233 return mask;
1234 }
1235
1236 bool unit_get_needs_bpf(Unit *u) {
1237 CGroupContext *c;
1238 Unit *p;
1239 assert(u);
1240
1241 c = unit_get_cgroup_context(u);
1242 if (!c)
1243 return false;
1244
1245 if (c->ip_accounting ||
1246 c->ip_address_allow ||
1247 c->ip_address_deny)
1248 return true;
1249
1250 /* If any parent slice has an IP access list defined, it applies too */
1251 for (p = UNIT_DEREF(u->slice); p; p = UNIT_DEREF(p->slice)) {
1252 c = unit_get_cgroup_context(p);
1253 if (!c)
1254 return false;
1255
1256 if (c->ip_address_allow ||
1257 c->ip_address_deny)
1258 return true;
1259 }
1260
1261 return false;
1262 }
1263
1264 /* Recurse from a unit up through its containing slices, propagating
1265 * mask bits upward. A unit is also a member of itself. */
1266 void unit_update_cgroup_members_masks(Unit *u) {
1267 CGroupMask m;
1268 bool more;
1269
1270 assert(u);
1271
1272 /* Calculate subtree mask */
1273 m = unit_get_subtree_mask(u);
1274
1275 /* See if anything changed from the previous invocation. If
1276 * not, we're done. */
1277 if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
1278 return;
1279
1280 more =
1281 u->cgroup_subtree_mask_valid &&
1282 ((m & ~u->cgroup_subtree_mask) != 0) &&
1283 ((~m & u->cgroup_subtree_mask) == 0);
1284
1285 u->cgroup_subtree_mask = m;
1286 u->cgroup_subtree_mask_valid = true;
1287
1288 if (UNIT_ISSET(u->slice)) {
1289 Unit *s = UNIT_DEREF(u->slice);
1290
1291 if (more)
1292 /* There's more set now than before. We
1293 * propagate the new mask to the parent's mask
1294 * (not caring if it actually was valid or
1295 * not). */
1296
1297 s->cgroup_members_mask |= m;
1298
1299 else
1300 /* There's less set now than before (or we
1301 * don't know), we need to recalculate
1302 * everything, so let's invalidate the
1303 * parent's members mask */
1304
1305 s->cgroup_members_mask_valid = false;
1306
1307 /* And now make sure that this change also hits our
1308 * grandparents */
1309 unit_update_cgroup_members_masks(s);
1310 }
1311 }
1312
1313 const char *unit_get_realized_cgroup_path(Unit *u, CGroupMask mask) {
1314
1315 /* Returns the realized cgroup path of the specified unit where all specified controllers are available. */
1316
1317 while (u) {
1318
1319 if (u->cgroup_path &&
1320 u->cgroup_realized &&
1321 (u->cgroup_realized_mask & mask) == mask)
1322 return u->cgroup_path;
1323
1324 u = UNIT_DEREF(u->slice);
1325 }
1326
1327 return NULL;
1328 }
1329
1330 static const char *migrate_callback(CGroupMask mask, void *userdata) {
1331 return unit_get_realized_cgroup_path(userdata, mask);
1332 }
1333
1334 char *unit_default_cgroup_path(Unit *u) {
1335 _cleanup_free_ char *escaped = NULL, *slice = NULL;
1336 int r;
1337
1338 assert(u);
1339
1340 if (unit_has_name(u, SPECIAL_ROOT_SLICE))
1341 return strdup(u->manager->cgroup_root);
1342
1343 if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) {
1344 r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice);
1345 if (r < 0)
1346 return NULL;
1347 }
1348
1349 escaped = cg_escape(u->id);
1350 if (!escaped)
1351 return NULL;
1352
1353 if (slice)
1354 return strjoin(u->manager->cgroup_root, "/", slice, "/",
1355 escaped);
1356 else
1357 return strjoin(u->manager->cgroup_root, "/", escaped);
1358 }
1359
1360 int unit_set_cgroup_path(Unit *u, const char *path) {
1361 _cleanup_free_ char *p = NULL;
1362 int r;
1363
1364 assert(u);
1365
1366 if (path) {
1367 p = strdup(path);
1368 if (!p)
1369 return -ENOMEM;
1370 } else
1371 p = NULL;
1372
1373 if (streq_ptr(u->cgroup_path, p))
1374 return 0;
1375
1376 if (p) {
1377 r = hashmap_put(u->manager->cgroup_unit, p, u);
1378 if (r < 0)
1379 return r;
1380 }
1381
1382 unit_release_cgroup(u);
1383
1384 u->cgroup_path = TAKE_PTR(p);
1385
1386 return 1;
1387 }
1388
1389 int unit_watch_cgroup(Unit *u) {
1390 _cleanup_free_ char *events = NULL;
1391 int r;
1392
1393 assert(u);
1394
1395 if (!u->cgroup_path)
1396 return 0;
1397
1398 if (u->cgroup_inotify_wd >= 0)
1399 return 0;
1400
1401 /* Only applies to the unified hierarchy */
1402 r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
1403 if (r < 0)
1404 return log_error_errno(r, "Failed to determine whether the name=systemd hierarchy is unified: %m");
1405 if (r == 0)
1406 return 0;
1407
1408 /* Don't watch the root slice, it's pointless. */
1409 if (unit_has_name(u, SPECIAL_ROOT_SLICE))
1410 return 0;
1411
1412 r = hashmap_ensure_allocated(&u->manager->cgroup_inotify_wd_unit, &trivial_hash_ops);
1413 if (r < 0)
1414 return log_oom();
1415
1416 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events", &events);
1417 if (r < 0)
1418 return log_oom();
1419
1420 u->cgroup_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY);
1421 if (u->cgroup_inotify_wd < 0) {
1422
1423 if (errno == ENOENT) /* If the directory is already
1424 * gone we don't need to track
1425 * it, so this is not an error */
1426 return 0;
1427
1428 return log_unit_error_errno(u, errno, "Failed to add inotify watch descriptor for control group %s: %m", u->cgroup_path);
1429 }
1430
1431 r = hashmap_put(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd), u);
1432 if (r < 0)
1433 return log_unit_error_errno(u, r, "Failed to add inotify watch descriptor to hash map: %m");
1434
1435 return 0;
1436 }
1437
1438 int unit_pick_cgroup_path(Unit *u) {
1439 _cleanup_free_ char *path = NULL;
1440 int r;
1441
1442 assert(u);
1443
1444 if (u->cgroup_path)
1445 return 0;
1446
1447 if (!UNIT_HAS_CGROUP_CONTEXT(u))
1448 return -EINVAL;
1449
1450 path = unit_default_cgroup_path(u);
1451 if (!path)
1452 return log_oom();
1453
1454 r = unit_set_cgroup_path(u, path);
1455 if (r == -EEXIST)
1456 return log_unit_error_errno(u, r, "Control group %s exists already.", path);
1457 if (r < 0)
1458 return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", path);
1459
1460 return 0;
1461 }
1462
1463 static int unit_create_cgroup(
1464 Unit *u,
1465 CGroupMask target_mask,
1466 CGroupMask enable_mask,
1467 bool needs_bpf) {
1468
1469 CGroupContext *c;
1470 int r;
1471
1472 assert(u);
1473
1474 c = unit_get_cgroup_context(u);
1475 if (!c)
1476 return 0;
1477
1478 /* Figure out our cgroup path */
1479 r = unit_pick_cgroup_path(u);
1480 if (r < 0)
1481 return r;
1482
1483 /* First, create our own group */
1484 r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
1485 if (r < 0)
1486 return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", u->cgroup_path);
1487
1488 /* Start watching it */
1489 (void) unit_watch_cgroup(u);
1490
1491 /* Enable all controllers we need */
1492 r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path);
1493 if (r < 0)
1494 log_unit_warning_errno(u, r, "Failed to enable controllers on cgroup %s, ignoring: %m", u->cgroup_path);
1495
1496 /* Keep track that this is now realized */
1497 u->cgroup_realized = true;
1498 u->cgroup_realized_mask = target_mask;
1499 u->cgroup_enabled_mask = enable_mask;
1500 u->cgroup_bpf_state = needs_bpf ? UNIT_CGROUP_BPF_ON : UNIT_CGROUP_BPF_OFF;
1501
1502 if (u->type != UNIT_SLICE && !unit_cgroup_delegate(u)) {
1503
1504 /* Then, possibly move things over, but not if
1505 * subgroups may contain processes, which is the case
1506 * for slice and delegation units. */
1507 r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
1508 if (r < 0)
1509 log_unit_warning_errno(u, r, "Failed to migrate cgroup processes to %s, ignoring: %m", u->cgroup_path);
1510 }
1511
1512 return 0;
1513 }
1514
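/* Used by user managers that cannot move a process themselves: asks the system instance via the
 * AttachProcessesToUnit() bus call to attach the PID to the specified subcgroup of this unit. */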
1515 static int unit_attach_pid_to_cgroup_via_bus(Unit *u, pid_t pid, const char *suffix_path) {
1516 _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
1517 char *pp;
1518 int r;
1519
1520 assert(u);
1521
1522 if (MANAGER_IS_SYSTEM(u->manager))
1523 return -EINVAL;
1524
1525 if (!u->manager->system_bus)
1526 return -EIO;
1527
1528 if (!u->cgroup_path)
1529 return -EINVAL;
1530
1531 /* Determine this unit's cgroup path relative to our cgroup root */
1532 pp = path_startswith(u->cgroup_path, u->manager->cgroup_root);
1533 if (!pp)
1534 return -EINVAL;
1535
1536 pp = strjoina("/", pp, suffix_path);
1537 path_kill_slashes(pp);
1538
1539 r = sd_bus_call_method(u->manager->system_bus,
1540 "org.freedesktop.systemd1",
1541 "/org/freedesktop/systemd1",
1542 "org.freedesktop.systemd1.Manager",
1543 "AttachProcessesToUnit",
1544 &error, NULL,
1545 "ssau",
1546 NULL /* empty unit name means client's unit, i.e. us */, pp, 1, (uint32_t) pid);
1547 if (r < 0)
1548 return log_unit_debug_errno(u, r, "Failed to attach unit process " PID_FMT " via the bus: %s", pid, bus_error_message(&error, r));
1549
1550 return 0;
1551 }
1552
1553 int unit_attach_pids_to_cgroup(Unit *u, Set *pids, const char *suffix_path) {
1554 CGroupMask delegated_mask;
1555 const char *p;
1556 Iterator i;
1557 void *pidp;
1558 int r, q;
1559
1560 assert(u);
1561
1562 if (!UNIT_HAS_CGROUP_CONTEXT(u))
1563 return -EINVAL;
1564
1565 if (set_isempty(pids))
1566 return 0;
1567
1568 r = unit_realize_cgroup(u);
1569 if (r < 0)
1570 return r;
1571
1572 if (isempty(suffix_path))
1573 p = u->cgroup_path;
1574 else
1575 p = strjoina(u->cgroup_path, "/", suffix_path);
1576
1577 delegated_mask = unit_get_delegate_mask(u);
1578
1579 r = 0;
1580 SET_FOREACH(pidp, pids, i) {
1581 pid_t pid = PTR_TO_PID(pidp);
1582 CGroupController c;
1583
1584 /* First, attach the PID to the main cgroup hierarchy */
1585 q = cg_attach(SYSTEMD_CGROUP_CONTROLLER, p, pid);
1586 if (q < 0) {
1587 log_unit_debug_errno(u, q, "Couldn't move process " PID_FMT " to requested cgroup '%s': %m", pid, p);
1588
1589 if (MANAGER_IS_USER(u->manager) && IN_SET(q, -EPERM, -EACCES)) {
1590 int z;
1591
1592 /* If we are in a user instance, and we can't move the process ourselves due to
1593 * permission problems, let's ask the system instance about it instead. Since it's more
1594 * privileged it might be able to move the process across the leaves of a subtree whose
1595 * top node is not owned by us. */
1596
1597 z = unit_attach_pid_to_cgroup_via_bus(u, pid, suffix_path);
1598 if (z < 0)
1599 log_unit_debug_errno(u, z, "Couldn't move process " PID_FMT " to requested cgroup '%s' via the system bus either: %m", pid, p);
1600 else
1601 continue; /* When the bus thing worked via the bus we are fully done for this PID. */
1602 }
1603
1604 if (r >= 0)
1605 r = q; /* Remember first error */
1606
1607 continue;
1608 }
1609
1610 q = cg_all_unified();
1611 if (q < 0)
1612 return q;
1613 if (q > 0)
1614 continue;
1615
1616 /* In the legacy hierarchy, attach the process to the requested cgroup if possible, and if not to the
1617 * innermost realized one */
1618
1619 for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
1620 CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
1621 const char *realized;
1622
1623 if (!(u->manager->cgroup_supported & bit))
1624 continue;
1625
1626 /* If this controller is delegated and realized, honour the caller's request for the cgroup suffix. */
1627 if (delegated_mask & u->cgroup_realized_mask & bit) {
1628 q = cg_attach(cgroup_controller_to_string(c), p, pid);
1629 if (q >= 0)
1630 continue; /* Success! */
1631
1632 log_unit_debug_errno(u, q, "Failed to attach PID " PID_FMT " to requested cgroup %s in controller %s, falling back to unit's cgroup: %m",
1633 pid, p, cgroup_controller_to_string(c));
1634 }
1635
1636 /* So this controller is either not delegated or not realized, or something else weird happened. In
1637 * that case let's attach the PID at least to the closest cgroup up the tree that is
1638 * realized. */
1639 realized = unit_get_realized_cgroup_path(u, bit);
1640 if (!realized)
1641 continue; /* Not even realized in the root slice? Then let's not bother */
1642
1643 q = cg_attach(cgroup_controller_to_string(c), realized, pid);
1644 if (q < 0)
1645 log_unit_debug_errno(u, q, "Failed to attach PID " PID_FMT " to realized cgroup %s in controller %s, ignoring: %m",
1646 pid, realized, cgroup_controller_to_string(c));
1647 }
1648 }
1649
1650 return r;
1651 }
1652
1653 static void cgroup_xattr_apply(Unit *u) {
1654 char ids[SD_ID128_STRING_MAX];
1655 int r;
1656
1657 assert(u);
1658
1659 if (!MANAGER_IS_SYSTEM(u->manager))
1660 return;
1661
1662 if (sd_id128_is_null(u->invocation_id))
1663 return;
1664
1665 r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path,
1666 "trusted.invocation_id",
1667 sd_id128_to_string(u->invocation_id, ids), 32,
1668 0);
1669 if (r < 0)
1670 log_unit_debug_errno(u, r, "Failed to set invocation ID on control group %s, ignoring: %m", u->cgroup_path);
1671 }
1672
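/* Returns true if the unit's cgroup is already realized with exactly the requested controller masks and
 * BPF state, i.e. there is nothing to (re)apply. */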
1673 static bool unit_has_mask_realized(
1674 Unit *u,
1675 CGroupMask target_mask,
1676 CGroupMask enable_mask,
1677 bool needs_bpf) {
1678
1679 assert(u);
1680
1681 return u->cgroup_realized &&
1682 u->cgroup_realized_mask == target_mask &&
1683 u->cgroup_enabled_mask == enable_mask &&
1684 ((needs_bpf && u->cgroup_bpf_state == UNIT_CGROUP_BPF_ON) ||
1685 (!needs_bpf && u->cgroup_bpf_state == UNIT_CGROUP_BPF_OFF));
1686 }
1687
1688 static void unit_add_to_cgroup_realize_queue(Unit *u) {
1689 assert(u);
1690
1691 if (u->in_cgroup_realize_queue)
1692 return;
1693
1694 LIST_PREPEND(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
1695 u->in_cgroup_realize_queue = true;
1696 }
1697
1698 static void unit_remove_from_cgroup_realize_queue(Unit *u) {
1699 assert(u);
1700
1701 if (!u->in_cgroup_realize_queue)
1702 return;
1703
1704 LIST_REMOVE(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
1705 u->in_cgroup_realize_queue = false;
1706 }
1707
1708
1709 /* Check if necessary controllers and attributes for a unit are in place.
1710 *
1711 * If so, do nothing.
1712 * If not, create paths, move processes over, and set attributes.
1713 *
1714 * Returns 0 on success and < 0 on failure. */
1715 static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
1716 CGroupMask target_mask, enable_mask;
1717 bool needs_bpf, apply_bpf;
1718 int r;
1719
1720 assert(u);
1721
1722 unit_remove_from_cgroup_realize_queue(u);
1723
1724 target_mask = unit_get_target_mask(u);
1725 enable_mask = unit_get_enable_mask(u);
1726 needs_bpf = unit_get_needs_bpf(u);
1727
1728 if (unit_has_mask_realized(u, target_mask, enable_mask, needs_bpf))
1729 return 0;
1730
1731 /* Make sure we apply the BPF filters either when one is configured, or if none is configured but previously
1732 * the state was anything but off. This way, if a unit with a BPF filter applied is reconfigured to lose it
1733 * this will trickle down properly to cgroupfs. */
1734 apply_bpf = needs_bpf || u->cgroup_bpf_state != UNIT_CGROUP_BPF_OFF;
1735
1736 /* First, realize parents */
1737 if (UNIT_ISSET(u->slice)) {
1738 r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
1739 if (r < 0)
1740 return r;
1741 }
1742
1743 /* And then do the real work */
1744 r = unit_create_cgroup(u, target_mask, enable_mask, needs_bpf);
1745 if (r < 0)
1746 return r;
1747
1748 /* Finally, apply the necessary attributes. */
1749 cgroup_context_apply(u, target_mask, apply_bpf, state);
1750 cgroup_xattr_apply(u);
1751
1752 return 0;
1753 }
1754
1755 unsigned manager_dispatch_cgroup_realize_queue(Manager *m) {
1756 ManagerState state;
1757 unsigned n = 0;
1758 Unit *i;
1759 int r;
1760
1761 assert(m);
1762
1763 state = manager_state(m);
1764
1765 while ((i = m->cgroup_realize_queue)) {
1766 assert(i->in_cgroup_realize_queue);
1767
1768 if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(i))) {
1769 /* Maybe things changed, and the unit is not actually active anymore? */
1770 unit_remove_from_cgroup_realize_queue(i);
1771 continue;
1772 }
1773
1774 r = unit_realize_cgroup_now(i, state);
1775 if (r < 0)
1776 log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);
1777
1778 n++;
1779 }
1780
1781 return n;
1782 }
1783
1784 static void unit_add_siblings_to_cgroup_realize_queue(Unit *u) {
1785 Unit *slice;
1786
1787 /* This adds the siblings of the specified unit and the
1788 * siblings of all parent units to the cgroup queue. (But
1789 * neither the specified unit itself nor the parents.) */
1790
1791 while ((slice = UNIT_DEREF(u->slice))) {
1792 Iterator i;
1793 Unit *m;
1794 void *v;
1795
1796 HASHMAP_FOREACH_KEY(v, m, u->dependencies[UNIT_BEFORE], i) {
1797 if (m == u)
1798 continue;
1799
1800 /* Skip units that have a dependency on the slice
1801 * but aren't actually in it. */
1802 if (UNIT_DEREF(m->slice) != slice)
1803 continue;
1804
1805 /* No point in doing cgroup application for units
1806 * without active processes. */
1807 if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
1808 continue;
1809
1810 /* If the unit doesn't need any new controllers
1811 * and has current ones realized, it doesn't need
1812 * any changes. */
1813 if (unit_has_mask_realized(m,
1814 unit_get_target_mask(m),
1815 unit_get_enable_mask(m),
1816 unit_get_needs_bpf(m)))
1817 continue;
1818
1819 unit_add_to_cgroup_realize_queue(m);
1820 }
1821
1822 u = slice;
1823 }
1824 }
1825
1826 int unit_realize_cgroup(Unit *u) {
1827 assert(u);
1828
1829 if (!UNIT_HAS_CGROUP_CONTEXT(u))
1830 return 0;
1831
1832         /* So, here's the deal: when realizing the cgroups for this
1833          * unit, we first need to create all parents, but there's
1834          * actually more to it: for the weight-based controllers we
1835          * also need to make sure that all our siblings (i.e. units
1836          * that are in the same slice as we are) have cgroups, too.
1837          * Otherwise, things would become very uneven as each of their
1838          * processes would get as many resources as our whole group
1839          * together. This call will synchronously create the parent
1840          * cgroups, but will defer work on the siblings to the next
1841          * event loop iteration. */
1842
1843 /* Add all sibling slices to the cgroup queue. */
1844 unit_add_siblings_to_cgroup_realize_queue(u);
1845
1846 /* And realize this one now (and apply the values) */
1847 return unit_realize_cgroup_now(u, manager_state(u->manager));
1848 }
1849
1850 void unit_release_cgroup(Unit *u) {
1851 assert(u);
1852
1853         /* Forgets all cgroup details for this unit */
1854
1855 if (u->cgroup_path) {
1856 (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
1857 u->cgroup_path = mfree(u->cgroup_path);
1858 }
1859
1860 if (u->cgroup_inotify_wd >= 0) {
1861 if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_inotify_wd) < 0)
1862                         log_unit_debug_errno(u, errno, "Failed to remove cgroup inotify watch %i for %s, ignoring: %m", u->cgroup_inotify_wd, u->id);
1863
1864 (void) hashmap_remove(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd));
1865 u->cgroup_inotify_wd = -1;
1866 }
1867 }
1868
1869 void unit_prune_cgroup(Unit *u) {
1870 int r;
1871 bool is_root_slice;
1872
1873 assert(u);
1874
1875 /* Removes the cgroup, if empty and possible, and stops watching it. */
1876
1877 if (!u->cgroup_path)
1878 return;
1879
1880 (void) unit_get_cpu_usage(u, NULL); /* Cache the last CPU usage value before we destroy the cgroup */
1881
1882 is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);
1883
1884 r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
1885 if (r < 0) {
1886 log_unit_debug_errno(u, r, "Failed to destroy cgroup %s, ignoring: %m", u->cgroup_path);
1887 return;
1888 }
1889
1890 if (is_root_slice)
1891 return;
1892
1893 unit_release_cgroup(u);
1894
1895 u->cgroup_realized = false;
1896 u->cgroup_realized_mask = 0;
1897 u->cgroup_enabled_mask = 0;
1898 }
1899
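/* Tries to guess the unit's main PID by scanning its cgroup: only processes that (still) have the
 * manager as their parent are considered; fails with -ENODATA if more than one candidate is found. */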
1900 int unit_search_main_pid(Unit *u, pid_t *ret) {
1901 _cleanup_fclose_ FILE *f = NULL;
1902 pid_t pid = 0, npid, mypid;
1903 int r;
1904
1905 assert(u);
1906 assert(ret);
1907
1908 if (!u->cgroup_path)
1909 return -ENXIO;
1910
1911 r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f);
1912 if (r < 0)
1913 return r;
1914
1915 mypid = getpid_cached();
1916 while (cg_read_pid(f, &npid) > 0) {
1917 pid_t ppid;
1918
1919 if (npid == pid)
1920 continue;
1921
1922 /* Ignore processes that aren't our kids */
1923 if (get_process_ppid(npid, &ppid) >= 0 && ppid != mypid)
1924 continue;
1925
1926 if (pid != 0)
1927                         /* Dang, there's more than one daemonized PID
1928                          * in this group, so we don't know what
1929                          * process is the main process. */
1930                         return -ENODATA;
1931
1932
1933 pid = npid;
1934 }
1935
1936 *ret = pid;
1937 return 0;
1938 }
1939
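/* Recursively adds every PID found in the given cgroup path and all of its sub-cgroups to the set of
 * PIDs watched for this unit. The first error encountered is returned, but enumeration continues. */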
1940 static int unit_watch_pids_in_path(Unit *u, const char *path) {
1941 _cleanup_closedir_ DIR *d = NULL;
1942 _cleanup_fclose_ FILE *f = NULL;
1943 int ret = 0, r;
1944
1945 assert(u);
1946 assert(path);
1947
1948 r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
1949 if (r < 0)
1950 ret = r;
1951 else {
1952 pid_t pid;
1953
1954 while ((r = cg_read_pid(f, &pid)) > 0) {
1955 r = unit_watch_pid(u, pid);
1956 if (r < 0 && ret >= 0)
1957 ret = r;
1958 }
1959
1960 if (r < 0 && ret >= 0)
1961 ret = r;
1962 }
1963
1964 r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
1965 if (r < 0) {
1966 if (ret >= 0)
1967 ret = r;
1968 } else {
1969 char *fn;
1970
1971 while ((r = cg_read_subgroup(d, &fn)) > 0) {
1972 _cleanup_free_ char *p = NULL;
1973
1974 p = strjoin(path, "/", fn);
1975 free(fn);
1976
1977 if (!p)
1978 return -ENOMEM;
1979
1980 r = unit_watch_pids_in_path(u, p);
1981 if (r < 0 && ret >= 0)
1982 ret = r;
1983 }
1984
1985 if (r < 0 && ret >= 0)
1986 ret = r;
1987 }
1988
1989 return ret;
1990 }
1991
1992 int unit_synthesize_cgroup_empty_event(Unit *u) {
1993 int r;
1994
1995 assert(u);
1996
1997 /* Enqueue a synthetic cgroup empty event if this unit doesn't watch any PIDs anymore. This is compatibility
1998          * support for non-unified systems where notifications aren't reliable, and hence we need to take whatever we
1999          * can get as a notification source as soon as we stop having any useful PIDs to watch for. */
2000
2001 if (!u->cgroup_path)
2002 return -ENOENT;
2003
2004 r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
2005 if (r < 0)
2006 return r;
2007 if (r > 0) /* On unified we have reliable notifications, and don't need this */
2008 return 0;
2009
2010 if (!set_isempty(u->pids))
2011 return 0;
2012
2013 unit_add_to_cgroup_empty_queue(u);
2014 return 0;
2015 }
2016
2017 int unit_watch_all_pids(Unit *u) {
2018 int r;
2019
2020 assert(u);
2021
2022 /* Adds all PIDs from our cgroup to the set of PIDs we
2023          * watch. This is fallback logic for cases where we do not
2024          * get reliable cgroup empty notifications: we try to use
2025          * SIGCHLD as a replacement. */
2026
2027 if (!u->cgroup_path)
2028 return -ENOENT;
2029
2030 r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
2031 if (r < 0)
2032 return r;
2033 if (r > 0) /* On unified we can use proper notifications */
2034 return 0;
2035
2036 return unit_watch_pids_in_path(u, u->cgroup_path);
2037 }
2038
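/* Defer event source callback: handles one unit from the cgroup empty queue per dispatch, re-arming
 * itself (SD_EVENT_ONESHOT) if further units remain queued. */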
2039 static int on_cgroup_empty_event(sd_event_source *s, void *userdata) {
2040 Manager *m = userdata;
2041 Unit *u;
2042 int r;
2043
2044 assert(s);
2045 assert(m);
2046
2047 u = m->cgroup_empty_queue;
2048 if (!u)
2049 return 0;
2050
2051 assert(u->in_cgroup_empty_queue);
2052 u->in_cgroup_empty_queue = false;
2053 LIST_REMOVE(cgroup_empty_queue, m->cgroup_empty_queue, u);
2054
2055 if (m->cgroup_empty_queue) {
2056 /* More stuff queued, let's make sure we remain enabled */
2057 r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
2058 if (r < 0)
2059 log_debug_errno(r, "Failed to reenable cgroup empty event source: %m");
2060 }
2061
2062 unit_add_to_gc_queue(u);
2063
2064 if (UNIT_VTABLE(u)->notify_cgroup_empty)
2065 UNIT_VTABLE(u)->notify_cgroup_empty(u);
2066
2067 return 0;
2068 }
2069
2070 void unit_add_to_cgroup_empty_queue(Unit *u) {
2071 int r;
2072
2073 assert(u);
2074
2075         /* Note that there are four different ways in which cgroup empty events reach us:
2076 *
2077 * 1. On the unified hierarchy we get an inotify event on the cgroup
2078 *
2079 * 2. On the legacy hierarchy, when running in system mode, we get a datagram on the cgroup agent socket
2080 *
2081 * 3. On the legacy hierarchy, when running in user mode, we get a D-Bus signal on the system bus
2082 *
2083 * 4. On the legacy hierarchy, in service units we start watching all processes of the cgroup for SIGCHLD as
2084 * soon as we get one SIGCHLD, to deal with unreliable cgroup notifications.
2085 *
2086          * Regardless of which way we got the notification, we'll verify it here, and then add it to a separate
2087 * queue. This queue will be dispatched at a lower priority than the SIGCHLD handler, so that we always use
2088 * SIGCHLD if we can get it first, and only use the cgroup empty notifications if there's no SIGCHLD pending
2089          * (which might happen if the cgroup doesn't contain processes that are our own children, which is typically the
2090 * case for scope units). */
2091
2092 if (u->in_cgroup_empty_queue)
2093 return;
2094
2095 /* Let's verify that the cgroup is really empty */
2096 if (!u->cgroup_path)
2097 return;
2098 r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
2099 if (r < 0) {
2100 log_unit_debug_errno(u, r, "Failed to determine whether cgroup %s is empty: %m", u->cgroup_path);
2101 return;
2102 }
2103 if (r == 0)
2104 return;
2105
2106 LIST_PREPEND(cgroup_empty_queue, u->manager->cgroup_empty_queue, u);
2107 u->in_cgroup_empty_queue = true;
2108
2109 /* Trigger the defer event */
2110 r = sd_event_source_set_enabled(u->manager->cgroup_empty_event_source, SD_EVENT_ONESHOT);
2111 if (r < 0)
2112 log_debug_errno(r, "Failed to enable cgroup empty event source: %m");
2113 }
2114
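/* IO callback for the unified hierarchy: drains the inotify fd, maps each watch descriptor back to
 * its unit and queues a (verified) cgroup empty check for it. */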
2115 static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
2116 Manager *m = userdata;
2117
2118 assert(s);
2119 assert(fd >= 0);
2120 assert(m);
2121
2122 for (;;) {
2123 union inotify_event_buffer buffer;
2124 struct inotify_event *e;
2125 ssize_t l;
2126
2127 l = read(fd, &buffer, sizeof(buffer));
2128 if (l < 0) {
2129 if (IN_SET(errno, EINTR, EAGAIN))
2130 return 0;
2131
2132 return log_error_errno(errno, "Failed to read control group inotify events: %m");
2133 }
2134
2135 FOREACH_INOTIFY_EVENT(e, buffer, l) {
2136 Unit *u;
2137
2138 if (e->wd < 0)
2139 /* Queue overflow has no watch descriptor */
2140 continue;
2141
2142 if (e->mask & IN_IGNORED)
2143 /* The watch was just removed */
2144 continue;
2145
2146 u = hashmap_get(m->cgroup_inotify_wd_unit, INT_TO_PTR(e->wd));
2147                         if (!u) /* Note that inotify might deliver
2148 * events for a watch even after it
2149 * was removed, because it was queued
2150 * before the removal. Let's ignore
2151 * this here safely. */
2152 continue;
2153
2154 unit_add_to_cgroup_empty_queue(u);
2155 }
2156 }
2157 }
2158
2159 int manager_setup_cgroup(Manager *m) {
2160 _cleanup_free_ char *path = NULL;
2161 const char *scope_path;
2162 CGroupController c;
2163 int r, all_unified;
2164 char *e;
2165
2166 assert(m);
2167
2168 /* 1. Determine hierarchy */
2169 m->cgroup_root = mfree(m->cgroup_root);
2170 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
2171 if (r < 0)
2172 return log_error_errno(r, "Cannot determine cgroup we are running in: %m");
2173
2174 /* Chop off the init scope, if we are already located in it */
2175 e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
2176
2177 /* LEGACY: Also chop off the system slice if we are in
2178 * it. This is to support live upgrades from older systemd
2179 * versions where PID 1 was moved there. Also see
2180 * cg_get_root_path(). */
2181 if (!e && MANAGER_IS_SYSTEM(m)) {
2182 e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
2183 if (!e)
2184 e = endswith(m->cgroup_root, "/system"); /* even more legacy */
2185 }
2186 if (e)
2187 *e = 0;
2188
2189 /* And make sure to store away the root value without trailing slash, even for the root dir, so that we can
2190 * easily prepend it everywhere. */
2191 delete_trailing_chars(m->cgroup_root, "/");
2192
2193 /* 2. Show data */
2194 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
2195 if (r < 0)
2196 return log_error_errno(r, "Cannot find cgroup mount point: %m");
2197
2198 r = cg_unified_flush();
2199 if (r < 0)
2200 return log_error_errno(r, "Couldn't determine if we are running in the unified hierarchy: %m");
2201
2202 all_unified = cg_all_unified();
2203 if (all_unified < 0)
2204 return log_error_errno(all_unified, "Couldn't determine whether we are in all unified mode: %m");
2205 if (all_unified > 0)
2206 log_debug("Unified cgroup hierarchy is located at %s.", path);
2207 else {
2208 r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
2209 if (r < 0)
2210 return log_error_errno(r, "Failed to determine whether systemd's own controller is in unified mode: %m");
2211 if (r > 0)
2212 log_debug("Unified cgroup hierarchy is located at %s. Controllers are on legacy hierarchies.", path);
2213 else
2214 log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER_LEGACY ". File system hierarchy is at %s.", path);
2215 }
2216
2217 /* 3. Allocate cgroup empty defer event source */
2218 m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);
2219 r = sd_event_add_defer(m->event, &m->cgroup_empty_event_source, on_cgroup_empty_event, m);
2220 if (r < 0)
2221 return log_error_errno(r, "Failed to create cgroup empty event source: %m");
2222
2223 r = sd_event_source_set_priority(m->cgroup_empty_event_source, SD_EVENT_PRIORITY_NORMAL-5);
2224 if (r < 0)
2225 return log_error_errno(r, "Failed to set priority of cgroup empty event source: %m");
2226
2227 r = sd_event_source_set_enabled(m->cgroup_empty_event_source, SD_EVENT_OFF);
2228 if (r < 0)
2229 return log_error_errno(r, "Failed to disable cgroup empty event source: %m");
2230
2231 (void) sd_event_source_set_description(m->cgroup_empty_event_source, "cgroup-empty");
2232
2233 /* 4. Install notifier inotify object, or agent */
2234 if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
2235
2236 /* In the unified hierarchy we can get cgroup empty notifications via inotify. */
2237
2238 m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
2239 safe_close(m->cgroup_inotify_fd);
2240
2241 m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
2242 if (m->cgroup_inotify_fd < 0)
2243 return log_error_errno(errno, "Failed to create control group inotify object: %m");
2244
2245 r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
2246 if (r < 0)
2247 return log_error_errno(r, "Failed to watch control group inotify object: %m");
2248
2249 /* Process cgroup empty notifications early, but after service notifications and SIGCHLD. Also
2250 * see handling of cgroup agent notifications, for the classic cgroup hierarchy support. */
2251 r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-4);
2252 if (r < 0)
2253 return log_error_errno(r, "Failed to set priority of inotify event source: %m");
2254
2255 (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");
2256
2257 } else if (MANAGER_IS_SYSTEM(m) && m->test_run_flags == 0) {
2258
2259 /* On the legacy hierarchy we only get notifications via cgroup agents. (Which isn't really reliable,
2260                  * since it does not generate events when control groups with children run empty.) */
2261
2262 r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
2263 if (r < 0)
2264 log_warning_errno(r, "Failed to install release agent, ignoring: %m");
2265 else if (r > 0)
2266 log_debug("Installed release agent.");
2267 else if (r == 0)
2268 log_debug("Release agent already installed.");
2269 }
2270
2271 /* 5. Make sure we are in the special "init.scope" unit in the root slice. */
2272 scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
2273 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
2274 if (r >= 0) {
2275 /* Also, move all other userspace processes remaining in the root cgroup into that scope. */
2276 r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
2277 if (r < 0)
2278 log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");
2279
2280 /* 6. And pin it, so that it cannot be unmounted */
2281 safe_close(m->pin_cgroupfs_fd);
2282 m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
2283 if (m->pin_cgroupfs_fd < 0)
2284 return log_error_errno(errno, "Failed to open pin file: %m");
2285
2286 } else if (r < 0 && !m->test_run_flags)
2287 return log_error_errno(r, "Failed to create %s control group: %m", scope_path);
2288
2289 /* 7. Always enable hierarchical support if it exists... */
2290 if (!all_unified && m->test_run_flags == 0)
2291 (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
2292
2293 /* 8. Figure out which controllers are supported, and log about it */
2294 r = cg_mask_supported(&m->cgroup_supported);
2295 if (r < 0)
2296 return log_error_errno(r, "Failed to determine supported controllers: %m");
2297 for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
2298 log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c)));
2299
2300 return 0;
2301 }
2302
2303 void manager_shutdown_cgroup(Manager *m, bool delete) {
2304 assert(m);
2305
2306 /* We can't really delete the group, since we are in it. But
2307 * let's trim it. */
2308 if (delete && m->cgroup_root)
2309 (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
2310
2311 m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);
2312
2313 m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit);
2314
2315 m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
2316 m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);
2317
2318 m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);
2319
2320 m->cgroup_root = mfree(m->cgroup_root);
2321 }
2322
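/* Maps a cgroup path to the innermost unit that owns it, chopping off trailing path components until
 * a match is found. For example, a (hypothetical) lookup of "/foo.slice/bar.service/baz" would try the
 * full path first, then "/foo.slice/bar.service", then "/foo.slice", and finally fall back to the root
 * slice. */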
2323 Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
2324 char *p;
2325 Unit *u;
2326
2327 assert(m);
2328 assert(cgroup);
2329
2330 u = hashmap_get(m->cgroup_unit, cgroup);
2331 if (u)
2332 return u;
2333
2334 p = strdupa(cgroup);
2335 for (;;) {
2336 char *e;
2337
2338 e = strrchr(p, '/');
2339 if (!e || e == p)
2340 return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);
2341
2342 *e = 0;
2343
2344 u = hashmap_get(m->cgroup_unit, p);
2345 if (u)
2346 return u;
2347 }
2348 }
2349
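/* Resolves a PID to the unit owning it via the PID's cgroup path; returns NULL if the PID is invalid
 * or its cgroup cannot be determined. */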
2350 Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) {
2351 _cleanup_free_ char *cgroup = NULL;
2352
2353 assert(m);
2354
2355 if (!pid_is_valid(pid))
2356 return NULL;
2357
2358 if (cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup) < 0)
2359 return NULL;
2360
2361 return manager_get_unit_by_cgroup(m, cgroup);
2362 }
2363
2364 Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
2365 Unit *u, **array;
2366
2367 assert(m);
2368
2369         /* Note that a process might be owned by multiple units; we return only one here, which is good enough for most
2370          * cases, though not strictly correct. We prefer the one reported by cgroup membership, as that's the most
2371          * relevant one: children of the process will be assigned to that one, too, before all else. */
2372
2373 if (!pid_is_valid(pid))
2374 return NULL;
2375
2376 if (pid == getpid_cached())
2377 return hashmap_get(m->units, SPECIAL_INIT_SCOPE);
2378
2379 u = manager_get_unit_by_pid_cgroup(m, pid);
2380 if (u)
2381 return u;
2382
2383 u = hashmap_get(m->watch_pids, PID_TO_PTR(pid));
2384 if (u)
2385 return u;
2386
2387 array = hashmap_get(m->watch_pids, PID_TO_PTR(-pid));
2388 if (array)
2389 return array[0];
2390
2391 return NULL;
2392 }
2393
2394 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
2395 Unit *u;
2396
2397 assert(m);
2398 assert(cgroup);
2399
2400 /* Called on the legacy hierarchy whenever we get an explicit cgroup notification from the cgroup agent process
2401 * or from the --system instance */
2402
2403 log_debug("Got cgroup empty notification for: %s", cgroup);
2404
2405 u = manager_get_unit_by_cgroup(m, cgroup);
2406 if (!u)
2407 return 0;
2408
2409 unit_add_to_cgroup_empty_queue(u);
2410 return 1;
2411 }
2412
2413 int unit_get_memory_current(Unit *u, uint64_t *ret) {
2414 _cleanup_free_ char *v = NULL;
2415 int r;
2416
2417 assert(u);
2418 assert(ret);
2419
2420 if (!UNIT_CGROUP_BOOL(u, memory_accounting))
2421 return -ENODATA;
2422
2423 if (!u->cgroup_path)
2424 return -ENODATA;
2425
2426 /* The root cgroup doesn't expose this information, let's get it from /proc instead */
2427 if (unit_has_root_cgroup(u))
2428 return procfs_memory_get_current(ret);
2429
2430 if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
2431 return -ENODATA;
2432
2433 r = cg_all_unified();
2434 if (r < 0)
2435 return r;
2436 if (r > 0)
2437 r = cg_get_attribute("memory", u->cgroup_path, "memory.current", &v);
2438 else
2439 r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
2440 if (r == -ENOENT)
2441 return -ENODATA;
2442 if (r < 0)
2443 return r;
2444
2445 return safe_atou64(v, ret);
2446 }
2447
2448 int unit_get_tasks_current(Unit *u, uint64_t *ret) {
2449 _cleanup_free_ char *v = NULL;
2450 int r;
2451
2452 assert(u);
2453 assert(ret);
2454
2455 if (!UNIT_CGROUP_BOOL(u, tasks_accounting))
2456 return -ENODATA;
2457
2458 if (!u->cgroup_path)
2459 return -ENODATA;
2460
2461 /* The root cgroup doesn't expose this information, let's get it from /proc instead */
2462 if (unit_has_root_cgroup(u))
2463 return procfs_tasks_get_current(ret);
2464
2465 if ((u->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0)
2466 return -ENODATA;
2467
2468 r = cg_get_attribute("pids", u->cgroup_path, "pids.current", &v);
2469 if (r == -ENOENT)
2470 return -ENODATA;
2471 if (r < 0)
2472 return r;
2473
2474 return safe_atou64(v, ret);
2475 }
2476
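/* Reads the unit's raw CPU usage counter: "usage_usec" from cpu.stat on the unified hierarchy,
 * "cpuacct.usage" on the legacy one; the result is normalized to nanoseconds in both cases. */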
2477 static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
2478 _cleanup_free_ char *v = NULL;
2479 uint64_t ns;
2480 int r;
2481
2482 assert(u);
2483 assert(ret);
2484
2485 if (!u->cgroup_path)
2486 return -ENODATA;
2487
2488 /* The root cgroup doesn't expose this information, let's get it from /proc instead */
2489 if (unit_has_root_cgroup(u))
2490 return procfs_cpu_get_usage(ret);
2491
2492 r = cg_all_unified();
2493 if (r < 0)
2494 return r;
2495 if (r > 0) {
2496 _cleanup_free_ char *val = NULL;
2497 uint64_t us;
2498
2499 if ((u->cgroup_realized_mask & CGROUP_MASK_CPU) == 0)
2500 return -ENODATA;
2501
2502 r = cg_get_keyed_attribute("cpu", u->cgroup_path, "cpu.stat", STRV_MAKE("usage_usec"), &val);
2503 if (r < 0)
2504 return r;
2505 if (IN_SET(r, -ENOENT, -ENXIO))
2506 return -ENODATA;
2507
2508 r = safe_atou64(val, &us);
2509 if (r < 0)
2510 return r;
2511
2512 ns = us * NSEC_PER_USEC;
2513 } else {
2514 if ((u->cgroup_realized_mask & CGROUP_MASK_CPUACCT) == 0)
2515 return -ENODATA;
2516
2517 r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);
2518 if (r == -ENOENT)
2519 return -ENODATA;
2520 if (r < 0)
2521 return r;
2522
2523 r = safe_atou64(v, &ns);
2524 if (r < 0)
2525 return r;
2526 }
2527
2528 *ret = ns;
2529 return 0;
2530 }
2531
2532 int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
2533 nsec_t ns;
2534 int r;
2535
2536 assert(u);
2537
2538 /* Retrieve the current CPU usage counter. This will subtract the CPU counter taken when the unit was
2539          * started. If the cgroup has been removed already, this returns the last cached value. To cache the value, simply
2540          * call this function with a NULL return parameter. */
2541
2542 if (!UNIT_CGROUP_BOOL(u, cpu_accounting))
2543 return -ENODATA;
2544
2545 r = unit_get_cpu_usage_raw(u, &ns);
2546 if (r == -ENODATA && u->cpu_usage_last != NSEC_INFINITY) {
2547 /* If we can't get the CPU usage anymore (because the cgroup was already removed, for example), use our
2548 * cached value. */
2549
2550 if (ret)
2551 *ret = u->cpu_usage_last;
2552 return 0;
2553 }
2554 if (r < 0)
2555 return r;
2556
2557 if (ns > u->cpu_usage_base)
2558 ns -= u->cpu_usage_base;
2559 else
2560 ns = 0;
2561
2562 u->cpu_usage_last = ns;
2563 if (ret)
2564 *ret = ns;
2565
2566 return 0;
2567 }
2568
2569 int unit_get_ip_accounting(
2570 Unit *u,
2571 CGroupIPAccountingMetric metric,
2572 uint64_t *ret) {
2573
2574 uint64_t value;
2575 int fd, r;
2576
2577 assert(u);
2578 assert(metric >= 0);
2579 assert(metric < _CGROUP_IP_ACCOUNTING_METRIC_MAX);
2580 assert(ret);
2581
2582 if (!UNIT_CGROUP_BOOL(u, ip_accounting))
2583 return -ENODATA;
2584
2585 fd = IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_INGRESS_PACKETS) ?
2586 u->ip_accounting_ingress_map_fd :
2587 u->ip_accounting_egress_map_fd;
2588 if (fd < 0)
2589 return -ENODATA;
2590
2591 if (IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_EGRESS_BYTES))
2592 r = bpf_firewall_read_accounting(fd, &value, NULL);
2593 else
2594 r = bpf_firewall_read_accounting(fd, NULL, &value);
2595 if (r < 0)
2596 return r;
2597
2598 /* Add in additional metrics from a previous runtime. Note that when reexecing/reloading the daemon we compile
2599 * all BPF programs and maps anew, but serialize the old counters. When deserializing we store them in the
2600 * ip_accounting_extra[] field, and add them in here transparently. */
2601
2602 *ret = value + u->ip_accounting_extra[metric];
2603
2604 return r;
2605 }
2606
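/* Resets CPU accounting: the cached usage value is discarded and the current raw counter becomes the
 * new base for subsequent measurements. */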
2607 int unit_reset_cpu_accounting(Unit *u) {
2608 nsec_t ns;
2609 int r;
2610
2611 assert(u);
2612
2613 u->cpu_usage_last = NSEC_INFINITY;
2614
2615 r = unit_get_cpu_usage_raw(u, &ns);
2616 if (r < 0) {
2617 u->cpu_usage_base = 0;
2618 return r;
2619 }
2620
2621 u->cpu_usage_base = ns;
2622 return 0;
2623 }
2624
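/* Resets the ingress and egress BPF accounting maps as well as the extra counters carried over from a
 * previous runtime; returns the first error encountered, if any. */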
2625 int unit_reset_ip_accounting(Unit *u) {
2626 int r = 0, q = 0;
2627
2628 assert(u);
2629
2630 if (u->ip_accounting_ingress_map_fd >= 0)
2631 r = bpf_firewall_reset_accounting(u->ip_accounting_ingress_map_fd);
2632
2633 if (u->ip_accounting_egress_map_fd >= 0)
2634 q = bpf_firewall_reset_accounting(u->ip_accounting_egress_map_fd);
2635
2636 zero(u->ip_accounting_extra);
2637
2638 return r < 0 ? r : q;
2639 }
2640
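/* Marks the given controllers as no longer realized for this unit and queues it for re-realization, so
 * that the corresponding attributes are written out again. */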
2641 void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
2642 assert(u);
2643
2644 if (!UNIT_HAS_CGROUP_CONTEXT(u))
2645 return;
2646
2647 if (m == 0)
2648 return;
2649
2650 /* always invalidate compat pairs together */
2651 if (m & (CGROUP_MASK_IO | CGROUP_MASK_BLKIO))
2652 m |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
2653
2654 if (m & (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT))
2655 m |= CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT;
2656
2657 if ((u->cgroup_realized_mask & m) == 0) /* NOP? */
2658 return;
2659
2660 u->cgroup_realized_mask &= ~m;
2661 unit_add_to_cgroup_realize_queue(u);
2662 }
2663
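/* Marks the unit's BPF program as out of date and queues the unit for re-realization; for slices this
 * is propagated to all direct member units, as explained below. */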
2664 void unit_invalidate_cgroup_bpf(Unit *u) {
2665 assert(u);
2666
2667 if (!UNIT_HAS_CGROUP_CONTEXT(u))
2668 return;
2669
2670 if (u->cgroup_bpf_state == UNIT_CGROUP_BPF_INVALIDATED) /* NOP? */
2671 return;
2672
2673 u->cgroup_bpf_state = UNIT_CGROUP_BPF_INVALIDATED;
2674 unit_add_to_cgroup_realize_queue(u);
2675
2676         /* If we are a slice unit, we also need to compile a new BPF program for all our children, as the IP access
2677 * list of our children includes our own. */
2678 if (u->type == UNIT_SLICE) {
2679 Unit *member;
2680 Iterator i;
2681 void *v;
2682
2683 HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) {
2684 if (member == u)
2685 continue;
2686
2687 if (UNIT_DEREF(member->slice) != u)
2688 continue;
2689
2690 unit_invalidate_cgroup_bpf(member);
2691 }
2692 }
2693 }
2694
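/* Returns true only if the unit type supports delegation and delegation is enabled in its cgroup
 * context. */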
2695 bool unit_cgroup_delegate(Unit *u) {
2696 CGroupContext *c;
2697
2698 assert(u);
2699
2700 if (!UNIT_VTABLE(u)->can_delegate)
2701 return false;
2702
2703 c = unit_get_cgroup_context(u);
2704 if (!c)
2705 return false;
2706
2707 return c->delegate;
2708 }
2709
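/* Queues every unit in the manager's startup_units set for re-realization of the CPU, IO and BLKIO
 * controllers. */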
2710 void manager_invalidate_startup_units(Manager *m) {
2711 Iterator i;
2712 Unit *u;
2713
2714 assert(m);
2715
2716 SET_FOREACH(u, m->startup_units, i)
2717 unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_IO|CGROUP_MASK_BLKIO);
2718 }
2719
2720 static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
2721 [CGROUP_AUTO] = "auto",
2722 [CGROUP_CLOSED] = "closed",
2723 [CGROUP_STRICT] = "strict",
2724 };
2725
2726 DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);