src/core/cgroup.c

   1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
   2
   3 #include <fcntl.h>
   4
   5 #include "sd-messages.h"
   6
   7 #include "af-list.h"
   8 #include "alloc-util.h"
   9 #include "blockdev-util.h"
  10 #include "bpf-devices.h"
  11 #include "bpf-firewall.h"
  12 #include "bpf-foreign.h"
  13 #include "bpf-restrict-ifaces.h"
  14 #include "bpf-socket-bind.h"
  15 #include "btrfs-util.h"
  16 #include "bus-error.h"
  17 #include "bus-locator.h"
  18 #include "cgroup-setup.h"
  19 #include "cgroup-util.h"
  20 #include "cgroup.h"
  21 #include "devnum-util.h"
  22 #include "fd-util.h"
  23 #include "fileio.h"
  24 #include "firewall-util.h"
  25 #include "in-addr-prefix-util.h"
  26 #include "inotify-util.h"
  27 #include "io-util.h"
  28 #include "ip-protocol-list.h"
  29 #include "limits-util.h"
  30 #include "nulstr-util.h"
  31 #include "parse-util.h"
  32 #include "path-util.h"
  33 #include "percent-util.h"
  34 #include "process-util.h"
  35 #include "procfs-util.h"
  36 #include "set.h"
  37 #include "serialize.h"
  38 #include "special.h"
  39 #include "stdio-util.h"
  40 #include "string-table.h"
  41 #include "string-util.h"
  42 #include "virt.h"
  43
  44 #if BPF_FRAMEWORK
  45 #include "bpf-dlopen.h"
  46 #include "bpf-link.h"
  47 #include "bpf/restrict_fs/restrict-fs-skel.h"
  48 #endif
  49
/* Default CPU quota period: 100 milliseconds, expressed as a usec_t value in microseconds. */
#define CGROUP_CPU_QUOTA_DEFAULT_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
  51
/* Returns the log level to use when cgroup attribute writes fail. When an attribute is missing or we have access
 * problems (ENOENT, EROFS, EACCES, EPERM) we downgrade to LOG_DEBUG, everything else is logged at LOG_WARNING. This
 * is supposed to be nice to container managers and kernels which want to mask out specific attributes from us.
 * The error may be passed with either sign, since abs() is applied before the comparison. */
#define LOG_LEVEL_CGROUP_WRITE(r) (IN_SET(abs(r), ENOENT, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING)
  56
  57 uint64_t cgroup_tasks_max_resolve(const CGroupTasksMax *tasks_max) {
  58         if (tasks_max->scale == 0)
  59                 return tasks_max->value;
  60
  61         return system_tasks_max_scale(tasks_max->value, tasks_max->scale);
  62 }
  63
  64 bool manager_owns_host_root_cgroup(Manager *m) {
  65         assert(m);
  66
  67         /* Returns true if we are managing the root cgroup. Note that it isn't sufficient to just check whether the
  68          * group root path equals "/" since that will also be the case if CLONE_NEWCGROUP is in the mix. Since there's
  69          * appears to be no nice way to detect whether we are in a CLONE_NEWCGROUP namespace we instead just check if
  70          * we run in any kind of container virtualization. */
  71
  72         if (MANAGER_IS_USER(m))
  73                 return false;
  74
  75         if (detect_container() > 0)
  76                 return false;
  77
  78         return empty_or_root(m->cgroup_root);
  79 }
  80
  81 bool unit_has_startup_cgroup_constraints(Unit *u) {
  82         assert(u);
  83
  84         /* Returns true if this unit has any directives which apply during
  85          * startup/shutdown phases. */
  86
  87         CGroupContext *c;
  88
  89         c = unit_get_cgroup_context(u);
  90         if (!c)
  91                 return false;
  92
  93         return c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID ||
  94                c->startup_io_weight != CGROUP_WEIGHT_INVALID ||
  95                c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
  96                c->startup_cpuset_cpus.set ||
  97                c->startup_cpuset_mems.set ||
  98                c->startup_memory_high_set ||
  99                c->startup_memory_max_set ||
 100                c->startup_memory_swap_max_set||
 101                c->startup_memory_zswap_max_set ||
 102                c->startup_memory_low_set;
 103 }
 104
 105 bool unit_has_host_root_cgroup(Unit *u) {
 106         assert(u);
 107
 108         /* Returns whether this unit manages the root cgroup. This will return true if this unit is the root slice and
 109          * the manager manages the root cgroup. */
 110
 111         if (!manager_owns_host_root_cgroup(u->manager))
 112                 return false;
 113
 114         return unit_has_name(u, SPECIAL_ROOT_SLICE);
 115 }
 116
 117 static int set_attribute_and_warn(Unit *u, const char *controller, const char *attribute, const char *value) {
 118         int r;
 119
 120         assert(u);
 121
 122         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
 123         if (!crt || !crt->cgroup_path)
 124                 return -EOWNERDEAD;
 125
 126         r = cg_set_attribute(controller, crt->cgroup_path, attribute, value);
 127         if (r < 0)
 128                 log_unit_full_errno(u, LOG_LEVEL_CGROUP_WRITE(r), r, "Failed to set '%s' attribute on '%s' to '%.*s': %m",
 129                                     strna(attribute), empty_to_root(crt->cgroup_path), (int) strcspn(value, NEWLINE), value);
 130
 131         return r;
 132 }
 133
 134 static void cgroup_compat_warn(void) {
 135         static bool cgroup_compat_warned = false;
 136
 137         if (cgroup_compat_warned)
 138                 return;
 139
 140         log_warning("cgroup compatibility translation between legacy and unified hierarchy settings activated. "
 141                     "See cgroup-compat debug messages for details.");
 142
 143         cgroup_compat_warned = true;
 144 }
 145
/* Logs a per-unit debug message prefixed with "cgroup-compat: ". Before doing so it calls
 * cgroup_compat_warn(), so the generic warning is issued once no matter how many messages follow. */
#define log_cgroup_compat(unit, fmt, ...) do {                                  \
                cgroup_compat_warn();                                           \
                log_unit_debug(unit, "cgroup-compat: " fmt, ##__VA_ARGS__);     \
        } while (false)
 150
 151 void cgroup_context_init(CGroupContext *c) {
 152         assert(c);
 153
 154         /* Initialize everything to the kernel defaults. When initializing a bool member to 'true', make
 155          * sure to serialize in execute-serialize.c using serialize_bool() instead of
 156          * serialize_bool_elide(), as sd-executor will initialize here to 'true', but serialize_bool_elide()
 157          * skips serialization if the value is 'false' (as that's the common default), so if the value at
 158          * runtime is zero it would be lost after deserialization. Same when initializing uint64_t and other
 159          * values, update/add a conditional serialization check. This is to minimize the amount of
 160          * serialized data that is sent to the sd-executor, so that there is less work to do on the default
 161          * cases. */
 162
 163         *c = (CGroupContext) {
 164                 .cpu_weight = CGROUP_WEIGHT_INVALID,
 165                 .startup_cpu_weight = CGROUP_WEIGHT_INVALID,
 166                 .cpu_quota_per_sec_usec = USEC_INFINITY,
 167                 .cpu_quota_period_usec = USEC_INFINITY,
 168
 169                 .cpu_shares = CGROUP_CPU_SHARES_INVALID,
 170                 .startup_cpu_shares = CGROUP_CPU_SHARES_INVALID,
 171
 172                 .memory_high = CGROUP_LIMIT_MAX,
 173                 .startup_memory_high = CGROUP_LIMIT_MAX,
 174                 .memory_max = CGROUP_LIMIT_MAX,
 175                 .startup_memory_max = CGROUP_LIMIT_MAX,
 176                 .memory_swap_max = CGROUP_LIMIT_MAX,
 177                 .startup_memory_swap_max = CGROUP_LIMIT_MAX,
 178                 .memory_zswap_max = CGROUP_LIMIT_MAX,
 179                 .startup_memory_zswap_max = CGROUP_LIMIT_MAX,
 180
 181                 .memory_limit = CGROUP_LIMIT_MAX,
 182
 183                 .memory_zswap_writeback = true,
 184
 185                 .io_weight = CGROUP_WEIGHT_INVALID,
 186                 .startup_io_weight = CGROUP_WEIGHT_INVALID,
 187
 188                 .blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID,
 189                 .startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID,
 190
 191                 .tasks_max = CGROUP_TASKS_MAX_UNSET,
 192
 193                 .moom_swap = MANAGED_OOM_AUTO,
 194                 .moom_mem_pressure = MANAGED_OOM_AUTO,
 195                 .moom_preference = MANAGED_OOM_PREFERENCE_NONE,
 196
 197                 .memory_pressure_watch = _CGROUP_PRESSURE_WATCH_INVALID,
 198                 .memory_pressure_threshold_usec = USEC_INFINITY,
 199         };
 200 }
 201
 202 int cgroup_context_add_io_device_weight_dup(CGroupContext *c, const CGroupIODeviceWeight *w) {
 203         _cleanup_free_ CGroupIODeviceWeight *n = NULL;
 204
 205         assert(c);
 206         assert(w);
 207
 208         n = new(CGroupIODeviceWeight, 1);
 209         if (!n)
 210                 return -ENOMEM;
 211
 212         *n = (CGroupIODeviceWeight) {
 213                 .path = strdup(w->path),
 214                 .weight = w->weight,
 215         };
 216         if (!n->path)
 217                 return -ENOMEM;
 218
 219         LIST_PREPEND(device_weights, c->io_device_weights, TAKE_PTR(n));
 220         return 0;
 221 }
 222
 223 int cgroup_context_add_io_device_limit_dup(CGroupContext *c, const CGroupIODeviceLimit *l) {
 224         _cleanup_free_ CGroupIODeviceLimit *n = NULL;
 225
 226         assert(c);
 227         assert(l);
 228
 229         n = new0(CGroupIODeviceLimit, 1);
 230         if (!n)
 231                 return -ENOMEM;
 232
 233         n->path = strdup(l->path);
 234         if (!n->path)
 235                 return -ENOMEM;
 236
 237         for (CGroupIOLimitType type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
 238                 n->limits[type] = l->limits[type];
 239
 240         LIST_PREPEND(device_limits, c->io_device_limits, TAKE_PTR(n));
 241         return 0;
 242 }
 243
 244 int cgroup_context_add_io_device_latency_dup(CGroupContext *c, const CGroupIODeviceLatency *l) {
 245         _cleanup_free_ CGroupIODeviceLatency *n = NULL;
 246
 247         assert(c);
 248         assert(l);
 249
 250         n = new(CGroupIODeviceLatency, 1);
 251         if (!n)
 252                 return -ENOMEM;
 253
 254         *n = (CGroupIODeviceLatency) {
 255                 .path = strdup(l->path),
 256                 .target_usec = l->target_usec,
 257         };
 258         if (!n->path)
 259                 return -ENOMEM;
 260
 261         LIST_PREPEND(device_latencies, c->io_device_latencies, TAKE_PTR(n));
 262         return 0;
 263 }
 264
 265 int cgroup_context_add_block_io_device_weight_dup(CGroupContext *c, const CGroupBlockIODeviceWeight *w) {
 266         _cleanup_free_ CGroupBlockIODeviceWeight *n = NULL;
 267
 268         assert(c);
 269         assert(w);
 270
 271         n = new(CGroupBlockIODeviceWeight, 1);
 272         if (!n)
 273                 return -ENOMEM;
 274
 275         *n = (CGroupBlockIODeviceWeight) {
 276                 .path = strdup(w->path),
 277                 .weight = w->weight,
 278         };
 279         if (!n->path)
 280                 return -ENOMEM;
 281
 282         LIST_PREPEND(device_weights, c->blockio_device_weights, TAKE_PTR(n));
 283         return 0;
 284 }
 285
 286 int cgroup_context_add_block_io_device_bandwidth_dup(CGroupContext *c, const CGroupBlockIODeviceBandwidth *b) {
 287         _cleanup_free_ CGroupBlockIODeviceBandwidth *n = NULL;
 288
 289         assert(c);
 290         assert(b);
 291
 292         n = new(CGroupBlockIODeviceBandwidth, 1);
 293         if (!n)
 294                 return -ENOMEM;
 295
 296         *n = (CGroupBlockIODeviceBandwidth) {
 297                 .rbps = b->rbps,
 298                 .wbps = b->wbps,
 299         };
 300
 301         LIST_PREPEND(device_bandwidths, c->blockio_device_bandwidths, TAKE_PTR(n));
 302         return 0;
 303 }
 304
 305 int cgroup_context_add_device_allow_dup(CGroupContext *c, const CGroupDeviceAllow *a) {
 306         _cleanup_free_ CGroupDeviceAllow *n = NULL;
 307
 308         assert(c);
 309         assert(a);
 310
 311         n = new(CGroupDeviceAllow, 1);
 312         if (!n)
 313                 return -ENOMEM;
 314
 315         *n = (CGroupDeviceAllow) {
 316                 .path = strdup(a->path),
 317                 .permissions = a->permissions,
 318         };
 319         if (!n->path)
 320                 return -ENOMEM;
 321
 322         LIST_PREPEND(device_allow, c->device_allow, TAKE_PTR(n));
 323         return 0;
 324 }
 325
 326 static int cgroup_context_add_socket_bind_item_dup(CGroupContext *c, const CGroupSocketBindItem *i, CGroupSocketBindItem *h) {
 327         _cleanup_free_ CGroupSocketBindItem *n = NULL;
 328
 329         assert(c);
 330         assert(i);
 331
 332         n = new(CGroupSocketBindItem, 1);
 333         if (!n)
 334                 return -ENOMEM;
 335
 336         *n = (CGroupSocketBindItem) {
 337                 .address_family = i->address_family,
 338                 .ip_protocol    = i->ip_protocol,
 339                 .nr_ports       = i->nr_ports,
 340                 .port_min       = i->port_min,
 341         };
 342
 343         LIST_PREPEND(socket_bind_items, h, TAKE_PTR(n));
 344         return 0;
 345 }
 346
 347 int cgroup_context_add_socket_bind_item_allow_dup(CGroupContext *c, const CGroupSocketBindItem *i) {
 348         return cgroup_context_add_socket_bind_item_dup(c, i, c->socket_bind_allow);
 349 }
 350
 351 int cgroup_context_add_socket_bind_item_deny_dup(CGroupContext *c, const CGroupSocketBindItem *i) {
 352         return cgroup_context_add_socket_bind_item_dup(c, i, c->socket_bind_deny);
 353 }
 354
/* Copies the settings of 'src' into 'dst'. Scalar members are assigned directly; lists, sets and string
 * vectors are duplicated entry by entry via the *_dup() helpers, in_addr_prefix_add(), set_put_strdup()
 * and strv_copy().
 *
 * Returns 0 on success or a negative errno-style error (e.g. -ENOMEM). On failure the function returns
 * immediately, so 'dst' is left partially filled; nothing added so far is rolled back here.
 *
 * NOTE(review): not every CGroupContext member touched elsewhere in this file is copied here (e.g.
 * moom_*, memory_pressure_*, delegate_subgroup, nft_set_context) — confirm this is intentional. */
int cgroup_context_copy(CGroupContext *dst, const CGroupContext *src) {
        struct in_addr_prefix *i;
        char *iface;
        int r;

        assert(src);
        assert(dst);

        /* Accounting switches */
        dst->cpu_accounting = src->cpu_accounting;
        dst->io_accounting = src->io_accounting;
        dst->blockio_accounting = src->blockio_accounting;
        dst->memory_accounting = src->memory_accounting;
        dst->tasks_accounting = src->tasks_accounting;
        dst->ip_accounting = src->ip_accounting;

        dst->memory_oom_group = src->memory_oom_group;

        /* CPU weight and quota */
        dst->cpu_weight = src->cpu_weight;
        dst->startup_cpu_weight = src->startup_cpu_weight;
        dst->cpu_quota_per_sec_usec = src->cpu_quota_per_sec_usec;
        dst->cpu_quota_period_usec = src->cpu_quota_period_usec;

        /* NOTE(review): these are plain struct assignments, while cgroup_context_done() calls
         * cpu_set_reset() on each of these members. If the CPU set type owns heap memory, 'dst' and
         * 'src' would share it after this point — confirm whether a deep copy is needed. */
        dst->cpuset_cpus = src->cpuset_cpus;
        dst->startup_cpuset_cpus = src->startup_cpuset_cpus;
        dst->cpuset_mems = src->cpuset_mems;
        dst->startup_cpuset_mems = src->startup_cpuset_mems;

        dst->io_weight = src->io_weight;
        dst->startup_io_weight = src->startup_io_weight;

        /* The source lists are walked from the tail backwards because the *_dup() helpers prepend; that
         * way the copy ends up in the same order as the original. */
        LIST_FOREACH_BACKWARDS(device_weights, w, LIST_FIND_TAIL(device_weights, src->io_device_weights)) {
                r = cgroup_context_add_io_device_weight_dup(dst, w);
                if (r < 0)
                        return r;
        }

        LIST_FOREACH_BACKWARDS(device_limits, l, LIST_FIND_TAIL(device_limits, src->io_device_limits)) {
                r = cgroup_context_add_io_device_limit_dup(dst, l);
                if (r < 0)
                        return r;
        }

        LIST_FOREACH_BACKWARDS(device_latencies, l, LIST_FIND_TAIL(device_latencies, src->io_device_latencies)) {
                r = cgroup_context_add_io_device_latency_dup(dst, l);
                if (r < 0)
                        return r;
        }

        /* Memory settings: the values first ... */
        dst->default_memory_min = src->default_memory_min;
        dst->default_memory_low = src->default_memory_low;
        dst->default_startup_memory_low = src->default_startup_memory_low;
        dst->memory_min = src->memory_min;
        dst->memory_low = src->memory_low;
        dst->startup_memory_low = src->startup_memory_low;
        dst->memory_high = src->memory_high;
        dst->startup_memory_high = src->startup_memory_high;
        dst->memory_max = src->memory_max;
        dst->startup_memory_max = src->startup_memory_max;
        dst->memory_swap_max = src->memory_swap_max;
        dst->startup_memory_swap_max = src->startup_memory_swap_max;
        dst->memory_zswap_max = src->memory_zswap_max;
        dst->startup_memory_zswap_max = src->startup_memory_zswap_max;

        /* ... then the flags recording which of them were explicitly set. */
        dst->default_memory_min_set = src->default_memory_min_set;
        dst->default_memory_low_set = src->default_memory_low_set;
        dst->default_startup_memory_low_set = src->default_startup_memory_low_set;
        dst->memory_min_set = src->memory_min_set;
        dst->memory_low_set = src->memory_low_set;
        dst->startup_memory_low_set = src->startup_memory_low_set;
        dst->startup_memory_high_set = src->startup_memory_high_set;
        dst->startup_memory_max_set = src->startup_memory_max_set;
        dst->startup_memory_swap_max_set = src->startup_memory_swap_max_set;
        dst->startup_memory_zswap_max_set = src->startup_memory_zswap_max_set;
        dst->memory_zswap_writeback = src->memory_zswap_writeback;

        /* IP address allow/deny sets */
        SET_FOREACH(i, src->ip_address_allow) {
                r = in_addr_prefix_add(&dst->ip_address_allow, i);
                if (r < 0)
                        return r;
        }

        SET_FOREACH(i, src->ip_address_deny) {
                r = in_addr_prefix_add(&dst->ip_address_deny, i);
                if (r < 0)
                        return r;
        }

        dst->ip_address_allow_reduced = src->ip_address_allow_reduced;
        dst->ip_address_deny_reduced = src->ip_address_deny_reduced;

        /* IP filter lists are only duplicated when non-empty; otherwise dst's member is left untouched. */
        if (!strv_isempty(src->ip_filters_ingress)) {
                dst->ip_filters_ingress = strv_copy(src->ip_filters_ingress);
                if (!dst->ip_filters_ingress)
                        return -ENOMEM;
        }

        if (!strv_isempty(src->ip_filters_egress)) {
                dst->ip_filters_egress = strv_copy(src->ip_filters_egress);
                if (!dst->ip_filters_egress)
                        return -ENOMEM;
        }

        LIST_FOREACH_BACKWARDS(programs, l, LIST_FIND_TAIL(programs, src->bpf_foreign_programs)) {
                r = cgroup_context_add_bpf_foreign_program_dup(dst, l);
                if (r < 0)
                        return r;
        }

        SET_FOREACH(iface, src->restrict_network_interfaces) {
                r = set_put_strdup(&dst->restrict_network_interfaces, iface);
                if (r < 0)
                        return r;
        }
        dst->restrict_network_interfaces_is_allow_list = src->restrict_network_interfaces_is_allow_list;

        /* CPU shares, block IO and memory_limit settings */
        dst->cpu_shares = src->cpu_shares;
        dst->startup_cpu_shares = src->startup_cpu_shares;

        dst->blockio_weight = src->blockio_weight;
        dst->startup_blockio_weight = src->startup_blockio_weight;

        LIST_FOREACH_BACKWARDS(device_weights, l, LIST_FIND_TAIL(device_weights, src->blockio_device_weights)) {
                r = cgroup_context_add_block_io_device_weight_dup(dst, l);
                if (r < 0)
                        return r;
        }

        LIST_FOREACH_BACKWARDS(device_bandwidths, l, LIST_FIND_TAIL(device_bandwidths, src->blockio_device_bandwidths)) {
                r = cgroup_context_add_block_io_device_bandwidth_dup(dst, l);
                if (r < 0)
                        return r;
        }

        dst->memory_limit = src->memory_limit;

        /* Device policy and allow list */
        dst->device_policy = src->device_policy;
        LIST_FOREACH_BACKWARDS(device_allow, l, LIST_FIND_TAIL(device_allow, src->device_allow)) {
                r = cgroup_context_add_device_allow_dup(dst, l);
                if (r < 0)
                        return r;
        }

        /* Socket bind allow/deny items */
        LIST_FOREACH_BACKWARDS(socket_bind_items, l, LIST_FIND_TAIL(socket_bind_items, src->socket_bind_allow)) {
                r = cgroup_context_add_socket_bind_item_allow_dup(dst, l);
                if (r < 0)
                        return r;

        }

        LIST_FOREACH_BACKWARDS(socket_bind_items, l, LIST_FIND_TAIL(socket_bind_items, src->socket_bind_deny)) {
                r = cgroup_context_add_socket_bind_item_deny_dup(dst, l);
                if (r < 0)
                        return r;
        }

        dst->tasks_max = src->tasks_max;

        return 0;
}
 514
 515 void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
 516         assert(c);
 517         assert(a);
 518
 519         LIST_REMOVE(device_allow, c->device_allow, a);
 520         free(a->path);
 521         free(a);
 522 }
 523
 524 void cgroup_context_free_io_device_weight(CGroupContext *c, CGroupIODeviceWeight *w) {
 525         assert(c);
 526         assert(w);
 527
 528         LIST_REMOVE(device_weights, c->io_device_weights, w);
 529         free(w->path);
 530         free(w);
 531 }
 532
 533 void cgroup_context_free_io_device_latency(CGroupContext *c, CGroupIODeviceLatency *l) {
 534         assert(c);
 535         assert(l);
 536
 537         LIST_REMOVE(device_latencies, c->io_device_latencies, l);
 538         free(l->path);
 539         free(l);
 540 }
 541
 542 void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit *l) {
 543         assert(c);
 544         assert(l);
 545
 546         LIST_REMOVE(device_limits, c->io_device_limits, l);
 547         free(l->path);
 548         free(l);
 549 }
 550
 551 void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
 552         assert(c);
 553         assert(w);
 554
 555         LIST_REMOVE(device_weights, c->blockio_device_weights, w);
 556         free(w->path);
 557         free(w);
 558 }
 559
 560 void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
 561         assert(c);
 562         assert(b);
 563
 564         LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
 565         free(b->path);
 566         free(b);
 567 }
 568
 569 void cgroup_context_remove_bpf_foreign_program(CGroupContext *c, CGroupBPFForeignProgram *p) {
 570         assert(c);
 571         assert(p);
 572
 573         LIST_REMOVE(programs, c->bpf_foreign_programs, p);
 574         free(p->bpffs_path);
 575         free(p);
 576 }
 577
 578 void cgroup_context_remove_socket_bind(CGroupSocketBindItem **head) {
 579         assert(head);
 580
 581         LIST_CLEAR(socket_bind_items, *head, free);
 582 }
 583
 584 void cgroup_context_done(CGroupContext *c) {
 585         assert(c);
 586
 587         while (c->io_device_weights)
 588                 cgroup_context_free_io_device_weight(c, c->io_device_weights);
 589
 590         while (c->io_device_latencies)
 591                 cgroup_context_free_io_device_latency(c, c->io_device_latencies);
 592
 593         while (c->io_device_limits)
 594                 cgroup_context_free_io_device_limit(c, c->io_device_limits);
 595
 596         while (c->blockio_device_weights)
 597                 cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
 598
 599         while (c->blockio_device_bandwidths)
 600                 cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);
 601
 602         while (c->device_allow)
 603                 cgroup_context_free_device_allow(c, c->device_allow);
 604
 605         cgroup_context_remove_socket_bind(&c->socket_bind_allow);
 606         cgroup_context_remove_socket_bind(&c->socket_bind_deny);
 607
 608         c->ip_address_allow = set_free(c->ip_address_allow);
 609         c->ip_address_deny = set_free(c->ip_address_deny);
 610
 611         c->ip_filters_ingress = strv_free(c->ip_filters_ingress);
 612         c->ip_filters_egress = strv_free(c->ip_filters_egress);
 613
 614         while (c->bpf_foreign_programs)
 615                 cgroup_context_remove_bpf_foreign_program(c, c->bpf_foreign_programs);
 616
 617         c->restrict_network_interfaces = set_free_free(c->restrict_network_interfaces);
 618
 619         cpu_set_reset(&c->cpuset_cpus);
 620         cpu_set_reset(&c->startup_cpuset_cpus);
 621         cpu_set_reset(&c->cpuset_mems);
 622         cpu_set_reset(&c->startup_cpuset_mems);
 623
 624         c->delegate_subgroup = mfree(c->delegate_subgroup);
 625
 626         nft_set_context_clear(&c->nft_set_context);
 627 }
 628
 629 static int unit_get_kernel_memory_limit(Unit *u, const char *file, uint64_t *ret) {
 630         assert(u);
 631
 632         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
 633         if (!crt || !crt->cgroup_path)
 634                 return -EOWNERDEAD;
 635
 636         return cg_get_attribute_as_uint64("memory", crt->cgroup_path, file, ret);
 637 }
 638
 639 static int unit_compare_memory_limit(Unit *u, const char *property_name, uint64_t *ret_unit_value, uint64_t *ret_kernel_value) {
 640         CGroupContext *c;
 641         CGroupMask m;
 642         const char *file;
 643         uint64_t unit_value;
 644         int r;
 645
 646         /* Compare kernel memcg configuration against our internal systemd state. Unsupported (and will
 647          * return -ENODATA) on cgroup v1.
 648          *
 649          * Returns:
 650          *
 651          * <0: On error.
 652          *  0: If the kernel memory setting doesn't match our configuration.
 653          * >0: If the kernel memory setting matches our configuration.
 654          *
 655          * The following values are only guaranteed to be populated on return >=0:
 656          *
 657          * - ret_unit_value will contain our internal expected value for the unit, page-aligned.
 658          * - ret_kernel_value will contain the actual value presented by the kernel. */
 659
 660         assert(u);
 661
 662         r = cg_all_unified();
 663         if (r < 0)
 664                 return log_debug_errno(r, "Failed to determine cgroup hierarchy version: %m");
 665
 666         /* Unsupported on v1.
 667          *
 668          * We don't return ENOENT, since that could actually mask a genuine problem where somebody else has
 669          * silently masked the controller. */
 670         if (r == 0)
 671                 return -ENODATA;
 672
 673         /* The root slice doesn't have any controller files, so we can't compare anything. */
 674         if (unit_has_name(u, SPECIAL_ROOT_SLICE))
 675                 return -ENODATA;
 676
 677         /* It's possible to have MemoryFoo set without systemd wanting to have the memory controller enabled,
 678          * for example, in the case of DisableControllers= or cgroup_disable on the kernel command line. To
 679          * avoid specious errors in these scenarios, check that we even expect the memory controller to be
 680          * enabled at all. */
 681         m = unit_get_target_mask(u);
 682         if (!FLAGS_SET(m, CGROUP_MASK_MEMORY))
 683                 return -ENODATA;
 684
 685         assert_se(c = unit_get_cgroup_context(u));
 686
 687         bool startup = u->manager && IN_SET(manager_state(u->manager), MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING);
 688
 689         if (streq(property_name, "MemoryLow")) {
 690                 unit_value = unit_get_ancestor_memory_low(u);
 691                 file = "memory.low";
 692         } else if (startup && streq(property_name, "StartupMemoryLow")) {
 693                 unit_value = unit_get_ancestor_startup_memory_low(u);
 694                 file = "memory.low";
 695         } else if (streq(property_name, "MemoryMin")) {
 696                 unit_value = unit_get_ancestor_memory_min(u);
 697                 file = "memory.min";
 698         } else if (streq(property_name, "MemoryHigh")) {
 699                 unit_value = c->memory_high;
 700                 file = "memory.high";
 701         } else if (startup && streq(property_name, "StartupMemoryHigh")) {
 702                 unit_value = c->startup_memory_high;
 703                 file = "memory.high";
 704         } else if (streq(property_name, "MemoryMax")) {
 705                 unit_value = c->memory_max;
 706                 file = "memory.max";
 707         } else if (startup && streq(property_name, "StartupMemoryMax")) {
 708                 unit_value = c->startup_memory_max;
 709                 file = "memory.max";
 710         } else if (streq(property_name, "MemorySwapMax")) {
 711                 unit_value = c->memory_swap_max;
 712                 file = "memory.swap.max";
 713         } else if (startup && streq(property_name, "StartupMemorySwapMax")) {
 714                 unit_value = c->startup_memory_swap_max;
 715                 file = "memory.swap.max";
 716         } else if (streq(property_name, "MemoryZSwapMax")) {
 717                 unit_value = c->memory_zswap_max;
 718                 file = "memory.zswap.max";
 719         } else if (startup && streq(property_name, "StartupMemoryZSwapMax")) {
 720                 unit_value = c->startup_memory_zswap_max;
 721                 file = "memory.zswap.max";
 722         } else
 723                 return -EINVAL;
 724
 725         r = unit_get_kernel_memory_limit(u, file, ret_kernel_value);
 726         if (r < 0)
 727                 return log_unit_debug_errno(u, r, "Failed to parse %s: %m", file);
 728
 729         /* It's intended (soon) in a future kernel to not expose cgroup memory limits rounded to page
 730          * boundaries, but instead separate the user-exposed limit, which is whatever userspace told us, from
 731          * our internal page-counting. To support those future kernels, just check the value itself first
 732          * without any page-alignment. */
 733         if (*ret_kernel_value == unit_value) {
 734                 *ret_unit_value = unit_value;
 735                 return 1;
 736         }
 737
 738         /* The current kernel behaviour, by comparison, is that even if you write a particular number of
 739          * bytes into a cgroup memory file, it always returns that number page-aligned down (since the kernel
 740          * internally stores cgroup limits in pages). As such, so long as it aligns properly, everything is
 741          * cricket. */
 742         if (unit_value != CGROUP_LIMIT_MAX)
 743                 unit_value = PAGE_ALIGN_DOWN(unit_value);
 744
 745         *ret_unit_value = unit_value;
 746
 747         return *ret_kernel_value == *ret_unit_value;
 748 }
 749
/* NOTE(review): presumably the size of the buffer callers hand to
 * format_cgroup_memory_limit_comparison() — confirm against the call sites. */
#define FORMAT_CGROUP_DIFF_MAX 128
 751
 752 static char *format_cgroup_memory_limit_comparison(Unit *u, const char *property_name, char *buf, size_t l) {
 753         uint64_t kval, sval;
 754         int r;
 755
 756         assert(u);
 757         assert(property_name);
 758         assert(buf);
 759         assert(l > 0);
 760
 761         r = unit_compare_memory_limit(u, property_name, &sval, &kval);
 762
 763         /* memory.swap.max is special in that it relies on CONFIG_MEMCG_SWAP (and the default swapaccount=1).
 764          * In the absence of reliably being able to detect whether memcg swap support is available or not,
 765          * only complain if the error is not ENOENT. This is similarly the case for memory.zswap.max relying
 766          * on CONFIG_ZSWAP. */
 767         if (r > 0 || IN_SET(r, -ENODATA, -EOWNERDEAD) ||
 768             (r == -ENOENT && STR_IN_SET(property_name,
 769                                         "MemorySwapMax",
 770                                         "StartupMemorySwapMax",
 771                                         "MemoryZSwapMax",
 772                                         "StartupMemoryZSwapMax")))
 773                 buf[0] = 0;
 774         else if (r < 0) {
 775                 errno = -r;
 776                 (void) snprintf(buf, l, " (error getting kernel value: %m)");
 777         } else
 778                 (void) snprintf(buf, l, " (different value in kernel: %" PRIu64 ")", kval);
 779
 780         return buf;
 781 }
 782
 783 const char *cgroup_device_permissions_to_string(CGroupDevicePermissions p) {
 784         static const char *table[_CGROUP_DEVICE_PERMISSIONS_MAX] = {
 785                 /* Lets simply define a table with every possible combination. As long as those are just 8 we
 786                  * can get away with it. If this ever grows to more we need to revisit this logic though. */
 787                 [0]                                                          = "",
 788                 [CGROUP_DEVICE_READ]                                         = "r",
 789                 [CGROUP_DEVICE_WRITE]                                        = "w",
 790                 [CGROUP_DEVICE_MKNOD]                                        = "m",
 791                 [CGROUP_DEVICE_READ|CGROUP_DEVICE_WRITE]                     = "rw",
 792                 [CGROUP_DEVICE_READ|CGROUP_DEVICE_MKNOD]                     = "rm",
 793                 [CGROUP_DEVICE_WRITE|CGROUP_DEVICE_MKNOD]                    = "wm",
 794                 [CGROUP_DEVICE_READ|CGROUP_DEVICE_WRITE|CGROUP_DEVICE_MKNOD] = "rwm",
 795         };
 796
 797         if (p < 0 || p >= _CGROUP_DEVICE_PERMISSIONS_MAX)
 798                 return NULL;
 799
 800         return table[p];
 801 }
 802
 803 CGroupDevicePermissions cgroup_device_permissions_from_string(const char *s) {
 804         CGroupDevicePermissions p = 0;
 805
 806         if (!s)
 807                 return _CGROUP_DEVICE_PERMISSIONS_INVALID;
 808
 809         for (const char *c = s; *c; c++) {
 810                 if (*c == 'r')
 811                         p |= CGROUP_DEVICE_READ;
 812                 else if (*c == 'w')
 813                         p |= CGROUP_DEVICE_WRITE;
 814                 else if (*c == 'm')
 815                         p |= CGROUP_DEVICE_MKNOD;
 816                 else
 817                         return _CGROUP_DEVICE_PERMISSIONS_INVALID;
 818         }
 819
 820         return p;
 821 }
 822
 823 void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) {
 824         _cleanup_free_ char *disable_controllers_str = NULL, *delegate_controllers_str = NULL, *cpuset_cpus = NULL, *cpuset_mems = NULL, *startup_cpuset_cpus = NULL, *startup_cpuset_mems = NULL;
 825         CGroupContext *c;
 826         struct in_addr_prefix *iaai;
 827         char cda[FORMAT_CGROUP_DIFF_MAX], cdb[FORMAT_CGROUP_DIFF_MAX], cdc[FORMAT_CGROUP_DIFF_MAX], cdd[FORMAT_CGROUP_DIFF_MAX],
 828                 cde[FORMAT_CGROUP_DIFF_MAX], cdf[FORMAT_CGROUP_DIFF_MAX], cdg[FORMAT_CGROUP_DIFF_MAX], cdh[FORMAT_CGROUP_DIFF_MAX],
 829                 cdi[FORMAT_CGROUP_DIFF_MAX], cdj[FORMAT_CGROUP_DIFF_MAX], cdk[FORMAT_CGROUP_DIFF_MAX];
 830
 831         assert(u);
 832         assert(f);
 833
 834         assert_se(c = unit_get_cgroup_context(u));
 835
 836         prefix = strempty(prefix);
 837
 838         (void) cg_mask_to_string(c->disable_controllers, &disable_controllers_str);
 839         (void) cg_mask_to_string(c->delegate_controllers, &delegate_controllers_str);
 840
 841         /* "Delegate=" means "yes, but no controllers". Show this as "(none)". */
 842         const char *delegate_str = delegate_controllers_str ?: c->delegate ? "(none)" : "no";
 843
 844         cpuset_cpus = cpu_set_to_range_string(&c->cpuset_cpus);
 845         startup_cpuset_cpus = cpu_set_to_range_string(&c->startup_cpuset_cpus);
 846         cpuset_mems = cpu_set_to_range_string(&c->cpuset_mems);
 847         startup_cpuset_mems = cpu_set_to_range_string(&c->startup_cpuset_mems);
 848
 849         fprintf(f,
 850                 "%sCPUAccounting: %s\n"
 851                 "%sIOAccounting: %s\n"
 852                 "%sBlockIOAccounting: %s\n"
 853                 "%sMemoryAccounting: %s\n"
 854                 "%sTasksAccounting: %s\n"
 855                 "%sIPAccounting: %s\n"
 856                 "%sCPUWeight: %" PRIu64 "\n"
 857                 "%sStartupCPUWeight: %" PRIu64 "\n"
 858                 "%sCPUShares: %" PRIu64 "\n"
 859                 "%sStartupCPUShares: %" PRIu64 "\n"
 860                 "%sCPUQuotaPerSecSec: %s\n"
 861                 "%sCPUQuotaPeriodSec: %s\n"
 862                 "%sAllowedCPUs: %s\n"
 863                 "%sStartupAllowedCPUs: %s\n"
 864                 "%sAllowedMemoryNodes: %s\n"
 865                 "%sStartupAllowedMemoryNodes: %s\n"
 866                 "%sIOWeight: %" PRIu64 "\n"
 867                 "%sStartupIOWeight: %" PRIu64 "\n"
 868                 "%sBlockIOWeight: %" PRIu64 "\n"
 869                 "%sStartupBlockIOWeight: %" PRIu64 "\n"
 870                 "%sDefaultMemoryMin: %" PRIu64 "\n"
 871                 "%sDefaultMemoryLow: %" PRIu64 "\n"
 872                 "%sMemoryMin: %" PRIu64 "%s\n"
 873                 "%sMemoryLow: %" PRIu64 "%s\n"
 874                 "%sStartupMemoryLow: %" PRIu64 "%s\n"
 875                 "%sMemoryHigh: %" PRIu64 "%s\n"
 876                 "%sStartupMemoryHigh: %" PRIu64 "%s\n"
 877                 "%sMemoryMax: %" PRIu64 "%s\n"
 878                 "%sStartupMemoryMax: %" PRIu64 "%s\n"
 879                 "%sMemorySwapMax: %" PRIu64 "%s\n"
 880                 "%sStartupMemorySwapMax: %" PRIu64 "%s\n"
 881                 "%sMemoryZSwapMax: %" PRIu64 "%s\n"
 882                 "%sStartupMemoryZSwapMax: %" PRIu64 "%s\n"
 883                 "%sMemoryZSwapWriteback: %s\n"
 884                 "%sMemoryLimit: %" PRIu64 "\n"
 885                 "%sTasksMax: %" PRIu64 "\n"
 886                 "%sDevicePolicy: %s\n"
 887                 "%sDisableControllers: %s\n"
 888                 "%sDelegate: %s\n"
 889                 "%sManagedOOMSwap: %s\n"
 890                 "%sManagedOOMMemoryPressure: %s\n"
 891                 "%sManagedOOMMemoryPressureLimit: " PERMYRIAD_AS_PERCENT_FORMAT_STR "\n"
 892                 "%sManagedOOMPreference: %s\n"
 893                 "%sMemoryPressureWatch: %s\n"
 894                 "%sCoredumpReceive: %s\n",
 895                 prefix, yes_no(c->cpu_accounting),
 896                 prefix, yes_no(c->io_accounting),
 897                 prefix, yes_no(c->blockio_accounting),
 898                 prefix, yes_no(c->memory_accounting),
 899                 prefix, yes_no(c->tasks_accounting),
 900                 prefix, yes_no(c->ip_accounting),
 901                 prefix, c->cpu_weight,
 902                 prefix, c->startup_cpu_weight,
 903                 prefix, c->cpu_shares,
 904                 prefix, c->startup_cpu_shares,
 905                 prefix, FORMAT_TIMESPAN(c->cpu_quota_per_sec_usec, 1),
 906                 prefix, FORMAT_TIMESPAN(c->cpu_quota_period_usec, 1),
 907                 prefix, strempty(cpuset_cpus),
 908                 prefix, strempty(startup_cpuset_cpus),
 909                 prefix, strempty(cpuset_mems),
 910                 prefix, strempty(startup_cpuset_mems),
 911                 prefix, c->io_weight,
 912                 prefix, c->startup_io_weight,
 913                 prefix, c->blockio_weight,
 914                 prefix, c->startup_blockio_weight,
 915                 prefix, c->default_memory_min,
 916                 prefix, c->default_memory_low,
 917                 prefix, c->memory_min, format_cgroup_memory_limit_comparison(u, "MemoryMin", cda, sizeof(cda)),
 918                 prefix, c->memory_low, format_cgroup_memory_limit_comparison(u, "MemoryLow", cdb, sizeof(cdb)),
 919                 prefix, c->startup_memory_low, format_cgroup_memory_limit_comparison(u, "StartupMemoryLow", cdc, sizeof(cdc)),
 920                 prefix, c->memory_high, format_cgroup_memory_limit_comparison(u, "MemoryHigh", cdd, sizeof(cdd)),
 921                 prefix, c->startup_memory_high, format_cgroup_memory_limit_comparison(u, "StartupMemoryHigh", cde, sizeof(cde)),
 922                 prefix, c->memory_max, format_cgroup_memory_limit_comparison(u, "MemoryMax", cdf, sizeof(cdf)),
 923                 prefix, c->startup_memory_max, format_cgroup_memory_limit_comparison(u, "StartupMemoryMax", cdg, sizeof(cdg)),
 924                 prefix, c->memory_swap_max, format_cgroup_memory_limit_comparison(u, "MemorySwapMax", cdh, sizeof(cdh)),
 925                 prefix, c->startup_memory_swap_max, format_cgroup_memory_limit_comparison(u, "StartupMemorySwapMax", cdi, sizeof(cdi)),
 926                 prefix, c->memory_zswap_max, format_cgroup_memory_limit_comparison(u, "MemoryZSwapMax", cdj, sizeof(cdj)),
 927                 prefix, c->startup_memory_zswap_max, format_cgroup_memory_limit_comparison(u, "StartupMemoryZSwapMax", cdk, sizeof(cdk)),
 928                 prefix, yes_no(c->memory_zswap_writeback),
 929                 prefix, c->memory_limit,
 930                 prefix, cgroup_tasks_max_resolve(&c->tasks_max),
 931                 prefix, cgroup_device_policy_to_string(c->device_policy),
 932                 prefix, strempty(disable_controllers_str),
 933                 prefix, delegate_str,
 934                 prefix, managed_oom_mode_to_string(c->moom_swap),
 935                 prefix, managed_oom_mode_to_string(c->moom_mem_pressure),
 936                 prefix, PERMYRIAD_AS_PERCENT_FORMAT_VAL(UINT32_SCALE_TO_PERMYRIAD(c->moom_mem_pressure_limit)),
 937                 prefix, managed_oom_preference_to_string(c->moom_preference),
 938                 prefix, cgroup_pressure_watch_to_string(c->memory_pressure_watch),
 939                 prefix, yes_no(c->coredump_receive));
 940
 941         if (c->delegate_subgroup)
 942                 fprintf(f, "%sDelegateSubgroup: %s\n",
 943                         prefix, c->delegate_subgroup);
 944
 945         if (c->memory_pressure_threshold_usec != USEC_INFINITY)
 946                 fprintf(f, "%sMemoryPressureThresholdSec: %s\n",
 947                         prefix, FORMAT_TIMESPAN(c->memory_pressure_threshold_usec, 1));
 948
 949         LIST_FOREACH(device_allow, a, c->device_allow)
 950                 /* strna() below should be redundant, for avoiding -Werror=format-overflow= error. See #30223. */
 951                 fprintf(f,
 952                         "%sDeviceAllow: %s %s\n",
 953                         prefix,
 954                         a->path,
 955                         strna(cgroup_device_permissions_to_string(a->permissions)));
 956
 957         LIST_FOREACH(device_weights, iw, c->io_device_weights)
 958                 fprintf(f,
 959                         "%sIODeviceWeight: %s %" PRIu64 "\n",
 960                         prefix,
 961                         iw->path,
 962                         iw->weight);
 963
 964         LIST_FOREACH(device_latencies, l, c->io_device_latencies)
 965                 fprintf(f,
 966                         "%sIODeviceLatencyTargetSec: %s %s\n",
 967                         prefix,
 968                         l->path,
 969                         FORMAT_TIMESPAN(l->target_usec, 1));
 970
 971         LIST_FOREACH(device_limits, il, c->io_device_limits)
 972                 for (CGroupIOLimitType type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
 973                         if (il->limits[type] != cgroup_io_limit_defaults[type])
 974                                 fprintf(f,
 975                                         "%s%s: %s %s\n",
 976                                         prefix,
 977                                         cgroup_io_limit_type_to_string(type),
 978                                         il->path,
 979                                         FORMAT_BYTES(il->limits[type]));
 980
 981         LIST_FOREACH(device_weights, w, c->blockio_device_weights)
 982                 fprintf(f,
 983                         "%sBlockIODeviceWeight: %s %" PRIu64,
 984                         prefix,
 985                         w->path,
 986                         w->weight);
 987
 988         LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
 989                 if (b->rbps != CGROUP_LIMIT_MAX)
 990                         fprintf(f,
 991                                 "%sBlockIOReadBandwidth: %s %s\n",
 992                                 prefix,
 993                                 b->path,
 994                                 FORMAT_BYTES(b->rbps));
 995                 if (b->wbps != CGROUP_LIMIT_MAX)
 996                         fprintf(f,
 997                                 "%sBlockIOWriteBandwidth: %s %s\n",
 998                                 prefix,
 999                                 b->path,
1000                                 FORMAT_BYTES(b->wbps));
1001         }
1002
1003         SET_FOREACH(iaai, c->ip_address_allow)
1004                 fprintf(f, "%sIPAddressAllow: %s\n", prefix,
1005                         IN_ADDR_PREFIX_TO_STRING(iaai->family, &iaai->address, iaai->prefixlen));
1006         SET_FOREACH(iaai, c->ip_address_deny)
1007                 fprintf(f, "%sIPAddressDeny: %s\n", prefix,
1008                         IN_ADDR_PREFIX_TO_STRING(iaai->family, &iaai->address, iaai->prefixlen));
1009
1010         STRV_FOREACH(path, c->ip_filters_ingress)
1011                 fprintf(f, "%sIPIngressFilterPath: %s\n", prefix, *path);
1012         STRV_FOREACH(path, c->ip_filters_egress)
1013                 fprintf(f, "%sIPEgressFilterPath: %s\n", prefix, *path);
1014
1015         LIST_FOREACH(programs, p, c->bpf_foreign_programs)
1016                 fprintf(f, "%sBPFProgram: %s:%s",
1017                         prefix, bpf_cgroup_attach_type_to_string(p->attach_type), p->bpffs_path);
1018
1019         if (c->socket_bind_allow) {
1020                 fprintf(f, "%sSocketBindAllow: ", prefix);
1021                 cgroup_context_dump_socket_bind_items(c->socket_bind_allow, f);
1022                 fputc('\n', f);
1023         }
1024
1025         if (c->socket_bind_deny) {
1026                 fprintf(f, "%sSocketBindDeny: ", prefix);
1027                 cgroup_context_dump_socket_bind_items(c->socket_bind_deny, f);
1028                 fputc('\n', f);
1029         }
1030
1031         if (c->restrict_network_interfaces) {
1032                 char *iface;
1033                 SET_FOREACH(iface, c->restrict_network_interfaces)
1034                         fprintf(f, "%sRestrictNetworkInterfaces: %s\n", prefix, iface);
1035         }
1036
1037         FOREACH_ARRAY(nft_set, c->nft_set_context.sets, c->nft_set_context.n_sets)
1038                 fprintf(f, "%sNFTSet: %s:%s:%s:%s\n", prefix, nft_set_source_to_string(nft_set->source),
1039                         nfproto_to_string(nft_set->nfproto), nft_set->table, nft_set->set);
1040 }
1041
1042 void cgroup_context_dump_socket_bind_item(const CGroupSocketBindItem *item, FILE *f) {
1043         const char *family, *colon1, *protocol = "", *colon2 = "";
1044
1045         family = strempty(af_to_ipv4_ipv6(item->address_family));
1046         colon1 = isempty(family) ? "" : ":";
1047
1048         if (item->ip_protocol != 0) {
1049                 protocol = ip_protocol_to_tcp_udp(item->ip_protocol);
1050                 colon2 = ":";
1051         }
1052
1053         if (item->nr_ports == 0)
1054                 fprintf(f, "%s%s%s%sany", family, colon1, protocol, colon2);
1055         else if (item->nr_ports == 1)
1056                 fprintf(f, "%s%s%s%s%" PRIu16, family, colon1, protocol, colon2, item->port_min);
1057         else {
1058                 uint16_t port_max = item->port_min + item->nr_ports - 1;
1059                 fprintf(f, "%s%s%s%s%" PRIu16 "-%" PRIu16, family, colon1, protocol, colon2,
1060                         item->port_min, port_max);
1061         }
1062 }
1063
1064 void cgroup_context_dump_socket_bind_items(const CGroupSocketBindItem *items, FILE *f) {
1065         bool first = true;
1066
1067         LIST_FOREACH(socket_bind_items, bi, items) {
1068                 if (first)
1069                         first = false;
1070                 else
1071                         fputc(' ', f);
1072
1073                 cgroup_context_dump_socket_bind_item(bi, f);
1074         }
1075 }
1076
1077 int cgroup_context_add_device_allow(CGroupContext *c, const char *dev, CGroupDevicePermissions p) {
1078         _cleanup_free_ CGroupDeviceAllow *a = NULL;
1079         _cleanup_free_ char *d = NULL;
1080
1081         assert(c);
1082         assert(dev);
1083         assert(p >= 0 && p < _CGROUP_DEVICE_PERMISSIONS_MAX);
1084
1085         if (p == 0)
1086                 p = _CGROUP_DEVICE_PERMISSIONS_ALL;
1087
1088         a = new(CGroupDeviceAllow, 1);
1089         if (!a)
1090                 return -ENOMEM;
1091
1092         d = strdup(dev);
1093         if (!d)
1094                 return -ENOMEM;
1095
1096         *a = (CGroupDeviceAllow) {
1097                 .path = TAKE_PTR(d),
1098                 .permissions = p,
1099         };
1100
1101         LIST_PREPEND(device_allow, c->device_allow, a);
1102         TAKE_PTR(a);
1103
1104         return 0;
1105 }
1106
1107 int cgroup_context_add_or_update_device_allow(CGroupContext *c, const char *dev, CGroupDevicePermissions p) {
1108         assert(c);
1109         assert(dev);
1110         assert(p >= 0 && p < _CGROUP_DEVICE_PERMISSIONS_MAX);
1111
1112         if (p == 0)
1113                 p = _CGROUP_DEVICE_PERMISSIONS_ALL;
1114
1115         LIST_FOREACH(device_allow, b, c->device_allow)
1116                 if (path_equal(b->path, dev)) {
1117                         b->permissions = p;
1118                         return 0;
1119                 }
1120
1121         return cgroup_context_add_device_allow(c, dev, p);
1122 }
1123
1124 int cgroup_context_add_bpf_foreign_program(CGroupContext *c, uint32_t attach_type, const char *bpffs_path) {
1125         CGroupBPFForeignProgram *p;
1126         _cleanup_free_ char *d = NULL;
1127
1128         assert(c);
1129         assert(bpffs_path);
1130
1131         if (!path_is_normalized(bpffs_path) || !path_is_absolute(bpffs_path))
1132                 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Path is not normalized: %m");
1133
1134         d = strdup(bpffs_path);
1135         if (!d)
1136                 return log_oom();
1137
1138         p = new(CGroupBPFForeignProgram, 1);
1139         if (!p)
1140                 return log_oom();
1141
1142         *p = (CGroupBPFForeignProgram) {
1143                 .attach_type = attach_type,
1144                 .bpffs_path = TAKE_PTR(d),
1145         };
1146
1147         LIST_PREPEND(programs, c->bpf_foreign_programs, TAKE_PTR(p));
1148
1149         return 0;
1150 }
1151
1152 #define UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(entry)                       \
1153         uint64_t unit_get_ancestor_##entry(Unit *u) {                   \
1154                 CGroupContext *c;                                       \
1155                                                                         \
1156                 /* 1. Is entry set in this unit? If so, use that.       \
1157                  * 2. Is the default for this entry set in any          \
1158                  *    ancestor? If so, use that.                        \
1159                  * 3. Otherwise, return CGROUP_LIMIT_MIN. */            \
1160                                                                         \
1161                 assert(u);                                              \
1162                                                                         \
1163                 c = unit_get_cgroup_context(u);                         \
1164                 if (c && c->entry##_set)                                \
1165                         return c->entry;                                \
1166                                                                         \
1167                 while ((u = UNIT_GET_SLICE(u))) {                       \
1168                         c = unit_get_cgroup_context(u);                 \
1169                         if (c && c->default_##entry##_set)              \
1170                                 return c->default_##entry;              \
1171                 }                                                       \
1172                                                                         \
1173                 /* We've reached the root, but nobody had default for   \
1174                  * this entry set, so set it to the kernel default. */  \
1175                 return CGROUP_LIMIT_MIN;                                \
1176 }
1177
1178 UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(memory_low);
1179 UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(startup_memory_low);
1180 UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(memory_min);
1181
1182 static void unit_set_xattr_graceful(Unit *u, const char *name, const void *data, size_t size) {
1183         int r;
1184
1185         assert(u);
1186         assert(name);
1187
1188         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
1189         if (!crt || !crt->cgroup_path)
1190                 return;
1191
1192         r = cg_set_xattr(crt->cgroup_path, name, data, size, 0);
1193         if (r < 0)
1194                 log_unit_debug_errno(u, r, "Failed to set '%s' xattr on control group %s, ignoring: %m", name, empty_to_root(crt->cgroup_path));
1195 }
1196
1197 static void unit_remove_xattr_graceful(Unit *u, const char *name) {
1198         int r;
1199
1200         assert(u);
1201         assert(name);
1202
1203         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
1204         if (!crt || !crt->cgroup_path)
1205                 return;
1206
1207         r = cg_remove_xattr(crt->cgroup_path, name);
1208         if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r))
1209                 log_unit_debug_errno(u, r, "Failed to remove '%s' xattr flag on control group %s, ignoring: %m", name, empty_to_root(crt->cgroup_path));
1210 }
1211
1212 static void cgroup_oomd_xattr_apply(Unit *u) {
1213         CGroupContext *c;
1214
1215         assert(u);
1216
1217         c = unit_get_cgroup_context(u);
1218         if (!c)
1219                 return;
1220
1221         if (c->moom_preference == MANAGED_OOM_PREFERENCE_OMIT)
1222                 unit_set_xattr_graceful(u, "user.oomd_omit", "1", 1);
1223
1224         if (c->moom_preference == MANAGED_OOM_PREFERENCE_AVOID)
1225                 unit_set_xattr_graceful(u, "user.oomd_avoid", "1", 1);
1226
1227         if (c->moom_preference != MANAGED_OOM_PREFERENCE_AVOID)
1228                 unit_remove_xattr_graceful(u, "user.oomd_avoid");
1229
1230         if (c->moom_preference != MANAGED_OOM_PREFERENCE_OMIT)
1231                 unit_remove_xattr_graceful(u, "user.oomd_omit");
1232 }
1233
1234 static int cgroup_log_xattr_apply(Unit *u) {
1235         ExecContext *c;
1236         size_t len, allowed_patterns_len, denied_patterns_len;
1237         _cleanup_free_ char *patterns = NULL, *allowed_patterns = NULL, *denied_patterns = NULL;
1238         char *last;
1239         int r;
1240
1241         assert(u);
1242
1243         c = unit_get_exec_context(u);
1244         if (!c)
1245                 /* Some unit types have a cgroup context but no exec context, so we do not log
1246                  * any error here to avoid confusion. */
1247                 return 0;
1248
1249         if (set_isempty(c->log_filter_allowed_patterns) && set_isempty(c->log_filter_denied_patterns)) {
1250                 unit_remove_xattr_graceful(u, "user.journald_log_filter_patterns");
1251                 return 0;
1252         }
1253
1254         r = set_make_nulstr(c->log_filter_allowed_patterns, &allowed_patterns, &allowed_patterns_len);
1255         if (r < 0)
1256                 return log_debug_errno(r, "Failed to make nulstr from set: %m");
1257
1258         r = set_make_nulstr(c->log_filter_denied_patterns, &denied_patterns, &denied_patterns_len);
1259         if (r < 0)
1260                 return log_debug_errno(r, "Failed to make nulstr from set: %m");
1261
1262         /* Use nul character separated strings without trailing nul */
1263         allowed_patterns_len = LESS_BY(allowed_patterns_len, 1u);
1264         denied_patterns_len = LESS_BY(denied_patterns_len, 1u);
1265
1266         len = allowed_patterns_len + 1 + denied_patterns_len;
1267         patterns = new(char, len);
1268         if (!patterns)
1269                 return log_oom_debug();
1270
1271         last = mempcpy_safe(patterns, allowed_patterns, allowed_patterns_len);
1272         *(last++) = '\xff';
1273         memcpy_safe(last, denied_patterns, denied_patterns_len);
1274
1275         unit_set_xattr_graceful(u, "user.journald_log_filter_patterns", patterns, len);
1276
1277         return 0;
1278 }
1279
1280 static void cgroup_invocation_id_xattr_apply(Unit *u) {
1281         bool b;
1282
1283         assert(u);
1284
1285         b = !sd_id128_is_null(u->invocation_id);
1286         FOREACH_STRING(xn, "trusted.invocation_id", "user.invocation_id") {
1287                 if (b)
1288                         unit_set_xattr_graceful(u, xn, SD_ID128_TO_STRING(u->invocation_id), 32);
1289                 else
1290                         unit_remove_xattr_graceful(u, xn);
1291         }
1292 }
1293
1294 static void cgroup_coredump_xattr_apply(Unit *u) {
1295         CGroupContext *c;
1296
1297         assert(u);
1298
1299         c = unit_get_cgroup_context(u);
1300         if (!c)
1301                 return;
1302
1303         if (unit_cgroup_delegate(u) && c->coredump_receive)
1304                 unit_set_xattr_graceful(u, "user.coredump_receive", "1", 1);
1305         else
1306                 unit_remove_xattr_graceful(u, "user.coredump_receive");
1307 }
1308
1309 static void cgroup_delegate_xattr_apply(Unit *u) {
1310         bool b;
1311
1312         assert(u);
1313
1314         /* Indicate on the cgroup whether delegation is on, via an xattr. This is best-effort, as old kernels
1315          * didn't support xattrs on cgroups at all. Later they got support for setting 'trusted.*' xattrs,
1316          * and even later 'user.*' xattrs. We started setting this field when 'trusted.*' was added, and
1317          * given this is now pretty much API, let's continue to support that. But also set 'user.*' as well,
1318          * since it is readable by any user, not just CAP_SYS_ADMIN. This hence comes with slightly weaker
1319          * security (as users who got delegated cgroups could turn it off if they like), but this shouldn't
1320          * be a big problem given this communicates delegation state to clients, but the manager never reads
1321          * it. */
1322         b = unit_cgroup_delegate(u);
1323         FOREACH_STRING(xn, "trusted.delegate", "user.delegate") {
1324                 if (b)
1325                         unit_set_xattr_graceful(u, xn, "1", 1);
1326                 else
1327                         unit_remove_xattr_graceful(u, xn);
1328         }
1329 }
1330
1331 static void cgroup_survive_xattr_apply(Unit *u) {
1332         int r;
1333
1334         assert(u);
1335
1336         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
1337         if (!crt)
1338                 return;
1339
1340         if (u->survive_final_kill_signal) {
1341                 r = cg_set_xattr(
1342                                 crt->cgroup_path,
1343                                 "user.survive_final_kill_signal",
1344                                 "1",
1345                                 1,
1346                                 /* flags= */ 0);
1347                 /* user xattr support was added in kernel v5.7 */
1348                 if (ERRNO_IS_NEG_NOT_SUPPORTED(r))
1349                         r = cg_set_xattr(
1350                                         crt->cgroup_path,
1351                                         "trusted.survive_final_kill_signal",
1352                                         "1",
1353                                         1,
1354                                         /* flags= */ 0);
1355                 if (r < 0)
1356                         log_unit_debug_errno(u,
1357                                              r,
1358                                              "Failed to set 'survive_final_kill_signal' xattr on control "
1359                                              "group %s, ignoring: %m",
1360                                              empty_to_root(crt->cgroup_path));
1361         } else {
1362                 unit_remove_xattr_graceful(u, "user.survive_final_kill_signal");
1363                 unit_remove_xattr_graceful(u, "trusted.survive_final_kill_signal");
1364         }
1365 }
1366
1367 static void cgroup_xattr_apply(Unit *u) {
1368         assert(u);
1369
1370         /* The 'user.*' xattrs can be set from a user manager. */
1371         cgroup_oomd_xattr_apply(u);
1372         cgroup_log_xattr_apply(u);
1373         cgroup_coredump_xattr_apply(u);
1374
1375         if (!MANAGER_IS_SYSTEM(u->manager))
1376                 return;
1377
1378         cgroup_invocation_id_xattr_apply(u);
1379         cgroup_delegate_xattr_apply(u);
1380         cgroup_survive_xattr_apply(u);
1381 }
1382
/* Resolves path 'p' to the device number of the block device relevant for it, and stores it in
 * '*ret'.
 *
 * If 'p' refers to a block device node its device number is used. If it refers to any other kind of
 * file (except character device nodes, which are refused with -ENOTBLK) the block device backing the
 * file system the file is stored on is used. The result is then followed to its originating device
 * as long as block_get_originating() reports one, and finally, if possible, mapped to the whole disk
 * it belongs to.
 *
 * Returns 0 on success, a negative errno-style error on failure. All failures are logged at warning
 * level. */
static int lookup_block_device(const char *p, dev_t *ret) {
        dev_t node_devnum, fs_devnum = 0;
        mode_t mode;
        int r;

        assert(p);
        assert(ret);

        r = device_path_parse_major_minor(p, &mode, &node_devnum);
        if (r < 0 && r != -ENODEV)
                return log_warning_errno(r, "Failed to parse major/minor from path '%s': %m", p);
        if (r == -ENODEV) {
                /* not a parsable device node, need to go to disk */
                struct stat st;

                if (stat(p, &st) < 0)
                        return log_warning_errno(errno, "Couldn't stat device '%s': %m", p);

                mode = st.st_mode;
                node_devnum = st.st_rdev;
                fs_devnum = st.st_dev;
        }

        if (S_ISCHR(mode))
                return log_warning_errno(SYNTHETIC_ERRNO(ENOTBLK),
                                         "Device node '%s' is a character device, but block device needed.", p);

        if (S_ISBLK(mode))
                *ret = node_devnum;
        else if (major(fs_devnum) != 0)
                /* Not a device node, hence use the block device this file is stored on */
                *ret = fs_devnum;
        else {
                /* If this is btrfs, getting the backing block device is a bit harder */
                r = btrfs_get_block_device(p, ret);
                if (r == -ENOTTY)
                        return log_warning_errno(SYNTHETIC_ERRNO(ENODEV),
                                                 "'%s' is not a block device node, and file system block device cannot be determined or is not local.", p);
                if (r < 0)
                        return log_warning_errno(r, "Failed to determine block device backing btrfs file system '%s': %m", p);
        }

        /* If this is a LUKS/DM device, recursively try to get the originating block device */
        for (;;)
                if (block_get_originating(*ret, ret) <= 0)
                        break;

        /* If this is a partition, try to get the originating block device */
        (void) block_get_whole_disk(*ret, ret);

        return 0;
}
1428
1429 static bool cgroup_context_has_cpu_weight(CGroupContext *c) {
1430         return c->cpu_weight != CGROUP_WEIGHT_INVALID ||
1431                 c->startup_cpu_weight != CGROUP_WEIGHT_INVALID;
1432 }
1433
1434 static bool cgroup_context_has_cpu_shares(CGroupContext *c) {
1435         return c->cpu_shares != CGROUP_CPU_SHARES_INVALID ||
1436                 c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID;
1437 }
1438
1439 static bool cgroup_context_has_allowed_cpus(CGroupContext *c) {
1440         return c->cpuset_cpus.set || c->startup_cpuset_cpus.set;
1441 }
1442
1443 static bool cgroup_context_has_allowed_mems(CGroupContext *c) {
1444         return c->cpuset_mems.set || c->startup_cpuset_mems.set;
1445 }
1446
1447 uint64_t cgroup_context_cpu_weight(CGroupContext *c, ManagerState state) {
1448         assert(c);
1449
1450         if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING) &&
1451             c->startup_cpu_weight != CGROUP_WEIGHT_INVALID)
1452                 return c->startup_cpu_weight;
1453         else if (c->cpu_weight != CGROUP_WEIGHT_INVALID)
1454                 return c->cpu_weight;
1455         else
1456                 return CGROUP_WEIGHT_DEFAULT;
1457 }
1458
1459 static uint64_t cgroup_context_cpu_shares(CGroupContext *c, ManagerState state) {
1460         if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING) &&
1461             c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID)
1462                 return c->startup_cpu_shares;
1463         else if (c->cpu_shares != CGROUP_CPU_SHARES_INVALID)
1464                 return c->cpu_shares;
1465         else
1466                 return CGROUP_CPU_SHARES_DEFAULT;
1467 }
1468
1469 static CPUSet *cgroup_context_allowed_cpus(CGroupContext *c, ManagerState state) {
1470         if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING) &&
1471             c->startup_cpuset_cpus.set)
1472                 return &c->startup_cpuset_cpus;
1473         else
1474                 return &c->cpuset_cpus;
1475 }
1476
1477 static CPUSet *cgroup_context_allowed_mems(CGroupContext *c, ManagerState state) {
1478         if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING) &&
1479             c->startup_cpuset_mems.set)
1480                 return &c->startup_cpuset_mems;
1481         else
1482                 return &c->cpuset_mems;
1483 }
1484
1485 usec_t cgroup_cpu_adjust_period(usec_t period, usec_t quota, usec_t resolution, usec_t max_period) {
1486         /* kernel uses a minimum resolution of 1ms, so both period and (quota * period)
1487          * need to be higher than that boundary. quota is specified in USecPerSec.
1488          * Additionally, period must be at most max_period. */
1489         assert(quota > 0);
1490
1491         return MIN(MAX3(period, resolution, resolution * USEC_PER_SEC / quota), max_period);
1492 }
1493
1494 static usec_t cgroup_cpu_adjust_period_and_log(Unit *u, usec_t period, usec_t quota) {
1495         usec_t new_period;
1496
1497         assert(u);
1498
1499         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
1500         if (!crt)
1501                 return USEC_INFINITY;
1502
1503         if (quota == USEC_INFINITY)
1504                 /* Always use default period for infinity quota. */
1505                 return CGROUP_CPU_QUOTA_DEFAULT_PERIOD_USEC;
1506
1507         if (period == USEC_INFINITY)
1508                 /* Default period was requested. */
1509                 period = CGROUP_CPU_QUOTA_DEFAULT_PERIOD_USEC;
1510
1511         /* Clamp to interval [1ms, 1s] */
1512         new_period = cgroup_cpu_adjust_period(period, quota, USEC_PER_MSEC, USEC_PER_SEC);
1513
1514         if (new_period != period) {
1515                 log_unit_full(u, crt->warned_clamping_cpu_quota_period ? LOG_DEBUG : LOG_WARNING,
1516                               "Clamping CPU interval for cpu.max: period is now %s",
1517                               FORMAT_TIMESPAN(new_period, 1));
1518                 crt->warned_clamping_cpu_quota_period = true;
1519         }
1520
1521         return new_period;
1522 }
1523
1524 static void cgroup_apply_unified_cpu_weight(Unit *u, uint64_t weight) {
1525         char buf[DECIMAL_STR_MAX(uint64_t) + 2];
1526
1527         if (weight == CGROUP_WEIGHT_IDLE)
1528                 return;
1529         xsprintf(buf, "%" PRIu64 "\n", weight);
1530         (void) set_attribute_and_warn(u, "cpu", "cpu.weight", buf);
1531 }
1532
1533 static void cgroup_apply_unified_cpu_idle(Unit *u, uint64_t weight) {
1534         int r;
1535         bool is_idle;
1536         const char *idle_val;
1537
1538         assert(u);
1539
1540         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
1541         if (!crt || !crt->cgroup_path)
1542                 return;
1543
1544         is_idle = weight == CGROUP_WEIGHT_IDLE;
1545         idle_val = one_zero(is_idle);
1546         r = cg_set_attribute("cpu", crt->cgroup_path, "cpu.idle", idle_val);
1547         if (r < 0 && (r != -ENOENT || is_idle))
1548                 log_unit_full_errno(u, LOG_LEVEL_CGROUP_WRITE(r), r, "Failed to set '%s' attribute on '%s' to '%s': %m",
1549                                     "cpu.idle", empty_to_root(crt->cgroup_path), idle_val);
1550 }
1551
1552 static void cgroup_apply_unified_cpu_quota(Unit *u, usec_t quota, usec_t period) {
1553         char buf[(DECIMAL_STR_MAX(usec_t) + 1) * 2 + 1];
1554
1555         assert(u);
1556
1557         period = cgroup_cpu_adjust_period_and_log(u, period, quota);
1558         if (quota != USEC_INFINITY)
1559                 xsprintf(buf, USEC_FMT " " USEC_FMT "\n",
1560                          MAX(quota * period / USEC_PER_SEC, USEC_PER_MSEC), period);
1561         else
1562                 xsprintf(buf, "max " USEC_FMT "\n", period);
1563         (void) set_attribute_and_warn(u, "cpu", "cpu.max", buf);
1564 }
1565
1566 static void cgroup_apply_legacy_cpu_shares(Unit *u, uint64_t shares) {
1567         char buf[DECIMAL_STR_MAX(uint64_t) + 2];
1568
1569         xsprintf(buf, "%" PRIu64 "\n", shares);
1570         (void) set_attribute_and_warn(u, "cpu", "cpu.shares", buf);
1571 }
1572
1573 static void cgroup_apply_legacy_cpu_quota(Unit *u, usec_t quota, usec_t period) {
1574         char buf[DECIMAL_STR_MAX(usec_t) + 2];
1575
1576         period = cgroup_cpu_adjust_period_and_log(u, period, quota);
1577
1578         xsprintf(buf, USEC_FMT "\n", period);
1579         (void) set_attribute_and_warn(u, "cpu", "cpu.cfs_period_us", buf);
1580
1581         if (quota != USEC_INFINITY) {
1582                 xsprintf(buf, USEC_FMT "\n", MAX(quota * period / USEC_PER_SEC, USEC_PER_MSEC));
1583                 (void) set_attribute_and_warn(u, "cpu", "cpu.cfs_quota_us", buf);
1584         } else
1585                 (void) set_attribute_and_warn(u, "cpu", "cpu.cfs_quota_us", "-1\n");
1586 }
1587
1588 static uint64_t cgroup_cpu_shares_to_weight(uint64_t shares) {
1589         return CLAMP(shares * CGROUP_WEIGHT_DEFAULT / CGROUP_CPU_SHARES_DEFAULT,
1590                      CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
1591 }
1592
1593 static uint64_t cgroup_cpu_weight_to_shares(uint64_t weight) {
1594         /* we don't support idle in cgroupv1 */
1595         if (weight == CGROUP_WEIGHT_IDLE)
1596                 return CGROUP_CPU_SHARES_MIN;
1597
1598         return CLAMP(weight * CGROUP_CPU_SHARES_DEFAULT / CGROUP_WEIGHT_DEFAULT,
1599                      CGROUP_CPU_SHARES_MIN, CGROUP_CPU_SHARES_MAX);
1600 }
1601
1602 static void cgroup_apply_unified_cpuset(Unit *u, const CPUSet *cpus, const char *name) {
1603         _cleanup_free_ char *buf = NULL;
1604
1605         buf = cpu_set_to_range_string(cpus);
1606         if (!buf) {
1607                 log_oom();
1608                 return;
1609         }
1610
1611         (void) set_attribute_and_warn(u, "cpuset", name, buf);
1612 }
1613
1614 static bool cgroup_context_has_io_config(CGroupContext *c) {
1615         return c->io_accounting ||
1616                 c->io_weight != CGROUP_WEIGHT_INVALID ||
1617                 c->startup_io_weight != CGROUP_WEIGHT_INVALID ||
1618                 c->io_device_weights ||
1619                 c->io_device_latencies ||
1620                 c->io_device_limits;
1621 }
1622
1623 static bool cgroup_context_has_blockio_config(CGroupContext *c) {
1624         return c->blockio_accounting ||
1625                 c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
1626                 c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
1627                 c->blockio_device_weights ||
1628                 c->blockio_device_bandwidths;
1629 }
1630
1631 static uint64_t cgroup_context_io_weight(CGroupContext *c, ManagerState state) {
1632         if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING) &&
1633             c->startup_io_weight != CGROUP_WEIGHT_INVALID)
1634                 return c->startup_io_weight;
1635         if (c->io_weight != CGROUP_WEIGHT_INVALID)
1636                 return c->io_weight;
1637         return CGROUP_WEIGHT_DEFAULT;
1638 }
1639
1640 static uint64_t cgroup_context_blkio_weight(CGroupContext *c, ManagerState state) {
1641         if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING) &&
1642             c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
1643                 return c->startup_blockio_weight;
1644         if (c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
1645                 return c->blockio_weight;
1646         return CGROUP_BLKIO_WEIGHT_DEFAULT;
1647 }
1648
1649 static uint64_t cgroup_weight_blkio_to_io(uint64_t blkio_weight) {
1650         return CLAMP(blkio_weight * CGROUP_WEIGHT_DEFAULT / CGROUP_BLKIO_WEIGHT_DEFAULT,
1651                      CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
1652 }
1653
1654 static uint64_t cgroup_weight_io_to_blkio(uint64_t io_weight) {
1655         return CLAMP(io_weight * CGROUP_BLKIO_WEIGHT_DEFAULT / CGROUP_WEIGHT_DEFAULT,
1656                      CGROUP_BLKIO_WEIGHT_MIN, CGROUP_BLKIO_WEIGHT_MAX);
1657 }
1658
1659 static int set_bfq_weight(Unit *u, const char *controller, dev_t dev, uint64_t io_weight) {
1660         static const char * const prop_names[] = {
1661                 "IOWeight",
1662                 "BlockIOWeight",
1663                 "IODeviceWeight",
1664                 "BlockIODeviceWeight",
1665         };
1666         static bool warned = false;
1667         char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+STRLEN("\n")];
1668         const char *p;
1669         uint64_t bfq_weight;
1670         int r;
1671
1672         assert(u);
1673
1674         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
1675         if (!crt || !crt->cgroup_path)
1676                 return -EOWNERDEAD;
1677
1678         /* FIXME: drop this function when distro kernels properly support BFQ through "io.weight"
1679          * See also: https://github.com/systemd/systemd/pull/13335 and
1680          * https://github.com/torvalds/linux/commit/65752aef0a407e1ef17ec78a7fc31ba4e0b360f9. */
1681         p = strjoina(controller, ".bfq.weight");
1682         /* Adjust to kernel range is 1..1000, the default is 100. */
1683         bfq_weight = BFQ_WEIGHT(io_weight);
1684
1685         if (major(dev) > 0)
1686                 xsprintf(buf, DEVNUM_FORMAT_STR " %" PRIu64 "\n", DEVNUM_FORMAT_VAL(dev), bfq_weight);
1687         else
1688                 xsprintf(buf, "%" PRIu64 "\n", bfq_weight);
1689
1690         r = cg_set_attribute(controller, crt->cgroup_path, p, buf);
1691
1692         /* FIXME: drop this when kernels prior
1693          * 795fe54c2a82 ("bfq: Add per-device weight") v5.4
1694          * are not interesting anymore. Old kernels will fail with EINVAL, while new kernels won't return
1695          * EINVAL on properly formatted input by us. Treat EINVAL accordingly. */
1696         if (r == -EINVAL && major(dev) > 0) {
1697                if (!warned) {
1698                         log_unit_warning(u, "Kernel version does not accept per-device setting in %s.", p);
1699                         warned = true;
1700                }
1701                r = -EOPNOTSUPP; /* mask as unconfigured device */
1702         } else if (r >= 0 && io_weight != bfq_weight)
1703                 log_unit_debug(u, "%s=%" PRIu64 " scaled to %s=%" PRIu64,
1704                                prop_names[2*(major(dev) > 0) + streq(controller, "blkio")],
1705                                io_weight, p, bfq_weight);
1706         return r;
1707 }
1708
1709 static void cgroup_apply_io_device_weight(Unit *u, const char *dev_path, uint64_t io_weight) {
1710         char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
1711         dev_t dev;
1712         int r, r1, r2;
1713
1714         assert(u);
1715
1716         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
1717         if (!crt || !crt->cgroup_path)
1718                 return;
1719
1720         if (lookup_block_device(dev_path, &dev) < 0)
1721                 return;
1722
1723         r1 = set_bfq_weight(u, "io", dev, io_weight);
1724
1725         xsprintf(buf, DEVNUM_FORMAT_STR " %" PRIu64 "\n", DEVNUM_FORMAT_VAL(dev), io_weight);
1726         r2 = cg_set_attribute("io", crt->cgroup_path, "io.weight", buf);
1727
1728         /* Look at the configured device, when both fail, prefer io.weight errno. */
1729         r = r2 == -EOPNOTSUPP ? r1 : r2;
1730
1731         if (r < 0)
1732                 log_unit_full_errno(u, LOG_LEVEL_CGROUP_WRITE(r),
1733                                     r, "Failed to set 'io[.bfq].weight' attribute on '%s' to '%.*s': %m",
1734                                     empty_to_root(crt->cgroup_path), (int) strcspn(buf, NEWLINE), buf);
1735 }
1736
1737 static void cgroup_apply_blkio_device_weight(Unit *u, const char *dev_path, uint64_t blkio_weight) {
1738         char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
1739         dev_t dev;
1740         int r;
1741
1742         r = lookup_block_device(dev_path, &dev);
1743         if (r < 0)
1744                 return;
1745
1746         xsprintf(buf, DEVNUM_FORMAT_STR " %" PRIu64 "\n", DEVNUM_FORMAT_VAL(dev), blkio_weight);
1747         (void) set_attribute_and_warn(u, "blkio", "blkio.weight_device", buf);
1748 }
1749
1750 static void cgroup_apply_io_device_latency(Unit *u, const char *dev_path, usec_t target) {
1751         char buf[DECIMAL_STR_MAX(dev_t)*2+2+7+DECIMAL_STR_MAX(uint64_t)+1];
1752         dev_t dev;
1753         int r;
1754
1755         r = lookup_block_device(dev_path, &dev);
1756         if (r < 0)
1757                 return;
1758
1759         if (target != USEC_INFINITY)
1760                 xsprintf(buf, DEVNUM_FORMAT_STR " target=%" PRIu64 "\n", DEVNUM_FORMAT_VAL(dev), target);
1761         else
1762                 xsprintf(buf, DEVNUM_FORMAT_STR " target=max\n", DEVNUM_FORMAT_VAL(dev));
1763
1764         (void) set_attribute_and_warn(u, "io", "io.latency", buf);
1765 }
1766
1767 static void cgroup_apply_io_device_limit(Unit *u, const char *dev_path, uint64_t *limits) {
1768         char limit_bufs[_CGROUP_IO_LIMIT_TYPE_MAX][DECIMAL_STR_MAX(uint64_t)],
1769              buf[DECIMAL_STR_MAX(dev_t)*2+2+(6+DECIMAL_STR_MAX(uint64_t)+1)*4];
1770         dev_t dev;
1771
1772         if (lookup_block_device(dev_path, &dev) < 0)
1773                 return;
1774
1775         for (CGroupIOLimitType type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
1776                 if (limits[type] != cgroup_io_limit_defaults[type])
1777                         xsprintf(limit_bufs[type], "%" PRIu64, limits[type]);
1778                 else
1779                         xsprintf(limit_bufs[type], "%s", limits[type] == CGROUP_LIMIT_MAX ? "max" : "0");
1780
1781         xsprintf(buf, DEVNUM_FORMAT_STR " rbps=%s wbps=%s riops=%s wiops=%s\n", DEVNUM_FORMAT_VAL(dev),
1782                  limit_bufs[CGROUP_IO_RBPS_MAX], limit_bufs[CGROUP_IO_WBPS_MAX],
1783                  limit_bufs[CGROUP_IO_RIOPS_MAX], limit_bufs[CGROUP_IO_WIOPS_MAX]);
1784         (void) set_attribute_and_warn(u, "io", "io.max", buf);
1785 }
1786
1787 static void cgroup_apply_blkio_device_limit(Unit *u, const char *dev_path, uint64_t rbps, uint64_t wbps) {
1788         char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
1789         dev_t dev;
1790
1791         if (lookup_block_device(dev_path, &dev) < 0)
1792                 return;
1793
1794         sprintf(buf, DEVNUM_FORMAT_STR " %" PRIu64 "\n", DEVNUM_FORMAT_VAL(dev), rbps);
1795         (void) set_attribute_and_warn(u, "blkio", "blkio.throttle.read_bps_device", buf);
1796
1797         sprintf(buf, DEVNUM_FORMAT_STR " %" PRIu64 "\n", DEVNUM_FORMAT_VAL(dev), wbps);
1798         (void) set_attribute_and_warn(u, "blkio", "blkio.throttle.write_bps_device", buf);
1799 }
1800
1801 static bool unit_has_unified_memory_config(Unit *u) {
1802         CGroupContext *c;
1803
1804         assert(u);
1805
1806         assert_se(c = unit_get_cgroup_context(u));
1807
1808         return unit_get_ancestor_memory_min(u) > 0 ||
1809                unit_get_ancestor_memory_low(u) > 0 || unit_get_ancestor_startup_memory_low(u) > 0 ||
1810                c->memory_high != CGROUP_LIMIT_MAX || c->startup_memory_high_set ||
1811                c->memory_max != CGROUP_LIMIT_MAX || c->startup_memory_max_set ||
1812                c->memory_swap_max != CGROUP_LIMIT_MAX || c->startup_memory_swap_max_set ||
1813                c->memory_zswap_max != CGROUP_LIMIT_MAX || c->startup_memory_zswap_max_set;
1814 }
1815
1816 static void cgroup_apply_unified_memory_limit(Unit *u, const char *file, uint64_t v) {
1817         char buf[DECIMAL_STR_MAX(uint64_t) + 1] = "max\n";
1818
1819         if (v != CGROUP_LIMIT_MAX)
1820                 xsprintf(buf, "%" PRIu64 "\n", v);
1821
1822         (void) set_attribute_and_warn(u, "memory", file, buf);
1823 }
1824
1825 static void cgroup_apply_firewall(Unit *u) {
1826         assert(u);
1827
1828         /* Best-effort: let's apply IP firewalling and/or accounting if that's enabled */
1829
1830         if (bpf_firewall_compile(u) < 0)
1831                 return;
1832
1833         (void) bpf_firewall_load_custom(u);
1834         (void) bpf_firewall_install(u);
1835 }
1836
1837 void unit_modify_nft_set(Unit *u, bool add) {
1838         int r;
1839
1840         assert(u);
1841
1842         if (!MANAGER_IS_SYSTEM(u->manager))
1843                 return;
1844
1845         if (!UNIT_HAS_CGROUP_CONTEXT(u))
1846                 return;
1847
1848         if (cg_all_unified() <= 0)
1849                 return;
1850
1851         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
1852         if (!crt || crt->cgroup_id == 0)
1853                 return;
1854
1855         if (!u->manager->fw_ctx) {
1856                 r = fw_ctx_new_full(&u->manager->fw_ctx, /* init_tables= */ false);
1857                 if (r < 0)
1858                         return;
1859
1860                 assert(u->manager->fw_ctx);
1861         }
1862
1863         CGroupContext *c = ASSERT_PTR(unit_get_cgroup_context(u));
1864
1865         FOREACH_ARRAY(nft_set, c->nft_set_context.sets, c->nft_set_context.n_sets) {
1866                 if (nft_set->source != NFT_SET_SOURCE_CGROUP)
1867                         continue;
1868
1869                 uint64_t element = crt->cgroup_id;
1870
1871                 r = nft_set_element_modify_any(u->manager->fw_ctx, add, nft_set->nfproto, nft_set->table, nft_set->set, &element, sizeof(element));
1872                 if (r < 0)
1873                         log_warning_errno(r, "Failed to %s NFT set: family %s, table %s, set %s, cgroup %" PRIu64 ", ignoring: %m",
1874                                           add? "add" : "delete", nfproto_to_string(nft_set->nfproto), nft_set->table, nft_set->set, crt->cgroup_id);
1875                 else
1876                         log_debug("%s NFT set: family %s, table %s, set %s, cgroup %" PRIu64,
1877                                   add? "Added" : "Deleted", nfproto_to_string(nft_set->nfproto), nft_set->table, nft_set->set, crt->cgroup_id);
1878         }
1879 }
1880
1881 static void cgroup_apply_socket_bind(Unit *u) {
1882         assert(u);
1883
1884         (void) bpf_socket_bind_install(u);
1885 }
1886
1887 static void cgroup_apply_restrict_network_interfaces(Unit *u) {
1888         assert(u);
1889
1890         (void) bpf_restrict_ifaces_install(u);
1891 }
1892
1893 static int cgroup_apply_devices(Unit *u) {
1894         _cleanup_(bpf_program_freep) BPFProgram *prog = NULL;
1895         CGroupContext *c;
1896         CGroupDevicePolicy policy;
1897         int r;
1898
1899         assert_se(c = unit_get_cgroup_context(u));
1900
1901         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
1902         if (!crt || !crt->cgroup_path)
1903                 return -EOWNERDEAD;
1904
1905         policy = c->device_policy;
1906
1907         if (cg_all_unified() > 0) {
1908                 r = bpf_devices_cgroup_init(&prog, policy, c->device_allow);
1909                 if (r < 0)
1910                         return log_unit_warning_errno(u, r, "Failed to initialize device control bpf program: %m");
1911
1912         } else {
1913                 /* Changing the devices list of a populated cgroup might result in EINVAL, hence ignore
1914                  * EINVAL here. */
1915
1916                 if (c->device_allow || policy != CGROUP_DEVICE_POLICY_AUTO)
1917                         r = cg_set_attribute("devices", crt->cgroup_path, "devices.deny", "a");
1918                 else
1919                         r = cg_set_attribute("devices", crt->cgroup_path, "devices.allow", "a");
1920                 if (r < 0)
1921                         log_unit_full_errno(u, IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
1922                                             "Failed to reset devices.allow/devices.deny: %m");
1923         }
1924
1925         bool allow_list_static = policy == CGROUP_DEVICE_POLICY_CLOSED ||
1926                 (policy == CGROUP_DEVICE_POLICY_AUTO && c->device_allow);
1927
1928         bool any = false;
1929         if (allow_list_static) {
1930                 r = bpf_devices_allow_list_static(prog, crt->cgroup_path);
1931                 if (r > 0)
1932                         any = true;
1933         }
1934
1935         LIST_FOREACH(device_allow, a, c->device_allow) {
1936                 const char *val;
1937
1938                 if (a->permissions == 0)
1939                         continue;
1940
1941                 if (path_startswith(a->path, "/dev/"))
1942                         r = bpf_devices_allow_list_device(prog, crt->cgroup_path, a->path, a->permissions);
1943                 else if ((val = startswith(a->path, "block-")))
1944                         r = bpf_devices_allow_list_major(prog, crt->cgroup_path, val, 'b', a->permissions);
1945                 else if ((val = startswith(a->path, "char-")))
1946                         r = bpf_devices_allow_list_major(prog, crt->cgroup_path, val, 'c', a->permissions);
1947                 else {
1948                         log_unit_debug(u, "Ignoring device '%s' while writing cgroup attribute.", a->path);
1949                         continue;
1950                 }
1951
1952                 if (r > 0)
1953                         any = true;
1954         }
1955
1956         if (prog && !any) {
1957                 log_unit_warning_errno(u, SYNTHETIC_ERRNO(ENODEV), "No devices matched by device filter.");
1958
1959                 /* The kernel verifier would reject a program we would build with the normal intro and outro
1960                    but no allow-listing rules (outro would contain an unreachable instruction for successful
1961                    return). */
1962                 policy = CGROUP_DEVICE_POLICY_STRICT;
1963         }
1964
1965         r = bpf_devices_apply_policy(&prog, policy, any, crt->cgroup_path, &crt->bpf_device_control_installed);
1966         if (r < 0) {
1967                 static bool warned = false;
1968
1969                 log_full_errno(warned ? LOG_DEBUG : LOG_WARNING, r,
1970                                "Unit %s configures device ACL, but the local system doesn't seem to support the BPF-based device controller.\n"
1971                                "Proceeding WITHOUT applying ACL (all devices will be accessible)!\n"
1972                                "(This warning is only shown for the first loaded unit using device ACL.)", u->id);
1973
1974                 warned = true;
1975         }
1976         return r;
1977 }
1978
1979 static void set_io_weight(Unit *u, uint64_t weight) {
1980         char buf[STRLEN("default \n")+DECIMAL_STR_MAX(uint64_t)];
1981
1982         assert(u);
1983
1984         (void) set_bfq_weight(u, "io", makedev(0, 0), weight);
1985
1986         xsprintf(buf, "default %" PRIu64 "\n", weight);
1987         (void) set_attribute_and_warn(u, "io", "io.weight", buf);
1988 }
1989
1990 static void set_blkio_weight(Unit *u, uint64_t weight) {
1991         char buf[STRLEN("\n")+DECIMAL_STR_MAX(uint64_t)];
1992
1993         assert(u);
1994
1995         (void) set_bfq_weight(u, "blkio", makedev(0, 0), weight);
1996
1997         xsprintf(buf, "%" PRIu64 "\n", weight);
1998         (void) set_attribute_and_warn(u, "blkio", "blkio.weight", buf);
1999 }
2000
2001 static void cgroup_apply_bpf_foreign_program(Unit *u) {
2002         assert(u);
2003
2004         (void) bpf_foreign_install(u);
2005 }
2006
2007 static void cgroup_context_apply(
2008                 Unit *u,
2009                 CGroupMask apply_mask,
2010                 ManagerState state) {
2011
2012         bool is_host_root, is_local_root;
2013         const char *path;
2014         CGroupContext *c;
2015         int r;
2016
2017         assert(u);
2018
2019         /* Nothing to do? Exit early! */
2020         if (apply_mask == 0)
2021                 return;
2022
2023         /* Some cgroup attributes are not supported on the host root cgroup, hence silently ignore them here. And other
2024          * attributes should only be managed for cgroups further down the tree. */
2025         is_local_root = unit_has_name(u, SPECIAL_ROOT_SLICE);
2026         is_host_root = unit_has_host_root_cgroup(u);
2027
2028         assert_se(c = unit_get_cgroup_context(u));
2029
2030         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
2031         if (!crt || !crt->cgroup_path)
2032                 return;
2033
2034         path = crt->cgroup_path;
2035
2036         if (is_local_root) /* Make sure we don't try to display messages with an empty path. */
2037                 path = "/";
2038
2039         /* We generally ignore errors caused by read-only mounted cgroup trees (assuming we are running in a container
2040          * then), and missing cgroups, i.e. EROFS and ENOENT. */
2041
2042         /* In fully unified mode these attributes don't exist on the host cgroup root. On legacy the weights exist, but
2043          * setting the weight makes very little sense on the host root cgroup, as there are no other cgroups at this
2044          * level. The quota exists there too, but any attempt to write to it is refused with EINVAL. Inside of
2045          * containers we want to leave control of these to the container manager (and if cgroup v2 delegation is used
2046          * we couldn't even write to them if we wanted to). */
2047         if ((apply_mask & CGROUP_MASK_CPU) && !is_local_root) {
2048
2049                 if (cg_all_unified() > 0) {
2050                         uint64_t weight;
2051
2052                         if (cgroup_context_has_cpu_weight(c))
2053                                 weight = cgroup_context_cpu_weight(c, state);
2054                         else if (cgroup_context_has_cpu_shares(c)) {
2055                                 uint64_t shares;
2056
2057                                 shares = cgroup_context_cpu_shares(c, state);
2058                                 weight = cgroup_cpu_shares_to_weight(shares);
2059
2060                                 log_cgroup_compat(u, "Applying [Startup]CPUShares=%" PRIu64 " as [Startup]CPUWeight=%" PRIu64 " on %s",
2061                                                   shares, weight, path);
2062                         } else
2063                                 weight = CGROUP_WEIGHT_DEFAULT;
2064
2065                         cgroup_apply_unified_cpu_idle(u, weight);
2066                         cgroup_apply_unified_cpu_weight(u, weight);
2067                         cgroup_apply_unified_cpu_quota(u, c->cpu_quota_per_sec_usec, c->cpu_quota_period_usec);
2068
2069                 } else {
2070                         uint64_t shares;
2071
2072                         if (cgroup_context_has_cpu_weight(c)) {
2073                                 uint64_t weight;
2074
2075                                 weight = cgroup_context_cpu_weight(c, state);
2076                                 shares = cgroup_cpu_weight_to_shares(weight);
2077
2078                                 log_cgroup_compat(u, "Applying [Startup]CPUWeight=%" PRIu64 " as [Startup]CPUShares=%" PRIu64 " on %s",
2079                                                   weight, shares, path);
2080                         } else if (cgroup_context_has_cpu_shares(c))
2081                                 shares = cgroup_context_cpu_shares(c, state);
2082                         else
2083                                 shares = CGROUP_CPU_SHARES_DEFAULT;
2084
2085                         cgroup_apply_legacy_cpu_shares(u, shares);
2086                         cgroup_apply_legacy_cpu_quota(u, c->cpu_quota_per_sec_usec, c->cpu_quota_period_usec);
2087                 }
2088         }
2089
2090         if ((apply_mask & CGROUP_MASK_CPUSET) && !is_local_root) {
2091                 cgroup_apply_unified_cpuset(u, cgroup_context_allowed_cpus(c, state), "cpuset.cpus");
2092                 cgroup_apply_unified_cpuset(u, cgroup_context_allowed_mems(c, state), "cpuset.mems");
2093         }
2094
2095         /* The 'io' controller attributes are not exported on the host's root cgroup (being a pure cgroup v2
2096          * controller), and in case of containers we want to leave control of these attributes to the container manager
2097          * (and we couldn't access that stuff anyway, even if we tried if proper delegation is used). */
2098         if ((apply_mask & CGROUP_MASK_IO) && !is_local_root) {
2099                 bool has_io, has_blockio;
2100                 uint64_t weight;
2101
2102                 has_io = cgroup_context_has_io_config(c);
2103                 has_blockio = cgroup_context_has_blockio_config(c);
2104
2105                 if (has_io)
2106                         weight = cgroup_context_io_weight(c, state);
2107                 else if (has_blockio) {
2108                         uint64_t blkio_weight;
2109
2110                         blkio_weight = cgroup_context_blkio_weight(c, state);
2111                         weight = cgroup_weight_blkio_to_io(blkio_weight);
2112
2113                         log_cgroup_compat(u, "Applying [Startup]BlockIOWeight=%" PRIu64 " as [Startup]IOWeight=%" PRIu64,
2114                                           blkio_weight, weight);
2115                 } else
2116                         weight = CGROUP_WEIGHT_DEFAULT;
2117
2118                 set_io_weight(u, weight);
2119
2120                 if (has_io) {
2121                         LIST_FOREACH(device_weights, w, c->io_device_weights)
2122                                 cgroup_apply_io_device_weight(u, w->path, w->weight);
2123
2124                         LIST_FOREACH(device_limits, limit, c->io_device_limits)
2125                                 cgroup_apply_io_device_limit(u, limit->path, limit->limits);
2126
2127                         LIST_FOREACH(device_latencies, latency, c->io_device_latencies)
2128                                 cgroup_apply_io_device_latency(u, latency->path, latency->target_usec);
2129
2130                 } else if (has_blockio) {
2131                         LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
2132                                 weight = cgroup_weight_blkio_to_io(w->weight);
2133
2134                                 log_cgroup_compat(u, "Applying BlockIODeviceWeight=%" PRIu64 " as IODeviceWeight=%" PRIu64 " for %s",
2135                                                   w->weight, weight, w->path);
2136
2137                                 cgroup_apply_io_device_weight(u, w->path, weight);
2138                         }
2139
2140                         LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
2141                                 uint64_t limits[_CGROUP_IO_LIMIT_TYPE_MAX];
2142
2143                                 for (CGroupIOLimitType type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
2144                                         limits[type] = cgroup_io_limit_defaults[type];
2145
2146                                 limits[CGROUP_IO_RBPS_MAX] = b->rbps;
2147                                 limits[CGROUP_IO_WBPS_MAX] = b->wbps;
2148
2149                                 log_cgroup_compat(u, "Applying BlockIO{Read|Write}Bandwidth=%" PRIu64 " %" PRIu64 " as IO{Read|Write}BandwidthMax= for %s",
2150                                                   b->rbps, b->wbps, b->path);
2151
2152                                 cgroup_apply_io_device_limit(u, b->path, limits);
2153                         }
2154                 }
2155         }
2156
2157         if (apply_mask & CGROUP_MASK_BLKIO) {
2158                 bool has_io, has_blockio;
2159
2160                 has_io = cgroup_context_has_io_config(c);
2161                 has_blockio = cgroup_context_has_blockio_config(c);
2162
2163                 /* Applying a 'weight' never makes sense for the host root cgroup, and for containers this should be
2164                  * left to our container manager, too. */
2165                 if (!is_local_root) {
2166                         uint64_t weight;
2167
2168                         if (has_io) {
2169                                 uint64_t io_weight;
2170
2171                                 io_weight = cgroup_context_io_weight(c, state);
2172                                 weight = cgroup_weight_io_to_blkio(cgroup_context_io_weight(c, state));
2173
2174                                 log_cgroup_compat(u, "Applying [Startup]IOWeight=%" PRIu64 " as [Startup]BlockIOWeight=%" PRIu64,
2175                                                   io_weight, weight);
2176                         } else if (has_blockio)
2177                                 weight = cgroup_context_blkio_weight(c, state);
2178                         else
2179                                 weight = CGROUP_BLKIO_WEIGHT_DEFAULT;
2180
2181                         set_blkio_weight(u, weight);
2182
2183                         if (has_io)
2184                                 LIST_FOREACH(device_weights, w, c->io_device_weights) {
2185                                         weight = cgroup_weight_io_to_blkio(w->weight);
2186
2187                                         log_cgroup_compat(u, "Applying IODeviceWeight=%" PRIu64 " as BlockIODeviceWeight=%" PRIu64 " for %s",
2188                                                           w->weight, weight, w->path);
2189
2190                                         cgroup_apply_blkio_device_weight(u, w->path, weight);
2191                                 }
2192                         else if (has_blockio)
2193                                 LIST_FOREACH(device_weights, w, c->blockio_device_weights)
2194                                         cgroup_apply_blkio_device_weight(u, w->path, w->weight);
2195                 }
2196
2197                 /* The bandwidth limits are something that make sense to be applied to the host's root but not container
2198                  * roots, as there we want the container manager to handle it */
2199                 if (is_host_root || !is_local_root) {
2200                         if (has_io)
2201                                 LIST_FOREACH(device_limits, l, c->io_device_limits) {
2202                                         log_cgroup_compat(u, "Applying IO{Read|Write}Bandwidth=%" PRIu64 " %" PRIu64 " as BlockIO{Read|Write}BandwidthMax= for %s",
2203                                                           l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX], l->path);
2204
2205                                         cgroup_apply_blkio_device_limit(u, l->path, l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX]);
2206                                 }
2207                         else if (has_blockio)
2208                                 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths)
2209                                         cgroup_apply_blkio_device_limit(u, b->path, b->rbps, b->wbps);
2210                 }
2211         }
2212
2213         /* In unified mode 'memory' attributes do not exist on the root cgroup. In legacy mode 'memory.limit_in_bytes'
2214          * exists on the root cgroup, but any writes to it are refused with EINVAL. And if we run in a container we
2215          * want to leave control to the container manager (and if proper cgroup v2 delegation is used we couldn't even
2216          * write to this if we wanted to.) */
2217         if ((apply_mask & CGROUP_MASK_MEMORY) && !is_local_root) {
2218
2219                 if (cg_all_unified() > 0) {
2220                         uint64_t max, swap_max = CGROUP_LIMIT_MAX, zswap_max = CGROUP_LIMIT_MAX, high = CGROUP_LIMIT_MAX;
2221
2222                         if (unit_has_unified_memory_config(u)) {
2223                                 bool startup = IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING);
2224
2225                                 high = startup && c->startup_memory_high_set ? c->startup_memory_high : c->memory_high;
2226                                 max = startup && c->startup_memory_max_set ? c->startup_memory_max : c->memory_max;
2227                                 swap_max = startup && c->startup_memory_swap_max_set ? c->startup_memory_swap_max : c->memory_swap_max;
2228                                 zswap_max = startup && c->startup_memory_zswap_max_set ? c->startup_memory_zswap_max : c->memory_zswap_max;
2229                         } else {
2230                                 max = c->memory_limit;
2231
2232                                 if (max != CGROUP_LIMIT_MAX)
2233                                         log_cgroup_compat(u, "Applying MemoryLimit=%" PRIu64 " as MemoryMax=", max);
2234                         }
2235
2236                         cgroup_apply_unified_memory_limit(u, "memory.min", unit_get_ancestor_memory_min(u));
2237                         cgroup_apply_unified_memory_limit(u, "memory.low", unit_get_ancestor_memory_low(u));
2238                         cgroup_apply_unified_memory_limit(u, "memory.high", high);
2239                         cgroup_apply_unified_memory_limit(u, "memory.max", max);
2240                         cgroup_apply_unified_memory_limit(u, "memory.swap.max", swap_max);
2241                         cgroup_apply_unified_memory_limit(u, "memory.zswap.max", zswap_max);
2242
2243                         (void) set_attribute_and_warn(u, "memory", "memory.oom.group", one_zero(c->memory_oom_group));
2244                         (void) set_attribute_and_warn(u, "memory", "memory.zswap.writeback", one_zero(c->memory_zswap_writeback));
2245
2246                 } else {
2247                         char buf[DECIMAL_STR_MAX(uint64_t) + 1];
2248                         uint64_t val;
2249
2250                         if (unit_has_unified_memory_config(u)) {
2251                                 val = c->memory_max;
2252                                 if (val != CGROUP_LIMIT_MAX)
2253                                         log_cgroup_compat(u, "Applying MemoryMax=%" PRIu64 " as MemoryLimit=", val);
2254                         } else
2255                                 val = c->memory_limit;
2256
2257                         if (val == CGROUP_LIMIT_MAX)
2258                                 strncpy(buf, "-1\n", sizeof(buf));
2259                         else
2260                                 xsprintf(buf, "%" PRIu64 "\n", val);
2261
2262                         (void) set_attribute_and_warn(u, "memory", "memory.limit_in_bytes", buf);
2263                 }
2264         }
2265
2266         /* On cgroup v2 we can apply BPF everywhere. On cgroup v1 we apply it everywhere except for the root of
2267          * containers, where we leave this to the manager */
2268         if ((apply_mask & (CGROUP_MASK_DEVICES | CGROUP_MASK_BPF_DEVICES)) &&
2269             (is_host_root || cg_all_unified() > 0 || !is_local_root))
2270                 (void) cgroup_apply_devices(u);
2271
2272         if (apply_mask & CGROUP_MASK_PIDS) {
2273
2274                 if (is_host_root) {
2275                         /* So, the "pids" controller does not expose anything on the root cgroup, in order not to
2276                          * replicate knobs exposed elsewhere needlessly. We abstract this away here however, and when
2277                          * the knobs of the root cgroup are modified propagate this to the relevant sysctls. There's a
2278                          * non-obvious asymmetry however: unlike the cgroup properties we don't really want to take
2279                          * exclusive ownership of the sysctls, but we still want to honour things if the user sets
2280                          * limits. Hence we employ sort of a one-way strategy: when the user sets a bounded limit
2281                          * through us it counts. When the user afterwards unsets it again (i.e. sets it to unbounded)
2282                          * it also counts. But if the user never set a limit through us (i.e. we are the default of
2283                          * "unbounded") we leave things unmodified. For this we manage a global boolean that we turn on
2284                          * the first time we set a limit. Note that this boolean is flushed out on manager reload,
2285                          * which is desirable so that there's an official way to release control of the sysctl from
2286                          * systemd: set the limit to unbounded and reload. */
2287
2288                         if (cgroup_tasks_max_isset(&c->tasks_max)) {
2289                                 u->manager->sysctl_pid_max_changed = true;
2290                                 r = procfs_tasks_set_limit(cgroup_tasks_max_resolve(&c->tasks_max));
2291                         } else if (u->manager->sysctl_pid_max_changed)
2292                                 r = procfs_tasks_set_limit(TASKS_MAX);
2293                         else
2294                                 r = 0;
2295                         if (r < 0)
2296                                 log_unit_full_errno(u, LOG_LEVEL_CGROUP_WRITE(r), r,
2297                                                     "Failed to write to tasks limit sysctls: %m");
2298                 }
2299
2300                 /* The attribute itself is not available on the host root cgroup, and in the container case we want to
2301                  * leave it for the container manager. */
2302                 if (!is_local_root) {
2303                         if (cgroup_tasks_max_isset(&c->tasks_max)) {
2304                                 char buf[DECIMAL_STR_MAX(uint64_t) + 1];
2305
2306                                 xsprintf(buf, "%" PRIu64 "\n", cgroup_tasks_max_resolve(&c->tasks_max));
2307                                 (void) set_attribute_and_warn(u, "pids", "pids.max", buf);
2308                         } else
2309                                 (void) set_attribute_and_warn(u, "pids", "pids.max", "max\n");
2310                 }
2311         }
2312
2313         if (apply_mask & CGROUP_MASK_BPF_FIREWALL)
2314                 cgroup_apply_firewall(u);
2315
2316         if (apply_mask & CGROUP_MASK_BPF_FOREIGN)
2317                 cgroup_apply_bpf_foreign_program(u);
2318
2319         if (apply_mask & CGROUP_MASK_BPF_SOCKET_BIND)
2320                 cgroup_apply_socket_bind(u);
2321
2322         if (apply_mask & CGROUP_MASK_BPF_RESTRICT_NETWORK_INTERFACES)
2323                 cgroup_apply_restrict_network_interfaces(u);
2324
2325         unit_modify_nft_set(u, /* add = */ true);
2326 }
2327
2328 static bool unit_get_needs_bpf_firewall(Unit *u) {
2329         CGroupContext *c;
2330         assert(u);
2331
2332         c = unit_get_cgroup_context(u);
2333         if (!c)
2334                 return false;
2335
2336         if (c->ip_accounting ||
2337             !set_isempty(c->ip_address_allow) ||
2338             !set_isempty(c->ip_address_deny) ||
2339             c->ip_filters_ingress ||
2340             c->ip_filters_egress)
2341                 return true;
2342
2343         /* If any parent slice has an IP access list defined, it applies too */
2344         for (Unit *p = UNIT_GET_SLICE(u); p; p = UNIT_GET_SLICE(p)) {
2345                 c = unit_get_cgroup_context(p);
2346                 if (!c)
2347                         return false;
2348
2349                 if (!set_isempty(c->ip_address_allow) ||
2350                     !set_isempty(c->ip_address_deny))
2351                         return true;
2352         }
2353
2354         return false;
2355 }
2356
2357 static bool unit_get_needs_bpf_foreign_program(Unit *u) {
2358         CGroupContext *c;
2359         assert(u);
2360
2361         c = unit_get_cgroup_context(u);
2362         if (!c)
2363                 return false;
2364
2365         return !!c->bpf_foreign_programs;
2366 }
2367
2368 static bool unit_get_needs_socket_bind(Unit *u) {
2369         CGroupContext *c;
2370         assert(u);
2371
2372         c = unit_get_cgroup_context(u);
2373         if (!c)
2374                 return false;
2375
2376         return c->socket_bind_allow || c->socket_bind_deny;
2377 }
2378
2379 static bool unit_get_needs_restrict_network_interfaces(Unit *u) {
2380         CGroupContext *c;
2381         assert(u);
2382
2383         c = unit_get_cgroup_context(u);
2384         if (!c)
2385                 return false;
2386
2387         return !set_isempty(c->restrict_network_interfaces);
2388 }
2389
2390 static CGroupMask unit_get_cgroup_mask(Unit *u) {
2391         CGroupMask mask = 0;
2392         CGroupContext *c;
2393
2394         assert(u);
2395
2396         assert_se(c = unit_get_cgroup_context(u));
2397
2398         /* Figure out which controllers we need, based on the cgroup context object */
2399
2400         if (c->cpu_accounting)
2401                 mask |= get_cpu_accounting_mask();
2402
2403         if (cgroup_context_has_cpu_weight(c) ||
2404             cgroup_context_has_cpu_shares(c) ||
2405             c->cpu_quota_per_sec_usec != USEC_INFINITY)
2406                 mask |= CGROUP_MASK_CPU;
2407
2408         if (cgroup_context_has_allowed_cpus(c) || cgroup_context_has_allowed_mems(c))
2409                 mask |= CGROUP_MASK_CPUSET;
2410
2411         if (cgroup_context_has_io_config(c) || cgroup_context_has_blockio_config(c))
2412                 mask |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
2413
2414         if (c->memory_accounting ||
2415             c->memory_limit != CGROUP_LIMIT_MAX ||
2416             unit_has_unified_memory_config(u))
2417                 mask |= CGROUP_MASK_MEMORY;
2418
2419         if (c->device_allow ||
2420             c->device_policy != CGROUP_DEVICE_POLICY_AUTO)
2421                 mask |= CGROUP_MASK_DEVICES | CGROUP_MASK_BPF_DEVICES;
2422
2423         if (c->tasks_accounting ||
2424             cgroup_tasks_max_isset(&c->tasks_max))
2425                 mask |= CGROUP_MASK_PIDS;
2426
2427         return CGROUP_MASK_EXTEND_JOINED(mask);
2428 }
2429
2430 static CGroupMask unit_get_bpf_mask(Unit *u) {
2431         CGroupMask mask = 0;
2432
2433         /* Figure out which controllers we need, based on the cgroup context, possibly taking into account children
2434          * too. */
2435
2436         if (unit_get_needs_bpf_firewall(u))
2437                 mask |= CGROUP_MASK_BPF_FIREWALL;
2438
2439         if (unit_get_needs_bpf_foreign_program(u))
2440                 mask |= CGROUP_MASK_BPF_FOREIGN;
2441
2442         if (unit_get_needs_socket_bind(u))
2443                 mask |= CGROUP_MASK_BPF_SOCKET_BIND;
2444
2445         if (unit_get_needs_restrict_network_interfaces(u))
2446                 mask |= CGROUP_MASK_BPF_RESTRICT_NETWORK_INTERFACES;
2447
2448         return mask;
2449 }
2450
2451 CGroupMask unit_get_own_mask(Unit *u) {
2452         CGroupContext *c;
2453
2454         /* Returns the mask of controllers the unit needs for itself. If a unit is not properly loaded, return an empty
2455          * mask, as we shouldn't reflect it in the cgroup hierarchy then. */
2456
2457         if (u->load_state != UNIT_LOADED)
2458                 return 0;
2459
2460         c = unit_get_cgroup_context(u);
2461         if (!c)
2462                 return 0;
2463
2464         return unit_get_cgroup_mask(u) | unit_get_bpf_mask(u) | unit_get_delegate_mask(u);
2465 }
2466
2467 CGroupMask unit_get_delegate_mask(Unit *u) {
2468         CGroupContext *c;
2469
2470         /* If delegation is turned on, then turn on selected controllers, unless we are on the legacy hierarchy and the
2471          * process we fork into is known to drop privileges, and hence shouldn't get access to the controllers.
2472          *
2473          * Note that on the unified hierarchy it is safe to delegate controllers to unprivileged services. */
2474
2475         if (!unit_cgroup_delegate(u))
2476                 return 0;
2477
2478         if (cg_all_unified() <= 0) {
2479                 ExecContext *e;
2480
2481                 e = unit_get_exec_context(u);
2482                 if (e && !exec_context_maintains_privileges(e))
2483                         return 0;
2484         }
2485
2486         assert_se(c = unit_get_cgroup_context(u));
2487         return CGROUP_MASK_EXTEND_JOINED(c->delegate_controllers);
2488 }
2489
2490 static CGroupMask unit_get_subtree_mask(Unit *u) {
2491
2492         /* Returns the mask of this subtree, meaning of the group
2493          * itself and its children. */
2494
2495         return unit_get_own_mask(u) | unit_get_members_mask(u);
2496 }
2497
2498 CGroupMask unit_get_members_mask(Unit *u) {
2499         assert(u);
2500
2501         /* Returns the mask of controllers all of the unit's children require, merged */
2502
2503         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
2504         if (crt && crt->cgroup_members_mask_valid)
2505                 return crt->cgroup_members_mask; /* Use cached value if possible */
2506
2507         CGroupMask m = 0;
2508         if (u->type == UNIT_SLICE) {
2509                 Unit *member;
2510
2511                 UNIT_FOREACH_DEPENDENCY(member, u, UNIT_ATOM_SLICE_OF)
2512                         m |= unit_get_subtree_mask(member); /* note that this calls ourselves again, for the children */
2513         }
2514
2515         if (crt) {
2516                 crt->cgroup_members_mask = m;
2517                 crt->cgroup_members_mask_valid = true;
2518         }
2519
2520         return m;
2521 }
2522
2523 CGroupMask unit_get_siblings_mask(Unit *u) {
2524         Unit *slice;
2525         assert(u);
2526
2527         /* Returns the mask of controllers all of the unit's siblings
2528          * require, i.e. the members mask of the unit's parent slice
2529          * if there is one. */
2530
2531         slice = UNIT_GET_SLICE(u);
2532         if (slice)
2533                 return unit_get_members_mask(slice);
2534
2535         return unit_get_subtree_mask(u); /* we are the top-level slice */
2536 }
2537
2538 static CGroupMask unit_get_disable_mask(Unit *u) {
2539         CGroupContext *c;
2540
2541         c = unit_get_cgroup_context(u);
2542         if (!c)
2543                 return 0;
2544
2545         return c->disable_controllers;
2546 }
2547
2548 CGroupMask unit_get_ancestor_disable_mask(Unit *u) {
2549         CGroupMask mask;
2550         Unit *slice;
2551
2552         assert(u);
2553         mask = unit_get_disable_mask(u);
2554
2555         /* Returns the mask of controllers which are marked as forcibly
2556          * disabled in any ancestor unit or the unit in question. */
2557
2558         slice = UNIT_GET_SLICE(u);
2559         if (slice)
2560                 mask |= unit_get_ancestor_disable_mask(slice);
2561
2562         return mask;
2563 }
2564
2565 CGroupMask unit_get_target_mask(Unit *u) {
2566         CGroupMask own_mask, mask;
2567
2568         /* This returns the cgroup mask of all controllers to enable for a specific cgroup, i.e. everything
2569          * it needs itself, plus all that its children need, plus all that its siblings need. This is
2570          * primarily useful on the legacy cgroup hierarchy, where we need to duplicate each cgroup in each
2571          * hierarchy that shall be enabled for it. */
2572
2573         own_mask = unit_get_own_mask(u);
2574
2575         if (own_mask & CGROUP_MASK_BPF_FIREWALL & ~u->manager->cgroup_supported)
2576                 emit_bpf_firewall_warning(u);
2577
2578         mask = own_mask | unit_get_members_mask(u) | unit_get_siblings_mask(u);
2579
2580         mask &= u->manager->cgroup_supported;
2581         mask &= ~unit_get_ancestor_disable_mask(u);
2582
2583         return mask;
2584 }
2585
2586 CGroupMask unit_get_enable_mask(Unit *u) {
2587         CGroupMask mask;
2588
2589         /* This returns the cgroup mask of all controllers to enable
2590          * for the children of a specific cgroup. This is primarily
2591          * useful for the unified cgroup hierarchy, where each cgroup
2592          * controls which controllers are enabled for its children. */
2593
2594         mask = unit_get_members_mask(u);
2595         mask &= u->manager->cgroup_supported;
2596         mask &= ~unit_get_ancestor_disable_mask(u);
2597
2598         return mask;
2599 }
2600
2601 void unit_invalidate_cgroup_members_masks(Unit *u) {
2602         Unit *slice;
2603
2604         assert(u);
2605
2606         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
2607         if (!crt)
2608                 return;
2609
2610         /* Recurse invalidate the member masks cache all the way up the tree */
2611         crt->cgroup_members_mask_valid = false;
2612
2613         slice = UNIT_GET_SLICE(u);
2614         if (slice)
2615                 unit_invalidate_cgroup_members_masks(slice);
2616 }
2617
2618 const char *unit_get_realized_cgroup_path(Unit *u, CGroupMask mask) {
2619
2620         /* Returns the realized cgroup path of the specified unit where all specified controllers are available. */
2621
2622         while (u) {
2623                 CGroupRuntime *crt = unit_get_cgroup_runtime(u);
2624                 if (crt &&
2625                     crt->cgroup_path &&
2626                     crt->cgroup_realized &&
2627                     FLAGS_SET(crt->cgroup_realized_mask, mask))
2628                         return crt->cgroup_path;
2629
2630                 u = UNIT_GET_SLICE(u);
2631         }
2632
2633         return NULL;
2634 }
2635
2636 static const char *migrate_callback(CGroupMask mask, void *userdata) {
2637         /* If not realized at all, migrate to root ("").
2638          * It may happen if we're upgrading from older version that didn't clean up.
2639          */
2640         return strempty(unit_get_realized_cgroup_path(userdata, mask));
2641 }
2642
2643 int unit_default_cgroup_path(const Unit *u, char **ret) {
2644         _cleanup_free_ char *p = NULL;
2645         int r;
2646
2647         assert(u);
2648         assert(ret);
2649
2650         if (unit_has_name(u, SPECIAL_ROOT_SLICE))
2651                 p = strdup(u->manager->cgroup_root);
2652         else {
2653                 _cleanup_free_ char *escaped = NULL, *slice_path = NULL;
2654                 Unit *slice;
2655
2656                 slice = UNIT_GET_SLICE(u);
2657                 if (slice && !unit_has_name(slice, SPECIAL_ROOT_SLICE)) {
2658                         r = cg_slice_to_path(slice->id, &slice_path);
2659                         if (r < 0)
2660                                 return r;
2661                 }
2662
2663                 r = cg_escape(u->id, &escaped);
2664                 if (r < 0)
2665                         return r;
2666
2667                 p = path_join(empty_to_root(u->manager->cgroup_root), slice_path, escaped);
2668         }
2669         if (!p)
2670                 return -ENOMEM;
2671
2672         *ret = TAKE_PTR(p);
2673         return 0;
2674 }
2675
2676 int unit_set_cgroup_path(Unit *u, const char *path) {
2677         _cleanup_free_ char *p = NULL;
2678         CGroupRuntime *crt;
2679         int r;
2680
2681         assert(u);
2682
2683         crt = unit_get_cgroup_runtime(u);
2684
2685         if (crt && streq_ptr(crt->cgroup_path, path))
2686                 return 0;
2687
2688         unit_release_cgroup(u);
2689
2690         crt = unit_setup_cgroup_runtime(u);
2691         if (!crt)
2692                 return -ENOMEM;
2693
2694         if (path) {
2695                 p = strdup(path);
2696                 if (!p)
2697                         return -ENOMEM;
2698
2699                 r = hashmap_put(u->manager->cgroup_unit, p, u);
2700                 if (r < 0)
2701                         return r;
2702         }
2703
2704         assert(!crt->cgroup_path);
2705         crt->cgroup_path = TAKE_PTR(p);
2706
2707         return 1;
2708 }
2709
2710 int unit_watch_cgroup(Unit *u) {
2711         _cleanup_free_ char *events = NULL;
2712         int r;
2713
2714         assert(u);
2715
2716         /* Watches the "cgroups.events" attribute of this unit's cgroup for "empty" events, but only if
2717          * cgroupv2 is available. */
2718
2719         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
2720         if (!crt || !crt->cgroup_path)
2721                 return 0;
2722
2723         if (crt->cgroup_control_inotify_wd >= 0)
2724                 return 0;
2725
2726         /* Only applies to the unified hierarchy */
2727         r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
2728         if (r < 0)
2729                 return log_error_errno(r, "Failed to determine whether the name=systemd hierarchy is unified: %m");
2730         if (r == 0)
2731                 return 0;
2732
2733         /* No point in watch the top-level slice, it's never going to run empty. */
2734         if (unit_has_name(u, SPECIAL_ROOT_SLICE))
2735                 return 0;
2736
2737         r = hashmap_ensure_allocated(&u->manager->cgroup_control_inotify_wd_unit, &trivial_hash_ops);
2738         if (r < 0)
2739                 return log_oom();
2740
2741         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, crt->cgroup_path, "cgroup.events", &events);
2742         if (r < 0)
2743                 return log_oom();
2744
2745         crt->cgroup_control_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY);
2746         if (crt->cgroup_control_inotify_wd < 0) {
2747
2748                 if (errno == ENOENT) /* If the directory is already gone we don't need to track it, so this
2749                                       * is not an error */
2750                         return 0;
2751
2752                 return log_unit_error_errno(u, errno, "Failed to add control inotify watch descriptor for control group %s: %m", empty_to_root(crt->cgroup_path));
2753         }
2754
2755         r = hashmap_put(u->manager->cgroup_control_inotify_wd_unit, INT_TO_PTR(crt->cgroup_control_inotify_wd), u);
2756         if (r < 0)
2757                 return log_unit_error_errno(u, r, "Failed to add control inotify watch descriptor for control group %s to hash map: %m", empty_to_root(crt->cgroup_path));
2758
2759         return 0;
2760 }
2761
2762 int unit_watch_cgroup_memory(Unit *u) {
2763         _cleanup_free_ char *events = NULL;
2764         int r;
2765
2766         assert(u);
2767
2768         /* Watches the "memory.events" attribute of this unit's cgroup for "oom_kill" events, but only if
2769          * cgroupv2 is available. */
2770
2771         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
2772         if (!crt || !crt->cgroup_path)
2773                 return 0;
2774
2775         CGroupContext *c = unit_get_cgroup_context(u);
2776         if (!c)
2777                 return 0;
2778
2779         /* The "memory.events" attribute is only available if the memory controller is on. Let's hence tie
2780          * this to memory accounting, in a way watching for OOM kills is a form of memory accounting after
2781          * all. */
2782         if (!c->memory_accounting)
2783                 return 0;
2784
2785         /* Don't watch inner nodes, as the kernel doesn't report oom_kill events recursively currently, and
2786          * we also don't want to generate a log message for each parent cgroup of a process. */
2787         if (u->type == UNIT_SLICE)
2788                 return 0;
2789
2790         if (crt->cgroup_memory_inotify_wd >= 0)
2791                 return 0;
2792
2793         /* Only applies to the unified hierarchy */
2794         r = cg_all_unified();
2795         if (r < 0)
2796                 return log_error_errno(r, "Failed to determine whether the memory controller is unified: %m");
2797         if (r == 0)
2798                 return 0;
2799
2800         r = hashmap_ensure_allocated(&u->manager->cgroup_memory_inotify_wd_unit, &trivial_hash_ops);
2801         if (r < 0)
2802                 return log_oom();
2803
2804         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, crt->cgroup_path, "memory.events", &events);
2805         if (r < 0)
2806                 return log_oom();
2807
2808         crt->cgroup_memory_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY);
2809         if (crt->cgroup_memory_inotify_wd < 0) {
2810
2811                 if (errno == ENOENT) /* If the directory is already gone we don't need to track it, so this
2812                                       * is not an error */
2813                         return 0;
2814
2815                 return log_unit_error_errno(u, errno, "Failed to add memory inotify watch descriptor for control group %s: %m", empty_to_root(crt->cgroup_path));
2816         }
2817
2818         r = hashmap_put(u->manager->cgroup_memory_inotify_wd_unit, INT_TO_PTR(crt->cgroup_memory_inotify_wd), u);
2819         if (r < 0)
2820                 return log_unit_error_errno(u, r, "Failed to add memory inotify watch descriptor for control group %s to hash map: %m", empty_to_root(crt->cgroup_path));
2821
2822         return 0;
2823 }
2824
2825 int unit_pick_cgroup_path(Unit *u) {
2826         _cleanup_free_ char *path = NULL;
2827         int r;
2828
2829         assert(u);
2830
2831         if (!UNIT_HAS_CGROUP_CONTEXT(u))
2832                 return -EINVAL;
2833
2834         CGroupRuntime *crt = unit_setup_cgroup_runtime(u);
2835         if (!crt)
2836                 return -ENOMEM;
2837         if (crt->cgroup_path)
2838                 return 0;
2839
2840         r = unit_default_cgroup_path(u, &path);
2841         if (r < 0)
2842                 return log_unit_error_errno(u, r, "Failed to generate default cgroup path: %m");
2843
2844         r = unit_set_cgroup_path(u, path);
2845         if (r == -EEXIST)
2846                 return log_unit_error_errno(u, r, "Control group %s exists already.", empty_to_root(path));
2847         if (r < 0)
2848                 return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", empty_to_root(path));
2849
2850         return 0;
2851 }
2852
2853 static int unit_update_cgroup(
2854                 Unit *u,
2855                 CGroupMask target_mask,
2856                 CGroupMask enable_mask,
2857                 ManagerState state) {
2858
2859         bool created, is_root_slice;
2860         CGroupMask migrate_mask = 0;
2861         _cleanup_free_ char *cgroup_full_path = NULL;
2862         int r;
2863
2864         assert(u);
2865
2866         if (!UNIT_HAS_CGROUP_CONTEXT(u))
2867                 return 0;
2868
2869         /* Figure out our cgroup path */
2870         r = unit_pick_cgroup_path(u);
2871         if (r < 0)
2872                 return r;
2873
2874         CGroupRuntime *crt = ASSERT_PTR(unit_get_cgroup_runtime(u));
2875
2876         /* First, create our own group */
2877         r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, crt->cgroup_path);
2878         if (r < 0)
2879                 return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", empty_to_root(crt->cgroup_path));
2880         created = r;
2881
2882         if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
2883                 uint64_t cgroup_id = 0;
2884
2885                 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, crt->cgroup_path, NULL, &cgroup_full_path);
2886                 if (r == 0) {
2887                         r = cg_path_get_cgroupid(cgroup_full_path, &cgroup_id);
2888                         if (r < 0)
2889                                 log_unit_full_errno(u, ERRNO_IS_NOT_SUPPORTED(r) ? LOG_DEBUG : LOG_WARNING, r,
2890                                                     "Failed to get cgroup ID of cgroup %s, ignoring: %m", cgroup_full_path);
2891                 } else
2892                         log_unit_warning_errno(u, r, "Failed to get full cgroup path on cgroup %s, ignoring: %m", empty_to_root(crt->cgroup_path));
2893
2894                 crt->cgroup_id = cgroup_id;
2895         }
2896
2897         /* Start watching it */
2898         (void) unit_watch_cgroup(u);
2899         (void) unit_watch_cgroup_memory(u);
2900
2901         /* For v2 we preserve enabled controllers in delegated units, adjust others,
2902          * for v1 we figure out which controller hierarchies need migration. */
2903         if (created || !crt->cgroup_realized || !unit_cgroup_delegate(u)) {
2904                 CGroupMask result_mask = 0;
2905
2906                 /* Enable all controllers we need */
2907                 r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, crt->cgroup_path, &result_mask);
2908                 if (r < 0)
2909                         log_unit_warning_errno(u, r, "Failed to enable/disable controllers on cgroup %s, ignoring: %m", empty_to_root(crt->cgroup_path));
2910
2911                 /* Remember what's actually enabled now */
2912                 crt->cgroup_enabled_mask = result_mask;
2913
2914                 migrate_mask = crt->cgroup_realized_mask ^ target_mask;
2915         }
2916
2917         /* Keep track that this is now realized */
2918         crt->cgroup_realized = true;
2919         crt->cgroup_realized_mask = target_mask;
2920
2921         /* Migrate processes in controller hierarchies both downwards (enabling) and upwards (disabling).
2922          *
2923          * Unnecessary controller cgroups are trimmed (after emptied by upward migration).
2924          * We perform migration also with whole slices for cases when users don't care about leave
2925          * granularity. Since delegated_mask is subset of target mask, we won't trim slice subtree containing
2926          * delegated units.
2927          */
2928         if (cg_all_unified() == 0) {
2929                 r = cg_migrate_v1_controllers(u->manager->cgroup_supported, migrate_mask, crt->cgroup_path, migrate_callback, u);
2930                 if (r < 0)
2931                         log_unit_warning_errno(u, r, "Failed to migrate controller cgroups from %s, ignoring: %m", empty_to_root(crt->cgroup_path));
2932
2933                 is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);
2934                 r = cg_trim_v1_controllers(u->manager->cgroup_supported, ~target_mask, crt->cgroup_path, !is_root_slice);
2935                 if (r < 0)
2936                         log_unit_warning_errno(u, r, "Failed to delete controller cgroups %s, ignoring: %m", empty_to_root(crt->cgroup_path));
2937         }
2938
2939         /* Set attributes */
2940         cgroup_context_apply(u, target_mask, state);
2941         cgroup_xattr_apply(u);
2942
2943         /* For most units we expect that memory monitoring is set up before the unit is started and we won't
2944          * touch it after. For PID 1 this is different though, because we couldn't possibly do that given
2945          * that PID 1 runs before init.scope is even set up. Hence, whenever init.scope is realized, let's
2946          * try to open the memory pressure interface anew. */
2947         if (unit_has_name(u, SPECIAL_INIT_SCOPE))
2948                 (void) manager_setup_memory_pressure_event_source(u->manager);
2949
2950         return 0;
2951 }
2952
2953 static int unit_attach_pid_to_cgroup_via_bus(Unit *u, pid_t pid, const char *suffix_path) {
2954         _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
2955         char *pp;
2956         int r;
2957
2958         assert(u);
2959
2960         if (MANAGER_IS_SYSTEM(u->manager))
2961                 return -EINVAL;
2962
2963         if (!u->manager->system_bus)
2964                 return -EIO;
2965
2966         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
2967         if (!crt || !crt->cgroup_path)
2968                 return -EOWNERDEAD;
2969
2970         /* Determine this unit's cgroup path relative to our cgroup root */
2971         pp = path_startswith(crt->cgroup_path, u->manager->cgroup_root);
2972         if (!pp)
2973                 return -EINVAL;
2974
2975         pp = strjoina("/", pp, suffix_path);
2976         path_simplify(pp);
2977
2978         r = bus_call_method(u->manager->system_bus,
2979                             bus_systemd_mgr,
2980                             "AttachProcessesToUnit",
2981                             &error, NULL,
2982                             "ssau",
2983                             NULL /* empty unit name means client's unit, i.e. us */, pp, 1, (uint32_t) pid);
2984         if (r < 0)
2985                 return log_unit_debug_errno(u, r, "Failed to attach unit process " PID_FMT " via the bus: %s", pid, bus_error_message(&error, r));
2986
2987         return 0;
2988 }
2989
2990 int unit_attach_pids_to_cgroup(Unit *u, Set *pids, const char *suffix_path) {
2991         _cleanup_free_ char *joined = NULL;
2992         CGroupMask delegated_mask;
2993         const char *p;
2994         PidRef *pid;
2995         int ret, r;
2996
2997         assert(u);
2998
2999         if (!UNIT_HAS_CGROUP_CONTEXT(u))
3000                 return -EINVAL;
3001
3002         if (set_isempty(pids))
3003                 return 0;
3004
3005         /* Load any custom firewall BPF programs here once to test if they are existing and actually loadable.
3006          * Fail here early since later errors in the call chain unit_realize_cgroup to cgroup_context_apply are ignored. */
3007         r = bpf_firewall_load_custom(u);
3008         if (r < 0)
3009                 return r;
3010
3011         r = unit_realize_cgroup(u);
3012         if (r < 0)
3013                 return r;
3014
3015         CGroupRuntime *crt = ASSERT_PTR(unit_get_cgroup_runtime(u));
3016
3017         if (isempty(suffix_path))
3018                 p = crt->cgroup_path;
3019         else {
3020                 joined = path_join(crt->cgroup_path, suffix_path);
3021                 if (!joined)
3022                         return -ENOMEM;
3023
3024                 p = joined;
3025         }
3026
3027         delegated_mask = unit_get_delegate_mask(u);
3028
3029         ret = 0;
3030         SET_FOREACH(pid, pids) {
3031
3032                 /* Unfortunately we cannot add pids by pidfd to a cgroup. Hence we have to use PIDs instead,
3033                  * which of course is racy. Let's shorten the race a bit though, and re-validate the PID
3034                  * before we use it */
3035                 r = pidref_verify(pid);
3036                 if (r < 0) {
3037                         log_unit_info_errno(u, r, "PID " PID_FMT " vanished before we could move it to target cgroup '%s', skipping: %m", pid->pid, empty_to_root(p));
3038                         continue;
3039                 }
3040
3041                 /* First, attach the PID to the main cgroup hierarchy */
3042                 r = cg_attach(SYSTEMD_CGROUP_CONTROLLER, p, pid->pid);
3043                 if (r < 0) {
3044                         bool again = MANAGER_IS_USER(u->manager) && ERRNO_IS_PRIVILEGE(r);
3045
3046                         log_unit_full_errno(u, again ? LOG_DEBUG : LOG_INFO,  r,
3047                                             "Couldn't move process "PID_FMT" to%s requested cgroup '%s': %m",
3048                                             pid->pid, again ? " directly" : "", empty_to_root(p));
3049
3050                         if (again) {
3051                                 int z;
3052
3053                                 /* If we are in a user instance, and we can't move the process ourselves due
3054                                  * to permission problems, let's ask the system instance about it instead.
3055                                  * Since it's more privileged it might be able to move the process across the
3056                                  * leaves of a subtree whose top node is not owned by us. */
3057
3058                                 z = unit_attach_pid_to_cgroup_via_bus(u, pid->pid, suffix_path);
3059                                 if (z < 0)
3060                                         log_unit_info_errno(u, z, "Couldn't move process "PID_FMT" to requested cgroup '%s' (directly or via the system bus): %m", pid->pid, empty_to_root(p));
3061                                 else {
3062                                         if (ret >= 0)
3063                                                 ret++; /* Count successful additions */
3064                                         continue; /* When the bus thing worked via the bus we are fully done for this PID. */
3065                                 }
3066                         }
3067
3068                         if (ret >= 0)
3069                                 ret = r; /* Remember first error */
3070
3071                         continue;
3072                 } else if (ret >= 0)
3073                         ret++; /* Count successful additions */
3074
3075                 r = cg_all_unified();
3076                 if (r < 0)
3077                         return r;
3078                 if (r > 0)
3079                         continue;
3080
3081                 /* In the legacy hierarchy, attach the process to the request cgroup if possible, and if not to the
3082                  * innermost realized one */
3083
3084                 for (CGroupController c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
3085                         CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
3086                         const char *realized;
3087
3088                         if (!(u->manager->cgroup_supported & bit))
3089                                 continue;
3090
3091                         /* If this controller is delegated and realized, honour the caller's request for the cgroup suffix. */
3092                         if (delegated_mask & crt->cgroup_realized_mask & bit) {
3093                                 r = cg_attach(cgroup_controller_to_string(c), p, pid->pid);
3094                                 if (r >= 0)
3095                                         continue; /* Success! */
3096
3097                                 log_unit_debug_errno(u, r, "Failed to attach PID " PID_FMT " to requested cgroup %s in controller %s, falling back to unit's cgroup: %m",
3098                                                      pid->pid, empty_to_root(p), cgroup_controller_to_string(c));
3099                         }
3100
3101                         /* So this controller is either not delegate or realized, or something else weird happened. In
3102                          * that case let's attach the PID at least to the closest cgroup up the tree that is
3103                          * realized. */
3104                         realized = unit_get_realized_cgroup_path(u, bit);
3105                         if (!realized)
3106                                 continue; /* Not even realized in the root slice? Then let's not bother */
3107
3108                         r = cg_attach(cgroup_controller_to_string(c), realized, pid->pid);
3109                         if (r < 0)
3110                                 log_unit_debug_errno(u, r, "Failed to attach PID " PID_FMT " to realized cgroup %s in controller %s, ignoring: %m",
3111                                                      pid->pid, realized, cgroup_controller_to_string(c));
3112                 }
3113         }
3114
3115         return ret;
3116 }
3117
3118 static bool unit_has_mask_realized(
3119                 Unit *u,
3120                 CGroupMask target_mask,
3121                 CGroupMask enable_mask) {
3122
3123         assert(u);
3124
3125         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
3126         if (!crt)
3127                 return false;
3128
3129         /* Returns true if this unit is fully realized. We check four things:
3130          *
3131          * 1. Whether the cgroup was created at all
3132          * 2. Whether the cgroup was created in all the hierarchies we need it to be created in (in case of cgroup v1)
3133          * 3. Whether the cgroup has all the right controllers enabled (in case of cgroup v2)
3134          * 4. Whether the invalidation mask is currently zero
3135          *
3136          * If you wonder why we mask the target realization and enable mask with CGROUP_MASK_V1/CGROUP_MASK_V2: note
3137          * that there are three sets of bitmasks: CGROUP_MASK_V1 (for real cgroup v1 controllers), CGROUP_MASK_V2 (for
3138          * real cgroup v2 controllers) and CGROUP_MASK_BPF (for BPF-based pseudo-controllers). Now, cgroup_realized_mask
3139          * is only matters for cgroup v1 controllers, and cgroup_enabled_mask only used for cgroup v2, and if they
3140          * differ in the others, we don't really care. (After all, the cgroup_enabled_mask tracks with controllers are
3141          * enabled through cgroup.subtree_control, and since the BPF pseudo-controllers don't show up there, they
3142          * simply don't matter. */
3143
3144         return crt->cgroup_realized &&
3145                 ((crt->cgroup_realized_mask ^ target_mask) & CGROUP_MASK_V1) == 0 &&
3146                 ((crt->cgroup_enabled_mask ^ enable_mask) & CGROUP_MASK_V2) == 0 &&
3147                 crt->cgroup_invalidated_mask == 0;
3148 }
3149
3150 static bool unit_has_mask_disables_realized(
3151                 Unit *u,
3152                 CGroupMask target_mask,
3153                 CGroupMask enable_mask) {
3154
3155         assert(u);
3156
3157         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
3158         if (!crt)
3159                 return true;
3160
3161         /* Returns true if all controllers which should be disabled are indeed disabled.
3162          *
3163          * Unlike unit_has_mask_realized, we don't care what was enabled, only that anything we want to remove is
3164          * already removed. */
3165
3166         return !crt->cgroup_realized ||
3167                 (FLAGS_SET(crt->cgroup_realized_mask, target_mask & CGROUP_MASK_V1) &&
3168                  FLAGS_SET(crt->cgroup_enabled_mask, enable_mask & CGROUP_MASK_V2));
3169 }
3170
3171 static bool unit_has_mask_enables_realized(
3172                 Unit *u,
3173                 CGroupMask target_mask,
3174                 CGroupMask enable_mask) {
3175
3176         assert(u);
3177
3178         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
3179         if (!crt)
3180                 return false;
3181
3182         /* Returns true if all controllers which should be enabled are indeed enabled.
3183          *
3184          * Unlike unit_has_mask_realized, we don't care about the controllers that are not present, only that anything
3185          * we want to add is already added. */
3186
3187         return crt->cgroup_realized &&
3188                 ((crt->cgroup_realized_mask | target_mask) & CGROUP_MASK_V1) == (crt->cgroup_realized_mask & CGROUP_MASK_V1) &&
3189                 ((crt->cgroup_enabled_mask | enable_mask) & CGROUP_MASK_V2) == (crt->cgroup_enabled_mask & CGROUP_MASK_V2);
3190 }
3191
3192 void unit_add_to_cgroup_realize_queue(Unit *u) {
3193         assert(u);
3194
3195         if (u->in_cgroup_realize_queue)
3196                 return;
3197
3198         LIST_APPEND(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
3199         u->in_cgroup_realize_queue = true;
3200 }
3201
3202 static void unit_remove_from_cgroup_realize_queue(Unit *u) {
3203         assert(u);
3204
3205         if (!u->in_cgroup_realize_queue)
3206                 return;
3207
3208         LIST_REMOVE(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
3209         u->in_cgroup_realize_queue = false;
3210 }
3211
3212 /* Controllers can only be enabled breadth-first, from the root of the
3213  * hierarchy downwards to the unit in question. */
3214 static int unit_realize_cgroup_now_enable(Unit *u, ManagerState state) {
3215         CGroupMask target_mask, enable_mask, new_target_mask, new_enable_mask;
3216         Unit *slice;
3217         int r;
3218
3219         assert(u);
3220
3221         /* First go deal with this unit's parent, or we won't be able to enable
3222          * any new controllers at this layer. */
3223         slice = UNIT_GET_SLICE(u);
3224         if (slice) {
3225                 r = unit_realize_cgroup_now_enable(slice, state);
3226                 if (r < 0)
3227                         return r;
3228         }
3229
3230         target_mask = unit_get_target_mask(u);
3231         enable_mask = unit_get_enable_mask(u);
3232
3233         /* We can only enable in this direction, don't try to disable anything.
3234          */
3235         if (unit_has_mask_enables_realized(u, target_mask, enable_mask))
3236                 return 0;
3237
3238         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
3239
3240         new_target_mask = (crt ? crt->cgroup_realized_mask : 0) | target_mask;
3241         new_enable_mask = (crt ? crt->cgroup_enabled_mask : 0) | enable_mask;
3242
3243         return unit_update_cgroup(u, new_target_mask, new_enable_mask, state);
3244 }
3245
3246 /* Controllers can only be disabled depth-first, from the leaves of the
3247  * hierarchy upwards to the unit in question. */
3248 static int unit_realize_cgroup_now_disable(Unit *u, ManagerState state) {
3249         Unit *m;
3250
3251         assert(u);
3252
3253         if (u->type != UNIT_SLICE)
3254                 return 0;
3255
3256         UNIT_FOREACH_DEPENDENCY(m, u, UNIT_ATOM_SLICE_OF) {
3257                 CGroupMask target_mask, enable_mask, new_target_mask, new_enable_mask;
3258                 int r;
3259
3260                 CGroupRuntime *rt = unit_get_cgroup_runtime(m);
3261                 if (!rt)
3262                         continue;
3263
3264                 /* The cgroup for this unit might not actually be fully realised yet, in which case it isn't
3265                  * holding any controllers open anyway. */
3266                 if (!rt->cgroup_realized)
3267                         continue;
3268
3269                 /* We must disable those below us first in order to release the controller. */
3270                 if (m->type == UNIT_SLICE)
3271                         (void) unit_realize_cgroup_now_disable(m, state);
3272
3273                 target_mask = unit_get_target_mask(m);
3274                 enable_mask = unit_get_enable_mask(m);
3275
3276                 /* We can only disable in this direction, don't try to enable anything. */
3277                 if (unit_has_mask_disables_realized(m, target_mask, enable_mask))
3278                         continue;
3279
3280                 new_target_mask = rt->cgroup_realized_mask & target_mask;
3281                 new_enable_mask = rt->cgroup_enabled_mask & enable_mask;
3282
3283                 r = unit_update_cgroup(m, new_target_mask, new_enable_mask, state);
3284                 if (r < 0)
3285                         return r;
3286         }
3287
3288         return 0;
3289 }
3290
3291 /* Check if necessary controllers and attributes for a unit are in place.
3292  *
3293  * - If so, do nothing.
3294  * - If not, create paths, move processes over, and set attributes.
3295  *
3296  * Controllers can only be *enabled* in a breadth-first way, and *disabled* in
3297  * a depth-first way. As such the process looks like this:
3298  *
3299  * Suppose we have a cgroup hierarchy which looks like this:
3300  *
3301  *             root
3302  *            /    \
3303  *           /      \
3304  *          /        \
3305  *         a          b
3306  *        / \        / \
3307  *       /   \      /   \
3308  *      c     d    e     f
3309  *     / \   / \  / \   / \
3310  *     h i   j k  l m   n o
3311  *
3312  * 1. We want to realise cgroup "d" now.
3313  * 2. cgroup "a" has DisableControllers=cpu in the associated unit.
3314  * 3. cgroup "k" just started requesting the memory controller.
3315  *
3316  * To make this work we must do the following in order:
3317  *
3318  * 1. Disable CPU controller in k, j
3319  * 2. Disable CPU controller in d
3320  * 3. Enable memory controller in root
3321  * 4. Enable memory controller in a
3322  * 5. Enable memory controller in d
3323  * 6. Enable memory controller in k
3324  *
3325  * Notice that we need to touch j in one direction, but not the other. We also
3326  * don't go beyond d when disabling -- it's up to "a" to get realized if it
3327  * wants to disable further. The basic rules are therefore:
3328  *
3329  * - If you're disabling something, you need to realise all of the cgroups from
3330  *   your recursive descendants to the root. This starts from the leaves.
3331  * - If you're enabling something, you need to realise from the root cgroup
3332  *   downwards, but you don't need to iterate your recursive descendants.
3333  *
3334  * Returns 0 on success and < 0 on failure. */
3335 static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
3336         CGroupMask target_mask, enable_mask;
3337         Unit *slice;
3338         int r;
3339
3340         assert(u);
3341
3342         unit_remove_from_cgroup_realize_queue(u);
3343
3344         target_mask = unit_get_target_mask(u);
3345         enable_mask = unit_get_enable_mask(u);
3346
3347         if (unit_has_mask_realized(u, target_mask, enable_mask))
3348                 return 0;
3349
3350         /* Disable controllers below us, if there are any */
3351         r = unit_realize_cgroup_now_disable(u, state);
3352         if (r < 0)
3353                 return r;
3354
3355         /* Enable controllers above us, if there are any */
3356         slice = UNIT_GET_SLICE(u);
3357         if (slice) {
3358                 r = unit_realize_cgroup_now_enable(slice, state);
3359                 if (r < 0)
3360                         return r;
3361         }
3362
3363         /* Now actually deal with the cgroup we were trying to realise and set attributes */
3364         r = unit_update_cgroup(u, target_mask, enable_mask, state);
3365         if (r < 0)
3366                 return r;
3367
3368         CGroupRuntime *crt = ASSERT_PTR(unit_get_cgroup_runtime(u));
3369
3370         /* Now, reset the invalidation mask */
3371         crt->cgroup_invalidated_mask = 0;
3372         return 0;
3373 }
3374
3375 unsigned manager_dispatch_cgroup_realize_queue(Manager *m) {
3376         ManagerState state;
3377         unsigned n = 0;
3378         Unit *i;
3379         int r;
3380
3381         assert(m);
3382
3383         state = manager_state(m);
3384
3385         while ((i = m->cgroup_realize_queue)) {
3386                 assert(i->in_cgroup_realize_queue);
3387
3388                 if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(i))) {
3389                         /* Maybe things changed, and the unit is not actually active anymore? */
3390                         unit_remove_from_cgroup_realize_queue(i);
3391                         continue;
3392                 }
3393
3394                 r = unit_realize_cgroup_now(i, state);
3395                 if (r < 0)
3396                         log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);
3397
3398                 n++;
3399         }
3400
3401         return n;
3402 }
3403
3404 void unit_add_family_to_cgroup_realize_queue(Unit *u) {
3405         assert(u);
3406         assert(u->type == UNIT_SLICE);
3407
3408         /* Family of a unit for is defined as (immediate) children of the unit and immediate children of all
3409          * its ancestors.
3410          *
3411          * Ideally we would enqueue ancestor path only (bottom up). However, on cgroup-v1 scheduling becomes
3412          * very weird if two units that own processes reside in the same slice, but one is realized in the
3413          * "cpu" hierarchy and one is not (for example because one has CPUWeight= set and the other does
3414          * not), because that means individual processes need to be scheduled against whole cgroups. Let's
3415          * avoid this asymmetry by always ensuring that siblings of a unit are always realized in their v1
3416          * controller hierarchies too (if unit requires the controller to be realized).
3417          *
3418          * The function must invalidate cgroup_members_mask of all ancestors in order to calculate up to date
3419          * masks. */
3420
3421         do {
3422                 CGroupRuntime *crt = unit_get_cgroup_runtime(u);
3423
3424                 /* Children of u likely changed when we're called */
3425                 if (crt)
3426                         crt->cgroup_members_mask_valid = false;
3427
3428                 Unit *m;
3429                 UNIT_FOREACH_DEPENDENCY(m, u, UNIT_ATOM_SLICE_OF) {
3430
3431                         /* No point in doing cgroup application for units without active processes. */
3432                         if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
3433                                 continue;
3434
3435                         /* We only enqueue siblings if they were realized once at least, in the main
3436                          * hierarchy. */
3437                         crt = unit_get_cgroup_runtime(m);
3438                         if (!crt || !crt->cgroup_realized)
3439                                 continue;
3440
3441                         /* If the unit doesn't need any new controllers and has current ones
3442                          * realized, it doesn't need any changes. */
3443                         if (unit_has_mask_realized(m,
3444                                                    unit_get_target_mask(m),
3445                                                    unit_get_enable_mask(m)))
3446                                 continue;
3447
3448                         unit_add_to_cgroup_realize_queue(m);
3449                 }
3450
3451                 /* Parent comes after children */
3452                 unit_add_to_cgroup_realize_queue(u);
3453
3454                 u = UNIT_GET_SLICE(u);
3455         } while (u);
3456 }
3457
3458 int unit_realize_cgroup(Unit *u) {
3459         Unit *slice;
3460
3461         assert(u);
3462
3463         if (!UNIT_HAS_CGROUP_CONTEXT(u))
3464                 return 0;
3465
3466         /* So, here's the deal: when realizing the cgroups for this unit, we need to first create all
3467          * parents, but there's more actually: for the weight-based controllers we also need to make sure
3468          * that all our siblings (i.e. units that are in the same slice as we are) have cgroups, too.  On the
3469          * other hand, when a controller is removed from realized set, it may become unnecessary in siblings
3470          * and ancestors and they should be (de)realized too.
3471          *
3472          * This call will defer work on the siblings and derealized ancestors to the next event loop
3473          * iteration and synchronously creates the parent cgroups (unit_realize_cgroup_now). */
3474
3475         slice = UNIT_GET_SLICE(u);
3476         if (slice)
3477                 unit_add_family_to_cgroup_realize_queue(slice);
3478
3479         /* And realize this one now (and apply the values) */
3480         return unit_realize_cgroup_now(u, manager_state(u->manager));
3481 }
3482
3483 void unit_release_cgroup(Unit *u) {
3484         assert(u);
3485
3486         /* Forgets all cgroup details for this cgroup — but does *not* destroy the cgroup. This is hence OK to call
3487          * when we close down everything for reexecution, where we really want to leave the cgroup in place. */
3488
3489         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
3490         if (!crt)
3491                 return;
3492
3493         if (crt->cgroup_path) {
3494                 (void) hashmap_remove(u->manager->cgroup_unit, crt->cgroup_path);
3495                 crt->cgroup_path = mfree(crt->cgroup_path);
3496         }
3497
3498         if (crt->cgroup_control_inotify_wd >= 0) {
3499                 if (inotify_rm_watch(u->manager->cgroup_inotify_fd, crt->cgroup_control_inotify_wd) < 0)
3500                         log_unit_debug_errno(u, errno, "Failed to remove cgroup control inotify watch %i for %s, ignoring: %m", crt->cgroup_control_inotify_wd, u->id);
3501
3502                 (void) hashmap_remove(u->manager->cgroup_control_inotify_wd_unit, INT_TO_PTR(crt->cgroup_control_inotify_wd));
3503                 crt->cgroup_control_inotify_wd = -1;
3504         }
3505
3506         if (crt->cgroup_memory_inotify_wd >= 0) {
3507                 if (inotify_rm_watch(u->manager->cgroup_inotify_fd, crt->cgroup_memory_inotify_wd) < 0)
3508                         log_unit_debug_errno(u, errno, "Failed to remove cgroup memory inotify watch %i for %s, ignoring: %m", crt->cgroup_memory_inotify_wd, u->id);
3509
3510                 (void) hashmap_remove(u->manager->cgroup_memory_inotify_wd_unit, INT_TO_PTR(crt->cgroup_memory_inotify_wd));
3511                 crt->cgroup_memory_inotify_wd = -1;
3512         }
3513
3514         *(CGroupRuntime**) ((uint8_t*) u + UNIT_VTABLE(u)->cgroup_runtime_offset) = cgroup_runtime_free(crt);
3515 }
3516
3517 int unit_cgroup_is_empty(Unit *u) {
3518         int r;
3519
3520         assert(u);
3521
3522         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
3523         if (!crt)
3524                 return -ENXIO;
3525         if (!crt->cgroup_path)
3526                 return -EOWNERDEAD;
3527
3528         r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, crt->cgroup_path);
3529         if (r < 0)
3530                 return log_unit_debug_errno(u, r, "Failed to determine whether cgroup %s is empty, ignoring: %m", empty_to_root(crt->cgroup_path));
3531
3532         return r;
3533 }
3534
3535 bool unit_maybe_release_cgroup(Unit *u) {
3536         int r;
3537
3538         assert(u);
3539
3540         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
3541         if (!crt || !crt->cgroup_path)
3542                 return true;
3543
3544         /* Don't release the cgroup if there are still processes under it. If we get notified later when all
3545          * the processes exit (e.g. the processes were in D-state and exited after the unit was marked as
3546          * failed) we need the cgroup paths to continue to be tracked by the manager so they can be looked up
3547          * and cleaned up later. */
3548         r = unit_cgroup_is_empty(u);
3549         if (r == 1) {
3550                 unit_release_cgroup(u);
3551                 return true;
3552         }
3553
3554         return false;
3555 }
3556
3557 void unit_prune_cgroup(Unit *u) {
3558         int r;
3559         bool is_root_slice;
3560
3561         assert(u);
3562
3563         /* Removes the cgroup, if empty and possible, and stops watching it. */
3564         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
3565         if (!crt || !crt->cgroup_path)
3566                 return;
3567
3568         /* Cache the last CPU and memory usage values before we destroy the cgroup */
3569         (void) unit_get_cpu_usage(u, /* ret = */ NULL);
3570
3571         for (CGroupMemoryAccountingMetric metric = 0; metric <= _CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST; metric++)
3572                 (void) unit_get_memory_accounting(u, metric, /* ret = */ NULL);
3573
3574 #if BPF_FRAMEWORK
3575         (void) bpf_restrict_fs_cleanup(u); /* Remove cgroup from the global LSM BPF map */
3576 #endif
3577
3578         unit_modify_nft_set(u, /* add = */ false);
3579
3580         is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);
3581
3582         r = cg_trim_everywhere(u->manager->cgroup_supported, crt->cgroup_path, !is_root_slice);
3583         if (r < 0)
3584                 /* One reason we could have failed here is, that the cgroup still contains a process.
3585                  * However, if the cgroup becomes removable at a later time, it might be removed when
3586                  * the containing slice is stopped. So even if we failed now, this unit shouldn't assume
3587                  * that the cgroup is still realized the next time it is started. Do not return early
3588                  * on error, continue cleanup. */
3589                 log_unit_full_errno(u, r == -EBUSY ? LOG_DEBUG : LOG_WARNING, r, "Failed to destroy cgroup %s, ignoring: %m", empty_to_root(crt->cgroup_path));
3590
3591         if (is_root_slice)
3592                 return;
3593
3594         if (!unit_maybe_release_cgroup(u)) /* Returns true if the cgroup was released */
3595                 return;
3596
3597         crt = unit_get_cgroup_runtime(u); /* The above might have destroyed the runtime object, let's see if it's still there */
3598         if (!crt)
3599                 return;
3600
3601         crt->cgroup_realized = false;
3602         crt->cgroup_realized_mask = 0;
3603         crt->cgroup_enabled_mask = 0;
3604
3605         crt->bpf_device_control_installed = bpf_program_free(crt->bpf_device_control_installed);
3606 }
3607
3608 int unit_search_main_pid(Unit *u, PidRef *ret) {
3609         _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
3610         _cleanup_fclose_ FILE *f = NULL;
3611         int r;
3612
3613         assert(u);
3614         assert(ret);
3615
3616         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
3617         if (!crt || !crt->cgroup_path)
3618                 return -ENXIO;
3619
3620         r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, crt->cgroup_path, &f);
3621         if (r < 0)
3622                 return r;
3623
3624         for (;;) {
3625                 _cleanup_(pidref_done) PidRef npidref = PIDREF_NULL;
3626
3627                 r = cg_read_pidref(f, &npidref);
3628                 if (r < 0)
3629                         return r;
3630                 if (r == 0)
3631                         break;
3632
3633                 if (pidref_equal(&pidref, &npidref)) /* seen already, cgroupfs reports duplicates! */
3634                         continue;
3635
3636                 if (pidref_is_my_child(&npidref) <= 0) /* ignore processes further down the tree */
3637                         continue;
3638
3639                 if (pidref_is_set(&pidref) != 0)
3640                         /* Dang, there's more than one daemonized PID in this group, so we don't know what
3641                          * process is the main process. */
3642                         return -ENODATA;
3643
3644                 pidref = TAKE_PIDREF(npidref);
3645         }
3646
3647         if (!pidref_is_set(&pidref))
3648                 return -ENODATA;
3649
3650         *ret = TAKE_PIDREF(pidref);
3651         return 0;
3652 }
3653
3654 static int unit_watch_pids_in_path(Unit *u, const char *path) {
3655         _cleanup_closedir_ DIR *d = NULL;
3656         _cleanup_fclose_ FILE *f = NULL;
3657         int ret = 0, r;
3658
3659         assert(u);
3660         assert(path);
3661
3662         r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
3663         if (r < 0)
3664                 RET_GATHER(ret, r);
3665         else {
3666                 for (;;) {
3667                         _cleanup_(pidref_done) PidRef pid = PIDREF_NULL;
3668
3669                         r = cg_read_pidref(f, &pid);
3670                         if (r == 0)
3671                                 break;
3672                         if (r < 0) {
3673                                 RET_GATHER(ret, r);
3674                                 break;
3675                         }
3676
3677                         RET_GATHER(ret, unit_watch_pidref(u, &pid, /* exclusive= */ false));
3678                 }
3679         }
3680
3681         r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
3682         if (r < 0)
3683                 RET_GATHER(ret, r);
3684         else {
3685                 for (;;) {
3686                         _cleanup_free_ char *fn = NULL, *p = NULL;
3687
3688                         r = cg_read_subgroup(d, &fn);
3689                         if (r == 0)
3690                                 break;
3691                         if (r < 0) {
3692                                 RET_GATHER(ret, r);
3693                                 break;
3694                         }
3695
3696                         p = path_join(empty_to_root(path), fn);
3697                         if (!p)
3698                                 return -ENOMEM;
3699
3700                         RET_GATHER(ret, unit_watch_pids_in_path(u, p));
3701                 }
3702         }
3703
3704         return ret;
3705 }
3706
3707 int unit_synthesize_cgroup_empty_event(Unit *u) {
3708         int r;
3709
3710         assert(u);
3711
3712         /* Enqueue a synthetic cgroup empty event if this unit doesn't watch any PIDs anymore. This is compatibility
3713          * support for non-unified systems where notifications aren't reliable, and hence need to take whatever we can
3714          * get as notification source as soon as we stopped having any useful PIDs to watch for. */
3715
3716         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
3717         if (!crt || !crt->cgroup_path)
3718                 return -ENOENT;
3719
3720         r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
3721         if (r < 0)
3722                 return r;
3723         if (r > 0) /* On unified we have reliable notifications, and don't need this */
3724                 return 0;
3725
3726         if (!set_isempty(u->pids))
3727                 return 0;
3728
3729         unit_add_to_cgroup_empty_queue(u);
3730         return 0;
3731 }
3732
3733 int unit_watch_all_pids(Unit *u) {
3734         int r;
3735
3736         assert(u);
3737
3738         /* Adds all PIDs from our cgroup to the set of PIDs we
3739          * watch. This is a fallback logic for cases where we do not
3740          * get reliable cgroup empty notifications: we try to use
3741          * SIGCHLD as replacement. */
3742
3743         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
3744         if (!crt || !crt->cgroup_path)
3745                 return -ENOENT;
3746
3747         r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
3748         if (r < 0)
3749                 return r;
3750         if (r > 0) /* On unified we can use proper notifications */
3751                 return 0;
3752
3753         return unit_watch_pids_in_path(u, crt->cgroup_path);
3754 }
3755
3756 static int on_cgroup_empty_event(sd_event_source *s, void *userdata) {
3757         Manager *m = ASSERT_PTR(userdata);
3758         Unit *u;
3759         int r;
3760
3761         assert(s);
3762
3763         u = m->cgroup_empty_queue;
3764         if (!u)
3765                 return 0;
3766
3767         assert(u->in_cgroup_empty_queue);
3768         u->in_cgroup_empty_queue = false;
3769         LIST_REMOVE(cgroup_empty_queue, m->cgroup_empty_queue, u);
3770
3771         if (m->cgroup_empty_queue) {
3772                 /* More stuff queued, let's make sure we remain enabled */
3773                 r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
3774                 if (r < 0)
3775                         log_debug_errno(r, "Failed to reenable cgroup empty event source, ignoring: %m");
3776         }
3777
3778         /* Update state based on OOM kills before we notify about cgroup empty event */
3779         (void) unit_check_oom(u);
3780         (void) unit_check_oomd_kill(u);
3781
3782         unit_add_to_gc_queue(u);
3783
3784         if (IN_SET(unit_active_state(u), UNIT_INACTIVE, UNIT_FAILED))
3785                 unit_prune_cgroup(u);
3786         else if (UNIT_VTABLE(u)->notify_cgroup_empty)
3787                 UNIT_VTABLE(u)->notify_cgroup_empty(u);
3788
3789         return 0;
3790 }
3791
3792 void unit_add_to_cgroup_empty_queue(Unit *u) {
3793         int r;
3794
3795         assert(u);
3796
3797         /* Note that there are four different ways how cgroup empty events reach us:
3798          *
3799          * 1. On the unified hierarchy we get an inotify event on the cgroup
3800          *
3801          * 2. On the legacy hierarchy, when running in system mode, we get a datagram on the cgroup agent socket
3802          *
3803          * 3. On the legacy hierarchy, when running in user mode, we get a D-Bus signal on the system bus
3804          *
3805          * 4. On the legacy hierarchy, in service units we start watching all processes of the cgroup for SIGCHLD as
3806          *    soon as we get one SIGCHLD, to deal with unreliable cgroup notifications.
3807          *
3808          * Regardless which way we got the notification, we'll verify it here, and then add it to a separate
3809          * queue. This queue will be dispatched at a lower priority than the SIGCHLD handler, so that we always use
3810          * SIGCHLD if we can get it first, and only use the cgroup empty notifications if there's no SIGCHLD pending
3811          * (which might happen if the cgroup doesn't contain processes that are our own child, which is typically the
3812          * case for scope units). */
3813
3814         if (u->in_cgroup_empty_queue)
3815                 return;
3816
3817         /* Let's verify that the cgroup is really empty */
3818         r = unit_cgroup_is_empty(u);
3819         if (r <= 0)
3820                 return;
3821
3822         LIST_PREPEND(cgroup_empty_queue, u->manager->cgroup_empty_queue, u);
3823         u->in_cgroup_empty_queue = true;
3824
3825         /* Trigger the defer event */
3826         r = sd_event_source_set_enabled(u->manager->cgroup_empty_event_source, SD_EVENT_ONESHOT);
3827         if (r < 0)
3828                 log_debug_errno(r, "Failed to enable cgroup empty event source: %m");
3829 }
3830
3831 static void unit_remove_from_cgroup_empty_queue(Unit *u) {
3832         assert(u);
3833
3834         if (!u->in_cgroup_empty_queue)
3835                 return;
3836
3837         LIST_REMOVE(cgroup_empty_queue, u->manager->cgroup_empty_queue, u);
3838         u->in_cgroup_empty_queue = false;
3839 }
3840
3841 int unit_check_oomd_kill(Unit *u) {
3842         _cleanup_free_ char *value = NULL;
3843         bool increased;
3844         uint64_t n = 0;
3845         int r;
3846
3847         assert(u);
3848
3849         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
3850         if (!crt || !crt->cgroup_path)
3851                 return 0;
3852
3853         r = cg_all_unified();
3854         if (r < 0)
3855                 return log_unit_debug_errno(u, r, "Couldn't determine whether we are in all unified mode: %m");
3856         else if (r == 0)
3857                 return 0;
3858
3859         r = cg_get_xattr_malloc(crt->cgroup_path, "user.oomd_ooms", &value);
3860         if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r))
3861                 return r;
3862
3863         if (!isempty(value)) {
3864                  r = safe_atou64(value, &n);
3865                  if (r < 0)
3866                          return r;
3867         }
3868
3869         increased = n > crt->managed_oom_kill_last;
3870         crt->managed_oom_kill_last = n;
3871
3872         if (!increased)
3873                 return 0;
3874
3875         n = 0;
3876         value = mfree(value);
3877         r = cg_get_xattr_malloc(crt->cgroup_path, "user.oomd_kill", &value);
3878         if (r >= 0 && !isempty(value))
3879                 (void) safe_atou64(value, &n);
3880
3881         if (n > 0)
3882                 log_unit_struct(u, LOG_NOTICE,
3883                                 "MESSAGE_ID=" SD_MESSAGE_UNIT_OOMD_KILL_STR,
3884                                 LOG_UNIT_INVOCATION_ID(u),
3885                                 LOG_UNIT_MESSAGE(u, "systemd-oomd killed %"PRIu64" process(es) in this unit.", n),
3886                                 "N_PROCESSES=%" PRIu64, n);
3887         else
3888                 log_unit_struct(u, LOG_NOTICE,
3889                                 "MESSAGE_ID=" SD_MESSAGE_UNIT_OOMD_KILL_STR,
3890                                 LOG_UNIT_INVOCATION_ID(u),
3891                                 LOG_UNIT_MESSAGE(u, "systemd-oomd killed some process(es) in this unit."));
3892
3893         unit_notify_cgroup_oom(u, /* ManagedOOM= */ true);
3894
3895         return 1;
3896 }
3897
3898 int unit_check_oom(Unit *u) {
3899         _cleanup_free_ char *oom_kill = NULL;
3900         bool increased;
3901         uint64_t c;
3902         int r;
3903
3904         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
3905         if (!crt || !crt->cgroup_path)
3906                 return 0;
3907
3908         r = cg_get_keyed_attribute(
3909                         "memory",
3910                         crt->cgroup_path,
3911                         "memory.events",
3912                         STRV_MAKE("oom_kill"),
3913                         &oom_kill);
3914         if (IN_SET(r, -ENOENT, -ENXIO)) /* Handle gracefully if cgroup or oom_kill attribute don't exist */
3915                 c = 0;
3916         else if (r < 0)
3917                 return log_unit_debug_errno(u, r, "Failed to read oom_kill field of memory.events cgroup attribute: %m");
3918         else {
3919                 r = safe_atou64(oom_kill, &c);
3920                 if (r < 0)
3921                         return log_unit_debug_errno(u, r, "Failed to parse oom_kill field: %m");
3922         }
3923
3924         increased = c > crt->oom_kill_last;
3925         crt->oom_kill_last = c;
3926
3927         if (!increased)
3928                 return 0;
3929
3930         log_unit_struct(u, LOG_NOTICE,
3931                         "MESSAGE_ID=" SD_MESSAGE_UNIT_OUT_OF_MEMORY_STR,
3932                         LOG_UNIT_INVOCATION_ID(u),
3933                         LOG_UNIT_MESSAGE(u, "A process of this unit has been killed by the OOM killer."));
3934
3935         unit_notify_cgroup_oom(u, /* ManagedOOM= */ false);
3936
3937         return 1;
3938 }
3939
3940 static int on_cgroup_oom_event(sd_event_source *s, void *userdata) {
3941         Manager *m = ASSERT_PTR(userdata);
3942         Unit *u;
3943         int r;
3944
3945         assert(s);
3946
3947         u = m->cgroup_oom_queue;
3948         if (!u)
3949                 return 0;
3950
3951         assert(u->in_cgroup_oom_queue);
3952         u->in_cgroup_oom_queue = false;
3953         LIST_REMOVE(cgroup_oom_queue, m->cgroup_oom_queue, u);
3954
3955         if (m->cgroup_oom_queue) {
3956                 /* More stuff queued, let's make sure we remain enabled */
3957                 r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
3958                 if (r < 0)
3959                         log_debug_errno(r, "Failed to reenable cgroup oom event source, ignoring: %m");
3960         }
3961
3962         (void) unit_check_oom(u);
3963         unit_add_to_gc_queue(u);
3964
3965         return 0;
3966 }
3967
3968 static void unit_add_to_cgroup_oom_queue(Unit *u) {
3969         int r;
3970
3971         assert(u);
3972
3973         if (u->in_cgroup_oom_queue)
3974                 return;
3975
3976         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
3977         if (!crt || !crt->cgroup_path)
3978                 return;
3979
3980         LIST_PREPEND(cgroup_oom_queue, u->manager->cgroup_oom_queue, u);
3981         u->in_cgroup_oom_queue = true;
3982
3983         /* Trigger the defer event */
3984         if (!u->manager->cgroup_oom_event_source) {
3985                 _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL;
3986
3987                 r = sd_event_add_defer(u->manager->event, &s, on_cgroup_oom_event, u->manager);
3988                 if (r < 0) {
3989                         log_error_errno(r, "Failed to create cgroup oom event source: %m");
3990                         return;
3991                 }
3992
3993                 r = sd_event_source_set_priority(s, EVENT_PRIORITY_CGROUP_OOM);
3994                 if (r < 0) {
3995                         log_error_errno(r, "Failed to set priority of cgroup oom event source: %m");
3996                         return;
3997                 }
3998
3999                 (void) sd_event_source_set_description(s, "cgroup-oom");
4000                 u->manager->cgroup_oom_event_source = TAKE_PTR(s);
4001         }
4002
4003         r = sd_event_source_set_enabled(u->manager->cgroup_oom_event_source, SD_EVENT_ONESHOT);
4004         if (r < 0)
4005                 log_error_errno(r, "Failed to enable cgroup oom event source: %m");
4006 }
4007
4008 static int unit_check_cgroup_events(Unit *u) {
4009         char *values[2] = {};
4010         int r;
4011
4012         assert(u);
4013
4014         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
4015         if (!crt || !crt->cgroup_path)
4016                 return 0;
4017
4018         r = cg_get_keyed_attribute_graceful(
4019                         SYSTEMD_CGROUP_CONTROLLER,
4020                         crt->cgroup_path,
4021                         "cgroup.events",
4022                         STRV_MAKE("populated", "frozen"),
4023                         values);
4024         if (r < 0)
4025                 return r;
4026
4027         /* The cgroup.events notifications can be merged together so act as we saw the given state for the
4028          * first time. The functions we call to handle given state are idempotent, which makes them
4029          * effectively remember the previous state. */
4030         if (values[0]) {
4031                 if (streq(values[0], "1"))
4032                         unit_remove_from_cgroup_empty_queue(u);
4033                 else
4034                         unit_add_to_cgroup_empty_queue(u);
4035         }
4036
4037         /* Disregard freezer state changes due to operations not initiated by us.
4038          * See: https://github.com/systemd/systemd/pull/13512/files#r416469963 and
4039          *      https://github.com/systemd/systemd/pull/13512#issuecomment-573007207 */
4040         if (values[1] && IN_SET(u->freezer_state, FREEZER_FREEZING, FREEZER_FREEZING_BY_PARENT, FREEZER_THAWING)) {
4041                 if (streq(values[1], "0"))
4042                         unit_thawed(u);
4043                 else
4044                         unit_frozen(u);
4045         }
4046
4047         free(values[0]);
4048         free(values[1]);
4049
4050         return 0;
4051 }
4052
4053 static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
4054         Manager *m = ASSERT_PTR(userdata);
4055
4056         assert(s);
4057         assert(fd >= 0);
4058
4059         for (;;) {
4060                 union inotify_event_buffer buffer;
4061                 ssize_t l;
4062
4063                 l = read(fd, &buffer, sizeof(buffer));
4064                 if (l < 0) {
4065                         if (ERRNO_IS_TRANSIENT(errno))
4066                                 return 0;
4067
4068                         return log_error_errno(errno, "Failed to read control group inotify events: %m");
4069                 }
4070
4071                 FOREACH_INOTIFY_EVENT_WARN(e, buffer, l) {
4072                         Unit *u;
4073
4074                         if (e->wd < 0)
4075                                 /* Queue overflow has no watch descriptor */
4076                                 continue;
4077
4078                         if (e->mask & IN_IGNORED)
4079                                 /* The watch was just removed */
4080                                 continue;
4081
4082                         /* Note that inotify might deliver events for a watch even after it was removed,
4083                          * because it was queued before the removal. Let's ignore this here safely. */
4084
4085                         u = hashmap_get(m->cgroup_control_inotify_wd_unit, INT_TO_PTR(e->wd));
4086                         if (u)
4087                                 unit_check_cgroup_events(u);
4088
4089                         u = hashmap_get(m->cgroup_memory_inotify_wd_unit, INT_TO_PTR(e->wd));
4090                         if (u)
4091                                 unit_add_to_cgroup_oom_queue(u);
4092                 }
4093         }
4094 }
4095
4096 static int cg_bpf_mask_supported(CGroupMask *ret) {
4097         CGroupMask mask = 0;
4098         int r;
4099
4100         /* BPF-based firewall */
4101         r = bpf_firewall_supported();
4102         if (r < 0)
4103                 return r;
4104         if (r > 0)
4105                 mask |= CGROUP_MASK_BPF_FIREWALL;
4106
4107         /* BPF-based device access control */
4108         r = bpf_devices_supported();
4109         if (r < 0)
4110                 return r;
4111         if (r > 0)
4112                 mask |= CGROUP_MASK_BPF_DEVICES;
4113
4114         /* BPF pinned prog */
4115         r = bpf_foreign_supported();
4116         if (r < 0)
4117                 return r;
4118         if (r > 0)
4119                 mask |= CGROUP_MASK_BPF_FOREIGN;
4120
4121         /* BPF-based bind{4|6} hooks */
4122         r = bpf_socket_bind_supported();
4123         if (r < 0)
4124                 return r;
4125         if (r > 0)
4126                 mask |= CGROUP_MASK_BPF_SOCKET_BIND;
4127
4128         /* BPF-based cgroup_skb/{egress|ingress} hooks */
4129         r = bpf_restrict_ifaces_supported();
4130         if (r < 0)
4131                 return r;
4132         if (r > 0)
4133                 mask |= CGROUP_MASK_BPF_RESTRICT_NETWORK_INTERFACES;
4134
4135         *ret = mask;
4136         return 0;
4137 }
4138
4139 int manager_setup_cgroup(Manager *m) {
4140         _cleanup_free_ char *path = NULL;
4141         const char *scope_path;
4142         int r, all_unified;
4143         CGroupMask mask;
4144         char *e;
4145
4146         assert(m);
4147
4148         /* 1. Determine hierarchy */
4149         m->cgroup_root = mfree(m->cgroup_root);
4150         r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
4151         if (r < 0)
4152                 return log_error_errno(r, "Cannot determine cgroup we are running in: %m");
4153
4154         /* Chop off the init scope, if we are already located in it */
4155         e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
4156
4157         /* LEGACY: Also chop off the system slice if we are in
4158          * it. This is to support live upgrades from older systemd
4159          * versions where PID 1 was moved there. Also see
4160          * cg_get_root_path(). */
4161         if (!e && MANAGER_IS_SYSTEM(m)) {
4162                 e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
4163                 if (!e)
4164                         e = endswith(m->cgroup_root, "/system"); /* even more legacy */
4165         }
4166         if (e)
4167                 *e = 0;
4168
4169         /* And make sure to store away the root value without trailing slash, even for the root dir, so that we can
4170          * easily prepend it everywhere. */
4171         delete_trailing_chars(m->cgroup_root, "/");
4172
4173         /* 2. Show data */
4174         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
4175         if (r < 0)
4176                 return log_error_errno(r, "Cannot find cgroup mount point: %m");
4177
4178         r = cg_unified();
4179         if (r < 0)
4180                 return log_error_errno(r, "Couldn't determine if we are running in the unified hierarchy: %m");
4181
4182         all_unified = cg_all_unified();
4183         if (all_unified < 0)
4184                 return log_error_errno(all_unified, "Couldn't determine whether we are in all unified mode: %m");
4185         if (all_unified > 0)
4186                 log_debug("Unified cgroup hierarchy is located at %s.", path);
4187         else {
4188                 r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
4189                 if (r < 0)
4190                         return log_error_errno(r, "Failed to determine whether systemd's own controller is in unified mode: %m");
4191                 if (r > 0)
4192                         log_debug("Unified cgroup hierarchy is located at %s. Controllers are on legacy hierarchies.", path);
4193                 else
4194                         log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER_LEGACY ". File system hierarchy is at %s.", path);
4195         }
4196
4197         /* 3. Allocate cgroup empty defer event source */
4198         m->cgroup_empty_event_source = sd_event_source_disable_unref(m->cgroup_empty_event_source);
4199         r = sd_event_add_defer(m->event, &m->cgroup_empty_event_source, on_cgroup_empty_event, m);
4200         if (r < 0)
4201                 return log_error_errno(r, "Failed to create cgroup empty event source: %m");
4202
4203         /* Schedule cgroup empty checks early, but after having processed service notification messages or
4204          * SIGCHLD signals, so that a cgroup running empty is always just the last safety net of
4205          * notification, and we collected the metadata the notification and SIGCHLD stuff offers first. */
4206         r = sd_event_source_set_priority(m->cgroup_empty_event_source, EVENT_PRIORITY_CGROUP_EMPTY);
4207         if (r < 0)
4208                 return log_error_errno(r, "Failed to set priority of cgroup empty event source: %m");
4209
4210         r = sd_event_source_set_enabled(m->cgroup_empty_event_source, SD_EVENT_OFF);
4211         if (r < 0)
4212                 return log_error_errno(r, "Failed to disable cgroup empty event source: %m");
4213
4214         (void) sd_event_source_set_description(m->cgroup_empty_event_source, "cgroup-empty");
4215
4216         /* 4. Install notifier inotify object, or agent */
4217         if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
4218
4219                 /* In the unified hierarchy we can get cgroup empty notifications via inotify. */
4220
4221                 m->cgroup_inotify_event_source = sd_event_source_disable_unref(m->cgroup_inotify_event_source);
4222                 safe_close(m->cgroup_inotify_fd);
4223
4224                 m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
4225                 if (m->cgroup_inotify_fd < 0)
4226                         return log_error_errno(errno, "Failed to create control group inotify object: %m");
4227
4228                 r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
4229                 if (r < 0)
4230                         return log_error_errno(r, "Failed to watch control group inotify object: %m");
4231
4232                 /* Process cgroup empty notifications early. Note that when this event is dispatched it'll
4233                  * just add the unit to a cgroup empty queue, hence let's run earlier than that. Also see
4234                  * handling of cgroup agent notifications, for the classic cgroup hierarchy support. */
4235                 r = sd_event_source_set_priority(m->cgroup_inotify_event_source, EVENT_PRIORITY_CGROUP_INOTIFY);
4236                 if (r < 0)
4237                         return log_error_errno(r, "Failed to set priority of inotify event source: %m");
4238
4239                 (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");
4240
4241         } else if (MANAGER_IS_SYSTEM(m) && manager_owns_host_root_cgroup(m) && !MANAGER_IS_TEST_RUN(m)) {
4242
4243                 /* On the legacy hierarchy we only get notifications via cgroup agents. (Which isn't really reliable,
4244                  * since it does not generate events when control groups with children run empty. */
4245
4246                 r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUPS_AGENT_PATH);
4247                 if (r < 0)
4248                         log_warning_errno(r, "Failed to install release agent, ignoring: %m");
4249                 else if (r > 0)
4250                         log_debug("Installed release agent.");
4251                 else if (r == 0)
4252                         log_debug("Release agent already installed.");
4253         }
4254
4255         /* 5. Make sure we are in the special "init.scope" unit in the root slice. */
4256         scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
4257         r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
4258         if (r >= 0) {
4259                 /* Also, move all other userspace processes remaining in the root cgroup into that scope. */
4260                 r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
4261                 if (r < 0)
4262                         log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");
4263
4264                 /* 6. And pin it, so that it cannot be unmounted */
4265                 safe_close(m->pin_cgroupfs_fd);
4266                 m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
4267                 if (m->pin_cgroupfs_fd < 0)
4268                         return log_error_errno(errno, "Failed to open pin file: %m");
4269
4270         } else if (!MANAGER_IS_TEST_RUN(m))
4271                 return log_error_errno(r, "Failed to create %s control group: %m", scope_path);
4272
4273         /* 7. Always enable hierarchical support if it exists... */
4274         if (!all_unified && !MANAGER_IS_TEST_RUN(m))
4275                 (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
4276
4277         /* 8. Figure out which controllers are supported */
4278         r = cg_mask_supported_subtree(m->cgroup_root, &m->cgroup_supported);
4279         if (r < 0)
4280                 return log_error_errno(r, "Failed to determine supported controllers: %m");
4281
4282         /* 9. Figure out which bpf-based pseudo-controllers are supported */
4283         r = cg_bpf_mask_supported(&mask);
4284         if (r < 0)
4285                 return log_error_errno(r, "Failed to determine supported bpf-based pseudo-controllers: %m");
4286         m->cgroup_supported |= mask;
4287
4288         /* 10. Log which controllers are supported */
4289         for (CGroupController c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
4290                 log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c),
4291                           yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c)));
4292
4293         return 0;
4294 }
4295
4296 void manager_shutdown_cgroup(Manager *m, bool delete) {
4297         assert(m);
4298
4299         /* We can't really delete the group, since we are in it. But
4300          * let's trim it. */
4301         if (delete && m->cgroup_root && !FLAGS_SET(m->test_run_flags, MANAGER_TEST_RUN_MINIMAL))
4302                 (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
4303
4304         m->cgroup_empty_event_source = sd_event_source_disable_unref(m->cgroup_empty_event_source);
4305
4306         m->cgroup_control_inotify_wd_unit = hashmap_free(m->cgroup_control_inotify_wd_unit);
4307         m->cgroup_memory_inotify_wd_unit = hashmap_free(m->cgroup_memory_inotify_wd_unit);
4308
4309         m->cgroup_inotify_event_source = sd_event_source_disable_unref(m->cgroup_inotify_event_source);
4310         m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);
4311
4312         m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);
4313
4314         m->cgroup_root = mfree(m->cgroup_root);
4315 }
4316
4317 Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
4318         char *p;
4319         Unit *u;
4320
4321         assert(m);
4322         assert(cgroup);
4323
4324         u = hashmap_get(m->cgroup_unit, cgroup);
4325         if (u)
4326                 return u;
4327
4328         p = strdupa_safe(cgroup);
4329         for (;;) {
4330                 char *e;
4331
4332                 e = strrchr(p, '/');
4333                 if (!e || e == p)
4334                         return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);
4335
4336                 *e = 0;
4337
4338                 u = hashmap_get(m->cgroup_unit, p);
4339                 if (u)
4340                         return u;
4341         }
4342 }
4343
4344 Unit *manager_get_unit_by_pidref_cgroup(Manager *m, const PidRef *pid) {
4345         _cleanup_free_ char *cgroup = NULL;
4346
4347         assert(m);
4348
4349         if (cg_pidref_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup) < 0)
4350                 return NULL;
4351
4352         return manager_get_unit_by_cgroup(m, cgroup);
4353 }
4354
4355 Unit *manager_get_unit_by_pidref_watching(Manager *m, const PidRef *pid) {
4356         Unit *u, **array;
4357
4358         assert(m);
4359
4360         if (!pidref_is_set(pid))
4361                 return NULL;
4362
4363         u = hashmap_get(m->watch_pids, pid);
4364         if (u)
4365                 return u;
4366
4367         array = hashmap_get(m->watch_pids_more, pid);
4368         if (array)
4369                 return array[0];
4370
4371         return NULL;
4372 }
4373
4374 Unit *manager_get_unit_by_pidref(Manager *m, const PidRef *pid) {
4375         Unit *u;
4376
4377         assert(m);
4378
4379         /* Note that a process might be owned by multiple units, we return only one here, which is good
4380          * enough for most cases, though not strictly correct. We prefer the one reported by cgroup
4381          * membership, as that's the most relevant one as children of the process will be assigned to that
4382          * one, too, before all else. */
4383
4384         if (!pidref_is_set(pid))
4385                 return NULL;
4386
4387         if (pidref_is_self(pid))
4388                 return hashmap_get(m->units, SPECIAL_INIT_SCOPE);
4389         if (pid->pid == 1)
4390                 return NULL;
4391
4392         u = manager_get_unit_by_pidref_cgroup(m, pid);
4393         if (u)
4394                 return u;
4395
4396         u = manager_get_unit_by_pidref_watching(m, pid);
4397         if (u)
4398                 return u;
4399
4400         return NULL;
4401 }
4402
4403 Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
4404         assert(m);
4405
4406         if (!pid_is_valid(pid))
4407                 return NULL;
4408
4409         return manager_get_unit_by_pidref(m, &PIDREF_MAKE_FROM_PID(pid));
4410 }
4411
4412 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
4413         Unit *u;
4414
4415         assert(m);
4416         assert(cgroup);
4417
4418         /* Called on the legacy hierarchy whenever we get an explicit cgroup notification from the cgroup agent process
4419          * or from the --system instance */
4420
4421         log_debug("Got cgroup empty notification for: %s", cgroup);
4422
4423         u = manager_get_unit_by_cgroup(m, cgroup);
4424         if (!u)
4425                 return 0;
4426
4427         unit_add_to_cgroup_empty_queue(u);
4428         return 1;
4429 }
4430
4431 int unit_get_memory_available(Unit *u, uint64_t *ret) {
4432         uint64_t available = UINT64_MAX, current = 0;
4433
4434         assert(u);
4435         assert(ret);
4436
4437         /* If data from cgroups can be accessed, try to find out how much more memory a unit can
4438          * claim before hitting the configured cgroup limits (if any). Consider both MemoryHigh
4439          * and MemoryMax, and also any slice the unit might be nested below. */
4440
4441         do {
4442                 uint64_t unit_available, unit_limit = UINT64_MAX;
4443                 CGroupContext *unit_context;
4444
4445                 /* No point in continuing if we can't go any lower */
4446                 if (available == 0)
4447                         break;
4448
4449                 unit_context = unit_get_cgroup_context(u);
4450                 if (!unit_context)
4451                         return -ENODATA;
4452
4453                 CGroupRuntime *crt = unit_get_cgroup_runtime(u);
4454                 if (!crt || !crt->cgroup_path)
4455                         continue;
4456
4457                 (void) unit_get_memory_current(u, &current);
4458                 /* in case of error, previous current propagates as lower bound */
4459
4460                 if (unit_has_name(u, SPECIAL_ROOT_SLICE))
4461                         unit_limit = physical_memory();
4462                 else if (unit_context->memory_max == UINT64_MAX && unit_context->memory_high == UINT64_MAX)
4463                         continue;
4464                 unit_limit = MIN3(unit_limit, unit_context->memory_max, unit_context->memory_high);
4465
4466                 unit_available = LESS_BY(unit_limit, current);
4467                 available = MIN(unit_available, available);
4468         } while ((u = UNIT_GET_SLICE(u)));
4469
4470         *ret = available;
4471
4472         return 0;
4473 }
4474
4475 int unit_get_memory_current(Unit *u, uint64_t *ret) {
4476         int r;
4477
4478         // FIXME: Merge this into unit_get_memory_accounting after support for cgroup v1 is dropped
4479
4480         assert(u);
4481         assert(ret);
4482
4483         if (!UNIT_CGROUP_BOOL(u, memory_accounting))
4484                 return -ENODATA;
4485
4486         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
4487         if (!crt || !crt->cgroup_path)
4488                 return -ENODATA;
4489
4490         /* The root cgroup doesn't expose this information, let's get it from /proc instead */
4491         if (unit_has_host_root_cgroup(u))
4492                 return procfs_memory_get_used(ret);
4493
4494         if ((crt->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
4495                 return -ENODATA;
4496
4497         r = cg_all_unified();
4498         if (r < 0)
4499                 return r;
4500
4501         return cg_get_attribute_as_uint64("memory", crt->cgroup_path, r > 0 ? "memory.current" : "memory.usage_in_bytes", ret);
4502 }
4503
4504 int unit_get_memory_accounting(Unit *u, CGroupMemoryAccountingMetric metric, uint64_t *ret) {
4505
4506         static const char* const attributes_table[_CGROUP_MEMORY_ACCOUNTING_METRIC_MAX] = {
4507                 [CGROUP_MEMORY_PEAK]          = "memory.peak",
4508                 [CGROUP_MEMORY_SWAP_CURRENT]  = "memory.swap.current",
4509                 [CGROUP_MEMORY_SWAP_PEAK]     = "memory.swap.peak",
4510                 [CGROUP_MEMORY_ZSWAP_CURRENT] = "memory.zswap.current",
4511         };
4512
4513         uint64_t bytes;
4514         bool updated = false;
4515         int r;
4516
4517         assert(u);
4518         assert(metric >= 0);
4519         assert(metric < _CGROUP_MEMORY_ACCOUNTING_METRIC_MAX);
4520
4521         if (!UNIT_CGROUP_BOOL(u, memory_accounting))
4522                 return -ENODATA;
4523
4524         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
4525         if (!crt)
4526                 return -ENODATA;
4527         if (!crt->cgroup_path)
4528                 /* If the cgroup is already gone, we try to find the last cached value. */
4529                 goto finish;
4530
4531         /* The root cgroup doesn't expose this information. */
4532         if (unit_has_host_root_cgroup(u))
4533                 return -ENODATA;
4534
4535         if (!FLAGS_SET(crt->cgroup_realized_mask, CGROUP_MASK_MEMORY))
4536                 return -ENODATA;
4537
4538         r = cg_all_unified();
4539         if (r < 0)
4540                 return r;
4541         if (r == 0)
4542                 return -ENODATA;
4543
4544         r = cg_get_attribute_as_uint64("memory", crt->cgroup_path, attributes_table[metric], &bytes);
4545         if (r < 0 && r != -ENODATA)
4546                 return r;
4547         updated = r >= 0;
4548
4549 finish:
4550         if (metric <= _CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST) {
4551                 uint64_t *last = &crt->memory_accounting_last[metric];
4552
4553                 if (updated)
4554                         *last = bytes;
4555                 else if (*last != UINT64_MAX)
4556                         bytes = *last;
4557                 else
4558                         return -ENODATA;
4559
4560         } else if (!updated)
4561                 return -ENODATA;
4562
4563         if (ret)
4564                 *ret = bytes;
4565
4566         return 0;
4567 }
4568
4569 int unit_get_tasks_current(Unit *u, uint64_t *ret) {
4570         assert(u);
4571         assert(ret);
4572
4573         if (!UNIT_CGROUP_BOOL(u, tasks_accounting))
4574                 return -ENODATA;
4575
4576         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
4577         if (!crt || !crt->cgroup_path)
4578                 return -ENODATA;
4579
4580         /* The root cgroup doesn't expose this information, let's get it from /proc instead */
4581         if (unit_has_host_root_cgroup(u))
4582                 return procfs_tasks_get_current(ret);
4583
4584         if ((crt->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0)
4585                 return -ENODATA;
4586
4587         return cg_get_attribute_as_uint64("pids", crt->cgroup_path, "pids.current", ret);
4588 }
4589
4590 static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
4591         uint64_t ns;
4592         int r;
4593
4594         assert(u);
4595         assert(ret);
4596
4597         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
4598         if (!crt || !crt->cgroup_path)
4599                 return -ENODATA;
4600
4601         /* The root cgroup doesn't expose this information, let's get it from /proc instead */
4602         if (unit_has_host_root_cgroup(u))
4603                 return procfs_cpu_get_usage(ret);
4604
4605         /* Requisite controllers for CPU accounting are not enabled */
4606         if ((get_cpu_accounting_mask() & ~crt->cgroup_realized_mask) != 0)
4607                 return -ENODATA;
4608
4609         r = cg_all_unified();
4610         if (r < 0)
4611                 return r;
4612         if (r > 0) {
4613                 _cleanup_free_ char *val = NULL;
4614                 uint64_t us;
4615
4616                 r = cg_get_keyed_attribute("cpu", crt->cgroup_path, "cpu.stat", STRV_MAKE("usage_usec"), &val);
4617                 if (IN_SET(r, -ENOENT, -ENXIO))
4618                         return -ENODATA;
4619                 if (r < 0)
4620                         return r;
4621
4622                 r = safe_atou64(val, &us);
4623                 if (r < 0)
4624                         return r;
4625
4626                 ns = us * NSEC_PER_USEC;
4627         } else
4628                 return cg_get_attribute_as_uint64("cpuacct", crt->cgroup_path, "cpuacct.usage", ret);
4629
4630         *ret = ns;
4631         return 0;
4632 }
4633
4634 int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
4635         nsec_t ns;
4636         int r;
4637
4638         assert(u);
4639
4640         /* Retrieve the current CPU usage counter. This will subtract the CPU counter taken when the unit was
4641          * started. If the cgroup has been removed already, returns the last cached value. To cache the value, simply
4642          * call this function with a NULL return value. */
4643
4644         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
4645         if (!crt || !crt->cgroup_path)
4646                 return -ENODATA;
4647
4648         if (!UNIT_CGROUP_BOOL(u, cpu_accounting))
4649                 return -ENODATA;
4650
4651         r = unit_get_cpu_usage_raw(u, &ns);
4652         if (r == -ENODATA && crt->cpu_usage_last != NSEC_INFINITY) {
4653                 /* If we can't get the CPU usage anymore (because the cgroup was already removed, for example), use our
4654                  * cached value. */
4655
4656                 if (ret)
4657                         *ret = crt->cpu_usage_last;
4658                 return 0;
4659         }
4660         if (r < 0)
4661                 return r;
4662
4663         if (ns > crt->cpu_usage_base)
4664                 ns -= crt->cpu_usage_base;
4665         else
4666                 ns = 0;
4667
4668         crt->cpu_usage_last = ns;
4669         if (ret)
4670                 *ret = ns;
4671
4672         return 0;
4673 }
4674
4675 int unit_get_ip_accounting(
4676                 Unit *u,
4677                 CGroupIPAccountingMetric metric,
4678                 uint64_t *ret) {
4679
4680         uint64_t value;
4681         int fd, r;
4682
4683         assert(u);
4684         assert(metric >= 0);
4685         assert(metric < _CGROUP_IP_ACCOUNTING_METRIC_MAX);
4686         assert(ret);
4687
4688         if (!UNIT_CGROUP_BOOL(u, ip_accounting))
4689                 return -ENODATA;
4690
4691         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
4692         if (!crt || !crt->cgroup_path)
4693                 return -ENODATA;
4694
4695         fd = IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_INGRESS_PACKETS) ?
4696                 crt->ip_accounting_ingress_map_fd :
4697                 crt->ip_accounting_egress_map_fd;
4698         if (fd < 0)
4699                 return -ENODATA;
4700
4701         if (IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_EGRESS_BYTES))
4702                 r = bpf_firewall_read_accounting(fd, &value, NULL);
4703         else
4704                 r = bpf_firewall_read_accounting(fd, NULL, &value);
4705         if (r < 0)
4706                 return r;
4707
4708         /* Add in additional metrics from a previous runtime. Note that when reexecing/reloading the daemon we compile
4709          * all BPF programs and maps anew, but serialize the old counters. When deserializing we store them in the
4710          * ip_accounting_extra[] field, and add them in here transparently. */
4711
4712         *ret = value + crt->ip_accounting_extra[metric];
4713
4714         return r;
4715 }
4716
4717 static uint64_t unit_get_effective_limit_one(Unit *u, CGroupLimitType type) {
4718         CGroupContext *cc;
4719
4720         assert(u);
4721         assert(UNIT_HAS_CGROUP_CONTEXT(u));
4722
4723         if (unit_has_name(u, SPECIAL_ROOT_SLICE))
4724                 switch (type) {
4725                         case CGROUP_LIMIT_MEMORY_MAX:
4726                         case CGROUP_LIMIT_MEMORY_HIGH:
4727                                 return physical_memory();
4728                         case CGROUP_LIMIT_TASKS_MAX:
4729                                 return system_tasks_max();
4730                         default:
4731                                 assert_not_reached();
4732                 }
4733
4734         cc = ASSERT_PTR(unit_get_cgroup_context(u));
4735         switch (type) {
4736                 /* Note: on legacy/hybrid hierarchies memory_max stays CGROUP_LIMIT_MAX unless configured
4737                  * explicitly. Effective value of MemoryLimit= (cgroup v1) is not implemented. */
4738                 case CGROUP_LIMIT_MEMORY_MAX:
4739                         return cc->memory_max;
4740                 case CGROUP_LIMIT_MEMORY_HIGH:
4741                         return cc->memory_high;
4742                 case CGROUP_LIMIT_TASKS_MAX:
4743                         return cgroup_tasks_max_resolve(&cc->tasks_max);
4744                 default:
4745                         assert_not_reached();
4746         }
4747 }
4748
4749 int unit_get_effective_limit(Unit *u, CGroupLimitType type, uint64_t *ret) {
4750         uint64_t infimum;
4751
4752         assert(u);
4753         assert(ret);
4754         assert(type >= 0);
4755         assert(type < _CGROUP_LIMIT_TYPE_MAX);
4756
4757         if (!UNIT_HAS_CGROUP_CONTEXT(u))
4758                 return -EINVAL;
4759
4760         infimum = unit_get_effective_limit_one(u, type);
4761         for (Unit *slice = UNIT_GET_SLICE(u); slice; slice = UNIT_GET_SLICE(slice))
4762                 infimum = MIN(infimum, unit_get_effective_limit_one(slice, type));
4763
4764         *ret = infimum;
4765         return 0;
4766 }
4767
/* Reads the absolute IO counters of the unit's cgroup from its "io.stat" attribute and sums them up over
 * all lines of that file.
 *
 * On success the totals for all four metrics (read/write bytes, read/write operations) are copied into
 * 'ret' and 0 is returned. 'ret' is written only on success; on any failure it is left untouched.
 *
 * Returns -ENODATA if the unit has no cgroup, is the host root cgroup, the hierarchy is not fully unified,
 * or the IO controller is not in the realized mask. Failures to open, read or parse the file are
 * propagated as negative errno. */
static int unit_get_io_accounting_raw(Unit *u, uint64_t ret[static _CGROUP_IO_ACCOUNTING_METRIC_MAX]) {
        /* Prefix (including the '=') of each "key=value" word we are interested in, indexed by metric */
        static const char *const field_names[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = {
                [CGROUP_IO_READ_BYTES]       = "rbytes=",
                [CGROUP_IO_WRITE_BYTES]      = "wbytes=",
                [CGROUP_IO_READ_OPERATIONS]  = "rios=",
                [CGROUP_IO_WRITE_OPERATIONS] = "wios=",
        };
        /* Accumulate into a local buffer first, so that 'ret' is not clobbered on failure */
        uint64_t acc[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = {};
        _cleanup_free_ char *path = NULL;
        _cleanup_fclose_ FILE *f = NULL;
        int r;

        assert(u);

        CGroupRuntime *crt = unit_get_cgroup_runtime(u);
        if (!crt || !crt->cgroup_path)
                return -ENODATA;

        if (unit_has_host_root_cgroup(u))
                return -ENODATA; /* TODO: return useful data for the top-level cgroup */

        /* The data is only read when running on the fully unified hierarchy */
        r = cg_all_unified();
        if (r < 0)
                return r;
        if (r == 0)
                return -ENODATA;

        if (!FLAGS_SET(crt->cgroup_realized_mask, CGROUP_MASK_IO))
                return -ENODATA;

        r = cg_get_path("io", crt->cgroup_path, "io.stat", &path);
        if (r < 0)
                return r;

        f = fopen(path, "re");
        if (!f)
                return -errno;

        /* Outer loop: one iteration per line of the file */
        for (;;) {
                _cleanup_free_ char *line = NULL;
                const char *p;

                r = read_line(f, LONG_LINE_MAX, &line);
                if (r < 0)
                        return r;
                if (r == 0) /* no more lines */
                        break;

                p = line;
                p += strcspn(p, WHITESPACE); /* Skip over device major/minor */
                p += strspn(p, WHITESPACE);  /* Skip over following whitespace */

                /* Inner loop: one iteration per whitespace-separated word in the rest of the line */
                for (;;) {
                        _cleanup_free_ char *word = NULL;

                        r = extract_first_word(&p, &word, NULL, EXTRACT_RETAIN_ESCAPE);
                        if (r < 0)
                                return r;
                        if (r == 0) /* line exhausted */
                                break;

                        /* Match the word against the known prefixes. Words matching none of them are
                         * silently ignored. */
                        for (CGroupIOAccountingMetric i = 0; i < _CGROUP_IO_ACCOUNTING_METRIC_MAX; i++) {
                                const char *x;

                                x = startswith(word, field_names[i]);
                                if (x) {
                                        uint64_t w;

                                        /* A value that does not parse as a number fails the whole read */
                                        r = safe_atou64(x, &w);
                                        if (r < 0)
                                                return r;

                                        /* Sum up the stats of all devices */
                                        acc[i] += w;
                                        break;
                                }
                        }
                }
        }

        memcpy(ret, acc, sizeof(acc));
        return 0;
}
4851
4852 int unit_get_io_accounting(
4853                 Unit *u,
4854                 CGroupIOAccountingMetric metric,
4855                 bool allow_cache,
4856                 uint64_t *ret) {
4857
4858         uint64_t raw[_CGROUP_IO_ACCOUNTING_METRIC_MAX];
4859         int r;
4860
4861         /* Retrieve an IO account parameter. This will subtract the counter when the unit was started. */
4862
4863         if (!UNIT_CGROUP_BOOL(u, io_accounting))
4864                 return -ENODATA;
4865
4866         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
4867         if (!crt || !crt->cgroup_path)
4868                 return -ENODATA;
4869
4870         if (allow_cache && crt->io_accounting_last[metric] != UINT64_MAX)
4871                 goto done;
4872
4873         r = unit_get_io_accounting_raw(u, raw);
4874         if (r == -ENODATA && crt->io_accounting_last[metric] != UINT64_MAX)
4875                 goto done;
4876         if (r < 0)
4877                 return r;
4878
4879         for (CGroupIOAccountingMetric i = 0; i < _CGROUP_IO_ACCOUNTING_METRIC_MAX; i++) {
4880                 /* Saturated subtraction */
4881                 if (raw[i] > crt->io_accounting_base[i])
4882                         crt->io_accounting_last[i] = raw[i] - crt->io_accounting_base[i];
4883                 else
4884                         crt->io_accounting_last[i] = 0;
4885         }
4886
4887 done:
4888         if (ret)
4889                 *ret = crt->io_accounting_last[metric];
4890
4891         return 0;
4892 }
4893
4894 int unit_reset_cpu_accounting(Unit *u) {
4895         int r;
4896
4897         assert(u);
4898
4899         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
4900         if (!crt || !crt->cgroup_path)
4901                 return 0;
4902
4903         crt->cpu_usage_last = NSEC_INFINITY;
4904
4905         r = unit_get_cpu_usage_raw(u, &crt->cpu_usage_base);
4906         if (r < 0) {
4907                 crt->cpu_usage_base = 0;
4908                 return r;
4909         }
4910
4911         return 0;
4912 }
4913
4914 void unit_reset_memory_accounting_last(Unit *u) {
4915         assert(u);
4916
4917         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
4918         if (!crt || !crt->cgroup_path)
4919                 return;
4920
4921         FOREACH_ELEMENT(i, crt->memory_accounting_last)
4922                 *i = UINT64_MAX;
4923 }
4924
4925 int unit_reset_ip_accounting(Unit *u) {
4926         int r = 0;
4927
4928         assert(u);
4929
4930         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
4931         if (!crt || !crt->cgroup_path)
4932                 return 0;
4933
4934         if (crt->ip_accounting_ingress_map_fd >= 0)
4935                 RET_GATHER(r, bpf_firewall_reset_accounting(crt->ip_accounting_ingress_map_fd));
4936
4937         if (crt->ip_accounting_egress_map_fd >= 0)
4938                 RET_GATHER(r, bpf_firewall_reset_accounting(crt->ip_accounting_egress_map_fd));
4939
4940         zero(crt->ip_accounting_extra);
4941
4942         return r;
4943 }
4944
4945 void unit_reset_io_accounting_last(Unit *u) {
4946         assert(u);
4947
4948         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
4949         if (!crt || !crt->cgroup_path)
4950                 return;
4951
4952         FOREACH_ARRAY(i, crt->io_accounting_last, _CGROUP_IO_ACCOUNTING_METRIC_MAX)
4953                 *i = UINT64_MAX;
4954 }
4955
4956 int unit_reset_io_accounting(Unit *u) {
4957         int r;
4958
4959         assert(u);
4960
4961         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
4962         if (!crt || !crt->cgroup_path)
4963                 return 0;
4964
4965         unit_reset_io_accounting_last(u);
4966
4967         r = unit_get_io_accounting_raw(u, crt->io_accounting_base);
4968         if (r < 0) {
4969                 zero(crt->io_accounting_base);
4970                 return r;
4971         }
4972
4973         return 0;
4974 }
4975
4976 int unit_reset_accounting(Unit *u) {
4977         int r = 0;
4978
4979         assert(u);
4980
4981         RET_GATHER(r, unit_reset_cpu_accounting(u));
4982         RET_GATHER(r, unit_reset_io_accounting(u));
4983         RET_GATHER(r, unit_reset_ip_accounting(u));
4984         unit_reset_memory_accounting_last(u);
4985
4986         return r;
4987 }
4988
4989 void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
4990         assert(u);
4991
4992         if (!UNIT_HAS_CGROUP_CONTEXT(u))
4993                 return;
4994
4995         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
4996         if (!crt)
4997                 return;
4998
4999         if (m == 0)
5000                 return;
5001
5002         /* always invalidate compat pairs together */
5003         if (m & (CGROUP_MASK_IO | CGROUP_MASK_BLKIO))
5004                 m |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
5005
5006         if (m & (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT))
5007                 m |= CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT;
5008
5009         if (FLAGS_SET(crt->cgroup_invalidated_mask, m)) /* NOP? */
5010                 return;
5011
5012         crt->cgroup_invalidated_mask |= m;
5013         unit_add_to_cgroup_realize_queue(u);
5014 }
5015
5016 void unit_invalidate_cgroup_bpf(Unit *u) {
5017         assert(u);
5018
5019         if (!UNIT_HAS_CGROUP_CONTEXT(u))
5020                 return;
5021
5022         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
5023         if (!crt)
5024                 return;
5025
5026         if (crt->cgroup_invalidated_mask & CGROUP_MASK_BPF_FIREWALL) /* NOP? */
5027                 return;
5028
5029         crt->cgroup_invalidated_mask |= CGROUP_MASK_BPF_FIREWALL;
5030         unit_add_to_cgroup_realize_queue(u);
5031
5032         /* If we are a slice unit, we also need to put compile a new BPF program for all our children, as the IP access
5033          * list of our children includes our own. */
5034         if (u->type == UNIT_SLICE) {
5035                 Unit *member;
5036
5037                 UNIT_FOREACH_DEPENDENCY(member, u, UNIT_ATOM_SLICE_OF)
5038                         unit_invalidate_cgroup_bpf(member);
5039         }
5040 }
5041
5042 void unit_cgroup_catchup(Unit *u) {
5043         assert(u);
5044
5045         if (!UNIT_HAS_CGROUP_CONTEXT(u))
5046                 return;
5047
5048         /* We dropped the inotify watch during reexec/reload, so we need to
5049          * check these as they may have changed.
5050          * Note that (currently) the kernel doesn't actually update cgroup
5051          * file modification times, so we can't just serialize and then check
5052          * the mtime for file(s) we are interested in. */
5053         (void) unit_check_cgroup_events(u);
5054         unit_add_to_cgroup_oom_queue(u);
5055 }
5056
5057 bool unit_cgroup_delegate(Unit *u) {
5058         CGroupContext *c;
5059
5060         assert(u);
5061
5062         if (!UNIT_VTABLE(u)->can_delegate)
5063                 return false;
5064
5065         c = unit_get_cgroup_context(u);
5066         if (!c)
5067                 return false;
5068
5069         return c->delegate;
5070 }
5071
5072 void manager_invalidate_startup_units(Manager *m) {
5073         Unit *u;
5074
5075         assert(m);
5076
5077         SET_FOREACH(u, m->startup_units)
5078                 unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_IO|CGROUP_MASK_BLKIO|CGROUP_MASK_CPUSET);
5079 }
5080
5081 static int unit_cgroup_freezer_kernel_state(Unit *u, FreezerState *ret) {
5082         _cleanup_free_ char *val = NULL;
5083         FreezerState s;
5084         int r;
5085
5086         assert(u);
5087         assert(ret);
5088
5089         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
5090         if (!crt || !crt->cgroup_path)
5091                 return -EOWNERDEAD;
5092
5093         r = cg_get_keyed_attribute(
5094                         SYSTEMD_CGROUP_CONTROLLER,
5095                         crt->cgroup_path,
5096                         "cgroup.events",
5097                         STRV_MAKE("frozen"),
5098                         &val);
5099         if (IN_SET(r, -ENOENT, -ENXIO))
5100                 return -ENODATA;
5101         if (r < 0)
5102                 return r;
5103
5104         if (streq(val, "0"))
5105                 s = FREEZER_RUNNING;
5106         else if (streq(val, "1"))
5107                 s = FREEZER_FROZEN;
5108         else {
5109                 log_unit_debug_errno(u, SYNTHETIC_ERRNO(EINVAL), "Unexpected cgroup frozen state: %s", val);
5110                 s = _FREEZER_STATE_INVALID;
5111         }
5112
5113         *ret = s;
5114         return 0;
5115 }
5116
/* Applies a freeze or thaw action to the unit's cgroup by writing to its "cgroup.freeze" attribute, and
 * updates u->freezer_state accordingly.
 *
 * Returns 0 without doing anything if the freezer is not supported or the unit has no realized cgroup.
 * Otherwise returns a negative errno on failure, or on success the value of (target != current): > 0 if
 * the kernel-reported state differed from the target when it was read, 0 if it matched already. */
int unit_cgroup_freezer_action(Unit *u, FreezerAction action) {
        _cleanup_free_ char *path = NULL;
        FreezerState target, current, next;
        int r;

        assert(u);
        assert(IN_SET(action, FREEZER_FREEZE, FREEZER_PARENT_FREEZE,
                              FREEZER_THAW, FREEZER_PARENT_THAW));

        if (!cg_freezer_supported())
                return 0;

        CGroupRuntime *crt = unit_get_cgroup_runtime(u);
        if (!crt || !crt->cgroup_realized)
                return 0; /* No cgroup = nothing running to freeze */

        /* Yields 'next' (the value stored in u->freezer_state at the end) and 'target' (the value compared
         * against the kernel state and used to pick what is written to cgroup.freeze).
         * NOTE(review): the exact transition rules live in unit_next_freezer_state(), which is not visible
         * here. */
        unit_next_freezer_state(u, action, &next, &target);

        /* Ask the kernel what state the cgroup is in right now */
        r = unit_cgroup_freezer_kernel_state(u, &current);
        if (r < 0)
                return r;

        if (current == target)
                /* The kernel is where we want it to be already.
                 * NOTE(review): freezer_state_finish() presumably maps an in-progress state to its
                 * completed counterpart; confirm against its definition. */
                next = freezer_state_finish(next);
        else if (IN_SET(next, FREEZER_FROZEN, FREEZER_FROZEN_BY_PARENT, FREEZER_RUNNING)) {
                /* We're transitioning into a finished state, which implies that the cgroup's
                 * current state already matches the target and thus we'd return 0. But, reality
                 * shows otherwise. This indicates that our freezer_state tracking has diverged
                 * from the real state of the cgroup, which can happen if someone meddles with the
                 * cgroup from underneath us. This really shouldn't happen during normal operation,
                 * though. So, let's warn about it and fix up the state to be valid */

                log_unit_warning(u, "Unit wants to transition to %s freezer state but cgroup is unexpectedly %s, fixing up.",
                                 freezer_state_to_string(next), freezer_state_to_string(current) ?: "(invalid)");

                /* Replace the finished state by the matching in-progress one */
                if (next == FREEZER_FROZEN)
                        next = FREEZER_FREEZING;
                else if (next == FREEZER_FROZEN_BY_PARENT)
                        next = FREEZER_FREEZING_BY_PARENT;
                else if (next == FREEZER_RUNNING)
                        next = FREEZER_THAWING;
        }

        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, crt->cgroup_path, "cgroup.freeze", &path);
        if (r < 0)
                return r;

        log_unit_debug(u, "Unit freezer state was %s, now %s.",
                       freezer_state_to_string(u->freezer_state),
                       freezer_state_to_string(next));

        /* Writes "1" if the target is FREEZER_FROZEN, "0" otherwise. Note that the write is issued even
         * if the kernel state matched the target above. */
        r = write_string_file(path, one_zero(target == FREEZER_FROZEN), WRITE_STRING_FILE_DISABLE_BUFFER);
        if (r < 0)
                return r;

        /* The tracked state is only updated once the write succeeded */
        u->freezer_state = next;
        return target != current;
}
5175
5176 int unit_get_cpuset(Unit *u, CPUSet *cpus, const char *name) {
5177         _cleanup_free_ char *v = NULL;
5178         int r;
5179
5180         assert(u);
5181         assert(cpus);
5182
5183         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
5184         if (!crt || !crt->cgroup_path)
5185                 return -ENODATA;
5186
5187         if ((crt->cgroup_realized_mask & CGROUP_MASK_CPUSET) == 0)
5188                 return -ENODATA;
5189
5190         r = cg_all_unified();
5191         if (r < 0)
5192                 return r;
5193         if (r == 0)
5194                 return -ENODATA;
5195
5196         r = cg_get_attribute("cpuset", crt->cgroup_path, name, &v);
5197         if (r == -ENOENT)
5198                 return -ENODATA;
5199         if (r < 0)
5200                 return r;
5201
5202         return parse_cpu_set_full(v, cpus, false, NULL, NULL, 0, NULL);
5203 }
5204
5205 CGroupRuntime *cgroup_runtime_new(void) {
5206         _cleanup_(cgroup_runtime_freep) CGroupRuntime *crt = NULL;
5207
5208         crt = new(CGroupRuntime, 1);
5209         if (!crt)
5210                 return NULL;
5211
5212         *crt = (CGroupRuntime) {
5213                 .cpu_usage_last = NSEC_INFINITY,
5214
5215                 .cgroup_control_inotify_wd = -1,
5216                 .cgroup_memory_inotify_wd = -1,
5217
5218                 .ip_accounting_ingress_map_fd = -EBADF,
5219                 .ip_accounting_egress_map_fd = -EBADF,
5220
5221                 .ipv4_allow_map_fd = -EBADF,
5222                 .ipv6_allow_map_fd = -EBADF,
5223                 .ipv4_deny_map_fd = -EBADF,
5224                 .ipv6_deny_map_fd = -EBADF,
5225
5226                 .cgroup_invalidated_mask = _CGROUP_MASK_ALL,
5227         };
5228
5229         FOREACH_ELEMENT(i, crt->memory_accounting_last)
5230                 *i = UINT64_MAX;
5231         FOREACH_ELEMENT(i, crt->io_accounting_base)
5232                 *i = UINT64_MAX;
5233         FOREACH_ELEMENT(i, crt->io_accounting_last)
5234                 *i = UINT64_MAX;
5235         FOREACH_ELEMENT(i, crt->ip_accounting_extra)
5236                 *i = UINT64_MAX;
5237
5238         return TAKE_PTR(crt);
5239 }
5240
5241 CGroupRuntime *cgroup_runtime_free(CGroupRuntime *crt) {
5242         if (!crt)
5243                 return NULL;
5244
5245         fdset_free(crt->initial_socket_bind_link_fds);
5246 #if BPF_FRAMEWORK
5247         bpf_link_free(crt->ipv4_socket_bind_link);
5248         bpf_link_free(crt->ipv6_socket_bind_link);
5249 #endif
5250         hashmap_free(crt->bpf_foreign_by_key);
5251
5252         bpf_program_free(crt->bpf_device_control_installed);
5253
5254 #if BPF_FRAMEWORK
5255         bpf_link_free(crt->restrict_ifaces_ingress_bpf_link);
5256         bpf_link_free(crt->restrict_ifaces_egress_bpf_link);
5257 #endif
5258         fdset_free(crt->initial_restrict_ifaces_link_fds);
5259
5260         safe_close(crt->ipv4_allow_map_fd);
5261         safe_close(crt->ipv6_allow_map_fd);
5262         safe_close(crt->ipv4_deny_map_fd);
5263         safe_close(crt->ipv6_deny_map_fd);
5264
5265         bpf_program_free(crt->ip_bpf_ingress);
5266         bpf_program_free(crt->ip_bpf_ingress_installed);
5267         bpf_program_free(crt->ip_bpf_egress);
5268         bpf_program_free(crt->ip_bpf_egress_installed);
5269
5270         set_free(crt->ip_bpf_custom_ingress);
5271         set_free(crt->ip_bpf_custom_ingress_installed);
5272         set_free(crt->ip_bpf_custom_egress);
5273         set_free(crt->ip_bpf_custom_egress_installed);
5274
5275         free(crt->cgroup_path);
5276
5277         return mfree(crt);
5278 }
5279
5280 static const char* const ip_accounting_metric_field_table[_CGROUP_IP_ACCOUNTING_METRIC_MAX] = {
5281         [CGROUP_IP_INGRESS_BYTES]   = "ip-accounting-ingress-bytes",
5282         [CGROUP_IP_INGRESS_PACKETS] = "ip-accounting-ingress-packets",
5283         [CGROUP_IP_EGRESS_BYTES]    = "ip-accounting-egress-bytes",
5284         [CGROUP_IP_EGRESS_PACKETS]  = "ip-accounting-egress-packets",
5285 };
5286
5287 DEFINE_PRIVATE_STRING_TABLE_LOOKUP(ip_accounting_metric_field, CGroupIPAccountingMetric);
5288
5289 static const char* const io_accounting_metric_field_base_table[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = {
5290         [CGROUP_IO_READ_BYTES]       = "io-accounting-read-bytes-base",
5291         [CGROUP_IO_WRITE_BYTES]      = "io-accounting-write-bytes-base",
5292         [CGROUP_IO_READ_OPERATIONS]  = "io-accounting-read-operations-base",
5293         [CGROUP_IO_WRITE_OPERATIONS] = "io-accounting-write-operations-base",
5294 };
5295
5296 DEFINE_PRIVATE_STRING_TABLE_LOOKUP(io_accounting_metric_field_base, CGroupIOAccountingMetric);
5297
5298 static const char* const io_accounting_metric_field_last_table[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = {
5299         [CGROUP_IO_READ_BYTES]       = "io-accounting-read-bytes-last",
5300         [CGROUP_IO_WRITE_BYTES]      = "io-accounting-write-bytes-last",
5301         [CGROUP_IO_READ_OPERATIONS]  = "io-accounting-read-operations-last",
5302         [CGROUP_IO_WRITE_OPERATIONS] = "io-accounting-write-operations-last",
5303 };
5304
5305 DEFINE_PRIVATE_STRING_TABLE_LOOKUP(io_accounting_metric_field_last, CGroupIOAccountingMetric);
5306
5307 static const char* const memory_accounting_metric_field_last_table[_CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST + 1] = {
5308         [CGROUP_MEMORY_PEAK]      = "memory-accounting-peak",
5309         [CGROUP_MEMORY_SWAP_PEAK] = "memory-accounting-swap-peak",
5310 };
5311
5312 DEFINE_PRIVATE_STRING_TABLE_LOOKUP(memory_accounting_metric_field_last, CGroupMemoryAccountingMetric);
5313
5314 static int serialize_cgroup_mask(FILE *f, const char *key, CGroupMask mask) {
5315         _cleanup_free_ char *s = NULL;
5316         int r;
5317
5318         assert(f);
5319         assert(key);
5320
5321         if (mask == 0)
5322                 return 0;
5323
5324         r = cg_mask_to_string(mask, &s);
5325         if (r < 0)
5326                 return log_error_errno(r, "Failed to format cgroup mask: %m");
5327
5328         return serialize_item(f, key, s);
5329 }
5330
5331 int cgroup_runtime_serialize(Unit *u, FILE *f, FDSet *fds) {
5332         int r;
5333
5334         assert(u);
5335         assert(f);
5336         assert(fds);
5337
5338         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
5339         if (!crt)
5340                 return 0;
5341
5342         (void) serialize_item_format(f, "cpu-usage-base", "%" PRIu64, crt->cpu_usage_base);
5343         if (crt->cpu_usage_last != NSEC_INFINITY)
5344                 (void) serialize_item_format(f, "cpu-usage-last", "%" PRIu64, crt->cpu_usage_last);
5345
5346         if (crt->managed_oom_kill_last > 0)
5347                 (void) serialize_item_format(f, "managed-oom-kill-last", "%" PRIu64, crt->managed_oom_kill_last);
5348
5349         if (crt->oom_kill_last > 0)
5350                 (void) serialize_item_format(f, "oom-kill-last", "%" PRIu64, crt->oom_kill_last);
5351
5352         for (CGroupMemoryAccountingMetric metric = 0; metric <= _CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST; metric++) {
5353                 uint64_t v;
5354
5355                 r = unit_get_memory_accounting(u, metric, &v);
5356                 if (r >= 0)
5357                         (void) serialize_item_format(f, memory_accounting_metric_field_last_to_string(metric), "%" PRIu64, v);
5358         }
5359
5360         for (CGroupIPAccountingMetric m = 0; m < _CGROUP_IP_ACCOUNTING_METRIC_MAX; m++) {
5361                 uint64_t v;
5362
5363                 r = unit_get_ip_accounting(u, m, &v);
5364                 if (r >= 0)
5365                         (void) serialize_item_format(f, ip_accounting_metric_field_to_string(m), "%" PRIu64, v);
5366         }
5367
5368         for (CGroupIOAccountingMetric im = 0; im < _CGROUP_IO_ACCOUNTING_METRIC_MAX; im++) {
5369                 (void) serialize_item_format(f, io_accounting_metric_field_base_to_string(im), "%" PRIu64, crt->io_accounting_base[im]);
5370
5371                 if (crt->io_accounting_last[im] != UINT64_MAX)
5372                         (void) serialize_item_format(f, io_accounting_metric_field_last_to_string(im), "%" PRIu64, crt->io_accounting_last[im]);
5373         }
5374
5375         if (crt->cgroup_path)
5376                 (void) serialize_item(f, "cgroup", crt->cgroup_path);
5377         if (crt->cgroup_id != 0)
5378                 (void) serialize_item_format(f, "cgroup-id", "%" PRIu64, crt->cgroup_id);
5379
5380         (void) serialize_bool(f, "cgroup-realized", crt->cgroup_realized);
5381         (void) serialize_cgroup_mask(f, "cgroup-realized-mask", crt->cgroup_realized_mask);
5382         (void) serialize_cgroup_mask(f, "cgroup-enabled-mask", crt->cgroup_enabled_mask);
5383         (void) serialize_cgroup_mask(f, "cgroup-invalidated-mask", crt->cgroup_invalidated_mask);
5384
5385         (void) bpf_socket_bind_serialize(u, f, fds);
5386
5387         (void) bpf_program_serialize_attachment(f, fds, "ip-bpf-ingress-installed", crt->ip_bpf_ingress_installed);
5388         (void) bpf_program_serialize_attachment(f, fds, "ip-bpf-egress-installed", crt->ip_bpf_egress_installed);
5389         (void) bpf_program_serialize_attachment(f, fds, "bpf-device-control-installed", crt->bpf_device_control_installed);
5390         (void) bpf_program_serialize_attachment_set(f, fds, "ip-bpf-custom-ingress-installed", crt->ip_bpf_custom_ingress_installed);
5391         (void) bpf_program_serialize_attachment_set(f, fds, "ip-bpf-custom-egress-installed", crt->ip_bpf_custom_egress_installed);
5392
5393         (void) bpf_restrict_ifaces_serialize(u, f, fds);
5394
5395         return 0;
5396 }
5397
/* Deserialization helper: if the serialized key 'l' equals 'key', makes sure the unit has a
 * CGroupRuntime (via unit_setup_cgroup_runtime()), runs parse_func(v) and, if that returns a
 * non-negative value, stores it in crt->target. A negative return is logged at debug level together
 * with the error reason and is otherwise ignored, as is an allocation failure.
 *
 * Evaluates to true whenever the key matched, regardless of whether the value could be parsed. Uses a
 * GNU statement expression, hence can be used directly inside an if condition. */
#define MATCH_DESERIALIZE(u, key, l, v, parse_func, target)             \
        ({                                                              \
                bool _deserialize_matched = streq(l, key);              \
                if (_deserialize_matched) {                             \
                        CGroupRuntime *crt = unit_setup_cgroup_runtime(u); \
                        if (!crt)                                       \
                                log_oom_debug();                        \
                        else {                                          \
                                int _deserialize_r = parse_func(v);     \
                                if (_deserialize_r < 0)                 \
                                        log_unit_debug_errno(u, _deserialize_r, \
                                                             "Failed to parse \"%s=%s\", ignoring: %m", l, v); \
                                else                                    \
                                        crt->target = _deserialize_r;   \
                        }                                               \
                }                                                       \
                _deserialize_matched;                                   \
        })
5416
/* Deserialization helper for parsers that write their result through an output pointer: if the
 * serialized key 'l' equals 'key', makes sure the unit has a CGroupRuntime (via
 * unit_setup_cgroup_runtime()) and runs parse_func(v, &crt->target). A negative return is logged at
 * debug level together with the error reason and is otherwise ignored, as is an allocation failure.
 *
 * Evaluates to true whenever the key matched, regardless of whether the value could be parsed. */
#define MATCH_DESERIALIZE_IMMEDIATE(u, key, l, v, parse_func, target)   \
        ({                                                              \
                bool _deserialize_matched = streq(l, key);              \
                if (_deserialize_matched) {                             \
                        CGroupRuntime *crt = unit_setup_cgroup_runtime(u); \
                        if (!crt)                                       \
                                log_oom_debug();                        \
                        else {                                          \
                                int _deserialize_r = parse_func(v, &crt->target); \
                                if (_deserialize_r < 0)                 \
                                        log_unit_debug_errno(u, _deserialize_r, \
                                                             "Failed to parse \"%s=%s\", ignoring: %m", l, v); \
                        }                                               \
                }                                                       \
                _deserialize_matched;                                   \
        })
5433
/* Same contract as MATCH_DESERIALIZE(): if 'l' equals 'key', parse 'v' with parse_func(v) and store a
 * non-negative result in crt->target; evaluates to true if the key matched. The two macros used to be
 * verbatim copies of each other, hence simply forward to MATCH_DESERIALIZE() so that they cannot
 * diverge accidentally. */
#define MATCH_DESERIALIZE_METRIC(u, key, l, v, parse_func, target)      \
        MATCH_DESERIALIZE(u, key, l, v, parse_func, target)
5452
5453 int cgroup_runtime_deserialize_one(Unit *u, const char *key, const char *value, FDSet *fds) {
5454         int r;
5455
5456         assert(u);
5457         assert(value);
5458
5459         if (!UNIT_HAS_CGROUP_CONTEXT(u))
5460                 return 0;
5461
5462         if (MATCH_DESERIALIZE_IMMEDIATE(u, "cpu-usage-base", key, value, safe_atou64, cpu_usage_base) ||
5463             MATCH_DESERIALIZE_IMMEDIATE(u, "cpuacct-usage-base", key, value, safe_atou64, cpu_usage_base))
5464                 return 1;
5465
5466         if (MATCH_DESERIALIZE_IMMEDIATE(u, "cpu-usage-last", key, value, safe_atou64, cpu_usage_last))
5467                 return 1;
5468
5469         if (MATCH_DESERIALIZE_IMMEDIATE(u, "managed-oom-kill-last", key, value, safe_atou64, managed_oom_kill_last))
5470                 return 1;
5471
5472         if (MATCH_DESERIALIZE_IMMEDIATE(u, "oom-kill-last", key, value, safe_atou64, oom_kill_last))
5473                 return 1;
5474
5475         if (streq(key, "cgroup")) {
5476                 r = unit_set_cgroup_path(u, value);
5477                 if (r < 0)
5478                         log_unit_debug_errno(u, r, "Failed to set cgroup path %s, ignoring: %m", value);
5479
5480                 (void) unit_watch_cgroup(u);
5481                 (void) unit_watch_cgroup_memory(u);
5482                 return 1;
5483         }
5484
5485         if (MATCH_DESERIALIZE_IMMEDIATE(u, "cgroup-id", key, value, safe_atou64, cgroup_id))
5486                 return 1;
5487
5488         if (MATCH_DESERIALIZE(u, "cgroup-realized", key, value, parse_boolean, cgroup_realized))
5489                 return 1;
5490
5491         if (MATCH_DESERIALIZE_IMMEDIATE(u, "cgroup-realized-mask", key, value, cg_mask_from_string, cgroup_realized_mask))
5492                 return 1;
5493
5494         if (MATCH_DESERIALIZE_IMMEDIATE(u, "cgroup-enabled-mask", key, value, cg_mask_from_string, cgroup_enabled_mask))
5495                 return 1;
5496
5497         if (MATCH_DESERIALIZE_IMMEDIATE(u, "cgroup-invalidated-mask", key, value, cg_mask_from_string, cgroup_invalidated_mask))
5498                 return 1;
5499
5500         if (STR_IN_SET(key, "ipv4-socket-bind-bpf-link-fd", "ipv6-socket-bind-bpf-link-fd")) {
5501                 int fd;
5502
5503                 fd = deserialize_fd(fds, value);
5504                 if (fd >= 0)
5505                         (void) bpf_socket_bind_add_initial_link_fd(u, fd);
5506
5507                 return 1;
5508         }
5509
5510         if (STR_IN_SET(key,
5511                        "ip-bpf-ingress-installed", "ip-bpf-egress-installed",
5512                        "bpf-device-control-installed",
5513                        "ip-bpf-custom-ingress-installed", "ip-bpf-custom-egress-installed")) {
5514
5515                 CGroupRuntime *crt = unit_setup_cgroup_runtime(u);
5516                 if (!crt)
5517                         log_oom_debug();
5518                 else {
5519                         if (streq(key, "ip-bpf-ingress-installed"))
5520                                 (void) bpf_program_deserialize_attachment(value, fds, &crt->ip_bpf_ingress_installed);
5521
5522                         if (streq(key, "ip-bpf-egress-installed"))
5523                                 (void) bpf_program_deserialize_attachment(value, fds, &crt->ip_bpf_egress_installed);
5524
5525                         if (streq(key, "bpf-device-control-installed"))
5526                                 (void) bpf_program_deserialize_attachment(value, fds, &crt->bpf_device_control_installed);
5527
5528                         if (streq(key, "ip-bpf-custom-ingress-installed"))
5529                                 (void) bpf_program_deserialize_attachment_set(value, fds, &crt->ip_bpf_custom_ingress_installed);
5530
5531                         if (streq(key, "ip-bpf-custom-egress-installed"))
5532                                 (void) bpf_program_deserialize_attachment_set(value, fds, &crt->ip_bpf_custom_egress_installed);
5533                 }
5534
5535                 return 1;
5536         }
5537
5538         if (streq(key, "restrict-ifaces-bpf-fd")) {
5539                 int fd;
5540
5541                 fd = deserialize_fd(fds, value);
5542                 if (fd >= 0)
5543                         (void) bpf_restrict_ifaces_add_initial_link_fd(u, fd);
5544                 return 1;
5545         }
5546
5547         CGroupMemoryAccountingMetric mm = memory_accounting_metric_field_last_from_string(key);
5548         if (mm >= 0) {
5549                 uint64_t c;
5550
5551                 r = safe_atou64(value, &c);
5552                 if (r < 0)
5553                         log_unit_debug(u, "Failed to parse memory accounting last value %s, ignoring.", value);
5554                 else {
5555                         CGroupRuntime *crt = unit_setup_cgroup_runtime(u);
5556                         if (!crt)
5557                                 log_oom_debug();
5558                         else
5559                                 crt->memory_accounting_last[mm] = c;
5560                 }
5561
5562                 return 1;
5563         }
5564
5565         CGroupIPAccountingMetric ipm = ip_accounting_metric_field_from_string(key);
5566         if (ipm >= 0) {
5567                 uint64_t c;
5568
5569                 r = safe_atou64(value, &c);
5570                 if (r < 0)
5571                         log_unit_debug(u, "Failed to parse IP accounting value %s, ignoring.", value);
5572                 else {
5573                         CGroupRuntime *crt = unit_setup_cgroup_runtime(u);
5574                         if (!crt)
5575                                 log_oom_debug();
5576                         else
5577                                 crt->ip_accounting_extra[ipm] = c;
5578                 }
5579
5580                 return 1;
5581         }
5582
5583         CGroupIOAccountingMetric iom = io_accounting_metric_field_base_from_string(key);
5584         if (iom >= 0) {
5585                 uint64_t c;
5586
5587                 r = safe_atou64(value, &c);
5588                 if (r < 0)
5589                         log_unit_debug(u, "Failed to parse IO accounting base value %s, ignoring.", value);
5590                 else {
5591                         CGroupRuntime *crt = unit_setup_cgroup_runtime(u);
5592                         if (!crt)
5593                                 log_oom_debug();
5594                         else
5595                                 crt->io_accounting_base[iom] = c;
5596                 }
5597
5598                 return 1;
5599         }
5600
5601         iom = io_accounting_metric_field_last_from_string(key);
5602         if (iom >= 0) {
5603                 uint64_t c;
5604
5605                 r = safe_atou64(value, &c);
5606                 if (r < 0)
5607                         log_unit_debug(u, "Failed to parse IO accounting last value %s, ignoring.", value);
5608                 else {
5609                         CGroupRuntime *crt = unit_setup_cgroup_runtime(u);
5610                         if (!crt)
5611                                 log_oom_debug();
5612                         else
5613                                 crt->io_accounting_last[iom] = c;
5614                 }
5615                 return 1;
5616         }
5617
5618         return 0;
5619 }
5620
5621 static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
5622         [CGROUP_DEVICE_POLICY_AUTO]   = "auto",
5623         [CGROUP_DEVICE_POLICY_CLOSED] = "closed",
5624         [CGROUP_DEVICE_POLICY_STRICT] = "strict",
5625 };
5626
5627 DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);
5628
5629 static const char* const cgroup_pressure_watch_table[_CGROUP_PRESSURE_WATCH_MAX] = {
5630         [CGROUP_PRESSURE_WATCH_OFF]  = "off",
5631         [CGROUP_PRESSURE_WATCH_AUTO] = "auto",
5632         [CGROUP_PRESSURE_WATCH_ON]   = "on",
5633         [CGROUP_PRESSURE_WATCH_SKIP] = "skip",
5634 };
5635
5636 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(cgroup_pressure_watch, CGroupPressureWatch, CGROUP_PRESSURE_WATCH_ON);
5637
5638 static const char* const cgroup_ip_accounting_metric_table[_CGROUP_IP_ACCOUNTING_METRIC_MAX] = {
5639         [CGROUP_IP_INGRESS_BYTES]   = "IPIngressBytes",
5640         [CGROUP_IP_EGRESS_BYTES]    = "IPEgressBytes",
5641         [CGROUP_IP_INGRESS_PACKETS] = "IPIngressPackets",
5642         [CGROUP_IP_EGRESS_PACKETS]  = "IPEgressPackets",
5643 };
5644
5645 DEFINE_STRING_TABLE_LOOKUP(cgroup_ip_accounting_metric, CGroupIPAccountingMetric);
5646
5647 static const char* const cgroup_io_accounting_metric_table[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = {
5648         [CGROUP_IO_READ_BYTES]       = "IOReadBytes",
5649         [CGROUP_IO_WRITE_BYTES]      = "IOWriteBytes",
5650         [CGROUP_IO_READ_OPERATIONS]  = "IOReadOperations",
5651         [CGROUP_IO_WRITE_OPERATIONS] = "IOWriteOperations",
5652 };
5653
5654 DEFINE_STRING_TABLE_LOOKUP(cgroup_io_accounting_metric, CGroupIOAccountingMetric);
5655
5656 static const char* const cgroup_memory_accounting_metric_table[_CGROUP_MEMORY_ACCOUNTING_METRIC_MAX] = {
5657         [CGROUP_MEMORY_PEAK]          = "MemoryPeak",
5658         [CGROUP_MEMORY_SWAP_CURRENT]  = "MemorySwapCurrent",
5659         [CGROUP_MEMORY_SWAP_PEAK]     = "MemorySwapPeak",
5660         [CGROUP_MEMORY_ZSWAP_CURRENT] = "MemoryZSwapCurrent",
5661 };
5662
5663 DEFINE_STRING_TABLE_LOOKUP(cgroup_memory_accounting_metric, CGroupMemoryAccountingMetric);
5664
5665 static const char *const cgroup_effective_limit_type_table[_CGROUP_LIMIT_TYPE_MAX] = {
5666         [CGROUP_LIMIT_MEMORY_MAX]  = "EffectiveMemoryMax",
5667         [CGROUP_LIMIT_MEMORY_HIGH] = "EffectiveMemoryHigh",
5668         [CGROUP_LIMIT_TASKS_MAX]   = "EffectiveTasksMax",
5669 };
5670
5671 DEFINE_STRING_TABLE_LOOKUP(cgroup_effective_limit_type, CGroupLimitType);