src/core/cgroup.c

   1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
   2
   3 #include <fcntl.h>
   4 #include <sys/stat.h>
   5 #include <unistd.h>
   6
   7 #include "sd-bus.h"
   8 #include "sd-messages.h"
   9
  10 #include "af-list.h"
  11 #include "alloc-util.h"
  12 #include "blockdev-util.h"
  13 #include "bpf-devices.h"
  14 #include "bpf-firewall.h"
  15 #include "bpf-foreign.h"
  16 #include "bpf-program.h"
  17 #include "bpf-restrict-ifaces.h"
  18 #include "bpf-socket-bind.h"
  19 #include "btrfs-util.h"
  20 #include "bus-error.h"
  21 #include "bus-locator.h"
  22 #include "cgroup.h"
  23 #include "cgroup-setup.h"
  24 #include "cgroup-util.h"
  25 #include "devnum-util.h"
  26 #include "errno-util.h"
  27 #include "extract-word.h"
  28 #include "fd-util.h"
  29 #include "fdset.h"
  30 #include "fileio.h"
  31 #include "firewall-util.h"
  32 #include "in-addr-prefix-util.h"
  33 #include "inotify-util.h"
  34 #include "ip-protocol-list.h"
  35 #include "limits-util.h"
  36 #include "manager.h"
  37 #include "nulstr-util.h"
  38 #include "parse-util.h"
  39 #include "path-util.h"
  40 #include "percent-util.h"
  41 #include "pidref.h"
  42 #include "process-util.h"
  43 #include "procfs-util.h"
  44 #include "serialize.h"
  45 #include "set.h"
  46 #include "special.h"
  47 #include "stdio-util.h"
  48 #include "string-table.h"
  49 #include "string-util.h"
  50 #include "strv.h"
  51 #include "virt.h"
  52
  53 #if BPF_FRAMEWORK
  54 #include "bpf-dlopen.h"
  55 #include "bpf-link.h"
  56 #include "bpf-restrict-fs.h"
  57 #include "bpf/restrict_fs/restrict-fs-skel.h"
  58 #endif
  59
  60 #define CGROUP_CPU_QUOTA_DEFAULT_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
  61
  62 /* Returns the log level to use when cgroup attribute writes fail. When an attribute is missing or we have access
  63  * problems we downgrade to LOG_DEBUG. This is supposed to be nice to container managers and kernels which want to mask
  64  * out specific attributes from us. */
  65 #define LOG_LEVEL_CGROUP_WRITE(r) (IN_SET(ABS(r), ENOENT, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING)
  66
  67 static void unit_remove_from_cgroup_empty_queue(Unit *u);
  68
  69 uint64_t cgroup_tasks_max_resolve(const CGroupTasksMax *tasks_max) {
  70         if (tasks_max->scale == 0)
  71                 return tasks_max->value;
  72
  73         return system_tasks_max_scale(tasks_max->value, tasks_max->scale);
  74 }
  75
  76 bool manager_owns_host_root_cgroup(Manager *m) {
  77         assert(m);
  78
  79         /* Returns true if we are managing the root cgroup. Note that it isn't sufficient to just check whether the
  80          * group root path equals "/" since that will also be the case if CLONE_NEWCGROUP is in the mix. Since there's
  81          * appears to be no nice way to detect whether we are in a CLONE_NEWCGROUP namespace we instead just check if
  82          * we run in any kind of container virtualization. */
  83
  84         if (MANAGER_IS_USER(m))
  85                 return false;
  86
  87         if (detect_container() > 0)
  88                 return false;
  89
  90         return empty_or_root(m->cgroup_root);
  91 }
  92
  93 bool unit_has_startup_cgroup_constraints(Unit *u) {
  94         assert(u);
  95
  96         /* Returns true if this unit has any directives which apply during
  97          * startup/shutdown phases. */
  98
  99         CGroupContext *c;
 100
 101         c = unit_get_cgroup_context(u);
 102         if (!c)
 103                 return false;
 104
 105         return c->startup_io_weight != CGROUP_WEIGHT_INVALID ||
 106                c->startup_cpuset_cpus.set ||
 107                c->startup_cpuset_mems.set ||
 108                c->startup_memory_high_set ||
 109                c->startup_memory_max_set ||
 110                c->startup_memory_swap_max_set||
 111                c->startup_memory_zswap_max_set ||
 112                c->startup_memory_low_set;
 113 }
 114
 115 bool unit_has_host_root_cgroup(const Unit *u) {
 116         assert(u);
 117         assert(u->manager);
 118
 119         /* Returns whether this unit manages the root cgroup. This will return true if this unit is the root slice and
 120          * the manager manages the root cgroup. */
 121
 122         if (!manager_owns_host_root_cgroup(u->manager))
 123                 return false;
 124
 125         return unit_has_name(u, SPECIAL_ROOT_SLICE);
 126 }
 127
 128 static int set_attribute_and_warn(Unit *u, const char *controller, const char *attribute, const char *value) {
 129         int r;
 130
 131         assert(u);
 132
 133         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
 134         if (!crt || !crt->cgroup_path)
 135                 return -EOWNERDEAD;
 136
 137         r = cg_set_attribute(controller, crt->cgroup_path, attribute, value);
 138         if (r < 0)
 139                 log_unit_full_errno(u, LOG_LEVEL_CGROUP_WRITE(r), r, "Failed to set '%s' attribute on '%s' to '%.*s': %m",
 140                                     strna(attribute), empty_to_root(crt->cgroup_path), (int) strcspn(value, NEWLINE), value);
 141
 142         return r;
 143 }
 144
 145 void cgroup_context_init(CGroupContext *c) {
 146         assert(c);
 147
 148         /* Initialize everything to the kernel defaults. When initializing a bool member to 'true', make
 149          * sure to serialize in execute-serialize.c using serialize_bool() instead of
 150          * serialize_bool_elide(), as sd-executor will initialize here to 'true', but serialize_bool_elide()
 151          * skips serialization if the value is 'false' (as that's the common default), so if the value at
 152          * runtime is zero it would be lost after deserialization. Same when initializing uint64_t and other
 153          * values, update/add a conditional serialization check. This is to minimize the amount of
 154          * serialized data that is sent to the sd-executor, so that there is less work to do on the default
 155          * cases. */
 156
 157         *c = (CGroupContext) {
 158                 .cpu_weight = CGROUP_WEIGHT_INVALID,
 159                 .startup_cpu_weight = CGROUP_WEIGHT_INVALID,
 160                 .cpu_quota_per_sec_usec = USEC_INFINITY,
 161                 .cpu_quota_period_usec = USEC_INFINITY,
 162
 163                 .memory_high = CGROUP_LIMIT_MAX,
 164                 .startup_memory_high = CGROUP_LIMIT_MAX,
 165                 .memory_max = CGROUP_LIMIT_MAX,
 166                 .startup_memory_max = CGROUP_LIMIT_MAX,
 167                 .memory_swap_max = CGROUP_LIMIT_MAX,
 168                 .startup_memory_swap_max = CGROUP_LIMIT_MAX,
 169                 .memory_zswap_max = CGROUP_LIMIT_MAX,
 170                 .startup_memory_zswap_max = CGROUP_LIMIT_MAX,
 171
 172                 .memory_zswap_writeback = true,
 173
 174                 .io_weight = CGROUP_WEIGHT_INVALID,
 175                 .startup_io_weight = CGROUP_WEIGHT_INVALID,
 176
 177                 .tasks_max = CGROUP_TASKS_MAX_UNSET,
 178
 179                 .moom_swap = MANAGED_OOM_AUTO,
 180                 .moom_mem_pressure = MANAGED_OOM_AUTO,
 181                 .moom_preference = MANAGED_OOM_PREFERENCE_NONE,
 182                 /* The default duration value in oomd.conf will be used when
 183                  * moom_mem_pressure_duration_usec is set to infinity. */
 184                 .moom_mem_pressure_duration_usec = USEC_INFINITY,
 185
 186                 .memory_pressure_watch = _CGROUP_PRESSURE_WATCH_INVALID,
 187                 .memory_pressure_threshold_usec = USEC_INFINITY,
 188         };
 189 }
 190
 191 void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
 192         assert(c);
 193         assert(a);
 194
 195         LIST_REMOVE(device_allow, c->device_allow, a);
 196         free(a->path);
 197         free(a);
 198 }
 199
 200 void cgroup_context_free_io_device_weight(CGroupContext *c, CGroupIODeviceWeight *w) {
 201         assert(c);
 202         assert(w);
 203
 204         LIST_REMOVE(device_weights, c->io_device_weights, w);
 205         free(w->path);
 206         free(w);
 207 }
 208
 209 void cgroup_context_free_io_device_latency(CGroupContext *c, CGroupIODeviceLatency *l) {
 210         assert(c);
 211         assert(l);
 212
 213         LIST_REMOVE(device_latencies, c->io_device_latencies, l);
 214         free(l->path);
 215         free(l);
 216 }
 217
 218 void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit *l) {
 219         assert(c);
 220         assert(l);
 221
 222         LIST_REMOVE(device_limits, c->io_device_limits, l);
 223         free(l->path);
 224         free(l);
 225 }
 226
 227 void cgroup_context_remove_bpf_foreign_program(CGroupContext *c, CGroupBPFForeignProgram *p) {
 228         assert(c);
 229         assert(p);
 230
 231         LIST_REMOVE(programs, c->bpf_foreign_programs, p);
 232         free(p->bpffs_path);
 233         free(p);
 234 }
 235
 236 void cgroup_context_remove_socket_bind(CGroupSocketBindItem **head) {
 237         assert(head);
 238
 239         LIST_CLEAR(socket_bind_items, *head, free);
 240 }
 241
 242 void cgroup_context_done(CGroupContext *c) {
 243         assert(c);
 244
 245         while (c->io_device_weights)
 246                 cgroup_context_free_io_device_weight(c, c->io_device_weights);
 247
 248         while (c->io_device_latencies)
 249                 cgroup_context_free_io_device_latency(c, c->io_device_latencies);
 250
 251         while (c->io_device_limits)
 252                 cgroup_context_free_io_device_limit(c, c->io_device_limits);
 253
 254         while (c->device_allow)
 255                 cgroup_context_free_device_allow(c, c->device_allow);
 256
 257         cgroup_context_remove_socket_bind(&c->socket_bind_allow);
 258         cgroup_context_remove_socket_bind(&c->socket_bind_deny);
 259
 260         c->ip_address_allow = set_free(c->ip_address_allow);
 261         c->ip_address_deny = set_free(c->ip_address_deny);
 262
 263         c->ip_filters_ingress = strv_free(c->ip_filters_ingress);
 264         c->ip_filters_egress = strv_free(c->ip_filters_egress);
 265
 266         while (c->bpf_foreign_programs)
 267                 cgroup_context_remove_bpf_foreign_program(c, c->bpf_foreign_programs);
 268
 269         c->restrict_network_interfaces = set_free(c->restrict_network_interfaces);
 270
 271         cpu_set_done(&c->cpuset_cpus);
 272         cpu_set_done(&c->startup_cpuset_cpus);
 273         cpu_set_done(&c->cpuset_mems);
 274         cpu_set_done(&c->startup_cpuset_mems);
 275
 276         c->delegate_subgroup = mfree(c->delegate_subgroup);
 277
 278         nft_set_context_clear(&c->nft_set_context);
 279 }
 280
 281 static int unit_get_kernel_memory_limit(Unit *u, const char *file, uint64_t *ret) {
 282         assert(u);
 283
 284         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
 285         if (!crt || !crt->cgroup_path)
 286                 return -EOWNERDEAD;
 287
 288         return cg_get_attribute_as_uint64("memory", crt->cgroup_path, file, ret);
 289 }
 290
 291 static int unit_compare_memory_limit(Unit *u, const char *property_name, uint64_t *ret_unit_value, uint64_t *ret_kernel_value) {
 292         CGroupContext *c;
 293         CGroupMask m;
 294         const char *file;
 295         uint64_t unit_value;
 296         int r;
 297
 298         /* Compare kernel memcg configuration against our internal systemd state.
 299          *
 300          * Returns:
 301          *
 302          * <0: On error.
 303          *  0: If the kernel memory setting doesn't match our configuration.
 304          * >0: If the kernel memory setting matches our configuration.
 305          *
 306          * The following values are only guaranteed to be populated on return >=0:
 307          *
 308          * - ret_unit_value will contain our internal expected value for the unit, page-aligned.
 309          * - ret_kernel_value will contain the actual value presented by the kernel. */
 310
 311         assert(u);
 312
 313         /* The root slice doesn't have any controller files, so we can't compare anything. */
 314         if (unit_has_name(u, SPECIAL_ROOT_SLICE))
 315                 return -ENODATA;
 316
 317         /* It's possible to have MemoryFoo set without systemd wanting to have the memory controller enabled,
 318          * for example, in the case of DisableControllers= or cgroup_disable on the kernel command line. To
 319          * avoid specious errors in these scenarios, check that we even expect the memory controller to be
 320          * enabled at all. */
 321         m = unit_get_target_mask(u);
 322         if (!FLAGS_SET(m, CGROUP_MASK_MEMORY))
 323                 return -ENODATA;
 324
 325         assert_se(c = unit_get_cgroup_context(u));
 326
 327         bool startup = u->manager && IN_SET(manager_state(u->manager), MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING);
 328
 329         if (streq(property_name, "MemoryLow")) {
 330                 unit_value = unit_get_ancestor_memory_low(u);
 331                 file = "memory.low";
 332         } else if (startup && streq(property_name, "StartupMemoryLow")) {
 333                 unit_value = unit_get_ancestor_startup_memory_low(u);
 334                 file = "memory.low";
 335         } else if (streq(property_name, "MemoryMin")) {
 336                 unit_value = unit_get_ancestor_memory_min(u);
 337                 file = "memory.min";
 338         } else if (streq(property_name, "MemoryHigh")) {
 339                 unit_value = c->memory_high;
 340                 file = "memory.high";
 341         } else if (startup && streq(property_name, "StartupMemoryHigh")) {
 342                 unit_value = c->startup_memory_high;
 343                 file = "memory.high";
 344         } else if (streq(property_name, "MemoryMax")) {
 345                 unit_value = c->memory_max;
 346                 file = "memory.max";
 347         } else if (startup && streq(property_name, "StartupMemoryMax")) {
 348                 unit_value = c->startup_memory_max;
 349                 file = "memory.max";
 350         } else if (streq(property_name, "MemorySwapMax")) {
 351                 unit_value = c->memory_swap_max;
 352                 file = "memory.swap.max";
 353         } else if (startup && streq(property_name, "StartupMemorySwapMax")) {
 354                 unit_value = c->startup_memory_swap_max;
 355                 file = "memory.swap.max";
 356         } else if (streq(property_name, "MemoryZSwapMax")) {
 357                 unit_value = c->memory_zswap_max;
 358                 file = "memory.zswap.max";
 359         } else if (startup && streq(property_name, "StartupMemoryZSwapMax")) {
 360                 unit_value = c->startup_memory_zswap_max;
 361                 file = "memory.zswap.max";
 362         } else
 363                 return -EINVAL;
 364
 365         r = unit_get_kernel_memory_limit(u, file, ret_kernel_value);
 366         if (r < 0)
 367                 return log_unit_debug_errno(u, r, "Failed to parse %s: %m", file);
 368
 369         /* It's intended (soon) in a future kernel to not expose cgroup memory limits rounded to page
 370          * boundaries, but instead separate the user-exposed limit, which is whatever userspace told us, from
 371          * our internal page-counting. To support those future kernels, just check the value itself first
 372          * without any page-alignment. */
 373         if (*ret_kernel_value == unit_value) {
 374                 *ret_unit_value = unit_value;
 375                 return 1;
 376         }
 377
 378         /* The current kernel behaviour, by comparison, is that even if you write a particular number of
 379          * bytes into a cgroup memory file, it always returns that number page-aligned down (since the kernel
 380          * internally stores cgroup limits in pages). As such, so long as it aligns properly, everything is
 381          * cricket. */
 382         if (unit_value != CGROUP_LIMIT_MAX)
 383                 unit_value = PAGE_ALIGN_DOWN(unit_value);
 384
 385         *ret_unit_value = unit_value;
 386
 387         return *ret_kernel_value == *ret_unit_value;
 388 }
 389
 390 #define FORMAT_CGROUP_DIFF_MAX 128
 391
 392 static char *format_cgroup_memory_limit_comparison(Unit *u, const char *property_name, char *buf, size_t l) {
 393         uint64_t kval, sval;
 394         int r;
 395
 396         assert(u);
 397         assert(property_name);
 398         assert(buf);
 399         assert(l > 0);
 400
 401         r = unit_compare_memory_limit(u, property_name, &sval, &kval);
 402
 403         /* memory.swap.max is special in that it relies on CONFIG_MEMCG_SWAP (and the default swapaccount=1).
 404          * In the absence of reliably being able to detect whether memcg swap support is available or not,
 405          * only complain if the error is not ENOENT. This is similarly the case for memory.zswap.max relying
 406          * on CONFIG_ZSWAP. */
 407         if (r > 0 || IN_SET(r, -ENODATA, -EOWNERDEAD) ||
 408             (r == -ENOENT && STR_IN_SET(property_name,
 409                                         "MemorySwapMax",
 410                                         "StartupMemorySwapMax",
 411                                         "MemoryZSwapMax",
 412                                         "StartupMemoryZSwapMax")))
 413                 buf[0] = 0;
 414         else if (r < 0) {
 415                 errno = -r;
 416                 (void) snprintf(buf, l, " (error getting kernel value: %m)");
 417         } else
 418                 (void) snprintf(buf, l, " (different value in kernel: %" PRIu64 ")", kval);
 419
 420         return buf;
 421 }
 422
 423 const char* cgroup_device_permissions_to_string(CGroupDevicePermissions p) {
 424         static const char *table[_CGROUP_DEVICE_PERMISSIONS_MAX] = {
 425                 /* Lets simply define a table with every possible combination. As long as those are just 8 we
 426                  * can get away with it. If this ever grows to more we need to revisit this logic though. */
 427                 [0]                                                          = "",
 428                 [CGROUP_DEVICE_READ]                                         = "r",
 429                 [CGROUP_DEVICE_WRITE]                                        = "w",
 430                 [CGROUP_DEVICE_MKNOD]                                        = "m",
 431                 [CGROUP_DEVICE_READ|CGROUP_DEVICE_WRITE]                     = "rw",
 432                 [CGROUP_DEVICE_READ|CGROUP_DEVICE_MKNOD]                     = "rm",
 433                 [CGROUP_DEVICE_WRITE|CGROUP_DEVICE_MKNOD]                    = "wm",
 434                 [CGROUP_DEVICE_READ|CGROUP_DEVICE_WRITE|CGROUP_DEVICE_MKNOD] = "rwm",
 435         };
 436
 437         if (p < 0 || p >= _CGROUP_DEVICE_PERMISSIONS_MAX)
 438                 return NULL;
 439
 440         return table[p];
 441 }
 442
 443 CGroupDevicePermissions cgroup_device_permissions_from_string(const char *s) {
 444         CGroupDevicePermissions p = 0;
 445
 446         if (!s)
 447                 return _CGROUP_DEVICE_PERMISSIONS_INVALID;
 448
 449         for (const char *c = s; *c; c++) {
 450                 if (*c == 'r')
 451                         p |= CGROUP_DEVICE_READ;
 452                 else if (*c == 'w')
 453                         p |= CGROUP_DEVICE_WRITE;
 454                 else if (*c == 'm')
 455                         p |= CGROUP_DEVICE_MKNOD;
 456                 else
 457                         return _CGROUP_DEVICE_PERMISSIONS_INVALID;
 458         }
 459
 460         return p;
 461 }
 462
 463 void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) {
 464         _cleanup_free_ char *disable_controllers_str = NULL, *delegate_controllers_str = NULL, *cpuset_cpus = NULL, *cpuset_mems = NULL, *startup_cpuset_cpus = NULL, *startup_cpuset_mems = NULL;
 465         CGroupContext *c;
 466         struct in_addr_prefix *iaai;
 467         char cda[FORMAT_CGROUP_DIFF_MAX], cdb[FORMAT_CGROUP_DIFF_MAX], cdc[FORMAT_CGROUP_DIFF_MAX], cdd[FORMAT_CGROUP_DIFF_MAX],
 468                 cde[FORMAT_CGROUP_DIFF_MAX], cdf[FORMAT_CGROUP_DIFF_MAX], cdg[FORMAT_CGROUP_DIFF_MAX], cdh[FORMAT_CGROUP_DIFF_MAX],
 469                 cdi[FORMAT_CGROUP_DIFF_MAX], cdj[FORMAT_CGROUP_DIFF_MAX], cdk[FORMAT_CGROUP_DIFF_MAX];
 470
 471         assert(u);
 472         assert(f);
 473
 474         assert_se(c = unit_get_cgroup_context(u));
 475
 476         prefix = strempty(prefix);
 477
 478         (void) cg_mask_to_string(c->disable_controllers, &disable_controllers_str);
 479         (void) cg_mask_to_string(c->delegate_controllers, &delegate_controllers_str);
 480
 481         /* "Delegate=" means "yes, but no controllers". Show this as "(none)". */
 482         const char *delegate_str = delegate_controllers_str ?: c->delegate ? "(none)" : "no";
 483
 484         cpuset_cpus = cpu_set_to_range_string(&c->cpuset_cpus);
 485         startup_cpuset_cpus = cpu_set_to_range_string(&c->startup_cpuset_cpus);
 486         cpuset_mems = cpu_set_to_range_string(&c->cpuset_mems);
 487         startup_cpuset_mems = cpu_set_to_range_string(&c->startup_cpuset_mems);
 488
 489         fprintf(f,
 490                 "%sIOAccounting: %s\n"
 491                 "%sMemoryAccounting: %s\n"
 492                 "%sTasksAccounting: %s\n"
 493                 "%sIPAccounting: %s\n"
 494                 "%sCPUWeight: %" PRIu64 "\n"
 495                 "%sStartupCPUWeight: %" PRIu64 "\n"
 496                 "%sCPUQuotaPerSecSec: %s\n"
 497                 "%sCPUQuotaPeriodSec: %s\n"
 498                 "%sAllowedCPUs: %s\n"
 499                 "%sStartupAllowedCPUs: %s\n"
 500                 "%sAllowedMemoryNodes: %s\n"
 501                 "%sStartupAllowedMemoryNodes: %s\n"
 502                 "%sIOWeight: %" PRIu64 "\n"
 503                 "%sStartupIOWeight: %" PRIu64 "\n"
 504                 "%sDefaultMemoryMin: %" PRIu64 "\n"
 505                 "%sDefaultMemoryLow: %" PRIu64 "\n"
 506                 "%sMemoryMin: %" PRIu64 "%s\n"
 507                 "%sMemoryLow: %" PRIu64 "%s\n"
 508                 "%sStartupMemoryLow: %" PRIu64 "%s\n"
 509                 "%sMemoryHigh: %" PRIu64 "%s\n"
 510                 "%sStartupMemoryHigh: %" PRIu64 "%s\n"
 511                 "%sMemoryMax: %" PRIu64 "%s\n"
 512                 "%sStartupMemoryMax: %" PRIu64 "%s\n"
 513                 "%sMemorySwapMax: %" PRIu64 "%s\n"
 514                 "%sStartupMemorySwapMax: %" PRIu64 "%s\n"
 515                 "%sMemoryZSwapMax: %" PRIu64 "%s\n"
 516                 "%sStartupMemoryZSwapMax: %" PRIu64 "%s\n"
 517                 "%sMemoryZSwapWriteback: %s\n"
 518                 "%sTasksMax: %" PRIu64 "\n"
 519                 "%sDevicePolicy: %s\n"
 520                 "%sDisableControllers: %s\n"
 521                 "%sDelegate: %s\n"
 522                 "%sManagedOOMSwap: %s\n"
 523                 "%sManagedOOMMemoryPressure: %s\n"
 524                 "%sManagedOOMMemoryPressureLimit: " PERMYRIAD_AS_PERCENT_FORMAT_STR "\n"
 525                 "%sManagedOOMPreference: %s\n"
 526                 "%sMemoryPressureWatch: %s\n"
 527                 "%sCoredumpReceive: %s\n",
 528                 prefix, yes_no(c->io_accounting),
 529                 prefix, yes_no(c->memory_accounting),
 530                 prefix, yes_no(c->tasks_accounting),
 531                 prefix, yes_no(c->ip_accounting),
 532                 prefix, c->cpu_weight,
 533                 prefix, c->startup_cpu_weight,
 534                 prefix, FORMAT_TIMESPAN(c->cpu_quota_per_sec_usec, 1),
 535                 prefix, FORMAT_TIMESPAN(c->cpu_quota_period_usec, 1),
 536                 prefix, strempty(cpuset_cpus),
 537                 prefix, strempty(startup_cpuset_cpus),
 538                 prefix, strempty(cpuset_mems),
 539                 prefix, strempty(startup_cpuset_mems),
 540                 prefix, c->io_weight,
 541                 prefix, c->startup_io_weight,
 542                 prefix, c->default_memory_min,
 543                 prefix, c->default_memory_low,
 544                 prefix, c->memory_min, format_cgroup_memory_limit_comparison(u, "MemoryMin", cda, sizeof(cda)),
 545                 prefix, c->memory_low, format_cgroup_memory_limit_comparison(u, "MemoryLow", cdb, sizeof(cdb)),
 546                 prefix, c->startup_memory_low, format_cgroup_memory_limit_comparison(u, "StartupMemoryLow", cdc, sizeof(cdc)),
 547                 prefix, c->memory_high, format_cgroup_memory_limit_comparison(u, "MemoryHigh", cdd, sizeof(cdd)),
 548                 prefix, c->startup_memory_high, format_cgroup_memory_limit_comparison(u, "StartupMemoryHigh", cde, sizeof(cde)),
 549                 prefix, c->memory_max, format_cgroup_memory_limit_comparison(u, "MemoryMax", cdf, sizeof(cdf)),
 550                 prefix, c->startup_memory_max, format_cgroup_memory_limit_comparison(u, "StartupMemoryMax", cdg, sizeof(cdg)),
 551                 prefix, c->memory_swap_max, format_cgroup_memory_limit_comparison(u, "MemorySwapMax", cdh, sizeof(cdh)),
 552                 prefix, c->startup_memory_swap_max, format_cgroup_memory_limit_comparison(u, "StartupMemorySwapMax", cdi, sizeof(cdi)),
 553                 prefix, c->memory_zswap_max, format_cgroup_memory_limit_comparison(u, "MemoryZSwapMax", cdj, sizeof(cdj)),
 554                 prefix, c->startup_memory_zswap_max, format_cgroup_memory_limit_comparison(u, "StartupMemoryZSwapMax", cdk, sizeof(cdk)),
 555                 prefix, yes_no(c->memory_zswap_writeback),
 556                 prefix, cgroup_tasks_max_resolve(&c->tasks_max),
 557                 prefix, cgroup_device_policy_to_string(c->device_policy),
 558                 prefix, strempty(disable_controllers_str),
 559                 prefix, delegate_str,
 560                 prefix, managed_oom_mode_to_string(c->moom_swap),
 561                 prefix, managed_oom_mode_to_string(c->moom_mem_pressure),
 562                 prefix, PERMYRIAD_AS_PERCENT_FORMAT_VAL(UINT32_SCALE_TO_PERMYRIAD(c->moom_mem_pressure_limit)),
 563                 prefix, managed_oom_preference_to_string(c->moom_preference),
 564                 prefix, cgroup_pressure_watch_to_string(c->memory_pressure_watch),
 565                 prefix, yes_no(c->coredump_receive));
 566
 567         if (c->delegate_subgroup)
 568                 fprintf(f, "%sDelegateSubgroup: %s\n",
 569                         prefix, c->delegate_subgroup);
 570
 571         if (c->memory_pressure_threshold_usec != USEC_INFINITY)
 572                 fprintf(f, "%sMemoryPressureThresholdSec: %s\n",
 573                         prefix, FORMAT_TIMESPAN(c->memory_pressure_threshold_usec, 1));
 574
 575         if (c->moom_mem_pressure_duration_usec != USEC_INFINITY)
 576                 fprintf(f, "%sManagedOOMMemoryPressureDurationSec: %s\n",
 577                         prefix, FORMAT_TIMESPAN(c->moom_mem_pressure_duration_usec, 1));
 578
 579         LIST_FOREACH(device_allow, a, c->device_allow)
 580                 /* strna() below should be redundant, for avoiding -Werror=format-overflow= error. See #30223. */
 581                 fprintf(f,
 582                         "%sDeviceAllow: %s %s\n",
 583                         prefix,
 584                         a->path,
 585                         strna(cgroup_device_permissions_to_string(a->permissions)));
 586
 587         LIST_FOREACH(device_weights, iw, c->io_device_weights)
 588                 fprintf(f,
 589                         "%sIODeviceWeight: %s %" PRIu64 "\n",
 590                         prefix,
 591                         iw->path,
 592                         iw->weight);
 593
 594         LIST_FOREACH(device_latencies, l, c->io_device_latencies)
 595                 fprintf(f,
 596                         "%sIODeviceLatencyTargetSec: %s %s\n",
 597                         prefix,
 598                         l->path,
 599                         FORMAT_TIMESPAN(l->target_usec, 1));
 600
 601         LIST_FOREACH(device_limits, il, c->io_device_limits)
 602                 for (CGroupIOLimitType type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
 603                         if (il->limits[type] != cgroup_io_limit_defaults[type])
 604                                 fprintf(f,
 605                                         "%s%s: %s %s\n",
 606                                         prefix,
 607                                         cgroup_io_limit_type_to_string(type),
 608                                         il->path,
 609                                         FORMAT_BYTES(il->limits[type]));
 610
 611         SET_FOREACH(iaai, c->ip_address_allow)
 612                 fprintf(f, "%sIPAddressAllow: %s\n", prefix,
 613                         IN_ADDR_PREFIX_TO_STRING(iaai->family, &iaai->address, iaai->prefixlen));
 614         SET_FOREACH(iaai, c->ip_address_deny)
 615                 fprintf(f, "%sIPAddressDeny: %s\n", prefix,
 616                         IN_ADDR_PREFIX_TO_STRING(iaai->family, &iaai->address, iaai->prefixlen));
 617
 618         STRV_FOREACH(path, c->ip_filters_ingress)
 619                 fprintf(f, "%sIPIngressFilterPath: %s\n", prefix, *path);
 620         STRV_FOREACH(path, c->ip_filters_egress)
 621                 fprintf(f, "%sIPEgressFilterPath: %s\n", prefix, *path);
 622
 623         LIST_FOREACH(programs, p, c->bpf_foreign_programs)
 624                 fprintf(f, "%sBPFProgram: %s:%s",
 625                         prefix, bpf_cgroup_attach_type_to_string(p->attach_type), p->bpffs_path);
 626
 627         if (c->socket_bind_allow) {
 628                 fprintf(f, "%sSocketBindAllow: ", prefix);
 629                 cgroup_context_dump_socket_bind_items(c->socket_bind_allow, f);
 630                 fputc('\n', f);
 631         }
 632
 633         if (c->socket_bind_deny) {
 634                 fprintf(f, "%sSocketBindDeny: ", prefix);
 635                 cgroup_context_dump_socket_bind_items(c->socket_bind_deny, f);
 636                 fputc('\n', f);
 637         }
 638
 639         if (c->restrict_network_interfaces) {
 640                 char *iface;
 641                 SET_FOREACH(iface, c->restrict_network_interfaces)
 642                         fprintf(f, "%sRestrictNetworkInterfaces: %s\n", prefix, iface);
 643         }
 644
 645         FOREACH_ARRAY(nft_set, c->nft_set_context.sets, c->nft_set_context.n_sets)
 646                 fprintf(f, "%sNFTSet: %s:%s:%s:%s\n", prefix, nft_set_source_to_string(nft_set->source),
 647                         nfproto_to_string(nft_set->nfproto), nft_set->table, nft_set->set);
 648 }
 649
 650 void cgroup_context_dump_socket_bind_item(const CGroupSocketBindItem *item, FILE *f) {
 651         const char *family, *colon1, *protocol = "", *colon2 = "";
 652
 653         family = strempty(af_to_ipv4_ipv6(item->address_family));
 654         colon1 = isempty(family) ? "" : ":";
 655
 656         if (item->ip_protocol != 0) {
 657                 protocol = ip_protocol_to_tcp_udp(item->ip_protocol);
 658                 colon2 = ":";
 659         }
 660
 661         if (item->nr_ports == 0)
 662                 fprintf(f, "%s%s%s%sany", family, colon1, protocol, colon2);
 663         else if (item->nr_ports == 1)
 664                 fprintf(f, "%s%s%s%s%" PRIu16, family, colon1, protocol, colon2, item->port_min);
 665         else {
 666                 uint16_t port_max = item->port_min + item->nr_ports - 1;
 667                 fprintf(f, "%s%s%s%s%" PRIu16 "-%" PRIu16, family, colon1, protocol, colon2,
 668                         item->port_min, port_max);
 669         }
 670 }
 671
 672 void cgroup_context_dump_socket_bind_items(const CGroupSocketBindItem *items, FILE *f) {
 673         bool first = true;
 674
 675         LIST_FOREACH(socket_bind_items, bi, items) {
 676                 if (first)
 677                         first = false;
 678                 else
 679                         fputc(' ', f);
 680
 681                 cgroup_context_dump_socket_bind_item(bi, f);
 682         }
 683 }
 684
 685 int cgroup_context_add_device_allow(CGroupContext *c, const char *dev, CGroupDevicePermissions p) {
 686         _cleanup_free_ CGroupDeviceAllow *a = NULL;
 687         _cleanup_free_ char *d = NULL;
 688
 689         assert(c);
 690         assert(dev);
 691         assert(p >= 0 && p < _CGROUP_DEVICE_PERMISSIONS_MAX);
 692
 693         if (p == 0)
 694                 p = _CGROUP_DEVICE_PERMISSIONS_ALL;
 695
 696         a = new(CGroupDeviceAllow, 1);
 697         if (!a)
 698                 return -ENOMEM;
 699
 700         d = strdup(dev);
 701         if (!d)
 702                 return -ENOMEM;
 703
 704         *a = (CGroupDeviceAllow) {
 705                 .path = TAKE_PTR(d),
 706                 .permissions = p,
 707         };
 708
 709         LIST_PREPEND(device_allow, c->device_allow, a);
 710         TAKE_PTR(a);
 711
 712         return 0;
 713 }
 714
 715 int cgroup_context_add_or_update_device_allow(CGroupContext *c, const char *dev, CGroupDevicePermissions p) {
 716         assert(c);
 717         assert(dev);
 718         assert(p >= 0 && p < _CGROUP_DEVICE_PERMISSIONS_MAX);
 719
 720         if (p == 0)
 721                 p = _CGROUP_DEVICE_PERMISSIONS_ALL;
 722
 723         LIST_FOREACH(device_allow, b, c->device_allow)
 724                 if (path_equal(b->path, dev)) {
 725                         b->permissions = p;
 726                         return 0;
 727                 }
 728
 729         return cgroup_context_add_device_allow(c, dev, p);
 730 }
 731
 732 int cgroup_context_add_bpf_foreign_program(CGroupContext *c, uint32_t attach_type, const char *bpffs_path) {
 733         CGroupBPFForeignProgram *p;
 734         _cleanup_free_ char *d = NULL;
 735
 736         assert(c);
 737         assert(bpffs_path);
 738
 739         if (!path_is_normalized(bpffs_path) || !path_is_absolute(bpffs_path))
 740                 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Path is not normalized.");
 741
 742         d = strdup(bpffs_path);
 743         if (!d)
 744                 return log_oom();
 745
 746         p = new(CGroupBPFForeignProgram, 1);
 747         if (!p)
 748                 return log_oom();
 749
 750         *p = (CGroupBPFForeignProgram) {
 751                 .attach_type = attach_type,
 752                 .bpffs_path = TAKE_PTR(d),
 753         };
 754
 755         LIST_PREPEND(programs, c->bpf_foreign_programs, TAKE_PTR(p));
 756
 757         return 0;
 758 }
 759
 760 #define UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(entry)                       \
 761         uint64_t unit_get_ancestor_##entry(Unit *u) {                   \
 762                 CGroupContext *c;                                       \
 763                                                                         \
 764                 /* 1. Is entry set in this unit? If so, use that.       \
 765                  * 2. Is the default for this entry set in any          \
 766                  *    ancestor? If so, use that.                        \
 767                  * 3. Otherwise, return CGROUP_LIMIT_MIN. */            \
 768                                                                         \
 769                 assert(u);                                              \
 770                                                                         \
 771                 c = unit_get_cgroup_context(u);                         \
 772                 if (c && c->entry##_set)                                \
 773                         return c->entry;                                \
 774                                                                         \
 775                 while ((u = UNIT_GET_SLICE(u))) {                       \
 776                         c = unit_get_cgroup_context(u);                 \
 777                         if (c && c->default_##entry##_set)              \
 778                                 return c->default_##entry;              \
 779                 }                                                       \
 780                                                                         \
 781                 /* We've reached the root, but nobody had default for   \
 782                  * this entry set, so set it to the kernel default. */  \
 783                 return CGROUP_LIMIT_MIN;                                \
 784 }
 785
 786 UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(memory_low);
 787 UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(startup_memory_low);
 788 UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(memory_min);
 789
 790 static void unit_set_xattr_graceful(Unit *u, const char *name, const void *data, size_t size) {
 791         int r;
 792
 793         assert(u);
 794         assert(name);
 795
 796         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
 797         if (!crt || !crt->cgroup_path)
 798                 return;
 799
 800         r = cg_set_xattr(crt->cgroup_path, name, data, size, 0);
 801         if (r < 0)
 802                 log_unit_debug_errno(u, r, "Failed to set '%s' xattr on control group %s, ignoring: %m", name, empty_to_root(crt->cgroup_path));
 803 }
 804
 805 static void unit_remove_xattr_graceful(Unit *u, const char *name) {
 806         int r;
 807
 808         assert(u);
 809         assert(name);
 810
 811         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
 812         if (!crt || !crt->cgroup_path)
 813                 return;
 814
 815         r = cg_remove_xattr(crt->cgroup_path, name);
 816         if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r))
 817                 log_unit_debug_errno(u, r, "Failed to remove '%s' xattr flag on control group %s, ignoring: %m", name, empty_to_root(crt->cgroup_path));
 818 }
 819
 820 static void cgroup_oomd_xattr_apply(Unit *u) {
 821         CGroupContext *c;
 822
 823         assert(u);
 824
 825         c = unit_get_cgroup_context(u);
 826         if (!c)
 827                 return;
 828
 829         if (c->moom_preference == MANAGED_OOM_PREFERENCE_OMIT)
 830                 unit_set_xattr_graceful(u, "user.oomd_omit", "1", 1);
 831
 832         if (c->moom_preference == MANAGED_OOM_PREFERENCE_AVOID)
 833                 unit_set_xattr_graceful(u, "user.oomd_avoid", "1", 1);
 834
 835         if (c->moom_preference != MANAGED_OOM_PREFERENCE_AVOID)
 836                 unit_remove_xattr_graceful(u, "user.oomd_avoid");
 837
 838         if (c->moom_preference != MANAGED_OOM_PREFERENCE_OMIT)
 839                 unit_remove_xattr_graceful(u, "user.oomd_omit");
 840 }
 841
 842 static int cgroup_log_xattr_apply(Unit *u) {
 843         ExecContext *c;
 844         size_t len, allowed_patterns_len, denied_patterns_len;
 845         _cleanup_free_ char *patterns = NULL, *allowed_patterns = NULL, *denied_patterns = NULL;
 846         char *last;
 847         int r;
 848
 849         assert(u);
 850
 851         c = unit_get_exec_context(u);
 852         if (!c)
 853                 /* Some unit types have a cgroup context but no exec context, so we do not log
 854                  * any error here to avoid confusion. */
 855                 return 0;
 856
 857         if (set_isempty(c->log_filter_allowed_patterns) && set_isempty(c->log_filter_denied_patterns)) {
 858                 unit_remove_xattr_graceful(u, "user.journald_log_filter_patterns");
 859                 return 0;
 860         }
 861
 862         r = set_make_nulstr(c->log_filter_allowed_patterns, &allowed_patterns, &allowed_patterns_len);
 863         if (r < 0)
 864                 return log_debug_errno(r, "Failed to make nulstr from set: %m");
 865
 866         r = set_make_nulstr(c->log_filter_denied_patterns, &denied_patterns, &denied_patterns_len);
 867         if (r < 0)
 868                 return log_debug_errno(r, "Failed to make nulstr from set: %m");
 869
 870         /* Use nul character separated strings without trailing nul */
 871         allowed_patterns_len = LESS_BY(allowed_patterns_len, 1u);
 872         denied_patterns_len = LESS_BY(denied_patterns_len, 1u);
 873
 874         len = allowed_patterns_len + 1 + denied_patterns_len;
 875         patterns = new(char, len);
 876         if (!patterns)
 877                 return log_oom_debug();
 878
 879         last = mempcpy_safe(patterns, allowed_patterns, allowed_patterns_len);
 880         *(last++) = '\xff';
 881         memcpy_safe(last, denied_patterns, denied_patterns_len);
 882
 883         unit_set_xattr_graceful(u, "user.journald_log_filter_patterns", patterns, len);
 884
 885         return 0;
 886 }
 887
 888 static void cgroup_invocation_id_xattr_apply(Unit *u) {
 889         bool b;
 890
 891         assert(u);
 892
 893         b = !sd_id128_is_null(u->invocation_id);
 894         FOREACH_STRING(xn, "trusted.invocation_id", "user.invocation_id") {
 895                 if (b)
 896                         unit_set_xattr_graceful(u, xn, SD_ID128_TO_STRING(u->invocation_id), 32);
 897                 else
 898                         unit_remove_xattr_graceful(u, xn);
 899         }
 900 }
 901
 902 static void cgroup_coredump_xattr_apply(Unit *u) {
 903         CGroupContext *c;
 904
 905         assert(u);
 906
 907         c = unit_get_cgroup_context(u);
 908         if (!c)
 909                 return;
 910
 911         if (unit_cgroup_delegate(u) && c->coredump_receive)
 912                 unit_set_xattr_graceful(u, "user.coredump_receive", "1", 1);
 913         else
 914                 unit_remove_xattr_graceful(u, "user.coredump_receive");
 915 }
 916
 917 static void cgroup_delegate_xattr_apply(Unit *u) {
 918         bool b;
 919
 920         assert(u);
 921
 922         /* Indicate on the cgroup whether delegation is on, via an xattr. This is best-effort, as old kernels
 923          * didn't support xattrs on cgroups at all. Later they got support for setting 'trusted.*' xattrs,
 924          * and even later 'user.*' xattrs. We started setting this field when 'trusted.*' was added, and
 925          * given this is now pretty much API, let's continue to support that. But also set 'user.*' as well,
 926          * since it is readable by any user, not just CAP_SYS_ADMIN. This hence comes with slightly weaker
 927          * security (as users who got delegated cgroups could turn it off if they like), but this shouldn't
 928          * be a big problem given this communicates delegation state to clients, but the manager never reads
 929          * it. */
 930         b = unit_cgroup_delegate(u);
 931         FOREACH_STRING(xn, "trusted.delegate", "user.delegate") {
 932                 if (b)
 933                         unit_set_xattr_graceful(u, xn, "1", 1);
 934                 else
 935                         unit_remove_xattr_graceful(u, xn);
 936         }
 937 }
 938
 939 static void cgroup_survive_xattr_apply(Unit *u) {
 940         int r;
 941
 942         assert(u);
 943
 944         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
 945         if (!crt)
 946                 return;
 947
 948         if (u->survive_final_kill_signal) {
 949                 r = cg_set_xattr(
 950                                 crt->cgroup_path,
 951                                 "user.survive_final_kill_signal",
 952                                 "1",
 953                                 1,
 954                                 /* flags= */ 0);
 955                 /* user xattr support was added in kernel v5.7 */
 956                 if (ERRNO_IS_NEG_NOT_SUPPORTED(r))
 957                         r = cg_set_xattr(
 958                                         crt->cgroup_path,
 959                                         "trusted.survive_final_kill_signal",
 960                                         "1",
 961                                         1,
 962                                         /* flags= */ 0);
 963                 if (r < 0)
 964                         log_unit_debug_errno(u,
 965                                              r,
 966                                              "Failed to set 'survive_final_kill_signal' xattr on control "
 967                                              "group %s, ignoring: %m",
 968                                              empty_to_root(crt->cgroup_path));
 969         } else {
 970                 unit_remove_xattr_graceful(u, "user.survive_final_kill_signal");
 971                 unit_remove_xattr_graceful(u, "trusted.survive_final_kill_signal");
 972         }
 973 }
 974
 975 static void cgroup_xattr_apply(Unit *u) {
 976         assert(u);
 977
 978         /* The 'user.*' xattrs can be set from a user manager. */
 979         cgroup_oomd_xattr_apply(u);
 980         cgroup_log_xattr_apply(u);
 981         cgroup_coredump_xattr_apply(u);
 982
 983         if (!MANAGER_IS_SYSTEM(u->manager))
 984                 return;
 985
 986         cgroup_invocation_id_xattr_apply(u);
 987         cgroup_delegate_xattr_apply(u);
 988         cgroup_survive_xattr_apply(u);
 989 }
 990
 991 static int lookup_block_device(const char *p, dev_t *ret) {
 992         dev_t rdev, dev = 0;
 993         mode_t mode;
 994         int r;
 995
 996         assert(p);
 997         assert(ret);
 998
 999         r = device_path_parse_major_minor(p, &mode, &rdev);
1000         if (r == -ENODEV) { /* not a parsable device node, need to go to disk */
1001                 struct stat st;
1002
1003                 if (stat(p, &st) < 0)
1004                         return log_warning_errno(errno, "Couldn't stat device '%s': %m", p);
1005
1006                 mode = st.st_mode;
1007                 rdev = st.st_rdev;
1008                 dev = st.st_dev;
1009         } else if (r < 0)
1010                 return log_warning_errno(r, "Failed to parse major/minor from path '%s': %m", p);
1011
1012         if (S_ISCHR(mode))
1013                 return log_warning_errno(SYNTHETIC_ERRNO(ENOTBLK),
1014                                          "Device node '%s' is a character device, but block device needed.", p);
1015         if (S_ISBLK(mode))
1016                 *ret = rdev;
1017         else if (major(dev) != 0)
1018                 *ret = dev; /* If this is not a device node then use the block device this file is stored on */
1019         else {
1020                 /* If this is btrfs, getting the backing block device is a bit harder */
1021                 r = btrfs_get_block_device(p, ret);
1022                 if (r == -ENOTTY)
1023                         return log_warning_errno(SYNTHETIC_ERRNO(ENODEV),
1024                                                  "'%s' is not a block device node, and file system block device cannot be determined or is not local.", p);
1025                 if (r < 0)
1026                         return log_warning_errno(r, "Failed to determine block device backing btrfs file system '%s': %m", p);
1027         }
1028
1029         /* If this is a LUKS/DM device, recursively try to get the originating block device */
1030         while (block_get_originating(*ret, ret) >= 0)
1031                 ;
1032
1033         /* If this is a partition, try to get the originating block device */
1034         (void) block_get_whole_disk(*ret, ret);
1035         return 0;
1036 }
1037
1038 static bool cgroup_context_has_cpu_weight(CGroupContext *c) {
1039         return c->cpu_weight != CGROUP_WEIGHT_INVALID ||
1040                 c->startup_cpu_weight != CGROUP_WEIGHT_INVALID;
1041 }
1042
1043 static bool cgroup_context_has_allowed_cpus(CGroupContext *c) {
1044         return c->cpuset_cpus.set || c->startup_cpuset_cpus.set;
1045 }
1046
1047 static bool cgroup_context_has_allowed_mems(CGroupContext *c) {
1048         return c->cpuset_mems.set || c->startup_cpuset_mems.set;
1049 }
1050
1051 uint64_t cgroup_context_cpu_weight(CGroupContext *c, ManagerState state) {
1052         assert(c);
1053
1054         if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING) &&
1055             c->startup_cpu_weight != CGROUP_WEIGHT_INVALID)
1056                 return c->startup_cpu_weight;
1057         else if (c->cpu_weight != CGROUP_WEIGHT_INVALID)
1058                 return c->cpu_weight;
1059         else
1060                 return CGROUP_WEIGHT_DEFAULT;
1061 }
1062
1063 static CPUSet *cgroup_context_allowed_cpus(CGroupContext *c, ManagerState state) {
1064         if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING) &&
1065             c->startup_cpuset_cpus.set)
1066                 return &c->startup_cpuset_cpus;
1067         else
1068                 return &c->cpuset_cpus;
1069 }
1070
1071 static CPUSet *cgroup_context_allowed_mems(CGroupContext *c, ManagerState state) {
1072         if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING) &&
1073             c->startup_cpuset_mems.set)
1074                 return &c->startup_cpuset_mems;
1075         else
1076                 return &c->cpuset_mems;
1077 }
1078
1079 usec_t cgroup_cpu_adjust_period(usec_t period, usec_t quota, usec_t resolution, usec_t max_period) {
1080         /* kernel uses a minimum resolution of 1ms, so both period and (quota * period)
1081          * need to be higher than that boundary. quota is specified in USecPerSec.
1082          * Additionally, period must be at most max_period. */
1083         assert(quota > 0);
1084
1085         return MIN(MAX3(period, resolution, resolution * USEC_PER_SEC / quota), max_period);
1086 }
1087
1088 static usec_t cgroup_cpu_adjust_period_and_log(Unit *u, usec_t period, usec_t quota) {
1089         usec_t new_period;
1090
1091         assert(u);
1092
1093         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
1094         if (!crt)
1095                 return USEC_INFINITY;
1096
1097         if (quota == USEC_INFINITY)
1098                 /* Always use default period for infinity quota. */
1099                 return CGROUP_CPU_QUOTA_DEFAULT_PERIOD_USEC;
1100
1101         if (period == USEC_INFINITY)
1102                 /* Default period was requested. */
1103                 period = CGROUP_CPU_QUOTA_DEFAULT_PERIOD_USEC;
1104
1105         /* Clamp to interval [1ms, 1s] */
1106         new_period = cgroup_cpu_adjust_period(period, quota, USEC_PER_MSEC, USEC_PER_SEC);
1107
1108         if (new_period != period) {
1109                 log_unit_full(u, crt->warned_clamping_cpu_quota_period ? LOG_DEBUG : LOG_WARNING,
1110                               "Clamping CPU interval for cpu.max: period is now %s",
1111                               FORMAT_TIMESPAN(new_period, 1));
1112                 crt->warned_clamping_cpu_quota_period = true;
1113         }
1114
1115         return new_period;
1116 }
1117
1118 static void cgroup_apply_cpu_weight(Unit *u, uint64_t weight) {
1119         char buf[DECIMAL_STR_MAX(uint64_t) + 2];
1120
1121         if (weight == CGROUP_WEIGHT_IDLE)
1122                 return;
1123         xsprintf(buf, "%" PRIu64 "\n", weight);
1124         (void) set_attribute_and_warn(u, "cpu", "cpu.weight", buf);
1125 }
1126
1127 static void cgroup_apply_cpu_idle(Unit *u, uint64_t weight) {
1128         int r;
1129         bool is_idle;
1130         const char *idle_val;
1131
1132         assert(u);
1133
1134         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
1135         if (!crt || !crt->cgroup_path)
1136                 return;
1137
1138         is_idle = weight == CGROUP_WEIGHT_IDLE;
1139         idle_val = one_zero(is_idle);
1140         r = cg_set_attribute("cpu", crt->cgroup_path, "cpu.idle", idle_val);
1141         if (r < 0 && (r != -ENOENT || is_idle))
1142                 log_unit_full_errno(u, LOG_LEVEL_CGROUP_WRITE(r), r, "Failed to set '%s' attribute on '%s' to '%s': %m",
1143                                     "cpu.idle", empty_to_root(crt->cgroup_path), idle_val);
1144 }
1145
1146 static void cgroup_apply_cpu_quota(Unit *u, usec_t quota, usec_t period) {
1147         char buf[(DECIMAL_STR_MAX(usec_t) + 1) * 2 + 1];
1148
1149         assert(u);
1150
1151         period = cgroup_cpu_adjust_period_and_log(u, period, quota);
1152         if (quota != USEC_INFINITY)
1153                 xsprintf(buf, USEC_FMT " " USEC_FMT "\n",
1154                          MAX(quota * period / USEC_PER_SEC, USEC_PER_MSEC), period);
1155         else
1156                 xsprintf(buf, "max " USEC_FMT "\n", period);
1157         (void) set_attribute_and_warn(u, "cpu", "cpu.max", buf);
1158 }
1159
1160 static void cgroup_apply_cpuset(Unit *u, const CPUSet *cpus, const char *name) {
1161         _cleanup_free_ char *buf = NULL;
1162
1163         buf = cpu_set_to_range_string(cpus);
1164         if (!buf) {
1165                 log_oom();
1166                 return;
1167         }
1168
1169         (void) set_attribute_and_warn(u, "cpuset", name, buf);
1170 }
1171
1172 static bool cgroup_context_has_io_config(CGroupContext *c) {
1173         return c->io_accounting ||
1174                 c->io_weight != CGROUP_WEIGHT_INVALID ||
1175                 c->startup_io_weight != CGROUP_WEIGHT_INVALID ||
1176                 c->io_device_weights ||
1177                 c->io_device_latencies ||
1178                 c->io_device_limits;
1179 }
1180
1181 static uint64_t cgroup_context_io_weight(CGroupContext *c, ManagerState state) {
1182         if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING) &&
1183             c->startup_io_weight != CGROUP_WEIGHT_INVALID)
1184                 return c->startup_io_weight;
1185         if (c->io_weight != CGROUP_WEIGHT_INVALID)
1186                 return c->io_weight;
1187         return CGROUP_WEIGHT_DEFAULT;
1188 }
1189
1190 static int set_bfq_weight(Unit *u, const char *controller, dev_t dev, uint64_t io_weight) {
1191         static bool warned = false;
1192         char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+STRLEN("\n")];
1193         const char *p;
1194         uint64_t bfq_weight;
1195         int r;
1196
1197         assert(u);
1198
1199         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
1200         if (!crt || !crt->cgroup_path)
1201                 return -EOWNERDEAD;
1202
1203         /* FIXME: drop this function when distro kernels properly support BFQ through "io.weight"
1204          * See also: https://github.com/systemd/systemd/pull/13335 and
1205          * https://github.com/torvalds/linux/commit/65752aef0a407e1ef17ec78a7fc31ba4e0b360f9. */
1206         p = strjoina(controller, ".bfq.weight");
1207         /* Adjust to kernel range is 1..1000, the default is 100. */
1208         bfq_weight = BFQ_WEIGHT(io_weight);
1209
1210         if (major(dev) > 0)
1211                 xsprintf(buf, DEVNUM_FORMAT_STR " %" PRIu64 "\n", DEVNUM_FORMAT_VAL(dev), bfq_weight);
1212         else
1213                 xsprintf(buf, "%" PRIu64 "\n", bfq_weight);
1214
1215         r = cg_set_attribute(controller, crt->cgroup_path, p, buf);
1216
1217         /* FIXME: drop this when kernels prior
1218          * 795fe54c2a82 ("bfq: Add per-device weight") v5.4
1219          * are not interesting anymore. Old kernels will fail with EINVAL, while new kernels won't return
1220          * EINVAL on properly formatted input by us. Treat EINVAL accordingly. */
1221         if (r == -EINVAL && major(dev) > 0) {
1222                if (!warned) {
1223                         log_unit_warning(u, "Kernel version does not accept per-device setting in %s.", p);
1224                         warned = true;
1225                }
1226                r = -EOPNOTSUPP; /* mask as unconfigured device */
1227         } else if (r >= 0 && io_weight != bfq_weight)
1228                 log_unit_debug(u, "%s=%" PRIu64 " scaled to %s=%" PRIu64,
1229                                major(dev) > 0 ? "IODeviceWeight" : "IOWeight",
1230                                io_weight, p, bfq_weight);
1231         return r;
1232 }
1233
1234 static void cgroup_apply_io_device_weight(Unit *u, const char *dev_path, uint64_t io_weight) {
1235         char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
1236         dev_t dev;
1237         int r, r1, r2;
1238
1239         assert(u);
1240
1241         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
1242         if (!crt || !crt->cgroup_path)
1243                 return;
1244
1245         if (lookup_block_device(dev_path, &dev) < 0)
1246                 return;
1247
1248         r1 = set_bfq_weight(u, "io", dev, io_weight);
1249
1250         xsprintf(buf, DEVNUM_FORMAT_STR " %" PRIu64 "\n", DEVNUM_FORMAT_VAL(dev), io_weight);
1251         r2 = cg_set_attribute("io", crt->cgroup_path, "io.weight", buf);
1252
1253         /* Look at the configured device, when both fail, prefer io.weight errno. */
1254         r = r2 == -EOPNOTSUPP ? r1 : r2;
1255
1256         if (r < 0)
1257                 log_unit_full_errno(u, LOG_LEVEL_CGROUP_WRITE(r),
1258                                     r, "Failed to set 'io[.bfq].weight' attribute on '%s' to '%.*s': %m",
1259                                     empty_to_root(crt->cgroup_path), (int) strcspn(buf, NEWLINE), buf);
1260 }
1261
1262 static void cgroup_apply_io_device_latency(Unit *u, const char *dev_path, usec_t target) {
1263         char buf[DECIMAL_STR_MAX(dev_t)*2+2+7+DECIMAL_STR_MAX(uint64_t)+1];
1264         dev_t dev;
1265         int r;
1266
1267         r = lookup_block_device(dev_path, &dev);
1268         if (r < 0)
1269                 return;
1270
1271         if (target != USEC_INFINITY)
1272                 xsprintf(buf, DEVNUM_FORMAT_STR " target=%" PRIu64 "\n", DEVNUM_FORMAT_VAL(dev), target);
1273         else
1274                 xsprintf(buf, DEVNUM_FORMAT_STR " target=max\n", DEVNUM_FORMAT_VAL(dev));
1275
1276         (void) set_attribute_and_warn(u, "io", "io.latency", buf);
1277 }
1278
1279 static void cgroup_apply_io_device_limit(Unit *u, const char *dev_path, uint64_t *limits) {
1280         char limit_bufs[_CGROUP_IO_LIMIT_TYPE_MAX][DECIMAL_STR_MAX(uint64_t)],
1281              buf[DECIMAL_STR_MAX(dev_t)*2+2+(6+DECIMAL_STR_MAX(uint64_t)+1)*4];
1282         dev_t dev;
1283
1284         if (lookup_block_device(dev_path, &dev) < 0)
1285                 return;
1286
1287         for (CGroupIOLimitType type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
1288                 if (limits[type] != cgroup_io_limit_defaults[type])
1289                         xsprintf(limit_bufs[type], "%" PRIu64, limits[type]);
1290                 else
1291                         xsprintf(limit_bufs[type], "%s", limits[type] == CGROUP_LIMIT_MAX ? "max" : "0");
1292
1293         xsprintf(buf, DEVNUM_FORMAT_STR " rbps=%s wbps=%s riops=%s wiops=%s\n", DEVNUM_FORMAT_VAL(dev),
1294                  limit_bufs[CGROUP_IO_RBPS_MAX], limit_bufs[CGROUP_IO_WBPS_MAX],
1295                  limit_bufs[CGROUP_IO_RIOPS_MAX], limit_bufs[CGROUP_IO_WIOPS_MAX]);
1296         (void) set_attribute_and_warn(u, "io", "io.max", buf);
1297 }
1298
1299 static bool unit_has_memory_config(Unit *u) {
1300         CGroupContext *c;
1301
1302         assert(u);
1303
1304         assert_se(c = unit_get_cgroup_context(u));
1305
1306         return unit_get_ancestor_memory_min(u) > 0 ||
1307                unit_get_ancestor_memory_low(u) > 0 || unit_get_ancestor_startup_memory_low(u) > 0 ||
1308                c->memory_high != CGROUP_LIMIT_MAX || c->startup_memory_high_set ||
1309                c->memory_max != CGROUP_LIMIT_MAX || c->startup_memory_max_set ||
1310                c->memory_swap_max != CGROUP_LIMIT_MAX || c->startup_memory_swap_max_set ||
1311                c->memory_zswap_max != CGROUP_LIMIT_MAX || c->startup_memory_zswap_max_set;
1312 }
1313
1314 static void cgroup_apply_memory_limit(Unit *u, const char *file, uint64_t v) {
1315         char buf[DECIMAL_STR_MAX(uint64_t) + 1] = "max\n";
1316
1317         if (v != CGROUP_LIMIT_MAX)
1318                 xsprintf(buf, "%" PRIu64 "\n", v);
1319
1320         (void) set_attribute_and_warn(u, "memory", file, buf);
1321 }
1322
1323 static void cgroup_apply_firewall(Unit *u) {
1324         assert(u);
1325
1326         /* Best-effort: let's apply IP firewalling and/or accounting if that's enabled */
1327
1328         if (bpf_firewall_compile(u) < 0)
1329                 return;
1330
1331         (void) bpf_firewall_load_custom(u);
1332         (void) bpf_firewall_install(u);
1333 }
1334
1335 void unit_modify_nft_set(Unit *u, bool add) {
1336         int r;
1337
1338         assert(u);
1339
1340         if (!MANAGER_IS_SYSTEM(u->manager))
1341                 return;
1342
1343         if (!UNIT_HAS_CGROUP_CONTEXT(u))
1344                 return;
1345
1346         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
1347         if (!crt || crt->cgroup_id == 0)
1348                 return;
1349
1350         if (!u->manager->fw_ctx) {
1351                 r = fw_ctx_new_full(&u->manager->fw_ctx, /* init_tables= */ false);
1352                 if (r < 0)
1353                         return;
1354
1355                 assert(u->manager->fw_ctx);
1356         }
1357
1358         CGroupContext *c = ASSERT_PTR(unit_get_cgroup_context(u));
1359
1360         FOREACH_ARRAY(nft_set, c->nft_set_context.sets, c->nft_set_context.n_sets) {
1361                 if (nft_set->source != NFT_SET_SOURCE_CGROUP)
1362                         continue;
1363
1364                 uint64_t element = crt->cgroup_id;
1365
1366                 r = nft_set_element_modify_any(u->manager->fw_ctx, add, nft_set->nfproto, nft_set->table, nft_set->set, &element, sizeof(element));
1367                 if (r < 0)
1368                         log_warning_errno(r, "Failed to %s NFT set: family %s, table %s, set %s, cgroup %" PRIu64 ", ignoring: %m",
1369                                           add? "add" : "delete", nfproto_to_string(nft_set->nfproto), nft_set->table, nft_set->set, crt->cgroup_id);
1370                 else
1371                         log_debug("%s NFT set: family %s, table %s, set %s, cgroup %" PRIu64,
1372                                   add? "Added" : "Deleted", nfproto_to_string(nft_set->nfproto), nft_set->table, nft_set->set, crt->cgroup_id);
1373         }
1374 }
1375
1376 static void cgroup_apply_socket_bind(Unit *u) {
1377         assert(u);
1378
1379         (void) bpf_socket_bind_install(u);
1380 }
1381
1382 static void cgroup_apply_restrict_network_interfaces(Unit *u) {
1383         assert(u);
1384
1385         (void) bpf_restrict_ifaces_install(u);
1386 }
1387
1388 static int cgroup_apply_devices(Unit *u) {
1389         _cleanup_(bpf_program_freep) BPFProgram *prog = NULL;
1390         CGroupContext *c;
1391         CGroupDevicePolicy policy;
1392         int r;
1393
1394         assert_se(c = unit_get_cgroup_context(u));
1395
1396         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
1397         if (!crt || !crt->cgroup_path)
1398                 return -EOWNERDEAD;
1399
1400         policy = c->device_policy;
1401
1402         r = bpf_devices_cgroup_init(&prog, policy, c->device_allow);
1403         if (r < 0)
1404                 return log_unit_warning_errno(u, r, "Failed to initialize device control bpf program: %m");
1405
1406         bool allow_list_static = policy == CGROUP_DEVICE_POLICY_CLOSED ||
1407                 (policy == CGROUP_DEVICE_POLICY_AUTO && c->device_allow);
1408
1409         bool any = false;
1410         if (allow_list_static) {
1411                 r = bpf_devices_allow_list_static(prog, crt->cgroup_path);
1412                 if (r > 0)
1413                         any = true;
1414         }
1415
1416         LIST_FOREACH(device_allow, a, c->device_allow) {
1417                 const char *val;
1418
1419                 if (a->permissions == 0)
1420                         continue;
1421
1422                 if (path_startswith(a->path, "/dev/"))
1423                         r = bpf_devices_allow_list_device(prog, crt->cgroup_path, a->path, a->permissions);
1424                 else if ((val = startswith(a->path, "block-")))
1425                         r = bpf_devices_allow_list_major(prog, crt->cgroup_path, val, 'b', a->permissions);
1426                 else if ((val = startswith(a->path, "char-")))
1427                         r = bpf_devices_allow_list_major(prog, crt->cgroup_path, val, 'c', a->permissions);
1428                 else {
1429                         log_unit_debug(u, "Ignoring device '%s' while writing cgroup attribute.", a->path);
1430                         continue;
1431                 }
1432
1433                 if (r > 0)
1434                         any = true;
1435         }
1436
1437         if (prog && !any) {
1438                 log_unit_warning(u, "No devices matched by device filter.");
1439
1440                 /* The kernel verifier would reject a program we would build with the normal intro and outro
1441                    but no allow-listing rules (outro would contain an unreachable instruction for successful
1442                    return). */
1443                 policy = CGROUP_DEVICE_POLICY_STRICT;
1444         }
1445
1446         r = bpf_devices_apply_policy(&prog, policy, any, crt->cgroup_path, &crt->bpf_device_control_installed);
1447         if (r < 0) {
1448                 static bool warned = false;
1449
1450                 log_full_errno(warned ? LOG_DEBUG : LOG_WARNING, r,
1451                                "Unit %s configures device ACL, but the local system doesn't seem to support the BPF-based device controller.\n"
1452                                "Proceeding WITHOUT applying ACL (all devices will be accessible)!\n"
1453                                "(This warning is only shown for the first loaded unit using device ACL.)", u->id);
1454
1455                 warned = true;
1456         }
1457         return r;
1458 }
1459
1460 static void set_io_weight(Unit *u, uint64_t weight) {
1461         char buf[STRLEN("default \n")+DECIMAL_STR_MAX(uint64_t)];
1462
1463         assert(u);
1464
1465         (void) set_bfq_weight(u, "io", makedev(0, 0), weight);
1466
1467         xsprintf(buf, "default %" PRIu64 "\n", weight);
1468         (void) set_attribute_and_warn(u, "io", "io.weight", buf);
1469 }
1470
1471 static void cgroup_apply_bpf_foreign_program(Unit *u) {
1472         assert(u);
1473
1474         (void) bpf_foreign_install(u);
1475 }
1476
1477 static void cgroup_context_apply(
1478                 Unit *u,
1479                 CGroupMask apply_mask,
1480                 ManagerState state) {
1481
1482         bool is_host_root, is_local_root;
1483         CGroupContext *c;
1484         int r;
1485
1486         assert(u);
1487
1488         /* Nothing to do? Exit early! */
1489         if (apply_mask == 0)
1490                 return;
1491
1492         /* Some cgroup attributes are not supported on the host root cgroup, hence silently ignore them here. And other
1493          * attributes should only be managed for cgroups further down the tree. */
1494         is_local_root = unit_has_name(u, SPECIAL_ROOT_SLICE);
1495         is_host_root = unit_has_host_root_cgroup(u);
1496
1497         assert_se(c = unit_get_cgroup_context(u));
1498
1499         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
1500         if (!crt || !crt->cgroup_path)
1501                 return;
1502
1503         /* We generally ignore errors caused by read-only mounted cgroup trees (assuming we are running in a container
1504          * then), and missing cgroups, i.e. EROFS and ENOENT. */
1505
1506         /* These attributes don't exist on the host cgroup root. */
1507         if ((apply_mask & CGROUP_MASK_CPU) && !is_local_root) {
1508                 uint64_t weight;
1509
1510                 if (cgroup_context_has_cpu_weight(c))
1511                         weight = cgroup_context_cpu_weight(c, state);
1512                 else
1513                         weight = CGROUP_WEIGHT_DEFAULT;
1514
1515                 cgroup_apply_cpu_idle(u, weight);
1516                 cgroup_apply_cpu_weight(u, weight);
1517                 cgroup_apply_cpu_quota(u, c->cpu_quota_per_sec_usec, c->cpu_quota_period_usec);
1518         }
1519
1520         if ((apply_mask & CGROUP_MASK_CPUSET) && !is_local_root) {
1521                 cgroup_apply_cpuset(u, cgroup_context_allowed_cpus(c, state), "cpuset.cpus");
1522                 cgroup_apply_cpuset(u, cgroup_context_allowed_mems(c, state), "cpuset.mems");
1523         }
1524
1525         /* The 'io' controller attributes are not exported on the host's root cgroup (being a pure cgroup v2
1526          * controller), and in case of containers we want to leave control of these attributes to the container manager
1527          * (and we couldn't access that stuff anyway, even if we tried if proper delegation is used). */
1528         if ((apply_mask & CGROUP_MASK_IO) && !is_local_root) {
1529                 bool has_io;
1530                 uint64_t weight;
1531
1532                 has_io = cgroup_context_has_io_config(c);
1533
1534                 if (has_io)
1535                         weight = cgroup_context_io_weight(c, state);
1536                 else
1537                         weight = CGROUP_WEIGHT_DEFAULT;
1538
1539                 set_io_weight(u, weight);
1540
1541                 if (has_io) {
1542                         LIST_FOREACH(device_weights, w, c->io_device_weights)
1543                                 cgroup_apply_io_device_weight(u, w->path, w->weight);
1544
1545                         LIST_FOREACH(device_limits, limit, c->io_device_limits)
1546                                 cgroup_apply_io_device_limit(u, limit->path, limit->limits);
1547
1548                         LIST_FOREACH(device_latencies, latency, c->io_device_latencies)
1549                                 cgroup_apply_io_device_latency(u, latency->path, latency->target_usec);
1550                 }
1551         }
1552
1553         /* 'memory' attributes do not exist on the root cgroup. */
1554         if ((apply_mask & CGROUP_MASK_MEMORY) && !is_local_root) {
1555                 uint64_t max = CGROUP_LIMIT_MAX, swap_max = CGROUP_LIMIT_MAX, zswap_max = CGROUP_LIMIT_MAX, high = CGROUP_LIMIT_MAX;
1556
1557                 if (unit_has_memory_config(u)) {
1558                         bool startup = IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING);
1559
1560                         high = startup && c->startup_memory_high_set ? c->startup_memory_high : c->memory_high;
1561                         max = startup && c->startup_memory_max_set ? c->startup_memory_max : c->memory_max;
1562                         swap_max = startup && c->startup_memory_swap_max_set ? c->startup_memory_swap_max : c->memory_swap_max;
1563                         zswap_max = startup && c->startup_memory_zswap_max_set ? c->startup_memory_zswap_max : c->memory_zswap_max;
1564                 }
1565
1566                 cgroup_apply_memory_limit(u, "memory.min", unit_get_ancestor_memory_min(u));
1567                 cgroup_apply_memory_limit(u, "memory.low", unit_get_ancestor_memory_low(u));
1568                 cgroup_apply_memory_limit(u, "memory.high", high);
1569                 cgroup_apply_memory_limit(u, "memory.max", max);
1570                 cgroup_apply_memory_limit(u, "memory.swap.max", swap_max);
1571                 cgroup_apply_memory_limit(u, "memory.zswap.max", zswap_max);
1572
1573                 (void) set_attribute_and_warn(u, "memory", "memory.oom.group", one_zero(c->memory_oom_group));
1574                 (void) set_attribute_and_warn(u, "memory", "memory.zswap.writeback", one_zero(c->memory_zswap_writeback));
1575         }
1576
1577         if (apply_mask & CGROUP_MASK_PIDS) {
1578
1579                 if (is_host_root) {
1580                         /* So, the "pids" controller does not expose anything on the root cgroup, in order not to
1581                          * replicate knobs exposed elsewhere needlessly. We abstract this away here however, and when
1582                          * the knobs of the root cgroup are modified propagate this to the relevant sysctls. There's a
1583                          * non-obvious asymmetry however: unlike the cgroup properties we don't really want to take
1584                          * exclusive ownership of the sysctls, but we still want to honour things if the user sets
1585                          * limits. Hence we employ sort of a one-way strategy: when the user sets a bounded limit
1586                          * through us it counts. When the user afterwards unsets it again (i.e. sets it to unbounded)
1587                          * it also counts. But if the user never set a limit through us (i.e. we are the default of
1588                          * "unbounded") we leave things unmodified. For this we manage a global boolean that we turn on
1589                          * the first time we set a limit. Note that this boolean is flushed out on manager reload,
1590                          * which is desirable so that there's an official way to release control of the sysctl from
1591                          * systemd: set the limit to unbounded and reload. */
1592
1593                         if (cgroup_tasks_max_isset(&c->tasks_max)) {
1594                                 u->manager->sysctl_pid_max_changed = true;
1595                                 r = procfs_tasks_set_limit(cgroup_tasks_max_resolve(&c->tasks_max));
1596                         } else if (u->manager->sysctl_pid_max_changed)
1597                                 r = procfs_tasks_set_limit(TASKS_MAX);
1598                         else
1599                                 r = 0;
1600                         if (r < 0)
1601                                 log_unit_full_errno(u, LOG_LEVEL_CGROUP_WRITE(r), r,
1602                                                     "Failed to write to tasks limit sysctls: %m");
1603                 }
1604
1605                 /* The attribute itself is not available on the host root cgroup, and in the container case we want to
1606                  * leave it for the container manager. */
1607                 if (!is_local_root) {
1608                         if (cgroup_tasks_max_isset(&c->tasks_max)) {
1609                                 char buf[DECIMAL_STR_MAX(uint64_t) + 1];
1610
1611                                 xsprintf(buf, "%" PRIu64 "\n", cgroup_tasks_max_resolve(&c->tasks_max));
1612                                 (void) set_attribute_and_warn(u, "pids", "pids.max", buf);
1613                         } else
1614                                 (void) set_attribute_and_warn(u, "pids", "pids.max", "max\n");
1615                 }
1616         }
1617
1618         /* On cgroup v2 we can apply BPF everywhere. */
1619         if (apply_mask & CGROUP_MASK_BPF_DEVICES)
1620                 (void) cgroup_apply_devices(u);
1621
1622         if (apply_mask & CGROUP_MASK_BPF_FIREWALL)
1623                 cgroup_apply_firewall(u);
1624
1625         if (apply_mask & CGROUP_MASK_BPF_FOREIGN)
1626                 cgroup_apply_bpf_foreign_program(u);
1627
1628         if (apply_mask & CGROUP_MASK_BPF_SOCKET_BIND)
1629                 cgroup_apply_socket_bind(u);
1630
1631         if (apply_mask & CGROUP_MASK_BPF_RESTRICT_NETWORK_INTERFACES)
1632                 cgroup_apply_restrict_network_interfaces(u);
1633
1634         unit_modify_nft_set(u, /* add = */ true);
1635 }
1636
1637 static bool unit_get_needs_bpf_firewall(Unit *u) {
1638         CGroupContext *c;
1639         assert(u);
1640
1641         c = unit_get_cgroup_context(u);
1642         if (!c)
1643                 return false;
1644
1645         if (c->ip_accounting ||
1646             !set_isempty(c->ip_address_allow) ||
1647             !set_isempty(c->ip_address_deny) ||
1648             c->ip_filters_ingress ||
1649             c->ip_filters_egress)
1650                 return true;
1651
1652         /* If any parent slice has an IP access list defined, it applies too */
1653         for (Unit *p = UNIT_GET_SLICE(u); p; p = UNIT_GET_SLICE(p)) {
1654                 c = unit_get_cgroup_context(p);
1655                 if (!c)
1656                         return false;
1657
1658                 if (!set_isempty(c->ip_address_allow) ||
1659                     !set_isempty(c->ip_address_deny))
1660                         return true;
1661         }
1662
1663         return false;
1664 }
1665
1666 static bool unit_get_needs_bpf_foreign_program(Unit *u) {
1667         CGroupContext *c;
1668         assert(u);
1669
1670         c = unit_get_cgroup_context(u);
1671         if (!c)
1672                 return false;
1673
1674         return !!c->bpf_foreign_programs;
1675 }
1676
1677 static bool unit_get_needs_socket_bind(Unit *u) {
1678         CGroupContext *c;
1679         assert(u);
1680
1681         c = unit_get_cgroup_context(u);
1682         if (!c)
1683                 return false;
1684
1685         return c->socket_bind_allow || c->socket_bind_deny;
1686 }
1687
1688 static bool unit_get_needs_restrict_network_interfaces(Unit *u) {
1689         CGroupContext *c;
1690         assert(u);
1691
1692         c = unit_get_cgroup_context(u);
1693         if (!c)
1694                 return false;
1695
1696         return !set_isempty(c->restrict_network_interfaces);
1697 }
1698
1699 static CGroupMask unit_get_cgroup_mask(Unit *u) {
1700         CGroupMask mask = 0;
1701         CGroupContext *c;
1702
1703         assert(u);
1704
1705         assert_se(c = unit_get_cgroup_context(u));
1706
1707         /* Figure out which controllers we need, based on the cgroup context object */
1708
1709         if (cgroup_context_has_cpu_weight(c) ||
1710             c->cpu_quota_per_sec_usec != USEC_INFINITY)
1711                 mask |= CGROUP_MASK_CPU;
1712
1713         if (cgroup_context_has_allowed_cpus(c) || cgroup_context_has_allowed_mems(c))
1714                 mask |= CGROUP_MASK_CPUSET;
1715
1716         if (cgroup_context_has_io_config(c))
1717                 mask |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
1718
1719         if (c->memory_accounting ||
1720             unit_has_memory_config(u))
1721                 mask |= CGROUP_MASK_MEMORY;
1722
1723         if (c->device_allow ||
1724             c->device_policy != CGROUP_DEVICE_POLICY_AUTO)
1725                 mask |= CGROUP_MASK_DEVICES | CGROUP_MASK_BPF_DEVICES;
1726
1727         if (c->tasks_accounting ||
1728             cgroup_tasks_max_isset(&c->tasks_max))
1729                 mask |= CGROUP_MASK_PIDS;
1730
1731         return mask;
1732 }
1733
1734 static CGroupMask unit_get_bpf_mask(Unit *u) {
1735         CGroupMask mask = 0;
1736
1737         /* Figure out which controllers we need, based on the cgroup context, possibly taking into account children
1738          * too. */
1739
1740         if (unit_get_needs_bpf_firewall(u))
1741                 mask |= CGROUP_MASK_BPF_FIREWALL;
1742
1743         if (unit_get_needs_bpf_foreign_program(u))
1744                 mask |= CGROUP_MASK_BPF_FOREIGN;
1745
1746         if (unit_get_needs_socket_bind(u))
1747                 mask |= CGROUP_MASK_BPF_SOCKET_BIND;
1748
1749         if (unit_get_needs_restrict_network_interfaces(u))
1750                 mask |= CGROUP_MASK_BPF_RESTRICT_NETWORK_INTERFACES;
1751
1752         return mask;
1753 }
1754
1755 CGroupMask unit_get_own_mask(Unit *u) {
1756         CGroupContext *c;
1757
1758         /* Returns the mask of controllers the unit needs for itself. If a unit is not properly loaded, return an empty
1759          * mask, as we shouldn't reflect it in the cgroup hierarchy then. */
1760
1761         if (u->load_state != UNIT_LOADED)
1762                 return 0;
1763
1764         c = unit_get_cgroup_context(u);
1765         if (!c)
1766                 return 0;
1767
1768         return unit_get_cgroup_mask(u) | unit_get_bpf_mask(u) | unit_get_delegate_mask(u);
1769 }
1770
1771 CGroupMask unit_get_delegate_mask(Unit *u) {
1772         CGroupContext *c;
1773
1774         /* If delegation is turned on, then turn on selected controllers.
1775          *
1776          * Note that on the unified hierarchy it is safe to delegate controllers to unprivileged services. */
1777
1778         if (!unit_cgroup_delegate(u))
1779                 return 0;
1780
1781         assert_se(c = unit_get_cgroup_context(u));
1782         return c->delegate_controllers;
1783 }
1784
1785 static CGroupMask unit_get_subtree_mask(Unit *u) {
1786
1787         /* Returns the mask of this subtree, meaning of the group
1788          * itself and its children. */
1789
1790         return unit_get_own_mask(u) | unit_get_members_mask(u);
1791 }
1792
1793 CGroupMask unit_get_members_mask(Unit *u) {
1794         assert(u);
1795
1796         /* Returns the mask of controllers all of the unit's children require, merged */
1797
1798         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
1799         if (crt && crt->cgroup_members_mask_valid)
1800                 return crt->cgroup_members_mask; /* Use cached value if possible */
1801
1802         CGroupMask m = 0;
1803         if (u->type == UNIT_SLICE) {
1804                 Unit *member;
1805
1806                 UNIT_FOREACH_DEPENDENCY(member, u, UNIT_ATOM_SLICE_OF)
1807                         m |= unit_get_subtree_mask(member); /* note that this calls ourselves again, for the children */
1808         }
1809
1810         if (crt) {
1811                 crt->cgroup_members_mask = m;
1812                 crt->cgroup_members_mask_valid = true;
1813         }
1814
1815         return m;
1816 }
1817
1818 CGroupMask unit_get_siblings_mask(Unit *u) {
1819         Unit *slice;
1820         assert(u);
1821
1822         /* Returns the mask of controllers all of the unit's siblings
1823          * require, i.e. the members mask of the unit's parent slice
1824          * if there is one. */
1825
1826         slice = UNIT_GET_SLICE(u);
1827         if (slice)
1828                 return unit_get_members_mask(slice);
1829
1830         return unit_get_subtree_mask(u); /* we are the top-level slice */
1831 }
1832
1833 static CGroupMask unit_get_disable_mask(Unit *u) {
1834         CGroupContext *c;
1835
1836         c = unit_get_cgroup_context(u);
1837         if (!c)
1838                 return 0;
1839
1840         return c->disable_controllers;
1841 }
1842
1843 CGroupMask unit_get_ancestor_disable_mask(Unit *u) {
1844         CGroupMask mask;
1845         Unit *slice;
1846
1847         assert(u);
1848         mask = unit_get_disable_mask(u);
1849
1850         /* Returns the mask of controllers which are marked as forcibly
1851          * disabled in any ancestor unit or the unit in question. */
1852
1853         slice = UNIT_GET_SLICE(u);
1854         if (slice)
1855                 mask |= unit_get_ancestor_disable_mask(slice);
1856
1857         return mask;
1858 }
1859
1860 CGroupMask unit_get_target_mask(Unit *u) {
1861         CGroupMask own_mask, mask;
1862
1863         /* This returns the cgroup mask of all controllers to enable for a specific cgroup, i.e. everything
1864          * it needs itself, plus all that its children need, plus all that its siblings need. This is
1865          * primarily useful on the legacy cgroup hierarchy, where we need to duplicate each cgroup in each
1866          * hierarchy that shall be enabled for it. */
1867
1868         own_mask = unit_get_own_mask(u);
1869
1870         if (own_mask & CGROUP_MASK_BPF_FIREWALL & ~u->manager->cgroup_supported)
1871                 emit_bpf_firewall_warning(u);
1872
1873         mask = own_mask | unit_get_members_mask(u) | unit_get_siblings_mask(u);
1874
1875         mask &= u->manager->cgroup_supported;
1876         mask &= ~unit_get_ancestor_disable_mask(u);
1877
1878         return mask;
1879 }
1880
1881 CGroupMask unit_get_enable_mask(Unit *u) {
1882         CGroupMask mask;
1883
1884         /* This returns the cgroup mask of all controllers to enable
1885          * for the children of a specific cgroup. This is primarily
1886          * useful for the unified cgroup hierarchy, where each cgroup
1887          * controls which controllers are enabled for its children. */
1888
1889         mask = unit_get_members_mask(u);
1890         mask &= u->manager->cgroup_supported;
1891         mask &= ~unit_get_ancestor_disable_mask(u);
1892
1893         return mask;
1894 }
1895
1896 void unit_invalidate_cgroup_members_masks(Unit *u) {
1897         Unit *slice;
1898
1899         assert(u);
1900
1901         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
1902         if (!crt)
1903                 return;
1904
1905         /* Recurse invalidate the member masks cache all the way up the tree */
1906         crt->cgroup_members_mask_valid = false;
1907
1908         slice = UNIT_GET_SLICE(u);
1909         if (slice)
1910                 unit_invalidate_cgroup_members_masks(slice);
1911 }
1912
1913 static int unit_default_cgroup_path(const Unit *u, char **ret) {
1914         _cleanup_free_ char *p = NULL;
1915         int r;
1916
1917         assert(u);
1918         assert(ret);
1919
1920         if (unit_has_name(u, SPECIAL_ROOT_SLICE))
1921                 p = strdup(u->manager->cgroup_root);
1922         else {
1923                 _cleanup_free_ char *escaped = NULL, *slice_path = NULL;
1924                 Unit *slice;
1925
1926                 slice = UNIT_GET_SLICE(u);
1927                 if (slice && !unit_has_name(slice, SPECIAL_ROOT_SLICE)) {
1928                         r = cg_slice_to_path(slice->id, &slice_path);
1929                         if (r < 0)
1930                                 return r;
1931                 }
1932
1933                 r = cg_escape(u->id, &escaped);
1934                 if (r < 0)
1935                         return r;
1936
1937                 p = path_join(empty_to_root(u->manager->cgroup_root), slice_path, escaped);
1938         }
1939         if (!p)
1940                 return -ENOMEM;
1941
1942         *ret = TAKE_PTR(p);
1943         return 0;
1944 }
1945
1946 static int unit_set_cgroup_path(Unit *u, const char *path) {
1947         _cleanup_free_ char *p = NULL;
1948         CGroupRuntime *crt;
1949         int r;
1950
1951         assert(u);
1952
1953         crt = unit_get_cgroup_runtime(u);
1954         if (crt && streq_ptr(crt->cgroup_path, path))
1955                 return 0;
1956
1957         unit_release_cgroup(u, /* drop_cgroup_runtime = */ true);
1958
1959         crt = unit_setup_cgroup_runtime(u);
1960         if (!crt)
1961                 return -ENOMEM;
1962
1963         if (path) {
1964                 p = strdup(path);
1965                 if (!p)
1966                         return -ENOMEM;
1967
1968                 r = hashmap_put(u->manager->cgroup_unit, p, u);
1969                 if (r < 0)
1970                         return r;
1971         }
1972
1973         assert(!crt->cgroup_path);
1974         crt->cgroup_path = TAKE_PTR(p);
1975
1976         return 1;
1977 }
1978
1979 int unit_get_cgroup_path_with_fallback(const Unit *u, char **ret) {
1980         assert(u);
1981         assert(ret);
1982
1983         const CGroupRuntime *crt = unit_get_cgroup_runtime(u);
1984         if (!crt || !crt->cgroup_path)
1985                 return unit_default_cgroup_path(u, ret);
1986
1987         return strdup_to_full(ret, crt->cgroup_path); /* returns 1 -> cgroup_path is alive */
1988 }
1989
1990 static int unit_watch_cgroup(Unit *u) {
1991         _cleanup_free_ char *events = NULL;
1992         int r;
1993
1994         assert(u);
1995
1996         /* Watches the "cgroups.events" attribute of this unit's cgroup for "empty" events, but only if
1997          * cgroupv2 is available. */
1998
1999         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
2000         if (!crt || !crt->cgroup_path)
2001                 return 0;
2002
2003         if (crt->cgroup_control_inotify_wd >= 0)
2004                 return 0;
2005
2006         /* No point in watch the top-level slice, it's never going to run empty. */
2007         if (unit_has_name(u, SPECIAL_ROOT_SLICE))
2008                 return 0;
2009
2010         r = hashmap_ensure_allocated(&u->manager->cgroup_control_inotify_wd_unit, &trivial_hash_ops);
2011         if (r < 0)
2012                 return log_oom();
2013
2014         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, crt->cgroup_path, "cgroup.events", &events);
2015         if (r < 0)
2016                 return log_oom();
2017
2018         crt->cgroup_control_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY);
2019         if (crt->cgroup_control_inotify_wd < 0) {
2020
2021                 if (errno == ENOENT) /* If the directory is already gone we don't need to track it, so this
2022                                       * is not an error */
2023                         return 0;
2024
2025                 return log_unit_error_errno(u, errno, "Failed to add control inotify watch descriptor for control group %s: %m", empty_to_root(crt->cgroup_path));
2026         }
2027
2028         r = hashmap_put(u->manager->cgroup_control_inotify_wd_unit, INT_TO_PTR(crt->cgroup_control_inotify_wd), u);
2029         if (r < 0)
2030                 return log_unit_error_errno(u, r, "Failed to add control inotify watch descriptor for control group %s to hash map: %m", empty_to_root(crt->cgroup_path));
2031
2032         return 0;
2033 }
2034
2035 static int unit_watch_cgroup_memory(Unit *u) {
2036         _cleanup_free_ char *events = NULL;
2037         int r;
2038
2039         assert(u);
2040
2041         /* Watches the "memory.events" attribute of this unit's cgroup for "oom_kill" events, but only if
2042          * cgroupv2 is available. */
2043
2044         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
2045         if (!crt || !crt->cgroup_path)
2046                 return 0;
2047
2048         CGroupContext *c = unit_get_cgroup_context(u);
2049         if (!c)
2050                 return 0;
2051
2052         /* The "memory.events" attribute is only available if the memory controller is on. Let's hence tie
2053          * this to memory accounting, in a way watching for OOM kills is a form of memory accounting after
2054          * all. */
2055         if (!c->memory_accounting)
2056                 return 0;
2057
2058         /* Don't watch inner nodes, as the kernel doesn't report oom_kill events recursively currently, and
2059          * we also don't want to generate a log message for each parent cgroup of a process. */
2060         if (u->type == UNIT_SLICE)
2061                 return 0;
2062
2063         if (crt->cgroup_memory_inotify_wd >= 0)
2064                 return 0;
2065
2066         r = hashmap_ensure_allocated(&u->manager->cgroup_memory_inotify_wd_unit, &trivial_hash_ops);
2067         if (r < 0)
2068                 return log_oom();
2069
2070         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, crt->cgroup_path, "memory.events", &events);
2071         if (r < 0)
2072                 return log_oom();
2073
2074         crt->cgroup_memory_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY);
2075         if (crt->cgroup_memory_inotify_wd < 0) {
2076
2077                 if (errno == ENOENT) /* If the directory is already gone we don't need to track it, so this
2078                                       * is not an error */
2079                         return 0;
2080
2081                 return log_unit_error_errno(u, errno, "Failed to add memory inotify watch descriptor for control group %s: %m", empty_to_root(crt->cgroup_path));
2082         }
2083
2084         r = hashmap_put(u->manager->cgroup_memory_inotify_wd_unit, INT_TO_PTR(crt->cgroup_memory_inotify_wd), u);
2085         if (r < 0)
2086                 return log_unit_error_errno(u, r, "Failed to add memory inotify watch descriptor for control group %s to hash map: %m", empty_to_root(crt->cgroup_path));
2087
2088         return 0;
2089 }
2090
2091 static int unit_update_cgroup(
2092                 Unit *u,
2093                 CGroupMask target_mask,
2094                 CGroupMask enable_mask,
2095                 ManagerState state) {
2096
2097         _cleanup_free_ char *cgroup = NULL, *cgroup_full_path = NULL;
2098         bool set_path, created;
2099         int r;
2100
2101         assert(u);
2102
2103         if (!UNIT_HAS_CGROUP_CONTEXT(u))
2104                 return 0;
2105
2106         if (u->freezer_state != FREEZER_RUNNING)
2107                 return log_unit_error_errno(u, SYNTHETIC_ERRNO(EBUSY), "Cannot realize cgroup for frozen unit.");
2108
2109         r = unit_get_cgroup_path_with_fallback(u, &cgroup);
2110         if (r < 0)
2111                 return log_unit_error_errno(u, r, "Failed to get cgroup path: %m");
2112         set_path = r == 0;
2113
2114         /* First, create our own group */
2115         r = cg_create(cgroup);
2116         if (r < 0)
2117                 return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", empty_to_root(cgroup));
2118         created = r;
2119
2120         if (set_path) {
2121                 r = unit_set_cgroup_path(u, cgroup);
2122                 if (r == -EEXIST)
2123                         return log_unit_error_errno(u, r, "Picked control group '%s' as default, but it's in use already.", empty_to_root(cgroup));
2124                 if (r < 0)
2125                         return log_unit_error_errno(u, r, "Failed to set unit's control group path to '%s': %m", empty_to_root(cgroup));
2126                 assert(r > 0);
2127         }
2128
2129         CGroupRuntime *crt = ASSERT_PTR(unit_get_cgroup_runtime(u));
2130
2131         uint64_t cgroup_id = 0;
2132         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, crt->cgroup_path, NULL, &cgroup_full_path);
2133         if (r == 0) {
2134                 r = cg_path_get_cgroupid(cgroup_full_path, &cgroup_id);
2135                 if (r < 0)
2136                         log_unit_full_errno(u, ERRNO_IS_NOT_SUPPORTED(r) ? LOG_DEBUG : LOG_WARNING, r,
2137                                             "Failed to get cgroup ID of cgroup %s, ignoring: %m", cgroup_full_path);
2138         } else
2139                 log_unit_warning_errno(u, r, "Failed to get full cgroup path on cgroup %s, ignoring: %m", empty_to_root(crt->cgroup_path));
2140
2141         crt->cgroup_id = cgroup_id;
2142
2143         /* Start watching it */
2144         (void) unit_watch_cgroup(u);
2145         (void) unit_watch_cgroup_memory(u);
2146
2147         /* For v2 we preserve enabled controllers in delegated units, adjust others, */
2148         if (created || !unit_cgroup_delegate(u)) {
2149                 CGroupMask result_mask = 0;
2150
2151                 /* Enable all controllers we need */
2152                 r = cg_enable(u->manager->cgroup_supported, enable_mask, crt->cgroup_path, &result_mask);
2153                 if (r < 0)
2154                         log_unit_warning_errno(u, r, "Failed to enable/disable controllers on cgroup %s, ignoring: %m", empty_to_root(crt->cgroup_path));
2155
2156                 /* Remember what's actually enabled now */
2157                 crt->cgroup_enabled_mask = result_mask;
2158         }
2159
2160         /* Keep track that this is now realized */
2161         crt->cgroup_realized_mask = target_mask;
2162
2163         /* Set attributes */
2164         cgroup_context_apply(u, target_mask, state);
2165         cgroup_xattr_apply(u);
2166
2167         /* For most units we expect that memory monitoring is set up before the unit is started and we won't
2168          * touch it after. For PID 1 this is different though, because we couldn't possibly do that given
2169          * that PID 1 runs before init.scope is even set up. Hence, whenever init.scope is realized, let's
2170          * try to open the memory pressure interface anew. */
2171         if (unit_has_name(u, SPECIAL_INIT_SCOPE))
2172                 (void) manager_setup_memory_pressure_event_source(u->manager);
2173
2174         return 0;
2175 }
2176
2177 static int unit_attach_pid_to_cgroup_via_bus(Unit *u, pid_t pid, const char *suffix_path) {
2178         _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
2179         char *pp;
2180         int r;
2181
2182         assert(u);
2183
2184         if (MANAGER_IS_SYSTEM(u->manager))
2185                 return -EINVAL;
2186
2187         if (!u->manager->system_bus)
2188                 return -EIO;
2189
2190         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
2191         if (!crt || !crt->cgroup_path)
2192                 return -EOWNERDEAD;
2193
2194         /* Determine this unit's cgroup path relative to our cgroup root */
2195         pp = path_startswith(crt->cgroup_path, u->manager->cgroup_root);
2196         if (!pp)
2197                 return -EINVAL;
2198
2199         pp = strjoina("/", pp, suffix_path);
2200         path_simplify(pp);
2201
2202         r = bus_call_method(u->manager->system_bus,
2203                             bus_systemd_mgr,
2204                             "AttachProcessesToUnit",
2205                             &error, NULL,
2206                             "ssau",
2207                             NULL /* empty unit name means client's unit, i.e. us */, pp, 1, (uint32_t) pid);
2208         if (r < 0)
2209                 return log_unit_debug_errno(u, r, "Failed to attach unit process " PID_FMT " via the bus: %s", pid, bus_error_message(&error, r));
2210
2211         return 0;
2212 }
2213
2214 int unit_attach_pids_to_cgroup(Unit *u, Set *pids, const char *suffix_path) {
2215         _cleanup_free_ char *joined = NULL;
2216         const char *p;
2217         int ret = 0, r;
2218
2219         assert(u);
2220
2221         if (!UNIT_HAS_CGROUP_CONTEXT(u))
2222                 return -EINVAL;
2223
2224         if (set_isempty(pids))
2225                 return 0;
2226
2227         /* Load any custom firewall BPF programs here once to test if they are existing and actually loadable.
2228          * Fail here early since later errors in the call chain unit_realize_cgroup to cgroup_context_apply are ignored. */
2229         r = bpf_firewall_load_custom(u);
2230         if (r < 0)
2231                 return r;
2232
2233         r = unit_realize_cgroup(u);
2234         if (r < 0)
2235                 return r;
2236
2237         CGroupRuntime *crt = ASSERT_PTR(unit_get_cgroup_runtime(u));
2238
2239         if (isempty(suffix_path))
2240                 p = crt->cgroup_path;
2241         else {
2242                 joined = path_join(crt->cgroup_path, suffix_path);
2243                 if (!joined)
2244                         return -ENOMEM;
2245
2246                 p = joined;
2247         }
2248
2249         PidRef *pid;
2250         SET_FOREACH(pid, pids) {
2251
2252                 /* Unfortunately we cannot add pids by pidfd to a cgroup. Hence we have to use PIDs instead,
2253                  * which of course is racy. Let's shorten the race a bit though, and re-validate the PID
2254                  * before we use it */
2255                 r = pidref_verify(pid);
2256                 if (r < 0) {
2257                         log_unit_info_errno(u, r, "PID " PID_FMT " vanished before we could move it to target cgroup '%s', skipping: %m", pid->pid, empty_to_root(p));
2258                         continue;
2259                 }
2260
2261                 r = cg_attach(p, pid->pid);
2262                 if (r < 0) {
2263                         bool again = MANAGER_IS_USER(u->manager) && ERRNO_IS_NEG_PRIVILEGE(r);
2264
2265                         log_unit_full_errno(u, again ? LOG_DEBUG : LOG_INFO,  r,
2266                                             "Couldn't move process "PID_FMT" to%s requested cgroup '%s': %m",
2267                                             pid->pid, again ? " directly" : "", empty_to_root(p));
2268
2269                         if (again) {
2270                                 int z;
2271
2272                                 /* If we are in a user instance, and we can't move the process ourselves due
2273                                  * to permission problems, let's ask the system instance about it instead.
2274                                  * Since it's more privileged it might be able to move the process across the
2275                                  * leaves of a subtree whose top node is not owned by us. */
2276
2277                                 z = unit_attach_pid_to_cgroup_via_bus(u, pid->pid, suffix_path);
2278                                 if (z >= 0)
2279                                         goto success;
2280
2281                                 log_unit_info_errno(u, z, "Couldn't move process "PID_FMT" to requested cgroup '%s' (directly or via the system bus): %m", pid->pid, empty_to_root(p));
2282                         }
2283
2284                         RET_GATHER(ret, r);
2285                         continue;
2286                 }
2287
2288         success:
2289                 /* the cgroup is definitely not empty now. in case the unit was in the cgroup empty queue,
2290                  * drop it from there */
2291                 unit_remove_from_cgroup_empty_queue(u);
2292
2293                 if (ret >= 0)
2294                         ret++; /* Count successful additions */
2295         }
2296
2297         return ret;
2298 }
2299
2300 int unit_remove_subcgroup(Unit *u, const char *suffix_path) {
2301         int r;
2302
2303         assert(u);
2304
2305         if (!UNIT_HAS_CGROUP_CONTEXT(u))
2306                 return -EINVAL;
2307
2308         if (!unit_cgroup_delegate(u))
2309                 return -ENOMEDIUM;
2310
2311         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
2312         if (!crt || !crt->cgroup_path)
2313                 return -EOWNERDEAD;
2314
2315         _cleanup_free_ char *j = NULL;
2316         bool delete_root;
2317         const char *d;
2318         if (empty_or_root(suffix_path)) {
2319                 d = empty_to_root(crt->cgroup_path);
2320                 delete_root = false; /* Don't attempt to delete the main cgroup of this unit */
2321         } else {
2322                 j = path_join(crt->cgroup_path, suffix_path);
2323                 if (!j)
2324                         return -ENOMEM;
2325
2326                 d = j;
2327                 delete_root = true;
2328         }
2329
2330         log_unit_debug(u, "Removing subcgroup '%s'...", d);
2331
2332         r = cg_trim(d, delete_root);
2333         if (r < 0)
2334                 return log_unit_debug_errno(u, r, "Failed to fully %s cgroup '%s': %m", delete_root ? "remove" : "trim", d);
2335
2336         return 0;
2337 }
2338
2339 static bool unit_has_mask_realized(
2340                 Unit *u,
2341                 CGroupMask target_mask,
2342                 CGroupMask enable_mask) {
2343
2344         assert(u);
2345
2346         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
2347         if (!crt)
2348                 return false;
2349
2350         /* Returns true if this unit is fully realized. We check four things:
2351          *
2352          * 1. Whether the cgroup was created at all
2353          * 2. Whether the cgroup was created in all the hierarchies we need it to be created in (in case of cgroup v1)
2354          * 3. Whether the cgroup has all the right controllers enabled (in case of cgroup v2)
2355          * 4. Whether the invalidation mask is currently zero
2356          *
2357          * If you wonder why we mask the target realization and enable mask with CGROUP_MASK_V1/CGROUP_MASK_V2: note
2358          * that there are three sets of bitmasks: CGROUP_MASK_V1 (for real cgroup v1 controllers), CGROUP_MASK_V2 (for
2359          * real cgroup v2 controllers) and CGROUP_MASK_BPF (for BPF-based pseudo-controllers). Now, cgroup_realized_mask
2360          * is only matters for cgroup v1 controllers, and cgroup_enabled_mask only used for cgroup v2, and if they
2361          * differ in the others, we don't really care. (After all, the cgroup_enabled_mask tracks with controllers are
2362          * enabled through cgroup.subtree_control, and since the BPF pseudo-controllers don't show up there, they
2363          * simply don't matter. */
2364
2365         return crt->cgroup_path &&
2366                 ((crt->cgroup_realized_mask ^ target_mask) & CGROUP_MASK_V1) == 0 &&
2367                 ((crt->cgroup_enabled_mask ^ enable_mask) & CGROUP_MASK_V2) == 0 &&
2368                 crt->cgroup_invalidated_mask == 0;
2369 }
2370
2371 static bool unit_has_mask_disables_realized(
2372                 Unit *u,
2373                 CGroupMask target_mask,
2374                 CGroupMask enable_mask) {
2375
2376         assert(u);
2377
2378         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
2379         if (!crt)
2380                 return true;
2381
2382         /* Returns true if all controllers which should be disabled are indeed disabled.
2383          *
2384          * Unlike unit_has_mask_realized, we don't care what was enabled, only that anything we want to remove is
2385          * already removed. */
2386
2387         return !crt->cgroup_path ||
2388                 (FLAGS_SET(crt->cgroup_realized_mask, target_mask & CGROUP_MASK_V1) &&
2389                  FLAGS_SET(crt->cgroup_enabled_mask, enable_mask & CGROUP_MASK_V2));
2390 }
2391
2392 static bool unit_has_mask_enables_realized(
2393                 Unit *u,
2394                 CGroupMask target_mask,
2395                 CGroupMask enable_mask) {
2396
2397         assert(u);
2398
2399         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
2400         if (!crt)
2401                 return false;
2402
2403         /* Returns true if all controllers which should be enabled are indeed enabled.
2404          *
2405          * Unlike unit_has_mask_realized, we don't care about the controllers that are not present, only that anything
2406          * we want to add is already added. */
2407
2408         return crt->cgroup_path &&
2409                 ((crt->cgroup_realized_mask | target_mask) & CGROUP_MASK_V1) == (crt->cgroup_realized_mask & CGROUP_MASK_V1) &&
2410                 ((crt->cgroup_enabled_mask | enable_mask) & CGROUP_MASK_V2) == (crt->cgroup_enabled_mask & CGROUP_MASK_V2);
2411 }
2412
2413 void unit_add_to_cgroup_realize_queue(Unit *u) {
2414         assert(u);
2415
2416         if (u->in_cgroup_realize_queue)
2417                 return;
2418
2419         LIST_APPEND(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
2420         u->in_cgroup_realize_queue = true;
2421 }
2422
2423 static void unit_remove_from_cgroup_realize_queue(Unit *u) {
2424         assert(u);
2425
2426         if (!u->in_cgroup_realize_queue)
2427                 return;
2428
2429         LIST_REMOVE(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
2430         u->in_cgroup_realize_queue = false;
2431 }
2432
2433 /* Controllers can only be enabled breadth-first, from the root of the
2434  * hierarchy downwards to the unit in question. */
2435 static int unit_realize_cgroup_now_enable(Unit *u, ManagerState state) {
2436         CGroupMask target_mask, enable_mask, new_target_mask, new_enable_mask;
2437         Unit *slice;
2438         int r;
2439
2440         assert(u);
2441
2442         /* First go deal with this unit's parent, or we won't be able to enable
2443          * any new controllers at this layer. */
2444         slice = UNIT_GET_SLICE(u);
2445         if (slice) {
2446                 r = unit_realize_cgroup_now_enable(slice, state);
2447                 if (r < 0)
2448                         return r;
2449         }
2450
2451         target_mask = unit_get_target_mask(u);
2452         enable_mask = unit_get_enable_mask(u);
2453
2454         /* We can only enable in this direction, don't try to disable anything.
2455          */
2456         if (unit_has_mask_enables_realized(u, target_mask, enable_mask))
2457                 return 0;
2458
2459         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
2460
2461         new_target_mask = (crt ? crt->cgroup_realized_mask : 0) | target_mask;
2462         new_enable_mask = (crt ? crt->cgroup_enabled_mask : 0) | enable_mask;
2463
2464         return unit_update_cgroup(u, new_target_mask, new_enable_mask, state);
2465 }
2466
2467 /* Controllers can only be disabled depth-first, from the leaves of the
2468  * hierarchy upwards to the unit in question. */
2469 static int unit_realize_cgroup_now_disable(Unit *u, ManagerState state) {
2470         Unit *m;
2471
2472         assert(u);
2473
2474         if (u->type != UNIT_SLICE)
2475                 return 0;
2476
2477         UNIT_FOREACH_DEPENDENCY(m, u, UNIT_ATOM_SLICE_OF) {
2478                 CGroupMask target_mask, enable_mask, new_target_mask, new_enable_mask;
2479                 int r;
2480
2481                 CGroupRuntime *rt = unit_get_cgroup_runtime(m);
2482                 if (!rt)
2483                         continue;
2484
2485                 /* The cgroup for this unit might not actually be fully realised yet, in which case it isn't
2486                  * holding any controllers open anyway. */
2487                 if (!rt->cgroup_path)
2488                         continue;
2489
2490                 /* We must disable those below us first in order to release the controller. */
2491                 if (m->type == UNIT_SLICE)
2492                         (void) unit_realize_cgroup_now_disable(m, state);
2493
2494                 target_mask = unit_get_target_mask(m);
2495                 enable_mask = unit_get_enable_mask(m);
2496
2497                 /* We can only disable in this direction, don't try to enable anything. */
2498                 if (unit_has_mask_disables_realized(m, target_mask, enable_mask))
2499                         continue;
2500
2501                 new_target_mask = rt->cgroup_realized_mask & target_mask;
2502                 new_enable_mask = rt->cgroup_enabled_mask & enable_mask;
2503
2504                 r = unit_update_cgroup(m, new_target_mask, new_enable_mask, state);
2505                 if (r < 0)
2506                         return r;
2507         }
2508
2509         return 0;
2510 }
2511
2512 /* Check if necessary controllers and attributes for a unit are in place.
2513  *
2514  * - If so, do nothing.
2515  * - If not, create paths, move processes over, and set attributes.
2516  *
2517  * Controllers can only be *enabled* in a breadth-first way, and *disabled* in
2518  * a depth-first way. As such the process looks like this:
2519  *
2520  * Suppose we have a cgroup hierarchy which looks like this:
2521  *
2522  *             root
2523  *            /    \
2524  *           /      \
2525  *          /        \
2526  *         a          b
2527  *        / \        / \
2528  *       /   \      /   \
2529  *      c     d    e     f
2530  *     / \   / \  / \   / \
2531  *     h i   j k  l m   n o
2532  *
2533  * 1. We want to realise cgroup "d" now.
2534  * 2. cgroup "a" has DisableControllers=cpu in the associated unit.
2535  * 3. cgroup "k" just started requesting the memory controller.
2536  *
2537  * To make this work we must do the following in order:
2538  *
2539  * 1. Disable CPU controller in k, j
2540  * 2. Disable CPU controller in d
2541  * 3. Enable memory controller in root
2542  * 4. Enable memory controller in a
2543  * 5. Enable memory controller in d
2544  * 6. Enable memory controller in k
2545  *
2546  * Notice that we need to touch j in one direction, but not the other. We also
2547  * don't go beyond d when disabling -- it's up to "a" to get realized if it
2548  * wants to disable further. The basic rules are therefore:
2549  *
2550  * - If you're disabling something, you need to realise all of the cgroups from
2551  *   your recursive descendants to the root. This starts from the leaves.
2552  * - If you're enabling something, you need to realise from the root cgroup
2553  *   downwards, but you don't need to iterate your recursive descendants.
2554  *
2555  * Returns 0 on success and < 0 on failure. */
2556 static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
2557         CGroupMask target_mask, enable_mask;
2558         Unit *slice;
2559         int r;
2560
2561         assert(u);
2562
2563         unit_remove_from_cgroup_realize_queue(u);
2564
2565         target_mask = unit_get_target_mask(u);
2566         enable_mask = unit_get_enable_mask(u);
2567
2568         if (unit_has_mask_realized(u, target_mask, enable_mask))
2569                 return 0;
2570
2571         /* Disable controllers below us, if there are any */
2572         r = unit_realize_cgroup_now_disable(u, state);
2573         if (r < 0)
2574                 return r;
2575
2576         /* Enable controllers above us, if there are any */
2577         slice = UNIT_GET_SLICE(u);
2578         if (slice) {
2579                 r = unit_realize_cgroup_now_enable(slice, state);
2580                 if (r < 0)
2581                         return r;
2582         }
2583
2584         /* Now actually deal with the cgroup we were trying to realise and set attributes */
2585         r = unit_update_cgroup(u, target_mask, enable_mask, state);
2586         if (r < 0)
2587                 return r;
2588
2589         CGroupRuntime *crt = ASSERT_PTR(unit_get_cgroup_runtime(u));
2590
2591         /* Now, reset the invalidation mask */
2592         crt->cgroup_invalidated_mask = 0;
2593         return 0;
2594 }
2595
2596 unsigned manager_dispatch_cgroup_realize_queue(Manager *m) {
2597         ManagerState state;
2598         unsigned n = 0;
2599         Unit *i;
2600         int r;
2601
2602         assert(m);
2603
2604         state = manager_state(m);
2605
2606         while ((i = m->cgroup_realize_queue)) {
2607                 assert(i->in_cgroup_realize_queue);
2608
2609                 if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(i))) {
2610                         /* Maybe things changed, and the unit is not actually active anymore? */
2611                         unit_remove_from_cgroup_realize_queue(i);
2612                         continue;
2613                 }
2614
2615                 r = unit_realize_cgroup_now(i, state);
2616                 if (r < 0)
2617                         log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);
2618
2619                 n++;
2620         }
2621
2622         return n;
2623 }
2624
2625 void unit_add_family_to_cgroup_realize_queue(Unit *u) {
2626         assert(u);
2627         assert(u->type == UNIT_SLICE);
2628
2629         /* Family of a unit for is defined as (immediate) children of the unit and immediate children of all
2630          * its ancestors.
2631          *
2632          * Ideally we would enqueue ancestor path only (bottom up). However, on cgroup-v1 scheduling becomes
2633          * very weird if two units that own processes reside in the same slice, but one is realized in the
2634          * "cpu" hierarchy and one is not (for example because one has CPUWeight= set and the other does
2635          * not), because that means individual processes need to be scheduled against whole cgroups. Let's
2636          * avoid this asymmetry by always ensuring that siblings of a unit are always realized in their v1
2637          * controller hierarchies too (if unit requires the controller to be realized).
2638          *
2639          * The function must invalidate cgroup_members_mask of all ancestors in order to calculate up to date
2640          * masks. */
2641
2642         do {
2643                 CGroupRuntime *crt = unit_get_cgroup_runtime(u);
2644
2645                 /* Children of u likely changed when we're called */
2646                 if (crt)
2647                         crt->cgroup_members_mask_valid = false;
2648
2649                 Unit *m;
2650                 UNIT_FOREACH_DEPENDENCY(m, u, UNIT_ATOM_SLICE_OF) {
2651
2652                         /* No point in doing cgroup application for units without active processes. */
2653                         if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
2654                                 continue;
2655
2656                         /* We only enqueue siblings if they were realized once at least, in the main
2657                          * hierarchy. */
2658                         crt = unit_get_cgroup_runtime(m);
2659                         if (!crt || !crt->cgroup_path)
2660                                 continue;
2661
2662                         /* If the unit doesn't need any new controllers and has current ones
2663                          * realized, it doesn't need any changes. */
2664                         if (unit_has_mask_realized(m,
2665                                                    unit_get_target_mask(m),
2666                                                    unit_get_enable_mask(m)))
2667                                 continue;
2668
2669                         unit_add_to_cgroup_realize_queue(m);
2670                 }
2671
2672                 /* Parent comes after children */
2673                 unit_add_to_cgroup_realize_queue(u);
2674
2675                 u = UNIT_GET_SLICE(u);
2676         } while (u);
2677 }
2678
2679 int unit_realize_cgroup(Unit *u) {
2680         Unit *slice;
2681
2682         assert(u);
2683
2684         if (!UNIT_HAS_CGROUP_CONTEXT(u))
2685                 return 0;
2686
2687         /* So, here's the deal: when realizing the cgroups for this unit, we need to first create all
2688          * parents, but there's more actually: for the weight-based controllers we also need to make sure
2689          * that all our siblings (i.e. units that are in the same slice as we are) have cgroups, too.  On the
2690          * other hand, when a controller is removed from realized set, it may become unnecessary in siblings
2691          * and ancestors and they should be (de)realized too.
2692          *
2693          * This call will defer work on the siblings and derealized ancestors to the next event loop
2694          * iteration and synchronously creates the parent cgroups (unit_realize_cgroup_now). */
2695
2696         slice = UNIT_GET_SLICE(u);
2697         if (slice)
2698                 unit_add_family_to_cgroup_realize_queue(slice);
2699
2700         /* And realize this one now (and apply the values) */
2701         return unit_realize_cgroup_now(u, manager_state(u->manager));
2702 }
2703
2704 void unit_release_cgroup(Unit *u, bool drop_cgroup_runtime) {
2705         assert(u);
2706
2707         /* Forgets all cgroup details for this cgroup — but does *not* destroy the cgroup. This is hence OK to call
2708          * when we close down everything for reexecution, where we really want to leave the cgroup in place. */
2709
2710         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
2711         if (!crt)
2712                 return;
2713
2714         if (crt->cgroup_path) {
2715                 (void) hashmap_remove(u->manager->cgroup_unit, crt->cgroup_path);
2716                 crt->cgroup_path = mfree(crt->cgroup_path);
2717         }
2718
2719         if (crt->cgroup_control_inotify_wd >= 0) {
2720                 if (inotify_rm_watch(u->manager->cgroup_inotify_fd, crt->cgroup_control_inotify_wd) < 0)
2721                         log_unit_debug_errno(u, errno, "Failed to remove cgroup control inotify watch %i for %s, ignoring: %m", crt->cgroup_control_inotify_wd, u->id);
2722
2723                 (void) hashmap_remove(u->manager->cgroup_control_inotify_wd_unit, INT_TO_PTR(crt->cgroup_control_inotify_wd));
2724                 crt->cgroup_control_inotify_wd = -1;
2725         }
2726
2727         if (crt->cgroup_memory_inotify_wd >= 0) {
2728                 if (inotify_rm_watch(u->manager->cgroup_inotify_fd, crt->cgroup_memory_inotify_wd) < 0)
2729                         log_unit_debug_errno(u, errno, "Failed to remove cgroup memory inotify watch %i for %s, ignoring: %m", crt->cgroup_memory_inotify_wd, u->id);
2730
2731                 (void) hashmap_remove(u->manager->cgroup_memory_inotify_wd_unit, INT_TO_PTR(crt->cgroup_memory_inotify_wd));
2732                 crt->cgroup_memory_inotify_wd = -1;
2733         }
2734
2735         if (drop_cgroup_runtime)
2736                 *(CGroupRuntime**) ((uint8_t*) u + UNIT_VTABLE(u)->cgroup_runtime_offset) = cgroup_runtime_free(crt);
2737 }
2738
2739 int unit_cgroup_is_empty(Unit *u) {
2740         int r;
2741
2742         assert(u);
2743
2744         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
2745         if (!crt)
2746                 return -ENXIO;
2747         if (!crt->cgroup_path)
2748                 return -EOWNERDEAD;
2749
2750         r = cg_is_empty(SYSTEMD_CGROUP_CONTROLLER, crt->cgroup_path);
2751         if (r < 0)
2752                 log_unit_debug_errno(u, r, "Failed to determine whether cgroup %s is empty: %m", empty_to_root(crt->cgroup_path));
2753         return r;
2754 }
2755
2756 static bool unit_maybe_release_cgroup(Unit *u) {
2757         int r;
2758
2759         /* Releases the cgroup only if it is recursively empty.
2760          * Returns true if the cgroup was released, false otherwise. */
2761
2762         assert(u);
2763
2764         /* Don't release the cgroup if there are still processes under it. If we get notified later when all
2765          * the processes exit (e.g. the processes were in D-state and exited after the unit was marked as
2766          * failed) we need the cgroup paths to continue to be tracked by the manager so they can be looked up
2767          * and cleaned up later. */
2768         r = unit_cgroup_is_empty(u);
2769         if (r > 0) {
2770                 /* Do not free CGroupRuntime when called from unit_prune_cgroup. Various accounting data
2771                  * we should keep, especially CPU usage and *_peak ones which would be shown even after
2772                  * the unit stops. */
2773                 unit_release_cgroup(u, /* drop_cgroup_runtime = */ false);
2774                 return true;
2775         }
2776
2777         return false;
2778 }
2779
2780 static int unit_prune_cgroup_via_bus(Unit *u) {
2781         _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
2782         int r;
2783
2784         assert(u);
2785         assert(u->manager);
2786
2787         if (MANAGER_IS_SYSTEM(u->manager))
2788                 return -EINVAL;
2789
2790         if (!u->manager->system_bus)
2791                 return -EIO;
2792
2793         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
2794         if (!crt || !crt->cgroup_path)
2795                 return -EOWNERDEAD;
2796
2797         /* Determine this unit's cgroup path relative to our cgroup root */
2798         const char *pp = path_startswith_full(
2799                         crt->cgroup_path,
2800                         u->manager->cgroup_root,
2801                         PATH_STARTSWITH_RETURN_LEADING_SLASH|PATH_STARTSWITH_REFUSE_DOT_DOT);
2802         if (!pp)
2803                 return -EINVAL;
2804
2805         r = bus_call_method(u->manager->system_bus,
2806                             bus_systemd_mgr,
2807                             "RemoveSubgroupFromUnit",
2808                             &error, NULL,
2809                             "sst",
2810                             NULL /* empty unit name means client's unit, i.e. us */,
2811                             pp,
2812                             (uint64_t) 0);
2813         if (r < 0)
2814                 return log_unit_debug_errno(u, r, "Failed to trim cgroup via the bus: %s", bus_error_message(&error, r));
2815
2816         return 0;
2817 }
2818
2819 void unit_prune_cgroup(Unit *u) {
2820         bool is_root_slice;
2821         int r;
2822
2823         assert(u);
2824
2825         /* Removes the cgroup, if empty and possible, and stops watching it. */
2826         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
2827         if (!crt || !crt->cgroup_path)
2828                 return;
2829
2830         /* Cache the last resource usage values before we destroy the cgroup */
2831         (void) unit_get_cpu_usage(u, /* ret = */ NULL);
2832
2833         for (CGroupMemoryAccountingMetric metric = 0; metric <= _CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST; metric++)
2834                 (void) unit_get_memory_accounting(u, metric, /* ret = */ NULL);
2835
2836         /* All IO metrics are read at once from the underlying cgroup, so issue just a single call */
2837         (void) unit_get_io_accounting(u, _CGROUP_IO_ACCOUNTING_METRIC_INVALID, /* ret = */ NULL);
2838
2839         /* We do not cache IP metrics here because the firewall objects are not freed with cgroups */
2840
2841 #if BPF_FRAMEWORK
2842         (void) bpf_restrict_fs_cleanup(u); /* Remove cgroup from the global LSM BPF map */
2843 #endif
2844
2845         unit_modify_nft_set(u, /* add = */ false);
2846
2847         is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);
2848
2849         r = cg_trim(crt->cgroup_path, !is_root_slice);
2850         if (r < 0) {
2851                 int k = unit_prune_cgroup_via_bus(u);
2852
2853                 if (k >= 0)
2854                         log_unit_debug_errno(u, r, "Failed to destroy cgroup %s on our own (%m), but worked when talking to PID 1.", empty_to_root(crt->cgroup_path));
2855                 else {
2856                         /* One reason we could have failed here is, that the cgroup still contains a process.
2857                          * However, if the cgroup becomes removable at a later time, it might be removed when
2858                          * the containing slice is stopped. So even if we failed now, this unit shouldn't
2859                          * assume that the cgroup is still realized the next time it is started. Do not
2860                          * return early on error, continue cleanup. */
2861                         log_unit_full_errno(u, r == -EBUSY ? LOG_DEBUG : LOG_WARNING, r,
2862                                             "Failed to destroy cgroup %s, ignoring: %m", empty_to_root(crt->cgroup_path));
2863                 }
2864         }
2865
2866         if (is_root_slice)
2867                 return;
2868
2869         if (!unit_maybe_release_cgroup(u)) /* Returns true if the cgroup was released */
2870                 return;
2871
2872         assert(crt == unit_get_cgroup_runtime(u));
2873         assert(!crt->cgroup_path);
2874
2875         crt->cgroup_realized_mask = 0;
2876         crt->cgroup_enabled_mask = 0;
2877
2878         crt->bpf_device_control_installed = bpf_program_free(crt->bpf_device_control_installed);
2879 }
2880
2881 int unit_search_main_pid(Unit *u, PidRef *ret) {
2882         _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
2883         _cleanup_fclose_ FILE *f = NULL;
2884         int r;
2885
2886         assert(u);
2887         assert(ret);
2888
2889         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
2890         if (!crt || !crt->cgroup_path)
2891                 return -ENXIO;
2892
2893         r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, crt->cgroup_path, &f);
2894         if (r < 0)
2895                 return r;
2896
2897         for (;;) {
2898                 _cleanup_(pidref_done) PidRef npidref = PIDREF_NULL;
2899
2900                 /* cg_read_pidref() will return an error on unmapped PIDs.
2901                  * We can't reasonably deal with units that contain those. */
2902                 r = cg_read_pidref(f, &npidref, CGROUP_DONT_SKIP_UNMAPPED);
2903                 if (r < 0)
2904                         return r;
2905                 if (r == 0)
2906                         break;
2907
2908                 if (pidref_equal(&pidref, &npidref)) /* seen already, cgroupfs reports duplicates! */
2909                         continue;
2910
2911                 if (pidref_is_my_child(&npidref) <= 0) /* ignore processes further down the tree */
2912                         continue;
2913
2914                 if (pidref_is_set(&pidref) != 0)
2915                         /* Dang, there's more than one daemonized PID in this group, so we don't know what
2916                          * process is the main process. */
2917                         return -ENODATA;
2918
2919                 pidref = TAKE_PIDREF(npidref);
2920         }
2921
2922         if (!pidref_is_set(&pidref))
2923                 return -ENODATA;
2924
2925         *ret = TAKE_PIDREF(pidref);
2926         return 0;
2927 }
2928
2929 static int on_cgroup_empty_event(sd_event_source *s, void *userdata) {
2930         Manager *m = ASSERT_PTR(userdata);
2931         Unit *u;
2932         int r;
2933
2934         assert(s);
2935
2936         u = m->cgroup_empty_queue;
2937         if (!u)
2938                 return 0;
2939
2940         assert(u->in_cgroup_empty_queue);
2941         u->in_cgroup_empty_queue = false;
2942         LIST_REMOVE(cgroup_empty_queue, m->cgroup_empty_queue, u);
2943
2944         if (m->cgroup_empty_queue) {
2945                 /* More stuff queued, let's make sure we remain enabled */
2946                 r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
2947                 if (r < 0)
2948                         log_debug_errno(r, "Failed to reenable cgroup empty event source, ignoring: %m");
2949         }
2950
2951         /* Update state based on OOM kills before we notify about cgroup empty event */
2952         (void) unit_check_oom(u);
2953         (void) unit_check_oomd_kill(u);
2954
2955         unit_add_to_gc_queue(u);
2956
2957         if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(u)))
2958                 unit_prune_cgroup(u);
2959         else if (UNIT_VTABLE(u)->notify_cgroup_empty)
2960                 UNIT_VTABLE(u)->notify_cgroup_empty(u);
2961
2962         return 0;
2963 }
2964
2965 static void unit_add_to_cgroup_empty_queue(Unit *u) {
2966         int r;
2967
2968         assert(u);
2969
2970         /* Note that cgroup empty events are dispatched in a separate queue with a lower priority than
2971          * the SIGCHLD handler, so that we always use SIGCHLD if we can get it first, and only use
2972          * the cgroup empty notifications if there's no SIGCHLD pending (which might happen if the cgroup
2973          * doesn't contain processes that are our own child, which is typically the case for scope units). */
2974
2975         if (u->in_cgroup_empty_queue)
2976                 return;
2977
2978         LIST_PREPEND(cgroup_empty_queue, u->manager->cgroup_empty_queue, u);
2979         u->in_cgroup_empty_queue = true;
2980
2981         /* Trigger the defer event */
2982         r = sd_event_source_set_enabled(u->manager->cgroup_empty_event_source, SD_EVENT_ONESHOT);
2983         if (r < 0)
2984                 log_debug_errno(r, "Failed to enable cgroup empty event source: %m");
2985 }
2986
2987 static void unit_remove_from_cgroup_empty_queue(Unit *u) {
2988         assert(u);
2989
2990         if (!u->in_cgroup_empty_queue)
2991                 return;
2992
2993         LIST_REMOVE(cgroup_empty_queue, u->manager->cgroup_empty_queue, u);
2994         u->in_cgroup_empty_queue = false;
2995 }
2996
2997 int unit_check_oomd_kill(Unit *u) {
2998         _cleanup_free_ char *value = NULL;
2999         bool increased;
3000         uint64_t n = 0;
3001         int r;
3002
3003         assert(u);
3004
3005         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
3006         if (!crt || !crt->cgroup_path)
3007                 return 0;
3008
3009         r = cg_get_xattr(crt->cgroup_path, "user.oomd_ooms", &value, /* ret_size= */ NULL);
3010         if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r))
3011                 return r;
3012
3013         if (!isempty(value)) {
3014                  r = safe_atou64(value, &n);
3015                  if (r < 0)
3016                          return r;
3017         }
3018
3019         increased = n > crt->managed_oom_kill_last;
3020         crt->managed_oom_kill_last = n;
3021
3022         if (!increased)
3023                 return 0;
3024
3025         n = 0;
3026         value = mfree(value);
3027         r = cg_get_xattr(crt->cgroup_path, "user.oomd_kill", &value, /* ret_size= */ NULL);
3028         if (r >= 0 && !isempty(value))
3029                 (void) safe_atou64(value, &n);
3030
3031         if (n > 0)
3032                 log_unit_struct(u, LOG_NOTICE,
3033                                 LOG_MESSAGE_ID(SD_MESSAGE_UNIT_OOMD_KILL_STR),
3034                                 LOG_UNIT_INVOCATION_ID(u),
3035                                 LOG_UNIT_MESSAGE(u, "systemd-oomd killed %"PRIu64" process(es) in this unit.", n),
3036                                 LOG_ITEM("N_PROCESSES=%" PRIu64, n));
3037         else
3038                 log_unit_struct(u, LOG_NOTICE,
3039                                 LOG_MESSAGE_ID(SD_MESSAGE_UNIT_OOMD_KILL_STR),
3040                                 LOG_UNIT_INVOCATION_ID(u),
3041                                 LOG_UNIT_MESSAGE(u, "systemd-oomd killed some process(es) in this unit."));
3042
3043         unit_notify_cgroup_oom(u, /* managed_oom= */ true);
3044
3045         return 1;
3046 }
3047
3048 int unit_check_oom(Unit *u) {
3049         _cleanup_free_ char *oom_kill = NULL;
3050         bool increased;
3051         uint64_t c;
3052         int r;
3053
3054         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
3055         if (!crt || !crt->cgroup_path)
3056                 return 0;
3057
3058         r = cg_get_keyed_attribute(
3059                         "memory",
3060                         crt->cgroup_path,
3061                         "memory.events",
3062                         STRV_MAKE("oom_kill"),
3063                         &oom_kill);
3064         if (IN_SET(r, -ENOENT, -ENXIO)) /* Handle gracefully if cgroup or oom_kill attribute don't exist */
3065                 c = 0;
3066         else if (r < 0)
3067                 return log_unit_debug_errno(u, r, "Failed to read oom_kill field of memory.events cgroup attribute: %m");
3068         else {
3069                 r = safe_atou64(oom_kill, &c);
3070                 if (r < 0)
3071                         return log_unit_debug_errno(u, r, "Failed to parse oom_kill field: %m");
3072         }
3073
3074         increased = c > crt->oom_kill_last;
3075         crt->oom_kill_last = c;
3076
3077         if (!increased)
3078                 return 0;
3079
3080         log_unit_struct(u, LOG_NOTICE,
3081                         LOG_MESSAGE_ID(SD_MESSAGE_UNIT_OUT_OF_MEMORY_STR),
3082                         LOG_UNIT_INVOCATION_ID(u),
3083                         LOG_UNIT_MESSAGE(u, "A process of this unit has been killed by the OOM killer."));
3084
3085         unit_notify_cgroup_oom(u, /* managed_oom= */ false);
3086
3087         return 1;
3088 }
3089
3090 static int on_cgroup_oom_event(sd_event_source *s, void *userdata) {
3091         Manager *m = ASSERT_PTR(userdata);
3092         Unit *u;
3093         int r;
3094
3095         assert(s);
3096
3097         u = m->cgroup_oom_queue;
3098         if (!u)
3099                 return 0;
3100
3101         assert(u->in_cgroup_oom_queue);
3102         u->in_cgroup_oom_queue = false;
3103         LIST_REMOVE(cgroup_oom_queue, m->cgroup_oom_queue, u);
3104
3105         if (m->cgroup_oom_queue) {
3106                 /* More stuff queued, let's make sure we remain enabled */
3107                 r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
3108                 if (r < 0)
3109                         log_debug_errno(r, "Failed to reenable cgroup oom event source, ignoring: %m");
3110         }
3111
3112         (void) unit_check_oom(u);
3113         unit_add_to_gc_queue(u);
3114
3115         return 0;
3116 }
3117
3118 static void unit_add_to_cgroup_oom_queue(Unit *u) {
3119         int r;
3120
3121         assert(u);
3122
3123         if (u->in_cgroup_oom_queue)
3124                 return;
3125
3126         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
3127         if (!crt || !crt->cgroup_path)
3128                 return;
3129
3130         LIST_PREPEND(cgroup_oom_queue, u->manager->cgroup_oom_queue, u);
3131         u->in_cgroup_oom_queue = true;
3132
3133         /* Trigger the defer event */
3134         if (!u->manager->cgroup_oom_event_source) {
3135                 _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL;
3136
3137                 r = sd_event_add_defer(u->manager->event, &s, on_cgroup_oom_event, u->manager);
3138                 if (r < 0) {
3139                         log_error_errno(r, "Failed to create cgroup oom event source: %m");
3140                         return;
3141                 }
3142
3143                 r = sd_event_source_set_priority(s, EVENT_PRIORITY_CGROUP_OOM);
3144                 if (r < 0) {
3145                         log_error_errno(r, "Failed to set priority of cgroup oom event source: %m");
3146                         return;
3147                 }
3148
3149                 (void) sd_event_source_set_description(s, "cgroup-oom");
3150                 u->manager->cgroup_oom_event_source = TAKE_PTR(s);
3151         }
3152
3153         r = sd_event_source_set_enabled(u->manager->cgroup_oom_event_source, SD_EVENT_ONESHOT);
3154         if (r < 0)
3155                 log_error_errno(r, "Failed to enable cgroup oom event source: %m");
3156 }
3157
3158 static int unit_check_cgroup_events(Unit *u) {
3159         char *values[2] = {};
3160         int r;
3161
3162         assert(u);
3163
3164         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
3165         if (!crt || !crt->cgroup_path)
3166                 return 0;
3167
3168         r = cg_get_keyed_attribute(
3169                         SYSTEMD_CGROUP_CONTROLLER,
3170                         crt->cgroup_path,
3171                         "cgroup.events",
3172                         STRV_MAKE("populated", "frozen"),
3173                         values);
3174         if (r < 0)
3175                 return r;
3176
3177         /* The cgroup.events notifications can be merged together so act as we saw the given state for the
3178          * first time. The functions we call to handle given state are idempotent, which makes them
3179          * effectively remember the previous state. */
3180         if (streq(values[0], "1"))
3181                 unit_remove_from_cgroup_empty_queue(u);
3182         else
3183                 unit_add_to_cgroup_empty_queue(u);
3184
3185         /* Disregard freezer state changes due to operations not initiated by us.
3186          * See: https://github.com/systemd/systemd/pull/13512/files#r416469963 and
3187          *      https://github.com/systemd/systemd/pull/13512#issuecomment-573007207 */
3188         if (IN_SET(u->freezer_state, FREEZER_FREEZING, FREEZER_FREEZING_BY_PARENT, FREEZER_THAWING))
3189                 unit_freezer_complete(u, streq(values[1], "0") ? FREEZER_RUNNING : FREEZER_FROZEN);
3190
3191         free_many_charp(values, ELEMENTSOF(values));
3192         return 0;
3193 }
3194
3195 static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
3196         Manager *m = ASSERT_PTR(userdata);
3197
3198         assert(s);
3199         assert(fd >= 0);
3200
3201         for (;;) {
3202                 union inotify_event_buffer buffer;
3203                 ssize_t l;
3204
3205                 l = read(fd, &buffer, sizeof(buffer));
3206                 if (l < 0) {
3207                         if (ERRNO_IS_TRANSIENT(errno))
3208                                 return 0;
3209
3210                         return log_error_errno(errno, "Failed to read control group inotify events: %m");
3211                 }
3212
3213                 FOREACH_INOTIFY_EVENT_WARN(e, buffer, l) {
3214                         Unit *u;
3215
3216                         if (e->wd < 0)
3217                                 /* Queue overflow has no watch descriptor */
3218                                 continue;
3219
3220                         if (e->mask & IN_IGNORED)
3221                                 /* The watch was just removed */
3222                                 continue;
3223
3224                         /* Note that inotify might deliver events for a watch even after it was removed,
3225                          * because it was queued before the removal. Let's ignore this here safely. */
3226
3227                         u = hashmap_get(m->cgroup_control_inotify_wd_unit, INT_TO_PTR(e->wd));
3228                         if (u)
3229                                 unit_check_cgroup_events(u);
3230
3231                         u = hashmap_get(m->cgroup_memory_inotify_wd_unit, INT_TO_PTR(e->wd));
3232                         if (u)
3233                                 unit_add_to_cgroup_oom_queue(u);
3234                 }
3235         }
3236 }
3237
3238 static int cg_bpf_mask_supported(CGroupMask *ret) {
3239         CGroupMask mask = 0;
3240         int r;
3241
3242         /* BPF-based firewall, device access control, and pinned foreign prog */
3243         if (bpf_program_supported() > 0)
3244                 mask |= CGROUP_MASK_BPF_FIREWALL |
3245                         CGROUP_MASK_BPF_DEVICES |
3246                         CGROUP_MASK_BPF_FOREIGN;
3247
3248         /* BPF-based bind{4|6} hooks */
3249         r = bpf_socket_bind_supported();
3250         if (r < 0)
3251                 return r;
3252         if (r > 0)
3253                 mask |= CGROUP_MASK_BPF_SOCKET_BIND;
3254
3255         /* BPF-based cgroup_skb/{egress|ingress} hooks */
3256         r = bpf_restrict_ifaces_supported();
3257         if (r < 0)
3258                 return r;
3259         if (r > 0)
3260                 mask |= CGROUP_MASK_BPF_RESTRICT_NETWORK_INTERFACES;
3261
3262         *ret = mask;
3263         return 0;
3264 }
3265
3266 int manager_setup_cgroup(Manager *m) {
3267         int r;
3268
3269         assert(m);
3270
3271         /* 1. Determine hierarchy */
3272         m->cgroup_root = mfree(m->cgroup_root);
3273         r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
3274         if (r < 0)
3275                 return log_error_errno(r, "Cannot determine cgroup we are running in: %m");
3276
3277         /* Chop off the init scope, if we are already located in it */
3278         char *e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
3279         if (e)
3280                 *e = 0;
3281
3282         /* And make sure to store away the root value without trailing slash, even for the root dir, so that we can
3283          * easily prepend it everywhere. */
3284         delete_trailing_chars(m->cgroup_root, "/");
3285
3286         /* 2. Pin the cgroupfs mount, so that it cannot be unmounted */
3287         safe_close(m->pin_cgroupfs_fd);
3288         m->pin_cgroupfs_fd = open("/sys/fs/cgroup", O_PATH|O_CLOEXEC|O_DIRECTORY);
3289         if (m->pin_cgroupfs_fd < 0)
3290                 return log_error_errno(errno, "Failed to pin cgroup hierarchy: %m");
3291
3292         /* 3. Allocate cgroup empty defer event source */
3293         m->cgroup_empty_event_source = sd_event_source_disable_unref(m->cgroup_empty_event_source);
3294         r = sd_event_add_defer(m->event, &m->cgroup_empty_event_source, on_cgroup_empty_event, m);
3295         if (r < 0)
3296                 return log_error_errno(r, "Failed to create cgroup empty event source: %m");
3297
3298         /* Schedule cgroup empty checks early, but after having processed service notification messages or
3299          * SIGCHLD signals, so that a cgroup running empty is always just the last safety net of
3300          * notification, and we collected the metadata the notification and SIGCHLD stuff offers first. */
3301         r = sd_event_source_set_priority(m->cgroup_empty_event_source, EVENT_PRIORITY_CGROUP_EMPTY);
3302         if (r < 0)
3303                 return log_error_errno(r, "Failed to set priority of cgroup empty event source: %m");
3304
3305         r = sd_event_source_set_enabled(m->cgroup_empty_event_source, SD_EVENT_OFF);
3306         if (r < 0)
3307                 return log_error_errno(r, "Failed to disable cgroup empty event source: %m");
3308
3309         (void) sd_event_source_set_description(m->cgroup_empty_event_source, "cgroup-empty");
3310
3311         /* 4. Install cgroup empty event notifier inotify object */
3312         m->cgroup_inotify_event_source = sd_event_source_disable_unref(m->cgroup_inotify_event_source);
3313         safe_close(m->cgroup_inotify_fd);
3314
3315         m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
3316         if (m->cgroup_inotify_fd < 0)
3317                 return log_error_errno(errno, "Failed to create control group inotify object: %m");
3318
3319         r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
3320         if (r < 0)
3321                 return log_error_errno(r, "Failed to watch control group inotify object: %m");
3322
3323         /* Process cgroup empty notifications early. Note that when this event is dispatched it'll
3324          * just add the unit to a cgroup empty queue, hence let's run earlier than that. Also see
3325          * handling of cgroup agent notifications, for the classic cgroup hierarchy support. */
3326         r = sd_event_source_set_priority(m->cgroup_inotify_event_source, EVENT_PRIORITY_CGROUP_INOTIFY);
3327         if (r < 0)
3328                 return log_error_errno(r, "Failed to set priority of inotify event source: %m");
3329
3330         (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");
3331
3332         /* 5. Make sure we are in the special "init.scope" unit in the root slice. */
3333         const char *scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
3334         r = cg_create_and_attach(scope_path, /* pid = */ 0);
3335         if (r >= 0) {
3336                 /* Also, move all other userspace processes remaining in the root cgroup into that scope. */
3337                 r = cg_migrate(m->cgroup_root, scope_path, 0);
3338                 if (r < 0)
3339                         log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");
3340
3341         } else if (!MANAGER_IS_TEST_RUN(m))
3342                 return log_error_errno(r, "Failed to create %s control group: %m", scope_path);
3343
3344         /* 6. Figure out which controllers are supported */
3345         r = cg_mask_supported_subtree(m->cgroup_root, &m->cgroup_supported);
3346         if (r < 0)
3347                 return log_error_errno(r, "Failed to determine supported controllers: %m");
3348
3349         /* 7. Figure out which bpf-based pseudo-controllers are supported */
3350         CGroupMask mask;
3351         r = cg_bpf_mask_supported(&mask);
3352         if (r < 0)
3353                 return log_error_errno(r, "Failed to determine supported bpf-based pseudo-controllers: %m");
3354         m->cgroup_supported |= mask;
3355
3356         /* 8. Log which controllers are supported */
3357         for (CGroupController c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
3358                 log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c),
3359                           yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c)));
3360
3361         return 0;
3362 }
3363
3364 void manager_shutdown_cgroup(Manager *m, bool delete) {
3365         assert(m);
3366
3367         /* We can't really delete the group, since we are in it. But
3368          * let's trim it. */
3369         if (delete && m->cgroup_root && !FLAGS_SET(m->test_run_flags, MANAGER_TEST_RUN_MINIMAL))
3370                 (void) cg_trim(m->cgroup_root, false);
3371
3372         m->cgroup_empty_event_source = sd_event_source_disable_unref(m->cgroup_empty_event_source);
3373
3374         m->cgroup_control_inotify_wd_unit = hashmap_free(m->cgroup_control_inotify_wd_unit);
3375         m->cgroup_memory_inotify_wd_unit = hashmap_free(m->cgroup_memory_inotify_wd_unit);
3376
3377         m->cgroup_inotify_event_source = sd_event_source_disable_unref(m->cgroup_inotify_event_source);
3378         m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);
3379
3380         m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);
3381
3382         m->cgroup_root = mfree(m->cgroup_root);
3383 }
3384
3385 Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
3386         char *p;
3387         Unit *u;
3388
3389         assert(m);
3390         assert(cgroup);
3391
3392         u = hashmap_get(m->cgroup_unit, cgroup);
3393         if (u)
3394                 return u;
3395
3396         p = strdupa_safe(cgroup);
3397         for (;;) {
3398                 char *e;
3399
3400                 e = strrchr(p, '/');
3401                 if (!e || e == p)
3402                         return NULL; /* reached cgroup root? return NULL and possibly fall back to manager_get_unit_by_pidref_watching() */
3403
3404                 *e = 0;
3405
3406                 u = hashmap_get(m->cgroup_unit, p);
3407                 if (u)
3408                         return u;
3409         }
3410 }
3411
3412 Unit* manager_get_unit_by_pidref_cgroup(Manager *m, const PidRef *pid) {
3413         _cleanup_free_ char *cgroup = NULL;
3414
3415         assert(m);
3416
3417         if (cg_pidref_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup) < 0)
3418                 return NULL;
3419
3420         return manager_get_unit_by_cgroup(m, cgroup);
3421 }
3422
3423 Unit* manager_get_unit_by_pidref_watching(Manager *m, const PidRef *pid) {
3424         Unit *u, **array;
3425
3426         assert(m);
3427
3428         if (!pidref_is_set(pid))
3429                 return NULL;
3430
3431         u = hashmap_get(m->watch_pids, pid);
3432         if (u)
3433                 return u;
3434
3435         array = hashmap_get(m->watch_pids_more, pid);
3436         if (array)
3437                 return array[0];
3438
3439         return NULL;
3440 }
3441
3442 Unit* manager_get_unit_by_pidref(Manager *m, PidRef *pid) {
3443         Unit *u;
3444
3445         assert(m);
3446
3447         /* Note that a process might be owned by multiple units, we return only one here, which is good
3448          * enough for most cases, though not strictly correct. We prefer the one reported by cgroup
3449          * membership, as that's the most relevant one as children of the process will be assigned to that
3450          * one, too, before all else. */
3451
3452         if (!pidref_is_set(pid))
3453                 return NULL;
3454
3455         if (pidref_is_self(pid))
3456                 return hashmap_get(m->units, SPECIAL_INIT_SCOPE);
3457         if (pid->pid == 1)
3458                 return NULL;
3459
3460         u = manager_get_unit_by_pidref_cgroup(m, pid);
3461         if (u)
3462                 return u;
3463
3464         u = manager_get_unit_by_pidref_watching(m, pid);
3465         if (u)
3466                 return u;
3467
3468         return NULL;
3469 }
3470
3471 int unit_get_memory_available(Unit *u, uint64_t *ret) {
3472         uint64_t available = UINT64_MAX, current = 0;
3473
3474         assert(u);
3475         assert(ret);
3476
3477         /* If data from cgroups can be accessed, try to find out how much more memory a unit can
3478          * claim before hitting the configured cgroup limits (if any). Consider both MemoryHigh
3479          * and MemoryMax, and also any slice the unit might be nested below. */
3480
3481         do {
3482                 uint64_t unit_available, unit_limit = UINT64_MAX;
3483                 CGroupContext *unit_context;
3484
3485                 /* No point in continuing if we can't go any lower */
3486                 if (available == 0)
3487                         break;
3488
3489                 unit_context = unit_get_cgroup_context(u);
3490                 if (!unit_context)
3491                         return -ENODATA;
3492
3493                 (void) unit_get_memory_accounting(u, CGROUP_MEMORY_CURRENT, &current);
3494                 /* in case of error, previous current propagates as lower bound */
3495
3496                 if (unit_has_name(u, SPECIAL_ROOT_SLICE))
3497                         unit_limit = physical_memory();
3498                 else if (unit_context->memory_max == UINT64_MAX && unit_context->memory_high == UINT64_MAX)
3499                         continue;
3500                 unit_limit = MIN3(unit_limit, unit_context->memory_max, unit_context->memory_high);
3501
3502                 unit_available = LESS_BY(unit_limit, current);
3503                 available = MIN(unit_available, available);
3504         } while ((u = UNIT_GET_SLICE(u)));
3505
3506         *ret = available;
3507
3508         return 0;
3509 }
3510
3511 int unit_get_memory_accounting(Unit *u, CGroupMemoryAccountingMetric metric, uint64_t *ret) {
3512
3513         static const char* const attributes_table[_CGROUP_MEMORY_ACCOUNTING_METRIC_MAX] = {
3514                 [CGROUP_MEMORY_CURRENT]       = "memory.current",
3515                 [CGROUP_MEMORY_PEAK]          = "memory.peak",
3516                 [CGROUP_MEMORY_SWAP_CURRENT]  = "memory.swap.current",
3517                 [CGROUP_MEMORY_SWAP_PEAK]     = "memory.swap.peak",
3518                 [CGROUP_MEMORY_ZSWAP_CURRENT] = "memory.zswap.current",
3519         };
3520
3521         uint64_t bytes;
3522         bool updated = false;
3523         int r;
3524
3525         assert(u);
3526         assert(metric >= 0);
3527         assert(metric < _CGROUP_MEMORY_ACCOUNTING_METRIC_MAX);
3528
3529         if (!UNIT_CGROUP_BOOL(u, memory_accounting))
3530                 return -ENODATA;
3531
3532         /* The root cgroup doesn't expose this information. */
3533         if (unit_has_host_root_cgroup(u)) {
3534                 /* System-wide memory usage can be acquired from /proc/ */
3535                 if (metric == CGROUP_MEMORY_CURRENT)
3536                         return procfs_memory_get_used(ret);
3537
3538                 return -ENODATA;
3539         }
3540
3541         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
3542         if (!crt)
3543                 return -ENODATA;
3544         if (!crt->cgroup_path)
3545                 /* If the cgroup is already gone, we try to find the last cached value. */
3546                 goto finish;
3547
3548         if (!FLAGS_SET(crt->cgroup_realized_mask, CGROUP_MASK_MEMORY))
3549                 return -ENODATA;
3550
3551         r = cg_get_attribute_as_uint64("memory", crt->cgroup_path, attributes_table[metric], &bytes);
3552         if (r < 0 && r != -ENODATA)
3553                 return r;
3554         updated = r >= 0;
3555
3556 finish:
3557         if (metric <= _CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST) {
3558                 uint64_t *last = &crt->memory_accounting_last[metric];
3559
3560                 if (updated)
3561                         *last = bytes;
3562                 else if (*last != UINT64_MAX)
3563                         bytes = *last;
3564                 else
3565                         return -ENODATA;
3566
3567         } else if (!updated)
3568                 return -ENODATA;
3569
3570         if (ret)
3571                 *ret = bytes;
3572
3573         return 0;
3574 }
3575
3576 int unit_get_tasks_current(Unit *u, uint64_t *ret) {
3577         assert(u);
3578         assert(ret);
3579
3580         if (!UNIT_CGROUP_BOOL(u, tasks_accounting))
3581                 return -ENODATA;
3582
3583         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
3584         if (!crt || !crt->cgroup_path)
3585                 return -ENODATA;
3586
3587         /* The root cgroup doesn't expose this information, let's get it from /proc instead */
3588         if (unit_has_host_root_cgroup(u))
3589                 return procfs_tasks_get_current(ret);
3590
3591         if ((crt->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0)
3592                 return -ENODATA;
3593
3594         return cg_get_attribute_as_uint64("pids", crt->cgroup_path, "pids.current", ret);
3595 }
3596
3597 static int unit_get_cpu_usage_raw(const Unit *u, const CGroupRuntime *crt, nsec_t *ret) {
3598         int r;
3599
3600         assert(u);
3601         assert(crt);
3602         assert(ret);
3603
3604         if (!crt->cgroup_path)
3605                 return -ENODATA;
3606
3607         /* The root cgroup doesn't expose this information, let's get it from /proc instead */
3608         if (unit_has_host_root_cgroup(u))
3609                 return procfs_cpu_get_usage(ret);
3610
3611         _cleanup_free_ char *val = NULL;
3612         uint64_t us;
3613
3614         r = cg_get_keyed_attribute("cpu", crt->cgroup_path, "cpu.stat", STRV_MAKE("usage_usec"), &val);
3615         if (r < 0)
3616                 return r;
3617
3618         r = safe_atou64(val, &us);
3619         if (r < 0)
3620                 return r;
3621
3622         *ret = us * NSEC_PER_USEC;
3623
3624         return 0;
3625 }
3626
3627 int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
3628         nsec_t ns;
3629         int r;
3630
3631         assert(u);
3632
3633         /* Retrieve the current CPU usage counter. This will subtract the CPU counter taken when the unit was
3634          * started. If the cgroup has been removed already, returns the last cached value. To cache the value, simply
3635          * call this function with a NULL return value. */
3636
3637         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
3638         if (!crt)
3639                 return -ENODATA;
3640
3641         r = unit_get_cpu_usage_raw(u, crt, &ns);
3642         if (r == -ENODATA && crt->cpu_usage_last != NSEC_INFINITY) {
3643                 /* If we can't get the CPU usage anymore (because the cgroup was already removed, for example), use our
3644                  * cached value. */
3645
3646                 if (ret)
3647                         *ret = crt->cpu_usage_last;
3648                 return 0;
3649         }
3650         if (r < 0)
3651                 return r;
3652
3653         if (ns > crt->cpu_usage_base)
3654                 ns -= crt->cpu_usage_base;
3655         else
3656                 ns = 0;
3657
3658         crt->cpu_usage_last = ns;
3659         if (ret)
3660                 *ret = ns;
3661
3662         return 0;
3663 }
3664
3665 int unit_get_ip_accounting(
3666                 Unit *u,
3667                 CGroupIPAccountingMetric metric,
3668                 uint64_t *ret) {
3669
3670         uint64_t value;
3671         int fd, r;
3672
3673         assert(u);
3674         assert(metric >= 0);
3675         assert(metric < _CGROUP_IP_ACCOUNTING_METRIC_MAX);
3676         assert(ret);
3677
3678         if (!UNIT_CGROUP_BOOL(u, ip_accounting))
3679                 return -ENODATA;
3680
3681         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
3682         if (!crt)
3683                 return -ENODATA;
3684
3685         fd = IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_INGRESS_PACKETS) ?
3686                 crt->ip_accounting_ingress_map_fd :
3687                 crt->ip_accounting_egress_map_fd;
3688         if (fd < 0)
3689                 return -ENODATA;
3690
3691         if (IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_EGRESS_BYTES))
3692                 r = bpf_firewall_read_accounting(fd, &value, NULL);
3693         else
3694                 r = bpf_firewall_read_accounting(fd, NULL, &value);
3695         if (r < 0)
3696                 return r;
3697
3698         /* Add in additional metrics from a previous runtime. Note that when reexecing/reloading the daemon we compile
3699          * all BPF programs and maps anew, but serialize the old counters. When deserializing we store them in the
3700          * ip_accounting_extra[] field, and add them in here transparently. */
3701
3702         *ret = value + crt->ip_accounting_extra[metric];
3703
3704         return r;
3705 }
3706
3707 static uint64_t unit_get_effective_limit_one(Unit *u, CGroupLimitType type) {
3708         CGroupContext *cc;
3709
3710         assert(u);
3711         assert(UNIT_HAS_CGROUP_CONTEXT(u));
3712
3713         if (unit_has_name(u, SPECIAL_ROOT_SLICE))
3714                 switch (type) {
3715                         case CGROUP_LIMIT_MEMORY_MAX:
3716                         case CGROUP_LIMIT_MEMORY_HIGH:
3717                                 return physical_memory();
3718                         case CGROUP_LIMIT_TASKS_MAX:
3719                                 return system_tasks_max();
3720                         default:
3721                                 assert_not_reached();
3722                 }
3723
3724         cc = ASSERT_PTR(unit_get_cgroup_context(u));
3725         switch (type) {
3726                 case CGROUP_LIMIT_MEMORY_MAX:
3727                         return cc->memory_max;
3728                 case CGROUP_LIMIT_MEMORY_HIGH:
3729                         return cc->memory_high;
3730                 case CGROUP_LIMIT_TASKS_MAX:
3731                         return cgroup_tasks_max_resolve(&cc->tasks_max);
3732                 default:
3733                         assert_not_reached();
3734         }
3735 }
3736
3737 int unit_get_effective_limit(Unit *u, CGroupLimitType type, uint64_t *ret) {
3738         uint64_t infimum;
3739
3740         assert(u);
3741         assert(ret);
3742         assert(type >= 0);
3743         assert(type < _CGROUP_LIMIT_TYPE_MAX);
3744
3745         if (!UNIT_HAS_CGROUP_CONTEXT(u))
3746                 return -EINVAL;
3747
3748         infimum = unit_get_effective_limit_one(u, type);
3749         for (Unit *slice = UNIT_GET_SLICE(u); slice; slice = UNIT_GET_SLICE(slice))
3750                 infimum = MIN(infimum, unit_get_effective_limit_one(slice, type));
3751
3752         *ret = infimum;
3753         return 0;
3754 }
3755
3756 static int unit_get_io_accounting_raw(
3757                 const Unit *u,
3758                 const CGroupRuntime *crt,
3759                 uint64_t ret[static _CGROUP_IO_ACCOUNTING_METRIC_MAX]) {
3760
3761         static const char* const field_names[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = {
3762                 [CGROUP_IO_READ_BYTES]       = "rbytes=",
3763                 [CGROUP_IO_WRITE_BYTES]      = "wbytes=",
3764                 [CGROUP_IO_READ_OPERATIONS]  = "rios=",
3765                 [CGROUP_IO_WRITE_OPERATIONS] = "wios=",
3766         };
3767
3768         uint64_t acc[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = {};
3769         _cleanup_free_ char *path = NULL;
3770         _cleanup_fclose_ FILE *f = NULL;
3771         int r;
3772
3773         assert(u);
3774         assert(crt);
3775
3776         if (!crt->cgroup_path)
3777                 return -ENODATA;
3778
3779         if (unit_has_host_root_cgroup(u))
3780                 return -ENODATA; /* TODO: return useful data for the top-level cgroup */
3781
3782         if (!FLAGS_SET(crt->cgroup_realized_mask, CGROUP_MASK_IO))
3783                 return -ENODATA;
3784
3785         r = cg_get_path("io", crt->cgroup_path, "io.stat", &path);
3786         if (r < 0)
3787                 return r;
3788
3789         f = fopen(path, "re");
3790         if (!f)
3791                 return -errno;
3792
3793         for (;;) {
3794                 _cleanup_free_ char *line = NULL;
3795                 const char *p;
3796
3797                 r = read_line(f, LONG_LINE_MAX, &line);
3798                 if (r < 0)
3799                         return r;
3800                 if (r == 0)
3801                         break;
3802
3803                 p = line;
3804                 p += strcspn(p, WHITESPACE); /* Skip over device major/minor */
3805                 p += strspn(p, WHITESPACE);  /* Skip over following whitespace */
3806
3807                 for (;;) {
3808                         _cleanup_free_ char *word = NULL;
3809
3810                         r = extract_first_word(&p, &word, NULL, EXTRACT_RETAIN_ESCAPE);
3811                         if (r < 0)
3812                                 return r;
3813                         if (r == 0)
3814                                 break;
3815
3816                         for (CGroupIOAccountingMetric i = 0; i < _CGROUP_IO_ACCOUNTING_METRIC_MAX; i++) {
3817                                 const char *x;
3818
3819                                 x = startswith(word, field_names[i]);
3820                                 if (x) {
3821                                         uint64_t w;
3822
3823                                         r = safe_atou64(x, &w);
3824                                         if (r < 0)
3825                                                 return r;
3826
3827                                         /* Sum up the stats of all devices */
3828                                         acc[i] += w;
3829                                         break;
3830                                 }
3831                         }
3832                 }
3833         }
3834
3835         memcpy(ret, acc, sizeof(acc));
3836         return 0;
3837 }
3838
3839 int unit_get_io_accounting(
3840                 Unit *u,
3841                 CGroupIOAccountingMetric metric,
3842                 uint64_t *ret) {
3843
3844         uint64_t raw[_CGROUP_IO_ACCOUNTING_METRIC_MAX];
3845         int r;
3846
3847         /*
3848          * Retrieve an IO counter, subtracting the value of the counter value at the time the unit was started.
3849          * If ret == NULL and metric == _<...>_INVALID, no return value is expected (refresh the caches only).
3850          */
3851
3852         assert(u);
3853         assert(metric >= 0 || (!ret && metric == _CGROUP_IO_ACCOUNTING_METRIC_INVALID));
3854         assert(metric < _CGROUP_IO_ACCOUNTING_METRIC_MAX);
3855
3856         if (!UNIT_CGROUP_BOOL(u, io_accounting))
3857                 return -ENODATA;
3858
3859         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
3860         if (!crt)
3861                 return -ENODATA;
3862
3863         r = unit_get_io_accounting_raw(u, crt, raw);
3864         if (r == -ENODATA && metric >= 0 && crt->io_accounting_last[metric] != UINT64_MAX)
3865                 goto done;
3866         if (r < 0)
3867                 return r;
3868
3869         for (CGroupIOAccountingMetric i = 0; i < _CGROUP_IO_ACCOUNTING_METRIC_MAX; i++) {
3870                 /* Saturated subtraction */
3871                 if (raw[i] > crt->io_accounting_base[i])
3872                         crt->io_accounting_last[i] = raw[i] - crt->io_accounting_base[i];
3873                 else
3874                         crt->io_accounting_last[i] = 0;
3875         }
3876
3877 done:
3878         if (ret)
3879                 *ret = crt->io_accounting_last[metric];
3880
3881         return 0;
3882 }
3883
3884 static int unit_reset_cpu_accounting(Unit *unit, CGroupRuntime *crt) {
3885         int r;
3886
3887         assert(crt);
3888
3889         crt->cpu_usage_base = 0;
3890         crt->cpu_usage_last = NSEC_INFINITY;
3891
3892         if (unit) {
3893                 r = unit_get_cpu_usage_raw(unit, crt, &crt->cpu_usage_base);
3894                 if (r < 0 && r != -ENODATA)
3895                         return r;
3896         }
3897
3898         return 0;
3899 }
3900
3901 static int unit_reset_io_accounting(Unit *unit, CGroupRuntime *crt) {
3902         int r;
3903
3904         assert(crt);
3905
3906         zero(crt->io_accounting_base);
3907         FOREACH_ELEMENT(i, crt->io_accounting_last)
3908                 *i = UINT64_MAX;
3909
3910         if (unit) {
3911                 r = unit_get_io_accounting_raw(unit, crt, crt->io_accounting_base);
3912                 if (r < 0 && r != -ENODATA)
3913                         return r;
3914         }
3915
3916         return 0;
3917 }
3918
3919 static void cgroup_runtime_reset_memory_accounting_last(CGroupRuntime *crt) {
3920         assert(crt);
3921
3922         FOREACH_ELEMENT(i, crt->memory_accounting_last)
3923                 *i = UINT64_MAX;
3924 }
3925
3926 static int cgroup_runtime_reset_ip_accounting(CGroupRuntime *crt) {
3927         int r = 0;
3928
3929         assert(crt);
3930
3931         if (crt->ip_accounting_ingress_map_fd >= 0)
3932                 RET_GATHER(r, bpf_firewall_reset_accounting(crt->ip_accounting_ingress_map_fd));
3933
3934         if (crt->ip_accounting_egress_map_fd >= 0)
3935                 RET_GATHER(r, bpf_firewall_reset_accounting(crt->ip_accounting_egress_map_fd));
3936
3937         zero(crt->ip_accounting_extra);
3938
3939         return r;
3940 }
3941
3942 int unit_reset_accounting(Unit *u) {
3943         int r = 0;
3944
3945         assert(u);
3946
3947         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
3948         if (!crt)
3949                 return 0;
3950
3951         cgroup_runtime_reset_memory_accounting_last(crt);
3952         RET_GATHER(r, unit_reset_cpu_accounting(u, crt));
3953         RET_GATHER(r, unit_reset_io_accounting(u, crt));
3954         RET_GATHER(r, cgroup_runtime_reset_ip_accounting(crt));
3955
3956         return r;
3957 }
3958
3959 void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
3960         assert(u);
3961
3962         if (!UNIT_HAS_CGROUP_CONTEXT(u))
3963                 return;
3964
3965         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
3966         if (!crt)
3967                 return;
3968
3969         if (FLAGS_SET(crt->cgroup_invalidated_mask, m)) /* NOP? */
3970                 return;
3971
3972         crt->cgroup_invalidated_mask |= m;
3973         unit_add_to_cgroup_realize_queue(u);
3974 }
3975
3976 void unit_invalidate_cgroup_bpf(Unit *u) {
3977         assert(u);
3978
3979         if (!UNIT_HAS_CGROUP_CONTEXT(u))
3980                 return;
3981
3982         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
3983         if (!crt)
3984                 return;
3985
3986         if (crt->cgroup_invalidated_mask & CGROUP_MASK_BPF_FIREWALL) /* NOP? */
3987                 return;
3988
3989         crt->cgroup_invalidated_mask |= CGROUP_MASK_BPF_FIREWALL;
3990         unit_add_to_cgroup_realize_queue(u);
3991
3992         /* If we are a slice unit, we also need to put compile a new BPF program for all our children, as the IP access
3993          * list of our children includes our own. */
3994         if (u->type == UNIT_SLICE) {
3995                 Unit *member;
3996
3997                 UNIT_FOREACH_DEPENDENCY(member, u, UNIT_ATOM_SLICE_OF)
3998                         unit_invalidate_cgroup_bpf(member);
3999         }
4000 }
4001
4002 void unit_cgroup_catchup(Unit *u) {
4003         assert(u);
4004
4005         if (!UNIT_HAS_CGROUP_CONTEXT(u))
4006                 return;
4007
4008         /* We dropped the inotify watch during reexec/reload, so we need to
4009          * check these as they may have changed.
4010          * Note that (currently) the kernel doesn't actually update cgroup
4011          * file modification times, so we can't just serialize and then check
4012          * the mtime for file(s) we are interested in. */
4013         (void) unit_check_cgroup_events(u);
4014         unit_add_to_cgroup_oom_queue(u);
4015 }
4016
4017 bool unit_cgroup_delegate(Unit *u) {
4018         CGroupContext *c;
4019
4020         assert(u);
4021
4022         if (!UNIT_VTABLE(u)->can_delegate)
4023                 return false;
4024
4025         c = unit_get_cgroup_context(u);
4026         if (!c)
4027                 return false;
4028
4029         return c->delegate;
4030 }
4031
4032 void manager_invalidate_startup_units(Manager *m) {
4033         Unit *u;
4034
4035         assert(m);
4036
4037         SET_FOREACH(u, m->startup_units)
4038                 unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_IO|CGROUP_MASK_BLKIO|CGROUP_MASK_CPUSET);
4039 }
4040
4041 static int unit_cgroup_freezer_kernel_state(Unit *u, FreezerState *ret) {
4042         _cleanup_free_ char *val = NULL;
4043         FreezerState s;
4044         int r;
4045
4046         assert(u);
4047         assert(ret);
4048
4049         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
4050         if (!crt || !crt->cgroup_path)
4051                 return -EOWNERDEAD;
4052
4053         r = cg_get_keyed_attribute(
4054                         SYSTEMD_CGROUP_CONTROLLER,
4055                         crt->cgroup_path,
4056                         "cgroup.events",
4057                         STRV_MAKE("frozen"),
4058                         &val);
4059         if (r < 0)
4060                 return r;
4061
4062         if (streq(val, "0"))
4063                 s = FREEZER_RUNNING;
4064         else if (streq(val, "1"))
4065                 s = FREEZER_FROZEN;
4066         else {
4067                 log_unit_debug(u, "Unexpected cgroup frozen state: %s", val);
4068                 s = _FREEZER_STATE_INVALID;
4069         }
4070
4071         *ret = s;
4072         return 0;
4073 }
4074
4075 int unit_cgroup_freezer_action(Unit *u, FreezerAction action) {
4076         _cleanup_free_ char *path = NULL;
4077         FreezerState current, next, objective;
4078         bool action_in_progress = false;
4079         int r;
4080
4081         assert(u);
4082         assert(action >= 0);
4083         assert(action < _FREEZER_ACTION_MAX);
4084
4085         unit_next_freezer_state(u, action, &next, &objective);
4086
4087         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
4088         if (!crt || !crt->cgroup_path)
4089                 /* No realized cgroup = nothing to freeze */
4090                 goto finish;
4091
4092         r = unit_cgroup_freezer_kernel_state(u, &current);
4093         if (r < 0)
4094                 return r;
4095
4096         if (current == objective) {
4097                 if (objective == FREEZER_FROZEN)
4098                         goto finish;
4099
4100                 /* Skip thaw only if no freeze operation was in flight */
4101                 if (IN_SET(u->freezer_state, FREEZER_RUNNING, FREEZER_THAWING))
4102                         goto finish;
4103         } else
4104                 action_in_progress = true;
4105
4106         if (next == freezer_state_finish(next)) {
4107                 /* We're directly transitioning into a finished state, which in theory means that
4108                  * the cgroup's current state already matches the objective and thus we'd return 0.
4109                  * But, reality shows otherwise (such case would have been handled by current == objective
4110                  * branch above). This indicates that our freezer_state tracking has diverged
4111                  * from the real state of the cgroup, which can happen if someone meddles with the
4112                  * cgroup from underneath us. This really shouldn't happen during normal operation,
4113                  * though. So, let's warn about it and fix up the state to be valid */
4114
4115                 log_unit_warning(u, "Unit wants to transition to %s freezer state but cgroup is unexpectedly %s, fixing up.",
4116                                  freezer_state_to_string(next), freezer_state_to_string(current) ?: "(invalid)");
4117
4118                 if (next == FREEZER_FROZEN)
4119                         next = FREEZER_FREEZING;
4120                 else if (next == FREEZER_FROZEN_BY_PARENT)
4121                         next = FREEZER_FREEZING_BY_PARENT;
4122                 else if (next == FREEZER_RUNNING)
4123                         next = FREEZER_THAWING;
4124                 else
4125                         assert_not_reached();
4126         }
4127
4128         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, crt->cgroup_path, "cgroup.freeze", &path);
4129         if (r < 0)
4130                 return r;
4131
4132         r = write_string_file(path, one_zero(objective == FREEZER_FROZEN), WRITE_STRING_FILE_DISABLE_BUFFER);
4133         if (r < 0)
4134                 return r;
4135
4136 finish:
4137         if (action_in_progress)
4138                 unit_set_freezer_state(u, next);
4139         else
4140                 unit_set_freezer_state(u, freezer_state_finish(next));
4141
4142         return action_in_progress;
4143 }
4144
4145 int unit_get_cpuset(Unit *u, CPUSet *cpus, const char *name) {
4146         _cleanup_free_ char *v = NULL;
4147         int r;
4148
4149         assert(u);
4150         assert(cpus);
4151
4152         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
4153         if (!crt || !crt->cgroup_path)
4154                 return -ENODATA;
4155
4156         if ((crt->cgroup_realized_mask & CGROUP_MASK_CPUSET) == 0)
4157                 return -ENODATA;
4158
4159         r = cg_get_attribute("cpuset", crt->cgroup_path, name, &v);
4160         if (r == -ENOENT)
4161                 return -ENODATA;
4162         if (r < 0)
4163                 return r;
4164
4165         return parse_cpu_set(v, cpus);
4166 }
4167
4168 CGroupRuntime* cgroup_runtime_new(void) {
4169         _cleanup_(cgroup_runtime_freep) CGroupRuntime *crt = NULL;
4170
4171         crt = new(CGroupRuntime, 1);
4172         if (!crt)
4173                 return NULL;
4174
4175         *crt = (CGroupRuntime) {
4176                 .cgroup_control_inotify_wd = -1,
4177                 .cgroup_memory_inotify_wd = -1,
4178
4179                 .ip_accounting_ingress_map_fd = -EBADF,
4180                 .ip_accounting_egress_map_fd = -EBADF,
4181
4182                 .ipv4_allow_map_fd = -EBADF,
4183                 .ipv6_allow_map_fd = -EBADF,
4184                 .ipv4_deny_map_fd = -EBADF,
4185                 .ipv6_deny_map_fd = -EBADF,
4186
4187                 .cgroup_invalidated_mask = _CGROUP_MASK_ALL,
4188
4189                 .deserialized_cgroup_realized = -1,
4190         };
4191
4192         unit_reset_cpu_accounting(/* unit = */ NULL, crt);
4193         unit_reset_io_accounting(/* unit = */ NULL, crt);
4194         cgroup_runtime_reset_memory_accounting_last(crt);
4195         assert_se(cgroup_runtime_reset_ip_accounting(crt) >= 0);
4196
4197         return TAKE_PTR(crt);
4198 }
4199
4200 CGroupRuntime* cgroup_runtime_free(CGroupRuntime *crt) {
4201         if (!crt)
4202                 return NULL;
4203
4204         fdset_free(crt->initial_socket_bind_link_fds);
4205 #if BPF_FRAMEWORK
4206         bpf_link_free(crt->ipv4_socket_bind_link);
4207         bpf_link_free(crt->ipv6_socket_bind_link);
4208 #endif
4209         hashmap_free(crt->bpf_foreign_by_key);
4210
4211         bpf_program_free(crt->bpf_device_control_installed);
4212
4213 #if BPF_FRAMEWORK
4214         bpf_link_free(crt->restrict_ifaces_ingress_bpf_link);
4215         bpf_link_free(crt->restrict_ifaces_egress_bpf_link);
4216 #endif
4217         fdset_free(crt->initial_restrict_ifaces_link_fds);
4218
4219         bpf_firewall_close(crt);
4220
4221         free(crt->cgroup_path);
4222
4223         return mfree(crt);
4224 }
4225
4226 static const char* const ip_accounting_metric_field_table[_CGROUP_IP_ACCOUNTING_METRIC_MAX] = {
4227         [CGROUP_IP_INGRESS_BYTES]   = "ip-accounting-ingress-bytes",
4228         [CGROUP_IP_INGRESS_PACKETS] = "ip-accounting-ingress-packets",
4229         [CGROUP_IP_EGRESS_BYTES]    = "ip-accounting-egress-bytes",
4230         [CGROUP_IP_EGRESS_PACKETS]  = "ip-accounting-egress-packets",
4231 };
4232
4233 DEFINE_PRIVATE_STRING_TABLE_LOOKUP(ip_accounting_metric_field, CGroupIPAccountingMetric);
4234
4235 static const char* const io_accounting_metric_field_base_table[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = {
4236         [CGROUP_IO_READ_BYTES]       = "io-accounting-read-bytes-base",
4237         [CGROUP_IO_WRITE_BYTES]      = "io-accounting-write-bytes-base",
4238         [CGROUP_IO_READ_OPERATIONS]  = "io-accounting-read-operations-base",
4239         [CGROUP_IO_WRITE_OPERATIONS] = "io-accounting-write-operations-base",
4240 };
4241
4242 DEFINE_PRIVATE_STRING_TABLE_LOOKUP(io_accounting_metric_field_base, CGroupIOAccountingMetric);
4243
4244 static const char* const io_accounting_metric_field_last_table[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = {
4245         [CGROUP_IO_READ_BYTES]       = "io-accounting-read-bytes-last",
4246         [CGROUP_IO_WRITE_BYTES]      = "io-accounting-write-bytes-last",
4247         [CGROUP_IO_READ_OPERATIONS]  = "io-accounting-read-operations-last",
4248         [CGROUP_IO_WRITE_OPERATIONS] = "io-accounting-write-operations-last",
4249 };
4250
4251 DEFINE_PRIVATE_STRING_TABLE_LOOKUP(io_accounting_metric_field_last, CGroupIOAccountingMetric);
4252
4253 static const char* const memory_accounting_metric_field_last_table[_CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST + 1] = {
4254         [CGROUP_MEMORY_PEAK]      = "memory-accounting-peak",
4255         [CGROUP_MEMORY_SWAP_PEAK] = "memory-accounting-swap-peak",
4256 };
4257
4258 DEFINE_PRIVATE_STRING_TABLE_LOOKUP(memory_accounting_metric_field_last, CGroupMemoryAccountingMetric);
4259
4260 static int serialize_cgroup_mask(FILE *f, const char *key, CGroupMask mask) {
4261         _cleanup_free_ char *s = NULL;
4262         int r;
4263
4264         assert(f);
4265         assert(key);
4266
4267         if (mask == 0)
4268                 return 0;
4269
4270         r = cg_mask_to_string(mask, &s);
4271         if (r < 0)
4272                 return log_error_errno(r, "Failed to format cgroup mask: %m");
4273
4274         return serialize_item(f, key, s);
4275 }
4276
4277 int cgroup_runtime_serialize(Unit *u, FILE *f, FDSet *fds) {
4278         int r;
4279
4280         assert(u);
4281         assert(f);
4282         assert(fds);
4283
4284         CGroupRuntime *crt = unit_get_cgroup_runtime(u);
4285         if (!crt)
4286                 return 0;
4287
4288         (void) serialize_item_format(f, "cpu-usage-base", "%" PRIu64, crt->cpu_usage_base);
4289         if (crt->cpu_usage_last != NSEC_INFINITY)
4290                 (void) serialize_item_format(f, "cpu-usage-last", "%" PRIu64, crt->cpu_usage_last);
4291
4292         if (crt->managed_oom_kill_last > 0)
4293                 (void) serialize_item_format(f, "managed-oom-kill-last", "%" PRIu64, crt->managed_oom_kill_last);
4294
4295         if (crt->oom_kill_last > 0)
4296                 (void) serialize_item_format(f, "oom-kill-last", "%" PRIu64, crt->oom_kill_last);
4297
4298         for (CGroupMemoryAccountingMetric metric = 0; metric <= _CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST; metric++) {
4299                 uint64_t v;
4300
4301                 r = unit_get_memory_accounting(u, metric, &v);
4302                 if (r >= 0)
4303                         (void) serialize_item_format(f, memory_accounting_metric_field_last_to_string(metric), "%" PRIu64, v);
4304         }
4305
4306         for (CGroupIPAccountingMetric m = 0; m < _CGROUP_IP_ACCOUNTING_METRIC_MAX; m++) {
4307                 uint64_t v;
4308
4309                 r = unit_get_ip_accounting(u, m, &v);
4310                 if (r >= 0)
4311                         (void) serialize_item_format(f, ip_accounting_metric_field_to_string(m), "%" PRIu64, v);
4312         }
4313
4314         for (CGroupIOAccountingMetric im = 0; im < _CGROUP_IO_ACCOUNTING_METRIC_MAX; im++) {
4315                 (void) serialize_item_format(f, io_accounting_metric_field_base_to_string(im), "%" PRIu64, crt->io_accounting_base[im]);
4316
4317                 if (crt->io_accounting_last[im] != UINT64_MAX)
4318                         (void) serialize_item_format(f, io_accounting_metric_field_last_to_string(im), "%" PRIu64, crt->io_accounting_last[im]);
4319         }
4320
4321         if (crt->cgroup_path)
4322                 (void) serialize_item(f, "cgroup", crt->cgroup_path);
4323         if (crt->cgroup_id != 0)
4324                 (void) serialize_item_format(f, "cgroup-id", "%" PRIu64, crt->cgroup_id);
4325
4326         (void) serialize_cgroup_mask(f, "cgroup-realized-mask", crt->cgroup_realized_mask);
4327         (void) serialize_cgroup_mask(f, "cgroup-enabled-mask", crt->cgroup_enabled_mask);
4328         (void) serialize_cgroup_mask(f, "cgroup-invalidated-mask", crt->cgroup_invalidated_mask);
4329
4330         (void) bpf_socket_bind_serialize(u, f, fds);
4331
4332         (void) bpf_program_serialize_attachment(f, fds, "ip-bpf-ingress-installed", crt->ip_bpf_ingress_installed);
4333         (void) bpf_program_serialize_attachment(f, fds, "ip-bpf-egress-installed", crt->ip_bpf_egress_installed);
4334         (void) bpf_program_serialize_attachment(f, fds, "bpf-device-control-installed", crt->bpf_device_control_installed);
4335         (void) bpf_program_serialize_attachment_set(f, fds, "ip-bpf-custom-ingress-installed", crt->ip_bpf_custom_ingress_installed);
4336         (void) bpf_program_serialize_attachment_set(f, fds, "ip-bpf-custom-egress-installed", crt->ip_bpf_custom_egress_installed);
4337
4338         (void) bpf_restrict_ifaces_serialize(u, f, fds);
4339
4340         return 0;
4341 }
4342
4343 #define MATCH_DESERIALIZE(u, key, l, v, parse_func, target)             \
4344         ({                                                              \
4345                 bool _deserialize_matched = streq(l, key);              \
4346                 if (_deserialize_matched) {                             \
4347                         CGroupRuntime *crt = unit_setup_cgroup_runtime(u); \
4348                         if (!crt)                                       \
4349                                 log_oom_debug();                        \
4350                         else {                                          \
4351                                 int _deserialize_r = parse_func(v);     \
4352                                 if (_deserialize_r < 0)                 \
4353                                         log_unit_debug_errno(u, _deserialize_r, \
4354                                                              "Failed to parse \"%s=%s\", ignoring.", l, v); \
4355                                 else                                    \
4356                                         crt->target = _deserialize_r; \
4357                         }                                               \
4358                 }                                                       \
4359                 _deserialize_matched;                                   \
4360         })
4361
4362 #define MATCH_DESERIALIZE_IMMEDIATE(u, key, l, v, parse_func, target)   \
4363         ({                                                              \
4364                  bool _deserialize_matched = streq(l, key);             \
4365                  if (_deserialize_matched) {                            \
4366                          CGroupRuntime *crt = unit_setup_cgroup_runtime(u); \
4367                          if (!crt)                                      \
4368                                  log_oom_debug();                       \
4369                          else {                                         \
4370                                  int _deserialize_r = parse_func(v, &crt->target); \
4371                                  if (_deserialize_r < 0)                \
4372                                          log_unit_debug_errno(u, _deserialize_r, \
4373                                                               "Failed to parse \"%s=%s\", ignoring", l, v); \
4374                          }                                              \
4375                  }                                                      \
4376                 _deserialize_matched;                                   \
4377         })
4378
4379 #define MATCH_DESERIALIZE_METRIC(u, key, l, v, parse_func, target)             \
4380         ({                                                              \
4381                 bool _deserialize_matched = streq(l, key);              \
4382                 if (_deserialize_matched) {                             \
4383                         CGroupRuntime *crt = unit_setup_cgroup_runtime(u); \
4384                         if (!crt)                                       \
4385                                 log_oom_debug();                        \
4386                         else {                                          \
4387                                 int _deserialize_r = parse_func(v);     \
4388                                 if (_deserialize_r < 0)                 \
4389                                         log_unit_debug_errno(u, _deserialize_r, \
4390                                                              "Failed to parse \"%s=%s\", ignoring.", l, v); \
4391                                 else                                    \
4392                                         crt->target = _deserialize_r; \
4393                         }                                               \
4394                 }                                                       \
4395                 _deserialize_matched;                                   \
4396         })
4397
4398 int cgroup_runtime_deserialize_one(Unit *u, const char *key, const char *value, FDSet *fds) {
4399         int r;
4400
4401         assert(u);
4402         assert(value);
4403
4404         if (!UNIT_HAS_CGROUP_CONTEXT(u))
4405                 return 0;
4406
4407         if (MATCH_DESERIALIZE_IMMEDIATE(u, "cpu-usage-base", key, value, safe_atou64, cpu_usage_base) ||
4408             MATCH_DESERIALIZE_IMMEDIATE(u, "cpuacct-usage-base", key, value, safe_atou64, cpu_usage_base))
4409                 return 1;
4410
4411         if (MATCH_DESERIALIZE_IMMEDIATE(u, "cpu-usage-last", key, value, safe_atou64, cpu_usage_last))
4412                 return 1;
4413
4414         if (MATCH_DESERIALIZE_IMMEDIATE(u, "managed-oom-kill-last", key, value, safe_atou64, managed_oom_kill_last))
4415                 return 1;
4416
4417         if (MATCH_DESERIALIZE_IMMEDIATE(u, "oom-kill-last", key, value, safe_atou64, oom_kill_last))
4418                 return 1;
4419
4420         if (streq(key, "cgroup")) {
4421                 r = unit_set_cgroup_path(u, value);
4422                 if (r < 0)
4423                         log_unit_debug_errno(u, r, "Failed to set cgroup path %s, ignoring: %m", value);
4424
4425                 return 1;
4426         }
4427
4428         if (MATCH_DESERIALIZE_IMMEDIATE(u, "cgroup-id", key, value, safe_atou64, cgroup_id))
4429                 return 1;
4430
4431         if (MATCH_DESERIALIZE_IMMEDIATE(u, "cgroup-realized", key, value, parse_tristate, deserialized_cgroup_realized))
4432                 return 1;
4433
4434         if (MATCH_DESERIALIZE_IMMEDIATE(u, "cgroup-realized-mask", key, value, cg_mask_from_string, cgroup_realized_mask))
4435                 return 1;
4436
4437         if (MATCH_DESERIALIZE_IMMEDIATE(u, "cgroup-enabled-mask", key, value, cg_mask_from_string, cgroup_enabled_mask))
4438                 return 1;
4439
4440         if (MATCH_DESERIALIZE_IMMEDIATE(u, "cgroup-invalidated-mask", key, value, cg_mask_from_string, cgroup_invalidated_mask))
4441                 return 1;
4442
4443         if (STR_IN_SET(key, "ipv4-socket-bind-bpf-link-fd", "ipv6-socket-bind-bpf-link-fd")) {
4444                 int fd;
4445
4446                 fd = deserialize_fd(fds, value);
4447                 if (fd >= 0)
4448                         (void) bpf_socket_bind_add_initial_link_fd(u, fd);
4449
4450                 return 1;
4451         }
4452
4453         if (STR_IN_SET(key,
4454                        "ip-bpf-ingress-installed", "ip-bpf-egress-installed",
4455                        "bpf-device-control-installed",
4456                        "ip-bpf-custom-ingress-installed", "ip-bpf-custom-egress-installed")) {
4457
4458                 CGroupRuntime *crt = unit_setup_cgroup_runtime(u);
4459                 if (!crt)
4460                         log_oom_debug();
4461                 else {
4462                         if (streq(key, "ip-bpf-ingress-installed"))
4463                                 (void) bpf_program_deserialize_attachment(value, fds, &crt->ip_bpf_ingress_installed);
4464
4465                         if (streq(key, "ip-bpf-egress-installed"))
4466                                 (void) bpf_program_deserialize_attachment(value, fds, &crt->ip_bpf_egress_installed);
4467
4468                         if (streq(key, "bpf-device-control-installed"))
4469                                 (void) bpf_program_deserialize_attachment(value, fds, &crt->bpf_device_control_installed);
4470
4471                         if (streq(key, "ip-bpf-custom-ingress-installed"))
4472                                 (void) bpf_program_deserialize_attachment_set(value, fds, &crt->ip_bpf_custom_ingress_installed);
4473
4474                         if (streq(key, "ip-bpf-custom-egress-installed"))
4475                                 (void) bpf_program_deserialize_attachment_set(value, fds, &crt->ip_bpf_custom_egress_installed);
4476                 }
4477
4478                 return 1;
4479         }
4480
4481         if (streq(key, "restrict-ifaces-bpf-fd")) {
4482                 int fd;
4483
4484                 fd = deserialize_fd(fds, value);
4485                 if (fd >= 0)
4486                         (void) bpf_restrict_ifaces_add_initial_link_fd(u, fd);
4487                 return 1;
4488         }
4489
4490         CGroupMemoryAccountingMetric mm = memory_accounting_metric_field_last_from_string(key);
4491         if (mm >= 0) {
4492                 uint64_t c;
4493
4494                 r = safe_atou64(value, &c);
4495                 if (r < 0)
4496                         log_unit_debug(u, "Failed to parse memory accounting last value %s, ignoring.", value);
4497                 else {
4498                         CGroupRuntime *crt = unit_setup_cgroup_runtime(u);
4499                         if (!crt)
4500                                 log_oom_debug();
4501                         else
4502                                 crt->memory_accounting_last[mm] = c;
4503                 }
4504
4505                 return 1;
4506         }
4507
4508         CGroupIPAccountingMetric ipm = ip_accounting_metric_field_from_string(key);
4509         if (ipm >= 0) {
4510                 uint64_t c;
4511
4512                 r = safe_atou64(value, &c);
4513                 if (r < 0)
4514                         log_unit_debug(u, "Failed to parse IP accounting value %s, ignoring.", value);
4515                 else {
4516                         CGroupRuntime *crt = unit_setup_cgroup_runtime(u);
4517                         if (!crt)
4518                                 log_oom_debug();
4519                         else
4520                                 crt->ip_accounting_extra[ipm] = c;
4521                 }
4522
4523                 return 1;
4524         }
4525
4526         CGroupIOAccountingMetric iom = io_accounting_metric_field_base_from_string(key);
4527         if (iom >= 0) {
4528                 uint64_t c;
4529
4530                 r = safe_atou64(value, &c);
4531                 if (r < 0)
4532                         log_unit_debug(u, "Failed to parse IO accounting base value %s, ignoring.", value);
4533                 else {
4534                         CGroupRuntime *crt = unit_setup_cgroup_runtime(u);
4535                         if (!crt)
4536                                 log_oom_debug();
4537                         else
4538                                 crt->io_accounting_base[iom] = c;
4539                 }
4540
4541                 return 1;
4542         }
4543
4544         iom = io_accounting_metric_field_last_from_string(key);
4545         if (iom >= 0) {
4546                 uint64_t c;
4547
4548                 r = safe_atou64(value, &c);
4549                 if (r < 0)
4550                         log_unit_debug(u, "Failed to parse IO accounting last value %s, ignoring.", value);
4551                 else {
4552                         CGroupRuntime *crt = unit_setup_cgroup_runtime(u);
4553                         if (!crt)
4554                                 log_oom_debug();
4555                         else
4556                                 crt->io_accounting_last[iom] = c;
4557                 }
4558                 return 1;
4559         }
4560
4561         return 0;
4562 }
4563
4564 static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
4565         [CGROUP_DEVICE_POLICY_AUTO]   = "auto",
4566         [CGROUP_DEVICE_POLICY_CLOSED] = "closed",
4567         [CGROUP_DEVICE_POLICY_STRICT] = "strict",
4568 };
4569
4570 DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);
4571
4572 static const char* const cgroup_pressure_watch_table[_CGROUP_PRESSURE_WATCH_MAX] = {
4573         [CGROUP_PRESSURE_WATCH_NO]   = "no",
4574         [CGROUP_PRESSURE_WATCH_YES]  = "yes",
4575         [CGROUP_PRESSURE_WATCH_AUTO] = "auto",
4576         [CGROUP_PRESSURE_WATCH_SKIP] = "skip",
4577 };
4578
4579 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(cgroup_pressure_watch, CGroupPressureWatch, CGROUP_PRESSURE_WATCH_YES);
4580
4581 static const char* const cgroup_ip_accounting_metric_table[_CGROUP_IP_ACCOUNTING_METRIC_MAX] = {
4582         [CGROUP_IP_INGRESS_BYTES]   = "IPIngressBytes",
4583         [CGROUP_IP_EGRESS_BYTES]    = "IPEgressBytes",
4584         [CGROUP_IP_INGRESS_PACKETS] = "IPIngressPackets",
4585         [CGROUP_IP_EGRESS_PACKETS]  = "IPEgressPackets",
4586 };
4587
4588 DEFINE_STRING_TABLE_LOOKUP(cgroup_ip_accounting_metric, CGroupIPAccountingMetric);
4589
4590 static const char* const cgroup_io_accounting_metric_table[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = {
4591         [CGROUP_IO_READ_BYTES]       = "IOReadBytes",
4592         [CGROUP_IO_WRITE_BYTES]      = "IOWriteBytes",
4593         [CGROUP_IO_READ_OPERATIONS]  = "IOReadOperations",
4594         [CGROUP_IO_WRITE_OPERATIONS] = "IOWriteOperations",
4595 };
4596
4597 DEFINE_STRING_TABLE_LOOKUP(cgroup_io_accounting_metric, CGroupIOAccountingMetric);
4598
4599 static const char* const cgroup_memory_accounting_metric_table[_CGROUP_MEMORY_ACCOUNTING_METRIC_MAX] = {
4600         [CGROUP_MEMORY_CURRENT]       = "MemoryCurrent",
4601         [CGROUP_MEMORY_PEAK]          = "MemoryPeak",
4602         [CGROUP_MEMORY_SWAP_CURRENT]  = "MemorySwapCurrent",
4603         [CGROUP_MEMORY_SWAP_PEAK]     = "MemorySwapPeak",
4604         [CGROUP_MEMORY_ZSWAP_CURRENT] = "MemoryZSwapCurrent",
4605 };
4606
4607 DEFINE_STRING_TABLE_LOOKUP(cgroup_memory_accounting_metric, CGroupMemoryAccountingMetric);
4608
4609 static const char *const cgroup_effective_limit_type_table[_CGROUP_LIMIT_TYPE_MAX] = {
4610         [CGROUP_LIMIT_MEMORY_MAX]  = "EffectiveMemoryMax",
4611         [CGROUP_LIMIT_MEMORY_HIGH] = "EffectiveMemoryHigh",
4612         [CGROUP_LIMIT_TASKS_MAX]   = "EffectiveTasksMax",
4613 };
4614
4615 DEFINE_STRING_TABLE_LOOKUP(cgroup_effective_limit_type, CGroupLimitType);