src/core/cgroup.c

   1 /***
   2   This file is part of systemd.
   3
   4   Copyright 2013 Lennart Poettering
   5
   6   systemd is free software; you can redistribute it and/or modify it
   7   under the terms of the GNU Lesser General Public License as published by
   8   the Free Software Foundation; either version 2.1 of the License, or
   9   (at your option) any later version.
  10
  11   systemd is distributed in the hope that it will be useful, but
  12   WITHOUT ANY WARRANTY; without even the implied warranty of
  13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14   Lesser General Public License for more details.
  15
  16   You should have received a copy of the GNU Lesser General Public License
  17   along with systemd; If not, see <http://www.gnu.org/licenses/>.
  18 ***/
  19
  20 #include <fcntl.h>
  21 #include <fnmatch.h>
  22
  23 #include "alloc-util.h"
  24 #include "cgroup-util.h"
  25 #include "cgroup.h"
  26 #include "fd-util.h"
  27 #include "fileio.h"
  28 #include "fs-util.h"
  29 #include "parse-util.h"
  30 #include "path-util.h"
  31 #include "process-util.h"
  32 #include "special.h"
  33 #include "string-table.h"
  34 #include "string-util.h"
  35 #include "stdio-util.h"
  36
/* Value written to cpu.cfs_period_us by cgroup_context_apply(); the per-second CPU quota is
 * rescaled to this period before being written to cpu.cfs_quota_us. */
#define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
  38
  39 static void cgroup_compat_warn(void) {
  40         static bool cgroup_compat_warned = false;
  41
  42         if (cgroup_compat_warned)
  43                 return;
  44
  45         log_warning("cgroup compatibility translation between legacy and unified hierarchy settings activated. See cgroup-compat debug messages for details.");
  46         cgroup_compat_warned = true;
  47 }
  48
/* Logs a unit debug message prefixed with "cgroup-compat: ". Before logging it calls
 * cgroup_compat_warn(), so the one-time warning is emitted ahead of the first such message. */
#define log_cgroup_compat(unit, fmt, ...) do {                                  \
                cgroup_compat_warn();                                           \
                log_unit_debug(unit, "cgroup-compat: " fmt, ##__VA_ARGS__);     \
        } while (false)
  53
  54 void cgroup_context_init(CGroupContext *c) {
  55         assert(c);
  56
  57         /* Initialize everything to the kernel defaults, assuming the
  58          * structure is preinitialized to 0 */
  59
  60         c->cpu_shares = CGROUP_CPU_SHARES_INVALID;
  61         c->startup_cpu_shares = CGROUP_CPU_SHARES_INVALID;
  62         c->cpu_quota_per_sec_usec = USEC_INFINITY;
  63
  64         c->memory_high = CGROUP_LIMIT_MAX;
  65         c->memory_max = CGROUP_LIMIT_MAX;
  66
  67         c->memory_limit = CGROUP_LIMIT_MAX;
  68
  69         c->io_weight = CGROUP_WEIGHT_INVALID;
  70         c->startup_io_weight = CGROUP_WEIGHT_INVALID;
  71
  72         c->blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
  73         c->startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
  74
  75         c->tasks_max = (uint64_t) -1;
  76 }
  77
  78 void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
  79         assert(c);
  80         assert(a);
  81
  82         LIST_REMOVE(device_allow, c->device_allow, a);
  83         free(a->path);
  84         free(a);
  85 }
  86
  87 void cgroup_context_free_io_device_weight(CGroupContext *c, CGroupIODeviceWeight *w) {
  88         assert(c);
  89         assert(w);
  90
  91         LIST_REMOVE(device_weights, c->io_device_weights, w);
  92         free(w->path);
  93         free(w);
  94 }
  95
  96 void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit *l) {
  97         assert(c);
  98         assert(l);
  99
 100         LIST_REMOVE(device_limits, c->io_device_limits, l);
 101         free(l->path);
 102         free(l);
 103 }
 104
 105 void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
 106         assert(c);
 107         assert(w);
 108
 109         LIST_REMOVE(device_weights, c->blockio_device_weights, w);
 110         free(w->path);
 111         free(w);
 112 }
 113
 114 void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
 115         assert(c);
 116         assert(b);
 117
 118         LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
 119         free(b->path);
 120         free(b);
 121 }
 122
 123 void cgroup_context_done(CGroupContext *c) {
 124         assert(c);
 125
 126         while (c->io_device_weights)
 127                 cgroup_context_free_io_device_weight(c, c->io_device_weights);
 128
 129         while (c->io_device_limits)
 130                 cgroup_context_free_io_device_limit(c, c->io_device_limits);
 131
 132         while (c->blockio_device_weights)
 133                 cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
 134
 135         while (c->blockio_device_bandwidths)
 136                 cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);
 137
 138         while (c->device_allow)
 139                 cgroup_context_free_device_allow(c, c->device_allow);
 140 }
 141
 142 void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
 143         CGroupIODeviceLimit *il;
 144         CGroupIODeviceWeight *iw;
 145         CGroupBlockIODeviceBandwidth *b;
 146         CGroupBlockIODeviceWeight *w;
 147         CGroupDeviceAllow *a;
 148         char u[FORMAT_TIMESPAN_MAX];
 149
 150         assert(c);
 151         assert(f);
 152
 153         prefix = strempty(prefix);
 154
 155         fprintf(f,
 156                 "%sCPUAccounting=%s\n"
 157                 "%sIOAccounting=%s\n"
 158                 "%sBlockIOAccounting=%s\n"
 159                 "%sMemoryAccounting=%s\n"
 160                 "%sTasksAccounting=%s\n"
 161                 "%sCPUShares=%" PRIu64 "\n"
 162                 "%sStartupCPUShares=%" PRIu64 "\n"
 163                 "%sCPUQuotaPerSecSec=%s\n"
 164                 "%sIOWeight=%" PRIu64 "\n"
 165                 "%sStartupIOWeight=%" PRIu64 "\n"
 166                 "%sBlockIOWeight=%" PRIu64 "\n"
 167                 "%sStartupBlockIOWeight=%" PRIu64 "\n"
 168                 "%sMemoryLow=%" PRIu64 "\n"
 169                 "%sMemoryHigh=%" PRIu64 "\n"
 170                 "%sMemoryMax=%" PRIu64 "\n"
 171                 "%sMemoryLimit=%" PRIu64 "\n"
 172                 "%sTasksMax=%" PRIu64 "\n"
 173                 "%sDevicePolicy=%s\n"
 174                 "%sDelegate=%s\n",
 175                 prefix, yes_no(c->cpu_accounting),
 176                 prefix, yes_no(c->io_accounting),
 177                 prefix, yes_no(c->blockio_accounting),
 178                 prefix, yes_no(c->memory_accounting),
 179                 prefix, yes_no(c->tasks_accounting),
 180                 prefix, c->cpu_shares,
 181                 prefix, c->startup_cpu_shares,
 182                 prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
 183                 prefix, c->io_weight,
 184                 prefix, c->startup_io_weight,
 185                 prefix, c->blockio_weight,
 186                 prefix, c->startup_blockio_weight,
 187                 prefix, c->memory_low,
 188                 prefix, c->memory_high,
 189                 prefix, c->memory_max,
 190                 prefix, c->memory_limit,
 191                 prefix, c->tasks_max,
 192                 prefix, cgroup_device_policy_to_string(c->device_policy),
 193                 prefix, yes_no(c->delegate));
 194
 195         LIST_FOREACH(device_allow, a, c->device_allow)
 196                 fprintf(f,
 197                         "%sDeviceAllow=%s %s%s%s\n",
 198                         prefix,
 199                         a->path,
 200                         a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
 201
 202         LIST_FOREACH(device_weights, iw, c->io_device_weights)
 203                 fprintf(f,
 204                         "%sIODeviceWeight=%s %" PRIu64,
 205                         prefix,
 206                         iw->path,
 207                         iw->weight);
 208
 209         LIST_FOREACH(device_limits, il, c->io_device_limits) {
 210                 char buf[FORMAT_BYTES_MAX];
 211                 CGroupIOLimitType type;
 212
 213                 for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
 214                         if (il->limits[type] != cgroup_io_limit_defaults[type])
 215                                 fprintf(f,
 216                                         "%s%s=%s %s\n",
 217                                         prefix,
 218                                         cgroup_io_limit_type_to_string(type),
 219                                         il->path,
 220                                         format_bytes(buf, sizeof(buf), il->limits[type]));
 221         }
 222
 223         LIST_FOREACH(device_weights, w, c->blockio_device_weights)
 224                 fprintf(f,
 225                         "%sBlockIODeviceWeight=%s %" PRIu64,
 226                         prefix,
 227                         w->path,
 228                         w->weight);
 229
 230         LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
 231                 char buf[FORMAT_BYTES_MAX];
 232
 233                 if (b->rbps != CGROUP_LIMIT_MAX)
 234                         fprintf(f,
 235                                 "%sBlockIOReadBandwidth=%s %s\n",
 236                                 prefix,
 237                                 b->path,
 238                                 format_bytes(buf, sizeof(buf), b->rbps));
 239                 if (b->wbps != CGROUP_LIMIT_MAX)
 240                         fprintf(f,
 241                                 "%sBlockIOWriteBandwidth=%s %s\n",
 242                                 prefix,
 243                                 b->path,
 244                                 format_bytes(buf, sizeof(buf), b->wbps));
 245         }
 246 }
 247
 248 static int lookup_block_device(const char *p, dev_t *dev) {
 249         struct stat st;
 250         int r;
 251
 252         assert(p);
 253         assert(dev);
 254
 255         r = stat(p, &st);
 256         if (r < 0)
 257                 return log_warning_errno(errno, "Couldn't stat device %s: %m", p);
 258
 259         if (S_ISBLK(st.st_mode))
 260                 *dev = st.st_rdev;
 261         else if (major(st.st_dev) != 0) {
 262                 /* If this is not a device node then find the block
 263                  * device this file is stored on */
 264                 *dev = st.st_dev;
 265
 266                 /* If this is a partition, try to get the originating
 267                  * block device */
 268                 block_get_whole_disk(*dev, dev);
 269         } else {
 270                 log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
 271                 return -ENODEV;
 272         }
 273
 274         return 0;
 275 }
 276
 277 static int whitelist_device(const char *path, const char *node, const char *acc) {
 278         char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
 279         struct stat st;
 280         int r;
 281
 282         assert(path);
 283         assert(acc);
 284
 285         if (stat(node, &st) < 0) {
 286                 log_warning("Couldn't stat device %s", node);
 287                 return -errno;
 288         }
 289
 290         if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
 291                 log_warning("%s is not a device.", node);
 292                 return -ENODEV;
 293         }
 294
 295         sprintf(buf,
 296                 "%c %u:%u %s",
 297                 S_ISCHR(st.st_mode) ? 'c' : 'b',
 298                 major(st.st_rdev), minor(st.st_rdev),
 299                 acc);
 300
 301         r = cg_set_attribute("devices", path, "devices.allow", buf);
 302         if (r < 0)
 303                 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 304                                "Failed to set devices.allow on %s: %m", path);
 305
 306         return r;
 307 }
 308
 309 static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
 310         _cleanup_fclose_ FILE *f = NULL;
 311         char line[LINE_MAX];
 312         bool good = false;
 313         int r;
 314
 315         assert(path);
 316         assert(acc);
 317         assert(type == 'b' || type == 'c');
 318
 319         f = fopen("/proc/devices", "re");
 320         if (!f)
 321                 return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);
 322
 323         FOREACH_LINE(line, f, goto fail) {
 324                 char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
 325                 unsigned maj;
 326
 327                 truncate_nl(line);
 328
 329                 if (type == 'c' && streq(line, "Character devices:")) {
 330                         good = true;
 331                         continue;
 332                 }
 333
 334                 if (type == 'b' && streq(line, "Block devices:")) {
 335                         good = true;
 336                         continue;
 337                 }
 338
 339                 if (isempty(line)) {
 340                         good = false;
 341                         continue;
 342                 }
 343
 344                 if (!good)
 345                         continue;
 346
 347                 p = strstrip(line);
 348
 349                 w = strpbrk(p, WHITESPACE);
 350                 if (!w)
 351                         continue;
 352                 *w = 0;
 353
 354                 r = safe_atou(p, &maj);
 355                 if (r < 0)
 356                         continue;
 357                 if (maj <= 0)
 358                         continue;
 359
 360                 w++;
 361                 w += strspn(w, WHITESPACE);
 362
 363                 if (fnmatch(name, w, 0) != 0)
 364                         continue;
 365
 366                 sprintf(buf,
 367                         "%c %u:* %s",
 368                         type,
 369                         maj,
 370                         acc);
 371
 372                 r = cg_set_attribute("devices", path, "devices.allow", buf);
 373                 if (r < 0)
 374                         log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 375                                        "Failed to set devices.allow on %s: %m", path);
 376         }
 377
 378         return 0;
 379
 380 fail:
 381         log_warning_errno(errno, "Failed to read /proc/devices: %m");
 382         return -errno;
 383 }
 384
 385 static bool cgroup_context_has_io_config(CGroupContext *c) {
 386         return c->io_accounting ||
 387                 c->io_weight != CGROUP_WEIGHT_INVALID ||
 388                 c->startup_io_weight != CGROUP_WEIGHT_INVALID ||
 389                 c->io_device_weights ||
 390                 c->io_device_limits;
 391 }
 392
 393 static bool cgroup_context_has_blockio_config(CGroupContext *c) {
 394         return c->blockio_accounting ||
 395                 c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
 396                 c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
 397                 c->blockio_device_weights ||
 398                 c->blockio_device_bandwidths;
 399 }
 400
 401 static uint64_t cgroup_context_io_weight(CGroupContext *c, ManagerState state) {
 402         if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
 403             c->startup_io_weight != CGROUP_WEIGHT_INVALID)
 404                 return c->startup_io_weight;
 405         else if (c->io_weight != CGROUP_WEIGHT_INVALID)
 406                 return c->io_weight;
 407         else
 408                 return CGROUP_WEIGHT_DEFAULT;
 409 }
 410
 411 static uint64_t cgroup_context_blkio_weight(CGroupContext *c, ManagerState state) {
 412         if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
 413             c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
 414                 return c->startup_blockio_weight;
 415         else if (c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
 416                 return c->blockio_weight;
 417         else
 418                 return CGROUP_BLKIO_WEIGHT_DEFAULT;
 419 }
 420
 421 static uint64_t cgroup_weight_blkio_to_io(uint64_t blkio_weight) {
 422         return CLAMP(blkio_weight * CGROUP_WEIGHT_DEFAULT / CGROUP_BLKIO_WEIGHT_DEFAULT,
 423                      CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
 424 }
 425
 426 static uint64_t cgroup_weight_io_to_blkio(uint64_t io_weight) {
 427         return CLAMP(io_weight * CGROUP_BLKIO_WEIGHT_DEFAULT / CGROUP_WEIGHT_DEFAULT,
 428                      CGROUP_BLKIO_WEIGHT_MIN, CGROUP_BLKIO_WEIGHT_MAX);
 429 }
 430
 431 static void cgroup_apply_io_device_weight(Unit *u, const char *dev_path, uint64_t io_weight) {
 432         char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
 433         dev_t dev;
 434         int r;
 435
 436         r = lookup_block_device(dev_path, &dev);
 437         if (r < 0)
 438                 return;
 439
 440         xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), io_weight);
 441         r = cg_set_attribute("io", u->cgroup_path, "io.weight", buf);
 442         if (r < 0)
 443                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 444                               "Failed to set io.weight: %m");
 445 }
 446
 447 static void cgroup_apply_blkio_device_weight(Unit *u, const char *dev_path, uint64_t blkio_weight) {
 448         char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
 449         dev_t dev;
 450         int r;
 451
 452         r = lookup_block_device(dev_path, &dev);
 453         if (r < 0)
 454                 return;
 455
 456         xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), blkio_weight);
 457         r = cg_set_attribute("blkio", u->cgroup_path, "blkio.weight_device", buf);
 458         if (r < 0)
 459                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 460                               "Failed to set blkio.weight_device: %m");
 461 }
 462
 463 static unsigned cgroup_apply_io_device_limit(Unit *u, const char *dev_path, uint64_t *limits) {
 464         char limit_bufs[_CGROUP_IO_LIMIT_TYPE_MAX][DECIMAL_STR_MAX(uint64_t)];
 465         char buf[DECIMAL_STR_MAX(dev_t)*2+2+(6+DECIMAL_STR_MAX(uint64_t)+1)*4];
 466         CGroupIOLimitType type;
 467         dev_t dev;
 468         unsigned n = 0;
 469         int r;
 470
 471         r = lookup_block_device(dev_path, &dev);
 472         if (r < 0)
 473                 return 0;
 474
 475         for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++) {
 476                 if (limits[type] != cgroup_io_limit_defaults[type]) {
 477                         xsprintf(limit_bufs[type], "%" PRIu64, limits[type]);
 478                         n++;
 479                 } else {
 480                         xsprintf(limit_bufs[type], "%s", limits[type] == CGROUP_LIMIT_MAX ? "max" : "0");
 481                 }
 482         }
 483
 484         xsprintf(buf, "%u:%u rbps=%s wbps=%s riops=%s wiops=%s\n", major(dev), minor(dev),
 485                  limit_bufs[CGROUP_IO_RBPS_MAX], limit_bufs[CGROUP_IO_WBPS_MAX],
 486                  limit_bufs[CGROUP_IO_RIOPS_MAX], limit_bufs[CGROUP_IO_WIOPS_MAX]);
 487         r = cg_set_attribute("io", u->cgroup_path, "io.max", buf);
 488         if (r < 0)
 489                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 490                               "Failed to set io.max: %m");
 491         return n;
 492 }
 493
 494 static unsigned cgroup_apply_blkio_device_limit(Unit *u, const char *dev_path, uint64_t rbps, uint64_t wbps) {
 495         char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
 496         dev_t dev;
 497         unsigned n = 0;
 498         int r;
 499
 500         r = lookup_block_device(dev_path, &dev);
 501         if (r < 0)
 502                 return 0;
 503
 504         if (rbps != CGROUP_LIMIT_MAX)
 505                 n++;
 506         sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), rbps);
 507         r = cg_set_attribute("blkio", u->cgroup_path, "blkio.throttle.read_bps_device", buf);
 508         if (r < 0)
 509                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 510                               "Failed to set blkio.throttle.read_bps_device: %m");
 511
 512         if (wbps != CGROUP_LIMIT_MAX)
 513                 n++;
 514         sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), wbps);
 515         r = cg_set_attribute("blkio", u->cgroup_path, "blkio.throttle.write_bps_device", buf);
 516         if (r < 0)
 517                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 518                               "Failed to set blkio.throttle.write_bps_device: %m");
 519
 520         return n;
 521 }
 522
 523 static bool cgroup_context_has_unified_memory_config(CGroupContext *c) {
 524         return c->memory_low > 0 || c->memory_high != CGROUP_LIMIT_MAX || c->memory_max != CGROUP_LIMIT_MAX;
 525 }
 526
 527 static void cgroup_apply_unified_memory_limit(Unit *u, const char *file, uint64_t v) {
 528         char buf[DECIMAL_STR_MAX(uint64_t) + 1] = "max";
 529         int r;
 530
 531         if (v != CGROUP_LIMIT_MAX)
 532                 xsprintf(buf, "%" PRIu64 "\n", v);
 533
 534         r = cg_set_attribute("memory", u->cgroup_path, file, buf);
 535         if (r < 0)
 536                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 537                               "Failed to set %s: %m", file);
 538 }
 539
 540 static void cgroup_context_apply(Unit *u, CGroupMask mask, ManagerState state) {
 541         const char *path;
 542         CGroupContext *c;
 543         bool is_root;
 544         int r;
 545
 546         assert(u);
 547
 548         c = unit_get_cgroup_context(u);
 549         path = u->cgroup_path;
 550
 551         assert(c);
 552         assert(path);
 553
 554         if (mask == 0)
 555                 return;
 556
 557         /* Some cgroup attributes are not supported on the root cgroup,
 558          * hence silently ignore */
 559         is_root = isempty(path) || path_equal(path, "/");
 560         if (is_root)
 561                 /* Make sure we don't try to display messages with an empty path. */
 562                 path = "/";
 563
 564         /* We generally ignore errors caused by read-only mounted
 565          * cgroup trees (assuming we are running in a container then),
 566          * and missing cgroups, i.e. EROFS and ENOENT. */
 567
 568         if ((mask & CGROUP_MASK_CPU) && !is_root) {
 569                 char buf[MAX(DECIMAL_STR_MAX(uint64_t), DECIMAL_STR_MAX(usec_t)) + 1];
 570
 571                 sprintf(buf, "%" PRIu64 "\n",
 572                         IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID ? c->startup_cpu_shares :
 573                         c->cpu_shares != CGROUP_CPU_SHARES_INVALID ? c->cpu_shares : CGROUP_CPU_SHARES_DEFAULT);
 574                 r = cg_set_attribute("cpu", path, "cpu.shares", buf);
 575                 if (r < 0)
 576                         log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 577                                       "Failed to set cpu.shares: %m");
 578
 579                 sprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
 580                 r = cg_set_attribute("cpu", path, "cpu.cfs_period_us", buf);
 581                 if (r < 0)
 582                         log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 583                                       "Failed to set cpu.cfs_period_us: %m");
 584
 585                 if (c->cpu_quota_per_sec_usec != USEC_INFINITY) {
 586                         sprintf(buf, USEC_FMT "\n", c->cpu_quota_per_sec_usec * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
 587                         r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", buf);
 588                 } else
 589                         r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", "-1");
 590                 if (r < 0)
 591                         log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 592                                       "Failed to set cpu.cfs_quota_us: %m");
 593         }
 594
 595         if (mask & CGROUP_MASK_IO) {
 596                 bool has_io = cgroup_context_has_io_config(c);
 597                 bool has_blockio = cgroup_context_has_blockio_config(c);
 598
 599                 if (!is_root) {
 600                         char buf[8+DECIMAL_STR_MAX(uint64_t)+1];
 601                         uint64_t weight;
 602
 603                         if (has_io)
 604                                 weight = cgroup_context_io_weight(c, state);
 605                         else if (has_blockio) {
 606                                 uint64_t blkio_weight = cgroup_context_blkio_weight(c, state);
 607
 608                                 weight = cgroup_weight_blkio_to_io(blkio_weight);
 609
 610                                 log_cgroup_compat(u, "Applying [Startup]BlockIOWeight %" PRIu64 " as [Startup]IOWeight %" PRIu64,
 611                                                   blkio_weight, weight);
 612                         } else
 613                                 weight = CGROUP_WEIGHT_DEFAULT;
 614
 615                         xsprintf(buf, "default %" PRIu64 "\n", weight);
 616                         r = cg_set_attribute("io", path, "io.weight", buf);
 617                         if (r < 0)
 618                                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 619                                               "Failed to set io.weight: %m");
 620
 621                         if (has_io) {
 622                                 CGroupIODeviceWeight *w;
 623
 624                                 /* FIXME: no way to reset this list */
 625                                 LIST_FOREACH(device_weights, w, c->io_device_weights)
 626                                         cgroup_apply_io_device_weight(u, w->path, w->weight);
 627                         } else if (has_blockio) {
 628                                 CGroupBlockIODeviceWeight *w;
 629
 630                                 /* FIXME: no way to reset this list */
 631                                 LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
 632                                         weight = cgroup_weight_blkio_to_io(w->weight);
 633
 634                                         log_cgroup_compat(u, "Applying BlockIODeviceWeight %" PRIu64 " as IODeviceWeight %" PRIu64 " for %s",
 635                                                           w->weight, weight, w->path);
 636
 637                                         cgroup_apply_io_device_weight(u, w->path, weight);
 638                                 }
 639                         }
 640                 }
 641
 642                 /* Apply limits and free ones without config. */
 643                 if (has_io) {
 644                         CGroupIODeviceLimit *l, *next;
 645
 646                         LIST_FOREACH_SAFE(device_limits, l, next, c->io_device_limits) {
 647                                 if (!cgroup_apply_io_device_limit(u, l->path, l->limits))
 648                                         cgroup_context_free_io_device_limit(c, l);
 649                         }
 650                 } else if (has_blockio) {
 651                         CGroupBlockIODeviceBandwidth *b, *next;
 652
 653                         LIST_FOREACH_SAFE(device_bandwidths, b, next, c->blockio_device_bandwidths) {
 654                                 uint64_t limits[_CGROUP_IO_LIMIT_TYPE_MAX];
 655                                 CGroupIOLimitType type;
 656
 657                                 for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
 658                                         limits[type] = cgroup_io_limit_defaults[type];
 659
 660                                 limits[CGROUP_IO_RBPS_MAX] = b->rbps;
 661                                 limits[CGROUP_IO_WBPS_MAX] = b->wbps;
 662
 663                                 log_cgroup_compat(u, "Applying BlockIO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as IO{Read|Write}BandwidthMax for %s",
 664                                                   b->rbps, b->wbps, b->path);
 665
 666                                 if (!cgroup_apply_io_device_limit(u, b->path, limits))
 667                                         cgroup_context_free_blockio_device_bandwidth(c, b);
 668                         }
 669                 }
 670         }
 671
 672         if (mask & CGROUP_MASK_BLKIO) {
 673                 bool has_io = cgroup_context_has_io_config(c);
 674                 bool has_blockio = cgroup_context_has_blockio_config(c);
 675
 676                 if (!is_root) {
 677                         char buf[DECIMAL_STR_MAX(uint64_t)+1];
 678                         uint64_t weight;
 679
 680                         if (has_blockio)
 681                                 weight = cgroup_context_blkio_weight(c, state);
 682                         else if (has_io) {
 683                                 uint64_t io_weight = cgroup_context_io_weight(c, state);
 684
 685                                 weight = cgroup_weight_io_to_blkio(cgroup_context_io_weight(c, state));
 686
 687                                 log_cgroup_compat(u, "Applying [Startup]IOWeight %" PRIu64 " as [Startup]BlockIOWeight %" PRIu64,
 688                                                   io_weight, weight);
 689                         } else
 690                                 weight = CGROUP_BLKIO_WEIGHT_DEFAULT;
 691
 692                         xsprintf(buf, "%" PRIu64 "\n", weight);
 693                         r = cg_set_attribute("blkio", path, "blkio.weight", buf);
 694                         if (r < 0)
 695                                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 696                                               "Failed to set blkio.weight: %m");
 697
 698                         if (has_blockio) {
 699                                 CGroupBlockIODeviceWeight *w;
 700
 701                                 /* FIXME: no way to reset this list */
 702                                 LIST_FOREACH(device_weights, w, c->blockio_device_weights)
 703                                         cgroup_apply_blkio_device_weight(u, w->path, w->weight);
 704                         } else if (has_io) {
 705                                 CGroupIODeviceWeight *w;
 706
 707                                 /* FIXME: no way to reset this list */
 708                                 LIST_FOREACH(device_weights, w, c->io_device_weights) {
 709                                         weight = cgroup_weight_io_to_blkio(w->weight);
 710
 711                                         log_cgroup_compat(u, "Applying IODeviceWeight %" PRIu64 " as BlockIODeviceWeight %" PRIu64 " for %s",
 712                                                           w->weight, weight, w->path);
 713
 714                                         cgroup_apply_blkio_device_weight(u, w->path, weight);
 715                                 }
 716                         }
 717                 }
 718
 719                 /* Apply limits and free ones without config. */
 720                 if (has_blockio) {
 721                         CGroupBlockIODeviceBandwidth *b, *next;
 722
 723                         LIST_FOREACH_SAFE(device_bandwidths, b, next, c->blockio_device_bandwidths) {
 724                                 if (!cgroup_apply_blkio_device_limit(u, b->path, b->rbps, b->wbps))
 725                                         cgroup_context_free_blockio_device_bandwidth(c, b);
 726                         }
 727                 } else if (has_io) {
 728                         CGroupIODeviceLimit *l, *next;
 729
 730                         LIST_FOREACH_SAFE(device_limits, l, next, c->io_device_limits) {
 731                                 log_cgroup_compat(u, "Applying IO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as BlockIO{Read|Write}BandwidthMax for %s",
 732                                                   l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX], l->path);
 733
 734                                 if (!cgroup_apply_blkio_device_limit(u, l->path, l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX]))
 735                                         cgroup_context_free_io_device_limit(c, l);
 736                         }
 737                 }
 738         }
 739
 740         if ((mask & CGROUP_MASK_MEMORY) && !is_root) {
 741                 if (cg_unified() > 0) {
 742                         uint64_t max = c->memory_max;
 743
 744                         if (cgroup_context_has_unified_memory_config(c))
 745                                 max = c->memory_max;
 746                         else {
 747                                 max = c->memory_limit;
 748
 749                                 if (max != CGROUP_LIMIT_MAX)
 750                                         log_cgroup_compat(u, "Applying MemoryLimit %" PRIu64 " as MemoryMax", max);
 751                         }
 752
 753                         cgroup_apply_unified_memory_limit(u, "memory.low", c->memory_low);
 754                         cgroup_apply_unified_memory_limit(u, "memory.high", c->memory_high);
 755                         cgroup_apply_unified_memory_limit(u, "memory.max", max);
 756                 } else {
 757                         char buf[DECIMAL_STR_MAX(uint64_t) + 1];
 758                         uint64_t val = c->memory_limit;
 759
 760                         if (val == CGROUP_LIMIT_MAX) {
 761                                 val = c->memory_max;
 762
 763                                 if (val != CGROUP_LIMIT_MAX)
 764                                         log_cgroup_compat(u, "Applying MemoryMax %" PRIi64 " as MemoryLimit", c->memory_max);
 765                         }
 766
 767                         if (val == CGROUP_LIMIT_MAX)
 768                                 strncpy(buf, "-1\n", sizeof(buf));
 769                         else
 770                                 xsprintf(buf, "%" PRIu64 "\n", val);
 771
 772                         r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
 773                         if (r < 0)
 774                                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 775                                               "Failed to set memory.limit_in_bytes: %m");
 776                 }
 777         }
 778
 779         if ((mask & CGROUP_MASK_DEVICES) && !is_root) {
 780                 CGroupDeviceAllow *a;
 781
 782                 /* Changing the devices list of a populated cgroup
 783                  * might result in EINVAL, hence ignore EINVAL
 784                  * here. */
 785
 786                 if (c->device_allow || c->device_policy != CGROUP_AUTO)
 787                         r = cg_set_attribute("devices", path, "devices.deny", "a");
 788                 else
 789                         r = cg_set_attribute("devices", path, "devices.allow", "a");
 790                 if (r < 0)
 791                         log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 792                                       "Failed to reset devices.list: %m");
 793
 794                 if (c->device_policy == CGROUP_CLOSED ||
 795                     (c->device_policy == CGROUP_AUTO && c->device_allow)) {
 796                         static const char auto_devices[] =
 797                                 "/dev/null\0" "rwm\0"
 798                                 "/dev/zero\0" "rwm\0"
 799                                 "/dev/full\0" "rwm\0"
 800                                 "/dev/random\0" "rwm\0"
 801                                 "/dev/urandom\0" "rwm\0"
 802                                 "/dev/tty\0" "rwm\0"
 803                                 "/dev/pts/ptmx\0" "rw\0" /* /dev/pts/ptmx may not be duplicated, but accessed */
 804                                 /* Allow /run/systemd/inaccessible/{chr,blk} devices for mapping InaccessiblePaths */
 805                                 "/run/systemd/inaccessible/chr\0" "rwm\0"
 806                                 "/run/systemd/inaccessible/blk\0" "rwm\0";
 807
 808                         const char *x, *y;
 809
 810                         NULSTR_FOREACH_PAIR(x, y, auto_devices)
 811                                 whitelist_device(path, x, y);
 812
 813                         whitelist_major(path, "pts", 'c', "rw");
 814                         whitelist_major(path, "kdbus", 'c', "rw");
 815                         whitelist_major(path, "kdbus/*", 'c', "rw");
 816                 }
 817
 818                 LIST_FOREACH(device_allow, a, c->device_allow) {
 819                         char acc[4];
 820                         unsigned k = 0;
 821
 822                         if (a->r)
 823                                 acc[k++] = 'r';
 824                         if (a->w)
 825                                 acc[k++] = 'w';
 826                         if (a->m)
 827                                 acc[k++] = 'm';
 828
 829                         if (k == 0)
 830                                 continue;
 831
 832                         acc[k++] = 0;
 833
 834                         if (startswith(a->path, "/dev/"))
 835                                 whitelist_device(path, a->path, acc);
 836                         else if (startswith(a->path, "block-"))
 837                                 whitelist_major(path, a->path + 6, 'b', acc);
 838                         else if (startswith(a->path, "char-"))
 839                                 whitelist_major(path, a->path + 5, 'c', acc);
 840                         else
 841                                 log_unit_debug(u, "Ignoring device %s while writing cgroup attribute.", a->path);
 842                 }
 843         }
 844
 845         if ((mask & CGROUP_MASK_PIDS) && !is_root) {
 846
 847                 if (c->tasks_max != (uint64_t) -1) {
 848                         char buf[DECIMAL_STR_MAX(uint64_t) + 2];
 849
 850                         sprintf(buf, "%" PRIu64 "\n", c->tasks_max);
 851                         r = cg_set_attribute("pids", path, "pids.max", buf);
 852                 } else
 853                         r = cg_set_attribute("pids", path, "pids.max", "max");
 854
 855                 if (r < 0)
 856                         log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 857                                       "Failed to set pids.max: %m");
 858         }
 859 }
 860
 861 CGroupMask cgroup_context_get_mask(CGroupContext *c) {
 862         CGroupMask mask = 0;
 863
 864         /* Figure out which controllers we need */
 865
 866         if (c->cpu_accounting ||
 867             c->cpu_shares != CGROUP_CPU_SHARES_INVALID ||
 868             c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID ||
 869             c->cpu_quota_per_sec_usec != USEC_INFINITY)
 870                 mask |= CGROUP_MASK_CPUACCT | CGROUP_MASK_CPU;
 871
 872         if (cgroup_context_has_io_config(c) || cgroup_context_has_blockio_config(c))
 873                 mask |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
 874
 875         if (c->memory_accounting ||
 876             c->memory_limit != CGROUP_LIMIT_MAX ||
 877             cgroup_context_has_unified_memory_config(c))
 878                 mask |= CGROUP_MASK_MEMORY;
 879
 880         if (c->device_allow ||
 881             c->device_policy != CGROUP_AUTO)
 882                 mask |= CGROUP_MASK_DEVICES;
 883
 884         if (c->tasks_accounting ||
 885             c->tasks_max != (uint64_t) -1)
 886                 mask |= CGROUP_MASK_PIDS;
 887
 888         return mask;
 889 }
 890
 891 CGroupMask unit_get_own_mask(Unit *u) {
 892         CGroupContext *c;
 893
 894         /* Returns the mask of controllers the unit needs for itself */
 895
 896         c = unit_get_cgroup_context(u);
 897         if (!c)
 898                 return 0;
 899
 900         /* If delegation is turned on, then turn on all cgroups,
 901          * unless we are on the legacy hierarchy and the process we
 902          * fork into it is known to drop privileges, and hence
 903          * shouldn't get access to the controllers.
 904          *
 905          * Note that on the unified hierarchy it is safe to delegate
 906          * controllers to unprivileged services. */
 907
 908         if (c->delegate) {
 909                 ExecContext *e;
 910
 911                 e = unit_get_exec_context(u);
 912                 if (!e ||
 913                     exec_context_maintains_privileges(e) ||
 914                     cg_unified() > 0)
 915                         return _CGROUP_MASK_ALL;
 916         }
 917
 918         return cgroup_context_get_mask(c);
 919 }
 920
 921 CGroupMask unit_get_members_mask(Unit *u) {
 922         assert(u);
 923
 924         /* Returns the mask of controllers all of the unit's children
 925          * require, merged */
 926
 927         if (u->cgroup_members_mask_valid)
 928                 return u->cgroup_members_mask;
 929
 930         u->cgroup_members_mask = 0;
 931
 932         if (u->type == UNIT_SLICE) {
 933                 Unit *member;
 934                 Iterator i;
 935
 936                 SET_FOREACH(member, u->dependencies[UNIT_BEFORE], i) {
 937
 938                         if (member == u)
 939                                 continue;
 940
 941                         if (UNIT_DEREF(member->slice) != u)
 942                                 continue;
 943
 944                         u->cgroup_members_mask |=
 945                                 unit_get_own_mask(member) |
 946                                 unit_get_members_mask(member);
 947                 }
 948         }
 949
 950         u->cgroup_members_mask_valid = true;
 951         return u->cgroup_members_mask;
 952 }
 953
 954 CGroupMask unit_get_siblings_mask(Unit *u) {
 955         assert(u);
 956
 957         /* Returns the mask of controllers all of the unit's siblings
 958          * require, i.e. the members mask of the unit's parent slice
 959          * if there is one. */
 960
 961         if (UNIT_ISSET(u->slice))
 962                 return unit_get_members_mask(UNIT_DEREF(u->slice));
 963
 964         return unit_get_own_mask(u) | unit_get_members_mask(u);
 965 }
 966
 967 CGroupMask unit_get_subtree_mask(Unit *u) {
 968
 969         /* Returns the mask of this subtree, meaning of the group
 970          * itself and its children. */
 971
 972         return unit_get_own_mask(u) | unit_get_members_mask(u);
 973 }
 974
 975 CGroupMask unit_get_target_mask(Unit *u) {
 976         CGroupMask mask;
 977
 978         /* This returns the cgroup mask of all controllers to enable
 979          * for a specific cgroup, i.e. everything it needs itself,
 980          * plus all that its children need, plus all that its siblings
 981          * need. This is primarily useful on the legacy cgroup
 982          * hierarchy, where we need to duplicate each cgroup in each
 983          * hierarchy that shall be enabled for it. */
 984
 985         mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
 986         mask &= u->manager->cgroup_supported;
 987
 988         return mask;
 989 }
 990
 991 CGroupMask unit_get_enable_mask(Unit *u) {
 992         CGroupMask mask;
 993
 994         /* This returns the cgroup mask of all controllers to enable
 995          * for the children of a specific cgroup. This is primarily
 996          * useful for the unified cgroup hierarchy, where each cgroup
 997          * controls which controllers are enabled for its children. */
 998
 999         mask = unit_get_members_mask(u);
1000         mask &= u->manager->cgroup_supported;
1001
1002         return mask;
1003 }
1004
1005 /* Recurse from a unit up through its containing slices, propagating
1006  * mask bits upward. A unit is also member of itself. */
1007 void unit_update_cgroup_members_masks(Unit *u) {
1008         CGroupMask m;
1009         bool more;
1010
1011         assert(u);
1012
1013         /* Calculate subtree mask */
1014         m = unit_get_subtree_mask(u);
1015
1016         /* See if anything changed from the previous invocation. If
1017          * not, we're done. */
1018         if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
1019                 return;
1020
1021         more =
1022                 u->cgroup_subtree_mask_valid &&
1023                 ((m & ~u->cgroup_subtree_mask) != 0) &&
1024                 ((~m & u->cgroup_subtree_mask) == 0);
1025
1026         u->cgroup_subtree_mask = m;
1027         u->cgroup_subtree_mask_valid = true;
1028
1029         if (UNIT_ISSET(u->slice)) {
1030                 Unit *s = UNIT_DEREF(u->slice);
1031
1032                 if (more)
1033                         /* There's more set now than before. We
1034                          * propagate the new mask to the parent's mask
1035                          * (not caring if it actually was valid or
1036                          * not). */
1037
1038                         s->cgroup_members_mask |= m;
1039
1040                 else
1041                         /* There's less set now than before (or we
1042                          * don't know), we need to recalculate
1043                          * everything, so let's invalidate the
1044                          * parent's members mask */
1045
1046                         s->cgroup_members_mask_valid = false;
1047
1048                 /* And now make sure that this change also hits our
1049                  * grandparents */
1050                 unit_update_cgroup_members_masks(s);
1051         }
1052 }
1053
1054 static const char *migrate_callback(CGroupMask mask, void *userdata) {
1055         Unit *u = userdata;
1056
1057         assert(mask != 0);
1058         assert(u);
1059
1060         while (u) {
1061                 if (u->cgroup_path &&
1062                     u->cgroup_realized &&
1063                     (u->cgroup_realized_mask & mask) == mask)
1064                         return u->cgroup_path;
1065
1066                 u = UNIT_DEREF(u->slice);
1067         }
1068
1069         return NULL;
1070 }
1071
1072 char *unit_default_cgroup_path(Unit *u) {
1073         _cleanup_free_ char *escaped = NULL, *slice = NULL;
1074         int r;
1075
1076         assert(u);
1077
1078         if (unit_has_name(u, SPECIAL_ROOT_SLICE))
1079                 return strdup(u->manager->cgroup_root);
1080
1081         if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) {
1082                 r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice);
1083                 if (r < 0)
1084                         return NULL;
1085         }
1086
1087         escaped = cg_escape(u->id);
1088         if (!escaped)
1089                 return NULL;
1090
1091         if (slice)
1092                 return strjoin(u->manager->cgroup_root, "/", slice, "/", escaped, NULL);
1093         else
1094                 return strjoin(u->manager->cgroup_root, "/", escaped, NULL);
1095 }
1096
1097 int unit_set_cgroup_path(Unit *u, const char *path) {
1098         _cleanup_free_ char *p = NULL;
1099         int r;
1100
1101         assert(u);
1102
1103         if (path) {
1104                 p = strdup(path);
1105                 if (!p)
1106                         return -ENOMEM;
1107         } else
1108                 p = NULL;
1109
1110         if (streq_ptr(u->cgroup_path, p))
1111                 return 0;
1112
1113         if (p) {
1114                 r = hashmap_put(u->manager->cgroup_unit, p, u);
1115                 if (r < 0)
1116                         return r;
1117         }
1118
1119         unit_release_cgroup(u);
1120
1121         u->cgroup_path = p;
1122         p = NULL;
1123
1124         return 1;
1125 }
1126
1127 int unit_watch_cgroup(Unit *u) {
1128         _cleanup_free_ char *events = NULL;
1129         int r;
1130
1131         assert(u);
1132
1133         if (!u->cgroup_path)
1134                 return 0;
1135
1136         if (u->cgroup_inotify_wd >= 0)
1137                 return 0;
1138
1139         /* Only applies to the unified hierarchy */
1140         r = cg_unified();
1141         if (r < 0)
1142                 return log_unit_error_errno(u, r, "Failed detect whether the unified hierarchy is used: %m");
1143         if (r == 0)
1144                 return 0;
1145
1146         /* Don't watch the root slice, it's pointless. */
1147         if (unit_has_name(u, SPECIAL_ROOT_SLICE))
1148                 return 0;
1149
1150         r = hashmap_ensure_allocated(&u->manager->cgroup_inotify_wd_unit, &trivial_hash_ops);
1151         if (r < 0)
1152                 return log_oom();
1153
1154         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events", &events);
1155         if (r < 0)
1156                 return log_oom();
1157
1158         u->cgroup_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY);
1159         if (u->cgroup_inotify_wd < 0) {
1160
1161                 if (errno == ENOENT) /* If the directory is already
1162                                       * gone we don't need to track
1163                                       * it, so this is not an error */
1164                         return 0;
1165
1166                 return log_unit_error_errno(u, errno, "Failed to add inotify watch descriptor for control group %s: %m", u->cgroup_path);
1167         }
1168
1169         r = hashmap_put(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd), u);
1170         if (r < 0)
1171                 return log_unit_error_errno(u, r, "Failed to add inotify watch descriptor to hash map: %m");
1172
1173         return 0;
1174 }
1175
1176 static int unit_create_cgroup(
1177                 Unit *u,
1178                 CGroupMask target_mask,
1179                 CGroupMask enable_mask) {
1180
1181         CGroupContext *c;
1182         int r;
1183
1184         assert(u);
1185
1186         c = unit_get_cgroup_context(u);
1187         if (!c)
1188                 return 0;
1189
1190         if (!u->cgroup_path) {
1191                 _cleanup_free_ char *path = NULL;
1192
1193                 path = unit_default_cgroup_path(u);
1194                 if (!path)
1195                         return log_oom();
1196
1197                 r = unit_set_cgroup_path(u, path);
1198                 if (r == -EEXIST)
1199                         return log_unit_error_errno(u, r, "Control group %s exists already.", path);
1200                 if (r < 0)
1201                         return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", path);
1202         }
1203
1204         /* First, create our own group */
1205         r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
1206         if (r < 0)
1207                 return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", u->cgroup_path);
1208
1209         /* Start watching it */
1210         (void) unit_watch_cgroup(u);
1211
1212         /* Enable all controllers we need */
1213         r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path);
1214         if (r < 0)
1215                 log_unit_warning_errno(u, r, "Failed to enable controllers on cgroup %s, ignoring: %m", u->cgroup_path);
1216
1217         /* Keep track that this is now realized */
1218         u->cgroup_realized = true;
1219         u->cgroup_realized_mask = target_mask;
1220         u->cgroup_enabled_mask = enable_mask;
1221
1222         if (u->type != UNIT_SLICE && !c->delegate) {
1223
1224                 /* Then, possibly move things over, but not if
1225                  * subgroups may contain processes, which is the case
1226                  * for slice and delegation units. */
1227                 r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
1228                 if (r < 0)
1229                         log_unit_warning_errno(u, r, "Failed to migrate cgroup from to %s, ignoring: %m", u->cgroup_path);
1230         }
1231
1232         return 0;
1233 }
1234
1235 int unit_attach_pids_to_cgroup(Unit *u) {
1236         int r;
1237         assert(u);
1238
1239         r = unit_realize_cgroup(u);
1240         if (r < 0)
1241                 return r;
1242
1243         r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->pids, migrate_callback, u);
1244         if (r < 0)
1245                 return r;
1246
1247         return 0;
1248 }
1249
1250 static bool unit_has_mask_realized(Unit *u, CGroupMask target_mask, CGroupMask enable_mask) {
1251         assert(u);
1252
1253         return u->cgroup_realized && u->cgroup_realized_mask == target_mask && u->cgroup_enabled_mask == enable_mask;
1254 }
1255
1256 /* Check if necessary controllers and attributes for a unit are in place.
1257  *
1258  * If so, do nothing.
1259  * If not, create paths, move processes over, and set attributes.
1260  *
1261  * Returns 0 on success and < 0 on failure. */
1262 static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
1263         CGroupMask target_mask, enable_mask;
1264         int r;
1265
1266         assert(u);
1267
1268         if (u->in_cgroup_queue) {
1269                 LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
1270                 u->in_cgroup_queue = false;
1271         }
1272
1273         target_mask = unit_get_target_mask(u);
1274         enable_mask = unit_get_enable_mask(u);
1275
1276         if (unit_has_mask_realized(u, target_mask, enable_mask))
1277                 return 0;
1278
1279         /* First, realize parents */
1280         if (UNIT_ISSET(u->slice)) {
1281                 r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
1282                 if (r < 0)
1283                         return r;
1284         }
1285
1286         /* And then do the real work */
1287         r = unit_create_cgroup(u, target_mask, enable_mask);
1288         if (r < 0)
1289                 return r;
1290
1291         /* Finally, apply the necessary attributes. */
1292         cgroup_context_apply(u, target_mask, state);
1293
1294         return 0;
1295 }
1296
1297 static void unit_add_to_cgroup_queue(Unit *u) {
1298
1299         if (u->in_cgroup_queue)
1300                 return;
1301
1302         LIST_PREPEND(cgroup_queue, u->manager->cgroup_queue, u);
1303         u->in_cgroup_queue = true;
1304 }
1305
1306 unsigned manager_dispatch_cgroup_queue(Manager *m) {
1307         ManagerState state;
1308         unsigned n = 0;
1309         Unit *i;
1310         int r;
1311
1312         state = manager_state(m);
1313
1314         while ((i = m->cgroup_queue)) {
1315                 assert(i->in_cgroup_queue);
1316
1317                 r = unit_realize_cgroup_now(i, state);
1318                 if (r < 0)
1319                         log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);
1320
1321                 n++;
1322         }
1323
1324         return n;
1325 }
1326
1327 static void unit_queue_siblings(Unit *u) {
1328         Unit *slice;
1329
1330         /* This adds the siblings of the specified unit and the
1331          * siblings of all parent units to the cgroup queue. (But
1332          * neither the specified unit itself nor the parents.) */
1333
1334         while ((slice = UNIT_DEREF(u->slice))) {
1335                 Iterator i;
1336                 Unit *m;
1337
1338                 SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
1339                         if (m == u)
1340                                 continue;
1341
1342                         /* Skip units that have a dependency on the slice
1343                          * but aren't actually in it. */
1344                         if (UNIT_DEREF(m->slice) != slice)
1345                                 continue;
1346
1347                         /* No point in doing cgroup application for units
1348                          * without active processes. */
1349                         if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
1350                                 continue;
1351
1352                         /* If the unit doesn't need any new controllers
1353                          * and has current ones realized, it doesn't need
1354                          * any changes. */
1355                         if (unit_has_mask_realized(m, unit_get_target_mask(m), unit_get_enable_mask(m)))
1356                                 continue;
1357
1358                         unit_add_to_cgroup_queue(m);
1359                 }
1360
1361                 u = slice;
1362         }
1363 }
1364
1365 int unit_realize_cgroup(Unit *u) {
1366         assert(u);
1367
1368         if (!UNIT_HAS_CGROUP_CONTEXT(u))
1369                 return 0;
1370
1371         /* So, here's the deal: when realizing the cgroups for this
1372          * unit, we need to first create all parents, but there's more
1373          * actually: for the weight-based controllers we also need to
1374          * make sure that all our siblings (i.e. units that are in the
1375          * same slice as we are) have cgroups, too. Otherwise, things
1376          * would become very uneven as each of their processes would
1377          * get as much resources as all our group together. This call
1378          * will synchronously create the parent cgroups, but will
1379          * defer work on the siblings to the next event loop
1380          * iteration. */
1381
1382         /* Add all sibling slices to the cgroup queue. */
1383         unit_queue_siblings(u);
1384
1385         /* And realize this one now (and apply the values) */
1386         return unit_realize_cgroup_now(u, manager_state(u->manager));
1387 }
1388
1389 void unit_release_cgroup(Unit *u) {
1390         assert(u);
1391
1392         /* Forgets all cgroup details for this cgroup */
1393
1394         if (u->cgroup_path) {
1395                 (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
1396                 u->cgroup_path = mfree(u->cgroup_path);
1397         }
1398
1399         if (u->cgroup_inotify_wd >= 0) {
1400                 if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_inotify_wd) < 0)
1401                         log_unit_debug_errno(u, errno, "Failed to remove cgroup inotify watch %i for %s, ignoring", u->cgroup_inotify_wd, u->id);
1402
1403                 (void) hashmap_remove(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd));
1404                 u->cgroup_inotify_wd = -1;
1405         }
1406 }
1407
1408 void unit_prune_cgroup(Unit *u) {
1409         int r;
1410         bool is_root_slice;
1411
1412         assert(u);
1413
1414         /* Removes the cgroup, if empty and possible, and stops watching it. */
1415
1416         if (!u->cgroup_path)
1417                 return;
1418
1419         is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);
1420
1421         r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
1422         if (r < 0) {
1423                 log_unit_debug_errno(u, r, "Failed to destroy cgroup %s, ignoring: %m", u->cgroup_path);
1424                 return;
1425         }
1426
1427         if (is_root_slice)
1428                 return;
1429
1430         unit_release_cgroup(u);
1431
1432         u->cgroup_realized = false;
1433         u->cgroup_realized_mask = 0;
1434         u->cgroup_enabled_mask = 0;
1435 }
1436
1437 int unit_search_main_pid(Unit *u, pid_t *ret) {
1438         _cleanup_fclose_ FILE *f = NULL;
1439         pid_t pid = 0, npid, mypid;
1440         int r;
1441
1442         assert(u);
1443         assert(ret);
1444
1445         if (!u->cgroup_path)
1446                 return -ENXIO;
1447
1448         r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f);
1449         if (r < 0)
1450                 return r;
1451
1452         mypid = getpid();
1453         while (cg_read_pid(f, &npid) > 0)  {
1454                 pid_t ppid;
1455
1456                 if (npid == pid)
1457                         continue;
1458
1459                 /* Ignore processes that aren't our kids */
1460                 if (get_process_ppid(npid, &ppid) >= 0 && ppid != mypid)
1461                         continue;
1462
1463                 if (pid != 0)
1464                         /* Dang, there's more than one daemonized PID
1465                         in this group, so we don't know what process
1466                         is the main process. */
1467
1468                         return -ENODATA;
1469
1470                 pid = npid;
1471         }
1472
1473         *ret = pid;
1474         return 0;
1475 }
1476
1477 static int unit_watch_pids_in_path(Unit *u, const char *path) {
1478         _cleanup_closedir_ DIR *d = NULL;
1479         _cleanup_fclose_ FILE *f = NULL;
1480         int ret = 0, r;
1481
1482         assert(u);
1483         assert(path);
1484
1485         r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
1486         if (r < 0)
1487                 ret = r;
1488         else {
1489                 pid_t pid;
1490
1491                 while ((r = cg_read_pid(f, &pid)) > 0) {
1492                         r = unit_watch_pid(u, pid);
1493                         if (r < 0 && ret >= 0)
1494                                 ret = r;
1495                 }
1496
1497                 if (r < 0 && ret >= 0)
1498                         ret = r;
1499         }
1500
1501         r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
1502         if (r < 0) {
1503                 if (ret >= 0)
1504                         ret = r;
1505         } else {
1506                 char *fn;
1507
1508                 while ((r = cg_read_subgroup(d, &fn)) > 0) {
1509                         _cleanup_free_ char *p = NULL;
1510
1511                         p = strjoin(path, "/", fn, NULL);
1512                         free(fn);
1513
1514                         if (!p)
1515                                 return -ENOMEM;
1516
1517                         r = unit_watch_pids_in_path(u, p);
1518                         if (r < 0 && ret >= 0)
1519                                 ret = r;
1520                 }
1521
1522                 if (r < 0 && ret >= 0)
1523                         ret = r;
1524         }
1525
1526         return ret;
1527 }
1528
1529 int unit_watch_all_pids(Unit *u) {
1530         assert(u);
1531
1532         /* Adds all PIDs from our cgroup to the set of PIDs we
1533          * watch. This is a fallback logic for cases where we do not
1534          * get reliable cgroup empty notifications: we try to use
1535          * SIGCHLD as replacement. */
1536
1537         if (!u->cgroup_path)
1538                 return -ENOENT;
1539
1540         if (cg_unified() > 0) /* On unified we can use proper notifications */
1541                 return 0;
1542
1543         return unit_watch_pids_in_path(u, u->cgroup_path);
1544 }
1545
1546 int unit_notify_cgroup_empty(Unit *u) {
1547         int r;
1548
1549         assert(u);
1550
1551         if (!u->cgroup_path)
1552                 return 0;
1553
1554         r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
1555         if (r <= 0)
1556                 return r;
1557
1558         unit_add_to_gc_queue(u);
1559
1560         if (UNIT_VTABLE(u)->notify_cgroup_empty)
1561                 UNIT_VTABLE(u)->notify_cgroup_empty(u);
1562
1563         return 0;
1564 }
1565
1566 static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
1567         Manager *m = userdata;
1568
1569         assert(s);
1570         assert(fd >= 0);
1571         assert(m);
1572
1573         for (;;) {
1574                 union inotify_event_buffer buffer;
1575                 struct inotify_event *e;
1576                 ssize_t l;
1577
1578                 l = read(fd, &buffer, sizeof(buffer));
1579                 if (l < 0) {
1580                         if (errno == EINTR || errno == EAGAIN)
1581                                 return 0;
1582
1583                         return log_error_errno(errno, "Failed to read control group inotify events: %m");
1584                 }
1585
1586                 FOREACH_INOTIFY_EVENT(e, buffer, l) {
1587                         Unit *u;
1588
1589                         if (e->wd < 0)
1590                                 /* Queue overflow has no watch descriptor */
1591                                 continue;
1592
1593                         if (e->mask & IN_IGNORED)
1594                                 /* The watch was just removed */
1595                                 continue;
1596
1597                         u = hashmap_get(m->cgroup_inotify_wd_unit, INT_TO_PTR(e->wd));
1598                         if (!u) /* Not that inotify might deliver
1599                                  * events for a watch even after it
1600                                  * was removed, because it was queued
1601                                  * before the removal. Let's ignore
1602                                  * this here safely. */
1603                                 continue;
1604
1605                         (void) unit_notify_cgroup_empty(u);
1606                 }
1607         }
1608 }
1609
1610 int manager_setup_cgroup(Manager *m) {
1611         _cleanup_free_ char *path = NULL;
1612         CGroupController c;
1613         int r, unified;
1614         char *e;
1615
1616         assert(m);
1617
1618         /* 1. Determine hierarchy */
1619         m->cgroup_root = mfree(m->cgroup_root);
1620         r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
1621         if (r < 0)
1622                 return log_error_errno(r, "Cannot determine cgroup we are running in: %m");
1623
1624         /* Chop off the init scope, if we are already located in it */
1625         e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
1626
1627         /* LEGACY: Also chop off the system slice if we are in
1628          * it. This is to support live upgrades from older systemd
1629          * versions where PID 1 was moved there. Also see
1630          * cg_get_root_path(). */
1631         if (!e && MANAGER_IS_SYSTEM(m)) {
1632                 e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
1633                 if (!e)
1634                         e = endswith(m->cgroup_root, "/system"); /* even more legacy */
1635         }
1636         if (e)
1637                 *e = 0;
1638
1639         /* And make sure to store away the root value without trailing
1640          * slash, even for the root dir, so that we can easily prepend
1641          * it everywhere. */
1642         while ((e = endswith(m->cgroup_root, "/")))
1643                 *e = 0;
1644
1645         /* 2. Show data */
1646         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
1647         if (r < 0)
1648                 return log_error_errno(r, "Cannot find cgroup mount point: %m");
1649
1650         unified = cg_unified();
1651         if (unified < 0)
1652                 return log_error_errno(r, "Couldn't determine if we are running in the unified hierarchy: %m");
1653         if (unified > 0)
1654                 log_debug("Unified cgroup hierarchy is located at %s.", path);
1655         else
1656                 log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);
1657
1658         if (!m->test_run) {
1659                 const char *scope_path;
1660
1661                 /* 3. Install agent */
1662                 if (unified) {
1663
1664                         /* In the unified hierarchy we can get
1665                          * cgroup empty notifications via inotify. */
1666
1667                         m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
1668                         safe_close(m->cgroup_inotify_fd);
1669
1670                         m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
1671                         if (m->cgroup_inotify_fd < 0)
1672                                 return log_error_errno(errno, "Failed to create control group inotify object: %m");
1673
1674                         r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
1675                         if (r < 0)
1676                                 return log_error_errno(r, "Failed to watch control group inotify object: %m");
1677
1678                         /* Process cgroup empty notifications early, but after service notifications and SIGCHLD. Also
1679                          * see handling of cgroup agent notifications, for the classic cgroup hierarchy support. */
1680                         r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-5);
1681                         if (r < 0)
1682                                 return log_error_errno(r, "Failed to set priority of inotify event source: %m");
1683
1684                         (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");
1685
1686                 } else if (MANAGER_IS_SYSTEM(m)) {
1687
1688                         /* On the legacy hierarchy we only get
1689                          * notifications via cgroup agents. (Which
1690                          * isn't really reliable, since it does not
1691                          * generate events when control groups with
1692                          * children run empty. */
1693
1694                         r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
1695                         if (r < 0)
1696                                 log_warning_errno(r, "Failed to install release agent, ignoring: %m");
1697                         else if (r > 0)
1698                                 log_debug("Installed release agent.");
1699                         else if (r == 0)
1700                                 log_debug("Release agent already installed.");
1701                 }
1702
1703                 /* 4. Make sure we are in the special "init.scope" unit in the root slice. */
1704                 scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
1705                 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
1706                 if (r < 0)
1707                         return log_error_errno(r, "Failed to create %s control group: %m", scope_path);
1708
1709                 /* also, move all other userspace processes remaining
1710                  * in the root cgroup into that scope. */
1711                 r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
1712                 if (r < 0)
1713                         log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");
1714
1715                 /* 5. And pin it, so that it cannot be unmounted */
1716                 safe_close(m->pin_cgroupfs_fd);
1717                 m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
1718                 if (m->pin_cgroupfs_fd < 0)
1719                         return log_error_errno(errno, "Failed to open pin file: %m");
1720
1721                 /* 6.  Always enable hierarchical support if it exists... */
1722                 if (!unified)
1723                         (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
1724         }
1725
1726         /* 7. Figure out which controllers are supported */
1727         r = cg_mask_supported(&m->cgroup_supported);
1728         if (r < 0)
1729                 return log_error_errno(r, "Failed to determine supported controllers: %m");
1730
1731         for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
1732                 log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c)));
1733
1734         return 0;
1735 }
1736
1737 void manager_shutdown_cgroup(Manager *m, bool delete) {
1738         assert(m);
1739
1740         /* We can't really delete the group, since we are in it. But
1741          * let's trim it. */
1742         if (delete && m->cgroup_root)
1743                 (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
1744
1745         m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit);
1746
1747         m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
1748         m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);
1749
1750         m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);
1751
1752         m->cgroup_root = mfree(m->cgroup_root);
1753 }
1754
1755 Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
1756         char *p;
1757         Unit *u;
1758
1759         assert(m);
1760         assert(cgroup);
1761
1762         u = hashmap_get(m->cgroup_unit, cgroup);
1763         if (u)
1764                 return u;
1765
1766         p = strdupa(cgroup);
1767         for (;;) {
1768                 char *e;
1769
1770                 e = strrchr(p, '/');
1771                 if (!e || e == p)
1772                         return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);
1773
1774                 *e = 0;
1775
1776                 u = hashmap_get(m->cgroup_unit, p);
1777                 if (u)
1778                         return u;
1779         }
1780 }
1781
1782 Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) {
1783         _cleanup_free_ char *cgroup = NULL;
1784         int r;
1785
1786         assert(m);
1787
1788         if (pid <= 0)
1789                 return NULL;
1790
1791         r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
1792         if (r < 0)
1793                 return NULL;
1794
1795         return manager_get_unit_by_cgroup(m, cgroup);
1796 }
1797
1798 Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
1799         Unit *u;
1800
1801         assert(m);
1802
1803         if (pid <= 0)
1804                 return NULL;
1805
1806         if (pid == 1)
1807                 return hashmap_get(m->units, SPECIAL_INIT_SCOPE);
1808
1809         u = hashmap_get(m->watch_pids1, PID_TO_PTR(pid));
1810         if (u)
1811                 return u;
1812
1813         u = hashmap_get(m->watch_pids2, PID_TO_PTR(pid));
1814         if (u)
1815                 return u;
1816
1817         return manager_get_unit_by_pid_cgroup(m, pid);
1818 }
1819
1820 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
1821         Unit *u;
1822
1823         assert(m);
1824         assert(cgroup);
1825
1826         log_debug("Got cgroup empty notification for: %s", cgroup);
1827
1828         u = manager_get_unit_by_cgroup(m, cgroup);
1829         if (!u)
1830                 return 0;
1831
1832         return unit_notify_cgroup_empty(u);
1833 }
1834
1835 int unit_get_memory_current(Unit *u, uint64_t *ret) {
1836         _cleanup_free_ char *v = NULL;
1837         int r;
1838
1839         assert(u);
1840         assert(ret);
1841
1842         if (!u->cgroup_path)
1843                 return -ENODATA;
1844
1845         if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
1846                 return -ENODATA;
1847
1848         if (cg_unified() <= 0)
1849                 r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
1850         else
1851                 r = cg_get_attribute("memory", u->cgroup_path, "memory.current", &v);
1852         if (r == -ENOENT)
1853                 return -ENODATA;
1854         if (r < 0)
1855                 return r;
1856
1857         return safe_atou64(v, ret);
1858 }
1859
1860 int unit_get_tasks_current(Unit *u, uint64_t *ret) {
1861         _cleanup_free_ char *v = NULL;
1862         int r;
1863
1864         assert(u);
1865         assert(ret);
1866
1867         if (!u->cgroup_path)
1868                 return -ENODATA;
1869
1870         if ((u->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0)
1871                 return -ENODATA;
1872
1873         r = cg_get_attribute("pids", u->cgroup_path, "pids.current", &v);
1874         if (r == -ENOENT)
1875                 return -ENODATA;
1876         if (r < 0)
1877                 return r;
1878
1879         return safe_atou64(v, ret);
1880 }
1881
1882 static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
1883         _cleanup_free_ char *v = NULL;
1884         uint64_t ns;
1885         int r;
1886
1887         assert(u);
1888         assert(ret);
1889
1890         if (!u->cgroup_path)
1891                 return -ENODATA;
1892
1893         if ((u->cgroup_realized_mask & CGROUP_MASK_CPUACCT) == 0)
1894                 return -ENODATA;
1895
1896         r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);
1897         if (r == -ENOENT)
1898                 return -ENODATA;
1899         if (r < 0)
1900                 return r;
1901
1902         r = safe_atou64(v, &ns);
1903         if (r < 0)
1904                 return r;
1905
1906         *ret = ns;
1907         return 0;
1908 }
1909
1910 int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
1911         nsec_t ns;
1912         int r;
1913
1914         r = unit_get_cpu_usage_raw(u, &ns);
1915         if (r < 0)
1916                 return r;
1917
1918         if (ns > u->cpuacct_usage_base)
1919                 ns -= u->cpuacct_usage_base;
1920         else
1921                 ns = 0;
1922
1923         *ret = ns;
1924         return 0;
1925 }
1926
1927 int unit_reset_cpu_usage(Unit *u) {
1928         nsec_t ns;
1929         int r;
1930
1931         assert(u);
1932
1933         r = unit_get_cpu_usage_raw(u, &ns);
1934         if (r < 0) {
1935                 u->cpuacct_usage_base = 0;
1936                 return r;
1937         }
1938
1939         u->cpuacct_usage_base = ns;
1940         return 0;
1941 }
1942
1943 bool unit_cgroup_delegate(Unit *u) {
1944         CGroupContext *c;
1945
1946         assert(u);
1947
1948         c = unit_get_cgroup_context(u);
1949         if (!c)
1950                 return false;
1951
1952         return c->delegate;
1953 }
1954
1955 void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
1956         assert(u);
1957
1958         if (!UNIT_HAS_CGROUP_CONTEXT(u))
1959                 return;
1960
1961         if (m == 0)
1962                 return;
1963
1964         /* always invalidate compat pairs together */
1965         if (m & (CGROUP_MASK_IO | CGROUP_MASK_BLKIO))
1966                 m |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
1967
1968         if ((u->cgroup_realized_mask & m) == 0)
1969                 return;
1970
1971         u->cgroup_realized_mask &= ~m;
1972         unit_add_to_cgroup_queue(u);
1973 }
1974
1975 void manager_invalidate_startup_units(Manager *m) {
1976         Iterator i;
1977         Unit *u;
1978
1979         assert(m);
1980
1981         SET_FOREACH(u, m->startup_units, i)
1982                 unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_IO|CGROUP_MASK_BLKIO);
1983 }
1984
1985 static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
1986         [CGROUP_AUTO] = "auto",
1987         [CGROUP_CLOSED] = "closed",
1988         [CGROUP_STRICT] = "strict",
1989 };
1990
1991 DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);