1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
/***
  This file is part of systemd.

  Copyright 2013 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
25 #include "process-util.h"
26 #include "path-util.h"
28 #include "cgroup-util.h"
31 #define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
33 void cgroup_context_init(CGroupContext
*c
) {
36 /* Initialize everything to the kernel defaults, assuming the
37 * structure is preinitialized to 0 */
39 c
->cpu_shares
= (unsigned long) -1;
40 c
->startup_cpu_shares
= (unsigned long) -1;
41 c
->memory_limit
= (uint64_t) -1;
42 c
->blockio_weight
= (unsigned long) -1;
43 c
->startup_blockio_weight
= (unsigned long) -1;
45 c
->cpu_quota_per_sec_usec
= USEC_INFINITY
;
48 void cgroup_context_free_device_allow(CGroupContext
*c
, CGroupDeviceAllow
*a
) {
52 LIST_REMOVE(device_allow
, c
->device_allow
, a
);
57 void cgroup_context_free_blockio_device_weight(CGroupContext
*c
, CGroupBlockIODeviceWeight
*w
) {
61 LIST_REMOVE(device_weights
, c
->blockio_device_weights
, w
);
66 void cgroup_context_free_blockio_device_bandwidth(CGroupContext
*c
, CGroupBlockIODeviceBandwidth
*b
) {
70 LIST_REMOVE(device_bandwidths
, c
->blockio_device_bandwidths
, b
);
75 void cgroup_context_done(CGroupContext
*c
) {
78 while (c
->blockio_device_weights
)
79 cgroup_context_free_blockio_device_weight(c
, c
->blockio_device_weights
);
81 while (c
->blockio_device_bandwidths
)
82 cgroup_context_free_blockio_device_bandwidth(c
, c
->blockio_device_bandwidths
);
84 while (c
->device_allow
)
85 cgroup_context_free_device_allow(c
, c
->device_allow
);
88 void cgroup_context_dump(CGroupContext
*c
, FILE* f
, const char *prefix
) {
89 CGroupBlockIODeviceBandwidth
*b
;
90 CGroupBlockIODeviceWeight
*w
;
92 char u
[FORMAT_TIMESPAN_MAX
];
97 prefix
= strempty(prefix
);
100 "%sCPUAccounting=%s\n"
101 "%sBlockIOAccounting=%s\n"
102 "%sMemoryAccounting=%s\n"
104 "%sStartupCPUShares=%lu\n"
105 "%sCPUQuotaPerSecSec=%s\n"
106 "%sBlockIOWeight=%lu\n"
107 "%sStartupBlockIOWeight=%lu\n"
108 "%sMemoryLimit=%" PRIu64
"\n"
109 "%sDevicePolicy=%s\n"
111 prefix
, yes_no(c
->cpu_accounting
),
112 prefix
, yes_no(c
->blockio_accounting
),
113 prefix
, yes_no(c
->memory_accounting
),
114 prefix
, c
->cpu_shares
,
115 prefix
, c
->startup_cpu_shares
,
116 prefix
, format_timespan(u
, sizeof(u
), c
->cpu_quota_per_sec_usec
, 1),
117 prefix
, c
->blockio_weight
,
118 prefix
, c
->startup_blockio_weight
,
119 prefix
, c
->memory_limit
,
120 prefix
, cgroup_device_policy_to_string(c
->device_policy
),
121 prefix
, yes_no(c
->delegate
));
123 LIST_FOREACH(device_allow
, a
, c
->device_allow
)
125 "%sDeviceAllow=%s %s%s%s\n",
128 a
->r
? "r" : "", a
->w
? "w" : "", a
->m
? "m" : "");
130 LIST_FOREACH(device_weights
, w
, c
->blockio_device_weights
)
132 "%sBlockIODeviceWeight=%s %lu",
137 LIST_FOREACH(device_bandwidths
, b
, c
->blockio_device_bandwidths
) {
138 char buf
[FORMAT_BYTES_MAX
];
143 b
->read
? "BlockIOReadBandwidth" : "BlockIOWriteBandwidth",
145 format_bytes(buf
, sizeof(buf
), b
->bandwidth
));
149 static int lookup_blkio_device(const char *p
, dev_t
*dev
) {
158 return log_warning_errno(errno
, "Couldn't stat device %s: %m", p
);
160 if (S_ISBLK(st
.st_mode
))
162 else if (major(st
.st_dev
) != 0) {
163 /* If this is not a device node then find the block
164 * device this file is stored on */
167 /* If this is a partition, try to get the originating
169 block_get_whole_disk(*dev
, dev
);
171 log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p
);
178 static int whitelist_device(const char *path
, const char *node
, const char *acc
) {
179 char buf
[2+DECIMAL_STR_MAX(dev_t
)*2+2+4];
186 if (stat(node
, &st
) < 0) {
187 log_warning("Couldn't stat device %s", node
);
191 if (!S_ISCHR(st
.st_mode
) && !S_ISBLK(st
.st_mode
)) {
192 log_warning("%s is not a device.", node
);
198 S_ISCHR(st
.st_mode
) ? 'c' : 'b',
199 major(st
.st_rdev
), minor(st
.st_rdev
),
202 r
= cg_set_attribute("devices", path
, "devices.allow", buf
);
204 log_full_errno(IN_SET(r
, -ENOENT
, -EROFS
, -EINVAL
) ? LOG_DEBUG
: LOG_WARNING
, r
,
205 "Failed to set devices.allow on %s: %m", path
);
210 static int whitelist_major(const char *path
, const char *name
, char type
, const char *acc
) {
211 _cleanup_fclose_
FILE *f
= NULL
;
218 assert(type
== 'b' || type
== 'c');
220 f
= fopen("/proc/devices", "re");
222 return log_warning_errno(errno
, "Cannot open /proc/devices to resolve %s (%c): %m", name
, type
);
224 FOREACH_LINE(line
, f
, goto fail
) {
225 char buf
[2+DECIMAL_STR_MAX(unsigned)+3+4], *p
, *w
;
230 if (type
== 'c' && streq(line
, "Character devices:")) {
235 if (type
== 'b' && streq(line
, "Block devices:")) {
250 w
= strpbrk(p
, WHITESPACE
);
255 r
= safe_atou(p
, &maj
);
262 w
+= strspn(w
, WHITESPACE
);
264 if (fnmatch(name
, w
, 0) != 0)
273 r
= cg_set_attribute("devices", path
, "devices.allow", buf
);
275 log_full_errno(IN_SET(r
, -ENOENT
, -EROFS
, -EINVAL
) ? LOG_DEBUG
: LOG_WARNING
, r
,
276 "Failed to set devices.allow on %s: %m", path
);
282 log_warning_errno(errno
, "Failed to read /proc/devices: %m");
286 void cgroup_context_apply(CGroupContext
*c
, CGroupMask mask
, const char *path
, ManagerState state
) {
296 /* Some cgroup attributes are not supported on the root cgroup,
297 * hence silently ignore */
298 is_root
= isempty(path
) || path_equal(path
, "/");
300 /* Make sure we don't try to display messages with an empty path. */
303 /* We generally ignore errors caused by read-only mounted
304 * cgroup trees (assuming we are running in a container then),
305 * and missing cgroups, i.e. EROFS and ENOENT. */
307 if ((mask
& CGROUP_MASK_CPU
) && !is_root
) {
308 char buf
[MAX(DECIMAL_STR_MAX(unsigned long), DECIMAL_STR_MAX(usec_t
)) + 1];
310 sprintf(buf
, "%lu\n",
311 IN_SET(state
, MANAGER_STARTING
, MANAGER_INITIALIZING
) && c
->startup_cpu_shares
!= (unsigned long) -1 ? c
->startup_cpu_shares
:
312 c
->cpu_shares
!= (unsigned long) -1 ? c
->cpu_shares
: 1024);
313 r
= cg_set_attribute("cpu", path
, "cpu.shares", buf
);
315 log_full_errno(IN_SET(r
, -ENOENT
, -EROFS
) ? LOG_DEBUG
: LOG_WARNING
, r
,
316 "Failed to set cpu.shares on %s: %m", path
);
318 sprintf(buf
, USEC_FMT
"\n", CGROUP_CPU_QUOTA_PERIOD_USEC
);
319 r
= cg_set_attribute("cpu", path
, "cpu.cfs_period_us", buf
);
321 log_full_errno(IN_SET(r
, -ENOENT
, -EROFS
) ? LOG_DEBUG
: LOG_WARNING
, r
,
322 "Failed to set cpu.cfs_period_us on %s: %m", path
);
324 if (c
->cpu_quota_per_sec_usec
!= USEC_INFINITY
) {
325 sprintf(buf
, USEC_FMT
"\n", c
->cpu_quota_per_sec_usec
* CGROUP_CPU_QUOTA_PERIOD_USEC
/ USEC_PER_SEC
);
326 r
= cg_set_attribute("cpu", path
, "cpu.cfs_quota_us", buf
);
328 r
= cg_set_attribute("cpu", path
, "cpu.cfs_quota_us", "-1");
330 log_full_errno(IN_SET(r
, -ENOENT
, -EROFS
) ? LOG_DEBUG
: LOG_WARNING
, r
,
331 "Failed to set cpu.cfs_quota_us on %s: %m", path
);
334 if (mask
& CGROUP_MASK_BLKIO
) {
335 char buf
[MAX3(DECIMAL_STR_MAX(unsigned long)+1,
336 DECIMAL_STR_MAX(dev_t
)*2+2+DECIMAL_STR_MAX(unsigned long)*1,
337 DECIMAL_STR_MAX(dev_t
)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
338 CGroupBlockIODeviceWeight
*w
;
339 CGroupBlockIODeviceBandwidth
*b
;
342 sprintf(buf
, "%lu\n", IN_SET(state
, MANAGER_STARTING
, MANAGER_INITIALIZING
) && c
->startup_blockio_weight
!= (unsigned long) -1 ? c
->startup_blockio_weight
:
343 c
->blockio_weight
!= (unsigned long) -1 ? c
->blockio_weight
: 1000);
344 r
= cg_set_attribute("blkio", path
, "blkio.weight", buf
);
346 log_full_errno(IN_SET(r
, -ENOENT
, -EROFS
) ? LOG_DEBUG
: LOG_WARNING
, r
,
347 "Failed to set blkio.weight on %s: %m", path
);
349 /* FIXME: no way to reset this list */
350 LIST_FOREACH(device_weights
, w
, c
->blockio_device_weights
) {
353 r
= lookup_blkio_device(w
->path
, &dev
);
357 sprintf(buf
, "%u:%u %lu", major(dev
), minor(dev
), w
->weight
);
358 r
= cg_set_attribute("blkio", path
, "blkio.weight_device", buf
);
360 log_full_errno(IN_SET(r
, -ENOENT
, -EROFS
) ? LOG_DEBUG
: LOG_WARNING
, r
,
361 "Failed to set blkio.weight_device on %s: %m", path
);
365 /* FIXME: no way to reset this list */
366 LIST_FOREACH(device_bandwidths
, b
, c
->blockio_device_bandwidths
) {
370 r
= lookup_blkio_device(b
->path
, &dev
);
374 a
= b
->read
? "blkio.throttle.read_bps_device" : "blkio.throttle.write_bps_device";
376 sprintf(buf
, "%u:%u %" PRIu64
"\n", major(dev
), minor(dev
), b
->bandwidth
);
377 r
= cg_set_attribute("blkio", path
, a
, buf
);
379 log_full_errno(IN_SET(r
, -ENOENT
, -EROFS
) ? LOG_DEBUG
: LOG_WARNING
, r
,
380 "Failed to set %s on %s: %m", a
, path
);
384 if ((mask
& CGROUP_MASK_MEMORY
) && !is_root
) {
385 if (c
->memory_limit
!= (uint64_t) -1) {
386 char buf
[DECIMAL_STR_MAX(uint64_t) + 1];
388 sprintf(buf
, "%" PRIu64
"\n", c
->memory_limit
);
390 if (cg_unified() <= 0)
391 r
= cg_set_attribute("memory", path
, "memory.limit_in_bytes", buf
);
393 r
= cg_set_attribute("memory", path
, "memory.max", buf
);
396 if (cg_unified() <= 0)
397 r
= cg_set_attribute("memory", path
, "memory.limit_in_bytes", "-1");
399 r
= cg_set_attribute("memory", path
, "memory.max", "max");
403 log_full_errno(IN_SET(r
, -ENOENT
, -EROFS
) ? LOG_DEBUG
: LOG_WARNING
, r
,
404 "Failed to set memory.limit_in_bytes/memory.max on %s: %m", path
);
407 if ((mask
& CGROUP_MASK_DEVICE
) && !is_root
) {
408 CGroupDeviceAllow
*a
;
410 /* Changing the devices list of a populated cgroup
411 * might result in EINVAL, hence ignore EINVAL
414 if (c
->device_allow
|| c
->device_policy
!= CGROUP_AUTO
)
415 r
= cg_set_attribute("devices", path
, "devices.deny", "a");
417 r
= cg_set_attribute("devices", path
, "devices.allow", "a");
419 log_full_errno(IN_SET(r
, -ENOENT
, -EROFS
, -EINVAL
) ? LOG_DEBUG
: LOG_WARNING
, r
,
420 "Failed to reset devices.list on %s: %m", path
);
422 if (c
->device_policy
== CGROUP_CLOSED
||
423 (c
->device_policy
== CGROUP_AUTO
&& c
->device_allow
)) {
424 static const char auto_devices
[] =
425 "/dev/null\0" "rwm\0"
426 "/dev/zero\0" "rwm\0"
427 "/dev/full\0" "rwm\0"
428 "/dev/random\0" "rwm\0"
429 "/dev/urandom\0" "rwm\0"
431 "/dev/pts/ptmx\0" "rw\0"; /* /dev/pts/ptmx may not be duplicated, but accessed */
435 NULSTR_FOREACH_PAIR(x
, y
, auto_devices
)
436 whitelist_device(path
, x
, y
);
438 whitelist_major(path
, "pts", 'c', "rw");
439 whitelist_major(path
, "kdbus", 'c', "rw");
440 whitelist_major(path
, "kdbus/*", 'c', "rw");
443 LIST_FOREACH(device_allow
, a
, c
->device_allow
) {
459 if (startswith(a
->path
, "/dev/"))
460 whitelist_device(path
, a
->path
, acc
);
461 else if (startswith(a
->path
, "block-"))
462 whitelist_major(path
, a
->path
+ 6, 'b', acc
);
463 else if (startswith(a
->path
, "char-"))
464 whitelist_major(path
, a
->path
+ 5, 'c', acc
);
466 log_debug("Ignoring device %s while writing cgroup attribute.", a
->path
);
471 CGroupMask
cgroup_context_get_mask(CGroupContext
*c
) {
474 /* Figure out which controllers we need */
476 if (c
->cpu_accounting
||
477 c
->cpu_shares
!= (unsigned long) -1 ||
478 c
->startup_cpu_shares
!= (unsigned long) -1 ||
479 c
->cpu_quota_per_sec_usec
!= USEC_INFINITY
)
480 mask
|= CGROUP_MASK_CPUACCT
| CGROUP_MASK_CPU
;
482 if (c
->blockio_accounting
||
483 c
->blockio_weight
!= (unsigned long) -1 ||
484 c
->startup_blockio_weight
!= (unsigned long) -1 ||
485 c
->blockio_device_weights
||
486 c
->blockio_device_bandwidths
)
487 mask
|= CGROUP_MASK_BLKIO
;
489 if (c
->memory_accounting
||
490 c
->memory_limit
!= (uint64_t) -1)
491 mask
|= CGROUP_MASK_MEMORY
;
493 if (c
->device_allow
||
494 c
->device_policy
!= CGROUP_AUTO
)
495 mask
|= CGROUP_MASK_DEVICE
;
500 CGroupMask
unit_get_own_mask(Unit
*u
) {
503 /* Returns the mask of controllers the unit needs for itself */
505 c
= unit_get_cgroup_context(u
);
509 /* If delegation is turned on, then turn on all cgroups,
510 * unless we are on the legacy hierarchy and the process we
511 * fork into it is known to drop privileges, and hence
512 * shouldn't get access to the controllers.
514 * Note that on the unified hierarchy it is safe to delegate
515 * controllers to unprivileged services. */
520 e
= unit_get_exec_context(u
);
522 exec_context_maintains_privileges(e
) ||
524 return _CGROUP_MASK_ALL
;
527 return cgroup_context_get_mask(c
);
530 CGroupMask
unit_get_members_mask(Unit
*u
) {
533 /* Returns the mask of controllers all of the unit's children
536 if (u
->cgroup_members_mask_valid
)
537 return u
->cgroup_members_mask
;
539 u
->cgroup_members_mask
= 0;
541 if (u
->type
== UNIT_SLICE
) {
545 SET_FOREACH(member
, u
->dependencies
[UNIT_BEFORE
], i
) {
550 if (UNIT_DEREF(member
->slice
) != u
)
553 u
->cgroup_members_mask
|=
554 unit_get_own_mask(member
) |
555 unit_get_members_mask(member
);
559 u
->cgroup_members_mask_valid
= true;
560 return u
->cgroup_members_mask
;
563 CGroupMask
unit_get_siblings_mask(Unit
*u
) {
566 /* Returns the mask of controllers all of the unit's siblings
567 * require, i.e. the members mask of the unit's parent slice
568 * if there is one. */
570 if (UNIT_ISSET(u
->slice
))
571 return unit_get_members_mask(UNIT_DEREF(u
->slice
));
573 return unit_get_own_mask(u
) | unit_get_members_mask(u
);
576 CGroupMask
unit_get_subtree_mask(Unit
*u
) {
578 /* Returns the mask of this subtree, meaning of the group
579 * itself and its children. */
581 return unit_get_own_mask(u
) | unit_get_members_mask(u
);
584 CGroupMask
unit_get_target_mask(Unit
*u
) {
587 /* This returns the cgroup mask of all controllers to enable
588 * for a specific cgroup, i.e. everything it needs itself,
589 * plus all that its children need, plus all that its siblings
590 * need. This is primarily useful on the legacy cgroup
591 * hierarchy, where we need to duplicate each cgroup in each
592 * hierarchy that shall be enabled for it. */
594 mask
= unit_get_own_mask(u
) | unit_get_members_mask(u
) | unit_get_siblings_mask(u
);
595 mask
&= u
->manager
->cgroup_supported
;
600 CGroupMask
unit_get_enable_mask(Unit
*u
) {
603 /* This returns the cgroup mask of all controllers to enable
604 * for the children of a specific cgroup. This is primarily
605 * useful for the unified cgroup hierarchy, where each cgroup
606 * controls which controllers are enabled for its children. */
608 mask
= unit_get_members_mask(u
);
609 mask
&= u
->manager
->cgroup_supported
;
614 /* Recurse from a unit up through its containing slices, propagating
615 * mask bits upward. A unit is also member of itself. */
616 void unit_update_cgroup_members_masks(Unit
*u
) {
622 /* Calculate subtree mask */
623 m
= unit_get_subtree_mask(u
);
625 /* See if anything changed from the previous invocation. If
626 * not, we're done. */
627 if (u
->cgroup_subtree_mask_valid
&& m
== u
->cgroup_subtree_mask
)
631 u
->cgroup_subtree_mask_valid
&&
632 ((m
& ~u
->cgroup_subtree_mask
) != 0) &&
633 ((~m
& u
->cgroup_subtree_mask
) == 0);
635 u
->cgroup_subtree_mask
= m
;
636 u
->cgroup_subtree_mask_valid
= true;
638 if (UNIT_ISSET(u
->slice
)) {
639 Unit
*s
= UNIT_DEREF(u
->slice
);
642 /* There's more set now than before. We
643 * propagate the new mask to the parent's mask
644 * (not caring if it actually was valid or
647 s
->cgroup_members_mask
|= m
;
650 /* There's less set now than before (or we
651 * don't know), we need to recalculate
652 * everything, so let's invalidate the
653 * parent's members mask */
655 s
->cgroup_members_mask_valid
= false;
657 /* And now make sure that this change also hits our
659 unit_update_cgroup_members_masks(s
);
663 static const char *migrate_callback(CGroupMask mask
, void *userdata
) {
670 if (u
->cgroup_path
&&
671 u
->cgroup_realized
&&
672 (u
->cgroup_realized_mask
& mask
) == mask
)
673 return u
->cgroup_path
;
675 u
= UNIT_DEREF(u
->slice
);
681 char *unit_default_cgroup_path(Unit
*u
) {
682 _cleanup_free_
char *escaped
= NULL
, *slice
= NULL
;
687 if (unit_has_name(u
, SPECIAL_ROOT_SLICE
))
688 return strdup(u
->manager
->cgroup_root
);
690 if (UNIT_ISSET(u
->slice
) && !unit_has_name(UNIT_DEREF(u
->slice
), SPECIAL_ROOT_SLICE
)) {
691 r
= cg_slice_to_path(UNIT_DEREF(u
->slice
)->id
, &slice
);
696 escaped
= cg_escape(u
->id
);
701 return strjoin(u
->manager
->cgroup_root
, "/", slice
, "/", escaped
, NULL
);
703 return strjoin(u
->manager
->cgroup_root
, "/", escaped
, NULL
);
706 int unit_set_cgroup_path(Unit
*u
, const char *path
) {
707 _cleanup_free_
char *p
= NULL
;
719 if (streq_ptr(u
->cgroup_path
, p
))
723 r
= hashmap_put(u
->manager
->cgroup_unit
, p
, u
);
728 unit_release_cgroup(u
);
736 int unit_watch_cgroup(Unit
*u
) {
737 _cleanup_free_
char *populated
= NULL
;
745 if (u
->cgroup_inotify_wd
>= 0)
748 /* Only applies to the unified hierarchy */
751 return log_unit_error_errno(u
, r
, "Failed detect wether the unified hierarchy is used: %m");
755 /* Don't watch the root slice, it's pointless. */
756 if (unit_has_name(u
, SPECIAL_ROOT_SLICE
))
759 r
= hashmap_ensure_allocated(&u
->manager
->cgroup_inotify_wd_unit
, &trivial_hash_ops
);
763 r
= cg_get_path(SYSTEMD_CGROUP_CONTROLLER
, u
->cgroup_path
, "cgroup.populated", &populated
);
767 u
->cgroup_inotify_wd
= inotify_add_watch(u
->manager
->cgroup_inotify_fd
, populated
, IN_MODIFY
);
768 if (u
->cgroup_inotify_wd
< 0) {
770 if (errno
== ENOENT
) /* If the directory is already
771 * gone we don't need to track
772 * it, so this is not an error */
775 return log_unit_error_errno(u
, errno
, "Failed to add inotify watch descriptor for control group %s: %m", u
->cgroup_path
);
778 r
= hashmap_put(u
->manager
->cgroup_inotify_wd_unit
, INT_TO_PTR(u
->cgroup_inotify_wd
), u
);
780 return log_unit_error_errno(u
, r
, "Failed to add inotify watch descriptor to hash map: %m");
785 static int unit_create_cgroup(
787 CGroupMask target_mask
,
788 CGroupMask enable_mask
) {
795 c
= unit_get_cgroup_context(u
);
799 if (!u
->cgroup_path
) {
800 _cleanup_free_
char *path
= NULL
;
802 path
= unit_default_cgroup_path(u
);
806 r
= unit_set_cgroup_path(u
, path
);
808 return log_unit_error_errno(u
, r
, "Control group %s exists already.", path
);
810 return log_unit_error_errno(u
, r
, "Failed to set unit's control group path to %s: %m", path
);
813 /* First, create our own group */
814 r
= cg_create_everywhere(u
->manager
->cgroup_supported
, target_mask
, u
->cgroup_path
);
816 return log_unit_error_errno(u
, r
, "Failed to create cgroup %s: %m", u
->cgroup_path
);
818 /* Start watching it */
819 (void) unit_watch_cgroup(u
);
821 /* Enable all controllers we need */
822 r
= cg_enable_everywhere(u
->manager
->cgroup_supported
, enable_mask
, u
->cgroup_path
);
824 log_unit_warning_errno(u
, r
, "Failed to enable controllers on cgroup %s, ignoring: %m", u
->cgroup_path
);
826 /* Keep track that this is now realized */
827 u
->cgroup_realized
= true;
828 u
->cgroup_realized_mask
= target_mask
;
830 if (u
->type
!= UNIT_SLICE
&& !c
->delegate
) {
832 /* Then, possibly move things over, but not if
833 * subgroups may contain processes, which is the case
834 * for slice and delegation units. */
835 r
= cg_migrate_everywhere(u
->manager
->cgroup_supported
, u
->cgroup_path
, u
->cgroup_path
, migrate_callback
, u
);
837 log_unit_warning_errno(u
, r
, "Failed to migrate cgroup from to %s, ignoring: %m", u
->cgroup_path
);
843 int unit_attach_pids_to_cgroup(Unit
*u
) {
847 r
= unit_realize_cgroup(u
);
851 r
= cg_attach_many_everywhere(u
->manager
->cgroup_supported
, u
->cgroup_path
, u
->pids
, migrate_callback
, u
);
858 static bool unit_has_mask_realized(Unit
*u
, CGroupMask target_mask
) {
861 return u
->cgroup_realized
&& u
->cgroup_realized_mask
== target_mask
;
864 /* Check if necessary controllers and attributes for a unit are in place.
867 * If not, create paths, move processes over, and set attributes.
869 * Returns 0 on success and < 0 on failure. */
870 static int unit_realize_cgroup_now(Unit
*u
, ManagerState state
) {
871 CGroupMask target_mask
, enable_mask
;
876 if (u
->in_cgroup_queue
) {
877 LIST_REMOVE(cgroup_queue
, u
->manager
->cgroup_queue
, u
);
878 u
->in_cgroup_queue
= false;
881 target_mask
= unit_get_target_mask(u
);
882 if (unit_has_mask_realized(u
, target_mask
))
885 /* First, realize parents */
886 if (UNIT_ISSET(u
->slice
)) {
887 r
= unit_realize_cgroup_now(UNIT_DEREF(u
->slice
), state
);
892 /* And then do the real work */
893 enable_mask
= unit_get_enable_mask(u
);
894 r
= unit_create_cgroup(u
, target_mask
, enable_mask
);
898 /* Finally, apply the necessary attributes. */
899 cgroup_context_apply(unit_get_cgroup_context(u
), target_mask
, u
->cgroup_path
, state
);
904 static void unit_add_to_cgroup_queue(Unit
*u
) {
906 if (u
->in_cgroup_queue
)
909 LIST_PREPEND(cgroup_queue
, u
->manager
->cgroup_queue
, u
);
910 u
->in_cgroup_queue
= true;
913 unsigned manager_dispatch_cgroup_queue(Manager
*m
) {
919 state
= manager_state(m
);
921 while ((i
= m
->cgroup_queue
)) {
922 assert(i
->in_cgroup_queue
);
924 r
= unit_realize_cgroup_now(i
, state
);
926 log_warning_errno(r
, "Failed to realize cgroups for queued unit %s, ignoring: %m", i
->id
);
934 static void unit_queue_siblings(Unit
*u
) {
937 /* This adds the siblings of the specified unit and the
938 * siblings of all parent units to the cgroup queue. (But
939 * neither the specified unit itself nor the parents.) */
941 while ((slice
= UNIT_DEREF(u
->slice
))) {
945 SET_FOREACH(m
, slice
->dependencies
[UNIT_BEFORE
], i
) {
949 /* Skip units that have a dependency on the slice
950 * but aren't actually in it. */
951 if (UNIT_DEREF(m
->slice
) != slice
)
954 /* No point in doing cgroup application for units
955 * without active processes. */
956 if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m
)))
959 /* If the unit doesn't need any new controllers
960 * and has current ones realized, it doesn't need
962 if (unit_has_mask_realized(m
, unit_get_target_mask(m
)))
965 unit_add_to_cgroup_queue(m
);
972 int unit_realize_cgroup(Unit
*u
) {
975 if (!UNIT_HAS_CGROUP_CONTEXT(u
))
978 /* So, here's the deal: when realizing the cgroups for this
979 * unit, we need to first create all parents, but there's more
980 * actually: for the weight-based controllers we also need to
981 * make sure that all our siblings (i.e. units that are in the
982 * same slice as we are) have cgroups, too. Otherwise, things
983 * would become very uneven as each of their processes would
984 * get as much resources as all our group together. This call
985 * will synchronously create the parent cgroups, but will
986 * defer work on the siblings to the next event loop
989 /* Add all sibling slices to the cgroup queue. */
990 unit_queue_siblings(u
);
992 /* And realize this one now (and apply the values) */
993 return unit_realize_cgroup_now(u
, manager_state(u
->manager
));
996 void unit_release_cgroup(Unit
*u
) {
999 /* Forgets all cgroup details for this cgroup */
1001 if (u
->cgroup_path
) {
1002 (void) hashmap_remove(u
->manager
->cgroup_unit
, u
->cgroup_path
);
1003 u
->cgroup_path
= mfree(u
->cgroup_path
);
1006 if (u
->cgroup_inotify_wd
>= 0) {
1007 if (inotify_rm_watch(u
->manager
->cgroup_inotify_fd
, u
->cgroup_inotify_wd
) < 0)
1008 log_unit_debug_errno(u
, errno
, "Failed to remove cgroup inotify watch %i for %s, ignoring", u
->cgroup_inotify_wd
, u
->id
);
1010 (void) hashmap_remove(u
->manager
->cgroup_inotify_wd_unit
, INT_TO_PTR(u
->cgroup_inotify_wd
));
1011 u
->cgroup_inotify_wd
= -1;
1015 void unit_prune_cgroup(Unit
*u
) {
1021 /* Removes the cgroup, if empty and possible, and stops watching it. */
1023 if (!u
->cgroup_path
)
1026 is_root_slice
= unit_has_name(u
, SPECIAL_ROOT_SLICE
);
1028 r
= cg_trim_everywhere(u
->manager
->cgroup_supported
, u
->cgroup_path
, !is_root_slice
);
1030 log_debug_errno(r
, "Failed to destroy cgroup %s, ignoring: %m", u
->cgroup_path
);
1037 unit_release_cgroup(u
);
1039 u
->cgroup_realized
= false;
1040 u
->cgroup_realized_mask
= 0;
1043 int unit_search_main_pid(Unit
*u
, pid_t
*ret
) {
1044 _cleanup_fclose_
FILE *f
= NULL
;
1045 pid_t pid
= 0, npid
, mypid
;
1051 if (!u
->cgroup_path
)
1054 r
= cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER
, u
->cgroup_path
, &f
);
1059 while (cg_read_pid(f
, &npid
) > 0) {
1065 /* Ignore processes that aren't our kids */
1066 if (get_parent_of_pid(npid
, &ppid
) >= 0 && ppid
!= mypid
)
1070 /* Dang, there's more than one daemonized PID
1071 in this group, so we don't know what process
1072 is the main process. */
1083 static int unit_watch_pids_in_path(Unit
*u
, const char *path
) {
1084 _cleanup_closedir_
DIR *d
= NULL
;
1085 _cleanup_fclose_
FILE *f
= NULL
;
1091 r
= cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER
, path
, &f
);
1097 while ((r
= cg_read_pid(f
, &pid
)) > 0) {
1098 r
= unit_watch_pid(u
, pid
);
1099 if (r
< 0 && ret
>= 0)
1103 if (r
< 0 && ret
>= 0)
1107 r
= cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER
, path
, &d
);
1114 while ((r
= cg_read_subgroup(d
, &fn
)) > 0) {
1115 _cleanup_free_
char *p
= NULL
;
1117 p
= strjoin(path
, "/", fn
, NULL
);
1123 r
= unit_watch_pids_in_path(u
, p
);
1124 if (r
< 0 && ret
>= 0)
1128 if (r
< 0 && ret
>= 0)
1135 int unit_watch_all_pids(Unit
*u
) {
1138 /* Adds all PIDs from our cgroup to the set of PIDs we
1139 * watch. This is a fallback logic for cases where we do not
1140 * get reliable cgroup empty notifications: we try to use
1141 * SIGCHLD as replacement. */
1143 if (!u
->cgroup_path
)
1146 if (cg_unified() > 0) /* On unified we can use proper notifications */
1149 return unit_watch_pids_in_path(u
, u
->cgroup_path
);
1152 int unit_notify_cgroup_empty(Unit
*u
) {
1157 if (!u
->cgroup_path
)
1160 r
= cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER
, u
->cgroup_path
);
1164 unit_add_to_gc_queue(u
);
1166 if (UNIT_VTABLE(u
)->notify_cgroup_empty
)
1167 UNIT_VTABLE(u
)->notify_cgroup_empty(u
);
1172 static int on_cgroup_inotify_event(sd_event_source
*s
, int fd
, uint32_t revents
, void *userdata
) {
1173 Manager
*m
= userdata
;
1180 union inotify_event_buffer buffer
;
1181 struct inotify_event
*e
;
1184 l
= read(fd
, &buffer
, sizeof(buffer
));
1186 if (errno
== EINTR
|| errno
== EAGAIN
)
1189 return log_error_errno(errno
, "Failed to read control group inotify events: %m");
1192 FOREACH_INOTIFY_EVENT(e
, buffer
, l
) {
1196 /* Queue overflow has no watch descriptor */
1199 if (e
->mask
& IN_IGNORED
)
1200 /* The watch was just removed */
1203 u
= hashmap_get(m
->cgroup_inotify_wd_unit
, INT_TO_PTR(e
->wd
));
1204 if (!u
) /* Not that inotify might deliver
1205 * events for a watch even after it
1206 * was removed, because it was queued
1207 * before the removal. Let's ignore
1208 * this here safely. */
1211 (void) unit_notify_cgroup_empty(u
);
1216 int manager_setup_cgroup(Manager
*m
) {
1217 _cleanup_free_
char *path
= NULL
;
1224 /* 1. Determine hierarchy */
1225 m
->cgroup_root
= mfree(m
->cgroup_root
);
1226 r
= cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER
, 0, &m
->cgroup_root
);
1228 return log_error_errno(r
, "Cannot determine cgroup we are running in: %m");
1230 /* Chop off the init scope, if we are already located in it */
1231 e
= endswith(m
->cgroup_root
, "/" SPECIAL_INIT_SCOPE
);
1233 /* LEGACY: Also chop off the system slice if we are in
1234 * it. This is to support live upgrades from older systemd
1235 * versions where PID 1 was moved there. Also see
1236 * cg_get_root_path(). */
1237 if (!e
&& m
->running_as
== MANAGER_SYSTEM
) {
1238 e
= endswith(m
->cgroup_root
, "/" SPECIAL_SYSTEM_SLICE
);
1240 e
= endswith(m
->cgroup_root
, "/system"); /* even more legacy */
1245 /* And make sure to store away the root value without trailing
1246 * slash, even for the root dir, so that we can easily prepend
1248 while ((e
= endswith(m
->cgroup_root
, "/")))
1252 r
= cg_get_path(SYSTEMD_CGROUP_CONTROLLER
, m
->cgroup_root
, NULL
, &path
);
1254 return log_error_errno(r
, "Cannot find cgroup mount point: %m");
1256 unified
= cg_unified();
1258 return log_error_errno(r
, "Couldn't determine if we are running in the unified hierarchy: %m");
1260 log_debug("Unified cgroup hierarchy is located at %s.", path
);
1262 log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER
". File system hierarchy is at %s.", path
);
1265 const char *scope_path
;
1267 /* 3. Install agent */
1270 /* In the unified hierarchy we can can get
1271 * cgroup empty notifications via inotify. */
1273 m
->cgroup_inotify_event_source
= sd_event_source_unref(m
->cgroup_inotify_event_source
);
1274 safe_close(m
->cgroup_inotify_fd
);
1276 m
->cgroup_inotify_fd
= inotify_init1(IN_NONBLOCK
|IN_CLOEXEC
);
1277 if (m
->cgroup_inotify_fd
< 0)
1278 return log_error_errno(errno
, "Failed to create control group inotify object: %m");
1280 r
= sd_event_add_io(m
->event
, &m
->cgroup_inotify_event_source
, m
->cgroup_inotify_fd
, EPOLLIN
, on_cgroup_inotify_event
, m
);
1282 return log_error_errno(r
, "Failed to watch control group inotify object: %m");
1284 r
= sd_event_source_set_priority(m
->cgroup_inotify_event_source
, SD_EVENT_PRIORITY_IDLE
- 5);
1286 return log_error_errno(r
, "Failed to set priority of inotify event source: %m");
1288 (void) sd_event_source_set_description(m
->cgroup_inotify_event_source
, "cgroup-inotify");
1290 } else if (m
->running_as
== MANAGER_SYSTEM
) {
1292 /* On the legacy hierarchy we only get
1293 * notifications via cgroup agents. (Which
1294 * isn't really reliable, since it does not
1295 * generate events when control groups with
1296 * children run empty. */
1298 r
= cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER
, SYSTEMD_CGROUP_AGENT_PATH
);
1300 log_warning_errno(r
, "Failed to install release agent, ignoring: %m");
1302 log_debug("Installed release agent.");
1304 log_debug("Release agent already installed.");
1307 /* 4. Make sure we are in the special "init.scope" unit in the root slice. */
1308 scope_path
= strjoina(m
->cgroup_root
, "/" SPECIAL_INIT_SCOPE
);
1309 r
= cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER
, scope_path
, 0);
1311 return log_error_errno(r
, "Failed to create %s control group: %m", scope_path
);
1313 /* also, move all other userspace processes remaining
1314 * in the root cgroup into that scope. */
1315 r
= cg_migrate(SYSTEMD_CGROUP_CONTROLLER
, m
->cgroup_root
, SYSTEMD_CGROUP_CONTROLLER
, scope_path
, false);
1317 log_warning_errno(r
, "Couldn't move remaining userspace processes, ignoring: %m");
1319 /* 5. And pin it, so that it cannot be unmounted */
1320 safe_close(m
->pin_cgroupfs_fd
);
1321 m
->pin_cgroupfs_fd
= open(path
, O_RDONLY
|O_CLOEXEC
|O_DIRECTORY
|O_NOCTTY
|O_NONBLOCK
);
1322 if (m
->pin_cgroupfs_fd
< 0)
1323 return log_error_errno(errno
, "Failed to open pin file: %m");
1325 /* 6. Always enable hierarchical support if it exists... */
1327 (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
1330 /* 7. Figure out which controllers are supported */
1331 r
= cg_mask_supported(&m
->cgroup_supported
);
1333 return log_error_errno(r
, "Failed to determine supported controllers: %m");
1335 for (c
= 0; c
< _CGROUP_CONTROLLER_MAX
; c
++)
1336 log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c
), yes_no(m
->cgroup_supported
& c
));
1341 void manager_shutdown_cgroup(Manager
*m
, bool delete) {
1344 /* We can't really delete the group, since we are in it. But
1346 if (delete && m
->cgroup_root
)
1347 (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER
, m
->cgroup_root
, false);
1349 m
->cgroup_inotify_wd_unit
= hashmap_free(m
->cgroup_inotify_wd_unit
);
1351 m
->cgroup_inotify_event_source
= sd_event_source_unref(m
->cgroup_inotify_event_source
);
1352 m
->cgroup_inotify_fd
= safe_close(m
->cgroup_inotify_fd
);
1354 m
->pin_cgroupfs_fd
= safe_close(m
->pin_cgroupfs_fd
);
1356 m
->cgroup_root
= mfree(m
->cgroup_root
);
1359 Unit
* manager_get_unit_by_cgroup(Manager
*m
, const char *cgroup
) {
1366 u
= hashmap_get(m
->cgroup_unit
, cgroup
);
1370 p
= strdupa(cgroup
);
1374 e
= strrchr(p
, '/');
1376 return hashmap_get(m
->cgroup_unit
, SPECIAL_ROOT_SLICE
);
1380 u
= hashmap_get(m
->cgroup_unit
, p
);
1386 Unit
*manager_get_unit_by_pid_cgroup(Manager
*m
, pid_t pid
) {
1387 _cleanup_free_
char *cgroup
= NULL
;
1395 r
= cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER
, pid
, &cgroup
);
1399 return manager_get_unit_by_cgroup(m
, cgroup
);
1402 Unit
*manager_get_unit_by_pid(Manager
*m
, pid_t pid
) {
1411 return hashmap_get(m
->units
, SPECIAL_INIT_SCOPE
);
1413 u
= hashmap_get(m
->watch_pids1
, PID_TO_PTR(pid
));
1417 u
= hashmap_get(m
->watch_pids2
, PID_TO_PTR(pid
));
1421 return manager_get_unit_by_pid_cgroup(m
, pid
);
1424 int manager_notify_cgroup_empty(Manager
*m
, const char *cgroup
) {
1430 u
= manager_get_unit_by_cgroup(m
, cgroup
);
1434 return unit_notify_cgroup_empty(u
);
1437 int unit_get_memory_current(Unit
*u
, uint64_t *ret
) {
1438 _cleanup_free_
char *v
= NULL
;
1444 if (!u
->cgroup_path
)
1447 if ((u
->cgroup_realized_mask
& CGROUP_MASK_MEMORY
) == 0)
1450 if (cg_unified() <= 0)
1451 r
= cg_get_attribute("memory", u
->cgroup_path
, "memory.usage_in_bytes", &v
);
1453 r
= cg_get_attribute("memory", u
->cgroup_path
, "memory.current", &v
);
1459 return safe_atou64(v
, ret
);
1462 static int unit_get_cpu_usage_raw(Unit
*u
, nsec_t
*ret
) {
1463 _cleanup_free_
char *v
= NULL
;
1470 if (!u
->cgroup_path
)
1473 if ((u
->cgroup_realized_mask
& CGROUP_MASK_CPUACCT
) == 0)
1476 r
= cg_get_attribute("cpuacct", u
->cgroup_path
, "cpuacct.usage", &v
);
1482 r
= safe_atou64(v
, &ns
);
1490 int unit_get_cpu_usage(Unit
*u
, nsec_t
*ret
) {
1494 r
= unit_get_cpu_usage_raw(u
, &ns
);
1498 if (ns
> u
->cpuacct_usage_base
)
1499 ns
-= u
->cpuacct_usage_base
;
1507 int unit_reset_cpu_usage(Unit
*u
) {
1513 r
= unit_get_cpu_usage_raw(u
, &ns
);
1515 u
->cpuacct_usage_base
= 0;
1519 u
->cpuacct_usage_base
= ns
;
1523 bool unit_cgroup_delegate(Unit
*u
) {
1528 c
= unit_get_cgroup_context(u
);
1535 static const char* const cgroup_device_policy_table
[_CGROUP_DEVICE_POLICY_MAX
] = {
1536 [CGROUP_AUTO
] = "auto",
1537 [CGROUP_CLOSED
] = "closed",
1538 [CGROUP_STRICT
] = "strict",
1541 DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy
, CGroupDevicePolicy
);