src/core/cgroup.c

   1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
   2
   3 /***
   4   This file is part of systemd.
   5
   6   Copyright 2013 Lennart Poettering
   7
   8   systemd is free software; you can redistribute it and/or modify it
   9   under the terms of the GNU Lesser General Public License as published by
  10   the Free Software Foundation; either version 2.1 of the License, or
  11   (at your option) any later version.
  12
  13   systemd is distributed in the hope that it will be useful, but
  14   WITHOUT ANY WARRANTY; without even the implied warranty of
  15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  16   Lesser General Public License for more details.
  17
  18   You should have received a copy of the GNU Lesser General Public License
  19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
  20 ***/
  21
  22 #include <fcntl.h>
  23 #include <fnmatch.h>
  24
  25 #include "process-util.h"
  26 #include "path-util.h"
  27 #include "special.h"
  28 #include "cgroup-util.h"
  29 #include "cgroup.h"
  30
  31 #define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
  32
  33 void cgroup_context_init(CGroupContext *c) {
  34         assert(c);
  35
  36         /* Initialize everything to the kernel defaults, assuming the
  37          * structure is preinitialized to 0 */
  38
  39         c->cpu_shares = (unsigned long) -1;
  40         c->startup_cpu_shares = (unsigned long) -1;
  41         c->memory_limit = (uint64_t) -1;
  42         c->blockio_weight = (unsigned long) -1;
  43         c->startup_blockio_weight = (unsigned long) -1;
  44
  45         c->cpu_quota_per_sec_usec = USEC_INFINITY;
  46 }
  47
  48 void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
  49         assert(c);
  50         assert(a);
  51
  52         LIST_REMOVE(device_allow, c->device_allow, a);
  53         free(a->path);
  54         free(a);
  55 }
  56
  57 void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
  58         assert(c);
  59         assert(w);
  60
  61         LIST_REMOVE(device_weights, c->blockio_device_weights, w);
  62         free(w->path);
  63         free(w);
  64 }
  65
  66 void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
  67         assert(c);
  68         assert(b);
  69
  70         LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
  71         free(b->path);
  72         free(b);
  73 }
  74
  75 void cgroup_context_done(CGroupContext *c) {
  76         assert(c);
  77
  78         while (c->blockio_device_weights)
  79                 cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
  80
  81         while (c->blockio_device_bandwidths)
  82                 cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);
  83
  84         while (c->device_allow)
  85                 cgroup_context_free_device_allow(c, c->device_allow);
  86 }
  87
  88 void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
  89         CGroupBlockIODeviceBandwidth *b;
  90         CGroupBlockIODeviceWeight *w;
  91         CGroupDeviceAllow *a;
  92         char u[FORMAT_TIMESPAN_MAX];
  93
  94         assert(c);
  95         assert(f);
  96
  97         prefix = strempty(prefix);
  98
  99         fprintf(f,
 100                 "%sCPUAccounting=%s\n"
 101                 "%sBlockIOAccounting=%s\n"
 102                 "%sMemoryAccounting=%s\n"
 103                 "%sCPUShares=%lu\n"
 104                 "%sStartupCPUShares=%lu\n"
 105                 "%sCPUQuotaPerSecSec=%s\n"
 106                 "%sBlockIOWeight=%lu\n"
 107                 "%sStartupBlockIOWeight=%lu\n"
 108                 "%sMemoryLimit=%" PRIu64 "\n"
 109                 "%sDevicePolicy=%s\n"
 110                 "%sDelegate=%s\n",
 111                 prefix, yes_no(c->cpu_accounting),
 112                 prefix, yes_no(c->blockio_accounting),
 113                 prefix, yes_no(c->memory_accounting),
 114                 prefix, c->cpu_shares,
 115                 prefix, c->startup_cpu_shares,
 116                 prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
 117                 prefix, c->blockio_weight,
 118                 prefix, c->startup_blockio_weight,
 119                 prefix, c->memory_limit,
 120                 prefix, cgroup_device_policy_to_string(c->device_policy),
 121                 prefix, yes_no(c->delegate));
 122
 123         LIST_FOREACH(device_allow, a, c->device_allow)
 124                 fprintf(f,
 125                         "%sDeviceAllow=%s %s%s%s\n",
 126                         prefix,
 127                         a->path,
 128                         a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
 129
 130         LIST_FOREACH(device_weights, w, c->blockio_device_weights)
 131                 fprintf(f,
 132                         "%sBlockIODeviceWeight=%s %lu",
 133                         prefix,
 134                         w->path,
 135                         w->weight);
 136
 137         LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
 138                 char buf[FORMAT_BYTES_MAX];
 139
 140                 fprintf(f,
 141                         "%s%s=%s %s\n",
 142                         prefix,
 143                         b->read ? "BlockIOReadBandwidth" : "BlockIOWriteBandwidth",
 144                         b->path,
 145                         format_bytes(buf, sizeof(buf), b->bandwidth));
 146         }
 147 }
 148
 149 static int lookup_blkio_device(const char *p, dev_t *dev) {
 150         struct stat st;
 151         int r;
 152
 153         assert(p);
 154         assert(dev);
 155
 156         r = stat(p, &st);
 157         if (r < 0)
 158                 return log_warning_errno(errno, "Couldn't stat device %s: %m", p);
 159
 160         if (S_ISBLK(st.st_mode))
 161                 *dev = st.st_rdev;
 162         else if (major(st.st_dev) != 0) {
 163                 /* If this is not a device node then find the block
 164                  * device this file is stored on */
 165                 *dev = st.st_dev;
 166
 167                 /* If this is a partition, try to get the originating
 168                  * block device */
 169                 block_get_whole_disk(*dev, dev);
 170         } else {
 171                 log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
 172                 return -ENODEV;
 173         }
 174
 175         return 0;
 176 }
 177
 178 static int whitelist_device(const char *path, const char *node, const char *acc) {
 179         char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
 180         struct stat st;
 181         int r;
 182
 183         assert(path);
 184         assert(acc);
 185
 186         if (stat(node, &st) < 0) {
 187                 log_warning("Couldn't stat device %s", node);
 188                 return -errno;
 189         }
 190
 191         if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
 192                 log_warning("%s is not a device.", node);
 193                 return -ENODEV;
 194         }
 195
 196         sprintf(buf,
 197                 "%c %u:%u %s",
 198                 S_ISCHR(st.st_mode) ? 'c' : 'b',
 199                 major(st.st_rdev), minor(st.st_rdev),
 200                 acc);
 201
 202         r = cg_set_attribute("devices", path, "devices.allow", buf);
 203         if (r < 0)
 204                 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
 205                                "Failed to set devices.allow on %s: %m", path);
 206
 207         return r;
 208 }
 209
 210 static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
 211         _cleanup_fclose_ FILE *f = NULL;
 212         char line[LINE_MAX];
 213         bool good = false;
 214         int r;
 215
 216         assert(path);
 217         assert(acc);
 218         assert(type == 'b' || type == 'c');
 219
 220         f = fopen("/proc/devices", "re");
 221         if (!f)
 222                 return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);
 223
 224         FOREACH_LINE(line, f, goto fail) {
 225                 char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
 226                 unsigned maj;
 227
 228                 truncate_nl(line);
 229
 230                 if (type == 'c' && streq(line, "Character devices:")) {
 231                         good = true;
 232                         continue;
 233                 }
 234
 235                 if (type == 'b' && streq(line, "Block devices:")) {
 236                         good = true;
 237                         continue;
 238                 }
 239
 240                 if (isempty(line)) {
 241                         good = false;
 242                         continue;
 243                 }
 244
 245                 if (!good)
 246                         continue;
 247
 248                 p = strstrip(line);
 249
 250                 w = strpbrk(p, WHITESPACE);
 251                 if (!w)
 252                         continue;
 253                 *w = 0;
 254
 255                 r = safe_atou(p, &maj);
 256                 if (r < 0)
 257                         continue;
 258                 if (maj <= 0)
 259                         continue;
 260
 261                 w++;
 262                 w += strspn(w, WHITESPACE);
 263
 264                 if (fnmatch(name, w, 0) != 0)
 265                         continue;
 266
 267                 sprintf(buf,
 268                         "%c %u:* %s",
 269                         type,
 270                         maj,
 271                         acc);
 272
 273                 r = cg_set_attribute("devices", path, "devices.allow", buf);
 274                 if (r < 0)
 275                         log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
 276                                        "Failed to set devices.allow on %s: %m", path);
 277         }
 278
 279         return 0;
 280
 281 fail:
 282         log_warning_errno(errno, "Failed to read /proc/devices: %m");
 283         return -errno;
 284 }
 285
 286 void cgroup_context_apply(CGroupContext *c, CGroupControllerMask mask, const char *path, ManagerState state) {
 287         bool is_root;
 288         int r;
 289
 290         assert(c);
 291         assert(path);
 292
 293         if (mask == 0)
 294                 return;
 295
 296         /* Some cgroup attributes are not supported on the root cgroup,
 297          * hence silently ignore */
 298         is_root = isempty(path) || path_equal(path, "/");
 299         if (is_root)
 300                 /* Make sure we don't try to display messages with an empty path. */
 301                 path = "/";
 302
 303         /* We generally ignore errors caused by read-only mounted
 304          * cgroup trees (assuming we are running in a container then),
 305          * and missing cgroups, i.e. EROFS and ENOENT. */
 306
 307         if ((mask & CGROUP_CPU) && !is_root) {
 308                 char buf[MAX(DECIMAL_STR_MAX(unsigned long), DECIMAL_STR_MAX(usec_t)) + 1];
 309
 310                 sprintf(buf, "%lu\n",
 311                         IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_cpu_shares != (unsigned long) -1 ? c->startup_cpu_shares :
 312                         c->cpu_shares != (unsigned long) -1 ? c->cpu_shares : 1024);
 313                 r = cg_set_attribute("cpu", path, "cpu.shares", buf);
 314                 if (r < 0)
 315                         log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
 316                                        "Failed to set cpu.shares on %s: %m", path);
 317
 318                 sprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
 319                 r = cg_set_attribute("cpu", path, "cpu.cfs_period_us", buf);
 320                 if (r < 0)
 321                         log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
 322                                        "Failed to set cpu.cfs_period_us on %s: %m", path);
 323
 324                 if (c->cpu_quota_per_sec_usec != USEC_INFINITY) {
 325                         sprintf(buf, USEC_FMT "\n", c->cpu_quota_per_sec_usec * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
 326                         r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", buf);
 327                 } else
 328                         r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", "-1");
 329                 if (r < 0)
 330                         log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
 331                                        "Failed to set cpu.cfs_quota_us on %s: %m", path);
 332         }
 333
 334         if (mask & CGROUP_BLKIO) {
 335                 char buf[MAX3(DECIMAL_STR_MAX(unsigned long)+1,
 336                               DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(unsigned long)*1,
 337                               DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
 338                 CGroupBlockIODeviceWeight *w;
 339                 CGroupBlockIODeviceBandwidth *b;
 340
 341                 if (!is_root) {
 342                         sprintf(buf, "%lu\n", IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_blockio_weight != (unsigned long) -1 ? c->startup_blockio_weight :
 343                                 c->blockio_weight != (unsigned long) -1 ? c->blockio_weight : 1000);
 344                         r = cg_set_attribute("blkio", path, "blkio.weight", buf);
 345                         if (r < 0)
 346                                 log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
 347                                                "Failed to set blkio.weight on %s: %m", path);
 348
 349                         /* FIXME: no way to reset this list */
 350                         LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
 351                                 dev_t dev;
 352
 353                                 r = lookup_blkio_device(w->path, &dev);
 354                                 if (r < 0)
 355                                         continue;
 356
 357                                 sprintf(buf, "%u:%u %lu", major(dev), minor(dev), w->weight);
 358                                 r = cg_set_attribute("blkio", path, "blkio.weight_device", buf);
 359                                 if (r < 0)
 360                                         log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
 361                                                        "Failed to set blkio.weight_device on %s: %m", path);
 362                         }
 363                 }
 364
 365                 /* FIXME: no way to reset this list */
 366                 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
 367                         const char *a;
 368                         dev_t dev;
 369
 370                         r = lookup_blkio_device(b->path, &dev);
 371                         if (r < 0)
 372                                 continue;
 373
 374                         a = b->read ? "blkio.throttle.read_bps_device" : "blkio.throttle.write_bps_device";
 375
 376                         sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), b->bandwidth);
 377                         r = cg_set_attribute("blkio", path, a, buf);
 378                         if (r < 0)
 379                                 log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
 380                                                "Failed to set %s on %s: %m", a, path);
 381                 }
 382         }
 383
 384         if ((mask & CGROUP_MEMORY) && !is_root) {
 385                 if (c->memory_limit != (uint64_t) -1) {
 386                         char buf[DECIMAL_STR_MAX(uint64_t) + 1];
 387
 388                         sprintf(buf, "%" PRIu64 "\n", c->memory_limit);
 389                         r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
 390                 } else
 391                         r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");
 392
 393                 if (r < 0)
 394                         log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
 395                                        "Failed to set memory.limit_in_bytes on %s: %m", path);
 396         }
 397
 398         if ((mask & CGROUP_DEVICE) && !is_root) {
 399                 CGroupDeviceAllow *a;
 400
 401                 /* Changing the devices list of a populated cgroup
 402                  * might result in EINVAL, hence ignore EINVAL
 403                  * here. */
 404
 405                 if (c->device_allow || c->device_policy != CGROUP_AUTO)
 406                         r = cg_set_attribute("devices", path, "devices.deny", "a");
 407                 else
 408                         r = cg_set_attribute("devices", path, "devices.allow", "a");
 409                 if (r < 0)
 410                         log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
 411                                        "Failed to reset devices.list on %s: %m", path);
 412
 413                 if (c->device_policy == CGROUP_CLOSED ||
 414                     (c->device_policy == CGROUP_AUTO && c->device_allow)) {
 415                         static const char auto_devices[] =
 416                                 "/dev/null\0" "rwm\0"
 417                                 "/dev/zero\0" "rwm\0"
 418                                 "/dev/full\0" "rwm\0"
 419                                 "/dev/random\0" "rwm\0"
 420                                 "/dev/urandom\0" "rwm\0"
 421                                 "/dev/tty\0" "rwm\0"
 422                                 "/dev/pts/ptmx\0" "rw\0"; /* /dev/pts/ptmx may not be duplicated, but accessed */
 423
 424                         const char *x, *y;
 425
 426                         NULSTR_FOREACH_PAIR(x, y, auto_devices)
 427                                 whitelist_device(path, x, y);
 428
 429                         whitelist_major(path, "pts", 'c', "rw");
 430                         whitelist_major(path, "kdbus", 'c', "rw");
 431                         whitelist_major(path, "kdbus/*", 'c', "rw");
 432                 }
 433
 434                 LIST_FOREACH(device_allow, a, c->device_allow) {
 435                         char acc[4];
 436                         unsigned k = 0;
 437
 438                         if (a->r)
 439                                 acc[k++] = 'r';
 440                         if (a->w)
 441                                 acc[k++] = 'w';
 442                         if (a->m)
 443                                 acc[k++] = 'm';
 444
 445                         if (k == 0)
 446                                 continue;
 447
 448                         acc[k++] = 0;
 449
 450                         if (startswith(a->path, "/dev/"))
 451                                 whitelist_device(path, a->path, acc);
 452                         else if (startswith(a->path, "block-"))
 453                                 whitelist_major(path, a->path + 6, 'b', acc);
 454                         else if (startswith(a->path, "char-"))
 455                                 whitelist_major(path, a->path + 5, 'c', acc);
 456                         else
 457                                 log_debug("Ignoring device %s while writing cgroup attribute.", a->path);
 458                 }
 459         }
 460 }
 461
 462 CGroupControllerMask cgroup_context_get_mask(CGroupContext *c) {
 463         CGroupControllerMask mask = 0;
 464
 465         /* Figure out which controllers we need */
 466
 467         if (c->cpu_accounting ||
 468             c->cpu_shares != (unsigned long) -1 ||
 469             c->startup_cpu_shares != (unsigned long) -1 ||
 470             c->cpu_quota_per_sec_usec != USEC_INFINITY)
 471                 mask |= CGROUP_CPUACCT | CGROUP_CPU;
 472
 473         if (c->blockio_accounting ||
 474             c->blockio_weight != (unsigned long) -1 ||
 475             c->startup_blockio_weight != (unsigned long) -1 ||
 476             c->blockio_device_weights ||
 477             c->blockio_device_bandwidths)
 478                 mask |= CGROUP_BLKIO;
 479
 480         if (c->memory_accounting ||
 481             c->memory_limit != (uint64_t) -1)
 482                 mask |= CGROUP_MEMORY;
 483
 484         if (c->device_allow ||
 485             c->device_policy != CGROUP_AUTO)
 486                 mask |= CGROUP_DEVICE;
 487
 488         return mask;
 489 }
 490
 491 CGroupControllerMask unit_get_cgroup_mask(Unit *u) {
 492         CGroupContext *c;
 493
 494         c = unit_get_cgroup_context(u);
 495         if (!c)
 496                 return 0;
 497
 498         /* If delegation is turned on, then turn on all cgroups,
 499          * unless the process we fork into it is known to drop
 500          * privileges anyway, and shouldn't get access to the
 501          * controllers anyway. */
 502
 503         if (c->delegate) {
 504                 ExecContext *e;
 505
 506                 e = unit_get_exec_context(u);
 507                 if (!e || exec_context_maintains_privileges(e))
 508                         return _CGROUP_CONTROLLER_MASK_ALL;
 509         }
 510
 511         return cgroup_context_get_mask(c);
 512 }
 513
 514 CGroupControllerMask unit_get_members_mask(Unit *u) {
 515         assert(u);
 516
 517         if (u->cgroup_members_mask_valid)
 518                 return u->cgroup_members_mask;
 519
 520         u->cgroup_members_mask = 0;
 521
 522         if (u->type == UNIT_SLICE) {
 523                 Unit *member;
 524                 Iterator i;
 525
 526                 SET_FOREACH(member, u->dependencies[UNIT_BEFORE], i) {
 527
 528                         if (member == u)
 529                                 continue;
 530
 531                         if (UNIT_DEREF(member->slice) != u)
 532                                 continue;
 533
 534                         u->cgroup_members_mask |=
 535                                 unit_get_cgroup_mask(member) |
 536                                 unit_get_members_mask(member);
 537                 }
 538         }
 539
 540         u->cgroup_members_mask_valid = true;
 541         return u->cgroup_members_mask;
 542 }
 543
 544 CGroupControllerMask unit_get_siblings_mask(Unit *u) {
 545         assert(u);
 546
 547         if (UNIT_ISSET(u->slice))
 548                 return unit_get_members_mask(UNIT_DEREF(u->slice));
 549
 550         return unit_get_cgroup_mask(u) | unit_get_members_mask(u);
 551 }
 552
 553 CGroupControllerMask unit_get_target_mask(Unit *u) {
 554         CGroupControllerMask mask;
 555
 556         mask = unit_get_cgroup_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
 557         mask &= u->manager->cgroup_supported;
 558
 559         return mask;
 560 }
 561
 562 /* Recurse from a unit up through its containing slices, propagating
 563  * mask bits upward. A unit is also member of itself. */
 564 void unit_update_cgroup_members_masks(Unit *u) {
 565         CGroupControllerMask m;
 566         bool more;
 567
 568         assert(u);
 569
 570         /* Calculate subtree mask */
 571         m = unit_get_cgroup_mask(u) | unit_get_members_mask(u);
 572
 573         /* See if anything changed from the previous invocation. If
 574          * not, we're done. */
 575         if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
 576                 return;
 577
 578         more =
 579                 u->cgroup_subtree_mask_valid &&
 580                 ((m & ~u->cgroup_subtree_mask) != 0) &&
 581                 ((~m & u->cgroup_subtree_mask) == 0);
 582
 583         u->cgroup_subtree_mask = m;
 584         u->cgroup_subtree_mask_valid = true;
 585
 586         if (UNIT_ISSET(u->slice)) {
 587                 Unit *s = UNIT_DEREF(u->slice);
 588
 589                 if (more)
 590                         /* There's more set now than before. We
 591                          * propagate the new mask to the parent's mask
 592                          * (not caring if it actually was valid or
 593                          * not). */
 594
 595                         s->cgroup_members_mask |= m;
 596
 597                 else
 598                         /* There's less set now than before (or we
 599                          * don't know), we need to recalculate
 600                          * everything, so let's invalidate the
 601                          * parent's members mask */
 602
 603                         s->cgroup_members_mask_valid = false;
 604
 605                 /* And now make sure that this change also hits our
 606                  * grandparents */
 607                 unit_update_cgroup_members_masks(s);
 608         }
 609 }
 610
 611 static const char *migrate_callback(CGroupControllerMask mask, void *userdata) {
 612         Unit *u = userdata;
 613
 614         assert(mask != 0);
 615         assert(u);
 616
 617         while (u) {
 618                 if (u->cgroup_path &&
 619                     u->cgroup_realized &&
 620                     (u->cgroup_realized_mask & mask) == mask)
 621                         return u->cgroup_path;
 622
 623                 u = UNIT_DEREF(u->slice);
 624         }
 625
 626         return NULL;
 627 }
 628
 629 static int unit_create_cgroups(Unit *u, CGroupControllerMask mask) {
 630         CGroupContext *c;
 631         int r;
 632
 633         assert(u);
 634
 635         c = unit_get_cgroup_context(u);
 636         if (!c)
 637                 return 0;
 638
 639         if (!u->cgroup_path) {
 640                 _cleanup_free_ char *path = NULL;
 641
 642                 path = unit_default_cgroup_path(u);
 643                 if (!path)
 644                         return log_oom();
 645
 646                 r = hashmap_put(u->manager->cgroup_unit, path, u);
 647                 if (r < 0) {
 648                         log_error(r == -EEXIST ? "cgroup %s exists already: %s" : "hashmap_put failed for %s: %s", path, strerror(-r));
 649                         return r;
 650                 }
 651                 if (r > 0) {
 652                         u->cgroup_path = path;
 653                         path = NULL;
 654                 }
 655         }
 656
 657         /* First, create our own group */
 658         r = cg_create_everywhere(u->manager->cgroup_supported, mask, u->cgroup_path);
 659         if (r < 0)
 660                 return log_error_errno(r, "Failed to create cgroup %s: %m", u->cgroup_path);
 661
 662         /* Keep track that this is now realized */
 663         u->cgroup_realized = true;
 664         u->cgroup_realized_mask = mask;
 665
 666         if (u->type != UNIT_SLICE && !c->delegate) {
 667
 668                 /* Then, possibly move things over, but not if
 669                  * subgroups may contain processes, which is the case
 670                  * for slice and delegation units. */
 671                 r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
 672                 if (r < 0)
 673                         log_warning_errno(r, "Failed to migrate cgroup from to %s: %m", u->cgroup_path);
 674         }
 675
 676         return 0;
 677 }
 678
 679 int unit_attach_pids_to_cgroup(Unit *u) {
 680         int r;
 681         assert(u);
 682
 683         r = unit_realize_cgroup(u);
 684         if (r < 0)
 685                 return r;
 686
 687         r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->pids, migrate_callback, u);
 688         if (r < 0)
 689                 return r;
 690
 691         return 0;
 692 }
 693
 694 static bool unit_has_mask_realized(Unit *u, CGroupControllerMask mask) {
 695         assert(u);
 696
 697         return u->cgroup_realized && u->cgroup_realized_mask == mask;
 698 }
 699
 700 /* Check if necessary controllers and attributes for a unit are in place.
 701  *
 702  * If so, do nothing.
 703  * If not, create paths, move processes over, and set attributes.
 704  *
 705  * Returns 0 on success and < 0 on failure. */
 706 static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
 707         CGroupControllerMask mask;
 708         int r;
 709
 710         assert(u);
 711
 712         if (u->in_cgroup_queue) {
 713                 LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
 714                 u->in_cgroup_queue = false;
 715         }
 716
 717         mask = unit_get_target_mask(u);
 718
 719         if (unit_has_mask_realized(u, mask))
 720                 return 0;
 721
 722         /* First, realize parents */
 723         if (UNIT_ISSET(u->slice)) {
 724                 r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
 725                 if (r < 0)
 726                         return r;
 727         }
 728
 729         /* And then do the real work */
 730         r = unit_create_cgroups(u, mask);
 731         if (r < 0)
 732                 return r;
 733
 734         /* Finally, apply the necessary attributes. */
 735         cgroup_context_apply(unit_get_cgroup_context(u), mask, u->cgroup_path, state);
 736
 737         return 0;
 738 }
 739
 740 static void unit_add_to_cgroup_queue(Unit *u) {
 741
 742         if (u->in_cgroup_queue)
 743                 return;
 744
 745         LIST_PREPEND(cgroup_queue, u->manager->cgroup_queue, u);
 746         u->in_cgroup_queue = true;
 747 }
 748
 749 unsigned manager_dispatch_cgroup_queue(Manager *m) {
 750         ManagerState state;
 751         unsigned n = 0;
 752         Unit *i;
 753         int r;
 754
 755         state = manager_state(m);
 756
 757         while ((i = m->cgroup_queue)) {
 758                 assert(i->in_cgroup_queue);
 759
 760                 r = unit_realize_cgroup_now(i, state);
 761                 if (r < 0)
 762                         log_warning_errno(r, "Failed to realize cgroups for queued unit %s: %m", i->id);
 763
 764                 n++;
 765         }
 766
 767         return n;
 768 }
 769
 770 static void unit_queue_siblings(Unit *u) {
 771         Unit *slice;
 772
 773         /* This adds the siblings of the specified unit and the
 774          * siblings of all parent units to the cgroup queue. (But
 775          * neither the specified unit itself nor the parents.) */
 776
 777         while ((slice = UNIT_DEREF(u->slice))) {
 778                 Iterator i;
 779                 Unit *m;
 780
 781                 SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
 782                         if (m == u)
 783                                 continue;
 784
 785                         /* Skip units that have a dependency on the slice
 786                          * but aren't actually in it. */
 787                         if (UNIT_DEREF(m->slice) != slice)
 788                                 continue;
 789
 790                         /* No point in doing cgroup application for units
 791                          * without active processes. */
 792                         if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
 793                                 continue;
 794
 795                         /* If the unit doesn't need any new controllers
 796                          * and has current ones realized, it doesn't need
 797                          * any changes. */
 798                         if (unit_has_mask_realized(m, unit_get_target_mask(m)))
 799                                 continue;
 800
 801                         unit_add_to_cgroup_queue(m);
 802                 }
 803
 804                 u = slice;
 805         }
 806 }
 807
 808 int unit_realize_cgroup(Unit *u) {
 809         CGroupContext *c;
 810
 811         assert(u);
 812
 813         c = unit_get_cgroup_context(u);
 814         if (!c)
 815                 return 0;
 816
 817         /* So, here's the deal: when realizing the cgroups for this
 818          * unit, we need to first create all parents, but there's more
 819          * actually: for the weight-based controllers we also need to
 820          * make sure that all our siblings (i.e. units that are in the
 821          * same slice as we are) have cgroups, too. Otherwise, things
 822          * would become very uneven as each of their processes would
 823          * get as much resources as all our group together. This call
 824          * will synchronously create the parent cgroups, but will
 825          * defer work on the siblings to the next event loop
 826          * iteration. */
 827
 828         /* Add all sibling slices to the cgroup queue. */
 829         unit_queue_siblings(u);
 830
 831         /* And realize this one now (and apply the values) */
 832         return unit_realize_cgroup_now(u, manager_state(u->manager));
 833 }
 834
 835 void unit_destroy_cgroup_if_empty(Unit *u) {
 836         int r;
 837
 838         assert(u);
 839
 840         if (!u->cgroup_path)
 841                 return;
 842
 843         r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !unit_has_name(u, SPECIAL_ROOT_SLICE));
 844         if (r < 0) {
 845                 log_debug_errno(r, "Failed to destroy cgroup %s: %m", u->cgroup_path);
 846                 return;
 847         }
 848
 849         hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
 850
 851         free(u->cgroup_path);
 852         u->cgroup_path = NULL;
 853         u->cgroup_realized = false;
 854         u->cgroup_realized_mask = 0;
 855 }
 856
 857 pid_t unit_search_main_pid(Unit *u) {
 858         _cleanup_fclose_ FILE *f = NULL;
 859         pid_t pid = 0, npid, mypid;
 860
 861         assert(u);
 862
 863         if (!u->cgroup_path)
 864                 return 0;
 865
 866         if (cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f) < 0)
 867                 return 0;
 868
 869         mypid = getpid();
 870         while (cg_read_pid(f, &npid) > 0)  {
 871                 pid_t ppid;
 872
 873                 if (npid == pid)
 874                         continue;
 875
 876                 /* Ignore processes that aren't our kids */
 877                 if (get_parent_of_pid(npid, &ppid) >= 0 && ppid != mypid)
 878                         continue;
 879
 880                 if (pid != 0) {
 881                         /* Dang, there's more than one daemonized PID
 882                         in this group, so we don't know what process
 883                         is the main process. */
 884                         pid = 0;
 885                         break;
 886                 }
 887
 888                 pid = npid;
 889         }
 890
 891         return pid;
 892 }
 893
 894 int manager_setup_cgroup(Manager *m) {
 895         _cleanup_free_ char *path = NULL;
 896         int r;
 897
 898         assert(m);
 899
 900         /* 1. Determine hierarchy */
 901         free(m->cgroup_root);
 902         m->cgroup_root = NULL;
 903
 904         r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
 905         if (r < 0)
 906                 return log_error_errno(r, "Cannot determine cgroup we are running in: %m");
 907
 908         /* LEGACY: Already in /system.slice? If so, let's cut this
 909          * off. This is to support live upgrades from older systemd
 910          * versions where PID 1 was moved there. */
 911         if (m->running_as == MANAGER_SYSTEM) {
 912                 char *e;
 913
 914                 e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
 915                 if (!e)
 916                         e = endswith(m->cgroup_root, "/system");
 917                 if (e)
 918                         *e = 0;
 919         }
 920
 921         /* And make sure to store away the root value without trailing
 922          * slash, even for the root dir, so that we can easily prepend
 923          * it everywhere. */
 924         if (streq(m->cgroup_root, "/"))
 925                 m->cgroup_root[0] = 0;
 926
 927         /* 2. Show data */
 928         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
 929         if (r < 0)
 930                 return log_error_errno(r, "Cannot find cgroup mount point: %m");
 931
 932         log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);
 933         if (!m->test_run) {
 934
 935                 /* 3. Install agent */
 936                 if (m->running_as == MANAGER_SYSTEM) {
 937                         r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
 938                         if (r < 0)
 939                                 log_warning_errno(r, "Failed to install release agent, ignoring: %m");
 940                         else if (r > 0)
 941                                 log_debug("Installed release agent.");
 942                         else
 943                                 log_debug("Release agent already installed.");
 944                 }
 945
 946                 /* 4. Make sure we are in the root cgroup */
 947                 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, 0);
 948                 if (r < 0)
 949                         return log_error_errno(r, "Failed to create root cgroup hierarchy: %m");
 950
 951                 /* 5. And pin it, so that it cannot be unmounted */
 952                 safe_close(m->pin_cgroupfs_fd);
 953
 954                 m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
 955                 if (m->pin_cgroupfs_fd < 0)
 956                         return log_error_errno(errno, "Failed to open pin file: %m");
 957
 958                 /* 6.  Always enable hierarchical support if it exists... */
 959                 cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
 960         }
 961
 962         /* 7. Figure out which controllers are supported */
 963         m->cgroup_supported = cg_mask_supported();
 964
 965         return 0;
 966 }
 967
 968 void manager_shutdown_cgroup(Manager *m, bool delete) {
 969         assert(m);
 970
 971         /* We can't really delete the group, since we are in it. But
 972          * let's trim it. */
 973         if (delete && m->cgroup_root)
 974                 cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
 975
 976         m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);
 977
 978         free(m->cgroup_root);
 979         m->cgroup_root = NULL;
 980 }
 981
 982 Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
 983         char *p;
 984         Unit *u;
 985
 986         assert(m);
 987         assert(cgroup);
 988
 989         u = hashmap_get(m->cgroup_unit, cgroup);
 990         if (u)
 991                 return u;
 992
 993         p = strdupa(cgroup);
 994         for (;;) {
 995                 char *e;
 996
 997                 e = strrchr(p, '/');
 998                 if (e == p || !e)
 999                         return NULL;
1000
1001                 *e = 0;
1002
1003                 u = hashmap_get(m->cgroup_unit, p);
1004                 if (u)
1005                         return u;
1006         }
1007 }
1008
1009 Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
1010         _cleanup_free_ char *cgroup = NULL;
1011         int r;
1012
1013         assert(m);
1014
1015         if (pid <= 1)
1016                 return NULL;
1017
1018         r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
1019         if (r < 0)
1020                 return NULL;
1021
1022         return manager_get_unit_by_cgroup(m, cgroup);
1023 }
1024
1025 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
1026         Unit *u;
1027         int r;
1028
1029         assert(m);
1030         assert(cgroup);
1031
1032         u = manager_get_unit_by_cgroup(m, cgroup);
1033         if (!u)
1034                 return 0;
1035
1036         r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, true);
1037         if (r <= 0)
1038                 return r;
1039
1040         if (UNIT_VTABLE(u)->notify_cgroup_empty)
1041                 UNIT_VTABLE(u)->notify_cgroup_empty(u);
1042
1043         unit_add_to_gc_queue(u);
1044         return 0;
1045 }
1046
1047 int unit_get_memory_current(Unit *u, uint64_t *ret) {
1048         _cleanup_free_ char *v = NULL;
1049         int r;
1050
1051         assert(u);
1052         assert(ret);
1053
1054         if (!u->cgroup_path)
1055                 return -ENODATA;
1056
1057         if ((u->cgroup_realized_mask & CGROUP_MEMORY) == 0)
1058                 return -ENODATA;
1059
1060         r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
1061         if (r == -ENOENT)
1062                 return -ENODATA;
1063         if (r < 0)
1064                 return r;
1065
1066         return safe_atou64(v, ret);
1067 }
1068
1069 static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
1070         _cleanup_free_ char *v = NULL;
1071         uint64_t ns;
1072         int r;
1073
1074         assert(u);
1075         assert(ret);
1076
1077         if (!u->cgroup_path)
1078                 return -ENODATA;
1079
1080         if ((u->cgroup_realized_mask & CGROUP_CPUACCT) == 0)
1081                 return -ENODATA;
1082
1083         r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);
1084         if (r == -ENOENT)
1085                 return -ENODATA;
1086         if (r < 0)
1087                 return r;
1088
1089         r = safe_atou64(v, &ns);
1090         if (r < 0)
1091                 return r;
1092
1093         *ret = ns;
1094         return 0;
1095 }
1096
1097 int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
1098         nsec_t ns;
1099         int r;
1100
1101         r = unit_get_cpu_usage_raw(u, &ns);
1102         if (r < 0)
1103                 return r;
1104
1105         if (ns > u->cpuacct_usage_base)
1106                 ns -= u->cpuacct_usage_base;
1107         else
1108                 ns = 0;
1109
1110         *ret = ns;
1111         return 0;
1112 }
1113
1114 int unit_reset_cpu_usage(Unit *u) {
1115         nsec_t ns;
1116         int r;
1117
1118         assert(u);
1119
1120         r = unit_get_cpu_usage_raw(u, &ns);
1121         if (r < 0) {
1122                 u->cpuacct_usage_base = 0;
1123                 return r;
1124         }
1125
1126         u->cpuacct_usage_base = ns;
1127         return 0;
1128 }
1129
1130 static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
1131         [CGROUP_AUTO] = "auto",
1132         [CGROUP_CLOSED] = "closed",
1133         [CGROUP_STRICT] = "strict",
1134 };
1135
1136 DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);