/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/

/***
  This file is part of systemd.

  Copyright 2013 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/

#include <fcntl.h>
#include <fnmatch.h>

#include "process-util.h"
#include "path-util.h"
#include "special.h"
#include "cgroup-util.h"
#include "cgroup.h"

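/* The CFS period we program is fixed at 100ms. In cgroup_context_apply()
 * the configured per-second quota is scaled to it as
 *
 *         cpu.cfs_quota_us = cpu_quota_per_sec_usec * period / USEC_PER_SEC
 *
 * so e.g. a quota of 200ms of CPU time per second comes out as 20ms of
 * runtime per 100ms period. */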
#define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)

void cgroup_context_init(CGroupContext *c) {
        assert(c);

        /* Initialize everything to the kernel defaults, assuming the
         * structure is preinitialized to 0 */

        c->cpu_shares = (unsigned long) -1;
        c->startup_cpu_shares = (unsigned long) -1;
        c->memory_limit = (uint64_t) -1;
        c->blockio_weight = (unsigned long) -1;
        c->startup_blockio_weight = (unsigned long) -1;

        c->cpu_quota_per_sec_usec = USEC_INFINITY;
}

void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
        assert(c);
        assert(a);

        LIST_REMOVE(device_allow, c->device_allow, a);
        free(a->path);
        free(a);
}

void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
        assert(c);
        assert(w);

        LIST_REMOVE(device_weights, c->blockio_device_weights, w);
        free(w->path);
        free(w);
}

void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
        assert(c);
        assert(b);

        LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
        free(b->path);
        free(b);
}

void cgroup_context_done(CGroupContext *c) {
        assert(c);

        while (c->blockio_device_weights)
                cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);

        while (c->blockio_device_bandwidths)
                cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);

        while (c->device_allow)
                cgroup_context_free_device_allow(c, c->device_allow);
}

void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
        CGroupBlockIODeviceBandwidth *b;
        CGroupBlockIODeviceWeight *w;
        CGroupDeviceAllow *a;
        char u[FORMAT_TIMESPAN_MAX];

        assert(c);
        assert(f);

        prefix = strempty(prefix);

        fprintf(f,
                "%sCPUAccounting=%s\n"
                "%sBlockIOAccounting=%s\n"
                "%sMemoryAccounting=%s\n"
                "%sCPUShares=%lu\n"
                "%sStartupCPUShares=%lu\n"
                "%sCPUQuotaPerSecSec=%s\n"
                "%sBlockIOWeight=%lu\n"
                "%sStartupBlockIOWeight=%lu\n"
                "%sMemoryLimit=%" PRIu64 "\n"
                "%sDevicePolicy=%s\n"
                "%sDelegate=%s\n",
                prefix, yes_no(c->cpu_accounting),
                prefix, yes_no(c->blockio_accounting),
                prefix, yes_no(c->memory_accounting),
                prefix, c->cpu_shares,
                prefix, c->startup_cpu_shares,
                prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
                prefix, c->blockio_weight,
                prefix, c->startup_blockio_weight,
                prefix, c->memory_limit,
                prefix, cgroup_device_policy_to_string(c->device_policy),
                prefix, yes_no(c->delegate));

        LIST_FOREACH(device_allow, a, c->device_allow)
                fprintf(f,
                        "%sDeviceAllow=%s %s%s%s\n",
                        prefix,
                        a->path,
                        a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");

        LIST_FOREACH(device_weights, w, c->blockio_device_weights)
                fprintf(f,
                        "%sBlockIODeviceWeight=%s %lu\n",
                        prefix,
                        w->path,
                        w->weight);

        LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                char buf[FORMAT_BYTES_MAX];

                fprintf(f,
                        "%s%s=%s %s\n",
                        prefix,
                        b->read ? "BlockIOReadBandwidth" : "BlockIOWriteBandwidth",
                        b->path,
                        format_bytes(buf, sizeof(buf), b->bandwidth));
        }
}

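/* Resolve a path to the block device backing it: if the path is a
 * block device node, use it directly; otherwise take the device of
 * the file system the file lives on, reduced to the whole disk if it
 * happens to be a partition. */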
static int lookup_blkio_device(const char *p, dev_t *dev) {
        struct stat st;
        int r;

        assert(p);
        assert(dev);

        r = stat(p, &st);
        if (r < 0)
                return log_warning_errno(errno, "Couldn't stat device %s: %m", p);

        if (S_ISBLK(st.st_mode))
                *dev = st.st_rdev;
        else if (major(st.st_dev) != 0) {
                /* If this is not a device node then find the block
                 * device this file is stored on */
                *dev = st.st_dev;

                /* If this is a partition, try to get the originating
                 * block device */
                block_get_whole_disk(*dev, dev);
        } else {
                log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
                return -ENODEV;
        }

        return 0;
}

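/* Add a single device node to the "devices" cgroup whitelist, by
 * writing an entry of the form "c MAJOR:MINOR rwm" (or "b ..." for
 * block devices) to devices.allow. */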
static int whitelist_device(const char *path, const char *node, const char *acc) {
        char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
        struct stat st;
        int r;

        assert(path);
        assert(acc);

        if (stat(node, &st) < 0)
                return log_warning_errno(errno, "Couldn't stat device %s: %m", node);

        if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
                log_warning("%s is not a device.", node);
                return -ENODEV;
        }

        sprintf(buf,
                "%c %u:%u %s",
                S_ISCHR(st.st_mode) ? 'c' : 'b',
                major(st.st_rdev), minor(st.st_rdev),
                acc);

        r = cg_set_attribute("devices", path, "devices.allow", buf);
        if (r < 0)
                log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
                               "Failed to set devices.allow on %s: %m", path);

        return r;
}

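/* Whitelist all devices sharing a major number: scan /proc/devices
 * for majors whose name matches the given fnmatch() pattern, and
 * write a "TYPE MAJOR:* acc" entry to devices.allow for each. */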
static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
        _cleanup_fclose_ FILE *f = NULL;
        char line[LINE_MAX];
        bool good = false;
        int r;

        assert(path);
        assert(acc);
        assert(type == 'b' || type == 'c');

        f = fopen("/proc/devices", "re");
        if (!f)
                return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);

        FOREACH_LINE(line, f, goto fail) {
                char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
                unsigned maj;

                truncate_nl(line);

                if (type == 'c' && streq(line, "Character devices:")) {
                        good = true;
                        continue;
                }

                if (type == 'b' && streq(line, "Block devices:")) {
                        good = true;
                        continue;
                }

                if (isempty(line)) {
                        good = false;
                        continue;
                }

                if (!good)
                        continue;

                p = strstrip(line);

                w = strpbrk(p, WHITESPACE);
                if (!w)
                        continue;
                *w = 0;

                r = safe_atou(p, &maj);
                if (r < 0)
                        continue;
                if (maj <= 0)
                        continue;

                w++;
                w += strspn(w, WHITESPACE);

                if (fnmatch(name, w, 0) != 0)
                        continue;

                sprintf(buf,
                        "%c %u:* %s",
                        type,
                        maj,
                        acc);

                r = cg_set_attribute("devices", path, "devices.allow", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set devices.allow on %s: %m", path);
        }

        return 0;

fail:
        log_warning_errno(errno, "Failed to read /proc/devices: %m");
        return -errno;
}

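/* Write the attributes described by the CGroupContext to the kernel,
 * for every hierarchy selected in the mask. Attributes that don't
 * exist on the root cgroup are silently skipped there. */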
void cgroup_context_apply(CGroupContext *c, CGroupControllerMask mask, const char *path, ManagerState state) {
        bool is_root;
        int r;

        assert(c);
        assert(path);

        if (mask == 0)
                return;

        /* Some cgroup attributes are not supported on the root cgroup,
         * hence silently ignore */
        is_root = isempty(path) || path_equal(path, "/");
        if (is_root)
                /* Make sure we don't try to display messages with an empty path. */
                path = "/";

        /* We generally ignore errors caused by read-only mounted
         * cgroup trees (assuming we are running in a container then),
         * and missing cgroups, i.e. EROFS and ENOENT. */

        if ((mask & CGROUP_CPU) && !is_root) {
                char buf[MAX(DECIMAL_STR_MAX(unsigned long), DECIMAL_STR_MAX(usec_t)) + 1];

                sprintf(buf, "%lu\n",
                        IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_cpu_shares != (unsigned long) -1 ? c->startup_cpu_shares :
                        c->cpu_shares != (unsigned long) -1 ? c->cpu_shares : 1024);
                r = cg_set_attribute("cpu", path, "cpu.shares", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set cpu.shares on %s: %m", path);

                sprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
                r = cg_set_attribute("cpu", path, "cpu.cfs_period_us", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set cpu.cfs_period_us on %s: %m", path);

                if (c->cpu_quota_per_sec_usec != USEC_INFINITY) {
                        sprintf(buf, USEC_FMT "\n", c->cpu_quota_per_sec_usec * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
                        r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", buf);
                } else
                        r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", "-1");
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set cpu.cfs_quota_us on %s: %m", path);
        }

        if (mask & CGROUP_BLKIO) {
                char buf[MAX3(DECIMAL_STR_MAX(unsigned long)+1,
                              DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(unsigned long)*1,
                              DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
                CGroupBlockIODeviceWeight *w;
                CGroupBlockIODeviceBandwidth *b;

                if (!is_root) {
                        sprintf(buf, "%lu\n", IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_blockio_weight != (unsigned long) -1 ? c->startup_blockio_weight :
                                c->blockio_weight != (unsigned long) -1 ? c->blockio_weight : 1000);
                        r = cg_set_attribute("blkio", path, "blkio.weight", buf);
                        if (r < 0)
                                log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                               "Failed to set blkio.weight on %s: %m", path);

                        /* FIXME: no way to reset this list */
                        LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
                                dev_t dev;

                                r = lookup_blkio_device(w->path, &dev);
                                if (r < 0)
                                        continue;

                                sprintf(buf, "%u:%u %lu", major(dev), minor(dev), w->weight);
                                r = cg_set_attribute("blkio", path, "blkio.weight_device", buf);
                                if (r < 0)
                                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                                       "Failed to set blkio.weight_device on %s: %m", path);
                        }
                }

                /* FIXME: no way to reset this list */
                LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                        const char *a;
                        dev_t dev;

                        r = lookup_blkio_device(b->path, &dev);
                        if (r < 0)
                                continue;

                        a = b->read ? "blkio.throttle.read_bps_device" : "blkio.throttle.write_bps_device";

                        sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), b->bandwidth);
                        r = cg_set_attribute("blkio", path, a, buf);
                        if (r < 0)
                                log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                               "Failed to set %s on %s: %m", a, path);
                }
        }

        if ((mask & CGROUP_MEMORY) && !is_root) {
                if (c->memory_limit != (uint64_t) -1) {
                        char buf[DECIMAL_STR_MAX(uint64_t) + 1];

                        sprintf(buf, "%" PRIu64 "\n", c->memory_limit);
                        r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
                } else
                        r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");

                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set memory.limit_in_bytes on %s: %m", path);
        }

        if ((mask & CGROUP_DEVICE) && !is_root) {
                CGroupDeviceAllow *a;

                /* Changing the devices list of a populated cgroup
                 * might result in EINVAL, hence ignore EINVAL
                 * here. */

                if (c->device_allow || c->device_policy != CGROUP_AUTO)
                        r = cg_set_attribute("devices", path, "devices.deny", "a");
                else
                        r = cg_set_attribute("devices", path, "devices.allow", "a");
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to reset devices.list on %s: %m", path);

                if (c->device_policy == CGROUP_CLOSED ||
                    (c->device_policy == CGROUP_AUTO && c->device_allow)) {
                        static const char auto_devices[] =
                                "/dev/null\0" "rwm\0"
                                "/dev/zero\0" "rwm\0"
                                "/dev/full\0" "rwm\0"
                                "/dev/random\0" "rwm\0"
                                "/dev/urandom\0" "rwm\0"
                                "/dev/tty\0" "rwm\0"
                                "/dev/pts/ptmx\0" "rw\0"; /* /dev/pts/ptmx may not be duplicated, but accessed */

                        const char *x, *y;

                        NULSTR_FOREACH_PAIR(x, y, auto_devices)
                                whitelist_device(path, x, y);

                        whitelist_major(path, "pts", 'c', "rw");
                        whitelist_major(path, "kdbus", 'c', "rw");
                        whitelist_major(path, "kdbus/*", 'c', "rw");
                }

                LIST_FOREACH(device_allow, a, c->device_allow) {
                        char acc[4];
                        unsigned k = 0;

                        if (a->r)
                                acc[k++] = 'r';
                        if (a->w)
                                acc[k++] = 'w';
                        if (a->m)
                                acc[k++] = 'm';

                        if (k == 0)
                                continue;

                        acc[k++] = 0;

                        if (startswith(a->path, "/dev/"))
                                whitelist_device(path, a->path, acc);
                        else if (startswith(a->path, "block-"))
                                whitelist_major(path, a->path + 6, 'b', acc);
                        else if (startswith(a->path, "char-"))
                                whitelist_major(path, a->path + 5, 'c', acc);
                        else
                                log_debug("Ignoring device %s while writing cgroup attribute.", a->path);
                }
        }
}

CGroupControllerMask cgroup_context_get_mask(CGroupContext *c) {
        CGroupControllerMask mask = 0;

        /* Figure out which controllers we need */

        if (c->cpu_accounting ||
            c->cpu_shares != (unsigned long) -1 ||
            c->startup_cpu_shares != (unsigned long) -1 ||
            c->cpu_quota_per_sec_usec != USEC_INFINITY)
                mask |= CGROUP_CPUACCT | CGROUP_CPU;

        if (c->blockio_accounting ||
            c->blockio_weight != (unsigned long) -1 ||
            c->startup_blockio_weight != (unsigned long) -1 ||
            c->blockio_device_weights ||
            c->blockio_device_bandwidths)
                mask |= CGROUP_BLKIO;

        if (c->memory_accounting ||
            c->memory_limit != (uint64_t) -1)
                mask |= CGROUP_MEMORY;

        if (c->device_allow ||
            c->device_policy != CGROUP_AUTO)
                mask |= CGROUP_DEVICE;

        return mask;
}

CGroupControllerMask unit_get_cgroup_mask(Unit *u) {
        CGroupContext *c;

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        /* If delegation is turned on, then turn on all cgroup
         * controllers, unless the process we fork into the cgroup is
         * known to drop privileges and hence shouldn't get access to
         * the controllers anyway. */

        if (c->delegate) {
                ExecContext *e;

                e = unit_get_exec_context(u);
                if (!e || exec_context_maintains_privileges(e))
                        return _CGROUP_CONTROLLER_MASK_ALL;
        }

        return cgroup_context_get_mask(c);
}

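/* Returns the combined controller mask needed by all units contained
 * in this slice; for non-slice units the mask is empty. The result is
 * cached in u->cgroup_members_mask and invalidated again by
 * unit_update_cgroup_members_masks() below. */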
CGroupControllerMask unit_get_members_mask(Unit *u) {
        assert(u);

        if (u->cgroup_members_mask_valid)
                return u->cgroup_members_mask;

        u->cgroup_members_mask = 0;

        if (u->type == UNIT_SLICE) {
                Unit *member;
                Iterator i;

                SET_FOREACH(member, u->dependencies[UNIT_BEFORE], i) {

                        if (member == u)
                                continue;

                        if (UNIT_DEREF(member->slice) != u)
                                continue;

                        u->cgroup_members_mask |=
                                unit_get_cgroup_mask(member) |
                                unit_get_members_mask(member);
                }
        }

        u->cgroup_members_mask_valid = true;
        return u->cgroup_members_mask;
}

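/* Returns the controller mask needed by all units in the same slice
 * as this one (including the unit itself); at the top of the tree,
 * where there is no containing slice, fall back to the unit's own
 * mask plus that of its members. */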
CGroupControllerMask unit_get_siblings_mask(Unit *u) {
        assert(u);

        if (UNIT_ISSET(u->slice))
                return unit_get_members_mask(UNIT_DEREF(u->slice));

        return unit_get_cgroup_mask(u) | unit_get_members_mask(u);
}

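/* The mask the unit's cgroup should actually be realized with: its
 * own needs, its children's and its siblings', limited to the
 * controllers the kernel actually supports. */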
CGroupControllerMask unit_get_target_mask(Unit *u) {
        CGroupControllerMask mask;

        mask = unit_get_cgroup_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
        mask &= u->manager->cgroup_supported;

        return mask;
}

/* Recurse from a unit up through its containing slices, propagating
 * mask bits upward. A unit is also a member of itself. */
void unit_update_cgroup_members_masks(Unit *u) {
        CGroupControllerMask m;
        bool more;

        assert(u);

        /* Calculate subtree mask */
        m = unit_get_cgroup_mask(u) | unit_get_members_mask(u);

        /* See if anything changed from the previous invocation. If
         * not, we're done. */
        if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
                return;

        more =
                u->cgroup_subtree_mask_valid &&
                ((m & ~u->cgroup_subtree_mask) != 0) &&
                ((~m & u->cgroup_subtree_mask) == 0);

        u->cgroup_subtree_mask = m;
        u->cgroup_subtree_mask_valid = true;

        if (UNIT_ISSET(u->slice)) {
                Unit *s = UNIT_DEREF(u->slice);

                if (more)
                        /* There's more set now than before. We
                         * propagate the new mask to the parent's mask
                         * (not caring if it actually was valid or
                         * not). */

                        s->cgroup_members_mask |= m;

                else
                        /* There's less set now than before (or we
                         * don't know), we need to recalculate
                         * everything, so let's invalidate the
                         * parent's members mask */

                        s->cgroup_members_mask_valid = false;

                /* And now make sure that this change also hits our
                 * grandparents */
                unit_update_cgroup_members_masks(s);
        }
}

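/* Callback for cg_migrate_everywhere(): returns the cgroup path of
 * the nearest ancestor (including the unit itself) that has been
 * realized with all controllers in the mask, so processes always have
 * a valid migration target even if the unit's own cgroup isn't ready
 * in some hierarchy. */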
static const char *migrate_callback(CGroupControllerMask mask, void *userdata) {
        Unit *u = userdata;

        assert(mask != 0);
        assert(u);

        while (u) {
                if (u->cgroup_path &&
                    u->cgroup_realized &&
                    (u->cgroup_realized_mask & mask) == mask)
                        return u->cgroup_path;

                u = UNIT_DEREF(u->slice);
        }

        return NULL;
}

static int unit_create_cgroups(Unit *u, CGroupControllerMask mask) {
        CGroupContext *c;
        int r;

        assert(u);

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        if (!u->cgroup_path) {
                _cleanup_free_ char *path = NULL;

                path = unit_default_cgroup_path(u);
                if (!path)
                        return log_oom();

                r = hashmap_put(u->manager->cgroup_unit, path, u);
                if (r < 0) {
                        log_error(r == -EEXIST ? "cgroup %s exists already: %s" : "hashmap_put failed for %s: %s", path, strerror(-r));
                        return r;
                }
                if (r > 0) {
                        u->cgroup_path = path;
                        path = NULL;
                }
        }

        /* First, create our own group */
        r = cg_create_everywhere(u->manager->cgroup_supported, mask, u->cgroup_path);
        if (r < 0)
                return log_error_errno(r, "Failed to create cgroup %s: %m", u->cgroup_path);

        /* Keep track that this is now realized */
        u->cgroup_realized = true;
        u->cgroup_realized_mask = mask;

        if (u->type != UNIT_SLICE && !c->delegate) {

                /* Then, possibly move things over, but not if
                 * subgroups may contain processes, which is the case
                 * for slice and delegation units. */
                r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
                if (r < 0)
                        log_warning_errno(r, "Failed to migrate cgroup processes to %s: %m", u->cgroup_path);
        }

        return 0;
}

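/* Realize the unit's cgroup (if that hasn't happened yet) and attach
 * the PIDs recorded in u->pids to it, in all supported hierarchies. */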
int unit_attach_pids_to_cgroup(Unit *u) {
        int r;
        assert(u);

        r = unit_realize_cgroup(u);
        if (r < 0)
                return r;

        r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->pids, migrate_callback, u);
        if (r < 0)
                return r;

        return 0;
}

static bool unit_has_mask_realized(Unit *u, CGroupControllerMask mask) {
        assert(u);

        return u->cgroup_realized && u->cgroup_realized_mask == mask;
}

/* Check if necessary controllers and attributes for a unit are in place.
 *
 * If so, do nothing.
 * If not, create paths, move processes over, and set attributes.
 *
 * Returns 0 on success and < 0 on failure. */
static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
        CGroupControllerMask mask;
        int r;

        assert(u);

        if (u->in_cgroup_queue) {
                LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
                u->in_cgroup_queue = false;
        }

        mask = unit_get_target_mask(u);

        if (unit_has_mask_realized(u, mask))
                return 0;

        /* First, realize parents */
        if (UNIT_ISSET(u->slice)) {
                r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
                if (r < 0)
                        return r;
        }

        /* And then do the real work */
        r = unit_create_cgroups(u, mask);
        if (r < 0)
                return r;

        /* Finally, apply the necessary attributes. */
        cgroup_context_apply(unit_get_cgroup_context(u), mask, u->cgroup_path, state);

        return 0;
}

static void unit_add_to_cgroup_queue(Unit *u) {

        if (u->in_cgroup_queue)
                return;

        LIST_PREPEND(cgroup_queue, u->manager->cgroup_queue, u);
        u->in_cgroup_queue = true;
}

unsigned manager_dispatch_cgroup_queue(Manager *m) {
        ManagerState state;
        unsigned n = 0;
        Unit *i;
        int r;

        state = manager_state(m);

        while ((i = m->cgroup_queue)) {
                assert(i->in_cgroup_queue);

                r = unit_realize_cgroup_now(i, state);
                if (r < 0)
                        log_warning_errno(r, "Failed to realize cgroups for queued unit %s: %m", i->id);

                n++;
        }

        return n;
}

static void unit_queue_siblings(Unit *u) {
        Unit *slice;

        /* This adds the siblings of the specified unit and the
         * siblings of all parent units to the cgroup queue. (But
         * neither the specified unit itself nor the parents.) */

        while ((slice = UNIT_DEREF(u->slice))) {
                Iterator i;
                Unit *m;

                SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
                        if (m == u)
                                continue;

                        /* Skip units that have a dependency on the slice
                         * but aren't actually in it. */
                        if (UNIT_DEREF(m->slice) != slice)
                                continue;

                        /* No point in doing cgroup application for units
                         * without active processes. */
                        if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
                                continue;

                        /* If the unit doesn't need any new controllers
                         * and has current ones realized, it doesn't need
                         * any changes. */
                        if (unit_has_mask_realized(m, unit_get_target_mask(m)))
                                continue;

                        unit_add_to_cgroup_queue(m);
                }

                u = slice;
        }
}

int unit_realize_cgroup(Unit *u) {
        assert(u);

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return 0;

        /* So, here's the deal: when realizing the cgroups for this
         * unit, we need to first create all parents, but there's more
         * actually: for the weight-based controllers we also need to
         * make sure that all our siblings (i.e. units that are in the
         * same slice as we are) have cgroups, too. Otherwise, things
         * would become very uneven as each of their processes would
         * get as many resources as our whole group put together. This
         * call will synchronously create the parent cgroups, but will
         * defer work on the siblings to the next event loop
         * iteration. */

        /* Add all sibling slices to the cgroup queue. */
        unit_queue_siblings(u);

        /* And realize this one now (and apply the values) */
        return unit_realize_cgroup_now(u, manager_state(u->manager));
}

void unit_destroy_cgroup_if_empty(Unit *u) {
        int r;

        assert(u);

        if (!u->cgroup_path)
                return;

        r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !unit_has_name(u, SPECIAL_ROOT_SLICE));
        if (r < 0) {
                log_debug_errno(r, "Failed to destroy cgroup %s: %m", u->cgroup_path);
                return;
        }

        hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);

        free(u->cgroup_path);
        u->cgroup_path = NULL;
        u->cgroup_realized = false;
        u->cgroup_realized_mask = 0;
}

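/* Try to guess the main PID of a unit by scanning its cgroup: only
 * direct children of ours qualify, and if more than one candidate
 * remains we can't tell which is the main process and return 0. */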
pid_t unit_search_main_pid(Unit *u) {
        _cleanup_fclose_ FILE *f = NULL;
        pid_t pid = 0, npid, mypid;

        assert(u);

        if (!u->cgroup_path)
                return 0;

        if (cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f) < 0)
                return 0;

        mypid = getpid();
        while (cg_read_pid(f, &npid) > 0) {
                pid_t ppid;

                if (npid == pid)
                        continue;

                /* Ignore processes that aren't our kids */
                if (get_parent_of_pid(npid, &ppid) >= 0 && ppid != mypid)
                        continue;

                if (pid != 0) {
                        /* Dang, there's more than one daemonized PID
                         * in this group, so we don't know what
                         * process is the main process. */
                        pid = 0;
                        break;
                }

                pid = npid;
        }

        return pid;
}

int manager_setup_cgroup(Manager *m) {
        _cleanup_free_ char *path = NULL;
        int r;

        assert(m);

        /* 1. Determine hierarchy */
        free(m->cgroup_root);
        m->cgroup_root = NULL;

        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
        if (r < 0)
                return log_error_errno(r, "Cannot determine cgroup we are running in: %m");

        /* LEGACY: Already in /system.slice? If so, let's cut this
         * off. This is to support live upgrades from older systemd
         * versions where PID 1 was moved there. */
        if (m->running_as == MANAGER_SYSTEM) {
                char *e;

                e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
                if (!e)
                        e = endswith(m->cgroup_root, "/system");
                if (e)
                        *e = 0;
        }

        /* And make sure to store away the root value without trailing
         * slash, even for the root dir, so that we can easily prepend
         * it everywhere. */
        if (streq(m->cgroup_root, "/"))
                m->cgroup_root[0] = 0;

        /* 2. Show data */
        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
        if (r < 0)
                return log_error_errno(r, "Cannot find cgroup mount point: %m");

        log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);
        if (!m->test_run) {

                /* 3. Install agent */
                if (m->running_as == MANAGER_SYSTEM) {
                        r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
                        if (r < 0)
                                log_warning_errno(r, "Failed to install release agent, ignoring: %m");
                        else if (r > 0)
                                log_debug("Installed release agent.");
                        else
                                log_debug("Release agent already installed.");
                }

                /* 4. Make sure we are in the root cgroup */
                r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, 0);
                if (r < 0)
                        return log_error_errno(r, "Failed to create root cgroup hierarchy: %m");

                /* 5. And pin it, so that it cannot be unmounted */
                safe_close(m->pin_cgroupfs_fd);

                m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
                if (m->pin_cgroupfs_fd < 0)
                        return log_error_errno(errno, "Failed to open pin file: %m");

                /* 6. Always enable hierarchical support if it exists... */
                cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
        }

        /* 7. Figure out which controllers are supported */
        m->cgroup_supported = cg_mask_supported();

        return 0;
}

void manager_shutdown_cgroup(Manager *m, bool delete) {
        assert(m);

        /* We can't really delete the group, since we are in it. But
         * let's trim it. */
        if (delete && m->cgroup_root)
                cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);

        m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);

        free(m->cgroup_root);
        m->cgroup_root = NULL;
}

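/* Map a cgroup path to the unit it belongs to: first try an exact
 * match in the cgroup_unit hashmap, then walk up the path one
 * component at a time, so that a process sitting in a subgroup is
 * still attributed to the unit owning the enclosing cgroup. */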
Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
        char *p;
        Unit *u;

        assert(m);
        assert(cgroup);

        u = hashmap_get(m->cgroup_unit, cgroup);
        if (u)
                return u;

        p = strdupa(cgroup);
        for (;;) {
                char *e;

                e = strrchr(p, '/');
                if (e == p || !e)
                        return NULL;

                *e = 0;

                u = hashmap_get(m->cgroup_unit, p);
                if (u)
                        return u;
        }
}

Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
        _cleanup_free_ char *cgroup = NULL;
        Unit *u;
        int r;

        assert(m);

        if (pid <= 1)
                return NULL;

        u = hashmap_get(m->watch_pids1, LONG_TO_PTR(pid));
        if (u)
                return u;

        u = hashmap_get(m->watch_pids2, LONG_TO_PTR(pid));
        if (u)
                return u;

        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
        if (r < 0)
                return NULL;

        return manager_get_unit_by_cgroup(m, cgroup);
}

int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
        Unit *u;
        int r;

        assert(m);
        assert(cgroup);

        u = manager_get_unit_by_cgroup(m, cgroup);
        if (!u)
                return 0;

        r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
        if (r <= 0)
                return r;

        if (UNIT_VTABLE(u)->notify_cgroup_empty)
                UNIT_VTABLE(u)->notify_cgroup_empty(u);

        unit_add_to_gc_queue(u);
        return 0;
}

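/* The accounting getters below read back a single attribute file from
 * the kernel and map "not available" (no cgroup realized, controller
 * not enabled, attribute missing) to -ENODATA, so that callers can
 * tell that apart from real errors. */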
int unit_get_memory_current(Unit *u, uint64_t *ret) {
        _cleanup_free_ char *v = NULL;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENODATA;

        if ((u->cgroup_realized_mask & CGROUP_MEMORY) == 0)
                return -ENODATA;

        r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
        if (r == -ENOENT)
                return -ENODATA;
        if (r < 0)
                return r;

        return safe_atou64(v, ret);
}

static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
        _cleanup_free_ char *v = NULL;
        uint64_t ns;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENODATA;

        if ((u->cgroup_realized_mask & CGROUP_CPUACCT) == 0)
                return -ENODATA;

        r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);
        if (r == -ENOENT)
                return -ENODATA;
        if (r < 0)
                return r;

        r = safe_atou64(v, &ns);
        if (r < 0)
                return r;

        *ret = ns;
        return 0;
}

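/* CPU usage is reported relative to u->cpuacct_usage_base, which
 * unit_reset_cpu_usage() latches to the current raw counter value;
 * this implements per-unit reset without touching the kernel's
 * monotonic counter. */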
int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
        nsec_t ns;
        int r;

        r = unit_get_cpu_usage_raw(u, &ns);
        if (r < 0)
                return r;

        if (ns > u->cpuacct_usage_base)
                ns -= u->cpuacct_usage_base;
        else
                ns = 0;

        *ret = ns;
        return 0;
}

int unit_reset_cpu_usage(Unit *u) {
        nsec_t ns;
        int r;

        assert(u);

        r = unit_get_cpu_usage_raw(u, &ns);
        if (r < 0) {
                u->cpuacct_usage_base = 0;
                return r;
        }

        u->cpuacct_usage_base = ns;
        return 0;
}

bool unit_cgroup_delegate(Unit *u) {
        CGroupContext *c;

        assert(u);

        c = unit_get_cgroup_context(u);
        if (!c)
                return false;

        return c->delegate;
}

static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
        [CGROUP_AUTO] = "auto",
        [CGROUP_CLOSED] = "closed",
        [CGROUP_STRICT] = "strict",
};

DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);