]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/cgroup.c
cgroups: simplify CPUQuota= logic
[thirdparty/systemd.git] / src / core / cgroup.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2013 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <fcntl.h>
23 #include <fnmatch.h>
24
25 #include "path-util.h"
26 #include "special.h"
27 #include "cgroup-util.h"
28 #include "cgroup.h"
29
30 #define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
31
32 void cgroup_context_init(CGroupContext *c) {
33 assert(c);
34
35 /* Initialize everything to the kernel defaults, assuming the
36 * structure is preinitialized to 0 */
37
38 c->cpu_shares = (unsigned long) -1;
39 c->startup_cpu_shares = (unsigned long) -1;
40 c->memory_limit = (uint64_t) -1;
41 c->blockio_weight = (unsigned long) -1;
42 c->startup_blockio_weight = (unsigned long) -1;
43
44 c->cpu_quota_per_sec_usec = (usec_t) -1;
45 }
46
/* Unlinks @a from @c's device whitelist and frees it. */
void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
        assert(c);
        assert(a);

        LIST_REMOVE(device_allow, c->device_allow, a);
        free(a->path);
        free(a);
}
55
/* Unlinks @w from @c's per-device blkio weight list and frees it. */
void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
        assert(c);
        assert(w);

        LIST_REMOVE(device_weights, c->blockio_device_weights, w);
        free(w->path);
        free(w);
}
64
/* Unlinks @b from @c's per-device blkio bandwidth list and frees it. */
void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
        assert(c);
        assert(b);

        LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
        free(b->path);
        free(b);
}
73
74 void cgroup_context_done(CGroupContext *c) {
75 assert(c);
76
77 while (c->blockio_device_weights)
78 cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
79
80 while (c->blockio_device_bandwidths)
81 cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);
82
83 while (c->device_allow)
84 cgroup_context_free_device_allow(c, c->device_allow);
85 }
86
87 void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
88 CGroupBlockIODeviceBandwidth *b;
89 CGroupBlockIODeviceWeight *w;
90 CGroupDeviceAllow *a;
91 char u[FORMAT_TIMESPAN_MAX];
92
93 assert(c);
94 assert(f);
95
96 prefix = strempty(prefix);
97
98 fprintf(f,
99 "%sCPUAccounting=%s\n"
100 "%sBlockIOAccounting=%s\n"
101 "%sMemoryAccounting=%s\n"
102 "%sCPUShares=%lu\n"
103 "%sStartupCPUShares=%lu\n"
104 "%sCPUQuotaPerSecSec=%s\n"
105 "%sBlockIOWeight=%lu\n"
106 "%sStartupBlockIOWeight=%lu\n"
107 "%sMemoryLimit=%" PRIu64 "\n"
108 "%sDevicePolicy=%s\n",
109 prefix, yes_no(c->cpu_accounting),
110 prefix, yes_no(c->blockio_accounting),
111 prefix, yes_no(c->memory_accounting),
112 prefix, c->cpu_shares,
113 prefix, c->startup_cpu_shares,
114 prefix, strna(format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1)),
115 prefix, c->blockio_weight,
116 prefix, c->startup_blockio_weight,
117 prefix, c->memory_limit,
118 prefix, cgroup_device_policy_to_string(c->device_policy));
119
120 LIST_FOREACH(device_allow, a, c->device_allow)
121 fprintf(f,
122 "%sDeviceAllow=%s %s%s%s\n",
123 prefix,
124 a->path,
125 a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
126
127 LIST_FOREACH(device_weights, w, c->blockio_device_weights)
128 fprintf(f,
129 "%sBlockIODeviceWeight=%s %lu",
130 prefix,
131 w->path,
132 w->weight);
133
134 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
135 char buf[FORMAT_BYTES_MAX];
136
137 fprintf(f,
138 "%s%s=%s %s\n",
139 prefix,
140 b->read ? "BlockIOReadBandwidth" : "BlockIOWriteBandwidth",
141 b->path,
142 format_bytes(buf, sizeof(buf), b->bandwidth));
143 }
144 }
145
/* Resolves @p to the block device (major:minor in *@dev) relevant for
 * blkio configuration: a block device node resolves to itself; any
 * other file resolves to the device backing the file system it lives
 * on (whole disk rather than partition, where determinable).
 * Returns 0 on success, negative errno-style code on failure. */
static int lookup_blkio_device(const char *p, dev_t *dev) {
        struct stat st;
        int r;

        assert(p);
        assert(dev);

        r = stat(p, &st);
        if (r < 0) {
                log_warning("Couldn't stat device %s: %m", p);
                return -errno;
        }

        if (S_ISBLK(st.st_mode))
                *dev = st.st_rdev;
        else if (major(st.st_dev) != 0) {
                /* If this is not a device node then find the block
                 * device this file is stored on */
                *dev = st.st_dev;

                /* If this is a partition, try to get the originating
                 * block device */
                block_get_whole_disk(*dev, dev);
        } else {
                /* st_dev with major 0 indicates a virtual or network
                 * file system with no local backing block device. */
                log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
                return -ENODEV;
        }

        return 0;
}
176
177 static int whitelist_device(const char *path, const char *node, const char *acc) {
178 char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
179 struct stat st;
180 int r;
181
182 assert(path);
183 assert(acc);
184
185 if (stat(node, &st) < 0) {
186 log_warning("Couldn't stat device %s", node);
187 return -errno;
188 }
189
190 if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
191 log_warning("%s is not a device.", node);
192 return -ENODEV;
193 }
194
195 sprintf(buf,
196 "%c %u:%u %s",
197 S_ISCHR(st.st_mode) ? 'c' : 'b',
198 major(st.st_rdev), minor(st.st_rdev),
199 acc);
200
201 r = cg_set_attribute("devices", path, "devices.allow", buf);
202 if (r < 0)
203 log_warning("Failed to set devices.allow on %s: %s", path, strerror(-r));
204
205 return r;
206 }
207
/* Whitelists an entire major number: scans /proc/devices for drivers
 * of type @type ('b' for block, 'c' for character) whose name matches
 * the shell glob @name, and adds a "<type> <maj>:* <acc>" entry to
 * devices.allow of the cgroup @path for each match. Returns 0 on
 * success (even if nothing matched), negative errno on read failure. */
static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
        _cleanup_fclose_ FILE *f = NULL;
        char line[LINE_MAX];
        bool good = false;
        int r;

        assert(path);
        assert(acc);
        assert(type == 'b' || type == 'c');

        f = fopen("/proc/devices", "re");
        if (!f) {
                log_warning("Cannot open /proc/devices to resolve %s (%c): %m", name, type);
                return -errno;
        }

        FOREACH_LINE(line, f, goto fail) {
                char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
                unsigned maj;

                truncate_nl(line);

                /* /proc/devices has two sections; 'good' tracks
                 * whether we are inside the one matching @type. */
                if (type == 'c' && streq(line, "Character devices:")) {
                        good = true;
                        continue;
                }

                if (type == 'b' && streq(line, "Block devices:")) {
                        good = true;
                        continue;
                }

                /* A blank line terminates the current section. */
                if (isempty(line)) {
                        good = false;
                        continue;
                }

                if (!good)
                        continue;

                /* Each entry is "<major> <driver name>". */
                p = strstrip(line);

                w = strpbrk(p, WHITESPACE);
                if (!w)
                        continue;
                *w = 0;

                r = safe_atou(p, &maj);
                if (r < 0)
                        continue;
                if (maj <= 0)
                        continue;

                w++;
                w += strspn(w, WHITESPACE);

                /* Match the driver name against the requested glob. */
                if (fnmatch(name, w, 0) != 0)
                        continue;

                sprintf(buf,
                        "%c %u:* %s",
                        type,
                        maj,
                        acc);

                r = cg_set_attribute("devices", path, "devices.allow", buf);
                if (r < 0)
                        log_warning("Failed to set devices.allow on %s: %s", path, strerror(-r));
        }

        return 0;

fail:
        log_warning("Failed to read /proc/devices: %m");
        return -errno;
}
284
/* Writes the configuration in @c into the cgroup attribute files of
 * the cgroup @path, for each controller requested in @mask. @state
 * selects whether the Startup* variants of the CPU/blkio weights
 * apply (during MANAGER_STARTING). Failures to write individual
 * attributes are logged but not propagated (best-effort). */
void cgroup_context_apply(CGroupContext *c, CGroupControllerMask mask, const char *path, ManagerState state) {
        bool is_root;
        int r;

        assert(c);
        assert(path);

        if (mask == 0)
                return;

        /* Some cgroup attributes are not support on the root cgroup,
         * hence silently ignore */
        is_root = isempty(path) || path_equal(path, "/");

        if ((mask & CGROUP_CPU) && !is_root) {
                char buf[MAX(DECIMAL_STR_MAX(unsigned long), DECIMAL_STR_MAX(usec_t)) + 1];

                /* Prefer StartupCPUShares during boot, then CPUShares,
                 * falling back to the kernel default of 1024. */
                sprintf(buf, "%lu\n",
                        state == MANAGER_STARTING && c->startup_cpu_shares != (unsigned long) -1 ? c->startup_cpu_shares :
                        c->cpu_shares != (unsigned long) -1 ? c->cpu_shares : 1024);
                r = cg_set_attribute("cpu", path, "cpu.shares", buf);
                if (r < 0)
                        log_warning("Failed to set cpu.shares on %s: %s", path, strerror(-r));

                sprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
                r = cg_set_attribute("cpu", path, "cpu.cfs_period_us", buf);
                if (r < 0)
                        log_warning("Failed to set cpu.cfs_period_us on %s: %s", path, strerror(-r));

                /* CPUQuotaPerSecUSec is scaled to the fixed CFS
                 * period; unset resets the quota ("-1"). */
                if (c->cpu_quota_per_sec_usec != (usec_t) -1) {
                        sprintf(buf, USEC_FMT "\n", c->cpu_quota_per_sec_usec * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
                        r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", buf);
                } else
                        r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", "-1");
                if (r < 0)
                        log_warning("Failed to set cpu.cfs_quota_us on %s: %s", path, strerror(-r));
        }

        if (mask & CGROUP_BLKIO) {
                /* Sized for the largest of the three formats written
                 * below (weight, per-device weight, per-device bandwidth). */
                char buf[MAX3(DECIMAL_STR_MAX(unsigned long)+1,
                              DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(unsigned long)*1,
                              DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
                CGroupBlockIODeviceWeight *w;
                CGroupBlockIODeviceBandwidth *b;

                if (!is_root) {
                        /* Same precedence as cpu.shares above; kernel
                         * default weight is 1000. */
                        sprintf(buf, "%lu\n", state == MANAGER_STARTING && c->startup_blockio_weight != (unsigned long) -1 ? c->startup_blockio_weight :
                                c->blockio_weight != (unsigned long) -1 ? c->blockio_weight : 1000);
                        r = cg_set_attribute("blkio", path, "blkio.weight", buf);
                        if (r < 0)
                                log_warning("Failed to set blkio.weight on %s: %s", path, strerror(-r));

                        /* FIXME: no way to reset this list */
                        LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
                                dev_t dev;

                                r = lookup_blkio_device(w->path, &dev);
                                if (r < 0)
                                        continue;

                                sprintf(buf, "%u:%u %lu", major(dev), minor(dev), w->weight);
                                r = cg_set_attribute("blkio", path, "blkio.weight_device", buf);
                                if (r < 0)
                                        log_error("Failed to set blkio.weight_device on %s: %s", path, strerror(-r));
                        }
                }

                /* FIXME: no way to reset this list */
                LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                        const char *a;
                        dev_t dev;

                        r = lookup_blkio_device(b->path, &dev);
                        if (r < 0)
                                continue;

                        a = b->read ? "blkio.throttle.read_bps_device" : "blkio.throttle.write_bps_device";

                        sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), b->bandwidth);
                        r = cg_set_attribute("blkio", path, a, buf);
                        if (r < 0)
                                log_error("Failed to set %s on %s: %s", a, path, strerror(-r));
                }
        }

        if (mask & CGROUP_MEMORY) {
                /* Unset limit resets the attribute ("-1" = unlimited). */
                if (c->memory_limit != (uint64_t) -1) {
                        char buf[DECIMAL_STR_MAX(uint64_t) + 1];

                        sprintf(buf, "%" PRIu64 "\n", c->memory_limit);
                        r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
                } else
                        r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");

                if (r < 0)
                        log_error("Failed to set memory.limit_in_bytes on %s: %s", path, strerror(-r));
        }

        if ((mask & CGROUP_DEVICE) && !is_root) {
                CGroupDeviceAllow *a;

                /* With an explicit whitelist or a non-auto policy we
                 * start from "deny all", otherwise from "allow all". */
                if (c->device_allow || c->device_policy != CGROUP_AUTO)
                        r = cg_set_attribute("devices", path, "devices.deny", "a");
                else
                        r = cg_set_attribute("devices", path, "devices.allow", "a");
                if (r < 0)
                        log_warning("Failed to reset devices.list on %s: %s", path, strerror(-r));

                if (c->device_policy == CGROUP_CLOSED ||
                    (c->device_policy == CGROUP_AUTO && c->device_allow)) {
                        /* Standard pseudo-devices every service is
                         * expected to be able to use. */
                        static const char auto_devices[] =
                                "/dev/null\0" "rwm\0"
                                "/dev/zero\0" "rwm\0"
                                "/dev/full\0" "rwm\0"
                                "/dev/random\0" "rwm\0"
                                "/dev/urandom\0" "rwm\0"
                                "/dev/tty\0" "rwm\0"
                                "/dev/pts/ptmx\0" "rw\0"; /* /dev/pts/ptmx may not be duplicated, but accessed */

                        const char *x, *y;

                        NULSTR_FOREACH_PAIR(x, y, auto_devices)
                                whitelist_device(path, x, y);

                        whitelist_major(path, "pts", 'c', "rw");
                        whitelist_major(path, "kdbus", 'c', "rw");
                        whitelist_major(path, "kdbus/*", 'c', "rw");
                }

                LIST_FOREACH(device_allow, a, c->device_allow) {
                        /* Build the access string from the r/w/m flags. */
                        char acc[4];
                        unsigned k = 0;

                        if (a->r)
                                acc[k++] = 'r';
                        if (a->w)
                                acc[k++] = 'w';
                        if (a->m)
                                acc[k++] = 'm';

                        if (k == 0)
                                continue;

                        acc[k++] = 0;

                        /* Paths name a specific node; "block-"/"char-"
                         * prefixes name a whole major by driver name. */
                        if (startswith(a->path, "/dev/"))
                                whitelist_device(path, a->path, acc);
                        else if (startswith(a->path, "block-"))
                                whitelist_major(path, a->path + 6, 'b', acc);
                        else if (startswith(a->path, "char-"))
                                whitelist_major(path, a->path + 5, 'c', acc);
                        else
                                log_debug("Ignoring device %s while writing cgroup attribute.", a->path);
                }
        }
}
441
442 CGroupControllerMask cgroup_context_get_mask(CGroupContext *c) {
443 CGroupControllerMask mask = 0;
444
445 /* Figure out which controllers we need */
446
447 if (c->cpu_accounting ||
448 c->cpu_shares != (unsigned long) -1 ||
449 c->startup_cpu_shares != (unsigned long) -1 ||
450 c->cpu_quota_per_sec_usec != (usec_t) -1)
451 mask |= CGROUP_CPUACCT | CGROUP_CPU;
452
453 if (c->blockio_accounting ||
454 c->blockio_weight != (unsigned long) -1 ||
455 c->startup_blockio_weight != (unsigned long) -1 ||
456 c->blockio_device_weights ||
457 c->blockio_device_bandwidths)
458 mask |= CGROUP_BLKIO;
459
460 if (c->memory_accounting ||
461 c->memory_limit != (uint64_t) -1)
462 mask |= CGROUP_MEMORY;
463
464 if (c->device_allow || c->device_policy != CGROUP_AUTO)
465 mask |= CGROUP_DEVICE;
466
467 return mask;
468 }
469
470 CGroupControllerMask unit_get_cgroup_mask(Unit *u) {
471 CGroupContext *c;
472
473 c = unit_get_cgroup_context(u);
474 if (!c)
475 return 0;
476
477 return cgroup_context_get_mask(c);
478 }
479
/* Returns the union of the controller masks of all units contained in
 * @u. Only slices contain other units; for everything else this is 0.
 * The result is cached in u->cgroup_members_mask until invalidated via
 * cgroup_members_mask_valid. */
CGroupControllerMask unit_get_members_mask(Unit *u) {
        assert(u);

        if (u->cgroup_members_mask_valid)
                return u->cgroup_members_mask;

        u->cgroup_members_mask = 0;

        if (u->type == UNIT_SLICE) {
                Unit *member;
                Iterator i;

                /* Members of a slice are ordered Before= it; filter to
                 * those whose slice reference actually points at us. */
                SET_FOREACH(member, u->dependencies[UNIT_BEFORE], i) {

                        if (member == u)
                                continue;

                        if (UNIT_DEREF(member->slice) != u)
                                continue;

                        u->cgroup_members_mask |=
                                unit_get_cgroup_mask(member) |
                                unit_get_members_mask(member);
                }
        }

        u->cgroup_members_mask_valid = true;
        return u->cgroup_members_mask;
}
509
510 CGroupControllerMask unit_get_siblings_mask(Unit *u) {
511 assert(u);
512
513 if (UNIT_ISSET(u->slice))
514 return unit_get_members_mask(UNIT_DEREF(u->slice));
515
516 return unit_get_cgroup_mask(u) | unit_get_members_mask(u);
517 }
518
519 CGroupControllerMask unit_get_target_mask(Unit *u) {
520 CGroupControllerMask mask;
521
522 mask = unit_get_cgroup_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
523 mask &= u->manager->cgroup_supported;
524
525 return mask;
526 }
527
/* Recurse from a unit up through its containing slices, propagating
 * mask bits upward. A unit is also member of itself. */
void unit_update_cgroup_members_masks(Unit *u) {
        CGroupControllerMask m;
        bool more;

        assert(u);

        /* Calculate subtree mask */
        m = unit_get_cgroup_mask(u) | unit_get_members_mask(u);

        /* See if anything changed from the previous invocation. If
         * not, we're done. */
        if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
                return;

        /* 'more' is true only if the previous mask was valid and bits
         * were exclusively added (some new bit set, no old bit
         * cleared); only then can we cheaply OR into the parent. */
        more =
                u->cgroup_subtree_mask_valid &&
                ((m & ~u->cgroup_subtree_mask) != 0) &&
                ((~m & u->cgroup_subtree_mask) == 0);

        u->cgroup_subtree_mask = m;
        u->cgroup_subtree_mask_valid = true;

        if (UNIT_ISSET(u->slice)) {
                Unit *s = UNIT_DEREF(u->slice);

                if (more)
                        /* There's more set now than before. We
                         * propagate the new mask to the parent's mask
                         * (not caring if it actually was valid or
                         * not). */

                        s->cgroup_members_mask |= m;

                else
                        /* There's less set now than before (or we
                         * don't know), we need to recalculate
                         * everything, so let's invalidate the
                         * parent's members mask */

                        s->cgroup_members_mask_valid = false;

                /* And now make sure that this change also hits our
                 * grandparents */
                unit_update_cgroup_members_masks(s);
        }
}
576
577 static const char *migrate_callback(CGroupControllerMask mask, void *userdata) {
578 Unit *u = userdata;
579
580 assert(mask != 0);
581 assert(u);
582
583 while (u) {
584 if (u->cgroup_path &&
585 u->cgroup_realized &&
586 (u->cgroup_realized_mask & mask) == mask)
587 return u->cgroup_path;
588
589 u = UNIT_DEREF(u->slice);
590 }
591
592 return NULL;
593 }
594
595 static int unit_create_cgroups(Unit *u, CGroupControllerMask mask) {
596 _cleanup_free_ char *path = NULL;
597 int r;
598
599 assert(u);
600
601 path = unit_default_cgroup_path(u);
602 if (!path)
603 return log_oom();
604
605 r = hashmap_put(u->manager->cgroup_unit, path, u);
606 if (r < 0) {
607 log_error(r == -EEXIST ? "cgroup %s exists already: %s" : "hashmap_put failed for %s: %s", path, strerror(-r));
608 return r;
609 }
610 if (r > 0) {
611 u->cgroup_path = path;
612 path = NULL;
613 }
614
615 /* First, create our own group */
616 r = cg_create_everywhere(u->manager->cgroup_supported, mask, u->cgroup_path);
617 if (r < 0) {
618 log_error("Failed to create cgroup %s: %s", u->cgroup_path, strerror(-r));
619 return r;
620 }
621
622 /* Keep track that this is now realized */
623 u->cgroup_realized = true;
624 u->cgroup_realized_mask = mask;
625
626 /* Then, possibly move things over */
627 r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
628 if (r < 0)
629 log_warning("Failed to migrate cgroup from to %s: %s", u->cgroup_path, strerror(-r));
630
631 return 0;
632 }
633
634 static bool unit_has_mask_realized(Unit *u, CGroupControllerMask mask) {
635 assert(u);
636
637 return u->cgroup_realized && u->cgroup_realized_mask == mask;
638 }
639
/* Check if necessary controllers and attributes for a unit are in place.
 *
 * If so, do nothing.
 * If not, create paths, move processes over, and set attributes.
 *
 * Returns 0 on success and < 0 on failure. */
static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
        CGroupControllerMask mask;
        int r;

        assert(u);

        /* We are handling this unit now; take it off the pending
         * realization queue so the dispatcher doesn't see it again. */
        if (u->in_cgroup_queue) {
                LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
                u->in_cgroup_queue = false;
        }

        mask = unit_get_target_mask(u);

        if (unit_has_mask_realized(u, mask))
                return 0;

        /* First, realize parents */
        if (UNIT_ISSET(u->slice)) {
                /* Recursion depth is bounded by slice nesting depth. */
                r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
                if (r < 0)
                        return r;
        }

        /* And then do the real work */
        r = unit_create_cgroups(u, mask);
        if (r < 0)
                return r;

        /* Finally, apply the necessary attributes. */
        cgroup_context_apply(unit_get_cgroup_context(u), mask, u->cgroup_path, state);

        return 0;
}
679
680 static void unit_add_to_cgroup_queue(Unit *u) {
681
682 if (u->in_cgroup_queue)
683 return;
684
685 LIST_PREPEND(cgroup_queue, u->manager->cgroup_queue, u);
686 u->in_cgroup_queue = true;
687 }
688
689 unsigned manager_dispatch_cgroup_queue(Manager *m) {
690 ManagerState state;
691 unsigned n = 0;
692 Unit *i;
693 int r;
694
695 state = manager_state(m);
696
697 while ((i = m->cgroup_queue)) {
698 assert(i->in_cgroup_queue);
699
700 r = unit_realize_cgroup_now(i, state);
701 if (r < 0)
702 log_warning("Failed to realize cgroups for queued unit %s: %s", i->id, strerror(-r));
703
704 n++;
705 }
706
707 return n;
708 }
709
static void unit_queue_siblings(Unit *u) {
        Unit *slice;

        /* This adds the siblings of the specified unit and the
         * siblings of all parent units to the cgroup queue. (But
         * neither the specified unit itself nor the parents.) */

        while ((slice = UNIT_DEREF(u->slice))) {
                Iterator i;
                Unit *m;

                /* Sibling candidates are units ordered Before= the
                 * slice; see unit_get_members_mask(). */
                SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
                        if (m == u)
                                continue;

                        /* Skip units that have a dependency on the slice
                         * but aren't actually in it. */
                        if (UNIT_DEREF(m->slice) != slice)
                                continue;

                        /* No point in doing cgroup application for units
                         * without active processes. */
                        if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
                                continue;

                        /* If the unit doesn't need any new controllers
                         * and has current ones realized, it doesn't need
                         * any changes. */
                        if (unit_has_mask_realized(m, unit_get_target_mask(m)))
                                continue;

                        unit_add_to_cgroup_queue(m);
                }

                /* Move up one level and queue that level's siblings too. */
                u = slice;
        }
}
747
/* Public entry point: realizes @u's cgroup synchronously (including
 * its parents) and schedules its siblings for deferred realization.
 * Returns 0 for units without a cgroup context, otherwise the result
 * of unit_realize_cgroup_now(). */
int unit_realize_cgroup(Unit *u) {
        CGroupContext *c;

        assert(u);

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        /* So, here's the deal: when realizing the cgroups for this
         * unit, we need to first create all parents, but there's more
         * actually: for the weight-based controllers we also need to
         * make sure that all our siblings (i.e. units that are in the
         * same slice as we are) have cgroups, too. Otherwise, things
         * would become very uneven as each of their processes would
         * get as much resources as all our group together. This call
         * will synchronously create the parent cgroups, but will
         * defer work on the siblings to the next event loop
         * iteration. */

        /* Add all sibling slices to the cgroup queue. */
        unit_queue_siblings(u);

        /* And realize this one now (and apply the values) */
        return unit_realize_cgroup_now(u, manager_state(u->manager));
}
774
/* Trims @u's cgroup from all hierarchies, unregisters it from the
 * manager's cgroup→unit map and resets all realization state. A
 * no-op for units without a cgroup path. */
void unit_destroy_cgroup(Unit *u) {
        int r;

        assert(u);

        if (!u->cgroup_path)
                return;

        /* Never remove the root slice's own cgroup, only its children. */
        r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !unit_has_name(u, SPECIAL_ROOT_SLICE));
        if (r < 0)
                log_debug("Failed to destroy cgroup %s: %s", u->cgroup_path, strerror(-r));

        hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);

        /* Forget the path and all realization state. */
        free(u->cgroup_path);
        u->cgroup_path = NULL;
        u->cgroup_realized = false;
        u->cgroup_realized_mask = 0;

}
795
/* Scans @u's cgroup for a single daemonized main process: a process
 * whose parent is us (PID 1). Returns that PID, or 0 if there is no
 * candidate or more than one (ambiguous). */
pid_t unit_search_main_pid(Unit *u) {
        _cleanup_fclose_ FILE *f = NULL;
        pid_t pid = 0, npid, mypid;

        assert(u);

        if (!u->cgroup_path)
                return 0;

        if (cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f) < 0)
                return 0;

        mypid = getpid();
        while (cg_read_pid(f, &npid) > 0) {
                pid_t ppid;

                /* Skip repeated occurrences of the candidate we
                 * already found. */
                if (npid == pid)
                        continue;

                /* Ignore processes that aren't our kids */
                if (get_parent_of_pid(npid, &ppid) >= 0 && ppid != mypid)
                        continue;

                if (pid != 0) {
                        /* Dang, there's more than one daemonized PID
                        in this group, so we don't know what process
                        is the main process. */
                        pid = 0;
                        break;
                }

                pid = npid;
        }

        return pid;
}
832
/* One-time cgroup setup for the manager: determines our root cgroup,
 * installs the release agent (system instance only), moves us into
 * the root group, pins the cgroupfs mount and probes which
 * controllers the kernel supports. Returns 0 on success, negative
 * error code on failure. */
int manager_setup_cgroup(Manager *m) {
        _cleanup_free_ char *path = NULL;
        char *e;
        int r;

        assert(m);

        /* 1. Determine hierarchy */
        free(m->cgroup_root);
        m->cgroup_root = NULL;

        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
        if (r < 0) {
                log_error("Cannot determine cgroup we are running in: %s", strerror(-r));
                return r;
        }

        /* LEGACY: Already in /system.slice? If so, let's cut this
         * off. This is to support live upgrades from older systemd
         * versions where PID 1 was moved there. */
        if (m->running_as == SYSTEMD_SYSTEM) {
                e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
                if (!e)
                        e = endswith(m->cgroup_root, "/system");
                if (e)
                        *e = 0;
        }

        /* And make sure to store away the root value without trailing
         * slash, even for the root dir, so that we can easily prepend
         * it everywhere. */
        if (streq(m->cgroup_root, "/"))
                m->cgroup_root[0] = 0;

        /* 2. Show data */
        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
        if (r < 0) {
                log_error("Cannot find cgroup mount point: %s", strerror(-r));
                return r;
        }

        log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);

        /* 3. Install agent */
        if (m->running_as == SYSTEMD_SYSTEM) {
                r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
                if (r < 0)
                        log_warning("Failed to install release agent, ignoring: %s", strerror(-r));
                else if (r > 0)
                        log_debug("Installed release agent.");
                else
                        log_debug("Release agent already installed.");
        }

        /* 4. Make sure we are in the root cgroup */
        r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, 0);
        if (r < 0) {
                log_error("Failed to create root cgroup hierarchy: %s", strerror(-r));
                return r;
        }

        /* 5. And pin it, so that it cannot be unmounted */
        safe_close(m->pin_cgroupfs_fd);

        m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
        if (m->pin_cgroupfs_fd < 0) {
                log_error("Failed to open pin file: %m");
                return -errno;
        }

        /* 6. Figure out which controllers are supported */
        m->cgroup_supported = cg_mask_supported();

        /* 7. Always enable hierarchial support if it exists... */
        cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");

        return 0;
}
911
/* Shutdown counterpart to manager_setup_cgroup(): optionally trims
 * our root cgroup (we cannot delete the group we live in), releases
 * the pinning fd and frees the stored root path. */
void manager_shutdown_cgroup(Manager *m, bool delete) {
        assert(m);

        /* We can't really delete the group, since we are in it. But
         * let's trim it. */
        if (delete && m->cgroup_root)
                cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);

        m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);

        free(m->cgroup_root);
        m->cgroup_root = NULL;
}
925
926 Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
927 char *p;
928 Unit *u;
929
930 assert(m);
931 assert(cgroup);
932
933 u = hashmap_get(m->cgroup_unit, cgroup);
934 if (u)
935 return u;
936
937 p = strdupa(cgroup);
938 for (;;) {
939 char *e;
940
941 e = strrchr(p, '/');
942 if (e == p || !e)
943 return NULL;
944
945 *e = 0;
946
947 u = hashmap_get(m->cgroup_unit, p);
948 if (u)
949 return u;
950 }
951 }
952
953 Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
954 _cleanup_free_ char *cgroup = NULL;
955 int r;
956
957 assert(m);
958
959 if (pid <= 1)
960 return NULL;
961
962 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
963 if (r < 0)
964 return NULL;
965
966 return manager_get_unit_by_cgroup(m, cgroup);
967 }
968
969 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
970 Unit *u;
971 int r;
972
973 assert(m);
974 assert(cgroup);
975
976 u = manager_get_unit_by_cgroup(m, cgroup);
977 if (u) {
978 r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, true);
979 if (r > 0) {
980 if (UNIT_VTABLE(u)->notify_cgroup_empty)
981 UNIT_VTABLE(u)->notify_cgroup_empty(u);
982
983 unit_add_to_gc_queue(u);
984 }
985 }
986
987 return 0;
988 }
989
/* String representations for DevicePolicy= values, plus the generated
 * cgroup_device_policy_{to,from}_string() lookup helpers. */
static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
        [CGROUP_AUTO] = "auto",
        [CGROUP_CLOSED] = "closed",
        [CGROUP_STRICT] = "strict",
};

DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);