]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/cgroup.c
core: check the right variable for failed open()
[thirdparty/systemd.git] / src / core / cgroup.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2013 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <fcntl.h>
23 #include <fnmatch.h>
24
25 #include "path-util.h"
26 #include "special.h"
27 #include "cgroup-util.h"
28 #include "cgroup.h"
29
30 void cgroup_context_init(CGroupContext *c) {
31 assert(c);
32
33 /* Initialize everything to the kernel defaults, assuming the
34 * structure is preinitialized to 0 */
35
36 c->cpu_shares = 1024;
37 c->memory_limit = (uint64_t) -1;
38 c->blockio_weight = 1000;
39
40 c->cpu_quota_per_sec_usec = (usec_t) -1;
41 c->cpu_quota_usec = (usec_t) -1;
42 c->cpu_quota_period_usec = 100*USEC_PER_MSEC;
43 }
44
/* Unlinks one DeviceAllow= entry from the context and releases its
 * memory. The entry must be a member of c->device_allow. */
void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
        assert(c);
        assert(a);

        /* Detach from the list first, so the context never points at
         * freed memory. */
        LIST_REMOVE(device_allow, c->device_allow, a);
        free(a->path);
        free(a);
}
53
/* Unlinks one BlockIODeviceWeight= entry from the context and
 * releases its memory. The entry must be a member of
 * c->blockio_device_weights. */
void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
        assert(c);
        assert(w);

        /* Detach from the list before freeing. */
        LIST_REMOVE(device_weights, c->blockio_device_weights, w);
        free(w->path);
        free(w);
}
62
/* Unlinks one BlockIO{Read,Write}Bandwidth= entry from the context
 * and releases its memory. The entry must be a member of
 * c->blockio_device_bandwidths. */
void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
        assert(c);
        assert(b);

        /* Detach from the list before freeing. */
        LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
        free(b->path);
        free(b);
}
71
72 void cgroup_context_done(CGroupContext *c) {
73 assert(c);
74
75 while (c->blockio_device_weights)
76 cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
77
78 while (c->blockio_device_bandwidths)
79 cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);
80
81 while (c->device_allow)
82 cgroup_context_free_device_allow(c, c->device_allow);
83 }
84
85 usec_t cgroup_context_get_cpu_quota_usec(CGroupContext *c) {
86 assert(c);
87
88 /* Returns the absolute CPU quota */
89
90 if (c->cpu_quota_usec != (usec_t) -1)
91 return c->cpu_quota_usec;
92 else if (c->cpu_quota_per_sec_usec != (usec_t) -1)
93 return c->cpu_quota_per_sec_usec*c->cpu_quota_period_usec/USEC_PER_SEC;
94 else
95 return (usec_t) -1;
96 }
97
98 usec_t cgroup_context_get_cpu_quota_per_sec_usec(CGroupContext *c) {
99 assert(c);
100
101 /* Returns the CPU quota relative to 1s */
102
103 if (c->cpu_quota_usec != (usec_t) -1)
104 return c->cpu_quota_usec*USEC_PER_SEC/c->cpu_quota_period_usec;
105 else if (c->cpu_quota_per_sec_usec != (usec_t) -1)
106 return c->cpu_quota_per_sec_usec;
107 else
108 return (usec_t) -1;
109 }
110
111 void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
112 CGroupBlockIODeviceBandwidth *b;
113 CGroupBlockIODeviceWeight *w;
114 CGroupDeviceAllow *a;
115 char t[FORMAT_TIMESPAN_MAX], s[FORMAT_TIMESPAN_MAX], u[FORMAT_TIMESPAN_MAX];
116
117 assert(c);
118 assert(f);
119
120 prefix = strempty(prefix);
121
122 fprintf(f,
123 "%sCPUAccounting=%s\n"
124 "%sBlockIOAccounting=%s\n"
125 "%sMemoryAccounting=%s\n"
126 "%sCPUShares=%lu\n"
127 "%sCPUQuota=%s\n"
128 "%sCPUQuotaPerSecSec=%s\n"
129 "%sCPUQuotaPeriodSec=%s\n"
130 "%sBlockIOWeight=%lu\n"
131 "%sMemoryLimit=%" PRIu64 "\n"
132 "%sDevicePolicy=%s\n",
133 prefix, yes_no(c->cpu_accounting),
134 prefix, yes_no(c->blockio_accounting),
135 prefix, yes_no(c->memory_accounting),
136 prefix, c->cpu_shares,
137 prefix, strna(format_timespan(u, sizeof(u), cgroup_context_get_cpu_quota_usec(c), 1)),
138 prefix, strna(format_timespan(t, sizeof(t), cgroup_context_get_cpu_quota_per_sec_usec(c), 1)),
139 prefix, strna(format_timespan(s, sizeof(s), c->cpu_quota_period_usec, 1)),
140 prefix, c->blockio_weight,
141 prefix, c->memory_limit,
142 prefix, cgroup_device_policy_to_string(c->device_policy));
143
144 LIST_FOREACH(device_allow, a, c->device_allow)
145 fprintf(f,
146 "%sDeviceAllow=%s %s%s%s\n",
147 prefix,
148 a->path,
149 a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
150
151 LIST_FOREACH(device_weights, w, c->blockio_device_weights)
152 fprintf(f,
153 "%sBlockIODeviceWeight=%s %lu",
154 prefix,
155 w->path,
156 w->weight);
157
158 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
159 char buf[FORMAT_BYTES_MAX];
160
161 fprintf(f,
162 "%s%s=%s %s\n",
163 prefix,
164 b->read ? "BlockIOReadBandwidth" : "BlockIOWriteBandwidth",
165 b->path,
166 format_bytes(buf, sizeof(buf), b->bandwidth));
167 }
168 }
169
/* Resolves p to the dev_t of the block device that blkio limits
 * should be applied to: either p itself (if it is a block device
 * node), or the whole-disk device backing the file system p lives
 * on. Returns 0 on success, negative errno-style error otherwise. */
static int lookup_blkio_device(const char *p, dev_t *dev) {
        struct stat st;

        assert(p);
        assert(dev);

        if (stat(p, &st) < 0) {
                log_warning("Couldn't stat device %s: %m", p);
                return -errno;
        }

        if (S_ISBLK(st.st_mode)) {
                *dev = st.st_rdev;
                return 0;
        }

        if (major(st.st_dev) != 0) {
                /* Not a device node, hence resolve the block device
                 * the file is stored on. */
                *dev = st.st_dev;

                /* If that is a partition, try to get the originating
                 * whole-disk device; failure is non-fatal and leaves
                 * the partition device in place. */
                block_get_whole_disk(*dev, dev);
                return 0;
        }

        log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
        return -ENODEV;
}
200
201 static int whitelist_device(const char *path, const char *node, const char *acc) {
202 char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
203 struct stat st;
204 int r;
205
206 assert(path);
207 assert(acc);
208
209 if (stat(node, &st) < 0) {
210 log_warning("Couldn't stat device %s", node);
211 return -errno;
212 }
213
214 if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
215 log_warning("%s is not a device.", node);
216 return -ENODEV;
217 }
218
219 sprintf(buf,
220 "%c %u:%u %s",
221 S_ISCHR(st.st_mode) ? 'c' : 'b',
222 major(st.st_rdev), minor(st.st_rdev),
223 acc);
224
225 r = cg_set_attribute("devices", path, "devices.allow", buf);
226 if (r < 0)
227 log_warning("Failed to set devices.allow on %s: %s", path, strerror(-r));
228
229 return r;
230 }
231
/* Whitelists all devices of a given major number in the cgroup at
 * path, by matching the driver name against /proc/devices.
 *
 * name: fnmatch() pattern matched against the driver name column
 *       (e.g. "pts", "kdbus/*")
 * type: 'c' for character devices, 'b' for block devices
 * acc:  access string, a combination of "r", "w", "m"
 *
 * Returns 0 if /proc/devices was parsed (individual attribute write
 * failures are only logged), negative errno-style error otherwise. */
static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
        _cleanup_fclose_ FILE *f = NULL;
        char line[LINE_MAX];
        bool good = false;
        int r;

        assert(path);
        assert(acc);
        assert(type == 'b' || type == 'c');

        f = fopen("/proc/devices", "re");
        if (!f) {
                log_warning("Cannot open /proc/devices to resolve %s (%c): %m", name, type);
                return -errno;
        }

        /* /proc/devices lists "Character devices:" and "Block
         * devices:" sections; "good" tracks whether we are currently
         * inside the section matching the requested type. */
        FOREACH_LINE(line, f, goto fail) {
                char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
                unsigned maj;

                truncate_nl(line);

                if (type == 'c' && streq(line, "Character devices:")) {
                        good = true;
                        continue;
                }

                if (type == 'b' && streq(line, "Block devices:")) {
                        good = true;
                        continue;
                }

                /* A blank line terminates a section */
                if (isempty(line)) {
                        good = false;
                        continue;
                }

                if (!good)
                        continue;

                /* Entry lines look like "  4 tty"; split into the
                 * major number and the driver name. */
                p = strstrip(line);

                w = strpbrk(p, WHITESPACE);
                if (!w)
                        continue;
                *w = 0;

                r = safe_atou(p, &maj);
                if (r < 0)
                        continue;
                /* maj is unsigned, so this only skips major 0 */
                if (maj <= 0)
                        continue;

                w++;
                w += strspn(w, WHITESPACE);

                if (fnmatch(name, w, 0) != 0)
                        continue;

                /* devices.allow syntax with a wildcard minor:
                 * "<c|b> <major>:* <access>" */
                sprintf(buf,
                        "%c %u:* %s",
                        type,
                        maj,
                        acc);

                r = cg_set_attribute("devices", path, "devices.allow", buf);
                if (r < 0)
                        log_warning("Failed to set devices.allow on %s: %s", path, strerror(-r));
        }

        return 0;

fail:
        log_warning("Failed to read /proc/devices: %m");
        return -errno;
}
308
/* Writes the context's resource settings into the cgroup attribute
 * files of the cgroup at path, for each controller requested in
 * mask. Failures to set individual attributes are logged but not
 * propagated: we apply as much as possible. */
void cgroup_context_apply(CGroupContext *c, CGroupControllerMask mask, const char *path) {
        bool is_root;
        int r;

        assert(c);
        assert(path);

        if (mask == 0)
                return;

        /* Some cgroup attributes are not supported on the root
         * cgroup, hence silently ignore them there. */
        is_root = isempty(path) || path_equal(path, "/");

        if ((mask & CGROUP_CPU) && !is_root) {
                /* Large enough for either an unsigned long shares
                 * value or a usec_t, plus the newline. */
                char buf[MAX(DECIMAL_STR_MAX(unsigned long), DECIMAL_STR_MAX(usec_t)) + 1];
                usec_t q;

                sprintf(buf, "%lu\n", c->cpu_shares);
                r = cg_set_attribute("cpu", path, "cpu.shares", buf);
                if (r < 0)
                        log_warning("Failed to set cpu.shares on %s: %s", path, strerror(-r));

                sprintf(buf, USEC_FMT "\n", c->cpu_quota_period_usec);
                r = cg_set_attribute("cpu", path, "cpu.cfs_period_us", buf);
                if (r < 0)
                        log_warning("Failed to set cpu.cfs_period_us on %s: %s", path, strerror(-r));

                /* An unset quota maps to "-1", which disables the
                 * CFS bandwidth limit in the kernel. */
                q = cgroup_context_get_cpu_quota_usec(c);
                if (q != (usec_t) -1) {
                        sprintf(buf, USEC_FMT "\n", q);
                        r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", buf);
                } else
                        r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", "-1");
                if (r < 0)
                        log_warning("Failed to set cpu.cfs_quota_us on %s: %s", path, strerror(-r));
        }

        if (mask & CGROUP_BLKIO) {
                /* Sized for the largest of the three value formats
                 * written below: a weight, "major:minor weight", or
                 * "major:minor bandwidth". */
                char buf[MAX3(DECIMAL_STR_MAX(unsigned long)+1,
                              DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(unsigned long)*1,
                              DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
                CGroupBlockIODeviceWeight *w;
                CGroupBlockIODeviceBandwidth *b;

                if (!is_root) {
                        sprintf(buf, "%lu\n", c->blockio_weight);
                        r = cg_set_attribute("blkio", path, "blkio.weight", buf);
                        if (r < 0)
                                log_warning("Failed to set blkio.weight on %s: %s", path, strerror(-r));

                        /* FIXME: no way to reset this list */
                        LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
                                dev_t dev;

                                /* Skip entries whose device cannot be
                                 * resolved; the lookup already logged. */
                                r = lookup_blkio_device(w->path, &dev);
                                if (r < 0)
                                        continue;

                                sprintf(buf, "%u:%u %lu", major(dev), minor(dev), w->weight);
                                r = cg_set_attribute("blkio", path, "blkio.weight_device", buf);
                                if (r < 0)
                                        log_error("Failed to set blkio.weight_device on %s: %s", path, strerror(-r));
                        }
                }

                /* FIXME: no way to reset this list */
                LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                        const char *a;
                        dev_t dev;

                        r = lookup_blkio_device(b->path, &dev);
                        if (r < 0)
                                continue;

                        a = b->read ? "blkio.throttle.read_bps_device" : "blkio.throttle.write_bps_device";

                        sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), b->bandwidth);
                        r = cg_set_attribute("blkio", path, a, buf);
                        if (r < 0)
                                log_error("Failed to set %s on %s: %s", a, path, strerror(-r));
                }
        }

        if (mask & CGROUP_MEMORY) {
                /* (uint64_t) -1 means "no limit configured", which
                 * maps to writing "-1" to remove any kernel limit. */
                if (c->memory_limit != (uint64_t) -1) {
                        char buf[DECIMAL_STR_MAX(uint64_t) + 1];

                        sprintf(buf, "%" PRIu64 "\n", c->memory_limit);
                        r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
                } else
                        r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");

                if (r < 0)
                        log_error("Failed to set memory.limit_in_bytes on %s: %s", path, strerror(-r));
        }

        if ((mask & CGROUP_DEVICE) && !is_root) {
                CGroupDeviceAllow *a;

                /* Reset the device list: deny everything first if a
                 * whitelist or a restrictive policy is configured,
                 * otherwise allow everything. */
                if (c->device_allow || c->device_policy != CGROUP_AUTO)
                        r = cg_set_attribute("devices", path, "devices.deny", "a");
                else
                        r = cg_set_attribute("devices", path, "devices.allow", "a");
                if (r < 0)
                        log_warning("Failed to reset devices.list on %s: %s", path, strerror(-r));

                if (c->device_policy == CGROUP_CLOSED ||
                    (c->device_policy == CGROUP_AUTO && c->device_allow)) {
                        /* Standard pseudo-devices every service needs */
                        static const char auto_devices[] =
                                "/dev/null\0" "rwm\0"
                                "/dev/zero\0" "rwm\0"
                                "/dev/full\0" "rwm\0"
                                "/dev/random\0" "rwm\0"
                                "/dev/urandom\0" "rwm\0"
                                "/dev/tty\0" "rwm\0"
                                "/dev/pts/ptmx\0" "rw\0"; /* /dev/pts/ptmx may not be duplicated, but accessed */

                        const char *x, *y;

                        NULSTR_FOREACH_PAIR(x, y, auto_devices)
                                whitelist_device(path, x, y);

                        whitelist_major(path, "pts", 'c', "rw");
                        whitelist_major(path, "kdbus", 'c', "rw");
                        whitelist_major(path, "kdbus/*", 'c', "rw");
                }

                LIST_FOREACH(device_allow, a, c->device_allow) {
                        char acc[4];
                        unsigned k = 0;

                        /* Build the access string from the r/w/m flags */
                        if (a->r)
                                acc[k++] = 'r';
                        if (a->w)
                                acc[k++] = 'w';
                        if (a->m)
                                acc[k++] = 'm';

                        if (k == 0)
                                continue;

                        acc[k++] = 0;

                        /* Entries may be a node path, or a
                         * "block-<name>"/"char-<name>" major lookup */
                        if (startswith(a->path, "/dev/"))
                                whitelist_device(path, a->path, acc);
                        else if (startswith(a->path, "block-"))
                                whitelist_major(path, a->path + 6, 'b', acc);
                        else if (startswith(a->path, "char-"))
                                whitelist_major(path, a->path + 5, 'c', acc);
                        else
                                log_debug("Ignoring device %s while writing cgroup attribute.", a->path);
                }
        }
}
464
465 CGroupControllerMask cgroup_context_get_mask(CGroupContext *c) {
466 CGroupControllerMask mask = 0;
467
468 /* Figure out which controllers we need */
469
470 if (c->cpu_accounting ||
471 c->cpu_shares != 1024 ||
472 c->cpu_quota_usec != (usec_t) -1 ||
473 c->cpu_quota_per_sec_usec != (usec_t) -1)
474 mask |= CGROUP_CPUACCT | CGROUP_CPU;
475
476 if (c->blockio_accounting ||
477 c->blockio_weight != 1000 ||
478 c->blockio_device_weights ||
479 c->blockio_device_bandwidths)
480 mask |= CGROUP_BLKIO;
481
482 if (c->memory_accounting ||
483 c->memory_limit != (uint64_t) -1)
484 mask |= CGROUP_MEMORY;
485
486 if (c->device_allow || c->device_policy != CGROUP_AUTO)
487 mask |= CGROUP_DEVICE;
488
489 return mask;
490 }
491
492 CGroupControllerMask unit_get_cgroup_mask(Unit *u) {
493 CGroupContext *c;
494
495 c = unit_get_cgroup_context(u);
496 if (!c)
497 return 0;
498
499 return cgroup_context_get_mask(c);
500 }
501
/* Returns the combined controller mask of all units contained in
 * this unit (only slices contain other units). The result is cached
 * in u->cgroup_members_mask until invalidated. */
CGroupControllerMask unit_get_members_mask(Unit *u) {
        assert(u);

        if (u->cgroup_members_mask_valid)
                return u->cgroup_members_mask;

        u->cgroup_members_mask = 0;

        if (u->type == UNIT_SLICE) {
                Unit *member;
                Iterator i;

                /* Slice members show up among the slice's
                 * UNIT_BEFORE dependencies; the slice check below
                 * filters out unrelated ordering dependencies. */
                SET_FOREACH(member, u->dependencies[UNIT_BEFORE], i) {

                        if (member == u)
                                continue;

                        /* Skip units ordered against us that aren't
                         * actually in this slice */
                        if (UNIT_DEREF(member->slice) != u)
                                continue;

                        /* Recurse: a member slice contributes its
                         * own members too */
                        u->cgroup_members_mask |=
                                unit_get_cgroup_mask(member) |
                                unit_get_members_mask(member);
                }
        }

        u->cgroup_members_mask_valid = true;
        return u->cgroup_members_mask;
}
531
532 CGroupControllerMask unit_get_siblings_mask(Unit *u) {
533 CGroupControllerMask m;
534
535 assert(u);
536
537 if (UNIT_ISSET(u->slice))
538 m = unit_get_members_mask(UNIT_DEREF(u->slice));
539 else
540 m = unit_get_cgroup_mask(u) | unit_get_members_mask(u);
541
542 /* Sibling propagation is only relevant for weight-based
543 * controllers, so let's mask out everything else */
544 return m & (CGROUP_CPU|CGROUP_BLKIO|CGROUP_CPUACCT);
545 }
546
547 CGroupControllerMask unit_get_target_mask(Unit *u) {
548 CGroupControllerMask mask;
549
550 mask = unit_get_cgroup_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
551 mask &= u->manager->cgroup_supported;
552
553 return mask;
554 }
555
/* Recurse from a unit up through its containing slices, propagating
 * mask bits upward. A unit is also member of itself. */
void unit_update_cgroup_members_masks(Unit *u) {
        CGroupControllerMask m;
        bool more;

        assert(u);

        /* Calculate subtree mask */
        m = unit_get_cgroup_mask(u) | unit_get_members_mask(u);

        /* See if anything changed from the previous invocation. If
         * not, we're done. */
        if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
                return;

        /* "more" is true only if the new mask strictly adds bits:
         * the old mask was valid, at least one new bit is set, and
         * no previously-set bit was cleared. Only in that case can
         * we cheaply OR the parent's cache instead of invalidating
         * it. */
        more =
                u->cgroup_subtree_mask_valid &&
                ((m & ~u->cgroup_subtree_mask) != 0) &&
                ((~m & u->cgroup_subtree_mask) == 0);

        u->cgroup_subtree_mask = m;
        u->cgroup_subtree_mask_valid = true;

        if (UNIT_ISSET(u->slice)) {
                Unit *s = UNIT_DEREF(u->slice);

                if (more)
                        /* There's more set now than before. We
                         * propagate the new mask to the parent's mask
                         * (not caring if it actually was valid or
                         * not). */

                        s->cgroup_members_mask |= m;

                else
                        /* There's less set now than before (or we
                         * don't know), we need to recalculate
                         * everything, so let's invalidate the
                         * parent's members mask */

                        s->cgroup_members_mask_valid = false;

                /* And now make sure that this change also hits our
                 * grandparents */
                unit_update_cgroup_members_masks(s);
        }
}
604
605 static const char *migrate_callback(CGroupControllerMask mask, void *userdata) {
606 Unit *u = userdata;
607
608 assert(mask != 0);
609 assert(u);
610
611 while (u) {
612 if (u->cgroup_path &&
613 u->cgroup_realized &&
614 (u->cgroup_realized_mask & mask) == mask)
615 return u->cgroup_path;
616
617 u = UNIT_DEREF(u->slice);
618 }
619
620 return NULL;
621 }
622
623 static int unit_create_cgroups(Unit *u, CGroupControllerMask mask) {
624 _cleanup_free_ char *path = NULL;
625 int r;
626
627 assert(u);
628
629 path = unit_default_cgroup_path(u);
630 if (!path)
631 return log_oom();
632
633 r = hashmap_put(u->manager->cgroup_unit, path, u);
634 if (r < 0) {
635 log_error(r == -EEXIST ? "cgroup %s exists already: %s" : "hashmap_put failed for %s: %s", path, strerror(-r));
636 return r;
637 }
638 if (r > 0) {
639 u->cgroup_path = path;
640 path = NULL;
641 }
642
643 /* First, create our own group */
644 r = cg_create_everywhere(u->manager->cgroup_supported, mask, u->cgroup_path);
645 if (r < 0) {
646 log_error("Failed to create cgroup %s: %s", u->cgroup_path, strerror(-r));
647 return r;
648 }
649
650 /* Keep track that this is now realized */
651 u->cgroup_realized = true;
652 u->cgroup_realized_mask = mask;
653
654 /* Then, possibly move things over */
655 r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
656 if (r < 0)
657 log_warning("Failed to migrate cgroup from to %s: %s", u->cgroup_path, strerror(-r));
658
659 return 0;
660 }
661
662 static bool unit_has_mask_realized(Unit *u, CGroupControllerMask mask) {
663 assert(u);
664
665 return u->cgroup_realized && u->cgroup_realized_mask == mask;
666 }
667
/* Check if necessary controllers and attributes for a unit are in place.
 *
 * If so, do nothing.
 * If not, create paths, move processes over, and set attributes.
 *
 * Returns 0 on success and < 0 on failure. */
static int unit_realize_cgroup_now(Unit *u) {
        CGroupControllerMask mask;
        int r;

        assert(u);

        /* Drop the unit from the work queue first — this is what
         * lets manager_dispatch_cgroup_queue()'s loop terminate. */
        if (u->in_cgroup_queue) {
                LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
                u->in_cgroup_queue = false;
        }

        mask = unit_get_target_mask(u);

        /* Already realized with exactly this controller set? */
        if (unit_has_mask_realized(u, mask))
                return 0;

        /* First, realize parents — cgroups must exist top-down */
        if (UNIT_ISSET(u->slice)) {
                r = unit_realize_cgroup_now(UNIT_DEREF(u->slice));
                if (r < 0)
                        return r;
        }

        /* And then do the real work */
        r = unit_create_cgroups(u, mask);
        if (r < 0)
                return r;

        /* Finally, apply the necessary attributes. */
        cgroup_context_apply(unit_get_cgroup_context(u), mask, u->cgroup_path);

        return 0;
}
707
708 static void unit_add_to_cgroup_queue(Unit *u) {
709
710 if (u->in_cgroup_queue)
711 return;
712
713 LIST_PREPEND(cgroup_queue, u->manager->cgroup_queue, u);
714 u->in_cgroup_queue = true;
715 }
716
717 unsigned manager_dispatch_cgroup_queue(Manager *m) {
718 Unit *i;
719 unsigned n = 0;
720 int r;
721
722 while ((i = m->cgroup_queue)) {
723 assert(i->in_cgroup_queue);
724
725 r = unit_realize_cgroup_now(i);
726 if (r < 0)
727 log_warning("Failed to realize cgroups for queued unit %s: %s", i->id, strerror(-r));
728
729 n++;
730 }
731
732 return n;
733 }
734
static void unit_queue_siblings(Unit *u) {
        Unit *slice;

        /* This adds the siblings of the specified unit and the
         * siblings of all parent units to the cgroup queue. (But
         * neither the specified unit itself nor the parents.) */

        while ((slice = UNIT_DEREF(u->slice))) {
                Iterator i;
                Unit *m;

                /* Slice members appear among the slice's UNIT_BEFORE
                 * dependencies; the checks below filter that set
                 * down to actual members worth queuing. */
                SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
                        if (m == u)
                                continue;

                        /* Skip units that have a dependency on the slice
                         * but aren't actually in it. */
                        if (UNIT_DEREF(m->slice) != slice)
                                continue;

                        /* No point in doing cgroup application for units
                         * without active processes. */
                        if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
                                continue;

                        /* If the unit doesn't need any new controllers
                         * and has current ones realized, it doesn't need
                         * any changes. */
                        if (unit_has_mask_realized(m, unit_get_target_mask(m)))
                                continue;

                        unit_add_to_cgroup_queue(m);
                }

                /* Move one level up and queue that slice's siblings too */
                u = slice;
        }
}
772
773 int unit_realize_cgroup(Unit *u) {
774 CGroupContext *c;
775
776 assert(u);
777
778 c = unit_get_cgroup_context(u);
779 if (!c)
780 return 0;
781
782 /* So, here's the deal: when realizing the cgroups for this
783 * unit, we need to first create all parents, but there's more
784 * actually: for the weight-based controllers we also need to
785 * make sure that all our siblings (i.e. units that are in the
786 * same slice as we are) have cgroups, too. Otherwise, things
787 * would become very uneven as each of their processes would
788 * get as much resources as all our group together. This call
789 * will synchronously create the parent cgroups, but will
790 * defer work on the siblings to the next event loop
791 * iteration. */
792
793 /* Add all sibling slices to the cgroup queue. */
794 unit_queue_siblings(u);
795
796 /* And realize this one now (and apply the values) */
797 return unit_realize_cgroup_now(u);
798 }
799
800 void unit_destroy_cgroup(Unit *u) {
801 int r;
802
803 assert(u);
804
805 if (!u->cgroup_path)
806 return;
807
808 r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !unit_has_name(u, SPECIAL_ROOT_SLICE));
809 if (r < 0)
810 log_debug("Failed to destroy cgroup %s: %s", u->cgroup_path, strerror(-r));
811
812 hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
813
814 free(u->cgroup_path);
815 u->cgroup_path = NULL;
816 u->cgroup_realized = false;
817 u->cgroup_realized_mask = 0;
818
819 }
820
/* Tries to determine the unit's main PID by scanning its cgroup:
 * returns the PID if exactly one child of ours lives there, and 0 if
 * there is no cgroup, no candidate, or the choice is ambiguous. */
pid_t unit_search_main_pid(Unit *u) {
        _cleanup_fclose_ FILE *f = NULL;
        pid_t pid = 0, npid, mypid;

        assert(u);

        if (!u->cgroup_path)
                return 0;

        if (cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f) < 0)
                return 0;

        mypid = getpid();
        while (cg_read_pid(f, &npid) > 0) {
                pid_t ppid;

                /* Skip the candidate we already picked */
                if (npid == pid)
                        continue;

                /* Ignore processes that aren't our kids */
                if (get_parent_of_pid(npid, &ppid) >= 0 && ppid != mypid)
                        continue;

                if (pid != 0) {
                        /* Dang, there's more than one daemonized PID
                        in this group, so we don't know what process
                        is the main process. */
                        pid = 0;
                        break;
                }

                pid = npid;
        }

        return pid;
}
857
/* One-time cgroup setup for the manager: determines the hierarchy
 * root, installs the release agent (system instance only), attaches
 * ourselves to the root cgroup and pins the cgroupfs mount.
 * Returns 0 on success, negative errno-style error on failure. */
int manager_setup_cgroup(Manager *m) {
        _cleanup_free_ char *path = NULL;
        char *e;
        int r;

        assert(m);

        /* 1. Determine hierarchy */
        free(m->cgroup_root);
        m->cgroup_root = NULL;

        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
        if (r < 0) {
                log_error("Cannot determine cgroup we are running in: %s", strerror(-r));
                return r;
        }

        /* LEGACY: Already in /system.slice? If so, let's cut this
         * off. This is to support live upgrades from older systemd
         * versions where PID 1 was moved there. */
        if (m->running_as == SYSTEMD_SYSTEM) {
                e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
                if (!e)
                        e = endswith(m->cgroup_root, "/system");
                if (e)
                        *e = 0;
        }

        /* And make sure to store away the root value without trailing
         * slash, even for the root dir, so that we can easily prepend
         * it everywhere. */
        if (streq(m->cgroup_root, "/"))
                m->cgroup_root[0] = 0;

        /* 2. Show data */
        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
        if (r < 0) {
                log_error("Cannot find cgroup mount point: %s", strerror(-r));
                return r;
        }

        log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);

        /* 3. Install agent */
        if (m->running_as == SYSTEMD_SYSTEM) {
                r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
                if (r < 0)
                        log_warning("Failed to install release agent, ignoring: %s", strerror(-r));
                else if (r > 0)
                        log_debug("Installed release agent.");
                else
                        log_debug("Release agent already installed.");
        }

        /* 4. Make sure we are in the root cgroup */
        r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, 0);
        if (r < 0) {
                log_error("Failed to create root cgroup hierarchy: %s", strerror(-r));
                return r;
        }

        /* 5. And pin it, so that it cannot be unmounted */
        safe_close(m->pin_cgroupfs_fd);

        m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
        if (m->pin_cgroupfs_fd < 0) {
                log_error("Failed to open pin file: %m");
                return -errno;
        }

        /* 6. Figure out which controllers are supported */
        m->cgroup_supported = cg_mask_supported();

        /* 7. Always enable hierarchical support if it exists... */
        cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");

        return 0;
}
936
937 void manager_shutdown_cgroup(Manager *m, bool delete) {
938 assert(m);
939
940 /* We can't really delete the group, since we are in it. But
941 * let's trim it. */
942 if (delete && m->cgroup_root)
943 cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
944
945 m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);
946
947 free(m->cgroup_root);
948 m->cgroup_root = NULL;
949 }
950
951 Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
952 char *p;
953 Unit *u;
954
955 assert(m);
956 assert(cgroup);
957
958 u = hashmap_get(m->cgroup_unit, cgroup);
959 if (u)
960 return u;
961
962 p = strdupa(cgroup);
963 for (;;) {
964 char *e;
965
966 e = strrchr(p, '/');
967 if (e == p || !e)
968 return NULL;
969
970 *e = 0;
971
972 u = hashmap_get(m->cgroup_unit, p);
973 if (u)
974 return u;
975 }
976 }
977
978 Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
979 _cleanup_free_ char *cgroup = NULL;
980 int r;
981
982 assert(m);
983
984 if (pid <= 1)
985 return NULL;
986
987 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
988 if (r < 0)
989 return NULL;
990
991 return manager_get_unit_by_cgroup(m, cgroup);
992 }
993
994 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
995 Unit *u;
996 int r;
997
998 assert(m);
999 assert(cgroup);
1000
1001 u = manager_get_unit_by_cgroup(m, cgroup);
1002 if (u) {
1003 r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, true);
1004 if (r > 0) {
1005 if (UNIT_VTABLE(u)->notify_cgroup_empty)
1006 UNIT_VTABLE(u)->notify_cgroup_empty(u);
1007
1008 unit_add_to_gc_queue(u);
1009 }
1010 }
1011
1012 return 0;
1013 }
1014
/* Human-readable names for CGroupDevicePolicy, as accepted by the
 * DevicePolicy= unit-file setting. The macro below generates the
 * *_to_string() / *_from_string() conversion helpers. */
static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
        [CGROUP_AUTO] = "auto",
        [CGROUP_CLOSED] = "closed",
        [CGROUP_STRICT] = "strict",
};

DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);