/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/

/***
  This file is part of systemd.

  Copyright 2013 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/

#include <fcntl.h>
#include <fnmatch.h>

#include "path-util.h"
#include "special.h"
#include "cgroup-util.h"
#include "cgroup.h"

void cgroup_context_init(CGroupContext *c) {
        assert(c);

        /* Initialize everything to the kernel defaults, assuming the
         * structure is preinitialized to 0 */

        c->cpu_shares = (unsigned long) -1;
        c->startup_cpu_shares = (unsigned long) -1;
        c->memory_limit = (uint64_t) -1;
        c->blockio_weight = (unsigned long) -1;
        c->startup_blockio_weight = (unsigned long) -1;

        c->cpu_quota_per_sec_usec = (usec_t) -1;
        c->cpu_quota_usec = (usec_t) -1;
        c->cpu_quota_period_usec = 100*USEC_PER_MSEC;
}

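/* For illustration: throughout this file a value of (unsigned long) -1,
 * (uint64_t) -1 or (usec_t) -1 acts as an "unset" marker, meaning the
 * kernel default stays in effect. E.g., right after cgroup_context_init():
 *
 *     CGroupContext c = {};
 *     cgroup_context_init(&c);
 *     assert(c.cpu_shares == (unsigned long) -1);            // unset
 *     assert(c.cpu_quota_period_usec == 100*USEC_PER_MSEC);  // 100ms default
 */
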
void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
        assert(c);
        assert(a);

        LIST_REMOVE(device_allow, c->device_allow, a);
        free(a->path);
        free(a);
}

void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
        assert(c);
        assert(w);

        LIST_REMOVE(device_weights, c->blockio_device_weights, w);
        free(w->path);
        free(w);
}

void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
        assert(c);
        assert(b);

        LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
        free(b->path);
        free(b);
}

void cgroup_context_done(CGroupContext *c) {
        assert(c);

        while (c->blockio_device_weights)
                cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);

        while (c->blockio_device_bandwidths)
                cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);

        while (c->device_allow)
                cgroup_context_free_device_allow(c, c->device_allow);
}

usec_t cgroup_context_get_cpu_quota_usec(CGroupContext *c) {
        assert(c);

        /* Returns the absolute CPU quota */

        if (c->cpu_quota_usec != (usec_t) -1)
                return c->cpu_quota_usec;
        else if (c->cpu_quota_per_sec_usec != (usec_t) -1)
                return c->cpu_quota_per_sec_usec*c->cpu_quota_period_usec/USEC_PER_SEC;
        else
                return (usec_t) -1;
}

usec_t cgroup_context_get_cpu_quota_per_sec_usec(CGroupContext *c) {
        assert(c);

        /* Returns the CPU quota relative to 1s */

        if (c->cpu_quota_usec != (usec_t) -1)
                return c->cpu_quota_usec*USEC_PER_SEC/c->cpu_quota_period_usec;
        else if (c->cpu_quota_per_sec_usec != (usec_t) -1)
                return c->cpu_quota_per_sec_usec;
        else
                return (usec_t) -1;
}

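/* Worked example for the two quota accessors above: with the default
 * 100ms period, an absolute quota of 20ms corresponds to a per-second
 * quota of 200ms, and vice versa:
 *
 *     c->cpu_quota_period_usec = 100*USEC_PER_MSEC;
 *     c->cpu_quota_usec = 20*USEC_PER_MSEC;
 *     // 20ms * 1s / 100ms = 200ms
 *     assert(cgroup_context_get_cpu_quota_per_sec_usec(c) == 200*USEC_PER_MSEC);
 *
 *     c->cpu_quota_usec = (usec_t) -1;
 *     c->cpu_quota_per_sec_usec = 200*USEC_PER_MSEC;
 *     // 200ms * 100ms / 1s = 20ms
 *     assert(cgroup_context_get_cpu_quota_usec(c) == 20*USEC_PER_MSEC);
 */
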
void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
        CGroupBlockIODeviceBandwidth *b;
        CGroupBlockIODeviceWeight *w;
        CGroupDeviceAllow *a;
        char t[FORMAT_TIMESPAN_MAX], s[FORMAT_TIMESPAN_MAX], u[FORMAT_TIMESPAN_MAX];

        assert(c);
        assert(f);

        prefix = strempty(prefix);

        fprintf(f,
                "%sCPUAccounting=%s\n"
                "%sBlockIOAccounting=%s\n"
                "%sMemoryAccounting=%s\n"
                "%sCPUShares=%lu\n"
                "%sStartupCPUShares=%lu\n"
                "%sCPUQuota=%s\n"
                "%sCPUQuotaPerSecSec=%s\n"
                "%sCPUQuotaPeriodSec=%s\n"
                "%sBlockIOWeight=%lu\n"
                "%sStartupBlockIOWeight=%lu\n"
                "%sMemoryLimit=%" PRIu64 "\n"
                "%sDevicePolicy=%s\n",
                prefix, yes_no(c->cpu_accounting),
                prefix, yes_no(c->blockio_accounting),
                prefix, yes_no(c->memory_accounting),
                prefix, c->cpu_shares,
                prefix, c->startup_cpu_shares,
                prefix, strna(format_timespan(u, sizeof(u), cgroup_context_get_cpu_quota_usec(c), 1)),
                prefix, strna(format_timespan(t, sizeof(t), cgroup_context_get_cpu_quota_per_sec_usec(c), 1)),
                prefix, strna(format_timespan(s, sizeof(s), c->cpu_quota_period_usec, 1)),
                prefix, c->blockio_weight,
                prefix, c->startup_blockio_weight,
                prefix, c->memory_limit,
                prefix, cgroup_device_policy_to_string(c->device_policy));

        LIST_FOREACH(device_allow, a, c->device_allow)
                fprintf(f,
                        "%sDeviceAllow=%s %s%s%s\n",
                        prefix,
                        a->path,
                        a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");

        LIST_FOREACH(device_weights, w, c->blockio_device_weights)
                fprintf(f,
                        "%sBlockIODeviceWeight=%s %lu\n",
                        prefix,
                        w->path,
                        w->weight);

        LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                char buf[FORMAT_BYTES_MAX];

                fprintf(f,
                        "%s%s=%s %s\n",
                        prefix,
                        b->read ? "BlockIOReadBandwidth" : "BlockIOWriteBandwidth",
                        b->path,
                        format_bytes(buf, sizeof(buf), b->bandwidth));
        }
}

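/* For illustration, a context with explicit CPUShares= and one read
 * bandwidth entry dumps roughly like this (values hypothetical, middle
 * lines elided):
 *
 *     CPUAccounting=no
 *     BlockIOAccounting=no
 *     MemoryAccounting=no
 *     CPUShares=1500
 *     ...
 *     DevicePolicy=auto
 *     BlockIOReadBandwidth=/var/log 1.0M
 */
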
static int lookup_blkio_device(const char *p, dev_t *dev) {
        struct stat st;
        int r;

        assert(p);
        assert(dev);

        r = stat(p, &st);
        if (r < 0) {
                log_warning("Couldn't stat device %s: %m", p);
                return -errno;
        }

        if (S_ISBLK(st.st_mode))
                *dev = st.st_rdev;
        else if (major(st.st_dev) != 0) {
                /* If this is not a device node then find the block
                 * device this file is stored on */
                *dev = st.st_dev;

                /* If this is a partition, try to get the originating
                 * block device */
                block_get_whole_disk(*dev, dev);
        } else {
                log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
                return -ENODEV;
        }

        return 0;
}

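/* For illustration: passing "/dev/sda" yields that node's st_rdev
 * directly; passing a regular path such as "/var/log" (hypothetical)
 * yields the st_dev of the backing file system, which
 * block_get_whole_disk() then resolves from a partition (e.g. sda1)
 * to the whole disk (sda). */
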
static int whitelist_device(const char *path, const char *node, const char *acc) {
        char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
        struct stat st;
        int r;

        assert(path);
        assert(acc);

        if (stat(node, &st) < 0) {
                log_warning("Couldn't stat device %s: %m", node);
                return -errno;
        }

        if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
                log_warning("%s is not a device.", node);
                return -ENODEV;
        }

        sprintf(buf,
                "%c %u:%u %s",
                S_ISCHR(st.st_mode) ? 'c' : 'b',
                major(st.st_rdev), minor(st.st_rdev),
                acc);

        r = cg_set_attribute("devices", path, "devices.allow", buf);
        if (r < 0)
                log_warning("Failed to set devices.allow on %s: %s", path, strerror(-r));

        return r;
}

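/* For illustration: whitelist_device(path, "/dev/null", "rwm") stats
 * /dev/null (character device 1:3) and writes the line
 *
 *     c 1:3 rwm
 *
 * into that cgroup's devices.allow, granting read, write and mknod
 * access to the node. */
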
static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
        _cleanup_fclose_ FILE *f = NULL;
        char line[LINE_MAX];
        bool good = false;
        int r;

        assert(path);
        assert(acc);
        assert(type == 'b' || type == 'c');

        f = fopen("/proc/devices", "re");
        if (!f) {
                log_warning("Cannot open /proc/devices to resolve %s (%c): %m", name, type);
                return -errno;
        }

        FOREACH_LINE(line, f, goto fail) {
                char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
                unsigned maj;

                truncate_nl(line);

                if (type == 'c' && streq(line, "Character devices:")) {
                        good = true;
                        continue;
                }

                if (type == 'b' && streq(line, "Block devices:")) {
                        good = true;
                        continue;
                }

                if (isempty(line)) {
                        good = false;
                        continue;
                }

                if (!good)
                        continue;

                p = strstrip(line);

                w = strpbrk(p, WHITESPACE);
                if (!w)
                        continue;
                *w = 0;

                r = safe_atou(p, &maj);
                if (r < 0)
                        continue;
                if (maj == 0)
                        continue;

                w++;
                w += strspn(w, WHITESPACE);

                if (fnmatch(name, w, 0) != 0)
                        continue;

                sprintf(buf,
                        "%c %u:* %s",
                        type,
                        maj,
                        acc);

                r = cg_set_attribute("devices", path, "devices.allow", buf);
                if (r < 0)
                        log_warning("Failed to set devices.allow on %s: %s", path, strerror(-r));
        }

        return 0;

fail:
        log_warning("Failed to read /proc/devices: %m");
        return -errno;
}

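/* For illustration, /proc/devices looks like this (majors vary by
 * system):
 *
 *     Character devices:
 *       1 mem
 *       5 /dev/tty
 *     136 pts
 *
 *     Block devices:
 *       8 sd
 *
 * so whitelist_major(path, "pts", 'c', "rw") matches the "pts" entry
 * and writes "c 136:* rw" to devices.allow, covering all pseudo
 * terminal slaves at once. */
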
void cgroup_context_apply(CGroupContext *c, CGroupControllerMask mask, const char *path, ManagerState state) {
        bool is_root;
        int r;

        assert(c);
        assert(path);

        if (mask == 0)
                return;

        /* Some cgroup attributes are not supported on the root cgroup,
         * hence silently ignore them there */
        is_root = isempty(path) || path_equal(path, "/");

        if ((mask & CGROUP_CPU) && !is_root) {
                char buf[MAX(DECIMAL_STR_MAX(unsigned long), DECIMAL_STR_MAX(usec_t)) + 1];
                usec_t q;

                sprintf(buf, "%lu\n",
                        state == MANAGER_STARTING && c->startup_cpu_shares != (unsigned long) -1 ? c->startup_cpu_shares :
                        c->cpu_shares != (unsigned long) -1 ? c->cpu_shares : 1024);
                r = cg_set_attribute("cpu", path, "cpu.shares", buf);
                if (r < 0)
                        log_warning("Failed to set cpu.shares on %s: %s", path, strerror(-r));

                sprintf(buf, USEC_FMT "\n", c->cpu_quota_period_usec);
                r = cg_set_attribute("cpu", path, "cpu.cfs_period_us", buf);
                if (r < 0)
                        log_warning("Failed to set cpu.cfs_period_us on %s: %s", path, strerror(-r));

                q = cgroup_context_get_cpu_quota_usec(c);
                if (q != (usec_t) -1) {
                        sprintf(buf, USEC_FMT "\n", q);
                        r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", buf);
                } else
                        r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", "-1");
                if (r < 0)
                        log_warning("Failed to set cpu.cfs_quota_us on %s: %s", path, strerror(-r));
        }

        if (mask & CGROUP_BLKIO) {
                char buf[MAX3(DECIMAL_STR_MAX(unsigned long)+1,
                              DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(unsigned long)*1,
                              DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
                CGroupBlockIODeviceWeight *w;
                CGroupBlockIODeviceBandwidth *b;

                if (!is_root) {
                        sprintf(buf, "%lu\n", state == MANAGER_STARTING && c->startup_blockio_weight != (unsigned long) -1 ? c->startup_blockio_weight :
                                c->blockio_weight != (unsigned long) -1 ? c->blockio_weight : 1000);
                        r = cg_set_attribute("blkio", path, "blkio.weight", buf);
                        if (r < 0)
                                log_warning("Failed to set blkio.weight on %s: %s", path, strerror(-r));

                        /* FIXME: no way to reset this list */
                        LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
                                dev_t dev;

                                r = lookup_blkio_device(w->path, &dev);
                                if (r < 0)
                                        continue;

                                sprintf(buf, "%u:%u %lu", major(dev), minor(dev), w->weight);
                                r = cg_set_attribute("blkio", path, "blkio.weight_device", buf);
                                if (r < 0)
                                        log_error("Failed to set blkio.weight_device on %s: %s", path, strerror(-r));
                        }
                }

                /* FIXME: no way to reset this list */
                LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                        const char *a;
                        dev_t dev;

                        r = lookup_blkio_device(b->path, &dev);
                        if (r < 0)
                                continue;

                        a = b->read ? "blkio.throttle.read_bps_device" : "blkio.throttle.write_bps_device";

                        sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), b->bandwidth);
                        r = cg_set_attribute("blkio", path, a, buf);
                        if (r < 0)
                                log_error("Failed to set %s on %s: %s", a, path, strerror(-r));
                }
        }

        if (mask & CGROUP_MEMORY) {
                if (c->memory_limit != (uint64_t) -1) {
                        char buf[DECIMAL_STR_MAX(uint64_t) + 1];

                        sprintf(buf, "%" PRIu64 "\n", c->memory_limit);
                        r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
                } else
                        r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");

                if (r < 0)
                        log_error("Failed to set memory.limit_in_bytes on %s: %s", path, strerror(-r));
        }

        if ((mask & CGROUP_DEVICE) && !is_root) {
                CGroupDeviceAllow *a;

                if (c->device_allow || c->device_policy != CGROUP_AUTO)
                        r = cg_set_attribute("devices", path, "devices.deny", "a");
                else
                        r = cg_set_attribute("devices", path, "devices.allow", "a");
                if (r < 0)
                        log_warning("Failed to reset devices.list on %s: %s", path, strerror(-r));

                if (c->device_policy == CGROUP_CLOSED ||
                    (c->device_policy == CGROUP_AUTO && c->device_allow)) {
                        static const char auto_devices[] =
                                "/dev/null\0" "rwm\0"
                                "/dev/zero\0" "rwm\0"
                                "/dev/full\0" "rwm\0"
                                "/dev/random\0" "rwm\0"
                                "/dev/urandom\0" "rwm\0"
                                "/dev/tty\0" "rwm\0"
                                "/dev/pts/ptmx\0" "rw\0"; /* /dev/pts/ptmx may not be duplicated, but accessed */

                        const char *x, *y;

                        NULSTR_FOREACH_PAIR(x, y, auto_devices)
                                whitelist_device(path, x, y);

                        whitelist_major(path, "pts", 'c', "rw");
                        whitelist_major(path, "kdbus", 'c', "rw");
                        whitelist_major(path, "kdbus/*", 'c', "rw");
                }

                LIST_FOREACH(device_allow, a, c->device_allow) {
                        char acc[4];
                        unsigned k = 0;

                        if (a->r)
                                acc[k++] = 'r';
                        if (a->w)
                                acc[k++] = 'w';
                        if (a->m)
                                acc[k++] = 'm';

                        if (k == 0)
                                continue;

                        acc[k++] = 0;

                        if (startswith(a->path, "/dev/"))
                                whitelist_device(path, a->path, acc);
                        else if (startswith(a->path, "block-"))
                                whitelist_major(path, a->path + 6, 'b', acc);
                        else if (startswith(a->path, "char-"))
                                whitelist_major(path, a->path + 5, 'c', acc);
                        else
                                log_debug("Ignoring device %s while writing cgroup attribute.", a->path);
                }
        }
}

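/* For illustration: applying a context with StartupCPUShares=2000 while
 * the manager is still starting up boils down to these attribute writes
 * (path hypothetical):
 *
 *     cg_set_attribute("cpu", "/system.slice/foo.service", "cpu.shares", "2000\n");
 *     cg_set_attribute("cpu", "/system.slice/foo.service", "cpu.cfs_period_us", "100000\n");
 *     cg_set_attribute("cpu", "/system.slice/foo.service", "cpu.cfs_quota_us", "-1");
 *
 * Once startup is over the same context is re-applied and cpu.shares
 * falls back to CPUShares= (or the kernel default of 1024). */
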
CGroupControllerMask cgroup_context_get_mask(CGroupContext *c) {
        CGroupControllerMask mask = 0;

        /* Figure out which controllers we need */

        if (c->cpu_accounting ||
            c->cpu_shares != (unsigned long) -1 ||
            c->startup_cpu_shares != (unsigned long) -1 ||
            c->cpu_quota_usec != (usec_t) -1 ||
            c->cpu_quota_per_sec_usec != (usec_t) -1)
                mask |= CGROUP_CPUACCT | CGROUP_CPU;

        if (c->blockio_accounting ||
            c->blockio_weight != (unsigned long) -1 ||
            c->startup_blockio_weight != (unsigned long) -1 ||
            c->blockio_device_weights ||
            c->blockio_device_bandwidths)
                mask |= CGROUP_BLKIO;

        if (c->memory_accounting ||
            c->memory_limit != (uint64_t) -1)
                mask |= CGROUP_MEMORY;

        if (c->device_allow || c->device_policy != CGROUP_AUTO)
                mask |= CGROUP_DEVICE;

        return mask;
}

CGroupControllerMask unit_get_cgroup_mask(Unit *u) {
        CGroupContext *c;

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        return cgroup_context_get_mask(c);
}

CGroupControllerMask unit_get_members_mask(Unit *u) {
        assert(u);

        if (u->cgroup_members_mask_valid)
                return u->cgroup_members_mask;

        u->cgroup_members_mask = 0;

        if (u->type == UNIT_SLICE) {
                Unit *member;
                Iterator i;

                SET_FOREACH(member, u->dependencies[UNIT_BEFORE], i) {

                        if (member == u)
                                continue;

                        if (UNIT_DEREF(member->slice) != u)
                                continue;

                        u->cgroup_members_mask |=
                                unit_get_cgroup_mask(member) |
                                unit_get_members_mask(member);
                }
        }

        u->cgroup_members_mask_valid = true;
        return u->cgroup_members_mask;
}

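/* For illustration, with a (hypothetical) tree
 *
 *     system.slice
 *     ├── a.service   (CPUShares=500   → CGROUP_CPU|CGROUP_CPUACCT)
 *     └── b.service   (MemoryLimit=1G  → CGROUP_MEMORY)
 *
 * unit_get_members_mask(system.slice) recurses over both members and
 * returns CGROUP_CPU|CGROUP_CPUACCT|CGROUP_MEMORY, cached in
 * u->cgroup_members_mask until invalidated. */
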
CGroupControllerMask unit_get_siblings_mask(Unit *u) {
        CGroupControllerMask m;

        assert(u);

        if (UNIT_ISSET(u->slice))
                m = unit_get_members_mask(UNIT_DEREF(u->slice));
        else
                m = unit_get_cgroup_mask(u) | unit_get_members_mask(u);

        /* Sibling propagation is only relevant for weight-based
         * controllers, so let's mask out everything else */
        return m & (CGROUP_CPU|CGROUP_BLKIO|CGROUP_CPUACCT);
}

CGroupControllerMask unit_get_target_mask(Unit *u) {
        CGroupControllerMask mask;

        mask = unit_get_cgroup_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
        mask &= u->manager->cgroup_supported;

        return mask;
}

/* Recurse from a unit up through its containing slices, propagating
 * mask bits upward. A unit is also a member of itself. */
void unit_update_cgroup_members_masks(Unit *u) {
        CGroupControllerMask m;
        bool more;

        assert(u);

        /* Calculate subtree mask */
        m = unit_get_cgroup_mask(u) | unit_get_members_mask(u);

        /* See if anything changed from the previous invocation. If
         * not, we're done. */
        if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
                return;

        more =
                u->cgroup_subtree_mask_valid &&
                ((m & ~u->cgroup_subtree_mask) != 0) &&
                ((~m & u->cgroup_subtree_mask) == 0);

        u->cgroup_subtree_mask = m;
        u->cgroup_subtree_mask_valid = true;

        if (UNIT_ISSET(u->slice)) {
                Unit *s = UNIT_DEREF(u->slice);

                if (more)
                        /* There's more set now than before. We
                         * propagate the new mask to the parent's mask
                         * (not caring if it actually was valid or
                         * not). */

                        s->cgroup_members_mask |= m;

                else
                        /* There's less set now than before (or we
                         * don't know), we need to recalculate
                         * everything, so let's invalidate the
                         * parent's members mask */

                        s->cgroup_members_mask_valid = false;

                /* And now make sure that this change also hits our
                 * grandparents */
                unit_update_cgroup_members_masks(s);
        }
}

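/* Worked example for the "more" test above: it is true iff the new mask
 * is a strict superset of the old one. With old mask CGROUP_CPU and new
 * mask CGROUP_CPU|CGROUP_MEMORY:
 *
 *     m & ~old == CGROUP_MEMORY   (!= 0, something was added)
 *     ~m & old == 0               (nothing was removed)
 *
 * so the new bits can simply be ORed into the parent. If any bit had
 * been dropped instead, the parent's cached members mask would have to
 * be invalidated and recomputed from scratch. */
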
static const char *migrate_callback(CGroupControllerMask mask, void *userdata) {
        Unit *u = userdata;

        assert(mask != 0);
        assert(u);

        while (u) {
                if (u->cgroup_path &&
                    u->cgroup_realized &&
                    (u->cgroup_realized_mask & mask) == mask)
                        return u->cgroup_path;

                u = UNIT_DEREF(u->slice);
        }

        return NULL;
}

static int unit_create_cgroups(Unit *u, CGroupControllerMask mask) {
        _cleanup_free_ char *path = NULL;
        int r;

        assert(u);

        path = unit_default_cgroup_path(u);
        if (!path)
                return log_oom();

        r = hashmap_put(u->manager->cgroup_unit, path, u);
        if (r < 0) {
                log_error(r == -EEXIST ? "cgroup %s exists already: %s" : "hashmap_put failed for %s: %s", path, strerror(-r));
                return r;
        }
        if (r > 0) {
                u->cgroup_path = path;
                path = NULL;
        }

        /* First, create our own group */
        r = cg_create_everywhere(u->manager->cgroup_supported, mask, u->cgroup_path);
        if (r < 0) {
                log_error("Failed to create cgroup %s: %s", u->cgroup_path, strerror(-r));
                return r;
        }

        /* Keep track that this is now realized */
        u->cgroup_realized = true;
        u->cgroup_realized_mask = mask;

        /* Then, possibly move things over */
        r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
        if (r < 0)
                log_warning("Failed to migrate cgroup to %s: %s", u->cgroup_path, strerror(-r));

        return 0;
}

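/* For illustration: for a (hypothetical) unit foo.service in
 * system.slice, unit_default_cgroup_path() yields
 * "/system.slice/foo.service" and the group is created in all supported
 * hierarchies. For controllers the unit itself has not realized,
 * migrate_callback() walks up the slice chain and returns the closest
 * realized ancestor's path as the migration target. */
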
static bool unit_has_mask_realized(Unit *u, CGroupControllerMask mask) {
        assert(u);

        return u->cgroup_realized && u->cgroup_realized_mask == mask;
}

/* Check if necessary controllers and attributes for a unit are in place.
 *
 * If so, do nothing.
 * If not, create paths, move processes over, and set attributes.
 *
 * Returns 0 on success and < 0 on failure. */
static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
        CGroupControllerMask mask;
        int r;

        assert(u);

        if (u->in_cgroup_queue) {
                LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
                u->in_cgroup_queue = false;
        }

        mask = unit_get_target_mask(u);

        if (unit_has_mask_realized(u, mask))
                return 0;

        /* First, realize parents */
        if (UNIT_ISSET(u->slice)) {
                r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
                if (r < 0)
                        return r;
        }

        /* And then do the real work */
        r = unit_create_cgroups(u, mask);
        if (r < 0)
                return r;

        /* Finally, apply the necessary attributes. */
        cgroup_context_apply(unit_get_cgroup_context(u), mask, u->cgroup_path, state);

        return 0;
}

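/* For illustration: realizing a (hypothetical) unit in
 * foo.slice/bar.service proceeds root-first via the recursion above:
 * first -.slice and foo.slice are realized, then bar.service itself,
 * and only then are the attributes applied, so every write lands in a
 * cgroup that already exists. */
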
static void unit_add_to_cgroup_queue(Unit *u) {

        if (u->in_cgroup_queue)
                return;

        LIST_PREPEND(cgroup_queue, u->manager->cgroup_queue, u);
        u->in_cgroup_queue = true;
}

unsigned manager_dispatch_cgroup_queue(Manager *m) {
        ManagerState state;
        unsigned n = 0;
        Unit *i;
        int r;

        state = manager_state(m);

        while ((i = m->cgroup_queue)) {
                assert(i->in_cgroup_queue);

                r = unit_realize_cgroup_now(i, state);
                if (r < 0)
                        log_warning("Failed to realize cgroups for queued unit %s: %s", i->id, strerror(-r));

                n++;
        }

        return n;
}

static void unit_queue_siblings(Unit *u) {
        Unit *slice;

        /* This adds the siblings of the specified unit and the
         * siblings of all parent units to the cgroup queue. (But
         * neither the specified unit itself nor the parents.) */

        while ((slice = UNIT_DEREF(u->slice))) {
                Iterator i;
                Unit *m;

                SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
                        if (m == u)
                                continue;

                        /* Skip units that have a dependency on the slice
                         * but aren't actually in it. */
                        if (UNIT_DEREF(m->slice) != slice)
                                continue;

                        /* No point in doing cgroup application for units
                         * without active processes. */
                        if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
                                continue;

                        /* If the unit doesn't need any new controllers
                         * and has current ones realized, it doesn't need
                         * any changes. */
                        if (unit_has_mask_realized(m, unit_get_target_mask(m)))
                                continue;

                        unit_add_to_cgroup_queue(m);
                }

                u = slice;
        }
}

int unit_realize_cgroup(Unit *u) {
        CGroupContext *c;

        assert(u);

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        /* So, here's the deal: when realizing the cgroups for this
         * unit, we need to first create all parents, but there's more
         * actually: for the weight-based controllers we also need to
         * make sure that all our siblings (i.e. units that are in the
         * same slice as we are) have cgroups, too. Otherwise, things
         * would become very uneven as each of their processes would
         * get as many resources as our whole group together. This
         * call will synchronously create the parent cgroups, but will
         * defer work on the siblings to the next event loop
         * iteration. */

        /* Add all sibling slices to the cgroup queue. */
        unit_queue_siblings(u);

        /* And realize this one now (and apply the values) */
        return unit_realize_cgroup_now(u, manager_state(u->manager));
}

void unit_destroy_cgroup(Unit *u) {
        int r;

        assert(u);

        if (!u->cgroup_path)
                return;

        r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !unit_has_name(u, SPECIAL_ROOT_SLICE));
        if (r < 0)
                log_debug("Failed to destroy cgroup %s: %s", u->cgroup_path, strerror(-r));

        hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);

        free(u->cgroup_path);
        u->cgroup_path = NULL;
        u->cgroup_realized = false;
        u->cgroup_realized_mask = 0;
}

pid_t unit_search_main_pid(Unit *u) {
        _cleanup_fclose_ FILE *f = NULL;
        pid_t pid = 0, npid, mypid;

        assert(u);

        if (!u->cgroup_path)
                return 0;

        if (cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f) < 0)
                return 0;

        mypid = getpid();
        while (cg_read_pid(f, &npid) > 0) {
                pid_t ppid;

                if (npid == pid)
                        continue;

                /* Ignore processes that aren't our kids */
                if (get_parent_of_pid(npid, &ppid) >= 0 && ppid != mypid)
                        continue;

                if (pid != 0) {
                        /* Dang, there's more than one daemonized PID
                         * in this group, so we don't know what process
                         * is the main process. */
                        pid = 0;
                        break;
                }

                pid = npid;
        }

        return pid;
}

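/* For illustration: if the unit's cgroup contains PIDs 100, 101 and 102
 * (hypothetical), where only 100 has the manager as its parent and
 * 101/102 are children of 100, the loop settles on 100 as the main PID.
 * If two candidates were direct children of the manager, the result
 * would be 0, i.e. "ambiguous, no main PID". */
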
int manager_setup_cgroup(Manager *m) {
        _cleanup_free_ char *path = NULL;
        char *e;
        int r;

        assert(m);

        /* 1. Determine hierarchy */
        free(m->cgroup_root);
        m->cgroup_root = NULL;

        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
        if (r < 0) {
                log_error("Cannot determine cgroup we are running in: %s", strerror(-r));
                return r;
        }

        /* LEGACY: Already in /system.slice? If so, let's cut this
         * off. This is to support live upgrades from older systemd
         * versions where PID 1 was moved there. */
        if (m->running_as == SYSTEMD_SYSTEM) {
                e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
                if (!e)
                        e = endswith(m->cgroup_root, "/system");
                if (e)
                        *e = 0;
        }

        /* And make sure to store away the root value without trailing
         * slash, even for the root dir, so that we can easily prepend
         * it everywhere. */
        if (streq(m->cgroup_root, "/"))
                m->cgroup_root[0] = 0;

        /* 2. Show data */
        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
        if (r < 0) {
                log_error("Cannot find cgroup mount point: %s", strerror(-r));
                return r;
        }

        log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);

        /* 3. Install agent */
        if (m->running_as == SYSTEMD_SYSTEM) {
                r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
                if (r < 0)
                        log_warning("Failed to install release agent, ignoring: %s", strerror(-r));
                else if (r > 0)
                        log_debug("Installed release agent.");
                else
                        log_debug("Release agent already installed.");
        }

        /* 4. Make sure we are in the root cgroup */
        r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, 0);
        if (r < 0) {
                log_error("Failed to create root cgroup hierarchy: %s", strerror(-r));
                return r;
        }

        /* 5. And pin it, so that it cannot be unmounted */
        safe_close(m->pin_cgroupfs_fd);

        m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
        if (m->pin_cgroupfs_fd < 0) {
                log_error("Failed to open pin file: %m");
                return -errno;
        }

        /* 6. Figure out which controllers are supported */
        m->cgroup_supported = cg_mask_supported();

        /* 7. Always enable hierarchical support if it exists... */
        cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");

        return 0;
}

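/* For illustration: if PID 1 finds itself in "/system" (a pre-slice
 * layout), the legacy branch trims that suffix so m->cgroup_root
 * becomes "". Storing the root without a trailing slash means paths
 * can be built by plain concatenation, e.g. "" + "/system.slice" ->
 * "/system.slice". */
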
void manager_shutdown_cgroup(Manager *m, bool delete) {
        assert(m);

        /* We can't really delete the group, since we are in it. But
         * let's trim it. */
        if (delete && m->cgroup_root)
                cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);

        m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);

        free(m->cgroup_root);
        m->cgroup_root = NULL;
}

Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
        char *p;
        Unit *u;

        assert(m);
        assert(cgroup);

        u = hashmap_get(m->cgroup_unit, cgroup);
        if (u)
                return u;

        p = strdupa(cgroup);
        for (;;) {
                char *e;

                e = strrchr(p, '/');
                if (e == p || !e)
                        return NULL;

                *e = 0;

                u = hashmap_get(m->cgroup_unit, p);
                if (u)
                        return u;
        }
}

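/* For illustration: looking up "/system.slice/foo.service/control"
 * (hypothetical) first misses in the hashmap, then the loop strips one
 * path component at a time and retries:
 *
 *     /system.slice/foo.service/control   -> miss
 *     /system.slice/foo.service           -> hit, foo.service
 *
 * so processes in sub-cgroups are still attributed to the owning
 * unit. */
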
Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
        _cleanup_free_ char *cgroup = NULL;
        int r;

        assert(m);

        if (pid <= 1)
                return NULL;

        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
        if (r < 0)
                return NULL;

        return manager_get_unit_by_cgroup(m, cgroup);
}

int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
        Unit *u;
        int r;

        assert(m);
        assert(cgroup);

        u = manager_get_unit_by_cgroup(m, cgroup);
        if (u) {
                r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, true);
                if (r > 0) {
                        if (UNIT_VTABLE(u)->notify_cgroup_empty)
                                UNIT_VTABLE(u)->notify_cgroup_empty(u);

                        unit_add_to_gc_queue(u);
                }
        }

        return 0;
}

static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
        [CGROUP_AUTO] = "auto",
        [CGROUP_CLOSED] = "closed",
        [CGROUP_STRICT] = "strict",
};

DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);