]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/cgroup.c
core: add startup resource control option
[thirdparty/systemd.git] / src / core / cgroup.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2013 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <fcntl.h>
23 #include <fnmatch.h>
24
25 #include "path-util.h"
26 #include "special.h"
27 #include "cgroup-util.h"
28 #include "cgroup.h"
29
30 void cgroup_context_init(CGroupContext *c) {
31 assert(c);
32
33 /* Initialize everything to the kernel defaults, assuming the
34 * structure is preinitialized to 0 */
35
36 c->cpu_shares = 1024;
37 c->startup_cpu_shares = 1024;
38 c->startup_cpu_shares_set = false;
39 c->memory_limit = (uint64_t) -1;
40 c->blockio_weight = 1000;
41 c->startup_blockio_weight = 1000;
42 c->startup_blockio_weight_set = false;
43
44 c->cpu_quota_per_sec_usec = (usec_t) -1;
45 c->cpu_quota_usec = (usec_t) -1;
46 c->cpu_quota_period_usec = 100*USEC_PER_MSEC;
47 }
48
/* Unlink a DeviceAllow entry from the context's list and free it,
 * including its owned path string. */
void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
        assert(c);
        assert(a);

        LIST_REMOVE(device_allow, c->device_allow, a);
        free(a->path);
        free(a);
}
57
/* Unlink a per-device block-IO weight entry from the context's list
 * and free it, including its owned path string. */
void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
        assert(c);
        assert(w);

        LIST_REMOVE(device_weights, c->blockio_device_weights, w);
        free(w->path);
        free(w);
}
66
/* Unlink a per-device block-IO bandwidth entry from the context's
 * list and free it, including its owned path string. */
void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
        assert(c);
        assert(b);

        LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
        free(b->path);
        free(b);
}
75
/* Release all list entries owned by the context. The CGroupContext
 * structure itself is not freed here; the caller owns it. */
void cgroup_context_done(CGroupContext *c) {
        assert(c);

        while (c->blockio_device_weights)
                cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);

        while (c->blockio_device_bandwidths)
                cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);

        while (c->device_allow)
                cgroup_context_free_device_allow(c, c->device_allow);
}
88
89 usec_t cgroup_context_get_cpu_quota_usec(CGroupContext *c) {
90 assert(c);
91
92 /* Returns the absolute CPU quota */
93
94 if (c->cpu_quota_usec != (usec_t) -1)
95 return c->cpu_quota_usec;
96 else if (c->cpu_quota_per_sec_usec != (usec_t) -1)
97 return c->cpu_quota_per_sec_usec*c->cpu_quota_period_usec/USEC_PER_SEC;
98 else
99 return (usec_t) -1;
100 }
101
102 usec_t cgroup_context_get_cpu_quota_per_sec_usec(CGroupContext *c) {
103 assert(c);
104
105 /* Returns the CPU quota relative to 1s */
106
107 if (c->cpu_quota_usec != (usec_t) -1)
108 return c->cpu_quota_usec*USEC_PER_SEC/c->cpu_quota_period_usec;
109 else if (c->cpu_quota_per_sec_usec != (usec_t) -1)
110 return c->cpu_quota_per_sec_usec;
111 else
112 return (usec_t) -1;
113 }
114
115 void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
116 CGroupBlockIODeviceBandwidth *b;
117 CGroupBlockIODeviceWeight *w;
118 CGroupDeviceAllow *a;
119 char t[FORMAT_TIMESPAN_MAX], s[FORMAT_TIMESPAN_MAX], u[FORMAT_TIMESPAN_MAX];
120
121 assert(c);
122 assert(f);
123
124 prefix = strempty(prefix);
125
126 fprintf(f,
127 "%sCPUAccounting=%s\n"
128 "%sBlockIOAccounting=%s\n"
129 "%sMemoryAccounting=%s\n"
130 "%sCPUShares=%lu\n"
131 "%sStartupCPUShares=%lu\n"
132 "%sCPUQuota=%s\n"
133 "%sCPUQuotaPerSecSec=%s\n"
134 "%sCPUQuotaPeriodSec=%s\n"
135 "%sBlockIOWeight=%lu\n"
136 "%sStartupBlockIOWeight=%lu\n"
137 "%sMemoryLimit=%" PRIu64 "\n"
138 "%sDevicePolicy=%s\n",
139 prefix, yes_no(c->cpu_accounting),
140 prefix, yes_no(c->blockio_accounting),
141 prefix, yes_no(c->memory_accounting),
142 prefix, c->cpu_shares,
143 prefix, c->startup_cpu_shares,
144 prefix, strna(format_timespan(u, sizeof(u), cgroup_context_get_cpu_quota_usec(c), 1)),
145 prefix, strna(format_timespan(t, sizeof(t), cgroup_context_get_cpu_quota_per_sec_usec(c), 1)),
146 prefix, strna(format_timespan(s, sizeof(s), c->cpu_quota_period_usec, 1)),
147 prefix, c->blockio_weight,
148 prefix, c->startup_blockio_weight,
149 prefix, c->memory_limit,
150 prefix, cgroup_device_policy_to_string(c->device_policy));
151
152 LIST_FOREACH(device_allow, a, c->device_allow)
153 fprintf(f,
154 "%sDeviceAllow=%s %s%s%s\n",
155 prefix,
156 a->path,
157 a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
158
159 LIST_FOREACH(device_weights, w, c->blockio_device_weights)
160 fprintf(f,
161 "%sBlockIODeviceWeight=%s %lu",
162 prefix,
163 w->path,
164 w->weight);
165
166 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
167 char buf[FORMAT_BYTES_MAX];
168
169 fprintf(f,
170 "%s%s=%s %s\n",
171 prefix,
172 b->read ? "BlockIOReadBandwidth" : "BlockIOWriteBandwidth",
173 b->path,
174 format_bytes(buf, sizeof(buf), b->bandwidth));
175 }
176 }
177
/* Resolve a path to the block device (dev_t) that backs it. If p is a
 * block device node, its own device number is used; otherwise the
 * device of the containing file system is used, mapped to the whole
 * disk if it refers to a partition. Returns 0 on success, negative
 * errno-style error otherwise. */
static int lookup_blkio_device(const char *p, dev_t *dev) {
        struct stat st;
        int r;

        assert(p);
        assert(dev);

        r = stat(p, &st);
        if (r < 0) {
                log_warning("Couldn't stat device %s: %m", p);
                return -errno;
        }

        if (S_ISBLK(st.st_mode))
                *dev = st.st_rdev;
        else if (major(st.st_dev) != 0) {
                /* If this is not a device node then find the block
                 * device this file is stored on */
                *dev = st.st_dev;

                /* If this is a partition, try to get the originating
                 * block device; best-effort, failure keeps st_dev. */
                block_get_whole_disk(*dev, dev);
        } else {
                /* major 0 means a virtual/network file system with no
                 * backing block device we could throttle. */
                log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
                return -ENODEV;
        }

        return 0;
}
208
209 static int whitelist_device(const char *path, const char *node, const char *acc) {
210 char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
211 struct stat st;
212 int r;
213
214 assert(path);
215 assert(acc);
216
217 if (stat(node, &st) < 0) {
218 log_warning("Couldn't stat device %s", node);
219 return -errno;
220 }
221
222 if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
223 log_warning("%s is not a device.", node);
224 return -ENODEV;
225 }
226
227 sprintf(buf,
228 "%c %u:%u %s",
229 S_ISCHR(st.st_mode) ? 'c' : 'b',
230 major(st.st_rdev), minor(st.st_rdev),
231 acc);
232
233 r = cg_set_attribute("devices", path, "devices.allow", buf);
234 if (r < 0)
235 log_warning("Failed to set devices.allow on %s: %s", path, strerror(-r));
236
237 return r;
238 }
239
/* Whitelist every device of a given type ('c' or 'b') whose driver
 * name in /proc/devices matches the fnmatch() pattern 'name', by
 * writing "<type> <major>:* <acc>" entries to devices.allow of the
 * cgroup at 'path'. Returns 0 unless /proc/devices cannot be read;
 * per-entry write failures are only logged. */
static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
        _cleanup_fclose_ FILE *f = NULL;
        char line[LINE_MAX];
        bool good = false;
        int r;

        assert(path);
        assert(acc);
        assert(type == 'b' || type == 'c');

        f = fopen("/proc/devices", "re");
        if (!f) {
                log_warning("Cannot open /proc/devices to resolve %s (%c): %m", name, type);
                return -errno;
        }

        FOREACH_LINE(line, f, goto fail) {
                char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
                unsigned maj;

                truncate_nl(line);

                /* /proc/devices lists character devices and block
                 * devices in separate sections, each introduced by a
                 * header line and terminated by an empty line; 'good'
                 * tracks whether we are inside the wanted section. */
                if (type == 'c' && streq(line, "Character devices:")) {
                        good = true;
                        continue;
                }

                if (type == 'b' && streq(line, "Block devices:")) {
                        good = true;
                        continue;
                }

                if (isempty(line)) {
                        good = false;
                        continue;
                }

                if (!good)
                        continue;

                /* Each entry is "<major> <driver-name>" */
                p = strstrip(line);

                w = strpbrk(p, WHITESPACE);
                if (!w)
                        continue;
                *w = 0;

                r = safe_atou(p, &maj);
                if (r < 0)
                        continue;
                if (maj <= 0)
                        continue;

                w++;
                w += strspn(w, WHITESPACE);

                /* Match driver name against the caller's pattern */
                if (fnmatch(name, w, 0) != 0)
                        continue;

                sprintf(buf,
                        "%c %u:* %s",
                        type,
                        maj,
                        acc);

                r = cg_set_attribute("devices", path, "devices.allow", buf);
                if (r < 0)
                        log_warning("Failed to set devices.allow on %s: %s", path, strerror(-r));
        }

        return 0;

fail:
        log_warning("Failed to read /proc/devices: %m");
        return -errno;
}
316
/* Write the cgroup attributes described by context 'c' to the cgroup
 * at 'path', limited to the controllers selected in 'mask'. During
 * manager startup the Startup* variants of shares/weights are used.
 * All writes are best-effort: failures are logged, never returned. */
void cgroup_context_apply(Manager *m, CGroupContext *c, CGroupControllerMask mask, const char *path) {
        bool is_root;
        int r;

        assert(c);
        assert(path);

        if (mask == 0)
                return;

        /* Some cgroup attributes are not supported on the root cgroup,
         * hence silently ignore */
        is_root = isempty(path) || path_equal(path, "/");

        if ((mask & CGROUP_CPU) && !is_root) {
                char buf[MAX(DECIMAL_STR_MAX(unsigned long), DECIMAL_STR_MAX(usec_t)) + 1];
                usec_t q;

                /* Startup shares apply only while the manager is
                 * still starting up. */
                sprintf(buf, "%lu\n", manager_state(m) == MANAGER_STARTING
                        ? c->startup_cpu_shares
                        : c->cpu_shares);
                r = cg_set_attribute("cpu", path, "cpu.shares", buf);
                if (r < 0)
                        log_warning("Failed to set cpu.shares on %s: %s", path, strerror(-r));

                sprintf(buf, USEC_FMT "\n", c->cpu_quota_period_usec);
                r = cg_set_attribute("cpu", path, "cpu.cfs_period_us", buf);
                if (r < 0)
                        log_warning("Failed to set cpu.cfs_period_us on %s: %s", path, strerror(-r));

                /* (usec_t) -1 means no quota configured; write "-1" to
                 * disable the CFS quota in that case. */
                q = cgroup_context_get_cpu_quota_usec(c);
                if (q != (usec_t) -1) {
                        sprintf(buf, USEC_FMT "\n", q);
                        r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", buf);
                } else
                        r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", "-1");
                if (r < 0)
                        log_warning("Failed to set cpu.cfs_quota_us on %s: %s", path, strerror(-r));
        }

        if (mask & CGROUP_BLKIO) {
                /* Sized for the largest of the three line formats
                 * written below. */
                char buf[MAX3(DECIMAL_STR_MAX(unsigned long)+1,
                              DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(unsigned long)*1,
                              DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
                CGroupBlockIODeviceWeight *w;
                CGroupBlockIODeviceBandwidth *b;

                if (!is_root) {
                        sprintf(buf, "%lu\n", manager_state(m) == MANAGER_STARTING
                                ? c->startup_blockio_weight
                                : c->blockio_weight);
                        r = cg_set_attribute("blkio", path, "blkio.weight", buf);
                        if (r < 0)
                                log_warning("Failed to set blkio.weight on %s: %s", path, strerror(-r));

                        /* FIXME: no way to reset this list */
                        LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
                                dev_t dev;

                                r = lookup_blkio_device(w->path, &dev);
                                if (r < 0)
                                        continue;

                                sprintf(buf, "%u:%u %lu", major(dev), minor(dev), w->weight);
                                r = cg_set_attribute("blkio", path, "blkio.weight_device", buf);
                                if (r < 0)
                                        log_error("Failed to set blkio.weight_device on %s: %s", path, strerror(-r));
                        }
                }

                /* FIXME: no way to reset this list */
                LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                        const char *a;
                        dev_t dev;

                        r = lookup_blkio_device(b->path, &dev);
                        if (r < 0)
                                continue;

                        a = b->read ? "blkio.throttle.read_bps_device" : "blkio.throttle.write_bps_device";

                        sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), b->bandwidth);
                        r = cg_set_attribute("blkio", path, a, buf);
                        if (r < 0)
                                log_error("Failed to set %s on %s: %s", a, path, strerror(-r));
                }
        }

        if (mask & CGROUP_MEMORY) {
                /* (uint64_t) -1 means no limit; "-1" resets the knob */
                if (c->memory_limit != (uint64_t) -1) {
                        char buf[DECIMAL_STR_MAX(uint64_t) + 1];

                        sprintf(buf, "%" PRIu64 "\n", c->memory_limit);
                        r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
                } else
                        r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");

                if (r < 0)
                        log_error("Failed to set memory.limit_in_bytes on %s: %s", path, strerror(-r));
        }

        if ((mask & CGROUP_DEVICE) && !is_root) {
                CGroupDeviceAllow *a;

                /* Start from deny-all if any whitelist entries or a
                 * non-default policy are configured, allow-all
                 * otherwise. */
                if (c->device_allow || c->device_policy != CGROUP_AUTO)
                        r = cg_set_attribute("devices", path, "devices.deny", "a");
                else
                        r = cg_set_attribute("devices", path, "devices.allow", "a");
                if (r < 0)
                        log_warning("Failed to reset devices.list on %s: %s", path, strerror(-r));

                if (c->device_policy == CGROUP_CLOSED ||
                    (c->device_policy == CGROUP_AUTO && c->device_allow)) {
                        /* Standard pseudo-devices that basically every
                         * service needs access to. */
                        static const char auto_devices[] =
                                "/dev/null\0" "rwm\0"
                                "/dev/zero\0" "rwm\0"
                                "/dev/full\0" "rwm\0"
                                "/dev/random\0" "rwm\0"
                                "/dev/urandom\0" "rwm\0"
                                "/dev/tty\0" "rwm\0"
                                "/dev/pts/ptmx\0" "rw\0"; /* /dev/pts/ptmx may not be duplicated, but accessed */

                        const char *x, *y;

                        NULSTR_FOREACH_PAIR(x, y, auto_devices)
                                whitelist_device(path, x, y);

                        whitelist_major(path, "pts", 'c', "rw");
                        whitelist_major(path, "kdbus", 'c', "rw");
                        whitelist_major(path, "kdbus/*", 'c', "rw");
                }

                LIST_FOREACH(device_allow, a, c->device_allow) {
                        char acc[4];
                        unsigned k = 0;

                        /* Build the access string from the r/w/m flags */
                        if (a->r)
                                acc[k++] = 'r';
                        if (a->w)
                                acc[k++] = 'w';
                        if (a->m)
                                acc[k++] = 'm';

                        if (k == 0)
                                continue;

                        acc[k++] = 0;

                        /* Entries may be a literal /dev path, or a
                         * "block-<name>"/"char-<name>" major lookup. */
                        if (startswith(a->path, "/dev/"))
                                whitelist_device(path, a->path, acc);
                        else if (startswith(a->path, "block-"))
                                whitelist_major(path, a->path + 6, 'b', acc);
                        else if (startswith(a->path, "char-"))
                                whitelist_major(path, a->path + 5, 'c', acc);
                        else
                                log_debug("Ignoring device %s while writing cgroup attribute.", a->path);
                }
        }
}
476
/* Figure out which cgroup controllers the context needs, based on
 * which settings deviate from the kernel defaults. NOTE(review): this
 * also has a side effect — once the manager has left the STARTING
 * state, the startup_*_set flags are cleared, so the startup values
 * stop keeping the controller enabled on later calls. */
CGroupControllerMask cgroup_context_get_mask(Manager *m, CGroupContext *c) {
        CGroupControllerMask mask = 0;

        /* Figure out which controllers we need */

        if (c->cpu_accounting ||
            (manager_state(m) == MANAGER_STARTING ? c->startup_cpu_shares : c->cpu_shares) != 1024 ||
            (manager_state(m) != MANAGER_STARTING && c->startup_cpu_shares_set && c->startup_cpu_shares != c->cpu_shares) ||
            c->cpu_quota_usec != (usec_t) -1 ||
            c->cpu_quota_per_sec_usec != (usec_t) -1) {
                mask |= CGROUP_CPUACCT | CGROUP_CPU;
                if (manager_state(m) != MANAGER_STARTING)
                        c->startup_cpu_shares_set = false;
        }

        if (c->blockio_accounting ||
            (manager_state(m) == MANAGER_STARTING ? c->startup_blockio_weight : c->blockio_weight) != 1000 ||
            (manager_state(m) != MANAGER_STARTING && c->startup_blockio_weight_set && c->startup_blockio_weight != c->blockio_weight) ||
            c->blockio_device_weights ||
            c->blockio_device_bandwidths) {
                mask |= CGROUP_BLKIO;
                if (manager_state(m) != MANAGER_STARTING)
                        c->startup_blockio_weight_set = false;
        }

        if (c->memory_accounting ||
            c->memory_limit != (uint64_t) -1)
                mask |= CGROUP_MEMORY;

        if (c->device_allow || c->device_policy != CGROUP_AUTO)
                mask |= CGROUP_DEVICE;

        return mask;
}
511
512 CGroupControllerMask unit_get_cgroup_mask(Unit *u) {
513 CGroupContext *c;
514
515 c = unit_get_cgroup_context(u);
516 if (!c)
517 return 0;
518
519 return cgroup_context_get_mask(u->manager, c);
520 }
521
/* Controllers required by the unit's direct and transitive members.
 * Only slices have members; for any other unit type the mask is 0.
 * The result is cached in u->cgroup_members_mask until invalidated. */
CGroupControllerMask unit_get_members_mask(Unit *u) {
        assert(u);

        if (u->cgroup_members_mask_valid)
                return u->cgroup_members_mask;

        u->cgroup_members_mask = 0;

        if (u->type == UNIT_SLICE) {
                Unit *member;
                Iterator i;

                /* Members of a slice carry a Before= dependency on it;
                 * filter further by the actual slice membership. */
                SET_FOREACH(member, u->dependencies[UNIT_BEFORE], i) {

                        if (member == u)
                                continue;

                        if (UNIT_DEREF(member->slice) != u)
                                continue;

                        u->cgroup_members_mask |=
                                unit_get_cgroup_mask(member) |
                                unit_get_members_mask(member);
                }
        }

        u->cgroup_members_mask_valid = true;
        return u->cgroup_members_mask;
}
551
552 CGroupControllerMask unit_get_siblings_mask(Unit *u) {
553 CGroupControllerMask m;
554
555 assert(u);
556
557 if (UNIT_ISSET(u->slice))
558 m = unit_get_members_mask(UNIT_DEREF(u->slice));
559 else
560 m = unit_get_cgroup_mask(u) | unit_get_members_mask(u);
561
562 /* Sibling propagation is only relevant for weight-based
563 * controllers, so let's mask out everything else */
564 return m & (CGROUP_CPU|CGROUP_BLKIO|CGROUP_CPUACCT);
565 }
566
567 CGroupControllerMask unit_get_target_mask(Unit *u) {
568 CGroupControllerMask mask;
569
570 mask = unit_get_cgroup_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
571 mask &= u->manager->cgroup_supported;
572
573 return mask;
574 }
575
/* Recurse from a unit up through its containing slices, propagating
 * mask bits upward. A unit is also member of itself. */
void unit_update_cgroup_members_masks(Unit *u) {
        CGroupControllerMask m;
        bool more;

        assert(u);

        /* Calculate subtree mask */
        m = unit_get_cgroup_mask(u) | unit_get_members_mask(u);

        /* See if anything changed from the previous invocation. If
         * not, we're done. */
        if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
                return;

        /* 'more' means bits were only added, never removed, compared
         * to the cached subtree mask — the cheap propagation case. */
        more =
                u->cgroup_subtree_mask_valid &&
                ((m & ~u->cgroup_subtree_mask) != 0) &&
                ((~m & u->cgroup_subtree_mask) == 0);

        u->cgroup_subtree_mask = m;
        u->cgroup_subtree_mask_valid = true;

        if (UNIT_ISSET(u->slice)) {
                Unit *s = UNIT_DEREF(u->slice);

                if (more)
                        /* There's more set now than before. We
                         * propagate the new mask to the parent's mask
                         * (not caring if it actually was valid or
                         * not). */

                        s->cgroup_members_mask |= m;

                else
                        /* There's less set now than before (or we
                         * don't know), we need to recalculate
                         * everything, so let's invalidate the
                         * parent's members mask */

                        s->cgroup_members_mask_valid = false;

                /* And now make sure that this change also hits our
                 * grandparents */
                unit_update_cgroup_members_masks(s);
        }
}
624
625 static const char *migrate_callback(CGroupControllerMask mask, void *userdata) {
626 Unit *u = userdata;
627
628 assert(mask != 0);
629 assert(u);
630
631 while (u) {
632 if (u->cgroup_path &&
633 u->cgroup_realized &&
634 (u->cgroup_realized_mask & mask) == mask)
635 return u->cgroup_path;
636
637 u = UNIT_DEREF(u->slice);
638 }
639
640 return NULL;
641 }
642
643 static int unit_create_cgroups(Unit *u, CGroupControllerMask mask) {
644 _cleanup_free_ char *path = NULL;
645 int r;
646
647 assert(u);
648
649 path = unit_default_cgroup_path(u);
650 if (!path)
651 return log_oom();
652
653 r = hashmap_put(u->manager->cgroup_unit, path, u);
654 if (r < 0) {
655 log_error(r == -EEXIST ? "cgroup %s exists already: %s" : "hashmap_put failed for %s: %s", path, strerror(-r));
656 return r;
657 }
658 if (r > 0) {
659 u->cgroup_path = path;
660 path = NULL;
661 }
662
663 /* First, create our own group */
664 r = cg_create_everywhere(u->manager->cgroup_supported, mask, u->cgroup_path);
665 if (r < 0) {
666 log_error("Failed to create cgroup %s: %s", u->cgroup_path, strerror(-r));
667 return r;
668 }
669
670 /* Keep track that this is now realized */
671 u->cgroup_realized = true;
672 u->cgroup_realized_mask = mask;
673
674 /* Then, possibly move things over */
675 r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
676 if (r < 0)
677 log_warning("Failed to migrate cgroup from to %s: %s", u->cgroup_path, strerror(-r));
678
679 return 0;
680 }
681
682 static bool unit_has_mask_realized(Unit *u, CGroupControllerMask mask) {
683 assert(u);
684
685 return u->cgroup_realized && u->cgroup_realized_mask == mask;
686 }
687
688 /* Check if necessary controllers and attributes for a unit are in place.
689 *
690 * If so, do nothing.
691 * If not, create paths, move processes over, and set attributes.
692 *
693 * Returns 0 on success and < 0 on failure. */
/* Check if necessary controllers and attributes for a unit are in place.
 *
 * If so, do nothing.
 * If not, create paths, move processes over, and set attributes.
 *
 * Returns 0 on success and < 0 on failure. */
static int unit_realize_cgroup_now(Unit *u) {
        CGroupControllerMask mask;
        int r;

        assert(u);

        /* Realizing now makes any pending queued realization moot. */
        if (u->in_cgroup_queue) {
                LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
                u->in_cgroup_queue = false;
        }

        mask = unit_get_target_mask(u);

        if (unit_has_mask_realized(u, mask))
                return 0;

        /* First, realize parents */
        if (UNIT_ISSET(u->slice)) {
                r = unit_realize_cgroup_now(UNIT_DEREF(u->slice));
                if (r < 0)
                        return r;
        }

        /* And then do the real work */
        r = unit_create_cgroups(u, mask);
        if (r < 0)
                return r;

        /* Finally, apply the necessary attributes. */
        cgroup_context_apply(u->manager, unit_get_cgroup_context(u), mask, u->cgroup_path);

        return 0;
}
727
728 static void unit_add_to_cgroup_queue(Unit *u) {
729
730 if (u->in_cgroup_queue)
731 return;
732
733 LIST_PREPEND(cgroup_queue, u->manager->cgroup_queue, u);
734 u->in_cgroup_queue = true;
735 }
736
737 unsigned manager_dispatch_cgroup_queue(Manager *m) {
738 Unit *i;
739 unsigned n = 0;
740 int r;
741
742 while ((i = m->cgroup_queue)) {
743 assert(i->in_cgroup_queue);
744
745 r = unit_realize_cgroup_now(i);
746 if (r < 0)
747 log_warning("Failed to realize cgroups for queued unit %s: %s", i->id, strerror(-r));
748
749 n++;
750 }
751
752 return n;
753 }
754
static void unit_queue_siblings(Unit *u) {
        Unit *slice;

        /* This adds the siblings of the specified unit and the
         * siblings of all parent units to the cgroup queue. (But
         * neither the specified unit itself nor the parents.) */

        while ((slice = UNIT_DEREF(u->slice))) {
                Iterator i;
                Unit *m;

                /* Candidate siblings carry a Before= dependency on
                 * the slice. */
                SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
                        if (m == u)
                                continue;

                        /* Skip units that have a dependency on the slice
                         * but aren't actually in it. */
                        if (UNIT_DEREF(m->slice) != slice)
                                continue;

                        /* No point in doing cgroup application for units
                         * without active processes. */
                        if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
                                continue;

                        /* If the unit doesn't need any new controllers
                         * and has current ones realized, it doesn't need
                         * any changes. */
                        if (unit_has_mask_realized(m, unit_get_target_mask(m)))
                                continue;

                        unit_add_to_cgroup_queue(m);
                }

                /* Move up one level and repeat for the parent's siblings */
                u = slice;
        }
}
792
/* Public entry point: realize the unit's cgroup synchronously, and
 * queue its siblings for deferred realization. Returns 0 if the unit
 * has no cgroup context, otherwise the result of realization. */
int unit_realize_cgroup(Unit *u) {
        CGroupContext *c;

        assert(u);

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        /* So, here's the deal: when realizing the cgroups for this
         * unit, we need to first create all parents, but there's more
         * actually: for the weight-based controllers we also need to
         * make sure that all our siblings (i.e. units that are in the
         * same slice as we are) have cgroups, too. Otherwise, things
         * would become very uneven as each of their processes would
         * get as much resources as all our group together. This call
         * will synchronously create the parent cgroups, but will
         * defer work on the siblings to the next event loop
         * iteration. */

        /* Add all sibling slices to the cgroup queue. */
        unit_queue_siblings(u);

        /* And realize this one now (and apply the values) */
        return unit_realize_cgroup_now(u);
}
819
/* Tear down the unit's cgroup: trim it in all hierarchies, drop it
 * from the manager's cgroup→unit map and reset the unit's realization
 * state. No-op for units without a cgroup path. */
void unit_destroy_cgroup(Unit *u) {
        int r;

        assert(u);

        if (!u->cgroup_path)
                return;

        /* The root slice's own cgroup must not be removed, only its
         * children; hence the special-case on the last argument. */
        r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !unit_has_name(u, SPECIAL_ROOT_SLICE));
        if (r < 0)
                log_debug("Failed to destroy cgroup %s: %s", u->cgroup_path, strerror(-r));

        hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);

        free(u->cgroup_path);
        u->cgroup_path = NULL;
        u->cgroup_realized = false;
        u->cgroup_realized_mask = 0;

}
840
/* Heuristically determine the unit's main PID by scanning its cgroup:
 * the main process is the single process whose parent is not us (i.e.
 * a daemonized child). Returns 0 when there is no cgroup, the cgroup
 * cannot be enumerated, or the candidate is ambiguous. */
pid_t unit_search_main_pid(Unit *u) {
        _cleanup_fclose_ FILE *f = NULL;
        pid_t pid = 0, npid, mypid;

        assert(u);

        if (!u->cgroup_path)
                return 0;

        if (cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f) < 0)
                return 0;

        mypid = getpid();
        while (cg_read_pid(f, &npid) > 0) {
                pid_t ppid;

                /* Skip duplicates of the current candidate */
                if (npid == pid)
                        continue;

                /* Ignore processes that aren't our kids */
                if (get_parent_of_pid(npid, &ppid) >= 0 && ppid != mypid)
                        continue;

                if (pid != 0) {
                        /* Dang, there's more than one daemonized PID
                        in this group, so we don't know what process
                        is the main process. */
                        pid = 0;
                        break;
                }

                pid = npid;
        }

        return pid;
}
877
/* Initialize the manager's cgroup setup: determine and normalize our
 * root cgroup, install the release agent (system instance only), move
 * ourselves into the root group, pin the cgroupfs mount and probe
 * supported controllers. Returns 0 on success, negative errno-style
 * error otherwise. */
int manager_setup_cgroup(Manager *m) {
        _cleanup_free_ char *path = NULL;
        char *e;
        int r;

        assert(m);

        /* 1. Determine hierarchy */
        free(m->cgroup_root);
        m->cgroup_root = NULL;

        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
        if (r < 0) {
                log_error("Cannot determine cgroup we are running in: %s", strerror(-r));
                return r;
        }

        /* LEGACY: Already in /system.slice? If so, let's cut this
         * off. This is to support live upgrades from older systemd
         * versions where PID 1 was moved there. */
        if (m->running_as == SYSTEMD_SYSTEM) {
                e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
                if (!e)
                        e = endswith(m->cgroup_root, "/system");
                if (e)
                        *e = 0;
        }

        /* And make sure to store away the root value without trailing
         * slash, even for the root dir, so that we can easily prepend
         * it everywhere. */
        if (streq(m->cgroup_root, "/"))
                m->cgroup_root[0] = 0;

        /* 2. Show data */
        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
        if (r < 0) {
                log_error("Cannot find cgroup mount point: %s", strerror(-r));
                return r;
        }

        log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);

        /* 3. Install agent */
        if (m->running_as == SYSTEMD_SYSTEM) {
                r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
                if (r < 0)
                        log_warning("Failed to install release agent, ignoring: %s", strerror(-r));
                else if (r > 0)
                        log_debug("Installed release agent.");
                else
                        log_debug("Release agent already installed.");
        }

        /* 4. Make sure we are in the root cgroup */
        r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, 0);
        if (r < 0) {
                log_error("Failed to create root cgroup hierarchy: %s", strerror(-r));
                return r;
        }

        /* 5. And pin it, so that it cannot be unmounted */
        safe_close(m->pin_cgroupfs_fd);

        m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
        if (m->pin_cgroupfs_fd < 0) {
                log_error("Failed to open pin file: %m");
                return -errno;
        }

        /* 6. Figure out which controllers are supported */
        m->cgroup_supported = cg_mask_supported();

        /* 7. Always enable hierarchical support if it exists... */
        cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");

        return 0;
}
956
/* Release the manager's cgroup state on shutdown: optionally trim our
 * (non-removable) root group, close the cgroupfs pin fd and free the
 * stored root path. */
void manager_shutdown_cgroup(Manager *m, bool delete) {
        assert(m);

        /* We can't really delete the group, since we are in it. But
         * let's trim it. */
        if (delete && m->cgroup_root)
                cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);

        m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);

        free(m->cgroup_root);
        m->cgroup_root = NULL;
}
970
/* Map a cgroup path to the unit that owns it. If no unit owns the
 * exact path, walk up the path component by component so processes in
 * sub-cgroups are attributed to the owning unit. Returns NULL if no
 * ancestor matches either. */
Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
        char *p;
        Unit *u;

        assert(m);
        assert(cgroup);

        u = hashmap_get(m->cgroup_unit, cgroup);
        if (u)
                return u;

        /* Mutable stack copy so we can truncate it in place */
        p = strdupa(cgroup);
        for (;;) {
                char *e;

                e = strrchr(p, '/');
                if (e == p || !e)
                        return NULL;

                *e = 0;

                u = hashmap_get(m->cgroup_unit, p);
                if (u)
                        return u;
        }
}
997
998 Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
999 _cleanup_free_ char *cgroup = NULL;
1000 int r;
1001
1002 assert(m);
1003
1004 if (pid <= 1)
1005 return NULL;
1006
1007 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
1008 if (r < 0)
1009 return NULL;
1010
1011 return manager_get_unit_by_cgroup(m, cgroup);
1012 }
1013
/* Handle a cgroup-empty notification: map the cgroup to its unit and,
 * after re-verifying the whole subtree really is empty, invoke the
 * unit type's notify_cgroup_empty hook and queue it for GC. Unknown
 * cgroups are silently ignored. Always returns 0. */
int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
        Unit *u;
        int r;

        assert(m);
        assert(cgroup);

        u = manager_get_unit_by_cgroup(m, cgroup);
        if (u) {
                /* Notifications can race with new processes being
                 * spawned, so double-check emptiness recursively. */
                r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, true);
                if (r > 0) {
                        if (UNIT_VTABLE(u)->notify_cgroup_empty)
                                UNIT_VTABLE(u)->notify_cgroup_empty(u);

                        unit_add_to_gc_queue(u);
                }
        }

        return 0;
}
1034
/* String names for CGroupDevicePolicy, used when parsing/formatting
 * the DevicePolicy= unit-file setting. */
static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
        [CGROUP_AUTO] = "auto",
        [CGROUP_CLOSED] = "closed",
        [CGROUP_STRICT] = "strict",
};

DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);