/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/

/***
  This file is part of systemd.

  Copyright 2013 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/

#include <fcntl.h>
#include <fnmatch.h>

#include "path-util.h"
#include "special.h"
#include "cgroup-util.h"
#include "cgroup.h"

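/* All CFS quota handling below is expressed relative to this fixed
 * scheduling period: cpu_quota_per_sec_usec holds the quota per second of
 * wall-clock time, and is scaled down to this period when written out,
 * i.e. quota = per_sec * period / USEC_PER_SEC. With illustrative numbers,
 * a 20% quota (200ms per second) becomes cpu.cfs_quota_us=20000 against
 * cpu.cfs_period_us=100000. */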
#define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)

void cgroup_context_init(CGroupContext *c) {
        assert(c);

        /* Initialize everything to the kernel defaults, assuming the
         * structure is preinitialized to 0 */

        c->cpu_shares = (unsigned long) -1;
        c->startup_cpu_shares = (unsigned long) -1;
        c->memory_limit = (uint64_t) -1;
        c->blockio_weight = (unsigned long) -1;
        c->startup_blockio_weight = (unsigned long) -1;

        c->cpu_quota_per_sec_usec = USEC_INFINITY;
}

void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
        assert(c);
        assert(a);

        LIST_REMOVE(device_allow, c->device_allow, a);
        free(a->path);
        free(a);
}

void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
        assert(c);
        assert(w);

        LIST_REMOVE(device_weights, c->blockio_device_weights, w);
        free(w->path);
        free(w);
}

void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
        assert(c);
        assert(b);

        LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
        free(b->path);
        free(b);
}

void cgroup_context_done(CGroupContext *c) {
        assert(c);

        while (c->blockio_device_weights)
                cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);

        while (c->blockio_device_bandwidths)
                cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);

        while (c->device_allow)
                cgroup_context_free_device_allow(c, c->device_allow);
}

void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
        CGroupBlockIODeviceBandwidth *b;
        CGroupBlockIODeviceWeight *w;
        CGroupDeviceAllow *a;
        char u[FORMAT_TIMESPAN_MAX];

        assert(c);
        assert(f);

        prefix = strempty(prefix);

        fprintf(f,
                "%sCPUAccounting=%s\n"
                "%sBlockIOAccounting=%s\n"
                "%sMemoryAccounting=%s\n"
                "%sCPUShares=%lu\n"
                "%sStartupCPUShares=%lu\n"
                "%sCPUQuotaPerSecSec=%s\n"
                "%sBlockIOWeight=%lu\n"
                "%sStartupBlockIOWeight=%lu\n"
                "%sMemoryLimit=%" PRIu64 "\n"
                "%sDevicePolicy=%s\n"
                "%sDelegate=%s\n",
                prefix, yes_no(c->cpu_accounting),
                prefix, yes_no(c->blockio_accounting),
                prefix, yes_no(c->memory_accounting),
                prefix, c->cpu_shares,
                prefix, c->startup_cpu_shares,
                prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
                prefix, c->blockio_weight,
                prefix, c->startup_blockio_weight,
                prefix, c->memory_limit,
                prefix, cgroup_device_policy_to_string(c->device_policy),
                prefix, yes_no(c->delegate));

        LIST_FOREACH(device_allow, a, c->device_allow)
                fprintf(f,
                        "%sDeviceAllow=%s %s%s%s\n",
                        prefix,
                        a->path,
                        a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");

        LIST_FOREACH(device_weights, w, c->blockio_device_weights)
                fprintf(f,
                        "%sBlockIODeviceWeight=%s %lu\n",
                        prefix,
                        w->path,
                        w->weight);

        LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                char buf[FORMAT_BYTES_MAX];

                fprintf(f,
                        "%s%s=%s %s\n",
                        prefix,
                        b->read ? "BlockIOReadBandwidth" : "BlockIOWriteBandwidth",
                        b->path,
                        format_bytes(buf, sizeof(buf), b->bandwidth));
        }
}

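/* Resolve a path to the block device major:minor the blkio controller
 * expects: a block device node is used as-is, while for a regular file or
 * directory we take the device backing its file system, resolving a
 * partition to the whole disk, so that an illustrative path on /dev/sda2
 * (8:2) would be accounted against the disk itself (8:0). */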
static int lookup_blkio_device(const char *p, dev_t *dev) {
        struct stat st;
        int r;

        assert(p);
        assert(dev);

        r = stat(p, &st);
        if (r < 0)
                return log_warning_errno(errno, "Couldn't stat device %s: %m", p);

        if (S_ISBLK(st.st_mode))
                *dev = st.st_rdev;
        else if (major(st.st_dev) != 0) {
                /* If this is not a device node then find the block
                 * device this file is stored on */
                *dev = st.st_dev;

                /* If this is a partition, try to get the originating
                 * block device */
                block_get_whole_disk(*dev, dev);
        } else {
                log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
                return -ENODEV;
        }

        return 0;
}

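/* Append one device node to the cgroup's device whitelist. The kernel's
 * devices controller takes entries of the form
 * "<c|b> <major>:<minor> <r|w|m...>", so for example (illustrative)
 * whitelisting /dev/null read-write would write "c 1:3 rw" to
 * devices.allow. */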
static int whitelist_device(const char *path, const char *node, const char *acc) {
        char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
        struct stat st;
        int r;

        assert(path);
        assert(acc);

        if (stat(node, &st) < 0)
                return log_warning_errno(errno, "Couldn't stat device %s: %m", node);

        if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
                log_warning("%s is not a device.", node);
                return -ENODEV;
        }

        sprintf(buf,
                "%c %u:%u %s",
                S_ISCHR(st.st_mode) ? 'c' : 'b',
                major(st.st_rdev), minor(st.st_rdev),
                acc);

        r = cg_set_attribute("devices", path, "devices.allow", buf);
        if (r < 0)
                log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
                               "Failed to set devices.allow on %s: %m", path);

        return r;
}

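/* Whitelist all devices sharing a major number, resolved from a driver
 * name (or fnmatch() pattern) via /proc/devices, which lists majors in
 * sections, roughly:
 *
 *   Character devices:
 *     1 mem
 *   136 pts
 *
 *   Block devices:
 *     8 sd
 *
 * (illustrative excerpt; contents vary by kernel). A match on "pts" would
 * then write e.g. "c 136:* rw" to devices.allow. */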
static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
        _cleanup_fclose_ FILE *f = NULL;
        char line[LINE_MAX];
        bool good = false;
        int r;

        assert(path);
        assert(acc);
        assert(type == 'b' || type == 'c');

        f = fopen("/proc/devices", "re");
        if (!f)
                return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);

        FOREACH_LINE(line, f, goto fail) {
                char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
                unsigned maj;

                truncate_nl(line);

                if (type == 'c' && streq(line, "Character devices:")) {
                        good = true;
                        continue;
                }

                if (type == 'b' && streq(line, "Block devices:")) {
                        good = true;
                        continue;
                }

                if (isempty(line)) {
                        good = false;
                        continue;
                }

                if (!good)
                        continue;

                p = strstrip(line);

                w = strpbrk(p, WHITESPACE);
                if (!w)
                        continue;
                *w = 0;

                r = safe_atou(p, &maj);
                if (r < 0)
                        continue;
                if (maj <= 0)
                        continue;

                w++;
                w += strspn(w, WHITESPACE);

                if (fnmatch(name, w, 0) != 0)
                        continue;

                sprintf(buf,
                        "%c %u:* %s",
                        type,
                        maj,
                        acc);

                r = cg_set_attribute("devices", path, "devices.allow", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set devices.allow on %s: %m", path);
        }

        return 0;

fail:
        log_warning_errno(errno, "Failed to read /proc/devices: %m");
        return -errno;
}

void cgroup_context_apply(CGroupContext *c, CGroupControllerMask mask, const char *path, ManagerState state) {
        bool is_root;
        int r;

        assert(c);
        assert(path);

        if (mask == 0)
                return;

        /* Some cgroup attributes are not supported on the root cgroup,
         * hence silently ignore */
        is_root = isempty(path) || path_equal(path, "/");
        if (is_root)
                /* Make sure we don't try to display messages with an empty path. */
                path = "/";

        /* We generally ignore errors caused by read-only mounted
         * cgroup trees (assuming we are running in a container then),
         * and missing cgroups, i.e. EROFS and ENOENT. */

        if ((mask & CGROUP_CPU) && !is_root) {
                char buf[MAX(DECIMAL_STR_MAX(unsigned long), DECIMAL_STR_MAX(usec_t)) + 1];

                sprintf(buf, "%lu\n",
                        IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_cpu_shares != (unsigned long) -1 ? c->startup_cpu_shares :
                        c->cpu_shares != (unsigned long) -1 ? c->cpu_shares : 1024);
                r = cg_set_attribute("cpu", path, "cpu.shares", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set cpu.shares on %s: %m", path);

                sprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
                r = cg_set_attribute("cpu", path, "cpu.cfs_period_us", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set cpu.cfs_period_us on %s: %m", path);

                if (c->cpu_quota_per_sec_usec != USEC_INFINITY) {
                        sprintf(buf, USEC_FMT "\n", c->cpu_quota_per_sec_usec * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
                        r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", buf);
                } else
                        r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", "-1");
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set cpu.cfs_quota_us on %s: %m", path);
        }

        if (mask & CGROUP_BLKIO) {
                char buf[MAX3(DECIMAL_STR_MAX(unsigned long)+1,
                              DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(unsigned long)*1,
                              DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
                CGroupBlockIODeviceWeight *w;
                CGroupBlockIODeviceBandwidth *b;

                if (!is_root) {
                        sprintf(buf, "%lu\n", IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_blockio_weight != (unsigned long) -1 ? c->startup_blockio_weight :
                                c->blockio_weight != (unsigned long) -1 ? c->blockio_weight : 1000);
                        r = cg_set_attribute("blkio", path, "blkio.weight", buf);
                        if (r < 0)
                                log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                               "Failed to set blkio.weight on %s: %m", path);

                        /* FIXME: no way to reset this list */
                        LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
                                dev_t dev;

                                r = lookup_blkio_device(w->path, &dev);
                                if (r < 0)
                                        continue;

                                sprintf(buf, "%u:%u %lu", major(dev), minor(dev), w->weight);
                                r = cg_set_attribute("blkio", path, "blkio.weight_device", buf);
                                if (r < 0)
                                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                                       "Failed to set blkio.weight_device on %s: %m", path);
                        }
                }

                /* FIXME: no way to reset this list */
                LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                        const char *a;
                        dev_t dev;

                        r = lookup_blkio_device(b->path, &dev);
                        if (r < 0)
                                continue;

                        a = b->read ? "blkio.throttle.read_bps_device" : "blkio.throttle.write_bps_device";

                        sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), b->bandwidth);
                        r = cg_set_attribute("blkio", path, a, buf);
                        if (r < 0)
                                log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                               "Failed to set %s on %s: %m", a, path);
                }
        }

        if ((mask & CGROUP_MEMORY) && !is_root) {
                if (c->memory_limit != (uint64_t) -1) {
                        char buf[DECIMAL_STR_MAX(uint64_t) + 1];

                        sprintf(buf, "%" PRIu64 "\n", c->memory_limit);
                        r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
                } else
                        r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");

                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set memory.limit_in_bytes on %s: %m", path);
        }

        if ((mask & CGROUP_DEVICE) && !is_root) {
                CGroupDeviceAllow *a;

                /* Changing the devices list of a populated cgroup
                 * might result in EINVAL, hence ignore EINVAL
                 * here. */

                if (c->device_allow || c->device_policy != CGROUP_AUTO)
                        r = cg_set_attribute("devices", path, "devices.deny", "a");
                else
                        r = cg_set_attribute("devices", path, "devices.allow", "a");
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to reset devices.list on %s: %m", path);

                if (c->device_policy == CGROUP_CLOSED ||
                    (c->device_policy == CGROUP_AUTO && c->device_allow)) {
                        static const char auto_devices[] =
                                "/dev/null\0" "rwm\0"
                                "/dev/zero\0" "rwm\0"
                                "/dev/full\0" "rwm\0"
                                "/dev/random\0" "rwm\0"
                                "/dev/urandom\0" "rwm\0"
                                "/dev/tty\0" "rwm\0"
                                "/dev/pts/ptmx\0" "rw\0"; /* /dev/pts/ptmx may not be duplicated, but accessed */

                        const char *x, *y;

                        NULSTR_FOREACH_PAIR(x, y, auto_devices)
                                whitelist_device(path, x, y);

                        whitelist_major(path, "pts", 'c', "rw");
                        whitelist_major(path, "kdbus", 'c', "rw");
                        whitelist_major(path, "kdbus/*", 'c', "rw");
                }

                LIST_FOREACH(device_allow, a, c->device_allow) {
                        char acc[4];
                        unsigned k = 0;

                        if (a->r)
                                acc[k++] = 'r';
                        if (a->w)
                                acc[k++] = 'w';
                        if (a->m)
                                acc[k++] = 'm';

                        if (k == 0)
                                continue;

                        acc[k++] = 0;

                        if (startswith(a->path, "/dev/"))
                                whitelist_device(path, a->path, acc);
                        else if (startswith(a->path, "block-"))
                                whitelist_major(path, a->path + 6, 'b', acc);
                        else if (startswith(a->path, "char-"))
                                whitelist_major(path, a->path + 5, 'c', acc);
                        else
                                log_debug("Ignoring device %s while writing cgroup attribute.", a->path);
                }
        }
}

CGroupControllerMask cgroup_context_get_mask(CGroupContext *c) {
        CGroupControllerMask mask = 0;

        /* Figure out which controllers we need */

        if (c->cpu_accounting ||
            c->cpu_shares != (unsigned long) -1 ||
            c->startup_cpu_shares != (unsigned long) -1 ||
            c->cpu_quota_per_sec_usec != USEC_INFINITY)
                mask |= CGROUP_CPUACCT | CGROUP_CPU;

        if (c->blockio_accounting ||
            c->blockio_weight != (unsigned long) -1 ||
            c->startup_blockio_weight != (unsigned long) -1 ||
            c->blockio_device_weights ||
            c->blockio_device_bandwidths)
                mask |= CGROUP_BLKIO;

        if (c->memory_accounting ||
            c->memory_limit != (uint64_t) -1)
                mask |= CGROUP_MEMORY;

        if (c->device_allow ||
            c->device_policy != CGROUP_AUTO)
                mask |= CGROUP_DEVICE;

        return mask;
}

CGroupControllerMask unit_get_cgroup_mask(Unit *u) {
        CGroupContext *c;

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        /* If delegation is turned on, then turn on all controllers,
         * unless the process we fork into the cgroup is known to drop
         * privileges and hence shouldn't get access to the
         * controllers anyway. */

        if (c->delegate) {
                ExecContext *e;

                e = unit_get_exec_context(u);
                if (!e || exec_context_maintains_privileges(e))
                        return _CGROUP_CONTROLLER_MASK_ALL;
        }

        return cgroup_context_get_mask(c);
}

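/* The mask helpers below are related as follows: a unit's own mask is
 * whatever its cgroup context requires; its members mask is the union of
 * the masks of everything contained in it (only non-empty for slices);
 * its siblings mask is the members mask of its containing slice; and its
 * target mask is the union of all three, limited to what the kernel
 * actually supports. E.g. if a.service needs CPU and b.service needs
 * memory, and both sit in s.slice, the members mask of s.slice (and hence
 * the siblings mask of both services) covers CPU and memory. */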
CGroupControllerMask unit_get_members_mask(Unit *u) {
        assert(u);

        if (u->cgroup_members_mask_valid)
                return u->cgroup_members_mask;

        u->cgroup_members_mask = 0;

        if (u->type == UNIT_SLICE) {
                Unit *member;
                Iterator i;

                SET_FOREACH(member, u->dependencies[UNIT_BEFORE], i) {

                        if (member == u)
                                continue;

                        if (UNIT_DEREF(member->slice) != u)
                                continue;

                        u->cgroup_members_mask |=
                                unit_get_cgroup_mask(member) |
                                unit_get_members_mask(member);
                }
        }

        u->cgroup_members_mask_valid = true;
        return u->cgroup_members_mask;
}

CGroupControllerMask unit_get_siblings_mask(Unit *u) {
        assert(u);

        if (UNIT_ISSET(u->slice))
                return unit_get_members_mask(UNIT_DEREF(u->slice));

        return unit_get_cgroup_mask(u) | unit_get_members_mask(u);
}

CGroupControllerMask unit_get_target_mask(Unit *u) {
        CGroupControllerMask mask;

        mask = unit_get_cgroup_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
        mask &= u->manager->cgroup_supported;

        return mask;
}

/* Recurse from a unit up through its containing slices, propagating
 * mask bits upward. A unit is also a member of itself. */
void unit_update_cgroup_members_masks(Unit *u) {
        CGroupControllerMask m;
        bool more;

        assert(u);

        /* Calculate subtree mask */
        m = unit_get_cgroup_mask(u) | unit_get_members_mask(u);

        /* See if anything changed from the previous invocation. If
         * not, we're done. */
        if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
                return;

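        /* "more" is true when the new subtree mask only adds bits over
         * the previous one and drops none; only in that case can we
         * simply OR the change into the parent's members mask instead
         * of invalidating it. */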
        more =
                u->cgroup_subtree_mask_valid &&
                ((m & ~u->cgroup_subtree_mask) != 0) &&
                ((~m & u->cgroup_subtree_mask) == 0);

        u->cgroup_subtree_mask = m;
        u->cgroup_subtree_mask_valid = true;

        if (UNIT_ISSET(u->slice)) {
                Unit *s = UNIT_DEREF(u->slice);

                if (more)
                        /* There's more set now than before. We
                         * propagate the new mask to the parent's mask
                         * (not caring if it actually was valid or
                         * not). */

                        s->cgroup_members_mask |= m;

                else
                        /* There's less set now than before (or we
                         * don't know), we need to recalculate
                         * everything, so let's invalidate the
                         * parent's members mask */

                        s->cgroup_members_mask_valid = false;

                /* And now make sure that this change also hits our
                 * grandparents */
                unit_update_cgroup_members_masks(s);
        }
}

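/* Used as callback by cg_migrate_everywhere() and
 * cg_attach_many_everywhere() below: for a controller mask our own cgroup
 * isn't realized for, fall back to the cgroup of the nearest ancestor
 * slice that is realized for it. */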
static const char *migrate_callback(CGroupControllerMask mask, void *userdata) {
        Unit *u = userdata;

        assert(mask != 0);
        assert(u);

        while (u) {
                if (u->cgroup_path &&
                    u->cgroup_realized &&
                    (u->cgroup_realized_mask & mask) == mask)
                        return u->cgroup_path;

                u = UNIT_DEREF(u->slice);
        }

        return NULL;
}

static int unit_create_cgroups(Unit *u, CGroupControllerMask mask) {
        CGroupContext *c;
        int r;

        assert(u);

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        if (!u->cgroup_path) {
                _cleanup_free_ char *path = NULL;

                path = unit_default_cgroup_path(u);
                if (!path)
                        return log_oom();

                r = hashmap_put(u->manager->cgroup_unit, path, u);
                if (r < 0) {
                        log_error(r == -EEXIST ? "cgroup %s exists already: %s" : "hashmap_put failed for %s: %s", path, strerror(-r));
                        return r;
                }
                if (r > 0) {
                        u->cgroup_path = path;
                        path = NULL;
                }
        }

        /* First, create our own group */
        r = cg_create_everywhere(u->manager->cgroup_supported, mask, u->cgroup_path);
        if (r < 0)
                return log_error_errno(r, "Failed to create cgroup %s: %m", u->cgroup_path);

        /* Keep track that this is now realized */
        u->cgroup_realized = true;
        u->cgroup_realized_mask = mask;

        if (u->type != UNIT_SLICE && !c->delegate) {

                /* Then, possibly move things over, but not if
                 * subgroups may contain processes, which is the case
                 * for slice and delegation units. */
                r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
                if (r < 0)
                        log_warning_errno(r, "Failed to migrate cgroup to %s: %m", u->cgroup_path);
        }

        return 0;
}

int unit_attach_pids_to_cgroup(Unit *u) {
        int r;
        assert(u);

        r = unit_realize_cgroup(u);
        if (r < 0)
                return r;

        r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->pids, migrate_callback, u);
        if (r < 0)
                return r;

        return 0;
}

static bool unit_has_mask_realized(Unit *u, CGroupControllerMask mask) {
        assert(u);

        return u->cgroup_realized && u->cgroup_realized_mask == mask;
}

/* Check if necessary controllers and attributes for a unit are in place.
 *
 * If so, do nothing.
 * If not, create paths, move processes over, and set attributes.
 *
 * Returns 0 on success and < 0 on failure. */
static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
        CGroupControllerMask mask;
        int r;

        assert(u);

        if (u->in_cgroup_queue) {
                LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
                u->in_cgroup_queue = false;
        }

        mask = unit_get_target_mask(u);

        if (unit_has_mask_realized(u, mask))
                return 0;

        /* First, realize parents */
        if (UNIT_ISSET(u->slice)) {
                r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
                if (r < 0)
                        return r;
        }

        /* And then do the real work */
        r = unit_create_cgroups(u, mask);
        if (r < 0)
                return r;

        /* Finally, apply the necessary attributes. */
        cgroup_context_apply(unit_get_cgroup_context(u), mask, u->cgroup_path, state);

        return 0;
}

static void unit_add_to_cgroup_queue(Unit *u) {

        if (u->in_cgroup_queue)
                return;

        LIST_PREPEND(cgroup_queue, u->manager->cgroup_queue, u);
        u->in_cgroup_queue = true;
}

unsigned manager_dispatch_cgroup_queue(Manager *m) {
        ManagerState state;
        unsigned n = 0;
        Unit *i;
        int r;

        state = manager_state(m);

        while ((i = m->cgroup_queue)) {
                assert(i->in_cgroup_queue);

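                /* unit_realize_cgroup_now() unlinks the unit from the
                 * queue (see above), hence this loop makes progress
                 * even if realization fails. */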
                r = unit_realize_cgroup_now(i, state);
                if (r < 0)
                        log_warning_errno(r, "Failed to realize cgroups for queued unit %s: %m", i->id);

                n++;
        }

        return n;
}

static void unit_queue_siblings(Unit *u) {
        Unit *slice;

        /* This adds the siblings of the specified unit and the
         * siblings of all parent units to the cgroup queue. (But
         * neither the specified unit itself nor the parents.) */

        while ((slice = UNIT_DEREF(u->slice))) {
                Iterator i;
                Unit *m;

                SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
                        if (m == u)
                                continue;

                        /* Skip units that have a dependency on the slice
                         * but aren't actually in it. */
                        if (UNIT_DEREF(m->slice) != slice)
                                continue;

                        /* No point in doing cgroup application for units
                         * without active processes. */
                        if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
                                continue;

                        /* If the unit doesn't need any new controllers
                         * and has current ones realized, it doesn't need
                         * any changes. */
                        if (unit_has_mask_realized(m, unit_get_target_mask(m)))
                                continue;

                        unit_add_to_cgroup_queue(m);
                }

                u = slice;
        }
}

int unit_realize_cgroup(Unit *u) {
        CGroupContext *c;

        assert(u);

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        /* So, here's the deal: when realizing the cgroups for this
         * unit, we need to first create all parents, but there's more
         * actually: for the weight-based controllers we also need to
         * make sure that all our siblings (i.e. units that are in the
         * same slice as we are) have cgroups, too. Otherwise, things
         * would become very uneven as each of their processes would
         * get as many resources as our whole group together. This
         * call will synchronously create the parent cgroups, but will
         * defer work on the siblings to the next event loop
         * iteration. */

        /* Add all sibling slices to the cgroup queue. */
        unit_queue_siblings(u);

        /* And realize this one now (and apply the values) */
        return unit_realize_cgroup_now(u, manager_state(u->manager));
}

void unit_destroy_cgroup_if_empty(Unit *u) {
        int r;

        assert(u);

        if (!u->cgroup_path)
                return;

        r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !unit_has_name(u, SPECIAL_ROOT_SLICE));
        if (r < 0) {
                log_debug_errno(r, "Failed to destroy cgroup %s: %m", u->cgroup_path);
                return;
        }

        hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);

        free(u->cgroup_path);
        u->cgroup_path = NULL;
        u->cgroup_realized = false;
        u->cgroup_realized_mask = 0;
}

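/* Guess the main PID of a unit by scanning its cgroup: we only accept a
 * process that was re-parented to us, i.e. a daemonized child of the
 * manager, and only if it is unambiguous; with two or more candidates we
 * give up and return 0. */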
pid_t unit_search_main_pid(Unit *u) {
        _cleanup_fclose_ FILE *f = NULL;
        pid_t pid = 0, npid, mypid;

        assert(u);

        if (!u->cgroup_path)
                return 0;

        if (cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f) < 0)
                return 0;

        mypid = getpid();
        while (cg_read_pid(f, &npid) > 0) {
                pid_t ppid;

                if (npid == pid)
                        continue;

                /* Ignore processes that aren't our kids */
                if (get_parent_of_pid(npid, &ppid) >= 0 && ppid != mypid)
                        continue;

                if (pid != 0) {
                        /* Dang, there's more than one daemonized PID
                         * in this group, so we don't know what process
                         * is the main process. */
                        pid = 0;
                        break;
                }

                pid = npid;
        }

        return pid;
}

int manager_setup_cgroup(Manager *m) {
        _cleanup_free_ char *path = NULL;
        int r;

        assert(m);

        /* 1. Determine hierarchy */
        free(m->cgroup_root);
        m->cgroup_root = NULL;

        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
        if (r < 0)
                return log_error_errno(r, "Cannot determine cgroup we are running in: %m");

        /* LEGACY: Already in /system.slice? If so, let's cut this
         * off. This is to support live upgrades from older systemd
         * versions where PID 1 was moved there. */
        if (m->running_as == SYSTEMD_SYSTEM) {
                char *e;

                e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
                if (!e)
                        e = endswith(m->cgroup_root, "/system");
                if (e)
                        *e = 0;
        }

        /* And make sure to store away the root value without trailing
         * slash, even for the root dir, so that we can easily prepend
         * it everywhere. */
        if (streq(m->cgroup_root, "/"))
                m->cgroup_root[0] = 0;

        /* 2. Show data */
        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
        if (r < 0)
                return log_error_errno(r, "Cannot find cgroup mount point: %m");

        log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);
        if (!m->test_run) {

                /* 3. Install agent */
                if (m->running_as == SYSTEMD_SYSTEM) {
                        r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
                        if (r < 0)
                                log_warning_errno(r, "Failed to install release agent, ignoring: %m");
                        else if (r > 0)
                                log_debug("Installed release agent.");
                        else
                                log_debug("Release agent already installed.");
                }

                /* 4. Make sure we are in the root cgroup */
                r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, 0);
                if (r < 0)
                        return log_error_errno(r, "Failed to create root cgroup hierarchy: %m");

                /* 5. And pin it, so that it cannot be unmounted */
                safe_close(m->pin_cgroupfs_fd);

                m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
                if (m->pin_cgroupfs_fd < 0)
                        return log_error_errno(errno, "Failed to open pin file: %m");

                /* 6. Always enable hierarchical support if it exists... */
                cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
        }

        /* 7. Figure out which controllers are supported */
        m->cgroup_supported = cg_mask_supported();

        return 0;
}

void manager_shutdown_cgroup(Manager *m, bool delete) {
        assert(m);

        /* We can't really delete the group, since we are in it. But
         * let's trim it. */
        if (delete && m->cgroup_root)
                cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);

        m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);

        free(m->cgroup_root);
        m->cgroup_root = NULL;
}

Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
        char *p;
        Unit *u;

        assert(m);
        assert(cgroup);

        u = hashmap_get(m->cgroup_unit, cgroup);
        if (u)
                return u;

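        /* No exact match: walk up the cgroup path one component at a
         * time and return the innermost unit whose cgroup is a prefix
         * of ours; e.g. (illustrative path) for
         * ".../foo.service/control" we would try ".../foo.service"
         * next. */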
        p = strdupa(cgroup);
        for (;;) {
                char *e;

                e = strrchr(p, '/');
                if (e == p || !e)
                        return NULL;

                *e = 0;

                u = hashmap_get(m->cgroup_unit, p);
                if (u)
                        return u;
        }
}

Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
        _cleanup_free_ char *cgroup = NULL;
        int r;

        assert(m);

        if (pid <= 1)
                return NULL;

        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
        if (r < 0)
                return NULL;

        return manager_get_unit_by_cgroup(m, cgroup);
}

int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
        Unit *u;
        int r;

        assert(m);
        assert(cgroup);

        u = manager_get_unit_by_cgroup(m, cgroup);
        if (u) {
                r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, true);
                if (r > 0) {
                        if (UNIT_VTABLE(u)->notify_cgroup_empty)
                                UNIT_VTABLE(u)->notify_cgroup_empty(u);

                        unit_add_to_gc_queue(u);
                }
        }

        return 0;
}

static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
        [CGROUP_AUTO] = "auto",
        [CGROUP_CLOSED] = "closed",
        [CGROUP_STRICT] = "strict",
};

DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);