]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/cgroup.c
cgroup: downgrade log messages about non-existant cgroup attributes to LOG_DEBUG
[thirdparty/systemd.git] / src / core / cgroup.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2013 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <fcntl.h>
23 #include <fnmatch.h>
24
25 #include "path-util.h"
26 #include "special.h"
27 #include "cgroup-util.h"
28 #include "cgroup.h"
29
30 #define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
31
32 void cgroup_context_init(CGroupContext *c) {
33 assert(c);
34
35 /* Initialize everything to the kernel defaults, assuming the
36 * structure is preinitialized to 0 */
37
38 c->cpu_shares = (unsigned long) -1;
39 c->startup_cpu_shares = (unsigned long) -1;
40 c->memory_limit = (uint64_t) -1;
41 c->blockio_weight = (unsigned long) -1;
42 c->startup_blockio_weight = (unsigned long) -1;
43
44 c->cpu_quota_per_sec_usec = USEC_INFINITY;
45 }
46
/* Unlinks one DeviceAllow= entry from the context's list and frees it. */
void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
        assert(c);
        assert(a);

        LIST_REMOVE(device_allow, c->device_allow, a);
        free(a->path);
        free(a);
}
55
/* Unlinks one BlockIODeviceWeight= entry from the context's list and frees it. */
void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
        assert(c);
        assert(w);

        LIST_REMOVE(device_weights, c->blockio_device_weights, w);
        free(w->path);
        free(w);
}
64
/* Unlinks one BlockIO{Read,Write}Bandwidth= entry from the context's list and frees it. */
void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
        assert(c);
        assert(b);

        LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
        free(b->path);
        free(b);
}
73
/* Releases all dynamically allocated lists hung off a CGroupContext.
 * The context structure itself is not freed (it is embedded in the
 * unit's exec context). */
void cgroup_context_done(CGroupContext *c) {
        assert(c);

        while (c->blockio_device_weights)
                cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);

        while (c->blockio_device_bandwidths)
                cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);

        while (c->device_allow)
                cgroup_context_free_device_allow(c, c->device_allow);
}
86
87 void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
88 CGroupBlockIODeviceBandwidth *b;
89 CGroupBlockIODeviceWeight *w;
90 CGroupDeviceAllow *a;
91 char u[FORMAT_TIMESPAN_MAX];
92
93 assert(c);
94 assert(f);
95
96 prefix = strempty(prefix);
97
98 fprintf(f,
99 "%sCPUAccounting=%s\n"
100 "%sBlockIOAccounting=%s\n"
101 "%sMemoryAccounting=%s\n"
102 "%sCPUShares=%lu\n"
103 "%sStartupCPUShares=%lu\n"
104 "%sCPUQuotaPerSecSec=%s\n"
105 "%sBlockIOWeight=%lu\n"
106 "%sStartupBlockIOWeight=%lu\n"
107 "%sMemoryLimit=%" PRIu64 "\n"
108 "%sDevicePolicy=%s\n",
109 prefix, yes_no(c->cpu_accounting),
110 prefix, yes_no(c->blockio_accounting),
111 prefix, yes_no(c->memory_accounting),
112 prefix, c->cpu_shares,
113 prefix, c->startup_cpu_shares,
114 prefix, strna(format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1)),
115 prefix, c->blockio_weight,
116 prefix, c->startup_blockio_weight,
117 prefix, c->memory_limit,
118 prefix, cgroup_device_policy_to_string(c->device_policy));
119
120 LIST_FOREACH(device_allow, a, c->device_allow)
121 fprintf(f,
122 "%sDeviceAllow=%s %s%s%s\n",
123 prefix,
124 a->path,
125 a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
126
127 LIST_FOREACH(device_weights, w, c->blockio_device_weights)
128 fprintf(f,
129 "%sBlockIODeviceWeight=%s %lu",
130 prefix,
131 w->path,
132 w->weight);
133
134 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
135 char buf[FORMAT_BYTES_MAX];
136
137 fprintf(f,
138 "%s%s=%s %s\n",
139 prefix,
140 b->read ? "BlockIOReadBandwidth" : "BlockIOWriteBandwidth",
141 b->path,
142 format_bytes(buf, sizeof(buf), b->bandwidth));
143 }
144 }
145
/* Resolves a path to the block device (dev_t) that blkio attributes
 * shall apply to. Accepts either a block device node directly, or an
 * ordinary file/directory, in which case the block device backing its
 * file system is used. Returns 0 on success, negative errno on error. */
static int lookup_blkio_device(const char *p, dev_t *dev) {
        struct stat st;
        int r;

        assert(p);
        assert(dev);

        r = stat(p, &st);
        if (r < 0) {
                log_warning("Couldn't stat device %s: %m", p);
                return -errno;
        }

        if (S_ISBLK(st.st_mode))
                *dev = st.st_rdev;
        else if (major(st.st_dev) != 0) {
                /* If this is not a device node then find the block
                 * device this file is stored on */
                *dev = st.st_dev;

                /* If this is a partition, try to get the originating
                 * block device */
                block_get_whole_disk(*dev, dev); /* best-effort: on failure the partition device stays in place */
        } else {
                /* major 0 on st_dev means a virtual file system
                 * (tmpfs, proc, network fs, ...) with no local
                 * backing block device. */
                log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
                return -ENODEV;
        }

        return 0;
}
176
177 static int whitelist_device(const char *path, const char *node, const char *acc) {
178 char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
179 struct stat st;
180 int r;
181
182 assert(path);
183 assert(acc);
184
185 if (stat(node, &st) < 0) {
186 log_warning("Couldn't stat device %s", node);
187 return -errno;
188 }
189
190 if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
191 log_warning("%s is not a device.", node);
192 return -ENODEV;
193 }
194
195 sprintf(buf,
196 "%c %u:%u %s",
197 S_ISCHR(st.st_mode) ? 'c' : 'b',
198 major(st.st_rdev), minor(st.st_rdev),
199 acc);
200
201 r = cg_set_attribute("devices", path, "devices.allow", buf);
202 log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set devices.allow on %s: %s", path, strerror(-r));
203
204 return r;
205 }
206
207 static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
208 _cleanup_fclose_ FILE *f = NULL;
209 char line[LINE_MAX];
210 bool good = false;
211 int r;
212
213 assert(path);
214 assert(acc);
215 assert(type == 'b' || type == 'c');
216
217 f = fopen("/proc/devices", "re");
218 if (!f) {
219 log_warning("Cannot open /proc/devices to resolve %s (%c): %m", name, type);
220 return -errno;
221 }
222
223 FOREACH_LINE(line, f, goto fail) {
224 char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
225 unsigned maj;
226
227 truncate_nl(line);
228
229 if (type == 'c' && streq(line, "Character devices:")) {
230 good = true;
231 continue;
232 }
233
234 if (type == 'b' && streq(line, "Block devices:")) {
235 good = true;
236 continue;
237 }
238
239 if (isempty(line)) {
240 good = false;
241 continue;
242 }
243
244 if (!good)
245 continue;
246
247 p = strstrip(line);
248
249 w = strpbrk(p, WHITESPACE);
250 if (!w)
251 continue;
252 *w = 0;
253
254 r = safe_atou(p, &maj);
255 if (r < 0)
256 continue;
257 if (maj <= 0)
258 continue;
259
260 w++;
261 w += strspn(w, WHITESPACE);
262
263 if (fnmatch(name, w, 0) != 0)
264 continue;
265
266 sprintf(buf,
267 "%c %u:* %s",
268 type,
269 maj,
270 acc);
271
272 r = cg_set_attribute("devices", path, "devices.allow", buf);
273 log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set devices.allow on %s: %s", path, strerror(-r));
274 }
275
276 return 0;
277
278 fail:
279 log_warning("Failed to read /proc/devices: %m");
280 return -errno;
281 }
282
283 void cgroup_context_apply(CGroupContext *c, CGroupControllerMask mask, const char *path, ManagerState state) {
284 bool is_root;
285 int r;
286
287 assert(c);
288 assert(path);
289
290 if (mask == 0)
291 return;
292
293 /* Some cgroup attributes are not support on the root cgroup,
294 * hence silently ignore */
295 is_root = isempty(path) || path_equal(path, "/");
296
297 if ((mask & CGROUP_CPU) && !is_root) {
298 char buf[MAX(DECIMAL_STR_MAX(unsigned long), DECIMAL_STR_MAX(usec_t)) + 1];
299
300 sprintf(buf, "%lu\n",
301 state == MANAGER_STARTING && c->startup_cpu_shares != (unsigned long) -1 ? c->startup_cpu_shares :
302 c->cpu_shares != (unsigned long) -1 ? c->cpu_shares : 1024);
303 r = cg_set_attribute("cpu", path, "cpu.shares", buf);
304 log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set cpu.shares on %s: %s", path, strerror(-r));
305
306 sprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
307 r = cg_set_attribute("cpu", path, "cpu.cfs_period_us", buf);
308 log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set cpu.cfs_period_us on %s: %s", path, strerror(-r));
309
310 if (c->cpu_quota_per_sec_usec != USEC_INFINITY) {
311 sprintf(buf, USEC_FMT "\n", c->cpu_quota_per_sec_usec * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
312 r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", buf);
313 } else
314 r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", "-1");
315 log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set cpu.cfs_quota_us on %s: %s", path, strerror(-r));
316 }
317
318 if (mask & CGROUP_BLKIO) {
319 char buf[MAX3(DECIMAL_STR_MAX(unsigned long)+1,
320 DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(unsigned long)*1,
321 DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
322 CGroupBlockIODeviceWeight *w;
323 CGroupBlockIODeviceBandwidth *b;
324
325 if (!is_root) {
326 sprintf(buf, "%lu\n", state == MANAGER_STARTING && c->startup_blockio_weight != (unsigned long) -1 ? c->startup_blockio_weight :
327 c->blockio_weight != (unsigned long) -1 ? c->blockio_weight : 1000);
328 r = cg_set_attribute("blkio", path, "blkio.weight", buf);
329 log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set blkio.weight on %s: %s", path, strerror(-r));
330
331 /* FIXME: no way to reset this list */
332 LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
333 dev_t dev;
334
335 r = lookup_blkio_device(w->path, &dev);
336 if (r < 0)
337 continue;
338
339 sprintf(buf, "%u:%u %lu", major(dev), minor(dev), w->weight);
340 r = cg_set_attribute("blkio", path, "blkio.weight_device", buf);
341 log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set blkio.weight_device on %s: %s", path, strerror(-r));
342 }
343 }
344
345 /* FIXME: no way to reset this list */
346 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
347 const char *a;
348 dev_t dev;
349
350 r = lookup_blkio_device(b->path, &dev);
351 if (r < 0)
352 continue;
353
354 a = b->read ? "blkio.throttle.read_bps_device" : "blkio.throttle.write_bps_device";
355
356 sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), b->bandwidth);
357 r = cg_set_attribute("blkio", path, a, buf);
358 log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set %s on %s: %s", a, path, strerror(-r));
359 }
360 }
361
362 if (mask & CGROUP_MEMORY) {
363 if (c->memory_limit != (uint64_t) -1) {
364 char buf[DECIMAL_STR_MAX(uint64_t) + 1];
365
366 sprintf(buf, "%" PRIu64 "\n", c->memory_limit);
367 r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
368 } else
369 r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");
370
371 log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set memory.limit_in_bytes on %s: %s", path, strerror(-r));
372 }
373
374 if ((mask & CGROUP_DEVICE) && !is_root) {
375 CGroupDeviceAllow *a;
376
377 if (c->device_allow || c->device_policy != CGROUP_AUTO)
378 r = cg_set_attribute("devices", path, "devices.deny", "a");
379 else
380 r = cg_set_attribute("devices", path, "devices.allow", "a");
381 log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to reset devices.list on %s: %s", path, strerror(-r));
382
383 if (c->device_policy == CGROUP_CLOSED ||
384 (c->device_policy == CGROUP_AUTO && c->device_allow)) {
385 static const char auto_devices[] =
386 "/dev/null\0" "rwm\0"
387 "/dev/zero\0" "rwm\0"
388 "/dev/full\0" "rwm\0"
389 "/dev/random\0" "rwm\0"
390 "/dev/urandom\0" "rwm\0"
391 "/dev/tty\0" "rwm\0"
392 "/dev/pts/ptmx\0" "rw\0"; /* /dev/pts/ptmx may not be duplicated, but accessed */
393
394 const char *x, *y;
395
396 NULSTR_FOREACH_PAIR(x, y, auto_devices)
397 whitelist_device(path, x, y);
398
399 whitelist_major(path, "pts", 'c', "rw");
400 whitelist_major(path, "kdbus", 'c', "rw");
401 whitelist_major(path, "kdbus/*", 'c', "rw");
402 }
403
404 LIST_FOREACH(device_allow, a, c->device_allow) {
405 char acc[4];
406 unsigned k = 0;
407
408 if (a->r)
409 acc[k++] = 'r';
410 if (a->w)
411 acc[k++] = 'w';
412 if (a->m)
413 acc[k++] = 'm';
414
415 if (k == 0)
416 continue;
417
418 acc[k++] = 0;
419
420 if (startswith(a->path, "/dev/"))
421 whitelist_device(path, a->path, acc);
422 else if (startswith(a->path, "block-"))
423 whitelist_major(path, a->path + 6, 'b', acc);
424 else if (startswith(a->path, "char-"))
425 whitelist_major(path, a->path + 5, 'c', acc);
426 else
427 log_debug("Ignoring device %s while writing cgroup attribute.", a->path);
428 }
429 }
430 }
431
432 CGroupControllerMask cgroup_context_get_mask(CGroupContext *c) {
433 CGroupControllerMask mask = 0;
434
435 /* Figure out which controllers we need */
436
437 if (c->cpu_accounting ||
438 c->cpu_shares != (unsigned long) -1 ||
439 c->startup_cpu_shares != (unsigned long) -1 ||
440 c->cpu_quota_per_sec_usec != USEC_INFINITY)
441 mask |= CGROUP_CPUACCT | CGROUP_CPU;
442
443 if (c->blockio_accounting ||
444 c->blockio_weight != (unsigned long) -1 ||
445 c->startup_blockio_weight != (unsigned long) -1 ||
446 c->blockio_device_weights ||
447 c->blockio_device_bandwidths)
448 mask |= CGROUP_BLKIO;
449
450 if (c->memory_accounting ||
451 c->memory_limit != (uint64_t) -1)
452 mask |= CGROUP_MEMORY;
453
454 if (c->device_allow || c->device_policy != CGROUP_AUTO)
455 mask |= CGROUP_DEVICE;
456
457 return mask;
458 }
459
460 CGroupControllerMask unit_get_cgroup_mask(Unit *u) {
461 CGroupContext *c;
462
463 c = unit_get_cgroup_context(u);
464 if (!c)
465 return 0;
466
467 return cgroup_context_get_mask(c);
468 }
469
/* Returns the combined controller mask needed by all units contained
 * in this unit (only slices have members). The result is cached in
 * u->cgroup_members_mask until explicitly invalidated. */
CGroupControllerMask unit_get_members_mask(Unit *u) {
        assert(u);

        /* Fast path: reuse the cached value if it is still valid */
        if (u->cgroup_members_mask_valid)
                return u->cgroup_members_mask;

        u->cgroup_members_mask = 0;

        if (u->type == UNIT_SLICE) {
                Unit *member;
                Iterator i;

                /* Members of a slice are ordered Before= it, hence
                 * iterate the UNIT_BEFORE dependency set. */
                SET_FOREACH(member, u->dependencies[UNIT_BEFORE], i) {

                        if (member == u)
                                continue;

                        /* Skip units that merely have an ordering
                         * dependency but are not actually placed in
                         * this slice. */
                        if (UNIT_DEREF(member->slice) != u)
                                continue;

                        /* Accumulate the member's own needs plus,
                         * recursively, those of its members (for
                         * nested slices). */
                        u->cgroup_members_mask |=
                                unit_get_cgroup_mask(member) |
                                unit_get_members_mask(member);
                }
        }

        u->cgroup_members_mask_valid = true;
        return u->cgroup_members_mask;
}
499
500 CGroupControllerMask unit_get_siblings_mask(Unit *u) {
501 assert(u);
502
503 if (UNIT_ISSET(u->slice))
504 return unit_get_members_mask(UNIT_DEREF(u->slice));
505
506 return unit_get_cgroup_mask(u) | unit_get_members_mask(u);
507 }
508
509 CGroupControllerMask unit_get_target_mask(Unit *u) {
510 CGroupControllerMask mask;
511
512 mask = unit_get_cgroup_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
513 mask &= u->manager->cgroup_supported;
514
515 return mask;
516 }
517
/* Recurse from a unit up through its containing slices, propagating
 * mask bits upward. A unit is also member of itself. */
void unit_update_cgroup_members_masks(Unit *u) {
        CGroupControllerMask m;
        bool more;

        assert(u);

        /* Calculate subtree mask */
        m = unit_get_cgroup_mask(u) | unit_get_members_mask(u);

        /* See if anything changed from the previous invocation. If
         * not, we're done. */
        if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
                return;

        /* "more" is true only when bits were strictly added relative
         * to the previous valid mask and none were removed; only then
         * can the parent's cached mask be widened in place. */
        more =
                u->cgroup_subtree_mask_valid &&
                ((m & ~u->cgroup_subtree_mask) != 0) &&
                ((~m & u->cgroup_subtree_mask) == 0);

        u->cgroup_subtree_mask = m;
        u->cgroup_subtree_mask_valid = true;

        if (UNIT_ISSET(u->slice)) {
                Unit *s = UNIT_DEREF(u->slice);

                if (more)
                        /* There's more set now than before. We
                         * propagate the new mask to the parent's mask
                         * (not caring if it actually was valid or
                         * not). */

                        s->cgroup_members_mask |= m;

                else
                        /* There's less set now than before (or we
                         * don't know), we need to recalculate
                         * everything, so let's invalidate the
                         * parent's members mask */

                        s->cgroup_members_mask_valid = false;

                /* And now make sure that this change also hits our
                 * grandparents */
                unit_update_cgroup_members_masks(s);
        }
}
566
567 static const char *migrate_callback(CGroupControllerMask mask, void *userdata) {
568 Unit *u = userdata;
569
570 assert(mask != 0);
571 assert(u);
572
573 while (u) {
574 if (u->cgroup_path &&
575 u->cgroup_realized &&
576 (u->cgroup_realized_mask & mask) == mask)
577 return u->cgroup_path;
578
579 u = UNIT_DEREF(u->slice);
580 }
581
582 return NULL;
583 }
584
585 static int unit_create_cgroups(Unit *u, CGroupControllerMask mask) {
586 _cleanup_free_ char *path = NULL;
587 int r;
588
589 assert(u);
590
591 path = unit_default_cgroup_path(u);
592 if (!path)
593 return log_oom();
594
595 r = hashmap_put(u->manager->cgroup_unit, path, u);
596 if (r < 0) {
597 log_error(r == -EEXIST ? "cgroup %s exists already: %s" : "hashmap_put failed for %s: %s", path, strerror(-r));
598 return r;
599 }
600 if (r > 0) {
601 u->cgroup_path = path;
602 path = NULL;
603 }
604
605 /* First, create our own group */
606 r = cg_create_everywhere(u->manager->cgroup_supported, mask, u->cgroup_path);
607 if (r < 0) {
608 log_error("Failed to create cgroup %s: %s", u->cgroup_path, strerror(-r));
609 return r;
610 }
611
612 /* Keep track that this is now realized */
613 u->cgroup_realized = true;
614 u->cgroup_realized_mask = mask;
615
616 /* Then, possibly move things over */
617 r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
618 if (r < 0)
619 log_warning("Failed to migrate cgroup from to %s: %s", u->cgroup_path, strerror(-r));
620
621 return 0;
622 }
623
624 static bool unit_has_mask_realized(Unit *u, CGroupControllerMask mask) {
625 assert(u);
626
627 return u->cgroup_realized && u->cgroup_realized_mask == mask;
628 }
629
/* Check if necessary controllers and attributes for a unit are in place.
 *
 * If so, do nothing.
 * If not, create paths, move processes over, and set attributes.
 *
 * Returns 0 on success and < 0 on failure. */
static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
        CGroupControllerMask mask;
        int r;

        assert(u);

        /* Drop the unit from the pending queue, since we are handling
         * it now — manager_dispatch_cgroup_queue() relies on this. */
        if (u->in_cgroup_queue) {
                LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
                u->in_cgroup_queue = false;
        }

        mask = unit_get_target_mask(u);

        /* Already realized with exactly the needed controllers? */
        if (unit_has_mask_realized(u, mask))
                return 0;

        /* First, realize parents */
        if (UNIT_ISSET(u->slice)) {
                r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
                if (r < 0)
                        return r;
        }

        /* And then do the real work */
        r = unit_create_cgroups(u, mask);
        if (r < 0)
                return r;

        /* Finally, apply the necessary attributes. */
        cgroup_context_apply(unit_get_cgroup_context(u), mask, u->cgroup_path, state);

        return 0;
}
669
670 static void unit_add_to_cgroup_queue(Unit *u) {
671
672 if (u->in_cgroup_queue)
673 return;
674
675 LIST_PREPEND(cgroup_queue, u->manager->cgroup_queue, u);
676 u->in_cgroup_queue = true;
677 }
678
/* Realizes the cgroups of all units currently sitting in the
 * manager's cgroup queue. Returns the number of units processed. */
unsigned manager_dispatch_cgroup_queue(Manager *m) {
        ManagerState state;
        unsigned n = 0;
        Unit *i;
        int r;

        state = manager_state(m);

        while ((i = m->cgroup_queue)) {
                assert(i->in_cgroup_queue);

                /* unit_realize_cgroup_now() removes the unit from the
                 * queue as its first step, so this loop terminates. */
                r = unit_realize_cgroup_now(i, state);
                if (r < 0)
                        log_warning("Failed to realize cgroups for queued unit %s: %s", i->id, strerror(-r));

                n++;
        }

        return n;
}
699
static void unit_queue_siblings(Unit *u) {
        Unit *slice;

        /* This adds the siblings of the specified unit and the
         * siblings of all parent units to the cgroup queue. (But
         * neither the specified unit itself nor the parents.) */

        while ((slice = UNIT_DEREF(u->slice))) {
                Iterator i;
                Unit *m;

                /* Siblings are the members of our slice, i.e. units
                 * ordered Before= it. */
                SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
                        if (m == u)
                                continue;

                        /* Skip units that have a dependency on the slice
                         * but aren't actually in it. */
                        if (UNIT_DEREF(m->slice) != slice)
                                continue;

                        /* No point in doing cgroup application for units
                         * without active processes. */
                        if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
                                continue;

                        /* If the unit doesn't need any new controllers
                         * and has current ones realized, it doesn't need
                         * any changes. */
                        if (unit_has_mask_realized(m, unit_get_target_mask(m)))
                                continue;

                        unit_add_to_cgroup_queue(m);
                }

                /* Move up one level and queue that slice's siblings too */
                u = slice;
        }
}
737
/* Public entry point for realizing a unit's cgroup: synchronously
 * creates the unit's (and its parents') cgroups and applies the
 * configured attributes, while deferring sibling realization to a
 * later event loop iteration. Returns 0 on success or if the unit
 * has no cgroup context, < 0 on failure. */
int unit_realize_cgroup(Unit *u) {
        CGroupContext *c;

        assert(u);

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        /* So, here's the deal: when realizing the cgroups for this
         * unit, we need to first create all parents, but there's more
         * actually: for the weight-based controllers we also need to
         * make sure that all our siblings (i.e. units that are in the
         * same slice as we are) have cgroups, too. Otherwise, things
         * would become very uneven as each of their processes would
         * get as much resources as all our group together. This call
         * will synchronously create the parent cgroups, but will
         * defer work on the siblings to the next event loop
         * iteration. */

        /* Add all sibling slices to the cgroup queue. */
        unit_queue_siblings(u);

        /* And realize this one now (and apply the values) */
        return unit_realize_cgroup_now(u, manager_state(u->manager));
}
764
/* Trims the unit's cgroup in all hierarchies (deleting it unless this
 * is the root slice), unregisters it from the manager's map, and
 * resets the unit's realization state. */
void unit_destroy_cgroup(Unit *u) {
        int r;

        assert(u);

        if (!u->cgroup_path)
                return;

        r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !unit_has_name(u, SPECIAL_ROOT_SLICE));
        if (r < 0)
                log_debug("Failed to destroy cgroup %s: %s", u->cgroup_path, strerror(-r));

        hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);

        free(u->cgroup_path);
        u->cgroup_path = NULL;
        u->cgroup_realized = false;
        u->cgroup_realized_mask = 0;

}
785
/* Scans the unit's cgroup for a process that qualifies as its "main"
 * process: a direct child of the manager that is the only such
 * process in the group. Returns its PID, or 0 if there is none or
 * the choice is ambiguous. */
pid_t unit_search_main_pid(Unit *u) {
        _cleanup_fclose_ FILE *f = NULL;
        pid_t pid = 0, npid, mypid;

        assert(u);

        if (!u->cgroup_path)
                return 0;

        if (cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f) < 0)
                return 0;

        mypid = getpid();
        while (cg_read_pid(f, &npid) > 0)  {
                pid_t ppid;

                /* Skip repeated entries for the current candidate */
                if (npid == pid)
                        continue;

                /* Ignore processes that aren't our kids */
                if (get_parent_of_pid(npid, &ppid) >= 0 && ppid != mypid)
                        continue;

                if (pid != 0) {
                        /* Dang, there's more than one daemonized PID
                        in this group, so we don't know what process
                        is the main process. */
                        pid = 0;
                        break;
                }

                pid = npid;
        }

        return pid;
}
822
/* One-time cgroup setup for the manager: determines the root cgroup
 * we run in, installs the release agent (system instance only), moves
 * us to the root cgroup, pins the cgroupfs mount, and probes which
 * controllers the kernel supports. Returns 0 on success, < 0 on
 * fatal failure. */
int manager_setup_cgroup(Manager *m) {
        _cleanup_free_ char *path = NULL;
        int r;

        assert(m);

        /* 1. Determine hierarchy */
        free(m->cgroup_root);
        m->cgroup_root = NULL;

        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
        if (r < 0) {
                log_error("Cannot determine cgroup we are running in: %s", strerror(-r));
                return r;
        }

        /* LEGACY: Already in /system.slice? If so, let's cut this
         * off. This is to support live upgrades from older systemd
         * versions where PID 1 was moved there. */
        if (m->running_as == SYSTEMD_SYSTEM) {
                char *e;

                e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
                if (!e)
                        e = endswith(m->cgroup_root, "/system");
                if (e)
                        *e = 0;
        }

        /* And make sure to store away the root value without trailing
         * slash, even for the root dir, so that we can easily prepend
         * it everywhere. */
        if (streq(m->cgroup_root, "/"))
                m->cgroup_root[0] = 0;

        /* 2. Show data */
        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
        if (r < 0) {
                log_error("Cannot find cgroup mount point: %s", strerror(-r));
                return r;
        }

        log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);
        if (!m->test_run) {

                /* 3. Install agent */
                if (m->running_as == SYSTEMD_SYSTEM) {
                        r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
                        if (r < 0)
                                log_warning("Failed to install release agent, ignoring: %s", strerror(-r));
                        else if (r > 0)
                                log_debug("Installed release agent.");
                        else
                                log_debug("Release agent already installed.");
                }

                /* 4. Make sure we are in the root cgroup */
                r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, 0);
                if (r < 0) {
                        log_error("Failed to create root cgroup hierarchy: %s", strerror(-r));
                        return r;
                }

                /* 5. And pin it, so that it cannot be unmounted */
                safe_close(m->pin_cgroupfs_fd);

                m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
                if (m->pin_cgroupfs_fd < 0) {
                        log_error("Failed to open pin file: %m");
                        return -errno;
                }

                /* 6. Always enable hierarchial support if it exists... */
                /* best-effort: older kernels may lack this attribute */
                cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
        }

        /* 7. Figure out which controllers are supported */
        m->cgroup_supported = cg_mask_supported();

        return 0;
}
904
/* Counterpart to manager_setup_cgroup(): optionally trims our root
 * cgroup, releases the cgroupfs pin fd, and frees the stored root
 * path. */
void manager_shutdown_cgroup(Manager *m, bool delete) {
        assert(m);

        /* We can't really delete the group, since we are in it. But
         * let's trim it. */
        if (delete && m->cgroup_root)
                cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);

        m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);

        free(m->cgroup_root);
        m->cgroup_root = NULL;
}
918
/* Maps a cgroup path to the unit it belongs to. If no unit is
 * registered for the exact path, walks up the cgroup hierarchy until
 * a registered prefix is found (processes may live in sub-cgroups of
 * a unit's own group). Returns NULL if nothing matches. */
Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
        char *p;
        Unit *u;

        assert(m);
        assert(cgroup);

        u = hashmap_get(m->cgroup_unit, cgroup);
        if (u)
                return u;

        /* Work on a stack copy we can truncate in place */
        p = strdupa(cgroup);
        for (;;) {
                char *e;

                e = strrchr(p, '/');
                if (e == p || !e)
                        return NULL; /* reached the root without a match */

                *e = 0;

                u = hashmap_get(m->cgroup_unit, p);
                if (u)
                        return u;
        }
}
945
/* Maps a PID to the unit whose cgroup it runs in. Returns NULL for
 * invalid PIDs, for PID 1 (the manager itself), and on lookup
 * failure. */
Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
        _cleanup_free_ char *cgroup = NULL;
        int r;

        assert(m);

        if (pid <= 1)
                return NULL;

        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
        if (r < 0)
                return NULL;

        return manager_get_unit_by_cgroup(m, cgroup);
}
961
/* Called when the kernel (via the release agent or D-Bus) reports a
 * cgroup as empty: verifies the unit's cgroup really is empty
 * recursively, and if so lets the unit type react and queues the
 * unit for garbage collection. Always returns 0. */
int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
        Unit *u;
        int r;

        assert(m);
        assert(cgroup);

        u = manager_get_unit_by_cgroup(m, cgroup);
        if (u) {
                /* Re-check: the notification may be stale and new
                 * processes may have been spawned meanwhile. */
                r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, true);
                if (r > 0) {
                        if (UNIT_VTABLE(u)->notify_cgroup_empty)
                                UNIT_VTABLE(u)->notify_cgroup_empty(u);

                        unit_add_to_gc_queue(u);
                }
        }

        return 0;
}
982
/* Mapping between CGroupDevicePolicy enum values and their
 * DevicePolicy= configuration string representations; the lookup
 * functions cgroup_device_policy_{to,from}_string() are generated by
 * the macro below. */
static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
        [CGROUP_AUTO] = "auto",
        [CGROUP_CLOSED] = "closed",
        [CGROUP_STRICT] = "strict",
};

DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);