src/core/cgroup.c

   1 /***
   2   This file is part of systemd.
   3
   4   Copyright 2013 Lennart Poettering
   5
   6   systemd is free software; you can redistribute it and/or modify it
   7   under the terms of the GNU Lesser General Public License as published by
   8   the Free Software Foundation; either version 2.1 of the License, or
   9   (at your option) any later version.
  10
  11   systemd is distributed in the hope that it will be useful, but
  12   WITHOUT ANY WARRANTY; without even the implied warranty of
  13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14   Lesser General Public License for more details.
  15
  16   You should have received a copy of the GNU Lesser General Public License
  17   along with systemd; If not, see <http://www.gnu.org/licenses/>.
  18 ***/
  19
  20 #include <fcntl.h>
  21 #include <fnmatch.h>
  22
  23 #include "alloc-util.h"
  24 #include "cgroup-util.h"
  25 #include "cgroup.h"
  26 #include "fd-util.h"
  27 #include "fileio.h"
  28 #include "fs-util.h"
  29 #include "parse-util.h"
  30 #include "path-util.h"
  31 #include "process-util.h"
  32 #include "special.h"
  33 #include "string-table.h"
  34 #include "string-util.h"
  35 #include "stdio-util.h"
  36
  37 #define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
  38
  39 void cgroup_context_init(CGroupContext *c) {
  40         assert(c);
  41
  42         /* Initialize everything to the kernel defaults, assuming the
  43          * structure is preinitialized to 0 */
  44
  45         c->cpu_shares = CGROUP_CPU_SHARES_INVALID;
  46         c->startup_cpu_shares = CGROUP_CPU_SHARES_INVALID;
  47         c->cpu_quota_per_sec_usec = USEC_INFINITY;
  48
  49         c->memory_limit = (uint64_t) -1;
  50
  51         c->io_weight = CGROUP_WEIGHT_INVALID;
  52         c->startup_io_weight = CGROUP_WEIGHT_INVALID;
  53
  54         c->blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
  55         c->startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
  56
  57         c->tasks_max = (uint64_t) -1;
  58 }
  59
  60 void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
  61         assert(c);
  62         assert(a);
  63
  64         LIST_REMOVE(device_allow, c->device_allow, a);
  65         free(a->path);
  66         free(a);
  67 }
  68
  69 void cgroup_context_free_io_device_weight(CGroupContext *c, CGroupIODeviceWeight *w) {
  70         assert(c);
  71         assert(w);
  72
  73         LIST_REMOVE(device_weights, c->io_device_weights, w);
  74         free(w->path);
  75         free(w);
  76 }
  77
  78 void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit *l) {
  79         assert(c);
  80         assert(l);
  81
  82         LIST_REMOVE(device_limits, c->io_device_limits, l);
  83         free(l->path);
  84         free(l);
  85 }
  86
  87 void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
  88         assert(c);
  89         assert(w);
  90
  91         LIST_REMOVE(device_weights, c->blockio_device_weights, w);
  92         free(w->path);
  93         free(w);
  94 }
  95
  96 void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
  97         assert(c);
  98         assert(b);
  99
 100         LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
 101         free(b->path);
 102         free(b);
 103 }
 104
 105 void cgroup_context_done(CGroupContext *c) {
 106         assert(c);
 107
 108         while (c->io_device_weights)
 109                 cgroup_context_free_io_device_weight(c, c->io_device_weights);
 110
 111         while (c->io_device_limits)
 112                 cgroup_context_free_io_device_limit(c, c->io_device_limits);
 113
 114         while (c->blockio_device_weights)
 115                 cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
 116
 117         while (c->blockio_device_bandwidths)
 118                 cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);
 119
 120         while (c->device_allow)
 121                 cgroup_context_free_device_allow(c, c->device_allow);
 122 }
 123
 124 void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
 125         CGroupIODeviceLimit *il;
 126         CGroupIODeviceWeight *iw;
 127         CGroupBlockIODeviceBandwidth *b;
 128         CGroupBlockIODeviceWeight *w;
 129         CGroupDeviceAllow *a;
 130         char u[FORMAT_TIMESPAN_MAX];
 131
 132         assert(c);
 133         assert(f);
 134
 135         prefix = strempty(prefix);
 136
 137         fprintf(f,
 138                 "%sCPUAccounting=%s\n"
 139                 "%sIOAccounting=%s\n"
 140                 "%sBlockIOAccounting=%s\n"
 141                 "%sMemoryAccounting=%s\n"
 142                 "%sTasksAccounting=%s\n"
 143                 "%sCPUShares=%" PRIu64 "\n"
 144                 "%sStartupCPUShares=%" PRIu64 "\n"
 145                 "%sCPUQuotaPerSecSec=%s\n"
 146                 "%sIOWeight=%" PRIu64 "\n"
 147                 "%sStartupIOWeight=%" PRIu64 "\n"
 148                 "%sBlockIOWeight=%" PRIu64 "\n"
 149                 "%sStartupBlockIOWeight=%" PRIu64 "\n"
 150                 "%sMemoryLimit=%" PRIu64 "\n"
 151                 "%sTasksMax=%" PRIu64 "\n"
 152                 "%sDevicePolicy=%s\n"
 153                 "%sDelegate=%s\n",
 154                 prefix, yes_no(c->cpu_accounting),
 155                 prefix, yes_no(c->io_accounting),
 156                 prefix, yes_no(c->blockio_accounting),
 157                 prefix, yes_no(c->memory_accounting),
 158                 prefix, yes_no(c->tasks_accounting),
 159                 prefix, c->cpu_shares,
 160                 prefix, c->startup_cpu_shares,
 161                 prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
 162                 prefix, c->io_weight,
 163                 prefix, c->startup_io_weight,
 164                 prefix, c->blockio_weight,
 165                 prefix, c->startup_blockio_weight,
 166                 prefix, c->memory_limit,
 167                 prefix, c->tasks_max,
 168                 prefix, cgroup_device_policy_to_string(c->device_policy),
 169                 prefix, yes_no(c->delegate));
 170
 171         LIST_FOREACH(device_allow, a, c->device_allow)
 172                 fprintf(f,
 173                         "%sDeviceAllow=%s %s%s%s\n",
 174                         prefix,
 175                         a->path,
 176                         a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
 177
 178         LIST_FOREACH(device_weights, iw, c->io_device_weights)
 179                 fprintf(f,
 180                         "%sIODeviceWeight=%s %" PRIu64,
 181                         prefix,
 182                         iw->path,
 183                         iw->weight);
 184
 185         LIST_FOREACH(device_limits, il, c->io_device_limits) {
 186                 char buf[FORMAT_BYTES_MAX];
 187                 CGroupIOLimitType type;
 188
 189                 for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
 190                         if (il->limits[type] != cgroup_io_limit_defaults[type])
 191                                 fprintf(f,
 192                                         "%s%s=%s %s\n",
 193                                         prefix,
 194                                         cgroup_io_limit_type_to_string(type),
 195                                         il->path,
 196                                         format_bytes(buf, sizeof(buf), il->limits[type]));
 197         }
 198
 199         LIST_FOREACH(device_weights, w, c->blockio_device_weights)
 200                 fprintf(f,
 201                         "%sBlockIODeviceWeight=%s %" PRIu64,
 202                         prefix,
 203                         w->path,
 204                         w->weight);
 205
 206         LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
 207                 char buf[FORMAT_BYTES_MAX];
 208
 209                 fprintf(f,
 210                         "%s%s=%s %s\n",
 211                         prefix,
 212                         b->read ? "BlockIOReadBandwidth" : "BlockIOWriteBandwidth",
 213                         b->path,
 214                         format_bytes(buf, sizeof(buf), b->bandwidth));
 215         }
 216 }
 217
 218 static int lookup_block_device(const char *p, dev_t *dev) {
 219         struct stat st;
 220         int r;
 221
 222         assert(p);
 223         assert(dev);
 224
 225         r = stat(p, &st);
 226         if (r < 0)
 227                 return log_warning_errno(errno, "Couldn't stat device %s: %m", p);
 228
 229         if (S_ISBLK(st.st_mode))
 230                 *dev = st.st_rdev;
 231         else if (major(st.st_dev) != 0) {
 232                 /* If this is not a device node then find the block
 233                  * device this file is stored on */
 234                 *dev = st.st_dev;
 235
 236                 /* If this is a partition, try to get the originating
 237                  * block device */
 238                 block_get_whole_disk(*dev, dev);
 239         } else {
 240                 log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
 241                 return -ENODEV;
 242         }
 243
 244         return 0;
 245 }
 246
 247 static int whitelist_device(const char *path, const char *node, const char *acc) {
 248         char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
 249         struct stat st;
 250         int r;
 251
 252         assert(path);
 253         assert(acc);
 254
 255         if (stat(node, &st) < 0) {
 256                 log_warning("Couldn't stat device %s", node);
 257                 return -errno;
 258         }
 259
 260         if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
 261                 log_warning("%s is not a device.", node);
 262                 return -ENODEV;
 263         }
 264
 265         sprintf(buf,
 266                 "%c %u:%u %s",
 267                 S_ISCHR(st.st_mode) ? 'c' : 'b',
 268                 major(st.st_rdev), minor(st.st_rdev),
 269                 acc);
 270
 271         r = cg_set_attribute("devices", path, "devices.allow", buf);
 272         if (r < 0)
 273                 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 274                                "Failed to set devices.allow on %s: %m", path);
 275
 276         return r;
 277 }
 278
 279 static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
 280         _cleanup_fclose_ FILE *f = NULL;
 281         char line[LINE_MAX];
 282         bool good = false;
 283         int r;
 284
 285         assert(path);
 286         assert(acc);
 287         assert(type == 'b' || type == 'c');
 288
 289         f = fopen("/proc/devices", "re");
 290         if (!f)
 291                 return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);
 292
 293         FOREACH_LINE(line, f, goto fail) {
 294                 char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
 295                 unsigned maj;
 296
 297                 truncate_nl(line);
 298
 299                 if (type == 'c' && streq(line, "Character devices:")) {
 300                         good = true;
 301                         continue;
 302                 }
 303
 304                 if (type == 'b' && streq(line, "Block devices:")) {
 305                         good = true;
 306                         continue;
 307                 }
 308
 309                 if (isempty(line)) {
 310                         good = false;
 311                         continue;
 312                 }
 313
 314                 if (!good)
 315                         continue;
 316
 317                 p = strstrip(line);
 318
 319                 w = strpbrk(p, WHITESPACE);
 320                 if (!w)
 321                         continue;
 322                 *w = 0;
 323
 324                 r = safe_atou(p, &maj);
 325                 if (r < 0)
 326                         continue;
 327                 if (maj <= 0)
 328                         continue;
 329
 330                 w++;
 331                 w += strspn(w, WHITESPACE);
 332
 333                 if (fnmatch(name, w, 0) != 0)
 334                         continue;
 335
 336                 sprintf(buf,
 337                         "%c %u:* %s",
 338                         type,
 339                         maj,
 340                         acc);
 341
 342                 r = cg_set_attribute("devices", path, "devices.allow", buf);
 343                 if (r < 0)
 344                         log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 345                                        "Failed to set devices.allow on %s: %m", path);
 346         }
 347
 348         return 0;
 349
 350 fail:
 351         log_warning_errno(errno, "Failed to read /proc/devices: %m");
 352         return -errno;
 353 }
 354
 355 void cgroup_context_apply(CGroupContext *c, CGroupMask mask, const char *path, ManagerState state) {
 356         bool is_root;
 357         int r;
 358
 359         assert(c);
 360         assert(path);
 361
 362         if (mask == 0)
 363                 return;
 364
 365         /* Some cgroup attributes are not supported on the root cgroup,
 366          * hence silently ignore */
 367         is_root = isempty(path) || path_equal(path, "/");
 368         if (is_root)
 369                 /* Make sure we don't try to display messages with an empty path. */
 370                 path = "/";
 371
 372         /* We generally ignore errors caused by read-only mounted
 373          * cgroup trees (assuming we are running in a container then),
 374          * and missing cgroups, i.e. EROFS and ENOENT. */
 375
 376         if ((mask & CGROUP_MASK_CPU) && !is_root) {
 377                 char buf[MAX(DECIMAL_STR_MAX(uint64_t), DECIMAL_STR_MAX(usec_t)) + 1];
 378
 379                 sprintf(buf, "%" PRIu64 "\n",
 380                         IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID ? c->startup_cpu_shares :
 381                         c->cpu_shares != CGROUP_CPU_SHARES_INVALID ? c->cpu_shares : CGROUP_CPU_SHARES_DEFAULT);
 382                 r = cg_set_attribute("cpu", path, "cpu.shares", buf);
 383                 if (r < 0)
 384                         log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 385                                        "Failed to set cpu.shares on %s: %m", path);
 386
 387                 sprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
 388                 r = cg_set_attribute("cpu", path, "cpu.cfs_period_us", buf);
 389                 if (r < 0)
 390                         log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 391                                        "Failed to set cpu.cfs_period_us on %s: %m", path);
 392
 393                 if (c->cpu_quota_per_sec_usec != USEC_INFINITY) {
 394                         sprintf(buf, USEC_FMT "\n", c->cpu_quota_per_sec_usec * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
 395                         r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", buf);
 396                 } else
 397                         r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", "-1");
 398                 if (r < 0)
 399                         log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 400                                        "Failed to set cpu.cfs_quota_us on %s: %m", path);
 401         }
 402
 403         if (mask & CGROUP_MASK_IO) {
 404                 CGroupIODeviceWeight *w;
 405                 CGroupIODeviceLimit *l, *next;
 406
 407                 if (!is_root) {
 408                         char buf[MAX(8+DECIMAL_STR_MAX(uint64_t)+1,
 409                                      DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
 410                         uint64_t weight = CGROUP_WEIGHT_DEFAULT;
 411
 412                         if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
 413                             c->startup_io_weight != CGROUP_WEIGHT_INVALID)
 414                                 weight = c->startup_io_weight;
 415                         else if (c->io_weight != CGROUP_WEIGHT_INVALID)
 416                                 weight = c->io_weight;
 417
 418                         xsprintf(buf, "default %" PRIu64 "\n", weight);
 419                         r = cg_set_attribute("io", path, "io.weight", buf);
 420                         if (r < 0)
 421                                 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 422                                                "Failed to set io.weight on %s: %m", path);
 423
 424                         /* FIXME: no way to reset this list */
 425                         LIST_FOREACH(device_weights, w, c->io_device_weights) {
 426                                 dev_t dev;
 427
 428                                 r = lookup_block_device(w->path, &dev);
 429                                 if (r < 0)
 430                                         continue;
 431
 432                                 xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), w->weight);
 433                                 r = cg_set_attribute("io", path, "io.weight", buf);
 434                                 if (r < 0)
 435                                         log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 436                                                        "Failed to set io.weight on %s: %m", path);
 437                         }
 438                 }
 439
 440                 LIST_FOREACH_SAFE(device_limits, l, next, c->io_device_limits) {
 441                         char limit_bufs[_CGROUP_IO_LIMIT_TYPE_MAX][DECIMAL_STR_MAX(uint64_t)];
 442                         char buf[DECIMAL_STR_MAX(dev_t)*2+2+(6+DECIMAL_STR_MAX(uint64_t)+1)*4];
 443                         CGroupIOLimitType type;
 444                         dev_t dev;
 445                         unsigned n = 0;
 446
 447                         r = lookup_block_device(l->path, &dev);
 448                         if (r < 0)
 449                                 continue;
 450
 451                         for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++) {
 452                                 if (l->limits[type] != cgroup_io_limit_defaults[type]) {
 453                                         xsprintf(limit_bufs[type], "%" PRIu64, l->limits[type]);
 454                                         n++;
 455                                 } else {
 456                                         xsprintf(limit_bufs[type], "%s",
 457                                                  l->limits[type] == CGROUP_LIMIT_MAX ? "max" : "0");
 458                                 }
 459                         }
 460
 461                         xsprintf(buf, "%u:%u rbps=%s wbps=%s riops=%s wiops=%s\n", major(dev), minor(dev),
 462                                  limit_bufs[CGROUP_IO_RBPS_MAX], limit_bufs[CGROUP_IO_WBPS_MAX],
 463                                  limit_bufs[CGROUP_IO_RIOPS_MAX], limit_bufs[CGROUP_IO_WIOPS_MAX]);
 464                         r = cg_set_attribute("io", path, "io.max", buf);
 465                         if (r < 0)
 466                                 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 467                                                "Failed to set io.max on %s: %m", path);
 468
 469                         /* If @l contained no config, we just cleared the kernel
 470                            counterpart too. No reason to keep @l around. */
 471                         if (!n)
 472                                 cgroup_context_free_io_device_limit(c, l);
 473                 }
 474         }
 475
 476         if (mask & CGROUP_MASK_BLKIO) {
 477                 char buf[MAX(DECIMAL_STR_MAX(uint64_t)+1,
 478                              DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
 479                 CGroupBlockIODeviceWeight *w;
 480                 CGroupBlockIODeviceBandwidth *b;
 481
 482                 if (!is_root) {
 483                         sprintf(buf, "%" PRIu64 "\n",
 484                                 IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ? c->startup_blockio_weight :
 485                                 c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ? c->blockio_weight : CGROUP_BLKIO_WEIGHT_DEFAULT);
 486                         r = cg_set_attribute("blkio", path, "blkio.weight", buf);
 487                         if (r < 0)
 488                                 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 489                                                "Failed to set blkio.weight on %s: %m", path);
 490
 491                         /* FIXME: no way to reset this list */
 492                         LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
 493                                 dev_t dev;
 494
 495                                 r = lookup_block_device(w->path, &dev);
 496                                 if (r < 0)
 497                                         continue;
 498
 499                                 sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), w->weight);
 500                                 r = cg_set_attribute("blkio", path, "blkio.weight_device", buf);
 501                                 if (r < 0)
 502                                         log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 503                                                        "Failed to set blkio.weight_device on %s: %m", path);
 504                         }
 505                 }
 506
 507                 /* FIXME: no way to reset this list */
 508                 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
 509                         const char *a;
 510                         dev_t dev;
 511
 512                         r = lookup_block_device(b->path, &dev);
 513                         if (r < 0)
 514                                 continue;
 515
 516                         a = b->read ? "blkio.throttle.read_bps_device" : "blkio.throttle.write_bps_device";
 517
 518                         sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), b->bandwidth);
 519                         r = cg_set_attribute("blkio", path, a, buf);
 520                         if (r < 0)
 521                                 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 522                                                "Failed to set %s on %s: %m", a, path);
 523                 }
 524         }
 525
 526         if ((mask & CGROUP_MASK_MEMORY) && !is_root) {
 527                 if (c->memory_limit != (uint64_t) -1) {
 528                         char buf[DECIMAL_STR_MAX(uint64_t) + 1];
 529
 530                         sprintf(buf, "%" PRIu64 "\n", c->memory_limit);
 531
 532                         if (cg_unified() <= 0)
 533                                 r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
 534                         else
 535                                 r = cg_set_attribute("memory", path, "memory.max", buf);
 536
 537                 } else {
 538                         if (cg_unified() <= 0)
 539                                 r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");
 540                         else
 541                                 r = cg_set_attribute("memory", path, "memory.max", "max");
 542                 }
 543
 544                 if (r < 0)
 545                         log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 546                                        "Failed to set memory.limit_in_bytes/memory.max on %s: %m", path);
 547         }
 548
 549         if ((mask & CGROUP_MASK_DEVICES) && !is_root) {
 550                 CGroupDeviceAllow *a;
 551
 552                 /* Changing the devices list of a populated cgroup
 553                  * might result in EINVAL, hence ignore EINVAL
 554                  * here. */
 555
 556                 if (c->device_allow || c->device_policy != CGROUP_AUTO)
 557                         r = cg_set_attribute("devices", path, "devices.deny", "a");
 558                 else
 559                         r = cg_set_attribute("devices", path, "devices.allow", "a");
 560                 if (r < 0)
 561                         log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 562                                        "Failed to reset devices.list on %s: %m", path);
 563
 564                 if (c->device_policy == CGROUP_CLOSED ||
 565                     (c->device_policy == CGROUP_AUTO && c->device_allow)) {
 566                         static const char auto_devices[] =
 567                                 "/dev/null\0" "rwm\0"
 568                                 "/dev/zero\0" "rwm\0"
 569                                 "/dev/full\0" "rwm\0"
 570                                 "/dev/random\0" "rwm\0"
 571                                 "/dev/urandom\0" "rwm\0"
 572                                 "/dev/tty\0" "rwm\0"
 573                                 "/dev/pts/ptmx\0" "rw\0"; /* /dev/pts/ptmx may not be duplicated, but accessed */
 574
 575                         const char *x, *y;
 576
 577                         NULSTR_FOREACH_PAIR(x, y, auto_devices)
 578                                 whitelist_device(path, x, y);
 579
 580                         whitelist_major(path, "pts", 'c', "rw");
 581                         whitelist_major(path, "kdbus", 'c', "rw");
 582                         whitelist_major(path, "kdbus/*", 'c', "rw");
 583                 }
 584
 585                 LIST_FOREACH(device_allow, a, c->device_allow) {
 586                         char acc[4];
 587                         unsigned k = 0;
 588
 589                         if (a->r)
 590                                 acc[k++] = 'r';
 591                         if (a->w)
 592                                 acc[k++] = 'w';
 593                         if (a->m)
 594                                 acc[k++] = 'm';
 595
 596                         if (k == 0)
 597                                 continue;
 598
 599                         acc[k++] = 0;
 600
 601                         if (startswith(a->path, "/dev/"))
 602                                 whitelist_device(path, a->path, acc);
 603                         else if (startswith(a->path, "block-"))
 604                                 whitelist_major(path, a->path + 6, 'b', acc);
 605                         else if (startswith(a->path, "char-"))
 606                                 whitelist_major(path, a->path + 5, 'c', acc);
 607                         else
 608                                 log_debug("Ignoring device %s while writing cgroup attribute.", a->path);
 609                 }
 610         }
 611
 612         if ((mask & CGROUP_MASK_PIDS) && !is_root) {
 613
 614                 if (c->tasks_max != (uint64_t) -1) {
 615                         char buf[DECIMAL_STR_MAX(uint64_t) + 2];
 616
 617                         sprintf(buf, "%" PRIu64 "\n", c->tasks_max);
 618                         r = cg_set_attribute("pids", path, "pids.max", buf);
 619                 } else
 620                         r = cg_set_attribute("pids", path, "pids.max", "max");
 621
 622                 if (r < 0)
 623                         log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
 624                                        "Failed to set pids.max on %s: %m", path);
 625         }
 626 }
 627
 628 CGroupMask cgroup_context_get_mask(CGroupContext *c) {
 629         CGroupMask mask = 0;
 630
 631         /* Figure out which controllers we need */
 632
 633         if (c->cpu_accounting ||
 634             c->cpu_shares != CGROUP_CPU_SHARES_INVALID ||
 635             c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID ||
 636             c->cpu_quota_per_sec_usec != USEC_INFINITY)
 637                 mask |= CGROUP_MASK_CPUACCT | CGROUP_MASK_CPU;
 638
 639         if (c->io_accounting ||
 640             c->io_weight != CGROUP_WEIGHT_INVALID ||
 641             c->startup_io_weight != CGROUP_WEIGHT_INVALID ||
 642             c->io_device_weights ||
 643             c->io_device_limits)
 644                 mask |= CGROUP_MASK_IO;
 645
 646         if (c->blockio_accounting ||
 647             c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
 648             c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
 649             c->blockio_device_weights ||
 650             c->blockio_device_bandwidths)
 651                 mask |= CGROUP_MASK_BLKIO;
 652
 653         if (c->memory_accounting ||
 654             c->memory_limit != (uint64_t) -1)
 655                 mask |= CGROUP_MASK_MEMORY;
 656
 657         if (c->device_allow ||
 658             c->device_policy != CGROUP_AUTO)
 659                 mask |= CGROUP_MASK_DEVICES;
 660
 661         if (c->tasks_accounting ||
 662             c->tasks_max != (uint64_t) -1)
 663                 mask |= CGROUP_MASK_PIDS;
 664
 665         return mask;
 666 }
 667
 668 CGroupMask unit_get_own_mask(Unit *u) {
 669         CGroupContext *c;
 670
 671         /* Returns the mask of controllers the unit needs for itself */
 672
 673         c = unit_get_cgroup_context(u);
 674         if (!c)
 675                 return 0;
 676
 677         /* If delegation is turned on, then turn on all cgroups,
 678          * unless we are on the legacy hierarchy and the process we
 679          * fork into it is known to drop privileges, and hence
 680          * shouldn't get access to the controllers.
 681          *
 682          * Note that on the unified hierarchy it is safe to delegate
 683          * controllers to unprivileged services. */
 684
 685         if (c->delegate) {
 686                 ExecContext *e;
 687
 688                 e = unit_get_exec_context(u);
 689                 if (!e ||
 690                     exec_context_maintains_privileges(e) ||
 691                     cg_unified() > 0)
 692                         return _CGROUP_MASK_ALL;
 693         }
 694
 695         return cgroup_context_get_mask(c);
 696 }
 697
 698 CGroupMask unit_get_members_mask(Unit *u) {
 699         assert(u);
 700
 701         /* Returns the mask of controllers all of the unit's children
 702          * require, merged */
 703
 704         if (u->cgroup_members_mask_valid)
 705                 return u->cgroup_members_mask;
 706
 707         u->cgroup_members_mask = 0;
 708
 709         if (u->type == UNIT_SLICE) {
 710                 Unit *member;
 711                 Iterator i;
 712
 713                 SET_FOREACH(member, u->dependencies[UNIT_BEFORE], i) {
 714
 715                         if (member == u)
 716                                 continue;
 717
 718                         if (UNIT_DEREF(member->slice) != u)
 719                                 continue;
 720
 721                         u->cgroup_members_mask |=
 722                                 unit_get_own_mask(member) |
 723                                 unit_get_members_mask(member);
 724                 }
 725         }
 726
 727         u->cgroup_members_mask_valid = true;
 728         return u->cgroup_members_mask;
 729 }
 730
 731 CGroupMask unit_get_siblings_mask(Unit *u) {
 732         assert(u);
 733
 734         /* Returns the mask of controllers all of the unit's siblings
 735          * require, i.e. the members mask of the unit's parent slice
 736          * if there is one. */
 737
 738         if (UNIT_ISSET(u->slice))
 739                 return unit_get_members_mask(UNIT_DEREF(u->slice));
 740
 741         return unit_get_own_mask(u) | unit_get_members_mask(u);
 742 }
 743
 744 CGroupMask unit_get_subtree_mask(Unit *u) {
 745
 746         /* Returns the mask of this subtree, meaning of the group
 747          * itself and its children. */
 748
 749         return unit_get_own_mask(u) | unit_get_members_mask(u);
 750 }
 751
 752 CGroupMask unit_get_target_mask(Unit *u) {
 753         CGroupMask mask;
 754
 755         /* This returns the cgroup mask of all controllers to enable
 756          * for a specific cgroup, i.e. everything it needs itself,
 757          * plus all that its children need, plus all that its siblings
 758          * need. This is primarily useful on the legacy cgroup
 759          * hierarchy, where we need to duplicate each cgroup in each
 760          * hierarchy that shall be enabled for it. */
 761
 762         mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
 763         mask &= u->manager->cgroup_supported;
 764
 765         return mask;
 766 }
 767
 768 CGroupMask unit_get_enable_mask(Unit *u) {
 769         CGroupMask mask;
 770
 771         /* This returns the cgroup mask of all controllers to enable
 772          * for the children of a specific cgroup. This is primarily
 773          * useful for the unified cgroup hierarchy, where each cgroup
 774          * controls which controllers are enabled for its children. */
 775
 776         mask = unit_get_members_mask(u);
 777         mask &= u->manager->cgroup_supported;
 778
 779         return mask;
 780 }
 781
 782 /* Recurse from a unit up through its containing slices, propagating
 783  * mask bits upward. A unit is also member of itself. */
 784 void unit_update_cgroup_members_masks(Unit *u) {
 785         CGroupMask m;
 786         bool more;
 787
 788         assert(u);
 789
 790         /* Calculate subtree mask */
 791         m = unit_get_subtree_mask(u);
 792
 793         /* See if anything changed from the previous invocation. If
 794          * not, we're done. */
 795         if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
 796                 return;
 797
 798         more =
 799                 u->cgroup_subtree_mask_valid &&
 800                 ((m & ~u->cgroup_subtree_mask) != 0) &&
 801                 ((~m & u->cgroup_subtree_mask) == 0);
 802
 803         u->cgroup_subtree_mask = m;
 804         u->cgroup_subtree_mask_valid = true;
 805
 806         if (UNIT_ISSET(u->slice)) {
 807                 Unit *s = UNIT_DEREF(u->slice);
 808
 809                 if (more)
 810                         /* There's more set now than before. We
 811                          * propagate the new mask to the parent's mask
 812                          * (not caring if it actually was valid or
 813                          * not). */
 814
 815                         s->cgroup_members_mask |= m;
 816
 817                 else
 818                         /* There's less set now than before (or we
 819                          * don't know), we need to recalculate
 820                          * everything, so let's invalidate the
 821                          * parent's members mask */
 822
 823                         s->cgroup_members_mask_valid = false;
 824
 825                 /* And now make sure that this change also hits our
 826                  * grandparents */
 827                 unit_update_cgroup_members_masks(s);
 828         }
 829 }
 830
 831 static const char *migrate_callback(CGroupMask mask, void *userdata) {
 832         Unit *u = userdata;
 833
 834         assert(mask != 0);
 835         assert(u);
 836
 837         while (u) {
 838                 if (u->cgroup_path &&
 839                     u->cgroup_realized &&
 840                     (u->cgroup_realized_mask & mask) == mask)
 841                         return u->cgroup_path;
 842
 843                 u = UNIT_DEREF(u->slice);
 844         }
 845
 846         return NULL;
 847 }
 848
 849 char *unit_default_cgroup_path(Unit *u) {
 850         _cleanup_free_ char *escaped = NULL, *slice = NULL;
 851         int r;
 852
 853         assert(u);
 854
 855         if (unit_has_name(u, SPECIAL_ROOT_SLICE))
 856                 return strdup(u->manager->cgroup_root);
 857
 858         if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) {
 859                 r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice);
 860                 if (r < 0)
 861                         return NULL;
 862         }
 863
 864         escaped = cg_escape(u->id);
 865         if (!escaped)
 866                 return NULL;
 867
 868         if (slice)
 869                 return strjoin(u->manager->cgroup_root, "/", slice, "/", escaped, NULL);
 870         else
 871                 return strjoin(u->manager->cgroup_root, "/", escaped, NULL);
 872 }
 873
 874 int unit_set_cgroup_path(Unit *u, const char *path) {
 875         _cleanup_free_ char *p = NULL;
 876         int r;
 877
 878         assert(u);
 879
 880         if (path) {
 881                 p = strdup(path);
 882                 if (!p)
 883                         return -ENOMEM;
 884         } else
 885                 p = NULL;
 886
 887         if (streq_ptr(u->cgroup_path, p))
 888                 return 0;
 889
 890         if (p) {
 891                 r = hashmap_put(u->manager->cgroup_unit, p, u);
 892                 if (r < 0)
 893                         return r;
 894         }
 895
 896         unit_release_cgroup(u);
 897
 898         u->cgroup_path = p;
 899         p = NULL;
 900
 901         return 1;
 902 }
 903
 904 int unit_watch_cgroup(Unit *u) {
 905         _cleanup_free_ char *events = NULL;
 906         int r;
 907
 908         assert(u);
 909
 910         if (!u->cgroup_path)
 911                 return 0;
 912
 913         if (u->cgroup_inotify_wd >= 0)
 914                 return 0;
 915
 916         /* Only applies to the unified hierarchy */
 917         r = cg_unified();
 918         if (r < 0)
 919                 return log_unit_error_errno(u, r, "Failed detect wether the unified hierarchy is used: %m");
 920         if (r == 0)
 921                 return 0;
 922
 923         /* Don't watch the root slice, it's pointless. */
 924         if (unit_has_name(u, SPECIAL_ROOT_SLICE))
 925                 return 0;
 926
 927         r = hashmap_ensure_allocated(&u->manager->cgroup_inotify_wd_unit, &trivial_hash_ops);
 928         if (r < 0)
 929                 return log_oom();
 930
 931         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events", &events);
 932         if (r < 0)
 933                 return log_oom();
 934
 935         u->cgroup_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY);
 936         if (u->cgroup_inotify_wd < 0) {
 937
 938                 if (errno == ENOENT) /* If the directory is already
 939                                       * gone we don't need to track
 940                                       * it, so this is not an error */
 941                         return 0;
 942
 943                 return log_unit_error_errno(u, errno, "Failed to add inotify watch descriptor for control group %s: %m", u->cgroup_path);
 944         }
 945
 946         r = hashmap_put(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd), u);
 947         if (r < 0)
 948                 return log_unit_error_errno(u, r, "Failed to add inotify watch descriptor to hash map: %m");
 949
 950         return 0;
 951 }
 952
 953 static int unit_create_cgroup(
 954                 Unit *u,
 955                 CGroupMask target_mask,
 956                 CGroupMask enable_mask) {
 957
 958         CGroupContext *c;
 959         int r;
 960
 961         assert(u);
 962
 963         c = unit_get_cgroup_context(u);
 964         if (!c)
 965                 return 0;
 966
 967         if (!u->cgroup_path) {
 968                 _cleanup_free_ char *path = NULL;
 969
 970                 path = unit_default_cgroup_path(u);
 971                 if (!path)
 972                         return log_oom();
 973
 974                 r = unit_set_cgroup_path(u, path);
 975                 if (r == -EEXIST)
 976                         return log_unit_error_errno(u, r, "Control group %s exists already.", path);
 977                 if (r < 0)
 978                         return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", path);
 979         }
 980
 981         /* First, create our own group */
 982         r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
 983         if (r < 0)
 984                 return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", u->cgroup_path);
 985
 986         /* Start watching it */
 987         (void) unit_watch_cgroup(u);
 988
 989         /* Enable all controllers we need */
 990         r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path);
 991         if (r < 0)
 992                 log_unit_warning_errno(u, r, "Failed to enable controllers on cgroup %s, ignoring: %m", u->cgroup_path);
 993
 994         /* Keep track that this is now realized */
 995         u->cgroup_realized = true;
 996         u->cgroup_realized_mask = target_mask;
 997         u->cgroup_enabled_mask = enable_mask;
 998
 999         if (u->type != UNIT_SLICE && !c->delegate) {
1000
1001                 /* Then, possibly move things over, but not if
1002                  * subgroups may contain processes, which is the case
1003                  * for slice and delegation units. */
1004                 r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
1005                 if (r < 0)
1006                         log_unit_warning_errno(u, r, "Failed to migrate cgroup from to %s, ignoring: %m", u->cgroup_path);
1007         }
1008
1009         return 0;
1010 }
1011
1012 int unit_attach_pids_to_cgroup(Unit *u) {
1013         int r;
1014         assert(u);
1015
1016         r = unit_realize_cgroup(u);
1017         if (r < 0)
1018                 return r;
1019
1020         r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->pids, migrate_callback, u);
1021         if (r < 0)
1022                 return r;
1023
1024         return 0;
1025 }
1026
1027 static bool unit_has_mask_realized(Unit *u, CGroupMask target_mask, CGroupMask enable_mask) {
1028         assert(u);
1029
1030         return u->cgroup_realized && u->cgroup_realized_mask == target_mask && u->cgroup_enabled_mask == enable_mask;
1031 }
1032
1033 /* Check if necessary controllers and attributes for a unit are in place.
1034  *
1035  * If so, do nothing.
1036  * If not, create paths, move processes over, and set attributes.
1037  *
1038  * Returns 0 on success and < 0 on failure. */
1039 static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
1040         CGroupMask target_mask, enable_mask;
1041         int r;
1042
1043         assert(u);
1044
1045         if (u->in_cgroup_queue) {
1046                 LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
1047                 u->in_cgroup_queue = false;
1048         }
1049
1050         target_mask = unit_get_target_mask(u);
1051         enable_mask = unit_get_enable_mask(u);
1052
1053         if (unit_has_mask_realized(u, target_mask, enable_mask))
1054                 return 0;
1055
1056         /* First, realize parents */
1057         if (UNIT_ISSET(u->slice)) {
1058                 r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
1059                 if (r < 0)
1060                         return r;
1061         }
1062
1063         /* And then do the real work */
1064         r = unit_create_cgroup(u, target_mask, enable_mask);
1065         if (r < 0)
1066                 return r;
1067
1068         /* Finally, apply the necessary attributes. */
1069         cgroup_context_apply(unit_get_cgroup_context(u), target_mask, u->cgroup_path, state);
1070
1071         return 0;
1072 }
1073
1074 static void unit_add_to_cgroup_queue(Unit *u) {
1075
1076         if (u->in_cgroup_queue)
1077                 return;
1078
1079         LIST_PREPEND(cgroup_queue, u->manager->cgroup_queue, u);
1080         u->in_cgroup_queue = true;
1081 }
1082
1083 unsigned manager_dispatch_cgroup_queue(Manager *m) {
1084         ManagerState state;
1085         unsigned n = 0;
1086         Unit *i;
1087         int r;
1088
1089         state = manager_state(m);
1090
1091         while ((i = m->cgroup_queue)) {
1092                 assert(i->in_cgroup_queue);
1093
1094                 r = unit_realize_cgroup_now(i, state);
1095                 if (r < 0)
1096                         log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);
1097
1098                 n++;
1099         }
1100
1101         return n;
1102 }
1103
1104 static void unit_queue_siblings(Unit *u) {
1105         Unit *slice;
1106
1107         /* This adds the siblings of the specified unit and the
1108          * siblings of all parent units to the cgroup queue. (But
1109          * neither the specified unit itself nor the parents.) */
1110
1111         while ((slice = UNIT_DEREF(u->slice))) {
1112                 Iterator i;
1113                 Unit *m;
1114
1115                 SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
1116                         if (m == u)
1117                                 continue;
1118
1119                         /* Skip units that have a dependency on the slice
1120                          * but aren't actually in it. */
1121                         if (UNIT_DEREF(m->slice) != slice)
1122                                 continue;
1123
1124                         /* No point in doing cgroup application for units
1125                          * without active processes. */
1126                         if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
1127                                 continue;
1128
1129                         /* If the unit doesn't need any new controllers
1130                          * and has current ones realized, it doesn't need
1131                          * any changes. */
1132                         if (unit_has_mask_realized(m, unit_get_target_mask(m), unit_get_enable_mask(m)))
1133                                 continue;
1134
1135                         unit_add_to_cgroup_queue(m);
1136                 }
1137
1138                 u = slice;
1139         }
1140 }
1141
1142 int unit_realize_cgroup(Unit *u) {
1143         assert(u);
1144
1145         if (!UNIT_HAS_CGROUP_CONTEXT(u))
1146                 return 0;
1147
1148         /* So, here's the deal: when realizing the cgroups for this
1149          * unit, we need to first create all parents, but there's more
1150          * actually: for the weight-based controllers we also need to
1151          * make sure that all our siblings (i.e. units that are in the
1152          * same slice as we are) have cgroups, too. Otherwise, things
1153          * would become very uneven as each of their processes would
1154          * get as much resources as all our group together. This call
1155          * will synchronously create the parent cgroups, but will
1156          * defer work on the siblings to the next event loop
1157          * iteration. */
1158
1159         /* Add all sibling slices to the cgroup queue. */
1160         unit_queue_siblings(u);
1161
1162         /* And realize this one now (and apply the values) */
1163         return unit_realize_cgroup_now(u, manager_state(u->manager));
1164 }
1165
1166 void unit_release_cgroup(Unit *u) {
1167         assert(u);
1168
1169         /* Forgets all cgroup details for this cgroup */
1170
1171         if (u->cgroup_path) {
1172                 (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
1173                 u->cgroup_path = mfree(u->cgroup_path);
1174         }
1175
1176         if (u->cgroup_inotify_wd >= 0) {
1177                 if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_inotify_wd) < 0)
1178                         log_unit_debug_errno(u, errno, "Failed to remove cgroup inotify watch %i for %s, ignoring", u->cgroup_inotify_wd, u->id);
1179
1180                 (void) hashmap_remove(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd));
1181                 u->cgroup_inotify_wd = -1;
1182         }
1183 }
1184
1185 void unit_prune_cgroup(Unit *u) {
1186         int r;
1187         bool is_root_slice;
1188
1189         assert(u);
1190
1191         /* Removes the cgroup, if empty and possible, and stops watching it. */
1192
1193         if (!u->cgroup_path)
1194                 return;
1195
1196         is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);
1197
1198         r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
1199         if (r < 0) {
1200                 log_debug_errno(r, "Failed to destroy cgroup %s, ignoring: %m", u->cgroup_path);
1201                 return;
1202         }
1203
1204         if (is_root_slice)
1205                 return;
1206
1207         unit_release_cgroup(u);
1208
1209         u->cgroup_realized = false;
1210         u->cgroup_realized_mask = 0;
1211         u->cgroup_enabled_mask = 0;
1212 }
1213
1214 int unit_search_main_pid(Unit *u, pid_t *ret) {
1215         _cleanup_fclose_ FILE *f = NULL;
1216         pid_t pid = 0, npid, mypid;
1217         int r;
1218
1219         assert(u);
1220         assert(ret);
1221
1222         if (!u->cgroup_path)
1223                 return -ENXIO;
1224
1225         r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f);
1226         if (r < 0)
1227                 return r;
1228
1229         mypid = getpid();
1230         while (cg_read_pid(f, &npid) > 0)  {
1231                 pid_t ppid;
1232
1233                 if (npid == pid)
1234                         continue;
1235
1236                 /* Ignore processes that aren't our kids */
1237                 if (get_process_ppid(npid, &ppid) >= 0 && ppid != mypid)
1238                         continue;
1239
1240                 if (pid != 0)
1241                         /* Dang, there's more than one daemonized PID
1242                         in this group, so we don't know what process
1243                         is the main process. */
1244
1245                         return -ENODATA;
1246
1247                 pid = npid;
1248         }
1249
1250         *ret = pid;
1251         return 0;
1252 }
1253
1254 static int unit_watch_pids_in_path(Unit *u, const char *path) {
1255         _cleanup_closedir_ DIR *d = NULL;
1256         _cleanup_fclose_ FILE *f = NULL;
1257         int ret = 0, r;
1258
1259         assert(u);
1260         assert(path);
1261
1262         r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
1263         if (r < 0)
1264                 ret = r;
1265         else {
1266                 pid_t pid;
1267
1268                 while ((r = cg_read_pid(f, &pid)) > 0) {
1269                         r = unit_watch_pid(u, pid);
1270                         if (r < 0 && ret >= 0)
1271                                 ret = r;
1272                 }
1273
1274                 if (r < 0 && ret >= 0)
1275                         ret = r;
1276         }
1277
1278         r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
1279         if (r < 0) {
1280                 if (ret >= 0)
1281                         ret = r;
1282         } else {
1283                 char *fn;
1284
1285                 while ((r = cg_read_subgroup(d, &fn)) > 0) {
1286                         _cleanup_free_ char *p = NULL;
1287
1288                         p = strjoin(path, "/", fn, NULL);
1289                         free(fn);
1290
1291                         if (!p)
1292                                 return -ENOMEM;
1293
1294                         r = unit_watch_pids_in_path(u, p);
1295                         if (r < 0 && ret >= 0)
1296                                 ret = r;
1297                 }
1298
1299                 if (r < 0 && ret >= 0)
1300                         ret = r;
1301         }
1302
1303         return ret;
1304 }
1305
1306 int unit_watch_all_pids(Unit *u) {
1307         assert(u);
1308
1309         /* Adds all PIDs from our cgroup to the set of PIDs we
1310          * watch. This is a fallback logic for cases where we do not
1311          * get reliable cgroup empty notifications: we try to use
1312          * SIGCHLD as replacement. */
1313
1314         if (!u->cgroup_path)
1315                 return -ENOENT;
1316
1317         if (cg_unified() > 0) /* On unified we can use proper notifications */
1318                 return 0;
1319
1320         return unit_watch_pids_in_path(u, u->cgroup_path);
1321 }
1322
1323 int unit_notify_cgroup_empty(Unit *u) {
1324         int r;
1325
1326         assert(u);
1327
1328         if (!u->cgroup_path)
1329                 return 0;
1330
1331         r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
1332         if (r <= 0)
1333                 return r;
1334
1335         unit_add_to_gc_queue(u);
1336
1337         if (UNIT_VTABLE(u)->notify_cgroup_empty)
1338                 UNIT_VTABLE(u)->notify_cgroup_empty(u);
1339
1340         return 0;
1341 }
1342
1343 static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
1344         Manager *m = userdata;
1345
1346         assert(s);
1347         assert(fd >= 0);
1348         assert(m);
1349
1350         for (;;) {
1351                 union inotify_event_buffer buffer;
1352                 struct inotify_event *e;
1353                 ssize_t l;
1354
1355                 l = read(fd, &buffer, sizeof(buffer));
1356                 if (l < 0) {
1357                         if (errno == EINTR || errno == EAGAIN)
1358                                 return 0;
1359
1360                         return log_error_errno(errno, "Failed to read control group inotify events: %m");
1361                 }
1362
1363                 FOREACH_INOTIFY_EVENT(e, buffer, l) {
1364                         Unit *u;
1365
1366                         if (e->wd < 0)
1367                                 /* Queue overflow has no watch descriptor */
1368                                 continue;
1369
1370                         if (e->mask & IN_IGNORED)
1371                                 /* The watch was just removed */
1372                                 continue;
1373
1374                         u = hashmap_get(m->cgroup_inotify_wd_unit, INT_TO_PTR(e->wd));
1375                         if (!u) /* Not that inotify might deliver
1376                                  * events for a watch even after it
1377                                  * was removed, because it was queued
1378                                  * before the removal. Let's ignore
1379                                  * this here safely. */
1380                                 continue;
1381
1382                         (void) unit_notify_cgroup_empty(u);
1383                 }
1384         }
1385 }
1386
1387 int manager_setup_cgroup(Manager *m) {
1388         _cleanup_free_ char *path = NULL;
1389         CGroupController c;
1390         int r, unified;
1391         char *e;
1392
1393         assert(m);
1394
1395         /* 1. Determine hierarchy */
1396         m->cgroup_root = mfree(m->cgroup_root);
1397         r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
1398         if (r < 0)
1399                 return log_error_errno(r, "Cannot determine cgroup we are running in: %m");
1400
1401         /* Chop off the init scope, if we are already located in it */
1402         e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
1403
1404         /* LEGACY: Also chop off the system slice if we are in
1405          * it. This is to support live upgrades from older systemd
1406          * versions where PID 1 was moved there. Also see
1407          * cg_get_root_path(). */
1408         if (!e && MANAGER_IS_SYSTEM(m)) {
1409                 e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
1410                 if (!e)
1411                         e = endswith(m->cgroup_root, "/system"); /* even more legacy */
1412         }
1413         if (e)
1414                 *e = 0;
1415
1416         /* And make sure to store away the root value without trailing
1417          * slash, even for the root dir, so that we can easily prepend
1418          * it everywhere. */
1419         while ((e = endswith(m->cgroup_root, "/")))
1420                 *e = 0;
1421
1422         /* 2. Show data */
1423         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
1424         if (r < 0)
1425                 return log_error_errno(r, "Cannot find cgroup mount point: %m");
1426
1427         unified = cg_unified();
1428         if (unified < 0)
1429                 return log_error_errno(r, "Couldn't determine if we are running in the unified hierarchy: %m");
1430         if (unified > 0)
1431                 log_debug("Unified cgroup hierarchy is located at %s.", path);
1432         else
1433                 log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);
1434
1435         if (!m->test_run) {
1436                 const char *scope_path;
1437
1438                 /* 3. Install agent */
1439                 if (unified) {
1440
1441                         /* In the unified hierarchy we can can get
1442                          * cgroup empty notifications via inotify. */
1443
1444                         m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
1445                         safe_close(m->cgroup_inotify_fd);
1446
1447                         m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
1448                         if (m->cgroup_inotify_fd < 0)
1449                                 return log_error_errno(errno, "Failed to create control group inotify object: %m");
1450
1451                         r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
1452                         if (r < 0)
1453                                 return log_error_errno(r, "Failed to watch control group inotify object: %m");
1454
1455                         /* Process cgroup empty notifications early, but after service notifications and SIGCHLD. Also
1456                          * see handling of cgroup agent notifications, for the classic cgroup hierarchy support. */
1457                         r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-5);
1458                         if (r < 0)
1459                                 return log_error_errno(r, "Failed to set priority of inotify event source: %m");
1460
1461                         (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");
1462
1463                 } else if (MANAGER_IS_SYSTEM(m)) {
1464
1465                         /* On the legacy hierarchy we only get
1466                          * notifications via cgroup agents. (Which
1467                          * isn't really reliable, since it does not
1468                          * generate events when control groups with
1469                          * children run empty. */
1470
1471                         r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
1472                         if (r < 0)
1473                                 log_warning_errno(r, "Failed to install release agent, ignoring: %m");
1474                         else if (r > 0)
1475                                 log_debug("Installed release agent.");
1476                         else if (r == 0)
1477                                 log_debug("Release agent already installed.");
1478                 }
1479
1480                 /* 4. Make sure we are in the special "init.scope" unit in the root slice. */
1481                 scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
1482                 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
1483                 if (r < 0)
1484                         return log_error_errno(r, "Failed to create %s control group: %m", scope_path);
1485
1486                 /* also, move all other userspace processes remaining
1487                  * in the root cgroup into that scope. */
1488                 r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, false);
1489                 if (r < 0)
1490                         log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");
1491
1492                 /* 5. And pin it, so that it cannot be unmounted */
1493                 safe_close(m->pin_cgroupfs_fd);
1494                 m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
1495                 if (m->pin_cgroupfs_fd < 0)
1496                         return log_error_errno(errno, "Failed to open pin file: %m");
1497
1498                 /* 6.  Always enable hierarchical support if it exists... */
1499                 if (!unified)
1500                         (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
1501         }
1502
1503         /* 7. Figure out which controllers are supported */
1504         r = cg_mask_supported(&m->cgroup_supported);
1505         if (r < 0)
1506                 return log_error_errno(r, "Failed to determine supported controllers: %m");
1507
1508         for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
1509                 log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & c));
1510
1511         return 0;
1512 }
1513
1514 void manager_shutdown_cgroup(Manager *m, bool delete) {
1515         assert(m);
1516
1517         /* We can't really delete the group, since we are in it. But
1518          * let's trim it. */
1519         if (delete && m->cgroup_root)
1520                 (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
1521
1522         m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit);
1523
1524         m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
1525         m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);
1526
1527         m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);
1528
1529         m->cgroup_root = mfree(m->cgroup_root);
1530 }
1531
1532 Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
1533         char *p;
1534         Unit *u;
1535
1536         assert(m);
1537         assert(cgroup);
1538
1539         u = hashmap_get(m->cgroup_unit, cgroup);
1540         if (u)
1541                 return u;
1542
1543         p = strdupa(cgroup);
1544         for (;;) {
1545                 char *e;
1546
1547                 e = strrchr(p, '/');
1548                 if (!e || e == p)
1549                         return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);
1550
1551                 *e = 0;
1552
1553                 u = hashmap_get(m->cgroup_unit, p);
1554                 if (u)
1555                         return u;
1556         }
1557 }
1558
1559 Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) {
1560         _cleanup_free_ char *cgroup = NULL;
1561         int r;
1562
1563         assert(m);
1564
1565         if (pid <= 0)
1566                 return NULL;
1567
1568         r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
1569         if (r < 0)
1570                 return NULL;
1571
1572         return manager_get_unit_by_cgroup(m, cgroup);
1573 }
1574
1575 Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
1576         Unit *u;
1577
1578         assert(m);
1579
1580         if (pid <= 0)
1581                 return NULL;
1582
1583         if (pid == 1)
1584                 return hashmap_get(m->units, SPECIAL_INIT_SCOPE);
1585
1586         u = hashmap_get(m->watch_pids1, PID_TO_PTR(pid));
1587         if (u)
1588                 return u;
1589
1590         u = hashmap_get(m->watch_pids2, PID_TO_PTR(pid));
1591         if (u)
1592                 return u;
1593
1594         return manager_get_unit_by_pid_cgroup(m, pid);
1595 }
1596
1597 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
1598         Unit *u;
1599
1600         assert(m);
1601         assert(cgroup);
1602
1603         log_debug("Got cgroup empty notification for: %s", cgroup);
1604
1605         u = manager_get_unit_by_cgroup(m, cgroup);
1606         if (!u)
1607                 return 0;
1608
1609         return unit_notify_cgroup_empty(u);
1610 }
1611
1612 int unit_get_memory_current(Unit *u, uint64_t *ret) {
1613         _cleanup_free_ char *v = NULL;
1614         int r;
1615
1616         assert(u);
1617         assert(ret);
1618
1619         if (!u->cgroup_path)
1620                 return -ENODATA;
1621
1622         if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
1623                 return -ENODATA;
1624
1625         if (cg_unified() <= 0)
1626                 r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
1627         else
1628                 r = cg_get_attribute("memory", u->cgroup_path, "memory.current", &v);
1629         if (r == -ENOENT)
1630                 return -ENODATA;
1631         if (r < 0)
1632                 return r;
1633
1634         return safe_atou64(v, ret);
1635 }
1636
1637 int unit_get_tasks_current(Unit *u, uint64_t *ret) {
1638         _cleanup_free_ char *v = NULL;
1639         int r;
1640
1641         assert(u);
1642         assert(ret);
1643
1644         if (!u->cgroup_path)
1645                 return -ENODATA;
1646
1647         if ((u->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0)
1648                 return -ENODATA;
1649
1650         r = cg_get_attribute("pids", u->cgroup_path, "pids.current", &v);
1651         if (r == -ENOENT)
1652                 return -ENODATA;
1653         if (r < 0)
1654                 return r;
1655
1656         return safe_atou64(v, ret);
1657 }
1658
1659 static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
1660         _cleanup_free_ char *v = NULL;
1661         uint64_t ns;
1662         int r;
1663
1664         assert(u);
1665         assert(ret);
1666
1667         if (!u->cgroup_path)
1668                 return -ENODATA;
1669
1670         if ((u->cgroup_realized_mask & CGROUP_MASK_CPUACCT) == 0)
1671                 return -ENODATA;
1672
1673         r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);
1674         if (r == -ENOENT)
1675                 return -ENODATA;
1676         if (r < 0)
1677                 return r;
1678
1679         r = safe_atou64(v, &ns);
1680         if (r < 0)
1681                 return r;
1682
1683         *ret = ns;
1684         return 0;
1685 }
1686
1687 int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
1688         nsec_t ns;
1689         int r;
1690
1691         r = unit_get_cpu_usage_raw(u, &ns);
1692         if (r < 0)
1693                 return r;
1694
1695         if (ns > u->cpuacct_usage_base)
1696                 ns -= u->cpuacct_usage_base;
1697         else
1698                 ns = 0;
1699
1700         *ret = ns;
1701         return 0;
1702 }
1703
1704 int unit_reset_cpu_usage(Unit *u) {
1705         nsec_t ns;
1706         int r;
1707
1708         assert(u);
1709
1710         r = unit_get_cpu_usage_raw(u, &ns);
1711         if (r < 0) {
1712                 u->cpuacct_usage_base = 0;
1713                 return r;
1714         }
1715
1716         u->cpuacct_usage_base = ns;
1717         return 0;
1718 }
1719
1720 bool unit_cgroup_delegate(Unit *u) {
1721         CGroupContext *c;
1722
1723         assert(u);
1724
1725         c = unit_get_cgroup_context(u);
1726         if (!c)
1727                 return false;
1728
1729         return c->delegate;
1730 }
1731
1732 void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
1733         assert(u);
1734
1735         if (!UNIT_HAS_CGROUP_CONTEXT(u))
1736                 return;
1737
1738         if (m == 0)
1739                 return;
1740
1741         if ((u->cgroup_realized_mask & m) == 0)
1742                 return;
1743
1744         u->cgroup_realized_mask &= ~m;
1745         unit_add_to_cgroup_queue(u);
1746 }
1747
1748 void manager_invalidate_startup_units(Manager *m) {
1749         Iterator i;
1750         Unit *u;
1751
1752         assert(m);
1753
1754         SET_FOREACH(u, m->startup_units, i)
1755                 unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_IO|CGROUP_MASK_BLKIO);
1756 }
1757
1758 static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
1759         [CGROUP_AUTO] = "auto",
1760         [CGROUP_CLOSED] = "closed",
1761         [CGROUP_STRICT] = "strict",
1762 };
1763
1764 DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);