/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/

/***
  This file is part of systemd.

  Copyright 2013 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/

#include <fcntl.h>
#include <fnmatch.h>

#include "cgroup-util.h"
#include "path-util.h"
#include "process-util.h"
#include "special.h"
#include "string-util.h"
#include "cgroup.h"

#define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
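/* A fixed 100ms CFS period is used below; CPUQuota= is stored as usec of CPU
 * time per second and scaled to this period when written to cpu.cfs_quota_us.
 * Worked example (illustrative): CPUQuota=20% gives
 * cpu_quota_per_sec_usec == 200000, and 200000 * 100000 / 1000000 == 20000us
 * of runtime per 100000us period. */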

void cgroup_context_init(CGroupContext *c) {
        assert(c);

        /* Initialize everything to the kernel defaults, assuming the
         * structure is preinitialized to 0 */

        c->cpu_shares = CGROUP_CPU_SHARES_INVALID;
        c->startup_cpu_shares = CGROUP_CPU_SHARES_INVALID;
        c->cpu_quota_per_sec_usec = USEC_INFINITY;

        c->memory_limit = (uint64_t) -1;

        c->blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
        c->startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;

        c->tasks_max = (uint64_t) -1;

        c->netclass_type = CGROUP_NETCLASS_TYPE_NONE;
}

void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
        assert(c);
        assert(a);

        LIST_REMOVE(device_allow, c->device_allow, a);
        free(a->path);
        free(a);
}

void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
        assert(c);
        assert(w);

        LIST_REMOVE(device_weights, c->blockio_device_weights, w);
        free(w->path);
        free(w);
}

void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
        assert(c);
        assert(b);

        LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
        free(b->path);
        free(b);
}

void cgroup_context_done(CGroupContext *c) {
        assert(c);

        while (c->blockio_device_weights)
                cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);

        while (c->blockio_device_bandwidths)
                cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);

        while (c->device_allow)
                cgroup_context_free_device_allow(c, c->device_allow);
}

void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
        CGroupBlockIODeviceBandwidth *b;
        CGroupBlockIODeviceWeight *w;
        CGroupDeviceAllow *a;
        char u[FORMAT_TIMESPAN_MAX];

        assert(c);
        assert(f);

        prefix = strempty(prefix);

        fprintf(f,
                "%sCPUAccounting=%s\n"
                "%sBlockIOAccounting=%s\n"
                "%sMemoryAccounting=%s\n"
                "%sTasksAccounting=%s\n"
                "%sCPUShares=%" PRIu64 "\n"
                "%sStartupCPUShares=%" PRIu64 "\n"
                "%sCPUQuotaPerSecSec=%s\n"
                "%sBlockIOWeight=%" PRIu64 "\n"
                "%sStartupBlockIOWeight=%" PRIu64 "\n"
                "%sMemoryLimit=%" PRIu64 "\n"
                "%sTasksMax=%" PRIu64 "\n"
                "%sDevicePolicy=%s\n"
                "%sDelegate=%s\n",
                prefix, yes_no(c->cpu_accounting),
                prefix, yes_no(c->blockio_accounting),
                prefix, yes_no(c->memory_accounting),
                prefix, yes_no(c->tasks_accounting),
                prefix, c->cpu_shares,
                prefix, c->startup_cpu_shares,
                prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
                prefix, c->blockio_weight,
                prefix, c->startup_blockio_weight,
                prefix, c->memory_limit,
                prefix, c->tasks_max,
                prefix, cgroup_device_policy_to_string(c->device_policy),
                prefix, yes_no(c->delegate));

        LIST_FOREACH(device_allow, a, c->device_allow)
                fprintf(f,
                        "%sDeviceAllow=%s %s%s%s\n",
                        prefix,
                        a->path,
                        a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");

        LIST_FOREACH(device_weights, w, c->blockio_device_weights)
                fprintf(f,
                        "%sBlockIODeviceWeight=%s %" PRIu64 "\n",
                        prefix,
                        w->path,
                        w->weight);

        LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                char buf[FORMAT_BYTES_MAX];

                fprintf(f,
                        "%s%s=%s %s\n",
                        prefix,
                        b->read ? "BlockIOReadBandwidth" : "BlockIOWriteBandwidth",
                        b->path,
                        format_bytes(buf, sizeof(buf), b->bandwidth));
        }
}

static int lookup_blkio_device(const char *p, dev_t *dev) {
        struct stat st;
        int r;

        assert(p);
        assert(dev);

        r = stat(p, &st);
        if (r < 0)
                return log_warning_errno(errno, "Couldn't stat device %s: %m", p);

        if (S_ISBLK(st.st_mode))
                *dev = st.st_rdev;
        else if (major(st.st_dev) != 0) {
                /* If this is not a device node then find the block
                 * device this file is stored on */
                *dev = st.st_dev;

                /* If this is a partition, try to get the originating
                 * block device */
                block_get_whole_disk(*dev, dev);
        } else {
                log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
                return -ENODEV;
        }

        return 0;
}
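/* Illustrative example: given a regular file that lives on /dev/sda3,
 * st.st_dev identifies the sda3 partition, and block_get_whole_disk() maps
 * that to the whole disk sda, which is what the blkio attributes expect. */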

static int whitelist_device(const char *path, const char *node, const char *acc) {
        char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
        struct stat st;
        int r;

        assert(path);
        assert(acc);

        if (stat(node, &st) < 0) {
                log_warning_errno(errno, "Couldn't stat device %s: %m", node);
                return -errno;
        }

        if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
                log_warning("%s is not a device.", node);
                return -ENODEV;
        }

        sprintf(buf,
                "%c %u:%u %s",
                S_ISCHR(st.st_mode) ? 'c' : 'b',
                major(st.st_rdev), minor(st.st_rdev),
                acc);

        r = cg_set_attribute("devices", path, "devices.allow", buf);
        if (r < 0)
                log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
                               "Failed to set devices.allow on %s: %m", path);

        return r;
}
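/* Illustrative example: whitelisting /dev/null (char device 1:3) for
 * read/write/mknod ends up writing the line "c 1:3 rwm" to devices.allow. */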

static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
        _cleanup_fclose_ FILE *f = NULL;
        char line[LINE_MAX];
        bool good = false;
        int r;

        assert(path);
        assert(acc);
        assert(type == 'b' || type == 'c');

        f = fopen("/proc/devices", "re");
        if (!f)
                return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);

        FOREACH_LINE(line, f, goto fail) {
                char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
                unsigned maj;

                truncate_nl(line);

                if (type == 'c' && streq(line, "Character devices:")) {
                        good = true;
                        continue;
                }

                if (type == 'b' && streq(line, "Block devices:")) {
                        good = true;
                        continue;
                }

                if (isempty(line)) {
                        good = false;
                        continue;
                }

                if (!good)
                        continue;

                p = strstrip(line);

                w = strpbrk(p, WHITESPACE);
                if (!w)
                        continue;
                *w = 0;

                r = safe_atou(p, &maj);
                if (r < 0)
                        continue;
                if (maj <= 0)
                        continue;

                w++;
                w += strspn(w, WHITESPACE);

                if (fnmatch(name, w, 0) != 0)
                        continue;

                sprintf(buf,
                        "%c %u:* %s",
                        type,
                        maj,
                        acc);

                r = cg_set_attribute("devices", path, "devices.allow", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set devices.allow on %s: %m", path);
        }

        return 0;

fail:
        log_warning_errno(errno, "Failed to read /proc/devices: %m");
        return -errno;
}
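/* The parser above walks /proc/devices, which looks roughly like this
 * (illustrative excerpt; exact contents depend on the running kernel):
 *
 *   Character devices:
 *     1 mem
 *     5 /dev/tty
 *   136 pts
 *
 *   Block devices:
 *     8 sd
 *   254 device-mapper
 *
 * Matching "pts" with type 'c' would thus write "c 136:* rw" to devices.allow. */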

void cgroup_context_apply(CGroupContext *c, CGroupMask mask, const char *path, uint32_t netclass, ManagerState state) {
        bool is_root;
        int r;

        assert(c);
        assert(path);

        if (mask == 0)
                return;

        /* Some cgroup attributes are not supported on the root cgroup,
         * hence silently ignore */
        is_root = isempty(path) || path_equal(path, "/");
        if (is_root)
                /* Make sure we don't try to display messages with an empty path. */
                path = "/";

        /* We generally ignore errors caused by read-only mounted
         * cgroup trees (assuming we are running in a container then),
         * and missing cgroups, i.e. EROFS and ENOENT. */

        if ((mask & CGROUP_MASK_CPU) && !is_root) {
                char buf[MAX(DECIMAL_STR_MAX(uint64_t), DECIMAL_STR_MAX(usec_t)) + 1];

                sprintf(buf, "%" PRIu64 "\n",
                        IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID ? c->startup_cpu_shares :
                        c->cpu_shares != CGROUP_CPU_SHARES_INVALID ? c->cpu_shares : CGROUP_CPU_SHARES_DEFAULT);
                r = cg_set_attribute("cpu", path, "cpu.shares", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set cpu.shares on %s: %m", path);

                sprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
                r = cg_set_attribute("cpu", path, "cpu.cfs_period_us", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set cpu.cfs_period_us on %s: %m", path);

                if (c->cpu_quota_per_sec_usec != USEC_INFINITY) {
                        sprintf(buf, USEC_FMT "\n", c->cpu_quota_per_sec_usec * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
                        r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", buf);
                } else
                        r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", "-1");
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set cpu.cfs_quota_us on %s: %m", path);
        }

        if (mask & CGROUP_MASK_BLKIO) {
                char buf[MAX(DECIMAL_STR_MAX(uint64_t)+1,
                             DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
                CGroupBlockIODeviceWeight *w;
                CGroupBlockIODeviceBandwidth *b;

                if (!is_root) {
                        sprintf(buf, "%" PRIu64 "\n",
                                IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ? c->startup_blockio_weight :
                                c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ? c->blockio_weight : CGROUP_BLKIO_WEIGHT_DEFAULT);
                        r = cg_set_attribute("blkio", path, "blkio.weight", buf);
                        if (r < 0)
                                log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                               "Failed to set blkio.weight on %s: %m", path);

                        /* FIXME: no way to reset this list */
                        LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
                                dev_t dev;

                                r = lookup_blkio_device(w->path, &dev);
                                if (r < 0)
                                        continue;

                                sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), w->weight);
                                r = cg_set_attribute("blkio", path, "blkio.weight_device", buf);
                                if (r < 0)
                                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                                       "Failed to set blkio.weight_device on %s: %m", path);
                        }
                }

                /* FIXME: no way to reset this list */
                LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                        const char *a;
                        dev_t dev;

                        r = lookup_blkio_device(b->path, &dev);
                        if (r < 0)
                                continue;

                        a = b->read ? "blkio.throttle.read_bps_device" : "blkio.throttle.write_bps_device";

                        sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), b->bandwidth);
                        r = cg_set_attribute("blkio", path, a, buf);
                        if (r < 0)
                                log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                               "Failed to set %s on %s: %m", a, path);
                }
        }

        if ((mask & CGROUP_MASK_MEMORY) && !is_root) {
                if (c->memory_limit != (uint64_t) -1) {
                        char buf[DECIMAL_STR_MAX(uint64_t) + 1];

                        sprintf(buf, "%" PRIu64 "\n", c->memory_limit);

                        if (cg_unified() <= 0)
                                r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
                        else
                                r = cg_set_attribute("memory", path, "memory.max", buf);

                } else {
                        if (cg_unified() <= 0)
                                r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");
                        else
                                r = cg_set_attribute("memory", path, "memory.max", "max");
                }

                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set memory.limit_in_bytes/memory.max on %s: %m", path);
        }

        if ((mask & CGROUP_MASK_DEVICES) && !is_root) {
                CGroupDeviceAllow *a;

                /* Changing the devices list of a populated cgroup
                 * might result in EINVAL, hence ignore EINVAL
                 * here. */

                if (c->device_allow || c->device_policy != CGROUP_AUTO)
                        r = cg_set_attribute("devices", path, "devices.deny", "a");
                else
                        r = cg_set_attribute("devices", path, "devices.allow", "a");
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to reset devices.list on %s: %m", path);

                if (c->device_policy == CGROUP_CLOSED ||
                    (c->device_policy == CGROUP_AUTO && c->device_allow)) {
                        static const char auto_devices[] =
                                "/dev/null\0" "rwm\0"
                                "/dev/zero\0" "rwm\0"
                                "/dev/full\0" "rwm\0"
                                "/dev/random\0" "rwm\0"
                                "/dev/urandom\0" "rwm\0"
                                "/dev/tty\0" "rwm\0"
                                "/dev/pts/ptmx\0" "rw\0"; /* /dev/pts/ptmx may not be duplicated, but accessed */

                        const char *x, *y;

                        NULSTR_FOREACH_PAIR(x, y, auto_devices)
                                whitelist_device(path, x, y);

                        whitelist_major(path, "pts", 'c', "rw");
                        whitelist_major(path, "kdbus", 'c', "rw");
                        whitelist_major(path, "kdbus/*", 'c', "rw");
                }

                LIST_FOREACH(device_allow, a, c->device_allow) {
                        char acc[4];
                        unsigned k = 0;

                        if (a->r)
                                acc[k++] = 'r';
                        if (a->w)
                                acc[k++] = 'w';
                        if (a->m)
                                acc[k++] = 'm';

                        if (k == 0)
                                continue;

                        acc[k++] = 0;

                        if (startswith(a->path, "/dev/"))
                                whitelist_device(path, a->path, acc);
                        else if (startswith(a->path, "block-"))
                                whitelist_major(path, a->path + 6, 'b', acc);
                        else if (startswith(a->path, "char-"))
                                whitelist_major(path, a->path + 5, 'c', acc);
                        else
                                log_debug("Ignoring device %s while writing cgroup attribute.", a->path);
                }
        }

        if ((mask & CGROUP_MASK_PIDS) && !is_root) {

                if (c->tasks_max != (uint64_t) -1) {
                        char buf[DECIMAL_STR_MAX(uint64_t) + 2];

                        sprintf(buf, "%" PRIu64 "\n", c->tasks_max);
                        r = cg_set_attribute("pids", path, "pids.max", buf);
                } else
                        r = cg_set_attribute("pids", path, "pids.max", "max");

                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set pids.max on %s: %m", path);
        }

        if (mask & CGROUP_MASK_NET_CLS) {
                char buf[DECIMAL_STR_MAX(uint32_t)];

                sprintf(buf, "%" PRIu32, netclass);

                r = cg_set_attribute("net_cls", path, "net_cls.classid", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set net_cls.classid on %s: %m", path);
        }
}
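/* Illustrative summary of what cgroup_context_apply() ends up writing for a
 * unit with CPUShares=512, BlockIOWeight=500 and TasksMax=4096 (hypothetical
 * values), relative to the unit's cgroup path in each controller hierarchy:
 *
 *   cpu.shares        <- "512"
 *   cpu.cfs_period_us <- "100000"
 *   cpu.cfs_quota_us  <- "-1"       (no CPUQuota= configured)
 *   blkio.weight      <- "500"
 *   pids.max          <- "4096"
 */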

CGroupMask cgroup_context_get_mask(CGroupContext *c) {
        CGroupMask mask = 0;

        /* Figure out which controllers we need */

        if (c->cpu_accounting ||
            c->cpu_shares != CGROUP_CPU_SHARES_INVALID ||
            c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID ||
            c->cpu_quota_per_sec_usec != USEC_INFINITY)
                mask |= CGROUP_MASK_CPUACCT | CGROUP_MASK_CPU;

        if (c->blockio_accounting ||
            c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
            c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
            c->blockio_device_weights ||
            c->blockio_device_bandwidths)
                mask |= CGROUP_MASK_BLKIO;

        if (c->memory_accounting ||
            c->memory_limit != (uint64_t) -1)
                mask |= CGROUP_MASK_MEMORY;

        if (c->device_allow ||
            c->device_policy != CGROUP_AUTO)
                mask |= CGROUP_MASK_DEVICES;

        if (c->tasks_accounting ||
            c->tasks_max != (uint64_t) -1)
                mask |= CGROUP_MASK_PIDS;

        if (c->netclass_type != CGROUP_NETCLASS_TYPE_NONE)
                mask |= CGROUP_MASK_NET_CLS;

        return mask;
}
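/* For example (illustrative): a context with only memory_limit set and
 * cpu_accounting enabled yields
 * CGROUP_MASK_MEMORY|CGROUP_MASK_CPUACCT|CGROUP_MASK_CPU here, and no other
 * controller hierarchies are touched for the unit. */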

CGroupMask unit_get_own_mask(Unit *u) {
        CGroupContext *c;

        /* Returns the mask of controllers the unit needs for itself */

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        /* If delegation is turned on, then turn on all cgroups,
         * unless we are on the legacy hierarchy and the process we
         * fork into it is known to drop privileges, and hence
         * shouldn't get access to the controllers.
         *
         * Note that on the unified hierarchy it is safe to delegate
         * controllers to unprivileged services. */

        if (c->delegate) {
                ExecContext *e;

                e = unit_get_exec_context(u);
                if (!e ||
                    exec_context_maintains_privileges(e) ||
                    cg_unified() > 0)
                        return _CGROUP_MASK_ALL;
        }

        return cgroup_context_get_mask(c);
}

CGroupMask unit_get_members_mask(Unit *u) {
        assert(u);

        /* Returns the mask of controllers all of the unit's children
         * require, merged */

        if (u->cgroup_members_mask_valid)
                return u->cgroup_members_mask;

        u->cgroup_members_mask = 0;

        if (u->type == UNIT_SLICE) {
                Unit *member;
                Iterator i;

                SET_FOREACH(member, u->dependencies[UNIT_BEFORE], i) {

                        if (member == u)
                                continue;

                        if (UNIT_DEREF(member->slice) != u)
                                continue;

                        u->cgroup_members_mask |=
                                unit_get_own_mask(member) |
                                unit_get_members_mask(member);
                }
        }

        u->cgroup_members_mask_valid = true;
        return u->cgroup_members_mask;
}

CGroupMask unit_get_siblings_mask(Unit *u) {
        assert(u);

        /* Returns the mask of controllers all of the unit's siblings
         * require, i.e. the members mask of the unit's parent slice
         * if there is one. */

        if (UNIT_ISSET(u->slice))
                return unit_get_members_mask(UNIT_DEREF(u->slice));

        return unit_get_own_mask(u) | unit_get_members_mask(u);
}

CGroupMask unit_get_subtree_mask(Unit *u) {

        /* Returns the mask of this subtree, meaning of the group
         * itself and its children. */

        return unit_get_own_mask(u) | unit_get_members_mask(u);
}

CGroupMask unit_get_target_mask(Unit *u) {
        CGroupMask mask;

        /* This returns the cgroup mask of all controllers to enable
         * for a specific cgroup, i.e. everything it needs itself,
         * plus all that its children need, plus all that its siblings
         * need. This is primarily useful on the legacy cgroup
         * hierarchy, where we need to duplicate each cgroup in each
         * hierarchy that shall be enabled for it. */

        mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
        mask &= u->manager->cgroup_supported;

        return mask;
}

CGroupMask unit_get_enable_mask(Unit *u) {
        CGroupMask mask;

        /* This returns the cgroup mask of all controllers to enable
         * for the children of a specific cgroup. This is primarily
         * useful for the unified cgroup hierarchy, where each cgroup
         * controls which controllers are enabled for its children. */

        mask = unit_get_members_mask(u);
        mask &= u->manager->cgroup_supported;

        return mask;
}

/* Recurse from a unit up through its containing slices, propagating
 * mask bits upward. A unit is also member of itself. */
void unit_update_cgroup_members_masks(Unit *u) {
        CGroupMask m;
        bool more;

        assert(u);

        /* Calculate subtree mask */
        m = unit_get_subtree_mask(u);

        /* See if anything changed from the previous invocation. If
         * not, we're done. */
        if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
                return;

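        /* "more" is true when the new subtree mask is a strict superset of
         * the previous one: new bits appeared and none were dropped. In that
         * case the parent's members mask can simply be OR-ed with the new
         * bits; otherwise it has to be recomputed from scratch below. */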
        more =
                u->cgroup_subtree_mask_valid &&
                ((m & ~u->cgroup_subtree_mask) != 0) &&
                ((~m & u->cgroup_subtree_mask) == 0);

        u->cgroup_subtree_mask = m;
        u->cgroup_subtree_mask_valid = true;

        if (UNIT_ISSET(u->slice)) {
                Unit *s = UNIT_DEREF(u->slice);

                if (more)
                        /* There's more set now than before. We
                         * propagate the new mask to the parent's mask
                         * (not caring if it actually was valid or
                         * not). */

                        s->cgroup_members_mask |= m;

                else
                        /* There's less set now than before (or we
                         * don't know), we need to recalculate
                         * everything, so let's invalidate the
                         * parent's members mask */

                        s->cgroup_members_mask_valid = false;

                /* And now make sure that this change also hits our
                 * grandparents */
                unit_update_cgroup_members_masks(s);
        }
}

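/* Used as the migration callback for cg_migrate_everywhere() below: walks up
 * the slice chain and returns the cgroup path of the closest unit that is
 * realized for all controllers in the requested mask, or NULL if none is. */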
static const char *migrate_callback(CGroupMask mask, void *userdata) {
        Unit *u = userdata;

        assert(mask != 0);
        assert(u);

        while (u) {
                if (u->cgroup_path &&
                    u->cgroup_realized &&
                    (u->cgroup_realized_mask & mask) == mask)
                        return u->cgroup_path;

                u = UNIT_DEREF(u->slice);
        }

        return NULL;
}

char *unit_default_cgroup_path(Unit *u) {
        _cleanup_free_ char *escaped = NULL, *slice = NULL;
        int r;

        assert(u);

        if (unit_has_name(u, SPECIAL_ROOT_SLICE))
                return strdup(u->manager->cgroup_root);

        if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) {
                r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice);
                if (r < 0)
                        return NULL;
        }

        escaped = cg_escape(u->id);
        if (!escaped)
                return NULL;

        if (slice)
                return strjoin(u->manager->cgroup_root, "/", slice, "/", escaped, NULL);
        else
                return strjoin(u->manager->cgroup_root, "/", escaped, NULL);
}
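/* For example (illustrative): for a unit foo.service in system.slice, with an
 * empty cgroup root, this yields "/system.slice/foo.service"; for a nested
 * slice like foo-bar.slice, cg_slice_to_path() first expands the slice name
 * to "foo.slice/foo-bar.slice". */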

int unit_set_cgroup_path(Unit *u, const char *path) {
        _cleanup_free_ char *p = NULL;
        int r;

        assert(u);

        if (path) {
                p = strdup(path);
                if (!p)
                        return -ENOMEM;
        } else
                p = NULL;

        if (streq_ptr(u->cgroup_path, p))
                return 0;

        if (p) {
                r = hashmap_put(u->manager->cgroup_unit, p, u);
                if (r < 0)
                        return r;
        }

        unit_release_cgroup(u);

        u->cgroup_path = p;
        p = NULL;

        return 1;
}

int unit_watch_cgroup(Unit *u) {
        _cleanup_free_ char *populated = NULL;
        int r;

        assert(u);

        if (!u->cgroup_path)
                return 0;

        if (u->cgroup_inotify_wd >= 0)
                return 0;

        /* Only applies to the unified hierarchy */
        r = cg_unified();
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to detect whether the unified hierarchy is used: %m");
        if (r == 0)
                return 0;

        /* Don't watch the root slice, it's pointless. */
        if (unit_has_name(u, SPECIAL_ROOT_SLICE))
                return 0;

        r = hashmap_ensure_allocated(&u->manager->cgroup_inotify_wd_unit, &trivial_hash_ops);
        if (r < 0)
                return log_oom();

        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.populated", &populated);
        if (r < 0)
                return log_oom();

        u->cgroup_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, populated, IN_MODIFY);
        if (u->cgroup_inotify_wd < 0) {

                if (errno == ENOENT) /* If the directory is already
                                      * gone we don't need to track
                                      * it, so this is not an error */
                        return 0;

                return log_unit_error_errno(u, errno, "Failed to add inotify watch descriptor for control group %s: %m", u->cgroup_path);
        }

        r = hashmap_put(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd), u);
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to add inotify watch descriptor to hash map: %m");

        return 0;
}
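/* Background (a sketch, as of the unified hierarchy at the time of this
 * code): each cgroup exposes a "cgroup.populated" file whose content flips
 * between "0" and "1" as the cgroup gains or loses processes, and the kernel
 * generates inotify IN_MODIFY events on it, which is what the watch set up
 * above relies on. */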

static int unit_create_cgroup(
                Unit *u,
                CGroupMask target_mask,
                CGroupMask enable_mask) {

        CGroupContext *c;
        int r;

        assert(u);

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        if (!u->cgroup_path) {
                _cleanup_free_ char *path = NULL;

                path = unit_default_cgroup_path(u);
                if (!path)
                        return log_oom();

                r = unit_set_cgroup_path(u, path);
                if (r == -EEXIST)
                        return log_unit_error_errno(u, r, "Control group %s exists already.", path);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", path);
        }

        /* First, create our own group */
        r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", u->cgroup_path);

        /* Start watching it */
        (void) unit_watch_cgroup(u);

        /* Enable all controllers we need */
        r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path);
        if (r < 0)
                log_unit_warning_errno(u, r, "Failed to enable controllers on cgroup %s, ignoring: %m", u->cgroup_path);

        /* Keep track that this is now realized */
        u->cgroup_realized = true;
        u->cgroup_realized_mask = target_mask;

        if (u->type != UNIT_SLICE && !c->delegate) {

                /* Then, possibly move things over, but not if
                 * subgroups may contain processes, which is the case
                 * for slice and delegation units. */
                r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
                if (r < 0)
                        log_unit_warning_errno(u, r, "Failed to migrate cgroup to %s, ignoring: %m", u->cgroup_path);
        }

        return 0;
}

int unit_attach_pids_to_cgroup(Unit *u) {
        int r;
        assert(u);

        r = unit_realize_cgroup(u);
        if (r < 0)
                return r;

        r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->pids, migrate_callback, u);
        if (r < 0)
                return r;

        return 0;
}

static bool unit_has_mask_realized(Unit *u, CGroupMask target_mask) {
        assert(u);

        return u->cgroup_realized && u->cgroup_realized_mask == target_mask;
}

static int unit_find_free_netclass_cgroup(Unit *u, uint32_t *ret) {

        uint32_t start, i;
        Manager *m;

        assert(u);

        m = u->manager;

        i = start = m->cgroup_netclass_registry_last;

        do {
                i++;

                if (!hashmap_get(m->cgroup_netclass_registry, UINT_TO_PTR(i))) {
                        m->cgroup_netclass_registry_last = i;
                        *ret = i;
                        return 0;
                }

                if (i == UINT32_MAX)
                        i = CGROUP_NETCLASS_FIXED_MAX;

        } while (i != start);

        return -ENOBUFS;
}
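/* A sketch of the allocation scheme above: dynamic net_cls class IDs are
 * handed out from the range above CGROUP_NETCLASS_FIXED_MAX, scanning upward
 * from the last ID given out and wrapping around at UINT32_MAX, so values at
 * or below the fixed maximum stay reserved for explicitly configured IDs. */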

int unit_add_to_netclass_cgroup(Unit *u) {

        CGroupContext *cc;
        Unit *first;
        void *key;
        int r;

        assert(u);

        cc = unit_get_cgroup_context(u);
        if (!cc)
                return 0;

        switch (cc->netclass_type) {
        case CGROUP_NETCLASS_TYPE_NONE:
                return 0;

        case CGROUP_NETCLASS_TYPE_FIXED:
                u->cgroup_netclass_id = cc->netclass_id;
                break;

        case CGROUP_NETCLASS_TYPE_AUTO:
                /* Allocate a new ID in case it was requested and not done yet */
                if (u->cgroup_netclass_id == 0) {
                        r = unit_find_free_netclass_cgroup(u, &u->cgroup_netclass_id);
                        if (r < 0)
                                return r;

                        log_debug("Dynamically assigned netclass cgroup id %" PRIu32 " to %s", u->cgroup_netclass_id, u->id);
                }

                break;
        }

        r = hashmap_ensure_allocated(&u->manager->cgroup_netclass_registry, &trivial_hash_ops);
        if (r < 0)
                return r;

        key = UINT32_TO_PTR(u->cgroup_netclass_id);
        first = hashmap_get(u->manager->cgroup_netclass_registry, key);

        if (first) {
                LIST_PREPEND(cgroup_netclass, first, u);
                return hashmap_replace(u->manager->cgroup_netclass_registry, key, u);
        }

        return hashmap_put(u->manager->cgroup_netclass_registry, key, u);
}

int unit_remove_from_netclass_cgroup(Unit *u) {

        Unit *head;
        void *key;

        assert(u);

        key = UINT32_TO_PTR(u->cgroup_netclass_id);

        LIST_FIND_HEAD(cgroup_netclass, u, head);
        LIST_REMOVE(cgroup_netclass, head, u);

        if (head)
                return hashmap_replace(u->manager->cgroup_netclass_registry, key, head);

        hashmap_remove(u->manager->cgroup_netclass_registry, key);

        return 0;
}

/* Check if necessary controllers and attributes for a unit are in place.
 *
 * If so, do nothing.
 * If not, create paths, move processes over, and set attributes.
 *
 * Returns 0 on success and < 0 on failure. */
static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
        CGroupMask target_mask, enable_mask;
        int r;

        assert(u);

        if (u->in_cgroup_queue) {
                LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
                u->in_cgroup_queue = false;
        }

        target_mask = unit_get_target_mask(u);
        if (unit_has_mask_realized(u, target_mask))
                return 0;

        /* First, realize parents */
        if (UNIT_ISSET(u->slice)) {
                r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
                if (r < 0)
                        return r;
        }

        /* And then do the real work */
        enable_mask = unit_get_enable_mask(u);
        r = unit_create_cgroup(u, target_mask, enable_mask);
        if (r < 0)
                return r;

        /* Finally, apply the necessary attributes. */
        cgroup_context_apply(unit_get_cgroup_context(u), target_mask, u->cgroup_path, u->cgroup_netclass_id, state);

        return 0;
}

static void unit_add_to_cgroup_queue(Unit *u) {

        if (u->in_cgroup_queue)
                return;

        LIST_PREPEND(cgroup_queue, u->manager->cgroup_queue, u);
        u->in_cgroup_queue = true;
}

unsigned manager_dispatch_cgroup_queue(Manager *m) {
        ManagerState state;
        unsigned n = 0;
        Unit *i;
        int r;

        state = manager_state(m);

        while ((i = m->cgroup_queue)) {
                assert(i->in_cgroup_queue);

                r = unit_realize_cgroup_now(i, state);
                if (r < 0)
                        log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);

                n++;
        }

        return n;
}

static void unit_queue_siblings(Unit *u) {
        Unit *slice;

        /* This adds the siblings of the specified unit and the
         * siblings of all parent units to the cgroup queue. (But
         * neither the specified unit itself nor the parents.) */

        while ((slice = UNIT_DEREF(u->slice))) {
                Iterator i;
                Unit *m;

                SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
                        if (m == u)
                                continue;

                        /* Skip units that have a dependency on the slice
                         * but aren't actually in it. */
                        if (UNIT_DEREF(m->slice) != slice)
                                continue;

                        /* No point in doing cgroup application for units
                         * without active processes. */
                        if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
                                continue;

                        /* If the unit doesn't need any new controllers
                         * and has current ones realized, it doesn't need
                         * any changes. */
                        if (unit_has_mask_realized(m, unit_get_target_mask(m)))
                                continue;

                        unit_add_to_cgroup_queue(m);
                }

                u = slice;
        }
}

int unit_realize_cgroup(Unit *u) {
        assert(u);

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return 0;

        /* So, here's the deal: when realizing the cgroups for this
         * unit, we need to first create all parents, but there's more
         * actually: for the weight-based controllers we also need to
         * make sure that all our siblings (i.e. units that are in the
         * same slice as we are) have cgroups, too. Otherwise, things
         * would become very uneven as each of their processes would
         * get as much resources as all our group together. This call
         * will synchronously create the parent cgroups, but will
         * defer work on the siblings to the next event loop
         * iteration. */

        /* Add all sibling slices to the cgroup queue. */
        unit_queue_siblings(u);

        /* And realize this one now (and apply the values) */
        return unit_realize_cgroup_now(u, manager_state(u->manager));
}

void unit_release_cgroup(Unit *u) {
        assert(u);

        /* Forgets all cgroup details for this cgroup */

        if (u->cgroup_path) {
                (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
                u->cgroup_path = mfree(u->cgroup_path);
        }

        if (u->cgroup_inotify_wd >= 0) {
                if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_inotify_wd) < 0)
                        log_unit_debug_errno(u, errno, "Failed to remove cgroup inotify watch %i for %s, ignoring", u->cgroup_inotify_wd, u->id);

                (void) hashmap_remove(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd));
                u->cgroup_inotify_wd = -1;
        }
}

void unit_prune_cgroup(Unit *u) {
        int r;
        bool is_root_slice;

        assert(u);

        /* Removes the cgroup, if empty and possible, and stops watching it. */

        if (!u->cgroup_path)
                return;

        is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);

        r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
        if (r < 0) {
                log_debug_errno(r, "Failed to destroy cgroup %s, ignoring: %m", u->cgroup_path);
                return;
        }

        if (is_root_slice)
                return;

        unit_release_cgroup(u);

        u->cgroup_realized = false;
        u->cgroup_realized_mask = 0;
}

int unit_search_main_pid(Unit *u, pid_t *ret) {
        _cleanup_fclose_ FILE *f = NULL;
        pid_t pid = 0, npid, mypid;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENXIO;

        r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f);
        if (r < 0)
                return r;

        mypid = getpid();
        while (cg_read_pid(f, &npid) > 0) {
                pid_t ppid;

                if (npid == pid)
                        continue;

                /* Ignore processes that aren't our kids */
                if (get_parent_of_pid(npid, &ppid) >= 0 && ppid != mypid)
                        continue;

                if (pid != 0)
                        /* Dang, there's more than one daemonized PID
                         * in this group, so we don't know what process
                         * is the main process. */
                        return -ENODATA;

                pid = npid;
        }

        *ret = pid;
        return 0;
}

static int unit_watch_pids_in_path(Unit *u, const char *path) {
        _cleanup_closedir_ DIR *d = NULL;
        _cleanup_fclose_ FILE *f = NULL;
        int ret = 0, r;

        assert(u);
        assert(path);

        r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
        if (r < 0)
                ret = r;
        else {
                pid_t pid;

                while ((r = cg_read_pid(f, &pid)) > 0) {
                        r = unit_watch_pid(u, pid);
                        if (r < 0 && ret >= 0)
                                ret = r;
                }

                if (r < 0 && ret >= 0)
                        ret = r;
        }

        r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
        if (r < 0) {
                if (ret >= 0)
                        ret = r;
        } else {
                char *fn;

                while ((r = cg_read_subgroup(d, &fn)) > 0) {
                        _cleanup_free_ char *p = NULL;

                        p = strjoin(path, "/", fn, NULL);
                        free(fn);

                        if (!p)
                                return -ENOMEM;

                        r = unit_watch_pids_in_path(u, p);
                        if (r < 0 && ret >= 0)
                                ret = r;
                }

                if (r < 0 && ret >= 0)
                        ret = r;
        }

        return ret;
}

int unit_watch_all_pids(Unit *u) {
        assert(u);

        /* Adds all PIDs from our cgroup to the set of PIDs we
         * watch. This is a fallback logic for cases where we do not
         * get reliable cgroup empty notifications: we try to use
         * SIGCHLD as replacement. */

        if (!u->cgroup_path)
                return -ENOENT;

        if (cg_unified() > 0) /* On unified we can use proper notifications */
                return 0;

        return unit_watch_pids_in_path(u, u->cgroup_path);
}

int unit_notify_cgroup_empty(Unit *u) {
        int r;

        assert(u);

        if (!u->cgroup_path)
                return 0;

        r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
        if (r <= 0)
                return r;

        unit_add_to_gc_queue(u);

        if (UNIT_VTABLE(u)->notify_cgroup_empty)
                UNIT_VTABLE(u)->notify_cgroup_empty(u);

        return 0;
}

static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
        Manager *m = userdata;

        assert(s);
        assert(fd >= 0);
        assert(m);

        for (;;) {
                union inotify_event_buffer buffer;
                struct inotify_event *e;
                ssize_t l;

                l = read(fd, &buffer, sizeof(buffer));
                if (l < 0) {
                        if (errno == EINTR || errno == EAGAIN)
                                return 0;

                        return log_error_errno(errno, "Failed to read control group inotify events: %m");
                }

                FOREACH_INOTIFY_EVENT(e, buffer, l) {
                        Unit *u;

                        if (e->wd < 0)
                                /* Queue overflow has no watch descriptor */
                                continue;

                        if (e->mask & IN_IGNORED)
                                /* The watch was just removed */
                                continue;

                        u = hashmap_get(m->cgroup_inotify_wd_unit, INT_TO_PTR(e->wd));
                        if (!u) /* Note that inotify might deliver
                                 * events for a watch even after it
                                 * was removed, because it was queued
                                 * before the removal. Let's ignore
                                 * this here safely. */
                                continue;

                        (void) unit_notify_cgroup_empty(u);
                }
        }
}

int manager_setup_cgroup(Manager *m) {
        _cleanup_free_ char *path = NULL;
        CGroupController c;
        int r, unified;
        char *e;

        assert(m);

        /* 1. Determine hierarchy */
        m->cgroup_root = mfree(m->cgroup_root);
        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
        if (r < 0)
                return log_error_errno(r, "Cannot determine cgroup we are running in: %m");

        /* Chop off the init scope, if we are already located in it */
        e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);

        /* LEGACY: Also chop off the system slice if we are in
         * it. This is to support live upgrades from older systemd
         * versions where PID 1 was moved there. Also see
         * cg_get_root_path(). */
        if (!e && m->running_as == MANAGER_SYSTEM) {
                e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
                if (!e)
                        e = endswith(m->cgroup_root, "/system"); /* even more legacy */
        }
        if (e)
                *e = 0;

        /* And make sure to store away the root value without trailing
         * slash, even for the root dir, so that we can easily prepend
         * it everywhere. */
        while ((e = endswith(m->cgroup_root, "/")))
                *e = 0;

        /* 2. Show data */
        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
        if (r < 0)
                return log_error_errno(r, "Cannot find cgroup mount point: %m");

        unified = cg_unified();
        if (unified < 0)
                return log_error_errno(unified, "Couldn't determine if we are running in the unified hierarchy: %m");
        if (unified > 0)
                log_debug("Unified cgroup hierarchy is located at %s.", path);
        else
                log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);

        if (!m->test_run) {
                const char *scope_path;

                /* 3. Install agent */
                if (unified) {

                        /* In the unified hierarchy we can get
                         * cgroup empty notifications via inotify. */

                        m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
                        safe_close(m->cgroup_inotify_fd);

                        m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
                        if (m->cgroup_inotify_fd < 0)
                                return log_error_errno(errno, "Failed to create control group inotify object: %m");

                        r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
                        if (r < 0)
                                return log_error_errno(r, "Failed to watch control group inotify object: %m");

                        r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_IDLE - 5);
                        if (r < 0)
                                return log_error_errno(r, "Failed to set priority of inotify event source: %m");

                        (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");

                } else if (m->running_as == MANAGER_SYSTEM) {

                        /* On the legacy hierarchy we only get
                         * notifications via cgroup agents. (Which
                         * isn't really reliable, since it does not
                         * generate events when control groups with
                         * children run empty.) */

                        r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
                        if (r < 0)
                                log_warning_errno(r, "Failed to install release agent, ignoring: %m");
                        else if (r > 0)
                                log_debug("Installed release agent.");
                        else if (r == 0)
                                log_debug("Release agent already installed.");
                }

                /* 4. Make sure we are in the special "init.scope" unit in the root slice. */
                scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
                r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
                if (r < 0)
                        return log_error_errno(r, "Failed to create %s control group: %m", scope_path);

                /* Also, move all other userspace processes remaining
                 * in the root cgroup into that scope. */
                r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, false);
                if (r < 0)
                        log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");

                /* 5. And pin it, so that it cannot be unmounted */
                safe_close(m->pin_cgroupfs_fd);
                m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
                if (m->pin_cgroupfs_fd < 0)
                        return log_error_errno(errno, "Failed to open pin file: %m");

                /* 6. Always enable hierarchical support if it exists... */
                if (!unified)
                        (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
        }

        /* 7. Figure out which controllers are supported */
        r = cg_mask_supported(&m->cgroup_supported);
        if (r < 0)
                return log_error_errno(r, "Failed to determine supported controllers: %m");

        for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
                log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c)));

        return 0;
}

void manager_shutdown_cgroup(Manager *m, bool delete) {
        assert(m);

        /* We can't really delete the group, since we are in it. But
         * let's trim it. */
        if (delete && m->cgroup_root)
                (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);

        m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit);

        m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
        m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);

        m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);

        m->cgroup_root = mfree(m->cgroup_root);
}

Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
        char *p;
        Unit *u;

        assert(m);
        assert(cgroup);

        u = hashmap_get(m->cgroup_unit, cgroup);
        if (u)
                return u;

        p = strdupa(cgroup);
        for (;;) {
                char *e;

                e = strrchr(p, '/');
                if (!e || e == p)
                        return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);

                *e = 0;

                u = hashmap_get(m->cgroup_unit, p);
                if (u)
                        return u;
        }
}

Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) {
        _cleanup_free_ char *cgroup = NULL;
        int r;

        assert(m);

        if (pid <= 0)
                return NULL;

        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
        if (r < 0)
                return NULL;

        return manager_get_unit_by_cgroup(m, cgroup);
}

Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
        Unit *u;

        assert(m);

        if (pid <= 0)
                return NULL;

        if (pid == 1)
                return hashmap_get(m->units, SPECIAL_INIT_SCOPE);

        u = hashmap_get(m->watch_pids1, PID_TO_PTR(pid));
        if (u)
                return u;

        u = hashmap_get(m->watch_pids2, PID_TO_PTR(pid));
        if (u)
                return u;

        return manager_get_unit_by_pid_cgroup(m, pid);
}

int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
        Unit *u;

        assert(m);
        assert(cgroup);

        u = manager_get_unit_by_cgroup(m, cgroup);
        if (!u)
                return 0;

        return unit_notify_cgroup_empty(u);
}

int unit_get_memory_current(Unit *u, uint64_t *ret) {
        _cleanup_free_ char *v = NULL;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENODATA;

        if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
                return -ENODATA;

        if (cg_unified() <= 0)
                r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
        else
                r = cg_get_attribute("memory", u->cgroup_path, "memory.current", &v);
        if (r == -ENOENT)
                return -ENODATA;
        if (r < 0)
                return r;

        return safe_atou64(v, ret);
}

int unit_get_tasks_current(Unit *u, uint64_t *ret) {
        _cleanup_free_ char *v = NULL;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENODATA;

        if ((u->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0)
                return -ENODATA;

        r = cg_get_attribute("pids", u->cgroup_path, "pids.current", &v);
        if (r == -ENOENT)
                return -ENODATA;
        if (r < 0)
                return r;

        return safe_atou64(v, ret);
}

static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
        _cleanup_free_ char *v = NULL;
        uint64_t ns;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENODATA;

        if ((u->cgroup_realized_mask & CGROUP_MASK_CPUACCT) == 0)
                return -ENODATA;

        r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);
        if (r == -ENOENT)
                return -ENODATA;
        if (r < 0)
                return r;

        r = safe_atou64(v, &ns);
        if (r < 0)
                return r;

        *ret = ns;
        return 0;
}

int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
        nsec_t ns;
        int r;

        r = unit_get_cpu_usage_raw(u, &ns);
        if (r < 0)
                return r;

        if (ns > u->cpuacct_usage_base)
                ns -= u->cpuacct_usage_base;
        else
                ns = 0;

        *ret = ns;
        return 0;
}

int unit_reset_cpu_usage(Unit *u) {
        nsec_t ns;
        int r;

        assert(u);

        r = unit_get_cpu_usage_raw(u, &ns);
        if (r < 0) {
                u->cpuacct_usage_base = 0;
                return r;
        }

        u->cpuacct_usage_base = ns;
        return 0;
}

bool unit_cgroup_delegate(Unit *u) {
        CGroupContext *c;

        assert(u);

        c = unit_get_cgroup_context(u);
        if (!c)
                return false;

        return c->delegate;
}

void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
        assert(u);

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return;

        if (m == 0)
                return;

        if ((u->cgroup_realized_mask & m) == 0)
                return;

        u->cgroup_realized_mask &= ~m;
        unit_add_to_cgroup_queue(u);
}

void manager_invalidate_startup_units(Manager *m) {
        Iterator i;
        Unit *u;

        assert(m);

        SET_FOREACH(u, m->startup_units, i)
                unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_BLKIO);
}

static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
        [CGROUP_AUTO] = "auto",
        [CGROUP_CLOSED] = "closed",
        [CGROUP_STRICT] = "strict",
};

DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);