/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/

/***
  This file is part of systemd.

  Copyright 2013 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/

#include <fcntl.h>
#include <fnmatch.h>

#include "cgroup-util.h"
#include "cgroup.h"
#include "fd-util.h"
#include "fileio.h"
#include "parse-util.h"
#include "path-util.h"
#include "process-util.h"
#include "special.h"
#include "string-table.h"
#include "string-util.h"

#define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
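
/* A quick sketch of how a per-second CPU quota maps onto the CFS
 * attributes written further below (illustrative numbers, not from this
 * file): cpu.cfs_period_us is fixed at 100ms, and cpu.cfs_quota_us is
 * scaled from the per-second value, i.e.
 *
 *     quota_us = cpu_quota_per_sec_usec * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC
 *
 * For example, CPUQuota=200% is stored as cpu_quota_per_sec_usec == 2s,
 * which yields 2000000 * 100000 / 1000000 = 200000, i.e. 200ms of CPU
 * time per 100ms period, spread across all CPUs. */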

void cgroup_context_init(CGroupContext *c) {
        assert(c);

        /* Initialize everything to the kernel defaults, assuming the
         * structure is preinitialized to 0 */

        c->cpu_shares = CGROUP_CPU_SHARES_INVALID;
        c->startup_cpu_shares = CGROUP_CPU_SHARES_INVALID;
        c->cpu_quota_per_sec_usec = USEC_INFINITY;

        c->memory_limit = (uint64_t) -1;

        c->blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
        c->startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;

        c->tasks_max = (uint64_t) -1;

        c->netclass_type = CGROUP_NETCLASS_TYPE_NONE;
}

void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
        assert(c);
        assert(a);

        LIST_REMOVE(device_allow, c->device_allow, a);
        free(a->path);
        free(a);
}

void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
        assert(c);
        assert(w);

        LIST_REMOVE(device_weights, c->blockio_device_weights, w);
        free(w->path);
        free(w);
}

void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
        assert(c);
        assert(b);

        LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
        free(b->path);
        free(b);
}

void cgroup_context_done(CGroupContext *c) {
        assert(c);

        while (c->blockio_device_weights)
                cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);

        while (c->blockio_device_bandwidths)
                cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);

        while (c->device_allow)
                cgroup_context_free_device_allow(c, c->device_allow);
}

void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
        CGroupBlockIODeviceBandwidth *b;
        CGroupBlockIODeviceWeight *w;
        CGroupDeviceAllow *a;
        char u[FORMAT_TIMESPAN_MAX];

        assert(c);
        assert(f);

        prefix = strempty(prefix);

        fprintf(f,
                "%sCPUAccounting=%s\n"
                "%sBlockIOAccounting=%s\n"
                "%sMemoryAccounting=%s\n"
                "%sTasksAccounting=%s\n"
                "%sCPUShares=%" PRIu64 "\n"
                "%sStartupCPUShares=%" PRIu64 "\n"
                "%sCPUQuotaPerSecUSec=%s\n"
                "%sBlockIOWeight=%" PRIu64 "\n"
                "%sStartupBlockIOWeight=%" PRIu64 "\n"
                "%sMemoryLimit=%" PRIu64 "\n"
                "%sTasksMax=%" PRIu64 "\n"
                "%sDevicePolicy=%s\n"
                "%sDelegate=%s\n",
                prefix, yes_no(c->cpu_accounting),
                prefix, yes_no(c->blockio_accounting),
                prefix, yes_no(c->memory_accounting),
                prefix, yes_no(c->tasks_accounting),
                prefix, c->cpu_shares,
                prefix, c->startup_cpu_shares,
                prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
                prefix, c->blockio_weight,
                prefix, c->startup_blockio_weight,
                prefix, c->memory_limit,
                prefix, c->tasks_max,
                prefix, cgroup_device_policy_to_string(c->device_policy),
                prefix, yes_no(c->delegate));

        LIST_FOREACH(device_allow, a, c->device_allow)
                fprintf(f,
                        "%sDeviceAllow=%s %s%s%s\n",
                        prefix,
                        a->path,
                        a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");

        LIST_FOREACH(device_weights, w, c->blockio_device_weights)
                fprintf(f,
                        "%sBlockIODeviceWeight=%s %" PRIu64 "\n",
                        prefix,
                        w->path,
                        w->weight);

        LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                char buf[FORMAT_BYTES_MAX];

                fprintf(f,
                        "%s%s=%s %s\n",
                        prefix,
                        b->read ? "BlockIOReadBandwidth" : "BlockIOWriteBandwidth",
                        b->path,
                        format_bytes(buf, sizeof(buf), b->bandwidth));
        }
}
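
/* Purely illustrative: for a unit with CPUQuota=50% configured and
 * everything else left at its defaults, the dump above prints, among
 * other lines (unset shares/limits print as the raw (uint64_t) -1, i.e.
 * 18446744073709551615):
 *
 *     CPUAccounting=no
 *     CPUShares=18446744073709551615
 *     CPUQuotaPerSecUSec=500ms
 *     MemoryLimit=18446744073709551615
 *     DevicePolicy=auto
 *     Delegate=no
 */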

static int lookup_blkio_device(const char *p, dev_t *dev) {
        struct stat st;
        int r;

        assert(p);
        assert(dev);

        r = stat(p, &st);
        if (r < 0)
                return log_warning_errno(errno, "Couldn't stat device %s: %m", p);

        if (S_ISBLK(st.st_mode))
                *dev = st.st_rdev;
        else if (major(st.st_dev) != 0) {
                /* If this is not a device node then find the block
                 * device this file is stored on */
                *dev = st.st_dev;

                /* If this is a partition, try to get the originating
                 * block device */
                block_get_whole_disk(*dev, dev);
        } else {
                log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
                return -ENODEV;
        }

        return 0;
}

static int whitelist_device(const char *path, const char *node, const char *acc) {
        char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
        struct stat st;
        int r;

        assert(path);
        assert(acc);

        if (stat(node, &st) < 0) {
                log_warning_errno(errno, "Couldn't stat device %s: %m", node);
                return -errno;
        }

        if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
                log_warning("%s is not a device.", node);
                return -ENODEV;
        }

        sprintf(buf,
                "%c %u:%u %s",
                S_ISCHR(st.st_mode) ? 'c' : 'b',
                major(st.st_rdev), minor(st.st_rdev),
                acc);

        r = cg_set_attribute("devices", path, "devices.allow", buf);
        if (r < 0)
                log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
                               "Failed to set devices.allow on %s: %m", path);

        return r;
}
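
/* Illustrative example (not taken from this file): whitelisting
 * read/write/mknod access to /dev/null, which is character device 1:3,
 * boils down to writing the following to the legacy "devices" controller:
 *
 *     echo "c 1:3 rwm" > /sys/fs/cgroup/devices/<path>/devices.allow
 */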

static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
        _cleanup_fclose_ FILE *f = NULL;
        char line[LINE_MAX];
        bool good = false;
        int r;

        assert(path);
        assert(acc);
        assert(type == 'b' || type == 'c');

        f = fopen("/proc/devices", "re");
        if (!f)
                return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);

        FOREACH_LINE(line, f, goto fail) {
                char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
                unsigned maj;

                truncate_nl(line);

                if (type == 'c' && streq(line, "Character devices:")) {
                        good = true;
                        continue;
                }

                if (type == 'b' && streq(line, "Block devices:")) {
                        good = true;
                        continue;
                }

                if (isempty(line)) {
                        good = false;
                        continue;
                }

                if (!good)
                        continue;

                p = strstrip(line);

                w = strpbrk(p, WHITESPACE);
                if (!w)
                        continue;
                *w = 0;

                r = safe_atou(p, &maj);
                if (r < 0)
                        continue;
                if (maj <= 0)
                        continue;

                w++;
                w += strspn(w, WHITESPACE);

                if (fnmatch(name, w, 0) != 0)
                        continue;

                sprintf(buf,
                        "%c %u:* %s",
                        type,
                        maj,
                        acc);

                r = cg_set_attribute("devices", path, "devices.allow", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set devices.allow on %s: %m", path);
        }

        return 0;

fail:
        log_warning_errno(errno, "Failed to read /proc/devices: %m");
        return -errno;
}
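
/* For reference, /proc/devices (which whitelist_major() parses above)
 * looks roughly like this, one "major name" pair per line:
 *
 *     Character devices:
 *       1 mem
 *       5 /dev/tty
 *     136 pts
 *
 *     Block devices:
 *       8 sd
 *
 * Hence a call like whitelist_major(path, "pts", 'c', "rw") matches the
 * "pts" line via fnmatch() and writes "c 136:* rw" to devices.allow,
 * whitelisting all pseudo-terminal slaves in one go. */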

void cgroup_context_apply(CGroupContext *c, CGroupMask mask, const char *path, uint32_t netclass, ManagerState state) {
        bool is_root;
        int r;

        assert(c);
        assert(path);

        if (mask == 0)
                return;

        /* Some cgroup attributes are not supported on the root cgroup,
         * hence silently ignore */
        is_root = isempty(path) || path_equal(path, "/");
        if (is_root)
                /* Make sure we don't try to display messages with an empty path. */
                path = "/";

        /* We generally ignore errors caused by read-only mounted
         * cgroup trees (assuming we are running in a container then),
         * and missing cgroups, i.e. EROFS and ENOENT. */

        if ((mask & CGROUP_MASK_CPU) && !is_root) {
                char buf[MAX(DECIMAL_STR_MAX(uint64_t), DECIMAL_STR_MAX(usec_t)) + 1];

                sprintf(buf, "%" PRIu64 "\n",
                        IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID ? c->startup_cpu_shares :
                        c->cpu_shares != CGROUP_CPU_SHARES_INVALID ? c->cpu_shares : CGROUP_CPU_SHARES_DEFAULT);
                r = cg_set_attribute("cpu", path, "cpu.shares", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set cpu.shares on %s: %m", path);

                sprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
                r = cg_set_attribute("cpu", path, "cpu.cfs_period_us", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set cpu.cfs_period_us on %s: %m", path);

                if (c->cpu_quota_per_sec_usec != USEC_INFINITY) {
                        sprintf(buf, USEC_FMT "\n", c->cpu_quota_per_sec_usec * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
                        r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", buf);
                } else
                        r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", "-1");
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set cpu.cfs_quota_us on %s: %m", path);
        }

        if (mask & CGROUP_MASK_BLKIO) {
                char buf[MAX(DECIMAL_STR_MAX(uint64_t)+1,
                             DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
                CGroupBlockIODeviceWeight *w;
                CGroupBlockIODeviceBandwidth *b;

                if (!is_root) {
                        sprintf(buf, "%" PRIu64 "\n",
                                IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ? c->startup_blockio_weight :
                                c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ? c->blockio_weight : CGROUP_BLKIO_WEIGHT_DEFAULT);
                        r = cg_set_attribute("blkio", path, "blkio.weight", buf);
                        if (r < 0)
                                log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                               "Failed to set blkio.weight on %s: %m", path);

                        /* FIXME: no way to reset this list */
                        LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
                                dev_t dev;

                                r = lookup_blkio_device(w->path, &dev);
                                if (r < 0)
                                        continue;

                                sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), w->weight);
                                r = cg_set_attribute("blkio", path, "blkio.weight_device", buf);
                                if (r < 0)
                                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                                       "Failed to set blkio.weight_device on %s: %m", path);
                        }
                }

                /* FIXME: no way to reset this list */
                LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                        const char *a;
                        dev_t dev;

                        r = lookup_blkio_device(b->path, &dev);
                        if (r < 0)
                                continue;

                        a = b->read ? "blkio.throttle.read_bps_device" : "blkio.throttle.write_bps_device";

                        sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), b->bandwidth);
                        r = cg_set_attribute("blkio", path, a, buf);
                        if (r < 0)
                                log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                               "Failed to set %s on %s: %m", a, path);
                }
        }

        if ((mask & CGROUP_MASK_MEMORY) && !is_root) {
                if (c->memory_limit != (uint64_t) -1) {
                        char buf[DECIMAL_STR_MAX(uint64_t) + 1];

                        sprintf(buf, "%" PRIu64 "\n", c->memory_limit);

                        if (cg_unified() <= 0)
                                r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
                        else
                                r = cg_set_attribute("memory", path, "memory.max", buf);

                } else {
                        if (cg_unified() <= 0)
                                r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");
                        else
                                r = cg_set_attribute("memory", path, "memory.max", "max");
                }

                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set memory.limit_in_bytes/memory.max on %s: %m", path);
        }

        if ((mask & CGROUP_MASK_DEVICES) && !is_root) {
                CGroupDeviceAllow *a;

                /* Changing the devices list of a populated cgroup
                 * might result in EINVAL, hence ignore EINVAL
                 * here. */

                if (c->device_allow || c->device_policy != CGROUP_AUTO)
                        r = cg_set_attribute("devices", path, "devices.deny", "a");
                else
                        r = cg_set_attribute("devices", path, "devices.allow", "a");
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to reset devices.list on %s: %m", path);

                if (c->device_policy == CGROUP_CLOSED ||
                    (c->device_policy == CGROUP_AUTO && c->device_allow)) {
                        static const char auto_devices[] =
                                "/dev/null\0" "rwm\0"
                                "/dev/zero\0" "rwm\0"
                                "/dev/full\0" "rwm\0"
                                "/dev/random\0" "rwm\0"
                                "/dev/urandom\0" "rwm\0"
                                "/dev/tty\0" "rwm\0"
                                "/dev/pts/ptmx\0" "rw\0"; /* /dev/pts/ptmx may not be duplicated, but accessed */

                        const char *x, *y;

                        NULSTR_FOREACH_PAIR(x, y, auto_devices)
                                whitelist_device(path, x, y);

                        whitelist_major(path, "pts", 'c', "rw");
                        whitelist_major(path, "kdbus", 'c', "rw");
                        whitelist_major(path, "kdbus/*", 'c', "rw");
                }

                LIST_FOREACH(device_allow, a, c->device_allow) {
                        char acc[4];
                        unsigned k = 0;

                        if (a->r)
                                acc[k++] = 'r';
                        if (a->w)
                                acc[k++] = 'w';
                        if (a->m)
                                acc[k++] = 'm';

                        if (k == 0)
                                continue;

                        acc[k++] = 0;

                        if (startswith(a->path, "/dev/"))
                                whitelist_device(path, a->path, acc);
                        else if (startswith(a->path, "block-"))
                                whitelist_major(path, a->path + 6, 'b', acc);
                        else if (startswith(a->path, "char-"))
                                whitelist_major(path, a->path + 5, 'c', acc);
                        else
                                log_debug("Ignoring device %s while writing cgroup attribute.", a->path);
                }
        }

        if ((mask & CGROUP_MASK_PIDS) && !is_root) {

                if (c->tasks_max != (uint64_t) -1) {
                        char buf[DECIMAL_STR_MAX(uint64_t) + 2];

                        sprintf(buf, "%" PRIu64 "\n", c->tasks_max);
                        r = cg_set_attribute("pids", path, "pids.max", buf);
                } else
                        r = cg_set_attribute("pids", path, "pids.max", "max");

                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set pids.max on %s: %m", path);
        }

        if (mask & CGROUP_MASK_NET_CLS) {
                char buf[DECIMAL_STR_MAX(uint32_t)];

                sprintf(buf, "%" PRIu32, netclass);

                r = cg_set_attribute("net_cls", path, "net_cls.classid", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set net_cls.classid on %s: %m", path);
        }
}

CGroupMask cgroup_context_get_mask(CGroupContext *c) {
        CGroupMask mask = 0;

        /* Figure out which controllers we need */

        if (c->cpu_accounting ||
            c->cpu_shares != CGROUP_CPU_SHARES_INVALID ||
            c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID ||
            c->cpu_quota_per_sec_usec != USEC_INFINITY)
                mask |= CGROUP_MASK_CPUACCT | CGROUP_MASK_CPU;

        if (c->blockio_accounting ||
            c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
            c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
            c->blockio_device_weights ||
            c->blockio_device_bandwidths)
                mask |= CGROUP_MASK_BLKIO;

        if (c->memory_accounting ||
            c->memory_limit != (uint64_t) -1)
                mask |= CGROUP_MASK_MEMORY;

        if (c->device_allow ||
            c->device_policy != CGROUP_AUTO)
                mask |= CGROUP_MASK_DEVICES;

        if (c->tasks_accounting ||
            c->tasks_max != (uint64_t) -1)
                mask |= CGROUP_MASK_PIDS;

        if (c->netclass_type != CGROUP_NETCLASS_TYPE_NONE)
                mask |= CGROUP_MASK_NET_CLS;

        return mask;
}
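
/* Illustrative example (values made up): a unit that sets MemoryLimit=1G
 * and CPUShares=512, and leaves everything else at its defaults, yields
 * CGROUP_MASK_MEMORY|CGROUP_MASK_CPUACCT|CGROUP_MASK_CPU here, i.e. only
 * the "memory", "cpuacct" and "cpu" hierarchies need to be realized for
 * it. */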

CGroupMask unit_get_own_mask(Unit *u) {
        CGroupContext *c;

        /* Returns the mask of controllers the unit needs for itself */

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        /* If delegation is turned on, then turn on all cgroups,
         * unless we are on the legacy hierarchy and the process we
         * fork into it is known to drop privileges, and hence
         * shouldn't get access to the controllers.
         *
         * Note that on the unified hierarchy it is safe to delegate
         * controllers to unprivileged services. */

        if (c->delegate) {
                ExecContext *e;

                e = unit_get_exec_context(u);
                if (!e ||
                    exec_context_maintains_privileges(e) ||
                    cg_unified() > 0)
                        return _CGROUP_MASK_ALL;
        }

        return cgroup_context_get_mask(c);
}

CGroupMask unit_get_members_mask(Unit *u) {
        assert(u);

        /* Returns the mask of controllers all of the unit's children
         * require, merged */

        if (u->cgroup_members_mask_valid)
                return u->cgroup_members_mask;

        u->cgroup_members_mask = 0;

        if (u->type == UNIT_SLICE) {
                Unit *member;
                Iterator i;

                SET_FOREACH(member, u->dependencies[UNIT_BEFORE], i) {

                        if (member == u)
                                continue;

                        if (UNIT_DEREF(member->slice) != u)
                                continue;

                        u->cgroup_members_mask |=
                                unit_get_own_mask(member) |
                                unit_get_members_mask(member);
                }
        }

        u->cgroup_members_mask_valid = true;
        return u->cgroup_members_mask;
}

CGroupMask unit_get_siblings_mask(Unit *u) {
        assert(u);

        /* Returns the mask of controllers all of the unit's siblings
         * require, i.e. the members mask of the unit's parent slice
         * if there is one. */

        if (UNIT_ISSET(u->slice))
                return unit_get_members_mask(UNIT_DEREF(u->slice));

        return unit_get_own_mask(u) | unit_get_members_mask(u);
}

CGroupMask unit_get_subtree_mask(Unit *u) {

        /* Returns the mask of this subtree, meaning of the group
         * itself and its children. */

        return unit_get_own_mask(u) | unit_get_members_mask(u);
}

CGroupMask unit_get_target_mask(Unit *u) {
        CGroupMask mask;

        /* This returns the cgroup mask of all controllers to enable
         * for a specific cgroup, i.e. everything it needs itself,
         * plus all that its children need, plus all that its siblings
         * need. This is primarily useful on the legacy cgroup
         * hierarchy, where we need to duplicate each cgroup in each
         * hierarchy that shall be enabled for it. */

        mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
        mask &= u->manager->cgroup_supported;

        return mask;
}

CGroupMask unit_get_enable_mask(Unit *u) {
        CGroupMask mask;

        /* This returns the cgroup mask of all controllers to enable
         * for the children of a specific cgroup. This is primarily
         * useful for the unified cgroup hierarchy, where each cgroup
         * controls which controllers are enabled for its children. */

        mask = unit_get_members_mask(u);
        mask &= u->manager->cgroup_supported;

        return mask;
}
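
/* To make the mask family above concrete, consider a hypothetical tree
 * system.slice -> foo.service, where foo.service sets MemoryLimit= and a
 * sibling bar.service sets CPUShares=. Then, roughly:
 *
 *   own mask of foo.service      = MEMORY
 *   members mask of system.slice = MEMORY|CPU|CPUACCT (merged from both children)
 *   siblings mask of foo.service = members mask of system.slice
 *   target mask of foo.service   = own|members|siblings = MEMORY|CPU|CPUACCT
 *
 * i.e. on the legacy hierarchy every sibling gets realized in every
 * controller any one of them needs, so that weight-based resources are
 * split up evenly. */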

/* Recurse from a unit up through its containing slices, propagating
 * mask bits upward. A unit is also a member of itself. */
void unit_update_cgroup_members_masks(Unit *u) {
        CGroupMask m;
        bool more;

        assert(u);

        /* Calculate subtree mask */
        m = unit_get_subtree_mask(u);

        /* See if anything changed from the previous invocation. If
         * not, we're done. */
        if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
                return;

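        /* A sketch of the test below: "more" is true only if the new mask
         * is a strict superset of the old one, i.e. bits were added and
         * none removed. Only then is it sufficient to OR the new bits into
         * the parent's members mask; if bits were dropped we cannot tell
         * whether some other sibling still needs them, so the parent's
         * mask has to be invalidated and recalculated instead. */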
        more =
                u->cgroup_subtree_mask_valid &&
                ((m & ~u->cgroup_subtree_mask) != 0) &&
                ((~m & u->cgroup_subtree_mask) == 0);

        u->cgroup_subtree_mask = m;
        u->cgroup_subtree_mask_valid = true;

        if (UNIT_ISSET(u->slice)) {
                Unit *s = UNIT_DEREF(u->slice);

                if (more)
                        /* There's more set now than before. We
                         * propagate the new mask to the parent's mask
                         * (not caring if it actually was valid or
                         * not). */

                        s->cgroup_members_mask |= m;

                else
                        /* There's less set now than before (or we
                         * don't know), we need to recalculate
                         * everything, so let's invalidate the
                         * parent's members mask */

                        s->cgroup_members_mask_valid = false;

                /* And now make sure that this change also hits our
                 * grandparents */
                unit_update_cgroup_members_masks(s);
        }
}

static const char *migrate_callback(CGroupMask mask, void *userdata) {
        Unit *u = userdata;

        assert(mask != 0);
        assert(u);

        while (u) {
                if (u->cgroup_path &&
                    u->cgroup_realized &&
                    (u->cgroup_realized_mask & mask) == mask)
                        return u->cgroup_path;

                u = UNIT_DEREF(u->slice);
        }

        return NULL;
}

char *unit_default_cgroup_path(Unit *u) {
        _cleanup_free_ char *escaped = NULL, *slice = NULL;
        int r;

        assert(u);

        if (unit_has_name(u, SPECIAL_ROOT_SLICE))
                return strdup(u->manager->cgroup_root);

        if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) {
                r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice);
                if (r < 0)
                        return NULL;
        }

        escaped = cg_escape(u->id);
        if (!escaped)
                return NULL;

        if (slice)
                return strjoin(u->manager->cgroup_root, "/", slice, "/", escaped, NULL);
        else
                return strjoin(u->manager->cgroup_root, "/", escaped, NULL);
}
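
/* For example (made-up unit names): a unit foo.service in a-b.slice ends
 * up, roughly, at <cgroup_root>/a.slice/a-b.slice/foo.service, since
 * cg_slice_to_path() expands the dash-separated slice name into nested
 * directories; a unit living directly in the root slice ends up at
 * <cgroup_root>/foo.service. */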

int unit_set_cgroup_path(Unit *u, const char *path) {
        _cleanup_free_ char *p = NULL;
        int r;

        assert(u);

        if (path) {
                p = strdup(path);
                if (!p)
                        return -ENOMEM;
        } else
                p = NULL;

        if (streq_ptr(u->cgroup_path, p))
                return 0;

        if (p) {
                r = hashmap_put(u->manager->cgroup_unit, p, u);
                if (r < 0)
                        return r;
        }

        unit_release_cgroup(u);

        u->cgroup_path = p;
        p = NULL;

        return 1;
}

int unit_watch_cgroup(Unit *u) {
        _cleanup_free_ char *populated = NULL;
        int r;

        assert(u);

        if (!u->cgroup_path)
                return 0;

        if (u->cgroup_inotify_wd >= 0)
                return 0;

        /* Only applies to the unified hierarchy */
        r = cg_unified();
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to detect whether the unified hierarchy is used: %m");
        if (r == 0)
                return 0;

        /* Don't watch the root slice, it's pointless. */
        if (unit_has_name(u, SPECIAL_ROOT_SLICE))
                return 0;

        r = hashmap_ensure_allocated(&u->manager->cgroup_inotify_wd_unit, &trivial_hash_ops);
        if (r < 0)
                return log_oom();

        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.populated", &populated);
        if (r < 0)
                return log_oom();

        u->cgroup_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, populated, IN_MODIFY);
        if (u->cgroup_inotify_wd < 0) {

                if (errno == ENOENT) /* If the directory is already
                                      * gone we don't need to track
                                      * it, so this is not an error */
                        return 0;

                return log_unit_error_errno(u, errno, "Failed to add inotify watch descriptor for control group %s: %m", u->cgroup_path);
        }

        r = hashmap_put(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd), u);
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to add inotify watch descriptor to hash map: %m");

        return 0;
}

static int unit_create_cgroup(
                Unit *u,
                CGroupMask target_mask,
                CGroupMask enable_mask) {

        CGroupContext *c;
        int r;

        assert(u);

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        if (!u->cgroup_path) {
                _cleanup_free_ char *path = NULL;

                path = unit_default_cgroup_path(u);
                if (!path)
                        return log_oom();

                r = unit_set_cgroup_path(u, path);
                if (r == -EEXIST)
                        return log_unit_error_errno(u, r, "Control group %s exists already.", path);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", path);
        }

        /* First, create our own group */
        r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", u->cgroup_path);

        /* Start watching it */
        (void) unit_watch_cgroup(u);

        /* Enable all controllers we need */
        r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path);
        if (r < 0)
                log_unit_warning_errno(u, r, "Failed to enable controllers on cgroup %s, ignoring: %m", u->cgroup_path);

        /* Keep track that this is now realized */
        u->cgroup_realized = true;
        u->cgroup_realized_mask = target_mask;

        if (u->type != UNIT_SLICE && !c->delegate) {

                /* Then, possibly move things over, but not if
                 * subgroups may contain processes, which is the case
                 * for slice and delegation units. */
                r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
                if (r < 0)
                        log_unit_warning_errno(u, r, "Failed to migrate cgroup to %s, ignoring: %m", u->cgroup_path);
        }

        return 0;
}

int unit_attach_pids_to_cgroup(Unit *u) {
        int r;
        assert(u);

        r = unit_realize_cgroup(u);
        if (r < 0)
                return r;

        r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->pids, migrate_callback, u);
        if (r < 0)
                return r;

        return 0;
}

static bool unit_has_mask_realized(Unit *u, CGroupMask target_mask) {
        assert(u);

        return u->cgroup_realized && u->cgroup_realized_mask == target_mask;
}

static int unit_find_free_netclass_cgroup(Unit *u, uint32_t *ret) {

        uint32_t start, i;
        Manager *m;

        assert(u);

        m = u->manager;

        i = start = m->cgroup_netclass_registry_last;

        do {
                i++;

                if (!hashmap_get(m->cgroup_netclass_registry, UINT_TO_PTR(i))) {
                        m->cgroup_netclass_registry_last = i;
                        *ret = i;
                        return 0;
                }

                if (i == UINT32_MAX)
                        i = CGROUP_NETCLASS_FIXED_MAX;

        } while (i != start);

        return -ENOBUFS;
}
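
/* In other words: the allocator above scans forward from the most
 * recently handed-out class ID, wraps around from UINT32_MAX back to
 * CGROUP_NETCLASS_FIXED_MAX (so that dynamically allocated IDs never
 * collide with the statically configured range below it), and gives up
 * with -ENOBUFS once it has come full circle without finding a free
 * slot. */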

int unit_add_to_netclass_cgroup(Unit *u) {

        CGroupContext *cc;
        Unit *first;
        void *key;
        int r;

        assert(u);

        cc = unit_get_cgroup_context(u);
        if (!cc)
                return 0;

        switch (cc->netclass_type) {
        case CGROUP_NETCLASS_TYPE_NONE:
                return 0;

        case CGROUP_NETCLASS_TYPE_FIXED:
                u->cgroup_netclass_id = cc->netclass_id;
                break;

        case CGROUP_NETCLASS_TYPE_AUTO:
                /* Allocate a new ID in case it was requested and not done yet */
                if (u->cgroup_netclass_id == 0) {
                        r = unit_find_free_netclass_cgroup(u, &u->cgroup_netclass_id);
                        if (r < 0)
                                return r;

                        log_debug("Dynamically assigned netclass cgroup id %" PRIu32 " to %s", u->cgroup_netclass_id, u->id);
                }

                break;
        }

        r = hashmap_ensure_allocated(&u->manager->cgroup_netclass_registry, &trivial_hash_ops);
        if (r < 0)
                return r;

        key = UINT32_TO_PTR(u->cgroup_netclass_id);
        first = hashmap_get(u->manager->cgroup_netclass_registry, key);

        if (first) {
                LIST_PREPEND(cgroup_netclass, first, u);
                return hashmap_replace(u->manager->cgroup_netclass_registry, key, u);
        }

        return hashmap_put(u->manager->cgroup_netclass_registry, key, u);
}

int unit_remove_from_netclass_cgroup(Unit *u) {

        Unit *head;
        void *key;

        assert(u);

        key = UINT32_TO_PTR(u->cgroup_netclass_id);

        LIST_FIND_HEAD(cgroup_netclass, u, head);
        LIST_REMOVE(cgroup_netclass, head, u);

        if (head)
                return hashmap_replace(u->manager->cgroup_netclass_registry, key, head);

        hashmap_remove(u->manager->cgroup_netclass_registry, key);

        return 0;
}

/* Check if necessary controllers and attributes for a unit are in place.
 *
 * If so, do nothing.
 * If not, create paths, move processes over, and set attributes.
 *
 * Returns 0 on success and < 0 on failure. */
static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
        CGroupMask target_mask, enable_mask;
        int r;

        assert(u);

        if (u->in_cgroup_queue) {
                LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
                u->in_cgroup_queue = false;
        }

        target_mask = unit_get_target_mask(u);
        if (unit_has_mask_realized(u, target_mask))
                return 0;

        /* First, realize parents */
        if (UNIT_ISSET(u->slice)) {
                r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
                if (r < 0)
                        return r;
        }

        /* And then do the real work */
        enable_mask = unit_get_enable_mask(u);
        r = unit_create_cgroup(u, target_mask, enable_mask);
        if (r < 0)
                return r;

        /* Finally, apply the necessary attributes. */
        cgroup_context_apply(unit_get_cgroup_context(u), target_mask, u->cgroup_path, u->cgroup_netclass_id, state);

        return 0;
}

static void unit_add_to_cgroup_queue(Unit *u) {

        if (u->in_cgroup_queue)
                return;

        LIST_PREPEND(cgroup_queue, u->manager->cgroup_queue, u);
        u->in_cgroup_queue = true;
}

unsigned manager_dispatch_cgroup_queue(Manager *m) {
        ManagerState state;
        unsigned n = 0;
        Unit *i;
        int r;

        state = manager_state(m);

        while ((i = m->cgroup_queue)) {
                assert(i->in_cgroup_queue);

                r = unit_realize_cgroup_now(i, state);
                if (r < 0)
                        log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);

                n++;
        }

        return n;
}

static void unit_queue_siblings(Unit *u) {
        Unit *slice;

        /* This adds the siblings of the specified unit and the
         * siblings of all parent units to the cgroup queue. (But
         * neither the specified unit itself nor the parents.) */

        while ((slice = UNIT_DEREF(u->slice))) {
                Iterator i;
                Unit *m;

                SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
                        if (m == u)
                                continue;

                        /* Skip units that have a dependency on the slice
                         * but aren't actually in it. */
                        if (UNIT_DEREF(m->slice) != slice)
                                continue;

                        /* No point in doing cgroup application for units
                         * without active processes. */
                        if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
                                continue;

                        /* If the unit doesn't need any new controllers
                         * and has current ones realized, it doesn't need
                         * any changes. */
                        if (unit_has_mask_realized(m, unit_get_target_mask(m)))
                                continue;

                        unit_add_to_cgroup_queue(m);
                }

                u = slice;
        }
}

int unit_realize_cgroup(Unit *u) {
        assert(u);

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return 0;

        /* So, here's the deal: when realizing the cgroups for this
         * unit, we need to first create all parents, but there's more
         * actually: for the weight-based controllers we also need to
         * make sure that all our siblings (i.e. units that are in the
         * same slice as we are) have cgroups, too. Otherwise, things
         * would become very uneven as each of their processes would
         * get as much resources as all our group together. This call
         * will synchronously create the parent cgroups, but will
         * defer work on the siblings to the next event loop
         * iteration. */

        /* Add all sibling slices to the cgroup queue. */
        unit_queue_siblings(u);

        /* And realize this one now (and apply the values) */
        return unit_realize_cgroup_now(u, manager_state(u->manager));
}

void unit_release_cgroup(Unit *u) {
        assert(u);

        /* Forgets all cgroup details for this cgroup */

        if (u->cgroup_path) {
                (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
                u->cgroup_path = mfree(u->cgroup_path);
        }

        if (u->cgroup_inotify_wd >= 0) {
                if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_inotify_wd) < 0)
                        log_unit_debug_errno(u, errno, "Failed to remove cgroup inotify watch %i for %s, ignoring", u->cgroup_inotify_wd, u->id);

                (void) hashmap_remove(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd));
                u->cgroup_inotify_wd = -1;
        }
}

void unit_prune_cgroup(Unit *u) {
        int r;
        bool is_root_slice;

        assert(u);

        /* Removes the cgroup, if empty and possible, and stops watching it. */

        if (!u->cgroup_path)
                return;

        is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);

        r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
        if (r < 0) {
                log_debug_errno(r, "Failed to destroy cgroup %s, ignoring: %m", u->cgroup_path);
                return;
        }

        if (is_root_slice)
                return;

        unit_release_cgroup(u);

        u->cgroup_realized = false;
        u->cgroup_realized_mask = 0;
}

int unit_search_main_pid(Unit *u, pid_t *ret) {
        _cleanup_fclose_ FILE *f = NULL;
        pid_t pid = 0, npid, mypid;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENXIO;

        r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f);
        if (r < 0)
                return r;

        mypid = getpid();
        while (cg_read_pid(f, &npid) > 0) {
                pid_t ppid;

                if (npid == pid)
                        continue;

                /* Ignore processes that aren't our kids */
                if (get_parent_of_pid(npid, &ppid) >= 0 && ppid != mypid)
                        continue;

                if (pid != 0)
                        /* Dang, there's more than one daemonized PID
                         * in this group, so we don't know what process
                         * is the main process. */

                        return -ENODATA;

                pid = npid;
        }

        *ret = pid;
        return 0;
}

static int unit_watch_pids_in_path(Unit *u, const char *path) {
        _cleanup_closedir_ DIR *d = NULL;
        _cleanup_fclose_ FILE *f = NULL;
        int ret = 0, r;

        assert(u);
        assert(path);

        r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
        if (r < 0)
                ret = r;
        else {
                pid_t pid;

                while ((r = cg_read_pid(f, &pid)) > 0) {
                        r = unit_watch_pid(u, pid);
                        if (r < 0 && ret >= 0)
                                ret = r;
                }

                if (r < 0 && ret >= 0)
                        ret = r;
        }

        r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
        if (r < 0) {
                if (ret >= 0)
                        ret = r;
        } else {
                char *fn;

                while ((r = cg_read_subgroup(d, &fn)) > 0) {
                        _cleanup_free_ char *p = NULL;

                        p = strjoin(path, "/", fn, NULL);
                        free(fn);

                        if (!p)
                                return -ENOMEM;

                        r = unit_watch_pids_in_path(u, p);
                        if (r < 0 && ret >= 0)
                                ret = r;
                }

                if (r < 0 && ret >= 0)
                        ret = r;
        }

        return ret;
}

int unit_watch_all_pids(Unit *u) {
        assert(u);

        /* Adds all PIDs from our cgroup to the set of PIDs we
         * watch. This is a fallback logic for cases where we do not
         * get reliable cgroup empty notifications: we try to use
         * SIGCHLD as replacement. */

        if (!u->cgroup_path)
                return -ENOENT;

        if (cg_unified() > 0) /* On unified we can use proper notifications */
                return 0;

        return unit_watch_pids_in_path(u, u->cgroup_path);
}

int unit_notify_cgroup_empty(Unit *u) {
        int r;

        assert(u);

        if (!u->cgroup_path)
                return 0;

        r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
        if (r <= 0)
                return r;

        unit_add_to_gc_queue(u);

        if (UNIT_VTABLE(u)->notify_cgroup_empty)
                UNIT_VTABLE(u)->notify_cgroup_empty(u);

        return 0;
}

static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
        Manager *m = userdata;

        assert(s);
        assert(fd >= 0);
        assert(m);

        for (;;) {
                union inotify_event_buffer buffer;
                struct inotify_event *e;
                ssize_t l;

                l = read(fd, &buffer, sizeof(buffer));
                if (l < 0) {
                        if (errno == EINTR || errno == EAGAIN)
                                return 0;

                        return log_error_errno(errno, "Failed to read control group inotify events: %m");
                }

                FOREACH_INOTIFY_EVENT(e, buffer, l) {
                        Unit *u;

                        if (e->wd < 0)
                                /* Queue overflow has no watch descriptor */
                                continue;

                        if (e->mask & IN_IGNORED)
                                /* The watch was just removed */
                                continue;

                        u = hashmap_get(m->cgroup_inotify_wd_unit, INT_TO_PTR(e->wd));
                        if (!u) /* Note that inotify might deliver
                                 * events for a watch even after it
                                 * was removed, because it was queued
                                 * before the removal. Let's ignore
                                 * this here safely. */
                                continue;

                        (void) unit_notify_cgroup_empty(u);
                }
        }
}

int manager_setup_cgroup(Manager *m) {
        _cleanup_free_ char *path = NULL;
        CGroupController c;
        int r, unified;
        char *e;

        assert(m);

        /* 1. Determine hierarchy */
        m->cgroup_root = mfree(m->cgroup_root);
        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
        if (r < 0)
                return log_error_errno(r, "Cannot determine cgroup we are running in: %m");

        /* Chop off the init scope, if we are already located in it */
        e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);

        /* LEGACY: Also chop off the system slice if we are in
         * it. This is to support live upgrades from older systemd
         * versions where PID 1 was moved there. Also see
         * cg_get_root_path(). */
        if (!e && m->running_as == MANAGER_SYSTEM) {
                e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
                if (!e)
                        e = endswith(m->cgroup_root, "/system"); /* even more legacy */
        }
        if (e)
                *e = 0;

        /* And make sure to store away the root value without trailing
         * slash, even for the root dir, so that we can easily prepend
         * it everywhere. */
        while ((e = endswith(m->cgroup_root, "/")))
                *e = 0;

        /* 2. Show data */
        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
        if (r < 0)
                return log_error_errno(r, "Cannot find cgroup mount point: %m");

        unified = cg_unified();
        if (unified < 0)
                return log_error_errno(unified, "Couldn't determine if we are running in the unified hierarchy: %m");
        if (unified > 0)
                log_debug("Unified cgroup hierarchy is located at %s.", path);
        else
                log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);

        if (!m->test_run) {
                const char *scope_path;

                /* 3. Install agent */
                if (unified) {

                        /* In the unified hierarchy we can get
                         * cgroup empty notifications via inotify. */

                        m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
                        safe_close(m->cgroup_inotify_fd);

                        m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
                        if (m->cgroup_inotify_fd < 0)
                                return log_error_errno(errno, "Failed to create control group inotify object: %m");

                        r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
                        if (r < 0)
                                return log_error_errno(r, "Failed to watch control group inotify object: %m");

                        r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_IDLE - 5);
                        if (r < 0)
                                return log_error_errno(r, "Failed to set priority of inotify event source: %m");

                        (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");

                } else if (m->running_as == MANAGER_SYSTEM) {

                        /* On the legacy hierarchy we only get
                         * notifications via cgroup agents. (Which
                         * isn't really reliable, since it does not
                         * generate events when control groups with
                         * children run empty.) */

                        r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
                        if (r < 0)
                                log_warning_errno(r, "Failed to install release agent, ignoring: %m");
                        else if (r > 0)
                                log_debug("Installed release agent.");
                        else if (r == 0)
                                log_debug("Release agent already installed.");
                }

                /* 4. Make sure we are in the special "init.scope" unit in the root slice. */
                scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
                r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
                if (r < 0)
                        return log_error_errno(r, "Failed to create %s control group: %m", scope_path);

                /* Also, move all other userspace processes remaining
                 * in the root cgroup into that scope. */
                r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, false);
                if (r < 0)
                        log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");

                /* 5. And pin it, so that it cannot be unmounted */
                safe_close(m->pin_cgroupfs_fd);
                m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
                if (m->pin_cgroupfs_fd < 0)
                        return log_error_errno(errno, "Failed to open pin file: %m");

                /* 6. Always enable hierarchical support if it exists... */
                if (!unified)
                        (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
        }

        /* 7. Figure out which controllers are supported */
        r = cg_mask_supported(&m->cgroup_supported);
        if (r < 0)
                return log_error_errno(r, "Failed to determine supported controllers: %m");

        for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
                log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c)));

        return 0;
}

void manager_shutdown_cgroup(Manager *m, bool delete) {
        assert(m);

        /* We can't really delete the group, since we are in it. But
         * let's trim it. */
        if (delete && m->cgroup_root)
                (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);

        m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit);

        m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
        m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);

        m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);

        m->cgroup_root = mfree(m->cgroup_root);
}

Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
        char *p;
        Unit *u;

        assert(m);
        assert(cgroup);

        u = hashmap_get(m->cgroup_unit, cgroup);
        if (u)
                return u;

        p = strdupa(cgroup);
        for (;;) {
                char *e;

                e = strrchr(p, '/');
                if (!e || e == p)
                        return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);

                *e = 0;

                u = hashmap_get(m->cgroup_unit, p);
                if (u)
                        return u;
        }
}
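
/* Worked example (made-up paths): for a cgroup
 * /system.slice/foo.service/sub, the lookup above first tries the full
 * path, then successively chops off the last component, i.e. tries
 * /system.slice/foo.service (which would match foo.service's own cgroup),
 * then /system.slice, and finally falls back to the root slice. */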

Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) {
        _cleanup_free_ char *cgroup = NULL;
        int r;

        assert(m);

        if (pid <= 0)
                return NULL;

        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
        if (r < 0)
                return NULL;

        return manager_get_unit_by_cgroup(m, cgroup);
}

Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
        Unit *u;

        assert(m);

        if (pid <= 0)
                return NULL;

        if (pid == 1)
                return hashmap_get(m->units, SPECIAL_INIT_SCOPE);

        u = hashmap_get(m->watch_pids1, PID_TO_PTR(pid));
        if (u)
                return u;

        u = hashmap_get(m->watch_pids2, PID_TO_PTR(pid));
        if (u)
                return u;

        return manager_get_unit_by_pid_cgroup(m, pid);
}

int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
        Unit *u;

        assert(m);
        assert(cgroup);

        u = manager_get_unit_by_cgroup(m, cgroup);
        if (!u)
                return 0;

        return unit_notify_cgroup_empty(u);
}

int unit_get_memory_current(Unit *u, uint64_t *ret) {
        _cleanup_free_ char *v = NULL;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENODATA;

        if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
                return -ENODATA;

        if (cg_unified() <= 0)
                r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
        else
                r = cg_get_attribute("memory", u->cgroup_path, "memory.current", &v);
        if (r == -ENOENT)
                return -ENODATA;
        if (r < 0)
                return r;

        return safe_atou64(v, ret);
}

int unit_get_tasks_current(Unit *u, uint64_t *ret) {
        _cleanup_free_ char *v = NULL;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENODATA;

        if ((u->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0)
                return -ENODATA;

        r = cg_get_attribute("pids", u->cgroup_path, "pids.current", &v);
        if (r == -ENOENT)
                return -ENODATA;
        if (r < 0)
                return r;

        return safe_atou64(v, ret);
}

static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
        _cleanup_free_ char *v = NULL;
        uint64_t ns;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENODATA;

        if ((u->cgroup_realized_mask & CGROUP_MASK_CPUACCT) == 0)
                return -ENODATA;

        r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);
        if (r == -ENOENT)
                return -ENODATA;
        if (r < 0)
                return r;

        r = safe_atou64(v, &ns);
        if (r < 0)
                return r;

        *ret = ns;
        return 0;
}

int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
        nsec_t ns;
        int r;

        r = unit_get_cpu_usage_raw(u, &ns);
        if (r < 0)
                return r;

        if (ns > u->cpuacct_usage_base)
                ns -= u->cpuacct_usage_base;
        else
                ns = 0;

        *ret = ns;
        return 0;
}

int unit_reset_cpu_usage(Unit *u) {
        nsec_t ns;
        int r;

        assert(u);

        r = unit_get_cpu_usage_raw(u, &ns);
        if (r < 0) {
                u->cpuacct_usage_base = 0;
                return r;
        }

        u->cpuacct_usage_base = ns;
        return 0;
}

bool unit_cgroup_delegate(Unit *u) {
        CGroupContext *c;

        assert(u);

        c = unit_get_cgroup_context(u);
        if (!c)
                return false;

        return c->delegate;
}

void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
        assert(u);

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return;

        if (m == 0)
                return;

        if ((u->cgroup_realized_mask & m) == 0)
                return;

        u->cgroup_realized_mask &= ~m;
        unit_add_to_cgroup_queue(u);
}

void manager_invalidate_startup_units(Manager *m) {
        Iterator i;
        Unit *u;

        assert(m);

        SET_FOREACH(u, m->startup_units, i)
                unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_BLKIO);
}

static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
        [CGROUP_AUTO] = "auto",
        [CGROUP_CLOSED] = "closed",
        [CGROUP_STRICT] = "strict",
};

DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);