/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/

/***
  This file is part of systemd.

  Copyright 2013 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/

#include <fcntl.h>
#include <fnmatch.h>

#include "cgroup-util.h"
#include "cgroup.h"
#include "fd-util.h"
#include "path-util.h"
#include "process-util.h"
#include "special.h"
#include "string-util.h"

#define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)

void cgroup_context_init(CGroupContext *c) {
        assert(c);

        /* Initialize everything to the kernel defaults, assuming the
         * structure is preinitialized to 0 */

        c->cpu_shares = CGROUP_CPU_SHARES_INVALID;
        c->startup_cpu_shares = CGROUP_CPU_SHARES_INVALID;
        c->cpu_quota_per_sec_usec = USEC_INFINITY;

        c->memory_limit = (uint64_t) -1;

        c->blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
        c->startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;

        c->tasks_max = (uint64_t) -1;

        c->netclass_type = CGROUP_NETCLASS_TYPE_NONE;
}

void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
        assert(c);
        assert(a);

        LIST_REMOVE(device_allow, c->device_allow, a);
        free(a->path);
        free(a);
}

void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
        assert(c);
        assert(w);

        LIST_REMOVE(device_weights, c->blockio_device_weights, w);
        free(w->path);
        free(w);
}

void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
        assert(c);
        assert(b);

        LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
        free(b->path);
        free(b);
}

void cgroup_context_done(CGroupContext *c) {
        assert(c);

        while (c->blockio_device_weights)
                cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);

        while (c->blockio_device_bandwidths)
                cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);

        while (c->device_allow)
                cgroup_context_free_device_allow(c, c->device_allow);
}

void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
        CGroupBlockIODeviceBandwidth *b;
        CGroupBlockIODeviceWeight *w;
        CGroupDeviceAllow *a;
        char u[FORMAT_TIMESPAN_MAX];

        assert(c);
        assert(f);

        prefix = strempty(prefix);

        fprintf(f,
                "%sCPUAccounting=%s\n"
                "%sBlockIOAccounting=%s\n"
                "%sMemoryAccounting=%s\n"
                "%sTasksAccounting=%s\n"
                "%sCPUShares=%" PRIu64 "\n"
                "%sStartupCPUShares=%" PRIu64 "\n"
                "%sCPUQuotaPerSecSec=%s\n"
                "%sBlockIOWeight=%" PRIu64 "\n"
                "%sStartupBlockIOWeight=%" PRIu64 "\n"
                "%sMemoryLimit=%" PRIu64 "\n"
                "%sTasksMax=%" PRIu64 "\n"
                "%sDevicePolicy=%s\n"
                "%sDelegate=%s\n",
                prefix, yes_no(c->cpu_accounting),
                prefix, yes_no(c->blockio_accounting),
                prefix, yes_no(c->memory_accounting),
                prefix, yes_no(c->tasks_accounting),
                prefix, c->cpu_shares,
                prefix, c->startup_cpu_shares,
                prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
                prefix, c->blockio_weight,
                prefix, c->startup_blockio_weight,
                prefix, c->memory_limit,
                prefix, c->tasks_max,
                prefix, cgroup_device_policy_to_string(c->device_policy),
                prefix, yes_no(c->delegate));

        LIST_FOREACH(device_allow, a, c->device_allow)
                fprintf(f,
                        "%sDeviceAllow=%s %s%s%s\n",
                        prefix,
                        a->path,
                        a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");

        LIST_FOREACH(device_weights, w, c->blockio_device_weights)
                fprintf(f,
143 "%sBlockIODeviceWeight=%s %" PRIu64,
                        prefix,
                        w->path,
                        w->weight);

        LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                char buf[FORMAT_BYTES_MAX];

                fprintf(f,
                        "%s%s=%s %s\n",
                        prefix,
                        b->read ? "BlockIOReadBandwidth" : "BlockIOWriteBandwidth",
                        b->path,
                        format_bytes(buf, sizeof(buf), b->bandwidth));
        }
}

static int lookup_blkio_device(const char *p, dev_t *dev) {
        struct stat st;
        int r;

        assert(p);
        assert(dev);

        r = stat(p, &st);
        if (r < 0)
                return log_warning_errno(errno, "Couldn't stat device %s: %m", p);

        if (S_ISBLK(st.st_mode))
                *dev = st.st_rdev;
        else if (major(st.st_dev) != 0) {
                /* If this is not a device node then find the block
                 * device this file is stored on */
                *dev = st.st_dev;

                /* If this is a partition, try to get the originating
                 * block device */
                block_get_whole_disk(*dev, dev);
        } else {
                log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
                return -ENODEV;
        }

        return 0;
}
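
/* Note on the resolution above, with a worked example (paths hypothetical):
 * for a regular file such as "/var/log/journal" stored on /dev/sda2, stat()
 * yields the backing file system's st_dev, and block_get_whole_disk() then
 * maps the partition to its whole disk, so subsequent blkio settings apply
 * to sda rather than sda2. */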

static int whitelist_device(const char *path, const char *node, const char *acc) {
        char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
        struct stat st;
        int r;

        assert(path);
        assert(acc);

        if (stat(node, &st) < 0)
                return log_warning_errno(errno, "Couldn't stat device %s: %m", node);

        if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
                log_warning("%s is not a device.", node);
                return -ENODEV;
        }

        sprintf(buf,
                "%c %u:%u %s",
                S_ISCHR(st.st_mode) ? 'c' : 'b',
                major(st.st_rdev), minor(st.st_rdev),
                acc);

        r = cg_set_attribute("devices", path, "devices.allow", buf);
        if (r < 0)
                log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
                               "Failed to set devices.allow on %s: %m", path);

        return r;
}
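
/* A minimal standalone sketch (illustration only, compiled out) of what
 * whitelist_device() amounts to on the legacy hierarchy: writing a
 * "type major:minor access" line into the cgroup's devices.allow file.
 * The cgroup path below is a hypothetical example. */
#if 0
#include <stdio.h>

static int example_allow_dev_null(void) {
        const char *cgroup = "/sys/fs/cgroup/devices/system.slice/foo.service";
        char p[4096];
        FILE *f;

        snprintf(p, sizeof(p), "%s/devices.allow", cgroup);

        f = fopen(p, "we");
        if (!f)
                return -1;

        /* "c 1:3 rw" == character device, major 1, minor 3 (/dev/null),
         * allow read and write */
        fputs("c 1:3 rw", f);

        fclose(f);
        return 0;
}
#endif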

static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
        _cleanup_fclose_ FILE *f = NULL;
        char line[LINE_MAX];
        bool good = false;
        int r;

        assert(path);
        assert(acc);
        assert(type == 'b' || type == 'c');

        f = fopen("/proc/devices", "re");
        if (!f)
                return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);

        FOREACH_LINE(line, f, goto fail) {
                char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
                unsigned maj;

                truncate_nl(line);

                if (type == 'c' && streq(line, "Character devices:")) {
                        good = true;
                        continue;
                }

                if (type == 'b' && streq(line, "Block devices:")) {
                        good = true;
                        continue;
                }

                if (isempty(line)) {
                        good = false;
                        continue;
                }

                if (!good)
                        continue;

                p = strstrip(line);

                w = strpbrk(p, WHITESPACE);
                if (!w)
                        continue;
                *w = 0;

                r = safe_atou(p, &maj);
                if (r < 0)
                        continue;
                if (maj <= 0)
                        continue;

                w++;
                w += strspn(w, WHITESPACE);

                if (fnmatch(name, w, 0) != 0)
                        continue;

                sprintf(buf,
                        "%c %u:* %s",
                        type,
                        maj,
                        acc);

                r = cg_set_attribute("devices", path, "devices.allow", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set devices.allow on %s: %m", path);
        }

        return 0;

fail:
        log_warning_errno(errno, "Failed to read /proc/devices: %m");
        return -errno;
}
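
/* For reference, the /proc/devices content parsed above looks roughly like
 * this (a header per device class, then "major name" pairs, with a blank
 * line between the sections):
 *
 *   Character devices:
 *     1 mem
 *     5 /dev/tty
 *   136 pts
 *
 *   Block devices:
 *     8 sd
 *   253 device-mapper
 */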

void cgroup_context_apply(CGroupContext *c, CGroupMask mask, const char *path, uint32_t netclass, ManagerState state) {
        bool is_root;
        int r;

        assert(c);
        assert(path);

        if (mask == 0)
                return;

        /* Some cgroup attributes are not supported on the root cgroup,
         * hence silently ignore */
        is_root = isempty(path) || path_equal(path, "/");
        if (is_root)
                /* Make sure we don't try to display messages with an empty path. */
                path = "/";

        /* We generally ignore errors caused by read-only mounted
         * cgroup trees (assuming we are running in a container then),
         * and missing cgroups, i.e. EROFS and ENOENT. */

        if ((mask & CGROUP_MASK_CPU) && !is_root) {
                char buf[MAX(DECIMAL_STR_MAX(uint64_t), DECIMAL_STR_MAX(usec_t)) + 1];

                sprintf(buf, "%" PRIu64 "\n",
                        IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID ? c->startup_cpu_shares :
                        c->cpu_shares != CGROUP_CPU_SHARES_INVALID ? c->cpu_shares : CGROUP_CPU_SHARES_DEFAULT);
                r = cg_set_attribute("cpu", path, "cpu.shares", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set cpu.shares on %s: %m", path);

                sprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
                r = cg_set_attribute("cpu", path, "cpu.cfs_period_us", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set cpu.cfs_period_us on %s: %m", path);

                if (c->cpu_quota_per_sec_usec != USEC_INFINITY) {
                        sprintf(buf, USEC_FMT "\n", c->cpu_quota_per_sec_usec * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
                        r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", buf);
                } else
                        r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", "-1");
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set cpu.cfs_quota_us on %s: %m", path);
        }
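
        /* Worked example for the quota math above: CPUQuota=50% is stored as
         * cpu_quota_per_sec_usec == 500000. With the fixed 100ms period this
         * yields cpu.cfs_quota_us = 500000 * 100000 / 1000000 = 50000, i.e.
         * 50ms of CPU time per 100ms period. */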

        if (mask & CGROUP_MASK_BLKIO) {
                char buf[MAX(DECIMAL_STR_MAX(uint64_t)+1,
                             DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
                CGroupBlockIODeviceWeight *w;
                CGroupBlockIODeviceBandwidth *b;

                if (!is_root) {
                        sprintf(buf, "%" PRIu64 "\n",
                                IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ? c->startup_blockio_weight :
                                c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ? c->blockio_weight : CGROUP_BLKIO_WEIGHT_DEFAULT);
                        r = cg_set_attribute("blkio", path, "blkio.weight", buf);
                        if (r < 0)
                                log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                               "Failed to set blkio.weight on %s: %m", path);

                        /* FIXME: no way to reset this list */
                        LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
                                dev_t dev;

                                r = lookup_blkio_device(w->path, &dev);
                                if (r < 0)
                                        continue;

                                sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), w->weight);
                                r = cg_set_attribute("blkio", path, "blkio.weight_device", buf);
                                if (r < 0)
                                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                                       "Failed to set blkio.weight_device on %s: %m", path);
                        }
                }

                /* FIXME: no way to reset this list */
                LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                        const char *a;
                        dev_t dev;

                        r = lookup_blkio_device(b->path, &dev);
                        if (r < 0)
                                continue;

                        a = b->read ? "blkio.throttle.read_bps_device" : "blkio.throttle.write_bps_device";

                        sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), b->bandwidth);
                        r = cg_set_attribute("blkio", path, a, buf);
                        if (r < 0)
                                log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                               "Failed to set %s on %s: %m", a, path);
                }
        }
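
        /* The attribute lines written above use the kernel's
         * "major:minor value" syntax, e.g. "8:0 500" to give /dev/sda a
         * weight of 500, or "8:0 10485760" to throttle reads from it to
         * 10 MiB/s. */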

        if ((mask & CGROUP_MASK_MEMORY) && !is_root) {
                if (c->memory_limit != (uint64_t) -1) {
                        char buf[DECIMAL_STR_MAX(uint64_t) + 1];

                        sprintf(buf, "%" PRIu64 "\n", c->memory_limit);

                        if (cg_unified() <= 0)
                                r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
                        else
                                r = cg_set_attribute("memory", path, "memory.max", buf);

                } else {
                        if (cg_unified() <= 0)
                                r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");
                        else
                                r = cg_set_attribute("memory", path, "memory.max", "max");
                }

                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set memory.limit_in_bytes/memory.max on %s: %m", path);
        }

        if ((mask & CGROUP_MASK_DEVICES) && !is_root) {
                CGroupDeviceAllow *a;

                /* Changing the devices list of a populated cgroup
                 * might result in EINVAL, hence ignore EINVAL
                 * here. */

                if (c->device_allow || c->device_policy != CGROUP_AUTO)
                        r = cg_set_attribute("devices", path, "devices.deny", "a");
                else
                        r = cg_set_attribute("devices", path, "devices.allow", "a");
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to reset devices.list on %s: %m", path);

                if (c->device_policy == CGROUP_CLOSED ||
                    (c->device_policy == CGROUP_AUTO && c->device_allow)) {
                        static const char auto_devices[] =
                                "/dev/null\0" "rwm\0"
                                "/dev/zero\0" "rwm\0"
                                "/dev/full\0" "rwm\0"
                                "/dev/random\0" "rwm\0"
                                "/dev/urandom\0" "rwm\0"
                                "/dev/tty\0" "rwm\0"
                                "/dev/pts/ptmx\0" "rw\0"; /* /dev/pts/ptmx may not be duplicated, but accessed */

                        const char *x, *y;

                        NULSTR_FOREACH_PAIR(x, y, auto_devices)
                                whitelist_device(path, x, y);

                        whitelist_major(path, "pts", 'c', "rw");
                        whitelist_major(path, "kdbus", 'c', "rw");
                        whitelist_major(path, "kdbus/*", 'c', "rw");
                }

                LIST_FOREACH(device_allow, a, c->device_allow) {
                        char acc[4];
                        unsigned k = 0;

                        if (a->r)
                                acc[k++] = 'r';
                        if (a->w)
                                acc[k++] = 'w';
                        if (a->m)
                                acc[k++] = 'm';

                        if (k == 0)
                                continue;

                        acc[k++] = 0;

                        if (startswith(a->path, "/dev/"))
                                whitelist_device(path, a->path, acc);
                        else if (startswith(a->path, "block-"))
                                whitelist_major(path, a->path + 6, 'b', acc);
                        else if (startswith(a->path, "char-"))
                                whitelist_major(path, a->path + 5, 'c', acc);
                        else
                                log_debug("Ignoring device %s while writing cgroup attribute.", a->path);
                }
        }

        if ((mask & CGROUP_MASK_PIDS) && !is_root) {

                if (c->tasks_max != (uint64_t) -1) {
                        char buf[DECIMAL_STR_MAX(uint64_t) + 2];

                        sprintf(buf, "%" PRIu64 "\n", c->tasks_max);
                        r = cg_set_attribute("pids", path, "pids.max", buf);
                } else
                        r = cg_set_attribute("pids", path, "pids.max", "max");

                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set pids.max on %s: %m", path);
        }

        if (mask & CGROUP_MASK_NET_CLS) {
                char buf[DECIMAL_STR_MAX(uint32_t)];

                sprintf(buf, "%" PRIu32, netclass);

                r = cg_set_attribute("net_cls", path, "net_cls.classid", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set net_cls.classid on %s: %m", path);
        }
}

CGroupMask cgroup_context_get_mask(CGroupContext *c) {
        CGroupMask mask = 0;

        /* Figure out which controllers we need */

        if (c->cpu_accounting ||
            c->cpu_shares != CGROUP_CPU_SHARES_INVALID ||
            c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID ||
            c->cpu_quota_per_sec_usec != USEC_INFINITY)
                mask |= CGROUP_MASK_CPUACCT | CGROUP_MASK_CPU;

        if (c->blockio_accounting ||
            c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
            c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
            c->blockio_device_weights ||
            c->blockio_device_bandwidths)
                mask |= CGROUP_MASK_BLKIO;

        if (c->memory_accounting ||
            c->memory_limit != (uint64_t) -1)
                mask |= CGROUP_MASK_MEMORY;

        if (c->device_allow ||
            c->device_policy != CGROUP_AUTO)
                mask |= CGROUP_MASK_DEVICES;

        if (c->tasks_accounting ||
            c->tasks_max != (uint64_t) -1)
                mask |= CGROUP_MASK_PIDS;

        if (c->netclass_type != CGROUP_NETCLASS_TYPE_NONE)
                mask |= CGROUP_MASK_NET_CLS;

        return mask;
}

CGroupMask unit_get_own_mask(Unit *u) {
        CGroupContext *c;

        /* Returns the mask of controllers the unit needs for itself */

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        /* If delegation is turned on, then turn on all cgroups,
         * unless we are on the legacy hierarchy and the process we
         * fork into it is known to drop privileges, and hence
         * shouldn't get access to the controllers.
         *
         * Note that on the unified hierarchy it is safe to delegate
         * controllers to unprivileged services. */

        if (c->delegate) {
                ExecContext *e;

                e = unit_get_exec_context(u);
                if (!e ||
                    exec_context_maintains_privileges(e) ||
                    cg_unified() > 0)
                        return _CGROUP_MASK_ALL;
        }

        return cgroup_context_get_mask(c);
}

CGroupMask unit_get_members_mask(Unit *u) {
        assert(u);

        /* Returns the mask of controllers all of the unit's children
         * require, merged */

        if (u->cgroup_members_mask_valid)
                return u->cgroup_members_mask;

        u->cgroup_members_mask = 0;

        if (u->type == UNIT_SLICE) {
                Unit *member;
                Iterator i;

                SET_FOREACH(member, u->dependencies[UNIT_BEFORE], i) {

                        if (member == u)
                                continue;

                        if (UNIT_DEREF(member->slice) != u)
                                continue;

                        u->cgroup_members_mask |=
                                unit_get_own_mask(member) |
                                unit_get_members_mask(member);
                }
        }

        u->cgroup_members_mask_valid = true;
        return u->cgroup_members_mask;
}

CGroupMask unit_get_siblings_mask(Unit *u) {
        assert(u);

        /* Returns the mask of controllers all of the unit's siblings
         * require, i.e. the members mask of the unit's parent slice
         * if there is one. */

        if (UNIT_ISSET(u->slice))
                return unit_get_members_mask(UNIT_DEREF(u->slice));

        return unit_get_own_mask(u) | unit_get_members_mask(u);
}

CGroupMask unit_get_subtree_mask(Unit *u) {

        /* Returns the mask of this subtree, meaning of the group
         * itself and its children. */

        return unit_get_own_mask(u) | unit_get_members_mask(u);
}

CGroupMask unit_get_target_mask(Unit *u) {
        CGroupMask mask;

        /* This returns the cgroup mask of all controllers to enable
         * for a specific cgroup, i.e. everything it needs itself,
         * plus all that its children need, plus all that its siblings
         * need. This is primarily useful on the legacy cgroup
         * hierarchy, where we need to duplicate each cgroup in each
         * hierarchy that shall be enabled for it. */

        mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
        mask &= u->manager->cgroup_supported;

        return mask;
}

CGroupMask unit_get_enable_mask(Unit *u) {
        CGroupMask mask;

        /* This returns the cgroup mask of all controllers to enable
         * for the children of a specific cgroup. This is primarily
         * useful for the unified cgroup hierarchy, where each cgroup
         * controls which controllers are enabled for its children. */

        mask = unit_get_members_mask(u);
        mask &= u->manager->cgroup_supported;

        return mask;
}

/* Recurse from a unit up through its containing slices, propagating
 * mask bits upward. A unit is also member of itself. */
void unit_update_cgroup_members_masks(Unit *u) {
        CGroupMask m;
        bool more;

        assert(u);

        /* Calculate subtree mask */
        m = unit_get_subtree_mask(u);

        /* See if anything changed from the previous invocation. If
         * not, we're done. */
        if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
                return;

        more =
                u->cgroup_subtree_mask_valid &&
                ((m & ~u->cgroup_subtree_mask) != 0) &&
                ((~m & u->cgroup_subtree_mask) == 0);

        u->cgroup_subtree_mask = m;
        u->cgroup_subtree_mask_valid = true;

        if (UNIT_ISSET(u->slice)) {
                Unit *s = UNIT_DEREF(u->slice);

                if (more)
                        /* There's more set now than before. We
                         * propagate the new mask to the parent's mask
                         * (not caring if it actually was valid or
                         * not). */

                        s->cgroup_members_mask |= m;

                else
                        /* There's less set now than before (or we
                         * don't know), we need to recalculate
                         * everything, so let's invalidate the
                         * parent's members mask */

                        s->cgroup_members_mask_valid = false;

                /* And now make sure that this change also hits our
                 * grandparents */
                unit_update_cgroup_members_masks(s);
        }
}

static const char *migrate_callback(CGroupMask mask, void *userdata) {
        Unit *u = userdata;

        assert(mask != 0);
        assert(u);

        while (u) {
                if (u->cgroup_path &&
                    u->cgroup_realized &&
                    (u->cgroup_realized_mask & mask) == mask)
                        return u->cgroup_path;

                u = UNIT_DEREF(u->slice);
        }

        return NULL;
}

char *unit_default_cgroup_path(Unit *u) {
        _cleanup_free_ char *escaped = NULL, *slice = NULL;
        int r;

        assert(u);

        if (unit_has_name(u, SPECIAL_ROOT_SLICE))
                return strdup(u->manager->cgroup_root);

        if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) {
                r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice);
                if (r < 0)
                        return NULL;
        }

        escaped = cg_escape(u->id);
        if (!escaped)
                return NULL;

        if (slice)
                return strjoin(u->manager->cgroup_root, "/", slice, "/", escaped, NULL);
        else
                return strjoin(u->manager->cgroup_root, "/", escaped, NULL);
}
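
/* Worked example (assuming an empty cgroup root): a unit "foo.service"
 * placed in "bar-baz.slice" maps to "/bar.slice/bar-baz.slice/foo.service",
 * since cg_slice_to_path() expands the dash-separated slice name into
 * nested directories and cg_escape() prefixes names that would collide
 * with kernel-reserved files. */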

int unit_set_cgroup_path(Unit *u, const char *path) {
        _cleanup_free_ char *p = NULL;
        int r;

        assert(u);

        if (path) {
                p = strdup(path);
                if (!p)
                        return -ENOMEM;
        } else
                p = NULL;

        if (streq_ptr(u->cgroup_path, p))
                return 0;

        if (p) {
                r = hashmap_put(u->manager->cgroup_unit, p, u);
                if (r < 0)
                        return r;
        }

        unit_release_cgroup(u);

        u->cgroup_path = p;
        p = NULL;

        return 1;
}

int unit_watch_cgroup(Unit *u) {
        _cleanup_free_ char *populated = NULL;
        int r;

        assert(u);

        if (!u->cgroup_path)
                return 0;

        if (u->cgroup_inotify_wd >= 0)
                return 0;

        /* Only applies to the unified hierarchy */
        r = cg_unified();
        if (r < 0)
795 return log_unit_error_errno(u, r, "Failed detect wether the unified hierarchy is used: %m");
        if (r == 0)
                return 0;

        /* Don't watch the root slice, it's pointless. */
        if (unit_has_name(u, SPECIAL_ROOT_SLICE))
                return 0;

        r = hashmap_ensure_allocated(&u->manager->cgroup_inotify_wd_unit, &trivial_hash_ops);
        if (r < 0)
                return log_oom();

        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.populated", &populated);
        if (r < 0)
                return log_oom();

        u->cgroup_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, populated, IN_MODIFY);
        if (u->cgroup_inotify_wd < 0) {

                if (errno == ENOENT) /* If the directory is already
                                      * gone we don't need to track
                                      * it, so this is not an error */
                        return 0;

                return log_unit_error_errno(u, errno, "Failed to add inotify watch descriptor for control group %s: %m", u->cgroup_path);
        }

        r = hashmap_put(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd), u);
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to add inotify watch descriptor to hash map: %m");

        return 0;
}
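
/* A self-contained sketch (illustration only; the path below is a
 * hypothetical example, and the block is compiled out) of the mechanism
 * unit_watch_cgroup() relies on: on the unified hierarchy, an IN_MODIFY
 * inotify watch on a cgroup's "cgroup.populated" file fires whenever the
 * subtree gains or loses its last process, which is how we get empty
 * notifications without a release agent. The real implementation above
 * feeds the watch into the manager's sd-event loop instead. */
#if 0
#include <stdio.h>
#include <sys/inotify.h>
#include <unistd.h>

static int example_watch_populated(const char *cgroup_dir) {
        char path[4096], buf[4096];
        int fd, wd;

        /* e.g. cgroup_dir = "/sys/fs/cgroup/system.slice/foo.service" */
        snprintf(path, sizeof(path), "%s/cgroup.populated", cgroup_dir);

        fd = inotify_init1(IN_CLOEXEC);
        if (fd < 0)
                return -1;

        wd = inotify_add_watch(fd, path, IN_MODIFY);
        if (wd < 0) {
                close(fd);
                return -1;
        }

        /* Blocks until the populated state flips; a real caller would
         * poll() this fd from an event loop and then re-read the file. */
        (void) read(fd, buf, sizeof(buf));

        close(fd);
        return 0;
}
#endif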

static int unit_create_cgroup(
                Unit *u,
                CGroupMask target_mask,
                CGroupMask enable_mask) {

        CGroupContext *c;
        int r;

        assert(u);

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        if (!u->cgroup_path) {
                _cleanup_free_ char *path = NULL;

                path = unit_default_cgroup_path(u);
                if (!path)
                        return log_oom();

                r = unit_set_cgroup_path(u, path);
                if (r == -EEXIST)
                        return log_unit_error_errno(u, r, "Control group %s exists already.", path);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", path);
        }

        /* First, create our own group */
        r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", u->cgroup_path);

        /* Start watching it */
        (void) unit_watch_cgroup(u);

        /* Enable all controllers we need */
        r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path);
        if (r < 0)
                log_unit_warning_errno(u, r, "Failed to enable controllers on cgroup %s, ignoring: %m", u->cgroup_path);

        /* Keep track that this is now realized */
        u->cgroup_realized = true;
        u->cgroup_realized_mask = target_mask;

        if (u->type != UNIT_SLICE && !c->delegate) {

                /* Then, possibly move things over, but not if
                 * subgroups may contain processes, which is the case
                 * for slice and delegation units. */
                r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
                if (r < 0)
881 log_unit_warning_errno(u, r, "Failed to migrate cgroup from to %s, ignoring: %m", u->cgroup_path);
        }

        return 0;
}

int unit_attach_pids_to_cgroup(Unit *u) {
        int r;
        assert(u);

        r = unit_realize_cgroup(u);
        if (r < 0)
                return r;

        r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->pids, migrate_callback, u);
        if (r < 0)
                return r;

        return 0;
}

static bool unit_has_mask_realized(Unit *u, CGroupMask target_mask) {
        assert(u);

        return u->cgroup_realized && u->cgroup_realized_mask == target_mask;
}

static int unit_find_free_netclass_cgroup(Unit *u, uint32_t *ret) {

        uint32_t start, i;
        Manager *m;

        assert(u);

        m = u->manager;

        i = start = m->cgroup_netclass_registry_last;

        do {
                i++;

                if (!hashmap_get(m->cgroup_netclass_registry, UINT_TO_PTR(i))) {
                        m->cgroup_netclass_registry_last = i;
                        *ret = i;
                        return 0;
                }

                if (i == UINT32_MAX)
                        i = CGROUP_NETCLASS_FIXED_MAX;

        } while (i != start);

        return -ENOBUFS;
}
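
/* The loop above is a simple wrap-around allocator: it starts one past the
 * last ID handed out, skips IDs still present in the registry hashmap, wraps
 * from UINT32_MAX back to CGROUP_NETCLASS_FIXED_MAX (keeping the low range
 * reserved for fixed, user-configured IDs), and gives up with -ENOBUFS only
 * after a full cycle. */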

int unit_add_to_netclass_cgroup(Unit *u) {

        CGroupContext *cc;
        Unit *first;
        void *key;
        int r;

        assert(u);

        cc = unit_get_cgroup_context(u);
        if (!cc)
                return 0;

        switch (cc->netclass_type) {
        case CGROUP_NETCLASS_TYPE_NONE:
                return 0;

        case CGROUP_NETCLASS_TYPE_FIXED:
                u->cgroup_netclass_id = cc->netclass_id;
                break;

        case CGROUP_NETCLASS_TYPE_AUTO:
                /* Allocate a new ID in case it was requested and not done yet */
                if (u->cgroup_netclass_id == 0) {
                        r = unit_find_free_netclass_cgroup(u, &u->cgroup_netclass_id);
                        if (r < 0)
                                return r;

                        log_debug("Dynamically assigned netclass cgroup id %" PRIu32 " to %s", u->cgroup_netclass_id, u->id);
                }

                break;
        }

        r = hashmap_ensure_allocated(&u->manager->cgroup_netclass_registry, &trivial_hash_ops);
        if (r < 0)
                return r;

        key = UINT32_TO_PTR(u->cgroup_netclass_id);
        first = hashmap_get(u->manager->cgroup_netclass_registry, key);

        if (first) {
                LIST_PREPEND(cgroup_netclass, first, u);
                return hashmap_replace(u->manager->cgroup_netclass_registry, key, u);
        }

        return hashmap_put(u->manager->cgroup_netclass_registry, key, u);
}

int unit_remove_from_netclass_cgroup(Unit *u) {

        Unit *head;
        void *key;

        assert(u);

        key = UINT32_TO_PTR(u->cgroup_netclass_id);

        LIST_FIND_HEAD(cgroup_netclass, u, head);
        LIST_REMOVE(cgroup_netclass, head, u);

        if (head)
                return hashmap_replace(u->manager->cgroup_netclass_registry, key, head);

        hashmap_remove(u->manager->cgroup_netclass_registry, key);

        return 0;
}

/* Check if necessary controllers and attributes for a unit are in place.
 *
 * If so, do nothing.
 * If not, create paths, move processes over, and set attributes.
 *
 * Returns 0 on success and < 0 on failure. */
static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
        CGroupMask target_mask, enable_mask;
        int r;

        assert(u);

        if (u->in_cgroup_queue) {
                LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
                u->in_cgroup_queue = false;
        }

        target_mask = unit_get_target_mask(u);
        if (unit_has_mask_realized(u, target_mask))
                return 0;

        /* First, realize parents */
        if (UNIT_ISSET(u->slice)) {
                r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
                if (r < 0)
                        return r;
        }

        /* And then do the real work */
        enable_mask = unit_get_enable_mask(u);
        r = unit_create_cgroup(u, target_mask, enable_mask);
        if (r < 0)
                return r;

        /* Finally, apply the necessary attributes. */
        cgroup_context_apply(unit_get_cgroup_context(u), target_mask, u->cgroup_path, u->cgroup_netclass_id, state);

        return 0;
}

static void unit_add_to_cgroup_queue(Unit *u) {

        if (u->in_cgroup_queue)
                return;

        LIST_PREPEND(cgroup_queue, u->manager->cgroup_queue, u);
        u->in_cgroup_queue = true;
}

unsigned manager_dispatch_cgroup_queue(Manager *m) {
        ManagerState state;
        unsigned n = 0;
        Unit *i;
        int r;

        state = manager_state(m);

        while ((i = m->cgroup_queue)) {
                assert(i->in_cgroup_queue);

                r = unit_realize_cgroup_now(i, state);
                if (r < 0)
                        log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);

                n++;
        }

        return n;
}

static void unit_queue_siblings(Unit *u) {
        Unit *slice;

        /* This adds the siblings of the specified unit and the
         * siblings of all parent units to the cgroup queue. (But
         * neither the specified unit itself nor the parents.) */

        while ((slice = UNIT_DEREF(u->slice))) {
                Iterator i;
                Unit *m;

                SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
                        if (m == u)
                                continue;

                        /* Skip units that have a dependency on the slice
                         * but aren't actually in it. */
                        if (UNIT_DEREF(m->slice) != slice)
                                continue;

                        /* No point in doing cgroup application for units
                         * without active processes. */
                        if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
                                continue;

                        /* If the unit doesn't need any new controllers
                         * and has current ones realized, it doesn't need
                         * any changes. */
                        if (unit_has_mask_realized(m, unit_get_target_mask(m)))
                                continue;

                        unit_add_to_cgroup_queue(m);
                }

                u = slice;
        }
}

int unit_realize_cgroup(Unit *u) {
        assert(u);

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return 0;

        /* So, here's the deal: when realizing the cgroups for this
         * unit, we need to first create all parents, but there's more
         * actually: for the weight-based controllers we also need to
         * make sure that all our siblings (i.e. units that are in the
         * same slice as we are) have cgroups, too. Otherwise, things
         * would become very uneven as each of their processes would
         * get as much resources as all our group together. This call
         * will synchronously create the parent cgroups, but will
         * defer work on the siblings to the next event loop
         * iteration. */

        /* Add all sibling slices to the cgroup queue. */
        unit_queue_siblings(u);

        /* And realize this one now (and apply the values) */
        return unit_realize_cgroup_now(u, manager_state(u->manager));
}

void unit_release_cgroup(Unit *u) {
        assert(u);

        /* Forgets all cgroup details for this cgroup */

        if (u->cgroup_path) {
                (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
                u->cgroup_path = mfree(u->cgroup_path);
        }

        if (u->cgroup_inotify_wd >= 0) {
                if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_inotify_wd) < 0)
                        log_unit_debug_errno(u, errno, "Failed to remove cgroup inotify watch %i for %s, ignoring", u->cgroup_inotify_wd, u->id);

                (void) hashmap_remove(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd));
                u->cgroup_inotify_wd = -1;
        }
}

void unit_prune_cgroup(Unit *u) {
        int r;
        bool is_root_slice;

        assert(u);

        /* Removes the cgroup, if empty and possible, and stops watching it. */

        if (!u->cgroup_path)
                return;

        is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);

        r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
        if (r < 0) {
                log_debug_errno(r, "Failed to destroy cgroup %s, ignoring: %m", u->cgroup_path);
                return;
        }

        if (is_root_slice)
                return;

        unit_release_cgroup(u);

        u->cgroup_realized = false;
        u->cgroup_realized_mask = 0;
}

int unit_search_main_pid(Unit *u, pid_t *ret) {
        _cleanup_fclose_ FILE *f = NULL;
        pid_t pid = 0, npid, mypid;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENXIO;

        r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f);
        if (r < 0)
                return r;

        mypid = getpid();
        while (cg_read_pid(f, &npid) > 0) {
                pid_t ppid;

                if (npid == pid)
                        continue;

                /* Ignore processes that aren't our kids */
                if (get_parent_of_pid(npid, &ppid) >= 0 && ppid != mypid)
                        continue;

                if (pid != 0)
                        /* Dang, there's more than one daemonized PID
                           in this group, so we don't know what process
                           is the main process. */

                        return -ENODATA;

                pid = npid;
        }

        *ret = pid;
        return 0;
}

static int unit_watch_pids_in_path(Unit *u, const char *path) {
        _cleanup_closedir_ DIR *d = NULL;
        _cleanup_fclose_ FILE *f = NULL;
        int ret = 0, r;

        assert(u);
        assert(path);

        r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
        if (r < 0)
                ret = r;
        else {
                pid_t pid;

                while ((r = cg_read_pid(f, &pid)) > 0) {
                        r = unit_watch_pid(u, pid);
                        if (r < 0 && ret >= 0)
                                ret = r;
                }

                if (r < 0 && ret >= 0)
                        ret = r;
        }

        r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
        if (r < 0) {
                if (ret >= 0)
                        ret = r;
        } else {
                char *fn;

                while ((r = cg_read_subgroup(d, &fn)) > 0) {
                        _cleanup_free_ char *p = NULL;

                        p = strjoin(path, "/", fn, NULL);
                        free(fn);

                        if (!p)
                                return -ENOMEM;

                        r = unit_watch_pids_in_path(u, p);
                        if (r < 0 && ret >= 0)
                                ret = r;
                }

                if (r < 0 && ret >= 0)
                        ret = r;
        }

        return ret;
}

int unit_watch_all_pids(Unit *u) {
        assert(u);

        /* Adds all PIDs from our cgroup to the set of PIDs we
         * watch. This is a fallback logic for cases where we do not
         * get reliable cgroup empty notifications: we try to use
         * SIGCHLD as replacement. */

        if (!u->cgroup_path)
                return -ENOENT;

        if (cg_unified() > 0) /* On unified we can use proper notifications */
                return 0;

        return unit_watch_pids_in_path(u, u->cgroup_path);
}

int unit_notify_cgroup_empty(Unit *u) {
        int r;

        assert(u);

        if (!u->cgroup_path)
                return 0;

        r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
        if (r <= 0)
                return r;

        unit_add_to_gc_queue(u);

        if (UNIT_VTABLE(u)->notify_cgroup_empty)
                UNIT_VTABLE(u)->notify_cgroup_empty(u);

        return 0;
}

static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
        Manager *m = userdata;

        assert(s);
        assert(fd >= 0);
        assert(m);

        for (;;) {
                union inotify_event_buffer buffer;
                struct inotify_event *e;
                ssize_t l;

                l = read(fd, &buffer, sizeof(buffer));
                if (l < 0) {
                        if (errno == EINTR || errno == EAGAIN)
                                return 0;

                        return log_error_errno(errno, "Failed to read control group inotify events: %m");
                }

                FOREACH_INOTIFY_EVENT(e, buffer, l) {
                        Unit *u;

                        if (e->wd < 0)
                                /* Queue overflow has no watch descriptor */
                                continue;

                        if (e->mask & IN_IGNORED)
                                /* The watch was just removed */
                                continue;

                        u = hashmap_get(m->cgroup_inotify_wd_unit, INT_TO_PTR(e->wd));
                        if (!u) /* Note that inotify might deliver
                                 * events for a watch even after it
                                 * was removed, because it was queued
                                 * before the removal. Let's ignore
                                 * this here safely. */
                                continue;

                        (void) unit_notify_cgroup_empty(u);
                }
        }
}

int manager_setup_cgroup(Manager *m) {
        _cleanup_free_ char *path = NULL;
        CGroupController c;
        int r, unified;
        char *e;

        assert(m);

        /* 1. Determine hierarchy */
        m->cgroup_root = mfree(m->cgroup_root);
        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
        if (r < 0)
                return log_error_errno(r, "Cannot determine cgroup we are running in: %m");

        /* Chop off the init scope, if we are already located in it */
        e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);

        /* LEGACY: Also chop off the system slice if we are in
         * it. This is to support live upgrades from older systemd
         * versions where PID 1 was moved there. Also see
         * cg_get_root_path(). */
        if (!e && m->running_as == MANAGER_SYSTEM) {
                e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
                if (!e)
                        e = endswith(m->cgroup_root, "/system"); /* even more legacy */
        }
        if (e)
                *e = 0;

        /* And make sure to store away the root value without trailing
         * slash, even for the root dir, so that we can easily prepend
         * it everywhere. */
        while ((e = endswith(m->cgroup_root, "/")))
                *e = 0;

        /* 2. Show data */
        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
        if (r < 0)
                return log_error_errno(r, "Cannot find cgroup mount point: %m");

        unified = cg_unified();
        if (unified < 0)
1399 return log_error_errno(r, "Couldn't determine if we are running in the unified hierarchy: %m");
        if (unified > 0)
                log_debug("Unified cgroup hierarchy is located at %s.", path);
        else
                log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);

        if (!m->test_run) {
                const char *scope_path;

                /* 3. Install agent */
                if (unified) {

                        /* In the unified hierarchy we can get
                         * cgroup empty notifications via inotify. */

                        m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
                        safe_close(m->cgroup_inotify_fd);

                        m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
                        if (m->cgroup_inotify_fd < 0)
                                return log_error_errno(errno, "Failed to create control group inotify object: %m");

                        r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
                        if (r < 0)
                                return log_error_errno(r, "Failed to watch control group inotify object: %m");

                        r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_IDLE - 5);
                        if (r < 0)
                                return log_error_errno(r, "Failed to set priority of inotify event source: %m");

                        (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");

                } else if (m->running_as == MANAGER_SYSTEM) {

                        /* On the legacy hierarchy we only get
                         * notifications via cgroup agents. (Which
                         * isn't really reliable, since it does not
                         * generate events when control groups with
                         * children run empty.) */

                        r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
                        if (r < 0)
                                log_warning_errno(r, "Failed to install release agent, ignoring: %m");
                        else if (r > 0)
                                log_debug("Installed release agent.");
                        else if (r == 0)
                                log_debug("Release agent already installed.");
                }

                /* 4. Make sure we are in the special "init.scope" unit in the root slice. */
                scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
                r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
                if (r < 0)
                        return log_error_errno(r, "Failed to create %s control group: %m", scope_path);

                /* also, move all other userspace processes remaining
                 * in the root cgroup into that scope. */
                r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, false);
                if (r < 0)
                        log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");

                /* 5. And pin it, so that it cannot be unmounted */
                safe_close(m->pin_cgroupfs_fd);
                m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
                if (m->pin_cgroupfs_fd < 0)
                        return log_error_errno(errno, "Failed to open pin file: %m");

                /* 6. Always enable hierarchical support if it exists... */
                if (!unified)
                        (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
        }

        /* 7. Figure out which controllers are supported */
        r = cg_mask_supported(&m->cgroup_supported);
        if (r < 0)
                return log_error_errno(r, "Failed to determine supported controllers: %m");

        for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
1477 log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & c));

        return 0;
}

void manager_shutdown_cgroup(Manager *m, bool delete) {
        assert(m);

        /* We can't really delete the group, since we are in it. But
         * let's trim it. */
        if (delete && m->cgroup_root)
                (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);

        m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit);

        m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
        m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);

        m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);

        m->cgroup_root = mfree(m->cgroup_root);
}

Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
        char *p;
        Unit *u;

        assert(m);
        assert(cgroup);

        u = hashmap_get(m->cgroup_unit, cgroup);
        if (u)
                return u;

        p = strdupa(cgroup);
        for (;;) {
                char *e;

                e = strrchr(p, '/');
                if (!e || e == p)
                        return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);

                *e = 0;

                u = hashmap_get(m->cgroup_unit, p);
                if (u)
                        return u;
        }
}

Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) {
        _cleanup_free_ char *cgroup = NULL;
        int r;

        assert(m);

        if (pid <= 0)
                return NULL;

        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
        if (r < 0)
                return NULL;

        return manager_get_unit_by_cgroup(m, cgroup);
}

Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
        Unit *u;

        assert(m);

        if (pid <= 0)
                return NULL;

        if (pid == 1)
                return hashmap_get(m->units, SPECIAL_INIT_SCOPE);

        u = hashmap_get(m->watch_pids1, PID_TO_PTR(pid));
        if (u)
                return u;

        u = hashmap_get(m->watch_pids2, PID_TO_PTR(pid));
        if (u)
                return u;

        return manager_get_unit_by_pid_cgroup(m, pid);
}

int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
        Unit *u;

        assert(m);
        assert(cgroup);

        u = manager_get_unit_by_cgroup(m, cgroup);
        if (!u)
                return 0;

        return unit_notify_cgroup_empty(u);
}

int unit_get_memory_current(Unit *u, uint64_t *ret) {
        _cleanup_free_ char *v = NULL;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENODATA;

        if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
                return -ENODATA;

        if (cg_unified() <= 0)
                r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
        else
                r = cg_get_attribute("memory", u->cgroup_path, "memory.current", &v);
        if (r == -ENOENT)
                return -ENODATA;
        if (r < 0)
                return r;

        return safe_atou64(v, ret);
}

int unit_get_tasks_current(Unit *u, uint64_t *ret) {
        _cleanup_free_ char *v = NULL;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENODATA;

        if ((u->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0)
                return -ENODATA;

        r = cg_get_attribute("pids", u->cgroup_path, "pids.current", &v);
        if (r == -ENOENT)
                return -ENODATA;
        if (r < 0)
                return r;

        return safe_atou64(v, ret);
}

static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
        _cleanup_free_ char *v = NULL;
        uint64_t ns;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENODATA;

        if ((u->cgroup_realized_mask & CGROUP_MASK_CPUACCT) == 0)
                return -ENODATA;

        r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);
        if (r == -ENOENT)
                return -ENODATA;
        if (r < 0)
                return r;

        r = safe_atou64(v, &ns);
        if (r < 0)
                return r;

        *ret = ns;
        return 0;
}

int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
        nsec_t ns;
        int r;

        r = unit_get_cpu_usage_raw(u, &ns);
        if (r < 0)
                return r;

        if (ns > u->cpuacct_usage_base)
                ns -= u->cpuacct_usage_base;
        else
                ns = 0;

        *ret = ns;
        return 0;
}

int unit_reset_cpu_usage(Unit *u) {
        nsec_t ns;
        int r;

        assert(u);

        r = unit_get_cpu_usage_raw(u, &ns);
        if (r < 0) {
                u->cpuacct_usage_base = 0;
                return r;
        }

        u->cpuacct_usage_base = ns;
        return 0;
}
1685
1686 bool unit_cgroup_delegate(Unit *u) {
1687 CGroupContext *c;
1688
1689 assert(u);
1690
1691 c = unit_get_cgroup_context(u);
1692 if (!c)
1693 return false;
1694
1695 return c->delegate;
1696 }
1697
1698 void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
1699 assert(u);
1700
1701 if (!UNIT_HAS_CGROUP_CONTEXT(u))
1702 return;
1703
1704 if (m == 0)
1705 return;
1706
1707 if ((u->cgroup_realized_mask & m) == 0)
1708 return;
1709
1710 u->cgroup_realized_mask &= ~m;
1711 unit_add_to_cgroup_queue(u);
1712 }
1713
1714 void manager_invalidate_startup_units(Manager *m) {
1715 Iterator i;
1716 Unit *u;
1717
1718 assert(m);
1719
1720 SET_FOREACH(u, m->startup_units, i)
1721 unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_BLKIO);
1722 }
1723
1724 static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
1725 [CGROUP_AUTO] = "auto",
1726 [CGROUP_CLOSED] = "closed",
1727 [CGROUP_STRICT] = "strict",
1728 };
1729
1730 DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);