src/basic/cgroup-util.c

   1 /***
   2   This file is part of systemd.
   3
   4   Copyright 2010 Lennart Poettering
   5
   6   systemd is free software; you can redistribute it and/or modify it
   7   under the terms of the GNU Lesser General Public License as published by
   8   the Free Software Foundation; either version 2.1 of the License, or
   9   (at your option) any later version.
  10
  11   systemd is distributed in the hope that it will be useful, but
  12   WITHOUT ANY WARRANTY; without even the implied warranty of
  13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14   Lesser General Public License for more details.
  15
  16   You should have received a copy of the GNU Lesser General Public License
  17   along with systemd; If not, see <http://www.gnu.org/licenses/>.
  18 ***/
  19
  20 #include <dirent.h>
  21 #include <errno.h>
  22 #include <ftw.h>
  23 #include <limits.h>
  24 #include <signal.h>
  25 #include <stddef.h>
  26 #include <stdlib.h>
  27 #include <string.h>
  28 #include <sys/stat.h>
  29 #include <sys/statfs.h>
  30 #include <sys/types.h>
  31 #include <sys/xattr.h>
  32 #include <unistd.h>
  33
  34 #include "alloc-util.h"
  35 #include "cgroup-util.h"
  36 #include "def.h"
  37 #include "dirent-util.h"
  38 #include "extract-word.h"
  39 #include "fd-util.h"
  40 #include "fileio.h"
  41 #include "format-util.h"
  42 #include "fs-util.h"
  43 #include "log.h"
  44 #include "login-util.h"
  45 #include "macro.h"
  46 #include "missing.h"
  47 #include "mkdir.h"
  48 #include "parse-util.h"
  49 #include "path-util.h"
  50 #include "proc-cmdline.h"
  51 #include "process-util.h"
  52 #include "set.h"
  53 #include "special.h"
  54 #include "stat-util.h"
  55 #include "stdio-util.h"
  56 #include "string-table.h"
  57 #include "string-util.h"
  58 #include "unit-name.h"
  59 #include "user-util.h"
  60
  61 int cg_enumerate_processes(const char *controller, const char *path, FILE **_f) {
  62         _cleanup_free_ char *fs = NULL;
  63         FILE *f;
  64         int r;
  65
  66         assert(_f);
  67
  68         r = cg_get_path(controller, path, "cgroup.procs", &fs);
  69         if (r < 0)
  70                 return r;
  71
  72         f = fopen(fs, "re");
  73         if (!f)
  74                 return -errno;
  75
  76         *_f = f;
  77         return 0;
  78 }
  79
  80 int cg_read_pid(FILE *f, pid_t *_pid) {
  81         unsigned long ul;
  82
  83         /* Note that the cgroup.procs might contain duplicates! See
  84          * cgroups.txt for details. */
  85
  86         assert(f);
  87         assert(_pid);
  88
  89         errno = 0;
  90         if (fscanf(f, "%lu", &ul) != 1) {
  91
  92                 if (feof(f))
  93                         return 0;
  94
  95                 return errno > 0 ? -errno : -EIO;
  96         }
  97
  98         if (ul <= 0)
  99                 return -EIO;
 100
 101         *_pid = (pid_t) ul;
 102         return 1;
 103 }
 104
 105 int cg_read_event(const char *controller, const char *path, const char *event,
 106                   char **val)
 107 {
 108         _cleanup_free_ char *events = NULL, *content = NULL;
 109         char *p, *line;
 110         int r;
 111
 112         r = cg_get_path(controller, path, "cgroup.events", &events);
 113         if (r < 0)
 114                 return r;
 115
 116         r = read_full_file(events, &content, NULL);
 117         if (r < 0)
 118                 return r;
 119
 120         p = content;
 121         while ((line = strsep(&p, "\n"))) {
 122                 char *key;
 123
 124                 key = strsep(&line, " ");
 125                 if (!key || !line)
 126                         return -EINVAL;
 127
 128                 if (strcmp(key, event))
 129                         continue;
 130
 131                 *val = strdup(line);
 132                 return 0;
 133         }
 134
 135         return -ENOENT;
 136 }
 137
 138 bool cg_ns_supported(void) {
 139         static thread_local int enabled = -1;
 140
 141         if (enabled >= 0)
 142                 return enabled;
 143
 144         if (access("/proc/self/ns/cgroup", F_OK) == 0)
 145                 enabled = 1;
 146         else
 147                 enabled = 0;
 148
 149         return enabled;
 150 }
 151
 152 int cg_enumerate_subgroups(const char *controller, const char *path, DIR **_d) {
 153         _cleanup_free_ char *fs = NULL;
 154         int r;
 155         DIR *d;
 156
 157         assert(_d);
 158
 159         /* This is not recursive! */
 160
 161         r = cg_get_path(controller, path, NULL, &fs);
 162         if (r < 0)
 163                 return r;
 164
 165         d = opendir(fs);
 166         if (!d)
 167                 return -errno;
 168
 169         *_d = d;
 170         return 0;
 171 }
 172
 173 int cg_read_subgroup(DIR *d, char **fn) {
 174         struct dirent *de;
 175
 176         assert(d);
 177         assert(fn);
 178
 179         FOREACH_DIRENT_ALL(de, d, return -errno) {
 180                 char *b;
 181
 182                 if (de->d_type != DT_DIR)
 183                         continue;
 184
 185                 if (dot_or_dot_dot(de->d_name))
 186                         continue;
 187
 188                 b = strdup(de->d_name);
 189                 if (!b)
 190                         return -ENOMEM;
 191
 192                 *fn = b;
 193                 return 1;
 194         }
 195
 196         return 0;
 197 }
 198
 199 int cg_rmdir(const char *controller, const char *path) {
 200         _cleanup_free_ char *p = NULL;
 201         int r;
 202
 203         r = cg_get_path(controller, path, NULL, &p);
 204         if (r < 0)
 205                 return r;
 206
 207         r = rmdir(p);
 208         if (r < 0 && errno != ENOENT)
 209                 return -errno;
 210
 211         return 0;
 212 }
 213
 214 int cg_kill(
 215                 const char *controller,
 216                 const char *path,
 217                 int sig,
 218                 CGroupFlags flags,
 219                 Set *s,
 220                 cg_kill_log_func_t log_kill,
 221                 void *userdata) {
 222
 223         _cleanup_set_free_ Set *allocated_set = NULL;
 224         bool done = false;
 225         int r, ret = 0;
 226         pid_t my_pid;
 227
 228         assert(sig >= 0);
 229
 230          /* Don't send SIGCONT twice. Also, SIGKILL always works even when process is suspended, hence don't send
 231           * SIGCONT on SIGKILL. */
 232         if (IN_SET(sig, SIGCONT, SIGKILL))
 233                 flags &= ~CGROUP_SIGCONT;
 234
 235         /* This goes through the tasks list and kills them all. This
 236          * is repeated until no further processes are added to the
 237          * tasks list, to properly handle forking processes */
 238
 239         if (!s) {
 240                 s = allocated_set = set_new(NULL);
 241                 if (!s)
 242                         return -ENOMEM;
 243         }
 244
 245         my_pid = getpid();
 246
 247         do {
 248                 _cleanup_fclose_ FILE *f = NULL;
 249                 pid_t pid = 0;
 250                 done = true;
 251
 252                 r = cg_enumerate_processes(controller, path, &f);
 253                 if (r < 0) {
 254                         if (ret >= 0 && r != -ENOENT)
 255                                 return r;
 256
 257                         return ret;
 258                 }
 259
 260                 while ((r = cg_read_pid(f, &pid)) > 0) {
 261
 262                         if ((flags & CGROUP_IGNORE_SELF) && pid == my_pid)
 263                                 continue;
 264
 265                         if (set_get(s, PID_TO_PTR(pid)) == PID_TO_PTR(pid))
 266                                 continue;
 267
 268                         if (log_kill)
 269                                 log_kill(pid, sig, userdata);
 270
 271                         /* If we haven't killed this process yet, kill
 272                          * it */
 273                         if (kill(pid, sig) < 0) {
 274                                 if (ret >= 0 && errno != ESRCH)
 275                                         ret = -errno;
 276                         } else {
 277                                 if (flags & CGROUP_SIGCONT)
 278                                         (void) kill(pid, SIGCONT);
 279
 280                                 if (ret == 0)
 281                                         ret = 1;
 282                         }
 283
 284                         done = false;
 285
 286                         r = set_put(s, PID_TO_PTR(pid));
 287                         if (r < 0) {
 288                                 if (ret >= 0)
 289                                         return r;
 290
 291                                 return ret;
 292                         }
 293                 }
 294
 295                 if (r < 0) {
 296                         if (ret >= 0)
 297                                 return r;
 298
 299                         return ret;
 300                 }
 301
 302                 /* To avoid racing against processes which fork
 303                  * quicker than we can kill them we repeat this until
 304                  * no new pids need to be killed. */
 305
 306         } while (!done);
 307
 308         return ret;
 309 }
 310
 311 int cg_kill_recursive(
 312                 const char *controller,
 313                 const char *path,
 314                 int sig,
 315                 CGroupFlags flags,
 316                 Set *s,
 317                 cg_kill_log_func_t log_kill,
 318                 void *userdata) {
 319
 320         _cleanup_set_free_ Set *allocated_set = NULL;
 321         _cleanup_closedir_ DIR *d = NULL;
 322         int r, ret;
 323         char *fn;
 324
 325         assert(path);
 326         assert(sig >= 0);
 327
 328         if (!s) {
 329                 s = allocated_set = set_new(NULL);
 330                 if (!s)
 331                         return -ENOMEM;
 332         }
 333
 334         ret = cg_kill(controller, path, sig, flags, s, log_kill, userdata);
 335
 336         r = cg_enumerate_subgroups(controller, path, &d);
 337         if (r < 0) {
 338                 if (ret >= 0 && r != -ENOENT)
 339                         return r;
 340
 341                 return ret;
 342         }
 343
 344         while ((r = cg_read_subgroup(d, &fn)) > 0) {
 345                 _cleanup_free_ char *p = NULL;
 346
 347                 p = strjoin(path, "/", fn);
 348                 free(fn);
 349                 if (!p)
 350                         return -ENOMEM;
 351
 352                 r = cg_kill_recursive(controller, p, sig, flags, s, log_kill, userdata);
 353                 if (r != 0 && ret >= 0)
 354                         ret = r;
 355         }
 356         if (ret >= 0 && r < 0)
 357                 ret = r;
 358
 359         if (flags & CGROUP_REMOVE) {
 360                 r = cg_rmdir(controller, path);
 361                 if (r < 0 && ret >= 0 && r != -ENOENT && r != -EBUSY)
 362                         return r;
 363         }
 364
 365         return ret;
 366 }
 367
 368 int cg_migrate(
 369                 const char *cfrom,
 370                 const char *pfrom,
 371                 const char *cto,
 372                 const char *pto,
 373                 CGroupFlags flags) {
 374
 375         bool done = false;
 376         _cleanup_set_free_ Set *s = NULL;
 377         int r, ret = 0;
 378         pid_t my_pid;
 379
 380         assert(cfrom);
 381         assert(pfrom);
 382         assert(cto);
 383         assert(pto);
 384
 385         s = set_new(NULL);
 386         if (!s)
 387                 return -ENOMEM;
 388
 389         my_pid = getpid();
 390
 391         do {
 392                 _cleanup_fclose_ FILE *f = NULL;
 393                 pid_t pid = 0;
 394                 done = true;
 395
 396                 r = cg_enumerate_processes(cfrom, pfrom, &f);
 397                 if (r < 0) {
 398                         if (ret >= 0 && r != -ENOENT)
 399                                 return r;
 400
 401                         return ret;
 402                 }
 403
 404                 while ((r = cg_read_pid(f, &pid)) > 0) {
 405
 406                         /* This might do weird stuff if we aren't a
 407                          * single-threaded program. However, we
 408                          * luckily know we are not */
 409                         if ((flags & CGROUP_IGNORE_SELF) && pid == my_pid)
 410                                 continue;
 411
 412                         if (set_get(s, PID_TO_PTR(pid)) == PID_TO_PTR(pid))
 413                                 continue;
 414
 415                         /* Ignore kernel threads. Since they can only
 416                          * exist in the root cgroup, we only check for
 417                          * them there. */
 418                         if (cfrom &&
 419                             (isempty(pfrom) || path_equal(pfrom, "/")) &&
 420                             is_kernel_thread(pid) > 0)
 421                                 continue;
 422
 423                         r = cg_attach(cto, pto, pid);
 424                         if (r < 0) {
 425                                 if (ret >= 0 && r != -ESRCH)
 426                                         ret = r;
 427                         } else if (ret == 0)
 428                                 ret = 1;
 429
 430                         done = false;
 431
 432                         r = set_put(s, PID_TO_PTR(pid));
 433                         if (r < 0) {
 434                                 if (ret >= 0)
 435                                         return r;
 436
 437                                 return ret;
 438                         }
 439                 }
 440
 441                 if (r < 0) {
 442                         if (ret >= 0)
 443                                 return r;
 444
 445                         return ret;
 446                 }
 447         } while (!done);
 448
 449         return ret;
 450 }
 451
 452 int cg_migrate_recursive(
 453                 const char *cfrom,
 454                 const char *pfrom,
 455                 const char *cto,
 456                 const char *pto,
 457                 CGroupFlags flags) {
 458
 459         _cleanup_closedir_ DIR *d = NULL;
 460         int r, ret = 0;
 461         char *fn;
 462
 463         assert(cfrom);
 464         assert(pfrom);
 465         assert(cto);
 466         assert(pto);
 467
 468         ret = cg_migrate(cfrom, pfrom, cto, pto, flags);
 469
 470         r = cg_enumerate_subgroups(cfrom, pfrom, &d);
 471         if (r < 0) {
 472                 if (ret >= 0 && r != -ENOENT)
 473                         return r;
 474
 475                 return ret;
 476         }
 477
 478         while ((r = cg_read_subgroup(d, &fn)) > 0) {
 479                 _cleanup_free_ char *p = NULL;
 480
 481                 p = strjoin(pfrom, "/", fn);
 482                 free(fn);
 483                 if (!p)
 484                         return -ENOMEM;
 485
 486                 r = cg_migrate_recursive(cfrom, p, cto, pto, flags);
 487                 if (r != 0 && ret >= 0)
 488                         ret = r;
 489         }
 490
 491         if (r < 0 && ret >= 0)
 492                 ret = r;
 493
 494         if (flags & CGROUP_REMOVE) {
 495                 r = cg_rmdir(cfrom, pfrom);
 496                 if (r < 0 && ret >= 0 && r != -ENOENT && r != -EBUSY)
 497                         return r;
 498         }
 499
 500         return ret;
 501 }
 502
 503 int cg_migrate_recursive_fallback(
 504                 const char *cfrom,
 505                 const char *pfrom,
 506                 const char *cto,
 507                 const char *pto,
 508                 CGroupFlags flags) {
 509
 510         int r;
 511
 512         assert(cfrom);
 513         assert(pfrom);
 514         assert(cto);
 515         assert(pto);
 516
 517         r = cg_migrate_recursive(cfrom, pfrom, cto, pto, flags);
 518         if (r < 0) {
 519                 char prefix[strlen(pto) + 1];
 520
 521                 /* This didn't work? Then let's try all prefixes of the destination */
 522
 523                 PATH_FOREACH_PREFIX(prefix, pto) {
 524                         int q;
 525
 526                         q = cg_migrate_recursive(cfrom, pfrom, cto, prefix, flags);
 527                         if (q >= 0)
 528                                 return q;
 529                 }
 530         }
 531
 532         return r;
 533 }
 534
 535 static const char *controller_to_dirname(const char *controller) {
 536         const char *e;
 537
 538         assert(controller);
 539
 540         /* Converts a controller name to the directory name below
 541          * /sys/fs/cgroup/ we want to mount it to. Effectively, this
 542          * just cuts off the name= prefixed used for named
 543          * hierarchies, if it is specified. */
 544
 545         if (streq(controller, SYSTEMD_CGROUP_CONTROLLER))
 546                 controller = SYSTEMD_CGROUP_CONTROLLER_LEGACY;
 547
 548         e = startswith(controller, "name=");
 549         if (e)
 550                 return e;
 551
 552         return controller;
 553 }
 554
 555 static int join_path_legacy(const char *controller, const char *path, const char *suffix, char **fs) {
 556         const char *dn;
 557         char *t = NULL;
 558
 559         assert(fs);
 560         assert(controller);
 561
 562         dn = controller_to_dirname(controller);
 563
 564         if (isempty(path) && isempty(suffix))
 565                 t = strappend("/sys/fs/cgroup/", dn);
 566         else if (isempty(path))
 567                 t = strjoin("/sys/fs/cgroup/", dn, "/", suffix);
 568         else if (isempty(suffix))
 569                 t = strjoin("/sys/fs/cgroup/", dn, "/", path);
 570         else
 571                 t = strjoin("/sys/fs/cgroup/", dn, "/", path, "/", suffix);
 572         if (!t)
 573                 return -ENOMEM;
 574
 575         *fs = t;
 576         return 0;
 577 }
 578
 579 static int join_path_unified(const char *path, const char *suffix, char **fs) {
 580         char *t;
 581
 582         assert(fs);
 583
 584         if (isempty(path) && isempty(suffix))
 585                 t = strdup("/sys/fs/cgroup");
 586         else if (isempty(path))
 587                 t = strappend("/sys/fs/cgroup/", suffix);
 588         else if (isempty(suffix))
 589                 t = strappend("/sys/fs/cgroup/", path);
 590         else
 591                 t = strjoin("/sys/fs/cgroup/", path, "/", suffix);
 592         if (!t)
 593                 return -ENOMEM;
 594
 595         *fs = t;
 596         return 0;
 597 }
 598
 599 int cg_get_path(const char *controller, const char *path, const char *suffix, char **fs) {
 600         int r;
 601
 602         assert(fs);
 603
 604         if (!controller) {
 605                 char *t;
 606
 607                 /* If no controller is specified, we return the path
 608                  * *below* the controllers, without any prefix. */
 609
 610                 if (!path && !suffix)
 611                         return -EINVAL;
 612
 613                 if (!suffix)
 614                         t = strdup(path);
 615                 else if (!path)
 616                         t = strdup(suffix);
 617                 else
 618                         t = strjoin(path, "/", suffix);
 619                 if (!t)
 620                         return -ENOMEM;
 621
 622                 *fs = path_kill_slashes(t);
 623                 return 0;
 624         }
 625
 626         if (!cg_controller_is_valid(controller))
 627                 return -EINVAL;
 628
 629         if (cg_all_unified())
 630                 r = join_path_unified(path, suffix, fs);
 631         else
 632                 r = join_path_legacy(controller, path, suffix, fs);
 633         if (r < 0)
 634                 return r;
 635
 636         path_kill_slashes(*fs);
 637         return 0;
 638 }
 639
 640 static int controller_is_accessible(const char *controller) {
 641
 642         assert(controller);
 643
 644         /* Checks whether a specific controller is accessible,
 645          * i.e. its hierarchy mounted. In the unified hierarchy all
 646          * controllers are considered accessible, except for the named
 647          * hierarchies */
 648
 649         if (!cg_controller_is_valid(controller))
 650                 return -EINVAL;
 651
 652         if (cg_all_unified()) {
 653                 /* We don't support named hierarchies if we are using
 654                  * the unified hierarchy. */
 655
 656                 if (streq(controller, SYSTEMD_CGROUP_CONTROLLER))
 657                         return 0;
 658
 659                 if (startswith(controller, "name="))
 660                         return -EOPNOTSUPP;
 661
 662         } else {
 663                 const char *cc, *dn;
 664
 665                 dn = controller_to_dirname(controller);
 666                 cc = strjoina("/sys/fs/cgroup/", dn);
 667
 668                 if (laccess(cc, F_OK) < 0)
 669                         return -errno;
 670         }
 671
 672         return 0;
 673 }
 674
 675 int cg_get_path_and_check(const char *controller, const char *path, const char *suffix, char **fs) {
 676         int r;
 677
 678         assert(controller);
 679         assert(fs);
 680
 681         /* Check if the specified controller is actually accessible */
 682         r = controller_is_accessible(controller);
 683         if (r < 0)
 684                 return r;
 685
 686         return cg_get_path(controller, path, suffix, fs);
 687 }
 688
 689 static int trim_cb(const char *path, const struct stat *sb, int typeflag, struct FTW *ftwbuf) {
 690         assert(path);
 691         assert(sb);
 692         assert(ftwbuf);
 693
 694         if (typeflag != FTW_DP)
 695                 return 0;
 696
 697         if (ftwbuf->level < 1)
 698                 return 0;
 699
 700         (void) rmdir(path);
 701         return 0;
 702 }
 703
 704 int cg_trim(const char *controller, const char *path, bool delete_root) {
 705         _cleanup_free_ char *fs = NULL;
 706         int r = 0;
 707
 708         assert(path);
 709
 710         r = cg_get_path(controller, path, NULL, &fs);
 711         if (r < 0)
 712                 return r;
 713
 714         errno = 0;
 715         if (nftw(fs, trim_cb, 64, FTW_DEPTH|FTW_MOUNT|FTW_PHYS) != 0) {
 716                 if (errno == ENOENT)
 717                         r = 0;
 718                 else if (errno > 0)
 719                         r = -errno;
 720                 else
 721                         r = -EIO;
 722         }
 723
 724         if (delete_root) {
 725                 if (rmdir(fs) < 0 && errno != ENOENT)
 726                         return -errno;
 727         }
 728
 729         return r;
 730 }
 731
 732 int cg_create(const char *controller, const char *path) {
 733         _cleanup_free_ char *fs = NULL;
 734         int r;
 735
 736         r = cg_get_path_and_check(controller, path, NULL, &fs);
 737         if (r < 0)
 738                 return r;
 739
 740         r = mkdir_parents(fs, 0755);
 741         if (r < 0)
 742                 return r;
 743
 744         if (mkdir(fs, 0755) < 0) {
 745
 746                 if (errno == EEXIST)
 747                         return 0;
 748
 749                 return -errno;
 750         }
 751
 752         return 1;
 753 }
 754
 755 int cg_create_and_attach(const char *controller, const char *path, pid_t pid) {
 756         int r, q;
 757
 758         assert(pid >= 0);
 759
 760         r = cg_create(controller, path);
 761         if (r < 0)
 762                 return r;
 763
 764         q = cg_attach(controller, path, pid);
 765         if (q < 0)
 766                 return q;
 767
 768         /* This does not remove the cgroup on failure */
 769         return r;
 770 }
 771
 772 int cg_attach(const char *controller, const char *path, pid_t pid) {
 773         _cleanup_free_ char *fs = NULL;
 774         char c[DECIMAL_STR_MAX(pid_t) + 2];
 775         int r;
 776
 777         assert(path);
 778         assert(pid >= 0);
 779
 780         r = cg_get_path_and_check(controller, path, "cgroup.procs", &fs);
 781         if (r < 0)
 782                 return r;
 783
 784         if (pid == 0)
 785                 pid = getpid();
 786
 787         xsprintf(c, PID_FMT "\n", pid);
 788
 789         return write_string_file(fs, c, 0);
 790 }
 791
 792 int cg_attach_fallback(const char *controller, const char *path, pid_t pid) {
 793         int r;
 794
 795         assert(controller);
 796         assert(path);
 797         assert(pid >= 0);
 798
 799         r = cg_attach(controller, path, pid);
 800         if (r < 0) {
 801                 char prefix[strlen(path) + 1];
 802
 803                 /* This didn't work? Then let's try all prefixes of
 804                  * the destination */
 805
 806                 PATH_FOREACH_PREFIX(prefix, path) {
 807                         int q;
 808
 809                         q = cg_attach(controller, prefix, pid);
 810                         if (q >= 0)
 811                                 return q;
 812                 }
 813         }
 814
 815         return r;
 816 }
 817
 818 int cg_set_group_access(
 819                 const char *controller,
 820                 const char *path,
 821                 mode_t mode,
 822                 uid_t uid,
 823                 gid_t gid) {
 824
 825         _cleanup_free_ char *fs = NULL;
 826         int r;
 827
 828         if (mode == MODE_INVALID && uid == UID_INVALID && gid == GID_INVALID)
 829                 return 0;
 830
 831         if (mode != MODE_INVALID)
 832                 mode &= 0777;
 833
 834         r = cg_get_path(controller, path, NULL, &fs);
 835         if (r < 0)
 836                 return r;
 837
 838         return chmod_and_chown(fs, mode, uid, gid);
 839 }
 840
 841 int cg_set_task_access(
 842                 const char *controller,
 843                 const char *path,
 844                 mode_t mode,
 845                 uid_t uid,
 846                 gid_t gid) {
 847
 848         _cleanup_free_ char *fs = NULL, *procs = NULL;
 849         int r;
 850
 851         assert(path);
 852
 853         if (mode == MODE_INVALID && uid == UID_INVALID && gid == GID_INVALID)
 854                 return 0;
 855
 856         if (mode != MODE_INVALID)
 857                 mode &= 0666;
 858
 859         r = cg_get_path(controller, path, "cgroup.procs", &fs);
 860         if (r < 0)
 861                 return r;
 862
 863         r = chmod_and_chown(fs, mode, uid, gid);
 864         if (r < 0)
 865                 return r;
 866
 867         if (cg_unified(controller))
 868                 return 0;
 869
 870         /* Compatibility, Always keep values for "tasks" in sync with
 871          * "cgroup.procs" */
 872         if (cg_get_path(controller, path, "tasks", &procs) >= 0)
 873                 (void) chmod_and_chown(procs, mode, uid, gid);
 874
 875         return 0;
 876 }
 877
 878 int cg_set_xattr(const char *controller, const char *path, const char *name, const void *value, size_t size, int flags) {
 879         _cleanup_free_ char *fs = NULL;
 880         int r;
 881
 882         assert(path);
 883         assert(name);
 884         assert(value || size <= 0);
 885
 886         r = cg_get_path(controller, path, NULL, &fs);
 887         if (r < 0)
 888                 return r;
 889
 890         if (setxattr(fs, name, value, size, flags) < 0)
 891                 return -errno;
 892
 893         return 0;
 894 }
 895
 896 int cg_get_xattr(const char *controller, const char *path, const char *name, void *value, size_t size) {
 897         _cleanup_free_ char *fs = NULL;
 898         ssize_t n;
 899         int r;
 900
 901         assert(path);
 902         assert(name);
 903
 904         r = cg_get_path(controller, path, NULL, &fs);
 905         if (r < 0)
 906                 return r;
 907
 908         n = getxattr(fs, name, value, size);
 909         if (n < 0)
 910                 return -errno;
 911
 912         return (int) n;
 913 }
 914
 915 int cg_pid_get_path(const char *controller, pid_t pid, char **path) {
 916         _cleanup_fclose_ FILE *f = NULL;
 917         char line[LINE_MAX];
 918         const char *fs, *controller_str;
 919         size_t cs = 0;
 920         bool unified;
 921
 922         assert(path);
 923         assert(pid >= 0);
 924
 925         if (controller) {
 926                 if (!cg_controller_is_valid(controller))
 927                         return -EINVAL;
 928         } else
 929                 controller = SYSTEMD_CGROUP_CONTROLLER;
 930
 931         unified = cg_unified(controller);
 932         if (!unified) {
 933                 if (streq(controller, SYSTEMD_CGROUP_CONTROLLER))
 934                         controller_str = SYSTEMD_CGROUP_CONTROLLER_LEGACY;
 935                 else
 936                         controller_str = controller;
 937
 938                 cs = strlen(controller_str);
 939         }
 940
 941         fs = procfs_file_alloca(pid, "cgroup");
 942         f = fopen(fs, "re");
 943         if (!f)
 944                 return errno == ENOENT ? -ESRCH : -errno;
 945
 946         FOREACH_LINE(line, f, return -errno) {
 947                 char *e, *p;
 948
 949                 truncate_nl(line);
 950
 951                 if (unified) {
 952                         e = startswith(line, "0:");
 953                         if (!e)
 954                                 continue;
 955
 956                         e = strchr(e, ':');
 957                         if (!e)
 958                                 continue;
 959                 } else {
 960                         char *l;
 961                         size_t k;
 962                         const char *word, *state;
 963                         bool found = false;
 964
 965                         l = strchr(line, ':');
 966                         if (!l)
 967                                 continue;
 968
 969                         l++;
 970                         e = strchr(l, ':');
 971                         if (!e)
 972                                 continue;
 973
 974                         *e = 0;
 975                         FOREACH_WORD_SEPARATOR(word, k, l, ",", state) {
 976                                 if (k == cs && memcmp(word, controller_str, cs) == 0) {
 977                                         found = true;
 978                                         break;
 979                                 }
 980                         }
 981
 982                         if (!found)
 983                                 continue;
 984                 }
 985
 986                 p = strdup(e + 1);
 987                 if (!p)
 988                         return -ENOMEM;
 989
 990                 *path = p;
 991                 return 0;
 992         }
 993
 994         return -ENODATA;
 995 }
 996
 997 int cg_install_release_agent(const char *controller, const char *agent) {
 998         _cleanup_free_ char *fs = NULL, *contents = NULL;
 999         const char *sc;
1000         int r;
1001
1002         assert(agent);
1003
1004         if (cg_unified(controller)) /* doesn't apply to unified hierarchy */
1005                 return -EOPNOTSUPP;
1006
1007         r = cg_get_path(controller, NULL, "release_agent", &fs);
1008         if (r < 0)
1009                 return r;
1010
1011         r = read_one_line_file(fs, &contents);
1012         if (r < 0)
1013                 return r;
1014
1015         sc = strstrip(contents);
1016         if (isempty(sc)) {
1017                 r = write_string_file(fs, agent, 0);
1018                 if (r < 0)
1019                         return r;
1020         } else if (!path_equal(sc, agent))
1021                 return -EEXIST;
1022
1023         fs = mfree(fs);
1024         r = cg_get_path(controller, NULL, "notify_on_release", &fs);
1025         if (r < 0)
1026                 return r;
1027
1028         contents = mfree(contents);
1029         r = read_one_line_file(fs, &contents);
1030         if (r < 0)
1031                 return r;
1032
1033         sc = strstrip(contents);
1034         if (streq(sc, "0")) {
1035                 r = write_string_file(fs, "1", 0);
1036                 if (r < 0)
1037                         return r;
1038
1039                 return 1;
1040         }
1041
1042         if (!streq(sc, "1"))
1043                 return -EIO;
1044
1045         return 0;
1046 }
1047
1048 int cg_uninstall_release_agent(const char *controller) {
1049         _cleanup_free_ char *fs = NULL;
1050         int r;
1051
1052         if (cg_unified(controller)) /* Doesn't apply to unified hierarchy */
1053                 return -EOPNOTSUPP;
1054
1055         r = cg_get_path(controller, NULL, "notify_on_release", &fs);
1056         if (r < 0)
1057                 return r;
1058
1059         r = write_string_file(fs, "0", 0);
1060         if (r < 0)
1061                 return r;
1062
1063         fs = mfree(fs);
1064
1065         r = cg_get_path(controller, NULL, "release_agent", &fs);
1066         if (r < 0)
1067                 return r;
1068
1069         r = write_string_file(fs, "", 0);
1070         if (r < 0)
1071                 return r;
1072
1073         return 0;
1074 }
1075
1076 int cg_is_empty(const char *controller, const char *path) {
1077         _cleanup_fclose_ FILE *f = NULL;
1078         pid_t pid;
1079         int r;
1080
1081         assert(path);
1082
1083         r = cg_enumerate_processes(controller, path, &f);
1084         if (r == -ENOENT)
1085                 return 1;
1086         if (r < 0)
1087                 return r;
1088
1089         r = cg_read_pid(f, &pid);
1090         if (r < 0)
1091                 return r;
1092
1093         return r == 0;
1094 }
1095
1096 int cg_is_empty_recursive(const char *controller, const char *path) {
1097         int r;
1098
1099         assert(path);
1100
1101         /* The root cgroup is always populated */
1102         if (controller && (isempty(path) || path_equal(path, "/")))
1103                 return false;
1104
1105         if (cg_unified(controller)) {
1106                 _cleanup_free_ char *t = NULL;
1107
1108                 /* On the unified hierarchy we can check empty state
1109                  * via the "populated" attribute of "cgroup.events". */
1110
1111                 r = cg_read_event(controller, path, "populated", &t);
1112                 if (r < 0)
1113                         return r;
1114
1115                 return streq(t, "0");
1116         } else {
1117                 _cleanup_closedir_ DIR *d = NULL;
1118                 char *fn;
1119
1120                 r = cg_is_empty(controller, path);
1121                 if (r <= 0)
1122                         return r;
1123
1124                 r = cg_enumerate_subgroups(controller, path, &d);
1125                 if (r == -ENOENT)
1126                         return 1;
1127                 if (r < 0)
1128                         return r;
1129
1130                 while ((r = cg_read_subgroup(d, &fn)) > 0) {
1131                         _cleanup_free_ char *p = NULL;
1132
1133                         p = strjoin(path, "/", fn);
1134                         free(fn);
1135                         if (!p)
1136                                 return -ENOMEM;
1137
1138                         r = cg_is_empty_recursive(controller, p);
1139                         if (r <= 0)
1140                                 return r;
1141                 }
1142                 if (r < 0)
1143                         return r;
1144
1145                 return true;
1146         }
1147 }
1148
1149 int cg_split_spec(const char *spec, char **controller, char **path) {
1150         char *t = NULL, *u = NULL;
1151         const char *e;
1152
1153         assert(spec);
1154
1155         if (*spec == '/') {
1156                 if (!path_is_safe(spec))
1157                         return -EINVAL;
1158
1159                 if (path) {
1160                         t = strdup(spec);
1161                         if (!t)
1162                                 return -ENOMEM;
1163
1164                         *path = path_kill_slashes(t);
1165                 }
1166
1167                 if (controller)
1168                         *controller = NULL;
1169
1170                 return 0;
1171         }
1172
1173         e = strchr(spec, ':');
1174         if (!e) {
1175                 if (!cg_controller_is_valid(spec))
1176                         return -EINVAL;
1177
1178                 if (controller) {
1179                         t = strdup(spec);
1180                         if (!t)
1181                                 return -ENOMEM;
1182
1183                         *controller = t;
1184                 }
1185
1186                 if (path)
1187                         *path = NULL;
1188
1189                 return 0;
1190         }
1191
1192         t = strndup(spec, e-spec);
1193         if (!t)
1194                 return -ENOMEM;
1195         if (!cg_controller_is_valid(t)) {
1196                 free(t);
1197                 return -EINVAL;
1198         }
1199
1200         if (isempty(e+1))
1201                 u = NULL;
1202         else {
1203                 u = strdup(e+1);
1204                 if (!u) {
1205                         free(t);
1206                         return -ENOMEM;
1207                 }
1208
1209                 if (!path_is_safe(u) ||
1210                     !path_is_absolute(u)) {
1211                         free(t);
1212                         free(u);
1213                         return -EINVAL;
1214                 }
1215
1216                 path_kill_slashes(u);
1217         }
1218
1219         if (controller)
1220                 *controller = t;
1221         else
1222                 free(t);
1223
1224         if (path)
1225                 *path = u;
1226         else
1227                 free(u);
1228
1229         return 0;
1230 }
1231
1232 int cg_mangle_path(const char *path, char **result) {
1233         _cleanup_free_ char *c = NULL, *p = NULL;
1234         char *t;
1235         int r;
1236
1237         assert(path);
1238         assert(result);
1239
1240         /* First, check if it already is a filesystem path */
1241         if (path_startswith(path, "/sys/fs/cgroup")) {
1242
1243                 t = strdup(path);
1244                 if (!t)
1245                         return -ENOMEM;
1246
1247                 *result = path_kill_slashes(t);
1248                 return 0;
1249         }
1250
1251         /* Otherwise, treat it as cg spec */
1252         r = cg_split_spec(path, &c, &p);
1253         if (r < 0)
1254                 return r;
1255
1256         return cg_get_path(c ?: SYSTEMD_CGROUP_CONTROLLER, p ?: "/", NULL, result);
1257 }
1258
1259 int cg_get_root_path(char **path) {
1260         char *p, *e;
1261         int r;
1262
1263         assert(path);
1264
1265         r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 1, &p);
1266         if (r < 0)
1267                 return r;
1268
1269         e = endswith(p, "/" SPECIAL_INIT_SCOPE);
1270         if (!e)
1271                 e = endswith(p, "/" SPECIAL_SYSTEM_SLICE); /* legacy */
1272         if (!e)
1273                 e = endswith(p, "/system"); /* even more legacy */
1274         if (e)
1275                 *e = 0;
1276
1277         *path = p;
1278         return 0;
1279 }
1280
1281 int cg_shift_path(const char *cgroup, const char *root, const char **shifted) {
1282         _cleanup_free_ char *rt = NULL;
1283         char *p;
1284         int r;
1285
1286         assert(cgroup);
1287         assert(shifted);
1288
1289         if (!root) {
1290                 /* If the root was specified let's use that, otherwise
1291                  * let's determine it from PID 1 */
1292
1293                 r = cg_get_root_path(&rt);
1294                 if (r < 0)
1295                         return r;
1296
1297                 root = rt;
1298         }
1299
1300         p = path_startswith(cgroup, root);
1301         if (p && p > cgroup)
1302                 *shifted = p - 1;
1303         else
1304                 *shifted = cgroup;
1305
1306         return 0;
1307 }
1308
1309 int cg_pid_get_path_shifted(pid_t pid, const char *root, char **cgroup) {
1310         _cleanup_free_ char *raw = NULL;
1311         const char *c;
1312         int r;
1313
1314         assert(pid >= 0);
1315         assert(cgroup);
1316
1317         r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &raw);
1318         if (r < 0)
1319                 return r;
1320
1321         r = cg_shift_path(raw, root, &c);
1322         if (r < 0)
1323                 return r;
1324
1325         if (c == raw) {
1326                 *cgroup = raw;
1327                 raw = NULL;
1328         } else {
1329                 char *n;
1330
1331                 n = strdup(c);
1332                 if (!n)
1333                         return -ENOMEM;
1334
1335                 *cgroup = n;
1336         }
1337
1338         return 0;
1339 }
1340
1341 int cg_path_decode_unit(const char *cgroup, char **unit) {
1342         char *c, *s;
1343         size_t n;
1344
1345         assert(cgroup);
1346         assert(unit);
1347
1348         n = strcspn(cgroup, "/");
1349         if (n < 3)
1350                 return -ENXIO;
1351
1352         c = strndupa(cgroup, n);
1353         c = cg_unescape(c);
1354
1355         if (!unit_name_is_valid(c, UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE))
1356                 return -ENXIO;
1357
1358         s = strdup(c);
1359         if (!s)
1360                 return -ENOMEM;
1361
1362         *unit = s;
1363         return 0;
1364 }
1365
1366 static bool valid_slice_name(const char *p, size_t n) {
1367
1368         if (!p)
1369                 return false;
1370
1371         if (n < strlen("x.slice"))
1372                 return false;
1373
1374         if (memcmp(p + n - 6, ".slice", 6) == 0) {
1375                 char buf[n+1], *c;
1376
1377                 memcpy(buf, p, n);
1378                 buf[n] = 0;
1379
1380                 c = cg_unescape(buf);
1381
1382                 return unit_name_is_valid(c, UNIT_NAME_PLAIN);
1383         }
1384
1385         return false;
1386 }
1387
/* Skips over all leading slice components of 'p' and returns a pointer to the first path
 * component that is not a valid slice name (leading slashes of that component already
 * skipped). */
static const char *skip_slices(const char *p) {
        assert(p);

        for (;;) {
                const char *component;
                size_t len;

                component = p + strspn(p, "/");
                len = strcspn(component, "/");

                if (!valid_slice_name(component, len))
                        return component;

                /* That was a slice, continue right behind it */
                p = component + len;
        }
}
1405
1406 int cg_path_get_unit(const char *path, char **ret) {
1407         const char *e;
1408         char *unit;
1409         int r;
1410
1411         assert(path);
1412         assert(ret);
1413
1414         e = skip_slices(path);
1415
1416         r = cg_path_decode_unit(e, &unit);
1417         if (r < 0)
1418                 return r;
1419
1420         /* We skipped over the slices, don't accept any now */
1421         if (endswith(unit, ".slice")) {
1422                 free(unit);
1423                 return -ENXIO;
1424         }
1425
1426         *ret = unit;
1427         return 0;
1428 }
1429
1430 int cg_pid_get_unit(pid_t pid, char **unit) {
1431         _cleanup_free_ char *cgroup = NULL;
1432         int r;
1433
1434         assert(unit);
1435
1436         r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1437         if (r < 0)
1438                 return r;
1439
1440         return cg_path_get_unit(cgroup, unit);
1441 }
1442
/**
 * Skip session-*.scope, but require it to be there.
 *
 * Returns a pointer to what follows the session scope component of p
 * (with any slashes after it skipped too), or NULL if p is empty or
 * its first component is not "session-<id>.scope" with a valid
 * session id.
 */
static const char *skip_session(const char *p) {
        size_t n;

        if (isempty(p))
                return NULL;

        p += strspn(p, "/");

        n = strcspn(p, "/");
        if (n < strlen("session-x.scope"))
                return NULL;

        /* 8 and 6 are the lengths of "session-" and ".scope"; the length
         * check above guarantees both fit with at least one character in
         * between. */
        if (memcmp(p, "session-", 8) == 0 && memcmp(p + n - 6, ".scope", 6) == 0) {
                char buf[n - 8 - 6 + 1];

                /* Copy out just the session id between prefix and suffix */
                memcpy(buf, p + 8, n - 8 - 6);
                buf[n - 8 - 6] = 0;

                /* Note that session scopes never need unescaping,
                 * since they cannot conflict with the kernel's own
                 * names, hence we don't need to call cg_unescape()
                 * here. */

                /* This function returns a pointer, hence signal failure
                 * with NULL rather than the boolean "false". */
                if (!session_id_valid(buf))
                        return NULL;

                p += n;
                p += strspn(p, "/");
                return p;
        }

        return NULL;
}
1479
/**
 * Skip user@*.service, but require it to be there.
 *
 * Returns a pointer to what follows the user manager component of p
 * (with any slashes after it skipped too), or NULL if p is empty or
 * its first component is not "user@<uid>.service" with a parsable UID.
 */
static const char *skip_user_manager(const char *p) {
        enum {
                PREFIX_LEN = 5, /* length of "user@" */
                SUFFIX_LEN = 8, /* length of ".service" */
        };
        size_t len;

        if (isempty(p))
                return NULL;

        p += strspn(p, "/");

        len = strcspn(p, "/");
        if (len < strlen("user@x.service"))
                return NULL;

        if (memcmp(p, "user@", PREFIX_LEN) == 0 &&
            memcmp(p + len - SUFFIX_LEN, ".service", SUFFIX_LEN) == 0) {
                char uid_str[len - PREFIX_LEN - SUFFIX_LEN + 1];

                /* Cut out the instance part, i.e. the UID */
                memcpy(uid_str, p + PREFIX_LEN, len - PREFIX_LEN - SUFFIX_LEN);
                uid_str[len - PREFIX_LEN - SUFFIX_LEN] = 0;

                /* User manager services never need unescaping, as
                 * their names cannot clash with the kernel's own
                 * ones, hence cg_unescape() is not needed here. */

                if (parse_uid(uid_str, NULL) < 0)
                        return NULL;

                p += len;
                p += strspn(p, "/");

                return p;
        }

        return NULL;
}
1517
/* Skips the part of 'path' that leads to a user's own units: any leading slices, followed
 * by either a user manager service or a session scope. Returns NULL if neither of the two
 * follows the slices. */
static const char *skip_user_prefix(const char *path) {
        const char *after_slices, *after_manager;

        assert(path);

        /* Leading slices are optional */
        after_slices = skip_slices(path);

        /* Prefer the user manager if it comes next, otherwise fall back to the session */
        after_manager = skip_user_manager(after_slices);

        return after_manager ? after_manager : skip_session(after_slices);
}
1534
/* Determines the user unit a cgroup path belongs to. Returns -ENXIO if the path does not
 * start with a user prefix (see skip_user_prefix()), otherwise the result of
 * cg_path_get_unit() on the remainder. */
int cg_path_get_user_unit(const char *path, char **ret) {
        const char *remainder;

        assert(path);
        assert(ret);

        remainder = skip_user_prefix(path);

        /* Behind the user prefix things look just like for system units, hence the
         * same parser is used for the rest. */
        return remainder ? cg_path_get_unit(remainder, ret) : -ENXIO;
}
1550
1551 int cg_pid_get_user_unit(pid_t pid, char **unit) {
1552         _cleanup_free_ char *cgroup = NULL;
1553         int r;
1554
1555         assert(unit);
1556
1557         r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1558         if (r < 0)
1559                 return r;
1560
1561         return cg_path_get_user_unit(cgroup, unit);
1562 }
1563
1564 int cg_path_get_machine_name(const char *path, char **machine) {
1565         _cleanup_free_ char *u = NULL;
1566         const char *sl;
1567         int r;
1568
1569         r = cg_path_get_unit(path, &u);
1570         if (r < 0)
1571                 return r;
1572
1573         sl = strjoina("/run/systemd/machines/unit:", u);
1574         return readlink_malloc(sl, machine);
1575 }
1576
1577 int cg_pid_get_machine_name(pid_t pid, char **machine) {
1578         _cleanup_free_ char *cgroup = NULL;
1579         int r;
1580
1581         assert(machine);
1582
1583         r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1584         if (r < 0)
1585                 return r;
1586
1587         return cg_path_get_machine_name(cgroup, machine);
1588 }
1589
1590 int cg_path_get_session(const char *path, char **session) {
1591         _cleanup_free_ char *unit = NULL;
1592         char *start, *end;
1593         int r;
1594
1595         assert(path);
1596
1597         r = cg_path_get_unit(path, &unit);
1598         if (r < 0)
1599                 return r;
1600
1601         start = startswith(unit, "session-");
1602         if (!start)
1603                 return -ENXIO;
1604         end = endswith(start, ".scope");
1605         if (!end)
1606                 return -ENXIO;
1607
1608         *end = 0;
1609         if (!session_id_valid(start))
1610                 return -ENXIO;
1611
1612         if (session) {
1613                 char *rr;
1614
1615                 rr = strdup(start);
1616                 if (!rr)
1617                         return -ENOMEM;
1618
1619                 *session = rr;
1620         }
1621
1622         return 0;
1623 }
1624
1625 int cg_pid_get_session(pid_t pid, char **session) {
1626         _cleanup_free_ char *cgroup = NULL;
1627         int r;
1628
1629         r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1630         if (r < 0)
1631                 return r;
1632
1633         return cg_path_get_session(cgroup, session);
1634 }
1635
1636 int cg_path_get_owner_uid(const char *path, uid_t *uid) {
1637         _cleanup_free_ char *slice = NULL;
1638         char *start, *end;
1639         int r;
1640
1641         assert(path);
1642
1643         r = cg_path_get_slice(path, &slice);
1644         if (r < 0)
1645                 return r;
1646
1647         start = startswith(slice, "user-");
1648         if (!start)
1649                 return -ENXIO;
1650         end = endswith(start, ".slice");
1651         if (!end)
1652                 return -ENXIO;
1653
1654         *end = 0;
1655         if (parse_uid(start, uid) < 0)
1656                 return -ENXIO;
1657
1658         return 0;
1659 }
1660
1661 int cg_pid_get_owner_uid(pid_t pid, uid_t *uid) {
1662         _cleanup_free_ char *cgroup = NULL;
1663         int r;
1664
1665         r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1666         if (r < 0)
1667                 return r;
1668
1669         return cg_path_get_owner_uid(cgroup, uid);
1670 }
1671
1672 int cg_path_get_slice(const char *p, char **slice) {
1673         const char *e = NULL;
1674
1675         assert(p);
1676         assert(slice);
1677
1678         /* Finds the right-most slice unit from the beginning, but
1679          * stops before we come to the first non-slice unit. */
1680
1681         for (;;) {
1682                 size_t n;
1683
1684                 p += strspn(p, "/");
1685
1686                 n = strcspn(p, "/");
1687                 if (!valid_slice_name(p, n)) {
1688
1689                         if (!e) {
1690                                 char *s;
1691
1692                                 s = strdup(SPECIAL_ROOT_SLICE);
1693                                 if (!s)
1694                                         return -ENOMEM;
1695
1696                                 *slice = s;
1697                                 return 0;
1698                         }
1699
1700                         return cg_path_decode_unit(e, slice);
1701                 }
1702
1703                 e = p;
1704                 p += n;
1705         }
1706 }
1707
1708 int cg_pid_get_slice(pid_t pid, char **slice) {
1709         _cleanup_free_ char *cgroup = NULL;
1710         int r;
1711
1712         assert(slice);
1713
1714         r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1715         if (r < 0)
1716                 return r;
1717
1718         return cg_path_get_slice(cgroup, slice);
1719 }
1720
/* Determines the user slice a cgroup path is located in. Returns -ENXIO if the path does
 * not start with a user prefix (see skip_user_prefix()), otherwise the result of
 * cg_path_get_slice() on the remainder. */
int cg_path_get_user_slice(const char *p, char **slice) {
        const char *remainder;

        assert(p);
        assert(slice);

        remainder = skip_user_prefix(p);

        /* Behind the user prefix things look just like for system slices, hence the
         * same parser is used for the rest. */
        return remainder ? cg_path_get_slice(remainder, slice) : -ENXIO;
}
1734
1735 int cg_pid_get_user_slice(pid_t pid, char **slice) {
1736         _cleanup_free_ char *cgroup = NULL;
1737         int r;
1738
1739         assert(slice);
1740
1741         r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1742         if (r < 0)
1743                 return r;
1744
1745         return cg_path_get_user_slice(cgroup, slice);
1746 }
1747
1748 char *cg_escape(const char *p) {
1749         bool need_prefix = false;
1750
1751         /* This implements very minimal escaping for names to be used
1752          * as file names in the cgroup tree: any name which might
1753          * conflict with a kernel name or is prefixed with '_' is
1754          * prefixed with a '_'. That way, when reading cgroup names it
1755          * is sufficient to remove a single prefixing underscore if
1756          * there is one. */
1757
1758         /* The return value of this function (unlike cg_unescape())
1759          * needs free()! */
1760
1761         if (p[0] == 0 ||
1762             p[0] == '_' ||
1763             p[0] == '.' ||
1764             streq(p, "notify_on_release") ||
1765             streq(p, "release_agent") ||
1766             streq(p, "tasks") ||
1767             startswith(p, "cgroup."))
1768                 need_prefix = true;
1769         else {
1770                 const char *dot;
1771
1772                 dot = strrchr(p, '.');
1773                 if (dot) {
1774                         CGroupController c;
1775                         size_t l = dot - p;
1776
1777                         for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
1778                                 const char *n;
1779
1780                                 n = cgroup_controller_to_string(c);
1781
1782                                 if (l != strlen(n))
1783                                         continue;
1784
1785                                 if (memcmp(p, n, l) != 0)
1786                                         continue;
1787
1788                                 need_prefix = true;
1789                                 break;
1790                         }
1791                 }
1792         }
1793
1794         if (need_prefix)
1795                 return strappend("_", p);
1796
1797         return strdup(p);
1798 }
1799
/* Undoes cg_escape(): skips one leading underscore, if there is one.
 *
 * Unlike for cg_escape(), the result points into 'p' and must not be freed. */
char *cg_unescape(const char *p) {
        assert(p);

        return (char*) (*p == '_' ? p + 1 : p);
}
1811
1812 #define CONTROLLER_VALID                        \
1813         DIGITS LETTERS                          \
1814         "_"
1815
1816 bool cg_controller_is_valid(const char *p) {
1817         const char *t, *s;
1818
1819         if (!p)
1820                 return false;
1821
1822         if (streq(p, SYSTEMD_CGROUP_CONTROLLER))
1823                 return true;
1824
1825         s = startswith(p, "name=");
1826         if (s)
1827                 p = s;
1828
1829         if (*p == 0 || *p == '_')
1830                 return false;
1831
1832         for (t = p; *t; t++)
1833                 if (!strchr(CONTROLLER_VALID, *t))
1834                         return false;
1835
1836         if (t - p > FILENAME_MAX)
1837                 return false;
1838
1839         return true;
1840 }
1841
1842 int cg_slice_to_path(const char *unit, char **ret) {
1843         _cleanup_free_ char *p = NULL, *s = NULL, *e = NULL;
1844         const char *dash;
1845         int r;
1846
1847         assert(unit);
1848         assert(ret);
1849
1850         if (streq(unit, SPECIAL_ROOT_SLICE)) {
1851                 char *x;
1852
1853                 x = strdup("");
1854                 if (!x)
1855                         return -ENOMEM;
1856                 *ret = x;
1857                 return 0;
1858         }
1859
1860         if (!unit_name_is_valid(unit, UNIT_NAME_PLAIN))
1861                 return -EINVAL;
1862
1863         if (!endswith(unit, ".slice"))
1864                 return -EINVAL;
1865
1866         r = unit_name_to_prefix(unit, &p);
1867         if (r < 0)
1868                 return r;
1869
1870         dash = strchr(p, '-');
1871
1872         /* Don't allow initial dashes */
1873         if (dash == p)
1874                 return -EINVAL;
1875
1876         while (dash) {
1877                 _cleanup_free_ char *escaped = NULL;
1878                 char n[dash - p + sizeof(".slice")];
1879
1880                 /* Don't allow trailing or double dashes */
1881                 if (dash[1] == 0 || dash[1] == '-')
1882                         return -EINVAL;
1883
1884                 strcpy(stpncpy(n, p, dash - p), ".slice");
1885                 if (!unit_name_is_valid(n, UNIT_NAME_PLAIN))
1886                         return -EINVAL;
1887
1888                 escaped = cg_escape(n);
1889                 if (!escaped)
1890                         return -ENOMEM;
1891
1892                 if (!strextend(&s, escaped, "/", NULL))
1893                         return -ENOMEM;
1894
1895                 dash = strchr(dash+1, '-');
1896         }
1897
1898         e = cg_escape(unit);
1899         if (!e)
1900                 return -ENOMEM;
1901
1902         if (!strextend(&s, e, NULL))
1903                 return -ENOMEM;
1904
1905         *ret = s;
1906         s = NULL;
1907
1908         return 0;
1909 }
1910
1911 int cg_set_attribute(const char *controller, const char *path, const char *attribute, const char *value) {
1912         _cleanup_free_ char *p = NULL;
1913         int r;
1914
1915         r = cg_get_path(controller, path, attribute, &p);
1916         if (r < 0)
1917                 return r;
1918
1919         return write_string_file(p, value, 0);
1920 }
1921
1922 int cg_get_attribute(const char *controller, const char *path, const char *attribute, char **ret) {
1923         _cleanup_free_ char *p = NULL;
1924         int r;
1925
1926         r = cg_get_path(controller, path, attribute, &p);
1927         if (r < 0)
1928                 return r;
1929
1930         return read_one_line_file(p, ret);
1931 }
1932
1933 int cg_get_keyed_attribute(const char *controller, const char *path, const char *attribute, const char **keys, char **values) {
1934         _cleanup_free_ char *filename = NULL, *content = NULL;
1935         char *line, *p;
1936         int i, r;
1937
1938         for (i = 0; keys[i]; i++)
1939                 values[i] = NULL;
1940
1941         r = cg_get_path(controller, path, attribute, &filename);
1942         if (r < 0)
1943                 return r;
1944
1945         r = read_full_file(filename, &content, NULL);
1946         if (r < 0)
1947                 return r;
1948
1949         p = content;
1950         while ((line = strsep(&p, "\n"))) {
1951                 char *key;
1952
1953                 key = strsep(&line, " ");
1954
1955                 for (i = 0; keys[i]; i++) {
1956                         if (streq(key, keys[i])) {
1957                                 values[i] = strdup(line);
1958                                 break;
1959                         }
1960                 }
1961         }
1962
1963         for (i = 0; keys[i]; i++) {
1964                 if (!values[i]) {
1965                         for (i = 0; keys[i]; i++) {
1966                                 free(values[i]);
1967                                 values[i] = NULL;
1968                         }
1969                         return -ENOENT;
1970                 }
1971         }
1972
1973         return 0;
1974 }
1975
1976 int cg_create_everywhere(CGroupMask supported, CGroupMask mask, const char *path) {
1977         CGroupController c;
1978         int r;
1979
1980         /* This one will create a cgroup in our private tree, but also
1981          * duplicate it in the trees specified in mask, and remove it
1982          * in all others */
1983
1984         /* First create the cgroup in our own hierarchy. */
1985         r = cg_create(SYSTEMD_CGROUP_CONTROLLER, path);
1986         if (r < 0)
1987                 return r;
1988
1989         /* If we are in the unified hierarchy, we are done now */
1990         if (cg_all_unified())
1991                 return 0;
1992
1993         /* Otherwise, do the same in the other hierarchies */
1994         for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
1995                 CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
1996                 const char *n;
1997
1998                 n = cgroup_controller_to_string(c);
1999
2000                 if (mask & bit)
2001                         (void) cg_create(n, path);
2002                 else if (supported & bit)
2003                         (void) cg_trim(n, path, true);
2004         }
2005
2006         return 0;
2007 }
2008
2009 int cg_attach_everywhere(CGroupMask supported, const char *path, pid_t pid, cg_migrate_callback_t path_callback, void *userdata) {
2010         CGroupController c;
2011         int r;
2012
2013         r = cg_attach(SYSTEMD_CGROUP_CONTROLLER, path, pid);
2014         if (r < 0)
2015                 return r;
2016
2017         if (cg_all_unified())
2018                 return 0;
2019
2020         for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
2021                 CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
2022                 const char *p = NULL;
2023
2024                 if (!(supported & bit))
2025                         continue;
2026
2027                 if (path_callback)
2028                         p = path_callback(bit, userdata);
2029
2030                 if (!p)
2031                         p = path;
2032
2033                 (void) cg_attach_fallback(cgroup_controller_to_string(c), p, pid);
2034         }
2035
2036         return 0;
2037 }
2038
2039 int cg_attach_many_everywhere(CGroupMask supported, const char *path, Set* pids, cg_migrate_callback_t path_callback, void *userdata) {
2040         Iterator i;
2041         void *pidp;
2042         int r = 0;
2043
2044         SET_FOREACH(pidp, pids, i) {
2045                 pid_t pid = PTR_TO_PID(pidp);
2046                 int q;
2047
2048                 q = cg_attach_everywhere(supported, path, pid, path_callback, userdata);
2049                 if (q < 0 && r >= 0)
2050                         r = q;
2051         }
2052
2053         return r;
2054 }
2055
2056 int cg_migrate_everywhere(CGroupMask supported, const char *from, const char *to, cg_migrate_callback_t to_callback, void *userdata) {
2057         CGroupController c;
2058         int r = 0;
2059
2060         if (!path_equal(from, to))  {
2061                 r = cg_migrate_recursive(SYSTEMD_CGROUP_CONTROLLER, from, SYSTEMD_CGROUP_CONTROLLER, to, CGROUP_REMOVE);
2062                 if (r < 0)
2063                         return r;
2064         }
2065
2066         if (cg_all_unified())
2067                 return r;
2068
2069         for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
2070                 CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
2071                 const char *p = NULL;
2072
2073                 if (!(supported & bit))
2074                         continue;
2075
2076                 if (to_callback)
2077                         p = to_callback(bit, userdata);
2078
2079                 if (!p)
2080                         p = to;
2081
2082                 (void) cg_migrate_recursive_fallback(SYSTEMD_CGROUP_CONTROLLER, to, cgroup_controller_to_string(c), p, 0);
2083         }
2084
2085         return 0;
2086 }
2087
2088 int cg_trim_everywhere(CGroupMask supported, const char *path, bool delete_root) {
2089         CGroupController c;
2090         int r;
2091
2092         r = cg_trim(SYSTEMD_CGROUP_CONTROLLER, path, delete_root);
2093         if (r < 0)
2094                 return r;
2095
2096         if (cg_all_unified())
2097                 return r;
2098
2099         for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
2100                 CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
2101
2102                 if (!(supported & bit))
2103                         continue;
2104
2105                 (void) cg_trim(cgroup_controller_to_string(c), path, delete_root);
2106         }
2107
2108         return 0;
2109 }
2110
2111 int cg_mask_supported(CGroupMask *ret) {
2112         CGroupMask mask = 0;
2113         int r;
2114
2115         /* Determines the mask of supported cgroup controllers. Only
2116          * includes controllers we can make sense of and that are
2117          * actually accessible. */
2118
2119         if (cg_all_unified()) {
2120                 _cleanup_free_ char *root = NULL, *controllers = NULL, *path = NULL;
2121                 const char *c;
2122
2123                 /* In the unified hierarchy we can read the supported
2124                  * and accessible controllers from a the top-level
2125                  * cgroup attribute */
2126
2127                 r = cg_get_root_path(&root);
2128                 if (r < 0)
2129                         return r;
2130
2131                 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, root, "cgroup.controllers", &path);
2132                 if (r < 0)
2133                         return r;
2134
2135                 r = read_one_line_file(path, &controllers);
2136                 if (r < 0)
2137                         return r;
2138
2139                 c = controllers;
2140                 for (;;) {
2141                         _cleanup_free_ char *n = NULL;
2142                         CGroupController v;
2143
2144                         r = extract_first_word(&c, &n, NULL, 0);
2145                         if (r < 0)
2146                                 return r;
2147                         if (r == 0)
2148                                 break;
2149
2150                         v = cgroup_controller_from_string(n);
2151                         if (v < 0)
2152                                 continue;
2153
2154                         mask |= CGROUP_CONTROLLER_TO_MASK(v);
2155                 }
2156
2157                 /* Currently, we support the cpu, memory, io and pids
2158                  * controller in the unified hierarchy, mask
2159                  * everything else off. */
2160                 mask &= CGROUP_MASK_CPU | CGROUP_MASK_MEMORY | CGROUP_MASK_IO | CGROUP_MASK_PIDS;
2161
2162         } else {
2163                 CGroupController c;
2164
2165                 /* In the legacy hierarchy, we check whether which
2166                  * hierarchies are mounted. */
2167
2168                 for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
2169                         const char *n;
2170
2171                         n = cgroup_controller_to_string(c);
2172                         if (controller_is_accessible(n) >= 0)
2173                                 mask |= CGROUP_CONTROLLER_TO_MASK(c);
2174                 }
2175         }
2176
2177         *ret = mask;
2178         return 0;
2179 }
2180
2181 int cg_kernel_controllers(Set *controllers) {
2182         _cleanup_fclose_ FILE *f = NULL;
2183         char buf[LINE_MAX];
2184         int r;
2185
2186         assert(controllers);
2187
2188         /* Determines the full list of kernel-known controllers. Might
2189          * include controllers we don't actually support, arbitrary
2190          * named hierarchies and controllers that aren't currently
2191          * accessible (because not mounted). */
2192
2193         f = fopen("/proc/cgroups", "re");
2194         if (!f) {
2195                 if (errno == ENOENT)
2196                         return 0;
2197                 return -errno;
2198         }
2199
2200         /* Ignore the header line */
2201         (void) fgets(buf, sizeof(buf), f);
2202
2203         for (;;) {
2204                 char *controller;
2205                 int enabled = 0;
2206
2207                 errno = 0;
2208                 if (fscanf(f, "%ms %*i %*i %i", &controller, &enabled) != 2) {
2209
2210                         if (feof(f))
2211                                 break;
2212
2213                         if (ferror(f) && errno > 0)
2214                                 return -errno;
2215
2216                         return -EBADMSG;
2217                 }
2218
2219                 if (!enabled) {
2220                         free(controller);
2221                         continue;
2222                 }
2223
2224                 if (!cg_controller_is_valid(controller)) {
2225                         free(controller);
2226                         return -EBADMSG;
2227                 }
2228
2229                 r = set_consume(controllers, controller);
2230                 if (r < 0)
2231                         return r;
2232         }
2233
2234         return 0;
2235 }
2236
2237 static thread_local CGroupUnified unified_cache = CGROUP_UNIFIED_UNKNOWN;
2238
2239 static int cg_update_unified(void) {
2240
2241         struct statfs fs;
2242
2243         /* Checks if we support the unified hierarchy. Returns an
2244          * error when the cgroup hierarchies aren't mounted yet or we
2245          * have any other trouble determining if the unified hierarchy
2246          * is supported. */
2247
2248         if (unified_cache >= CGROUP_UNIFIED_NONE)
2249                 return 0;
2250
2251         if (statfs("/sys/fs/cgroup/", &fs) < 0)
2252                 return -errno;
2253
2254         if (F_TYPE_EQUAL(fs.f_type, CGROUP2_SUPER_MAGIC))
2255                 unified_cache = CGROUP_UNIFIED_ALL;
2256         else if (F_TYPE_EQUAL(fs.f_type, TMPFS_MAGIC)) {
2257                 if (statfs("/sys/fs/cgroup/systemd/", &fs) < 0)
2258                         return -errno;
2259
2260                 unified_cache = F_TYPE_EQUAL(fs.f_type, CGROUP2_SUPER_MAGIC) ?
2261                         CGROUP_UNIFIED_SYSTEMD : CGROUP_UNIFIED_NONE;
2262         } else
2263                 return -ENOMEDIUM;
2264
2265         return 0;
2266 }
2267
2268 bool cg_unified(const char *controller) {
2269
2270         assert(cg_update_unified() >= 0);
2271
2272         if (streq_ptr(controller, SYSTEMD_CGROUP_CONTROLLER))
2273                 return unified_cache >= CGROUP_UNIFIED_SYSTEMD;
2274         else
2275                 return unified_cache >= CGROUP_UNIFIED_ALL;
2276 }
2277
bool cg_all_unified(void) {
        bool all;

        /* Asking about "no controller in particular" only yields true
         * when the cached state is CGROUP_UNIFIED_ALL. */
        all = cg_unified(NULL);

        return all;
}
2282
2283 int cg_unified_flush(void) {
2284         unified_cache = CGROUP_UNIFIED_UNKNOWN;
2285
2286         return cg_update_unified();
2287 }
2288
2289 int cg_enable_everywhere(CGroupMask supported, CGroupMask mask, const char *p) {
2290         _cleanup_free_ char *fs = NULL;
2291         CGroupController c;
2292         int r;
2293
2294         assert(p);
2295
2296         if (supported == 0)
2297                 return 0;
2298
2299         if (!cg_all_unified()) /* on the legacy hiearchy there's no joining of controllers defined */
2300                 return 0;
2301
2302         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, p, "cgroup.subtree_control", &fs);
2303         if (r < 0)
2304                 return r;
2305
2306         for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
2307                 CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
2308                 const char *n;
2309
2310                 if (!(supported & bit))
2311                         continue;
2312
2313                 n = cgroup_controller_to_string(c);
2314                 {
2315                         char s[1 + strlen(n) + 1];
2316
2317                         s[0] = mask & bit ? '+' : '-';
2318                         strcpy(s + 1, n);
2319
2320                         r = write_string_file(fs, s, 0);
2321                         if (r < 0)
2322                                 log_debug_errno(r, "Failed to enable controller %s for %s (%s): %m", n, p, fs);
2323                 }
2324         }
2325
2326         return 0;
2327 }
2328
2329 bool cg_is_unified_wanted(void) {
2330         static thread_local int wanted = -1;
2331         int r;
2332         bool b;
2333
2334         /* If the hierarchy is already mounted, then follow whatever
2335          * was chosen for it. */
2336         if (cg_unified_flush() >= 0)
2337                 return cg_all_unified();
2338
2339         /* Otherwise, let's see what the kernel command line has to
2340          * say. Since checking that is expensive, let's cache the
2341          * result. */
2342         if (wanted >= 0)
2343                 return wanted;
2344
2345         r = proc_cmdline_get_bool("systemd.unified_cgroup_hierarchy", &b);
2346         if (r < 0)
2347                 return false;
2348
2349         return (wanted = r > 0 ? b : false);
2350 }
2351
bool cg_is_legacy_wanted(void) {
        /* Legacy is wanted exactly when the full unified hierarchy is not */
        if (cg_is_unified_wanted())
                return false;

        return true;
}
2355
2356 bool cg_is_unified_systemd_controller_wanted(void) {
2357         static thread_local int wanted = -1;
2358         int r;
2359         bool b;
2360
2361         /* If the unified hierarchy is requested in full, no need to
2362          * bother with this. */
2363         if (cg_is_unified_wanted())
2364                 return 0;
2365
2366         /* If the hierarchy is already mounted, then follow whatever
2367          * was chosen for it. */
2368         if (cg_unified_flush() >= 0)
2369                 return cg_unified(SYSTEMD_CGROUP_CONTROLLER);
2370
2371         /* Otherwise, let's see what the kernel command line has to
2372          * say. Since checking that is expensive, let's cache the
2373          * result. */
2374         if (wanted >= 0)
2375                 return wanted;
2376
2377         r = proc_cmdline_get_bool("systemd.legacy_systemd_cgroup_controller", &b);
2378         if (r < 0)
2379                 return false;
2380
2381         return (wanted = r > 0 ? b : false);
2382 }
2383
bool cg_is_legacy_systemd_controller_wanted(void) {
        /* Only relevant when the legacy hierarchy is wanted at all */
        if (!cg_is_legacy_wanted())
                return false;

        /* ... and then only if the systemd controller is not supposed
         * to be on the unified hierarchy. */
        return !cg_is_unified_systemd_controller_wanted();
}
2387
2388 int cg_weight_parse(const char *s, uint64_t *ret) {
2389         uint64_t u;
2390         int r;
2391
2392         if (isempty(s)) {
2393                 *ret = CGROUP_WEIGHT_INVALID;
2394                 return 0;
2395         }
2396
2397         r = safe_atou64(s, &u);
2398         if (r < 0)
2399                 return r;
2400
2401         if (u < CGROUP_WEIGHT_MIN || u > CGROUP_WEIGHT_MAX)
2402                 return -ERANGE;
2403
2404         *ret = u;
2405         return 0;
2406 }
2407
2408 const uint64_t cgroup_io_limit_defaults[_CGROUP_IO_LIMIT_TYPE_MAX] = {
2409         [CGROUP_IO_RBPS_MAX]    = CGROUP_LIMIT_MAX,
2410         [CGROUP_IO_WBPS_MAX]    = CGROUP_LIMIT_MAX,
2411         [CGROUP_IO_RIOPS_MAX]   = CGROUP_LIMIT_MAX,
2412         [CGROUP_IO_WIOPS_MAX]   = CGROUP_LIMIT_MAX,
2413 };
2414
2415 static const char* const cgroup_io_limit_type_table[_CGROUP_IO_LIMIT_TYPE_MAX] = {
2416         [CGROUP_IO_RBPS_MAX]    = "IOReadBandwidthMax",
2417         [CGROUP_IO_WBPS_MAX]    = "IOWriteBandwidthMax",
2418         [CGROUP_IO_RIOPS_MAX]   = "IOReadIOPSMax",
2419         [CGROUP_IO_WIOPS_MAX]   = "IOWriteIOPSMax",
2420 };
2421
2422 DEFINE_STRING_TABLE_LOOKUP(cgroup_io_limit_type, CGroupIOLimitType);
2423
2424 int cg_cpu_shares_parse(const char *s, uint64_t *ret) {
2425         uint64_t u;
2426         int r;
2427
2428         if (isempty(s)) {
2429                 *ret = CGROUP_CPU_SHARES_INVALID;
2430                 return 0;
2431         }
2432
2433         r = safe_atou64(s, &u);
2434         if (r < 0)
2435                 return r;
2436
2437         if (u < CGROUP_CPU_SHARES_MIN || u > CGROUP_CPU_SHARES_MAX)
2438                 return -ERANGE;
2439
2440         *ret = u;
2441         return 0;
2442 }
2443
2444 int cg_blkio_weight_parse(const char *s, uint64_t *ret) {
2445         uint64_t u;
2446         int r;
2447
2448         if (isempty(s)) {
2449                 *ret = CGROUP_BLKIO_WEIGHT_INVALID;
2450                 return 0;
2451         }
2452
2453         r = safe_atou64(s, &u);
2454         if (r < 0)
2455                 return r;
2456
2457         if (u < CGROUP_BLKIO_WEIGHT_MIN || u > CGROUP_BLKIO_WEIGHT_MAX)
2458                 return -ERANGE;
2459
2460         *ret = u;
2461         return 0;
2462 }
2463
2464 bool is_cgroup_fs(const struct statfs *s) {
2465         return is_fs_type(s, CGROUP_SUPER_MAGIC) ||
2466                is_fs_type(s, CGROUP2_SUPER_MAGIC);
2467 }
2468
bool fd_is_cgroup_fs(int fd) {
        struct statfs s;

        /* Returns true if 'fd' refers to something on a cgroup file
         * system. The return type is bool, so an error code cannot be
         * passed through it: a negative errno would convert to true.
         * If the file system cannot be queried, report false. */
        if (fstatfs(fd, &s) < 0)
                return false;

        return is_cgroup_fs(&s);
}
2477
2478 static const char *cgroup_controller_table[_CGROUP_CONTROLLER_MAX] = {
2479         [CGROUP_CONTROLLER_CPU] = "cpu",
2480         [CGROUP_CONTROLLER_CPUACCT] = "cpuacct",
2481         [CGROUP_CONTROLLER_IO] = "io",
2482         [CGROUP_CONTROLLER_BLKIO] = "blkio",
2483         [CGROUP_CONTROLLER_MEMORY] = "memory",
2484         [CGROUP_CONTROLLER_DEVICES] = "devices",
2485         [CGROUP_CONTROLLER_PIDS] = "pids",
2486 };
2487
2488 DEFINE_STRING_TABLE_LOOKUP(cgroup_controller, CGroupController);