]> git.ipfire.org Git - thirdparty/systemd.git/commitdiff
core: unified cgroup hierarchy support 1116/head
authorLennart Poettering <lennart@poettering.net>
Tue, 1 Sep 2015 17:22:36 +0000 (19:22 +0200)
committerLennart Poettering <lennart@poettering.net>
Tue, 1 Sep 2015 21:52:27 +0000 (23:52 +0200)
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.

A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).

It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.

The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and add each cgroup to it.

This patch also removes cg_delete() which is unused now.

On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.

This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.

This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.

The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.

To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.

This patch set carefully makes sure that cgls and cgtop continue to work
as desired.

When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.

23 files changed:
src/basic/cgroup-util.c
src/basic/cgroup-util.h
src/basic/def.h
src/basic/missing.h
src/basic/special.h
src/cgls/cgls.c
src/cgtop/cgtop.c
src/core/cgroup.c
src/core/cgroup.h
src/core/dbus-cgroup.c
src/core/dbus-unit.c
src/core/execute.h
src/core/manager.c
src/core/manager.h
src/core/mount-setup.c
src/core/scope.c
src/core/service.c
src/core/slice.c
src/core/unit.c
src/core/unit.h
src/nspawn/nspawn.c
src/test/test-cgroup-mask.c
src/test/test-cgroup.c

index 98adace55a53a4b8cd91fddd89513cfaf993b176..218de0b376f598aad81f5a06319b44149de7a521 100644 (file)
@@ -443,105 +443,169 @@ int cg_migrate_recursive_fallback(
         return r;
 }
 
-static const char *normalize_controller(const char *controller) {
+static const char *controller_to_dirname(const char *controller) {
+        const char *e;
 
         assert(controller);
 
-        if (startswith(controller, "name="))
-                return controller + 5;
-        else
-                return controller;
+        /* Converts a controller name to the directory name below
+         * /sys/fs/cgroup/ we want to mount it to. Effectively, this
+         * just cuts off the name= prefixed used for named
+         * hierarchies, if it is specified. */
+
+        e = startswith(controller, "name=");
+        if (e)
+                return e;
+
+        return controller;
 }
 
-static int join_path(const char *controller, const char *path, const char *suffix, char **fs) {
+static int join_path_legacy(const char *controller_dn, const char *path, const char *suffix, char **fs) {
         char *t = NULL;
 
-        if (!isempty(controller)) {
-                if (!isempty(path) && !isempty(suffix))
-                        t = strjoin("/sys/fs/cgroup/", controller, "/", path, "/", suffix, NULL);
-                else if (!isempty(path))
-                        t = strjoin("/sys/fs/cgroup/", controller, "/", path, NULL);
-                else if (!isempty(suffix))
-                        t = strjoin("/sys/fs/cgroup/", controller, "/", suffix, NULL);
-                else
-                        t = strappend("/sys/fs/cgroup/", controller);
-        } else {
-                if (!isempty(path) && !isempty(suffix))
-                        t = strjoin(path, "/", suffix, NULL);
-                else if (!isempty(path))
-                        t = strdup(path);
-                else
-                        return -EINVAL;
-        }
+        assert(fs);
+        assert(controller_dn);
+
+        if (isempty(path) && isempty(suffix))
+                t = strappend("/sys/fs/cgroup/", controller_dn);
+        else if (isempty(path))
+                t = strjoin("/sys/fs/cgroup/", controller_dn, "/", suffix, NULL);
+        else if (isempty(suffix))
+                t = strjoin("/sys/fs/cgroup/", controller_dn, "/", path, NULL);
+        else
+                t = strjoin("/sys/fs/cgroup/", controller_dn, "/", path, "/", suffix, NULL);
+        if (!t)
+                return -ENOMEM;
 
+        *fs = t;
+        return 0;
+}
+
+static int join_path_unified(const char *path, const char *suffix, char **fs) {
+        char *t;
+
+        assert(fs);
+
+        if (isempty(path) && isempty(suffix))
+                t = strdup("/sys/fs/cgroup");
+        else if (isempty(path))
+                t = strappend("/sys/fs/cgroup/", suffix);
+        else if (isempty(suffix))
+                t = strappend("/sys/fs/cgroup/", path);
+        else
+                t = strjoin("/sys/fs/cgroup/", path, "/", suffix, NULL);
         if (!t)
                 return -ENOMEM;
 
-        *fs = path_kill_slashes(t);
+        *fs = t;
         return 0;
 }
 
 int cg_get_path(const char *controller, const char *path, const char *suffix, char **fs) {
-        const char *p;
-        static thread_local bool good = false;
+        int unified, r;
 
         assert(fs);
 
-        if (controller && !cg_controller_is_valid(controller))
+        if (!controller) {
+                char *t;
+
+                /* If no controller is specified, we assume only the
+                 * path below the controller matters */
+
+                if (!path && !suffix)
+                        return -EINVAL;
+
+                if (isempty(suffix))
+                        t = strdup(path);
+                else if (isempty(path))
+                        t = strdup(suffix);
+                else
+                        t = strjoin(path, "/", suffix, NULL);
+                if (!t)
+                        return -ENOMEM;
+
+                *fs = path_kill_slashes(t);
+                return 0;
+        }
+
+        if (!cg_controller_is_valid(controller))
                 return -EINVAL;
 
-        if (_unlikely_(!good)) {
-                int r;
+        unified = cg_unified();
+        if (unified < 0)
+                return unified;
 
-                r = path_is_mount_point("/sys/fs/cgroup", 0);
-                if (r < 0)
-                        return r;
-                if (r == 0)
-                        return -ENOENT;
+        if (unified > 0)
+                r = join_path_unified(path, suffix, fs);
+        else {
+                const char *dn;
 
-                /* Cache this to save a few stat()s */
-                good = true;
+                if (controller)
+                        dn = controller_to_dirname(controller);
+                else
+                        dn = NULL;
+
+                r = join_path_legacy(dn, path, suffix, fs);
         }
 
-        p = controller ? normalize_controller(controller) : NULL;
+        if (r < 0)
+                return r;
 
-        return join_path(p, path, suffix, fs);
+        path_kill_slashes(*fs);
+        return 0;
 }
 
-static int check_hierarchy(const char *p) {
-        const char *cc;
+static int controller_is_accessible(const char *controller) {
+        int unified;
 
-        assert(p);
+        assert(controller);
 
-        if (!filename_is_valid(p))
-                return 0;
+        /* Checks whether a specific controller is accessible,
+         * i.e. its hierarchy mounted. In the unified hierarchy all
+         * controllers are considered accessible, except for the named
+         * hierarchies */
 
-        /* Check if this controller actually really exists */
-        cc = strjoina("/sys/fs/cgroup/", p);
-        if (laccess(cc, F_OK) < 0)
-                return -errno;
+        if (!cg_controller_is_valid(controller))
+                return -EINVAL;
+
+        unified = cg_unified();
+        if (unified < 0)
+                return unified;
+        if (unified > 0) {
+                /* We don't support named hierarchies if we are using
+                 * the unified hierarchy. */
+
+                if (streq(controller, SYSTEMD_CGROUP_CONTROLLER))
+                        return 0;
+
+                if (startswith(controller, "name="))
+                        return -EOPNOTSUPP;
+
+        } else {
+                const char *cc, *dn;
+
+                dn = controller_to_dirname(controller);
+                cc = strjoina("/sys/fs/cgroup/", dn);
+
+                if (laccess(cc, F_OK) < 0)
+                        return -errno;
+        }
 
         return 0;
 }
 
 int cg_get_path_and_check(const char *controller, const char *path, const char *suffix, char **fs) {
-        const char *p;
         int r;
 
+        assert(controller);
         assert(fs);
 
-        if (!cg_controller_is_valid(controller))
-                return -EINVAL;
-
-        /* Normalize the controller syntax */
-        p = normalize_controller(controller);
-
-        /* Check if this controller actually really exists */
-        r = check_hierarchy(p);
+        /* Check if the specified controller is actually accessible */
+        r = controller_is_accessible(controller);
         if (r < 0)
                 return r;
 
-        return join_path(p, path, suffix, fs);
+        return cg_get_path(controller, path, suffix, fs);
 }
 
 static int trim_cb(const char *path, const struct stat *sb, int typeflag, struct FTW *ftwbuf) {
@@ -587,20 +651,6 @@ int cg_trim(const char *controller, const char *path, bool delete_root) {
         return r;
 }
 
-int cg_delete(const char *controller, const char *path) {
-        _cleanup_free_ char *parent = NULL;
-        int r;
-
-        assert(path);
-
-        r = path_get_parent(path, &parent);
-        if (r < 0)
-                return r;
-
-        r = cg_migrate_recursive(controller, path, controller, parent, false, true);
-        return r == -ENOENT ? 0 : r;
-}
-
 int cg_create(const char *controller, const char *path) {
         _cleanup_free_ char *fs = NULL;
         int r;
@@ -718,7 +768,7 @@ int cg_set_task_access(
                 gid_t gid) {
 
         _cleanup_free_ char *fs = NULL, *procs = NULL;
-        int r;
+        int r, unified;
 
         assert(path);
 
@@ -736,77 +786,88 @@ int cg_set_task_access(
         if (r < 0)
                 return r;
 
+        unified = cg_unified();
+        if (unified < 0)
+                return unified;
+        if (unified)
+                return 0;
+
         /* Compatibility, Always keep values for "tasks" in sync with
          * "cgroup.procs" */
-        r = cg_get_path(controller, path, "tasks", &procs);
-        if (r < 0)
-                return r;
+        if (cg_get_path(controller, path, "tasks", &procs) >= 0)
+                (void) chmod_and_chown(procs, mode, uid, gid);
 
-        return chmod_and_chown(procs, mode, uid, gid);
+        return 0;
 }
 
 int cg_pid_get_path(const char *controller, pid_t pid, char **path) {
         _cleanup_fclose_ FILE *f = NULL;
         char line[LINE_MAX];
         const char *fs;
-        size_t cs;
+        size_t cs = 0;
+        int unified;
 
         assert(path);
         assert(pid >= 0);
 
-        if (controller) {
-                if (!cg_controller_is_valid(controller))
-                        return -EINVAL;
+        unified = cg_unified();
+        if (unified < 0)
+                return unified;
+        if (unified == 0) {
+                if (controller) {
+                        if (!cg_controller_is_valid(controller))
+                                return -EINVAL;
+                } else
+                        controller = SYSTEMD_CGROUP_CONTROLLER;
 
-                controller = normalize_controller(controller);
-        } else
-                controller = SYSTEMD_CGROUP_CONTROLLER;
+                cs = strlen(controller);
+        }
 
         fs = procfs_file_alloca(pid, "cgroup");
-
         f = fopen(fs, "re");
         if (!f)
                 return errno == ENOENT ? -ESRCH : -errno;
 
-        cs = strlen(controller);
-
         FOREACH_LINE(line, f, return -errno) {
-                char *l, *p, *e;
-                size_t k;
-                const char *word, *state;
-                bool found = false;
+                char *e, *p;
 
                 truncate_nl(line);
 
-                l = strchr(line, ':');
-                if (!l)
-                        continue;
-
-                l++;
-                e = strchr(l, ':');
-                if (!e)
-                        continue;
+                if (unified) {
+                        e = startswith(line, "0:");
+                        if (!e)
+                                continue;
 
-                *e = 0;
+                        e = strchr(e, ':');
+                        if (!e)
+                                continue;
+                } else {
+                        char *l;
+                        size_t k;
+                        const char *word, *state;
+                        bool found = false;
+
+                        l = strchr(line, ':');
+                        if (!l)
+                                continue;
 
-                FOREACH_WORD_SEPARATOR(word, k, l, ",", state) {
+                        l++;
+                        e = strchr(l, ':');
+                        if (!e)
+                                continue;
 
-                        if (k == cs && memcmp(word, controller, cs) == 0) {
-                                found = true;
-                                break;
+                        *e = 0;
+                        FOREACH_WORD_SEPARATOR(word, k, l, ",", state) {
+                                if (k == cs && memcmp(word, controller, cs) == 0) {
+                                        found = true;
+                                        break;
+                                }
                         }
 
-                        if (k == 5 + cs &&
-                            memcmp(word, "name=", 5) == 0 &&
-                            memcmp(word+5, controller, cs) == 0) {
-                                found = true;
-                                break;
-                        }
+                        if (!found)
+                                continue;
                 }
 
-                if (!found)
-                        continue;
-
                 p = strdup(e + 1);
                 if (!p)
                         return -ENOMEM;
@@ -820,11 +881,17 @@ int cg_pid_get_path(const char *controller, pid_t pid, char **path) {
 
 int cg_install_release_agent(const char *controller, const char *agent) {
         _cleanup_free_ char *fs = NULL, *contents = NULL;
-        char *sc;
-        int r;
+        const char *sc;
+        int r, unified;
 
         assert(agent);
 
+        unified = cg_unified();
+        if (unified < 0)
+                return unified;
+        if (unified) /* doesn't apply to unified hierarchy */
+                return -EOPNOTSUPP;
+
         r = cg_get_path(controller, NULL, "release_agent", &fs);
         if (r < 0)
                 return r;
@@ -868,7 +935,13 @@ int cg_install_release_agent(const char *controller, const char *agent) {
 
 int cg_uninstall_release_agent(const char *controller) {
         _cleanup_free_ char *fs = NULL;
-        int r;
+        int r, unified;
+
+        unified = cg_unified();
+        if (unified < 0)
+                return unified;
+        if (unified) /* Doesn't apply to unified hierarchy */
+                return -EOPNOTSUPP;
 
         r = cg_get_path(controller, NULL, "notify_on_release", &fs);
         if (r < 0)
@@ -893,7 +966,7 @@ int cg_uninstall_release_agent(const char *controller) {
 
 int cg_is_empty(const char *controller, const char *path) {
         _cleanup_fclose_ FILE *f = NULL;
-        pid_t pid = 0;
+        pid_t pid;
         int r;
 
         assert(path);
@@ -912,49 +985,69 @@ int cg_is_empty(const char *controller, const char *path) {
 }
 
 int cg_is_empty_recursive(const char *controller, const char *path) {
-        _cleanup_closedir_ DIR *d = NULL;
-        char *fn;
-        int r;
+        int unified, r;
 
         assert(path);
 
         /* The root cgroup is always populated */
         if (controller && (isempty(path) || path_equal(path, "/")))
-                return 0;
+                return false;
 
-        r = cg_is_empty(controller, path);
-        if (r <= 0)
-                return r;
+        unified = cg_unified();
+        if (unified < 0)
+                return unified;
 
-        r = cg_enumerate_subgroups(controller, path, &d);
-        if (r == -ENOENT)
-                return 1;
-        if (r < 0)
-                return r;
+        if (unified > 0) {
+                _cleanup_free_ char *populated = NULL, *t = NULL;
 
-        while ((r = cg_read_subgroup(d, &fn)) > 0) {
-                _cleanup_free_ char *p = NULL;
+                /* On the unified hierarchy we can check empty state
+                 * via the "cgroup.populated" attribute. */
 
-                p = strjoin(path, "/", fn, NULL);
-                free(fn);
-                if (!p)
-                        return -ENOMEM;
+                r = cg_get_path(controller, path, "cgroup.populated", &populated);
+                if (r < 0)
+                        return r;
+
+                r = read_one_line_file(populated, &t);
+                if (r < 0)
+                        return r;
+
+                return streq(t, "0");
+        } else {
+                _cleanup_closedir_ DIR *d = NULL;
+                char *fn;
 
-                r = cg_is_empty_recursive(controller, p);
+                r = cg_is_empty(controller, path);
                 if (r <= 0)
                         return r;
-        }
 
-        if (r < 0)
-                return r;
+                r = cg_enumerate_subgroups(controller, path, &d);
+                if (r == -ENOENT)
+                        return 1;
+                if (r < 0)
+                        return r;
 
-        return 1;
+                while ((r = cg_read_subgroup(d, &fn)) > 0) {
+                        _cleanup_free_ char *p = NULL;
+
+                        p = strjoin(path, "/", fn, NULL);
+                        free(fn);
+                        if (!p)
+                                return -ENOMEM;
+
+                        r = cg_is_empty_recursive(controller, p);
+                        if (r <= 0)
+                                return r;
+                }
+                if (r < 0)
+                        return r;
+
+                return true;
+        }
 }
 
 int cg_split_spec(const char *spec, char **controller, char **path) {
-        const char *e;
         char *t = NULL, *u = NULL;
-        _cleanup_free_ char *v = NULL;
+        const char *e;
 
         assert(spec);
 
@@ -982,7 +1075,7 @@ int cg_split_spec(const char *spec, char **controller, char **path) {
                         return -EINVAL;
 
                 if (controller) {
-                        t = strdup(normalize_controller(spec));
+                        t = strdup(spec);
                         if (!t)
                                 return -ENOMEM;
 
@@ -995,10 +1088,7 @@ int cg_split_spec(const char *spec, char **controller, char **path) {
                 return 0;
         }
 
-        v = strndup(spec, e-spec);
-        if (!v)
-                return -ENOMEM;
-        t = strdup(normalize_controller(v));
+        t = strndup(spec, e-spec);
         if (!t)
                 return -ENOMEM;
         if (!cg_controller_is_valid(t)) {
@@ -1006,13 +1096,9 @@ int cg_split_spec(const char *spec, char **controller, char **path) {
                 return -EINVAL;
         }
 
-        if (streq(e+1, "")) {
-                u = strdup("/");
-                if (!u) {
-                        free(t);
-                        return -ENOMEM;
-                }
-        } else {
+        if (isempty(e+1))
+                u = NULL;
+        else {
                 u = strdup(e+1);
                 if (!u) {
                         free(t);
@@ -1066,7 +1152,7 @@ int cg_mangle_path(const char *path, char **result) {
         if (r < 0)
                 return r;
 
-        return cg_get_path(c ? c : SYSTEMD_CGROUP_CONTROLLER, p ? p : "/", NULL, result);
+        return cg_get_path(c ?: SYSTEMD_CGROUP_CONTROLLER, p ?: "/", NULL, result);
 }
 
 int cg_get_root_path(char **path) {
@@ -1079,7 +1165,11 @@ int cg_get_root_path(char **path) {
         if (r < 0)
                 return r;
 
-        e = endswith(p, "/" SPECIAL_SYSTEM_SLICE);
+        e = endswith(p, "/" SPECIAL_INIT_SCOPE);
+        if (!e)
+                e = endswith(p, "/" SPECIAL_SYSTEM_SLICE); /* legacy */
+        if (!e)
+                e = endswith(p, "/system"); /* even more legacy */
         if (e)
                 *e = 0;
 
@@ -1107,7 +1197,7 @@ int cg_shift_path(const char *cgroup, const char *root, const char **shifted) {
         }
 
         p = path_startswith(cgroup, root);
-        if (p)
+        if (p && p > cgroup)
                 *shifted = p - 1;
         else
                 *shifted = cgroup;
@@ -1371,17 +1461,15 @@ int cg_pid_get_user_unit(pid_t pid, char **unit) {
 }
 
 int cg_path_get_machine_name(const char *path, char **machine) {
-        _cleanup_free_ char *u = NULL, *sl = NULL;
+        _cleanup_free_ char *u = NULL;
+        const char *sl;
         int r;
 
         r = cg_path_get_unit(path, &u);
         if (r < 0)
                 return r;
 
-        sl = strjoin("/run/systemd/machines/unit:", u, NULL);
-        if (!sl)
-                return -ENOMEM;
-
+        sl = strjoina("/run/systemd/machines/unit:", u);
         return readlink_malloc(sl, machine);
 }
 
@@ -1574,31 +1662,38 @@ char *cg_escape(const char *p) {
             p[0] == '.' ||
             streq(p, "notify_on_release") ||
             streq(p, "release_agent") ||
-            streq(p, "tasks"))
+            streq(p, "tasks") ||
+            startswith(p, "cgroup."))
                 need_prefix = true;
         else {
                 const char *dot;
 
                 dot = strrchr(p, '.');
                 if (dot) {
+                        CGroupController c;
+                        size_t l = dot - p;
 
-                        if (dot - p == 6 && memcmp(p, "cgroup", 6) == 0)
-                                need_prefix = true;
-                        else {
-                                char *n;
+                        for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
+                                const char *n;
+
+                                n = cgroup_controller_to_string(c);
 
-                                n = strndupa(p, dot - p);
+                                if (l != strlen(n))
+                                        continue;
 
-                                if (check_hierarchy(n) >= 0)
-                                        need_prefix = true;
+                                if (memcmp(p, n, l) != 0)
+                                        continue;
+
+                                need_prefix = true;
+                                break;
                         }
                 }
         }
 
         if (need_prefix)
                 return strappend("_", p);
-        else
-                return strdup(p);
+
+        return strdup(p);
 }
 
 char *cg_unescape(const char *p) {
@@ -1731,17 +1826,9 @@ int cg_get_attribute(const char *controller, const char *path, const char *attri
         return read_one_line_file(p, ret);
 }
 
-static const char mask_names[] =
-        "cpu\0"
-        "cpuacct\0"
-        "blkio\0"
-        "memory\0"
-        "devices\0";
-
-int cg_create_everywhere(CGroupControllerMask supported, CGroupControllerMask mask, const char *path) {
-        CGroupControllerMask bit = 1;
-        const char *n;
-        int r;
+int cg_create_everywhere(CGroupMask supported, CGroupMask mask, const char *path) {
+        CGroupController c;
+        int r, unified;
 
         /* This one will create a cgroup in our private tree, but also
          * duplicate it in the trees specified in mask, and remove it
@@ -1752,49 +1839,63 @@ int cg_create_everywhere(CGroupControllerMask supported, CGroupControllerMask ma
         if (r < 0)
                 return r;
 
-        /* Then, do the same in the other hierarchies */
-        NULSTR_FOREACH(n, mask_names) {
+        /* If we are in the unified hierarchy, we are done now */
+        unified = cg_unified();
+        if (unified < 0)
+                return unified;
+        if (unified > 0)
+                return 0;
+
+        /* Otherwise, do the same in the other hierarchies */
+        for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
+                CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
+                const char *n;
+
+                n = cgroup_controller_to_string(c);
+
                 if (mask & bit)
-                        cg_create(n, path);
+                        (void) cg_create(n, path);
                 else if (supported & bit)
-                        cg_trim(n, path, true);
-
-                bit <<= 1;
+                        (void) cg_trim(n, path, true);
         }
 
         return 0;
 }
 
-int cg_attach_everywhere(CGroupControllerMask supported, const char *path, pid_t pid, cg_migrate_callback_t path_callback, void *userdata) {
-        CGroupControllerMask bit = 1;
-        const char *n;
-        int r;
+int cg_attach_everywhere(CGroupMask supported, const char *path, pid_t pid, cg_migrate_callback_t path_callback, void *userdata) {
+        CGroupController c;
+        int r, unified;
 
         r = cg_attach(SYSTEMD_CGROUP_CONTROLLER, path, pid);
         if (r < 0)
                 return r;
 
-        NULSTR_FOREACH(n, mask_names) {
+        unified = cg_unified();
+        if (unified < 0)
+                return unified;
+        if (unified > 0)
+                return 0;
 
-                if (supported & bit) {
-                        const char *p = NULL;
+        for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
+                CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
+                const char *p = NULL;
 
-                        if (path_callback)
-                                p = path_callback(bit, userdata);
+                if (!(supported & bit))
+                        continue;
 
-                        if (!p)
-                                p = path;
+                if (path_callback)
+                        p = path_callback(bit, userdata);
 
-                        cg_attach_fallback(n, p, pid);
-                }
+                if (!p)
+                        p = path;
 
-                bit <<= 1;
+                (void) cg_attach_fallback(cgroup_controller_to_string(c), p, pid);
         }
 
         return 0;
 }
 
-int cg_attach_many_everywhere(CGroupControllerMask supported, const char *path, Set* pids, cg_migrate_callback_t path_callback, void *userdata) {
+int cg_attach_many_everywhere(CGroupMask supported, const char *path, Set* pids, cg_migrate_callback_t path_callback, void *userdata) {
         Iterator i;
         void *pidp;
         int r = 0;
@@ -1804,17 +1905,16 @@ int cg_attach_many_everywhere(CGroupControllerMask supported, const char *path,
                 int q;
 
                 q = cg_attach_everywhere(supported, path, pid, path_callback, userdata);
-                if (q < 0)
+                if (q < 0 && r >= 0)
                         r = q;
         }
 
         return r;
 }
 
-int cg_migrate_everywhere(CGroupControllerMask supported, const char *from, const char *to, cg_migrate_callback_t to_callback, void *userdata) {
-        CGroupControllerMask bit = 1;
-        const char *n;
-        int r;
+int cg_migrate_everywhere(CGroupMask supported, const char *from, const char *to, cg_migrate_callback_t to_callback, void *userdata) {
+         CGroupController c;
+        int r, unified;
 
         if (!path_equal(from, to))  {
                 r = cg_migrate_recursive(SYSTEMD_CGROUP_CONTROLLER, from, SYSTEMD_CGROUP_CONTROLLER, to, false, true);
@@ -1822,56 +1922,119 @@ int cg_migrate_everywhere(CGroupControllerMask supported, const char *from, cons
                         return r;
         }
 
-        NULSTR_FOREACH(n, mask_names) {
-                if (supported & bit) {
-                        const char *p = NULL;
+        unified = cg_unified();
+        if (unified < 0)
+                return unified;
+        if (unified > 0)
+                return r;
 
-                        if (to_callback)
-                                p = to_callback(bit, userdata);
+        for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
+                CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
+                const char *p = NULL;
 
-                        if (!p)
-                                p = to;
+                if (!(supported & bit))
+                        continue;
 
-                        cg_migrate_recursive_fallback(SYSTEMD_CGROUP_CONTROLLER, to, n, p, false, false);
-                }
+                if (to_callback)
+                        p = to_callback(bit, userdata);
 
-                bit <<= 1;
+                if (!p)
+                        p = to;
+
+                (void) cg_migrate_recursive_fallback(SYSTEMD_CGROUP_CONTROLLER, to, cgroup_controller_to_string(c), p, false, false);
         }
 
         return 0;
 }
 
-int cg_trim_everywhere(CGroupControllerMask supported, const char *path, bool delete_root) {
-        CGroupControllerMask bit = 1;
-        const char *n;
-        int r;
+int cg_trim_everywhere(CGroupMask supported, const char *path, bool delete_root) {
+        CGroupController c;
+        int r, unified;
+
+        /* Calls cg_trim() for "path" in the systemd hierarchy and
+         * fails if that fails. On the unified hierarchy nothing else
+         * is done. On the legacy hierarchy cg_trim() is additionally
+         * called for every controller in "supported", ignoring any
+         * errors of those calls. */
 
         r = cg_trim(SYSTEMD_CGROUP_CONTROLLER, path, delete_root);
         if (r < 0)
                 return r;
 
-        NULSTR_FOREACH(n, mask_names) {
-                if (supported & bit)
-                        cg_trim(n, path, delete_root);
+        unified = cg_unified();
+        if (unified < 0)
+                return unified;
+        if (unified > 0)
+                return r;
+
+        for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
+                CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
+
+                if (!(supported & bit))
+                        continue;
 
-                bit <<= 1;
+                (void) cg_trim(cgroup_controller_to_string(c), path, delete_root);
         }
 
         return 0;
 }
 
-CGroupControllerMask cg_mask_supported(void) {
-        CGroupControllerMask bit = 1, mask = 0;
-        const char *n;
+int cg_mask_supported(CGroupMask *ret) {
+        CGroupMask mask = 0;
+        int r, unified;
+
+        /* Determines the mask of supported cgroup controllers. Only
+         * includes controllers we can make sense of and that are
+         * actually accessible. On success the mask is stored in *ret
+         * and 0 is returned; on failure a negative error code is
+         * returned and *ret is not written. */
 
-        NULSTR_FOREACH(n, mask_names) {
-                if (check_hierarchy(n) >= 0)
-                        mask |= bit;
+        unified = cg_unified();
+        if (unified < 0)
+                return unified;
+        if (unified > 0) {
+                _cleanup_free_ char *controllers = NULL;
+                const char *c;
+
+                /* In the unified hierarchy we can read the supported
+                 * and accessible controllers from the top-level
+                 * cgroup attribute. Names we do not know are skipped. */
+
+                r = read_one_line_file("/sys/fs/cgroup/cgroup.controllers", &controllers);
+                if (r < 0)
+                        return r;
 
-                bit <<= 1;
+                c = controllers;
+                for (;;) {
+                        _cleanup_free_ char *n = NULL;
+                        CGroupController v;
+
+                        r = extract_first_word(&c, &n, NULL, 0);
+                        if (r < 0)
+                                return r;
+                        if (r == 0)
+                                break;
+
+                        v = cgroup_controller_from_string(n);
+                        if (v < 0)
+                                continue;
+
+                        mask |= CGROUP_CONTROLLER_TO_MASK(v);
+                }
+
+                /* Currently, we only support the memory controller in
+                 * the unified hierarchy, mask everything else off. */
+                mask &= CGROUP_MASK_MEMORY;
+
+        } else {
+                CGroupController c;
+
+                /* In the legacy hierarchy, we check which
+                 * hierarchies are mounted. */
+
+                for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
+                        const char *n;
+
+                        n = cgroup_controller_to_string(c);
+                        if (controller_is_accessible(n) >= 0)
+                                mask |= CGROUP_CONTROLLER_TO_MASK(c);
+                }
         }
 
-        return mask;
+        *ret = mask;
+        return 0;
 }
 
 int cg_kernel_controllers(Set *controllers) {
@@ -1917,7 +2080,7 @@ int cg_kernel_controllers(Set *controllers) {
                         continue;
                 }
 
-                if (!filename_is_valid(controller)) {
+                if (!cg_controller_is_valid(controller)) {
                         free(controller);
                         return -EBADMSG;
                 }
@@ -1929,3 +2092,122 @@ int cg_kernel_controllers(Set *controllers) {
 
         return 0;
 }
+
+static thread_local int unified_cache = -1;
+
+int cg_unified(void) {
+        struct statfs sfs;
+
+        /* Tells whether /sys/fs/cgroup/ carries the unified hierarchy
+         * (> 0) or the tmpfs of the legacy setup (0). A negative
+         * error code is returned if the directory cannot be queried
+         * or holds some other file system. Only successful results
+         * are remembered in unified_cache. */
+
+        if (unified_cache < 0) {
+                if (statfs("/sys/fs/cgroup/", &sfs) < 0)
+                        return -errno;
+
+                if (F_TYPE_EQUAL(sfs.f_type, CGROUP_SUPER_MAGIC))
+                        unified_cache = true;
+                else if (F_TYPE_EQUAL(sfs.f_type, TMPFS_MAGIC))
+                        unified_cache = false;
+                else
+                        return -ENOEXEC;
+        }
+
+        return unified_cache;
+}
+
+void cg_unified_flush(void) {
+        /* Forget the cached result, so that the next cg_unified()
+         * call queries the file system again. */
+        unified_cache = -1;
+}
+
+int cg_enable_everywhere(CGroupMask supported, CGroupMask mask, const char *p) {
+        _cleanup_free_ char *fs = NULL;
+        CGroupController c;
+        int r, unified;
+
+        /* For every controller in "supported", writes "+<name>" (bit
+         * set in "mask") or "-<name>" (bit not set) to the
+         * cgroup.subtree_control attribute of cgroup p. Only does
+         * something on the unified hierarchy. Failed writes are
+         * logged and otherwise ignored; a negative error code is
+         * returned only if the hierarchy type or the attribute path
+         * cannot be determined. */
+
+        assert(p);
+
+        if (supported == 0)
+                return 0;
+
+        unified = cg_unified();
+        if (unified < 0)
+                return unified;
+        if (!unified) /* on the legacy hierarchy there's no joining of controllers defined */
+                return 0;
+
+        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, p, "cgroup.subtree_control", &fs);
+        if (r < 0)
+                return r;
+
+        for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
+                CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
+                const char *n;
+                bool enable;
+
+                if (!(supported & bit))
+                        continue;
+
+                enable = !!(mask & bit);
+                n = cgroup_controller_to_string(c);
+                {
+                        char s[1 + strlen(n) + 1];
+
+                        s[0] = enable ? '+' : '-';
+                        strcpy(s + 1, n);
+
+                        r = write_string_file(fs, s, 0);
+                        if (r < 0) /* name the operation that was actually attempted */
+                                log_warning_errno(r, "Failed to %s controller %s for %s (%s): %m", enable ? "enable" : "disable", n, p, fs);
+                }
+        }
+
+        return 0;
+}
+
+bool cg_is_unified_wanted(void) {
+        static thread_local int wanted = -1;
+        _cleanup_free_ char *value = NULL;
+        int r;
+
+        /* A hierarchy that is already mounted decides the matter. */
+        r = cg_unified();
+        if (r >= 0)
+                return r;
+
+        /* Otherwise the kernel command line decides. Looking at it
+         * is expensive, hence the answer is cached. */
+        if (wanted >= 0)
+                return wanted;
+
+        /* The bare switch, without any value, means "yes" */
+        r = get_proc_cmdline_key("systemd.unified_cgroup_hierarchy", NULL);
+        if (r > 0) {
+                wanted = true;
+                return true;
+        }
+
+        /* A failure here is answered with "no", but not cached */
+        r = get_proc_cmdline_key("systemd.unified_cgroup_hierarchy=", &value);
+        if (r < 0)
+                return false;
+
+        wanted = r > 0 && parse_boolean(value) > 0;
+        return wanted;
+}
+
+bool cg_is_legacy_wanted(void) {
+        /* Simply the negation of cg_is_unified_wanted() */
+        return !cg_is_unified_wanted();
+}
+
+/* Names of the known controllers, indexed by CGroupController value.
+ * NOTE(review): DEFINE_STRING_TABLE_LOOKUP() below presumably generates
+ * cgroup_controller_to_string()/cgroup_controller_from_string() from
+ * this table; confirm against the macro's definition. */
+static const char *cgroup_controller_table[_CGROUP_CONTROLLER_MAX] = {
+        [CGROUP_CONTROLLER_CPU] = "cpu",
+        [CGROUP_CONTROLLER_CPUACCT] = "cpuacct",
+        [CGROUP_CONTROLLER_BLKIO] = "blkio",
+        [CGROUP_CONTROLLER_MEMORY] = "memory",
+        [CGROUP_CONTROLLER_DEVICE] = "device",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(cgroup_controller, CGroupController);
index 1c86581eb5f00208b47b580ed2aae83f33e95b06..6fd6d8059031f2f662a52822fec77ca7258a24c1 100644 (file)
 #include "set.h"
 #include "def.h"
 
+/* An enum of well known cgroup controllers. The enumerators count up
+ * from 0, so that they can be used both as array indexes and as bit
+ * positions for CGROUP_CONTROLLER_TO_MASK() below.
+ * _CGROUP_CONTROLLER_MAX is the number of controllers, and
+ * _CGROUP_CONTROLLER_INVALID (-1) is the out-of-band "no controller"
+ * value. */
+typedef enum CGroupController {
+        CGROUP_CONTROLLER_CPU,
+        CGROUP_CONTROLLER_CPUACCT,
+        CGROUP_CONTROLLER_BLKIO,
+        CGROUP_CONTROLLER_MEMORY,
+        CGROUP_CONTROLLER_DEVICE,
+        _CGROUP_CONTROLLER_MAX,
+        _CGROUP_CONTROLLER_INVALID = -1,
+} CGroupController;
+
+#define CGROUP_CONTROLLER_TO_MASK(c) (1 << (c))
+
 /* A bit mask of well known cgroup controllers */
-typedef enum CGroupControllerMask {
-        CGROUP_CPU = 1,
-        CGROUP_CPUACCT = 2,
-        CGROUP_BLKIO = 4,
-        CGROUP_MEMORY = 8,
-        CGROUP_DEVICE = 16,
-        _CGROUP_CONTROLLER_MASK_ALL = 31
-} CGroupControllerMask;
+typedef enum CGroupMask {
+        CGROUP_MASK_CPU = CGROUP_CONTROLLER_TO_MASK(CGROUP_CONTROLLER_CPU),
+        CGROUP_MASK_CPUACCT = CGROUP_CONTROLLER_TO_MASK(CGROUP_CONTROLLER_CPUACCT),
+        CGROUP_MASK_BLKIO = CGROUP_CONTROLLER_TO_MASK(CGROUP_CONTROLLER_BLKIO),
+        CGROUP_MASK_MEMORY = CGROUP_CONTROLLER_TO_MASK(CGROUP_CONTROLLER_MEMORY),
+        CGROUP_MASK_DEVICE = CGROUP_CONTROLLER_TO_MASK(CGROUP_CONTROLLER_DEVICE),
+        _CGROUP_MASK_ALL = CGROUP_CONTROLLER_TO_MASK(_CGROUP_CONTROLLER_MAX) - 1 /* all bits below the one _CGROUP_CONTROLLER_MAX would map to, i.e. every controller above */
+} CGroupMask;
 
 /*
  * General rules:
@@ -77,7 +90,6 @@ int cg_pid_get_path(const char *controller, pid_t pid, char **path);
 int cg_trim(const char *controller, const char *path, bool delete_root);
 
 int cg_rmdir(const char *controller, const char *path);
-int cg_delete(const char *controller, const char *path);
 
 int cg_create(const char *controller, const char *path);
 int cg_attach(const char *controller, const char *path, pid_t pid);
@@ -126,14 +138,24 @@ bool cg_controller_is_valid(const char *p);
 
 int cg_slice_to_path(const char *unit, char **ret);
 
-typedef const char* (*cg_migrate_callback_t)(CGroupControllerMask mask, void *userdata);
+typedef const char* (*cg_migrate_callback_t)(CGroupMask mask, void *userdata);
 
-int cg_create_everywhere(CGroupControllerMask supported, CGroupControllerMask mask, const char *path);
-int cg_attach_everywhere(CGroupControllerMask supported, const char *path, pid_t pid, cg_migrate_callback_t callback, void *userdata);
-int cg_attach_many_everywhere(CGroupControllerMask supported, const char *path, Set* pids, cg_migrate_callback_t callback, void *userdata);
-int cg_migrate_everywhere(CGroupControllerMask supported, const char *from, const char *to, cg_migrate_callback_t callback, void *userdata);
-int cg_trim_everywhere(CGroupControllerMask supported, const char *path, bool delete_root);
+int cg_create_everywhere(CGroupMask supported, CGroupMask mask, const char *path);
+int cg_attach_everywhere(CGroupMask supported, const char *path, pid_t pid, cg_migrate_callback_t callback, void *userdata);
+int cg_attach_many_everywhere(CGroupMask supported, const char *path, Set* pids, cg_migrate_callback_t callback, void *userdata);
+int cg_migrate_everywhere(CGroupMask supported, const char *from, const char *to, cg_migrate_callback_t callback, void *userdata);
+int cg_trim_everywhere(CGroupMask supported, const char *path, bool delete_root);
+int cg_enable_everywhere(CGroupMask supported, CGroupMask mask, const char *p);
 
-CGroupControllerMask cg_mask_supported(void);
+int cg_mask_supported(CGroupMask *ret);
 
 int cg_kernel_controllers(Set *controllers);
+
+int cg_unified(void);
+void cg_unified_flush(void);
+
+bool cg_is_unified_wanted(void);
+bool cg_is_legacy_wanted(void);
+
+const char* cgroup_controller_to_string(CGroupController c) _const_;
+CGroupController cgroup_controller_from_string(const char *s) _pure_;
index 5aaba1fe87264d06b2ca6448eafae0e167a9f07b..7c4161eb7277f2a651ad3f4d56287818ebb5a9d0 100644 (file)
@@ -35,7 +35,7 @@
  * the watchdog pings will keep the loop busy. */
 #define DEFAULT_EXIT_USEC (30*USEC_PER_SEC)
 
-#define SYSTEMD_CGROUP_CONTROLLER "systemd"
+#define SYSTEMD_CGROUP_CONTROLLER "name=systemd"
 
 #define SIGNALS_CRASH_HANDLER SIGSEGV,SIGILL,SIGFPE,SIGBUS,SIGQUIT,SIGABRT
 #define SIGNALS_IGNORE SIGPIPE
index 34ab0254ddabc45007bf0e80f22da67cdbf74add..dc1f244d4c5e0d5b4608460c8dd8768ebe9b4509 100644 (file)
@@ -492,6 +492,14 @@ struct btrfs_ioctl_quota_ctl_args {
 #define BTRFS_SUPER_MAGIC 0x9123683E
 #endif
 
+#ifndef CGROUP_SUPER_MAGIC
+#define CGROUP_SUPER_MAGIC 0x27e0eb
+#endif
+
+#ifndef TMPFS_MAGIC
+#define TMPFS_MAGIC 0x01021994
+#endif
+
 #ifndef MS_MOVE
 #define MS_MOVE 8192
 #endif
index e51310eb6dafa13860426a541d784ca1827b7a43..f30458f25ae569548cb6ca5b3537d46de77bdd52 100644 (file)
 #define SPECIAL_USER_SLICE "user.slice"
 #define SPECIAL_MACHINE_SLICE "machine.slice"
 #define SPECIAL_ROOT_SLICE "-.slice"
+
+/* The scope unit systemd itself lives in. */
+#define SPECIAL_INIT_SCOPE "init.scope"
index a8d910d5321eae516a8ee6a3e22e19ca15622c87..4fb642e7b38f4f536acab8f93d610cdaa5e925ad 100644 (file)
@@ -225,7 +225,10 @@ int main(int argc, char *argv[]) {
                                 } else
                                         path = root;
 
-                                printf("Controller %s; control group %s:\n", controller, path);
+                                if (cg_unified() > 0)
+                                        printf("Control group %s:\n", path);
+                                else
+                                        printf("Controller %s; control group %s:\n", controller, path);
                                 fflush(stdout);
 
                                 q = show_cgroup(controller, path, NULL, 0, arg_kernel_threads, output_flags);
index ae562ba135b0d21e5c31736478cc7d0c5d71ea9c..1c94bea31ae3050cffba07b69eb0a8a981857cf7 100644 (file)
@@ -175,7 +175,7 @@ static int process(
                 if (g->n_tasks > 0)
                         g->n_tasks_valid = true;
 
-        } else if (streq(controller, "cpuacct")) {
+        } else if (streq(controller, "cpuacct") && cg_unified() <= 0) {
                 _cleanup_free_ char *p = NULL, *v = NULL;
                 uint64_t new_usage;
                 nsec_t timestamp;
@@ -217,7 +217,10 @@ static int process(
         } else if (streq(controller, "memory")) {
                 _cleanup_free_ char *p = NULL, *v = NULL;
 
-                r = cg_get_path(controller, path, "memory.usage_in_bytes", &p);
+                if (cg_unified() <= 0)
+                        r = cg_get_path(controller, path, "memory.usage_in_bytes", &p);
+                else
+                        r = cg_get_path(controller, path, "memory.current", &p);
                 if (r < 0)
                         return r;
 
@@ -234,7 +237,7 @@ static int process(
                 if (g->memory > 0)
                         g->memory_valid = true;
 
-        } else if (streq(controller, "blkio")) {
+        } else if (streq(controller, "blkio") && cg_unified() <= 0) {
                 _cleanup_fclose_ FILE *f = NULL;
                 _cleanup_free_ char *p = NULL;
                 uint64_t wr = 0, rd = 0;
index e92d2cc850f7c9c7c93c3f644e2723f66b4ad16e..a70b4d33aed86bd2068a5332594eb9eb1f59037b 100644 (file)
@@ -283,7 +283,7 @@ fail:
         return -errno;
 }
 
-void cgroup_context_apply(CGroupContext *c, CGroupControllerMask mask, const char *path, ManagerState state) {
+void cgroup_context_apply(CGroupContext *c, CGroupMask mask, const char *path, ManagerState state) {
         bool is_root;
         int r;
 
@@ -304,7 +304,7 @@ void cgroup_context_apply(CGroupContext *c, CGroupControllerMask mask, const cha
          * cgroup trees (assuming we are running in a container then),
          * and missing cgroups, i.e. EROFS and ENOENT. */
 
-        if ((mask & CGROUP_CPU) && !is_root) {
+        if ((mask & CGROUP_MASK_CPU) && !is_root) {
                 char buf[MAX(DECIMAL_STR_MAX(unsigned long), DECIMAL_STR_MAX(usec_t)) + 1];
 
                 sprintf(buf, "%lu\n",
@@ -331,7 +331,7 @@ void cgroup_context_apply(CGroupContext *c, CGroupControllerMask mask, const cha
                                        "Failed to set cpu.cfs_quota_us on %s: %m", path);
         }
 
-        if (mask & CGROUP_BLKIO) {
+        if (mask & CGROUP_MASK_BLKIO) {
                 char buf[MAX3(DECIMAL_STR_MAX(unsigned long)+1,
                               DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(unsigned long)*1,
                               DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
@@ -381,21 +381,30 @@ void cgroup_context_apply(CGroupContext *c, CGroupControllerMask mask, const cha
                 }
         }
 
-        if ((mask & CGROUP_MEMORY) && !is_root) {
+        if ((mask & CGROUP_MASK_MEMORY) && !is_root) {
                 if (c->memory_limit != (uint64_t) -1) {
                         char buf[DECIMAL_STR_MAX(uint64_t) + 1];
 
                         sprintf(buf, "%" PRIu64 "\n", c->memory_limit);
-                        r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
-                } else
-                        r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");
+
+                        if (cg_unified() <= 0)
+                                r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
+                        else
+                                r = cg_set_attribute("memory", path, "memory.max", buf);
+
+                } else {
+                        if (cg_unified() <= 0)
+                                r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");
+                        else
+                                r = cg_set_attribute("memory", path, "memory.max", "max");
+                }
 
                 if (r < 0)
                         log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
-                                       "Failed to set memory.limit_in_bytes on %s: %m", path);
+                                       "Failed to set memory.limit_in_bytes/memory.max on %s: %m", path);
         }
 
-        if ((mask & CGROUP_DEVICE) && !is_root) {
+        if ((mask & CGROUP_MASK_DEVICE) && !is_root) {
                 CGroupDeviceAllow *a;
 
                 /* Changing the devices list of a populated cgroup
@@ -459,8 +468,8 @@ void cgroup_context_apply(CGroupContext *c, CGroupControllerMask mask, const cha
         }
 }
 
-CGroupControllerMask cgroup_context_get_mask(CGroupContext *c) {
-        CGroupControllerMask mask = 0;
+CGroupMask cgroup_context_get_mask(CGroupContext *c) {
+        CGroupMask mask = 0;
 
         /* Figure out which controllers we need */
 
@@ -468,29 +477,31 @@ CGroupControllerMask cgroup_context_get_mask(CGroupContext *c) {
             c->cpu_shares != (unsigned long) -1 ||
             c->startup_cpu_shares != (unsigned long) -1 ||
             c->cpu_quota_per_sec_usec != USEC_INFINITY)
-                mask |= CGROUP_CPUACCT | CGROUP_CPU;
+                mask |= CGROUP_MASK_CPUACCT | CGROUP_MASK_CPU;
 
         if (c->blockio_accounting ||
             c->blockio_weight != (unsigned long) -1 ||
             c->startup_blockio_weight != (unsigned long) -1 ||
             c->blockio_device_weights ||
             c->blockio_device_bandwidths)
-                mask |= CGROUP_BLKIO;
+                mask |= CGROUP_MASK_BLKIO;
 
         if (c->memory_accounting ||
             c->memory_limit != (uint64_t) -1)
-                mask |= CGROUP_MEMORY;
+                mask |= CGROUP_MASK_MEMORY;
 
         if (c->device_allow ||
             c->device_policy != CGROUP_AUTO)
-                mask |= CGROUP_DEVICE;
+                mask |= CGROUP_MASK_DEVICE;
 
         return mask;
 }
 
-CGroupControllerMask unit_get_cgroup_mask(Unit *u) {
+CGroupMask unit_get_own_mask(Unit *u) {
         CGroupContext *c;
 
+        /* Returns the mask of controllers the unit needs for itself */
+
         c = unit_get_cgroup_context(u);
         if (!c)
                 return 0;
@@ -505,15 +516,18 @@ CGroupControllerMask unit_get_cgroup_mask(Unit *u) {
 
                 e = unit_get_exec_context(u);
                 if (!e || exec_context_maintains_privileges(e))
-                        return _CGROUP_CONTROLLER_MASK_ALL;
+                        return _CGROUP_MASK_ALL;
         }
 
         return cgroup_context_get_mask(c);
 }
 
-CGroupControllerMask unit_get_members_mask(Unit *u) {
+CGroupMask unit_get_members_mask(Unit *u) {
         assert(u);
 
+        /* Returns the mask of controllers all of the unit's children
+         * require, merged */
+
         if (u->cgroup_members_mask_valid)
                 return u->cgroup_members_mask;
 
@@ -532,7 +546,7 @@ CGroupControllerMask unit_get_members_mask(Unit *u) {
                                 continue;
 
                         u->cgroup_members_mask |=
-                                unit_get_cgroup_mask(member) |
+                                unit_get_own_mask(member) |
                                 unit_get_members_mask(member);
                 }
         }
@@ -541,19 +555,52 @@ CGroupControllerMask unit_get_members_mask(Unit *u) {
         return u->cgroup_members_mask;
 }
 
-CGroupControllerMask unit_get_siblings_mask(Unit *u) {
+CGroupMask unit_get_siblings_mask(Unit *u) {
         assert(u);
 
+        /* Returns the mask of controllers all of the unit's siblings
+         * require, i.e. the members mask of the unit's parent slice
+         * if there is one. */
+
         if (UNIT_ISSET(u->slice))
                 return unit_get_members_mask(UNIT_DEREF(u->slice));
 
-        return unit_get_cgroup_mask(u) | unit_get_members_mask(u);
+        return unit_get_own_mask(u) | unit_get_members_mask(u);
 }
 
-CGroupControllerMask unit_get_target_mask(Unit *u) {
-        CGroupControllerMask mask;
+CGroupMask unit_get_subtree_mask(Unit *u) {
+        CGroupMask own, members;
+
+        /* The subtree mask is what the unit needs for itself merged
+         * with what all of its members need. */
+
+        own = unit_get_own_mask(u);
+        members = unit_get_members_mask(u);
+
+        return own | members;
+}
+
+CGroupMask unit_get_target_mask(Unit *u) {
+        CGroupMask mask;
+
+        /* This returns the cgroup mask of all controllers to enable
+         * for a specific cgroup, i.e. everything it needs itself,
+         * plus all that its children need, plus all that its siblings
+         * need. This is primarily useful on the legacy cgroup
+         * hierarchy, where we need to duplicate each cgroup in each
+         * hierarchy that shall be enabled for it. */
 
-        mask = unit_get_cgroup_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
+        mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
+        mask &= u->manager->cgroup_supported;
+
+        return mask;
+}
+
+CGroupMask unit_get_enable_mask(Unit *u) {
+        CGroupMask mask;
+
+        /* This returns the cgroup mask of all controllers to enable
+         * for the children of a specific cgroup. This is primarily
+         * useful for the unified cgroup hierarchy, where each cgroup
+         * controls which controllers are enabled for its children. */
+
+        mask = unit_get_members_mask(u);
         mask &= u->manager->cgroup_supported;
 
         return mask;
@@ -562,13 +609,13 @@ CGroupControllerMask unit_get_target_mask(Unit *u) {
 /* Recurse from a unit up through its containing slices, propagating
  * mask bits upward. A unit is also member of itself. */
 void unit_update_cgroup_members_masks(Unit *u) {
-        CGroupControllerMask m;
+        CGroupMask m;
         bool more;
 
         assert(u);
 
         /* Calculate subtree mask */
-        m = unit_get_cgroup_mask(u) | unit_get_members_mask(u);
+        m = unit_get_subtree_mask(u);
 
         /* See if anything changed from the previous invocation. If
          * not, we're done. */
@@ -608,7 +655,7 @@ void unit_update_cgroup_members_masks(Unit *u) {
         }
 }
 
-static const char *migrate_callback(CGroupControllerMask mask, void *userdata) {
+static const char *migrate_callback(CGroupMask mask, void *userdata) {
         Unit *u = userdata;
 
         assert(mask != 0);
@@ -626,7 +673,115 @@ static const char *migrate_callback(CGroupControllerMask mask, void *userdata) {
         return NULL;
 }
 
-static int unit_create_cgroups(Unit *u, CGroupControllerMask mask) {
+char *unit_default_cgroup_path(Unit *u) {
+        _cleanup_free_ char *slice_path = NULL, *name = NULL;
+        const char *root;
+
+        /* Builds the default cgroup path for a unit as a newly
+         * allocated string: the manager's cgroup root for the root
+         * slice, otherwise root + [slice path +] escaped unit name.
+         * Returns NULL on failure. */
+
+        assert(u);
+
+        root = u->manager->cgroup_root;
+
+        if (unit_has_name(u, SPECIAL_ROOT_SLICE))
+                return strdup(root);
+
+        /* Units directly below the root slice get no slice component */
+        if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) {
+                if (cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice_path) < 0)
+                        return NULL;
+        }
+
+        name = cg_escape(u->id);
+        if (!name)
+                return NULL;
+
+        if (!slice_path)
+                return strjoin(root, "/", name, NULL);
+
+        return strjoin(root, "/", slice_path, "/", name, NULL);
+}
+
+int unit_set_cgroup_path(Unit *u, const char *path) {
+        _cleanup_free_ char *copy = NULL;
+        int r;
+
+        /* Sets the unit's cgroup path to a copy of "path" (or to NULL).
+         * Returns 0 if the path was the same already, 1 if it was
+         * changed, and a negative error code on failure. */
+
+        assert(u);
+
+        if (path) {
+                copy = strdup(path);
+                if (!copy)
+                        return -ENOMEM;
+        }
+
+        if (streq_ptr(u->cgroup_path, copy))
+                return 0;
+
+        /* Register the new path first, so that the unit's current
+         * state stays untouched if this fails. */
+        if (copy) {
+                r = hashmap_put(u->manager->cgroup_unit, copy, u);
+                if (r < 0)
+                        return r;
+        }
+
+        unit_release_cgroup(u);
+
+        /* The unit owns the string from here on */
+        u->cgroup_path = copy;
+        copy = NULL;
+
+        return 1;
+}
+
+int unit_watch_cgroup(Unit *u) {
+        _cleanup_free_ char *populated = NULL;
+        int r;
+
+        /* Adds an inotify watch for the "cgroup.populated" attribute
+         * of the unit's cgroup to the manager's inotify fd, and
+         * records which unit the watch descriptor belongs to. Returns
+         * 0 without doing anything if the unit has no cgroup path, is
+         * watched already, is the root slice, or if the unified
+         * hierarchy is not in use. Returns a negative error code on
+         * failure. */
+
+        assert(u);
+
+        if (!u->cgroup_path)
+                return 0;
+
+        if (u->cgroup_inotify_wd >= 0)
+                return 0;
+
+        /* Only applies to the unified hierarchy */
+        r = cg_unified();
+        if (r < 0)
+                return log_unit_error_errno(u, r, "Failed to detect whether the unified hierarchy is used: %m");
+        if (r == 0)
+                return 0;
+
+        /* Don't watch the root slice, it's pointless. */
+        if (unit_has_name(u, SPECIAL_ROOT_SLICE))
+                return 0;
+
+        r = hashmap_ensure_allocated(&u->manager->cgroup_inotify_wd_unit, &trivial_hash_ops);
+        if (r < 0)
+                return log_oom();
+
+        /* Report the actual error here instead of claiming OOM */
+        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.populated", &populated);
+        if (r < 0)
+                return log_unit_error_errno(u, r, "Failed to determine path of cgroup.populated attribute of control group %s: %m", u->cgroup_path);
+
+        u->cgroup_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, populated, IN_MODIFY);
+        if (u->cgroup_inotify_wd < 0) {
+
+                if (errno == ENOENT) /* If the directory is already
+                                      * gone we don't need to track
+                                      * it, so this is not an error */
+                        return 0;
+
+                return log_unit_error_errno(u, errno, "Failed to add inotify watch descriptor for control group %s: %m", u->cgroup_path);
+        }
+
+        r = hashmap_put(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd), u);
+        if (r < 0)
+                return log_unit_error_errno(u, r, "Failed to add inotify watch descriptor to hash map: %m");
+
+        return 0;
+}
+
+static int unit_create_cgroup(
+                Unit *u,
+                CGroupMask target_mask,
+                CGroupMask enable_mask) {
+
         CGroupContext *c;
         int r;
 
@@ -643,25 +798,29 @@ static int unit_create_cgroups(Unit *u, CGroupControllerMask mask) {
                 if (!path)
                         return log_oom();
 
-                r = hashmap_put(u->manager->cgroup_unit, path, u);
-                if (r < 0) {
-                        log_error(r == -EEXIST ? "cgroup %s exists already: %s" : "hashmap_put failed for %s: %s", path, strerror(-r));
-                        return r;
-                }
-                if (r > 0) {
-                        u->cgroup_path = path;
-                        path = NULL;
-                }
+                r = unit_set_cgroup_path(u, path);
+                if (r == -EEXIST)
+                        return log_unit_error_errno(u, r, "Control group %s exists already.", path);
+                if (r < 0)
+                        return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", path);
         }
 
         /* First, create our own group */
-        r = cg_create_everywhere(u->manager->cgroup_supported, mask, u->cgroup_path);
+        r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
+        if (r < 0)
+                return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", u->cgroup_path);
+
+        /* Start watching it */
+        (void) unit_watch_cgroup(u);
+
+        /* Enable all controllers we need */
+        r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path);
         if (r < 0)
-                return log_error_errno(r, "Failed to create cgroup %s: %m", u->cgroup_path);
+                log_unit_warning_errno(u, r, "Failed to enable controllers on cgroup %s, ignoring: %m", u->cgroup_path);
 
         /* Keep track that this is now realized */
         u->cgroup_realized = true;
-        u->cgroup_realized_mask = mask;
+        u->cgroup_realized_mask = target_mask;
 
         if (u->type != UNIT_SLICE && !c->delegate) {
 
@@ -670,7 +829,7 @@ static int unit_create_cgroups(Unit *u, CGroupControllerMask mask) {
                  * for slice and delegation units. */
                 r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
                 if (r < 0)
-                        log_warning_errno(r, "Failed to migrate cgroup from to %s: %m", u->cgroup_path);
+                        log_unit_warning_errno(u, r, "Failed to migrate cgroup from to %s, ignoring: %m", u->cgroup_path);
         }
 
         return 0;
@@ -691,10 +850,10 @@ int unit_attach_pids_to_cgroup(Unit *u) {
         return 0;
 }
 
-static bool unit_has_mask_realized(Unit *u, CGroupControllerMask mask) {
+static bool unit_has_mask_realized(Unit *u, CGroupMask target_mask) {
         assert(u);
 
-        return u->cgroup_realized && u->cgroup_realized_mask == mask;
+        return u->cgroup_realized && u->cgroup_realized_mask == target_mask;
 }
 
 /* Check if necessary controllers and attributes for a unit are in place.
@@ -704,7 +863,7 @@ static bool unit_has_mask_realized(Unit *u, CGroupControllerMask mask) {
  *
  * Returns 0 on success and < 0 on failure. */
 static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
-        CGroupControllerMask mask;
+        CGroupMask target_mask, enable_mask;
         int r;
 
         assert(u);
@@ -714,9 +873,8 @@ static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
                 u->in_cgroup_queue = false;
         }
 
-        mask = unit_get_target_mask(u);
-
-        if (unit_has_mask_realized(u, mask))
+        target_mask = unit_get_target_mask(u);
+        if (unit_has_mask_realized(u, target_mask))
                 return 0;
 
         /* First, realize parents */
@@ -727,12 +885,13 @@ static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
         }
 
         /* And then do the real work */
-        r = unit_create_cgroups(u, mask);
+        enable_mask = unit_get_enable_mask(u);
+        r = unit_create_cgroup(u, target_mask, enable_mask);
         if (r < 0)
                 return r;
 
         /* Finally, apply the necessary attributes. */
-        cgroup_context_apply(unit_get_cgroup_context(u), mask, u->cgroup_path, state);
+        cgroup_context_apply(unit_get_cgroup_context(u), target_mask, u->cgroup_path, state);
 
         return 0;
 }
@@ -759,7 +918,7 @@ unsigned manager_dispatch_cgroup_queue(Manager *m) {
 
                 r = unit_realize_cgroup_now(i, state);
                 if (r < 0)
-                        log_warning_errno(r, "Failed to realize cgroups for queued unit %s: %m", i->id);
+                        log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);
 
                 n++;
         }
@@ -829,39 +988,67 @@ int unit_realize_cgroup(Unit *u) {
         return unit_realize_cgroup_now(u, manager_state(u->manager));
 }
 
-void unit_destroy_cgroup_if_empty(Unit *u) {
+void unit_release_cgroup(Unit *u) {
+        assert(u);
+
+        /* Forgets all cgroup details for this unit (path and inotify watch), without touching the cgroup itself */
+
+        if (u->cgroup_path) {
+                (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
+                u->cgroup_path = mfree(u->cgroup_path);
+        }
+
+        if (u->cgroup_inotify_wd >= 0) {
+                if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_inotify_wd) < 0)
+                        log_unit_debug_errno(u, errno, "Failed to remove cgroup inotify watch %i for %s, ignoring: %m", u->cgroup_inotify_wd, u->id);
+
+                (void) hashmap_remove(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd));
+                u->cgroup_inotify_wd = -1;
+        }
+}
+
+void unit_prune_cgroup(Unit *u) {
         int r;
+        bool is_root_slice;
 
         assert(u);
 
+        /* Removes the cgroup, if empty and possible, and stops watching it. */
+
         if (!u->cgroup_path)
                 return;
 
-        r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !unit_has_name(u, SPECIAL_ROOT_SLICE));
+        is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);
+
+        r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
         if (r < 0) {
-                log_debug_errno(r, "Failed to destroy cgroup %s: %m", u->cgroup_path);
+                log_debug_errno(r, "Failed to destroy cgroup %s, ignoring: %m", u->cgroup_path);
                 return;
         }
 
-        hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
+        if (is_root_slice)
+                return;
+
+        unit_release_cgroup(u);
 
-        free(u->cgroup_path);
-        u->cgroup_path = NULL;
         u->cgroup_realized = false;
         u->cgroup_realized_mask = 0;
 }
 
-pid_t unit_search_main_pid(Unit *u) {
+int unit_search_main_pid(Unit *u, pid_t *ret) {
         _cleanup_fclose_ FILE *f = NULL;
         pid_t pid = 0, npid, mypid;
+        int r;
 
         assert(u);
+        assert(ret);
 
         if (!u->cgroup_path)
-                return 0;
+                return -ENXIO;
 
-        if (cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f) < 0)
-                return 0;
+        r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f);
+        if (r < 0)
+                return r;
 
         mypid = getpid();
         while (cg_read_pid(f, &npid) > 0)  {
@@ -874,90 +1061,274 @@ pid_t unit_search_main_pid(Unit *u) {
                 if (get_parent_of_pid(npid, &ppid) >= 0 && ppid != mypid)
                         continue;
 
-                if (pid != 0) {
+                if (pid != 0)
                         /* Dang, there's more than one daemonized PID
                         in this group, so we don't know what process
                         is the main process. */
-                        pid = 0;
-                        break;
-                }
+
+                        return -ENODATA;
 
                 pid = npid;
         }
 
-        return pid;
+        *ret = pid;
+        return 0;
+}
+
+static int unit_watch_pids_in_path(Unit *u, const char *path) {
+        _cleanup_closedir_ DIR *d = NULL;
+        _cleanup_fclose_ FILE *f = NULL;
+        int ret = 0, r;
+
+        assert(u);
+        assert(path);
+
+        r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
+        if (r < 0)
+                ret = r;
+        else {
+                pid_t pid;
+
+                while ((r = cg_read_pid(f, &pid)) > 0) {
+                        r = unit_watch_pid(u, pid);
+                        if (r < 0 && ret >= 0)
+                                ret = r;
+                }
+
+                if (r < 0 && ret >= 0)
+                        ret = r;
+        }
+
+        r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
+        if (r < 0) {
+                if (ret >= 0)
+                        ret = r;
+        } else {
+                char *fn;
+
+                while ((r = cg_read_subgroup(d, &fn)) > 0) {
+                        _cleanup_free_ char *p = NULL;
+
+                        p = strjoin(path, "/", fn, NULL);
+                        free(fn);
+
+                        if (!p)
+                                return -ENOMEM;
+
+                        r = unit_watch_pids_in_path(u, p);
+                        if (r < 0 && ret >= 0)
+                                ret = r;
+                }
+
+                if (r < 0 && ret >= 0)
+                        ret = r;
+        }
+
+        return ret;
+}
+
+int unit_watch_all_pids(Unit *u) {
+        assert(u);
+
+        /* Adds all PIDs from our cgroup to the set of PIDs we
+         * watch. This is a fallback logic for cases where we do not
+         * get reliable cgroup empty notifications: we try to use
+         * SIGCHLD as replacement. */
+
+        if (!u->cgroup_path)
+                return -ENOENT;
+
+        if (cg_unified() > 0) /* On unified we can use proper notifications */
+                return 0;
+
+        return unit_watch_pids_in_path(u, u->cgroup_path);
+}
+
+int unit_notify_cgroup_empty(Unit *u) {
+        int r;
+
+        assert(u);
+
+        if (!u->cgroup_path)
+                return 0;
+
+        r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
+        if (r <= 0)
+                return r;
+
+        unit_add_to_gc_queue(u);
+
+        if (UNIT_VTABLE(u)->notify_cgroup_empty)
+                UNIT_VTABLE(u)->notify_cgroup_empty(u);
+
+        return 0;
+}
+
+static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
+        Manager *m = userdata;
+
+        assert(s);
+        assert(fd >= 0);
+        assert(m);
+
+        for (;;) {
+                union inotify_event_buffer buffer;
+                struct inotify_event *e;
+                ssize_t l;
+
+                l = read(fd, &buffer, sizeof(buffer));
+                if (l < 0) {
+                        if (errno == EINTR || errno == EAGAIN)
+                                return 0;
+
+                        return log_error_errno(errno, "Failed to read control group inotify events: %m");
+                }
+
+                FOREACH_INOTIFY_EVENT(e, buffer, l) {
+                        Unit *u;
+
+                        if (e->wd < 0)
+                                /* Queue overflow has no watch descriptor */
+                                continue;
+
+                        if (e->mask & IN_IGNORED)
+                                /* The watch was just removed */
+                                continue;
+
+                        u = hashmap_get(m->cgroup_inotify_wd_unit, INT_TO_PTR(e->wd));
+                        if (!u) /* Note that inotify might deliver
+                                 * events for a watch even after it
+                                 * was removed, because it was queued
+                                 * before the removal. Let's ignore
+                                 * this here safely. */
+                                continue;
+
+                        (void) unit_notify_cgroup_empty(u);
+                }
+        }
 }
 
 int manager_setup_cgroup(Manager *m) {
         _cleanup_free_ char *path = NULL;
-        int r;
+        CGroupController c;
+        int r, unified;
+        char *e;
 
         assert(m);
 
         /* 1. Determine hierarchy */
-        free(m->cgroup_root);
-        m->cgroup_root = NULL;
-
+        m->cgroup_root = mfree(m->cgroup_root);
         r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
         if (r < 0)
                 return log_error_errno(r, "Cannot determine cgroup we are running in: %m");
 
-        /* LEGACY: Already in /system.slice? If so, let's cut this
-         * off. This is to support live upgrades from older systemd
-         * versions where PID 1 was moved there. */
-        if (m->running_as == MANAGER_SYSTEM) {
-                char *e;
+        /* Chop off the init scope, if we are already located in it */
+        e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
 
+        /* LEGACY: Also chop off the system slice if we are in
+         * it. This is to support live upgrades from older systemd
+         * versions where PID 1 was moved there. Also see
+         * cg_get_root_path(). */
+        if (!e && m->running_as == MANAGER_SYSTEM) {
                 e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
                 if (!e)
-                        e = endswith(m->cgroup_root, "/system");
-                if (e)
-                        *e = 0;
+                        e = endswith(m->cgroup_root, "/system"); /* even more legacy */
         }
+        if (e)
+                *e = 0;
 
         /* And make sure to store away the root value without trailing
          * slash, even for the root dir, so that we can easily prepend
          * it everywhere. */
-        if (streq(m->cgroup_root, "/"))
-                m->cgroup_root[0] = 0;
+        while ((e = endswith(m->cgroup_root, "/")))
+                *e = 0;
 
         /* 2. Show data */
         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
         if (r < 0)
                 return log_error_errno(r, "Cannot find cgroup mount point: %m");
 
-        log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);
+        unified = cg_unified();
+        if (unified < 0)
+                return log_error_errno(unified, "Couldn't determine if we are running in the unified hierarchy: %m");
+        if (unified > 0)
+                log_debug("Unified cgroup hierarchy is located at %s.", path);
+        else
+                log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);
+
         if (!m->test_run) {
+                const char *scope_path;
 
                 /* 3. Install agent */
-                if (m->running_as == MANAGER_SYSTEM) {
+                if (unified) {
+
+                        /* In the unified hierarchy we can get
+                         * cgroup empty notifications via inotify. */
+
+                        m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
+                        safe_close(m->cgroup_inotify_fd);
+
+                        m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
+                        if (m->cgroup_inotify_fd < 0)
+                                return log_error_errno(errno, "Failed to create control group inotify object: %m");
+
+                        r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to watch control group inotify object: %m");
+
+                        r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_IDLE - 5);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to set priority of inotify event source: %m");
+
+                        (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");
+
+                } else if (m->running_as == MANAGER_SYSTEM) {
+
+                        /* On the legacy hierarchy we only get
+                         * notifications via cgroup agents. (Which
+                         * isn't really reliable, since it does not
+                         * generate events when control groups with
+                         * children run empty.) */
+
                         r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
                         if (r < 0)
                                 log_warning_errno(r, "Failed to install release agent, ignoring: %m");
                         else if (r > 0)
                                 log_debug("Installed release agent.");
-                        else
+                        else if (r == 0)
                                 log_debug("Release agent already installed.");
                 }
 
-                /* 4. Make sure we are in the root cgroup */
-                r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, 0);
+                /* 4. Make sure we are in the special "init.scope" unit in the root slice. */
+                scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
+                r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
                 if (r < 0)
-                        return log_error_errno(r, "Failed to create root cgroup hierarchy: %m");
+                        return log_error_errno(r, "Failed to create %s control group: %m", scope_path);
+
+                /* also, move all other userspace processes remaining
+                 * in the root cgroup into that scope. */
+                r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, false);
+                if (r < 0)
+                        log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");
 
                 /* 5. And pin it, so that it cannot be unmounted */
                 safe_close(m->pin_cgroupfs_fd);
-
                 m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
                 if (m->pin_cgroupfs_fd < 0)
                         return log_error_errno(errno, "Failed to open pin file: %m");
 
                 /* 6.  Always enable hierarchical support if it exists... */
-                cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
+                if (!unified)
+                        (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
         }
 
         /* 7. Figure out which controllers are supported */
-        m->cgroup_supported = cg_mask_supported();
+        r = cg_mask_supported(&m->cgroup_supported);
+        if (r < 0)
+                return log_error_errno(r, "Failed to determine supported controllers: %m");
+
+        for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
+                log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c)));
 
         return 0;
 }
@@ -968,12 +1339,16 @@ void manager_shutdown_cgroup(Manager *m, bool delete) {
         /* We can't really delete the group, since we are in it. But
          * let's trim it. */
         if (delete && m->cgroup_root)
-                cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
+                (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
+
+        m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit);
+
+        m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
+        m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);
 
         m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);
 
-        free(m->cgroup_root);
-        m->cgroup_root = NULL;
+        m->cgroup_root = mfree(m->cgroup_root);
 }
 
 Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
@@ -992,8 +1367,8 @@ Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
                 char *e;
 
                 e = strrchr(p, '/');
-                if (e == p || !e)
-                        return NULL;
+                if (!e || e == p)
+                        return hashmap_get(m->units, SPECIAL_ROOT_SLICE);
 
                 *e = 0;
 
@@ -1010,9 +1385,12 @@ Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
 
         assert(m);
 
-        if (pid <= 1)
+        if (pid <= 0)
                 return NULL;
 
+        if (pid == 1)
+                return hashmap_get(m->units, SPECIAL_INIT_SCOPE);
+
         u = hashmap_get(m->watch_pids1, LONG_TO_PTR(pid));
         if (u)
                 return u;
@@ -1030,7 +1408,6 @@ Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
 
 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
         Unit *u;
-        int r;
 
         assert(m);
         assert(cgroup);
@@ -1039,15 +1416,7 @@ int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
         if (!u)
                 return 0;
 
-        r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
-        if (r <= 0)
-                return r;
-
-        if (UNIT_VTABLE(u)->notify_cgroup_empty)
-                UNIT_VTABLE(u)->notify_cgroup_empty(u);
-
-        unit_add_to_gc_queue(u);
-        return 0;
+        return unit_notify_cgroup_empty(u);
 }
 
 int unit_get_memory_current(Unit *u, uint64_t *ret) {
@@ -1060,10 +1429,13 @@ int unit_get_memory_current(Unit *u, uint64_t *ret) {
         if (!u->cgroup_path)
                 return -ENODATA;
 
-        if ((u->cgroup_realized_mask & CGROUP_MEMORY) == 0)
+        if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
                 return -ENODATA;
 
-        r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
+        if (cg_unified() <= 0)
+                r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
+        else
+                r = cg_get_attribute("memory", u->cgroup_path, "memory.current", &v);
         if (r == -ENOENT)
                 return -ENODATA;
         if (r < 0)
@@ -1083,7 +1455,7 @@ static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
         if (!u->cgroup_path)
                 return -ENODATA;
 
-        if ((u->cgroup_realized_mask & CGROUP_CPUACCT) == 0)
+        if ((u->cgroup_realized_mask & CGROUP_MASK_CPUACCT) == 0)
                 return -ENODATA;
 
         r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);
index 7b38d210fbea6523623eb0e94887e824fd8926e8..1ce21f43f2f3014e4503e6879018970bffefcf54 100644 (file)
@@ -96,22 +96,32 @@ struct CGroupContext {
 void cgroup_context_init(CGroupContext *c);
 void cgroup_context_done(CGroupContext *c);
 void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix);
-void cgroup_context_apply(CGroupContext *c, CGroupControllerMask mask, const char *path, ManagerState state);
+void cgroup_context_apply(CGroupContext *c, CGroupMask mask, const char *path, ManagerState state);
 
-CGroupControllerMask cgroup_context_get_mask(CGroupContext *c);
+CGroupMask cgroup_context_get_mask(CGroupContext *c);
 
 void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a);
 void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w);
 void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b);
 
-CGroupControllerMask unit_get_cgroup_mask(Unit *u);
-CGroupControllerMask unit_get_siblings_mask(Unit *u);
-CGroupControllerMask unit_get_members_mask(Unit *u);
-CGroupControllerMask unit_get_target_mask(Unit *u);
+CGroupMask unit_get_own_mask(Unit *u);
+CGroupMask unit_get_siblings_mask(Unit *u);
+CGroupMask unit_get_members_mask(Unit *u);
+CGroupMask unit_get_subtree_mask(Unit *u);
+
+CGroupMask unit_get_target_mask(Unit *u);
+CGroupMask unit_get_enable_mask(Unit *u);
 
 void unit_update_cgroup_members_masks(Unit *u);
+
+char *unit_default_cgroup_path(Unit *u);
+int unit_set_cgroup_path(Unit *u, const char *path);
+
 int unit_realize_cgroup(Unit *u);
-void unit_destroy_cgroup_if_empty(Unit *u);
+void unit_release_cgroup(Unit *u);
+void unit_prune_cgroup(Unit *u);
+int unit_watch_cgroup(Unit *u);
+
 int unit_attach_pids_to_cgroup(Unit *u);
 
 int manager_setup_cgroup(Manager *m);
@@ -122,9 +132,8 @@ unsigned manager_dispatch_cgroup_queue(Manager *m);
 Unit *manager_get_unit_by_cgroup(Manager *m, const char *cgroup);
 Unit* manager_get_unit_by_pid(Manager *m, pid_t pid);
 
-pid_t unit_search_main_pid(Unit *u);
-
-int manager_notify_cgroup_empty(Manager *m, const char *group);
+int unit_search_main_pid(Unit *u, pid_t *ret);
+int unit_watch_all_pids(Unit *u);
 
 int unit_get_memory_current(Unit *u, uint64_t *ret);
 int unit_get_cpu_usage(Unit *u, nsec_t *ret);
@@ -132,5 +141,8 @@ int unit_reset_cpu_usage(Unit *u);
 
 bool unit_cgroup_delegate(Unit *u);
 
+int unit_notify_cgroup_empty(Unit *u);
+int manager_notify_cgroup_empty(Manager *m, const char *group);
+
 const char* cgroup_device_policy_to_string(CGroupDevicePolicy i) _const_;
 CGroupDevicePolicy cgroup_device_policy_from_string(const char *s) _pure_;
index 9dcc51f2409d914d33134c5cd4e1e7b6b9c9e7e9..e8fd44e2942be31518ea5c1fa05ec4fcdbc63ffe 100644 (file)
@@ -228,7 +228,7 @@ int bus_cgroup_set_property(
 
                 if (mode != UNIT_CHECK) {
                         c->cpu_accounting = b;
-                        u->cgroup_realized_mask &= ~CGROUP_CPUACCT;
+                        u->cgroup_realized_mask &= ~CGROUP_MASK_CPUACCT;
                         unit_write_drop_in_private(u, mode, name, b ? "CPUAccounting=yes" : "CPUAccounting=no");
                 }
 
@@ -252,7 +252,7 @@ int bus_cgroup_set_property(
 
                 if (mode != UNIT_CHECK) {
                         c->cpu_shares = ul;
-                        u->cgroup_realized_mask &= ~CGROUP_CPU;
+                        u->cgroup_realized_mask &= ~CGROUP_MASK_CPU;
                         unit_write_drop_in_private_format(u, mode, name, "CPUShares=%lu", ul);
                 }
 
@@ -276,7 +276,7 @@ int bus_cgroup_set_property(
 
                 if (mode != UNIT_CHECK) {
                         c->startup_cpu_shares = ul;
-                        u->cgroup_realized_mask &= ~CGROUP_CPU;
+                        u->cgroup_realized_mask &= ~CGROUP_MASK_CPU;
                         unit_write_drop_in_private_format(u, mode, name, "StartupCPUShares=%lu", ul);
                 }
 
@@ -294,7 +294,7 @@ int bus_cgroup_set_property(
 
                 if (mode != UNIT_CHECK) {
                         c->cpu_quota_per_sec_usec = u64;
-                        u->cgroup_realized_mask &= ~CGROUP_CPU;
+                        u->cgroup_realized_mask &= ~CGROUP_MASK_CPU;
                         unit_write_drop_in_private_format(u, mode, "CPUQuota", "CPUQuota=%0.f%%", (double) (c->cpu_quota_per_sec_usec / 10000));
                 }
 
@@ -309,7 +309,7 @@ int bus_cgroup_set_property(
 
                 if (mode != UNIT_CHECK) {
                         c->blockio_accounting = b;
-                        u->cgroup_realized_mask &= ~CGROUP_BLKIO;
+                        u->cgroup_realized_mask &= ~CGROUP_MASK_BLKIO;
                         unit_write_drop_in_private(u, mode, name, b ? "BlockIOAccounting=yes" : "BlockIOAccounting=no");
                 }
 
@@ -333,7 +333,7 @@ int bus_cgroup_set_property(
 
                 if (mode != UNIT_CHECK) {
                         c->blockio_weight = ul;
-                        u->cgroup_realized_mask &= ~CGROUP_BLKIO;
+                        u->cgroup_realized_mask &= ~CGROUP_MASK_BLKIO;
                         unit_write_drop_in_private_format(u, mode, name, "BlockIOWeight=%lu", ul);
                 }
 
@@ -357,7 +357,7 @@ int bus_cgroup_set_property(
 
                 if (mode != UNIT_CHECK) {
                         c->startup_blockio_weight = ul;
-                        u->cgroup_realized_mask &= ~CGROUP_BLKIO;
+                        u->cgroup_realized_mask &= ~CGROUP_MASK_BLKIO;
                         unit_write_drop_in_private_format(u, mode, name, "StartupBlockIOWeight=%lu", ul);
                 }
 
@@ -427,7 +427,7 @@ int bus_cgroup_set_property(
                                                 cgroup_context_free_blockio_device_bandwidth(c, a);
                         }
 
-                        u->cgroup_realized_mask &= ~CGROUP_BLKIO;
+                        u->cgroup_realized_mask &= ~CGROUP_MASK_BLKIO;
 
                         f = open_memstream(&buf, &size);
                         if (!f)
@@ -510,7 +510,7 @@ int bus_cgroup_set_property(
                                         cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
                         }
 
-                        u->cgroup_realized_mask &= ~CGROUP_BLKIO;
+                        u->cgroup_realized_mask &= ~CGROUP_MASK_BLKIO;
 
                         f = open_memstream(&buf, &size);
                         if (!f)
@@ -535,7 +535,7 @@ int bus_cgroup_set_property(
 
                 if (mode != UNIT_CHECK) {
                         c->memory_accounting = b;
-                        u->cgroup_realized_mask &= ~CGROUP_MEMORY;
+                        u->cgroup_realized_mask &= ~CGROUP_MASK_MEMORY;
                         unit_write_drop_in_private(u, mode, name, b ? "MemoryAccounting=yes" : "MemoryAccounting=no");
                 }
 
@@ -550,7 +550,7 @@ int bus_cgroup_set_property(
 
                 if (mode != UNIT_CHECK) {
                         c->memory_limit = limit;
-                        u->cgroup_realized_mask &= ~CGROUP_MEMORY;
+                        u->cgroup_realized_mask &= ~CGROUP_MASK_MEMORY;
                         unit_write_drop_in_private_format(u, mode, name, "%s=%" PRIu64, name, limit);
                 }
 
@@ -572,7 +572,7 @@ int bus_cgroup_set_property(
                         char *buf;
 
                         c->device_policy = p;
-                        u->cgroup_realized_mask &= ~CGROUP_DEVICE;
+                        u->cgroup_realized_mask &= ~CGROUP_MASK_DEVICE;
 
                         buf = strjoina("DevicePolicy=", policy);
                         unit_write_drop_in_private(u, mode, name, buf);
@@ -651,7 +651,7 @@ int bus_cgroup_set_property(
                                         cgroup_context_free_device_allow(c, c->device_allow);
                         }
 
-                        u->cgroup_realized_mask &= ~CGROUP_DEVICE;
+                        u->cgroup_realized_mask &= ~CGROUP_MASK_DEVICE;
 
                         f = open_memstream(&buf, &size);
                         if (!f)
index 1e6291e762f4afb2966870e9c84c0ba98a96ac21..31016b6c4a1d7ac5903c59ab272d16e074e2061f 100644 (file)
@@ -25,6 +25,7 @@
 #include "cgroup-util.h"
 #include "strv.h"
 #include "bus-common-errors.h"
+#include "special.h"
 #include "dbus.h"
 #include "dbus-unit.h"
 
@@ -973,6 +974,8 @@ static int bus_unit_set_transient_property(
                         return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "The slice property is only available for units with control groups.");
                 if (u->type == UNIT_SLICE)
                         return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Slice may not be set for slice units.");
+                if (unit_has_name(u, SPECIAL_INIT_SCOPE))
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Cannot set slice for init.scope");
 
                 r = sd_bus_message_read(message, "s", &s);
                 if (r < 0)
index 8d14fe23d07b89810d5c767165c78506c4c0245a..a750246a890acb8297613613a48080df6daa29df 100644 (file)
@@ -214,7 +214,7 @@ struct ExecParameters {
         bool apply_tty_stdin;
         bool confirm_spawn;
         bool selinux_context_net;
-        CGroupControllerMask cgroup_supported;
+        CGroupMask cgroup_supported;
         const char *cgroup_path;
         bool cgroup_delegate;
         const char *runtime_prefix;
index 14f069ba9751fa79ff58b95a34ca83b04fd6124d..c3327e37f57c68dd5a48e14ec347493f60e6f581 100644 (file)
@@ -568,7 +568,9 @@ int manager_new(ManagerRunningAs running_as, bool test_run, Manager **_m) {
 
         m->idle_pipe[0] = m->idle_pipe[1] = m->idle_pipe[2] = m->idle_pipe[3] = -1;
 
-        m->pin_cgroupfs_fd = m->notify_fd = m->signal_fd = m->time_change_fd = m->dev_autofs_fd = m->private_listen_fd = m->kdbus_fd = m->utab_inotify_fd = -1;
+        m->pin_cgroupfs_fd = m->notify_fd = m->signal_fd = m->time_change_fd =
+                m->dev_autofs_fd = m->private_listen_fd = m->kdbus_fd = m->utab_inotify_fd =
+                m->cgroup_inotify_fd = -1;
         m->current_job_id = 1; /* start as id #1, so that we can leave #0 around as "null-like" value */
 
         m->ask_password_inotify_fd = -1;
@@ -2722,7 +2724,7 @@ void manager_check_finished(Manager *m) {
 
         SET_FOREACH(u, m->startup_units, i)
                 if (u->cgroup_path)
-                        cgroup_context_apply(unit_get_cgroup_context(u), unit_get_cgroup_mask(u), u->cgroup_path, manager_state(m));
+                        cgroup_context_apply(unit_get_cgroup_context(u), unit_get_own_mask(u), u->cgroup_path, manager_state(m));
 }
 
 static int create_generator_dir(Manager *m, char **generator, const char *name) {
index 3f7fa24e588ae84a68ed62ca3174b7e5992d3170..9956cb7700dbaa6bc47d9fd22d340bf35fe61377 100644 (file)
@@ -215,16 +215,22 @@ struct Manager {
 
         /* Data specific to the cgroup subsystem */
         Hashmap *cgroup_unit;
-        CGroupControllerMask cgroup_supported;
+        CGroupMask cgroup_supported;
         char *cgroup_root;
 
-        int gc_marker;
-        unsigned n_in_gc_queue;
+        /* Notifications from cgroups, when the unified hierarchy is
+         * used, are done via inotify. */
+        int cgroup_inotify_fd;
+        sd_event_source *cgroup_inotify_event_source;
+        Hashmap *cgroup_inotify_wd_unit;
 
         /* Make sure the user cannot accidentally unmount our cgroup
          * file system */
         int pin_cgroupfs_fd;
 
+        int gc_marker;
+        unsigned n_in_gc_queue;
+
         /* Flags */
         ManagerRunningAs running_as;
         ManagerExitCode exit_code:5;
index 1782d4072062f48d97f7d34b023f13e2beaa564d..c6f356991556f3717743aceea33499928e884310 100644 (file)
@@ -93,12 +93,14 @@ static const MountPoint mount_table[] = {
 #endif
         { "tmpfs",       "/run",                      "tmpfs",      "mode=755",                MS_NOSUID|MS_NODEV|MS_STRICTATIME,
           NULL,          MNT_FATAL|MNT_IN_CONTAINER },
+        { "cgroup",      "/sys/fs/cgroup",            "cgroup",     "__DEVEL__sane_behavior",  MS_NOSUID|MS_NOEXEC|MS_NODEV,
+          cg_is_unified_wanted, MNT_FATAL|MNT_IN_CONTAINER },
         { "tmpfs",       "/sys/fs/cgroup",            "tmpfs",      "mode=755",                MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME,
-          NULL,          MNT_FATAL|MNT_IN_CONTAINER },
+          cg_is_legacy_wanted, MNT_FATAL|MNT_IN_CONTAINER },
         { "cgroup",      "/sys/fs/cgroup/systemd",    "cgroup",     "none,name=systemd,xattr", MS_NOSUID|MS_NOEXEC|MS_NODEV,
-          NULL,          MNT_IN_CONTAINER           },
+          cg_is_legacy_wanted, MNT_IN_CONTAINER           },
         { "cgroup",      "/sys/fs/cgroup/systemd",    "cgroup",     "none,name=systemd",       MS_NOSUID|MS_NOEXEC|MS_NODEV,
-          NULL,          MNT_FATAL|MNT_IN_CONTAINER },
+          cg_is_legacy_wanted, MNT_FATAL|MNT_IN_CONTAINER },
         { "pstore",      "/sys/fs/pstore",            "pstore",     NULL,                      MS_NOSUID|MS_NOEXEC|MS_NODEV,
           NULL,          MNT_NONE                   },
 #ifdef ENABLE_EFI
@@ -217,6 +219,9 @@ int mount_cgroup_controllers(char ***join_controllers) {
         _cleanup_set_free_free_ Set *controllers = NULL;
         int r;
 
+        if (!cg_is_legacy_wanted())
+                return 0;
+
         /* Mount all available cgroup controllers that are built into the kernel. */
 
         controllers = set_new(&string_hash_ops);
index 1e94d63561bd6c52bbcda842f12b7b4c6fc07000..44cd324f5800c75f618ebdfeb22a7983bb1eafa3 100644 (file)
 #include <errno.h>
 #include <unistd.h>
 
-#include "unit.h"
-#include "scope.h"
 #include "log.h"
-#include "dbus-scope.h"
+#include "strv.h"
 #include "special.h"
 #include "unit-name.h"
+#include "unit.h"
+#include "scope.h"
+#include "dbus-scope.h"
 #include "load-dropin.h"
 
 static const UnitActiveState state_translation_table[_SCOPE_STATE_MAX] = {
@@ -136,7 +137,9 @@ static int scope_verify(Scope *s) {
         if (UNIT(s)->load_state != UNIT_LOADED)
                 return 0;
 
-        if (set_isempty(UNIT(s)->pids) && UNIT(s)->manager->n_reloading <= 0) {
+        if (set_isempty(UNIT(s)->pids) &&
+            !manager_is_reloading_or_reexecuting(UNIT(s)->manager) &&
+            !unit_has_name(UNIT(s), SPECIAL_INIT_SCOPE)) {
                 log_unit_error(UNIT(s), "Scope has no PIDs. Refusing.");
                 return -EINVAL;
         }
@@ -151,7 +154,7 @@ static int scope_load(Unit *u) {
         assert(s);
         assert(u->load_state == UNIT_STUB);
 
-        if (!u->transient && UNIT(s)->manager->n_reloading <= 0)
+        if (!u->transient && !manager_is_reloading_or_reexecuting(u->manager))
                 return -ENOENT;
 
         u->load_state = UNIT_LOADED;
@@ -279,6 +282,9 @@ static int scope_start(Unit *u) {
 
         assert(s);
 
+        if (unit_has_name(u, SPECIAL_INIT_SCOPE))
+                return -EPERM;
+
         if (s->state == SCOPE_FAILED)
                 return -EPERM;
 
@@ -289,7 +295,7 @@ static int scope_start(Unit *u) {
 
         assert(s->state == SCOPE_DEAD);
 
-        if (!u->transient && UNIT(s)->manager->n_reloading <= 0)
+        if (!u->transient && !manager_is_reloading_or_reexecuting(u->manager))
                 return -ENOENT;
 
         (void) unit_realize_cgroup(u);
@@ -464,6 +470,9 @@ static int scope_dispatch_timer(sd_event_source *source, usec_t usec, void *user
 int scope_abandon(Scope *s) {
         assert(s);
 
+        if (unit_has_name(UNIT(s), SPECIAL_INIT_SCOPE))
+                return -EPERM;
+
         if (!IN_SET(s->state, SCOPE_RUNNING, SCOPE_ABANDONED))
                 return -ESTALE;
 
@@ -499,6 +508,48 @@ _pure_ static const char *scope_sub_state_to_string(Unit *u) {
         return scope_state_to_string(SCOPE(u)->state);
 }
 
+static int scope_enumerate(Manager *m) {
+        Unit *u;
+        int r;
+
+        assert(m);
+
+        /* Enumeration hook for scope units (scope_vtable.enumerate). Let's
+         * unconditionally add the "init.scope" special unit that encapsulates
+         * PID 1. Note that PID 1 already is in the cgroup for this, we hence
+         * just need to allocate the object for it and that's it. Returns 0, or
+         * the result of log_oom()/log_error_errno() on failure. */
+        u = manager_get_unit(m, SPECIAL_INIT_SCOPE);
+        if (!u) {
+                u = unit_new(m, sizeof(Scope));
+                if (!u)
+                        return log_oom();
+
+                r = unit_add_name(u, SPECIAL_INIT_SCOPE);
+                if (r < 0)  {
+                        unit_free(u);
+                        return log_error_errno(r, "Failed to add init.scope name");
+                }
+        }
+        /* The settings below are applied on every call, also if the unit object existed already. */
+        u->transient = true;
+        u->default_dependencies = false;
+        u->no_gc = true;
+        SCOPE(u)->deserialized_state = SCOPE_RUNNING;
+        SCOPE(u)->kill_context.kill_signal = SIGRTMIN+14;
+
+        /* Prettify things, if we can. */
+        if (!u->description)
+                u->description = strdup("System and Service Manager");
+        if (!u->documentation)
+                (void) strv_extend(&u->documentation, "man:systemd(1)");
+
+        unit_add_to_load_queue(u);
+        unit_add_to_dbus_queue(u);
+
+        return 0;
+}
+
 static const char* const scope_state_table[_SCOPE_STATE_MAX] = {
         [SCOPE_DEAD] = "dead",
         [SCOPE_RUNNING] = "running",
@@ -565,5 +616,7 @@ const UnitVTable scope_vtable = {
         .bus_set_property = bus_scope_set_property,
         .bus_commit_properties = bus_scope_commit_properties,
 
-        .can_transient = true
+        .can_transient = true,
+
+        .enumerate = scope_enumerate,
 };
index 5a0a3aa867c15a9c7ace4325544a2b31e6851134..292fe50de81e8bbe610d4ebb07a5bcbca05a867f 100644 (file)
@@ -767,7 +767,7 @@ static int service_load_pid_file(Service *s, bool may_warn) {
 }
 
 static int service_search_main_pid(Service *s) {
-        pid_t pid;
+        pid_t pid = 0;
         int r;
 
         assert(s);
@@ -782,9 +782,9 @@ static int service_search_main_pid(Service *s) {
 
         assert(s->main_pid <= 0);
 
-        pid = unit_search_main_pid(UNIT(s));
-        if (pid <= 0)
-                return -ENOENT;
+        r = unit_search_main_pid(UNIT(s), &pid);
+        if (r < 0)
+                return r;
 
         log_unit_debug(UNIT(s), "Main PID guessed: "PID_FMT, pid);
         r = service_set_main_pid(s, pid);
@@ -860,7 +860,7 @@ static void service_set_state(Service *s, ServiceState state) {
         /* For the inactive states unit_notify() will trim the cgroup,
          * but for exit we have to do that ourselves... */
         if (state == SERVICE_EXITED && UNIT(s)->manager->n_reloading <= 0)
-                unit_destroy_cgroup_if_empty(UNIT(s));
+                unit_prune_cgroup(UNIT(s));
 
         /* For remain_after_exit services, let's see if we can "release" the
          * hold on the console, since unit_notify() only does that in case of
@@ -2644,7 +2644,7 @@ static void service_sigchld_event(Unit *u, pid_t pid, int code, int status) {
                                                 break;
                                         }
                                 } else
-                                        service_search_main_pid(s);
+                                        (void) service_search_main_pid(s);
 
                                 service_enter_start_post(s);
                                 break;
@@ -2666,7 +2666,7 @@ static void service_sigchld_event(Unit *u, pid_t pid, int code, int status) {
                                                 break;
                                         }
                                 } else
-                                        service_search_main_pid(s);
+                                        (void) service_search_main_pid(s);
 
                                 service_enter_running(s, SERVICE_SUCCESS);
                                 break;
@@ -2674,7 +2674,7 @@ static void service_sigchld_event(Unit *u, pid_t pid, int code, int status) {
                         case SERVICE_RELOAD:
                                 if (f == SERVICE_SUCCESS) {
                                         service_load_pid_file(s, true);
-                                        service_search_main_pid(s);
+                                        (void) service_search_main_pid(s);
                                 }
 
                                 s->reload_result = f;
index 7442d23391614e82c63ccdcbaa57973e88085be4..b414462066e891df87c341e9f04ec5d00931e794 100644 (file)
 
 #include <errno.h>
 
-#include "unit.h"
-#include "slice.h"
 #include "log.h"
-#include "dbus-slice.h"
+#include "strv.h"
 #include "special.h"
 #include "unit-name.h"
+#include "unit.h"
+#include "slice.h"
+#include "dbus-slice.h"
 
 static const UnitActiveState state_translation_table[_SLICE_STATE_MAX] = {
         [SLICE_DEAD] = UNIT_INACTIVE,
@@ -252,6 +253,40 @@ _pure_ static const char *slice_sub_state_to_string(Unit *u) {
         return slice_state_to_string(SLICE(u)->state);
 }
 
+static int slice_enumerate(Manager *m) {
+        Unit *u;
+        int r;
+
+        assert(m);
+        /* Enumeration hook for slice units (slice_vtable.enumerate): allocates the root slice unit unless it exists already. */
+        u = manager_get_unit(m, SPECIAL_ROOT_SLICE);
+        if (!u) {
+                u = unit_new(m, sizeof(Slice));
+                if (!u)
+                        return log_oom();
+
+                r = unit_add_name(u, SPECIAL_ROOT_SLICE);
+                if (r < 0) {
+                        unit_free(u);
+                        return log_error_errno(r, "Failed to add -.slice name");
+                }
+        }
+        /* The settings below are applied on every call, also if the unit object existed already. */
+        u->default_dependencies = false;
+        u->no_gc = true;
+        SLICE(u)->deserialized_state = SLICE_ACTIVE;
+        /* Description and documentation are only filled in if unset; allocation failures are ignored here. */
+        if (!u->description)
+                u->description = strdup("Root Slice");
+        if (!u->documentation)
+                (void) strv_extend(&u->documentation, "man:systemd.special(7)");
+
+        unit_add_to_load_queue(u);
+        unit_add_to_dbus_queue(u);
+
+        return 0;
+}
+
 static const char* const slice_state_table[_SLICE_STATE_MAX] = {
         [SLICE_DEAD] = "dead",
         [SLICE_ACTIVE] = "active"
@@ -293,6 +328,8 @@ const UnitVTable slice_vtable = {
         .bus_set_property = bus_slice_set_property,
         .bus_commit_properties = bus_slice_commit_properties,
 
+        .enumerate = slice_enumerate,
+
         .status_message_formats = {
                 .finished_start_job = {
                         [JOB_DONE]       = "Created slice %s.",
index 34d3adcd3b703a28ebaf886cee28c618a8bd1646..8c07c6140dbea58084a7489f211d33afb71b266c 100644 (file)
@@ -91,6 +91,7 @@ Unit *unit_new(Manager *m, size_t size) {
         u->unit_file_state = _UNIT_FILE_STATE_INVALID;
         u->unit_file_preset = -1;
         u->on_failure_job_mode = JOB_REPLACE;
+        u->cgroup_inotify_wd = -1;
 
         RATELIMIT_INIT(u->auto_stop_ratelimit, 10 * USEC_PER_SEC, 16);
 
@@ -525,10 +526,7 @@ void unit_free(Unit *u) {
         if (u->in_cgroup_queue)
                 LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
 
-        if (u->cgroup_path) {
-                hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
-                free(u->cgroup_path);
-        }
+        unit_release_cgroup(u);
 
         manager_update_failed_units(u->manager, u, false);
         set_remove(u->manager->startup_units, u);
@@ -1801,7 +1799,7 @@ void unit_notify(Unit *u, UnitActiveState os, UnitActiveState ns, bool reload_su
 
         /* Make sure the cgroup is always removed when we become inactive */
         if (UNIT_IS_INACTIVE_OR_FAILED(ns))
-                unit_destroy_cgroup_if_empty(u);
+                unit_prune_cgroup(u);
 
         /* Note that this doesn't apply to RemainAfterExit services exiting
          * successfully, since there's no change of state in that case. Which is
@@ -2028,70 +2026,7 @@ void unit_unwatch_all_pids(Unit *u) {
         while (!set_isempty(u->pids))
                 unit_unwatch_pid(u, PTR_TO_LONG(set_first(u->pids)));
 
-        set_free(u->pids);
-        u->pids = NULL;
-}
-
-static int unit_watch_pids_in_path(Unit *u, const char *path) {
-        _cleanup_closedir_ DIR *d = NULL;
-        _cleanup_fclose_ FILE *f = NULL;
-        int ret = 0, r;
-
-        assert(u);
-        assert(path);
-
-        /* Adds all PIDs from a specific cgroup path to the set of PIDs we watch. */
-
-        r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
-        if (r >= 0) {
-                pid_t pid;
-
-                while ((r = cg_read_pid(f, &pid)) > 0) {
-                        r = unit_watch_pid(u, pid);
-                        if (r < 0 && ret >= 0)
-                                ret = r;
-                }
-                if (r < 0 && ret >= 0)
-                        ret = r;
-
-        } else if (ret >= 0)
-                ret = r;
-
-        r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
-        if (r >= 0) {
-                char *fn;
-
-                while ((r = cg_read_subgroup(d, &fn)) > 0) {
-                        _cleanup_free_ char *p = NULL;
-
-                        p = strjoin(path, "/", fn, NULL);
-                        free(fn);
-
-                        if (!p)
-                                return -ENOMEM;
-
-                        r = unit_watch_pids_in_path(u, p);
-                        if (r < 0 && ret >= 0)
-                                ret = r;
-                }
-                if (r < 0 && ret >= 0)
-                        ret = r;
-
-        } else if (ret >= 0)
-                ret = r;
-
-        return ret;
-}
-
-int unit_watch_all_pids(Unit *u) {
-        assert(u);
-
-        /* Adds all PIDs from our cgroup to the set of PIDs we watch */
-
-        if (!u->cgroup_path)
-                return -ENOENT;
-
-        return unit_watch_pids_in_path(u, u->cgroup_path);
+        u->pids = set_free(u->pids);
 }
 
 void unit_tidy_watch_pids(Unit *u, pid_t except1, pid_t except2) {
@@ -2400,31 +2335,6 @@ char *unit_dbus_path(Unit *u) {
         return unit_dbus_path_from_name(u->id);
 }
 
-char *unit_default_cgroup_path(Unit *u) {
-        _cleanup_free_ char *escaped = NULL, *slice = NULL;
-        int r;
-
-        assert(u);
-
-        if (unit_has_name(u, SPECIAL_ROOT_SLICE))
-                return strdup(u->manager->cgroup_root);
-
-        if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) {
-                r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice);
-                if (r < 0)
-                        return NULL;
-        }
-
-        escaped = cg_escape(u->id);
-        if (!escaped)
-                return NULL;
-
-        if (slice)
-                return strjoin(u->manager->cgroup_root, "/", slice, "/", escaped, NULL);
-        else
-                return strjoin(u->manager->cgroup_root, "/", escaped, NULL);
-}
-
 int unit_set_slice(Unit *u, Unit *slice) {
         assert(u);
         assert(slice);
@@ -2447,6 +2357,10 @@ int unit_set_slice(Unit *u, Unit *slice) {
         if (slice->type != UNIT_SLICE)
                 return -EINVAL;
 
+        if (unit_has_name(u, SPECIAL_INIT_SCOPE) &&
+            !unit_has_name(slice, SPECIAL_ROOT_SLICE))
+                return -EPERM;
+
         if (UNIT_DEREF(u->slice) == slice)
                 return 0;
 
@@ -2495,7 +2409,7 @@ int unit_set_default_slice(Unit *u) {
                 slice_name = b;
         } else
                 slice_name =
-                        u->manager->running_as == MANAGER_SYSTEM
+                        u->manager->running_as == MANAGER_SYSTEM && !unit_has_name(u, SPECIAL_INIT_SCOPE)
                         ? SPECIAL_SYSTEM_SLICE
                         : SPECIAL_ROOT_SLICE;
 
@@ -2704,40 +2618,6 @@ void unit_serialize_item(Unit *u, FILE *f, const char *key, const char *value) {
         fprintf(f, "%s=%s\n", key, value);
 }
 
-static int unit_set_cgroup_path(Unit *u, const char *path) {
-        _cleanup_free_ char *p = NULL;
-        int r;
-
-        assert(u);
-
-        if (path) {
-                p = strdup(path);
-                if (!p)
-                        return -ENOMEM;
-        } else
-                p = NULL;
-
-        if (streq_ptr(u->cgroup_path, p))
-                return 0;
-
-        if (p) {
-                r = hashmap_put(u->manager->cgroup_unit, p, u);
-                if (r < 0)
-                        return r;
-        }
-
-        if (u->cgroup_path) {
-                log_unit_debug(u, "Changing cgroup path from %s to %s.", u->cgroup_path, strna(p));
-                hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
-                free(u->cgroup_path);
-        }
-
-        u->cgroup_path = p;
-        p = NULL;
-
-        return 0;
-}
-
 int unit_deserialize(Unit *u, FILE *f, FDSet *fds) {
         ExecRuntime **rt = NULL;
         size_t offset;
@@ -2868,6 +2748,8 @@ int unit_deserialize(Unit *u, FILE *f, FDSet *fds) {
                         if (r < 0)
                                 log_unit_debug_errno(u, r, "Failed to set cgroup path %s, ignoring: %m", v);
 
+                        (void) unit_watch_cgroup(u);
+
                         continue;
                 } else if (streq(l, "cgroup-realized")) {
                         int b;
@@ -3600,18 +3482,22 @@ int unit_kill_context(
 
                 } else if (r > 0) {
 
-                        /* FIXME: For now, we will not wait for the
-                         * cgroup members to die if we are running in
-                         * a container or if this is a delegation
-                         * unit, simply because cgroup notification is
-                         * unreliable in these cases. It doesn't work
-                         * at all in containers, and outside of
-                         * containers it can be confused easily by
-                         * left-over directories in the cgroup --
-                         * which however should not exist in
-                         * non-delegated units. */
-
-                        if  (detect_container(NULL) == 0 && !unit_cgroup_delegate(u))
+                        /* FIXME: For now, on the legacy hierarchy, we
+                         * will not wait for the cgroup members to die
+                         * if we are running in a container or if this
+                         * is a delegation unit, simply because cgroup
+                         * notification is unreliable in these
+                         * cases. It doesn't work at all in
+                         * containers, and outside of containers it
+                         * can be confused easily by left-over
+                         * directories in the cgroup -- which however
+                         * should not exist in non-delegated units. On
+                         * the unified hierarchy that's different,
+                         * there we get proper events. Hence rely on
+                         * them. */
+
+                        if  (cg_unified() > 0 ||
+                             (detect_container(NULL) == 0 && !unit_cgroup_delegate(u)))
                                 wait_for_exit = true;
 
                         if (c->send_sighup && k != KILL_KILL) {
index bc26653247bd33d24750b48637758dce92c66dfc..3c7684411bf1eb4cc571698128bc0d4c7514f5fc 100644 (file)
@@ -184,9 +184,10 @@ struct Unit {
 
         /* Counterparts in the cgroup filesystem */
         char *cgroup_path;
-        CGroupControllerMask cgroup_realized_mask;
-        CGroupControllerMask cgroup_subtree_mask;
-        CGroupControllerMask cgroup_members_mask;
+        CGroupMask cgroup_realized_mask;
+        CGroupMask cgroup_subtree_mask;
+        CGroupMask cgroup_members_mask;
+        int cgroup_inotify_wd;
 
         /* How to start OnFailure units */
         JobMode on_failure_job_mode;
@@ -522,7 +523,6 @@ void unit_notify(Unit *u, UnitActiveState os, UnitActiveState ns, bool reload_su
 
 int unit_watch_pid(Unit *u, pid_t pid);
 void unit_unwatch_pid(Unit *u, pid_t pid);
-int unit_watch_all_pids(Unit *u);
 void unit_unwatch_all_pids(Unit *u);
 
 void unit_tidy_watch_pids(Unit *u, pid_t except1, pid_t except2);
@@ -567,8 +567,6 @@ bool unit_active_or_pending(Unit *u);
 
 int unit_add_default_target_dependency(Unit *u, Unit *target);
 
-char *unit_default_cgroup_path(Unit *u);
-
 void unit_start_on_failure(Unit *u);
 void unit_trigger_notify(Unit *u);
 
index 8039847a721b46fd91c30e73e5ed69d615a661d4..a56960506cffb17dcaabcf05cda636f29f6d0552 100644 (file)
@@ -204,6 +204,7 @@ static char **arg_property = NULL;
 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
 static bool arg_userns = false;
 static int arg_kill_signal = 0;
+static bool arg_unified_cgroup_hierarchy = false;
 
 static void help(void) {
         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
@@ -385,6 +386,30 @@ static int set_sanitized_path(char **b, const char *path) {
         return 0;
 }
 
+static int detect_unified_cgroup_hierarchy(void) {
+        const char *e;
+        int r;
+        /* Decides whether the unified hierarchy is used and stores the result in arg_unified_cgroup_hierarchy.
+         * First, allow the user to control this via $UNIFIED_CGROUP_HIERARCHY. */
+        e = getenv("UNIFIED_CGROUP_HIERARCHY");
+        if (e) {
+                r = parse_boolean(e);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
+
+                arg_unified_cgroup_hierarchy = r;
+                return 0;
+        }
+
+        /* Otherwise inherit the default from the host system */
+        r = cg_unified();
+        if (r < 0)
+                return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
+
+        arg_unified_cgroup_hierarchy = r;
+        return 0;
+}
+
 static int parse_argv(int argc, char *argv[]) {
 
         enum {
@@ -1037,6 +1062,10 @@ static int parse_argv(int argc, char *argv[]) {
         if (arg_boot && arg_kill_signal <= 0)
                 arg_kill_signal = SIGRTMIN+3;
 
+        r = detect_unified_cgroup_hierarchy();
+        if (r < 0)
+                return r;
+
         return 1;
 }
 
@@ -1095,7 +1124,6 @@ static int mount_all(const char *dest, bool userns) {
                 { "/proc/sys", "/proc/sys",      NULL,     NULL,        MS_BIND,                                                   true,  true  },   /* Bind mount first */
                 { NULL,        "/proc/sys",      NULL,     NULL,        MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, true,  true  },   /* Then, make it r/o */
                 { "sysfs",     "/sys",           "sysfs",  NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV,                    true,  false },
-                { "tmpfs",     "/sys/fs/cgroup", "tmpfs",  "mode=755",  MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME,               true,  false },
                 { "tmpfs",     "/dev",           "tmpfs",  "mode=755",  MS_NOSUID|MS_STRICTATIME,                                  true,  false },
                 { "tmpfs",     "/dev/shm",       "tmpfs",  "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,                         true,  false },
                 { "tmpfs",     "/run",           "tmpfs",  "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,                         true,  false },
@@ -1381,7 +1409,7 @@ static int mount_custom(const char *dest) {
         return 0;
 }
 
-static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
+static int mount_legacy_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
         char *to;
         int r;
 
@@ -1409,11 +1437,31 @@ static int mount_cgroup_hierarchy(const char *dest, const char *controller, cons
         return 1;
 }
 
-static int mount_cgroup(const char *dest) {
+static int mount_legacy_cgroups(const char *dest) {
         _cleanup_set_free_free_ Set *controllers = NULL;
         const char *cgroup_root;
         int r;
 
+        cgroup_root = prefix_roota(dest, "/sys/fs/cgroup");
+
+        /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
+        r = path_is_mount_point(cgroup_root, AT_SYMLINK_FOLLOW);
+        if (r < 0)
+                return log_error_errno(r, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
+        if (r == 0) {
+                _cleanup_free_ char *options = NULL;
+
+                r = tmpfs_patch_options("mode=755", &options);
+                if (r < 0)
+                        return log_oom();
+
+                if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options) < 0)
+                        return log_error_errno(errno, "Failed to mount /sys/fs/cgroup: %m");
+        }
+
+        if (cg_unified() > 0)
+                goto skip_controllers;
+
         controllers = set_new(&string_hash_ops);
         if (!controllers)
                 return log_oom();
@@ -1437,7 +1485,7 @@ static int mount_cgroup(const char *dest) {
                 if (r == -EINVAL) {
                         /* Not a symbolic link, but directly a single cgroup hierarchy */
 
-                        r = mount_cgroup_hierarchy(dest, controller, controller, true);
+                        r = mount_legacy_cgroup_hierarchy(dest, controller, controller, true);
                         if (r < 0)
                                 return r;
 
@@ -1457,7 +1505,7 @@ static int mount_cgroup(const char *dest) {
                                 continue;
                         }
 
-                        r = mount_cgroup_hierarchy(dest, combined, combined, true);
+                        r = mount_legacy_cgroup_hierarchy(dest, combined, combined, true);
                         if (r < 0)
                                 return r;
 
@@ -1471,17 +1519,52 @@ static int mount_cgroup(const char *dest) {
                 }
         }
 
-        r = mount_cgroup_hierarchy(dest, "name=systemd,xattr", "systemd", false);
+skip_controllers:
+        r = mount_legacy_cgroup_hierarchy(dest, "none,name=systemd,xattr", "systemd", false);
         if (r < 0)
                 return r;
 
-        cgroup_root = prefix_roota(dest, "/sys/fs/cgroup");
         if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
                 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
 
         return 0;
 }
 
+static int mount_unified_cgroups(const char *dest) {
+        const char *p, *probe;
+        int r;
+
+        assert(dest);
+
+        p = strjoina(dest, "/sys/fs/cgroup");
+        /* An existing mount is accepted only if a top-level cgroup.procs file is found in it. */
+        r = path_is_mount_point(p, AT_SYMLINK_FOLLOW);
+        if (r < 0)
+                return log_error_errno(r, "Failed to determine if %s is mounted already: %m", p);
+        if (r > 0) {
+                probe = strjoina(dest, "/sys/fs/cgroup/cgroup.procs");
+                if (access(probe, F_OK) >= 0)
+                        return 0;
+                if (errno != ENOENT)
+                        return log_error_errno(errno, "Failed to determine if mount point %s contains the unified cgroup hierarchy: %m", p);
+                /* Name the mount point itself in the messages, not the cgroup.procs probe path. */
+                log_error("%s is already mounted but not a unified cgroup hierarchy. Refusing.", p);
+                return -EINVAL;
+        }
+        /* Nothing is mounted there yet, hence mount the unified hierarchy ourselves. */
+        if (mount("cgroup", p, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, "__DEVEL__sane_behavior") < 0)
+                return log_error_errno(errno, "Failed to mount unified cgroup hierarchy to %s: %m", p);
+
+        return 0;
+}
+
+static int mount_cgroups(const char *dest) {
+        if (arg_unified_cgroup_hierarchy)
+                return mount_unified_cgroups(dest);
+        else
+                return mount_legacy_cgroups(dest);
+}
+
 static int mount_systemd_cgroup_writable(const char *dest) {
         _cleanup_free_ char *own_cgroup_path = NULL;
         const char *systemd_root, *systemd_own;
@@ -1493,13 +1576,23 @@ static int mount_systemd_cgroup_writable(const char *dest) {
         if (r < 0)
                 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
 
+        /* If we are living in the top-level, then there's nothing to do... */
+        if (path_equal(own_cgroup_path, "/"))
+                return 0;
+
+        if (arg_unified_cgroup_hierarchy) {
+                systemd_own = strjoina(dest, "/sys/fs/cgroup", own_cgroup_path);
+                systemd_root = prefix_roota(dest, "/sys/fs/cgroup");
+        } else {
+                systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
+                systemd_root = prefix_roota(dest, "/sys/fs/cgroup/systemd");
+        }
+
         /* Make our own cgroup a (writable) bind mount */
-        systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
         if (mount(systemd_own, systemd_own,  NULL, MS_BIND, NULL) < 0)
                 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
 
         /* And then remount the systemd cgroup root read-only */
-        systemd_root = prefix_roota(dest, "/sys/fs/cgroup/systemd");
         if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
                 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
 
@@ -4187,6 +4280,8 @@ static int inner_child(
         assert(directory);
         assert(kmsg_socket >= 0);
 
+        cg_unified_flush();
+
         if (arg_userns) {
                 /* Tell the parent, that it now can write the UID map. */
                 (void) barrier_place(barrier); /* #1 */
@@ -4368,6 +4463,8 @@ static int outer_child(
         assert(pid_socket >= 0);
         assert(kmsg_socket >= 0);
 
+        cg_unified_flush();
+
         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
                 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
 
@@ -4484,7 +4581,7 @@ static int outer_child(
         if (r < 0)
                 return r;
 
-        r = mount_cgroup(directory);
+        r = mount_cgroups(directory);
         if (r < 0)
                 return r;
 
@@ -4499,7 +4596,6 @@ static int outer_child(
                         NULL);
         if (pid < 0)
                 return log_error_errno(errno, "Failed to fork inner child: %m");
-
         if (pid == 0) {
                 pid_socket = safe_close(pid_socket);
                 uid_shift_socket = safe_close(uid_shift_socket);
@@ -4567,9 +4663,112 @@ static int chown_cgroup(pid_t pid) {
         if (fd < 0)
                 return log_error_errno(errno, "Failed to open %s: %m", fs);
 
-        FOREACH_STRING(fn, ".", "tasks", "notify_on_release", "cgroup.procs", "cgroup.clone_children")
+        FOREACH_STRING(fn,
+                       ".",
+                       "tasks",
+                       "notify_on_release",
+                       "cgroup.procs",
+                       "cgroup.clone_children",
+                       "cgroup.controllers",
+                       "cgroup.subtree_control",
+                       "cgroup.populated")
                 if (fchownat(fd, fn, arg_uid_shift, arg_uid_shift, 0) < 0)
-                        log_warning_errno(errno, "Failed to chown() cgroup file %s, ignoring: %m", fn);
+                        log_full_errno(errno == ENOENT ? LOG_DEBUG :  LOG_WARNING, errno,
+                                       "Failed to chown() cgroup file %s, ignoring: %m", fn);
+
+        return 0;
+}
+
+static int sync_cgroup(pid_t pid) {
+        _cleanup_free_ char *cgroup = NULL;
+        char tree[] = "/tmp/unifiedXXXXXX", pid_string[DECIMAL_STR_MAX(pid) + 1];
+        bool undo_mount = false;
+        const char *fn;
+        int unified, r;
+
+        unified = cg_unified();
+        if (unified < 0)
+                return log_error_errno(unified, "Failed to determine whether the unified hierarchy is used: %m");
+
+        if ((unified > 0) == arg_unified_cgroup_hierarchy)
+                return 0;
+
+        /* Host and container disagree on the cgroup setup: when the host uses the legacy setup but the
+         * container shall use the unified hierarchy (or the reverse), take the cgroup path "pid" has in
+         * the host's hierarchy, recreate it in the other hierarchy and move "pid" into it there.
+         * Returns 0 right away if both setups match, otherwise the result of the move (negative on error). */
+
+        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
+        if (r < 0)
+                return log_error_errno(r, "Failed to get control group of " PID_FMT ": %m", pid);
+
+        /* In order to access the hierarchy the host is not using we need to mount it, on a temporary directory */
+        if (!mkdtemp(tree))
+                return log_error_errno(errno, "Failed to generate temporary mount point for unified hierarchy: %m");
+
+        if (unified)
+                r = mount("cgroup", tree, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, "none,name=systemd,xattr");
+        else
+                r = mount("cgroup", tree, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, "__DEVEL__sane_behavior");
+        if (r < 0) {
+                r = log_error_errno(errno, "Failed to mount %s hierarchy: %m", unified ? "legacy name=systemd" : "unified");
+                goto finish;
+        }
+
+        undo_mount = true;
+
+        fn = strjoina(tree, cgroup, "/cgroup.procs");
+        (void) mkdir_parents(fn, 0755); /* best-effort: a failure here surfaces when writing cgroup.procs below */
+
+        snprintf(pid_string, sizeof(pid_string), PID_FMT, pid);
+        r = write_string_file(fn, pid_string, 0);
+        if (r < 0)
+                log_error_errno(r, "Failed to move process: %m");
+
+finish: /* tear down the temporary mount (if it was established) and its mount point, on success and failure alike */
+        if (undo_mount)
+                (void) umount(tree);
+
+        (void) rmdir(tree);
+        return r;
+}
+
+static int create_subcgroup(pid_t pid) {
+        _cleanup_free_ char *cgroup = NULL;
+        const char *child;
+        int unified, r;
+
+        /* Splits our own cgroup into a "payload" subgroup for "pid" and a "supervisor" subgroup for
+         * ourselves. In the unified hierarchy inner nodes may only contain subgroups, but not
+         * processes. Hence, if we are running in the unified hierarchy and the container does the
+         * same, and we did not create a scope unit for the container, move us and the container into
+         * two separate subcgroups. Returns 0 on success or if nothing needs to be done, negative on error. */
+
+        if (!arg_keep_unit)
+                return 0;
+
+        if (!arg_unified_cgroup_hierarchy)
+                return 0;
+
+        unified = cg_unified();
+        if (unified < 0)
+                return log_error_errno(unified, "Failed to determine whether the unified hierachy is used: %m");
+        if (unified == 0)
+                return 0;
+
+        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &cgroup); /* PID 0: our own control group */
+        if (r < 0)
+                return log_error_errno(r, "Failed to get our control group: %m");
+
+        child = strjoina(cgroup, "/payload"); /* subgroup that receives the process passed in as "pid" */
+        r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, child, pid);
+        if (r < 0)
+                return log_error_errno(r, "Failed to create %s subcgroup: %m", child);
+
+        child = strjoina(cgroup, "/supervisor"); /* subgroup for PID 0, i.e. presumably the calling process */
+        r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, child, 0);
+        if (r < 0)
+                return log_error_errno(r, "Failed to create %s subcgroup: %m", child);
 
         return 0;
 }
@@ -4976,6 +5175,14 @@ int main(int argc, char *argv[]) {
                 if (r < 0)
                         goto finish;
 
+                r = sync_cgroup(pid);
+                if (r < 0)
+                        goto finish;
+
+                r = create_subcgroup(pid);
+                if (r < 0)
+                        goto finish;
+
                 r = chown_cgroup(pid);
                 if (r < 0)
                         goto finish;
index 72f874d8a91f55fa6cd0c30bc23e48dfbe8d90fe..de6c421b82cb44de829e875c78abe6c17471b2d7 100644 (file)
@@ -61,36 +61,36 @@ static int test_cgroup_mask(void) {
         root = UNIT_DEREF(parent->slice);
 
         /* Verify per-unit cgroups settings. */
-        assert_se(unit_get_cgroup_mask(son) == (CGROUP_CPU | CGROUP_CPUACCT));
-        assert_se(unit_get_cgroup_mask(daughter) == 0);
-        assert_se(unit_get_cgroup_mask(grandchild) == 0);
-        assert_se(unit_get_cgroup_mask(parent_deep) == CGROUP_MEMORY);
-        assert_se(unit_get_cgroup_mask(parent) == CGROUP_BLKIO);
-        assert_se(unit_get_cgroup_mask(root) == 0);
+        assert_se(unit_get_own_mask(son) == (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT));
+        assert_se(unit_get_own_mask(daughter) == 0);
+        assert_se(unit_get_own_mask(grandchild) == 0);
+        assert_se(unit_get_own_mask(parent_deep) == CGROUP_MASK_MEMORY);
+        assert_se(unit_get_own_mask(parent) == CGROUP_MASK_BLKIO);
+        assert_se(unit_get_own_mask(root) == 0);
 
         /* Verify aggregation of member masks */
         assert_se(unit_get_members_mask(son) == 0);
         assert_se(unit_get_members_mask(daughter) == 0);
         assert_se(unit_get_members_mask(grandchild) == 0);
         assert_se(unit_get_members_mask(parent_deep) == 0);
-        assert_se(unit_get_members_mask(parent) == (CGROUP_CPU | CGROUP_CPUACCT | CGROUP_MEMORY));
-        assert_se(unit_get_members_mask(root) == (CGROUP_CPU | CGROUP_CPUACCT | CGROUP_BLKIO | CGROUP_MEMORY));
+        assert_se(unit_get_members_mask(parent) == (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT | CGROUP_MASK_MEMORY));
+        assert_se(unit_get_members_mask(root) == (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT | CGROUP_MASK_BLKIO | CGROUP_MASK_MEMORY));
 
         /* Verify aggregation of sibling masks. */
-        assert_se(unit_get_siblings_mask(son) == (CGROUP_CPU | CGROUP_CPUACCT | CGROUP_MEMORY));
-        assert_se(unit_get_siblings_mask(daughter) == (CGROUP_CPU | CGROUP_CPUACCT | CGROUP_MEMORY));
+        assert_se(unit_get_siblings_mask(son) == (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT | CGROUP_MASK_MEMORY));
+        assert_se(unit_get_siblings_mask(daughter) == (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT | CGROUP_MASK_MEMORY));
         assert_se(unit_get_siblings_mask(grandchild) == 0);
-        assert_se(unit_get_siblings_mask(parent_deep) == (CGROUP_CPU | CGROUP_CPUACCT | CGROUP_MEMORY));
-        assert_se(unit_get_siblings_mask(parent) == (CGROUP_CPU | CGROUP_CPUACCT | CGROUP_BLKIO | CGROUP_MEMORY));
-        assert_se(unit_get_siblings_mask(root) == (CGROUP_CPU | CGROUP_CPUACCT | CGROUP_BLKIO | CGROUP_MEMORY));
+        assert_se(unit_get_siblings_mask(parent_deep) == (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT | CGROUP_MASK_MEMORY));
+        assert_se(unit_get_siblings_mask(parent) == (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT | CGROUP_MASK_BLKIO | CGROUP_MASK_MEMORY));
+        assert_se(unit_get_siblings_mask(root) == (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT | CGROUP_MASK_BLKIO | CGROUP_MASK_MEMORY));
 
         /* Verify aggregation of target masks. */
-        assert_se(unit_get_target_mask(son) == ((CGROUP_CPU | CGROUP_CPUACCT | CGROUP_MEMORY) & m->cgroup_supported));
-        assert_se(unit_get_target_mask(daughter) == ((CGROUP_CPU | CGROUP_CPUACCT | CGROUP_MEMORY) & m->cgroup_supported));
+        assert_se(unit_get_target_mask(son) == ((CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT | CGROUP_MASK_MEMORY) & m->cgroup_supported));
+        assert_se(unit_get_target_mask(daughter) == ((CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT | CGROUP_MASK_MEMORY) & m->cgroup_supported));
         assert_se(unit_get_target_mask(grandchild) == 0);
-        assert_se(unit_get_target_mask(parent_deep) == ((CGROUP_CPU | CGROUP_CPUACCT | CGROUP_MEMORY) & m->cgroup_supported));
-        assert_se(unit_get_target_mask(parent) == ((CGROUP_CPU | CGROUP_CPUACCT | CGROUP_BLKIO | CGROUP_MEMORY) & m->cgroup_supported));
-        assert_se(unit_get_target_mask(root) == ((CGROUP_CPU | CGROUP_CPUACCT | CGROUP_BLKIO | CGROUP_MEMORY) & m->cgroup_supported));
+        assert_se(unit_get_target_mask(parent_deep) == ((CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT | CGROUP_MASK_MEMORY) & m->cgroup_supported));
+        assert_se(unit_get_target_mask(parent) == ((CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT | CGROUP_MASK_BLKIO | CGROUP_MASK_MEMORY) & m->cgroup_supported));
+        assert_se(unit_get_target_mask(root) == ((CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT | CGROUP_MASK_BLKIO | CGROUP_MASK_MEMORY) & m->cgroup_supported));
 
         manager_free(m);
 
index 8b0302cfe6d3ee20ffdcd4b7a166f059da383840..37b1c3554aa1116839a6df77e914278ee7b7a694 100644 (file)
@@ -74,8 +74,8 @@ int main(int argc, char*argv[]) {
 
         cg_trim(SYSTEMD_CGROUP_CONTROLLER, "/", false);
 
-        assert_se(cg_delete(SYSTEMD_CGROUP_CONTROLLER, "/test-b") < 0);
-        assert_se(cg_delete(SYSTEMD_CGROUP_CONTROLLER, "/test-a") >= 0);
+        assert_se(cg_rmdir(SYSTEMD_CGROUP_CONTROLLER, "/test-b") < 0);
+        assert_se(cg_rmdir(SYSTEMD_CGROUP_CONTROLLER, "/test-a") >= 0);
 
         assert_se(cg_split_spec("foobar:/", &c, &p) == 0);
         assert_se(streq(c, "foobar"));