]> git.ipfire.org Git - thirdparty/systemd.git/blobdiff - src/nspawn/nspawn.c
core: unified cgroup hierarchy support
[thirdparty/systemd.git] / src / nspawn / nspawn.c
index df341a6e74eab219f1d2a61221c5d7bd43aced23..a56960506cffb17dcaabcf05cda636f29f6d0552 100644 (file)
@@ -204,6 +204,7 @@ static char **arg_property = NULL;
 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
 static bool arg_userns = false;
 static int arg_kill_signal = 0;
+static bool arg_unified_cgroup_hierarchy = false;
 
 static void help(void) {
         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
@@ -257,9 +258,11 @@ static void help(void) {
                "                            try-guest, try-host\n"
                "  -j                        Equivalent to --link-journal=try-guest\n"
                "     --read-only            Mount the root directory read-only\n"
-               "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
+               "     --bind=PATH[:PATH[:OPTIONS]]\n"
+               "                            Bind mount a file or directory from the host into\n"
                "                            the container\n"
-               "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
+               "     --bind-ro=PATH[:PATH[:OPTIONS]\n"
+               "                            Similar, but creates a read-only bind mount\n"
                "     --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
                "     --overlay=PATH[:PATH...]:PATH\n"
                "                            Create an overlay mount from the host to \n"
@@ -309,8 +312,7 @@ static void custom_mount_free_all(void) {
                 strv_free(m->lower);
         }
 
-        free(arg_custom_mounts);
-        arg_custom_mounts = NULL;
+        arg_custom_mounts = mfree(arg_custom_mounts);
         arg_n_custom_mounts = 0;
 }
 
@@ -341,6 +343,11 @@ static int custom_mounts_prepare(void) {
         for (i = 0; i < arg_n_custom_mounts; i++) {
                 CustomMount *m = &arg_custom_mounts[i];
 
+                if (arg_userns && arg_uid_shift == UID_INVALID && path_equal(m->destination, "/")) {
+                        log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
+                        return -EINVAL;
+                }
+
                 if (m->type != CUSTOM_MOUNT_OVERLAY)
                         continue;
 
@@ -379,6 +386,30 @@ static int set_sanitized_path(char **b, const char *path) {
         return 0;
 }
 
+static int detect_unified_cgroup_hierarchy(void) {
+        const char *e;
+        int r;
+
+        /* Allow the user to control whether the unified hierarchy is used */
+        e = getenv("UNIFIED_CGROUP_HIERARCHY");
+        if (e) {
+                r = parse_boolean(e);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
+
+                arg_unified_cgroup_hierarchy = r;
+                return 0;
+        }
+
+        /* Otherwise inherit the default from the host system */
+        r = cg_unified();
+        if (r < 0)
+                return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
+
+        arg_unified_cgroup_hierarchy = r;
+        return 0;
+}
+
 static int parse_argv(int argc, char *argv[]) {
 
         enum {
@@ -498,9 +529,8 @@ static int parse_argv(int argc, char *argv[]) {
                         break;
 
                 case 'u':
-                        free(arg_user);
-                        arg_user = strdup(optarg);
-                        if (!arg_user)
+                        r = free_and_strdup(&arg_user, optarg);
+                        if (r < 0)
                                 return log_oom();
 
                         break;
@@ -556,10 +586,9 @@ static int parse_argv(int argc, char *argv[]) {
                         break;
 
                 case 'M':
-                        if (isempty(optarg)) {
-                                free(arg_machine);
-                                arg_machine = NULL;
-                        } else {
+                        if (isempty(optarg))
+                                arg_machine = mfree(arg_machine);
+                        else {
                                 if (!machine_name_is_valid(optarg)) {
                                         log_error("Invalid machine name: %s", optarg);
                                         return -EINVAL;
@@ -653,17 +682,22 @@ static int parse_argv(int argc, char *argv[]) {
 
                 case ARG_BIND:
                 case ARG_BIND_RO: {
-                        _cleanup_free_ char *source = NULL, *destination = NULL;
+                        const char *current = optarg;
+                        _cleanup_free_ char *source = NULL, *destination = NULL, *opts = NULL;
                         CustomMount *m;
-                        char *e;
 
-                        e = strchr(optarg, ':');
-                        if (e) {
-                                source = strndup(optarg, e - optarg);
-                                destination = strdup(e + 1);
-                        } else {
-                                source = strdup(optarg);
-                                destination = strdup(optarg);
+                        r = extract_many_words(&current, ":", EXTRACT_DONT_COALESCE_SEPARATORS, &source, &destination, &opts, NULL);
+                        switch (r) {
+                        case 1:
+                                destination = strdup(source);
+                        case 2:
+                        case 3:
+                                break;
+                        case -ENOMEM:
+                                return log_oom();
+                        default:
+                                log_error("Invalid bind mount specification: %s", optarg);
+                                return -EINVAL;
                         }
 
                         if (!source || !destination)
@@ -681,25 +715,29 @@ static int parse_argv(int argc, char *argv[]) {
                         m->source = source;
                         m->destination = destination;
                         m->read_only = c == ARG_BIND_RO;
+                        m->options = opts;
 
-                        source = destination = NULL;
+                        source = destination = opts = NULL;
 
                         break;
                 }
 
                 case ARG_TMPFS: {
+                        const char *current = optarg;
                         _cleanup_free_ char *path = NULL, *opts = NULL;
                         CustomMount *m;
-                        char *e;
 
-                        e = strchr(optarg, ':');
-                        if (e) {
-                                path = strndup(optarg, e - optarg);
-                                opts = strdup(e + 1);
-                        } else {
-                                path = strdup(optarg);
-                                opts = strdup("mode=0755");
+                        r = extract_first_word(&current, &path, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
+                        if (r == -ENOMEM)
+                                return log_oom();
+                        else if (r < 0) {
+                                log_error("Invalid tmpfs specification: %s", optarg);
+                                return r;
                         }
+                        if (r)
+                                opts = strdup(current);
+                        else
+                                opts = strdup("mode=0755");
 
                         if (!path || !opts)
                                 return log_oom();
@@ -729,9 +767,13 @@ static int parse_argv(int argc, char *argv[]) {
                         unsigned n = 0;
                         char **i;
 
-                        lower = strv_split(optarg, ":");
-                        if (!lower)
+                        r = strv_split_extract(&lower, optarg, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
+                        if (r == -ENOMEM)
                                 return log_oom();
+                        else if (r < 0) {
+                                log_error("Invalid overlay specification: %s", optarg);
+                                return r;
+                        }
 
                         STRV_FOREACH(i, lower) {
                                 if (!path_is_absolute(*i)) {
@@ -751,9 +793,8 @@ static int parse_argv(int argc, char *argv[]) {
                                 /* If two parameters are specified,
                                  * the first one is the lower, the
                                  * second one the upper directory. And
-                                 * we'll also define the the
-                                 * destination mount point the same as
-                                 * the upper. */
+                                 * we'll also define the destination
+                                 * mount point the same as the upper. */
                                 upper = lower[1];
                                 lower[1] = NULL;
 
@@ -1021,6 +1062,10 @@ static int parse_argv(int argc, char *argv[]) {
         if (arg_boot && arg_kill_signal <= 0)
                 arg_kill_signal = SIGRTMIN+3;
 
+        r = detect_unified_cgroup_hierarchy();
+        if (r < 0)
+                return r;
+
         return 1;
 }
 
@@ -1028,6 +1073,7 @@ static int tmpfs_patch_options(const char *options, char **ret) {
         char *buf = NULL;
 
         if (arg_userns && arg_uid_shift != 0) {
+                assert(arg_uid_shift != UID_INVALID);
 
                 if (options)
                         (void) asprintf(&buf, "%s,uid=" UID_FMT ",gid=" UID_FMT, options, arg_uid_shift, arg_uid_shift);
@@ -1078,7 +1124,6 @@ static int mount_all(const char *dest, bool userns) {
                 { "/proc/sys", "/proc/sys",      NULL,     NULL,        MS_BIND,                                                   true,  true  },   /* Bind mount first */
                 { NULL,        "/proc/sys",      NULL,     NULL,        MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, true,  true  },   /* Then, make it r/o */
                 { "sysfs",     "/sys",           "sysfs",  NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV,                    true,  false },
-                { "tmpfs",     "/sys/fs/cgroup", "tmpfs",  "mode=755",  MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME,               true,  false },
                 { "tmpfs",     "/dev",           "tmpfs",  "mode=755",  MS_NOSUID|MS_STRICTATIME,                                  true,  false },
                 { "tmpfs",     "/dev/shm",       "tmpfs",  "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,                         true,  false },
                 { "tmpfs",     "/run",           "tmpfs",  "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,                         true,  false },
@@ -1145,13 +1190,53 @@ static int mount_all(const char *dest, bool userns) {
         return 0;
 }
 
+static int parse_mount_bind_options(const char *options, unsigned long *mount_flags, char **mount_opts) {
+        const char *p = options;
+        unsigned long flags = *mount_flags;
+        char *opts = NULL;
+
+        assert(options);
+
+        for (;;) {
+                _cleanup_free_ char *word = NULL;
+                int r = extract_first_word(&p, &word, ",", 0);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to extract mount option: %m");
+                if (r == 0)
+                        break;
+
+                if (streq(word, "rbind"))
+                        flags |= MS_REC;
+                else if (streq(word, "norbind"))
+                        flags &= ~MS_REC;
+                else {
+                        log_error("Invalid bind mount option: %s", word);
+                        return -EINVAL;
+                }
+        }
+
+        *mount_flags = flags;
+        /* in the future mount_opts will hold string options for mount(2) */
+        *mount_opts = opts;
+
+        return 0;
+}
+
 static int mount_bind(const char *dest, CustomMount *m) {
         struct stat source_st, dest_st;
         const char *where;
+        unsigned long mount_flags = MS_BIND | MS_REC;
+        _cleanup_free_ char *mount_opts = NULL;
         int r;
 
         assert(m);
 
+        if (m->options) {
+                r = parse_mount_bind_options(m->options, &mount_flags, &mount_opts);
+                if (r < 0)
+                        return r;
+        }
+
         if (stat(m->source, &source_st) < 0)
                 return log_error_errno(errno, "Failed to stat %s: %m", m->source);
 
@@ -1188,7 +1273,7 @@ static int mount_bind(const char *dest, CustomMount *m) {
         if (r < 0 && r != -EEXIST)
                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
 
-        if (mount(m->source, where, NULL, MS_BIND, NULL) < 0)
+        if (mount(m->source, where, NULL, mount_flags, mount_opts) < 0)
                 return log_error_errno(errno, "mount(%s) failed: %m", where);
 
         if (m->read_only) {
@@ -1225,6 +1310,21 @@ static int mount_tmpfs(const char *dest, CustomMount *m) {
         return 0;
 }
 
+static char *joined_and_escaped_lower_dirs(char * const *lower) {
+        _cleanup_strv_free_ char **sv = NULL;
+
+        sv = strv_copy(lower);
+        if (!sv)
+                return NULL;
+
+        strv_reverse(sv);
+
+        if (!strv_shell_escape(sv, ",:"))
+                return NULL;
+
+        return strv_join(sv, ":");
+}
+
 static int mount_overlay(const char *dest, CustomMount *m) {
         _cleanup_free_ char *lower = NULL;
         const char *where, *options;
@@ -1241,19 +1341,32 @@ static int mount_overlay(const char *dest, CustomMount *m) {
 
         (void) mkdir_p_label(m->source, 0755);
 
-        strv_reverse(m->lower);
-        lower = strv_join(m->lower, ":");
-        strv_reverse(m->lower);
+        lower = joined_and_escaped_lower_dirs(m->lower);
         if (!lower)
                 return log_oom();
 
-        if (m->read_only)
-                options = strjoina("lowerdir=", m->source, ":", lower);
-        else {
+        if (m->read_only) {
+                _cleanup_free_ char *escaped_source = NULL;
+
+                escaped_source = shell_escape(m->source, ",:");
+                if (!escaped_source)
+                        return log_oom();
+
+                options = strjoina("lowerdir=", escaped_source, ":", lower);
+        } else {
+                _cleanup_free_ char *escaped_source = NULL, *escaped_work_dir = NULL;
+
                 assert(m->work_dir);
                 (void) mkdir_label(m->work_dir, 0700);
 
-                options = strjoina("lowerdir=", lower, ",upperdir=", m->source, ",workdir=", m->work_dir);
+                escaped_source = shell_escape(m->source, ",:");
+                if (!escaped_source)
+                        return log_oom();
+                escaped_work_dir = shell_escape(m->work_dir, ",:");
+                if (!escaped_work_dir)
+                        return log_oom();
+
+                options = strjoina("lowerdir=", lower, ",upperdir=", escaped_source, ",workdir=", escaped_work_dir);
         }
 
         if (mount("overlay", where, "overlay", m->read_only ? MS_RDONLY : 0, options) < 0)
@@ -1296,7 +1409,7 @@ static int mount_custom(const char *dest) {
         return 0;
 }
 
-static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
+static int mount_legacy_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
         char *to;
         int r;
 
@@ -1324,11 +1437,31 @@ static int mount_cgroup_hierarchy(const char *dest, const char *controller, cons
         return 1;
 }
 
-static int mount_cgroup(const char *dest) {
+static int mount_legacy_cgroups(const char *dest) {
         _cleanup_set_free_free_ Set *controllers = NULL;
         const char *cgroup_root;
         int r;
 
+        cgroup_root = prefix_roota(dest, "/sys/fs/cgroup");
+
+        /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
+        r = path_is_mount_point(cgroup_root, AT_SYMLINK_FOLLOW);
+        if (r < 0)
+                return log_error_errno(r, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
+        if (r == 0) {
+                _cleanup_free_ char *options = NULL;
+
+                r = tmpfs_patch_options("mode=755", &options);
+                if (r < 0)
+                        return log_oom();
+
+                if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options) < 0)
+                        return log_error_errno(errno, "Failed to mount /sys/fs/cgroup: %m");
+        }
+
+        if (cg_unified() > 0)
+                goto skip_controllers;
+
         controllers = set_new(&string_hash_ops);
         if (!controllers)
                 return log_oom();
@@ -1352,7 +1485,7 @@ static int mount_cgroup(const char *dest) {
                 if (r == -EINVAL) {
                         /* Not a symbolic link, but directly a single cgroup hierarchy */
 
-                        r = mount_cgroup_hierarchy(dest, controller, controller, true);
+                        r = mount_legacy_cgroup_hierarchy(dest, controller, controller, true);
                         if (r < 0)
                                 return r;
 
@@ -1372,7 +1505,7 @@ static int mount_cgroup(const char *dest) {
                                 continue;
                         }
 
-                        r = mount_cgroup_hierarchy(dest, combined, combined, true);
+                        r = mount_legacy_cgroup_hierarchy(dest, combined, combined, true);
                         if (r < 0)
                                 return r;
 
@@ -1386,17 +1519,52 @@ static int mount_cgroup(const char *dest) {
                 }
         }
 
-        r = mount_cgroup_hierarchy(dest, "name=systemd,xattr", "systemd", false);
+skip_controllers:
+        r = mount_legacy_cgroup_hierarchy(dest, "none,name=systemd,xattr", "systemd", false);
         if (r < 0)
                 return r;
 
-        cgroup_root = prefix_roota(dest, "/sys/fs/cgroup");
         if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
                 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
 
         return 0;
 }
 
+static int mount_unified_cgroups(const char *dest) {
+        const char *p;
+        int r;
+
+        assert(dest);
+
+        p = strjoina(dest, "/sys/fs/cgroup");
+
+        r = path_is_mount_point(p, AT_SYMLINK_FOLLOW);
+        if (r < 0)
+                return log_error_errno(r, "Failed to determine if %s is mounted already: %m", p);
+        if (r > 0) {
+                p = strjoina(dest, "/sys/fs/cgroup/cgroup.procs");
+                if (access(p, F_OK) >= 0)
+                        return 0;
+                if (errno != ENOENT)
+                        return log_error_errno(errno, "Failed to determine if mount point %s contains the unified cgroup hierarchy: %m", p);
+
+                log_error("%s is already mounted but not a unified cgroup hierarchy. Refusing.", p);
+                return -EINVAL;
+        }
+
+        if (mount("cgroup", p, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, "__DEVEL__sane_behavior") < 0)
+                return log_error_errno(errno, "Failed to mount unified cgroup hierarchy to %s: %m", p);
+
+        return 0;
+}
+
+static int mount_cgroups(const char *dest) {
+        if (arg_unified_cgroup_hierarchy)
+                return mount_unified_cgroups(dest);
+        else
+                return mount_legacy_cgroups(dest);
+}
+
 static int mount_systemd_cgroup_writable(const char *dest) {
         _cleanup_free_ char *own_cgroup_path = NULL;
         const char *systemd_root, *systemd_own;
@@ -1408,13 +1576,23 @@ static int mount_systemd_cgroup_writable(const char *dest) {
         if (r < 0)
                 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
 
+        /* If we are living in the top-level, then there's nothing to do... */
+        if (path_equal(own_cgroup_path, "/"))
+                return 0;
+
+        if (arg_unified_cgroup_hierarchy) {
+                systemd_own = strjoina(dest, "/sys/fs/cgroup", own_cgroup_path);
+                systemd_root = prefix_roota(dest, "/sys/fs/cgroup");
+        } else {
+                systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
+                systemd_root = prefix_roota(dest, "/sys/fs/cgroup/systemd");
+        }
+
         /* Make our own cgroup a (writable) bind mount */
-        systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
         if (mount(systemd_own, systemd_own,  NULL, MS_BIND, NULL) < 0)
                 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
 
         /* And then remount the systemd cgroup root read-only */
-        systemd_root = prefix_roota(dest, "/sys/fs/cgroup/systemd");
         if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
                 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
 
@@ -1697,7 +1875,7 @@ static int setup_boot_id(const char *dest) {
 
         id128_format_as_uuid(rnd, as_uuid);
 
-        r = write_string_file(from, as_uuid);
+        r = write_string_file(from, as_uuid, WRITE_STRING_FILE_CREATE);
         if (r < 0)
                 return log_error_errno(r, "Failed to write boot id: %m");
 
@@ -1780,15 +1958,13 @@ static int setup_pts(const char *dest) {
 #ifdef HAVE_SELINUX
         if (arg_selinux_apifs_context)
                 (void) asprintf(&options,
-                                "newinstance,ptmxmode=0666,mode=620,uid=" UID_FMT ",gid=" GID_FMT ",context=\"%s\"",
-                                arg_uid_shift,
+                                "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
                                 arg_uid_shift + TTY_GID,
                                 arg_selinux_apifs_context);
         else
 #endif
                 (void) asprintf(&options,
-                                "newinstance,ptmxmode=0666,mode=620,uid=" UID_FMT ",gid=" GID_FMT,
-                                arg_uid_shift,
+                                "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
                                 arg_uid_shift + TTY_GID);
 
         if (!options)
@@ -2273,7 +2449,7 @@ static int drop_capabilities(void) {
 
 static int register_machine(pid_t pid, int local_ifindex) {
         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
-        _cleanup_bus_close_unref_ sd_bus *bus = NULL;
+        _cleanup_bus_flush_close_unref_ sd_bus *bus = NULL;
         int r;
 
         if (!arg_register)
@@ -2430,7 +2606,7 @@ static int register_machine(pid_t pid, int local_ifindex) {
 static int terminate_machine(pid_t pid) {
         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
-        _cleanup_bus_close_unref_ sd_bus *bus = NULL;
+        _cleanup_bus_flush_close_unref_ sd_bus *bus = NULL;
         const char *path;
         int r;
 
@@ -2502,7 +2678,7 @@ static int reset_audit_loginuid(void) {
         if (streq(p, "4294967295"))
                 return 0;
 
-        r = write_string_file("/proc/self/loginuid", "4294967295");
+        r = write_string_file("/proc/self/loginuid", "4294967295", 0);
         if (r < 0) {
                 log_error_errno(r,
                                 "Failed to reset audit login UID. This probably means that your kernel is too\n"
@@ -3963,6 +4139,17 @@ static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo
 static int determine_names(void) {
         int r;
 
+        if (arg_template && !arg_directory && arg_machine) {
+
+                /* If --template= was specified then we should not
+                 * search for a machine, but instead create a new one
+                 * in /var/lib/machine. */
+
+                arg_directory = strjoin("/var/lib/machines/", arg_machine, NULL);
+                if (!arg_directory)
+                        return log_oom();
+        }
+
         if (!arg_image && !arg_directory) {
                 if (arg_machine) {
                         _cleanup_(image_unrefp) Image *i = NULL;
@@ -4002,7 +4189,7 @@ static int determine_names(void) {
                 if (!arg_machine)
                         return log_oom();
 
-                hostname_cleanup(arg_machine, false);
+                hostname_cleanup(arg_machine);
                 if (!machine_name_is_valid(arg_machine)) {
                         log_error("Failed to determine machine name automatically, please use -M.");
                         return -EINVAL;
@@ -4093,6 +4280,8 @@ static int inner_child(
         assert(directory);
         assert(kmsg_socket >= 0);
 
+        cg_unified_flush();
+
         if (arg_userns) {
                 /* Tell the parent, that it now can write the UID map. */
                 (void) barrier_place(barrier); /* #1 */
@@ -4259,6 +4448,7 @@ static int outer_child(
                 int pid_socket,
                 int kmsg_socket,
                 int rtnl_socket,
+                int uid_shift_socket,
                 FDSet *fds,
                 int argc,
                 char *argv[]) {
@@ -4273,6 +4463,8 @@ static int outer_child(
         assert(pid_socket >= 0);
         assert(kmsg_socket >= 0);
 
+        cg_unified_flush();
+
         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
                 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
 
@@ -4317,6 +4509,16 @@ static int outer_child(
         if (r < 0)
                 return r;
 
+        if (arg_userns) {
+                l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
+                if (l < 0)
+                        return log_error_errno(errno, "Failed to send UID shift: %m");
+                if (l != sizeof(arg_uid_shift)) {
+                        log_error("Short write while sending UID shift.");
+                        return -EIO;
+                }
+        }
+
         /* Turn directory into bind mount */
         if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
                 return log_error_errno(errno, "Failed to make bind mount: %m");
@@ -4379,7 +4581,7 @@ static int outer_child(
         if (r < 0)
                 return r;
 
-        r = mount_cgroup(directory);
+        r = mount_cgroups(directory);
         if (r < 0)
                 return r;
 
@@ -4394,9 +4596,9 @@ static int outer_child(
                         NULL);
         if (pid < 0)
                 return log_error_errno(errno, "Failed to fork inner child: %m");
-
         if (pid == 0) {
                 pid_socket = safe_close(pid_socket);
+                uid_shift_socket = safe_close(uid_shift_socket);
 
                 /* The inner child has all namespaces that are
                  * requested, so that we all are owned by the user if
@@ -4430,13 +4632,13 @@ static int setup_uid_map(pid_t pid) {
 
         xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
         xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
-        r = write_string_file(uid_map, line);
+        r = write_string_file(uid_map, line, 0);
         if (r < 0)
                 return log_error_errno(r, "Failed to write UID map: %m");
 
         /* We always assign the same UID and GID ranges */
         xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
-        r = write_string_file(uid_map, line);
+        r = write_string_file(uid_map, line, 0);
         if (r < 0)
                 return log_error_errno(r, "Failed to write GID map: %m");
 
@@ -4461,9 +4663,112 @@ static int chown_cgroup(pid_t pid) {
         if (fd < 0)
                 return log_error_errno(errno, "Failed to open %s: %m", fs);
 
-        FOREACH_STRING(fn, ".", "tasks", "notify_on_release", "cgroup.procs", "cgroup.clone_children")
+        FOREACH_STRING(fn,
+                       ".",
+                       "tasks",
+                       "notify_on_release",
+                       "cgroup.procs",
+                       "cgroup.clone_children",
+                       "cgroup.controllers",
+                       "cgroup.subtree_control",
+                       "cgroup.populated")
                 if (fchownat(fd, fn, arg_uid_shift, arg_uid_shift, 0) < 0)
-                        log_warning_errno(errno, "Failed to chown() cgroup file %s, ignoring: %m", fn);
+                        log_full_errno(errno == ENOENT ? LOG_DEBUG :  LOG_WARNING, errno,
+                                       "Failed to chown() cgroup file %s, ignoring: %m", fn);
+
+        return 0;
+}
+
+static int sync_cgroup(pid_t pid) {
+        _cleanup_free_ char *cgroup = NULL;
+        char tree[] = "/tmp/unifiedXXXXXX", pid_string[DECIMAL_STR_MAX(pid) + 1];
+        bool undo_mount = false;
+        const char *fn;
+        int unified, r;
+
+        unified = cg_unified();
+        if (unified < 0)
+                return log_error_errno(unified, "Failed to determine whether the unified hierachy is used: %m");
+
+        if ((unified > 0) == arg_unified_cgroup_hierarchy)
+                return 0;
+
+        /* When the host uses the legacy cgroup setup, but the
+         * container shall use the unified hierarchy, let's make sure
+         * we copy the path from the name=systemd hierarchy into the
+         * unified hierarchy. Similar for the reverse situation. */
+
+        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
+        if (r < 0)
+                return log_error_errno(r, "Failed to get control group of " PID_FMT ": %m", pid);
+
+        /* In order to access the unified hierarchy we need to mount it */
+        if (!mkdtemp(tree))
+                return log_error_errno(errno, "Failed to generate temporary mount point for unified hierarchy: %m");
+
+        if (unified)
+                r = mount("cgroup", tree, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, "none,name=systemd,xattr");
+        else
+                r = mount("cgroup", tree, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, "__DEVEL__sane_behavior");
+        if (r < 0) {
+                r = log_error_errno(errno, "Failed to mount unified hierarchy: %m");
+                goto finish;
+        }
+
+        undo_mount = true;
+
+        fn = strjoina(tree, cgroup, "/cgroup.procs");
+        (void) mkdir_parents(fn, 0755);
+
+        sprintf(pid_string, PID_FMT, pid);
+        r = write_string_file(fn, pid_string, 0);
+        if (r < 0)
+                log_error_errno(r, "Failed to move process: %m");
+
+finish:
+        if (undo_mount)
+                (void) umount(tree);
+
+        (void) rmdir(tree);
+        return r;
+}
+
+static int create_subcgroup(pid_t pid) {
+        _cleanup_free_ char *cgroup = NULL;
+        const char *child;
+        int unified, r;
+
+        /* In the unified hierarchy inner nodes may only only contain
+         * subgroups, but not processes. Hence, if we running in the
+         * unified hierarchy and the container does the same, and we
+         * did not create a scope unit for the container move us and
+         * the container into two separate subcgroups. */
+
+        if (!arg_keep_unit)
+                return 0;
+
+        if (!arg_unified_cgroup_hierarchy)
+                return 0;
+
+        unified = cg_unified();
+        if (unified < 0)
+                return log_error_errno(unified, "Failed to determine whether the unified hierachy is used: %m");
+        if (unified == 0)
+                return 0;
+
+        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &cgroup);
+        if (r < 0)
+                return log_error_errno(r, "Failed to get our control group: %m");
+
+        child = strjoina(cgroup, "/payload");
+        r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, child, pid);
+        if (r < 0)
+                return log_error_errno(r, "Failed to create %s subcgroup: %m", child);
+
+        child = strjoina(cgroup, "/supervisor");
+        r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, child, 0);
+        if (r < 0)
+                return log_error_errno(r, "Failed to create %s subcgroup: %m", child);
 
         return 0;
 }
@@ -4687,7 +4992,8 @@ int main(int argc, char *argv[]) {
         }
 
         for (;;) {
-                _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 }, pid_socket_pair[2] = { -1, -1 };
+                _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 }, pid_socket_pair[2] = { -1, -1 },
+                                         uid_shift_socket_pair[2] = { -1, -1 };
                 ContainerStatus container_status;
                 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
                 static const struct sigaction sa = {
@@ -4696,10 +5002,10 @@ int main(int argc, char *argv[]) {
                 };
                 int ifi = 0;
                 ssize_t l;
-                                _cleanup_event_unref_ sd_event *event = NULL;
-                                _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
-                                _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
-                                char last_char = 0;
+                _cleanup_event_unref_ sd_event *event = NULL;
+                _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
+                _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
+                char last_char = 0;
 
                 r = barrier_create(&barrier);
                 if (r < 0) {
@@ -4722,6 +5028,12 @@ int main(int argc, char *argv[]) {
                         goto finish;
                 }
 
+                if (arg_userns)
+                        if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
+                                r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
+                                goto finish;
+                        }
+
                 /* Child can be killed before execv(), so handle SIGCHLD
                  * in order to interrupt parent's blocking calls and
                  * give it a chance to call wait() and terminate. */
@@ -4756,6 +5068,7 @@ int main(int argc, char *argv[]) {
                         kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
                         rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
                         pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
+                        uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
 
                         (void) reset_all_signal_handlers();
                         (void) reset_signal_mask();
@@ -4771,6 +5084,7 @@ int main(int argc, char *argv[]) {
                                         pid_socket_pair[1],
                                         kmsg_socket_pair[1],
                                         rtnl_socket_pair[1],
+                                        uid_shift_socket_pair[1],
                                         fds,
                                         argc, argv);
                         if (r < 0)
@@ -4819,6 +5133,17 @@ int main(int argc, char *argv[]) {
                                 goto finish;
                         }
 
+                        l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0);
+                        if (l < 0) {
+                                r = log_error_errno(errno, "Failed to read UID shift: %m");
+                                goto finish;
+                        }
+                        if (l != sizeof(arg_uid_shift)) {
+                                log_error("Short read while reading UID shift: %m");
+                                r = EIO;
+                                goto finish;
+                        }
+
                         r = setup_uid_map(pid);
                         if (r < 0)
                                 goto finish;
@@ -4850,6 +5175,14 @@ int main(int argc, char *argv[]) {
                 if (r < 0)
                         goto finish;
 
+                r = sync_cgroup(pid);
+                if (r < 0)
+                        goto finish;
+
+                r = create_subcgroup(pid);
+                if (r < 0)
+                        goto finish;
+
                 r = chown_cgroup(pid);
                 if (r < 0)
                         goto finish;