X-Git-Url: http://git.ipfire.org/?a=blobdiff_plain;f=src%2Fnspawn%2Fnspawn.c;h=a56960506cffb17dcaabcf05cda636f29f6d0552;hb=efdb02375beb0a940c3320865572913780b4d7de;hp=4cf2d14ae2c095664a58c299d4c8782e786088d3;hpb=6433d440729d22f5954ea1bf9c828bbad04e0838;p=thirdparty%2Fsystemd.git diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index 4cf2d14ae2c..a56960506cf 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -204,6 +204,7 @@ static char **arg_property = NULL; static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U; static bool arg_userns = false; static int arg_kill_signal = 0; +static bool arg_unified_cgroup_hierarchy = false; static void help(void) { printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n" @@ -257,9 +258,11 @@ static void help(void) { " try-guest, try-host\n" " -j Equivalent to --link-journal=try-guest\n" " --read-only Mount the root directory read-only\n" - " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n" + " --bind=PATH[:PATH[:OPTIONS]]\n" + " Bind mount a file or directory from the host into\n" " the container\n" - " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n" + " --bind-ro=PATH[:PATH[:OPTIONS]\n" + " Similar, but creates a read-only bind mount\n" " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n" " --overlay=PATH[:PATH...]:PATH\n" " Create an overlay mount from the host to \n" @@ -309,8 +312,7 @@ static void custom_mount_free_all(void) { strv_free(m->lower); } - free(arg_custom_mounts); - arg_custom_mounts = NULL; + arg_custom_mounts = mfree(arg_custom_mounts); arg_n_custom_mounts = 0; } @@ -341,6 +343,11 @@ static int custom_mounts_prepare(void) { for (i = 0; i < arg_n_custom_mounts; i++) { CustomMount *m = &arg_custom_mounts[i]; + if (arg_userns && arg_uid_shift == UID_INVALID && path_equal(m->destination, "/")) { + log_error("--private-users with automatic UID shift may not be combined with custom root mounts."); + return -EINVAL; + } + if (m->type != CUSTOM_MOUNT_OVERLAY) continue; @@ -379,6 +386,30 @@ static int set_sanitized_path(char **b, const char *path) { return 0; } +static int detect_unified_cgroup_hierarchy(void) { + const char *e; + int r; + + /* Allow the user to control whether the unified hierarchy is used */ + e = getenv("UNIFIED_CGROUP_HIERARCHY"); + if (e) { + r = parse_boolean(e); + if (r < 0) + return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY."); + + arg_unified_cgroup_hierarchy = r; + return 0; + } + + /* Otherwise inherit the default from the host system */ + r = cg_unified(); + if (r < 0) + return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m"); + + arg_unified_cgroup_hierarchy = r; + return 0; +} + static int parse_argv(int argc, char *argv[]) { enum { @@ -498,9 +529,8 @@ static int parse_argv(int argc, char *argv[]) { break; case 'u': - free(arg_user); - arg_user = strdup(optarg); - if (!arg_user) + r = free_and_strdup(&arg_user, optarg); + if (r < 0) return log_oom(); break; @@ -556,10 +586,9 @@ static int parse_argv(int argc, char *argv[]) { break; case 'M': - if (isempty(optarg)) { - free(arg_machine); - arg_machine = NULL; - } else { + if (isempty(optarg)) + arg_machine = mfree(arg_machine); + else { if (!machine_name_is_valid(optarg)) { log_error("Invalid machine name: %s", optarg); return -EINVAL; @@ -653,17 +682,22 @@ static int parse_argv(int argc, char *argv[]) { case ARG_BIND: case ARG_BIND_RO: { - _cleanup_free_ char *source = NULL, *destination = NULL; + const char *current = optarg; + _cleanup_free_ char *source = NULL, *destination = NULL, *opts = NULL; CustomMount *m; - char *e; - e = strchr(optarg, ':'); - if (e) { - source = strndup(optarg, e - optarg); - destination = strdup(e + 1); - } else { - source = strdup(optarg); - destination = strdup(optarg); + r = extract_many_words(¤t, ":", EXTRACT_DONT_COALESCE_SEPARATORS, &source, &destination, &opts, NULL); + switch (r) { + case 1: + destination = strdup(source); + case 2: + case 3: + break; + case -ENOMEM: + return log_oom(); + default: + log_error("Invalid bind mount specification: %s", optarg); + return -EINVAL; } if (!source || !destination) @@ -681,25 +715,29 @@ static int parse_argv(int argc, char *argv[]) { m->source = source; m->destination = destination; m->read_only = c == ARG_BIND_RO; + m->options = opts; - source = destination = NULL; + source = destination = opts = NULL; break; } case ARG_TMPFS: { + const char *current = optarg; _cleanup_free_ char *path = NULL, *opts = NULL; CustomMount *m; - char *e; - e = strchr(optarg, ':'); - if (e) { - path = strndup(optarg, e - optarg); - opts = strdup(e + 1); - } else { - path = strdup(optarg); - opts = strdup("mode=0755"); + r = extract_first_word(¤t, &path, ":", EXTRACT_DONT_COALESCE_SEPARATORS); + if (r == -ENOMEM) + return log_oom(); + else if (r < 0) { + log_error("Invalid tmpfs specification: %s", optarg); + return r; } + if (r) + opts = strdup(current); + else + opts = strdup("mode=0755"); if (!path || !opts) return log_oom(); @@ -729,9 +767,13 @@ static int parse_argv(int argc, char *argv[]) { unsigned n = 0; char **i; - lower = strv_split(optarg, ":"); - if (!lower) + r = strv_split_extract(&lower, optarg, ":", EXTRACT_DONT_COALESCE_SEPARATORS); + if (r == -ENOMEM) return log_oom(); + else if (r < 0) { + log_error("Invalid overlay specification: %s", optarg); + return r; + } STRV_FOREACH(i, lower) { if (!path_is_absolute(*i)) { @@ -751,9 +793,8 @@ static int parse_argv(int argc, char *argv[]) { /* If two parameters are specified, * the first one is the lower, the * second one the upper directory. And - * we'll also define the the - * destination mount point the same as - * the upper. */ + * we'll also define the destination + * mount point the same as the upper. */ upper = lower[1]; lower[1] = NULL; @@ -1021,6 +1062,10 @@ static int parse_argv(int argc, char *argv[]) { if (arg_boot && arg_kill_signal <= 0) arg_kill_signal = SIGRTMIN+3; + r = detect_unified_cgroup_hierarchy(); + if (r < 0) + return r; + return 1; } @@ -1028,6 +1073,7 @@ static int tmpfs_patch_options(const char *options, char **ret) { char *buf = NULL; if (arg_userns && arg_uid_shift != 0) { + assert(arg_uid_shift != UID_INVALID); if (options) (void) asprintf(&buf, "%s,uid=" UID_FMT ",gid=" UID_FMT, options, arg_uid_shift, arg_uid_shift); @@ -1074,18 +1120,17 @@ static int mount_all(const char *dest, bool userns) { } MountPoint; static const MountPoint mount_table[] = { - { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true, true }, - { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true, true }, /* Bind mount first */ - { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true, true }, /* Then, make it r/o */ - { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false }, - { "tmpfs", "/sys/fs/cgroup", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, true, false }, - { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true, false }, - { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false }, - { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false }, - { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, true, false }, + { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true, true }, + { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true, true }, /* Bind mount first */ + { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, true, true }, /* Then, make it r/o */ + { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false }, + { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true, false }, + { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false }, + { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false }, + { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, true, false }, #ifdef HAVE_SELINUX - { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false, false }, /* Bind mount first */ - { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false, false }, /* Then, make it r/o */ + { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false, false }, /* Bind mount first */ + { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, false, false }, /* Then, make it r/o */ #endif }; @@ -1145,13 +1190,53 @@ static int mount_all(const char *dest, bool userns) { return 0; } +static int parse_mount_bind_options(const char *options, unsigned long *mount_flags, char **mount_opts) { + const char *p = options; + unsigned long flags = *mount_flags; + char *opts = NULL; + + assert(options); + + for (;;) { + _cleanup_free_ char *word = NULL; + int r = extract_first_word(&p, &word, ",", 0); + if (r < 0) + return log_error_errno(r, "Failed to extract mount option: %m"); + if (r == 0) + break; + + if (streq(word, "rbind")) + flags |= MS_REC; + else if (streq(word, "norbind")) + flags &= ~MS_REC; + else { + log_error("Invalid bind mount option: %s", word); + return -EINVAL; + } + } + + *mount_flags = flags; + /* in the future mount_opts will hold string options for mount(2) */ + *mount_opts = opts; + + return 0; +} + static int mount_bind(const char *dest, CustomMount *m) { struct stat source_st, dest_st; const char *where; + unsigned long mount_flags = MS_BIND | MS_REC; + _cleanup_free_ char *mount_opts = NULL; int r; assert(m); + if (m->options) { + r = parse_mount_bind_options(m->options, &mount_flags, &mount_opts); + if (r < 0) + return r; + } + if (stat(m->source, &source_st) < 0) return log_error_errno(errno, "Failed to stat %s: %m", m->source); @@ -1188,7 +1273,7 @@ static int mount_bind(const char *dest, CustomMount *m) { if (r < 0 && r != -EEXIST) return log_error_errno(r, "Failed to create mount point %s: %m", where); - if (mount(m->source, where, NULL, MS_BIND, NULL) < 0) + if (mount(m->source, where, NULL, mount_flags, mount_opts) < 0) return log_error_errno(errno, "mount(%s) failed: %m", where); if (m->read_only) { @@ -1225,6 +1310,21 @@ static int mount_tmpfs(const char *dest, CustomMount *m) { return 0; } +static char *joined_and_escaped_lower_dirs(char * const *lower) { + _cleanup_strv_free_ char **sv = NULL; + + sv = strv_copy(lower); + if (!sv) + return NULL; + + strv_reverse(sv); + + if (!strv_shell_escape(sv, ",:")) + return NULL; + + return strv_join(sv, ":"); +} + static int mount_overlay(const char *dest, CustomMount *m) { _cleanup_free_ char *lower = NULL; const char *where, *options; @@ -1241,19 +1341,32 @@ static int mount_overlay(const char *dest, CustomMount *m) { (void) mkdir_p_label(m->source, 0755); - strv_reverse(m->lower); - lower = strv_join(m->lower, ":"); - strv_reverse(m->lower); + lower = joined_and_escaped_lower_dirs(m->lower); if (!lower) return log_oom(); - if (m->read_only) - options = strjoina("lowerdir=", m->source, ":", lower); - else { + if (m->read_only) { + _cleanup_free_ char *escaped_source = NULL; + + escaped_source = shell_escape(m->source, ",:"); + if (!escaped_source) + return log_oom(); + + options = strjoina("lowerdir=", escaped_source, ":", lower); + } else { + _cleanup_free_ char *escaped_source = NULL, *escaped_work_dir = NULL; + assert(m->work_dir); (void) mkdir_label(m->work_dir, 0700); - options = strjoina("lowerdir=", lower, ",upperdir=", m->source, ",workdir=", m->work_dir); + escaped_source = shell_escape(m->source, ",:"); + if (!escaped_source) + return log_oom(); + escaped_work_dir = shell_escape(m->work_dir, ",:"); + if (!escaped_work_dir) + return log_oom(); + + options = strjoina("lowerdir=", lower, ",upperdir=", escaped_source, ",workdir=", escaped_work_dir); } if (mount("overlay", where, "overlay", m->read_only ? MS_RDONLY : 0, options) < 0) @@ -1296,7 +1409,7 @@ static int mount_custom(const char *dest) { return 0; } -static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) { +static int mount_legacy_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) { char *to; int r; @@ -1324,11 +1437,31 @@ static int mount_cgroup_hierarchy(const char *dest, const char *controller, cons return 1; } -static int mount_cgroup(const char *dest) { +static int mount_legacy_cgroups(const char *dest) { _cleanup_set_free_free_ Set *controllers = NULL; const char *cgroup_root; int r; + cgroup_root = prefix_roota(dest, "/sys/fs/cgroup"); + + /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */ + r = path_is_mount_point(cgroup_root, AT_SYMLINK_FOLLOW); + if (r < 0) + return log_error_errno(r, "Failed to determine if /sys/fs/cgroup is already mounted: %m"); + if (r == 0) { + _cleanup_free_ char *options = NULL; + + r = tmpfs_patch_options("mode=755", &options); + if (r < 0) + return log_oom(); + + if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options) < 0) + return log_error_errno(errno, "Failed to mount /sys/fs/cgroup: %m"); + } + + if (cg_unified() > 0) + goto skip_controllers; + controllers = set_new(&string_hash_ops); if (!controllers) return log_oom(); @@ -1352,7 +1485,7 @@ static int mount_cgroup(const char *dest) { if (r == -EINVAL) { /* Not a symbolic link, but directly a single cgroup hierarchy */ - r = mount_cgroup_hierarchy(dest, controller, controller, true); + r = mount_legacy_cgroup_hierarchy(dest, controller, controller, true); if (r < 0) return r; @@ -1372,7 +1505,7 @@ static int mount_cgroup(const char *dest) { continue; } - r = mount_cgroup_hierarchy(dest, combined, combined, true); + r = mount_legacy_cgroup_hierarchy(dest, combined, combined, true); if (r < 0) return r; @@ -1386,17 +1519,52 @@ static int mount_cgroup(const char *dest) { } } - r = mount_cgroup_hierarchy(dest, "name=systemd,xattr", "systemd", false); +skip_controllers: + r = mount_legacy_cgroup_hierarchy(dest, "none,name=systemd,xattr", "systemd", false); if (r < 0) return r; - cgroup_root = prefix_roota(dest, "/sys/fs/cgroup"); if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0) return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root); return 0; } +static int mount_unified_cgroups(const char *dest) { + const char *p; + int r; + + assert(dest); + + p = strjoina(dest, "/sys/fs/cgroup"); + + r = path_is_mount_point(p, AT_SYMLINK_FOLLOW); + if (r < 0) + return log_error_errno(r, "Failed to determine if %s is mounted already: %m", p); + if (r > 0) { + p = strjoina(dest, "/sys/fs/cgroup/cgroup.procs"); + if (access(p, F_OK) >= 0) + return 0; + if (errno != ENOENT) + return log_error_errno(errno, "Failed to determine if mount point %s contains the unified cgroup hierarchy: %m", p); + + log_error("%s is already mounted but not a unified cgroup hierarchy. Refusing.", p); + return -EINVAL; + } + + if (mount("cgroup", p, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, "__DEVEL__sane_behavior") < 0) + return log_error_errno(errno, "Failed to mount unified cgroup hierarchy to %s: %m", p); + + return 0; +} + +static int mount_cgroups(const char *dest) { + if (arg_unified_cgroup_hierarchy) + return mount_unified_cgroups(dest); + else + return mount_legacy_cgroups(dest); +} + static int mount_systemd_cgroup_writable(const char *dest) { _cleanup_free_ char *own_cgroup_path = NULL; const char *systemd_root, *systemd_own; @@ -1408,13 +1576,23 @@ static int mount_systemd_cgroup_writable(const char *dest) { if (r < 0) return log_error_errno(r, "Failed to determine our own cgroup path: %m"); + /* If we are living in the top-level, then there's nothing to do... */ + if (path_equal(own_cgroup_path, "/")) + return 0; + + if (arg_unified_cgroup_hierarchy) { + systemd_own = strjoina(dest, "/sys/fs/cgroup", own_cgroup_path); + systemd_root = prefix_roota(dest, "/sys/fs/cgroup"); + } else { + systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path); + systemd_root = prefix_roota(dest, "/sys/fs/cgroup/systemd"); + } + /* Make our own cgroup a (writable) bind mount */ - systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path); if (mount(systemd_own, systemd_own, NULL, MS_BIND, NULL) < 0) return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path); /* And then remount the systemd cgroup root read-only */ - systemd_root = prefix_roota(dest, "/sys/fs/cgroup/systemd"); if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0) return log_error_errno(errno, "Failed to mount cgroup root read-only: %m"); @@ -1697,7 +1875,7 @@ static int setup_boot_id(const char *dest) { id128_format_as_uuid(rnd, as_uuid); - r = write_string_file(from, as_uuid); + r = write_string_file(from, as_uuid, WRITE_STRING_FILE_CREATE); if (r < 0) return log_error_errno(r, "Failed to write boot id: %m"); @@ -1780,15 +1958,13 @@ static int setup_pts(const char *dest) { #ifdef HAVE_SELINUX if (arg_selinux_apifs_context) (void) asprintf(&options, - "newinstance,ptmxmode=0666,mode=620,uid=" UID_FMT ",gid=" GID_FMT ",context=\"%s\"", - arg_uid_shift, + "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"", arg_uid_shift + TTY_GID, arg_selinux_apifs_context); else #endif (void) asprintf(&options, - "newinstance,ptmxmode=0666,mode=620,uid=" UID_FMT ",gid=" GID_FMT, - arg_uid_shift, + "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT, arg_uid_shift + TTY_GID); if (!options) @@ -2273,7 +2449,7 @@ static int drop_capabilities(void) { static int register_machine(pid_t pid, int local_ifindex) { _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL; - _cleanup_bus_close_unref_ sd_bus *bus = NULL; + _cleanup_bus_flush_close_unref_ sd_bus *bus = NULL; int r; if (!arg_register) @@ -2430,7 +2606,7 @@ static int register_machine(pid_t pid, int local_ifindex) { static int terminate_machine(pid_t pid) { _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL; _cleanup_bus_message_unref_ sd_bus_message *reply = NULL; - _cleanup_bus_close_unref_ sd_bus *bus = NULL; + _cleanup_bus_flush_close_unref_ sd_bus *bus = NULL; const char *path; int r; @@ -2502,7 +2678,7 @@ static int reset_audit_loginuid(void) { if (streq(p, "4294967295")) return 0; - r = write_string_file("/proc/self/loginuid", "4294967295"); + r = write_string_file("/proc/self/loginuid", "4294967295", 0); if (r < 0) { log_error_errno(r, "Failed to reset audit login UID. This probably means that your kernel is too\n" @@ -3963,6 +4139,17 @@ static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo static int determine_names(void) { int r; + if (arg_template && !arg_directory && arg_machine) { + + /* If --template= was specified then we should not + * search for a machine, but instead create a new one + * in /var/lib/machine. */ + + arg_directory = strjoin("/var/lib/machines/", arg_machine, NULL); + if (!arg_directory) + return log_oom(); + } + if (!arg_image && !arg_directory) { if (arg_machine) { _cleanup_(image_unrefp) Image *i = NULL; @@ -4002,7 +4189,7 @@ static int determine_names(void) { if (!arg_machine) return log_oom(); - hostname_cleanup(arg_machine, false); + hostname_cleanup(arg_machine); if (!machine_name_is_valid(arg_machine)) { log_error("Failed to determine machine name automatically, please use -M."); return -EINVAL; @@ -4093,6 +4280,8 @@ static int inner_child( assert(directory); assert(kmsg_socket >= 0); + cg_unified_flush(); + if (arg_userns) { /* Tell the parent, that it now can write the UID map. */ (void) barrier_place(barrier); /* #1 */ @@ -4259,6 +4448,7 @@ static int outer_child( int pid_socket, int kmsg_socket, int rtnl_socket, + int uid_shift_socket, FDSet *fds, int argc, char *argv[]) { @@ -4273,6 +4463,8 @@ static int outer_child( assert(pid_socket >= 0); assert(kmsg_socket >= 0); + cg_unified_flush(); + if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m"); @@ -4317,6 +4509,16 @@ static int outer_child( if (r < 0) return r; + if (arg_userns) { + l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL); + if (l < 0) + return log_error_errno(errno, "Failed to send UID shift: %m"); + if (l != sizeof(arg_uid_shift)) { + log_error("Short write while sending UID shift."); + return -EIO; + } + } + /* Turn directory into bind mount */ if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0) return log_error_errno(errno, "Failed to make bind mount: %m"); @@ -4379,7 +4581,7 @@ static int outer_child( if (r < 0) return r; - r = mount_cgroup(directory); + r = mount_cgroups(directory); if (r < 0) return r; @@ -4394,9 +4596,9 @@ static int outer_child( NULL); if (pid < 0) return log_error_errno(errno, "Failed to fork inner child: %m"); - if (pid == 0) { pid_socket = safe_close(pid_socket); + uid_shift_socket = safe_close(uid_shift_socket); /* The inner child has all namespaces that are * requested, so that we all are owned by the user if @@ -4430,13 +4632,13 @@ static int setup_uid_map(pid_t pid) { xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid); xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range); - r = write_string_file(uid_map, line); + r = write_string_file(uid_map, line, 0); if (r < 0) return log_error_errno(r, "Failed to write UID map: %m"); /* We always assign the same UID and GID ranges */ xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid); - r = write_string_file(uid_map, line); + r = write_string_file(uid_map, line, 0); if (r < 0) return log_error_errno(r, "Failed to write GID map: %m"); @@ -4461,9 +4663,112 @@ static int chown_cgroup(pid_t pid) { if (fd < 0) return log_error_errno(errno, "Failed to open %s: %m", fs); - FOREACH_STRING(fn, ".", "tasks", "notify_on_release", "cgroup.procs", "cgroup.clone_children") + FOREACH_STRING(fn, + ".", + "tasks", + "notify_on_release", + "cgroup.procs", + "cgroup.clone_children", + "cgroup.controllers", + "cgroup.subtree_control", + "cgroup.populated") if (fchownat(fd, fn, arg_uid_shift, arg_uid_shift, 0) < 0) - log_warning_errno(errno, "Failed to chown() cgroup file %s, ignoring: %m", fn); + log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, + "Failed to chown() cgroup file %s, ignoring: %m", fn); + + return 0; +} + +static int sync_cgroup(pid_t pid) { + _cleanup_free_ char *cgroup = NULL; + char tree[] = "/tmp/unifiedXXXXXX", pid_string[DECIMAL_STR_MAX(pid) + 1]; + bool undo_mount = false; + const char *fn; + int unified, r; + + unified = cg_unified(); + if (unified < 0) + return log_error_errno(unified, "Failed to determine whether the unified hierachy is used: %m"); + + if ((unified > 0) == arg_unified_cgroup_hierarchy) + return 0; + + /* When the host uses the legacy cgroup setup, but the + * container shall use the unified hierarchy, let's make sure + * we copy the path from the name=systemd hierarchy into the + * unified hierarchy. Similar for the reverse situation. */ + + r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup); + if (r < 0) + return log_error_errno(r, "Failed to get control group of " PID_FMT ": %m", pid); + + /* In order to access the unified hierarchy we need to mount it */ + if (!mkdtemp(tree)) + return log_error_errno(errno, "Failed to generate temporary mount point for unified hierarchy: %m"); + + if (unified) + r = mount("cgroup", tree, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, "none,name=systemd,xattr"); + else + r = mount("cgroup", tree, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, "__DEVEL__sane_behavior"); + if (r < 0) { + r = log_error_errno(errno, "Failed to mount unified hierarchy: %m"); + goto finish; + } + + undo_mount = true; + + fn = strjoina(tree, cgroup, "/cgroup.procs"); + (void) mkdir_parents(fn, 0755); + + sprintf(pid_string, PID_FMT, pid); + r = write_string_file(fn, pid_string, 0); + if (r < 0) + log_error_errno(r, "Failed to move process: %m"); + +finish: + if (undo_mount) + (void) umount(tree); + + (void) rmdir(tree); + return r; +} + +static int create_subcgroup(pid_t pid) { + _cleanup_free_ char *cgroup = NULL; + const char *child; + int unified, r; + + /* In the unified hierarchy inner nodes may only only contain + * subgroups, but not processes. Hence, if we running in the + * unified hierarchy and the container does the same, and we + * did not create a scope unit for the container move us and + * the container into two separate subcgroups. */ + + if (!arg_keep_unit) + return 0; + + if (!arg_unified_cgroup_hierarchy) + return 0; + + unified = cg_unified(); + if (unified < 0) + return log_error_errno(unified, "Failed to determine whether the unified hierachy is used: %m"); + if (unified == 0) + return 0; + + r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &cgroup); + if (r < 0) + return log_error_errno(r, "Failed to get our control group: %m"); + + child = strjoina(cgroup, "/payload"); + r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, child, pid); + if (r < 0) + return log_error_errno(r, "Failed to create %s subcgroup: %m", child); + + child = strjoina(cgroup, "/supervisor"); + r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, child, 0); + if (r < 0) + return log_error_errno(r, "Failed to create %s subcgroup: %m", child); return 0; } @@ -4687,7 +4992,8 @@ int main(int argc, char *argv[]) { } for (;;) { - _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 }, pid_socket_pair[2] = { -1, -1 }; + _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 }, pid_socket_pair[2] = { -1, -1 }, + uid_shift_socket_pair[2] = { -1, -1 }; ContainerStatus container_status; _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL; static const struct sigaction sa = { @@ -4696,10 +5002,10 @@ int main(int argc, char *argv[]) { }; int ifi = 0; ssize_t l; - _cleanup_event_unref_ sd_event *event = NULL; - _cleanup_(pty_forward_freep) PTYForward *forward = NULL; - _cleanup_netlink_unref_ sd_netlink *rtnl = NULL; - char last_char = 0; + _cleanup_event_unref_ sd_event *event = NULL; + _cleanup_(pty_forward_freep) PTYForward *forward = NULL; + _cleanup_netlink_unref_ sd_netlink *rtnl = NULL; + char last_char = 0; r = barrier_create(&barrier); if (r < 0) { @@ -4722,6 +5028,12 @@ int main(int argc, char *argv[]) { goto finish; } + if (arg_userns) + if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) { + r = log_error_errno(errno, "Failed to create uid shift socket pair: %m"); + goto finish; + } + /* Child can be killed before execv(), so handle SIGCHLD * in order to interrupt parent's blocking calls and * give it a chance to call wait() and terminate. */ @@ -4756,6 +5068,7 @@ int main(int argc, char *argv[]) { kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]); rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]); pid_socket_pair[0] = safe_close(pid_socket_pair[0]); + uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]); (void) reset_all_signal_handlers(); (void) reset_signal_mask(); @@ -4771,6 +5084,7 @@ int main(int argc, char *argv[]) { pid_socket_pair[1], kmsg_socket_pair[1], rtnl_socket_pair[1], + uid_shift_socket_pair[1], fds, argc, argv); if (r < 0) @@ -4819,6 +5133,17 @@ int main(int argc, char *argv[]) { goto finish; } + l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0); + if (l < 0) { + r = log_error_errno(errno, "Failed to read UID shift: %m"); + goto finish; + } + if (l != sizeof(arg_uid_shift)) { + log_error("Short read while reading UID shift: %m"); + r = EIO; + goto finish; + } + r = setup_uid_map(pid); if (r < 0) goto finish; @@ -4850,6 +5175,14 @@ int main(int argc, char *argv[]) { if (r < 0) goto finish; + r = sync_cgroup(pid); + if (r < 0) + goto finish; + + r = create_subcgroup(pid); + if (r < 0) + goto finish; + r = chown_cgroup(pid); if (r < 0) goto finish;