static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
static bool arg_userns = false;
static int arg_kill_signal = 0;
+static bool arg_unified_cgroup_hierarchy = false;
static void help(void) {
printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
" try-guest, try-host\n"
" -j Equivalent to --link-journal=try-guest\n"
" --read-only Mount the root directory read-only\n"
- " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
+ " --bind=PATH[:PATH[:OPTIONS]]\n"
+ " Bind mount a file or directory from the host into\n"
" the container\n"
- " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
+ " --bind-ro=PATH[:PATH[:OPTIONS]\n"
+ " Similar, but creates a read-only bind mount\n"
" --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
" --overlay=PATH[:PATH...]:PATH\n"
" Create an overlay mount from the host to \n"
strv_free(m->lower);
}
- free(arg_custom_mounts);
- arg_custom_mounts = NULL;
+ arg_custom_mounts = mfree(arg_custom_mounts);
arg_n_custom_mounts = 0;
}
for (i = 0; i < arg_n_custom_mounts; i++) {
CustomMount *m = &arg_custom_mounts[i];
+ if (arg_userns && arg_uid_shift == UID_INVALID && path_equal(m->destination, "/")) {
+ log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
+ return -EINVAL;
+ }
+
if (m->type != CUSTOM_MOUNT_OVERLAY)
continue;
return 0;
}
+static int detect_unified_cgroup_hierarchy(void) {
+ const char *e;
+ int r;
+
+ /* Allow the user to control whether the unified hierarchy is used */
+ e = getenv("UNIFIED_CGROUP_HIERARCHY");
+ if (e) {
+ r = parse_boolean(e);
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
+
+ arg_unified_cgroup_hierarchy = r;
+ return 0;
+ }
+
+ /* Otherwise inherit the default from the host system */
+ r = cg_unified();
+ if (r < 0)
+ return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
+
+ arg_unified_cgroup_hierarchy = r;
+ return 0;
+}
+
static int parse_argv(int argc, char *argv[]) {
enum {
break;
case 'u':
- free(arg_user);
- arg_user = strdup(optarg);
- if (!arg_user)
+ r = free_and_strdup(&arg_user, optarg);
+ if (r < 0)
return log_oom();
break;
break;
case 'M':
- if (isempty(optarg)) {
- free(arg_machine);
- arg_machine = NULL;
- } else {
+ if (isempty(optarg))
+ arg_machine = mfree(arg_machine);
+ else {
if (!machine_name_is_valid(optarg)) {
log_error("Invalid machine name: %s", optarg);
return -EINVAL;
case ARG_BIND:
case ARG_BIND_RO: {
- _cleanup_free_ char *source = NULL, *destination = NULL;
+ const char *current = optarg;
+ _cleanup_free_ char *source = NULL, *destination = NULL, *opts = NULL;
CustomMount *m;
- char *e;
- e = strchr(optarg, ':');
- if (e) {
- source = strndup(optarg, e - optarg);
- destination = strdup(e + 1);
- } else {
- source = strdup(optarg);
- destination = strdup(optarg);
+ r = extract_many_words(¤t, ":", EXTRACT_DONT_COALESCE_SEPARATORS, &source, &destination, &opts, NULL);
+ switch (r) {
+ case 1:
+ destination = strdup(source);
+ case 2:
+ case 3:
+ break;
+ case -ENOMEM:
+ return log_oom();
+ default:
+ log_error("Invalid bind mount specification: %s", optarg);
+ return -EINVAL;
}
if (!source || !destination)
m->source = source;
m->destination = destination;
m->read_only = c == ARG_BIND_RO;
+ m->options = opts;
- source = destination = NULL;
+ source = destination = opts = NULL;
break;
}
case ARG_TMPFS: {
+ const char *current = optarg;
_cleanup_free_ char *path = NULL, *opts = NULL;
CustomMount *m;
- char *e;
- e = strchr(optarg, ':');
- if (e) {
- path = strndup(optarg, e - optarg);
- opts = strdup(e + 1);
- } else {
- path = strdup(optarg);
- opts = strdup("mode=0755");
+ r = extract_first_word(¤t, &path, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
+ if (r == -ENOMEM)
+ return log_oom();
+ else if (r < 0) {
+ log_error("Invalid tmpfs specification: %s", optarg);
+ return r;
}
+ if (r)
+ opts = strdup(current);
+ else
+ opts = strdup("mode=0755");
if (!path || !opts)
return log_oom();
unsigned n = 0;
char **i;
- lower = strv_split(optarg, ":");
- if (!lower)
+ r = strv_split_extract(&lower, optarg, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
+ if (r == -ENOMEM)
return log_oom();
+ else if (r < 0) {
+ log_error("Invalid overlay specification: %s", optarg);
+ return r;
+ }
STRV_FOREACH(i, lower) {
if (!path_is_absolute(*i)) {
/* If two parameters are specified,
* the first one is the lower, the
* second one the upper directory. And
- * we'll also define the the
- * destination mount point the same as
- * the upper. */
+ * we'll also define the destination
+ * mount point the same as the upper. */
upper = lower[1];
lower[1] = NULL;
if (arg_boot && arg_kill_signal <= 0)
arg_kill_signal = SIGRTMIN+3;
+ r = detect_unified_cgroup_hierarchy();
+ if (r < 0)
+ return r;
+
return 1;
}
char *buf = NULL;
if (arg_userns && arg_uid_shift != 0) {
+ assert(arg_uid_shift != UID_INVALID);
if (options)
(void) asprintf(&buf, "%s,uid=" UID_FMT ",gid=" UID_FMT, options, arg_uid_shift, arg_uid_shift);
} MountPoint;
static const MountPoint mount_table[] = {
- { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true, true },
- { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true, true }, /* Bind mount first */
- { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true, true }, /* Then, make it r/o */
- { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false },
- { "tmpfs", "/sys/fs/cgroup", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, true, false },
- { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true, false },
- { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false },
- { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false },
- { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, true, false },
+ { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true, true },
+ { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true, true }, /* Bind mount first */
+ { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, true, true }, /* Then, make it r/o */
+ { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false },
+ { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true, false },
+ { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false },
+ { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false },
+ { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, true, false },
#ifdef HAVE_SELINUX
- { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false, false }, /* Bind mount first */
- { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false, false }, /* Then, make it r/o */
+ { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false, false }, /* Bind mount first */
+ { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, false, false }, /* Then, make it r/o */
#endif
};
return 0;
}
+static int parse_mount_bind_options(const char *options, unsigned long *mount_flags, char **mount_opts) {
+ const char *p = options;
+ unsigned long flags = *mount_flags;
+ char *opts = NULL;
+
+ assert(options);
+
+ for (;;) {
+ _cleanup_free_ char *word = NULL;
+ int r = extract_first_word(&p, &word, ",", 0);
+ if (r < 0)
+ return log_error_errno(r, "Failed to extract mount option: %m");
+ if (r == 0)
+ break;
+
+ if (streq(word, "rbind"))
+ flags |= MS_REC;
+ else if (streq(word, "norbind"))
+ flags &= ~MS_REC;
+ else {
+ log_error("Invalid bind mount option: %s", word);
+ return -EINVAL;
+ }
+ }
+
+ *mount_flags = flags;
+ /* in the future mount_opts will hold string options for mount(2) */
+ *mount_opts = opts;
+
+ return 0;
+}
+
static int mount_bind(const char *dest, CustomMount *m) {
struct stat source_st, dest_st;
const char *where;
+ unsigned long mount_flags = MS_BIND | MS_REC;
+ _cleanup_free_ char *mount_opts = NULL;
int r;
assert(m);
+ if (m->options) {
+ r = parse_mount_bind_options(m->options, &mount_flags, &mount_opts);
+ if (r < 0)
+ return r;
+ }
+
if (stat(m->source, &source_st) < 0)
return log_error_errno(errno, "Failed to stat %s: %m", m->source);
if (r < 0 && r != -EEXIST)
return log_error_errno(r, "Failed to create mount point %s: %m", where);
- if (mount(m->source, where, NULL, MS_BIND, NULL) < 0)
+ if (mount(m->source, where, NULL, mount_flags, mount_opts) < 0)
return log_error_errno(errno, "mount(%s) failed: %m", where);
if (m->read_only) {
return 0;
}
+static char *joined_and_escaped_lower_dirs(char * const *lower) {
+ _cleanup_strv_free_ char **sv = NULL;
+
+ sv = strv_copy(lower);
+ if (!sv)
+ return NULL;
+
+ strv_reverse(sv);
+
+ if (!strv_shell_escape(sv, ",:"))
+ return NULL;
+
+ return strv_join(sv, ":");
+}
+
static int mount_overlay(const char *dest, CustomMount *m) {
_cleanup_free_ char *lower = NULL;
const char *where, *options;
(void) mkdir_p_label(m->source, 0755);
- strv_reverse(m->lower);
- lower = strv_join(m->lower, ":");
- strv_reverse(m->lower);
+ lower = joined_and_escaped_lower_dirs(m->lower);
if (!lower)
return log_oom();
- if (m->read_only)
- options = strjoina("lowerdir=", m->source, ":", lower);
- else {
+ if (m->read_only) {
+ _cleanup_free_ char *escaped_source = NULL;
+
+ escaped_source = shell_escape(m->source, ",:");
+ if (!escaped_source)
+ return log_oom();
+
+ options = strjoina("lowerdir=", escaped_source, ":", lower);
+ } else {
+ _cleanup_free_ char *escaped_source = NULL, *escaped_work_dir = NULL;
+
assert(m->work_dir);
(void) mkdir_label(m->work_dir, 0700);
- options = strjoina("lowerdir=", lower, ",upperdir=", m->source, ",workdir=", m->work_dir);
+ escaped_source = shell_escape(m->source, ",:");
+ if (!escaped_source)
+ return log_oom();
+ escaped_work_dir = shell_escape(m->work_dir, ",:");
+ if (!escaped_work_dir)
+ return log_oom();
+
+ options = strjoina("lowerdir=", lower, ",upperdir=", escaped_source, ",workdir=", escaped_work_dir);
}
if (mount("overlay", where, "overlay", m->read_only ? MS_RDONLY : 0, options) < 0)
return 0;
}
-static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
+static int mount_legacy_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
char *to;
int r;
return 1;
}
-static int mount_cgroup(const char *dest) {
+static int mount_legacy_cgroups(const char *dest) {
_cleanup_set_free_free_ Set *controllers = NULL;
const char *cgroup_root;
int r;
+ cgroup_root = prefix_roota(dest, "/sys/fs/cgroup");
+
+ /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
+ r = path_is_mount_point(cgroup_root, AT_SYMLINK_FOLLOW);
+ if (r < 0)
+ return log_error_errno(r, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
+ if (r == 0) {
+ _cleanup_free_ char *options = NULL;
+
+ r = tmpfs_patch_options("mode=755", &options);
+ if (r < 0)
+ return log_oom();
+
+ if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options) < 0)
+ return log_error_errno(errno, "Failed to mount /sys/fs/cgroup: %m");
+ }
+
+ if (cg_unified() > 0)
+ goto skip_controllers;
+
controllers = set_new(&string_hash_ops);
if (!controllers)
return log_oom();
if (r == -EINVAL) {
/* Not a symbolic link, but directly a single cgroup hierarchy */
- r = mount_cgroup_hierarchy(dest, controller, controller, true);
+ r = mount_legacy_cgroup_hierarchy(dest, controller, controller, true);
if (r < 0)
return r;
continue;
}
- r = mount_cgroup_hierarchy(dest, combined, combined, true);
+ r = mount_legacy_cgroup_hierarchy(dest, combined, combined, true);
if (r < 0)
return r;
}
}
- r = mount_cgroup_hierarchy(dest, "name=systemd,xattr", "systemd", false);
+skip_controllers:
+ r = mount_legacy_cgroup_hierarchy(dest, "none,name=systemd,xattr", "systemd", false);
if (r < 0)
return r;
- cgroup_root = prefix_roota(dest, "/sys/fs/cgroup");
if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
return 0;
}
+static int mount_unified_cgroups(const char *dest) {
+ const char *p;
+ int r;
+
+ assert(dest);
+
+ p = strjoina(dest, "/sys/fs/cgroup");
+
+ r = path_is_mount_point(p, AT_SYMLINK_FOLLOW);
+ if (r < 0)
+ return log_error_errno(r, "Failed to determine if %s is mounted already: %m", p);
+ if (r > 0) {
+ p = strjoina(dest, "/sys/fs/cgroup/cgroup.procs");
+ if (access(p, F_OK) >= 0)
+ return 0;
+ if (errno != ENOENT)
+ return log_error_errno(errno, "Failed to determine if mount point %s contains the unified cgroup hierarchy: %m", p);
+
+ log_error("%s is already mounted but not a unified cgroup hierarchy. Refusing.", p);
+ return -EINVAL;
+ }
+
+ if (mount("cgroup", p, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, "__DEVEL__sane_behavior") < 0)
+ return log_error_errno(errno, "Failed to mount unified cgroup hierarchy to %s: %m", p);
+
+ return 0;
+}
+
+static int mount_cgroups(const char *dest) {
+ if (arg_unified_cgroup_hierarchy)
+ return mount_unified_cgroups(dest);
+ else
+ return mount_legacy_cgroups(dest);
+}
+
static int mount_systemd_cgroup_writable(const char *dest) {
_cleanup_free_ char *own_cgroup_path = NULL;
const char *systemd_root, *systemd_own;
if (r < 0)
return log_error_errno(r, "Failed to determine our own cgroup path: %m");
+ /* If we are living in the top-level, then there's nothing to do... */
+ if (path_equal(own_cgroup_path, "/"))
+ return 0;
+
+ if (arg_unified_cgroup_hierarchy) {
+ systemd_own = strjoina(dest, "/sys/fs/cgroup", own_cgroup_path);
+ systemd_root = prefix_roota(dest, "/sys/fs/cgroup");
+ } else {
+ systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
+ systemd_root = prefix_roota(dest, "/sys/fs/cgroup/systemd");
+ }
+
/* Make our own cgroup a (writable) bind mount */
- systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
if (mount(systemd_own, systemd_own, NULL, MS_BIND, NULL) < 0)
return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
/* And then remount the systemd cgroup root read-only */
- systemd_root = prefix_roota(dest, "/sys/fs/cgroup/systemd");
if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
id128_format_as_uuid(rnd, as_uuid);
- r = write_string_file(from, as_uuid);
+ r = write_string_file(from, as_uuid, WRITE_STRING_FILE_CREATE);
if (r < 0)
return log_error_errno(r, "Failed to write boot id: %m");
#ifdef HAVE_SELINUX
if (arg_selinux_apifs_context)
(void) asprintf(&options,
- "newinstance,ptmxmode=0666,mode=620,uid=" UID_FMT ",gid=" GID_FMT ",context=\"%s\"",
- arg_uid_shift,
+ "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
arg_uid_shift + TTY_GID,
arg_selinux_apifs_context);
else
#endif
(void) asprintf(&options,
- "newinstance,ptmxmode=0666,mode=620,uid=" UID_FMT ",gid=" GID_FMT,
- arg_uid_shift,
+ "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
arg_uid_shift + TTY_GID);
if (!options)
static int register_machine(pid_t pid, int local_ifindex) {
_cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
- _cleanup_bus_close_unref_ sd_bus *bus = NULL;
+ _cleanup_bus_flush_close_unref_ sd_bus *bus = NULL;
int r;
if (!arg_register)
static int terminate_machine(pid_t pid) {
_cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
_cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
- _cleanup_bus_close_unref_ sd_bus *bus = NULL;
+ _cleanup_bus_flush_close_unref_ sd_bus *bus = NULL;
const char *path;
int r;
if (streq(p, "4294967295"))
return 0;
- r = write_string_file("/proc/self/loginuid", "4294967295");
+ r = write_string_file("/proc/self/loginuid", "4294967295", 0);
if (r < 0) {
log_error_errno(r,
"Failed to reset audit login UID. This probably means that your kernel is too\n"
static int determine_names(void) {
int r;
+ if (arg_template && !arg_directory && arg_machine) {
+
+ /* If --template= was specified then we should not
+ * search for a machine, but instead create a new one
+ * in /var/lib/machine. */
+
+ arg_directory = strjoin("/var/lib/machines/", arg_machine, NULL);
+ if (!arg_directory)
+ return log_oom();
+ }
+
if (!arg_image && !arg_directory) {
if (arg_machine) {
_cleanup_(image_unrefp) Image *i = NULL;
if (!arg_machine)
return log_oom();
- hostname_cleanup(arg_machine, false);
+ hostname_cleanup(arg_machine);
if (!machine_name_is_valid(arg_machine)) {
log_error("Failed to determine machine name automatically, please use -M.");
return -EINVAL;
assert(directory);
assert(kmsg_socket >= 0);
+ cg_unified_flush();
+
if (arg_userns) {
/* Tell the parent, that it now can write the UID map. */
(void) barrier_place(barrier); /* #1 */
int pid_socket,
int kmsg_socket,
int rtnl_socket,
+ int uid_shift_socket,
FDSet *fds,
int argc,
char *argv[]) {
assert(pid_socket >= 0);
assert(kmsg_socket >= 0);
+ cg_unified_flush();
+
if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
if (r < 0)
return r;
+ r = determine_uid_shift(directory);
+ if (r < 0)
+ return r;
+
+ if (arg_userns) {
+ l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
+ if (l < 0)
+ return log_error_errno(errno, "Failed to send UID shift: %m");
+ if (l != sizeof(arg_uid_shift)) {
+ log_error("Short write while sending UID shift.");
+ return -EIO;
+ }
+ }
+
/* Turn directory into bind mount */
if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
return log_error_errno(errno, "Failed to make bind mount: %m");
if (r < 0)
return r;
- r = mount_cgroup(directory);
+ r = mount_cgroups(directory);
if (r < 0)
return r;
NULL);
if (pid < 0)
return log_error_errno(errno, "Failed to fork inner child: %m");
-
if (pid == 0) {
pid_socket = safe_close(pid_socket);
+ uid_shift_socket = safe_close(uid_shift_socket);
/* The inner child has all namespaces that are
* requested, so that we all are owned by the user if
xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
- r = write_string_file(uid_map, line);
+ r = write_string_file(uid_map, line, 0);
if (r < 0)
return log_error_errno(r, "Failed to write UID map: %m");
/* We always assign the same UID and GID ranges */
xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
- r = write_string_file(uid_map, line);
+ r = write_string_file(uid_map, line, 0);
if (r < 0)
return log_error_errno(r, "Failed to write GID map: %m");
if (fd < 0)
return log_error_errno(errno, "Failed to open %s: %m", fs);
- FOREACH_STRING(fn, ".", "tasks", "notify_on_release", "cgroup.procs", "cgroup.clone_children")
+ FOREACH_STRING(fn,
+ ".",
+ "tasks",
+ "notify_on_release",
+ "cgroup.procs",
+ "cgroup.clone_children",
+ "cgroup.controllers",
+ "cgroup.subtree_control",
+ "cgroup.populated")
if (fchownat(fd, fn, arg_uid_shift, arg_uid_shift, 0) < 0)
- log_warning_errno(errno, "Failed to chown() cgroup file %s, ignoring: %m", fn);
+ log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno,
+ "Failed to chown() cgroup file %s, ignoring: %m", fn);
+
+ return 0;
+}
+
+static int sync_cgroup(pid_t pid) {
+ _cleanup_free_ char *cgroup = NULL;
+ char tree[] = "/tmp/unifiedXXXXXX", pid_string[DECIMAL_STR_MAX(pid) + 1];
+ bool undo_mount = false;
+ const char *fn;
+ int unified, r;
+
+ unified = cg_unified();
+ if (unified < 0)
+ return log_error_errno(unified, "Failed to determine whether the unified hierachy is used: %m");
+
+ if ((unified > 0) == arg_unified_cgroup_hierarchy)
+ return 0;
+
+ /* When the host uses the legacy cgroup setup, but the
+ * container shall use the unified hierarchy, let's make sure
+ * we copy the path from the name=systemd hierarchy into the
+ * unified hierarchy. Similar for the reverse situation. */
+
+ r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
+ if (r < 0)
+ return log_error_errno(r, "Failed to get control group of " PID_FMT ": %m", pid);
+
+ /* In order to access the unified hierarchy we need to mount it */
+ if (!mkdtemp(tree))
+ return log_error_errno(errno, "Failed to generate temporary mount point for unified hierarchy: %m");
+
+ if (unified)
+ r = mount("cgroup", tree, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, "none,name=systemd,xattr");
+ else
+ r = mount("cgroup", tree, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, "__DEVEL__sane_behavior");
+ if (r < 0) {
+ r = log_error_errno(errno, "Failed to mount unified hierarchy: %m");
+ goto finish;
+ }
+
+ undo_mount = true;
+
+ fn = strjoina(tree, cgroup, "/cgroup.procs");
+ (void) mkdir_parents(fn, 0755);
+
+ sprintf(pid_string, PID_FMT, pid);
+ r = write_string_file(fn, pid_string, 0);
+ if (r < 0)
+ log_error_errno(r, "Failed to move process: %m");
+
+finish:
+ if (undo_mount)
+ (void) umount(tree);
+
+ (void) rmdir(tree);
+ return r;
+}
+
+static int create_subcgroup(pid_t pid) {
+ _cleanup_free_ char *cgroup = NULL;
+ const char *child;
+ int unified, r;
+
+ /* In the unified hierarchy inner nodes may only only contain
+ * subgroups, but not processes. Hence, if we running in the
+ * unified hierarchy and the container does the same, and we
+ * did not create a scope unit for the container move us and
+ * the container into two separate subcgroups. */
+
+ if (!arg_keep_unit)
+ return 0;
+
+ if (!arg_unified_cgroup_hierarchy)
+ return 0;
+
+ unified = cg_unified();
+ if (unified < 0)
+ return log_error_errno(unified, "Failed to determine whether the unified hierachy is used: %m");
+ if (unified == 0)
+ return 0;
+
+ r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &cgroup);
+ if (r < 0)
+ return log_error_errno(r, "Failed to get our control group: %m");
+
+ child = strjoina(cgroup, "/payload");
+ r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, child, pid);
+ if (r < 0)
+ return log_error_errno(r, "Failed to create %s subcgroup: %m", child);
+
+ child = strjoina(cgroup, "/supervisor");
+ r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, child, 0);
+ if (r < 0)
+ return log_error_errno(r, "Failed to create %s subcgroup: %m", child);
return 0;
}
if (r < 0)
goto finish;
- r = determine_uid_shift(arg_directory);
- if (r < 0)
- return r;
-
if (geteuid() != 0) {
log_error("Need to be root.");
r = -EPERM;
}
for (;;) {
- _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 }, pid_socket_pair[2] = { -1, -1 };
+ _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 }, pid_socket_pair[2] = { -1, -1 },
+ uid_shift_socket_pair[2] = { -1, -1 };
ContainerStatus container_status;
_cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
static const struct sigaction sa = {
};
int ifi = 0;
ssize_t l;
- _cleanup_event_unref_ sd_event *event = NULL;
- _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
- _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
- char last_char = 0;
+ _cleanup_event_unref_ sd_event *event = NULL;
+ _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
+ _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
+ char last_char = 0;
r = barrier_create(&barrier);
if (r < 0) {
goto finish;
}
+ if (arg_userns)
+ if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
+ r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
+ goto finish;
+ }
+
/* Child can be killed before execv(), so handle SIGCHLD
* in order to interrupt parent's blocking calls and
* give it a chance to call wait() and terminate. */
kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
+ uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
(void) reset_all_signal_handlers();
(void) reset_signal_mask();
pid_socket_pair[1],
kmsg_socket_pair[1],
rtnl_socket_pair[1],
+ uid_shift_socket_pair[1],
fds,
argc, argv);
if (r < 0)
goto finish;
}
+ l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0);
+ if (l < 0) {
+ r = log_error_errno(errno, "Failed to read UID shift: %m");
+ goto finish;
+ }
+ if (l != sizeof(arg_uid_shift)) {
+ log_error("Short read while reading UID shift: %m");
+ r = EIO;
+ goto finish;
+ }
+
r = setup_uid_map(pid);
if (r < 0)
goto finish;
if (r < 0)
goto finish;
+ r = sync_cgroup(pid);
+ if (r < 0)
+ goto finish;
+
+ r = create_subcgroup(pid);
+ if (r < 0)
+ goto finish;
+
r = chown_cgroup(pid);
if (r < 0)
goto finish;