static char *arg_network_bridge = NULL;
static char *arg_network_zone = NULL;
static char *arg_network_namespace_path = NULL;
+static PagerFlags arg_pager_flags = 0;
static unsigned long arg_personality = PERSONALITY_INVALID;
static char *arg_image = NULL;
static char *arg_oci_bundle = NULL;
static bool arg_no_new_privileges = false;
static int arg_oom_score_adjust = 0;
static bool arg_oom_score_adjust_set = false;
-static cpu_set_t *arg_cpuset = NULL;
-static unsigned arg_cpuset_ncpus = 0;
+static CPUSet arg_cpu_set = {};
static ResolvConfMode arg_resolv_conf = RESOLV_CONF_AUTO;
static TimezoneMode arg_timezone = TIMEZONE_AUTO;
static unsigned arg_console_width = (unsigned) -1, arg_console_height = (unsigned) -1;
#if HAVE_SECCOMP
STATIC_DESTRUCTOR_REGISTER(arg_seccomp, seccomp_releasep);
#endif
-STATIC_DESTRUCTOR_REGISTER(arg_cpuset, CPU_FREEp);
+STATIC_DESTRUCTOR_REGISTER(arg_cpu_set, cpu_set_reset);
STATIC_DESTRUCTOR_REGISTER(arg_sysctl, strv_freep);
static int help(void) {
_cleanup_free_ char *link = NULL;
int r;
- (void) pager_open(false);
+ (void) pager_open(arg_pager_flags);
r = terminal_urlify_man("systemd-nspawn", "1", &link);
if (r < 0)
" -h --help Show this help\n"
" --version Print version string\n"
" -q --quiet Do not show status information\n"
+ " --no-pager Do not pipe output into a pager\n"
" --settings=BOOLEAN Load additional settings from .nspawn file\n\n"
"%3$sImage:%4$s\n"
" -D --directory=PATH Root directory for the container\n"
" if missing\n"
" -x --ephemeral Run container with snapshot of root directory, and\n"
" remove it after exit\n"
- " -i --image=PATH File system device or disk image for the container\n"
+ " -i --image=PATH Root file system disk image (or device node) for\n"
+ " the container\n"
" --oci-bundle=PATH OCI bundle directory\n"
" --read-only Mount the root directory read-only\n"
" --volatile[=MODE] Run the system in volatile mode\n"
- " --root-hash=HASH Specify verity root hash\n"
+ " --root-hash=HASH Specify verity root hash for root disk image\n"
" --pivot-root=PATH[:PATH]\n"
" Pivot root to given directory in the container\n\n"
"%3$sExecution:%4$s\n"
ARG_CONSOLE,
ARG_PIPE,
ARG_OCI_BUNDLE,
+ ARG_NO_PAGER,
};
static const struct option options[] = {
{ "console", required_argument, NULL, ARG_CONSOLE },
{ "pipe", no_argument, NULL, ARG_PIPE },
{ "oci-bundle", required_argument, NULL, ARG_OCI_BUNDLE },
+ { "no-pager", no_argument, NULL, ARG_NO_PAGER },
{}
};
case ARG_NETWORK_ZONE: {
char *j;
- j = strappend("vz-", optarg);
+ j = strjoin("vz-", optarg);
if (!j)
return log_oom();
case ARG_LINK_JOURNAL:
r = parse_link_journal(optarg, &arg_link_journal, &arg_link_journal_try);
- if (r < 0) {
- log_error_errno(r, "Failed to parse link journal mode %s", optarg);
- return -EINVAL;
- }
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse link journal mode %s", optarg);
arg_settings_mask |= SETTING_LINK_JOURNAL;
break;
if (r < 0)
return log_error_errno(r, "Failed to parse root hash: %s", optarg);
if (l < sizeof(sd_id128_t)) {
- log_error("Root hash must be at least 128bit long: %s", optarg);
free(k);
- return -EINVAL;
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Root hash must be at least 128bit long: %s", optarg);
}
free(arg_root_hash);
case ARG_RLIMIT: {
const char *eq;
- char *name;
+ _cleanup_free_ char *name = NULL;
int rl;
if (streq(optarg, "help")) {
break;
case ARG_CPU_AFFINITY: {
- _cleanup_cpu_free_ cpu_set_t *cpuset = NULL;
+ CPUSet cpuset;
r = parse_cpu_set(optarg, &cpuset);
if (r < 0)
- return log_error_errno(r, "Failed to parse CPU affinity mask: %s", optarg);
-
- if (arg_cpuset)
- CPU_FREE(arg_cpuset);
+ return log_error_errno(r, "Failed to parse CPU affinity mask %s: %m", optarg);
- arg_cpuset = TAKE_PTR(cpuset);
- arg_cpuset_ncpus = r;
+ cpu_set_reset(&arg_cpu_set);
+ arg_cpu_set = cpuset;
arg_settings_mask |= SETTING_CPU_AFFINITY;
break;
}
"read-only\n"
"passive\n"
"pipe");
- else {
- log_error("Unknown console mode: %s", optarg);
- return -EINVAL;
- }
+ else
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown console mode: %s", optarg);
arg_settings_mask |= SETTING_CONSOLE_MODE;
break;
arg_settings_mask |= SETTING_CONSOLE_MODE;
break;
+ case ARG_NO_PAGER:
+ arg_pager_flags |= PAGER_DISABLE;
+ break;
+
case '?':
return -EINVAL;
/* We don't support --private-users-chown together with any of the volatile modes since we couldn't
* change the read-only part of the tree (i.e. /usr) anyway, or because it would trigger a massive
- * copy-up (in case of overlay) making the entire excercise pointless. */
+ * copy-up (in case of overlay) making the entire exercise pointless. */
if (arg_userns_chown && arg_volatile_mode != VOLATILE_NO)
return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--volatile= and --private-users-chown may not be combined.");
case TIMEZONE_COPY:
/* If mounting failed, try to copy */
- r = copy_file_atomic("/etc/localtime", where, 0644, 0, COPY_REFLINK|COPY_REPLACE);
+ r = copy_file_atomic("/etc/localtime", where, 0644, 0, 0, COPY_REFLINK|COPY_REPLACE);
if (r < 0) {
log_full_errno(IN_SET(r, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
"Failed to copy /etc/localtime to %s, ignoring: %m", where);
}
/* If that didn't work, let's copy the file */
- r = copy_file(what, where, O_TRUNC|O_NOFOLLOW, 0644, 0, COPY_REFLINK);
+ r = copy_file(what, where, O_TRUNC|O_NOFOLLOW, 0644, 0, 0, COPY_REFLINK);
if (r < 0) {
/* If the file already exists as symlink, let's suppress the warning, under the assumption that
* resolved or something similar runs inside and the symlink points there.
const char *to;
int r;
- /* Generate a new randomized boot ID, so that each boot-up of
- * the container gets a new one */
+ /* Generate a new randomized boot ID, so that each boot-up of the container gets a new one */
- r = tempfn_random_child(NULL, "proc-sys-kernel-random-boot-id", &path);
+ r = tempfn_random_child("/run", "proc-sys-kernel-random-boot-id", &path);
if (r < 0)
return log_error_errno(r, "Failed to generate random boot ID path: %m");
_cleanup_free_ char *from = NULL, *to = NULL;
struct stat st;
- from = strappend("/dev/", d);
+ from = path_join("/dev/", d);
if (!from)
return log_oom();
- to = prefix_root(dest, from);
+ to = path_join(dest, from);
if (!to)
return log_oom();
if (r < 0)
return log_error_errno(r, "chown() of device node %s failed: %m", to);
- dn = strjoin("/dev/", S_ISCHR(st.st_mode) ? "char" : "block");
+ dn = path_join("/dev", S_ISCHR(st.st_mode) ? "char" : "block");
if (!dn)
return log_oom();
if (asprintf(&sl, "%s/%u:%u", dn, major(st.st_rdev), minor(st.st_rdev)) < 0)
return log_oom();
- prefixed = prefix_root(dest, sl);
+ prefixed = path_join(dest, sl);
if (!prefixed)
return log_oom();
- t = strjoin("../", d);
+ t = path_join("..", d);
if (!t)
return log_oom();
_cleanup_free_ char *path = NULL;
DeviceNode *n = arg_extra_nodes + i;
- path = prefix_root(dest, n->path);
+ path = path_join(dest, n->path);
if (!path)
return log_oom();
return 0;
}
-static int setup_dev_console(const char *dest, const char *console) {
- _cleanup_umask_ mode_t u;
- const char *to;
+/* Opens /dev/console and installs it as stdin, stdout and stderr of the calling process.
+ *
+ * Returns 0 on success. On failure the error is logged here and the value produced by
+ * log_error_errno() for the failing step is returned. The terminal fd is owned by this
+ * function until it is handed to rearrange_stdio(), so it is closed on every early return. */
+static int setup_stdio_as_dev_console(void) {
+ _cleanup_close_ int terminal = -1;
int r;
- assert(dest);
-
- u = umask(0000);
+ terminal = open_terminal("/dev/console", O_RDWR);
+ if (terminal < 0)
+ return log_error_errno(terminal, "Failed to open console: %m");
- if (!console)
- return 0;
+ /* Make sure we can continue logging to the original stderr, even if
+ * stderr points elsewhere now */
+ r = log_dup_console();
+ if (r < 0)
+ return log_error_errno(r, "Failed to duplicate stderr: %m");
- r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
+ /* rearrange_stdio() invalidates 'terminal' on success and failure, hence disarm the
+ * cleanup handler right after the call, before looking at the result, so that the fd
+ * is never closed a second time. */
+ r = rearrange_stdio(terminal, terminal, terminal);
+ TAKE_FD(terminal);
if (r < 0)
- return log_error_errno(r, "Failed to correct access mode for TTY: %m");
+ return log_error_errno(r, "Failed to move console to stdin/stdout/stderr: %m");
- /* We need to bind mount the right tty to /dev/console since
- * ptys can only exist on pts file systems. To have something
- * to bind mount things on we create a empty regular file. */
+ return 0;
+}
- to = prefix_roota(dest, "/dev/console");
- r = touch(to);
+/* Makes /dev/console a symlink to the device node named by 'console'.
+ *
+ * The link target is the path that path_make_relative() derives for 'console' relative
+ * to /dev. Returns 0 on success; on failure the error is logged here and the value
+ * produced by log_error_errno() is returned.
+ * NOTE(review): symlink() fails if /dev/console already exists; presumably the caller
+ * guarantees it does not -- confirm. */
+static int setup_dev_console(const char *console) {
+ _cleanup_free_ char *p = NULL;
+ int r;
+
+ /* Create /dev/console symlink */
+ r = path_make_relative("/dev", console, &p);
if (r < 0)
- return log_error_errno(r, "touch() for /dev/console failed: %m");
+ return log_error_errno(r, "Failed to create relative path: %m");
+
+ if (symlink(p, "/dev/console") < 0)
+ return log_error_errno(errno, "Failed to create /dev/console symlink: %m");
- return mount_verbose(LOG_ERR, console, to, NULL, MS_BIND, NULL);
+ return 0;
}
static int setup_keyring(void) {
_cleanup_free_ char *fifo = NULL;
_cleanup_close_ int fd = -1;
_cleanup_umask_ mode_t u;
- const char *to;
int r;
assert(kmsg_socket >= 0);
u = umask(0000);
- /* We create the kmsg FIFO as as temporary file in /tmp, but immediately delete it after bind mounting it to
+ /* We create the kmsg FIFO as a temporary file in /run, but immediately delete it after bind mounting it to
* /proc/kmsg. While FIFOs on the reading side behave very similar to /proc/kmsg, their writing side behaves
* differently from /dev/kmsg in that writing blocks when nothing is reading. In order to avoid any problems
* with containers deadlocking due to this we simply make /dev/kmsg unavailable to the container. */
- r = tempfn_random_child(NULL, "proc-kmsg", &fifo);
+ r = tempfn_random_child("/run", "proc-kmsg", &fifo);
if (r < 0)
return log_error_errno(r, "Failed to generate kmsg path: %m");
return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
from = TAKE_PTR(fifo);
- to = "/proc/kmsg";
- r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
+ r = mount_verbose(LOG_ERR, from, "/proc/kmsg", NULL, MS_BIND, NULL);
if (r < 0)
return r;
if (q.ambient == (uint64_t) -1 && ambient_capabilities_supported())
q.ambient = 0;
- } else
+
+ if (capability_quintet_mangle(&q))
+ return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Cannot set capabilities that are not in the current bounding set.");
+
+ } else {
q = (CapabilityQuintet) {
.bounding = arg_caps_retain,
.effective = uid == 0 ? arg_caps_retain : 0,
.ambient = ambient_capabilities_supported() ? 0 : (uint64_t) -1,
};
+ /* If we're not using OCI, proceed with mangled capabilities (so we don't error out)
+ * in order to maintain the same behavior as systemd < 242. */
+ if (capability_quintet_mangle(&q))
+ log_warning("Some capabilities will not be set because they are not in the current bounding set.");
+
+ }
+
return capability_quintet_enforce(&q);
}
* search for a machine, but instead create a new one
* in /var/lib/machine. */
- arg_directory = strjoin("/var/lib/machines/", arg_machine);
+ arg_directory = path_join("/var/lib/machines", arg_machine);
if (!arg_directory)
return log_oom();
}
return log_error_errno(r, "Failed to determine current directory: %m");
}
- if (!arg_directory && !arg_image) {
- log_error("Failed to determine path, please use -D or -i.");
- return -EINVAL;
- }
+ if (!arg_directory && !arg_image)
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine path, please use -D or -i.");
}
if (!arg_machine) {
return log_oom();
hostname_cleanup(arg_machine);
- if (!machine_name_is_valid(arg_machine)) {
- log_error("Failed to determine machine name automatically, please use -M.");
- return -EINVAL;
- }
+ if (!machine_name_is_valid(arg_machine))
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine machine name automatically, please use -M.");
if (arg_ephemeral) {
char *b;
break;
}
- if (!good) {
- log_error("Refusing to write to sysctl '%s', as it is not safe in the selected namespaces.", *k);
- return -EPERM;
- }
+ if (!good)
+ return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Refusing to write to sysctl '%s', as it is not safe in the selected namespaces.", *k);
r = sysctl_write(*k, *v);
if (r < 0)
bool secondary,
int kmsg_socket,
int rtnl_socket,
+ int master_pty_socket,
FDSet *fds) {
_cleanup_free_ char *home = NULL;
rtnl_socket = safe_close(rtnl_socket);
}
+ if (arg_console_mode != CONSOLE_PIPE) {
+ _cleanup_close_ int master = -1;
+ _cleanup_free_ char *console = NULL;
+
+ /* Allocate a pty and make it available as /dev/console. */
+ master = openpt_allocate(O_RDWR|O_NONBLOCK, &console);
+ if (master < 0)
+ return log_error_errno(master, "Failed to allocate a pty: %m");
+
+ r = setup_dev_console(console);
+ if (r < 0)
+ return log_error_errno(r, "Failed to setup /dev/console: %m");
+
+ r = send_one_fd(master_pty_socket, master, 0);
+ if (r < 0)
+ return log_error_errno(r, "Failed to send master fd: %m");
+ master_pty_socket = safe_close(master_pty_socket);
+
+ r = setup_stdio_as_dev_console();
+ if (r < 0)
+ return r;
+ }
+
r = patch_sysctl();
if (r < 0)
return r;
return log_error_errno(r, "Failed to adjust OOM score: %m");
}
- if (arg_cpuset)
- if (sched_setaffinity(0, CPU_ALLOC_SIZE(arg_cpuset_ncpus), arg_cpuset) < 0)
+ if (arg_cpu_set.set)
+ if (sched_setaffinity(0, arg_cpu_set.allocated, arg_cpu_set.set) < 0)
return log_error_errno(errno, "Failed to set CPU affinity: %m");
(void) setup_hostname();
if (is_seccomp_available()) {
r = seccomp_load(arg_seccomp);
- if (IN_SET(r, -EPERM, -EACCES))
+ if (ERRNO_IS_SECCOMP_FATAL(r))
return log_error_errno(r, "Failed to install seccomp filter: %m");
if (r < 0)
log_debug_errno(r, "Failed to install seccomp filter: %m");
static int outer_child(
Barrier *barrier,
const char *directory,
- const char *console,
DissectedImage *dissected_image,
bool secondary,
int pid_socket,
int kmsg_socket,
int rtnl_socket,
int uid_shift_socket,
+ int master_pty_socket,
int unified_cgroup_hierarchy_socket,
FDSet *fds,
int netns_fd) {
assert(pid_socket >= 0);
assert(uuid_socket >= 0);
assert(notify_socket >= 0);
+ assert(master_pty_socket >= 0);
assert(kmsg_socket >= 0);
log_debug("Outer child is initializing.");
if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
- if (arg_console_mode != CONSOLE_PIPE) {
- int terminal;
-
- assert(console);
-
- terminal = open_terminal(console, O_RDWR);
- if (terminal < 0)
- return log_error_errno(terminal, "Failed to open console: %m");
-
- /* Make sure we can continue logging to the original stderr, even if stderr points elsewhere now */
- r = log_dup_console();
- if (r < 0)
- return log_error_errno(r, "Failed to duplicate stderr: %m");
-
- r = rearrange_stdio(terminal, terminal, terminal); /* invalidates 'terminal' on success and failure */
- if (r < 0)
- return log_error_errno(r, "Failed to move console to stdin/stdout/stderr: %m");
- }
-
r = reset_audit_loginuid();
if (r < 0)
return r;
arg_userns_mode != USER_NAMESPACE_NO,
arg_uid_shift,
arg_uid_range,
- arg_selinux_context);
+ arg_selinux_apifs_context);
if (r < 0)
return r;
/* Mark everything as shared so our mounts get propagated down. This is
* required to make new bind mounts available in systemd services
- * inside the containter that create a new mount namespace.
+ * inside the container that create a new mount namespace.
* See https://github.com/systemd/systemd/issues/3860
* Further submounts (such as /dev) done after this will inherit the
* shared propagation mode. */
return r;
if (arg_read_only && arg_volatile_mode == VOLATILE_NO) {
- r = bind_remount_recursive(directory, true, NULL);
+ r = bind_remount_recursive(directory, MS_RDONLY, MS_RDONLY, NULL);
if (r < 0)
return log_error_errno(r, "Failed to make tree read-only: %m");
}
if (r < 0)
return r;
- r = setup_dev_console(directory, console);
- if (r < 0)
- return r;
-
r = setup_keyring();
if (r < 0)
return r;
return log_error_errno(r, "Failed to join network namespace: %m");
}
- r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
+ r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, master_pty_socket, fds);
if (r < 0)
_exit(EXIT_FAILURE);
l = send_one_fd(notify_socket, fd, 0);
if (l < 0)
- return log_error_errno(errno, "Failed to send notify fd: %m");
+ return log_error_errno(l, "Failed to send notify fd: %m");
pid_socket = safe_close(pid_socket);
uuid_socket = safe_close(uuid_socket);
notify_socket = safe_close(notify_socket);
+ master_pty_socket = safe_close(master_pty_socket);
kmsg_socket = safe_close(kmsg_socket);
rtnl_socket = safe_close(rtnl_socket);
netns_fd = safe_close(netns_fd);
/* Copy the full capabilities over too */
if (capability_quintet_is_set(&settings->full_capabilities)) {
if (!arg_settings_trusted)
- log_warning("Ignoring capabilitiy settings, file %s is not trusted.", path);
+ log_warning("Ignoring capability settings, file %s is not trusted.", path);
else
arg_full_capabilities = settings->full_capabilities;
}
}
if ((arg_settings_mask & SETTING_CPU_AFFINITY) == 0 &&
- settings->cpuset) {
+ settings->cpu_set.set) {
if (!arg_settings_trusted)
log_warning("Ignoring CPUAffinity= setting, file '%s' is not trusted.", path);
else {
- if (arg_cpuset)
- CPU_FREE(arg_cpuset);
- arg_cpuset = TAKE_PTR(settings->cpuset);
- arg_cpuset_ncpus = settings->cpuset_ncpus;
+ cpu_set_reset(&arg_cpu_set);
+ arg_cpu_set = settings->cpu_set;
+ settings->cpu_set = (CPUSet) {};
}
}
arg_console_width = settings->console_width;
arg_console_height = settings->console_height;
- device_node_free_many(arg_extra_nodes, arg_n_extra_nodes);
+ device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
arg_extra_nodes = TAKE_PTR(settings->extra_nodes);
arg_n_extra_nodes = settings->n_extra_nodes;
FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
_cleanup_free_ char *j = NULL;
- j = strjoin(i, "/", fn);
+ j = path_join(i, fn);
if (!j)
return log_oom();
p = file_in_same_dir(arg_image, fn);
if (!p)
return log_oom();
- } else if (arg_directory) {
+ } else if (arg_directory && !path_equal(arg_directory, "/")) {
p = file_in_same_dir(arg_directory, fn);
if (!p)
return log_oom();
return merge_settings(settings, arg_oci_bundle);
}
-static int run_container(int master,
- const char* console,
+static int run_container(
DissectedImage *dissected_image,
bool secondary,
FDSet *fds,
char veth_name[IFNAMSIZ], bool *veth_created,
union in_addr_union *exposed,
- pid_t *pid, int *ret) {
+ int *master, pid_t *pid, int *ret) {
static const struct sigaction sa = {
.sa_handler = nop_signal_handler,
uuid_socket_pair[2] = { -1, -1 },
notify_socket_pair[2] = { -1, -1 },
uid_shift_socket_pair[2] = { -1, -1 },
+ master_pty_socket_pair[2] = { -1, -1 },
unified_cgroup_hierarchy_socket_pair[2] = { -1, -1};
- _cleanup_close_ int notify_socket= -1;
+ _cleanup_close_ int notify_socket = -1;
_cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
_cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL;
_cleanup_(sd_event_unrefp) sd_event *event = NULL;
if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0)
return log_error_errno(errno, "Failed to create notify socket pair: %m");
+ if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, master_pty_socket_pair) < 0)
+ return log_error_errno(errno, "Failed to create console socket pair: %m");
+
if (arg_userns_mode != USER_NAMESPACE_NO)
if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0)
return log_error_errno(errno, "Failed to create uid shift socket pair: %m");
log_debug_errno(r, "Cannot determine if passed network namespace path '%s' really refers to a network namespace, assuming it does.", arg_network_namespace_path);
else if (r < 0)
return log_error_errno(r, "Failed to check %s fs type: %m", arg_network_namespace_path);
- else if (r == 0) {
- log_error("Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path);
- return -EINVAL;
- }
+ else if (r == 0)
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+ "Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path);
}
*pid = raw_clone(SIGCHLD|CLONE_NEWNS);
/* The outer child only has a file system namespace. */
barrier_set_role(&barrier, BARRIER_CHILD);
- master = safe_close(master);
-
kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
+ master_pty_socket_pair[0] = safe_close(master_pty_socket_pair[0]);
uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
unified_cgroup_hierarchy_socket_pair[0] = safe_close(unified_cgroup_hierarchy_socket_pair[0]);
r = outer_child(&barrier,
arg_directory,
- console,
dissected_image,
secondary,
pid_socket_pair[1],
kmsg_socket_pair[1],
rtnl_socket_pair[1],
uid_shift_socket_pair[1],
+ master_pty_socket_pair[1],
unified_cgroup_hierarchy_socket_pair[1],
fds,
netns_fd);
barrier_set_role(&barrier, BARRIER_PARENT);
- fds = fdset_free(fds);
+ fdset_close(fds);
kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
+ master_pty_socket_pair[1] = safe_close(master_pty_socket_pair[1]);
uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
unified_cgroup_hierarchy_socket_pair[1] = safe_close(unified_cgroup_hierarchy_socket_pair[1]);
l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
if (l < 0)
return log_error_errno(errno, "Failed to read UID shift: %m");
- if (l != sizeof arg_uid_shift) {
- log_error("Short read while reading UID shift.");
- return -EIO;
- }
+ if (l != sizeof arg_uid_shift)
+ return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading UID shift.");
if (arg_userns_mode == USER_NAMESPACE_PICK) {
/* If we are supposed to pick the UID shift, let's try to use the shift read from the
l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
if (l < 0)
return log_error_errno(errno, "Failed to send UID shift: %m");
- if (l != sizeof arg_uid_shift) {
- log_error("Short write while writing UID shift.");
- return -EIO;
- }
+ if (l != sizeof arg_uid_shift)
+ return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short write while writing UID shift.");
}
}
l = recv(unified_cgroup_hierarchy_socket_pair[0], &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), 0);
if (l < 0)
return log_error_errno(errno, "Failed to read cgroup mode: %m");
- if (l != sizeof(arg_unified_cgroup_hierarchy)) {
- log_error("Short read while reading cgroup mode (%zu bytes).%s",
- l, l == 0 ? " The child is most likely dead." : "");
- return -EIO;
- }
+ if (l != sizeof(arg_unified_cgroup_hierarchy))
+ return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading cgroup mode (%zu bytes).%s",
+ l, l == 0 ? " The child is most likely dead." : "");
}
/* Wait for the outer child. */
l = recv(pid_socket_pair[0], pid, sizeof *pid, 0);
if (l < 0)
return log_error_errno(errno, "Failed to read inner child PID: %m");
- if (l != sizeof *pid) {
- log_error("Short read while reading inner child PID.");
- return -EIO;
- }
+ if (l != sizeof *pid)
+ return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading inner child PID.");
/* We also retrieve container UUID in case it was generated by outer child */
l = recv(uuid_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
if (l < 0)
return log_error_errno(errno, "Failed to read container machine ID: %m");
- if (l != sizeof(arg_uuid)) {
- log_error("Short read while reading container machined ID.");
- return -EIO;
- }
+ if (l != sizeof(arg_uuid))
+ return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading container machined ID.");
/* We also retrieve the socket used for notifications generated by outer child */
notify_socket = receive_one_fd(notify_socket_pair[0], 0);
log_debug("Init process invoked as PID "PID_FMT, *pid);
if (arg_userns_mode != USER_NAMESPACE_NO) {
- if (!barrier_place_and_sync(&barrier)) { /* #1 */
- log_error("Child died too early.");
- return -ESRCH;
- }
+ if (!barrier_place_and_sync(&barrier)) /* #1 */
+ return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
r = setup_uid_map(*pid);
if (r < 0)
if (arg_private_network) {
if (!arg_network_namespace_path) {
/* Wait until the child has unshared its network namespace. */
- if (!barrier_place_and_sync(&barrier)) { /* #3 */
- log_error("Child died too early");
- return -ESRCH;
- }
+ if (!barrier_place_and_sync(&barrier)) /* #3 */
+ return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early");
}
r = move_network_interfaces(*pid, arg_network_interfaces);
return r;
/* Let the child know that we are ready and wait that the child is completely ready now. */
- if (!barrier_place_and_sync(&barrier)) { /* #5 */
- log_error("Child died too early.");
- return -ESRCH;
- }
+ if (!barrier_place_and_sync(&barrier)) /* #5 */
+ return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
/* At this point we have made use of the UID we picked, and thus nss-mymachines
* will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
- if (IN_SET(arg_console_mode, CONSOLE_INTERACTIVE, CONSOLE_READ_ONLY)) {
- assert(master >= 0);
+ if (arg_console_mode != CONSOLE_PIPE) {
+ _cleanup_close_ int fd = -1;
+ PTYForwardFlags flags = 0;
- r = pty_forward_new(event, master,
- PTY_FORWARD_IGNORE_VHANGUP | (arg_console_mode == CONSOLE_READ_ONLY ? PTY_FORWARD_READ_ONLY : 0),
- &forward);
- if (r < 0)
- return log_error_errno(r, "Failed to create PTY forwarder: %m");
+ /* Retrieve the master pty allocated by inner child */
+ fd = receive_one_fd(master_pty_socket_pair[0], 0);
+ if (fd < 0)
+ return log_error_errno(fd, "Failed to receive master pty from the inner child: %m");
- if (arg_console_width != (unsigned) -1 || arg_console_height != (unsigned) -1)
- (void) pty_forward_set_width_height(forward, arg_console_width, arg_console_height);
+ switch (arg_console_mode) {
+
+ case CONSOLE_READ_ONLY:
+ flags |= PTY_FORWARD_READ_ONLY;
+
+ _fallthrough_;
+
+ case CONSOLE_INTERACTIVE:
+ flags |= PTY_FORWARD_IGNORE_VHANGUP;
+
+ r = pty_forward_new(event, fd, flags, &forward);
+ if (r < 0)
+ return log_error_errno(r, "Failed to create PTY forwarder: %m");
+
+ if (arg_console_width != (unsigned) -1 || arg_console_height != (unsigned) -1)
+ (void) pty_forward_set_width_height(forward,
+ arg_console_width,
+ arg_console_height);
+ break;
+
+ default:
+ assert(arg_console_mode == CONSOLE_PASSIVE);
+ }
+
+ *master = TAKE_FD(fd);
}
r = sd_event_loop(event);
}
static int run(int argc, char *argv[]) {
- _cleanup_free_ char *console = NULL;
+ bool secondary = false, remove_directory = false, remove_image = false,
+ veth_created = false, remove_tmprootdir = false;
_cleanup_close_ int master = -1;
_cleanup_fdset_free_ FDSet *fds = NULL;
int r, n_fd_passed, ret = EXIT_SUCCESS;
char veth_name[IFNAMSIZ] = "";
- bool secondary = false, remove_directory = false, remove_image = false;
- pid_t pid = 0;
union in_addr_union exposed = {};
_cleanup_(release_lock_file) LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
- bool veth_created = false, remove_tmprootdir = false;
char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
_cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
_cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
_cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
+ pid_t pid = 0;
log_parse_environment();
log_open();
- /* Make sure rename_process() in the stub init process can work */
- saved_argv = argv;
- saved_argc = argc;
-
r = parse_argv(argc, argv);
if (r <= 0)
goto finish;
if (arg_directory) {
assert(!arg_image);
- if (path_equal(arg_directory, "/") && !arg_ephemeral) {
- log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
+ /* Safety precaution: let's not allow running images from the live host OS image, as long as
+ * /var from the host will propagate into container dynamically (because bad things happen if
+ * two systems write to the same /var). Let's allow it for the special cases where /var is
+ * either copied (i.e. --ephemeral) or replaced (i.e. --volatile=yes|state). */
+ if (path_equal(arg_directory, "/") && !(arg_ephemeral || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_STATE))) {
+ log_error("Spawning container on root directory is not supported. Consider using --ephemeral, --volatile=yes or --volatile=state.");
r = -EINVAL;
goto finish;
}
if (r < 0)
goto finish;
- /* If the specified path is a mount point we
- * generate the new snapshot immediately
- * inside it under a random name. However if
- * the specified is not a mount point we
- * create the new snapshot in the parent
- * directory, just next to it. */
+ /* If the specified path is a mount point we generate the new snapshot immediately
+ * inside it under a random name. However if the specified is not a mount point we
+ * create the new snapshot in the parent directory, just next to it. */
r = path_is_mount_point(arg_directory, NULL, 0);
if (r < 0) {
log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
goto finish;
}
- r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
+ /* We take an exclusive lock on this image, since it's our private, ephemeral copy
+ * only owned by us and no one else. */
+ r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
if (r < 0) {
log_error_errno(r, "Failed to lock %s: %m", np);
goto finish;
}
- r = btrfs_subvol_snapshot(arg_directory, np,
- (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
- BTRFS_SNAPSHOT_FALLBACK_COPY |
- BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
- BTRFS_SNAPSHOT_RECURSIVE |
- BTRFS_SNAPSHOT_QUOTA);
+ {
+ BLOCK_SIGNALS(SIGINT);
+ r = btrfs_subvol_snapshot(arg_directory, np,
+ (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
+ BTRFS_SNAPSHOT_FALLBACK_COPY |
+ BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
+ BTRFS_SNAPSHOT_RECURSIVE |
+ BTRFS_SNAPSHOT_QUOTA |
+ BTRFS_SNAPSHOT_SIGINT);
+ }
+ if (r == -EINTR) {
+ log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", np);
+ goto finish;
+ }
if (r < 0) {
log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
goto finish;
}
free_and_replace(arg_directory, np);
-
remove_directory = true;
-
} else {
r = chase_symlinks_and_update(&arg_directory, arg_template ? CHASE_NONEXISTENT : 0);
if (r < 0)
if (r < 0)
goto finish;
- r = btrfs_subvol_snapshot(arg_template, arg_directory,
- (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
- BTRFS_SNAPSHOT_FALLBACK_COPY |
- BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
- BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
- BTRFS_SNAPSHOT_RECURSIVE |
- BTRFS_SNAPSHOT_QUOTA);
+ {
+ BLOCK_SIGNALS(SIGINT);
+ r = btrfs_subvol_snapshot(arg_template, arg_directory,
+ (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
+ BTRFS_SNAPSHOT_FALLBACK_COPY |
+ BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
+ BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
+ BTRFS_SNAPSHOT_RECURSIVE |
+ BTRFS_SNAPSHOT_QUOTA |
+ BTRFS_SNAPSHOT_SIGINT);
+ }
if (r == -EEXIST)
log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
"Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
- else if (r < 0) {
+ else if (r == -EINTR) {
+ log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", arg_directory);
+ goto finish;
+ } else if (r < 0) {
log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
goto finish;
} else
goto finish;
}
- r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
+ /* Always take an exclusive lock on our own ephemeral copy. */
+ r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
if (r < 0) {
r = log_error_errno(r, "Failed to create image lock: %m");
goto finish;
}
- r = copy_file(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600, FS_NOCOW_FL, COPY_REFLINK|COPY_CRTIME);
+ {
+ BLOCK_SIGNALS(SIGINT);
+ r = copy_file(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600, FS_NOCOW_FL, FS_NOCOW_FL, COPY_REFLINK|COPY_CRTIME|COPY_SIGINT);
+ }
+ if (r == -EINTR) {
+ log_error_errno(r, "Interrupted while copying image file to %s, removed again.", np);
+ goto finish;
+ }
if (r < 0) {
r = log_error_errno(r, "Failed to copy image file: %m");
goto finish;
}
free_and_replace(arg_image, np);
-
remove_image = true;
} else {
r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
if (arg_console_mode == CONSOLE_PIPE) /* if we pass STDERR on to the container, don't add our own logs into it too */
arg_quiet = true;
- if (arg_console_mode != CONSOLE_PIPE) {
- master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NONBLOCK);
- if (master < 0) {
- r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
- goto finish;
- }
-
- r = ptsname_malloc(master, &console);
- if (r < 0) {
- r = log_error_errno(r, "Failed to determine tty name: %m");
- goto finish;
- }
-
- if (arg_selinux_apifs_context) {
- r = mac_selinux_apply(console, arg_selinux_apifs_context);
- if (r < 0)
- goto finish;
- }
-
- if (unlockpt(master) < 0) {
- r = log_error_errno(errno, "Failed to unlock tty: %m");
- goto finish;
- }
- }
-
if (!arg_quiet)
log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
arg_machine, arg_image ?: arg_directory);
}
for (;;) {
- r = run_container(master,
- console,
- dissected_image,
+ r = run_container(dissected_image,
secondary,
fds,
veth_name, &veth_created,
- &exposed,
+ &exposed, &master,
&pid, &ret);
if (r <= 0)
break;
custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
expose_port_free_all(arg_expose_ports);
rlimit_free_all(arg_rlimit);
- device_node_free_many(arg_extra_nodes, arg_n_extra_nodes);
+ device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
if (r < 0)
return r;