#include <sched.h>
#include <unistd.h>
#include <sys/types.h>
-#include <sys/syscall.h>
#include <sys/mount.h>
-#include <sys/wait.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <errno.h>
#include <sys/prctl.h>
#include <getopt.h>
-#include <termios.h>
-#include <sys/signalfd.h>
#include <grp.h>
#include <linux/fs.h>
-#include <sys/un.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <net/if.h>
#include <linux/veth.h>
#include <sys/personality.h>
#include <linux/loop.h>
-#include <poll.h>
#include <sys/file.h>
#ifdef HAVE_SELINUX
#include "log.h"
#include "util.h"
#include "mkdir.h"
+#include "rm-rf.h"
#include "macro.h"
-#include "audit.h"
#include "missing.h"
#include "cgroup-util.h"
#include "strv.h"
#include "bus-util.h"
#include "bus-error.h"
#include "ptyfwd.h"
-#include "bus-kernel.h"
#include "env-util.h"
-#include "def.h"
#include "rtnl-util.h"
#include "udev-util.h"
#include "blkid-util.h"
#include "in-addr-util.h"
#include "fw-util.h"
#include "local-addresses.h"
+#include "formats-util.h"
+#include "process-util.h"
#ifdef HAVE_SECCOMP
#include "seccomp-util.h"
static char **arg_property = NULL;
static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
static bool arg_userns = false;
+static int arg_kill_signal = 0;
static void help(void) {
printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
" --capability=CAP In addition to the default, retain specified\n"
" capability\n"
" --drop-capability=CAP Drop the specified capability from the default set\n"
+ " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
" --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
" try-guest, try-host\n"
" -j Equivalent to --link-journal=try-guest\n"
ARG_TEMPLATE,
ARG_PROPERTY,
ARG_PRIVATE_USERS,
+ ARG_KILL_SIGNAL,
};
static const struct option options[] = {
{ "port", required_argument, NULL, 'p' },
{ "property", required_argument, NULL, ARG_PROPERTY },
{ "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
+ { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
{}
};
arg_userns = true;
break;
+ case ARG_KILL_SIGNAL:
+ arg_kill_signal = signal_from_string_try_harder(optarg);
+ if (arg_kill_signal < 0) {
+ log_error("Cannot parse signal: %s", optarg);
+ return -EINVAL;
+ }
+
+ break;
+
case '?':
return -EINVAL;
arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
+ if (arg_boot && arg_kill_signal <= 0)
+ arg_kill_signal = SIGRTMIN+3;
+
return 1;
}
return log_oom();
t = path_is_mount_point(where, true);
- if (t < 0) {
+ if (t < 0 && t != -ENOENT) {
log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
if (r == 0)
return log_error_errno(r, "Failed to create mount point %s: %m", where);
}
- if (mount(*x, where, "bind", MS_BIND, NULL) < 0)
+ if (mount(*x, where, NULL, MS_BIND, NULL) < 0)
return log_error_errno(errno, "mount(%s) failed: %m", where);
if (ro) {
to = strjoina(dest, "/sys/fs/cgroup/", hierarchy);
r = path_is_mount_point(to, false);
- if (r < 0)
+ if (r < 0 && r != -ENOENT)
return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
if (r > 0)
return 0;
goto fail;
}
- if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
+ if (mount(f, t, NULL, MS_BIND|MS_REC, NULL) < 0) {
log_error_errno(errno, "Failed to create /usr bind mount: %m");
r = -errno;
goto fail;
if (r < 0)
return log_error_errno(r, "Failed to write boot id: %m");
- if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
+ if (mount(from, to, NULL, MS_BIND, NULL) < 0) {
log_error_errno(errno, "Failed to bind mount boot id: %m");
r = -errno;
- } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
+ } else if (mount(from, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
log_warning_errno(errno, "Failed to make boot id read-only: %m");
unlink(from);
return -r;
}
- if (mknod(to, st.st_mode, st.st_rdev) < 0)
- return log_error_errno(errno, "mknod(%s) failed: %m", to);
+ if (mknod(to, st.st_mode, st.st_rdev) < 0) {
+ if (errno != EPERM)
+ return log_error_errno(errno, "mknod(%s) failed: %m", to);
+
+ /* Some systems abusively restrict mknod but
+ * allow bind mounts. */
+ r = touch(to);
+ if (r < 0)
+ return log_error_errno(r, "touch (%s) failed: %m", to);
+ if (mount(from, to, NULL, MS_BIND, NULL) < 0)
+ return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
+ }
if (arg_userns && arg_uid_shift != UID_INVALID)
if (lchown(to, arg_uid_shift, arg_uid_shift) < 0)
static int setup_dev_console(const char *dest, const char *console) {
_cleanup_umask_ mode_t u;
const char *to;
- struct stat st;
int r;
assert(dest);
u = umask(0000);
- if (stat("/dev/null", &st) < 0)
- return log_error_errno(errno, "Failed to stat /dev/null: %m");
-
r = chmod_and_chown(console, 0600, 0, 0);
if (r < 0)
return log_error_errno(r, "Failed to correct access mode for TTY: %m");
/* We need to bind mount the right tty to /dev/console since
* ptys can only exist on pts file systems. To have something
- * to bind mount things on we create a device node first, and
- * use /dev/null for that since we the cgroups device policy
- * allows us to create that freely, while we cannot create
- * /dev/console. (Note that the major minor doesn't actually
- * matter here, since we mount it over anyway). */
+ * to bind mount things on we create an empty regular file. */
to = strjoina(dest, "/dev/console");
- if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
- return log_error_errno(errno, "mknod() for /dev/console failed: %m");
+ r = touch(to);
+ if (r < 0)
+ return log_error_errno(r, "touch() for /dev/console failed: %m");
- if (mount(console, to, "bind", MS_BIND, NULL) < 0)
+ if (mount(console, to, NULL, MS_BIND, NULL) < 0)
return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
return 0;
if (r < 0)
return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
- if (mount(from, to, "bind", MS_BIND, NULL) < 0)
+ if (mount(from, to, NULL, MS_BIND, NULL) < 0)
return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
return r;
}
- if (mount(p, q, "bind", MS_BIND, NULL) < 0)
+ if (mount(p, q, NULL, MS_BIND, NULL) < 0)
return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
return 0;
static int setup_seccomp(void) {
#ifdef HAVE_SECCOMP
- static const int blacklist[] = {
- SCMP_SYS(kexec_load),
- SCMP_SYS(open_by_handle_at),
- SCMP_SYS(iopl),
- SCMP_SYS(ioperm),
- SCMP_SYS(swapon),
- SCMP_SYS(swapoff),
- };
-
- static const int kmod_blacklist[] = {
- SCMP_SYS(init_module),
- SCMP_SYS(finit_module),
- SCMP_SYS(delete_module),
+ static const struct {
+ uint64_t capability;
+ int syscall_num;
+ } blacklist[] = {
+ { CAP_SYS_RAWIO, SCMP_SYS(iopl)},
+ { CAP_SYS_RAWIO, SCMP_SYS(ioperm)},
+ { CAP_SYS_BOOT, SCMP_SYS(kexec_load)},
+ { CAP_SYS_ADMIN, SCMP_SYS(swapon)},
+ { CAP_SYS_ADMIN, SCMP_SYS(swapoff)},
+ { CAP_SYS_ADMIN, SCMP_SYS(open_by_handle_at)},
+ { CAP_SYS_MODULE, SCMP_SYS(init_module)},
+ { CAP_SYS_MODULE, SCMP_SYS(finit_module)},
+ { CAP_SYS_MODULE, SCMP_SYS(delete_module)},
};
scmp_filter_ctx seccomp;
}
for (i = 0; i < ELEMENTSOF(blacklist); i++) {
- r = seccomp_rule_add(seccomp, SCMP_ACT_ERRNO(EPERM), blacklist[i], 0);
+ if (arg_retain & (1ULL << blacklist[i].capability))
+ continue;
+
+ r = seccomp_rule_add(seccomp, SCMP_ACT_ERRNO(EPERM), blacklist[i].syscall_num, 0);
if (r == -EFAULT)
continue; /* unknown syscall */
if (r < 0) {
}
}
- /* If the CAP_SYS_MODULE capability is not requested then
- * we'll block the kmod syscalls too */
- if (!(arg_retain & (1ULL << CAP_SYS_MODULE))) {
- for (i = 0; i < ELEMENTSOF(kmod_blacklist); i++) {
- r = seccomp_rule_add(seccomp, SCMP_ACT_ERRNO(EPERM), kmod_blacklist[i], 0);
- if (r == -EFAULT)
- continue; /* unknown syscall */
- if (r < 0) {
- log_error_errno(r, "Failed to block syscall: %m");
- goto finish;
- }
- }
- }
/*
Audit is broken in containers, much of the userspace audit
return -errno;
}
- blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
+ (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
is_gpt = streq_ptr(pttype, "gpt");
is_mbr = streq_ptr(pttype, "dos");
return 0;
#else
log_error("--image= is not supported, compiled without blkid support.");
- return -ENOTSUP;
+ return -EOPNOTSUPP;
#endif
}
if (streq(fstype, "crypto_LUKS")) {
log_error("nspawn currently does not support LUKS disk images.");
- return -ENOTSUP;
+ return -EOPNOTSUPP;
}
if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
return 0;
#else
log_error("--image= is not supported, compiled without blkid support.");
- return -ENOTSUP;
+ return -EOPNOTSUPP;
#endif
}
pid = PTR_TO_UINT32(userdata);
if (pid > 0) {
- if (kill(pid, SIGRTMIN+3) >= 0) {
+ if (kill(pid, arg_kill_signal) >= 0) {
log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
sd_event_source_set_userdata(s, NULL);
return 0;
goto finish;
}
- if (sd_booted() <= 0) {
- log_error("Not running on a systemd system.");
- r = -EINVAL;
- goto finish;
- }
-
log_close();
n_fd_passed = sd_listen_fds(false);
if (n_fd_passed > 0) {
}
if (arg_ephemeral) {
- char *np;
+ _cleanup_free_ char *np = NULL;
/* If the specified path is a mount point we
* generate the new snapshot immediately
goto finish;
}
- r = btrfs_subvol_snapshot(arg_directory, np, arg_read_only, true);
+ r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
if (r < 0) {
- free(np);
log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
goto finish;
}
free(arg_directory);
arg_directory = np;
+ np = NULL;
remove_subvol = true;
}
if (arg_template) {
- r = btrfs_subvol_snapshot(arg_template, arg_directory, arg_read_only, true);
+ r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
if (r == -EEXIST) {
if (!arg_quiet)
log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
_exit(EXIT_FAILURE);
/* Turn directory into bind mount */
- if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
+ if (mount(arg_directory, arg_directory, NULL, MS_BIND|MS_REC, NULL) < 0) {
log_error_errno(errno, "Failed to make bind mount: %m");
_exit(EXIT_FAILURE);
}
goto finish;
}
- if (arg_boot) {
+ if (arg_kill_signal > 0) {
/* Try to kill the init system on SIGINT or SIGTERM */
sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
if (remove_subvol && arg_directory) {
int k;
- k = btrfs_subvol_remove(arg_directory);
+ k = btrfs_subvol_remove(arg_directory, true);
if (k < 0)
log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
}
const char *p;
p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
- (void) rm_rf(p, false, true, false);
+ (void) rm_rf(p, REMOVE_ROOT);
}
free(arg_directory);