#include "mkdir.h"
#include "mount-util.h"
#include "mountpoint-util.h"
+#include "namespace-util.h"
#include "netlink-util.h"
#include "nspawn-cgroup.h"
#include "nspawn-def.h"
#include "nspawn-expose-ports.h"
#include "nspawn-mount.h"
#include "nspawn-network.h"
+#include "nspawn-oci.h"
#include "nspawn-patch-uid.h"
#include "nspawn-register.h"
#include "nspawn-seccomp.h"
#include "nspawn-settings.h"
#include "nspawn-setuid.h"
#include "nspawn-stub-pid1.h"
+#include "nulstr-util.h"
#include "os-util.h"
#include "pager.h"
#include "parse-util.h"
#include "raw-clone.h"
#include "rlimit-util.h"
#include "rm-rf.h"
+#if HAVE_SECCOMP
+#include "seccomp-util.h"
+#endif
#include "selinux-util.h"
#include "signal-util.h"
#include "socket-util.h"
#include "string-table.h"
#include "string-util.h"
#include "strv.h"
+#include "sysctl-util.h"
#include "terminal-util.h"
#include "tmpfile-util.h"
#include "umask-util.h"
static char *arg_pivot_root_new = NULL;
static char *arg_pivot_root_old = NULL;
static char *arg_user = NULL;
+static uid_t arg_uid = UID_INVALID;
+static gid_t arg_gid = GID_INVALID;
+static gid_t* arg_supplementary_gids = NULL;
+static size_t arg_n_supplementary_gids = 0;
static sd_id128_t arg_uuid = {};
static char *arg_machine = NULL; /* The name used by the host to refer to this */
static char *arg_hostname = NULL; /* The name the payload sees by default */
static const char *arg_selinux_context = NULL;
static const char *arg_selinux_apifs_context = NULL;
-static const char *arg_slice = NULL;
+static char *arg_slice = NULL;
static bool arg_private_network = false;
static bool arg_read_only = false;
static StartMode arg_start_mode = START_PID1;
(1ULL << CAP_SYS_PTRACE) |
(1ULL << CAP_SYS_RESOURCE) |
(1ULL << CAP_SYS_TTY_CONFIG);
+static CapabilityQuintet arg_full_capabilities = CAPABILITY_QUINTET_NULL;
static CustomMount *arg_custom_mounts = NULL;
static size_t arg_n_custom_mounts = 0;
static char **arg_setenv = NULL;
static char *arg_network_namespace_path = NULL;
static unsigned long arg_personality = PERSONALITY_INVALID;
static char *arg_image = NULL;
+static char *arg_oci_bundle = NULL;
static VolatileMode arg_volatile_mode = VOLATILE_NO;
static ExposePort *arg_expose_ports = NULL;
static char **arg_property = NULL;
+static sd_bus_message *arg_property_message = NULL;
static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
static bool arg_userns_chown = false;
static size_t arg_root_hash_size = 0;
static char **arg_syscall_whitelist = NULL;
static char **arg_syscall_blacklist = NULL;
+#if HAVE_SECCOMP
+static scmp_filter_ctx arg_seccomp = NULL;
+#endif
static struct rlimit *arg_rlimit[_RLIMIT_MAX] = {};
static bool arg_no_new_privileges = false;
static int arg_oom_score_adjust = 0;
static unsigned arg_cpuset_ncpus = 0;
static ResolvConfMode arg_resolv_conf = RESOLV_CONF_AUTO;
static TimezoneMode arg_timezone = TIMEZONE_AUTO;
+static unsigned arg_console_width = (unsigned) -1, arg_console_height = (unsigned) -1;
+static DeviceNode* arg_extra_nodes = NULL;
+static size_t arg_n_extra_nodes = 0;
+static char **arg_sysctl = NULL;
+static ConsoleMode arg_console_mode = _CONSOLE_MODE_INVALID;
static int help(void) {
_cleanup_free_ char *link = NULL;
if (r < 0)
return log_oom();
- printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
+ printf("%1$s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
"Spawn a command or OS in a light-weight container.\n\n"
" -h --help Show this help\n"
" --version Print version string\n"
" -q --quiet Do not show status information\n"
+ " --settings=BOOLEAN Load additional settings from .nspawn file\n\n"
+ "%3$sImage:%4$s\n"
" -D --directory=PATH Root directory for the container\n"
" --template=PATH Initialize root directory from template directory,\n"
" if missing\n"
" -x --ephemeral Run container with snapshot of root directory, and\n"
" remove it after exit\n"
" -i --image=PATH File system device or disk image for the container\n"
+ " --oci-bundle=PATH OCI bundle directory\n"
+ " --read-only Mount the root directory read-only\n"
+ " --volatile[=MODE] Run the system in volatile mode\n"
" --root-hash=HASH Specify verity root hash\n"
+ " --pivot-root=PATH[:PATH]\n"
+ " Pivot root to given directory in the container\n\n"
+ "%3$sExecution:%4$s\n"
" -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
" -b --boot Boot up full system (i.e. invoke init)\n"
" --chdir=PATH Set working directory in the container\n"
- " --pivot-root=PATH[:PATH]\n"
- " Pivot root to given directory in the container\n"
- " -u --user=USER Run the command under specified user or uid\n"
+ " -E --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
+ " -u --user=USER Run the command under specified user or UID\n"
+ " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
+ " --notify-ready=BOOLEAN Receive notifications from the child init process\n\n"
+ "%3$sSystem Identity:%4$s\n"
" -M --machine=NAME Set the machine name for the container\n"
" --hostname=NAME Override the hostname for the container\n"
- " --uuid=UUID Set a specific machine UUID for the container\n"
+ " --uuid=UUID Set a specific machine UUID for the container\n\n"
+ "%3$sProperties:%4$s\n"
" -S --slice=SLICE Place the container in the specified slice\n"
" --property=NAME=VALUE Set scope unit property\n"
+ " --register=BOOLEAN Register container as machine\n"
+ " --keep-unit Do not register a scope for the machine, reuse\n"
+ " the service unit nspawn is running in\n\n"
+ "%3$sUser Namespacing:%4$s\n"
" -U --private-users=pick Run within user namespace, autoselect UID/GID range\n"
" --private-users[=UIDBASE[:NUIDS]]\n"
" Similar, but with user configured UID/GID range\n"
- " --private-users-chown Adjust OS tree ownership to private UID/GID range\n"
+ " --private-users-chown Adjust OS tree ownership to private UID/GID range\n\n"
+ "%3$sNetworking:%4$s\n"
" --private-network Disable network in container\n"
" --network-interface=INTERFACE\n"
" Assign an existing network interface to the\n"
" Set network namespace to the one represented by\n"
" the specified kernel namespace file node\n"
" -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
- " Expose a container IP port on the host\n"
- " -Z --selinux-context=SECLABEL\n"
- " Set the SELinux security context to be used by\n"
- " processes in the container\n"
- " -L --selinux-apifs-context=SECLABEL\n"
- " Set the SELinux security context to be used by\n"
- " API/tmpfs file systems in the container\n"
+ " Expose a container IP port on the host\n\n"
+ "%3$sSecurity:%4$s\n"
" --capability=CAP In addition to the default, retain specified\n"
" capability\n"
" --drop-capability=CAP Drop the specified capability from the default set\n"
" --system-call-filter=LIST|~LIST\n"
" Permit/prohibit specific system calls\n"
+ " -Z --selinux-context=SECLABEL\n"
+ " Set the SELinux security context to be used by\n"
+ " processes in the container\n"
+ " -L --selinux-apifs-context=SECLABEL\n"
+ " Set the SELinux security context to be used by\n"
+ " API/tmpfs file systems in the container\n\n"
+ "%3$sResources:%4$s\n"
" --rlimit=NAME=LIMIT Set a resource limit for the payload\n"
" --oom-score-adjust=VALUE\n"
" Adjust the OOM score value for the payload\n"
- " --cpu-affinity=CPUS Adjust the CPU affinity of the container\n"
- " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
- " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
- " host, try-guest, try-host\n"
- " -j Equivalent to --link-journal=try-guest\n"
+ " --cpu-affinity=CPUS Adjust the CPU affinity of the container\n\n"
+ "%3$sIntegration:%4$s\n"
" --resolv-conf=MODE Select mode of /etc/resolv.conf initialization\n"
" --timezone=MODE Select mode of /etc/localtime initialization\n"
- " --read-only Mount the root directory read-only\n"
+ " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
+ " host, try-guest, try-host\n"
+ " -j Equivalent to --link-journal=try-guest\n\n"
+ "%3$sMounts:%4$s\n"
" --bind=PATH[:PATH[:OPTIONS]]\n"
" Bind mount a file or directory from the host into\n"
" the container\n"
" --bind-ro=PATH[:PATH[:OPTIONS]\n"
" Similar, but creates a read-only bind mount\n"
+ " --inaccessible=PATH Over-mount file node with inaccessible node to mask\n"
+ " it\n"
" --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
" --overlay=PATH[:PATH...]:PATH\n"
" Create an overlay mount from the host to \n"
" the container\n"
" --overlay-ro=PATH[:PATH...]:PATH\n"
- " Similar, but creates a read-only overlay mount\n"
- " -E --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
- " --register=BOOLEAN Register container as machine\n"
- " --keep-unit Do not register a scope for the machine, reuse\n"
- " the service unit nspawn is running in\n"
- " --volatile[=MODE] Run the system in volatile mode\n"
- " --settings=BOOLEAN Load additional settings from .nspawn file\n"
- " --notify-ready=BOOLEAN Receive notifications from the child init process\n"
- "\nSee the %s for details.\n"
+ " Similar, but creates a read-only overlay mount\n\n"
+ "%3$sInput/Output:%4$s\n"
+ " --console=MODE Select how stdin/stdout/stderr and /dev/console are\n"
+ " set up for the container.\n"
+ " -P --pipe Equivalent to --console=pipe\n"
+ "\nSee the %2$s for details.\n"
, program_invocation_short_name
, link
- );
+ , ansi_underline(), ansi_normal());
return 0;
}
return;
if (r < 0)
log_warning_errno(r, "Failed to parse %s from environment, defaulting to false.", name);
+
arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
+ arg_settings_mask |= SETTING_CLONE_NS_FLAGS;
}
static void parse_mount_settings_env(void) {
/* SYSTEMD_NSPAWN_USE_CGNS=0 can be used to disable CLONE_NEWCGROUP use,
* even if it is supported. If not supported, it has no effect. */
- r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
- if (r == 0 || !cg_ns_supported())
+ if (!cg_ns_supported())
arg_use_cgns = false;
+ else {
+ r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
+ if (r < 0) {
+ if (r != -ENXIO)
+ log_warning_errno(r, "Failed to parse $SYSTEMD_NSPAWN_USE_CGNS, ignoring: %m");
+
+ arg_use_cgns = true;
+ } else {
+ arg_use_cgns = r > 0;
+ arg_settings_mask |= SETTING_USE_CGNS;
+ }
+ }
e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
if (e)
ARG_TMPFS,
ARG_OVERLAY,
ARG_OVERLAY_RO,
+ ARG_INACCESSIBLE,
ARG_SHARE_SYSTEM,
ARG_REGISTER,
ARG_KEEP_UNIT,
ARG_CPU_AFFINITY,
ARG_RESOLV_CONF,
ARG_TIMEZONE,
+ ARG_CONSOLE,
+ ARG_PIPE,
+ ARG_OCI_BUNDLE,
};
static const struct option options[] = {
{ "tmpfs", required_argument, NULL, ARG_TMPFS },
{ "overlay", required_argument, NULL, ARG_OVERLAY },
{ "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
+ { "inaccessible", required_argument, NULL, ARG_INACCESSIBLE },
{ "machine", required_argument, NULL, 'M' },
{ "hostname", required_argument, NULL, ARG_HOSTNAME },
{ "slice", required_argument, NULL, 'S' },
{ "cpu-affinity", required_argument, NULL, ARG_CPU_AFFINITY },
{ "resolv-conf", required_argument, NULL, ARG_RESOLV_CONF },
{ "timezone", required_argument, NULL, ARG_TIMEZONE },
+ { "console", required_argument, NULL, ARG_CONSOLE },
+ { "pipe", no_argument, NULL, ARG_PIPE },
+ { "oci-bundle", required_argument, NULL, ARG_OCI_BUNDLE },
{}
};
assert(argc >= 0);
assert(argv);
- while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nUE:", options, NULL)) >= 0)
+ while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nUE:P", options, NULL)) >= 0)
switch (c) {
case 'h':
r = parse_path_argument_and_warn(optarg, false, &arg_directory);
if (r < 0)
return r;
+
+ arg_settings_mask |= SETTING_DIRECTORY;
break;
case ARG_TEMPLATE:
r = parse_path_argument_and_warn(optarg, false, &arg_template);
if (r < 0)
return r;
+
+ arg_settings_mask |= SETTING_DIRECTORY;
break;
case 'i':
r = parse_path_argument_and_warn(optarg, false, &arg_image);
if (r < 0)
return r;
+
+ arg_settings_mask |= SETTING_DIRECTORY;
+ break;
+
+ case ARG_OCI_BUNDLE:
+ r = parse_path_argument_and_warn(optarg, false, &arg_oci_bundle);
+ if (r < 0)
+ return r;
+
break;
case 'x':
if (r < 0)
return r;
+ arg_settings_mask |= SETTING_NETWORK;
break;
case 'b':
break;
case 'S':
- arg_slice = optarg;
+ r = free_and_strdup(&arg_slice, optarg);
+ if (r < 0)
+ return log_oom();
+
+ arg_settings_mask |= SETTING_SLICE;
break;
case 'M':
r = extract_first_word(&p, &t, ",", 0);
if (r < 0)
return log_error_errno(r, "Failed to parse capability %s.", t);
-
if (r == 0)
break;
arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
break;
+ case ARG_INACCESSIBLE:
+ r = inaccessible_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse --inaccessible= argument %s: %m", optarg);
+
+ arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
+ break;
+
case 'E': {
char **n;
arg_settings_mask |= SETTING_TIMEZONE;
break;
+ case ARG_CONSOLE:
+ if (streq(optarg, "interactive"))
+ arg_console_mode = CONSOLE_INTERACTIVE;
+ else if (streq(optarg, "read-only"))
+ arg_console_mode = CONSOLE_READ_ONLY;
+ else if (streq(optarg, "passive"))
+ arg_console_mode = CONSOLE_PASSIVE;
+ else if (streq(optarg, "pipe"))
+ arg_console_mode = CONSOLE_PIPE;
+ else if (streq(optarg, "help"))
+ puts("interactive\n"
+ "read-only\n"
+ "passive\n"
+ "pipe");
+ else {
+ log_error("Unknown console mode: %s", optarg);
+ return -EINVAL;
+ }
+
+ arg_settings_mask |= SETTING_CONSOLE_MODE;
+ break;
+
+ case 'P':
+ case ARG_PIPE:
+ arg_console_mode = CONSOLE_PIPE;
+ arg_settings_mask |= SETTING_CONSOLE_MODE;
+ break;
+
case '?':
return -EINVAL;
* --directory=". */
arg_directory = TAKE_PTR(arg_template);
- arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
+ arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? UINT64_C(1) << CAP_NET_ADMIN : 0)) & ~minus;
+
+ /* Make sure to parse environment before we reset the settings mask below */
+ parse_environment();
/* Load all settings from .nspawn files */
if (mask_no_settings)
if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
arg_kill_signal = SIGRTMIN+3;
+ if (arg_volatile_mode != VOLATILE_NO) /* Make sure all file systems contained in the image are mounted read-only if we are in volatile mode */
+ arg_read_only = true;
+
if (arg_keep_unit && arg_register && cg_pid_get_owner_uid(0, NULL) >= 0)
/* Save the user from accidentally registering either user-$SESSION.scope or user@.service.
* The latter is not technically a user session, but we don't need to labour the point. */
return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--private-users= is not supported, kernel compiled without user namespace support.");
if (arg_userns_chown && arg_read_only)
- return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--read-only and --private-users-chown may not be combined.");
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+ "--read-only and --private-users-chown may not be combined.");
+
+ /* We don't support --private-users-chown together with any of the volatile modes since we couldn't
+ * change the read-only part of the tree (i.e. /usr) anyway, or because it would trigger a massive
+ * copy-up (in case of overlay) making the entire excercise pointless. */
+ if (arg_userns_chown && arg_volatile_mode != VOLATILE_NO)
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--volatile= and --private-users-chown may not be combined.");
- /* If --network-namespace-path is given with any other network-related option,
- * we need to error out, to avoid conflicts between different network options. */
+ /* If --network-namespace-path is given with any other network-related option, we need to error out,
+ * to avoid conflicts between different network options. */
if (arg_network_namespace_path &&
(arg_network_interfaces || arg_network_macvlan ||
arg_network_ipvlan || arg_network_veth_extra ||
arg_network_bridge || arg_network_zone ||
arg_network_veth || arg_private_network))
- return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--network-namespace-path cannot be combined with other network options.");
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--network-namespace-path= cannot be combined with other network options.");
if (arg_network_bridge && arg_network_zone)
- return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--network-bridge= and --network-zone= may not be combined.");
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+ "--network-bridge= and --network-zone= may not be combined.");
if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network)
return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO))
return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot combine --private-users with read-write mounts.");
- if (arg_volatile_mode != VOLATILE_NO && arg_read_only)
- return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
-
if (arg_expose_ports && !arg_private_network)
return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot use --port= without private networking.");
"/usr/share/zoneinfo/");
}
+static bool etc_writable(void) {
+ return !arg_read_only || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_OVERLAY);
+}
+
static int setup_timezone(const char *dest) {
_cleanup_free_ char *p = NULL, *etc = NULL;
const char *where, *check;
if (IN_SET(arg_timezone, TIMEZONE_AUTO, TIMEZONE_SYMLINK)) {
r = readlink_malloc("/etc/localtime", &p);
if (r == -ENOENT && arg_timezone == TIMEZONE_AUTO)
- m = arg_read_only && arg_volatile_mode != VOLATILE_YES ? TIMEZONE_OFF : TIMEZONE_DELETE;
+ m = etc_writable() ? TIMEZONE_DELETE : TIMEZONE_OFF;
else if (r == -EINVAL && arg_timezone == TIMEZONE_AUTO) /* regular file? */
- m = arg_read_only && arg_volatile_mode != VOLATILE_YES ? TIMEZONE_BIND : TIMEZONE_COPY;
+ m = etc_writable() ? TIMEZONE_COPY : TIMEZONE_BIND;
else if (r < 0) {
log_warning_errno(r, "Failed to read host's /etc/localtime symlink, not updating container timezone: %m");
/* To handle warning, delete /etc/localtime and replace it with a symbolic link to a time zone data
*/
return 0;
} else if (arg_timezone == TIMEZONE_AUTO)
- m = arg_read_only && arg_volatile_mode != VOLATILE_YES ? TIMEZONE_BIND : TIMEZONE_SYMLINK;
+ m = etc_writable() ? TIMEZONE_SYMLINK : TIMEZONE_BIND;
else
m = arg_timezone;
} else
if (arg_private_network)
m = RESOLV_CONF_OFF;
else if (have_resolv_conf(STATIC_RESOLV_CONF) > 0 && resolved_listening() > 0)
- m = arg_read_only && arg_volatile_mode != VOLATILE_YES ? RESOLV_CONF_BIND_STATIC : RESOLV_CONF_COPY_STATIC;
+ m = etc_writable() ? RESOLV_CONF_COPY_STATIC : RESOLV_CONF_BIND_STATIC;
else if (have_resolv_conf("/etc/resolv.conf") > 0)
- m = arg_read_only && arg_volatile_mode != VOLATILE_YES ? RESOLV_CONF_BIND_HOST : RESOLV_CONF_COPY_HOST;
+ m = etc_writable() ? RESOLV_CONF_COPY_HOST : RESOLV_CONF_BIND_HOST;
else
- m = arg_read_only && arg_volatile_mode != VOLATILE_YES ? RESOLV_CONF_OFF : RESOLV_CONF_DELETE;
+ m = etc_writable() ? RESOLV_CONF_DELETE : RESOLV_CONF_OFF;
} else
m = arg_resolv_conf;
"tty\0"
"net/tun\0";
+ _cleanup_umask_ mode_t u;
const char *d;
int r = 0;
- _cleanup_umask_ mode_t u;
assert(dest);
return r;
}
+static int make_extra_nodes(const char *dest) {
+ _cleanup_umask_ mode_t u;
+ size_t i;
+ int r;
+
+ u = umask(0000);
+
+ for (i = 0; i < arg_n_extra_nodes; i++) {
+ _cleanup_free_ char *path = NULL;
+ DeviceNode *n = arg_extra_nodes + i;
+
+ path = prefix_root(dest, n->path);
+ if (!path)
+ return log_oom();
+
+ if (mknod(path, n->mode, S_ISCHR(n->mode) || S_ISBLK(n->mode) ? makedev(n->major, n->minor) : 0) < 0)
+ return log_error_errno(errno, "Failed to create device node '%s': %m", path);
+
+ r = chmod_and_chown(path, n->mode, n->uid, n->gid);
+ if (r < 0)
+ return log_error_errno(r, "Failed to adjust device node ownership of '%s': %m", path);
+ }
+
+ return 0;
+}
+
static int setup_pts(const char *dest) {
_cleanup_free_ char *options = NULL;
const char *p;
int r;
assert(dest);
- assert(console);
u = umask(0000);
+ if (!console)
+ return 0;
+
r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
if (r < 0)
return log_error_errno(r, "Failed to correct access mode for TTY: %m");
return 0;
}
-static int drop_capabilities(void) {
- return capability_bounding_set_drop(arg_caps_retain, false);
+static int drop_capabilities(uid_t uid) {
+ CapabilityQuintet q;
+
+ /* Let's initialize all five capability sets to something valid. If the quintet was configured via
+ * OCI use that, but fill in missing bits. If it wasn't then derive the quintet in full from
+ * arg_caps_retain. */
+
+ if (capability_quintet_is_set(&arg_full_capabilities)) {
+ q = arg_full_capabilities;
+
+ if (q.bounding == (uint64_t) -1)
+ q.bounding = uid == 0 ? arg_caps_retain : 0;
+
+ if (q.effective == (uint64_t) -1)
+ q.effective = uid == 0 ? q.bounding : 0;
+
+ if (q.inheritable == (uint64_t) -1)
+ q.inheritable = uid == 0 ? q.bounding : 0;
+
+ if (q.permitted == (uint64_t) -1)
+ q.permitted = uid == 0 ? q.bounding : 0;
+
+ if (q.ambient == (uint64_t) -1 && ambient_capabilities_supported())
+ q.ambient = 0;
+ } else
+ q = (CapabilityQuintet) {
+ .bounding = arg_caps_retain,
+ .effective = uid == 0 ? arg_caps_retain : 0,
+ .inheritable = uid == 0 ? arg_caps_retain : 0,
+ .permitted = uid == 0 ? arg_caps_retain : 0,
+ .ambient = ambient_capabilities_supported() ? 0 : (uint64_t) -1,
+ };
+
+ return capability_quintet_enforce(&q);
}
static int reset_audit_loginuid(void) {
return 0;
}
+static unsigned long effective_clone_ns_flags(void) {
+ unsigned long flags = arg_clone_ns_flags;
+
+ if (arg_private_network)
+ flags |= CLONE_NEWNET;
+ if (arg_use_cgns)
+ flags |= CLONE_NEWCGROUP;
+ if (arg_userns_mode != USER_NAMESPACE_NO)
+ flags |= CLONE_NEWUSER;
+
+ return flags;
+}
+
+static int patch_sysctl(void) {
+
+ /* This table is inspired by runc's sysctl() function */
+ static const struct {
+ const char *key;
+ bool prefix;
+ unsigned long clone_flags;
+ } safe_sysctl[] = {
+ { "kernel.hostname", false, CLONE_NEWUTS },
+ { "kernel.domainname", false, CLONE_NEWUTS },
+ { "kernel.msgmax", false, CLONE_NEWIPC },
+ { "kernel.msgmnb", false, CLONE_NEWIPC },
+ { "kernel.msgmni", false, CLONE_NEWIPC },
+ { "kernel.sem", false, CLONE_NEWIPC },
+ { "kernel.shmall", false, CLONE_NEWIPC },
+ { "kernel.shmmax", false, CLONE_NEWIPC },
+ { "kernel.shmmni", false, CLONE_NEWIPC },
+ { "fs.mqueue.", true, CLONE_NEWIPC },
+ { "net.", true, CLONE_NEWNET },
+ };
+
+ unsigned long flags;
+ char **k, **v;
+ int r;
+
+ flags = effective_clone_ns_flags();
+
+ STRV_FOREACH_PAIR(k, v, arg_sysctl) {
+ bool good = false;
+ size_t i;
+
+ for (i = 0; i < ELEMENTSOF(safe_sysctl); i++) {
+
+ if (!FLAGS_SET(flags, safe_sysctl[i].clone_flags))
+ continue;
+
+ if (safe_sysctl[i].prefix)
+ good = startswith(*k, safe_sysctl[i].key);
+ else
+ good = streq(*k, safe_sysctl[i].key);
+
+ if (good)
+ break;
+ }
+
+ if (!good) {
+ log_error("Refusing to write to sysctl '%s', as it is not safe in the selected namespaces.", *k);
+ return -EPERM;
+ }
+
+ r = sysctl_write(*k, *v);
+ if (r < 0)
+ return log_error_errno(r, "Failed to write sysctl '%s': %m", *k);
+ }
+
+ return 0;
+}
+
static int inner_child(
Barrier *barrier,
const char *directory,
};
const char *exec_target;
_cleanup_strv_free_ char **env_use = NULL;
- int r;
+ int r, which_failed;
/* This is the "inner" child process, i.e. the one forked off by the "outer" child process, which is the one
* the container manager itself forked off. At the time of clone() it gained its own CLONE_NEWNS, CLONE_NEWPID,
assert(directory);
assert(kmsg_socket >= 0);
+ log_debug("Inner child is initializing.");
+
if (arg_userns_mode != USER_NAMESPACE_NO) {
/* Tell the parent, that it now can write the UID map. */
(void) barrier_place(barrier); /* #1 */
return r;
kmsg_socket = safe_close(kmsg_socket);
+ r = mount_custom(
+ "/",
+ arg_custom_mounts,
+ arg_n_custom_mounts,
+ false,
+ 0,
+ 0,
+ arg_selinux_apifs_context,
+ true);
+ if (r < 0)
+ return r;
+
if (setsid() < 0)
return log_error_errno(errno, "setsid() failed: %m");
rtnl_socket = safe_close(rtnl_socket);
}
+ r = patch_sysctl();
+ if (r < 0)
+ return r;
+
if (arg_oom_score_adjust_set) {
r = set_oom_score_adjust(arg_oom_score_adjust);
if (r < 0)
if (sched_setaffinity(0, CPU_ALLOC_SIZE(arg_cpuset_ncpus), arg_cpuset) < 0)
return log_error_errno(errno, "Failed to set CPU affinity: %m");
- r = drop_capabilities();
- if (r < 0)
- return log_error_errno(r, "drop_capabilities() failed: %m");
-
(void) setup_hostname();
if (arg_personality != PERSONALITY_INVALID) {
return log_error_errno(r, "personality() failed: %m");
}
+ r = setrlimit_closest_all((const struct rlimit *const*) arg_rlimit, &which_failed);
+ if (r < 0)
+ return log_error_errno(r, "Failed to apply resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
+
+#if HAVE_SECCOMP
+ if (arg_seccomp) {
+
+ if (is_seccomp_available()) {
+
+ r = seccomp_load(arg_seccomp);
+ if (IN_SET(r, -EPERM, -EACCES))
+ return log_error_errno(r, "Failed to install seccomp filter: %m");
+ if (r < 0)
+ log_debug_errno(r, "Failed to install seccomp filter: %m");
+ }
+ } else
+#endif
+ {
+ r = setup_seccomp(arg_caps_retain, arg_syscall_whitelist, arg_syscall_blacklist);
+ if (r < 0)
+ return r;
+ }
+
#if HAVE_SELINUX
if (arg_selinux_context)
if (setexeccon(arg_selinux_context) < 0)
return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
#endif
- r = change_uid_gid(arg_user, &home);
+ /* Make sure we keep the caps across the uid/gid dropping, so that we can retain some selected caps
+ * if we need to later on. */
+ if (prctl(PR_SET_KEEPCAPS, 1) < 0)
+ return log_error_errno(errno, "Failed to set PR_SET_KEEPCAPS: %m");
+
+ if (uid_is_valid(arg_uid) || gid_is_valid(arg_gid))
+ r = change_uid_gid_raw(arg_uid, arg_gid, arg_supplementary_gids, arg_n_supplementary_gids);
+ else
+ r = change_uid_gid(arg_user, &home);
if (r < 0)
return r;
+ r = drop_capabilities(getuid());
+ if (r < 0)
+ return log_error_errno(r, "Dropping capabilities failed: %m");
+
if (arg_no_new_privileges)
if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
return log_error_errno(errno, "Failed to disable new privileges: %m");
if (envp[n_env])
n_env++;
- if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
- (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
- (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
- return log_oom();
+ if (home || !uid_is_valid(arg_uid) || arg_uid == 0)
+ if (asprintf((char**)(envp + n_env++), "HOME=%s", home ?: "/root") < 0)
+ return log_oom();
+
+ if (arg_user || !uid_is_valid(arg_uid) || arg_uid == 0)
+ if (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ?: "root") < 0 ||
+ asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)
+ return log_oom();
assert(!sd_id128_is_null(arg_uuid));
return r;
}
+ log_debug("Inner child completed, invoking payload.");
+
/* Now, explicitly close the log, so that we then can close all remaining fds. Closing the log explicitly first
* has the benefit that the logging subsystem knows about it, and is thus ready to be reopened should we need
* it again. Note that the other fds closed here are at least the locking and barrier fds. */
const char *directory,
const char *console,
DissectedImage *dissected_image,
- bool interactive,
bool secondary,
int pid_socket,
int uuid_socket,
int netns_fd) {
_cleanup_close_ int fd = -1;
- int r, which_failed;
pid_t pid;
ssize_t l;
+ int r;
/* This is the "outer" child process, i.e the one forked off by the container manager itself. It already has
* its own CLONE_NEWNS namespace (which was created by the clone()). It still lives in the host's CLONE_NEWPID,
assert(barrier);
assert(directory);
- assert(console);
assert(pid_socket >= 0);
assert(uuid_socket >= 0);
assert(notify_socket >= 0);
assert(kmsg_socket >= 0);
+ log_debug("Outer child is initializing.");
+
if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
- if (interactive) {
+ if (arg_console_mode != CONSOLE_PIPE) {
int terminal;
+ assert(console);
+
terminal = open_terminal(console, O_RDWR);
if (terminal < 0)
return log_error_errno(terminal, "Failed to open console: %m");
"Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
}
+ if (!dissected_image) {
+ /* Turn directory into bind mount */
+ r = mount_verbose(LOG_ERR, directory, directory, NULL, MS_BIND|MS_REC, NULL);
+ if (r < 0)
+ return r;
+ }
+
+ r = setup_pivot_root(
+ directory,
+ arg_pivot_root_new,
+ arg_pivot_root_old);
+ if (r < 0)
+ return r;
+
+ r = setup_volatile_mode(
+ directory,
+ arg_volatile_mode,
+ arg_userns_mode != USER_NAMESPACE_NO,
+ arg_uid_shift,
+ arg_uid_range,
+ arg_selinux_context);
+ if (r < 0)
+ return r;
+
if (dissected_image) {
/* Now we know the uid shift, let's now mount everything else that might be in the image. */
r = dissected_image_mount(dissected_image, directory, arg_uid_shift,
unified_cgroup_hierarchy_socket = safe_close(unified_cgroup_hierarchy_socket);
}
- /* Turn directory into bind mount */
- r = mount_verbose(LOG_ERR, directory, directory, NULL, MS_BIND|MS_REC, NULL);
- if (r < 0)
- return r;
-
- r = setup_pivot_root(
- directory,
- arg_pivot_root_new,
- arg_pivot_root_old);
- if (r < 0)
- return r;
-
- r = setup_volatile(
- directory,
- arg_volatile_mode,
- arg_userns_mode != USER_NAMESPACE_NO,
- arg_uid_shift,
- arg_uid_range,
- arg_selinux_context);
- if (r < 0)
- return r;
-
- r = setup_volatile_state(
- directory,
- arg_volatile_mode,
- arg_userns_mode != USER_NAMESPACE_NO,
- arg_uid_shift,
- arg_uid_range,
- arg_selinux_context);
- if (r < 0)
- return r;
-
/* Mark everything as shared so our mounts get propagated down. This is
* required to make new bind mounts available in systemd services
* inside the containter that create a new mount namespace.
if (r < 0)
return r;
- if (arg_read_only) {
+ if (arg_read_only && arg_volatile_mode == VOLATILE_NO) {
r = bind_remount_recursive(directory, true, NULL);
if (r < 0)
return log_error_errno(r, "Failed to make tree read-only: %m");
if (r < 0)
return r;
- dev_setup(directory, arg_uid_shift, arg_uid_shift);
+ r = make_extra_nodes(directory);
+ if (r < 0)
+ return r;
+
+ (void) dev_setup(directory, arg_uid_shift, arg_uid_shift);
+ (void) make_inaccessible_nodes(directory, arg_uid_shift, arg_uid_shift);
r = setup_pts(directory);
if (r < 0)
if (r < 0)
return r;
- r = setup_seccomp(arg_caps_retain, arg_syscall_whitelist, arg_syscall_blacklist);
- if (r < 0)
- return r;
-
r = setup_timezone(directory);
if (r < 0)
return r;
arg_userns_mode != USER_NAMESPACE_NO,
arg_uid_shift,
arg_uid_range,
- arg_selinux_apifs_context);
+ arg_selinux_apifs_context,
+ false);
if (r < 0)
return r;
if (fd < 0)
return fd;
- r = setrlimit_closest_all((const struct rlimit *const*) arg_rlimit, &which_failed);
- if (r < 0)
- return log_error_errno(r, "Failed to apply resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
-
pid = raw_clone(SIGCHLD|CLONE_NEWNS|
arg_clone_ns_flags |
(arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
if ((arg_settings_mask & SETTING_EPHEMERAL) == 0)
arg_ephemeral = settings->ephemeral;
+ if ((arg_settings_mask & SETTING_DIRECTORY) == 0 &&
+ settings->root) {
+
+ if (!arg_settings_trusted)
+ log_warning("Ignoring root directory setting, file %s is not trusted.", path);
+ else
+ free_and_replace(arg_directory, settings->root);
+ }
+
if ((arg_settings_mask & SETTING_PIVOT_ROOT) == 0 &&
settings->pivot_root_new) {
free_and_replace(arg_pivot_root_new, settings->pivot_root_new);
settings->environment)
strv_free_and_replace(arg_setenv, settings->environment);
- if ((arg_settings_mask & SETTING_USER) == 0 &&
- settings->user)
- free_and_replace(arg_user, settings->user);
+ if ((arg_settings_mask & SETTING_USER) == 0) {
+
+ if (settings->user)
+ free_and_replace(arg_user, settings->user);
+
+ if (uid_is_valid(settings->uid))
+ arg_uid = settings->uid;
+ if (gid_is_valid(settings->gid))
+ arg_gid = settings->gid;
+ if (settings->n_supplementary_gids > 0) {
+ free_and_replace(arg_supplementary_gids, settings->supplementary_gids);
+ arg_n_supplementary_gids = settings->n_supplementary_gids;
+ }
+ }
if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
- uint64_t plus;
+ uint64_t plus, minus;
+
+ /* Note that we copy both the simple plus/minus caps here, and the full quintet from the
+ * Settings structure */
plus = settings->capability;
- if (settings_private_network(settings))
- plus |= (1ULL << CAP_NET_ADMIN);
+ minus = settings->drop_capability;
+
+ if ((arg_settings_mask & SETTING_NETWORK) == 0) {
+ if (settings_private_network(settings))
+ plus |= UINT64_C(1) << CAP_NET_ADMIN;
+ else
+ minus |= UINT64_C(1) << CAP_NET_ADMIN;
+ }
if (!arg_settings_trusted && plus != 0) {
if (settings->capability != 0)
} else
arg_caps_retain |= plus;
- arg_caps_retain &= ~settings->drop_capability;
+ arg_caps_retain &= ~minus;
+
+ /* Copy the full capabilities over too */
+ if (capability_quintet_is_set(&settings->full_capabilities)) {
+ if (!arg_settings_trusted)
+ log_warning("Ignoring capabilitiy settings, file %s is not trusted.", path);
+ else
+ arg_full_capabilities = settings->full_capabilities;
+ }
}
if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
settings->network_interfaces ||
settings->network_macvlan ||
settings->network_ipvlan ||
- settings->network_veth_extra)) {
+ settings->network_veth_extra ||
+ settings->network_namespace_path)) {
if (!arg_settings_trusted)
log_warning("Ignoring network settings, file %s is not trusted.", path);
free_and_replace(arg_network_bridge, settings->network_bridge);
free_and_replace(arg_network_zone, settings->network_zone);
+
+ free_and_replace(arg_network_namespace_path, settings->network_namespace_path);
}
}
if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) {
- if (!arg_settings_trusted && !strv_isempty(arg_syscall_whitelist))
+ if (!arg_settings_trusted && !strv_isempty(settings->syscall_whitelist))
log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", path);
else {
strv_free_and_replace(arg_syscall_whitelist, settings->syscall_whitelist);
strv_free_and_replace(arg_syscall_blacklist, settings->syscall_blacklist);
}
+
+#if HAVE_SECCOMP
+ if (!arg_settings_trusted && settings->seccomp)
+ log_warning("Ignoring SECCOMP filter, file %s is not trusted.", path);
+ else {
+ seccomp_release(arg_seccomp);
+ arg_seccomp = TAKE_PTR(settings->seccomp);
+ }
+#endif
}
for (rl = 0; rl < _RLIMIT_MAX; rl ++) {
settings->timezone != _TIMEZONE_MODE_INVALID)
arg_timezone = settings->timezone;
+ if ((arg_settings_mask & SETTING_SLICE) == 0 &&
+ settings->slice) {
+
+ if (!arg_settings_trusted)
+ log_warning("Ignoring slice setting, file '%s' is not trusted.", path);
+ else
+ free_and_replace(arg_slice, settings->slice);
+ }
+
+ if ((arg_settings_mask & SETTING_USE_CGNS) == 0 &&
+ settings->use_cgns >= 0) {
+
+ if (!arg_settings_trusted)
+ log_warning("Ignoring cgroup namespace setting, file '%s' is not trusted.", path);
+ else
+ arg_use_cgns = settings->use_cgns;
+ }
+
+ if ((arg_settings_mask & SETTING_CLONE_NS_FLAGS) == 0 &&
+ settings->clone_ns_flags != (unsigned long) -1) {
+
+ if (!arg_settings_trusted)
+ log_warning("Ignoring namespace setting, file '%s' is not trusted.", path);
+ else
+ arg_clone_ns_flags = settings->clone_ns_flags;
+ }
+
+ if ((arg_settings_mask & SETTING_CONSOLE_MODE) == 0 &&
+ settings->console_mode >= 0) {
+
+ if (!arg_settings_trusted)
+ log_warning("Ignoring console mode setting, file '%s' is not trusted.", path);
+ else
+ arg_console_mode = settings->console_mode;
+ }
+
+ /* The following properties can only be set through the OCI settings logic, not from the command line, hence we
+ * don't consult arg_settings_mask for them. */
+
+ sd_bus_message_unref(arg_property_message);
+ arg_property_message = TAKE_PTR(settings->properties);
+
+ arg_console_width = settings->console_width;
+ arg_console_height = settings->console_height;
+
+ device_node_free_many(arg_extra_nodes, arg_n_extra_nodes);
+ arg_extra_nodes = TAKE_PTR(settings->extra_nodes);
+ arg_n_extra_nodes = settings->n_extra_nodes;
+
return 0;
}
const char *fn, *i;
int r;
+ if (arg_oci_bundle)
+ return 0;
+
/* If all settings are masked, there's no point in looking for
* the settings file */
if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
return merge_settings(settings, p);
}
+static int load_oci_bundle(void) {
+ _cleanup_(settings_freep) Settings *settings = NULL;
+ int r;
+
+ if (!arg_oci_bundle)
+ return 0;
+
+ /* By default let's trust OCI bundles */
+ if (arg_settings_trusted < 0)
+ arg_settings_trusted = true;
+
+ r = oci_load(NULL, arg_oci_bundle, &settings);
+ if (r < 0)
+ return r;
+
+ return merge_settings(settings, arg_oci_bundle);
+}
+
static int run(int master,
const char* console,
DissectedImage *dissected_image,
- bool interactive,
bool secondary,
FDSet *fds,
char veth_name[IFNAMSIZ], bool *veth_created,
_cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
_cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
ContainerStatus container_status = 0;
- char last_char = 0;
int ifi = 0, r;
ssize_t l;
sigset_t mask_chld;
arg_directory,
console,
dissected_image,
- interactive,
secondary,
pid_socket_pair[1],
uuid_socket_pair[1],
arg_custom_mounts, arg_n_custom_mounts,
arg_kill_signal,
arg_property,
+ arg_property_message,
arg_keep_unit,
arg_container_service_name);
if (r < 0)
arg_slice,
arg_custom_mounts, arg_n_custom_mounts,
arg_kill_signal,
- arg_property);
+ arg_property,
+ arg_property_message);
if (r < 0)
return r;
} else if (arg_slice || arg_property)
log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect.");
- r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
+ r = create_subcgroup(*pid, arg_keep_unit, arg_unified_cgroup_hierarchy);
if (r < 0)
return r;
- r = create_subcgroup(*pid, arg_keep_unit, arg_unified_cgroup_hierarchy);
+ r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
if (r < 0)
return r;
rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
- r = pty_forward_new(event, master,
- PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY),
- &forward);
- if (r < 0)
- return log_error_errno(r, "Failed to create PTY forwarder: %m");
+ if (IN_SET(arg_console_mode, CONSOLE_INTERACTIVE, CONSOLE_READ_ONLY)) {
+ assert(master >= 0);
+
+ r = pty_forward_new(event, master,
+ PTY_FORWARD_IGNORE_VHANGUP | (arg_console_mode == CONSOLE_READ_ONLY ? PTY_FORWARD_READ_ONLY : 0),
+ &forward);
+ if (r < 0)
+ return log_error_errno(r, "Failed to create PTY forwarder: %m");
+
+ if (arg_console_width != (unsigned) -1 || arg_console_height != (unsigned) -1)
+ (void) pty_forward_set_width_height(forward, arg_console_width, arg_console_height);
+ }
r = sd_event_loop(event);
if (r < 0)
return log_error_errno(r, "Failed to run event loop: %m");
- pty_forward_get_last_char(forward, &last_char);
+ if (forward) {
+ char last_char = 0;
- forward = pty_forward_free(forward);
+ (void) pty_forward_get_last_char(forward, &last_char);
+ forward = pty_forward_free(forward);
- if (!arg_quiet && last_char != '\n')
- putc('\n', stdout);
+ if (!arg_quiet && last_char != '\n')
+ putc('\n', stdout);
+ }
/* Kill if it is not dead yet anyway */
if (bus) {
pid_t pid = 0;
union in_addr_union exposed = {};
_cleanup_(release_lock_file) LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
- bool interactive, veth_created = false, remove_tmprootdir = false;
+ bool veth_created = false, remove_tmprootdir = false;
char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
_cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
_cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
if (r < 0)
goto finish;
+ r = load_oci_bundle();
+ if (r < 0)
+ goto finish;
+
r = determine_names();
if (r < 0)
goto finish;
if (r < 0)
goto finish;
- parse_environment();
-
r = cg_unified_flush();
if (r < 0) {
log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
goto finish;
}
- r = copy_file(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600, FS_NOCOW_FL, COPY_REFLINK);
+ r = copy_file(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600, FS_NOCOW_FL, COPY_REFLINK|COPY_CRTIME);
if (r < 0) {
r = log_error_errno(r, "Failed to copy image file: %m");
goto finish;
if (r < 0)
goto finish;
- interactive =
- isatty(STDIN_FILENO) > 0 &&
- isatty(STDOUT_FILENO) > 0;
+ if (arg_console_mode < 0)
+ arg_console_mode =
+ isatty(STDIN_FILENO) > 0 &&
+ isatty(STDOUT_FILENO) > 0 ? CONSOLE_INTERACTIVE : CONSOLE_READ_ONLY;
- master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NONBLOCK);
- if (master < 0) {
- r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
- goto finish;
- }
+ if (arg_console_mode == CONSOLE_PIPE) /* if we pass STDERR on to the container, don't add our own logs into it too */
+ arg_quiet = true;
- r = ptsname_malloc(master, &console);
- if (r < 0) {
- r = log_error_errno(r, "Failed to determine tty name: %m");
- goto finish;
- }
+ if (arg_console_mode != CONSOLE_PIPE) {
+ master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NONBLOCK);
+ if (master < 0) {
+ r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
+ goto finish;
+ }
- if (arg_selinux_apifs_context) {
- r = mac_selinux_apply(console, arg_selinux_apifs_context);
- if (r < 0)
+ r = ptsname_malloc(master, &console);
+ if (r < 0) {
+ r = log_error_errno(r, "Failed to determine tty name: %m");
goto finish;
- }
+ }
- if (unlockpt(master) < 0) {
- r = log_error_errno(errno, "Failed to unlock tty: %m");
- goto finish;
+ if (arg_selinux_apifs_context) {
+ r = mac_selinux_apply(console, arg_selinux_apifs_context);
+ if (r < 0)
+ goto finish;
+ }
+
+ if (unlockpt(master) < 0) {
+ r = log_error_errno(errno, "Failed to unlock tty: %m");
+ goto finish;
+ }
}
if (!arg_quiet)
r = run(master,
console,
dissected_image,
- interactive, secondary,
+ secondary,
fds,
veth_name, &veth_created,
&exposed,
free(arg_machine);
free(arg_hostname);
free(arg_user);
+ free(arg_supplementary_gids);
free(arg_pivot_root_new);
free(arg_pivot_root_old);
free(arg_chdir);
free(arg_network_zone);
free(arg_network_namespace_path);
strv_free(arg_property);
+ sd_bus_message_unref(arg_property_message);
custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
expose_port_free_all(arg_expose_ports);
free(arg_root_hash);
rlimit_free_all(arg_rlimit);
strv_free(arg_syscall_whitelist);
strv_free(arg_syscall_blacklist);
+#if HAVE_SECCOMP
+ seccomp_release(arg_seccomp);
+#endif
arg_cpuset = cpu_set_mfree(arg_cpuset);
+ free(arg_oci_bundle);
+ device_node_free_many(arg_extra_nodes, arg_n_extra_nodes);
+ strv_free(arg_sysctl);
+ free(arg_slice);
return r < 0 ? EXIT_FAILURE : ret;
}