#include "loopback-setup.h"
#include "machine-image.h"
#include "macro.h"
+#include "main-func.h"
#include "missing.h"
#include "mkdir.h"
#include "mount-util.h"
#include "mountpoint-util.h"
+#include "namespace-util.h"
#include "netlink-util.h"
#include "nspawn-cgroup.h"
#include "nspawn-def.h"
#include "nspawn-expose-ports.h"
#include "nspawn-mount.h"
#include "nspawn-network.h"
+#include "nspawn-oci.h"
#include "nspawn-patch-uid.h"
#include "nspawn-register.h"
#include "nspawn-seccomp.h"
#include "nspawn-settings.h"
#include "nspawn-setuid.h"
#include "nspawn-stub-pid1.h"
+#include "nulstr-util.h"
#include "os-util.h"
#include "pager.h"
#include "parse-util.h"
#include "raw-clone.h"
#include "rlimit-util.h"
#include "rm-rf.h"
+#if HAVE_SECCOMP
+#include "seccomp-util.h"
+#endif
#include "selinux-util.h"
#include "signal-util.h"
#include "socket-util.h"
#include "string-table.h"
#include "string-util.h"
#include "strv.h"
+#include "sysctl-util.h"
#include "terminal-util.h"
#include "tmpfile-util.h"
#include "umask-util.h"
typedef enum ContainerStatus {
CONTAINER_TERMINATED,
- CONTAINER_REBOOTED
+ CONTAINER_REBOOTED,
} ContainerStatus;
static char *arg_directory = NULL;
static char *arg_pivot_root_new = NULL;
static char *arg_pivot_root_old = NULL;
static char *arg_user = NULL;
+static uid_t arg_uid = UID_INVALID;
+static gid_t arg_gid = GID_INVALID;
+static gid_t* arg_supplementary_gids = NULL;
+static size_t arg_n_supplementary_gids = 0;
static sd_id128_t arg_uuid = {};
static char *arg_machine = NULL; /* The name used by the host to refer to this */
static char *arg_hostname = NULL; /* The name the payload sees by default */
static const char *arg_selinux_context = NULL;
static const char *arg_selinux_apifs_context = NULL;
-static const char *arg_slice = NULL;
+static char *arg_slice = NULL;
static bool arg_private_network = false;
static bool arg_read_only = false;
static StartMode arg_start_mode = START_PID1;
(1ULL << CAP_SYS_PTRACE) |
(1ULL << CAP_SYS_RESOURCE) |
(1ULL << CAP_SYS_TTY_CONFIG);
+static CapabilityQuintet arg_full_capabilities = CAPABILITY_QUINTET_NULL;
static CustomMount *arg_custom_mounts = NULL;
static size_t arg_n_custom_mounts = 0;
static char **arg_setenv = NULL;
static char *arg_network_bridge = NULL;
static char *arg_network_zone = NULL;
static char *arg_network_namespace_path = NULL;
+static PagerFlags arg_pager_flags = 0;
static unsigned long arg_personality = PERSONALITY_INVALID;
static char *arg_image = NULL;
+static char *arg_oci_bundle = NULL;
static VolatileMode arg_volatile_mode = VOLATILE_NO;
static ExposePort *arg_expose_ports = NULL;
static char **arg_property = NULL;
+static sd_bus_message *arg_property_message = NULL;
static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
static bool arg_userns_chown = false;
static size_t arg_root_hash_size = 0;
static char **arg_syscall_whitelist = NULL;
static char **arg_syscall_blacklist = NULL;
+#if HAVE_SECCOMP
+static scmp_filter_ctx arg_seccomp = NULL;
+#endif
static struct rlimit *arg_rlimit[_RLIMIT_MAX] = {};
static bool arg_no_new_privileges = false;
static int arg_oom_score_adjust = 0;
static bool arg_oom_score_adjust_set = false;
-static cpu_set_t *arg_cpuset = NULL;
-static unsigned arg_cpuset_ncpus = 0;
+static CPUSet arg_cpu_set = {};
static ResolvConfMode arg_resolv_conf = RESOLV_CONF_AUTO;
static TimezoneMode arg_timezone = TIMEZONE_AUTO;
+static unsigned arg_console_width = (unsigned) -1, arg_console_height = (unsigned) -1;
+static DeviceNode* arg_extra_nodes = NULL;
+static size_t arg_n_extra_nodes = 0;
+static char **arg_sysctl = NULL;
+static ConsoleMode arg_console_mode = _CONSOLE_MODE_INVALID;
+
+STATIC_DESTRUCTOR_REGISTER(arg_directory, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_template, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_chdir, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_new, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_old, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_user, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_supplementary_gids, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_machine, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_hostname, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_slice, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_setenv, strv_freep);
+STATIC_DESTRUCTOR_REGISTER(arg_network_interfaces, strv_freep);
+STATIC_DESTRUCTOR_REGISTER(arg_network_macvlan, strv_freep);
+STATIC_DESTRUCTOR_REGISTER(arg_network_ipvlan, strv_freep);
+STATIC_DESTRUCTOR_REGISTER(arg_network_veth_extra, strv_freep);
+STATIC_DESTRUCTOR_REGISTER(arg_network_bridge, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_network_zone, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_network_namespace_path, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_image, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_oci_bundle, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_property, strv_freep);
+STATIC_DESTRUCTOR_REGISTER(arg_property_message, sd_bus_message_unrefp);
+STATIC_DESTRUCTOR_REGISTER(arg_parameters, strv_freep);
+STATIC_DESTRUCTOR_REGISTER(arg_root_hash, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_syscall_whitelist, strv_freep);
+STATIC_DESTRUCTOR_REGISTER(arg_syscall_blacklist, strv_freep);
+#if HAVE_SECCOMP
+STATIC_DESTRUCTOR_REGISTER(arg_seccomp, seccomp_releasep);
+#endif
+STATIC_DESTRUCTOR_REGISTER(arg_cpu_set, cpu_set_reset);
+STATIC_DESTRUCTOR_REGISTER(arg_sysctl, strv_freep);
static int help(void) {
_cleanup_free_ char *link = NULL;
int r;
- (void) pager_open(false);
+ (void) pager_open(arg_pager_flags);
r = terminal_urlify_man("systemd-nspawn", "1", &link);
if (r < 0)
return log_oom();
- printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
+ printf("%1$s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
"Spawn a command or OS in a light-weight container.\n\n"
" -h --help Show this help\n"
" --version Print version string\n"
" -q --quiet Do not show status information\n"
+ " --no-pager Do not pipe output into a pager\n"
+ " --settings=BOOLEAN Load additional settings from .nspawn file\n\n"
+ "%3$sImage:%4$s\n"
" -D --directory=PATH Root directory for the container\n"
" --template=PATH Initialize root directory from template directory,\n"
" if missing\n"
" -x --ephemeral Run container with snapshot of root directory, and\n"
" remove it after exit\n"
- " -i --image=PATH File system device or disk image for the container\n"
- " --root-hash=HASH Specify verity root hash\n"
+ " -i --image=PATH Root file system disk image (or device node) for\n"
+ " the container\n"
+ " --oci-bundle=PATH OCI bundle directory\n"
+ " --read-only Mount the root directory read-only\n"
+ " --volatile[=MODE] Run the system in volatile mode\n"
+ " --root-hash=HASH Specify verity root hash for root disk image\n"
+ " --pivot-root=PATH[:PATH]\n"
+ " Pivot root to given directory in the container\n\n"
+ "%3$sExecution:%4$s\n"
" -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
" -b --boot Boot up full system (i.e. invoke init)\n"
" --chdir=PATH Set working directory in the container\n"
- " --pivot-root=PATH[:PATH]\n"
- " Pivot root to given directory in the container\n"
- " -u --user=USER Run the command under specified user or uid\n"
+ " -E --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
+ " -u --user=USER Run the command under specified user or UID\n"
+ " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
+ " --notify-ready=BOOLEAN Receive notifications from the child init process\n\n"
+ "%3$sSystem Identity:%4$s\n"
" -M --machine=NAME Set the machine name for the container\n"
" --hostname=NAME Override the hostname for the container\n"
- " --uuid=UUID Set a specific machine UUID for the container\n"
+ " --uuid=UUID Set a specific machine UUID for the container\n\n"
+ "%3$sProperties:%4$s\n"
" -S --slice=SLICE Place the container in the specified slice\n"
" --property=NAME=VALUE Set scope unit property\n"
+ " --register=BOOLEAN Register container as machine\n"
+ " --keep-unit Do not register a scope for the machine, reuse\n"
+ " the service unit nspawn is running in\n\n"
+ "%3$sUser Namespacing:%4$s\n"
" -U --private-users=pick Run within user namespace, autoselect UID/GID range\n"
" --private-users[=UIDBASE[:NUIDS]]\n"
" Similar, but with user configured UID/GID range\n"
- " --private-users-chown Adjust OS tree ownership to private UID/GID range\n"
+ " --private-users-chown Adjust OS tree ownership to private UID/GID range\n\n"
+ "%3$sNetworking:%4$s\n"
" --private-network Disable network in container\n"
" --network-interface=INTERFACE\n"
" Assign an existing network interface to the\n"
" Set network namespace to the one represented by\n"
" the specified kernel namespace file node\n"
" -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
- " Expose a container IP port on the host\n"
- " -Z --selinux-context=SECLABEL\n"
- " Set the SELinux security context to be used by\n"
- " processes in the container\n"
- " -L --selinux-apifs-context=SECLABEL\n"
- " Set the SELinux security context to be used by\n"
- " API/tmpfs file systems in the container\n"
+ " Expose a container IP port on the host\n\n"
+ "%3$sSecurity:%4$s\n"
" --capability=CAP In addition to the default, retain specified\n"
" capability\n"
" --drop-capability=CAP Drop the specified capability from the default set\n"
+ " --no-new-privileges Set PR_SET_NO_NEW_PRIVS flag for container payload\n"
" --system-call-filter=LIST|~LIST\n"
" Permit/prohibit specific system calls\n"
+ " -Z --selinux-context=SECLABEL\n"
+ " Set the SELinux security context to be used by\n"
+ " processes in the container\n"
+ " -L --selinux-apifs-context=SECLABEL\n"
+ " Set the SELinux security context to be used by\n"
+ " API/tmpfs file systems in the container\n\n"
+ "%3$sResources:%4$s\n"
" --rlimit=NAME=LIMIT Set a resource limit for the payload\n"
" --oom-score-adjust=VALUE\n"
" Adjust the OOM score value for the payload\n"
" --cpu-affinity=CPUS Adjust the CPU affinity of the container\n"
- " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
- " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
- " host, try-guest, try-host\n"
- " -j Equivalent to --link-journal=try-guest\n"
+ " --personality=ARCH Pick personality for this container\n\n"
+ "%3$sIntegration:%4$s\n"
" --resolv-conf=MODE Select mode of /etc/resolv.conf initialization\n"
" --timezone=MODE Select mode of /etc/localtime initialization\n"
- " --read-only Mount the root directory read-only\n"
+ " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
+ " host, try-guest, try-host\n"
+ " -j Equivalent to --link-journal=try-guest\n\n"
+ "%3$sMounts:%4$s\n"
" --bind=PATH[:PATH[:OPTIONS]]\n"
" Bind mount a file or directory from the host into\n"
" the container\n"
" --bind-ro=PATH[:PATH[:OPTIONS]\n"
" Similar, but creates a read-only bind mount\n"
+ " --inaccessible=PATH Over-mount file node with inaccessible node to mask\n"
+ " it\n"
" --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
" --overlay=PATH[:PATH...]:PATH\n"
" Create an overlay mount from the host to \n"
" the container\n"
" --overlay-ro=PATH[:PATH...]:PATH\n"
- " Similar, but creates a read-only overlay mount\n"
- " -E --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
- " --register=BOOLEAN Register container as machine\n"
- " --keep-unit Do not register a scope for the machine, reuse\n"
- " the service unit nspawn is running in\n"
- " --volatile[=MODE] Run the system in volatile mode\n"
- " --settings=BOOLEAN Load additional settings from .nspawn file\n"
- " --notify-ready=BOOLEAN Receive notifications from the child init process\n"
- "\nSee the %s for details.\n"
+ " Similar, but creates a read-only overlay mount\n\n"
+ "%3$sInput/Output:%4$s\n"
+ " --console=MODE Select how stdin/stdout/stderr and /dev/console are\n"
+ " set up for the container.\n"
+ " -P --pipe Equivalent to --console=pipe\n"
+ "\nSee the %2$s for details.\n"
, program_invocation_short_name
, link
- );
+ , ansi_underline(), ansi_normal());
return 0;
}
return;
if (r < 0)
log_warning_errno(r, "Failed to parse %s from environment, defaulting to false.", name);
+
arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
+ arg_settings_mask |= SETTING_CLONE_NS_FLAGS;
}
static void parse_mount_settings_env(void) {
/* SYSTEMD_NSPAWN_USE_CGNS=0 can be used to disable CLONE_NEWCGROUP use,
* even if it is supported. If not supported, it has no effect. */
- r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
- if (r == 0 || !cg_ns_supported())
+ if (!cg_ns_supported())
arg_use_cgns = false;
+ else {
+ r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
+ if (r < 0) {
+ if (r != -ENXIO)
+ log_warning_errno(r, "Failed to parse $SYSTEMD_NSPAWN_USE_CGNS, ignoring: %m");
+
+ arg_use_cgns = true;
+ } else {
+ arg_use_cgns = r > 0;
+ arg_settings_mask |= SETTING_USE_CGNS;
+ }
+ }
e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
if (e)
ARG_TMPFS,
ARG_OVERLAY,
ARG_OVERLAY_RO,
+ ARG_INACCESSIBLE,
ARG_SHARE_SYSTEM,
ARG_REGISTER,
ARG_KEEP_UNIT,
ARG_CPU_AFFINITY,
ARG_RESOLV_CONF,
ARG_TIMEZONE,
+ ARG_CONSOLE,
+ ARG_PIPE,
+ ARG_OCI_BUNDLE,
+ ARG_NO_PAGER,
};
static const struct option options[] = {
{ "tmpfs", required_argument, NULL, ARG_TMPFS },
{ "overlay", required_argument, NULL, ARG_OVERLAY },
{ "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
+ { "inaccessible", required_argument, NULL, ARG_INACCESSIBLE },
{ "machine", required_argument, NULL, 'M' },
{ "hostname", required_argument, NULL, ARG_HOSTNAME },
{ "slice", required_argument, NULL, 'S' },
{ "cpu-affinity", required_argument, NULL, ARG_CPU_AFFINITY },
{ "resolv-conf", required_argument, NULL, ARG_RESOLV_CONF },
{ "timezone", required_argument, NULL, ARG_TIMEZONE },
+ { "console", required_argument, NULL, ARG_CONSOLE },
+ { "pipe", no_argument, NULL, ARG_PIPE },
+ { "oci-bundle", required_argument, NULL, ARG_OCI_BUNDLE },
+ { "no-pager", no_argument, NULL, ARG_NO_PAGER },
{}
};
assert(argc >= 0);
assert(argv);
- while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nUE:", options, NULL)) >= 0)
+ while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nUE:P", options, NULL)) >= 0)
switch (c) {
case 'h':
r = parse_path_argument_and_warn(optarg, false, &arg_directory);
if (r < 0)
return r;
+
+ arg_settings_mask |= SETTING_DIRECTORY;
break;
case ARG_TEMPLATE:
r = parse_path_argument_and_warn(optarg, false, &arg_template);
if (r < 0)
return r;
+
+ arg_settings_mask |= SETTING_DIRECTORY;
break;
case 'i':
r = parse_path_argument_and_warn(optarg, false, &arg_image);
if (r < 0)
return r;
+
+ arg_settings_mask |= SETTING_DIRECTORY;
+ break;
+
+ case ARG_OCI_BUNDLE:
+ r = parse_path_argument_and_warn(optarg, false, &arg_oci_bundle);
+ if (r < 0)
+ return r;
+
break;
case 'x':
if (r < 0)
return r;
+ arg_settings_mask |= SETTING_NETWORK;
break;
case 'b':
break;
case 'S':
- arg_slice = optarg;
+ r = free_and_strdup(&arg_slice, optarg);
+ if (r < 0)
+ return log_oom();
+
+ arg_settings_mask |= SETTING_SLICE;
break;
case 'M':
r = extract_first_word(&p, &t, ",", 0);
if (r < 0)
return log_error_errno(r, "Failed to parse capability %s.", t);
-
if (r == 0)
break;
case ARG_LINK_JOURNAL:
r = parse_link_journal(optarg, &arg_link_journal, &arg_link_journal_try);
- if (r < 0) {
- log_error_errno(r, "Failed to parse link journal mode %s", optarg);
- return -EINVAL;
- }
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse link journal mode %s", optarg);
arg_settings_mask |= SETTING_LINK_JOURNAL;
break;
arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
break;
+ case ARG_INACCESSIBLE:
+ r = inaccessible_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse --inaccessible= argument %s: %m", optarg);
+
+ arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
+ break;
+
case 'E': {
char **n;
if (r < 0)
return log_error_errno(r, "Failed to parse root hash: %s", optarg);
if (l < sizeof(sd_id128_t)) {
- log_error("Root hash must be at least 128bit long: %s", optarg);
free(k);
- return -EINVAL;
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Root hash must be at least 128bit long: %s", optarg);
}
free(arg_root_hash);
break;
case ARG_CPU_AFFINITY: {
- _cleanup_cpu_free_ cpu_set_t *cpuset = NULL;
+ CPUSet cpuset;
r = parse_cpu_set(optarg, &cpuset);
if (r < 0)
- return log_error_errno(r, "Failed to parse CPU affinity mask: %s", optarg);
-
- if (arg_cpuset)
- CPU_FREE(arg_cpuset);
+ return log_error_errno(r, "Failed to parse CPU affinity mask %s: %m", optarg);
- arg_cpuset = TAKE_PTR(cpuset);
- arg_cpuset_ncpus = r;
+ cpu_set_reset(&arg_cpu_set);
+ arg_cpu_set = cpuset;
arg_settings_mask |= SETTING_CPU_AFFINITY;
break;
}
arg_settings_mask |= SETTING_TIMEZONE;
break;
+ case ARG_CONSOLE:
+ if (streq(optarg, "interactive"))
+ arg_console_mode = CONSOLE_INTERACTIVE;
+ else if (streq(optarg, "read-only"))
+ arg_console_mode = CONSOLE_READ_ONLY;
+ else if (streq(optarg, "passive"))
+ arg_console_mode = CONSOLE_PASSIVE;
+ else if (streq(optarg, "pipe"))
+ arg_console_mode = CONSOLE_PIPE;
+ else if (streq(optarg, "help"))
+ puts("interactive\n"
+ "read-only\n"
+ "passive\n"
+ "pipe");
+ else
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown console mode: %s", optarg);
+
+ arg_settings_mask |= SETTING_CONSOLE_MODE;
+ break;
+
+ case 'P':
+ case ARG_PIPE:
+ arg_console_mode = CONSOLE_PIPE;
+ arg_settings_mask |= SETTING_CONSOLE_MODE;
+ break;
+
+ case ARG_NO_PAGER:
+ arg_pager_flags |= PAGER_DISABLE;
+ break;
+
case '?':
return -EINVAL;
* --directory=". */
arg_directory = TAKE_PTR(arg_template);
- arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
+ arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? UINT64_C(1) << CAP_NET_ADMIN : 0)) & ~minus;
+
+ /* Make sure to parse environment before we reset the settings mask below */
+ parse_environment();
/* Load all settings from .nspawn files */
if (mask_no_settings)
return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--private-users= is not supported, kernel compiled without user namespace support.");
if (arg_userns_chown && arg_read_only)
- return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--read-only and --private-users-chown may not be combined.");
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+ "--read-only and --private-users-chown may not be combined.");
/* We don't support --private-users-chown together with any of the volatile modes since we couldn't
* change the read-only part of the tree (i.e. /usr) anyway, or because it would trigger a massive
- * copy-up (in case of overlay) making the entire excercise pointless. */
+ * copy-up (in case of overlay) making the entire exercise pointless. */
if (arg_userns_chown && arg_volatile_mode != VOLATILE_NO)
return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--volatile= and --private-users-chown may not be combined.");
- /* If --network-namespace-path is given with any other network-related option,
- * we need to error out, to avoid conflicts between different network options. */
+ /* If --network-namespace-path is given with any other network-related option, we need to error out,
+ * to avoid conflicts between different network options. */
if (arg_network_namespace_path &&
(arg_network_interfaces || arg_network_macvlan ||
arg_network_ipvlan || arg_network_veth_extra ||
arg_network_bridge || arg_network_zone ||
arg_network_veth || arg_private_network))
- return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--network-namespace-path cannot be combined with other network options.");
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--network-namespace-path= cannot be combined with other network options.");
if (arg_network_bridge && arg_network_zone)
- return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--network-bridge= and --network-zone= may not be combined.");
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+ "--network-bridge= and --network-zone= may not be combined.");
if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network)
return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
case TIMEZONE_COPY:
/* If mounting failed, try to copy */
- r = copy_file_atomic("/etc/localtime", where, 0644, 0, COPY_REFLINK|COPY_REPLACE);
+ r = copy_file_atomic("/etc/localtime", where, 0644, 0, 0, COPY_REFLINK|COPY_REPLACE);
if (r < 0) {
log_full_errno(IN_SET(r, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
"Failed to copy /etc/localtime to %s, ignoring: %m", where);
}
/* If that didn't work, let's copy the file */
- r = copy_file(what, where, O_TRUNC|O_NOFOLLOW, 0644, 0, COPY_REFLINK);
+ r = copy_file(what, where, O_TRUNC|O_NOFOLLOW, 0644, 0, 0, COPY_REFLINK);
if (r < 0) {
/* If the file already exists as symlink, let's suppress the warning, under the assumption that
* resolved or something similar runs inside and the symlink points there.
const char *to;
int r;
- /* Generate a new randomized boot ID, so that each boot-up of
- * the container gets a new one */
+ /* Generate a new randomized boot ID, so that each boot-up of the container gets a new one */
- r = tempfn_random_child(NULL, "proc-sys-kernel-random-boot-id", &path);
+ r = tempfn_random_child("/run", "proc-sys-kernel-random-boot-id", &path);
if (r < 0)
return log_error_errno(r, "Failed to generate random boot ID path: %m");
"tty\0"
"net/tun\0";
+ _cleanup_umask_ mode_t u;
const char *d;
int r = 0;
- _cleanup_umask_ mode_t u;
assert(dest);
return r;
}
+static int make_extra_nodes(const char *dest) {
+ _cleanup_umask_ mode_t u;
+ size_t i;
+ int r;
+
+ u = umask(0000);
+
+ for (i = 0; i < arg_n_extra_nodes; i++) {
+ _cleanup_free_ char *path = NULL;
+ DeviceNode *n = arg_extra_nodes + i;
+
+ path = prefix_root(dest, n->path);
+ if (!path)
+ return log_oom();
+
+ if (mknod(path, n->mode, S_ISCHR(n->mode) || S_ISBLK(n->mode) ? makedev(n->major, n->minor) : 0) < 0)
+ return log_error_errno(errno, "Failed to create device node '%s': %m", path);
+
+ r = chmod_and_chown(path, n->mode, n->uid, n->gid);
+ if (r < 0)
+ return log_error_errno(r, "Failed to adjust device node ownership of '%s': %m", path);
+ }
+
+ return 0;
+}
+
static int setup_pts(const char *dest) {
_cleanup_free_ char *options = NULL;
const char *p;
int r;
assert(dest);
- assert(console);
u = umask(0000);
+ if (!console)
+ return 0;
+
r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
if (r < 0)
return log_error_errno(r, "Failed to correct access mode for TTY: %m");
_cleanup_free_ char *fifo = NULL;
_cleanup_close_ int fd = -1;
_cleanup_umask_ mode_t u;
- const char *to;
int r;
assert(kmsg_socket >= 0);
u = umask(0000);
- /* We create the kmsg FIFO as as temporary file in /tmp, but immediately delete it after bind mounting it to
+ /* We create the kmsg FIFO as as temporary file in /run, but immediately delete it after bind mounting it to
* /proc/kmsg. While FIFOs on the reading side behave very similar to /proc/kmsg, their writing side behaves
* differently from /dev/kmsg in that writing blocks when nothing is reading. In order to avoid any problems
* with containers deadlocking due to this we simply make /dev/kmsg unavailable to the container. */
- r = tempfn_random_child(NULL, "proc-kmsg", &fifo);
+ r = tempfn_random_child("/run", "proc-kmsg", &fifo);
if (r < 0)
return log_error_errno(r, "Failed to generate kmsg path: %m");
return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
from = TAKE_PTR(fifo);
- to = "/proc/kmsg";
- r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
+ r = mount_verbose(LOG_ERR, from, "/proc/kmsg", NULL, MS_BIND, NULL);
if (r < 0)
return r;
return 0;
}
-static int drop_capabilities(void) {
- return capability_bounding_set_drop(arg_caps_retain, false);
+static int drop_capabilities(uid_t uid) {
+ CapabilityQuintet q;
+
+ /* Let's initialize all five capability sets to something valid. If the quintet was configured via
+ * OCI use that, but fill in missing bits. If it wasn't then derive the quintet in full from
+ * arg_caps_retain. */
+
+ if (capability_quintet_is_set(&arg_full_capabilities)) {
+ q = arg_full_capabilities;
+
+ if (q.bounding == (uint64_t) -1)
+ q.bounding = uid == 0 ? arg_caps_retain : 0;
+
+ if (q.effective == (uint64_t) -1)
+ q.effective = uid == 0 ? q.bounding : 0;
+
+ if (q.inheritable == (uint64_t) -1)
+ q.inheritable = uid == 0 ? q.bounding : 0;
+
+ if (q.permitted == (uint64_t) -1)
+ q.permitted = uid == 0 ? q.bounding : 0;
+
+ if (q.ambient == (uint64_t) -1 && ambient_capabilities_supported())
+ q.ambient = 0;
+ } else
+ q = (CapabilityQuintet) {
+ .bounding = arg_caps_retain,
+ .effective = uid == 0 ? arg_caps_retain : 0,
+ .inheritable = uid == 0 ? arg_caps_retain : 0,
+ .permitted = uid == 0 ? arg_caps_retain : 0,
+ .ambient = ambient_capabilities_supported() ? 0 : (uint64_t) -1,
+ };
+
+ return capability_quintet_enforce(&q);
}
static int reset_audit_loginuid(void) {
return log_error_errno(r, "Failed to determine current directory: %m");
}
- if (!arg_directory && !arg_image) {
- log_error("Failed to determine path, please use -D or -i.");
- return -EINVAL;
- }
+ if (!arg_directory && !arg_image)
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine path, please use -D or -i.");
}
if (!arg_machine) {
return log_oom();
hostname_cleanup(arg_machine);
- if (!machine_name_is_valid(arg_machine)) {
- log_error("Failed to determine machine name automatically, please use -M.");
- return -EINVAL;
- }
+ if (!machine_name_is_valid(arg_machine))
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine machine name automatically, please use -M.");
if (arg_ephemeral) {
char *b;
return 0;
}
+static unsigned long effective_clone_ns_flags(void) {
+ unsigned long flags = arg_clone_ns_flags;
+
+ if (arg_private_network)
+ flags |= CLONE_NEWNET;
+ if (arg_use_cgns)
+ flags |= CLONE_NEWCGROUP;
+ if (arg_userns_mode != USER_NAMESPACE_NO)
+ flags |= CLONE_NEWUSER;
+
+ return flags;
+}
+
+static int patch_sysctl(void) {
+
+ /* This table is inspired by runc's sysctl() function */
+ static const struct {
+ const char *key;
+ bool prefix;
+ unsigned long clone_flags;
+ } safe_sysctl[] = {
+ { "kernel.hostname", false, CLONE_NEWUTS },
+ { "kernel.domainname", false, CLONE_NEWUTS },
+ { "kernel.msgmax", false, CLONE_NEWIPC },
+ { "kernel.msgmnb", false, CLONE_NEWIPC },
+ { "kernel.msgmni", false, CLONE_NEWIPC },
+ { "kernel.sem", false, CLONE_NEWIPC },
+ { "kernel.shmall", false, CLONE_NEWIPC },
+ { "kernel.shmmax", false, CLONE_NEWIPC },
+ { "kernel.shmmni", false, CLONE_NEWIPC },
+ { "fs.mqueue.", true, CLONE_NEWIPC },
+ { "net.", true, CLONE_NEWNET },
+ };
+
+ unsigned long flags;
+ char **k, **v;
+ int r;
+
+ flags = effective_clone_ns_flags();
+
+ STRV_FOREACH_PAIR(k, v, arg_sysctl) {
+ bool good = false;
+ size_t i;
+
+ for (i = 0; i < ELEMENTSOF(safe_sysctl); i++) {
+
+ if (!FLAGS_SET(flags, safe_sysctl[i].clone_flags))
+ continue;
+
+ if (safe_sysctl[i].prefix)
+ good = startswith(*k, safe_sysctl[i].key);
+ else
+ good = streq(*k, safe_sysctl[i].key);
+
+ if (good)
+ break;
+ }
+
+ if (!good)
+ return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Refusing to write to sysctl '%s', as it is not safe in the selected namespaces.", *k);
+
+ r = sysctl_write(*k, *v);
+ if (r < 0)
+ return log_error_errno(r, "Failed to write sysctl '%s': %m", *k);
+ }
+
+ return 0;
+}
+
static int inner_child(
Barrier *barrier,
const char *directory,
};
const char *exec_target;
_cleanup_strv_free_ char **env_use = NULL;
- int r;
+ int r, which_failed;
/* This is the "inner" child process, i.e. the one forked off by the "outer" child process, which is the one
* the container manager itself forked off. At the time of clone() it gained its own CLONE_NEWNS, CLONE_NEWPID,
assert(directory);
assert(kmsg_socket >= 0);
+ log_debug("Inner child is initializing.");
+
if (arg_userns_mode != USER_NAMESPACE_NO) {
/* Tell the parent, that it now can write the UID map. */
(void) barrier_place(barrier); /* #1 */
return r;
kmsg_socket = safe_close(kmsg_socket);
+ r = mount_custom(
+ "/",
+ arg_custom_mounts,
+ arg_n_custom_mounts,
+ false,
+ 0,
+ 0,
+ arg_selinux_apifs_context,
+ true);
+ if (r < 0)
+ return r;
+
if (setsid() < 0)
return log_error_errno(errno, "setsid() failed: %m");
rtnl_socket = safe_close(rtnl_socket);
}
+ r = patch_sysctl();
+ if (r < 0)
+ return r;
+
if (arg_oom_score_adjust_set) {
r = set_oom_score_adjust(arg_oom_score_adjust);
if (r < 0)
return log_error_errno(r, "Failed to adjust OOM score: %m");
}
- if (arg_cpuset)
- if (sched_setaffinity(0, CPU_ALLOC_SIZE(arg_cpuset_ncpus), arg_cpuset) < 0)
+ if (arg_cpu_set.set)
+ if (sched_setaffinity(0, arg_cpu_set.allocated, arg_cpu_set.set) < 0)
return log_error_errno(errno, "Failed to set CPU affinity: %m");
- r = drop_capabilities();
- if (r < 0)
- return log_error_errno(r, "drop_capabilities() failed: %m");
-
(void) setup_hostname();
if (arg_personality != PERSONALITY_INVALID) {
return log_error_errno(r, "personality() failed: %m");
}
+ r = setrlimit_closest_all((const struct rlimit *const*) arg_rlimit, &which_failed);
+ if (r < 0)
+ return log_error_errno(r, "Failed to apply resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
+
+#if HAVE_SECCOMP
+ if (arg_seccomp) {
+
+ if (is_seccomp_available()) {
+
+ r = seccomp_load(arg_seccomp);
+ if (ERRNO_IS_SECCOMP_FATAL(r))
+ return log_error_errno(r, "Failed to install seccomp filter: %m");
+ if (r < 0)
+ log_debug_errno(r, "Failed to install seccomp filter: %m");
+ }
+ } else
+#endif
+ {
+ r = setup_seccomp(arg_caps_retain, arg_syscall_whitelist, arg_syscall_blacklist);
+ if (r < 0)
+ return r;
+ }
+
#if HAVE_SELINUX
if (arg_selinux_context)
if (setexeccon(arg_selinux_context) < 0)
return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
#endif
- r = change_uid_gid(arg_user, &home);
+ /* Make sure we keep the caps across the uid/gid dropping, so that we can retain some selected caps
+ * if we need to later on. */
+ if (prctl(PR_SET_KEEPCAPS, 1) < 0)
+ return log_error_errno(errno, "Failed to set PR_SET_KEEPCAPS: %m");
+
+ if (uid_is_valid(arg_uid) || gid_is_valid(arg_gid))
+ r = change_uid_gid_raw(arg_uid, arg_gid, arg_supplementary_gids, arg_n_supplementary_gids);
+ else
+ r = change_uid_gid(arg_user, &home);
if (r < 0)
return r;
+ r = drop_capabilities(getuid());
+ if (r < 0)
+ return log_error_errno(r, "Dropping capabilities failed: %m");
+
if (arg_no_new_privileges)
if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
return log_error_errno(errno, "Failed to disable new privileges: %m");
if (envp[n_env])
n_env++;
- if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
- (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
- (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
- return log_oom();
+ if (home || !uid_is_valid(arg_uid) || arg_uid == 0)
+ if (asprintf((char**)(envp + n_env++), "HOME=%s", home ?: "/root") < 0)
+ return log_oom();
+
+ if (arg_user || !uid_is_valid(arg_uid) || arg_uid == 0)
+ if (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ?: "root") < 0 ||
+ asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)
+ return log_oom();
assert(!sd_id128_is_null(arg_uuid));
return r;
}
+ log_debug("Inner child completed, invoking payload.");
+
/* Now, explicitly close the log, so that we then can close all remaining fds. Closing the log explicitly first
* has the benefit that the logging subsystem knows about it, and is thus ready to be reopened should we need
* it again. Note that the other fds closed here are at least the locking and barrier fds. */
const char *directory,
const char *console,
DissectedImage *dissected_image,
- bool interactive,
bool secondary,
int pid_socket,
int uuid_socket,
int netns_fd) {
_cleanup_close_ int fd = -1;
- int r, which_failed;
pid_t pid;
ssize_t l;
+ int r;
/* This is the "outer" child process, i.e the one forked off by the container manager itself. It already has
* its own CLONE_NEWNS namespace (which was created by the clone()). It still lives in the host's CLONE_NEWPID,
assert(barrier);
assert(directory);
- assert(console);
assert(pid_socket >= 0);
assert(uuid_socket >= 0);
assert(notify_socket >= 0);
assert(kmsg_socket >= 0);
+ log_debug("Outer child is initializing.");
+
if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
- if (interactive) {
+ if (arg_console_mode != CONSOLE_PIPE) {
int terminal;
+ assert(console);
+
terminal = open_terminal(console, O_RDWR);
if (terminal < 0)
return log_error_errno(terminal, "Failed to open console: %m");
arg_userns_mode != USER_NAMESPACE_NO,
arg_uid_shift,
arg_uid_range,
- arg_selinux_context);
+ arg_selinux_apifs_context);
if (r < 0)
return r;
/* Mark everything as shared so our mounts get propagated down. This is
* required to make new bind mounts available in systemd services
- * inside the containter that create a new mount namespace.
+ * inside the container that create a new mount namespace.
* See https://github.com/systemd/systemd/issues/3860
* Further submounts (such as /dev) done after this will inherit the
* shared propagation mode. */
return r;
if (arg_read_only && arg_volatile_mode == VOLATILE_NO) {
- r = bind_remount_recursive(directory, true, NULL);
+ r = bind_remount_recursive(directory, MS_RDONLY, MS_RDONLY, NULL);
if (r < 0)
return log_error_errno(r, "Failed to make tree read-only: %m");
}
if (r < 0)
return r;
- dev_setup(directory, arg_uid_shift, arg_uid_shift);
+ r = make_extra_nodes(directory);
+ if (r < 0)
+ return r;
+
+ (void) dev_setup(directory, arg_uid_shift, arg_uid_shift);
+ (void) make_inaccessible_nodes(directory, arg_uid_shift, arg_uid_shift);
r = setup_pts(directory);
if (r < 0)
if (r < 0)
return r;
- r = setup_seccomp(arg_caps_retain, arg_syscall_whitelist, arg_syscall_blacklist);
- if (r < 0)
- return r;
-
r = setup_timezone(directory);
if (r < 0)
return r;
arg_userns_mode != USER_NAMESPACE_NO,
arg_uid_shift,
arg_uid_range,
- arg_selinux_apifs_context);
+ arg_selinux_apifs_context,
+ false);
if (r < 0)
return r;
if (fd < 0)
return fd;
- r = setrlimit_closest_all((const struct rlimit *const*) arg_rlimit, &which_failed);
- if (r < 0)
- return log_error_errno(r, "Failed to apply resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
-
pid = raw_clone(SIGCHLD|CLONE_NEWNS|
arg_clone_ns_flags |
(arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
return log_oom();
if (strv_find(tags, "READY=1"))
- sd_notifyf(false, "READY=1\n");
+ (void) sd_notifyf(false, "READY=1\n");
p = strv_find_startswith(tags, "STATUS=");
if (p)
- sd_notifyf(false, "STATUS=Container running: %s", p);
+ (void) sd_notifyf(false, "STATUS=Container running: %s", p);
return 0;
}
if ((arg_settings_mask & SETTING_EPHEMERAL) == 0)
arg_ephemeral = settings->ephemeral;
+ if ((arg_settings_mask & SETTING_DIRECTORY) == 0 &&
+ settings->root) {
+
+ if (!arg_settings_trusted)
+ log_warning("Ignoring root directory setting, file %s is not trusted.", path);
+ else
+ free_and_replace(arg_directory, settings->root);
+ }
+
if ((arg_settings_mask & SETTING_PIVOT_ROOT) == 0 &&
settings->pivot_root_new) {
free_and_replace(arg_pivot_root_new, settings->pivot_root_new);
settings->environment)
strv_free_and_replace(arg_setenv, settings->environment);
- if ((arg_settings_mask & SETTING_USER) == 0 &&
- settings->user)
- free_and_replace(arg_user, settings->user);
+ if ((arg_settings_mask & SETTING_USER) == 0) {
+
+ if (settings->user)
+ free_and_replace(arg_user, settings->user);
+
+ if (uid_is_valid(settings->uid))
+ arg_uid = settings->uid;
+ if (gid_is_valid(settings->gid))
+ arg_gid = settings->gid;
+ if (settings->n_supplementary_gids > 0) {
+ free_and_replace(arg_supplementary_gids, settings->supplementary_gids);
+ arg_n_supplementary_gids = settings->n_supplementary_gids;
+ }
+ }
if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
- uint64_t plus;
+ uint64_t plus, minus;
+
+ /* Note that we copy both the simple plus/minus caps here, and the full quintet from the
+ * Settings structure */
plus = settings->capability;
- if (settings_private_network(settings))
- plus |= (1ULL << CAP_NET_ADMIN);
+ minus = settings->drop_capability;
+
+ if ((arg_settings_mask & SETTING_NETWORK) == 0) {
+ if (settings_private_network(settings))
+ plus |= UINT64_C(1) << CAP_NET_ADMIN;
+ else
+ minus |= UINT64_C(1) << CAP_NET_ADMIN;
+ }
if (!arg_settings_trusted && plus != 0) {
if (settings->capability != 0)
} else
arg_caps_retain |= plus;
- arg_caps_retain &= ~settings->drop_capability;
+ arg_caps_retain &= ~minus;
+
+ /* Copy the full capabilities over too */
+ if (capability_quintet_is_set(&settings->full_capabilities)) {
+ if (!arg_settings_trusted)
+ log_warning("Ignoring capability settings, file %s is not trusted.", path);
+ else
+ arg_full_capabilities = settings->full_capabilities;
+ }
}
if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
settings->network_interfaces ||
settings->network_macvlan ||
settings->network_ipvlan ||
- settings->network_veth_extra)) {
+ settings->network_veth_extra ||
+ settings->network_namespace_path)) {
if (!arg_settings_trusted)
log_warning("Ignoring network settings, file %s is not trusted.", path);
free_and_replace(arg_network_bridge, settings->network_bridge);
free_and_replace(arg_network_zone, settings->network_zone);
+
+ free_and_replace(arg_network_namespace_path, settings->network_namespace_path);
}
}
if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) {
- if (!arg_settings_trusted && !strv_isempty(arg_syscall_whitelist))
+ if (!arg_settings_trusted && !strv_isempty(settings->syscall_whitelist))
log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", path);
else {
strv_free_and_replace(arg_syscall_whitelist, settings->syscall_whitelist);
strv_free_and_replace(arg_syscall_blacklist, settings->syscall_blacklist);
}
+
+#if HAVE_SECCOMP
+ if (!arg_settings_trusted && settings->seccomp)
+ log_warning("Ignoring SECCOMP filter, file %s is not trusted.", path);
+ else {
+ seccomp_release(arg_seccomp);
+ arg_seccomp = TAKE_PTR(settings->seccomp);
+ }
+#endif
}
for (rl = 0; rl < _RLIMIT_MAX; rl ++) {
}
if ((arg_settings_mask & SETTING_CPU_AFFINITY) == 0 &&
- settings->cpuset) {
+ settings->cpu_set.set) {
if (!arg_settings_trusted)
log_warning("Ignoring CPUAffinity= setting, file '%s' is not trusted.", path);
else {
- if (arg_cpuset)
- CPU_FREE(arg_cpuset);
- arg_cpuset = TAKE_PTR(settings->cpuset);
- arg_cpuset_ncpus = settings->cpuset_ncpus;
+ cpu_set_reset(&arg_cpu_set);
+ arg_cpu_set = settings->cpu_set;
+ settings->cpu_set = (CPUSet) {};
}
}
settings->timezone != _TIMEZONE_MODE_INVALID)
arg_timezone = settings->timezone;
+ if ((arg_settings_mask & SETTING_SLICE) == 0 &&
+ settings->slice) {
+
+ if (!arg_settings_trusted)
+ log_warning("Ignoring slice setting, file '%s' is not trusted.", path);
+ else
+ free_and_replace(arg_slice, settings->slice);
+ }
+
+ if ((arg_settings_mask & SETTING_USE_CGNS) == 0 &&
+ settings->use_cgns >= 0) {
+
+ if (!arg_settings_trusted)
+ log_warning("Ignoring cgroup namespace setting, file '%s' is not trusted.", path);
+ else
+ arg_use_cgns = settings->use_cgns;
+ }
+
+ if ((arg_settings_mask & SETTING_CLONE_NS_FLAGS) == 0 &&
+ settings->clone_ns_flags != (unsigned long) -1) {
+
+ if (!arg_settings_trusted)
+ log_warning("Ignoring namespace setting, file '%s' is not trusted.", path);
+ else
+ arg_clone_ns_flags = settings->clone_ns_flags;
+ }
+
+ if ((arg_settings_mask & SETTING_CONSOLE_MODE) == 0 &&
+ settings->console_mode >= 0) {
+
+ if (!arg_settings_trusted)
+ log_warning("Ignoring console mode setting, file '%s' is not trusted.", path);
+ else
+ arg_console_mode = settings->console_mode;
+ }
+
+ /* The following properties can only be set through the OCI settings logic, not from the command line, hence we
+ * don't consult arg_settings_mask for them. */
+
+ sd_bus_message_unref(arg_property_message);
+ arg_property_message = TAKE_PTR(settings->properties);
+
+ arg_console_width = settings->console_width;
+ arg_console_height = settings->console_height;
+
+ device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
+ arg_extra_nodes = TAKE_PTR(settings->extra_nodes);
+ arg_n_extra_nodes = settings->n_extra_nodes;
+
return 0;
}
const char *fn, *i;
int r;
+ if (arg_oci_bundle)
+ return 0;
+
/* If all settings are masked, there's no point in looking for
* the settings file */
if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
return merge_settings(settings, p);
}
-static int run(int master,
+static int load_oci_bundle(void) {
+ _cleanup_(settings_freep) Settings *settings = NULL;
+ int r;
+
+ if (!arg_oci_bundle)
+ return 0;
+
+ /* By default let's trust OCI bundles */
+ if (arg_settings_trusted < 0)
+ arg_settings_trusted = true;
+
+ r = oci_load(NULL, arg_oci_bundle, &settings);
+ if (r < 0)
+ return r;
+
+ return merge_settings(settings, arg_oci_bundle);
+}
+
+static int run_container(int master,
const char* console,
DissectedImage *dissected_image,
- bool interactive,
bool secondary,
FDSet *fds,
char veth_name[IFNAMSIZ], bool *veth_created,
_cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
_cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
ContainerStatus container_status = 0;
- char last_char = 0;
int ifi = 0, r;
ssize_t l;
sigset_t mask_chld;
log_debug_errno(r, "Cannot determine if passed network namespace path '%s' really refers to a network namespace, assuming it does.", arg_network_namespace_path);
else if (r < 0)
return log_error_errno(r, "Failed to check %s fs type: %m", arg_network_namespace_path);
- else if (r == 0) {
- log_error("Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path);
- return -EINVAL;
- }
+ else if (r == 0)
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+ "Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path);
}
*pid = raw_clone(SIGCHLD|CLONE_NEWNS);
arg_directory,
console,
dissected_image,
- interactive,
secondary,
pid_socket_pair[1],
uuid_socket_pair[1],
barrier_set_role(&barrier, BARRIER_PARENT);
- fds = fdset_free(fds);
+ fdset_close(fds);
kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
if (l < 0)
return log_error_errno(errno, "Failed to read UID shift: %m");
- if (l != sizeof arg_uid_shift) {
- log_error("Short read while reading UID shift.");
- return -EIO;
- }
+ if (l != sizeof arg_uid_shift)
+ return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading UID shift.");
if (arg_userns_mode == USER_NAMESPACE_PICK) {
/* If we are supposed to pick the UID shift, let's try to use the shift read from the
l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
if (l < 0)
return log_error_errno(errno, "Failed to send UID shift: %m");
- if (l != sizeof arg_uid_shift) {
- log_error("Short write while writing UID shift.");
- return -EIO;
- }
+ if (l != sizeof arg_uid_shift)
+ return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short write while writing UID shift.");
}
}
l = recv(unified_cgroup_hierarchy_socket_pair[0], &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), 0);
if (l < 0)
return log_error_errno(errno, "Failed to read cgroup mode: %m");
- if (l != sizeof(arg_unified_cgroup_hierarchy)) {
- log_error("Short read while reading cgroup mode (%zu bytes).%s",
- l, l == 0 ? " The child is most likely dead." : "");
- return -EIO;
- }
+ if (l != sizeof(arg_unified_cgroup_hierarchy))
+ return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading cgroup mode (%zu bytes).%s",
+ l, l == 0 ? " The child is most likely dead." : "");
}
/* Wait for the outer child. */
l = recv(pid_socket_pair[0], pid, sizeof *pid, 0);
if (l < 0)
return log_error_errno(errno, "Failed to read inner child PID: %m");
- if (l != sizeof *pid) {
- log_error("Short read while reading inner child PID.");
- return -EIO;
- }
+ if (l != sizeof *pid)
+ return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading inner child PID.");
/* We also retrieve container UUID in case it was generated by outer child */
l = recv(uuid_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
if (l < 0)
return log_error_errno(errno, "Failed to read container machine ID: %m");
- if (l != sizeof(arg_uuid)) {
- log_error("Short read while reading container machined ID.");
- return -EIO;
- }
+ if (l != sizeof(arg_uuid))
+ return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading container machined ID.");
/* We also retrieve the socket used for notifications generated by outer child */
notify_socket = receive_one_fd(notify_socket_pair[0], 0);
log_debug("Init process invoked as PID "PID_FMT, *pid);
if (arg_userns_mode != USER_NAMESPACE_NO) {
- if (!barrier_place_and_sync(&barrier)) { /* #1 */
- log_error("Child died too early.");
- return -ESRCH;
- }
+ if (!barrier_place_and_sync(&barrier)) /* #1 */
+ return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
r = setup_uid_map(*pid);
if (r < 0)
if (arg_private_network) {
if (!arg_network_namespace_path) {
/* Wait until the child has unshared its network namespace. */
- if (!barrier_place_and_sync(&barrier)) { /* #3 */
- log_error("Child died too early");
- return -ESRCH;
- }
+ if (!barrier_place_and_sync(&barrier)) /* #3 */
+ return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early");
}
r = move_network_interfaces(*pid, arg_network_interfaces);
arg_custom_mounts, arg_n_custom_mounts,
arg_kill_signal,
arg_property,
+ arg_property_message,
arg_keep_unit,
arg_container_service_name);
if (r < 0)
arg_slice,
arg_custom_mounts, arg_n_custom_mounts,
arg_kill_signal,
- arg_property);
+ arg_property,
+ arg_property_message);
if (r < 0)
return r;
} else if (arg_slice || arg_property)
log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect.");
- r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
+ r = create_subcgroup(*pid, arg_keep_unit, arg_unified_cgroup_hierarchy);
if (r < 0)
return r;
- r = create_subcgroup(*pid, arg_keep_unit, arg_unified_cgroup_hierarchy);
+ r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
if (r < 0)
return r;
return r;
/* Let the child know that we are ready and wait that the child is completely ready now. */
- if (!barrier_place_and_sync(&barrier)) { /* #5 */
- log_error("Child died too early.");
- return -ESRCH;
- }
+ if (!barrier_place_and_sync(&barrier)) /* #5 */
+ return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
/* At this point we have made use of the UID we picked, and thus nss-mymachines
* will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
etc_passwd_lock = safe_close(etc_passwd_lock);
- sd_notifyf(false,
- "STATUS=Container running.\n"
- "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
+ (void) sd_notifyf(false,
+ "STATUS=Container running.\n"
+ "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
if (!arg_notify_ready)
(void) sd_notify(false, "READY=1\n");
rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
- r = pty_forward_new(event, master,
- PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY),
- &forward);
- if (r < 0)
- return log_error_errno(r, "Failed to create PTY forwarder: %m");
+ if (IN_SET(arg_console_mode, CONSOLE_INTERACTIVE, CONSOLE_READ_ONLY)) {
+ assert(master >= 0);
+
+ r = pty_forward_new(event, master,
+ PTY_FORWARD_IGNORE_VHANGUP | (arg_console_mode == CONSOLE_READ_ONLY ? PTY_FORWARD_READ_ONLY : 0),
+ &forward);
+ if (r < 0)
+ return log_error_errno(r, "Failed to create PTY forwarder: %m");
+
+ if (arg_console_width != (unsigned) -1 || arg_console_height != (unsigned) -1)
+ (void) pty_forward_set_width_height(forward, arg_console_width, arg_console_height);
+ }
r = sd_event_loop(event);
if (r < 0)
return log_error_errno(r, "Failed to run event loop: %m");
- pty_forward_get_last_char(forward, &last_char);
+ if (forward) {
+ char last_char = 0;
- forward = pty_forward_free(forward);
+ (void) pty_forward_get_last_char(forward, &last_char);
+ forward = pty_forward_free(forward);
- if (!arg_quiet && last_char != '\n')
- putc('\n', stdout);
+ if (!arg_quiet && last_char != '\n')
+ putc('\n', stdout);
+ }
/* Kill if it is not dead yet anyway */
if (bus) {
return 0;
}
-int main(int argc, char *argv[]) {
+static int run(int argc, char *argv[]) {
_cleanup_free_ char *console = NULL;
_cleanup_close_ int master = -1;
_cleanup_fdset_free_ FDSet *fds = NULL;
pid_t pid = 0;
union in_addr_union exposed = {};
_cleanup_(release_lock_file) LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
- bool interactive, veth_created = false, remove_tmprootdir = false;
+ bool veth_created = false, remove_tmprootdir = false;
char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
_cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
_cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
log_parse_environment();
log_open();
- /* Make sure rename_process() in the stub init process can work */
- saved_argv = argv;
- saved_argc = argc;
-
r = parse_argv(argc, argv);
if (r <= 0)
goto finish;
if (r < 0)
goto finish;
+ r = load_oci_bundle();
+ if (r < 0)
+ goto finish;
+
r = determine_names();
if (r < 0)
goto finish;
if (r < 0)
goto finish;
- parse_environment();
-
r = cg_unified_flush();
if (r < 0) {
log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
goto finish;
}
- r = copy_file(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600, FS_NOCOW_FL, COPY_REFLINK);
+ r = copy_file(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600, FS_NOCOW_FL, FS_NOCOW_FL, COPY_REFLINK|COPY_CRTIME);
if (r < 0) {
r = log_error_errno(r, "Failed to copy image file: %m");
goto finish;
if (r < 0)
goto finish;
- interactive =
- isatty(STDIN_FILENO) > 0 &&
- isatty(STDOUT_FILENO) > 0;
+ if (arg_console_mode < 0)
+ arg_console_mode =
+ isatty(STDIN_FILENO) > 0 &&
+ isatty(STDOUT_FILENO) > 0 ? CONSOLE_INTERACTIVE : CONSOLE_READ_ONLY;
- master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NONBLOCK);
- if (master < 0) {
- r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
- goto finish;
- }
+ if (arg_console_mode == CONSOLE_PIPE) /* if we pass STDERR on to the container, don't add our own logs into it too */
+ arg_quiet = true;
- r = ptsname_malloc(master, &console);
- if (r < 0) {
- r = log_error_errno(r, "Failed to determine tty name: %m");
- goto finish;
- }
+ if (arg_console_mode != CONSOLE_PIPE) {
+ master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NONBLOCK);
+ if (master < 0) {
+ r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
+ goto finish;
+ }
- if (arg_selinux_apifs_context) {
- r = mac_selinux_apply(console, arg_selinux_apifs_context);
- if (r < 0)
+ r = ptsname_malloc(master, &console);
+ if (r < 0) {
+ r = log_error_errno(r, "Failed to determine tty name: %m");
goto finish;
- }
+ }
- if (unlockpt(master) < 0) {
- r = log_error_errno(errno, "Failed to unlock tty: %m");
- goto finish;
+ if (arg_selinux_apifs_context) {
+ r = mac_selinux_apply(console, arg_selinux_apifs_context);
+ if (r < 0)
+ goto finish;
+ }
+
+ if (unlockpt(master) < 0) {
+ r = log_error_errno(errno, "Failed to unlock tty: %m");
+ goto finish;
+ }
}
if (!arg_quiet)
}
for (;;) {
- r = run(master,
- console,
- dissected_image,
- interactive, secondary,
- fds,
- veth_name, &veth_created,
- &exposed,
- &pid, &ret);
+ r = run_container(master,
+ console,
+ dissected_image,
+ secondary,
+ fds,
+ veth_name, &veth_created,
+ &exposed,
+ &pid, &ret);
if (r <= 0)
break;
}
finish:
- sd_notify(false,
- r == 0 && ret == EXIT_FORCE_RESTART ? "STOPPING=1\nSTATUS=Restarting..." :
- "STOPPING=1\nSTATUS=Terminating...");
+ (void) sd_notify(false,
+ r == 0 && ret == EXIT_FORCE_RESTART ? "STOPPING=1\nSTATUS=Restarting..." :
+ "STOPPING=1\nSTATUS=Terminating...");
if (pid > 0)
(void) kill(pid, SIGKILL);
(void) remove_veth_links(veth_name, arg_network_veth_extra);
(void) remove_bridge(arg_network_zone);
- free(arg_directory);
- free(arg_template);
- free(arg_image);
- free(arg_machine);
- free(arg_hostname);
- free(arg_user);
- free(arg_pivot_root_new);
- free(arg_pivot_root_old);
- free(arg_chdir);
- strv_free(arg_setenv);
- free(arg_network_bridge);
- strv_free(arg_network_interfaces);
- strv_free(arg_network_macvlan);
- strv_free(arg_network_ipvlan);
- strv_free(arg_network_veth_extra);
- strv_free(arg_parameters);
- free(arg_network_zone);
- free(arg_network_namespace_path);
- strv_free(arg_property);
custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
expose_port_free_all(arg_expose_ports);
- free(arg_root_hash);
rlimit_free_all(arg_rlimit);
- strv_free(arg_syscall_whitelist);
- strv_free(arg_syscall_blacklist);
- arg_cpuset = cpu_set_mfree(arg_cpuset);
+ device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
- return r < 0 ? EXIT_FAILURE : ret;
+ if (r < 0)
+ return r;
+
+ return ret;
}
+
+DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run);