]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
mount: when we fail to establish an inaccessible mount gracefully, undo the mount
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
53e1b683 1/* SPDX-License-Identifier: LGPL-2.1+ */
88213476 2
349cc4a5 3#if HAVE_BLKID
6b5cf3ea 4#include <blkid.h>
8fe0087e 5#endif
88213476 6#include <errno.h>
88213476 7#include <getopt.h>
0e7ac751 8#include <grp.h>
503f480f 9#include <linux/fs.h>
1b9e5b12 10#include <linux/loop.h>
0e7ac751 11#include <pwd.h>
8fe0087e 12#include <sched.h>
349cc4a5 13#if HAVE_SELINUX
8fe0087e 14#include <selinux/selinux.h>
1b9e5b12 15#endif
8fe0087e
LP
16#include <signal.h>
17#include <stdio.h>
18#include <stdlib.h>
19#include <string.h>
20#include <sys/file.h>
8fe0087e
LP
21#include <sys/personality.h>
22#include <sys/prctl.h>
23#include <sys/types.h>
6916b164 24#include <sys/wait.h>
8fe0087e 25#include <unistd.h>
1b9e5b12 26
b053cd5f 27#include "sd-bus.h"
1f0cd86b 28#include "sd-daemon.h"
1f0cd86b 29#include "sd-id128.h"
8fe0087e 30
b5efdb8a 31#include "alloc-util.h"
8fe0087e
LP
32#include "barrier.h"
33#include "base-filesystem.h"
34#include "blkid-util.h"
35#include "btrfs-util.h"
b8ea7a6e 36#include "bus-error.h"
b053cd5f 37#include "bus-util.h"
8fe0087e 38#include "cap-list.h"
430f0182 39#include "capability-util.h"
04d391da 40#include "cgroup-util.h"
8fe0087e 41#include "copy.h"
d107bb7d 42#include "cpu-set-util.h"
4fc9982c 43#include "dev-setup.h"
2d845785 44#include "dissect-image.h"
8fe0087e 45#include "env-util.h"
3ffd4af2 46#include "fd-util.h"
842f3b0f 47#include "fdset.h"
a5c32cff 48#include "fileio.h"
f97b34a6 49#include "format-util.h"
f4f15635 50#include "fs-util.h"
1b9e5b12 51#include "gpt.h"
4623e8e6 52#include "hexdecoct.h"
8fe0087e 53#include "hostname-util.h"
910fd145 54#include "id128-util.h"
8fe0087e 55#include "log.h"
2d845785 56#include "loop-util.h"
8fe0087e 57#include "loopback-setup.h"
1b9cebf6 58#include "machine-image.h"
8fe0087e
LP
59#include "macro.h"
60#include "missing.h"
61#include "mkdir.h"
4349cd7c 62#include "mount-util.h"
049af8ad 63#include "mountpoint-util.h"
0cb8e3d1 64#include "namespace-util.h"
8fe0087e 65#include "netlink-util.h"
07630cea 66#include "nspawn-cgroup.h"
3603efde 67#include "nspawn-def.h"
07630cea
LP
68#include "nspawn-expose-ports.h"
69#include "nspawn-mount.h"
70#include "nspawn-network.h"
de40a303 71#include "nspawn-oci.h"
7336138e 72#include "nspawn-patch-uid.h"
07630cea 73#include "nspawn-register.h"
910fd145 74#include "nspawn-seccomp.h"
07630cea
LP
75#include "nspawn-settings.h"
76#include "nspawn-setuid.h"
7732f92b 77#include "nspawn-stub-pid1.h"
d8b4d14d 78#include "nulstr-util.h"
d58ad743 79#include "os-util.h"
50ebcf6c 80#include "pager.h"
6bedfcbb 81#include "parse-util.h"
8fe0087e 82#include "path-util.h"
294bf0c3 83#include "pretty-print.h"
0b452006 84#include "process-util.h"
8fe0087e
LP
85#include "ptyfwd.h"
86#include "random-util.h"
8869a0b4 87#include "raw-clone.h"
bf428efb 88#include "rlimit-util.h"
8fe0087e 89#include "rm-rf.h"
de40a303
LP
90#if HAVE_SECCOMP
91#include "seccomp-util.h"
92#endif
68b02049 93#include "selinux-util.h"
8fe0087e 94#include "signal-util.h"
2583fbea 95#include "socket-util.h"
8fcde012 96#include "stat-util.h"
15a5e950 97#include "stdio-util.h"
5c828e66 98#include "string-table.h"
07630cea 99#include "string-util.h"
8fe0087e 100#include "strv.h"
de40a303 101#include "sysctl-util.h"
8fe0087e 102#include "terminal-util.h"
e4de7287 103#include "tmpfile-util.h"
affb60b1 104#include "umask-util.h"
b1d4f8e1 105#include "user-util.h"
8fe0087e 106#include "util.h"
e9642be2 107
62b1e758
YW
108#if HAVE_SPLIT_USR
109#define STATIC_RESOLV_CONF "/lib/systemd/resolv.conf"
110#else
111#define STATIC_RESOLV_CONF "/usr/lib/systemd/resolv.conf"
112#endif
113
9c1e04d0
AP
114/* nspawn listens on the socket whose path is given by the constant NSPAWN_NOTIFY_SOCKET_PATH.
115 * NSPAWN_NOTIFY_SOCKET_PATH is relative to the container.
116 * The init process in the container can send messages to nspawn over it, following the sd_notify(3) protocol. */
117#define NSPAWN_NOTIFY_SOCKET_PATH "/run/systemd/nspawn/notify"
0e7ac751 118
2a49b612
ZJS
119#define EXIT_FORCE_RESTART 133
120
113cea80
DH
/* Outcome of a container run. NOTE(review): no user of this enum is visible in this part of the file;
 * presumably it distinguishes a container that terminated for good from one that requested a reboot —
 * confirm against the code that consumes it. */
121typedef enum ContainerStatus {
122 CONTAINER_TERMINATED,
123 CONTAINER_REBOOTED
124} ContainerStatus;
125
88213476 126static char *arg_directory = NULL;
ec16945e 127static char *arg_template = NULL;
5f932eb9 128static char *arg_chdir = NULL;
b53ede69
PW
129static char *arg_pivot_root_new = NULL;
130static char *arg_pivot_root_old = NULL;
687d0825 131static char *arg_user = NULL;
de40a303
LP
132static uid_t arg_uid = UID_INVALID;
133static gid_t arg_gid = GID_INVALID;
134static gid_t* arg_supplementary_gids = NULL;
135static size_t arg_n_supplementary_gids = 0;
9444b1f2 136static sd_id128_t arg_uuid = {};
3a9530e5
LP
137static char *arg_machine = NULL; /* The name used by the host to refer to this */
138static char *arg_hostname = NULL; /* The name the payload sees by default */
c74e630d
LP
139static const char *arg_selinux_context = NULL;
140static const char *arg_selinux_apifs_context = NULL;
de40a303 141static char *arg_slice = NULL;
ff01d048 142static bool arg_private_network = false;
bc2f673e 143static bool arg_read_only = false;
7732f92b 144static StartMode arg_start_mode = START_PID1;
ec16945e 145static bool arg_ephemeral = false;
57fb9fb5 146static LinkJournal arg_link_journal = LINK_AUTO;
574edc90 147static bool arg_link_journal_try = false;
/* Bit mask of capabilities, one bit per CAP_* number (1ULL << CAP_xyz), initialized to the set listed below.
 * The help text describes --capability= as retaining capabilities "in addition to the default" and
 * --drop-capability= as dropping from "the default set", so this is presumably that default set of
 * capabilities the container keeps — the code applying those switches is not fully visible here. */
520e0d54 148static uint64_t arg_caps_retain =
50b52222
LP
149 (1ULL << CAP_AUDIT_CONTROL) |
150 (1ULL << CAP_AUDIT_WRITE) |
5076f0cc
LP
151 (1ULL << CAP_CHOWN) |
152 (1ULL << CAP_DAC_OVERRIDE) |
153 (1ULL << CAP_DAC_READ_SEARCH) |
154 (1ULL << CAP_FOWNER) |
155 (1ULL << CAP_FSETID) |
156 (1ULL << CAP_IPC_OWNER) |
157 (1ULL << CAP_KILL) |
158 (1ULL << CAP_LEASE) |
159 (1ULL << CAP_LINUX_IMMUTABLE) |
50b52222 160 (1ULL << CAP_MKNOD) |
5076f0cc
LP
161 (1ULL << CAP_NET_BIND_SERVICE) |
162 (1ULL << CAP_NET_BROADCAST) |
163 (1ULL << CAP_NET_RAW) |
5076f0cc 164 (1ULL << CAP_SETFCAP) |
50b52222 165 (1ULL << CAP_SETGID) |
5076f0cc
LP
166 (1ULL << CAP_SETPCAP) |
167 (1ULL << CAP_SETUID) |
168 (1ULL << CAP_SYS_ADMIN) |
50b52222 169 (1ULL << CAP_SYS_BOOT) |
5076f0cc
LP
170 (1ULL << CAP_SYS_CHROOT) |
171 (1ULL << CAP_SYS_NICE) |
172 (1ULL << CAP_SYS_PTRACE) |
d87be9b0 173 (1ULL << CAP_SYS_RESOURCE) |
50b52222 174 (1ULL << CAP_SYS_TTY_CONFIG);
de40a303 175static CapabilityQuintet arg_full_capabilities = CAPABILITY_QUINTET_NULL;
5a8af538 176static CustomMount *arg_custom_mounts = NULL;
88614c8a 177static size_t arg_n_custom_mounts = 0;
f4889f65 178static char **arg_setenv = NULL;
284c0b91 179static bool arg_quiet = false;
eb91eb18 180static bool arg_register = true;
89f7c846 181static bool arg_keep_unit = false;
aa28aefe 182static char **arg_network_interfaces = NULL;
c74e630d 183static char **arg_network_macvlan = NULL;
4bbfe7ad 184static char **arg_network_ipvlan = NULL;
69c79d3c 185static bool arg_network_veth = false;
f6d6bad1 186static char **arg_network_veth_extra = NULL;
f757855e 187static char *arg_network_bridge = NULL;
22b28dfd 188static char *arg_network_zone = NULL;
d7bea6b6 189static char *arg_network_namespace_path = NULL;
050f7277 190static unsigned long arg_personality = PERSONALITY_INVALID;
ec16945e 191static char *arg_image = NULL;
de40a303 192static char *arg_oci_bundle = NULL;
f757855e 193static VolatileMode arg_volatile_mode = VOLATILE_NO;
6d0b55c2 194static ExposePort *arg_expose_ports = NULL;
f36933fe 195static char **arg_property = NULL;
de40a303 196static sd_bus_message *arg_property_message = NULL;
0de7acce 197static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
6dac160c 198static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
0de7acce 199static bool arg_userns_chown = false;
c6c8f6e2 200static int arg_kill_signal = 0;
5da38d07 201static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN;
f757855e
LP
202static SettingsMask arg_settings_mask = 0;
203static int arg_settings_trusted = -1;
204static char **arg_parameters = NULL;
6aadfa4c 205static const char *arg_container_service_name = "systemd-nspawn";
9c1e04d0 206static bool arg_notify_ready = false;
5a8ff0e6 207static bool arg_use_cgns = true;
0c582db0 208static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
1099ceeb 209static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_TMPFS_TMP;
4623e8e6
LP
210static void *arg_root_hash = NULL;
211static size_t arg_root_hash_size = 0;
960e4569
LP
212static char **arg_syscall_whitelist = NULL;
213static char **arg_syscall_blacklist = NULL;
de40a303
LP
214#if HAVE_SECCOMP
215static scmp_filter_ctx arg_seccomp = NULL;
216#endif
bf428efb 217static struct rlimit *arg_rlimit[_RLIMIT_MAX] = {};
66edd963 218static bool arg_no_new_privileges = false;
81f345df
LP
219static int arg_oom_score_adjust = 0;
220static bool arg_oom_score_adjust_set = false;
d107bb7d
LP
221static cpu_set_t *arg_cpuset = NULL;
222static unsigned arg_cpuset_ncpus = 0;
09d423e9 223static ResolvConfMode arg_resolv_conf = RESOLV_CONF_AUTO;
1688841f 224static TimezoneMode arg_timezone = TIMEZONE_AUTO;
de40a303
LP
225static unsigned arg_console_width = (unsigned) -1, arg_console_height = (unsigned) -1;
226static DeviceNode* arg_extra_nodes = NULL;
227static size_t arg_n_extra_nodes = 0;
228static char **arg_sysctl = NULL;
229static ConsoleMode arg_console_mode = _CONSOLE_MODE_INVALID;
88213476 230
37ec0fdd
LP
231static int help(void) {
232 _cleanup_free_ char *link = NULL;
233 int r;
234
0221d68a 235 (void) pager_open(false);
50ebcf6c 236
37ec0fdd
LP
237 r = terminal_urlify_man("systemd-nspawn", "1", &link);
238 if (r < 0)
239 return log_oom();
240
88213476 241 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
a7e2e50d 242 "Spawn a command or OS in a light-weight container.\n\n"
a8828ed9
DW
243 " -h --help Show this help\n"
244 " --version Print version string\n"
69c79d3c 245 " -q --quiet Do not show status information\n"
1b9e5b12 246 " -D --directory=PATH Root directory for the container\n"
ec16945e
LP
247 " --template=PATH Initialize root directory from template directory,\n"
248 " if missing\n"
249 " -x --ephemeral Run container with snapshot of root directory, and\n"
250 " remove it after exit\n"
251 " -i --image=PATH File system device or disk image for the container\n"
de40a303 252 " --oci-bundle=PATH OCI bundle directory\n"
4623e8e6 253 " --root-hash=HASH Specify verity root hash\n"
7732f92b 254 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
a8828ed9 255 " -b --boot Boot up full system (i.e. invoke init)\n"
5f932eb9 256 " --chdir=PATH Set working directory in the container\n"
b53ede69
PW
257 " --pivot-root=PATH[:PATH]\n"
258 " Pivot root to given directory in the container\n"
a8828ed9 259 " -u --user=USER Run the command under specified user or uid\n"
a8828ed9 260 " -M --machine=NAME Set the machine name for the container\n"
3a9530e5 261 " --hostname=NAME Override the hostname for the container\n"
69c79d3c 262 " --uuid=UUID Set a specific machine UUID for the container\n"
a8828ed9 263 " -S --slice=SLICE Place the container in the specified slice\n"
f36933fe 264 " --property=NAME=VALUE Set scope unit property\n"
90b4a64d 265 " -U --private-users=pick Run within user namespace, autoselect UID/GID range\n"
03cfe0d5 266 " --private-users[=UIDBASE[:NUIDS]]\n"
90b4a64d 267 " Similar, but with user configured UID/GID range\n"
24597ee0 268 " --private-users-chown Adjust OS tree ownership to private UID/GID range\n"
69c79d3c
LP
269 " --private-network Disable network in container\n"
270 " --network-interface=INTERFACE\n"
271 " Assign an existing network interface to the\n"
272 " container\n"
c74e630d
LP
273 " --network-macvlan=INTERFACE\n"
274 " Create a macvlan network interface based on an\n"
275 " existing network interface to the container\n"
4bbfe7ad
TG
276 " --network-ipvlan=INTERFACE\n"
277 " Create a ipvlan network interface based on an\n"
278 " existing network interface to the container\n"
a8eaaee7 279 " -n --network-veth Add a virtual Ethernet connection between host\n"
69c79d3c 280 " and container\n"
f6d6bad1
LP
281 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
282 " Add an additional virtual Ethernet link between\n"
283 " host and container\n"
ab046dde 284 " --network-bridge=INTERFACE\n"
90b4a64d
ZJS
285 " Add a virtual Ethernet connection to the container\n"
286 " and attach it to an existing bridge on the host\n"
287 " --network-zone=NAME Similar, but attach the new interface to an\n"
288 " an automatically managed bridge interface\n"
d7bea6b6
DP
289 " --network-namespace-path=PATH\n"
290 " Set network namespace to the one represented by\n"
291 " the specified kernel namespace file node\n"
6d0b55c2 292 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
ab5e3a1b 293 " Expose a container IP port on the host\n"
82adf6af
LP
294 " -Z --selinux-context=SECLABEL\n"
295 " Set the SELinux security context to be used by\n"
296 " processes in the container\n"
297 " -L --selinux-apifs-context=SECLABEL\n"
298 " Set the SELinux security context to be used by\n"
299 " API/tmpfs file systems in the container\n"
a8828ed9
DW
300 " --capability=CAP In addition to the default, retain specified\n"
301 " capability\n"
302 " --drop-capability=CAP Drop the specified capability from the default set\n"
960e4569
LP
303 " --system-call-filter=LIST|~LIST\n"
304 " Permit/prohibit specific system calls\n"
bf428efb 305 " --rlimit=NAME=LIMIT Set a resource limit for the payload\n"
81f345df
LP
306 " --oom-score-adjust=VALUE\n"
307 " Adjust the OOM score value for the payload\n"
d107bb7d 308 " --cpu-affinity=CPUS Adjust the CPU affinity of the container\n"
c6c8f6e2 309 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
2b26a728
LP
310 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
311 " host, try-guest, try-host\n"
574edc90 312 " -j Equivalent to --link-journal=try-guest\n"
09d423e9 313 " --resolv-conf=MODE Select mode of /etc/resolv.conf initialization\n"
1688841f 314 " --timezone=MODE Select mode of /etc/localtime initialization\n"
69c79d3c 315 " --read-only Mount the root directory read-only\n"
5e5bfa6e
EY
316 " --bind=PATH[:PATH[:OPTIONS]]\n"
317 " Bind mount a file or directory from the host into\n"
a8828ed9 318 " the container\n"
5e5bfa6e
EY
319 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
320 " Similar, but creates a read-only bind mount\n"
de40a303
LP
321 " --inaccessible=PATH Over-mount file node with inaccessible node to mask\n"
322 " it\n"
06c17c39 323 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
5a8af538
LP
324 " --overlay=PATH[:PATH...]:PATH\n"
325 " Create an overlay mount from the host to \n"
326 " the container\n"
327 " --overlay-ro=PATH[:PATH...]:PATH\n"
328 " Similar, but creates a read-only overlay mount\n"
a5f1cb3b 329 " -E --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
eb91eb18 330 " --register=BOOLEAN Register container as machine\n"
89f7c846 331 " --keep-unit Do not register a scope for the machine, reuse\n"
4d9f07b4 332 " the service unit nspawn is running in\n"
6d0b55c2 333 " --volatile[=MODE] Run the system in volatile mode\n"
f757855e 334 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
90b4a64d 335 " --notify-ready=BOOLEAN Receive notifications from the child init process\n"
de40a303
LP
336 " --console=MODE Select how stdin/stdout/stderr and /dev/console are\n"
337 " set up for the container.\n"
338 " -P --pipe Equivalent to --console=pipe\n"
37ec0fdd
LP
339 "\nSee the %s for details.\n"
340 , program_invocation_short_name
341 , link
342 );
343
344 return 0;
88213476
LP
345}
346
86c0dd4a 347static int custom_mount_check_all(void) {
88614c8a 348 size_t i;
5a8af538 349
5a8af538
LP
350 for (i = 0; i < arg_n_custom_mounts; i++) {
351 CustomMount *m = &arg_custom_mounts[i];
352
0de7acce 353 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
baaa35ad
ZJS
354 if (arg_userns_chown)
355 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
356 "--private-users-chown may not be combined with custom root mounts.");
357 else if (arg_uid_shift == UID_INVALID)
358 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
359 "--private-users with automatic UID shift may not be combined with custom root mounts.");
825d5287 360 }
5a8af538
LP
361 }
362
363 return 0;
364}
365
8199d554 366static int detect_unified_cgroup_hierarchy_from_environment(void) {
efdb0237 367 const char *e;
415fc41c 368 int r;
5da38d07 369
efdb0237
LP
370 /* Allow the user to control whether the unified hierarchy is used */
371 e = getenv("UNIFIED_CGROUP_HIERARCHY");
372 if (e) {
373 r = parse_boolean(e);
374 if (r < 0)
375 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
5da38d07
TH
376 if (r > 0)
377 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
378 else
379 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237
LP
380 }
381
8199d554
LP
382 return 0;
383}
384
385static int detect_unified_cgroup_hierarchy_from_image(const char *directory) {
386 int r;
387
388 /* Let's inherit the mode to use from the host system, but let's take into consideration what systemd in the
389 * image actually supports. */
b4cccbc1
LP
390 r = cg_all_unified();
391 if (r < 0)
392 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
393 if (r > 0) {
a8725a06
ZJS
394 /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
395 * routine only detects 231, so we'll have a false negative here for 230. */
396 r = systemd_installation_has_version(directory, 230);
397 if (r < 0)
398 return log_error_errno(r, "Failed to determine systemd version in container: %m");
399 if (r > 0)
400 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
401 else
402 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
c22800e4 403 } else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
2977724b
TH
404 /* Mixed cgroup hierarchy support was added in 233 */
405 r = systemd_installation_has_version(directory, 233);
0fd9563f
ZJS
406 if (r < 0)
407 return log_error_errno(r, "Failed to determine systemd version in container: %m");
408 if (r > 0)
409 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
410 else
411 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
412 } else
5da38d07 413 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237 414
8199d554
LP
415 log_debug("Using %s hierarchy for container.",
416 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_NONE ? "legacy" :
417 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_SYSTEMD ? "hybrid" : "unified");
418
efdb0237
LP
419 return 0;
420}
421
0c582db0
LB
422static void parse_share_ns_env(const char *name, unsigned long ns_flag) {
423 int r;
424
425 r = getenv_bool(name);
426 if (r == -ENXIO)
427 return;
428 if (r < 0)
429 log_warning_errno(r, "Failed to parse %s from environment, defaulting to false.", name);
de40a303 430
0c582db0 431 arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
de40a303 432 arg_settings_mask |= SETTING_CLONE_NS_FLAGS;
0c582db0
LB
433}
434
4f086aab 435static void parse_mount_settings_env(void) {
4f086aab 436 const char *e;
1099ceeb
LP
437 int r;
438
439 r = getenv_bool("SYSTEMD_NSPAWN_TMPFS_TMP");
440 if (r >= 0)
441 SET_FLAG(arg_mount_settings, MOUNT_APPLY_TMPFS_TMP, r > 0);
442 else if (r != -ENXIO)
443 log_warning_errno(r, "Failed to parse $SYSTEMD_NSPAWN_TMPFS_TMP, ignoring: %m");
4f086aab
SU
444
445 e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
446 if (!e)
447 return;
448
449 if (streq(e, "network")) {
450 arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS;
451 return;
452 }
453
454 r = parse_boolean(e);
455 if (r < 0) {
456 log_warning_errno(r, "Failed to parse SYSTEMD_NSPAWN_API_VFS_WRITABLE from environment, ignoring.");
457 return;
ab8ee0f2 458 }
4f086aab 459
ab8ee0f2
ZJS
460 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_RO, r == 0);
461 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_NETNS, false);
4f086aab
SU
462}
463
d5455d2f
LP
464static void parse_environment(void) {
465 const char *e;
466 int r;
467
468 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
469 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
470 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
471 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
472
473 parse_mount_settings_env();
474
489fae52
ZJS
475 /* SYSTEMD_NSPAWN_USE_CGNS=0 can be used to disable CLONE_NEWCGROUP use,
476 * even if it is supported. If not supported, it has no effect. */
de40a303 477 if (!cg_ns_supported())
489fae52 478 arg_use_cgns = false;
de40a303
LP
479 else {
480 r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
481 if (r < 0) {
482 if (r != -ENXIO)
483 log_warning_errno(r, "Failed to parse $SYSTEMD_NSPAWN_USE_CGNS, ignoring: %m");
484
485 arg_use_cgns = true;
486 } else {
487 arg_use_cgns = r > 0;
488 arg_settings_mask |= SETTING_USE_CGNS;
489 }
490 }
d5455d2f
LP
491
492 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
493 if (e)
494 arg_container_service_name = e;
495
496 detect_unified_cgroup_hierarchy_from_environment();
497}
498
88213476 499static int parse_argv(int argc, char *argv[]) {
a41fe3a2 500 enum {
acbeb427
ZJS
501 ARG_VERSION = 0x100,
502 ARG_PRIVATE_NETWORK,
bc2f673e 503 ARG_UUID,
5076f0cc 504 ARG_READ_ONLY,
57fb9fb5 505 ARG_CAPABILITY,
420c7379 506 ARG_DROP_CAPABILITY,
17fe0523
LP
507 ARG_LINK_JOURNAL,
508 ARG_BIND,
f4889f65 509 ARG_BIND_RO,
06c17c39 510 ARG_TMPFS,
5a8af538
LP
511 ARG_OVERLAY,
512 ARG_OVERLAY_RO,
de40a303 513 ARG_INACCESSIBLE,
eb91eb18 514 ARG_SHARE_SYSTEM,
89f7c846 515 ARG_REGISTER,
aa28aefe 516 ARG_KEEP_UNIT,
69c79d3c 517 ARG_NETWORK_INTERFACE,
c74e630d 518 ARG_NETWORK_MACVLAN,
4bbfe7ad 519 ARG_NETWORK_IPVLAN,
ab046dde 520 ARG_NETWORK_BRIDGE,
22b28dfd 521 ARG_NETWORK_ZONE,
f6d6bad1 522 ARG_NETWORK_VETH_EXTRA,
d7bea6b6 523 ARG_NETWORK_NAMESPACE_PATH,
6afc95b7 524 ARG_PERSONALITY,
4d9f07b4 525 ARG_VOLATILE,
ec16945e 526 ARG_TEMPLATE,
f36933fe 527 ARG_PROPERTY,
6dac160c 528 ARG_PRIVATE_USERS,
c6c8f6e2 529 ARG_KILL_SIGNAL,
f757855e 530 ARG_SETTINGS,
5f932eb9 531 ARG_CHDIR,
b53ede69 532 ARG_PIVOT_ROOT,
7336138e 533 ARG_PRIVATE_USERS_CHOWN,
9c1e04d0 534 ARG_NOTIFY_READY,
4623e8e6 535 ARG_ROOT_HASH,
960e4569 536 ARG_SYSTEM_CALL_FILTER,
bf428efb 537 ARG_RLIMIT,
3a9530e5 538 ARG_HOSTNAME,
66edd963 539 ARG_NO_NEW_PRIVILEGES,
81f345df 540 ARG_OOM_SCORE_ADJUST,
d107bb7d 541 ARG_CPU_AFFINITY,
09d423e9 542 ARG_RESOLV_CONF,
1688841f 543 ARG_TIMEZONE,
de40a303
LP
544 ARG_CONSOLE,
545 ARG_PIPE,
546 ARG_OCI_BUNDLE,
a41fe3a2
LP
547 };
548
88213476 549 static const struct option options[] = {
d7bea6b6
DP
550 { "help", no_argument, NULL, 'h' },
551 { "version", no_argument, NULL, ARG_VERSION },
552 { "directory", required_argument, NULL, 'D' },
553 { "template", required_argument, NULL, ARG_TEMPLATE },
554 { "ephemeral", no_argument, NULL, 'x' },
555 { "user", required_argument, NULL, 'u' },
556 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
557 { "as-pid2", no_argument, NULL, 'a' },
558 { "boot", no_argument, NULL, 'b' },
559 { "uuid", required_argument, NULL, ARG_UUID },
560 { "read-only", no_argument, NULL, ARG_READ_ONLY },
561 { "capability", required_argument, NULL, ARG_CAPABILITY },
562 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
66edd963 563 { "no-new-privileges", required_argument, NULL, ARG_NO_NEW_PRIVILEGES },
d7bea6b6
DP
564 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
565 { "bind", required_argument, NULL, ARG_BIND },
566 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
567 { "tmpfs", required_argument, NULL, ARG_TMPFS },
568 { "overlay", required_argument, NULL, ARG_OVERLAY },
569 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
de40a303 570 { "inaccessible", required_argument, NULL, ARG_INACCESSIBLE },
d7bea6b6 571 { "machine", required_argument, NULL, 'M' },
3a9530e5 572 { "hostname", required_argument, NULL, ARG_HOSTNAME },
d7bea6b6
DP
573 { "slice", required_argument, NULL, 'S' },
574 { "setenv", required_argument, NULL, 'E' },
575 { "selinux-context", required_argument, NULL, 'Z' },
576 { "selinux-apifs-context", required_argument, NULL, 'L' },
577 { "quiet", no_argument, NULL, 'q' },
578 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */
579 { "register", required_argument, NULL, ARG_REGISTER },
580 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
581 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
582 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
583 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
584 { "network-veth", no_argument, NULL, 'n' },
585 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA },
586 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
587 { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
588 { "network-namespace-path", required_argument, NULL, ARG_NETWORK_NAMESPACE_PATH },
589 { "personality", required_argument, NULL, ARG_PERSONALITY },
590 { "image", required_argument, NULL, 'i' },
591 { "volatile", optional_argument, NULL, ARG_VOLATILE },
592 { "port", required_argument, NULL, 'p' },
593 { "property", required_argument, NULL, ARG_PROPERTY },
594 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
595 { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN },
596 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
597 { "settings", required_argument, NULL, ARG_SETTINGS },
598 { "chdir", required_argument, NULL, ARG_CHDIR },
599 { "pivot-root", required_argument, NULL, ARG_PIVOT_ROOT },
600 { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
601 { "root-hash", required_argument, NULL, ARG_ROOT_HASH },
602 { "system-call-filter", required_argument, NULL, ARG_SYSTEM_CALL_FILTER },
bf428efb 603 { "rlimit", required_argument, NULL, ARG_RLIMIT },
81f345df 604 { "oom-score-adjust", required_argument, NULL, ARG_OOM_SCORE_ADJUST },
d107bb7d 605 { "cpu-affinity", required_argument, NULL, ARG_CPU_AFFINITY },
09d423e9 606 { "resolv-conf", required_argument, NULL, ARG_RESOLV_CONF },
1688841f 607 { "timezone", required_argument, NULL, ARG_TIMEZONE },
de40a303
LP
608 { "console", required_argument, NULL, ARG_CONSOLE },
609 { "pipe", no_argument, NULL, ARG_PIPE },
610 { "oci-bundle", required_argument, NULL, ARG_OCI_BUNDLE },
eb9da376 611 {}
88213476
LP
612 };
613
9444b1f2 614 int c, r;
d5455d2f 615 const char *p;
a42c8b54 616 uint64_t plus = 0, minus = 0;
f757855e 617 bool mask_all_settings = false, mask_no_settings = false;
88213476
LP
618
619 assert(argc >= 0);
620 assert(argv);
621
de40a303 622 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nUE:P", options, NULL)) >= 0)
88213476
LP
623 switch (c) {
624
625 case 'h':
37ec0fdd 626 return help();
88213476 627
acbeb427 628 case ARG_VERSION:
3f6fd1ba 629 return version();
acbeb427 630
88213476 631 case 'D':
0f03c2a4 632 r = parse_path_argument_and_warn(optarg, false, &arg_directory);
ec16945e 633 if (r < 0)
0f03c2a4 634 return r;
de40a303
LP
635
636 arg_settings_mask |= SETTING_DIRECTORY;
ec16945e
LP
637 break;
638
639 case ARG_TEMPLATE:
0f03c2a4 640 r = parse_path_argument_and_warn(optarg, false, &arg_template);
ec16945e 641 if (r < 0)
0f03c2a4 642 return r;
de40a303
LP
643
644 arg_settings_mask |= SETTING_DIRECTORY;
88213476
LP
645 break;
646
1b9e5b12 647 case 'i':
0f03c2a4 648 r = parse_path_argument_and_warn(optarg, false, &arg_image);
ec16945e 649 if (r < 0)
0f03c2a4 650 return r;
de40a303
LP
651
652 arg_settings_mask |= SETTING_DIRECTORY;
653 break;
654
655 case ARG_OCI_BUNDLE:
656 r = parse_path_argument_and_warn(optarg, false, &arg_oci_bundle);
657 if (r < 0)
658 return r;
659
ec16945e
LP
660 break;
661
662 case 'x':
663 arg_ephemeral = true;
a2f577fc 664 arg_settings_mask |= SETTING_EPHEMERAL;
1b9e5b12
LP
665 break;
666
687d0825 667 case 'u':
2fc09a9c
DM
668 r = free_and_strdup(&arg_user, optarg);
669 if (r < 0)
7027ff61 670 return log_oom();
687d0825 671
f757855e 672 arg_settings_mask |= SETTING_USER;
687d0825
MV
673 break;
674
22b28dfd
LP
675 case ARG_NETWORK_ZONE: {
676 char *j;
677
678 j = strappend("vz-", optarg);
679 if (!j)
680 return log_oom();
681
682 if (!ifname_valid(j)) {
683 log_error("Network zone name not valid: %s", j);
684 free(j);
685 return -EINVAL;
686 }
687
df1fac6d 688 free_and_replace(arg_network_zone, j);
22b28dfd
LP
689
690 arg_network_veth = true;
691 arg_private_network = true;
692 arg_settings_mask |= SETTING_NETWORK;
693 break;
694 }
695
ab046dde 696 case ARG_NETWORK_BRIDGE:
ef76dff2 697
baaa35ad
ZJS
698 if (!ifname_valid(optarg))
699 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
700 "Bridge interface name not valid: %s", optarg);
ef76dff2 701
f757855e
LP
702 r = free_and_strdup(&arg_network_bridge, optarg);
703 if (r < 0)
704 return log_oom();
ab046dde 705
4831981d 706 _fallthrough_;
0dfaa006 707 case 'n':
69c79d3c
LP
708 arg_network_veth = true;
709 arg_private_network = true;
f757855e 710 arg_settings_mask |= SETTING_NETWORK;
69c79d3c
LP
711 break;
712
f6d6bad1
LP
713 case ARG_NETWORK_VETH_EXTRA:
714 r = veth_extra_parse(&arg_network_veth_extra, optarg);
715 if (r < 0)
716 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
717
718 arg_private_network = true;
719 arg_settings_mask |= SETTING_NETWORK;
720 break;
721
aa28aefe 722 case ARG_NETWORK_INTERFACE:
baaa35ad
ZJS
723 if (!ifname_valid(optarg))
724 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
725 "Network interface name not valid: %s", optarg);
ef76dff2 726
c74e630d
LP
727 if (strv_extend(&arg_network_interfaces, optarg) < 0)
728 return log_oom();
729
730 arg_private_network = true;
f757855e 731 arg_settings_mask |= SETTING_NETWORK;
c74e630d
LP
732 break;
733
734 case ARG_NETWORK_MACVLAN:
ef76dff2 735
baaa35ad
ZJS
736 if (!ifname_valid(optarg))
737 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
738 "MACVLAN network interface name not valid: %s", optarg);
ef76dff2 739
c74e630d 740 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
741 return log_oom();
742
4bbfe7ad 743 arg_private_network = true;
f757855e 744 arg_settings_mask |= SETTING_NETWORK;
4bbfe7ad
TG
745 break;
746
747 case ARG_NETWORK_IPVLAN:
ef76dff2 748
baaa35ad
ZJS
749 if (!ifname_valid(optarg))
750 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
751 "IPVLAN network interface name not valid: %s", optarg);
ef76dff2 752
4bbfe7ad
TG
753 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
754 return log_oom();
755
4831981d 756 _fallthrough_;
ff01d048
LP
757 case ARG_PRIVATE_NETWORK:
758 arg_private_network = true;
f757855e 759 arg_settings_mask |= SETTING_NETWORK;
a41fe3a2
LP
760 break;
761
d7bea6b6
DP
762 case ARG_NETWORK_NAMESPACE_PATH:
763 r = parse_path_argument_and_warn(optarg, false, &arg_network_namespace_path);
764 if (r < 0)
765 return r;
766
de40a303 767 arg_settings_mask |= SETTING_NETWORK;
d7bea6b6
DP
768 break;
769
0f0dbc46 770 case 'b':
baaa35ad
ZJS
771 if (arg_start_mode == START_PID2)
772 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
773 "--boot and --as-pid2 may not be combined.");
7732f92b
LP
774
775 arg_start_mode = START_BOOT;
776 arg_settings_mask |= SETTING_START_MODE;
777 break;
778
779 case 'a':
baaa35ad
ZJS
780 if (arg_start_mode == START_BOOT)
781 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
782 "--boot and --as-pid2 may not be combined.");
7732f92b
LP
783
784 arg_start_mode = START_PID2;
785 arg_settings_mask |= SETTING_START_MODE;
0f0dbc46
LP
786 break;
787
144f0fc0 788 case ARG_UUID:
9444b1f2 789 r = sd_id128_from_string(optarg, &arg_uuid);
317feb4d
LP
790 if (r < 0)
791 return log_error_errno(r, "Invalid UUID: %s", optarg);
792
baaa35ad
ZJS
793 if (sd_id128_is_null(arg_uuid))
794 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
795 "Machine UUID may not be all zeroes.");
f757855e
LP
796
797 arg_settings_mask |= SETTING_MACHINE_ID;
9444b1f2 798 break;
aa96c6cb 799
9444b1f2 800 case 'S':
de40a303
LP
801 r = free_and_strdup(&arg_slice, optarg);
802 if (r < 0)
803 return log_oom();
804
805 arg_settings_mask |= SETTING_SLICE;
144f0fc0
LP
806 break;
807
7027ff61 808 case 'M':
c1521918 809 if (isempty(optarg))
97b11eed 810 arg_machine = mfree(arg_machine);
c1521918 811 else {
baaa35ad
ZJS
812 if (!machine_name_is_valid(optarg))
813 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
814 "Invalid machine name: %s", optarg);
7027ff61 815
0c3c4284
LP
816 r = free_and_strdup(&arg_machine, optarg);
817 if (r < 0)
eb91eb18 818 return log_oom();
eb91eb18 819 }
9ce6d1b3 820 break;
7027ff61 821
3a9530e5
LP
822 case ARG_HOSTNAME:
823 if (isempty(optarg))
824 arg_hostname = mfree(arg_hostname);
825 else {
baaa35ad
ZJS
826 if (!hostname_is_valid(optarg, false))
827 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
828 "Invalid hostname: %s", optarg);
3a9530e5
LP
829
830 r = free_and_strdup(&arg_hostname, optarg);
831 if (r < 0)
832 return log_oom();
833 }
834
835 arg_settings_mask |= SETTING_HOSTNAME;
836 break;
837
82adf6af
LP
838 case 'Z':
839 arg_selinux_context = optarg;
a8828ed9
DW
840 break;
841
82adf6af
LP
842 case 'L':
843 arg_selinux_apifs_context = optarg;
a8828ed9
DW
844 break;
845
bc2f673e
LP
846 case ARG_READ_ONLY:
847 arg_read_only = true;
f757855e 848 arg_settings_mask |= SETTING_READ_ONLY;
bc2f673e
LP
849 break;
850
420c7379
LP
851 case ARG_CAPABILITY:
852 case ARG_DROP_CAPABILITY: {
6cbe4ed1 853 p = optarg;
9ed794a3 854 for (;;) {
6cbe4ed1 855 _cleanup_free_ char *t = NULL;
5076f0cc 856
6cbe4ed1
SS
857 r = extract_first_word(&p, &t, ",", 0);
858 if (r < 0)
859 return log_error_errno(r, "Failed to parse capability %s.", t);
6cbe4ed1
SS
860 if (r == 0)
861 break;
5076f0cc 862
39ed67d1
LP
863 if (streq(t, "all")) {
864 if (c == ARG_CAPABILITY)
a42c8b54 865 plus = (uint64_t) -1;
39ed67d1 866 else
a42c8b54 867 minus = (uint64_t) -1;
39ed67d1 868 } else {
acf4d158
YW
869 r = capability_from_name(t);
870 if (r < 0)
871 return log_error_errno(r, "Failed to parse capability %s.", t);
39ed67d1
LP
872
873 if (c == ARG_CAPABILITY)
acf4d158 874 plus |= 1ULL << r;
39ed67d1 875 else
acf4d158 876 minus |= 1ULL << r;
5076f0cc 877 }
5076f0cc
LP
878 }
879
f757855e 880 arg_settings_mask |= SETTING_CAPABILITY;
5076f0cc
LP
881 break;
882 }
883
66edd963
LP
884 case ARG_NO_NEW_PRIVILEGES:
885 r = parse_boolean(optarg);
886 if (r < 0)
887 return log_error_errno(r, "Failed to parse --no-new-privileges= argument: %s", optarg);
888
889 arg_no_new_privileges = r;
890 arg_settings_mask |= SETTING_NO_NEW_PRIVILEGES;
891 break;
892
57fb9fb5
LP
893 case 'j':
894 arg_link_journal = LINK_GUEST;
574edc90 895 arg_link_journal_try = true;
4e1d6aa9 896 arg_settings_mask |= SETTING_LINK_JOURNAL;
57fb9fb5
LP
897 break;
898
899 case ARG_LINK_JOURNAL:
4e1d6aa9
LP
900 r = parse_link_journal(optarg, &arg_link_journal, &arg_link_journal_try);
901 if (r < 0) {
902 log_error_errno(r, "Failed to parse link journal mode %s", optarg);
57fb9fb5
LP
903 return -EINVAL;
904 }
905
4e1d6aa9 906 arg_settings_mask |= SETTING_LINK_JOURNAL;
57fb9fb5
LP
907 break;
908
17fe0523 909 case ARG_BIND:
f757855e
LP
910 case ARG_BIND_RO:
911 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
912 if (r < 0)
913 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
17fe0523 914
f757855e 915 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
17fe0523 916 break;
06c17c39 917
f757855e
LP
918 case ARG_TMPFS:
919 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
920 if (r < 0)
921 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
5a8af538 922
f757855e 923 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
5a8af538 924 break;
5a8af538
LP
925
926 case ARG_OVERLAY:
ad85779a
LP
927 case ARG_OVERLAY_RO:
928 r = overlay_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_OVERLAY_RO);
929 if (r == -EADDRNOTAVAIL)
930 return log_error_errno(r, "--overlay(-ro)= needs at least two colon-separated directories specified.");
931 if (r < 0)
932 return log_error_errno(r, "Failed to parse --overlay(-ro)= argument %s: %m", optarg);
06c17c39 933
f757855e 934 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
06c17c39 935 break;
06c17c39 936
de40a303
LP
937 case ARG_INACCESSIBLE:
938 r = inaccessible_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
939 if (r < 0)
940 return log_error_errno(r, "Failed to parse --inaccessible= argument %s: %m", optarg);
941
942 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
943 break;
944
a5f1cb3b 945 case 'E': {
f4889f65
LP
946 char **n;
947
baaa35ad
ZJS
948 if (!env_assignment_is_valid(optarg))
949 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
950 "Environment variable assignment '%s' is not valid.", optarg);
f4889f65
LP
951
952 n = strv_env_set(arg_setenv, optarg);
953 if (!n)
954 return log_oom();
955
130d3d22 956 strv_free_and_replace(arg_setenv, n);
f757855e 957 arg_settings_mask |= SETTING_ENVIRONMENT;
f4889f65
LP
958 break;
959 }
960
284c0b91
LP
961 case 'q':
962 arg_quiet = true;
963 break;
964
8a96d94e 965 case ARG_SHARE_SYSTEM:
a6b5216c 966 /* We don't officially support this anymore, except for compat reasons. People should use the
0c582db0 967 * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
63d1c29f 968 log_warning("Please do not use --share-system anymore, use $SYSTEMD_NSPAWN_SHARE_* instead.");
0c582db0 969 arg_clone_ns_flags = 0;
8a96d94e
LP
970 break;
971
eb91eb18
LP
972 case ARG_REGISTER:
973 r = parse_boolean(optarg);
974 if (r < 0) {
975 log_error("Failed to parse --register= argument: %s", optarg);
976 return r;
977 }
978
979 arg_register = r;
980 break;
981
89f7c846
LP
982 case ARG_KEEP_UNIT:
983 arg_keep_unit = true;
984 break;
985
6afc95b7
LP
986 case ARG_PERSONALITY:
987
ac45f971 988 arg_personality = personality_from_string(optarg);
baaa35ad
ZJS
989 if (arg_personality == PERSONALITY_INVALID)
990 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
991 "Unknown or unsupported personality '%s'.", optarg);
6afc95b7 992
f757855e 993 arg_settings_mask |= SETTING_PERSONALITY;
6afc95b7
LP
994 break;
995
4d9f07b4
LP
996 case ARG_VOLATILE:
997
998 if (!optarg)
f757855e 999 arg_volatile_mode = VOLATILE_YES;
5c828e66
LP
1000 else if (streq(optarg, "help")) {
1001 DUMP_STRING_TABLE(volatile_mode, VolatileMode, _VOLATILE_MODE_MAX);
1002 return 0;
1003 } else {
f757855e 1004 VolatileMode m;
4d9f07b4 1005
f757855e 1006 m = volatile_mode_from_string(optarg);
baaa35ad
ZJS
1007 if (m < 0)
1008 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1009 "Failed to parse --volatile= argument: %s", optarg);
1010 else
f757855e 1011 arg_volatile_mode = m;
6d0b55c2
LP
1012 }
1013
f757855e
LP
1014 arg_settings_mask |= SETTING_VOLATILE_MODE;
1015 break;
6d0b55c2 1016
f757855e
LP
1017 case 'p':
1018 r = expose_port_parse(&arg_expose_ports, optarg);
1019 if (r == -EEXIST)
1020 return log_error_errno(r, "Duplicate port specification: %s", optarg);
1021 if (r < 0)
1022 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
6d0b55c2 1023
f757855e 1024 arg_settings_mask |= SETTING_EXPOSE_PORTS;
6d0b55c2 1025 break;
6d0b55c2 1026
f36933fe
LP
1027 case ARG_PROPERTY:
1028 if (strv_extend(&arg_property, optarg) < 0)
1029 return log_oom();
1030
1031 break;
1032
ae209204
ZJS
1033 case ARG_PRIVATE_USERS: {
1034 int boolean = -1;
0de7acce 1035
ae209204
ZJS
1036 if (!optarg)
1037 boolean = true;
1038 else if (!in_charset(optarg, DIGITS))
1039 /* do *not* parse numbers as booleans */
1040 boolean = parse_boolean(optarg);
1041
1042 if (boolean == false) {
0de7acce
LP
1043 /* no: User namespacing off */
1044 arg_userns_mode = USER_NAMESPACE_NO;
1045 arg_uid_shift = UID_INVALID;
1046 arg_uid_range = UINT32_C(0x10000);
ae209204 1047 } else if (boolean == true) {
0de7acce
LP
1048 /* yes: User namespacing on, UID range is read from root dir */
1049 arg_userns_mode = USER_NAMESPACE_FIXED;
1050 arg_uid_shift = UID_INVALID;
1051 arg_uid_range = UINT32_C(0x10000);
1052 } else if (streq(optarg, "pick")) {
1053 /* pick: User namespacing on, UID range is picked randomly */
1054 arg_userns_mode = USER_NAMESPACE_PICK;
1055 arg_uid_shift = UID_INVALID;
1056 arg_uid_range = UINT32_C(0x10000);
1057 } else {
6c2058b3 1058 _cleanup_free_ char *buffer = NULL;
6dac160c
LP
1059 const char *range, *shift;
1060
0de7acce
LP
1061 /* anything else: User namespacing on, UID range is explicitly configured */
1062
6dac160c
LP
1063 range = strchr(optarg, ':');
1064 if (range) {
6c2058b3
ZJS
1065 buffer = strndup(optarg, range - optarg);
1066 if (!buffer)
1067 return log_oom();
1068 shift = buffer;
6dac160c
LP
1069
1070 range++;
bfd292ec
ZJS
1071 r = safe_atou32(range, &arg_uid_range);
1072 if (r < 0)
be715731 1073 return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range);
6dac160c
LP
1074 } else
1075 shift = optarg;
1076
be715731
ZJS
1077 r = parse_uid(shift, &arg_uid_shift);
1078 if (r < 0)
1079 return log_error_errno(r, "Failed to parse UID \"%s\": %m", optarg);
0de7acce
LP
1080
1081 arg_userns_mode = USER_NAMESPACE_FIXED;
6dac160c
LP
1082 }
1083
baaa35ad
ZJS
1084 if (arg_uid_range <= 0)
1085 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1086 "UID range cannot be 0.");
be715731 1087
0de7acce 1088 arg_settings_mask |= SETTING_USERNS;
6dac160c 1089 break;
ae209204 1090 }
6dac160c 1091
0de7acce 1092 case 'U':
ccabee0d
LP
1093 if (userns_supported()) {
1094 arg_userns_mode = USER_NAMESPACE_PICK;
1095 arg_uid_shift = UID_INVALID;
1096 arg_uid_range = UINT32_C(0x10000);
1097
1098 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
1099 }
1100
7336138e
LP
1101 break;
1102
0de7acce 1103 case ARG_PRIVATE_USERS_CHOWN:
19aac838 1104 arg_userns_chown = true;
0de7acce
LP
1105
1106 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
1107 break;
1108
c6c8f6e2 1109 case ARG_KILL_SIGNAL:
5c828e66
LP
1110 if (streq(optarg, "help")) {
1111 DUMP_STRING_TABLE(signal, int, _NSIG);
1112 return 0;
1113 }
1114
29a3db75 1115 arg_kill_signal = signal_from_string(optarg);
baaa35ad
ZJS
1116 if (arg_kill_signal < 0)
1117 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1118 "Cannot parse signal: %s", optarg);
c6c8f6e2 1119
f757855e
LP
1120 arg_settings_mask |= SETTING_KILL_SIGNAL;
1121 break;
1122
1123 case ARG_SETTINGS:
1124
1125 /* no → do not read files
1126 * yes → read files, do not override cmdline, trust only subset
1127 * override → read files, override cmdline, trust only subset
1128 * trusted → read files, do not override cmdline, trust all
1129 */
1130
1131 r = parse_boolean(optarg);
1132 if (r < 0) {
1133 if (streq(optarg, "trusted")) {
1134 mask_all_settings = false;
1135 mask_no_settings = false;
1136 arg_settings_trusted = true;
1137
1138 } else if (streq(optarg, "override")) {
1139 mask_all_settings = false;
1140 mask_no_settings = true;
1141 arg_settings_trusted = -1;
1142 } else
1143 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
1144 } else if (r > 0) {
1145 /* yes */
1146 mask_all_settings = false;
1147 mask_no_settings = false;
1148 arg_settings_trusted = -1;
1149 } else {
1150 /* no */
1151 mask_all_settings = true;
1152 mask_no_settings = false;
1153 arg_settings_trusted = false;
1154 }
1155
c6c8f6e2
LP
1156 break;
1157
5f932eb9 1158 case ARG_CHDIR:
baaa35ad
ZJS
1159 if (!path_is_absolute(optarg))
1160 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1161 "Working directory %s is not an absolute path.", optarg);
5f932eb9
LP
1162
1163 r = free_and_strdup(&arg_chdir, optarg);
1164 if (r < 0)
1165 return log_oom();
1166
1167 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
1168 break;
1169
b53ede69
PW
1170 case ARG_PIVOT_ROOT:
1171 r = pivot_root_parse(&arg_pivot_root_new, &arg_pivot_root_old, optarg);
1172 if (r < 0)
1173 return log_error_errno(r, "Failed to parse --pivot-root= argument %s: %m", optarg);
1174
1175 arg_settings_mask |= SETTING_PIVOT_ROOT;
1176 break;
1177
9c1e04d0
AP
1178 case ARG_NOTIFY_READY:
1179 r = parse_boolean(optarg);
baaa35ad
ZJS
1180 if (r < 0)
1181 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1182 "%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
9c1e04d0
AP
1183 arg_notify_ready = r;
1184 arg_settings_mask |= SETTING_NOTIFY_READY;
1185 break;
1186
4623e8e6
LP
1187 case ARG_ROOT_HASH: {
1188 void *k;
1189 size_t l;
1190
1191 r = unhexmem(optarg, strlen(optarg), &k, &l);
1192 if (r < 0)
1193 return log_error_errno(r, "Failed to parse root hash: %s", optarg);
1194 if (l < sizeof(sd_id128_t)) {
1195 log_error("Root hash must be at least 128bit long: %s", optarg);
1196 free(k);
1197 return -EINVAL;
1198 }
1199
1200 free(arg_root_hash);
1201 arg_root_hash = k;
1202 arg_root_hash_size = l;
1203 break;
1204 }
1205
960e4569
LP
1206 case ARG_SYSTEM_CALL_FILTER: {
1207 bool negative;
1208 const char *items;
1209
1210 negative = optarg[0] == '~';
1211 items = negative ? optarg + 1 : optarg;
1212
1213 for (;;) {
1214 _cleanup_free_ char *word = NULL;
1215
1216 r = extract_first_word(&items, &word, NULL, 0);
1217 if (r == 0)
1218 break;
1219 if (r == -ENOMEM)
1220 return log_oom();
1221 if (r < 0)
1222 return log_error_errno(r, "Failed to parse system call filter: %m");
1223
1224 if (negative)
1225 r = strv_extend(&arg_syscall_blacklist, word);
1226 else
1227 r = strv_extend(&arg_syscall_whitelist, word);
1228 if (r < 0)
1229 return log_oom();
1230 }
1231
1232 arg_settings_mask |= SETTING_SYSCALL_FILTER;
1233 break;
1234 }
1235
bf428efb
LP
1236 case ARG_RLIMIT: {
1237 const char *eq;
1238 char *name;
1239 int rl;
1240
5c828e66
LP
1241 if (streq(optarg, "help")) {
1242 DUMP_STRING_TABLE(rlimit, int, _RLIMIT_MAX);
1243 return 0;
1244 }
1245
bf428efb 1246 eq = strchr(optarg, '=');
baaa35ad
ZJS
1247 if (!eq)
1248 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1249 "--rlimit= expects an '=' assignment.");
bf428efb
LP
1250
1251 name = strndup(optarg, eq - optarg);
1252 if (!name)
1253 return log_oom();
1254
1255 rl = rlimit_from_string_harder(name);
baaa35ad
ZJS
1256 if (rl < 0)
1257 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1258 "Unknown resource limit: %s", name);
bf428efb
LP
1259
1260 if (!arg_rlimit[rl]) {
1261 arg_rlimit[rl] = new0(struct rlimit, 1);
1262 if (!arg_rlimit[rl])
1263 return log_oom();
1264 }
1265
1266 r = rlimit_parse(rl, eq + 1, arg_rlimit[rl]);
1267 if (r < 0)
1268 return log_error_errno(r, "Failed to parse resource limit: %s", eq + 1);
1269
1270 arg_settings_mask |= SETTING_RLIMIT_FIRST << rl;
1271 break;
1272 }
1273
81f345df
LP
1274 case ARG_OOM_SCORE_ADJUST:
1275 r = parse_oom_score_adjust(optarg, &arg_oom_score_adjust);
1276 if (r < 0)
1277 return log_error_errno(r, "Failed to parse --oom-score-adjust= parameter: %s", optarg);
1278
1279 arg_oom_score_adjust_set = true;
1280 arg_settings_mask |= SETTING_OOM_SCORE_ADJUST;
1281 break;
1282
d107bb7d
LP
1283 case ARG_CPU_AFFINITY: {
1284 _cleanup_cpu_free_ cpu_set_t *cpuset = NULL;
1285
1286 r = parse_cpu_set(optarg, &cpuset);
1287 if (r < 0)
1288 return log_error_errno(r, "Failed to parse CPU affinity mask: %s", optarg);
1289
1290 if (arg_cpuset)
1291 CPU_FREE(arg_cpuset);
1292
1293 arg_cpuset = TAKE_PTR(cpuset);
1294 arg_cpuset_ncpus = r;
1295 arg_settings_mask |= SETTING_CPU_AFFINITY;
1296 break;
1297 }
1298
09d423e9
LP
1299 case ARG_RESOLV_CONF:
1300 if (streq(optarg, "help")) {
1301 DUMP_STRING_TABLE(resolv_conf_mode, ResolvConfMode, _RESOLV_CONF_MODE_MAX);
1302 return 0;
1303 }
1304
1305 arg_resolv_conf = resolv_conf_mode_from_string(optarg);
baaa35ad
ZJS
1306 if (arg_resolv_conf < 0)
1307 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1308 "Failed to parse /etc/resolv.conf mode: %s", optarg);
09d423e9
LP
1309
1310 arg_settings_mask |= SETTING_RESOLV_CONF;
1311 break;
1312
1688841f
LP
1313 case ARG_TIMEZONE:
1314 if (streq(optarg, "help")) {
1315 DUMP_STRING_TABLE(timezone_mode, TimezoneMode, _TIMEZONE_MODE_MAX);
1316 return 0;
1317 }
1318
1319 arg_timezone = timezone_mode_from_string(optarg);
baaa35ad
ZJS
1320 if (arg_timezone < 0)
1321 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1322 "Failed to parse /etc/localtime mode: %s", optarg);
1688841f
LP
1323
1324 arg_settings_mask |= SETTING_TIMEZONE;
1325 break;
1326
de40a303
LP
1327 case ARG_CONSOLE:
1328 if (streq(optarg, "interactive"))
1329 arg_console_mode = CONSOLE_INTERACTIVE;
1330 else if (streq(optarg, "read-only"))
1331 arg_console_mode = CONSOLE_READ_ONLY;
1332 else if (streq(optarg, "passive"))
1333 arg_console_mode = CONSOLE_PASSIVE;
1334 else if (streq(optarg, "pipe"))
1335 arg_console_mode = CONSOLE_PIPE;
1336 else if (streq(optarg, "help"))
1337 puts("interactive\n"
1338 "read-only\n"
1339 "passive\n"
1340 "pipe");
1341 else {
1342 log_error("Unknown console mode: %s", optarg);
1343 return -EINVAL;
1344 }
1345
1346 arg_settings_mask |= SETTING_CONSOLE_MODE;
1347 break;
1348
1349 case 'P':
1350 case ARG_PIPE:
1351 arg_console_mode = CONSOLE_PIPE;
1352 arg_settings_mask |= SETTING_CONSOLE_MODE;
1353 break;
1354
88213476
LP
1355 case '?':
1356 return -EINVAL;
1357
1358 default:
eb9da376 1359 assert_not_reached("Unhandled option");
88213476 1360 }
88213476 1361
60f1ec13
LP
1362 if (argc > optind) {
1363 strv_free(arg_parameters);
1364 arg_parameters = strv_copy(argv + optind);
1365 if (!arg_parameters)
1366 return log_oom();
d7bea6b6 1367
60f1ec13
LP
1368 arg_settings_mask |= SETTING_START_MODE;
1369 }
1370
1371 if (arg_ephemeral && arg_template && !arg_directory)
1372 /* User asked for ephemeral execution but specified --template= instead of --directory=. Semantically
1373 * such an invocation makes some sense, see https://github.com/systemd/systemd/issues/3667. Let's
1374 * accept this here, and silently make "--ephemeral --template=" equivalent to "--ephemeral
1375 * --directory=". */
1376 arg_directory = TAKE_PTR(arg_template);
1377
bd4b15f2 1378 arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? UINT64_C(1) << CAP_NET_ADMIN : 0)) & ~minus;
60f1ec13 1379
de40a303
LP
1380 /* Make sure to parse environment before we reset the settings mask below */
1381 parse_environment();
1382
60f1ec13
LP
1383 /* Load all settings from .nspawn files */
1384 if (mask_no_settings)
1385 arg_settings_mask = 0;
1386
1387 /* Don't load any settings from .nspawn files */
1388 if (mask_all_settings)
1389 arg_settings_mask = _SETTINGS_MASK_ALL;
1390
1391 return 1;
1392}
1393
1394static int verify_arguments(void) {
1395 int r;
a6b5216c 1396
4f086aab
SU
1397 if (arg_userns_mode != USER_NAMESPACE_NO)
1398 arg_mount_settings |= MOUNT_USE_USERNS;
1399
1400 if (arg_private_network)
1401 arg_mount_settings |= MOUNT_APPLY_APIVFS_NETNS;
1402
48a8d337
LB
1403 if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
1404 !(arg_clone_ns_flags & CLONE_NEWUTS)) {
eb91eb18 1405 arg_register = false;
baaa35ad 1406 if (arg_start_mode != START_PID1)
60f1ec13 1407 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--boot cannot be used without namespacing.");
0c582db0 1408 }
eb91eb18 1409
0de7acce 1410 if (arg_userns_mode == USER_NAMESPACE_PICK)
0e7ac751
LP
1411 arg_userns_chown = true;
1412
60f1ec13
LP
1413 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
1414 arg_kill_signal = SIGRTMIN+3;
1415
e5a4bb0d
LP
1416 if (arg_volatile_mode != VOLATILE_NO) /* Make sure all file systems contained in the image are mounted read-only if we are in volatile mode */
1417 arg_read_only = true;
1418
baaa35ad 1419 if (arg_keep_unit && arg_register && cg_pid_get_owner_uid(0, NULL) >= 0)
8d9c2bca
AJ
1420 /* Save the user from accidentally registering either user-$SESSION.scope or user@.service.
1421 * The latter is not technically a user session, but we don't need to labour the point. */
60f1ec13 1422 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--keep-unit --register=yes may not be used when invoked from a user session.");
89f7c846 1423
baaa35ad 1424 if (arg_directory && arg_image)
60f1ec13 1425 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--directory= and --image= may not be combined.");
1b9e5b12 1426
baaa35ad 1427 if (arg_template && arg_image)
60f1ec13 1428 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= and --image= may not be combined.");
8cd328d8 1429
baaa35ad 1430 if (arg_template && !(arg_directory || arg_machine))
60f1ec13 1431 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= needs --directory= or --machine=.");
ec16945e 1432
baaa35ad 1433 if (arg_ephemeral && arg_template)
60f1ec13 1434 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --template= may not be combined.");
ec16945e 1435
baaa35ad 1436 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO))
60f1ec13 1437 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --link-journal= may not be combined.");
df9a75e4 1438
baaa35ad 1439 if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported())
60f1ec13 1440 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--private-users= is not supported, kernel compiled without user namespace support.");
7336138e 1441
baaa35ad 1442 if (arg_userns_chown && arg_read_only)
de40a303
LP
1443 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1444 "--read-only and --private-users-chown may not be combined.");
f757855e 1445
e5a4bb0d
LP
1446 /* We don't support --private-users-chown together with any of the volatile modes since we couldn't
1447 * change the read-only part of the tree (i.e. /usr) anyway, or because it would trigger a massive
1448 * copy-up (in case of overlay) making the entire excercise pointless. */
1449 if (arg_userns_chown && arg_volatile_mode != VOLATILE_NO)
1450 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--volatile= and --private-users-chown may not be combined.");
1451
de40a303
LP
1452 /* If --network-namespace-path is given with any other network-related option, we need to error out,
1453 * to avoid conflicts between different network options. */
60f1ec13
LP
1454 if (arg_network_namespace_path &&
1455 (arg_network_interfaces || arg_network_macvlan ||
1456 arg_network_ipvlan || arg_network_veth_extra ||
1457 arg_network_bridge || arg_network_zone ||
1458 arg_network_veth || arg_private_network))
de40a303 1459 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--network-namespace-path= cannot be combined with other network options.");
86c0dd4a 1460
60f1ec13 1461 if (arg_network_bridge && arg_network_zone)
de40a303
LP
1462 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1463 "--network-bridge= and --network-zone= may not be combined.");
f757855e 1464
baaa35ad 1465 if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network)
60f1ec13 1466 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
4f086aab 1467
baaa35ad 1468 if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO))
60f1ec13 1469 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot combine --private-users with read-write mounts.");
f757855e 1470
baaa35ad 1471 if (arg_expose_ports && !arg_private_network)
60f1ec13 1472 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot use --port= without private networking.");
6d0b55c2 1473
349cc4a5 1474#if ! HAVE_LIBIPTC
baaa35ad 1475 if (arg_expose_ports)
60f1ec13 1476 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--port= is not supported, compiled without libiptc support.");
1c1ea217
EV
1477#endif
1478
60f1ec13
LP
1479 r = custom_mount_check_all();
1480 if (r < 0)
1481 return r;
c6c8f6e2 1482
f757855e 1483 return 0;
88213476
LP
1484}
1485
03cfe0d5
LP
1486static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1487 assert(p);
1488
0de7acce 1489 if (arg_userns_mode == USER_NAMESPACE_NO)
03cfe0d5
LP
1490 return 0;
1491
1492 if (uid == UID_INVALID && gid == GID_INVALID)
1493 return 0;
1494
1495 if (uid != UID_INVALID) {
1496 uid += arg_uid_shift;
1497
1498 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1499 return -EOVERFLOW;
1500 }
1501
1502 if (gid != GID_INVALID) {
1503 gid += (gid_t) arg_uid_shift;
1504
1505 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1506 return -EOVERFLOW;
1507 }
1508
1509 if (lchown(p, uid, gid) < 0)
1510 return -errno;
b12afc8c
LP
1511
1512 return 0;
1513}
1514
03cfe0d5
LP
/* Creates directory 'path' below 'root' with the given mode and, if it was newly created, adjusts its
 * ownership via userns_lchown(). An already existing directory is left untouched and counts as success
 * (0). Any other mkdir failure is returned as is. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;
        int r;

        full = prefix_roota(root, path);

        r = mkdir_errno_wrapper(full, mode);
        if (r >= 0)
                /* Freshly created, hence fix up ownership */
                return userns_lchown(full, uid, gid);

        return r == -EEXIST ? 0 : r;
}
1528
/* Matches 'path' against the two known zoneinfo directory prefixes and returns what
 * PATH_STARTSWITH_SET() yields. Callers treat the result as the zone name and check it for NULL, so
 * presumably it is the remainder after the matched prefix, or NULL if neither prefix matches. */
static const char *timezone_from_path(const char *path) {
        const char *zone;

        zone = PATH_STARTSWITH_SET(
                        path,
                        "../usr/share/zoneinfo/",
                        "/usr/share/zoneinfo/");

        return zone;
}
1535
83205269
LP
1536static bool etc_writable(void) {
1537 return !arg_read_only || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_OVERLAY);
1538}
1539
e58a1277 1540static int setup_timezone(const char *dest) {
1688841f
LP
1541 _cleanup_free_ char *p = NULL, *etc = NULL;
1542 const char *where, *check;
1543 TimezoneMode m;
d4036145 1544 int r;
f8440af5 1545
e58a1277
LP
1546 assert(dest);
1547
1688841f 1548 if (IN_SET(arg_timezone, TIMEZONE_AUTO, TIMEZONE_SYMLINK)) {
1688841f
LP
1549 r = readlink_malloc("/etc/localtime", &p);
1550 if (r == -ENOENT && arg_timezone == TIMEZONE_AUTO)
83205269 1551 m = etc_writable() ? TIMEZONE_DELETE : TIMEZONE_OFF;
1688841f 1552 else if (r == -EINVAL && arg_timezone == TIMEZONE_AUTO) /* regular file? */
83205269 1553 m = etc_writable() ? TIMEZONE_COPY : TIMEZONE_BIND;
1688841f
LP
1554 else if (r < 0) {
1555 log_warning_errno(r, "Failed to read host's /etc/localtime symlink, not updating container timezone: %m");
1556 /* To handle warning, delete /etc/localtime and replace it with a symbolic link to a time zone data
1557 * file.
1558 *
1559 * Example:
1560 * ln -s /usr/share/zoneinfo/UTC /etc/localtime
1561 */
1562 return 0;
1563 } else if (arg_timezone == TIMEZONE_AUTO)
83205269 1564 m = etc_writable() ? TIMEZONE_SYMLINK : TIMEZONE_BIND;
1688841f
LP
1565 else
1566 m = arg_timezone;
1567 } else
1568 m = arg_timezone;
1569
1570 if (m == TIMEZONE_OFF)
1571 return 0;
1572
1573 r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc);
d4036145 1574 if (r < 0) {
1688841f 1575 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
d4036145
LP
1576 return 0;
1577 }
1578
1688841f
LP
1579 where = strjoina(etc, "/localtime");
1580
1581 switch (m) {
1582
1583 case TIMEZONE_DELETE:
1584 if (unlink(where) < 0)
1585 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
1586
d4036145 1587 return 0;
d4036145 1588
1688841f
LP
1589 case TIMEZONE_SYMLINK: {
1590 _cleanup_free_ char *q = NULL;
1591 const char *z, *what;
4d1c38b8 1592
1688841f
LP
1593 z = timezone_from_path(p);
1594 if (!z) {
1595 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
d4036145 1596 return 0;
1688841f 1597 }
d4036145 1598
1688841f
LP
1599 r = readlink_malloc(where, &q);
1600 if (r >= 0 && streq_ptr(timezone_from_path(q), z))
1601 return 0; /* Already pointing to the right place? Then do nothing .. */
1602
1603 check = strjoina(dest, "/usr/share/zoneinfo/", z);
1604 r = chase_symlinks(check, dest, 0, NULL);
1605 if (r < 0)
1606 log_debug_errno(r, "Timezone %s does not exist (or is not accessible) in container, not creating symlink: %m", z);
1607 else {
1608 if (unlink(where) < 0 && errno != ENOENT) {
1609 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING, /* Don't complain on read-only images */
1610 errno, "Failed to remove existing timezone info %s in container, ignoring: %m", where);
1611 return 0;
1612 }
1613
1614 what = strjoina("../usr/share/zoneinfo/", z);
1615 if (symlink(what, where) < 0) {
1616 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING,
1617 errno, "Failed to correct timezone of container, ignoring: %m");
1618 return 0;
1619 }
1620
1621 break;
1622 }
1623
1624 _fallthrough_;
d4036145 1625 }
68fb0892 1626
1688841f
LP
1627 case TIMEZONE_BIND: {
1628 _cleanup_free_ char *resolved = NULL;
1629 int found;
1630
1631 found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved);
1632 if (found < 0) {
1633 log_warning_errno(found, "Failed to resolve /etc/localtime path in container, ignoring: %m");
1634 return 0;
1635 }
1636
1637 if (found == 0) /* missing? */
1638 (void) touch(resolved);
1639
1640 r = mount_verbose(LOG_WARNING, "/etc/localtime", resolved, NULL, MS_BIND, NULL);
1641 if (r >= 0)
1642 return mount_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
1643
1644 _fallthrough_;
79d80fc1 1645 }
4d9f07b4 1646
1688841f
LP
1647 case TIMEZONE_COPY:
1648 /* If mounting failed, try to copy */
1649 r = copy_file_atomic("/etc/localtime", where, 0644, 0, COPY_REFLINK|COPY_REPLACE);
1650 if (r < 0) {
1651 log_full_errno(IN_SET(r, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
1652 "Failed to copy /etc/localtime to %s, ignoring: %m", where);
1653 return 0;
1654 }
1655
1656 break;
1657
1658 default:
1659 assert_not_reached("unexpected mode");
d4036145 1660 }
e58a1277 1661
1688841f 1662 /* Fix permissions of the symlink or file copy we just created */
03cfe0d5
LP
1663 r = userns_lchown(where, 0, 0);
1664 if (r < 0)
1688841f 1665 log_warning_errno(r, "Failed to chown /etc/localtime, ignoring: %m");
03cfe0d5 1666
e58a1277 1667 return 0;
88213476
LP
1668}
1669
09d423e9
LP
/* Checks via access(F_OK) whether 'path' exists. Returns 1 if it does and 0 if it is missing (ENOENT).
 * Any other access() failure is logged at debug level and the result of log_debug_errno() is returned. */
static int have_resolv_conf(const char *path) {
        assert(path);

        if (access(path, F_OK) >= 0)
                return 1;

        if (errno == ENOENT)
                return 0;

        return log_debug_errno(errno, "Failed to determine whether '%s' is available: %m", path);
}
1682
7357272e 1683static int resolved_listening(void) {
b8ea7a6e 1684 _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
b053cd5f 1685 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
7357272e 1686 _cleanup_free_ char *dns_stub_listener_mode = NULL;
b053cd5f
LP
1687 int r;
1688
7357272e 1689 /* Check if resolved is listening */
b053cd5f
LP
1690
1691 r = sd_bus_open_system(&bus);
1692 if (r < 0)
b8ea7a6e 1693 return log_debug_errno(r, "Failed to open system bus: %m");
b053cd5f 1694
7357272e 1695 r = bus_name_has_owner(bus, "org.freedesktop.resolve1", NULL);
b8ea7a6e
LP
1696 if (r < 0)
1697 return log_debug_errno(r, "Failed to check whether the 'org.freedesktop.resolve1' bus name is taken: %m");
1698 if (r == 0)
1699 return 0;
7357272e
DM
1700
1701 r = sd_bus_get_property_string(bus,
1702 "org.freedesktop.resolve1",
1703 "/org/freedesktop/resolve1",
1704 "org.freedesktop.resolve1.Manager",
1705 "DNSStubListener",
b8ea7a6e 1706 &error,
7357272e
DM
1707 &dns_stub_listener_mode);
1708 if (r < 0)
b8ea7a6e 1709 return log_debug_errno(r, "Failed to query DNSStubListener property: %s", bus_error_message(&error, r));
7357272e
DM
1710
1711 return STR_IN_SET(dns_stub_listener_mode, "udp", "yes");
b053cd5f
LP
1712}
1713
2547bb41 1714static int setup_resolv_conf(const char *dest) {
09d423e9
LP
1715 _cleanup_free_ char *etc = NULL;
1716 const char *where, *what;
1717 ResolvConfMode m;
1718 int r;
2547bb41
LP
1719
1720 assert(dest);
1721
09d423e9
LP
1722 if (arg_resolv_conf == RESOLV_CONF_AUTO) {
1723 if (arg_private_network)
1724 m = RESOLV_CONF_OFF;
1725 else if (have_resolv_conf(STATIC_RESOLV_CONF) > 0 && resolved_listening() > 0)
83205269 1726 m = etc_writable() ? RESOLV_CONF_COPY_STATIC : RESOLV_CONF_BIND_STATIC;
09d423e9 1727 else if (have_resolv_conf("/etc/resolv.conf") > 0)
83205269 1728 m = etc_writable() ? RESOLV_CONF_COPY_HOST : RESOLV_CONF_BIND_HOST;
09d423e9 1729 else
83205269 1730 m = etc_writable() ? RESOLV_CONF_DELETE : RESOLV_CONF_OFF;
09d423e9
LP
1731 } else
1732 m = arg_resolv_conf;
1733
1734 if (m == RESOLV_CONF_OFF)
2547bb41
LP
1735 return 0;
1736
87447ae4
LP
1737 r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc);
1738 if (r < 0) {
1739 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
1740 return 0;
1741 }
1742
1743 where = strjoina(etc, "/resolv.conf");
09d423e9
LP
1744
1745 if (m == RESOLV_CONF_DELETE) {
1746 if (unlink(where) < 0)
1747 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
1748
87447ae4
LP
1749 return 0;
1750 }
79d80fc1 1751
09d423e9
LP
1752 if (IN_SET(m, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_COPY_STATIC))
1753 what = STATIC_RESOLV_CONF;
1754 else
1755 what = "/etc/resolv.conf";
87447ae4 1756
09d423e9
LP
1757 if (IN_SET(m, RESOLV_CONF_BIND_HOST, RESOLV_CONF_BIND_STATIC)) {
1758 _cleanup_free_ char *resolved = NULL;
1759 int found;
1760
1761 found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved);
1762 if (found < 0) {
1763 log_warning_errno(found, "Failed to resolve /etc/resolv.conf path in container, ignoring: %m");
1764 return 0;
1765 }
3539724c 1766
87447ae4
LP
1767 if (found == 0) /* missing? */
1768 (void) touch(resolved);
5367354d 1769
09d423e9 1770 r = mount_verbose(LOG_WARNING, what, resolved, NULL, MS_BIND, NULL);
60e76d48 1771 if (r >= 0)
87447ae4 1772 return mount_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
3539724c
LP
1773 }
1774
1775 /* If that didn't work, let's copy the file */
09d423e9 1776 r = copy_file(what, where, O_TRUNC|O_NOFOLLOW, 0644, 0, COPY_REFLINK);
79d80fc1 1777 if (r < 0) {
3539724c
LP
1778 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
1779 * resolved or something similar runs inside and the symlink points there.
68a313c5 1780 *
3539724c 1781 * If the disk image is read-only, there's also no point in complaining.
68a313c5 1782 */
09d423e9 1783 log_full_errno(!IN_SET(RESOLV_CONF_COPY_HOST, RESOLV_CONF_COPY_STATIC) && IN_SET(r, -ELOOP, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
3539724c 1784 "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where);
79d80fc1
TG
1785 return 0;
1786 }
2547bb41 1787
03cfe0d5
LP
1788 r = userns_lchown(where, 0, 0);
1789 if (r < 0)
3539724c 1790 log_warning_errno(r, "Failed to chown /etc/resolv.conf, ignoring: %m");
03cfe0d5 1791
2547bb41
LP
1792 return 0;
1793}
1794
1e4f1671 1795static int setup_boot_id(void) {
cdde6ba6
LP
1796 _cleanup_(unlink_and_freep) char *from = NULL;
1797 _cleanup_free_ char *path = NULL;
3bbaff3e 1798 sd_id128_t rnd = SD_ID128_NULL;
cdde6ba6 1799 const char *to;
04bc4a3f
LP
1800 int r;
1801
04bc4a3f
LP
1802 /* Generate a new randomized boot ID, so that each boot-up of
1803 * the container gets a new one */
1804
cdde6ba6
LP
1805 r = tempfn_random_child(NULL, "proc-sys-kernel-random-boot-id", &path);
1806 if (r < 0)
1807 return log_error_errno(r, "Failed to generate random boot ID path: %m");
04bc4a3f
LP
1808
1809 r = sd_id128_randomize(&rnd);
f647962d
MS
1810 if (r < 0)
1811 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 1812
cdde6ba6 1813 r = id128_write(path, ID128_UUID, rnd, false);
f647962d
MS
1814 if (r < 0)
1815 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f 1816
cdde6ba6
LP
1817 from = TAKE_PTR(path);
1818 to = "/proc/sys/kernel/random/boot_id";
1819
60e76d48 1820 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
cdde6ba6
LP
1821 if (r < 0)
1822 return r;
04bc4a3f 1823
cdde6ba6 1824 return mount_verbose(LOG_ERR, NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
04bc4a3f
LP
1825}
1826
e58a1277 1827static int copy_devnodes(const char *dest) {
88213476
LP
1828 static const char devnodes[] =
1829 "null\0"
1830 "zero\0"
1831 "full\0"
1832 "random\0"
1833 "urandom\0"
85614d66
TG
1834 "tty\0"
1835 "net/tun\0";
88213476 1836
de40a303 1837 _cleanup_umask_ mode_t u;
88213476 1838 const char *d;
e58a1277 1839 int r = 0;
a258bf26
LP
1840
1841 assert(dest);
124640f1
LP
1842
1843 u = umask(0000);
88213476 1844
03cfe0d5
LP
1845 /* Create /dev/net, so that we can create /dev/net/tun in it */
1846 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1847 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1848
88213476 1849 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 1850 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 1851 struct stat st;
88213476 1852
7f112f50 1853 from = strappend("/dev/", d);
8967f291
LP
1854 if (!from)
1855 return log_oom();
1856
03cfe0d5 1857 to = prefix_root(dest, from);
8967f291
LP
1858 if (!to)
1859 return log_oom();
88213476
LP
1860
1861 if (stat(from, &st) < 0) {
1862
4a62c710
MS
1863 if (errno != ENOENT)
1864 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 1865
baaa35ad
ZJS
1866 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode))
1867 return log_error_errno(SYNTHETIC_ERRNO(EIO),
1868 "%s is not a char or block device, cannot copy.", from);
1869 else {
8dfce114
LP
1870 _cleanup_free_ char *sl = NULL, *prefixed = NULL, *dn = NULL, *t = NULL;
1871
81f5049b 1872 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
8dbf71ec 1873 /* Explicitly warn the user when /dev is already populated. */
41eb4362 1874 if (errno == EEXIST)
8dbf71ec 1875 log_notice("%s/dev is pre-mounted and pre-populated. If a pre-mounted /dev is provided it needs to be an unpopulated file system.", dest);
81f5049b
AC
1876 if (errno != EPERM)
1877 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1878
8dfce114 1879 /* Some systems abusively restrict mknod but allow bind mounts. */
81f5049b
AC
1880 r = touch(to);
1881 if (r < 0)
1882 return log_error_errno(r, "touch (%s) failed: %m", to);
60e76d48
ZJS
1883 r = mount_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL);
1884 if (r < 0)
1885 return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to);
81f5049b 1886 }
6278cf60 1887
03cfe0d5
LP
1888 r = userns_lchown(to, 0, 0);
1889 if (r < 0)
1890 return log_error_errno(r, "chown() of device node %s failed: %m", to);
8dfce114
LP
1891
1892 dn = strjoin("/dev/", S_ISCHR(st.st_mode) ? "char" : "block");
1893 if (!dn)
1894 return log_oom();
1895
1896 r = userns_mkdir(dest, dn, 0755, 0, 0);
1897 if (r < 0)
1898 return log_error_errno(r, "Failed to create '%s': %m", dn);
1899
1900 if (asprintf(&sl, "%s/%u:%u", dn, major(st.st_rdev), minor(st.st_rdev)) < 0)
1901 return log_oom();
1902
1903 prefixed = prefix_root(dest, sl);
1904 if (!prefixed)
1905 return log_oom();
1906
1907 t = strjoin("../", d);
1908 if (!t)
1909 return log_oom();
1910
1911 if (symlink(t, prefixed) < 0)
1912 log_debug_errno(errno, "Failed to symlink '%s' to '%s': %m", t, prefixed);
88213476 1913 }
88213476
LP
1914 }
1915
e58a1277
LP
1916 return r;
1917}
88213476 1918
de40a303
LP
1919static int make_extra_nodes(const char *dest) {
1920 _cleanup_umask_ mode_t u;
1921 size_t i;
1922 int r;
1923
1924 u = umask(0000);
1925
1926 for (i = 0; i < arg_n_extra_nodes; i++) {
1927 _cleanup_free_ char *path = NULL;
1928 DeviceNode *n = arg_extra_nodes + i;
1929
1930 path = prefix_root(dest, n->path);
1931 if (!path)
1932 return log_oom();
1933
1934 if (mknod(path, n->mode, S_ISCHR(n->mode) || S_ISBLK(n->mode) ? makedev(n->major, n->minor) : 0) < 0)
1935 return log_error_errno(errno, "Failed to create device node '%s': %m", path);
1936
1937 r = chmod_and_chown(path, n->mode, n->uid, n->gid);
1938 if (r < 0)
1939 return log_error_errno(r, "Failed to adjust device node ownership of '%s': %m", path);
1940 }
1941
1942 return 0;
1943}
1944
03cfe0d5
LP
1945static int setup_pts(const char *dest) {
1946 _cleanup_free_ char *options = NULL;
1947 const char *p;
709f6e46 1948 int r;
03cfe0d5 1949
349cc4a5 1950#if HAVE_SELINUX
03cfe0d5
LP
1951 if (arg_selinux_apifs_context)
1952 (void) asprintf(&options,
3dce8915 1953 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
03cfe0d5
LP
1954 arg_uid_shift + TTY_GID,
1955 arg_selinux_apifs_context);
1956 else
1957#endif
1958 (void) asprintf(&options,
3dce8915 1959 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
03cfe0d5 1960 arg_uid_shift + TTY_GID);
f2d88580 1961
03cfe0d5 1962 if (!options)
f2d88580
LP
1963 return log_oom();
1964
03cfe0d5 1965 /* Mount /dev/pts itself */
cc9fce65 1966 p = prefix_roota(dest, "/dev/pts");
dae8b82e
ZJS
1967 r = mkdir_errno_wrapper(p, 0755);
1968 if (r < 0)
1969 return log_error_errno(r, "Failed to create /dev/pts: %m");
1970
60e76d48
ZJS
1971 r = mount_verbose(LOG_ERR, "devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options);
1972 if (r < 0)
1973 return r;
709f6e46
MS
1974 r = userns_lchown(p, 0, 0);
1975 if (r < 0)
1976 return log_error_errno(r, "Failed to chown /dev/pts: %m");
03cfe0d5
LP
1977
1978 /* Create /dev/ptmx symlink */
1979 p = prefix_roota(dest, "/dev/ptmx");
4a62c710
MS
1980 if (symlink("pts/ptmx", p) < 0)
1981 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
709f6e46
MS
1982 r = userns_lchown(p, 0, 0);
1983 if (r < 0)
1984 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
f2d88580 1985
03cfe0d5
LP
1986 /* And fix /dev/pts/ptmx ownership */
1987 p = prefix_roota(dest, "/dev/pts/ptmx");
709f6e46
MS
1988 r = userns_lchown(p, 0, 0);
1989 if (r < 0)
1990 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
6278cf60 1991
f2d88580
LP
1992 return 0;
1993}
1994
e58a1277 1995static int setup_dev_console(const char *dest, const char *console) {
eb0f0863
LP
1996 _cleanup_umask_ mode_t u;
1997 const char *to;
e58a1277 1998 int r;
e58a1277
LP
1999
2000 assert(dest);
e58a1277
LP
2001
2002 u = umask(0000);
2003
de40a303
LP
2004 if (!console)
2005 return 0;
2006
03cfe0d5 2007 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
f647962d
MS
2008 if (r < 0)
2009 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
88213476 2010
a258bf26
LP
2011 /* We need to bind mount the right tty to /dev/console since
2012 * ptys can only exist on pts file systems. To have something
81f5049b 2013 * to bind mount things on we create a empty regular file. */
a258bf26 2014
03cfe0d5 2015 to = prefix_roota(dest, "/dev/console");
81f5049b
AC
2016 r = touch(to);
2017 if (r < 0)
2018 return log_error_errno(r, "touch() for /dev/console failed: %m");
a258bf26 2019
60e76d48 2020 return mount_verbose(LOG_ERR, console, to, NULL, MS_BIND, NULL);
e58a1277
LP
2021}
2022
8e5430c4
LP
2023static int setup_keyring(void) {
2024 key_serial_t keyring;
2025
2026 /* Allocate a new session keyring for the container. This makes sure the keyring of the session systemd-nspawn
2027 * was invoked from doesn't leak into the container. Note that by default we block keyctl() and request_key()
2028 * anyway via seccomp so doing this operation isn't strictly necessary, but in case people explicitly whitelist
2029 * these system calls let's make sure we don't leak anything into the container. */
2030
2031 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2032 if (keyring == -1) {
2033 if (errno == ENOSYS)
2034 log_debug_errno(errno, "Kernel keyring not supported, ignoring.");
2035 else if (IN_SET(errno, EACCES, EPERM))
2036 log_debug_errno(errno, "Kernel keyring access prohibited, ignoring.");
2037 else
2038 return log_error_errno(errno, "Setting up kernel keyring failed: %m");
2039 }
2040
2041 return 0;
2042}
2043
1e4f1671 2044static int setup_kmsg(int kmsg_socket) {
9ec5a93c
LP
2045 _cleanup_(unlink_and_freep) char *from = NULL;
2046 _cleanup_free_ char *fifo = NULL;
2047 _cleanup_close_ int fd = -1;
7fd1b19b 2048 _cleanup_umask_ mode_t u;
9ec5a93c
LP
2049 const char *to;
2050 int r;
e58a1277 2051
e58a1277 2052 assert(kmsg_socket >= 0);
a258bf26 2053
e58a1277 2054 u = umask(0000);
a258bf26 2055
9ec5a93c
LP
2056 /* We create the kmsg FIFO as as temporary file in /tmp, but immediately delete it after bind mounting it to
2057 * /proc/kmsg. While FIFOs on the reading side behave very similar to /proc/kmsg, their writing side behaves
2058 * differently from /dev/kmsg in that writing blocks when nothing is reading. In order to avoid any problems
2059 * with containers deadlocking due to this we simply make /dev/kmsg unavailable to the container. */
2060
2061 r = tempfn_random_child(NULL, "proc-kmsg", &fifo);
2062 if (r < 0)
2063 return log_error_errno(r, "Failed to generate kmsg path: %m");
e58a1277 2064
9ec5a93c 2065 if (mkfifo(fifo, 0600) < 0)
03cfe0d5 2066 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
9ec5a93c
LP
2067
2068 from = TAKE_PTR(fifo);
2069 to = "/proc/kmsg";
2070
60e76d48
ZJS
2071 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
2072 if (r < 0)
2073 return r;
e58a1277 2074
669fc4e5 2075 fd = open(from, O_RDWR|O_NONBLOCK|O_CLOEXEC);
4a62c710
MS
2076 if (fd < 0)
2077 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 2078
9ec5a93c 2079 /* Store away the fd in the socket, so that it stays open as long as we run the child */
3ee897d6 2080 r = send_one_fd(kmsg_socket, fd, 0);
d9603714
DH
2081 if (r < 0)
2082 return log_error_errno(r, "Failed to send FIFO fd: %m");
a258bf26 2083
25ea79fe 2084 return 0;
88213476
LP
2085}
2086
1c4baffc 2087static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
6d0b55c2
LP
2088 union in_addr_union *exposed = userdata;
2089
2090 assert(rtnl);
2091 assert(m);
2092 assert(exposed);
2093
7a8f6325 2094 expose_port_execute(rtnl, arg_expose_ports, exposed);
6d0b55c2
LP
2095 return 0;
2096}
2097
3a74cea5 2098static int setup_hostname(void) {
c818eef1 2099 int r;
3a74cea5 2100
0c582db0 2101 if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
eb91eb18
LP
2102 return 0;
2103
c818eef1
LP
2104 r = sethostname_idempotent(arg_hostname ?: arg_machine);
2105 if (r < 0)
2106 return log_error_errno(r, "Failed to set hostname: %m");
3a74cea5 2107
7027ff61 2108 return 0;
3a74cea5
LP
2109}
2110
57fb9fb5 2111static int setup_journal(const char *directory) {
0f5e1382 2112 _cleanup_free_ char *d = NULL;
b2238e38
LP
2113 const char *dirname, *p, *q;
2114 sd_id128_t this_id;
2115 char id[33];
8054d749 2116 bool try;
57fb9fb5
LP
2117 int r;
2118
df9a75e4
LP
2119 /* Don't link journals in ephemeral mode */
2120 if (arg_ephemeral)
2121 return 0;
2122
8054d749
LP
2123 if (arg_link_journal == LINK_NO)
2124 return 0;
2125
2126 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
2127
4d680aee 2128 r = sd_id128_get_machine(&this_id);
f647962d
MS
2129 if (r < 0)
2130 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee 2131
e01ff70a 2132 if (sd_id128_equal(arg_uuid, this_id)) {
8054d749 2133 log_full(try ? LOG_WARNING : LOG_ERR,
e192a281 2134 "Host and machine ids are equal (%s): refusing to link journals", sd_id128_to_string(arg_uuid, id));
8054d749 2135 if (try)
4d680aee 2136 return 0;
df9a75e4 2137 return -EEXIST;
4d680aee
ZJS
2138 }
2139
369ca6da
ZJS
2140 FOREACH_STRING(dirname, "/var", "/var/log", "/var/log/journal") {
2141 r = userns_mkdir(directory, dirname, 0755, 0, 0);
2142 if (r < 0) {
2143 bool ignore = r == -EROFS && try;
2144 log_full_errno(ignore ? LOG_DEBUG : LOG_ERR, r,
2145 "Failed to create %s%s: %m", dirname, ignore ? ", ignoring" : "");
2146 return ignore ? 0 : r;
2147 }
2148 }
03cfe0d5 2149
e01ff70a
MS
2150 (void) sd_id128_to_string(arg_uuid, id);
2151
03cfe0d5
LP
2152 p = strjoina("/var/log/journal/", id);
2153 q = prefix_roota(directory, p);
27407a01 2154
e1873695 2155 if (path_is_mount_point(p, NULL, 0) > 0) {
8054d749
LP
2156 if (try)
2157 return 0;
27407a01 2158
baaa35ad
ZJS
2159 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2160 "%s: already a mount point, refusing to use for journal", p);
57fb9fb5
LP
2161 }
2162
e1873695 2163 if (path_is_mount_point(q, NULL, 0) > 0) {
8054d749
LP
2164 if (try)
2165 return 0;
57fb9fb5 2166
baaa35ad
ZJS
2167 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2168 "%s: already a mount point, refusing to use for journal", q);
57fb9fb5
LP
2169 }
2170
2171 r = readlink_and_make_absolute(p, &d);
2172 if (r >= 0) {
3742095b 2173 if (IN_SET(arg_link_journal, LINK_GUEST, LINK_AUTO) &&
57fb9fb5
LP
2174 path_equal(d, q)) {
2175
03cfe0d5 2176 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 2177 if (r < 0)
709f6e46 2178 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 2179 return 0;
57fb9fb5
LP
2180 }
2181
4a62c710
MS
2182 if (unlink(p) < 0)
2183 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
2184 } else if (r == -EINVAL) {
2185
2186 if (arg_link_journal == LINK_GUEST &&
2187 rmdir(p) < 0) {
2188
27407a01
ZJS
2189 if (errno == ENOTDIR) {
2190 log_error("%s already exists and is neither a symlink nor a directory", p);
2191 return r;
4314d33f
MS
2192 } else
2193 return log_error_errno(errno, "Failed to remove %s: %m", p);
57fb9fb5 2194 }
4314d33f
MS
2195 } else if (r != -ENOENT)
2196 return log_error_errno(r, "readlink(%s) failed: %m", p);
57fb9fb5
LP
2197
2198 if (arg_link_journal == LINK_GUEST) {
2199
2200 if (symlink(q, p) < 0) {
8054d749 2201 if (try) {
56f64d95 2202 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90 2203 return 0;
4314d33f
MS
2204 } else
2205 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
57fb9fb5
LP
2206 }
2207
03cfe0d5 2208 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 2209 if (r < 0)
709f6e46 2210 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 2211 return 0;
57fb9fb5
LP
2212 }
2213
2214 if (arg_link_journal == LINK_HOST) {
ccddd104 2215 /* don't create parents here — if the host doesn't have
574edc90 2216 * permanent journal set up, don't force it here */
ba8e6c4d 2217
dae8b82e
ZJS
2218 r = mkdir_errno_wrapper(p, 0755);
2219 if (r < 0 && r != -EEXIST) {
8054d749 2220 if (try) {
dae8b82e 2221 log_debug_errno(r, "Failed to create %s, skipping journal setup: %m", p);
574edc90 2222 return 0;
4314d33f 2223 } else
dae8b82e 2224 return log_error_errno(r, "Failed to create %s: %m", p);
57fb9fb5
LP
2225 }
2226
27407a01
ZJS
2227 } else if (access(p, F_OK) < 0)
2228 return 0;
57fb9fb5 2229
cdb2b9d0
LP
2230 if (dir_is_empty(q) == 0)
2231 log_warning("%s is not empty, proceeding anyway.", q);
2232
03cfe0d5 2233 r = userns_mkdir(directory, p, 0755, 0, 0);
709f6e46
MS
2234 if (r < 0)
2235 return log_error_errno(r, "Failed to create %s: %m", q);
57fb9fb5 2236
60e76d48
ZJS
2237 r = mount_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL);
2238 if (r < 0)
4a62c710 2239 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 2240
27407a01 2241 return 0;
57fb9fb5
LP
2242}
2243
de40a303
LP
2244static int drop_capabilities(uid_t uid) {
2245 CapabilityQuintet q;
2246
2247 /* Let's initialize all five capability sets to something valid. If the quintet was configured via
2248 * OCI use that, but fill in missing bits. If it wasn't then derive the quintet in full from
2249 * arg_caps_retain. */
2250
2251 if (capability_quintet_is_set(&arg_full_capabilities)) {
2252 q = arg_full_capabilities;
2253
2254 if (q.bounding == (uint64_t) -1)
2255 q.bounding = uid == 0 ? arg_caps_retain : 0;
2256
2257 if (q.effective == (uint64_t) -1)
2258 q.effective = uid == 0 ? q.bounding : 0;
2259
2260 if (q.inheritable == (uint64_t) -1)
2261 q.inheritable = uid == 0 ? q.bounding : 0;
2262
2263 if (q.permitted == (uint64_t) -1)
2264 q.permitted = uid == 0 ? q.bounding : 0;
2265
2266 if (q.ambient == (uint64_t) -1 && ambient_capabilities_supported())
2267 q.ambient = 0;
2268 } else
2269 q = (CapabilityQuintet) {
2270 .bounding = arg_caps_retain,
2271 .effective = uid == 0 ? arg_caps_retain : 0,
2272 .inheritable = uid == 0 ? arg_caps_retain : 0,
2273 .permitted = uid == 0 ? arg_caps_retain : 0,
2274 .ambient = ambient_capabilities_supported() ? 0 : (uint64_t) -1,
2275 };
2276
2277 return capability_quintet_enforce(&q);
88213476
LP
2278}
2279
db999e0f
LP
2280static int reset_audit_loginuid(void) {
2281 _cleanup_free_ char *p = NULL;
2282 int r;
2283
0c582db0 2284 if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
db999e0f
LP
2285 return 0;
2286
2287 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 2288 if (r == -ENOENT)
db999e0f 2289 return 0;
f647962d
MS
2290 if (r < 0)
2291 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
2292
2293 /* Already reset? */
2294 if (streq(p, "4294967295"))
2295 return 0;
2296
57512c89 2297 r = write_string_file("/proc/self/loginuid", "4294967295", WRITE_STRING_FILE_DISABLE_BUFFER);
db999e0f 2298 if (r < 0) {
10a87006
LP
2299 log_error_errno(r,
2300 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2301 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2302 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2303 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2304 "using systemd-nspawn. Sleeping for 5s... (%m)");
77b6e194 2305
db999e0f 2306 sleep(5);
77b6e194 2307 }
db999e0f
LP
2308
2309 return 0;
77b6e194
LP
2310}
2311
785890ac
LP
2312static int setup_propagate(const char *root) {
2313 const char *p, *q;
709f6e46 2314 int r;
785890ac
LP
2315
2316 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2317 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
63c372cb 2318 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
2319 (void) mkdir_p(p, 0600);
2320
709f6e46
MS
2321 r = userns_mkdir(root, "/run/systemd", 0755, 0, 0);
2322 if (r < 0)
2323 return log_error_errno(r, "Failed to create /run/systemd: %m");
03cfe0d5 2324
709f6e46
MS
2325 r = userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0);
2326 if (r < 0)
2327 return log_error_errno(r, "Failed to create /run/systemd/nspawn: %m");
03cfe0d5 2328
709f6e46
MS
2329 r = userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0);
2330 if (r < 0)
2331 return log_error_errno(r, "Failed to create /run/systemd/nspawn/incoming: %m");
785890ac 2332
03cfe0d5 2333 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
60e76d48
ZJS
2334 r = mount_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
2335 if (r < 0)
2336 return r;
785890ac 2337
60e76d48
ZJS
2338 r = mount_verbose(LOG_ERR, NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
2339 if (r < 0)
2340 return r;
785890ac 2341
19caffac
AC
2342 /* machined will MS_MOVE into that directory, and that's only
2343 * supported for non-shared mounts. */
60e76d48 2344 return mount_verbose(LOG_ERR, NULL, q, NULL, MS_SLAVE, NULL);
785890ac
LP
2345}
2346
317feb4d 2347static int setup_machine_id(const char *directory) {
691675ba
LP
2348 const char *etc_machine_id;
2349 sd_id128_t id;
3bbaff3e 2350 int r;
e01ff70a 2351
317feb4d
LP
2352 /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
2353 * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
2354 * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
2355 * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
2356 * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
2357 * container behaves nicely). */
2358
e01ff70a
MS
2359 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
2360
691675ba 2361 r = id128_read(etc_machine_id, ID128_PLAIN, &id);
317feb4d
LP
2362 if (r < 0) {
2363 if (!IN_SET(r, -ENOENT, -ENOMEDIUM)) /* If the file is missing or empty, we don't mind */
2364 return log_error_errno(r, "Failed to read machine ID from container image: %m");
691675ba 2365
317feb4d
LP
2366 if (sd_id128_is_null(arg_uuid)) {
2367 r = sd_id128_randomize(&arg_uuid);
2368 if (r < 0)
2369 return log_error_errno(r, "Failed to acquire randomized machine UUID: %m");
2370 }
2371 } else {
baaa35ad
ZJS
2372 if (sd_id128_is_null(id))
2373 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2374 "Machine ID in container image is zero, refusing.");
e01ff70a 2375
317feb4d
LP
2376 arg_uuid = id;
2377 }
691675ba 2378
e01ff70a
MS
2379 return 0;
2380}
2381
7336138e
LP
2382static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
2383 int r;
2384
2385 assert(directory);
2386
0de7acce 2387 if (arg_userns_mode == USER_NAMESPACE_NO || !arg_userns_chown)
7336138e
LP
2388 return 0;
2389
2390 r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
2391 if (r == -EOPNOTSUPP)
2392 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
2393 if (r == -EBADE)
2394 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
2395 if (r < 0)
2396 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
2397 if (r == 0)
2398 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
2399 else
2400 log_debug("Patched directory tree to match UID/GID range.");
2401
2402 return r;
2403}
2404
113cea80 2405/*
6d416b9c
LS
2406 * Return values:
2407 * < 0 : wait_for_terminate() failed to get the state of the
2408 * container, the container was terminated by a signal, or
2409 * failed for an unknown reason. No change is made to the
2410 * container argument.
2411 * > 0 : The program executed in the container terminated with an
2412 * error. The exit code of the program executed in the
919699ec
LP
2413 * container is returned. The container argument has been set
2414 * to CONTAINER_TERMINATED.
6d416b9c
LS
2415 * 0 : The container is being rebooted, has been shut down or exited
2416 * successfully. The container argument has been set to either
2417 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 2418 *
6d416b9c
LS
2419 * That is, success is indicated by a return value of zero, and an
2420 * error is indicated by a non-zero value.
113cea80
DH
2421 */
2422static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 2423 siginfo_t status;
919699ec 2424 int r;
113cea80
DH
2425
2426 r = wait_for_terminate(pid, &status);
f647962d
MS
2427 if (r < 0)
2428 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
2429
2430 switch (status.si_code) {
fddbb89c 2431
113cea80 2432 case CLD_EXITED:
b5a2179b 2433 if (status.si_status == 0)
919699ec 2434 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
b5a2179b 2435 else
919699ec 2436 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 2437
919699ec
LP
2438 *container = CONTAINER_TERMINATED;
2439 return status.si_status;
113cea80
DH
2440
2441 case CLD_KILLED:
2442 if (status.si_status == SIGINT) {
919699ec 2443 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 2444 *container = CONTAINER_TERMINATED;
919699ec
LP
2445 return 0;
2446
113cea80 2447 } else if (status.si_status == SIGHUP) {
919699ec 2448 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 2449 *container = CONTAINER_REBOOTED;
919699ec 2450 return 0;
113cea80 2451 }
919699ec 2452
4831981d 2453 _fallthrough_;
113cea80 2454 case CLD_DUMPED:
baaa35ad
ZJS
2455 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2456 "Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
113cea80
DH
2457
2458 default:
baaa35ad
ZJS
2459 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2460 "Container %s failed due to unknown reason.", arg_machine);
113cea80 2461 }
113cea80
DH
2462}
2463
023fb90b
LP
2464static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2465 pid_t pid;
2466
4a0b58c4 2467 pid = PTR_TO_PID(userdata);
023fb90b 2468 if (pid > 0) {
c6c8f6e2 2469 if (kill(pid, arg_kill_signal) >= 0) {
023fb90b
LP
2470 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2471 sd_event_source_set_userdata(s, NULL);
2472 return 0;
2473 }
2474 }
2475
2476 sd_event_exit(sd_event_source_get_event(s), 0);
2477 return 0;
2478}
2479
6916b164 2480static int on_sigchld(sd_event_source *s, const struct signalfd_siginfo *ssi, void *userdata) {
abdb9b08
LP
2481 pid_t pid;
2482
2483 assert(s);
2484 assert(ssi);
2485
2486 pid = PTR_TO_PID(userdata);
2487
6916b164
AU
2488 for (;;) {
2489 siginfo_t si = {};
abdb9b08 2490
6916b164
AU
2491 if (waitid(P_ALL, 0, &si, WNOHANG|WNOWAIT|WEXITED) < 0)
2492 return log_error_errno(errno, "Failed to waitid(): %m");
2493 if (si.si_pid == 0) /* No pending children. */
2494 break;
abdb9b08 2495 if (si.si_pid == pid) {
6916b164
AU
2496 /* The main process we care for has exited. Return from
2497 * signal handler but leave the zombie. */
2498 sd_event_exit(sd_event_source_get_event(s), 0);
2499 break;
2500 }
abdb9b08 2501
6916b164
AU
2502 /* Reap all other children. */
2503 (void) waitid(P_PID, si.si_pid, &si, WNOHANG|WEXITED);
2504 }
2505
2506 return 0;
2507}
2508
abdb9b08
LP
2509static int on_request_stop(sd_bus_message *m, void *userdata, sd_bus_error *error) {
2510 pid_t pid;
2511
2512 assert(m);
2513
2514 pid = PTR_TO_PID(userdata);
2515
2516 if (arg_kill_signal > 0) {
2517 log_info("Container termination requested. Attempting to halt container.");
2518 (void) kill(pid, arg_kill_signal);
2519 } else {
2520 log_info("Container termination requested. Exiting.");
2521 sd_event_exit(sd_bus_get_event(sd_bus_message_get_bus(m)), 0);
2522 }
2523
2524 return 0;
2525}
2526
ec16945e 2527static int determine_names(void) {
1b9cebf6 2528 int r;
ec16945e 2529
c1521918
LP
2530 if (arg_template && !arg_directory && arg_machine) {
2531
2532 /* If --template= was specified then we should not
2533 * search for a machine, but instead create a new one
2534 * in /var/lib/machine. */
2535
605405c6 2536 arg_directory = strjoin("/var/lib/machines/", arg_machine);
c1521918
LP
2537 if (!arg_directory)
2538 return log_oom();
2539 }
2540
ec16945e 2541 if (!arg_image && !arg_directory) {
1b9cebf6
LP
2542 if (arg_machine) {
2543 _cleanup_(image_unrefp) Image *i = NULL;
2544
5ef46e5f 2545 r = image_find(IMAGE_MACHINE, arg_machine, &i);
3a6ce860
LP
2546 if (r == -ENOENT)
2547 return log_error_errno(r, "No image for machine '%s'.", arg_machine);
1b9cebf6
LP
2548 if (r < 0)
2549 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
1b9cebf6 2550
eb38edce 2551 if (IN_SET(i->type, IMAGE_RAW, IMAGE_BLOCK))
0f03c2a4 2552 r = free_and_strdup(&arg_image, i->path);
1b9cebf6 2553 else
0f03c2a4 2554 r = free_and_strdup(&arg_directory, i->path);
1b9cebf6 2555 if (r < 0)
0f3be6ca 2556 return log_oom();
1b9cebf6 2557
aee327b8
LP
2558 if (!arg_ephemeral)
2559 arg_read_only = arg_read_only || i->read_only;
d7249575
LP
2560 } else {
2561 r = safe_getcwd(&arg_directory);
2562 if (r < 0)
2563 return log_error_errno(r, "Failed to determine current directory: %m");
2564 }
ec16945e 2565
0f3be6ca 2566 if (!arg_directory && !arg_image) {
1b9cebf6 2567 log_error("Failed to determine path, please use -D or -i.");
ec16945e
LP
2568 return -EINVAL;
2569 }
2570 }
2571
2572 if (!arg_machine) {
b9ba4dab
LP
2573 if (arg_directory && path_equal(arg_directory, "/"))
2574 arg_machine = gethostname_malloc();
4827ab48
LP
2575 else {
2576 if (arg_image) {
2577 char *e;
2578
2579 arg_machine = strdup(basename(arg_image));
2580
2581 /* Truncate suffix if there is one */
2582 e = endswith(arg_machine, ".raw");
2583 if (e)
2584 *e = 0;
2585 } else
2586 arg_machine = strdup(basename(arg_directory));
2587 }
ec16945e
LP
2588 if (!arg_machine)
2589 return log_oom();
2590
ae691c1d 2591 hostname_cleanup(arg_machine);
ec16945e
LP
2592 if (!machine_name_is_valid(arg_machine)) {
2593 log_error("Failed to determine machine name automatically, please use -M.");
2594 return -EINVAL;
2595 }
b9ba4dab
LP
2596
2597 if (arg_ephemeral) {
2598 char *b;
2599
2600 /* Add a random suffix when this is an
2601 * ephemeral machine, so that we can run many
2602 * instances at once without manually having
2603 * to specify -M each time. */
2604
2605 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2606 return log_oom();
2607
2608 free(arg_machine);
2609 arg_machine = b;
2610 }
ec16945e
LP
2611 }
2612
2613 return 0;
2614}
2615
/* Resolves the path stored in *p via chase_symlinks() and stores the result back into *p. If *p is NULL this
 * is a NOP and 0 is returned. On failure an error is logged and the negative error is returned; on success the
 * non-negative return value of chase_symlinks() is passed through. */
static int chase_symlinks_and_update(char **p, unsigned flags) {
        char *resolved;
        int k;

        assert(p);

        if (*p == NULL)
                return 0;

        k = chase_symlinks(*p, NULL, flags, &resolved);
        if (k < 0)
                return log_error_errno(k, "Failed to resolve path %s: %m", *p);

        /* Put the resolved path in place of the one we were called with */
        free_and_replace(*p, resolved);

        /* Note that this might be an fd, should CHASE_OPEN ever be passed in flags */
        return k;
}
2632
03cfe0d5 2633static int determine_uid_shift(const char *directory) {
6dac160c
LP
2634 int r;
2635
0de7acce 2636 if (arg_userns_mode == USER_NAMESPACE_NO) {
03cfe0d5 2637 arg_uid_shift = 0;
6dac160c 2638 return 0;
03cfe0d5 2639 }
6dac160c
LP
2640
2641 if (arg_uid_shift == UID_INVALID) {
2642 struct stat st;
2643
03cfe0d5 2644 r = stat(directory, &st);
6dac160c 2645 if (r < 0)
03cfe0d5 2646 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
6dac160c
LP
2647
2648 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2649
baaa35ad
ZJS
2650 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000)))
2651 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2652 "UID and GID base of %s don't match.", directory);
6dac160c
LP
2653
2654 arg_uid_range = UINT32_C(0x10000);
2655 }
2656
baaa35ad
ZJS
2657 if (arg_uid_shift > (uid_t) -1 - arg_uid_range)
2658 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2659 "UID base too high for UID range.");
6dac160c 2660
6dac160c
LP
2661 return 0;
2662}
2663
de40a303
LP
2664static unsigned long effective_clone_ns_flags(void) {
2665 unsigned long flags = arg_clone_ns_flags;
2666
2667 if (arg_private_network)
2668 flags |= CLONE_NEWNET;
2669 if (arg_use_cgns)
2670 flags |= CLONE_NEWCGROUP;
2671 if (arg_userns_mode != USER_NAMESPACE_NO)
2672 flags |= CLONE_NEWUSER;
2673
2674 return flags;
2675}
2676
2677static int patch_sysctl(void) {
2678
2679 /* This table is inspired by runc's sysctl() function */
2680 static const struct {
2681 const char *key;
2682 bool prefix;
2683 unsigned long clone_flags;
2684 } safe_sysctl[] = {
2685 { "kernel.hostname", false, CLONE_NEWUTS },
2686 { "kernel.domainname", false, CLONE_NEWUTS },
2687 { "kernel.msgmax", false, CLONE_NEWIPC },
2688 { "kernel.msgmnb", false, CLONE_NEWIPC },
2689 { "kernel.msgmni", false, CLONE_NEWIPC },
2690 { "kernel.sem", false, CLONE_NEWIPC },
2691 { "kernel.shmall", false, CLONE_NEWIPC },
2692 { "kernel.shmmax", false, CLONE_NEWIPC },
2693 { "kernel.shmmni", false, CLONE_NEWIPC },
2694 { "fs.mqueue.", true, CLONE_NEWIPC },
2695 { "net.", true, CLONE_NEWNET },
2696 };
2697
2698 unsigned long flags;
2699 char **k, **v;
2700 int r;
2701
2702 flags = effective_clone_ns_flags();
2703
2704 STRV_FOREACH_PAIR(k, v, arg_sysctl) {
2705 bool good = false;
2706 size_t i;
2707
2708 for (i = 0; i < ELEMENTSOF(safe_sysctl); i++) {
2709
2710 if (!FLAGS_SET(flags, safe_sysctl[i].clone_flags))
2711 continue;
2712
2713 if (safe_sysctl[i].prefix)
2714 good = startswith(*k, safe_sysctl[i].key);
2715 else
2716 good = streq(*k, safe_sysctl[i].key);
2717
2718 if (good)
2719 break;
2720 }
2721
2722 if (!good) {
2723 log_error("Refusing to write to sysctl '%s', as it is not safe in the selected namespaces.", *k);
2724 return -EPERM;
2725 }
2726
2727 r = sysctl_write(*k, *v);
2728 if (r < 0)
2729 return log_error_errno(r, "Failed to write sysctl '%s': %m", *k);
2730 }
2731
2732 return 0;
2733}
2734
03cfe0d5
LP
2735static int inner_child(
2736 Barrier *barrier,
2737 const char *directory,
2738 bool secondary,
2739 int kmsg_socket,
2740 int rtnl_socket,
f757855e 2741 FDSet *fds) {
69c79d3c 2742
03cfe0d5 2743 _cleanup_free_ char *home = NULL;
e01ff70a 2744 char as_uuid[37];
88614c8a 2745 size_t n_env = 1;
03cfe0d5 2746 const char *envp[] = {
0c300adf 2747 "PATH=" DEFAULT_PATH_COMPAT,
6aadfa4c 2748 NULL, /* container */
03cfe0d5
LP
2749 NULL, /* TERM */
2750 NULL, /* HOME */
2751 NULL, /* USER */
2752 NULL, /* LOGNAME */
2753 NULL, /* container_uuid */
2754 NULL, /* LISTEN_FDS */
2755 NULL, /* LISTEN_PID */
9c1e04d0 2756 NULL, /* NOTIFY_SOCKET */
03cfe0d5
LP
2757 NULL
2758 };
1a68e1e5 2759 const char *exec_target;
2371271c 2760 _cleanup_strv_free_ char **env_use = NULL;
de40a303 2761 int r, which_failed;
88213476 2762
b37469d7
LP
2763 /* This is the "inner" child process, i.e. the one forked off by the "outer" child process, which is the one
2764 * the container manager itself forked off. At the time of clone() it gained its own CLONE_NEWNS, CLONE_NEWPID,
2765 * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER namespaces. Note that it has its own CLONE_NEWNS namespace,
2766 * separate from the CLONE_NEWNS created for the "outer" child, and also separate from the host's CLONE_NEWNS
2767 * namespace. The reason for having two levels of CLONE_NEWNS namespaces is that the "inner" one is owned by
2768 * the CLONE_NEWUSER namespace of the container, while the "outer" one is owned by the host's CLONE_NEWUSER
2769 * namespace.
2770 *
2771 * Note at this point we have no CLONE_NEWNET namespace yet. We'll acquire that one later through
2772 * unshare(). See below. */
2773
03cfe0d5
LP
2774 assert(barrier);
2775 assert(directory);
2776 assert(kmsg_socket >= 0);
88213476 2777
de40a303
LP
2778 log_debug("Inner child is initializing.");
2779
0de7acce 2780 if (arg_userns_mode != USER_NAMESPACE_NO) {
03cfe0d5
LP
2781 /* Tell the parent, that it now can write the UID map. */
2782 (void) barrier_place(barrier); /* #1 */
7027ff61 2783
03cfe0d5 2784 /* Wait until the parent wrote the UID map */
baaa35ad
ZJS
2785 if (!barrier_place_and_sync(barrier)) /* #2 */
2786 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
2787 "Parent died too early");
88213476
LP
2788 }
2789
6d66bd3b
EV
2790 r = reset_uid_gid();
2791 if (r < 0)
2792 return log_error_errno(r, "Couldn't become new root: %m");
2793
0de7acce 2794 r = mount_all(NULL,
4f086aab 2795 arg_mount_settings | MOUNT_IN_USERNS,
0de7acce 2796 arg_uid_shift,
0de7acce 2797 arg_selinux_apifs_context);
03cfe0d5
LP
2798 if (r < 0)
2799 return r;
2800
04413780
ZJS
2801 if (!arg_network_namespace_path && arg_private_network) {
2802 r = unshare(CLONE_NEWNET);
2803 if (r < 0)
2804 return log_error_errno(errno, "Failed to unshare network namespace: %m");
75116558
PS
2805
2806 /* Tell the parent that it can setup network interfaces. */
2807 (void) barrier_place(barrier); /* #3 */
04413780
ZJS
2808 }
2809
4f086aab 2810 r = mount_sysfs(NULL, arg_mount_settings);
d8fc6a00
LP
2811 if (r < 0)
2812 return r;
2813
03cfe0d5
LP
2814 /* Wait until we are cgroup-ified, so that we
2815 * can mount the right cgroup path writable */
baaa35ad
ZJS
2816 if (!barrier_place_and_sync(barrier)) /* #4 */
2817 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
2818 "Parent died too early");
88213476 2819
489fae52 2820 if (arg_use_cgns) {
0996ef00
CB
2821 r = unshare(CLONE_NEWCGROUP);
2822 if (r < 0)
04413780 2823 return log_error_errno(errno, "Failed to unshare cgroup namespace: %m");
0996ef00
CB
2824 r = mount_cgroups(
2825 "",
2826 arg_unified_cgroup_hierarchy,
2827 arg_userns_mode != USER_NAMESPACE_NO,
2828 arg_uid_shift,
2829 arg_uid_range,
5a8ff0e6 2830 arg_selinux_apifs_context,
ada54120 2831 true);
0996ef00
CB
2832 if (r < 0)
2833 return r;
2834 } else {
2835 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
2836 if (r < 0)
2837 return r;
2838 }
ec16945e 2839
1e4f1671 2840 r = setup_boot_id();
03cfe0d5
LP
2841 if (r < 0)
2842 return r;
ec16945e 2843
1e4f1671 2844 r = setup_kmsg(kmsg_socket);
03cfe0d5
LP
2845 if (r < 0)
2846 return r;
2847 kmsg_socket = safe_close(kmsg_socket);
ec16945e 2848
de40a303
LP
2849 r = mount_custom(
2850 "/",
2851 arg_custom_mounts,
2852 arg_n_custom_mounts,
2853 false,
2854 0,
2855 0,
2856 arg_selinux_apifs_context,
2857 true);
2858 if (r < 0)
2859 return r;
2860
03cfe0d5
LP
2861 if (setsid() < 0)
2862 return log_error_errno(errno, "setsid() failed: %m");
2863
2864 if (arg_private_network)
2865 loopback_setup();
2866
7a8f6325
LP
2867 if (arg_expose_ports) {
2868 r = expose_port_send_rtnl(rtnl_socket);
2869 if (r < 0)
2870 return r;
2871 rtnl_socket = safe_close(rtnl_socket);
2872 }
03cfe0d5 2873
de40a303
LP
2874 r = patch_sysctl();
2875 if (r < 0)
2876 return r;
2877
81f345df
LP
2878 if (arg_oom_score_adjust_set) {
2879 r = set_oom_score_adjust(arg_oom_score_adjust);
2880 if (r < 0)
2881 return log_error_errno(r, "Failed to adjust OOM score: %m");
2882 }
2883
d107bb7d
LP
2884 if (arg_cpuset)
2885 if (sched_setaffinity(0, CPU_ALLOC_SIZE(arg_cpuset_ncpus), arg_cpuset) < 0)
2886 return log_error_errno(errno, "Failed to set CPU affinity: %m");
2887
c818eef1 2888 (void) setup_hostname();
03cfe0d5 2889
050f7277 2890 if (arg_personality != PERSONALITY_INVALID) {
21022b9d
LP
2891 r = safe_personality(arg_personality);
2892 if (r < 0)
2893 return log_error_errno(r, "personality() failed: %m");
03cfe0d5 2894 } else if (secondary) {
21022b9d
LP
2895 r = safe_personality(PER_LINUX32);
2896 if (r < 0)
2897 return log_error_errno(r, "personality() failed: %m");
03cfe0d5
LP
2898 }
2899
de40a303
LP
2900 r = setrlimit_closest_all((const struct rlimit *const*) arg_rlimit, &which_failed);
2901 if (r < 0)
2902 return log_error_errno(r, "Failed to apply resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
2903
2904#if HAVE_SECCOMP
2905 if (arg_seccomp) {
2906
2907 if (is_seccomp_available()) {
2908
2909 r = seccomp_load(arg_seccomp);
2910 if (IN_SET(r, -EPERM, -EACCES))
2911 return log_error_errno(r, "Failed to install seccomp filter: %m");
2912 if (r < 0)
2913 log_debug_errno(r, "Failed to install seccomp filter: %m");
2914 }
2915 } else
2916#endif
2917 {
2918 r = setup_seccomp(arg_caps_retain, arg_syscall_whitelist, arg_syscall_blacklist);
2919 if (r < 0)
2920 return r;
2921 }
2922
349cc4a5 2923#if HAVE_SELINUX
03cfe0d5 2924 if (arg_selinux_context)
2ed96880 2925 if (setexeccon(arg_selinux_context) < 0)
03cfe0d5
LP
2926 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
2927#endif
2928
de40a303
LP
2929 /* Make sure we keep the caps across the uid/gid dropping, so that we can retain some selected caps
2930 * if we need to later on. */
2931 if (prctl(PR_SET_KEEPCAPS, 1) < 0)
2932 return log_error_errno(errno, "Failed to set PR_SET_KEEPCAPS: %m");
2933
2934 if (uid_is_valid(arg_uid) || gid_is_valid(arg_gid))
2935 r = change_uid_gid_raw(arg_uid, arg_gid, arg_supplementary_gids, arg_n_supplementary_gids);
2936 else
2937 r = change_uid_gid(arg_user, &home);
03cfe0d5
LP
2938 if (r < 0)
2939 return r;
2940
de40a303
LP
2941 r = drop_capabilities(getuid());
2942 if (r < 0)
2943 return log_error_errno(r, "Dropping capabilities failed: %m");
2944
66edd963
LP
2945 if (arg_no_new_privileges)
2946 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
2947 return log_error_errno(errno, "Failed to disable new privileges: %m");
2948
6aadfa4c
ILG
2949 /* LXC sets container=lxc, so follow the scheme here */
2950 envp[n_env++] = strjoina("container=", arg_container_service_name);
2951
03cfe0d5
LP
2952 envp[n_env] = strv_find_prefix(environ, "TERM=");
2953 if (envp[n_env])
313cefa1 2954 n_env++;
03cfe0d5 2955
de40a303
LP
2956 if (home || !uid_is_valid(arg_uid) || arg_uid == 0)
2957 if (asprintf((char**)(envp + n_env++), "HOME=%s", home ?: "/root") < 0)
2958 return log_oom();
2959
2960 if (arg_user || !uid_is_valid(arg_uid) || arg_uid == 0)
2961 if (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ?: "root") < 0 ||
2962 asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)
2963 return log_oom();
03cfe0d5 2964
3bbaff3e 2965 assert(!sd_id128_is_null(arg_uuid));
03cfe0d5 2966
691675ba 2967 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_to_uuid_string(arg_uuid, as_uuid)) < 0)
e01ff70a 2968 return log_oom();
03cfe0d5
LP
2969
2970 if (fdset_size(fds) > 0) {
2971 r = fdset_cloexec(fds, false);
2972 if (r < 0)
2973 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
2974
2975 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
2976 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
2977 return log_oom();
2978 }
9c1e04d0
AP
2979 if (asprintf((char **)(envp + n_env++), "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
2980 return log_oom();
03cfe0d5 2981
2371271c
TG
2982 env_use = strv_env_merge(2, envp, arg_setenv);
2983 if (!env_use)
2984 return log_oom();
03cfe0d5
LP
2985
2986 /* Let the parent know that we are ready and
2987 * wait until the parent is ready with the
2988 * setup, too... */
baaa35ad
ZJS
2989 if (!barrier_place_and_sync(barrier)) /* #5 */
2990 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
2991 "Parent died too early");
03cfe0d5 2992
5f932eb9
LP
2993 if (arg_chdir)
2994 if (chdir(arg_chdir) < 0)
2995 return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
2996
7732f92b 2997 if (arg_start_mode == START_PID2) {
75bf701f 2998 r = stub_pid1(arg_uuid);
7732f92b
LP
2999 if (r < 0)
3000 return r;
3001 }
3002
de40a303
LP
3003 log_debug("Inner child completed, invoking payload.");
3004
8ca082b4
LP
3005 /* Now, explicitly close the log, so that we then can close all remaining fds. Closing the log explicitly first
3006 * has the benefit that the logging subsystem knows about it, and is thus ready to be reopened should we need
3007 * it again. Note that the other fds closed here are at least the locking and barrier fds. */
03cfe0d5 3008 log_close();
8ca082b4
LP
3009 log_set_open_when_needed(true);
3010
03cfe0d5
LP
3011 (void) fdset_close_others(fds);
3012
7732f92b 3013 if (arg_start_mode == START_BOOT) {
03cfe0d5
LP
3014 char **a;
3015 size_t m;
3016
3017 /* Automatically search for the init system */
3018
75f32f04
ZJS
3019 m = strv_length(arg_parameters);
3020 a = newa(char*, m + 2);
3021 memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
3022 a[1 + m] = NULL;
03cfe0d5 3023
ced58da7 3024 a[0] = (char*) "/usr/lib/systemd/systemd";
03cfe0d5
LP
3025 execve(a[0], a, env_use);
3026
ced58da7 3027 a[0] = (char*) "/lib/systemd/systemd";
03cfe0d5
LP
3028 execve(a[0], a, env_use);
3029
ced58da7 3030 a[0] = (char*) "/sbin/init";
03cfe0d5 3031 execve(a[0], a, env_use);
ced58da7
LP
3032
3033 exec_target = "/usr/lib/systemd/systemd, /lib/systemd/systemd, /sbin/init";
1a68e1e5 3034 } else if (!strv_isempty(arg_parameters)) {
b6b180b7
LP
3035 const char *dollar_path;
3036
1a68e1e5 3037 exec_target = arg_parameters[0];
b6b180b7
LP
3038
3039 /* Use the user supplied search $PATH if there is one, or DEFAULT_PATH_COMPAT if not to search the
3040 * binary. */
3041 dollar_path = strv_env_get(env_use, "PATH");
3042 if (dollar_path) {
3043 if (putenv((char*) dollar_path) != 0)
3044 return log_error_errno(errno, "Failed to update $PATH: %m");
3045 }
3046
f757855e 3047 execvpe(arg_parameters[0], arg_parameters, env_use);
1a68e1e5 3048 } else {
5f932eb9 3049 if (!arg_chdir)
d929b0f9
ZJS
3050 /* If we cannot change the directory, we'll end up in /, that is expected. */
3051 (void) chdir(home ?: "/root");
5f932eb9 3052
03cfe0d5
LP
3053 execle("/bin/bash", "-bash", NULL, env_use);
3054 execle("/bin/sh", "-sh", NULL, env_use);
ced58da7
LP
3055
3056 exec_target = "/bin/bash, /bin/sh";
03cfe0d5
LP
3057 }
3058
8ca082b4 3059 return log_error_errno(errno, "execv(%s) failed: %m", exec_target);
03cfe0d5
LP
3060}
3061
9c1e04d0 3062static int setup_sd_notify_child(void) {
271f518f 3063 _cleanup_close_ int fd = -1;
9c1e04d0 3064 union sockaddr_union sa = {
44ed5214
LP
3065 .un.sun_family = AF_UNIX,
3066 .un.sun_path = NSPAWN_NOTIFY_SOCKET_PATH,
9c1e04d0
AP
3067 };
3068 int r;
3069
3070 fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
3071 if (fd < 0)
3072 return log_error_errno(errno, "Failed to allocate notification socket: %m");
3073
3074 (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
fbda85b0 3075 (void) sockaddr_un_unlink(&sa.un);
9c1e04d0 3076
9c1e04d0 3077 r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
271f518f 3078 if (r < 0)
44ed5214 3079 return log_error_errno(errno, "bind(" NSPAWN_NOTIFY_SOCKET_PATH ") failed: %m");
9c1e04d0 3080
adc7d9f0 3081 r = userns_lchown(NSPAWN_NOTIFY_SOCKET_PATH, 0, 0);
271f518f 3082 if (r < 0)
adc7d9f0 3083 return log_error_errno(r, "Failed to chown " NSPAWN_NOTIFY_SOCKET_PATH ": %m");
adc7d9f0 3084
2ff48e98 3085 r = setsockopt_int(fd, SOL_SOCKET, SO_PASSCRED, true);
271f518f 3086 if (r < 0)
2ff48e98 3087 return log_error_errno(r, "SO_PASSCRED failed: %m");
9c1e04d0 3088
271f518f 3089 return TAKE_FD(fd);
9c1e04d0
AP
3090}
3091
03cfe0d5
LP
3092static int outer_child(
3093 Barrier *barrier,
3094 const char *directory,
3095 const char *console,
2d845785 3096 DissectedImage *dissected_image,
03cfe0d5
LP
3097 bool secondary,
3098 int pid_socket,
e01ff70a 3099 int uuid_socket,
9c1e04d0 3100 int notify_socket,
03cfe0d5
LP
3101 int kmsg_socket,
3102 int rtnl_socket,
825d5287 3103 int uid_shift_socket,
8199d554 3104 int unified_cgroup_hierarchy_socket,
d7bea6b6
DP
3105 FDSet *fds,
3106 int netns_fd) {
03cfe0d5 3107
bf428efb 3108 _cleanup_close_ int fd = -1;
03cfe0d5
LP
3109 pid_t pid;
3110 ssize_t l;
de40a303 3111 int r;
03cfe0d5 3112
b37469d7
LP
3113 /* This is the "outer" child process, i.e the one forked off by the container manager itself. It already has
3114 * its own CLONE_NEWNS namespace (which was created by the clone()). It still lives in the host's CLONE_NEWPID,
3115 * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER and CLONE_NEWNET namespaces. After it completed a number of
3116 * initializations a second child (the "inner" one) is forked off it, and it exits. */
3117
03cfe0d5
LP
3118 assert(barrier);
3119 assert(directory);
03cfe0d5 3120 assert(pid_socket >= 0);
e01ff70a 3121 assert(uuid_socket >= 0);
9c1e04d0 3122 assert(notify_socket >= 0);
03cfe0d5
LP
3123 assert(kmsg_socket >= 0);
3124
de40a303
LP
3125 log_debug("Outer child is initializing.");
3126
03cfe0d5
LP
3127 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
3128 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3129
de40a303 3130 if (arg_console_mode != CONSOLE_PIPE) {
2b33ab09 3131 int terminal;
03cfe0d5 3132
de40a303
LP
3133 assert(console);
3134
2b33ab09
LP
3135 terminal = open_terminal(console, O_RDWR);
3136 if (terminal < 0)
3137 return log_error_errno(terminal, "Failed to open console: %m");
03cfe0d5 3138
17cac366
LP
3139 /* Make sure we can continue logging to the original stderr, even if stderr points elsewhere now */
3140 r = log_dup_console();
3141 if (r < 0)
3142 return log_error_errno(r, "Failed to duplicate stderr: %m");
3143
2b33ab09
LP
3144 r = rearrange_stdio(terminal, terminal, terminal); /* invalidates 'terminal' on success and failure */
3145 if (r < 0)
3146 return log_error_errno(r, "Failed to move console to stdin/stdout/stderr: %m");
03cfe0d5
LP
3147 }
3148
3149 r = reset_audit_loginuid();
3150 if (r < 0)
3151 return r;
3152
3153 /* Mark everything as slave, so that we still
3154 * receive mounts from the real root, but don't
3155 * propagate mounts to the real root. */
60e76d48
ZJS
3156 r = mount_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
3157 if (r < 0)
3158 return r;
03cfe0d5 3159
2d845785 3160 if (dissected_image) {
2d3a5a73
LP
3161 /* If we are operating on a disk image, then mount its root directory now, but leave out the rest. We
3162 * can read the UID shift from it if we need to. Further down we'll mount the rest, but then with the
3163 * uid shift known. That way we can mount VFAT file systems shifted to the right place right away. This
3164 * makes sure ESP partitions and userns are compatible. */
3165
3166 r = dissected_image_mount(dissected_image, directory, arg_uid_shift,
03bcb6d4
LP
3167 DISSECT_IMAGE_MOUNT_ROOT_ONLY|DISSECT_IMAGE_DISCARD_ON_LOOP|
3168 (arg_read_only ? DISSECT_IMAGE_READ_ONLY : 0)|
3169 (arg_start_mode == START_BOOT ? DISSECT_IMAGE_VALIDATE_OS : 0));
2d845785
LP
3170 if (r < 0)
3171 return r;
3172 }
03cfe0d5 3173
391567f4
LP
3174 r = determine_uid_shift(directory);
3175 if (r < 0)
3176 return r;
3177
0de7acce 3178 if (arg_userns_mode != USER_NAMESPACE_NO) {
0e7ac751 3179 /* Let the parent know which UID shift we read from the image */
825d5287
RM
3180 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
3181 if (l < 0)
3182 return log_error_errno(errno, "Failed to send UID shift: %m");
baaa35ad
ZJS
3183 if (l != sizeof(arg_uid_shift))
3184 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3185 "Short write while sending UID shift.");
0e7ac751 3186
0de7acce 3187 if (arg_userns_mode == USER_NAMESPACE_PICK) {
0e7ac751
LP
3188 /* When we are supposed to pick the UID shift, the parent will check now whether the UID shift
3189 * we just read from the image is available. If yes, it will send the UID shift back to us, if
3190 * not it will pick a different one, and send it back to us. */
3191
3192 l = recv(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
3193 if (l < 0)
3194 return log_error_errno(errno, "Failed to recv UID shift: %m");
baaa35ad
ZJS
3195 if (l != sizeof(arg_uid_shift))
3196 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3197 "Short read while receiving UID shift.");
0e7ac751
LP
3198 }
3199
ff6c6cc1
LP
3200 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
3201 "Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
825d5287
RM
3202 }
3203
e50cd82f
LP
3204 if (!dissected_image) {
3205 /* Turn directory into bind mount */
3206 r = mount_verbose(LOG_ERR, directory, directory, NULL, MS_BIND|MS_REC, NULL);
3207 if (r < 0)
3208 return r;
3209 }
7d0ecdd6
LP
3210
3211 r = setup_pivot_root(
3212 directory,
3213 arg_pivot_root_new,
3214 arg_pivot_root_old);
3215 if (r < 0)
3216 return r;
3217
3218 r = setup_volatile_mode(
3219 directory,
3220 arg_volatile_mode,
3221 arg_userns_mode != USER_NAMESPACE_NO,
3222 arg_uid_shift,
3223 arg_uid_range,
3224 arg_selinux_context);
3225 if (r < 0)
3226 return r;
3227
2d3a5a73
LP
3228 if (dissected_image) {
3229 /* Now we know the uid shift, let's now mount everything else that might be in the image. */
3230 r = dissected_image_mount(dissected_image, directory, arg_uid_shift,
3231 DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY|DISSECT_IMAGE_DISCARD_ON_LOOP|(arg_read_only ? DISSECT_IMAGE_READ_ONLY : 0));
3232 if (r < 0)
3233 return r;
3234 }
3235
8199d554
LP
3236 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
3237 /* OK, we don't know yet which cgroup mode to use yet. Let's figure it out, and tell the parent. */
3238
3239 r = detect_unified_cgroup_hierarchy_from_image(directory);
3240 if (r < 0)
3241 return r;
3242
3243 l = send(unified_cgroup_hierarchy_socket, &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), MSG_NOSIGNAL);
3244 if (l < 0)
3245 return log_error_errno(errno, "Failed to send cgroup mode: %m");
baaa35ad
ZJS
3246 if (l != sizeof(arg_unified_cgroup_hierarchy))
3247 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3248 "Short write while sending cgroup mode.");
8199d554
LP
3249
3250 unified_cgroup_hierarchy_socket = safe_close(unified_cgroup_hierarchy_socket);
3251 }
3252
4ad14eff
LP
3253 /* Mark everything as shared so our mounts get propagated down. This is
3254 * required to make new bind mounts available in systemd services
3255 * inside the containter that create a new mount namespace.
3256 * See https://github.com/systemd/systemd/issues/3860
3257 * Further submounts (such as /dev) done after this will inherit the
13e785f7 3258 * shared propagation mode. */
4ad14eff
LP
3259 r = mount_verbose(LOG_ERR, NULL, directory, NULL, MS_SHARED|MS_REC, NULL);
3260 if (r < 0)
3261 return r;
3262
3263 r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
3264 if (r < 0)
3265 return r;
3266
03cfe0d5
LP
3267 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
3268 if (r < 0)
3269 return r;
3270
e5a4bb0d 3271 if (arg_read_only && arg_volatile_mode == VOLATILE_NO) {
6b7c9f8b 3272 r = bind_remount_recursive(directory, true, NULL);
03cfe0d5
LP
3273 if (r < 0)
3274 return log_error_errno(r, "Failed to make tree read-only: %m");
3275 }
3276
0de7acce 3277 r = mount_all(directory,
4f086aab 3278 arg_mount_settings,
0de7acce 3279 arg_uid_shift,
0de7acce 3280 arg_selinux_apifs_context);
03cfe0d5
LP
3281 if (r < 0)
3282 return r;
3283
07fa00f9
LP
3284 r = copy_devnodes(directory);
3285 if (r < 0)
03cfe0d5
LP
3286 return r;
3287
de40a303
LP
3288 r = make_extra_nodes(directory);
3289 if (r < 0)
3290 return r;
3291
3292 (void) dev_setup(directory, arg_uid_shift, arg_uid_shift);
3293 (void) make_inaccessible_nodes(directory, arg_uid_shift, arg_uid_shift);
03cfe0d5 3294
07fa00f9
LP
3295 r = setup_pts(directory);
3296 if (r < 0)
03cfe0d5
LP
3297 return r;
3298
3299 r = setup_propagate(directory);
3300 if (r < 0)
3301 return r;
3302
3303 r = setup_dev_console(directory, console);
3304 if (r < 0)
3305 return r;
3306
8e5430c4
LP
3307 r = setup_keyring();
3308 if (r < 0)
3309 return r;
3310
03cfe0d5
LP
3311 r = setup_timezone(directory);
3312 if (r < 0)
3313 return r;
3314
3315 r = setup_resolv_conf(directory);
3316 if (r < 0)
3317 return r;
3318
e01ff70a
MS
3319 r = setup_machine_id(directory);
3320 if (r < 0)
3321 return r;
3322
03cfe0d5
LP
3323 r = setup_journal(directory);
3324 if (r < 0)
3325 return r;
3326
0de7acce
LP
3327 r = mount_custom(
3328 directory,
3329 arg_custom_mounts,
3330 arg_n_custom_mounts,
3331 arg_userns_mode != USER_NAMESPACE_NO,
3332 arg_uid_shift,
3333 arg_uid_range,
de40a303
LP
3334 arg_selinux_apifs_context,
3335 false);
03cfe0d5
LP
3336 if (r < 0)
3337 return r;
3338
489fae52 3339 if (!arg_use_cgns) {
0996ef00
CB
3340 r = mount_cgroups(
3341 directory,
3342 arg_unified_cgroup_hierarchy,
3343 arg_userns_mode != USER_NAMESPACE_NO,
3344 arg_uid_shift,
3345 arg_uid_range,
5a8ff0e6 3346 arg_selinux_apifs_context,
ada54120 3347 false);
0996ef00
CB
3348 if (r < 0)
3349 return r;
3350 }
03cfe0d5
LP
3351
3352 r = mount_move_root(directory);
3353 if (r < 0)
3354 return log_error_errno(r, "Failed to move root directory: %m");
3355
9c1e04d0
AP
3356 fd = setup_sd_notify_child();
3357 if (fd < 0)
3358 return fd;
3359
03cfe0d5 3360 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
0c582db0 3361 arg_clone_ns_flags |
8869a0b4 3362 (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
03cfe0d5
LP
3363 if (pid < 0)
3364 return log_error_errno(errno, "Failed to fork inner child: %m");
03cfe0d5
LP
3365 if (pid == 0) {
3366 pid_socket = safe_close(pid_socket);
e01ff70a 3367 uuid_socket = safe_close(uuid_socket);
9c1e04d0 3368 notify_socket = safe_close(notify_socket);
825d5287 3369 uid_shift_socket = safe_close(uid_shift_socket);
03cfe0d5
LP
3370
3371 /* The inner child has all namespaces that are
3372 * requested, so that we all are owned by the user if
3373 * user namespaces are turned on. */
3374
d7bea6b6
DP
3375 if (arg_network_namespace_path) {
3376 r = namespace_enter(-1, -1, netns_fd, -1, -1);
3377 if (r < 0)
e2d39e54 3378 return log_error_errno(r, "Failed to join network namespace: %m");
d7bea6b6
DP
3379 }
3380
f757855e 3381 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
03cfe0d5
LP
3382 if (r < 0)
3383 _exit(EXIT_FAILURE);
3384
3385 _exit(EXIT_SUCCESS);
3386 }
3387
3388 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
3389 if (l < 0)
3390 return log_error_errno(errno, "Failed to send PID: %m");
baaa35ad
ZJS
3391 if (l != sizeof(pid))
3392 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3393 "Short write while sending PID.");
03cfe0d5 3394
e01ff70a
MS
3395 l = send(uuid_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
3396 if (l < 0)
3397 return log_error_errno(errno, "Failed to send machine ID: %m");
baaa35ad
ZJS
3398 if (l != sizeof(arg_uuid))
3399 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3400 "Short write while sending machine ID.");
e01ff70a 3401
9c1e04d0
AP
3402 l = send_one_fd(notify_socket, fd, 0);
3403 if (l < 0)
3404 return log_error_errno(errno, "Failed to send notify fd: %m");
3405
03cfe0d5 3406 pid_socket = safe_close(pid_socket);
e01ff70a 3407 uuid_socket = safe_close(uuid_socket);
9c1e04d0 3408 notify_socket = safe_close(notify_socket);
327e26d6
KN
3409 kmsg_socket = safe_close(kmsg_socket);
3410 rtnl_socket = safe_close(rtnl_socket);
d7bea6b6 3411 netns_fd = safe_close(netns_fd);
03cfe0d5
LP
3412
3413 return 0;
3414}
3415
0e7ac751 3416static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
d381c8a6 3417 bool tried_hashed = false;
0e7ac751
LP
3418 unsigned n_tries = 100;
3419 uid_t candidate;
3420 int r;
3421
3422 assert(shift);
3423 assert(ret_lock_file);
0de7acce 3424 assert(arg_userns_mode == USER_NAMESPACE_PICK);
0e7ac751
LP
3425 assert(arg_uid_range == 0x10000U);
3426
3427 candidate = *shift;
3428
3429 (void) mkdir("/run/systemd/nspawn-uid", 0755);
3430
3431 for (;;) {
fbd0b64f 3432 char lock_path[STRLEN("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
8e766630 3433 _cleanup_(release_lock_file) LockFile lf = LOCK_FILE_INIT;
0e7ac751
LP
3434
3435 if (--n_tries <= 0)
3436 return -EBUSY;
3437
87d5e4f2 3438 if (candidate < CONTAINER_UID_BASE_MIN || candidate > CONTAINER_UID_BASE_MAX)
0e7ac751
LP
3439 goto next;
3440 if ((candidate & UINT32_C(0xFFFF)) != 0)
3441 goto next;
3442
3443 xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
3444 r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
3445 if (r == -EBUSY) /* Range already taken by another nspawn instance */
3446 goto next;
3447 if (r < 0)
3448 return r;
3449
3450 /* Make some superficial checks whether the range is currently known in the user database */
3451 if (getpwuid(candidate))
3452 goto next;
3453 if (getpwuid(candidate + UINT32_C(0xFFFE)))
3454 goto next;
3455 if (getgrgid(candidate))
3456 goto next;
3457 if (getgrgid(candidate + UINT32_C(0xFFFE)))
3458 goto next;
3459
3460 *ret_lock_file = lf;
3461 lf = (struct LockFile) LOCK_FILE_INIT;
3462 *shift = candidate;
3463 return 0;
3464
3465 next:
d381c8a6
LP
3466 if (arg_machine && !tried_hashed) {
3467 /* Try to hash the base from the container name */
3468
3469 static const uint8_t hash_key[] = {
3470 0xe1, 0x56, 0xe0, 0xf0, 0x4a, 0xf0, 0x41, 0xaf,
3471 0x96, 0x41, 0xcf, 0x41, 0x33, 0x94, 0xff, 0x72
3472 };
3473
3474 candidate = (uid_t) siphash24(arg_machine, strlen(arg_machine), hash_key);
3475
3476 tried_hashed = true;
3477 } else
3478 random_bytes(&candidate, sizeof(candidate));
3479
87d5e4f2 3480 candidate = (candidate % (CONTAINER_UID_BASE_MAX - CONTAINER_UID_BASE_MIN)) + CONTAINER_UID_BASE_MIN;
0e7ac751
LP
3481 candidate &= (uid_t) UINT32_C(0xFFFF0000);
3482 }
3483}
3484
03cfe0d5 3485static int setup_uid_map(pid_t pid) {
fbd0b64f 3486 char uid_map[STRLEN("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
03cfe0d5
LP
3487 int r;
3488
3489 assert(pid > 1);
3490
3491 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
3492 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
57512c89 3493 r = write_string_file(uid_map, line, WRITE_STRING_FILE_DISABLE_BUFFER);
03cfe0d5
LP
3494 if (r < 0)
3495 return log_error_errno(r, "Failed to write UID map: %m");
3496
3497 /* We always assign the same UID and GID ranges */
3498 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
57512c89 3499 r = write_string_file(uid_map, line, WRITE_STRING_FILE_DISABLE_BUFFER);
03cfe0d5
LP
3500 if (r < 0)
3501 return log_error_errno(r, "Failed to write GID map: %m");
3502
3503 return 0;
3504}
3505
9c1e04d0 3506static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
9c1e04d0
AP
3507 char buf[NOTIFY_BUFFER_MAX+1];
3508 char *p = NULL;
3509 struct iovec iovec = {
3510 .iov_base = buf,
3511 .iov_len = sizeof(buf)-1,
3512 };
3513 union {
3514 struct cmsghdr cmsghdr;
3515 uint8_t buf[CMSG_SPACE(sizeof(struct ucred)) +
3516 CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)];
3517 } control = {};
3518 struct msghdr msghdr = {
3519 .msg_iov = &iovec,
3520 .msg_iovlen = 1,
3521 .msg_control = &control,
3522 .msg_controllen = sizeof(control),
3523 };
3524 struct cmsghdr *cmsg;
3525 struct ucred *ucred = NULL;
3526 ssize_t n;
3527 pid_t inner_child_pid;
3528 _cleanup_strv_free_ char **tags = NULL;
3529
3530 assert(userdata);
3531
3532 inner_child_pid = PTR_TO_PID(userdata);
3533
3534 if (revents != EPOLLIN) {
3535 log_warning("Got unexpected poll event for notify fd.");
3536 return 0;
3537 }
3538
3539 n = recvmsg(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
3540 if (n < 0) {
3742095b 3541 if (IN_SET(errno, EAGAIN, EINTR))
9c1e04d0
AP
3542 return 0;
3543
3544 return log_warning_errno(errno, "Couldn't read notification socket: %m");
3545 }
3546 cmsg_close_all(&msghdr);
3547
3548 CMSG_FOREACH(cmsg, &msghdr) {
3549 if (cmsg->cmsg_level == SOL_SOCKET &&
3550 cmsg->cmsg_type == SCM_CREDENTIALS &&
3551 cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred))) {
3552
3553 ucred = (struct ucred*) CMSG_DATA(cmsg);
3554 }
3555 }
3556
3557 if (!ucred || ucred->pid != inner_child_pid) {
8cb57430 3558 log_debug("Received notify message without valid credentials. Ignoring.");
9c1e04d0
AP
3559 return 0;
3560 }
3561
3562 if ((size_t) n >= sizeof(buf)) {
3563 log_warning("Received notify message exceeded maximum size. Ignoring.");
3564 return 0;
3565 }
3566
3567 buf[n] = 0;
3568 tags = strv_split(buf, "\n\r");
3569 if (!tags)
3570 return log_oom();
3571
3572 if (strv_find(tags, "READY=1"))
3573 sd_notifyf(false, "READY=1\n");
3574
3575 p = strv_find_startswith(tags, "STATUS=");
3576 if (p)
3577 sd_notifyf(false, "STATUS=Container running: %s", p);
3578
3579 return 0;
3580}
3581
5773024d 3582static int setup_sd_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid, sd_event_source **notify_event_source) {
9c1e04d0 3583 int r;
9c1e04d0 3584
5773024d 3585 r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
9c1e04d0
AP
3586 if (r < 0)
3587 return log_error_errno(r, "Failed to allocate notify event source: %m");
3588
5773024d 3589 (void) sd_event_source_set_description(*notify_event_source, "nspawn-notify");
9c1e04d0
AP
3590
3591 return 0;
3592}
3593
5d961407
LP
3594static int merge_settings(Settings *settings, const char *path) {
3595 int rl;
f757855e 3596
5d961407
LP
3597 assert(settings);
3598 assert(path);
f757855e 3599
5d961407
LP
3600 /* Copy over bits from the settings, unless they have been explicitly masked by command line switches. Note
3601 * that this steals the fields of the Settings* structure, and hence modifies it. */
f757855e 3602
7732f92b
LP
3603 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
3604 settings->start_mode >= 0) {
3605 arg_start_mode = settings->start_mode;
130d3d22 3606 strv_free_and_replace(arg_parameters, settings->parameters);
f757855e
LP
3607 }
3608
a2f577fc
JL
3609 if ((arg_settings_mask & SETTING_EPHEMERAL) == 0)
3610 arg_ephemeral = settings->ephemeral;
3611
de40a303
LP
3612 if ((arg_settings_mask & SETTING_DIRECTORY) == 0 &&
3613 settings->root) {
3614
3615 if (!arg_settings_trusted)
3616 log_warning("Ignoring root directory setting, file %s is not trusted.", path);
3617 else
3618 free_and_replace(arg_directory, settings->root);
3619 }
3620
b53ede69
PW
3621 if ((arg_settings_mask & SETTING_PIVOT_ROOT) == 0 &&
3622 settings->pivot_root_new) {
3623 free_and_replace(arg_pivot_root_new, settings->pivot_root_new);
3624 free_and_replace(arg_pivot_root_old, settings->pivot_root_old);
3625 }
3626
5f932eb9 3627 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
1cc6c93a
YW
3628 settings->working_directory)
3629 free_and_replace(arg_chdir, settings->working_directory);
5f932eb9 3630
f757855e 3631 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
130d3d22
YW
3632 settings->environment)
3633 strv_free_and_replace(arg_setenv, settings->environment);
f757855e 3634
de40a303
LP
3635 if ((arg_settings_mask & SETTING_USER) == 0) {
3636
3637 if (settings->user)
3638 free_and_replace(arg_user, settings->user);
3639
3640 if (uid_is_valid(settings->uid))
3641 arg_uid = settings->uid;
3642 if (gid_is_valid(settings->gid))
3643 arg_gid = settings->gid;
3644 if (settings->n_supplementary_gids > 0) {
3645 free_and_replace(arg_supplementary_gids, settings->supplementary_gids);
3646 arg_n_supplementary_gids = settings->n_supplementary_gids;
3647 }
3648 }
f757855e
LP
3649
3650 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
a3fc6b55 3651 uint64_t plus, minus;
f757855e 3652
de40a303
LP
3653 /* Note that we copy both the simple plus/minus caps here, and the full quintet from the
3654 * Settings structure */
3655
0e265674 3656 plus = settings->capability;
a3fc6b55
LP
3657 minus = settings->drop_capability;
3658
3659 if ((arg_settings_mask & SETTING_NETWORK) == 0) {
3660 if (settings_private_network(settings))
3661 plus |= UINT64_C(1) << CAP_NET_ADMIN;
3662 else
3663 minus |= UINT64_C(1) << CAP_NET_ADMIN;
3664 }
0e265674
LP
3665
3666 if (!arg_settings_trusted && plus != 0) {
3667 if (settings->capability != 0)
5d961407 3668 log_warning("Ignoring Capability= setting, file %s is not trusted.", path);
0e265674 3669 } else
520e0d54 3670 arg_caps_retain |= plus;
f757855e 3671
a3fc6b55 3672 arg_caps_retain &= ~minus;
de40a303
LP
3673
3674 /* Copy the full capabilities over too */
3675 if (capability_quintet_is_set(&settings->full_capabilities)) {
3676 if (!arg_settings_trusted)
3677 log_warning("Ignoring capabilitiy settings, file %s is not trusted.", path);
3678 else
3679 arg_full_capabilities = settings->full_capabilities;
3680 }
f757855e
LP
3681 }
3682
3683 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
3684 settings->kill_signal > 0)
3685 arg_kill_signal = settings->kill_signal;
3686
3687 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
3688 settings->personality != PERSONALITY_INVALID)
3689 arg_personality = settings->personality;
3690
3691 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
3692 !sd_id128_is_null(settings->machine_id)) {
3693
3694 if (!arg_settings_trusted)
5d961407 3695 log_warning("Ignoring MachineID= setting, file %s is not trusted.", path);
f757855e
LP
3696 else
3697 arg_uuid = settings->machine_id;
3698 }
3699
3700 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
3701 settings->read_only >= 0)
3702 arg_read_only = settings->read_only;
3703
3704 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
3705 settings->volatile_mode != _VOLATILE_MODE_INVALID)
3706 arg_volatile_mode = settings->volatile_mode;
3707
3708 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
3709 settings->n_custom_mounts > 0) {
3710
3711 if (!arg_settings_trusted)
5d961407 3712 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", path);
f757855e
LP
3713 else {
3714 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
1cc6c93a 3715 arg_custom_mounts = TAKE_PTR(settings->custom_mounts);
f757855e 3716 arg_n_custom_mounts = settings->n_custom_mounts;
f757855e
LP
3717 settings->n_custom_mounts = 0;
3718 }
3719 }
3720
3721 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
3722 (settings->private_network >= 0 ||
3723 settings->network_veth >= 0 ||
3724 settings->network_bridge ||
22b28dfd 3725 settings->network_zone ||
f757855e
LP
3726 settings->network_interfaces ||
3727 settings->network_macvlan ||
f6d6bad1 3728 settings->network_ipvlan ||
de40a303
LP
3729 settings->network_veth_extra ||
3730 settings->network_namespace_path)) {
f757855e
LP
3731
3732 if (!arg_settings_trusted)
5d961407 3733 log_warning("Ignoring network settings, file %s is not trusted.", path);
f757855e 3734 else {
f6d6bad1 3735 arg_network_veth = settings_network_veth(settings);
0e265674
LP
3736 arg_private_network = settings_private_network(settings);
3737
130d3d22
YW
3738 strv_free_and_replace(arg_network_interfaces, settings->network_interfaces);
3739 strv_free_and_replace(arg_network_macvlan, settings->network_macvlan);
3740 strv_free_and_replace(arg_network_ipvlan, settings->network_ipvlan);
3741 strv_free_and_replace(arg_network_veth_extra, settings->network_veth_extra);
f6d6bad1 3742
1cc6c93a
YW
3743 free_and_replace(arg_network_bridge, settings->network_bridge);
3744 free_and_replace(arg_network_zone, settings->network_zone);
de40a303
LP
3745
3746 free_and_replace(arg_network_namespace_path, settings->network_namespace_path);
f757855e
LP
3747 }
3748 }
3749
3750 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
3751 settings->expose_ports) {
3752
3753 if (!arg_settings_trusted)
5d961407 3754 log_warning("Ignoring Port= setting, file %s is not trusted.", path);
f757855e
LP
3755 else {
3756 expose_port_free_all(arg_expose_ports);
1cc6c93a 3757 arg_expose_ports = TAKE_PTR(settings->expose_ports);
f757855e
LP
3758 }
3759 }
3760
0de7acce
LP
3761 if ((arg_settings_mask & SETTING_USERNS) == 0 &&
3762 settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
3763
3764 if (!arg_settings_trusted)
5d961407 3765 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", path);
0de7acce
LP
3766 else {
3767 arg_userns_mode = settings->userns_mode;
3768 arg_uid_shift = settings->uid_shift;
3769 arg_uid_range = settings->uid_range;
3770 arg_userns_chown = settings->userns_chown;
3771 }
3772 }
3773
9c1e04d0
AP
3774 if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0)
3775 arg_notify_ready = settings->notify_ready;
3776
960e4569
LP
3777 if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) {
3778
de40a303 3779 if (!arg_settings_trusted && !strv_isempty(settings->syscall_whitelist))
5d961407 3780 log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", path);
960e4569 3781 else {
130d3d22
YW
3782 strv_free_and_replace(arg_syscall_whitelist, settings->syscall_whitelist);
3783 strv_free_and_replace(arg_syscall_blacklist, settings->syscall_blacklist);
960e4569 3784 }
de40a303
LP
3785
3786#if HAVE_SECCOMP
3787 if (!arg_settings_trusted && settings->seccomp)
3788 log_warning("Ignoring SECCOMP filter, file %s is not trusted.", path);
3789 else {
3790 seccomp_release(arg_seccomp);
3791 arg_seccomp = TAKE_PTR(settings->seccomp);
3792 }
3793#endif
960e4569
LP
3794 }
3795
bf428efb
LP
3796 for (rl = 0; rl < _RLIMIT_MAX; rl ++) {
3797 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)))
3798 continue;
3799
3800 if (!settings->rlimit[rl])
3801 continue;
3802
3803 if (!arg_settings_trusted) {
5d961407 3804 log_warning("Ignoring Limit%s= setting, file '%s' is not trusted.", rlimit_to_string(rl), path);
bf428efb
LP
3805 continue;
3806 }
3807
3808 free_and_replace(arg_rlimit[rl], settings->rlimit[rl]);
3809 }
3810
3a9530e5
LP
3811 if ((arg_settings_mask & SETTING_HOSTNAME) == 0 &&
3812 settings->hostname)
3813 free_and_replace(arg_hostname, settings->hostname);
3814
66edd963
LP
3815 if ((arg_settings_mask & SETTING_NO_NEW_PRIVILEGES) == 0 &&
3816 settings->no_new_privileges >= 0)
3817 arg_no_new_privileges = settings->no_new_privileges;
3818
81f345df
LP
3819 if ((arg_settings_mask & SETTING_OOM_SCORE_ADJUST) == 0 &&
3820 settings->oom_score_adjust_set) {
3821
3822 if (!arg_settings_trusted)
5d961407 3823 log_warning("Ignoring OOMScoreAdjust= setting, file '%s' is not trusted.", path);
81f345df
LP
3824 else {
3825 arg_oom_score_adjust = settings->oom_score_adjust;
3826 arg_oom_score_adjust_set = true;
3827 }
3828 }
3829
d107bb7d
LP
3830 if ((arg_settings_mask & SETTING_CPU_AFFINITY) == 0 &&
3831 settings->cpuset) {
3832
3833 if (!arg_settings_trusted)
5d961407 3834 log_warning("Ignoring CPUAffinity= setting, file '%s' is not trusted.", path);
d107bb7d
LP
3835 else {
3836 if (arg_cpuset)
3837 CPU_FREE(arg_cpuset);
3838 arg_cpuset = TAKE_PTR(settings->cpuset);
3839 arg_cpuset_ncpus = settings->cpuset_ncpus;
3840 }
3841 }
3842
09d423e9
LP
3843 if ((arg_settings_mask & SETTING_RESOLV_CONF) == 0 &&
3844 settings->resolv_conf != _RESOLV_CONF_MODE_INVALID)
3845 arg_resolv_conf = settings->resolv_conf;
3846
4e1d6aa9
LP
3847 if ((arg_settings_mask & SETTING_LINK_JOURNAL) == 0 &&
3848 settings->link_journal != _LINK_JOURNAL_INVALID) {
3849
3850 if (!arg_settings_trusted)
3851 log_warning("Ignoring journal link setting, file '%s' is not trusted.", path);
3852 else {
3853 arg_link_journal = settings->link_journal;
3854 arg_link_journal_try = settings->link_journal_try;
3855 }
3856 }
3857
1688841f
LP
3858 if ((arg_settings_mask & SETTING_TIMEZONE) == 0 &&
3859 settings->timezone != _TIMEZONE_MODE_INVALID)
3860 arg_timezone = settings->timezone;
3861
de40a303
LP
3862 if ((arg_settings_mask & SETTING_SLICE) == 0 &&
3863 settings->slice) {
3864
3865 if (!arg_settings_trusted)
3866 log_warning("Ignoring slice setting, file '%s' is not trusted.", path);
3867 else
3868 free_and_replace(arg_slice, settings->slice);
3869 }
3870
3871 if ((arg_settings_mask & SETTING_USE_CGNS) == 0 &&
3872 settings->use_cgns >= 0) {
3873
3874 if (!arg_settings_trusted)
3875 log_warning("Ignoring cgroup namespace setting, file '%s' is not trusted.", path);
3876 else
3877 arg_use_cgns = settings->use_cgns;
3878 }
3879
3880 if ((arg_settings_mask & SETTING_CLONE_NS_FLAGS) == 0 &&
3881 settings->clone_ns_flags != (unsigned long) -1) {
3882
3883 if (!arg_settings_trusted)
3884 log_warning("Ignoring namespace setting, file '%s' is not trusted.", path);
3885 else
3886 arg_clone_ns_flags = settings->clone_ns_flags;
3887 }
3888
3889 if ((arg_settings_mask & SETTING_CONSOLE_MODE) == 0 &&
3890 settings->console_mode >= 0) {
3891
3892 if (!arg_settings_trusted)
3893 log_warning("Ignoring console mode setting, file '%s' is not trusted.", path);
3894 else
3895 arg_console_mode = settings->console_mode;
3896 }
3897
3898 /* The following properties can only be set through the OCI settings logic, not from the command line, hence we
3899 * don't consult arg_settings_mask for them. */
3900
3901 sd_bus_message_unref(arg_property_message);
3902 arg_property_message = TAKE_PTR(settings->properties);
3903
3904 arg_console_width = settings->console_width;
3905 arg_console_height = settings->console_height;
3906
3907 device_node_free_many(arg_extra_nodes, arg_n_extra_nodes);
3908 arg_extra_nodes = TAKE_PTR(settings->extra_nodes);
3909 arg_n_extra_nodes = settings->n_extra_nodes;
3910
f757855e
LP
3911 return 0;
3912}
3913
5d961407
LP
3914static int load_settings(void) {
3915 _cleanup_(settings_freep) Settings *settings = NULL;
3916 _cleanup_fclose_ FILE *f = NULL;
3917 _cleanup_free_ char *p = NULL;
3918 const char *fn, *i;
3919 int r;
3920
de40a303
LP
3921 if (arg_oci_bundle)
3922 return 0;
3923
5d961407
LP
3924 /* If all settings are masked, there's no point in looking for
3925 * the settings file */
3926 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
3927 return 0;
3928
3929 fn = strjoina(arg_machine, ".nspawn");
3930
3931 /* We first look in the admin's directories in /etc and /run */
3932 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
3933 _cleanup_free_ char *j = NULL;
3934
3935 j = strjoin(i, "/", fn);
3936 if (!j)
3937 return log_oom();
3938
3939 f = fopen(j, "re");
3940 if (f) {
3941 p = TAKE_PTR(j);
3942
3943 /* By default, we trust configuration from /etc and /run */
3944 if (arg_settings_trusted < 0)
3945 arg_settings_trusted = true;
3946
3947 break;
3948 }
3949
3950 if (errno != ENOENT)
3951 return log_error_errno(errno, "Failed to open %s: %m", j);
3952 }
3953
3954 if (!f) {
3955 /* After that, let's look for a file next to the
3956 * actual image we shall boot. */
3957
3958 if (arg_image) {
3959 p = file_in_same_dir(arg_image, fn);
3960 if (!p)
3961 return log_oom();
3962 } else if (arg_directory) {
3963 p = file_in_same_dir(arg_directory, fn);
3964 if (!p)
3965 return log_oom();
3966 }
3967
3968 if (p) {
3969 f = fopen(p, "re");
3970 if (!f && errno != ENOENT)
3971 return log_error_errno(errno, "Failed to open %s: %m", p);
3972
3973 /* By default, we do not trust configuration from /var/lib/machines */
3974 if (arg_settings_trusted < 0)
3975 arg_settings_trusted = false;
3976 }
3977 }
3978
3979 if (!f)
3980 return 0;
3981
3982 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
3983
3984 r = settings_load(f, p, &settings);
3985 if (r < 0)
3986 return r;
3987
3988 return merge_settings(settings, p);
3989}
3990
de40a303
LP
3991static int load_oci_bundle(void) {
3992 _cleanup_(settings_freep) Settings *settings = NULL;
3993 int r;
3994
3995 if (!arg_oci_bundle)
3996 return 0;
3997
3998 /* By default let's trust OCI bundles */
3999 if (arg_settings_trusted < 0)
4000 arg_settings_trusted = true;
4001
4002 r = oci_load(NULL, arg_oci_bundle, &settings);
4003 if (r < 0)
4004 return r;
4005
4006 return merge_settings(settings, arg_oci_bundle);
4007}
4008
b0067625
ZJS
4009static int run(int master,
4010 const char* console,
2d845785 4011 DissectedImage *dissected_image,
b0067625
ZJS
4012 bool secondary,
4013 FDSet *fds,
4014 char veth_name[IFNAMSIZ], bool *veth_created,
4015 union in_addr_union *exposed,
4016 pid_t *pid, int *ret) {
4017
4018 static const struct sigaction sa = {
4019 .sa_handler = nop_signal_handler,
e28c7cd0 4020 .sa_flags = SA_NOCLDSTOP|SA_RESTART,
b0067625
ZJS
4021 };
4022
8e766630 4023 _cleanup_(release_lock_file) LockFile uid_shift_lock = LOCK_FILE_INIT;
b0067625
ZJS
4024 _cleanup_close_ int etc_passwd_lock = -1;
4025 _cleanup_close_pair_ int
4026 kmsg_socket_pair[2] = { -1, -1 },
4027 rtnl_socket_pair[2] = { -1, -1 },
4028 pid_socket_pair[2] = { -1, -1 },
4029 uuid_socket_pair[2] = { -1, -1 },
4030 notify_socket_pair[2] = { -1, -1 },
8199d554
LP
4031 uid_shift_socket_pair[2] = { -1, -1 },
4032 unified_cgroup_hierarchy_socket_pair[2] = { -1, -1};
4033
b0067625
ZJS
4034 _cleanup_close_ int notify_socket= -1;
4035 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
5773024d 4036 _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL;
b0067625
ZJS
4037 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
4038 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4039 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
abdb9b08 4040 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
b0067625 4041 ContainerStatus container_status = 0;
b0067625
ZJS
4042 int ifi = 0, r;
4043 ssize_t l;
4044 sigset_t mask_chld;
d7bea6b6 4045 _cleanup_close_ int netns_fd = -1;
b0067625
ZJS
4046
4047 assert_se(sigemptyset(&mask_chld) == 0);
4048 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
4049
4050 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4051 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
4052 * check with getpwuid() if the specific user already exists. Note that /etc might be
4053 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
4054 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
4055 * really just an extra safety net. We kinda assume that the UID range we allocate from is
4056 * really ours. */
4057
4058 etc_passwd_lock = take_etc_passwd_lock(NULL);
4059 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS)
4060 return log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m");
4061 }
4062
4063 r = barrier_create(&barrier);
4064 if (r < 0)
4065 return log_error_errno(r, "Cannot initialize IPC barrier: %m");
4066
4067 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0)
4068 return log_error_errno(errno, "Failed to create kmsg socket pair: %m");
4069
4070 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0)
4071 return log_error_errno(errno, "Failed to create rtnl socket pair: %m");
4072
4073 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0)
4074 return log_error_errno(errno, "Failed to create pid socket pair: %m");
4075
4076 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0)
4077 return log_error_errno(errno, "Failed to create id socket pair: %m");
4078
4079 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0)
4080 return log_error_errno(errno, "Failed to create notify socket pair: %m");
4081
4082 if (arg_userns_mode != USER_NAMESPACE_NO)
4083 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0)
4084 return log_error_errno(errno, "Failed to create uid shift socket pair: %m");
4085
8199d554
LP
4086 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN)
4087 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, unified_cgroup_hierarchy_socket_pair) < 0)
4088 return log_error_errno(errno, "Failed to create unified cgroup socket pair: %m");
4089
b0067625
ZJS
4090 /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
4091 * parent's blocking calls and give it a chance to call wait() and terminate. */
4092 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
4093 if (r < 0)
4094 return log_error_errno(errno, "Failed to change the signal mask: %m");
4095
4096 r = sigaction(SIGCHLD, &sa, NULL);
4097 if (r < 0)
4098 return log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
4099
d7bea6b6
DP
4100 if (arg_network_namespace_path) {
4101 netns_fd = open(arg_network_namespace_path, O_RDONLY|O_NOCTTY|O_CLOEXEC);
4102 if (netns_fd < 0)
4103 return log_error_errno(errno, "Cannot open file %s: %m", arg_network_namespace_path);
4104
4105 r = fd_is_network_ns(netns_fd);
6619ad88
LP
4106 if (r == -EUCLEAN)
4107 log_debug_errno(r, "Cannot determine if passed network namespace path '%s' really refers to a network namespace, assuming it does.", arg_network_namespace_path);
4108 else if (r < 0)
d7bea6b6 4109 return log_error_errno(r, "Failed to check %s fs type: %m", arg_network_namespace_path);
6619ad88
LP
4110 else if (r == 0) {
4111 log_error("Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path);
d7bea6b6
DP
4112 return -EINVAL;
4113 }
4114 }
4115
b0067625
ZJS
4116 *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
4117 if (*pid < 0)
4118 return log_error_errno(errno, "clone() failed%s: %m",
4119 errno == EINVAL ?
4120 ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
4121
4122 if (*pid == 0) {
4123 /* The outer child only has a file system namespace. */
4124 barrier_set_role(&barrier, BARRIER_CHILD);
4125
4126 master = safe_close(master);
4127
4128 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
4129 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4130 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
4131 uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
4132 notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
4133 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
8199d554 4134 unified_cgroup_hierarchy_socket_pair[0] = safe_close(unified_cgroup_hierarchy_socket_pair[0]);
b0067625
ZJS
4135
4136 (void) reset_all_signal_handlers();
4137 (void) reset_signal_mask();
4138
4139 r = outer_child(&barrier,
4140 arg_directory,
4141 console,
2d845785 4142 dissected_image,
b0067625
ZJS
4143 secondary,
4144 pid_socket_pair[1],
4145 uuid_socket_pair[1],
4146 notify_socket_pair[1],
4147 kmsg_socket_pair[1],
4148 rtnl_socket_pair[1],
4149 uid_shift_socket_pair[1],
8199d554 4150 unified_cgroup_hierarchy_socket_pair[1],
d7bea6b6
DP
4151 fds,
4152 netns_fd);
b0067625
ZJS
4153 if (r < 0)
4154 _exit(EXIT_FAILURE);
4155
4156 _exit(EXIT_SUCCESS);
4157 }
4158
4159 barrier_set_role(&barrier, BARRIER_PARENT);
4160
4161 fds = fdset_free(fds);
4162
4163 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4164 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4165 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
4166 uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
4167 notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
4168 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
8199d554 4169 unified_cgroup_hierarchy_socket_pair[1] = safe_close(unified_cgroup_hierarchy_socket_pair[1]);
b0067625
ZJS
4170
4171 if (arg_userns_mode != USER_NAMESPACE_NO) {
4172 /* The child just let us know the UID shift it might have read from the image. */
4173 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
4174 if (l < 0)
4175 return log_error_errno(errno, "Failed to read UID shift: %m");
b0067625
ZJS
4176 if (l != sizeof arg_uid_shift) {
4177 log_error("Short read while reading UID shift.");
4178 return -EIO;
4179 }
4180
4181 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4182 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
4183 * image, but if that's already in use, pick a new one, and report back to the child,
4184 * which one we now picked. */
4185
4186 r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
4187 if (r < 0)
4188 return log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
4189
4190 l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
4191 if (l < 0)
4192 return log_error_errno(errno, "Failed to send UID shift: %m");
4193 if (l != sizeof arg_uid_shift) {
4194 log_error("Short write while writing UID shift.");
4195 return -EIO;
4196 }
4197 }
4198 }
4199
8199d554
LP
4200 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
4201 /* The child let us know the support cgroup mode it might have read from the image. */
4202 l = recv(unified_cgroup_hierarchy_socket_pair[0], &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), 0);
4203 if (l < 0)
4204 return log_error_errno(errno, "Failed to read cgroup mode: %m");
4205 if (l != sizeof(arg_unified_cgroup_hierarchy)) {
bd897e72
ZJS
4206 log_error("Short read while reading cgroup mode (%zu bytes).%s",
4207 l, l == 0 ? " The child is most likely dead." : "");
8199d554
LP
4208 return -EIO;
4209 }
4210 }
4211
b0067625 4212 /* Wait for the outer child. */
d2e0ac3d
LP
4213 r = wait_for_terminate_and_check("(sd-namespace)", *pid, WAIT_LOG_ABNORMAL);
4214 if (r < 0)
4215 return r;
4216 if (r != EXIT_SUCCESS)
4217 return -EIO;
b0067625
ZJS
4218
4219 /* And now retrieve the PID of the inner child. */
4220 l = recv(pid_socket_pair[0], pid, sizeof *pid, 0);
4221 if (l < 0)
4222 return log_error_errno(errno, "Failed to read inner child PID: %m");
4223 if (l != sizeof *pid) {
4224 log_error("Short read while reading inner child PID.");
4225 return -EIO;
4226 }
4227
4228 /* We also retrieve container UUID in case it was generated by outer child */
4229 l = recv(uuid_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
4230 if (l < 0)
4231 return log_error_errno(errno, "Failed to read container machine ID: %m");
4232 if (l != sizeof(arg_uuid)) {
4233 log_error("Short read while reading container machined ID.");
4234 return -EIO;
4235 }
4236
4237 /* We also retrieve the socket used for notifications generated by outer child */
4238 notify_socket = receive_one_fd(notify_socket_pair[0], 0);
4239 if (notify_socket < 0)
4240 return log_error_errno(notify_socket,
4241 "Failed to receive notification socket from the outer child: %m");
4242
4243 log_debug("Init process invoked as PID "PID_FMT, *pid);
4244
4245 if (arg_userns_mode != USER_NAMESPACE_NO) {
4246 if (!barrier_place_and_sync(&barrier)) { /* #1 */
4247 log_error("Child died too early.");
4248 return -ESRCH;
4249 }
4250
4251 r = setup_uid_map(*pid);
4252 if (r < 0)
4253 return r;
4254
4255 (void) barrier_place(&barrier); /* #2 */
4256 }
4257
4258 if (arg_private_network) {
75116558
PS
4259 if (!arg_network_namespace_path) {
4260 /* Wait until the child has unshared its network namespace. */
4261 if (!barrier_place_and_sync(&barrier)) { /* #3 */
4262 log_error("Child died too early");
4263 return -ESRCH;
4264 }
4265 }
4266
b0067625
ZJS
4267 r = move_network_interfaces(*pid, arg_network_interfaces);
4268 if (r < 0)
4269 return r;
4270
4271 if (arg_network_veth) {
4272 r = setup_veth(arg_machine, *pid, veth_name,
4273 arg_network_bridge || arg_network_zone);
4274 if (r < 0)
4275 return r;
4276 else if (r > 0)
4277 ifi = r;
4278
4279 if (arg_network_bridge) {
4280 /* Add the interface to a bridge */
4281 r = setup_bridge(veth_name, arg_network_bridge, false);
4282 if (r < 0)
4283 return r;
4284 if (r > 0)
4285 ifi = r;
4286 } else if (arg_network_zone) {
4287 /* Add the interface to a bridge, possibly creating it */
4288 r = setup_bridge(veth_name, arg_network_zone, true);
4289 if (r < 0)
4290 return r;
4291 if (r > 0)
4292 ifi = r;
4293 }
4294 }
4295
4296 r = setup_veth_extra(arg_machine, *pid, arg_network_veth_extra);
4297 if (r < 0)
4298 return r;
4299
4300 /* We created the primary and extra veth links now; let's remember this, so that we know to
4301 remove them later on. Note that we don't bother with removing veth links that were created
4302 here when their setup failed half-way, because in that case the kernel should be able to
4303 remove them on its own, since they cannot be referenced by anything yet. */
4304 *veth_created = true;
4305
4306 r = setup_macvlan(arg_machine, *pid, arg_network_macvlan);
4307 if (r < 0)
4308 return r;
4309
4310 r = setup_ipvlan(arg_machine, *pid, arg_network_ipvlan);
4311 if (r < 0)
4312 return r;
4313 }
4314
abdb9b08
LP
4315 if (arg_register || !arg_keep_unit) {
4316 r = sd_bus_default_system(&bus);
4317 if (r < 0)
4318 return log_error_errno(r, "Failed to open system bus: %m");
e5a2d8b5
LP
4319
4320 r = sd_bus_set_close_on_exit(bus, false);
4321 if (r < 0)
4322 return log_error_errno(r, "Failed to disable close-on-exit behaviour: %m");
abdb9b08
LP
4323 }
4324
4325 if (!arg_keep_unit) {
4326 /* When a new scope is created for this container, then we'll be registered as its controller, in which
4327 * case PID 1 will send us a friendly RequestStop signal, when it is asked to terminate the
4328 * scope. Let's hook into that, and cleanly shut down the container, and print a friendly message. */
4329
75152a4d
LP
4330 r = sd_bus_match_signal_async(
4331 bus,
4332 NULL,
4333 "org.freedesktop.systemd1",
4334 NULL,
4335 "org.freedesktop.systemd1.Scope",
4336 "RequestStop",
4337 on_request_stop, NULL, PID_TO_PTR(*pid));
abdb9b08 4338 if (r < 0)
75152a4d 4339 return log_error_errno(r, "Failed to request RequestStop match: %m");
abdb9b08
LP
4340 }
4341
b0067625
ZJS
4342 if (arg_register) {
4343 r = register_machine(
abdb9b08 4344 bus,
b0067625
ZJS
4345 arg_machine,
4346 *pid,
4347 arg_directory,
4348 arg_uuid,
4349 ifi,
4350 arg_slice,
4351 arg_custom_mounts, arg_n_custom_mounts,
4352 arg_kill_signal,
4353 arg_property,
de40a303 4354 arg_property_message,
b0067625
ZJS
4355 arg_keep_unit,
4356 arg_container_service_name);
4357 if (r < 0)
4358 return r;
abdb9b08 4359
cd2dfc6f
LP
4360 } else if (!arg_keep_unit) {
4361 r = allocate_scope(
abdb9b08 4362 bus,
cd2dfc6f
LP
4363 arg_machine,
4364 *pid,
4365 arg_slice,
4366 arg_custom_mounts, arg_n_custom_mounts,
4367 arg_kill_signal,
de40a303
LP
4368 arg_property,
4369 arg_property_message);
cd2dfc6f
LP
4370 if (r < 0)
4371 return r;
4372
4373 } else if (arg_slice || arg_property)
4374 log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect.");
b0067625 4375
27da7ef0 4376 r = create_subcgroup(*pid, arg_keep_unit, arg_unified_cgroup_hierarchy);
b0067625
ZJS
4377 if (r < 0)
4378 return r;
4379
27da7ef0 4380 r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
720f0a2f
LP
4381 if (r < 0)
4382 return r;
b0067625 4383
de54e02d 4384 r = chown_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
b0067625
ZJS
4385 if (r < 0)
4386 return r;
4387
4388 /* Notify the child that the parent is ready with all
4389 * its setup (including cgroup-ification), and that
4390 * the child can now hand over control to the code to
4391 * run inside the container. */
75116558 4392 (void) barrier_place(&barrier); /* #4 */
b0067625
ZJS
4393
4394 /* Block SIGCHLD here, before notifying child.
4395 * process_pty() will handle it with the other signals. */
4396 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
4397
4398 /* Reset signal to default */
4399 r = default_signals(SIGCHLD, -1);
4400 if (r < 0)
4401 return log_error_errno(r, "Failed to reset SIGCHLD: %m");
4402
4403 r = sd_event_new(&event);
4404 if (r < 0)
4405 return log_error_errno(r, "Failed to get default event source: %m");
4406
8fd010bb
LP
4407 (void) sd_event_set_watchdog(event, true);
4408
abdb9b08
LP
4409 if (bus) {
4410 r = sd_bus_attach_event(bus, event, 0);
4411 if (r < 0)
4412 return log_error_errno(r, "Failed to attach bus to event loop: %m");
4413 }
4414
5773024d 4415 r = setup_sd_notify_parent(event, notify_socket, PID_TO_PTR(*pid), &notify_event_source);
b0067625
ZJS
4416 if (r < 0)
4417 return r;
4418
4419 /* Let the child know that we are ready and wait that the child is completely ready now. */
75116558 4420 if (!barrier_place_and_sync(&barrier)) { /* #5 */
b0067625
ZJS
4421 log_error("Child died too early.");
4422 return -ESRCH;
4423 }
4424
4425 /* At this point we have made use of the UID we picked, and thus nss-mymachines
4426 * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
4427 etc_passwd_lock = safe_close(etc_passwd_lock);
4428
4429 sd_notifyf(false,
4430 "STATUS=Container running.\n"
4431 "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
4432 if (!arg_notify_ready)
919f5ae0 4433 (void) sd_notify(false, "READY=1\n");
b0067625
ZJS
4434
4435 if (arg_kill_signal > 0) {
4436 /* Try to kill the init system on SIGINT or SIGTERM */
919f5ae0
LP
4437 (void) sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
4438 (void) sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
b0067625
ZJS
4439 } else {
4440 /* Immediately exit */
919f5ae0
LP
4441 (void) sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4442 (void) sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
b0067625
ZJS
4443 }
4444
6916b164 4445 /* Exit when the child exits */
919f5ae0 4446 (void) sd_event_add_signal(event, NULL, SIGCHLD, on_sigchld, PID_TO_PTR(*pid));
b0067625
ZJS
4447
4448 if (arg_expose_ports) {
4449 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, exposed, &rtnl);
4450 if (r < 0)
4451 return r;
4452
4453 (void) expose_port_execute(rtnl, arg_expose_ports, exposed);
4454 }
4455
4456 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4457
de40a303
LP
4458 if (IN_SET(arg_console_mode, CONSOLE_INTERACTIVE, CONSOLE_READ_ONLY)) {
4459 assert(master >= 0);
4460
4461 r = pty_forward_new(event, master,
4462 PTY_FORWARD_IGNORE_VHANGUP | (arg_console_mode == CONSOLE_READ_ONLY ? PTY_FORWARD_READ_ONLY : 0),
4463 &forward);
4464 if (r < 0)
4465 return log_error_errno(r, "Failed to create PTY forwarder: %m");
4466
4467 if (arg_console_width != (unsigned) -1 || arg_console_height != (unsigned) -1)
4468 (void) pty_forward_set_width_height(forward, arg_console_width, arg_console_height);
4469 }
b0067625
ZJS
4470
4471 r = sd_event_loop(event);
4472 if (r < 0)
4473 return log_error_errno(r, "Failed to run event loop: %m");
4474
de40a303
LP
4475 if (forward) {
4476 char last_char = 0;
b0067625 4477
de40a303
LP
4478 (void) pty_forward_get_last_char(forward, &last_char);
4479 forward = pty_forward_free(forward);
b0067625 4480
de40a303
LP
4481 if (!arg_quiet && last_char != '\n')
4482 putc('\n', stdout);
4483 }
b0067625
ZJS
4484
4485 /* Kill if it is not dead yet anyway */
1d78fea2
LP
4486 if (bus) {
4487 if (arg_register)
4488 terminate_machine(bus, arg_machine);
4489 else if (!arg_keep_unit)
4490 terminate_scope(bus, arg_machine);
4491 }
b0067625
ZJS
4492
4493 /* Normally redundant, but better safe than sorry */
c67b0082 4494 (void) kill(*pid, SIGKILL);
b0067625
ZJS
4495
4496 r = wait_for_container(*pid, &container_status);
4497 *pid = 0;
4498
4499 if (r < 0)
4500 /* We failed to wait for the container, or the container exited abnormally. */
4501 return r;
4502 if (r > 0 || container_status == CONTAINER_TERMINATED) {
27e29a1e
ZJS
4503 /* r > 0 → The container exited with a non-zero status.
4504 * As a special case, we need to replace 133 with a different value,
4505 * because 133 is special-cased in the service file to reboot the container.
4506 * otherwise → The container exited with zero status and a reboot was not requested.
4507 */
2a49b612 4508 if (r == EXIT_FORCE_RESTART)
27e29a1e 4509 r = EXIT_FAILURE; /* replace 133 with the general failure code */
b0067625 4510 *ret = r;
b0067625
ZJS
4511 return 0; /* finito */
4512 }
4513
4514 /* CONTAINER_REBOOTED, loop again */
4515
4516 if (arg_keep_unit) {
4517 /* Special handling if we are running as a service: instead of simply
4518 * restarting the machine we want to restart the entire service, so let's
4519 * inform systemd about this with the special exit code 133. The service
4520 * file uses RestartForceExitStatus=133 so that this results in a full
4521 * nspawn restart. This is necessary since we might have cgroup parameters
4522 * set we want to have flushed out. */
2a49b612
ZJS
4523 *ret = EXIT_FORCE_RESTART;
4524 return 0; /* finito */
b0067625
ZJS
4525 }
4526
4527 expose_port_flush(arg_expose_ports, exposed);
4528
4529 (void) remove_veth_links(veth_name, arg_network_veth_extra);
4530 *veth_created = false;
4531 return 1; /* loop again */
4532}
4533
bf428efb 4534static int initialize_rlimits(void) {
bf428efb
LP
4535 /* The default resource limits the kernel passes to PID 1, as per kernel 4.16. Let's pass our container payload
4536 * the same values as the kernel originally passed to PID 1, in order to minimize differences between host and
4537 * container execution environments. */
4538
4539 static const struct rlimit kernel_defaults[_RLIMIT_MAX] = {
4540 [RLIMIT_AS] = { RLIM_INFINITY, RLIM_INFINITY },
4541 [RLIMIT_CORE] = { 0, RLIM_INFINITY },
4542 [RLIMIT_CPU] = { RLIM_INFINITY, RLIM_INFINITY },
4543 [RLIMIT_DATA] = { RLIM_INFINITY, RLIM_INFINITY },
4544 [RLIMIT_FSIZE] = { RLIM_INFINITY, RLIM_INFINITY },
4545 [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY },
4546 [RLIMIT_MEMLOCK] = { 65536, 65536 },
4547 [RLIMIT_MSGQUEUE] = { 819200, 819200 },
4548 [RLIMIT_NICE] = { 0, 0 },
4549 [RLIMIT_NOFILE] = { 1024, 4096 },
4550 [RLIMIT_RSS] = { RLIM_INFINITY, RLIM_INFINITY },
4551 [RLIMIT_RTPRIO] = { 0, 0 },
4552 [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY },
4553 [RLIMIT_STACK] = { 8388608, RLIM_INFINITY },
4554
4555 /* The kernel scales the default for RLIMIT_NPROC and RLIMIT_SIGPENDING based on the system's amount of
4556 * RAM. To provide best compatibility we'll read these limits off PID 1 instead of hardcoding them
4557 * here. This is safe as we know that PID 1 doesn't change these two limits and thus the original
4558 * kernel's initialization should still be valid during runtime — at least if PID 1 is systemd. Note
4559 * that PID 1 changes a number of other resource limits during early initialization which is why we
4560 * don't read the other limits from PID 1 but prefer the static table above. */
4561 };
4562
4563 int rl;
4564
4565 for (rl = 0; rl < _RLIMIT_MAX; rl++) {
bf428efb
LP
4566 /* Let's only fill in what the user hasn't explicitly configured anyway */
4567 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)) == 0) {
4568 const struct rlimit *v;
4569 struct rlimit buffer;
4570
4571 if (IN_SET(rl, RLIMIT_NPROC, RLIMIT_SIGPENDING)) {
4572 /* For these two let's read the limits off PID 1. See above for an explanation. */
4573
4574 if (prlimit(1, rl, NULL, &buffer) < 0)
4575 return log_error_errno(errno, "Failed to read resource limit RLIMIT_%s of PID 1: %m", rlimit_to_string(rl));
4576
4577 v = &buffer;
4578 } else
4579 v = kernel_defaults + rl;
4580
4581 arg_rlimit[rl] = newdup(struct rlimit, v, 1);
4582 if (!arg_rlimit[rl])
4583 return log_oom();
4584 }
4585
4586 if (DEBUG_LOGGING) {
4587 _cleanup_free_ char *k = NULL;
4588
4589 (void) rlimit_format(arg_rlimit[rl], &k);
4590 log_debug("Setting RLIMIT_%s to %s.", rlimit_to_string(rl), k);
4591 }
4592 }
4593
4594 return 0;
4595}
4596
03cfe0d5 4597int main(int argc, char *argv[]) {
2d845785
LP
4598 _cleanup_free_ char *console = NULL;
4599 _cleanup_close_ int master = -1;
03cfe0d5 4600 _cleanup_fdset_free_ FDSet *fds = NULL;
2d845785 4601 int r, n_fd_passed, ret = EXIT_SUCCESS;
5aa3eba5 4602 char veth_name[IFNAMSIZ] = "";
17cbb288 4603 bool secondary = false, remove_directory = false, remove_image = false;
03cfe0d5 4604 pid_t pid = 0;
03cfe0d5 4605 union in_addr_union exposed = {};
8e766630 4606 _cleanup_(release_lock_file) LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
de40a303 4607 bool veth_created = false, remove_tmprootdir = false;
c67b0082 4608 char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
2d845785 4609 _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
18b5886e
LP
4610 _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
4611 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
03cfe0d5
LP
4612
4613 log_parse_environment();
4614 log_open();
415fc41c 4615
7732f92b
LP
4616 /* Make sure rename_process() in the stub init process can work */
4617 saved_argv = argv;
4618 saved_argc = argc;
4619
03cfe0d5
LP
4620 r = parse_argv(argc, argv);
4621 if (r <= 0)
4622 goto finish;
4623
fba868fa
LP
4624 r = must_be_root();
4625 if (r < 0)
03cfe0d5 4626 goto finish;
fba868fa 4627
bf428efb
LP
4628 r = initialize_rlimits();
4629 if (r < 0)
4630 goto finish;
4631
de40a303
LP
4632 r = load_oci_bundle();
4633 if (r < 0)
4634 goto finish;
4635
f757855e
LP
4636 r = determine_names();
4637 if (r < 0)
4638 goto finish;
4639
4640 r = load_settings();
4641 if (r < 0)
4642 goto finish;
4643
5eee8290
LP
4644 r = cg_unified_flush();
4645 if (r < 0) {
4646 log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
4647 goto finish;
4648 }
4649
f757855e
LP
4650 r = verify_arguments();
4651 if (r < 0)
4652 goto finish;
03cfe0d5 4653
8199d554
LP
4654 r = detect_unified_cgroup_hierarchy_from_environment();
4655 if (r < 0)
4656 goto finish;
4657
2949ff26
LP
4658 /* Ignore SIGPIPE here, because we use splice() on the ptyfwd stuff and that will generate SIGPIPE if
4659 * the result is closed. Note that the container payload child will reset signal mask+handler anyway,
4660 * so just turning this off here means we only turn it off in nspawn itself, not any children. */
4661 (void) ignore_signals(SIGPIPE, -1);
4662
03cfe0d5
LP
4663 n_fd_passed = sd_listen_fds(false);
4664 if (n_fd_passed > 0) {
4665 r = fdset_new_listen_fds(&fds, false);
4666 if (r < 0) {
4667 log_error_errno(r, "Failed to collect file descriptors: %m");
4668 goto finish;
4669 }
4670 }
4671
83e803a9
ZJS
4672 /* The "default" umask. This is appropriate for most file and directory
4673 * operations performed by nspawn, and is the umask that will be used for
4674 * the child. Functions like copy_devnodes() change the umask temporarily. */
4675 umask(0022);
4676
03cfe0d5
LP
4677 if (arg_directory) {
4678 assert(!arg_image);
4679
4680 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
4681 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
4682 r = -EINVAL;
4683 goto finish;
4684 }
4685
4686 if (arg_ephemeral) {
4687 _cleanup_free_ char *np = NULL;
4688
8d4aa2bb 4689 r = chase_symlinks_and_update(&arg_directory, 0);
3f342ec4
LP
4690 if (r < 0)
4691 goto finish;
4692
03cfe0d5
LP
4693 /* If the specified path is a mount point we
4694 * generate the new snapshot immediately
4695 * inside it under a random name. However if
4696 * the specified is not a mount point we
4697 * create the new snapshot in the parent
4698 * directory, just next to it. */
e1873695 4699 r = path_is_mount_point(arg_directory, NULL, 0);
03cfe0d5
LP
4700 if (r < 0) {
4701 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
4702 goto finish;
4703 }
4704 if (r > 0)
770b5ce4 4705 r = tempfn_random_child(arg_directory, "machine.", &np);
03cfe0d5 4706 else
770b5ce4 4707 r = tempfn_random(arg_directory, "machine.", &np);
03cfe0d5 4708 if (r < 0) {
0f3be6ca 4709 log_error_errno(r, "Failed to generate name for directory snapshot: %m");
03cfe0d5
LP
4710 goto finish;
4711 }
4712
4713 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4714 if (r < 0) {
4715 log_error_errno(r, "Failed to lock %s: %m", np);
4716 goto finish;
4717 }
4718
17cbb288
LP
4719 r = btrfs_subvol_snapshot(arg_directory, np,
4720 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
4721 BTRFS_SNAPSHOT_FALLBACK_COPY |
4722 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
4723 BTRFS_SNAPSHOT_RECURSIVE |
4724 BTRFS_SNAPSHOT_QUOTA);
03cfe0d5
LP
4725 if (r < 0) {
4726 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
4727 goto finish;
ec16945e
LP
4728 }
4729
1cc6c93a 4730 free_and_replace(arg_directory, np);
ec16945e 4731
17cbb288 4732 remove_directory = true;
30535c16
LP
4733
4734 } else {
cb638b5e 4735 r = chase_symlinks_and_update(&arg_directory, arg_template ? CHASE_NONEXISTENT : 0);
8d4aa2bb
LP
4736 if (r < 0)
4737 goto finish;
4738
30535c16
LP
4739 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4740 if (r == -EBUSY) {
4741 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
4742 goto finish;
4743 }
4744 if (r < 0) {
4745 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
476b8254 4746 goto finish;
30535c16
LP
4747 }
4748
4749 if (arg_template) {
8d4aa2bb 4750 r = chase_symlinks_and_update(&arg_template, 0);
3f342ec4
LP
4751 if (r < 0)
4752 goto finish;
4753
17cbb288
LP
4754 r = btrfs_subvol_snapshot(arg_template, arg_directory,
4755 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
4756 BTRFS_SNAPSHOT_FALLBACK_COPY |
4757 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
4758 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
4759 BTRFS_SNAPSHOT_RECURSIVE |
4760 BTRFS_SNAPSHOT_QUOTA);
ff6c6cc1
LP
4761 if (r == -EEXIST)
4762 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
4763 "Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
4764 else if (r < 0) {
83521414 4765 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
30535c16 4766 goto finish;
ff6c6cc1
LP
4767 } else
4768 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
4769 "Populated %s from template %s.", arg_directory, arg_template);
30535c16 4770 }
ec16945e
LP
4771 }
4772
7732f92b 4773 if (arg_start_mode == START_BOOT) {
a5201ed6 4774 const char *p;
c9fe05e0 4775
a5201ed6
LP
4776 if (arg_pivot_root_new)
4777 p = prefix_roota(arg_directory, arg_pivot_root_new);
4778 else
4779 p = arg_directory;
c9fe05e0
AR
4780
4781 if (path_is_os_tree(p) <= 0) {
4782 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", p);
ec16945e 4783 r = -EINVAL;
1b9e5b12
LP
4784 goto finish;
4785 }
4786 } else {
c9fe05e0
AR
4787 const char *p, *q;
4788
a5201ed6
LP
4789 if (arg_pivot_root_new)
4790 p = prefix_roota(arg_directory, arg_pivot_root_new);
4791 else
4792 p = arg_directory;
c9fe05e0
AR
4793
4794 q = strjoina(p, "/usr/");
1b9e5b12 4795
c9fe05e0
AR
4796 if (laccess(q, F_OK) < 0) {
4797 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", p);
ec16945e 4798 r = -EINVAL;
1b9e5b12 4799 goto finish;
1b9e5b12
LP
4800 }
4801 }
ec16945e 4802
6b9132a9 4803 } else {
ec16945e
LP
4804 assert(arg_image);
4805 assert(!arg_template);
4806
8d4aa2bb 4807 r = chase_symlinks_and_update(&arg_image, 0);
3f342ec4
LP
4808 if (r < 0)
4809 goto finish;
4810
0f3be6ca
LP
4811 if (arg_ephemeral) {
4812 _cleanup_free_ char *np = NULL;
4813
4814 r = tempfn_random(arg_image, "machine.", &np);
4815 if (r < 0) {
4816 log_error_errno(r, "Failed to generate name for image snapshot: %m");
4817 goto finish;
4818 }
4819
4820 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4821 if (r < 0) {
4822 r = log_error_errno(r, "Failed to create image lock: %m");
4823 goto finish;
4824 }
4825
adc6f43b 4826 r = copy_file(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600, FS_NOCOW_FL, COPY_REFLINK|COPY_CRTIME);
0f3be6ca
LP
4827 if (r < 0) {
4828 r = log_error_errno(r, "Failed to copy image file: %m");
4829 goto finish;
4830 }
4831
1cc6c93a 4832 free_and_replace(arg_image, np);
0f3be6ca
LP
4833
4834 remove_image = true;
4835 } else {
4836 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4837 if (r == -EBUSY) {
4838 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
4839 goto finish;
4840 }
4841 if (r < 0) {
4842 r = log_error_errno(r, "Failed to create image lock: %m");
4843 goto finish;
4844 }
4623e8e6 4845
78ebe980
LP
4846 if (!arg_root_hash) {
4847 r = root_hash_load(arg_image, &arg_root_hash, &arg_root_hash_size);
4848 if (r < 0) {
4849 log_error_errno(r, "Failed to load root hash file for %s: %m", arg_image);
4850 goto finish;
4851 }
4852 }
30535c16
LP
4853 }
4854
c67b0082 4855 if (!mkdtemp(tmprootdir)) {
0f3be6ca 4856 r = log_error_errno(errno, "Failed to create temporary directory: %m");
6b9132a9 4857 goto finish;
1b9e5b12 4858 }
6b9132a9 4859
c67b0082
LP
4860 remove_tmprootdir = true;
4861
4862 arg_directory = strdup(tmprootdir);
1b9e5b12
LP
4863 if (!arg_directory) {
4864 r = log_oom();
4865 goto finish;
6b9132a9 4866 }
88213476 4867
2d845785
LP
4868 r = loop_device_make_by_path(arg_image, arg_read_only ? O_RDONLY : O_RDWR, &loop);
4869 if (r < 0) {
4870 log_error_errno(r, "Failed to set up loopback block device: %m");
842f3b0f
LP
4871 goto finish;
4872 }
1b9e5b12 4873
4526113f 4874 r = dissect_image_and_warn(
e0f9e7bd 4875 loop->fd,
4526113f 4876 arg_image,
e0f9e7bd
LP
4877 arg_root_hash, arg_root_hash_size,
4878 DISSECT_IMAGE_REQUIRE_ROOT,
4879 &dissected_image);
2d845785 4880 if (r == -ENOPKG) {
4526113f 4881 /* dissected_image_and_warn() already printed a brief error message. Extend on that with more details */
2d845785
LP
4882 log_notice("Note that the disk image needs to\n"
4883 " a) either contain only a single MBR partition of type 0x83 that is marked bootable\n"
4884 " b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n"
4885 " c) or follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n"
4886 " d) or contain a file system without a partition table\n"
4887 "in order to be bootable with systemd-nspawn.");
1b9e5b12 4888 goto finish;
2d845785 4889 }
4526113f 4890 if (r < 0)
842f3b0f 4891 goto finish;
1b9e5b12 4892
4623e8e6
LP
4893 if (!arg_root_hash && dissected_image->can_verity)
4894 log_notice("Note: image %s contains verity information, but no root hash specified! Proceeding without integrity checking.", arg_image);
4895
4896 r = dissected_image_decrypt_interactively(dissected_image, NULL, arg_root_hash, arg_root_hash_size, 0, &decrypted_image);
1b9e5b12
LP
4897 if (r < 0)
4898 goto finish;
0f3be6ca
LP
4899
4900 /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */
4901 if (remove_image && unlink(arg_image) >= 0)
4902 remove_image = false;
842f3b0f 4903 }
842f3b0f 4904
86c0dd4a 4905 r = custom_mount_prepare_all(arg_directory, arg_custom_mounts, arg_n_custom_mounts);
5a8af538
LP
4906 if (r < 0)
4907 goto finish;
4908
de40a303
LP
4909 if (arg_console_mode < 0)
4910 arg_console_mode =
4911 isatty(STDIN_FILENO) > 0 &&
4912 isatty(STDOUT_FILENO) > 0 ? CONSOLE_INTERACTIVE : CONSOLE_READ_ONLY;
9c857b9d 4913
de40a303
LP
4914 if (arg_console_mode == CONSOLE_PIPE) /* if we pass STDERR on to the container, don't add our own logs into it too */
4915 arg_quiet = true;
a258bf26 4916
de40a303
LP
4917 if (arg_console_mode != CONSOLE_PIPE) {
4918 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NONBLOCK);
4919 if (master < 0) {
4920 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
4921 goto finish;
4922 }
68b02049 4923
de40a303
LP
4924 r = ptsname_malloc(master, &console);
4925 if (r < 0) {
4926 r = log_error_errno(r, "Failed to determine tty name: %m");
68b02049 4927 goto finish;
de40a303 4928 }
a258bf26 4929
de40a303
LP
4930 if (arg_selinux_apifs_context) {
4931 r = mac_selinux_apply(console, arg_selinux_apifs_context);
4932 if (r < 0)
4933 goto finish;
4934 }
4935
4936 if (unlockpt(master) < 0) {
4937 r = log_error_errno(errno, "Failed to unlock tty: %m");
4938 goto finish;
4939 }
a258bf26
LP
4940 }
4941
9c857b9d
LP
4942 if (!arg_quiet)
4943 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
4944 arg_machine, arg_image ?: arg_directory);
4945
72c0a2c2 4946 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
a258bf26 4947
66edd963 4948 if (prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) < 0) {
03cfe0d5
LP
4949 r = log_error_errno(errno, "Failed to become subreaper: %m");
4950 goto finish;
4951 }
4952
d87be9b0 4953 for (;;) {
b0067625
ZJS
4954 r = run(master,
4955 console,
2d845785 4956 dissected_image,
de40a303 4957 secondary,
b0067625
ZJS
4958 fds,
4959 veth_name, &veth_created,
4960 &exposed,
4961 &pid, &ret);
4962 if (r <= 0)
d87be9b0 4963 break;
d87be9b0 4964 }
88213476
LP
4965
4966finish:
af4ec430 4967 sd_notify(false,
2a49b612
ZJS
4968 r == 0 && ret == EXIT_FORCE_RESTART ? "STOPPING=1\nSTATUS=Restarting..." :
4969 "STOPPING=1\nSTATUS=Terminating...");
af4ec430 4970
9444b1f2 4971 if (pid > 0)
c67b0082 4972 (void) kill(pid, SIGKILL);
88213476 4973
503546da 4974 /* Try to flush whatever is still queued in the pty */
6a0f896b 4975 if (master >= 0) {
1c876927 4976 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, 0);
6a0f896b
LP
4977 master = safe_close(master);
4978 }
4979
4980 if (pid > 0)
4981 (void) wait_for_terminate(pid, NULL);
503546da 4982
50ebcf6c
LP
4983 pager_close();
4984
17cbb288 4985 if (remove_directory && arg_directory) {
ec16945e
LP
4986 int k;
4987
17cbb288 4988 k = rm_rf(arg_directory, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
ec16945e 4989 if (k < 0)
17cbb288 4990 log_warning_errno(k, "Cannot remove '%s', ignoring: %m", arg_directory);
ec16945e
LP
4991 }
4992
0f3be6ca
LP
4993 if (remove_image && arg_image) {
4994 if (unlink(arg_image) < 0)
4995 log_warning_errno(errno, "Can't remove image file '%s', ignoring: %m", arg_image);
4996 }
4997
c67b0082
LP
4998 if (remove_tmprootdir) {
4999 if (rmdir(tmprootdir) < 0)
5000 log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir);
5001 }
5002
785890ac
LP
5003 if (arg_machine) {
5004 const char *p;
5005
63c372cb 5006 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
c6878637 5007 (void) rm_rf(p, REMOVE_ROOT);
785890ac
LP
5008 }
5009
7a8f6325 5010 expose_port_flush(arg_expose_ports, &exposed);
7513c5b8
LP
5011
5012 if (veth_created)
5013 (void) remove_veth_links(veth_name, arg_network_veth_extra);
22b28dfd 5014 (void) remove_bridge(arg_network_zone);
f757855e 5015
04d391da 5016 free(arg_directory);
ec16945e
LP
5017 free(arg_template);
5018 free(arg_image);
7027ff61 5019 free(arg_machine);
3a9530e5 5020 free(arg_hostname);
c74e630d 5021 free(arg_user);
de40a303 5022 free(arg_supplementary_gids);
b53ede69
PW
5023 free(arg_pivot_root_new);
5024 free(arg_pivot_root_old);
5f932eb9 5025 free(arg_chdir);
c74e630d 5026 strv_free(arg_setenv);
f757855e 5027 free(arg_network_bridge);
c74e630d
LP
5028 strv_free(arg_network_interfaces);
5029 strv_free(arg_network_macvlan);
4bbfe7ad 5030 strv_free(arg_network_ipvlan);
f6d6bad1 5031 strv_free(arg_network_veth_extra);
f757855e 5032 strv_free(arg_parameters);
df1fac6d
LP
5033 free(arg_network_zone);
5034 free(arg_network_namespace_path);
5035 strv_free(arg_property);
de40a303 5036 sd_bus_message_unref(arg_property_message);
f757855e
LP
5037 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
5038 expose_port_free_all(arg_expose_ports);
4623e8e6 5039 free(arg_root_hash);
bf428efb 5040 rlimit_free_all(arg_rlimit);
df1fac6d
LP
5041 strv_free(arg_syscall_whitelist);
5042 strv_free(arg_syscall_blacklist);
de40a303
LP
5043#if HAVE_SECCOMP
5044 seccomp_release(arg_seccomp);
5045#endif
d107bb7d 5046 arg_cpuset = cpu_set_mfree(arg_cpuset);
de40a303
LP
5047 free(arg_oci_bundle);
5048 device_node_free_many(arg_extra_nodes, arg_n_extra_nodes);
5049 strv_free(arg_sysctl);
5050 free(arg_slice);
6d0b55c2 5051
ec16945e 5052 return r < 0 ? EXIT_FAILURE : ret;
88213476 5053}