]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
Merge pull request #11827 from keszybz/pkgconfig-variables
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
53e1b683 1/* SPDX-License-Identifier: LGPL-2.1+ */
88213476 2
349cc4a5 3#if HAVE_BLKID
6b5cf3ea 4#include <blkid.h>
8fe0087e 5#endif
88213476 6#include <errno.h>
88213476 7#include <getopt.h>
0e7ac751 8#include <grp.h>
503f480f 9#include <linux/fs.h>
1b9e5b12 10#include <linux/loop.h>
0e7ac751 11#include <pwd.h>
8fe0087e 12#include <sched.h>
349cc4a5 13#if HAVE_SELINUX
8fe0087e 14#include <selinux/selinux.h>
1b9e5b12 15#endif
8fe0087e
LP
16#include <signal.h>
17#include <stdio.h>
18#include <stdlib.h>
19#include <string.h>
20#include <sys/file.h>
8fe0087e
LP
21#include <sys/personality.h>
22#include <sys/prctl.h>
23#include <sys/types.h>
6916b164 24#include <sys/wait.h>
8fe0087e 25#include <unistd.h>
1b9e5b12 26
b053cd5f 27#include "sd-bus.h"
1f0cd86b 28#include "sd-daemon.h"
1f0cd86b 29#include "sd-id128.h"
8fe0087e 30
b5efdb8a 31#include "alloc-util.h"
8fe0087e
LP
32#include "barrier.h"
33#include "base-filesystem.h"
34#include "blkid-util.h"
35#include "btrfs-util.h"
b8ea7a6e 36#include "bus-error.h"
b053cd5f 37#include "bus-util.h"
8fe0087e 38#include "cap-list.h"
430f0182 39#include "capability-util.h"
04d391da 40#include "cgroup-util.h"
8fe0087e 41#include "copy.h"
d107bb7d 42#include "cpu-set-util.h"
4fc9982c 43#include "dev-setup.h"
2d845785 44#include "dissect-image.h"
8fe0087e 45#include "env-util.h"
3ffd4af2 46#include "fd-util.h"
842f3b0f 47#include "fdset.h"
a5c32cff 48#include "fileio.h"
f97b34a6 49#include "format-util.h"
f4f15635 50#include "fs-util.h"
1b9e5b12 51#include "gpt.h"
4623e8e6 52#include "hexdecoct.h"
8fe0087e 53#include "hostname-util.h"
910fd145 54#include "id128-util.h"
8fe0087e 55#include "log.h"
2d845785 56#include "loop-util.h"
8fe0087e 57#include "loopback-setup.h"
1b9cebf6 58#include "machine-image.h"
8fe0087e
LP
59#include "macro.h"
60#include "missing.h"
61#include "mkdir.h"
4349cd7c 62#include "mount-util.h"
049af8ad 63#include "mountpoint-util.h"
8fe0087e 64#include "netlink-util.h"
07630cea 65#include "nspawn-cgroup.h"
3603efde 66#include "nspawn-def.h"
07630cea
LP
67#include "nspawn-expose-ports.h"
68#include "nspawn-mount.h"
69#include "nspawn-network.h"
7336138e 70#include "nspawn-patch-uid.h"
07630cea 71#include "nspawn-register.h"
910fd145 72#include "nspawn-seccomp.h"
07630cea
LP
73#include "nspawn-settings.h"
74#include "nspawn-setuid.h"
7732f92b 75#include "nspawn-stub-pid1.h"
d58ad743 76#include "os-util.h"
50ebcf6c 77#include "pager.h"
6bedfcbb 78#include "parse-util.h"
8fe0087e 79#include "path-util.h"
294bf0c3 80#include "pretty-print.h"
0b452006 81#include "process-util.h"
8fe0087e
LP
82#include "ptyfwd.h"
83#include "random-util.h"
8869a0b4 84#include "raw-clone.h"
bf428efb 85#include "rlimit-util.h"
8fe0087e 86#include "rm-rf.h"
68b02049 87#include "selinux-util.h"
8fe0087e 88#include "signal-util.h"
2583fbea 89#include "socket-util.h"
8fcde012 90#include "stat-util.h"
15a5e950 91#include "stdio-util.h"
5c828e66 92#include "string-table.h"
07630cea 93#include "string-util.h"
8fe0087e
LP
94#include "strv.h"
95#include "terminal-util.h"
e4de7287 96#include "tmpfile-util.h"
affb60b1 97#include "umask-util.h"
b1d4f8e1 98#include "user-util.h"
8fe0087e 99#include "util.h"
e9642be2 100
62b1e758
YW
101#if HAVE_SPLIT_USR
102#define STATIC_RESOLV_CONF "/lib/systemd/resolv.conf"
103#else
104#define STATIC_RESOLV_CONF "/usr/lib/systemd/resolv.conf"
105#endif
106
9c1e04d0
AP
/* nspawn listens on the socket whose path is given by the constant NSPAWN_NOTIFY_SOCKET_PATH.
 * NSPAWN_NOTIFY_SOCKET_PATH is relative to the container.
 * The init process in the container can send messages to nspawn following the sd_notify(3) protocol. */
110#define NSPAWN_NOTIFY_SOCKET_PATH "/run/systemd/nspawn/notify"
0e7ac751 111
2a49b612
ZJS
112#define EXIT_FORCE_RESTART 133
113
113cea80
DH
/* Status values for a container run; judging by the names, they distinguish a
 * container that terminated from one that rebooted (confirm against the users of
 * this type, which are outside this view). */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,
        CONTAINER_REBOOTED,
} ContainerStatus;
118
88213476 119static char *arg_directory = NULL;
ec16945e 120static char *arg_template = NULL;
5f932eb9 121static char *arg_chdir = NULL;
b53ede69
PW
122static char *arg_pivot_root_new = NULL;
123static char *arg_pivot_root_old = NULL;
687d0825 124static char *arg_user = NULL;
9444b1f2 125static sd_id128_t arg_uuid = {};
3a9530e5
LP
126static char *arg_machine = NULL; /* The name used by the host to refer to this */
127static char *arg_hostname = NULL; /* The name the payload sees by default */
c74e630d
LP
128static const char *arg_selinux_context = NULL;
129static const char *arg_selinux_apifs_context = NULL;
9444b1f2 130static const char *arg_slice = NULL;
ff01d048 131static bool arg_private_network = false;
bc2f673e 132static bool arg_read_only = false;
7732f92b 133static StartMode arg_start_mode = START_PID1;
ec16945e 134static bool arg_ephemeral = false;
57fb9fb5 135static LinkJournal arg_link_journal = LINK_AUTO;
574edc90 136static bool arg_link_journal_try = false;
520e0d54 137static uint64_t arg_caps_retain =
50b52222
LP
138 (1ULL << CAP_AUDIT_CONTROL) |
139 (1ULL << CAP_AUDIT_WRITE) |
5076f0cc
LP
140 (1ULL << CAP_CHOWN) |
141 (1ULL << CAP_DAC_OVERRIDE) |
142 (1ULL << CAP_DAC_READ_SEARCH) |
143 (1ULL << CAP_FOWNER) |
144 (1ULL << CAP_FSETID) |
145 (1ULL << CAP_IPC_OWNER) |
146 (1ULL << CAP_KILL) |
147 (1ULL << CAP_LEASE) |
148 (1ULL << CAP_LINUX_IMMUTABLE) |
50b52222 149 (1ULL << CAP_MKNOD) |
5076f0cc
LP
150 (1ULL << CAP_NET_BIND_SERVICE) |
151 (1ULL << CAP_NET_BROADCAST) |
152 (1ULL << CAP_NET_RAW) |
5076f0cc 153 (1ULL << CAP_SETFCAP) |
50b52222 154 (1ULL << CAP_SETGID) |
5076f0cc
LP
155 (1ULL << CAP_SETPCAP) |
156 (1ULL << CAP_SETUID) |
157 (1ULL << CAP_SYS_ADMIN) |
50b52222 158 (1ULL << CAP_SYS_BOOT) |
5076f0cc
LP
159 (1ULL << CAP_SYS_CHROOT) |
160 (1ULL << CAP_SYS_NICE) |
161 (1ULL << CAP_SYS_PTRACE) |
d87be9b0 162 (1ULL << CAP_SYS_RESOURCE) |
50b52222 163 (1ULL << CAP_SYS_TTY_CONFIG);
5a8af538 164static CustomMount *arg_custom_mounts = NULL;
88614c8a 165static size_t arg_n_custom_mounts = 0;
f4889f65 166static char **arg_setenv = NULL;
284c0b91 167static bool arg_quiet = false;
eb91eb18 168static bool arg_register = true;
89f7c846 169static bool arg_keep_unit = false;
aa28aefe 170static char **arg_network_interfaces = NULL;
c74e630d 171static char **arg_network_macvlan = NULL;
4bbfe7ad 172static char **arg_network_ipvlan = NULL;
69c79d3c 173static bool arg_network_veth = false;
f6d6bad1 174static char **arg_network_veth_extra = NULL;
f757855e 175static char *arg_network_bridge = NULL;
22b28dfd 176static char *arg_network_zone = NULL;
d7bea6b6 177static char *arg_network_namespace_path = NULL;
050f7277 178static unsigned long arg_personality = PERSONALITY_INVALID;
ec16945e 179static char *arg_image = NULL;
f757855e 180static VolatileMode arg_volatile_mode = VOLATILE_NO;
6d0b55c2 181static ExposePort *arg_expose_ports = NULL;
f36933fe 182static char **arg_property = NULL;
0de7acce 183static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
6dac160c 184static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
0de7acce 185static bool arg_userns_chown = false;
c6c8f6e2 186static int arg_kill_signal = 0;
5da38d07 187static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN;
f757855e
LP
188static SettingsMask arg_settings_mask = 0;
189static int arg_settings_trusted = -1;
190static char **arg_parameters = NULL;
6aadfa4c 191static const char *arg_container_service_name = "systemd-nspawn";
9c1e04d0 192static bool arg_notify_ready = false;
5a8ff0e6 193static bool arg_use_cgns = true;
0c582db0 194static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
1099ceeb 195static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_TMPFS_TMP;
4623e8e6
LP
196static void *arg_root_hash = NULL;
197static size_t arg_root_hash_size = 0;
960e4569
LP
198static char **arg_syscall_whitelist = NULL;
199static char **arg_syscall_blacklist = NULL;
bf428efb 200static struct rlimit *arg_rlimit[_RLIMIT_MAX] = {};
66edd963 201static bool arg_no_new_privileges = false;
81f345df
LP
202static int arg_oom_score_adjust = 0;
203static bool arg_oom_score_adjust_set = false;
d107bb7d
LP
204static cpu_set_t *arg_cpuset = NULL;
205static unsigned arg_cpuset_ncpus = 0;
09d423e9 206static ResolvConfMode arg_resolv_conf = RESOLV_CONF_AUTO;
1688841f 207static TimezoneMode arg_timezone = TIMEZONE_AUTO;
88213476 208
37ec0fdd
LP
209static int help(void) {
210 _cleanup_free_ char *link = NULL;
211 int r;
212
0221d68a 213 (void) pager_open(false);
50ebcf6c 214
37ec0fdd
LP
215 r = terminal_urlify_man("systemd-nspawn", "1", &link);
216 if (r < 0)
217 return log_oom();
218
88213476 219 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
a7e2e50d 220 "Spawn a command or OS in a light-weight container.\n\n"
a8828ed9
DW
221 " -h --help Show this help\n"
222 " --version Print version string\n"
69c79d3c 223 " -q --quiet Do not show status information\n"
1b9e5b12 224 " -D --directory=PATH Root directory for the container\n"
ec16945e
LP
225 " --template=PATH Initialize root directory from template directory,\n"
226 " if missing\n"
227 " -x --ephemeral Run container with snapshot of root directory, and\n"
228 " remove it after exit\n"
229 " -i --image=PATH File system device or disk image for the container\n"
4623e8e6 230 " --root-hash=HASH Specify verity root hash\n"
7732f92b 231 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
a8828ed9 232 " -b --boot Boot up full system (i.e. invoke init)\n"
5f932eb9 233 " --chdir=PATH Set working directory in the container\n"
b53ede69
PW
234 " --pivot-root=PATH[:PATH]\n"
235 " Pivot root to given directory in the container\n"
a8828ed9 236 " -u --user=USER Run the command under specified user or uid\n"
a8828ed9 237 " -M --machine=NAME Set the machine name for the container\n"
3a9530e5 238 " --hostname=NAME Override the hostname for the container\n"
69c79d3c 239 " --uuid=UUID Set a specific machine UUID for the container\n"
a8828ed9 240 " -S --slice=SLICE Place the container in the specified slice\n"
f36933fe 241 " --property=NAME=VALUE Set scope unit property\n"
90b4a64d 242 " -U --private-users=pick Run within user namespace, autoselect UID/GID range\n"
03cfe0d5 243 " --private-users[=UIDBASE[:NUIDS]]\n"
90b4a64d 244 " Similar, but with user configured UID/GID range\n"
24597ee0 245 " --private-users-chown Adjust OS tree ownership to private UID/GID range\n"
69c79d3c
LP
246 " --private-network Disable network in container\n"
247 " --network-interface=INTERFACE\n"
248 " Assign an existing network interface to the\n"
249 " container\n"
c74e630d
LP
250 " --network-macvlan=INTERFACE\n"
251 " Create a macvlan network interface based on an\n"
252 " existing network interface to the container\n"
4bbfe7ad
TG
253 " --network-ipvlan=INTERFACE\n"
254 " Create a ipvlan network interface based on an\n"
255 " existing network interface to the container\n"
a8eaaee7 256 " -n --network-veth Add a virtual Ethernet connection between host\n"
69c79d3c 257 " and container\n"
f6d6bad1
LP
258 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
259 " Add an additional virtual Ethernet link between\n"
260 " host and container\n"
ab046dde 261 " --network-bridge=INTERFACE\n"
90b4a64d
ZJS
262 " Add a virtual Ethernet connection to the container\n"
263 " and attach it to an existing bridge on the host\n"
264 " --network-zone=NAME Similar, but attach the new interface to an\n"
265 " an automatically managed bridge interface\n"
d7bea6b6
DP
266 " --network-namespace-path=PATH\n"
267 " Set network namespace to the one represented by\n"
268 " the specified kernel namespace file node\n"
6d0b55c2 269 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
ab5e3a1b 270 " Expose a container IP port on the host\n"
82adf6af
LP
271 " -Z --selinux-context=SECLABEL\n"
272 " Set the SELinux security context to be used by\n"
273 " processes in the container\n"
274 " -L --selinux-apifs-context=SECLABEL\n"
275 " Set the SELinux security context to be used by\n"
276 " API/tmpfs file systems in the container\n"
a8828ed9
DW
277 " --capability=CAP In addition to the default, retain specified\n"
278 " capability\n"
279 " --drop-capability=CAP Drop the specified capability from the default set\n"
960e4569
LP
280 " --system-call-filter=LIST|~LIST\n"
281 " Permit/prohibit specific system calls\n"
bf428efb 282 " --rlimit=NAME=LIMIT Set a resource limit for the payload\n"
81f345df
LP
283 " --oom-score-adjust=VALUE\n"
284 " Adjust the OOM score value for the payload\n"
d107bb7d 285 " --cpu-affinity=CPUS Adjust the CPU affinity of the container\n"
c6c8f6e2 286 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
2b26a728
LP
287 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
288 " host, try-guest, try-host\n"
574edc90 289 " -j Equivalent to --link-journal=try-guest\n"
09d423e9 290 " --resolv-conf=MODE Select mode of /etc/resolv.conf initialization\n"
1688841f 291 " --timezone=MODE Select mode of /etc/localtime initialization\n"
69c79d3c 292 " --read-only Mount the root directory read-only\n"
5e5bfa6e
EY
293 " --bind=PATH[:PATH[:OPTIONS]]\n"
294 " Bind mount a file or directory from the host into\n"
a8828ed9 295 " the container\n"
5e5bfa6e
EY
296 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
297 " Similar, but creates a read-only bind mount\n"
06c17c39 298 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
5a8af538
LP
299 " --overlay=PATH[:PATH...]:PATH\n"
300 " Create an overlay mount from the host to \n"
301 " the container\n"
302 " --overlay-ro=PATH[:PATH...]:PATH\n"
303 " Similar, but creates a read-only overlay mount\n"
a5f1cb3b 304 " -E --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
eb91eb18 305 " --register=BOOLEAN Register container as machine\n"
89f7c846 306 " --keep-unit Do not register a scope for the machine, reuse\n"
4d9f07b4 307 " the service unit nspawn is running in\n"
6d0b55c2 308 " --volatile[=MODE] Run the system in volatile mode\n"
f757855e 309 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
90b4a64d 310 " --notify-ready=BOOLEAN Receive notifications from the child init process\n"
37ec0fdd
LP
311 "\nSee the %s for details.\n"
312 , program_invocation_short_name
313 , link
314 );
315
316 return 0;
88213476
LP
317}
318
86c0dd4a 319static int custom_mount_check_all(void) {
88614c8a 320 size_t i;
5a8af538 321
5a8af538
LP
322 for (i = 0; i < arg_n_custom_mounts; i++) {
323 CustomMount *m = &arg_custom_mounts[i];
324
0de7acce 325 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
baaa35ad
ZJS
326 if (arg_userns_chown)
327 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
328 "--private-users-chown may not be combined with custom root mounts.");
329 else if (arg_uid_shift == UID_INVALID)
330 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
331 "--private-users with automatic UID shift may not be combined with custom root mounts.");
825d5287 332 }
5a8af538
LP
333 }
334
335 return 0;
336}
337
8199d554 338static int detect_unified_cgroup_hierarchy_from_environment(void) {
efdb0237 339 const char *e;
415fc41c 340 int r;
5da38d07 341
efdb0237
LP
342 /* Allow the user to control whether the unified hierarchy is used */
343 e = getenv("UNIFIED_CGROUP_HIERARCHY");
344 if (e) {
345 r = parse_boolean(e);
346 if (r < 0)
347 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
5da38d07
TH
348 if (r > 0)
349 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
350 else
351 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237
LP
352 }
353
8199d554
LP
354 return 0;
355}
356
357static int detect_unified_cgroup_hierarchy_from_image(const char *directory) {
358 int r;
359
360 /* Let's inherit the mode to use from the host system, but let's take into consideration what systemd in the
361 * image actually supports. */
b4cccbc1
LP
362 r = cg_all_unified();
363 if (r < 0)
364 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
365 if (r > 0) {
a8725a06
ZJS
366 /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
367 * routine only detects 231, so we'll have a false negative here for 230. */
368 r = systemd_installation_has_version(directory, 230);
369 if (r < 0)
370 return log_error_errno(r, "Failed to determine systemd version in container: %m");
371 if (r > 0)
372 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
373 else
374 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
c22800e4 375 } else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
2977724b
TH
376 /* Mixed cgroup hierarchy support was added in 233 */
377 r = systemd_installation_has_version(directory, 233);
0fd9563f
ZJS
378 if (r < 0)
379 return log_error_errno(r, "Failed to determine systemd version in container: %m");
380 if (r > 0)
381 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
382 else
383 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
384 } else
5da38d07 385 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237 386
8199d554
LP
387 log_debug("Using %s hierarchy for container.",
388 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_NONE ? "legacy" :
389 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_SYSTEMD ? "hybrid" : "unified");
390
efdb0237
LP
391 return 0;
392}
393
0c582db0
LB
394static void parse_share_ns_env(const char *name, unsigned long ns_flag) {
395 int r;
396
397 r = getenv_bool(name);
398 if (r == -ENXIO)
399 return;
400 if (r < 0)
401 log_warning_errno(r, "Failed to parse %s from environment, defaulting to false.", name);
402 arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
403}
404
4f086aab 405static void parse_mount_settings_env(void) {
4f086aab 406 const char *e;
1099ceeb
LP
407 int r;
408
409 r = getenv_bool("SYSTEMD_NSPAWN_TMPFS_TMP");
410 if (r >= 0)
411 SET_FLAG(arg_mount_settings, MOUNT_APPLY_TMPFS_TMP, r > 0);
412 else if (r != -ENXIO)
413 log_warning_errno(r, "Failed to parse $SYSTEMD_NSPAWN_TMPFS_TMP, ignoring: %m");
4f086aab
SU
414
415 e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
416 if (!e)
417 return;
418
419 if (streq(e, "network")) {
420 arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS;
421 return;
422 }
423
424 r = parse_boolean(e);
425 if (r < 0) {
426 log_warning_errno(r, "Failed to parse SYSTEMD_NSPAWN_API_VFS_WRITABLE from environment, ignoring.");
427 return;
ab8ee0f2 428 }
4f086aab 429
ab8ee0f2
ZJS
430 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_RO, r == 0);
431 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_NETNS, false);
4f086aab
SU
432}
433
d5455d2f
LP
434static void parse_environment(void) {
435 const char *e;
436 int r;
437
438 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
439 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
440 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
441 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
442
443 parse_mount_settings_env();
444
489fae52
ZJS
445 /* SYSTEMD_NSPAWN_USE_CGNS=0 can be used to disable CLONE_NEWCGROUP use,
446 * even if it is supported. If not supported, it has no effect. */
d5455d2f 447 r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
489fae52
ZJS
448 if (r == 0 || !cg_ns_supported())
449 arg_use_cgns = false;
d5455d2f
LP
450
451 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
452 if (e)
453 arg_container_service_name = e;
454
455 detect_unified_cgroup_hierarchy_from_environment();
456}
457
88213476 458static int parse_argv(int argc, char *argv[]) {
a41fe3a2 459 enum {
acbeb427
ZJS
460 ARG_VERSION = 0x100,
461 ARG_PRIVATE_NETWORK,
bc2f673e 462 ARG_UUID,
5076f0cc 463 ARG_READ_ONLY,
57fb9fb5 464 ARG_CAPABILITY,
420c7379 465 ARG_DROP_CAPABILITY,
17fe0523
LP
466 ARG_LINK_JOURNAL,
467 ARG_BIND,
f4889f65 468 ARG_BIND_RO,
06c17c39 469 ARG_TMPFS,
5a8af538
LP
470 ARG_OVERLAY,
471 ARG_OVERLAY_RO,
eb91eb18 472 ARG_SHARE_SYSTEM,
89f7c846 473 ARG_REGISTER,
aa28aefe 474 ARG_KEEP_UNIT,
69c79d3c 475 ARG_NETWORK_INTERFACE,
c74e630d 476 ARG_NETWORK_MACVLAN,
4bbfe7ad 477 ARG_NETWORK_IPVLAN,
ab046dde 478 ARG_NETWORK_BRIDGE,
22b28dfd 479 ARG_NETWORK_ZONE,
f6d6bad1 480 ARG_NETWORK_VETH_EXTRA,
d7bea6b6 481 ARG_NETWORK_NAMESPACE_PATH,
6afc95b7 482 ARG_PERSONALITY,
4d9f07b4 483 ARG_VOLATILE,
ec16945e 484 ARG_TEMPLATE,
f36933fe 485 ARG_PROPERTY,
6dac160c 486 ARG_PRIVATE_USERS,
c6c8f6e2 487 ARG_KILL_SIGNAL,
f757855e 488 ARG_SETTINGS,
5f932eb9 489 ARG_CHDIR,
b53ede69 490 ARG_PIVOT_ROOT,
7336138e 491 ARG_PRIVATE_USERS_CHOWN,
9c1e04d0 492 ARG_NOTIFY_READY,
4623e8e6 493 ARG_ROOT_HASH,
960e4569 494 ARG_SYSTEM_CALL_FILTER,
bf428efb 495 ARG_RLIMIT,
3a9530e5 496 ARG_HOSTNAME,
66edd963 497 ARG_NO_NEW_PRIVILEGES,
81f345df 498 ARG_OOM_SCORE_ADJUST,
d107bb7d 499 ARG_CPU_AFFINITY,
09d423e9 500 ARG_RESOLV_CONF,
1688841f 501 ARG_TIMEZONE,
a41fe3a2
LP
502 };
503
88213476 504 static const struct option options[] = {
d7bea6b6
DP
505 { "help", no_argument, NULL, 'h' },
506 { "version", no_argument, NULL, ARG_VERSION },
507 { "directory", required_argument, NULL, 'D' },
508 { "template", required_argument, NULL, ARG_TEMPLATE },
509 { "ephemeral", no_argument, NULL, 'x' },
510 { "user", required_argument, NULL, 'u' },
511 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
512 { "as-pid2", no_argument, NULL, 'a' },
513 { "boot", no_argument, NULL, 'b' },
514 { "uuid", required_argument, NULL, ARG_UUID },
515 { "read-only", no_argument, NULL, ARG_READ_ONLY },
516 { "capability", required_argument, NULL, ARG_CAPABILITY },
517 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
66edd963 518 { "no-new-privileges", required_argument, NULL, ARG_NO_NEW_PRIVILEGES },
d7bea6b6
DP
519 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
520 { "bind", required_argument, NULL, ARG_BIND },
521 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
522 { "tmpfs", required_argument, NULL, ARG_TMPFS },
523 { "overlay", required_argument, NULL, ARG_OVERLAY },
524 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
525 { "machine", required_argument, NULL, 'M' },
3a9530e5 526 { "hostname", required_argument, NULL, ARG_HOSTNAME },
d7bea6b6
DP
527 { "slice", required_argument, NULL, 'S' },
528 { "setenv", required_argument, NULL, 'E' },
529 { "selinux-context", required_argument, NULL, 'Z' },
530 { "selinux-apifs-context", required_argument, NULL, 'L' },
531 { "quiet", no_argument, NULL, 'q' },
532 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */
533 { "register", required_argument, NULL, ARG_REGISTER },
534 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
535 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
536 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
537 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
538 { "network-veth", no_argument, NULL, 'n' },
539 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA },
540 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
541 { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
542 { "network-namespace-path", required_argument, NULL, ARG_NETWORK_NAMESPACE_PATH },
543 { "personality", required_argument, NULL, ARG_PERSONALITY },
544 { "image", required_argument, NULL, 'i' },
545 { "volatile", optional_argument, NULL, ARG_VOLATILE },
546 { "port", required_argument, NULL, 'p' },
547 { "property", required_argument, NULL, ARG_PROPERTY },
548 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
549 { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN },
550 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
551 { "settings", required_argument, NULL, ARG_SETTINGS },
552 { "chdir", required_argument, NULL, ARG_CHDIR },
553 { "pivot-root", required_argument, NULL, ARG_PIVOT_ROOT },
554 { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
555 { "root-hash", required_argument, NULL, ARG_ROOT_HASH },
556 { "system-call-filter", required_argument, NULL, ARG_SYSTEM_CALL_FILTER },
bf428efb 557 { "rlimit", required_argument, NULL, ARG_RLIMIT },
81f345df 558 { "oom-score-adjust", required_argument, NULL, ARG_OOM_SCORE_ADJUST },
d107bb7d 559 { "cpu-affinity", required_argument, NULL, ARG_CPU_AFFINITY },
09d423e9 560 { "resolv-conf", required_argument, NULL, ARG_RESOLV_CONF },
1688841f 561 { "timezone", required_argument, NULL, ARG_TIMEZONE },
eb9da376 562 {}
88213476
LP
563 };
564
9444b1f2 565 int c, r;
d5455d2f 566 const char *p;
a42c8b54 567 uint64_t plus = 0, minus = 0;
f757855e 568 bool mask_all_settings = false, mask_no_settings = false;
88213476
LP
569
570 assert(argc >= 0);
571 assert(argv);
572
2e1f244e 573 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nUE:", options, NULL)) >= 0)
88213476
LP
574 switch (c) {
575
576 case 'h':
37ec0fdd 577 return help();
88213476 578
acbeb427 579 case ARG_VERSION:
3f6fd1ba 580 return version();
acbeb427 581
88213476 582 case 'D':
0f03c2a4 583 r = parse_path_argument_and_warn(optarg, false, &arg_directory);
ec16945e 584 if (r < 0)
0f03c2a4 585 return r;
ec16945e
LP
586 break;
587
588 case ARG_TEMPLATE:
0f03c2a4 589 r = parse_path_argument_and_warn(optarg, false, &arg_template);
ec16945e 590 if (r < 0)
0f03c2a4 591 return r;
88213476
LP
592 break;
593
1b9e5b12 594 case 'i':
0f03c2a4 595 r = parse_path_argument_and_warn(optarg, false, &arg_image);
ec16945e 596 if (r < 0)
0f03c2a4 597 return r;
ec16945e
LP
598 break;
599
600 case 'x':
601 arg_ephemeral = true;
a2f577fc 602 arg_settings_mask |= SETTING_EPHEMERAL;
1b9e5b12
LP
603 break;
604
687d0825 605 case 'u':
2fc09a9c
DM
606 r = free_and_strdup(&arg_user, optarg);
607 if (r < 0)
7027ff61 608 return log_oom();
687d0825 609
f757855e 610 arg_settings_mask |= SETTING_USER;
687d0825
MV
611 break;
612
22b28dfd
LP
613 case ARG_NETWORK_ZONE: {
614 char *j;
615
616 j = strappend("vz-", optarg);
617 if (!j)
618 return log_oom();
619
620 if (!ifname_valid(j)) {
621 log_error("Network zone name not valid: %s", j);
622 free(j);
623 return -EINVAL;
624 }
625
df1fac6d 626 free_and_replace(arg_network_zone, j);
22b28dfd
LP
627
628 arg_network_veth = true;
629 arg_private_network = true;
630 arg_settings_mask |= SETTING_NETWORK;
631 break;
632 }
633
ab046dde 634 case ARG_NETWORK_BRIDGE:
ef76dff2 635
baaa35ad
ZJS
636 if (!ifname_valid(optarg))
637 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
638 "Bridge interface name not valid: %s", optarg);
ef76dff2 639
f757855e
LP
640 r = free_and_strdup(&arg_network_bridge, optarg);
641 if (r < 0)
642 return log_oom();
ab046dde 643
4831981d 644 _fallthrough_;
0dfaa006 645 case 'n':
69c79d3c
LP
646 arg_network_veth = true;
647 arg_private_network = true;
f757855e 648 arg_settings_mask |= SETTING_NETWORK;
69c79d3c
LP
649 break;
650
f6d6bad1
LP
651 case ARG_NETWORK_VETH_EXTRA:
652 r = veth_extra_parse(&arg_network_veth_extra, optarg);
653 if (r < 0)
654 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
655
656 arg_private_network = true;
657 arg_settings_mask |= SETTING_NETWORK;
658 break;
659
aa28aefe 660 case ARG_NETWORK_INTERFACE:
baaa35ad
ZJS
661 if (!ifname_valid(optarg))
662 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
663 "Network interface name not valid: %s", optarg);
ef76dff2 664
c74e630d
LP
665 if (strv_extend(&arg_network_interfaces, optarg) < 0)
666 return log_oom();
667
668 arg_private_network = true;
f757855e 669 arg_settings_mask |= SETTING_NETWORK;
c74e630d
LP
670 break;
671
672 case ARG_NETWORK_MACVLAN:
ef76dff2 673
baaa35ad
ZJS
674 if (!ifname_valid(optarg))
675 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
676 "MACVLAN network interface name not valid: %s", optarg);
ef76dff2 677
c74e630d 678 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
679 return log_oom();
680
4bbfe7ad 681 arg_private_network = true;
f757855e 682 arg_settings_mask |= SETTING_NETWORK;
4bbfe7ad
TG
683 break;
684
685 case ARG_NETWORK_IPVLAN:
ef76dff2 686
baaa35ad
ZJS
687 if (!ifname_valid(optarg))
688 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
689 "IPVLAN network interface name not valid: %s", optarg);
ef76dff2 690
4bbfe7ad
TG
691 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
692 return log_oom();
693
4831981d 694 _fallthrough_;
ff01d048
LP
695 case ARG_PRIVATE_NETWORK:
696 arg_private_network = true;
f757855e 697 arg_settings_mask |= SETTING_NETWORK;
a41fe3a2
LP
698 break;
699
d7bea6b6
DP
700 case ARG_NETWORK_NAMESPACE_PATH:
701 r = parse_path_argument_and_warn(optarg, false, &arg_network_namespace_path);
702 if (r < 0)
703 return r;
704
705 break;
706
0f0dbc46 707 case 'b':
baaa35ad
ZJS
708 if (arg_start_mode == START_PID2)
709 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
710 "--boot and --as-pid2 may not be combined.");
7732f92b
LP
711
712 arg_start_mode = START_BOOT;
713 arg_settings_mask |= SETTING_START_MODE;
714 break;
715
716 case 'a':
baaa35ad
ZJS
717 if (arg_start_mode == START_BOOT)
718 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
719 "--boot and --as-pid2 may not be combined.");
7732f92b
LP
720
721 arg_start_mode = START_PID2;
722 arg_settings_mask |= SETTING_START_MODE;
0f0dbc46
LP
723 break;
724
144f0fc0 725 case ARG_UUID:
9444b1f2 726 r = sd_id128_from_string(optarg, &arg_uuid);
317feb4d
LP
727 if (r < 0)
728 return log_error_errno(r, "Invalid UUID: %s", optarg);
729
baaa35ad
ZJS
730 if (sd_id128_is_null(arg_uuid))
731 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
732 "Machine UUID may not be all zeroes.");
f757855e
LP
733
734 arg_settings_mask |= SETTING_MACHINE_ID;
9444b1f2 735 break;
aa96c6cb 736
9444b1f2 737 case 'S':
c74e630d 738 arg_slice = optarg;
144f0fc0
LP
739 break;
740
7027ff61 741 case 'M':
c1521918 742 if (isempty(optarg))
97b11eed 743 arg_machine = mfree(arg_machine);
c1521918 744 else {
baaa35ad
ZJS
745 if (!machine_name_is_valid(optarg))
746 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
747 "Invalid machine name: %s", optarg);
7027ff61 748
0c3c4284
LP
749 r = free_and_strdup(&arg_machine, optarg);
750 if (r < 0)
eb91eb18 751 return log_oom();
eb91eb18 752 }
9ce6d1b3 753 break;
7027ff61 754
3a9530e5
LP
755 case ARG_HOSTNAME:
756 if (isempty(optarg))
757 arg_hostname = mfree(arg_hostname);
758 else {
baaa35ad
ZJS
759 if (!hostname_is_valid(optarg, false))
760 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
761 "Invalid hostname: %s", optarg);
3a9530e5
LP
762
763 r = free_and_strdup(&arg_hostname, optarg);
764 if (r < 0)
765 return log_oom();
766 }
767
768 arg_settings_mask |= SETTING_HOSTNAME;
769 break;
770
82adf6af
LP
771 case 'Z':
772 arg_selinux_context = optarg;
a8828ed9
DW
773 break;
774
82adf6af
LP
775 case 'L':
776 arg_selinux_apifs_context = optarg;
a8828ed9
DW
777 break;
778
bc2f673e
LP
779 case ARG_READ_ONLY:
780 arg_read_only = true;
f757855e 781 arg_settings_mask |= SETTING_READ_ONLY;
bc2f673e
LP
782 break;
783
420c7379
LP
784 case ARG_CAPABILITY:
785 case ARG_DROP_CAPABILITY: {
6cbe4ed1 786 p = optarg;
9ed794a3 787 for (;;) {
6cbe4ed1 788 _cleanup_free_ char *t = NULL;
5076f0cc 789
6cbe4ed1
SS
790 r = extract_first_word(&p, &t, ",", 0);
791 if (r < 0)
792 return log_error_errno(r, "Failed to parse capability %s.", t);
5076f0cc 793
6cbe4ed1
SS
794 if (r == 0)
795 break;
5076f0cc 796
39ed67d1
LP
797 if (streq(t, "all")) {
798 if (c == ARG_CAPABILITY)
a42c8b54 799 plus = (uint64_t) -1;
39ed67d1 800 else
a42c8b54 801 minus = (uint64_t) -1;
39ed67d1 802 } else {
acf4d158
YW
803 r = capability_from_name(t);
804 if (r < 0)
805 return log_error_errno(r, "Failed to parse capability %s.", t);
39ed67d1
LP
806
807 if (c == ARG_CAPABILITY)
acf4d158 808 plus |= 1ULL << r;
39ed67d1 809 else
acf4d158 810 minus |= 1ULL << r;
5076f0cc 811 }
5076f0cc
LP
812 }
813
f757855e 814 arg_settings_mask |= SETTING_CAPABILITY;
5076f0cc
LP
815 break;
816 }
817
66edd963
LP
818 case ARG_NO_NEW_PRIVILEGES:
819 r = parse_boolean(optarg);
820 if (r < 0)
821 return log_error_errno(r, "Failed to parse --no-new-privileges= argument: %s", optarg);
822
823 arg_no_new_privileges = r;
824 arg_settings_mask |= SETTING_NO_NEW_PRIVILEGES;
825 break;
826
57fb9fb5
LP
827 case 'j':
828 arg_link_journal = LINK_GUEST;
574edc90 829 arg_link_journal_try = true;
4e1d6aa9 830 arg_settings_mask |= SETTING_LINK_JOURNAL;
57fb9fb5
LP
831 break;
832
833 case ARG_LINK_JOURNAL:
4e1d6aa9
LP
834 r = parse_link_journal(optarg, &arg_link_journal, &arg_link_journal_try);
835 if (r < 0) {
836 log_error_errno(r, "Failed to parse link journal mode %s", optarg);
57fb9fb5
LP
837 return -EINVAL;
838 }
839
4e1d6aa9 840 arg_settings_mask |= SETTING_LINK_JOURNAL;
57fb9fb5
LP
841 break;
842
17fe0523 843 case ARG_BIND:
f757855e
LP
844 case ARG_BIND_RO:
845 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
846 if (r < 0)
847 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
17fe0523 848
f757855e 849 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
17fe0523 850 break;
06c17c39 851
f757855e
LP
852 case ARG_TMPFS:
853 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
854 if (r < 0)
855 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
5a8af538 856
f757855e 857 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
5a8af538 858 break;
5a8af538
LP
859
860 case ARG_OVERLAY:
ad85779a
LP
861 case ARG_OVERLAY_RO:
862 r = overlay_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_OVERLAY_RO);
863 if (r == -EADDRNOTAVAIL)
864 return log_error_errno(r, "--overlay(-ro)= needs at least two colon-separated directories specified.");
865 if (r < 0)
866 return log_error_errno(r, "Failed to parse --overlay(-ro)= argument %s: %m", optarg);
06c17c39 867
f757855e 868 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
06c17c39 869 break;
06c17c39 870
a5f1cb3b 871 case 'E': {
f4889f65
LP
872 char **n;
873
baaa35ad
ZJS
874 if (!env_assignment_is_valid(optarg))
875 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
876 "Environment variable assignment '%s' is not valid.", optarg);
f4889f65
LP
877
878 n = strv_env_set(arg_setenv, optarg);
879 if (!n)
880 return log_oom();
881
130d3d22 882 strv_free_and_replace(arg_setenv, n);
f757855e 883 arg_settings_mask |= SETTING_ENVIRONMENT;
f4889f65
LP
884 break;
885 }
886
284c0b91
LP
887 case 'q':
888 arg_quiet = true;
889 break;
890
8a96d94e 891 case ARG_SHARE_SYSTEM:
a6b5216c 892 /* We don't officially support this anymore, except for compat reasons. People should use the
0c582db0 893 * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
63d1c29f 894 log_warning("Please do not use --share-system anymore, use $SYSTEMD_NSPAWN_SHARE_* instead.");
0c582db0 895 arg_clone_ns_flags = 0;
8a96d94e
LP
896 break;
897
eb91eb18
LP
898 case ARG_REGISTER:
899 r = parse_boolean(optarg);
900 if (r < 0) {
901 log_error("Failed to parse --register= argument: %s", optarg);
902 return r;
903 }
904
905 arg_register = r;
906 break;
907
89f7c846
LP
908 case ARG_KEEP_UNIT:
909 arg_keep_unit = true;
910 break;
911
6afc95b7
LP
912 case ARG_PERSONALITY:
913
ac45f971 914 arg_personality = personality_from_string(optarg);
baaa35ad
ZJS
915 if (arg_personality == PERSONALITY_INVALID)
916 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
917 "Unknown or unsupported personality '%s'.", optarg);
6afc95b7 918
f757855e 919 arg_settings_mask |= SETTING_PERSONALITY;
6afc95b7
LP
920 break;
921
4d9f07b4
LP
922 case ARG_VOLATILE:
923
924 if (!optarg)
f757855e 925 arg_volatile_mode = VOLATILE_YES;
5c828e66
LP
926 else if (streq(optarg, "help")) {
927 DUMP_STRING_TABLE(volatile_mode, VolatileMode, _VOLATILE_MODE_MAX);
928 return 0;
929 } else {
f757855e 930 VolatileMode m;
4d9f07b4 931
f757855e 932 m = volatile_mode_from_string(optarg);
baaa35ad
ZJS
933 if (m < 0)
934 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
935 "Failed to parse --volatile= argument: %s", optarg);
936 else
f757855e 937 arg_volatile_mode = m;
6d0b55c2
LP
938 }
939
f757855e
LP
940 arg_settings_mask |= SETTING_VOLATILE_MODE;
941 break;
6d0b55c2 942
f757855e
LP
943 case 'p':
944 r = expose_port_parse(&arg_expose_ports, optarg);
945 if (r == -EEXIST)
946 return log_error_errno(r, "Duplicate port specification: %s", optarg);
947 if (r < 0)
948 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
6d0b55c2 949
f757855e 950 arg_settings_mask |= SETTING_EXPOSE_PORTS;
6d0b55c2 951 break;
6d0b55c2 952
f36933fe
LP
953 case ARG_PROPERTY:
954 if (strv_extend(&arg_property, optarg) < 0)
955 return log_oom();
956
957 break;
958
ae209204
ZJS
959 case ARG_PRIVATE_USERS: {
960 int boolean = -1;
0de7acce 961
ae209204
ZJS
962 if (!optarg)
963 boolean = true;
964 else if (!in_charset(optarg, DIGITS))
965 /* do *not* parse numbers as booleans */
966 boolean = parse_boolean(optarg);
967
968 if (boolean == false) {
0de7acce
LP
969 /* no: User namespacing off */
970 arg_userns_mode = USER_NAMESPACE_NO;
971 arg_uid_shift = UID_INVALID;
972 arg_uid_range = UINT32_C(0x10000);
ae209204 973 } else if (boolean == true) {
0de7acce
LP
974 /* yes: User namespacing on, UID range is read from root dir */
975 arg_userns_mode = USER_NAMESPACE_FIXED;
976 arg_uid_shift = UID_INVALID;
977 arg_uid_range = UINT32_C(0x10000);
978 } else if (streq(optarg, "pick")) {
979 /* pick: User namespacing on, UID range is picked randomly */
980 arg_userns_mode = USER_NAMESPACE_PICK;
981 arg_uid_shift = UID_INVALID;
982 arg_uid_range = UINT32_C(0x10000);
983 } else {
6c2058b3 984 _cleanup_free_ char *buffer = NULL;
6dac160c
LP
985 const char *range, *shift;
986
0de7acce
LP
987 /* anything else: User namespacing on, UID range is explicitly configured */
988
6dac160c
LP
989 range = strchr(optarg, ':');
990 if (range) {
6c2058b3
ZJS
991 buffer = strndup(optarg, range - optarg);
992 if (!buffer)
993 return log_oom();
994 shift = buffer;
6dac160c
LP
995
996 range++;
bfd292ec
ZJS
997 r = safe_atou32(range, &arg_uid_range);
998 if (r < 0)
be715731 999 return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range);
6dac160c
LP
1000 } else
1001 shift = optarg;
1002
be715731
ZJS
1003 r = parse_uid(shift, &arg_uid_shift);
1004 if (r < 0)
1005 return log_error_errno(r, "Failed to parse UID \"%s\": %m", optarg);
0de7acce
LP
1006
1007 arg_userns_mode = USER_NAMESPACE_FIXED;
6dac160c
LP
1008 }
1009
baaa35ad
ZJS
1010 if (arg_uid_range <= 0)
1011 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1012 "UID range cannot be 0.");
be715731 1013
0de7acce 1014 arg_settings_mask |= SETTING_USERNS;
6dac160c 1015 break;
ae209204 1016 }
6dac160c 1017
0de7acce 1018 case 'U':
ccabee0d
LP
1019 if (userns_supported()) {
1020 arg_userns_mode = USER_NAMESPACE_PICK;
1021 arg_uid_shift = UID_INVALID;
1022 arg_uid_range = UINT32_C(0x10000);
1023
1024 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
1025 }
1026
7336138e
LP
1027 break;
1028
0de7acce 1029 case ARG_PRIVATE_USERS_CHOWN:
19aac838 1030 arg_userns_chown = true;
0de7acce
LP
1031
1032 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
1033 break;
1034
c6c8f6e2 1035 case ARG_KILL_SIGNAL:
5c828e66
LP
1036 if (streq(optarg, "help")) {
1037 DUMP_STRING_TABLE(signal, int, _NSIG);
1038 return 0;
1039 }
1040
29a3db75 1041 arg_kill_signal = signal_from_string(optarg);
baaa35ad
ZJS
1042 if (arg_kill_signal < 0)
1043 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1044 "Cannot parse signal: %s", optarg);
c6c8f6e2 1045
f757855e
LP
1046 arg_settings_mask |= SETTING_KILL_SIGNAL;
1047 break;
1048
1049 case ARG_SETTINGS:
1050
1051 /* no → do not read files
1052 * yes → read files, do not override cmdline, trust only subset
1053 * override → read files, override cmdline, trust only subset
1054 * trusted → read files, do not override cmdline, trust all
1055 */
1056
1057 r = parse_boolean(optarg);
1058 if (r < 0) {
1059 if (streq(optarg, "trusted")) {
1060 mask_all_settings = false;
1061 mask_no_settings = false;
1062 arg_settings_trusted = true;
1063
1064 } else if (streq(optarg, "override")) {
1065 mask_all_settings = false;
1066 mask_no_settings = true;
1067 arg_settings_trusted = -1;
1068 } else
1069 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
1070 } else if (r > 0) {
1071 /* yes */
1072 mask_all_settings = false;
1073 mask_no_settings = false;
1074 arg_settings_trusted = -1;
1075 } else {
1076 /* no */
1077 mask_all_settings = true;
1078 mask_no_settings = false;
1079 arg_settings_trusted = false;
1080 }
1081
c6c8f6e2
LP
1082 break;
1083
5f932eb9 1084 case ARG_CHDIR:
baaa35ad
ZJS
1085 if (!path_is_absolute(optarg))
1086 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1087 "Working directory %s is not an absolute path.", optarg);
5f932eb9
LP
1088
1089 r = free_and_strdup(&arg_chdir, optarg);
1090 if (r < 0)
1091 return log_oom();
1092
1093 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
1094 break;
1095
b53ede69
PW
1096 case ARG_PIVOT_ROOT:
1097 r = pivot_root_parse(&arg_pivot_root_new, &arg_pivot_root_old, optarg);
1098 if (r < 0)
1099 return log_error_errno(r, "Failed to parse --pivot-root= argument %s: %m", optarg);
1100
1101 arg_settings_mask |= SETTING_PIVOT_ROOT;
1102 break;
1103
9c1e04d0
AP
1104 case ARG_NOTIFY_READY:
1105 r = parse_boolean(optarg);
baaa35ad
ZJS
1106 if (r < 0)
1107 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1108 "%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
9c1e04d0
AP
1109 arg_notify_ready = r;
1110 arg_settings_mask |= SETTING_NOTIFY_READY;
1111 break;
1112
4623e8e6
LP
1113 case ARG_ROOT_HASH: {
1114 void *k;
1115 size_t l;
1116
1117 r = unhexmem(optarg, strlen(optarg), &k, &l);
1118 if (r < 0)
1119 return log_error_errno(r, "Failed to parse root hash: %s", optarg);
1120 if (l < sizeof(sd_id128_t)) {
1121 log_error("Root hash must be at least 128bit long: %s", optarg);
1122 free(k);
1123 return -EINVAL;
1124 }
1125
1126 free(arg_root_hash);
1127 arg_root_hash = k;
1128 arg_root_hash_size = l;
1129 break;
1130 }
1131
960e4569
LP
1132 case ARG_SYSTEM_CALL_FILTER: {
1133 bool negative;
1134 const char *items;
1135
1136 negative = optarg[0] == '~';
1137 items = negative ? optarg + 1 : optarg;
1138
1139 for (;;) {
1140 _cleanup_free_ char *word = NULL;
1141
1142 r = extract_first_word(&items, &word, NULL, 0);
1143 if (r == 0)
1144 break;
1145 if (r == -ENOMEM)
1146 return log_oom();
1147 if (r < 0)
1148 return log_error_errno(r, "Failed to parse system call filter: %m");
1149
1150 if (negative)
1151 r = strv_extend(&arg_syscall_blacklist, word);
1152 else
1153 r = strv_extend(&arg_syscall_whitelist, word);
1154 if (r < 0)
1155 return log_oom();
1156 }
1157
1158 arg_settings_mask |= SETTING_SYSCALL_FILTER;
1159 break;
1160 }
1161
bf428efb
LP
1162 case ARG_RLIMIT: {
1163 const char *eq;
1164 char *name;
1165 int rl;
1166
5c828e66
LP
1167 if (streq(optarg, "help")) {
1168 DUMP_STRING_TABLE(rlimit, int, _RLIMIT_MAX);
1169 return 0;
1170 }
1171
bf428efb 1172 eq = strchr(optarg, '=');
baaa35ad
ZJS
1173 if (!eq)
1174 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1175 "--rlimit= expects an '=' assignment.");
bf428efb
LP
1176
1177 name = strndup(optarg, eq - optarg);
1178 if (!name)
1179 return log_oom();
1180
1181 rl = rlimit_from_string_harder(name);
baaa35ad
ZJS
1182 if (rl < 0)
1183 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1184 "Unknown resource limit: %s", name);
bf428efb
LP
1185
1186 if (!arg_rlimit[rl]) {
1187 arg_rlimit[rl] = new0(struct rlimit, 1);
1188 if (!arg_rlimit[rl])
1189 return log_oom();
1190 }
1191
1192 r = rlimit_parse(rl, eq + 1, arg_rlimit[rl]);
1193 if (r < 0)
1194 return log_error_errno(r, "Failed to parse resource limit: %s", eq + 1);
1195
1196 arg_settings_mask |= SETTING_RLIMIT_FIRST << rl;
1197 break;
1198 }
1199
81f345df
LP
1200 case ARG_OOM_SCORE_ADJUST:
1201 r = parse_oom_score_adjust(optarg, &arg_oom_score_adjust);
1202 if (r < 0)
1203 return log_error_errno(r, "Failed to parse --oom-score-adjust= parameter: %s", optarg);
1204
1205 arg_oom_score_adjust_set = true;
1206 arg_settings_mask |= SETTING_OOM_SCORE_ADJUST;
1207 break;
1208
d107bb7d
LP
1209 case ARG_CPU_AFFINITY: {
1210 _cleanup_cpu_free_ cpu_set_t *cpuset = NULL;
1211
1212 r = parse_cpu_set(optarg, &cpuset);
1213 if (r < 0)
1214 return log_error_errno(r, "Failed to parse CPU affinity mask: %s", optarg);
1215
1216 if (arg_cpuset)
1217 CPU_FREE(arg_cpuset);
1218
1219 arg_cpuset = TAKE_PTR(cpuset);
1220 arg_cpuset_ncpus = r;
1221 arg_settings_mask |= SETTING_CPU_AFFINITY;
1222 break;
1223 }
1224
09d423e9
LP
1225 case ARG_RESOLV_CONF:
1226 if (streq(optarg, "help")) {
1227 DUMP_STRING_TABLE(resolv_conf_mode, ResolvConfMode, _RESOLV_CONF_MODE_MAX);
1228 return 0;
1229 }
1230
1231 arg_resolv_conf = resolv_conf_mode_from_string(optarg);
baaa35ad
ZJS
1232 if (arg_resolv_conf < 0)
1233 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1234 "Failed to parse /etc/resolv.conf mode: %s", optarg);
09d423e9
LP
1235
1236 arg_settings_mask |= SETTING_RESOLV_CONF;
1237 break;
1238
1688841f
LP
1239 case ARG_TIMEZONE:
1240 if (streq(optarg, "help")) {
1241 DUMP_STRING_TABLE(timezone_mode, TimezoneMode, _TIMEZONE_MODE_MAX);
1242 return 0;
1243 }
1244
1245 arg_timezone = timezone_mode_from_string(optarg);
baaa35ad
ZJS
1246 if (arg_timezone < 0)
1247 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1248 "Failed to parse /etc/localtime mode: %s", optarg);
1688841f
LP
1249
1250 arg_settings_mask |= SETTING_TIMEZONE;
1251 break;
1252
88213476
LP
1253 case '?':
1254 return -EINVAL;
1255
1256 default:
eb9da376 1257 assert_not_reached("Unhandled option");
88213476 1258 }
88213476 1259
60f1ec13
LP
1260 if (argc > optind) {
1261 strv_free(arg_parameters);
1262 arg_parameters = strv_copy(argv + optind);
1263 if (!arg_parameters)
1264 return log_oom();
d7bea6b6 1265
60f1ec13
LP
1266 arg_settings_mask |= SETTING_START_MODE;
1267 }
1268
1269 if (arg_ephemeral && arg_template && !arg_directory)
1270 /* User asked for ephemeral execution but specified --template= instead of --directory=. Semantically
1271 * such an invocation makes some sense, see https://github.com/systemd/systemd/issues/3667. Let's
1272 * accept this here, and silently make "--ephemeral --template=" equivalent to "--ephemeral
1273 * --directory=". */
1274 arg_directory = TAKE_PTR(arg_template);
1275
1276 arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
1277
1278 /* Load all settings from .nspawn files */
1279 if (mask_no_settings)
1280 arg_settings_mask = 0;
1281
1282 /* Don't load any settings from .nspawn files */
1283 if (mask_all_settings)
1284 arg_settings_mask = _SETTINGS_MASK_ALL;
1285
1286 return 1;
1287}
1288
1289static int verify_arguments(void) {
1290 int r;
a6b5216c 1291
4f086aab
SU
1292 if (arg_userns_mode != USER_NAMESPACE_NO)
1293 arg_mount_settings |= MOUNT_USE_USERNS;
1294
1295 if (arg_private_network)
1296 arg_mount_settings |= MOUNT_APPLY_APIVFS_NETNS;
1297
48a8d337
LB
1298 if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
1299 !(arg_clone_ns_flags & CLONE_NEWUTS)) {
eb91eb18 1300 arg_register = false;
baaa35ad 1301 if (arg_start_mode != START_PID1)
60f1ec13 1302 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--boot cannot be used without namespacing.");
0c582db0 1303 }
eb91eb18 1304
0de7acce 1305 if (arg_userns_mode == USER_NAMESPACE_PICK)
0e7ac751
LP
1306 arg_userns_chown = true;
1307
60f1ec13
LP
1308 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
1309 arg_kill_signal = SIGRTMIN+3;
1310
baaa35ad 1311 if (arg_keep_unit && arg_register && cg_pid_get_owner_uid(0, NULL) >= 0)
8d9c2bca
AJ
1312 /* Save the user from accidentally registering either user-$SESSION.scope or user@.service.
1313 * The latter is not technically a user session, but we don't need to labour the point. */
60f1ec13 1314 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--keep-unit --register=yes may not be used when invoked from a user session.");
89f7c846 1315
baaa35ad 1316 if (arg_directory && arg_image)
60f1ec13 1317 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--directory= and --image= may not be combined.");
1b9e5b12 1318
baaa35ad 1319 if (arg_template && arg_image)
60f1ec13 1320 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= and --image= may not be combined.");
8cd328d8 1321
baaa35ad 1322 if (arg_template && !(arg_directory || arg_machine))
60f1ec13 1323 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= needs --directory= or --machine=.");
ec16945e 1324
baaa35ad 1325 if (arg_ephemeral && arg_template)
60f1ec13 1326 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --template= may not be combined.");
ec16945e 1327
baaa35ad 1328 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO))
60f1ec13 1329 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --link-journal= may not be combined.");
df9a75e4 1330
baaa35ad 1331 if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported())
60f1ec13 1332 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--private-users= is not supported, kernel compiled without user namespace support.");
7336138e 1333
baaa35ad 1334 if (arg_userns_chown && arg_read_only)
60f1ec13 1335 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--read-only and --private-users-chown may not be combined.");
f757855e 1336
60f1ec13
LP
1337 /* If --network-namespace-path is given with any other network-related option,
1338 * we need to error out, to avoid conflicts between different network options. */
1339 if (arg_network_namespace_path &&
1340 (arg_network_interfaces || arg_network_macvlan ||
1341 arg_network_ipvlan || arg_network_veth_extra ||
1342 arg_network_bridge || arg_network_zone ||
1343 arg_network_veth || arg_private_network))
1344 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--network-namespace-path cannot be combined with other network options.");
86c0dd4a 1345
60f1ec13
LP
1346 if (arg_network_bridge && arg_network_zone)
1347 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--network-bridge= and --network-zone= may not be combined.");
f757855e 1348
baaa35ad 1349 if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network)
60f1ec13 1350 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
4f086aab 1351
baaa35ad 1352 if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO))
60f1ec13 1353 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot combine --private-users with read-write mounts.");
f757855e 1354
baaa35ad 1355 if (arg_volatile_mode != VOLATILE_NO && arg_read_only)
60f1ec13 1356 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
4d9f07b4 1357
baaa35ad 1358 if (arg_expose_ports && !arg_private_network)
60f1ec13 1359 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot use --port= without private networking.");
6d0b55c2 1360
349cc4a5 1361#if ! HAVE_LIBIPTC
baaa35ad 1362 if (arg_expose_ports)
60f1ec13 1363 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--port= is not supported, compiled without libiptc support.");
1c1ea217
EV
1364#endif
1365
60f1ec13
LP
1366 r = custom_mount_check_all();
1367 if (r < 0)
1368 return r;
c6c8f6e2 1369
f757855e 1370 return 0;
88213476
LP
1371}
1372
03cfe0d5
LP
1373static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1374 assert(p);
1375
0de7acce 1376 if (arg_userns_mode == USER_NAMESPACE_NO)
03cfe0d5
LP
1377 return 0;
1378
1379 if (uid == UID_INVALID && gid == GID_INVALID)
1380 return 0;
1381
1382 if (uid != UID_INVALID) {
1383 uid += arg_uid_shift;
1384
1385 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1386 return -EOVERFLOW;
1387 }
1388
1389 if (gid != GID_INVALID) {
1390 gid += (gid_t) arg_uid_shift;
1391
1392 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1393 return -EOVERFLOW;
1394 }
1395
1396 if (lchown(p, uid, gid) < 0)
1397 return -errno;
b12afc8c
LP
1398
1399 return 0;
1400}
1401
03cfe0d5
LP
/* Creates the directory 'path' below 'root' with the given mode and, if it was newly created, adjusts its
 * ownership via userns_lchown(). An already existing directory is left untouched and counts as success.
 * Returns 0 on success, a negative errno-style value otherwise. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;
        int k;

        full = prefix_roota(root, path);

        k = mkdir_errno_wrapper(full, mode);
        if (k < 0)
                /* Pre-existing directory: nothing to chown, not an error */
                return k == -EEXIST ? 0 : k;

        return userns_lchown(full, uid, gid);
}
1415
/* Matches 'path' against the two zoneinfo directory prefixes (relative "../usr/share/zoneinfo/" and absolute
 * "/usr/share/zoneinfo/") and returns whatever PATH_STARTSWITH_SET() yields for it. The callers in this file
 * treat a NULL result as "does not point into the zoneinfo directory" and a non-NULL result as the zone name. */
static const char *timezone_from_path(const char *path) {
        const char *zone;

        zone = PATH_STARTSWITH_SET(path, "../usr/share/zoneinfo/", "/usr/share/zoneinfo/");

        return zone;
}
1422
e58a1277 1423static int setup_timezone(const char *dest) {
1688841f
LP
1424 _cleanup_free_ char *p = NULL, *etc = NULL;
1425 const char *where, *check;
1426 TimezoneMode m;
d4036145 1427 int r;
f8440af5 1428
e58a1277
LP
1429 assert(dest);
1430
1688841f 1431 if (IN_SET(arg_timezone, TIMEZONE_AUTO, TIMEZONE_SYMLINK)) {
1688841f
LP
1432 r = readlink_malloc("/etc/localtime", &p);
1433 if (r == -ENOENT && arg_timezone == TIMEZONE_AUTO)
1434 m = arg_read_only && arg_volatile_mode != VOLATILE_YES ? TIMEZONE_OFF : TIMEZONE_DELETE;
1435 else if (r == -EINVAL && arg_timezone == TIMEZONE_AUTO) /* regular file? */
1436 m = arg_read_only && arg_volatile_mode != VOLATILE_YES ? TIMEZONE_BIND : TIMEZONE_COPY;
1437 else if (r < 0) {
1438 log_warning_errno(r, "Failed to read host's /etc/localtime symlink, not updating container timezone: %m");
1439 /* To handle warning, delete /etc/localtime and replace it with a symbolic link to a time zone data
1440 * file.
1441 *
1442 * Example:
1443 * ln -s /usr/share/zoneinfo/UTC /etc/localtime
1444 */
1445 return 0;
1446 } else if (arg_timezone == TIMEZONE_AUTO)
1447 m = arg_read_only && arg_volatile_mode != VOLATILE_YES ? TIMEZONE_BIND : TIMEZONE_SYMLINK;
1448 else
1449 m = arg_timezone;
1450 } else
1451 m = arg_timezone;
1452
1453 if (m == TIMEZONE_OFF)
1454 return 0;
1455
1456 r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc);
d4036145 1457 if (r < 0) {
1688841f 1458 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
d4036145
LP
1459 return 0;
1460 }
1461
1688841f
LP
1462 where = strjoina(etc, "/localtime");
1463
1464 switch (m) {
1465
1466 case TIMEZONE_DELETE:
1467 if (unlink(where) < 0)
1468 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
1469
d4036145 1470 return 0;
d4036145 1471
1688841f
LP
1472 case TIMEZONE_SYMLINK: {
1473 _cleanup_free_ char *q = NULL;
1474 const char *z, *what;
4d1c38b8 1475
1688841f
LP
1476 z = timezone_from_path(p);
1477 if (!z) {
1478 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
d4036145 1479 return 0;
1688841f 1480 }
d4036145 1481
1688841f
LP
1482 r = readlink_malloc(where, &q);
1483 if (r >= 0 && streq_ptr(timezone_from_path(q), z))
1484 return 0; /* Already pointing to the right place? Then do nothing .. */
1485
1486 check = strjoina(dest, "/usr/share/zoneinfo/", z);
1487 r = chase_symlinks(check, dest, 0, NULL);
1488 if (r < 0)
1489 log_debug_errno(r, "Timezone %s does not exist (or is not accessible) in container, not creating symlink: %m", z);
1490 else {
1491 if (unlink(where) < 0 && errno != ENOENT) {
1492 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING, /* Don't complain on read-only images */
1493 errno, "Failed to remove existing timezone info %s in container, ignoring: %m", where);
1494 return 0;
1495 }
1496
1497 what = strjoina("../usr/share/zoneinfo/", z);
1498 if (symlink(what, where) < 0) {
1499 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING,
1500 errno, "Failed to correct timezone of container, ignoring: %m");
1501 return 0;
1502 }
1503
1504 break;
1505 }
1506
1507 _fallthrough_;
d4036145 1508 }
68fb0892 1509
1688841f
LP
1510 case TIMEZONE_BIND: {
1511 _cleanup_free_ char *resolved = NULL;
1512 int found;
1513
1514 found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved);
1515 if (found < 0) {
1516 log_warning_errno(found, "Failed to resolve /etc/localtime path in container, ignoring: %m");
1517 return 0;
1518 }
1519
1520 if (found == 0) /* missing? */
1521 (void) touch(resolved);
1522
1523 r = mount_verbose(LOG_WARNING, "/etc/localtime", resolved, NULL, MS_BIND, NULL);
1524 if (r >= 0)
1525 return mount_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
1526
1527 _fallthrough_;
79d80fc1 1528 }
4d9f07b4 1529
1688841f
LP
1530 case TIMEZONE_COPY:
1531 /* If mounting failed, try to copy */
1532 r = copy_file_atomic("/etc/localtime", where, 0644, 0, COPY_REFLINK|COPY_REPLACE);
1533 if (r < 0) {
1534 log_full_errno(IN_SET(r, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
1535 "Failed to copy /etc/localtime to %s, ignoring: %m", where);
1536 return 0;
1537 }
1538
1539 break;
1540
1541 default:
1542 assert_not_reached("unexpected mode");
d4036145 1543 }
e58a1277 1544
1688841f 1545 /* Fix permissions of the symlink or file copy we just created */
03cfe0d5
LP
1546 r = userns_lchown(where, 0, 0);
1547 if (r < 0)
1688841f 1548 log_warning_errno(r, "Failed to chown /etc/localtime, ignoring: %m");
03cfe0d5 1549
e58a1277 1550 return 0;
88213476
LP
1551}
1552
09d423e9
LP
/* Tests via access(F_OK) whether 'path' exists. Returns 1 if it does, 0 if access() fails with ENOENT, and
 * for any other failure the value of log_debug_errno() for that errno, after logging at debug level. */
static int have_resolv_conf(const char *path) {
        assert(path);

        if (access(path, F_OK) >= 0)
                return 1;

        if (errno == ENOENT)
                return 0;

        return log_debug_errno(errno, "Failed to determine whether '%s' is available: %m", path);
}
1565
7357272e 1566static int resolved_listening(void) {
b8ea7a6e 1567 _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
b053cd5f 1568 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
7357272e 1569 _cleanup_free_ char *dns_stub_listener_mode = NULL;
b053cd5f
LP
1570 int r;
1571
7357272e 1572 /* Check if resolved is listening */
b053cd5f
LP
1573
1574 r = sd_bus_open_system(&bus);
1575 if (r < 0)
b8ea7a6e 1576 return log_debug_errno(r, "Failed to open system bus: %m");
b053cd5f 1577
7357272e 1578 r = bus_name_has_owner(bus, "org.freedesktop.resolve1", NULL);
b8ea7a6e
LP
1579 if (r < 0)
1580 return log_debug_errno(r, "Failed to check whether the 'org.freedesktop.resolve1' bus name is taken: %m");
1581 if (r == 0)
1582 return 0;
7357272e
DM
1583
1584 r = sd_bus_get_property_string(bus,
1585 "org.freedesktop.resolve1",
1586 "/org/freedesktop/resolve1",
1587 "org.freedesktop.resolve1.Manager",
1588 "DNSStubListener",
b8ea7a6e 1589 &error,
7357272e
DM
1590 &dns_stub_listener_mode);
1591 if (r < 0)
b8ea7a6e 1592 return log_debug_errno(r, "Failed to query DNSStubListener property: %s", bus_error_message(&error, r));
7357272e
DM
1593
1594 return STR_IN_SET(dns_stub_listener_mode, "udp", "yes");
b053cd5f
LP
1595}
1596
2547bb41 1597static int setup_resolv_conf(const char *dest) {
09d423e9
LP
1598 _cleanup_free_ char *etc = NULL;
1599 const char *where, *what;
1600 ResolvConfMode m;
1601 int r;
2547bb41
LP
1602
1603 assert(dest);
1604
09d423e9
LP
1605 if (arg_resolv_conf == RESOLV_CONF_AUTO) {
1606 if (arg_private_network)
1607 m = RESOLV_CONF_OFF;
1608 else if (have_resolv_conf(STATIC_RESOLV_CONF) > 0 && resolved_listening() > 0)
27b620b7 1609 m = arg_read_only && arg_volatile_mode != VOLATILE_YES ? RESOLV_CONF_BIND_STATIC : RESOLV_CONF_COPY_STATIC;
09d423e9
LP
1610 else if (have_resolv_conf("/etc/resolv.conf") > 0)
1611 m = arg_read_only && arg_volatile_mode != VOLATILE_YES ? RESOLV_CONF_BIND_HOST : RESOLV_CONF_COPY_HOST;
1612 else
1613 m = arg_read_only && arg_volatile_mode != VOLATILE_YES ? RESOLV_CONF_OFF : RESOLV_CONF_DELETE;
1614 } else
1615 m = arg_resolv_conf;
1616
1617 if (m == RESOLV_CONF_OFF)
2547bb41
LP
1618 return 0;
1619
87447ae4
LP
1620 r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc);
1621 if (r < 0) {
1622 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
1623 return 0;
1624 }
1625
1626 where = strjoina(etc, "/resolv.conf");
09d423e9
LP
1627
1628 if (m == RESOLV_CONF_DELETE) {
1629 if (unlink(where) < 0)
1630 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
1631
87447ae4
LP
1632 return 0;
1633 }
79d80fc1 1634
09d423e9
LP
1635 if (IN_SET(m, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_COPY_STATIC))
1636 what = STATIC_RESOLV_CONF;
1637 else
1638 what = "/etc/resolv.conf";
87447ae4 1639
09d423e9
LP
1640 if (IN_SET(m, RESOLV_CONF_BIND_HOST, RESOLV_CONF_BIND_STATIC)) {
1641 _cleanup_free_ char *resolved = NULL;
1642 int found;
1643
1644 found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved);
1645 if (found < 0) {
1646 log_warning_errno(found, "Failed to resolve /etc/resolv.conf path in container, ignoring: %m");
1647 return 0;
1648 }
3539724c 1649
87447ae4
LP
1650 if (found == 0) /* missing? */
1651 (void) touch(resolved);
5367354d 1652
09d423e9 1653 r = mount_verbose(LOG_WARNING, what, resolved, NULL, MS_BIND, NULL);
60e76d48 1654 if (r >= 0)
87447ae4 1655 return mount_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
3539724c
LP
1656 }
1657
1658 /* If that didn't work, let's copy the file */
09d423e9 1659 r = copy_file(what, where, O_TRUNC|O_NOFOLLOW, 0644, 0, COPY_REFLINK);
79d80fc1 1660 if (r < 0) {
3539724c
LP
1661 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
1662 * resolved or something similar runs inside and the symlink points there.
68a313c5 1663 *
3539724c 1664 * If the disk image is read-only, there's also no point in complaining.
68a313c5 1665 */
09d423e9 1666 log_full_errno(!IN_SET(RESOLV_CONF_COPY_HOST, RESOLV_CONF_COPY_STATIC) && IN_SET(r, -ELOOP, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
3539724c 1667 "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where);
79d80fc1
TG
1668 return 0;
1669 }
2547bb41 1670
03cfe0d5
LP
1671 r = userns_lchown(where, 0, 0);
1672 if (r < 0)
3539724c 1673 log_warning_errno(r, "Failed to chown /etc/resolv.conf, ignoring: %m");
03cfe0d5 1674
2547bb41
LP
1675 return 0;
1676}
1677
1e4f1671 1678static int setup_boot_id(void) {
cdde6ba6
LP
1679 _cleanup_(unlink_and_freep) char *from = NULL;
1680 _cleanup_free_ char *path = NULL;
3bbaff3e 1681 sd_id128_t rnd = SD_ID128_NULL;
cdde6ba6 1682 const char *to;
04bc4a3f
LP
1683 int r;
1684
04bc4a3f
LP
1685 /* Generate a new randomized boot ID, so that each boot-up of
1686 * the container gets a new one */
1687
cdde6ba6
LP
1688 r = tempfn_random_child(NULL, "proc-sys-kernel-random-boot-id", &path);
1689 if (r < 0)
1690 return log_error_errno(r, "Failed to generate random boot ID path: %m");
04bc4a3f
LP
1691
1692 r = sd_id128_randomize(&rnd);
f647962d
MS
1693 if (r < 0)
1694 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 1695
cdde6ba6 1696 r = id128_write(path, ID128_UUID, rnd, false);
f647962d
MS
1697 if (r < 0)
1698 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f 1699
cdde6ba6
LP
1700 from = TAKE_PTR(path);
1701 to = "/proc/sys/kernel/random/boot_id";
1702
60e76d48 1703 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
cdde6ba6
LP
1704 if (r < 0)
1705 return r;
04bc4a3f 1706
cdde6ba6 1707 return mount_verbose(LOG_ERR, NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
04bc4a3f
LP
1708}
1709
e58a1277 1710static int copy_devnodes(const char *dest) {
88213476
LP
1711 static const char devnodes[] =
1712 "null\0"
1713 "zero\0"
1714 "full\0"
1715 "random\0"
1716 "urandom\0"
85614d66
TG
1717 "tty\0"
1718 "net/tun\0";
88213476
LP
1719
1720 const char *d;
e58a1277 1721 int r = 0;
7fd1b19b 1722 _cleanup_umask_ mode_t u;
a258bf26
LP
1723
1724 assert(dest);
124640f1
LP
1725
1726 u = umask(0000);
88213476 1727
03cfe0d5
LP
1728 /* Create /dev/net, so that we can create /dev/net/tun in it */
1729 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1730 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1731
88213476 1732 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 1733 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 1734 struct stat st;
88213476 1735
7f112f50 1736 from = strappend("/dev/", d);
8967f291
LP
1737 if (!from)
1738 return log_oom();
1739
03cfe0d5 1740 to = prefix_root(dest, from);
8967f291
LP
1741 if (!to)
1742 return log_oom();
88213476
LP
1743
1744 if (stat(from, &st) < 0) {
1745
4a62c710
MS
1746 if (errno != ENOENT)
1747 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 1748
baaa35ad
ZJS
1749 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode))
1750 return log_error_errno(SYNTHETIC_ERRNO(EIO),
1751 "%s is not a char or block device, cannot copy.", from);
1752 else {
8dfce114
LP
1753 _cleanup_free_ char *sl = NULL, *prefixed = NULL, *dn = NULL, *t = NULL;
1754
81f5049b 1755 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
8dbf71ec 1756 /* Explicitly warn the user when /dev is already populated. */
41eb4362 1757 if (errno == EEXIST)
8dbf71ec 1758 log_notice("%s/dev is pre-mounted and pre-populated. If a pre-mounted /dev is provided it needs to be an unpopulated file system.", dest);
81f5049b
AC
1759 if (errno != EPERM)
1760 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1761
8dfce114 1762 /* Some systems abusively restrict mknod but allow bind mounts. */
81f5049b
AC
1763 r = touch(to);
1764 if (r < 0)
1765 return log_error_errno(r, "touch (%s) failed: %m", to);
60e76d48
ZJS
1766 r = mount_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL);
1767 if (r < 0)
1768 return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to);
81f5049b 1769 }
6278cf60 1770
03cfe0d5
LP
1771 r = userns_lchown(to, 0, 0);
1772 if (r < 0)
1773 return log_error_errno(r, "chown() of device node %s failed: %m", to);
8dfce114
LP
1774
1775 dn = strjoin("/dev/", S_ISCHR(st.st_mode) ? "char" : "block");
1776 if (!dn)
1777 return log_oom();
1778
1779 r = userns_mkdir(dest, dn, 0755, 0, 0);
1780 if (r < 0)
1781 return log_error_errno(r, "Failed to create '%s': %m", dn);
1782
1783 if (asprintf(&sl, "%s/%u:%u", dn, major(st.st_rdev), minor(st.st_rdev)) < 0)
1784 return log_oom();
1785
1786 prefixed = prefix_root(dest, sl);
1787 if (!prefixed)
1788 return log_oom();
1789
1790 t = strjoin("../", d);
1791 if (!t)
1792 return log_oom();
1793
1794 if (symlink(t, prefixed) < 0)
1795 log_debug_errno(errno, "Failed to symlink '%s' to '%s': %m", t, prefixed);
88213476 1796 }
88213476
LP
1797 }
1798
e58a1277
LP
1799 return r;
1800}
88213476 1801
03cfe0d5
LP
1802static int setup_pts(const char *dest) {
1803 _cleanup_free_ char *options = NULL;
1804 const char *p;
709f6e46 1805 int r;
03cfe0d5 1806
349cc4a5 1807#if HAVE_SELINUX
03cfe0d5
LP
1808 if (arg_selinux_apifs_context)
1809 (void) asprintf(&options,
3dce8915 1810 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
03cfe0d5
LP
1811 arg_uid_shift + TTY_GID,
1812 arg_selinux_apifs_context);
1813 else
1814#endif
1815 (void) asprintf(&options,
3dce8915 1816 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
03cfe0d5 1817 arg_uid_shift + TTY_GID);
f2d88580 1818
03cfe0d5 1819 if (!options)
f2d88580
LP
1820 return log_oom();
1821
03cfe0d5 1822 /* Mount /dev/pts itself */
cc9fce65 1823 p = prefix_roota(dest, "/dev/pts");
dae8b82e
ZJS
1824 r = mkdir_errno_wrapper(p, 0755);
1825 if (r < 0)
1826 return log_error_errno(r, "Failed to create /dev/pts: %m");
1827
60e76d48
ZJS
1828 r = mount_verbose(LOG_ERR, "devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options);
1829 if (r < 0)
1830 return r;
709f6e46
MS
1831 r = userns_lchown(p, 0, 0);
1832 if (r < 0)
1833 return log_error_errno(r, "Failed to chown /dev/pts: %m");
03cfe0d5
LP
1834
1835 /* Create /dev/ptmx symlink */
1836 p = prefix_roota(dest, "/dev/ptmx");
4a62c710
MS
1837 if (symlink("pts/ptmx", p) < 0)
1838 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
709f6e46
MS
1839 r = userns_lchown(p, 0, 0);
1840 if (r < 0)
1841 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
f2d88580 1842
03cfe0d5
LP
1843 /* And fix /dev/pts/ptmx ownership */
1844 p = prefix_roota(dest, "/dev/pts/ptmx");
709f6e46
MS
1845 r = userns_lchown(p, 0, 0);
1846 if (r < 0)
1847 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
6278cf60 1848
f2d88580
LP
1849 return 0;
1850}
1851
e58a1277 1852static int setup_dev_console(const char *dest, const char *console) {
eb0f0863
LP
1853 _cleanup_umask_ mode_t u;
1854 const char *to;
e58a1277 1855 int r;
e58a1277
LP
1856
1857 assert(dest);
1858 assert(console);
1859
1860 u = umask(0000);
1861
03cfe0d5 1862 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
f647962d
MS
1863 if (r < 0)
1864 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
88213476 1865
a258bf26
LP
1866 /* We need to bind mount the right tty to /dev/console since
1867 * ptys can only exist on pts file systems. To have something
81f5049b 1868 * to bind mount things on we create a empty regular file. */
a258bf26 1869
03cfe0d5 1870 to = prefix_roota(dest, "/dev/console");
81f5049b
AC
1871 r = touch(to);
1872 if (r < 0)
1873 return log_error_errno(r, "touch() for /dev/console failed: %m");
a258bf26 1874
60e76d48 1875 return mount_verbose(LOG_ERR, console, to, NULL, MS_BIND, NULL);
e58a1277
LP
1876}
1877
8e5430c4
LP
1878static int setup_keyring(void) {
1879 key_serial_t keyring;
1880
1881 /* Allocate a new session keyring for the container. This makes sure the keyring of the session systemd-nspawn
1882 * was invoked from doesn't leak into the container. Note that by default we block keyctl() and request_key()
1883 * anyway via seccomp so doing this operation isn't strictly necessary, but in case people explicitly whitelist
1884 * these system calls let's make sure we don't leak anything into the container. */
1885
1886 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
1887 if (keyring == -1) {
1888 if (errno == ENOSYS)
1889 log_debug_errno(errno, "Kernel keyring not supported, ignoring.");
1890 else if (IN_SET(errno, EACCES, EPERM))
1891 log_debug_errno(errno, "Kernel keyring access prohibited, ignoring.");
1892 else
1893 return log_error_errno(errno, "Setting up kernel keyring failed: %m");
1894 }
1895
1896 return 0;
1897}
1898
1e4f1671 1899static int setup_kmsg(int kmsg_socket) {
9ec5a93c
LP
1900 _cleanup_(unlink_and_freep) char *from = NULL;
1901 _cleanup_free_ char *fifo = NULL;
1902 _cleanup_close_ int fd = -1;
7fd1b19b 1903 _cleanup_umask_ mode_t u;
9ec5a93c
LP
1904 const char *to;
1905 int r;
e58a1277 1906
e58a1277 1907 assert(kmsg_socket >= 0);
a258bf26 1908
e58a1277 1909 u = umask(0000);
a258bf26 1910
9ec5a93c
LP
1911 /* We create the kmsg FIFO as as temporary file in /tmp, but immediately delete it after bind mounting it to
1912 * /proc/kmsg. While FIFOs on the reading side behave very similar to /proc/kmsg, their writing side behaves
1913 * differently from /dev/kmsg in that writing blocks when nothing is reading. In order to avoid any problems
1914 * with containers deadlocking due to this we simply make /dev/kmsg unavailable to the container. */
1915
1916 r = tempfn_random_child(NULL, "proc-kmsg", &fifo);
1917 if (r < 0)
1918 return log_error_errno(r, "Failed to generate kmsg path: %m");
e58a1277 1919
9ec5a93c 1920 if (mkfifo(fifo, 0600) < 0)
03cfe0d5 1921 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
9ec5a93c
LP
1922
1923 from = TAKE_PTR(fifo);
1924 to = "/proc/kmsg";
1925
60e76d48
ZJS
1926 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
1927 if (r < 0)
1928 return r;
e58a1277 1929
669fc4e5 1930 fd = open(from, O_RDWR|O_NONBLOCK|O_CLOEXEC);
4a62c710
MS
1931 if (fd < 0)
1932 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 1933
9ec5a93c 1934 /* Store away the fd in the socket, so that it stays open as long as we run the child */
3ee897d6 1935 r = send_one_fd(kmsg_socket, fd, 0);
d9603714
DH
1936 if (r < 0)
1937 return log_error_errno(r, "Failed to send FIFO fd: %m");
a258bf26 1938
25ea79fe 1939 return 0;
88213476
LP
1940}
1941
1c4baffc 1942static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
6d0b55c2
LP
1943 union in_addr_union *exposed = userdata;
1944
1945 assert(rtnl);
1946 assert(m);
1947 assert(exposed);
1948
7a8f6325 1949 expose_port_execute(rtnl, arg_expose_ports, exposed);
6d0b55c2
LP
1950 return 0;
1951}
1952
3a74cea5 1953static int setup_hostname(void) {
c818eef1 1954 int r;
3a74cea5 1955
0c582db0 1956 if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
eb91eb18
LP
1957 return 0;
1958
c818eef1
LP
1959 r = sethostname_idempotent(arg_hostname ?: arg_machine);
1960 if (r < 0)
1961 return log_error_errno(r, "Failed to set hostname: %m");
3a74cea5 1962
7027ff61 1963 return 0;
3a74cea5
LP
1964}
1965
57fb9fb5 1966static int setup_journal(const char *directory) {
0f5e1382 1967 _cleanup_free_ char *d = NULL;
b2238e38
LP
1968 const char *dirname, *p, *q;
1969 sd_id128_t this_id;
1970 char id[33];
8054d749 1971 bool try;
57fb9fb5
LP
1972 int r;
1973
df9a75e4
LP
1974 /* Don't link journals in ephemeral mode */
1975 if (arg_ephemeral)
1976 return 0;
1977
8054d749
LP
1978 if (arg_link_journal == LINK_NO)
1979 return 0;
1980
1981 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
1982
4d680aee 1983 r = sd_id128_get_machine(&this_id);
f647962d
MS
1984 if (r < 0)
1985 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee 1986
e01ff70a 1987 if (sd_id128_equal(arg_uuid, this_id)) {
8054d749 1988 log_full(try ? LOG_WARNING : LOG_ERR,
e192a281 1989 "Host and machine ids are equal (%s): refusing to link journals", sd_id128_to_string(arg_uuid, id));
8054d749 1990 if (try)
4d680aee 1991 return 0;
df9a75e4 1992 return -EEXIST;
4d680aee
ZJS
1993 }
1994
369ca6da
ZJS
1995 FOREACH_STRING(dirname, "/var", "/var/log", "/var/log/journal") {
1996 r = userns_mkdir(directory, dirname, 0755, 0, 0);
1997 if (r < 0) {
1998 bool ignore = r == -EROFS && try;
1999 log_full_errno(ignore ? LOG_DEBUG : LOG_ERR, r,
2000 "Failed to create %s%s: %m", dirname, ignore ? ", ignoring" : "");
2001 return ignore ? 0 : r;
2002 }
2003 }
03cfe0d5 2004
e01ff70a
MS
2005 (void) sd_id128_to_string(arg_uuid, id);
2006
03cfe0d5
LP
2007 p = strjoina("/var/log/journal/", id);
2008 q = prefix_roota(directory, p);
27407a01 2009
e1873695 2010 if (path_is_mount_point(p, NULL, 0) > 0) {
8054d749
LP
2011 if (try)
2012 return 0;
27407a01 2013
baaa35ad
ZJS
2014 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2015 "%s: already a mount point, refusing to use for journal", p);
57fb9fb5
LP
2016 }
2017
e1873695 2018 if (path_is_mount_point(q, NULL, 0) > 0) {
8054d749
LP
2019 if (try)
2020 return 0;
57fb9fb5 2021
baaa35ad
ZJS
2022 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2023 "%s: already a mount point, refusing to use for journal", q);
57fb9fb5
LP
2024 }
2025
2026 r = readlink_and_make_absolute(p, &d);
2027 if (r >= 0) {
3742095b 2028 if (IN_SET(arg_link_journal, LINK_GUEST, LINK_AUTO) &&
57fb9fb5
LP
2029 path_equal(d, q)) {
2030
03cfe0d5 2031 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 2032 if (r < 0)
709f6e46 2033 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 2034 return 0;
57fb9fb5
LP
2035 }
2036
4a62c710
MS
2037 if (unlink(p) < 0)
2038 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
2039 } else if (r == -EINVAL) {
2040
2041 if (arg_link_journal == LINK_GUEST &&
2042 rmdir(p) < 0) {
2043
27407a01
ZJS
2044 if (errno == ENOTDIR) {
2045 log_error("%s already exists and is neither a symlink nor a directory", p);
2046 return r;
4314d33f
MS
2047 } else
2048 return log_error_errno(errno, "Failed to remove %s: %m", p);
57fb9fb5 2049 }
4314d33f
MS
2050 } else if (r != -ENOENT)
2051 return log_error_errno(r, "readlink(%s) failed: %m", p);
57fb9fb5
LP
2052
2053 if (arg_link_journal == LINK_GUEST) {
2054
2055 if (symlink(q, p) < 0) {
8054d749 2056 if (try) {
56f64d95 2057 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90 2058 return 0;
4314d33f
MS
2059 } else
2060 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
57fb9fb5
LP
2061 }
2062
03cfe0d5 2063 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 2064 if (r < 0)
709f6e46 2065 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 2066 return 0;
57fb9fb5
LP
2067 }
2068
2069 if (arg_link_journal == LINK_HOST) {
ccddd104 2070 /* don't create parents here — if the host doesn't have
574edc90 2071 * permanent journal set up, don't force it here */
ba8e6c4d 2072
dae8b82e
ZJS
2073 r = mkdir_errno_wrapper(p, 0755);
2074 if (r < 0 && r != -EEXIST) {
8054d749 2075 if (try) {
dae8b82e 2076 log_debug_errno(r, "Failed to create %s, skipping journal setup: %m", p);
574edc90 2077 return 0;
4314d33f 2078 } else
dae8b82e 2079 return log_error_errno(r, "Failed to create %s: %m", p);
57fb9fb5
LP
2080 }
2081
27407a01
ZJS
2082 } else if (access(p, F_OK) < 0)
2083 return 0;
57fb9fb5 2084
cdb2b9d0
LP
2085 if (dir_is_empty(q) == 0)
2086 log_warning("%s is not empty, proceeding anyway.", q);
2087
03cfe0d5 2088 r = userns_mkdir(directory, p, 0755, 0, 0);
709f6e46
MS
2089 if (r < 0)
2090 return log_error_errno(r, "Failed to create %s: %m", q);
57fb9fb5 2091
60e76d48
ZJS
2092 r = mount_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL);
2093 if (r < 0)
4a62c710 2094 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 2095
27407a01 2096 return 0;
57fb9fb5
LP
2097}
2098
88213476 2099static int drop_capabilities(void) {
520e0d54 2100 return capability_bounding_set_drop(arg_caps_retain, false);
88213476
LP
2101}
2102
db999e0f
LP
2103static int reset_audit_loginuid(void) {
2104 _cleanup_free_ char *p = NULL;
2105 int r;
2106
0c582db0 2107 if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
db999e0f
LP
2108 return 0;
2109
2110 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 2111 if (r == -ENOENT)
db999e0f 2112 return 0;
f647962d
MS
2113 if (r < 0)
2114 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
2115
2116 /* Already reset? */
2117 if (streq(p, "4294967295"))
2118 return 0;
2119
57512c89 2120 r = write_string_file("/proc/self/loginuid", "4294967295", WRITE_STRING_FILE_DISABLE_BUFFER);
db999e0f 2121 if (r < 0) {
10a87006
LP
2122 log_error_errno(r,
2123 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2124 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2125 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2126 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2127 "using systemd-nspawn. Sleeping for 5s... (%m)");
77b6e194 2128
db999e0f 2129 sleep(5);
77b6e194 2130 }
db999e0f
LP
2131
2132 return 0;
77b6e194
LP
2133}
2134
785890ac
LP
2135static int setup_propagate(const char *root) {
2136 const char *p, *q;
709f6e46 2137 int r;
785890ac
LP
2138
2139 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2140 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
63c372cb 2141 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
2142 (void) mkdir_p(p, 0600);
2143
709f6e46
MS
2144 r = userns_mkdir(root, "/run/systemd", 0755, 0, 0);
2145 if (r < 0)
2146 return log_error_errno(r, "Failed to create /run/systemd: %m");
03cfe0d5 2147
709f6e46
MS
2148 r = userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0);
2149 if (r < 0)
2150 return log_error_errno(r, "Failed to create /run/systemd/nspawn: %m");
03cfe0d5 2151
709f6e46
MS
2152 r = userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0);
2153 if (r < 0)
2154 return log_error_errno(r, "Failed to create /run/systemd/nspawn/incoming: %m");
785890ac 2155
03cfe0d5 2156 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
60e76d48
ZJS
2157 r = mount_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
2158 if (r < 0)
2159 return r;
785890ac 2160
60e76d48
ZJS
2161 r = mount_verbose(LOG_ERR, NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
2162 if (r < 0)
2163 return r;
785890ac 2164
19caffac
AC
2165 /* machined will MS_MOVE into that directory, and that's only
2166 * supported for non-shared mounts. */
60e76d48 2167 return mount_verbose(LOG_ERR, NULL, q, NULL, MS_SLAVE, NULL);
785890ac
LP
2168}
2169
317feb4d 2170static int setup_machine_id(const char *directory) {
691675ba
LP
2171 const char *etc_machine_id;
2172 sd_id128_t id;
3bbaff3e 2173 int r;
e01ff70a 2174
317feb4d
LP
2175 /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
2176 * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
2177 * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
2178 * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
2179 * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
2180 * container behaves nicely). */
2181
e01ff70a
MS
2182 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
2183
691675ba 2184 r = id128_read(etc_machine_id, ID128_PLAIN, &id);
317feb4d
LP
2185 if (r < 0) {
2186 if (!IN_SET(r, -ENOENT, -ENOMEDIUM)) /* If the file is missing or empty, we don't mind */
2187 return log_error_errno(r, "Failed to read machine ID from container image: %m");
691675ba 2188
317feb4d
LP
2189 if (sd_id128_is_null(arg_uuid)) {
2190 r = sd_id128_randomize(&arg_uuid);
2191 if (r < 0)
2192 return log_error_errno(r, "Failed to acquire randomized machine UUID: %m");
2193 }
2194 } else {
baaa35ad
ZJS
2195 if (sd_id128_is_null(id))
2196 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2197 "Machine ID in container image is zero, refusing.");
e01ff70a 2198
317feb4d
LP
2199 arg_uuid = id;
2200 }
691675ba 2201
e01ff70a
MS
2202 return 0;
2203}
2204
7336138e
LP
2205static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
2206 int r;
2207
2208 assert(directory);
2209
0de7acce 2210 if (arg_userns_mode == USER_NAMESPACE_NO || !arg_userns_chown)
7336138e
LP
2211 return 0;
2212
2213 r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
2214 if (r == -EOPNOTSUPP)
2215 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
2216 if (r == -EBADE)
2217 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
2218 if (r < 0)
2219 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
2220 if (r == 0)
2221 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
2222 else
2223 log_debug("Patched directory tree to match UID/GID range.");
2224
2225 return r;
2226}
2227
113cea80 2228/*
6d416b9c
LS
2229 * Return values:
2230 * < 0 : wait_for_terminate() failed to get the state of the
2231 * container, the container was terminated by a signal, or
2232 * failed for an unknown reason. No change is made to the
2233 * container argument.
2234 * > 0 : The program executed in the container terminated with an
2235 * error. The exit code of the program executed in the
919699ec
LP
2236 * container is returned. The container argument has been set
2237 * to CONTAINER_TERMINATED.
6d416b9c
LS
2238 * 0 : The container is being rebooted, has been shut down or exited
2239 * successfully. The container argument has been set to either
2240 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 2241 *
6d416b9c
LS
2242 * That is, success is indicated by a return value of zero, and an
2243 * error is indicated by a non-zero value.
113cea80
DH
2244 */
2245static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 2246 siginfo_t status;
919699ec 2247 int r;
113cea80
DH
2248
2249 r = wait_for_terminate(pid, &status);
f647962d
MS
2250 if (r < 0)
2251 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
2252
2253 switch (status.si_code) {
fddbb89c 2254
113cea80 2255 case CLD_EXITED:
b5a2179b 2256 if (status.si_status == 0)
919699ec 2257 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
b5a2179b 2258 else
919699ec 2259 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 2260
919699ec
LP
2261 *container = CONTAINER_TERMINATED;
2262 return status.si_status;
113cea80
DH
2263
2264 case CLD_KILLED:
2265 if (status.si_status == SIGINT) {
919699ec 2266 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 2267 *container = CONTAINER_TERMINATED;
919699ec
LP
2268 return 0;
2269
113cea80 2270 } else if (status.si_status == SIGHUP) {
919699ec 2271 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 2272 *container = CONTAINER_REBOOTED;
919699ec 2273 return 0;
113cea80 2274 }
919699ec 2275
4831981d 2276 _fallthrough_;
113cea80 2277 case CLD_DUMPED:
baaa35ad
ZJS
2278 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2279 "Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
113cea80
DH
2280
2281 default:
baaa35ad
ZJS
2282 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2283 "Container %s failed due to unknown reason.", arg_machine);
113cea80 2284 }
113cea80
DH
2285}
2286
023fb90b
LP
2287static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2288 pid_t pid;
2289
4a0b58c4 2290 pid = PTR_TO_PID(userdata);
023fb90b 2291 if (pid > 0) {
c6c8f6e2 2292 if (kill(pid, arg_kill_signal) >= 0) {
023fb90b
LP
2293 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2294 sd_event_source_set_userdata(s, NULL);
2295 return 0;
2296 }
2297 }
2298
2299 sd_event_exit(sd_event_source_get_event(s), 0);
2300 return 0;
2301}
2302
6916b164 2303static int on_sigchld(sd_event_source *s, const struct signalfd_siginfo *ssi, void *userdata) {
abdb9b08
LP
2304 pid_t pid;
2305
2306 assert(s);
2307 assert(ssi);
2308
2309 pid = PTR_TO_PID(userdata);
2310
6916b164
AU
2311 for (;;) {
2312 siginfo_t si = {};
abdb9b08 2313
6916b164
AU
2314 if (waitid(P_ALL, 0, &si, WNOHANG|WNOWAIT|WEXITED) < 0)
2315 return log_error_errno(errno, "Failed to waitid(): %m");
2316 if (si.si_pid == 0) /* No pending children. */
2317 break;
abdb9b08 2318 if (si.si_pid == pid) {
6916b164
AU
2319 /* The main process we care for has exited. Return from
2320 * signal handler but leave the zombie. */
2321 sd_event_exit(sd_event_source_get_event(s), 0);
2322 break;
2323 }
abdb9b08 2324
6916b164
AU
2325 /* Reap all other children. */
2326 (void) waitid(P_PID, si.si_pid, &si, WNOHANG|WEXITED);
2327 }
2328
2329 return 0;
2330}
2331
abdb9b08
LP
2332static int on_request_stop(sd_bus_message *m, void *userdata, sd_bus_error *error) {
2333 pid_t pid;
2334
2335 assert(m);
2336
2337 pid = PTR_TO_PID(userdata);
2338
2339 if (arg_kill_signal > 0) {
2340 log_info("Container termination requested. Attempting to halt container.");
2341 (void) kill(pid, arg_kill_signal);
2342 } else {
2343 log_info("Container termination requested. Exiting.");
2344 sd_event_exit(sd_bus_get_event(sd_bus_message_get_bus(m)), 0);
2345 }
2346
2347 return 0;
2348}
2349
ec16945e 2350static int determine_names(void) {
1b9cebf6 2351 int r;
ec16945e 2352
c1521918
LP
2353 if (arg_template && !arg_directory && arg_machine) {
2354
2355 /* If --template= was specified then we should not
2356 * search for a machine, but instead create a new one
2357 * in /var/lib/machine. */
2358
605405c6 2359 arg_directory = strjoin("/var/lib/machines/", arg_machine);
c1521918
LP
2360 if (!arg_directory)
2361 return log_oom();
2362 }
2363
ec16945e 2364 if (!arg_image && !arg_directory) {
1b9cebf6
LP
2365 if (arg_machine) {
2366 _cleanup_(image_unrefp) Image *i = NULL;
2367
5ef46e5f 2368 r = image_find(IMAGE_MACHINE, arg_machine, &i);
3a6ce860
LP
2369 if (r == -ENOENT)
2370 return log_error_errno(r, "No image for machine '%s'.", arg_machine);
1b9cebf6
LP
2371 if (r < 0)
2372 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
1b9cebf6 2373
eb38edce 2374 if (IN_SET(i->type, IMAGE_RAW, IMAGE_BLOCK))
0f03c2a4 2375 r = free_and_strdup(&arg_image, i->path);
1b9cebf6 2376 else
0f03c2a4 2377 r = free_and_strdup(&arg_directory, i->path);
1b9cebf6 2378 if (r < 0)
0f3be6ca 2379 return log_oom();
1b9cebf6 2380
aee327b8
LP
2381 if (!arg_ephemeral)
2382 arg_read_only = arg_read_only || i->read_only;
d7249575
LP
2383 } else {
2384 r = safe_getcwd(&arg_directory);
2385 if (r < 0)
2386 return log_error_errno(r, "Failed to determine current directory: %m");
2387 }
ec16945e 2388
0f3be6ca 2389 if (!arg_directory && !arg_image) {
1b9cebf6 2390 log_error("Failed to determine path, please use -D or -i.");
ec16945e
LP
2391 return -EINVAL;
2392 }
2393 }
2394
2395 if (!arg_machine) {
b9ba4dab
LP
2396 if (arg_directory && path_equal(arg_directory, "/"))
2397 arg_machine = gethostname_malloc();
4827ab48
LP
2398 else {
2399 if (arg_image) {
2400 char *e;
2401
2402 arg_machine = strdup(basename(arg_image));
2403
2404 /* Truncate suffix if there is one */
2405 e = endswith(arg_machine, ".raw");
2406 if (e)
2407 *e = 0;
2408 } else
2409 arg_machine = strdup(basename(arg_directory));
2410 }
ec16945e
LP
2411 if (!arg_machine)
2412 return log_oom();
2413
ae691c1d 2414 hostname_cleanup(arg_machine);
ec16945e
LP
2415 if (!machine_name_is_valid(arg_machine)) {
2416 log_error("Failed to determine machine name automatically, please use -M.");
2417 return -EINVAL;
2418 }
b9ba4dab
LP
2419
2420 if (arg_ephemeral) {
2421 char *b;
2422
2423 /* Add a random suffix when this is an
2424 * ephemeral machine, so that we can run many
2425 * instances at once without manually having
2426 * to specify -M each time. */
2427
2428 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2429 return log_oom();
2430
2431 free(arg_machine);
2432 arg_machine = b;
2433 }
ec16945e
LP
2434 }
2435
2436 return 0;
2437}
2438
/* Resolves the path stored in *p with chase_symlinks() (using 'flags') and replaces *p with the resolved
 * path; the old string is freed. A NULL *p is left alone.
 *
 * Returns 0 if *p is NULL, a negative errno-style value on failure, and otherwise whatever
 * chase_symlinks() returned. */
static int chase_symlinks_and_update(char **p, unsigned flags) {
        char *resolved;
        int r;

        assert(p);

        if (!*p)
                return 0;

        r = chase_symlinks(*p, NULL, flags, &resolved);
        if (r < 0)
                return log_error_errno(r, "Failed to resolve path %s: %m", *p);

        /* Swap in the resolved path, releasing the original one */
        free(*p);
        *p = TAKE_PTR(resolved);

        return r; /* r might be an fd here in case we ever use CHASE_OPEN in flags */
}
2455
03cfe0d5 2456static int determine_uid_shift(const char *directory) {
6dac160c
LP
2457 int r;
2458
0de7acce 2459 if (arg_userns_mode == USER_NAMESPACE_NO) {
03cfe0d5 2460 arg_uid_shift = 0;
6dac160c 2461 return 0;
03cfe0d5 2462 }
6dac160c
LP
2463
2464 if (arg_uid_shift == UID_INVALID) {
2465 struct stat st;
2466
03cfe0d5 2467 r = stat(directory, &st);
6dac160c 2468 if (r < 0)
03cfe0d5 2469 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
6dac160c
LP
2470
2471 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2472
baaa35ad
ZJS
2473 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000)))
2474 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2475 "UID and GID base of %s don't match.", directory);
6dac160c
LP
2476
2477 arg_uid_range = UINT32_C(0x10000);
2478 }
2479
baaa35ad
ZJS
2480 if (arg_uid_shift > (uid_t) -1 - arg_uid_range)
2481 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2482 "UID base too high for UID range.");
6dac160c 2483
6dac160c
LP
2484 return 0;
2485}
2486
03cfe0d5
LP
2487static int inner_child(
2488 Barrier *barrier,
2489 const char *directory,
2490 bool secondary,
2491 int kmsg_socket,
2492 int rtnl_socket,
f757855e 2493 FDSet *fds) {
69c79d3c 2494
03cfe0d5 2495 _cleanup_free_ char *home = NULL;
e01ff70a 2496 char as_uuid[37];
88614c8a 2497 size_t n_env = 1;
03cfe0d5 2498 const char *envp[] = {
0c300adf 2499 "PATH=" DEFAULT_PATH_COMPAT,
6aadfa4c 2500 NULL, /* container */
03cfe0d5
LP
2501 NULL, /* TERM */
2502 NULL, /* HOME */
2503 NULL, /* USER */
2504 NULL, /* LOGNAME */
2505 NULL, /* container_uuid */
2506 NULL, /* LISTEN_FDS */
2507 NULL, /* LISTEN_PID */
9c1e04d0 2508 NULL, /* NOTIFY_SOCKET */
03cfe0d5
LP
2509 NULL
2510 };
1a68e1e5 2511 const char *exec_target;
2371271c 2512 _cleanup_strv_free_ char **env_use = NULL;
03cfe0d5 2513 int r;
88213476 2514
b37469d7
LP
2515 /* This is the "inner" child process, i.e. the one forked off by the "outer" child process, which is the one
2516 * the container manager itself forked off. At the time of clone() it gained its own CLONE_NEWNS, CLONE_NEWPID,
2517 * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER namespaces. Note that it has its own CLONE_NEWNS namespace,
2518 * separate from the CLONE_NEWNS created for the "outer" child, and also separate from the host's CLONE_NEWNS
2519 * namespace. The reason for having two levels of CLONE_NEWNS namespaces is that the "inner" one is owned by
2520 * the CLONE_NEWUSER namespace of the container, while the "outer" one is owned by the host's CLONE_NEWUSER
2521 * namespace.
2522 *
2523 * Note at this point we have no CLONE_NEWNET namespace yet. We'll acquire that one later through
2524 * unshare(). See below. */
2525
03cfe0d5
LP
2526 assert(barrier);
2527 assert(directory);
2528 assert(kmsg_socket >= 0);
88213476 2529
0de7acce 2530 if (arg_userns_mode != USER_NAMESPACE_NO) {
03cfe0d5
LP
2531 /* Tell the parent, that it now can write the UID map. */
2532 (void) barrier_place(barrier); /* #1 */
7027ff61 2533
03cfe0d5 2534 /* Wait until the parent wrote the UID map */
baaa35ad
ZJS
2535 if (!barrier_place_and_sync(barrier)) /* #2 */
2536 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
2537 "Parent died too early");
88213476
LP
2538 }
2539
6d66bd3b
EV
2540 r = reset_uid_gid();
2541 if (r < 0)
2542 return log_error_errno(r, "Couldn't become new root: %m");
2543
0de7acce 2544 r = mount_all(NULL,
4f086aab 2545 arg_mount_settings | MOUNT_IN_USERNS,
0de7acce 2546 arg_uid_shift,
0de7acce 2547 arg_selinux_apifs_context);
03cfe0d5
LP
2548 if (r < 0)
2549 return r;
2550
04413780
ZJS
2551 if (!arg_network_namespace_path && arg_private_network) {
2552 r = unshare(CLONE_NEWNET);
2553 if (r < 0)
2554 return log_error_errno(errno, "Failed to unshare network namespace: %m");
75116558
PS
2555
2556 /* Tell the parent that it can setup network interfaces. */
2557 (void) barrier_place(barrier); /* #3 */
04413780
ZJS
2558 }
2559
4f086aab 2560 r = mount_sysfs(NULL, arg_mount_settings);
d8fc6a00
LP
2561 if (r < 0)
2562 return r;
2563
03cfe0d5
LP
2564 /* Wait until we are cgroup-ified, so that we
2565 * can mount the right cgroup path writable */
baaa35ad
ZJS
2566 if (!barrier_place_and_sync(barrier)) /* #4 */
2567 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
2568 "Parent died too early");
88213476 2569
489fae52 2570 if (arg_use_cgns) {
0996ef00
CB
2571 r = unshare(CLONE_NEWCGROUP);
2572 if (r < 0)
04413780 2573 return log_error_errno(errno, "Failed to unshare cgroup namespace: %m");
0996ef00
CB
2574 r = mount_cgroups(
2575 "",
2576 arg_unified_cgroup_hierarchy,
2577 arg_userns_mode != USER_NAMESPACE_NO,
2578 arg_uid_shift,
2579 arg_uid_range,
5a8ff0e6 2580 arg_selinux_apifs_context,
ada54120 2581 true);
0996ef00
CB
2582 if (r < 0)
2583 return r;
2584 } else {
2585 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
2586 if (r < 0)
2587 return r;
2588 }
ec16945e 2589
1e4f1671 2590 r = setup_boot_id();
03cfe0d5
LP
2591 if (r < 0)
2592 return r;
ec16945e 2593
1e4f1671 2594 r = setup_kmsg(kmsg_socket);
03cfe0d5
LP
2595 if (r < 0)
2596 return r;
2597 kmsg_socket = safe_close(kmsg_socket);
ec16945e 2598
03cfe0d5
LP
2599 if (setsid() < 0)
2600 return log_error_errno(errno, "setsid() failed: %m");
2601
2602 if (arg_private_network)
2603 loopback_setup();
2604
7a8f6325
LP
2605 if (arg_expose_ports) {
2606 r = expose_port_send_rtnl(rtnl_socket);
2607 if (r < 0)
2608 return r;
2609 rtnl_socket = safe_close(rtnl_socket);
2610 }
03cfe0d5 2611
81f345df
LP
2612 if (arg_oom_score_adjust_set) {
2613 r = set_oom_score_adjust(arg_oom_score_adjust);
2614 if (r < 0)
2615 return log_error_errno(r, "Failed to adjust OOM score: %m");
2616 }
2617
d107bb7d
LP
2618 if (arg_cpuset)
2619 if (sched_setaffinity(0, CPU_ALLOC_SIZE(arg_cpuset_ncpus), arg_cpuset) < 0)
2620 return log_error_errno(errno, "Failed to set CPU affinity: %m");
2621
709f6e46
MS
2622 r = drop_capabilities();
2623 if (r < 0)
2624 return log_error_errno(r, "drop_capabilities() failed: %m");
03cfe0d5 2625
c818eef1 2626 (void) setup_hostname();
03cfe0d5 2627
050f7277 2628 if (arg_personality != PERSONALITY_INVALID) {
21022b9d
LP
2629 r = safe_personality(arg_personality);
2630 if (r < 0)
2631 return log_error_errno(r, "personality() failed: %m");
03cfe0d5 2632 } else if (secondary) {
21022b9d
LP
2633 r = safe_personality(PER_LINUX32);
2634 if (r < 0)
2635 return log_error_errno(r, "personality() failed: %m");
03cfe0d5
LP
2636 }
2637
349cc4a5 2638#if HAVE_SELINUX
03cfe0d5 2639 if (arg_selinux_context)
2ed96880 2640 if (setexeccon(arg_selinux_context) < 0)
03cfe0d5
LP
2641 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
2642#endif
2643
ee645080 2644 r = change_uid_gid(arg_user, &home);
03cfe0d5
LP
2645 if (r < 0)
2646 return r;
2647
66edd963
LP
2648 if (arg_no_new_privileges)
2649 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
2650 return log_error_errno(errno, "Failed to disable new privileges: %m");
2651
6aadfa4c
ILG
2652 /* LXC sets container=lxc, so follow the scheme here */
2653 envp[n_env++] = strjoina("container=", arg_container_service_name);
2654
03cfe0d5
LP
2655 envp[n_env] = strv_find_prefix(environ, "TERM=");
2656 if (envp[n_env])
313cefa1 2657 n_env++;
03cfe0d5
LP
2658
2659 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2660 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2661 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
2662 return log_oom();
2663
3bbaff3e 2664 assert(!sd_id128_is_null(arg_uuid));
03cfe0d5 2665
691675ba 2666 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_to_uuid_string(arg_uuid, as_uuid)) < 0)
e01ff70a 2667 return log_oom();
03cfe0d5
LP
2668
2669 if (fdset_size(fds) > 0) {
2670 r = fdset_cloexec(fds, false);
2671 if (r < 0)
2672 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
2673
2674 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
2675 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
2676 return log_oom();
2677 }
9c1e04d0
AP
2678 if (asprintf((char **)(envp + n_env++), "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
2679 return log_oom();
03cfe0d5 2680
2371271c
TG
2681 env_use = strv_env_merge(2, envp, arg_setenv);
2682 if (!env_use)
2683 return log_oom();
03cfe0d5
LP
2684
2685 /* Let the parent know that we are ready and
2686 * wait until the parent is ready with the
2687 * setup, too... */
baaa35ad
ZJS
2688 if (!barrier_place_and_sync(barrier)) /* #5 */
2689 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
2690 "Parent died too early");
03cfe0d5 2691
5f932eb9
LP
2692 if (arg_chdir)
2693 if (chdir(arg_chdir) < 0)
2694 return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
2695
7732f92b 2696 if (arg_start_mode == START_PID2) {
75bf701f 2697 r = stub_pid1(arg_uuid);
7732f92b
LP
2698 if (r < 0)
2699 return r;
2700 }
2701
8ca082b4
LP
2702 /* Now, explicitly close the log, so that we then can close all remaining fds. Closing the log explicitly first
2703 * has the benefit that the logging subsystem knows about it, and is thus ready to be reopened should we need
2704 * it again. Note that the other fds closed here are at least the locking and barrier fds. */
03cfe0d5 2705 log_close();
8ca082b4
LP
2706 log_set_open_when_needed(true);
2707
03cfe0d5
LP
2708 (void) fdset_close_others(fds);
2709
7732f92b 2710 if (arg_start_mode == START_BOOT) {
03cfe0d5
LP
2711 char **a;
2712 size_t m;
2713
2714 /* Automatically search for the init system */
2715
75f32f04
ZJS
2716 m = strv_length(arg_parameters);
2717 a = newa(char*, m + 2);
2718 memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
2719 a[1 + m] = NULL;
03cfe0d5 2720
ced58da7 2721 a[0] = (char*) "/usr/lib/systemd/systemd";
03cfe0d5
LP
2722 execve(a[0], a, env_use);
2723
ced58da7 2724 a[0] = (char*) "/lib/systemd/systemd";
03cfe0d5
LP
2725 execve(a[0], a, env_use);
2726
ced58da7 2727 a[0] = (char*) "/sbin/init";
03cfe0d5 2728 execve(a[0], a, env_use);
ced58da7
LP
2729
2730 exec_target = "/usr/lib/systemd/systemd, /lib/systemd/systemd, /sbin/init";
1a68e1e5 2731 } else if (!strv_isempty(arg_parameters)) {
b6b180b7
LP
2732 const char *dollar_path;
2733
1a68e1e5 2734 exec_target = arg_parameters[0];
b6b180b7
LP
2735
2736 /* Use the user supplied search $PATH if there is one, or DEFAULT_PATH_COMPAT if not to search the
2737 * binary. */
2738 dollar_path = strv_env_get(env_use, "PATH");
2739 if (dollar_path) {
2740 if (putenv((char*) dollar_path) != 0)
2741 return log_error_errno(errno, "Failed to update $PATH: %m");
2742 }
2743
f757855e 2744 execvpe(arg_parameters[0], arg_parameters, env_use);
1a68e1e5 2745 } else {
5f932eb9 2746 if (!arg_chdir)
d929b0f9
ZJS
2747 /* If we cannot change the directory, we'll end up in /, that is expected. */
2748 (void) chdir(home ?: "/root");
5f932eb9 2749
03cfe0d5
LP
2750 execle("/bin/bash", "-bash", NULL, env_use);
2751 execle("/bin/sh", "-sh", NULL, env_use);
ced58da7
LP
2752
2753 exec_target = "/bin/bash, /bin/sh";
03cfe0d5
LP
2754 }
2755
8ca082b4 2756 return log_error_errno(errno, "execv(%s) failed: %m", exec_target);
03cfe0d5
LP
2757}
2758
9c1e04d0 2759static int setup_sd_notify_child(void) {
271f518f 2760 _cleanup_close_ int fd = -1;
9c1e04d0 2761 union sockaddr_union sa = {
44ed5214
LP
2762 .un.sun_family = AF_UNIX,
2763 .un.sun_path = NSPAWN_NOTIFY_SOCKET_PATH,
9c1e04d0
AP
2764 };
2765 int r;
2766
2767 fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
2768 if (fd < 0)
2769 return log_error_errno(errno, "Failed to allocate notification socket: %m");
2770
2771 (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
fbda85b0 2772 (void) sockaddr_un_unlink(&sa.un);
9c1e04d0 2773
9c1e04d0 2774 r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
271f518f 2775 if (r < 0)
44ed5214 2776 return log_error_errno(errno, "bind(" NSPAWN_NOTIFY_SOCKET_PATH ") failed: %m");
9c1e04d0 2777
adc7d9f0 2778 r = userns_lchown(NSPAWN_NOTIFY_SOCKET_PATH, 0, 0);
271f518f 2779 if (r < 0)
adc7d9f0 2780 return log_error_errno(r, "Failed to chown " NSPAWN_NOTIFY_SOCKET_PATH ": %m");
adc7d9f0 2781
2ff48e98 2782 r = setsockopt_int(fd, SOL_SOCKET, SO_PASSCRED, true);
271f518f 2783 if (r < 0)
2ff48e98 2784 return log_error_errno(r, "SO_PASSCRED failed: %m");
9c1e04d0 2785
271f518f 2786 return TAKE_FD(fd);
9c1e04d0
AP
2787}
2788
03cfe0d5
LP
2789static int outer_child(
2790 Barrier *barrier,
2791 const char *directory,
2792 const char *console,
2d845785 2793 DissectedImage *dissected_image,
03cfe0d5
LP
2794 bool interactive,
2795 bool secondary,
2796 int pid_socket,
e01ff70a 2797 int uuid_socket,
9c1e04d0 2798 int notify_socket,
03cfe0d5
LP
2799 int kmsg_socket,
2800 int rtnl_socket,
825d5287 2801 int uid_shift_socket,
8199d554 2802 int unified_cgroup_hierarchy_socket,
d7bea6b6
DP
2803 FDSet *fds,
2804 int netns_fd) {
03cfe0d5 2805
bf428efb
LP
2806 _cleanup_close_ int fd = -1;
2807 int r, which_failed;
03cfe0d5
LP
2808 pid_t pid;
2809 ssize_t l;
03cfe0d5 2810
b37469d7
LP
2811 /* This is the "outer" child process, i.e the one forked off by the container manager itself. It already has
2812 * its own CLONE_NEWNS namespace (which was created by the clone()). It still lives in the host's CLONE_NEWPID,
2813 * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER and CLONE_NEWNET namespaces. After it completed a number of
2814 * initializations a second child (the "inner" one) is forked off it, and it exits. */
2815
03cfe0d5
LP
2816 assert(barrier);
2817 assert(directory);
2818 assert(console);
2819 assert(pid_socket >= 0);
e01ff70a 2820 assert(uuid_socket >= 0);
9c1e04d0 2821 assert(notify_socket >= 0);
03cfe0d5
LP
2822 assert(kmsg_socket >= 0);
2823
2824 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
2825 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
2826
2827 if (interactive) {
2b33ab09 2828 int terminal;
03cfe0d5 2829
2b33ab09
LP
2830 terminal = open_terminal(console, O_RDWR);
2831 if (terminal < 0)
2832 return log_error_errno(terminal, "Failed to open console: %m");
03cfe0d5 2833
17cac366
LP
2834 /* Make sure we can continue logging to the original stderr, even if stderr points elsewhere now */
2835 r = log_dup_console();
2836 if (r < 0)
2837 return log_error_errno(r, "Failed to duplicate stderr: %m");
2838
2b33ab09
LP
2839 r = rearrange_stdio(terminal, terminal, terminal); /* invalidates 'terminal' on success and failure */
2840 if (r < 0)
2841 return log_error_errno(r, "Failed to move console to stdin/stdout/stderr: %m");
03cfe0d5
LP
2842 }
2843
2844 r = reset_audit_loginuid();
2845 if (r < 0)
2846 return r;
2847
2848 /* Mark everything as slave, so that we still
2849 * receive mounts from the real root, but don't
2850 * propagate mounts to the real root. */
60e76d48
ZJS
2851 r = mount_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
2852 if (r < 0)
2853 return r;
03cfe0d5 2854
2d845785 2855 if (dissected_image) {
2d3a5a73
LP
2856 /* If we are operating on a disk image, then mount its root directory now, but leave out the rest. We
2857 * can read the UID shift from it if we need to. Further down we'll mount the rest, but then with the
2858 * uid shift known. That way we can mount VFAT file systems shifted to the right place right away. This
2859 * makes sure ESP partitions and userns are compatible. */
2860
2861 r = dissected_image_mount(dissected_image, directory, arg_uid_shift,
03bcb6d4
LP
2862 DISSECT_IMAGE_MOUNT_ROOT_ONLY|DISSECT_IMAGE_DISCARD_ON_LOOP|
2863 (arg_read_only ? DISSECT_IMAGE_READ_ONLY : 0)|
2864 (arg_start_mode == START_BOOT ? DISSECT_IMAGE_VALIDATE_OS : 0));
2d845785
LP
2865 if (r < 0)
2866 return r;
2867 }
03cfe0d5 2868
391567f4
LP
2869 r = determine_uid_shift(directory);
2870 if (r < 0)
2871 return r;
2872
0de7acce 2873 if (arg_userns_mode != USER_NAMESPACE_NO) {
0e7ac751 2874 /* Let the parent know which UID shift we read from the image */
825d5287
RM
2875 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
2876 if (l < 0)
2877 return log_error_errno(errno, "Failed to send UID shift: %m");
baaa35ad
ZJS
2878 if (l != sizeof(arg_uid_shift))
2879 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2880 "Short write while sending UID shift.");
0e7ac751 2881
0de7acce 2882 if (arg_userns_mode == USER_NAMESPACE_PICK) {
0e7ac751
LP
2883 /* When we are supposed to pick the UID shift, the parent will check now whether the UID shift
2884 * we just read from the image is available. If yes, it will send the UID shift back to us, if
2885 * not it will pick a different one, and send it back to us. */
2886
2887 l = recv(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
2888 if (l < 0)
2889 return log_error_errno(errno, "Failed to recv UID shift: %m");
baaa35ad
ZJS
2890 if (l != sizeof(arg_uid_shift))
2891 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2892 "Short read while receiving UID shift.");
0e7ac751
LP
2893 }
2894
ff6c6cc1
LP
2895 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
2896 "Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
825d5287
RM
2897 }
2898
2d3a5a73
LP
2899 if (dissected_image) {
2900 /* Now we know the uid shift, let's now mount everything else that might be in the image. */
2901 r = dissected_image_mount(dissected_image, directory, arg_uid_shift,
2902 DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY|DISSECT_IMAGE_DISCARD_ON_LOOP|(arg_read_only ? DISSECT_IMAGE_READ_ONLY : 0));
2903 if (r < 0)
2904 return r;
2905 }
2906
8199d554
LP
2907 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
2908 /* OK, we don't know yet which cgroup mode to use yet. Let's figure it out, and tell the parent. */
2909
2910 r = detect_unified_cgroup_hierarchy_from_image(directory);
2911 if (r < 0)
2912 return r;
2913
2914 l = send(unified_cgroup_hierarchy_socket, &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), MSG_NOSIGNAL);
2915 if (l < 0)
2916 return log_error_errno(errno, "Failed to send cgroup mode: %m");
baaa35ad
ZJS
2917 if (l != sizeof(arg_unified_cgroup_hierarchy))
2918 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2919 "Short write while sending cgroup mode.");
8199d554
LP
2920
2921 unified_cgroup_hierarchy_socket = safe_close(unified_cgroup_hierarchy_socket);
2922 }
2923
03cfe0d5 2924 /* Turn directory into bind mount */
60e76d48
ZJS
2925 r = mount_verbose(LOG_ERR, directory, directory, NULL, MS_BIND|MS_REC, NULL);
2926 if (r < 0)
2927 return r;
03cfe0d5 2928
b53ede69
PW
2929 r = setup_pivot_root(
2930 directory,
2931 arg_pivot_root_new,
2932 arg_pivot_root_old);
2933 if (r < 0)
2934 return r;
2935
0de7acce
LP
2936 r = setup_volatile(
2937 directory,
2938 arg_volatile_mode,
2939 arg_userns_mode != USER_NAMESPACE_NO,
2940 arg_uid_shift,
2941 arg_uid_range,
2942 arg_selinux_context);
03cfe0d5
LP
2943 if (r < 0)
2944 return r;
2945
0de7acce
LP
2946 r = setup_volatile_state(
2947 directory,
2948 arg_volatile_mode,
2949 arg_userns_mode != USER_NAMESPACE_NO,
2950 arg_uid_shift,
2951 arg_uid_range,
2952 arg_selinux_context);
03cfe0d5
LP
2953 if (r < 0)
2954 return r;
2955
4ad14eff
LP
2956 /* Mark everything as shared so our mounts get propagated down. This is
2957 * required to make new bind mounts available in systemd services
2958 * inside the containter that create a new mount namespace.
2959 * See https://github.com/systemd/systemd/issues/3860
2960 * Further submounts (such as /dev) done after this will inherit the
13e785f7 2961 * shared propagation mode. */
4ad14eff
LP
2962 r = mount_verbose(LOG_ERR, NULL, directory, NULL, MS_SHARED|MS_REC, NULL);
2963 if (r < 0)
2964 return r;
2965
2966 r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
2967 if (r < 0)
2968 return r;
2969
03cfe0d5
LP
2970 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
2971 if (r < 0)
2972 return r;
2973
03cfe0d5 2974 if (arg_read_only) {
6b7c9f8b 2975 r = bind_remount_recursive(directory, true, NULL);
03cfe0d5
LP
2976 if (r < 0)
2977 return log_error_errno(r, "Failed to make tree read-only: %m");
2978 }
2979
0de7acce 2980 r = mount_all(directory,
4f086aab 2981 arg_mount_settings,
0de7acce 2982 arg_uid_shift,
0de7acce 2983 arg_selinux_apifs_context);
03cfe0d5
LP
2984 if (r < 0)
2985 return r;
2986
07fa00f9
LP
2987 r = copy_devnodes(directory);
2988 if (r < 0)
03cfe0d5
LP
2989 return r;
2990
2991 dev_setup(directory, arg_uid_shift, arg_uid_shift);
2992
07fa00f9
LP
2993 r = setup_pts(directory);
2994 if (r < 0)
03cfe0d5
LP
2995 return r;
2996
2997 r = setup_propagate(directory);
2998 if (r < 0)
2999 return r;
3000
3001 r = setup_dev_console(directory, console);
3002 if (r < 0)
3003 return r;
3004
8e5430c4
LP
3005 r = setup_keyring();
3006 if (r < 0)
3007 return r;
3008
960e4569 3009 r = setup_seccomp(arg_caps_retain, arg_syscall_whitelist, arg_syscall_blacklist);
03cfe0d5
LP
3010 if (r < 0)
3011 return r;
3012
3013 r = setup_timezone(directory);
3014 if (r < 0)
3015 return r;
3016
3017 r = setup_resolv_conf(directory);
3018 if (r < 0)
3019 return r;
3020
e01ff70a
MS
3021 r = setup_machine_id(directory);
3022 if (r < 0)
3023 return r;
3024
03cfe0d5
LP
3025 r = setup_journal(directory);
3026 if (r < 0)
3027 return r;
3028
0de7acce
LP
3029 r = mount_custom(
3030 directory,
3031 arg_custom_mounts,
3032 arg_n_custom_mounts,
3033 arg_userns_mode != USER_NAMESPACE_NO,
3034 arg_uid_shift,
3035 arg_uid_range,
3036 arg_selinux_apifs_context);
03cfe0d5
LP
3037 if (r < 0)
3038 return r;
3039
489fae52 3040 if (!arg_use_cgns) {
0996ef00
CB
3041 r = mount_cgroups(
3042 directory,
3043 arg_unified_cgroup_hierarchy,
3044 arg_userns_mode != USER_NAMESPACE_NO,
3045 arg_uid_shift,
3046 arg_uid_range,
5a8ff0e6 3047 arg_selinux_apifs_context,
ada54120 3048 false);
0996ef00
CB
3049 if (r < 0)
3050 return r;
3051 }
03cfe0d5
LP
3052
3053 r = mount_move_root(directory);
3054 if (r < 0)
3055 return log_error_errno(r, "Failed to move root directory: %m");
3056
9c1e04d0
AP
3057 fd = setup_sd_notify_child();
3058 if (fd < 0)
3059 return fd;
3060
bf428efb
LP
3061 r = setrlimit_closest_all((const struct rlimit *const*) arg_rlimit, &which_failed);
3062 if (r < 0)
3063 return log_error_errno(r, "Failed to apply resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
3064
03cfe0d5 3065 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
0c582db0 3066 arg_clone_ns_flags |
8869a0b4 3067 (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
03cfe0d5
LP
3068 if (pid < 0)
3069 return log_error_errno(errno, "Failed to fork inner child: %m");
03cfe0d5
LP
3070 if (pid == 0) {
3071 pid_socket = safe_close(pid_socket);
e01ff70a 3072 uuid_socket = safe_close(uuid_socket);
9c1e04d0 3073 notify_socket = safe_close(notify_socket);
825d5287 3074 uid_shift_socket = safe_close(uid_shift_socket);
03cfe0d5
LP
3075
3076 /* The inner child has all namespaces that are
3077 * requested, so that we all are owned by the user if
3078 * user namespaces are turned on. */
3079
d7bea6b6
DP
3080 if (arg_network_namespace_path) {
3081 r = namespace_enter(-1, -1, netns_fd, -1, -1);
3082 if (r < 0)
e2d39e54 3083 return log_error_errno(r, "Failed to join network namespace: %m");
d7bea6b6
DP
3084 }
3085
f757855e 3086 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
03cfe0d5
LP
3087 if (r < 0)
3088 _exit(EXIT_FAILURE);
3089
3090 _exit(EXIT_SUCCESS);
3091 }
3092
3093 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
3094 if (l < 0)
3095 return log_error_errno(errno, "Failed to send PID: %m");
baaa35ad
ZJS
3096 if (l != sizeof(pid))
3097 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3098 "Short write while sending PID.");
03cfe0d5 3099
e01ff70a
MS
3100 l = send(uuid_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
3101 if (l < 0)
3102 return log_error_errno(errno, "Failed to send machine ID: %m");
baaa35ad
ZJS
3103 if (l != sizeof(arg_uuid))
3104 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3105 "Short write while sending machine ID.");
e01ff70a 3106
9c1e04d0
AP
3107 l = send_one_fd(notify_socket, fd, 0);
3108 if (l < 0)
3109 return log_error_errno(errno, "Failed to send notify fd: %m");
3110
03cfe0d5 3111 pid_socket = safe_close(pid_socket);
e01ff70a 3112 uuid_socket = safe_close(uuid_socket);
9c1e04d0 3113 notify_socket = safe_close(notify_socket);
327e26d6
KN
3114 kmsg_socket = safe_close(kmsg_socket);
3115 rtnl_socket = safe_close(rtnl_socket);
d7bea6b6 3116 netns_fd = safe_close(netns_fd);
03cfe0d5
LP
3117
3118 return 0;
3119}
3120
0e7ac751 3121static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
d381c8a6 3122 bool tried_hashed = false;
0e7ac751
LP
3123 unsigned n_tries = 100;
3124 uid_t candidate;
3125 int r;
3126
3127 assert(shift);
3128 assert(ret_lock_file);
0de7acce 3129 assert(arg_userns_mode == USER_NAMESPACE_PICK);
0e7ac751
LP
3130 assert(arg_uid_range == 0x10000U);
3131
3132 candidate = *shift;
3133
3134 (void) mkdir("/run/systemd/nspawn-uid", 0755);
3135
3136 for (;;) {
fbd0b64f 3137 char lock_path[STRLEN("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
8e766630 3138 _cleanup_(release_lock_file) LockFile lf = LOCK_FILE_INIT;
0e7ac751
LP
3139
3140 if (--n_tries <= 0)
3141 return -EBUSY;
3142
87d5e4f2 3143 if (candidate < CONTAINER_UID_BASE_MIN || candidate > CONTAINER_UID_BASE_MAX)
0e7ac751
LP
3144 goto next;
3145 if ((candidate & UINT32_C(0xFFFF)) != 0)
3146 goto next;
3147
3148 xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
3149 r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
3150 if (r == -EBUSY) /* Range already taken by another nspawn instance */
3151 goto next;
3152 if (r < 0)
3153 return r;
3154
3155 /* Make some superficial checks whether the range is currently known in the user database */
3156 if (getpwuid(candidate))
3157 goto next;
3158 if (getpwuid(candidate + UINT32_C(0xFFFE)))
3159 goto next;
3160 if (getgrgid(candidate))
3161 goto next;
3162 if (getgrgid(candidate + UINT32_C(0xFFFE)))
3163 goto next;
3164
3165 *ret_lock_file = lf;
3166 lf = (struct LockFile) LOCK_FILE_INIT;
3167 *shift = candidate;
3168 return 0;
3169
3170 next:
d381c8a6
LP
3171 if (arg_machine && !tried_hashed) {
3172 /* Try to hash the base from the container name */
3173
3174 static const uint8_t hash_key[] = {
3175 0xe1, 0x56, 0xe0, 0xf0, 0x4a, 0xf0, 0x41, 0xaf,
3176 0x96, 0x41, 0xcf, 0x41, 0x33, 0x94, 0xff, 0x72
3177 };
3178
3179 candidate = (uid_t) siphash24(arg_machine, strlen(arg_machine), hash_key);
3180
3181 tried_hashed = true;
3182 } else
3183 random_bytes(&candidate, sizeof(candidate));
3184
87d5e4f2 3185 candidate = (candidate % (CONTAINER_UID_BASE_MAX - CONTAINER_UID_BASE_MIN)) + CONTAINER_UID_BASE_MIN;
0e7ac751
LP
3186 candidate &= (uid_t) UINT32_C(0xFFFF0000);
3187 }
3188}
3189
03cfe0d5 3190static int setup_uid_map(pid_t pid) {
fbd0b64f 3191 char uid_map[STRLEN("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
03cfe0d5
LP
3192 int r;
3193
3194 assert(pid > 1);
3195
3196 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
3197 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
57512c89 3198 r = write_string_file(uid_map, line, WRITE_STRING_FILE_DISABLE_BUFFER);
03cfe0d5
LP
3199 if (r < 0)
3200 return log_error_errno(r, "Failed to write UID map: %m");
3201
3202 /* We always assign the same UID and GID ranges */
3203 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
57512c89 3204 r = write_string_file(uid_map, line, WRITE_STRING_FILE_DISABLE_BUFFER);
03cfe0d5
LP
3205 if (r < 0)
3206 return log_error_errno(r, "Failed to write GID map: %m");
3207
3208 return 0;
3209}
3210
9c1e04d0 3211static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
9c1e04d0
AP
3212 char buf[NOTIFY_BUFFER_MAX+1];
3213 char *p = NULL;
3214 struct iovec iovec = {
3215 .iov_base = buf,
3216 .iov_len = sizeof(buf)-1,
3217 };
3218 union {
3219 struct cmsghdr cmsghdr;
3220 uint8_t buf[CMSG_SPACE(sizeof(struct ucred)) +
3221 CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)];
3222 } control = {};
3223 struct msghdr msghdr = {
3224 .msg_iov = &iovec,
3225 .msg_iovlen = 1,
3226 .msg_control = &control,
3227 .msg_controllen = sizeof(control),
3228 };
3229 struct cmsghdr *cmsg;
3230 struct ucred *ucred = NULL;
3231 ssize_t n;
3232 pid_t inner_child_pid;
3233 _cleanup_strv_free_ char **tags = NULL;
3234
3235 assert(userdata);
3236
3237 inner_child_pid = PTR_TO_PID(userdata);
3238
3239 if (revents != EPOLLIN) {
3240 log_warning("Got unexpected poll event for notify fd.");
3241 return 0;
3242 }
3243
3244 n = recvmsg(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
3245 if (n < 0) {
3742095b 3246 if (IN_SET(errno, EAGAIN, EINTR))
9c1e04d0
AP
3247 return 0;
3248
3249 return log_warning_errno(errno, "Couldn't read notification socket: %m");
3250 }
3251 cmsg_close_all(&msghdr);
3252
3253 CMSG_FOREACH(cmsg, &msghdr) {
3254 if (cmsg->cmsg_level == SOL_SOCKET &&
3255 cmsg->cmsg_type == SCM_CREDENTIALS &&
3256 cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred))) {
3257
3258 ucred = (struct ucred*) CMSG_DATA(cmsg);
3259 }
3260 }
3261
3262 if (!ucred || ucred->pid != inner_child_pid) {
8cb57430 3263 log_debug("Received notify message without valid credentials. Ignoring.");
9c1e04d0
AP
3264 return 0;
3265 }
3266
3267 if ((size_t) n >= sizeof(buf)) {
3268 log_warning("Received notify message exceeded maximum size. Ignoring.");
3269 return 0;
3270 }
3271
3272 buf[n] = 0;
3273 tags = strv_split(buf, "\n\r");
3274 if (!tags)
3275 return log_oom();
3276
3277 if (strv_find(tags, "READY=1"))
3278 sd_notifyf(false, "READY=1\n");
3279
3280 p = strv_find_startswith(tags, "STATUS=");
3281 if (p)
3282 sd_notifyf(false, "STATUS=Container running: %s", p);
3283
3284 return 0;
3285}
3286
5773024d 3287static int setup_sd_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid, sd_event_source **notify_event_source) {
9c1e04d0 3288 int r;
9c1e04d0 3289
5773024d 3290 r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
9c1e04d0
AP
3291 if (r < 0)
3292 return log_error_errno(r, "Failed to allocate notify event source: %m");
3293
5773024d 3294 (void) sd_event_source_set_description(*notify_event_source, "nspawn-notify");
9c1e04d0
AP
3295
3296 return 0;
3297}
3298
5d961407
LP
3299static int merge_settings(Settings *settings, const char *path) {
3300 int rl;
f757855e 3301
5d961407
LP
3302 assert(settings);
3303 assert(path);
f757855e 3304
5d961407
LP
3305 /* Copy over bits from the settings, unless they have been explicitly masked by command line switches. Note
3306 * that this steals the fields of the Settings* structure, and hence modifies it. */
f757855e 3307
7732f92b
LP
3308 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
3309 settings->start_mode >= 0) {
3310 arg_start_mode = settings->start_mode;
130d3d22 3311 strv_free_and_replace(arg_parameters, settings->parameters);
f757855e
LP
3312 }
3313
a2f577fc
JL
3314 if ((arg_settings_mask & SETTING_EPHEMERAL) == 0)
3315 arg_ephemeral = settings->ephemeral;
3316
b53ede69
PW
3317 if ((arg_settings_mask & SETTING_PIVOT_ROOT) == 0 &&
3318 settings->pivot_root_new) {
3319 free_and_replace(arg_pivot_root_new, settings->pivot_root_new);
3320 free_and_replace(arg_pivot_root_old, settings->pivot_root_old);
3321 }
3322
5f932eb9 3323 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
1cc6c93a
YW
3324 settings->working_directory)
3325 free_and_replace(arg_chdir, settings->working_directory);
5f932eb9 3326
f757855e 3327 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
130d3d22
YW
3328 settings->environment)
3329 strv_free_and_replace(arg_setenv, settings->environment);
f757855e
LP
3330
3331 if ((arg_settings_mask & SETTING_USER) == 0 &&
1cc6c93a
YW
3332 settings->user)
3333 free_and_replace(arg_user, settings->user);
f757855e
LP
3334
3335 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
0e265674 3336 uint64_t plus;
f757855e 3337
0e265674
LP
3338 plus = settings->capability;
3339 if (settings_private_network(settings))
3340 plus |= (1ULL << CAP_NET_ADMIN);
3341
3342 if (!arg_settings_trusted && plus != 0) {
3343 if (settings->capability != 0)
5d961407 3344 log_warning("Ignoring Capability= setting, file %s is not trusted.", path);
0e265674 3345 } else
520e0d54 3346 arg_caps_retain |= plus;
f757855e 3347
520e0d54 3348 arg_caps_retain &= ~settings->drop_capability;
f757855e
LP
3349 }
3350
3351 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
3352 settings->kill_signal > 0)
3353 arg_kill_signal = settings->kill_signal;
3354
3355 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
3356 settings->personality != PERSONALITY_INVALID)
3357 arg_personality = settings->personality;
3358
3359 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
3360 !sd_id128_is_null(settings->machine_id)) {
3361
3362 if (!arg_settings_trusted)
5d961407 3363 log_warning("Ignoring MachineID= setting, file %s is not trusted.", path);
f757855e
LP
3364 else
3365 arg_uuid = settings->machine_id;
3366 }
3367
3368 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
3369 settings->read_only >= 0)
3370 arg_read_only = settings->read_only;
3371
3372 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
3373 settings->volatile_mode != _VOLATILE_MODE_INVALID)
3374 arg_volatile_mode = settings->volatile_mode;
3375
3376 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
3377 settings->n_custom_mounts > 0) {
3378
3379 if (!arg_settings_trusted)
5d961407 3380 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", path);
f757855e
LP
3381 else {
3382 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
1cc6c93a 3383 arg_custom_mounts = TAKE_PTR(settings->custom_mounts);
f757855e 3384 arg_n_custom_mounts = settings->n_custom_mounts;
f757855e
LP
3385 settings->n_custom_mounts = 0;
3386 }
3387 }
3388
3389 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
3390 (settings->private_network >= 0 ||
3391 settings->network_veth >= 0 ||
3392 settings->network_bridge ||
22b28dfd 3393 settings->network_zone ||
f757855e
LP
3394 settings->network_interfaces ||
3395 settings->network_macvlan ||
f6d6bad1
LP
3396 settings->network_ipvlan ||
3397 settings->network_veth_extra)) {
f757855e
LP
3398
3399 if (!arg_settings_trusted)
5d961407 3400 log_warning("Ignoring network settings, file %s is not trusted.", path);
f757855e 3401 else {
f6d6bad1 3402 arg_network_veth = settings_network_veth(settings);
0e265674
LP
3403 arg_private_network = settings_private_network(settings);
3404
130d3d22
YW
3405 strv_free_and_replace(arg_network_interfaces, settings->network_interfaces);
3406 strv_free_and_replace(arg_network_macvlan, settings->network_macvlan);
3407 strv_free_and_replace(arg_network_ipvlan, settings->network_ipvlan);
3408 strv_free_and_replace(arg_network_veth_extra, settings->network_veth_extra);
f6d6bad1 3409
1cc6c93a
YW
3410 free_and_replace(arg_network_bridge, settings->network_bridge);
3411 free_and_replace(arg_network_zone, settings->network_zone);
f757855e
LP
3412 }
3413 }
3414
3415 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
3416 settings->expose_ports) {
3417
3418 if (!arg_settings_trusted)
5d961407 3419 log_warning("Ignoring Port= setting, file %s is not trusted.", path);
f757855e
LP
3420 else {
3421 expose_port_free_all(arg_expose_ports);
1cc6c93a 3422 arg_expose_ports = TAKE_PTR(settings->expose_ports);
f757855e
LP
3423 }
3424 }
3425
0de7acce
LP
3426 if ((arg_settings_mask & SETTING_USERNS) == 0 &&
3427 settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
3428
3429 if (!arg_settings_trusted)
5d961407 3430 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", path);
0de7acce
LP
3431 else {
3432 arg_userns_mode = settings->userns_mode;
3433 arg_uid_shift = settings->uid_shift;
3434 arg_uid_range = settings->uid_range;
3435 arg_userns_chown = settings->userns_chown;
3436 }
3437 }
3438
9c1e04d0
AP
3439 if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0)
3440 arg_notify_ready = settings->notify_ready;
3441
960e4569
LP
3442 if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) {
3443
3444 if (!arg_settings_trusted && !strv_isempty(arg_syscall_whitelist))
5d961407 3445 log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", path);
960e4569 3446 else {
130d3d22
YW
3447 strv_free_and_replace(arg_syscall_whitelist, settings->syscall_whitelist);
3448 strv_free_and_replace(arg_syscall_blacklist, settings->syscall_blacklist);
960e4569
LP
3449 }
3450 }
3451
bf428efb
LP
3452 for (rl = 0; rl < _RLIMIT_MAX; rl ++) {
3453 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)))
3454 continue;
3455
3456 if (!settings->rlimit[rl])
3457 continue;
3458
3459 if (!arg_settings_trusted) {
5d961407 3460 log_warning("Ignoring Limit%s= setting, file '%s' is not trusted.", rlimit_to_string(rl), path);
bf428efb
LP
3461 continue;
3462 }
3463
3464 free_and_replace(arg_rlimit[rl], settings->rlimit[rl]);
3465 }
3466
3a9530e5
LP
3467 if ((arg_settings_mask & SETTING_HOSTNAME) == 0 &&
3468 settings->hostname)
3469 free_and_replace(arg_hostname, settings->hostname);
3470
66edd963
LP
3471 if ((arg_settings_mask & SETTING_NO_NEW_PRIVILEGES) == 0 &&
3472 settings->no_new_privileges >= 0)
3473 arg_no_new_privileges = settings->no_new_privileges;
3474
81f345df
LP
3475 if ((arg_settings_mask & SETTING_OOM_SCORE_ADJUST) == 0 &&
3476 settings->oom_score_adjust_set) {
3477
3478 if (!arg_settings_trusted)
5d961407 3479 log_warning("Ignoring OOMScoreAdjust= setting, file '%s' is not trusted.", path);
81f345df
LP
3480 else {
3481 arg_oom_score_adjust = settings->oom_score_adjust;
3482 arg_oom_score_adjust_set = true;
3483 }
3484 }
3485
d107bb7d
LP
3486 if ((arg_settings_mask & SETTING_CPU_AFFINITY) == 0 &&
3487 settings->cpuset) {
3488
3489 if (!arg_settings_trusted)
5d961407 3490 log_warning("Ignoring CPUAffinity= setting, file '%s' is not trusted.", path);
d107bb7d
LP
3491 else {
3492 if (arg_cpuset)
3493 CPU_FREE(arg_cpuset);
3494 arg_cpuset = TAKE_PTR(settings->cpuset);
3495 arg_cpuset_ncpus = settings->cpuset_ncpus;
3496 }
3497 }
3498
09d423e9
LP
3499 if ((arg_settings_mask & SETTING_RESOLV_CONF) == 0 &&
3500 settings->resolv_conf != _RESOLV_CONF_MODE_INVALID)
3501 arg_resolv_conf = settings->resolv_conf;
3502
4e1d6aa9
LP
3503 if ((arg_settings_mask & SETTING_LINK_JOURNAL) == 0 &&
3504 settings->link_journal != _LINK_JOURNAL_INVALID) {
3505
3506 if (!arg_settings_trusted)
3507 log_warning("Ignoring journal link setting, file '%s' is not trusted.", path);
3508 else {
3509 arg_link_journal = settings->link_journal;
3510 arg_link_journal_try = settings->link_journal_try;
3511 }
3512 }
3513
1688841f
LP
3514 if ((arg_settings_mask & SETTING_TIMEZONE) == 0 &&
3515 settings->timezone != _TIMEZONE_MODE_INVALID)
3516 arg_timezone = settings->timezone;
3517
f757855e
LP
3518 return 0;
3519}
3520
5d961407
LP
3521static int load_settings(void) {
3522 _cleanup_(settings_freep) Settings *settings = NULL;
3523 _cleanup_fclose_ FILE *f = NULL;
3524 _cleanup_free_ char *p = NULL;
3525 const char *fn, *i;
3526 int r;
3527
3528 /* If all settings are masked, there's no point in looking for
3529 * the settings file */
3530 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
3531 return 0;
3532
3533 fn = strjoina(arg_machine, ".nspawn");
3534
3535 /* We first look in the admin's directories in /etc and /run */
3536 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
3537 _cleanup_free_ char *j = NULL;
3538
3539 j = strjoin(i, "/", fn);
3540 if (!j)
3541 return log_oom();
3542
3543 f = fopen(j, "re");
3544 if (f) {
3545 p = TAKE_PTR(j);
3546
3547 /* By default, we trust configuration from /etc and /run */
3548 if (arg_settings_trusted < 0)
3549 arg_settings_trusted = true;
3550
3551 break;
3552 }
3553
3554 if (errno != ENOENT)
3555 return log_error_errno(errno, "Failed to open %s: %m", j);
3556 }
3557
3558 if (!f) {
3559 /* After that, let's look for a file next to the
3560 * actual image we shall boot. */
3561
3562 if (arg_image) {
3563 p = file_in_same_dir(arg_image, fn);
3564 if (!p)
3565 return log_oom();
3566 } else if (arg_directory) {
3567 p = file_in_same_dir(arg_directory, fn);
3568 if (!p)
3569 return log_oom();
3570 }
3571
3572 if (p) {
3573 f = fopen(p, "re");
3574 if (!f && errno != ENOENT)
3575 return log_error_errno(errno, "Failed to open %s: %m", p);
3576
3577 /* By default, we do not trust configuration from /var/lib/machines */
3578 if (arg_settings_trusted < 0)
3579 arg_settings_trusted = false;
3580 }
3581 }
3582
3583 if (!f)
3584 return 0;
3585
3586 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
3587
3588 r = settings_load(f, p, &settings);
3589 if (r < 0)
3590 return r;
3591
3592 return merge_settings(settings, p);
3593}
3594
b0067625
ZJS
3595static int run(int master,
3596 const char* console,
2d845785 3597 DissectedImage *dissected_image,
b0067625
ZJS
3598 bool interactive,
3599 bool secondary,
3600 FDSet *fds,
3601 char veth_name[IFNAMSIZ], bool *veth_created,
3602 union in_addr_union *exposed,
3603 pid_t *pid, int *ret) {
3604
3605 static const struct sigaction sa = {
3606 .sa_handler = nop_signal_handler,
e28c7cd0 3607 .sa_flags = SA_NOCLDSTOP|SA_RESTART,
b0067625
ZJS
3608 };
3609
8e766630 3610 _cleanup_(release_lock_file) LockFile uid_shift_lock = LOCK_FILE_INIT;
b0067625
ZJS
3611 _cleanup_close_ int etc_passwd_lock = -1;
3612 _cleanup_close_pair_ int
3613 kmsg_socket_pair[2] = { -1, -1 },
3614 rtnl_socket_pair[2] = { -1, -1 },
3615 pid_socket_pair[2] = { -1, -1 },
3616 uuid_socket_pair[2] = { -1, -1 },
3617 notify_socket_pair[2] = { -1, -1 },
8199d554
LP
3618 uid_shift_socket_pair[2] = { -1, -1 },
3619 unified_cgroup_hierarchy_socket_pair[2] = { -1, -1};
3620
b0067625
ZJS
3621 _cleanup_close_ int notify_socket= -1;
3622 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
5773024d 3623 _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL;
b0067625
ZJS
3624 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
3625 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3626 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
abdb9b08 3627 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
b0067625
ZJS
3628 ContainerStatus container_status = 0;
3629 char last_char = 0;
3630 int ifi = 0, r;
3631 ssize_t l;
3632 sigset_t mask_chld;
d7bea6b6 3633 _cleanup_close_ int netns_fd = -1;
b0067625
ZJS
3634
3635 assert_se(sigemptyset(&mask_chld) == 0);
3636 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3637
3638 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3639 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
3640 * check with getpwuid() if the specific user already exists. Note that /etc might be
3641 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
3642 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
3643 * really just an extra safety net. We kinda assume that the UID range we allocate from is
3644 * really ours. */
3645
3646 etc_passwd_lock = take_etc_passwd_lock(NULL);
3647 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS)
3648 return log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m");
3649 }
3650
3651 r = barrier_create(&barrier);
3652 if (r < 0)
3653 return log_error_errno(r, "Cannot initialize IPC barrier: %m");
3654
3655 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0)
3656 return log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3657
3658 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0)
3659 return log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3660
3661 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0)
3662 return log_error_errno(errno, "Failed to create pid socket pair: %m");
3663
3664 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0)
3665 return log_error_errno(errno, "Failed to create id socket pair: %m");
3666
3667 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0)
3668 return log_error_errno(errno, "Failed to create notify socket pair: %m");
3669
3670 if (arg_userns_mode != USER_NAMESPACE_NO)
3671 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0)
3672 return log_error_errno(errno, "Failed to create uid shift socket pair: %m");
3673
8199d554
LP
3674 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN)
3675 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, unified_cgroup_hierarchy_socket_pair) < 0)
3676 return log_error_errno(errno, "Failed to create unified cgroup socket pair: %m");
3677
b0067625
ZJS
3678 /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
3679 * parent's blocking calls and give it a chance to call wait() and terminate. */
3680 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3681 if (r < 0)
3682 return log_error_errno(errno, "Failed to change the signal mask: %m");
3683
3684 r = sigaction(SIGCHLD, &sa, NULL);
3685 if (r < 0)
3686 return log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3687
d7bea6b6
DP
3688 if (arg_network_namespace_path) {
3689 netns_fd = open(arg_network_namespace_path, O_RDONLY|O_NOCTTY|O_CLOEXEC);
3690 if (netns_fd < 0)
3691 return log_error_errno(errno, "Cannot open file %s: %m", arg_network_namespace_path);
3692
3693 r = fd_is_network_ns(netns_fd);
6619ad88
LP
3694 if (r == -EUCLEAN)
3695 log_debug_errno(r, "Cannot determine if passed network namespace path '%s' really refers to a network namespace, assuming it does.", arg_network_namespace_path);
3696 else if (r < 0)
d7bea6b6 3697 return log_error_errno(r, "Failed to check %s fs type: %m", arg_network_namespace_path);
6619ad88
LP
3698 else if (r == 0) {
3699 log_error("Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path);
d7bea6b6
DP
3700 return -EINVAL;
3701 }
3702 }
3703
b0067625
ZJS
3704 *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
3705 if (*pid < 0)
3706 return log_error_errno(errno, "clone() failed%s: %m",
3707 errno == EINVAL ?
3708 ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
3709
3710 if (*pid == 0) {
3711 /* The outer child only has a file system namespace. */
3712 barrier_set_role(&barrier, BARRIER_CHILD);
3713
3714 master = safe_close(master);
3715
3716 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3717 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3718 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
3719 uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
3720 notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
3721 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
8199d554 3722 unified_cgroup_hierarchy_socket_pair[0] = safe_close(unified_cgroup_hierarchy_socket_pair[0]);
b0067625
ZJS
3723
3724 (void) reset_all_signal_handlers();
3725 (void) reset_signal_mask();
3726
3727 r = outer_child(&barrier,
3728 arg_directory,
3729 console,
2d845785 3730 dissected_image,
b0067625
ZJS
3731 interactive,
3732 secondary,
3733 pid_socket_pair[1],
3734 uuid_socket_pair[1],
3735 notify_socket_pair[1],
3736 kmsg_socket_pair[1],
3737 rtnl_socket_pair[1],
3738 uid_shift_socket_pair[1],
8199d554 3739 unified_cgroup_hierarchy_socket_pair[1],
d7bea6b6
DP
3740 fds,
3741 netns_fd);
b0067625
ZJS
3742 if (r < 0)
3743 _exit(EXIT_FAILURE);
3744
3745 _exit(EXIT_SUCCESS);
3746 }
3747
3748 barrier_set_role(&barrier, BARRIER_PARENT);
3749
3750 fds = fdset_free(fds);
3751
3752 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3753 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3754 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
3755 uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
3756 notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
3757 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
8199d554 3758 unified_cgroup_hierarchy_socket_pair[1] = safe_close(unified_cgroup_hierarchy_socket_pair[1]);
b0067625
ZJS
3759
3760 if (arg_userns_mode != USER_NAMESPACE_NO) {
3761 /* The child just let us know the UID shift it might have read from the image. */
3762 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
3763 if (l < 0)
3764 return log_error_errno(errno, "Failed to read UID shift: %m");
b0067625
ZJS
3765 if (l != sizeof arg_uid_shift) {
3766 log_error("Short read while reading UID shift.");
3767 return -EIO;
3768 }
3769
3770 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3771 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
3772 * image, but if that's already in use, pick a new one, and report back to the child,
3773 * which one we now picked. */
3774
3775 r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
3776 if (r < 0)
3777 return log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
3778
3779 l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
3780 if (l < 0)
3781 return log_error_errno(errno, "Failed to send UID shift: %m");
3782 if (l != sizeof arg_uid_shift) {
3783 log_error("Short write while writing UID shift.");
3784 return -EIO;
3785 }
3786 }
3787 }
3788
8199d554
LP
3789 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
3790 /* The child let us know the support cgroup mode it might have read from the image. */
3791 l = recv(unified_cgroup_hierarchy_socket_pair[0], &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), 0);
3792 if (l < 0)
3793 return log_error_errno(errno, "Failed to read cgroup mode: %m");
3794 if (l != sizeof(arg_unified_cgroup_hierarchy)) {
bd897e72
ZJS
3795 log_error("Short read while reading cgroup mode (%zu bytes).%s",
3796 l, l == 0 ? " The child is most likely dead." : "");
8199d554
LP
3797 return -EIO;
3798 }
3799 }
3800
b0067625 3801 /* Wait for the outer child. */
d2e0ac3d
LP
3802 r = wait_for_terminate_and_check("(sd-namespace)", *pid, WAIT_LOG_ABNORMAL);
3803 if (r < 0)
3804 return r;
3805 if (r != EXIT_SUCCESS)
3806 return -EIO;
b0067625
ZJS
3807
3808 /* And now retrieve the PID of the inner child. */
3809 l = recv(pid_socket_pair[0], pid, sizeof *pid, 0);
3810 if (l < 0)
3811 return log_error_errno(errno, "Failed to read inner child PID: %m");
3812 if (l != sizeof *pid) {
3813 log_error("Short read while reading inner child PID.");
3814 return -EIO;
3815 }
3816
3817 /* We also retrieve container UUID in case it was generated by outer child */
3818 l = recv(uuid_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
3819 if (l < 0)
3820 return log_error_errno(errno, "Failed to read container machine ID: %m");
3821 if (l != sizeof(arg_uuid)) {
3822 log_error("Short read while reading container machined ID.");
3823 return -EIO;
3824 }
3825
3826 /* We also retrieve the socket used for notifications generated by outer child */
3827 notify_socket = receive_one_fd(notify_socket_pair[0], 0);
3828 if (notify_socket < 0)
3829 return log_error_errno(notify_socket,
3830 "Failed to receive notification socket from the outer child: %m");
3831
3832 log_debug("Init process invoked as PID "PID_FMT, *pid);
3833
3834 if (arg_userns_mode != USER_NAMESPACE_NO) {
3835 if (!barrier_place_and_sync(&barrier)) { /* #1 */
3836 log_error("Child died too early.");
3837 return -ESRCH;
3838 }
3839
3840 r = setup_uid_map(*pid);
3841 if (r < 0)
3842 return r;
3843
3844 (void) barrier_place(&barrier); /* #2 */
3845 }
3846
3847 if (arg_private_network) {
75116558
PS
3848 if (!arg_network_namespace_path) {
3849 /* Wait until the child has unshared its network namespace. */
3850 if (!barrier_place_and_sync(&barrier)) { /* #3 */
3851 log_error("Child died too early");
3852 return -ESRCH;
3853 }
3854 }
3855
b0067625
ZJS
3856 r = move_network_interfaces(*pid, arg_network_interfaces);
3857 if (r < 0)
3858 return r;
3859
3860 if (arg_network_veth) {
3861 r = setup_veth(arg_machine, *pid, veth_name,
3862 arg_network_bridge || arg_network_zone);
3863 if (r < 0)
3864 return r;
3865 else if (r > 0)
3866 ifi = r;
3867
3868 if (arg_network_bridge) {
3869 /* Add the interface to a bridge */
3870 r = setup_bridge(veth_name, arg_network_bridge, false);
3871 if (r < 0)
3872 return r;
3873 if (r > 0)
3874 ifi = r;
3875 } else if (arg_network_zone) {
3876 /* Add the interface to a bridge, possibly creating it */
3877 r = setup_bridge(veth_name, arg_network_zone, true);
3878 if (r < 0)
3879 return r;
3880 if (r > 0)
3881 ifi = r;
3882 }
3883 }
3884
3885 r = setup_veth_extra(arg_machine, *pid, arg_network_veth_extra);
3886 if (r < 0)
3887 return r;
3888
3889 /* We created the primary and extra veth links now; let's remember this, so that we know to
3890 remove them later on. Note that we don't bother with removing veth links that were created
3891 here when their setup failed half-way, because in that case the kernel should be able to
3892 remove them on its own, since they cannot be referenced by anything yet. */
3893 *veth_created = true;
3894
3895 r = setup_macvlan(arg_machine, *pid, arg_network_macvlan);
3896 if (r < 0)
3897 return r;
3898
3899 r = setup_ipvlan(arg_machine, *pid, arg_network_ipvlan);
3900 if (r < 0)
3901 return r;
3902 }
3903
abdb9b08
LP
3904 if (arg_register || !arg_keep_unit) {
3905 r = sd_bus_default_system(&bus);
3906 if (r < 0)
3907 return log_error_errno(r, "Failed to open system bus: %m");
e5a2d8b5
LP
3908
3909 r = sd_bus_set_close_on_exit(bus, false);
3910 if (r < 0)
3911 return log_error_errno(r, "Failed to disable close-on-exit behaviour: %m");
abdb9b08
LP
3912 }
3913
3914 if (!arg_keep_unit) {
3915 /* When a new scope is created for this container, then we'll be registered as its controller, in which
3916 * case PID 1 will send us a friendly RequestStop signal, when it is asked to terminate the
3917 * scope. Let's hook into that, and cleanly shut down the container, and print a friendly message. */
3918
75152a4d
LP
3919 r = sd_bus_match_signal_async(
3920 bus,
3921 NULL,
3922 "org.freedesktop.systemd1",
3923 NULL,
3924 "org.freedesktop.systemd1.Scope",
3925 "RequestStop",
3926 on_request_stop, NULL, PID_TO_PTR(*pid));
abdb9b08 3927 if (r < 0)
75152a4d 3928 return log_error_errno(r, "Failed to request RequestStop match: %m");
abdb9b08
LP
3929 }
3930
b0067625
ZJS
3931 if (arg_register) {
3932 r = register_machine(
abdb9b08 3933 bus,
b0067625
ZJS
3934 arg_machine,
3935 *pid,
3936 arg_directory,
3937 arg_uuid,
3938 ifi,
3939 arg_slice,
3940 arg_custom_mounts, arg_n_custom_mounts,
3941 arg_kill_signal,
3942 arg_property,
3943 arg_keep_unit,
3944 arg_container_service_name);
3945 if (r < 0)
3946 return r;
abdb9b08 3947
cd2dfc6f
LP
3948 } else if (!arg_keep_unit) {
3949 r = allocate_scope(
abdb9b08 3950 bus,
cd2dfc6f
LP
3951 arg_machine,
3952 *pid,
3953 arg_slice,
3954 arg_custom_mounts, arg_n_custom_mounts,
3955 arg_kill_signal,
3956 arg_property);
3957 if (r < 0)
3958 return r;
3959
3960 } else if (arg_slice || arg_property)
3961 log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect.");
b0067625 3962
f0bef277 3963 r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
b0067625
ZJS
3964 if (r < 0)
3965 return r;
3966
720f0a2f
LP
3967 r = create_subcgroup(*pid, arg_keep_unit, arg_unified_cgroup_hierarchy);
3968 if (r < 0)
3969 return r;
b0067625 3970
de54e02d 3971 r = chown_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
b0067625
ZJS
3972 if (r < 0)
3973 return r;
3974
3975 /* Notify the child that the parent is ready with all
3976 * its setup (including cgroup-ification), and that
3977 * the child can now hand over control to the code to
3978 * run inside the container. */
75116558 3979 (void) barrier_place(&barrier); /* #4 */
b0067625
ZJS
3980
3981 /* Block SIGCHLD here, before notifying child.
3982 * process_pty() will handle it with the other signals. */
3983 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
3984
3985 /* Reset signal to default */
3986 r = default_signals(SIGCHLD, -1);
3987 if (r < 0)
3988 return log_error_errno(r, "Failed to reset SIGCHLD: %m");
3989
3990 r = sd_event_new(&event);
3991 if (r < 0)
3992 return log_error_errno(r, "Failed to get default event source: %m");
3993
8fd010bb
LP
3994 (void) sd_event_set_watchdog(event, true);
3995
abdb9b08
LP
3996 if (bus) {
3997 r = sd_bus_attach_event(bus, event, 0);
3998 if (r < 0)
3999 return log_error_errno(r, "Failed to attach bus to event loop: %m");
4000 }
4001
5773024d 4002 r = setup_sd_notify_parent(event, notify_socket, PID_TO_PTR(*pid), &notify_event_source);
b0067625
ZJS
4003 if (r < 0)
4004 return r;
4005
4006 /* Let the child know that we are ready and wait that the child is completely ready now. */
75116558 4007 if (!barrier_place_and_sync(&barrier)) { /* #5 */
b0067625
ZJS
4008 log_error("Child died too early.");
4009 return -ESRCH;
4010 }
4011
4012 /* At this point we have made use of the UID we picked, and thus nss-mymachines
4013 * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
4014 etc_passwd_lock = safe_close(etc_passwd_lock);
4015
4016 sd_notifyf(false,
4017 "STATUS=Container running.\n"
4018 "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
4019 if (!arg_notify_ready)
919f5ae0 4020 (void) sd_notify(false, "READY=1\n");
b0067625
ZJS
4021
4022 if (arg_kill_signal > 0) {
4023 /* Try to kill the init system on SIGINT or SIGTERM */
919f5ae0
LP
4024 (void) sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
4025 (void) sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
b0067625
ZJS
4026 } else {
4027 /* Immediately exit */
919f5ae0
LP
4028 (void) sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4029 (void) sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
b0067625
ZJS
4030 }
4031
6916b164 4032 /* Exit when the child exits */
919f5ae0 4033 (void) sd_event_add_signal(event, NULL, SIGCHLD, on_sigchld, PID_TO_PTR(*pid));
b0067625
ZJS
4034
4035 if (arg_expose_ports) {
4036 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, exposed, &rtnl);
4037 if (r < 0)
4038 return r;
4039
4040 (void) expose_port_execute(rtnl, arg_expose_ports, exposed);
4041 }
4042
4043 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4044
4045 r = pty_forward_new(event, master,
4046 PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY),
4047 &forward);
4048 if (r < 0)
4049 return log_error_errno(r, "Failed to create PTY forwarder: %m");
4050
4051 r = sd_event_loop(event);
4052 if (r < 0)
4053 return log_error_errno(r, "Failed to run event loop: %m");
4054
4055 pty_forward_get_last_char(forward, &last_char);
4056
4057 forward = pty_forward_free(forward);
4058
4059 if (!arg_quiet && last_char != '\n')
4060 putc('\n', stdout);
4061
4062 /* Kill if it is not dead yet anyway */
1d78fea2
LP
4063 if (bus) {
4064 if (arg_register)
4065 terminate_machine(bus, arg_machine);
4066 else if (!arg_keep_unit)
4067 terminate_scope(bus, arg_machine);
4068 }
b0067625
ZJS
4069
4070 /* Normally redundant, but better safe than sorry */
c67b0082 4071 (void) kill(*pid, SIGKILL);
b0067625
ZJS
4072
4073 r = wait_for_container(*pid, &container_status);
4074 *pid = 0;
4075
4076 if (r < 0)
4077 /* We failed to wait for the container, or the container exited abnormally. */
4078 return r;
4079 if (r > 0 || container_status == CONTAINER_TERMINATED) {
27e29a1e
ZJS
4080 /* r > 0 → The container exited with a non-zero status.
4081 * As a special case, we need to replace 133 with a different value,
4082 * because 133 is special-cased in the service file to reboot the container.
4083 * otherwise → The container exited with zero status and a reboot was not requested.
4084 */
2a49b612 4085 if (r == EXIT_FORCE_RESTART)
27e29a1e 4086 r = EXIT_FAILURE; /* replace 133 with the general failure code */
b0067625 4087 *ret = r;
b0067625
ZJS
4088 return 0; /* finito */
4089 }
4090
4091 /* CONTAINER_REBOOTED, loop again */
4092
4093 if (arg_keep_unit) {
4094 /* Special handling if we are running as a service: instead of simply
4095 * restarting the machine we want to restart the entire service, so let's
4096 * inform systemd about this with the special exit code 133. The service
4097 * file uses RestartForceExitStatus=133 so that this results in a full
4098 * nspawn restart. This is necessary since we might have cgroup parameters
4099 * set we want to have flushed out. */
2a49b612
ZJS
4100 *ret = EXIT_FORCE_RESTART;
4101 return 0; /* finito */
b0067625
ZJS
4102 }
4103
4104 expose_port_flush(arg_expose_ports, exposed);
4105
4106 (void) remove_veth_links(veth_name, arg_network_veth_extra);
4107 *veth_created = false;
4108 return 1; /* loop again */
4109}
4110
bf428efb 4111static int initialize_rlimits(void) {
bf428efb
LP
4112 /* The default resource limits the kernel passes to PID 1, as per kernel 4.16. Let's pass our container payload
4113 * the same values as the kernel originally passed to PID 1, in order to minimize differences between host and
4114 * container execution environments. */
4115
4116 static const struct rlimit kernel_defaults[_RLIMIT_MAX] = {
4117 [RLIMIT_AS] = { RLIM_INFINITY, RLIM_INFINITY },
4118 [RLIMIT_CORE] = { 0, RLIM_INFINITY },
4119 [RLIMIT_CPU] = { RLIM_INFINITY, RLIM_INFINITY },
4120 [RLIMIT_DATA] = { RLIM_INFINITY, RLIM_INFINITY },
4121 [RLIMIT_FSIZE] = { RLIM_INFINITY, RLIM_INFINITY },
4122 [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY },
4123 [RLIMIT_MEMLOCK] = { 65536, 65536 },
4124 [RLIMIT_MSGQUEUE] = { 819200, 819200 },
4125 [RLIMIT_NICE] = { 0, 0 },
4126 [RLIMIT_NOFILE] = { 1024, 4096 },
4127 [RLIMIT_RSS] = { RLIM_INFINITY, RLIM_INFINITY },
4128 [RLIMIT_RTPRIO] = { 0, 0 },
4129 [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY },
4130 [RLIMIT_STACK] = { 8388608, RLIM_INFINITY },
4131
4132 /* The kernel scales the default for RLIMIT_NPROC and RLIMIT_SIGPENDING based on the system's amount of
4133 * RAM. To provide best compatibility we'll read these limits off PID 1 instead of hardcoding them
4134 * here. This is safe as we know that PID 1 doesn't change these two limits and thus the original
4135 * kernel's initialization should still be valid during runtime — at least if PID 1 is systemd. Note
4136 * that PID 1 changes a number of other resource limits during early initialization which is why we
4137 * don't read the other limits from PID 1 but prefer the static table above. */
4138 };
4139
4140 int rl;
4141
4142 for (rl = 0; rl < _RLIMIT_MAX; rl++) {
bf428efb
LP
4143 /* Let's only fill in what the user hasn't explicitly configured anyway */
4144 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)) == 0) {
4145 const struct rlimit *v;
4146 struct rlimit buffer;
4147
4148 if (IN_SET(rl, RLIMIT_NPROC, RLIMIT_SIGPENDING)) {
4149 /* For these two let's read the limits off PID 1. See above for an explanation. */
4150
4151 if (prlimit(1, rl, NULL, &buffer) < 0)
4152 return log_error_errno(errno, "Failed to read resource limit RLIMIT_%s of PID 1: %m", rlimit_to_string(rl));
4153
4154 v = &buffer;
4155 } else
4156 v = kernel_defaults + rl;
4157
4158 arg_rlimit[rl] = newdup(struct rlimit, v, 1);
4159 if (!arg_rlimit[rl])
4160 return log_oom();
4161 }
4162
4163 if (DEBUG_LOGGING) {
4164 _cleanup_free_ char *k = NULL;
4165
4166 (void) rlimit_format(arg_rlimit[rl], &k);
4167 log_debug("Setting RLIMIT_%s to %s.", rlimit_to_string(rl), k);
4168 }
4169 }
4170
4171 return 0;
4172}
4173
03cfe0d5 4174int main(int argc, char *argv[]) {
2d845785
LP
4175 _cleanup_free_ char *console = NULL;
4176 _cleanup_close_ int master = -1;
03cfe0d5 4177 _cleanup_fdset_free_ FDSet *fds = NULL;
2d845785 4178 int r, n_fd_passed, ret = EXIT_SUCCESS;
5aa3eba5 4179 char veth_name[IFNAMSIZ] = "";
17cbb288 4180 bool secondary = false, remove_directory = false, remove_image = false;
03cfe0d5 4181 pid_t pid = 0;
03cfe0d5 4182 union in_addr_union exposed = {};
8e766630 4183 _cleanup_(release_lock_file) LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
c67b0082
LP
4184 bool interactive, veth_created = false, remove_tmprootdir = false;
4185 char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
2d845785 4186 _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
18b5886e
LP
4187 _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
4188 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
03cfe0d5
LP
4189
4190 log_parse_environment();
4191 log_open();
415fc41c 4192
7732f92b
LP
4193 /* Make sure rename_process() in the stub init process can work */
4194 saved_argv = argv;
4195 saved_argc = argc;
4196
03cfe0d5
LP
4197 r = parse_argv(argc, argv);
4198 if (r <= 0)
4199 goto finish;
4200
fba868fa
LP
4201 r = must_be_root();
4202 if (r < 0)
03cfe0d5 4203 goto finish;
fba868fa 4204
bf428efb
LP
4205 r = initialize_rlimits();
4206 if (r < 0)
4207 goto finish;
4208
f757855e
LP
4209 r = determine_names();
4210 if (r < 0)
4211 goto finish;
4212
4213 r = load_settings();
4214 if (r < 0)
4215 goto finish;
4216
d5455d2f
LP
4217 parse_environment();
4218
5eee8290
LP
4219 r = cg_unified_flush();
4220 if (r < 0) {
4221 log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
4222 goto finish;
4223 }
4224
f757855e
LP
4225 r = verify_arguments();
4226 if (r < 0)
4227 goto finish;
03cfe0d5 4228
8199d554
LP
4229 r = detect_unified_cgroup_hierarchy_from_environment();
4230 if (r < 0)
4231 goto finish;
4232
2949ff26
LP
4233 /* Ignore SIGPIPE here, because we use splice() on the ptyfwd stuff and that will generate SIGPIPE if
4234 * the result is closed. Note that the container payload child will reset signal mask+handler anyway,
4235 * so just turning this off here means we only turn it off in nspawn itself, not any children. */
4236 (void) ignore_signals(SIGPIPE, -1);
4237
03cfe0d5
LP
4238 n_fd_passed = sd_listen_fds(false);
4239 if (n_fd_passed > 0) {
4240 r = fdset_new_listen_fds(&fds, false);
4241 if (r < 0) {
4242 log_error_errno(r, "Failed to collect file descriptors: %m");
4243 goto finish;
4244 }
4245 }
4246
83e803a9
ZJS
4247 /* The "default" umask. This is appropriate for most file and directory
4248 * operations performed by nspawn, and is the umask that will be used for
4249 * the child. Functions like copy_devnodes() change the umask temporarily. */
4250 umask(0022);
4251
03cfe0d5
LP
4252 if (arg_directory) {
4253 assert(!arg_image);
4254
4255 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
4256 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
4257 r = -EINVAL;
4258 goto finish;
4259 }
4260
4261 if (arg_ephemeral) {
4262 _cleanup_free_ char *np = NULL;
4263
8d4aa2bb 4264 r = chase_symlinks_and_update(&arg_directory, 0);
3f342ec4
LP
4265 if (r < 0)
4266 goto finish;
4267
03cfe0d5
LP
4268 /* If the specified path is a mount point we
4269 * generate the new snapshot immediately
4270 * inside it under a random name. However if
4271 * the specified is not a mount point we
4272 * create the new snapshot in the parent
4273 * directory, just next to it. */
e1873695 4274 r = path_is_mount_point(arg_directory, NULL, 0);
03cfe0d5
LP
4275 if (r < 0) {
4276 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
4277 goto finish;
4278 }
4279 if (r > 0)
770b5ce4 4280 r = tempfn_random_child(arg_directory, "machine.", &np);
03cfe0d5 4281 else
770b5ce4 4282 r = tempfn_random(arg_directory, "machine.", &np);
03cfe0d5 4283 if (r < 0) {
0f3be6ca 4284 log_error_errno(r, "Failed to generate name for directory snapshot: %m");
03cfe0d5
LP
4285 goto finish;
4286 }
4287
4288 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4289 if (r < 0) {
4290 log_error_errno(r, "Failed to lock %s: %m", np);
4291 goto finish;
4292 }
4293
17cbb288
LP
4294 r = btrfs_subvol_snapshot(arg_directory, np,
4295 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
4296 BTRFS_SNAPSHOT_FALLBACK_COPY |
4297 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
4298 BTRFS_SNAPSHOT_RECURSIVE |
4299 BTRFS_SNAPSHOT_QUOTA);
03cfe0d5
LP
4300 if (r < 0) {
4301 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
4302 goto finish;
ec16945e
LP
4303 }
4304
1cc6c93a 4305 free_and_replace(arg_directory, np);
ec16945e 4306
17cbb288 4307 remove_directory = true;
30535c16
LP
4308
4309 } else {
cb638b5e 4310 r = chase_symlinks_and_update(&arg_directory, arg_template ? CHASE_NONEXISTENT : 0);
8d4aa2bb
LP
4311 if (r < 0)
4312 goto finish;
4313
30535c16
LP
4314 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4315 if (r == -EBUSY) {
4316 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
4317 goto finish;
4318 }
4319 if (r < 0) {
4320 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
476b8254 4321 goto finish;
30535c16
LP
4322 }
4323
4324 if (arg_template) {
8d4aa2bb 4325 r = chase_symlinks_and_update(&arg_template, 0);
3f342ec4
LP
4326 if (r < 0)
4327 goto finish;
4328
17cbb288
LP
4329 r = btrfs_subvol_snapshot(arg_template, arg_directory,
4330 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
4331 BTRFS_SNAPSHOT_FALLBACK_COPY |
4332 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
4333 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
4334 BTRFS_SNAPSHOT_RECURSIVE |
4335 BTRFS_SNAPSHOT_QUOTA);
ff6c6cc1
LP
4336 if (r == -EEXIST)
4337 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
4338 "Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
4339 else if (r < 0) {
83521414 4340 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
30535c16 4341 goto finish;
ff6c6cc1
LP
4342 } else
4343 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
4344 "Populated %s from template %s.", arg_directory, arg_template);
30535c16 4345 }
ec16945e
LP
4346 }
4347
7732f92b 4348 if (arg_start_mode == START_BOOT) {
a5201ed6 4349 const char *p;
c9fe05e0 4350
a5201ed6
LP
4351 if (arg_pivot_root_new)
4352 p = prefix_roota(arg_directory, arg_pivot_root_new);
4353 else
4354 p = arg_directory;
c9fe05e0
AR
4355
4356 if (path_is_os_tree(p) <= 0) {
4357 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", p);
ec16945e 4358 r = -EINVAL;
1b9e5b12
LP
4359 goto finish;
4360 }
4361 } else {
c9fe05e0
AR
4362 const char *p, *q;
4363
a5201ed6
LP
4364 if (arg_pivot_root_new)
4365 p = prefix_roota(arg_directory, arg_pivot_root_new);
4366 else
4367 p = arg_directory;
c9fe05e0
AR
4368
4369 q = strjoina(p, "/usr/");
1b9e5b12 4370
c9fe05e0
AR
4371 if (laccess(q, F_OK) < 0) {
4372 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", p);
ec16945e 4373 r = -EINVAL;
1b9e5b12 4374 goto finish;
1b9e5b12
LP
4375 }
4376 }
ec16945e 4377
6b9132a9 4378 } else {
ec16945e
LP
4379 assert(arg_image);
4380 assert(!arg_template);
4381
8d4aa2bb 4382 r = chase_symlinks_and_update(&arg_image, 0);
3f342ec4
LP
4383 if (r < 0)
4384 goto finish;
4385
0f3be6ca
LP
4386 if (arg_ephemeral) {
4387 _cleanup_free_ char *np = NULL;
4388
4389 r = tempfn_random(arg_image, "machine.", &np);
4390 if (r < 0) {
4391 log_error_errno(r, "Failed to generate name for image snapshot: %m");
4392 goto finish;
4393 }
4394
4395 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4396 if (r < 0) {
4397 r = log_error_errno(r, "Failed to create image lock: %m");
4398 goto finish;
4399 }
4400
1c876927 4401 r = copy_file(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600, FS_NOCOW_FL, COPY_REFLINK);
0f3be6ca
LP
4402 if (r < 0) {
4403 r = log_error_errno(r, "Failed to copy image file: %m");
4404 goto finish;
4405 }
4406
1cc6c93a 4407 free_and_replace(arg_image, np);
0f3be6ca
LP
4408
4409 remove_image = true;
4410 } else {
4411 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4412 if (r == -EBUSY) {
4413 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
4414 goto finish;
4415 }
4416 if (r < 0) {
4417 r = log_error_errno(r, "Failed to create image lock: %m");
4418 goto finish;
4419 }
4623e8e6 4420
78ebe980
LP
4421 if (!arg_root_hash) {
4422 r = root_hash_load(arg_image, &arg_root_hash, &arg_root_hash_size);
4423 if (r < 0) {
4424 log_error_errno(r, "Failed to load root hash file for %s: %m", arg_image);
4425 goto finish;
4426 }
4427 }
30535c16
LP
4428 }
4429
c67b0082 4430 if (!mkdtemp(tmprootdir)) {
0f3be6ca 4431 r = log_error_errno(errno, "Failed to create temporary directory: %m");
6b9132a9 4432 goto finish;
1b9e5b12 4433 }
6b9132a9 4434
c67b0082
LP
4435 remove_tmprootdir = true;
4436
4437 arg_directory = strdup(tmprootdir);
1b9e5b12
LP
4438 if (!arg_directory) {
4439 r = log_oom();
4440 goto finish;
6b9132a9 4441 }
88213476 4442
2d845785
LP
4443 r = loop_device_make_by_path(arg_image, arg_read_only ? O_RDONLY : O_RDWR, &loop);
4444 if (r < 0) {
4445 log_error_errno(r, "Failed to set up loopback block device: %m");
842f3b0f
LP
4446 goto finish;
4447 }
1b9e5b12 4448
4526113f 4449 r = dissect_image_and_warn(
e0f9e7bd 4450 loop->fd,
4526113f 4451 arg_image,
e0f9e7bd
LP
4452 arg_root_hash, arg_root_hash_size,
4453 DISSECT_IMAGE_REQUIRE_ROOT,
4454 &dissected_image);
2d845785 4455 if (r == -ENOPKG) {
4526113f 4456 /* dissect_image_and_warn() already printed a brief error message. Expand on that with more details. */
2d845785
LP
4457 log_notice("Note that the disk image needs to\n"
4458 " a) either contain only a single MBR partition of type 0x83 that is marked bootable\n"
4459 " b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n"
4460 " c) or follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n"
4461 " d) or contain a file system without a partition table\n"
4462 "in order to be bootable with systemd-nspawn.");
1b9e5b12 4463 goto finish;
2d845785 4464 }
4526113f 4465 if (r < 0)
842f3b0f 4466 goto finish;
1b9e5b12 4467
4623e8e6
LP
4468 if (!arg_root_hash && dissected_image->can_verity)
4469 log_notice("Note: image %s contains verity information, but no root hash specified! Proceeding without integrity checking.", arg_image);
4470
4471 r = dissected_image_decrypt_interactively(dissected_image, NULL, arg_root_hash, arg_root_hash_size, 0, &decrypted_image);
1b9e5b12
LP
4472 if (r < 0)
4473 goto finish;
0f3be6ca
LP
4474
4475 /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */
4476 if (remove_image && unlink(arg_image) >= 0)
4477 remove_image = false;
842f3b0f 4478 }
842f3b0f 4479
86c0dd4a 4480 r = custom_mount_prepare_all(arg_directory, arg_custom_mounts, arg_n_custom_mounts);
5a8af538
LP
4481 if (r < 0)
4482 goto finish;
4483
03cfe0d5
LP
4484 interactive =
4485 isatty(STDIN_FILENO) > 0 &&
4486 isatty(STDOUT_FILENO) > 0;
9c857b9d 4487
669fc4e5 4488 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NONBLOCK);
db7feb7e 4489 if (master < 0) {
ec16945e 4490 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
a258bf26
LP
4491 goto finish;
4492 }
4493
611b312b
LP
4494 r = ptsname_malloc(master, &console);
4495 if (r < 0) {
4496 r = log_error_errno(r, "Failed to determine tty name: %m");
a258bf26 4497 goto finish;
68b02049
DW
4498 }
4499
4500 if (arg_selinux_apifs_context) {
4501 r = mac_selinux_apply(console, arg_selinux_apifs_context);
4502 if (r < 0)
4503 goto finish;
a258bf26
LP
4504 }
4505
a258bf26 4506 if (unlockpt(master) < 0) {
ec16945e 4507 r = log_error_errno(errno, "Failed to unlock tty: %m");
a258bf26
LP
4508 goto finish;
4509 }
4510
9c857b9d
LP
4511 if (!arg_quiet)
4512 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
4513 arg_machine, arg_image ?: arg_directory);
4514
72c0a2c2 4515 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
a258bf26 4516
66edd963 4517 if (prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) < 0) {
03cfe0d5
LP
4518 r = log_error_errno(errno, "Failed to become subreaper: %m");
4519 goto finish;
4520 }
4521
d87be9b0 4522 for (;;) {
b0067625
ZJS
4523 r = run(master,
4524 console,
2d845785 4525 dissected_image,
b0067625
ZJS
4526 interactive, secondary,
4527 fds,
4528 veth_name, &veth_created,
4529 &exposed,
4530 &pid, &ret);
4531 if (r <= 0)
d87be9b0 4532 break;
d87be9b0 4533 }
88213476
LP
4534
4535finish:
af4ec430 4536 sd_notify(false,
2a49b612
ZJS
4537 r == 0 && ret == EXIT_FORCE_RESTART ? "STOPPING=1\nSTATUS=Restarting..." :
4538 "STOPPING=1\nSTATUS=Terminating...");
af4ec430 4539
9444b1f2 4540 if (pid > 0)
c67b0082 4541 (void) kill(pid, SIGKILL);
88213476 4542
503546da 4543 /* Try to flush whatever is still queued in the pty */
6a0f896b 4544 if (master >= 0) {
1c876927 4545 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, 0);
6a0f896b
LP
4546 master = safe_close(master);
4547 }
4548
4549 if (pid > 0)
4550 (void) wait_for_terminate(pid, NULL);
503546da 4551
50ebcf6c
LP
4552 pager_close();
4553
17cbb288 4554 if (remove_directory && arg_directory) {
ec16945e
LP
4555 int k;
4556
17cbb288 4557 k = rm_rf(arg_directory, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
ec16945e 4558 if (k < 0)
17cbb288 4559 log_warning_errno(k, "Cannot remove '%s', ignoring: %m", arg_directory);
ec16945e
LP
4560 }
4561
0f3be6ca
LP
4562 if (remove_image && arg_image) {
4563 if (unlink(arg_image) < 0)
4564 log_warning_errno(errno, "Can't remove image file '%s', ignoring: %m", arg_image);
4565 }
4566
c67b0082
LP
4567 if (remove_tmprootdir) {
4568 if (rmdir(tmprootdir) < 0)
4569 log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir);
4570 }
4571
785890ac
LP
4572 if (arg_machine) {
4573 const char *p;
4574
63c372cb 4575 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
c6878637 4576 (void) rm_rf(p, REMOVE_ROOT);
785890ac
LP
4577 }
4578
7a8f6325 4579 expose_port_flush(arg_expose_ports, &exposed);
7513c5b8
LP
4580
4581 if (veth_created)
4582 (void) remove_veth_links(veth_name, arg_network_veth_extra);
22b28dfd 4583 (void) remove_bridge(arg_network_zone);
f757855e 4584
04d391da 4585 free(arg_directory);
ec16945e
LP
4586 free(arg_template);
4587 free(arg_image);
7027ff61 4588 free(arg_machine);
3a9530e5 4589 free(arg_hostname);
c74e630d 4590 free(arg_user);
b53ede69
PW
4591 free(arg_pivot_root_new);
4592 free(arg_pivot_root_old);
5f932eb9 4593 free(arg_chdir);
c74e630d 4594 strv_free(arg_setenv);
f757855e 4595 free(arg_network_bridge);
c74e630d
LP
4596 strv_free(arg_network_interfaces);
4597 strv_free(arg_network_macvlan);
4bbfe7ad 4598 strv_free(arg_network_ipvlan);
f6d6bad1 4599 strv_free(arg_network_veth_extra);
f757855e 4600 strv_free(arg_parameters);
df1fac6d
LP
4601 free(arg_network_zone);
4602 free(arg_network_namespace_path);
4603 strv_free(arg_property);
f757855e
LP
4604 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
4605 expose_port_free_all(arg_expose_ports);
4623e8e6 4606 free(arg_root_hash);
bf428efb 4607 rlimit_free_all(arg_rlimit);
df1fac6d
LP
4608 strv_free(arg_syscall_whitelist);
4609 strv_free(arg_syscall_blacklist);
d107bb7d 4610 arg_cpuset = cpu_set_mfree(arg_cpuset);
6d0b55c2 4611
ec16945e 4612 return r < 0 ? EXIT_FAILURE : ret;
88213476 4613}