]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
nspawn: (void)ify more stuff
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
53e1b683 1/* SPDX-License-Identifier: LGPL-2.1+ */
88213476 2
349cc4a5 3#if HAVE_BLKID
6b5cf3ea 4#include <blkid.h>
8fe0087e 5#endif
88213476 6#include <errno.h>
88213476 7#include <getopt.h>
0e7ac751 8#include <grp.h>
503f480f 9#include <linux/fs.h>
1b9e5b12 10#include <linux/loop.h>
0e7ac751 11#include <pwd.h>
8fe0087e 12#include <sched.h>
349cc4a5 13#if HAVE_SELINUX
8fe0087e 14#include <selinux/selinux.h>
1b9e5b12 15#endif
8fe0087e
LP
16#include <signal.h>
17#include <stdio.h>
18#include <stdlib.h>
19#include <string.h>
20#include <sys/file.h>
8fe0087e
LP
21#include <sys/personality.h>
22#include <sys/prctl.h>
23#include <sys/types.h>
6916b164 24#include <sys/wait.h>
8fe0087e 25#include <unistd.h>
1b9e5b12 26
b053cd5f 27#include "sd-bus.h"
1f0cd86b 28#include "sd-daemon.h"
1f0cd86b 29#include "sd-id128.h"
8fe0087e 30
b5efdb8a 31#include "alloc-util.h"
8fe0087e
LP
32#include "barrier.h"
33#include "base-filesystem.h"
34#include "blkid-util.h"
35#include "btrfs-util.h"
b8ea7a6e 36#include "bus-error.h"
b053cd5f 37#include "bus-util.h"
8fe0087e 38#include "cap-list.h"
430f0182 39#include "capability-util.h"
04d391da 40#include "cgroup-util.h"
8fe0087e 41#include "copy.h"
d107bb7d 42#include "cpu-set-util.h"
4fc9982c 43#include "dev-setup.h"
2d845785 44#include "dissect-image.h"
8fe0087e 45#include "env-util.h"
3ffd4af2 46#include "fd-util.h"
842f3b0f 47#include "fdset.h"
a5c32cff 48#include "fileio.h"
f97b34a6 49#include "format-util.h"
f4f15635 50#include "fs-util.h"
1b9e5b12 51#include "gpt.h"
4623e8e6 52#include "hexdecoct.h"
8fe0087e 53#include "hostname-util.h"
910fd145 54#include "id128-util.h"
8fe0087e 55#include "log.h"
2d845785 56#include "loop-util.h"
8fe0087e 57#include "loopback-setup.h"
1b9cebf6 58#include "machine-image.h"
8fe0087e
LP
59#include "macro.h"
60#include "missing.h"
61#include "mkdir.h"
4349cd7c 62#include "mount-util.h"
049af8ad 63#include "mountpoint-util.h"
0cb8e3d1 64#include "namespace-util.h"
8fe0087e 65#include "netlink-util.h"
07630cea 66#include "nspawn-cgroup.h"
3603efde 67#include "nspawn-def.h"
07630cea
LP
68#include "nspawn-expose-ports.h"
69#include "nspawn-mount.h"
70#include "nspawn-network.h"
7336138e 71#include "nspawn-patch-uid.h"
07630cea 72#include "nspawn-register.h"
910fd145 73#include "nspawn-seccomp.h"
07630cea
LP
74#include "nspawn-settings.h"
75#include "nspawn-setuid.h"
7732f92b 76#include "nspawn-stub-pid1.h"
d8b4d14d 77#include "nulstr-util.h"
d58ad743 78#include "os-util.h"
50ebcf6c 79#include "pager.h"
6bedfcbb 80#include "parse-util.h"
8fe0087e 81#include "path-util.h"
294bf0c3 82#include "pretty-print.h"
0b452006 83#include "process-util.h"
8fe0087e
LP
84#include "ptyfwd.h"
85#include "random-util.h"
8869a0b4 86#include "raw-clone.h"
bf428efb 87#include "rlimit-util.h"
8fe0087e 88#include "rm-rf.h"
68b02049 89#include "selinux-util.h"
8fe0087e 90#include "signal-util.h"
2583fbea 91#include "socket-util.h"
8fcde012 92#include "stat-util.h"
15a5e950 93#include "stdio-util.h"
5c828e66 94#include "string-table.h"
07630cea 95#include "string-util.h"
8fe0087e
LP
96#include "strv.h"
97#include "terminal-util.h"
e4de7287 98#include "tmpfile-util.h"
affb60b1 99#include "umask-util.h"
b1d4f8e1 100#include "user-util.h"
8fe0087e 101#include "util.h"
e9642be2 102
62b1e758
YW
/* Path of the static resolv.conf below the systemd library directory: /lib on builds with a split /usr,
 * /usr/lib otherwise. */
#if HAVE_SPLIT_USR
#  define STATIC_RESOLV_CONF "/lib/systemd/resolv.conf"
#else
#  define STATIC_RESOLV_CONF "/usr/lib/systemd/resolv.conf"
#endif

/* nspawn listens on the socket located at NSPAWN_NOTIFY_SOCKET_PATH. The path is relative to the container;
 * the init process inside the container can send messages to nspawn over it, following the sd_notify(3)
 * protocol. */
#define NSPAWN_NOTIFY_SOCKET_PATH "/run/systemd/nspawn/notify"

/* NOTE(review): judging by the name, the exit status that forces a restart of the container — confirm
 * against the code that consumes it. */
#define EXIT_FORCE_RESTART 133
115
113cea80
DH
/* Ways in which a container run can end. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,   /* the container terminated */
        CONTAINER_REBOOTED      /* the container rebooted */
} ContainerStatus;
120
88213476 121static char *arg_directory = NULL;
ec16945e 122static char *arg_template = NULL;
5f932eb9 123static char *arg_chdir = NULL;
b53ede69
PW
124static char *arg_pivot_root_new = NULL;
125static char *arg_pivot_root_old = NULL;
687d0825 126static char *arg_user = NULL;
9444b1f2 127static sd_id128_t arg_uuid = {};
3a9530e5
LP
128static char *arg_machine = NULL; /* The name used by the host to refer to this */
129static char *arg_hostname = NULL; /* The name the payload sees by default */
c74e630d
LP
130static const char *arg_selinux_context = NULL;
131static const char *arg_selinux_apifs_context = NULL;
9444b1f2 132static const char *arg_slice = NULL;
ff01d048 133static bool arg_private_network = false;
bc2f673e 134static bool arg_read_only = false;
7732f92b 135static StartMode arg_start_mode = START_PID1;
ec16945e 136static bool arg_ephemeral = false;
57fb9fb5 137static LinkJournal arg_link_journal = LINK_AUTO;
574edc90 138static bool arg_link_journal_try = false;
520e0d54 139static uint64_t arg_caps_retain =
50b52222
LP
140 (1ULL << CAP_AUDIT_CONTROL) |
141 (1ULL << CAP_AUDIT_WRITE) |
5076f0cc
LP
142 (1ULL << CAP_CHOWN) |
143 (1ULL << CAP_DAC_OVERRIDE) |
144 (1ULL << CAP_DAC_READ_SEARCH) |
145 (1ULL << CAP_FOWNER) |
146 (1ULL << CAP_FSETID) |
147 (1ULL << CAP_IPC_OWNER) |
148 (1ULL << CAP_KILL) |
149 (1ULL << CAP_LEASE) |
150 (1ULL << CAP_LINUX_IMMUTABLE) |
50b52222 151 (1ULL << CAP_MKNOD) |
5076f0cc
LP
152 (1ULL << CAP_NET_BIND_SERVICE) |
153 (1ULL << CAP_NET_BROADCAST) |
154 (1ULL << CAP_NET_RAW) |
5076f0cc 155 (1ULL << CAP_SETFCAP) |
50b52222 156 (1ULL << CAP_SETGID) |
5076f0cc
LP
157 (1ULL << CAP_SETPCAP) |
158 (1ULL << CAP_SETUID) |
159 (1ULL << CAP_SYS_ADMIN) |
50b52222 160 (1ULL << CAP_SYS_BOOT) |
5076f0cc
LP
161 (1ULL << CAP_SYS_CHROOT) |
162 (1ULL << CAP_SYS_NICE) |
163 (1ULL << CAP_SYS_PTRACE) |
d87be9b0 164 (1ULL << CAP_SYS_RESOURCE) |
50b52222 165 (1ULL << CAP_SYS_TTY_CONFIG);
5a8af538 166static CustomMount *arg_custom_mounts = NULL;
88614c8a 167static size_t arg_n_custom_mounts = 0;
f4889f65 168static char **arg_setenv = NULL;
284c0b91 169static bool arg_quiet = false;
eb91eb18 170static bool arg_register = true;
89f7c846 171static bool arg_keep_unit = false;
aa28aefe 172static char **arg_network_interfaces = NULL;
c74e630d 173static char **arg_network_macvlan = NULL;
4bbfe7ad 174static char **arg_network_ipvlan = NULL;
69c79d3c 175static bool arg_network_veth = false;
f6d6bad1 176static char **arg_network_veth_extra = NULL;
f757855e 177static char *arg_network_bridge = NULL;
22b28dfd 178static char *arg_network_zone = NULL;
d7bea6b6 179static char *arg_network_namespace_path = NULL;
050f7277 180static unsigned long arg_personality = PERSONALITY_INVALID;
ec16945e 181static char *arg_image = NULL;
f757855e 182static VolatileMode arg_volatile_mode = VOLATILE_NO;
6d0b55c2 183static ExposePort *arg_expose_ports = NULL;
f36933fe 184static char **arg_property = NULL;
0de7acce 185static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
6dac160c 186static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
0de7acce 187static bool arg_userns_chown = false;
c6c8f6e2 188static int arg_kill_signal = 0;
5da38d07 189static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN;
f757855e
LP
190static SettingsMask arg_settings_mask = 0;
191static int arg_settings_trusted = -1;
192static char **arg_parameters = NULL;
6aadfa4c 193static const char *arg_container_service_name = "systemd-nspawn";
9c1e04d0 194static bool arg_notify_ready = false;
5a8ff0e6 195static bool arg_use_cgns = true;
0c582db0 196static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
1099ceeb 197static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_TMPFS_TMP;
4623e8e6
LP
198static void *arg_root_hash = NULL;
199static size_t arg_root_hash_size = 0;
960e4569
LP
200static char **arg_syscall_whitelist = NULL;
201static char **arg_syscall_blacklist = NULL;
bf428efb 202static struct rlimit *arg_rlimit[_RLIMIT_MAX] = {};
66edd963 203static bool arg_no_new_privileges = false;
81f345df
LP
204static int arg_oom_score_adjust = 0;
205static bool arg_oom_score_adjust_set = false;
d107bb7d
LP
206static cpu_set_t *arg_cpuset = NULL;
207static unsigned arg_cpuset_ncpus = 0;
09d423e9 208static ResolvConfMode arg_resolv_conf = RESOLV_CONF_AUTO;
1688841f 209static TimezoneMode arg_timezone = TIMEZONE_AUTO;
88213476 210
37ec0fdd
LP
211static int help(void) {
212 _cleanup_free_ char *link = NULL;
213 int r;
214
0221d68a 215 (void) pager_open(false);
50ebcf6c 216
37ec0fdd
LP
217 r = terminal_urlify_man("systemd-nspawn", "1", &link);
218 if (r < 0)
219 return log_oom();
220
88213476 221 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
a7e2e50d 222 "Spawn a command or OS in a light-weight container.\n\n"
a8828ed9
DW
223 " -h --help Show this help\n"
224 " --version Print version string\n"
69c79d3c 225 " -q --quiet Do not show status information\n"
1b9e5b12 226 " -D --directory=PATH Root directory for the container\n"
ec16945e
LP
227 " --template=PATH Initialize root directory from template directory,\n"
228 " if missing\n"
229 " -x --ephemeral Run container with snapshot of root directory, and\n"
230 " remove it after exit\n"
231 " -i --image=PATH File system device or disk image for the container\n"
4623e8e6 232 " --root-hash=HASH Specify verity root hash\n"
7732f92b 233 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
a8828ed9 234 " -b --boot Boot up full system (i.e. invoke init)\n"
5f932eb9 235 " --chdir=PATH Set working directory in the container\n"
b53ede69
PW
236 " --pivot-root=PATH[:PATH]\n"
237 " Pivot root to given directory in the container\n"
a8828ed9 238 " -u --user=USER Run the command under specified user or uid\n"
a8828ed9 239 " -M --machine=NAME Set the machine name for the container\n"
3a9530e5 240 " --hostname=NAME Override the hostname for the container\n"
69c79d3c 241 " --uuid=UUID Set a specific machine UUID for the container\n"
a8828ed9 242 " -S --slice=SLICE Place the container in the specified slice\n"
f36933fe 243 " --property=NAME=VALUE Set scope unit property\n"
90b4a64d 244 " -U --private-users=pick Run within user namespace, autoselect UID/GID range\n"
03cfe0d5 245 " --private-users[=UIDBASE[:NUIDS]]\n"
90b4a64d 246 " Similar, but with user configured UID/GID range\n"
24597ee0 247 " --private-users-chown Adjust OS tree ownership to private UID/GID range\n"
69c79d3c
LP
248 " --private-network Disable network in container\n"
249 " --network-interface=INTERFACE\n"
250 " Assign an existing network interface to the\n"
251 " container\n"
c74e630d
LP
252 " --network-macvlan=INTERFACE\n"
253 " Create a macvlan network interface based on an\n"
254 " existing network interface to the container\n"
4bbfe7ad
TG
255 " --network-ipvlan=INTERFACE\n"
256 " Create a ipvlan network interface based on an\n"
257 " existing network interface to the container\n"
a8eaaee7 258 " -n --network-veth Add a virtual Ethernet connection between host\n"
69c79d3c 259 " and container\n"
f6d6bad1
LP
260 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
261 " Add an additional virtual Ethernet link between\n"
262 " host and container\n"
ab046dde 263 " --network-bridge=INTERFACE\n"
90b4a64d
ZJS
264 " Add a virtual Ethernet connection to the container\n"
265 " and attach it to an existing bridge on the host\n"
266 " --network-zone=NAME Similar, but attach the new interface to an\n"
267 " an automatically managed bridge interface\n"
d7bea6b6
DP
268 " --network-namespace-path=PATH\n"
269 " Set network namespace to the one represented by\n"
270 " the specified kernel namespace file node\n"
6d0b55c2 271 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
ab5e3a1b 272 " Expose a container IP port on the host\n"
82adf6af
LP
273 " -Z --selinux-context=SECLABEL\n"
274 " Set the SELinux security context to be used by\n"
275 " processes in the container\n"
276 " -L --selinux-apifs-context=SECLABEL\n"
277 " Set the SELinux security context to be used by\n"
278 " API/tmpfs file systems in the container\n"
a8828ed9
DW
279 " --capability=CAP In addition to the default, retain specified\n"
280 " capability\n"
281 " --drop-capability=CAP Drop the specified capability from the default set\n"
960e4569
LP
282 " --system-call-filter=LIST|~LIST\n"
283 " Permit/prohibit specific system calls\n"
bf428efb 284 " --rlimit=NAME=LIMIT Set a resource limit for the payload\n"
81f345df
LP
285 " --oom-score-adjust=VALUE\n"
286 " Adjust the OOM score value for the payload\n"
d107bb7d 287 " --cpu-affinity=CPUS Adjust the CPU affinity of the container\n"
c6c8f6e2 288 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
2b26a728
LP
289 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
290 " host, try-guest, try-host\n"
574edc90 291 " -j Equivalent to --link-journal=try-guest\n"
09d423e9 292 " --resolv-conf=MODE Select mode of /etc/resolv.conf initialization\n"
1688841f 293 " --timezone=MODE Select mode of /etc/localtime initialization\n"
69c79d3c 294 " --read-only Mount the root directory read-only\n"
5e5bfa6e
EY
295 " --bind=PATH[:PATH[:OPTIONS]]\n"
296 " Bind mount a file or directory from the host into\n"
a8828ed9 297 " the container\n"
5e5bfa6e
EY
298 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
299 " Similar, but creates a read-only bind mount\n"
06c17c39 300 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
5a8af538
LP
301 " --overlay=PATH[:PATH...]:PATH\n"
302 " Create an overlay mount from the host to \n"
303 " the container\n"
304 " --overlay-ro=PATH[:PATH...]:PATH\n"
305 " Similar, but creates a read-only overlay mount\n"
a5f1cb3b 306 " -E --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
eb91eb18 307 " --register=BOOLEAN Register container as machine\n"
89f7c846 308 " --keep-unit Do not register a scope for the machine, reuse\n"
4d9f07b4 309 " the service unit nspawn is running in\n"
6d0b55c2 310 " --volatile[=MODE] Run the system in volatile mode\n"
f757855e 311 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
90b4a64d 312 " --notify-ready=BOOLEAN Receive notifications from the child init process\n"
37ec0fdd
LP
313 "\nSee the %s for details.\n"
314 , program_invocation_short_name
315 , link
316 );
317
318 return 0;
88213476
LP
319}
320
86c0dd4a 321static int custom_mount_check_all(void) {
88614c8a 322 size_t i;
5a8af538 323
5a8af538
LP
324 for (i = 0; i < arg_n_custom_mounts; i++) {
325 CustomMount *m = &arg_custom_mounts[i];
326
0de7acce 327 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
baaa35ad
ZJS
328 if (arg_userns_chown)
329 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
330 "--private-users-chown may not be combined with custom root mounts.");
331 else if (arg_uid_shift == UID_INVALID)
332 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
333 "--private-users with automatic UID shift may not be combined with custom root mounts.");
825d5287 334 }
5a8af538
LP
335 }
336
337 return 0;
338}
339
8199d554 340static int detect_unified_cgroup_hierarchy_from_environment(void) {
efdb0237 341 const char *e;
415fc41c 342 int r;
5da38d07 343
efdb0237
LP
344 /* Allow the user to control whether the unified hierarchy is used */
345 e = getenv("UNIFIED_CGROUP_HIERARCHY");
346 if (e) {
347 r = parse_boolean(e);
348 if (r < 0)
349 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
5da38d07
TH
350 if (r > 0)
351 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
352 else
353 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237
LP
354 }
355
8199d554
LP
356 return 0;
357}
358
359static int detect_unified_cgroup_hierarchy_from_image(const char *directory) {
360 int r;
361
362 /* Let's inherit the mode to use from the host system, but let's take into consideration what systemd in the
363 * image actually supports. */
b4cccbc1
LP
364 r = cg_all_unified();
365 if (r < 0)
366 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
367 if (r > 0) {
a8725a06
ZJS
368 /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
369 * routine only detects 231, so we'll have a false negative here for 230. */
370 r = systemd_installation_has_version(directory, 230);
371 if (r < 0)
372 return log_error_errno(r, "Failed to determine systemd version in container: %m");
373 if (r > 0)
374 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
375 else
376 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
c22800e4 377 } else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
2977724b
TH
378 /* Mixed cgroup hierarchy support was added in 233 */
379 r = systemd_installation_has_version(directory, 233);
0fd9563f
ZJS
380 if (r < 0)
381 return log_error_errno(r, "Failed to determine systemd version in container: %m");
382 if (r > 0)
383 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
384 else
385 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
386 } else
5da38d07 387 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237 388
8199d554
LP
389 log_debug("Using %s hierarchy for container.",
390 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_NONE ? "legacy" :
391 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_SYSTEMD ? "hybrid" : "unified");
392
efdb0237
LP
393 return 0;
394}
395
0c582db0
LB
396static void parse_share_ns_env(const char *name, unsigned long ns_flag) {
397 int r;
398
399 r = getenv_bool(name);
400 if (r == -ENXIO)
401 return;
402 if (r < 0)
403 log_warning_errno(r, "Failed to parse %s from environment, defaulting to false.", name);
404 arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
405}
406
4f086aab 407static void parse_mount_settings_env(void) {
4f086aab 408 const char *e;
1099ceeb
LP
409 int r;
410
411 r = getenv_bool("SYSTEMD_NSPAWN_TMPFS_TMP");
412 if (r >= 0)
413 SET_FLAG(arg_mount_settings, MOUNT_APPLY_TMPFS_TMP, r > 0);
414 else if (r != -ENXIO)
415 log_warning_errno(r, "Failed to parse $SYSTEMD_NSPAWN_TMPFS_TMP, ignoring: %m");
4f086aab
SU
416
417 e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
418 if (!e)
419 return;
420
421 if (streq(e, "network")) {
422 arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS;
423 return;
424 }
425
426 r = parse_boolean(e);
427 if (r < 0) {
428 log_warning_errno(r, "Failed to parse SYSTEMD_NSPAWN_API_VFS_WRITABLE from environment, ignoring.");
429 return;
ab8ee0f2 430 }
4f086aab 431
ab8ee0f2
ZJS
432 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_RO, r == 0);
433 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_NETNS, false);
4f086aab
SU
434}
435
d5455d2f
LP
436static void parse_environment(void) {
437 const char *e;
438 int r;
439
440 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
441 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
442 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
443 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
444
445 parse_mount_settings_env();
446
489fae52
ZJS
447 /* SYSTEMD_NSPAWN_USE_CGNS=0 can be used to disable CLONE_NEWCGROUP use,
448 * even if it is supported. If not supported, it has no effect. */
d5455d2f 449 r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
489fae52
ZJS
450 if (r == 0 || !cg_ns_supported())
451 arg_use_cgns = false;
d5455d2f
LP
452
453 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
454 if (e)
455 arg_container_service_name = e;
456
457 detect_unified_cgroup_hierarchy_from_environment();
458}
459
88213476 460static int parse_argv(int argc, char *argv[]) {
a41fe3a2 461 enum {
acbeb427
ZJS
462 ARG_VERSION = 0x100,
463 ARG_PRIVATE_NETWORK,
bc2f673e 464 ARG_UUID,
5076f0cc 465 ARG_READ_ONLY,
57fb9fb5 466 ARG_CAPABILITY,
420c7379 467 ARG_DROP_CAPABILITY,
17fe0523
LP
468 ARG_LINK_JOURNAL,
469 ARG_BIND,
f4889f65 470 ARG_BIND_RO,
06c17c39 471 ARG_TMPFS,
5a8af538
LP
472 ARG_OVERLAY,
473 ARG_OVERLAY_RO,
eb91eb18 474 ARG_SHARE_SYSTEM,
89f7c846 475 ARG_REGISTER,
aa28aefe 476 ARG_KEEP_UNIT,
69c79d3c 477 ARG_NETWORK_INTERFACE,
c74e630d 478 ARG_NETWORK_MACVLAN,
4bbfe7ad 479 ARG_NETWORK_IPVLAN,
ab046dde 480 ARG_NETWORK_BRIDGE,
22b28dfd 481 ARG_NETWORK_ZONE,
f6d6bad1 482 ARG_NETWORK_VETH_EXTRA,
d7bea6b6 483 ARG_NETWORK_NAMESPACE_PATH,
6afc95b7 484 ARG_PERSONALITY,
4d9f07b4 485 ARG_VOLATILE,
ec16945e 486 ARG_TEMPLATE,
f36933fe 487 ARG_PROPERTY,
6dac160c 488 ARG_PRIVATE_USERS,
c6c8f6e2 489 ARG_KILL_SIGNAL,
f757855e 490 ARG_SETTINGS,
5f932eb9 491 ARG_CHDIR,
b53ede69 492 ARG_PIVOT_ROOT,
7336138e 493 ARG_PRIVATE_USERS_CHOWN,
9c1e04d0 494 ARG_NOTIFY_READY,
4623e8e6 495 ARG_ROOT_HASH,
960e4569 496 ARG_SYSTEM_CALL_FILTER,
bf428efb 497 ARG_RLIMIT,
3a9530e5 498 ARG_HOSTNAME,
66edd963 499 ARG_NO_NEW_PRIVILEGES,
81f345df 500 ARG_OOM_SCORE_ADJUST,
d107bb7d 501 ARG_CPU_AFFINITY,
09d423e9 502 ARG_RESOLV_CONF,
1688841f 503 ARG_TIMEZONE,
a41fe3a2
LP
504 };
505
88213476 506 static const struct option options[] = {
d7bea6b6
DP
507 { "help", no_argument, NULL, 'h' },
508 { "version", no_argument, NULL, ARG_VERSION },
509 { "directory", required_argument, NULL, 'D' },
510 { "template", required_argument, NULL, ARG_TEMPLATE },
511 { "ephemeral", no_argument, NULL, 'x' },
512 { "user", required_argument, NULL, 'u' },
513 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
514 { "as-pid2", no_argument, NULL, 'a' },
515 { "boot", no_argument, NULL, 'b' },
516 { "uuid", required_argument, NULL, ARG_UUID },
517 { "read-only", no_argument, NULL, ARG_READ_ONLY },
518 { "capability", required_argument, NULL, ARG_CAPABILITY },
519 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
66edd963 520 { "no-new-privileges", required_argument, NULL, ARG_NO_NEW_PRIVILEGES },
d7bea6b6
DP
521 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
522 { "bind", required_argument, NULL, ARG_BIND },
523 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
524 { "tmpfs", required_argument, NULL, ARG_TMPFS },
525 { "overlay", required_argument, NULL, ARG_OVERLAY },
526 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
527 { "machine", required_argument, NULL, 'M' },
3a9530e5 528 { "hostname", required_argument, NULL, ARG_HOSTNAME },
d7bea6b6
DP
529 { "slice", required_argument, NULL, 'S' },
530 { "setenv", required_argument, NULL, 'E' },
531 { "selinux-context", required_argument, NULL, 'Z' },
532 { "selinux-apifs-context", required_argument, NULL, 'L' },
533 { "quiet", no_argument, NULL, 'q' },
534 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */
535 { "register", required_argument, NULL, ARG_REGISTER },
536 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
537 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
538 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
539 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
540 { "network-veth", no_argument, NULL, 'n' },
541 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA },
542 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
543 { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
544 { "network-namespace-path", required_argument, NULL, ARG_NETWORK_NAMESPACE_PATH },
545 { "personality", required_argument, NULL, ARG_PERSONALITY },
546 { "image", required_argument, NULL, 'i' },
547 { "volatile", optional_argument, NULL, ARG_VOLATILE },
548 { "port", required_argument, NULL, 'p' },
549 { "property", required_argument, NULL, ARG_PROPERTY },
550 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
551 { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN },
552 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
553 { "settings", required_argument, NULL, ARG_SETTINGS },
554 { "chdir", required_argument, NULL, ARG_CHDIR },
555 { "pivot-root", required_argument, NULL, ARG_PIVOT_ROOT },
556 { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
557 { "root-hash", required_argument, NULL, ARG_ROOT_HASH },
558 { "system-call-filter", required_argument, NULL, ARG_SYSTEM_CALL_FILTER },
bf428efb 559 { "rlimit", required_argument, NULL, ARG_RLIMIT },
81f345df 560 { "oom-score-adjust", required_argument, NULL, ARG_OOM_SCORE_ADJUST },
d107bb7d 561 { "cpu-affinity", required_argument, NULL, ARG_CPU_AFFINITY },
09d423e9 562 { "resolv-conf", required_argument, NULL, ARG_RESOLV_CONF },
1688841f 563 { "timezone", required_argument, NULL, ARG_TIMEZONE },
eb9da376 564 {}
88213476
LP
565 };
566
9444b1f2 567 int c, r;
d5455d2f 568 const char *p;
a42c8b54 569 uint64_t plus = 0, minus = 0;
f757855e 570 bool mask_all_settings = false, mask_no_settings = false;
88213476
LP
571
572 assert(argc >= 0);
573 assert(argv);
574
2e1f244e 575 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nUE:", options, NULL)) >= 0)
88213476
LP
576 switch (c) {
577
578 case 'h':
37ec0fdd 579 return help();
88213476 580
acbeb427 581 case ARG_VERSION:
3f6fd1ba 582 return version();
acbeb427 583
88213476 584 case 'D':
0f03c2a4 585 r = parse_path_argument_and_warn(optarg, false, &arg_directory);
ec16945e 586 if (r < 0)
0f03c2a4 587 return r;
ec16945e
LP
588 break;
589
590 case ARG_TEMPLATE:
0f03c2a4 591 r = parse_path_argument_and_warn(optarg, false, &arg_template);
ec16945e 592 if (r < 0)
0f03c2a4 593 return r;
88213476
LP
594 break;
595
1b9e5b12 596 case 'i':
0f03c2a4 597 r = parse_path_argument_and_warn(optarg, false, &arg_image);
ec16945e 598 if (r < 0)
0f03c2a4 599 return r;
ec16945e
LP
600 break;
601
602 case 'x':
603 arg_ephemeral = true;
a2f577fc 604 arg_settings_mask |= SETTING_EPHEMERAL;
1b9e5b12
LP
605 break;
606
687d0825 607 case 'u':
2fc09a9c
DM
608 r = free_and_strdup(&arg_user, optarg);
609 if (r < 0)
7027ff61 610 return log_oom();
687d0825 611
f757855e 612 arg_settings_mask |= SETTING_USER;
687d0825
MV
613 break;
614
22b28dfd
LP
615 case ARG_NETWORK_ZONE: {
616 char *j;
617
618 j = strappend("vz-", optarg);
619 if (!j)
620 return log_oom();
621
622 if (!ifname_valid(j)) {
623 log_error("Network zone name not valid: %s", j);
624 free(j);
625 return -EINVAL;
626 }
627
df1fac6d 628 free_and_replace(arg_network_zone, j);
22b28dfd
LP
629
630 arg_network_veth = true;
631 arg_private_network = true;
632 arg_settings_mask |= SETTING_NETWORK;
633 break;
634 }
635
ab046dde 636 case ARG_NETWORK_BRIDGE:
ef76dff2 637
baaa35ad
ZJS
638 if (!ifname_valid(optarg))
639 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
640 "Bridge interface name not valid: %s", optarg);
ef76dff2 641
f757855e
LP
642 r = free_and_strdup(&arg_network_bridge, optarg);
643 if (r < 0)
644 return log_oom();
ab046dde 645
4831981d 646 _fallthrough_;
0dfaa006 647 case 'n':
69c79d3c
LP
648 arg_network_veth = true;
649 arg_private_network = true;
f757855e 650 arg_settings_mask |= SETTING_NETWORK;
69c79d3c
LP
651 break;
652
f6d6bad1
LP
653 case ARG_NETWORK_VETH_EXTRA:
654 r = veth_extra_parse(&arg_network_veth_extra, optarg);
655 if (r < 0)
656 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
657
658 arg_private_network = true;
659 arg_settings_mask |= SETTING_NETWORK;
660 break;
661
aa28aefe 662 case ARG_NETWORK_INTERFACE:
baaa35ad
ZJS
663 if (!ifname_valid(optarg))
664 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
665 "Network interface name not valid: %s", optarg);
ef76dff2 666
c74e630d
LP
667 if (strv_extend(&arg_network_interfaces, optarg) < 0)
668 return log_oom();
669
670 arg_private_network = true;
f757855e 671 arg_settings_mask |= SETTING_NETWORK;
c74e630d
LP
672 break;
673
674 case ARG_NETWORK_MACVLAN:
ef76dff2 675
baaa35ad
ZJS
676 if (!ifname_valid(optarg))
677 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
678 "MACVLAN network interface name not valid: %s", optarg);
ef76dff2 679
c74e630d 680 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
681 return log_oom();
682
4bbfe7ad 683 arg_private_network = true;
f757855e 684 arg_settings_mask |= SETTING_NETWORK;
4bbfe7ad
TG
685 break;
686
687 case ARG_NETWORK_IPVLAN:
ef76dff2 688
baaa35ad
ZJS
689 if (!ifname_valid(optarg))
690 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
691 "IPVLAN network interface name not valid: %s", optarg);
ef76dff2 692
4bbfe7ad
TG
693 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
694 return log_oom();
695
4831981d 696 _fallthrough_;
ff01d048
LP
697 case ARG_PRIVATE_NETWORK:
698 arg_private_network = true;
f757855e 699 arg_settings_mask |= SETTING_NETWORK;
a41fe3a2
LP
700 break;
701
d7bea6b6
DP
702 case ARG_NETWORK_NAMESPACE_PATH:
703 r = parse_path_argument_and_warn(optarg, false, &arg_network_namespace_path);
704 if (r < 0)
705 return r;
706
707 break;
708
0f0dbc46 709 case 'b':
baaa35ad
ZJS
710 if (arg_start_mode == START_PID2)
711 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
712 "--boot and --as-pid2 may not be combined.");
7732f92b
LP
713
714 arg_start_mode = START_BOOT;
715 arg_settings_mask |= SETTING_START_MODE;
716 break;
717
718 case 'a':
baaa35ad
ZJS
719 if (arg_start_mode == START_BOOT)
720 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
721 "--boot and --as-pid2 may not be combined.");
7732f92b
LP
722
723 arg_start_mode = START_PID2;
724 arg_settings_mask |= SETTING_START_MODE;
0f0dbc46
LP
725 break;
726
144f0fc0 727 case ARG_UUID:
9444b1f2 728 r = sd_id128_from_string(optarg, &arg_uuid);
317feb4d
LP
729 if (r < 0)
730 return log_error_errno(r, "Invalid UUID: %s", optarg);
731
baaa35ad
ZJS
732 if (sd_id128_is_null(arg_uuid))
733 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
734 "Machine UUID may not be all zeroes.");
f757855e
LP
735
736 arg_settings_mask |= SETTING_MACHINE_ID;
9444b1f2 737 break;
aa96c6cb 738
9444b1f2 739 case 'S':
c74e630d 740 arg_slice = optarg;
144f0fc0
LP
741 break;
742
7027ff61 743 case 'M':
c1521918 744 if (isempty(optarg))
97b11eed 745 arg_machine = mfree(arg_machine);
c1521918 746 else {
baaa35ad
ZJS
747 if (!machine_name_is_valid(optarg))
748 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
749 "Invalid machine name: %s", optarg);
7027ff61 750
0c3c4284
LP
751 r = free_and_strdup(&arg_machine, optarg);
752 if (r < 0)
eb91eb18 753 return log_oom();
eb91eb18 754 }
9ce6d1b3 755 break;
7027ff61 756
3a9530e5
LP
757 case ARG_HOSTNAME:
758 if (isempty(optarg))
759 arg_hostname = mfree(arg_hostname);
760 else {
baaa35ad
ZJS
761 if (!hostname_is_valid(optarg, false))
762 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
763 "Invalid hostname: %s", optarg);
3a9530e5
LP
764
765 r = free_and_strdup(&arg_hostname, optarg);
766 if (r < 0)
767 return log_oom();
768 }
769
770 arg_settings_mask |= SETTING_HOSTNAME;
771 break;
772
82adf6af
LP
773 case 'Z':
774 arg_selinux_context = optarg;
a8828ed9
DW
775 break;
776
82adf6af
LP
777 case 'L':
778 arg_selinux_apifs_context = optarg;
a8828ed9
DW
779 break;
780
bc2f673e
LP
781 case ARG_READ_ONLY:
782 arg_read_only = true;
f757855e 783 arg_settings_mask |= SETTING_READ_ONLY;
bc2f673e
LP
784 break;
785
420c7379
LP
786 case ARG_CAPABILITY:
787 case ARG_DROP_CAPABILITY: {
6cbe4ed1 788 p = optarg;
9ed794a3 789 for (;;) {
6cbe4ed1 790 _cleanup_free_ char *t = NULL;
5076f0cc 791
6cbe4ed1
SS
792 r = extract_first_word(&p, &t, ",", 0);
793 if (r < 0)
794 return log_error_errno(r, "Failed to parse capability %s.", t);
5076f0cc 795
6cbe4ed1
SS
796 if (r == 0)
797 break;
5076f0cc 798
39ed67d1
LP
799 if (streq(t, "all")) {
800 if (c == ARG_CAPABILITY)
a42c8b54 801 plus = (uint64_t) -1;
39ed67d1 802 else
a42c8b54 803 minus = (uint64_t) -1;
39ed67d1 804 } else {
acf4d158
YW
805 r = capability_from_name(t);
806 if (r < 0)
807 return log_error_errno(r, "Failed to parse capability %s.", t);
39ed67d1
LP
808
809 if (c == ARG_CAPABILITY)
acf4d158 810 plus |= 1ULL << r;
39ed67d1 811 else
acf4d158 812 minus |= 1ULL << r;
5076f0cc 813 }
5076f0cc
LP
814 }
815
f757855e 816 arg_settings_mask |= SETTING_CAPABILITY;
5076f0cc
LP
817 break;
818 }
819
66edd963
LP
820 case ARG_NO_NEW_PRIVILEGES:
821 r = parse_boolean(optarg);
822 if (r < 0)
823 return log_error_errno(r, "Failed to parse --no-new-privileges= argument: %s", optarg);
824
825 arg_no_new_privileges = r;
826 arg_settings_mask |= SETTING_NO_NEW_PRIVILEGES;
827 break;
828
57fb9fb5
LP
829 case 'j':
830 arg_link_journal = LINK_GUEST;
574edc90 831 arg_link_journal_try = true;
4e1d6aa9 832 arg_settings_mask |= SETTING_LINK_JOURNAL;
57fb9fb5
LP
833 break;
834
835 case ARG_LINK_JOURNAL:
4e1d6aa9
LP
836 r = parse_link_journal(optarg, &arg_link_journal, &arg_link_journal_try);
837 if (r < 0) {
838 log_error_errno(r, "Failed to parse link journal mode %s", optarg);
57fb9fb5
LP
839 return -EINVAL;
840 }
841
4e1d6aa9 842 arg_settings_mask |= SETTING_LINK_JOURNAL;
57fb9fb5
LP
843 break;
844
17fe0523 845 case ARG_BIND:
f757855e
LP
846 case ARG_BIND_RO:
847 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
848 if (r < 0)
849 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
17fe0523 850
f757855e 851 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
17fe0523 852 break;
06c17c39 853
f757855e
LP
854 case ARG_TMPFS:
855 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
856 if (r < 0)
857 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
5a8af538 858
f757855e 859 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
5a8af538 860 break;
5a8af538
LP
861
862 case ARG_OVERLAY:
ad85779a
LP
863 case ARG_OVERLAY_RO:
864 r = overlay_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_OVERLAY_RO);
865 if (r == -EADDRNOTAVAIL)
866 return log_error_errno(r, "--overlay(-ro)= needs at least two colon-separated directories specified.");
867 if (r < 0)
868 return log_error_errno(r, "Failed to parse --overlay(-ro)= argument %s: %m", optarg);
06c17c39 869
f757855e 870 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
06c17c39 871 break;
06c17c39 872
a5f1cb3b 873 case 'E': {
f4889f65
LP
874 char **n;
875
baaa35ad
ZJS
876 if (!env_assignment_is_valid(optarg))
877 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
878 "Environment variable assignment '%s' is not valid.", optarg);
f4889f65
LP
879
880 n = strv_env_set(arg_setenv, optarg);
881 if (!n)
882 return log_oom();
883
130d3d22 884 strv_free_and_replace(arg_setenv, n);
f757855e 885 arg_settings_mask |= SETTING_ENVIRONMENT;
f4889f65
LP
886 break;
887 }
888
284c0b91
LP
889 case 'q':
890 arg_quiet = true;
891 break;
892
8a96d94e 893 case ARG_SHARE_SYSTEM:
a6b5216c 894 /* We don't officially support this anymore, except for compat reasons. People should use the
0c582db0 895 * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
63d1c29f 896 log_warning("Please do not use --share-system anymore, use $SYSTEMD_NSPAWN_SHARE_* instead.");
0c582db0 897 arg_clone_ns_flags = 0;
8a96d94e
LP
898 break;
899
eb91eb18
LP
900 case ARG_REGISTER:
901 r = parse_boolean(optarg);
902 if (r < 0) {
903 log_error("Failed to parse --register= argument: %s", optarg);
904 return r;
905 }
906
907 arg_register = r;
908 break;
909
89f7c846
LP
910 case ARG_KEEP_UNIT:
911 arg_keep_unit = true;
912 break;
913
6afc95b7
LP
914 case ARG_PERSONALITY:
915
ac45f971 916 arg_personality = personality_from_string(optarg);
baaa35ad
ZJS
917 if (arg_personality == PERSONALITY_INVALID)
918 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
919 "Unknown or unsupported personality '%s'.", optarg);
6afc95b7 920
f757855e 921 arg_settings_mask |= SETTING_PERSONALITY;
6afc95b7
LP
922 break;
923
4d9f07b4
LP
924 case ARG_VOLATILE:
925
926 if (!optarg)
f757855e 927 arg_volatile_mode = VOLATILE_YES;
5c828e66
LP
928 else if (streq(optarg, "help")) {
929 DUMP_STRING_TABLE(volatile_mode, VolatileMode, _VOLATILE_MODE_MAX);
930 return 0;
931 } else {
f757855e 932 VolatileMode m;
4d9f07b4 933
f757855e 934 m = volatile_mode_from_string(optarg);
baaa35ad
ZJS
935 if (m < 0)
936 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
937 "Failed to parse --volatile= argument: %s", optarg);
938 else
f757855e 939 arg_volatile_mode = m;
6d0b55c2
LP
940 }
941
f757855e
LP
942 arg_settings_mask |= SETTING_VOLATILE_MODE;
943 break;
6d0b55c2 944
f757855e
LP
945 case 'p':
946 r = expose_port_parse(&arg_expose_ports, optarg);
947 if (r == -EEXIST)
948 return log_error_errno(r, "Duplicate port specification: %s", optarg);
949 if (r < 0)
950 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
6d0b55c2 951
f757855e 952 arg_settings_mask |= SETTING_EXPOSE_PORTS;
6d0b55c2 953 break;
6d0b55c2 954
f36933fe
LP
955 case ARG_PROPERTY:
956 if (strv_extend(&arg_property, optarg) < 0)
957 return log_oom();
958
959 break;
960
ae209204
ZJS
961 case ARG_PRIVATE_USERS: {
962 int boolean = -1;
0de7acce 963
ae209204
ZJS
964 if (!optarg)
965 boolean = true;
966 else if (!in_charset(optarg, DIGITS))
967 /* do *not* parse numbers as booleans */
968 boolean = parse_boolean(optarg);
969
970 if (boolean == false) {
0de7acce
LP
971 /* no: User namespacing off */
972 arg_userns_mode = USER_NAMESPACE_NO;
973 arg_uid_shift = UID_INVALID;
974 arg_uid_range = UINT32_C(0x10000);
ae209204 975 } else if (boolean == true) {
0de7acce
LP
976 /* yes: User namespacing on, UID range is read from root dir */
977 arg_userns_mode = USER_NAMESPACE_FIXED;
978 arg_uid_shift = UID_INVALID;
979 arg_uid_range = UINT32_C(0x10000);
980 } else if (streq(optarg, "pick")) {
981 /* pick: User namespacing on, UID range is picked randomly */
982 arg_userns_mode = USER_NAMESPACE_PICK;
983 arg_uid_shift = UID_INVALID;
984 arg_uid_range = UINT32_C(0x10000);
985 } else {
6c2058b3 986 _cleanup_free_ char *buffer = NULL;
6dac160c
LP
987 const char *range, *shift;
988
0de7acce
LP
989 /* anything else: User namespacing on, UID range is explicitly configured */
990
6dac160c
LP
991 range = strchr(optarg, ':');
992 if (range) {
6c2058b3
ZJS
993 buffer = strndup(optarg, range - optarg);
994 if (!buffer)
995 return log_oom();
996 shift = buffer;
6dac160c
LP
997
998 range++;
bfd292ec
ZJS
999 r = safe_atou32(range, &arg_uid_range);
1000 if (r < 0)
be715731 1001 return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range);
6dac160c
LP
1002 } else
1003 shift = optarg;
1004
be715731
ZJS
1005 r = parse_uid(shift, &arg_uid_shift);
1006 if (r < 0)
1007 return log_error_errno(r, "Failed to parse UID \"%s\": %m", optarg);
0de7acce
LP
1008
1009 arg_userns_mode = USER_NAMESPACE_FIXED;
6dac160c
LP
1010 }
1011
baaa35ad
ZJS
1012 if (arg_uid_range <= 0)
1013 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1014 "UID range cannot be 0.");
be715731 1015
0de7acce 1016 arg_settings_mask |= SETTING_USERNS;
6dac160c 1017 break;
ae209204 1018 }
6dac160c 1019
0de7acce 1020 case 'U':
ccabee0d
LP
1021 if (userns_supported()) {
1022 arg_userns_mode = USER_NAMESPACE_PICK;
1023 arg_uid_shift = UID_INVALID;
1024 arg_uid_range = UINT32_C(0x10000);
1025
1026 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
1027 }
1028
7336138e
LP
1029 break;
1030
0de7acce 1031 case ARG_PRIVATE_USERS_CHOWN:
19aac838 1032 arg_userns_chown = true;
0de7acce
LP
1033
1034 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
1035 break;
1036
c6c8f6e2 1037 case ARG_KILL_SIGNAL:
5c828e66
LP
1038 if (streq(optarg, "help")) {
1039 DUMP_STRING_TABLE(signal, int, _NSIG);
1040 return 0;
1041 }
1042
29a3db75 1043 arg_kill_signal = signal_from_string(optarg);
baaa35ad
ZJS
1044 if (arg_kill_signal < 0)
1045 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1046 "Cannot parse signal: %s", optarg);
c6c8f6e2 1047
f757855e
LP
1048 arg_settings_mask |= SETTING_KILL_SIGNAL;
1049 break;
1050
1051 case ARG_SETTINGS:
1052
1053 /* no → do not read files
1054 * yes → read files, do not override cmdline, trust only subset
1055 * override → read files, override cmdline, trust only subset
1056 * trusted → read files, do not override cmdline, trust all
1057 */
1058
1059 r = parse_boolean(optarg);
1060 if (r < 0) {
1061 if (streq(optarg, "trusted")) {
1062 mask_all_settings = false;
1063 mask_no_settings = false;
1064 arg_settings_trusted = true;
1065
1066 } else if (streq(optarg, "override")) {
1067 mask_all_settings = false;
1068 mask_no_settings = true;
1069 arg_settings_trusted = -1;
1070 } else
1071 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
1072 } else if (r > 0) {
1073 /* yes */
1074 mask_all_settings = false;
1075 mask_no_settings = false;
1076 arg_settings_trusted = -1;
1077 } else {
1078 /* no */
1079 mask_all_settings = true;
1080 mask_no_settings = false;
1081 arg_settings_trusted = false;
1082 }
1083
c6c8f6e2
LP
1084 break;
1085
5f932eb9 1086 case ARG_CHDIR:
baaa35ad
ZJS
1087 if (!path_is_absolute(optarg))
1088 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1089 "Working directory %s is not an absolute path.", optarg);
5f932eb9
LP
1090
1091 r = free_and_strdup(&arg_chdir, optarg);
1092 if (r < 0)
1093 return log_oom();
1094
1095 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
1096 break;
1097
b53ede69
PW
1098 case ARG_PIVOT_ROOT:
1099 r = pivot_root_parse(&arg_pivot_root_new, &arg_pivot_root_old, optarg);
1100 if (r < 0)
1101 return log_error_errno(r, "Failed to parse --pivot-root= argument %s: %m", optarg);
1102
1103 arg_settings_mask |= SETTING_PIVOT_ROOT;
1104 break;
1105
9c1e04d0
AP
1106 case ARG_NOTIFY_READY:
1107 r = parse_boolean(optarg);
baaa35ad
ZJS
1108 if (r < 0)
1109 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1110 "%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
9c1e04d0
AP
1111 arg_notify_ready = r;
1112 arg_settings_mask |= SETTING_NOTIFY_READY;
1113 break;
1114
4623e8e6
LP
1115 case ARG_ROOT_HASH: {
1116 void *k;
1117 size_t l;
1118
1119 r = unhexmem(optarg, strlen(optarg), &k, &l);
1120 if (r < 0)
1121 return log_error_errno(r, "Failed to parse root hash: %s", optarg);
1122 if (l < sizeof(sd_id128_t)) {
1123 log_error("Root hash must be at least 128bit long: %s", optarg);
1124 free(k);
1125 return -EINVAL;
1126 }
1127
1128 free(arg_root_hash);
1129 arg_root_hash = k;
1130 arg_root_hash_size = l;
1131 break;
1132 }
1133
960e4569
LP
1134 case ARG_SYSTEM_CALL_FILTER: {
1135 bool negative;
1136 const char *items;
1137
1138 negative = optarg[0] == '~';
1139 items = negative ? optarg + 1 : optarg;
1140
1141 for (;;) {
1142 _cleanup_free_ char *word = NULL;
1143
1144 r = extract_first_word(&items, &word, NULL, 0);
1145 if (r == 0)
1146 break;
1147 if (r == -ENOMEM)
1148 return log_oom();
1149 if (r < 0)
1150 return log_error_errno(r, "Failed to parse system call filter: %m");
1151
1152 if (negative)
1153 r = strv_extend(&arg_syscall_blacklist, word);
1154 else
1155 r = strv_extend(&arg_syscall_whitelist, word);
1156 if (r < 0)
1157 return log_oom();
1158 }
1159
1160 arg_settings_mask |= SETTING_SYSCALL_FILTER;
1161 break;
1162 }
1163
bf428efb
LP
1164 case ARG_RLIMIT: {
1165 const char *eq;
1166 char *name;
1167 int rl;
1168
5c828e66
LP
1169 if (streq(optarg, "help")) {
1170 DUMP_STRING_TABLE(rlimit, int, _RLIMIT_MAX);
1171 return 0;
1172 }
1173
bf428efb 1174 eq = strchr(optarg, '=');
baaa35ad
ZJS
1175 if (!eq)
1176 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1177 "--rlimit= expects an '=' assignment.");
bf428efb
LP
1178
1179 name = strndup(optarg, eq - optarg);
1180 if (!name)
1181 return log_oom();
1182
1183 rl = rlimit_from_string_harder(name);
baaa35ad
ZJS
1184 if (rl < 0)
1185 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1186 "Unknown resource limit: %s", name);
bf428efb
LP
1187
1188 if (!arg_rlimit[rl]) {
1189 arg_rlimit[rl] = new0(struct rlimit, 1);
1190 if (!arg_rlimit[rl])
1191 return log_oom();
1192 }
1193
1194 r = rlimit_parse(rl, eq + 1, arg_rlimit[rl]);
1195 if (r < 0)
1196 return log_error_errno(r, "Failed to parse resource limit: %s", eq + 1);
1197
1198 arg_settings_mask |= SETTING_RLIMIT_FIRST << rl;
1199 break;
1200 }
1201
81f345df
LP
1202 case ARG_OOM_SCORE_ADJUST:
1203 r = parse_oom_score_adjust(optarg, &arg_oom_score_adjust);
1204 if (r < 0)
1205 return log_error_errno(r, "Failed to parse --oom-score-adjust= parameter: %s", optarg);
1206
1207 arg_oom_score_adjust_set = true;
1208 arg_settings_mask |= SETTING_OOM_SCORE_ADJUST;
1209 break;
1210
d107bb7d
LP
1211 case ARG_CPU_AFFINITY: {
1212 _cleanup_cpu_free_ cpu_set_t *cpuset = NULL;
1213
1214 r = parse_cpu_set(optarg, &cpuset);
1215 if (r < 0)
1216 return log_error_errno(r, "Failed to parse CPU affinity mask: %s", optarg);
1217
1218 if (arg_cpuset)
1219 CPU_FREE(arg_cpuset);
1220
1221 arg_cpuset = TAKE_PTR(cpuset);
1222 arg_cpuset_ncpus = r;
1223 arg_settings_mask |= SETTING_CPU_AFFINITY;
1224 break;
1225 }
1226
09d423e9
LP
1227 case ARG_RESOLV_CONF:
1228 if (streq(optarg, "help")) {
1229 DUMP_STRING_TABLE(resolv_conf_mode, ResolvConfMode, _RESOLV_CONF_MODE_MAX);
1230 return 0;
1231 }
1232
1233 arg_resolv_conf = resolv_conf_mode_from_string(optarg);
baaa35ad
ZJS
1234 if (arg_resolv_conf < 0)
1235 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1236 "Failed to parse /etc/resolv.conf mode: %s", optarg);
09d423e9
LP
1237
1238 arg_settings_mask |= SETTING_RESOLV_CONF;
1239 break;
1240
1688841f
LP
1241 case ARG_TIMEZONE:
1242 if (streq(optarg, "help")) {
1243 DUMP_STRING_TABLE(timezone_mode, TimezoneMode, _TIMEZONE_MODE_MAX);
1244 return 0;
1245 }
1246
1247 arg_timezone = timezone_mode_from_string(optarg);
baaa35ad
ZJS
1248 if (arg_timezone < 0)
1249 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1250 "Failed to parse /etc/localtime mode: %s", optarg);
1688841f
LP
1251
1252 arg_settings_mask |= SETTING_TIMEZONE;
1253 break;
1254
88213476
LP
1255 case '?':
1256 return -EINVAL;
1257
1258 default:
eb9da376 1259 assert_not_reached("Unhandled option");
88213476 1260 }
88213476 1261
60f1ec13
LP
1262 if (argc > optind) {
1263 strv_free(arg_parameters);
1264 arg_parameters = strv_copy(argv + optind);
1265 if (!arg_parameters)
1266 return log_oom();
d7bea6b6 1267
60f1ec13
LP
1268 arg_settings_mask |= SETTING_START_MODE;
1269 }
1270
1271 if (arg_ephemeral && arg_template && !arg_directory)
1272 /* User asked for ephemeral execution but specified --template= instead of --directory=. Semantically
1273 * such an invocation makes some sense, see https://github.com/systemd/systemd/issues/3667. Let's
1274 * accept this here, and silently make "--ephemeral --template=" equivalent to "--ephemeral
1275 * --directory=". */
1276 arg_directory = TAKE_PTR(arg_template);
1277
1278 arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
1279
1280 /* Load all settings from .nspawn files */
1281 if (mask_no_settings)
1282 arg_settings_mask = 0;
1283
1284 /* Don't load any settings from .nspawn files */
1285 if (mask_all_settings)
1286 arg_settings_mask = _SETTINGS_MASK_ALL;
1287
1288 return 1;
1289}
1290
1291static int verify_arguments(void) {
1292 int r;
a6b5216c 1293
4f086aab
SU
1294 if (arg_userns_mode != USER_NAMESPACE_NO)
1295 arg_mount_settings |= MOUNT_USE_USERNS;
1296
1297 if (arg_private_network)
1298 arg_mount_settings |= MOUNT_APPLY_APIVFS_NETNS;
1299
48a8d337
LB
1300 if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
1301 !(arg_clone_ns_flags & CLONE_NEWUTS)) {
eb91eb18 1302 arg_register = false;
baaa35ad 1303 if (arg_start_mode != START_PID1)
60f1ec13 1304 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--boot cannot be used without namespacing.");
0c582db0 1305 }
eb91eb18 1306
0de7acce 1307 if (arg_userns_mode == USER_NAMESPACE_PICK)
0e7ac751
LP
1308 arg_userns_chown = true;
1309
60f1ec13
LP
1310 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
1311 arg_kill_signal = SIGRTMIN+3;
1312
e5a4bb0d
LP
1313 if (arg_volatile_mode != VOLATILE_NO) /* Make sure all file systems contained in the image are mounted read-only if we are in volatile mode */
1314 arg_read_only = true;
1315
baaa35ad 1316 if (arg_keep_unit && arg_register && cg_pid_get_owner_uid(0, NULL) >= 0)
8d9c2bca
AJ
1317 /* Save the user from accidentally registering either user-$SESSION.scope or user@.service.
1318 * The latter is not technically a user session, but we don't need to labour the point. */
60f1ec13 1319 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--keep-unit --register=yes may not be used when invoked from a user session.");
89f7c846 1320
baaa35ad 1321 if (arg_directory && arg_image)
60f1ec13 1322 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--directory= and --image= may not be combined.");
1b9e5b12 1323
baaa35ad 1324 if (arg_template && arg_image)
60f1ec13 1325 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= and --image= may not be combined.");
8cd328d8 1326
baaa35ad 1327 if (arg_template && !(arg_directory || arg_machine))
60f1ec13 1328 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= needs --directory= or --machine=.");
ec16945e 1329
baaa35ad 1330 if (arg_ephemeral && arg_template)
60f1ec13 1331 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --template= may not be combined.");
ec16945e 1332
baaa35ad 1333 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO))
60f1ec13 1334 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --link-journal= may not be combined.");
df9a75e4 1335
baaa35ad 1336 if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported())
60f1ec13 1337 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--private-users= is not supported, kernel compiled without user namespace support.");
7336138e 1338
baaa35ad 1339 if (arg_userns_chown && arg_read_only)
60f1ec13 1340 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--read-only and --private-users-chown may not be combined.");
f757855e 1341
e5a4bb0d
LP
1342 /* We don't support --private-users-chown together with any of the volatile modes since we couldn't
1343 * change the read-only part of the tree (i.e. /usr) anyway, or because it would trigger a massive
1344 * copy-up (in case of overlay) making the entire excercise pointless. */
1345 if (arg_userns_chown && arg_volatile_mode != VOLATILE_NO)
1346 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--volatile= and --private-users-chown may not be combined.");
1347
60f1ec13
LP
1348 /* If --network-namespace-path is given with any other network-related option,
1349 * we need to error out, to avoid conflicts between different network options. */
1350 if (arg_network_namespace_path &&
1351 (arg_network_interfaces || arg_network_macvlan ||
1352 arg_network_ipvlan || arg_network_veth_extra ||
1353 arg_network_bridge || arg_network_zone ||
1354 arg_network_veth || arg_private_network))
1355 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--network-namespace-path cannot be combined with other network options.");
86c0dd4a 1356
60f1ec13
LP
1357 if (arg_network_bridge && arg_network_zone)
1358 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--network-bridge= and --network-zone= may not be combined.");
f757855e 1359
baaa35ad 1360 if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network)
60f1ec13 1361 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
4f086aab 1362
baaa35ad 1363 if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO))
60f1ec13 1364 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot combine --private-users with read-write mounts.");
f757855e 1365
baaa35ad 1366 if (arg_expose_ports && !arg_private_network)
60f1ec13 1367 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot use --port= without private networking.");
6d0b55c2 1368
349cc4a5 1369#if ! HAVE_LIBIPTC
baaa35ad 1370 if (arg_expose_ports)
60f1ec13 1371 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--port= is not supported, compiled without libiptc support.");
1c1ea217
EV
1372#endif
1373
60f1ec13
LP
1374 r = custom_mount_check_all();
1375 if (r < 0)
1376 return r;
c6c8f6e2 1377
f757855e 1378 return 0;
88213476
LP
1379}
1380
03cfe0d5
LP
1381static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1382 assert(p);
1383
0de7acce 1384 if (arg_userns_mode == USER_NAMESPACE_NO)
03cfe0d5
LP
1385 return 0;
1386
1387 if (uid == UID_INVALID && gid == GID_INVALID)
1388 return 0;
1389
1390 if (uid != UID_INVALID) {
1391 uid += arg_uid_shift;
1392
1393 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1394 return -EOVERFLOW;
1395 }
1396
1397 if (gid != GID_INVALID) {
1398 gid += (gid_t) arg_uid_shift;
1399
1400 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1401 return -EOVERFLOW;
1402 }
1403
1404 if (lchown(p, uid, gid) < 0)
1405 return -errno;
b12afc8c
LP
1406
1407 return 0;
1408}
1409
03cfe0d5
LP
/* Creates the directory 'path' below 'root' with the given mode and, if it was newly created, assigns it to
 * uid/gid via userns_lchown(). A directory that already exists is left untouched and counts as success.
 * Returns 0 on success, a negative errno-style value otherwise. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;
        int r;

        full = prefix_roota(root, path);

        r = mkdir_errno_wrapper(full, mode);
        if (r == -EEXIST) /* Already there? Then don't touch its ownership. */
                return 0;

        return r < 0 ? r : userns_lchown(full, uid, gid);
}
1423
/* Matches 'path' against the two accepted zoneinfo prefixes ("../usr/share/zoneinfo/" and
 * "/usr/share/zoneinfo/") and hands back whatever PATH_STARTSWITH_SET() yields for it. Callers in this file
 * treat a NULL result as "does not point into the zoneinfo directory" and a non-NULL result as the timezone
 * name. */
static const char *timezone_from_path(const char *path) {
        const char *zone;

        zone = PATH_STARTSWITH_SET(
                        path,
                        "../usr/share/zoneinfo/",
                        "/usr/share/zoneinfo/");

        return zone;
}
1430
83205269
LP
1431static bool etc_writable(void) {
1432 return !arg_read_only || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_OVERLAY);
1433}
1434
e58a1277 1435static int setup_timezone(const char *dest) {
1688841f
LP
1436 _cleanup_free_ char *p = NULL, *etc = NULL;
1437 const char *where, *check;
1438 TimezoneMode m;
d4036145 1439 int r;
f8440af5 1440
e58a1277
LP
1441 assert(dest);
1442
1688841f 1443 if (IN_SET(arg_timezone, TIMEZONE_AUTO, TIMEZONE_SYMLINK)) {
1688841f
LP
1444 r = readlink_malloc("/etc/localtime", &p);
1445 if (r == -ENOENT && arg_timezone == TIMEZONE_AUTO)
83205269 1446 m = etc_writable() ? TIMEZONE_DELETE : TIMEZONE_OFF;
1688841f 1447 else if (r == -EINVAL && arg_timezone == TIMEZONE_AUTO) /* regular file? */
83205269 1448 m = etc_writable() ? TIMEZONE_COPY : TIMEZONE_BIND;
1688841f
LP
1449 else if (r < 0) {
1450 log_warning_errno(r, "Failed to read host's /etc/localtime symlink, not updating container timezone: %m");
1451 /* To handle warning, delete /etc/localtime and replace it with a symbolic link to a time zone data
1452 * file.
1453 *
1454 * Example:
1455 * ln -s /usr/share/zoneinfo/UTC /etc/localtime
1456 */
1457 return 0;
1458 } else if (arg_timezone == TIMEZONE_AUTO)
83205269 1459 m = etc_writable() ? TIMEZONE_SYMLINK : TIMEZONE_BIND;
1688841f
LP
1460 else
1461 m = arg_timezone;
1462 } else
1463 m = arg_timezone;
1464
1465 if (m == TIMEZONE_OFF)
1466 return 0;
1467
1468 r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc);
d4036145 1469 if (r < 0) {
1688841f 1470 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
d4036145
LP
1471 return 0;
1472 }
1473
1688841f
LP
1474 where = strjoina(etc, "/localtime");
1475
1476 switch (m) {
1477
1478 case TIMEZONE_DELETE:
1479 if (unlink(where) < 0)
1480 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
1481
d4036145 1482 return 0;
d4036145 1483
1688841f
LP
1484 case TIMEZONE_SYMLINK: {
1485 _cleanup_free_ char *q = NULL;
1486 const char *z, *what;
4d1c38b8 1487
1688841f
LP
1488 z = timezone_from_path(p);
1489 if (!z) {
1490 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
d4036145 1491 return 0;
1688841f 1492 }
d4036145 1493
1688841f
LP
1494 r = readlink_malloc(where, &q);
1495 if (r >= 0 && streq_ptr(timezone_from_path(q), z))
1496 return 0; /* Already pointing to the right place? Then do nothing .. */
1497
1498 check = strjoina(dest, "/usr/share/zoneinfo/", z);
1499 r = chase_symlinks(check, dest, 0, NULL);
1500 if (r < 0)
1501 log_debug_errno(r, "Timezone %s does not exist (or is not accessible) in container, not creating symlink: %m", z);
1502 else {
1503 if (unlink(where) < 0 && errno != ENOENT) {
1504 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING, /* Don't complain on read-only images */
1505 errno, "Failed to remove existing timezone info %s in container, ignoring: %m", where);
1506 return 0;
1507 }
1508
1509 what = strjoina("../usr/share/zoneinfo/", z);
1510 if (symlink(what, where) < 0) {
1511 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING,
1512 errno, "Failed to correct timezone of container, ignoring: %m");
1513 return 0;
1514 }
1515
1516 break;
1517 }
1518
1519 _fallthrough_;
d4036145 1520 }
68fb0892 1521
1688841f
LP
1522 case TIMEZONE_BIND: {
1523 _cleanup_free_ char *resolved = NULL;
1524 int found;
1525
1526 found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved);
1527 if (found < 0) {
1528 log_warning_errno(found, "Failed to resolve /etc/localtime path in container, ignoring: %m");
1529 return 0;
1530 }
1531
1532 if (found == 0) /* missing? */
1533 (void) touch(resolved);
1534
1535 r = mount_verbose(LOG_WARNING, "/etc/localtime", resolved, NULL, MS_BIND, NULL);
1536 if (r >= 0)
1537 return mount_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
1538
1539 _fallthrough_;
79d80fc1 1540 }
4d9f07b4 1541
1688841f
LP
1542 case TIMEZONE_COPY:
1543 /* If mounting failed, try to copy */
1544 r = copy_file_atomic("/etc/localtime", where, 0644, 0, COPY_REFLINK|COPY_REPLACE);
1545 if (r < 0) {
1546 log_full_errno(IN_SET(r, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
1547 "Failed to copy /etc/localtime to %s, ignoring: %m", where);
1548 return 0;
1549 }
1550
1551 break;
1552
1553 default:
1554 assert_not_reached("unexpected mode");
d4036145 1555 }
e58a1277 1556
1688841f 1557 /* Fix permissions of the symlink or file copy we just created */
03cfe0d5
LP
1558 r = userns_lchown(where, 0, 0);
1559 if (r < 0)
1688841f 1560 log_warning_errno(r, "Failed to chown /etc/localtime, ignoring: %m");
03cfe0d5 1561
e58a1277 1562 return 0;
88213476
LP
1563}
1564
09d423e9
LP
/* Checks whether 'path' exists. Returns 1 if it does, 0 if it is missing (ENOENT), and a negative
 * errno-style value (after logging at debug level) if the check itself failed. */
static int have_resolv_conf(const char *path) {
        assert(path);

        if (access(path, F_OK) >= 0)
                return 1;

        if (errno == ENOENT)
                return 0;

        return log_debug_errno(errno, "Failed to determine whether '%s' is available: %m", path);
}
1577
7357272e 1578static int resolved_listening(void) {
b8ea7a6e 1579 _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
b053cd5f 1580 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
7357272e 1581 _cleanup_free_ char *dns_stub_listener_mode = NULL;
b053cd5f
LP
1582 int r;
1583
7357272e 1584 /* Check if resolved is listening */
b053cd5f
LP
1585
1586 r = sd_bus_open_system(&bus);
1587 if (r < 0)
b8ea7a6e 1588 return log_debug_errno(r, "Failed to open system bus: %m");
b053cd5f 1589
7357272e 1590 r = bus_name_has_owner(bus, "org.freedesktop.resolve1", NULL);
b8ea7a6e
LP
1591 if (r < 0)
1592 return log_debug_errno(r, "Failed to check whether the 'org.freedesktop.resolve1' bus name is taken: %m");
1593 if (r == 0)
1594 return 0;
7357272e
DM
1595
1596 r = sd_bus_get_property_string(bus,
1597 "org.freedesktop.resolve1",
1598 "/org/freedesktop/resolve1",
1599 "org.freedesktop.resolve1.Manager",
1600 "DNSStubListener",
b8ea7a6e 1601 &error,
7357272e
DM
1602 &dns_stub_listener_mode);
1603 if (r < 0)
b8ea7a6e 1604 return log_debug_errno(r, "Failed to query DNSStubListener property: %s", bus_error_message(&error, r));
7357272e
DM
1605
1606 return STR_IN_SET(dns_stub_listener_mode, "udp", "yes");
b053cd5f
LP
1607}
1608
2547bb41 1609static int setup_resolv_conf(const char *dest) {
09d423e9
LP
1610 _cleanup_free_ char *etc = NULL;
1611 const char *where, *what;
1612 ResolvConfMode m;
1613 int r;
2547bb41
LP
1614
1615 assert(dest);
1616
09d423e9
LP
1617 if (arg_resolv_conf == RESOLV_CONF_AUTO) {
1618 if (arg_private_network)
1619 m = RESOLV_CONF_OFF;
1620 else if (have_resolv_conf(STATIC_RESOLV_CONF) > 0 && resolved_listening() > 0)
83205269 1621 m = etc_writable() ? RESOLV_CONF_COPY_STATIC : RESOLV_CONF_BIND_STATIC;
09d423e9 1622 else if (have_resolv_conf("/etc/resolv.conf") > 0)
83205269 1623 m = etc_writable() ? RESOLV_CONF_COPY_HOST : RESOLV_CONF_BIND_HOST;
09d423e9 1624 else
83205269 1625 m = etc_writable() ? RESOLV_CONF_DELETE : RESOLV_CONF_OFF;
09d423e9
LP
1626 } else
1627 m = arg_resolv_conf;
1628
1629 if (m == RESOLV_CONF_OFF)
2547bb41
LP
1630 return 0;
1631
87447ae4
LP
1632 r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc);
1633 if (r < 0) {
1634 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
1635 return 0;
1636 }
1637
1638 where = strjoina(etc, "/resolv.conf");
09d423e9
LP
1639
1640 if (m == RESOLV_CONF_DELETE) {
1641 if (unlink(where) < 0)
1642 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
1643
87447ae4
LP
1644 return 0;
1645 }
79d80fc1 1646
09d423e9
LP
1647 if (IN_SET(m, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_COPY_STATIC))
1648 what = STATIC_RESOLV_CONF;
1649 else
1650 what = "/etc/resolv.conf";
87447ae4 1651
09d423e9
LP
1652 if (IN_SET(m, RESOLV_CONF_BIND_HOST, RESOLV_CONF_BIND_STATIC)) {
1653 _cleanup_free_ char *resolved = NULL;
1654 int found;
1655
1656 found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved);
1657 if (found < 0) {
1658 log_warning_errno(found, "Failed to resolve /etc/resolv.conf path in container, ignoring: %m");
1659 return 0;
1660 }
3539724c 1661
87447ae4
LP
1662 if (found == 0) /* missing? */
1663 (void) touch(resolved);
5367354d 1664
09d423e9 1665 r = mount_verbose(LOG_WARNING, what, resolved, NULL, MS_BIND, NULL);
60e76d48 1666 if (r >= 0)
87447ae4 1667 return mount_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
3539724c
LP
1668 }
1669
1670 /* If that didn't work, let's copy the file */
09d423e9 1671 r = copy_file(what, where, O_TRUNC|O_NOFOLLOW, 0644, 0, COPY_REFLINK);
79d80fc1 1672 if (r < 0) {
3539724c
LP
1673 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
1674 * resolved or something similar runs inside and the symlink points there.
68a313c5 1675 *
3539724c 1676 * If the disk image is read-only, there's also no point in complaining.
68a313c5 1677 */
09d423e9 1678 log_full_errno(!IN_SET(RESOLV_CONF_COPY_HOST, RESOLV_CONF_COPY_STATIC) && IN_SET(r, -ELOOP, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
3539724c 1679 "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where);
79d80fc1
TG
1680 return 0;
1681 }
2547bb41 1682
03cfe0d5
LP
1683 r = userns_lchown(where, 0, 0);
1684 if (r < 0)
3539724c 1685 log_warning_errno(r, "Failed to chown /etc/resolv.conf, ignoring: %m");
03cfe0d5 1686
2547bb41
LP
1687 return 0;
1688}
1689
1e4f1671 1690static int setup_boot_id(void) {
cdde6ba6
LP
1691 _cleanup_(unlink_and_freep) char *from = NULL;
1692 _cleanup_free_ char *path = NULL;
3bbaff3e 1693 sd_id128_t rnd = SD_ID128_NULL;
cdde6ba6 1694 const char *to;
04bc4a3f
LP
1695 int r;
1696
04bc4a3f
LP
1697 /* Generate a new randomized boot ID, so that each boot-up of
1698 * the container gets a new one */
1699
cdde6ba6
LP
1700 r = tempfn_random_child(NULL, "proc-sys-kernel-random-boot-id", &path);
1701 if (r < 0)
1702 return log_error_errno(r, "Failed to generate random boot ID path: %m");
04bc4a3f
LP
1703
1704 r = sd_id128_randomize(&rnd);
f647962d
MS
1705 if (r < 0)
1706 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 1707
cdde6ba6 1708 r = id128_write(path, ID128_UUID, rnd, false);
f647962d
MS
1709 if (r < 0)
1710 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f 1711
cdde6ba6
LP
1712 from = TAKE_PTR(path);
1713 to = "/proc/sys/kernel/random/boot_id";
1714
60e76d48 1715 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
cdde6ba6
LP
1716 if (r < 0)
1717 return r;
04bc4a3f 1718
cdde6ba6 1719 return mount_verbose(LOG_ERR, NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
04bc4a3f
LP
1720}
1721
e58a1277 1722static int copy_devnodes(const char *dest) {
88213476
LP
1723 static const char devnodes[] =
1724 "null\0"
1725 "zero\0"
1726 "full\0"
1727 "random\0"
1728 "urandom\0"
85614d66
TG
1729 "tty\0"
1730 "net/tun\0";
88213476
LP
1731
1732 const char *d;
e58a1277 1733 int r = 0;
7fd1b19b 1734 _cleanup_umask_ mode_t u;
a258bf26
LP
1735
1736 assert(dest);
124640f1
LP
1737
1738 u = umask(0000);
88213476 1739
03cfe0d5
LP
1740 /* Create /dev/net, so that we can create /dev/net/tun in it */
1741 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1742 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1743
88213476 1744 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 1745 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 1746 struct stat st;
88213476 1747
7f112f50 1748 from = strappend("/dev/", d);
8967f291
LP
1749 if (!from)
1750 return log_oom();
1751
03cfe0d5 1752 to = prefix_root(dest, from);
8967f291
LP
1753 if (!to)
1754 return log_oom();
88213476
LP
1755
1756 if (stat(from, &st) < 0) {
1757
4a62c710
MS
1758 if (errno != ENOENT)
1759 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 1760
baaa35ad
ZJS
1761 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode))
1762 return log_error_errno(SYNTHETIC_ERRNO(EIO),
1763 "%s is not a char or block device, cannot copy.", from);
1764 else {
8dfce114
LP
1765 _cleanup_free_ char *sl = NULL, *prefixed = NULL, *dn = NULL, *t = NULL;
1766
81f5049b 1767 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
8dbf71ec 1768 /* Explicitly warn the user when /dev is already populated. */
41eb4362 1769 if (errno == EEXIST)
8dbf71ec 1770 log_notice("%s/dev is pre-mounted and pre-populated. If a pre-mounted /dev is provided it needs to be an unpopulated file system.", dest);
81f5049b
AC
1771 if (errno != EPERM)
1772 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1773
8dfce114 1774 /* Some systems abusively restrict mknod but allow bind mounts. */
81f5049b
AC
1775 r = touch(to);
1776 if (r < 0)
1777 return log_error_errno(r, "touch (%s) failed: %m", to);
60e76d48
ZJS
1778 r = mount_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL);
1779 if (r < 0)
1780 return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to);
81f5049b 1781 }
6278cf60 1782
03cfe0d5
LP
1783 r = userns_lchown(to, 0, 0);
1784 if (r < 0)
1785 return log_error_errno(r, "chown() of device node %s failed: %m", to);
8dfce114
LP
1786
1787 dn = strjoin("/dev/", S_ISCHR(st.st_mode) ? "char" : "block");
1788 if (!dn)
1789 return log_oom();
1790
1791 r = userns_mkdir(dest, dn, 0755, 0, 0);
1792 if (r < 0)
1793 return log_error_errno(r, "Failed to create '%s': %m", dn);
1794
1795 if (asprintf(&sl, "%s/%u:%u", dn, major(st.st_rdev), minor(st.st_rdev)) < 0)
1796 return log_oom();
1797
1798 prefixed = prefix_root(dest, sl);
1799 if (!prefixed)
1800 return log_oom();
1801
1802 t = strjoin("../", d);
1803 if (!t)
1804 return log_oom();
1805
1806 if (symlink(t, prefixed) < 0)
1807 log_debug_errno(errno, "Failed to symlink '%s' to '%s': %m", t, prefixed);
88213476 1808 }
88213476
LP
1809 }
1810
e58a1277
LP
1811 return r;
1812}
88213476 1813
03cfe0d5
LP
1814static int setup_pts(const char *dest) {
1815 _cleanup_free_ char *options = NULL;
1816 const char *p;
709f6e46 1817 int r;
03cfe0d5 1818
349cc4a5 1819#if HAVE_SELINUX
03cfe0d5
LP
1820 if (arg_selinux_apifs_context)
1821 (void) asprintf(&options,
3dce8915 1822 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
03cfe0d5
LP
1823 arg_uid_shift + TTY_GID,
1824 arg_selinux_apifs_context);
1825 else
1826#endif
1827 (void) asprintf(&options,
3dce8915 1828 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
03cfe0d5 1829 arg_uid_shift + TTY_GID);
f2d88580 1830
03cfe0d5 1831 if (!options)
f2d88580
LP
1832 return log_oom();
1833
03cfe0d5 1834 /* Mount /dev/pts itself */
cc9fce65 1835 p = prefix_roota(dest, "/dev/pts");
dae8b82e
ZJS
1836 r = mkdir_errno_wrapper(p, 0755);
1837 if (r < 0)
1838 return log_error_errno(r, "Failed to create /dev/pts: %m");
1839
60e76d48
ZJS
1840 r = mount_verbose(LOG_ERR, "devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options);
1841 if (r < 0)
1842 return r;
709f6e46
MS
1843 r = userns_lchown(p, 0, 0);
1844 if (r < 0)
1845 return log_error_errno(r, "Failed to chown /dev/pts: %m");
03cfe0d5
LP
1846
1847 /* Create /dev/ptmx symlink */
1848 p = prefix_roota(dest, "/dev/ptmx");
4a62c710
MS
1849 if (symlink("pts/ptmx", p) < 0)
1850 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
709f6e46
MS
1851 r = userns_lchown(p, 0, 0);
1852 if (r < 0)
1853 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
f2d88580 1854
03cfe0d5
LP
1855 /* And fix /dev/pts/ptmx ownership */
1856 p = prefix_roota(dest, "/dev/pts/ptmx");
709f6e46
MS
1857 r = userns_lchown(p, 0, 0);
1858 if (r < 0)
1859 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
6278cf60 1860
f2d88580
LP
1861 return 0;
1862}
1863
e58a1277 1864static int setup_dev_console(const char *dest, const char *console) {
eb0f0863
LP
1865 _cleanup_umask_ mode_t u;
1866 const char *to;
e58a1277 1867 int r;
e58a1277
LP
1868
1869 assert(dest);
1870 assert(console);
1871
1872 u = umask(0000);
1873
03cfe0d5 1874 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
f647962d
MS
1875 if (r < 0)
1876 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
88213476 1877
a258bf26
LP
1878 /* We need to bind mount the right tty to /dev/console since
1879 * ptys can only exist on pts file systems. To have something
81f5049b 1880 * to bind mount things on we create a empty regular file. */
a258bf26 1881
03cfe0d5 1882 to = prefix_roota(dest, "/dev/console");
81f5049b
AC
1883 r = touch(to);
1884 if (r < 0)
1885 return log_error_errno(r, "touch() for /dev/console failed: %m");
a258bf26 1886
60e76d48 1887 return mount_verbose(LOG_ERR, console, to, NULL, MS_BIND, NULL);
e58a1277
LP
1888}
1889
8e5430c4
LP
1890static int setup_keyring(void) {
1891 key_serial_t keyring;
1892
1893 /* Allocate a new session keyring for the container. This makes sure the keyring of the session systemd-nspawn
1894 * was invoked from doesn't leak into the container. Note that by default we block keyctl() and request_key()
1895 * anyway via seccomp so doing this operation isn't strictly necessary, but in case people explicitly whitelist
1896 * these system calls let's make sure we don't leak anything into the container. */
1897
1898 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
1899 if (keyring == -1) {
1900 if (errno == ENOSYS)
1901 log_debug_errno(errno, "Kernel keyring not supported, ignoring.");
1902 else if (IN_SET(errno, EACCES, EPERM))
1903 log_debug_errno(errno, "Kernel keyring access prohibited, ignoring.");
1904 else
1905 return log_error_errno(errno, "Setting up kernel keyring failed: %m");
1906 }
1907
1908 return 0;
1909}
1910
1e4f1671 1911static int setup_kmsg(int kmsg_socket) {
9ec5a93c
LP
1912 _cleanup_(unlink_and_freep) char *from = NULL;
1913 _cleanup_free_ char *fifo = NULL;
1914 _cleanup_close_ int fd = -1;
7fd1b19b 1915 _cleanup_umask_ mode_t u;
9ec5a93c
LP
1916 const char *to;
1917 int r;
e58a1277 1918
e58a1277 1919 assert(kmsg_socket >= 0);
a258bf26 1920
e58a1277 1921 u = umask(0000);
a258bf26 1922
9ec5a93c
LP
1923 /* We create the kmsg FIFO as as temporary file in /tmp, but immediately delete it after bind mounting it to
1924 * /proc/kmsg. While FIFOs on the reading side behave very similar to /proc/kmsg, their writing side behaves
1925 * differently from /dev/kmsg in that writing blocks when nothing is reading. In order to avoid any problems
1926 * with containers deadlocking due to this we simply make /dev/kmsg unavailable to the container. */
1927
1928 r = tempfn_random_child(NULL, "proc-kmsg", &fifo);
1929 if (r < 0)
1930 return log_error_errno(r, "Failed to generate kmsg path: %m");
e58a1277 1931
9ec5a93c 1932 if (mkfifo(fifo, 0600) < 0)
03cfe0d5 1933 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
9ec5a93c
LP
1934
1935 from = TAKE_PTR(fifo);
1936 to = "/proc/kmsg";
1937
60e76d48
ZJS
1938 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
1939 if (r < 0)
1940 return r;
e58a1277 1941
669fc4e5 1942 fd = open(from, O_RDWR|O_NONBLOCK|O_CLOEXEC);
4a62c710
MS
1943 if (fd < 0)
1944 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 1945
9ec5a93c 1946 /* Store away the fd in the socket, so that it stays open as long as we run the child */
3ee897d6 1947 r = send_one_fd(kmsg_socket, fd, 0);
d9603714
DH
1948 if (r < 0)
1949 return log_error_errno(r, "Failed to send FIFO fd: %m");
a258bf26 1950
25ea79fe 1951 return 0;
88213476
LP
1952}
1953
1c4baffc 1954static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
6d0b55c2
LP
1955 union in_addr_union *exposed = userdata;
1956
1957 assert(rtnl);
1958 assert(m);
1959 assert(exposed);
1960
7a8f6325 1961 expose_port_execute(rtnl, arg_expose_ports, exposed);
6d0b55c2
LP
1962 return 0;
1963}
1964
3a74cea5 1965static int setup_hostname(void) {
c818eef1 1966 int r;
3a74cea5 1967
0c582db0 1968 if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
eb91eb18
LP
1969 return 0;
1970
c818eef1
LP
1971 r = sethostname_idempotent(arg_hostname ?: arg_machine);
1972 if (r < 0)
1973 return log_error_errno(r, "Failed to set hostname: %m");
3a74cea5 1974
7027ff61 1975 return 0;
3a74cea5
LP
1976}
1977
57fb9fb5 1978static int setup_journal(const char *directory) {
0f5e1382 1979 _cleanup_free_ char *d = NULL;
b2238e38
LP
1980 const char *dirname, *p, *q;
1981 sd_id128_t this_id;
1982 char id[33];
8054d749 1983 bool try;
57fb9fb5
LP
1984 int r;
1985
df9a75e4
LP
1986 /* Don't link journals in ephemeral mode */
1987 if (arg_ephemeral)
1988 return 0;
1989
8054d749
LP
1990 if (arg_link_journal == LINK_NO)
1991 return 0;
1992
1993 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
1994
4d680aee 1995 r = sd_id128_get_machine(&this_id);
f647962d
MS
1996 if (r < 0)
1997 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee 1998
e01ff70a 1999 if (sd_id128_equal(arg_uuid, this_id)) {
8054d749 2000 log_full(try ? LOG_WARNING : LOG_ERR,
e192a281 2001 "Host and machine ids are equal (%s): refusing to link journals", sd_id128_to_string(arg_uuid, id));
8054d749 2002 if (try)
4d680aee 2003 return 0;
df9a75e4 2004 return -EEXIST;
4d680aee
ZJS
2005 }
2006
369ca6da
ZJS
2007 FOREACH_STRING(dirname, "/var", "/var/log", "/var/log/journal") {
2008 r = userns_mkdir(directory, dirname, 0755, 0, 0);
2009 if (r < 0) {
2010 bool ignore = r == -EROFS && try;
2011 log_full_errno(ignore ? LOG_DEBUG : LOG_ERR, r,
2012 "Failed to create %s%s: %m", dirname, ignore ? ", ignoring" : "");
2013 return ignore ? 0 : r;
2014 }
2015 }
03cfe0d5 2016
e01ff70a
MS
2017 (void) sd_id128_to_string(arg_uuid, id);
2018
03cfe0d5
LP
2019 p = strjoina("/var/log/journal/", id);
2020 q = prefix_roota(directory, p);
27407a01 2021
e1873695 2022 if (path_is_mount_point(p, NULL, 0) > 0) {
8054d749
LP
2023 if (try)
2024 return 0;
27407a01 2025
baaa35ad
ZJS
2026 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2027 "%s: already a mount point, refusing to use for journal", p);
57fb9fb5
LP
2028 }
2029
e1873695 2030 if (path_is_mount_point(q, NULL, 0) > 0) {
8054d749
LP
2031 if (try)
2032 return 0;
57fb9fb5 2033
baaa35ad
ZJS
2034 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2035 "%s: already a mount point, refusing to use for journal", q);
57fb9fb5
LP
2036 }
2037
2038 r = readlink_and_make_absolute(p, &d);
2039 if (r >= 0) {
3742095b 2040 if (IN_SET(arg_link_journal, LINK_GUEST, LINK_AUTO) &&
57fb9fb5
LP
2041 path_equal(d, q)) {
2042
03cfe0d5 2043 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 2044 if (r < 0)
709f6e46 2045 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 2046 return 0;
57fb9fb5
LP
2047 }
2048
4a62c710
MS
2049 if (unlink(p) < 0)
2050 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
2051 } else if (r == -EINVAL) {
2052
2053 if (arg_link_journal == LINK_GUEST &&
2054 rmdir(p) < 0) {
2055
27407a01
ZJS
2056 if (errno == ENOTDIR) {
2057 log_error("%s already exists and is neither a symlink nor a directory", p);
2058 return r;
4314d33f
MS
2059 } else
2060 return log_error_errno(errno, "Failed to remove %s: %m", p);
57fb9fb5 2061 }
4314d33f
MS
2062 } else if (r != -ENOENT)
2063 return log_error_errno(r, "readlink(%s) failed: %m", p);
57fb9fb5
LP
2064
2065 if (arg_link_journal == LINK_GUEST) {
2066
2067 if (symlink(q, p) < 0) {
8054d749 2068 if (try) {
56f64d95 2069 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90 2070 return 0;
4314d33f
MS
2071 } else
2072 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
57fb9fb5
LP
2073 }
2074
03cfe0d5 2075 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 2076 if (r < 0)
709f6e46 2077 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 2078 return 0;
57fb9fb5
LP
2079 }
2080
2081 if (arg_link_journal == LINK_HOST) {
ccddd104 2082 /* don't create parents here — if the host doesn't have
574edc90 2083 * permanent journal set up, don't force it here */
ba8e6c4d 2084
dae8b82e
ZJS
2085 r = mkdir_errno_wrapper(p, 0755);
2086 if (r < 0 && r != -EEXIST) {
8054d749 2087 if (try) {
dae8b82e 2088 log_debug_errno(r, "Failed to create %s, skipping journal setup: %m", p);
574edc90 2089 return 0;
4314d33f 2090 } else
dae8b82e 2091 return log_error_errno(r, "Failed to create %s: %m", p);
57fb9fb5
LP
2092 }
2093
27407a01
ZJS
2094 } else if (access(p, F_OK) < 0)
2095 return 0;
57fb9fb5 2096
cdb2b9d0
LP
2097 if (dir_is_empty(q) == 0)
2098 log_warning("%s is not empty, proceeding anyway.", q);
2099
03cfe0d5 2100 r = userns_mkdir(directory, p, 0755, 0, 0);
709f6e46
MS
2101 if (r < 0)
2102 return log_error_errno(r, "Failed to create %s: %m", q);
57fb9fb5 2103
60e76d48
ZJS
2104 r = mount_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL);
2105 if (r < 0)
4a62c710 2106 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 2107
27407a01 2108 return 0;
57fb9fb5
LP
2109}
2110
88213476 2111static int drop_capabilities(void) {
520e0d54 2112 return capability_bounding_set_drop(arg_caps_retain, false);
88213476
LP
2113}
2114
db999e0f
LP
2115static int reset_audit_loginuid(void) {
2116 _cleanup_free_ char *p = NULL;
2117 int r;
2118
0c582db0 2119 if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
db999e0f
LP
2120 return 0;
2121
2122 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 2123 if (r == -ENOENT)
db999e0f 2124 return 0;
f647962d
MS
2125 if (r < 0)
2126 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
2127
2128 /* Already reset? */
2129 if (streq(p, "4294967295"))
2130 return 0;
2131
57512c89 2132 r = write_string_file("/proc/self/loginuid", "4294967295", WRITE_STRING_FILE_DISABLE_BUFFER);
db999e0f 2133 if (r < 0) {
10a87006
LP
2134 log_error_errno(r,
2135 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2136 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2137 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2138 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2139 "using systemd-nspawn. Sleeping for 5s... (%m)");
77b6e194 2140
db999e0f 2141 sleep(5);
77b6e194 2142 }
db999e0f
LP
2143
2144 return 0;
77b6e194
LP
2145}
2146
785890ac
LP
2147static int setup_propagate(const char *root) {
2148 const char *p, *q;
709f6e46 2149 int r;
785890ac
LP
2150
2151 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2152 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
63c372cb 2153 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
2154 (void) mkdir_p(p, 0600);
2155
709f6e46
MS
2156 r = userns_mkdir(root, "/run/systemd", 0755, 0, 0);
2157 if (r < 0)
2158 return log_error_errno(r, "Failed to create /run/systemd: %m");
03cfe0d5 2159
709f6e46
MS
2160 r = userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0);
2161 if (r < 0)
2162 return log_error_errno(r, "Failed to create /run/systemd/nspawn: %m");
03cfe0d5 2163
709f6e46
MS
2164 r = userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0);
2165 if (r < 0)
2166 return log_error_errno(r, "Failed to create /run/systemd/nspawn/incoming: %m");
785890ac 2167
03cfe0d5 2168 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
60e76d48
ZJS
2169 r = mount_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
2170 if (r < 0)
2171 return r;
785890ac 2172
60e76d48
ZJS
2173 r = mount_verbose(LOG_ERR, NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
2174 if (r < 0)
2175 return r;
785890ac 2176
19caffac
AC
2177 /* machined will MS_MOVE into that directory, and that's only
2178 * supported for non-shared mounts. */
60e76d48 2179 return mount_verbose(LOG_ERR, NULL, q, NULL, MS_SLAVE, NULL);
785890ac
LP
2180}
2181
317feb4d 2182static int setup_machine_id(const char *directory) {
691675ba
LP
2183 const char *etc_machine_id;
2184 sd_id128_t id;
3bbaff3e 2185 int r;
e01ff70a 2186
317feb4d
LP
2187 /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
2188 * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
2189 * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
2190 * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
2191 * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
2192 * container behaves nicely). */
2193
e01ff70a
MS
2194 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
2195
691675ba 2196 r = id128_read(etc_machine_id, ID128_PLAIN, &id);
317feb4d
LP
2197 if (r < 0) {
2198 if (!IN_SET(r, -ENOENT, -ENOMEDIUM)) /* If the file is missing or empty, we don't mind */
2199 return log_error_errno(r, "Failed to read machine ID from container image: %m");
691675ba 2200
317feb4d
LP
2201 if (sd_id128_is_null(arg_uuid)) {
2202 r = sd_id128_randomize(&arg_uuid);
2203 if (r < 0)
2204 return log_error_errno(r, "Failed to acquire randomized machine UUID: %m");
2205 }
2206 } else {
baaa35ad
ZJS
2207 if (sd_id128_is_null(id))
2208 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2209 "Machine ID in container image is zero, refusing.");
e01ff70a 2210
317feb4d
LP
2211 arg_uuid = id;
2212 }
691675ba 2213
e01ff70a
MS
2214 return 0;
2215}
2216
7336138e
LP
2217static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
2218 int r;
2219
2220 assert(directory);
2221
0de7acce 2222 if (arg_userns_mode == USER_NAMESPACE_NO || !arg_userns_chown)
7336138e
LP
2223 return 0;
2224
2225 r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
2226 if (r == -EOPNOTSUPP)
2227 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
2228 if (r == -EBADE)
2229 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
2230 if (r < 0)
2231 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
2232 if (r == 0)
2233 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
2234 else
2235 log_debug("Patched directory tree to match UID/GID range.");
2236
2237 return r;
2238}
2239
113cea80 2240/*
6d416b9c
LS
2241 * Return values:
2242 * < 0 : wait_for_terminate() failed to get the state of the
2243 * container, the container was terminated by a signal, or
2244 * failed for an unknown reason. No change is made to the
2245 * container argument.
2246 * > 0 : The program executed in the container terminated with an
2247 * error. The exit code of the program executed in the
919699ec
LP
2248 * container is returned. The container argument has been set
2249 * to CONTAINER_TERMINATED.
6d416b9c
LS
2250 * 0 : The container is being rebooted, has been shut down or exited
2251 * successfully. The container argument has been set to either
2252 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 2253 *
6d416b9c
LS
2254 * That is, success is indicated by a return value of zero, and an
2255 * error is indicated by a non-zero value.
113cea80
DH
2256 */
2257static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 2258 siginfo_t status;
919699ec 2259 int r;
113cea80
DH
2260
2261 r = wait_for_terminate(pid, &status);
f647962d
MS
2262 if (r < 0)
2263 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
2264
2265 switch (status.si_code) {
fddbb89c 2266
113cea80 2267 case CLD_EXITED:
b5a2179b 2268 if (status.si_status == 0)
919699ec 2269 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
b5a2179b 2270 else
919699ec 2271 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 2272
919699ec
LP
2273 *container = CONTAINER_TERMINATED;
2274 return status.si_status;
113cea80
DH
2275
2276 case CLD_KILLED:
2277 if (status.si_status == SIGINT) {
919699ec 2278 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 2279 *container = CONTAINER_TERMINATED;
919699ec
LP
2280 return 0;
2281
113cea80 2282 } else if (status.si_status == SIGHUP) {
919699ec 2283 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 2284 *container = CONTAINER_REBOOTED;
919699ec 2285 return 0;
113cea80 2286 }
919699ec 2287
4831981d 2288 _fallthrough_;
113cea80 2289 case CLD_DUMPED:
baaa35ad
ZJS
2290 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2291 "Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
113cea80
DH
2292
2293 default:
baaa35ad
ZJS
2294 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2295 "Container %s failed due to unknown reason.", arg_machine);
113cea80 2296 }
113cea80
DH
2297}
2298
023fb90b
LP
2299static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2300 pid_t pid;
2301
4a0b58c4 2302 pid = PTR_TO_PID(userdata);
023fb90b 2303 if (pid > 0) {
c6c8f6e2 2304 if (kill(pid, arg_kill_signal) >= 0) {
023fb90b
LP
2305 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2306 sd_event_source_set_userdata(s, NULL);
2307 return 0;
2308 }
2309 }
2310
2311 sd_event_exit(sd_event_source_get_event(s), 0);
2312 return 0;
2313}
2314
6916b164 2315static int on_sigchld(sd_event_source *s, const struct signalfd_siginfo *ssi, void *userdata) {
abdb9b08
LP
2316 pid_t pid;
2317
2318 assert(s);
2319 assert(ssi);
2320
2321 pid = PTR_TO_PID(userdata);
2322
6916b164
AU
2323 for (;;) {
2324 siginfo_t si = {};
abdb9b08 2325
6916b164
AU
2326 if (waitid(P_ALL, 0, &si, WNOHANG|WNOWAIT|WEXITED) < 0)
2327 return log_error_errno(errno, "Failed to waitid(): %m");
2328 if (si.si_pid == 0) /* No pending children. */
2329 break;
abdb9b08 2330 if (si.si_pid == pid) {
6916b164
AU
2331 /* The main process we care for has exited. Return from
2332 * signal handler but leave the zombie. */
2333 sd_event_exit(sd_event_source_get_event(s), 0);
2334 break;
2335 }
abdb9b08 2336
6916b164
AU
2337 /* Reap all other children. */
2338 (void) waitid(P_PID, si.si_pid, &si, WNOHANG|WEXITED);
2339 }
2340
2341 return 0;
2342}
2343
abdb9b08
LP
2344static int on_request_stop(sd_bus_message *m, void *userdata, sd_bus_error *error) {
2345 pid_t pid;
2346
2347 assert(m);
2348
2349 pid = PTR_TO_PID(userdata);
2350
2351 if (arg_kill_signal > 0) {
2352 log_info("Container termination requested. Attempting to halt container.");
2353 (void) kill(pid, arg_kill_signal);
2354 } else {
2355 log_info("Container termination requested. Exiting.");
2356 sd_event_exit(sd_bus_get_event(sd_bus_message_get_bus(m)), 0);
2357 }
2358
2359 return 0;
2360}
2361
ec16945e 2362static int determine_names(void) {
1b9cebf6 2363 int r;
ec16945e 2364
c1521918
LP
2365 if (arg_template && !arg_directory && arg_machine) {
2366
2367 /* If --template= was specified then we should not
2368 * search for a machine, but instead create a new one
2369 * in /var/lib/machine. */
2370
605405c6 2371 arg_directory = strjoin("/var/lib/machines/", arg_machine);
c1521918
LP
2372 if (!arg_directory)
2373 return log_oom();
2374 }
2375
ec16945e 2376 if (!arg_image && !arg_directory) {
1b9cebf6
LP
2377 if (arg_machine) {
2378 _cleanup_(image_unrefp) Image *i = NULL;
2379
5ef46e5f 2380 r = image_find(IMAGE_MACHINE, arg_machine, &i);
3a6ce860
LP
2381 if (r == -ENOENT)
2382 return log_error_errno(r, "No image for machine '%s'.", arg_machine);
1b9cebf6
LP
2383 if (r < 0)
2384 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
1b9cebf6 2385
eb38edce 2386 if (IN_SET(i->type, IMAGE_RAW, IMAGE_BLOCK))
0f03c2a4 2387 r = free_and_strdup(&arg_image, i->path);
1b9cebf6 2388 else
0f03c2a4 2389 r = free_and_strdup(&arg_directory, i->path);
1b9cebf6 2390 if (r < 0)
0f3be6ca 2391 return log_oom();
1b9cebf6 2392
aee327b8
LP
2393 if (!arg_ephemeral)
2394 arg_read_only = arg_read_only || i->read_only;
d7249575
LP
2395 } else {
2396 r = safe_getcwd(&arg_directory);
2397 if (r < 0)
2398 return log_error_errno(r, "Failed to determine current directory: %m");
2399 }
ec16945e 2400
0f3be6ca 2401 if (!arg_directory && !arg_image) {
1b9cebf6 2402 log_error("Failed to determine path, please use -D or -i.");
ec16945e
LP
2403 return -EINVAL;
2404 }
2405 }
2406
2407 if (!arg_machine) {
b9ba4dab
LP
2408 if (arg_directory && path_equal(arg_directory, "/"))
2409 arg_machine = gethostname_malloc();
4827ab48
LP
2410 else {
2411 if (arg_image) {
2412 char *e;
2413
2414 arg_machine = strdup(basename(arg_image));
2415
2416 /* Truncate suffix if there is one */
2417 e = endswith(arg_machine, ".raw");
2418 if (e)
2419 *e = 0;
2420 } else
2421 arg_machine = strdup(basename(arg_directory));
2422 }
ec16945e
LP
2423 if (!arg_machine)
2424 return log_oom();
2425
ae691c1d 2426 hostname_cleanup(arg_machine);
ec16945e
LP
2427 if (!machine_name_is_valid(arg_machine)) {
2428 log_error("Failed to determine machine name automatically, please use -M.");
2429 return -EINVAL;
2430 }
b9ba4dab
LP
2431
2432 if (arg_ephemeral) {
2433 char *b;
2434
2435 /* Add a random suffix when this is an
2436 * ephemeral machine, so that we can run many
2437 * instances at once without manually having
2438 * to specify -M each time. */
2439
2440 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2441 return log_oom();
2442
2443 free(arg_machine);
2444 arg_machine = b;
2445 }
ec16945e
LP
2446 }
2447
2448 return 0;
2449}
2450
/* Resolves the path stored in *p with chase_symlinks() and replaces *p by the result, freeing the old
 * string. A NULL *p is left alone. Returns 0 if there was nothing to do, a negative errno-style value on
 * failure, and otherwise whatever chase_symlinks() returned. */
static int chase_symlinks_and_update(char **p, unsigned flags) {
        char *resolved;
        int r;

        assert(p);

        if (*p == NULL)
                return 0;

        r = chase_symlinks(*p, NULL, flags, &resolved);
        if (r < 0)
                return log_error_errno(r, "Failed to resolve path %s: %m", *p);

        free_and_replace(*p, resolved);

        /* Pass r through as is: should CHASE_OPEN ever be part of flags, this might be an fd */
        return r;
}
2467
03cfe0d5 2468static int determine_uid_shift(const char *directory) {
6dac160c
LP
2469 int r;
2470
0de7acce 2471 if (arg_userns_mode == USER_NAMESPACE_NO) {
03cfe0d5 2472 arg_uid_shift = 0;
6dac160c 2473 return 0;
03cfe0d5 2474 }
6dac160c
LP
2475
2476 if (arg_uid_shift == UID_INVALID) {
2477 struct stat st;
2478
03cfe0d5 2479 r = stat(directory, &st);
6dac160c 2480 if (r < 0)
03cfe0d5 2481 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
6dac160c
LP
2482
2483 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2484
baaa35ad
ZJS
2485 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000)))
2486 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2487 "UID and GID base of %s don't match.", directory);
6dac160c
LP
2488
2489 arg_uid_range = UINT32_C(0x10000);
2490 }
2491
baaa35ad
ZJS
2492 if (arg_uid_shift > (uid_t) -1 - arg_uid_range)
2493 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2494 "UID base too high for UID range.");
6dac160c 2495
6dac160c
LP
2496 return 0;
2497}
2498
03cfe0d5
LP
2499static int inner_child(
2500 Barrier *barrier,
2501 const char *directory,
2502 bool secondary,
2503 int kmsg_socket,
2504 int rtnl_socket,
f757855e 2505 FDSet *fds) {
69c79d3c 2506
03cfe0d5 2507 _cleanup_free_ char *home = NULL;
e01ff70a 2508 char as_uuid[37];
88614c8a 2509 size_t n_env = 1;
03cfe0d5 2510 const char *envp[] = {
0c300adf 2511 "PATH=" DEFAULT_PATH_COMPAT,
6aadfa4c 2512 NULL, /* container */
03cfe0d5
LP
2513 NULL, /* TERM */
2514 NULL, /* HOME */
2515 NULL, /* USER */
2516 NULL, /* LOGNAME */
2517 NULL, /* container_uuid */
2518 NULL, /* LISTEN_FDS */
2519 NULL, /* LISTEN_PID */
9c1e04d0 2520 NULL, /* NOTIFY_SOCKET */
03cfe0d5
LP
2521 NULL
2522 };
1a68e1e5 2523 const char *exec_target;
2371271c 2524 _cleanup_strv_free_ char **env_use = NULL;
03cfe0d5 2525 int r;
88213476 2526
b37469d7
LP
2527 /* This is the "inner" child process, i.e. the one forked off by the "outer" child process, which is the one
2528 * the container manager itself forked off. At the time of clone() it gained its own CLONE_NEWNS, CLONE_NEWPID,
2529 * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER namespaces. Note that it has its own CLONE_NEWNS namespace,
2530 * separate from the CLONE_NEWNS created for the "outer" child, and also separate from the host's CLONE_NEWNS
2531 * namespace. The reason for having two levels of CLONE_NEWNS namespaces is that the "inner" one is owned by
2532 * the CLONE_NEWUSER namespace of the container, while the "outer" one is owned by the host's CLONE_NEWUSER
2533 * namespace.
2534 *
2535 * Note at this point we have no CLONE_NEWNET namespace yet. We'll acquire that one later through
2536 * unshare(). See below. */
2537
03cfe0d5
LP
2538 assert(barrier);
2539 assert(directory);
2540 assert(kmsg_socket >= 0);
88213476 2541
0de7acce 2542 if (arg_userns_mode != USER_NAMESPACE_NO) {
03cfe0d5
LP
2543 /* Tell the parent, that it now can write the UID map. */
2544 (void) barrier_place(barrier); /* #1 */
7027ff61 2545
03cfe0d5 2546 /* Wait until the parent wrote the UID map */
baaa35ad
ZJS
2547 if (!barrier_place_and_sync(barrier)) /* #2 */
2548 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
2549 "Parent died too early");
88213476
LP
2550 }
2551
6d66bd3b
EV
2552 r = reset_uid_gid();
2553 if (r < 0)
2554 return log_error_errno(r, "Couldn't become new root: %m");
2555
0de7acce 2556 r = mount_all(NULL,
4f086aab 2557 arg_mount_settings | MOUNT_IN_USERNS,
0de7acce 2558 arg_uid_shift,
0de7acce 2559 arg_selinux_apifs_context);
03cfe0d5
LP
2560 if (r < 0)
2561 return r;
2562
04413780
ZJS
2563 if (!arg_network_namespace_path && arg_private_network) {
2564 r = unshare(CLONE_NEWNET);
2565 if (r < 0)
2566 return log_error_errno(errno, "Failed to unshare network namespace: %m");
75116558
PS
2567
2568 /* Tell the parent that it can setup network interfaces. */
2569 (void) barrier_place(barrier); /* #3 */
04413780
ZJS
2570 }
2571
4f086aab 2572 r = mount_sysfs(NULL, arg_mount_settings);
d8fc6a00
LP
2573 if (r < 0)
2574 return r;
2575
03cfe0d5
LP
2576 /* Wait until we are cgroup-ified, so that we
2577 * can mount the right cgroup path writable */
baaa35ad
ZJS
2578 if (!barrier_place_and_sync(barrier)) /* #4 */
2579 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
2580 "Parent died too early");
88213476 2581
489fae52 2582 if (arg_use_cgns) {
0996ef00
CB
2583 r = unshare(CLONE_NEWCGROUP);
2584 if (r < 0)
04413780 2585 return log_error_errno(errno, "Failed to unshare cgroup namespace: %m");
0996ef00
CB
2586 r = mount_cgroups(
2587 "",
2588 arg_unified_cgroup_hierarchy,
2589 arg_userns_mode != USER_NAMESPACE_NO,
2590 arg_uid_shift,
2591 arg_uid_range,
5a8ff0e6 2592 arg_selinux_apifs_context,
ada54120 2593 true);
0996ef00
CB
2594 if (r < 0)
2595 return r;
2596 } else {
2597 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
2598 if (r < 0)
2599 return r;
2600 }
ec16945e 2601
1e4f1671 2602 r = setup_boot_id();
03cfe0d5
LP
2603 if (r < 0)
2604 return r;
ec16945e 2605
1e4f1671 2606 r = setup_kmsg(kmsg_socket);
03cfe0d5
LP
2607 if (r < 0)
2608 return r;
2609 kmsg_socket = safe_close(kmsg_socket);
ec16945e 2610
03cfe0d5
LP
2611 if (setsid() < 0)
2612 return log_error_errno(errno, "setsid() failed: %m");
2613
2614 if (arg_private_network)
2615 loopback_setup();
2616
7a8f6325
LP
2617 if (arg_expose_ports) {
2618 r = expose_port_send_rtnl(rtnl_socket);
2619 if (r < 0)
2620 return r;
2621 rtnl_socket = safe_close(rtnl_socket);
2622 }
03cfe0d5 2623
81f345df
LP
2624 if (arg_oom_score_adjust_set) {
2625 r = set_oom_score_adjust(arg_oom_score_adjust);
2626 if (r < 0)
2627 return log_error_errno(r, "Failed to adjust OOM score: %m");
2628 }
2629
d107bb7d
LP
2630 if (arg_cpuset)
2631 if (sched_setaffinity(0, CPU_ALLOC_SIZE(arg_cpuset_ncpus), arg_cpuset) < 0)
2632 return log_error_errno(errno, "Failed to set CPU affinity: %m");
2633
709f6e46
MS
2634 r = drop_capabilities();
2635 if (r < 0)
2636 return log_error_errno(r, "drop_capabilities() failed: %m");
03cfe0d5 2637
c818eef1 2638 (void) setup_hostname();
03cfe0d5 2639
050f7277 2640 if (arg_personality != PERSONALITY_INVALID) {
21022b9d
LP
2641 r = safe_personality(arg_personality);
2642 if (r < 0)
2643 return log_error_errno(r, "personality() failed: %m");
03cfe0d5 2644 } else if (secondary) {
21022b9d
LP
2645 r = safe_personality(PER_LINUX32);
2646 if (r < 0)
2647 return log_error_errno(r, "personality() failed: %m");
03cfe0d5
LP
2648 }
2649
349cc4a5 2650#if HAVE_SELINUX
03cfe0d5 2651 if (arg_selinux_context)
2ed96880 2652 if (setexeccon(arg_selinux_context) < 0)
03cfe0d5
LP
2653 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
2654#endif
2655
ee645080 2656 r = change_uid_gid(arg_user, &home);
03cfe0d5
LP
2657 if (r < 0)
2658 return r;
2659
66edd963
LP
2660 if (arg_no_new_privileges)
2661 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
2662 return log_error_errno(errno, "Failed to disable new privileges: %m");
2663
6aadfa4c
ILG
2664 /* LXC sets container=lxc, so follow the scheme here */
2665 envp[n_env++] = strjoina("container=", arg_container_service_name);
2666
03cfe0d5
LP
2667 envp[n_env] = strv_find_prefix(environ, "TERM=");
2668 if (envp[n_env])
313cefa1 2669 n_env++;
03cfe0d5
LP
2670
2671 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2672 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2673 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
2674 return log_oom();
2675
3bbaff3e 2676 assert(!sd_id128_is_null(arg_uuid));
03cfe0d5 2677
691675ba 2678 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_to_uuid_string(arg_uuid, as_uuid)) < 0)
e01ff70a 2679 return log_oom();
03cfe0d5
LP
2680
2681 if (fdset_size(fds) > 0) {
2682 r = fdset_cloexec(fds, false);
2683 if (r < 0)
2684 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
2685
2686 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
2687 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
2688 return log_oom();
2689 }
9c1e04d0
AP
2690 if (asprintf((char **)(envp + n_env++), "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
2691 return log_oom();
03cfe0d5 2692
2371271c
TG
2693 env_use = strv_env_merge(2, envp, arg_setenv);
2694 if (!env_use)
2695 return log_oom();
03cfe0d5
LP
2696
2697 /* Let the parent know that we are ready and
2698 * wait until the parent is ready with the
2699 * setup, too... */
baaa35ad
ZJS
2700 if (!barrier_place_and_sync(barrier)) /* #5 */
2701 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
2702 "Parent died too early");
03cfe0d5 2703
5f932eb9
LP
2704 if (arg_chdir)
2705 if (chdir(arg_chdir) < 0)
2706 return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
2707
7732f92b 2708 if (arg_start_mode == START_PID2) {
75bf701f 2709 r = stub_pid1(arg_uuid);
7732f92b
LP
2710 if (r < 0)
2711 return r;
2712 }
2713
8ca082b4
LP
2714 /* Now, explicitly close the log, so that we then can close all remaining fds. Closing the log explicitly first
2715 * has the benefit that the logging subsystem knows about it, and is thus ready to be reopened should we need
2716 * it again. Note that the other fds closed here are at least the locking and barrier fds. */
03cfe0d5 2717 log_close();
8ca082b4
LP
2718 log_set_open_when_needed(true);
2719
03cfe0d5
LP
2720 (void) fdset_close_others(fds);
2721
7732f92b 2722 if (arg_start_mode == START_BOOT) {
03cfe0d5
LP
2723 char **a;
2724 size_t m;
2725
2726 /* Automatically search for the init system */
2727
75f32f04
ZJS
2728 m = strv_length(arg_parameters);
2729 a = newa(char*, m + 2);
2730 memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
2731 a[1 + m] = NULL;
03cfe0d5 2732
ced58da7 2733 a[0] = (char*) "/usr/lib/systemd/systemd";
03cfe0d5
LP
2734 execve(a[0], a, env_use);
2735
ced58da7 2736 a[0] = (char*) "/lib/systemd/systemd";
03cfe0d5
LP
2737 execve(a[0], a, env_use);
2738
ced58da7 2739 a[0] = (char*) "/sbin/init";
03cfe0d5 2740 execve(a[0], a, env_use);
ced58da7
LP
2741
2742 exec_target = "/usr/lib/systemd/systemd, /lib/systemd/systemd, /sbin/init";
1a68e1e5 2743 } else if (!strv_isempty(arg_parameters)) {
b6b180b7
LP
2744 const char *dollar_path;
2745
1a68e1e5 2746 exec_target = arg_parameters[0];
b6b180b7
LP
2747
2748 /* Use the user supplied search $PATH if there is one, or DEFAULT_PATH_COMPAT if not to search the
2749 * binary. */
2750 dollar_path = strv_env_get(env_use, "PATH");
2751 if (dollar_path) {
2752 if (putenv((char*) dollar_path) != 0)
2753 return log_error_errno(errno, "Failed to update $PATH: %m");
2754 }
2755
f757855e 2756 execvpe(arg_parameters[0], arg_parameters, env_use);
1a68e1e5 2757 } else {
5f932eb9 2758 if (!arg_chdir)
d929b0f9
ZJS
2759 /* If we cannot change the directory, we'll end up in /, that is expected. */
2760 (void) chdir(home ?: "/root");
5f932eb9 2761
03cfe0d5
LP
2762 execle("/bin/bash", "-bash", NULL, env_use);
2763 execle("/bin/sh", "-sh", NULL, env_use);
ced58da7
LP
2764
2765 exec_target = "/bin/bash, /bin/sh";
03cfe0d5
LP
2766 }
2767
8ca082b4 2768 return log_error_errno(errno, "execv(%s) failed: %m", exec_target);
03cfe0d5
LP
2769}
2770
9c1e04d0 2771static int setup_sd_notify_child(void) {
271f518f 2772 _cleanup_close_ int fd = -1;
9c1e04d0 2773 union sockaddr_union sa = {
44ed5214
LP
2774 .un.sun_family = AF_UNIX,
2775 .un.sun_path = NSPAWN_NOTIFY_SOCKET_PATH,
9c1e04d0
AP
2776 };
2777 int r;
2778
2779 fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
2780 if (fd < 0)
2781 return log_error_errno(errno, "Failed to allocate notification socket: %m");
2782
2783 (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
fbda85b0 2784 (void) sockaddr_un_unlink(&sa.un);
9c1e04d0 2785
9c1e04d0 2786 r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
271f518f 2787 if (r < 0)
44ed5214 2788 return log_error_errno(errno, "bind(" NSPAWN_NOTIFY_SOCKET_PATH ") failed: %m");
9c1e04d0 2789
adc7d9f0 2790 r = userns_lchown(NSPAWN_NOTIFY_SOCKET_PATH, 0, 0);
271f518f 2791 if (r < 0)
adc7d9f0 2792 return log_error_errno(r, "Failed to chown " NSPAWN_NOTIFY_SOCKET_PATH ": %m");
adc7d9f0 2793
2ff48e98 2794 r = setsockopt_int(fd, SOL_SOCKET, SO_PASSCRED, true);
271f518f 2795 if (r < 0)
2ff48e98 2796 return log_error_errno(r, "SO_PASSCRED failed: %m");
9c1e04d0 2797
271f518f 2798 return TAKE_FD(fd);
9c1e04d0
AP
2799}
2800
03cfe0d5
LP
2801static int outer_child(
2802 Barrier *barrier,
2803 const char *directory,
2804 const char *console,
2d845785 2805 DissectedImage *dissected_image,
03cfe0d5
LP
2806 bool interactive,
2807 bool secondary,
2808 int pid_socket,
e01ff70a 2809 int uuid_socket,
9c1e04d0 2810 int notify_socket,
03cfe0d5
LP
2811 int kmsg_socket,
2812 int rtnl_socket,
825d5287 2813 int uid_shift_socket,
8199d554 2814 int unified_cgroup_hierarchy_socket,
d7bea6b6
DP
2815 FDSet *fds,
2816 int netns_fd) {
03cfe0d5 2817
bf428efb
LP
2818 _cleanup_close_ int fd = -1;
2819 int r, which_failed;
03cfe0d5
LP
2820 pid_t pid;
2821 ssize_t l;
03cfe0d5 2822
b37469d7
LP
2823 /* This is the "outer" child process, i.e the one forked off by the container manager itself. It already has
2824 * its own CLONE_NEWNS namespace (which was created by the clone()). It still lives in the host's CLONE_NEWPID,
2825 * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER and CLONE_NEWNET namespaces. After it completed a number of
2826 * initializations a second child (the "inner" one) is forked off it, and it exits. */
2827
03cfe0d5
LP
2828 assert(barrier);
2829 assert(directory);
2830 assert(console);
2831 assert(pid_socket >= 0);
e01ff70a 2832 assert(uuid_socket >= 0);
9c1e04d0 2833 assert(notify_socket >= 0);
03cfe0d5
LP
2834 assert(kmsg_socket >= 0);
2835
2836 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
2837 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
2838
2839 if (interactive) {
2b33ab09 2840 int terminal;
03cfe0d5 2841
2b33ab09
LP
2842 terminal = open_terminal(console, O_RDWR);
2843 if (terminal < 0)
2844 return log_error_errno(terminal, "Failed to open console: %m");
03cfe0d5 2845
17cac366
LP
2846 /* Make sure we can continue logging to the original stderr, even if stderr points elsewhere now */
2847 r = log_dup_console();
2848 if (r < 0)
2849 return log_error_errno(r, "Failed to duplicate stderr: %m");
2850
2b33ab09
LP
2851 r = rearrange_stdio(terminal, terminal, terminal); /* invalidates 'terminal' on success and failure */
2852 if (r < 0)
2853 return log_error_errno(r, "Failed to move console to stdin/stdout/stderr: %m");
03cfe0d5
LP
2854 }
2855
2856 r = reset_audit_loginuid();
2857 if (r < 0)
2858 return r;
2859
2860 /* Mark everything as slave, so that we still
2861 * receive mounts from the real root, but don't
2862 * propagate mounts to the real root. */
60e76d48
ZJS
2863 r = mount_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
2864 if (r < 0)
2865 return r;
03cfe0d5 2866
2d845785 2867 if (dissected_image) {
2d3a5a73
LP
2868 /* If we are operating on a disk image, then mount its root directory now, but leave out the rest. We
2869 * can read the UID shift from it if we need to. Further down we'll mount the rest, but then with the
2870 * uid shift known. That way we can mount VFAT file systems shifted to the right place right away. This
2871 * makes sure ESP partitions and userns are compatible. */
2872
2873 r = dissected_image_mount(dissected_image, directory, arg_uid_shift,
03bcb6d4
LP
2874 DISSECT_IMAGE_MOUNT_ROOT_ONLY|DISSECT_IMAGE_DISCARD_ON_LOOP|
2875 (arg_read_only ? DISSECT_IMAGE_READ_ONLY : 0)|
2876 (arg_start_mode == START_BOOT ? DISSECT_IMAGE_VALIDATE_OS : 0));
2d845785
LP
2877 if (r < 0)
2878 return r;
2879 }
03cfe0d5 2880
391567f4
LP
2881 r = determine_uid_shift(directory);
2882 if (r < 0)
2883 return r;
2884
0de7acce 2885 if (arg_userns_mode != USER_NAMESPACE_NO) {
0e7ac751 2886 /* Let the parent know which UID shift we read from the image */
825d5287
RM
2887 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
2888 if (l < 0)
2889 return log_error_errno(errno, "Failed to send UID shift: %m");
baaa35ad
ZJS
2890 if (l != sizeof(arg_uid_shift))
2891 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2892 "Short write while sending UID shift.");
0e7ac751 2893
0de7acce 2894 if (arg_userns_mode == USER_NAMESPACE_PICK) {
0e7ac751
LP
2895 /* When we are supposed to pick the UID shift, the parent will check now whether the UID shift
2896 * we just read from the image is available. If yes, it will send the UID shift back to us, if
2897 * not it will pick a different one, and send it back to us. */
2898
2899 l = recv(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
2900 if (l < 0)
2901 return log_error_errno(errno, "Failed to recv UID shift: %m");
baaa35ad
ZJS
2902 if (l != sizeof(arg_uid_shift))
2903 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2904 "Short read while receiving UID shift.");
0e7ac751
LP
2905 }
2906
ff6c6cc1
LP
2907 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
2908 "Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
825d5287
RM
2909 }
2910
e50cd82f
LP
2911 if (!dissected_image) {
2912 /* Turn directory into bind mount */
2913 r = mount_verbose(LOG_ERR, directory, directory, NULL, MS_BIND|MS_REC, NULL);
2914 if (r < 0)
2915 return r;
2916 }
7d0ecdd6
LP
2917
2918 r = setup_pivot_root(
2919 directory,
2920 arg_pivot_root_new,
2921 arg_pivot_root_old);
2922 if (r < 0)
2923 return r;
2924
2925 r = setup_volatile_mode(
2926 directory,
2927 arg_volatile_mode,
2928 arg_userns_mode != USER_NAMESPACE_NO,
2929 arg_uid_shift,
2930 arg_uid_range,
2931 arg_selinux_context);
2932 if (r < 0)
2933 return r;
2934
2d3a5a73
LP
2935 if (dissected_image) {
2936 /* Now we know the uid shift, let's now mount everything else that might be in the image. */
2937 r = dissected_image_mount(dissected_image, directory, arg_uid_shift,
2938 DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY|DISSECT_IMAGE_DISCARD_ON_LOOP|(arg_read_only ? DISSECT_IMAGE_READ_ONLY : 0));
2939 if (r < 0)
2940 return r;
2941 }
2942
8199d554
LP
2943 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
2944 /* OK, we don't know yet which cgroup mode to use yet. Let's figure it out, and tell the parent. */
2945
2946 r = detect_unified_cgroup_hierarchy_from_image(directory);
2947 if (r < 0)
2948 return r;
2949
2950 l = send(unified_cgroup_hierarchy_socket, &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), MSG_NOSIGNAL);
2951 if (l < 0)
2952 return log_error_errno(errno, "Failed to send cgroup mode: %m");
baaa35ad
ZJS
2953 if (l != sizeof(arg_unified_cgroup_hierarchy))
2954 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2955 "Short write while sending cgroup mode.");
8199d554
LP
2956
2957 unified_cgroup_hierarchy_socket = safe_close(unified_cgroup_hierarchy_socket);
2958 }
2959
4ad14eff
LP
2960 /* Mark everything as shared so our mounts get propagated down. This is
2961 * required to make new bind mounts available in systemd services
2962 * inside the containter that create a new mount namespace.
2963 * See https://github.com/systemd/systemd/issues/3860
2964 * Further submounts (such as /dev) done after this will inherit the
13e785f7 2965 * shared propagation mode. */
4ad14eff
LP
2966 r = mount_verbose(LOG_ERR, NULL, directory, NULL, MS_SHARED|MS_REC, NULL);
2967 if (r < 0)
2968 return r;
2969
2970 r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
2971 if (r < 0)
2972 return r;
2973
03cfe0d5
LP
2974 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
2975 if (r < 0)
2976 return r;
2977
e5a4bb0d 2978 if (arg_read_only && arg_volatile_mode == VOLATILE_NO) {
6b7c9f8b 2979 r = bind_remount_recursive(directory, true, NULL);
03cfe0d5
LP
2980 if (r < 0)
2981 return log_error_errno(r, "Failed to make tree read-only: %m");
2982 }
2983
0de7acce 2984 r = mount_all(directory,
4f086aab 2985 arg_mount_settings,
0de7acce 2986 arg_uid_shift,
0de7acce 2987 arg_selinux_apifs_context);
03cfe0d5
LP
2988 if (r < 0)
2989 return r;
2990
07fa00f9
LP
2991 r = copy_devnodes(directory);
2992 if (r < 0)
03cfe0d5
LP
2993 return r;
2994
2995 dev_setup(directory, arg_uid_shift, arg_uid_shift);
2996
07fa00f9
LP
2997 r = setup_pts(directory);
2998 if (r < 0)
03cfe0d5
LP
2999 return r;
3000
3001 r = setup_propagate(directory);
3002 if (r < 0)
3003 return r;
3004
3005 r = setup_dev_console(directory, console);
3006 if (r < 0)
3007 return r;
3008
8e5430c4
LP
3009 r = setup_keyring();
3010 if (r < 0)
3011 return r;
3012
960e4569 3013 r = setup_seccomp(arg_caps_retain, arg_syscall_whitelist, arg_syscall_blacklist);
03cfe0d5
LP
3014 if (r < 0)
3015 return r;
3016
3017 r = setup_timezone(directory);
3018 if (r < 0)
3019 return r;
3020
3021 r = setup_resolv_conf(directory);
3022 if (r < 0)
3023 return r;
3024
e01ff70a
MS
3025 r = setup_machine_id(directory);
3026 if (r < 0)
3027 return r;
3028
03cfe0d5
LP
3029 r = setup_journal(directory);
3030 if (r < 0)
3031 return r;
3032
0de7acce
LP
3033 r = mount_custom(
3034 directory,
3035 arg_custom_mounts,
3036 arg_n_custom_mounts,
3037 arg_userns_mode != USER_NAMESPACE_NO,
3038 arg_uid_shift,
3039 arg_uid_range,
3040 arg_selinux_apifs_context);
03cfe0d5
LP
3041 if (r < 0)
3042 return r;
3043
489fae52 3044 if (!arg_use_cgns) {
0996ef00
CB
3045 r = mount_cgroups(
3046 directory,
3047 arg_unified_cgroup_hierarchy,
3048 arg_userns_mode != USER_NAMESPACE_NO,
3049 arg_uid_shift,
3050 arg_uid_range,
5a8ff0e6 3051 arg_selinux_apifs_context,
ada54120 3052 false);
0996ef00
CB
3053 if (r < 0)
3054 return r;
3055 }
03cfe0d5
LP
3056
3057 r = mount_move_root(directory);
3058 if (r < 0)
3059 return log_error_errno(r, "Failed to move root directory: %m");
3060
9c1e04d0
AP
3061 fd = setup_sd_notify_child();
3062 if (fd < 0)
3063 return fd;
3064
bf428efb
LP
3065 r = setrlimit_closest_all((const struct rlimit *const*) arg_rlimit, &which_failed);
3066 if (r < 0)
3067 return log_error_errno(r, "Failed to apply resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
3068
03cfe0d5 3069 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
0c582db0 3070 arg_clone_ns_flags |
8869a0b4 3071 (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
03cfe0d5
LP
3072 if (pid < 0)
3073 return log_error_errno(errno, "Failed to fork inner child: %m");
03cfe0d5
LP
3074 if (pid == 0) {
3075 pid_socket = safe_close(pid_socket);
e01ff70a 3076 uuid_socket = safe_close(uuid_socket);
9c1e04d0 3077 notify_socket = safe_close(notify_socket);
825d5287 3078 uid_shift_socket = safe_close(uid_shift_socket);
03cfe0d5
LP
3079
3080 /* The inner child has all namespaces that are
3081 * requested, so that we all are owned by the user if
3082 * user namespaces are turned on. */
3083
d7bea6b6
DP
3084 if (arg_network_namespace_path) {
3085 r = namespace_enter(-1, -1, netns_fd, -1, -1);
3086 if (r < 0)
e2d39e54 3087 return log_error_errno(r, "Failed to join network namespace: %m");
d7bea6b6
DP
3088 }
3089
f757855e 3090 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
03cfe0d5
LP
3091 if (r < 0)
3092 _exit(EXIT_FAILURE);
3093
3094 _exit(EXIT_SUCCESS);
3095 }
3096
3097 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
3098 if (l < 0)
3099 return log_error_errno(errno, "Failed to send PID: %m");
baaa35ad
ZJS
3100 if (l != sizeof(pid))
3101 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3102 "Short write while sending PID.");
03cfe0d5 3103
e01ff70a
MS
3104 l = send(uuid_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
3105 if (l < 0)
3106 return log_error_errno(errno, "Failed to send machine ID: %m");
baaa35ad
ZJS
3107 if (l != sizeof(arg_uuid))
3108 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3109 "Short write while sending machine ID.");
e01ff70a 3110
9c1e04d0
AP
3111 l = send_one_fd(notify_socket, fd, 0);
3112 if (l < 0)
3113 return log_error_errno(errno, "Failed to send notify fd: %m");
3114
03cfe0d5 3115 pid_socket = safe_close(pid_socket);
e01ff70a 3116 uuid_socket = safe_close(uuid_socket);
9c1e04d0 3117 notify_socket = safe_close(notify_socket);
327e26d6
KN
3118 kmsg_socket = safe_close(kmsg_socket);
3119 rtnl_socket = safe_close(rtnl_socket);
d7bea6b6 3120 netns_fd = safe_close(netns_fd);
03cfe0d5
LP
3121
3122 return 0;
3123}
3124
0e7ac751 3125static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
d381c8a6 3126 bool tried_hashed = false;
0e7ac751
LP
3127 unsigned n_tries = 100;
3128 uid_t candidate;
3129 int r;
3130
3131 assert(shift);
3132 assert(ret_lock_file);
0de7acce 3133 assert(arg_userns_mode == USER_NAMESPACE_PICK);
0e7ac751
LP
3134 assert(arg_uid_range == 0x10000U);
3135
3136 candidate = *shift;
3137
3138 (void) mkdir("/run/systemd/nspawn-uid", 0755);
3139
3140 for (;;) {
fbd0b64f 3141 char lock_path[STRLEN("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
8e766630 3142 _cleanup_(release_lock_file) LockFile lf = LOCK_FILE_INIT;
0e7ac751
LP
3143
3144 if (--n_tries <= 0)
3145 return -EBUSY;
3146
87d5e4f2 3147 if (candidate < CONTAINER_UID_BASE_MIN || candidate > CONTAINER_UID_BASE_MAX)
0e7ac751
LP
3148 goto next;
3149 if ((candidate & UINT32_C(0xFFFF)) != 0)
3150 goto next;
3151
3152 xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
3153 r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
3154 if (r == -EBUSY) /* Range already taken by another nspawn instance */
3155 goto next;
3156 if (r < 0)
3157 return r;
3158
3159 /* Make some superficial checks whether the range is currently known in the user database */
3160 if (getpwuid(candidate))
3161 goto next;
3162 if (getpwuid(candidate + UINT32_C(0xFFFE)))
3163 goto next;
3164 if (getgrgid(candidate))
3165 goto next;
3166 if (getgrgid(candidate + UINT32_C(0xFFFE)))
3167 goto next;
3168
3169 *ret_lock_file = lf;
3170 lf = (struct LockFile) LOCK_FILE_INIT;
3171 *shift = candidate;
3172 return 0;
3173
3174 next:
d381c8a6
LP
3175 if (arg_machine && !tried_hashed) {
3176 /* Try to hash the base from the container name */
3177
3178 static const uint8_t hash_key[] = {
3179 0xe1, 0x56, 0xe0, 0xf0, 0x4a, 0xf0, 0x41, 0xaf,
3180 0x96, 0x41, 0xcf, 0x41, 0x33, 0x94, 0xff, 0x72
3181 };
3182
3183 candidate = (uid_t) siphash24(arg_machine, strlen(arg_machine), hash_key);
3184
3185 tried_hashed = true;
3186 } else
3187 random_bytes(&candidate, sizeof(candidate));
3188
87d5e4f2 3189 candidate = (candidate % (CONTAINER_UID_BASE_MAX - CONTAINER_UID_BASE_MIN)) + CONTAINER_UID_BASE_MIN;
0e7ac751
LP
3190 candidate &= (uid_t) UINT32_C(0xFFFF0000);
3191 }
3192}
3193
03cfe0d5 3194static int setup_uid_map(pid_t pid) {
fbd0b64f 3195 char uid_map[STRLEN("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
03cfe0d5
LP
3196 int r;
3197
3198 assert(pid > 1);
3199
3200 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
3201 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
57512c89 3202 r = write_string_file(uid_map, line, WRITE_STRING_FILE_DISABLE_BUFFER);
03cfe0d5
LP
3203 if (r < 0)
3204 return log_error_errno(r, "Failed to write UID map: %m");
3205
3206 /* We always assign the same UID and GID ranges */
3207 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
57512c89 3208 r = write_string_file(uid_map, line, WRITE_STRING_FILE_DISABLE_BUFFER);
03cfe0d5
LP
3209 if (r < 0)
3210 return log_error_errno(r, "Failed to write GID map: %m");
3211
3212 return 0;
3213}
3214
9c1e04d0 3215static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
9c1e04d0
AP
3216 char buf[NOTIFY_BUFFER_MAX+1];
3217 char *p = NULL;
3218 struct iovec iovec = {
3219 .iov_base = buf,
3220 .iov_len = sizeof(buf)-1,
3221 };
3222 union {
3223 struct cmsghdr cmsghdr;
3224 uint8_t buf[CMSG_SPACE(sizeof(struct ucred)) +
3225 CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)];
3226 } control = {};
3227 struct msghdr msghdr = {
3228 .msg_iov = &iovec,
3229 .msg_iovlen = 1,
3230 .msg_control = &control,
3231 .msg_controllen = sizeof(control),
3232 };
3233 struct cmsghdr *cmsg;
3234 struct ucred *ucred = NULL;
3235 ssize_t n;
3236 pid_t inner_child_pid;
3237 _cleanup_strv_free_ char **tags = NULL;
3238
3239 assert(userdata);
3240
3241 inner_child_pid = PTR_TO_PID(userdata);
3242
3243 if (revents != EPOLLIN) {
3244 log_warning("Got unexpected poll event for notify fd.");
3245 return 0;
3246 }
3247
3248 n = recvmsg(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
3249 if (n < 0) {
3742095b 3250 if (IN_SET(errno, EAGAIN, EINTR))
9c1e04d0
AP
3251 return 0;
3252
3253 return log_warning_errno(errno, "Couldn't read notification socket: %m");
3254 }
3255 cmsg_close_all(&msghdr);
3256
3257 CMSG_FOREACH(cmsg, &msghdr) {
3258 if (cmsg->cmsg_level == SOL_SOCKET &&
3259 cmsg->cmsg_type == SCM_CREDENTIALS &&
3260 cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred))) {
3261
3262 ucred = (struct ucred*) CMSG_DATA(cmsg);
3263 }
3264 }
3265
3266 if (!ucred || ucred->pid != inner_child_pid) {
8cb57430 3267 log_debug("Received notify message without valid credentials. Ignoring.");
9c1e04d0
AP
3268 return 0;
3269 }
3270
3271 if ((size_t) n >= sizeof(buf)) {
3272 log_warning("Received notify message exceeded maximum size. Ignoring.");
3273 return 0;
3274 }
3275
3276 buf[n] = 0;
3277 tags = strv_split(buf, "\n\r");
3278 if (!tags)
3279 return log_oom();
3280
3281 if (strv_find(tags, "READY=1"))
3282 sd_notifyf(false, "READY=1\n");
3283
3284 p = strv_find_startswith(tags, "STATUS=");
3285 if (p)
3286 sd_notifyf(false, "STATUS=Container running: %s", p);
3287
3288 return 0;
3289}
3290
5773024d 3291static int setup_sd_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid, sd_event_source **notify_event_source) {
9c1e04d0 3292 int r;
9c1e04d0 3293
5773024d 3294 r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
9c1e04d0
AP
3295 if (r < 0)
3296 return log_error_errno(r, "Failed to allocate notify event source: %m");
3297
5773024d 3298 (void) sd_event_source_set_description(*notify_event_source, "nspawn-notify");
9c1e04d0
AP
3299
3300 return 0;
3301}
3302
5d961407
LP
3303static int merge_settings(Settings *settings, const char *path) {
3304 int rl;
f757855e 3305
5d961407
LP
3306 assert(settings);
3307 assert(path);
f757855e 3308
5d961407
LP
3309 /* Copy over bits from the settings, unless they have been explicitly masked by command line switches. Note
3310 * that this steals the fields of the Settings* structure, and hence modifies it. */
f757855e 3311
7732f92b
LP
3312 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
3313 settings->start_mode >= 0) {
3314 arg_start_mode = settings->start_mode;
130d3d22 3315 strv_free_and_replace(arg_parameters, settings->parameters);
f757855e
LP
3316 }
3317
a2f577fc
JL
3318 if ((arg_settings_mask & SETTING_EPHEMERAL) == 0)
3319 arg_ephemeral = settings->ephemeral;
3320
b53ede69
PW
3321 if ((arg_settings_mask & SETTING_PIVOT_ROOT) == 0 &&
3322 settings->pivot_root_new) {
3323 free_and_replace(arg_pivot_root_new, settings->pivot_root_new);
3324 free_and_replace(arg_pivot_root_old, settings->pivot_root_old);
3325 }
3326
5f932eb9 3327 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
1cc6c93a
YW
3328 settings->working_directory)
3329 free_and_replace(arg_chdir, settings->working_directory);
5f932eb9 3330
f757855e 3331 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
130d3d22
YW
3332 settings->environment)
3333 strv_free_and_replace(arg_setenv, settings->environment);
f757855e
LP
3334
3335 if ((arg_settings_mask & SETTING_USER) == 0 &&
1cc6c93a
YW
3336 settings->user)
3337 free_and_replace(arg_user, settings->user);
f757855e
LP
3338
3339 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
0e265674 3340 uint64_t plus;
f757855e 3341
0e265674
LP
3342 plus = settings->capability;
3343 if (settings_private_network(settings))
3344 plus |= (1ULL << CAP_NET_ADMIN);
3345
3346 if (!arg_settings_trusted && plus != 0) {
3347 if (settings->capability != 0)
5d961407 3348 log_warning("Ignoring Capability= setting, file %s is not trusted.", path);
0e265674 3349 } else
520e0d54 3350 arg_caps_retain |= plus;
f757855e 3351
520e0d54 3352 arg_caps_retain &= ~settings->drop_capability;
f757855e
LP
3353 }
3354
3355 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
3356 settings->kill_signal > 0)
3357 arg_kill_signal = settings->kill_signal;
3358
3359 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
3360 settings->personality != PERSONALITY_INVALID)
3361 arg_personality = settings->personality;
3362
3363 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
3364 !sd_id128_is_null(settings->machine_id)) {
3365
3366 if (!arg_settings_trusted)
5d961407 3367 log_warning("Ignoring MachineID= setting, file %s is not trusted.", path);
f757855e
LP
3368 else
3369 arg_uuid = settings->machine_id;
3370 }
3371
3372 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
3373 settings->read_only >= 0)
3374 arg_read_only = settings->read_only;
3375
3376 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
3377 settings->volatile_mode != _VOLATILE_MODE_INVALID)
3378 arg_volatile_mode = settings->volatile_mode;
3379
3380 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
3381 settings->n_custom_mounts > 0) {
3382
3383 if (!arg_settings_trusted)
5d961407 3384 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", path);
f757855e
LP
3385 else {
3386 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
1cc6c93a 3387 arg_custom_mounts = TAKE_PTR(settings->custom_mounts);
f757855e 3388 arg_n_custom_mounts = settings->n_custom_mounts;
f757855e
LP
3389 settings->n_custom_mounts = 0;
3390 }
3391 }
3392
3393 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
3394 (settings->private_network >= 0 ||
3395 settings->network_veth >= 0 ||
3396 settings->network_bridge ||
22b28dfd 3397 settings->network_zone ||
f757855e
LP
3398 settings->network_interfaces ||
3399 settings->network_macvlan ||
f6d6bad1
LP
3400 settings->network_ipvlan ||
3401 settings->network_veth_extra)) {
f757855e
LP
3402
3403 if (!arg_settings_trusted)
5d961407 3404 log_warning("Ignoring network settings, file %s is not trusted.", path);
f757855e 3405 else {
f6d6bad1 3406 arg_network_veth = settings_network_veth(settings);
0e265674
LP
3407 arg_private_network = settings_private_network(settings);
3408
130d3d22
YW
3409 strv_free_and_replace(arg_network_interfaces, settings->network_interfaces);
3410 strv_free_and_replace(arg_network_macvlan, settings->network_macvlan);
3411 strv_free_and_replace(arg_network_ipvlan, settings->network_ipvlan);
3412 strv_free_and_replace(arg_network_veth_extra, settings->network_veth_extra);
f6d6bad1 3413
1cc6c93a
YW
3414 free_and_replace(arg_network_bridge, settings->network_bridge);
3415 free_and_replace(arg_network_zone, settings->network_zone);
f757855e
LP
3416 }
3417 }
3418
3419 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
3420 settings->expose_ports) {
3421
3422 if (!arg_settings_trusted)
5d961407 3423 log_warning("Ignoring Port= setting, file %s is not trusted.", path);
f757855e
LP
3424 else {
3425 expose_port_free_all(arg_expose_ports);
1cc6c93a 3426 arg_expose_ports = TAKE_PTR(settings->expose_ports);
f757855e
LP
3427 }
3428 }
3429
0de7acce
LP
3430 if ((arg_settings_mask & SETTING_USERNS) == 0 &&
3431 settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
3432
3433 if (!arg_settings_trusted)
5d961407 3434 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", path);
0de7acce
LP
3435 else {
3436 arg_userns_mode = settings->userns_mode;
3437 arg_uid_shift = settings->uid_shift;
3438 arg_uid_range = settings->uid_range;
3439 arg_userns_chown = settings->userns_chown;
3440 }
3441 }
3442
9c1e04d0
AP
3443 if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0)
3444 arg_notify_ready = settings->notify_ready;
3445
960e4569
LP
3446 if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) {
3447
3448 if (!arg_settings_trusted && !strv_isempty(arg_syscall_whitelist))
5d961407 3449 log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", path);
960e4569 3450 else {
130d3d22
YW
3451 strv_free_and_replace(arg_syscall_whitelist, settings->syscall_whitelist);
3452 strv_free_and_replace(arg_syscall_blacklist, settings->syscall_blacklist);
960e4569
LP
3453 }
3454 }
3455
bf428efb
LP
3456 for (rl = 0; rl < _RLIMIT_MAX; rl ++) {
3457 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)))
3458 continue;
3459
3460 if (!settings->rlimit[rl])
3461 continue;
3462
3463 if (!arg_settings_trusted) {
5d961407 3464 log_warning("Ignoring Limit%s= setting, file '%s' is not trusted.", rlimit_to_string(rl), path);
bf428efb
LP
3465 continue;
3466 }
3467
3468 free_and_replace(arg_rlimit[rl], settings->rlimit[rl]);
3469 }
3470
3a9530e5
LP
3471 if ((arg_settings_mask & SETTING_HOSTNAME) == 0 &&
3472 settings->hostname)
3473 free_and_replace(arg_hostname, settings->hostname);
3474
66edd963
LP
3475 if ((arg_settings_mask & SETTING_NO_NEW_PRIVILEGES) == 0 &&
3476 settings->no_new_privileges >= 0)
3477 arg_no_new_privileges = settings->no_new_privileges;
3478
81f345df
LP
3479 if ((arg_settings_mask & SETTING_OOM_SCORE_ADJUST) == 0 &&
3480 settings->oom_score_adjust_set) {
3481
3482 if (!arg_settings_trusted)
5d961407 3483 log_warning("Ignoring OOMScoreAdjust= setting, file '%s' is not trusted.", path);
81f345df
LP
3484 else {
3485 arg_oom_score_adjust = settings->oom_score_adjust;
3486 arg_oom_score_adjust_set = true;
3487 }
3488 }
3489
d107bb7d
LP
3490 if ((arg_settings_mask & SETTING_CPU_AFFINITY) == 0 &&
3491 settings->cpuset) {
3492
3493 if (!arg_settings_trusted)
5d961407 3494 log_warning("Ignoring CPUAffinity= setting, file '%s' is not trusted.", path);
d107bb7d
LP
3495 else {
3496 if (arg_cpuset)
3497 CPU_FREE(arg_cpuset);
3498 arg_cpuset = TAKE_PTR(settings->cpuset);
3499 arg_cpuset_ncpus = settings->cpuset_ncpus;
3500 }
3501 }
3502
09d423e9
LP
3503 if ((arg_settings_mask & SETTING_RESOLV_CONF) == 0 &&
3504 settings->resolv_conf != _RESOLV_CONF_MODE_INVALID)
3505 arg_resolv_conf = settings->resolv_conf;
3506
4e1d6aa9
LP
3507 if ((arg_settings_mask & SETTING_LINK_JOURNAL) == 0 &&
3508 settings->link_journal != _LINK_JOURNAL_INVALID) {
3509
3510 if (!arg_settings_trusted)
3511 log_warning("Ignoring journal link setting, file '%s' is not trusted.", path);
3512 else {
3513 arg_link_journal = settings->link_journal;
3514 arg_link_journal_try = settings->link_journal_try;
3515 }
3516 }
3517
1688841f
LP
3518 if ((arg_settings_mask & SETTING_TIMEZONE) == 0 &&
3519 settings->timezone != _TIMEZONE_MODE_INVALID)
3520 arg_timezone = settings->timezone;
3521
f757855e
LP
3522 return 0;
3523}
3524
5d961407
LP
3525static int load_settings(void) {
3526 _cleanup_(settings_freep) Settings *settings = NULL;
3527 _cleanup_fclose_ FILE *f = NULL;
3528 _cleanup_free_ char *p = NULL;
3529 const char *fn, *i;
3530 int r;
3531
3532 /* If all settings are masked, there's no point in looking for
3533 * the settings file */
3534 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
3535 return 0;
3536
3537 fn = strjoina(arg_machine, ".nspawn");
3538
3539 /* We first look in the admin's directories in /etc and /run */
3540 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
3541 _cleanup_free_ char *j = NULL;
3542
3543 j = strjoin(i, "/", fn);
3544 if (!j)
3545 return log_oom();
3546
3547 f = fopen(j, "re");
3548 if (f) {
3549 p = TAKE_PTR(j);
3550
3551 /* By default, we trust configuration from /etc and /run */
3552 if (arg_settings_trusted < 0)
3553 arg_settings_trusted = true;
3554
3555 break;
3556 }
3557
3558 if (errno != ENOENT)
3559 return log_error_errno(errno, "Failed to open %s: %m", j);
3560 }
3561
3562 if (!f) {
3563 /* After that, let's look for a file next to the
3564 * actual image we shall boot. */
3565
3566 if (arg_image) {
3567 p = file_in_same_dir(arg_image, fn);
3568 if (!p)
3569 return log_oom();
3570 } else if (arg_directory) {
3571 p = file_in_same_dir(arg_directory, fn);
3572 if (!p)
3573 return log_oom();
3574 }
3575
3576 if (p) {
3577 f = fopen(p, "re");
3578 if (!f && errno != ENOENT)
3579 return log_error_errno(errno, "Failed to open %s: %m", p);
3580
3581 /* By default, we do not trust configuration from /var/lib/machines */
3582 if (arg_settings_trusted < 0)
3583 arg_settings_trusted = false;
3584 }
3585 }
3586
3587 if (!f)
3588 return 0;
3589
3590 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
3591
3592 r = settings_load(f, p, &settings);
3593 if (r < 0)
3594 return r;
3595
3596 return merge_settings(settings, p);
3597}
3598
b0067625
ZJS
3599static int run(int master,
3600 const char* console,
2d845785 3601 DissectedImage *dissected_image,
b0067625
ZJS
3602 bool interactive,
3603 bool secondary,
3604 FDSet *fds,
3605 char veth_name[IFNAMSIZ], bool *veth_created,
3606 union in_addr_union *exposed,
3607 pid_t *pid, int *ret) {
3608
3609 static const struct sigaction sa = {
3610 .sa_handler = nop_signal_handler,
e28c7cd0 3611 .sa_flags = SA_NOCLDSTOP|SA_RESTART,
b0067625
ZJS
3612 };
3613
8e766630 3614 _cleanup_(release_lock_file) LockFile uid_shift_lock = LOCK_FILE_INIT;
b0067625
ZJS
3615 _cleanup_close_ int etc_passwd_lock = -1;
3616 _cleanup_close_pair_ int
3617 kmsg_socket_pair[2] = { -1, -1 },
3618 rtnl_socket_pair[2] = { -1, -1 },
3619 pid_socket_pair[2] = { -1, -1 },
3620 uuid_socket_pair[2] = { -1, -1 },
3621 notify_socket_pair[2] = { -1, -1 },
8199d554
LP
3622 uid_shift_socket_pair[2] = { -1, -1 },
3623 unified_cgroup_hierarchy_socket_pair[2] = { -1, -1};
3624
b0067625
ZJS
3625 _cleanup_close_ int notify_socket= -1;
3626 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
5773024d 3627 _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL;
b0067625
ZJS
3628 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
3629 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3630 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
abdb9b08 3631 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
b0067625
ZJS
3632 ContainerStatus container_status = 0;
3633 char last_char = 0;
3634 int ifi = 0, r;
3635 ssize_t l;
3636 sigset_t mask_chld;
d7bea6b6 3637 _cleanup_close_ int netns_fd = -1;
b0067625
ZJS
3638
3639 assert_se(sigemptyset(&mask_chld) == 0);
3640 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3641
3642 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3643 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
3644 * check with getpwuid() if the specific user already exists. Note that /etc might be
3645 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
3646 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
3647 * really just an extra safety net. We kinda assume that the UID range we allocate from is
3648 * really ours. */
3649
3650 etc_passwd_lock = take_etc_passwd_lock(NULL);
3651 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS)
3652 return log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m");
3653 }
3654
3655 r = barrier_create(&barrier);
3656 if (r < 0)
3657 return log_error_errno(r, "Cannot initialize IPC barrier: %m");
3658
3659 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0)
3660 return log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3661
3662 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0)
3663 return log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3664
3665 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0)
3666 return log_error_errno(errno, "Failed to create pid socket pair: %m");
3667
3668 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0)
3669 return log_error_errno(errno, "Failed to create id socket pair: %m");
3670
3671 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0)
3672 return log_error_errno(errno, "Failed to create notify socket pair: %m");
3673
3674 if (arg_userns_mode != USER_NAMESPACE_NO)
3675 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0)
3676 return log_error_errno(errno, "Failed to create uid shift socket pair: %m");
3677
8199d554
LP
3678 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN)
3679 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, unified_cgroup_hierarchy_socket_pair) < 0)
3680 return log_error_errno(errno, "Failed to create unified cgroup socket pair: %m");
3681
b0067625
ZJS
3682 /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
3683 * parent's blocking calls and give it a chance to call wait() and terminate. */
3684 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3685 if (r < 0)
3686 return log_error_errno(errno, "Failed to change the signal mask: %m");
3687
3688 r = sigaction(SIGCHLD, &sa, NULL);
3689 if (r < 0)
3690 return log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3691
d7bea6b6
DP
3692 if (arg_network_namespace_path) {
3693 netns_fd = open(arg_network_namespace_path, O_RDONLY|O_NOCTTY|O_CLOEXEC);
3694 if (netns_fd < 0)
3695 return log_error_errno(errno, "Cannot open file %s: %m", arg_network_namespace_path);
3696
3697 r = fd_is_network_ns(netns_fd);
6619ad88
LP
3698 if (r == -EUCLEAN)
3699 log_debug_errno(r, "Cannot determine if passed network namespace path '%s' really refers to a network namespace, assuming it does.", arg_network_namespace_path);
3700 else if (r < 0)
d7bea6b6 3701 return log_error_errno(r, "Failed to check %s fs type: %m", arg_network_namespace_path);
6619ad88
LP
3702 else if (r == 0) {
3703 log_error("Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path);
d7bea6b6
DP
3704 return -EINVAL;
3705 }
3706 }
3707
b0067625
ZJS
3708 *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
3709 if (*pid < 0)
3710 return log_error_errno(errno, "clone() failed%s: %m",
3711 errno == EINVAL ?
3712 ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
3713
3714 if (*pid == 0) {
3715 /* The outer child only has a file system namespace. */
3716 barrier_set_role(&barrier, BARRIER_CHILD);
3717
3718 master = safe_close(master);
3719
3720 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3721 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3722 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
3723 uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
3724 notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
3725 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
8199d554 3726 unified_cgroup_hierarchy_socket_pair[0] = safe_close(unified_cgroup_hierarchy_socket_pair[0]);
b0067625
ZJS
3727
3728 (void) reset_all_signal_handlers();
3729 (void) reset_signal_mask();
3730
3731 r = outer_child(&barrier,
3732 arg_directory,
3733 console,
2d845785 3734 dissected_image,
b0067625
ZJS
3735 interactive,
3736 secondary,
3737 pid_socket_pair[1],
3738 uuid_socket_pair[1],
3739 notify_socket_pair[1],
3740 kmsg_socket_pair[1],
3741 rtnl_socket_pair[1],
3742 uid_shift_socket_pair[1],
8199d554 3743 unified_cgroup_hierarchy_socket_pair[1],
d7bea6b6
DP
3744 fds,
3745 netns_fd);
b0067625
ZJS
3746 if (r < 0)
3747 _exit(EXIT_FAILURE);
3748
3749 _exit(EXIT_SUCCESS);
3750 }
3751
3752 barrier_set_role(&barrier, BARRIER_PARENT);
3753
3754 fds = fdset_free(fds);
3755
3756 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3757 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3758 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
3759 uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
3760 notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
3761 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
8199d554 3762 unified_cgroup_hierarchy_socket_pair[1] = safe_close(unified_cgroup_hierarchy_socket_pair[1]);
b0067625
ZJS
3763
3764 if (arg_userns_mode != USER_NAMESPACE_NO) {
3765 /* The child just let us know the UID shift it might have read from the image. */
3766 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
3767 if (l < 0)
3768 return log_error_errno(errno, "Failed to read UID shift: %m");
b0067625
ZJS
3769 if (l != sizeof arg_uid_shift) {
3770 log_error("Short read while reading UID shift.");
3771 return -EIO;
3772 }
3773
3774 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3775 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
3776 * image, but if that's already in use, pick a new one, and report back to the child,
3777 * which one we now picked. */
3778
3779 r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
3780 if (r < 0)
3781 return log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
3782
3783 l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
3784 if (l < 0)
3785 return log_error_errno(errno, "Failed to send UID shift: %m");
3786 if (l != sizeof arg_uid_shift) {
3787 log_error("Short write while writing UID shift.");
3788 return -EIO;
3789 }
3790 }
3791 }
3792
8199d554
LP
3793 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
3794 /* The child let us know the support cgroup mode it might have read from the image. */
3795 l = recv(unified_cgroup_hierarchy_socket_pair[0], &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), 0);
3796 if (l < 0)
3797 return log_error_errno(errno, "Failed to read cgroup mode: %m");
3798 if (l != sizeof(arg_unified_cgroup_hierarchy)) {
bd897e72
ZJS
3799 log_error("Short read while reading cgroup mode (%zu bytes).%s",
3800 l, l == 0 ? " The child is most likely dead." : "");
8199d554
LP
3801 return -EIO;
3802 }
3803 }
3804
b0067625 3805 /* Wait for the outer child. */
d2e0ac3d
LP
3806 r = wait_for_terminate_and_check("(sd-namespace)", *pid, WAIT_LOG_ABNORMAL);
3807 if (r < 0)
3808 return r;
3809 if (r != EXIT_SUCCESS)
3810 return -EIO;
b0067625
ZJS
3811
3812 /* And now retrieve the PID of the inner child. */
3813 l = recv(pid_socket_pair[0], pid, sizeof *pid, 0);
3814 if (l < 0)
3815 return log_error_errno(errno, "Failed to read inner child PID: %m");
3816 if (l != sizeof *pid) {
3817 log_error("Short read while reading inner child PID.");
3818 return -EIO;
3819 }
3820
3821 /* We also retrieve container UUID in case it was generated by outer child */
3822 l = recv(uuid_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
3823 if (l < 0)
3824 return log_error_errno(errno, "Failed to read container machine ID: %m");
3825 if (l != sizeof(arg_uuid)) {
3826 log_error("Short read while reading container machined ID.");
3827 return -EIO;
3828 }
3829
3830 /* We also retrieve the socket used for notifications generated by outer child */
3831 notify_socket = receive_one_fd(notify_socket_pair[0], 0);
3832 if (notify_socket < 0)
3833 return log_error_errno(notify_socket,
3834 "Failed to receive notification socket from the outer child: %m");
3835
3836 log_debug("Init process invoked as PID "PID_FMT, *pid);
3837
3838 if (arg_userns_mode != USER_NAMESPACE_NO) {
3839 if (!barrier_place_and_sync(&barrier)) { /* #1 */
3840 log_error("Child died too early.");
3841 return -ESRCH;
3842 }
3843
3844 r = setup_uid_map(*pid);
3845 if (r < 0)
3846 return r;
3847
3848 (void) barrier_place(&barrier); /* #2 */
3849 }
3850
3851 if (arg_private_network) {
75116558
PS
3852 if (!arg_network_namespace_path) {
3853 /* Wait until the child has unshared its network namespace. */
3854 if (!barrier_place_and_sync(&barrier)) { /* #3 */
3855 log_error("Child died too early");
3856 return -ESRCH;
3857 }
3858 }
3859
b0067625
ZJS
3860 r = move_network_interfaces(*pid, arg_network_interfaces);
3861 if (r < 0)
3862 return r;
3863
3864 if (arg_network_veth) {
3865 r = setup_veth(arg_machine, *pid, veth_name,
3866 arg_network_bridge || arg_network_zone);
3867 if (r < 0)
3868 return r;
3869 else if (r > 0)
3870 ifi = r;
3871
3872 if (arg_network_bridge) {
3873 /* Add the interface to a bridge */
3874 r = setup_bridge(veth_name, arg_network_bridge, false);
3875 if (r < 0)
3876 return r;
3877 if (r > 0)
3878 ifi = r;
3879 } else if (arg_network_zone) {
3880 /* Add the interface to a bridge, possibly creating it */
3881 r = setup_bridge(veth_name, arg_network_zone, true);
3882 if (r < 0)
3883 return r;
3884 if (r > 0)
3885 ifi = r;
3886 }
3887 }
3888
3889 r = setup_veth_extra(arg_machine, *pid, arg_network_veth_extra);
3890 if (r < 0)
3891 return r;
3892
3893 /* We created the primary and extra veth links now; let's remember this, so that we know to
3894 remove them later on. Note that we don't bother with removing veth links that were created
3895 here when their setup failed half-way, because in that case the kernel should be able to
3896 remove them on its own, since they cannot be referenced by anything yet. */
3897 *veth_created = true;
3898
3899 r = setup_macvlan(arg_machine, *pid, arg_network_macvlan);
3900 if (r < 0)
3901 return r;
3902
3903 r = setup_ipvlan(arg_machine, *pid, arg_network_ipvlan);
3904 if (r < 0)
3905 return r;
3906 }
3907
abdb9b08
LP
3908 if (arg_register || !arg_keep_unit) {
3909 r = sd_bus_default_system(&bus);
3910 if (r < 0)
3911 return log_error_errno(r, "Failed to open system bus: %m");
e5a2d8b5
LP
3912
3913 r = sd_bus_set_close_on_exit(bus, false);
3914 if (r < 0)
3915 return log_error_errno(r, "Failed to disable close-on-exit behaviour: %m");
abdb9b08
LP
3916 }
3917
3918 if (!arg_keep_unit) {
3919 /* When a new scope is created for this container, then we'll be registered as its controller, in which
3920 * case PID 1 will send us a friendly RequestStop signal, when it is asked to terminate the
3921 * scope. Let's hook into that, and cleanly shut down the container, and print a friendly message. */
3922
75152a4d
LP
3923 r = sd_bus_match_signal_async(
3924 bus,
3925 NULL,
3926 "org.freedesktop.systemd1",
3927 NULL,
3928 "org.freedesktop.systemd1.Scope",
3929 "RequestStop",
3930 on_request_stop, NULL, PID_TO_PTR(*pid));
abdb9b08 3931 if (r < 0)
75152a4d 3932 return log_error_errno(r, "Failed to request RequestStop match: %m");
abdb9b08
LP
3933 }
3934
b0067625
ZJS
3935 if (arg_register) {
3936 r = register_machine(
abdb9b08 3937 bus,
b0067625
ZJS
3938 arg_machine,
3939 *pid,
3940 arg_directory,
3941 arg_uuid,
3942 ifi,
3943 arg_slice,
3944 arg_custom_mounts, arg_n_custom_mounts,
3945 arg_kill_signal,
3946 arg_property,
3947 arg_keep_unit,
3948 arg_container_service_name);
3949 if (r < 0)
3950 return r;
abdb9b08 3951
cd2dfc6f
LP
3952 } else if (!arg_keep_unit) {
3953 r = allocate_scope(
abdb9b08 3954 bus,
cd2dfc6f
LP
3955 arg_machine,
3956 *pid,
3957 arg_slice,
3958 arg_custom_mounts, arg_n_custom_mounts,
3959 arg_kill_signal,
3960 arg_property);
3961 if (r < 0)
3962 return r;
3963
3964 } else if (arg_slice || arg_property)
3965 log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect.");
b0067625 3966
27da7ef0 3967 r = create_subcgroup(*pid, arg_keep_unit, arg_unified_cgroup_hierarchy);
b0067625
ZJS
3968 if (r < 0)
3969 return r;
3970
27da7ef0 3971 r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
720f0a2f
LP
3972 if (r < 0)
3973 return r;
b0067625 3974
de54e02d 3975 r = chown_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
b0067625
ZJS
3976 if (r < 0)
3977 return r;
3978
3979 /* Notify the child that the parent is ready with all
3980 * its setup (including cgroup-ification), and that
3981 * the child can now hand over control to the code to
3982 * run inside the container. */
75116558 3983 (void) barrier_place(&barrier); /* #4 */
b0067625
ZJS
3984
3985 /* Block SIGCHLD here, before notifying child.
3986 * process_pty() will handle it with the other signals. */
3987 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
3988
3989 /* Reset signal to default */
3990 r = default_signals(SIGCHLD, -1);
3991 if (r < 0)
3992 return log_error_errno(r, "Failed to reset SIGCHLD: %m");
3993
3994 r = sd_event_new(&event);
3995 if (r < 0)
3996 return log_error_errno(r, "Failed to get default event source: %m");
3997
8fd010bb
LP
3998 (void) sd_event_set_watchdog(event, true);
3999
abdb9b08
LP
4000 if (bus) {
4001 r = sd_bus_attach_event(bus, event, 0);
4002 if (r < 0)
4003 return log_error_errno(r, "Failed to attach bus to event loop: %m");
4004 }
4005
5773024d 4006 r = setup_sd_notify_parent(event, notify_socket, PID_TO_PTR(*pid), &notify_event_source);
b0067625
ZJS
4007 if (r < 0)
4008 return r;
4009
4010 /* Let the child know that we are ready and wait that the child is completely ready now. */
75116558 4011 if (!barrier_place_and_sync(&barrier)) { /* #5 */
b0067625
ZJS
4012 log_error("Child died too early.");
4013 return -ESRCH;
4014 }
4015
4016 /* At this point we have made use of the UID we picked, and thus nss-mymachines
4017 * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
4018 etc_passwd_lock = safe_close(etc_passwd_lock);
4019
4020 sd_notifyf(false,
4021 "STATUS=Container running.\n"
4022 "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
4023 if (!arg_notify_ready)
919f5ae0 4024 (void) sd_notify(false, "READY=1\n");
b0067625
ZJS
4025
4026 if (arg_kill_signal > 0) {
4027 /* Try to kill the init system on SIGINT or SIGTERM */
919f5ae0
LP
4028 (void) sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
4029 (void) sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
b0067625
ZJS
4030 } else {
4031 /* Immediately exit */
919f5ae0
LP
4032 (void) sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4033 (void) sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
b0067625
ZJS
4034 }
4035
6916b164 4036 /* Exit when the child exits */
919f5ae0 4037 (void) sd_event_add_signal(event, NULL, SIGCHLD, on_sigchld, PID_TO_PTR(*pid));
b0067625
ZJS
4038
4039 if (arg_expose_ports) {
4040 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, exposed, &rtnl);
4041 if (r < 0)
4042 return r;
4043
4044 (void) expose_port_execute(rtnl, arg_expose_ports, exposed);
4045 }
4046
4047 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4048
4049 r = pty_forward_new(event, master,
4050 PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY),
4051 &forward);
4052 if (r < 0)
4053 return log_error_errno(r, "Failed to create PTY forwarder: %m");
4054
4055 r = sd_event_loop(event);
4056 if (r < 0)
4057 return log_error_errno(r, "Failed to run event loop: %m");
4058
4059 pty_forward_get_last_char(forward, &last_char);
4060
4061 forward = pty_forward_free(forward);
4062
4063 if (!arg_quiet && last_char != '\n')
4064 putc('\n', stdout);
4065
4066 /* Kill if it is not dead yet anyway */
1d78fea2
LP
4067 if (bus) {
4068 if (arg_register)
4069 terminate_machine(bus, arg_machine);
4070 else if (!arg_keep_unit)
4071 terminate_scope(bus, arg_machine);
4072 }
b0067625
ZJS
4073
4074 /* Normally redundant, but better safe than sorry */
c67b0082 4075 (void) kill(*pid, SIGKILL);
b0067625
ZJS
4076
4077 r = wait_for_container(*pid, &container_status);
4078 *pid = 0;
4079
4080 if (r < 0)
4081 /* We failed to wait for the container, or the container exited abnormally. */
4082 return r;
4083 if (r > 0 || container_status == CONTAINER_TERMINATED) {
27e29a1e
ZJS
4084 /* r > 0 → The container exited with a non-zero status.
4085 * As a special case, we need to replace 133 with a different value,
4086 * because 133 is special-cased in the service file to reboot the container.
4087 * otherwise → The container exited with zero status and a reboot was not requested.
4088 */
2a49b612 4089 if (r == EXIT_FORCE_RESTART)
27e29a1e 4090 r = EXIT_FAILURE; /* replace 133 with the general failure code */
b0067625 4091 *ret = r;
b0067625
ZJS
4092 return 0; /* finito */
4093 }
4094
4095 /* CONTAINER_REBOOTED, loop again */
4096
4097 if (arg_keep_unit) {
4098 /* Special handling if we are running as a service: instead of simply
4099 * restarting the machine we want to restart the entire service, so let's
4100 * inform systemd about this with the special exit code 133. The service
4101 * file uses RestartForceExitStatus=133 so that this results in a full
4102 * nspawn restart. This is necessary since we might have cgroup parameters
4103 * set we want to have flushed out. */
2a49b612
ZJS
4104 *ret = EXIT_FORCE_RESTART;
4105 return 0; /* finito */
b0067625
ZJS
4106 }
4107
4108 expose_port_flush(arg_expose_ports, exposed);
4109
4110 (void) remove_veth_links(veth_name, arg_network_veth_extra);
4111 *veth_created = false;
4112 return 1; /* loop again */
4113}
4114
bf428efb 4115static int initialize_rlimits(void) {
bf428efb
LP
4116 /* The default resource limits the kernel passes to PID 1, as per kernel 4.16. Let's pass our container payload
4117 * the same values as the kernel originally passed to PID 1, in order to minimize differences between host and
4118 * container execution environments. */
4119
4120 static const struct rlimit kernel_defaults[_RLIMIT_MAX] = {
4121 [RLIMIT_AS] = { RLIM_INFINITY, RLIM_INFINITY },
4122 [RLIMIT_CORE] = { 0, RLIM_INFINITY },
4123 [RLIMIT_CPU] = { RLIM_INFINITY, RLIM_INFINITY },
4124 [RLIMIT_DATA] = { RLIM_INFINITY, RLIM_INFINITY },
4125 [RLIMIT_FSIZE] = { RLIM_INFINITY, RLIM_INFINITY },
4126 [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY },
4127 [RLIMIT_MEMLOCK] = { 65536, 65536 },
4128 [RLIMIT_MSGQUEUE] = { 819200, 819200 },
4129 [RLIMIT_NICE] = { 0, 0 },
4130 [RLIMIT_NOFILE] = { 1024, 4096 },
4131 [RLIMIT_RSS] = { RLIM_INFINITY, RLIM_INFINITY },
4132 [RLIMIT_RTPRIO] = { 0, 0 },
4133 [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY },
4134 [RLIMIT_STACK] = { 8388608, RLIM_INFINITY },
4135
4136 /* The kernel scales the default for RLIMIT_NPROC and RLIMIT_SIGPENDING based on the system's amount of
4137 * RAM. To provide best compatibility we'll read these limits off PID 1 instead of hardcoding them
4138 * here. This is safe as we know that PID 1 doesn't change these two limits and thus the original
4139 * kernel's initialization should still be valid during runtime — at least if PID 1 is systemd. Note
4140 * that PID 1 changes a number of other resource limits during early initialization which is why we
4141 * don't read the other limits from PID 1 but prefer the static table above. */
4142 };
4143
4144 int rl;
4145
4146 for (rl = 0; rl < _RLIMIT_MAX; rl++) {
bf428efb
LP
4147 /* Let's only fill in what the user hasn't explicitly configured anyway */
4148 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)) == 0) {
4149 const struct rlimit *v;
4150 struct rlimit buffer;
4151
4152 if (IN_SET(rl, RLIMIT_NPROC, RLIMIT_SIGPENDING)) {
4153 /* For these two let's read the limits off PID 1. See above for an explanation. */
4154
4155 if (prlimit(1, rl, NULL, &buffer) < 0)
4156 return log_error_errno(errno, "Failed to read resource limit RLIMIT_%s of PID 1: %m", rlimit_to_string(rl));
4157
4158 v = &buffer;
4159 } else
4160 v = kernel_defaults + rl;
4161
4162 arg_rlimit[rl] = newdup(struct rlimit, v, 1);
4163 if (!arg_rlimit[rl])
4164 return log_oom();
4165 }
4166
4167 if (DEBUG_LOGGING) {
4168 _cleanup_free_ char *k = NULL;
4169
4170 (void) rlimit_format(arg_rlimit[rl], &k);
4171 log_debug("Setting RLIMIT_%s to %s.", rlimit_to_string(rl), k);
4172 }
4173 }
4174
4175 return 0;
4176}
4177
03cfe0d5 4178int main(int argc, char *argv[]) {
2d845785
LP
4179 _cleanup_free_ char *console = NULL;
4180 _cleanup_close_ int master = -1;
03cfe0d5 4181 _cleanup_fdset_free_ FDSet *fds = NULL;
2d845785 4182 int r, n_fd_passed, ret = EXIT_SUCCESS;
5aa3eba5 4183 char veth_name[IFNAMSIZ] = "";
17cbb288 4184 bool secondary = false, remove_directory = false, remove_image = false;
03cfe0d5 4185 pid_t pid = 0;
03cfe0d5 4186 union in_addr_union exposed = {};
8e766630 4187 _cleanup_(release_lock_file) LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
c67b0082
LP
4188 bool interactive, veth_created = false, remove_tmprootdir = false;
4189 char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
2d845785 4190 _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
18b5886e
LP
4191 _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
4192 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
03cfe0d5
LP
4193
4194 log_parse_environment();
4195 log_open();
415fc41c 4196
7732f92b
LP
4197 /* Make sure rename_process() in the stub init process can work */
4198 saved_argv = argv;
4199 saved_argc = argc;
4200
03cfe0d5
LP
4201 r = parse_argv(argc, argv);
4202 if (r <= 0)
4203 goto finish;
4204
fba868fa
LP
4205 r = must_be_root();
4206 if (r < 0)
03cfe0d5 4207 goto finish;
fba868fa 4208
bf428efb
LP
4209 r = initialize_rlimits();
4210 if (r < 0)
4211 goto finish;
4212
f757855e
LP
4213 r = determine_names();
4214 if (r < 0)
4215 goto finish;
4216
4217 r = load_settings();
4218 if (r < 0)
4219 goto finish;
4220
d5455d2f
LP
4221 parse_environment();
4222
5eee8290
LP
4223 r = cg_unified_flush();
4224 if (r < 0) {
4225 log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
4226 goto finish;
4227 }
4228
f757855e
LP
4229 r = verify_arguments();
4230 if (r < 0)
4231 goto finish;
03cfe0d5 4232
8199d554
LP
4233 r = detect_unified_cgroup_hierarchy_from_environment();
4234 if (r < 0)
4235 goto finish;
4236
2949ff26
LP
4237 /* Ignore SIGPIPE here, because we use splice() on the ptyfwd stuff and that will generate SIGPIPE if
4238 * the result is closed. Note that the container payload child will reset signal mask+handler anyway,
4239 * so just turning this off here means we only turn it off in nspawn itself, not any children. */
4240 (void) ignore_signals(SIGPIPE, -1);
4241
03cfe0d5
LP
4242 n_fd_passed = sd_listen_fds(false);
4243 if (n_fd_passed > 0) {
4244 r = fdset_new_listen_fds(&fds, false);
4245 if (r < 0) {
4246 log_error_errno(r, "Failed to collect file descriptors: %m");
4247 goto finish;
4248 }
4249 }
4250
83e803a9
ZJS
4251 /* The "default" umask. This is appropriate for most file and directory
4252 * operations performed by nspawn, and is the umask that will be used for
4253 * the child. Functions like copy_devnodes() change the umask temporarily. */
4254 umask(0022);
4255
03cfe0d5
LP
4256 if (arg_directory) {
4257 assert(!arg_image);
4258
4259 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
4260 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
4261 r = -EINVAL;
4262 goto finish;
4263 }
4264
4265 if (arg_ephemeral) {
4266 _cleanup_free_ char *np = NULL;
4267
8d4aa2bb 4268 r = chase_symlinks_and_update(&arg_directory, 0);
3f342ec4
LP
4269 if (r < 0)
4270 goto finish;
4271
03cfe0d5
LP
4272 /* If the specified path is a mount point we
4273 * generate the new snapshot immediately
4274 * inside it under a random name. However if
4275 * the specified is not a mount point we
4276 * create the new snapshot in the parent
4277 * directory, just next to it. */
e1873695 4278 r = path_is_mount_point(arg_directory, NULL, 0);
03cfe0d5
LP
4279 if (r < 0) {
4280 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
4281 goto finish;
4282 }
4283 if (r > 0)
770b5ce4 4284 r = tempfn_random_child(arg_directory, "machine.", &np);
03cfe0d5 4285 else
770b5ce4 4286 r = tempfn_random(arg_directory, "machine.", &np);
03cfe0d5 4287 if (r < 0) {
0f3be6ca 4288 log_error_errno(r, "Failed to generate name for directory snapshot: %m");
03cfe0d5
LP
4289 goto finish;
4290 }
4291
4292 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4293 if (r < 0) {
4294 log_error_errno(r, "Failed to lock %s: %m", np);
4295 goto finish;
4296 }
4297
17cbb288
LP
4298 r = btrfs_subvol_snapshot(arg_directory, np,
4299 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
4300 BTRFS_SNAPSHOT_FALLBACK_COPY |
4301 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
4302 BTRFS_SNAPSHOT_RECURSIVE |
4303 BTRFS_SNAPSHOT_QUOTA);
03cfe0d5
LP
4304 if (r < 0) {
4305 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
4306 goto finish;
ec16945e
LP
4307 }
4308
1cc6c93a 4309 free_and_replace(arg_directory, np);
ec16945e 4310
17cbb288 4311 remove_directory = true;
30535c16
LP
4312
4313 } else {
cb638b5e 4314 r = chase_symlinks_and_update(&arg_directory, arg_template ? CHASE_NONEXISTENT : 0);
8d4aa2bb
LP
4315 if (r < 0)
4316 goto finish;
4317
30535c16
LP
4318 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4319 if (r == -EBUSY) {
4320 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
4321 goto finish;
4322 }
4323 if (r < 0) {
4324 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
476b8254 4325 goto finish;
30535c16
LP
4326 }
4327
4328 if (arg_template) {
8d4aa2bb 4329 r = chase_symlinks_and_update(&arg_template, 0);
3f342ec4
LP
4330 if (r < 0)
4331 goto finish;
4332
17cbb288
LP
4333 r = btrfs_subvol_snapshot(arg_template, arg_directory,
4334 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
4335 BTRFS_SNAPSHOT_FALLBACK_COPY |
4336 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
4337 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
4338 BTRFS_SNAPSHOT_RECURSIVE |
4339 BTRFS_SNAPSHOT_QUOTA);
ff6c6cc1
LP
4340 if (r == -EEXIST)
4341 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
4342 "Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
4343 else if (r < 0) {
83521414 4344 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
30535c16 4345 goto finish;
ff6c6cc1
LP
4346 } else
4347 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
4348 "Populated %s from template %s.", arg_directory, arg_template);
30535c16 4349 }
ec16945e
LP
4350 }
4351
7732f92b 4352 if (arg_start_mode == START_BOOT) {
a5201ed6 4353 const char *p;
c9fe05e0 4354
a5201ed6
LP
4355 if (arg_pivot_root_new)
4356 p = prefix_roota(arg_directory, arg_pivot_root_new);
4357 else
4358 p = arg_directory;
c9fe05e0
AR
4359
4360 if (path_is_os_tree(p) <= 0) {
4361 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", p);
ec16945e 4362 r = -EINVAL;
1b9e5b12
LP
4363 goto finish;
4364 }
4365 } else {
c9fe05e0
AR
4366 const char *p, *q;
4367
a5201ed6
LP
4368 if (arg_pivot_root_new)
4369 p = prefix_roota(arg_directory, arg_pivot_root_new);
4370 else
4371 p = arg_directory;
c9fe05e0
AR
4372
4373 q = strjoina(p, "/usr/");
1b9e5b12 4374
c9fe05e0
AR
4375 if (laccess(q, F_OK) < 0) {
4376 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", p);
ec16945e 4377 r = -EINVAL;
1b9e5b12 4378 goto finish;
1b9e5b12
LP
4379 }
4380 }
ec16945e 4381
6b9132a9 4382 } else {
ec16945e
LP
4383 assert(arg_image);
4384 assert(!arg_template);
4385
8d4aa2bb 4386 r = chase_symlinks_and_update(&arg_image, 0);
3f342ec4
LP
4387 if (r < 0)
4388 goto finish;
4389
0f3be6ca
LP
4390 if (arg_ephemeral) {
4391 _cleanup_free_ char *np = NULL;
4392
4393 r = tempfn_random(arg_image, "machine.", &np);
4394 if (r < 0) {
4395 log_error_errno(r, "Failed to generate name for image snapshot: %m");
4396 goto finish;
4397 }
4398
4399 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4400 if (r < 0) {
4401 r = log_error_errno(r, "Failed to create image lock: %m");
4402 goto finish;
4403 }
4404
adc6f43b 4405 r = copy_file(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600, FS_NOCOW_FL, COPY_REFLINK|COPY_CRTIME);
0f3be6ca
LP
4406 if (r < 0) {
4407 r = log_error_errno(r, "Failed to copy image file: %m");
4408 goto finish;
4409 }
4410
1cc6c93a 4411 free_and_replace(arg_image, np);
0f3be6ca
LP
4412
4413 remove_image = true;
4414 } else {
4415 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4416 if (r == -EBUSY) {
4417 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
4418 goto finish;
4419 }
4420 if (r < 0) {
4421 r = log_error_errno(r, "Failed to create image lock: %m");
4422 goto finish;
4423 }
4623e8e6 4424
78ebe980
LP
4425 if (!arg_root_hash) {
4426 r = root_hash_load(arg_image, &arg_root_hash, &arg_root_hash_size);
4427 if (r < 0) {
4428 log_error_errno(r, "Failed to load root hash file for %s: %m", arg_image);
4429 goto finish;
4430 }
4431 }
30535c16
LP
4432 }
4433
c67b0082 4434 if (!mkdtemp(tmprootdir)) {
0f3be6ca 4435 r = log_error_errno(errno, "Failed to create temporary directory: %m");
6b9132a9 4436 goto finish;
1b9e5b12 4437 }
6b9132a9 4438
c67b0082
LP
4439 remove_tmprootdir = true;
4440
4441 arg_directory = strdup(tmprootdir);
1b9e5b12
LP
4442 if (!arg_directory) {
4443 r = log_oom();
4444 goto finish;
6b9132a9 4445 }
88213476 4446
2d845785
LP
4447 r = loop_device_make_by_path(arg_image, arg_read_only ? O_RDONLY : O_RDWR, &loop);
4448 if (r < 0) {
4449 log_error_errno(r, "Failed to set up loopback block device: %m");
842f3b0f
LP
4450 goto finish;
4451 }
1b9e5b12 4452
4526113f 4453 r = dissect_image_and_warn(
e0f9e7bd 4454 loop->fd,
4526113f 4455 arg_image,
e0f9e7bd
LP
4456 arg_root_hash, arg_root_hash_size,
4457 DISSECT_IMAGE_REQUIRE_ROOT,
4458 &dissected_image);
2d845785 4459 if (r == -ENOPKG) {
4526113f 4460 /* dissect_image_and_warn() already printed a brief error message. Expand on that with more details. */
2d845785
LP
4461 log_notice("Note that the disk image needs to\n"
4462 " a) either contain only a single MBR partition of type 0x83 that is marked bootable\n"
4463 " b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n"
4464 " c) or follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n"
4465 " d) or contain a file system without a partition table\n"
4466 "in order to be bootable with systemd-nspawn.");
1b9e5b12 4467 goto finish;
2d845785 4468 }
4526113f 4469 if (r < 0)
842f3b0f 4470 goto finish;
1b9e5b12 4471
4623e8e6
LP
4472 if (!arg_root_hash && dissected_image->can_verity)
4473 log_notice("Note: image %s contains verity information, but no root hash specified! Proceeding without integrity checking.", arg_image);
4474
4475 r = dissected_image_decrypt_interactively(dissected_image, NULL, arg_root_hash, arg_root_hash_size, 0, &decrypted_image);
1b9e5b12
LP
4476 if (r < 0)
4477 goto finish;
0f3be6ca
LP
4478
4479 /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */
4480 if (remove_image && unlink(arg_image) >= 0)
4481 remove_image = false;
842f3b0f 4482 }
842f3b0f 4483
86c0dd4a 4484 r = custom_mount_prepare_all(arg_directory, arg_custom_mounts, arg_n_custom_mounts);
5a8af538
LP
4485 if (r < 0)
4486 goto finish;
4487
03cfe0d5
LP
4488 interactive =
4489 isatty(STDIN_FILENO) > 0 &&
4490 isatty(STDOUT_FILENO) > 0;
9c857b9d 4491
669fc4e5 4492 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NONBLOCK);
db7feb7e 4493 if (master < 0) {
ec16945e 4494 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
a258bf26
LP
4495 goto finish;
4496 }
4497
611b312b
LP
4498 r = ptsname_malloc(master, &console);
4499 if (r < 0) {
4500 r = log_error_errno(r, "Failed to determine tty name: %m");
a258bf26 4501 goto finish;
68b02049
DW
4502 }
4503
4504 if (arg_selinux_apifs_context) {
4505 r = mac_selinux_apply(console, arg_selinux_apifs_context);
4506 if (r < 0)
4507 goto finish;
a258bf26
LP
4508 }
4509
a258bf26 4510 if (unlockpt(master) < 0) {
ec16945e 4511 r = log_error_errno(errno, "Failed to unlock tty: %m");
a258bf26
LP
4512 goto finish;
4513 }
4514
9c857b9d
LP
4515 if (!arg_quiet)
4516 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
4517 arg_machine, arg_image ?: arg_directory);
4518
72c0a2c2 4519 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
a258bf26 4520
66edd963 4521 if (prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) < 0) {
03cfe0d5
LP
4522 r = log_error_errno(errno, "Failed to become subreaper: %m");
4523 goto finish;
4524 }
4525
d87be9b0 4526 for (;;) {
b0067625
ZJS
4527 r = run(master,
4528 console,
2d845785 4529 dissected_image,
b0067625
ZJS
4530 interactive, secondary,
4531 fds,
4532 veth_name, &veth_created,
4533 &exposed,
4534 &pid, &ret);
4535 if (r <= 0)
d87be9b0 4536 break;
d87be9b0 4537 }
88213476
LP
4538
4539finish:
af4ec430 4540 sd_notify(false,
2a49b612
ZJS
4541 r == 0 && ret == EXIT_FORCE_RESTART ? "STOPPING=1\nSTATUS=Restarting..." :
4542 "STOPPING=1\nSTATUS=Terminating...");
af4ec430 4543
9444b1f2 4544 if (pid > 0)
c67b0082 4545 (void) kill(pid, SIGKILL);
88213476 4546
503546da 4547 /* Try to flush whatever is still queued in the pty */
6a0f896b 4548 if (master >= 0) {
1c876927 4549 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, 0);
6a0f896b
LP
4550 master = safe_close(master);
4551 }
4552
4553 if (pid > 0)
4554 (void) wait_for_terminate(pid, NULL);
503546da 4555
50ebcf6c
LP
4556 pager_close();
4557
17cbb288 4558 if (remove_directory && arg_directory) {
ec16945e
LP
4559 int k;
4560
17cbb288 4561 k = rm_rf(arg_directory, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
ec16945e 4562 if (k < 0)
17cbb288 4563 log_warning_errno(k, "Cannot remove '%s', ignoring: %m", arg_directory);
ec16945e
LP
4564 }
4565
0f3be6ca
LP
4566 if (remove_image && arg_image) {
4567 if (unlink(arg_image) < 0)
4568 log_warning_errno(errno, "Can't remove image file '%s', ignoring: %m", arg_image);
4569 }
4570
c67b0082
LP
4571 if (remove_tmprootdir) {
4572 if (rmdir(tmprootdir) < 0)
4573 log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir);
4574 }
4575
785890ac
LP
4576 if (arg_machine) {
4577 const char *p;
4578
63c372cb 4579 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
c6878637 4580 (void) rm_rf(p, REMOVE_ROOT);
785890ac
LP
4581 }
4582
7a8f6325 4583 expose_port_flush(arg_expose_ports, &exposed);
7513c5b8
LP
4584
4585 if (veth_created)
4586 (void) remove_veth_links(veth_name, arg_network_veth_extra);
22b28dfd 4587 (void) remove_bridge(arg_network_zone);
f757855e 4588
04d391da 4589 free(arg_directory);
ec16945e
LP
4590 free(arg_template);
4591 free(arg_image);
7027ff61 4592 free(arg_machine);
3a9530e5 4593 free(arg_hostname);
c74e630d 4594 free(arg_user);
b53ede69
PW
4595 free(arg_pivot_root_new);
4596 free(arg_pivot_root_old);
5f932eb9 4597 free(arg_chdir);
c74e630d 4598 strv_free(arg_setenv);
f757855e 4599 free(arg_network_bridge);
c74e630d
LP
4600 strv_free(arg_network_interfaces);
4601 strv_free(arg_network_macvlan);
4bbfe7ad 4602 strv_free(arg_network_ipvlan);
f6d6bad1 4603 strv_free(arg_network_veth_extra);
f757855e 4604 strv_free(arg_parameters);
df1fac6d
LP
4605 free(arg_network_zone);
4606 free(arg_network_namespace_path);
4607 strv_free(arg_property);
f757855e
LP
4608 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
4609 expose_port_free_all(arg_expose_ports);
4623e8e6 4610 free(arg_root_hash);
bf428efb 4611 rlimit_free_all(arg_rlimit);
df1fac6d
LP
4612 strv_free(arg_syscall_whitelist);
4613 strv_free(arg_syscall_blacklist);
d107bb7d 4614 arg_cpuset = cpu_set_mfree(arg_cpuset);
6d0b55c2 4615
ec16945e 4616 return r < 0 ? EXIT_FAILURE : ret;
88213476 4617}