]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
journal: sync immediately on shutting down journald
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
db9ecf05 1/* SPDX-License-Identifier: LGPL-2.1-or-later */
88213476 2
349cc4a5 3#if HAVE_BLKID
8fe0087e 4#endif
88213476 5#include <errno.h>
88213476 6#include <getopt.h>
503f480f 7#include <linux/fs.h>
1b9e5b12 8#include <linux/loop.h>
349cc4a5 9#if HAVE_SELINUX
8fe0087e 10#include <selinux/selinux.h>
1b9e5b12 11#endif
8fe0087e 12#include <stdlib.h>
8fe0087e 13#include <sys/file.h>
335d2ead 14#include <sys/ioctl.h>
8fe0087e
LP
15#include <sys/personality.h>
16#include <sys/prctl.h>
17#include <sys/types.h>
6916b164 18#include <sys/wait.h>
335d2ead 19#include <termios.h>
8fe0087e 20#include <unistd.h>
1b9e5b12 21
b053cd5f 22#include "sd-bus.h"
1f0cd86b 23#include "sd-daemon.h"
1f0cd86b 24#include "sd-id128.h"
8fe0087e 25
b5efdb8a 26#include "alloc-util.h"
813dbff4 27#include "ether-addr-util.h"
8fe0087e
LP
28#include "barrier.h"
29#include "base-filesystem.h"
30#include "blkid-util.h"
31#include "btrfs-util.h"
d6b4d1c7 32#include "build.h"
b8ea7a6e 33#include "bus-error.h"
7f8a85e6 34#include "bus-locator.h"
b053cd5f 35#include "bus-util.h"
8fe0087e 36#include "cap-list.h"
430f0182 37#include "capability-util.h"
04d391da 38#include "cgroup-util.h"
f461a28d 39#include "chase.h"
988851b6 40#include "common-signal.h"
8fe0087e 41#include "copy.h"
d107bb7d 42#include "cpu-set-util.h"
786d19fd 43#include "creds-util.h"
4fc9982c 44#include "dev-setup.h"
57f1b61b 45#include "discover-image.h"
2d845785 46#include "dissect-image.h"
8fe0087e 47#include "env-util.h"
3652872a 48#include "escape.h"
3ffd4af2 49#include "fd-util.h"
842f3b0f 50#include "fdset.h"
a5c32cff 51#include "fileio.h"
f97b34a6 52#include "format-util.h"
f4f15635 53#include "fs-util.h"
1b9e5b12 54#include "gpt.h"
4623e8e6 55#include "hexdecoct.h"
e2054217 56#include "hostname-setup.h"
8fe0087e 57#include "hostname-util.h"
910fd145 58#include "id128-util.h"
3652872a 59#include "io-util.h"
8fe0087e 60#include "log.h"
2d845785 61#include "loop-util.h"
8fe0087e 62#include "loopback-setup.h"
e8ac916e 63#include "machine-credential.h"
8fe0087e 64#include "macro.h"
44dbef90 65#include "main-func.h"
f5947a5e 66#include "missing_sched.h"
8fe0087e 67#include "mkdir.h"
4349cd7c 68#include "mount-util.h"
049af8ad 69#include "mountpoint-util.h"
0cb8e3d1 70#include "namespace-util.h"
8fe0087e 71#include "netlink-util.h"
2f893044 72#include "nspawn-bind-user.h"
07630cea 73#include "nspawn-cgroup.h"
3603efde 74#include "nspawn-def.h"
07630cea
LP
75#include "nspawn-expose-ports.h"
76#include "nspawn-mount.h"
77#include "nspawn-network.h"
de40a303 78#include "nspawn-oci.h"
7336138e 79#include "nspawn-patch-uid.h"
07630cea 80#include "nspawn-register.h"
910fd145 81#include "nspawn-seccomp.h"
07630cea
LP
82#include "nspawn-settings.h"
83#include "nspawn-setuid.h"
7732f92b 84#include "nspawn-stub-pid1.h"
c9394f4f 85#include "nspawn-util.h"
91181e07 86#include "nspawn.h"
d8b4d14d 87#include "nulstr-util.h"
d58ad743 88#include "os-util.h"
50ebcf6c 89#include "pager.h"
614b022c 90#include "parse-argument.h"
6bedfcbb 91#include "parse-util.h"
294bf0c3 92#include "pretty-print.h"
0b452006 93#include "process-util.h"
8fe0087e
LP
94#include "ptyfwd.h"
95#include "random-util.h"
8869a0b4 96#include "raw-clone.h"
86775e35 97#include "resolve-util.h"
bf428efb 98#include "rlimit-util.h"
8fe0087e 99#include "rm-rf.h"
de40a303 100#include "seccomp-util.h"
68b02049 101#include "selinux-util.h"
8fe0087e 102#include "signal-util.h"
2583fbea 103#include "socket-util.h"
8fcde012 104#include "stat-util.h"
15a5e950 105#include "stdio-util.h"
5c828e66 106#include "string-table.h"
07630cea 107#include "string-util.h"
8fe0087e 108#include "strv.h"
de40a303 109#include "sysctl-util.h"
8fe0087e 110#include "terminal-util.h"
e4de7287 111#include "tmpfile-util.h"
affb60b1 112#include "umask-util.h"
43c3fb46 113#include "unit-name.h"
b1d4f8e1 114#include "user-util.h"
e9642be2 115
e96ceaba
LP
/* The notify socket inside the container that the container can use to talk to nspawn using the
 * sd_notify(3) protocol */
#define NSPAWN_NOTIFY_SOCKET_PATH "/run/host/notify"
#define NSPAWN_MOUNT_TUNNEL "/run/host/incoming"

#define EXIT_FORCE_RESTART 133
121
113cea80
DH
/* NOTE(review): presumably describes how a container run ended (plain termination vs. a reboot
 * request) — confirm against the code that produces and consumes these values. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,
        CONTAINER_REBOOTED,
} ContainerStatus;
126
/* File-scope configuration state. The initializers below are the defaults; the parse_*() helpers in
 * this file adjust some of them from the environment and the command line. */
static char *arg_directory = NULL;
static char *arg_template = NULL;
static char *arg_chdir = NULL;
static char *arg_pivot_root_new = NULL;
static char *arg_pivot_root_old = NULL;
static char *arg_user = NULL;
static uid_t arg_uid = UID_INVALID;
static gid_t arg_gid = GID_INVALID;
static gid_t* arg_supplementary_gids = NULL;
static size_t arg_n_supplementary_gids = 0;
static sd_id128_t arg_uuid = {};
static char *arg_machine = NULL; /* The name used by the host to refer to this */
static char *arg_hostname = NULL; /* The name the payload sees by default */
static const char *arg_selinux_context = NULL;
static const char *arg_selinux_apifs_context = NULL;
static char *arg_slice = NULL;
static bool arg_private_network = false;
static bool arg_read_only = false;
static StartMode arg_start_mode = START_PID1;
static bool arg_ephemeral = false;
static LinkJournal arg_link_journal = LINK_AUTO;
static bool arg_link_journal_try = false;
/* Capability bits that are set by default. */
static uint64_t arg_caps_retain =
        (1ULL << CAP_AUDIT_CONTROL) |
        (1ULL << CAP_AUDIT_WRITE) |
        (1ULL << CAP_CHOWN) |
        (1ULL << CAP_DAC_OVERRIDE) |
        (1ULL << CAP_DAC_READ_SEARCH) |
        (1ULL << CAP_FOWNER) |
        (1ULL << CAP_FSETID) |
        (1ULL << CAP_IPC_OWNER) |
        (1ULL << CAP_KILL) |
        (1ULL << CAP_LEASE) |
        (1ULL << CAP_LINUX_IMMUTABLE) |
        (1ULL << CAP_MKNOD) |
        (1ULL << CAP_NET_BIND_SERVICE) |
        (1ULL << CAP_NET_BROADCAST) |
        (1ULL << CAP_NET_RAW) |
        (1ULL << CAP_SETFCAP) |
        (1ULL << CAP_SETGID) |
        (1ULL << CAP_SETPCAP) |
        (1ULL << CAP_SETUID) |
        (1ULL << CAP_SYS_ADMIN) |
        (1ULL << CAP_SYS_BOOT) |
        (1ULL << CAP_SYS_CHROOT) |
        (1ULL << CAP_SYS_NICE) |
        (1ULL << CAP_SYS_PTRACE) |
        (1ULL << CAP_SYS_RESOURCE) |
        (1ULL << CAP_SYS_TTY_CONFIG);
static uint64_t arg_caps_ambient = 0;
static CapabilityQuintet arg_full_capabilities = CAPABILITY_QUINTET_NULL;
static CustomMount *arg_custom_mounts = NULL;
static size_t arg_n_custom_mounts = 0;
static char **arg_setenv = NULL;
static bool arg_quiet = false;
static bool arg_register = true;
static bool arg_keep_unit = false;
static char **arg_network_interfaces = NULL;
static char **arg_network_macvlan = NULL;
static char **arg_network_ipvlan = NULL;
static bool arg_network_veth = false;
static char **arg_network_veth_extra = NULL;
static char *arg_network_bridge = NULL;
static char *arg_network_zone = NULL;
static char *arg_network_namespace_path = NULL;
/* Filled in from $SYSTEMD_NSPAWN_NETWORK_MAC by parse_environment(). Unlike every other arg_* variable
 * here this one has external linkage.
 * NOTE(review): check whether another translation unit really references it; if not, make it static. */
struct ether_addr arg_network_provided_mac = {};
static PagerFlags arg_pager_flags = 0;
static unsigned long arg_personality = PERSONALITY_INVALID;
static char *arg_image = NULL;
static char *arg_oci_bundle = NULL;
static VolatileMode arg_volatile_mode = VOLATILE_NO;
static ExposePort *arg_expose_ports = NULL;
static char **arg_property = NULL;
static sd_bus_message *arg_property_message = NULL;
static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
static UserNamespaceOwnership arg_userns_ownership = _USER_NAMESPACE_OWNERSHIP_INVALID;
static int arg_kill_signal = 0;
/* Set by detect_unified_cgroup_hierarchy_from_environment() / ..._from_image(). */
static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN;
/* Bit mask of SETTING_* flags; the parsers in this file set a bit when they assign the matching setting. */
static SettingsMask arg_settings_mask = 0;
static int arg_settings_trusted = -1;
static char **arg_parameters = NULL;
/* Overridden by $SYSTEMD_NSPAWN_CONTAINER_SERVICE in parse_environment(). */
static const char *arg_container_service_name = "systemd-nspawn";
static bool arg_notify_ready = false;
/* Forced to false by parse_environment() if cg_ns_supported() says no; otherwise controlled by
 * $SYSTEMD_NSPAWN_USE_CGNS. */
static bool arg_use_cgns = true;
/* Individual flags are cleared or set by parse_share_ns_env() according to the
 * $SYSTEMD_NSPAWN_SHARE_* environment variables. */
static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
/* Adjusted by parse_mount_settings_env(). */
static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_TMPFS_TMP;
static VeritySettings arg_verity_settings = VERITY_SETTINGS_DEFAULT;
static char **arg_syscall_allow_list = NULL;
static char **arg_syscall_deny_list = NULL;
#if HAVE_SECCOMP
static scmp_filter_ctx arg_seccomp = NULL;
#endif
static struct rlimit *arg_rlimit[_RLIMIT_MAX] = {};
static bool arg_no_new_privileges = false;
static int arg_oom_score_adjust = 0;
static bool arg_oom_score_adjust_set = false;
static CPUSet arg_cpu_set = {};
static ResolvConfMode arg_resolv_conf = RESOLV_CONF_AUTO;
static TimezoneMode arg_timezone = TIMEZONE_AUTO;
static unsigned arg_console_width = UINT_MAX, arg_console_height = UINT_MAX;
static DeviceNode* arg_extra_nodes = NULL;
static size_t arg_n_extra_nodes = 0;
static char **arg_sysctl = NULL;
/* Assigned by handle_arg_console(). */
static ConsoleMode arg_console_mode = _CONSOLE_MODE_INVALID;
static MachineCredential *arg_credentials = NULL;
static size_t arg_n_credentials = 0;
static char **arg_bind_user = NULL;
/* Initialized from $SYSTEMD_SUPPRESS_SYNC by parse_environment(). */
static bool arg_suppress_sync = false;
static char *arg_settings_filename = NULL;
static Architecture arg_architecture = _ARCHITECTURE_INVALID;
static ImagePolicy *arg_image_policy = NULL;
88213476 239
6145bb4f
LP
/* Pair each dynamically allocated setting with the function that releases it.
 * NOTE(review): presumably these run automatically when the program exits — confirm against the
 * definition of STATIC_DESTRUCTOR_REGISTER(). */
STATIC_DESTRUCTOR_REGISTER(arg_directory, freep);
STATIC_DESTRUCTOR_REGISTER(arg_template, freep);
STATIC_DESTRUCTOR_REGISTER(arg_chdir, freep);
STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_new, freep);
STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_old, freep);
STATIC_DESTRUCTOR_REGISTER(arg_user, freep);
STATIC_DESTRUCTOR_REGISTER(arg_supplementary_gids, freep);
STATIC_DESTRUCTOR_REGISTER(arg_machine, freep);
STATIC_DESTRUCTOR_REGISTER(arg_hostname, freep);
STATIC_DESTRUCTOR_REGISTER(arg_slice, freep);
STATIC_DESTRUCTOR_REGISTER(arg_setenv, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_interfaces, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_macvlan, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_ipvlan, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_veth_extra, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_bridge, freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_zone, freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_namespace_path, freep);
STATIC_DESTRUCTOR_REGISTER(arg_image, freep);
STATIC_DESTRUCTOR_REGISTER(arg_oci_bundle, freep);
STATIC_DESTRUCTOR_REGISTER(arg_property, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_property_message, sd_bus_message_unrefp);
STATIC_DESTRUCTOR_REGISTER(arg_parameters, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_verity_settings, verity_settings_done);
STATIC_DESTRUCTOR_REGISTER(arg_syscall_allow_list, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_syscall_deny_list, strv_freep);
#if HAVE_SECCOMP
STATIC_DESTRUCTOR_REGISTER(arg_seccomp, seccomp_releasep);
#endif
STATIC_DESTRUCTOR_REGISTER(arg_cpu_set, cpu_set_reset);
STATIC_DESTRUCTOR_REGISTER(arg_sysctl, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_bind_user, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_settings_filename, freep);
STATIC_DESTRUCTOR_REGISTER(arg_image_policy, image_policy_freep);
6145bb4f 274
dce66ffe
ZJS
275static int handle_arg_console(const char *arg) {
276 if (streq(arg, "help")) {
10e8a60b
LP
277 puts("autopipe\n"
278 "interactive\n"
dce66ffe 279 "passive\n"
10e8a60b
LP
280 "pipe\n"
281 "read-only");
dce66ffe
ZJS
282 return 0;
283 }
284
285 if (streq(arg, "interactive"))
286 arg_console_mode = CONSOLE_INTERACTIVE;
287 else if (streq(arg, "read-only"))
288 arg_console_mode = CONSOLE_READ_ONLY;
289 else if (streq(arg, "passive"))
290 arg_console_mode = CONSOLE_PASSIVE;
554c4beb
LP
291 else if (streq(arg, "pipe")) {
292 if (isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0)
293 log_full(arg_quiet ? LOG_DEBUG : LOG_NOTICE,
294 "Console mode 'pipe' selected, but standard input/output are connected to an interactive TTY. "
295 "Most likely you want to use 'interactive' console mode for proper interactivity and shell job control. "
296 "Proceeding anyway.");
297
dce66ffe 298 arg_console_mode = CONSOLE_PIPE;
10e8a60b
LP
299 } else if (streq(arg, "autopipe")) {
300 if (isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0)
301 arg_console_mode = CONSOLE_INTERACTIVE;
302 else
303 arg_console_mode = CONSOLE_PIPE;
554c4beb 304 } else
dce66ffe
ZJS
305 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown console mode: %s", optarg);
306
307 arg_settings_mask |= SETTING_CONSOLE_MODE;
308 return 1;
309}
310
37ec0fdd
LP
311static int help(void) {
312 _cleanup_free_ char *link = NULL;
313 int r;
314
384c2c32 315 pager_open(arg_pager_flags);
50ebcf6c 316
37ec0fdd
LP
317 r = terminal_urlify_man("systemd-nspawn", "1", &link);
318 if (r < 0)
319 return log_oom();
320
25148653 321 printf("%1$s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
37a92352 322 "%5$sSpawn a command or OS in a light-weight container.%6$s\n\n"
a8828ed9
DW
323 " -h --help Show this help\n"
324 " --version Print version string\n"
69c79d3c 325 " -q --quiet Do not show status information\n"
bb068de0 326 " --no-pager Do not pipe output into a pager\n"
25148653
LP
327 " --settings=BOOLEAN Load additional settings from .nspawn file\n\n"
328 "%3$sImage:%4$s\n"
1b9e5b12 329 " -D --directory=PATH Root directory for the container\n"
ec16945e
LP
330 " --template=PATH Initialize root directory from template directory,\n"
331 " if missing\n"
332 " -x --ephemeral Run container with snapshot of root directory, and\n"
333 " remove it after exit\n"
25e68fd3
LP
334 " -i --image=PATH Root file system disk image (or device node) for\n"
335 " the container\n"
84be0c71 336 " --image-policy=POLICY Specify disk image dissection policy\n"
de40a303 337 " --oci-bundle=PATH OCI bundle directory\n"
25148653
LP
338 " --read-only Mount the root directory read-only\n"
339 " --volatile[=MODE] Run the system in volatile mode\n"
25e68fd3 340 " --root-hash=HASH Specify verity root hash for root disk image\n"
c2923fdc
LB
341 " --root-hash-sig=SIG Specify pkcs7 signature of root hash for verity\n"
342 " as a DER encoded PKCS7, either as a path to a file\n"
343 " or as an ASCII base64 encoded string prefixed by\n"
344 " 'base64:'\n"
e7cbe5cb 345 " --verity-data=PATH Specify hash device for verity\n"
25148653
LP
346 " --pivot-root=PATH[:PATH]\n"
347 " Pivot root to given directory in the container\n\n"
348 "%3$sExecution:%4$s\n"
7732f92b 349 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
a8828ed9 350 " -b --boot Boot up full system (i.e. invoke init)\n"
5f932eb9 351 " --chdir=PATH Set working directory in the container\n"
0d2a0179 352 " -E --setenv=NAME[=VALUE] Pass an environment variable to PID 1\n"
25148653
LP
353 " -u --user=USER Run the command under specified user or UID\n"
354 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
4a4654e0
LP
355 " --notify-ready=BOOLEAN Receive notifications from the child init process\n"
356 " --suppress-sync=BOOLEAN\n"
357 " Suppress any form of disk data synchronization\n\n"
25148653 358 "%3$sSystem Identity:%4$s\n"
a8828ed9 359 " -M --machine=NAME Set the machine name for the container\n"
3a9530e5 360 " --hostname=NAME Override the hostname for the container\n"
25148653
LP
361 " --uuid=UUID Set a specific machine UUID for the container\n\n"
362 "%3$sProperties:%4$s\n"
a8828ed9 363 " -S --slice=SLICE Place the container in the specified slice\n"
f36933fe 364 " --property=NAME=VALUE Set scope unit property\n"
25148653
LP
365 " --register=BOOLEAN Register container as machine\n"
366 " --keep-unit Do not register a scope for the machine, reuse\n"
367 " the service unit nspawn is running in\n\n"
368 "%3$sUser Namespacing:%4$s\n"
b917743d
YW
369 " --private-users=no Run without user namespacing\n"
370 " --private-users=yes|pick|identity\n"
371 " Run within user namespace, autoselect UID/GID range\n"
372 " --private-users=UIDBASE[:NUIDS]\n"
90b4a64d 373 " Similar, but with user configured UID/GID range\n"
6c045a99
LP
374 " --private-users-ownership=MODE\n"
375 " Adjust ('chown') or map ('map') OS tree ownership\n"
b917743d
YW
376 " to private UID/GID range\n"
377 " -U Equivalent to --private-users=pick and\n"
378 " --private-users-ownership=auto\n\n"
25148653 379 "%3$sNetworking:%4$s\n"
69c79d3c 380 " --private-network Disable network in container\n"
2f091b1b 381 " --network-interface=HOSTIF[:CONTAINERIF]\n"
69c79d3c
LP
382 " Assign an existing network interface to the\n"
383 " container\n"
2f091b1b 384 " --network-macvlan=HOSTIF[:CONTAINERIF]\n"
c74e630d
LP
385 " Create a macvlan network interface based on an\n"
386 " existing network interface to the container\n"
2f091b1b 387 " --network-ipvlan=HOSTIF[:CONTAINERIF]\n"
387f6955 388 " Create an ipvlan network interface based on an\n"
4bbfe7ad 389 " existing network interface to the container\n"
a8eaaee7 390 " -n --network-veth Add a virtual Ethernet connection between host\n"
69c79d3c 391 " and container\n"
f6d6bad1
LP
392 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
393 " Add an additional virtual Ethernet link between\n"
394 " host and container\n"
ab046dde 395 " --network-bridge=INTERFACE\n"
90b4a64d
ZJS
396 " Add a virtual Ethernet connection to the container\n"
397 " and attach it to an existing bridge on the host\n"
398 " --network-zone=NAME Similar, but attach the new interface to an\n"
399 " an automatically managed bridge interface\n"
d7bea6b6
DP
400 " --network-namespace-path=PATH\n"
401 " Set network namespace to the one represented by\n"
402 " the specified kernel namespace file node\n"
6d0b55c2 403 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
25148653
LP
404 " Expose a container IP port on the host\n\n"
405 "%3$sSecurity:%4$s\n"
a8828ed9
DW
406 " --capability=CAP In addition to the default, retain specified\n"
407 " capability\n"
408 " --drop-capability=CAP Drop the specified capability from the default set\n"
88fc9c9b
TH
409 " --ambient-capability=CAP\n"
410 " Sets the specified capability for the started\n"
411 " process. Not useful if booting a machine.\n"
f4e803c8 412 " --no-new-privileges Set PR_SET_NO_NEW_PRIVS flag for container payload\n"
960e4569
LP
413 " --system-call-filter=LIST|~LIST\n"
414 " Permit/prohibit specific system calls\n"
25148653
LP
415 " -Z --selinux-context=SECLABEL\n"
416 " Set the SELinux security context to be used by\n"
417 " processes in the container\n"
418 " -L --selinux-apifs-context=SECLABEL\n"
419 " Set the SELinux security context to be used by\n"
420 " API/tmpfs file systems in the container\n\n"
421 "%3$sResources:%4$s\n"
bf428efb 422 " --rlimit=NAME=LIMIT Set a resource limit for the payload\n"
81f345df
LP
423 " --oom-score-adjust=VALUE\n"
424 " Adjust the OOM score value for the payload\n"
f4e803c8
LP
425 " --cpu-affinity=CPUS Adjust the CPU affinity of the container\n"
426 " --personality=ARCH Pick personality for this container\n\n"
25148653 427 "%3$sIntegration:%4$s\n"
09d423e9 428 " --resolv-conf=MODE Select mode of /etc/resolv.conf initialization\n"
1688841f 429 " --timezone=MODE Select mode of /etc/localtime initialization\n"
25148653
LP
430 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
431 " host, try-guest, try-host\n"
432 " -j Equivalent to --link-journal=try-guest\n\n"
433 "%3$sMounts:%4$s\n"
5e5bfa6e
EY
434 " --bind=PATH[:PATH[:OPTIONS]]\n"
435 " Bind mount a file or directory from the host into\n"
a8828ed9 436 " the container\n"
5e5bfa6e
EY
437 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
438 " Similar, but creates a read-only bind mount\n"
de40a303
LP
439 " --inaccessible=PATH Over-mount file node with inaccessible node to mask\n"
440 " it\n"
06c17c39 441 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
5a8af538
LP
442 " --overlay=PATH[:PATH...]:PATH\n"
443 " Create an overlay mount from the host to \n"
444 " the container\n"
445 " --overlay-ro=PATH[:PATH...]:PATH\n"
2f893044
LP
446 " Similar, but creates a read-only overlay mount\n"
447 " --bind-user=NAME Bind user from host to container\n\n"
25148653 448 "%3$sInput/Output:%4$s\n"
de40a303
LP
449 " --console=MODE Select how stdin/stdout/stderr and /dev/console are\n"
450 " set up for the container.\n"
3652872a
LP
451 " -P --pipe Equivalent to --console=pipe\n\n"
452 "%3$sCredentials:%4$s\n"
453 " --set-credential=ID:VALUE\n"
454 " Pass a credential with literal value to container.\n"
455 " --load-credential=ID:PATH\n"
456 " Load credential to pass to container from file or\n"
457 " AF_UNIX stream socket.\n"
bc556335
DDM
458 "\nSee the %2$s for details.\n",
459 program_invocation_short_name,
460 link,
461 ansi_underline(),
462 ansi_normal(),
463 ansi_highlight(),
464 ansi_normal());
37ec0fdd
LP
465
466 return 0;
88213476
LP
467}
468
86c0dd4a 469static int custom_mount_check_all(void) {
88614c8a 470 size_t i;
5a8af538 471
5a8af538
LP
472 for (i = 0; i < arg_n_custom_mounts; i++) {
473 CustomMount *m = &arg_custom_mounts[i];
474
0de7acce 475 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
6c045a99 476 if (arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_OFF)
baaa35ad 477 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
c920b863 478 "--private-users-ownership=own may not be combined with custom root mounts.");
6c045a99 479 if (arg_uid_shift == UID_INVALID)
baaa35ad
ZJS
480 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
481 "--private-users with automatic UID shift may not be combined with custom root mounts.");
825d5287 482 }
5a8af538
LP
483 }
484
485 return 0;
486}
487
8199d554 488static int detect_unified_cgroup_hierarchy_from_environment(void) {
c78c095b 489 const char *e, *var = "SYSTEMD_NSPAWN_UNIFIED_HIERARCHY";
415fc41c 490 int r;
5da38d07 491
efdb0237 492 /* Allow the user to control whether the unified hierarchy is used */
c78c095b
ZJS
493
494 e = getenv(var);
495 if (!e) {
d5fc5b2f 496 /* $UNIFIED_CGROUP_HIERARCHY has been renamed to $SYSTEMD_NSPAWN_UNIFIED_HIERARCHY. */
c78c095b
ZJS
497 var = "UNIFIED_CGROUP_HIERARCHY";
498 e = getenv(var);
c78c095b
ZJS
499 }
500
501 if (!isempty(e)) {
efdb0237
LP
502 r = parse_boolean(e);
503 if (r < 0)
c78c095b 504 return log_error_errno(r, "Failed to parse $%s: %m", var);
5da38d07
TH
505 if (r > 0)
506 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
507 else
508 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237
LP
509 }
510
8199d554
LP
511 return 0;
512}
513
514static int detect_unified_cgroup_hierarchy_from_image(const char *directory) {
515 int r;
516
75b0d8b8
ZJS
517 /* Let's inherit the mode to use from the host system, but let's take into consideration what systemd
518 * in the image actually supports. */
b4cccbc1
LP
519 r = cg_all_unified();
520 if (r < 0)
521 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
522 if (r > 0) {
a8725a06
ZJS
523 /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
524 * routine only detects 231, so we'll have a false negative here for 230. */
7e6821ed 525 r = systemd_installation_has_version(directory, "230");
a8725a06
ZJS
526 if (r < 0)
527 return log_error_errno(r, "Failed to determine systemd version in container: %m");
528 if (r > 0)
529 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
530 else
531 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
c22800e4 532 } else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
2977724b 533 /* Mixed cgroup hierarchy support was added in 233 */
7e6821ed 534 r = systemd_installation_has_version(directory, "233");
0fd9563f
ZJS
535 if (r < 0)
536 return log_error_errno(r, "Failed to determine systemd version in container: %m");
537 if (r > 0)
538 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
539 else
540 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
541 } else
5da38d07 542 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237 543
8199d554
LP
544 log_debug("Using %s hierarchy for container.",
545 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_NONE ? "legacy" :
546 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_SYSTEMD ? "hybrid" : "unified");
547
efdb0237
LP
548 return 0;
549}
550
8a99bd0c
ZJS
551static int parse_capability_spec(const char *spec, uint64_t *ret_mask) {
552 uint64_t mask = 0;
553 int r;
554
555 for (;;) {
556 _cleanup_free_ char *t = NULL;
557
558 r = extract_first_word(&spec, &t, ",", 0);
559 if (r < 0)
560 return log_error_errno(r, "Failed to parse capability %s.", t);
561 if (r == 0)
562 break;
563
564 if (streq(t, "help")) {
565 for (int i = 0; i < capability_list_length(); i++) {
566 const char *name;
567
568 name = capability_to_name(i);
569 if (name)
570 puts(name);
571 }
572
573 return 0; /* quit */
574 }
575
576 if (streq(t, "all"))
f5fbe71d 577 mask = UINT64_MAX;
8a99bd0c
ZJS
578 else {
579 r = capability_from_name(t);
580 if (r < 0)
581 return log_error_errno(r, "Failed to parse capability %s.", t);
582
583 mask |= 1ULL << r;
584 }
585 }
586
587 *ret_mask = mask;
588 return 1; /* continue */
589}
590
49048684 591static int parse_share_ns_env(const char *name, unsigned long ns_flag) {
0c582db0
LB
592 int r;
593
594 r = getenv_bool(name);
595 if (r == -ENXIO)
49048684 596 return 0;
0c582db0 597 if (r < 0)
49048684 598 return log_error_errno(r, "Failed to parse $%s: %m", name);
de40a303 599
0c582db0 600 arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
de40a303 601 arg_settings_mask |= SETTING_CLONE_NS_FLAGS;
49048684 602 return 0;
0c582db0
LB
603}
604
49048684 605static int parse_mount_settings_env(void) {
4f086aab 606 const char *e;
1099ceeb
LP
607 int r;
608
609 r = getenv_bool("SYSTEMD_NSPAWN_TMPFS_TMP");
49048684
ZJS
610 if (r < 0 && r != -ENXIO)
611 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_TMPFS_TMP: %m");
1099ceeb
LP
612 if (r >= 0)
613 SET_FLAG(arg_mount_settings, MOUNT_APPLY_TMPFS_TMP, r > 0);
4f086aab
SU
614
615 e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
49048684 616 if (streq_ptr(e, "network"))
4f086aab 617 arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS;
4f086aab 618
49048684
ZJS
619 else if (e) {
620 r = parse_boolean(e);
621 if (r < 0)
622 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_API_VFS_WRITABLE: %m");
623
624 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_RO, r == 0);
625 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_NETNS, false);
ab8ee0f2 626 }
4f086aab 627
49048684 628 return 0;
4f086aab
SU
629}
630
49048684 631static int parse_environment(void) {
d5455d2f
LP
632 const char *e;
633 int r;
634
49048684
ZJS
635 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
636 if (r < 0)
637 return r;
638 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
639 if (r < 0)
640 return r;
641 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
642 if (r < 0)
643 return r;
644 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
645 if (r < 0)
646 return r;
d5455d2f 647
49048684
ZJS
648 r = parse_mount_settings_env();
649 if (r < 0)
650 return r;
d5455d2f 651
489fae52
ZJS
652 /* SYSTEMD_NSPAWN_USE_CGNS=0 can be used to disable CLONE_NEWCGROUP use,
653 * even if it is supported. If not supported, it has no effect. */
de40a303 654 if (!cg_ns_supported())
489fae52 655 arg_use_cgns = false;
de40a303
LP
656 else {
657 r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
658 if (r < 0) {
659 if (r != -ENXIO)
49048684 660 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_USE_CGNS: %m");
de40a303
LP
661
662 arg_use_cgns = true;
663 } else {
664 arg_use_cgns = r > 0;
665 arg_settings_mask |= SETTING_USE_CGNS;
666 }
667 }
d5455d2f
LP
668
669 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
670 if (e)
671 arg_container_service_name = e;
672
813dbff4
RC
673 e = getenv("SYSTEMD_NSPAWN_NETWORK_MAC");
674 if (e) {
675 r = parse_ether_addr(e, &arg_network_provided_mac);
676 if (r < 0)
677 return log_error_errno(r, "Failed to parse provided MAC address via environment variable");
678 }
679
4a4654e0
LP
680 r = getenv_bool("SYSTEMD_SUPPRESS_SYNC");
681 if (r >= 0)
682 arg_suppress_sync = r;
683 else if (r != -ENXIO)
684 log_debug_errno(r, "Failed to parse $SYSTEMD_SUPPRESS_SYNC, ignoring: %m");
685
49048684 686 return detect_unified_cgroup_hierarchy_from_environment();
d5455d2f
LP
687}
688
88213476 689static int parse_argv(int argc, char *argv[]) {
a41fe3a2 690 enum {
acbeb427
ZJS
691 ARG_VERSION = 0x100,
692 ARG_PRIVATE_NETWORK,
bc2f673e 693 ARG_UUID,
5076f0cc 694 ARG_READ_ONLY,
57fb9fb5 695 ARG_CAPABILITY,
88fc9c9b 696 ARG_AMBIENT_CAPABILITY,
420c7379 697 ARG_DROP_CAPABILITY,
17fe0523
LP
698 ARG_LINK_JOURNAL,
699 ARG_BIND,
f4889f65 700 ARG_BIND_RO,
06c17c39 701 ARG_TMPFS,
5a8af538
LP
702 ARG_OVERLAY,
703 ARG_OVERLAY_RO,
de40a303 704 ARG_INACCESSIBLE,
eb91eb18 705 ARG_SHARE_SYSTEM,
89f7c846 706 ARG_REGISTER,
aa28aefe 707 ARG_KEEP_UNIT,
69c79d3c 708 ARG_NETWORK_INTERFACE,
c74e630d 709 ARG_NETWORK_MACVLAN,
4bbfe7ad 710 ARG_NETWORK_IPVLAN,
ab046dde 711 ARG_NETWORK_BRIDGE,
22b28dfd 712 ARG_NETWORK_ZONE,
f6d6bad1 713 ARG_NETWORK_VETH_EXTRA,
d7bea6b6 714 ARG_NETWORK_NAMESPACE_PATH,
6afc95b7 715 ARG_PERSONALITY,
4d9f07b4 716 ARG_VOLATILE,
ec16945e 717 ARG_TEMPLATE,
f36933fe 718 ARG_PROPERTY,
6dac160c 719 ARG_PRIVATE_USERS,
c6c8f6e2 720 ARG_KILL_SIGNAL,
f757855e 721 ARG_SETTINGS,
5f932eb9 722 ARG_CHDIR,
b53ede69 723 ARG_PIVOT_ROOT,
7336138e 724 ARG_PRIVATE_USERS_CHOWN,
6c045a99 725 ARG_PRIVATE_USERS_OWNERSHIP,
9c1e04d0 726 ARG_NOTIFY_READY,
4623e8e6 727 ARG_ROOT_HASH,
89e62e0b
LP
728 ARG_ROOT_HASH_SIG,
729 ARG_VERITY_DATA,
960e4569 730 ARG_SYSTEM_CALL_FILTER,
bf428efb 731 ARG_RLIMIT,
3a9530e5 732 ARG_HOSTNAME,
66edd963 733 ARG_NO_NEW_PRIVILEGES,
81f345df 734 ARG_OOM_SCORE_ADJUST,
d107bb7d 735 ARG_CPU_AFFINITY,
09d423e9 736 ARG_RESOLV_CONF,
1688841f 737 ARG_TIMEZONE,
de40a303
LP
738 ARG_CONSOLE,
739 ARG_PIPE,
740 ARG_OCI_BUNDLE,
bb068de0 741 ARG_NO_PAGER,
3652872a
LP
742 ARG_SET_CREDENTIAL,
743 ARG_LOAD_CREDENTIAL,
2f893044 744 ARG_BIND_USER,
4a4654e0 745 ARG_SUPPRESS_SYNC,
84be0c71 746 ARG_IMAGE_POLICY,
a41fe3a2
LP
747 };
748
88213476 749 static const struct option options[] = {
d7bea6b6
DP
750 { "help", no_argument, NULL, 'h' },
751 { "version", no_argument, NULL, ARG_VERSION },
752 { "directory", required_argument, NULL, 'D' },
753 { "template", required_argument, NULL, ARG_TEMPLATE },
754 { "ephemeral", no_argument, NULL, 'x' },
755 { "user", required_argument, NULL, 'u' },
756 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
757 { "as-pid2", no_argument, NULL, 'a' },
758 { "boot", no_argument, NULL, 'b' },
759 { "uuid", required_argument, NULL, ARG_UUID },
760 { "read-only", no_argument, NULL, ARG_READ_ONLY },
761 { "capability", required_argument, NULL, ARG_CAPABILITY },
88fc9c9b 762 { "ambient-capability", required_argument, NULL, ARG_AMBIENT_CAPABILITY },
d7bea6b6 763 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
66edd963 764 { "no-new-privileges", required_argument, NULL, ARG_NO_NEW_PRIVILEGES },
d7bea6b6
DP
765 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
766 { "bind", required_argument, NULL, ARG_BIND },
767 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
768 { "tmpfs", required_argument, NULL, ARG_TMPFS },
769 { "overlay", required_argument, NULL, ARG_OVERLAY },
770 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
de40a303 771 { "inaccessible", required_argument, NULL, ARG_INACCESSIBLE },
d7bea6b6 772 { "machine", required_argument, NULL, 'M' },
3a9530e5 773 { "hostname", required_argument, NULL, ARG_HOSTNAME },
d7bea6b6
DP
774 { "slice", required_argument, NULL, 'S' },
775 { "setenv", required_argument, NULL, 'E' },
776 { "selinux-context", required_argument, NULL, 'Z' },
777 { "selinux-apifs-context", required_argument, NULL, 'L' },
778 { "quiet", no_argument, NULL, 'q' },
779 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */
780 { "register", required_argument, NULL, ARG_REGISTER },
781 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
782 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
783 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
784 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
785 { "network-veth", no_argument, NULL, 'n' },
786 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA },
787 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
788 { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
789 { "network-namespace-path", required_argument, NULL, ARG_NETWORK_NAMESPACE_PATH },
790 { "personality", required_argument, NULL, ARG_PERSONALITY },
791 { "image", required_argument, NULL, 'i' },
792 { "volatile", optional_argument, NULL, ARG_VOLATILE },
793 { "port", required_argument, NULL, 'p' },
794 { "property", required_argument, NULL, ARG_PROPERTY },
795 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
6c045a99
LP
796 { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN }, /* obsolete */
797 { "private-users-ownership",required_argument, NULL, ARG_PRIVATE_USERS_OWNERSHIP},
d7bea6b6
DP
798 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
799 { "settings", required_argument, NULL, ARG_SETTINGS },
800 { "chdir", required_argument, NULL, ARG_CHDIR },
801 { "pivot-root", required_argument, NULL, ARG_PIVOT_ROOT },
802 { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
803 { "root-hash", required_argument, NULL, ARG_ROOT_HASH },
89e62e0b
LP
804 { "root-hash-sig", required_argument, NULL, ARG_ROOT_HASH_SIG },
805 { "verity-data", required_argument, NULL, ARG_VERITY_DATA },
d7bea6b6 806 { "system-call-filter", required_argument, NULL, ARG_SYSTEM_CALL_FILTER },
bf428efb 807 { "rlimit", required_argument, NULL, ARG_RLIMIT },
81f345df 808 { "oom-score-adjust", required_argument, NULL, ARG_OOM_SCORE_ADJUST },
d107bb7d 809 { "cpu-affinity", required_argument, NULL, ARG_CPU_AFFINITY },
09d423e9 810 { "resolv-conf", required_argument, NULL, ARG_RESOLV_CONF },
1688841f 811 { "timezone", required_argument, NULL, ARG_TIMEZONE },
de40a303
LP
812 { "console", required_argument, NULL, ARG_CONSOLE },
813 { "pipe", no_argument, NULL, ARG_PIPE },
814 { "oci-bundle", required_argument, NULL, ARG_OCI_BUNDLE },
bb068de0 815 { "no-pager", no_argument, NULL, ARG_NO_PAGER },
3652872a
LP
816 { "set-credential", required_argument, NULL, ARG_SET_CREDENTIAL },
817 { "load-credential", required_argument, NULL, ARG_LOAD_CREDENTIAL },
2f893044 818 { "bind-user", required_argument, NULL, ARG_BIND_USER },
4a4654e0 819 { "suppress-sync", required_argument, NULL, ARG_SUPPRESS_SYNC },
84be0c71 820 { "image-policy", required_argument, NULL, ARG_IMAGE_POLICY },
eb9da376 821 {}
88213476
LP
822 };
823
9444b1f2 824 int c, r;
a42c8b54 825 uint64_t plus = 0, minus = 0;
f757855e 826 bool mask_all_settings = false, mask_no_settings = false;
88213476
LP
827
828 assert(argc >= 0);
829 assert(argv);
830
ef9c12b1
YW
831 /* Resetting to 0 forces the invocation of an internal initialization routine of getopt_long()
832 * that checks for GNU extensions in optstring ('-' or '+' at the beginning). */
833 optind = 0;
de40a303 834 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nUE:P", options, NULL)) >= 0)
88213476
LP
835 switch (c) {
836
837 case 'h':
37ec0fdd 838 return help();
88213476 839
acbeb427 840 case ARG_VERSION:
3f6fd1ba 841 return version();
acbeb427 842
88213476 843 case 'D':
614b022c 844 r = parse_path_argument(optarg, false, &arg_directory);
ec16945e 845 if (r < 0)
0f03c2a4 846 return r;
de40a303
LP
847
848 arg_settings_mask |= SETTING_DIRECTORY;
ec16945e
LP
849 break;
850
851 case ARG_TEMPLATE:
614b022c 852 r = parse_path_argument(optarg, false, &arg_template);
ec16945e 853 if (r < 0)
0f03c2a4 854 return r;
de40a303
LP
855
856 arg_settings_mask |= SETTING_DIRECTORY;
88213476
LP
857 break;
858
1b9e5b12 859 case 'i':
614b022c 860 r = parse_path_argument(optarg, false, &arg_image);
ec16945e 861 if (r < 0)
0f03c2a4 862 return r;
de40a303
LP
863
864 arg_settings_mask |= SETTING_DIRECTORY;
865 break;
866
867 case ARG_OCI_BUNDLE:
614b022c 868 r = parse_path_argument(optarg, false, &arg_oci_bundle);
de40a303
LP
869 if (r < 0)
870 return r;
871
ec16945e
LP
872 break;
873
874 case 'x':
875 arg_ephemeral = true;
a2f577fc 876 arg_settings_mask |= SETTING_EPHEMERAL;
1b9e5b12
LP
877 break;
878
687d0825 879 case 'u':
2fc09a9c
DM
880 r = free_and_strdup(&arg_user, optarg);
881 if (r < 0)
7027ff61 882 return log_oom();
687d0825 883
f757855e 884 arg_settings_mask |= SETTING_USER;
687d0825
MV
885 break;
886
22b28dfd 887 case ARG_NETWORK_ZONE: {
fee9f7b5 888 _cleanup_free_ char *j = NULL;
22b28dfd 889
b910cc72 890 j = strjoin("vz-", optarg);
22b28dfd
LP
891 if (!j)
892 return log_oom();
893
fee9f7b5
FS
894 if (!ifname_valid(j))
895 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
896 "Network zone name not valid: %s", j);
22b28dfd 897
df1fac6d 898 free_and_replace(arg_network_zone, j);
22b28dfd
LP
899
900 arg_network_veth = true;
901 arg_private_network = true;
902 arg_settings_mask |= SETTING_NETWORK;
903 break;
904 }
905
ab046dde 906 case ARG_NETWORK_BRIDGE:
ef76dff2 907
baaa35ad
ZJS
908 if (!ifname_valid(optarg))
909 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
910 "Bridge interface name not valid: %s", optarg);
ef76dff2 911
f757855e
LP
912 r = free_and_strdup(&arg_network_bridge, optarg);
913 if (r < 0)
914 return log_oom();
ab046dde 915
4831981d 916 _fallthrough_;
0dfaa006 917 case 'n':
69c79d3c
LP
918 arg_network_veth = true;
919 arg_private_network = true;
f757855e 920 arg_settings_mask |= SETTING_NETWORK;
69c79d3c
LP
921 break;
922
f6d6bad1
LP
923 case ARG_NETWORK_VETH_EXTRA:
924 r = veth_extra_parse(&arg_network_veth_extra, optarg);
925 if (r < 0)
926 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
927
928 arg_private_network = true;
929 arg_settings_mask |= SETTING_NETWORK;
930 break;
931
aa28aefe 932 case ARG_NETWORK_INTERFACE:
2f091b1b 933 r = interface_pair_parse(&arg_network_interfaces, optarg);
b390f178
DDM
934 if (r < 0)
935 return r;
936
c74e630d 937 arg_private_network = true;
f757855e 938 arg_settings_mask |= SETTING_NETWORK;
c74e630d
LP
939 break;
940
941 case ARG_NETWORK_MACVLAN:
2f091b1b 942 r = macvlan_pair_parse(&arg_network_macvlan, optarg);
b390f178
DDM
943 if (r < 0)
944 return r;
945
4bbfe7ad 946 arg_private_network = true;
f757855e 947 arg_settings_mask |= SETTING_NETWORK;
4bbfe7ad
TG
948 break;
949
950 case ARG_NETWORK_IPVLAN:
2f091b1b 951 r = ipvlan_pair_parse(&arg_network_ipvlan, optarg);
b390f178
DDM
952 if (r < 0)
953 return r;
954
4831981d 955 _fallthrough_;
ff01d048
LP
956 case ARG_PRIVATE_NETWORK:
957 arg_private_network = true;
f757855e 958 arg_settings_mask |= SETTING_NETWORK;
a41fe3a2
LP
959 break;
960
d7bea6b6 961 case ARG_NETWORK_NAMESPACE_PATH:
614b022c 962 r = parse_path_argument(optarg, false, &arg_network_namespace_path);
d7bea6b6
DP
963 if (r < 0)
964 return r;
965
de40a303 966 arg_settings_mask |= SETTING_NETWORK;
d7bea6b6
DP
967 break;
968
0f0dbc46 969 case 'b':
baaa35ad
ZJS
970 if (arg_start_mode == START_PID2)
971 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
972 "--boot and --as-pid2 may not be combined.");
7732f92b
LP
973
974 arg_start_mode = START_BOOT;
975 arg_settings_mask |= SETTING_START_MODE;
976 break;
977
978 case 'a':
baaa35ad
ZJS
979 if (arg_start_mode == START_BOOT)
980 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
981 "--boot and --as-pid2 may not be combined.");
7732f92b
LP
982
983 arg_start_mode = START_PID2;
984 arg_settings_mask |= SETTING_START_MODE;
0f0dbc46
LP
985 break;
986
144f0fc0 987 case ARG_UUID:
aea3f594
ZJS
988 r = id128_from_string_nonzero(optarg, &arg_uuid);
989 if (r == -ENXIO)
baaa35ad
ZJS
990 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
991 "Machine UUID may not be all zeroes.");
aea3f594
ZJS
992 if (r < 0)
993 return log_error_errno(r, "Invalid UUID: %s", optarg);
f757855e
LP
994
995 arg_settings_mask |= SETTING_MACHINE_ID;
9444b1f2 996 break;
aa96c6cb 997
43c3fb46
LP
998 case 'S': {
999 _cleanup_free_ char *mangled = NULL;
1000
1001 r = unit_name_mangle_with_suffix(optarg, NULL, UNIT_NAME_MANGLE_WARN, ".slice", &mangled);
de40a303
LP
1002 if (r < 0)
1003 return log_oom();
1004
43c3fb46 1005 free_and_replace(arg_slice, mangled);
de40a303 1006 arg_settings_mask |= SETTING_SLICE;
144f0fc0 1007 break;
43c3fb46 1008 }
144f0fc0 1009
7027ff61 1010 case 'M':
c1521918 1011 if (isempty(optarg))
97b11eed 1012 arg_machine = mfree(arg_machine);
c1521918 1013 else {
52ef5dd7 1014 if (!hostname_is_valid(optarg, 0))
baaa35ad
ZJS
1015 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1016 "Invalid machine name: %s", optarg);
7027ff61 1017
0c3c4284
LP
1018 r = free_and_strdup(&arg_machine, optarg);
1019 if (r < 0)
eb91eb18 1020 return log_oom();
eb91eb18 1021 }
9ce6d1b3 1022 break;
7027ff61 1023
3a9530e5
LP
1024 case ARG_HOSTNAME:
1025 if (isempty(optarg))
1026 arg_hostname = mfree(arg_hostname);
1027 else {
52ef5dd7 1028 if (!hostname_is_valid(optarg, 0))
baaa35ad
ZJS
1029 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1030 "Invalid hostname: %s", optarg);
3a9530e5
LP
1031
1032 r = free_and_strdup(&arg_hostname, optarg);
1033 if (r < 0)
1034 return log_oom();
1035 }
1036
1037 arg_settings_mask |= SETTING_HOSTNAME;
1038 break;
1039
82adf6af
LP
1040 case 'Z':
1041 arg_selinux_context = optarg;
a8828ed9
DW
1042 break;
1043
82adf6af
LP
1044 case 'L':
1045 arg_selinux_apifs_context = optarg;
a8828ed9
DW
1046 break;
1047
bc2f673e
LP
1048 case ARG_READ_ONLY:
1049 arg_read_only = true;
f757855e 1050 arg_settings_mask |= SETTING_READ_ONLY;
bc2f673e
LP
1051 break;
1052
88fc9c9b
TH
1053 case ARG_AMBIENT_CAPABILITY: {
1054 uint64_t m;
1055 r = parse_capability_spec(optarg, &m);
1056 if (r <= 0)
1057 return r;
1058 arg_caps_ambient |= m;
1059 arg_settings_mask |= SETTING_CAPABILITY;
1060 break;
1061 }
420c7379
LP
1062 case ARG_CAPABILITY:
1063 case ARG_DROP_CAPABILITY: {
8a99bd0c
ZJS
1064 uint64_t m;
1065 r = parse_capability_spec(optarg, &m);
1066 if (r <= 0)
1067 return r;
5076f0cc 1068
8a99bd0c
ZJS
1069 if (c == ARG_CAPABILITY)
1070 plus |= m;
1071 else
1072 minus |= m;
f757855e 1073 arg_settings_mask |= SETTING_CAPABILITY;
5076f0cc
LP
1074 break;
1075 }
66edd963
LP
1076 case ARG_NO_NEW_PRIVILEGES:
1077 r = parse_boolean(optarg);
1078 if (r < 0)
1079 return log_error_errno(r, "Failed to parse --no-new-privileges= argument: %s", optarg);
1080
1081 arg_no_new_privileges = r;
1082 arg_settings_mask |= SETTING_NO_NEW_PRIVILEGES;
1083 break;
1084
57fb9fb5
LP
1085 case 'j':
1086 arg_link_journal = LINK_GUEST;
574edc90 1087 arg_link_journal_try = true;
4e1d6aa9 1088 arg_settings_mask |= SETTING_LINK_JOURNAL;
57fb9fb5
LP
1089 break;
1090
1091 case ARG_LINK_JOURNAL:
4e1d6aa9 1092 r = parse_link_journal(optarg, &arg_link_journal, &arg_link_journal_try);
c6147113
LP
1093 if (r < 0)
1094 return log_error_errno(r, "Failed to parse link journal mode %s", optarg);
57fb9fb5 1095
4e1d6aa9 1096 arg_settings_mask |= SETTING_LINK_JOURNAL;
57fb9fb5
LP
1097 break;
1098
17fe0523 1099 case ARG_BIND:
f757855e
LP
1100 case ARG_BIND_RO:
1101 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
1102 if (r < 0)
1103 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
17fe0523 1104
f757855e 1105 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
17fe0523 1106 break;
06c17c39 1107
f757855e
LP
1108 case ARG_TMPFS:
1109 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
1110 if (r < 0)
1111 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
5a8af538 1112
f757855e 1113 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
5a8af538 1114 break;
5a8af538
LP
1115
1116 case ARG_OVERLAY:
ad85779a
LP
1117 case ARG_OVERLAY_RO:
1118 r = overlay_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_OVERLAY_RO);
1119 if (r == -EADDRNOTAVAIL)
1120 return log_error_errno(r, "--overlay(-ro)= needs at least two colon-separated directories specified.");
1121 if (r < 0)
1122 return log_error_errno(r, "Failed to parse --overlay(-ro)= argument %s: %m", optarg);
06c17c39 1123
f757855e 1124 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
06c17c39 1125 break;
06c17c39 1126
de40a303
LP
1127 case ARG_INACCESSIBLE:
1128 r = inaccessible_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
1129 if (r < 0)
1130 return log_error_errno(r, "Failed to parse --inaccessible= argument %s: %m", optarg);
1131
1132 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1133 break;
1134
0d2a0179
ZJS
1135 case 'E':
1136 r = strv_env_replace_strdup_passthrough(&arg_setenv, optarg);
aaf057c4 1137 if (r < 0)
0d2a0179 1138 return log_error_errno(r, "Cannot assign environment variable %s: %m", optarg);
f4889f65 1139
f757855e 1140 arg_settings_mask |= SETTING_ENVIRONMENT;
f4889f65 1141 break;
f4889f65 1142
284c0b91
LP
1143 case 'q':
1144 arg_quiet = true;
1145 break;
1146
8a96d94e 1147 case ARG_SHARE_SYSTEM:
a6b5216c 1148 /* We don't officially support this anymore, except for compat reasons. People should use the
0c582db0 1149 * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
63d1c29f 1150 log_warning("Please do not use --share-system anymore, use $SYSTEMD_NSPAWN_SHARE_* instead.");
0c582db0 1151 arg_clone_ns_flags = 0;
8a96d94e
LP
1152 break;
1153
eb91eb18
LP
1154 case ARG_REGISTER:
1155 r = parse_boolean(optarg);
1156 if (r < 0) {
1157 log_error("Failed to parse --register= argument: %s", optarg);
1158 return r;
1159 }
1160
1161 arg_register = r;
1162 break;
1163
89f7c846
LP
1164 case ARG_KEEP_UNIT:
1165 arg_keep_unit = true;
1166 break;
1167
6afc95b7
LP
1168 case ARG_PERSONALITY:
1169
ac45f971 1170 arg_personality = personality_from_string(optarg);
baaa35ad
ZJS
1171 if (arg_personality == PERSONALITY_INVALID)
1172 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1173 "Unknown or unsupported personality '%s'.", optarg);
6afc95b7 1174
f757855e 1175 arg_settings_mask |= SETTING_PERSONALITY;
6afc95b7
LP
1176 break;
1177
4d9f07b4
LP
1178 case ARG_VOLATILE:
1179
1180 if (!optarg)
f757855e 1181 arg_volatile_mode = VOLATILE_YES;
5c828e66
LP
1182 else if (streq(optarg, "help")) {
1183 DUMP_STRING_TABLE(volatile_mode, VolatileMode, _VOLATILE_MODE_MAX);
1184 return 0;
1185 } else {
f757855e 1186 VolatileMode m;
4d9f07b4 1187
f757855e 1188 m = volatile_mode_from_string(optarg);
baaa35ad
ZJS
1189 if (m < 0)
1190 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1191 "Failed to parse --volatile= argument: %s", optarg);
1192 else
f757855e 1193 arg_volatile_mode = m;
6d0b55c2
LP
1194 }
1195
f757855e
LP
1196 arg_settings_mask |= SETTING_VOLATILE_MODE;
1197 break;
6d0b55c2 1198
f757855e
LP
1199 case 'p':
1200 r = expose_port_parse(&arg_expose_ports, optarg);
1201 if (r == -EEXIST)
1202 return log_error_errno(r, "Duplicate port specification: %s", optarg);
1203 if (r < 0)
1204 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
6d0b55c2 1205
f757855e 1206 arg_settings_mask |= SETTING_EXPOSE_PORTS;
6d0b55c2 1207 break;
6d0b55c2 1208
f36933fe
LP
1209 case ARG_PROPERTY:
1210 if (strv_extend(&arg_property, optarg) < 0)
1211 return log_oom();
1212
1213 break;
1214
ae209204 1215 case ARG_PRIVATE_USERS: {
33eac552 1216 int boolean;
0de7acce 1217
ae209204
ZJS
1218 if (!optarg)
1219 boolean = true;
1220 else if (!in_charset(optarg, DIGITS))
1221 /* do *not* parse numbers as booleans */
1222 boolean = parse_boolean(optarg);
33eac552
LP
1223 else
1224 boolean = -1;
ae209204 1225
33eac552 1226 if (boolean == 0) {
0de7acce
LP
1227 /* no: User namespacing off */
1228 arg_userns_mode = USER_NAMESPACE_NO;
1229 arg_uid_shift = UID_INVALID;
1230 arg_uid_range = UINT32_C(0x10000);
33eac552 1231 } else if (boolean > 0) {
0de7acce
LP
1232 /* yes: User namespacing on, UID range is read from root dir */
1233 arg_userns_mode = USER_NAMESPACE_FIXED;
1234 arg_uid_shift = UID_INVALID;
1235 arg_uid_range = UINT32_C(0x10000);
1236 } else if (streq(optarg, "pick")) {
1237 /* pick: User namespacing on, UID range is picked randomly */
6c045a99
LP
1238 arg_userns_mode = USER_NAMESPACE_PICK; /* Note that arg_userns_ownership is
1239 * implied by USER_NAMESPACE_PICK
33eac552 1240 * further down. */
0de7acce
LP
1241 arg_uid_shift = UID_INVALID;
1242 arg_uid_range = UINT32_C(0x10000);
33eac552
LP
1243
1244 } else if (streq(optarg, "identity")) {
6c2d70ce 1245 /* identity: User namespacing on, the 0…0xFFFF UID range is mapped to
33eac552
LP
1246 * itself, i.e. we don't actually map anything, but do take benefit of
1247 * isolation of capability sets. */
1248 arg_userns_mode = USER_NAMESPACE_FIXED;
1249 arg_uid_shift = 0;
1250 arg_uid_range = UINT32_C(0x10000);
0de7acce 1251 } else {
6c2058b3 1252 _cleanup_free_ char *buffer = NULL;
6dac160c
LP
1253 const char *range, *shift;
1254
0de7acce
LP
1255 /* anything else: User namespacing on, UID range is explicitly configured */
1256
6dac160c
LP
1257 range = strchr(optarg, ':');
1258 if (range) {
6c2058b3
ZJS
1259 buffer = strndup(optarg, range - optarg);
1260 if (!buffer)
1261 return log_oom();
1262 shift = buffer;
6dac160c
LP
1263
1264 range++;
bfd292ec
ZJS
1265 r = safe_atou32(range, &arg_uid_range);
1266 if (r < 0)
be715731 1267 return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range);
6dac160c
LP
1268 } else
1269 shift = optarg;
1270
be715731
ZJS
1271 r = parse_uid(shift, &arg_uid_shift);
1272 if (r < 0)
1273 return log_error_errno(r, "Failed to parse UID \"%s\": %m", optarg);
0de7acce
LP
1274
1275 arg_userns_mode = USER_NAMESPACE_FIXED;
6dac160c 1276
58e13de5
LP
1277 if (!userns_shift_range_valid(arg_uid_shift, arg_uid_range))
1278 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "UID range cannot be empty or go beyond " UID_FMT ".", UID_INVALID);
1279 }
be715731 1280
0de7acce 1281 arg_settings_mask |= SETTING_USERNS;
6dac160c 1282 break;
ae209204 1283 }
6dac160c 1284
0de7acce 1285 case 'U':
ccabee0d 1286 if (userns_supported()) {
6c045a99
LP
1287 arg_userns_mode = USER_NAMESPACE_PICK; /* Note that arg_userns_ownership is
1288 * implied by USER_NAMESPACE_PICK
33eac552 1289 * further down. */
ccabee0d
LP
1290 arg_uid_shift = UID_INVALID;
1291 arg_uid_range = UINT32_C(0x10000);
1292
1293 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
1294 }
1295
7336138e
LP
1296 break;
1297
0de7acce 1298 case ARG_PRIVATE_USERS_CHOWN:
6c045a99
LP
1299 arg_userns_ownership = USER_NAMESPACE_OWNERSHIP_CHOWN;
1300
1301 arg_settings_mask |= SETTING_USERNS;
1302 break;
1303
1304 case ARG_PRIVATE_USERS_OWNERSHIP:
1305 if (streq(optarg, "help")) {
1306 DUMP_STRING_TABLE(user_namespace_ownership, UserNamespaceOwnership, _USER_NAMESPACE_OWNERSHIP_MAX);
1307 return 0;
1308 }
1309
1310 arg_userns_ownership = user_namespace_ownership_from_string(optarg);
1311 if (arg_userns_ownership < 0)
1312 return log_error_errno(arg_userns_ownership, "Cannot parse --user-namespace-ownership= value: %s", optarg);
0de7acce
LP
1313
1314 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
1315 break;
1316
c6c8f6e2 1317 case ARG_KILL_SIGNAL:
5c828e66
LP
1318 if (streq(optarg, "help")) {
1319 DUMP_STRING_TABLE(signal, int, _NSIG);
1320 return 0;
1321 }
1322
29a3db75 1323 arg_kill_signal = signal_from_string(optarg);
baaa35ad 1324 if (arg_kill_signal < 0)
7211c853 1325 return log_error_errno(arg_kill_signal, "Cannot parse signal: %s", optarg);
c6c8f6e2 1326
f757855e
LP
1327 arg_settings_mask |= SETTING_KILL_SIGNAL;
1328 break;
1329
1330 case ARG_SETTINGS:
1331
1332 /* no → do not read files
1333 * yes → read files, do not override cmdline, trust only subset
1334 * override → read files, override cmdline, trust only subset
1335 * trusted → read files, do not override cmdline, trust all
1336 */
1337
1338 r = parse_boolean(optarg);
1339 if (r < 0) {
1340 if (streq(optarg, "trusted")) {
1341 mask_all_settings = false;
1342 mask_no_settings = false;
1343 arg_settings_trusted = true;
1344
1345 } else if (streq(optarg, "override")) {
1346 mask_all_settings = false;
1347 mask_no_settings = true;
1348 arg_settings_trusted = -1;
1349 } else
1350 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
1351 } else if (r > 0) {
1352 /* yes */
1353 mask_all_settings = false;
1354 mask_no_settings = false;
1355 arg_settings_trusted = -1;
1356 } else {
1357 /* no */
1358 mask_all_settings = true;
1359 mask_no_settings = false;
1360 arg_settings_trusted = false;
1361 }
1362
c6c8f6e2
LP
1363 break;
1364
5f932eb9 1365 case ARG_CHDIR:
baaa35ad
ZJS
1366 if (!path_is_absolute(optarg))
1367 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1368 "Working directory %s is not an absolute path.", optarg);
5f932eb9
LP
1369
1370 r = free_and_strdup(&arg_chdir, optarg);
1371 if (r < 0)
1372 return log_oom();
1373
1374 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
1375 break;
1376
b53ede69
PW
1377 case ARG_PIVOT_ROOT:
1378 r = pivot_root_parse(&arg_pivot_root_new, &arg_pivot_root_old, optarg);
1379 if (r < 0)
1380 return log_error_errno(r, "Failed to parse --pivot-root= argument %s: %m", optarg);
1381
1382 arg_settings_mask |= SETTING_PIVOT_ROOT;
1383 break;
1384
9c1e04d0
AP
1385 case ARG_NOTIFY_READY:
1386 r = parse_boolean(optarg);
baaa35ad
ZJS
1387 if (r < 0)
1388 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1389 "%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
9c1e04d0
AP
1390 arg_notify_ready = r;
1391 arg_settings_mask |= SETTING_NOTIFY_READY;
1392 break;
1393
4623e8e6 1394 case ARG_ROOT_HASH: {
89e62e0b 1395 _cleanup_free_ void *k = NULL;
4623e8e6
LP
1396 size_t l;
1397
1398 r = unhexmem(optarg, strlen(optarg), &k, &l);
1399 if (r < 0)
1400 return log_error_errno(r, "Failed to parse root hash: %s", optarg);
89e62e0b 1401 if (l < sizeof(sd_id128_t))
da890466 1402 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Root hash must be at least 128-bit long: %s", optarg);
4623e8e6 1403
89e62e0b
LP
1404 free_and_replace(arg_verity_settings.root_hash, k);
1405 arg_verity_settings.root_hash_size = l;
4623e8e6
LP
1406 break;
1407 }
1408
c2923fdc
LB
1409 case ARG_ROOT_HASH_SIG: {
1410 char *value;
89e62e0b
LP
1411 size_t l;
1412 void *p;
c2923fdc
LB
1413
1414 if ((value = startswith(optarg, "base64:"))) {
c2923fdc
LB
1415 r = unbase64mem(value, strlen(value), &p, &l);
1416 if (r < 0)
1417 return log_error_errno(r, "Failed to parse root hash signature '%s': %m", optarg);
1418
c2923fdc 1419 } else {
89e62e0b 1420 r = read_full_file(optarg, (char**) &p, &l);
c2923fdc 1421 if (r < 0)
89e62e0b 1422 return log_error_errno(r, "Failed parse root hash signature file '%s': %m", optarg);
c2923fdc
LB
1423 }
1424
89e62e0b
LP
1425 free_and_replace(arg_verity_settings.root_hash_sig, p);
1426 arg_verity_settings.root_hash_sig_size = l;
c2923fdc
LB
1427 break;
1428 }
1429
89e62e0b 1430 case ARG_VERITY_DATA:
614b022c 1431 r = parse_path_argument(optarg, false, &arg_verity_settings.data_path);
89e62e0b
LP
1432 if (r < 0)
1433 return r;
1434 break;
1435
960e4569
LP
1436 case ARG_SYSTEM_CALL_FILTER: {
1437 bool negative;
1438 const char *items;
1439
1440 negative = optarg[0] == '~';
1441 items = negative ? optarg + 1 : optarg;
1442
1443 for (;;) {
1444 _cleanup_free_ char *word = NULL;
1445
1446 r = extract_first_word(&items, &word, NULL, 0);
1447 if (r == 0)
1448 break;
1449 if (r == -ENOMEM)
1450 return log_oom();
1451 if (r < 0)
1452 return log_error_errno(r, "Failed to parse system call filter: %m");
1453
1454 if (negative)
6b000af4 1455 r = strv_extend(&arg_syscall_deny_list, word);
960e4569 1456 else
6b000af4 1457 r = strv_extend(&arg_syscall_allow_list, word);
960e4569
LP
1458 if (r < 0)
1459 return log_oom();
1460 }
1461
1462 arg_settings_mask |= SETTING_SYSCALL_FILTER;
1463 break;
1464 }
1465
bf428efb
LP
1466 case ARG_RLIMIT: {
1467 const char *eq;
622ecfa8 1468 _cleanup_free_ char *name = NULL;
bf428efb
LP
1469 int rl;
1470
5c828e66
LP
1471 if (streq(optarg, "help")) {
1472 DUMP_STRING_TABLE(rlimit, int, _RLIMIT_MAX);
1473 return 0;
1474 }
1475
bf428efb 1476 eq = strchr(optarg, '=');
baaa35ad
ZJS
1477 if (!eq)
1478 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1479 "--rlimit= expects an '=' assignment.");
bf428efb
LP
1480
1481 name = strndup(optarg, eq - optarg);
1482 if (!name)
1483 return log_oom();
1484
1485 rl = rlimit_from_string_harder(name);
baaa35ad 1486 if (rl < 0)
7211c853 1487 return log_error_errno(rl, "Unknown resource limit: %s", name);
bf428efb
LP
1488
1489 if (!arg_rlimit[rl]) {
1490 arg_rlimit[rl] = new0(struct rlimit, 1);
1491 if (!arg_rlimit[rl])
1492 return log_oom();
1493 }
1494
1495 r = rlimit_parse(rl, eq + 1, arg_rlimit[rl]);
1496 if (r < 0)
1497 return log_error_errno(r, "Failed to parse resource limit: %s", eq + 1);
1498
1499 arg_settings_mask |= SETTING_RLIMIT_FIRST << rl;
1500 break;
1501 }
1502
81f345df
LP
1503 case ARG_OOM_SCORE_ADJUST:
1504 r = parse_oom_score_adjust(optarg, &arg_oom_score_adjust);
1505 if (r < 0)
1506 return log_error_errno(r, "Failed to parse --oom-score-adjust= parameter: %s", optarg);
1507
1508 arg_oom_score_adjust_set = true;
1509 arg_settings_mask |= SETTING_OOM_SCORE_ADJUST;
1510 break;
1511
d107bb7d 1512 case ARG_CPU_AFFINITY: {
0985c7c4 1513 CPUSet cpuset;
d107bb7d
LP
1514
1515 r = parse_cpu_set(optarg, &cpuset);
1516 if (r < 0)
0985c7c4 1517 return log_error_errno(r, "Failed to parse CPU affinity mask %s: %m", optarg);
d107bb7d 1518
0985c7c4
ZJS
1519 cpu_set_reset(&arg_cpu_set);
1520 arg_cpu_set = cpuset;
d107bb7d
LP
1521 arg_settings_mask |= SETTING_CPU_AFFINITY;
1522 break;
1523 }
1524
09d423e9
LP
1525 case ARG_RESOLV_CONF:
1526 if (streq(optarg, "help")) {
1527 DUMP_STRING_TABLE(resolv_conf_mode, ResolvConfMode, _RESOLV_CONF_MODE_MAX);
1528 return 0;
1529 }
1530
1531 arg_resolv_conf = resolv_conf_mode_from_string(optarg);
baaa35ad 1532 if (arg_resolv_conf < 0)
7211c853 1533 return log_error_errno(arg_resolv_conf,
baaa35ad 1534 "Failed to parse /etc/resolv.conf mode: %s", optarg);
09d423e9
LP
1535
1536 arg_settings_mask |= SETTING_RESOLV_CONF;
1537 break;
1538
1688841f
LP
1539 case ARG_TIMEZONE:
1540 if (streq(optarg, "help")) {
1541 DUMP_STRING_TABLE(timezone_mode, TimezoneMode, _TIMEZONE_MODE_MAX);
1542 return 0;
1543 }
1544
1545 arg_timezone = timezone_mode_from_string(optarg);
baaa35ad 1546 if (arg_timezone < 0)
7211c853 1547 return log_error_errno(arg_timezone,
baaa35ad 1548 "Failed to parse /etc/localtime mode: %s", optarg);
1688841f
LP
1549
1550 arg_settings_mask |= SETTING_TIMEZONE;
1551 break;
1552
de40a303 1553 case ARG_CONSOLE:
dce66ffe
ZJS
1554 r = handle_arg_console(optarg);
1555 if (r <= 0)
1556 return r;
de40a303
LP
1557 break;
1558
1559 case 'P':
1560 case ARG_PIPE:
dce66ffe
ZJS
1561 r = handle_arg_console("pipe");
1562 if (r <= 0)
1563 return r;
de40a303
LP
1564 break;
1565
bb068de0
ZJS
1566 case ARG_NO_PAGER:
1567 arg_pager_flags |= PAGER_DISABLE;
1568 break;
1569
e8ac916e
SL
1570 case ARG_SET_CREDENTIAL:
1571 r = machine_credential_set(&arg_credentials, &arg_n_credentials, optarg);
3652872a
LP
1572 if (r == -ENOMEM)
1573 return log_oom();
1574 if (r < 0)
e8ac916e 1575 return log_error_errno(r, "Failed to set credential from %s: %m", optarg);
3652872a
LP
1576 arg_settings_mask |= SETTING_CREDENTIALS;
1577 break;
3652872a 1578
e8ac916e
SL
1579 case ARG_LOAD_CREDENTIAL:
1580 r = machine_credential_load(&arg_credentials, &arg_n_credentials, optarg);
3652872a
LP
1581 if (r == -ENOMEM)
1582 return log_oom();
1583 if (r < 0)
e8ac916e 1584 return log_error_errno(r, "Failed to load credential from %s: %m", optarg);
3652872a
LP
1585
1586 arg_settings_mask |= SETTING_CREDENTIALS;
1587 break;
3652872a 1588
2f893044
LP
1589 case ARG_BIND_USER:
1590 if (!valid_user_group_name(optarg, 0))
1591 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid user name to bind: %s", optarg);
1592
1593 if (strv_extend(&arg_bind_user, optarg) < 0)
1594 return log_oom();
1595
1596 arg_settings_mask |= SETTING_BIND_USER;
1597 break;
1598
4a4654e0
LP
1599 case ARG_SUPPRESS_SYNC:
1600 r = parse_boolean_argument("--suppress-sync=", optarg, &arg_suppress_sync);
1601 if (r < 0)
1602 return r;
1603
1604 arg_settings_mask |= SETTING_SUPPRESS_SYNC;
1605 break;
1606
06e78680
YW
1607 case ARG_IMAGE_POLICY:
1608 r = parse_image_policy_argument(optarg, &arg_image_policy);
84be0c71 1609 if (r < 0)
06e78680 1610 return r;
84be0c71 1611 break;
84be0c71 1612
88213476
LP
1613 case '?':
1614 return -EINVAL;
1615
1616 default:
04499a70 1617 assert_not_reached();
88213476 1618 }
88213476 1619
60f1ec13
LP
1620 if (argc > optind) {
1621 strv_free(arg_parameters);
1622 arg_parameters = strv_copy(argv + optind);
1623 if (!arg_parameters)
1624 return log_oom();
d7bea6b6 1625
60f1ec13
LP
1626 arg_settings_mask |= SETTING_START_MODE;
1627 }
1628
1629 if (arg_ephemeral && arg_template && !arg_directory)
1630 /* User asked for ephemeral execution but specified --template= instead of --directory=. Semantically
1631 * such an invocation makes some sense, see https://github.com/systemd/systemd/issues/3667. Let's
1632 * accept this here, and silently make "--ephemeral --template=" equivalent to "--ephemeral
1633 * --directory=". */
1634 arg_directory = TAKE_PTR(arg_template);
1635
2642d22a
DDM
1636 arg_caps_retain |= plus;
1637 arg_caps_retain |= arg_private_network ? UINT64_C(1) << CAP_NET_ADMIN : 0;
1638
1639 /* If we're not unsharing the network namespace and are unsharing the user namespace, we won't have
1640 * permissions to bind ports in the container, so let's drop the CAP_NET_BIND_SERVICE capability to
1641 * indicate that. */
1642 if (!arg_private_network && arg_userns_mode != USER_NAMESPACE_NO && arg_uid_shift > 0)
1643 arg_caps_retain &= ~(UINT64_C(1) << CAP_NET_BIND_SERVICE);
1644
1645 arg_caps_retain &= ~minus;
60f1ec13 1646
de40a303 1647 /* Make sure to parse environment before we reset the settings mask below */
49048684
ZJS
1648 r = parse_environment();
1649 if (r < 0)
1650 return r;
de40a303 1651
60f1ec13
LP
1652 /* Load all settings from .nspawn files */
1653 if (mask_no_settings)
1654 arg_settings_mask = 0;
1655
1656 /* Don't load any settings from .nspawn files */
1657 if (mask_all_settings)
1658 arg_settings_mask = _SETTINGS_MASK_ALL;
1659
1660 return 1;
1661}
1662
1663static int verify_arguments(void) {
1664 int r;
a6b5216c 1665
75b0d8b8
ZJS
1666 if (arg_start_mode == START_PID2 && arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
1667 /* If we are running the stub init in the container, we don't need to look at what the init
1668 * in the container supports, because we are not using it. Let's immediately pick the right
1669 * setting based on the host system configuration.
1670 *
1671 * We only do this, if the user didn't use an environment variable to override the detection.
1672 */
1673
1674 r = cg_all_unified();
1675 if (r < 0)
1676 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
1677 if (r > 0)
1678 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
1679 else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0)
1680 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
1681 else
1682 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
1683 }
1684
4f086aab
SU
1685 if (arg_userns_mode != USER_NAMESPACE_NO)
1686 arg_mount_settings |= MOUNT_USE_USERNS;
1687
1688 if (arg_private_network)
1689 arg_mount_settings |= MOUNT_APPLY_APIVFS_NETNS;
1690
48a8d337
LB
1691 if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
1692 !(arg_clone_ns_flags & CLONE_NEWUTS)) {
eb91eb18 1693 arg_register = false;
baaa35ad 1694 if (arg_start_mode != START_PID1)
60f1ec13 1695 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--boot cannot be used without namespacing.");
0c582db0 1696 }
eb91eb18 1697
6c045a99
LP
1698 if (arg_userns_ownership < 0)
1699 arg_userns_ownership =
f61c7f88 1700 arg_userns_mode == USER_NAMESPACE_PICK ? USER_NAMESPACE_OWNERSHIP_AUTO :
6c045a99 1701 USER_NAMESPACE_OWNERSHIP_OFF;
0e7ac751 1702
60f1ec13
LP
1703 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
1704 arg_kill_signal = SIGRTMIN+3;
1705
e5a4bb0d
LP
1706 if (arg_volatile_mode != VOLATILE_NO) /* Make sure all file systems contained in the image are mounted read-only if we are in volatile mode */
1707 arg_read_only = true;
1708
2436ea76
DDM
1709 if (has_custom_root_mount(arg_custom_mounts, arg_n_custom_mounts))
1710 arg_read_only = true;
1711
baaa35ad 1712 if (arg_keep_unit && arg_register && cg_pid_get_owner_uid(0, NULL) >= 0)
8d9c2bca
AJ
1713 /* Save the user from accidentally registering either user-$SESSION.scope or user@.service.
1714 * The latter is not technically a user session, but we don't need to labour the point. */
60f1ec13 1715 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--keep-unit --register=yes may not be used when invoked from a user session.");
89f7c846 1716
baaa35ad 1717 if (arg_directory && arg_image)
60f1ec13 1718 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--directory= and --image= may not be combined.");
1b9e5b12 1719
baaa35ad 1720 if (arg_template && arg_image)
60f1ec13 1721 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= and --image= may not be combined.");
8cd328d8 1722
baaa35ad 1723 if (arg_template && !(arg_directory || arg_machine))
60f1ec13 1724 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= needs --directory= or --machine=.");
ec16945e 1725
baaa35ad 1726 if (arg_ephemeral && arg_template)
60f1ec13 1727 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --template= may not be combined.");
ec16945e 1728
baaa35ad 1729 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO))
60f1ec13 1730 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --link-journal= may not be combined.");
df9a75e4 1731
baaa35ad 1732 if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported())
60f1ec13 1733 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--private-users= is not supported, kernel compiled without user namespace support.");
7336138e 1734
6c045a99 1735 if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_CHOWN && arg_read_only)
de40a303 1736 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
6c045a99 1737 "--read-only and --private-users-ownership=chown may not be combined.");
f757855e 1738
6c045a99
LP
1739 /* We don't support --private-users-ownership=chown together with any of the volatile modes since we
1740 * couldn't change the read-only part of the tree (i.e. /usr) anyway, or because it would trigger a
1741 * massive copy-up (in case of overlay) making the entire exercise pointless. */
1742 if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_CHOWN && arg_volatile_mode != VOLATILE_NO)
1743 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--volatile= and --private-users-ownership=chown may not be combined.");
e5a4bb0d 1744
679ecd36
SZ
1745 /* If --network-namespace-path is given with any other network-related option (except --private-network),
1746 * we need to error out, to avoid conflicts between different network options. */
60f1ec13
LP
1747 if (arg_network_namespace_path &&
1748 (arg_network_interfaces || arg_network_macvlan ||
1749 arg_network_ipvlan || arg_network_veth_extra ||
1750 arg_network_bridge || arg_network_zone ||
679ecd36 1751 arg_network_veth))
de40a303 1752 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--network-namespace-path= cannot be combined with other network options.");
86c0dd4a 1753
60f1ec13 1754 if (arg_network_bridge && arg_network_zone)
de40a303
LP
1755 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1756 "--network-bridge= and --network-zone= may not be combined.");
f757855e 1757
baaa35ad 1758 if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network)
60f1ec13 1759 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
4f086aab 1760
baaa35ad 1761 if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO))
60f1ec13 1762 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot combine --private-users with read-write mounts.");
f757855e 1763
baaa35ad 1764 if (arg_expose_ports && !arg_private_network)
60f1ec13 1765 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot use --port= without private networking.");
6d0b55c2 1766
88fc9c9b 1767 if (arg_caps_ambient) {
f5fbe71d 1768 if (arg_caps_ambient == UINT64_MAX)
88fc9c9b
TH
1769 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= does not support the value all.");
1770
1771 if ((arg_caps_ambient & arg_caps_retain) != arg_caps_ambient)
1772 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= setting is not fully covered by Capability= setting.");
1773
1774 if (arg_start_mode == START_BOOT)
1775 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= setting is not useful for boot mode.");
1776 }
1777
2f893044
LP
1778 if (arg_userns_mode == USER_NAMESPACE_NO && !strv_isempty(arg_bind_user))
1779 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--bind-user= requires --private-users");
1780
1781 /* Drop duplicate --bind-user= entries */
1782 strv_uniq(arg_bind_user);
1783
60f1ec13
LP
1784 r = custom_mount_check_all();
1785 if (r < 0)
1786 return r;
c6c8f6e2 1787
f757855e 1788 return 0;
88213476
LP
1789}
1790
2f091b1b
TM
1791static int verify_network_interfaces_initialized(void) {
1792 int r;
1793 r = test_network_interfaces_initialized(arg_network_interfaces);
1794 if (r < 0)
1795 return r;
1796
1797 r = test_network_interfaces_initialized(arg_network_macvlan);
1798 if (r < 0)
1799 return r;
1800
1801 r = test_network_interfaces_initialized(arg_network_ipvlan);
1802 if (r < 0)
1803 return r;
1804
1805 return 0;
1806}
1807
91181e07 1808int userns_lchown(const char *p, uid_t uid, gid_t gid) {
03cfe0d5
LP
1809 assert(p);
1810
0de7acce 1811 if (arg_userns_mode == USER_NAMESPACE_NO)
03cfe0d5
LP
1812 return 0;
1813
1814 if (uid == UID_INVALID && gid == GID_INVALID)
1815 return 0;
1816
1817 if (uid != UID_INVALID) {
1818 uid += arg_uid_shift;
1819
1820 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1821 return -EOVERFLOW;
1822 }
1823
1824 if (gid != GID_INVALID) {
1825 gid += (gid_t) arg_uid_shift;
1826
1827 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1828 return -EOVERFLOW;
1829 }
1830
7c248223 1831 return RET_NERRNO(lchown(p, uid, gid));
b12afc8c
LP
1832}
1833
/* Creates directory 'path' below 'root' with the given mode and hands it to userns_lchown() for the
 * ownership adjustment. A directory that already exists is left untouched (no chown) and reported as
 * success. Returns 0 on success, a negative errno otherwise. */
int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *target;
        int r;

        target = prefix_roota(root, path);

        r = RET_NERRNO(mkdir(target, mode));
        if (r < 0)
                return r == -EEXIST ? 0 : r;

        return userns_lchown(target, uid, gid);
}
1847
/* Matches 'path' against the two prefixes an /etc/localtime symlink is expected to carry and returns
 * whatever PATH_STARTSWITH_SET() yields for it. Callers in this file treat NULL as "does not point into
 * the zoneinfo directory" and use a non-NULL result as the timezone name. */
static const char *timezone_from_path(const char *path) {
        const char *zone;

        zone = PATH_STARTSWITH_SET(
                        path,
                        "../usr/share/zoneinfo/",
                        "/usr/share/zoneinfo/");

        return zone;
}
1854
83205269
LP
1855static bool etc_writable(void) {
1856 return !arg_read_only || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_OVERLAY);
1857}
1858
e58a1277 1859static int setup_timezone(const char *dest) {
1688841f
LP
1860 _cleanup_free_ char *p = NULL, *etc = NULL;
1861 const char *where, *check;
1862 TimezoneMode m;
d4036145 1863 int r;
f8440af5 1864
e58a1277
LP
1865 assert(dest);
1866
1688841f 1867 if (IN_SET(arg_timezone, TIMEZONE_AUTO, TIMEZONE_SYMLINK)) {
1688841f
LP
1868 r = readlink_malloc("/etc/localtime", &p);
1869 if (r == -ENOENT && arg_timezone == TIMEZONE_AUTO)
83205269 1870 m = etc_writable() ? TIMEZONE_DELETE : TIMEZONE_OFF;
1688841f 1871 else if (r == -EINVAL && arg_timezone == TIMEZONE_AUTO) /* regular file? */
83205269 1872 m = etc_writable() ? TIMEZONE_COPY : TIMEZONE_BIND;
1688841f
LP
1873 else if (r < 0) {
1874 log_warning_errno(r, "Failed to read host's /etc/localtime symlink, not updating container timezone: %m");
1875 /* To handle warning, delete /etc/localtime and replace it with a symbolic link to a time zone data
1876 * file.
1877 *
1878 * Example:
1879 * ln -s /usr/share/zoneinfo/UTC /etc/localtime
1880 */
1881 return 0;
1882 } else if (arg_timezone == TIMEZONE_AUTO)
83205269 1883 m = etc_writable() ? TIMEZONE_SYMLINK : TIMEZONE_BIND;
1688841f
LP
1884 else
1885 m = arg_timezone;
1886 } else
1887 m = arg_timezone;
1888
1889 if (m == TIMEZONE_OFF)
1890 return 0;
1891
f461a28d 1892 r = chase("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
d4036145 1893 if (r < 0) {
1688841f 1894 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
d4036145
LP
1895 return 0;
1896 }
1897
1688841f
LP
1898 where = strjoina(etc, "/localtime");
1899
1900 switch (m) {
1901
1902 case TIMEZONE_DELETE:
1903 if (unlink(where) < 0)
1904 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
1905
d4036145 1906 return 0;
d4036145 1907
1688841f
LP
1908 case TIMEZONE_SYMLINK: {
1909 _cleanup_free_ char *q = NULL;
1910 const char *z, *what;
4d1c38b8 1911
1688841f
LP
1912 z = timezone_from_path(p);
1913 if (!z) {
1914 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
d4036145 1915 return 0;
1688841f 1916 }
d4036145 1917
1688841f
LP
1918 r = readlink_malloc(where, &q);
1919 if (r >= 0 && streq_ptr(timezone_from_path(q), z))
1920 return 0; /* Already pointing to the right place? Then do nothing .. */
1921
1922 check = strjoina(dest, "/usr/share/zoneinfo/", z);
f461a28d 1923 r = chase(check, dest, 0, NULL, NULL);
1688841f
LP
1924 if (r < 0)
1925 log_debug_errno(r, "Timezone %s does not exist (or is not accessible) in container, not creating symlink: %m", z);
1926 else {
1927 if (unlink(where) < 0 && errno != ENOENT) {
1928 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING, /* Don't complain on read-only images */
1929 errno, "Failed to remove existing timezone info %s in container, ignoring: %m", where);
1930 return 0;
1931 }
1932
1933 what = strjoina("../usr/share/zoneinfo/", z);
1934 if (symlink(what, where) < 0) {
1935 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING,
1936 errno, "Failed to correct timezone of container, ignoring: %m");
1937 return 0;
1938 }
1939
1940 break;
1941 }
1942
1943 _fallthrough_;
d4036145 1944 }
68fb0892 1945
1688841f
LP
1946 case TIMEZONE_BIND: {
1947 _cleanup_free_ char *resolved = NULL;
1948 int found;
1949
f461a28d 1950 found = chase(where, dest, CHASE_NONEXISTENT, &resolved, NULL);
1688841f
LP
1951 if (found < 0) {
1952 log_warning_errno(found, "Failed to resolve /etc/localtime path in container, ignoring: %m");
1953 return 0;
1954 }
1955
1956 if (found == 0) /* missing? */
1957 (void) touch(resolved);
1958
511a8cfe 1959 r = mount_nofollow_verbose(LOG_WARNING, "/etc/localtime", resolved, NULL, MS_BIND, NULL);
1688841f 1960 if (r >= 0)
511a8cfe 1961 return mount_nofollow_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
1688841f
LP
1962
1963 _fallthrough_;
79d80fc1 1964 }
4d9f07b4 1965
1688841f
LP
1966 case TIMEZONE_COPY:
1967 /* If mounting failed, try to copy */
7c2f5495 1968 r = copy_file_atomic("/etc/localtime", where, 0644, COPY_REFLINK|COPY_REPLACE);
1688841f
LP
1969 if (r < 0) {
1970 log_full_errno(IN_SET(r, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
1971 "Failed to copy /etc/localtime to %s, ignoring: %m", where);
1972 return 0;
1973 }
1974
1975 break;
1976
1977 default:
04499a70 1978 assert_not_reached();
d4036145 1979 }
e58a1277 1980
1688841f 1981 /* Fix permissions of the symlink or file copy we just created */
03cfe0d5
LP
1982 r = userns_lchown(where, 0, 0);
1983 if (r < 0)
1688841f 1984 log_warning_errno(r, "Failed to chown /etc/localtime, ignoring: %m");
03cfe0d5 1985
e58a1277 1986 return 0;
88213476
LP
1987}
1988
09d423e9
LP
/* Checks whether 'path' exists. Returns 1 if it does, 0 if access() fails with ENOENT, and the (negative)
 * result of log_debug_errno() for any other access() failure. */
static int have_resolv_conf(const char *path) {
        assert(path);

        if (access(path, F_OK) >= 0)
                return 1;

        if (errno == ENOENT)
                return 0;

        return log_debug_errno(errno, "Failed to determine whether '%s' is available: %m", path);
}
2001
7357272e 2002static int resolved_listening(void) {
b8ea7a6e 2003 _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
b053cd5f 2004 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
7357272e 2005 _cleanup_free_ char *dns_stub_listener_mode = NULL;
b053cd5f
LP
2006 int r;
2007
7357272e 2008 /* Check if resolved is listening */
b053cd5f
LP
2009
2010 r = sd_bus_open_system(&bus);
2011 if (r < 0)
b8ea7a6e 2012 return log_debug_errno(r, "Failed to open system bus: %m");
b053cd5f 2013
7357272e 2014 r = bus_name_has_owner(bus, "org.freedesktop.resolve1", NULL);
b8ea7a6e
LP
2015 if (r < 0)
2016 return log_debug_errno(r, "Failed to check whether the 'org.freedesktop.resolve1' bus name is taken: %m");
2017 if (r == 0)
2018 return 0;
7357272e 2019
7f8a85e6 2020 r = bus_get_property_string(bus, bus_resolve_mgr, "DNSStubListener", &error, &dns_stub_listener_mode);
7357272e 2021 if (r < 0)
b8ea7a6e 2022 return log_debug_errno(r, "Failed to query DNSStubListener property: %s", bus_error_message(&error, r));
7357272e
DM
2023
2024 return STR_IN_SET(dns_stub_listener_mode, "udp", "yes");
b053cd5f
LP
2025}
2026
2547bb41 2027static int setup_resolv_conf(const char *dest) {
09d423e9
LP
2028 _cleanup_free_ char *etc = NULL;
2029 const char *where, *what;
2030 ResolvConfMode m;
2031 int r;
2547bb41
LP
2032
2033 assert(dest);
2034
09d423e9
LP
2035 if (arg_resolv_conf == RESOLV_CONF_AUTO) {
2036 if (arg_private_network)
2037 m = RESOLV_CONF_OFF;
86775e35
LP
2038 else if (have_resolv_conf(PRIVATE_STUB_RESOLV_CONF) > 0 && resolved_listening() > 0)
2039 m = etc_writable() ? RESOLV_CONF_COPY_STUB : RESOLV_CONF_BIND_STUB;
09d423e9 2040 else if (have_resolv_conf("/etc/resolv.conf") > 0)
83205269 2041 m = etc_writable() ? RESOLV_CONF_COPY_HOST : RESOLV_CONF_BIND_HOST;
09d423e9 2042 else
83205269 2043 m = etc_writable() ? RESOLV_CONF_DELETE : RESOLV_CONF_OFF;
86775e35 2044
09d423e9
LP
2045 } else
2046 m = arg_resolv_conf;
2047
2048 if (m == RESOLV_CONF_OFF)
2547bb41
LP
2049 return 0;
2050
f461a28d 2051 r = chase("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
87447ae4
LP
2052 if (r < 0) {
2053 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
2054 return 0;
2055 }
2056
2057 where = strjoina(etc, "/resolv.conf");
09d423e9
LP
2058
2059 if (m == RESOLV_CONF_DELETE) {
2060 if (unlink(where) < 0)
2061 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
2062
87447ae4
LP
2063 return 0;
2064 }
79d80fc1 2065
86775e35
LP
2066 if (IN_SET(m, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_REPLACE_STATIC, RESOLV_CONF_COPY_STATIC))
2067 what = PRIVATE_STATIC_RESOLV_CONF;
2068 else if (IN_SET(m, RESOLV_CONF_BIND_UPLINK, RESOLV_CONF_REPLACE_UPLINK, RESOLV_CONF_COPY_UPLINK))
2069 what = PRIVATE_UPLINK_RESOLV_CONF;
2070 else if (IN_SET(m, RESOLV_CONF_BIND_STUB, RESOLV_CONF_REPLACE_STUB, RESOLV_CONF_COPY_STUB))
2071 what = PRIVATE_STUB_RESOLV_CONF;
09d423e9
LP
2072 else
2073 what = "/etc/resolv.conf";
87447ae4 2074
86775e35 2075 if (IN_SET(m, RESOLV_CONF_BIND_HOST, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_BIND_UPLINK, RESOLV_CONF_BIND_STUB)) {
09d423e9
LP
2076 _cleanup_free_ char *resolved = NULL;
2077 int found;
2078
d404c8d8 2079 found = chase(where, dest, CHASE_NONEXISTENT|CHASE_NOFOLLOW, &resolved, NULL);
09d423e9
LP
2080 if (found < 0) {
2081 log_warning_errno(found, "Failed to resolve /etc/resolv.conf path in container, ignoring: %m");
2082 return 0;
2083 }
3539724c 2084
87447ae4
LP
2085 if (found == 0) /* missing? */
2086 (void) touch(resolved);
5367354d 2087
511a8cfe 2088 r = mount_nofollow_verbose(LOG_WARNING, what, resolved, NULL, MS_BIND, NULL);
60e76d48 2089 if (r >= 0)
511a8cfe 2090 return mount_nofollow_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
86775e35
LP
2091
2092 /* If that didn't work, let's copy the file */
3539724c
LP
2093 }
2094
86775e35 2095 if (IN_SET(m, RESOLV_CONF_REPLACE_HOST, RESOLV_CONF_REPLACE_STATIC, RESOLV_CONF_REPLACE_UPLINK, RESOLV_CONF_REPLACE_STUB))
7c2f5495 2096 r = copy_file_atomic(what, where, 0644, COPY_REFLINK|COPY_REPLACE);
86775e35 2097 else
7c2f5495 2098 r = copy_file(what, where, O_TRUNC|O_NOFOLLOW, 0644, COPY_REFLINK);
79d80fc1 2099 if (r < 0) {
3539724c
LP
2100 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
2101 * resolved or something similar runs inside and the symlink points there.
68a313c5 2102 *
3539724c 2103 * If the disk image is read-only, there's also no point in complaining.
68a313c5 2104 */
86775e35
LP
2105 log_full_errno(!IN_SET(RESOLV_CONF_COPY_HOST, RESOLV_CONF_COPY_STATIC, RESOLV_CONF_COPY_UPLINK, RESOLV_CONF_COPY_STUB) &&
2106 IN_SET(r, -ELOOP, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
3539724c 2107 "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where);
79d80fc1
TG
2108 return 0;
2109 }
2547bb41 2110
03cfe0d5
LP
2111 r = userns_lchown(where, 0, 0);
2112 if (r < 0)
3539724c 2113 log_warning_errno(r, "Failed to chown /etc/resolv.conf, ignoring: %m");
03cfe0d5 2114
2547bb41
LP
2115 return 0;
2116}
2117
1e4f1671 2118static int setup_boot_id(void) {
cdde6ba6
LP
2119 _cleanup_(unlink_and_freep) char *from = NULL;
2120 _cleanup_free_ char *path = NULL;
3bbaff3e 2121 sd_id128_t rnd = SD_ID128_NULL;
cdde6ba6 2122 const char *to;
04bc4a3f
LP
2123 int r;
2124
1eacc470 2125 /* Generate a new randomized boot ID, so that each boot-up of the container gets a new one */
04bc4a3f 2126
1eacc470 2127 r = tempfn_random_child("/run", "proc-sys-kernel-random-boot-id", &path);
cdde6ba6
LP
2128 if (r < 0)
2129 return log_error_errno(r, "Failed to generate random boot ID path: %m");
04bc4a3f
LP
2130
2131 r = sd_id128_randomize(&rnd);
f647962d
MS
2132 if (r < 0)
2133 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 2134
b40c8ebd 2135 r = id128_write(path, ID128_FORMAT_UUID, rnd);
f647962d
MS
2136 if (r < 0)
2137 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f 2138
cdde6ba6
LP
2139 from = TAKE_PTR(path);
2140 to = "/proc/sys/kernel/random/boot_id";
2141
511a8cfe 2142 r = mount_nofollow_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
cdde6ba6
LP
2143 if (r < 0)
2144 return r;
04bc4a3f 2145
511a8cfe 2146 return mount_nofollow_verbose(LOG_ERR, NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
04bc4a3f
LP
2147}
2148
e58a1277 2149static int copy_devnodes(const char *dest) {
88213476
LP
2150 static const char devnodes[] =
2151 "null\0"
2152 "zero\0"
2153 "full\0"
2154 "random\0"
2155 "urandom\0"
85614d66
TG
2156 "tty\0"
2157 "net/tun\0";
88213476 2158
e58a1277 2159 int r = 0;
a258bf26
LP
2160
2161 assert(dest);
124640f1 2162
52f05ef2 2163 BLOCK_WITH_UMASK(0000);
88213476 2164
03cfe0d5
LP
2165 /* Create /dev/net, so that we can create /dev/net/tun in it */
2166 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
2167 return log_error_errno(r, "Failed to create /dev/net directory: %m");
2168
88213476 2169 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 2170 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 2171 struct stat st;
88213476 2172
c6134d3e 2173 from = path_join("/dev/", d);
8967f291
LP
2174 if (!from)
2175 return log_oom();
2176
c6134d3e 2177 to = path_join(dest, from);
8967f291
LP
2178 if (!to)
2179 return log_oom();
88213476
LP
2180
2181 if (stat(from, &st) < 0) {
2182
4a62c710
MS
2183 if (errno != ENOENT)
2184 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 2185
baaa35ad
ZJS
2186 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode))
2187 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2188 "%s is not a char or block device, cannot copy.", from);
2189 else {
8dfce114
LP
2190 _cleanup_free_ char *sl = NULL, *prefixed = NULL, *dn = NULL, *t = NULL;
2191
81f5049b 2192 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
8dbf71ec 2193 /* Explicitly warn the user when /dev is already populated. */
41eb4362 2194 if (errno == EEXIST)
8dbf71ec 2195 log_notice("%s/dev is pre-mounted and pre-populated. If a pre-mounted /dev is provided it needs to be an unpopulated file system.", dest);
81f5049b
AC
2196 if (errno != EPERM)
2197 return log_error_errno(errno, "mknod(%s) failed: %m", to);
2198
8dfce114 2199 /* Some systems abusively restrict mknod but allow bind mounts. */
81f5049b
AC
2200 r = touch(to);
2201 if (r < 0)
2202 return log_error_errno(r, "touch (%s) failed: %m", to);
511a8cfe 2203 r = mount_nofollow_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL);
60e76d48
ZJS
2204 if (r < 0)
2205 return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to);
81f5049b 2206 }
6278cf60 2207
03cfe0d5
LP
2208 r = userns_lchown(to, 0, 0);
2209 if (r < 0)
2210 return log_error_errno(r, "chown() of device node %s failed: %m", to);
8dfce114 2211
657ee2d8 2212 dn = path_join("/dev", S_ISCHR(st.st_mode) ? "char" : "block");
8dfce114
LP
2213 if (!dn)
2214 return log_oom();
2215
2216 r = userns_mkdir(dest, dn, 0755, 0, 0);
2217 if (r < 0)
2218 return log_error_errno(r, "Failed to create '%s': %m", dn);
2219
2220 if (asprintf(&sl, "%s/%u:%u", dn, major(st.st_rdev), minor(st.st_rdev)) < 0)
2221 return log_oom();
2222
c6134d3e 2223 prefixed = path_join(dest, sl);
8dfce114
LP
2224 if (!prefixed)
2225 return log_oom();
2226
2d9b74ba 2227 t = path_join("..", d);
8dfce114
LP
2228 if (!t)
2229 return log_oom();
2230
2231 if (symlink(t, prefixed) < 0)
2232 log_debug_errno(errno, "Failed to symlink '%s' to '%s': %m", t, prefixed);
88213476 2233 }
88213476
LP
2234 }
2235
e58a1277
LP
2236 return r;
2237}
88213476 2238
de40a303 2239static int make_extra_nodes(const char *dest) {
de40a303
LP
2240 size_t i;
2241 int r;
2242
52f05ef2 2243 BLOCK_WITH_UMASK(0000);
de40a303
LP
2244
2245 for (i = 0; i < arg_n_extra_nodes; i++) {
2246 _cleanup_free_ char *path = NULL;
2247 DeviceNode *n = arg_extra_nodes + i;
2248
c6134d3e 2249 path = path_join(dest, n->path);
de40a303
LP
2250 if (!path)
2251 return log_oom();
2252
2253 if (mknod(path, n->mode, S_ISCHR(n->mode) || S_ISBLK(n->mode) ? makedev(n->major, n->minor) : 0) < 0)
2254 return log_error_errno(errno, "Failed to create device node '%s': %m", path);
2255
2256 r = chmod_and_chown(path, n->mode, n->uid, n->gid);
2257 if (r < 0)
2258 return log_error_errno(r, "Failed to adjust device node ownership of '%s': %m", path);
2259 }
2260
2261 return 0;
2262}
2263
03cfe0d5
LP
2264static int setup_pts(const char *dest) {
2265 _cleanup_free_ char *options = NULL;
2266 const char *p;
709f6e46 2267 int r;
03cfe0d5 2268
349cc4a5 2269#if HAVE_SELINUX
03cfe0d5
LP
2270 if (arg_selinux_apifs_context)
2271 (void) asprintf(&options,
3dce8915 2272 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
03cfe0d5
LP
2273 arg_uid_shift + TTY_GID,
2274 arg_selinux_apifs_context);
2275 else
2276#endif
2277 (void) asprintf(&options,
3dce8915 2278 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
03cfe0d5 2279 arg_uid_shift + TTY_GID);
f2d88580 2280
03cfe0d5 2281 if (!options)
f2d88580
LP
2282 return log_oom();
2283
03cfe0d5 2284 /* Mount /dev/pts itself */
cc9fce65 2285 p = prefix_roota(dest, "/dev/pts");
3f692e2e 2286 r = RET_NERRNO(mkdir(p, 0755));
dae8b82e
ZJS
2287 if (r < 0)
2288 return log_error_errno(r, "Failed to create /dev/pts: %m");
2289
511a8cfe 2290 r = mount_nofollow_verbose(LOG_ERR, "devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options);
60e76d48
ZJS
2291 if (r < 0)
2292 return r;
709f6e46
MS
2293 r = userns_lchown(p, 0, 0);
2294 if (r < 0)
2295 return log_error_errno(r, "Failed to chown /dev/pts: %m");
03cfe0d5
LP
2296
2297 /* Create /dev/ptmx symlink */
2298 p = prefix_roota(dest, "/dev/ptmx");
4a62c710
MS
2299 if (symlink("pts/ptmx", p) < 0)
2300 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
709f6e46
MS
2301 r = userns_lchown(p, 0, 0);
2302 if (r < 0)
2303 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
f2d88580 2304
03cfe0d5
LP
2305 /* And fix /dev/pts/ptmx ownership */
2306 p = prefix_roota(dest, "/dev/pts/ptmx");
709f6e46
MS
2307 r = userns_lchown(p, 0, 0);
2308 if (r < 0)
2309 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
6278cf60 2310
f2d88580
LP
2311 return 0;
2312}
2313
3acc84eb 2314static int setup_stdio_as_dev_console(void) {
5bb1d7fb 2315 _cleanup_close_ int terminal = -EBADF;
e58a1277 2316 int r;
e58a1277 2317
335d2ead
LP
2318 /* We open the TTY in O_NOCTTY mode, so that we do not become controller yet. We'll do that later
2319 * explicitly, if we are configured to. */
2320 terminal = open_terminal("/dev/console", O_RDWR|O_NOCTTY);
3acc84eb
FB
2321 if (terminal < 0)
2322 return log_error_errno(terminal, "Failed to open console: %m");
e58a1277 2323
3acc84eb
FB
2324 /* Make sure we can continue logging to the original stderr, even if
2325 * stderr points elsewhere now */
2326 r = log_dup_console();
2327 if (r < 0)
2328 return log_error_errno(r, "Failed to duplicate stderr: %m");
de40a303 2329
3acc84eb
FB
2330 /* invalidates 'terminal' on success and failure */
2331 r = rearrange_stdio(terminal, terminal, terminal);
2fef50cd 2332 TAKE_FD(terminal);
f647962d 2333 if (r < 0)
3acc84eb
FB
2334 return log_error_errno(r, "Failed to move console to stdin/stdout/stderr: %m");
2335
2336 return 0;
2337}
88213476 2338
3acc84eb
FB
2339static int setup_dev_console(const char *console) {
2340 _cleanup_free_ char *p = NULL;
2341 int r;
a258bf26 2342
3acc84eb
FB
2343 /* Create /dev/console symlink */
2344 r = path_make_relative("/dev", console, &p);
81f5049b 2345 if (r < 0)
3acc84eb
FB
2346 return log_error_errno(r, "Failed to create relative path: %m");
2347
2348 if (symlink(p, "/dev/console") < 0)
2349 return log_error_errno(errno, "Failed to create /dev/console symlink: %m");
a258bf26 2350
3acc84eb 2351 return 0;
e58a1277
LP
2352}
2353
8e5430c4
LP
2354static int setup_keyring(void) {
2355 key_serial_t keyring;
2356
6b000af4
LP
2357 /* Allocate a new session keyring for the container. This makes sure the keyring of the session
2358 * systemd-nspawn was invoked from doesn't leak into the container. Note that by default we block
2359 * keyctl() and request_key() anyway via seccomp so doing this operation isn't strictly necessary,
2360 * but in case people explicitly allow-list these system calls let's make sure we don't leak anything
2361 * into the container. */
8e5430c4
LP
2362
2363 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2364 if (keyring == -1) {
2365 if (errno == ENOSYS)
2366 log_debug_errno(errno, "Kernel keyring not supported, ignoring.");
065b4774 2367 else if (ERRNO_IS_PRIVILEGE(errno))
8e5430c4
LP
2368 log_debug_errno(errno, "Kernel keyring access prohibited, ignoring.");
2369 else
2370 return log_error_errno(errno, "Setting up kernel keyring failed: %m");
2371 }
2372
2373 return 0;
2374}
2375
3652872a
LP
2376static int setup_credentials(const char *root) {
2377 const char *q;
2378 int r;
2379
2380 if (arg_n_credentials <= 0)
2381 return 0;
2382
2383 r = userns_mkdir(root, "/run/host", 0755, 0, 0);
2384 if (r < 0)
2385 return log_error_errno(r, "Failed to create /run/host: %m");
2386
2387 r = userns_mkdir(root, "/run/host/credentials", 0700, 0, 0);
2388 if (r < 0)
2389 return log_error_errno(r, "Failed to create /run/host/credentials: %m");
2390
2391 q = prefix_roota(root, "/run/host/credentials");
511a8cfe 2392 r = mount_nofollow_verbose(LOG_ERR, NULL, q, "ramfs", MS_NOSUID|MS_NOEXEC|MS_NODEV, "mode=0700");
3652872a
LP
2393 if (r < 0)
2394 return r;
2395
2396 for (size_t i = 0; i < arg_n_credentials; i++) {
2397 _cleanup_free_ char *j = NULL;
254d1313 2398 _cleanup_close_ int fd = -EBADF;
3652872a
LP
2399
2400 j = path_join(q, arg_credentials[i].id);
2401 if (!j)
2402 return log_oom();
2403
2404 fd = open(j, O_CREAT|O_EXCL|O_WRONLY|O_CLOEXEC|O_NOFOLLOW, 0600);
2405 if (fd < 0)
2406 return log_error_errno(errno, "Failed to create credential file %s: %m", j);
2407
e22c60a9 2408 r = loop_write(fd, arg_credentials[i].data, arg_credentials[i].size);
3652872a
LP
2409 if (r < 0)
2410 return log_error_errno(r, "Failed to write credential to file %s: %m", j);
2411
2412 if (fchmod(fd, 0400) < 0)
2413 return log_error_errno(errno, "Failed to adjust access mode of %s: %m", j);
2414
2415 if (arg_userns_mode != USER_NAMESPACE_NO) {
2416 if (fchown(fd, arg_uid_shift, arg_uid_shift) < 0)
2417 return log_error_errno(errno, "Failed to adjust ownership of %s: %m", j);
2418 }
2419 }
2420
2421 if (chmod(q, 0500) < 0)
2422 return log_error_errno(errno, "Failed to adjust access mode of %s: %m", q);
2423
2424 r = userns_lchown(q, 0, 0);
2425 if (r < 0)
2426 return r;
2427
2428 /* Make both mount and superblock read-only now */
511a8cfe 2429 r = mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_REMOUNT|MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
3652872a
LP
2430 if (r < 0)
2431 return r;
2432
511a8cfe 2433 return mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, "mode=0500");
3652872a
LP
2434}
2435
5d9d3fcb 2436static int setup_kmsg(int fd_inner_socket) {
9ec5a93c
LP
2437 _cleanup_(unlink_and_freep) char *from = NULL;
2438 _cleanup_free_ char *fifo = NULL;
254d1313 2439 _cleanup_close_ int fd = -EBADF;
9ec5a93c 2440 int r;
e58a1277 2441
5d9d3fcb 2442 assert(fd_inner_socket >= 0);
a258bf26 2443
52f05ef2 2444 BLOCK_WITH_UMASK(0000);
a258bf26 2445
30fd9a2d 2446 /* We create the kmsg FIFO as a temporary file in /run, but immediately delete it after bind mounting it to
9ec5a93c
LP
2447 * /proc/kmsg. While FIFOs on the reading side behave very similar to /proc/kmsg, their writing side behaves
2448 * differently from /dev/kmsg in that writing blocks when nothing is reading. In order to avoid any problems
2449 * with containers deadlocking due to this we simply make /dev/kmsg unavailable to the container. */
2450
1eacc470 2451 r = tempfn_random_child("/run", "proc-kmsg", &fifo);
9ec5a93c
LP
2452 if (r < 0)
2453 return log_error_errno(r, "Failed to generate kmsg path: %m");
e58a1277 2454
9ec5a93c 2455 if (mkfifo(fifo, 0600) < 0)
03cfe0d5 2456 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
9ec5a93c
LP
2457
2458 from = TAKE_PTR(fifo);
9ec5a93c 2459
511a8cfe 2460 r = mount_nofollow_verbose(LOG_ERR, from, "/proc/kmsg", NULL, MS_BIND, NULL);
60e76d48
ZJS
2461 if (r < 0)
2462 return r;
e58a1277 2463
669fc4e5 2464 fd = open(from, O_RDWR|O_NONBLOCK|O_CLOEXEC);
4a62c710
MS
2465 if (fd < 0)
2466 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 2467
9ec5a93c 2468 /* Store away the fd in the socket, so that it stays open as long as we run the child */
5d9d3fcb 2469 r = send_one_fd(fd_inner_socket, fd, 0);
d9603714
DH
2470 if (r < 0)
2471 return log_error_errno(r, "Failed to send FIFO fd: %m");
a258bf26 2472
25ea79fe 2473 return 0;
88213476
LP
2474}
2475
761cf19d 2476struct ExposeArgs {
deff68e7
FW
2477 union in_addr_union address4;
2478 union in_addr_union address6;
761cf19d
FW
2479 struct FirewallContext *fw_ctx;
2480};
2481
1c4baffc 2482static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
99534007 2483 struct ExposeArgs *args = ASSERT_PTR(userdata);
6d0b55c2
LP
2484
2485 assert(rtnl);
2486 assert(m);
6d0b55c2 2487
fb9044cb
LP
2488 (void) expose_port_execute(rtnl, &args->fw_ctx, arg_expose_ports, AF_INET, &args->address4);
2489 (void) expose_port_execute(rtnl, &args->fw_ctx, arg_expose_ports, AF_INET6, &args->address6);
6d0b55c2
LP
2490 return 0;
2491}
2492
3a74cea5 2493static int setup_hostname(void) {
c818eef1 2494 int r;
3a74cea5 2495
0c582db0 2496 if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
eb91eb18
LP
2497 return 0;
2498
c818eef1
LP
2499 r = sethostname_idempotent(arg_hostname ?: arg_machine);
2500 if (r < 0)
2501 return log_error_errno(r, "Failed to set hostname: %m");
3a74cea5 2502
7027ff61 2503 return 0;
3a74cea5
LP
2504}
2505
57fb9fb5 2506static int setup_journal(const char *directory) {
0f5e1382 2507 _cleanup_free_ char *d = NULL;
5980d463 2508 const char *p, *q;
b2238e38 2509 sd_id128_t this_id;
8054d749 2510 bool try;
57fb9fb5
LP
2511 int r;
2512
df9a75e4
LP
2513 /* Don't link journals in ephemeral mode */
2514 if (arg_ephemeral)
2515 return 0;
2516
8054d749
LP
2517 if (arg_link_journal == LINK_NO)
2518 return 0;
2519
2520 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
2521
4d680aee 2522 r = sd_id128_get_machine(&this_id);
f647962d
MS
2523 if (r < 0)
2524 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee 2525
e01ff70a 2526 if (sd_id128_equal(arg_uuid, this_id)) {
8054d749 2527 log_full(try ? LOG_WARNING : LOG_ERR,
85b55869 2528 "Host and machine ids are equal (%s): refusing to link journals", SD_ID128_TO_STRING(arg_uuid));
8054d749 2529 if (try)
4d680aee 2530 return 0;
df9a75e4 2531 return -EEXIST;
4d680aee
ZJS
2532 }
2533
369ca6da
ZJS
2534 FOREACH_STRING(dirname, "/var", "/var/log", "/var/log/journal") {
2535 r = userns_mkdir(directory, dirname, 0755, 0, 0);
2536 if (r < 0) {
2537 bool ignore = r == -EROFS && try;
2538 log_full_errno(ignore ? LOG_DEBUG : LOG_ERR, r,
2539 "Failed to create %s%s: %m", dirname, ignore ? ", ignoring" : "");
2540 return ignore ? 0 : r;
2541 }
2542 }
03cfe0d5 2543
85b55869 2544 p = strjoina("/var/log/journal/", SD_ID128_TO_STRING(arg_uuid));
03cfe0d5 2545 q = prefix_roota(directory, p);
27407a01 2546
e1873695 2547 if (path_is_mount_point(p, NULL, 0) > 0) {
8054d749
LP
2548 if (try)
2549 return 0;
27407a01 2550
baaa35ad
ZJS
2551 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2552 "%s: already a mount point, refusing to use for journal", p);
57fb9fb5
LP
2553 }
2554
e1873695 2555 if (path_is_mount_point(q, NULL, 0) > 0) {
8054d749
LP
2556 if (try)
2557 return 0;
57fb9fb5 2558
baaa35ad
ZJS
2559 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2560 "%s: already a mount point, refusing to use for journal", q);
57fb9fb5
LP
2561 }
2562
2563 r = readlink_and_make_absolute(p, &d);
2564 if (r >= 0) {
3742095b 2565 if (IN_SET(arg_link_journal, LINK_GUEST, LINK_AUTO) &&
57fb9fb5
LP
2566 path_equal(d, q)) {
2567
03cfe0d5 2568 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 2569 if (r < 0)
709f6e46 2570 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 2571 return 0;
57fb9fb5
LP
2572 }
2573
4a62c710
MS
2574 if (unlink(p) < 0)
2575 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
2576 } else if (r == -EINVAL) {
2577
2578 if (arg_link_journal == LINK_GUEST &&
2579 rmdir(p) < 0) {
2580
27407a01
ZJS
2581 if (errno == ENOTDIR) {
2582 log_error("%s already exists and is neither a symlink nor a directory", p);
2583 return r;
4314d33f
MS
2584 } else
2585 return log_error_errno(errno, "Failed to remove %s: %m", p);
57fb9fb5 2586 }
4314d33f
MS
2587 } else if (r != -ENOENT)
2588 return log_error_errno(r, "readlink(%s) failed: %m", p);
57fb9fb5
LP
2589
2590 if (arg_link_journal == LINK_GUEST) {
2591
2592 if (symlink(q, p) < 0) {
8054d749 2593 if (try) {
56f64d95 2594 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90 2595 return 0;
4314d33f
MS
2596 } else
2597 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
57fb9fb5
LP
2598 }
2599
03cfe0d5 2600 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 2601 if (r < 0)
709f6e46 2602 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 2603 return 0;
57fb9fb5
LP
2604 }
2605
2606 if (arg_link_journal == LINK_HOST) {
ccddd104 2607 /* don't create parents here — if the host doesn't have
574edc90 2608 * permanent journal set up, don't force it here */
ba8e6c4d 2609
3f692e2e 2610 r = RET_NERRNO(mkdir(p, 0755));
dae8b82e 2611 if (r < 0 && r != -EEXIST) {
8054d749 2612 if (try) {
dae8b82e 2613 log_debug_errno(r, "Failed to create %s, skipping journal setup: %m", p);
574edc90 2614 return 0;
4314d33f 2615 } else
dae8b82e 2616 return log_error_errno(r, "Failed to create %s: %m", p);
57fb9fb5
LP
2617 }
2618
27407a01
ZJS
2619 } else if (access(p, F_OK) < 0)
2620 return 0;
57fb9fb5 2621
db55bbf2 2622 if (dir_is_empty(q, /* ignore_hidden_or_backup= */ false) == 0)
cdb2b9d0
LP
2623 log_warning("%s is not empty, proceeding anyway.", q);
2624
03cfe0d5 2625 r = userns_mkdir(directory, p, 0755, 0, 0);
709f6e46
MS
2626 if (r < 0)
2627 return log_error_errno(r, "Failed to create %s: %m", q);
57fb9fb5 2628
511a8cfe 2629 r = mount_nofollow_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL);
60e76d48 2630 if (r < 0)
4a62c710 2631 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 2632
27407a01 2633 return 0;
57fb9fb5
LP
2634}
2635
de40a303
LP
2636static int drop_capabilities(uid_t uid) {
2637 CapabilityQuintet q;
2638
2639 /* Let's initialize all five capability sets to something valid. If the quintet was configured via
2640 * OCI use that, but fill in missing bits. If it wasn't then derive the quintet in full from
2641 * arg_caps_retain. */
2642
2643 if (capability_quintet_is_set(&arg_full_capabilities)) {
2644 q = arg_full_capabilities;
2645
f5fbe71d 2646 if (q.bounding == UINT64_MAX)
de40a303
LP
2647 q.bounding = uid == 0 ? arg_caps_retain : 0;
2648
f5fbe71d 2649 if (q.effective == UINT64_MAX)
de40a303
LP
2650 q.effective = uid == 0 ? q.bounding : 0;
2651
f5fbe71d 2652 if (q.inheritable == UINT64_MAX)
88fc9c9b 2653 q.inheritable = uid == 0 ? q.bounding : arg_caps_ambient;
de40a303 2654
f5fbe71d 2655 if (q.permitted == UINT64_MAX)
88fc9c9b 2656 q.permitted = uid == 0 ? q.bounding : arg_caps_ambient;
de40a303 2657
f5fbe71d 2658 if (q.ambient == UINT64_MAX && ambient_capabilities_supported())
88fc9c9b 2659 q.ambient = arg_caps_ambient;
f66ad460
AZ
2660
2661 if (capability_quintet_mangle(&q))
2662 return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Cannot set capabilities that are not in the current bounding set.");
2663
2664 } else {
de40a303
LP
2665 q = (CapabilityQuintet) {
2666 .bounding = arg_caps_retain,
2667 .effective = uid == 0 ? arg_caps_retain : 0,
88fc9c9b
TH
2668 .inheritable = uid == 0 ? arg_caps_retain : arg_caps_ambient,
2669 .permitted = uid == 0 ? arg_caps_retain : arg_caps_ambient,
f5fbe71d 2670 .ambient = ambient_capabilities_supported() ? arg_caps_ambient : UINT64_MAX,
de40a303
LP
2671 };
2672
f66ad460
AZ
2673 /* If we're not using OCI, proceed with mangled capabilities (so we don't error out)
2674 * in order to maintain the same behavior as systemd < 242. */
2675 if (capability_quintet_mangle(&q))
0ccdaa79
JT
2676 log_full(arg_quiet ? LOG_DEBUG : LOG_WARNING,
2677 "Some capabilities will not be set because they are not in the current bounding set.");
f66ad460
AZ
2678
2679 }
2680
de40a303 2681 return capability_quintet_enforce(&q);
88213476
LP
2682}
2683
db999e0f
LP
2684static int reset_audit_loginuid(void) {
2685 _cleanup_free_ char *p = NULL;
2686 int r;
2687
0c582db0 2688 if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
db999e0f
LP
2689 return 0;
2690
2691 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 2692 if (r == -ENOENT)
db999e0f 2693 return 0;
f647962d
MS
2694 if (r < 0)
2695 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
2696
2697 /* Already reset? */
2698 if (streq(p, "4294967295"))
2699 return 0;
2700
57512c89 2701 r = write_string_file("/proc/self/loginuid", "4294967295", WRITE_STRING_FILE_DISABLE_BUFFER);
db999e0f 2702 if (r < 0) {
10a87006
LP
2703 log_error_errno(r,
2704 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2705 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2706 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2707 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2708 "using systemd-nspawn. Sleeping for 5s... (%m)");
77b6e194 2709
db999e0f 2710 sleep(5);
77b6e194 2711 }
db999e0f
LP
2712
2713 return 0;
77b6e194
LP
2714}
2715
e79581dd 2716static int mount_tunnel_dig(const char *root) {
785890ac 2717 const char *p, *q;
709f6e46 2718 int r;
785890ac
LP
2719
2720 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2721 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
63c372cb 2722 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
2723 (void) mkdir_p(p, 0600);
2724
5a27b395 2725 r = userns_mkdir(root, "/run/host", 0755, 0, 0);
709f6e46 2726 if (r < 0)
5a27b395 2727 return log_error_errno(r, "Failed to create /run/host: %m");
03cfe0d5 2728
e79581dd 2729 r = userns_mkdir(root, NSPAWN_MOUNT_TUNNEL, 0600, 0, 0);
709f6e46 2730 if (r < 0)
e79581dd 2731 return log_error_errno(r, "Failed to create "NSPAWN_MOUNT_TUNNEL": %m");
03cfe0d5 2732
e79581dd 2733 q = prefix_roota(root, NSPAWN_MOUNT_TUNNEL);
511a8cfe 2734 r = mount_nofollow_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
60e76d48
ZJS
2735 if (r < 0)
2736 return r;
785890ac 2737
511a8cfe 2738 r = mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
60e76d48
ZJS
2739 if (r < 0)
2740 return r;
785890ac 2741
e79581dd
CB
2742 return 0;
2743}
2744
2745static int mount_tunnel_open(void) {
2746 int r;
2747
2748 r = mount_follow_verbose(LOG_ERR, NULL, NSPAWN_MOUNT_TUNNEL, NULL, MS_SLAVE, NULL);
2749 if (r < 0)
2750 return r;
2751
2752 return 0;
785890ac
LP
2753}
2754
317feb4d 2755static int setup_machine_id(const char *directory) {
3bbaff3e 2756 int r;
e01ff70a 2757
317feb4d
LP
2758 /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
2759 * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
2760 * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
2761 * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
2762 * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
2763 * container behaves nicely). */
2764
319477f1 2765 r = id128_get_machine(directory, &arg_uuid);
bb44fd07
ZJS
2766 if (ERRNO_IS_NEG_MACHINE_ID_UNSET(r)) {
2767 /* If the file is missing, empty, or uninitialized, we don't mind */
317feb4d
LP
2768 if (sd_id128_is_null(arg_uuid)) {
2769 r = sd_id128_randomize(&arg_uuid);
2770 if (r < 0)
2771 return log_error_errno(r, "Failed to acquire randomized machine UUID: %m");
2772 }
bb44fd07
ZJS
2773 } else if (r < 0)
2774 return log_error_errno(r, "Failed to read machine ID from container image: %m");
691675ba 2775
e01ff70a
MS
2776 return 0;
2777}
2778
7336138e
LP
2779static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
2780 int r;
2781
2782 assert(directory);
2783
6c045a99 2784 if (arg_userns_mode == USER_NAMESPACE_NO || arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_CHOWN)
7336138e
LP
2785 return 0;
2786
2787 r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
2788 if (r == -EOPNOTSUPP)
2789 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
2790 if (r == -EBADE)
2791 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
2792 if (r < 0)
2793 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
2794 if (r == 0)
2795 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
2796 else
2797 log_debug("Patched directory tree to match UID/GID range.");
2798
2799 return r;
2800}
2801
113cea80 2802/*
6d416b9c
LS
2803 * Return values:
2804 * < 0 : wait_for_terminate() failed to get the state of the
2805 * container, the container was terminated by a signal, or
2806 * failed for an unknown reason. No change is made to the
2807 * container argument.
2808 * > 0 : The program executed in the container terminated with an
2809 * error. The exit code of the program executed in the
919699ec
LP
2810 * container is returned. The container argument has been set
2811 * to CONTAINER_TERMINATED.
6d416b9c
LS
2812 * 0 : The container is being rebooted, has been shut down or exited
2813 * successfully. The container argument has been set to either
2814 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 2815 *
6d416b9c
LS
2816 * That is, success is indicated by a return value of zero, and an
2817 * error is indicated by a non-zero value.
113cea80
DH
2818 */
2819static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 2820 siginfo_t status;
919699ec 2821 int r;
113cea80
DH
2822
2823 r = wait_for_terminate(pid, &status);
f647962d
MS
2824 if (r < 0)
2825 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
2826
2827 switch (status.si_code) {
fddbb89c 2828
113cea80 2829 case CLD_EXITED:
b5a2179b 2830 if (status.si_status == 0)
919699ec 2831 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
b5a2179b 2832 else
919699ec 2833 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 2834
919699ec
LP
2835 *container = CONTAINER_TERMINATED;
2836 return status.si_status;
113cea80
DH
2837
2838 case CLD_KILLED:
2839 if (status.si_status == SIGINT) {
919699ec 2840 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 2841 *container = CONTAINER_TERMINATED;
919699ec
LP
2842 return 0;
2843
113cea80 2844 } else if (status.si_status == SIGHUP) {
919699ec 2845 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 2846 *container = CONTAINER_REBOOTED;
919699ec 2847 return 0;
113cea80 2848 }
919699ec 2849
4831981d 2850 _fallthrough_;
113cea80 2851 case CLD_DUMPED:
baaa35ad
ZJS
2852 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2853 "Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
113cea80
DH
2854
2855 default:
baaa35ad
ZJS
2856 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2857 "Container %s failed due to unknown reason.", arg_machine);
113cea80 2858 }
113cea80
DH
2859}
2860
023fb90b
LP
2861static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2862 pid_t pid;
2863
4a0b58c4 2864 pid = PTR_TO_PID(userdata);
023fb90b 2865 if (pid > 0) {
c6c8f6e2 2866 if (kill(pid, arg_kill_signal) >= 0) {
023fb90b
LP
2867 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2868 sd_event_source_set_userdata(s, NULL);
2869 return 0;
2870 }
2871 }
2872
2873 sd_event_exit(sd_event_source_get_event(s), 0);
2874 return 0;
2875}
2876
6916b164 2877static int on_sigchld(sd_event_source *s, const struct signalfd_siginfo *ssi, void *userdata) {
abdb9b08
LP
2878 pid_t pid;
2879
2880 assert(s);
2881 assert(ssi);
2882
2883 pid = PTR_TO_PID(userdata);
2884
6916b164
AU
2885 for (;;) {
2886 siginfo_t si = {};
abdb9b08 2887
6916b164
AU
2888 if (waitid(P_ALL, 0, &si, WNOHANG|WNOWAIT|WEXITED) < 0)
2889 return log_error_errno(errno, "Failed to waitid(): %m");
2890 if (si.si_pid == 0) /* No pending children. */
2891 break;
abdb9b08 2892 if (si.si_pid == pid) {
6916b164
AU
2893 /* The main process we care for has exited. Return from
2894 * signal handler but leave the zombie. */
2895 sd_event_exit(sd_event_source_get_event(s), 0);
2896 break;
2897 }
abdb9b08 2898
6916b164
AU
2899 /* Reap all other children. */
2900 (void) waitid(P_PID, si.si_pid, &si, WNOHANG|WEXITED);
2901 }
2902
2903 return 0;
2904}
2905
abdb9b08
LP
2906static int on_request_stop(sd_bus_message *m, void *userdata, sd_bus_error *error) {
2907 pid_t pid;
2908
2909 assert(m);
2910
2911 pid = PTR_TO_PID(userdata);
2912
2913 if (arg_kill_signal > 0) {
2914 log_info("Container termination requested. Attempting to halt container.");
2915 (void) kill(pid, arg_kill_signal);
2916 } else {
2917 log_info("Container termination requested. Exiting.");
2918 sd_event_exit(sd_bus_get_event(sd_bus_message_get_bus(m)), 0);
2919 }
2920
2921 return 0;
2922}
2923
ec16945e 2924static int determine_names(void) {
1b9cebf6 2925 int r;
ec16945e 2926
c1521918
LP
2927 if (arg_template && !arg_directory && arg_machine) {
2928
2929 /* If --template= was specified then we should not
2930 * search for a machine, but instead create a new one
2931 * in /var/lib/machine. */
2932
657ee2d8 2933 arg_directory = path_join("/var/lib/machines", arg_machine);
c1521918
LP
2934 if (!arg_directory)
2935 return log_oom();
2936 }
2937
ec16945e 2938 if (!arg_image && !arg_directory) {
1b9cebf6
LP
2939 if (arg_machine) {
2940 _cleanup_(image_unrefp) Image *i = NULL;
2941
d577d4a4 2942 r = image_find(IMAGE_MACHINE, arg_machine, NULL, &i);
3a6ce860
LP
2943 if (r == -ENOENT)
2944 return log_error_errno(r, "No image for machine '%s'.", arg_machine);
1b9cebf6
LP
2945 if (r < 0)
2946 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
1b9cebf6 2947
eb38edce 2948 if (IN_SET(i->type, IMAGE_RAW, IMAGE_BLOCK))
0f03c2a4 2949 r = free_and_strdup(&arg_image, i->path);
1b9cebf6 2950 else
0f03c2a4 2951 r = free_and_strdup(&arg_directory, i->path);
1b9cebf6 2952 if (r < 0)
0f3be6ca 2953 return log_oom();
1b9cebf6 2954
aee327b8
LP
2955 if (!arg_ephemeral)
2956 arg_read_only = arg_read_only || i->read_only;
d7249575
LP
2957 } else {
2958 r = safe_getcwd(&arg_directory);
2959 if (r < 0)
2960 return log_error_errno(r, "Failed to determine current directory: %m");
2961 }
ec16945e 2962
c6147113
LP
2963 if (!arg_directory && !arg_image)
2964 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine path, please use -D or -i.");
ec16945e
LP
2965 }
2966
2967 if (!arg_machine) {
b9ba4dab
LP
2968 if (arg_directory && path_equal(arg_directory, "/"))
2969 arg_machine = gethostname_malloc();
e9b88a6d
LP
2970 else if (arg_image) {
2971 char *e;
4827ab48 2972
b36e39d2
LP
2973 r = path_extract_filename(arg_image, &arg_machine);
2974 if (r < 0)
2975 return log_error_errno(r, "Failed to extract file name from '%s': %m", arg_image);
4827ab48 2976
e9b88a6d
LP
2977 /* Truncate suffix if there is one */
2978 e = endswith(arg_machine, ".raw");
2979 if (e)
2980 *e = 0;
b36e39d2
LP
2981 } else {
2982 r = path_extract_filename(arg_directory, &arg_machine);
2983 if (r < 0)
2984 return log_error_errno(r, "Failed to extract file name from '%s': %m", arg_directory);
2985 }
ec16945e 2986
ae691c1d 2987 hostname_cleanup(arg_machine);
52ef5dd7 2988 if (!hostname_is_valid(arg_machine, 0))
c6147113 2989 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine machine name automatically, please use -M.");
b9ba4dab 2990
3603f151
LB
2991 /* Copy the machine name before the random suffix is added below, otherwise we won't be able
2992 * to match fixed config file names. */
2993 arg_settings_filename = strjoin(arg_machine, ".nspawn");
2994 if (!arg_settings_filename)
2995 return log_oom();
2996
e9b88a6d
LP
2997 /* Add a random suffix when this is an ephemeral machine, so that we can run many
2998 * instances at once without manually having to specify -M each time. */
2999 if (arg_ephemeral)
3000 if (strextendf(&arg_machine, "-%016" PRIx64, random_u64()) < 0)
b9ba4dab 3001 return log_oom();
3603f151
LB
3002 } else {
3003 arg_settings_filename = strjoin(arg_machine, ".nspawn");
3004 if (!arg_settings_filename)
3005 return log_oom();
ec16945e
LP
3006 }
3007
3008 return 0;
3009}
3010
/* Resolves the path stored in *p via chase() with the given flags and replaces *p with the result.
 * A NULL *p is left alone. Returns a negative errno-style value if the path cannot be resolved. */
static int chase_and_update(char **p, unsigned flags) {
        char *resolved;
        int r;

        assert(p);

        if (*p == NULL)
                return 0;

        r = chase(*p, NULL, flags, &resolved, NULL);
        if (r < 0)
                return log_error_errno(r, "Failed to resolve path %s: %m", *p);

        /* Drops the old string and takes over the freshly resolved one */
        return free_and_replace(*p, resolved);
}
3026
03cfe0d5 3027static int determine_uid_shift(const char *directory) {
6dac160c 3028
0de7acce 3029 if (arg_userns_mode == USER_NAMESPACE_NO) {
03cfe0d5 3030 arg_uid_shift = 0;
6dac160c 3031 return 0;
03cfe0d5 3032 }
6dac160c
LP
3033
3034 if (arg_uid_shift == UID_INVALID) {
3035 struct stat st;
3036
993da6d4
LP
3037 /* Read the UID shift off the image. Maybe we can reuse this to avoid chowning. */
3038
3039 if (stat(directory, &st) < 0)
03cfe0d5 3040 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
6dac160c
LP
3041
3042 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
3043
baaa35ad
ZJS
3044 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000)))
3045 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
3046 "UID and GID base of %s don't match.", directory);
6dac160c
LP
3047
3048 arg_uid_range = UINT32_C(0x10000);
f61c7f88
LP
3049
3050 if (arg_uid_shift != 0) {
3051 /* If the image is shifted already, then we'll fall back to classic chowning, for
3052 * compatibility (and simplicity), or refuse if mapping is explicitly requested. */
3053
3054 if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_AUTO) {
3055 log_debug("UID base of %s is non-zero, not using UID mapping.", directory);
3056 arg_userns_ownership = USER_NAMESPACE_OWNERSHIP_CHOWN;
3057 } else if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_MAP)
3058 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
3059 "UID base of %s is not zero, UID mapping not supported.", directory);
3060 }
6dac160c
LP
3061 }
3062
58e13de5
LP
3063 if (!userns_shift_range_valid(arg_uid_shift, arg_uid_range))
3064 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "UID base too high for UID range.");
6dac160c 3065
6dac160c
LP
3066 return 0;
3067}
3068
de40a303
LP
3069static unsigned long effective_clone_ns_flags(void) {
3070 unsigned long flags = arg_clone_ns_flags;
3071
3072 if (arg_private_network)
3073 flags |= CLONE_NEWNET;
3074 if (arg_use_cgns)
3075 flags |= CLONE_NEWCGROUP;
3076 if (arg_userns_mode != USER_NAMESPACE_NO)
3077 flags |= CLONE_NEWUSER;
3078
3079 return flags;
3080}
3081
3082static int patch_sysctl(void) {
3083
3084 /* This table is inspired by runc's sysctl() function */
3085 static const struct {
3086 const char *key;
3087 bool prefix;
3088 unsigned long clone_flags;
3089 } safe_sysctl[] = {
3090 { "kernel.hostname", false, CLONE_NEWUTS },
3091 { "kernel.domainname", false, CLONE_NEWUTS },
3092 { "kernel.msgmax", false, CLONE_NEWIPC },
3093 { "kernel.msgmnb", false, CLONE_NEWIPC },
3094 { "kernel.msgmni", false, CLONE_NEWIPC },
3095 { "kernel.sem", false, CLONE_NEWIPC },
3096 { "kernel.shmall", false, CLONE_NEWIPC },
3097 { "kernel.shmmax", false, CLONE_NEWIPC },
3098 { "kernel.shmmni", false, CLONE_NEWIPC },
3099 { "fs.mqueue.", true, CLONE_NEWIPC },
3100 { "net.", true, CLONE_NEWNET },
3101 };
3102
3103 unsigned long flags;
de40a303
LP
3104 int r;
3105
3106 flags = effective_clone_ns_flags();
3107
3108 STRV_FOREACH_PAIR(k, v, arg_sysctl) {
3109 bool good = false;
3110 size_t i;
3111
3112 for (i = 0; i < ELEMENTSOF(safe_sysctl); i++) {
3113
3114 if (!FLAGS_SET(flags, safe_sysctl[i].clone_flags))
3115 continue;
3116
3117 if (safe_sysctl[i].prefix)
3118 good = startswith(*k, safe_sysctl[i].key);
3119 else
3120 good = streq(*k, safe_sysctl[i].key);
3121
3122 if (good)
3123 break;
3124 }
3125
c6147113
LP
3126 if (!good)
3127 return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Refusing to write to sysctl '%s', as it is not safe in the selected namespaces.", *k);
de40a303
LP
3128
3129 r = sysctl_write(*k, *v);
3130 if (r < 0)
3131 return log_error_errno(r, "Failed to write sysctl '%s': %m", *k);
3132 }
3133
3134 return 0;
3135}
3136
03cfe0d5
LP
3137static int inner_child(
3138 Barrier *barrier,
5d9d3fcb 3139 int fd_inner_socket,
e1bb4b0d
LB
3140 FDSet *fds,
3141 char **os_release_pairs) {
69c79d3c 3142
03cfe0d5 3143 _cleanup_free_ char *home = NULL;
88614c8a 3144 size_t n_env = 1;
4ab3d29f
ZJS
3145 char *envp[] = {
3146 (char*) "PATH=" DEFAULT_PATH_COMPAT,
6aadfa4c 3147 NULL, /* container */
03cfe0d5
LP
3148 NULL, /* TERM */
3149 NULL, /* HOME */
3150 NULL, /* USER */
3151 NULL, /* LOGNAME */
3152 NULL, /* container_uuid */
3153 NULL, /* LISTEN_FDS */
3154 NULL, /* LISTEN_PID */
9c1e04d0 3155 NULL, /* NOTIFY_SOCKET */
3652872a 3156 NULL, /* CREDENTIALS_DIRECTORY */
b626f695 3157 NULL, /* LANG */
03cfe0d5
LP
3158 NULL
3159 };
1a68e1e5 3160 const char *exec_target;
2371271c 3161 _cleanup_strv_free_ char **env_use = NULL;
de40a303 3162 int r, which_failed;
88213476 3163
b37469d7
LP
3164 /* This is the "inner" child process, i.e. the one forked off by the "outer" child process, which is the one
3165 * the container manager itself forked off. At the time of clone() it gained its own CLONE_NEWNS, CLONE_NEWPID,
3166 * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER namespaces. Note that it has its own CLONE_NEWNS namespace,
3167 * separate from the CLONE_NEWNS created for the "outer" child, and also separate from the host's CLONE_NEWNS
3168 * namespace. The reason for having two levels of CLONE_NEWNS namespaces is that the "inner" one is owned by
3169 * the CLONE_NEWUSER namespace of the container, while the "outer" one is owned by the host's CLONE_NEWUSER
3170 * namespace.
3171 *
3172 * Note at this point we have no CLONE_NEWNET namespace yet. We'll acquire that one later through
3173 * unshare(). See below. */
3174
03cfe0d5 3175 assert(barrier);
5d9d3fcb 3176 assert(fd_inner_socket >= 0);
88213476 3177
de40a303
LP
3178 log_debug("Inner child is initializing.");
3179
0de7acce 3180 if (arg_userns_mode != USER_NAMESPACE_NO) {
03cfe0d5
LP
3181 /* Tell the parent, that it now can write the UID map. */
3182 (void) barrier_place(barrier); /* #1 */
7027ff61 3183
03cfe0d5 3184 /* Wait until the parent wrote the UID map */
baaa35ad 3185 if (!barrier_place_and_sync(barrier)) /* #2 */
2a2e78e9 3186 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Parent died too early");
88213476 3187
2a2e78e9
LP
3188 /* Become the new root user inside our namespace */
3189 r = reset_uid_gid();
3190 if (r < 0)
3191 return log_error_errno(r, "Couldn't become new root: %m");
3192
3193 /* Creating a new user namespace means all MS_SHARED mounts become MS_SLAVE. Let's put them
3194 * back to MS_SHARED here, since that's what we want as defaults. (This will not reconnect
3195 * propagation, but simply create new peer groups for all our mounts). */
511a8cfe 3196 r = mount_follow_verbose(LOG_ERR, NULL, "/", NULL, MS_SHARED|MS_REC, NULL);
2a2e78e9
LP
3197 if (r < 0)
3198 return r;
3199 }
6d66bd3b 3200
0de7acce 3201 r = mount_all(NULL,
4f086aab 3202 arg_mount_settings | MOUNT_IN_USERNS,
0de7acce 3203 arg_uid_shift,
0de7acce 3204 arg_selinux_apifs_context);
03cfe0d5
LP
3205 if (r < 0)
3206 return r;
3207
04413780
ZJS
3208 if (!arg_network_namespace_path && arg_private_network) {
3209 r = unshare(CLONE_NEWNET);
3210 if (r < 0)
3211 return log_error_errno(errno, "Failed to unshare network namespace: %m");
75116558
PS
3212
3213 /* Tell the parent that it can setup network interfaces. */
3214 (void) barrier_place(barrier); /* #3 */
04413780
ZJS
3215 }
3216
4f086aab 3217 r = mount_sysfs(NULL, arg_mount_settings);
d8fc6a00
LP
3218 if (r < 0)
3219 return r;
3220
03cfe0d5
LP
3221 /* Wait until we are cgroup-ified, so that we
3222 * can mount the right cgroup path writable */
baaa35ad
ZJS
3223 if (!barrier_place_and_sync(barrier)) /* #4 */
3224 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
3225 "Parent died too early");
88213476 3226
489fae52 3227 if (arg_use_cgns) {
0996ef00
CB
3228 r = unshare(CLONE_NEWCGROUP);
3229 if (r < 0)
04413780 3230 return log_error_errno(errno, "Failed to unshare cgroup namespace: %m");
0996ef00
CB
3231 r = mount_cgroups(
3232 "",
3233 arg_unified_cgroup_hierarchy,
3234 arg_userns_mode != USER_NAMESPACE_NO,
3235 arg_uid_shift,
3236 arg_uid_range,
5a8ff0e6 3237 arg_selinux_apifs_context,
ada54120 3238 true);
1433e0f2 3239 } else
0996ef00 3240 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
1433e0f2
LP
3241 if (r < 0)
3242 return r;
ec16945e 3243
1e4f1671 3244 r = setup_boot_id();
03cfe0d5
LP
3245 if (r < 0)
3246 return r;
ec16945e 3247
5d9d3fcb 3248 r = setup_kmsg(fd_inner_socket);
03cfe0d5
LP
3249 if (r < 0)
3250 return r;
ec16945e 3251
de40a303
LP
3252 r = mount_custom(
3253 "/",
3254 arg_custom_mounts,
3255 arg_n_custom_mounts,
de40a303 3256 0,
c0c8f718 3257 0,
de40a303 3258 arg_selinux_apifs_context,
5f0a6347 3259 MOUNT_NON_ROOT_ONLY | MOUNT_IN_USERNS);
de40a303
LP
3260 if (r < 0)
3261 return r;
3262
03cfe0d5
LP
3263 if (setsid() < 0)
3264 return log_error_errno(errno, "setsid() failed: %m");
3265
3266 if (arg_private_network)
df883de9 3267 (void) loopback_setup();
03cfe0d5 3268
7a8f6325 3269 if (arg_expose_ports) {
b07ee903 3270 r = expose_port_send_rtnl(fd_inner_socket);
7a8f6325
LP
3271 if (r < 0)
3272 return r;
7a8f6325 3273 }
03cfe0d5 3274
3acc84eb 3275 if (arg_console_mode != CONSOLE_PIPE) {
5bb1d7fb 3276 _cleanup_close_ int master = -EBADF;
3acc84eb
FB
3277 _cleanup_free_ char *console = NULL;
3278
3279 /* Allocate a pty and make it available as /dev/console. */
dc98caea 3280 master = openpt_allocate(O_RDWR|O_NONBLOCK, &console);
3acc84eb 3281 if (master < 0)
dc98caea 3282 return log_error_errno(master, "Failed to allocate a pty: %m");
3acc84eb
FB
3283
3284 r = setup_dev_console(console);
3285 if (r < 0)
105a1a36 3286 return log_error_errno(r, "Failed to set up /dev/console: %m");
3acc84eb 3287
bb1aa185 3288 r = send_one_fd(fd_inner_socket, master, 0);
3acc84eb
FB
3289 if (r < 0)
3290 return log_error_errno(r, "Failed to send master fd: %m");
3acc84eb
FB
3291
3292 r = setup_stdio_as_dev_console();
3293 if (r < 0)
3294 return r;
3295 }
3296
de40a303
LP
3297 r = patch_sysctl();
3298 if (r < 0)
3299 return r;
3300
81f345df
LP
3301 if (arg_oom_score_adjust_set) {
3302 r = set_oom_score_adjust(arg_oom_score_adjust);
3303 if (r < 0)
3304 return log_error_errno(r, "Failed to adjust OOM score: %m");
3305 }
3306
0985c7c4
ZJS
3307 if (arg_cpu_set.set)
3308 if (sched_setaffinity(0, arg_cpu_set.allocated, arg_cpu_set.set) < 0)
d107bb7d
LP
3309 return log_error_errno(errno, "Failed to set CPU affinity: %m");
3310
c818eef1 3311 (void) setup_hostname();
03cfe0d5 3312
050f7277 3313 if (arg_personality != PERSONALITY_INVALID) {
21022b9d
LP
3314 r = safe_personality(arg_personality);
3315 if (r < 0)
3316 return log_error_errno(r, "personality() failed: %m");
4c27749b
LP
3317#ifdef ARCHITECTURE_SECONDARY
3318 } else if (arg_architecture == ARCHITECTURE_SECONDARY) {
21022b9d
LP
3319 r = safe_personality(PER_LINUX32);
3320 if (r < 0)
3321 return log_error_errno(r, "personality() failed: %m");
4c27749b 3322#endif
af262e5f
LB
3323 } else if (!arg_quiet && arg_architecture >= 0 && arg_architecture != native_architecture())
3324 log_notice("Selected architecture '%s' not supported natively on the local CPU, assuming "
3325 "invocation with qemu userspace emulator (or equivalent) in effect.",
3326 architecture_to_string(arg_architecture));
03cfe0d5 3327
de40a303
LP
3328 r = setrlimit_closest_all((const struct rlimit *const*) arg_rlimit, &which_failed);
3329 if (r < 0)
3330 return log_error_errno(r, "Failed to apply resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
3331
3332#if HAVE_SECCOMP
3333 if (arg_seccomp) {
3334
3335 if (is_seccomp_available()) {
de40a303 3336 r = seccomp_load(arg_seccomp);
3c098014
ZJS
3337 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
3338 return log_error_errno(r, "Failed to install seccomp filter: %m");
3339 if (r < 0)
de40a303
LP
3340 log_debug_errno(r, "Failed to install seccomp filter: %m");
3341 }
3342 } else
3343#endif
3344 {
6b000af4 3345 r = setup_seccomp(arg_caps_retain, arg_syscall_allow_list, arg_syscall_deny_list);
de40a303
LP
3346 if (r < 0)
3347 return r;
3348 }
3349
4a4654e0 3350 if (arg_suppress_sync) {
20e458ae 3351#if HAVE_SECCOMP
4a4654e0
LP
3352 r = seccomp_suppress_sync();
3353 if (r < 0)
3354 log_debug_errno(r, "Failed to install sync() suppression seccomp filter, ignoring: %m");
20e458ae 3355#else
2db32618 3356 log_debug("systemd is built without SECCOMP support. Ignoring --suppress-sync= command line option and SuppressSync= setting.");
20e458ae 3357#endif
4a4654e0
LP
3358 }
3359
349cc4a5 3360#if HAVE_SELINUX
03cfe0d5 3361 if (arg_selinux_context)
2ed96880 3362 if (setexeccon(arg_selinux_context) < 0)
03cfe0d5
LP
3363 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
3364#endif
3365
de40a303
LP
3366 /* Make sure we keep the caps across the uid/gid dropping, so that we can retain some selected caps
3367 * if we need to later on. */
3368 if (prctl(PR_SET_KEEPCAPS, 1) < 0)
3369 return log_error_errno(errno, "Failed to set PR_SET_KEEPCAPS: %m");
3370
3371 if (uid_is_valid(arg_uid) || gid_is_valid(arg_gid))
3462d773 3372 r = change_uid_gid_raw(arg_uid, arg_gid, arg_supplementary_gids, arg_n_supplementary_gids, arg_console_mode != CONSOLE_PIPE);
de40a303 3373 else
3462d773 3374 r = change_uid_gid(arg_user, arg_console_mode != CONSOLE_PIPE, &home);
03cfe0d5
LP
3375 if (r < 0)
3376 return r;
3377
de40a303
LP
3378 r = drop_capabilities(getuid());
3379 if (r < 0)
3380 return log_error_errno(r, "Dropping capabilities failed: %m");
3381
66edd963
LP
3382 if (arg_no_new_privileges)
3383 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
3384 return log_error_errno(errno, "Failed to disable new privileges: %m");
3385
6aadfa4c
ILG
3386 /* LXC sets container=lxc, so follow the scheme here */
3387 envp[n_env++] = strjoina("container=", arg_container_service_name);
3388
03cfe0d5
LP
3389 envp[n_env] = strv_find_prefix(environ, "TERM=");
3390 if (envp[n_env])
313cefa1 3391 n_env++;
03cfe0d5 3392
de40a303 3393 if (home || !uid_is_valid(arg_uid) || arg_uid == 0)
4ab3d29f 3394 if (asprintf(envp + n_env++, "HOME=%s", home ?: "/root") < 0)
de40a303
LP
3395 return log_oom();
3396
3397 if (arg_user || !uid_is_valid(arg_uid) || arg_uid == 0)
4ab3d29f 3398 if (asprintf(envp + n_env++, "USER=%s", arg_user ?: "root") < 0 ||
1da3cb81 3399 asprintf(envp + n_env++, "LOGNAME=%s", arg_user ?: "root") < 0)
de40a303 3400 return log_oom();
03cfe0d5 3401
3bbaff3e 3402 assert(!sd_id128_is_null(arg_uuid));
03cfe0d5 3403
b7416360 3404 if (asprintf(envp + n_env++, "container_uuid=%s", SD_ID128_TO_UUID_STRING(arg_uuid)) < 0)
e01ff70a 3405 return log_oom();
03cfe0d5
LP
3406
3407 if (fdset_size(fds) > 0) {
3408 r = fdset_cloexec(fds, false);
3409 if (r < 0)
3410 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
3411
4ab3d29f
ZJS
3412 if ((asprintf(envp + n_env++, "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
3413 (asprintf(envp + n_env++, "LISTEN_PID=1") < 0))
03cfe0d5
LP
3414 return log_oom();
3415 }
4ab3d29f 3416 if (asprintf(envp + n_env++, "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
9c1e04d0 3417 return log_oom();
03cfe0d5 3418
3652872a
LP
3419 if (arg_n_credentials > 0) {
3420 envp[n_env] = strdup("CREDENTIALS_DIRECTORY=/run/host/credentials");
3421 if (!envp[n_env])
3422 return log_oom();
3423 n_env++;
3424 }
3425
b626f695 3426 if (arg_start_mode != START_BOOT) {
a22f5186 3427 envp[n_env] = strdup("LANG=" SYSTEMD_NSPAWN_LOCALE);
b626f695
DDM
3428 if (!envp[n_env])
3429 return log_oom();
3430 n_env++;
3431 }
3432
4ab3d29f 3433 env_use = strv_env_merge(envp, os_release_pairs, arg_setenv);
2371271c
TG
3434 if (!env_use)
3435 return log_oom();
03cfe0d5 3436
1a8d7814 3437 /* Let the parent know that we are ready and wait until the parent is ready with the setup, too... */
baaa35ad 3438 if (!barrier_place_and_sync(barrier)) /* #5 */
335d2ead 3439 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Parent died too early");
03cfe0d5 3440
5f932eb9
LP
3441 if (arg_chdir)
3442 if (chdir(arg_chdir) < 0)
3443 return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
3444
7732f92b 3445 if (arg_start_mode == START_PID2) {
75bf701f 3446 r = stub_pid1(arg_uuid);
7732f92b
LP
3447 if (r < 0)
3448 return r;
3449 }
3450
335d2ead
LP
3451 if (arg_console_mode != CONSOLE_PIPE) {
3452 /* So far our pty wasn't controlled by any process. Finally, it's time to change that, if we
3453 * are configured for that. Acquire it as controlling tty. */
3454 if (ioctl(STDIN_FILENO, TIOCSCTTY) < 0)
3455 return log_error_errno(errno, "Failed to acquire controlling TTY: %m");
3456 }
3457
de40a303
LP
3458 log_debug("Inner child completed, invoking payload.");
3459
8ca082b4
LP
3460 /* Now, explicitly close the log, so that we then can close all remaining fds. Closing the log explicitly first
3461 * has the benefit that the logging subsystem knows about it, and is thus ready to be reopened should we need
3462 * it again. Note that the other fds closed here are at least the locking and barrier fds. */
03cfe0d5 3463 log_close();
8ca082b4 3464 log_set_open_when_needed(true);
a3b00f91 3465 log_settle_target();
8ca082b4 3466
03cfe0d5
LP
3467 (void) fdset_close_others(fds);
3468
7732f92b 3469 if (arg_start_mode == START_BOOT) {
03cfe0d5
LP
3470 char **a;
3471 size_t m;
3472
3473 /* Automatically search for the init system */
3474
75f32f04
ZJS
3475 m = strv_length(arg_parameters);
3476 a = newa(char*, m + 2);
3477 memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
3478 a[1 + m] = NULL;
03cfe0d5 3479
a5096641
LP
3480 FOREACH_STRING(init,
3481 "/usr/lib/systemd/systemd",
3482 "/lib/systemd/systemd",
3483 "/sbin/init") {
3484 a[0] = (char*) init;
3485 execve(a[0], a, env_use);
3486 }
ced58da7
LP
3487
3488 exec_target = "/usr/lib/systemd/systemd, /lib/systemd/systemd, /sbin/init";
1a68e1e5 3489 } else if (!strv_isempty(arg_parameters)) {
b6b180b7
LP
3490 const char *dollar_path;
3491
1a68e1e5 3492 exec_target = arg_parameters[0];
b6b180b7
LP
3493
3494 /* Use the user supplied search $PATH if there is one, or DEFAULT_PATH_COMPAT if not to search the
3495 * binary. */
3496 dollar_path = strv_env_get(env_use, "PATH");
3497 if (dollar_path) {
6f646e01 3498 if (setenv("PATH", dollar_path, 1) < 0)
b6b180b7
LP
3499 return log_error_errno(errno, "Failed to update $PATH: %m");
3500 }
3501
f757855e 3502 execvpe(arg_parameters[0], arg_parameters, env_use);
1a68e1e5 3503 } else {
5f932eb9 3504 if (!arg_chdir)
d929b0f9
ZJS
3505 /* If we cannot change the directory, we'll end up in /, that is expected. */
3506 (void) chdir(home ?: "/root");
5f932eb9 3507
53350c7b 3508 execle(DEFAULT_USER_SHELL, "-" DEFAULT_USER_SHELL_NAME, NULL, env_use);
3509 if (!streq(DEFAULT_USER_SHELL, "/bin/bash"))
3510 execle("/bin/bash", "-bash", NULL, env_use);
3511 if (!streq(DEFAULT_USER_SHELL, "/bin/sh"))
3512 execle("/bin/sh", "-sh", NULL, env_use);
ced58da7 3513
53350c7b 3514 exec_target = DEFAULT_USER_SHELL ", /bin/bash, /bin/sh";
03cfe0d5
LP
3515 }
3516
8ca082b4 3517 return log_error_errno(errno, "execv(%s) failed: %m", exec_target);
03cfe0d5
LP
3518}
3519
e96ceaba 3520static int setup_notify_child(void) {
254d1313 3521 _cleanup_close_ int fd = -EBADF;
1eb874b9 3522 static const union sockaddr_union sa = {
44ed5214
LP
3523 .un.sun_family = AF_UNIX,
3524 .un.sun_path = NSPAWN_NOTIFY_SOCKET_PATH,
9c1e04d0
AP
3525 };
3526 int r;
3527
3528 fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
3529 if (fd < 0)
3530 return log_error_errno(errno, "Failed to allocate notification socket: %m");
3531
3532 (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
fbda85b0 3533 (void) sockaddr_un_unlink(&sa.un);
9c1e04d0 3534
9c1e04d0 3535 r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
271f518f 3536 if (r < 0)
44ed5214 3537 return log_error_errno(errno, "bind(" NSPAWN_NOTIFY_SOCKET_PATH ") failed: %m");
9c1e04d0 3538
adc7d9f0 3539 r = userns_lchown(NSPAWN_NOTIFY_SOCKET_PATH, 0, 0);
271f518f 3540 if (r < 0)
adc7d9f0 3541 return log_error_errno(r, "Failed to chown " NSPAWN_NOTIFY_SOCKET_PATH ": %m");
adc7d9f0 3542
2ff48e98 3543 r = setsockopt_int(fd, SOL_SOCKET, SO_PASSCRED, true);
271f518f 3544 if (r < 0)
2ff48e98 3545 return log_error_errno(r, "SO_PASSCRED failed: %m");
9c1e04d0 3546
271f518f 3547 return TAKE_FD(fd);
9c1e04d0
AP
3548}
3549
03cfe0d5
LP
3550static int outer_child(
3551 Barrier *barrier,
3552 const char *directory,
2d845785 3553 DissectedImage *dissected_image,
af06cd30 3554 int fd_outer_socket,
5d9d3fcb 3555 int fd_inner_socket,
d7bea6b6
DP
3556 FDSet *fds,
3557 int netns_fd) {
03cfe0d5 3558
2f893044 3559 _cleanup_(bind_user_context_freep) BindUserContext *bind_user_context = NULL;
e1bb4b0d 3560 _cleanup_strv_free_ char **os_release_pairs = NULL;
254d1313 3561 _cleanup_close_ int fd = -EBADF, mntns_fd = -EBADF;
f61c7f88 3562 bool idmap = false;
e5f10caf 3563 const char *p;
03cfe0d5
LP
3564 pid_t pid;
3565 ssize_t l;
de40a303 3566 int r;
03cfe0d5 3567
d1d0b895
LP
3568 /* This is the "outer" child process, i.e the one forked off by the container manager itself. It
3569 * already has its own CLONE_NEWNS namespace (which was created by the clone()). It still lives in
3570 * the host's CLONE_NEWPID, CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER and CLONE_NEWNET
3571 * namespaces. After it completed a number of initializations a second child (the "inner" one) is
3572 * forked off it, and it exits. */
b37469d7 3573
03cfe0d5
LP
3574 assert(barrier);
3575 assert(directory);
af06cd30 3576 assert(fd_outer_socket >= 0);
5d9d3fcb 3577 assert(fd_inner_socket >= 0);
03cfe0d5 3578
de40a303
LP
3579 log_debug("Outer child is initializing.");
3580
e1bb4b0d
LB
3581 r = load_os_release_pairs_with_prefix("/", "container_host_", &os_release_pairs);
3582 if (r < 0)
3583 log_debug_errno(r, "Failed to read os-release from host for container, ignoring: %m");
3584
03cfe0d5
LP
3585 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
3586 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3587
03cfe0d5
LP
3588 r = reset_audit_loginuid();
3589 if (r < 0)
3590 return r;
3591
2a2e78e9
LP
3592 /* Mark everything as slave, so that we still receive mounts from the real root, but don't propagate
3593 * mounts to the real root. */
511a8cfe 3594 r = mount_follow_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
60e76d48
ZJS
3595 if (r < 0)
3596 return r;
03cfe0d5 3597
2d845785 3598 if (dissected_image) {
d1d0b895
LP
3599 /* If we are operating on a disk image, then mount its root directory now, but leave out the
3600 * rest. We can read the UID shift from it if we need to. Further down we'll mount the rest,
3601 * but then with the uid shift known. That way we can mount VFAT file systems shifted to the
3602 * right place right away. This makes sure ESP partitions and userns are compatible. */
2d3a5a73 3603
af187ab2 3604 r = dissected_image_mount_and_warn(
d04faa4e
LP
3605 dissected_image,
3606 directory,
3607 arg_uid_shift,
21b61b1d 3608 arg_uid_range,
8d9a1d59 3609 /* userns_fd= */ -EBADF,
d04faa4e
LP
3610 DISSECT_IMAGE_MOUNT_ROOT_ONLY|
3611 DISSECT_IMAGE_DISCARD_ON_LOOP|
3612 DISSECT_IMAGE_USR_NO_ROOT|
c65f854a 3613 (arg_read_only ? DISSECT_IMAGE_READ_ONLY : DISSECT_IMAGE_FSCK|DISSECT_IMAGE_GROWFS)|
af187ab2 3614 (arg_start_mode == START_BOOT ? DISSECT_IMAGE_VALIDATE_OS : 0));
2d845785 3615 if (r < 0)
af187ab2 3616 return r;
2d845785 3617 }
03cfe0d5 3618
391567f4
LP
3619 r = determine_uid_shift(directory);
3620 if (r < 0)
3621 return r;
3622
0de7acce 3623 if (arg_userns_mode != USER_NAMESPACE_NO) {
b71a0192
CB
3624 r = namespace_open(0, NULL, &mntns_fd, NULL, NULL, NULL);
3625 if (r < 0)
3626 return log_error_errno(r, "Failed to pin outer mount namespace: %m");
3627
af06cd30 3628 l = send_one_fd(fd_outer_socket, mntns_fd, 0);
b71a0192
CB
3629 if (l < 0)
3630 return log_error_errno(l, "Failed to send outer mount namespace fd: %m");
3631 mntns_fd = safe_close(mntns_fd);
3632
0e7ac751 3633 /* Let the parent know which UID shift we read from the image */
af06cd30 3634 l = send(fd_outer_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
825d5287
RM
3635 if (l < 0)
3636 return log_error_errno(errno, "Failed to send UID shift: %m");
baaa35ad
ZJS
3637 if (l != sizeof(arg_uid_shift))
3638 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3639 "Short write while sending UID shift.");
0e7ac751 3640
0de7acce 3641 if (arg_userns_mode == USER_NAMESPACE_PICK) {
d1d0b895
LP
3642 /* When we are supposed to pick the UID shift, the parent will check now whether the
3643 * UID shift we just read from the image is available. If yes, it will send the UID
3644 * shift back to us, if not it will pick a different one, and send it back to us. */
0e7ac751 3645
af06cd30 3646 l = recv(fd_outer_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
0e7ac751
LP
3647 if (l < 0)
3648 return log_error_errno(errno, "Failed to recv UID shift: %m");
baaa35ad
ZJS
3649 if (l != sizeof(arg_uid_shift))
3650 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3651 "Short read while receiving UID shift.");
0e7ac751
LP
3652 }
3653
ff6c6cc1
LP
3654 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
3655 "Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
825d5287
RM
3656 }
3657
6f83d3d1
LP
3658 if (path_equal(directory, "/")) {
3659 /* If the directory we shall boot is the host, let's operate on a bind mount at a different
3660 * place, so that we can make changes to its mount structure (for example, to implement
3661 * --volatile=) without this interfering with our ability to access files such as
3662 * /etc/localtime to copy into the container. Note that we use a fixed place for this
6c2d70ce 3663 * (instead of a temporary directory, since we are living in our own mount namespace here
7802194a 3664 * already, and thus don't need to be afraid of colliding with anyone else's mounts). */
6f83d3d1
LP
3665 (void) mkdir_p("/run/systemd/nspawn-root", 0755);
3666
511a8cfe 3667 r = mount_nofollow_verbose(LOG_ERR, "/", "/run/systemd/nspawn-root", NULL, MS_BIND|MS_REC, NULL);
6f83d3d1
LP
3668 if (r < 0)
3669 return r;
3670
3671 directory = "/run/systemd/nspawn-root";
e50cd82f 3672 }
7d0ecdd6 3673
75f81732
LP
3674 /* Make sure we always have a mount that we can move to root later on. */
3675 r = make_mount_point(directory);
3676 if (r < 0)
3677 return r;
3678
3679 /* So the whole tree is now MS_SLAVE, i.e. we'll still receive mount/umount events from the host
3680 * mount namespace. For the directory we are going to run our container let's turn this off, so that
3681 * we'll live in our own little world from now on, and propagation from the host may only happen via
3682 * the mount tunnel dir, or not at all. */
3683 r = mount_follow_verbose(LOG_ERR, NULL, directory, NULL, MS_PRIVATE|MS_REC, NULL);
3684 if (r < 0)
3685 return r;
3686
7d0ecdd6
LP
3687 r = setup_pivot_root(
3688 directory,
3689 arg_pivot_root_new,
3690 arg_pivot_root_old);
3691 if (r < 0)
3692 return r;
3693
3694 r = setup_volatile_mode(
3695 directory,
3696 arg_volatile_mode,
7d0ecdd6 3697 arg_uid_shift,
8f1ed04a 3698 arg_selinux_apifs_context);
7d0ecdd6
LP
3699 if (r < 0)
3700 return r;
3701
2f893044
LP
3702 r = bind_user_prepare(
3703 directory,
3704 arg_bind_user,
3705 arg_uid_shift,
3706 arg_uid_range,
3707 &arg_custom_mounts, &arg_n_custom_mounts,
3708 &bind_user_context);
3709 if (r < 0)
3710 return r;
3711
3712 if (arg_userns_mode != USER_NAMESPACE_NO && bind_user_context) {
d1d0b895
LP
3713 /* Send the user maps we determined to the parent, so that it installs it in our user
3714 * namespace UID map table */
2f893044
LP
3715
3716 for (size_t i = 0; i < bind_user_context->n_data; i++) {
3717 uid_t map[] = {
3718 bind_user_context->data[i].payload_user->uid,
3719 bind_user_context->data[i].host_user->uid,
3720 (uid_t) bind_user_context->data[i].payload_group->gid,
3721 (uid_t) bind_user_context->data[i].host_group->gid,
3722 };
3723
af06cd30 3724 l = send(fd_outer_socket, map, sizeof(map), MSG_NOSIGNAL);
2f893044
LP
3725 if (l < 0)
3726 return log_error_errno(errno, "Failed to send user UID map: %m");
3727 if (l != sizeof(map))
3728 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3729 "Short write while sending user UID map.");
3730 }
3731 }
3732
5f0a6347
DDM
3733 r = mount_custom(
3734 directory,
3735 arg_custom_mounts,
3736 arg_n_custom_mounts,
5f0a6347 3737 arg_uid_shift,
c0c8f718 3738 arg_uid_range,
5f0a6347
DDM
3739 arg_selinux_apifs_context,
3740 MOUNT_ROOT_ONLY);
3741 if (r < 0)
3742 return r;
3743
c0c8f718
AV
3744 if (arg_userns_mode != USER_NAMESPACE_NO &&
3745 IN_SET(arg_userns_ownership, USER_NAMESPACE_OWNERSHIP_MAP, USER_NAMESPACE_OWNERSHIP_AUTO) &&
3746 arg_uid_shift != 0) {
dba4fa89
LP
3747 _cleanup_free_ char *usr_subtree = NULL;
3748 char *dirs[3];
3749 size_t i = 0;
c0c8f718 3750
dba4fa89
LP
3751 dirs[i++] = (char*) directory;
3752
3753 if (dissected_image && dissected_image->partitions[PARTITION_USR].found) {
3754 usr_subtree = path_join(directory, "/usr");
3755 if (!usr_subtree)
3756 return log_oom();
3757
3758 dirs[i++] = usr_subtree;
3759 }
3760
3761 dirs[i] = NULL;
3762
3763 r = remount_idmap(dirs, arg_uid_shift, arg_uid_range, UID_INVALID, REMOUNT_IDMAPPING_HOST_ROOT);
bb44fd07
ZJS
3764 if (r == -EINVAL || ERRNO_IS_NEG_NOT_SUPPORTED(r)) {
3765 /* This might fail because the kernel or file system doesn't support idmapping. We
3766 * can't really distinguish this nicely, nor do we have any guarantees about the
3767 * error codes we see, could be EOPNOTSUPP or EINVAL. */
3768 if (arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_AUTO)
3769 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
3770 "ID mapped mounts are apparently not available, sorry.");
3771
3772 log_debug("ID mapped mounts are apparently not available on this kernel or for the selected file system, reverting to recursive chown()ing.");
3773 arg_userns_ownership = USER_NAMESPACE_OWNERSHIP_CHOWN;
3774 } else if (r < 0)
3775 return log_error_errno(r, "Failed to set up ID mapped mounts: %m");
3776 else {
c0c8f718
AV
3777 log_debug("ID mapped mounts available, making use of them.");
3778 idmap = true;
3779 }
3780 }
3781
2d3a5a73
LP
3782 if (dissected_image) {
3783 /* Now we know the uid shift, let's now mount everything else that might be in the image. */
d04faa4e
LP
3784 r = dissected_image_mount(
3785 dissected_image,
3786 directory,
3787 arg_uid_shift,
21b61b1d 3788 arg_uid_range,
8d9a1d59 3789 /* userns_fd= */ -EBADF,
d04faa4e
LP
3790 DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY|
3791 DISSECT_IMAGE_DISCARD_ON_LOOP|
3792 DISSECT_IMAGE_USR_NO_ROOT|
f61c7f88
LP
3793 (arg_read_only ? DISSECT_IMAGE_READ_ONLY : DISSECT_IMAGE_FSCK|DISSECT_IMAGE_GROWFS)|
3794 (idmap ? DISSECT_IMAGE_MOUNT_IDMAPPED : 0));
4fcb96ce
LP
3795 if (r == -EUCLEAN)
3796 return log_error_errno(r, "File system check for image failed: %m");
2d3a5a73 3797 if (r < 0)
4fcb96ce 3798 return log_error_errno(r, "Failed to mount image file system: %m");
2d3a5a73
LP
3799 }
3800
8199d554
LP
3801 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
3802 /* OK, we don't know yet which cgroup mode to use yet. Let's figure it out, and tell the parent. */
3803
3804 r = detect_unified_cgroup_hierarchy_from_image(directory);
3805 if (r < 0)
3806 return r;
3807
fefb7a6d 3808 l = send(fd_outer_socket, &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), MSG_NOSIGNAL);
8199d554
LP
3809 if (l < 0)
3810 return log_error_errno(errno, "Failed to send cgroup mode: %m");
baaa35ad
ZJS
3811 if (l != sizeof(arg_unified_cgroup_hierarchy))
3812 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3813 "Short write while sending cgroup mode.");
8199d554
LP
3814 }
3815
4ad14eff
LP
3816 r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
3817 if (r < 0)
3818 return r;
3819
03cfe0d5
LP
3820 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
3821 if (r < 0)
3822 return r;
3823
bbd407ea
DDM
3824 if (arg_read_only && arg_volatile_mode == VOLATILE_NO &&
3825 !has_custom_root_mount(arg_custom_mounts, arg_n_custom_mounts)) {
64e82c19 3826 r = bind_remount_recursive(directory, MS_RDONLY, MS_RDONLY, NULL);
03cfe0d5
LP
3827 if (r < 0)
3828 return log_error_errno(r, "Failed to make tree read-only: %m");
3829 }
3830
0de7acce 3831 r = mount_all(directory,
4f086aab 3832 arg_mount_settings,
0de7acce 3833 arg_uid_shift,
0de7acce 3834 arg_selinux_apifs_context);
03cfe0d5
LP
3835 if (r < 0)
3836 return r;
3837
07fa00f9
LP
3838 r = copy_devnodes(directory);
3839 if (r < 0)
03cfe0d5
LP
3840 return r;
3841
de40a303
LP
3842 r = make_extra_nodes(directory);
3843 if (r < 0)
3844 return r;
3845
3846 (void) dev_setup(directory, arg_uid_shift, arg_uid_shift);
e5f10caf 3847
9fac5029 3848 p = prefix_roota(directory, "/run/host");
e5f10caf 3849 (void) make_inaccessible_nodes(p, arg_uid_shift, arg_uid_shift);
03cfe0d5 3850
07fa00f9
LP
3851 r = setup_pts(directory);
3852 if (r < 0)
03cfe0d5
LP
3853 return r;
3854
e79581dd 3855 r = mount_tunnel_dig(directory);
03cfe0d5
LP
3856 if (r < 0)
3857 return r;
3858
8e5430c4
LP
3859 r = setup_keyring();
3860 if (r < 0)
3861 return r;
3862
3652872a
LP
3863 r = setup_credentials(directory);
3864 if (r < 0)
3865 return r;
3866
2f893044
LP
3867 r = bind_user_setup(bind_user_context, directory);
3868 if (r < 0)
3869 return r;
3870
5c4deb9a
MJ
3871 r = mount_custom(
3872 directory,
3873 arg_custom_mounts,
3874 arg_n_custom_mounts,
3875 arg_uid_shift,
c0c8f718 3876 arg_uid_range,
5c4deb9a
MJ
3877 arg_selinux_apifs_context,
3878 MOUNT_NON_ROOT_ONLY);
3879 if (r < 0)
3880 return r;
3881
03cfe0d5
LP
3882 r = setup_timezone(directory);
3883 if (r < 0)
3884 return r;
3885
3886 r = setup_resolv_conf(directory);
3887 if (r < 0)
3888 return r;
3889
e01ff70a
MS
3890 r = setup_machine_id(directory);
3891 if (r < 0)
3892 return r;
3893
03cfe0d5
LP
3894 r = setup_journal(directory);
3895 if (r < 0)
3896 return r;
3897
0f48ba7b
LP
3898 /* The same stuff as the $container env var, but nicely readable for the entire payload */
3899 p = prefix_roota(directory, "/run/host/container-manager");
3900 (void) write_string_file(p, arg_container_service_name, WRITE_STRING_FILE_CREATE);
3901
3902 /* The same stuff as the $container_uuid env var */
3903 p = prefix_roota(directory, "/run/host/container-uuid");
3904 (void) write_string_filef(p, WRITE_STRING_FILE_CREATE, SD_ID128_UUID_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid));
3905
489fae52 3906 if (!arg_use_cgns) {
0996ef00
CB
3907 r = mount_cgroups(
3908 directory,
3909 arg_unified_cgroup_hierarchy,
3910 arg_userns_mode != USER_NAMESPACE_NO,
3911 arg_uid_shift,
3912 arg_uid_range,
5a8ff0e6 3913 arg_selinux_apifs_context,
ada54120 3914 false);
0996ef00
CB
3915 if (r < 0)
3916 return r;
3917 }
03cfe0d5 3918
57c10a56
CB
3919 /* Mark everything as shared so our mounts get propagated down. This is required to make new bind
3920 * mounts available in systemd services inside the container that create a new mount namespace. See
3921 * https://github.com/systemd/systemd/issues/3860 Further submounts (such as /dev) done after this
3922 * will inherit the shared propagation mode.
3923 *
3924 * IMPORTANT: Do not overmount the root directory anymore from now on to enable moving the root
3925 * directory mount to root later on.
3926 * https://github.com/systemd/systemd/issues/3847#issuecomment-562735251
3927 */
9d50f850 3928 r = mount_switch_root(directory, MS_SHARED);
03cfe0d5
LP
3929 if (r < 0)
3930 return log_error_errno(r, "Failed to move root directory: %m");
3931
e79581dd
CB
3932 /* We finished setting up the rootfs which is a shared mount. The mount tunnel needs to be a
3933 * dependent mount otherwise we can't MS_MOVE mounts that were propagated from the host into
3934 * the container. */
3935 r = mount_tunnel_open();
3936 if (r < 0)
3937 return r;
3938
b71a0192
CB
3939 if (arg_userns_mode != USER_NAMESPACE_NO) {
3940 /* In order to mount procfs and sysfs in an unprivileged container the kernel
3941 * requires that a fully visible instance is already present in the target mount
3942 * namespace. Mount one here so the inner child can mount its own instances. Later
3943 * we umount the temporary instances created here before we actually exec the
3944 * payload. Since the rootfs is shared the umount will propagate into the container.
3945 * Note, the inner child wouldn't be able to unmount the instances on its own since
3946 * it doesn't own the originating mount namespace. IOW, the outer child needs to do
3947 * this. */
3948 r = pin_fully_visible_fs();
3949 if (r < 0)
3950 return r;
3951 }
3952
e96ceaba 3953 fd = setup_notify_child();
9c1e04d0
AP
3954 if (fd < 0)
3955 return fd;
3956
03cfe0d5 3957 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
0c582db0 3958 arg_clone_ns_flags |
8869a0b4 3959 (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
03cfe0d5
LP
3960 if (pid < 0)
3961 return log_error_errno(errno, "Failed to fork inner child: %m");
03cfe0d5 3962 if (pid == 0) {
af06cd30 3963 fd_outer_socket = safe_close(fd_outer_socket);
03cfe0d5 3964
2a2e78e9
LP
3965 /* The inner child has all namespaces that are requested, so that we all are owned by the
3966 * user if user namespaces are turned on. */
03cfe0d5 3967
d7bea6b6
DP
3968 if (arg_network_namespace_path) {
3969 r = namespace_enter(-1, -1, netns_fd, -1, -1);
3970 if (r < 0)
e2d39e54 3971 return log_error_errno(r, "Failed to join network namespace: %m");
d7bea6b6
DP
3972 }
3973
11875a98 3974 r = inner_child(barrier, fd_inner_socket, fds, os_release_pairs);
03cfe0d5
LP
3975 if (r < 0)
3976 _exit(EXIT_FAILURE);
3977
3978 _exit(EXIT_SUCCESS);
3979 }
3980
af06cd30 3981 l = send(fd_outer_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
03cfe0d5
LP
3982 if (l < 0)
3983 return log_error_errno(errno, "Failed to send PID: %m");
baaa35ad
ZJS
3984 if (l != sizeof(pid))
3985 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3986 "Short write while sending PID.");
03cfe0d5 3987
af06cd30 3988 l = send(fd_outer_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
e01ff70a
MS
3989 if (l < 0)
3990 return log_error_errno(errno, "Failed to send machine ID: %m");
baaa35ad
ZJS
3991 if (l != sizeof(arg_uuid))
3992 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3993 "Short write while sending machine ID.");
e01ff70a 3994
af06cd30 3995 l = send_one_fd(fd_outer_socket, fd, 0);
9c1e04d0 3996 if (l < 0)
ba72801d 3997 return log_error_errno(l, "Failed to send notify fd: %m");
9c1e04d0 3998
af06cd30 3999 fd_outer_socket = safe_close(fd_outer_socket);
5d9d3fcb 4000 fd_inner_socket = safe_close(fd_inner_socket);
d7bea6b6 4001 netns_fd = safe_close(netns_fd);
03cfe0d5
LP
4002
4003 return 0;
4004}
4005
0e7ac751 4006static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
d381c8a6 4007 bool tried_hashed = false;
0e7ac751
LP
4008 unsigned n_tries = 100;
4009 uid_t candidate;
4010 int r;
4011
4012 assert(shift);
4013 assert(ret_lock_file);
0de7acce 4014 assert(arg_userns_mode == USER_NAMESPACE_PICK);
0e7ac751
LP
4015 assert(arg_uid_range == 0x10000U);
4016
4017 candidate = *shift;
4018
4019 (void) mkdir("/run/systemd/nspawn-uid", 0755);
4020
4021 for (;;) {
fbd0b64f 4022 char lock_path[STRLEN("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
8e766630 4023 _cleanup_(release_lock_file) LockFile lf = LOCK_FILE_INIT;
0e7ac751
LP
4024
4025 if (--n_tries <= 0)
4026 return -EBUSY;
4027
87d5e4f2 4028 if (candidate < CONTAINER_UID_BASE_MIN || candidate > CONTAINER_UID_BASE_MAX)
0e7ac751
LP
4029 goto next;
4030 if ((candidate & UINT32_C(0xFFFF)) != 0)
4031 goto next;
4032
4033 xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
4034 r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
4035 if (r == -EBUSY) /* Range already taken by another nspawn instance */
4036 goto next;
4037 if (r < 0)
4038 return r;
4039
4040 /* Make some superficial checks whether the range is currently known in the user database */
4041 if (getpwuid(candidate))
4042 goto next;
4043 if (getpwuid(candidate + UINT32_C(0xFFFE)))
4044 goto next;
4045 if (getgrgid(candidate))
4046 goto next;
4047 if (getgrgid(candidate + UINT32_C(0xFFFE)))
4048 goto next;
4049
4050 *ret_lock_file = lf;
4051 lf = (struct LockFile) LOCK_FILE_INIT;
4052 *shift = candidate;
4053 return 0;
4054
4055 next:
d381c8a6
LP
4056 if (arg_machine && !tried_hashed) {
4057 /* Try to hash the base from the container name */
4058
4059 static const uint8_t hash_key[] = {
4060 0xe1, 0x56, 0xe0, 0xf0, 0x4a, 0xf0, 0x41, 0xaf,
4061 0x96, 0x41, 0xcf, 0x41, 0x33, 0x94, 0xff, 0x72
4062 };
4063
4064 candidate = (uid_t) siphash24(arg_machine, strlen(arg_machine), hash_key);
4065
4066 tried_hashed = true;
4067 } else
4068 random_bytes(&candidate, sizeof(candidate));
4069
87d5e4f2 4070 candidate = (candidate % (CONTAINER_UID_BASE_MAX - CONTAINER_UID_BASE_MIN)) + CONTAINER_UID_BASE_MIN;
0e7ac751
LP
4071 candidate &= (uid_t) UINT32_C(0xFFFF0000);
4072 }
4073}
4074
2f893044
LP
4075static int add_one_uid_map(
4076 char **p,
4077 uid_t container_uid,
4078 uid_t host_uid,
4079 uid_t range) {
4080
4081 return strextendf(p,
4082 UID_FMT " " UID_FMT " " UID_FMT "\n",
4083 container_uid, host_uid, range);
4084}
4085
4086static int make_uid_map_string(
4087 const uid_t bind_user_uid[],
4088 size_t n_bind_user_uid,
4089 size_t offset,
4090 char **ret) {
4091
4092 _cleanup_free_ char *s = NULL;
4093 uid_t previous_uid = 0;
4094 int r;
4095
4096 assert(n_bind_user_uid == 0 || bind_user_uid);
2f092762 4097 assert(IN_SET(offset, 0, 2)); /* used to switch between UID and GID map */
2f893044
LP
4098 assert(ret);
4099
4100 /* The bind_user_uid[] array is a series of 4 uid_t values, for each --bind-user= entry one
4101 * quadruplet, consisting of host and container UID + GID. */
4102
4103 for (size_t i = 0; i < n_bind_user_uid; i++) {
05ab439a
YW
4104 uid_t payload_uid = bind_user_uid[i*4+offset],
4105 host_uid = bind_user_uid[i*4+offset+1];
2f893044
LP
4106
4107 assert(previous_uid <= payload_uid);
4108 assert(payload_uid < arg_uid_range);
4109
4110 /* Add a range to close the gap to previous entry */
4111 if (payload_uid > previous_uid) {
4112 r = add_one_uid_map(&s, previous_uid, arg_uid_shift + previous_uid, payload_uid - previous_uid);
4113 if (r < 0)
4114 return r;
4115 }
4116
4117 /* Map this specific user */
4118 r = add_one_uid_map(&s, payload_uid, host_uid, 1);
4119 if (r < 0)
4120 return r;
4121
4122 previous_uid = payload_uid + 1;
4123 }
4124
4125 /* And add a range to close the gap to finish the range */
4126 if (arg_uid_range > previous_uid) {
4127 r = add_one_uid_map(&s, previous_uid, arg_uid_shift + previous_uid, arg_uid_range - previous_uid);
4128 if (r < 0)
4129 return r;
4130 }
4131
4132 assert(s);
4133
4134 *ret = TAKE_PTR(s);
4135 return 0;
4136}
4137
4138static int setup_uid_map(
4139 pid_t pid,
4140 const uid_t bind_user_uid[],
4141 size_t n_bind_user_uid) {
4142
4143 char uid_map[STRLEN("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1];
4144 _cleanup_free_ char *s = NULL;
03cfe0d5
LP
4145 int r;
4146
4147 assert(pid > 1);
4148
2f893044
LP
4149 /* Build the UID map string */
4150 if (make_uid_map_string(bind_user_uid, n_bind_user_uid, 0, &s) < 0) /* offset=0 contains the UID pair */
4151 return log_oom();
4152
03cfe0d5 4153 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
2f893044 4154 r = write_string_file(uid_map, s, WRITE_STRING_FILE_DISABLE_BUFFER);
03cfe0d5
LP
4155 if (r < 0)
4156 return log_error_errno(r, "Failed to write UID map: %m");
4157
2f893044
LP
4158 /* And now build the GID map string */
4159 s = mfree(s);
4160 if (make_uid_map_string(bind_user_uid, n_bind_user_uid, 2, &s) < 0) /* offset=2 contains the GID pair */
4161 return log_oom();
4162
03cfe0d5 4163 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
2f893044 4164 r = write_string_file(uid_map, s, WRITE_STRING_FILE_DISABLE_BUFFER);
03cfe0d5
LP
4165 if (r < 0)
4166 return log_error_errno(r, "Failed to write GID map: %m");
4167
4168 return 0;
4169}
4170
9c1e04d0 4171static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
9c1e04d0
AP
4172 char buf[NOTIFY_BUFFER_MAX+1];
4173 char *p = NULL;
4174 struct iovec iovec = {
4175 .iov_base = buf,
4176 .iov_len = sizeof(buf)-1,
4177 };
fb29cdbe
LP
4178 CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred)) +
4179 CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)) control;
9c1e04d0
AP
4180 struct msghdr msghdr = {
4181 .msg_iov = &iovec,
4182 .msg_iovlen = 1,
4183 .msg_control = &control,
4184 .msg_controllen = sizeof(control),
4185 };
371d72e0 4186 struct ucred *ucred;
9c1e04d0
AP
4187 ssize_t n;
4188 pid_t inner_child_pid;
4189 _cleanup_strv_free_ char **tags = NULL;
4bf4f50f 4190 int r;
9c1e04d0
AP
4191
4192 assert(userdata);
4193
4194 inner_child_pid = PTR_TO_PID(userdata);
4195
4196 if (revents != EPOLLIN) {
4197 log_warning("Got unexpected poll event for notify fd.");
4198 return 0;
4199 }
4200
3691bcf3 4201 n = recvmsg_safe(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
bb44fd07
ZJS
4202 if (ERRNO_IS_NEG_TRANSIENT(n))
4203 return 0;
4204 else if (n == -EXFULL) {
4205 log_warning("Got message with truncated control data (too many fds sent?), ignoring.");
4206 return 0;
4207 } else if (n < 0)
3691bcf3 4208 return log_warning_errno(n, "Couldn't read notification socket: %m");
9c1e04d0 4209
9c1e04d0
AP
4210 cmsg_close_all(&msghdr);
4211
371d72e0 4212 ucred = CMSG_FIND_DATA(&msghdr, SOL_SOCKET, SCM_CREDENTIALS, struct ucred);
9c1e04d0 4213 if (!ucred || ucred->pid != inner_child_pid) {
8cb57430 4214 log_debug("Received notify message without valid credentials. Ignoring.");
9c1e04d0
AP
4215 return 0;
4216 }
4217
4218 if ((size_t) n >= sizeof(buf)) {
4219 log_warning("Received notify message exceeded maximum size. Ignoring.");
4220 return 0;
4221 }
4222
4223 buf[n] = 0;
4224 tags = strv_split(buf, "\n\r");
4225 if (!tags)
4226 return log_oom();
4227
d29cc4d6 4228 if (strv_contains(tags, "READY=1")) {
d4341b76 4229 r = sd_notify(false, "READY=1\n");
4bf4f50f
ZJS
4230 if (r < 0)
4231 log_warning_errno(r, "Failed to send readiness notification, ignoring: %m");
4232 }
9c1e04d0
AP
4233
4234 p = strv_find_startswith(tags, "STATUS=");
4235 if (p)
04f590a4 4236 (void) sd_notifyf(false, "STATUS=Container running: %s", p);
9c1e04d0
AP
4237
4238 return 0;
4239}
4240
e96ceaba 4241static int setup_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid, sd_event_source **notify_event_source) {
9c1e04d0 4242 int r;
9c1e04d0 4243
5773024d 4244 r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
9c1e04d0
AP
4245 if (r < 0)
4246 return log_error_errno(r, "Failed to allocate notify event source: %m");
4247
5773024d 4248 (void) sd_event_source_set_description(*notify_event_source, "nspawn-notify");
9c1e04d0
AP
4249
4250 return 0;
4251}
4252
5d961407
LP
4253static int merge_settings(Settings *settings, const char *path) {
4254 int rl;
f757855e 4255
5d961407
LP
4256 assert(settings);
4257 assert(path);
f757855e 4258
5d961407
LP
4259 /* Copy over bits from the settings, unless they have been explicitly masked by command line switches. Note
4260 * that this steals the fields of the Settings* structure, and hence modifies it. */
f757855e 4261
7732f92b
LP
4262 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
4263 settings->start_mode >= 0) {
4264 arg_start_mode = settings->start_mode;
130d3d22 4265 strv_free_and_replace(arg_parameters, settings->parameters);
f757855e
LP
4266 }
4267
d3689b94
LP
4268 if ((arg_settings_mask & SETTING_EPHEMERAL) == 0 &&
4269 settings->ephemeral >= 0)
a2f577fc
JL
4270 arg_ephemeral = settings->ephemeral;
4271
de40a303
LP
4272 if ((arg_settings_mask & SETTING_DIRECTORY) == 0 &&
4273 settings->root) {
4274
4275 if (!arg_settings_trusted)
4276 log_warning("Ignoring root directory setting, file %s is not trusted.", path);
4277 else
4278 free_and_replace(arg_directory, settings->root);
4279 }
4280
b53ede69
PW
4281 if ((arg_settings_mask & SETTING_PIVOT_ROOT) == 0 &&
4282 settings->pivot_root_new) {
4283 free_and_replace(arg_pivot_root_new, settings->pivot_root_new);
4284 free_and_replace(arg_pivot_root_old, settings->pivot_root_old);
4285 }
4286
5f932eb9 4287 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
1cc6c93a
YW
4288 settings->working_directory)
4289 free_and_replace(arg_chdir, settings->working_directory);
5f932eb9 4290
f757855e 4291 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
130d3d22
YW
4292 settings->environment)
4293 strv_free_and_replace(arg_setenv, settings->environment);
f757855e 4294
de40a303
LP
4295 if ((arg_settings_mask & SETTING_USER) == 0) {
4296
4297 if (settings->user)
4298 free_and_replace(arg_user, settings->user);
4299
4300 if (uid_is_valid(settings->uid))
4301 arg_uid = settings->uid;
4302 if (gid_is_valid(settings->gid))
4303 arg_gid = settings->gid;
4304 if (settings->n_supplementary_gids > 0) {
4305 free_and_replace(arg_supplementary_gids, settings->supplementary_gids);
4306 arg_n_supplementary_gids = settings->n_supplementary_gids;
4307 }
4308 }
f757855e
LP
4309
4310 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
a3fc6b55 4311 uint64_t plus, minus;
7be830c6 4312 uint64_t network_minus = 0;
88fc9c9b 4313 uint64_t ambient;
f757855e 4314
de40a303
LP
4315 /* Note that we copy both the simple plus/minus caps here, and the full quintet from the
4316 * Settings structure */
4317
0e265674 4318 plus = settings->capability;
a3fc6b55
LP
4319 minus = settings->drop_capability;
4320
9baa294c
LP
4321 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
4322 settings_network_configured(settings)) {
a3fc6b55
LP
4323 if (settings_private_network(settings))
4324 plus |= UINT64_C(1) << CAP_NET_ADMIN;
4325 else
7be830c6 4326 network_minus |= UINT64_C(1) << CAP_NET_ADMIN;
a3fc6b55 4327 }
0e265674
LP
4328
4329 if (!arg_settings_trusted && plus != 0) {
4330 if (settings->capability != 0)
5d961407 4331 log_warning("Ignoring Capability= setting, file %s is not trusted.", path);
7be830c6
TH
4332 } else {
4333 arg_caps_retain &= ~network_minus;
520e0d54 4334 arg_caps_retain |= plus;
7be830c6 4335 }
f757855e 4336
a3fc6b55 4337 arg_caps_retain &= ~minus;
de40a303
LP
4338
4339 /* Copy the full capabilities over too */
4340 if (capability_quintet_is_set(&settings->full_capabilities)) {
4341 if (!arg_settings_trusted)
5238e957 4342 log_warning("Ignoring capability settings, file %s is not trusted.", path);
de40a303
LP
4343 else
4344 arg_full_capabilities = settings->full_capabilities;
4345 }
88fc9c9b
TH
4346
4347 ambient = settings->ambient_capability;
4348 if (!arg_settings_trusted && ambient != 0)
4349 log_warning("Ignoring AmbientCapability= setting, file %s is not trusted.", path);
4350 else
4351 arg_caps_ambient |= ambient;
f757855e
LP
4352 }
4353
4354 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
4355 settings->kill_signal > 0)
4356 arg_kill_signal = settings->kill_signal;
4357
4358 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
4359 settings->personality != PERSONALITY_INVALID)
4360 arg_personality = settings->personality;
4361
4362 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
4363 !sd_id128_is_null(settings->machine_id)) {
4364
4365 if (!arg_settings_trusted)
5d961407 4366 log_warning("Ignoring MachineID= setting, file %s is not trusted.", path);
f757855e
LP
4367 else
4368 arg_uuid = settings->machine_id;
4369 }
4370
4371 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
4372 settings->read_only >= 0)
4373 arg_read_only = settings->read_only;
4374
4375 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
4376 settings->volatile_mode != _VOLATILE_MODE_INVALID)
4377 arg_volatile_mode = settings->volatile_mode;
4378
4379 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
4380 settings->n_custom_mounts > 0) {
4381
4382 if (!arg_settings_trusted)
5d961407 4383 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", path);
f757855e
LP
4384 else {
4385 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
1cc6c93a 4386 arg_custom_mounts = TAKE_PTR(settings->custom_mounts);
f757855e 4387 arg_n_custom_mounts = settings->n_custom_mounts;
f757855e
LP
4388 settings->n_custom_mounts = 0;
4389 }
4390 }
4391
4392 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
a1dfd585 4393 settings_network_configured(settings)) {
f757855e
LP
4394
4395 if (!arg_settings_trusted)
5d961407 4396 log_warning("Ignoring network settings, file %s is not trusted.", path);
f757855e 4397 else {
f6d6bad1 4398 arg_network_veth = settings_network_veth(settings);
0e265674
LP
4399 arg_private_network = settings_private_network(settings);
4400
130d3d22
YW
4401 strv_free_and_replace(arg_network_interfaces, settings->network_interfaces);
4402 strv_free_and_replace(arg_network_macvlan, settings->network_macvlan);
4403 strv_free_and_replace(arg_network_ipvlan, settings->network_ipvlan);
4404 strv_free_and_replace(arg_network_veth_extra, settings->network_veth_extra);
f6d6bad1 4405
1cc6c93a
YW
4406 free_and_replace(arg_network_bridge, settings->network_bridge);
4407 free_and_replace(arg_network_zone, settings->network_zone);
de40a303
LP
4408
4409 free_and_replace(arg_network_namespace_path, settings->network_namespace_path);
f757855e
LP
4410 }
4411 }
4412
4413 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
4414 settings->expose_ports) {
4415
4416 if (!arg_settings_trusted)
5d961407 4417 log_warning("Ignoring Port= setting, file %s is not trusted.", path);
f757855e
LP
4418 else {
4419 expose_port_free_all(arg_expose_ports);
1cc6c93a 4420 arg_expose_ports = TAKE_PTR(settings->expose_ports);
f757855e
LP
4421 }
4422 }
4423
0de7acce
LP
4424 if ((arg_settings_mask & SETTING_USERNS) == 0 &&
4425 settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
4426
4427 if (!arg_settings_trusted)
5d961407 4428 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", path);
0de7acce
LP
4429 else {
4430 arg_userns_mode = settings->userns_mode;
4431 arg_uid_shift = settings->uid_shift;
4432 arg_uid_range = settings->uid_range;
6c045a99 4433 arg_userns_ownership = settings->userns_ownership;
0de7acce
LP
4434 }
4435 }
4436
0cc3c9f9
LP
4437 if ((arg_settings_mask & SETTING_BIND_USER) == 0 &&
4438 !strv_isempty(settings->bind_user))
2f893044
LP
4439 strv_free_and_replace(arg_bind_user, settings->bind_user);
4440
d3689b94
LP
4441 if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0 &&
4442 settings->notify_ready >= 0)
9c1e04d0
AP
4443 arg_notify_ready = settings->notify_ready;
4444
960e4569
LP
4445 if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) {
4446
2d09ea44
LP
4447 if (!strv_isempty(settings->syscall_allow_list) || !strv_isempty(settings->syscall_deny_list)) {
4448 if (!arg_settings_trusted && !strv_isempty(settings->syscall_allow_list))
4449 log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", path);
4450 else {
4451 strv_free_and_replace(arg_syscall_allow_list, settings->syscall_allow_list);
4452 strv_free_and_replace(arg_syscall_deny_list, settings->syscall_deny_list);
4453 }
960e4569 4454 }
de40a303
LP
4455
4456#if HAVE_SECCOMP
2d09ea44
LP
4457 if (settings->seccomp) {
4458 if (!arg_settings_trusted)
4459 log_warning("Ignoring SECCOMP filter, file %s is not trusted.", path);
4460 else {
4461 seccomp_release(arg_seccomp);
4462 arg_seccomp = TAKE_PTR(settings->seccomp);
4463 }
de40a303
LP
4464 }
4465#endif
960e4569
LP
4466 }
4467
bf428efb
LP
4468 for (rl = 0; rl < _RLIMIT_MAX; rl ++) {
4469 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)))
4470 continue;
4471
4472 if (!settings->rlimit[rl])
4473 continue;
4474
4475 if (!arg_settings_trusted) {
5d961407 4476 log_warning("Ignoring Limit%s= setting, file '%s' is not trusted.", rlimit_to_string(rl), path);
bf428efb
LP
4477 continue;
4478 }
4479
4480 free_and_replace(arg_rlimit[rl], settings->rlimit[rl]);
4481 }
4482
3a9530e5
LP
4483 if ((arg_settings_mask & SETTING_HOSTNAME) == 0 &&
4484 settings->hostname)
4485 free_and_replace(arg_hostname, settings->hostname);
4486
66edd963
LP
4487 if ((arg_settings_mask & SETTING_NO_NEW_PRIVILEGES) == 0 &&
4488 settings->no_new_privileges >= 0)
4489 arg_no_new_privileges = settings->no_new_privileges;
4490
81f345df
LP
4491 if ((arg_settings_mask & SETTING_OOM_SCORE_ADJUST) == 0 &&
4492 settings->oom_score_adjust_set) {
4493
4494 if (!arg_settings_trusted)
5d961407 4495 log_warning("Ignoring OOMScoreAdjust= setting, file '%s' is not trusted.", path);
81f345df
LP
4496 else {
4497 arg_oom_score_adjust = settings->oom_score_adjust;
4498 arg_oom_score_adjust_set = true;
4499 }
4500 }
4501
d107bb7d 4502 if ((arg_settings_mask & SETTING_CPU_AFFINITY) == 0 &&
0985c7c4 4503 settings->cpu_set.set) {
d107bb7d
LP
4504
4505 if (!arg_settings_trusted)
5d961407 4506 log_warning("Ignoring CPUAffinity= setting, file '%s' is not trusted.", path);
d107bb7d 4507 else {
0985c7c4 4508 cpu_set_reset(&arg_cpu_set);
088d71f8 4509 arg_cpu_set = TAKE_STRUCT(settings->cpu_set);
d107bb7d
LP
4510 }
4511 }
4512
09d423e9
LP
4513 if ((arg_settings_mask & SETTING_RESOLV_CONF) == 0 &&
4514 settings->resolv_conf != _RESOLV_CONF_MODE_INVALID)
4515 arg_resolv_conf = settings->resolv_conf;
4516
4e1d6aa9
LP
4517 if ((arg_settings_mask & SETTING_LINK_JOURNAL) == 0 &&
4518 settings->link_journal != _LINK_JOURNAL_INVALID) {
4519
4520 if (!arg_settings_trusted)
4521 log_warning("Ignoring journal link setting, file '%s' is not trusted.", path);
4522 else {
4523 arg_link_journal = settings->link_journal;
4524 arg_link_journal_try = settings->link_journal_try;
4525 }
4526 }
4527
1688841f
LP
4528 if ((arg_settings_mask & SETTING_TIMEZONE) == 0 &&
4529 settings->timezone != _TIMEZONE_MODE_INVALID)
4530 arg_timezone = settings->timezone;
4531
de40a303
LP
4532 if ((arg_settings_mask & SETTING_SLICE) == 0 &&
4533 settings->slice) {
4534
4535 if (!arg_settings_trusted)
4536 log_warning("Ignoring slice setting, file '%s' is not trusted.", path);
4537 else
4538 free_and_replace(arg_slice, settings->slice);
4539 }
4540
4541 if ((arg_settings_mask & SETTING_USE_CGNS) == 0 &&
4542 settings->use_cgns >= 0) {
4543
4544 if (!arg_settings_trusted)
4545 log_warning("Ignoring cgroup namespace setting, file '%s' is not trusted.", path);
4546 else
4547 arg_use_cgns = settings->use_cgns;
4548 }
4549
4550 if ((arg_settings_mask & SETTING_CLONE_NS_FLAGS) == 0 &&
f5fbe71d 4551 settings->clone_ns_flags != ULONG_MAX) {
de40a303
LP
4552
4553 if (!arg_settings_trusted)
4554 log_warning("Ignoring namespace setting, file '%s' is not trusted.", path);
4555 else
4556 arg_clone_ns_flags = settings->clone_ns_flags;
4557 }
4558
4559 if ((arg_settings_mask & SETTING_CONSOLE_MODE) == 0 &&
4560 settings->console_mode >= 0) {
4561
4562 if (!arg_settings_trusted)
4563 log_warning("Ignoring console mode setting, file '%s' is not trusted.", path);
4564 else
4565 arg_console_mode = settings->console_mode;
4566 }
4567
d3689b94
LP
4568 if ((arg_settings_mask & SETTING_SUPPRESS_SYNC) == 0 &&
4569 settings->suppress_sync >= 0)
4a4654e0
LP
4570 arg_suppress_sync = settings->suppress_sync;
4571
de40a303
LP
4572 /* The following properties can only be set through the OCI settings logic, not from the command line, hence we
4573 * don't consult arg_settings_mask for them. */
4574
4575 sd_bus_message_unref(arg_property_message);
4576 arg_property_message = TAKE_PTR(settings->properties);
4577
4578 arg_console_width = settings->console_width;
4579 arg_console_height = settings->console_height;
4580
b2645747 4581 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
de40a303
LP
4582 arg_extra_nodes = TAKE_PTR(settings->extra_nodes);
4583 arg_n_extra_nodes = settings->n_extra_nodes;
825210d4 4584 settings->n_extra_nodes = 0;
de40a303 4585
f757855e
LP
4586 return 0;
4587}
4588
5d961407
LP
4589static int load_settings(void) {
4590 _cleanup_(settings_freep) Settings *settings = NULL;
4591 _cleanup_fclose_ FILE *f = NULL;
3603f151 4592 _cleanup_free_ char *p = NULL;
5d961407
LP
4593 int r;
4594
de40a303
LP
4595 if (arg_oci_bundle)
4596 return 0;
4597
5d961407
LP
4598 /* If all settings are masked, there's no point in looking for
4599 * the settings file */
d7a0f1f4 4600 if (FLAGS_SET(arg_settings_mask, _SETTINGS_MASK_ALL))
5d961407
LP
4601 return 0;
4602
5d961407
LP
4603 /* We first look in the admin's directories in /etc and /run */
4604 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
4605 _cleanup_free_ char *j = NULL;
4606
3603f151 4607 j = path_join(i, arg_settings_filename);
5d961407
LP
4608 if (!j)
4609 return log_oom();
4610
4611 f = fopen(j, "re");
4612 if (f) {
4613 p = TAKE_PTR(j);
4614
4615 /* By default, we trust configuration from /etc and /run */
4616 if (arg_settings_trusted < 0)
4617 arg_settings_trusted = true;
4618
4619 break;
4620 }
4621
4622 if (errno != ENOENT)
4623 return log_error_errno(errno, "Failed to open %s: %m", j);
4624 }
4625
4626 if (!f) {
4627 /* After that, let's look for a file next to the
4628 * actual image we shall boot. */
4629
4630 if (arg_image) {
162f6477
LP
4631 r = file_in_same_dir(arg_image, arg_settings_filename, &p);
4632 if (r < 0)
4633 return log_error_errno(r, "Failed to generate settings path from image path: %m");
4634 } else if (arg_directory) {
4635 r = file_in_same_dir(arg_directory, arg_settings_filename, &p);
4636 if (r < 0 && r != -EADDRNOTAVAIL) /* if directory is root fs, don't complain */
4637 return log_error_errno(r, "Failed to generate settings path from directory path: %m");
5d961407
LP
4638 }
4639
4640 if (p) {
4641 f = fopen(p, "re");
4642 if (!f && errno != ENOENT)
4643 return log_error_errno(errno, "Failed to open %s: %m", p);
4644
4645 /* By default, we do not trust configuration from /var/lib/machines */
4646 if (arg_settings_trusted < 0)
4647 arg_settings_trusted = false;
4648 }
4649 }
4650
4651 if (!f)
4652 return 0;
4653
4654 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
4655
4656 r = settings_load(f, p, &settings);
4657 if (r < 0)
4658 return r;
4659
4660 return merge_settings(settings, p);
4661}
4662
de40a303
LP
4663static int load_oci_bundle(void) {
4664 _cleanup_(settings_freep) Settings *settings = NULL;
4665 int r;
4666
4667 if (!arg_oci_bundle)
4668 return 0;
4669
4670 /* By default let's trust OCI bundles */
4671 if (arg_settings_trusted < 0)
4672 arg_settings_trusted = true;
4673
4674 r = oci_load(NULL, arg_oci_bundle, &settings);
4675 if (r < 0)
4676 return r;
4677
4678 return merge_settings(settings, arg_oci_bundle);
4679}
4680
3acc84eb 4681static int run_container(
2d845785 4682 DissectedImage *dissected_image,
b0067625
ZJS
4683 FDSet *fds,
4684 char veth_name[IFNAMSIZ], bool *veth_created,
761cf19d 4685 struct ExposeArgs *expose_args,
3acc84eb 4686 int *master, pid_t *pid, int *ret) {
b0067625
ZJS
4687
4688 static const struct sigaction sa = {
4689 .sa_handler = nop_signal_handler,
e28c7cd0 4690 .sa_flags = SA_NOCLDSTOP|SA_RESTART,
b0067625
ZJS
4691 };
4692
8e766630 4693 _cleanup_(release_lock_file) LockFile uid_shift_lock = LOCK_FILE_INIT;
5bb1d7fb 4694 _cleanup_close_ int etc_passwd_lock = -EBADF;
b0067625 4695 _cleanup_close_pair_ int
71136404
LP
4696 fd_inner_socket_pair[2] = EBADF_PAIR,
4697 fd_outer_socket_pair[2] = EBADF_PAIR;
8199d554 4698
5bb1d7fb 4699 _cleanup_close_ int notify_socket = -EBADF, mntns_fd = -EBADF, fd_kmsg_fifo = -EBADF;
b0067625 4700 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
5773024d 4701 _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL;
b0067625
ZJS
4702 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
4703 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4704 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
abdb9b08 4705 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
2f893044
LP
4706 _cleanup_free_ uid_t *bind_user_uid = NULL;
4707 size_t n_bind_user_uid = 0;
b0067625 4708 ContainerStatus container_status = 0;
b0067625
ZJS
4709 int ifi = 0, r;
4710 ssize_t l;
4711 sigset_t mask_chld;
254d1313 4712 _cleanup_close_ int child_netns_fd = -EBADF;
b0067625
ZJS
4713
4714 assert_se(sigemptyset(&mask_chld) == 0);
4715 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
4716
4717 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4718 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
4719 * check with getpwuid() if the specific user already exists. Note that /etc might be
4720 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
4721 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
4722 * really just an extra safety net. We kinda assume that the UID range we allocate from is
4723 * really ours. */
4724
4725 etc_passwd_lock = take_etc_passwd_lock(NULL);
4726 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS)
4727 return log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m");
4728 }
4729
4730 r = barrier_create(&barrier);
4731 if (r < 0)
4732 return log_error_errno(r, "Cannot initialize IPC barrier: %m");
4733
5d9d3fcb
CB
4734 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, fd_inner_socket_pair) < 0)
4735 return log_error_errno(errno, "Failed to create inner socket pair: %m");
4736
af06cd30
CB
4737 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, fd_outer_socket_pair) < 0)
4738 return log_error_errno(errno, "Failed to create outer socket pair: %m");
b0067625 4739
b0067625
ZJS
4740 /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
4741 * parent's blocking calls and give it a chance to call wait() and terminate. */
4742 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
4743 if (r < 0)
4744 return log_error_errno(errno, "Failed to change the signal mask: %m");
4745
4746 r = sigaction(SIGCHLD, &sa, NULL);
4747 if (r < 0)
4748 return log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
4749
d7bea6b6 4750 if (arg_network_namespace_path) {
5b4855ab
DDM
4751 child_netns_fd = open(arg_network_namespace_path, O_RDONLY|O_NOCTTY|O_CLOEXEC);
4752 if (child_netns_fd < 0)
d7bea6b6
DP
4753 return log_error_errno(errno, "Cannot open file %s: %m", arg_network_namespace_path);
4754
54c2459d 4755 r = fd_is_ns(child_netns_fd, CLONE_NEWNET);
6619ad88
LP
4756 if (r == -EUCLEAN)
4757 log_debug_errno(r, "Cannot determine if passed network namespace path '%s' really refers to a network namespace, assuming it does.", arg_network_namespace_path);
4758 else if (r < 0)
d7bea6b6 4759 return log_error_errno(r, "Failed to check %s fs type: %m", arg_network_namespace_path);
c6147113
LP
4760 else if (r == 0)
4761 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
4762 "Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path);
d7bea6b6
DP
4763 }
4764
b0067625
ZJS
4765 *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
4766 if (*pid < 0)
4767 return log_error_errno(errno, "clone() failed%s: %m",
4768 errno == EINVAL ?
4769 ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
4770
4771 if (*pid == 0) {
4772 /* The outer child only has a file system namespace. */
4773 barrier_set_role(&barrier, BARRIER_CHILD);
4774
5d9d3fcb 4775 fd_inner_socket_pair[0] = safe_close(fd_inner_socket_pair[0]);
af06cd30 4776 fd_outer_socket_pair[0] = safe_close(fd_outer_socket_pair[0]);
b0067625
ZJS
4777
4778 (void) reset_all_signal_handlers();
4779 (void) reset_signal_mask();
4780
4781 r = outer_child(&barrier,
4782 arg_directory,
2d845785 4783 dissected_image,
af06cd30 4784 fd_outer_socket_pair[1],
5d9d3fcb 4785 fd_inner_socket_pair[1],
d7bea6b6 4786 fds,
5b4855ab 4787 child_netns_fd);
b0067625
ZJS
4788 if (r < 0)
4789 _exit(EXIT_FAILURE);
4790
4791 _exit(EXIT_SUCCESS);
4792 }
4793
4794 barrier_set_role(&barrier, BARRIER_PARENT);
4795
e4077ff6 4796 fdset_close(fds);
b0067625 4797
5d9d3fcb 4798 fd_inner_socket_pair[1] = safe_close(fd_inner_socket_pair[1]);
af06cd30 4799 fd_outer_socket_pair[1] = safe_close(fd_outer_socket_pair[1]);
b0067625
ZJS
4800
4801 if (arg_userns_mode != USER_NAMESPACE_NO) {
af06cd30 4802 mntns_fd = receive_one_fd(fd_outer_socket_pair[0], 0);
b71a0192
CB
4803 if (mntns_fd < 0)
4804 return log_error_errno(mntns_fd, "Failed to receive mount namespace fd from outer child: %m");
4805
b0067625 4806 /* The child just let us know the UID shift it might have read from the image. */
af06cd30 4807 l = recv(fd_outer_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
b0067625
ZJS
4808 if (l < 0)
4809 return log_error_errno(errno, "Failed to read UID shift: %m");
c6147113
LP
4810 if (l != sizeof arg_uid_shift)
4811 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading UID shift.");
b0067625
ZJS
4812
4813 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4814 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
4815 * image, but if that's already in use, pick a new one, and report back to the child,
4816 * which one we now picked. */
4817
4818 r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
4819 if (r < 0)
4820 return log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
4821
af06cd30 4822 l = send(fd_outer_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
b0067625
ZJS
4823 if (l < 0)
4824 return log_error_errno(errno, "Failed to send UID shift: %m");
c6147113
LP
4825 if (l != sizeof arg_uid_shift)
4826 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short write while writing UID shift.");
b0067625 4827 }
2f893044
LP
4828
4829 n_bind_user_uid = strv_length(arg_bind_user);
4830 if (n_bind_user_uid > 0) {
4831 /* Right after the UID shift, we'll receive the list of UID mappings for the
4832 * --bind-user= logic. Always a quadruplet of payload and host UID + GID. */
4833
4834 bind_user_uid = new(uid_t, n_bind_user_uid*4);
4835 if (!bind_user_uid)
4836 return log_oom();
4837
4838 for (size_t i = 0; i < n_bind_user_uid; i++) {
af06cd30 4839 l = recv(fd_outer_socket_pair[0], bind_user_uid + i*4, sizeof(uid_t)*4, 0);
2f893044
LP
4840 if (l < 0)
4841 return log_error_errno(errno, "Failed to read user UID map pair: %m");
4842 if (l != sizeof(uid_t)*4)
4843 return log_full_errno(l == 0 ? LOG_DEBUG : LOG_WARNING,
4844 SYNTHETIC_ERRNO(EIO),
4845 "Short read while reading bind user UID pairs.");
4846 }
4847 }
b0067625
ZJS
4848 }
4849
8199d554
LP
4850 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
4851 /* The child let us know the support cgroup mode it might have read from the image. */
fefb7a6d 4852 l = recv(fd_outer_socket_pair[0], &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), 0);
8199d554
LP
4853 if (l < 0)
4854 return log_error_errno(errno, "Failed to read cgroup mode: %m");
c6147113 4855 if (l != sizeof(arg_unified_cgroup_hierarchy))
c0f86d66 4856 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading cgroup mode (%zi bytes).%s",
c6147113 4857 l, l == 0 ? " The child is most likely dead." : "");
8199d554
LP
4858 }
4859
b0067625 4860 /* Wait for the outer child. */
d2e0ac3d
LP
4861 r = wait_for_terminate_and_check("(sd-namespace)", *pid, WAIT_LOG_ABNORMAL);
4862 if (r < 0)
4863 return r;
4864 if (r != EXIT_SUCCESS)
4865 return -EIO;
b0067625
ZJS
4866
4867 /* And now retrieve the PID of the inner child. */
af06cd30 4868 l = recv(fd_outer_socket_pair[0], pid, sizeof *pid, 0);
b0067625
ZJS
4869 if (l < 0)
4870 return log_error_errno(errno, "Failed to read inner child PID: %m");
c6147113
LP
4871 if (l != sizeof *pid)
4872 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading inner child PID.");
b0067625
ZJS
4873
4874 /* We also retrieve container UUID in case it was generated by outer child */
af06cd30 4875 l = recv(fd_outer_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
b0067625
ZJS
4876 if (l < 0)
4877 return log_error_errno(errno, "Failed to read container machine ID: %m");
c6147113
LP
4878 if (l != sizeof(arg_uuid))
4879 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading container machined ID.");
b0067625
ZJS
4880
4881 /* We also retrieve the socket used for notifications generated by outer child */
af06cd30 4882 notify_socket = receive_one_fd(fd_outer_socket_pair[0], 0);
b0067625
ZJS
4883 if (notify_socket < 0)
4884 return log_error_errno(notify_socket,
4885 "Failed to receive notification socket from the outer child: %m");
4886
4887 log_debug("Init process invoked as PID "PID_FMT, *pid);
4888
4889 if (arg_userns_mode != USER_NAMESPACE_NO) {
c6147113
LP
4890 if (!barrier_place_and_sync(&barrier)) /* #1 */
4891 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
b0067625 4892
2f893044 4893 r = setup_uid_map(*pid, bind_user_uid, n_bind_user_uid);
b0067625
ZJS
4894 if (r < 0)
4895 return r;
4896
4897 (void) barrier_place(&barrier); /* #2 */
4898 }
4899
4900 if (arg_private_network) {
75116558
PS
4901 if (!arg_network_namespace_path) {
4902 /* Wait until the child has unshared its network namespace. */
c6147113
LP
4903 if (!barrier_place_and_sync(&barrier)) /* #3 */
4904 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early");
75116558
PS
4905 }
4906
5b4855ab
DDM
4907 if (child_netns_fd < 0) {
4908 /* Make sure we have an open file descriptor to the child's network
4909 * namespace so it stays alive even if the child exits. */
4910 r = namespace_open(*pid, NULL, NULL, &child_netns_fd, NULL, NULL);
4911 if (r < 0)
4912 return log_error_errno(r, "Failed to open child network namespace: %m");
4913 }
4914
4915 r = move_network_interfaces(child_netns_fd, arg_network_interfaces);
b0067625
ZJS
4916 if (r < 0)
4917 return r;
4918
4919 if (arg_network_veth) {
4920 r = setup_veth(arg_machine, *pid, veth_name,
813dbff4 4921 arg_network_bridge || arg_network_zone, &arg_network_provided_mac);
b0067625
ZJS
4922 if (r < 0)
4923 return r;
4924 else if (r > 0)
4925 ifi = r;
4926
4927 if (arg_network_bridge) {
4928 /* Add the interface to a bridge */
4929 r = setup_bridge(veth_name, arg_network_bridge, false);
4930 if (r < 0)
4931 return r;
4932 if (r > 0)
4933 ifi = r;
4934 } else if (arg_network_zone) {
4935 /* Add the interface to a bridge, possibly creating it */
4936 r = setup_bridge(veth_name, arg_network_zone, true);
4937 if (r < 0)
4938 return r;
4939 if (r > 0)
4940 ifi = r;
4941 }
4942 }
4943
4944 r = setup_veth_extra(arg_machine, *pid, arg_network_veth_extra);
4945 if (r < 0)
4946 return r;
4947
4948 /* We created the primary and extra veth links now; let's remember this, so that we know to
4949 remove them later on. Note that we don't bother with removing veth links that were created
4950 here when their setup failed half-way, because in that case the kernel should be able to
4951 remove them on its own, since they cannot be referenced by anything yet. */
4952 *veth_created = true;
4953
4954 r = setup_macvlan(arg_machine, *pid, arg_network_macvlan);
4955 if (r < 0)
4956 return r;
4957
4958 r = setup_ipvlan(arg_machine, *pid, arg_network_ipvlan);
4959 if (r < 0)
4960 return r;
4961 }
4962
abdb9b08
LP
4963 if (arg_register || !arg_keep_unit) {
4964 r = sd_bus_default_system(&bus);
4965 if (r < 0)
4966 return log_error_errno(r, "Failed to open system bus: %m");
e5a2d8b5
LP
4967
4968 r = sd_bus_set_close_on_exit(bus, false);
4969 if (r < 0)
4970 return log_error_errno(r, "Failed to disable close-on-exit behaviour: %m");
abdb9b08
LP
4971 }
4972
4973 if (!arg_keep_unit) {
4974 /* When a new scope is created for this container, then we'll be registered as its controller, in which
4975 * case PID 1 will send us a friendly RequestStop signal, when it is asked to terminate the
4976 * scope. Let's hook into that, and cleanly shut down the container, and print a friendly message. */
4977
75152a4d
LP
4978 r = sd_bus_match_signal_async(
4979 bus,
4980 NULL,
4981 "org.freedesktop.systemd1",
4982 NULL,
4983 "org.freedesktop.systemd1.Scope",
4984 "RequestStop",
4985 on_request_stop, NULL, PID_TO_PTR(*pid));
abdb9b08 4986 if (r < 0)
75152a4d 4987 return log_error_errno(r, "Failed to request RequestStop match: %m");
abdb9b08
LP
4988 }
4989
b0067625
ZJS
4990 if (arg_register) {
4991 r = register_machine(
abdb9b08 4992 bus,
b0067625
ZJS
4993 arg_machine,
4994 *pid,
4995 arg_directory,
4996 arg_uuid,
4997 ifi,
4998 arg_slice,
4999 arg_custom_mounts, arg_n_custom_mounts,
5000 arg_kill_signal,
5001 arg_property,
de40a303 5002 arg_property_message,
b0067625 5003 arg_keep_unit,
411d8c72
NR
5004 arg_container_service_name,
5005 arg_start_mode);
b0067625
ZJS
5006 if (r < 0)
5007 return r;
abdb9b08 5008
cd2dfc6f
LP
5009 } else if (!arg_keep_unit) {
5010 r = allocate_scope(
abdb9b08 5011 bus,
cd2dfc6f
LP
5012 arg_machine,
5013 *pid,
5014 arg_slice,
5015 arg_custom_mounts, arg_n_custom_mounts,
5016 arg_kill_signal,
de40a303 5017 arg_property,
7eda208f 5018 arg_property_message,
411d8c72
NR
5019 /* allow_pidfds= */ true,
5020 arg_start_mode);
cd2dfc6f
LP
5021 if (r < 0)
5022 return r;
5023
5024 } else if (arg_slice || arg_property)
5025 log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect.");
b0067625 5026
27da7ef0 5027 r = create_subcgroup(*pid, arg_keep_unit, arg_unified_cgroup_hierarchy);
b0067625
ZJS
5028 if (r < 0)
5029 return r;
5030
27da7ef0 5031 r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
720f0a2f
LP
5032 if (r < 0)
5033 return r;
b0067625 5034
de54e02d 5035 r = chown_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
b0067625
ZJS
5036 if (r < 0)
5037 return r;
5038
5039 /* Notify the child that the parent is ready with all
5040 * its setup (including cgroup-ification), and that
5041 * the child can now hand over control to the code to
5042 * run inside the container. */
75116558 5043 (void) barrier_place(&barrier); /* #4 */
b0067625
ZJS
5044
5045 /* Block SIGCHLD here, before notifying child.
5046 * process_pty() will handle it with the other signals. */
5047 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
5048
5049 /* Reset signal to default */
9c274488 5050 r = default_signals(SIGCHLD);
b0067625
ZJS
5051 if (r < 0)
5052 return log_error_errno(r, "Failed to reset SIGCHLD: %m");
5053
5054 r = sd_event_new(&event);
5055 if (r < 0)
5056 return log_error_errno(r, "Failed to get default event source: %m");
5057
8fd010bb
LP
5058 (void) sd_event_set_watchdog(event, true);
5059
abdb9b08
LP
5060 if (bus) {
5061 r = sd_bus_attach_event(bus, event, 0);
5062 if (r < 0)
5063 return log_error_errno(r, "Failed to attach bus to event loop: %m");
5064 }
5065
e96ceaba 5066 r = setup_notify_parent(event, notify_socket, PID_TO_PTR(*pid), &notify_event_source);
b0067625
ZJS
5067 if (r < 0)
5068 return r;
5069
1a8d7814
LP
5070 /* Wait that the child is completely ready now, and has mounted their own copies of procfs and so on,
5071 * before we take the fully visible instances away. */
5072 if (!barrier_sync(&barrier)) /* #5.1 */
5073 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
5074
b71a0192
CB
5075 if (arg_userns_mode != USER_NAMESPACE_NO) {
5076 r = wipe_fully_visible_fs(mntns_fd);
5077 if (r < 0)
5078 return r;
5079 mntns_fd = safe_close(mntns_fd);
5080 }
5081
1a8d7814
LP
5082 /* And now let the child know that we completed removing the procfs instances, and it can start the
5083 * payload. */
5084 if (!barrier_place(&barrier)) /* #5.2 */
c6147113 5085 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
b0067625 5086
38ccb557 5087 /* At this point we have made use of the UID we picked, and thus nss-systemd/systemd-machined.service
b0067625
ZJS
5088 * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
5089 etc_passwd_lock = safe_close(etc_passwd_lock);
5090
04f590a4
LP
5091 (void) sd_notifyf(false,
5092 "STATUS=Container running.\n"
5093 "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
4bf4f50f
ZJS
5094 if (!arg_notify_ready) {
5095 r = sd_notify(false, "READY=1\n");
5096 if (r < 0)
5097 log_warning_errno(r, "Failed to send readiness notification, ignoring: %m");
5098 }
b0067625
ZJS
5099
5100 if (arg_kill_signal > 0) {
5101 /* Try to kill the init system on SIGINT or SIGTERM */
919f5ae0
LP
5102 (void) sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
5103 (void) sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
b0067625
ZJS
5104 } else {
5105 /* Immediately exit */
919f5ae0
LP
5106 (void) sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
5107 (void) sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
b0067625
ZJS
5108 }
5109
988851b6
LP
5110 (void) sd_event_add_signal(event, NULL, SIGRTMIN+18, sigrtmin18_handler, NULL);
5111
5112 r = sd_event_add_memory_pressure(event, NULL, NULL, NULL);
5113 if (r < 0)
5114 log_debug_errno(r, "Failed allocate memory pressure event source, ignoring: %m");
5115
6916b164 5116 /* Exit when the child exits */
919f5ae0 5117 (void) sd_event_add_signal(event, NULL, SIGCHLD, on_sigchld, PID_TO_PTR(*pid));
b0067625 5118
b07ee903
CB
5119 /* Retrieve the kmsg fifo allocated by inner child */
5120 fd_kmsg_fifo = receive_one_fd(fd_inner_socket_pair[0], 0);
5121 if (fd_kmsg_fifo < 0)
5122 return log_error_errno(fd_kmsg_fifo, "Failed to receive kmsg fifo from inner child: %m");
5123
b0067625 5124 if (arg_expose_ports) {
b07ee903 5125 r = expose_port_watch_rtnl(event, fd_inner_socket_pair[0], on_address_change, expose_args, &rtnl);
b0067625
ZJS
5126 if (r < 0)
5127 return r;
5128
deff68e7
FW
5129 (void) expose_port_execute(rtnl, &expose_args->fw_ctx, arg_expose_ports, AF_INET, &expose_args->address4);
5130 (void) expose_port_execute(rtnl, &expose_args->fw_ctx, arg_expose_ports, AF_INET6, &expose_args->address6);
b0067625
ZJS
5131 }
5132
3acc84eb 5133 if (arg_console_mode != CONSOLE_PIPE) {
254d1313 5134 _cleanup_close_ int fd = -EBADF;
3acc84eb 5135 PTYForwardFlags flags = 0;
de40a303 5136
3acc84eb 5137 /* Retrieve the master pty allocated by inner child */
bb1aa185 5138 fd = receive_one_fd(fd_inner_socket_pair[0], 0);
3acc84eb
FB
5139 if (fd < 0)
5140 return log_error_errno(fd, "Failed to receive master pty from the inner child: %m");
5141
5142 switch (arg_console_mode) {
de40a303 5143
3acc84eb
FB
5144 case CONSOLE_READ_ONLY:
5145 flags |= PTY_FORWARD_READ_ONLY;
5146
5147 _fallthrough_;
5148
5149 case CONSOLE_INTERACTIVE:
5150 flags |= PTY_FORWARD_IGNORE_VHANGUP;
5151
5152 r = pty_forward_new(event, fd, flags, &forward);
5153 if (r < 0)
5154 return log_error_errno(r, "Failed to create PTY forwarder: %m");
5155
f5fbe71d 5156 if (arg_console_width != UINT_MAX || arg_console_height != UINT_MAX)
3acc84eb
FB
5157 (void) pty_forward_set_width_height(forward,
5158 arg_console_width,
5159 arg_console_height);
5160 break;
5161
5162 default:
5163 assert(arg_console_mode == CONSOLE_PASSIVE);
5164 }
5165
5166 *master = TAKE_FD(fd);
de40a303 5167 }
b0067625 5168
5d9d3fcb
CB
5169 fd_inner_socket_pair[0] = safe_close(fd_inner_socket_pair[0]);
5170
b0067625
ZJS
5171 r = sd_event_loop(event);
5172 if (r < 0)
5173 return log_error_errno(r, "Failed to run event loop: %m");
5174
de40a303
LP
5175 if (forward) {
5176 char last_char = 0;
b0067625 5177
de40a303
LP
5178 (void) pty_forward_get_last_char(forward, &last_char);
5179 forward = pty_forward_free(forward);
b0067625 5180
de40a303
LP
5181 if (!arg_quiet && last_char != '\n')
5182 putc('\n', stdout);
5183 }
b0067625
ZJS
5184
5185 /* Kill if it is not dead yet anyway */
0bb0a9fa
ZJS
5186 if (!arg_register && !arg_keep_unit && bus)
5187 terminate_scope(bus, arg_machine);
b0067625
ZJS
5188
5189 /* Normally redundant, but better safe than sorry */
c67b0082 5190 (void) kill(*pid, SIGKILL);
b0067625 5191
5d9d3fcb
CB
5192 fd_kmsg_fifo = safe_close(fd_kmsg_fifo);
5193
5b4855ab
DDM
5194 if (arg_private_network) {
5195 /* Move network interfaces back to the parent network namespace. We use `safe_fork`
5196 * to avoid having to move the parent to the child network namespace. */
e9ccae31 5197 r = safe_fork(NULL, FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM|FORK_WAIT|FORK_LOG, NULL);
5b4855ab
DDM
5198 if (r < 0)
5199 return r;
5200
5201 if (r == 0) {
254d1313 5202 _cleanup_close_ int parent_netns_fd = -EBADF;
5b4855ab 5203
19b761a0 5204 r = namespace_open(getpid_cached(), NULL, NULL, &parent_netns_fd, NULL, NULL);
5b4855ab
DDM
5205 if (r < 0) {
5206 log_error_errno(r, "Failed to open parent network namespace: %m");
5207 _exit(EXIT_FAILURE);
5208 }
5209
5210 r = namespace_enter(-1, -1, child_netns_fd, -1, -1);
5211 if (r < 0) {
5212 log_error_errno(r, "Failed to enter child network namespace: %m");
5213 _exit(EXIT_FAILURE);
5214 }
5215
2f091b1b
TM
5216 /* Reverse network interfaces pair list so that interfaces get their initial name back.
5217 * This is about ensuring interfaces get their old name back when being moved back. */
5218 arg_network_interfaces = strv_reverse(arg_network_interfaces);
5219
5b4855ab
DDM
5220 r = move_network_interfaces(parent_netns_fd, arg_network_interfaces);
5221 if (r < 0)
5222 log_error_errno(r, "Failed to move network interfaces back to parent network namespace: %m");
5223
5224 _exit(r < 0 ? EXIT_FAILURE : EXIT_SUCCESS);
5225 }
5226 }
5227
8f03de53 5228 r = wait_for_container(TAKE_PID(*pid), &container_status);
b0067625 5229
0bb0a9fa
ZJS
5230 /* Tell machined that we are gone. */
5231 if (bus)
5232 (void) unregister_machine(bus, arg_machine);
5233
b0067625
ZJS
5234 if (r < 0)
5235 /* We failed to wait for the container, or the container exited abnormally. */
5236 return r;
5237 if (r > 0 || container_status == CONTAINER_TERMINATED) {
27e29a1e
ZJS
5238 /* r > 0 → The container exited with a non-zero status.
5239 * As a special case, we need to replace 133 with a different value,
5240 * because 133 is special-cased in the service file to reboot the container.
5241 * otherwise → The container exited with zero status and a reboot was not requested.
5242 */
2a49b612 5243 if (r == EXIT_FORCE_RESTART)
27e29a1e 5244 r = EXIT_FAILURE; /* replace 133 with the general failure code */
b0067625 5245 *ret = r;
b0067625
ZJS
5246 return 0; /* finito */
5247 }
5248
5249 /* CONTAINER_REBOOTED, loop again */
5250
5251 if (arg_keep_unit) {
5252 /* Special handling if we are running as a service: instead of simply
5253 * restarting the machine we want to restart the entire service, so let's
5254 * inform systemd about this with the special exit code 133. The service
5255 * file uses RestartForceExitStatus=133 so that this results in a full
5256 * nspawn restart. This is necessary since we might have cgroup parameters
5257 * set we want to have flushed out. */
2a49b612
ZJS
5258 *ret = EXIT_FORCE_RESTART;
5259 return 0; /* finito */
b0067625
ZJS
5260 }
5261
deff68e7
FW
5262 expose_port_flush(&expose_args->fw_ctx, arg_expose_ports, AF_INET, &expose_args->address4);
5263 expose_port_flush(&expose_args->fw_ctx, arg_expose_ports, AF_INET6, &expose_args->address6);
b0067625
ZJS
5264
5265 (void) remove_veth_links(veth_name, arg_network_veth_extra);
5266 *veth_created = false;
5267 return 1; /* loop again */
5268}
5269
bf428efb 5270static int initialize_rlimits(void) {
852b6250 5271 /* The default resource limits the kernel passes to PID 1, as per kernel 5.16. Let's pass our container payload
bf428efb
LP
5272 * the same values as the kernel originally passed to PID 1, in order to minimize differences between host and
5273 * container execution environments. */
5274
5275 static const struct rlimit kernel_defaults[_RLIMIT_MAX] = {
852b6250
LP
5276 [RLIMIT_AS] = { RLIM_INFINITY, RLIM_INFINITY },
5277 [RLIMIT_CORE] = { 0, RLIM_INFINITY },
5278 [RLIMIT_CPU] = { RLIM_INFINITY, RLIM_INFINITY },
5279 [RLIMIT_DATA] = { RLIM_INFINITY, RLIM_INFINITY },
5280 [RLIMIT_FSIZE] = { RLIM_INFINITY, RLIM_INFINITY },
5281 [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY },
5282 [RLIMIT_MEMLOCK] = { DEFAULT_RLIMIT_MEMLOCK, DEFAULT_RLIMIT_MEMLOCK },
5283 [RLIMIT_MSGQUEUE] = { 819200, 819200 },
5284 [RLIMIT_NICE] = { 0, 0 },
5285 [RLIMIT_NOFILE] = { 1024, 4096 },
5286 [RLIMIT_RSS] = { RLIM_INFINITY, RLIM_INFINITY },
5287 [RLIMIT_RTPRIO] = { 0, 0 },
5288 [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY },
5289 [RLIMIT_STACK] = { 8388608, RLIM_INFINITY },
bf428efb
LP
5290
5291 /* The kernel scales the default for RLIMIT_NPROC and RLIMIT_SIGPENDING based on the system's amount of
5292 * RAM. To provide best compatibility we'll read these limits off PID 1 instead of hardcoding them
5293 * here. This is safe as we know that PID 1 doesn't change these two limits and thus the original
5294 * kernel's initialization should still be valid during runtime — at least if PID 1 is systemd. Note
5295 * that PID 1 changes a number of other resource limits during early initialization which is why we
5296 * don't read the other limits from PID 1 but prefer the static table above. */
5297 };
5298
5299 int rl;
5300
5301 for (rl = 0; rl < _RLIMIT_MAX; rl++) {
bf428efb
LP
5302 /* Let's only fill in what the user hasn't explicitly configured anyway */
5303 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)) == 0) {
5304 const struct rlimit *v;
5305 struct rlimit buffer;
5306
5307 if (IN_SET(rl, RLIMIT_NPROC, RLIMIT_SIGPENDING)) {
5308 /* For these two let's read the limits off PID 1. See above for an explanation. */
5309
5310 if (prlimit(1, rl, NULL, &buffer) < 0)
5311 return log_error_errno(errno, "Failed to read resource limit RLIMIT_%s of PID 1: %m", rlimit_to_string(rl));
5312
dbf1aca6
LP
5313 v = &buffer;
5314 } else if (rl == RLIMIT_NOFILE) {
5315 /* We nowadays bump RLIMIT_NOFILE's hard limit early in PID 1 for all
5316 * userspace. Given that nspawn containers are often run without our PID 1,
5317 * let's grant the containers a raised RLIMIT_NOFILE hard limit by default,
5318 * so that container userspace gets similar resources as host userspace
5319 * gets. */
5320 buffer = kernel_defaults[rl];
5321 buffer.rlim_max = MIN((rlim_t) read_nr_open(), (rlim_t) HIGH_RLIMIT_NOFILE);
bf428efb
LP
5322 v = &buffer;
5323 } else
5324 v = kernel_defaults + rl;
5325
5326 arg_rlimit[rl] = newdup(struct rlimit, v, 1);
5327 if (!arg_rlimit[rl])
5328 return log_oom();
5329 }
5330
5331 if (DEBUG_LOGGING) {
5332 _cleanup_free_ char *k = NULL;
5333
5334 (void) rlimit_format(arg_rlimit[rl], &k);
5335 log_debug("Setting RLIMIT_%s to %s.", rlimit_to_string(rl), k);
5336 }
5337 }
5338
5339 return 0;
5340}
5341
287b7376 5342static int cant_be_in_netns(void) {
254d1313 5343 _cleanup_close_ int fd = -EBADF;
287b7376
LP
5344 struct ucred ucred;
5345 int r;
5346
5347 /* Check if we are in the same netns as udev. If we aren't, then device monitoring (and thus waiting
5348 * for loopback block devices) won't work, and we will hang. Detect this case and exit early with a
5349 * nice message. */
5350
5351 if (!arg_image) /* only matters if --image= us used, i.e. we actually need to use loopback devices */
5352 return 0;
5353
5354 fd = socket(AF_UNIX, SOCK_SEQPACKET|SOCK_NONBLOCK|SOCK_CLOEXEC, 0);
5355 if (fd < 0)
5356 return log_error_errno(errno, "Failed to allocate udev control socket: %m");
5357
1861986a 5358 r = connect_unix_path(fd, AT_FDCWD, "/run/udev/control");
bb44fd07
ZJS
5359 if (r == -ENOENT || ERRNO_IS_NEG_DISCONNECT(r))
5360 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
5361 "Sorry, but --image= requires access to the host's /run/ hierarchy, since we need access to udev.");
5362 if (r < 0)
1861986a 5363 return log_error_errno(r, "Failed to connect socket to udev control socket: %m");
287b7376
LP
5364
5365 r = getpeercred(fd, &ucred);
5366 if (r < 0)
5367 return log_error_errno(r, "Failed to determine peer of udev control socket: %m");
5368
f7a2dc3d 5369 r = in_same_namespace(ucred.pid, 0, NAMESPACE_NET);
287b7376 5370 if (r < 0)
f7a2dc3d
CB
5371 return log_error_errno(r, "Failed to determine network namespace of udev: %m");
5372 if (r == 0)
287b7376
LP
5373 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
5374 "Sorry, but --image= is only supported in the main network namespace, since we need access to udev/AF_NETLINK.");
5375 return 0;
5376}
5377
44dbef90 5378static int run(int argc, char *argv[]) {
4c27749b 5379 bool remove_directory = false, remove_image = false, veth_created = false, remove_tmprootdir = false;
5bb1d7fb 5380 _cleanup_close_ int master = -EBADF;
03cfe0d5 5381 _cleanup_fdset_free_ FDSet *fds = NULL;
2d845785 5382 int r, n_fd_passed, ret = EXIT_SUCCESS;
5aa3eba5 5383 char veth_name[IFNAMSIZ] = "";
761cf19d 5384 struct ExposeArgs expose_args = {};
8e766630 5385 _cleanup_(release_lock_file) LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
c67b0082 5386 char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
2d845785 5387 _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
18b5886e 5388 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
761cf19d 5389 _cleanup_(fw_ctx_freep) FirewallContext *fw_ctx = NULL;
7bf011e3 5390 pid_t pid = 0;
03cfe0d5
LP
5391
5392 log_parse_environment();
5393 log_open();
415fc41c 5394
03cfe0d5
LP
5395 r = parse_argv(argc, argv);
5396 if (r <= 0)
5397 goto finish;
5398
38ee19c0
ZJS
5399 if (geteuid() != 0) {
5400 r = log_warning_errno(SYNTHETIC_ERRNO(EPERM),
5401 argc >= 2 ? "Need to be root." :
5402 "Need to be root (and some arguments are usually required).\nHint: try --help");
03cfe0d5 5403 goto finish;
38ee19c0 5404 }
fba868fa 5405
287b7376
LP
5406 r = cant_be_in_netns();
5407 if (r < 0)
5408 goto finish;
5409
bf428efb
LP
5410 r = initialize_rlimits();
5411 if (r < 0)
5412 goto finish;
5413
de40a303
LP
5414 r = load_oci_bundle();
5415 if (r < 0)
5416 goto finish;
5417
f757855e
LP
5418 r = determine_names();
5419 if (r < 0)
5420 goto finish;
5421
5422 r = load_settings();
5423 if (r < 0)
5424 goto finish;
5425
d4d99bc6 5426 r = cg_unified();
5eee8290
LP
5427 if (r < 0) {
5428 log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
5429 goto finish;
5430 }
5431
f757855e
LP
5432 r = verify_arguments();
5433 if (r < 0)
5434 goto finish;
03cfe0d5 5435
2f091b1b
TM
5436 r = verify_network_interfaces_initialized();
5437 if (r < 0)
5438 goto finish;
5439
49048684
ZJS
5440 /* Reapply environment settings. */
5441 (void) detect_unified_cgroup_hierarchy_from_environment();
8199d554 5442
2949ff26
LP
5443 /* Ignore SIGPIPE here, because we use splice() on the ptyfwd stuff and that will generate SIGPIPE if
5444 * the result is closed. Note that the container payload child will reset signal mask+handler anyway,
5445 * so just turning this off here means we only turn it off in nspawn itself, not any children. */
9c274488 5446 (void) ignore_signals(SIGPIPE);
2949ff26 5447
03cfe0d5
LP
5448 n_fd_passed = sd_listen_fds(false);
5449 if (n_fd_passed > 0) {
5450 r = fdset_new_listen_fds(&fds, false);
5451 if (r < 0) {
5452 log_error_errno(r, "Failed to collect file descriptors: %m");
5453 goto finish;
5454 }
5455 }
5456
83e803a9
ZJS
5457 /* The "default" umask. This is appropriate for most file and directory
5458 * operations performed by nspawn, and is the umask that will be used for
5459 * the child. Functions like copy_devnodes() change the umask temporarily. */
5460 umask(0022);
5461
03cfe0d5
LP
5462 if (arg_directory) {
5463 assert(!arg_image);
5464
b35ca61a
LP
5465 /* Safety precaution: let's not allow running images from the live host OS image, as long as
5466 * /var from the host will propagate into container dynamically (because bad things happen if
5467 * two systems write to the same /var). Let's allow it for the special cases where /var is
5468 * either copied (i.e. --ephemeral) or replaced (i.e. --volatile=yes|state). */
5469 if (path_equal(arg_directory, "/") && !(arg_ephemeral || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_STATE))) {
1406bd66
LP
5470 r = log_error_errno(SYNTHETIC_ERRNO(EINVAL),
5471 "Spawning container on root directory is not supported. Consider using --ephemeral, --volatile=yes or --volatile=state.");
03cfe0d5
LP
5472 goto finish;
5473 }
5474
5475 if (arg_ephemeral) {
5476 _cleanup_free_ char *np = NULL;
5477
f461a28d 5478 r = chase_and_update(&arg_directory, 0);
3f342ec4
LP
5479 if (r < 0)
5480 goto finish;
5481
7bf011e3
LP
5482 /* If the specified path is a mount point we generate the new snapshot immediately
5483 * inside it under a random name. However if the specified is not a mount point we
5484 * create the new snapshot in the parent directory, just next to it. */
e1873695 5485 r = path_is_mount_point(arg_directory, NULL, 0);
03cfe0d5
LP
5486 if (r < 0) {
5487 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
5488 goto finish;
5489 }
5490 if (r > 0)
770b5ce4 5491 r = tempfn_random_child(arg_directory, "machine.", &np);
03cfe0d5 5492 else
770b5ce4 5493 r = tempfn_random(arg_directory, "machine.", &np);
03cfe0d5 5494 if (r < 0) {
0f3be6ca 5495 log_error_errno(r, "Failed to generate name for directory snapshot: %m");
03cfe0d5
LP
5496 goto finish;
5497 }
5498
6992459c 5499 /* We take an exclusive lock on this image, since it's our private, ephemeral copy
162392b7 5500 * only owned by us and no one else. */
6992459c 5501 r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
03cfe0d5
LP
5502 if (r < 0) {
5503 log_error_errno(r, "Failed to lock %s: %m", np);
5504 goto finish;
5505 }
5506
7bf011e3
LP
5507 {
5508 BLOCK_SIGNALS(SIGINT);
fab4ef72
DDM
5509 r = btrfs_subvol_snapshot_at(AT_FDCWD, arg_directory, AT_FDCWD, np,
5510 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
5511 BTRFS_SNAPSHOT_FALLBACK_COPY |
5512 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
5513 BTRFS_SNAPSHOT_RECURSIVE |
5514 BTRFS_SNAPSHOT_QUOTA |
5515 BTRFS_SNAPSHOT_SIGINT);
7bf011e3
LP
5516 }
5517 if (r == -EINTR) {
5518 log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", np);
5519 goto finish;
5520 }
03cfe0d5
LP
5521 if (r < 0) {
5522 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
5523 goto finish;
ec16945e
LP
5524 }
5525
1cc6c93a 5526 free_and_replace(arg_directory, np);
17cbb288 5527 remove_directory = true;
30535c16 5528 } else {
f461a28d 5529 r = chase_and_update(&arg_directory, arg_template ? CHASE_NONEXISTENT : 0);
8d4aa2bb
LP
5530 if (r < 0)
5531 goto finish;
5532
30535c16
LP
5533 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
5534 if (r == -EBUSY) {
5535 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
5536 goto finish;
5537 }
5538 if (r < 0) {
5539 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
476b8254 5540 goto finish;
30535c16
LP
5541 }
5542
5543 if (arg_template) {
f461a28d 5544 r = chase_and_update(&arg_template, 0);
3f342ec4
LP
5545 if (r < 0)
5546 goto finish;
5547
7bf011e3
LP
5548 {
5549 BLOCK_SIGNALS(SIGINT);
fab4ef72
DDM
5550 r = btrfs_subvol_snapshot_at(AT_FDCWD, arg_template, AT_FDCWD, arg_directory,
5551 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
5552 BTRFS_SNAPSHOT_FALLBACK_COPY |
5553 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
5554 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
5555 BTRFS_SNAPSHOT_RECURSIVE |
5556 BTRFS_SNAPSHOT_QUOTA |
5557 BTRFS_SNAPSHOT_SIGINT);
7bf011e3 5558 }
ff6c6cc1
LP
5559 if (r == -EEXIST)
5560 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
5561 "Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
7bf011e3
LP
5562 else if (r == -EINTR) {
5563 log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", arg_directory);
5564 goto finish;
5565 } else if (r < 0) {
83521414 5566 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
30535c16 5567 goto finish;
ff6c6cc1
LP
5568 } else
5569 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
5570 "Populated %s from template %s.", arg_directory, arg_template);
30535c16 5571 }
ec16945e
LP
5572 }
5573
7732f92b 5574 if (arg_start_mode == START_BOOT) {
aff7ae0d 5575 _cleanup_free_ char *b = NULL;
a5201ed6 5576 const char *p;
d4317fe1 5577 int check_os_release, is_os_tree;
c9fe05e0 5578
aff7ae0d
LP
5579 if (arg_pivot_root_new) {
5580 b = path_join(arg_directory, arg_pivot_root_new);
41de458a
LP
5581 if (!b) {
5582 r = log_oom();
5583 goto finish;
5584 }
aff7ae0d
LP
5585
5586 p = b;
5587 } else
a5201ed6 5588 p = arg_directory;
c9fe05e0 5589
d4317fe1
FS
5590 check_os_release = getenv_bool("SYSTEMD_NSPAWN_CHECK_OS_RELEASE");
5591 if (check_os_release < 0 && check_os_release != -ENXIO) {
5592 r = log_error_errno(check_os_release, "Failed to parse $SYSTEMD_NSPAWN_CHECK_OS_RELEASE: %m");
5593 goto finish;
5594 }
5595
5596 is_os_tree = path_is_os_tree(p);
5597 if (is_os_tree == 0 && check_os_release == 0)
5598 log_debug("Directory %s is missing an os-release file, continuing anyway.", p);
5599 else if (is_os_tree <= 0) {
aff7ae0d
LP
5600 r = log_error_errno(SYNTHETIC_ERRNO(EINVAL),
5601 "Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", p);
1b9e5b12
LP
5602 goto finish;
5603 }
5604 } else {
aff7ae0d 5605 _cleanup_free_ char *p = NULL;
c9fe05e0 5606
a5201ed6 5607 if (arg_pivot_root_new)
aff7ae0d 5608 p = path_join(arg_directory, arg_pivot_root_new, "/usr/");
a5201ed6 5609 else
aff7ae0d 5610 p = path_join(arg_directory, "/usr/");
41de458a
LP
5611 if (!p) {
5612 r = log_oom();
5613 goto finish;
5614 }
1b9e5b12 5615
aff7ae0d
LP
5616 if (laccess(p, F_OK) < 0) {
5617 r = log_error_errno(SYNTHETIC_ERRNO(EINVAL),
5618 "Directory %s doesn't look like it has an OS tree (/usr/ directory is missing). Refusing.", arg_directory);
1b9e5b12 5619 goto finish;
1b9e5b12
LP
5620 }
5621 }
ec16945e 5622
6b9132a9 5623 } else {
d04faa4e 5624 DissectImageFlags dissect_image_flags =
4b5de5dd 5625 DISSECT_IMAGE_GENERIC_ROOT |
d04faa4e
LP
5626 DISSECT_IMAGE_REQUIRE_ROOT |
5627 DISSECT_IMAGE_RELAX_VAR_CHECK |
73d88b80
LP
5628 DISSECT_IMAGE_USR_NO_ROOT |
5629 DISSECT_IMAGE_ADD_PARTITION_DEVICES |
5630 DISSECT_IMAGE_PIN_PARTITION_DEVICES;
ec16945e
LP
5631 assert(arg_image);
5632 assert(!arg_template);
5633
f461a28d 5634 r = chase_and_update(&arg_image, 0);
3f342ec4
LP
5635 if (r < 0)
5636 goto finish;
5637
0f3be6ca
LP
5638 if (arg_ephemeral) {
5639 _cleanup_free_ char *np = NULL;
5640
5641 r = tempfn_random(arg_image, "machine.", &np);
5642 if (r < 0) {
5643 log_error_errno(r, "Failed to generate name for image snapshot: %m");
5644 goto finish;
5645 }
5646
6992459c
LP
5647 /* Always take an exclusive lock on our own ephemeral copy. */
5648 r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
0f3be6ca
LP
5649 if (r < 0) {
5650 r = log_error_errno(r, "Failed to create image lock: %m");
5651 goto finish;
5652 }
5653
7bf011e3
LP
5654 {
5655 BLOCK_SIGNALS(SIGINT);
7c2f5495
DDM
5656 r = copy_file_full(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600,
5657 FS_NOCOW_FL, FS_NOCOW_FL,
5658 COPY_REFLINK|COPY_CRTIME|COPY_SIGINT,
5659 NULL, NULL);
7bf011e3
LP
5660 }
5661 if (r == -EINTR) {
5662 log_error_errno(r, "Interrupted while copying image file to %s, removed again.", np);
5663 goto finish;
5664 }
0f3be6ca
LP
5665 if (r < 0) {
5666 r = log_error_errno(r, "Failed to copy image file: %m");
5667 goto finish;
5668 }
5669
1cc6c93a 5670 free_and_replace(arg_image, np);
0f3be6ca
LP
5671 remove_image = true;
5672 } else {
5673 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
5674 if (r == -EBUSY) {
5675 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
5676 goto finish;
5677 }
5678 if (r < 0) {
5679 r = log_error_errno(r, "Failed to create image lock: %m");
5680 goto finish;
5681 }
4623e8e6 5682
89e62e0b
LP
5683 r = verity_settings_load(
5684 &arg_verity_settings,
5685 arg_image, NULL, NULL);
e7cbe5cb
LB
5686 if (r < 0) {
5687 log_error_errno(r, "Failed to read verity artefacts for %s: %m", arg_image);
5688 goto finish;
78ebe980 5689 }
89e62e0b
LP
5690
5691 if (arg_verity_settings.data_path)
5692 dissect_image_flags |= DISSECT_IMAGE_NO_PARTITION_TABLE;
30535c16
LP
5693 }
5694
c67b0082 5695 if (!mkdtemp(tmprootdir)) {
0f3be6ca 5696 r = log_error_errno(errno, "Failed to create temporary directory: %m");
6b9132a9 5697 goto finish;
1b9e5b12 5698 }
6b9132a9 5699
c67b0082
LP
5700 remove_tmprootdir = true;
5701
5702 arg_directory = strdup(tmprootdir);
1b9e5b12
LP
5703 if (!arg_directory) {
5704 r = log_oom();
5705 goto finish;
6b9132a9 5706 }
88213476 5707
89e62e0b
LP
5708 r = loop_device_make_by_path(
5709 arg_image,
5710 arg_read_only ? O_RDONLY : O_RDWR,
22ee78a8 5711 /* sector_size= */ UINT32_MAX,
89e62e0b 5712 FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN,
7f52206a 5713 LOCK_SH,
89e62e0b 5714 &loop);
2d845785
LP
5715 if (r < 0) {
5716 log_error_errno(r, "Failed to set up loopback block device: %m");
842f3b0f
LP
5717 goto finish;
5718 }
1b9e5b12 5719
bad31660 5720 r = dissect_loop_device_and_warn(
bad31660 5721 loop,
89e62e0b 5722 &arg_verity_settings,
84be0c71
LP
5723 /* mount_options=*/ NULL,
5724 arg_image_policy ?: &image_policy_container,
e7cbe5cb 5725 dissect_image_flags,
e0f9e7bd 5726 &dissected_image);
2d845785 5727 if (r == -ENOPKG) {
4526113f 5728 /* dissected_image_and_warn() already printed a brief error message. Extend on that with more details */
2d845785
LP
5729 log_notice("Note that the disk image needs to\n"
5730 " a) either contain only a single MBR partition of type 0x83 that is marked bootable\n"
5731 " b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n"
db811444 5732 " c) or follow https://uapi-group.org/specifications/specs/discoverable_partitions_specification\n"
2d845785
LP
5733 " d) or contain a file system without a partition table\n"
5734 "in order to be bootable with systemd-nspawn.");
1b9e5b12 5735 goto finish;
2d845785 5736 }
4526113f 5737 if (r < 0)
842f3b0f 5738 goto finish;
1b9e5b12 5739
88b3300f
LP
5740 r = dissected_image_load_verity_sig_partition(
5741 dissected_image,
5742 loop->fd,
5743 &arg_verity_settings);
5744 if (r < 0)
5745 goto finish;
5746
8ee9615e
LP
5747 if (dissected_image->has_verity && !arg_verity_settings.root_hash && !dissected_image->has_verity_sig)
5748 log_notice("Note: image %s contains verity information, but no root hash specified and no embedded "
5749 "root hash signature found! Proceeding without integrity checking.", arg_image);
4623e8e6 5750
89e62e0b
LP
5751 r = dissected_image_decrypt_interactively(
5752 dissected_image,
5753 NULL,
5754 &arg_verity_settings,
e330f97a 5755 0);
1b9e5b12
LP
5756 if (r < 0)
5757 goto finish;
0f3be6ca
LP
5758
5759 /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */
5760 if (remove_image && unlink(arg_image) >= 0)
5761 remove_image = false;
4c27749b
LP
5762
5763 if (arg_architecture < 0)
5764 arg_architecture = dissected_image_architecture(dissected_image);
842f3b0f 5765 }
842f3b0f 5766
86c0dd4a 5767 r = custom_mount_prepare_all(arg_directory, arg_custom_mounts, arg_n_custom_mounts);
5a8af538
LP
5768 if (r < 0)
5769 goto finish;
5770
de40a303
LP
5771 if (arg_console_mode < 0)
5772 arg_console_mode =
5773 isatty(STDIN_FILENO) > 0 &&
5774 isatty(STDOUT_FILENO) > 0 ? CONSOLE_INTERACTIVE : CONSOLE_READ_ONLY;
9c857b9d 5775
de40a303
LP
5776 if (arg_console_mode == CONSOLE_PIPE) /* if we pass STDERR on to the container, don't add our own logs into it too */
5777 arg_quiet = true;
a258bf26 5778
9c857b9d 5779 if (!arg_quiet)
c85c2f79 5780 log_info("Spawning container %s on %s.\nPress Ctrl-] three times within 1s to kill container.",
9c857b9d
LP
5781 arg_machine, arg_image ?: arg_directory);
5782
988851b6 5783 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, SIGRTMIN+18, -1) >= 0);
a258bf26 5784
8c3fe1b5
LP
5785 r = make_reaper_process(true);
5786 if (r < 0) {
5787 log_error_errno(r, "Failed to become subreaper: %m");
03cfe0d5
LP
5788 goto finish;
5789 }
5790
761cf19d
FW
5791 if (arg_expose_ports) {
5792 r = fw_ctx_new(&fw_ctx);
5793 if (r < 0) {
5794 log_error_errno(r, "Cannot expose configured ports, firewall initialization failed: %m");
5795 goto finish;
5796 }
5797 expose_args.fw_ctx = fw_ctx;
5798 }
d87be9b0 5799 for (;;) {
3acc84eb 5800 r = run_container(dissected_image,
44dbef90
LP
5801 fds,
5802 veth_name, &veth_created,
761cf19d 5803 &expose_args, &master,
44dbef90 5804 &pid, &ret);
b0067625 5805 if (r <= 0)
d87be9b0 5806 break;
d87be9b0 5807 }
88213476
LP
5808
5809finish:
04f590a4
LP
5810 (void) sd_notify(false,
5811 r == 0 && ret == EXIT_FORCE_RESTART ? "STOPPING=1\nSTATUS=Restarting..." :
5812 "STOPPING=1\nSTATUS=Terminating...");
af4ec430 5813
9444b1f2 5814 if (pid > 0)
c67b0082 5815 (void) kill(pid, SIGKILL);
88213476 5816
503546da 5817 /* Try to flush whatever is still queued in the pty */
6a0f896b 5818 if (master >= 0) {
f5fbe71d 5819 (void) copy_bytes(master, STDOUT_FILENO, UINT64_MAX, 0);
6a0f896b
LP
5820 master = safe_close(master);
5821 }
5822
5823 if (pid > 0)
5824 (void) wait_for_terminate(pid, NULL);
503546da 5825
50ebcf6c
LP
5826 pager_close();
5827
17cbb288 5828 if (remove_directory && arg_directory) {
ec16945e
LP
5829 int k;
5830
17cbb288 5831 k = rm_rf(arg_directory, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
ec16945e 5832 if (k < 0)
17cbb288 5833 log_warning_errno(k, "Cannot remove '%s', ignoring: %m", arg_directory);
ec16945e
LP
5834 }
5835
0f3be6ca
LP
5836 if (remove_image && arg_image) {
5837 if (unlink(arg_image) < 0)
5838 log_warning_errno(errno, "Can't remove image file '%s', ignoring: %m", arg_image);
5839 }
5840
c67b0082
LP
5841 if (remove_tmprootdir) {
5842 if (rmdir(tmprootdir) < 0)
5843 log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir);
5844 }
5845
785890ac
LP
5846 if (arg_machine) {
5847 const char *p;
5848
63c372cb 5849 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
c6878637 5850 (void) rm_rf(p, REMOVE_ROOT);
785890ac
LP
5851 }
5852
deff68e7
FW
5853 expose_port_flush(&fw_ctx, arg_expose_ports, AF_INET, &expose_args.address4);
5854 expose_port_flush(&fw_ctx, arg_expose_ports, AF_INET6, &expose_args.address6);
7513c5b8
LP
5855
5856 if (veth_created)
5857 (void) remove_veth_links(veth_name, arg_network_veth_extra);
22b28dfd 5858 (void) remove_bridge(arg_network_zone);
f757855e 5859
f757855e
LP
5860 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
5861 expose_port_free_all(arg_expose_ports);
bf428efb 5862 rlimit_free_all(arg_rlimit);
b2645747 5863 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
e8ac916e 5864 machine_credential_free_all(arg_credentials, arg_n_credentials);
6d0b55c2 5865
44dbef90
LP
5866 if (r < 0)
5867 return r;
5868
5869 return ret;
88213476 5870}
44dbef90
LP
5871
/* Program entry point: hands run() to the project's main()-generating macro.
 * run() returns its negative error code r when r < 0, and otherwise returns ret.
 * NOTE(review): the macro is defined outside this file; judging by its name, a
 * positive return from run() is presumably reported as a failing process exit
 * status -- confirm against the macro definition. */
5872DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run);