]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
udev: introduce .PART_SUFFIX internal property
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
db9ecf05 1/* SPDX-License-Identifier: LGPL-2.1-or-later */
88213476 2
349cc4a5 3#if HAVE_BLKID
8fe0087e 4#endif
88213476 5#include <errno.h>
88213476 6#include <getopt.h>
503f480f 7#include <linux/fs.h>
1b9e5b12 8#include <linux/loop.h>
349cc4a5 9#if HAVE_SELINUX
8fe0087e 10#include <selinux/selinux.h>
1b9e5b12 11#endif
8fe0087e 12#include <stdlib.h>
8fe0087e 13#include <sys/file.h>
335d2ead 14#include <sys/ioctl.h>
8fe0087e
LP
15#include <sys/personality.h>
16#include <sys/prctl.h>
17#include <sys/types.h>
6916b164 18#include <sys/wait.h>
335d2ead 19#include <termios.h>
8fe0087e 20#include <unistd.h>
1b9e5b12 21
b053cd5f 22#include "sd-bus.h"
1f0cd86b 23#include "sd-daemon.h"
1f0cd86b 24#include "sd-id128.h"
8fe0087e 25
b5efdb8a 26#include "alloc-util.h"
8fe0087e
LP
27#include "barrier.h"
28#include "base-filesystem.h"
29#include "blkid-util.h"
30#include "btrfs-util.h"
d6b4d1c7 31#include "build.h"
b8ea7a6e 32#include "bus-error.h"
7f8a85e6 33#include "bus-locator.h"
b053cd5f 34#include "bus-util.h"
8fe0087e 35#include "cap-list.h"
430f0182 36#include "capability-util.h"
04d391da 37#include "cgroup-util.h"
f461a28d 38#include "chase.h"
988851b6 39#include "common-signal.h"
8fe0087e 40#include "copy.h"
d107bb7d 41#include "cpu-set-util.h"
786d19fd 42#include "creds-util.h"
4fc9982c 43#include "dev-setup.h"
57f1b61b 44#include "discover-image.h"
2d845785 45#include "dissect-image.h"
8fe0087e 46#include "env-util.h"
3652872a 47#include "escape.h"
3ffd4af2 48#include "fd-util.h"
842f3b0f 49#include "fdset.h"
a5c32cff 50#include "fileio.h"
f97b34a6 51#include "format-util.h"
f4f15635 52#include "fs-util.h"
1b9e5b12 53#include "gpt.h"
4623e8e6 54#include "hexdecoct.h"
e2054217 55#include "hostname-setup.h"
8fe0087e 56#include "hostname-util.h"
910fd145 57#include "id128-util.h"
3652872a 58#include "io-util.h"
8fe0087e 59#include "log.h"
2d845785 60#include "loop-util.h"
8fe0087e 61#include "loopback-setup.h"
8fe0087e 62#include "macro.h"
44dbef90 63#include "main-func.h"
f5947a5e 64#include "missing_sched.h"
8fe0087e 65#include "mkdir.h"
4349cd7c 66#include "mount-util.h"
049af8ad 67#include "mountpoint-util.h"
0cb8e3d1 68#include "namespace-util.h"
8fe0087e 69#include "netlink-util.h"
2f893044 70#include "nspawn-bind-user.h"
07630cea 71#include "nspawn-cgroup.h"
3652872a 72#include "nspawn-creds.h"
3603efde 73#include "nspawn-def.h"
07630cea
LP
74#include "nspawn-expose-ports.h"
75#include "nspawn-mount.h"
76#include "nspawn-network.h"
de40a303 77#include "nspawn-oci.h"
7336138e 78#include "nspawn-patch-uid.h"
07630cea 79#include "nspawn-register.h"
910fd145 80#include "nspawn-seccomp.h"
07630cea
LP
81#include "nspawn-settings.h"
82#include "nspawn-setuid.h"
7732f92b 83#include "nspawn-stub-pid1.h"
c9394f4f 84#include "nspawn-util.h"
91181e07 85#include "nspawn.h"
d8b4d14d 86#include "nulstr-util.h"
d58ad743 87#include "os-util.h"
50ebcf6c 88#include "pager.h"
614b022c 89#include "parse-argument.h"
6bedfcbb 90#include "parse-util.h"
294bf0c3 91#include "pretty-print.h"
0b452006 92#include "process-util.h"
8fe0087e
LP
93#include "ptyfwd.h"
94#include "random-util.h"
8869a0b4 95#include "raw-clone.h"
86775e35 96#include "resolve-util.h"
bf428efb 97#include "rlimit-util.h"
8fe0087e 98#include "rm-rf.h"
de40a303
LP
99#if HAVE_SECCOMP
100#include "seccomp-util.h"
101#endif
68b02049 102#include "selinux-util.h"
8fe0087e 103#include "signal-util.h"
2583fbea 104#include "socket-util.h"
8fcde012 105#include "stat-util.h"
15a5e950 106#include "stdio-util.h"
5c828e66 107#include "string-table.h"
07630cea 108#include "string-util.h"
8fe0087e 109#include "strv.h"
de40a303 110#include "sysctl-util.h"
8fe0087e 111#include "terminal-util.h"
e4de7287 112#include "tmpfile-util.h"
affb60b1 113#include "umask-util.h"
43c3fb46 114#include "unit-name.h"
b1d4f8e1 115#include "user-util.h"
e9642be2 116
e96ceaba
LP
/* The notify socket inside the container it can use to talk to nspawn using the sd_notify(3) protocol */
#define NSPAWN_NOTIFY_SOCKET_PATH "/run/host/notify"
/* Another path below /run/host/. NOTE(review): presumably the directory used to pass mounts into a running
 * container; confirm against the code that uses it. */
#define NSPAWN_MOUNT_TUNNEL "/run/host/incoming"

/* NOTE(review): presumably the exit status with which the payload asks for the container to be restarted;
 * confirm against the code that evaluates it. */
#define EXIT_FORCE_RESTART 133
122
113cea80
DH
/* The two ways a container run can end, as far as this file distinguishes them.
 * NOTE(review): the code producing and consuming these values is outside this excerpt. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,
        CONTAINER_REBOOTED,
} ContainerStatus;
127
/* File-wide configuration state. Every arg_* variable holds one setting together with its built-in default.
 * The environment parsers further down (parse_environment() and its helpers) and handle_arg_console() write
 * some of them; NOTE(review): the remainder is presumably filled in by parse_argv() and the settings file
 * loader, which are not fully visible here. */
static char *arg_directory = NULL;
static char *arg_template = NULL;
static char *arg_chdir = NULL;
static char *arg_pivot_root_new = NULL;
static char *arg_pivot_root_old = NULL;
static char *arg_user = NULL;
static uid_t arg_uid = UID_INVALID;
static gid_t arg_gid = GID_INVALID;
static gid_t* arg_supplementary_gids = NULL;
static size_t arg_n_supplementary_gids = 0;
static sd_id128_t arg_uuid = {};
static char *arg_machine = NULL;     /* The name used by the host to refer to this */
static char *arg_hostname = NULL;    /* The name the payload sees by default */
static const char *arg_selinux_context = NULL;
static const char *arg_selinux_apifs_context = NULL;
static char *arg_slice = NULL;
static bool arg_private_network = false;
static bool arg_read_only = false;
static StartMode arg_start_mode = START_PID1;
static bool arg_ephemeral = false;
static LinkJournal arg_link_journal = LINK_AUTO;
static bool arg_link_journal_try = false;
/* Bit mask with one bit per capability number. NOTE(review): presumably "the default" set that the
 * --capability= and --drop-capability= help texts refer to. */
static uint64_t arg_caps_retain =
        (1ULL << CAP_AUDIT_CONTROL) |
        (1ULL << CAP_AUDIT_WRITE) |
        (1ULL << CAP_CHOWN) |
        (1ULL << CAP_DAC_OVERRIDE) |
        (1ULL << CAP_DAC_READ_SEARCH) |
        (1ULL << CAP_FOWNER) |
        (1ULL << CAP_FSETID) |
        (1ULL << CAP_IPC_OWNER) |
        (1ULL << CAP_KILL) |
        (1ULL << CAP_LEASE) |
        (1ULL << CAP_LINUX_IMMUTABLE) |
        (1ULL << CAP_MKNOD) |
        (1ULL << CAP_NET_BIND_SERVICE) |
        (1ULL << CAP_NET_BROADCAST) |
        (1ULL << CAP_NET_RAW) |
        (1ULL << CAP_SETFCAP) |
        (1ULL << CAP_SETGID) |
        (1ULL << CAP_SETPCAP) |
        (1ULL << CAP_SETUID) |
        (1ULL << CAP_SYS_ADMIN) |
        (1ULL << CAP_SYS_BOOT) |
        (1ULL << CAP_SYS_CHROOT) |
        (1ULL << CAP_SYS_NICE) |
        (1ULL << CAP_SYS_PTRACE) |
        (1ULL << CAP_SYS_RESOURCE) |
        (1ULL << CAP_SYS_TTY_CONFIG);
static uint64_t arg_caps_ambient = 0;
static CapabilityQuintet arg_full_capabilities = CAPABILITY_QUINTET_NULL;
static CustomMount *arg_custom_mounts = NULL;
static size_t arg_n_custom_mounts = 0;
static char **arg_setenv = NULL;
static bool arg_quiet = false;
static bool arg_register = true;
static bool arg_keep_unit = false;
static char **arg_network_interfaces = NULL;
static char **arg_network_macvlan = NULL;
static char **arg_network_ipvlan = NULL;
static bool arg_network_veth = false;
static char **arg_network_veth_extra = NULL;
static char *arg_network_bridge = NULL;
static char *arg_network_zone = NULL;
static char *arg_network_namespace_path = NULL;
static PagerFlags arg_pager_flags = 0;
static unsigned long arg_personality = PERSONALITY_INVALID;
static char *arg_image = NULL;
static char *arg_oci_bundle = NULL;
static VolatileMode arg_volatile_mode = VOLATILE_NO;
static ExposePort *arg_expose_ports = NULL;
static char **arg_property = NULL;
static sd_bus_message *arg_property_message = NULL;
static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
static UserNamespaceOwnership arg_userns_ownership = _USER_NAMESPACE_OWNERSHIP_INVALID;
static int arg_kill_signal = 0;
static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN;
/* Bit mask of SETTING_* flags; the parsers in this file OR a flag in after storing the matching setting. */
static SettingsMask arg_settings_mask = 0;
static int arg_settings_trusted = -1;
static char **arg_parameters = NULL;
static const char *arg_container_service_name = "systemd-nspawn";
static bool arg_notify_ready = false;
static bool arg_use_cgns = true;
static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_TMPFS_TMP;
static VeritySettings arg_verity_settings = VERITY_SETTINGS_DEFAULT;
static char **arg_syscall_allow_list = NULL;
static char **arg_syscall_deny_list = NULL;
#if HAVE_SECCOMP
static scmp_filter_ctx arg_seccomp = NULL;
#endif
static struct rlimit *arg_rlimit[_RLIMIT_MAX] = {};
static bool arg_no_new_privileges = false;
static int arg_oom_score_adjust = 0;
static bool arg_oom_score_adjust_set = false;
static CPUSet arg_cpu_set = {};
static ResolvConfMode arg_resolv_conf = RESOLV_CONF_AUTO;
static TimezoneMode arg_timezone = TIMEZONE_AUTO;
static unsigned arg_console_width = UINT_MAX, arg_console_height = UINT_MAX;
static DeviceNode* arg_extra_nodes = NULL;
static size_t arg_n_extra_nodes = 0;
static char **arg_sysctl = NULL;
static ConsoleMode arg_console_mode = _CONSOLE_MODE_INVALID;
static Credential *arg_credentials = NULL;
static size_t arg_n_credentials = 0;
static char **arg_bind_user = NULL;
static bool arg_suppress_sync = false;
static char *arg_settings_filename = NULL;
static Architecture arg_architecture = _ARCHITECTURE_INVALID;
static ImagePolicy *arg_image_policy = NULL;
88213476 239
6145bb4f
LP
/* Pairs each arg_* global that owns a resource with the cleanup function to apply to it.
 * NOTE(review): STATIC_DESTRUCTOR_REGISTER() is defined outside this file; presumably the registered
 * destructors are run when the program's main function returns — confirm in its definition. */
STATIC_DESTRUCTOR_REGISTER(arg_directory, freep);
STATIC_DESTRUCTOR_REGISTER(arg_template, freep);
STATIC_DESTRUCTOR_REGISTER(arg_chdir, freep);
STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_new, freep);
STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_old, freep);
STATIC_DESTRUCTOR_REGISTER(arg_user, freep);
STATIC_DESTRUCTOR_REGISTER(arg_supplementary_gids, freep);
STATIC_DESTRUCTOR_REGISTER(arg_machine, freep);
STATIC_DESTRUCTOR_REGISTER(arg_hostname, freep);
STATIC_DESTRUCTOR_REGISTER(arg_slice, freep);
STATIC_DESTRUCTOR_REGISTER(arg_setenv, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_interfaces, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_macvlan, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_ipvlan, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_veth_extra, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_bridge, freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_zone, freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_namespace_path, freep);
STATIC_DESTRUCTOR_REGISTER(arg_image, freep);
STATIC_DESTRUCTOR_REGISTER(arg_oci_bundle, freep);
STATIC_DESTRUCTOR_REGISTER(arg_property, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_property_message, sd_bus_message_unrefp);
STATIC_DESTRUCTOR_REGISTER(arg_parameters, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_verity_settings, verity_settings_done);
STATIC_DESTRUCTOR_REGISTER(arg_syscall_allow_list, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_syscall_deny_list, strv_freep);
/* arg_seccomp itself only exists in seccomp-enabled builds, so its destructor is guarded the same way. */
#if HAVE_SECCOMP
STATIC_DESTRUCTOR_REGISTER(arg_seccomp, seccomp_releasep);
#endif
STATIC_DESTRUCTOR_REGISTER(arg_cpu_set, cpu_set_reset);
STATIC_DESTRUCTOR_REGISTER(arg_sysctl, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_bind_user, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_settings_filename, freep);
STATIC_DESTRUCTOR_REGISTER(arg_image_policy, image_policy_freep);
6145bb4f 274
dce66ffe
ZJS
275static int handle_arg_console(const char *arg) {
276 if (streq(arg, "help")) {
10e8a60b
LP
277 puts("autopipe\n"
278 "interactive\n"
dce66ffe 279 "passive\n"
10e8a60b
LP
280 "pipe\n"
281 "read-only");
dce66ffe
ZJS
282 return 0;
283 }
284
285 if (streq(arg, "interactive"))
286 arg_console_mode = CONSOLE_INTERACTIVE;
287 else if (streq(arg, "read-only"))
288 arg_console_mode = CONSOLE_READ_ONLY;
289 else if (streq(arg, "passive"))
290 arg_console_mode = CONSOLE_PASSIVE;
554c4beb
LP
291 else if (streq(arg, "pipe")) {
292 if (isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0)
293 log_full(arg_quiet ? LOG_DEBUG : LOG_NOTICE,
294 "Console mode 'pipe' selected, but standard input/output are connected to an interactive TTY. "
295 "Most likely you want to use 'interactive' console mode for proper interactivity and shell job control. "
296 "Proceeding anyway.");
297
dce66ffe 298 arg_console_mode = CONSOLE_PIPE;
10e8a60b
LP
299 } else if (streq(arg, "autopipe")) {
300 if (isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0)
301 arg_console_mode = CONSOLE_INTERACTIVE;
302 else
303 arg_console_mode = CONSOLE_PIPE;
554c4beb 304 } else
dce66ffe
ZJS
305 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown console mode: %s", optarg);
306
307 arg_settings_mask |= SETTING_CONSOLE_MODE;
308 return 1;
309}
310
37ec0fdd
LP
311static int help(void) {
312 _cleanup_free_ char *link = NULL;
313 int r;
314
384c2c32 315 pager_open(arg_pager_flags);
50ebcf6c 316
37ec0fdd
LP
317 r = terminal_urlify_man("systemd-nspawn", "1", &link);
318 if (r < 0)
319 return log_oom();
320
25148653 321 printf("%1$s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
37a92352 322 "%5$sSpawn a command or OS in a light-weight container.%6$s\n\n"
a8828ed9
DW
323 " -h --help Show this help\n"
324 " --version Print version string\n"
69c79d3c 325 " -q --quiet Do not show status information\n"
bb068de0 326 " --no-pager Do not pipe output into a pager\n"
25148653
LP
327 " --settings=BOOLEAN Load additional settings from .nspawn file\n\n"
328 "%3$sImage:%4$s\n"
1b9e5b12 329 " -D --directory=PATH Root directory for the container\n"
ec16945e
LP
330 " --template=PATH Initialize root directory from template directory,\n"
331 " if missing\n"
332 " -x --ephemeral Run container with snapshot of root directory, and\n"
333 " remove it after exit\n"
25e68fd3
LP
334 " -i --image=PATH Root file system disk image (or device node) for\n"
335 " the container\n"
84be0c71 336 " --image-policy=POLICY Specify disk image dissection policy\n"
de40a303 337 " --oci-bundle=PATH OCI bundle directory\n"
25148653
LP
338 " --read-only Mount the root directory read-only\n"
339 " --volatile[=MODE] Run the system in volatile mode\n"
25e68fd3 340 " --root-hash=HASH Specify verity root hash for root disk image\n"
c2923fdc
LB
341 " --root-hash-sig=SIG Specify pkcs7 signature of root hash for verity\n"
342 " as a DER encoded PKCS7, either as a path to a file\n"
343 " or as an ASCII base64 encoded string prefixed by\n"
344 " 'base64:'\n"
e7cbe5cb 345 " --verity-data=PATH Specify hash device for verity\n"
25148653
LP
346 " --pivot-root=PATH[:PATH]\n"
347 " Pivot root to given directory in the container\n\n"
348 "%3$sExecution:%4$s\n"
7732f92b 349 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
a8828ed9 350 " -b --boot Boot up full system (i.e. invoke init)\n"
5f932eb9 351 " --chdir=PATH Set working directory in the container\n"
0d2a0179 352 " -E --setenv=NAME[=VALUE] Pass an environment variable to PID 1\n"
25148653
LP
353 " -u --user=USER Run the command under specified user or UID\n"
354 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
4a4654e0
LP
355 " --notify-ready=BOOLEAN Receive notifications from the child init process\n"
356 " --suppress-sync=BOOLEAN\n"
357 " Suppress any form of disk data synchronization\n\n"
25148653 358 "%3$sSystem Identity:%4$s\n"
a8828ed9 359 " -M --machine=NAME Set the machine name for the container\n"
3a9530e5 360 " --hostname=NAME Override the hostname for the container\n"
25148653
LP
361 " --uuid=UUID Set a specific machine UUID for the container\n\n"
362 "%3$sProperties:%4$s\n"
a8828ed9 363 " -S --slice=SLICE Place the container in the specified slice\n"
f36933fe 364 " --property=NAME=VALUE Set scope unit property\n"
25148653
LP
365 " --register=BOOLEAN Register container as machine\n"
366 " --keep-unit Do not register a scope for the machine, reuse\n"
367 " the service unit nspawn is running in\n\n"
368 "%3$sUser Namespacing:%4$s\n"
b917743d
YW
369 " --private-users=no Run without user namespacing\n"
370 " --private-users=yes|pick|identity\n"
371 " Run within user namespace, autoselect UID/GID range\n"
372 " --private-users=UIDBASE[:NUIDS]\n"
90b4a64d 373 " Similar, but with user configured UID/GID range\n"
6c045a99
LP
374 " --private-users-ownership=MODE\n"
375 " Adjust ('chown') or map ('map') OS tree ownership\n"
b917743d
YW
376 " to private UID/GID range\n"
377 " -U Equivalent to --private-users=pick and\n"
378 " --private-users-ownership=auto\n\n"
25148653 379 "%3$sNetworking:%4$s\n"
69c79d3c 380 " --private-network Disable network in container\n"
2f091b1b 381 " --network-interface=HOSTIF[:CONTAINERIF]\n"
69c79d3c
LP
382 " Assign an existing network interface to the\n"
383 " container\n"
2f091b1b 384 " --network-macvlan=HOSTIF[:CONTAINERIF]\n"
c74e630d
LP
385 " Create a macvlan network interface based on an\n"
386 " existing network interface to the container\n"
2f091b1b 387 " --network-ipvlan=HOSTIF[:CONTAINERIF]\n"
387f6955 388 " Create an ipvlan network interface based on an\n"
4bbfe7ad 389 " existing network interface to the container\n"
a8eaaee7 390 " -n --network-veth Add a virtual Ethernet connection between host\n"
69c79d3c 391 " and container\n"
f6d6bad1
LP
392 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
393 " Add an additional virtual Ethernet link between\n"
394 " host and container\n"
ab046dde 395 " --network-bridge=INTERFACE\n"
90b4a64d
ZJS
396 " Add a virtual Ethernet connection to the container\n"
397 " and attach it to an existing bridge on the host\n"
398 " --network-zone=NAME Similar, but attach the new interface to an\n"
399 " an automatically managed bridge interface\n"
d7bea6b6
DP
400 " --network-namespace-path=PATH\n"
401 " Set network namespace to the one represented by\n"
402 " the specified kernel namespace file node\n"
6d0b55c2 403 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
25148653
LP
404 " Expose a container IP port on the host\n\n"
405 "%3$sSecurity:%4$s\n"
a8828ed9
DW
406 " --capability=CAP In addition to the default, retain specified\n"
407 " capability\n"
408 " --drop-capability=CAP Drop the specified capability from the default set\n"
88fc9c9b
TH
409 " --ambient-capability=CAP\n"
410 " Sets the specified capability for the started\n"
411 " process. Not useful if booting a machine.\n"
f4e803c8 412 " --no-new-privileges Set PR_SET_NO_NEW_PRIVS flag for container payload\n"
960e4569
LP
413 " --system-call-filter=LIST|~LIST\n"
414 " Permit/prohibit specific system calls\n"
25148653
LP
415 " -Z --selinux-context=SECLABEL\n"
416 " Set the SELinux security context to be used by\n"
417 " processes in the container\n"
418 " -L --selinux-apifs-context=SECLABEL\n"
419 " Set the SELinux security context to be used by\n"
420 " API/tmpfs file systems in the container\n\n"
421 "%3$sResources:%4$s\n"
bf428efb 422 " --rlimit=NAME=LIMIT Set a resource limit for the payload\n"
81f345df
LP
423 " --oom-score-adjust=VALUE\n"
424 " Adjust the OOM score value for the payload\n"
f4e803c8
LP
425 " --cpu-affinity=CPUS Adjust the CPU affinity of the container\n"
426 " --personality=ARCH Pick personality for this container\n\n"
25148653 427 "%3$sIntegration:%4$s\n"
09d423e9 428 " --resolv-conf=MODE Select mode of /etc/resolv.conf initialization\n"
1688841f 429 " --timezone=MODE Select mode of /etc/localtime initialization\n"
25148653
LP
430 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
431 " host, try-guest, try-host\n"
432 " -j Equivalent to --link-journal=try-guest\n\n"
433 "%3$sMounts:%4$s\n"
5e5bfa6e
EY
434 " --bind=PATH[:PATH[:OPTIONS]]\n"
435 " Bind mount a file or directory from the host into\n"
a8828ed9 436 " the container\n"
5e5bfa6e
EY
437 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
438 " Similar, but creates a read-only bind mount\n"
de40a303
LP
439 " --inaccessible=PATH Over-mount file node with inaccessible node to mask\n"
440 " it\n"
06c17c39 441 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
5a8af538
LP
442 " --overlay=PATH[:PATH...]:PATH\n"
443 " Create an overlay mount from the host to \n"
444 " the container\n"
445 " --overlay-ro=PATH[:PATH...]:PATH\n"
2f893044
LP
446 " Similar, but creates a read-only overlay mount\n"
447 " --bind-user=NAME Bind user from host to container\n\n"
25148653 448 "%3$sInput/Output:%4$s\n"
de40a303
LP
449 " --console=MODE Select how stdin/stdout/stderr and /dev/console are\n"
450 " set up for the container.\n"
3652872a
LP
451 " -P --pipe Equivalent to --console=pipe\n\n"
452 "%3$sCredentials:%4$s\n"
453 " --set-credential=ID:VALUE\n"
454 " Pass a credential with literal value to container.\n"
455 " --load-credential=ID:PATH\n"
456 " Load credential to pass to container from file or\n"
457 " AF_UNIX stream socket.\n"
bc556335
DDM
458 "\nSee the %2$s for details.\n",
459 program_invocation_short_name,
460 link,
461 ansi_underline(),
462 ansi_normal(),
463 ansi_highlight(),
464 ansi_normal());
37ec0fdd
LP
465
466 return 0;
88213476
LP
467}
468
86c0dd4a 469static int custom_mount_check_all(void) {
88614c8a 470 size_t i;
5a8af538 471
5a8af538
LP
472 for (i = 0; i < arg_n_custom_mounts; i++) {
473 CustomMount *m = &arg_custom_mounts[i];
474
0de7acce 475 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
6c045a99 476 if (arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_OFF)
baaa35ad 477 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
c920b863 478 "--private-users-ownership=own may not be combined with custom root mounts.");
6c045a99 479 if (arg_uid_shift == UID_INVALID)
baaa35ad
ZJS
480 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
481 "--private-users with automatic UID shift may not be combined with custom root mounts.");
825d5287 482 }
5a8af538
LP
483 }
484
485 return 0;
486}
487
8199d554 488static int detect_unified_cgroup_hierarchy_from_environment(void) {
c78c095b 489 const char *e, *var = "SYSTEMD_NSPAWN_UNIFIED_HIERARCHY";
415fc41c 490 int r;
5da38d07 491
efdb0237 492 /* Allow the user to control whether the unified hierarchy is used */
c78c095b
ZJS
493
494 e = getenv(var);
495 if (!e) {
d5fc5b2f 496 /* $UNIFIED_CGROUP_HIERARCHY has been renamed to $SYSTEMD_NSPAWN_UNIFIED_HIERARCHY. */
c78c095b
ZJS
497 var = "UNIFIED_CGROUP_HIERARCHY";
498 e = getenv(var);
c78c095b
ZJS
499 }
500
501 if (!isempty(e)) {
efdb0237
LP
502 r = parse_boolean(e);
503 if (r < 0)
c78c095b 504 return log_error_errno(r, "Failed to parse $%s: %m", var);
5da38d07
TH
505 if (r > 0)
506 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
507 else
508 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237
LP
509 }
510
8199d554
LP
511 return 0;
512}
513
514static int detect_unified_cgroup_hierarchy_from_image(const char *directory) {
515 int r;
516
75b0d8b8
ZJS
517 /* Let's inherit the mode to use from the host system, but let's take into consideration what systemd
518 * in the image actually supports. */
b4cccbc1
LP
519 r = cg_all_unified();
520 if (r < 0)
521 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
522 if (r > 0) {
a8725a06
ZJS
523 /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
524 * routine only detects 231, so we'll have a false negative here for 230. */
7e6821ed 525 r = systemd_installation_has_version(directory, "230");
a8725a06
ZJS
526 if (r < 0)
527 return log_error_errno(r, "Failed to determine systemd version in container: %m");
528 if (r > 0)
529 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
530 else
531 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
c22800e4 532 } else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
2977724b 533 /* Mixed cgroup hierarchy support was added in 233 */
7e6821ed 534 r = systemd_installation_has_version(directory, "233");
0fd9563f
ZJS
535 if (r < 0)
536 return log_error_errno(r, "Failed to determine systemd version in container: %m");
537 if (r > 0)
538 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
539 else
540 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
541 } else
5da38d07 542 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237 543
8199d554
LP
544 log_debug("Using %s hierarchy for container.",
545 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_NONE ? "legacy" :
546 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_SYSTEMD ? "hybrid" : "unified");
547
efdb0237
LP
548 return 0;
549}
550
8a99bd0c
ZJS
551static int parse_capability_spec(const char *spec, uint64_t *ret_mask) {
552 uint64_t mask = 0;
553 int r;
554
555 for (;;) {
556 _cleanup_free_ char *t = NULL;
557
558 r = extract_first_word(&spec, &t, ",", 0);
559 if (r < 0)
560 return log_error_errno(r, "Failed to parse capability %s.", t);
561 if (r == 0)
562 break;
563
564 if (streq(t, "help")) {
565 for (int i = 0; i < capability_list_length(); i++) {
566 const char *name;
567
568 name = capability_to_name(i);
569 if (name)
570 puts(name);
571 }
572
573 return 0; /* quit */
574 }
575
576 if (streq(t, "all"))
f5fbe71d 577 mask = UINT64_MAX;
8a99bd0c
ZJS
578 else {
579 r = capability_from_name(t);
580 if (r < 0)
581 return log_error_errno(r, "Failed to parse capability %s.", t);
582
583 mask |= 1ULL << r;
584 }
585 }
586
587 *ret_mask = mask;
588 return 1; /* continue */
589}
590
49048684 591static int parse_share_ns_env(const char *name, unsigned long ns_flag) {
0c582db0
LB
592 int r;
593
594 r = getenv_bool(name);
595 if (r == -ENXIO)
49048684 596 return 0;
0c582db0 597 if (r < 0)
49048684 598 return log_error_errno(r, "Failed to parse $%s: %m", name);
de40a303 599
0c582db0 600 arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
de40a303 601 arg_settings_mask |= SETTING_CLONE_NS_FLAGS;
49048684 602 return 0;
0c582db0
LB
603}
604
49048684 605static int parse_mount_settings_env(void) {
4f086aab 606 const char *e;
1099ceeb
LP
607 int r;
608
609 r = getenv_bool("SYSTEMD_NSPAWN_TMPFS_TMP");
49048684
ZJS
610 if (r < 0 && r != -ENXIO)
611 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_TMPFS_TMP: %m");
1099ceeb
LP
612 if (r >= 0)
613 SET_FLAG(arg_mount_settings, MOUNT_APPLY_TMPFS_TMP, r > 0);
4f086aab
SU
614
615 e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
49048684 616 if (streq_ptr(e, "network"))
4f086aab 617 arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS;
4f086aab 618
49048684
ZJS
619 else if (e) {
620 r = parse_boolean(e);
621 if (r < 0)
622 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_API_VFS_WRITABLE: %m");
623
624 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_RO, r == 0);
625 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_NETNS, false);
ab8ee0f2 626 }
4f086aab 627
49048684 628 return 0;
4f086aab
SU
629}
630
49048684 631static int parse_environment(void) {
d5455d2f
LP
632 const char *e;
633 int r;
634
49048684
ZJS
635 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
636 if (r < 0)
637 return r;
638 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
639 if (r < 0)
640 return r;
641 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
642 if (r < 0)
643 return r;
644 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
645 if (r < 0)
646 return r;
d5455d2f 647
49048684
ZJS
648 r = parse_mount_settings_env();
649 if (r < 0)
650 return r;
d5455d2f 651
489fae52
ZJS
652 /* SYSTEMD_NSPAWN_USE_CGNS=0 can be used to disable CLONE_NEWCGROUP use,
653 * even if it is supported. If not supported, it has no effect. */
de40a303 654 if (!cg_ns_supported())
489fae52 655 arg_use_cgns = false;
de40a303
LP
656 else {
657 r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
658 if (r < 0) {
659 if (r != -ENXIO)
49048684 660 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_USE_CGNS: %m");
de40a303
LP
661
662 arg_use_cgns = true;
663 } else {
664 arg_use_cgns = r > 0;
665 arg_settings_mask |= SETTING_USE_CGNS;
666 }
667 }
d5455d2f
LP
668
669 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
670 if (e)
671 arg_container_service_name = e;
672
4a4654e0
LP
673 r = getenv_bool("SYSTEMD_SUPPRESS_SYNC");
674 if (r >= 0)
675 arg_suppress_sync = r;
676 else if (r != -ENXIO)
677 log_debug_errno(r, "Failed to parse $SYSTEMD_SUPPRESS_SYNC, ignoring: %m");
678
49048684 679 return detect_unified_cgroup_hierarchy_from_environment();
d5455d2f
LP
680}
681
88213476 682static int parse_argv(int argc, char *argv[]) {
a41fe3a2 683 enum {
acbeb427
ZJS
684 ARG_VERSION = 0x100,
685 ARG_PRIVATE_NETWORK,
bc2f673e 686 ARG_UUID,
5076f0cc 687 ARG_READ_ONLY,
57fb9fb5 688 ARG_CAPABILITY,
88fc9c9b 689 ARG_AMBIENT_CAPABILITY,
420c7379 690 ARG_DROP_CAPABILITY,
17fe0523
LP
691 ARG_LINK_JOURNAL,
692 ARG_BIND,
f4889f65 693 ARG_BIND_RO,
06c17c39 694 ARG_TMPFS,
5a8af538
LP
695 ARG_OVERLAY,
696 ARG_OVERLAY_RO,
de40a303 697 ARG_INACCESSIBLE,
eb91eb18 698 ARG_SHARE_SYSTEM,
89f7c846 699 ARG_REGISTER,
aa28aefe 700 ARG_KEEP_UNIT,
69c79d3c 701 ARG_NETWORK_INTERFACE,
c74e630d 702 ARG_NETWORK_MACVLAN,
4bbfe7ad 703 ARG_NETWORK_IPVLAN,
ab046dde 704 ARG_NETWORK_BRIDGE,
22b28dfd 705 ARG_NETWORK_ZONE,
f6d6bad1 706 ARG_NETWORK_VETH_EXTRA,
d7bea6b6 707 ARG_NETWORK_NAMESPACE_PATH,
6afc95b7 708 ARG_PERSONALITY,
4d9f07b4 709 ARG_VOLATILE,
ec16945e 710 ARG_TEMPLATE,
f36933fe 711 ARG_PROPERTY,
6dac160c 712 ARG_PRIVATE_USERS,
c6c8f6e2 713 ARG_KILL_SIGNAL,
f757855e 714 ARG_SETTINGS,
5f932eb9 715 ARG_CHDIR,
b53ede69 716 ARG_PIVOT_ROOT,
7336138e 717 ARG_PRIVATE_USERS_CHOWN,
6c045a99 718 ARG_PRIVATE_USERS_OWNERSHIP,
9c1e04d0 719 ARG_NOTIFY_READY,
4623e8e6 720 ARG_ROOT_HASH,
89e62e0b
LP
721 ARG_ROOT_HASH_SIG,
722 ARG_VERITY_DATA,
960e4569 723 ARG_SYSTEM_CALL_FILTER,
bf428efb 724 ARG_RLIMIT,
3a9530e5 725 ARG_HOSTNAME,
66edd963 726 ARG_NO_NEW_PRIVILEGES,
81f345df 727 ARG_OOM_SCORE_ADJUST,
d107bb7d 728 ARG_CPU_AFFINITY,
09d423e9 729 ARG_RESOLV_CONF,
1688841f 730 ARG_TIMEZONE,
de40a303
LP
731 ARG_CONSOLE,
732 ARG_PIPE,
733 ARG_OCI_BUNDLE,
bb068de0 734 ARG_NO_PAGER,
3652872a
LP
735 ARG_SET_CREDENTIAL,
736 ARG_LOAD_CREDENTIAL,
2f893044 737 ARG_BIND_USER,
4a4654e0 738 ARG_SUPPRESS_SYNC,
84be0c71 739 ARG_IMAGE_POLICY,
a41fe3a2
LP
740 };
741
88213476 742 static const struct option options[] = {
d7bea6b6
DP
743 { "help", no_argument, NULL, 'h' },
744 { "version", no_argument, NULL, ARG_VERSION },
745 { "directory", required_argument, NULL, 'D' },
746 { "template", required_argument, NULL, ARG_TEMPLATE },
747 { "ephemeral", no_argument, NULL, 'x' },
748 { "user", required_argument, NULL, 'u' },
749 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
750 { "as-pid2", no_argument, NULL, 'a' },
751 { "boot", no_argument, NULL, 'b' },
752 { "uuid", required_argument, NULL, ARG_UUID },
753 { "read-only", no_argument, NULL, ARG_READ_ONLY },
754 { "capability", required_argument, NULL, ARG_CAPABILITY },
88fc9c9b 755 { "ambient-capability", required_argument, NULL, ARG_AMBIENT_CAPABILITY },
d7bea6b6 756 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
66edd963 757 { "no-new-privileges", required_argument, NULL, ARG_NO_NEW_PRIVILEGES },
d7bea6b6
DP
758 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
759 { "bind", required_argument, NULL, ARG_BIND },
760 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
761 { "tmpfs", required_argument, NULL, ARG_TMPFS },
762 { "overlay", required_argument, NULL, ARG_OVERLAY },
763 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
de40a303 764 { "inaccessible", required_argument, NULL, ARG_INACCESSIBLE },
d7bea6b6 765 { "machine", required_argument, NULL, 'M' },
3a9530e5 766 { "hostname", required_argument, NULL, ARG_HOSTNAME },
d7bea6b6
DP
767 { "slice", required_argument, NULL, 'S' },
768 { "setenv", required_argument, NULL, 'E' },
769 { "selinux-context", required_argument, NULL, 'Z' },
770 { "selinux-apifs-context", required_argument, NULL, 'L' },
771 { "quiet", no_argument, NULL, 'q' },
772 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */
773 { "register", required_argument, NULL, ARG_REGISTER },
774 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
775 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
776 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
777 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
778 { "network-veth", no_argument, NULL, 'n' },
779 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA },
780 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
781 { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
782 { "network-namespace-path", required_argument, NULL, ARG_NETWORK_NAMESPACE_PATH },
783 { "personality", required_argument, NULL, ARG_PERSONALITY },
784 { "image", required_argument, NULL, 'i' },
785 { "volatile", optional_argument, NULL, ARG_VOLATILE },
786 { "port", required_argument, NULL, 'p' },
787 { "property", required_argument, NULL, ARG_PROPERTY },
788 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
6c045a99
LP
789 { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN }, /* obsolete */
790 { "private-users-ownership",required_argument, NULL, ARG_PRIVATE_USERS_OWNERSHIP},
d7bea6b6
DP
791 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
792 { "settings", required_argument, NULL, ARG_SETTINGS },
793 { "chdir", required_argument, NULL, ARG_CHDIR },
794 { "pivot-root", required_argument, NULL, ARG_PIVOT_ROOT },
795 { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
796 { "root-hash", required_argument, NULL, ARG_ROOT_HASH },
89e62e0b
LP
797 { "root-hash-sig", required_argument, NULL, ARG_ROOT_HASH_SIG },
798 { "verity-data", required_argument, NULL, ARG_VERITY_DATA },
d7bea6b6 799 { "system-call-filter", required_argument, NULL, ARG_SYSTEM_CALL_FILTER },
bf428efb 800 { "rlimit", required_argument, NULL, ARG_RLIMIT },
81f345df 801 { "oom-score-adjust", required_argument, NULL, ARG_OOM_SCORE_ADJUST },
d107bb7d 802 { "cpu-affinity", required_argument, NULL, ARG_CPU_AFFINITY },
09d423e9 803 { "resolv-conf", required_argument, NULL, ARG_RESOLV_CONF },
1688841f 804 { "timezone", required_argument, NULL, ARG_TIMEZONE },
de40a303
LP
805 { "console", required_argument, NULL, ARG_CONSOLE },
806 { "pipe", no_argument, NULL, ARG_PIPE },
807 { "oci-bundle", required_argument, NULL, ARG_OCI_BUNDLE },
bb068de0 808 { "no-pager", no_argument, NULL, ARG_NO_PAGER },
3652872a
LP
809 { "set-credential", required_argument, NULL, ARG_SET_CREDENTIAL },
810 { "load-credential", required_argument, NULL, ARG_LOAD_CREDENTIAL },
2f893044 811 { "bind-user", required_argument, NULL, ARG_BIND_USER },
4a4654e0 812 { "suppress-sync", required_argument, NULL, ARG_SUPPRESS_SYNC },
84be0c71 813 { "image-policy", required_argument, NULL, ARG_IMAGE_POLICY },
eb9da376 814 {}
88213476
LP
815 };
816
9444b1f2 817 int c, r;
a42c8b54 818 uint64_t plus = 0, minus = 0;
f757855e 819 bool mask_all_settings = false, mask_no_settings = false;
88213476
LP
820
821 assert(argc >= 0);
822 assert(argv);
823
ef9c12b1
YW
824 /* Resetting to 0 forces the invocation of an internal initialization routine of getopt_long()
825 * that checks for GNU extensions in optstring ('-' or '+' at the beginning). */
826 optind = 0;
de40a303 827 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nUE:P", options, NULL)) >= 0)
88213476
LP
828 switch (c) {
829
830 case 'h':
37ec0fdd 831 return help();
88213476 832
acbeb427 833 case ARG_VERSION:
3f6fd1ba 834 return version();
acbeb427 835
88213476 836 case 'D':
614b022c 837 r = parse_path_argument(optarg, false, &arg_directory);
ec16945e 838 if (r < 0)
0f03c2a4 839 return r;
de40a303
LP
840
841 arg_settings_mask |= SETTING_DIRECTORY;
ec16945e
LP
842 break;
843
844 case ARG_TEMPLATE:
614b022c 845 r = parse_path_argument(optarg, false, &arg_template);
ec16945e 846 if (r < 0)
0f03c2a4 847 return r;
de40a303
LP
848
849 arg_settings_mask |= SETTING_DIRECTORY;
88213476
LP
850 break;
851
1b9e5b12 852 case 'i':
614b022c 853 r = parse_path_argument(optarg, false, &arg_image);
ec16945e 854 if (r < 0)
0f03c2a4 855 return r;
de40a303
LP
856
857 arg_settings_mask |= SETTING_DIRECTORY;
858 break;
859
860 case ARG_OCI_BUNDLE:
614b022c 861 r = parse_path_argument(optarg, false, &arg_oci_bundle);
de40a303
LP
862 if (r < 0)
863 return r;
864
ec16945e
LP
865 break;
866
867 case 'x':
868 arg_ephemeral = true;
a2f577fc 869 arg_settings_mask |= SETTING_EPHEMERAL;
1b9e5b12
LP
870 break;
871
687d0825 872 case 'u':
2fc09a9c
DM
873 r = free_and_strdup(&arg_user, optarg);
874 if (r < 0)
7027ff61 875 return log_oom();
687d0825 876
f757855e 877 arg_settings_mask |= SETTING_USER;
687d0825
MV
878 break;
879
22b28dfd 880 case ARG_NETWORK_ZONE: {
fee9f7b5 881 _cleanup_free_ char *j = NULL;
22b28dfd 882
b910cc72 883 j = strjoin("vz-", optarg);
22b28dfd
LP
884 if (!j)
885 return log_oom();
886
fee9f7b5
FS
887 if (!ifname_valid(j))
888 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
889 "Network zone name not valid: %s", j);
22b28dfd 890
df1fac6d 891 free_and_replace(arg_network_zone, j);
22b28dfd
LP
892
893 arg_network_veth = true;
894 arg_private_network = true;
895 arg_settings_mask |= SETTING_NETWORK;
896 break;
897 }
898
ab046dde 899 case ARG_NETWORK_BRIDGE:
ef76dff2 900
baaa35ad
ZJS
901 if (!ifname_valid(optarg))
902 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
903 "Bridge interface name not valid: %s", optarg);
ef76dff2 904
f757855e
LP
905 r = free_and_strdup(&arg_network_bridge, optarg);
906 if (r < 0)
907 return log_oom();
ab046dde 908
4831981d 909 _fallthrough_;
0dfaa006 910 case 'n':
69c79d3c
LP
911 arg_network_veth = true;
912 arg_private_network = true;
f757855e 913 arg_settings_mask |= SETTING_NETWORK;
69c79d3c
LP
914 break;
915
f6d6bad1
LP
916 case ARG_NETWORK_VETH_EXTRA:
917 r = veth_extra_parse(&arg_network_veth_extra, optarg);
918 if (r < 0)
919 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
920
921 arg_private_network = true;
922 arg_settings_mask |= SETTING_NETWORK;
923 break;
924
aa28aefe 925 case ARG_NETWORK_INTERFACE:
2f091b1b 926 r = interface_pair_parse(&arg_network_interfaces, optarg);
b390f178
DDM
927 if (r < 0)
928 return r;
929
c74e630d 930 arg_private_network = true;
f757855e 931 arg_settings_mask |= SETTING_NETWORK;
c74e630d
LP
932 break;
933
934 case ARG_NETWORK_MACVLAN:
2f091b1b 935 r = macvlan_pair_parse(&arg_network_macvlan, optarg);
b390f178
DDM
936 if (r < 0)
937 return r;
938
4bbfe7ad 939 arg_private_network = true;
f757855e 940 arg_settings_mask |= SETTING_NETWORK;
4bbfe7ad
TG
941 break;
942
943 case ARG_NETWORK_IPVLAN:
2f091b1b 944 r = ipvlan_pair_parse(&arg_network_ipvlan, optarg);
b390f178
DDM
945 if (r < 0)
946 return r;
947
4831981d 948 _fallthrough_;
ff01d048
LP
949 case ARG_PRIVATE_NETWORK:
950 arg_private_network = true;
f757855e 951 arg_settings_mask |= SETTING_NETWORK;
a41fe3a2
LP
952 break;
953
d7bea6b6 954 case ARG_NETWORK_NAMESPACE_PATH:
614b022c 955 r = parse_path_argument(optarg, false, &arg_network_namespace_path);
d7bea6b6
DP
956 if (r < 0)
957 return r;
958
de40a303 959 arg_settings_mask |= SETTING_NETWORK;
d7bea6b6
DP
960 break;
961
0f0dbc46 962 case 'b':
baaa35ad
ZJS
963 if (arg_start_mode == START_PID2)
964 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
965 "--boot and --as-pid2 may not be combined.");
7732f92b
LP
966
967 arg_start_mode = START_BOOT;
968 arg_settings_mask |= SETTING_START_MODE;
969 break;
970
971 case 'a':
baaa35ad
ZJS
972 if (arg_start_mode == START_BOOT)
973 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
974 "--boot and --as-pid2 may not be combined.");
7732f92b
LP
975
976 arg_start_mode = START_PID2;
977 arg_settings_mask |= SETTING_START_MODE;
0f0dbc46
LP
978 break;
979
144f0fc0 980 case ARG_UUID:
9444b1f2 981 r = sd_id128_from_string(optarg, &arg_uuid);
317feb4d
LP
982 if (r < 0)
983 return log_error_errno(r, "Invalid UUID: %s", optarg);
984
baaa35ad
ZJS
985 if (sd_id128_is_null(arg_uuid))
986 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
987 "Machine UUID may not be all zeroes.");
f757855e
LP
988
989 arg_settings_mask |= SETTING_MACHINE_ID;
9444b1f2 990 break;
aa96c6cb 991
43c3fb46
LP
992 case 'S': {
993 _cleanup_free_ char *mangled = NULL;
994
995 r = unit_name_mangle_with_suffix(optarg, NULL, UNIT_NAME_MANGLE_WARN, ".slice", &mangled);
de40a303
LP
996 if (r < 0)
997 return log_oom();
998
43c3fb46 999 free_and_replace(arg_slice, mangled);
de40a303 1000 arg_settings_mask |= SETTING_SLICE;
144f0fc0 1001 break;
43c3fb46 1002 }
144f0fc0 1003
7027ff61 1004 case 'M':
c1521918 1005 if (isempty(optarg))
97b11eed 1006 arg_machine = mfree(arg_machine);
c1521918 1007 else {
52ef5dd7 1008 if (!hostname_is_valid(optarg, 0))
baaa35ad
ZJS
1009 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1010 "Invalid machine name: %s", optarg);
7027ff61 1011
0c3c4284
LP
1012 r = free_and_strdup(&arg_machine, optarg);
1013 if (r < 0)
eb91eb18 1014 return log_oom();
eb91eb18 1015 }
9ce6d1b3 1016 break;
7027ff61 1017
3a9530e5
LP
1018 case ARG_HOSTNAME:
1019 if (isempty(optarg))
1020 arg_hostname = mfree(arg_hostname);
1021 else {
52ef5dd7 1022 if (!hostname_is_valid(optarg, 0))
baaa35ad
ZJS
1023 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1024 "Invalid hostname: %s", optarg);
3a9530e5
LP
1025
1026 r = free_and_strdup(&arg_hostname, optarg);
1027 if (r < 0)
1028 return log_oom();
1029 }
1030
1031 arg_settings_mask |= SETTING_HOSTNAME;
1032 break;
1033
82adf6af
LP
1034 case 'Z':
1035 arg_selinux_context = optarg;
a8828ed9
DW
1036 break;
1037
82adf6af
LP
1038 case 'L':
1039 arg_selinux_apifs_context = optarg;
a8828ed9
DW
1040 break;
1041
bc2f673e
LP
1042 case ARG_READ_ONLY:
1043 arg_read_only = true;
f757855e 1044 arg_settings_mask |= SETTING_READ_ONLY;
bc2f673e
LP
1045 break;
1046
88fc9c9b
TH
1047 case ARG_AMBIENT_CAPABILITY: {
1048 uint64_t m;
1049 r = parse_capability_spec(optarg, &m);
1050 if (r <= 0)
1051 return r;
1052 arg_caps_ambient |= m;
1053 arg_settings_mask |= SETTING_CAPABILITY;
1054 break;
1055 }
420c7379
LP
1056 case ARG_CAPABILITY:
1057 case ARG_DROP_CAPABILITY: {
8a99bd0c
ZJS
1058 uint64_t m;
1059 r = parse_capability_spec(optarg, &m);
1060 if (r <= 0)
1061 return r;
5076f0cc 1062
8a99bd0c
ZJS
1063 if (c == ARG_CAPABILITY)
1064 plus |= m;
1065 else
1066 minus |= m;
f757855e 1067 arg_settings_mask |= SETTING_CAPABILITY;
5076f0cc
LP
1068 break;
1069 }
66edd963
LP
1070 case ARG_NO_NEW_PRIVILEGES:
1071 r = parse_boolean(optarg);
1072 if (r < 0)
1073 return log_error_errno(r, "Failed to parse --no-new-privileges= argument: %s", optarg);
1074
1075 arg_no_new_privileges = r;
1076 arg_settings_mask |= SETTING_NO_NEW_PRIVILEGES;
1077 break;
1078
57fb9fb5
LP
1079 case 'j':
1080 arg_link_journal = LINK_GUEST;
574edc90 1081 arg_link_journal_try = true;
4e1d6aa9 1082 arg_settings_mask |= SETTING_LINK_JOURNAL;
57fb9fb5
LP
1083 break;
1084
1085 case ARG_LINK_JOURNAL:
4e1d6aa9 1086 r = parse_link_journal(optarg, &arg_link_journal, &arg_link_journal_try);
c6147113
LP
1087 if (r < 0)
1088 return log_error_errno(r, "Failed to parse link journal mode %s", optarg);
57fb9fb5 1089
4e1d6aa9 1090 arg_settings_mask |= SETTING_LINK_JOURNAL;
57fb9fb5
LP
1091 break;
1092
17fe0523 1093 case ARG_BIND:
f757855e
LP
1094 case ARG_BIND_RO:
1095 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
1096 if (r < 0)
1097 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
17fe0523 1098
f757855e 1099 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
17fe0523 1100 break;
06c17c39 1101
f757855e
LP
1102 case ARG_TMPFS:
1103 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
1104 if (r < 0)
1105 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
5a8af538 1106
f757855e 1107 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
5a8af538 1108 break;
5a8af538
LP
1109
1110 case ARG_OVERLAY:
ad85779a
LP
1111 case ARG_OVERLAY_RO:
1112 r = overlay_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_OVERLAY_RO);
1113 if (r == -EADDRNOTAVAIL)
1114 return log_error_errno(r, "--overlay(-ro)= needs at least two colon-separated directories specified.");
1115 if (r < 0)
1116 return log_error_errno(r, "Failed to parse --overlay(-ro)= argument %s: %m", optarg);
06c17c39 1117
f757855e 1118 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
06c17c39 1119 break;
06c17c39 1120
de40a303
LP
1121 case ARG_INACCESSIBLE:
1122 r = inaccessible_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
1123 if (r < 0)
1124 return log_error_errno(r, "Failed to parse --inaccessible= argument %s: %m", optarg);
1125
1126 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1127 break;
1128
0d2a0179
ZJS
1129 case 'E':
1130 r = strv_env_replace_strdup_passthrough(&arg_setenv, optarg);
aaf057c4 1131 if (r < 0)
0d2a0179 1132 return log_error_errno(r, "Cannot assign environment variable %s: %m", optarg);
f4889f65 1133
f757855e 1134 arg_settings_mask |= SETTING_ENVIRONMENT;
f4889f65 1135 break;
f4889f65 1136
284c0b91
LP
1137 case 'q':
1138 arg_quiet = true;
1139 break;
1140
8a96d94e 1141 case ARG_SHARE_SYSTEM:
a6b5216c 1142 /* We don't officially support this anymore, except for compat reasons. People should use the
0c582db0 1143 * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
63d1c29f 1144 log_warning("Please do not use --share-system anymore, use $SYSTEMD_NSPAWN_SHARE_* instead.");
0c582db0 1145 arg_clone_ns_flags = 0;
8a96d94e
LP
1146 break;
1147
eb91eb18
LP
1148 case ARG_REGISTER:
1149 r = parse_boolean(optarg);
1150 if (r < 0) {
1151 log_error("Failed to parse --register= argument: %s", optarg);
1152 return r;
1153 }
1154
1155 arg_register = r;
1156 break;
1157
89f7c846
LP
1158 case ARG_KEEP_UNIT:
1159 arg_keep_unit = true;
1160 break;
1161
6afc95b7
LP
1162 case ARG_PERSONALITY:
1163
ac45f971 1164 arg_personality = personality_from_string(optarg);
baaa35ad
ZJS
1165 if (arg_personality == PERSONALITY_INVALID)
1166 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1167 "Unknown or unsupported personality '%s'.", optarg);
6afc95b7 1168
f757855e 1169 arg_settings_mask |= SETTING_PERSONALITY;
6afc95b7
LP
1170 break;
1171
4d9f07b4
LP
1172 case ARG_VOLATILE:
1173
1174 if (!optarg)
f757855e 1175 arg_volatile_mode = VOLATILE_YES;
5c828e66
LP
1176 else if (streq(optarg, "help")) {
1177 DUMP_STRING_TABLE(volatile_mode, VolatileMode, _VOLATILE_MODE_MAX);
1178 return 0;
1179 } else {
f757855e 1180 VolatileMode m;
4d9f07b4 1181
f757855e 1182 m = volatile_mode_from_string(optarg);
baaa35ad
ZJS
1183 if (m < 0)
1184 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1185 "Failed to parse --volatile= argument: %s", optarg);
1186 else
f757855e 1187 arg_volatile_mode = m;
6d0b55c2
LP
1188 }
1189
f757855e
LP
1190 arg_settings_mask |= SETTING_VOLATILE_MODE;
1191 break;
6d0b55c2 1192
f757855e
LP
1193 case 'p':
1194 r = expose_port_parse(&arg_expose_ports, optarg);
1195 if (r == -EEXIST)
1196 return log_error_errno(r, "Duplicate port specification: %s", optarg);
1197 if (r < 0)
1198 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
6d0b55c2 1199
f757855e 1200 arg_settings_mask |= SETTING_EXPOSE_PORTS;
6d0b55c2 1201 break;
6d0b55c2 1202
f36933fe
LP
1203 case ARG_PROPERTY:
1204 if (strv_extend(&arg_property, optarg) < 0)
1205 return log_oom();
1206
1207 break;
1208
ae209204 1209 case ARG_PRIVATE_USERS: {
33eac552 1210 int boolean;
0de7acce 1211
ae209204
ZJS
1212 if (!optarg)
1213 boolean = true;
1214 else if (!in_charset(optarg, DIGITS))
1215 /* do *not* parse numbers as booleans */
1216 boolean = parse_boolean(optarg);
33eac552
LP
1217 else
1218 boolean = -1;
ae209204 1219
33eac552 1220 if (boolean == 0) {
0de7acce
LP
1221 /* no: User namespacing off */
1222 arg_userns_mode = USER_NAMESPACE_NO;
1223 arg_uid_shift = UID_INVALID;
1224 arg_uid_range = UINT32_C(0x10000);
33eac552 1225 } else if (boolean > 0) {
0de7acce
LP
1226 /* yes: User namespacing on, UID range is read from root dir */
1227 arg_userns_mode = USER_NAMESPACE_FIXED;
1228 arg_uid_shift = UID_INVALID;
1229 arg_uid_range = UINT32_C(0x10000);
1230 } else if (streq(optarg, "pick")) {
1231 /* pick: User namespacing on, UID range is picked randomly */
6c045a99
LP
1232 arg_userns_mode = USER_NAMESPACE_PICK; /* Note that arg_userns_ownership is
1233 * implied by USER_NAMESPACE_PICK
33eac552 1234 * further down. */
0de7acce
LP
1235 arg_uid_shift = UID_INVALID;
1236 arg_uid_range = UINT32_C(0x10000);
33eac552
LP
1237
1238 } else if (streq(optarg, "identity")) {
6c2d70ce 1239 /* identity: User namespaces on, the UID range 0…0xFFFF is mapped to
33eac552
LP
1240 * itself, i.e. we don't actually map anything, but do take benefit of
1241 * isolation of capability sets. */
1242 arg_userns_mode = USER_NAMESPACE_FIXED;
1243 arg_uid_shift = 0;
1244 arg_uid_range = UINT32_C(0x10000);
0de7acce 1245 } else {
6c2058b3 1246 _cleanup_free_ char *buffer = NULL;
6dac160c
LP
1247 const char *range, *shift;
1248
0de7acce
LP
1249 /* anything else: User namespacing on, UID range is explicitly configured */
1250
6dac160c
LP
1251 range = strchr(optarg, ':');
1252 if (range) {
6c2058b3
ZJS
1253 buffer = strndup(optarg, range - optarg);
1254 if (!buffer)
1255 return log_oom();
1256 shift = buffer;
6dac160c
LP
1257
1258 range++;
bfd292ec
ZJS
1259 r = safe_atou32(range, &arg_uid_range);
1260 if (r < 0)
be715731 1261 return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range);
6dac160c
LP
1262 } else
1263 shift = optarg;
1264
be715731
ZJS
1265 r = parse_uid(shift, &arg_uid_shift);
1266 if (r < 0)
1267 return log_error_errno(r, "Failed to parse UID \"%s\": %m", optarg);
0de7acce
LP
1268
1269 arg_userns_mode = USER_NAMESPACE_FIXED;
6dac160c 1270
58e13de5
LP
1271 if (!userns_shift_range_valid(arg_uid_shift, arg_uid_range))
1272 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "UID range cannot be empty or go beyond " UID_FMT ".", UID_INVALID);
1273 }
be715731 1274
0de7acce 1275 arg_settings_mask |= SETTING_USERNS;
6dac160c 1276 break;
ae209204 1277 }
6dac160c 1278
0de7acce 1279 case 'U':
ccabee0d 1280 if (userns_supported()) {
6c045a99
LP
1281 arg_userns_mode = USER_NAMESPACE_PICK; /* Note that arg_userns_ownership is
1282 * implied by USER_NAMESPACE_PICK
33eac552 1283 * further down. */
ccabee0d
LP
1284 arg_uid_shift = UID_INVALID;
1285 arg_uid_range = UINT32_C(0x10000);
1286
1287 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
1288 }
1289
7336138e
LP
1290 break;
1291
0de7acce 1292 case ARG_PRIVATE_USERS_CHOWN:
6c045a99
LP
1293 arg_userns_ownership = USER_NAMESPACE_OWNERSHIP_CHOWN;
1294
1295 arg_settings_mask |= SETTING_USERNS;
1296 break;
1297
1298 case ARG_PRIVATE_USERS_OWNERSHIP:
1299 if (streq(optarg, "help")) {
1300 DUMP_STRING_TABLE(user_namespace_ownership, UserNamespaceOwnership, _USER_NAMESPACE_OWNERSHIP_MAX);
1301 return 0;
1302 }
1303
1304 arg_userns_ownership = user_namespace_ownership_from_string(optarg);
1305 if (arg_userns_ownership < 0)
1306 return log_error_errno(arg_userns_ownership, "Cannot parse --user-namespace-ownership= value: %s", optarg);
0de7acce
LP
1307
1308 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
1309 break;
1310
c6c8f6e2 1311 case ARG_KILL_SIGNAL:
5c828e66
LP
1312 if (streq(optarg, "help")) {
1313 DUMP_STRING_TABLE(signal, int, _NSIG);
1314 return 0;
1315 }
1316
29a3db75 1317 arg_kill_signal = signal_from_string(optarg);
baaa35ad 1318 if (arg_kill_signal < 0)
7211c853 1319 return log_error_errno(arg_kill_signal, "Cannot parse signal: %s", optarg);
c6c8f6e2 1320
f757855e
LP
1321 arg_settings_mask |= SETTING_KILL_SIGNAL;
1322 break;
1323
1324 case ARG_SETTINGS:
1325
1326 /* no → do not read files
1327 * yes → read files, do not override cmdline, trust only subset
1328 * override → read files, override cmdline, trust only subset
1329 * trusted → read files, do not override cmdline, trust all
1330 */
1331
1332 r = parse_boolean(optarg);
1333 if (r < 0) {
1334 if (streq(optarg, "trusted")) {
1335 mask_all_settings = false;
1336 mask_no_settings = false;
1337 arg_settings_trusted = true;
1338
1339 } else if (streq(optarg, "override")) {
1340 mask_all_settings = false;
1341 mask_no_settings = true;
1342 arg_settings_trusted = -1;
1343 } else
1344 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
1345 } else if (r > 0) {
1346 /* yes */
1347 mask_all_settings = false;
1348 mask_no_settings = false;
1349 arg_settings_trusted = -1;
1350 } else {
1351 /* no */
1352 mask_all_settings = true;
1353 mask_no_settings = false;
1354 arg_settings_trusted = false;
1355 }
1356
c6c8f6e2
LP
1357 break;
1358
5f932eb9 1359 case ARG_CHDIR:
baaa35ad
ZJS
1360 if (!path_is_absolute(optarg))
1361 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1362 "Working directory %s is not an absolute path.", optarg);
5f932eb9
LP
1363
1364 r = free_and_strdup(&arg_chdir, optarg);
1365 if (r < 0)
1366 return log_oom();
1367
1368 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
1369 break;
1370
b53ede69
PW
1371 case ARG_PIVOT_ROOT:
1372 r = pivot_root_parse(&arg_pivot_root_new, &arg_pivot_root_old, optarg);
1373 if (r < 0)
1374 return log_error_errno(r, "Failed to parse --pivot-root= argument %s: %m", optarg);
1375
1376 arg_settings_mask |= SETTING_PIVOT_ROOT;
1377 break;
1378
9c1e04d0
AP
1379 case ARG_NOTIFY_READY:
1380 r = parse_boolean(optarg);
baaa35ad
ZJS
1381 if (r < 0)
1382 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1383 "%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
9c1e04d0
AP
1384 arg_notify_ready = r;
1385 arg_settings_mask |= SETTING_NOTIFY_READY;
1386 break;
1387
4623e8e6 1388 case ARG_ROOT_HASH: {
89e62e0b 1389 _cleanup_free_ void *k = NULL;
4623e8e6
LP
1390 size_t l;
1391
1392 r = unhexmem(optarg, strlen(optarg), &k, &l);
1393 if (r < 0)
1394 return log_error_errno(r, "Failed to parse root hash: %s", optarg);
89e62e0b 1395 if (l < sizeof(sd_id128_t))
c6147113 1396 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Root hash must be at least 128bit long: %s", optarg);
4623e8e6 1397
89e62e0b
LP
1398 free_and_replace(arg_verity_settings.root_hash, k);
1399 arg_verity_settings.root_hash_size = l;
4623e8e6
LP
1400 break;
1401 }
1402
c2923fdc
LB
1403 case ARG_ROOT_HASH_SIG: {
1404 char *value;
89e62e0b
LP
1405 size_t l;
1406 void *p;
c2923fdc
LB
1407
1408 if ((value = startswith(optarg, "base64:"))) {
c2923fdc
LB
1409 r = unbase64mem(value, strlen(value), &p, &l);
1410 if (r < 0)
1411 return log_error_errno(r, "Failed to parse root hash signature '%s': %m", optarg);
1412
c2923fdc 1413 } else {
89e62e0b 1414 r = read_full_file(optarg, (char**) &p, &l);
c2923fdc 1415 if (r < 0)
89e62e0b 1416 return log_error_errno(r, "Failed parse root hash signature file '%s': %m", optarg);
c2923fdc
LB
1417 }
1418
89e62e0b
LP
1419 free_and_replace(arg_verity_settings.root_hash_sig, p);
1420 arg_verity_settings.root_hash_sig_size = l;
c2923fdc
LB
1421 break;
1422 }
1423
89e62e0b 1424 case ARG_VERITY_DATA:
614b022c 1425 r = parse_path_argument(optarg, false, &arg_verity_settings.data_path);
89e62e0b
LP
1426 if (r < 0)
1427 return r;
1428 break;
1429
960e4569
LP
1430 case ARG_SYSTEM_CALL_FILTER: {
1431 bool negative;
1432 const char *items;
1433
1434 negative = optarg[0] == '~';
1435 items = negative ? optarg + 1 : optarg;
1436
1437 for (;;) {
1438 _cleanup_free_ char *word = NULL;
1439
1440 r = extract_first_word(&items, &word, NULL, 0);
1441 if (r == 0)
1442 break;
1443 if (r == -ENOMEM)
1444 return log_oom();
1445 if (r < 0)
1446 return log_error_errno(r, "Failed to parse system call filter: %m");
1447
1448 if (negative)
6b000af4 1449 r = strv_extend(&arg_syscall_deny_list, word);
960e4569 1450 else
6b000af4 1451 r = strv_extend(&arg_syscall_allow_list, word);
960e4569
LP
1452 if (r < 0)
1453 return log_oom();
1454 }
1455
1456 arg_settings_mask |= SETTING_SYSCALL_FILTER;
1457 break;
1458 }
1459
bf428efb
LP
1460 case ARG_RLIMIT: {
1461 const char *eq;
622ecfa8 1462 _cleanup_free_ char *name = NULL;
bf428efb
LP
1463 int rl;
1464
5c828e66
LP
1465 if (streq(optarg, "help")) {
1466 DUMP_STRING_TABLE(rlimit, int, _RLIMIT_MAX);
1467 return 0;
1468 }
1469
bf428efb 1470 eq = strchr(optarg, '=');
baaa35ad
ZJS
1471 if (!eq)
1472 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1473 "--rlimit= expects an '=' assignment.");
bf428efb
LP
1474
1475 name = strndup(optarg, eq - optarg);
1476 if (!name)
1477 return log_oom();
1478
1479 rl = rlimit_from_string_harder(name);
baaa35ad 1480 if (rl < 0)
7211c853 1481 return log_error_errno(rl, "Unknown resource limit: %s", name);
bf428efb
LP
1482
1483 if (!arg_rlimit[rl]) {
1484 arg_rlimit[rl] = new0(struct rlimit, 1);
1485 if (!arg_rlimit[rl])
1486 return log_oom();
1487 }
1488
1489 r = rlimit_parse(rl, eq + 1, arg_rlimit[rl]);
1490 if (r < 0)
1491 return log_error_errno(r, "Failed to parse resource limit: %s", eq + 1);
1492
1493 arg_settings_mask |= SETTING_RLIMIT_FIRST << rl;
1494 break;
1495 }
1496
81f345df
LP
1497 case ARG_OOM_SCORE_ADJUST:
1498 r = parse_oom_score_adjust(optarg, &arg_oom_score_adjust);
1499 if (r < 0)
1500 return log_error_errno(r, "Failed to parse --oom-score-adjust= parameter: %s", optarg);
1501
1502 arg_oom_score_adjust_set = true;
1503 arg_settings_mask |= SETTING_OOM_SCORE_ADJUST;
1504 break;
1505
d107bb7d 1506 case ARG_CPU_AFFINITY: {
0985c7c4 1507 CPUSet cpuset;
d107bb7d
LP
1508
1509 r = parse_cpu_set(optarg, &cpuset);
1510 if (r < 0)
0985c7c4 1511 return log_error_errno(r, "Failed to parse CPU affinity mask %s: %m", optarg);
d107bb7d 1512
0985c7c4
ZJS
1513 cpu_set_reset(&arg_cpu_set);
1514 arg_cpu_set = cpuset;
d107bb7d
LP
1515 arg_settings_mask |= SETTING_CPU_AFFINITY;
1516 break;
1517 }
1518
09d423e9
LP
1519 case ARG_RESOLV_CONF:
1520 if (streq(optarg, "help")) {
1521 DUMP_STRING_TABLE(resolv_conf_mode, ResolvConfMode, _RESOLV_CONF_MODE_MAX);
1522 return 0;
1523 }
1524
1525 arg_resolv_conf = resolv_conf_mode_from_string(optarg);
baaa35ad 1526 if (arg_resolv_conf < 0)
7211c853 1527 return log_error_errno(arg_resolv_conf,
baaa35ad 1528 "Failed to parse /etc/resolv.conf mode: %s", optarg);
09d423e9
LP
1529
1530 arg_settings_mask |= SETTING_RESOLV_CONF;
1531 break;
1532
1688841f
LP
1533 case ARG_TIMEZONE:
1534 if (streq(optarg, "help")) {
1535 DUMP_STRING_TABLE(timezone_mode, TimezoneMode, _TIMEZONE_MODE_MAX);
1536 return 0;
1537 }
1538
1539 arg_timezone = timezone_mode_from_string(optarg);
baaa35ad 1540 if (arg_timezone < 0)
7211c853 1541 return log_error_errno(arg_timezone,
baaa35ad 1542 "Failed to parse /etc/localtime mode: %s", optarg);
1688841f
LP
1543
1544 arg_settings_mask |= SETTING_TIMEZONE;
1545 break;
1546
de40a303 1547 case ARG_CONSOLE:
dce66ffe
ZJS
1548 r = handle_arg_console(optarg);
1549 if (r <= 0)
1550 return r;
de40a303
LP
1551 break;
1552
1553 case 'P':
1554 case ARG_PIPE:
dce66ffe
ZJS
1555 r = handle_arg_console("pipe");
1556 if (r <= 0)
1557 return r;
de40a303
LP
1558 break;
1559
bb068de0
ZJS
1560 case ARG_NO_PAGER:
1561 arg_pager_flags |= PAGER_DISABLE;
1562 break;
1563
3652872a
LP
1564 case ARG_SET_CREDENTIAL: {
1565 _cleanup_free_ char *word = NULL, *data = NULL;
1566 const char *p = optarg;
1567 Credential *a;
e437538f 1568 ssize_t l;
3652872a
LP
1569
1570 r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
1571 if (r == -ENOMEM)
1572 return log_oom();
1573 if (r < 0)
1574 return log_error_errno(r, "Failed to parse --set-credential= parameter: %m");
1575 if (r == 0 || !p)
1576 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Missing value for --set-credential=: %s", optarg);
1577
1578 if (!credential_name_valid(word))
1579 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Credential name is not valid: %s", word);
1580
12d729b2 1581 for (size_t i = 0; i < arg_n_credentials; i++)
3652872a
LP
1582 if (streq(arg_credentials[i].id, word))
1583 return log_error_errno(SYNTHETIC_ERRNO(EEXIST), "Duplicate credential '%s', refusing.", word);
1584
1585 l = cunescape(p, UNESCAPE_ACCEPT_NUL, &data);
1586 if (l < 0)
1587 return log_error_errno(l, "Failed to unescape credential data: %s", p);
1588
1589 a = reallocarray(arg_credentials, arg_n_credentials + 1, sizeof(Credential));
1590 if (!a)
1591 return log_oom();
1592
1593 a[arg_n_credentials++] = (Credential) {
1594 .id = TAKE_PTR(word),
1595 .data = TAKE_PTR(data),
1596 .size = l,
1597 };
1598
1599 arg_credentials = a;
1600
1601 arg_settings_mask |= SETTING_CREDENTIALS;
1602 break;
1603 }
1604
1605 case ARG_LOAD_CREDENTIAL: {
1606 ReadFullFileFlags flags = READ_FULL_FILE_SECURE;
1607 _cleanup_(erase_and_freep) char *data = NULL;
1608 _cleanup_free_ char *word = NULL, *j = NULL;
1609 const char *p = optarg;
1610 Credential *a;
1611 size_t size, i;
1612
1613 r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
1614 if (r == -ENOMEM)
1615 return log_oom();
1616 if (r < 0)
c941b650 1617 return log_error_errno(r, "Failed to parse --load-credential= parameter: %m");
3652872a 1618 if (r == 0 || !p)
c941b650 1619 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Missing value for --load-credential=: %s", optarg);
3652872a
LP
1620
1621 if (!credential_name_valid(word))
1622 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Credential name is not valid: %s", word);
1623
1624 for (i = 0; i < arg_n_credentials; i++)
1625 if (streq(arg_credentials[i].id, word))
1626 return log_error_errno(SYNTHETIC_ERRNO(EEXIST), "Duplicate credential '%s', refusing.", word);
1627
1628 if (path_is_absolute(p))
1629 flags |= READ_FULL_FILE_CONNECT_SOCKET;
1630 else {
1631 const char *e;
1632
786d19fd
LP
1633 r = get_credentials_dir(&e);
1634 if (r < 0)
1635 return log_error_errno(r, "Credential not available (no credentials passed at all): %s", word);
3652872a
LP
1636
1637 j = path_join(e, p);
1638 if (!j)
1639 return log_oom();
1640 }
1641
986311c2
LP
1642 r = read_full_file_full(AT_FDCWD, j ?: p, UINT64_MAX, SIZE_MAX,
1643 flags,
1644 NULL,
1645 &data, &size);
3652872a
LP
1646 if (r < 0)
1647 return log_error_errno(r, "Failed to read credential '%s': %m", j ?: p);
1648
1649 a = reallocarray(arg_credentials, arg_n_credentials + 1, sizeof(Credential));
1650 if (!a)
1651 return log_oom();
1652
1653 a[arg_n_credentials++] = (Credential) {
1654 .id = TAKE_PTR(word),
1655 .data = TAKE_PTR(data),
1656 .size = size,
1657 };
1658
1659 arg_credentials = a;
1660
1661 arg_settings_mask |= SETTING_CREDENTIALS;
1662 break;
1663 }
1664
2f893044
LP
1665 case ARG_BIND_USER:
1666 if (!valid_user_group_name(optarg, 0))
1667 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid user name to bind: %s", optarg);
1668
1669 if (strv_extend(&arg_bind_user, optarg) < 0)
1670 return log_oom();
1671
1672 arg_settings_mask |= SETTING_BIND_USER;
1673 break;
1674
4a4654e0
LP
1675 case ARG_SUPPRESS_SYNC:
1676 r = parse_boolean_argument("--suppress-sync=", optarg, &arg_suppress_sync);
1677 if (r < 0)
1678 return r;
1679
1680 arg_settings_mask |= SETTING_SUPPRESS_SYNC;
1681 break;
1682
06e78680
YW
1683 case ARG_IMAGE_POLICY:
1684 r = parse_image_policy_argument(optarg, &arg_image_policy);
84be0c71 1685 if (r < 0)
06e78680 1686 return r;
84be0c71 1687 break;
84be0c71 1688
88213476
LP
1689 case '?':
1690 return -EINVAL;
1691
1692 default:
04499a70 1693 assert_not_reached();
88213476 1694 }
88213476 1695
60f1ec13
LP
1696 if (argc > optind) {
1697 strv_free(arg_parameters);
1698 arg_parameters = strv_copy(argv + optind);
1699 if (!arg_parameters)
1700 return log_oom();
d7bea6b6 1701
60f1ec13
LP
1702 arg_settings_mask |= SETTING_START_MODE;
1703 }
1704
1705 if (arg_ephemeral && arg_template && !arg_directory)
1706 /* User asked for ephemeral execution but specified --template= instead of --directory=. Semantically
1707 * such an invocation makes some sense, see https://github.com/systemd/systemd/issues/3667. Let's
1708 * accept this here, and silently make "--ephemeral --template=" equivalent to "--ephemeral
1709 * --directory=". */
1710 arg_directory = TAKE_PTR(arg_template);
1711
2642d22a
DDM
1712 arg_caps_retain |= plus;
1713 arg_caps_retain |= arg_private_network ? UINT64_C(1) << CAP_NET_ADMIN : 0;
1714
1715 /* If we're not unsharing the network namespace and are unsharing the user namespace, we won't have
1716 * permissions to bind ports in the container, so let's drop the CAP_NET_BIND_SERVICE capability to
1717 * indicate that. */
1718 if (!arg_private_network && arg_userns_mode != USER_NAMESPACE_NO && arg_uid_shift > 0)
1719 arg_caps_retain &= ~(UINT64_C(1) << CAP_NET_BIND_SERVICE);
1720
1721 arg_caps_retain &= ~minus;
60f1ec13 1722
de40a303 1723 /* Make sure to parse environment before we reset the settings mask below */
49048684
ZJS
1724 r = parse_environment();
1725 if (r < 0)
1726 return r;
de40a303 1727
60f1ec13
LP
1728 /* Load all settings from .nspawn files */
1729 if (mask_no_settings)
1730 arg_settings_mask = 0;
1731
1732 /* Don't load any settings from .nspawn files */
1733 if (mask_all_settings)
1734 arg_settings_mask = _SETTINGS_MASK_ALL;
1735
1736 return 1;
1737}
1738
1739static int verify_arguments(void) {
1740 int r;
a6b5216c 1741
75b0d8b8
ZJS
1742 if (arg_start_mode == START_PID2 && arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
1743 /* If we are running the stub init in the container, we don't need to look at what the init
1744 * in the container supports, because we are not using it. Let's immediately pick the right
1745 * setting based on the host system configuration.
1746 *
1747 * We only do this, if the user didn't use an environment variable to override the detection.
1748 */
1749
1750 r = cg_all_unified();
1751 if (r < 0)
1752 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
1753 if (r > 0)
1754 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
1755 else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0)
1756 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
1757 else
1758 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
1759 }
1760
4f086aab
SU
1761 if (arg_userns_mode != USER_NAMESPACE_NO)
1762 arg_mount_settings |= MOUNT_USE_USERNS;
1763
1764 if (arg_private_network)
1765 arg_mount_settings |= MOUNT_APPLY_APIVFS_NETNS;
1766
48a8d337
LB
1767 if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
1768 !(arg_clone_ns_flags & CLONE_NEWUTS)) {
eb91eb18 1769 arg_register = false;
baaa35ad 1770 if (arg_start_mode != START_PID1)
60f1ec13 1771 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--boot cannot be used without namespacing.");
0c582db0 1772 }
eb91eb18 1773
6c045a99
LP
1774 if (arg_userns_ownership < 0)
1775 arg_userns_ownership =
f61c7f88 1776 arg_userns_mode == USER_NAMESPACE_PICK ? USER_NAMESPACE_OWNERSHIP_AUTO :
6c045a99 1777 USER_NAMESPACE_OWNERSHIP_OFF;
0e7ac751 1778
60f1ec13
LP
1779 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
1780 arg_kill_signal = SIGRTMIN+3;
1781
e5a4bb0d
LP
1782 if (arg_volatile_mode != VOLATILE_NO) /* Make sure all file systems contained in the image are mounted read-only if we are in volatile mode */
1783 arg_read_only = true;
1784
2436ea76
DDM
1785 if (has_custom_root_mount(arg_custom_mounts, arg_n_custom_mounts))
1786 arg_read_only = true;
1787
baaa35ad 1788 if (arg_keep_unit && arg_register && cg_pid_get_owner_uid(0, NULL) >= 0)
8d9c2bca
AJ
1789 /* Save the user from accidentally registering either user-$SESSION.scope or user@.service.
1790 * The latter is not technically a user session, but we don't need to labour the point. */
60f1ec13 1791 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--keep-unit --register=yes may not be used when invoked from a user session.");
89f7c846 1792
baaa35ad 1793 if (arg_directory && arg_image)
60f1ec13 1794 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--directory= and --image= may not be combined.");
1b9e5b12 1795
baaa35ad 1796 if (arg_template && arg_image)
60f1ec13 1797 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= and --image= may not be combined.");
8cd328d8 1798
baaa35ad 1799 if (arg_template && !(arg_directory || arg_machine))
60f1ec13 1800 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= needs --directory= or --machine=.");
ec16945e 1801
baaa35ad 1802 if (arg_ephemeral && arg_template)
60f1ec13 1803 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --template= may not be combined.");
ec16945e 1804
baaa35ad 1805 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO))
60f1ec13 1806 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --link-journal= may not be combined.");
df9a75e4 1807
baaa35ad 1808 if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported())
60f1ec13 1809 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--private-users= is not supported, kernel compiled without user namespace support.");
7336138e 1810
6c045a99 1811 if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_CHOWN && arg_read_only)
de40a303 1812 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
6c045a99 1813 "--read-only and --private-users-ownership=chown may not be combined.");
f757855e 1814
6c045a99
LP
1815 /* We don't support --private-users-ownership=chown together with any of the volatile modes since we
1816 * couldn't change the read-only part of the tree (i.e. /usr) anyway, or because it would trigger a
1817 * massive copy-up (in case of overlay) making the entire exercise pointless. */
1818 if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_CHOWN && arg_volatile_mode != VOLATILE_NO)
1819 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--volatile= and --private-users-ownership=chown may not be combined.");
e5a4bb0d 1820
679ecd36
SZ
1821 /* If --network-namespace-path is given with any other network-related option (except --private-network),
1822 * we need to error out, to avoid conflicts between different network options. */
60f1ec13
LP
1823 if (arg_network_namespace_path &&
1824 (arg_network_interfaces || arg_network_macvlan ||
1825 arg_network_ipvlan || arg_network_veth_extra ||
1826 arg_network_bridge || arg_network_zone ||
679ecd36 1827 arg_network_veth))
de40a303 1828 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--network-namespace-path= cannot be combined with other network options.");
86c0dd4a 1829
60f1ec13 1830 if (arg_network_bridge && arg_network_zone)
de40a303
LP
1831 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1832 "--network-bridge= and --network-zone= may not be combined.");
f757855e 1833
baaa35ad 1834 if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network)
60f1ec13 1835 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
4f086aab 1836
baaa35ad 1837 if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO))
60f1ec13 1838 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot combine --private-users with read-write mounts.");
f757855e 1839
baaa35ad 1840 if (arg_expose_ports && !arg_private_network)
60f1ec13 1841 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot use --port= without private networking.");
6d0b55c2 1842
88fc9c9b 1843 if (arg_caps_ambient) {
f5fbe71d 1844 if (arg_caps_ambient == UINT64_MAX)
88fc9c9b
TH
1845 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= does not support the value all.");
1846
1847 if ((arg_caps_ambient & arg_caps_retain) != arg_caps_ambient)
1848 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= setting is not fully covered by Capability= setting.");
1849
1850 if (arg_start_mode == START_BOOT)
1851 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= setting is not useful for boot mode.");
1852 }
1853
2f893044
LP
1854 if (arg_userns_mode == USER_NAMESPACE_NO && !strv_isempty(arg_bind_user))
1855 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--bind-user= requires --private-users");
1856
1857 /* Drop duplicate --bind-user= entries */
1858 strv_uniq(arg_bind_user);
1859
60f1ec13
LP
1860 r = custom_mount_check_all();
1861 if (r < 0)
1862 return r;
c6c8f6e2 1863
f757855e 1864 return 0;
88213476
LP
1865}
1866
2f091b1b
TM
1867static int verify_network_interfaces_initialized(void) {
1868 int r;
1869 r = test_network_interfaces_initialized(arg_network_interfaces);
1870 if (r < 0)
1871 return r;
1872
1873 r = test_network_interfaces_initialized(arg_network_macvlan);
1874 if (r < 0)
1875 return r;
1876
1877 r = test_network_interfaces_initialized(arg_network_ipvlan);
1878 if (r < 0)
1879 return r;
1880
1881 return 0;
1882}
1883
91181e07 1884int userns_lchown(const char *p, uid_t uid, gid_t gid) {
03cfe0d5
LP
1885 assert(p);
1886
0de7acce 1887 if (arg_userns_mode == USER_NAMESPACE_NO)
03cfe0d5
LP
1888 return 0;
1889
1890 if (uid == UID_INVALID && gid == GID_INVALID)
1891 return 0;
1892
1893 if (uid != UID_INVALID) {
1894 uid += arg_uid_shift;
1895
1896 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1897 return -EOVERFLOW;
1898 }
1899
1900 if (gid != GID_INVALID) {
1901 gid += (gid_t) arg_uid_shift;
1902
1903 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1904 return -EOVERFLOW;
1905 }
1906
7c248223 1907 return RET_NERRNO(lchown(p, uid, gid));
b12afc8c
LP
1908}
1909
/* Creates the directory 'path' below 'root' with the given mode, then hands ownership to the given
 * container UID/GID via userns_lchown(). An already existing directory is left untouched (including
 * its ownership) and reported as success. Returns 0 on success, a negative errno-style value
 * otherwise. */
int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) < 0)
                return errno == EEXIST ? 0 : -errno;

        return userns_lchown(full, uid, gid);
}
1923
/* Checks 'path' against the relative and absolute zoneinfo directory prefixes and returns whatever
 * PATH_STARTSWITH_SET() yields for them. Callers in this file treat a NULL result as "does not point
 * into /usr/share/zoneinfo/" and use a non-NULL result as the timezone name. */
static const char *timezone_from_path(const char *path) {
        const char *zone;

        zone = PATH_STARTSWITH_SET(
                        path,
                        "../usr/share/zoneinfo/",
                        "/usr/share/zoneinfo/");

        return zone;
}
1930
83205269
LP
1931static bool etc_writable(void) {
1932 return !arg_read_only || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_OVERLAY);
1933}
1934
e58a1277 1935static int setup_timezone(const char *dest) {
1688841f
LP
1936 _cleanup_free_ char *p = NULL, *etc = NULL;
1937 const char *where, *check;
1938 TimezoneMode m;
d4036145 1939 int r;
f8440af5 1940
e58a1277
LP
1941 assert(dest);
1942
1688841f 1943 if (IN_SET(arg_timezone, TIMEZONE_AUTO, TIMEZONE_SYMLINK)) {
1688841f
LP
1944 r = readlink_malloc("/etc/localtime", &p);
1945 if (r == -ENOENT && arg_timezone == TIMEZONE_AUTO)
83205269 1946 m = etc_writable() ? TIMEZONE_DELETE : TIMEZONE_OFF;
1688841f 1947 else if (r == -EINVAL && arg_timezone == TIMEZONE_AUTO) /* regular file? */
83205269 1948 m = etc_writable() ? TIMEZONE_COPY : TIMEZONE_BIND;
1688841f
LP
1949 else if (r < 0) {
1950 log_warning_errno(r, "Failed to read host's /etc/localtime symlink, not updating container timezone: %m");
1951 /* To handle warning, delete /etc/localtime and replace it with a symbolic link to a time zone data
1952 * file.
1953 *
1954 * Example:
1955 * ln -s /usr/share/zoneinfo/UTC /etc/localtime
1956 */
1957 return 0;
1958 } else if (arg_timezone == TIMEZONE_AUTO)
83205269 1959 m = etc_writable() ? TIMEZONE_SYMLINK : TIMEZONE_BIND;
1688841f
LP
1960 else
1961 m = arg_timezone;
1962 } else
1963 m = arg_timezone;
1964
1965 if (m == TIMEZONE_OFF)
1966 return 0;
1967
f461a28d 1968 r = chase("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
d4036145 1969 if (r < 0) {
1688841f 1970 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
d4036145
LP
1971 return 0;
1972 }
1973
1688841f
LP
1974 where = strjoina(etc, "/localtime");
1975
1976 switch (m) {
1977
1978 case TIMEZONE_DELETE:
1979 if (unlink(where) < 0)
1980 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
1981
d4036145 1982 return 0;
d4036145 1983
1688841f
LP
1984 case TIMEZONE_SYMLINK: {
1985 _cleanup_free_ char *q = NULL;
1986 const char *z, *what;
4d1c38b8 1987
1688841f
LP
1988 z = timezone_from_path(p);
1989 if (!z) {
1990 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
d4036145 1991 return 0;
1688841f 1992 }
d4036145 1993
1688841f
LP
1994 r = readlink_malloc(where, &q);
1995 if (r >= 0 && streq_ptr(timezone_from_path(q), z))
1996 return 0; /* Already pointing to the right place? Then do nothing .. */
1997
1998 check = strjoina(dest, "/usr/share/zoneinfo/", z);
f461a28d 1999 r = chase(check, dest, 0, NULL, NULL);
1688841f
LP
2000 if (r < 0)
2001 log_debug_errno(r, "Timezone %s does not exist (or is not accessible) in container, not creating symlink: %m", z);
2002 else {
2003 if (unlink(where) < 0 && errno != ENOENT) {
2004 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING, /* Don't complain on read-only images */
2005 errno, "Failed to remove existing timezone info %s in container, ignoring: %m", where);
2006 return 0;
2007 }
2008
2009 what = strjoina("../usr/share/zoneinfo/", z);
2010 if (symlink(what, where) < 0) {
2011 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING,
2012 errno, "Failed to correct timezone of container, ignoring: %m");
2013 return 0;
2014 }
2015
2016 break;
2017 }
2018
2019 _fallthrough_;
d4036145 2020 }
68fb0892 2021
1688841f
LP
2022 case TIMEZONE_BIND: {
2023 _cleanup_free_ char *resolved = NULL;
2024 int found;
2025
f461a28d 2026 found = chase(where, dest, CHASE_NONEXISTENT, &resolved, NULL);
1688841f
LP
2027 if (found < 0) {
2028 log_warning_errno(found, "Failed to resolve /etc/localtime path in container, ignoring: %m");
2029 return 0;
2030 }
2031
2032 if (found == 0) /* missing? */
2033 (void) touch(resolved);
2034
511a8cfe 2035 r = mount_nofollow_verbose(LOG_WARNING, "/etc/localtime", resolved, NULL, MS_BIND, NULL);
1688841f 2036 if (r >= 0)
511a8cfe 2037 return mount_nofollow_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
1688841f
LP
2038
2039 _fallthrough_;
79d80fc1 2040 }
4d9f07b4 2041
1688841f
LP
2042 case TIMEZONE_COPY:
2043 /* If mounting failed, try to copy */
7c2f5495 2044 r = copy_file_atomic("/etc/localtime", where, 0644, COPY_REFLINK|COPY_REPLACE);
1688841f
LP
2045 if (r < 0) {
2046 log_full_errno(IN_SET(r, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
2047 "Failed to copy /etc/localtime to %s, ignoring: %m", where);
2048 return 0;
2049 }
2050
2051 break;
2052
2053 default:
04499a70 2054 assert_not_reached();
d4036145 2055 }
e58a1277 2056
1688841f 2057 /* Fix permissions of the symlink or file copy we just created */
03cfe0d5
LP
2058 r = userns_lchown(where, 0, 0);
2059 if (r < 0)
1688841f 2060 log_warning_errno(r, "Failed to chown /etc/localtime, ignoring: %m");
03cfe0d5 2061
e58a1277 2062 return 0;
88213476
LP
2063}
2064
09d423e9
LP
/* Tests whether 'path' exists. Returns 1 if it does, 0 if access() reports ENOENT, and a negative errno
 * (logged at debug level) on any other failure. */
static int have_resolv_conf(const char *path) {
        assert(path);

        if (access(path, F_OK) >= 0)
                return 1;

        if (errno == ENOENT)
                return 0;

        return log_debug_errno(errno, "Failed to determine whether '%s' is available: %m", path);
}
2077
7357272e 2078static int resolved_listening(void) {
b8ea7a6e 2079 _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
b053cd5f 2080 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
7357272e 2081 _cleanup_free_ char *dns_stub_listener_mode = NULL;
b053cd5f
LP
2082 int r;
2083
7357272e 2084 /* Check if resolved is listening */
b053cd5f
LP
2085
2086 r = sd_bus_open_system(&bus);
2087 if (r < 0)
b8ea7a6e 2088 return log_debug_errno(r, "Failed to open system bus: %m");
b053cd5f 2089
7357272e 2090 r = bus_name_has_owner(bus, "org.freedesktop.resolve1", NULL);
b8ea7a6e
LP
2091 if (r < 0)
2092 return log_debug_errno(r, "Failed to check whether the 'org.freedesktop.resolve1' bus name is taken: %m");
2093 if (r == 0)
2094 return 0;
7357272e 2095
7f8a85e6 2096 r = bus_get_property_string(bus, bus_resolve_mgr, "DNSStubListener", &error, &dns_stub_listener_mode);
7357272e 2097 if (r < 0)
b8ea7a6e 2098 return log_debug_errno(r, "Failed to query DNSStubListener property: %s", bus_error_message(&error, r));
7357272e
DM
2099
2100 return STR_IN_SET(dns_stub_listener_mode, "udp", "yes");
b053cd5f
LP
2101}
2102
2547bb41 2103static int setup_resolv_conf(const char *dest) {
09d423e9
LP
2104 _cleanup_free_ char *etc = NULL;
2105 const char *where, *what;
2106 ResolvConfMode m;
2107 int r;
2547bb41
LP
2108
2109 assert(dest);
2110
09d423e9
LP
2111 if (arg_resolv_conf == RESOLV_CONF_AUTO) {
2112 if (arg_private_network)
2113 m = RESOLV_CONF_OFF;
86775e35
LP
2114 else if (have_resolv_conf(PRIVATE_STUB_RESOLV_CONF) > 0 && resolved_listening() > 0)
2115 m = etc_writable() ? RESOLV_CONF_COPY_STUB : RESOLV_CONF_BIND_STUB;
09d423e9 2116 else if (have_resolv_conf("/etc/resolv.conf") > 0)
83205269 2117 m = etc_writable() ? RESOLV_CONF_COPY_HOST : RESOLV_CONF_BIND_HOST;
09d423e9 2118 else
83205269 2119 m = etc_writable() ? RESOLV_CONF_DELETE : RESOLV_CONF_OFF;
86775e35 2120
09d423e9
LP
2121 } else
2122 m = arg_resolv_conf;
2123
2124 if (m == RESOLV_CONF_OFF)
2547bb41
LP
2125 return 0;
2126
f461a28d 2127 r = chase("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
87447ae4
LP
2128 if (r < 0) {
2129 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
2130 return 0;
2131 }
2132
2133 where = strjoina(etc, "/resolv.conf");
09d423e9
LP
2134
2135 if (m == RESOLV_CONF_DELETE) {
2136 if (unlink(where) < 0)
2137 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
2138
87447ae4
LP
2139 return 0;
2140 }
79d80fc1 2141
86775e35
LP
2142 if (IN_SET(m, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_REPLACE_STATIC, RESOLV_CONF_COPY_STATIC))
2143 what = PRIVATE_STATIC_RESOLV_CONF;
2144 else if (IN_SET(m, RESOLV_CONF_BIND_UPLINK, RESOLV_CONF_REPLACE_UPLINK, RESOLV_CONF_COPY_UPLINK))
2145 what = PRIVATE_UPLINK_RESOLV_CONF;
2146 else if (IN_SET(m, RESOLV_CONF_BIND_STUB, RESOLV_CONF_REPLACE_STUB, RESOLV_CONF_COPY_STUB))
2147 what = PRIVATE_STUB_RESOLV_CONF;
09d423e9
LP
2148 else
2149 what = "/etc/resolv.conf";
87447ae4 2150
86775e35 2151 if (IN_SET(m, RESOLV_CONF_BIND_HOST, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_BIND_UPLINK, RESOLV_CONF_BIND_STUB)) {
09d423e9
LP
2152 _cleanup_free_ char *resolved = NULL;
2153 int found;
2154
d404c8d8 2155 found = chase(where, dest, CHASE_NONEXISTENT|CHASE_NOFOLLOW, &resolved, NULL);
09d423e9
LP
2156 if (found < 0) {
2157 log_warning_errno(found, "Failed to resolve /etc/resolv.conf path in container, ignoring: %m");
2158 return 0;
2159 }
3539724c 2160
87447ae4
LP
2161 if (found == 0) /* missing? */
2162 (void) touch(resolved);
5367354d 2163
511a8cfe 2164 r = mount_nofollow_verbose(LOG_WARNING, what, resolved, NULL, MS_BIND, NULL);
60e76d48 2165 if (r >= 0)
511a8cfe 2166 return mount_nofollow_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
86775e35
LP
2167
2168 /* If that didn't work, let's copy the file */
3539724c
LP
2169 }
2170
86775e35 2171 if (IN_SET(m, RESOLV_CONF_REPLACE_HOST, RESOLV_CONF_REPLACE_STATIC, RESOLV_CONF_REPLACE_UPLINK, RESOLV_CONF_REPLACE_STUB))
7c2f5495 2172 r = copy_file_atomic(what, where, 0644, COPY_REFLINK|COPY_REPLACE);
86775e35 2173 else
7c2f5495 2174 r = copy_file(what, where, O_TRUNC|O_NOFOLLOW, 0644, COPY_REFLINK);
79d80fc1 2175 if (r < 0) {
3539724c
LP
2176 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
2177 * resolved or something similar runs inside and the symlink points there.
68a313c5 2178 *
3539724c 2179 * If the disk image is read-only, there's also no point in complaining.
68a313c5 2180 */
86775e35
LP
2181 log_full_errno(!IN_SET(RESOLV_CONF_COPY_HOST, RESOLV_CONF_COPY_STATIC, RESOLV_CONF_COPY_UPLINK, RESOLV_CONF_COPY_STUB) &&
2182 IN_SET(r, -ELOOP, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
3539724c 2183 "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where);
79d80fc1
TG
2184 return 0;
2185 }
2547bb41 2186
03cfe0d5
LP
2187 r = userns_lchown(where, 0, 0);
2188 if (r < 0)
3539724c 2189 log_warning_errno(r, "Failed to chown /etc/resolv.conf, ignoring: %m");
03cfe0d5 2190
2547bb41
LP
2191 return 0;
2192}
2193
1e4f1671 2194static int setup_boot_id(void) {
cdde6ba6
LP
2195 _cleanup_(unlink_and_freep) char *from = NULL;
2196 _cleanup_free_ char *path = NULL;
3bbaff3e 2197 sd_id128_t rnd = SD_ID128_NULL;
cdde6ba6 2198 const char *to;
04bc4a3f
LP
2199 int r;
2200
1eacc470 2201 /* Generate a new randomized boot ID, so that each boot-up of the container gets a new one */
04bc4a3f 2202
1eacc470 2203 r = tempfn_random_child("/run", "proc-sys-kernel-random-boot-id", &path);
cdde6ba6
LP
2204 if (r < 0)
2205 return log_error_errno(r, "Failed to generate random boot ID path: %m");
04bc4a3f
LP
2206
2207 r = sd_id128_randomize(&rnd);
f647962d
MS
2208 if (r < 0)
2209 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 2210
b40c8ebd 2211 r = id128_write(path, ID128_FORMAT_UUID, rnd);
f647962d
MS
2212 if (r < 0)
2213 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f 2214
cdde6ba6
LP
2215 from = TAKE_PTR(path);
2216 to = "/proc/sys/kernel/random/boot_id";
2217
511a8cfe 2218 r = mount_nofollow_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
cdde6ba6
LP
2219 if (r < 0)
2220 return r;
04bc4a3f 2221
511a8cfe 2222 return mount_nofollow_verbose(LOG_ERR, NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
04bc4a3f
LP
2223}
2224
e58a1277 2225static int copy_devnodes(const char *dest) {
88213476
LP
2226 static const char devnodes[] =
2227 "null\0"
2228 "zero\0"
2229 "full\0"
2230 "random\0"
2231 "urandom\0"
85614d66
TG
2232 "tty\0"
2233 "net/tun\0";
88213476 2234
e58a1277 2235 int r = 0;
a258bf26
LP
2236
2237 assert(dest);
124640f1 2238
52f05ef2 2239 BLOCK_WITH_UMASK(0000);
88213476 2240
03cfe0d5
LP
2241 /* Create /dev/net, so that we can create /dev/net/tun in it */
2242 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
2243 return log_error_errno(r, "Failed to create /dev/net directory: %m");
2244
88213476 2245 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 2246 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 2247 struct stat st;
88213476 2248
c6134d3e 2249 from = path_join("/dev/", d);
8967f291
LP
2250 if (!from)
2251 return log_oom();
2252
c6134d3e 2253 to = path_join(dest, from);
8967f291
LP
2254 if (!to)
2255 return log_oom();
88213476
LP
2256
2257 if (stat(from, &st) < 0) {
2258
4a62c710
MS
2259 if (errno != ENOENT)
2260 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 2261
baaa35ad
ZJS
2262 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode))
2263 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2264 "%s is not a char or block device, cannot copy.", from);
2265 else {
8dfce114
LP
2266 _cleanup_free_ char *sl = NULL, *prefixed = NULL, *dn = NULL, *t = NULL;
2267
81f5049b 2268 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
8dbf71ec 2269 /* Explicitly warn the user when /dev is already populated. */
41eb4362 2270 if (errno == EEXIST)
8dbf71ec 2271 log_notice("%s/dev is pre-mounted and pre-populated. If a pre-mounted /dev is provided it needs to be an unpopulated file system.", dest);
81f5049b
AC
2272 if (errno != EPERM)
2273 return log_error_errno(errno, "mknod(%s) failed: %m", to);
2274
8dfce114 2275 /* Some systems abusively restrict mknod but allow bind mounts. */
81f5049b
AC
2276 r = touch(to);
2277 if (r < 0)
2278 return log_error_errno(r, "touch (%s) failed: %m", to);
511a8cfe 2279 r = mount_nofollow_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL);
60e76d48
ZJS
2280 if (r < 0)
2281 return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to);
81f5049b 2282 }
6278cf60 2283
03cfe0d5
LP
2284 r = userns_lchown(to, 0, 0);
2285 if (r < 0)
2286 return log_error_errno(r, "chown() of device node %s failed: %m", to);
8dfce114 2287
657ee2d8 2288 dn = path_join("/dev", S_ISCHR(st.st_mode) ? "char" : "block");
8dfce114
LP
2289 if (!dn)
2290 return log_oom();
2291
2292 r = userns_mkdir(dest, dn, 0755, 0, 0);
2293 if (r < 0)
2294 return log_error_errno(r, "Failed to create '%s': %m", dn);
2295
2296 if (asprintf(&sl, "%s/%u:%u", dn, major(st.st_rdev), minor(st.st_rdev)) < 0)
2297 return log_oom();
2298
c6134d3e 2299 prefixed = path_join(dest, sl);
8dfce114
LP
2300 if (!prefixed)
2301 return log_oom();
2302
2d9b74ba 2303 t = path_join("..", d);
8dfce114
LP
2304 if (!t)
2305 return log_oom();
2306
2307 if (symlink(t, prefixed) < 0)
2308 log_debug_errno(errno, "Failed to symlink '%s' to '%s': %m", t, prefixed);
88213476 2309 }
88213476
LP
2310 }
2311
e58a1277
LP
2312 return r;
2313}
88213476 2314
de40a303 2315static int make_extra_nodes(const char *dest) {
de40a303
LP
2316 size_t i;
2317 int r;
2318
52f05ef2 2319 BLOCK_WITH_UMASK(0000);
de40a303
LP
2320
2321 for (i = 0; i < arg_n_extra_nodes; i++) {
2322 _cleanup_free_ char *path = NULL;
2323 DeviceNode *n = arg_extra_nodes + i;
2324
c6134d3e 2325 path = path_join(dest, n->path);
de40a303
LP
2326 if (!path)
2327 return log_oom();
2328
2329 if (mknod(path, n->mode, S_ISCHR(n->mode) || S_ISBLK(n->mode) ? makedev(n->major, n->minor) : 0) < 0)
2330 return log_error_errno(errno, "Failed to create device node '%s': %m", path);
2331
2332 r = chmod_and_chown(path, n->mode, n->uid, n->gid);
2333 if (r < 0)
2334 return log_error_errno(r, "Failed to adjust device node ownership of '%s': %m", path);
2335 }
2336
2337 return 0;
2338}
2339
03cfe0d5
LP
2340static int setup_pts(const char *dest) {
2341 _cleanup_free_ char *options = NULL;
2342 const char *p;
709f6e46 2343 int r;
03cfe0d5 2344
349cc4a5 2345#if HAVE_SELINUX
03cfe0d5
LP
2346 if (arg_selinux_apifs_context)
2347 (void) asprintf(&options,
3dce8915 2348 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
03cfe0d5
LP
2349 arg_uid_shift + TTY_GID,
2350 arg_selinux_apifs_context);
2351 else
2352#endif
2353 (void) asprintf(&options,
3dce8915 2354 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
03cfe0d5 2355 arg_uid_shift + TTY_GID);
f2d88580 2356
03cfe0d5 2357 if (!options)
f2d88580
LP
2358 return log_oom();
2359
03cfe0d5 2360 /* Mount /dev/pts itself */
cc9fce65 2361 p = prefix_roota(dest, "/dev/pts");
3f692e2e 2362 r = RET_NERRNO(mkdir(p, 0755));
dae8b82e
ZJS
2363 if (r < 0)
2364 return log_error_errno(r, "Failed to create /dev/pts: %m");
2365
511a8cfe 2366 r = mount_nofollow_verbose(LOG_ERR, "devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options);
60e76d48
ZJS
2367 if (r < 0)
2368 return r;
709f6e46
MS
2369 r = userns_lchown(p, 0, 0);
2370 if (r < 0)
2371 return log_error_errno(r, "Failed to chown /dev/pts: %m");
03cfe0d5
LP
2372
2373 /* Create /dev/ptmx symlink */
2374 p = prefix_roota(dest, "/dev/ptmx");
4a62c710
MS
2375 if (symlink("pts/ptmx", p) < 0)
2376 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
709f6e46
MS
2377 r = userns_lchown(p, 0, 0);
2378 if (r < 0)
2379 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
f2d88580 2380
03cfe0d5
LP
2381 /* And fix /dev/pts/ptmx ownership */
2382 p = prefix_roota(dest, "/dev/pts/ptmx");
709f6e46
MS
2383 r = userns_lchown(p, 0, 0);
2384 if (r < 0)
2385 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
6278cf60 2386
f2d88580
LP
2387 return 0;
2388}
2389
3acc84eb 2390static int setup_stdio_as_dev_console(void) {
5bb1d7fb 2391 _cleanup_close_ int terminal = -EBADF;
e58a1277 2392 int r;
e58a1277 2393
335d2ead
LP
2394 /* We open the TTY in O_NOCTTY mode, so that we do not become controller yet. We'll do that later
2395 * explicitly, if we are configured to. */
2396 terminal = open_terminal("/dev/console", O_RDWR|O_NOCTTY);
3acc84eb
FB
2397 if (terminal < 0)
2398 return log_error_errno(terminal, "Failed to open console: %m");
e58a1277 2399
3acc84eb
FB
2400 /* Make sure we can continue logging to the original stderr, even if
2401 * stderr points elsewhere now */
2402 r = log_dup_console();
2403 if (r < 0)
2404 return log_error_errno(r, "Failed to duplicate stderr: %m");
de40a303 2405
3acc84eb
FB
2406 /* invalidates 'terminal' on success and failure */
2407 r = rearrange_stdio(terminal, terminal, terminal);
2fef50cd 2408 TAKE_FD(terminal);
f647962d 2409 if (r < 0)
3acc84eb
FB
2410 return log_error_errno(r, "Failed to move console to stdin/stdout/stderr: %m");
2411
2412 return 0;
2413}
88213476 2414
3acc84eb
FB
2415static int setup_dev_console(const char *console) {
2416 _cleanup_free_ char *p = NULL;
2417 int r;
a258bf26 2418
3acc84eb
FB
2419 /* Create /dev/console symlink */
2420 r = path_make_relative("/dev", console, &p);
81f5049b 2421 if (r < 0)
3acc84eb
FB
2422 return log_error_errno(r, "Failed to create relative path: %m");
2423
2424 if (symlink(p, "/dev/console") < 0)
2425 return log_error_errno(errno, "Failed to create /dev/console symlink: %m");
a258bf26 2426
3acc84eb 2427 return 0;
e58a1277
LP
2428}
2429
8e5430c4
LP
2430static int setup_keyring(void) {
2431 key_serial_t keyring;
2432
6b000af4
LP
2433 /* Allocate a new session keyring for the container. This makes sure the keyring of the session
2434 * systemd-nspawn was invoked from doesn't leak into the container. Note that by default we block
2435 * keyctl() and request_key() anyway via seccomp so doing this operation isn't strictly necessary,
2436 * but in case people explicitly allow-list these system calls let's make sure we don't leak anything
2437 * into the container. */
8e5430c4
LP
2438
2439 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2440 if (keyring == -1) {
2441 if (errno == ENOSYS)
2442 log_debug_errno(errno, "Kernel keyring not supported, ignoring.");
065b4774 2443 else if (ERRNO_IS_PRIVILEGE(errno))
8e5430c4
LP
2444 log_debug_errno(errno, "Kernel keyring access prohibited, ignoring.");
2445 else
2446 return log_error_errno(errno, "Setting up kernel keyring failed: %m");
2447 }
2448
2449 return 0;
2450}
2451
3652872a
LP
2452static int setup_credentials(const char *root) {
2453 const char *q;
2454 int r;
2455
2456 if (arg_n_credentials <= 0)
2457 return 0;
2458
2459 r = userns_mkdir(root, "/run/host", 0755, 0, 0);
2460 if (r < 0)
2461 return log_error_errno(r, "Failed to create /run/host: %m");
2462
2463 r = userns_mkdir(root, "/run/host/credentials", 0700, 0, 0);
2464 if (r < 0)
2465 return log_error_errno(r, "Failed to create /run/host/credentials: %m");
2466
2467 q = prefix_roota(root, "/run/host/credentials");
511a8cfe 2468 r = mount_nofollow_verbose(LOG_ERR, NULL, q, "ramfs", MS_NOSUID|MS_NOEXEC|MS_NODEV, "mode=0700");
3652872a
LP
2469 if (r < 0)
2470 return r;
2471
2472 for (size_t i = 0; i < arg_n_credentials; i++) {
2473 _cleanup_free_ char *j = NULL;
254d1313 2474 _cleanup_close_ int fd = -EBADF;
3652872a
LP
2475
2476 j = path_join(q, arg_credentials[i].id);
2477 if (!j)
2478 return log_oom();
2479
2480 fd = open(j, O_CREAT|O_EXCL|O_WRONLY|O_CLOEXEC|O_NOFOLLOW, 0600);
2481 if (fd < 0)
2482 return log_error_errno(errno, "Failed to create credential file %s: %m", j);
2483
2484 r = loop_write(fd, arg_credentials[i].data, arg_credentials[i].size, /* do_poll= */ false);
2485 if (r < 0)
2486 return log_error_errno(r, "Failed to write credential to file %s: %m", j);
2487
2488 if (fchmod(fd, 0400) < 0)
2489 return log_error_errno(errno, "Failed to adjust access mode of %s: %m", j);
2490
2491 if (arg_userns_mode != USER_NAMESPACE_NO) {
2492 if (fchown(fd, arg_uid_shift, arg_uid_shift) < 0)
2493 return log_error_errno(errno, "Failed to adjust ownership of %s: %m", j);
2494 }
2495 }
2496
2497 if (chmod(q, 0500) < 0)
2498 return log_error_errno(errno, "Failed to adjust access mode of %s: %m", q);
2499
2500 r = userns_lchown(q, 0, 0);
2501 if (r < 0)
2502 return r;
2503
2504 /* Make both mount and superblock read-only now */
511a8cfe 2505 r = mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_REMOUNT|MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
3652872a
LP
2506 if (r < 0)
2507 return r;
2508
511a8cfe 2509 return mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, "mode=0500");
3652872a
LP
2510}
2511
5d9d3fcb 2512static int setup_kmsg(int fd_inner_socket) {
9ec5a93c
LP
2513 _cleanup_(unlink_and_freep) char *from = NULL;
2514 _cleanup_free_ char *fifo = NULL;
254d1313 2515 _cleanup_close_ int fd = -EBADF;
9ec5a93c 2516 int r;
e58a1277 2517
5d9d3fcb 2518 assert(fd_inner_socket >= 0);
a258bf26 2519
52f05ef2 2520 BLOCK_WITH_UMASK(0000);
a258bf26 2521
30fd9a2d 2522 /* We create the kmsg FIFO as a temporary file in /run, but immediately delete it after bind mounting it to
9ec5a93c
LP
2523 * /proc/kmsg. While FIFOs on the reading side behave very similar to /proc/kmsg, their writing side behaves
2524 * differently from /dev/kmsg in that writing blocks when nothing is reading. In order to avoid any problems
2525 * with containers deadlocking due to this we simply make /dev/kmsg unavailable to the container. */
2526
1eacc470 2527 r = tempfn_random_child("/run", "proc-kmsg", &fifo);
9ec5a93c
LP
2528 if (r < 0)
2529 return log_error_errno(r, "Failed to generate kmsg path: %m");
e58a1277 2530
9ec5a93c 2531 if (mkfifo(fifo, 0600) < 0)
03cfe0d5 2532 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
9ec5a93c
LP
2533
2534 from = TAKE_PTR(fifo);
9ec5a93c 2535
511a8cfe 2536 r = mount_nofollow_verbose(LOG_ERR, from, "/proc/kmsg", NULL, MS_BIND, NULL);
60e76d48
ZJS
2537 if (r < 0)
2538 return r;
e58a1277 2539
669fc4e5 2540 fd = open(from, O_RDWR|O_NONBLOCK|O_CLOEXEC);
4a62c710
MS
2541 if (fd < 0)
2542 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 2543
9ec5a93c 2544 /* Store away the fd in the socket, so that it stays open as long as we run the child */
5d9d3fcb 2545 r = send_one_fd(fd_inner_socket, fd, 0);
d9603714
DH
2546 if (r < 0)
2547 return log_error_errno(r, "Failed to send FIFO fd: %m");
a258bf26 2548
25ea79fe 2549 return 0;
88213476
LP
2550}
2551
761cf19d 2552struct ExposeArgs {
deff68e7
FW
2553 union in_addr_union address4;
2554 union in_addr_union address6;
761cf19d
FW
2555 struct FirewallContext *fw_ctx;
2556};
2557
1c4baffc 2558static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
99534007 2559 struct ExposeArgs *args = ASSERT_PTR(userdata);
6d0b55c2
LP
2560
2561 assert(rtnl);
2562 assert(m);
6d0b55c2 2563
fb9044cb
LP
2564 (void) expose_port_execute(rtnl, &args->fw_ctx, arg_expose_ports, AF_INET, &args->address4);
2565 (void) expose_port_execute(rtnl, &args->fw_ctx, arg_expose_ports, AF_INET6, &args->address6);
6d0b55c2
LP
2566 return 0;
2567}
2568
3a74cea5 2569static int setup_hostname(void) {
c818eef1 2570 int r;
3a74cea5 2571
0c582db0 2572 if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
eb91eb18
LP
2573 return 0;
2574
c818eef1
LP
2575 r = sethostname_idempotent(arg_hostname ?: arg_machine);
2576 if (r < 0)
2577 return log_error_errno(r, "Failed to set hostname: %m");
3a74cea5 2578
7027ff61 2579 return 0;
3a74cea5
LP
2580}
2581
57fb9fb5 2582static int setup_journal(const char *directory) {
0f5e1382 2583 _cleanup_free_ char *d = NULL;
5980d463 2584 const char *p, *q;
b2238e38 2585 sd_id128_t this_id;
8054d749 2586 bool try;
57fb9fb5
LP
2587 int r;
2588
df9a75e4
LP
2589 /* Don't link journals in ephemeral mode */
2590 if (arg_ephemeral)
2591 return 0;
2592
8054d749
LP
2593 if (arg_link_journal == LINK_NO)
2594 return 0;
2595
2596 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
2597
4d680aee 2598 r = sd_id128_get_machine(&this_id);
f647962d
MS
2599 if (r < 0)
2600 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee 2601
e01ff70a 2602 if (sd_id128_equal(arg_uuid, this_id)) {
8054d749 2603 log_full(try ? LOG_WARNING : LOG_ERR,
85b55869 2604 "Host and machine ids are equal (%s): refusing to link journals", SD_ID128_TO_STRING(arg_uuid));
8054d749 2605 if (try)
4d680aee 2606 return 0;
df9a75e4 2607 return -EEXIST;
4d680aee
ZJS
2608 }
2609
369ca6da
ZJS
2610 FOREACH_STRING(dirname, "/var", "/var/log", "/var/log/journal") {
2611 r = userns_mkdir(directory, dirname, 0755, 0, 0);
2612 if (r < 0) {
2613 bool ignore = r == -EROFS && try;
2614 log_full_errno(ignore ? LOG_DEBUG : LOG_ERR, r,
2615 "Failed to create %s%s: %m", dirname, ignore ? ", ignoring" : "");
2616 return ignore ? 0 : r;
2617 }
2618 }
03cfe0d5 2619
85b55869 2620 p = strjoina("/var/log/journal/", SD_ID128_TO_STRING(arg_uuid));
03cfe0d5 2621 q = prefix_roota(directory, p);
27407a01 2622
e1873695 2623 if (path_is_mount_point(p, NULL, 0) > 0) {
8054d749
LP
2624 if (try)
2625 return 0;
27407a01 2626
baaa35ad
ZJS
2627 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2628 "%s: already a mount point, refusing to use for journal", p);
57fb9fb5
LP
2629 }
2630
e1873695 2631 if (path_is_mount_point(q, NULL, 0) > 0) {
8054d749
LP
2632 if (try)
2633 return 0;
57fb9fb5 2634
baaa35ad
ZJS
2635 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2636 "%s: already a mount point, refusing to use for journal", q);
57fb9fb5
LP
2637 }
2638
2639 r = readlink_and_make_absolute(p, &d);
2640 if (r >= 0) {
3742095b 2641 if (IN_SET(arg_link_journal, LINK_GUEST, LINK_AUTO) &&
57fb9fb5
LP
2642 path_equal(d, q)) {
2643
03cfe0d5 2644 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 2645 if (r < 0)
709f6e46 2646 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 2647 return 0;
57fb9fb5
LP
2648 }
2649
4a62c710
MS
2650 if (unlink(p) < 0)
2651 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
2652 } else if (r == -EINVAL) {
2653
2654 if (arg_link_journal == LINK_GUEST &&
2655 rmdir(p) < 0) {
2656
27407a01
ZJS
2657 if (errno == ENOTDIR) {
2658 log_error("%s already exists and is neither a symlink nor a directory", p);
2659 return r;
4314d33f
MS
2660 } else
2661 return log_error_errno(errno, "Failed to remove %s: %m", p);
57fb9fb5 2662 }
4314d33f
MS
2663 } else if (r != -ENOENT)
2664 return log_error_errno(r, "readlink(%s) failed: %m", p);
57fb9fb5
LP
2665
2666 if (arg_link_journal == LINK_GUEST) {
2667
2668 if (symlink(q, p) < 0) {
8054d749 2669 if (try) {
56f64d95 2670 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90 2671 return 0;
4314d33f
MS
2672 } else
2673 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
57fb9fb5
LP
2674 }
2675
03cfe0d5 2676 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 2677 if (r < 0)
709f6e46 2678 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 2679 return 0;
57fb9fb5
LP
2680 }
2681
2682 if (arg_link_journal == LINK_HOST) {
ccddd104 2683 /* don't create parents here — if the host doesn't have
574edc90 2684 * permanent journal set up, don't force it here */
ba8e6c4d 2685
3f692e2e 2686 r = RET_NERRNO(mkdir(p, 0755));
dae8b82e 2687 if (r < 0 && r != -EEXIST) {
8054d749 2688 if (try) {
dae8b82e 2689 log_debug_errno(r, "Failed to create %s, skipping journal setup: %m", p);
574edc90 2690 return 0;
4314d33f 2691 } else
dae8b82e 2692 return log_error_errno(r, "Failed to create %s: %m", p);
57fb9fb5
LP
2693 }
2694
27407a01
ZJS
2695 } else if (access(p, F_OK) < 0)
2696 return 0;
57fb9fb5 2697
db55bbf2 2698 if (dir_is_empty(q, /* ignore_hidden_or_backup= */ false) == 0)
cdb2b9d0
LP
2699 log_warning("%s is not empty, proceeding anyway.", q);
2700
03cfe0d5 2701 r = userns_mkdir(directory, p, 0755, 0, 0);
709f6e46
MS
2702 if (r < 0)
2703 return log_error_errno(r, "Failed to create %s: %m", q);
57fb9fb5 2704
511a8cfe 2705 r = mount_nofollow_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL);
60e76d48 2706 if (r < 0)
4a62c710 2707 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 2708
27407a01 2709 return 0;
57fb9fb5
LP
2710}
2711
de40a303
LP
2712static int drop_capabilities(uid_t uid) {
2713 CapabilityQuintet q;
2714
2715 /* Let's initialize all five capability sets to something valid. If the quintet was configured via
2716 * OCI use that, but fill in missing bits. If it wasn't then derive the quintet in full from
2717 * arg_caps_retain. */
2718
2719 if (capability_quintet_is_set(&arg_full_capabilities)) {
2720 q = arg_full_capabilities;
2721
f5fbe71d 2722 if (q.bounding == UINT64_MAX)
de40a303
LP
2723 q.bounding = uid == 0 ? arg_caps_retain : 0;
2724
f5fbe71d 2725 if (q.effective == UINT64_MAX)
de40a303
LP
2726 q.effective = uid == 0 ? q.bounding : 0;
2727
f5fbe71d 2728 if (q.inheritable == UINT64_MAX)
88fc9c9b 2729 q.inheritable = uid == 0 ? q.bounding : arg_caps_ambient;
de40a303 2730
f5fbe71d 2731 if (q.permitted == UINT64_MAX)
88fc9c9b 2732 q.permitted = uid == 0 ? q.bounding : arg_caps_ambient;
de40a303 2733
f5fbe71d 2734 if (q.ambient == UINT64_MAX && ambient_capabilities_supported())
88fc9c9b 2735 q.ambient = arg_caps_ambient;
f66ad460
AZ
2736
2737 if (capability_quintet_mangle(&q))
2738 return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Cannot set capabilities that are not in the current bounding set.");
2739
2740 } else {
de40a303
LP
2741 q = (CapabilityQuintet) {
2742 .bounding = arg_caps_retain,
2743 .effective = uid == 0 ? arg_caps_retain : 0,
88fc9c9b
TH
2744 .inheritable = uid == 0 ? arg_caps_retain : arg_caps_ambient,
2745 .permitted = uid == 0 ? arg_caps_retain : arg_caps_ambient,
f5fbe71d 2746 .ambient = ambient_capabilities_supported() ? arg_caps_ambient : UINT64_MAX,
de40a303
LP
2747 };
2748
f66ad460
AZ
2749 /* If we're not using OCI, proceed with mangled capabilities (so we don't error out)
2750 * in order to maintain the same behavior as systemd < 242. */
2751 if (capability_quintet_mangle(&q))
0ccdaa79
JT
2752 log_full(arg_quiet ? LOG_DEBUG : LOG_WARNING,
2753 "Some capabilities will not be set because they are not in the current bounding set.");
f66ad460
AZ
2754
2755 }
2756
de40a303 2757 return capability_quintet_enforce(&q);
88213476
LP
2758}
2759
db999e0f
LP
2760static int reset_audit_loginuid(void) {
2761 _cleanup_free_ char *p = NULL;
2762 int r;
2763
0c582db0 2764 if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
db999e0f
LP
2765 return 0;
2766
2767 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 2768 if (r == -ENOENT)
db999e0f 2769 return 0;
f647962d
MS
2770 if (r < 0)
2771 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
2772
2773 /* Already reset? */
2774 if (streq(p, "4294967295"))
2775 return 0;
2776
57512c89 2777 r = write_string_file("/proc/self/loginuid", "4294967295", WRITE_STRING_FILE_DISABLE_BUFFER);
db999e0f 2778 if (r < 0) {
10a87006
LP
2779 log_error_errno(r,
2780 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2781 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2782 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2783 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2784 "using systemd-nspawn. Sleeping for 5s... (%m)");
77b6e194 2785
db999e0f 2786 sleep(5);
77b6e194 2787 }
db999e0f
LP
2788
2789 return 0;
77b6e194
LP
2790}
2791
e79581dd 2792static int mount_tunnel_dig(const char *root) {
785890ac 2793 const char *p, *q;
709f6e46 2794 int r;
785890ac
LP
2795
2796 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2797 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
63c372cb 2798 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
2799 (void) mkdir_p(p, 0600);
2800
5a27b395 2801 r = userns_mkdir(root, "/run/host", 0755, 0, 0);
709f6e46 2802 if (r < 0)
5a27b395 2803 return log_error_errno(r, "Failed to create /run/host: %m");
03cfe0d5 2804
e79581dd 2805 r = userns_mkdir(root, NSPAWN_MOUNT_TUNNEL, 0600, 0, 0);
709f6e46 2806 if (r < 0)
e79581dd 2807 return log_error_errno(r, "Failed to create "NSPAWN_MOUNT_TUNNEL": %m");
03cfe0d5 2808
e79581dd 2809 q = prefix_roota(root, NSPAWN_MOUNT_TUNNEL);
511a8cfe 2810 r = mount_nofollow_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
60e76d48
ZJS
2811 if (r < 0)
2812 return r;
785890ac 2813
511a8cfe 2814 r = mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
60e76d48
ZJS
2815 if (r < 0)
2816 return r;
785890ac 2817
e79581dd
CB
2818 return 0;
2819}
2820
2821static int mount_tunnel_open(void) {
2822 int r;
2823
2824 r = mount_follow_verbose(LOG_ERR, NULL, NSPAWN_MOUNT_TUNNEL, NULL, MS_SLAVE, NULL);
2825 if (r < 0)
2826 return r;
2827
2828 return 0;
785890ac
LP
2829}
2830
317feb4d 2831static int setup_machine_id(const char *directory) {
3bbaff3e 2832 int r;
e01ff70a 2833
317feb4d
LP
2834 /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
2835 * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
2836 * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
2837 * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
2838 * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
2839 * container behaves nicely). */
2840
319477f1 2841 r = id128_get_machine(directory, &arg_uuid);
317feb4d 2842 if (r < 0) {
74e795ee 2843 if (!ERRNO_IS_MACHINE_ID_UNSET(r)) /* If the file is missing, empty, or uninitialized, we don't mind */
317feb4d 2844 return log_error_errno(r, "Failed to read machine ID from container image: %m");
691675ba 2845
317feb4d
LP
2846 if (sd_id128_is_null(arg_uuid)) {
2847 r = sd_id128_randomize(&arg_uuid);
2848 if (r < 0)
2849 return log_error_errno(r, "Failed to acquire randomized machine UUID: %m");
2850 }
317feb4d 2851 }
691675ba 2852
e01ff70a
MS
2853 return 0;
2854}
2855
7336138e
LP
2856static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
2857 int r;
2858
2859 assert(directory);
2860
6c045a99 2861 if (arg_userns_mode == USER_NAMESPACE_NO || arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_CHOWN)
7336138e
LP
2862 return 0;
2863
2864 r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
2865 if (r == -EOPNOTSUPP)
2866 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
2867 if (r == -EBADE)
2868 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
2869 if (r < 0)
2870 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
2871 if (r == 0)
2872 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
2873 else
2874 log_debug("Patched directory tree to match UID/GID range.");
2875
2876 return r;
2877}
2878
113cea80 2879/*
6d416b9c
LS
2880 * Return values:
2881 * < 0 : wait_for_terminate() failed to get the state of the
2882 * container, the container was terminated by a signal, or
2883 * failed for an unknown reason. No change is made to the
2884 * container argument.
2885 * > 0 : The program executed in the container terminated with an
2886 * error. The exit code of the program executed in the
919699ec
LP
2887 * container is returned. The container argument has been set
2888 * to CONTAINER_TERMINATED.
6d416b9c
LS
2889 * 0 : The container is being rebooted, has been shut down or exited
2890 * successfully. The container argument has been set to either
2891 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 2892 *
6d416b9c
LS
2893 * That is, success is indicated by a return value of zero, and an
2894 * error is indicated by a non-zero value.
113cea80
DH
2895 */
2896static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 2897 siginfo_t status;
919699ec 2898 int r;
113cea80
DH
2899
2900 r = wait_for_terminate(pid, &status);
f647962d
MS
2901 if (r < 0)
2902 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
2903
2904 switch (status.si_code) {
fddbb89c 2905
113cea80 2906 case CLD_EXITED:
b5a2179b 2907 if (status.si_status == 0)
919699ec 2908 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
b5a2179b 2909 else
919699ec 2910 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 2911
919699ec
LP
2912 *container = CONTAINER_TERMINATED;
2913 return status.si_status;
113cea80
DH
2914
2915 case CLD_KILLED:
2916 if (status.si_status == SIGINT) {
919699ec 2917 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 2918 *container = CONTAINER_TERMINATED;
919699ec
LP
2919 return 0;
2920
113cea80 2921 } else if (status.si_status == SIGHUP) {
919699ec 2922 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 2923 *container = CONTAINER_REBOOTED;
919699ec 2924 return 0;
113cea80 2925 }
919699ec 2926
4831981d 2927 _fallthrough_;
113cea80 2928 case CLD_DUMPED:
baaa35ad
ZJS
2929 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2930 "Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
113cea80
DH
2931
2932 default:
baaa35ad
ZJS
2933 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2934 "Container %s failed due to unknown reason.", arg_machine);
113cea80 2935 }
113cea80
DH
2936}
2937
023fb90b
LP
2938static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2939 pid_t pid;
2940
4a0b58c4 2941 pid = PTR_TO_PID(userdata);
023fb90b 2942 if (pid > 0) {
c6c8f6e2 2943 if (kill(pid, arg_kill_signal) >= 0) {
023fb90b
LP
2944 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2945 sd_event_source_set_userdata(s, NULL);
2946 return 0;
2947 }
2948 }
2949
2950 sd_event_exit(sd_event_source_get_event(s), 0);
2951 return 0;
2952}
2953
6916b164 2954static int on_sigchld(sd_event_source *s, const struct signalfd_siginfo *ssi, void *userdata) {
abdb9b08
LP
2955 pid_t pid;
2956
2957 assert(s);
2958 assert(ssi);
2959
2960 pid = PTR_TO_PID(userdata);
2961
6916b164
AU
2962 for (;;) {
2963 siginfo_t si = {};
abdb9b08 2964
6916b164
AU
2965 if (waitid(P_ALL, 0, &si, WNOHANG|WNOWAIT|WEXITED) < 0)
2966 return log_error_errno(errno, "Failed to waitid(): %m");
2967 if (si.si_pid == 0) /* No pending children. */
2968 break;
abdb9b08 2969 if (si.si_pid == pid) {
6916b164
AU
2970 /* The main process we care for has exited. Return from
2971 * signal handler but leave the zombie. */
2972 sd_event_exit(sd_event_source_get_event(s), 0);
2973 break;
2974 }
abdb9b08 2975
6916b164
AU
2976 /* Reap all other children. */
2977 (void) waitid(P_PID, si.si_pid, &si, WNOHANG|WEXITED);
2978 }
2979
2980 return 0;
2981}
2982
abdb9b08
LP
2983static int on_request_stop(sd_bus_message *m, void *userdata, sd_bus_error *error) {
2984 pid_t pid;
2985
2986 assert(m);
2987
2988 pid = PTR_TO_PID(userdata);
2989
2990 if (arg_kill_signal > 0) {
2991 log_info("Container termination requested. Attempting to halt container.");
2992 (void) kill(pid, arg_kill_signal);
2993 } else {
2994 log_info("Container termination requested. Exiting.");
2995 sd_event_exit(sd_bus_get_event(sd_bus_message_get_bus(m)), 0);
2996 }
2997
2998 return 0;
2999}
3000
ec16945e 3001static int determine_names(void) {
1b9cebf6 3002 int r;
ec16945e 3003
c1521918
LP
3004 if (arg_template && !arg_directory && arg_machine) {
3005
3006 /* If --template= was specified then we should not
3007 * search for a machine, but instead create a new one
3008 * in /var/lib/machine. */
3009
657ee2d8 3010 arg_directory = path_join("/var/lib/machines", arg_machine);
c1521918
LP
3011 if (!arg_directory)
3012 return log_oom();
3013 }
3014
ec16945e 3015 if (!arg_image && !arg_directory) {
1b9cebf6
LP
3016 if (arg_machine) {
3017 _cleanup_(image_unrefp) Image *i = NULL;
3018
d577d4a4 3019 r = image_find(IMAGE_MACHINE, arg_machine, NULL, &i);
3a6ce860
LP
3020 if (r == -ENOENT)
3021 return log_error_errno(r, "No image for machine '%s'.", arg_machine);
1b9cebf6
LP
3022 if (r < 0)
3023 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
1b9cebf6 3024
eb38edce 3025 if (IN_SET(i->type, IMAGE_RAW, IMAGE_BLOCK))
0f03c2a4 3026 r = free_and_strdup(&arg_image, i->path);
1b9cebf6 3027 else
0f03c2a4 3028 r = free_and_strdup(&arg_directory, i->path);
1b9cebf6 3029 if (r < 0)
0f3be6ca 3030 return log_oom();
1b9cebf6 3031
aee327b8
LP
3032 if (!arg_ephemeral)
3033 arg_read_only = arg_read_only || i->read_only;
d7249575
LP
3034 } else {
3035 r = safe_getcwd(&arg_directory);
3036 if (r < 0)
3037 return log_error_errno(r, "Failed to determine current directory: %m");
3038 }
ec16945e 3039
c6147113
LP
3040 if (!arg_directory && !arg_image)
3041 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine path, please use -D or -i.");
ec16945e
LP
3042 }
3043
3044 if (!arg_machine) {
b9ba4dab
LP
3045 if (arg_directory && path_equal(arg_directory, "/"))
3046 arg_machine = gethostname_malloc();
e9b88a6d
LP
3047 else if (arg_image) {
3048 char *e;
4827ab48 3049
b36e39d2
LP
3050 r = path_extract_filename(arg_image, &arg_machine);
3051 if (r < 0)
3052 return log_error_errno(r, "Failed to extract file name from '%s': %m", arg_image);
4827ab48 3053
e9b88a6d
LP
3054 /* Truncate suffix if there is one */
3055 e = endswith(arg_machine, ".raw");
3056 if (e)
3057 *e = 0;
b36e39d2
LP
3058 } else {
3059 r = path_extract_filename(arg_directory, &arg_machine);
3060 if (r < 0)
3061 return log_error_errno(r, "Failed to extract file name from '%s': %m", arg_directory);
3062 }
ec16945e 3063
ae691c1d 3064 hostname_cleanup(arg_machine);
52ef5dd7 3065 if (!hostname_is_valid(arg_machine, 0))
c6147113 3066 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine machine name automatically, please use -M.");
b9ba4dab 3067
3603f151
LB
3068 /* Copy the machine name before the random suffix is added below, otherwise we won't be able
3069 * to match fixed config file names. */
3070 arg_settings_filename = strjoin(arg_machine, ".nspawn");
3071 if (!arg_settings_filename)
3072 return log_oom();
3073
e9b88a6d
LP
3074 /* Add a random suffix when this is an ephemeral machine, so that we can run many
3075 * instances at once without manually having to specify -M each time. */
3076 if (arg_ephemeral)
3077 if (strextendf(&arg_machine, "-%016" PRIx64, random_u64()) < 0)
b9ba4dab 3078 return log_oom();
3603f151
LB
3079 } else {
3080 arg_settings_filename = strjoin(arg_machine, ".nspawn");
3081 if (!arg_settings_filename)
3082 return log_oom();
ec16945e
LP
3083 }
3084
3085 return 0;
3086}
3087
/* Resolves the path stored in *p with chase() (passing 'flags' through) and replaces *p with the
 * result. A NULL *p is left alone.
 *
 * Returns 0 if *p is NULL, a negative value if resolution fails, otherwise the result of
 * free_and_replace(). */
static int chase_and_update(char **p, unsigned flags) {
        char *resolved;
        int r;

        assert(p);

        if (*p == NULL)
                return 0;

        r = chase(*p, NULL, flags, &resolved, NULL);
        if (r < 0)
                return log_error_errno(r, "Failed to resolve path %s: %m", *p);

        return free_and_replace(*p, resolved);
}
3103
03cfe0d5 3104static int determine_uid_shift(const char *directory) {
6dac160c 3105
0de7acce 3106 if (arg_userns_mode == USER_NAMESPACE_NO) {
03cfe0d5 3107 arg_uid_shift = 0;
6dac160c 3108 return 0;
03cfe0d5 3109 }
6dac160c
LP
3110
3111 if (arg_uid_shift == UID_INVALID) {
3112 struct stat st;
3113
993da6d4
LP
3114 /* Read the UID shift off the image. Maybe we can reuse this to avoid chowning. */
3115
3116 if (stat(directory, &st) < 0)
03cfe0d5 3117 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
6dac160c
LP
3118
3119 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
3120
baaa35ad
ZJS
3121 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000)))
3122 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
3123 "UID and GID base of %s don't match.", directory);
6dac160c
LP
3124
3125 arg_uid_range = UINT32_C(0x10000);
f61c7f88
LP
3126
3127 if (arg_uid_shift != 0) {
3128 /* If the image is shifted already, then we'll fall back to classic chowning, for
3129 * compatibility (and simplicity), or refuse if mapping is explicitly requested. */
3130
3131 if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_AUTO) {
3132 log_debug("UID base of %s is non-zero, not using UID mapping.", directory);
3133 arg_userns_ownership = USER_NAMESPACE_OWNERSHIP_CHOWN;
3134 } else if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_MAP)
3135 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
3136 "UID base of %s is not zero, UID mapping not supported.", directory);
3137 }
6dac160c
LP
3138 }
3139
58e13de5
LP
3140 if (!userns_shift_range_valid(arg_uid_shift, arg_uid_range))
3141 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "UID base too high for UID range.");
6dac160c 3142
6dac160c
LP
3143 return 0;
3144}
3145
de40a303
LP
3146static unsigned long effective_clone_ns_flags(void) {
3147 unsigned long flags = arg_clone_ns_flags;
3148
3149 if (arg_private_network)
3150 flags |= CLONE_NEWNET;
3151 if (arg_use_cgns)
3152 flags |= CLONE_NEWCGROUP;
3153 if (arg_userns_mode != USER_NAMESPACE_NO)
3154 flags |= CLONE_NEWUSER;
3155
3156 return flags;
3157}
3158
3159static int patch_sysctl(void) {
3160
3161 /* This table is inspired by runc's sysctl() function */
3162 static const struct {
3163 const char *key;
3164 bool prefix;
3165 unsigned long clone_flags;
3166 } safe_sysctl[] = {
3167 { "kernel.hostname", false, CLONE_NEWUTS },
3168 { "kernel.domainname", false, CLONE_NEWUTS },
3169 { "kernel.msgmax", false, CLONE_NEWIPC },
3170 { "kernel.msgmnb", false, CLONE_NEWIPC },
3171 { "kernel.msgmni", false, CLONE_NEWIPC },
3172 { "kernel.sem", false, CLONE_NEWIPC },
3173 { "kernel.shmall", false, CLONE_NEWIPC },
3174 { "kernel.shmmax", false, CLONE_NEWIPC },
3175 { "kernel.shmmni", false, CLONE_NEWIPC },
3176 { "fs.mqueue.", true, CLONE_NEWIPC },
3177 { "net.", true, CLONE_NEWNET },
3178 };
3179
3180 unsigned long flags;
de40a303
LP
3181 int r;
3182
3183 flags = effective_clone_ns_flags();
3184
3185 STRV_FOREACH_PAIR(k, v, arg_sysctl) {
3186 bool good = false;
3187 size_t i;
3188
3189 for (i = 0; i < ELEMENTSOF(safe_sysctl); i++) {
3190
3191 if (!FLAGS_SET(flags, safe_sysctl[i].clone_flags))
3192 continue;
3193
3194 if (safe_sysctl[i].prefix)
3195 good = startswith(*k, safe_sysctl[i].key);
3196 else
3197 good = streq(*k, safe_sysctl[i].key);
3198
3199 if (good)
3200 break;
3201 }
3202
c6147113
LP
3203 if (!good)
3204 return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Refusing to write to sysctl '%s', as it is not safe in the selected namespaces.", *k);
de40a303
LP
3205
3206 r = sysctl_write(*k, *v);
3207 if (r < 0)
3208 return log_error_errno(r, "Failed to write sysctl '%s': %m", *k);
3209 }
3210
3211 return 0;
3212}
3213
03cfe0d5
LP
3214static int inner_child(
3215 Barrier *barrier,
5d9d3fcb 3216 int fd_inner_socket,
e1bb4b0d
LB
3217 FDSet *fds,
3218 char **os_release_pairs) {
69c79d3c 3219
03cfe0d5 3220 _cleanup_free_ char *home = NULL;
88614c8a 3221 size_t n_env = 1;
4ab3d29f
ZJS
3222 char *envp[] = {
3223 (char*) "PATH=" DEFAULT_PATH_COMPAT,
6aadfa4c 3224 NULL, /* container */
03cfe0d5
LP
3225 NULL, /* TERM */
3226 NULL, /* HOME */
3227 NULL, /* USER */
3228 NULL, /* LOGNAME */
3229 NULL, /* container_uuid */
3230 NULL, /* LISTEN_FDS */
3231 NULL, /* LISTEN_PID */
9c1e04d0 3232 NULL, /* NOTIFY_SOCKET */
3652872a 3233 NULL, /* CREDENTIALS_DIRECTORY */
b626f695 3234 NULL, /* LANG */
03cfe0d5
LP
3235 NULL
3236 };
1a68e1e5 3237 const char *exec_target;
2371271c 3238 _cleanup_strv_free_ char **env_use = NULL;
de40a303 3239 int r, which_failed;
88213476 3240
b37469d7
LP
3241 /* This is the "inner" child process, i.e. the one forked off by the "outer" child process, which is the one
3242 * the container manager itself forked off. At the time of clone() it gained its own CLONE_NEWNS, CLONE_NEWPID,
3243 * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER namespaces. Note that it has its own CLONE_NEWNS namespace,
3244 * separate from the CLONE_NEWNS created for the "outer" child, and also separate from the host's CLONE_NEWNS
3245 * namespace. The reason for having two levels of CLONE_NEWNS namespaces is that the "inner" one is owned by
3246 * the CLONE_NEWUSER namespace of the container, while the "outer" one is owned by the host's CLONE_NEWUSER
3247 * namespace.
3248 *
3249 * Note at this point we have no CLONE_NEWNET namespace yet. We'll acquire that one later through
3250 * unshare(). See below. */
3251
03cfe0d5 3252 assert(barrier);
5d9d3fcb 3253 assert(fd_inner_socket >= 0);
88213476 3254
de40a303
LP
3255 log_debug("Inner child is initializing.");
3256
0de7acce 3257 if (arg_userns_mode != USER_NAMESPACE_NO) {
03cfe0d5
LP
3258 /* Tell the parent, that it now can write the UID map. */
3259 (void) barrier_place(barrier); /* #1 */
7027ff61 3260
03cfe0d5 3261 /* Wait until the parent wrote the UID map */
baaa35ad 3262 if (!barrier_place_and_sync(barrier)) /* #2 */
2a2e78e9 3263 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Parent died too early");
88213476 3264
2a2e78e9
LP
3265 /* Become the new root user inside our namespace */
3266 r = reset_uid_gid();
3267 if (r < 0)
3268 return log_error_errno(r, "Couldn't become new root: %m");
3269
3270 /* Creating a new user namespace means all MS_SHARED mounts become MS_SLAVE. Let's put them
3271 * back to MS_SHARED here, since that's what we want as defaults. (This will not reconnect
3272 * propagation, but simply create new peer groups for all our mounts). */
511a8cfe 3273 r = mount_follow_verbose(LOG_ERR, NULL, "/", NULL, MS_SHARED|MS_REC, NULL);
2a2e78e9
LP
3274 if (r < 0)
3275 return r;
3276 }
6d66bd3b 3277
0de7acce 3278 r = mount_all(NULL,
4f086aab 3279 arg_mount_settings | MOUNT_IN_USERNS,
0de7acce 3280 arg_uid_shift,
0de7acce 3281 arg_selinux_apifs_context);
03cfe0d5
LP
3282 if (r < 0)
3283 return r;
3284
04413780
ZJS
3285 if (!arg_network_namespace_path && arg_private_network) {
3286 r = unshare(CLONE_NEWNET);
3287 if (r < 0)
3288 return log_error_errno(errno, "Failed to unshare network namespace: %m");
75116558
PS
3289
3290 /* Tell the parent that it can setup network interfaces. */
3291 (void) barrier_place(barrier); /* #3 */
04413780
ZJS
3292 }
3293
4f086aab 3294 r = mount_sysfs(NULL, arg_mount_settings);
d8fc6a00
LP
3295 if (r < 0)
3296 return r;
3297
03cfe0d5
LP
3298 /* Wait until we are cgroup-ified, so that we
3299 * can mount the right cgroup path writable */
baaa35ad
ZJS
3300 if (!barrier_place_and_sync(barrier)) /* #4 */
3301 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
3302 "Parent died too early");
88213476 3303
489fae52 3304 if (arg_use_cgns) {
0996ef00
CB
3305 r = unshare(CLONE_NEWCGROUP);
3306 if (r < 0)
04413780 3307 return log_error_errno(errno, "Failed to unshare cgroup namespace: %m");
0996ef00
CB
3308 r = mount_cgroups(
3309 "",
3310 arg_unified_cgroup_hierarchy,
3311 arg_userns_mode != USER_NAMESPACE_NO,
3312 arg_uid_shift,
3313 arg_uid_range,
5a8ff0e6 3314 arg_selinux_apifs_context,
ada54120 3315 true);
1433e0f2 3316 } else
0996ef00 3317 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
1433e0f2
LP
3318 if (r < 0)
3319 return r;
ec16945e 3320
1e4f1671 3321 r = setup_boot_id();
03cfe0d5
LP
3322 if (r < 0)
3323 return r;
ec16945e 3324
5d9d3fcb 3325 r = setup_kmsg(fd_inner_socket);
03cfe0d5
LP
3326 if (r < 0)
3327 return r;
ec16945e 3328
de40a303
LP
3329 r = mount_custom(
3330 "/",
3331 arg_custom_mounts,
3332 arg_n_custom_mounts,
de40a303 3333 0,
c0c8f718 3334 0,
de40a303 3335 arg_selinux_apifs_context,
5f0a6347 3336 MOUNT_NON_ROOT_ONLY | MOUNT_IN_USERNS);
de40a303
LP
3337 if (r < 0)
3338 return r;
3339
03cfe0d5
LP
3340 if (setsid() < 0)
3341 return log_error_errno(errno, "setsid() failed: %m");
3342
3343 if (arg_private_network)
df883de9 3344 (void) loopback_setup();
03cfe0d5 3345
7a8f6325 3346 if (arg_expose_ports) {
b07ee903 3347 r = expose_port_send_rtnl(fd_inner_socket);
7a8f6325
LP
3348 if (r < 0)
3349 return r;
7a8f6325 3350 }
03cfe0d5 3351
3acc84eb 3352 if (arg_console_mode != CONSOLE_PIPE) {
5bb1d7fb 3353 _cleanup_close_ int master = -EBADF;
3acc84eb
FB
3354 _cleanup_free_ char *console = NULL;
3355
3356 /* Allocate a pty and make it available as /dev/console. */
dc98caea 3357 master = openpt_allocate(O_RDWR|O_NONBLOCK, &console);
3acc84eb 3358 if (master < 0)
dc98caea 3359 return log_error_errno(master, "Failed to allocate a pty: %m");
3acc84eb
FB
3360
3361 r = setup_dev_console(console);
3362 if (r < 0)
105a1a36 3363 return log_error_errno(r, "Failed to set up /dev/console: %m");
3acc84eb 3364
bb1aa185 3365 r = send_one_fd(fd_inner_socket, master, 0);
3acc84eb
FB
3366 if (r < 0)
3367 return log_error_errno(r, "Failed to send master fd: %m");
3acc84eb
FB
3368
3369 r = setup_stdio_as_dev_console();
3370 if (r < 0)
3371 return r;
3372 }
3373
de40a303
LP
3374 r = patch_sysctl();
3375 if (r < 0)
3376 return r;
3377
81f345df
LP
3378 if (arg_oom_score_adjust_set) {
3379 r = set_oom_score_adjust(arg_oom_score_adjust);
3380 if (r < 0)
3381 return log_error_errno(r, "Failed to adjust OOM score: %m");
3382 }
3383
0985c7c4
ZJS
3384 if (arg_cpu_set.set)
3385 if (sched_setaffinity(0, arg_cpu_set.allocated, arg_cpu_set.set) < 0)
d107bb7d
LP
3386 return log_error_errno(errno, "Failed to set CPU affinity: %m");
3387
c818eef1 3388 (void) setup_hostname();
03cfe0d5 3389
050f7277 3390 if (arg_personality != PERSONALITY_INVALID) {
21022b9d
LP
3391 r = safe_personality(arg_personality);
3392 if (r < 0)
3393 return log_error_errno(r, "personality() failed: %m");
4c27749b
LP
3394#ifdef ARCHITECTURE_SECONDARY
3395 } else if (arg_architecture == ARCHITECTURE_SECONDARY) {
21022b9d
LP
3396 r = safe_personality(PER_LINUX32);
3397 if (r < 0)
3398 return log_error_errno(r, "personality() failed: %m");
4c27749b
LP
3399#endif
3400 } else if (arg_architecture >= 0 && arg_architecture != native_architecture())
3401 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
3402 "Selected architecture '%s' not supported locally, refusing.",
3403 architecture_to_string(arg_architecture));
03cfe0d5 3404
de40a303
LP
3405 r = setrlimit_closest_all((const struct rlimit *const*) arg_rlimit, &which_failed);
3406 if (r < 0)
3407 return log_error_errno(r, "Failed to apply resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
3408
3409#if HAVE_SECCOMP
3410 if (arg_seccomp) {
3411
3412 if (is_seccomp_available()) {
3413
3414 r = seccomp_load(arg_seccomp);
7bc5e0b1 3415 if (ERRNO_IS_SECCOMP_FATAL(r))
de40a303
LP
3416 return log_error_errno(r, "Failed to install seccomp filter: %m");
3417 if (r < 0)
3418 log_debug_errno(r, "Failed to install seccomp filter: %m");
3419 }
3420 } else
3421#endif
3422 {
6b000af4 3423 r = setup_seccomp(arg_caps_retain, arg_syscall_allow_list, arg_syscall_deny_list);
de40a303
LP
3424 if (r < 0)
3425 return r;
3426 }
3427
4a4654e0 3428 if (arg_suppress_sync) {
20e458ae 3429#if HAVE_SECCOMP
4a4654e0
LP
3430 r = seccomp_suppress_sync();
3431 if (r < 0)
3432 log_debug_errno(r, "Failed to install sync() suppression seccomp filter, ignoring: %m");
20e458ae 3433#else
2db32618 3434 log_debug("systemd is built without SECCOMP support. Ignoring --suppress-sync= command line option and SuppressSync= setting.");
20e458ae 3435#endif
4a4654e0
LP
3436 }
3437
349cc4a5 3438#if HAVE_SELINUX
03cfe0d5 3439 if (arg_selinux_context)
2ed96880 3440 if (setexeccon(arg_selinux_context) < 0)
03cfe0d5
LP
3441 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
3442#endif
3443
de40a303
LP
3444 /* Make sure we keep the caps across the uid/gid dropping, so that we can retain some selected caps
3445 * if we need to later on. */
3446 if (prctl(PR_SET_KEEPCAPS, 1) < 0)
3447 return log_error_errno(errno, "Failed to set PR_SET_KEEPCAPS: %m");
3448
3449 if (uid_is_valid(arg_uid) || gid_is_valid(arg_gid))
3462d773 3450 r = change_uid_gid_raw(arg_uid, arg_gid, arg_supplementary_gids, arg_n_supplementary_gids, arg_console_mode != CONSOLE_PIPE);
de40a303 3451 else
3462d773 3452 r = change_uid_gid(arg_user, arg_console_mode != CONSOLE_PIPE, &home);
03cfe0d5
LP
3453 if (r < 0)
3454 return r;
3455
de40a303
LP
3456 r = drop_capabilities(getuid());
3457 if (r < 0)
3458 return log_error_errno(r, "Dropping capabilities failed: %m");
3459
66edd963
LP
3460 if (arg_no_new_privileges)
3461 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
3462 return log_error_errno(errno, "Failed to disable new privileges: %m");
3463
6aadfa4c
ILG
3464 /* LXC sets container=lxc, so follow the scheme here */
3465 envp[n_env++] = strjoina("container=", arg_container_service_name);
3466
03cfe0d5
LP
3467 envp[n_env] = strv_find_prefix(environ, "TERM=");
3468 if (envp[n_env])
313cefa1 3469 n_env++;
03cfe0d5 3470
de40a303 3471 if (home || !uid_is_valid(arg_uid) || arg_uid == 0)
4ab3d29f 3472 if (asprintf(envp + n_env++, "HOME=%s", home ?: "/root") < 0)
de40a303
LP
3473 return log_oom();
3474
3475 if (arg_user || !uid_is_valid(arg_uid) || arg_uid == 0)
4ab3d29f 3476 if (asprintf(envp + n_env++, "USER=%s", arg_user ?: "root") < 0 ||
1da3cb81 3477 asprintf(envp + n_env++, "LOGNAME=%s", arg_user ?: "root") < 0)
de40a303 3478 return log_oom();
03cfe0d5 3479
3bbaff3e 3480 assert(!sd_id128_is_null(arg_uuid));
03cfe0d5 3481
b7416360 3482 if (asprintf(envp + n_env++, "container_uuid=%s", SD_ID128_TO_UUID_STRING(arg_uuid)) < 0)
e01ff70a 3483 return log_oom();
03cfe0d5
LP
3484
3485 if (fdset_size(fds) > 0) {
3486 r = fdset_cloexec(fds, false);
3487 if (r < 0)
3488 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
3489
4ab3d29f
ZJS
3490 if ((asprintf(envp + n_env++, "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
3491 (asprintf(envp + n_env++, "LISTEN_PID=1") < 0))
03cfe0d5
LP
3492 return log_oom();
3493 }
4ab3d29f 3494 if (asprintf(envp + n_env++, "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
9c1e04d0 3495 return log_oom();
03cfe0d5 3496
3652872a
LP
3497 if (arg_n_credentials > 0) {
3498 envp[n_env] = strdup("CREDENTIALS_DIRECTORY=/run/host/credentials");
3499 if (!envp[n_env])
3500 return log_oom();
3501 n_env++;
3502 }
3503
b626f695 3504 if (arg_start_mode != START_BOOT) {
a22f5186 3505 envp[n_env] = strdup("LANG=" SYSTEMD_NSPAWN_LOCALE);
b626f695
DDM
3506 if (!envp[n_env])
3507 return log_oom();
3508 n_env++;
3509 }
3510
4ab3d29f 3511 env_use = strv_env_merge(envp, os_release_pairs, arg_setenv);
2371271c
TG
3512 if (!env_use)
3513 return log_oom();
03cfe0d5
LP
3514
3515 /* Let the parent know that we are ready and
3516 * wait until the parent is ready with the
3517 * setup, too... */
baaa35ad 3518 if (!barrier_place_and_sync(barrier)) /* #5 */
335d2ead 3519 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Parent died too early");
03cfe0d5 3520
5f932eb9
LP
3521 if (arg_chdir)
3522 if (chdir(arg_chdir) < 0)
3523 return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
3524
7732f92b 3525 if (arg_start_mode == START_PID2) {
75bf701f 3526 r = stub_pid1(arg_uuid);
7732f92b
LP
3527 if (r < 0)
3528 return r;
3529 }
3530
335d2ead
LP
3531 if (arg_console_mode != CONSOLE_PIPE) {
3532 /* So far our pty wasn't controlled by any process. Finally, it's time to change that, if we
3533 * are configured for that. Acquire it as controlling tty. */
3534 if (ioctl(STDIN_FILENO, TIOCSCTTY) < 0)
3535 return log_error_errno(errno, "Failed to acquire controlling TTY: %m");
3536 }
3537
de40a303
LP
3538 log_debug("Inner child completed, invoking payload.");
3539
8ca082b4
LP
3540 /* Now, explicitly close the log, so that we then can close all remaining fds. Closing the log explicitly first
3541 * has the benefit that the logging subsystem knows about it, and is thus ready to be reopened should we need
3542 * it again. Note that the other fds closed here are at least the locking and barrier fds. */
03cfe0d5 3543 log_close();
8ca082b4 3544 log_set_open_when_needed(true);
a3b00f91 3545 log_settle_target();
8ca082b4 3546
03cfe0d5
LP
3547 (void) fdset_close_others(fds);
3548
7732f92b 3549 if (arg_start_mode == START_BOOT) {
03cfe0d5
LP
3550 char **a;
3551 size_t m;
3552
3553 /* Automatically search for the init system */
3554
75f32f04
ZJS
3555 m = strv_length(arg_parameters);
3556 a = newa(char*, m + 2);
3557 memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
3558 a[1 + m] = NULL;
03cfe0d5 3559
a5096641
LP
3560 FOREACH_STRING(init,
3561 "/usr/lib/systemd/systemd",
3562 "/lib/systemd/systemd",
3563 "/sbin/init") {
3564 a[0] = (char*) init;
3565 execve(a[0], a, env_use);
3566 }
ced58da7
LP
3567
3568 exec_target = "/usr/lib/systemd/systemd, /lib/systemd/systemd, /sbin/init";
1a68e1e5 3569 } else if (!strv_isempty(arg_parameters)) {
b6b180b7
LP
3570 const char *dollar_path;
3571
1a68e1e5 3572 exec_target = arg_parameters[0];
b6b180b7
LP
3573
3574 /* Use the user supplied search $PATH if there is one, or DEFAULT_PATH_COMPAT if not to search the
3575 * binary. */
3576 dollar_path = strv_env_get(env_use, "PATH");
3577 if (dollar_path) {
6f646e01 3578 if (setenv("PATH", dollar_path, 1) < 0)
b6b180b7
LP
3579 return log_error_errno(errno, "Failed to update $PATH: %m");
3580 }
3581
f757855e 3582 execvpe(arg_parameters[0], arg_parameters, env_use);
1a68e1e5 3583 } else {
5f932eb9 3584 if (!arg_chdir)
d929b0f9
ZJS
3585 /* If we cannot change the directory, we'll end up in /, that is expected. */
3586 (void) chdir(home ?: "/root");
5f932eb9 3587
53350c7b 3588 execle(DEFAULT_USER_SHELL, "-" DEFAULT_USER_SHELL_NAME, NULL, env_use);
3589 if (!streq(DEFAULT_USER_SHELL, "/bin/bash"))
3590 execle("/bin/bash", "-bash", NULL, env_use);
3591 if (!streq(DEFAULT_USER_SHELL, "/bin/sh"))
3592 execle("/bin/sh", "-sh", NULL, env_use);
ced58da7 3593
53350c7b 3594 exec_target = DEFAULT_USER_SHELL ", /bin/bash, /bin/sh";
03cfe0d5
LP
3595 }
3596
8ca082b4 3597 return log_error_errno(errno, "execv(%s) failed: %m", exec_target);
03cfe0d5
LP
3598}
3599
e96ceaba 3600static int setup_notify_child(void) {
254d1313 3601 _cleanup_close_ int fd = -EBADF;
1eb874b9 3602 static const union sockaddr_union sa = {
44ed5214
LP
3603 .un.sun_family = AF_UNIX,
3604 .un.sun_path = NSPAWN_NOTIFY_SOCKET_PATH,
9c1e04d0
AP
3605 };
3606 int r;
3607
3608 fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
3609 if (fd < 0)
3610 return log_error_errno(errno, "Failed to allocate notification socket: %m");
3611
3612 (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
fbda85b0 3613 (void) sockaddr_un_unlink(&sa.un);
9c1e04d0 3614
9c1e04d0 3615 r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
271f518f 3616 if (r < 0)
44ed5214 3617 return log_error_errno(errno, "bind(" NSPAWN_NOTIFY_SOCKET_PATH ") failed: %m");
9c1e04d0 3618
adc7d9f0 3619 r = userns_lchown(NSPAWN_NOTIFY_SOCKET_PATH, 0, 0);
271f518f 3620 if (r < 0)
adc7d9f0 3621 return log_error_errno(r, "Failed to chown " NSPAWN_NOTIFY_SOCKET_PATH ": %m");
adc7d9f0 3622
2ff48e98 3623 r = setsockopt_int(fd, SOL_SOCKET, SO_PASSCRED, true);
271f518f 3624 if (r < 0)
2ff48e98 3625 return log_error_errno(r, "SO_PASSCRED failed: %m");
9c1e04d0 3626
271f518f 3627 return TAKE_FD(fd);
9c1e04d0
AP
3628}
3629
03cfe0d5
LP
3630static int outer_child(
3631 Barrier *barrier,
3632 const char *directory,
2d845785 3633 DissectedImage *dissected_image,
af06cd30 3634 int fd_outer_socket,
5d9d3fcb 3635 int fd_inner_socket,
d7bea6b6
DP
3636 FDSet *fds,
3637 int netns_fd) {
03cfe0d5 3638
2f893044 3639 _cleanup_(bind_user_context_freep) BindUserContext *bind_user_context = NULL;
e1bb4b0d 3640 _cleanup_strv_free_ char **os_release_pairs = NULL;
254d1313 3641 _cleanup_close_ int fd = -EBADF, mntns_fd = -EBADF;
f61c7f88 3642 bool idmap = false;
e5f10caf 3643 const char *p;
03cfe0d5
LP
3644 pid_t pid;
3645 ssize_t l;
de40a303 3646 int r;
03cfe0d5 3647
d1d0b895
LP
3648 /* This is the "outer" child process, i.e the one forked off by the container manager itself. It
3649 * already has its own CLONE_NEWNS namespace (which was created by the clone()). It still lives in
3650 * the host's CLONE_NEWPID, CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER and CLONE_NEWNET
3651 * namespaces. After it completed a number of initializations a second child (the "inner" one) is
3652 * forked off it, and it exits. */
b37469d7 3653
03cfe0d5
LP
3654 assert(barrier);
3655 assert(directory);
af06cd30 3656 assert(fd_outer_socket >= 0);
5d9d3fcb 3657 assert(fd_inner_socket >= 0);
03cfe0d5 3658
de40a303
LP
3659 log_debug("Outer child is initializing.");
3660
e1bb4b0d
LB
3661 r = load_os_release_pairs_with_prefix("/", "container_host_", &os_release_pairs);
3662 if (r < 0)
3663 log_debug_errno(r, "Failed to read os-release from host for container, ignoring: %m");
3664
03cfe0d5
LP
3665 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
3666 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3667
03cfe0d5
LP
3668 r = reset_audit_loginuid();
3669 if (r < 0)
3670 return r;
3671
2a2e78e9
LP
3672 /* Mark everything as slave, so that we still receive mounts from the real root, but don't propagate
3673 * mounts to the real root. */
511a8cfe 3674 r = mount_follow_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
60e76d48
ZJS
3675 if (r < 0)
3676 return r;
03cfe0d5 3677
2d845785 3678 if (dissected_image) {
d1d0b895
LP
3679 /* If we are operating on a disk image, then mount its root directory now, but leave out the
3680 * rest. We can read the UID shift from it if we need to. Further down we'll mount the rest,
3681 * but then with the uid shift known. That way we can mount VFAT file systems shifted to the
3682 * right place right away. This makes sure ESP partitions and userns are compatible. */
2d3a5a73 3683
af187ab2 3684 r = dissected_image_mount_and_warn(
d04faa4e
LP
3685 dissected_image,
3686 directory,
3687 arg_uid_shift,
21b61b1d 3688 arg_uid_range,
d04faa4e
LP
3689 DISSECT_IMAGE_MOUNT_ROOT_ONLY|
3690 DISSECT_IMAGE_DISCARD_ON_LOOP|
3691 DISSECT_IMAGE_USR_NO_ROOT|
c65f854a 3692 (arg_read_only ? DISSECT_IMAGE_READ_ONLY : DISSECT_IMAGE_FSCK|DISSECT_IMAGE_GROWFS)|
af187ab2 3693 (arg_start_mode == START_BOOT ? DISSECT_IMAGE_VALIDATE_OS : 0));
2d845785 3694 if (r < 0)
af187ab2 3695 return r;
2d845785 3696 }
03cfe0d5 3697
391567f4
LP
3698 r = determine_uid_shift(directory);
3699 if (r < 0)
3700 return r;
3701
0de7acce 3702 if (arg_userns_mode != USER_NAMESPACE_NO) {
b71a0192
CB
3703 r = namespace_open(0, NULL, &mntns_fd, NULL, NULL, NULL);
3704 if (r < 0)
3705 return log_error_errno(r, "Failed to pin outer mount namespace: %m");
3706
af06cd30 3707 l = send_one_fd(fd_outer_socket, mntns_fd, 0);
b71a0192
CB
3708 if (l < 0)
3709 return log_error_errno(l, "Failed to send outer mount namespace fd: %m");
3710 mntns_fd = safe_close(mntns_fd);
3711
0e7ac751 3712 /* Let the parent know which UID shift we read from the image */
af06cd30 3713 l = send(fd_outer_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
825d5287
RM
3714 if (l < 0)
3715 return log_error_errno(errno, "Failed to send UID shift: %m");
baaa35ad
ZJS
3716 if (l != sizeof(arg_uid_shift))
3717 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3718 "Short write while sending UID shift.");
0e7ac751 3719
0de7acce 3720 if (arg_userns_mode == USER_NAMESPACE_PICK) {
d1d0b895
LP
3721 /* When we are supposed to pick the UID shift, the parent will check now whether the
3722 * UID shift we just read from the image is available. If yes, it will send the UID
3723 * shift back to us, if not it will pick a different one, and send it back to us. */
0e7ac751 3724
af06cd30 3725 l = recv(fd_outer_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
0e7ac751
LP
3726 if (l < 0)
3727 return log_error_errno(errno, "Failed to recv UID shift: %m");
baaa35ad
ZJS
3728 if (l != sizeof(arg_uid_shift))
3729 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3730 "Short read while receiving UID shift.");
0e7ac751
LP
3731 }
3732
ff6c6cc1
LP
3733 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
3734 "Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
825d5287
RM
3735 }
3736
6f83d3d1
LP
3737 if (path_equal(directory, "/")) {
3738 /* If the directory we shall boot is the host, let's operate on a bind mount at a different
3739 * place, so that we can make changes to its mount structure (for example, to implement
3740 * --volatile=) without this interfering with our ability to access files such as
3741 * /etc/localtime to copy into the container. Note that we use a fixed place for this
6c2d70ce 3742 * (instead of a temporary directory, since we are living in our own mount namespace here
7802194a 3743 * already, and thus don't need to be afraid of colliding with anyone else's mounts). */
6f83d3d1
LP
3744 (void) mkdir_p("/run/systemd/nspawn-root", 0755);
3745
511a8cfe 3746 r = mount_nofollow_verbose(LOG_ERR, "/", "/run/systemd/nspawn-root", NULL, MS_BIND|MS_REC, NULL);
6f83d3d1
LP
3747 if (r < 0)
3748 return r;
3749
3750 directory = "/run/systemd/nspawn-root";
e50cd82f 3751 }
7d0ecdd6 3752
75f81732
LP
3753 /* Make sure we always have a mount that we can move to root later on. */
3754 r = make_mount_point(directory);
3755 if (r < 0)
3756 return r;
3757
3758 /* So the whole tree is now MS_SLAVE, i.e. we'll still receive mount/umount events from the host
3759 * mount namespace. For the directory we are going to run our container let's turn this off, so that
3760 * we'll live in our own little world from now on, and propagation from the host may only happen via
3761 * the mount tunnel dir, or not at all. */
3762 r = mount_follow_verbose(LOG_ERR, NULL, directory, NULL, MS_PRIVATE|MS_REC, NULL);
3763 if (r < 0)
3764 return r;
3765
7d0ecdd6
LP
3766 r = setup_pivot_root(
3767 directory,
3768 arg_pivot_root_new,
3769 arg_pivot_root_old);
3770 if (r < 0)
3771 return r;
3772
3773 r = setup_volatile_mode(
3774 directory,
3775 arg_volatile_mode,
7d0ecdd6 3776 arg_uid_shift,
8f1ed04a 3777 arg_selinux_apifs_context);
7d0ecdd6
LP
3778 if (r < 0)
3779 return r;
3780
2f893044
LP
3781 r = bind_user_prepare(
3782 directory,
3783 arg_bind_user,
3784 arg_uid_shift,
3785 arg_uid_range,
3786 &arg_custom_mounts, &arg_n_custom_mounts,
3787 &bind_user_context);
3788 if (r < 0)
3789 return r;
3790
3791 if (arg_userns_mode != USER_NAMESPACE_NO && bind_user_context) {
d1d0b895
LP
3792 /* Send the user maps we determined to the parent, so that it installs it in our user
3793 * namespace UID map table */
2f893044
LP
3794
3795 for (size_t i = 0; i < bind_user_context->n_data; i++) {
3796 uid_t map[] = {
3797 bind_user_context->data[i].payload_user->uid,
3798 bind_user_context->data[i].host_user->uid,
3799 (uid_t) bind_user_context->data[i].payload_group->gid,
3800 (uid_t) bind_user_context->data[i].host_group->gid,
3801 };
3802
af06cd30 3803 l = send(fd_outer_socket, map, sizeof(map), MSG_NOSIGNAL);
2f893044
LP
3804 if (l < 0)
3805 return log_error_errno(errno, "Failed to send user UID map: %m");
3806 if (l != sizeof(map))
3807 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3808 "Short write while sending user UID map.");
3809 }
3810 }
3811
5f0a6347
DDM
3812 r = mount_custom(
3813 directory,
3814 arg_custom_mounts,
3815 arg_n_custom_mounts,
5f0a6347 3816 arg_uid_shift,
c0c8f718 3817 arg_uid_range,
5f0a6347
DDM
3818 arg_selinux_apifs_context,
3819 MOUNT_ROOT_ONLY);
3820 if (r < 0)
3821 return r;
3822
c0c8f718
AV
3823 if (arg_userns_mode != USER_NAMESPACE_NO &&
3824 IN_SET(arg_userns_ownership, USER_NAMESPACE_OWNERSHIP_MAP, USER_NAMESPACE_OWNERSHIP_AUTO) &&
3825 arg_uid_shift != 0) {
3826
2b2777ed 3827 r = remount_idmap(directory, arg_uid_shift, arg_uid_range, UID_INVALID, REMOUNT_IDMAPPING_HOST_ROOT);
c0c8f718
AV
3828 if (r == -EINVAL || ERRNO_IS_NOT_SUPPORTED(r)) {
3829 /* This might fail because the kernel or file system doesn't support idmapping. We
3830 * can't really distinguish this nicely, nor do we have any guarantees about the
3831 * error codes we see, could be EOPNOTSUPP or EINVAL. */
3832 if (arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_AUTO)
3833 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
3834 "ID mapped mounts are apparently not available, sorry.");
3835
3836 log_debug("ID mapped mounts are apparently not available on this kernel or for the selected file system, reverting to recursive chown()ing.");
3837 arg_userns_ownership = USER_NAMESPACE_OWNERSHIP_CHOWN;
3838 } else if (r < 0)
3839 return log_error_errno(r, "Failed to set up ID mapped mounts: %m");
3840 else {
3841 log_debug("ID mapped mounts available, making use of them.");
3842 idmap = true;
3843 }
3844 }
3845
2d3a5a73
LP
3846 if (dissected_image) {
3847 /* Now we know the uid shift, let's now mount everything else that might be in the image. */
d04faa4e
LP
3848 r = dissected_image_mount(
3849 dissected_image,
3850 directory,
3851 arg_uid_shift,
21b61b1d 3852 arg_uid_range,
d04faa4e
LP
3853 DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY|
3854 DISSECT_IMAGE_DISCARD_ON_LOOP|
3855 DISSECT_IMAGE_USR_NO_ROOT|
f61c7f88
LP
3856 (arg_read_only ? DISSECT_IMAGE_READ_ONLY : DISSECT_IMAGE_FSCK|DISSECT_IMAGE_GROWFS)|
3857 (idmap ? DISSECT_IMAGE_MOUNT_IDMAPPED : 0));
4fcb96ce
LP
3858 if (r == -EUCLEAN)
3859 return log_error_errno(r, "File system check for image failed: %m");
2d3a5a73 3860 if (r < 0)
4fcb96ce 3861 return log_error_errno(r, "Failed to mount image file system: %m");
2d3a5a73
LP
3862 }
3863
8199d554
LP
3864 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
3865 /* OK, we don't know yet which cgroup mode to use yet. Let's figure it out, and tell the parent. */
3866
3867 r = detect_unified_cgroup_hierarchy_from_image(directory);
3868 if (r < 0)
3869 return r;
3870
fefb7a6d 3871 l = send(fd_outer_socket, &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), MSG_NOSIGNAL);
8199d554
LP
3872 if (l < 0)
3873 return log_error_errno(errno, "Failed to send cgroup mode: %m");
baaa35ad
ZJS
3874 if (l != sizeof(arg_unified_cgroup_hierarchy))
3875 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3876 "Short write while sending cgroup mode.");
8199d554
LP
3877 }
3878
4ad14eff
LP
3879 r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
3880 if (r < 0)
3881 return r;
3882
03cfe0d5
LP
3883 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
3884 if (r < 0)
3885 return r;
3886
bbd407ea
DDM
3887 if (arg_read_only && arg_volatile_mode == VOLATILE_NO &&
3888 !has_custom_root_mount(arg_custom_mounts, arg_n_custom_mounts)) {
64e82c19 3889 r = bind_remount_recursive(directory, MS_RDONLY, MS_RDONLY, NULL);
03cfe0d5
LP
3890 if (r < 0)
3891 return log_error_errno(r, "Failed to make tree read-only: %m");
3892 }
3893
0de7acce 3894 r = mount_all(directory,
4f086aab 3895 arg_mount_settings,
0de7acce 3896 arg_uid_shift,
0de7acce 3897 arg_selinux_apifs_context);
03cfe0d5
LP
3898 if (r < 0)
3899 return r;
3900
07fa00f9
LP
3901 r = copy_devnodes(directory);
3902 if (r < 0)
03cfe0d5
LP
3903 return r;
3904
de40a303
LP
3905 r = make_extra_nodes(directory);
3906 if (r < 0)
3907 return r;
3908
3909 (void) dev_setup(directory, arg_uid_shift, arg_uid_shift);
e5f10caf 3910
9fac5029 3911 p = prefix_roota(directory, "/run/host");
e5f10caf 3912 (void) make_inaccessible_nodes(p, arg_uid_shift, arg_uid_shift);
03cfe0d5 3913
07fa00f9
LP
3914 r = setup_pts(directory);
3915 if (r < 0)
03cfe0d5
LP
3916 return r;
3917
e79581dd 3918 r = mount_tunnel_dig(directory);
03cfe0d5
LP
3919 if (r < 0)
3920 return r;
3921
8e5430c4
LP
3922 r = setup_keyring();
3923 if (r < 0)
3924 return r;
3925
3652872a
LP
3926 r = setup_credentials(directory);
3927 if (r < 0)
3928 return r;
3929
2f893044
LP
3930 r = bind_user_setup(bind_user_context, directory);
3931 if (r < 0)
3932 return r;
3933
5c4deb9a
MJ
3934 r = mount_custom(
3935 directory,
3936 arg_custom_mounts,
3937 arg_n_custom_mounts,
3938 arg_uid_shift,
c0c8f718 3939 arg_uid_range,
5c4deb9a
MJ
3940 arg_selinux_apifs_context,
3941 MOUNT_NON_ROOT_ONLY);
3942 if (r < 0)
3943 return r;
3944
03cfe0d5
LP
3945 r = setup_timezone(directory);
3946 if (r < 0)
3947 return r;
3948
3949 r = setup_resolv_conf(directory);
3950 if (r < 0)
3951 return r;
3952
e01ff70a
MS
3953 r = setup_machine_id(directory);
3954 if (r < 0)
3955 return r;
3956
03cfe0d5
LP
3957 r = setup_journal(directory);
3958 if (r < 0)
3959 return r;
3960
0f48ba7b
LP
3961 /* The same stuff as the $container env var, but nicely readable for the entire payload */
3962 p = prefix_roota(directory, "/run/host/container-manager");
3963 (void) write_string_file(p, arg_container_service_name, WRITE_STRING_FILE_CREATE);
3964
3965 /* The same stuff as the $container_uuid env var */
3966 p = prefix_roota(directory, "/run/host/container-uuid");
3967 (void) write_string_filef(p, WRITE_STRING_FILE_CREATE, SD_ID128_UUID_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid));
3968
489fae52 3969 if (!arg_use_cgns) {
0996ef00
CB
3970 r = mount_cgroups(
3971 directory,
3972 arg_unified_cgroup_hierarchy,
3973 arg_userns_mode != USER_NAMESPACE_NO,
3974 arg_uid_shift,
3975 arg_uid_range,
5a8ff0e6 3976 arg_selinux_apifs_context,
ada54120 3977 false);
0996ef00
CB
3978 if (r < 0)
3979 return r;
3980 }
03cfe0d5 3981
57c10a56
CB
3982 /* Mark everything as shared so our mounts get propagated down. This is required to make new bind
3983 * mounts available in systemd services inside the container that create a new mount namespace. See
3984 * https://github.com/systemd/systemd/issues/3860 Further submounts (such as /dev) done after this
3985 * will inherit the shared propagation mode.
3986 *
3987 * IMPORTANT: Do not overmount the root directory anymore from now on to enable moving the root
3988 * directory mount to root later on.
3989 * https://github.com/systemd/systemd/issues/3847#issuecomment-562735251
3990 */
9d50f850 3991 r = mount_switch_root(directory, MS_SHARED);
03cfe0d5
LP
3992 if (r < 0)
3993 return log_error_errno(r, "Failed to move root directory: %m");
3994
e79581dd
CB
3995 /* We finished setting up the rootfs which is a shared mount. The mount tunnel needs to be a
3996 * dependent mount otherwise we can't MS_MOVE mounts that were propagated from the host into
3997 * the container. */
3998 r = mount_tunnel_open();
3999 if (r < 0)
4000 return r;
4001
b71a0192
CB
4002 if (arg_userns_mode != USER_NAMESPACE_NO) {
4003 /* In order to mount procfs and sysfs in an unprivileged container the kernel
4004 * requires that a fully visible instance is already present in the target mount
4005 * namespace. Mount one here so the inner child can mount its own instances. Later
4006 * we umount the temporary instances created here before we actually exec the
4007 * payload. Since the rootfs is shared the umount will propagate into the container.
4008 * Note, the inner child wouldn't be able to unmount the instances on its own since
4009 * it doesn't own the originating mount namespace. IOW, the outer child needs to do
4010 * this. */
4011 r = pin_fully_visible_fs();
4012 if (r < 0)
4013 return r;
4014 }
4015
e96ceaba 4016 fd = setup_notify_child();
9c1e04d0
AP
4017 if (fd < 0)
4018 return fd;
4019
03cfe0d5 4020 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
0c582db0 4021 arg_clone_ns_flags |
8869a0b4 4022 (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
03cfe0d5
LP
4023 if (pid < 0)
4024 return log_error_errno(errno, "Failed to fork inner child: %m");
03cfe0d5 4025 if (pid == 0) {
af06cd30 4026 fd_outer_socket = safe_close(fd_outer_socket);
03cfe0d5 4027
2a2e78e9
LP
4028 /* The inner child has all namespaces that are requested, so that we all are owned by the
4029 * user if user namespaces are turned on. */
03cfe0d5 4030
d7bea6b6
DP
4031 if (arg_network_namespace_path) {
4032 r = namespace_enter(-1, -1, netns_fd, -1, -1);
4033 if (r < 0)
e2d39e54 4034 return log_error_errno(r, "Failed to join network namespace: %m");
d7bea6b6
DP
4035 }
4036
11875a98 4037 r = inner_child(barrier, fd_inner_socket, fds, os_release_pairs);
03cfe0d5
LP
4038 if (r < 0)
4039 _exit(EXIT_FAILURE);
4040
4041 _exit(EXIT_SUCCESS);
4042 }
4043
af06cd30 4044 l = send(fd_outer_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
03cfe0d5
LP
4045 if (l < 0)
4046 return log_error_errno(errno, "Failed to send PID: %m");
baaa35ad
ZJS
4047 if (l != sizeof(pid))
4048 return log_error_errno(SYNTHETIC_ERRNO(EIO),
4049 "Short write while sending PID.");
03cfe0d5 4050
af06cd30 4051 l = send(fd_outer_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
e01ff70a
MS
4052 if (l < 0)
4053 return log_error_errno(errno, "Failed to send machine ID: %m");
baaa35ad
ZJS
4054 if (l != sizeof(arg_uuid))
4055 return log_error_errno(SYNTHETIC_ERRNO(EIO),
4056 "Short write while sending machine ID.");
e01ff70a 4057
af06cd30 4058 l = send_one_fd(fd_outer_socket, fd, 0);
9c1e04d0 4059 if (l < 0)
ba72801d 4060 return log_error_errno(l, "Failed to send notify fd: %m");
9c1e04d0 4061
af06cd30 4062 fd_outer_socket = safe_close(fd_outer_socket);
5d9d3fcb 4063 fd_inner_socket = safe_close(fd_inner_socket);
d7bea6b6 4064 netns_fd = safe_close(netns_fd);
03cfe0d5
LP
4065
4066 return 0;
4067}
4068
0e7ac751 4069static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
d381c8a6 4070 bool tried_hashed = false;
0e7ac751
LP
4071 unsigned n_tries = 100;
4072 uid_t candidate;
4073 int r;
4074
4075 assert(shift);
4076 assert(ret_lock_file);
0de7acce 4077 assert(arg_userns_mode == USER_NAMESPACE_PICK);
0e7ac751
LP
4078 assert(arg_uid_range == 0x10000U);
4079
4080 candidate = *shift;
4081
4082 (void) mkdir("/run/systemd/nspawn-uid", 0755);
4083
4084 for (;;) {
fbd0b64f 4085 char lock_path[STRLEN("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
8e766630 4086 _cleanup_(release_lock_file) LockFile lf = LOCK_FILE_INIT;
0e7ac751
LP
4087
4088 if (--n_tries <= 0)
4089 return -EBUSY;
4090
87d5e4f2 4091 if (candidate < CONTAINER_UID_BASE_MIN || candidate > CONTAINER_UID_BASE_MAX)
0e7ac751
LP
4092 goto next;
4093 if ((candidate & UINT32_C(0xFFFF)) != 0)
4094 goto next;
4095
4096 xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
4097 r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
4098 if (r == -EBUSY) /* Range already taken by another nspawn instance */
4099 goto next;
4100 if (r < 0)
4101 return r;
4102
4103 /* Make some superficial checks whether the range is currently known in the user database */
4104 if (getpwuid(candidate))
4105 goto next;
4106 if (getpwuid(candidate + UINT32_C(0xFFFE)))
4107 goto next;
4108 if (getgrgid(candidate))
4109 goto next;
4110 if (getgrgid(candidate + UINT32_C(0xFFFE)))
4111 goto next;
4112
4113 *ret_lock_file = lf;
4114 lf = (struct LockFile) LOCK_FILE_INIT;
4115 *shift = candidate;
4116 return 0;
4117
4118 next:
d381c8a6
LP
4119 if (arg_machine && !tried_hashed) {
4120 /* Try to hash the base from the container name */
4121
4122 static const uint8_t hash_key[] = {
4123 0xe1, 0x56, 0xe0, 0xf0, 0x4a, 0xf0, 0x41, 0xaf,
4124 0x96, 0x41, 0xcf, 0x41, 0x33, 0x94, 0xff, 0x72
4125 };
4126
4127 candidate = (uid_t) siphash24(arg_machine, strlen(arg_machine), hash_key);
4128
4129 tried_hashed = true;
4130 } else
4131 random_bytes(&candidate, sizeof(candidate));
4132
87d5e4f2 4133 candidate = (candidate % (CONTAINER_UID_BASE_MAX - CONTAINER_UID_BASE_MIN)) + CONTAINER_UID_BASE_MIN;
0e7ac751
LP
4134 candidate &= (uid_t) UINT32_C(0xFFFF0000);
4135 }
4136}
4137
2f893044
LP
4138static int add_one_uid_map(
4139 char **p,
4140 uid_t container_uid,
4141 uid_t host_uid,
4142 uid_t range) {
4143
4144 return strextendf(p,
4145 UID_FMT " " UID_FMT " " UID_FMT "\n",
4146 container_uid, host_uid, range);
4147}
4148
4149static int make_uid_map_string(
4150 const uid_t bind_user_uid[],
4151 size_t n_bind_user_uid,
4152 size_t offset,
4153 char **ret) {
4154
4155 _cleanup_free_ char *s = NULL;
4156 uid_t previous_uid = 0;
4157 int r;
4158
4159 assert(n_bind_user_uid == 0 || bind_user_uid);
2f092762 4160 assert(IN_SET(offset, 0, 2)); /* used to switch between UID and GID map */
2f893044
LP
4161 assert(ret);
4162
4163 /* The bind_user_uid[] array is a series of 4 uid_t values, for each --bind-user= entry one
4164 * quadruplet, consisting of host and container UID + GID. */
4165
4166 for (size_t i = 0; i < n_bind_user_uid; i++) {
05ab439a
YW
4167 uid_t payload_uid = bind_user_uid[i*4+offset],
4168 host_uid = bind_user_uid[i*4+offset+1];
2f893044
LP
4169
4170 assert(previous_uid <= payload_uid);
4171 assert(payload_uid < arg_uid_range);
4172
4173 /* Add a range to close the gap to previous entry */
4174 if (payload_uid > previous_uid) {
4175 r = add_one_uid_map(&s, previous_uid, arg_uid_shift + previous_uid, payload_uid - previous_uid);
4176 if (r < 0)
4177 return r;
4178 }
4179
4180 /* Map this specific user */
4181 r = add_one_uid_map(&s, payload_uid, host_uid, 1);
4182 if (r < 0)
4183 return r;
4184
4185 previous_uid = payload_uid + 1;
4186 }
4187
4188 /* And add a range to close the gap to finish the range */
4189 if (arg_uid_range > previous_uid) {
4190 r = add_one_uid_map(&s, previous_uid, arg_uid_shift + previous_uid, arg_uid_range - previous_uid);
4191 if (r < 0)
4192 return r;
4193 }
4194
4195 assert(s);
4196
4197 *ret = TAKE_PTR(s);
4198 return 0;
4199}
4200
4201static int setup_uid_map(
4202 pid_t pid,
4203 const uid_t bind_user_uid[],
4204 size_t n_bind_user_uid) {
4205
4206 char uid_map[STRLEN("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1];
4207 _cleanup_free_ char *s = NULL;
03cfe0d5
LP
4208 int r;
4209
4210 assert(pid > 1);
4211
2f893044
LP
4212 /* Build the UID map string */
4213 if (make_uid_map_string(bind_user_uid, n_bind_user_uid, 0, &s) < 0) /* offset=0 contains the UID pair */
4214 return log_oom();
4215
03cfe0d5 4216 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
2f893044 4217 r = write_string_file(uid_map, s, WRITE_STRING_FILE_DISABLE_BUFFER);
03cfe0d5
LP
4218 if (r < 0)
4219 return log_error_errno(r, "Failed to write UID map: %m");
4220
2f893044
LP
4221 /* And now build the GID map string */
4222 s = mfree(s);
4223 if (make_uid_map_string(bind_user_uid, n_bind_user_uid, 2, &s) < 0) /* offset=2 contains the GID pair */
4224 return log_oom();
4225
03cfe0d5 4226 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
2f893044 4227 r = write_string_file(uid_map, s, WRITE_STRING_FILE_DISABLE_BUFFER);
03cfe0d5
LP
4228 if (r < 0)
4229 return log_error_errno(r, "Failed to write GID map: %m");
4230
4231 return 0;
4232}
4233
9c1e04d0 4234static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
9c1e04d0
AP
4235 char buf[NOTIFY_BUFFER_MAX+1];
4236 char *p = NULL;
4237 struct iovec iovec = {
4238 .iov_base = buf,
4239 .iov_len = sizeof(buf)-1,
4240 };
fb29cdbe
LP
4241 CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred)) +
4242 CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)) control;
9c1e04d0
AP
4243 struct msghdr msghdr = {
4244 .msg_iov = &iovec,
4245 .msg_iovlen = 1,
4246 .msg_control = &control,
4247 .msg_controllen = sizeof(control),
4248 };
371d72e0 4249 struct ucred *ucred;
9c1e04d0
AP
4250 ssize_t n;
4251 pid_t inner_child_pid;
4252 _cleanup_strv_free_ char **tags = NULL;
4bf4f50f 4253 int r;
9c1e04d0
AP
4254
4255 assert(userdata);
4256
4257 inner_child_pid = PTR_TO_PID(userdata);
4258
4259 if (revents != EPOLLIN) {
4260 log_warning("Got unexpected poll event for notify fd.");
4261 return 0;
4262 }
4263
3691bcf3 4264 n = recvmsg_safe(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
8add30a0
YW
4265 if (n < 0) {
4266 if (ERRNO_IS_TRANSIENT(n))
4267 return 0;
4268 if (n == -EXFULL) {
4269 log_warning("Got message with truncated control data (too many fds sent?), ignoring.");
4270 return 0;
4271 }
3691bcf3 4272 return log_warning_errno(n, "Couldn't read notification socket: %m");
8add30a0 4273 }
9c1e04d0 4274
9c1e04d0
AP
4275 cmsg_close_all(&msghdr);
4276
371d72e0 4277 ucred = CMSG_FIND_DATA(&msghdr, SOL_SOCKET, SCM_CREDENTIALS, struct ucred);
9c1e04d0 4278 if (!ucred || ucred->pid != inner_child_pid) {
8cb57430 4279 log_debug("Received notify message without valid credentials. Ignoring.");
9c1e04d0
AP
4280 return 0;
4281 }
4282
4283 if ((size_t) n >= sizeof(buf)) {
4284 log_warning("Received notify message exceeded maximum size. Ignoring.");
4285 return 0;
4286 }
4287
4288 buf[n] = 0;
4289 tags = strv_split(buf, "\n\r");
4290 if (!tags)
4291 return log_oom();
4292
d29cc4d6 4293 if (strv_contains(tags, "READY=1")) {
d4341b76 4294 r = sd_notify(false, "READY=1\n");
4bf4f50f
ZJS
4295 if (r < 0)
4296 log_warning_errno(r, "Failed to send readiness notification, ignoring: %m");
4297 }
9c1e04d0
AP
4298
4299 p = strv_find_startswith(tags, "STATUS=");
4300 if (p)
04f590a4 4301 (void) sd_notifyf(false, "STATUS=Container running: %s", p);
9c1e04d0
AP
4302
4303 return 0;
4304}
4305
e96ceaba 4306static int setup_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid, sd_event_source **notify_event_source) {
9c1e04d0 4307 int r;
9c1e04d0 4308
5773024d 4309 r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
9c1e04d0
AP
4310 if (r < 0)
4311 return log_error_errno(r, "Failed to allocate notify event source: %m");
4312
5773024d 4313 (void) sd_event_source_set_description(*notify_event_source, "nspawn-notify");
9c1e04d0
AP
4314
4315 return 0;
4316}
4317
5d961407
LP
4318static int merge_settings(Settings *settings, const char *path) {
4319 int rl;
f757855e 4320
5d961407
LP
4321 assert(settings);
4322 assert(path);
f757855e 4323
5d961407
LP
4324 /* Copy over bits from the settings, unless they have been explicitly masked by command line switches. Note
4325 * that this steals the fields of the Settings* structure, and hence modifies it. */
f757855e 4326
7732f92b
LP
4327 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
4328 settings->start_mode >= 0) {
4329 arg_start_mode = settings->start_mode;
130d3d22 4330 strv_free_and_replace(arg_parameters, settings->parameters);
f757855e
LP
4331 }
4332
d3689b94
LP
4333 if ((arg_settings_mask & SETTING_EPHEMERAL) == 0 &&
4334 settings->ephemeral >= 0)
a2f577fc
JL
4335 arg_ephemeral = settings->ephemeral;
4336
de40a303
LP
4337 if ((arg_settings_mask & SETTING_DIRECTORY) == 0 &&
4338 settings->root) {
4339
4340 if (!arg_settings_trusted)
4341 log_warning("Ignoring root directory setting, file %s is not trusted.", path);
4342 else
4343 free_and_replace(arg_directory, settings->root);
4344 }
4345
b53ede69
PW
4346 if ((arg_settings_mask & SETTING_PIVOT_ROOT) == 0 &&
4347 settings->pivot_root_new) {
4348 free_and_replace(arg_pivot_root_new, settings->pivot_root_new);
4349 free_and_replace(arg_pivot_root_old, settings->pivot_root_old);
4350 }
4351
5f932eb9 4352 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
1cc6c93a
YW
4353 settings->working_directory)
4354 free_and_replace(arg_chdir, settings->working_directory);
5f932eb9 4355
f757855e 4356 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
130d3d22
YW
4357 settings->environment)
4358 strv_free_and_replace(arg_setenv, settings->environment);
f757855e 4359
de40a303
LP
4360 if ((arg_settings_mask & SETTING_USER) == 0) {
4361
4362 if (settings->user)
4363 free_and_replace(arg_user, settings->user);
4364
4365 if (uid_is_valid(settings->uid))
4366 arg_uid = settings->uid;
4367 if (gid_is_valid(settings->gid))
4368 arg_gid = settings->gid;
4369 if (settings->n_supplementary_gids > 0) {
4370 free_and_replace(arg_supplementary_gids, settings->supplementary_gids);
4371 arg_n_supplementary_gids = settings->n_supplementary_gids;
4372 }
4373 }
f757855e
LP
4374
4375 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
a3fc6b55 4376 uint64_t plus, minus;
7be830c6 4377 uint64_t network_minus = 0;
88fc9c9b 4378 uint64_t ambient;
f757855e 4379
de40a303
LP
4380 /* Note that we copy both the simple plus/minus caps here, and the full quintet from the
4381 * Settings structure */
4382
0e265674 4383 plus = settings->capability;
a3fc6b55
LP
4384 minus = settings->drop_capability;
4385
9baa294c
LP
4386 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
4387 settings_network_configured(settings)) {
a3fc6b55
LP
4388 if (settings_private_network(settings))
4389 plus |= UINT64_C(1) << CAP_NET_ADMIN;
4390 else
7be830c6 4391 network_minus |= UINT64_C(1) << CAP_NET_ADMIN;
a3fc6b55 4392 }
0e265674
LP
4393
4394 if (!arg_settings_trusted && plus != 0) {
4395 if (settings->capability != 0)
5d961407 4396 log_warning("Ignoring Capability= setting, file %s is not trusted.", path);
7be830c6
TH
4397 } else {
4398 arg_caps_retain &= ~network_minus;
520e0d54 4399 arg_caps_retain |= plus;
7be830c6 4400 }
f757855e 4401
a3fc6b55 4402 arg_caps_retain &= ~minus;
de40a303
LP
4403
4404 /* Copy the full capabilities over too */
4405 if (capability_quintet_is_set(&settings->full_capabilities)) {
4406 if (!arg_settings_trusted)
5238e957 4407 log_warning("Ignoring capability settings, file %s is not trusted.", path);
de40a303
LP
4408 else
4409 arg_full_capabilities = settings->full_capabilities;
4410 }
88fc9c9b
TH
4411
4412 ambient = settings->ambient_capability;
4413 if (!arg_settings_trusted && ambient != 0)
4414 log_warning("Ignoring AmbientCapability= setting, file %s is not trusted.", path);
4415 else
4416 arg_caps_ambient |= ambient;
f757855e
LP
4417 }
4418
4419 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
4420 settings->kill_signal > 0)
4421 arg_kill_signal = settings->kill_signal;
4422
4423 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
4424 settings->personality != PERSONALITY_INVALID)
4425 arg_personality = settings->personality;
4426
4427 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
4428 !sd_id128_is_null(settings->machine_id)) {
4429
4430 if (!arg_settings_trusted)
5d961407 4431 log_warning("Ignoring MachineID= setting, file %s is not trusted.", path);
f757855e
LP
4432 else
4433 arg_uuid = settings->machine_id;
4434 }
4435
4436 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
4437 settings->read_only >= 0)
4438 arg_read_only = settings->read_only;
4439
4440 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
4441 settings->volatile_mode != _VOLATILE_MODE_INVALID)
4442 arg_volatile_mode = settings->volatile_mode;
4443
4444 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
4445 settings->n_custom_mounts > 0) {
4446
4447 if (!arg_settings_trusted)
5d961407 4448 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", path);
f757855e
LP
4449 else {
4450 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
1cc6c93a 4451 arg_custom_mounts = TAKE_PTR(settings->custom_mounts);
f757855e 4452 arg_n_custom_mounts = settings->n_custom_mounts;
f757855e
LP
4453 settings->n_custom_mounts = 0;
4454 }
4455 }
4456
4457 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
a1dfd585 4458 settings_network_configured(settings)) {
f757855e
LP
4459
4460 if (!arg_settings_trusted)
5d961407 4461 log_warning("Ignoring network settings, file %s is not trusted.", path);
f757855e 4462 else {
f6d6bad1 4463 arg_network_veth = settings_network_veth(settings);
0e265674
LP
4464 arg_private_network = settings_private_network(settings);
4465
130d3d22
YW
4466 strv_free_and_replace(arg_network_interfaces, settings->network_interfaces);
4467 strv_free_and_replace(arg_network_macvlan, settings->network_macvlan);
4468 strv_free_and_replace(arg_network_ipvlan, settings->network_ipvlan);
4469 strv_free_and_replace(arg_network_veth_extra, settings->network_veth_extra);
f6d6bad1 4470
1cc6c93a
YW
4471 free_and_replace(arg_network_bridge, settings->network_bridge);
4472 free_and_replace(arg_network_zone, settings->network_zone);
de40a303
LP
4473
4474 free_and_replace(arg_network_namespace_path, settings->network_namespace_path);
f757855e
LP
4475 }
4476 }
4477
4478 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
4479 settings->expose_ports) {
4480
4481 if (!arg_settings_trusted)
5d961407 4482 log_warning("Ignoring Port= setting, file %s is not trusted.", path);
f757855e
LP
4483 else {
4484 expose_port_free_all(arg_expose_ports);
1cc6c93a 4485 arg_expose_ports = TAKE_PTR(settings->expose_ports);
f757855e
LP
4486 }
4487 }
4488
0de7acce
LP
4489 if ((arg_settings_mask & SETTING_USERNS) == 0 &&
4490 settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
4491
4492 if (!arg_settings_trusted)
5d961407 4493 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", path);
0de7acce
LP
4494 else {
4495 arg_userns_mode = settings->userns_mode;
4496 arg_uid_shift = settings->uid_shift;
4497 arg_uid_range = settings->uid_range;
6c045a99 4498 arg_userns_ownership = settings->userns_ownership;
0de7acce
LP
4499 }
4500 }
4501
0cc3c9f9
LP
4502 if ((arg_settings_mask & SETTING_BIND_USER) == 0 &&
4503 !strv_isempty(settings->bind_user))
2f893044
LP
4504 strv_free_and_replace(arg_bind_user, settings->bind_user);
4505
d3689b94
LP
4506 if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0 &&
4507 settings->notify_ready >= 0)
9c1e04d0
AP
4508 arg_notify_ready = settings->notify_ready;
4509
960e4569
LP
4510 if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) {
4511
2d09ea44
LP
4512 if (!strv_isempty(settings->syscall_allow_list) || !strv_isempty(settings->syscall_deny_list)) {
4513 if (!arg_settings_trusted && !strv_isempty(settings->syscall_allow_list))
4514 log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", path);
4515 else {
4516 strv_free_and_replace(arg_syscall_allow_list, settings->syscall_allow_list);
4517 strv_free_and_replace(arg_syscall_deny_list, settings->syscall_deny_list);
4518 }
960e4569 4519 }
de40a303
LP
4520
4521#if HAVE_SECCOMP
2d09ea44
LP
4522 if (settings->seccomp) {
4523 if (!arg_settings_trusted)
4524 log_warning("Ignoring SECCOMP filter, file %s is not trusted.", path);
4525 else {
4526 seccomp_release(arg_seccomp);
4527 arg_seccomp = TAKE_PTR(settings->seccomp);
4528 }
de40a303
LP
4529 }
4530#endif
960e4569
LP
4531 }
4532
bf428efb
LP
4533 for (rl = 0; rl < _RLIMIT_MAX; rl ++) {
4534 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)))
4535 continue;
4536
4537 if (!settings->rlimit[rl])
4538 continue;
4539
4540 if (!arg_settings_trusted) {
5d961407 4541 log_warning("Ignoring Limit%s= setting, file '%s' is not trusted.", rlimit_to_string(rl), path);
bf428efb
LP
4542 continue;
4543 }
4544
4545 free_and_replace(arg_rlimit[rl], settings->rlimit[rl]);
4546 }
4547
3a9530e5
LP
4548 if ((arg_settings_mask & SETTING_HOSTNAME) == 0 &&
4549 settings->hostname)
4550 free_and_replace(arg_hostname, settings->hostname);
4551
66edd963
LP
4552 if ((arg_settings_mask & SETTING_NO_NEW_PRIVILEGES) == 0 &&
4553 settings->no_new_privileges >= 0)
4554 arg_no_new_privileges = settings->no_new_privileges;
4555
81f345df
LP
4556 if ((arg_settings_mask & SETTING_OOM_SCORE_ADJUST) == 0 &&
4557 settings->oom_score_adjust_set) {
4558
4559 if (!arg_settings_trusted)
5d961407 4560 log_warning("Ignoring OOMScoreAdjust= setting, file '%s' is not trusted.", path);
81f345df
LP
4561 else {
4562 arg_oom_score_adjust = settings->oom_score_adjust;
4563 arg_oom_score_adjust_set = true;
4564 }
4565 }
4566
d107bb7d 4567 if ((arg_settings_mask & SETTING_CPU_AFFINITY) == 0 &&
0985c7c4 4568 settings->cpu_set.set) {
d107bb7d
LP
4569
4570 if (!arg_settings_trusted)
5d961407 4571 log_warning("Ignoring CPUAffinity= setting, file '%s' is not trusted.", path);
d107bb7d 4572 else {
0985c7c4 4573 cpu_set_reset(&arg_cpu_set);
088d71f8 4574 arg_cpu_set = TAKE_STRUCT(settings->cpu_set);
d107bb7d
LP
4575 }
4576 }
4577
09d423e9
LP
4578 if ((arg_settings_mask & SETTING_RESOLV_CONF) == 0 &&
4579 settings->resolv_conf != _RESOLV_CONF_MODE_INVALID)
4580 arg_resolv_conf = settings->resolv_conf;
4581
4e1d6aa9
LP
4582 if ((arg_settings_mask & SETTING_LINK_JOURNAL) == 0 &&
4583 settings->link_journal != _LINK_JOURNAL_INVALID) {
4584
4585 if (!arg_settings_trusted)
4586 log_warning("Ignoring journal link setting, file '%s' is not trusted.", path);
4587 else {
4588 arg_link_journal = settings->link_journal;
4589 arg_link_journal_try = settings->link_journal_try;
4590 }
4591 }
4592
1688841f
LP
4593 if ((arg_settings_mask & SETTING_TIMEZONE) == 0 &&
4594 settings->timezone != _TIMEZONE_MODE_INVALID)
4595 arg_timezone = settings->timezone;
4596
de40a303
LP
4597 if ((arg_settings_mask & SETTING_SLICE) == 0 &&
4598 settings->slice) {
4599
4600 if (!arg_settings_trusted)
4601 log_warning("Ignoring slice setting, file '%s' is not trusted.", path);
4602 else
4603 free_and_replace(arg_slice, settings->slice);
4604 }
4605
4606 if ((arg_settings_mask & SETTING_USE_CGNS) == 0 &&
4607 settings->use_cgns >= 0) {
4608
4609 if (!arg_settings_trusted)
4610 log_warning("Ignoring cgroup namespace setting, file '%s' is not trusted.", path);
4611 else
4612 arg_use_cgns = settings->use_cgns;
4613 }
4614
4615 if ((arg_settings_mask & SETTING_CLONE_NS_FLAGS) == 0 &&
f5fbe71d 4616 settings->clone_ns_flags != ULONG_MAX) {
de40a303
LP
4617
4618 if (!arg_settings_trusted)
4619 log_warning("Ignoring namespace setting, file '%s' is not trusted.", path);
4620 else
4621 arg_clone_ns_flags = settings->clone_ns_flags;
4622 }
4623
4624 if ((arg_settings_mask & SETTING_CONSOLE_MODE) == 0 &&
4625 settings->console_mode >= 0) {
4626
4627 if (!arg_settings_trusted)
4628 log_warning("Ignoring console mode setting, file '%s' is not trusted.", path);
4629 else
4630 arg_console_mode = settings->console_mode;
4631 }
4632
d3689b94
LP
4633 if ((arg_settings_mask & SETTING_SUPPRESS_SYNC) == 0 &&
4634 settings->suppress_sync >= 0)
4a4654e0
LP
4635 arg_suppress_sync = settings->suppress_sync;
4636
de40a303
LP
4637 /* The following properties can only be set through the OCI settings logic, not from the command line, hence we
4638 * don't consult arg_settings_mask for them. */
4639
4640 sd_bus_message_unref(arg_property_message);
4641 arg_property_message = TAKE_PTR(settings->properties);
4642
4643 arg_console_width = settings->console_width;
4644 arg_console_height = settings->console_height;
4645
b2645747 4646 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
de40a303
LP
4647 arg_extra_nodes = TAKE_PTR(settings->extra_nodes);
4648 arg_n_extra_nodes = settings->n_extra_nodes;
825210d4 4649 settings->n_extra_nodes = 0;
de40a303 4650
f757855e
LP
4651 return 0;
4652}
4653
5d961407
LP
4654static int load_settings(void) {
4655 _cleanup_(settings_freep) Settings *settings = NULL;
4656 _cleanup_fclose_ FILE *f = NULL;
3603f151 4657 _cleanup_free_ char *p = NULL;
5d961407
LP
4658 int r;
4659
de40a303
LP
4660 if (arg_oci_bundle)
4661 return 0;
4662
5d961407
LP
4663 /* If all settings are masked, there's no point in looking for
4664 * the settings file */
d7a0f1f4 4665 if (FLAGS_SET(arg_settings_mask, _SETTINGS_MASK_ALL))
5d961407
LP
4666 return 0;
4667
5d961407
LP
4668 /* We first look in the admin's directories in /etc and /run */
4669 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
4670 _cleanup_free_ char *j = NULL;
4671
3603f151 4672 j = path_join(i, arg_settings_filename);
5d961407
LP
4673 if (!j)
4674 return log_oom();
4675
4676 f = fopen(j, "re");
4677 if (f) {
4678 p = TAKE_PTR(j);
4679
4680 /* By default, we trust configuration from /etc and /run */
4681 if (arg_settings_trusted < 0)
4682 arg_settings_trusted = true;
4683
4684 break;
4685 }
4686
4687 if (errno != ENOENT)
4688 return log_error_errno(errno, "Failed to open %s: %m", j);
4689 }
4690
4691 if (!f) {
4692 /* After that, let's look for a file next to the
4693 * actual image we shall boot. */
4694
4695 if (arg_image) {
162f6477
LP
4696 r = file_in_same_dir(arg_image, arg_settings_filename, &p);
4697 if (r < 0)
4698 return log_error_errno(r, "Failed to generate settings path from image path: %m");
4699 } else if (arg_directory) {
4700 r = file_in_same_dir(arg_directory, arg_settings_filename, &p);
4701 if (r < 0 && r != -EADDRNOTAVAIL) /* if directory is root fs, don't complain */
4702 return log_error_errno(r, "Failed to generate settings path from directory path: %m");
5d961407
LP
4703 }
4704
4705 if (p) {
4706 f = fopen(p, "re");
4707 if (!f && errno != ENOENT)
4708 return log_error_errno(errno, "Failed to open %s: %m", p);
4709
4710 /* By default, we do not trust configuration from /var/lib/machines */
4711 if (arg_settings_trusted < 0)
4712 arg_settings_trusted = false;
4713 }
4714 }
4715
4716 if (!f)
4717 return 0;
4718
4719 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
4720
4721 r = settings_load(f, p, &settings);
4722 if (r < 0)
4723 return r;
4724
4725 return merge_settings(settings, p);
4726}
4727
de40a303
LP
4728static int load_oci_bundle(void) {
4729 _cleanup_(settings_freep) Settings *settings = NULL;
4730 int r;
4731
4732 if (!arg_oci_bundle)
4733 return 0;
4734
4735 /* By default let's trust OCI bundles */
4736 if (arg_settings_trusted < 0)
4737 arg_settings_trusted = true;
4738
4739 r = oci_load(NULL, arg_oci_bundle, &settings);
4740 if (r < 0)
4741 return r;
4742
4743 return merge_settings(settings, arg_oci_bundle);
4744}
4745
3acc84eb 4746static int run_container(
2d845785 4747 DissectedImage *dissected_image,
b0067625
ZJS
4748 FDSet *fds,
4749 char veth_name[IFNAMSIZ], bool *veth_created,
761cf19d 4750 struct ExposeArgs *expose_args,
3acc84eb 4751 int *master, pid_t *pid, int *ret) {
b0067625
ZJS
4752
4753 static const struct sigaction sa = {
4754 .sa_handler = nop_signal_handler,
e28c7cd0 4755 .sa_flags = SA_NOCLDSTOP|SA_RESTART,
b0067625
ZJS
4756 };
4757
8e766630 4758 _cleanup_(release_lock_file) LockFile uid_shift_lock = LOCK_FILE_INIT;
5bb1d7fb 4759 _cleanup_close_ int etc_passwd_lock = -EBADF;
b0067625 4760 _cleanup_close_pair_ int
19ee48a6
YW
4761 fd_inner_socket_pair[2] = PIPE_EBADF,
4762 fd_outer_socket_pair[2] = PIPE_EBADF;
8199d554 4763
5bb1d7fb 4764 _cleanup_close_ int notify_socket = -EBADF, mntns_fd = -EBADF, fd_kmsg_fifo = -EBADF;
b0067625 4765 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
5773024d 4766 _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL;
b0067625
ZJS
4767 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
4768 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4769 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
abdb9b08 4770 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
2f893044
LP
4771 _cleanup_free_ uid_t *bind_user_uid = NULL;
4772 size_t n_bind_user_uid = 0;
b0067625 4773 ContainerStatus container_status = 0;
b0067625
ZJS
4774 int ifi = 0, r;
4775 ssize_t l;
4776 sigset_t mask_chld;
254d1313 4777 _cleanup_close_ int child_netns_fd = -EBADF;
b0067625
ZJS
4778
4779 assert_se(sigemptyset(&mask_chld) == 0);
4780 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
4781
4782 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4783 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
4784 * check with getpwuid() if the specific user already exists. Note that /etc might be
4785 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
4786 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
4787 * really just an extra safety net. We kinda assume that the UID range we allocate from is
4788 * really ours. */
4789
4790 etc_passwd_lock = take_etc_passwd_lock(NULL);
4791 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS)
4792 return log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m");
4793 }
4794
4795 r = barrier_create(&barrier);
4796 if (r < 0)
4797 return log_error_errno(r, "Cannot initialize IPC barrier: %m");
4798
5d9d3fcb
CB
4799 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, fd_inner_socket_pair) < 0)
4800 return log_error_errno(errno, "Failed to create inner socket pair: %m");
4801
af06cd30
CB
4802 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, fd_outer_socket_pair) < 0)
4803 return log_error_errno(errno, "Failed to create outer socket pair: %m");
b0067625 4804
b0067625
ZJS
4805 /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
4806 * parent's blocking calls and give it a chance to call wait() and terminate. */
4807 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
4808 if (r < 0)
4809 return log_error_errno(errno, "Failed to change the signal mask: %m");
4810
4811 r = sigaction(SIGCHLD, &sa, NULL);
4812 if (r < 0)
4813 return log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
4814
d7bea6b6 4815 if (arg_network_namespace_path) {
5b4855ab
DDM
4816 child_netns_fd = open(arg_network_namespace_path, O_RDONLY|O_NOCTTY|O_CLOEXEC);
4817 if (child_netns_fd < 0)
d7bea6b6
DP
4818 return log_error_errno(errno, "Cannot open file %s: %m", arg_network_namespace_path);
4819
54c2459d 4820 r = fd_is_ns(child_netns_fd, CLONE_NEWNET);
6619ad88
LP
4821 if (r == -EUCLEAN)
4822 log_debug_errno(r, "Cannot determine if passed network namespace path '%s' really refers to a network namespace, assuming it does.", arg_network_namespace_path);
4823 else if (r < 0)
d7bea6b6 4824 return log_error_errno(r, "Failed to check %s fs type: %m", arg_network_namespace_path);
c6147113
LP
4825 else if (r == 0)
4826 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
4827 "Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path);
d7bea6b6
DP
4828 }
4829
b0067625
ZJS
4830 *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
4831 if (*pid < 0)
4832 return log_error_errno(errno, "clone() failed%s: %m",
4833 errno == EINVAL ?
4834 ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
4835
4836 if (*pid == 0) {
4837 /* The outer child only has a file system namespace. */
4838 barrier_set_role(&barrier, BARRIER_CHILD);
4839
5d9d3fcb 4840 fd_inner_socket_pair[0] = safe_close(fd_inner_socket_pair[0]);
af06cd30 4841 fd_outer_socket_pair[0] = safe_close(fd_outer_socket_pair[0]);
b0067625
ZJS
4842
4843 (void) reset_all_signal_handlers();
4844 (void) reset_signal_mask();
4845
4846 r = outer_child(&barrier,
4847 arg_directory,
2d845785 4848 dissected_image,
af06cd30 4849 fd_outer_socket_pair[1],
5d9d3fcb 4850 fd_inner_socket_pair[1],
d7bea6b6 4851 fds,
5b4855ab 4852 child_netns_fd);
b0067625
ZJS
4853 if (r < 0)
4854 _exit(EXIT_FAILURE);
4855
4856 _exit(EXIT_SUCCESS);
4857 }
4858
4859 barrier_set_role(&barrier, BARRIER_PARENT);
4860
e4077ff6 4861 fdset_close(fds);
b0067625 4862
5d9d3fcb 4863 fd_inner_socket_pair[1] = safe_close(fd_inner_socket_pair[1]);
af06cd30 4864 fd_outer_socket_pair[1] = safe_close(fd_outer_socket_pair[1]);
b0067625
ZJS
4865
4866 if (arg_userns_mode != USER_NAMESPACE_NO) {
af06cd30 4867 mntns_fd = receive_one_fd(fd_outer_socket_pair[0], 0);
b71a0192
CB
4868 if (mntns_fd < 0)
4869 return log_error_errno(mntns_fd, "Failed to receive mount namespace fd from outer child: %m");
4870
b0067625 4871 /* The child just let us know the UID shift it might have read from the image. */
af06cd30 4872 l = recv(fd_outer_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
b0067625
ZJS
4873 if (l < 0)
4874 return log_error_errno(errno, "Failed to read UID shift: %m");
c6147113
LP
4875 if (l != sizeof arg_uid_shift)
4876 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading UID shift.");
b0067625
ZJS
4877
4878 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4879 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
4880 * image, but if that's already in use, pick a new one, and report back to the child,
4881 * which one we now picked. */
4882
4883 r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
4884 if (r < 0)
4885 return log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
4886
af06cd30 4887 l = send(fd_outer_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
b0067625
ZJS
4888 if (l < 0)
4889 return log_error_errno(errno, "Failed to send UID shift: %m");
c6147113
LP
4890 if (l != sizeof arg_uid_shift)
4891 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short write while writing UID shift.");
b0067625 4892 }
2f893044
LP
4893
4894 n_bind_user_uid = strv_length(arg_bind_user);
4895 if (n_bind_user_uid > 0) {
4896 /* Right after the UID shift, we'll receive the list of UID mappings for the
4897 * --bind-user= logic. Always a quadruplet of payload and host UID + GID. */
4898
4899 bind_user_uid = new(uid_t, n_bind_user_uid*4);
4900 if (!bind_user_uid)
4901 return log_oom();
4902
4903 for (size_t i = 0; i < n_bind_user_uid; i++) {
af06cd30 4904 l = recv(fd_outer_socket_pair[0], bind_user_uid + i*4, sizeof(uid_t)*4, 0);
2f893044
LP
4905 if (l < 0)
4906 return log_error_errno(errno, "Failed to read user UID map pair: %m");
4907 if (l != sizeof(uid_t)*4)
4908 return log_full_errno(l == 0 ? LOG_DEBUG : LOG_WARNING,
4909 SYNTHETIC_ERRNO(EIO),
4910 "Short read while reading bind user UID pairs.");
4911 }
4912 }
b0067625
ZJS
4913 }
4914
8199d554
LP
4915 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
4916 /* The child let us know the support cgroup mode it might have read from the image. */
fefb7a6d 4917 l = recv(fd_outer_socket_pair[0], &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), 0);
8199d554
LP
4918 if (l < 0)
4919 return log_error_errno(errno, "Failed to read cgroup mode: %m");
c6147113 4920 if (l != sizeof(arg_unified_cgroup_hierarchy))
c0f86d66 4921 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading cgroup mode (%zi bytes).%s",
c6147113 4922 l, l == 0 ? " The child is most likely dead." : "");
8199d554
LP
4923 }
4924
b0067625 4925 /* Wait for the outer child. */
d2e0ac3d
LP
4926 r = wait_for_terminate_and_check("(sd-namespace)", *pid, WAIT_LOG_ABNORMAL);
4927 if (r < 0)
4928 return r;
4929 if (r != EXIT_SUCCESS)
4930 return -EIO;
b0067625
ZJS
4931
4932 /* And now retrieve the PID of the inner child. */
af06cd30 4933 l = recv(fd_outer_socket_pair[0], pid, sizeof *pid, 0);
b0067625
ZJS
4934 if (l < 0)
4935 return log_error_errno(errno, "Failed to read inner child PID: %m");
c6147113
LP
4936 if (l != sizeof *pid)
4937 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading inner child PID.");
b0067625
ZJS
4938
4939 /* We also retrieve container UUID in case it was generated by outer child */
af06cd30 4940 l = recv(fd_outer_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
b0067625
ZJS
4941 if (l < 0)
4942 return log_error_errno(errno, "Failed to read container machine ID: %m");
c6147113
LP
4943 if (l != sizeof(arg_uuid))
4944 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading container machined ID.");
b0067625
ZJS
4945
4946 /* We also retrieve the socket used for notifications generated by outer child */
af06cd30 4947 notify_socket = receive_one_fd(fd_outer_socket_pair[0], 0);
b0067625
ZJS
4948 if (notify_socket < 0)
4949 return log_error_errno(notify_socket,
4950 "Failed to receive notification socket from the outer child: %m");
4951
4952 log_debug("Init process invoked as PID "PID_FMT, *pid);
4953
4954 if (arg_userns_mode != USER_NAMESPACE_NO) {
c6147113
LP
4955 if (!barrier_place_and_sync(&barrier)) /* #1 */
4956 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
b0067625 4957
2f893044 4958 r = setup_uid_map(*pid, bind_user_uid, n_bind_user_uid);
b0067625
ZJS
4959 if (r < 0)
4960 return r;
4961
4962 (void) barrier_place(&barrier); /* #2 */
4963 }
4964
4965 if (arg_private_network) {
75116558
PS
4966 if (!arg_network_namespace_path) {
4967 /* Wait until the child has unshared its network namespace. */
c6147113
LP
4968 if (!barrier_place_and_sync(&barrier)) /* #3 */
4969 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early");
75116558
PS
4970 }
4971
5b4855ab
DDM
4972 if (child_netns_fd < 0) {
4973 /* Make sure we have an open file descriptor to the child's network
4974 * namespace so it stays alive even if the child exits. */
4975 r = namespace_open(*pid, NULL, NULL, &child_netns_fd, NULL, NULL);
4976 if (r < 0)
4977 return log_error_errno(r, "Failed to open child network namespace: %m");
4978 }
4979
4980 r = move_network_interfaces(child_netns_fd, arg_network_interfaces);
b0067625
ZJS
4981 if (r < 0)
4982 return r;
4983
4984 if (arg_network_veth) {
4985 r = setup_veth(arg_machine, *pid, veth_name,
4986 arg_network_bridge || arg_network_zone);
4987 if (r < 0)
4988 return r;
4989 else if (r > 0)
4990 ifi = r;
4991
4992 if (arg_network_bridge) {
4993 /* Add the interface to a bridge */
4994 r = setup_bridge(veth_name, arg_network_bridge, false);
4995 if (r < 0)
4996 return r;
4997 if (r > 0)
4998 ifi = r;
4999 } else if (arg_network_zone) {
5000 /* Add the interface to a bridge, possibly creating it */
5001 r = setup_bridge(veth_name, arg_network_zone, true);
5002 if (r < 0)
5003 return r;
5004 if (r > 0)
5005 ifi = r;
5006 }
5007 }
5008
5009 r = setup_veth_extra(arg_machine, *pid, arg_network_veth_extra);
5010 if (r < 0)
5011 return r;
5012
5013 /* We created the primary and extra veth links now; let's remember this, so that we know to
5014 remove them later on. Note that we don't bother with removing veth links that were created
5015 here when their setup failed half-way, because in that case the kernel should be able to
5016 remove them on its own, since they cannot be referenced by anything yet. */
5017 *veth_created = true;
5018
5019 r = setup_macvlan(arg_machine, *pid, arg_network_macvlan);
5020 if (r < 0)
5021 return r;
5022
5023 r = setup_ipvlan(arg_machine, *pid, arg_network_ipvlan);
5024 if (r < 0)
5025 return r;
5026 }
5027
abdb9b08
LP
5028 if (arg_register || !arg_keep_unit) {
5029 r = sd_bus_default_system(&bus);
5030 if (r < 0)
5031 return log_error_errno(r, "Failed to open system bus: %m");
e5a2d8b5
LP
5032
5033 r = sd_bus_set_close_on_exit(bus, false);
5034 if (r < 0)
5035 return log_error_errno(r, "Failed to disable close-on-exit behaviour: %m");
abdb9b08
LP
5036 }
5037
5038 if (!arg_keep_unit) {
5039 /* When a new scope is created for this container, then we'll be registered as its controller, in which
5040 * case PID 1 will send us a friendly RequestStop signal, when it is asked to terminate the
5041 * scope. Let's hook into that, and cleanly shut down the container, and print a friendly message. */
5042
75152a4d
LP
5043 r = sd_bus_match_signal_async(
5044 bus,
5045 NULL,
5046 "org.freedesktop.systemd1",
5047 NULL,
5048 "org.freedesktop.systemd1.Scope",
5049 "RequestStop",
5050 on_request_stop, NULL, PID_TO_PTR(*pid));
abdb9b08 5051 if (r < 0)
75152a4d 5052 return log_error_errno(r, "Failed to request RequestStop match: %m");
abdb9b08
LP
5053 }
5054
b0067625
ZJS
5055 if (arg_register) {
5056 r = register_machine(
abdb9b08 5057 bus,
b0067625
ZJS
5058 arg_machine,
5059 *pid,
5060 arg_directory,
5061 arg_uuid,
5062 ifi,
5063 arg_slice,
5064 arg_custom_mounts, arg_n_custom_mounts,
5065 arg_kill_signal,
5066 arg_property,
de40a303 5067 arg_property_message,
b0067625
ZJS
5068 arg_keep_unit,
5069 arg_container_service_name);
5070 if (r < 0)
5071 return r;
abdb9b08 5072
cd2dfc6f
LP
5073 } else if (!arg_keep_unit) {
5074 r = allocate_scope(
abdb9b08 5075 bus,
cd2dfc6f
LP
5076 arg_machine,
5077 *pid,
5078 arg_slice,
5079 arg_custom_mounts, arg_n_custom_mounts,
5080 arg_kill_signal,
de40a303
LP
5081 arg_property,
5082 arg_property_message);
cd2dfc6f
LP
5083 if (r < 0)
5084 return r;
5085
5086 } else if (arg_slice || arg_property)
5087 log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect.");
b0067625 5088
27da7ef0 5089 r = create_subcgroup(*pid, arg_keep_unit, arg_unified_cgroup_hierarchy);
b0067625
ZJS
5090 if (r < 0)
5091 return r;
5092
27da7ef0 5093 r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
720f0a2f
LP
5094 if (r < 0)
5095 return r;
b0067625 5096
de54e02d 5097 r = chown_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
b0067625
ZJS
5098 if (r < 0)
5099 return r;
5100
5101 /* Notify the child that the parent is ready with all
5102 * its setup (including cgroup-ification), and that
5103 * the child can now hand over control to the code to
5104 * run inside the container. */
75116558 5105 (void) barrier_place(&barrier); /* #4 */
b0067625
ZJS
5106
5107 /* Block SIGCHLD here, before notifying child.
5108 * process_pty() will handle it with the other signals. */
5109 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
5110
5111 /* Reset signal to default */
9c274488 5112 r = default_signals(SIGCHLD);
b0067625
ZJS
5113 if (r < 0)
5114 return log_error_errno(r, "Failed to reset SIGCHLD: %m");
5115
5116 r = sd_event_new(&event);
5117 if (r < 0)
5118 return log_error_errno(r, "Failed to get default event source: %m");
5119
8fd010bb
LP
5120 (void) sd_event_set_watchdog(event, true);
5121
abdb9b08
LP
5122 if (bus) {
5123 r = sd_bus_attach_event(bus, event, 0);
5124 if (r < 0)
5125 return log_error_errno(r, "Failed to attach bus to event loop: %m");
5126 }
5127
e96ceaba 5128 r = setup_notify_parent(event, notify_socket, PID_TO_PTR(*pid), &notify_event_source);
b0067625
ZJS
5129 if (r < 0)
5130 return r;
5131
b71a0192
CB
5132 if (arg_userns_mode != USER_NAMESPACE_NO) {
5133 r = wipe_fully_visible_fs(mntns_fd);
5134 if (r < 0)
5135 return r;
5136 mntns_fd = safe_close(mntns_fd);
5137 }
5138
b0067625 5139 /* Let the child know that we are ready and wait that the child is completely ready now. */
c6147113
LP
5140 if (!barrier_place_and_sync(&barrier)) /* #5 */
5141 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
b0067625 5142
38ccb557 5143 /* At this point we have made use of the UID we picked, and thus nss-systemd/systemd-machined.service
b0067625
ZJS
5144 * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
5145 etc_passwd_lock = safe_close(etc_passwd_lock);
5146
04f590a4
LP
5147 (void) sd_notifyf(false,
5148 "STATUS=Container running.\n"
5149 "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
4bf4f50f
ZJS
5150 if (!arg_notify_ready) {
5151 r = sd_notify(false, "READY=1\n");
5152 if (r < 0)
5153 log_warning_errno(r, "Failed to send readiness notification, ignoring: %m");
5154 }
b0067625
ZJS
5155
5156 if (arg_kill_signal > 0) {
5157 /* Try to kill the init system on SIGINT or SIGTERM */
919f5ae0
LP
5158 (void) sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
5159 (void) sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
b0067625
ZJS
5160 } else {
5161 /* Immediately exit */
919f5ae0
LP
5162 (void) sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
5163 (void) sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
b0067625
ZJS
5164 }
5165
988851b6
LP
5166 (void) sd_event_add_signal(event, NULL, SIGRTMIN+18, sigrtmin18_handler, NULL);
5167
5168 r = sd_event_add_memory_pressure(event, NULL, NULL, NULL);
5169 if (r < 0)
5170 log_debug_errno(r, "Failed allocate memory pressure event source, ignoring: %m");
5171
6916b164 5172 /* Exit when the child exits */
919f5ae0 5173 (void) sd_event_add_signal(event, NULL, SIGCHLD, on_sigchld, PID_TO_PTR(*pid));
b0067625 5174
b07ee903
CB
5175 /* Retrieve the kmsg fifo allocated by inner child */
5176 fd_kmsg_fifo = receive_one_fd(fd_inner_socket_pair[0], 0);
5177 if (fd_kmsg_fifo < 0)
5178 return log_error_errno(fd_kmsg_fifo, "Failed to receive kmsg fifo from inner child: %m");
5179
b0067625 5180 if (arg_expose_ports) {
b07ee903 5181 r = expose_port_watch_rtnl(event, fd_inner_socket_pair[0], on_address_change, expose_args, &rtnl);
b0067625
ZJS
5182 if (r < 0)
5183 return r;
5184
deff68e7
FW
5185 (void) expose_port_execute(rtnl, &expose_args->fw_ctx, arg_expose_ports, AF_INET, &expose_args->address4);
5186 (void) expose_port_execute(rtnl, &expose_args->fw_ctx, arg_expose_ports, AF_INET6, &expose_args->address6);
b0067625
ZJS
5187 }
5188
3acc84eb 5189 if (arg_console_mode != CONSOLE_PIPE) {
254d1313 5190 _cleanup_close_ int fd = -EBADF;
3acc84eb 5191 PTYForwardFlags flags = 0;
de40a303 5192
3acc84eb 5193 /* Retrieve the master pty allocated by inner child */
bb1aa185 5194 fd = receive_one_fd(fd_inner_socket_pair[0], 0);
3acc84eb
FB
5195 if (fd < 0)
5196 return log_error_errno(fd, "Failed to receive master pty from the inner child: %m");
5197
5198 switch (arg_console_mode) {
de40a303 5199
3acc84eb
FB
5200 case CONSOLE_READ_ONLY:
5201 flags |= PTY_FORWARD_READ_ONLY;
5202
5203 _fallthrough_;
5204
5205 case CONSOLE_INTERACTIVE:
5206 flags |= PTY_FORWARD_IGNORE_VHANGUP;
5207
5208 r = pty_forward_new(event, fd, flags, &forward);
5209 if (r < 0)
5210 return log_error_errno(r, "Failed to create PTY forwarder: %m");
5211
f5fbe71d 5212 if (arg_console_width != UINT_MAX || arg_console_height != UINT_MAX)
3acc84eb
FB
5213 (void) pty_forward_set_width_height(forward,
5214 arg_console_width,
5215 arg_console_height);
5216 break;
5217
5218 default:
5219 assert(arg_console_mode == CONSOLE_PASSIVE);
5220 }
5221
5222 *master = TAKE_FD(fd);
de40a303 5223 }
b0067625 5224
5d9d3fcb
CB
5225 fd_inner_socket_pair[0] = safe_close(fd_inner_socket_pair[0]);
5226
b0067625
ZJS
5227 r = sd_event_loop(event);
5228 if (r < 0)
5229 return log_error_errno(r, "Failed to run event loop: %m");
5230
de40a303
LP
5231 if (forward) {
5232 char last_char = 0;
b0067625 5233
de40a303
LP
5234 (void) pty_forward_get_last_char(forward, &last_char);
5235 forward = pty_forward_free(forward);
b0067625 5236
de40a303
LP
5237 if (!arg_quiet && last_char != '\n')
5238 putc('\n', stdout);
5239 }
b0067625
ZJS
5240
5241 /* Kill if it is not dead yet anyway */
0bb0a9fa
ZJS
5242 if (!arg_register && !arg_keep_unit && bus)
5243 terminate_scope(bus, arg_machine);
b0067625
ZJS
5244
5245 /* Normally redundant, but better safe than sorry */
c67b0082 5246 (void) kill(*pid, SIGKILL);
b0067625 5247
5d9d3fcb
CB
5248 fd_kmsg_fifo = safe_close(fd_kmsg_fifo);
5249
5b4855ab
DDM
5250 if (arg_private_network) {
5251 /* Move network interfaces back to the parent network namespace. We use `safe_fork`
5252 * to avoid having to move the parent to the child network namespace. */
5253 r = safe_fork(NULL, FORK_RESET_SIGNALS|FORK_DEATHSIG|FORK_WAIT|FORK_LOG, NULL);
5254 if (r < 0)
5255 return r;
5256
5257 if (r == 0) {
254d1313 5258 _cleanup_close_ int parent_netns_fd = -EBADF;
5b4855ab
DDM
5259
5260 r = namespace_open(getpid(), NULL, NULL, &parent_netns_fd, NULL, NULL);
5261 if (r < 0) {
5262 log_error_errno(r, "Failed to open parent network namespace: %m");
5263 _exit(EXIT_FAILURE);
5264 }
5265
5266 r = namespace_enter(-1, -1, child_netns_fd, -1, -1);
5267 if (r < 0) {
5268 log_error_errno(r, "Failed to enter child network namespace: %m");
5269 _exit(EXIT_FAILURE);
5270 }
5271
2f091b1b
TM
5272 /* Reverse network interfaces pair list so that interfaces get their initial name back.
5273 * This is about ensuring interfaces get their old name back when being moved back. */
5274 arg_network_interfaces = strv_reverse(arg_network_interfaces);
5275
5b4855ab
DDM
5276 r = move_network_interfaces(parent_netns_fd, arg_network_interfaces);
5277 if (r < 0)
5278 log_error_errno(r, "Failed to move network interfaces back to parent network namespace: %m");
5279
5280 _exit(r < 0 ? EXIT_FAILURE : EXIT_SUCCESS);
5281 }
5282 }
5283
8f03de53 5284 r = wait_for_container(TAKE_PID(*pid), &container_status);
b0067625 5285
0bb0a9fa
ZJS
5286 /* Tell machined that we are gone. */
5287 if (bus)
5288 (void) unregister_machine(bus, arg_machine);
5289
b0067625
ZJS
5290 if (r < 0)
5291 /* We failed to wait for the container, or the container exited abnormally. */
5292 return r;
5293 if (r > 0 || container_status == CONTAINER_TERMINATED) {
27e29a1e
ZJS
5294 /* r > 0 → The container exited with a non-zero status.
5295 * As a special case, we need to replace 133 with a different value,
5296 * because 133 is special-cased in the service file to reboot the container.
5297 * otherwise → The container exited with zero status and a reboot was not requested.
5298 */
2a49b612 5299 if (r == EXIT_FORCE_RESTART)
27e29a1e 5300 r = EXIT_FAILURE; /* replace 133 with the general failure code */
b0067625 5301 *ret = r;
b0067625
ZJS
5302 return 0; /* finito */
5303 }
5304
5305 /* CONTAINER_REBOOTED, loop again */
5306
5307 if (arg_keep_unit) {
5308 /* Special handling if we are running as a service: instead of simply
5309 * restarting the machine we want to restart the entire service, so let's
5310 * inform systemd about this with the special exit code 133. The service
5311 * file uses RestartForceExitStatus=133 so that this results in a full
5312 * nspawn restart. This is necessary since we might have cgroup parameters
5313 * set we want to have flushed out. */
2a49b612
ZJS
5314 *ret = EXIT_FORCE_RESTART;
5315 return 0; /* finito */
b0067625
ZJS
5316 }
5317
deff68e7
FW
5318 expose_port_flush(&expose_args->fw_ctx, arg_expose_ports, AF_INET, &expose_args->address4);
5319 expose_port_flush(&expose_args->fw_ctx, arg_expose_ports, AF_INET6, &expose_args->address6);
b0067625
ZJS
5320
5321 (void) remove_veth_links(veth_name, arg_network_veth_extra);
5322 *veth_created = false;
5323 return 1; /* loop again */
5324}
5325
bf428efb 5326static int initialize_rlimits(void) {
852b6250 5327 /* The default resource limits the kernel passes to PID 1, as per kernel 5.16. Let's pass our container payload
bf428efb
LP
5328 * the same values as the kernel originally passed to PID 1, in order to minimize differences between host and
5329 * container execution environments. */
5330
5331 static const struct rlimit kernel_defaults[_RLIMIT_MAX] = {
852b6250
LP
5332 [RLIMIT_AS] = { RLIM_INFINITY, RLIM_INFINITY },
5333 [RLIMIT_CORE] = { 0, RLIM_INFINITY },
5334 [RLIMIT_CPU] = { RLIM_INFINITY, RLIM_INFINITY },
5335 [RLIMIT_DATA] = { RLIM_INFINITY, RLIM_INFINITY },
5336 [RLIMIT_FSIZE] = { RLIM_INFINITY, RLIM_INFINITY },
5337 [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY },
5338 [RLIMIT_MEMLOCK] = { DEFAULT_RLIMIT_MEMLOCK, DEFAULT_RLIMIT_MEMLOCK },
5339 [RLIMIT_MSGQUEUE] = { 819200, 819200 },
5340 [RLIMIT_NICE] = { 0, 0 },
5341 [RLIMIT_NOFILE] = { 1024, 4096 },
5342 [RLIMIT_RSS] = { RLIM_INFINITY, RLIM_INFINITY },
5343 [RLIMIT_RTPRIO] = { 0, 0 },
5344 [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY },
5345 [RLIMIT_STACK] = { 8388608, RLIM_INFINITY },
bf428efb
LP
5346
5347 /* The kernel scales the default for RLIMIT_NPROC and RLIMIT_SIGPENDING based on the system's amount of
5348 * RAM. To provide best compatibility we'll read these limits off PID 1 instead of hardcoding them
5349 * here. This is safe as we know that PID 1 doesn't change these two limits and thus the original
5350 * kernel's initialization should still be valid during runtime — at least if PID 1 is systemd. Note
5351 * that PID 1 changes a number of other resource limits during early initialization which is why we
5352 * don't read the other limits from PID 1 but prefer the static table above. */
5353 };
5354
5355 int rl;
5356
5357 for (rl = 0; rl < _RLIMIT_MAX; rl++) {
bf428efb
LP
5358 /* Let's only fill in what the user hasn't explicitly configured anyway */
5359 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)) == 0) {
5360 const struct rlimit *v;
5361 struct rlimit buffer;
5362
5363 if (IN_SET(rl, RLIMIT_NPROC, RLIMIT_SIGPENDING)) {
5364 /* For these two let's read the limits off PID 1. See above for an explanation. */
5365
5366 if (prlimit(1, rl, NULL, &buffer) < 0)
5367 return log_error_errno(errno, "Failed to read resource limit RLIMIT_%s of PID 1: %m", rlimit_to_string(rl));
5368
dbf1aca6
LP
5369 v = &buffer;
5370 } else if (rl == RLIMIT_NOFILE) {
5371 /* We nowadays bump RLIMIT_NOFILE's hard limit early in PID 1 for all
5372 * userspace. Given that nspawn containers are often run without our PID 1,
5373 * let's grant the containers a raised RLIMIT_NOFILE hard limit by default,
5374 * so that container userspace gets similar resources as host userspace
5375 * gets. */
5376 buffer = kernel_defaults[rl];
5377 buffer.rlim_max = MIN((rlim_t) read_nr_open(), (rlim_t) HIGH_RLIMIT_NOFILE);
bf428efb
LP
5378 v = &buffer;
5379 } else
5380 v = kernel_defaults + rl;
5381
5382 arg_rlimit[rl] = newdup(struct rlimit, v, 1);
5383 if (!arg_rlimit[rl])
5384 return log_oom();
5385 }
5386
5387 if (DEBUG_LOGGING) {
5388 _cleanup_free_ char *k = NULL;
5389
5390 (void) rlimit_format(arg_rlimit[rl], &k);
5391 log_debug("Setting RLIMIT_%s to %s.", rlimit_to_string(rl), k);
5392 }
5393 }
5394
5395 return 0;
5396}
5397
287b7376 5398static int cant_be_in_netns(void) {
254d1313 5399 _cleanup_close_ int fd = -EBADF;
287b7376
LP
5400 struct ucred ucred;
5401 int r;
5402
5403 /* Check if we are in the same netns as udev. If we aren't, then device monitoring (and thus waiting
5404 * for loopback block devices) won't work, and we will hang. Detect this case and exit early with a
5405 * nice message. */
5406
5407 if (!arg_image) /* only matters if --image= us used, i.e. we actually need to use loopback devices */
5408 return 0;
5409
5410 fd = socket(AF_UNIX, SOCK_SEQPACKET|SOCK_NONBLOCK|SOCK_CLOEXEC, 0);
5411 if (fd < 0)
5412 return log_error_errno(errno, "Failed to allocate udev control socket: %m");
5413
1861986a
LP
5414 r = connect_unix_path(fd, AT_FDCWD, "/run/udev/control");
5415 if (r < 0) {
5416 if (r == -ENOENT || ERRNO_IS_DISCONNECT(r))
287b7376
LP
5417 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
5418 "Sorry, but --image= requires access to the host's /run/ hierarchy, since we need access to udev.");
5419
1861986a 5420 return log_error_errno(r, "Failed to connect socket to udev control socket: %m");
287b7376
LP
5421 }
5422
5423 r = getpeercred(fd, &ucred);
5424 if (r < 0)
5425 return log_error_errno(r, "Failed to determine peer of udev control socket: %m");
5426
f7a2dc3d 5427 r = in_same_namespace(ucred.pid, 0, NAMESPACE_NET);
287b7376 5428 if (r < 0)
f7a2dc3d
CB
5429 return log_error_errno(r, "Failed to determine network namespace of udev: %m");
5430 if (r == 0)
287b7376
LP
5431 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
5432 "Sorry, but --image= is only supported in the main network namespace, since we need access to udev/AF_NETLINK.");
5433 return 0;
5434}
5435
44dbef90 5436static int run(int argc, char *argv[]) {
4c27749b 5437 bool remove_directory = false, remove_image = false, veth_created = false, remove_tmprootdir = false;
5bb1d7fb 5438 _cleanup_close_ int master = -EBADF;
03cfe0d5 5439 _cleanup_fdset_free_ FDSet *fds = NULL;
2d845785 5440 int r, n_fd_passed, ret = EXIT_SUCCESS;
5aa3eba5 5441 char veth_name[IFNAMSIZ] = "";
761cf19d 5442 struct ExposeArgs expose_args = {};
8e766630 5443 _cleanup_(release_lock_file) LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
c67b0082 5444 char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
2d845785 5445 _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
18b5886e 5446 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
761cf19d 5447 _cleanup_(fw_ctx_freep) FirewallContext *fw_ctx = NULL;
7bf011e3 5448 pid_t pid = 0;
03cfe0d5
LP
5449
5450 log_parse_environment();
5451 log_open();
415fc41c 5452
03cfe0d5
LP
5453 r = parse_argv(argc, argv);
5454 if (r <= 0)
5455 goto finish;
5456
38ee19c0
ZJS
5457 if (geteuid() != 0) {
5458 r = log_warning_errno(SYNTHETIC_ERRNO(EPERM),
5459 argc >= 2 ? "Need to be root." :
5460 "Need to be root (and some arguments are usually required).\nHint: try --help");
03cfe0d5 5461 goto finish;
38ee19c0 5462 }
fba868fa 5463
287b7376
LP
5464 r = cant_be_in_netns();
5465 if (r < 0)
5466 goto finish;
5467
bf428efb
LP
5468 r = initialize_rlimits();
5469 if (r < 0)
5470 goto finish;
5471
de40a303
LP
5472 r = load_oci_bundle();
5473 if (r < 0)
5474 goto finish;
5475
f757855e
LP
5476 r = determine_names();
5477 if (r < 0)
5478 goto finish;
5479
5480 r = load_settings();
5481 if (r < 0)
5482 goto finish;
5483
d4d99bc6 5484 r = cg_unified();
5eee8290
LP
5485 if (r < 0) {
5486 log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
5487 goto finish;
5488 }
5489
f757855e
LP
5490 r = verify_arguments();
5491 if (r < 0)
5492 goto finish;
03cfe0d5 5493
2f091b1b
TM
5494 r = verify_network_interfaces_initialized();
5495 if (r < 0)
5496 goto finish;
5497
49048684
ZJS
5498 /* Reapply environment settings. */
5499 (void) detect_unified_cgroup_hierarchy_from_environment();
8199d554 5500
2949ff26
LP
5501 /* Ignore SIGPIPE here, because we use splice() on the ptyfwd stuff and that will generate SIGPIPE if
5502 * the result is closed. Note that the container payload child will reset signal mask+handler anyway,
5503 * so just turning this off here means we only turn it off in nspawn itself, not any children. */
9c274488 5504 (void) ignore_signals(SIGPIPE);
2949ff26 5505
03cfe0d5
LP
5506 n_fd_passed = sd_listen_fds(false);
5507 if (n_fd_passed > 0) {
5508 r = fdset_new_listen_fds(&fds, false);
5509 if (r < 0) {
5510 log_error_errno(r, "Failed to collect file descriptors: %m");
5511 goto finish;
5512 }
5513 }
5514
83e803a9
ZJS
5515 /* The "default" umask. This is appropriate for most file and directory
5516 * operations performed by nspawn, and is the umask that will be used for
5517 * the child. Functions like copy_devnodes() change the umask temporarily. */
5518 umask(0022);
5519
03cfe0d5
LP
5520 if (arg_directory) {
5521 assert(!arg_image);
5522
b35ca61a
LP
5523 /* Safety precaution: let's not allow running images from the live host OS image, as long as
5524 * /var from the host will propagate into container dynamically (because bad things happen if
5525 * two systems write to the same /var). Let's allow it for the special cases where /var is
5526 * either copied (i.e. --ephemeral) or replaced (i.e. --volatile=yes|state). */
5527 if (path_equal(arg_directory, "/") && !(arg_ephemeral || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_STATE))) {
1406bd66
LP
5528 r = log_error_errno(SYNTHETIC_ERRNO(EINVAL),
5529 "Spawning container on root directory is not supported. Consider using --ephemeral, --volatile=yes or --volatile=state.");
03cfe0d5
LP
5530 goto finish;
5531 }
5532
5533 if (arg_ephemeral) {
5534 _cleanup_free_ char *np = NULL;
5535
f461a28d 5536 r = chase_and_update(&arg_directory, 0);
3f342ec4
LP
5537 if (r < 0)
5538 goto finish;
5539
7bf011e3
LP
5540 /* If the specified path is a mount point we generate the new snapshot immediately
5541 * inside it under a random name. However if the specified is not a mount point we
5542 * create the new snapshot in the parent directory, just next to it. */
e1873695 5543 r = path_is_mount_point(arg_directory, NULL, 0);
03cfe0d5
LP
5544 if (r < 0) {
5545 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
5546 goto finish;
5547 }
5548 if (r > 0)
770b5ce4 5549 r = tempfn_random_child(arg_directory, "machine.", &np);
03cfe0d5 5550 else
770b5ce4 5551 r = tempfn_random(arg_directory, "machine.", &np);
03cfe0d5 5552 if (r < 0) {
0f3be6ca 5553 log_error_errno(r, "Failed to generate name for directory snapshot: %m");
03cfe0d5
LP
5554 goto finish;
5555 }
5556
6992459c 5557 /* We take an exclusive lock on this image, since it's our private, ephemeral copy
162392b7 5558 * only owned by us and no one else. */
6992459c 5559 r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
03cfe0d5
LP
5560 if (r < 0) {
5561 log_error_errno(r, "Failed to lock %s: %m", np);
5562 goto finish;
5563 }
5564
7bf011e3
LP
5565 {
5566 BLOCK_SIGNALS(SIGINT);
5567 r = btrfs_subvol_snapshot(arg_directory, np,
5568 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
5569 BTRFS_SNAPSHOT_FALLBACK_COPY |
5570 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
5571 BTRFS_SNAPSHOT_RECURSIVE |
5572 BTRFS_SNAPSHOT_QUOTA |
5573 BTRFS_SNAPSHOT_SIGINT);
5574 }
5575 if (r == -EINTR) {
5576 log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", np);
5577 goto finish;
5578 }
03cfe0d5
LP
5579 if (r < 0) {
5580 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
5581 goto finish;
ec16945e
LP
5582 }
5583
1cc6c93a 5584 free_and_replace(arg_directory, np);
17cbb288 5585 remove_directory = true;
30535c16 5586 } else {
f461a28d 5587 r = chase_and_update(&arg_directory, arg_template ? CHASE_NONEXISTENT : 0);
8d4aa2bb
LP
5588 if (r < 0)
5589 goto finish;
5590
30535c16
LP
5591 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
5592 if (r == -EBUSY) {
5593 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
5594 goto finish;
5595 }
5596 if (r < 0) {
5597 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
476b8254 5598 goto finish;
30535c16
LP
5599 }
5600
5601 if (arg_template) {
f461a28d 5602 r = chase_and_update(&arg_template, 0);
3f342ec4
LP
5603 if (r < 0)
5604 goto finish;
5605
7bf011e3
LP
5606 {
5607 BLOCK_SIGNALS(SIGINT);
5608 r = btrfs_subvol_snapshot(arg_template, arg_directory,
5609 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
5610 BTRFS_SNAPSHOT_FALLBACK_COPY |
5611 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
5612 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
5613 BTRFS_SNAPSHOT_RECURSIVE |
5614 BTRFS_SNAPSHOT_QUOTA |
5615 BTRFS_SNAPSHOT_SIGINT);
5616 }
ff6c6cc1
LP
5617 if (r == -EEXIST)
5618 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
5619 "Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
7bf011e3
LP
5620 else if (r == -EINTR) {
5621 log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", arg_directory);
5622 goto finish;
5623 } else if (r < 0) {
83521414 5624 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
30535c16 5625 goto finish;
ff6c6cc1
LP
5626 } else
5627 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
5628 "Populated %s from template %s.", arg_directory, arg_template);
30535c16 5629 }
ec16945e
LP
5630 }
5631
7732f92b 5632 if (arg_start_mode == START_BOOT) {
aff7ae0d 5633 _cleanup_free_ char *b = NULL;
a5201ed6 5634 const char *p;
c9fe05e0 5635
aff7ae0d
LP
5636 if (arg_pivot_root_new) {
5637 b = path_join(arg_directory, arg_pivot_root_new);
5638 if (!b)
5639 return log_oom();
5640
5641 p = b;
5642 } else
a5201ed6 5643 p = arg_directory;
c9fe05e0
AR
5644
5645 if (path_is_os_tree(p) <= 0) {
aff7ae0d
LP
5646 r = log_error_errno(SYNTHETIC_ERRNO(EINVAL),
5647 "Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", p);
1b9e5b12
LP
5648 goto finish;
5649 }
5650 } else {
aff7ae0d 5651 _cleanup_free_ char *p = NULL;
c9fe05e0 5652
a5201ed6 5653 if (arg_pivot_root_new)
aff7ae0d 5654 p = path_join(arg_directory, arg_pivot_root_new, "/usr/");
a5201ed6 5655 else
aff7ae0d
LP
5656 p = path_join(arg_directory, "/usr/");
5657 if (!p)
5658 return log_oom();
1b9e5b12 5659
aff7ae0d
LP
5660 if (laccess(p, F_OK) < 0) {
5661 r = log_error_errno(SYNTHETIC_ERRNO(EINVAL),
5662 "Directory %s doesn't look like it has an OS tree (/usr/ directory is missing). Refusing.", arg_directory);
1b9e5b12 5663 goto finish;
1b9e5b12
LP
5664 }
5665 }
ec16945e 5666
6b9132a9 5667 } else {
d04faa4e 5668 DissectImageFlags dissect_image_flags =
4b5de5dd 5669 DISSECT_IMAGE_GENERIC_ROOT |
d04faa4e
LP
5670 DISSECT_IMAGE_REQUIRE_ROOT |
5671 DISSECT_IMAGE_RELAX_VAR_CHECK |
73d88b80
LP
5672 DISSECT_IMAGE_USR_NO_ROOT |
5673 DISSECT_IMAGE_ADD_PARTITION_DEVICES |
5674 DISSECT_IMAGE_PIN_PARTITION_DEVICES;
ec16945e
LP
5675 assert(arg_image);
5676 assert(!arg_template);
5677
f461a28d 5678 r = chase_and_update(&arg_image, 0);
3f342ec4
LP
5679 if (r < 0)
5680 goto finish;
5681
0f3be6ca
LP
5682 if (arg_ephemeral) {
5683 _cleanup_free_ char *np = NULL;
5684
5685 r = tempfn_random(arg_image, "machine.", &np);
5686 if (r < 0) {
5687 log_error_errno(r, "Failed to generate name for image snapshot: %m");
5688 goto finish;
5689 }
5690
6992459c
LP
5691 /* Always take an exclusive lock on our own ephemeral copy. */
5692 r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
0f3be6ca
LP
5693 if (r < 0) {
5694 r = log_error_errno(r, "Failed to create image lock: %m");
5695 goto finish;
5696 }
5697
7bf011e3
LP
5698 {
5699 BLOCK_SIGNALS(SIGINT);
7c2f5495
DDM
5700 r = copy_file_full(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600,
5701 FS_NOCOW_FL, FS_NOCOW_FL,
5702 COPY_REFLINK|COPY_CRTIME|COPY_SIGINT,
5703 NULL, NULL);
7bf011e3
LP
5704 }
5705 if (r == -EINTR) {
5706 log_error_errno(r, "Interrupted while copying image file to %s, removed again.", np);
5707 goto finish;
5708 }
0f3be6ca
LP
5709 if (r < 0) {
5710 r = log_error_errno(r, "Failed to copy image file: %m");
5711 goto finish;
5712 }
5713
1cc6c93a 5714 free_and_replace(arg_image, np);
0f3be6ca
LP
5715 remove_image = true;
5716 } else {
5717 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
5718 if (r == -EBUSY) {
5719 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
5720 goto finish;
5721 }
5722 if (r < 0) {
5723 r = log_error_errno(r, "Failed to create image lock: %m");
5724 goto finish;
5725 }
4623e8e6 5726
89e62e0b
LP
5727 r = verity_settings_load(
5728 &arg_verity_settings,
5729 arg_image, NULL, NULL);
e7cbe5cb
LB
5730 if (r < 0) {
5731 log_error_errno(r, "Failed to read verity artefacts for %s: %m", arg_image);
5732 goto finish;
78ebe980 5733 }
89e62e0b
LP
5734
5735 if (arg_verity_settings.data_path)
5736 dissect_image_flags |= DISSECT_IMAGE_NO_PARTITION_TABLE;
30535c16
LP
5737 }
5738
c67b0082 5739 if (!mkdtemp(tmprootdir)) {
0f3be6ca 5740 r = log_error_errno(errno, "Failed to create temporary directory: %m");
6b9132a9 5741 goto finish;
1b9e5b12 5742 }
6b9132a9 5743
c67b0082
LP
5744 remove_tmprootdir = true;
5745
5746 arg_directory = strdup(tmprootdir);
1b9e5b12
LP
5747 if (!arg_directory) {
5748 r = log_oom();
5749 goto finish;
6b9132a9 5750 }
88213476 5751
89e62e0b
LP
5752 r = loop_device_make_by_path(
5753 arg_image,
5754 arg_read_only ? O_RDONLY : O_RDWR,
22ee78a8 5755 /* sector_size= */ UINT32_MAX,
89e62e0b 5756 FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN,
7f52206a 5757 LOCK_SH,
89e62e0b 5758 &loop);
2d845785
LP
5759 if (r < 0) {
5760 log_error_errno(r, "Failed to set up loopback block device: %m");
842f3b0f
LP
5761 goto finish;
5762 }
1b9e5b12 5763
bad31660 5764 r = dissect_loop_device_and_warn(
bad31660 5765 loop,
89e62e0b 5766 &arg_verity_settings,
84be0c71
LP
5767 /* mount_options=*/ NULL,
5768 arg_image_policy ?: &image_policy_container,
e7cbe5cb 5769 dissect_image_flags,
e0f9e7bd 5770 &dissected_image);
2d845785 5771 if (r == -ENOPKG) {
4526113f 5772 /* dissected_image_and_warn() already printed a brief error message. Extend on that with more details */
2d845785
LP
5773 log_notice("Note that the disk image needs to\n"
5774 " a) either contain only a single MBR partition of type 0x83 that is marked bootable\n"
5775 " b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n"
db811444 5776 " c) or follow https://uapi-group.org/specifications/specs/discoverable_partitions_specification\n"
2d845785
LP
5777 " d) or contain a file system without a partition table\n"
5778 "in order to be bootable with systemd-nspawn.");
1b9e5b12 5779 goto finish;
2d845785 5780 }
4526113f 5781 if (r < 0)
842f3b0f 5782 goto finish;
1b9e5b12 5783
88b3300f
LP
5784 r = dissected_image_load_verity_sig_partition(
5785 dissected_image,
5786 loop->fd,
5787 &arg_verity_settings);
5788 if (r < 0)
5789 goto finish;
5790
8ee9615e
LP
5791 if (dissected_image->has_verity && !arg_verity_settings.root_hash && !dissected_image->has_verity_sig)
5792 log_notice("Note: image %s contains verity information, but no root hash specified and no embedded "
5793 "root hash signature found! Proceeding without integrity checking.", arg_image);
4623e8e6 5794
89e62e0b
LP
5795 r = dissected_image_decrypt_interactively(
5796 dissected_image,
5797 NULL,
5798 &arg_verity_settings,
e330f97a 5799 0);
1b9e5b12
LP
5800 if (r < 0)
5801 goto finish;
0f3be6ca
LP
5802
5803 /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */
5804 if (remove_image && unlink(arg_image) >= 0)
5805 remove_image = false;
4c27749b
LP
5806
5807 if (arg_architecture < 0)
5808 arg_architecture = dissected_image_architecture(dissected_image);
842f3b0f 5809 }
842f3b0f 5810
86c0dd4a 5811 r = custom_mount_prepare_all(arg_directory, arg_custom_mounts, arg_n_custom_mounts);
5a8af538
LP
5812 if (r < 0)
5813 goto finish;
5814
de40a303
LP
5815 if (arg_console_mode < 0)
5816 arg_console_mode =
5817 isatty(STDIN_FILENO) > 0 &&
5818 isatty(STDOUT_FILENO) > 0 ? CONSOLE_INTERACTIVE : CONSOLE_READ_ONLY;
9c857b9d 5819
de40a303
LP
5820 if (arg_console_mode == CONSOLE_PIPE) /* if we pass STDERR on to the container, don't add our own logs into it too */
5821 arg_quiet = true;
a258bf26 5822
9c857b9d 5823 if (!arg_quiet)
c85c2f79 5824 log_info("Spawning container %s on %s.\nPress Ctrl-] three times within 1s to kill container.",
9c857b9d
LP
5825 arg_machine, arg_image ?: arg_directory);
5826
988851b6 5827 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, SIGRTMIN+18, -1) >= 0);
a258bf26 5828
66edd963 5829 if (prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) < 0) {
03cfe0d5
LP
5830 r = log_error_errno(errno, "Failed to become subreaper: %m");
5831 goto finish;
5832 }
5833
761cf19d
FW
5834 if (arg_expose_ports) {
5835 r = fw_ctx_new(&fw_ctx);
5836 if (r < 0) {
5837 log_error_errno(r, "Cannot expose configured ports, firewall initialization failed: %m");
5838 goto finish;
5839 }
5840 expose_args.fw_ctx = fw_ctx;
5841 }
d87be9b0 5842 for (;;) {
3acc84eb 5843 r = run_container(dissected_image,
44dbef90
LP
5844 fds,
5845 veth_name, &veth_created,
761cf19d 5846 &expose_args, &master,
44dbef90 5847 &pid, &ret);
b0067625 5848 if (r <= 0)
d87be9b0 5849 break;
d87be9b0 5850 }
88213476
LP
5851
5852finish:
04f590a4
LP
5853 (void) sd_notify(false,
5854 r == 0 && ret == EXIT_FORCE_RESTART ? "STOPPING=1\nSTATUS=Restarting..." :
5855 "STOPPING=1\nSTATUS=Terminating...");
af4ec430 5856
9444b1f2 5857 if (pid > 0)
c67b0082 5858 (void) kill(pid, SIGKILL);
88213476 5859
503546da 5860 /* Try to flush whatever is still queued in the pty */
6a0f896b 5861 if (master >= 0) {
f5fbe71d 5862 (void) copy_bytes(master, STDOUT_FILENO, UINT64_MAX, 0);
6a0f896b
LP
5863 master = safe_close(master);
5864 }
5865
5866 if (pid > 0)
5867 (void) wait_for_terminate(pid, NULL);
503546da 5868
50ebcf6c
LP
5869 pager_close();
5870
17cbb288 5871 if (remove_directory && arg_directory) {
ec16945e
LP
5872 int k;
5873
17cbb288 5874 k = rm_rf(arg_directory, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
ec16945e 5875 if (k < 0)
17cbb288 5876 log_warning_errno(k, "Cannot remove '%s', ignoring: %m", arg_directory);
ec16945e
LP
5877 }
5878
0f3be6ca
LP
5879 if (remove_image && arg_image) {
5880 if (unlink(arg_image) < 0)
5881 log_warning_errno(errno, "Can't remove image file '%s', ignoring: %m", arg_image);
5882 }
5883
c67b0082
LP
5884 if (remove_tmprootdir) {
5885 if (rmdir(tmprootdir) < 0)
5886 log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir);
5887 }
5888
785890ac
LP
5889 if (arg_machine) {
5890 const char *p;
5891
63c372cb 5892 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
c6878637 5893 (void) rm_rf(p, REMOVE_ROOT);
785890ac
LP
5894 }
5895
deff68e7
FW
5896 expose_port_flush(&fw_ctx, arg_expose_ports, AF_INET, &expose_args.address4);
5897 expose_port_flush(&fw_ctx, arg_expose_ports, AF_INET6, &expose_args.address6);
7513c5b8
LP
5898
5899 if (veth_created)
5900 (void) remove_veth_links(veth_name, arg_network_veth_extra);
22b28dfd 5901 (void) remove_bridge(arg_network_zone);
f757855e 5902
f757855e
LP
5903 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
5904 expose_port_free_all(arg_expose_ports);
bf428efb 5905 rlimit_free_all(arg_rlimit);
b2645747 5906 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
3652872a 5907 credential_free_all(arg_credentials, arg_n_credentials);
6d0b55c2 5908
44dbef90
LP
5909 if (r < 0)
5910 return r;
5911
5912 return ret;
88213476 5913}
44dbef90
LP
5914
/* Program entry point: hands run() to the DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE() macro, which is
 * defined outside this file. run() returns a negative value when it failed itself, and otherwise the
 * 'ret' value filled in by run_container().
 * NOTE(review): judging by the macro name, positive return values are presumably passed on as a failing
 * process exit status — confirm against the macro definition. */
5915DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run);