]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
Merge pull request #29836 from poettering/libiptc-dlopen
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
db9ecf05 1/* SPDX-License-Identifier: LGPL-2.1-or-later */
88213476 2
349cc4a5 3#if HAVE_BLKID
8fe0087e 4#endif
88213476 5#include <errno.h>
88213476 6#include <getopt.h>
503f480f 7#include <linux/fs.h>
1b9e5b12 8#include <linux/loop.h>
349cc4a5 9#if HAVE_SELINUX
8fe0087e 10#include <selinux/selinux.h>
1b9e5b12 11#endif
8fe0087e 12#include <stdlib.h>
8fe0087e 13#include <sys/file.h>
335d2ead 14#include <sys/ioctl.h>
8fe0087e
LP
15#include <sys/personality.h>
16#include <sys/prctl.h>
17#include <sys/types.h>
6916b164 18#include <sys/wait.h>
335d2ead 19#include <termios.h>
8fe0087e 20#include <unistd.h>
1b9e5b12 21
b053cd5f 22#include "sd-bus.h"
1f0cd86b 23#include "sd-daemon.h"
1f0cd86b 24#include "sd-id128.h"
8fe0087e 25
b5efdb8a 26#include "alloc-util.h"
813dbff4 27#include "ether-addr-util.h"
8fe0087e
LP
28#include "barrier.h"
29#include "base-filesystem.h"
30#include "blkid-util.h"
31#include "btrfs-util.h"
d6b4d1c7 32#include "build.h"
b8ea7a6e 33#include "bus-error.h"
7f8a85e6 34#include "bus-locator.h"
b053cd5f 35#include "bus-util.h"
8fe0087e 36#include "cap-list.h"
430f0182 37#include "capability-util.h"
04d391da 38#include "cgroup-util.h"
f461a28d 39#include "chase.h"
988851b6 40#include "common-signal.h"
8fe0087e 41#include "copy.h"
d107bb7d 42#include "cpu-set-util.h"
786d19fd 43#include "creds-util.h"
4fc9982c 44#include "dev-setup.h"
57f1b61b 45#include "discover-image.h"
2d845785 46#include "dissect-image.h"
8fe0087e 47#include "env-util.h"
3652872a 48#include "escape.h"
3ffd4af2 49#include "fd-util.h"
842f3b0f 50#include "fdset.h"
a5c32cff 51#include "fileio.h"
f97b34a6 52#include "format-util.h"
f4f15635 53#include "fs-util.h"
1b9e5b12 54#include "gpt.h"
4623e8e6 55#include "hexdecoct.h"
e2054217 56#include "hostname-setup.h"
8fe0087e 57#include "hostname-util.h"
910fd145 58#include "id128-util.h"
3652872a 59#include "io-util.h"
8fe0087e 60#include "log.h"
2d845785 61#include "loop-util.h"
8fe0087e 62#include "loopback-setup.h"
8fe0087e 63#include "macro.h"
44dbef90 64#include "main-func.h"
f5947a5e 65#include "missing_sched.h"
8fe0087e 66#include "mkdir.h"
4349cd7c 67#include "mount-util.h"
049af8ad 68#include "mountpoint-util.h"
0cb8e3d1 69#include "namespace-util.h"
8fe0087e 70#include "netlink-util.h"
2f893044 71#include "nspawn-bind-user.h"
07630cea 72#include "nspawn-cgroup.h"
3652872a 73#include "nspawn-creds.h"
3603efde 74#include "nspawn-def.h"
07630cea
LP
75#include "nspawn-expose-ports.h"
76#include "nspawn-mount.h"
77#include "nspawn-network.h"
de40a303 78#include "nspawn-oci.h"
7336138e 79#include "nspawn-patch-uid.h"
07630cea 80#include "nspawn-register.h"
910fd145 81#include "nspawn-seccomp.h"
07630cea
LP
82#include "nspawn-settings.h"
83#include "nspawn-setuid.h"
7732f92b 84#include "nspawn-stub-pid1.h"
c9394f4f 85#include "nspawn-util.h"
91181e07 86#include "nspawn.h"
d8b4d14d 87#include "nulstr-util.h"
d58ad743 88#include "os-util.h"
50ebcf6c 89#include "pager.h"
614b022c 90#include "parse-argument.h"
6bedfcbb 91#include "parse-util.h"
294bf0c3 92#include "pretty-print.h"
0b452006 93#include "process-util.h"
8fe0087e
LP
94#include "ptyfwd.h"
95#include "random-util.h"
8869a0b4 96#include "raw-clone.h"
86775e35 97#include "resolve-util.h"
bf428efb 98#include "rlimit-util.h"
8fe0087e 99#include "rm-rf.h"
de40a303 100#include "seccomp-util.h"
68b02049 101#include "selinux-util.h"
8fe0087e 102#include "signal-util.h"
2583fbea 103#include "socket-util.h"
8fcde012 104#include "stat-util.h"
15a5e950 105#include "stdio-util.h"
5c828e66 106#include "string-table.h"
07630cea 107#include "string-util.h"
8fe0087e 108#include "strv.h"
de40a303 109#include "sysctl-util.h"
8fe0087e 110#include "terminal-util.h"
e4de7287 111#include "tmpfile-util.h"
affb60b1 112#include "umask-util.h"
43c3fb46 113#include "unit-name.h"
b1d4f8e1 114#include "user-util.h"
e9642be2 115
e96ceaba
LP
/* Path, as seen from inside the container, of the notify socket the payload can use to talk to nspawn via the
 * sd_notify(3) protocol. */
#define NSPAWN_NOTIFY_SOCKET_PATH "/run/host/notify"

/* NOTE(review): judging by the name this is the mount tunnel directory below /run/host/ — confirm against
 * the users of this macro. */
#define NSPAWN_MOUNT_TUNNEL "/run/host/incoming"

/* NOTE(review): presumably the exit status a payload uses to request a restart — confirm where it is consumed. */
#define EXIT_FORCE_RESTART 133
121
113cea80
DH
/* The two ways a container run can end that this file distinguishes. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,
        CONTAINER_REBOOTED,
} ContainerStatus;
126
88213476 127static char *arg_directory = NULL;
ec16945e 128static char *arg_template = NULL;
5f932eb9 129static char *arg_chdir = NULL;
b53ede69
PW
130static char *arg_pivot_root_new = NULL;
131static char *arg_pivot_root_old = NULL;
687d0825 132static char *arg_user = NULL;
de40a303
LP
133static uid_t arg_uid = UID_INVALID;
134static gid_t arg_gid = GID_INVALID;
135static gid_t* arg_supplementary_gids = NULL;
136static size_t arg_n_supplementary_gids = 0;
9444b1f2 137static sd_id128_t arg_uuid = {};
3a9530e5
LP
138static char *arg_machine = NULL; /* The name used by the host to refer to this */
139static char *arg_hostname = NULL; /* The name the payload sees by default */
c74e630d
LP
140static const char *arg_selinux_context = NULL;
141static const char *arg_selinux_apifs_context = NULL;
de40a303 142static char *arg_slice = NULL;
ff01d048 143static bool arg_private_network = false;
bc2f673e 144static bool arg_read_only = false;
7732f92b 145static StartMode arg_start_mode = START_PID1;
ec16945e 146static bool arg_ephemeral = false;
57fb9fb5 147static LinkJournal arg_link_journal = LINK_AUTO;
574edc90 148static bool arg_link_journal_try = false;
520e0d54 149static uint64_t arg_caps_retain =
50b52222
LP
150 (1ULL << CAP_AUDIT_CONTROL) |
151 (1ULL << CAP_AUDIT_WRITE) |
5076f0cc
LP
152 (1ULL << CAP_CHOWN) |
153 (1ULL << CAP_DAC_OVERRIDE) |
154 (1ULL << CAP_DAC_READ_SEARCH) |
155 (1ULL << CAP_FOWNER) |
156 (1ULL << CAP_FSETID) |
157 (1ULL << CAP_IPC_OWNER) |
158 (1ULL << CAP_KILL) |
159 (1ULL << CAP_LEASE) |
160 (1ULL << CAP_LINUX_IMMUTABLE) |
50b52222 161 (1ULL << CAP_MKNOD) |
5076f0cc
LP
162 (1ULL << CAP_NET_BIND_SERVICE) |
163 (1ULL << CAP_NET_BROADCAST) |
164 (1ULL << CAP_NET_RAW) |
5076f0cc 165 (1ULL << CAP_SETFCAP) |
50b52222 166 (1ULL << CAP_SETGID) |
5076f0cc
LP
167 (1ULL << CAP_SETPCAP) |
168 (1ULL << CAP_SETUID) |
169 (1ULL << CAP_SYS_ADMIN) |
50b52222 170 (1ULL << CAP_SYS_BOOT) |
5076f0cc
LP
171 (1ULL << CAP_SYS_CHROOT) |
172 (1ULL << CAP_SYS_NICE) |
173 (1ULL << CAP_SYS_PTRACE) |
d87be9b0 174 (1ULL << CAP_SYS_RESOURCE) |
50b52222 175 (1ULL << CAP_SYS_TTY_CONFIG);
88fc9c9b 176static uint64_t arg_caps_ambient = 0;
de40a303 177static CapabilityQuintet arg_full_capabilities = CAPABILITY_QUINTET_NULL;
5a8af538 178static CustomMount *arg_custom_mounts = NULL;
88614c8a 179static size_t arg_n_custom_mounts = 0;
f4889f65 180static char **arg_setenv = NULL;
284c0b91 181static bool arg_quiet = false;
eb91eb18 182static bool arg_register = true;
89f7c846 183static bool arg_keep_unit = false;
aa28aefe 184static char **arg_network_interfaces = NULL;
c74e630d 185static char **arg_network_macvlan = NULL;
4bbfe7ad 186static char **arg_network_ipvlan = NULL;
69c79d3c 187static bool arg_network_veth = false;
f6d6bad1 188static char **arg_network_veth_extra = NULL;
f757855e 189static char *arg_network_bridge = NULL;
22b28dfd 190static char *arg_network_zone = NULL;
d7bea6b6 191static char *arg_network_namespace_path = NULL;
813dbff4 192struct ether_addr arg_network_provided_mac = {};
bb068de0 193static PagerFlags arg_pager_flags = 0;
050f7277 194static unsigned long arg_personality = PERSONALITY_INVALID;
ec16945e 195static char *arg_image = NULL;
de40a303 196static char *arg_oci_bundle = NULL;
f757855e 197static VolatileMode arg_volatile_mode = VOLATILE_NO;
6d0b55c2 198static ExposePort *arg_expose_ports = NULL;
f36933fe 199static char **arg_property = NULL;
de40a303 200static sd_bus_message *arg_property_message = NULL;
0de7acce 201static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
6dac160c 202static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
6c045a99 203static UserNamespaceOwnership arg_userns_ownership = _USER_NAMESPACE_OWNERSHIP_INVALID;
c6c8f6e2 204static int arg_kill_signal = 0;
5da38d07 205static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN;
f757855e
LP
206static SettingsMask arg_settings_mask = 0;
207static int arg_settings_trusted = -1;
208static char **arg_parameters = NULL;
6aadfa4c 209static const char *arg_container_service_name = "systemd-nspawn";
9c1e04d0 210static bool arg_notify_ready = false;
5a8ff0e6 211static bool arg_use_cgns = true;
0c582db0 212static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
1099ceeb 213static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_TMPFS_TMP;
aee36b4e 214static VeritySettings arg_verity_settings = VERITY_SETTINGS_DEFAULT;
6b000af4
LP
215static char **arg_syscall_allow_list = NULL;
216static char **arg_syscall_deny_list = NULL;
de40a303
LP
217#if HAVE_SECCOMP
218static scmp_filter_ctx arg_seccomp = NULL;
219#endif
bf428efb 220static struct rlimit *arg_rlimit[_RLIMIT_MAX] = {};
66edd963 221static bool arg_no_new_privileges = false;
81f345df
LP
222static int arg_oom_score_adjust = 0;
223static bool arg_oom_score_adjust_set = false;
0985c7c4 224static CPUSet arg_cpu_set = {};
09d423e9 225static ResolvConfMode arg_resolv_conf = RESOLV_CONF_AUTO;
1688841f 226static TimezoneMode arg_timezone = TIMEZONE_AUTO;
f5fbe71d 227static unsigned arg_console_width = UINT_MAX, arg_console_height = UINT_MAX;
de40a303
LP
228static DeviceNode* arg_extra_nodes = NULL;
229static size_t arg_n_extra_nodes = 0;
230static char **arg_sysctl = NULL;
231static ConsoleMode arg_console_mode = _CONSOLE_MODE_INVALID;
3652872a
LP
232static Credential *arg_credentials = NULL;
233static size_t arg_n_credentials = 0;
2f893044 234static char **arg_bind_user = NULL;
4a4654e0 235static bool arg_suppress_sync = false;
3603f151 236static char *arg_settings_filename = NULL;
4c27749b 237static Architecture arg_architecture = _ARCHITECTURE_INVALID;
84be0c71 238static ImagePolicy *arg_image_policy = NULL;
88213476 239
6145bb4f
LP
/* Pairs each of the option variables below with the cleanup function registered for it. */
STATIC_DESTRUCTOR_REGISTER(arg_directory, freep);
STATIC_DESTRUCTOR_REGISTER(arg_template, freep);
STATIC_DESTRUCTOR_REGISTER(arg_chdir, freep);
STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_new, freep);
STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_old, freep);
STATIC_DESTRUCTOR_REGISTER(arg_user, freep);
STATIC_DESTRUCTOR_REGISTER(arg_supplementary_gids, freep);
STATIC_DESTRUCTOR_REGISTER(arg_machine, freep);
STATIC_DESTRUCTOR_REGISTER(arg_hostname, freep);
STATIC_DESTRUCTOR_REGISTER(arg_slice, freep);
STATIC_DESTRUCTOR_REGISTER(arg_setenv, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_interfaces, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_macvlan, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_ipvlan, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_veth_extra, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_bridge, freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_zone, freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_namespace_path, freep);
STATIC_DESTRUCTOR_REGISTER(arg_image, freep);
STATIC_DESTRUCTOR_REGISTER(arg_oci_bundle, freep);
STATIC_DESTRUCTOR_REGISTER(arg_property, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_property_message, sd_bus_message_unrefp);
STATIC_DESTRUCTOR_REGISTER(arg_parameters, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_verity_settings, verity_settings_done);
STATIC_DESTRUCTOR_REGISTER(arg_syscall_allow_list, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_syscall_deny_list, strv_freep);
#if HAVE_SECCOMP
STATIC_DESTRUCTOR_REGISTER(arg_seccomp, seccomp_releasep);
#endif
STATIC_DESTRUCTOR_REGISTER(arg_cpu_set, cpu_set_reset);
STATIC_DESTRUCTOR_REGISTER(arg_sysctl, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_bind_user, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_settings_filename, freep);
STATIC_DESTRUCTOR_REGISTER(arg_image_policy, image_policy_freep);
6145bb4f 274
dce66ffe
ZJS
275static int handle_arg_console(const char *arg) {
276 if (streq(arg, "help")) {
10e8a60b
LP
277 puts("autopipe\n"
278 "interactive\n"
dce66ffe 279 "passive\n"
10e8a60b
LP
280 "pipe\n"
281 "read-only");
dce66ffe
ZJS
282 return 0;
283 }
284
285 if (streq(arg, "interactive"))
286 arg_console_mode = CONSOLE_INTERACTIVE;
287 else if (streq(arg, "read-only"))
288 arg_console_mode = CONSOLE_READ_ONLY;
289 else if (streq(arg, "passive"))
290 arg_console_mode = CONSOLE_PASSIVE;
554c4beb
LP
291 else if (streq(arg, "pipe")) {
292 if (isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0)
293 log_full(arg_quiet ? LOG_DEBUG : LOG_NOTICE,
294 "Console mode 'pipe' selected, but standard input/output are connected to an interactive TTY. "
295 "Most likely you want to use 'interactive' console mode for proper interactivity and shell job control. "
296 "Proceeding anyway.");
297
dce66ffe 298 arg_console_mode = CONSOLE_PIPE;
10e8a60b
LP
299 } else if (streq(arg, "autopipe")) {
300 if (isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0)
301 arg_console_mode = CONSOLE_INTERACTIVE;
302 else
303 arg_console_mode = CONSOLE_PIPE;
554c4beb 304 } else
dce66ffe
ZJS
305 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown console mode: %s", optarg);
306
307 arg_settings_mask |= SETTING_CONSOLE_MODE;
308 return 1;
309}
310
37ec0fdd
LP
311static int help(void) {
312 _cleanup_free_ char *link = NULL;
313 int r;
314
384c2c32 315 pager_open(arg_pager_flags);
50ebcf6c 316
37ec0fdd
LP
317 r = terminal_urlify_man("systemd-nspawn", "1", &link);
318 if (r < 0)
319 return log_oom();
320
25148653 321 printf("%1$s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
37a92352 322 "%5$sSpawn a command or OS in a light-weight container.%6$s\n\n"
a8828ed9
DW
323 " -h --help Show this help\n"
324 " --version Print version string\n"
69c79d3c 325 " -q --quiet Do not show status information\n"
bb068de0 326 " --no-pager Do not pipe output into a pager\n"
25148653
LP
327 " --settings=BOOLEAN Load additional settings from .nspawn file\n\n"
328 "%3$sImage:%4$s\n"
1b9e5b12 329 " -D --directory=PATH Root directory for the container\n"
ec16945e
LP
330 " --template=PATH Initialize root directory from template directory,\n"
331 " if missing\n"
332 " -x --ephemeral Run container with snapshot of root directory, and\n"
333 " remove it after exit\n"
25e68fd3
LP
334 " -i --image=PATH Root file system disk image (or device node) for\n"
335 " the container\n"
84be0c71 336 " --image-policy=POLICY Specify disk image dissection policy\n"
de40a303 337 " --oci-bundle=PATH OCI bundle directory\n"
25148653
LP
338 " --read-only Mount the root directory read-only\n"
339 " --volatile[=MODE] Run the system in volatile mode\n"
25e68fd3 340 " --root-hash=HASH Specify verity root hash for root disk image\n"
c2923fdc
LB
341 " --root-hash-sig=SIG Specify pkcs7 signature of root hash for verity\n"
342 " as a DER encoded PKCS7, either as a path to a file\n"
343 " or as an ASCII base64 encoded string prefixed by\n"
344 " 'base64:'\n"
e7cbe5cb 345 " --verity-data=PATH Specify hash device for verity\n"
25148653
LP
346 " --pivot-root=PATH[:PATH]\n"
347 " Pivot root to given directory in the container\n\n"
348 "%3$sExecution:%4$s\n"
7732f92b 349 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
a8828ed9 350 " -b --boot Boot up full system (i.e. invoke init)\n"
5f932eb9 351 " --chdir=PATH Set working directory in the container\n"
0d2a0179 352 " -E --setenv=NAME[=VALUE] Pass an environment variable to PID 1\n"
25148653
LP
353 " -u --user=USER Run the command under specified user or UID\n"
354 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
4a4654e0
LP
355 " --notify-ready=BOOLEAN Receive notifications from the child init process\n"
356 " --suppress-sync=BOOLEAN\n"
357 " Suppress any form of disk data synchronization\n\n"
25148653 358 "%3$sSystem Identity:%4$s\n"
a8828ed9 359 " -M --machine=NAME Set the machine name for the container\n"
3a9530e5 360 " --hostname=NAME Override the hostname for the container\n"
25148653
LP
361 " --uuid=UUID Set a specific machine UUID for the container\n\n"
362 "%3$sProperties:%4$s\n"
a8828ed9 363 " -S --slice=SLICE Place the container in the specified slice\n"
f36933fe 364 " --property=NAME=VALUE Set scope unit property\n"
25148653
LP
365 " --register=BOOLEAN Register container as machine\n"
366 " --keep-unit Do not register a scope for the machine, reuse\n"
367 " the service unit nspawn is running in\n\n"
368 "%3$sUser Namespacing:%4$s\n"
b917743d
YW
369 " --private-users=no Run without user namespacing\n"
370 " --private-users=yes|pick|identity\n"
371 " Run within user namespace, autoselect UID/GID range\n"
372 " --private-users=UIDBASE[:NUIDS]\n"
90b4a64d 373 " Similar, but with user configured UID/GID range\n"
6c045a99
LP
374 " --private-users-ownership=MODE\n"
375 " Adjust ('chown') or map ('map') OS tree ownership\n"
b917743d
YW
376 " to private UID/GID range\n"
377 " -U Equivalent to --private-users=pick and\n"
378 " --private-users-ownership=auto\n\n"
25148653 379 "%3$sNetworking:%4$s\n"
69c79d3c 380 " --private-network Disable network in container\n"
2f091b1b 381 " --network-interface=HOSTIF[:CONTAINERIF]\n"
69c79d3c
LP
382 " Assign an existing network interface to the\n"
383 " container\n"
2f091b1b 384 " --network-macvlan=HOSTIF[:CONTAINERIF]\n"
c74e630d
LP
385 " Create a macvlan network interface based on an\n"
386 " existing network interface to the container\n"
2f091b1b 387 " --network-ipvlan=HOSTIF[:CONTAINERIF]\n"
387f6955 388 " Create an ipvlan network interface based on an\n"
4bbfe7ad 389 " existing network interface to the container\n"
a8eaaee7 390 " -n --network-veth Add a virtual Ethernet connection between host\n"
69c79d3c 391 " and container\n"
f6d6bad1
LP
392 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
393 " Add an additional virtual Ethernet link between\n"
394 " host and container\n"
ab046dde 395 " --network-bridge=INTERFACE\n"
90b4a64d
ZJS
396 " Add a virtual Ethernet connection to the container\n"
397 " and attach it to an existing bridge on the host\n"
398 " --network-zone=NAME Similar, but attach the new interface to an\n"
399 " an automatically managed bridge interface\n"
d7bea6b6
DP
400 " --network-namespace-path=PATH\n"
401 " Set network namespace to the one represented by\n"
402 " the specified kernel namespace file node\n"
6d0b55c2 403 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
25148653
LP
404 " Expose a container IP port on the host\n\n"
405 "%3$sSecurity:%4$s\n"
a8828ed9
DW
406 " --capability=CAP In addition to the default, retain specified\n"
407 " capability\n"
408 " --drop-capability=CAP Drop the specified capability from the default set\n"
88fc9c9b
TH
409 " --ambient-capability=CAP\n"
410 " Sets the specified capability for the started\n"
411 " process. Not useful if booting a machine.\n"
f4e803c8 412 " --no-new-privileges Set PR_SET_NO_NEW_PRIVS flag for container payload\n"
960e4569
LP
413 " --system-call-filter=LIST|~LIST\n"
414 " Permit/prohibit specific system calls\n"
25148653
LP
415 " -Z --selinux-context=SECLABEL\n"
416 " Set the SELinux security context to be used by\n"
417 " processes in the container\n"
418 " -L --selinux-apifs-context=SECLABEL\n"
419 " Set the SELinux security context to be used by\n"
420 " API/tmpfs file systems in the container\n\n"
421 "%3$sResources:%4$s\n"
bf428efb 422 " --rlimit=NAME=LIMIT Set a resource limit for the payload\n"
81f345df
LP
423 " --oom-score-adjust=VALUE\n"
424 " Adjust the OOM score value for the payload\n"
f4e803c8
LP
425 " --cpu-affinity=CPUS Adjust the CPU affinity of the container\n"
426 " --personality=ARCH Pick personality for this container\n\n"
25148653 427 "%3$sIntegration:%4$s\n"
09d423e9 428 " --resolv-conf=MODE Select mode of /etc/resolv.conf initialization\n"
1688841f 429 " --timezone=MODE Select mode of /etc/localtime initialization\n"
25148653
LP
430 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
431 " host, try-guest, try-host\n"
432 " -j Equivalent to --link-journal=try-guest\n\n"
433 "%3$sMounts:%4$s\n"
5e5bfa6e
EY
434 " --bind=PATH[:PATH[:OPTIONS]]\n"
435 " Bind mount a file or directory from the host into\n"
a8828ed9 436 " the container\n"
5e5bfa6e
EY
437 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
438 " Similar, but creates a read-only bind mount\n"
de40a303
LP
439 " --inaccessible=PATH Over-mount file node with inaccessible node to mask\n"
440 " it\n"
06c17c39 441 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
5a8af538
LP
442 " --overlay=PATH[:PATH...]:PATH\n"
443 " Create an overlay mount from the host to \n"
444 " the container\n"
445 " --overlay-ro=PATH[:PATH...]:PATH\n"
2f893044
LP
446 " Similar, but creates a read-only overlay mount\n"
447 " --bind-user=NAME Bind user from host to container\n\n"
25148653 448 "%3$sInput/Output:%4$s\n"
de40a303
LP
449 " --console=MODE Select how stdin/stdout/stderr and /dev/console are\n"
450 " set up for the container.\n"
3652872a
LP
451 " -P --pipe Equivalent to --console=pipe\n\n"
452 "%3$sCredentials:%4$s\n"
453 " --set-credential=ID:VALUE\n"
454 " Pass a credential with literal value to container.\n"
455 " --load-credential=ID:PATH\n"
456 " Load credential to pass to container from file or\n"
457 " AF_UNIX stream socket.\n"
bc556335
DDM
458 "\nSee the %2$s for details.\n",
459 program_invocation_short_name,
460 link,
461 ansi_underline(),
462 ansi_normal(),
463 ansi_highlight(),
464 ansi_normal());
37ec0fdd
LP
465
466 return 0;
88213476
LP
467}
468
86c0dd4a 469static int custom_mount_check_all(void) {
88614c8a 470 size_t i;
5a8af538 471
5a8af538
LP
472 for (i = 0; i < arg_n_custom_mounts; i++) {
473 CustomMount *m = &arg_custom_mounts[i];
474
0de7acce 475 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
6c045a99 476 if (arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_OFF)
baaa35ad 477 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
c920b863 478 "--private-users-ownership=own may not be combined with custom root mounts.");
6c045a99 479 if (arg_uid_shift == UID_INVALID)
baaa35ad
ZJS
480 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
481 "--private-users with automatic UID shift may not be combined with custom root mounts.");
825d5287 482 }
5a8af538
LP
483 }
484
485 return 0;
486}
487
8199d554 488static int detect_unified_cgroup_hierarchy_from_environment(void) {
c78c095b 489 const char *e, *var = "SYSTEMD_NSPAWN_UNIFIED_HIERARCHY";
415fc41c 490 int r;
5da38d07 491
efdb0237 492 /* Allow the user to control whether the unified hierarchy is used */
c78c095b
ZJS
493
494 e = getenv(var);
495 if (!e) {
d5fc5b2f 496 /* $UNIFIED_CGROUP_HIERARCHY has been renamed to $SYSTEMD_NSPAWN_UNIFIED_HIERARCHY. */
c78c095b
ZJS
497 var = "UNIFIED_CGROUP_HIERARCHY";
498 e = getenv(var);
c78c095b
ZJS
499 }
500
501 if (!isempty(e)) {
efdb0237
LP
502 r = parse_boolean(e);
503 if (r < 0)
c78c095b 504 return log_error_errno(r, "Failed to parse $%s: %m", var);
5da38d07
TH
505 if (r > 0)
506 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
507 else
508 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237
LP
509 }
510
8199d554
LP
511 return 0;
512}
513
514static int detect_unified_cgroup_hierarchy_from_image(const char *directory) {
515 int r;
516
75b0d8b8
ZJS
517 /* Let's inherit the mode to use from the host system, but let's take into consideration what systemd
518 * in the image actually supports. */
b4cccbc1
LP
519 r = cg_all_unified();
520 if (r < 0)
521 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
522 if (r > 0) {
a8725a06
ZJS
523 /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
524 * routine only detects 231, so we'll have a false negative here for 230. */
7e6821ed 525 r = systemd_installation_has_version(directory, "230");
a8725a06
ZJS
526 if (r < 0)
527 return log_error_errno(r, "Failed to determine systemd version in container: %m");
528 if (r > 0)
529 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
530 else
531 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
c22800e4 532 } else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
2977724b 533 /* Mixed cgroup hierarchy support was added in 233 */
7e6821ed 534 r = systemd_installation_has_version(directory, "233");
0fd9563f
ZJS
535 if (r < 0)
536 return log_error_errno(r, "Failed to determine systemd version in container: %m");
537 if (r > 0)
538 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
539 else
540 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
541 } else
5da38d07 542 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237 543
8199d554
LP
544 log_debug("Using %s hierarchy for container.",
545 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_NONE ? "legacy" :
546 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_SYSTEMD ? "hybrid" : "unified");
547
efdb0237
LP
548 return 0;
549}
550
8a99bd0c
ZJS
551static int parse_capability_spec(const char *spec, uint64_t *ret_mask) {
552 uint64_t mask = 0;
553 int r;
554
555 for (;;) {
556 _cleanup_free_ char *t = NULL;
557
558 r = extract_first_word(&spec, &t, ",", 0);
559 if (r < 0)
560 return log_error_errno(r, "Failed to parse capability %s.", t);
561 if (r == 0)
562 break;
563
564 if (streq(t, "help")) {
565 for (int i = 0; i < capability_list_length(); i++) {
566 const char *name;
567
568 name = capability_to_name(i);
569 if (name)
570 puts(name);
571 }
572
573 return 0; /* quit */
574 }
575
576 if (streq(t, "all"))
f5fbe71d 577 mask = UINT64_MAX;
8a99bd0c
ZJS
578 else {
579 r = capability_from_name(t);
580 if (r < 0)
581 return log_error_errno(r, "Failed to parse capability %s.", t);
582
583 mask |= 1ULL << r;
584 }
585 }
586
587 *ret_mask = mask;
588 return 1; /* continue */
589}
590
49048684 591static int parse_share_ns_env(const char *name, unsigned long ns_flag) {
0c582db0
LB
592 int r;
593
594 r = getenv_bool(name);
595 if (r == -ENXIO)
49048684 596 return 0;
0c582db0 597 if (r < 0)
49048684 598 return log_error_errno(r, "Failed to parse $%s: %m", name);
de40a303 599
0c582db0 600 arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
de40a303 601 arg_settings_mask |= SETTING_CLONE_NS_FLAGS;
49048684 602 return 0;
0c582db0
LB
603}
604
49048684 605static int parse_mount_settings_env(void) {
4f086aab 606 const char *e;
1099ceeb
LP
607 int r;
608
609 r = getenv_bool("SYSTEMD_NSPAWN_TMPFS_TMP");
49048684
ZJS
610 if (r < 0 && r != -ENXIO)
611 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_TMPFS_TMP: %m");
1099ceeb
LP
612 if (r >= 0)
613 SET_FLAG(arg_mount_settings, MOUNT_APPLY_TMPFS_TMP, r > 0);
4f086aab
SU
614
615 e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
49048684 616 if (streq_ptr(e, "network"))
4f086aab 617 arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS;
4f086aab 618
49048684
ZJS
619 else if (e) {
620 r = parse_boolean(e);
621 if (r < 0)
622 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_API_VFS_WRITABLE: %m");
623
624 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_RO, r == 0);
625 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_NETNS, false);
ab8ee0f2 626 }
4f086aab 627
49048684 628 return 0;
4f086aab
SU
629}
630
49048684 631static int parse_environment(void) {
d5455d2f
LP
632 const char *e;
633 int r;
634
49048684
ZJS
635 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
636 if (r < 0)
637 return r;
638 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
639 if (r < 0)
640 return r;
641 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
642 if (r < 0)
643 return r;
644 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
645 if (r < 0)
646 return r;
d5455d2f 647
49048684
ZJS
648 r = parse_mount_settings_env();
649 if (r < 0)
650 return r;
d5455d2f 651
489fae52
ZJS
652 /* SYSTEMD_NSPAWN_USE_CGNS=0 can be used to disable CLONE_NEWCGROUP use,
653 * even if it is supported. If not supported, it has no effect. */
de40a303 654 if (!cg_ns_supported())
489fae52 655 arg_use_cgns = false;
de40a303
LP
656 else {
657 r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
658 if (r < 0) {
659 if (r != -ENXIO)
49048684 660 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_USE_CGNS: %m");
de40a303
LP
661
662 arg_use_cgns = true;
663 } else {
664 arg_use_cgns = r > 0;
665 arg_settings_mask |= SETTING_USE_CGNS;
666 }
667 }
d5455d2f
LP
668
669 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
670 if (e)
671 arg_container_service_name = e;
672
813dbff4
RC
673 e = getenv("SYSTEMD_NSPAWN_NETWORK_MAC");
674 if (e) {
675 r = parse_ether_addr(e, &arg_network_provided_mac);
676 if (r < 0)
677 return log_error_errno(r, "Failed to parse provided MAC address via environment variable");
678 }
679
4a4654e0
LP
680 r = getenv_bool("SYSTEMD_SUPPRESS_SYNC");
681 if (r >= 0)
682 arg_suppress_sync = r;
683 else if (r != -ENXIO)
684 log_debug_errno(r, "Failed to parse $SYSTEMD_SUPPRESS_SYNC, ignoring: %m");
685
49048684 686 return detect_unified_cgroup_hierarchy_from_environment();
d5455d2f
LP
687}
688
88213476 689static int parse_argv(int argc, char *argv[]) {
a41fe3a2 690 enum {
acbeb427
ZJS
691 ARG_VERSION = 0x100,
692 ARG_PRIVATE_NETWORK,
bc2f673e 693 ARG_UUID,
5076f0cc 694 ARG_READ_ONLY,
57fb9fb5 695 ARG_CAPABILITY,
88fc9c9b 696 ARG_AMBIENT_CAPABILITY,
420c7379 697 ARG_DROP_CAPABILITY,
17fe0523
LP
698 ARG_LINK_JOURNAL,
699 ARG_BIND,
f4889f65 700 ARG_BIND_RO,
06c17c39 701 ARG_TMPFS,
5a8af538
LP
702 ARG_OVERLAY,
703 ARG_OVERLAY_RO,
de40a303 704 ARG_INACCESSIBLE,
eb91eb18 705 ARG_SHARE_SYSTEM,
89f7c846 706 ARG_REGISTER,
aa28aefe 707 ARG_KEEP_UNIT,
69c79d3c 708 ARG_NETWORK_INTERFACE,
c74e630d 709 ARG_NETWORK_MACVLAN,
4bbfe7ad 710 ARG_NETWORK_IPVLAN,
ab046dde 711 ARG_NETWORK_BRIDGE,
22b28dfd 712 ARG_NETWORK_ZONE,
f6d6bad1 713 ARG_NETWORK_VETH_EXTRA,
d7bea6b6 714 ARG_NETWORK_NAMESPACE_PATH,
6afc95b7 715 ARG_PERSONALITY,
4d9f07b4 716 ARG_VOLATILE,
ec16945e 717 ARG_TEMPLATE,
f36933fe 718 ARG_PROPERTY,
6dac160c 719 ARG_PRIVATE_USERS,
c6c8f6e2 720 ARG_KILL_SIGNAL,
f757855e 721 ARG_SETTINGS,
5f932eb9 722 ARG_CHDIR,
b53ede69 723 ARG_PIVOT_ROOT,
7336138e 724 ARG_PRIVATE_USERS_CHOWN,
6c045a99 725 ARG_PRIVATE_USERS_OWNERSHIP,
9c1e04d0 726 ARG_NOTIFY_READY,
4623e8e6 727 ARG_ROOT_HASH,
89e62e0b
LP
728 ARG_ROOT_HASH_SIG,
729 ARG_VERITY_DATA,
960e4569 730 ARG_SYSTEM_CALL_FILTER,
bf428efb 731 ARG_RLIMIT,
3a9530e5 732 ARG_HOSTNAME,
66edd963 733 ARG_NO_NEW_PRIVILEGES,
81f345df 734 ARG_OOM_SCORE_ADJUST,
d107bb7d 735 ARG_CPU_AFFINITY,
09d423e9 736 ARG_RESOLV_CONF,
1688841f 737 ARG_TIMEZONE,
de40a303
LP
738 ARG_CONSOLE,
739 ARG_PIPE,
740 ARG_OCI_BUNDLE,
bb068de0 741 ARG_NO_PAGER,
3652872a
LP
742 ARG_SET_CREDENTIAL,
743 ARG_LOAD_CREDENTIAL,
2f893044 744 ARG_BIND_USER,
4a4654e0 745 ARG_SUPPRESS_SYNC,
84be0c71 746 ARG_IMAGE_POLICY,
a41fe3a2
LP
747 };
748
88213476 749 static const struct option options[] = {
d7bea6b6
DP
750 { "help", no_argument, NULL, 'h' },
751 { "version", no_argument, NULL, ARG_VERSION },
752 { "directory", required_argument, NULL, 'D' },
753 { "template", required_argument, NULL, ARG_TEMPLATE },
754 { "ephemeral", no_argument, NULL, 'x' },
755 { "user", required_argument, NULL, 'u' },
756 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
757 { "as-pid2", no_argument, NULL, 'a' },
758 { "boot", no_argument, NULL, 'b' },
759 { "uuid", required_argument, NULL, ARG_UUID },
760 { "read-only", no_argument, NULL, ARG_READ_ONLY },
761 { "capability", required_argument, NULL, ARG_CAPABILITY },
88fc9c9b 762 { "ambient-capability", required_argument, NULL, ARG_AMBIENT_CAPABILITY },
d7bea6b6 763 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
66edd963 764 { "no-new-privileges", required_argument, NULL, ARG_NO_NEW_PRIVILEGES },
d7bea6b6
DP
765 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
766 { "bind", required_argument, NULL, ARG_BIND },
767 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
768 { "tmpfs", required_argument, NULL, ARG_TMPFS },
769 { "overlay", required_argument, NULL, ARG_OVERLAY },
770 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
de40a303 771 { "inaccessible", required_argument, NULL, ARG_INACCESSIBLE },
d7bea6b6 772 { "machine", required_argument, NULL, 'M' },
3a9530e5 773 { "hostname", required_argument, NULL, ARG_HOSTNAME },
d7bea6b6
DP
774 { "slice", required_argument, NULL, 'S' },
775 { "setenv", required_argument, NULL, 'E' },
776 { "selinux-context", required_argument, NULL, 'Z' },
777 { "selinux-apifs-context", required_argument, NULL, 'L' },
778 { "quiet", no_argument, NULL, 'q' },
779 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */
780 { "register", required_argument, NULL, ARG_REGISTER },
781 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
782 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
783 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
784 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
785 { "network-veth", no_argument, NULL, 'n' },
786 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA },
787 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
788 { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
789 { "network-namespace-path", required_argument, NULL, ARG_NETWORK_NAMESPACE_PATH },
790 { "personality", required_argument, NULL, ARG_PERSONALITY },
791 { "image", required_argument, NULL, 'i' },
792 { "volatile", optional_argument, NULL, ARG_VOLATILE },
793 { "port", required_argument, NULL, 'p' },
794 { "property", required_argument, NULL, ARG_PROPERTY },
795 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
6c045a99
LP
796 { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN }, /* obsolete */
797 { "private-users-ownership",required_argument, NULL, ARG_PRIVATE_USERS_OWNERSHIP},
d7bea6b6
DP
798 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
799 { "settings", required_argument, NULL, ARG_SETTINGS },
800 { "chdir", required_argument, NULL, ARG_CHDIR },
801 { "pivot-root", required_argument, NULL, ARG_PIVOT_ROOT },
802 { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
803 { "root-hash", required_argument, NULL, ARG_ROOT_HASH },
89e62e0b
LP
804 { "root-hash-sig", required_argument, NULL, ARG_ROOT_HASH_SIG },
805 { "verity-data", required_argument, NULL, ARG_VERITY_DATA },
d7bea6b6 806 { "system-call-filter", required_argument, NULL, ARG_SYSTEM_CALL_FILTER },
bf428efb 807 { "rlimit", required_argument, NULL, ARG_RLIMIT },
81f345df 808 { "oom-score-adjust", required_argument, NULL, ARG_OOM_SCORE_ADJUST },
d107bb7d 809 { "cpu-affinity", required_argument, NULL, ARG_CPU_AFFINITY },
09d423e9 810 { "resolv-conf", required_argument, NULL, ARG_RESOLV_CONF },
1688841f 811 { "timezone", required_argument, NULL, ARG_TIMEZONE },
de40a303
LP
812 { "console", required_argument, NULL, ARG_CONSOLE },
813 { "pipe", no_argument, NULL, ARG_PIPE },
814 { "oci-bundle", required_argument, NULL, ARG_OCI_BUNDLE },
bb068de0 815 { "no-pager", no_argument, NULL, ARG_NO_PAGER },
3652872a
LP
816 { "set-credential", required_argument, NULL, ARG_SET_CREDENTIAL },
817 { "load-credential", required_argument, NULL, ARG_LOAD_CREDENTIAL },
2f893044 818 { "bind-user", required_argument, NULL, ARG_BIND_USER },
4a4654e0 819 { "suppress-sync", required_argument, NULL, ARG_SUPPRESS_SYNC },
84be0c71 820 { "image-policy", required_argument, NULL, ARG_IMAGE_POLICY },
eb9da376 821 {}
88213476
LP
822 };
823
9444b1f2 824 int c, r;
a42c8b54 825 uint64_t plus = 0, minus = 0;
f757855e 826 bool mask_all_settings = false, mask_no_settings = false;
88213476
LP
827
828 assert(argc >= 0);
829 assert(argv);
830
ef9c12b1
YW
831 /* Resetting to 0 forces the invocation of an internal initialization routine of getopt_long()
832 * that checks for GNU extensions in optstring ('-' or '+' at the beginning). */
833 optind = 0;
de40a303 834 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nUE:P", options, NULL)) >= 0)
88213476
LP
835 switch (c) {
836
837 case 'h':
37ec0fdd 838 return help();
88213476 839
acbeb427 840 case ARG_VERSION:
3f6fd1ba 841 return version();
acbeb427 842
88213476 843 case 'D':
614b022c 844 r = parse_path_argument(optarg, false, &arg_directory);
ec16945e 845 if (r < 0)
0f03c2a4 846 return r;
de40a303
LP
847
848 arg_settings_mask |= SETTING_DIRECTORY;
ec16945e
LP
849 break;
850
851 case ARG_TEMPLATE:
614b022c 852 r = parse_path_argument(optarg, false, &arg_template);
ec16945e 853 if (r < 0)
0f03c2a4 854 return r;
de40a303
LP
855
856 arg_settings_mask |= SETTING_DIRECTORY;
88213476
LP
857 break;
858
1b9e5b12 859 case 'i':
614b022c 860 r = parse_path_argument(optarg, false, &arg_image);
ec16945e 861 if (r < 0)
0f03c2a4 862 return r;
de40a303
LP
863
864 arg_settings_mask |= SETTING_DIRECTORY;
865 break;
866
867 case ARG_OCI_BUNDLE:
614b022c 868 r = parse_path_argument(optarg, false, &arg_oci_bundle);
de40a303
LP
869 if (r < 0)
870 return r;
871
ec16945e
LP
872 break;
873
874 case 'x':
875 arg_ephemeral = true;
a2f577fc 876 arg_settings_mask |= SETTING_EPHEMERAL;
1b9e5b12
LP
877 break;
878
687d0825 879 case 'u':
2fc09a9c
DM
880 r = free_and_strdup(&arg_user, optarg);
881 if (r < 0)
7027ff61 882 return log_oom();
687d0825 883
f757855e 884 arg_settings_mask |= SETTING_USER;
687d0825
MV
885 break;
886
22b28dfd 887 case ARG_NETWORK_ZONE: {
fee9f7b5 888 _cleanup_free_ char *j = NULL;
22b28dfd 889
b910cc72 890 j = strjoin("vz-", optarg);
22b28dfd
LP
891 if (!j)
892 return log_oom();
893
fee9f7b5
FS
894 if (!ifname_valid(j))
895 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
896 "Network zone name not valid: %s", j);
22b28dfd 897
df1fac6d 898 free_and_replace(arg_network_zone, j);
22b28dfd
LP
899
900 arg_network_veth = true;
901 arg_private_network = true;
902 arg_settings_mask |= SETTING_NETWORK;
903 break;
904 }
905
ab046dde 906 case ARG_NETWORK_BRIDGE:
ef76dff2 907
baaa35ad
ZJS
908 if (!ifname_valid(optarg))
909 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
910 "Bridge interface name not valid: %s", optarg);
ef76dff2 911
f757855e
LP
912 r = free_and_strdup(&arg_network_bridge, optarg);
913 if (r < 0)
914 return log_oom();
ab046dde 915
4831981d 916 _fallthrough_;
0dfaa006 917 case 'n':
69c79d3c
LP
918 arg_network_veth = true;
919 arg_private_network = true;
f757855e 920 arg_settings_mask |= SETTING_NETWORK;
69c79d3c
LP
921 break;
922
f6d6bad1
LP
923 case ARG_NETWORK_VETH_EXTRA:
924 r = veth_extra_parse(&arg_network_veth_extra, optarg);
925 if (r < 0)
926 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
927
928 arg_private_network = true;
929 arg_settings_mask |= SETTING_NETWORK;
930 break;
931
aa28aefe 932 case ARG_NETWORK_INTERFACE:
2f091b1b 933 r = interface_pair_parse(&arg_network_interfaces, optarg);
b390f178
DDM
934 if (r < 0)
935 return r;
936
c74e630d 937 arg_private_network = true;
f757855e 938 arg_settings_mask |= SETTING_NETWORK;
c74e630d
LP
939 break;
940
941 case ARG_NETWORK_MACVLAN:
2f091b1b 942 r = macvlan_pair_parse(&arg_network_macvlan, optarg);
b390f178
DDM
943 if (r < 0)
944 return r;
945
4bbfe7ad 946 arg_private_network = true;
f757855e 947 arg_settings_mask |= SETTING_NETWORK;
4bbfe7ad
TG
948 break;
949
950 case ARG_NETWORK_IPVLAN:
2f091b1b 951 r = ipvlan_pair_parse(&arg_network_ipvlan, optarg);
b390f178
DDM
952 if (r < 0)
953 return r;
954
4831981d 955 _fallthrough_;
ff01d048
LP
956 case ARG_PRIVATE_NETWORK:
957 arg_private_network = true;
f757855e 958 arg_settings_mask |= SETTING_NETWORK;
a41fe3a2
LP
959 break;
960
d7bea6b6 961 case ARG_NETWORK_NAMESPACE_PATH:
614b022c 962 r = parse_path_argument(optarg, false, &arg_network_namespace_path);
d7bea6b6
DP
963 if (r < 0)
964 return r;
965
de40a303 966 arg_settings_mask |= SETTING_NETWORK;
d7bea6b6
DP
967 break;
968
0f0dbc46 969 case 'b':
baaa35ad
ZJS
970 if (arg_start_mode == START_PID2)
971 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
972 "--boot and --as-pid2 may not be combined.");
7732f92b
LP
973
974 arg_start_mode = START_BOOT;
975 arg_settings_mask |= SETTING_START_MODE;
976 break;
977
978 case 'a':
baaa35ad
ZJS
979 if (arg_start_mode == START_BOOT)
980 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
981 "--boot and --as-pid2 may not be combined.");
7732f92b
LP
982
983 arg_start_mode = START_PID2;
984 arg_settings_mask |= SETTING_START_MODE;
0f0dbc46
LP
985 break;
986
144f0fc0 987 case ARG_UUID:
aea3f594
ZJS
988 r = id128_from_string_nonzero(optarg, &arg_uuid);
989 if (r == -ENXIO)
baaa35ad
ZJS
990 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
991 "Machine UUID may not be all zeroes.");
aea3f594
ZJS
992 if (r < 0)
993 return log_error_errno(r, "Invalid UUID: %s", optarg);
f757855e
LP
994
995 arg_settings_mask |= SETTING_MACHINE_ID;
9444b1f2 996 break;
aa96c6cb 997
43c3fb46
LP
998 case 'S': {
999 _cleanup_free_ char *mangled = NULL;
1000
1001 r = unit_name_mangle_with_suffix(optarg, NULL, UNIT_NAME_MANGLE_WARN, ".slice", &mangled);
de40a303
LP
1002 if (r < 0)
1003 return log_oom();
1004
43c3fb46 1005 free_and_replace(arg_slice, mangled);
de40a303 1006 arg_settings_mask |= SETTING_SLICE;
144f0fc0 1007 break;
43c3fb46 1008 }
144f0fc0 1009
7027ff61 1010 case 'M':
c1521918 1011 if (isempty(optarg))
97b11eed 1012 arg_machine = mfree(arg_machine);
c1521918 1013 else {
52ef5dd7 1014 if (!hostname_is_valid(optarg, 0))
baaa35ad
ZJS
1015 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1016 "Invalid machine name: %s", optarg);
7027ff61 1017
0c3c4284
LP
1018 r = free_and_strdup(&arg_machine, optarg);
1019 if (r < 0)
eb91eb18 1020 return log_oom();
eb91eb18 1021 }
9ce6d1b3 1022 break;
7027ff61 1023
3a9530e5
LP
1024 case ARG_HOSTNAME:
1025 if (isempty(optarg))
1026 arg_hostname = mfree(arg_hostname);
1027 else {
52ef5dd7 1028 if (!hostname_is_valid(optarg, 0))
baaa35ad
ZJS
1029 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1030 "Invalid hostname: %s", optarg);
3a9530e5
LP
1031
1032 r = free_and_strdup(&arg_hostname, optarg);
1033 if (r < 0)
1034 return log_oom();
1035 }
1036
1037 arg_settings_mask |= SETTING_HOSTNAME;
1038 break;
1039
82adf6af
LP
1040 case 'Z':
1041 arg_selinux_context = optarg;
a8828ed9
DW
1042 break;
1043
82adf6af
LP
1044 case 'L':
1045 arg_selinux_apifs_context = optarg;
a8828ed9
DW
1046 break;
1047
bc2f673e
LP
1048 case ARG_READ_ONLY:
1049 arg_read_only = true;
f757855e 1050 arg_settings_mask |= SETTING_READ_ONLY;
bc2f673e
LP
1051 break;
1052
88fc9c9b
TH
1053 case ARG_AMBIENT_CAPABILITY: {
1054 uint64_t m;
1055 r = parse_capability_spec(optarg, &m);
1056 if (r <= 0)
1057 return r;
1058 arg_caps_ambient |= m;
1059 arg_settings_mask |= SETTING_CAPABILITY;
1060 break;
1061 }
420c7379
LP
1062 case ARG_CAPABILITY:
1063 case ARG_DROP_CAPABILITY: {
8a99bd0c
ZJS
1064 uint64_t m;
1065 r = parse_capability_spec(optarg, &m);
1066 if (r <= 0)
1067 return r;
5076f0cc 1068
8a99bd0c
ZJS
1069 if (c == ARG_CAPABILITY)
1070 plus |= m;
1071 else
1072 minus |= m;
f757855e 1073 arg_settings_mask |= SETTING_CAPABILITY;
5076f0cc
LP
1074 break;
1075 }
66edd963
LP
1076 case ARG_NO_NEW_PRIVILEGES:
1077 r = parse_boolean(optarg);
1078 if (r < 0)
1079 return log_error_errno(r, "Failed to parse --no-new-privileges= argument: %s", optarg);
1080
1081 arg_no_new_privileges = r;
1082 arg_settings_mask |= SETTING_NO_NEW_PRIVILEGES;
1083 break;
1084
57fb9fb5
LP
1085 case 'j':
1086 arg_link_journal = LINK_GUEST;
574edc90 1087 arg_link_journal_try = true;
4e1d6aa9 1088 arg_settings_mask |= SETTING_LINK_JOURNAL;
57fb9fb5
LP
1089 break;
1090
1091 case ARG_LINK_JOURNAL:
4e1d6aa9 1092 r = parse_link_journal(optarg, &arg_link_journal, &arg_link_journal_try);
c6147113
LP
1093 if (r < 0)
1094 return log_error_errno(r, "Failed to parse link journal mode %s", optarg);
57fb9fb5 1095
4e1d6aa9 1096 arg_settings_mask |= SETTING_LINK_JOURNAL;
57fb9fb5
LP
1097 break;
1098
17fe0523 1099 case ARG_BIND:
f757855e
LP
1100 case ARG_BIND_RO:
1101 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
1102 if (r < 0)
1103 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
17fe0523 1104
f757855e 1105 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
17fe0523 1106 break;
06c17c39 1107
f757855e
LP
1108 case ARG_TMPFS:
1109 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
1110 if (r < 0)
1111 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
5a8af538 1112
f757855e 1113 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
5a8af538 1114 break;
5a8af538
LP
1115
1116 case ARG_OVERLAY:
ad85779a
LP
1117 case ARG_OVERLAY_RO:
1118 r = overlay_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_OVERLAY_RO);
1119 if (r == -EADDRNOTAVAIL)
1120 return log_error_errno(r, "--overlay(-ro)= needs at least two colon-separated directories specified.");
1121 if (r < 0)
1122 return log_error_errno(r, "Failed to parse --overlay(-ro)= argument %s: %m", optarg);
06c17c39 1123
f757855e 1124 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
06c17c39 1125 break;
06c17c39 1126
de40a303
LP
1127 case ARG_INACCESSIBLE:
1128 r = inaccessible_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
1129 if (r < 0)
1130 return log_error_errno(r, "Failed to parse --inaccessible= argument %s: %m", optarg);
1131
1132 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1133 break;
1134
0d2a0179
ZJS
1135 case 'E':
1136 r = strv_env_replace_strdup_passthrough(&arg_setenv, optarg);
aaf057c4 1137 if (r < 0)
0d2a0179 1138 return log_error_errno(r, "Cannot assign environment variable %s: %m", optarg);
f4889f65 1139
f757855e 1140 arg_settings_mask |= SETTING_ENVIRONMENT;
f4889f65 1141 break;
f4889f65 1142
284c0b91
LP
1143 case 'q':
1144 arg_quiet = true;
1145 break;
1146
8a96d94e 1147 case ARG_SHARE_SYSTEM:
a6b5216c 1148 /* We don't officially support this anymore, except for compat reasons. People should use the
0c582db0 1149 * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
63d1c29f 1150 log_warning("Please do not use --share-system anymore, use $SYSTEMD_NSPAWN_SHARE_* instead.");
0c582db0 1151 arg_clone_ns_flags = 0;
8a96d94e
LP
1152 break;
1153
eb91eb18
LP
1154 case ARG_REGISTER:
1155 r = parse_boolean(optarg);
1156 if (r < 0) {
1157 log_error("Failed to parse --register= argument: %s", optarg);
1158 return r;
1159 }
1160
1161 arg_register = r;
1162 break;
1163
89f7c846
LP
1164 case ARG_KEEP_UNIT:
1165 arg_keep_unit = true;
1166 break;
1167
6afc95b7
LP
1168 case ARG_PERSONALITY:
1169
ac45f971 1170 arg_personality = personality_from_string(optarg);
baaa35ad
ZJS
1171 if (arg_personality == PERSONALITY_INVALID)
1172 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1173 "Unknown or unsupported personality '%s'.", optarg);
6afc95b7 1174
f757855e 1175 arg_settings_mask |= SETTING_PERSONALITY;
6afc95b7
LP
1176 break;
1177
4d9f07b4
LP
1178 case ARG_VOLATILE:
1179
1180 if (!optarg)
f757855e 1181 arg_volatile_mode = VOLATILE_YES;
5c828e66
LP
1182 else if (streq(optarg, "help")) {
1183 DUMP_STRING_TABLE(volatile_mode, VolatileMode, _VOLATILE_MODE_MAX);
1184 return 0;
1185 } else {
f757855e 1186 VolatileMode m;
4d9f07b4 1187
f757855e 1188 m = volatile_mode_from_string(optarg);
baaa35ad
ZJS
1189 if (m < 0)
1190 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1191 "Failed to parse --volatile= argument: %s", optarg);
1192 else
f757855e 1193 arg_volatile_mode = m;
6d0b55c2
LP
1194 }
1195
f757855e
LP
1196 arg_settings_mask |= SETTING_VOLATILE_MODE;
1197 break;
6d0b55c2 1198
f757855e
LP
1199 case 'p':
1200 r = expose_port_parse(&arg_expose_ports, optarg);
1201 if (r == -EEXIST)
1202 return log_error_errno(r, "Duplicate port specification: %s", optarg);
1203 if (r < 0)
1204 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
6d0b55c2 1205
f757855e 1206 arg_settings_mask |= SETTING_EXPOSE_PORTS;
6d0b55c2 1207 break;
6d0b55c2 1208
f36933fe
LP
1209 case ARG_PROPERTY:
1210 if (strv_extend(&arg_property, optarg) < 0)
1211 return log_oom();
1212
1213 break;
1214
ae209204 1215 case ARG_PRIVATE_USERS: {
33eac552 1216 int boolean;
0de7acce 1217
ae209204
ZJS
1218 if (!optarg)
1219 boolean = true;
1220 else if (!in_charset(optarg, DIGITS))
1221 /* do *not* parse numbers as booleans */
1222 boolean = parse_boolean(optarg);
33eac552
LP
1223 else
1224 boolean = -1;
ae209204 1225
33eac552 1226 if (boolean == 0) {
0de7acce
LP
1227 /* no: User namespacing off */
1228 arg_userns_mode = USER_NAMESPACE_NO;
1229 arg_uid_shift = UID_INVALID;
1230 arg_uid_range = UINT32_C(0x10000);
33eac552 1231 } else if (boolean > 0) {
0de7acce
LP
1232 /* yes: User namespacing on, UID range is read from root dir */
1233 arg_userns_mode = USER_NAMESPACE_FIXED;
1234 arg_uid_shift = UID_INVALID;
1235 arg_uid_range = UINT32_C(0x10000);
1236 } else if (streq(optarg, "pick")) {
1237 /* pick: User namespacing on, UID range is picked randomly */
6c045a99
LP
1238 arg_userns_mode = USER_NAMESPACE_PICK; /* Note that arg_userns_ownership is
1239 * implied by USER_NAMESPACE_PICK
33eac552 1240 * further down. */
0de7acce
LP
1241 arg_uid_shift = UID_INVALID;
1242 arg_uid_range = UINT32_C(0x10000);
33eac552
LP
1243
1244 } else if (streq(optarg, "identity")) {
6c2d70ce 1245 /* identity: User namespaces on, and we map the 0…0xFFFF UID range to
33eac552
LP
1246 * itself, i.e. we don't actually map anything, but do take benefit of
1247 * isolation of capability sets. */
1248 arg_userns_mode = USER_NAMESPACE_FIXED;
1249 arg_uid_shift = 0;
1250 arg_uid_range = UINT32_C(0x10000);
0de7acce 1251 } else {
6c2058b3 1252 _cleanup_free_ char *buffer = NULL;
6dac160c
LP
1253 const char *range, *shift;
1254
0de7acce
LP
1255 /* anything else: User namespacing on, UID range is explicitly configured */
1256
6dac160c
LP
1257 range = strchr(optarg, ':');
1258 if (range) {
6c2058b3
ZJS
1259 buffer = strndup(optarg, range - optarg);
1260 if (!buffer)
1261 return log_oom();
1262 shift = buffer;
6dac160c
LP
1263
1264 range++;
bfd292ec
ZJS
1265 r = safe_atou32(range, &arg_uid_range);
1266 if (r < 0)
be715731 1267 return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range);
6dac160c
LP
1268 } else
1269 shift = optarg;
1270
be715731
ZJS
1271 r = parse_uid(shift, &arg_uid_shift);
1272 if (r < 0)
1273 return log_error_errno(r, "Failed to parse UID \"%s\": %m", optarg);
0de7acce
LP
1274
1275 arg_userns_mode = USER_NAMESPACE_FIXED;
6dac160c 1276
58e13de5
LP
1277 if (!userns_shift_range_valid(arg_uid_shift, arg_uid_range))
1278 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "UID range cannot be empty or go beyond " UID_FMT ".", UID_INVALID);
1279 }
be715731 1280
0de7acce 1281 arg_settings_mask |= SETTING_USERNS;
6dac160c 1282 break;
ae209204 1283 }
6dac160c 1284
0de7acce 1285 case 'U':
ccabee0d 1286 if (userns_supported()) {
6c045a99
LP
1287 arg_userns_mode = USER_NAMESPACE_PICK; /* Note that arg_userns_ownership is
1288 * implied by USER_NAMESPACE_PICK
33eac552 1289 * further down. */
ccabee0d
LP
1290 arg_uid_shift = UID_INVALID;
1291 arg_uid_range = UINT32_C(0x10000);
1292
1293 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
1294 }
1295
7336138e
LP
1296 break;
1297
0de7acce 1298 case ARG_PRIVATE_USERS_CHOWN:
6c045a99
LP
1299 arg_userns_ownership = USER_NAMESPACE_OWNERSHIP_CHOWN;
1300
1301 arg_settings_mask |= SETTING_USERNS;
1302 break;
1303
1304 case ARG_PRIVATE_USERS_OWNERSHIP:
1305 if (streq(optarg, "help")) {
1306 DUMP_STRING_TABLE(user_namespace_ownership, UserNamespaceOwnership, _USER_NAMESPACE_OWNERSHIP_MAX);
1307 return 0;
1308 }
1309
1310 arg_userns_ownership = user_namespace_ownership_from_string(optarg);
1311 if (arg_userns_ownership < 0)
1312 return log_error_errno(arg_userns_ownership, "Cannot parse --user-namespace-ownership= value: %s", optarg);
0de7acce
LP
1313
1314 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
1315 break;
1316
c6c8f6e2 1317 case ARG_KILL_SIGNAL:
5c828e66
LP
1318 if (streq(optarg, "help")) {
1319 DUMP_STRING_TABLE(signal, int, _NSIG);
1320 return 0;
1321 }
1322
29a3db75 1323 arg_kill_signal = signal_from_string(optarg);
baaa35ad 1324 if (arg_kill_signal < 0)
7211c853 1325 return log_error_errno(arg_kill_signal, "Cannot parse signal: %s", optarg);
c6c8f6e2 1326
f757855e
LP
1327 arg_settings_mask |= SETTING_KILL_SIGNAL;
1328 break;
1329
1330 case ARG_SETTINGS:
1331
1332 /* no → do not read files
1333 * yes → read files, do not override cmdline, trust only subset
1334 * override → read files, override cmdline, trust only subset
1335 * trusted → read files, do not override cmdline, trust all
1336 */
1337
1338 r = parse_boolean(optarg);
1339 if (r < 0) {
1340 if (streq(optarg, "trusted")) {
1341 mask_all_settings = false;
1342 mask_no_settings = false;
1343 arg_settings_trusted = true;
1344
1345 } else if (streq(optarg, "override")) {
1346 mask_all_settings = false;
1347 mask_no_settings = true;
1348 arg_settings_trusted = -1;
1349 } else
1350 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
1351 } else if (r > 0) {
1352 /* yes */
1353 mask_all_settings = false;
1354 mask_no_settings = false;
1355 arg_settings_trusted = -1;
1356 } else {
1357 /* no */
1358 mask_all_settings = true;
1359 mask_no_settings = false;
1360 arg_settings_trusted = false;
1361 }
1362
c6c8f6e2
LP
1363 break;
1364
5f932eb9 1365 case ARG_CHDIR:
baaa35ad
ZJS
1366 if (!path_is_absolute(optarg))
1367 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1368 "Working directory %s is not an absolute path.", optarg);
5f932eb9
LP
1369
1370 r = free_and_strdup(&arg_chdir, optarg);
1371 if (r < 0)
1372 return log_oom();
1373
1374 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
1375 break;
1376
b53ede69
PW
1377 case ARG_PIVOT_ROOT:
1378 r = pivot_root_parse(&arg_pivot_root_new, &arg_pivot_root_old, optarg);
1379 if (r < 0)
1380 return log_error_errno(r, "Failed to parse --pivot-root= argument %s: %m", optarg);
1381
1382 arg_settings_mask |= SETTING_PIVOT_ROOT;
1383 break;
1384
9c1e04d0
AP
1385 case ARG_NOTIFY_READY:
1386 r = parse_boolean(optarg);
baaa35ad
ZJS
1387 if (r < 0)
1388 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1389 "%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
9c1e04d0
AP
1390 arg_notify_ready = r;
1391 arg_settings_mask |= SETTING_NOTIFY_READY;
1392 break;
1393
4623e8e6 1394 case ARG_ROOT_HASH: {
89e62e0b 1395 _cleanup_free_ void *k = NULL;
4623e8e6
LP
1396 size_t l;
1397
1398 r = unhexmem(optarg, strlen(optarg), &k, &l);
1399 if (r < 0)
1400 return log_error_errno(r, "Failed to parse root hash: %s", optarg);
89e62e0b 1401 if (l < sizeof(sd_id128_t))
da890466 1402 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Root hash must be at least 128-bit long: %s", optarg);
4623e8e6 1403
89e62e0b
LP
1404 free_and_replace(arg_verity_settings.root_hash, k);
1405 arg_verity_settings.root_hash_size = l;
4623e8e6
LP
1406 break;
1407 }
1408
c2923fdc
LB
1409 case ARG_ROOT_HASH_SIG: {
1410 char *value;
89e62e0b
LP
1411 size_t l;
1412 void *p;
c2923fdc
LB
1413
1414 if ((value = startswith(optarg, "base64:"))) {
c2923fdc
LB
1415 r = unbase64mem(value, strlen(value), &p, &l);
1416 if (r < 0)
1417 return log_error_errno(r, "Failed to parse root hash signature '%s': %m", optarg);
1418
c2923fdc 1419 } else {
89e62e0b 1420 r = read_full_file(optarg, (char**) &p, &l);
c2923fdc 1421 if (r < 0)
89e62e0b 1422 return log_error_errno(r, "Failed parse root hash signature file '%s': %m", optarg);
c2923fdc
LB
1423 }
1424
89e62e0b
LP
1425 free_and_replace(arg_verity_settings.root_hash_sig, p);
1426 arg_verity_settings.root_hash_sig_size = l;
c2923fdc
LB
1427 break;
1428 }
1429
89e62e0b 1430 case ARG_VERITY_DATA:
614b022c 1431 r = parse_path_argument(optarg, false, &arg_verity_settings.data_path);
89e62e0b
LP
1432 if (r < 0)
1433 return r;
1434 break;
1435
960e4569
LP
1436 case ARG_SYSTEM_CALL_FILTER: {
1437 bool negative;
1438 const char *items;
1439
1440 negative = optarg[0] == '~';
1441 items = negative ? optarg + 1 : optarg;
1442
1443 for (;;) {
1444 _cleanup_free_ char *word = NULL;
1445
1446 r = extract_first_word(&items, &word, NULL, 0);
1447 if (r == 0)
1448 break;
1449 if (r == -ENOMEM)
1450 return log_oom();
1451 if (r < 0)
1452 return log_error_errno(r, "Failed to parse system call filter: %m");
1453
1454 if (negative)
6b000af4 1455 r = strv_extend(&arg_syscall_deny_list, word);
960e4569 1456 else
6b000af4 1457 r = strv_extend(&arg_syscall_allow_list, word);
960e4569
LP
1458 if (r < 0)
1459 return log_oom();
1460 }
1461
1462 arg_settings_mask |= SETTING_SYSCALL_FILTER;
1463 break;
1464 }
1465
bf428efb
LP
1466 case ARG_RLIMIT: {
1467 const char *eq;
622ecfa8 1468 _cleanup_free_ char *name = NULL;
bf428efb
LP
1469 int rl;
1470
5c828e66
LP
1471 if (streq(optarg, "help")) {
1472 DUMP_STRING_TABLE(rlimit, int, _RLIMIT_MAX);
1473 return 0;
1474 }
1475
bf428efb 1476 eq = strchr(optarg, '=');
baaa35ad
ZJS
1477 if (!eq)
1478 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1479 "--rlimit= expects an '=' assignment.");
bf428efb
LP
1480
1481 name = strndup(optarg, eq - optarg);
1482 if (!name)
1483 return log_oom();
1484
1485 rl = rlimit_from_string_harder(name);
baaa35ad 1486 if (rl < 0)
7211c853 1487 return log_error_errno(rl, "Unknown resource limit: %s", name);
bf428efb
LP
1488
1489 if (!arg_rlimit[rl]) {
1490 arg_rlimit[rl] = new0(struct rlimit, 1);
1491 if (!arg_rlimit[rl])
1492 return log_oom();
1493 }
1494
1495 r = rlimit_parse(rl, eq + 1, arg_rlimit[rl]);
1496 if (r < 0)
1497 return log_error_errno(r, "Failed to parse resource limit: %s", eq + 1);
1498
1499 arg_settings_mask |= SETTING_RLIMIT_FIRST << rl;
1500 break;
1501 }
1502
81f345df
LP
1503 case ARG_OOM_SCORE_ADJUST:
1504 r = parse_oom_score_adjust(optarg, &arg_oom_score_adjust);
1505 if (r < 0)
1506 return log_error_errno(r, "Failed to parse --oom-score-adjust= parameter: %s", optarg);
1507
1508 arg_oom_score_adjust_set = true;
1509 arg_settings_mask |= SETTING_OOM_SCORE_ADJUST;
1510 break;
1511
d107bb7d 1512 case ARG_CPU_AFFINITY: {
0985c7c4 1513 CPUSet cpuset;
d107bb7d
LP
1514
1515 r = parse_cpu_set(optarg, &cpuset);
1516 if (r < 0)
0985c7c4 1517 return log_error_errno(r, "Failed to parse CPU affinity mask %s: %m", optarg);
d107bb7d 1518
0985c7c4
ZJS
1519 cpu_set_reset(&arg_cpu_set);
1520 arg_cpu_set = cpuset;
d107bb7d
LP
1521 arg_settings_mask |= SETTING_CPU_AFFINITY;
1522 break;
1523 }
1524
09d423e9
LP
1525 case ARG_RESOLV_CONF:
1526 if (streq(optarg, "help")) {
1527 DUMP_STRING_TABLE(resolv_conf_mode, ResolvConfMode, _RESOLV_CONF_MODE_MAX);
1528 return 0;
1529 }
1530
1531 arg_resolv_conf = resolv_conf_mode_from_string(optarg);
baaa35ad 1532 if (arg_resolv_conf < 0)
7211c853 1533 return log_error_errno(arg_resolv_conf,
baaa35ad 1534 "Failed to parse /etc/resolv.conf mode: %s", optarg);
09d423e9
LP
1535
1536 arg_settings_mask |= SETTING_RESOLV_CONF;
1537 break;
1538
1688841f
LP
1539 case ARG_TIMEZONE:
1540 if (streq(optarg, "help")) {
1541 DUMP_STRING_TABLE(timezone_mode, TimezoneMode, _TIMEZONE_MODE_MAX);
1542 return 0;
1543 }
1544
1545 arg_timezone = timezone_mode_from_string(optarg);
baaa35ad 1546 if (arg_timezone < 0)
7211c853 1547 return log_error_errno(arg_timezone,
baaa35ad 1548 "Failed to parse /etc/localtime mode: %s", optarg);
1688841f
LP
1549
1550 arg_settings_mask |= SETTING_TIMEZONE;
1551 break;
1552
de40a303 1553 case ARG_CONSOLE:
dce66ffe
ZJS
1554 r = handle_arg_console(optarg);
1555 if (r <= 0)
1556 return r;
de40a303
LP
1557 break;
1558
1559 case 'P':
1560 case ARG_PIPE:
dce66ffe
ZJS
1561 r = handle_arg_console("pipe");
1562 if (r <= 0)
1563 return r;
de40a303
LP
1564 break;
1565
bb068de0
ZJS
1566 case ARG_NO_PAGER:
1567 arg_pager_flags |= PAGER_DISABLE;
1568 break;
1569
3652872a
LP
1570 case ARG_SET_CREDENTIAL: {
1571 _cleanup_free_ char *word = NULL, *data = NULL;
1572 const char *p = optarg;
1573 Credential *a;
e437538f 1574 ssize_t l;
3652872a
LP
1575
1576 r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
1577 if (r == -ENOMEM)
1578 return log_oom();
1579 if (r < 0)
1580 return log_error_errno(r, "Failed to parse --set-credential= parameter: %m");
1581 if (r == 0 || !p)
1582 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Missing value for --set-credential=: %s", optarg);
1583
1584 if (!credential_name_valid(word))
1585 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Credential name is not valid: %s", word);
1586
12d729b2 1587 for (size_t i = 0; i < arg_n_credentials; i++)
3652872a
LP
1588 if (streq(arg_credentials[i].id, word))
1589 return log_error_errno(SYNTHETIC_ERRNO(EEXIST), "Duplicate credential '%s', refusing.", word);
1590
1591 l = cunescape(p, UNESCAPE_ACCEPT_NUL, &data);
1592 if (l < 0)
1593 return log_error_errno(l, "Failed to unescape credential data: %s", p);
1594
1595 a = reallocarray(arg_credentials, arg_n_credentials + 1, sizeof(Credential));
1596 if (!a)
1597 return log_oom();
1598
1599 a[arg_n_credentials++] = (Credential) {
1600 .id = TAKE_PTR(word),
1601 .data = TAKE_PTR(data),
1602 .size = l,
1603 };
1604
1605 arg_credentials = a;
1606
1607 arg_settings_mask |= SETTING_CREDENTIALS;
1608 break;
1609 }
1610
1611 case ARG_LOAD_CREDENTIAL: {
1612 ReadFullFileFlags flags = READ_FULL_FILE_SECURE;
1613 _cleanup_(erase_and_freep) char *data = NULL;
1614 _cleanup_free_ char *word = NULL, *j = NULL;
1615 const char *p = optarg;
1616 Credential *a;
1617 size_t size, i;
1618
1619 r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
1620 if (r == -ENOMEM)
1621 return log_oom();
1622 if (r < 0)
c941b650 1623 return log_error_errno(r, "Failed to parse --load-credential= parameter: %m");
3652872a 1624 if (r == 0 || !p)
c941b650 1625 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Missing value for --load-credential=: %s", optarg);
3652872a
LP
1626
1627 if (!credential_name_valid(word))
1628 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Credential name is not valid: %s", word);
1629
1630 for (i = 0; i < arg_n_credentials; i++)
1631 if (streq(arg_credentials[i].id, word))
1632 return log_error_errno(SYNTHETIC_ERRNO(EEXIST), "Duplicate credential '%s', refusing.", word);
1633
1634 if (path_is_absolute(p))
1635 flags |= READ_FULL_FILE_CONNECT_SOCKET;
1636 else {
1637 const char *e;
1638
786d19fd
LP
1639 r = get_credentials_dir(&e);
1640 if (r < 0)
1641 return log_error_errno(r, "Credential not available (no credentials passed at all): %s", word);
3652872a
LP
1642
1643 j = path_join(e, p);
1644 if (!j)
1645 return log_oom();
1646 }
1647
986311c2
LP
1648 r = read_full_file_full(AT_FDCWD, j ?: p, UINT64_MAX, SIZE_MAX,
1649 flags,
1650 NULL,
1651 &data, &size);
3652872a
LP
1652 if (r < 0)
1653 return log_error_errno(r, "Failed to read credential '%s': %m", j ?: p);
1654
1655 a = reallocarray(arg_credentials, arg_n_credentials + 1, sizeof(Credential));
1656 if (!a)
1657 return log_oom();
1658
1659 a[arg_n_credentials++] = (Credential) {
1660 .id = TAKE_PTR(word),
1661 .data = TAKE_PTR(data),
1662 .size = size,
1663 };
1664
1665 arg_credentials = a;
1666
1667 arg_settings_mask |= SETTING_CREDENTIALS;
1668 break;
1669 }
1670
2f893044
LP
1671 case ARG_BIND_USER:
1672 if (!valid_user_group_name(optarg, 0))
1673 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid user name to bind: %s", optarg);
1674
1675 if (strv_extend(&arg_bind_user, optarg) < 0)
1676 return log_oom();
1677
1678 arg_settings_mask |= SETTING_BIND_USER;
1679 break;
1680
4a4654e0
LP
1681 case ARG_SUPPRESS_SYNC:
1682 r = parse_boolean_argument("--suppress-sync=", optarg, &arg_suppress_sync);
1683 if (r < 0)
1684 return r;
1685
1686 arg_settings_mask |= SETTING_SUPPRESS_SYNC;
1687 break;
1688
06e78680
YW
1689 case ARG_IMAGE_POLICY:
1690 r = parse_image_policy_argument(optarg, &arg_image_policy);
84be0c71 1691 if (r < 0)
06e78680 1692 return r;
84be0c71 1693 break;
84be0c71 1694
88213476
LP
1695 case '?':
1696 return -EINVAL;
1697
1698 default:
04499a70 1699 assert_not_reached();
88213476 1700 }
88213476 1701
60f1ec13
LP
1702 if (argc > optind) {
1703 strv_free(arg_parameters);
1704 arg_parameters = strv_copy(argv + optind);
1705 if (!arg_parameters)
1706 return log_oom();
d7bea6b6 1707
60f1ec13
LP
1708 arg_settings_mask |= SETTING_START_MODE;
1709 }
1710
1711 if (arg_ephemeral && arg_template && !arg_directory)
1712 /* User asked for ephemeral execution but specified --template= instead of --directory=. Semantically
1713 * such an invocation makes some sense, see https://github.com/systemd/systemd/issues/3667. Let's
1714 * accept this here, and silently make "--ephemeral --template=" equivalent to "--ephemeral
1715 * --directory=". */
1716 arg_directory = TAKE_PTR(arg_template);
1717
2642d22a
DDM
1718 arg_caps_retain |= plus;
1719 arg_caps_retain |= arg_private_network ? UINT64_C(1) << CAP_NET_ADMIN : 0;
1720
1721 /* If we're not unsharing the network namespace and are unsharing the user namespace, we won't have
1722 * permissions to bind ports in the container, so let's drop the CAP_NET_BIND_SERVICE capability to
1723 * indicate that. */
1724 if (!arg_private_network && arg_userns_mode != USER_NAMESPACE_NO && arg_uid_shift > 0)
1725 arg_caps_retain &= ~(UINT64_C(1) << CAP_NET_BIND_SERVICE);
1726
1727 arg_caps_retain &= ~minus;
60f1ec13 1728
de40a303 1729 /* Make sure to parse environment before we reset the settings mask below */
49048684
ZJS
1730 r = parse_environment();
1731 if (r < 0)
1732 return r;
de40a303 1733
60f1ec13
LP
1734 /* Load all settings from .nspawn files */
1735 if (mask_no_settings)
1736 arg_settings_mask = 0;
1737
1738 /* Don't load any settings from .nspawn files */
1739 if (mask_all_settings)
1740 arg_settings_mask = _SETTINGS_MASK_ALL;
1741
1742 return 1;
1743}
1744
1745static int verify_arguments(void) {
1746 int r;
a6b5216c 1747
75b0d8b8
ZJS
1748 if (arg_start_mode == START_PID2 && arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
1749 /* If we are running the stub init in the container, we don't need to look at what the init
1750 * in the container supports, because we are not using it. Let's immediately pick the right
1751 * setting based on the host system configuration.
1752 *
1753 * We only do this, if the user didn't use an environment variable to override the detection.
1754 */
1755
1756 r = cg_all_unified();
1757 if (r < 0)
1758 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
1759 if (r > 0)
1760 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
1761 else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0)
1762 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
1763 else
1764 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
1765 }
1766
4f086aab
SU
1767 if (arg_userns_mode != USER_NAMESPACE_NO)
1768 arg_mount_settings |= MOUNT_USE_USERNS;
1769
1770 if (arg_private_network)
1771 arg_mount_settings |= MOUNT_APPLY_APIVFS_NETNS;
1772
48a8d337
LB
1773 if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
1774 !(arg_clone_ns_flags & CLONE_NEWUTS)) {
eb91eb18 1775 arg_register = false;
baaa35ad 1776 if (arg_start_mode != START_PID1)
60f1ec13 1777 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--boot cannot be used without namespacing.");
0c582db0 1778 }
eb91eb18 1779
6c045a99
LP
1780 if (arg_userns_ownership < 0)
1781 arg_userns_ownership =
f61c7f88 1782 arg_userns_mode == USER_NAMESPACE_PICK ? USER_NAMESPACE_OWNERSHIP_AUTO :
6c045a99 1783 USER_NAMESPACE_OWNERSHIP_OFF;
0e7ac751 1784
60f1ec13
LP
1785 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
1786 arg_kill_signal = SIGRTMIN+3;
1787
e5a4bb0d
LP
1788 if (arg_volatile_mode != VOLATILE_NO) /* Make sure all file systems contained in the image are mounted read-only if we are in volatile mode */
1789 arg_read_only = true;
1790
2436ea76
DDM
1791 if (has_custom_root_mount(arg_custom_mounts, arg_n_custom_mounts))
1792 arg_read_only = true;
1793
baaa35ad 1794 if (arg_keep_unit && arg_register && cg_pid_get_owner_uid(0, NULL) >= 0)
8d9c2bca
AJ
1795 /* Save the user from accidentally registering either user-$SESSION.scope or user@.service.
1796 * The latter is not technically a user session, but we don't need to labour the point. */
60f1ec13 1797 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--keep-unit --register=yes may not be used when invoked from a user session.");
89f7c846 1798
baaa35ad 1799 if (arg_directory && arg_image)
60f1ec13 1800 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--directory= and --image= may not be combined.");
1b9e5b12 1801
baaa35ad 1802 if (arg_template && arg_image)
60f1ec13 1803 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= and --image= may not be combined.");
8cd328d8 1804
baaa35ad 1805 if (arg_template && !(arg_directory || arg_machine))
60f1ec13 1806 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= needs --directory= or --machine=.");
ec16945e 1807
baaa35ad 1808 if (arg_ephemeral && arg_template)
60f1ec13 1809 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --template= may not be combined.");
ec16945e 1810
baaa35ad 1811 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO))
60f1ec13 1812 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --link-journal= may not be combined.");
df9a75e4 1813
baaa35ad 1814 if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported())
60f1ec13 1815 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--private-users= is not supported, kernel compiled without user namespace support.");
7336138e 1816
6c045a99 1817 if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_CHOWN && arg_read_only)
de40a303 1818 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
6c045a99 1819 "--read-only and --private-users-ownership=chown may not be combined.");
f757855e 1820
6c045a99
LP
1821 /* We don't support --private-users-ownership=chown together with any of the volatile modes since we
1822 * couldn't change the read-only part of the tree (i.e. /usr) anyway, or because it would trigger a
1823 * massive copy-up (in case of overlay) making the entire exercise pointless. */
1824 if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_CHOWN && arg_volatile_mode != VOLATILE_NO)
1825 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--volatile= and --private-users-ownership=chown may not be combined.");
e5a4bb0d 1826
679ecd36
SZ
1827 /* If --network-namespace-path is given with any other network-related option (except --private-network),
1828 * we need to error out, to avoid conflicts between different network options. */
60f1ec13
LP
1829 if (arg_network_namespace_path &&
1830 (arg_network_interfaces || arg_network_macvlan ||
1831 arg_network_ipvlan || arg_network_veth_extra ||
1832 arg_network_bridge || arg_network_zone ||
679ecd36 1833 arg_network_veth))
de40a303 1834 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--network-namespace-path= cannot be combined with other network options.");
86c0dd4a 1835
60f1ec13 1836 if (arg_network_bridge && arg_network_zone)
de40a303
LP
1837 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1838 "--network-bridge= and --network-zone= may not be combined.");
f757855e 1839
baaa35ad 1840 if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network)
60f1ec13 1841 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
4f086aab 1842
baaa35ad 1843 if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO))
60f1ec13 1844 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot combine --private-users with read-write mounts.");
f757855e 1845
baaa35ad 1846 if (arg_expose_ports && !arg_private_network)
60f1ec13 1847 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot use --port= without private networking.");
6d0b55c2 1848
88fc9c9b 1849 if (arg_caps_ambient) {
f5fbe71d 1850 if (arg_caps_ambient == UINT64_MAX)
88fc9c9b
TH
1851 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= does not support the value all.");
1852
1853 if ((arg_caps_ambient & arg_caps_retain) != arg_caps_ambient)
1854 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= setting is not fully covered by Capability= setting.");
1855
1856 if (arg_start_mode == START_BOOT)
1857 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= setting is not useful for boot mode.");
1858 }
1859
2f893044
LP
1860 if (arg_userns_mode == USER_NAMESPACE_NO && !strv_isempty(arg_bind_user))
1861 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--bind-user= requires --private-users");
1862
1863 /* Drop duplicate --bind-user= entries */
1864 strv_uniq(arg_bind_user);
1865
60f1ec13
LP
1866 r = custom_mount_check_all();
1867 if (r < 0)
1868 return r;
c6c8f6e2 1869
f757855e 1870 return 0;
88213476
LP
1871}
1872
2f091b1b
TM
1873static int verify_network_interfaces_initialized(void) {
1874 int r;
1875 r = test_network_interfaces_initialized(arg_network_interfaces);
1876 if (r < 0)
1877 return r;
1878
1879 r = test_network_interfaces_initialized(arg_network_macvlan);
1880 if (r < 0)
1881 return r;
1882
1883 r = test_network_interfaces_initialized(arg_network_ipvlan);
1884 if (r < 0)
1885 return r;
1886
1887 return 0;
1888}
1889
91181e07 1890int userns_lchown(const char *p, uid_t uid, gid_t gid) {
03cfe0d5
LP
1891 assert(p);
1892
0de7acce 1893 if (arg_userns_mode == USER_NAMESPACE_NO)
03cfe0d5
LP
1894 return 0;
1895
1896 if (uid == UID_INVALID && gid == GID_INVALID)
1897 return 0;
1898
1899 if (uid != UID_INVALID) {
1900 uid += arg_uid_shift;
1901
1902 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1903 return -EOVERFLOW;
1904 }
1905
1906 if (gid != GID_INVALID) {
1907 gid += (gid_t) arg_uid_shift;
1908
1909 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1910 return -EOVERFLOW;
1911 }
1912
7c248223 1913 return RET_NERRNO(lchown(p, uid, gid));
b12afc8c
LP
1914}
1915
/* Creates directory 'path' below 'root' with the given mode and, if it was newly created, hands it to
 * the (shifted) uid/gid via userns_lchown(). A directory that already exists is left untouched and
 * counts as success. Returns 0 on success, negative errno-style value otherwise. */
int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) < 0)
                return errno == EEXIST ? 0 : -errno;

        return userns_lchown(full, uid, gid);
}
1929
/* Matches 'path' against the two accepted zoneinfo prefixes (relative "../usr/share/zoneinfo/" and
 * absolute "/usr/share/zoneinfo/") and returns whatever PATH_STARTSWITH_SET() yields for it. Callers in
 * this file treat a NULL result as "does not point into zoneinfo" and a non-NULL result as the zone
 * name. */
static const char *timezone_from_path(const char *path) {
        const char *zone;

        zone = PATH_STARTSWITH_SET(path, "../usr/share/zoneinfo/", "/usr/share/zoneinfo/");

        return zone;
}
1936
83205269
LP
1937static bool etc_writable(void) {
1938 return !arg_read_only || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_OVERLAY);
1939}
1940
e58a1277 1941static int setup_timezone(const char *dest) {
1688841f
LP
1942 _cleanup_free_ char *p = NULL, *etc = NULL;
1943 const char *where, *check;
1944 TimezoneMode m;
d4036145 1945 int r;
f8440af5 1946
e58a1277
LP
1947 assert(dest);
1948
1688841f 1949 if (IN_SET(arg_timezone, TIMEZONE_AUTO, TIMEZONE_SYMLINK)) {
1688841f
LP
1950 r = readlink_malloc("/etc/localtime", &p);
1951 if (r == -ENOENT && arg_timezone == TIMEZONE_AUTO)
83205269 1952 m = etc_writable() ? TIMEZONE_DELETE : TIMEZONE_OFF;
1688841f 1953 else if (r == -EINVAL && arg_timezone == TIMEZONE_AUTO) /* regular file? */
83205269 1954 m = etc_writable() ? TIMEZONE_COPY : TIMEZONE_BIND;
1688841f
LP
1955 else if (r < 0) {
1956 log_warning_errno(r, "Failed to read host's /etc/localtime symlink, not updating container timezone: %m");
1957 /* To handle warning, delete /etc/localtime and replace it with a symbolic link to a time zone data
1958 * file.
1959 *
1960 * Example:
1961 * ln -s /usr/share/zoneinfo/UTC /etc/localtime
1962 */
1963 return 0;
1964 } else if (arg_timezone == TIMEZONE_AUTO)
83205269 1965 m = etc_writable() ? TIMEZONE_SYMLINK : TIMEZONE_BIND;
1688841f
LP
1966 else
1967 m = arg_timezone;
1968 } else
1969 m = arg_timezone;
1970
1971 if (m == TIMEZONE_OFF)
1972 return 0;
1973
f461a28d 1974 r = chase("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
d4036145 1975 if (r < 0) {
1688841f 1976 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
d4036145
LP
1977 return 0;
1978 }
1979
1688841f
LP
1980 where = strjoina(etc, "/localtime");
1981
1982 switch (m) {
1983
1984 case TIMEZONE_DELETE:
1985 if (unlink(where) < 0)
1986 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
1987
d4036145 1988 return 0;
d4036145 1989
1688841f
LP
1990 case TIMEZONE_SYMLINK: {
1991 _cleanup_free_ char *q = NULL;
1992 const char *z, *what;
4d1c38b8 1993
1688841f
LP
1994 z = timezone_from_path(p);
1995 if (!z) {
1996 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
d4036145 1997 return 0;
1688841f 1998 }
d4036145 1999
1688841f
LP
2000 r = readlink_malloc(where, &q);
2001 if (r >= 0 && streq_ptr(timezone_from_path(q), z))
2002 return 0; /* Already pointing to the right place? Then do nothing .. */
2003
2004 check = strjoina(dest, "/usr/share/zoneinfo/", z);
f461a28d 2005 r = chase(check, dest, 0, NULL, NULL);
1688841f
LP
2006 if (r < 0)
2007 log_debug_errno(r, "Timezone %s does not exist (or is not accessible) in container, not creating symlink: %m", z);
2008 else {
2009 if (unlink(where) < 0 && errno != ENOENT) {
2010 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING, /* Don't complain on read-only images */
2011 errno, "Failed to remove existing timezone info %s in container, ignoring: %m", where);
2012 return 0;
2013 }
2014
2015 what = strjoina("../usr/share/zoneinfo/", z);
2016 if (symlink(what, where) < 0) {
2017 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING,
2018 errno, "Failed to correct timezone of container, ignoring: %m");
2019 return 0;
2020 }
2021
2022 break;
2023 }
2024
2025 _fallthrough_;
d4036145 2026 }
68fb0892 2027
1688841f
LP
2028 case TIMEZONE_BIND: {
2029 _cleanup_free_ char *resolved = NULL;
2030 int found;
2031
f461a28d 2032 found = chase(where, dest, CHASE_NONEXISTENT, &resolved, NULL);
1688841f
LP
2033 if (found < 0) {
2034 log_warning_errno(found, "Failed to resolve /etc/localtime path in container, ignoring: %m");
2035 return 0;
2036 }
2037
2038 if (found == 0) /* missing? */
2039 (void) touch(resolved);
2040
511a8cfe 2041 r = mount_nofollow_verbose(LOG_WARNING, "/etc/localtime", resolved, NULL, MS_BIND, NULL);
1688841f 2042 if (r >= 0)
511a8cfe 2043 return mount_nofollow_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
1688841f
LP
2044
2045 _fallthrough_;
79d80fc1 2046 }
4d9f07b4 2047
1688841f
LP
2048 case TIMEZONE_COPY:
2049 /* If mounting failed, try to copy */
7c2f5495 2050 r = copy_file_atomic("/etc/localtime", where, 0644, COPY_REFLINK|COPY_REPLACE);
1688841f
LP
2051 if (r < 0) {
2052 log_full_errno(IN_SET(r, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
2053 "Failed to copy /etc/localtime to %s, ignoring: %m", where);
2054 return 0;
2055 }
2056
2057 break;
2058
2059 default:
04499a70 2060 assert_not_reached();
d4036145 2061 }
e58a1277 2062
1688841f 2063 /* Fix permissions of the symlink or file copy we just created */
03cfe0d5
LP
2064 r = userns_lchown(where, 0, 0);
2065 if (r < 0)
1688841f 2066 log_warning_errno(r, "Failed to chown /etc/localtime, ignoring: %m");
03cfe0d5 2067
e58a1277 2068 return 0;
88213476
LP
2069}
2070
09d423e9
LP
/* Checks whether 'path' exists. Returns 1 if it does, 0 if it does not (ENOENT), and a negative
 * errno-style value (logged at debug level) if the check itself failed. */
static int have_resolv_conf(const char *path) {
        assert(path);

        if (access(path, F_OK) >= 0)
                return 1;

        if (errno == ENOENT)
                return 0;

        return log_debug_errno(errno, "Failed to determine whether '%s' is available: %m", path);
}
2083
7357272e 2084static int resolved_listening(void) {
b8ea7a6e 2085 _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
b053cd5f 2086 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
7357272e 2087 _cleanup_free_ char *dns_stub_listener_mode = NULL;
b053cd5f
LP
2088 int r;
2089
7357272e 2090 /* Check if resolved is listening */
b053cd5f
LP
2091
2092 r = sd_bus_open_system(&bus);
2093 if (r < 0)
b8ea7a6e 2094 return log_debug_errno(r, "Failed to open system bus: %m");
b053cd5f 2095
7357272e 2096 r = bus_name_has_owner(bus, "org.freedesktop.resolve1", NULL);
b8ea7a6e
LP
2097 if (r < 0)
2098 return log_debug_errno(r, "Failed to check whether the 'org.freedesktop.resolve1' bus name is taken: %m");
2099 if (r == 0)
2100 return 0;
7357272e 2101
7f8a85e6 2102 r = bus_get_property_string(bus, bus_resolve_mgr, "DNSStubListener", &error, &dns_stub_listener_mode);
7357272e 2103 if (r < 0)
b8ea7a6e 2104 return log_debug_errno(r, "Failed to query DNSStubListener property: %s", bus_error_message(&error, r));
7357272e
DM
2105
2106 return STR_IN_SET(dns_stub_listener_mode, "udp", "yes");
b053cd5f
LP
2107}
2108
2547bb41 2109static int setup_resolv_conf(const char *dest) {
09d423e9
LP
2110 _cleanup_free_ char *etc = NULL;
2111 const char *where, *what;
2112 ResolvConfMode m;
2113 int r;
2547bb41
LP
2114
2115 assert(dest);
2116
09d423e9
LP
2117 if (arg_resolv_conf == RESOLV_CONF_AUTO) {
2118 if (arg_private_network)
2119 m = RESOLV_CONF_OFF;
86775e35
LP
2120 else if (have_resolv_conf(PRIVATE_STUB_RESOLV_CONF) > 0 && resolved_listening() > 0)
2121 m = etc_writable() ? RESOLV_CONF_COPY_STUB : RESOLV_CONF_BIND_STUB;
09d423e9 2122 else if (have_resolv_conf("/etc/resolv.conf") > 0)
83205269 2123 m = etc_writable() ? RESOLV_CONF_COPY_HOST : RESOLV_CONF_BIND_HOST;
09d423e9 2124 else
83205269 2125 m = etc_writable() ? RESOLV_CONF_DELETE : RESOLV_CONF_OFF;
86775e35 2126
09d423e9
LP
2127 } else
2128 m = arg_resolv_conf;
2129
2130 if (m == RESOLV_CONF_OFF)
2547bb41
LP
2131 return 0;
2132
f461a28d 2133 r = chase("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
87447ae4
LP
2134 if (r < 0) {
2135 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
2136 return 0;
2137 }
2138
2139 where = strjoina(etc, "/resolv.conf");
09d423e9
LP
2140
2141 if (m == RESOLV_CONF_DELETE) {
2142 if (unlink(where) < 0)
2143 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
2144
87447ae4
LP
2145 return 0;
2146 }
79d80fc1 2147
86775e35
LP
2148 if (IN_SET(m, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_REPLACE_STATIC, RESOLV_CONF_COPY_STATIC))
2149 what = PRIVATE_STATIC_RESOLV_CONF;
2150 else if (IN_SET(m, RESOLV_CONF_BIND_UPLINK, RESOLV_CONF_REPLACE_UPLINK, RESOLV_CONF_COPY_UPLINK))
2151 what = PRIVATE_UPLINK_RESOLV_CONF;
2152 else if (IN_SET(m, RESOLV_CONF_BIND_STUB, RESOLV_CONF_REPLACE_STUB, RESOLV_CONF_COPY_STUB))
2153 what = PRIVATE_STUB_RESOLV_CONF;
09d423e9
LP
2154 else
2155 what = "/etc/resolv.conf";
87447ae4 2156
86775e35 2157 if (IN_SET(m, RESOLV_CONF_BIND_HOST, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_BIND_UPLINK, RESOLV_CONF_BIND_STUB)) {
09d423e9
LP
2158 _cleanup_free_ char *resolved = NULL;
2159 int found;
2160
d404c8d8 2161 found = chase(where, dest, CHASE_NONEXISTENT|CHASE_NOFOLLOW, &resolved, NULL);
09d423e9
LP
2162 if (found < 0) {
2163 log_warning_errno(found, "Failed to resolve /etc/resolv.conf path in container, ignoring: %m");
2164 return 0;
2165 }
3539724c 2166
87447ae4
LP
2167 if (found == 0) /* missing? */
2168 (void) touch(resolved);
5367354d 2169
511a8cfe 2170 r = mount_nofollow_verbose(LOG_WARNING, what, resolved, NULL, MS_BIND, NULL);
60e76d48 2171 if (r >= 0)
511a8cfe 2172 return mount_nofollow_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
86775e35
LP
2173
2174 /* If that didn't work, let's copy the file */
3539724c
LP
2175 }
2176
86775e35 2177 if (IN_SET(m, RESOLV_CONF_REPLACE_HOST, RESOLV_CONF_REPLACE_STATIC, RESOLV_CONF_REPLACE_UPLINK, RESOLV_CONF_REPLACE_STUB))
7c2f5495 2178 r = copy_file_atomic(what, where, 0644, COPY_REFLINK|COPY_REPLACE);
86775e35 2179 else
7c2f5495 2180 r = copy_file(what, where, O_TRUNC|O_NOFOLLOW, 0644, COPY_REFLINK);
79d80fc1 2181 if (r < 0) {
3539724c
LP
2182 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
2183 * resolved or something similar runs inside and the symlink points there.
68a313c5 2184 *
3539724c 2185 * If the disk image is read-only, there's also no point in complaining.
68a313c5 2186 */
86775e35
LP
2187 log_full_errno(!IN_SET(RESOLV_CONF_COPY_HOST, RESOLV_CONF_COPY_STATIC, RESOLV_CONF_COPY_UPLINK, RESOLV_CONF_COPY_STUB) &&
2188 IN_SET(r, -ELOOP, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
3539724c 2189 "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where);
79d80fc1
TG
2190 return 0;
2191 }
2547bb41 2192
03cfe0d5
LP
2193 r = userns_lchown(where, 0, 0);
2194 if (r < 0)
3539724c 2195 log_warning_errno(r, "Failed to chown /etc/resolv.conf, ignoring: %m");
03cfe0d5 2196
2547bb41
LP
2197 return 0;
2198}
2199
1e4f1671 2200static int setup_boot_id(void) {
cdde6ba6
LP
2201 _cleanup_(unlink_and_freep) char *from = NULL;
2202 _cleanup_free_ char *path = NULL;
3bbaff3e 2203 sd_id128_t rnd = SD_ID128_NULL;
cdde6ba6 2204 const char *to;
04bc4a3f
LP
2205 int r;
2206
1eacc470 2207 /* Generate a new randomized boot ID, so that each boot-up of the container gets a new one */
04bc4a3f 2208
1eacc470 2209 r = tempfn_random_child("/run", "proc-sys-kernel-random-boot-id", &path);
cdde6ba6
LP
2210 if (r < 0)
2211 return log_error_errno(r, "Failed to generate random boot ID path: %m");
04bc4a3f
LP
2212
2213 r = sd_id128_randomize(&rnd);
f647962d
MS
2214 if (r < 0)
2215 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 2216
b40c8ebd 2217 r = id128_write(path, ID128_FORMAT_UUID, rnd);
f647962d
MS
2218 if (r < 0)
2219 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f 2220
cdde6ba6
LP
2221 from = TAKE_PTR(path);
2222 to = "/proc/sys/kernel/random/boot_id";
2223
511a8cfe 2224 r = mount_nofollow_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
cdde6ba6
LP
2225 if (r < 0)
2226 return r;
04bc4a3f 2227
511a8cfe 2228 return mount_nofollow_verbose(LOG_ERR, NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
04bc4a3f
LP
2229}
2230
e58a1277 2231static int copy_devnodes(const char *dest) {
88213476
LP
2232 static const char devnodes[] =
2233 "null\0"
2234 "zero\0"
2235 "full\0"
2236 "random\0"
2237 "urandom\0"
85614d66
TG
2238 "tty\0"
2239 "net/tun\0";
88213476 2240
e58a1277 2241 int r = 0;
a258bf26
LP
2242
2243 assert(dest);
124640f1 2244
52f05ef2 2245 BLOCK_WITH_UMASK(0000);
88213476 2246
03cfe0d5
LP
2247 /* Create /dev/net, so that we can create /dev/net/tun in it */
2248 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
2249 return log_error_errno(r, "Failed to create /dev/net directory: %m");
2250
88213476 2251 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 2252 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 2253 struct stat st;
88213476 2254
c6134d3e 2255 from = path_join("/dev/", d);
8967f291
LP
2256 if (!from)
2257 return log_oom();
2258
c6134d3e 2259 to = path_join(dest, from);
8967f291
LP
2260 if (!to)
2261 return log_oom();
88213476
LP
2262
2263 if (stat(from, &st) < 0) {
2264
4a62c710
MS
2265 if (errno != ENOENT)
2266 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 2267
baaa35ad
ZJS
2268 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode))
2269 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2270 "%s is not a char or block device, cannot copy.", from);
2271 else {
8dfce114
LP
2272 _cleanup_free_ char *sl = NULL, *prefixed = NULL, *dn = NULL, *t = NULL;
2273
81f5049b 2274 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
8dbf71ec 2275 /* Explicitly warn the user when /dev is already populated. */
41eb4362 2276 if (errno == EEXIST)
8dbf71ec 2277 log_notice("%s/dev is pre-mounted and pre-populated. If a pre-mounted /dev is provided it needs to be an unpopulated file system.", dest);
81f5049b
AC
2278 if (errno != EPERM)
2279 return log_error_errno(errno, "mknod(%s) failed: %m", to);
2280
8dfce114 2281 /* Some systems abusively restrict mknod but allow bind mounts. */
81f5049b
AC
2282 r = touch(to);
2283 if (r < 0)
2284 return log_error_errno(r, "touch (%s) failed: %m", to);
511a8cfe 2285 r = mount_nofollow_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL);
60e76d48
ZJS
2286 if (r < 0)
2287 return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to);
81f5049b 2288 }
6278cf60 2289
03cfe0d5
LP
2290 r = userns_lchown(to, 0, 0);
2291 if (r < 0)
2292 return log_error_errno(r, "chown() of device node %s failed: %m", to);
8dfce114 2293
657ee2d8 2294 dn = path_join("/dev", S_ISCHR(st.st_mode) ? "char" : "block");
8dfce114
LP
2295 if (!dn)
2296 return log_oom();
2297
2298 r = userns_mkdir(dest, dn, 0755, 0, 0);
2299 if (r < 0)
2300 return log_error_errno(r, "Failed to create '%s': %m", dn);
2301
2302 if (asprintf(&sl, "%s/%u:%u", dn, major(st.st_rdev), minor(st.st_rdev)) < 0)
2303 return log_oom();
2304
c6134d3e 2305 prefixed = path_join(dest, sl);
8dfce114
LP
2306 if (!prefixed)
2307 return log_oom();
2308
2d9b74ba 2309 t = path_join("..", d);
8dfce114
LP
2310 if (!t)
2311 return log_oom();
2312
2313 if (symlink(t, prefixed) < 0)
2314 log_debug_errno(errno, "Failed to symlink '%s' to '%s': %m", t, prefixed);
88213476 2315 }
88213476
LP
2316 }
2317
e58a1277
LP
2318 return r;
2319}
88213476 2320
de40a303 2321static int make_extra_nodes(const char *dest) {
de40a303
LP
2322 size_t i;
2323 int r;
2324
52f05ef2 2325 BLOCK_WITH_UMASK(0000);
de40a303
LP
2326
2327 for (i = 0; i < arg_n_extra_nodes; i++) {
2328 _cleanup_free_ char *path = NULL;
2329 DeviceNode *n = arg_extra_nodes + i;
2330
c6134d3e 2331 path = path_join(dest, n->path);
de40a303
LP
2332 if (!path)
2333 return log_oom();
2334
2335 if (mknod(path, n->mode, S_ISCHR(n->mode) || S_ISBLK(n->mode) ? makedev(n->major, n->minor) : 0) < 0)
2336 return log_error_errno(errno, "Failed to create device node '%s': %m", path);
2337
2338 r = chmod_and_chown(path, n->mode, n->uid, n->gid);
2339 if (r < 0)
2340 return log_error_errno(r, "Failed to adjust device node ownership of '%s': %m", path);
2341 }
2342
2343 return 0;
2344}
2345
03cfe0d5
LP
2346static int setup_pts(const char *dest) {
2347 _cleanup_free_ char *options = NULL;
2348 const char *p;
709f6e46 2349 int r;
03cfe0d5 2350
349cc4a5 2351#if HAVE_SELINUX
03cfe0d5
LP
2352 if (arg_selinux_apifs_context)
2353 (void) asprintf(&options,
3dce8915 2354 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
03cfe0d5
LP
2355 arg_uid_shift + TTY_GID,
2356 arg_selinux_apifs_context);
2357 else
2358#endif
2359 (void) asprintf(&options,
3dce8915 2360 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
03cfe0d5 2361 arg_uid_shift + TTY_GID);
f2d88580 2362
03cfe0d5 2363 if (!options)
f2d88580
LP
2364 return log_oom();
2365
03cfe0d5 2366 /* Mount /dev/pts itself */
cc9fce65 2367 p = prefix_roota(dest, "/dev/pts");
3f692e2e 2368 r = RET_NERRNO(mkdir(p, 0755));
dae8b82e
ZJS
2369 if (r < 0)
2370 return log_error_errno(r, "Failed to create /dev/pts: %m");
2371
511a8cfe 2372 r = mount_nofollow_verbose(LOG_ERR, "devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options);
60e76d48
ZJS
2373 if (r < 0)
2374 return r;
709f6e46
MS
2375 r = userns_lchown(p, 0, 0);
2376 if (r < 0)
2377 return log_error_errno(r, "Failed to chown /dev/pts: %m");
03cfe0d5
LP
2378
2379 /* Create /dev/ptmx symlink */
2380 p = prefix_roota(dest, "/dev/ptmx");
4a62c710
MS
2381 if (symlink("pts/ptmx", p) < 0)
2382 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
709f6e46
MS
2383 r = userns_lchown(p, 0, 0);
2384 if (r < 0)
2385 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
f2d88580 2386
03cfe0d5
LP
2387 /* And fix /dev/pts/ptmx ownership */
2388 p = prefix_roota(dest, "/dev/pts/ptmx");
709f6e46
MS
2389 r = userns_lchown(p, 0, 0);
2390 if (r < 0)
2391 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
6278cf60 2392
f2d88580
LP
2393 return 0;
2394}
2395
3acc84eb 2396static int setup_stdio_as_dev_console(void) {
5bb1d7fb 2397 _cleanup_close_ int terminal = -EBADF;
e58a1277 2398 int r;
e58a1277 2399
335d2ead
LP
2400 /* We open the TTY in O_NOCTTY mode, so that we do not become controller yet. We'll do that later
2401 * explicitly, if we are configured to. */
2402 terminal = open_terminal("/dev/console", O_RDWR|O_NOCTTY);
3acc84eb
FB
2403 if (terminal < 0)
2404 return log_error_errno(terminal, "Failed to open console: %m");
e58a1277 2405
3acc84eb
FB
2406 /* Make sure we can continue logging to the original stderr, even if
2407 * stderr points elsewhere now */
2408 r = log_dup_console();
2409 if (r < 0)
2410 return log_error_errno(r, "Failed to duplicate stderr: %m");
de40a303 2411
3acc84eb
FB
2412 /* invalidates 'terminal' on success and failure */
2413 r = rearrange_stdio(terminal, terminal, terminal);
2fef50cd 2414 TAKE_FD(terminal);
f647962d 2415 if (r < 0)
3acc84eb
FB
2416 return log_error_errno(r, "Failed to move console to stdin/stdout/stderr: %m");
2417
2418 return 0;
2419}
88213476 2420
3acc84eb
FB
2421static int setup_dev_console(const char *console) {
2422 _cleanup_free_ char *p = NULL;
2423 int r;
a258bf26 2424
3acc84eb
FB
2425 /* Create /dev/console symlink */
2426 r = path_make_relative("/dev", console, &p);
81f5049b 2427 if (r < 0)
3acc84eb
FB
2428 return log_error_errno(r, "Failed to create relative path: %m");
2429
2430 if (symlink(p, "/dev/console") < 0)
2431 return log_error_errno(errno, "Failed to create /dev/console symlink: %m");
a258bf26 2432
3acc84eb 2433 return 0;
e58a1277
LP
2434}
2435
8e5430c4
LP
2436static int setup_keyring(void) {
2437 key_serial_t keyring;
2438
6b000af4
LP
2439 /* Allocate a new session keyring for the container. This makes sure the keyring of the session
2440 * systemd-nspawn was invoked from doesn't leak into the container. Note that by default we block
2441 * keyctl() and request_key() anyway via seccomp so doing this operation isn't strictly necessary,
2442 * but in case people explicitly allow-list these system calls let's make sure we don't leak anything
2443 * into the container. */
8e5430c4
LP
2444
2445 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2446 if (keyring == -1) {
2447 if (errno == ENOSYS)
2448 log_debug_errno(errno, "Kernel keyring not supported, ignoring.");
065b4774 2449 else if (ERRNO_IS_PRIVILEGE(errno))
8e5430c4
LP
2450 log_debug_errno(errno, "Kernel keyring access prohibited, ignoring.");
2451 else
2452 return log_error_errno(errno, "Setting up kernel keyring failed: %m");
2453 }
2454
2455 return 0;
2456}
2457
3652872a
LP
2458static int setup_credentials(const char *root) {
2459 const char *q;
2460 int r;
2461
2462 if (arg_n_credentials <= 0)
2463 return 0;
2464
2465 r = userns_mkdir(root, "/run/host", 0755, 0, 0);
2466 if (r < 0)
2467 return log_error_errno(r, "Failed to create /run/host: %m");
2468
2469 r = userns_mkdir(root, "/run/host/credentials", 0700, 0, 0);
2470 if (r < 0)
2471 return log_error_errno(r, "Failed to create /run/host/credentials: %m");
2472
2473 q = prefix_roota(root, "/run/host/credentials");
511a8cfe 2474 r = mount_nofollow_verbose(LOG_ERR, NULL, q, "ramfs", MS_NOSUID|MS_NOEXEC|MS_NODEV, "mode=0700");
3652872a
LP
2475 if (r < 0)
2476 return r;
2477
2478 for (size_t i = 0; i < arg_n_credentials; i++) {
2479 _cleanup_free_ char *j = NULL;
254d1313 2480 _cleanup_close_ int fd = -EBADF;
3652872a
LP
2481
2482 j = path_join(q, arg_credentials[i].id);
2483 if (!j)
2484 return log_oom();
2485
2486 fd = open(j, O_CREAT|O_EXCL|O_WRONLY|O_CLOEXEC|O_NOFOLLOW, 0600);
2487 if (fd < 0)
2488 return log_error_errno(errno, "Failed to create credential file %s: %m", j);
2489
e22c60a9 2490 r = loop_write(fd, arg_credentials[i].data, arg_credentials[i].size);
3652872a
LP
2491 if (r < 0)
2492 return log_error_errno(r, "Failed to write credential to file %s: %m", j);
2493
2494 if (fchmod(fd, 0400) < 0)
2495 return log_error_errno(errno, "Failed to adjust access mode of %s: %m", j);
2496
2497 if (arg_userns_mode != USER_NAMESPACE_NO) {
2498 if (fchown(fd, arg_uid_shift, arg_uid_shift) < 0)
2499 return log_error_errno(errno, "Failed to adjust ownership of %s: %m", j);
2500 }
2501 }
2502
2503 if (chmod(q, 0500) < 0)
2504 return log_error_errno(errno, "Failed to adjust access mode of %s: %m", q);
2505
2506 r = userns_lchown(q, 0, 0);
2507 if (r < 0)
2508 return r;
2509
2510 /* Make both mount and superblock read-only now */
511a8cfe 2511 r = mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_REMOUNT|MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
3652872a
LP
2512 if (r < 0)
2513 return r;
2514
511a8cfe 2515 return mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, "mode=0500");
3652872a
LP
2516}
2517
5d9d3fcb 2518static int setup_kmsg(int fd_inner_socket) {
9ec5a93c
LP
2519 _cleanup_(unlink_and_freep) char *from = NULL;
2520 _cleanup_free_ char *fifo = NULL;
254d1313 2521 _cleanup_close_ int fd = -EBADF;
9ec5a93c 2522 int r;
e58a1277 2523
5d9d3fcb 2524 assert(fd_inner_socket >= 0);
a258bf26 2525
52f05ef2 2526 BLOCK_WITH_UMASK(0000);
a258bf26 2527
30fd9a2d 2528 /* We create the kmsg FIFO as a temporary file in /run, but immediately delete it after bind mounting it to
9ec5a93c
LP
2529 * /proc/kmsg. While FIFOs on the reading side behave very similar to /proc/kmsg, their writing side behaves
2530 * differently from /dev/kmsg in that writing blocks when nothing is reading. In order to avoid any problems
2531 * with containers deadlocking due to this we simply make /dev/kmsg unavailable to the container. */
2532
1eacc470 2533 r = tempfn_random_child("/run", "proc-kmsg", &fifo);
9ec5a93c
LP
2534 if (r < 0)
2535 return log_error_errno(r, "Failed to generate kmsg path: %m");
e58a1277 2536
9ec5a93c 2537 if (mkfifo(fifo, 0600) < 0)
03cfe0d5 2538 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
9ec5a93c
LP
2539
2540 from = TAKE_PTR(fifo);
9ec5a93c 2541
511a8cfe 2542 r = mount_nofollow_verbose(LOG_ERR, from, "/proc/kmsg", NULL, MS_BIND, NULL);
60e76d48
ZJS
2543 if (r < 0)
2544 return r;
e58a1277 2545
669fc4e5 2546 fd = open(from, O_RDWR|O_NONBLOCK|O_CLOEXEC);
4a62c710
MS
2547 if (fd < 0)
2548 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 2549
9ec5a93c 2550 /* Store away the fd in the socket, so that it stays open as long as we run the child */
5d9d3fcb 2551 r = send_one_fd(fd_inner_socket, fd, 0);
d9603714
DH
2552 if (r < 0)
2553 return log_error_errno(r, "Failed to send FIFO fd: %m");
a258bf26 2554
25ea79fe 2555 return 0;
88213476
LP
2556}
2557
761cf19d 2558struct ExposeArgs {
deff68e7
FW
2559 union in_addr_union address4;
2560 union in_addr_union address6;
761cf19d
FW
2561 struct FirewallContext *fw_ctx;
2562};
2563
1c4baffc 2564static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
99534007 2565 struct ExposeArgs *args = ASSERT_PTR(userdata);
6d0b55c2
LP
2566
2567 assert(rtnl);
2568 assert(m);
6d0b55c2 2569
fb9044cb
LP
2570 (void) expose_port_execute(rtnl, &args->fw_ctx, arg_expose_ports, AF_INET, &args->address4);
2571 (void) expose_port_execute(rtnl, &args->fw_ctx, arg_expose_ports, AF_INET6, &args->address6);
6d0b55c2
LP
2572 return 0;
2573}
2574
3a74cea5 2575static int setup_hostname(void) {
c818eef1 2576 int r;
3a74cea5 2577
0c582db0 2578 if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
eb91eb18
LP
2579 return 0;
2580
c818eef1
LP
2581 r = sethostname_idempotent(arg_hostname ?: arg_machine);
2582 if (r < 0)
2583 return log_error_errno(r, "Failed to set hostname: %m");
3a74cea5 2584
7027ff61 2585 return 0;
3a74cea5
LP
2586}
2587
57fb9fb5 2588static int setup_journal(const char *directory) {
0f5e1382 2589 _cleanup_free_ char *d = NULL;
5980d463 2590 const char *p, *q;
b2238e38 2591 sd_id128_t this_id;
8054d749 2592 bool try;
57fb9fb5
LP
2593 int r;
2594
df9a75e4
LP
2595 /* Don't link journals in ephemeral mode */
2596 if (arg_ephemeral)
2597 return 0;
2598
8054d749
LP
2599 if (arg_link_journal == LINK_NO)
2600 return 0;
2601
2602 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
2603
4d680aee 2604 r = sd_id128_get_machine(&this_id);
f647962d
MS
2605 if (r < 0)
2606 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee 2607
e01ff70a 2608 if (sd_id128_equal(arg_uuid, this_id)) {
8054d749 2609 log_full(try ? LOG_WARNING : LOG_ERR,
85b55869 2610 "Host and machine ids are equal (%s): refusing to link journals", SD_ID128_TO_STRING(arg_uuid));
8054d749 2611 if (try)
4d680aee 2612 return 0;
df9a75e4 2613 return -EEXIST;
4d680aee
ZJS
2614 }
2615
369ca6da
ZJS
2616 FOREACH_STRING(dirname, "/var", "/var/log", "/var/log/journal") {
2617 r = userns_mkdir(directory, dirname, 0755, 0, 0);
2618 if (r < 0) {
2619 bool ignore = r == -EROFS && try;
2620 log_full_errno(ignore ? LOG_DEBUG : LOG_ERR, r,
2621 "Failed to create %s%s: %m", dirname, ignore ? ", ignoring" : "");
2622 return ignore ? 0 : r;
2623 }
2624 }
03cfe0d5 2625
85b55869 2626 p = strjoina("/var/log/journal/", SD_ID128_TO_STRING(arg_uuid));
03cfe0d5 2627 q = prefix_roota(directory, p);
27407a01 2628
e1873695 2629 if (path_is_mount_point(p, NULL, 0) > 0) {
8054d749
LP
2630 if (try)
2631 return 0;
27407a01 2632
baaa35ad
ZJS
2633 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2634 "%s: already a mount point, refusing to use for journal", p);
57fb9fb5
LP
2635 }
2636
e1873695 2637 if (path_is_mount_point(q, NULL, 0) > 0) {
8054d749
LP
2638 if (try)
2639 return 0;
57fb9fb5 2640
baaa35ad
ZJS
2641 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2642 "%s: already a mount point, refusing to use for journal", q);
57fb9fb5
LP
2643 }
2644
2645 r = readlink_and_make_absolute(p, &d);
2646 if (r >= 0) {
3742095b 2647 if (IN_SET(arg_link_journal, LINK_GUEST, LINK_AUTO) &&
57fb9fb5
LP
2648 path_equal(d, q)) {
2649
03cfe0d5 2650 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 2651 if (r < 0)
709f6e46 2652 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 2653 return 0;
57fb9fb5
LP
2654 }
2655
4a62c710
MS
2656 if (unlink(p) < 0)
2657 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
2658 } else if (r == -EINVAL) {
2659
2660 if (arg_link_journal == LINK_GUEST &&
2661 rmdir(p) < 0) {
2662
27407a01
ZJS
2663 if (errno == ENOTDIR) {
2664 log_error("%s already exists and is neither a symlink nor a directory", p);
2665 return r;
4314d33f
MS
2666 } else
2667 return log_error_errno(errno, "Failed to remove %s: %m", p);
57fb9fb5 2668 }
4314d33f
MS
2669 } else if (r != -ENOENT)
2670 return log_error_errno(r, "readlink(%s) failed: %m", p);
57fb9fb5
LP
2671
2672 if (arg_link_journal == LINK_GUEST) {
2673
2674 if (symlink(q, p) < 0) {
8054d749 2675 if (try) {
56f64d95 2676 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90 2677 return 0;
4314d33f
MS
2678 } else
2679 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
57fb9fb5
LP
2680 }
2681
03cfe0d5 2682 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 2683 if (r < 0)
709f6e46 2684 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 2685 return 0;
57fb9fb5
LP
2686 }
2687
2688 if (arg_link_journal == LINK_HOST) {
ccddd104 2689 /* don't create parents here — if the host doesn't have
574edc90 2690 * permanent journal set up, don't force it here */
ba8e6c4d 2691
3f692e2e 2692 r = RET_NERRNO(mkdir(p, 0755));
dae8b82e 2693 if (r < 0 && r != -EEXIST) {
8054d749 2694 if (try) {
dae8b82e 2695 log_debug_errno(r, "Failed to create %s, skipping journal setup: %m", p);
574edc90 2696 return 0;
4314d33f 2697 } else
dae8b82e 2698 return log_error_errno(r, "Failed to create %s: %m", p);
57fb9fb5
LP
2699 }
2700
27407a01
ZJS
2701 } else if (access(p, F_OK) < 0)
2702 return 0;
57fb9fb5 2703
db55bbf2 2704 if (dir_is_empty(q, /* ignore_hidden_or_backup= */ false) == 0)
cdb2b9d0
LP
2705 log_warning("%s is not empty, proceeding anyway.", q);
2706
03cfe0d5 2707 r = userns_mkdir(directory, p, 0755, 0, 0);
709f6e46
MS
2708 if (r < 0)
2709 return log_error_errno(r, "Failed to create %s: %m", q);
57fb9fb5 2710
511a8cfe 2711 r = mount_nofollow_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL);
60e76d48 2712 if (r < 0)
4a62c710 2713 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 2714
27407a01 2715 return 0;
57fb9fb5
LP
2716}
2717
de40a303
LP
2718static int drop_capabilities(uid_t uid) {
2719 CapabilityQuintet q;
2720
2721 /* Let's initialize all five capability sets to something valid. If the quintet was configured via
2722 * OCI use that, but fill in missing bits. If it wasn't then derive the quintet in full from
2723 * arg_caps_retain. */
2724
2725 if (capability_quintet_is_set(&arg_full_capabilities)) {
2726 q = arg_full_capabilities;
2727
f5fbe71d 2728 if (q.bounding == UINT64_MAX)
de40a303
LP
2729 q.bounding = uid == 0 ? arg_caps_retain : 0;
2730
f5fbe71d 2731 if (q.effective == UINT64_MAX)
de40a303
LP
2732 q.effective = uid == 0 ? q.bounding : 0;
2733
f5fbe71d 2734 if (q.inheritable == UINT64_MAX)
88fc9c9b 2735 q.inheritable = uid == 0 ? q.bounding : arg_caps_ambient;
de40a303 2736
f5fbe71d 2737 if (q.permitted == UINT64_MAX)
88fc9c9b 2738 q.permitted = uid == 0 ? q.bounding : arg_caps_ambient;
de40a303 2739
f5fbe71d 2740 if (q.ambient == UINT64_MAX && ambient_capabilities_supported())
88fc9c9b 2741 q.ambient = arg_caps_ambient;
f66ad460
AZ
2742
2743 if (capability_quintet_mangle(&q))
2744 return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Cannot set capabilities that are not in the current bounding set.");
2745
2746 } else {
de40a303
LP
2747 q = (CapabilityQuintet) {
2748 .bounding = arg_caps_retain,
2749 .effective = uid == 0 ? arg_caps_retain : 0,
88fc9c9b
TH
2750 .inheritable = uid == 0 ? arg_caps_retain : arg_caps_ambient,
2751 .permitted = uid == 0 ? arg_caps_retain : arg_caps_ambient,
f5fbe71d 2752 .ambient = ambient_capabilities_supported() ? arg_caps_ambient : UINT64_MAX,
de40a303
LP
2753 };
2754
f66ad460
AZ
2755 /* If we're not using OCI, proceed with mangled capabilities (so we don't error out)
2756 * in order to maintain the same behavior as systemd < 242. */
2757 if (capability_quintet_mangle(&q))
0ccdaa79
JT
2758 log_full(arg_quiet ? LOG_DEBUG : LOG_WARNING,
2759 "Some capabilities will not be set because they are not in the current bounding set.");
f66ad460
AZ
2760
2761 }
2762
de40a303 2763 return capability_quintet_enforce(&q);
88213476
LP
2764}
2765
db999e0f
LP
2766static int reset_audit_loginuid(void) {
2767 _cleanup_free_ char *p = NULL;
2768 int r;
2769
0c582db0 2770 if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
db999e0f
LP
2771 return 0;
2772
2773 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 2774 if (r == -ENOENT)
db999e0f 2775 return 0;
f647962d
MS
2776 if (r < 0)
2777 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
2778
2779 /* Already reset? */
2780 if (streq(p, "4294967295"))
2781 return 0;
2782
57512c89 2783 r = write_string_file("/proc/self/loginuid", "4294967295", WRITE_STRING_FILE_DISABLE_BUFFER);
db999e0f 2784 if (r < 0) {
10a87006
LP
2785 log_error_errno(r,
2786 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2787 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2788 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2789 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2790 "using systemd-nspawn. Sleeping for 5s... (%m)");
77b6e194 2791
db999e0f 2792 sleep(5);
77b6e194 2793 }
db999e0f
LP
2794
2795 return 0;
77b6e194
LP
2796}
2797
e79581dd 2798static int mount_tunnel_dig(const char *root) {
785890ac 2799 const char *p, *q;
709f6e46 2800 int r;
785890ac
LP
2801
2802 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2803 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
63c372cb 2804 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
2805 (void) mkdir_p(p, 0600);
2806
5a27b395 2807 r = userns_mkdir(root, "/run/host", 0755, 0, 0);
709f6e46 2808 if (r < 0)
5a27b395 2809 return log_error_errno(r, "Failed to create /run/host: %m");
03cfe0d5 2810
e79581dd 2811 r = userns_mkdir(root, NSPAWN_MOUNT_TUNNEL, 0600, 0, 0);
709f6e46 2812 if (r < 0)
e79581dd 2813 return log_error_errno(r, "Failed to create "NSPAWN_MOUNT_TUNNEL": %m");
03cfe0d5 2814
e79581dd 2815 q = prefix_roota(root, NSPAWN_MOUNT_TUNNEL);
511a8cfe 2816 r = mount_nofollow_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
60e76d48
ZJS
2817 if (r < 0)
2818 return r;
785890ac 2819
511a8cfe 2820 r = mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
60e76d48
ZJS
2821 if (r < 0)
2822 return r;
785890ac 2823
e79581dd
CB
2824 return 0;
2825}
2826
2827static int mount_tunnel_open(void) {
2828 int r;
2829
2830 r = mount_follow_verbose(LOG_ERR, NULL, NSPAWN_MOUNT_TUNNEL, NULL, MS_SLAVE, NULL);
2831 if (r < 0)
2832 return r;
2833
2834 return 0;
785890ac
LP
2835}
2836
317feb4d 2837static int setup_machine_id(const char *directory) {
3bbaff3e 2838 int r;
e01ff70a 2839
317feb4d
LP
2840 /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
2841 * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
2842 * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
2843 * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
2844 * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
2845 * container behaves nicely). */
2846
319477f1 2847 r = id128_get_machine(directory, &arg_uuid);
bb44fd07
ZJS
2848 if (ERRNO_IS_NEG_MACHINE_ID_UNSET(r)) {
2849 /* If the file is missing, empty, or uninitialized, we don't mind */
317feb4d
LP
2850 if (sd_id128_is_null(arg_uuid)) {
2851 r = sd_id128_randomize(&arg_uuid);
2852 if (r < 0)
2853 return log_error_errno(r, "Failed to acquire randomized machine UUID: %m");
2854 }
bb44fd07
ZJS
2855 } else if (r < 0)
2856 return log_error_errno(r, "Failed to read machine ID from container image: %m");
691675ba 2857
e01ff70a
MS
2858 return 0;
2859}
2860
7336138e
LP
2861static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
2862 int r;
2863
2864 assert(directory);
2865
6c045a99 2866 if (arg_userns_mode == USER_NAMESPACE_NO || arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_CHOWN)
7336138e
LP
2867 return 0;
2868
2869 r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
2870 if (r == -EOPNOTSUPP)
2871 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
2872 if (r == -EBADE)
2873 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
2874 if (r < 0)
2875 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
2876 if (r == 0)
2877 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
2878 else
2879 log_debug("Patched directory tree to match UID/GID range.");
2880
2881 return r;
2882}
2883
113cea80 2884/*
6d416b9c
LS
2885 * Return values:
2886 * < 0 : wait_for_terminate() failed to get the state of the
2887 * container, the container was terminated by a signal, or
2888 * failed for an unknown reason. No change is made to the
2889 * container argument.
2890 * > 0 : The program executed in the container terminated with an
2891 * error. The exit code of the program executed in the
919699ec
LP
2892 * container is returned. The container argument has been set
2893 * to CONTAINER_TERMINATED.
6d416b9c
LS
2894 * 0 : The container is being rebooted, has been shut down or exited
2895 * successfully. The container argument has been set to either
2896 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 2897 *
6d416b9c
LS
2898 * That is, success is indicated by a return value of zero, and an
2899 * error is indicated by a non-zero value.
113cea80
DH
2900 */
2901static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 2902 siginfo_t status;
919699ec 2903 int r;
113cea80
DH
2904
2905 r = wait_for_terminate(pid, &status);
f647962d
MS
2906 if (r < 0)
2907 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
2908
2909 switch (status.si_code) {
fddbb89c 2910
113cea80 2911 case CLD_EXITED:
b5a2179b 2912 if (status.si_status == 0)
919699ec 2913 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
b5a2179b 2914 else
919699ec 2915 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 2916
919699ec
LP
2917 *container = CONTAINER_TERMINATED;
2918 return status.si_status;
113cea80
DH
2919
2920 case CLD_KILLED:
2921 if (status.si_status == SIGINT) {
919699ec 2922 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 2923 *container = CONTAINER_TERMINATED;
919699ec
LP
2924 return 0;
2925
113cea80 2926 } else if (status.si_status == SIGHUP) {
919699ec 2927 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 2928 *container = CONTAINER_REBOOTED;
919699ec 2929 return 0;
113cea80 2930 }
919699ec 2931
4831981d 2932 _fallthrough_;
113cea80 2933 case CLD_DUMPED:
baaa35ad
ZJS
2934 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2935 "Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
113cea80
DH
2936
2937 default:
baaa35ad
ZJS
2938 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2939 "Container %s failed due to unknown reason.", arg_machine);
113cea80 2940 }
113cea80
DH
2941}
2942
023fb90b
LP
2943static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2944 pid_t pid;
2945
4a0b58c4 2946 pid = PTR_TO_PID(userdata);
023fb90b 2947 if (pid > 0) {
c6c8f6e2 2948 if (kill(pid, arg_kill_signal) >= 0) {
023fb90b
LP
2949 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2950 sd_event_source_set_userdata(s, NULL);
2951 return 0;
2952 }
2953 }
2954
2955 sd_event_exit(sd_event_source_get_event(s), 0);
2956 return 0;
2957}
2958
6916b164 2959static int on_sigchld(sd_event_source *s, const struct signalfd_siginfo *ssi, void *userdata) {
abdb9b08
LP
2960 pid_t pid;
2961
2962 assert(s);
2963 assert(ssi);
2964
2965 pid = PTR_TO_PID(userdata);
2966
6916b164
AU
2967 for (;;) {
2968 siginfo_t si = {};
abdb9b08 2969
6916b164
AU
2970 if (waitid(P_ALL, 0, &si, WNOHANG|WNOWAIT|WEXITED) < 0)
2971 return log_error_errno(errno, "Failed to waitid(): %m");
2972 if (si.si_pid == 0) /* No pending children. */
2973 break;
abdb9b08 2974 if (si.si_pid == pid) {
6916b164
AU
2975 /* The main process we care for has exited. Return from
2976 * signal handler but leave the zombie. */
2977 sd_event_exit(sd_event_source_get_event(s), 0);
2978 break;
2979 }
abdb9b08 2980
6916b164
AU
2981 /* Reap all other children. */
2982 (void) waitid(P_PID, si.si_pid, &si, WNOHANG|WEXITED);
2983 }
2984
2985 return 0;
2986}
2987
abdb9b08
LP
2988static int on_request_stop(sd_bus_message *m, void *userdata, sd_bus_error *error) {
2989 pid_t pid;
2990
2991 assert(m);
2992
2993 pid = PTR_TO_PID(userdata);
2994
2995 if (arg_kill_signal > 0) {
2996 log_info("Container termination requested. Attempting to halt container.");
2997 (void) kill(pid, arg_kill_signal);
2998 } else {
2999 log_info("Container termination requested. Exiting.");
3000 sd_event_exit(sd_bus_get_event(sd_bus_message_get_bus(m)), 0);
3001 }
3002
3003 return 0;
3004}
3005
ec16945e 3006static int determine_names(void) {
1b9cebf6 3007 int r;
ec16945e 3008
c1521918
LP
3009 if (arg_template && !arg_directory && arg_machine) {
3010
3011 /* If --template= was specified then we should not
3012 * search for a machine, but instead create a new one
3013 * in /var/lib/machine. */
3014
657ee2d8 3015 arg_directory = path_join("/var/lib/machines", arg_machine);
c1521918
LP
3016 if (!arg_directory)
3017 return log_oom();
3018 }
3019
ec16945e 3020 if (!arg_image && !arg_directory) {
1b9cebf6
LP
3021 if (arg_machine) {
3022 _cleanup_(image_unrefp) Image *i = NULL;
3023
d577d4a4 3024 r = image_find(IMAGE_MACHINE, arg_machine, NULL, &i);
3a6ce860
LP
3025 if (r == -ENOENT)
3026 return log_error_errno(r, "No image for machine '%s'.", arg_machine);
1b9cebf6
LP
3027 if (r < 0)
3028 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
1b9cebf6 3029
eb38edce 3030 if (IN_SET(i->type, IMAGE_RAW, IMAGE_BLOCK))
0f03c2a4 3031 r = free_and_strdup(&arg_image, i->path);
1b9cebf6 3032 else
0f03c2a4 3033 r = free_and_strdup(&arg_directory, i->path);
1b9cebf6 3034 if (r < 0)
0f3be6ca 3035 return log_oom();
1b9cebf6 3036
aee327b8
LP
3037 if (!arg_ephemeral)
3038 arg_read_only = arg_read_only || i->read_only;
d7249575
LP
3039 } else {
3040 r = safe_getcwd(&arg_directory);
3041 if (r < 0)
3042 return log_error_errno(r, "Failed to determine current directory: %m");
3043 }
ec16945e 3044
c6147113
LP
3045 if (!arg_directory && !arg_image)
3046 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine path, please use -D or -i.");
ec16945e
LP
3047 }
3048
3049 if (!arg_machine) {
b9ba4dab
LP
3050 if (arg_directory && path_equal(arg_directory, "/"))
3051 arg_machine = gethostname_malloc();
e9b88a6d
LP
3052 else if (arg_image) {
3053 char *e;
4827ab48 3054
b36e39d2
LP
3055 r = path_extract_filename(arg_image, &arg_machine);
3056 if (r < 0)
3057 return log_error_errno(r, "Failed to extract file name from '%s': %m", arg_image);
4827ab48 3058
e9b88a6d
LP
3059 /* Truncate suffix if there is one */
3060 e = endswith(arg_machine, ".raw");
3061 if (e)
3062 *e = 0;
b36e39d2
LP
3063 } else {
3064 r = path_extract_filename(arg_directory, &arg_machine);
3065 if (r < 0)
3066 return log_error_errno(r, "Failed to extract file name from '%s': %m", arg_directory);
3067 }
ec16945e 3068
ae691c1d 3069 hostname_cleanup(arg_machine);
52ef5dd7 3070 if (!hostname_is_valid(arg_machine, 0))
c6147113 3071 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine machine name automatically, please use -M.");
b9ba4dab 3072
3603f151
LB
3073 /* Copy the machine name before the random suffix is added below, otherwise we won't be able
3074 * to match fixed config file names. */
3075 arg_settings_filename = strjoin(arg_machine, ".nspawn");
3076 if (!arg_settings_filename)
3077 return log_oom();
3078
e9b88a6d
LP
3079 /* Add a random suffix when this is an ephemeral machine, so that we can run many
3080 * instances at once without manually having to specify -M each time. */
3081 if (arg_ephemeral)
3082 if (strextendf(&arg_machine, "-%016" PRIx64, random_u64()) < 0)
b9ba4dab 3083 return log_oom();
3603f151
LB
3084 } else {
3085 arg_settings_filename = strjoin(arg_machine, ".nspawn");
3086 if (!arg_settings_filename)
3087 return log_oom();
ec16945e
LP
3088 }
3089
3090 return 0;
3091}
3092
/* Resolves the path stored in *p via chase() with the given flags and replaces *p with the resolved path.
 * A NULL *p is left alone (returning 0). Returns a negative errno-style value if resolution fails,
 * otherwise the result of free_and_replace(). */
static int chase_and_update(char **p, unsigned flags) {
        char *resolved;
        int r;

        assert(p);

        if (*p == NULL)
                return 0;

        r = chase(*p, NULL, flags, &resolved, NULL);
        if (r < 0)
                return log_error_errno(r, "Failed to resolve path %s: %m", *p);

        /* Drops the old string and takes over the new one */
        return free_and_replace(*p, resolved);
}
3108
03cfe0d5 3109static int determine_uid_shift(const char *directory) {
6dac160c 3110
0de7acce 3111 if (arg_userns_mode == USER_NAMESPACE_NO) {
03cfe0d5 3112 arg_uid_shift = 0;
6dac160c 3113 return 0;
03cfe0d5 3114 }
6dac160c
LP
3115
3116 if (arg_uid_shift == UID_INVALID) {
3117 struct stat st;
3118
993da6d4
LP
3119 /* Read the UID shift off the image. Maybe we can reuse this to avoid chowning. */
3120
3121 if (stat(directory, &st) < 0)
03cfe0d5 3122 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
6dac160c
LP
3123
3124 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
3125
baaa35ad
ZJS
3126 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000)))
3127 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
3128 "UID and GID base of %s don't match.", directory);
6dac160c
LP
3129
3130 arg_uid_range = UINT32_C(0x10000);
f61c7f88
LP
3131
3132 if (arg_uid_shift != 0) {
3133 /* If the image is shifted already, then we'll fall back to classic chowning, for
3134 * compatibility (and simplicity), or refuse if mapping is explicitly requested. */
3135
3136 if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_AUTO) {
3137 log_debug("UID base of %s is non-zero, not using UID mapping.", directory);
3138 arg_userns_ownership = USER_NAMESPACE_OWNERSHIP_CHOWN;
3139 } else if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_MAP)
3140 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
3141 "UID base of %s is not zero, UID mapping not supported.", directory);
3142 }
6dac160c
LP
3143 }
3144
58e13de5
LP
3145 if (!userns_shift_range_valid(arg_uid_shift, arg_uid_range))
3146 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "UID base too high for UID range.");
6dac160c 3147
6dac160c
LP
3148 return 0;
3149}
3150
de40a303
LP
3151static unsigned long effective_clone_ns_flags(void) {
3152 unsigned long flags = arg_clone_ns_flags;
3153
3154 if (arg_private_network)
3155 flags |= CLONE_NEWNET;
3156 if (arg_use_cgns)
3157 flags |= CLONE_NEWCGROUP;
3158 if (arg_userns_mode != USER_NAMESPACE_NO)
3159 flags |= CLONE_NEWUSER;
3160
3161 return flags;
3162}
3163
3164static int patch_sysctl(void) {
3165
3166 /* This table is inspired by runc's sysctl() function */
3167 static const struct {
3168 const char *key;
3169 bool prefix;
3170 unsigned long clone_flags;
3171 } safe_sysctl[] = {
3172 { "kernel.hostname", false, CLONE_NEWUTS },
3173 { "kernel.domainname", false, CLONE_NEWUTS },
3174 { "kernel.msgmax", false, CLONE_NEWIPC },
3175 { "kernel.msgmnb", false, CLONE_NEWIPC },
3176 { "kernel.msgmni", false, CLONE_NEWIPC },
3177 { "kernel.sem", false, CLONE_NEWIPC },
3178 { "kernel.shmall", false, CLONE_NEWIPC },
3179 { "kernel.shmmax", false, CLONE_NEWIPC },
3180 { "kernel.shmmni", false, CLONE_NEWIPC },
3181 { "fs.mqueue.", true, CLONE_NEWIPC },
3182 { "net.", true, CLONE_NEWNET },
3183 };
3184
3185 unsigned long flags;
de40a303
LP
3186 int r;
3187
3188 flags = effective_clone_ns_flags();
3189
3190 STRV_FOREACH_PAIR(k, v, arg_sysctl) {
3191 bool good = false;
3192 size_t i;
3193
3194 for (i = 0; i < ELEMENTSOF(safe_sysctl); i++) {
3195
3196 if (!FLAGS_SET(flags, safe_sysctl[i].clone_flags))
3197 continue;
3198
3199 if (safe_sysctl[i].prefix)
3200 good = startswith(*k, safe_sysctl[i].key);
3201 else
3202 good = streq(*k, safe_sysctl[i].key);
3203
3204 if (good)
3205 break;
3206 }
3207
c6147113
LP
3208 if (!good)
3209 return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Refusing to write to sysctl '%s', as it is not safe in the selected namespaces.", *k);
de40a303
LP
3210
3211 r = sysctl_write(*k, *v);
3212 if (r < 0)
3213 return log_error_errno(r, "Failed to write sysctl '%s': %m", *k);
3214 }
3215
3216 return 0;
3217}
3218
03cfe0d5
LP
3219static int inner_child(
3220 Barrier *barrier,
5d9d3fcb 3221 int fd_inner_socket,
e1bb4b0d
LB
3222 FDSet *fds,
3223 char **os_release_pairs) {
69c79d3c 3224
03cfe0d5 3225 _cleanup_free_ char *home = NULL;
88614c8a 3226 size_t n_env = 1;
4ab3d29f
ZJS
3227 char *envp[] = {
3228 (char*) "PATH=" DEFAULT_PATH_COMPAT,
6aadfa4c 3229 NULL, /* container */
03cfe0d5
LP
3230 NULL, /* TERM */
3231 NULL, /* HOME */
3232 NULL, /* USER */
3233 NULL, /* LOGNAME */
3234 NULL, /* container_uuid */
3235 NULL, /* LISTEN_FDS */
3236 NULL, /* LISTEN_PID */
9c1e04d0 3237 NULL, /* NOTIFY_SOCKET */
3652872a 3238 NULL, /* CREDENTIALS_DIRECTORY */
b626f695 3239 NULL, /* LANG */
03cfe0d5
LP
3240 NULL
3241 };
1a68e1e5 3242 const char *exec_target;
2371271c 3243 _cleanup_strv_free_ char **env_use = NULL;
de40a303 3244 int r, which_failed;
88213476 3245
b37469d7
LP
3246 /* This is the "inner" child process, i.e. the one forked off by the "outer" child process, which is the one
3247 * the container manager itself forked off. At the time of clone() it gained its own CLONE_NEWNS, CLONE_NEWPID,
3248 * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER namespaces. Note that it has its own CLONE_NEWNS namespace,
3249 * separate from the CLONE_NEWNS created for the "outer" child, and also separate from the host's CLONE_NEWNS
3250 * namespace. The reason for having two levels of CLONE_NEWNS namespaces is that the "inner" one is owned by
3251 * the CLONE_NEWUSER namespace of the container, while the "outer" one is owned by the host's CLONE_NEWUSER
3252 * namespace.
3253 *
3254 * Note at this point we have no CLONE_NEWNET namespace yet. We'll acquire that one later through
3255 * unshare(). See below. */
3256
03cfe0d5 3257 assert(barrier);
5d9d3fcb 3258 assert(fd_inner_socket >= 0);
88213476 3259
de40a303
LP
3260 log_debug("Inner child is initializing.");
3261
0de7acce 3262 if (arg_userns_mode != USER_NAMESPACE_NO) {
03cfe0d5
LP
3263 /* Tell the parent, that it now can write the UID map. */
3264 (void) barrier_place(barrier); /* #1 */
7027ff61 3265
03cfe0d5 3266 /* Wait until the parent wrote the UID map */
baaa35ad 3267 if (!barrier_place_and_sync(barrier)) /* #2 */
2a2e78e9 3268 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Parent died too early");
88213476 3269
2a2e78e9
LP
3270 /* Become the new root user inside our namespace */
3271 r = reset_uid_gid();
3272 if (r < 0)
3273 return log_error_errno(r, "Couldn't become new root: %m");
3274
3275 /* Creating a new user namespace means all MS_SHARED mounts become MS_SLAVE. Let's put them
3276 * back to MS_SHARED here, since that's what we want as defaults. (This will not reconnect
3277 * propagation, but simply create new peer groups for all our mounts). */
511a8cfe 3278 r = mount_follow_verbose(LOG_ERR, NULL, "/", NULL, MS_SHARED|MS_REC, NULL);
2a2e78e9
LP
3279 if (r < 0)
3280 return r;
3281 }
6d66bd3b 3282
0de7acce 3283 r = mount_all(NULL,
4f086aab 3284 arg_mount_settings | MOUNT_IN_USERNS,
0de7acce 3285 arg_uid_shift,
0de7acce 3286 arg_selinux_apifs_context);
03cfe0d5
LP
3287 if (r < 0)
3288 return r;
3289
04413780
ZJS
3290 if (!arg_network_namespace_path && arg_private_network) {
3291 r = unshare(CLONE_NEWNET);
3292 if (r < 0)
3293 return log_error_errno(errno, "Failed to unshare network namespace: %m");
75116558
PS
3294
3295 /* Tell the parent that it can setup network interfaces. */
3296 (void) barrier_place(barrier); /* #3 */
04413780
ZJS
3297 }
3298
4f086aab 3299 r = mount_sysfs(NULL, arg_mount_settings);
d8fc6a00
LP
3300 if (r < 0)
3301 return r;
3302
03cfe0d5
LP
3303 /* Wait until we are cgroup-ified, so that we
3304 * can mount the right cgroup path writable */
baaa35ad
ZJS
3305 if (!barrier_place_and_sync(barrier)) /* #4 */
3306 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
3307 "Parent died too early");
88213476 3308
489fae52 3309 if (arg_use_cgns) {
0996ef00
CB
3310 r = unshare(CLONE_NEWCGROUP);
3311 if (r < 0)
04413780 3312 return log_error_errno(errno, "Failed to unshare cgroup namespace: %m");
0996ef00
CB
3313 r = mount_cgroups(
3314 "",
3315 arg_unified_cgroup_hierarchy,
3316 arg_userns_mode != USER_NAMESPACE_NO,
3317 arg_uid_shift,
3318 arg_uid_range,
5a8ff0e6 3319 arg_selinux_apifs_context,
ada54120 3320 true);
1433e0f2 3321 } else
0996ef00 3322 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
1433e0f2
LP
3323 if (r < 0)
3324 return r;
ec16945e 3325
1e4f1671 3326 r = setup_boot_id();
03cfe0d5
LP
3327 if (r < 0)
3328 return r;
ec16945e 3329
5d9d3fcb 3330 r = setup_kmsg(fd_inner_socket);
03cfe0d5
LP
3331 if (r < 0)
3332 return r;
ec16945e 3333
de40a303
LP
3334 r = mount_custom(
3335 "/",
3336 arg_custom_mounts,
3337 arg_n_custom_mounts,
de40a303 3338 0,
c0c8f718 3339 0,
de40a303 3340 arg_selinux_apifs_context,
5f0a6347 3341 MOUNT_NON_ROOT_ONLY | MOUNT_IN_USERNS);
de40a303
LP
3342 if (r < 0)
3343 return r;
3344
03cfe0d5
LP
3345 if (setsid() < 0)
3346 return log_error_errno(errno, "setsid() failed: %m");
3347
3348 if (arg_private_network)
df883de9 3349 (void) loopback_setup();
03cfe0d5 3350
7a8f6325 3351 if (arg_expose_ports) {
b07ee903 3352 r = expose_port_send_rtnl(fd_inner_socket);
7a8f6325
LP
3353 if (r < 0)
3354 return r;
7a8f6325 3355 }
03cfe0d5 3356
3acc84eb 3357 if (arg_console_mode != CONSOLE_PIPE) {
5bb1d7fb 3358 _cleanup_close_ int master = -EBADF;
3acc84eb
FB
3359 _cleanup_free_ char *console = NULL;
3360
3361 /* Allocate a pty and make it available as /dev/console. */
dc98caea 3362 master = openpt_allocate(O_RDWR|O_NONBLOCK, &console);
3acc84eb 3363 if (master < 0)
dc98caea 3364 return log_error_errno(master, "Failed to allocate a pty: %m");
3acc84eb
FB
3365
3366 r = setup_dev_console(console);
3367 if (r < 0)
105a1a36 3368 return log_error_errno(r, "Failed to set up /dev/console: %m");
3acc84eb 3369
bb1aa185 3370 r = send_one_fd(fd_inner_socket, master, 0);
3acc84eb
FB
3371 if (r < 0)
3372 return log_error_errno(r, "Failed to send master fd: %m");
3acc84eb
FB
3373
3374 r = setup_stdio_as_dev_console();
3375 if (r < 0)
3376 return r;
3377 }
3378
de40a303
LP
3379 r = patch_sysctl();
3380 if (r < 0)
3381 return r;
3382
81f345df
LP
3383 if (arg_oom_score_adjust_set) {
3384 r = set_oom_score_adjust(arg_oom_score_adjust);
3385 if (r < 0)
3386 return log_error_errno(r, "Failed to adjust OOM score: %m");
3387 }
3388
0985c7c4
ZJS
3389 if (arg_cpu_set.set)
3390 if (sched_setaffinity(0, arg_cpu_set.allocated, arg_cpu_set.set) < 0)
d107bb7d
LP
3391 return log_error_errno(errno, "Failed to set CPU affinity: %m");
3392
c818eef1 3393 (void) setup_hostname();
03cfe0d5 3394
050f7277 3395 if (arg_personality != PERSONALITY_INVALID) {
21022b9d
LP
3396 r = safe_personality(arg_personality);
3397 if (r < 0)
3398 return log_error_errno(r, "personality() failed: %m");
4c27749b
LP
3399#ifdef ARCHITECTURE_SECONDARY
3400 } else if (arg_architecture == ARCHITECTURE_SECONDARY) {
21022b9d
LP
3401 r = safe_personality(PER_LINUX32);
3402 if (r < 0)
3403 return log_error_errno(r, "personality() failed: %m");
4c27749b 3404#endif
af262e5f
LB
3405 } else if (!arg_quiet && arg_architecture >= 0 && arg_architecture != native_architecture())
3406 log_notice("Selected architecture '%s' not supported natively on the local CPU, assuming "
3407 "invocation with qemu userspace emulator (or equivalent) in effect.",
3408 architecture_to_string(arg_architecture));
03cfe0d5 3409
de40a303
LP
3410 r = setrlimit_closest_all((const struct rlimit *const*) arg_rlimit, &which_failed);
3411 if (r < 0)
3412 return log_error_errno(r, "Failed to apply resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
3413
3414#if HAVE_SECCOMP
3415 if (arg_seccomp) {
3416
3417 if (is_seccomp_available()) {
de40a303 3418 r = seccomp_load(arg_seccomp);
3c098014
ZJS
3419 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
3420 return log_error_errno(r, "Failed to install seccomp filter: %m");
3421 if (r < 0)
de40a303
LP
3422 log_debug_errno(r, "Failed to install seccomp filter: %m");
3423 }
3424 } else
3425#endif
3426 {
6b000af4 3427 r = setup_seccomp(arg_caps_retain, arg_syscall_allow_list, arg_syscall_deny_list);
de40a303
LP
3428 if (r < 0)
3429 return r;
3430 }
3431
4a4654e0 3432 if (arg_suppress_sync) {
20e458ae 3433#if HAVE_SECCOMP
4a4654e0
LP
3434 r = seccomp_suppress_sync();
3435 if (r < 0)
3436 log_debug_errno(r, "Failed to install sync() suppression seccomp filter, ignoring: %m");
20e458ae 3437#else
2db32618 3438 log_debug("systemd is built without SECCOMP support. Ignoring --suppress-sync= command line option and SuppressSync= setting.");
20e458ae 3439#endif
4a4654e0
LP
3440 }
3441
349cc4a5 3442#if HAVE_SELINUX
03cfe0d5 3443 if (arg_selinux_context)
2ed96880 3444 if (setexeccon(arg_selinux_context) < 0)
03cfe0d5
LP
3445 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
3446#endif
3447
de40a303
LP
3448 /* Make sure we keep the caps across the uid/gid dropping, so that we can retain some selected caps
3449 * if we need to later on. */
3450 if (prctl(PR_SET_KEEPCAPS, 1) < 0)
3451 return log_error_errno(errno, "Failed to set PR_SET_KEEPCAPS: %m");
3452
3453 if (uid_is_valid(arg_uid) || gid_is_valid(arg_gid))
3462d773 3454 r = change_uid_gid_raw(arg_uid, arg_gid, arg_supplementary_gids, arg_n_supplementary_gids, arg_console_mode != CONSOLE_PIPE);
de40a303 3455 else
3462d773 3456 r = change_uid_gid(arg_user, arg_console_mode != CONSOLE_PIPE, &home);
03cfe0d5
LP
3457 if (r < 0)
3458 return r;
3459
de40a303
LP
3460 r = drop_capabilities(getuid());
3461 if (r < 0)
3462 return log_error_errno(r, "Dropping capabilities failed: %m");
3463
66edd963
LP
3464 if (arg_no_new_privileges)
3465 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
3466 return log_error_errno(errno, "Failed to disable new privileges: %m");
3467
6aadfa4c
ILG
3468 /* LXC sets container=lxc, so follow the scheme here */
3469 envp[n_env++] = strjoina("container=", arg_container_service_name);
3470
03cfe0d5
LP
3471 envp[n_env] = strv_find_prefix(environ, "TERM=");
3472 if (envp[n_env])
313cefa1 3473 n_env++;
03cfe0d5 3474
de40a303 3475 if (home || !uid_is_valid(arg_uid) || arg_uid == 0)
4ab3d29f 3476 if (asprintf(envp + n_env++, "HOME=%s", home ?: "/root") < 0)
de40a303
LP
3477 return log_oom();
3478
3479 if (arg_user || !uid_is_valid(arg_uid) || arg_uid == 0)
4ab3d29f 3480 if (asprintf(envp + n_env++, "USER=%s", arg_user ?: "root") < 0 ||
1da3cb81 3481 asprintf(envp + n_env++, "LOGNAME=%s", arg_user ?: "root") < 0)
de40a303 3482 return log_oom();
03cfe0d5 3483
3bbaff3e 3484 assert(!sd_id128_is_null(arg_uuid));
03cfe0d5 3485
b7416360 3486 if (asprintf(envp + n_env++, "container_uuid=%s", SD_ID128_TO_UUID_STRING(arg_uuid)) < 0)
e01ff70a 3487 return log_oom();
03cfe0d5
LP
3488
3489 if (fdset_size(fds) > 0) {
3490 r = fdset_cloexec(fds, false);
3491 if (r < 0)
3492 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
3493
4ab3d29f
ZJS
3494 if ((asprintf(envp + n_env++, "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
3495 (asprintf(envp + n_env++, "LISTEN_PID=1") < 0))
03cfe0d5
LP
3496 return log_oom();
3497 }
4ab3d29f 3498 if (asprintf(envp + n_env++, "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
9c1e04d0 3499 return log_oom();
03cfe0d5 3500
3652872a
LP
3501 if (arg_n_credentials > 0) {
3502 envp[n_env] = strdup("CREDENTIALS_DIRECTORY=/run/host/credentials");
3503 if (!envp[n_env])
3504 return log_oom();
3505 n_env++;
3506 }
3507
b626f695 3508 if (arg_start_mode != START_BOOT) {
a22f5186 3509 envp[n_env] = strdup("LANG=" SYSTEMD_NSPAWN_LOCALE);
b626f695
DDM
3510 if (!envp[n_env])
3511 return log_oom();
3512 n_env++;
3513 }
3514
4ab3d29f 3515 env_use = strv_env_merge(envp, os_release_pairs, arg_setenv);
2371271c
TG
3516 if (!env_use)
3517 return log_oom();
03cfe0d5 3518
1a8d7814 3519 /* Let the parent know that we are ready and wait until the parent is ready with the setup, too... */
baaa35ad 3520 if (!barrier_place_and_sync(barrier)) /* #5 */
335d2ead 3521 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Parent died too early");
03cfe0d5 3522
5f932eb9
LP
3523 if (arg_chdir)
3524 if (chdir(arg_chdir) < 0)
3525 return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
3526
7732f92b 3527 if (arg_start_mode == START_PID2) {
75bf701f 3528 r = stub_pid1(arg_uuid);
7732f92b
LP
3529 if (r < 0)
3530 return r;
3531 }
3532
335d2ead
LP
3533 if (arg_console_mode != CONSOLE_PIPE) {
3534 /* So far our pty wasn't controlled by any process. Finally, it's time to change that, if we
3535 * are configured for that. Acquire it as controlling tty. */
3536 if (ioctl(STDIN_FILENO, TIOCSCTTY) < 0)
3537 return log_error_errno(errno, "Failed to acquire controlling TTY: %m");
3538 }
3539
de40a303
LP
3540 log_debug("Inner child completed, invoking payload.");
3541
8ca082b4
LP
3542 /* Now, explicitly close the log, so that we then can close all remaining fds. Closing the log explicitly first
3543 * has the benefit that the logging subsystem knows about it, and is thus ready to be reopened should we need
3544 * it again. Note that the other fds closed here are at least the locking and barrier fds. */
03cfe0d5 3545 log_close();
8ca082b4 3546 log_set_open_when_needed(true);
a3b00f91 3547 log_settle_target();
8ca082b4 3548
03cfe0d5
LP
3549 (void) fdset_close_others(fds);
3550
7732f92b 3551 if (arg_start_mode == START_BOOT) {
03cfe0d5
LP
3552 char **a;
3553 size_t m;
3554
3555 /* Automatically search for the init system */
3556
75f32f04
ZJS
3557 m = strv_length(arg_parameters);
3558 a = newa(char*, m + 2);
3559 memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
3560 a[1 + m] = NULL;
03cfe0d5 3561
a5096641
LP
3562 FOREACH_STRING(init,
3563 "/usr/lib/systemd/systemd",
3564 "/lib/systemd/systemd",
3565 "/sbin/init") {
3566 a[0] = (char*) init;
3567 execve(a[0], a, env_use);
3568 }
ced58da7
LP
3569
3570 exec_target = "/usr/lib/systemd/systemd, /lib/systemd/systemd, /sbin/init";
1a68e1e5 3571 } else if (!strv_isempty(arg_parameters)) {
b6b180b7
LP
3572 const char *dollar_path;
3573
1a68e1e5 3574 exec_target = arg_parameters[0];
b6b180b7
LP
3575
3576 /* Use the user supplied search $PATH if there is one, or DEFAULT_PATH_COMPAT if not to search the
3577 * binary. */
3578 dollar_path = strv_env_get(env_use, "PATH");
3579 if (dollar_path) {
6f646e01 3580 if (setenv("PATH", dollar_path, 1) < 0)
b6b180b7
LP
3581 return log_error_errno(errno, "Failed to update $PATH: %m");
3582 }
3583
f757855e 3584 execvpe(arg_parameters[0], arg_parameters, env_use);
1a68e1e5 3585 } else {
5f932eb9 3586 if (!arg_chdir)
d929b0f9
ZJS
3587 /* If we cannot change the directory, we'll end up in /, that is expected. */
3588 (void) chdir(home ?: "/root");
5f932eb9 3589
53350c7b 3590 execle(DEFAULT_USER_SHELL, "-" DEFAULT_USER_SHELL_NAME, NULL, env_use);
3591 if (!streq(DEFAULT_USER_SHELL, "/bin/bash"))
3592 execle("/bin/bash", "-bash", NULL, env_use);
3593 if (!streq(DEFAULT_USER_SHELL, "/bin/sh"))
3594 execle("/bin/sh", "-sh", NULL, env_use);
ced58da7 3595
53350c7b 3596 exec_target = DEFAULT_USER_SHELL ", /bin/bash, /bin/sh";
03cfe0d5
LP
3597 }
3598
8ca082b4 3599 return log_error_errno(errno, "execv(%s) failed: %m", exec_target);
03cfe0d5
LP
3600}
3601
e96ceaba 3602static int setup_notify_child(void) {
254d1313 3603 _cleanup_close_ int fd = -EBADF;
1eb874b9 3604 static const union sockaddr_union sa = {
44ed5214
LP
3605 .un.sun_family = AF_UNIX,
3606 .un.sun_path = NSPAWN_NOTIFY_SOCKET_PATH,
9c1e04d0
AP
3607 };
3608 int r;
3609
3610 fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
3611 if (fd < 0)
3612 return log_error_errno(errno, "Failed to allocate notification socket: %m");
3613
3614 (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
fbda85b0 3615 (void) sockaddr_un_unlink(&sa.un);
9c1e04d0 3616
9c1e04d0 3617 r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
271f518f 3618 if (r < 0)
44ed5214 3619 return log_error_errno(errno, "bind(" NSPAWN_NOTIFY_SOCKET_PATH ") failed: %m");
9c1e04d0 3620
adc7d9f0 3621 r = userns_lchown(NSPAWN_NOTIFY_SOCKET_PATH, 0, 0);
271f518f 3622 if (r < 0)
adc7d9f0 3623 return log_error_errno(r, "Failed to chown " NSPAWN_NOTIFY_SOCKET_PATH ": %m");
adc7d9f0 3624
2ff48e98 3625 r = setsockopt_int(fd, SOL_SOCKET, SO_PASSCRED, true);
271f518f 3626 if (r < 0)
2ff48e98 3627 return log_error_errno(r, "SO_PASSCRED failed: %m");
9c1e04d0 3628
271f518f 3629 return TAKE_FD(fd);
9c1e04d0
AP
3630}
3631
03cfe0d5
LP
/* Body of the "outer" child process.
 *
 * Prepares the container's root file system below 'directory' (image mounts, UID shift negotiation,
 * pivot root/volatile mode, custom mounts, ID mapping, API file systems, device nodes, /run/host
 * entries, cgroups when cgroup namespacing is not used), switches the root to it, and then forks off the
 * "inner" child with raw_clone(), which goes on to run inner_child().
 *
 * barrier           - passed through unchanged to inner_child()
 * directory         - root directory of the container tree (replaced by a bind mount if it is "/")
 * dissected_image   - optional; if non-NULL its partitions are mounted below 'directory'
 * fd_outer_socket   - socket used to exchange the mount namespace fd, UID shift, bind-user UID maps,
 *                     cgroup mode, inner child PID, machine ID and notify socket fd
 * fd_inner_socket   - passed through to inner_child(); closed here once the inner child was forked
 * fds               - passed through to inner_child()
 * netns_fd          - network namespace joined by the inner child if arg_network_namespace_path is set
 *
 * Returns 0 on success (in the outer child, after forking the inner one), or a negative errno-style
 * value on failure. */
3632static int outer_child(
3633 Barrier *barrier,
3634 const char *directory,
2d845785 3635 DissectedImage *dissected_image,
af06cd30 3636 int fd_outer_socket,
5d9d3fcb 3637 int fd_inner_socket,
d7bea6b6
DP
3638 FDSet *fds,
3639 int netns_fd) {
03cfe0d5 3640
2f893044 3641 _cleanup_(bind_user_context_freep) BindUserContext *bind_user_context = NULL;
e1bb4b0d 3642 _cleanup_strv_free_ char **os_release_pairs = NULL;
254d1313 3643 _cleanup_close_ int fd = -EBADF, mntns_fd = -EBADF;
f61c7f88 3644 bool idmap = false;
e5f10caf 3645 const char *p;
03cfe0d5
LP
3646 pid_t pid;
3647 ssize_t l;
de40a303 3648 int r;
03cfe0d5 3649
d1d0b895
LP
3650 /* This is the "outer" child process, i.e. the one forked off by the container manager itself. It
3651 * already has its own CLONE_NEWNS namespace (which was created by the clone()). It still lives in
3652 * the host's CLONE_NEWPID, CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER and CLONE_NEWNET
3653 * namespaces. After it completed a number of initializations a second child (the "inner" one) is
3654 * forked off it, and it exits. */
b37469d7 3655
03cfe0d5
LP
3656 assert(barrier);
3657 assert(directory);
af06cd30 3658 assert(fd_outer_socket >= 0);
5d9d3fcb 3659 assert(fd_inner_socket >= 0);
03cfe0d5 3660
de40a303
LP
3661 log_debug("Outer child is initializing.");
3662
e1bb4b0d
LB
3663 r = load_os_release_pairs_with_prefix("/", "container_host_", &os_release_pairs);
3664 if (r < 0)
3665 log_debug_errno(r, "Failed to read os-release from host for container, ignoring: %m");
3666
03cfe0d5
LP
        /* Have the kernel send us SIGKILL as soon as our parent process dies. */
3667 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
3668 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3669
03cfe0d5
LP
3670 r = reset_audit_loginuid();
3671 if (r < 0)
3672 return r;
3673
2a2e78e9
LP
3674 /* Mark everything as slave, so that we still receive mounts from the real root, but don't propagate
3675 * mounts to the real root. */
511a8cfe 3676 r = mount_follow_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
60e76d48
ZJS
3677 if (r < 0)
3678 return r;
03cfe0d5 3679
2d845785 3680 if (dissected_image) {
d1d0b895
LP
3681 /* If we are operating on a disk image, then mount its root directory now, but leave out the
3682 * rest. We can read the UID shift from it if we need to. Further down we'll mount the rest,
3683 * but then with the uid shift known. That way we can mount VFAT file systems shifted to the
3684 * right place right away. This makes sure ESP partitions and userns are compatible. */
2d3a5a73 3685
af187ab2 3686 r = dissected_image_mount_and_warn(
d04faa4e
LP
3687 dissected_image,
3688 directory,
3689 arg_uid_shift,
21b61b1d 3690 arg_uid_range,
8d9a1d59 3691 /* userns_fd= */ -EBADF,
d04faa4e
LP
3692 DISSECT_IMAGE_MOUNT_ROOT_ONLY|
3693 DISSECT_IMAGE_DISCARD_ON_LOOP|
3694 DISSECT_IMAGE_USR_NO_ROOT|
c65f854a 3695 (arg_read_only ? DISSECT_IMAGE_READ_ONLY : DISSECT_IMAGE_FSCK|DISSECT_IMAGE_GROWFS)|
af187ab2 3696 (arg_start_mode == START_BOOT ? DISSECT_IMAGE_VALIDATE_OS : 0));
2d845785 3697 if (r < 0)
af187ab2 3698 return r;
2d845785 3699 }
03cfe0d5 3700
391567f4
LP
3701 r = determine_uid_shift(directory);
3702 if (r < 0)
3703 return r;
3704
0de7acce 3705 if (arg_userns_mode != USER_NAMESPACE_NO) {
b71a0192
CB
3706 r = namespace_open(0, NULL, &mntns_fd, NULL, NULL, NULL);
3707 if (r < 0)
3708 return log_error_errno(r, "Failed to pin outer mount namespace: %m");
3709
af06cd30 3710 l = send_one_fd(fd_outer_socket, mntns_fd, 0);
b71a0192
CB
3711 if (l < 0)
3712 return log_error_errno(l, "Failed to send outer mount namespace fd: %m");
3713 mntns_fd = safe_close(mntns_fd);
3714
0e7ac751 3715 /* Let the parent know which UID shift we read from the image */
af06cd30 3716 l = send(fd_outer_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
825d5287
RM
3717 if (l < 0)
3718 return log_error_errno(errno, "Failed to send UID shift: %m");
baaa35ad
ZJS
3719 if (l != sizeof(arg_uid_shift))
3720 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3721 "Short write while sending UID shift.");
0e7ac751 3722
0de7acce 3723 if (arg_userns_mode == USER_NAMESPACE_PICK) {
d1d0b895
LP
3724 /* When we are supposed to pick the UID shift, the parent will check now whether the
3725 * UID shift we just read from the image is available. If yes, it will send the UID
3726 * shift back to us, if not it will pick a different one, and send it back to us. */
0e7ac751 3727
af06cd30 3728 l = recv(fd_outer_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
0e7ac751
LP
3729 if (l < 0)
3730 return log_error_errno(errno, "Failed to recv UID shift: %m");
baaa35ad
ZJS
3731 if (l != sizeof(arg_uid_shift))
3732 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3733 "Short read while receiving UID shift.");
0e7ac751
LP
3734 }
3735
ff6c6cc1
LP
3736 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
3737 "Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
825d5287
RM
3738 }
3739
6f83d3d1
LP
3740 if (path_equal(directory, "/")) {
3741 /* If the directory we shall boot is the host, let's operate on a bind mount at a different
3742 * place, so that we can make changes to its mount structure (for example, to implement
3743 * --volatile=) without this interfering with our ability to access files such as
3744 * /etc/localtime to copy into the container. Note that we use a fixed place for this
6c2d70ce 3745 * (instead of a temporary directory, since we are living in our own mount namespace here
7802194a 3746 * already, and thus don't need to be afraid of colliding with anyone else's mounts). */
6f83d3d1
LP
3747 (void) mkdir_p("/run/systemd/nspawn-root", 0755);
3748
511a8cfe 3749 r = mount_nofollow_verbose(LOG_ERR, "/", "/run/systemd/nspawn-root", NULL, MS_BIND|MS_REC, NULL);
6f83d3d1
LP
3750 if (r < 0)
3751 return r;
3752
3753 directory = "/run/systemd/nspawn-root";
e50cd82f 3754 }
7d0ecdd6 3755
75f81732
LP
3756 /* Make sure we always have a mount that we can move to root later on. */
3757 r = make_mount_point(directory);
3758 if (r < 0)
3759 return r;
3760
3761 /* So the whole tree is now MS_SLAVE, i.e. we'll still receive mount/umount events from the host
3762 * mount namespace. For the directory we are going to run our container let's turn this off, so that
3763 * we'll live in our own little world from now on, and propagation from the host may only happen via
3764 * the mount tunnel dir, or not at all. */
3765 r = mount_follow_verbose(LOG_ERR, NULL, directory, NULL, MS_PRIVATE|MS_REC, NULL);
3766 if (r < 0)
3767 return r;
3768
7d0ecdd6
LP
3769 r = setup_pivot_root(
3770 directory,
3771 arg_pivot_root_new,
3772 arg_pivot_root_old);
3773 if (r < 0)
3774 return r;
3775
3776 r = setup_volatile_mode(
3777 directory,
3778 arg_volatile_mode,
7d0ecdd6 3779 arg_uid_shift,
8f1ed04a 3780 arg_selinux_apifs_context);
7d0ecdd6
LP
3781 if (r < 0)
3782 return r;
3783
2f893044
LP
3784 r = bind_user_prepare(
3785 directory,
3786 arg_bind_user,
3787 arg_uid_shift,
3788 arg_uid_range,
3789 &arg_custom_mounts, &arg_n_custom_mounts,
3790 &bind_user_context);
3791 if (r < 0)
3792 return r;
3793
3794 if (arg_userns_mode != USER_NAMESPACE_NO && bind_user_context) {
d1d0b895
LP
3795 /* Send the user maps we determined to the parent, so that it installs it in our user
3796 * namespace UID map table */
2f893044
LP
3797
3798 for (size_t i = 0; i < bind_user_context->n_data; i++) {
3799 uid_t map[] = {
3800 bind_user_context->data[i].payload_user->uid,
3801 bind_user_context->data[i].host_user->uid,
3802 (uid_t) bind_user_context->data[i].payload_group->gid,
3803 (uid_t) bind_user_context->data[i].host_group->gid,
3804 };
3805
af06cd30 3806 l = send(fd_outer_socket, map, sizeof(map), MSG_NOSIGNAL);
2f893044
LP
3807 if (l < 0)
3808 return log_error_errno(errno, "Failed to send user UID map: %m");
3809 if (l != sizeof(map))
3810 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3811 "Short write while sending user UID map.");
3812 }
3813 }
3814
5f0a6347
DDM
3815 r = mount_custom(
3816 directory,
3817 arg_custom_mounts,
3818 arg_n_custom_mounts,
5f0a6347 3819 arg_uid_shift,
c0c8f718 3820 arg_uid_range,
5f0a6347
DDM
3821 arg_selinux_apifs_context,
3822 MOUNT_ROOT_ONLY);
3823 if (r < 0)
3824 return r;
3825
c0c8f718
AV
3826 if (arg_userns_mode != USER_NAMESPACE_NO &&
3827 IN_SET(arg_userns_ownership, USER_NAMESPACE_OWNERSHIP_MAP, USER_NAMESPACE_OWNERSHIP_AUTO) &&
3828 arg_uid_shift != 0) {
dba4fa89
LP
3829 _cleanup_free_ char *usr_subtree = NULL;
3830 char *dirs[3];
3831 size_t i = 0;
c0c8f718 3832
dba4fa89
LP
3833 dirs[i++] = (char*) directory;
3834
3835 if (dissected_image && dissected_image->partitions[PARTITION_USR].found) {
3836 usr_subtree = path_join(directory, "/usr");
3837 if (!usr_subtree)
3838 return log_oom();
3839
3840 dirs[i++] = usr_subtree;
3841 }
3842
3843 dirs[i] = NULL;
3844
3845 r = remount_idmap(dirs, arg_uid_shift, arg_uid_range, UID_INVALID, REMOUNT_IDMAPPING_HOST_ROOT);
bb44fd07
ZJS
3846 if (r == -EINVAL || ERRNO_IS_NEG_NOT_SUPPORTED(r)) {
3847 /* This might fail because the kernel or file system doesn't support idmapping. We
3848 * can't really distinguish this nicely, nor do we have any guarantees about the
3849 * error codes we see, could be EOPNOTSUPP or EINVAL. */
3850 if (arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_AUTO)
3851 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
3852 "ID mapped mounts are apparently not available, sorry.");
3853
3854 log_debug("ID mapped mounts are apparently not available on this kernel or for the selected file system, reverting to recursive chown()ing.");
3855 arg_userns_ownership = USER_NAMESPACE_OWNERSHIP_CHOWN;
3856 } else if (r < 0)
3857 return log_error_errno(r, "Failed to set up ID mapped mounts: %m");
3858 else {
c0c8f718
AV
3859 log_debug("ID mapped mounts available, making use of them.");
3860 idmap = true;
3861 }
3862 }
3863
2d3a5a73
LP
3864 if (dissected_image) {
3865 /* Now we know the uid shift, let's now mount everything else that might be in the image. */
d04faa4e
LP
3866 r = dissected_image_mount(
3867 dissected_image,
3868 directory,
3869 arg_uid_shift,
21b61b1d 3870 arg_uid_range,
8d9a1d59 3871 /* userns_fd= */ -EBADF,
d04faa4e
LP
3872 DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY|
3873 DISSECT_IMAGE_DISCARD_ON_LOOP|
3874 DISSECT_IMAGE_USR_NO_ROOT|
f61c7f88
LP
3875 (arg_read_only ? DISSECT_IMAGE_READ_ONLY : DISSECT_IMAGE_FSCK|DISSECT_IMAGE_GROWFS)|
3876 (idmap ? DISSECT_IMAGE_MOUNT_IDMAPPED : 0));
4fcb96ce
LP
3877 if (r == -EUCLEAN)
3878 return log_error_errno(r, "File system check for image failed: %m");
2d3a5a73 3879 if (r < 0)
4fcb96ce 3880 return log_error_errno(r, "Failed to mount image file system: %m");
2d3a5a73
LP
3881 }
3882
8199d554
LP
3883 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
3884 /* OK, we don't know yet which cgroup mode to use. Let's figure it out, and tell the parent. */
3885
3886 r = detect_unified_cgroup_hierarchy_from_image(directory);
3887 if (r < 0)
3888 return r;
3889
fefb7a6d 3890 l = send(fd_outer_socket, &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), MSG_NOSIGNAL);
8199d554
LP
3891 if (l < 0)
3892 return log_error_errno(errno, "Failed to send cgroup mode: %m");
baaa35ad
ZJS
3893 if (l != sizeof(arg_unified_cgroup_hierarchy))
3894 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3895 "Short write while sending cgroup mode.");
8199d554
LP
3896 }
3897
4ad14eff
LP
3898 r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
3899 if (r < 0)
3900 return r;
3901
03cfe0d5
LP
3902 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
3903 if (r < 0)
3904 return r;
3905
bbd407ea
DDM
3906 if (arg_read_only && arg_volatile_mode == VOLATILE_NO &&
3907 !has_custom_root_mount(arg_custom_mounts, arg_n_custom_mounts)) {
64e82c19 3908 r = bind_remount_recursive(directory, MS_RDONLY, MS_RDONLY, NULL);
03cfe0d5
LP
3909 if (r < 0)
3910 return log_error_errno(r, "Failed to make tree read-only: %m");
3911 }
3912
0de7acce 3913 r = mount_all(directory,
4f086aab 3914 arg_mount_settings,
0de7acce 3915 arg_uid_shift,
0de7acce 3916 arg_selinux_apifs_context);
03cfe0d5
LP
3917 if (r < 0)
3918 return r;
3919
07fa00f9
LP
3920 r = copy_devnodes(directory);
3921 if (r < 0)
03cfe0d5
LP
3922 return r;
3923
de40a303
LP
3924 r = make_extra_nodes(directory);
3925 if (r < 0)
3926 return r;
3927
3928 (void) dev_setup(directory, arg_uid_shift, arg_uid_shift);
e5f10caf 3929
9fac5029 3930 p = prefix_roota(directory, "/run/host");
e5f10caf 3931 (void) make_inaccessible_nodes(p, arg_uid_shift, arg_uid_shift);
03cfe0d5 3932
07fa00f9
LP
3933 r = setup_pts(directory);
3934 if (r < 0)
03cfe0d5
LP
3935 return r;
3936
e79581dd 3937 r = mount_tunnel_dig(directory);
03cfe0d5
LP
3938 if (r < 0)
3939 return r;
3940
8e5430c4
LP
3941 r = setup_keyring();
3942 if (r < 0)
3943 return r;
3944
3652872a
LP
3945 r = setup_credentials(directory);
3946 if (r < 0)
3947 return r;
3948
2f893044
LP
3949 r = bind_user_setup(bind_user_context, directory);
3950 if (r < 0)
3951 return r;
3952
5c4deb9a
MJ
3953 r = mount_custom(
3954 directory,
3955 arg_custom_mounts,
3956 arg_n_custom_mounts,
3957 arg_uid_shift,
c0c8f718 3958 arg_uid_range,
5c4deb9a
MJ
3959 arg_selinux_apifs_context,
3960 MOUNT_NON_ROOT_ONLY);
3961 if (r < 0)
3962 return r;
3963
03cfe0d5
LP
3964 r = setup_timezone(directory);
3965 if (r < 0)
3966 return r;
3967
3968 r = setup_resolv_conf(directory);
3969 if (r < 0)
3970 return r;
3971
e01ff70a
MS
3972 r = setup_machine_id(directory);
3973 if (r < 0)
3974 return r;
3975
03cfe0d5
LP
3976 r = setup_journal(directory);
3977 if (r < 0)
3978 return r;
3979
0f48ba7b
LP
3980 /* The same stuff as the $container env var, but nicely readable for the entire payload */
3981 p = prefix_roota(directory, "/run/host/container-manager");
3982 (void) write_string_file(p, arg_container_service_name, WRITE_STRING_FILE_CREATE);
3983
3984 /* The same stuff as the $container_uuid env var */
3985 p = prefix_roota(directory, "/run/host/container-uuid");
3986 (void) write_string_filef(p, WRITE_STRING_FILE_CREATE, SD_ID128_UUID_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid));
3987
489fae52 3988 if (!arg_use_cgns) {
0996ef00
CB
3989 r = mount_cgroups(
3990 directory,
3991 arg_unified_cgroup_hierarchy,
3992 arg_userns_mode != USER_NAMESPACE_NO,
3993 arg_uid_shift,
3994 arg_uid_range,
5a8ff0e6 3995 arg_selinux_apifs_context,
ada54120 3996 false);
0996ef00
CB
3997 if (r < 0)
3998 return r;
3999 }
03cfe0d5 4000
57c10a56
CB
4001 /* Mark everything as shared so our mounts get propagated down. This is required to make new bind
4002 * mounts available in systemd services inside the container that create a new mount namespace. See
4003 * https://github.com/systemd/systemd/issues/3860 Further submounts (such as /dev) done after this
4004 * will inherit the shared propagation mode.
4005 *
4006 * IMPORTANT: Do not overmount the root directory anymore from now on to enable moving the root
4007 * directory mount to root later on.
4008 * https://github.com/systemd/systemd/issues/3847#issuecomment-562735251
4009 */
9d50f850 4010 r = mount_switch_root(directory, MS_SHARED);
03cfe0d5
LP
4011 if (r < 0)
4012 return log_error_errno(r, "Failed to move root directory: %m");
4013
e79581dd
CB
4014 /* We finished setting up the rootfs which is a shared mount. The mount tunnel needs to be a
4015 * dependent mount otherwise we can't MS_MOVE mounts that were propagated from the host into
4016 * the container. */
4017 r = mount_tunnel_open();
4018 if (r < 0)
4019 return r;
4020
b71a0192
CB
4021 if (arg_userns_mode != USER_NAMESPACE_NO) {
4022 /* In order to mount procfs and sysfs in an unprivileged container the kernel
4023 * requires that a fully visible instance is already present in the target mount
4024 * namespace. Mount one here so the inner child can mount its own instances. Later
4025 * we umount the temporary instances created here before we actually exec the
4026 * payload. Since the rootfs is shared the umount will propagate into the container.
4027 * Note, the inner child wouldn't be able to unmount the instances on its own since
4028 * it doesn't own the originating mount namespace. IOW, the outer child needs to do
4029 * this. */
4030 r = pin_fully_visible_fs();
4031 if (r < 0)
4032 return r;
4033 }
4034
e96ceaba 4035 fd = setup_notify_child();
9c1e04d0
AP
4036 if (fd < 0)
4037 return fd;
4038
        /* Fork off the inner child: always into a new mount namespace, plus whatever namespaces
         * arg_clone_ns_flags requests, plus a new user namespace if user namespacing is enabled. */
03cfe0d5 4039 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
0c582db0 4040 arg_clone_ns_flags |
8869a0b4 4041 (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
03cfe0d5
LP
4042 if (pid < 0)
4043 return log_error_errno(errno, "Failed to fork inner child: %m");
03cfe0d5 4044 if (pid == 0) {
af06cd30 4045 fd_outer_socket = safe_close(fd_outer_socket);
03cfe0d5 4046
2a2e78e9
LP
4047 /* The inner child has all namespaces that are requested, so that we all are owned by the
4048 * user if user namespaces are turned on. */
03cfe0d5 4049
d7bea6b6
DP
4050 if (arg_network_namespace_path) {
4051 r = namespace_enter(-1, -1, netns_fd, -1, -1);
4052 if (r < 0)
e2d39e54 4053 return log_error_errno(r, "Failed to join network namespace: %m");
d7bea6b6
DP
4054 }
4055
11875a98 4056 r = inner_child(barrier, fd_inner_socket, fds, os_release_pairs);
03cfe0d5
LP
4057 if (r < 0)
4058 _exit(EXIT_FAILURE);
4059
4060 _exit(EXIT_SUCCESS);
4061 }
4062
        /* Still in the outer child: report the inner child's PID, the machine ID and the notify
         * socket fd over fd_outer_socket, in that order. */
af06cd30 4063 l = send(fd_outer_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
03cfe0d5
LP
4064 if (l < 0)
4065 return log_error_errno(errno, "Failed to send PID: %m");
baaa35ad
ZJS
4066 if (l != sizeof(pid))
4067 return log_error_errno(SYNTHETIC_ERRNO(EIO),
4068 "Short write while sending PID.");
03cfe0d5 4069
af06cd30 4070 l = send(fd_outer_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
e01ff70a
MS
4071 if (l < 0)
4072 return log_error_errno(errno, "Failed to send machine ID: %m");
baaa35ad
ZJS
4073 if (l != sizeof(arg_uuid))
4074 return log_error_errno(SYNTHETIC_ERRNO(EIO),
4075 "Short write while sending machine ID.");
e01ff70a 4076
af06cd30 4077 l = send_one_fd(fd_outer_socket, fd, 0);
9c1e04d0 4078 if (l < 0)
ba72801d 4079 return log_error_errno(l, "Failed to send notify fd: %m");
9c1e04d0 4080
        /* Everything was handed over; drop our copies of the sockets and the network namespace fd. */
af06cd30 4081 fd_outer_socket = safe_close(fd_outer_socket);
5d9d3fcb 4082 fd_inner_socket = safe_close(fd_inner_socket);
d7bea6b6 4083 netns_fd = safe_close(netns_fd);
03cfe0d5
LP
4084
4085 return 0;
4086}
4087
0e7ac751 4088static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
d381c8a6 4089 bool tried_hashed = false;
0e7ac751
LP
4090 unsigned n_tries = 100;
4091 uid_t candidate;
4092 int r;
4093
4094 assert(shift);
4095 assert(ret_lock_file);
0de7acce 4096 assert(arg_userns_mode == USER_NAMESPACE_PICK);
0e7ac751
LP
4097 assert(arg_uid_range == 0x10000U);
4098
4099 candidate = *shift;
4100
4101 (void) mkdir("/run/systemd/nspawn-uid", 0755);
4102
4103 for (;;) {
fbd0b64f 4104 char lock_path[STRLEN("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
8e766630 4105 _cleanup_(release_lock_file) LockFile lf = LOCK_FILE_INIT;
0e7ac751
LP
4106
4107 if (--n_tries <= 0)
4108 return -EBUSY;
4109
87d5e4f2 4110 if (candidate < CONTAINER_UID_BASE_MIN || candidate > CONTAINER_UID_BASE_MAX)
0e7ac751
LP
4111 goto next;
4112 if ((candidate & UINT32_C(0xFFFF)) != 0)
4113 goto next;
4114
4115 xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
4116 r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
4117 if (r == -EBUSY) /* Range already taken by another nspawn instance */
4118 goto next;
4119 if (r < 0)
4120 return r;
4121
4122 /* Make some superficial checks whether the range is currently known in the user database */
4123 if (getpwuid(candidate))
4124 goto next;
4125 if (getpwuid(candidate + UINT32_C(0xFFFE)))
4126 goto next;
4127 if (getgrgid(candidate))
4128 goto next;
4129 if (getgrgid(candidate + UINT32_C(0xFFFE)))
4130 goto next;
4131
4132 *ret_lock_file = lf;
4133 lf = (struct LockFile) LOCK_FILE_INIT;
4134 *shift = candidate;
4135 return 0;
4136
4137 next:
d381c8a6
LP
4138 if (arg_machine && !tried_hashed) {
4139 /* Try to hash the base from the container name */
4140
4141 static const uint8_t hash_key[] = {
4142 0xe1, 0x56, 0xe0, 0xf0, 0x4a, 0xf0, 0x41, 0xaf,
4143 0x96, 0x41, 0xcf, 0x41, 0x33, 0x94, 0xff, 0x72
4144 };
4145
4146 candidate = (uid_t) siphash24(arg_machine, strlen(arg_machine), hash_key);
4147
4148 tried_hashed = true;
4149 } else
4150 random_bytes(&candidate, sizeof(candidate));
4151
87d5e4f2 4152 candidate = (candidate % (CONTAINER_UID_BASE_MAX - CONTAINER_UID_BASE_MIN)) + CONTAINER_UID_BASE_MIN;
0e7ac751
LP
4153 candidate &= (uid_t) UINT32_C(0xFFFF0000);
4154 }
4155}
4156
2f893044
LP
4157static int add_one_uid_map(
4158 char **p,
4159 uid_t container_uid,
4160 uid_t host_uid,
4161 uid_t range) {
4162
4163 return strextendf(p,
4164 UID_FMT " " UID_FMT " " UID_FMT "\n",
4165 container_uid, host_uid, range);
4166}
4167
4168static int make_uid_map_string(
4169 const uid_t bind_user_uid[],
4170 size_t n_bind_user_uid,
4171 size_t offset,
4172 char **ret) {
4173
4174 _cleanup_free_ char *s = NULL;
4175 uid_t previous_uid = 0;
4176 int r;
4177
4178 assert(n_bind_user_uid == 0 || bind_user_uid);
2f092762 4179 assert(IN_SET(offset, 0, 2)); /* used to switch between UID and GID map */
2f893044
LP
4180 assert(ret);
4181
4182 /* The bind_user_uid[] array is a series of 4 uid_t values, for each --bind-user= entry one
4183 * quadruplet, consisting of host and container UID + GID. */
4184
4185 for (size_t i = 0; i < n_bind_user_uid; i++) {
05ab439a
YW
4186 uid_t payload_uid = bind_user_uid[i*4+offset],
4187 host_uid = bind_user_uid[i*4+offset+1];
2f893044
LP
4188
4189 assert(previous_uid <= payload_uid);
4190 assert(payload_uid < arg_uid_range);
4191
4192 /* Add a range to close the gap to previous entry */
4193 if (payload_uid > previous_uid) {
4194 r = add_one_uid_map(&s, previous_uid, arg_uid_shift + previous_uid, payload_uid - previous_uid);
4195 if (r < 0)
4196 return r;
4197 }
4198
4199 /* Map this specific user */
4200 r = add_one_uid_map(&s, payload_uid, host_uid, 1);
4201 if (r < 0)
4202 return r;
4203
4204 previous_uid = payload_uid + 1;
4205 }
4206
4207 /* And add a range to close the gap to finish the range */
4208 if (arg_uid_range > previous_uid) {
4209 r = add_one_uid_map(&s, previous_uid, arg_uid_shift + previous_uid, arg_uid_range - previous_uid);
4210 if (r < 0)
4211 return r;
4212 }
4213
4214 assert(s);
4215
4216 *ret = TAKE_PTR(s);
4217 return 0;
4218}
4219
4220static int setup_uid_map(
4221 pid_t pid,
4222 const uid_t bind_user_uid[],
4223 size_t n_bind_user_uid) {
4224
4225 char uid_map[STRLEN("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1];
4226 _cleanup_free_ char *s = NULL;
03cfe0d5
LP
4227 int r;
4228
4229 assert(pid > 1);
4230
2f893044
LP
4231 /* Build the UID map string */
4232 if (make_uid_map_string(bind_user_uid, n_bind_user_uid, 0, &s) < 0) /* offset=0 contains the UID pair */
4233 return log_oom();
4234
03cfe0d5 4235 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
2f893044 4236 r = write_string_file(uid_map, s, WRITE_STRING_FILE_DISABLE_BUFFER);
03cfe0d5
LP
4237 if (r < 0)
4238 return log_error_errno(r, "Failed to write UID map: %m");
4239
2f893044
LP
4240 /* And now build the GID map string */
4241 s = mfree(s);
4242 if (make_uid_map_string(bind_user_uid, n_bind_user_uid, 2, &s) < 0) /* offset=2 contains the GID pair */
4243 return log_oom();
4244
03cfe0d5 4245 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
2f893044 4246 r = write_string_file(uid_map, s, WRITE_STRING_FILE_DISABLE_BUFFER);
03cfe0d5
LP
4247 if (r < 0)
4248 return log_error_errno(r, "Failed to write GID map: %m");
4249
4250 return 0;
4251}
4252
9c1e04d0 4253static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
9c1e04d0
AP
4254 char buf[NOTIFY_BUFFER_MAX+1];
4255 char *p = NULL;
4256 struct iovec iovec = {
4257 .iov_base = buf,
4258 .iov_len = sizeof(buf)-1,
4259 };
fb29cdbe
LP
4260 CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred)) +
4261 CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)) control;
9c1e04d0
AP
4262 struct msghdr msghdr = {
4263 .msg_iov = &iovec,
4264 .msg_iovlen = 1,
4265 .msg_control = &control,
4266 .msg_controllen = sizeof(control),
4267 };
371d72e0 4268 struct ucred *ucred;
9c1e04d0
AP
4269 ssize_t n;
4270 pid_t inner_child_pid;
4271 _cleanup_strv_free_ char **tags = NULL;
4bf4f50f 4272 int r;
9c1e04d0
AP
4273
4274 assert(userdata);
4275
4276 inner_child_pid = PTR_TO_PID(userdata);
4277
4278 if (revents != EPOLLIN) {
4279 log_warning("Got unexpected poll event for notify fd.");
4280 return 0;
4281 }
4282
3691bcf3 4283 n = recvmsg_safe(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
bb44fd07
ZJS
4284 if (ERRNO_IS_NEG_TRANSIENT(n))
4285 return 0;
4286 else if (n == -EXFULL) {
4287 log_warning("Got message with truncated control data (too many fds sent?), ignoring.");
4288 return 0;
4289 } else if (n < 0)
3691bcf3 4290 return log_warning_errno(n, "Couldn't read notification socket: %m");
9c1e04d0 4291
9c1e04d0
AP
4292 cmsg_close_all(&msghdr);
4293
371d72e0 4294 ucred = CMSG_FIND_DATA(&msghdr, SOL_SOCKET, SCM_CREDENTIALS, struct ucred);
9c1e04d0 4295 if (!ucred || ucred->pid != inner_child_pid) {
8cb57430 4296 log_debug("Received notify message without valid credentials. Ignoring.");
9c1e04d0
AP
4297 return 0;
4298 }
4299
4300 if ((size_t) n >= sizeof(buf)) {
4301 log_warning("Received notify message exceeded maximum size. Ignoring.");
4302 return 0;
4303 }
4304
4305 buf[n] = 0;
4306 tags = strv_split(buf, "\n\r");
4307 if (!tags)
4308 return log_oom();
4309
d29cc4d6 4310 if (strv_contains(tags, "READY=1")) {
d4341b76 4311 r = sd_notify(false, "READY=1\n");
4bf4f50f
ZJS
4312 if (r < 0)
4313 log_warning_errno(r, "Failed to send readiness notification, ignoring: %m");
4314 }
9c1e04d0
AP
4315
4316 p = strv_find_startswith(tags, "STATUS=");
4317 if (p)
04f590a4 4318 (void) sd_notifyf(false, "STATUS=Container running: %s", p);
9c1e04d0
AP
4319
4320 return 0;
4321}
4322
e96ceaba 4323static int setup_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid, sd_event_source **notify_event_source) {
9c1e04d0 4324 int r;
9c1e04d0 4325
5773024d 4326 r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
9c1e04d0
AP
4327 if (r < 0)
4328 return log_error_errno(r, "Failed to allocate notify event source: %m");
4329
5773024d 4330 (void) sd_event_source_set_description(*notify_event_source, "nspawn-notify");
9c1e04d0
AP
4331
4332 return 0;
4333}
4334
5d961407
LP
4335static int merge_settings(Settings *settings, const char *path) {
4336 int rl;
f757855e 4337
5d961407
LP
4338 assert(settings);
4339 assert(path);
f757855e 4340
5d961407
LP
4341 /* Copy over bits from the settings, unless they have been explicitly masked by command line switches. Note
4342 * that this steals the fields of the Settings* structure, and hence modifies it. */
f757855e 4343
7732f92b
LP
4344 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
4345 settings->start_mode >= 0) {
4346 arg_start_mode = settings->start_mode;
130d3d22 4347 strv_free_and_replace(arg_parameters, settings->parameters);
f757855e
LP
4348 }
4349
d3689b94
LP
4350 if ((arg_settings_mask & SETTING_EPHEMERAL) == 0 &&
4351 settings->ephemeral >= 0)
a2f577fc
JL
4352 arg_ephemeral = settings->ephemeral;
4353
de40a303
LP
4354 if ((arg_settings_mask & SETTING_DIRECTORY) == 0 &&
4355 settings->root) {
4356
4357 if (!arg_settings_trusted)
4358 log_warning("Ignoring root directory setting, file %s is not trusted.", path);
4359 else
4360 free_and_replace(arg_directory, settings->root);
4361 }
4362
b53ede69
PW
4363 if ((arg_settings_mask & SETTING_PIVOT_ROOT) == 0 &&
4364 settings->pivot_root_new) {
4365 free_and_replace(arg_pivot_root_new, settings->pivot_root_new);
4366 free_and_replace(arg_pivot_root_old, settings->pivot_root_old);
4367 }
4368
5f932eb9 4369 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
1cc6c93a
YW
4370 settings->working_directory)
4371 free_and_replace(arg_chdir, settings->working_directory);
5f932eb9 4372
f757855e 4373 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
130d3d22
YW
4374 settings->environment)
4375 strv_free_and_replace(arg_setenv, settings->environment);
f757855e 4376
de40a303
LP
4377 if ((arg_settings_mask & SETTING_USER) == 0) {
4378
4379 if (settings->user)
4380 free_and_replace(arg_user, settings->user);
4381
4382 if (uid_is_valid(settings->uid))
4383 arg_uid = settings->uid;
4384 if (gid_is_valid(settings->gid))
4385 arg_gid = settings->gid;
4386 if (settings->n_supplementary_gids > 0) {
4387 free_and_replace(arg_supplementary_gids, settings->supplementary_gids);
4388 arg_n_supplementary_gids = settings->n_supplementary_gids;
4389 }
4390 }
f757855e
LP
4391
4392 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
a3fc6b55 4393 uint64_t plus, minus;
7be830c6 4394 uint64_t network_minus = 0;
88fc9c9b 4395 uint64_t ambient;
f757855e 4396
de40a303
LP
4397 /* Note that we copy both the simple plus/minus caps here, and the full quintet from the
4398 * Settings structure */
4399
0e265674 4400 plus = settings->capability;
a3fc6b55
LP
4401 minus = settings->drop_capability;
4402
9baa294c
LP
4403 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
4404 settings_network_configured(settings)) {
a3fc6b55
LP
4405 if (settings_private_network(settings))
4406 plus |= UINT64_C(1) << CAP_NET_ADMIN;
4407 else
7be830c6 4408 network_minus |= UINT64_C(1) << CAP_NET_ADMIN;
a3fc6b55 4409 }
0e265674
LP
4410
4411 if (!arg_settings_trusted && plus != 0) {
4412 if (settings->capability != 0)
5d961407 4413 log_warning("Ignoring Capability= setting, file %s is not trusted.", path);
7be830c6
TH
4414 } else {
4415 arg_caps_retain &= ~network_minus;
520e0d54 4416 arg_caps_retain |= plus;
7be830c6 4417 }
f757855e 4418
a3fc6b55 4419 arg_caps_retain &= ~minus;
de40a303
LP
4420
4421 /* Copy the full capabilities over too */
4422 if (capability_quintet_is_set(&settings->full_capabilities)) {
4423 if (!arg_settings_trusted)
5238e957 4424 log_warning("Ignoring capability settings, file %s is not trusted.", path);
de40a303
LP
4425 else
4426 arg_full_capabilities = settings->full_capabilities;
4427 }
88fc9c9b
TH
4428
4429 ambient = settings->ambient_capability;
4430 if (!arg_settings_trusted && ambient != 0)
4431 log_warning("Ignoring AmbientCapability= setting, file %s is not trusted.", path);
4432 else
4433 arg_caps_ambient |= ambient;
f757855e
LP
4434 }
4435
4436 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
4437 settings->kill_signal > 0)
4438 arg_kill_signal = settings->kill_signal;
4439
4440 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
4441 settings->personality != PERSONALITY_INVALID)
4442 arg_personality = settings->personality;
4443
4444 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
4445 !sd_id128_is_null(settings->machine_id)) {
4446
4447 if (!arg_settings_trusted)
5d961407 4448 log_warning("Ignoring MachineID= setting, file %s is not trusted.", path);
f757855e
LP
4449 else
4450 arg_uuid = settings->machine_id;
4451 }
4452
4453 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
4454 settings->read_only >= 0)
4455 arg_read_only = settings->read_only;
4456
4457 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
4458 settings->volatile_mode != _VOLATILE_MODE_INVALID)
4459 arg_volatile_mode = settings->volatile_mode;
4460
4461 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
4462 settings->n_custom_mounts > 0) {
4463
4464 if (!arg_settings_trusted)
5d961407 4465 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", path);
f757855e
LP
4466 else {
4467 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
1cc6c93a 4468 arg_custom_mounts = TAKE_PTR(settings->custom_mounts);
f757855e 4469 arg_n_custom_mounts = settings->n_custom_mounts;
f757855e
LP
4470 settings->n_custom_mounts = 0;
4471 }
4472 }
4473
4474 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
a1dfd585 4475 settings_network_configured(settings)) {
f757855e
LP
4476
4477 if (!arg_settings_trusted)
5d961407 4478 log_warning("Ignoring network settings, file %s is not trusted.", path);
f757855e 4479 else {
f6d6bad1 4480 arg_network_veth = settings_network_veth(settings);
0e265674
LP
4481 arg_private_network = settings_private_network(settings);
4482
130d3d22
YW
4483 strv_free_and_replace(arg_network_interfaces, settings->network_interfaces);
4484 strv_free_and_replace(arg_network_macvlan, settings->network_macvlan);
4485 strv_free_and_replace(arg_network_ipvlan, settings->network_ipvlan);
4486 strv_free_and_replace(arg_network_veth_extra, settings->network_veth_extra);
f6d6bad1 4487
1cc6c93a
YW
4488 free_and_replace(arg_network_bridge, settings->network_bridge);
4489 free_and_replace(arg_network_zone, settings->network_zone);
de40a303
LP
4490
4491 free_and_replace(arg_network_namespace_path, settings->network_namespace_path);
f757855e
LP
4492 }
4493 }
4494
4495 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
4496 settings->expose_ports) {
4497
4498 if (!arg_settings_trusted)
5d961407 4499 log_warning("Ignoring Port= setting, file %s is not trusted.", path);
f757855e
LP
4500 else {
4501 expose_port_free_all(arg_expose_ports);
1cc6c93a 4502 arg_expose_ports = TAKE_PTR(settings->expose_ports);
f757855e
LP
4503 }
4504 }
4505
0de7acce
LP
4506 if ((arg_settings_mask & SETTING_USERNS) == 0 &&
4507 settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
4508
4509 if (!arg_settings_trusted)
5d961407 4510 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", path);
0de7acce
LP
4511 else {
4512 arg_userns_mode = settings->userns_mode;
4513 arg_uid_shift = settings->uid_shift;
4514 arg_uid_range = settings->uid_range;
6c045a99 4515 arg_userns_ownership = settings->userns_ownership;
0de7acce
LP
4516 }
4517 }
4518
0cc3c9f9
LP
4519 if ((arg_settings_mask & SETTING_BIND_USER) == 0 &&
4520 !strv_isempty(settings->bind_user))
2f893044
LP
4521 strv_free_and_replace(arg_bind_user, settings->bind_user);
4522
d3689b94
LP
4523 if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0 &&
4524 settings->notify_ready >= 0)
9c1e04d0
AP
4525 arg_notify_ready = settings->notify_ready;
4526
960e4569
LP
4527 if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) {
4528
2d09ea44
LP
4529 if (!strv_isempty(settings->syscall_allow_list) || !strv_isempty(settings->syscall_deny_list)) {
4530 if (!arg_settings_trusted && !strv_isempty(settings->syscall_allow_list))
4531 log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", path);
4532 else {
4533 strv_free_and_replace(arg_syscall_allow_list, settings->syscall_allow_list);
4534 strv_free_and_replace(arg_syscall_deny_list, settings->syscall_deny_list);
4535 }
960e4569 4536 }
de40a303
LP
4537
4538#if HAVE_SECCOMP
2d09ea44
LP
4539 if (settings->seccomp) {
4540 if (!arg_settings_trusted)
4541 log_warning("Ignoring SECCOMP filter, file %s is not trusted.", path);
4542 else {
4543 seccomp_release(arg_seccomp);
4544 arg_seccomp = TAKE_PTR(settings->seccomp);
4545 }
de40a303
LP
4546 }
4547#endif
960e4569
LP
4548 }
4549
bf428efb
LP
4550 for (rl = 0; rl < _RLIMIT_MAX; rl ++) {
4551 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)))
4552 continue;
4553
4554 if (!settings->rlimit[rl])
4555 continue;
4556
4557 if (!arg_settings_trusted) {
5d961407 4558 log_warning("Ignoring Limit%s= setting, file '%s' is not trusted.", rlimit_to_string(rl), path);
bf428efb
LP
4559 continue;
4560 }
4561
4562 free_and_replace(arg_rlimit[rl], settings->rlimit[rl]);
4563 }
4564
3a9530e5
LP
4565 if ((arg_settings_mask & SETTING_HOSTNAME) == 0 &&
4566 settings->hostname)
4567 free_and_replace(arg_hostname, settings->hostname);
4568
66edd963
LP
4569 if ((arg_settings_mask & SETTING_NO_NEW_PRIVILEGES) == 0 &&
4570 settings->no_new_privileges >= 0)
4571 arg_no_new_privileges = settings->no_new_privileges;
4572
81f345df
LP
4573 if ((arg_settings_mask & SETTING_OOM_SCORE_ADJUST) == 0 &&
4574 settings->oom_score_adjust_set) {
4575
4576 if (!arg_settings_trusted)
5d961407 4577 log_warning("Ignoring OOMScoreAdjust= setting, file '%s' is not trusted.", path);
81f345df
LP
4578 else {
4579 arg_oom_score_adjust = settings->oom_score_adjust;
4580 arg_oom_score_adjust_set = true;
4581 }
4582 }
4583
d107bb7d 4584 if ((arg_settings_mask & SETTING_CPU_AFFINITY) == 0 &&
0985c7c4 4585 settings->cpu_set.set) {
d107bb7d
LP
4586
4587 if (!arg_settings_trusted)
5d961407 4588 log_warning("Ignoring CPUAffinity= setting, file '%s' is not trusted.", path);
d107bb7d 4589 else {
0985c7c4 4590 cpu_set_reset(&arg_cpu_set);
088d71f8 4591 arg_cpu_set = TAKE_STRUCT(settings->cpu_set);
d107bb7d
LP
4592 }
4593 }
4594
09d423e9
LP
4595 if ((arg_settings_mask & SETTING_RESOLV_CONF) == 0 &&
4596 settings->resolv_conf != _RESOLV_CONF_MODE_INVALID)
4597 arg_resolv_conf = settings->resolv_conf;
4598
4e1d6aa9
LP
4599 if ((arg_settings_mask & SETTING_LINK_JOURNAL) == 0 &&
4600 settings->link_journal != _LINK_JOURNAL_INVALID) {
4601
4602 if (!arg_settings_trusted)
4603 log_warning("Ignoring journal link setting, file '%s' is not trusted.", path);
4604 else {
4605 arg_link_journal = settings->link_journal;
4606 arg_link_journal_try = settings->link_journal_try;
4607 }
4608 }
4609
1688841f
LP
4610 if ((arg_settings_mask & SETTING_TIMEZONE) == 0 &&
4611 settings->timezone != _TIMEZONE_MODE_INVALID)
4612 arg_timezone = settings->timezone;
4613
de40a303
LP
4614 if ((arg_settings_mask & SETTING_SLICE) == 0 &&
4615 settings->slice) {
4616
4617 if (!arg_settings_trusted)
4618 log_warning("Ignoring slice setting, file '%s' is not trusted.", path);
4619 else
4620 free_and_replace(arg_slice, settings->slice);
4621 }
4622
4623 if ((arg_settings_mask & SETTING_USE_CGNS) == 0 &&
4624 settings->use_cgns >= 0) {
4625
4626 if (!arg_settings_trusted)
4627 log_warning("Ignoring cgroup namespace setting, file '%s' is not trusted.", path);
4628 else
4629 arg_use_cgns = settings->use_cgns;
4630 }
4631
4632 if ((arg_settings_mask & SETTING_CLONE_NS_FLAGS) == 0 &&
f5fbe71d 4633 settings->clone_ns_flags != ULONG_MAX) {
de40a303
LP
4634
4635 if (!arg_settings_trusted)
4636 log_warning("Ignoring namespace setting, file '%s' is not trusted.", path);
4637 else
4638 arg_clone_ns_flags = settings->clone_ns_flags;
4639 }
4640
4641 if ((arg_settings_mask & SETTING_CONSOLE_MODE) == 0 &&
4642 settings->console_mode >= 0) {
4643
4644 if (!arg_settings_trusted)
4645 log_warning("Ignoring console mode setting, file '%s' is not trusted.", path);
4646 else
4647 arg_console_mode = settings->console_mode;
4648 }
4649
d3689b94
LP
4650 if ((arg_settings_mask & SETTING_SUPPRESS_SYNC) == 0 &&
4651 settings->suppress_sync >= 0)
4a4654e0
LP
4652 arg_suppress_sync = settings->suppress_sync;
4653
de40a303
LP
4654 /* The following properties can only be set through the OCI settings logic, not from the command line, hence we
4655 * don't consult arg_settings_mask for them. */
4656
4657 sd_bus_message_unref(arg_property_message);
4658 arg_property_message = TAKE_PTR(settings->properties);
4659
4660 arg_console_width = settings->console_width;
4661 arg_console_height = settings->console_height;
4662
b2645747 4663 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
de40a303
LP
4664 arg_extra_nodes = TAKE_PTR(settings->extra_nodes);
4665 arg_n_extra_nodes = settings->n_extra_nodes;
825210d4 4666 settings->n_extra_nodes = 0;
de40a303 4667
f757855e
LP
4668 return 0;
4669}
4670
5d961407
LP
4671static int load_settings(void) {
4672 _cleanup_(settings_freep) Settings *settings = NULL;
4673 _cleanup_fclose_ FILE *f = NULL;
3603f151 4674 _cleanup_free_ char *p = NULL;
5d961407
LP
4675 int r;
4676
de40a303
LP
4677 if (arg_oci_bundle)
4678 return 0;
4679
5d961407
LP
4680 /* If all settings are masked, there's no point in looking for
4681 * the settings file */
d7a0f1f4 4682 if (FLAGS_SET(arg_settings_mask, _SETTINGS_MASK_ALL))
5d961407
LP
4683 return 0;
4684
5d961407
LP
4685 /* We first look in the admin's directories in /etc and /run */
4686 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
4687 _cleanup_free_ char *j = NULL;
4688
3603f151 4689 j = path_join(i, arg_settings_filename);
5d961407
LP
4690 if (!j)
4691 return log_oom();
4692
4693 f = fopen(j, "re");
4694 if (f) {
4695 p = TAKE_PTR(j);
4696
4697 /* By default, we trust configuration from /etc and /run */
4698 if (arg_settings_trusted < 0)
4699 arg_settings_trusted = true;
4700
4701 break;
4702 }
4703
4704 if (errno != ENOENT)
4705 return log_error_errno(errno, "Failed to open %s: %m", j);
4706 }
4707
4708 if (!f) {
4709 /* After that, let's look for a file next to the
4710 * actual image we shall boot. */
4711
4712 if (arg_image) {
162f6477
LP
4713 r = file_in_same_dir(arg_image, arg_settings_filename, &p);
4714 if (r < 0)
4715 return log_error_errno(r, "Failed to generate settings path from image path: %m");
4716 } else if (arg_directory) {
4717 r = file_in_same_dir(arg_directory, arg_settings_filename, &p);
4718 if (r < 0 && r != -EADDRNOTAVAIL) /* if directory is root fs, don't complain */
4719 return log_error_errno(r, "Failed to generate settings path from directory path: %m");
5d961407
LP
4720 }
4721
4722 if (p) {
4723 f = fopen(p, "re");
4724 if (!f && errno != ENOENT)
4725 return log_error_errno(errno, "Failed to open %s: %m", p);
4726
4727 /* By default, we do not trust configuration from /var/lib/machines */
4728 if (arg_settings_trusted < 0)
4729 arg_settings_trusted = false;
4730 }
4731 }
4732
4733 if (!f)
4734 return 0;
4735
4736 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
4737
4738 r = settings_load(f, p, &settings);
4739 if (r < 0)
4740 return r;
4741
4742 return merge_settings(settings, p);
4743}
4744
de40a303
LP
4745static int load_oci_bundle(void) {
4746 _cleanup_(settings_freep) Settings *settings = NULL;
4747 int r;
4748
4749 if (!arg_oci_bundle)
4750 return 0;
4751
4752 /* By default let's trust OCI bundles */
4753 if (arg_settings_trusted < 0)
4754 arg_settings_trusted = true;
4755
4756 r = oci_load(NULL, arg_oci_bundle, &settings);
4757 if (r < 0)
4758 return r;
4759
4760 return merge_settings(settings, arg_oci_bundle);
4761}
4762
3acc84eb 4763static int run_container(
2d845785 4764 DissectedImage *dissected_image,
b0067625
ZJS
4765 FDSet *fds,
4766 char veth_name[IFNAMSIZ], bool *veth_created,
761cf19d 4767 struct ExposeArgs *expose_args,
3acc84eb 4768 int *master, pid_t *pid, int *ret) {
b0067625
ZJS
4769
4770 static const struct sigaction sa = {
4771 .sa_handler = nop_signal_handler,
e28c7cd0 4772 .sa_flags = SA_NOCLDSTOP|SA_RESTART,
b0067625
ZJS
4773 };
4774
8e766630 4775 _cleanup_(release_lock_file) LockFile uid_shift_lock = LOCK_FILE_INIT;
5bb1d7fb 4776 _cleanup_close_ int etc_passwd_lock = -EBADF;
b0067625 4777 _cleanup_close_pair_ int
71136404
LP
4778 fd_inner_socket_pair[2] = EBADF_PAIR,
4779 fd_outer_socket_pair[2] = EBADF_PAIR;
8199d554 4780
5bb1d7fb 4781 _cleanup_close_ int notify_socket = -EBADF, mntns_fd = -EBADF, fd_kmsg_fifo = -EBADF;
b0067625 4782 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
5773024d 4783 _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL;
b0067625
ZJS
4784 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
4785 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4786 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
abdb9b08 4787 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
2f893044
LP
4788 _cleanup_free_ uid_t *bind_user_uid = NULL;
4789 size_t n_bind_user_uid = 0;
b0067625 4790 ContainerStatus container_status = 0;
b0067625
ZJS
4791 int ifi = 0, r;
4792 ssize_t l;
4793 sigset_t mask_chld;
254d1313 4794 _cleanup_close_ int child_netns_fd = -EBADF;
b0067625
ZJS
4795
4796 assert_se(sigemptyset(&mask_chld) == 0);
4797 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
4798
4799 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4800 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
4801 * check with getpwuid() if the specific user already exists. Note that /etc might be
4802 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
4803 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
4804 * really just an extra safety net. We kinda assume that the UID range we allocate from is
4805 * really ours. */
4806
4807 etc_passwd_lock = take_etc_passwd_lock(NULL);
4808 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS)
4809 return log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m");
4810 }
4811
4812 r = barrier_create(&barrier);
4813 if (r < 0)
4814 return log_error_errno(r, "Cannot initialize IPC barrier: %m");
4815
5d9d3fcb
CB
4816 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, fd_inner_socket_pair) < 0)
4817 return log_error_errno(errno, "Failed to create inner socket pair: %m");
4818
af06cd30
CB
4819 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, fd_outer_socket_pair) < 0)
4820 return log_error_errno(errno, "Failed to create outer socket pair: %m");
b0067625 4821
b0067625
ZJS
4822 /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
4823 * parent's blocking calls and give it a chance to call wait() and terminate. */
4824 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
4825 if (r < 0)
4826 return log_error_errno(errno, "Failed to change the signal mask: %m");
4827
4828 r = sigaction(SIGCHLD, &sa, NULL);
4829 if (r < 0)
4830 return log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
4831
d7bea6b6 4832 if (arg_network_namespace_path) {
5b4855ab
DDM
4833 child_netns_fd = open(arg_network_namespace_path, O_RDONLY|O_NOCTTY|O_CLOEXEC);
4834 if (child_netns_fd < 0)
d7bea6b6
DP
4835 return log_error_errno(errno, "Cannot open file %s: %m", arg_network_namespace_path);
4836
54c2459d 4837 r = fd_is_ns(child_netns_fd, CLONE_NEWNET);
6619ad88
LP
4838 if (r == -EUCLEAN)
4839 log_debug_errno(r, "Cannot determine if passed network namespace path '%s' really refers to a network namespace, assuming it does.", arg_network_namespace_path);
4840 else if (r < 0)
d7bea6b6 4841 return log_error_errno(r, "Failed to check %s fs type: %m", arg_network_namespace_path);
c6147113
LP
4842 else if (r == 0)
4843 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
4844 "Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path);
d7bea6b6
DP
4845 }
4846
b0067625
ZJS
4847 *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
4848 if (*pid < 0)
4849 return log_error_errno(errno, "clone() failed%s: %m",
4850 errno == EINVAL ?
4851 ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
4852
4853 if (*pid == 0) {
4854 /* The outer child only has a file system namespace. */
4855 barrier_set_role(&barrier, BARRIER_CHILD);
4856
5d9d3fcb 4857 fd_inner_socket_pair[0] = safe_close(fd_inner_socket_pair[0]);
af06cd30 4858 fd_outer_socket_pair[0] = safe_close(fd_outer_socket_pair[0]);
b0067625
ZJS
4859
4860 (void) reset_all_signal_handlers();
4861 (void) reset_signal_mask();
4862
4863 r = outer_child(&barrier,
4864 arg_directory,
2d845785 4865 dissected_image,
af06cd30 4866 fd_outer_socket_pair[1],
5d9d3fcb 4867 fd_inner_socket_pair[1],
d7bea6b6 4868 fds,
5b4855ab 4869 child_netns_fd);
b0067625
ZJS
4870 if (r < 0)
4871 _exit(EXIT_FAILURE);
4872
4873 _exit(EXIT_SUCCESS);
4874 }
4875
4876 barrier_set_role(&barrier, BARRIER_PARENT);
4877
e4077ff6 4878 fdset_close(fds);
b0067625 4879
5d9d3fcb 4880 fd_inner_socket_pair[1] = safe_close(fd_inner_socket_pair[1]);
af06cd30 4881 fd_outer_socket_pair[1] = safe_close(fd_outer_socket_pair[1]);
b0067625
ZJS
4882
4883 if (arg_userns_mode != USER_NAMESPACE_NO) {
af06cd30 4884 mntns_fd = receive_one_fd(fd_outer_socket_pair[0], 0);
b71a0192
CB
4885 if (mntns_fd < 0)
4886 return log_error_errno(mntns_fd, "Failed to receive mount namespace fd from outer child: %m");
4887
b0067625 4888 /* The child just let us know the UID shift it might have read from the image. */
af06cd30 4889 l = recv(fd_outer_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
b0067625
ZJS
4890 if (l < 0)
4891 return log_error_errno(errno, "Failed to read UID shift: %m");
c6147113
LP
4892 if (l != sizeof arg_uid_shift)
4893 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading UID shift.");
b0067625
ZJS
4894
4895 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4896 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
4897 * image, but if that's already in use, pick a new one, and report back to the child,
4898 * which one we now picked. */
4899
4900 r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
4901 if (r < 0)
4902 return log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
4903
af06cd30 4904 l = send(fd_outer_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
b0067625
ZJS
4905 if (l < 0)
4906 return log_error_errno(errno, "Failed to send UID shift: %m");
c6147113
LP
4907 if (l != sizeof arg_uid_shift)
4908 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short write while writing UID shift.");
b0067625 4909 }
2f893044
LP
4910
4911 n_bind_user_uid = strv_length(arg_bind_user);
4912 if (n_bind_user_uid > 0) {
4913 /* Right after the UID shift, we'll receive the list of UID mappings for the
4914 * --bind-user= logic. Always a quadruplet of payload and host UID + GID. */
4915
4916 bind_user_uid = new(uid_t, n_bind_user_uid*4);
4917 if (!bind_user_uid)
4918 return log_oom();
4919
4920 for (size_t i = 0; i < n_bind_user_uid; i++) {
af06cd30 4921 l = recv(fd_outer_socket_pair[0], bind_user_uid + i*4, sizeof(uid_t)*4, 0);
2f893044
LP
4922 if (l < 0)
4923 return log_error_errno(errno, "Failed to read user UID map pair: %m");
4924 if (l != sizeof(uid_t)*4)
4925 return log_full_errno(l == 0 ? LOG_DEBUG : LOG_WARNING,
4926 SYNTHETIC_ERRNO(EIO),
4927 "Short read while reading bind user UID pairs.");
4928 }
4929 }
b0067625
ZJS
4930 }
4931
8199d554
LP
4932 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
4933 /* The child let us know the support cgroup mode it might have read from the image. */
fefb7a6d 4934 l = recv(fd_outer_socket_pair[0], &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), 0);
8199d554
LP
4935 if (l < 0)
4936 return log_error_errno(errno, "Failed to read cgroup mode: %m");
c6147113 4937 if (l != sizeof(arg_unified_cgroup_hierarchy))
c0f86d66 4938 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading cgroup mode (%zi bytes).%s",
c6147113 4939 l, l == 0 ? " The child is most likely dead." : "");
8199d554
LP
4940 }
4941
b0067625 4942 /* Wait for the outer child. */
d2e0ac3d
LP
4943 r = wait_for_terminate_and_check("(sd-namespace)", *pid, WAIT_LOG_ABNORMAL);
4944 if (r < 0)
4945 return r;
4946 if (r != EXIT_SUCCESS)
4947 return -EIO;
b0067625
ZJS
4948
4949 /* And now retrieve the PID of the inner child. */
af06cd30 4950 l = recv(fd_outer_socket_pair[0], pid, sizeof *pid, 0);
b0067625
ZJS
4951 if (l < 0)
4952 return log_error_errno(errno, "Failed to read inner child PID: %m");
c6147113
LP
4953 if (l != sizeof *pid)
4954 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading inner child PID.");
b0067625
ZJS
4955
4956 /* We also retrieve container UUID in case it was generated by outer child */
af06cd30 4957 l = recv(fd_outer_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
b0067625
ZJS
4958 if (l < 0)
4959 return log_error_errno(errno, "Failed to read container machine ID: %m");
c6147113
LP
4960 if (l != sizeof(arg_uuid))
4961 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading container machined ID.");
b0067625
ZJS
4962
4963 /* We also retrieve the socket used for notifications generated by outer child */
af06cd30 4964 notify_socket = receive_one_fd(fd_outer_socket_pair[0], 0);
b0067625
ZJS
4965 if (notify_socket < 0)
4966 return log_error_errno(notify_socket,
4967 "Failed to receive notification socket from the outer child: %m");
4968
4969 log_debug("Init process invoked as PID "PID_FMT, *pid);
4970
4971 if (arg_userns_mode != USER_NAMESPACE_NO) {
c6147113
LP
4972 if (!barrier_place_and_sync(&barrier)) /* #1 */
4973 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
b0067625 4974
2f893044 4975 r = setup_uid_map(*pid, bind_user_uid, n_bind_user_uid);
b0067625
ZJS
4976 if (r < 0)
4977 return r;
4978
4979 (void) barrier_place(&barrier); /* #2 */
4980 }
4981
4982 if (arg_private_network) {
75116558
PS
4983 if (!arg_network_namespace_path) {
4984 /* Wait until the child has unshared its network namespace. */
c6147113
LP
4985 if (!barrier_place_and_sync(&barrier)) /* #3 */
4986 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early");
75116558
PS
4987 }
4988
5b4855ab
DDM
4989 if (child_netns_fd < 0) {
4990 /* Make sure we have an open file descriptor to the child's network
4991 * namespace so it stays alive even if the child exits. */
4992 r = namespace_open(*pid, NULL, NULL, &child_netns_fd, NULL, NULL);
4993 if (r < 0)
4994 return log_error_errno(r, "Failed to open child network namespace: %m");
4995 }
4996
4997 r = move_network_interfaces(child_netns_fd, arg_network_interfaces);
b0067625
ZJS
4998 if (r < 0)
4999 return r;
5000
5001 if (arg_network_veth) {
5002 r = setup_veth(arg_machine, *pid, veth_name,
813dbff4 5003 arg_network_bridge || arg_network_zone, &arg_network_provided_mac);
b0067625
ZJS
5004 if (r < 0)
5005 return r;
5006 else if (r > 0)
5007 ifi = r;
5008
5009 if (arg_network_bridge) {
5010 /* Add the interface to a bridge */
5011 r = setup_bridge(veth_name, arg_network_bridge, false);
5012 if (r < 0)
5013 return r;
5014 if (r > 0)
5015 ifi = r;
5016 } else if (arg_network_zone) {
5017 /* Add the interface to a bridge, possibly creating it */
5018 r = setup_bridge(veth_name, arg_network_zone, true);
5019 if (r < 0)
5020 return r;
5021 if (r > 0)
5022 ifi = r;
5023 }
5024 }
5025
5026 r = setup_veth_extra(arg_machine, *pid, arg_network_veth_extra);
5027 if (r < 0)
5028 return r;
5029
5030 /* We created the primary and extra veth links now; let's remember this, so that we know to
5031 remove them later on. Note that we don't bother with removing veth links that were created
5032 here when their setup failed half-way, because in that case the kernel should be able to
5033 remove them on its own, since they cannot be referenced by anything yet. */
5034 *veth_created = true;
5035
5036 r = setup_macvlan(arg_machine, *pid, arg_network_macvlan);
5037 if (r < 0)
5038 return r;
5039
5040 r = setup_ipvlan(arg_machine, *pid, arg_network_ipvlan);
5041 if (r < 0)
5042 return r;
5043 }
5044
abdb9b08
LP
5045 if (arg_register || !arg_keep_unit) {
5046 r = sd_bus_default_system(&bus);
5047 if (r < 0)
5048 return log_error_errno(r, "Failed to open system bus: %m");
e5a2d8b5
LP
5049
5050 r = sd_bus_set_close_on_exit(bus, false);
5051 if (r < 0)
5052 return log_error_errno(r, "Failed to disable close-on-exit behaviour: %m");
abdb9b08
LP
5053 }
5054
5055 if (!arg_keep_unit) {
5056 /* When a new scope is created for this container, then we'll be registered as its controller, in which
5057 * case PID 1 will send us a friendly RequestStop signal, when it is asked to terminate the
5058 * scope. Let's hook into that, and cleanly shut down the container, and print a friendly message. */
5059
75152a4d
LP
5060 r = sd_bus_match_signal_async(
5061 bus,
5062 NULL,
5063 "org.freedesktop.systemd1",
5064 NULL,
5065 "org.freedesktop.systemd1.Scope",
5066 "RequestStop",
5067 on_request_stop, NULL, PID_TO_PTR(*pid));
abdb9b08 5068 if (r < 0)
75152a4d 5069 return log_error_errno(r, "Failed to request RequestStop match: %m");
abdb9b08
LP
5070 }
5071
b0067625
ZJS
5072 if (arg_register) {
5073 r = register_machine(
abdb9b08 5074 bus,
b0067625
ZJS
5075 arg_machine,
5076 *pid,
5077 arg_directory,
5078 arg_uuid,
5079 ifi,
5080 arg_slice,
5081 arg_custom_mounts, arg_n_custom_mounts,
5082 arg_kill_signal,
5083 arg_property,
de40a303 5084 arg_property_message,
b0067625 5085 arg_keep_unit,
411d8c72
NR
5086 arg_container_service_name,
5087 arg_start_mode);
b0067625
ZJS
5088 if (r < 0)
5089 return r;
abdb9b08 5090
cd2dfc6f
LP
5091 } else if (!arg_keep_unit) {
5092 r = allocate_scope(
abdb9b08 5093 bus,
cd2dfc6f
LP
5094 arg_machine,
5095 *pid,
5096 arg_slice,
5097 arg_custom_mounts, arg_n_custom_mounts,
5098 arg_kill_signal,
de40a303 5099 arg_property,
7eda208f 5100 arg_property_message,
411d8c72
NR
5101 /* allow_pidfds= */ true,
5102 arg_start_mode);
cd2dfc6f
LP
5103 if (r < 0)
5104 return r;
5105
5106 } else if (arg_slice || arg_property)
5107 log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect.");
b0067625 5108
27da7ef0 5109 r = create_subcgroup(*pid, arg_keep_unit, arg_unified_cgroup_hierarchy);
b0067625
ZJS
5110 if (r < 0)
5111 return r;
5112
27da7ef0 5113 r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
720f0a2f
LP
5114 if (r < 0)
5115 return r;
b0067625 5116
de54e02d 5117 r = chown_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
b0067625
ZJS
5118 if (r < 0)
5119 return r;
5120
5121 /* Notify the child that the parent is ready with all
5122 * its setup (including cgroup-ification), and that
5123 * the child can now hand over control to the code to
5124 * run inside the container. */
75116558 5125 (void) barrier_place(&barrier); /* #4 */
b0067625
ZJS
5126
5127 /* Block SIGCHLD here, before notifying child.
5128 * process_pty() will handle it with the other signals. */
5129 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
5130
5131 /* Reset signal to default */
9c274488 5132 r = default_signals(SIGCHLD);
b0067625
ZJS
5133 if (r < 0)
5134 return log_error_errno(r, "Failed to reset SIGCHLD: %m");
5135
5136 r = sd_event_new(&event);
5137 if (r < 0)
5138 return log_error_errno(r, "Failed to get default event source: %m");
5139
8fd010bb
LP
5140 (void) sd_event_set_watchdog(event, true);
5141
abdb9b08
LP
5142 if (bus) {
5143 r = sd_bus_attach_event(bus, event, 0);
5144 if (r < 0)
5145 return log_error_errno(r, "Failed to attach bus to event loop: %m");
5146 }
5147
e96ceaba 5148 r = setup_notify_parent(event, notify_socket, PID_TO_PTR(*pid), &notify_event_source);
b0067625
ZJS
5149 if (r < 0)
5150 return r;
5151
1a8d7814
LP
5152 /* Wait that the child is completely ready now, and has mounted their own copies of procfs and so on,
5153 * before we take the fully visible instances away. */
5154 if (!barrier_sync(&barrier)) /* #5.1 */
5155 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
5156
b71a0192
CB
5157 if (arg_userns_mode != USER_NAMESPACE_NO) {
5158 r = wipe_fully_visible_fs(mntns_fd);
5159 if (r < 0)
5160 return r;
5161 mntns_fd = safe_close(mntns_fd);
5162 }
5163
1a8d7814
LP
5164 /* And now let the child know that we completed removing the procfs instances, and it can start the
5165 * payload. */
5166 if (!barrier_place(&barrier)) /* #5.2 */
c6147113 5167 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
b0067625 5168
38ccb557 5169 /* At this point we have made use of the UID we picked, and thus nss-systemd/systemd-machined.service
b0067625
ZJS
5170 * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
5171 etc_passwd_lock = safe_close(etc_passwd_lock);
5172
04f590a4
LP
5173 (void) sd_notifyf(false,
5174 "STATUS=Container running.\n"
5175 "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
4bf4f50f
ZJS
5176 if (!arg_notify_ready) {
5177 r = sd_notify(false, "READY=1\n");
5178 if (r < 0)
5179 log_warning_errno(r, "Failed to send readiness notification, ignoring: %m");
5180 }
b0067625
ZJS
5181
5182 if (arg_kill_signal > 0) {
5183 /* Try to kill the init system on SIGINT or SIGTERM */
919f5ae0
LP
5184 (void) sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
5185 (void) sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
b0067625
ZJS
5186 } else {
5187 /* Immediately exit */
919f5ae0
LP
5188 (void) sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
5189 (void) sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
b0067625
ZJS
5190 }
5191
988851b6
LP
5192 (void) sd_event_add_signal(event, NULL, SIGRTMIN+18, sigrtmin18_handler, NULL);
5193
5194 r = sd_event_add_memory_pressure(event, NULL, NULL, NULL);
5195 if (r < 0)
5196 log_debug_errno(r, "Failed allocate memory pressure event source, ignoring: %m");
5197
6916b164 5198 /* Exit when the child exits */
919f5ae0 5199 (void) sd_event_add_signal(event, NULL, SIGCHLD, on_sigchld, PID_TO_PTR(*pid));
b0067625 5200
b07ee903
CB
5201 /* Retrieve the kmsg fifo allocated by inner child */
5202 fd_kmsg_fifo = receive_one_fd(fd_inner_socket_pair[0], 0);
5203 if (fd_kmsg_fifo < 0)
5204 return log_error_errno(fd_kmsg_fifo, "Failed to receive kmsg fifo from inner child: %m");
5205
b0067625 5206 if (arg_expose_ports) {
b07ee903 5207 r = expose_port_watch_rtnl(event, fd_inner_socket_pair[0], on_address_change, expose_args, &rtnl);
b0067625
ZJS
5208 if (r < 0)
5209 return r;
5210
deff68e7
FW
5211 (void) expose_port_execute(rtnl, &expose_args->fw_ctx, arg_expose_ports, AF_INET, &expose_args->address4);
5212 (void) expose_port_execute(rtnl, &expose_args->fw_ctx, arg_expose_ports, AF_INET6, &expose_args->address6);
b0067625
ZJS
5213 }
5214
3acc84eb 5215 if (arg_console_mode != CONSOLE_PIPE) {
254d1313 5216 _cleanup_close_ int fd = -EBADF;
3acc84eb 5217 PTYForwardFlags flags = 0;
de40a303 5218
3acc84eb 5219 /* Retrieve the master pty allocated by inner child */
bb1aa185 5220 fd = receive_one_fd(fd_inner_socket_pair[0], 0);
3acc84eb
FB
5221 if (fd < 0)
5222 return log_error_errno(fd, "Failed to receive master pty from the inner child: %m");
5223
5224 switch (arg_console_mode) {
de40a303 5225
3acc84eb
FB
5226 case CONSOLE_READ_ONLY:
5227 flags |= PTY_FORWARD_READ_ONLY;
5228
5229 _fallthrough_;
5230
5231 case CONSOLE_INTERACTIVE:
5232 flags |= PTY_FORWARD_IGNORE_VHANGUP;
5233
5234 r = pty_forward_new(event, fd, flags, &forward);
5235 if (r < 0)
5236 return log_error_errno(r, "Failed to create PTY forwarder: %m");
5237
f5fbe71d 5238 if (arg_console_width != UINT_MAX || arg_console_height != UINT_MAX)
3acc84eb
FB
5239 (void) pty_forward_set_width_height(forward,
5240 arg_console_width,
5241 arg_console_height);
5242 break;
5243
5244 default:
5245 assert(arg_console_mode == CONSOLE_PASSIVE);
5246 }
5247
5248 *master = TAKE_FD(fd);
de40a303 5249 }
b0067625 5250
5d9d3fcb
CB
5251 fd_inner_socket_pair[0] = safe_close(fd_inner_socket_pair[0]);
5252
b0067625
ZJS
5253 r = sd_event_loop(event);
5254 if (r < 0)
5255 return log_error_errno(r, "Failed to run event loop: %m");
5256
de40a303
LP
5257 if (forward) {
5258 char last_char = 0;
b0067625 5259
de40a303
LP
5260 (void) pty_forward_get_last_char(forward, &last_char);
5261 forward = pty_forward_free(forward);
b0067625 5262
de40a303
LP
5263 if (!arg_quiet && last_char != '\n')
5264 putc('\n', stdout);
5265 }
b0067625
ZJS
5266
5267 /* Kill if it is not dead yet anyway */
0bb0a9fa
ZJS
5268 if (!arg_register && !arg_keep_unit && bus)
5269 terminate_scope(bus, arg_machine);
b0067625
ZJS
5270
5271 /* Normally redundant, but better safe than sorry */
c67b0082 5272 (void) kill(*pid, SIGKILL);
b0067625 5273
5d9d3fcb
CB
5274 fd_kmsg_fifo = safe_close(fd_kmsg_fifo);
5275
5b4855ab
DDM
5276 if (arg_private_network) {
5277 /* Move network interfaces back to the parent network namespace. We use `safe_fork`
5278 * to avoid having to move the parent to the child network namespace. */
e9ccae31 5279 r = safe_fork(NULL, FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM|FORK_WAIT|FORK_LOG, NULL);
5b4855ab
DDM
5280 if (r < 0)
5281 return r;
5282
5283 if (r == 0) {
254d1313 5284 _cleanup_close_ int parent_netns_fd = -EBADF;
5b4855ab 5285
19b761a0 5286 r = namespace_open(getpid_cached(), NULL, NULL, &parent_netns_fd, NULL, NULL);
5b4855ab
DDM
5287 if (r < 0) {
5288 log_error_errno(r, "Failed to open parent network namespace: %m");
5289 _exit(EXIT_FAILURE);
5290 }
5291
5292 r = namespace_enter(-1, -1, child_netns_fd, -1, -1);
5293 if (r < 0) {
5294 log_error_errno(r, "Failed to enter child network namespace: %m");
5295 _exit(EXIT_FAILURE);
5296 }
5297
2f091b1b
TM
5298 /* Reverse network interfaces pair list so that interfaces get their initial name back.
5299 * This is about ensuring interfaces get their old name back when being moved back. */
5300 arg_network_interfaces = strv_reverse(arg_network_interfaces);
5301
5b4855ab
DDM
5302 r = move_network_interfaces(parent_netns_fd, arg_network_interfaces);
5303 if (r < 0)
5304 log_error_errno(r, "Failed to move network interfaces back to parent network namespace: %m");
5305
5306 _exit(r < 0 ? EXIT_FAILURE : EXIT_SUCCESS);
5307 }
5308 }
5309
8f03de53 5310 r = wait_for_container(TAKE_PID(*pid), &container_status);
b0067625 5311
0bb0a9fa
ZJS
5312 /* Tell machined that we are gone. */
5313 if (bus)
5314 (void) unregister_machine(bus, arg_machine);
5315
b0067625
ZJS
5316 if (r < 0)
5317 /* We failed to wait for the container, or the container exited abnormally. */
5318 return r;
5319 if (r > 0 || container_status == CONTAINER_TERMINATED) {
27e29a1e
ZJS
5320 /* r > 0 → The container exited with a non-zero status.
5321 * As a special case, we need to replace 133 with a different value,
5322 * because 133 is special-cased in the service file to reboot the container.
5323 * otherwise → The container exited with zero status and a reboot was not requested.
5324 */
2a49b612 5325 if (r == EXIT_FORCE_RESTART)
27e29a1e 5326 r = EXIT_FAILURE; /* replace 133 with the general failure code */
b0067625 5327 *ret = r;
b0067625
ZJS
5328 return 0; /* finito */
5329 }
5330
5331 /* CONTAINER_REBOOTED, loop again */
5332
5333 if (arg_keep_unit) {
5334 /* Special handling if we are running as a service: instead of simply
5335 * restarting the machine we want to restart the entire service, so let's
5336 * inform systemd about this with the special exit code 133. The service
5337 * file uses RestartForceExitStatus=133 so that this results in a full
5338 * nspawn restart. This is necessary since we might have cgroup parameters
5339 * set we want to have flushed out. */
2a49b612
ZJS
5340 *ret = EXIT_FORCE_RESTART;
5341 return 0; /* finito */
b0067625
ZJS
5342 }
5343
deff68e7
FW
5344 expose_port_flush(&expose_args->fw_ctx, arg_expose_ports, AF_INET, &expose_args->address4);
5345 expose_port_flush(&expose_args->fw_ctx, arg_expose_ports, AF_INET6, &expose_args->address6);
b0067625
ZJS
5346
5347 (void) remove_veth_links(veth_name, arg_network_veth_extra);
5348 *veth_created = false;
5349 return 1; /* loop again */
5350}
5351
bf428efb 5352static int initialize_rlimits(void) {
852b6250 5353 /* The default resource limits the kernel passes to PID 1, as per kernel 5.16. Let's pass our container payload
bf428efb
LP
5354 * the same values as the kernel originally passed to PID 1, in order to minimize differences between host and
5355 * container execution environments. */
5356
5357 static const struct rlimit kernel_defaults[_RLIMIT_MAX] = {
852b6250
LP
5358 [RLIMIT_AS] = { RLIM_INFINITY, RLIM_INFINITY },
5359 [RLIMIT_CORE] = { 0, RLIM_INFINITY },
5360 [RLIMIT_CPU] = { RLIM_INFINITY, RLIM_INFINITY },
5361 [RLIMIT_DATA] = { RLIM_INFINITY, RLIM_INFINITY },
5362 [RLIMIT_FSIZE] = { RLIM_INFINITY, RLIM_INFINITY },
5363 [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY },
5364 [RLIMIT_MEMLOCK] = { DEFAULT_RLIMIT_MEMLOCK, DEFAULT_RLIMIT_MEMLOCK },
5365 [RLIMIT_MSGQUEUE] = { 819200, 819200 },
5366 [RLIMIT_NICE] = { 0, 0 },
5367 [RLIMIT_NOFILE] = { 1024, 4096 },
5368 [RLIMIT_RSS] = { RLIM_INFINITY, RLIM_INFINITY },
5369 [RLIMIT_RTPRIO] = { 0, 0 },
5370 [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY },
5371 [RLIMIT_STACK] = { 8388608, RLIM_INFINITY },
bf428efb
LP
5372
5373 /* The kernel scales the default for RLIMIT_NPROC and RLIMIT_SIGPENDING based on the system's amount of
5374 * RAM. To provide best compatibility we'll read these limits off PID 1 instead of hardcoding them
5375 * here. This is safe as we know that PID 1 doesn't change these two limits and thus the original
5376 * kernel's initialization should still be valid during runtime — at least if PID 1 is systemd. Note
5377 * that PID 1 changes a number of other resource limits during early initialization which is why we
5378 * don't read the other limits from PID 1 but prefer the static table above. */
5379 };
5380
5381 int rl;
5382
5383 for (rl = 0; rl < _RLIMIT_MAX; rl++) {
bf428efb
LP
5384 /* Let's only fill in what the user hasn't explicitly configured anyway */
5385 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)) == 0) {
5386 const struct rlimit *v;
5387 struct rlimit buffer;
5388
5389 if (IN_SET(rl, RLIMIT_NPROC, RLIMIT_SIGPENDING)) {
5390 /* For these two let's read the limits off PID 1. See above for an explanation. */
5391
5392 if (prlimit(1, rl, NULL, &buffer) < 0)
5393 return log_error_errno(errno, "Failed to read resource limit RLIMIT_%s of PID 1: %m", rlimit_to_string(rl));
5394
dbf1aca6
LP
5395 v = &buffer;
5396 } else if (rl == RLIMIT_NOFILE) {
5397 /* We nowadays bump RLIMIT_NOFILE's hard limit early in PID 1 for all
5398 * userspace. Given that nspawn containers are often run without our PID 1,
5399 * let's grant the containers a raised RLIMIT_NOFILE hard limit by default,
5400 * so that container userspace gets similar resources as host userspace
5401 * gets. */
5402 buffer = kernel_defaults[rl];
5403 buffer.rlim_max = MIN((rlim_t) read_nr_open(), (rlim_t) HIGH_RLIMIT_NOFILE);
bf428efb
LP
5404 v = &buffer;
5405 } else
5406 v = kernel_defaults + rl;
5407
5408 arg_rlimit[rl] = newdup(struct rlimit, v, 1);
5409 if (!arg_rlimit[rl])
5410 return log_oom();
5411 }
5412
5413 if (DEBUG_LOGGING) {
5414 _cleanup_free_ char *k = NULL;
5415
5416 (void) rlimit_format(arg_rlimit[rl], &k);
5417 log_debug("Setting RLIMIT_%s to %s.", rlimit_to_string(rl), k);
5418 }
5419 }
5420
5421 return 0;
5422}
5423
287b7376 5424static int cant_be_in_netns(void) {
254d1313 5425 _cleanup_close_ int fd = -EBADF;
287b7376
LP
5426 struct ucred ucred;
5427 int r;
5428
5429 /* Check if we are in the same netns as udev. If we aren't, then device monitoring (and thus waiting
5430 * for loopback block devices) won't work, and we will hang. Detect this case and exit early with a
5431 * nice message. */
5432
5433 if (!arg_image) /* only matters if --image= us used, i.e. we actually need to use loopback devices */
5434 return 0;
5435
5436 fd = socket(AF_UNIX, SOCK_SEQPACKET|SOCK_NONBLOCK|SOCK_CLOEXEC, 0);
5437 if (fd < 0)
5438 return log_error_errno(errno, "Failed to allocate udev control socket: %m");
5439
1861986a 5440 r = connect_unix_path(fd, AT_FDCWD, "/run/udev/control");
bb44fd07
ZJS
5441 if (r == -ENOENT || ERRNO_IS_NEG_DISCONNECT(r))
5442 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
5443 "Sorry, but --image= requires access to the host's /run/ hierarchy, since we need access to udev.");
5444 if (r < 0)
1861986a 5445 return log_error_errno(r, "Failed to connect socket to udev control socket: %m");
287b7376
LP
5446
5447 r = getpeercred(fd, &ucred);
5448 if (r < 0)
5449 return log_error_errno(r, "Failed to determine peer of udev control socket: %m");
5450
f7a2dc3d 5451 r = in_same_namespace(ucred.pid, 0, NAMESPACE_NET);
287b7376 5452 if (r < 0)
f7a2dc3d
CB
5453 return log_error_errno(r, "Failed to determine network namespace of udev: %m");
5454 if (r == 0)
287b7376
LP
5455 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
5456 "Sorry, but --image= is only supported in the main network namespace, since we need access to udev/AF_NETLINK.");
5457 return 0;
5458}
5459
44dbef90 5460static int run(int argc, char *argv[]) {
4c27749b 5461 bool remove_directory = false, remove_image = false, veth_created = false, remove_tmprootdir = false;
5bb1d7fb 5462 _cleanup_close_ int master = -EBADF;
03cfe0d5 5463 _cleanup_fdset_free_ FDSet *fds = NULL;
2d845785 5464 int r, n_fd_passed, ret = EXIT_SUCCESS;
5aa3eba5 5465 char veth_name[IFNAMSIZ] = "";
761cf19d 5466 struct ExposeArgs expose_args = {};
8e766630 5467 _cleanup_(release_lock_file) LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
c67b0082 5468 char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
2d845785 5469 _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
18b5886e 5470 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
761cf19d 5471 _cleanup_(fw_ctx_freep) FirewallContext *fw_ctx = NULL;
7bf011e3 5472 pid_t pid = 0;
03cfe0d5
LP
5473
5474 log_parse_environment();
5475 log_open();
415fc41c 5476
03cfe0d5
LP
5477 r = parse_argv(argc, argv);
5478 if (r <= 0)
5479 goto finish;
5480
38ee19c0
ZJS
5481 if (geteuid() != 0) {
5482 r = log_warning_errno(SYNTHETIC_ERRNO(EPERM),
5483 argc >= 2 ? "Need to be root." :
5484 "Need to be root (and some arguments are usually required).\nHint: try --help");
03cfe0d5 5485 goto finish;
38ee19c0 5486 }
fba868fa 5487
287b7376
LP
5488 r = cant_be_in_netns();
5489 if (r < 0)
5490 goto finish;
5491
bf428efb
LP
5492 r = initialize_rlimits();
5493 if (r < 0)
5494 goto finish;
5495
de40a303
LP
5496 r = load_oci_bundle();
5497 if (r < 0)
5498 goto finish;
5499
f757855e
LP
5500 r = determine_names();
5501 if (r < 0)
5502 goto finish;
5503
5504 r = load_settings();
5505 if (r < 0)
5506 goto finish;
5507
d4d99bc6 5508 r = cg_unified();
5eee8290
LP
5509 if (r < 0) {
5510 log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
5511 goto finish;
5512 }
5513
f757855e
LP
5514 r = verify_arguments();
5515 if (r < 0)
5516 goto finish;
03cfe0d5 5517
2f091b1b
TM
5518 r = verify_network_interfaces_initialized();
5519 if (r < 0)
5520 goto finish;
5521
49048684
ZJS
5522 /* Reapply environment settings. */
5523 (void) detect_unified_cgroup_hierarchy_from_environment();
8199d554 5524
2949ff26
LP
5525 /* Ignore SIGPIPE here, because we use splice() on the ptyfwd stuff and that will generate SIGPIPE if
5526 * the result is closed. Note that the container payload child will reset signal mask+handler anyway,
5527 * so just turning this off here means we only turn it off in nspawn itself, not any children. */
9c274488 5528 (void) ignore_signals(SIGPIPE);
2949ff26 5529
03cfe0d5
LP
5530 n_fd_passed = sd_listen_fds(false);
5531 if (n_fd_passed > 0) {
5532 r = fdset_new_listen_fds(&fds, false);
5533 if (r < 0) {
5534 log_error_errno(r, "Failed to collect file descriptors: %m");
5535 goto finish;
5536 }
5537 }
5538
83e803a9
ZJS
5539 /* The "default" umask. This is appropriate for most file and directory
5540 * operations performed by nspawn, and is the umask that will be used for
5541 * the child. Functions like copy_devnodes() change the umask temporarily. */
5542 umask(0022);
5543
03cfe0d5
LP
5544 if (arg_directory) {
5545 assert(!arg_image);
5546
b35ca61a
LP
5547 /* Safety precaution: let's not allow running images from the live host OS image, as long as
5548 * /var from the host will propagate into container dynamically (because bad things happen if
5549 * two systems write to the same /var). Let's allow it for the special cases where /var is
5550 * either copied (i.e. --ephemeral) or replaced (i.e. --volatile=yes|state). */
5551 if (path_equal(arg_directory, "/") && !(arg_ephemeral || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_STATE))) {
1406bd66
LP
5552 r = log_error_errno(SYNTHETIC_ERRNO(EINVAL),
5553 "Spawning container on root directory is not supported. Consider using --ephemeral, --volatile=yes or --volatile=state.");
03cfe0d5
LP
5554 goto finish;
5555 }
5556
5557 if (arg_ephemeral) {
5558 _cleanup_free_ char *np = NULL;
5559
f461a28d 5560 r = chase_and_update(&arg_directory, 0);
3f342ec4
LP
5561 if (r < 0)
5562 goto finish;
5563
7bf011e3
LP
5564 /* If the specified path is a mount point we generate the new snapshot immediately
5565 * inside it under a random name. However if the specified is not a mount point we
5566 * create the new snapshot in the parent directory, just next to it. */
e1873695 5567 r = path_is_mount_point(arg_directory, NULL, 0);
03cfe0d5
LP
5568 if (r < 0) {
5569 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
5570 goto finish;
5571 }
5572 if (r > 0)
770b5ce4 5573 r = tempfn_random_child(arg_directory, "machine.", &np);
03cfe0d5 5574 else
770b5ce4 5575 r = tempfn_random(arg_directory, "machine.", &np);
03cfe0d5 5576 if (r < 0) {
0f3be6ca 5577 log_error_errno(r, "Failed to generate name for directory snapshot: %m");
03cfe0d5
LP
5578 goto finish;
5579 }
5580
6992459c 5581 /* We take an exclusive lock on this image, since it's our private, ephemeral copy
162392b7 5582 * only owned by us and no one else. */
6992459c 5583 r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
03cfe0d5
LP
5584 if (r < 0) {
5585 log_error_errno(r, "Failed to lock %s: %m", np);
5586 goto finish;
5587 }
5588
7bf011e3
LP
5589 {
5590 BLOCK_SIGNALS(SIGINT);
fab4ef72
DDM
5591 r = btrfs_subvol_snapshot_at(AT_FDCWD, arg_directory, AT_FDCWD, np,
5592 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
5593 BTRFS_SNAPSHOT_FALLBACK_COPY |
5594 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
5595 BTRFS_SNAPSHOT_RECURSIVE |
5596 BTRFS_SNAPSHOT_QUOTA |
5597 BTRFS_SNAPSHOT_SIGINT);
7bf011e3
LP
5598 }
5599 if (r == -EINTR) {
5600 log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", np);
5601 goto finish;
5602 }
03cfe0d5
LP
5603 if (r < 0) {
5604 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
5605 goto finish;
ec16945e
LP
5606 }
5607
1cc6c93a 5608 free_and_replace(arg_directory, np);
17cbb288 5609 remove_directory = true;
30535c16 5610 } else {
f461a28d 5611 r = chase_and_update(&arg_directory, arg_template ? CHASE_NONEXISTENT : 0);
8d4aa2bb
LP
5612 if (r < 0)
5613 goto finish;
5614
30535c16
LP
5615 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
5616 if (r == -EBUSY) {
5617 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
5618 goto finish;
5619 }
5620 if (r < 0) {
5621 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
476b8254 5622 goto finish;
30535c16
LP
5623 }
5624
5625 if (arg_template) {
f461a28d 5626 r = chase_and_update(&arg_template, 0);
3f342ec4
LP
5627 if (r < 0)
5628 goto finish;
5629
7bf011e3
LP
5630 {
5631 BLOCK_SIGNALS(SIGINT);
fab4ef72
DDM
5632 r = btrfs_subvol_snapshot_at(AT_FDCWD, arg_template, AT_FDCWD, arg_directory,
5633 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
5634 BTRFS_SNAPSHOT_FALLBACK_COPY |
5635 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
5636 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
5637 BTRFS_SNAPSHOT_RECURSIVE |
5638 BTRFS_SNAPSHOT_QUOTA |
5639 BTRFS_SNAPSHOT_SIGINT);
7bf011e3 5640 }
ff6c6cc1
LP
5641 if (r == -EEXIST)
5642 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
5643 "Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
7bf011e3
LP
5644 else if (r == -EINTR) {
5645 log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", arg_directory);
5646 goto finish;
5647 } else if (r < 0) {
83521414 5648 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
30535c16 5649 goto finish;
ff6c6cc1
LP
5650 } else
5651 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
5652 "Populated %s from template %s.", arg_directory, arg_template);
30535c16 5653 }
ec16945e
LP
5654 }
5655
7732f92b 5656 if (arg_start_mode == START_BOOT) {
aff7ae0d 5657 _cleanup_free_ char *b = NULL;
a5201ed6 5658 const char *p;
c9fe05e0 5659
aff7ae0d
LP
5660 if (arg_pivot_root_new) {
5661 b = path_join(arg_directory, arg_pivot_root_new);
5662 if (!b)
5663 return log_oom();
5664
5665 p = b;
5666 } else
a5201ed6 5667 p = arg_directory;
c9fe05e0
AR
5668
5669 if (path_is_os_tree(p) <= 0) {
aff7ae0d
LP
5670 r = log_error_errno(SYNTHETIC_ERRNO(EINVAL),
5671 "Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", p);
1b9e5b12
LP
5672 goto finish;
5673 }
5674 } else {
aff7ae0d 5675 _cleanup_free_ char *p = NULL;
c9fe05e0 5676
a5201ed6 5677 if (arg_pivot_root_new)
aff7ae0d 5678 p = path_join(arg_directory, arg_pivot_root_new, "/usr/");
a5201ed6 5679 else
aff7ae0d
LP
5680 p = path_join(arg_directory, "/usr/");
5681 if (!p)
5682 return log_oom();
1b9e5b12 5683
aff7ae0d
LP
5684 if (laccess(p, F_OK) < 0) {
5685 r = log_error_errno(SYNTHETIC_ERRNO(EINVAL),
5686 "Directory %s doesn't look like it has an OS tree (/usr/ directory is missing). Refusing.", arg_directory);
1b9e5b12 5687 goto finish;
1b9e5b12
LP
5688 }
5689 }
ec16945e 5690
6b9132a9 5691 } else {
d04faa4e 5692 DissectImageFlags dissect_image_flags =
4b5de5dd 5693 DISSECT_IMAGE_GENERIC_ROOT |
d04faa4e
LP
5694 DISSECT_IMAGE_REQUIRE_ROOT |
5695 DISSECT_IMAGE_RELAX_VAR_CHECK |
73d88b80
LP
5696 DISSECT_IMAGE_USR_NO_ROOT |
5697 DISSECT_IMAGE_ADD_PARTITION_DEVICES |
5698 DISSECT_IMAGE_PIN_PARTITION_DEVICES;
ec16945e
LP
5699 assert(arg_image);
5700 assert(!arg_template);
5701
f461a28d 5702 r = chase_and_update(&arg_image, 0);
3f342ec4
LP
5703 if (r < 0)
5704 goto finish;
5705
0f3be6ca
LP
5706 if (arg_ephemeral) {
5707 _cleanup_free_ char *np = NULL;
5708
5709 r = tempfn_random(arg_image, "machine.", &np);
5710 if (r < 0) {
5711 log_error_errno(r, "Failed to generate name for image snapshot: %m");
5712 goto finish;
5713 }
5714
6992459c
LP
5715 /* Always take an exclusive lock on our own ephemeral copy. */
5716 r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
0f3be6ca
LP
5717 if (r < 0) {
5718 r = log_error_errno(r, "Failed to create image lock: %m");
5719 goto finish;
5720 }
5721
7bf011e3
LP
5722 {
5723 BLOCK_SIGNALS(SIGINT);
7c2f5495
DDM
5724 r = copy_file_full(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600,
5725 FS_NOCOW_FL, FS_NOCOW_FL,
5726 COPY_REFLINK|COPY_CRTIME|COPY_SIGINT,
5727 NULL, NULL);
7bf011e3
LP
5728 }
5729 if (r == -EINTR) {
5730 log_error_errno(r, "Interrupted while copying image file to %s, removed again.", np);
5731 goto finish;
5732 }
0f3be6ca
LP
5733 if (r < 0) {
5734 r = log_error_errno(r, "Failed to copy image file: %m");
5735 goto finish;
5736 }
5737
1cc6c93a 5738 free_and_replace(arg_image, np);
0f3be6ca
LP
5739 remove_image = true;
5740 } else {
5741 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
5742 if (r == -EBUSY) {
5743 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
5744 goto finish;
5745 }
5746 if (r < 0) {
5747 r = log_error_errno(r, "Failed to create image lock: %m");
5748 goto finish;
5749 }
4623e8e6 5750
89e62e0b
LP
5751 r = verity_settings_load(
5752 &arg_verity_settings,
5753 arg_image, NULL, NULL);
e7cbe5cb
LB
5754 if (r < 0) {
5755 log_error_errno(r, "Failed to read verity artefacts for %s: %m", arg_image);
5756 goto finish;
78ebe980 5757 }
89e62e0b
LP
5758
5759 if (arg_verity_settings.data_path)
5760 dissect_image_flags |= DISSECT_IMAGE_NO_PARTITION_TABLE;
30535c16
LP
5761 }
5762
c67b0082 5763 if (!mkdtemp(tmprootdir)) {
0f3be6ca 5764 r = log_error_errno(errno, "Failed to create temporary directory: %m");
6b9132a9 5765 goto finish;
1b9e5b12 5766 }
6b9132a9 5767
c67b0082
LP
5768 remove_tmprootdir = true;
5769
5770 arg_directory = strdup(tmprootdir);
1b9e5b12
LP
5771 if (!arg_directory) {
5772 r = log_oom();
5773 goto finish;
6b9132a9 5774 }
88213476 5775
89e62e0b
LP
5776 r = loop_device_make_by_path(
5777 arg_image,
5778 arg_read_only ? O_RDONLY : O_RDWR,
22ee78a8 5779 /* sector_size= */ UINT32_MAX,
89e62e0b 5780 FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN,
7f52206a 5781 LOCK_SH,
89e62e0b 5782 &loop);
2d845785
LP
5783 if (r < 0) {
5784 log_error_errno(r, "Failed to set up loopback block device: %m");
842f3b0f
LP
5785 goto finish;
5786 }
1b9e5b12 5787
bad31660 5788 r = dissect_loop_device_and_warn(
bad31660 5789 loop,
89e62e0b 5790 &arg_verity_settings,
84be0c71
LP
5791 /* mount_options=*/ NULL,
5792 arg_image_policy ?: &image_policy_container,
e7cbe5cb 5793 dissect_image_flags,
e0f9e7bd 5794 &dissected_image);
2d845785 5795 if (r == -ENOPKG) {
4526113f 5796 /* dissected_image_and_warn() already printed a brief error message. Extend on that with more details */
2d845785
LP
5797 log_notice("Note that the disk image needs to\n"
5798 " a) either contain only a single MBR partition of type 0x83 that is marked bootable\n"
5799 " b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n"
db811444 5800 " c) or follow https://uapi-group.org/specifications/specs/discoverable_partitions_specification\n"
2d845785
LP
5801 " d) or contain a file system without a partition table\n"
5802 "in order to be bootable with systemd-nspawn.");
1b9e5b12 5803 goto finish;
2d845785 5804 }
4526113f 5805 if (r < 0)
842f3b0f 5806 goto finish;
1b9e5b12 5807
88b3300f
LP
5808 r = dissected_image_load_verity_sig_partition(
5809 dissected_image,
5810 loop->fd,
5811 &arg_verity_settings);
5812 if (r < 0)
5813 goto finish;
5814
8ee9615e
LP
5815 if (dissected_image->has_verity && !arg_verity_settings.root_hash && !dissected_image->has_verity_sig)
5816 log_notice("Note: image %s contains verity information, but no root hash specified and no embedded "
5817 "root hash signature found! Proceeding without integrity checking.", arg_image);
4623e8e6 5818
89e62e0b
LP
5819 r = dissected_image_decrypt_interactively(
5820 dissected_image,
5821 NULL,
5822 &arg_verity_settings,
e330f97a 5823 0);
1b9e5b12
LP
5824 if (r < 0)
5825 goto finish;
0f3be6ca
LP
5826
5827 /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */
5828 if (remove_image && unlink(arg_image) >= 0)
5829 remove_image = false;
4c27749b
LP
5830
5831 if (arg_architecture < 0)
5832 arg_architecture = dissected_image_architecture(dissected_image);
842f3b0f 5833 }
842f3b0f 5834
86c0dd4a 5835 r = custom_mount_prepare_all(arg_directory, arg_custom_mounts, arg_n_custom_mounts);
5a8af538
LP
5836 if (r < 0)
5837 goto finish;
5838
de40a303
LP
5839 if (arg_console_mode < 0)
5840 arg_console_mode =
5841 isatty(STDIN_FILENO) > 0 &&
5842 isatty(STDOUT_FILENO) > 0 ? CONSOLE_INTERACTIVE : CONSOLE_READ_ONLY;
9c857b9d 5843
de40a303
LP
5844 if (arg_console_mode == CONSOLE_PIPE) /* if we pass STDERR on to the container, don't add our own logs into it too */
5845 arg_quiet = true;
a258bf26 5846
9c857b9d 5847 if (!arg_quiet)
c85c2f79 5848 log_info("Spawning container %s on %s.\nPress Ctrl-] three times within 1s to kill container.",
9c857b9d
LP
5849 arg_machine, arg_image ?: arg_directory);
5850
988851b6 5851 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, SIGRTMIN+18, -1) >= 0);
a258bf26 5852
8c3fe1b5
LP
5853 r = make_reaper_process(true);
5854 if (r < 0) {
5855 log_error_errno(r, "Failed to become subreaper: %m");
03cfe0d5
LP
5856 goto finish;
5857 }
5858
761cf19d
FW
5859 if (arg_expose_ports) {
5860 r = fw_ctx_new(&fw_ctx);
5861 if (r < 0) {
5862 log_error_errno(r, "Cannot expose configured ports, firewall initialization failed: %m");
5863 goto finish;
5864 }
5865 expose_args.fw_ctx = fw_ctx;
5866 }
d87be9b0 5867 for (;;) {
3acc84eb 5868 r = run_container(dissected_image,
44dbef90
LP
5869 fds,
5870 veth_name, &veth_created,
761cf19d 5871 &expose_args, &master,
44dbef90 5872 &pid, &ret);
b0067625 5873 if (r <= 0)
d87be9b0 5874 break;
d87be9b0 5875 }
88213476
LP
5876
5877finish:
04f590a4
LP
5878 (void) sd_notify(false,
5879 r == 0 && ret == EXIT_FORCE_RESTART ? "STOPPING=1\nSTATUS=Restarting..." :
5880 "STOPPING=1\nSTATUS=Terminating...");
af4ec430 5881
9444b1f2 5882 if (pid > 0)
c67b0082 5883 (void) kill(pid, SIGKILL);
88213476 5884
503546da 5885 /* Try to flush whatever is still queued in the pty */
6a0f896b 5886 if (master >= 0) {
f5fbe71d 5887 (void) copy_bytes(master, STDOUT_FILENO, UINT64_MAX, 0);
6a0f896b
LP
5888 master = safe_close(master);
5889 }
5890
5891 if (pid > 0)
5892 (void) wait_for_terminate(pid, NULL);
503546da 5893
50ebcf6c
LP
5894 pager_close();
5895
17cbb288 5896 if (remove_directory && arg_directory) {
ec16945e
LP
5897 int k;
5898
17cbb288 5899 k = rm_rf(arg_directory, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
ec16945e 5900 if (k < 0)
17cbb288 5901 log_warning_errno(k, "Cannot remove '%s', ignoring: %m", arg_directory);
ec16945e
LP
5902 }
5903
0f3be6ca
LP
5904 if (remove_image && arg_image) {
5905 if (unlink(arg_image) < 0)
5906 log_warning_errno(errno, "Can't remove image file '%s', ignoring: %m", arg_image);
5907 }
5908
c67b0082
LP
5909 if (remove_tmprootdir) {
5910 if (rmdir(tmprootdir) < 0)
5911 log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir);
5912 }
5913
785890ac
LP
5914 if (arg_machine) {
5915 const char *p;
5916
63c372cb 5917 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
c6878637 5918 (void) rm_rf(p, REMOVE_ROOT);
785890ac
LP
5919 }
5920
deff68e7
FW
5921 expose_port_flush(&fw_ctx, arg_expose_ports, AF_INET, &expose_args.address4);
5922 expose_port_flush(&fw_ctx, arg_expose_ports, AF_INET6, &expose_args.address6);
7513c5b8
LP
5923
5924 if (veth_created)
5925 (void) remove_veth_links(veth_name, arg_network_veth_extra);
22b28dfd 5926 (void) remove_bridge(arg_network_zone);
f757855e 5927
f757855e
LP
5928 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
5929 expose_port_free_all(arg_expose_ports);
bf428efb 5930 rlimit_free_all(arg_rlimit);
b2645747 5931 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
3652872a 5932 credential_free_all(arg_credentials, arg_n_credentials);
6d0b55c2 5933
44dbef90
LP
5934 if (r < 0)
5935 return r;
5936
5937 return ret;
88213476 5938}
44dbef90
LP
5939
/* Program entry point: passes run() to the DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE() macro, which is
 * defined outside this file. run() returns a negative error value when r < 0 and otherwise returns 'ret'.
 * NOTE(review): presumably the macro generates main() and treats a positive return value from run() as a
 * failure exit status — confirm against the macro's definition. */
5940DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run);