]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
nspawn: file system namespace -> mount namespace
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
db9ecf05 1/* SPDX-License-Identifier: LGPL-2.1-or-later */
88213476 2
349cc4a5 3#if HAVE_BLKID
8fe0087e 4#endif
88213476 5#include <errno.h>
88213476 6#include <getopt.h>
503f480f 7#include <linux/fs.h>
1b9e5b12 8#include <linux/loop.h>
349cc4a5 9#if HAVE_SELINUX
8fe0087e 10#include <selinux/selinux.h>
1b9e5b12 11#endif
8fe0087e 12#include <stdlib.h>
8fe0087e 13#include <sys/file.h>
335d2ead 14#include <sys/ioctl.h>
8fe0087e
LP
15#include <sys/personality.h>
16#include <sys/prctl.h>
17#include <sys/types.h>
6916b164 18#include <sys/wait.h>
335d2ead 19#include <termios.h>
8fe0087e 20#include <unistd.h>
1b9e5b12 21
b053cd5f 22#include "sd-bus.h"
1f0cd86b 23#include "sd-daemon.h"
1f0cd86b 24#include "sd-id128.h"
8fe0087e 25
b5efdb8a 26#include "alloc-util.h"
8fe0087e
LP
27#include "barrier.h"
28#include "base-filesystem.h"
29#include "blkid-util.h"
30#include "btrfs-util.h"
d6b4d1c7 31#include "build.h"
b8ea7a6e 32#include "bus-error.h"
b053cd5f 33#include "bus-util.h"
8fe0087e 34#include "cap-list.h"
430f0182 35#include "capability-util.h"
04d391da 36#include "cgroup-util.h"
f461a28d 37#include "chase.h"
988851b6 38#include "common-signal.h"
8fe0087e 39#include "copy.h"
d107bb7d 40#include "cpu-set-util.h"
786d19fd 41#include "creds-util.h"
4fc9982c 42#include "dev-setup.h"
57f1b61b 43#include "discover-image.h"
2d845785 44#include "dissect-image.h"
8fe0087e 45#include "env-util.h"
3652872a 46#include "escape.h"
3ffd4af2 47#include "fd-util.h"
842f3b0f 48#include "fdset.h"
a5c32cff 49#include "fileio.h"
f97b34a6 50#include "format-util.h"
f4f15635 51#include "fs-util.h"
1b9e5b12 52#include "gpt.h"
4623e8e6 53#include "hexdecoct.h"
e2054217 54#include "hostname-setup.h"
8fe0087e 55#include "hostname-util.h"
910fd145 56#include "id128-util.h"
3652872a 57#include "io-util.h"
8fe0087e 58#include "log.h"
2d845785 59#include "loop-util.h"
8fe0087e 60#include "loopback-setup.h"
8fe0087e 61#include "macro.h"
44dbef90 62#include "main-func.h"
f5947a5e 63#include "missing_sched.h"
8fe0087e 64#include "mkdir.h"
4349cd7c 65#include "mount-util.h"
049af8ad 66#include "mountpoint-util.h"
0cb8e3d1 67#include "namespace-util.h"
8fe0087e 68#include "netlink-util.h"
2f893044 69#include "nspawn-bind-user.h"
07630cea 70#include "nspawn-cgroup.h"
3652872a 71#include "nspawn-creds.h"
3603efde 72#include "nspawn-def.h"
07630cea
LP
73#include "nspawn-expose-ports.h"
74#include "nspawn-mount.h"
75#include "nspawn-network.h"
de40a303 76#include "nspawn-oci.h"
7336138e 77#include "nspawn-patch-uid.h"
07630cea 78#include "nspawn-register.h"
910fd145 79#include "nspawn-seccomp.h"
07630cea
LP
80#include "nspawn-settings.h"
81#include "nspawn-setuid.h"
7732f92b 82#include "nspawn-stub-pid1.h"
c9394f4f 83#include "nspawn-util.h"
91181e07 84#include "nspawn.h"
d8b4d14d 85#include "nulstr-util.h"
d58ad743 86#include "os-util.h"
50ebcf6c 87#include "pager.h"
614b022c 88#include "parse-argument.h"
6bedfcbb 89#include "parse-util.h"
294bf0c3 90#include "pretty-print.h"
0b452006 91#include "process-util.h"
8fe0087e
LP
92#include "ptyfwd.h"
93#include "random-util.h"
8869a0b4 94#include "raw-clone.h"
86775e35 95#include "resolve-util.h"
bf428efb 96#include "rlimit-util.h"
8fe0087e 97#include "rm-rf.h"
de40a303
LP
98#if HAVE_SECCOMP
99#include "seccomp-util.h"
100#endif
68b02049 101#include "selinux-util.h"
8fe0087e 102#include "signal-util.h"
2583fbea 103#include "socket-util.h"
8fcde012 104#include "stat-util.h"
15a5e950 105#include "stdio-util.h"
5c828e66 106#include "string-table.h"
07630cea 107#include "string-util.h"
8fe0087e 108#include "strv.h"
de40a303 109#include "sysctl-util.h"
8fe0087e 110#include "terminal-util.h"
e4de7287 111#include "tmpfile-util.h"
affb60b1 112#include "umask-util.h"
43c3fb46 113#include "unit-name.h"
b1d4f8e1 114#include "user-util.h"
e9642be2 115
e96ceaba
LP
/* Path of the notify socket inside the container, which the payload can use to talk to nspawn using the
 * sd_notify(3) protocol. */
#define NSPAWN_NOTIFY_SOCKET_PATH "/run/host/notify"
/* NOTE(review): presumably the in-container directory used to pass mounts in from the host — confirm
 * against the users of this macro. */
#define NSPAWN_MOUNT_TUNNEL "/run/host/incoming"

/* NOTE(review): exit status with a special "restart" meaning — confirm where it is produced/consumed. */
#define EXIT_FORCE_RESTART 133
121
113cea80
DH
/* Outcome of a container run. NOTE(review): judging by the names, this distinguishes a final termination
 * from a reboot request — confirm against the code that consumes it. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,
        CONTAINER_REBOOTED,
} ContainerStatus;
126
/* Process-wide configuration state. Each variable holds its built-in default here; the functions in this
 * file overwrite them while parsing the environment and the command line. */
static char *arg_directory = NULL;
static char *arg_template = NULL;
static char *arg_chdir = NULL;
static char *arg_pivot_root_new = NULL;
static char *arg_pivot_root_old = NULL;
static char *arg_user = NULL;
static uid_t arg_uid = UID_INVALID;
static gid_t arg_gid = GID_INVALID;
static gid_t* arg_supplementary_gids = NULL;
static size_t arg_n_supplementary_gids = 0;
static sd_id128_t arg_uuid = {};
static char *arg_machine = NULL;            /* The name used by the host to refer to this */
static char *arg_hostname = NULL;           /* The name the payload sees by default */
static const char *arg_selinux_context = NULL;
static const char *arg_selinux_apifs_context = NULL;
static char *arg_slice = NULL;
static bool arg_private_network = false;
static bool arg_read_only = false;
static StartMode arg_start_mode = START_PID1;
static bool arg_ephemeral = false;
static LinkJournal arg_link_journal = LINK_AUTO;
static bool arg_link_journal_try = false;
/* Default set of capabilities, expressed as a bit mask indexed by capability number. */
static uint64_t arg_caps_retain =
        (1ULL << CAP_AUDIT_CONTROL) |
        (1ULL << CAP_AUDIT_WRITE) |
        (1ULL << CAP_CHOWN) |
        (1ULL << CAP_DAC_OVERRIDE) |
        (1ULL << CAP_DAC_READ_SEARCH) |
        (1ULL << CAP_FOWNER) |
        (1ULL << CAP_FSETID) |
        (1ULL << CAP_IPC_OWNER) |
        (1ULL << CAP_KILL) |
        (1ULL << CAP_LEASE) |
        (1ULL << CAP_LINUX_IMMUTABLE) |
        (1ULL << CAP_MKNOD) |
        (1ULL << CAP_NET_BIND_SERVICE) |
        (1ULL << CAP_NET_BROADCAST) |
        (1ULL << CAP_NET_RAW) |
        (1ULL << CAP_SETFCAP) |
        (1ULL << CAP_SETGID) |
        (1ULL << CAP_SETPCAP) |
        (1ULL << CAP_SETUID) |
        (1ULL << CAP_SYS_ADMIN) |
        (1ULL << CAP_SYS_BOOT) |
        (1ULL << CAP_SYS_CHROOT) |
        (1ULL << CAP_SYS_NICE) |
        (1ULL << CAP_SYS_PTRACE) |
        (1ULL << CAP_SYS_RESOURCE) |
        (1ULL << CAP_SYS_TTY_CONFIG);
static uint64_t arg_caps_ambient = 0;
static CapabilityQuintet arg_full_capabilities = CAPABILITY_QUINTET_NULL;
static CustomMount *arg_custom_mounts = NULL;
static size_t arg_n_custom_mounts = 0;
static char **arg_setenv = NULL;
static bool arg_quiet = false;
static bool arg_register = true;
static bool arg_keep_unit = false;
static char **arg_network_interfaces = NULL;
static char **arg_network_macvlan = NULL;
static char **arg_network_ipvlan = NULL;
static bool arg_network_veth = false;
static char **arg_network_veth_extra = NULL;
static char *arg_network_bridge = NULL;
static char *arg_network_zone = NULL;
static char *arg_network_namespace_path = NULL;
static PagerFlags arg_pager_flags = 0;
static unsigned long arg_personality = PERSONALITY_INVALID;
static char *arg_image = NULL;
static char *arg_oci_bundle = NULL;
static VolatileMode arg_volatile_mode = VOLATILE_NO;
static ExposePort *arg_expose_ports = NULL;
static char **arg_property = NULL;
static sd_bus_message *arg_property_message = NULL;
static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
static UserNamespaceOwnership arg_userns_ownership = _USER_NAMESPACE_OWNERSHIP_INVALID;
static int arg_kill_signal = 0;
static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN;
/* Bit mask of settings that were configured explicitly (the parsers below OR SETTING_* bits into it). */
static SettingsMask arg_settings_mask = 0;
static int arg_settings_trusted = -1;
static char **arg_parameters = NULL;
static const char *arg_container_service_name = "systemd-nspawn";
static bool arg_notify_ready = false;
static bool arg_use_cgns = true;
/* Namespace types to unshare for the container; cleared bit by bit via the $SYSTEMD_NSPAWN_SHARE_* variables. */
static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_TMPFS_TMP;
static VeritySettings arg_verity_settings = VERITY_SETTINGS_DEFAULT;
static char **arg_syscall_allow_list = NULL;
static char **arg_syscall_deny_list = NULL;
#if HAVE_SECCOMP
static scmp_filter_ctx arg_seccomp = NULL;
#endif
static struct rlimit *arg_rlimit[_RLIMIT_MAX] = {};
static bool arg_no_new_privileges = false;
static int arg_oom_score_adjust = 0;
static bool arg_oom_score_adjust_set = false;
static CPUSet arg_cpu_set = {};
static ResolvConfMode arg_resolv_conf = RESOLV_CONF_AUTO;
static TimezoneMode arg_timezone = TIMEZONE_AUTO;
static unsigned arg_console_width = UINT_MAX, arg_console_height = UINT_MAX;
static DeviceNode* arg_extra_nodes = NULL;
static size_t arg_n_extra_nodes = 0;
static char **arg_sysctl = NULL;
static ConsoleMode arg_console_mode = _CONSOLE_MODE_INVALID;
static Credential *arg_credentials = NULL;
static size_t arg_n_credentials = 0;
static char **arg_bind_user = NULL;
static bool arg_suppress_sync = false;
static char *arg_settings_filename = NULL;
static Architecture arg_architecture = _ARCHITECTURE_INVALID;
static ImagePolicy *arg_image_policy = NULL;
88213476 238
6145bb4f
LP
/* Pair each global above with the function that releases it. NOTE(review): presumably these run when the
 * program exits via the main-func.h machinery — confirm there. */
STATIC_DESTRUCTOR_REGISTER(arg_directory, freep);
STATIC_DESTRUCTOR_REGISTER(arg_template, freep);
STATIC_DESTRUCTOR_REGISTER(arg_chdir, freep);
STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_new, freep);
STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_old, freep);
STATIC_DESTRUCTOR_REGISTER(arg_user, freep);
STATIC_DESTRUCTOR_REGISTER(arg_supplementary_gids, freep);
STATIC_DESTRUCTOR_REGISTER(arg_machine, freep);
STATIC_DESTRUCTOR_REGISTER(arg_hostname, freep);
STATIC_DESTRUCTOR_REGISTER(arg_slice, freep);
STATIC_DESTRUCTOR_REGISTER(arg_setenv, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_interfaces, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_macvlan, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_ipvlan, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_veth_extra, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_bridge, freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_zone, freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_namespace_path, freep);
STATIC_DESTRUCTOR_REGISTER(arg_image, freep);
STATIC_DESTRUCTOR_REGISTER(arg_oci_bundle, freep);
STATIC_DESTRUCTOR_REGISTER(arg_property, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_property_message, sd_bus_message_unrefp);
STATIC_DESTRUCTOR_REGISTER(arg_parameters, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_verity_settings, verity_settings_done);
STATIC_DESTRUCTOR_REGISTER(arg_syscall_allow_list, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_syscall_deny_list, strv_freep);
#if HAVE_SECCOMP
/* arg_seccomp only exists when built with seccomp support, so its destructor is conditional too. */
STATIC_DESTRUCTOR_REGISTER(arg_seccomp, seccomp_releasep);
#endif
STATIC_DESTRUCTOR_REGISTER(arg_cpu_set, cpu_set_reset);
STATIC_DESTRUCTOR_REGISTER(arg_sysctl, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_bind_user, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_settings_filename, freep);
STATIC_DESTRUCTOR_REGISTER(arg_image_policy, image_policy_freep);
6145bb4f 273
dce66ffe
ZJS
274static int handle_arg_console(const char *arg) {
275 if (streq(arg, "help")) {
10e8a60b
LP
276 puts("autopipe\n"
277 "interactive\n"
dce66ffe 278 "passive\n"
10e8a60b
LP
279 "pipe\n"
280 "read-only");
dce66ffe
ZJS
281 return 0;
282 }
283
284 if (streq(arg, "interactive"))
285 arg_console_mode = CONSOLE_INTERACTIVE;
286 else if (streq(arg, "read-only"))
287 arg_console_mode = CONSOLE_READ_ONLY;
288 else if (streq(arg, "passive"))
289 arg_console_mode = CONSOLE_PASSIVE;
554c4beb
LP
290 else if (streq(arg, "pipe")) {
291 if (isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0)
292 log_full(arg_quiet ? LOG_DEBUG : LOG_NOTICE,
293 "Console mode 'pipe' selected, but standard input/output are connected to an interactive TTY. "
294 "Most likely you want to use 'interactive' console mode for proper interactivity and shell job control. "
295 "Proceeding anyway.");
296
dce66ffe 297 arg_console_mode = CONSOLE_PIPE;
10e8a60b
LP
298 } else if (streq(arg, "autopipe")) {
299 if (isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0)
300 arg_console_mode = CONSOLE_INTERACTIVE;
301 else
302 arg_console_mode = CONSOLE_PIPE;
554c4beb 303 } else
dce66ffe
ZJS
304 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown console mode: %s", optarg);
305
306 arg_settings_mask |= SETTING_CONSOLE_MODE;
307 return 1;
308}
309
37ec0fdd
LP
310static int help(void) {
311 _cleanup_free_ char *link = NULL;
312 int r;
313
384c2c32 314 pager_open(arg_pager_flags);
50ebcf6c 315
37ec0fdd
LP
316 r = terminal_urlify_man("systemd-nspawn", "1", &link);
317 if (r < 0)
318 return log_oom();
319
25148653 320 printf("%1$s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
37a92352 321 "%5$sSpawn a command or OS in a light-weight container.%6$s\n\n"
a8828ed9
DW
322 " -h --help Show this help\n"
323 " --version Print version string\n"
69c79d3c 324 " -q --quiet Do not show status information\n"
bb068de0 325 " --no-pager Do not pipe output into a pager\n"
25148653
LP
326 " --settings=BOOLEAN Load additional settings from .nspawn file\n\n"
327 "%3$sImage:%4$s\n"
1b9e5b12 328 " -D --directory=PATH Root directory for the container\n"
ec16945e
LP
329 " --template=PATH Initialize root directory from template directory,\n"
330 " if missing\n"
331 " -x --ephemeral Run container with snapshot of root directory, and\n"
332 " remove it after exit\n"
25e68fd3
LP
333 " -i --image=PATH Root file system disk image (or device node) for\n"
334 " the container\n"
84be0c71 335 " --image-policy=POLICY Specify disk image dissection policy\n"
de40a303 336 " --oci-bundle=PATH OCI bundle directory\n"
25148653
LP
337 " --read-only Mount the root directory read-only\n"
338 " --volatile[=MODE] Run the system in volatile mode\n"
25e68fd3 339 " --root-hash=HASH Specify verity root hash for root disk image\n"
c2923fdc
LB
340 " --root-hash-sig=SIG Specify pkcs7 signature of root hash for verity\n"
341 " as a DER encoded PKCS7, either as a path to a file\n"
342 " or as an ASCII base64 encoded string prefixed by\n"
343 " 'base64:'\n"
e7cbe5cb 344 " --verity-data=PATH Specify hash device for verity\n"
25148653
LP
345 " --pivot-root=PATH[:PATH]\n"
346 " Pivot root to given directory in the container\n\n"
347 "%3$sExecution:%4$s\n"
7732f92b 348 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
a8828ed9 349 " -b --boot Boot up full system (i.e. invoke init)\n"
5f932eb9 350 " --chdir=PATH Set working directory in the container\n"
0d2a0179 351 " -E --setenv=NAME[=VALUE] Pass an environment variable to PID 1\n"
25148653
LP
352 " -u --user=USER Run the command under specified user or UID\n"
353 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
4a4654e0
LP
354 " --notify-ready=BOOLEAN Receive notifications from the child init process\n"
355 " --suppress-sync=BOOLEAN\n"
356 " Suppress any form of disk data synchronization\n\n"
25148653 357 "%3$sSystem Identity:%4$s\n"
a8828ed9 358 " -M --machine=NAME Set the machine name for the container\n"
3a9530e5 359 " --hostname=NAME Override the hostname for the container\n"
25148653
LP
360 " --uuid=UUID Set a specific machine UUID for the container\n\n"
361 "%3$sProperties:%4$s\n"
a8828ed9 362 " -S --slice=SLICE Place the container in the specified slice\n"
f36933fe 363 " --property=NAME=VALUE Set scope unit property\n"
25148653
LP
364 " --register=BOOLEAN Register container as machine\n"
365 " --keep-unit Do not register a scope for the machine, reuse\n"
366 " the service unit nspawn is running in\n\n"
367 "%3$sUser Namespacing:%4$s\n"
b917743d
YW
368 " --private-users=no Run without user namespacing\n"
369 " --private-users=yes|pick|identity\n"
370 " Run within user namespace, autoselect UID/GID range\n"
371 " --private-users=UIDBASE[:NUIDS]\n"
90b4a64d 372 " Similar, but with user configured UID/GID range\n"
6c045a99
LP
373 " --private-users-ownership=MODE\n"
374 " Adjust ('chown') or map ('map') OS tree ownership\n"
b917743d
YW
375 " to private UID/GID range\n"
376 " -U Equivalent to --private-users=pick and\n"
377 " --private-users-ownership=auto\n\n"
25148653 378 "%3$sNetworking:%4$s\n"
69c79d3c 379 " --private-network Disable network in container\n"
2f091b1b 380 " --network-interface=HOSTIF[:CONTAINERIF]\n"
69c79d3c
LP
381 " Assign an existing network interface to the\n"
382 " container\n"
2f091b1b 383 " --network-macvlan=HOSTIF[:CONTAINERIF]\n"
c74e630d
LP
384 " Create a macvlan network interface based on an\n"
385 " existing network interface to the container\n"
2f091b1b 386 " --network-ipvlan=HOSTIF[:CONTAINERIF]\n"
387f6955 387 " Create an ipvlan network interface based on an\n"
4bbfe7ad 388 " existing network interface to the container\n"
a8eaaee7 389 " -n --network-veth Add a virtual Ethernet connection between host\n"
69c79d3c 390 " and container\n"
f6d6bad1
LP
391 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
392 " Add an additional virtual Ethernet link between\n"
393 " host and container\n"
ab046dde 394 " --network-bridge=INTERFACE\n"
90b4a64d
ZJS
395 " Add a virtual Ethernet connection to the container\n"
396 " and attach it to an existing bridge on the host\n"
397 " --network-zone=NAME Similar, but attach the new interface to an\n"
398 " an automatically managed bridge interface\n"
d7bea6b6
DP
399 " --network-namespace-path=PATH\n"
400 " Set network namespace to the one represented by\n"
401 " the specified kernel namespace file node\n"
6d0b55c2 402 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
25148653
LP
403 " Expose a container IP port on the host\n\n"
404 "%3$sSecurity:%4$s\n"
a8828ed9
DW
405 " --capability=CAP In addition to the default, retain specified\n"
406 " capability\n"
407 " --drop-capability=CAP Drop the specified capability from the default set\n"
88fc9c9b
TH
408 " --ambient-capability=CAP\n"
409 " Sets the specified capability for the started\n"
410 " process. Not useful if booting a machine.\n"
f4e803c8 411 " --no-new-privileges Set PR_SET_NO_NEW_PRIVS flag for container payload\n"
960e4569
LP
412 " --system-call-filter=LIST|~LIST\n"
413 " Permit/prohibit specific system calls\n"
25148653
LP
414 " -Z --selinux-context=SECLABEL\n"
415 " Set the SELinux security context to be used by\n"
416 " processes in the container\n"
417 " -L --selinux-apifs-context=SECLABEL\n"
418 " Set the SELinux security context to be used by\n"
419 " API/tmpfs file systems in the container\n\n"
420 "%3$sResources:%4$s\n"
bf428efb 421 " --rlimit=NAME=LIMIT Set a resource limit for the payload\n"
81f345df
LP
422 " --oom-score-adjust=VALUE\n"
423 " Adjust the OOM score value for the payload\n"
f4e803c8
LP
424 " --cpu-affinity=CPUS Adjust the CPU affinity of the container\n"
425 " --personality=ARCH Pick personality for this container\n\n"
25148653 426 "%3$sIntegration:%4$s\n"
09d423e9 427 " --resolv-conf=MODE Select mode of /etc/resolv.conf initialization\n"
1688841f 428 " --timezone=MODE Select mode of /etc/localtime initialization\n"
25148653
LP
429 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
430 " host, try-guest, try-host\n"
431 " -j Equivalent to --link-journal=try-guest\n\n"
432 "%3$sMounts:%4$s\n"
5e5bfa6e
EY
433 " --bind=PATH[:PATH[:OPTIONS]]\n"
434 " Bind mount a file or directory from the host into\n"
a8828ed9 435 " the container\n"
5e5bfa6e
EY
436 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
437 " Similar, but creates a read-only bind mount\n"
de40a303
LP
438 " --inaccessible=PATH Over-mount file node with inaccessible node to mask\n"
439 " it\n"
06c17c39 440 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
5a8af538
LP
441 " --overlay=PATH[:PATH...]:PATH\n"
442 " Create an overlay mount from the host to \n"
443 " the container\n"
444 " --overlay-ro=PATH[:PATH...]:PATH\n"
2f893044
LP
445 " Similar, but creates a read-only overlay mount\n"
446 " --bind-user=NAME Bind user from host to container\n\n"
25148653 447 "%3$sInput/Output:%4$s\n"
de40a303
LP
448 " --console=MODE Select how stdin/stdout/stderr and /dev/console are\n"
449 " set up for the container.\n"
3652872a
LP
450 " -P --pipe Equivalent to --console=pipe\n\n"
451 "%3$sCredentials:%4$s\n"
452 " --set-credential=ID:VALUE\n"
453 " Pass a credential with literal value to container.\n"
454 " --load-credential=ID:PATH\n"
455 " Load credential to pass to container from file or\n"
456 " AF_UNIX stream socket.\n"
bc556335
DDM
457 "\nSee the %2$s for details.\n",
458 program_invocation_short_name,
459 link,
460 ansi_underline(),
461 ansi_normal(),
462 ansi_highlight(),
463 ansi_normal());
37ec0fdd
LP
464
465 return 0;
88213476
LP
466}
467
86c0dd4a 468static int custom_mount_check_all(void) {
88614c8a 469 size_t i;
5a8af538 470
5a8af538
LP
471 for (i = 0; i < arg_n_custom_mounts; i++) {
472 CustomMount *m = &arg_custom_mounts[i];
473
0de7acce 474 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
6c045a99 475 if (arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_OFF)
baaa35ad 476 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
c920b863 477 "--private-users-ownership=own may not be combined with custom root mounts.");
6c045a99 478 if (arg_uid_shift == UID_INVALID)
baaa35ad
ZJS
479 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
480 "--private-users with automatic UID shift may not be combined with custom root mounts.");
825d5287 481 }
5a8af538
LP
482 }
483
484 return 0;
485}
486
8199d554 487static int detect_unified_cgroup_hierarchy_from_environment(void) {
c78c095b 488 const char *e, *var = "SYSTEMD_NSPAWN_UNIFIED_HIERARCHY";
415fc41c 489 int r;
5da38d07 490
efdb0237 491 /* Allow the user to control whether the unified hierarchy is used */
c78c095b
ZJS
492
493 e = getenv(var);
494 if (!e) {
d5fc5b2f 495 /* $UNIFIED_CGROUP_HIERARCHY has been renamed to $SYSTEMD_NSPAWN_UNIFIED_HIERARCHY. */
c78c095b
ZJS
496 var = "UNIFIED_CGROUP_HIERARCHY";
497 e = getenv(var);
c78c095b
ZJS
498 }
499
500 if (!isempty(e)) {
efdb0237
LP
501 r = parse_boolean(e);
502 if (r < 0)
c78c095b 503 return log_error_errno(r, "Failed to parse $%s: %m", var);
5da38d07
TH
504 if (r > 0)
505 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
506 else
507 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237
LP
508 }
509
8199d554
LP
510 return 0;
511}
512
513static int detect_unified_cgroup_hierarchy_from_image(const char *directory) {
514 int r;
515
75b0d8b8
ZJS
516 /* Let's inherit the mode to use from the host system, but let's take into consideration what systemd
517 * in the image actually supports. */
b4cccbc1
LP
518 r = cg_all_unified();
519 if (r < 0)
520 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
521 if (r > 0) {
a8725a06
ZJS
522 /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
523 * routine only detects 231, so we'll have a false negative here for 230. */
7e6821ed 524 r = systemd_installation_has_version(directory, "230");
a8725a06
ZJS
525 if (r < 0)
526 return log_error_errno(r, "Failed to determine systemd version in container: %m");
527 if (r > 0)
528 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
529 else
530 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
c22800e4 531 } else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
2977724b 532 /* Mixed cgroup hierarchy support was added in 233 */
7e6821ed 533 r = systemd_installation_has_version(directory, "233");
0fd9563f
ZJS
534 if (r < 0)
535 return log_error_errno(r, "Failed to determine systemd version in container: %m");
536 if (r > 0)
537 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
538 else
539 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
540 } else
5da38d07 541 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237 542
8199d554
LP
543 log_debug("Using %s hierarchy for container.",
544 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_NONE ? "legacy" :
545 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_SYSTEMD ? "hybrid" : "unified");
546
efdb0237
LP
547 return 0;
548}
549
8a99bd0c
ZJS
550static int parse_capability_spec(const char *spec, uint64_t *ret_mask) {
551 uint64_t mask = 0;
552 int r;
553
554 for (;;) {
555 _cleanup_free_ char *t = NULL;
556
557 r = extract_first_word(&spec, &t, ",", 0);
558 if (r < 0)
559 return log_error_errno(r, "Failed to parse capability %s.", t);
560 if (r == 0)
561 break;
562
563 if (streq(t, "help")) {
564 for (int i = 0; i < capability_list_length(); i++) {
565 const char *name;
566
567 name = capability_to_name(i);
568 if (name)
569 puts(name);
570 }
571
572 return 0; /* quit */
573 }
574
575 if (streq(t, "all"))
f5fbe71d 576 mask = UINT64_MAX;
8a99bd0c
ZJS
577 else {
578 r = capability_from_name(t);
579 if (r < 0)
580 return log_error_errno(r, "Failed to parse capability %s.", t);
581
582 mask |= 1ULL << r;
583 }
584 }
585
586 *ret_mask = mask;
587 return 1; /* continue */
588}
589
49048684 590static int parse_share_ns_env(const char *name, unsigned long ns_flag) {
0c582db0
LB
591 int r;
592
593 r = getenv_bool(name);
594 if (r == -ENXIO)
49048684 595 return 0;
0c582db0 596 if (r < 0)
49048684 597 return log_error_errno(r, "Failed to parse $%s: %m", name);
de40a303 598
0c582db0 599 arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
de40a303 600 arg_settings_mask |= SETTING_CLONE_NS_FLAGS;
49048684 601 return 0;
0c582db0
LB
602}
603
49048684 604static int parse_mount_settings_env(void) {
4f086aab 605 const char *e;
1099ceeb
LP
606 int r;
607
608 r = getenv_bool("SYSTEMD_NSPAWN_TMPFS_TMP");
49048684
ZJS
609 if (r < 0 && r != -ENXIO)
610 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_TMPFS_TMP: %m");
1099ceeb
LP
611 if (r >= 0)
612 SET_FLAG(arg_mount_settings, MOUNT_APPLY_TMPFS_TMP, r > 0);
4f086aab
SU
613
614 e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
49048684 615 if (streq_ptr(e, "network"))
4f086aab 616 arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS;
4f086aab 617
49048684
ZJS
618 else if (e) {
619 r = parse_boolean(e);
620 if (r < 0)
621 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_API_VFS_WRITABLE: %m");
622
623 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_RO, r == 0);
624 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_NETNS, false);
ab8ee0f2 625 }
4f086aab 626
49048684 627 return 0;
4f086aab
SU
628}
629
49048684 630static int parse_environment(void) {
d5455d2f
LP
631 const char *e;
632 int r;
633
49048684
ZJS
634 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
635 if (r < 0)
636 return r;
637 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
638 if (r < 0)
639 return r;
640 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
641 if (r < 0)
642 return r;
643 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
644 if (r < 0)
645 return r;
d5455d2f 646
49048684
ZJS
647 r = parse_mount_settings_env();
648 if (r < 0)
649 return r;
d5455d2f 650
489fae52
ZJS
651 /* SYSTEMD_NSPAWN_USE_CGNS=0 can be used to disable CLONE_NEWCGROUP use,
652 * even if it is supported. If not supported, it has no effect. */
de40a303 653 if (!cg_ns_supported())
489fae52 654 arg_use_cgns = false;
de40a303
LP
655 else {
656 r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
657 if (r < 0) {
658 if (r != -ENXIO)
49048684 659 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_USE_CGNS: %m");
de40a303
LP
660
661 arg_use_cgns = true;
662 } else {
663 arg_use_cgns = r > 0;
664 arg_settings_mask |= SETTING_USE_CGNS;
665 }
666 }
d5455d2f
LP
667
668 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
669 if (e)
670 arg_container_service_name = e;
671
4a4654e0
LP
672 r = getenv_bool("SYSTEMD_SUPPRESS_SYNC");
673 if (r >= 0)
674 arg_suppress_sync = r;
675 else if (r != -ENXIO)
676 log_debug_errno(r, "Failed to parse $SYSTEMD_SUPPRESS_SYNC, ignoring: %m");
677
49048684 678 return detect_unified_cgroup_hierarchy_from_environment();
d5455d2f
LP
679}
680
88213476 681static int parse_argv(int argc, char *argv[]) {
a41fe3a2 682 enum {
acbeb427
ZJS
683 ARG_VERSION = 0x100,
684 ARG_PRIVATE_NETWORK,
bc2f673e 685 ARG_UUID,
5076f0cc 686 ARG_READ_ONLY,
57fb9fb5 687 ARG_CAPABILITY,
88fc9c9b 688 ARG_AMBIENT_CAPABILITY,
420c7379 689 ARG_DROP_CAPABILITY,
17fe0523
LP
690 ARG_LINK_JOURNAL,
691 ARG_BIND,
f4889f65 692 ARG_BIND_RO,
06c17c39 693 ARG_TMPFS,
5a8af538
LP
694 ARG_OVERLAY,
695 ARG_OVERLAY_RO,
de40a303 696 ARG_INACCESSIBLE,
eb91eb18 697 ARG_SHARE_SYSTEM,
89f7c846 698 ARG_REGISTER,
aa28aefe 699 ARG_KEEP_UNIT,
69c79d3c 700 ARG_NETWORK_INTERFACE,
c74e630d 701 ARG_NETWORK_MACVLAN,
4bbfe7ad 702 ARG_NETWORK_IPVLAN,
ab046dde 703 ARG_NETWORK_BRIDGE,
22b28dfd 704 ARG_NETWORK_ZONE,
f6d6bad1 705 ARG_NETWORK_VETH_EXTRA,
d7bea6b6 706 ARG_NETWORK_NAMESPACE_PATH,
6afc95b7 707 ARG_PERSONALITY,
4d9f07b4 708 ARG_VOLATILE,
ec16945e 709 ARG_TEMPLATE,
f36933fe 710 ARG_PROPERTY,
6dac160c 711 ARG_PRIVATE_USERS,
c6c8f6e2 712 ARG_KILL_SIGNAL,
f757855e 713 ARG_SETTINGS,
5f932eb9 714 ARG_CHDIR,
b53ede69 715 ARG_PIVOT_ROOT,
7336138e 716 ARG_PRIVATE_USERS_CHOWN,
6c045a99 717 ARG_PRIVATE_USERS_OWNERSHIP,
9c1e04d0 718 ARG_NOTIFY_READY,
4623e8e6 719 ARG_ROOT_HASH,
89e62e0b
LP
720 ARG_ROOT_HASH_SIG,
721 ARG_VERITY_DATA,
960e4569 722 ARG_SYSTEM_CALL_FILTER,
bf428efb 723 ARG_RLIMIT,
3a9530e5 724 ARG_HOSTNAME,
66edd963 725 ARG_NO_NEW_PRIVILEGES,
81f345df 726 ARG_OOM_SCORE_ADJUST,
d107bb7d 727 ARG_CPU_AFFINITY,
09d423e9 728 ARG_RESOLV_CONF,
1688841f 729 ARG_TIMEZONE,
de40a303
LP
730 ARG_CONSOLE,
731 ARG_PIPE,
732 ARG_OCI_BUNDLE,
bb068de0 733 ARG_NO_PAGER,
3652872a
LP
734 ARG_SET_CREDENTIAL,
735 ARG_LOAD_CREDENTIAL,
2f893044 736 ARG_BIND_USER,
4a4654e0 737 ARG_SUPPRESS_SYNC,
84be0c71 738 ARG_IMAGE_POLICY,
a41fe3a2
LP
739 };
740
88213476 741 static const struct option options[] = {
d7bea6b6
DP
742 { "help", no_argument, NULL, 'h' },
743 { "version", no_argument, NULL, ARG_VERSION },
744 { "directory", required_argument, NULL, 'D' },
745 { "template", required_argument, NULL, ARG_TEMPLATE },
746 { "ephemeral", no_argument, NULL, 'x' },
747 { "user", required_argument, NULL, 'u' },
748 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
749 { "as-pid2", no_argument, NULL, 'a' },
750 { "boot", no_argument, NULL, 'b' },
751 { "uuid", required_argument, NULL, ARG_UUID },
752 { "read-only", no_argument, NULL, ARG_READ_ONLY },
753 { "capability", required_argument, NULL, ARG_CAPABILITY },
88fc9c9b 754 { "ambient-capability", required_argument, NULL, ARG_AMBIENT_CAPABILITY },
d7bea6b6 755 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
66edd963 756 { "no-new-privileges", required_argument, NULL, ARG_NO_NEW_PRIVILEGES },
d7bea6b6
DP
757 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
758 { "bind", required_argument, NULL, ARG_BIND },
759 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
760 { "tmpfs", required_argument, NULL, ARG_TMPFS },
761 { "overlay", required_argument, NULL, ARG_OVERLAY },
762 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
de40a303 763 { "inaccessible", required_argument, NULL, ARG_INACCESSIBLE },
d7bea6b6 764 { "machine", required_argument, NULL, 'M' },
3a9530e5 765 { "hostname", required_argument, NULL, ARG_HOSTNAME },
d7bea6b6
DP
766 { "slice", required_argument, NULL, 'S' },
767 { "setenv", required_argument, NULL, 'E' },
768 { "selinux-context", required_argument, NULL, 'Z' },
769 { "selinux-apifs-context", required_argument, NULL, 'L' },
770 { "quiet", no_argument, NULL, 'q' },
771 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */
772 { "register", required_argument, NULL, ARG_REGISTER },
773 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
774 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
775 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
776 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
777 { "network-veth", no_argument, NULL, 'n' },
778 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA },
779 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
780 { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
781 { "network-namespace-path", required_argument, NULL, ARG_NETWORK_NAMESPACE_PATH },
782 { "personality", required_argument, NULL, ARG_PERSONALITY },
783 { "image", required_argument, NULL, 'i' },
784 { "volatile", optional_argument, NULL, ARG_VOLATILE },
785 { "port", required_argument, NULL, 'p' },
786 { "property", required_argument, NULL, ARG_PROPERTY },
787 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
6c045a99
LP
788 { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN }, /* obsolete */
789 { "private-users-ownership",required_argument, NULL, ARG_PRIVATE_USERS_OWNERSHIP},
d7bea6b6
DP
790 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
791 { "settings", required_argument, NULL, ARG_SETTINGS },
792 { "chdir", required_argument, NULL, ARG_CHDIR },
793 { "pivot-root", required_argument, NULL, ARG_PIVOT_ROOT },
794 { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
795 { "root-hash", required_argument, NULL, ARG_ROOT_HASH },
89e62e0b
LP
796 { "root-hash-sig", required_argument, NULL, ARG_ROOT_HASH_SIG },
797 { "verity-data", required_argument, NULL, ARG_VERITY_DATA },
d7bea6b6 798 { "system-call-filter", required_argument, NULL, ARG_SYSTEM_CALL_FILTER },
bf428efb 799 { "rlimit", required_argument, NULL, ARG_RLIMIT },
81f345df 800 { "oom-score-adjust", required_argument, NULL, ARG_OOM_SCORE_ADJUST },
d107bb7d 801 { "cpu-affinity", required_argument, NULL, ARG_CPU_AFFINITY },
09d423e9 802 { "resolv-conf", required_argument, NULL, ARG_RESOLV_CONF },
1688841f 803 { "timezone", required_argument, NULL, ARG_TIMEZONE },
de40a303
LP
804 { "console", required_argument, NULL, ARG_CONSOLE },
805 { "pipe", no_argument, NULL, ARG_PIPE },
806 { "oci-bundle", required_argument, NULL, ARG_OCI_BUNDLE },
bb068de0 807 { "no-pager", no_argument, NULL, ARG_NO_PAGER },
3652872a
LP
808 { "set-credential", required_argument, NULL, ARG_SET_CREDENTIAL },
809 { "load-credential", required_argument, NULL, ARG_LOAD_CREDENTIAL },
2f893044 810 { "bind-user", required_argument, NULL, ARG_BIND_USER },
4a4654e0 811 { "suppress-sync", required_argument, NULL, ARG_SUPPRESS_SYNC },
84be0c71 812 { "image-policy", required_argument, NULL, ARG_IMAGE_POLICY },
eb9da376 813 {}
88213476
LP
814 };
815
9444b1f2 816 int c, r;
a42c8b54 817 uint64_t plus = 0, minus = 0;
f757855e 818 bool mask_all_settings = false, mask_no_settings = false;
88213476
LP
819
820 assert(argc >= 0);
821 assert(argv);
822
ef9c12b1
YW
823 /* Resetting to 0 forces the invocation of an internal initialization routine of getopt_long()
824 * that checks for GNU extensions in optstring ('-' or '+' at the beginning). */
825 optind = 0;
de40a303 826 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nUE:P", options, NULL)) >= 0)
88213476
LP
827 switch (c) {
828
829 case 'h':
37ec0fdd 830 return help();
88213476 831
acbeb427 832 case ARG_VERSION:
3f6fd1ba 833 return version();
acbeb427 834
88213476 835 case 'D':
614b022c 836 r = parse_path_argument(optarg, false, &arg_directory);
ec16945e 837 if (r < 0)
0f03c2a4 838 return r;
de40a303
LP
839
840 arg_settings_mask |= SETTING_DIRECTORY;
ec16945e
LP
841 break;
842
843 case ARG_TEMPLATE:
614b022c 844 r = parse_path_argument(optarg, false, &arg_template);
ec16945e 845 if (r < 0)
0f03c2a4 846 return r;
de40a303
LP
847
848 arg_settings_mask |= SETTING_DIRECTORY;
88213476
LP
849 break;
850
1b9e5b12 851 case 'i':
614b022c 852 r = parse_path_argument(optarg, false, &arg_image);
ec16945e 853 if (r < 0)
0f03c2a4 854 return r;
de40a303
LP
855
856 arg_settings_mask |= SETTING_DIRECTORY;
857 break;
858
859 case ARG_OCI_BUNDLE:
614b022c 860 r = parse_path_argument(optarg, false, &arg_oci_bundle);
de40a303
LP
861 if (r < 0)
862 return r;
863
ec16945e
LP
864 break;
865
866 case 'x':
867 arg_ephemeral = true;
a2f577fc 868 arg_settings_mask |= SETTING_EPHEMERAL;
1b9e5b12
LP
869 break;
870
687d0825 871 case 'u':
2fc09a9c
DM
872 r = free_and_strdup(&arg_user, optarg);
873 if (r < 0)
7027ff61 874 return log_oom();
687d0825 875
f757855e 876 arg_settings_mask |= SETTING_USER;
687d0825
MV
877 break;
878
22b28dfd 879 case ARG_NETWORK_ZONE: {
fee9f7b5 880 _cleanup_free_ char *j = NULL;
22b28dfd 881
b910cc72 882 j = strjoin("vz-", optarg);
22b28dfd
LP
883 if (!j)
884 return log_oom();
885
fee9f7b5
FS
886 if (!ifname_valid(j))
887 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
888 "Network zone name not valid: %s", j);
22b28dfd 889
df1fac6d 890 free_and_replace(arg_network_zone, j);
22b28dfd
LP
891
892 arg_network_veth = true;
893 arg_private_network = true;
894 arg_settings_mask |= SETTING_NETWORK;
895 break;
896 }
897
ab046dde 898 case ARG_NETWORK_BRIDGE:
ef76dff2 899
baaa35ad
ZJS
900 if (!ifname_valid(optarg))
901 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
902 "Bridge interface name not valid: %s", optarg);
ef76dff2 903
f757855e
LP
904 r = free_and_strdup(&arg_network_bridge, optarg);
905 if (r < 0)
906 return log_oom();
ab046dde 907
4831981d 908 _fallthrough_;
0dfaa006 909 case 'n':
69c79d3c
LP
910 arg_network_veth = true;
911 arg_private_network = true;
f757855e 912 arg_settings_mask |= SETTING_NETWORK;
69c79d3c
LP
913 break;
914
f6d6bad1
LP
915 case ARG_NETWORK_VETH_EXTRA:
916 r = veth_extra_parse(&arg_network_veth_extra, optarg);
917 if (r < 0)
918 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
919
920 arg_private_network = true;
921 arg_settings_mask |= SETTING_NETWORK;
922 break;
923
aa28aefe 924 case ARG_NETWORK_INTERFACE:
2f091b1b 925 r = interface_pair_parse(&arg_network_interfaces, optarg);
b390f178
DDM
926 if (r < 0)
927 return r;
928
c74e630d 929 arg_private_network = true;
f757855e 930 arg_settings_mask |= SETTING_NETWORK;
c74e630d
LP
931 break;
932
933 case ARG_NETWORK_MACVLAN:
2f091b1b 934 r = macvlan_pair_parse(&arg_network_macvlan, optarg);
b390f178
DDM
935 if (r < 0)
936 return r;
937
4bbfe7ad 938 arg_private_network = true;
f757855e 939 arg_settings_mask |= SETTING_NETWORK;
4bbfe7ad
TG
940 break;
941
942 case ARG_NETWORK_IPVLAN:
2f091b1b 943 r = ipvlan_pair_parse(&arg_network_ipvlan, optarg);
b390f178
DDM
944 if (r < 0)
945 return r;
946
4831981d 947 _fallthrough_;
ff01d048
LP
948 case ARG_PRIVATE_NETWORK:
949 arg_private_network = true;
f757855e 950 arg_settings_mask |= SETTING_NETWORK;
a41fe3a2
LP
951 break;
952
d7bea6b6 953 case ARG_NETWORK_NAMESPACE_PATH:
614b022c 954 r = parse_path_argument(optarg, false, &arg_network_namespace_path);
d7bea6b6
DP
955 if (r < 0)
956 return r;
957
de40a303 958 arg_settings_mask |= SETTING_NETWORK;
d7bea6b6
DP
959 break;
960
0f0dbc46 961 case 'b':
baaa35ad
ZJS
962 if (arg_start_mode == START_PID2)
963 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
964 "--boot and --as-pid2 may not be combined.");
7732f92b
LP
965
966 arg_start_mode = START_BOOT;
967 arg_settings_mask |= SETTING_START_MODE;
968 break;
969
970 case 'a':
baaa35ad
ZJS
971 if (arg_start_mode == START_BOOT)
972 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
973 "--boot and --as-pid2 may not be combined.");
7732f92b
LP
974
975 arg_start_mode = START_PID2;
976 arg_settings_mask |= SETTING_START_MODE;
0f0dbc46
LP
977 break;
978
144f0fc0 979 case ARG_UUID:
9444b1f2 980 r = sd_id128_from_string(optarg, &arg_uuid);
317feb4d
LP
981 if (r < 0)
982 return log_error_errno(r, "Invalid UUID: %s", optarg);
983
baaa35ad
ZJS
984 if (sd_id128_is_null(arg_uuid))
985 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
986 "Machine UUID may not be all zeroes.");
f757855e
LP
987
988 arg_settings_mask |= SETTING_MACHINE_ID;
9444b1f2 989 break;
aa96c6cb 990
43c3fb46
LP
991 case 'S': {
992 _cleanup_free_ char *mangled = NULL;
993
994 r = unit_name_mangle_with_suffix(optarg, NULL, UNIT_NAME_MANGLE_WARN, ".slice", &mangled);
de40a303
LP
995 if (r < 0)
996 return log_oom();
997
43c3fb46 998 free_and_replace(arg_slice, mangled);
de40a303 999 arg_settings_mask |= SETTING_SLICE;
144f0fc0 1000 break;
43c3fb46 1001 }
144f0fc0 1002
7027ff61 1003 case 'M':
c1521918 1004 if (isempty(optarg))
97b11eed 1005 arg_machine = mfree(arg_machine);
c1521918 1006 else {
52ef5dd7 1007 if (!hostname_is_valid(optarg, 0))
baaa35ad
ZJS
1008 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1009 "Invalid machine name: %s", optarg);
7027ff61 1010
0c3c4284
LP
1011 r = free_and_strdup(&arg_machine, optarg);
1012 if (r < 0)
eb91eb18 1013 return log_oom();
eb91eb18 1014 }
9ce6d1b3 1015 break;
7027ff61 1016
3a9530e5
LP
1017 case ARG_HOSTNAME:
1018 if (isempty(optarg))
1019 arg_hostname = mfree(arg_hostname);
1020 else {
52ef5dd7 1021 if (!hostname_is_valid(optarg, 0))
baaa35ad
ZJS
1022 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1023 "Invalid hostname: %s", optarg);
3a9530e5
LP
1024
1025 r = free_and_strdup(&arg_hostname, optarg);
1026 if (r < 0)
1027 return log_oom();
1028 }
1029
1030 arg_settings_mask |= SETTING_HOSTNAME;
1031 break;
1032
82adf6af
LP
1033 case 'Z':
1034 arg_selinux_context = optarg;
a8828ed9
DW
1035 break;
1036
82adf6af
LP
1037 case 'L':
1038 arg_selinux_apifs_context = optarg;
a8828ed9
DW
1039 break;
1040
bc2f673e
LP
1041 case ARG_READ_ONLY:
1042 arg_read_only = true;
f757855e 1043 arg_settings_mask |= SETTING_READ_ONLY;
bc2f673e
LP
1044 break;
1045
88fc9c9b
TH
1046 case ARG_AMBIENT_CAPABILITY: {
1047 uint64_t m;
1048 r = parse_capability_spec(optarg, &m);
1049 if (r <= 0)
1050 return r;
1051 arg_caps_ambient |= m;
1052 arg_settings_mask |= SETTING_CAPABILITY;
1053 break;
1054 }
420c7379
LP
1055 case ARG_CAPABILITY:
1056 case ARG_DROP_CAPABILITY: {
8a99bd0c
ZJS
1057 uint64_t m;
1058 r = parse_capability_spec(optarg, &m);
1059 if (r <= 0)
1060 return r;
5076f0cc 1061
8a99bd0c
ZJS
1062 if (c == ARG_CAPABILITY)
1063 plus |= m;
1064 else
1065 minus |= m;
f757855e 1066 arg_settings_mask |= SETTING_CAPABILITY;
5076f0cc
LP
1067 break;
1068 }
66edd963
LP
1069 case ARG_NO_NEW_PRIVILEGES:
1070 r = parse_boolean(optarg);
1071 if (r < 0)
1072 return log_error_errno(r, "Failed to parse --no-new-privileges= argument: %s", optarg);
1073
1074 arg_no_new_privileges = r;
1075 arg_settings_mask |= SETTING_NO_NEW_PRIVILEGES;
1076 break;
1077
57fb9fb5
LP
1078 case 'j':
1079 arg_link_journal = LINK_GUEST;
574edc90 1080 arg_link_journal_try = true;
4e1d6aa9 1081 arg_settings_mask |= SETTING_LINK_JOURNAL;
57fb9fb5
LP
1082 break;
1083
1084 case ARG_LINK_JOURNAL:
4e1d6aa9 1085 r = parse_link_journal(optarg, &arg_link_journal, &arg_link_journal_try);
c6147113
LP
1086 if (r < 0)
1087 return log_error_errno(r, "Failed to parse link journal mode %s", optarg);
57fb9fb5 1088
4e1d6aa9 1089 arg_settings_mask |= SETTING_LINK_JOURNAL;
57fb9fb5
LP
1090 break;
1091
17fe0523 1092 case ARG_BIND:
f757855e
LP
1093 case ARG_BIND_RO:
1094 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
1095 if (r < 0)
1096 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
17fe0523 1097
f757855e 1098 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
17fe0523 1099 break;
06c17c39 1100
f757855e
LP
1101 case ARG_TMPFS:
1102 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
1103 if (r < 0)
1104 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
5a8af538 1105
f757855e 1106 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
5a8af538 1107 break;
5a8af538
LP
1108
1109 case ARG_OVERLAY:
ad85779a
LP
1110 case ARG_OVERLAY_RO:
1111 r = overlay_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_OVERLAY_RO);
1112 if (r == -EADDRNOTAVAIL)
1113 return log_error_errno(r, "--overlay(-ro)= needs at least two colon-separated directories specified.");
1114 if (r < 0)
1115 return log_error_errno(r, "Failed to parse --overlay(-ro)= argument %s: %m", optarg);
06c17c39 1116
f757855e 1117 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
06c17c39 1118 break;
06c17c39 1119
de40a303
LP
1120 case ARG_INACCESSIBLE:
1121 r = inaccessible_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
1122 if (r < 0)
1123 return log_error_errno(r, "Failed to parse --inaccessible= argument %s: %m", optarg);
1124
1125 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1126 break;
1127
0d2a0179
ZJS
1128 case 'E':
1129 r = strv_env_replace_strdup_passthrough(&arg_setenv, optarg);
aaf057c4 1130 if (r < 0)
0d2a0179 1131 return log_error_errno(r, "Cannot assign environment variable %s: %m", optarg);
f4889f65 1132
f757855e 1133 arg_settings_mask |= SETTING_ENVIRONMENT;
f4889f65 1134 break;
f4889f65 1135
284c0b91
LP
1136 case 'q':
1137 arg_quiet = true;
1138 break;
1139
8a96d94e 1140 case ARG_SHARE_SYSTEM:
a6b5216c 1141 /* We don't officially support this anymore, except for compat reasons. People should use the
0c582db0 1142 * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
63d1c29f 1143 log_warning("Please do not use --share-system anymore, use $SYSTEMD_NSPAWN_SHARE_* instead.");
0c582db0 1144 arg_clone_ns_flags = 0;
8a96d94e
LP
1145 break;
1146
eb91eb18
LP
1147 case ARG_REGISTER:
1148 r = parse_boolean(optarg);
1149 if (r < 0) {
1150 log_error("Failed to parse --register= argument: %s", optarg);
1151 return r;
1152 }
1153
1154 arg_register = r;
1155 break;
1156
89f7c846
LP
1157 case ARG_KEEP_UNIT:
1158 arg_keep_unit = true;
1159 break;
1160
6afc95b7
LP
1161 case ARG_PERSONALITY:
1162
ac45f971 1163 arg_personality = personality_from_string(optarg);
baaa35ad
ZJS
1164 if (arg_personality == PERSONALITY_INVALID)
1165 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1166 "Unknown or unsupported personality '%s'.", optarg);
6afc95b7 1167
f757855e 1168 arg_settings_mask |= SETTING_PERSONALITY;
6afc95b7
LP
1169 break;
1170
4d9f07b4
LP
1171 case ARG_VOLATILE:
1172
1173 if (!optarg)
f757855e 1174 arg_volatile_mode = VOLATILE_YES;
5c828e66
LP
1175 else if (streq(optarg, "help")) {
1176 DUMP_STRING_TABLE(volatile_mode, VolatileMode, _VOLATILE_MODE_MAX);
1177 return 0;
1178 } else {
f757855e 1179 VolatileMode m;
4d9f07b4 1180
f757855e 1181 m = volatile_mode_from_string(optarg);
baaa35ad
ZJS
1182 if (m < 0)
1183 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1184 "Failed to parse --volatile= argument: %s", optarg);
1185 else
f757855e 1186 arg_volatile_mode = m;
6d0b55c2
LP
1187 }
1188
f757855e
LP
1189 arg_settings_mask |= SETTING_VOLATILE_MODE;
1190 break;
6d0b55c2 1191
f757855e
LP
1192 case 'p':
1193 r = expose_port_parse(&arg_expose_ports, optarg);
1194 if (r == -EEXIST)
1195 return log_error_errno(r, "Duplicate port specification: %s", optarg);
1196 if (r < 0)
1197 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
6d0b55c2 1198
f757855e 1199 arg_settings_mask |= SETTING_EXPOSE_PORTS;
6d0b55c2 1200 break;
6d0b55c2 1201
f36933fe
LP
1202 case ARG_PROPERTY:
1203 if (strv_extend(&arg_property, optarg) < 0)
1204 return log_oom();
1205
1206 break;
1207
ae209204 1208 case ARG_PRIVATE_USERS: {
33eac552 1209 int boolean;
0de7acce 1210
ae209204
ZJS
1211 if (!optarg)
1212 boolean = true;
1213 else if (!in_charset(optarg, DIGITS))
1214 /* do *not* parse numbers as booleans */
1215 boolean = parse_boolean(optarg);
33eac552
LP
1216 else
1217 boolean = -1;
ae209204 1218
33eac552 1219 if (boolean == 0) {
0de7acce
LP
1220 /* no: User namespacing off */
1221 arg_userns_mode = USER_NAMESPACE_NO;
1222 arg_uid_shift = UID_INVALID;
1223 arg_uid_range = UINT32_C(0x10000);
33eac552 1224 } else if (boolean > 0) {
0de7acce
LP
1225 /* yes: User namespacing on, UID range is read from root dir */
1226 arg_userns_mode = USER_NAMESPACE_FIXED;
1227 arg_uid_shift = UID_INVALID;
1228 arg_uid_range = UINT32_C(0x10000);
1229 } else if (streq(optarg, "pick")) {
1230 /* pick: User namespacing on, UID range is picked randomly */
6c045a99
LP
1231 arg_userns_mode = USER_NAMESPACE_PICK; /* Note that arg_userns_ownership is
1232 * implied by USER_NAMESPACE_PICK
33eac552 1233 * further down. */
0de7acce
LP
1234 arg_uid_shift = UID_INVALID;
1235 arg_uid_range = UINT32_C(0x10000);
33eac552
LP
1236
1237 } else if (streq(optarg, "identity")) {
6c2d70ce 1238 /* identity: User namespacing on, the 0…0xFFFF UID range is mapped to
33eac552
LP
1239 * itself, i.e. we don't actually map anything, but do take benefit of
1240 * isolation of capability sets. */
1241 arg_userns_mode = USER_NAMESPACE_FIXED;
1242 arg_uid_shift = 0;
1243 arg_uid_range = UINT32_C(0x10000);
0de7acce 1244 } else {
6c2058b3 1245 _cleanup_free_ char *buffer = NULL;
6dac160c
LP
1246 const char *range, *shift;
1247
0de7acce
LP
1248 /* anything else: User namespacing on, UID range is explicitly configured */
1249
6dac160c
LP
1250 range = strchr(optarg, ':');
1251 if (range) {
6c2058b3
ZJS
1252 buffer = strndup(optarg, range - optarg);
1253 if (!buffer)
1254 return log_oom();
1255 shift = buffer;
6dac160c
LP
1256
1257 range++;
bfd292ec
ZJS
1258 r = safe_atou32(range, &arg_uid_range);
1259 if (r < 0)
be715731 1260 return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range);
6dac160c
LP
1261 } else
1262 shift = optarg;
1263
be715731
ZJS
1264 r = parse_uid(shift, &arg_uid_shift);
1265 if (r < 0)
1266 return log_error_errno(r, "Failed to parse UID \"%s\": %m", optarg);
0de7acce
LP
1267
1268 arg_userns_mode = USER_NAMESPACE_FIXED;
6dac160c 1269
58e13de5
LP
1270 if (!userns_shift_range_valid(arg_uid_shift, arg_uid_range))
1271 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "UID range cannot be empty or go beyond " UID_FMT ".", UID_INVALID);
1272 }
be715731 1273
0de7acce 1274 arg_settings_mask |= SETTING_USERNS;
6dac160c 1275 break;
ae209204 1276 }
6dac160c 1277
0de7acce 1278 case 'U':
ccabee0d 1279 if (userns_supported()) {
6c045a99
LP
1280 arg_userns_mode = USER_NAMESPACE_PICK; /* Note that arg_userns_ownership is
1281 * implied by USER_NAMESPACE_PICK
33eac552 1282 * further down. */
ccabee0d
LP
1283 arg_uid_shift = UID_INVALID;
1284 arg_uid_range = UINT32_C(0x10000);
1285
1286 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
1287 }
1288
7336138e
LP
1289 break;
1290
0de7acce 1291 case ARG_PRIVATE_USERS_CHOWN:
6c045a99
LP
1292 arg_userns_ownership = USER_NAMESPACE_OWNERSHIP_CHOWN;
1293
1294 arg_settings_mask |= SETTING_USERNS;
1295 break;
1296
1297 case ARG_PRIVATE_USERS_OWNERSHIP:
1298 if (streq(optarg, "help")) {
1299 DUMP_STRING_TABLE(user_namespace_ownership, UserNamespaceOwnership, _USER_NAMESPACE_OWNERSHIP_MAX);
1300 return 0;
1301 }
1302
1303 arg_userns_ownership = user_namespace_ownership_from_string(optarg);
1304 if (arg_userns_ownership < 0)
1305 return log_error_errno(arg_userns_ownership, "Cannot parse --user-namespace-ownership= value: %s", optarg);
0de7acce
LP
1306
1307 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
1308 break;
1309
c6c8f6e2 1310 case ARG_KILL_SIGNAL:
5c828e66
LP
1311 if (streq(optarg, "help")) {
1312 DUMP_STRING_TABLE(signal, int, _NSIG);
1313 return 0;
1314 }
1315
29a3db75 1316 arg_kill_signal = signal_from_string(optarg);
baaa35ad 1317 if (arg_kill_signal < 0)
7211c853 1318 return log_error_errno(arg_kill_signal, "Cannot parse signal: %s", optarg);
c6c8f6e2 1319
f757855e
LP
1320 arg_settings_mask |= SETTING_KILL_SIGNAL;
1321 break;
1322
1323 case ARG_SETTINGS:
1324
1325 /* no → do not read files
1326 * yes → read files, do not override cmdline, trust only subset
1327 * override → read files, override cmdline, trust only subset
1328 * trusted → read files, do not override cmdline, trust all
1329 */
1330
1331 r = parse_boolean(optarg);
1332 if (r < 0) {
1333 if (streq(optarg, "trusted")) {
1334 mask_all_settings = false;
1335 mask_no_settings = false;
1336 arg_settings_trusted = true;
1337
1338 } else if (streq(optarg, "override")) {
1339 mask_all_settings = false;
1340 mask_no_settings = true;
1341 arg_settings_trusted = -1;
1342 } else
1343 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
1344 } else if (r > 0) {
1345 /* yes */
1346 mask_all_settings = false;
1347 mask_no_settings = false;
1348 arg_settings_trusted = -1;
1349 } else {
1350 /* no */
1351 mask_all_settings = true;
1352 mask_no_settings = false;
1353 arg_settings_trusted = false;
1354 }
1355
c6c8f6e2
LP
1356 break;
1357
5f932eb9 1358 case ARG_CHDIR:
baaa35ad
ZJS
1359 if (!path_is_absolute(optarg))
1360 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1361 "Working directory %s is not an absolute path.", optarg);
5f932eb9
LP
1362
1363 r = free_and_strdup(&arg_chdir, optarg);
1364 if (r < 0)
1365 return log_oom();
1366
1367 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
1368 break;
1369
b53ede69
PW
1370 case ARG_PIVOT_ROOT:
1371 r = pivot_root_parse(&arg_pivot_root_new, &arg_pivot_root_old, optarg);
1372 if (r < 0)
1373 return log_error_errno(r, "Failed to parse --pivot-root= argument %s: %m", optarg);
1374
1375 arg_settings_mask |= SETTING_PIVOT_ROOT;
1376 break;
1377
9c1e04d0
AP
1378 case ARG_NOTIFY_READY:
1379 r = parse_boolean(optarg);
baaa35ad
ZJS
1380 if (r < 0)
1381 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1382 "%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
9c1e04d0
AP
1383 arg_notify_ready = r;
1384 arg_settings_mask |= SETTING_NOTIFY_READY;
1385 break;
1386
4623e8e6 1387 case ARG_ROOT_HASH: {
89e62e0b 1388 _cleanup_free_ void *k = NULL;
4623e8e6
LP
1389 size_t l;
1390
1391 r = unhexmem(optarg, strlen(optarg), &k, &l);
1392 if (r < 0)
1393 return log_error_errno(r, "Failed to parse root hash: %s", optarg);
89e62e0b 1394 if (l < sizeof(sd_id128_t))
c6147113 1395 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Root hash must be at least 128bit long: %s", optarg);
4623e8e6 1396
89e62e0b
LP
1397 free_and_replace(arg_verity_settings.root_hash, k);
1398 arg_verity_settings.root_hash_size = l;
4623e8e6
LP
1399 break;
1400 }
1401
c2923fdc
LB
1402 case ARG_ROOT_HASH_SIG: {
1403 char *value;
89e62e0b
LP
1404 size_t l;
1405 void *p;
c2923fdc
LB
1406
1407 if ((value = startswith(optarg, "base64:"))) {
c2923fdc
LB
1408 r = unbase64mem(value, strlen(value), &p, &l);
1409 if (r < 0)
1410 return log_error_errno(r, "Failed to parse root hash signature '%s': %m", optarg);
1411
c2923fdc 1412 } else {
89e62e0b 1413 r = read_full_file(optarg, (char**) &p, &l);
c2923fdc 1414 if (r < 0)
89e62e0b 1415 return log_error_errno(r, "Failed parse root hash signature file '%s': %m", optarg);
c2923fdc
LB
1416 }
1417
89e62e0b
LP
1418 free_and_replace(arg_verity_settings.root_hash_sig, p);
1419 arg_verity_settings.root_hash_sig_size = l;
c2923fdc
LB
1420 break;
1421 }
1422
89e62e0b 1423 case ARG_VERITY_DATA:
614b022c 1424 r = parse_path_argument(optarg, false, &arg_verity_settings.data_path);
89e62e0b
LP
1425 if (r < 0)
1426 return r;
1427 break;
1428
960e4569
LP
1429 case ARG_SYSTEM_CALL_FILTER: {
1430 bool negative;
1431 const char *items;
1432
1433 negative = optarg[0] == '~';
1434 items = negative ? optarg + 1 : optarg;
1435
1436 for (;;) {
1437 _cleanup_free_ char *word = NULL;
1438
1439 r = extract_first_word(&items, &word, NULL, 0);
1440 if (r == 0)
1441 break;
1442 if (r == -ENOMEM)
1443 return log_oom();
1444 if (r < 0)
1445 return log_error_errno(r, "Failed to parse system call filter: %m");
1446
1447 if (negative)
6b000af4 1448 r = strv_extend(&arg_syscall_deny_list, word);
960e4569 1449 else
6b000af4 1450 r = strv_extend(&arg_syscall_allow_list, word);
960e4569
LP
1451 if (r < 0)
1452 return log_oom();
1453 }
1454
1455 arg_settings_mask |= SETTING_SYSCALL_FILTER;
1456 break;
1457 }
1458
bf428efb
LP
1459 case ARG_RLIMIT: {
1460 const char *eq;
622ecfa8 1461 _cleanup_free_ char *name = NULL;
bf428efb
LP
1462 int rl;
1463
5c828e66
LP
1464 if (streq(optarg, "help")) {
1465 DUMP_STRING_TABLE(rlimit, int, _RLIMIT_MAX);
1466 return 0;
1467 }
1468
bf428efb 1469 eq = strchr(optarg, '=');
baaa35ad
ZJS
1470 if (!eq)
1471 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1472 "--rlimit= expects an '=' assignment.");
bf428efb
LP
1473
1474 name = strndup(optarg, eq - optarg);
1475 if (!name)
1476 return log_oom();
1477
1478 rl = rlimit_from_string_harder(name);
baaa35ad 1479 if (rl < 0)
7211c853 1480 return log_error_errno(rl, "Unknown resource limit: %s", name);
bf428efb
LP
1481
1482 if (!arg_rlimit[rl]) {
1483 arg_rlimit[rl] = new0(struct rlimit, 1);
1484 if (!arg_rlimit[rl])
1485 return log_oom();
1486 }
1487
1488 r = rlimit_parse(rl, eq + 1, arg_rlimit[rl]);
1489 if (r < 0)
1490 return log_error_errno(r, "Failed to parse resource limit: %s", eq + 1);
1491
1492 arg_settings_mask |= SETTING_RLIMIT_FIRST << rl;
1493 break;
1494 }
1495
81f345df
LP
1496 case ARG_OOM_SCORE_ADJUST:
1497 r = parse_oom_score_adjust(optarg, &arg_oom_score_adjust);
1498 if (r < 0)
1499 return log_error_errno(r, "Failed to parse --oom-score-adjust= parameter: %s", optarg);
1500
1501 arg_oom_score_adjust_set = true;
1502 arg_settings_mask |= SETTING_OOM_SCORE_ADJUST;
1503 break;
1504
d107bb7d 1505 case ARG_CPU_AFFINITY: {
0985c7c4 1506 CPUSet cpuset;
d107bb7d
LP
1507
1508 r = parse_cpu_set(optarg, &cpuset);
1509 if (r < 0)
0985c7c4 1510 return log_error_errno(r, "Failed to parse CPU affinity mask %s: %m", optarg);
d107bb7d 1511
0985c7c4
ZJS
1512 cpu_set_reset(&arg_cpu_set);
1513 arg_cpu_set = cpuset;
d107bb7d
LP
1514 arg_settings_mask |= SETTING_CPU_AFFINITY;
1515 break;
1516 }
1517
09d423e9
LP
1518 case ARG_RESOLV_CONF:
1519 if (streq(optarg, "help")) {
1520 DUMP_STRING_TABLE(resolv_conf_mode, ResolvConfMode, _RESOLV_CONF_MODE_MAX);
1521 return 0;
1522 }
1523
1524 arg_resolv_conf = resolv_conf_mode_from_string(optarg);
baaa35ad 1525 if (arg_resolv_conf < 0)
7211c853 1526 return log_error_errno(arg_resolv_conf,
baaa35ad 1527 "Failed to parse /etc/resolv.conf mode: %s", optarg);
09d423e9
LP
1528
1529 arg_settings_mask |= SETTING_RESOLV_CONF;
1530 break;
1531
1688841f
LP
1532 case ARG_TIMEZONE:
1533 if (streq(optarg, "help")) {
1534 DUMP_STRING_TABLE(timezone_mode, TimezoneMode, _TIMEZONE_MODE_MAX);
1535 return 0;
1536 }
1537
1538 arg_timezone = timezone_mode_from_string(optarg);
baaa35ad 1539 if (arg_timezone < 0)
7211c853 1540 return log_error_errno(arg_timezone,
baaa35ad 1541 "Failed to parse /etc/localtime mode: %s", optarg);
1688841f
LP
1542
1543 arg_settings_mask |= SETTING_TIMEZONE;
1544 break;
1545
de40a303 1546 case ARG_CONSOLE:
dce66ffe
ZJS
1547 r = handle_arg_console(optarg);
1548 if (r <= 0)
1549 return r;
de40a303
LP
1550 break;
1551
1552 case 'P':
1553 case ARG_PIPE:
dce66ffe
ZJS
1554 r = handle_arg_console("pipe");
1555 if (r <= 0)
1556 return r;
de40a303
LP
1557 break;
1558
bb068de0
ZJS
1559 case ARG_NO_PAGER:
1560 arg_pager_flags |= PAGER_DISABLE;
1561 break;
1562
3652872a
LP
1563 case ARG_SET_CREDENTIAL: {
1564 _cleanup_free_ char *word = NULL, *data = NULL;
1565 const char *p = optarg;
1566 Credential *a;
e437538f 1567 ssize_t l;
3652872a
LP
1568
1569 r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
1570 if (r == -ENOMEM)
1571 return log_oom();
1572 if (r < 0)
1573 return log_error_errno(r, "Failed to parse --set-credential= parameter: %m");
1574 if (r == 0 || !p)
1575 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Missing value for --set-credential=: %s", optarg);
1576
1577 if (!credential_name_valid(word))
1578 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Credential name is not valid: %s", word);
1579
12d729b2 1580 for (size_t i = 0; i < arg_n_credentials; i++)
3652872a
LP
1581 if (streq(arg_credentials[i].id, word))
1582 return log_error_errno(SYNTHETIC_ERRNO(EEXIST), "Duplicate credential '%s', refusing.", word);
1583
1584 l = cunescape(p, UNESCAPE_ACCEPT_NUL, &data);
1585 if (l < 0)
1586 return log_error_errno(l, "Failed to unescape credential data: %s", p);
1587
1588 a = reallocarray(arg_credentials, arg_n_credentials + 1, sizeof(Credential));
1589 if (!a)
1590 return log_oom();
1591
1592 a[arg_n_credentials++] = (Credential) {
1593 .id = TAKE_PTR(word),
1594 .data = TAKE_PTR(data),
1595 .size = l,
1596 };
1597
1598 arg_credentials = a;
1599
1600 arg_settings_mask |= SETTING_CREDENTIALS;
1601 break;
1602 }
1603
1604 case ARG_LOAD_CREDENTIAL: {
1605 ReadFullFileFlags flags = READ_FULL_FILE_SECURE;
1606 _cleanup_(erase_and_freep) char *data = NULL;
1607 _cleanup_free_ char *word = NULL, *j = NULL;
1608 const char *p = optarg;
1609 Credential *a;
1610 size_t size, i;
1611
1612 r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
1613 if (r == -ENOMEM)
1614 return log_oom();
1615 if (r < 0)
c941b650 1616 return log_error_errno(r, "Failed to parse --load-credential= parameter: %m");
3652872a 1617 if (r == 0 || !p)
c941b650 1618 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Missing value for --load-credential=: %s", optarg);
3652872a
LP
1619
1620 if (!credential_name_valid(word))
1621 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Credential name is not valid: %s", word);
1622
1623 for (i = 0; i < arg_n_credentials; i++)
1624 if (streq(arg_credentials[i].id, word))
1625 return log_error_errno(SYNTHETIC_ERRNO(EEXIST), "Duplicate credential '%s', refusing.", word);
1626
1627 if (path_is_absolute(p))
1628 flags |= READ_FULL_FILE_CONNECT_SOCKET;
1629 else {
1630 const char *e;
1631
786d19fd
LP
1632 r = get_credentials_dir(&e);
1633 if (r < 0)
1634 return log_error_errno(r, "Credential not available (no credentials passed at all): %s", word);
3652872a
LP
1635
1636 j = path_join(e, p);
1637 if (!j)
1638 return log_oom();
1639 }
1640
986311c2
LP
1641 r = read_full_file_full(AT_FDCWD, j ?: p, UINT64_MAX, SIZE_MAX,
1642 flags,
1643 NULL,
1644 &data, &size);
3652872a
LP
1645 if (r < 0)
1646 return log_error_errno(r, "Failed to read credential '%s': %m", j ?: p);
1647
1648 a = reallocarray(arg_credentials, arg_n_credentials + 1, sizeof(Credential));
1649 if (!a)
1650 return log_oom();
1651
1652 a[arg_n_credentials++] = (Credential) {
1653 .id = TAKE_PTR(word),
1654 .data = TAKE_PTR(data),
1655 .size = size,
1656 };
1657
1658 arg_credentials = a;
1659
1660 arg_settings_mask |= SETTING_CREDENTIALS;
1661 break;
1662 }
1663
2f893044
LP
1664 case ARG_BIND_USER:
1665 if (!valid_user_group_name(optarg, 0))
1666 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid user name to bind: %s", optarg);
1667
1668 if (strv_extend(&arg_bind_user, optarg) < 0)
1669 return log_oom();
1670
1671 arg_settings_mask |= SETTING_BIND_USER;
1672 break;
1673
4a4654e0
LP
1674 case ARG_SUPPRESS_SYNC:
1675 r = parse_boolean_argument("--suppress-sync=", optarg, &arg_suppress_sync);
1676 if (r < 0)
1677 return r;
1678
1679 arg_settings_mask |= SETTING_SUPPRESS_SYNC;
1680 break;
1681
06e78680
YW
1682 case ARG_IMAGE_POLICY:
1683 r = parse_image_policy_argument(optarg, &arg_image_policy);
84be0c71 1684 if (r < 0)
06e78680 1685 return r;
84be0c71 1686 break;
84be0c71 1687
88213476
LP
1688 case '?':
1689 return -EINVAL;
1690
1691 default:
04499a70 1692 assert_not_reached();
88213476 1693 }
88213476 1694
60f1ec13
LP
1695 if (argc > optind) {
1696 strv_free(arg_parameters);
1697 arg_parameters = strv_copy(argv + optind);
1698 if (!arg_parameters)
1699 return log_oom();
d7bea6b6 1700
60f1ec13
LP
1701 arg_settings_mask |= SETTING_START_MODE;
1702 }
1703
1704 if (arg_ephemeral && arg_template && !arg_directory)
1705 /* User asked for ephemeral execution but specified --template= instead of --directory=. Semantically
1706 * such an invocation makes some sense, see https://github.com/systemd/systemd/issues/3667. Let's
1707 * accept this here, and silently make "--ephemeral --template=" equivalent to "--ephemeral
1708 * --directory=". */
1709 arg_directory = TAKE_PTR(arg_template);
1710
2642d22a
DDM
1711 arg_caps_retain |= plus;
1712 arg_caps_retain |= arg_private_network ? UINT64_C(1) << CAP_NET_ADMIN : 0;
1713
1714 /* If we're not unsharing the network namespace and are unsharing the user namespace, we won't have
1715 * permissions to bind ports in the container, so let's drop the CAP_NET_BIND_SERVICE capability to
1716 * indicate that. */
1717 if (!arg_private_network && arg_userns_mode != USER_NAMESPACE_NO && arg_uid_shift > 0)
1718 arg_caps_retain &= ~(UINT64_C(1) << CAP_NET_BIND_SERVICE);
1719
1720 arg_caps_retain &= ~minus;
60f1ec13 1721
de40a303 1722 /* Make sure to parse environment before we reset the settings mask below */
49048684
ZJS
1723 r = parse_environment();
1724 if (r < 0)
1725 return r;
de40a303 1726
60f1ec13
LP
1727 /* Load all settings from .nspawn files */
1728 if (mask_no_settings)
1729 arg_settings_mask = 0;
1730
1731 /* Don't load any settings from .nspawn files */
1732 if (mask_all_settings)
1733 arg_settings_mask = _SETTINGS_MASK_ALL;
1734
1735 return 1;
1736}
1737
1738static int verify_arguments(void) {
1739 int r;
a6b5216c 1740
75b0d8b8
ZJS
1741 if (arg_start_mode == START_PID2 && arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
1742 /* If we are running the stub init in the container, we don't need to look at what the init
1743 * in the container supports, because we are not using it. Let's immediately pick the right
1744 * setting based on the host system configuration.
1745 *
1746 * We only do this, if the user didn't use an environment variable to override the detection.
1747 */
1748
1749 r = cg_all_unified();
1750 if (r < 0)
1751 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
1752 if (r > 0)
1753 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
1754 else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0)
1755 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
1756 else
1757 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
1758 }
1759
4f086aab
SU
1760 if (arg_userns_mode != USER_NAMESPACE_NO)
1761 arg_mount_settings |= MOUNT_USE_USERNS;
1762
1763 if (arg_private_network)
1764 arg_mount_settings |= MOUNT_APPLY_APIVFS_NETNS;
1765
48a8d337
LB
1766 if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
1767 !(arg_clone_ns_flags & CLONE_NEWUTS)) {
eb91eb18 1768 arg_register = false;
baaa35ad 1769 if (arg_start_mode != START_PID1)
60f1ec13 1770 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--boot cannot be used without namespacing.");
0c582db0 1771 }
eb91eb18 1772
6c045a99
LP
1773 if (arg_userns_ownership < 0)
1774 arg_userns_ownership =
f61c7f88 1775 arg_userns_mode == USER_NAMESPACE_PICK ? USER_NAMESPACE_OWNERSHIP_AUTO :
6c045a99 1776 USER_NAMESPACE_OWNERSHIP_OFF;
0e7ac751 1777
60f1ec13
LP
1778 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
1779 arg_kill_signal = SIGRTMIN+3;
1780
e5a4bb0d
LP
1781 if (arg_volatile_mode != VOLATILE_NO) /* Make sure all file systems contained in the image are mounted read-only if we are in volatile mode */
1782 arg_read_only = true;
1783
2436ea76
DDM
1784 if (has_custom_root_mount(arg_custom_mounts, arg_n_custom_mounts))
1785 arg_read_only = true;
1786
baaa35ad 1787 if (arg_keep_unit && arg_register && cg_pid_get_owner_uid(0, NULL) >= 0)
8d9c2bca
AJ
1788 /* Save the user from accidentally registering either user-$SESSION.scope or user@.service.
1789 * The latter is not technically a user session, but we don't need to labour the point. */
60f1ec13 1790 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--keep-unit --register=yes may not be used when invoked from a user session.");
89f7c846 1791
baaa35ad 1792 if (arg_directory && arg_image)
60f1ec13 1793 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--directory= and --image= may not be combined.");
1b9e5b12 1794
baaa35ad 1795 if (arg_template && arg_image)
60f1ec13 1796 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= and --image= may not be combined.");
8cd328d8 1797
baaa35ad 1798 if (arg_template && !(arg_directory || arg_machine))
60f1ec13 1799 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= needs --directory= or --machine=.");
ec16945e 1800
baaa35ad 1801 if (arg_ephemeral && arg_template)
60f1ec13 1802 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --template= may not be combined.");
ec16945e 1803
baaa35ad 1804 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO))
60f1ec13 1805 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --link-journal= may not be combined.");
df9a75e4 1806
baaa35ad 1807 if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported())
60f1ec13 1808 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--private-users= is not supported, kernel compiled without user namespace support.");
7336138e 1809
6c045a99 1810 if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_CHOWN && arg_read_only)
de40a303 1811 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
6c045a99 1812 "--read-only and --private-users-ownership=chown may not be combined.");
f757855e 1813
6c045a99
LP
1814 /* We don't support --private-users-ownership=chown together with any of the volatile modes since we
1815 * couldn't change the read-only part of the tree (i.e. /usr) anyway, or because it would trigger a
1816 * massive copy-up (in case of overlay) making the entire exercise pointless. */
1817 if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_CHOWN && arg_volatile_mode != VOLATILE_NO)
1818 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--volatile= and --private-users-ownership=chown may not be combined.");
e5a4bb0d 1819
679ecd36
SZ
1820 /* If --network-namespace-path is given with any other network-related option (except --private-network),
1821 * we need to error out, to avoid conflicts between different network options. */
60f1ec13
LP
1822 if (arg_network_namespace_path &&
1823 (arg_network_interfaces || arg_network_macvlan ||
1824 arg_network_ipvlan || arg_network_veth_extra ||
1825 arg_network_bridge || arg_network_zone ||
679ecd36 1826 arg_network_veth))
de40a303 1827 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--network-namespace-path= cannot be combined with other network options.");
86c0dd4a 1828
60f1ec13 1829 if (arg_network_bridge && arg_network_zone)
de40a303
LP
1830 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1831 "--network-bridge= and --network-zone= may not be combined.");
f757855e 1832
baaa35ad 1833 if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network)
60f1ec13 1834 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
4f086aab 1835
baaa35ad 1836 if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO))
60f1ec13 1837 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot combine --private-users with read-write mounts.");
f757855e 1838
baaa35ad 1839 if (arg_expose_ports && !arg_private_network)
60f1ec13 1840 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot use --port= without private networking.");
6d0b55c2 1841
88fc9c9b 1842 if (arg_caps_ambient) {
f5fbe71d 1843 if (arg_caps_ambient == UINT64_MAX)
88fc9c9b
TH
1844 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= does not support the value all.");
1845
1846 if ((arg_caps_ambient & arg_caps_retain) != arg_caps_ambient)
1847 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= setting is not fully covered by Capability= setting.");
1848
1849 if (arg_start_mode == START_BOOT)
1850 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= setting is not useful for boot mode.");
1851 }
1852
2f893044
LP
1853 if (arg_userns_mode == USER_NAMESPACE_NO && !strv_isempty(arg_bind_user))
1854 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--bind-user= requires --private-users");
1855
1856 /* Drop duplicate --bind-user= entries */
1857 strv_uniq(arg_bind_user);
1858
60f1ec13
LP
1859 r = custom_mount_check_all();
1860 if (r < 0)
1861 return r;
c6c8f6e2 1862
f757855e 1863 return 0;
88213476
LP
1864}
1865
2f091b1b
TM
1866static int verify_network_interfaces_initialized(void) {
1867 int r;
1868 r = test_network_interfaces_initialized(arg_network_interfaces);
1869 if (r < 0)
1870 return r;
1871
1872 r = test_network_interfaces_initialized(arg_network_macvlan);
1873 if (r < 0)
1874 return r;
1875
1876 r = test_network_interfaces_initialized(arg_network_ipvlan);
1877 if (r < 0)
1878 return r;
1879
1880 return 0;
1881}
1882
91181e07 1883int userns_lchown(const char *p, uid_t uid, gid_t gid) {
03cfe0d5
LP
1884 assert(p);
1885
0de7acce 1886 if (arg_userns_mode == USER_NAMESPACE_NO)
03cfe0d5
LP
1887 return 0;
1888
1889 if (uid == UID_INVALID && gid == GID_INVALID)
1890 return 0;
1891
1892 if (uid != UID_INVALID) {
1893 uid += arg_uid_shift;
1894
1895 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1896 return -EOVERFLOW;
1897 }
1898
1899 if (gid != GID_INVALID) {
1900 gid += (gid_t) arg_uid_shift;
1901
1902 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1903 return -EOVERFLOW;
1904 }
1905
7c248223 1906 return RET_NERRNO(lchown(p, uid, gid));
b12afc8c
LP
1907}
1908
/* Creates the directory 'path' below 'root' with the given access mode and, if it was newly
 * created, hands it to userns_lchown() for the ownership adjustment. A directory that exists
 * already is left untouched and reported as success (0). */
int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;
        int r;

        full = prefix_roota(root, path);

        r = RET_NERRNO(mkdir(full, mode));
        if (r >= 0)
                return userns_lchown(full, uid, gid);

        /* Already there? Then there's nothing to do, and we don't touch the ownership either. */
        return r == -EEXIST ? 0 : r;
}
1922
/* If 'path' begins with one of the two zoneinfo directory prefixes ("../usr/share/zoneinfo/" or
 * "/usr/share/zoneinfo/"), returns the part of the path following that prefix, i.e. the timezone
 * name. Returns NULL if neither prefix matches. */
static const char *timezone_from_path(const char *path) {
        const char *rest;

        rest = path_startswith(path, "../usr/share/zoneinfo/");
        if (rest)
                return rest;

        return path_startswith(path, "/usr/share/zoneinfo/");
}
1929
83205269
LP
1930static bool etc_writable(void) {
1931 return !arg_read_only || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_OVERLAY);
1932}
1933
e58a1277 1934static int setup_timezone(const char *dest) {
1688841f
LP
1935 _cleanup_free_ char *p = NULL, *etc = NULL;
1936 const char *where, *check;
1937 TimezoneMode m;
d4036145 1938 int r;
f8440af5 1939
e58a1277
LP
1940 assert(dest);
1941
1688841f 1942 if (IN_SET(arg_timezone, TIMEZONE_AUTO, TIMEZONE_SYMLINK)) {
1688841f
LP
1943 r = readlink_malloc("/etc/localtime", &p);
1944 if (r == -ENOENT && arg_timezone == TIMEZONE_AUTO)
83205269 1945 m = etc_writable() ? TIMEZONE_DELETE : TIMEZONE_OFF;
1688841f 1946 else if (r == -EINVAL && arg_timezone == TIMEZONE_AUTO) /* regular file? */
83205269 1947 m = etc_writable() ? TIMEZONE_COPY : TIMEZONE_BIND;
1688841f
LP
1948 else if (r < 0) {
1949 log_warning_errno(r, "Failed to read host's /etc/localtime symlink, not updating container timezone: %m");
1950 /* To handle warning, delete /etc/localtime and replace it with a symbolic link to a time zone data
1951 * file.
1952 *
1953 * Example:
1954 * ln -s /usr/share/zoneinfo/UTC /etc/localtime
1955 */
1956 return 0;
1957 } else if (arg_timezone == TIMEZONE_AUTO)
83205269 1958 m = etc_writable() ? TIMEZONE_SYMLINK : TIMEZONE_BIND;
1688841f
LP
1959 else
1960 m = arg_timezone;
1961 } else
1962 m = arg_timezone;
1963
1964 if (m == TIMEZONE_OFF)
1965 return 0;
1966
f461a28d 1967 r = chase("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
d4036145 1968 if (r < 0) {
1688841f 1969 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
d4036145
LP
1970 return 0;
1971 }
1972
1688841f
LP
1973 where = strjoina(etc, "/localtime");
1974
1975 switch (m) {
1976
1977 case TIMEZONE_DELETE:
1978 if (unlink(where) < 0)
1979 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
1980
d4036145 1981 return 0;
d4036145 1982
1688841f
LP
1983 case TIMEZONE_SYMLINK: {
1984 _cleanup_free_ char *q = NULL;
1985 const char *z, *what;
4d1c38b8 1986
1688841f
LP
1987 z = timezone_from_path(p);
1988 if (!z) {
1989 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
d4036145 1990 return 0;
1688841f 1991 }
d4036145 1992
1688841f
LP
1993 r = readlink_malloc(where, &q);
1994 if (r >= 0 && streq_ptr(timezone_from_path(q), z))
1995 return 0; /* Already pointing to the right place? Then do nothing .. */
1996
1997 check = strjoina(dest, "/usr/share/zoneinfo/", z);
f461a28d 1998 r = chase(check, dest, 0, NULL, NULL);
1688841f
LP
1999 if (r < 0)
2000 log_debug_errno(r, "Timezone %s does not exist (or is not accessible) in container, not creating symlink: %m", z);
2001 else {
2002 if (unlink(where) < 0 && errno != ENOENT) {
2003 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING, /* Don't complain on read-only images */
2004 errno, "Failed to remove existing timezone info %s in container, ignoring: %m", where);
2005 return 0;
2006 }
2007
2008 what = strjoina("../usr/share/zoneinfo/", z);
2009 if (symlink(what, where) < 0) {
2010 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING,
2011 errno, "Failed to correct timezone of container, ignoring: %m");
2012 return 0;
2013 }
2014
2015 break;
2016 }
2017
2018 _fallthrough_;
d4036145 2019 }
68fb0892 2020
1688841f
LP
2021 case TIMEZONE_BIND: {
2022 _cleanup_free_ char *resolved = NULL;
2023 int found;
2024
f461a28d 2025 found = chase(where, dest, CHASE_NONEXISTENT, &resolved, NULL);
1688841f
LP
2026 if (found < 0) {
2027 log_warning_errno(found, "Failed to resolve /etc/localtime path in container, ignoring: %m");
2028 return 0;
2029 }
2030
2031 if (found == 0) /* missing? */
2032 (void) touch(resolved);
2033
511a8cfe 2034 r = mount_nofollow_verbose(LOG_WARNING, "/etc/localtime", resolved, NULL, MS_BIND, NULL);
1688841f 2035 if (r >= 0)
511a8cfe 2036 return mount_nofollow_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
1688841f
LP
2037
2038 _fallthrough_;
79d80fc1 2039 }
4d9f07b4 2040
1688841f
LP
2041 case TIMEZONE_COPY:
2042 /* If mounting failed, try to copy */
7c2f5495 2043 r = copy_file_atomic("/etc/localtime", where, 0644, COPY_REFLINK|COPY_REPLACE);
1688841f
LP
2044 if (r < 0) {
2045 log_full_errno(IN_SET(r, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
2046 "Failed to copy /etc/localtime to %s, ignoring: %m", where);
2047 return 0;
2048 }
2049
2050 break;
2051
2052 default:
04499a70 2053 assert_not_reached();
d4036145 2054 }
e58a1277 2055
1688841f 2056 /* Fix permissions of the symlink or file copy we just created */
03cfe0d5
LP
2057 r = userns_lchown(where, 0, 0);
2058 if (r < 0)
1688841f 2059 log_warning_errno(r, "Failed to chown /etc/localtime, ignoring: %m");
03cfe0d5 2060
e58a1277 2061 return 0;
88213476
LP
2062}
2063
09d423e9
LP
/* Checks with access(F_OK) whether 'path' exists. Returns 1 if so, 0 if the check fails with
 * ENOENT, and a negative errno-style value (logged at debug level) on any other failure. */
static int have_resolv_conf(const char *path) {
        assert(path);

        if (access(path, F_OK) >= 0)
                return 1;

        if (errno == ENOENT)
                return 0;

        return log_debug_errno(errno, "Failed to determine whether '%s' is available: %m", path);
}
2076
7357272e 2077static int resolved_listening(void) {
b8ea7a6e 2078 _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
b053cd5f 2079 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
7357272e 2080 _cleanup_free_ char *dns_stub_listener_mode = NULL;
b053cd5f
LP
2081 int r;
2082
7357272e 2083 /* Check if resolved is listening */
b053cd5f
LP
2084
2085 r = sd_bus_open_system(&bus);
2086 if (r < 0)
b8ea7a6e 2087 return log_debug_errno(r, "Failed to open system bus: %m");
b053cd5f 2088
7357272e 2089 r = bus_name_has_owner(bus, "org.freedesktop.resolve1", NULL);
b8ea7a6e
LP
2090 if (r < 0)
2091 return log_debug_errno(r, "Failed to check whether the 'org.freedesktop.resolve1' bus name is taken: %m");
2092 if (r == 0)
2093 return 0;
7357272e
DM
2094
2095 r = sd_bus_get_property_string(bus,
2096 "org.freedesktop.resolve1",
2097 "/org/freedesktop/resolve1",
2098 "org.freedesktop.resolve1.Manager",
2099 "DNSStubListener",
b8ea7a6e 2100 &error,
7357272e
DM
2101 &dns_stub_listener_mode);
2102 if (r < 0)
b8ea7a6e 2103 return log_debug_errno(r, "Failed to query DNSStubListener property: %s", bus_error_message(&error, r));
7357272e
DM
2104
2105 return STR_IN_SET(dns_stub_listener_mode, "udp", "yes");
b053cd5f
LP
2106}
2107
2547bb41 2108static int setup_resolv_conf(const char *dest) {
09d423e9
LP
2109 _cleanup_free_ char *etc = NULL;
2110 const char *where, *what;
2111 ResolvConfMode m;
2112 int r;
2547bb41
LP
2113
2114 assert(dest);
2115
09d423e9
LP
2116 if (arg_resolv_conf == RESOLV_CONF_AUTO) {
2117 if (arg_private_network)
2118 m = RESOLV_CONF_OFF;
86775e35
LP
2119 else if (have_resolv_conf(PRIVATE_STUB_RESOLV_CONF) > 0 && resolved_listening() > 0)
2120 m = etc_writable() ? RESOLV_CONF_COPY_STUB : RESOLV_CONF_BIND_STUB;
09d423e9 2121 else if (have_resolv_conf("/etc/resolv.conf") > 0)
83205269 2122 m = etc_writable() ? RESOLV_CONF_COPY_HOST : RESOLV_CONF_BIND_HOST;
09d423e9 2123 else
83205269 2124 m = etc_writable() ? RESOLV_CONF_DELETE : RESOLV_CONF_OFF;
86775e35 2125
09d423e9
LP
2126 } else
2127 m = arg_resolv_conf;
2128
2129 if (m == RESOLV_CONF_OFF)
2547bb41
LP
2130 return 0;
2131
f461a28d 2132 r = chase("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
87447ae4
LP
2133 if (r < 0) {
2134 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
2135 return 0;
2136 }
2137
2138 where = strjoina(etc, "/resolv.conf");
09d423e9
LP
2139
2140 if (m == RESOLV_CONF_DELETE) {
2141 if (unlink(where) < 0)
2142 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
2143
87447ae4
LP
2144 return 0;
2145 }
79d80fc1 2146
86775e35
LP
2147 if (IN_SET(m, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_REPLACE_STATIC, RESOLV_CONF_COPY_STATIC))
2148 what = PRIVATE_STATIC_RESOLV_CONF;
2149 else if (IN_SET(m, RESOLV_CONF_BIND_UPLINK, RESOLV_CONF_REPLACE_UPLINK, RESOLV_CONF_COPY_UPLINK))
2150 what = PRIVATE_UPLINK_RESOLV_CONF;
2151 else if (IN_SET(m, RESOLV_CONF_BIND_STUB, RESOLV_CONF_REPLACE_STUB, RESOLV_CONF_COPY_STUB))
2152 what = PRIVATE_STUB_RESOLV_CONF;
09d423e9
LP
2153 else
2154 what = "/etc/resolv.conf";
87447ae4 2155
86775e35 2156 if (IN_SET(m, RESOLV_CONF_BIND_HOST, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_BIND_UPLINK, RESOLV_CONF_BIND_STUB)) {
09d423e9
LP
2157 _cleanup_free_ char *resolved = NULL;
2158 int found;
2159
d404c8d8 2160 found = chase(where, dest, CHASE_NONEXISTENT|CHASE_NOFOLLOW, &resolved, NULL);
09d423e9
LP
2161 if (found < 0) {
2162 log_warning_errno(found, "Failed to resolve /etc/resolv.conf path in container, ignoring: %m");
2163 return 0;
2164 }
3539724c 2165
87447ae4
LP
2166 if (found == 0) /* missing? */
2167 (void) touch(resolved);
5367354d 2168
511a8cfe 2169 r = mount_nofollow_verbose(LOG_WARNING, what, resolved, NULL, MS_BIND, NULL);
60e76d48 2170 if (r >= 0)
511a8cfe 2171 return mount_nofollow_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
86775e35
LP
2172
2173 /* If that didn't work, let's copy the file */
3539724c
LP
2174 }
2175
86775e35 2176 if (IN_SET(m, RESOLV_CONF_REPLACE_HOST, RESOLV_CONF_REPLACE_STATIC, RESOLV_CONF_REPLACE_UPLINK, RESOLV_CONF_REPLACE_STUB))
7c2f5495 2177 r = copy_file_atomic(what, where, 0644, COPY_REFLINK|COPY_REPLACE);
86775e35 2178 else
7c2f5495 2179 r = copy_file(what, where, O_TRUNC|O_NOFOLLOW, 0644, COPY_REFLINK);
79d80fc1 2180 if (r < 0) {
3539724c
LP
2181 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
2182 * resolved or something similar runs inside and the symlink points there.
68a313c5 2183 *
3539724c 2184 * If the disk image is read-only, there's also no point in complaining.
68a313c5 2185 */
86775e35
LP
2186 log_full_errno(!IN_SET(RESOLV_CONF_COPY_HOST, RESOLV_CONF_COPY_STATIC, RESOLV_CONF_COPY_UPLINK, RESOLV_CONF_COPY_STUB) &&
2187 IN_SET(r, -ELOOP, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
3539724c 2188 "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where);
79d80fc1
TG
2189 return 0;
2190 }
2547bb41 2191
03cfe0d5
LP
2192 r = userns_lchown(where, 0, 0);
2193 if (r < 0)
3539724c 2194 log_warning_errno(r, "Failed to chown /etc/resolv.conf, ignoring: %m");
03cfe0d5 2195
2547bb41
LP
2196 return 0;
2197}
2198
1e4f1671 2199static int setup_boot_id(void) {
cdde6ba6
LP
2200 _cleanup_(unlink_and_freep) char *from = NULL;
2201 _cleanup_free_ char *path = NULL;
3bbaff3e 2202 sd_id128_t rnd = SD_ID128_NULL;
cdde6ba6 2203 const char *to;
04bc4a3f
LP
2204 int r;
2205
1eacc470 2206 /* Generate a new randomized boot ID, so that each boot-up of the container gets a new one */
04bc4a3f 2207
1eacc470 2208 r = tempfn_random_child("/run", "proc-sys-kernel-random-boot-id", &path);
cdde6ba6
LP
2209 if (r < 0)
2210 return log_error_errno(r, "Failed to generate random boot ID path: %m");
04bc4a3f
LP
2211
2212 r = sd_id128_randomize(&rnd);
f647962d
MS
2213 if (r < 0)
2214 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 2215
b40c8ebd 2216 r = id128_write(path, ID128_FORMAT_UUID, rnd);
f647962d
MS
2217 if (r < 0)
2218 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f 2219
cdde6ba6
LP
2220 from = TAKE_PTR(path);
2221 to = "/proc/sys/kernel/random/boot_id";
2222
511a8cfe 2223 r = mount_nofollow_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
cdde6ba6
LP
2224 if (r < 0)
2225 return r;
04bc4a3f 2226
511a8cfe 2227 return mount_nofollow_verbose(LOG_ERR, NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
04bc4a3f
LP
2228}
2229
e58a1277 2230static int copy_devnodes(const char *dest) {
88213476
LP
2231 static const char devnodes[] =
2232 "null\0"
2233 "zero\0"
2234 "full\0"
2235 "random\0"
2236 "urandom\0"
85614d66
TG
2237 "tty\0"
2238 "net/tun\0";
88213476 2239
e58a1277 2240 int r = 0;
a258bf26
LP
2241
2242 assert(dest);
124640f1 2243
52f05ef2 2244 BLOCK_WITH_UMASK(0000);
88213476 2245
03cfe0d5
LP
2246 /* Create /dev/net, so that we can create /dev/net/tun in it */
2247 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
2248 return log_error_errno(r, "Failed to create /dev/net directory: %m");
2249
88213476 2250 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 2251 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 2252 struct stat st;
88213476 2253
c6134d3e 2254 from = path_join("/dev/", d);
8967f291
LP
2255 if (!from)
2256 return log_oom();
2257
c6134d3e 2258 to = path_join(dest, from);
8967f291
LP
2259 if (!to)
2260 return log_oom();
88213476
LP
2261
2262 if (stat(from, &st) < 0) {
2263
4a62c710
MS
2264 if (errno != ENOENT)
2265 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 2266
baaa35ad
ZJS
2267 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode))
2268 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2269 "%s is not a char or block device, cannot copy.", from);
2270 else {
8dfce114
LP
2271 _cleanup_free_ char *sl = NULL, *prefixed = NULL, *dn = NULL, *t = NULL;
2272
81f5049b 2273 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
8dbf71ec 2274 /* Explicitly warn the user when /dev is already populated. */
41eb4362 2275 if (errno == EEXIST)
8dbf71ec 2276 log_notice("%s/dev is pre-mounted and pre-populated. If a pre-mounted /dev is provided it needs to be an unpopulated file system.", dest);
81f5049b
AC
2277 if (errno != EPERM)
2278 return log_error_errno(errno, "mknod(%s) failed: %m", to);
2279
8dfce114 2280 /* Some systems abusively restrict mknod but allow bind mounts. */
81f5049b
AC
2281 r = touch(to);
2282 if (r < 0)
2283 return log_error_errno(r, "touch (%s) failed: %m", to);
511a8cfe 2284 r = mount_nofollow_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL);
60e76d48
ZJS
2285 if (r < 0)
2286 return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to);
81f5049b 2287 }
6278cf60 2288
03cfe0d5
LP
2289 r = userns_lchown(to, 0, 0);
2290 if (r < 0)
2291 return log_error_errno(r, "chown() of device node %s failed: %m", to);
8dfce114 2292
657ee2d8 2293 dn = path_join("/dev", S_ISCHR(st.st_mode) ? "char" : "block");
8dfce114
LP
2294 if (!dn)
2295 return log_oom();
2296
2297 r = userns_mkdir(dest, dn, 0755, 0, 0);
2298 if (r < 0)
2299 return log_error_errno(r, "Failed to create '%s': %m", dn);
2300
2301 if (asprintf(&sl, "%s/%u:%u", dn, major(st.st_rdev), minor(st.st_rdev)) < 0)
2302 return log_oom();
2303
c6134d3e 2304 prefixed = path_join(dest, sl);
8dfce114
LP
2305 if (!prefixed)
2306 return log_oom();
2307
2d9b74ba 2308 t = path_join("..", d);
8dfce114
LP
2309 if (!t)
2310 return log_oom();
2311
2312 if (symlink(t, prefixed) < 0)
2313 log_debug_errno(errno, "Failed to symlink '%s' to '%s': %m", t, prefixed);
88213476 2314 }
88213476
LP
2315 }
2316
e58a1277
LP
2317 return r;
2318}
88213476 2319
de40a303 2320static int make_extra_nodes(const char *dest) {
de40a303
LP
2321 size_t i;
2322 int r;
2323
52f05ef2 2324 BLOCK_WITH_UMASK(0000);
de40a303
LP
2325
2326 for (i = 0; i < arg_n_extra_nodes; i++) {
2327 _cleanup_free_ char *path = NULL;
2328 DeviceNode *n = arg_extra_nodes + i;
2329
c6134d3e 2330 path = path_join(dest, n->path);
de40a303
LP
2331 if (!path)
2332 return log_oom();
2333
2334 if (mknod(path, n->mode, S_ISCHR(n->mode) || S_ISBLK(n->mode) ? makedev(n->major, n->minor) : 0) < 0)
2335 return log_error_errno(errno, "Failed to create device node '%s': %m", path);
2336
2337 r = chmod_and_chown(path, n->mode, n->uid, n->gid);
2338 if (r < 0)
2339 return log_error_errno(r, "Failed to adjust device node ownership of '%s': %m", path);
2340 }
2341
2342 return 0;
2343}
2344
03cfe0d5
LP
2345static int setup_pts(const char *dest) {
2346 _cleanup_free_ char *options = NULL;
2347 const char *p;
709f6e46 2348 int r;
03cfe0d5 2349
349cc4a5 2350#if HAVE_SELINUX
03cfe0d5
LP
2351 if (arg_selinux_apifs_context)
2352 (void) asprintf(&options,
3dce8915 2353 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
03cfe0d5
LP
2354 arg_uid_shift + TTY_GID,
2355 arg_selinux_apifs_context);
2356 else
2357#endif
2358 (void) asprintf(&options,
3dce8915 2359 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
03cfe0d5 2360 arg_uid_shift + TTY_GID);
f2d88580 2361
03cfe0d5 2362 if (!options)
f2d88580
LP
2363 return log_oom();
2364
03cfe0d5 2365 /* Mount /dev/pts itself */
cc9fce65 2366 p = prefix_roota(dest, "/dev/pts");
3f692e2e 2367 r = RET_NERRNO(mkdir(p, 0755));
dae8b82e
ZJS
2368 if (r < 0)
2369 return log_error_errno(r, "Failed to create /dev/pts: %m");
2370
511a8cfe 2371 r = mount_nofollow_verbose(LOG_ERR, "devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options);
60e76d48
ZJS
2372 if (r < 0)
2373 return r;
709f6e46
MS
2374 r = userns_lchown(p, 0, 0);
2375 if (r < 0)
2376 return log_error_errno(r, "Failed to chown /dev/pts: %m");
03cfe0d5
LP
2377
2378 /* Create /dev/ptmx symlink */
2379 p = prefix_roota(dest, "/dev/ptmx");
4a62c710
MS
2380 if (symlink("pts/ptmx", p) < 0)
2381 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
709f6e46
MS
2382 r = userns_lchown(p, 0, 0);
2383 if (r < 0)
2384 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
f2d88580 2385
03cfe0d5
LP
2386 /* And fix /dev/pts/ptmx ownership */
2387 p = prefix_roota(dest, "/dev/pts/ptmx");
709f6e46
MS
2388 r = userns_lchown(p, 0, 0);
2389 if (r < 0)
2390 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
6278cf60 2391
f2d88580
LP
2392 return 0;
2393}
2394
3acc84eb 2395static int setup_stdio_as_dev_console(void) {
5bb1d7fb 2396 _cleanup_close_ int terminal = -EBADF;
e58a1277 2397 int r;
e58a1277 2398
335d2ead
LP
2399 /* We open the TTY in O_NOCTTY mode, so that we do not become controller yet. We'll do that later
2400 * explicitly, if we are configured to. */
2401 terminal = open_terminal("/dev/console", O_RDWR|O_NOCTTY);
3acc84eb
FB
2402 if (terminal < 0)
2403 return log_error_errno(terminal, "Failed to open console: %m");
e58a1277 2404
3acc84eb
FB
2405 /* Make sure we can continue logging to the original stderr, even if
2406 * stderr points elsewhere now */
2407 r = log_dup_console();
2408 if (r < 0)
2409 return log_error_errno(r, "Failed to duplicate stderr: %m");
de40a303 2410
3acc84eb
FB
2411 /* invalidates 'terminal' on success and failure */
2412 r = rearrange_stdio(terminal, terminal, terminal);
2fef50cd 2413 TAKE_FD(terminal);
f647962d 2414 if (r < 0)
3acc84eb
FB
2415 return log_error_errno(r, "Failed to move console to stdin/stdout/stderr: %m");
2416
2417 return 0;
2418}
88213476 2419
3acc84eb
FB
2420static int setup_dev_console(const char *console) {
2421 _cleanup_free_ char *p = NULL;
2422 int r;
a258bf26 2423
3acc84eb
FB
2424 /* Create /dev/console symlink */
2425 r = path_make_relative("/dev", console, &p);
81f5049b 2426 if (r < 0)
3acc84eb
FB
2427 return log_error_errno(r, "Failed to create relative path: %m");
2428
2429 if (symlink(p, "/dev/console") < 0)
2430 return log_error_errno(errno, "Failed to create /dev/console symlink: %m");
a258bf26 2431
3acc84eb 2432 return 0;
e58a1277
LP
2433}
2434
8e5430c4
LP
2435static int setup_keyring(void) {
2436 key_serial_t keyring;
2437
6b000af4
LP
2438 /* Allocate a new session keyring for the container. This makes sure the keyring of the session
2439 * systemd-nspawn was invoked from doesn't leak into the container. Note that by default we block
2440 * keyctl() and request_key() anyway via seccomp so doing this operation isn't strictly necessary,
2441 * but in case people explicitly allow-list these system calls let's make sure we don't leak anything
2442 * into the container. */
8e5430c4
LP
2443
2444 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2445 if (keyring == -1) {
2446 if (errno == ENOSYS)
2447 log_debug_errno(errno, "Kernel keyring not supported, ignoring.");
065b4774 2448 else if (ERRNO_IS_PRIVILEGE(errno))
8e5430c4
LP
2449 log_debug_errno(errno, "Kernel keyring access prohibited, ignoring.");
2450 else
2451 return log_error_errno(errno, "Setting up kernel keyring failed: %m");
2452 }
2453
2454 return 0;
2455}
2456
3652872a
LP
2457static int setup_credentials(const char *root) {
2458 const char *q;
2459 int r;
2460
2461 if (arg_n_credentials <= 0)
2462 return 0;
2463
2464 r = userns_mkdir(root, "/run/host", 0755, 0, 0);
2465 if (r < 0)
2466 return log_error_errno(r, "Failed to create /run/host: %m");
2467
2468 r = userns_mkdir(root, "/run/host/credentials", 0700, 0, 0);
2469 if (r < 0)
2470 return log_error_errno(r, "Failed to create /run/host/credentials: %m");
2471
2472 q = prefix_roota(root, "/run/host/credentials");
511a8cfe 2473 r = mount_nofollow_verbose(LOG_ERR, NULL, q, "ramfs", MS_NOSUID|MS_NOEXEC|MS_NODEV, "mode=0700");
3652872a
LP
2474 if (r < 0)
2475 return r;
2476
2477 for (size_t i = 0; i < arg_n_credentials; i++) {
2478 _cleanup_free_ char *j = NULL;
254d1313 2479 _cleanup_close_ int fd = -EBADF;
3652872a
LP
2480
2481 j = path_join(q, arg_credentials[i].id);
2482 if (!j)
2483 return log_oom();
2484
2485 fd = open(j, O_CREAT|O_EXCL|O_WRONLY|O_CLOEXEC|O_NOFOLLOW, 0600);
2486 if (fd < 0)
2487 return log_error_errno(errno, "Failed to create credential file %s: %m", j);
2488
2489 r = loop_write(fd, arg_credentials[i].data, arg_credentials[i].size, /* do_poll= */ false);
2490 if (r < 0)
2491 return log_error_errno(r, "Failed to write credential to file %s: %m", j);
2492
2493 if (fchmod(fd, 0400) < 0)
2494 return log_error_errno(errno, "Failed to adjust access mode of %s: %m", j);
2495
2496 if (arg_userns_mode != USER_NAMESPACE_NO) {
2497 if (fchown(fd, arg_uid_shift, arg_uid_shift) < 0)
2498 return log_error_errno(errno, "Failed to adjust ownership of %s: %m", j);
2499 }
2500 }
2501
2502 if (chmod(q, 0500) < 0)
2503 return log_error_errno(errno, "Failed to adjust access mode of %s: %m", q);
2504
2505 r = userns_lchown(q, 0, 0);
2506 if (r < 0)
2507 return r;
2508
2509 /* Make both mount and superblock read-only now */
511a8cfe 2510 r = mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_REMOUNT|MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
3652872a
LP
2511 if (r < 0)
2512 return r;
2513
511a8cfe 2514 return mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, "mode=0500");
3652872a
LP
2515}
2516
5d9d3fcb 2517static int setup_kmsg(int fd_inner_socket) {
9ec5a93c
LP
2518 _cleanup_(unlink_and_freep) char *from = NULL;
2519 _cleanup_free_ char *fifo = NULL;
254d1313 2520 _cleanup_close_ int fd = -EBADF;
9ec5a93c 2521 int r;
e58a1277 2522
5d9d3fcb 2523 assert(fd_inner_socket >= 0);
a258bf26 2524
52f05ef2 2525 BLOCK_WITH_UMASK(0000);
a258bf26 2526
30fd9a2d 2527 /* We create the kmsg FIFO as a temporary file in /run, but immediately delete it after bind mounting it to
9ec5a93c
LP
2528 * /proc/kmsg. While FIFOs on the reading side behave very similar to /proc/kmsg, their writing side behaves
2529 * differently from /dev/kmsg in that writing blocks when nothing is reading. In order to avoid any problems
2530 * with containers deadlocking due to this we simply make /dev/kmsg unavailable to the container. */
2531
1eacc470 2532 r = tempfn_random_child("/run", "proc-kmsg", &fifo);
9ec5a93c
LP
2533 if (r < 0)
2534 return log_error_errno(r, "Failed to generate kmsg path: %m");
e58a1277 2535
9ec5a93c 2536 if (mkfifo(fifo, 0600) < 0)
03cfe0d5 2537 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
9ec5a93c
LP
2538
2539 from = TAKE_PTR(fifo);
9ec5a93c 2540
511a8cfe 2541 r = mount_nofollow_verbose(LOG_ERR, from, "/proc/kmsg", NULL, MS_BIND, NULL);
60e76d48
ZJS
2542 if (r < 0)
2543 return r;
e58a1277 2544
669fc4e5 2545 fd = open(from, O_RDWR|O_NONBLOCK|O_CLOEXEC);
4a62c710
MS
2546 if (fd < 0)
2547 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 2548
9ec5a93c 2549 /* Store away the fd in the socket, so that it stays open as long as we run the child */
5d9d3fcb 2550 r = send_one_fd(fd_inner_socket, fd, 0);
d9603714
DH
2551 if (r < 0)
2552 return log_error_errno(r, "Failed to send FIFO fd: %m");
a258bf26 2553
25ea79fe 2554 return 0;
88213476
LP
2555}
2556
761cf19d 2557struct ExposeArgs {
deff68e7
FW
2558 union in_addr_union address4;
2559 union in_addr_union address6;
761cf19d
FW
2560 struct FirewallContext *fw_ctx;
2561};
2562
1c4baffc 2563static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
99534007 2564 struct ExposeArgs *args = ASSERT_PTR(userdata);
6d0b55c2
LP
2565
2566 assert(rtnl);
2567 assert(m);
6d0b55c2 2568
fb9044cb
LP
2569 (void) expose_port_execute(rtnl, &args->fw_ctx, arg_expose_ports, AF_INET, &args->address4);
2570 (void) expose_port_execute(rtnl, &args->fw_ctx, arg_expose_ports, AF_INET6, &args->address6);
6d0b55c2
LP
2571 return 0;
2572}
2573
3a74cea5 2574static int setup_hostname(void) {
c818eef1 2575 int r;
3a74cea5 2576
0c582db0 2577 if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
eb91eb18
LP
2578 return 0;
2579
c818eef1
LP
2580 r = sethostname_idempotent(arg_hostname ?: arg_machine);
2581 if (r < 0)
2582 return log_error_errno(r, "Failed to set hostname: %m");
3a74cea5 2583
7027ff61 2584 return 0;
3a74cea5
LP
2585}
2586
57fb9fb5 2587static int setup_journal(const char *directory) {
0f5e1382 2588 _cleanup_free_ char *d = NULL;
5980d463 2589 const char *p, *q;
b2238e38 2590 sd_id128_t this_id;
8054d749 2591 bool try;
57fb9fb5
LP
2592 int r;
2593
df9a75e4
LP
2594 /* Don't link journals in ephemeral mode */
2595 if (arg_ephemeral)
2596 return 0;
2597
8054d749
LP
2598 if (arg_link_journal == LINK_NO)
2599 return 0;
2600
2601 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
2602
4d680aee 2603 r = sd_id128_get_machine(&this_id);
f647962d
MS
2604 if (r < 0)
2605 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee 2606
e01ff70a 2607 if (sd_id128_equal(arg_uuid, this_id)) {
8054d749 2608 log_full(try ? LOG_WARNING : LOG_ERR,
85b55869 2609 "Host and machine ids are equal (%s): refusing to link journals", SD_ID128_TO_STRING(arg_uuid));
8054d749 2610 if (try)
4d680aee 2611 return 0;
df9a75e4 2612 return -EEXIST;
4d680aee
ZJS
2613 }
2614
369ca6da
ZJS
2615 FOREACH_STRING(dirname, "/var", "/var/log", "/var/log/journal") {
2616 r = userns_mkdir(directory, dirname, 0755, 0, 0);
2617 if (r < 0) {
2618 bool ignore = r == -EROFS && try;
2619 log_full_errno(ignore ? LOG_DEBUG : LOG_ERR, r,
2620 "Failed to create %s%s: %m", dirname, ignore ? ", ignoring" : "");
2621 return ignore ? 0 : r;
2622 }
2623 }
03cfe0d5 2624
85b55869 2625 p = strjoina("/var/log/journal/", SD_ID128_TO_STRING(arg_uuid));
03cfe0d5 2626 q = prefix_roota(directory, p);
27407a01 2627
e1873695 2628 if (path_is_mount_point(p, NULL, 0) > 0) {
8054d749
LP
2629 if (try)
2630 return 0;
27407a01 2631
baaa35ad
ZJS
2632 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2633 "%s: already a mount point, refusing to use for journal", p);
57fb9fb5
LP
2634 }
2635
e1873695 2636 if (path_is_mount_point(q, NULL, 0) > 0) {
8054d749
LP
2637 if (try)
2638 return 0;
57fb9fb5 2639
baaa35ad
ZJS
2640 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2641 "%s: already a mount point, refusing to use for journal", q);
57fb9fb5
LP
2642 }
2643
2644 r = readlink_and_make_absolute(p, &d);
2645 if (r >= 0) {
3742095b 2646 if (IN_SET(arg_link_journal, LINK_GUEST, LINK_AUTO) &&
57fb9fb5
LP
2647 path_equal(d, q)) {
2648
03cfe0d5 2649 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 2650 if (r < 0)
709f6e46 2651 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 2652 return 0;
57fb9fb5
LP
2653 }
2654
4a62c710
MS
2655 if (unlink(p) < 0)
2656 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
2657 } else if (r == -EINVAL) {
2658
2659 if (arg_link_journal == LINK_GUEST &&
2660 rmdir(p) < 0) {
2661
27407a01
ZJS
2662 if (errno == ENOTDIR) {
2663 log_error("%s already exists and is neither a symlink nor a directory", p);
2664 return r;
4314d33f
MS
2665 } else
2666 return log_error_errno(errno, "Failed to remove %s: %m", p);
57fb9fb5 2667 }
4314d33f
MS
2668 } else if (r != -ENOENT)
2669 return log_error_errno(r, "readlink(%s) failed: %m", p);
57fb9fb5
LP
2670
2671 if (arg_link_journal == LINK_GUEST) {
2672
2673 if (symlink(q, p) < 0) {
8054d749 2674 if (try) {
56f64d95 2675 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90 2676 return 0;
4314d33f
MS
2677 } else
2678 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
57fb9fb5
LP
2679 }
2680
03cfe0d5 2681 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 2682 if (r < 0)
709f6e46 2683 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 2684 return 0;
57fb9fb5
LP
2685 }
2686
2687 if (arg_link_journal == LINK_HOST) {
ccddd104 2688 /* don't create parents here — if the host doesn't have
574edc90 2689 * permanent journal set up, don't force it here */
ba8e6c4d 2690
3f692e2e 2691 r = RET_NERRNO(mkdir(p, 0755));
dae8b82e 2692 if (r < 0 && r != -EEXIST) {
8054d749 2693 if (try) {
dae8b82e 2694 log_debug_errno(r, "Failed to create %s, skipping journal setup: %m", p);
574edc90 2695 return 0;
4314d33f 2696 } else
dae8b82e 2697 return log_error_errno(r, "Failed to create %s: %m", p);
57fb9fb5
LP
2698 }
2699
27407a01
ZJS
2700 } else if (access(p, F_OK) < 0)
2701 return 0;
57fb9fb5 2702
db55bbf2 2703 if (dir_is_empty(q, /* ignore_hidden_or_backup= */ false) == 0)
cdb2b9d0
LP
2704 log_warning("%s is not empty, proceeding anyway.", q);
2705
03cfe0d5 2706 r = userns_mkdir(directory, p, 0755, 0, 0);
709f6e46
MS
2707 if (r < 0)
2708 return log_error_errno(r, "Failed to create %s: %m", q);
57fb9fb5 2709
511a8cfe 2710 r = mount_nofollow_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL);
60e76d48 2711 if (r < 0)
4a62c710 2712 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 2713
27407a01 2714 return 0;
57fb9fb5
LP
2715}
2716
de40a303
LP
2717static int drop_capabilities(uid_t uid) {
2718 CapabilityQuintet q;
2719
2720 /* Let's initialize all five capability sets to something valid. If the quintet was configured via
2721 * OCI use that, but fill in missing bits. If it wasn't then derive the quintet in full from
2722 * arg_caps_retain. */
2723
2724 if (capability_quintet_is_set(&arg_full_capabilities)) {
2725 q = arg_full_capabilities;
2726
f5fbe71d 2727 if (q.bounding == UINT64_MAX)
de40a303
LP
2728 q.bounding = uid == 0 ? arg_caps_retain : 0;
2729
f5fbe71d 2730 if (q.effective == UINT64_MAX)
de40a303
LP
2731 q.effective = uid == 0 ? q.bounding : 0;
2732
f5fbe71d 2733 if (q.inheritable == UINT64_MAX)
88fc9c9b 2734 q.inheritable = uid == 0 ? q.bounding : arg_caps_ambient;
de40a303 2735
f5fbe71d 2736 if (q.permitted == UINT64_MAX)
88fc9c9b 2737 q.permitted = uid == 0 ? q.bounding : arg_caps_ambient;
de40a303 2738
f5fbe71d 2739 if (q.ambient == UINT64_MAX && ambient_capabilities_supported())
88fc9c9b 2740 q.ambient = arg_caps_ambient;
f66ad460
AZ
2741
2742 if (capability_quintet_mangle(&q))
2743 return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Cannot set capabilities that are not in the current bounding set.");
2744
2745 } else {
de40a303
LP
2746 q = (CapabilityQuintet) {
2747 .bounding = arg_caps_retain,
2748 .effective = uid == 0 ? arg_caps_retain : 0,
88fc9c9b
TH
2749 .inheritable = uid == 0 ? arg_caps_retain : arg_caps_ambient,
2750 .permitted = uid == 0 ? arg_caps_retain : arg_caps_ambient,
f5fbe71d 2751 .ambient = ambient_capabilities_supported() ? arg_caps_ambient : UINT64_MAX,
de40a303
LP
2752 };
2753
f66ad460
AZ
2754 /* If we're not using OCI, proceed with mangled capabilities (so we don't error out)
2755 * in order to maintain the same behavior as systemd < 242. */
2756 if (capability_quintet_mangle(&q))
0ccdaa79
JT
2757 log_full(arg_quiet ? LOG_DEBUG : LOG_WARNING,
2758 "Some capabilities will not be set because they are not in the current bounding set.");
f66ad460
AZ
2759
2760 }
2761
de40a303 2762 return capability_quintet_enforce(&q);
88213476
LP
2763}
2764
db999e0f
LP
2765static int reset_audit_loginuid(void) {
2766 _cleanup_free_ char *p = NULL;
2767 int r;
2768
0c582db0 2769 if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
db999e0f
LP
2770 return 0;
2771
2772 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 2773 if (r == -ENOENT)
db999e0f 2774 return 0;
f647962d
MS
2775 if (r < 0)
2776 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
2777
2778 /* Already reset? */
2779 if (streq(p, "4294967295"))
2780 return 0;
2781
57512c89 2782 r = write_string_file("/proc/self/loginuid", "4294967295", WRITE_STRING_FILE_DISABLE_BUFFER);
db999e0f 2783 if (r < 0) {
10a87006
LP
2784 log_error_errno(r,
2785 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2786 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2787 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2788 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2789 "using systemd-nspawn. Sleeping for 5s... (%m)");
77b6e194 2790
db999e0f 2791 sleep(5);
77b6e194 2792 }
db999e0f
LP
2793
2794 return 0;
77b6e194
LP
2795}
2796
e79581dd 2797static int mount_tunnel_dig(const char *root) {
785890ac 2798 const char *p, *q;
709f6e46 2799 int r;
785890ac
LP
2800
2801 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2802 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
63c372cb 2803 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
2804 (void) mkdir_p(p, 0600);
2805
5a27b395 2806 r = userns_mkdir(root, "/run/host", 0755, 0, 0);
709f6e46 2807 if (r < 0)
5a27b395 2808 return log_error_errno(r, "Failed to create /run/host: %m");
03cfe0d5 2809
e79581dd 2810 r = userns_mkdir(root, NSPAWN_MOUNT_TUNNEL, 0600, 0, 0);
709f6e46 2811 if (r < 0)
e79581dd 2812 return log_error_errno(r, "Failed to create "NSPAWN_MOUNT_TUNNEL": %m");
03cfe0d5 2813
e79581dd 2814 q = prefix_roota(root, NSPAWN_MOUNT_TUNNEL);
511a8cfe 2815 r = mount_nofollow_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
60e76d48
ZJS
2816 if (r < 0)
2817 return r;
785890ac 2818
511a8cfe 2819 r = mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
60e76d48
ZJS
2820 if (r < 0)
2821 return r;
785890ac 2822
e79581dd
CB
2823 return 0;
2824}
2825
2826static int mount_tunnel_open(void) {
2827 int r;
2828
2829 r = mount_follow_verbose(LOG_ERR, NULL, NSPAWN_MOUNT_TUNNEL, NULL, MS_SLAVE, NULL);
2830 if (r < 0)
2831 return r;
2832
2833 return 0;
785890ac
LP
2834}
2835
317feb4d 2836static int setup_machine_id(const char *directory) {
3bbaff3e 2837 int r;
e01ff70a 2838
317feb4d
LP
2839 /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
2840 * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
2841 * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
2842 * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
2843 * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
2844 * container behaves nicely). */
2845
319477f1 2846 r = id128_get_machine(directory, &arg_uuid);
317feb4d 2847 if (r < 0) {
74e795ee 2848 if (!ERRNO_IS_MACHINE_ID_UNSET(r)) /* If the file is missing, empty, or uninitialized, we don't mind */
317feb4d 2849 return log_error_errno(r, "Failed to read machine ID from container image: %m");
691675ba 2850
317feb4d
LP
2851 if (sd_id128_is_null(arg_uuid)) {
2852 r = sd_id128_randomize(&arg_uuid);
2853 if (r < 0)
2854 return log_error_errno(r, "Failed to acquire randomized machine UUID: %m");
2855 }
317feb4d 2856 }
691675ba 2857
e01ff70a
MS
2858 return 0;
2859}
2860
7336138e
LP
2861static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
2862 int r;
2863
2864 assert(directory);
2865
6c045a99 2866 if (arg_userns_mode == USER_NAMESPACE_NO || arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_CHOWN)
7336138e
LP
2867 return 0;
2868
2869 r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
2870 if (r == -EOPNOTSUPP)
2871 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
2872 if (r == -EBADE)
2873 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
2874 if (r < 0)
2875 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
2876 if (r == 0)
2877 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
2878 else
2879 log_debug("Patched directory tree to match UID/GID range.");
2880
2881 return r;
2882}
2883
113cea80 2884/*
6d416b9c
LS
2885 * Return values:
2886 * < 0 : wait_for_terminate() failed to get the state of the
2887 * container, the container was terminated by a signal, or
2888 * failed for an unknown reason. No change is made to the
2889 * container argument.
2890 * > 0 : The program executed in the container terminated with an
2891 * error. The exit code of the program executed in the
919699ec
LP
2892 * container is returned. The container argument has been set
2893 * to CONTAINER_TERMINATED.
6d416b9c
LS
2894 * 0 : The container is being rebooted, has been shut down or exited
2895 * successfully. The container argument has been set to either
2896 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 2897 *
6d416b9c
LS
2898 * That is, success is indicated by a return value of zero, and an
2899 * error is indicated by a non-zero value.
113cea80
DH
2900 */
2901static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 2902 siginfo_t status;
919699ec 2903 int r;
113cea80
DH
2904
2905 r = wait_for_terminate(pid, &status);
f647962d
MS
2906 if (r < 0)
2907 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
2908
2909 switch (status.si_code) {
fddbb89c 2910
113cea80 2911 case CLD_EXITED:
b5a2179b 2912 if (status.si_status == 0)
919699ec 2913 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
b5a2179b 2914 else
919699ec 2915 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 2916
919699ec
LP
2917 *container = CONTAINER_TERMINATED;
2918 return status.si_status;
113cea80
DH
2919
2920 case CLD_KILLED:
2921 if (status.si_status == SIGINT) {
919699ec 2922 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 2923 *container = CONTAINER_TERMINATED;
919699ec
LP
2924 return 0;
2925
113cea80 2926 } else if (status.si_status == SIGHUP) {
919699ec 2927 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 2928 *container = CONTAINER_REBOOTED;
919699ec 2929 return 0;
113cea80 2930 }
919699ec 2931
4831981d 2932 _fallthrough_;
113cea80 2933 case CLD_DUMPED:
baaa35ad
ZJS
2934 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2935 "Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
113cea80
DH
2936
2937 default:
baaa35ad
ZJS
2938 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2939 "Container %s failed due to unknown reason.", arg_machine);
113cea80 2940 }
113cea80
DH
2941}
2942
023fb90b
LP
2943static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2944 pid_t pid;
2945
4a0b58c4 2946 pid = PTR_TO_PID(userdata);
023fb90b 2947 if (pid > 0) {
c6c8f6e2 2948 if (kill(pid, arg_kill_signal) >= 0) {
023fb90b
LP
2949 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2950 sd_event_source_set_userdata(s, NULL);
2951 return 0;
2952 }
2953 }
2954
2955 sd_event_exit(sd_event_source_get_event(s), 0);
2956 return 0;
2957}
2958
6916b164 2959static int on_sigchld(sd_event_source *s, const struct signalfd_siginfo *ssi, void *userdata) {
abdb9b08
LP
2960 pid_t pid;
2961
2962 assert(s);
2963 assert(ssi);
2964
2965 pid = PTR_TO_PID(userdata);
2966
6916b164
AU
2967 for (;;) {
2968 siginfo_t si = {};
abdb9b08 2969
6916b164
AU
2970 if (waitid(P_ALL, 0, &si, WNOHANG|WNOWAIT|WEXITED) < 0)
2971 return log_error_errno(errno, "Failed to waitid(): %m");
2972 if (si.si_pid == 0) /* No pending children. */
2973 break;
abdb9b08 2974 if (si.si_pid == pid) {
6916b164
AU
2975 /* The main process we care for has exited. Return from
2976 * signal handler but leave the zombie. */
2977 sd_event_exit(sd_event_source_get_event(s), 0);
2978 break;
2979 }
abdb9b08 2980
6916b164
AU
2981 /* Reap all other children. */
2982 (void) waitid(P_PID, si.si_pid, &si, WNOHANG|WEXITED);
2983 }
2984
2985 return 0;
2986}
2987
abdb9b08
LP
2988static int on_request_stop(sd_bus_message *m, void *userdata, sd_bus_error *error) {
2989 pid_t pid;
2990
2991 assert(m);
2992
2993 pid = PTR_TO_PID(userdata);
2994
2995 if (arg_kill_signal > 0) {
2996 log_info("Container termination requested. Attempting to halt container.");
2997 (void) kill(pid, arg_kill_signal);
2998 } else {
2999 log_info("Container termination requested. Exiting.");
3000 sd_event_exit(sd_bus_get_event(sd_bus_message_get_bus(m)), 0);
3001 }
3002
3003 return 0;
3004}
3005
ec16945e 3006static int determine_names(void) {
1b9cebf6 3007 int r;
ec16945e 3008
c1521918
LP
3009 if (arg_template && !arg_directory && arg_machine) {
3010
3011 /* If --template= was specified then we should not
3012 * search for a machine, but instead create a new one
3013 * in /var/lib/machine. */
3014
657ee2d8 3015 arg_directory = path_join("/var/lib/machines", arg_machine);
c1521918
LP
3016 if (!arg_directory)
3017 return log_oom();
3018 }
3019
ec16945e 3020 if (!arg_image && !arg_directory) {
1b9cebf6
LP
3021 if (arg_machine) {
3022 _cleanup_(image_unrefp) Image *i = NULL;
3023
d577d4a4 3024 r = image_find(IMAGE_MACHINE, arg_machine, NULL, &i);
3a6ce860
LP
3025 if (r == -ENOENT)
3026 return log_error_errno(r, "No image for machine '%s'.", arg_machine);
1b9cebf6
LP
3027 if (r < 0)
3028 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
1b9cebf6 3029
eb38edce 3030 if (IN_SET(i->type, IMAGE_RAW, IMAGE_BLOCK))
0f03c2a4 3031 r = free_and_strdup(&arg_image, i->path);
1b9cebf6 3032 else
0f03c2a4 3033 r = free_and_strdup(&arg_directory, i->path);
1b9cebf6 3034 if (r < 0)
0f3be6ca 3035 return log_oom();
1b9cebf6 3036
aee327b8
LP
3037 if (!arg_ephemeral)
3038 arg_read_only = arg_read_only || i->read_only;
d7249575
LP
3039 } else {
3040 r = safe_getcwd(&arg_directory);
3041 if (r < 0)
3042 return log_error_errno(r, "Failed to determine current directory: %m");
3043 }
ec16945e 3044
c6147113
LP
3045 if (!arg_directory && !arg_image)
3046 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine path, please use -D or -i.");
ec16945e
LP
3047 }
3048
3049 if (!arg_machine) {
b9ba4dab
LP
3050 if (arg_directory && path_equal(arg_directory, "/"))
3051 arg_machine = gethostname_malloc();
e9b88a6d
LP
3052 else if (arg_image) {
3053 char *e;
4827ab48 3054
b36e39d2
LP
3055 r = path_extract_filename(arg_image, &arg_machine);
3056 if (r < 0)
3057 return log_error_errno(r, "Failed to extract file name from '%s': %m", arg_image);
4827ab48 3058
e9b88a6d
LP
3059 /* Truncate suffix if there is one */
3060 e = endswith(arg_machine, ".raw");
3061 if (e)
3062 *e = 0;
b36e39d2
LP
3063 } else {
3064 r = path_extract_filename(arg_directory, &arg_machine);
3065 if (r < 0)
3066 return log_error_errno(r, "Failed to extract file name from '%s': %m", arg_directory);
3067 }
ec16945e 3068
ae691c1d 3069 hostname_cleanup(arg_machine);
52ef5dd7 3070 if (!hostname_is_valid(arg_machine, 0))
c6147113 3071 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine machine name automatically, please use -M.");
b9ba4dab 3072
3603f151
LB
3073 /* Copy the machine name before the random suffix is added below, otherwise we won't be able
3074 * to match fixed config file names. */
3075 arg_settings_filename = strjoin(arg_machine, ".nspawn");
3076 if (!arg_settings_filename)
3077 return log_oom();
3078
e9b88a6d
LP
3079 /* Add a random suffix when this is an ephemeral machine, so that we can run many
3080 * instances at once without manually having to specify -M each time. */
3081 if (arg_ephemeral)
3082 if (strextendf(&arg_machine, "-%016" PRIx64, random_u64()) < 0)
b9ba4dab 3083 return log_oom();
3603f151
LB
3084 } else {
3085 arg_settings_filename = strjoin(arg_machine, ".nspawn");
3086 if (!arg_settings_filename)
3087 return log_oom();
ec16945e
LP
3088 }
3089
3090 return 0;
3091}
3092
/* Resolves the path stored in '*p' via chase() with the given 'flags' and replaces '*p' by the resolved
 * path. A NULL '*p' is left alone. Returns a negative errno-style error if resolution fails, otherwise the
 * result of free_and_replace(). */
static int chase_and_update(char **p, unsigned flags) {
        char *resolved;
        int r;

        assert(p);

        if (*p) {
                r = chase(*p, NULL, flags, &resolved, NULL);
                if (r < 0)
                        return log_error_errno(r, "Failed to resolve path %s: %m", *p);

                return free_and_replace(*p, resolved);
        }

        return 0;
}
3108
03cfe0d5 3109static int determine_uid_shift(const char *directory) {
6dac160c 3110
0de7acce 3111 if (arg_userns_mode == USER_NAMESPACE_NO) {
03cfe0d5 3112 arg_uid_shift = 0;
6dac160c 3113 return 0;
03cfe0d5 3114 }
6dac160c
LP
3115
3116 if (arg_uid_shift == UID_INVALID) {
3117 struct stat st;
3118
993da6d4
LP
3119 /* Read the UID shift off the image. Maybe we can reuse this to avoid chowning. */
3120
3121 if (stat(directory, &st) < 0)
03cfe0d5 3122 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
6dac160c
LP
3123
3124 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
3125
baaa35ad
ZJS
3126 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000)))
3127 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
3128 "UID and GID base of %s don't match.", directory);
6dac160c
LP
3129
3130 arg_uid_range = UINT32_C(0x10000);
f61c7f88
LP
3131
3132 if (arg_uid_shift != 0) {
3133 /* If the image is shifted already, then we'll fall back to classic chowning, for
3134 * compatibility (and simplicity), or refuse if mapping is explicitly requested. */
3135
3136 if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_AUTO) {
3137 log_debug("UID base of %s is non-zero, not using UID mapping.", directory);
3138 arg_userns_ownership = USER_NAMESPACE_OWNERSHIP_CHOWN;
3139 } else if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_MAP)
3140 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
3141 "UID base of %s is not zero, UID mapping not supported.", directory);
3142 }
6dac160c
LP
3143 }
3144
58e13de5
LP
3145 if (!userns_shift_range_valid(arg_uid_shift, arg_uid_range))
3146 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "UID base too high for UID range.");
6dac160c 3147
6dac160c
LP
3148 return 0;
3149}
3150
de40a303
LP
3151static unsigned long effective_clone_ns_flags(void) {
3152 unsigned long flags = arg_clone_ns_flags;
3153
3154 if (arg_private_network)
3155 flags |= CLONE_NEWNET;
3156 if (arg_use_cgns)
3157 flags |= CLONE_NEWCGROUP;
3158 if (arg_userns_mode != USER_NAMESPACE_NO)
3159 flags |= CLONE_NEWUSER;
3160
3161 return flags;
3162}
3163
3164static int patch_sysctl(void) {
3165
3166 /* This table is inspired by runc's sysctl() function */
3167 static const struct {
3168 const char *key;
3169 bool prefix;
3170 unsigned long clone_flags;
3171 } safe_sysctl[] = {
3172 { "kernel.hostname", false, CLONE_NEWUTS },
3173 { "kernel.domainname", false, CLONE_NEWUTS },
3174 { "kernel.msgmax", false, CLONE_NEWIPC },
3175 { "kernel.msgmnb", false, CLONE_NEWIPC },
3176 { "kernel.msgmni", false, CLONE_NEWIPC },
3177 { "kernel.sem", false, CLONE_NEWIPC },
3178 { "kernel.shmall", false, CLONE_NEWIPC },
3179 { "kernel.shmmax", false, CLONE_NEWIPC },
3180 { "kernel.shmmni", false, CLONE_NEWIPC },
3181 { "fs.mqueue.", true, CLONE_NEWIPC },
3182 { "net.", true, CLONE_NEWNET },
3183 };
3184
3185 unsigned long flags;
de40a303
LP
3186 int r;
3187
3188 flags = effective_clone_ns_flags();
3189
3190 STRV_FOREACH_PAIR(k, v, arg_sysctl) {
3191 bool good = false;
3192 size_t i;
3193
3194 for (i = 0; i < ELEMENTSOF(safe_sysctl); i++) {
3195
3196 if (!FLAGS_SET(flags, safe_sysctl[i].clone_flags))
3197 continue;
3198
3199 if (safe_sysctl[i].prefix)
3200 good = startswith(*k, safe_sysctl[i].key);
3201 else
3202 good = streq(*k, safe_sysctl[i].key);
3203
3204 if (good)
3205 break;
3206 }
3207
c6147113
LP
3208 if (!good)
3209 return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Refusing to write to sysctl '%s', as it is not safe in the selected namespaces.", *k);
de40a303
LP
3210
3211 r = sysctl_write(*k, *v);
3212 if (r < 0)
3213 return log_error_errno(r, "Failed to write sysctl '%s': %m", *k);
3214 }
3215
3216 return 0;
3217}
3218
03cfe0d5
LP
3219static int inner_child(
3220 Barrier *barrier,
5d9d3fcb 3221 int fd_inner_socket,
e1bb4b0d
LB
3222 FDSet *fds,
3223 char **os_release_pairs) {
69c79d3c 3224
03cfe0d5 3225 _cleanup_free_ char *home = NULL;
88614c8a 3226 size_t n_env = 1;
4ab3d29f
ZJS
3227 char *envp[] = {
3228 (char*) "PATH=" DEFAULT_PATH_COMPAT,
6aadfa4c 3229 NULL, /* container */
03cfe0d5
LP
3230 NULL, /* TERM */
3231 NULL, /* HOME */
3232 NULL, /* USER */
3233 NULL, /* LOGNAME */
3234 NULL, /* container_uuid */
3235 NULL, /* LISTEN_FDS */
3236 NULL, /* LISTEN_PID */
9c1e04d0 3237 NULL, /* NOTIFY_SOCKET */
3652872a 3238 NULL, /* CREDENTIALS_DIRECTORY */
b626f695 3239 NULL, /* LANG */
03cfe0d5
LP
3240 NULL
3241 };
1a68e1e5 3242 const char *exec_target;
2371271c 3243 _cleanup_strv_free_ char **env_use = NULL;
de40a303 3244 int r, which_failed;
88213476 3245
b37469d7
LP
3246 /* This is the "inner" child process, i.e. the one forked off by the "outer" child process, which is the one
3247 * the container manager itself forked off. At the time of clone() it gained its own CLONE_NEWNS, CLONE_NEWPID,
3248 * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER namespaces. Note that it has its own CLONE_NEWNS namespace,
3249 * separate from the CLONE_NEWNS created for the "outer" child, and also separate from the host's CLONE_NEWNS
3250 * namespace. The reason for having two levels of CLONE_NEWNS namespaces is that the "inner" one is owned by
3251 * the CLONE_NEWUSER namespace of the container, while the "outer" one is owned by the host's CLONE_NEWUSER
3252 * namespace.
3253 *
3254 * Note at this point we have no CLONE_NEWNET namespace yet. We'll acquire that one later through
3255 * unshare(). See below. */
3256
03cfe0d5 3257 assert(barrier);
5d9d3fcb 3258 assert(fd_inner_socket >= 0);
88213476 3259
de40a303
LP
3260 log_debug("Inner child is initializing.");
3261
0de7acce 3262 if (arg_userns_mode != USER_NAMESPACE_NO) {
03cfe0d5
LP
3263 /* Tell the parent, that it now can write the UID map. */
3264 (void) barrier_place(barrier); /* #1 */
7027ff61 3265
03cfe0d5 3266 /* Wait until the parent wrote the UID map */
baaa35ad 3267 if (!barrier_place_and_sync(barrier)) /* #2 */
2a2e78e9 3268 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Parent died too early");
88213476 3269
2a2e78e9
LP
3270 /* Become the new root user inside our namespace */
3271 r = reset_uid_gid();
3272 if (r < 0)
3273 return log_error_errno(r, "Couldn't become new root: %m");
3274
3275 /* Creating a new user namespace means all MS_SHARED mounts become MS_SLAVE. Let's put them
3276 * back to MS_SHARED here, since that's what we want as defaults. (This will not reconnect
3277 * propagation, but simply create new peer groups for all our mounts). */
511a8cfe 3278 r = mount_follow_verbose(LOG_ERR, NULL, "/", NULL, MS_SHARED|MS_REC, NULL);
2a2e78e9
LP
3279 if (r < 0)
3280 return r;
3281 }
6d66bd3b 3282
0de7acce 3283 r = mount_all(NULL,
4f086aab 3284 arg_mount_settings | MOUNT_IN_USERNS,
0de7acce 3285 arg_uid_shift,
0de7acce 3286 arg_selinux_apifs_context);
03cfe0d5
LP
3287 if (r < 0)
3288 return r;
3289
04413780
ZJS
3290 if (!arg_network_namespace_path && arg_private_network) {
3291 r = unshare(CLONE_NEWNET);
3292 if (r < 0)
3293 return log_error_errno(errno, "Failed to unshare network namespace: %m");
75116558
PS
3294
3295 /* Tell the parent that it can setup network interfaces. */
3296 (void) barrier_place(barrier); /* #3 */
04413780
ZJS
3297 }
3298
4f086aab 3299 r = mount_sysfs(NULL, arg_mount_settings);
d8fc6a00
LP
3300 if (r < 0)
3301 return r;
3302
03cfe0d5
LP
3303 /* Wait until we are cgroup-ified, so that we
3304 * can mount the right cgroup path writable */
baaa35ad
ZJS
3305 if (!barrier_place_and_sync(barrier)) /* #4 */
3306 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
3307 "Parent died too early");
88213476 3308
489fae52 3309 if (arg_use_cgns) {
0996ef00
CB
3310 r = unshare(CLONE_NEWCGROUP);
3311 if (r < 0)
04413780 3312 return log_error_errno(errno, "Failed to unshare cgroup namespace: %m");
0996ef00
CB
3313 r = mount_cgroups(
3314 "",
3315 arg_unified_cgroup_hierarchy,
3316 arg_userns_mode != USER_NAMESPACE_NO,
3317 arg_uid_shift,
3318 arg_uid_range,
5a8ff0e6 3319 arg_selinux_apifs_context,
ada54120 3320 true);
1433e0f2 3321 } else
0996ef00 3322 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
1433e0f2
LP
3323 if (r < 0)
3324 return r;
ec16945e 3325
1e4f1671 3326 r = setup_boot_id();
03cfe0d5
LP
3327 if (r < 0)
3328 return r;
ec16945e 3329
5d9d3fcb 3330 r = setup_kmsg(fd_inner_socket);
03cfe0d5
LP
3331 if (r < 0)
3332 return r;
ec16945e 3333
de40a303
LP
3334 r = mount_custom(
3335 "/",
3336 arg_custom_mounts,
3337 arg_n_custom_mounts,
de40a303 3338 0,
c0c8f718 3339 0,
de40a303 3340 arg_selinux_apifs_context,
5f0a6347 3341 MOUNT_NON_ROOT_ONLY | MOUNT_IN_USERNS);
de40a303
LP
3342 if (r < 0)
3343 return r;
3344
03cfe0d5
LP
3345 if (setsid() < 0)
3346 return log_error_errno(errno, "setsid() failed: %m");
3347
3348 if (arg_private_network)
df883de9 3349 (void) loopback_setup();
03cfe0d5 3350
7a8f6325 3351 if (arg_expose_ports) {
b07ee903 3352 r = expose_port_send_rtnl(fd_inner_socket);
7a8f6325
LP
3353 if (r < 0)
3354 return r;
7a8f6325 3355 }
03cfe0d5 3356
3acc84eb 3357 if (arg_console_mode != CONSOLE_PIPE) {
5bb1d7fb 3358 _cleanup_close_ int master = -EBADF;
3acc84eb
FB
3359 _cleanup_free_ char *console = NULL;
3360
3361 /* Allocate a pty and make it available as /dev/console. */
dc98caea 3362 master = openpt_allocate(O_RDWR|O_NONBLOCK, &console);
3acc84eb 3363 if (master < 0)
dc98caea 3364 return log_error_errno(master, "Failed to allocate a pty: %m");
3acc84eb
FB
3365
3366 r = setup_dev_console(console);
3367 if (r < 0)
105a1a36 3368 return log_error_errno(r, "Failed to set up /dev/console: %m");
3acc84eb 3369
bb1aa185 3370 r = send_one_fd(fd_inner_socket, master, 0);
3acc84eb
FB
3371 if (r < 0)
3372 return log_error_errno(r, "Failed to send master fd: %m");
3acc84eb
FB
3373
3374 r = setup_stdio_as_dev_console();
3375 if (r < 0)
3376 return r;
3377 }
3378
de40a303
LP
3379 r = patch_sysctl();
3380 if (r < 0)
3381 return r;
3382
81f345df
LP
3383 if (arg_oom_score_adjust_set) {
3384 r = set_oom_score_adjust(arg_oom_score_adjust);
3385 if (r < 0)
3386 return log_error_errno(r, "Failed to adjust OOM score: %m");
3387 }
3388
0985c7c4
ZJS
3389 if (arg_cpu_set.set)
3390 if (sched_setaffinity(0, arg_cpu_set.allocated, arg_cpu_set.set) < 0)
d107bb7d
LP
3391 return log_error_errno(errno, "Failed to set CPU affinity: %m");
3392
c818eef1 3393 (void) setup_hostname();
03cfe0d5 3394
050f7277 3395 if (arg_personality != PERSONALITY_INVALID) {
21022b9d
LP
3396 r = safe_personality(arg_personality);
3397 if (r < 0)
3398 return log_error_errno(r, "personality() failed: %m");
4c27749b
LP
3399#ifdef ARCHITECTURE_SECONDARY
3400 } else if (arg_architecture == ARCHITECTURE_SECONDARY) {
21022b9d
LP
3401 r = safe_personality(PER_LINUX32);
3402 if (r < 0)
3403 return log_error_errno(r, "personality() failed: %m");
4c27749b
LP
3404#endif
3405 } else if (arg_architecture >= 0 && arg_architecture != native_architecture())
3406 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
3407 "Selected architecture '%s' not supported locally, refusing.",
3408 architecture_to_string(arg_architecture));
03cfe0d5 3409
de40a303
LP
3410 r = setrlimit_closest_all((const struct rlimit *const*) arg_rlimit, &which_failed);
3411 if (r < 0)
3412 return log_error_errno(r, "Failed to apply resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
3413
3414#if HAVE_SECCOMP
3415 if (arg_seccomp) {
3416
3417 if (is_seccomp_available()) {
3418
3419 r = seccomp_load(arg_seccomp);
7bc5e0b1 3420 if (ERRNO_IS_SECCOMP_FATAL(r))
de40a303
LP
3421 return log_error_errno(r, "Failed to install seccomp filter: %m");
3422 if (r < 0)
3423 log_debug_errno(r, "Failed to install seccomp filter: %m");
3424 }
3425 } else
3426#endif
3427 {
6b000af4 3428 r = setup_seccomp(arg_caps_retain, arg_syscall_allow_list, arg_syscall_deny_list);
de40a303
LP
3429 if (r < 0)
3430 return r;
3431 }
3432
4a4654e0 3433 if (arg_suppress_sync) {
20e458ae 3434#if HAVE_SECCOMP
4a4654e0
LP
3435 r = seccomp_suppress_sync();
3436 if (r < 0)
3437 log_debug_errno(r, "Failed to install sync() suppression seccomp filter, ignoring: %m");
20e458ae 3438#else
2db32618 3439 log_debug("systemd is built without SECCOMP support. Ignoring --suppress-sync= command line option and SuppressSync= setting.");
20e458ae 3440#endif
4a4654e0
LP
3441 }
3442
349cc4a5 3443#if HAVE_SELINUX
03cfe0d5 3444 if (arg_selinux_context)
2ed96880 3445 if (setexeccon(arg_selinux_context) < 0)
03cfe0d5
LP
3446 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
3447#endif
3448
de40a303
LP
3449 /* Make sure we keep the caps across the uid/gid dropping, so that we can retain some selected caps
3450 * if we need to later on. */
3451 if (prctl(PR_SET_KEEPCAPS, 1) < 0)
3452 return log_error_errno(errno, "Failed to set PR_SET_KEEPCAPS: %m");
3453
3454 if (uid_is_valid(arg_uid) || gid_is_valid(arg_gid))
3462d773 3455 r = change_uid_gid_raw(arg_uid, arg_gid, arg_supplementary_gids, arg_n_supplementary_gids, arg_console_mode != CONSOLE_PIPE);
de40a303 3456 else
3462d773 3457 r = change_uid_gid(arg_user, arg_console_mode != CONSOLE_PIPE, &home);
03cfe0d5
LP
3458 if (r < 0)
3459 return r;
3460
de40a303
LP
3461 r = drop_capabilities(getuid());
3462 if (r < 0)
3463 return log_error_errno(r, "Dropping capabilities failed: %m");
3464
66edd963
LP
3465 if (arg_no_new_privileges)
3466 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
3467 return log_error_errno(errno, "Failed to disable new privileges: %m");
3468
6aadfa4c
ILG
3469 /* LXC sets container=lxc, so follow the scheme here */
3470 envp[n_env++] = strjoina("container=", arg_container_service_name);
3471
03cfe0d5
LP
3472 envp[n_env] = strv_find_prefix(environ, "TERM=");
3473 if (envp[n_env])
313cefa1 3474 n_env++;
03cfe0d5 3475
de40a303 3476 if (home || !uid_is_valid(arg_uid) || arg_uid == 0)
4ab3d29f 3477 if (asprintf(envp + n_env++, "HOME=%s", home ?: "/root") < 0)
de40a303
LP
3478 return log_oom();
3479
3480 if (arg_user || !uid_is_valid(arg_uid) || arg_uid == 0)
4ab3d29f 3481 if (asprintf(envp + n_env++, "USER=%s", arg_user ?: "root") < 0 ||
1da3cb81 3482 asprintf(envp + n_env++, "LOGNAME=%s", arg_user ?: "root") < 0)
de40a303 3483 return log_oom();
03cfe0d5 3484
3bbaff3e 3485 assert(!sd_id128_is_null(arg_uuid));
03cfe0d5 3486
b7416360 3487 if (asprintf(envp + n_env++, "container_uuid=%s", SD_ID128_TO_UUID_STRING(arg_uuid)) < 0)
e01ff70a 3488 return log_oom();
03cfe0d5
LP
3489
3490 if (fdset_size(fds) > 0) {
3491 r = fdset_cloexec(fds, false);
3492 if (r < 0)
3493 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
3494
4ab3d29f
ZJS
3495 if ((asprintf(envp + n_env++, "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
3496 (asprintf(envp + n_env++, "LISTEN_PID=1") < 0))
03cfe0d5
LP
3497 return log_oom();
3498 }
4ab3d29f 3499 if (asprintf(envp + n_env++, "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
9c1e04d0 3500 return log_oom();
03cfe0d5 3501
3652872a
LP
3502 if (arg_n_credentials > 0) {
3503 envp[n_env] = strdup("CREDENTIALS_DIRECTORY=/run/host/credentials");
3504 if (!envp[n_env])
3505 return log_oom();
3506 n_env++;
3507 }
3508
b626f695 3509 if (arg_start_mode != START_BOOT) {
a22f5186 3510 envp[n_env] = strdup("LANG=" SYSTEMD_NSPAWN_LOCALE);
b626f695
DDM
3511 if (!envp[n_env])
3512 return log_oom();
3513 n_env++;
3514 }
3515
4ab3d29f 3516 env_use = strv_env_merge(envp, os_release_pairs, arg_setenv);
2371271c
TG
3517 if (!env_use)
3518 return log_oom();
03cfe0d5
LP
3519
3520 /* Let the parent know that we are ready and
3521 * wait until the parent is ready with the
3522 * setup, too... */
baaa35ad 3523 if (!barrier_place_and_sync(barrier)) /* #5 */
335d2ead 3524 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Parent died too early");
03cfe0d5 3525
5f932eb9
LP
3526 if (arg_chdir)
3527 if (chdir(arg_chdir) < 0)
3528 return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
3529
7732f92b 3530 if (arg_start_mode == START_PID2) {
75bf701f 3531 r = stub_pid1(arg_uuid);
7732f92b
LP
3532 if (r < 0)
3533 return r;
3534 }
3535
335d2ead
LP
3536 if (arg_console_mode != CONSOLE_PIPE) {
3537 /* So far our pty wasn't controlled by any process. Finally, it's time to change that, if we
3538 * are configured for that. Acquire it as controlling tty. */
3539 if (ioctl(STDIN_FILENO, TIOCSCTTY) < 0)
3540 return log_error_errno(errno, "Failed to acquire controlling TTY: %m");
3541 }
3542
de40a303
LP
3543 log_debug("Inner child completed, invoking payload.");
3544
8ca082b4
LP
3545 /* Now, explicitly close the log, so that we then can close all remaining fds. Closing the log explicitly first
3546 * has the benefit that the logging subsystem knows about it, and is thus ready to be reopened should we need
3547 * it again. Note that the other fds closed here are at least the locking and barrier fds. */
03cfe0d5 3548 log_close();
8ca082b4 3549 log_set_open_when_needed(true);
a3b00f91 3550 log_settle_target();
8ca082b4 3551
03cfe0d5
LP
3552 (void) fdset_close_others(fds);
3553
7732f92b 3554 if (arg_start_mode == START_BOOT) {
03cfe0d5
LP
3555 char **a;
3556 size_t m;
3557
3558 /* Automatically search for the init system */
3559
75f32f04
ZJS
3560 m = strv_length(arg_parameters);
3561 a = newa(char*, m + 2);
3562 memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
3563 a[1 + m] = NULL;
03cfe0d5 3564
a5096641
LP
3565 FOREACH_STRING(init,
3566 "/usr/lib/systemd/systemd",
3567 "/lib/systemd/systemd",
3568 "/sbin/init") {
3569 a[0] = (char*) init;
3570 execve(a[0], a, env_use);
3571 }
ced58da7
LP
3572
3573 exec_target = "/usr/lib/systemd/systemd, /lib/systemd/systemd, /sbin/init";
1a68e1e5 3574 } else if (!strv_isempty(arg_parameters)) {
b6b180b7
LP
3575 const char *dollar_path;
3576
1a68e1e5 3577 exec_target = arg_parameters[0];
b6b180b7
LP
3578
3579 /* Use the user supplied search $PATH if there is one, or DEFAULT_PATH_COMPAT if not to search the
3580 * binary. */
3581 dollar_path = strv_env_get(env_use, "PATH");
3582 if (dollar_path) {
6f646e01 3583 if (setenv("PATH", dollar_path, 1) < 0)
b6b180b7
LP
3584 return log_error_errno(errno, "Failed to update $PATH: %m");
3585 }
3586
f757855e 3587 execvpe(arg_parameters[0], arg_parameters, env_use);
1a68e1e5 3588 } else {
5f932eb9 3589 if (!arg_chdir)
d929b0f9
ZJS
3590 /* If we cannot change the directory, we'll end up in /, that is expected. */
3591 (void) chdir(home ?: "/root");
5f932eb9 3592
53350c7b 3593 execle(DEFAULT_USER_SHELL, "-" DEFAULT_USER_SHELL_NAME, NULL, env_use);
3594 if (!streq(DEFAULT_USER_SHELL, "/bin/bash"))
3595 execle("/bin/bash", "-bash", NULL, env_use);
3596 if (!streq(DEFAULT_USER_SHELL, "/bin/sh"))
3597 execle("/bin/sh", "-sh", NULL, env_use);
ced58da7 3598
53350c7b 3599 exec_target = DEFAULT_USER_SHELL ", /bin/bash, /bin/sh";
03cfe0d5
LP
3600 }
3601
8ca082b4 3602 return log_error_errno(errno, "execv(%s) failed: %m", exec_target);
03cfe0d5
LP
3603}
3604
e96ceaba 3605static int setup_notify_child(void) {
254d1313 3606 _cleanup_close_ int fd = -EBADF;
1eb874b9 3607 static const union sockaddr_union sa = {
44ed5214
LP
3608 .un.sun_family = AF_UNIX,
3609 .un.sun_path = NSPAWN_NOTIFY_SOCKET_PATH,
9c1e04d0
AP
3610 };
3611 int r;
3612
3613 fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
3614 if (fd < 0)
3615 return log_error_errno(errno, "Failed to allocate notification socket: %m");
3616
3617 (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
fbda85b0 3618 (void) sockaddr_un_unlink(&sa.un);
9c1e04d0 3619
9c1e04d0 3620 r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
271f518f 3621 if (r < 0)
44ed5214 3622 return log_error_errno(errno, "bind(" NSPAWN_NOTIFY_SOCKET_PATH ") failed: %m");
9c1e04d0 3623
adc7d9f0 3624 r = userns_lchown(NSPAWN_NOTIFY_SOCKET_PATH, 0, 0);
271f518f 3625 if (r < 0)
adc7d9f0 3626 return log_error_errno(r, "Failed to chown " NSPAWN_NOTIFY_SOCKET_PATH ": %m");
adc7d9f0 3627
2ff48e98 3628 r = setsockopt_int(fd, SOL_SOCKET, SO_PASSCRED, true);
271f518f 3629 if (r < 0)
2ff48e98 3630 return log_error_errno(r, "SO_PASSCRED failed: %m");
9c1e04d0 3631
271f518f 3632 return TAKE_FD(fd);
9c1e04d0
AP
3633}
3634
03cfe0d5
LP
/* Body of the "outer" child process: prepares the container's root directory tree and then forks off
 * the "inner" child.
 *
 * In order, this mounts the image root (if a dissected image is given), determines the UID shift and
 * exchanges it with the parent over fd_outer_socket, sets up pivot root, volatile mode, bind users,
 * custom mounts, ID mapped mounts, API file systems, device nodes and the various /run/host/ entries,
 * switches the root to 'directory', and finally raw_clone()s the inner child, which runs
 * inner_child(). The inner child's PID, the machine ID and the notify socket fd are then sent to the
 * parent.
 *
 * barrier:         passed through to inner_child(); must not be NULL.
 * directory:       root directory of the container tree; must not be NULL. If it is "/", a bind mount
 *                  at /run/systemd/nspawn-root is operated on instead.
 * dissected_image: optional; if non-NULL its file systems are mounted onto 'directory'.
 * fd_outer_socket: socket to the parent, used for all data exchanged by this function; must be >= 0.
 * fd_inner_socket: passed through to inner_child(); must be >= 0.
 * fds:             passed through to inner_child().
 * netns_fd:        network namespace fd the inner child enters if arg_network_namespace_path is set.
 *
 * Returns 0 on success and a negative value on failure. Note that the error path for joining the
 * network namespace returns from within the forked inner child, too. */
3635static int outer_child(
3636 Barrier *barrier,
3637 const char *directory,
2d845785 3638 DissectedImage *dissected_image,
af06cd30 3639 int fd_outer_socket,
5d9d3fcb 3640 int fd_inner_socket,
d7bea6b6
DP
3641 FDSet *fds,
3642 int netns_fd) {
03cfe0d5 3643
2f893044 3644 _cleanup_(bind_user_context_freep) BindUserContext *bind_user_context = NULL;
e1bb4b0d 3645 _cleanup_strv_free_ char **os_release_pairs = NULL;
254d1313 3646 _cleanup_close_ int fd = -EBADF, mntns_fd = -EBADF;
f61c7f88 3647 bool idmap = false;
e5f10caf 3648 const char *p;
03cfe0d5
LP
3649 pid_t pid;
3650 ssize_t l;
de40a303 3651 int r;
03cfe0d5 3652
d1d0b895
LP
3653 /* This is the "outer" child process, i.e the one forked off by the container manager itself. It
3654 * already has its own CLONE_NEWNS namespace (which was created by the clone()). It still lives in
3655 * the host's CLONE_NEWPID, CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER and CLONE_NEWNET
3656 * namespaces. After it completed a number of initializations a second child (the "inner" one) is
3657 * forked off it, and it exits. */
b37469d7 3658
03cfe0d5
LP
3659 assert(barrier);
3660 assert(directory);
af06cd30 3661 assert(fd_outer_socket >= 0);
5d9d3fcb 3662 assert(fd_inner_socket >= 0);
03cfe0d5 3663
de40a303
LP
3664 log_debug("Outer child is initializing.");
3665
e1bb4b0d
LB
3666 r = load_os_release_pairs_with_prefix("/", "container_host_", &os_release_pairs);
3667 if (r < 0)
3668 log_debug_errno(r, "Failed to read os-release from host for container, ignoring: %m");
3669
03cfe0d5
LP
3670 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
3671 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3672
03cfe0d5
LP
3673 r = reset_audit_loginuid();
3674 if (r < 0)
3675 return r;
3676
2a2e78e9
LP
3677 /* Mark everything as slave, so that we still receive mounts from the real root, but don't propagate
3678 * mounts to the real root. */
511a8cfe 3679 r = mount_follow_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
60e76d48
ZJS
3680 if (r < 0)
3681 return r;
03cfe0d5 3682
2d845785 3683 if (dissected_image) {
d1d0b895
LP
3684 /* If we are operating on a disk image, then mount its root directory now, but leave out the
3685 * rest. We can read the UID shift from it if we need to. Further down we'll mount the rest,
3686 * but then with the uid shift known. That way we can mount VFAT file systems shifted to the
3687 * right place right away. This makes sure ESP partitions and userns are compatible. */
2d3a5a73 3688
af187ab2 3689 r = dissected_image_mount_and_warn(
d04faa4e
LP
3690 dissected_image,
3691 directory,
3692 arg_uid_shift,
21b61b1d 3693 arg_uid_range,
d04faa4e
LP
3694 DISSECT_IMAGE_MOUNT_ROOT_ONLY|
3695 DISSECT_IMAGE_DISCARD_ON_LOOP|
3696 DISSECT_IMAGE_USR_NO_ROOT|
c65f854a 3697 (arg_read_only ? DISSECT_IMAGE_READ_ONLY : DISSECT_IMAGE_FSCK|DISSECT_IMAGE_GROWFS)|
af187ab2 3698 (arg_start_mode == START_BOOT ? DISSECT_IMAGE_VALIDATE_OS : 0));
2d845785 3699 if (r < 0)
af187ab2 3700 return r;
2d845785 3701 }
03cfe0d5 3702
391567f4
LP
3703 r = determine_uid_shift(directory);
3704 if (r < 0)
3705 return r;
3706
0de7acce 3707 if (arg_userns_mode != USER_NAMESPACE_NO) {
b71a0192
CB
3708 r = namespace_open(0, NULL, &mntns_fd, NULL, NULL, NULL);
3709 if (r < 0)
3710 return log_error_errno(r, "Failed to pin outer mount namespace: %m");
3711
/* Hand the pinned mount namespace fd to the parent; our own copy is closed right afterwards. */
af06cd30 3712 l = send_one_fd(fd_outer_socket, mntns_fd, 0);
b71a0192
CB
3713 if (l < 0)
3714 return log_error_errno(l, "Failed to send outer mount namespace fd: %m");
3715 mntns_fd = safe_close(mntns_fd);
3716
0e7ac751 3717 /* Let the parent know which UID shift we read from the image */
af06cd30 3718 l = send(fd_outer_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
825d5287
RM
3719 if (l < 0)
3720 return log_error_errno(errno, "Failed to send UID shift: %m");
baaa35ad
ZJS
3721 if (l != sizeof(arg_uid_shift))
3722 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3723 "Short write while sending UID shift.");
0e7ac751 3724
0de7acce 3725 if (arg_userns_mode == USER_NAMESPACE_PICK) {
d1d0b895
LP
3726 /* When we are supposed to pick the UID shift, the parent will check now whether the
3727 * UID shift we just read from the image is available. If yes, it will send the UID
3728 * shift back to us, if not it will pick a different one, and send it back to us. */
0e7ac751 3729
af06cd30 3730 l = recv(fd_outer_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
0e7ac751
LP
3731 if (l < 0)
3732 return log_error_errno(errno, "Failed to recv UID shift: %m");
baaa35ad
ZJS
3733 if (l != sizeof(arg_uid_shift))
3734 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3735 "Short read while receiving UID shift.");
0e7ac751
LP
3736 }
3737
ff6c6cc1
LP
3738 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
3739 "Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
825d5287
RM
3740 }
3741
6f83d3d1
LP
3742 if (path_equal(directory, "/")) {
3743 /* If the directory we shall boot is the host, let's operate on a bind mount at a different
3744 * place, so that we can make changes to its mount structure (for example, to implement
3745 * --volatile=) without this interfering with our ability to access files such as
3746 * /etc/localtime to copy into the container. Note that we use a fixed place for this
6c2d70ce 3747 * (instead of a temporary directory, since we are living in our own mount namespace here
7802194a 3748 * already, and thus don't need to be afraid of colliding with anyone else's mounts). */
6f83d3d1
LP
3749 (void) mkdir_p("/run/systemd/nspawn-root", 0755);
3750
511a8cfe 3751 r = mount_nofollow_verbose(LOG_ERR, "/", "/run/systemd/nspawn-root", NULL, MS_BIND|MS_REC, NULL);
6f83d3d1
LP
3752 if (r < 0)
3753 return r;
3754
3755 directory = "/run/systemd/nspawn-root";
e50cd82f 3756 }
7d0ecdd6 3757
75f81732
LP
3758 /* Make sure we always have a mount that we can move to root later on. */
3759 r = make_mount_point(directory);
3760 if (r < 0)
3761 return r;
3762
3763 /* So the whole tree is now MS_SLAVE, i.e. we'll still receive mount/umount events from the host
3764 * mount namespace. For the directory we are going to run our container let's turn this off, so that
3765 * we'll live in our own little world from now on, and propagation from the host may only happen via
3766 * the mount tunnel dir, or not at all. */
3767 r = mount_follow_verbose(LOG_ERR, NULL, directory, NULL, MS_PRIVATE|MS_REC, NULL);
3768 if (r < 0)
3769 return r;
3770
7d0ecdd6
LP
3771 r = setup_pivot_root(
3772 directory,
3773 arg_pivot_root_new,
3774 arg_pivot_root_old);
3775 if (r < 0)
3776 return r;
3777
3778 r = setup_volatile_mode(
3779 directory,
3780 arg_volatile_mode,
7d0ecdd6 3781 arg_uid_shift,
8f1ed04a 3782 arg_selinux_apifs_context);
7d0ecdd6
LP
3783 if (r < 0)
3784 return r;
3785
2f893044
LP
3786 r = bind_user_prepare(
3787 directory,
3788 arg_bind_user,
3789 arg_uid_shift,
3790 arg_uid_range,
3791 &arg_custom_mounts, &arg_n_custom_mounts,
3792 &bind_user_context);
3793 if (r < 0)
3794 return r;
3795
3796 if (arg_userns_mode != USER_NAMESPACE_NO && bind_user_context) {
d1d0b895
LP
3797 /* Send the user maps we determined to the parent, so that it installs it in our user
3798 * namespace UID map table */
2f893044
LP
3799
3800 for (size_t i = 0; i < bind_user_context->n_data; i++) {
3801 uid_t map[] = {
3802 bind_user_context->data[i].payload_user->uid,
3803 bind_user_context->data[i].host_user->uid,
3804 (uid_t) bind_user_context->data[i].payload_group->gid,
3805 (uid_t) bind_user_context->data[i].host_group->gid,
3806 };
3807
af06cd30 3808 l = send(fd_outer_socket, map, sizeof(map), MSG_NOSIGNAL);
2f893044
LP
3809 if (l < 0)
3810 return log_error_errno(errno, "Failed to send user UID map: %m");
3811 if (l != sizeof(map))
3812 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3813 "Short write while sending user UID map.");
3814 }
3815 }
3816
5f0a6347
DDM
3817 r = mount_custom(
3818 directory,
3819 arg_custom_mounts,
3820 arg_n_custom_mounts,
5f0a6347 3821 arg_uid_shift,
c0c8f718 3822 arg_uid_range,
5f0a6347
DDM
3823 arg_selinux_apifs_context,
3824 MOUNT_ROOT_ONLY);
3825 if (r < 0)
3826 return r;
3827
c0c8f718
AV
3828 if (arg_userns_mode != USER_NAMESPACE_NO &&
3829 IN_SET(arg_userns_ownership, USER_NAMESPACE_OWNERSHIP_MAP, USER_NAMESPACE_OWNERSHIP_AUTO) &&
3830 arg_uid_shift != 0) {
3831
2b2777ed 3832 r = remount_idmap(directory, arg_uid_shift, arg_uid_range, UID_INVALID, REMOUNT_IDMAPPING_HOST_ROOT);
c0c8f718
AV
3833 if (r == -EINVAL || ERRNO_IS_NOT_SUPPORTED(r)) {
3834 /* This might fail because the kernel or file system doesn't support idmapping. We
3835 * can't really distinguish this nicely, nor do we have any guarantees about the
3836 * error codes we see, could be EOPNOTSUPP or EINVAL. */
3837 if (arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_AUTO)
3838 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
3839 "ID mapped mounts are apparently not available, sorry.");
3840
3841 log_debug("ID mapped mounts are apparently not available on this kernel or for the selected file system, reverting to recursive chown()ing.");
3842 arg_userns_ownership = USER_NAMESPACE_OWNERSHIP_CHOWN;
3843 } else if (r < 0)
3844 return log_error_errno(r, "Failed to set up ID mapped mounts: %m");
3845 else {
3846 log_debug("ID mapped mounts available, making use of them.");
3847 idmap = true;
3848 }
3849 }
3850
2d3a5a73
LP
3851 if (dissected_image) {
3852 /* Now we know the uid shift, let's now mount everything else that might be in the image. */
d04faa4e
LP
3853 r = dissected_image_mount(
3854 dissected_image,
3855 directory,
3856 arg_uid_shift,
21b61b1d 3857 arg_uid_range,
d04faa4e
LP
3858 DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY|
3859 DISSECT_IMAGE_DISCARD_ON_LOOP|
3860 DISSECT_IMAGE_USR_NO_ROOT|
f61c7f88
LP
3861 (arg_read_only ? DISSECT_IMAGE_READ_ONLY : DISSECT_IMAGE_FSCK|DISSECT_IMAGE_GROWFS)|
3862 (idmap ? DISSECT_IMAGE_MOUNT_IDMAPPED : 0));
4fcb96ce
LP
3863 if (r == -EUCLEAN)
3864 return log_error_errno(r, "File system check for image failed: %m");
2d3a5a73 3865 if (r < 0)
4fcb96ce 3866 return log_error_errno(r, "Failed to mount image file system: %m");
2d3a5a73
LP
3867 }
3868
8199d554
LP
3869 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
3870 /* OK, we don't know yet which cgroup mode to use yet. Let's figure it out, and tell the parent. */
3871
3872 r = detect_unified_cgroup_hierarchy_from_image(directory);
3873 if (r < 0)
3874 return r;
3875
fefb7a6d 3876 l = send(fd_outer_socket, &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), MSG_NOSIGNAL);
8199d554
LP
3877 if (l < 0)
3878 return log_error_errno(errno, "Failed to send cgroup mode: %m");
baaa35ad
ZJS
3879 if (l != sizeof(arg_unified_cgroup_hierarchy))
3880 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3881 "Short write while sending cgroup mode.");
8199d554
LP
3882 }
3883
/* NOTE(review): recursive_chown() is called unconditionally here; presumably it decides internally
 * (e.g. based on arg_userns_ownership, which the ID mapping fallback above may have switched to
 * USER_NAMESPACE_OWNERSHIP_CHOWN) whether there is anything to do -- confirm. */
4ad14eff
LP
3884 r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
3885 if (r < 0)
3886 return r;
3887
03cfe0d5
LP
3888 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
3889 if (r < 0)
3890 return r;
3891
bbd407ea
DDM
3892 if (arg_read_only && arg_volatile_mode == VOLATILE_NO &&
3893 !has_custom_root_mount(arg_custom_mounts, arg_n_custom_mounts)) {
64e82c19 3894 r = bind_remount_recursive(directory, MS_RDONLY, MS_RDONLY, NULL);
03cfe0d5
LP
3895 if (r < 0)
3896 return log_error_errno(r, "Failed to make tree read-only: %m");
3897 }
3898
0de7acce 3899 r = mount_all(directory,
4f086aab 3900 arg_mount_settings,
0de7acce 3901 arg_uid_shift,
0de7acce 3902 arg_selinux_apifs_context);
03cfe0d5
LP
3903 if (r < 0)
3904 return r;
3905
07fa00f9
LP
3906 r = copy_devnodes(directory);
3907 if (r < 0)
03cfe0d5
LP
3908 return r;
3909
de40a303
LP
3910 r = make_extra_nodes(directory);
3911 if (r < 0)
3912 return r;
3913
3914 (void) dev_setup(directory, arg_uid_shift, arg_uid_shift);
e5f10caf 3915
9fac5029 3916 p = prefix_roota(directory, "/run/host");
e5f10caf 3917 (void) make_inaccessible_nodes(p, arg_uid_shift, arg_uid_shift);
03cfe0d5 3918
07fa00f9
LP
3919 r = setup_pts(directory);
3920 if (r < 0)
03cfe0d5
LP
3921 return r;
3922
e79581dd 3923 r = mount_tunnel_dig(directory);
03cfe0d5
LP
3924 if (r < 0)
3925 return r;
3926
8e5430c4
LP
3927 r = setup_keyring();
3928 if (r < 0)
3929 return r;
3930
3652872a
LP
3931 r = setup_credentials(directory);
3932 if (r < 0)
3933 return r;
3934
2f893044
LP
3935 r = bind_user_setup(bind_user_context, directory);
3936 if (r < 0)
3937 return r;
3938
5c4deb9a
MJ
3939 r = mount_custom(
3940 directory,
3941 arg_custom_mounts,
3942 arg_n_custom_mounts,
3943 arg_uid_shift,
c0c8f718 3944 arg_uid_range,
5c4deb9a
MJ
3945 arg_selinux_apifs_context,
3946 MOUNT_NON_ROOT_ONLY);
3947 if (r < 0)
3948 return r;
3949
03cfe0d5
LP
3950 r = setup_timezone(directory);
3951 if (r < 0)
3952 return r;
3953
3954 r = setup_resolv_conf(directory);
3955 if (r < 0)
3956 return r;
3957
e01ff70a
MS
3958 r = setup_machine_id(directory);
3959 if (r < 0)
3960 return r;
3961
03cfe0d5
LP
3962 r = setup_journal(directory);
3963 if (r < 0)
3964 return r;
3965
0f48ba7b
LP
3966 /* The same stuff as the $container env var, but nicely readable for the entire payload */
3967 p = prefix_roota(directory, "/run/host/container-manager");
3968 (void) write_string_file(p, arg_container_service_name, WRITE_STRING_FILE_CREATE);
3969
3970 /* The same stuff as the $container_uuid env var */
3971 p = prefix_roota(directory, "/run/host/container-uuid");
3972 (void) write_string_filef(p, WRITE_STRING_FILE_CREATE, SD_ID128_UUID_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid));
3973
489fae52 3974 if (!arg_use_cgns) {
0996ef00
CB
3975 r = mount_cgroups(
3976 directory,
3977 arg_unified_cgroup_hierarchy,
3978 arg_userns_mode != USER_NAMESPACE_NO,
3979 arg_uid_shift,
3980 arg_uid_range,
5a8ff0e6 3981 arg_selinux_apifs_context,
ada54120 3982 false);
0996ef00
CB
3983 if (r < 0)
3984 return r;
3985 }
03cfe0d5 3986
57c10a56
CB
3987 /* Mark everything as shared so our mounts get propagated down. This is required to make new bind
3988 * mounts available in systemd services inside the container that create a new mount namespace. See
3989 * https://github.com/systemd/systemd/issues/3860 Further submounts (such as /dev) done after this
3990 * will inherit the shared propagation mode.
3991 *
3992 * IMPORTANT: Do not overmount the root directory anymore from now on to enable moving the root
3993 * directory mount to root later on.
3994 * https://github.com/systemd/systemd/issues/3847#issuecomment-562735251
3995 */
9d50f850 3996 r = mount_switch_root(directory, MS_SHARED);
03cfe0d5
LP
3997 if (r < 0)
3998 return log_error_errno(r, "Failed to move root directory: %m");
3999
e79581dd
CB
4000 /* We finished setting up the rootfs which is a shared mount. The mount tunnel needs to be a
4001 * dependent mount otherwise we can't MS_MOVE mounts that were propagated from the host into
4002 * the container. */
4003 r = mount_tunnel_open();
4004 if (r < 0)
4005 return r;
4006
b71a0192
CB
4007 if (arg_userns_mode != USER_NAMESPACE_NO) {
4008 /* In order to mount procfs and sysfs in an unprivileged container the kernel
4009 * requires that a fully visible instance is already present in the target mount
4010 * namespace. Mount one here so the inner child can mount its own instances. Later
4011 * we umount the temporary instances created here before we actually exec the
4012 * payload. Since the rootfs is shared the umount will propagate into the container.
4013 * Note, the inner child wouldn't be able to unmount the instances on its own since
4014 * it doesn't own the originating mount namespace. IOW, the outer child needs to do
4015 * this. */
4016 r = pin_fully_visible_fs();
4017 if (r < 0)
4018 return r;
4019 }
4020
/* Create the notification socket; the resulting fd is sent to the parent further down. */
e96ceaba 4021 fd = setup_notify_child();
9c1e04d0
AP
4022 if (fd < 0)
4023 return fd;
4024
03cfe0d5 4025 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
0c582db0 4026 arg_clone_ns_flags |
8869a0b4 4027 (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
03cfe0d5
LP
4028 if (pid < 0)
4029 return log_error_errno(errno, "Failed to fork inner child: %m");
03cfe0d5 4030 if (pid == 0) {
/* From here on we are the inner child; it has no use for the socket to the parent. */
af06cd30 4031 fd_outer_socket = safe_close(fd_outer_socket);
03cfe0d5 4032
2a2e78e9
LP
4033 /* The inner child has all namespaces that are requested, so that we all are owned by the
4034 * user if user namespaces are turned on. */
03cfe0d5 4035
d7bea6b6
DP
4036 if (arg_network_namespace_path) {
4037 r = namespace_enter(-1, -1, netns_fd, -1, -1);
4038 if (r < 0)
e2d39e54 4039 return log_error_errno(r, "Failed to join network namespace: %m");
d7bea6b6
DP
4040 }
4041
11875a98 4042 r = inner_child(barrier, fd_inner_socket, fds, os_release_pairs);
03cfe0d5
LP
4043 if (r < 0)
4044 _exit(EXIT_FAILURE);
4045
4046 _exit(EXIT_SUCCESS);
4047 }
4048
/* Report the inner child's PID, the machine ID and the notify socket fd to the parent, in this
 * order. */
af06cd30 4049 l = send(fd_outer_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
03cfe0d5
LP
4050 if (l < 0)
4051 return log_error_errno(errno, "Failed to send PID: %m");
baaa35ad
ZJS
4052 if (l != sizeof(pid))
4053 return log_error_errno(SYNTHETIC_ERRNO(EIO),
4054 "Short write while sending PID.");
03cfe0d5 4055
af06cd30 4056 l = send(fd_outer_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
e01ff70a
MS
4057 if (l < 0)
4058 return log_error_errno(errno, "Failed to send machine ID: %m");
baaa35ad
ZJS
4059 if (l != sizeof(arg_uuid))
4060 return log_error_errno(SYNTHETIC_ERRNO(EIO),
4061 "Short write while sending machine ID.");
e01ff70a 4062
af06cd30 4063 l = send_one_fd(fd_outer_socket, fd, 0);
9c1e04d0 4064 if (l < 0)
ba72801d 4065 return log_error_errno(l, "Failed to send notify fd: %m");
9c1e04d0 4066
/* Done; close both sockets and the network namespace fd before returning. */
af06cd30 4067 fd_outer_socket = safe_close(fd_outer_socket);
5d9d3fcb 4068 fd_inner_socket = safe_close(fd_inner_socket);
d7bea6b6 4069 netns_fd = safe_close(netns_fd);
03cfe0d5
LP
4070
4071 return 0;
4072}
4073
0e7ac751 4074static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
d381c8a6 4075 bool tried_hashed = false;
0e7ac751
LP
4076 unsigned n_tries = 100;
4077 uid_t candidate;
4078 int r;
4079
4080 assert(shift);
4081 assert(ret_lock_file);
0de7acce 4082 assert(arg_userns_mode == USER_NAMESPACE_PICK);
0e7ac751
LP
4083 assert(arg_uid_range == 0x10000U);
4084
4085 candidate = *shift;
4086
4087 (void) mkdir("/run/systemd/nspawn-uid", 0755);
4088
4089 for (;;) {
fbd0b64f 4090 char lock_path[STRLEN("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
8e766630 4091 _cleanup_(release_lock_file) LockFile lf = LOCK_FILE_INIT;
0e7ac751
LP
4092
4093 if (--n_tries <= 0)
4094 return -EBUSY;
4095
87d5e4f2 4096 if (candidate < CONTAINER_UID_BASE_MIN || candidate > CONTAINER_UID_BASE_MAX)
0e7ac751
LP
4097 goto next;
4098 if ((candidate & UINT32_C(0xFFFF)) != 0)
4099 goto next;
4100
4101 xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
4102 r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
4103 if (r == -EBUSY) /* Range already taken by another nspawn instance */
4104 goto next;
4105 if (r < 0)
4106 return r;
4107
4108 /* Make some superficial checks whether the range is currently known in the user database */
4109 if (getpwuid(candidate))
4110 goto next;
4111 if (getpwuid(candidate + UINT32_C(0xFFFE)))
4112 goto next;
4113 if (getgrgid(candidate))
4114 goto next;
4115 if (getgrgid(candidate + UINT32_C(0xFFFE)))
4116 goto next;
4117
4118 *ret_lock_file = lf;
4119 lf = (struct LockFile) LOCK_FILE_INIT;
4120 *shift = candidate;
4121 return 0;
4122
4123 next:
d381c8a6
LP
4124 if (arg_machine && !tried_hashed) {
4125 /* Try to hash the base from the container name */
4126
4127 static const uint8_t hash_key[] = {
4128 0xe1, 0x56, 0xe0, 0xf0, 0x4a, 0xf0, 0x41, 0xaf,
4129 0x96, 0x41, 0xcf, 0x41, 0x33, 0x94, 0xff, 0x72
4130 };
4131
4132 candidate = (uid_t) siphash24(arg_machine, strlen(arg_machine), hash_key);
4133
4134 tried_hashed = true;
4135 } else
4136 random_bytes(&candidate, sizeof(candidate));
4137
87d5e4f2 4138 candidate = (candidate % (CONTAINER_UID_BASE_MAX - CONTAINER_UID_BASE_MIN)) + CONTAINER_UID_BASE_MIN;
0e7ac751
LP
4139 candidate &= (uid_t) UINT32_C(0xFFFF0000);
4140 }
4141}
4142
2f893044
LP
4143static int add_one_uid_map(
4144 char **p,
4145 uid_t container_uid,
4146 uid_t host_uid,
4147 uid_t range) {
4148
4149 return strextendf(p,
4150 UID_FMT " " UID_FMT " " UID_FMT "\n",
4151 container_uid, host_uid, range);
4152}
4153
4154static int make_uid_map_string(
4155 const uid_t bind_user_uid[],
4156 size_t n_bind_user_uid,
4157 size_t offset,
4158 char **ret) {
4159
4160 _cleanup_free_ char *s = NULL;
4161 uid_t previous_uid = 0;
4162 int r;
4163
4164 assert(n_bind_user_uid == 0 || bind_user_uid);
2f092762 4165 assert(IN_SET(offset, 0, 2)); /* used to switch between UID and GID map */
2f893044
LP
4166 assert(ret);
4167
4168 /* The bind_user_uid[] array is a series of 4 uid_t values, for each --bind-user= entry one
4169 * quadruplet, consisting of host and container UID + GID. */
4170
4171 for (size_t i = 0; i < n_bind_user_uid; i++) {
05ab439a
YW
4172 uid_t payload_uid = bind_user_uid[i*4+offset],
4173 host_uid = bind_user_uid[i*4+offset+1];
2f893044
LP
4174
4175 assert(previous_uid <= payload_uid);
4176 assert(payload_uid < arg_uid_range);
4177
4178 /* Add a range to close the gap to previous entry */
4179 if (payload_uid > previous_uid) {
4180 r = add_one_uid_map(&s, previous_uid, arg_uid_shift + previous_uid, payload_uid - previous_uid);
4181 if (r < 0)
4182 return r;
4183 }
4184
4185 /* Map this specific user */
4186 r = add_one_uid_map(&s, payload_uid, host_uid, 1);
4187 if (r < 0)
4188 return r;
4189
4190 previous_uid = payload_uid + 1;
4191 }
4192
4193 /* And add a range to close the gap to finish the range */
4194 if (arg_uid_range > previous_uid) {
4195 r = add_one_uid_map(&s, previous_uid, arg_uid_shift + previous_uid, arg_uid_range - previous_uid);
4196 if (r < 0)
4197 return r;
4198 }
4199
4200 assert(s);
4201
4202 *ret = TAKE_PTR(s);
4203 return 0;
4204}
4205
4206static int setup_uid_map(
4207 pid_t pid,
4208 const uid_t bind_user_uid[],
4209 size_t n_bind_user_uid) {
4210
4211 char uid_map[STRLEN("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1];
4212 _cleanup_free_ char *s = NULL;
03cfe0d5
LP
4213 int r;
4214
4215 assert(pid > 1);
4216
2f893044
LP
4217 /* Build the UID map string */
4218 if (make_uid_map_string(bind_user_uid, n_bind_user_uid, 0, &s) < 0) /* offset=0 contains the UID pair */
4219 return log_oom();
4220
03cfe0d5 4221 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
2f893044 4222 r = write_string_file(uid_map, s, WRITE_STRING_FILE_DISABLE_BUFFER);
03cfe0d5
LP
4223 if (r < 0)
4224 return log_error_errno(r, "Failed to write UID map: %m");
4225
2f893044
LP
4226 /* And now build the GID map string */
4227 s = mfree(s);
4228 if (make_uid_map_string(bind_user_uid, n_bind_user_uid, 2, &s) < 0) /* offset=2 contains the GID pair */
4229 return log_oom();
4230
03cfe0d5 4231 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
2f893044 4232 r = write_string_file(uid_map, s, WRITE_STRING_FILE_DISABLE_BUFFER);
03cfe0d5
LP
4233 if (r < 0)
4234 return log_error_errno(r, "Failed to write GID map: %m");
4235
4236 return 0;
4237}
4238
9c1e04d0 4239static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
9c1e04d0
AP
4240 char buf[NOTIFY_BUFFER_MAX+1];
4241 char *p = NULL;
4242 struct iovec iovec = {
4243 .iov_base = buf,
4244 .iov_len = sizeof(buf)-1,
4245 };
fb29cdbe
LP
4246 CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred)) +
4247 CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)) control;
9c1e04d0
AP
4248 struct msghdr msghdr = {
4249 .msg_iov = &iovec,
4250 .msg_iovlen = 1,
4251 .msg_control = &control,
4252 .msg_controllen = sizeof(control),
4253 };
371d72e0 4254 struct ucred *ucred;
9c1e04d0
AP
4255 ssize_t n;
4256 pid_t inner_child_pid;
4257 _cleanup_strv_free_ char **tags = NULL;
4bf4f50f 4258 int r;
9c1e04d0
AP
4259
4260 assert(userdata);
4261
4262 inner_child_pid = PTR_TO_PID(userdata);
4263
4264 if (revents != EPOLLIN) {
4265 log_warning("Got unexpected poll event for notify fd.");
4266 return 0;
4267 }
4268
3691bcf3 4269 n = recvmsg_safe(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
8add30a0
YW
4270 if (n < 0) {
4271 if (ERRNO_IS_TRANSIENT(n))
4272 return 0;
4273 if (n == -EXFULL) {
4274 log_warning("Got message with truncated control data (too many fds sent?), ignoring.");
4275 return 0;
4276 }
3691bcf3 4277 return log_warning_errno(n, "Couldn't read notification socket: %m");
8add30a0 4278 }
9c1e04d0 4279
9c1e04d0
AP
4280 cmsg_close_all(&msghdr);
4281
371d72e0 4282 ucred = CMSG_FIND_DATA(&msghdr, SOL_SOCKET, SCM_CREDENTIALS, struct ucred);
9c1e04d0 4283 if (!ucred || ucred->pid != inner_child_pid) {
8cb57430 4284 log_debug("Received notify message without valid credentials. Ignoring.");
9c1e04d0
AP
4285 return 0;
4286 }
4287
4288 if ((size_t) n >= sizeof(buf)) {
4289 log_warning("Received notify message exceeded maximum size. Ignoring.");
4290 return 0;
4291 }
4292
4293 buf[n] = 0;
4294 tags = strv_split(buf, "\n\r");
4295 if (!tags)
4296 return log_oom();
4297
d29cc4d6 4298 if (strv_contains(tags, "READY=1")) {
d4341b76 4299 r = sd_notify(false, "READY=1\n");
4bf4f50f
ZJS
4300 if (r < 0)
4301 log_warning_errno(r, "Failed to send readiness notification, ignoring: %m");
4302 }
9c1e04d0
AP
4303
4304 p = strv_find_startswith(tags, "STATUS=");
4305 if (p)
04f590a4 4306 (void) sd_notifyf(false, "STATUS=Container running: %s", p);
9c1e04d0
AP
4307
4308 return 0;
4309}
4310
e96ceaba 4311static int setup_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid, sd_event_source **notify_event_source) {
9c1e04d0 4312 int r;
9c1e04d0 4313
5773024d 4314 r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
9c1e04d0
AP
4315 if (r < 0)
4316 return log_error_errno(r, "Failed to allocate notify event source: %m");
4317
5773024d 4318 (void) sd_event_source_set_description(*notify_event_source, "nspawn-notify");
9c1e04d0
AP
4319
4320 return 0;
4321}
4322
5d961407
LP
4323static int merge_settings(Settings *settings, const char *path) {
4324 int rl;
f757855e 4325
5d961407
LP
4326 assert(settings);
4327 assert(path);
f757855e 4328
5d961407
LP
4329 /* Copy over bits from the settings, unless they have been explicitly masked by command line switches. Note
4330 * that this steals the fields of the Settings* structure, and hence modifies it. */
f757855e 4331
7732f92b
LP
4332 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
4333 settings->start_mode >= 0) {
4334 arg_start_mode = settings->start_mode;
130d3d22 4335 strv_free_and_replace(arg_parameters, settings->parameters);
f757855e
LP
4336 }
4337
d3689b94
LP
4338 if ((arg_settings_mask & SETTING_EPHEMERAL) == 0 &&
4339 settings->ephemeral >= 0)
a2f577fc
JL
4340 arg_ephemeral = settings->ephemeral;
4341
de40a303
LP
4342 if ((arg_settings_mask & SETTING_DIRECTORY) == 0 &&
4343 settings->root) {
4344
4345 if (!arg_settings_trusted)
4346 log_warning("Ignoring root directory setting, file %s is not trusted.", path);
4347 else
4348 free_and_replace(arg_directory, settings->root);
4349 }
4350
b53ede69
PW
4351 if ((arg_settings_mask & SETTING_PIVOT_ROOT) == 0 &&
4352 settings->pivot_root_new) {
4353 free_and_replace(arg_pivot_root_new, settings->pivot_root_new);
4354 free_and_replace(arg_pivot_root_old, settings->pivot_root_old);
4355 }
4356
5f932eb9 4357 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
1cc6c93a
YW
4358 settings->working_directory)
4359 free_and_replace(arg_chdir, settings->working_directory);
5f932eb9 4360
f757855e 4361 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
130d3d22
YW
4362 settings->environment)
4363 strv_free_and_replace(arg_setenv, settings->environment);
f757855e 4364
de40a303
LP
4365 if ((arg_settings_mask & SETTING_USER) == 0) {
4366
4367 if (settings->user)
4368 free_and_replace(arg_user, settings->user);
4369
4370 if (uid_is_valid(settings->uid))
4371 arg_uid = settings->uid;
4372 if (gid_is_valid(settings->gid))
4373 arg_gid = settings->gid;
4374 if (settings->n_supplementary_gids > 0) {
4375 free_and_replace(arg_supplementary_gids, settings->supplementary_gids);
4376 arg_n_supplementary_gids = settings->n_supplementary_gids;
4377 }
4378 }
f757855e
LP
4379
4380 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
a3fc6b55 4381 uint64_t plus, minus;
7be830c6 4382 uint64_t network_minus = 0;
88fc9c9b 4383 uint64_t ambient;
f757855e 4384
de40a303
LP
4385 /* Note that we copy both the simple plus/minus caps here, and the full quintet from the
4386 * Settings structure */
4387
0e265674 4388 plus = settings->capability;
a3fc6b55
LP
4389 minus = settings->drop_capability;
4390
9baa294c
LP
4391 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
4392 settings_network_configured(settings)) {
a3fc6b55
LP
4393 if (settings_private_network(settings))
4394 plus |= UINT64_C(1) << CAP_NET_ADMIN;
4395 else
7be830c6 4396 network_minus |= UINT64_C(1) << CAP_NET_ADMIN;
a3fc6b55 4397 }
0e265674
LP
4398
4399 if (!arg_settings_trusted && plus != 0) {
4400 if (settings->capability != 0)
5d961407 4401 log_warning("Ignoring Capability= setting, file %s is not trusted.", path);
7be830c6
TH
4402 } else {
4403 arg_caps_retain &= ~network_minus;
520e0d54 4404 arg_caps_retain |= plus;
7be830c6 4405 }
f757855e 4406
a3fc6b55 4407 arg_caps_retain &= ~minus;
de40a303
LP
4408
4409 /* Copy the full capabilities over too */
4410 if (capability_quintet_is_set(&settings->full_capabilities)) {
4411 if (!arg_settings_trusted)
5238e957 4412 log_warning("Ignoring capability settings, file %s is not trusted.", path);
de40a303
LP
4413 else
4414 arg_full_capabilities = settings->full_capabilities;
4415 }
88fc9c9b
TH
4416
4417 ambient = settings->ambient_capability;
4418 if (!arg_settings_trusted && ambient != 0)
4419 log_warning("Ignoring AmbientCapability= setting, file %s is not trusted.", path);
4420 else
4421 arg_caps_ambient |= ambient;
f757855e
LP
4422 }
4423
4424 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
4425 settings->kill_signal > 0)
4426 arg_kill_signal = settings->kill_signal;
4427
4428 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
4429 settings->personality != PERSONALITY_INVALID)
4430 arg_personality = settings->personality;
4431
4432 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
4433 !sd_id128_is_null(settings->machine_id)) {
4434
4435 if (!arg_settings_trusted)
5d961407 4436 log_warning("Ignoring MachineID= setting, file %s is not trusted.", path);
f757855e
LP
4437 else
4438 arg_uuid = settings->machine_id;
4439 }
4440
4441 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
4442 settings->read_only >= 0)
4443 arg_read_only = settings->read_only;
4444
4445 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
4446 settings->volatile_mode != _VOLATILE_MODE_INVALID)
4447 arg_volatile_mode = settings->volatile_mode;
4448
4449 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
4450 settings->n_custom_mounts > 0) {
4451
4452 if (!arg_settings_trusted)
5d961407 4453 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", path);
f757855e
LP
4454 else {
4455 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
1cc6c93a 4456 arg_custom_mounts = TAKE_PTR(settings->custom_mounts);
f757855e 4457 arg_n_custom_mounts = settings->n_custom_mounts;
f757855e
LP
4458 settings->n_custom_mounts = 0;
4459 }
4460 }
4461
4462 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
a1dfd585 4463 settings_network_configured(settings)) {
f757855e
LP
4464
4465 if (!arg_settings_trusted)
5d961407 4466 log_warning("Ignoring network settings, file %s is not trusted.", path);
f757855e 4467 else {
f6d6bad1 4468 arg_network_veth = settings_network_veth(settings);
0e265674
LP
4469 arg_private_network = settings_private_network(settings);
4470
130d3d22
YW
4471 strv_free_and_replace(arg_network_interfaces, settings->network_interfaces);
4472 strv_free_and_replace(arg_network_macvlan, settings->network_macvlan);
4473 strv_free_and_replace(arg_network_ipvlan, settings->network_ipvlan);
4474 strv_free_and_replace(arg_network_veth_extra, settings->network_veth_extra);
f6d6bad1 4475
1cc6c93a
YW
4476 free_and_replace(arg_network_bridge, settings->network_bridge);
4477 free_and_replace(arg_network_zone, settings->network_zone);
de40a303
LP
4478
4479 free_and_replace(arg_network_namespace_path, settings->network_namespace_path);
f757855e
LP
4480 }
4481 }
4482
4483 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
4484 settings->expose_ports) {
4485
4486 if (!arg_settings_trusted)
5d961407 4487 log_warning("Ignoring Port= setting, file %s is not trusted.", path);
f757855e
LP
4488 else {
4489 expose_port_free_all(arg_expose_ports);
1cc6c93a 4490 arg_expose_ports = TAKE_PTR(settings->expose_ports);
f757855e
LP
4491 }
4492 }
4493
0de7acce
LP
4494 if ((arg_settings_mask & SETTING_USERNS) == 0 &&
4495 settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
4496
4497 if (!arg_settings_trusted)
5d961407 4498 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", path);
0de7acce
LP
4499 else {
4500 arg_userns_mode = settings->userns_mode;
4501 arg_uid_shift = settings->uid_shift;
4502 arg_uid_range = settings->uid_range;
6c045a99 4503 arg_userns_ownership = settings->userns_ownership;
0de7acce
LP
4504 }
4505 }
4506
0cc3c9f9
LP
4507 if ((arg_settings_mask & SETTING_BIND_USER) == 0 &&
4508 !strv_isempty(settings->bind_user))
2f893044
LP
4509 strv_free_and_replace(arg_bind_user, settings->bind_user);
4510
d3689b94
LP
4511 if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0 &&
4512 settings->notify_ready >= 0)
9c1e04d0
AP
4513 arg_notify_ready = settings->notify_ready;
4514
960e4569
LP
4515 if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) {
4516
2d09ea44
LP
4517 if (!strv_isempty(settings->syscall_allow_list) || !strv_isempty(settings->syscall_deny_list)) {
4518 if (!arg_settings_trusted && !strv_isempty(settings->syscall_allow_list))
4519 log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", path);
4520 else {
4521 strv_free_and_replace(arg_syscall_allow_list, settings->syscall_allow_list);
4522 strv_free_and_replace(arg_syscall_deny_list, settings->syscall_deny_list);
4523 }
960e4569 4524 }
de40a303
LP
4525
4526#if HAVE_SECCOMP
2d09ea44
LP
4527 if (settings->seccomp) {
4528 if (!arg_settings_trusted)
4529 log_warning("Ignoring SECCOMP filter, file %s is not trusted.", path);
4530 else {
4531 seccomp_release(arg_seccomp);
4532 arg_seccomp = TAKE_PTR(settings->seccomp);
4533 }
de40a303
LP
4534 }
4535#endif
960e4569
LP
4536 }
4537
bf428efb
LP
4538 for (rl = 0; rl < _RLIMIT_MAX; rl ++) {
4539 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)))
4540 continue;
4541
4542 if (!settings->rlimit[rl])
4543 continue;
4544
4545 if (!arg_settings_trusted) {
5d961407 4546 log_warning("Ignoring Limit%s= setting, file '%s' is not trusted.", rlimit_to_string(rl), path);
bf428efb
LP
4547 continue;
4548 }
4549
4550 free_and_replace(arg_rlimit[rl], settings->rlimit[rl]);
4551 }
4552
3a9530e5
LP
4553 if ((arg_settings_mask & SETTING_HOSTNAME) == 0 &&
4554 settings->hostname)
4555 free_and_replace(arg_hostname, settings->hostname);
4556
66edd963
LP
4557 if ((arg_settings_mask & SETTING_NO_NEW_PRIVILEGES) == 0 &&
4558 settings->no_new_privileges >= 0)
4559 arg_no_new_privileges = settings->no_new_privileges;
4560
81f345df
LP
4561 if ((arg_settings_mask & SETTING_OOM_SCORE_ADJUST) == 0 &&
4562 settings->oom_score_adjust_set) {
4563
4564 if (!arg_settings_trusted)
5d961407 4565 log_warning("Ignoring OOMScoreAdjust= setting, file '%s' is not trusted.", path);
81f345df
LP
4566 else {
4567 arg_oom_score_adjust = settings->oom_score_adjust;
4568 arg_oom_score_adjust_set = true;
4569 }
4570 }
4571
d107bb7d 4572 if ((arg_settings_mask & SETTING_CPU_AFFINITY) == 0 &&
0985c7c4 4573 settings->cpu_set.set) {
d107bb7d
LP
4574
4575 if (!arg_settings_trusted)
5d961407 4576 log_warning("Ignoring CPUAffinity= setting, file '%s' is not trusted.", path);
d107bb7d 4577 else {
0985c7c4 4578 cpu_set_reset(&arg_cpu_set);
088d71f8 4579 arg_cpu_set = TAKE_STRUCT(settings->cpu_set);
d107bb7d
LP
4580 }
4581 }
4582
09d423e9
LP
4583 if ((arg_settings_mask & SETTING_RESOLV_CONF) == 0 &&
4584 settings->resolv_conf != _RESOLV_CONF_MODE_INVALID)
4585 arg_resolv_conf = settings->resolv_conf;
4586
4e1d6aa9
LP
4587 if ((arg_settings_mask & SETTING_LINK_JOURNAL) == 0 &&
4588 settings->link_journal != _LINK_JOURNAL_INVALID) {
4589
4590 if (!arg_settings_trusted)
4591 log_warning("Ignoring journal link setting, file '%s' is not trusted.", path);
4592 else {
4593 arg_link_journal = settings->link_journal;
4594 arg_link_journal_try = settings->link_journal_try;
4595 }
4596 }
4597
1688841f
LP
4598 if ((arg_settings_mask & SETTING_TIMEZONE) == 0 &&
4599 settings->timezone != _TIMEZONE_MODE_INVALID)
4600 arg_timezone = settings->timezone;
4601
de40a303
LP
4602 if ((arg_settings_mask & SETTING_SLICE) == 0 &&
4603 settings->slice) {
4604
4605 if (!arg_settings_trusted)
4606 log_warning("Ignoring slice setting, file '%s' is not trusted.", path);
4607 else
4608 free_and_replace(arg_slice, settings->slice);
4609 }
4610
4611 if ((arg_settings_mask & SETTING_USE_CGNS) == 0 &&
4612 settings->use_cgns >= 0) {
4613
4614 if (!arg_settings_trusted)
4615 log_warning("Ignoring cgroup namespace setting, file '%s' is not trusted.", path);
4616 else
4617 arg_use_cgns = settings->use_cgns;
4618 }
4619
4620 if ((arg_settings_mask & SETTING_CLONE_NS_FLAGS) == 0 &&
f5fbe71d 4621 settings->clone_ns_flags != ULONG_MAX) {
de40a303
LP
4622
4623 if (!arg_settings_trusted)
4624 log_warning("Ignoring namespace setting, file '%s' is not trusted.", path);
4625 else
4626 arg_clone_ns_flags = settings->clone_ns_flags;
4627 }
4628
4629 if ((arg_settings_mask & SETTING_CONSOLE_MODE) == 0 &&
4630 settings->console_mode >= 0) {
4631
4632 if (!arg_settings_trusted)
4633 log_warning("Ignoring console mode setting, file '%s' is not trusted.", path);
4634 else
4635 arg_console_mode = settings->console_mode;
4636 }
4637
d3689b94
LP
4638 if ((arg_settings_mask & SETTING_SUPPRESS_SYNC) == 0 &&
4639 settings->suppress_sync >= 0)
4a4654e0
LP
4640 arg_suppress_sync = settings->suppress_sync;
4641
de40a303
LP
4642 /* The following properties can only be set through the OCI settings logic, not from the command line, hence we
4643 * don't consult arg_settings_mask for them. */
4644
4645 sd_bus_message_unref(arg_property_message);
4646 arg_property_message = TAKE_PTR(settings->properties);
4647
4648 arg_console_width = settings->console_width;
4649 arg_console_height = settings->console_height;
4650
b2645747 4651 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
de40a303
LP
4652 arg_extra_nodes = TAKE_PTR(settings->extra_nodes);
4653 arg_n_extra_nodes = settings->n_extra_nodes;
4654
f757855e
LP
4655 return 0;
4656}
4657
5d961407
LP
4658static int load_settings(void) {
4659 _cleanup_(settings_freep) Settings *settings = NULL;
4660 _cleanup_fclose_ FILE *f = NULL;
3603f151 4661 _cleanup_free_ char *p = NULL;
5d961407
LP
4662 int r;
4663
de40a303
LP
4664 if (arg_oci_bundle)
4665 return 0;
4666
5d961407
LP
4667 /* If all settings are masked, there's no point in looking for
4668 * the settings file */
d7a0f1f4 4669 if (FLAGS_SET(arg_settings_mask, _SETTINGS_MASK_ALL))
5d961407
LP
4670 return 0;
4671
5d961407
LP
4672 /* We first look in the admin's directories in /etc and /run */
4673 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
4674 _cleanup_free_ char *j = NULL;
4675
3603f151 4676 j = path_join(i, arg_settings_filename);
5d961407
LP
4677 if (!j)
4678 return log_oom();
4679
4680 f = fopen(j, "re");
4681 if (f) {
4682 p = TAKE_PTR(j);
4683
4684 /* By default, we trust configuration from /etc and /run */
4685 if (arg_settings_trusted < 0)
4686 arg_settings_trusted = true;
4687
4688 break;
4689 }
4690
4691 if (errno != ENOENT)
4692 return log_error_errno(errno, "Failed to open %s: %m", j);
4693 }
4694
4695 if (!f) {
4696 /* After that, let's look for a file next to the
4697 * actual image we shall boot. */
4698
4699 if (arg_image) {
162f6477
LP
4700 r = file_in_same_dir(arg_image, arg_settings_filename, &p);
4701 if (r < 0)
4702 return log_error_errno(r, "Failed to generate settings path from image path: %m");
4703 } else if (arg_directory) {
4704 r = file_in_same_dir(arg_directory, arg_settings_filename, &p);
4705 if (r < 0 && r != -EADDRNOTAVAIL) /* if directory is root fs, don't complain */
4706 return log_error_errno(r, "Failed to generate settings path from directory path: %m");
5d961407
LP
4707 }
4708
4709 if (p) {
4710 f = fopen(p, "re");
4711 if (!f && errno != ENOENT)
4712 return log_error_errno(errno, "Failed to open %s: %m", p);
4713
4714 /* By default, we do not trust configuration from /var/lib/machines */
4715 if (arg_settings_trusted < 0)
4716 arg_settings_trusted = false;
4717 }
4718 }
4719
4720 if (!f)
4721 return 0;
4722
4723 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
4724
4725 r = settings_load(f, p, &settings);
4726 if (r < 0)
4727 return r;
4728
4729 return merge_settings(settings, p);
4730}
4731
de40a303
LP
4732static int load_oci_bundle(void) {
4733 _cleanup_(settings_freep) Settings *settings = NULL;
4734 int r;
4735
4736 if (!arg_oci_bundle)
4737 return 0;
4738
4739 /* By default let's trust OCI bundles */
4740 if (arg_settings_trusted < 0)
4741 arg_settings_trusted = true;
4742
4743 r = oci_load(NULL, arg_oci_bundle, &settings);
4744 if (r < 0)
4745 return r;
4746
4747 return merge_settings(settings, arg_oci_bundle);
4748}
4749
3acc84eb 4750static int run_container(
2d845785 4751 DissectedImage *dissected_image,
b0067625
ZJS
4752 FDSet *fds,
4753 char veth_name[IFNAMSIZ], bool *veth_created,
761cf19d 4754 struct ExposeArgs *expose_args,
3acc84eb 4755 int *master, pid_t *pid, int *ret) {
b0067625
ZJS
4756
4757 static const struct sigaction sa = {
4758 .sa_handler = nop_signal_handler,
e28c7cd0 4759 .sa_flags = SA_NOCLDSTOP|SA_RESTART,
b0067625
ZJS
4760 };
4761
8e766630 4762 _cleanup_(release_lock_file) LockFile uid_shift_lock = LOCK_FILE_INIT;
5bb1d7fb 4763 _cleanup_close_ int etc_passwd_lock = -EBADF;
b0067625 4764 _cleanup_close_pair_ int
19ee48a6
YW
4765 fd_inner_socket_pair[2] = PIPE_EBADF,
4766 fd_outer_socket_pair[2] = PIPE_EBADF;
8199d554 4767
5bb1d7fb 4768 _cleanup_close_ int notify_socket = -EBADF, mntns_fd = -EBADF, fd_kmsg_fifo = -EBADF;
b0067625 4769 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
5773024d 4770 _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL;
b0067625
ZJS
4771 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
4772 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4773 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
abdb9b08 4774 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
2f893044
LP
4775 _cleanup_free_ uid_t *bind_user_uid = NULL;
4776 size_t n_bind_user_uid = 0;
b0067625 4777 ContainerStatus container_status = 0;
b0067625
ZJS
4778 int ifi = 0, r;
4779 ssize_t l;
4780 sigset_t mask_chld;
254d1313 4781 _cleanup_close_ int child_netns_fd = -EBADF;
b0067625
ZJS
4782
4783 assert_se(sigemptyset(&mask_chld) == 0);
4784 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
4785
4786 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4787 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
4788 * check with getpwuid() if the specific user already exists. Note that /etc might be
4789 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
4790 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
4791 * really just an extra safety net. We kinda assume that the UID range we allocate from is
4792 * really ours. */
4793
4794 etc_passwd_lock = take_etc_passwd_lock(NULL);
4795 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS)
4796 return log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m");
4797 }
4798
4799 r = barrier_create(&barrier);
4800 if (r < 0)
4801 return log_error_errno(r, "Cannot initialize IPC barrier: %m");
4802
5d9d3fcb
CB
4803 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, fd_inner_socket_pair) < 0)
4804 return log_error_errno(errno, "Failed to create inner socket pair: %m");
4805
af06cd30
CB
4806 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, fd_outer_socket_pair) < 0)
4807 return log_error_errno(errno, "Failed to create outer socket pair: %m");
b0067625 4808
b0067625
ZJS
4809 /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
4810 * parent's blocking calls and give it a chance to call wait() and terminate. */
4811 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
4812 if (r < 0)
4813 return log_error_errno(errno, "Failed to change the signal mask: %m");
4814
4815 r = sigaction(SIGCHLD, &sa, NULL);
4816 if (r < 0)
4817 return log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
4818
d7bea6b6 4819 if (arg_network_namespace_path) {
5b4855ab
DDM
4820 child_netns_fd = open(arg_network_namespace_path, O_RDONLY|O_NOCTTY|O_CLOEXEC);
4821 if (child_netns_fd < 0)
d7bea6b6
DP
4822 return log_error_errno(errno, "Cannot open file %s: %m", arg_network_namespace_path);
4823
54c2459d 4824 r = fd_is_ns(child_netns_fd, CLONE_NEWNET);
6619ad88
LP
4825 if (r == -EUCLEAN)
4826 log_debug_errno(r, "Cannot determine if passed network namespace path '%s' really refers to a network namespace, assuming it does.", arg_network_namespace_path);
4827 else if (r < 0)
d7bea6b6 4828 return log_error_errno(r, "Failed to check %s fs type: %m", arg_network_namespace_path);
c6147113
LP
4829 else if (r == 0)
4830 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
4831 "Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path);
d7bea6b6
DP
4832 }
4833
b0067625
ZJS
4834 *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
4835 if (*pid < 0)
4836 return log_error_errno(errno, "clone() failed%s: %m",
4837 errno == EINVAL ?
4838 ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
4839
4840 if (*pid == 0) {
4841 /* The outer child only has a file system namespace. */
4842 barrier_set_role(&barrier, BARRIER_CHILD);
4843
5d9d3fcb 4844 fd_inner_socket_pair[0] = safe_close(fd_inner_socket_pair[0]);
af06cd30 4845 fd_outer_socket_pair[0] = safe_close(fd_outer_socket_pair[0]);
b0067625
ZJS
4846
4847 (void) reset_all_signal_handlers();
4848 (void) reset_signal_mask();
4849
4850 r = outer_child(&barrier,
4851 arg_directory,
2d845785 4852 dissected_image,
af06cd30 4853 fd_outer_socket_pair[1],
5d9d3fcb 4854 fd_inner_socket_pair[1],
d7bea6b6 4855 fds,
5b4855ab 4856 child_netns_fd);
b0067625
ZJS
4857 if (r < 0)
4858 _exit(EXIT_FAILURE);
4859
4860 _exit(EXIT_SUCCESS);
4861 }
4862
4863 barrier_set_role(&barrier, BARRIER_PARENT);
4864
e4077ff6 4865 fdset_close(fds);
b0067625 4866
5d9d3fcb 4867 fd_inner_socket_pair[1] = safe_close(fd_inner_socket_pair[1]);
af06cd30 4868 fd_outer_socket_pair[1] = safe_close(fd_outer_socket_pair[1]);
b0067625
ZJS
4869
4870 if (arg_userns_mode != USER_NAMESPACE_NO) {
af06cd30 4871 mntns_fd = receive_one_fd(fd_outer_socket_pair[0], 0);
b71a0192
CB
4872 if (mntns_fd < 0)
4873 return log_error_errno(mntns_fd, "Failed to receive mount namespace fd from outer child: %m");
4874
b0067625 4875 /* The child just let us know the UID shift it might have read from the image. */
af06cd30 4876 l = recv(fd_outer_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
b0067625
ZJS
4877 if (l < 0)
4878 return log_error_errno(errno, "Failed to read UID shift: %m");
c6147113
LP
4879 if (l != sizeof arg_uid_shift)
4880 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading UID shift.");
b0067625
ZJS
4881
4882 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4883 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
4884 * image, but if that's already in use, pick a new one, and report back to the child,
4885 * which one we now picked. */
4886
4887 r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
4888 if (r < 0)
4889 return log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
4890
af06cd30 4891 l = send(fd_outer_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
b0067625
ZJS
4892 if (l < 0)
4893 return log_error_errno(errno, "Failed to send UID shift: %m");
c6147113
LP
4894 if (l != sizeof arg_uid_shift)
4895 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short write while writing UID shift.");
b0067625 4896 }
2f893044
LP
4897
4898 n_bind_user_uid = strv_length(arg_bind_user);
4899 if (n_bind_user_uid > 0) {
4900 /* Right after the UID shift, we'll receive the list of UID mappings for the
4901 * --bind-user= logic. Always a quadruplet of payload and host UID + GID. */
4902
4903 bind_user_uid = new(uid_t, n_bind_user_uid*4);
4904 if (!bind_user_uid)
4905 return log_oom();
4906
4907 for (size_t i = 0; i < n_bind_user_uid; i++) {
af06cd30 4908 l = recv(fd_outer_socket_pair[0], bind_user_uid + i*4, sizeof(uid_t)*4, 0);
2f893044
LP
4909 if (l < 0)
4910 return log_error_errno(errno, "Failed to read user UID map pair: %m");
4911 if (l != sizeof(uid_t)*4)
4912 return log_full_errno(l == 0 ? LOG_DEBUG : LOG_WARNING,
4913 SYNTHETIC_ERRNO(EIO),
4914 "Short read while reading bind user UID pairs.");
4915 }
4916 }
b0067625
ZJS
4917 }
4918
8199d554
LP
4919 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
4920 /* The child let us know the support cgroup mode it might have read from the image. */
fefb7a6d 4921 l = recv(fd_outer_socket_pair[0], &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), 0);
8199d554
LP
4922 if (l < 0)
4923 return log_error_errno(errno, "Failed to read cgroup mode: %m");
c6147113 4924 if (l != sizeof(arg_unified_cgroup_hierarchy))
c0f86d66 4925 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading cgroup mode (%zi bytes).%s",
c6147113 4926 l, l == 0 ? " The child is most likely dead." : "");
8199d554
LP
4927 }
4928
b0067625 4929 /* Wait for the outer child. */
d2e0ac3d
LP
4930 r = wait_for_terminate_and_check("(sd-namespace)", *pid, WAIT_LOG_ABNORMAL);
4931 if (r < 0)
4932 return r;
4933 if (r != EXIT_SUCCESS)
4934 return -EIO;
b0067625
ZJS
4935
4936 /* And now retrieve the PID of the inner child. */
af06cd30 4937 l = recv(fd_outer_socket_pair[0], pid, sizeof *pid, 0);
b0067625
ZJS
4938 if (l < 0)
4939 return log_error_errno(errno, "Failed to read inner child PID: %m");
c6147113
LP
4940 if (l != sizeof *pid)
4941 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading inner child PID.");
b0067625
ZJS
4942
4943 /* We also retrieve container UUID in case it was generated by outer child */
af06cd30 4944 l = recv(fd_outer_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
b0067625
ZJS
4945 if (l < 0)
4946 return log_error_errno(errno, "Failed to read container machine ID: %m");
c6147113
LP
4947 if (l != sizeof(arg_uuid))
4948 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading container machined ID.");
b0067625
ZJS
4949
4950 /* We also retrieve the socket used for notifications generated by outer child */
af06cd30 4951 notify_socket = receive_one_fd(fd_outer_socket_pair[0], 0);
b0067625
ZJS
4952 if (notify_socket < 0)
4953 return log_error_errno(notify_socket,
4954 "Failed to receive notification socket from the outer child: %m");
4955
4956 log_debug("Init process invoked as PID "PID_FMT, *pid);
4957
4958 if (arg_userns_mode != USER_NAMESPACE_NO) {
c6147113
LP
4959 if (!barrier_place_and_sync(&barrier)) /* #1 */
4960 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
b0067625 4961
2f893044 4962 r = setup_uid_map(*pid, bind_user_uid, n_bind_user_uid);
b0067625
ZJS
4963 if (r < 0)
4964 return r;
4965
4966 (void) barrier_place(&barrier); /* #2 */
4967 }
4968
4969 if (arg_private_network) {
75116558
PS
4970 if (!arg_network_namespace_path) {
4971 /* Wait until the child has unshared its network namespace. */
c6147113
LP
4972 if (!barrier_place_and_sync(&barrier)) /* #3 */
4973 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early");
75116558
PS
4974 }
4975
5b4855ab
DDM
4976 if (child_netns_fd < 0) {
4977 /* Make sure we have an open file descriptor to the child's network
4978 * namespace so it stays alive even if the child exits. */
4979 r = namespace_open(*pid, NULL, NULL, &child_netns_fd, NULL, NULL);
4980 if (r < 0)
4981 return log_error_errno(r, "Failed to open child network namespace: %m");
4982 }
4983
4984 r = move_network_interfaces(child_netns_fd, arg_network_interfaces);
b0067625
ZJS
4985 if (r < 0)
4986 return r;
4987
4988 if (arg_network_veth) {
4989 r = setup_veth(arg_machine, *pid, veth_name,
4990 arg_network_bridge || arg_network_zone);
4991 if (r < 0)
4992 return r;
4993 else if (r > 0)
4994 ifi = r;
4995
4996 if (arg_network_bridge) {
4997 /* Add the interface to a bridge */
4998 r = setup_bridge(veth_name, arg_network_bridge, false);
4999 if (r < 0)
5000 return r;
5001 if (r > 0)
5002 ifi = r;
5003 } else if (arg_network_zone) {
5004 /* Add the interface to a bridge, possibly creating it */
5005 r = setup_bridge(veth_name, arg_network_zone, true);
5006 if (r < 0)
5007 return r;
5008 if (r > 0)
5009 ifi = r;
5010 }
5011 }
5012
5013 r = setup_veth_extra(arg_machine, *pid, arg_network_veth_extra);
5014 if (r < 0)
5015 return r;
5016
5017 /* We created the primary and extra veth links now; let's remember this, so that we know to
5018 remove them later on. Note that we don't bother with removing veth links that were created
5019 here when their setup failed half-way, because in that case the kernel should be able to
5020 remove them on its own, since they cannot be referenced by anything yet. */
5021 *veth_created = true;
5022
5023 r = setup_macvlan(arg_machine, *pid, arg_network_macvlan);
5024 if (r < 0)
5025 return r;
5026
5027 r = setup_ipvlan(arg_machine, *pid, arg_network_ipvlan);
5028 if (r < 0)
5029 return r;
5030 }
5031
abdb9b08
LP
5032 if (arg_register || !arg_keep_unit) {
5033 r = sd_bus_default_system(&bus);
5034 if (r < 0)
5035 return log_error_errno(r, "Failed to open system bus: %m");
e5a2d8b5
LP
5036
5037 r = sd_bus_set_close_on_exit(bus, false);
5038 if (r < 0)
5039 return log_error_errno(r, "Failed to disable close-on-exit behaviour: %m");
abdb9b08
LP
5040 }
5041
5042 if (!arg_keep_unit) {
5043 /* When a new scope is created for this container, then we'll be registered as its controller, in which
5044 * case PID 1 will send us a friendly RequestStop signal, when it is asked to terminate the
5045 * scope. Let's hook into that, and cleanly shut down the container, and print a friendly message. */
5046
75152a4d
LP
5047 r = sd_bus_match_signal_async(
5048 bus,
5049 NULL,
5050 "org.freedesktop.systemd1",
5051 NULL,
5052 "org.freedesktop.systemd1.Scope",
5053 "RequestStop",
5054 on_request_stop, NULL, PID_TO_PTR(*pid));
abdb9b08 5055 if (r < 0)
75152a4d 5056 return log_error_errno(r, "Failed to request RequestStop match: %m");
abdb9b08
LP
5057 }
5058
b0067625
ZJS
5059 if (arg_register) {
5060 r = register_machine(
abdb9b08 5061 bus,
b0067625
ZJS
5062 arg_machine,
5063 *pid,
5064 arg_directory,
5065 arg_uuid,
5066 ifi,
5067 arg_slice,
5068 arg_custom_mounts, arg_n_custom_mounts,
5069 arg_kill_signal,
5070 arg_property,
de40a303 5071 arg_property_message,
b0067625
ZJS
5072 arg_keep_unit,
5073 arg_container_service_name);
5074 if (r < 0)
5075 return r;
abdb9b08 5076
cd2dfc6f
LP
5077 } else if (!arg_keep_unit) {
5078 r = allocate_scope(
abdb9b08 5079 bus,
cd2dfc6f
LP
5080 arg_machine,
5081 *pid,
5082 arg_slice,
5083 arg_custom_mounts, arg_n_custom_mounts,
5084 arg_kill_signal,
de40a303
LP
5085 arg_property,
5086 arg_property_message);
cd2dfc6f
LP
5087 if (r < 0)
5088 return r;
5089
5090 } else if (arg_slice || arg_property)
5091 log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect.");
b0067625 5092
27da7ef0 5093 r = create_subcgroup(*pid, arg_keep_unit, arg_unified_cgroup_hierarchy);
b0067625
ZJS
5094 if (r < 0)
5095 return r;
5096
27da7ef0 5097 r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
720f0a2f
LP
5098 if (r < 0)
5099 return r;
b0067625 5100
de54e02d 5101 r = chown_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
b0067625
ZJS
5102 if (r < 0)
5103 return r;
5104
5105 /* Notify the child that the parent is ready with all
5106 * its setup (including cgroup-ification), and that
5107 * the child can now hand over control to the code to
5108 * run inside the container. */
75116558 5109 (void) barrier_place(&barrier); /* #4 */
b0067625
ZJS
5110
5111 /* Block SIGCHLD here, before notifying child.
5112 * process_pty() will handle it with the other signals. */
5113 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
5114
5115 /* Reset signal to default */
9c274488 5116 r = default_signals(SIGCHLD);
b0067625
ZJS
5117 if (r < 0)
5118 return log_error_errno(r, "Failed to reset SIGCHLD: %m");
5119
5120 r = sd_event_new(&event);
5121 if (r < 0)
5122 return log_error_errno(r, "Failed to get default event source: %m");
5123
8fd010bb
LP
5124 (void) sd_event_set_watchdog(event, true);
5125
abdb9b08
LP
5126 if (bus) {
5127 r = sd_bus_attach_event(bus, event, 0);
5128 if (r < 0)
5129 return log_error_errno(r, "Failed to attach bus to event loop: %m");
5130 }
5131
e96ceaba 5132 r = setup_notify_parent(event, notify_socket, PID_TO_PTR(*pid), &notify_event_source);
b0067625
ZJS
5133 if (r < 0)
5134 return r;
5135
b71a0192
CB
5136 if (arg_userns_mode != USER_NAMESPACE_NO) {
5137 r = wipe_fully_visible_fs(mntns_fd);
5138 if (r < 0)
5139 return r;
5140 mntns_fd = safe_close(mntns_fd);
5141 }
5142
b0067625 5143 /* Let the child know that we are ready and wait that the child is completely ready now. */
c6147113
LP
5144 if (!barrier_place_and_sync(&barrier)) /* #5 */
5145 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
b0067625 5146
38ccb557 5147 /* At this point we have made use of the UID we picked, and thus nss-systemd/systemd-machined.service
b0067625
ZJS
5148 * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
5149 etc_passwd_lock = safe_close(etc_passwd_lock);
5150
04f590a4
LP
5151 (void) sd_notifyf(false,
5152 "STATUS=Container running.\n"
5153 "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
4bf4f50f
ZJS
5154 if (!arg_notify_ready) {
5155 r = sd_notify(false, "READY=1\n");
5156 if (r < 0)
5157 log_warning_errno(r, "Failed to send readiness notification, ignoring: %m");
5158 }
b0067625
ZJS
5159
5160 if (arg_kill_signal > 0) {
5161 /* Try to kill the init system on SIGINT or SIGTERM */
919f5ae0
LP
5162 (void) sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
5163 (void) sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
b0067625
ZJS
5164 } else {
5165 /* Immediately exit */
919f5ae0
LP
5166 (void) sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
5167 (void) sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
b0067625
ZJS
5168 }
5169
988851b6
LP
5170 (void) sd_event_add_signal(event, NULL, SIGRTMIN+18, sigrtmin18_handler, NULL);
5171
5172 r = sd_event_add_memory_pressure(event, NULL, NULL, NULL);
5173 if (r < 0)
5174 log_debug_errno(r, "Failed allocate memory pressure event source, ignoring: %m");
5175
6916b164 5176 /* Exit when the child exits */
919f5ae0 5177 (void) sd_event_add_signal(event, NULL, SIGCHLD, on_sigchld, PID_TO_PTR(*pid));
b0067625 5178
b07ee903
CB
5179 /* Retrieve the kmsg fifo allocated by inner child */
5180 fd_kmsg_fifo = receive_one_fd(fd_inner_socket_pair[0], 0);
5181 if (fd_kmsg_fifo < 0)
5182 return log_error_errno(fd_kmsg_fifo, "Failed to receive kmsg fifo from inner child: %m");
5183
b0067625 5184 if (arg_expose_ports) {
b07ee903 5185 r = expose_port_watch_rtnl(event, fd_inner_socket_pair[0], on_address_change, expose_args, &rtnl);
b0067625
ZJS
5186 if (r < 0)
5187 return r;
5188
deff68e7
FW
5189 (void) expose_port_execute(rtnl, &expose_args->fw_ctx, arg_expose_ports, AF_INET, &expose_args->address4);
5190 (void) expose_port_execute(rtnl, &expose_args->fw_ctx, arg_expose_ports, AF_INET6, &expose_args->address6);
b0067625
ZJS
5191 }
5192
3acc84eb 5193 if (arg_console_mode != CONSOLE_PIPE) {
254d1313 5194 _cleanup_close_ int fd = -EBADF;
3acc84eb 5195 PTYForwardFlags flags = 0;
de40a303 5196
3acc84eb 5197 /* Retrieve the master pty allocated by inner child */
bb1aa185 5198 fd = receive_one_fd(fd_inner_socket_pair[0], 0);
3acc84eb
FB
5199 if (fd < 0)
5200 return log_error_errno(fd, "Failed to receive master pty from the inner child: %m");
5201
5202 switch (arg_console_mode) {
de40a303 5203
3acc84eb
FB
5204 case CONSOLE_READ_ONLY:
5205 flags |= PTY_FORWARD_READ_ONLY;
5206
5207 _fallthrough_;
5208
5209 case CONSOLE_INTERACTIVE:
5210 flags |= PTY_FORWARD_IGNORE_VHANGUP;
5211
5212 r = pty_forward_new(event, fd, flags, &forward);
5213 if (r < 0)
5214 return log_error_errno(r, "Failed to create PTY forwarder: %m");
5215
f5fbe71d 5216 if (arg_console_width != UINT_MAX || arg_console_height != UINT_MAX)
3acc84eb
FB
5217 (void) pty_forward_set_width_height(forward,
5218 arg_console_width,
5219 arg_console_height);
5220 break;
5221
5222 default:
5223 assert(arg_console_mode == CONSOLE_PASSIVE);
5224 }
5225
5226 *master = TAKE_FD(fd);
de40a303 5227 }
b0067625 5228
5d9d3fcb
CB
5229 fd_inner_socket_pair[0] = safe_close(fd_inner_socket_pair[0]);
5230
b0067625
ZJS
5231 r = sd_event_loop(event);
5232 if (r < 0)
5233 return log_error_errno(r, "Failed to run event loop: %m");
5234
de40a303
LP
5235 if (forward) {
5236 char last_char = 0;
b0067625 5237
de40a303
LP
5238 (void) pty_forward_get_last_char(forward, &last_char);
5239 forward = pty_forward_free(forward);
b0067625 5240
de40a303
LP
5241 if (!arg_quiet && last_char != '\n')
5242 putc('\n', stdout);
5243 }
b0067625
ZJS
5244
5245 /* Kill if it is not dead yet anyway */
0bb0a9fa
ZJS
5246 if (!arg_register && !arg_keep_unit && bus)
5247 terminate_scope(bus, arg_machine);
b0067625
ZJS
5248
5249 /* Normally redundant, but better safe than sorry */
c67b0082 5250 (void) kill(*pid, SIGKILL);
b0067625 5251
5d9d3fcb
CB
5252 fd_kmsg_fifo = safe_close(fd_kmsg_fifo);
5253
5b4855ab
DDM
5254 if (arg_private_network) {
5255 /* Move network interfaces back to the parent network namespace. We use `safe_fork`
5256 * to avoid having to move the parent to the child network namespace. */
5257 r = safe_fork(NULL, FORK_RESET_SIGNALS|FORK_DEATHSIG|FORK_WAIT|FORK_LOG, NULL);
5258 if (r < 0)
5259 return r;
5260
5261 if (r == 0) {
254d1313 5262 _cleanup_close_ int parent_netns_fd = -EBADF;
5b4855ab
DDM
5263
5264 r = namespace_open(getpid(), NULL, NULL, &parent_netns_fd, NULL, NULL);
5265 if (r < 0) {
5266 log_error_errno(r, "Failed to open parent network namespace: %m");
5267 _exit(EXIT_FAILURE);
5268 }
5269
5270 r = namespace_enter(-1, -1, child_netns_fd, -1, -1);
5271 if (r < 0) {
5272 log_error_errno(r, "Failed to enter child network namespace: %m");
5273 _exit(EXIT_FAILURE);
5274 }
5275
2f091b1b
TM
5276 /* Reverse network interfaces pair list so that interfaces get their initial name back.
5277 * This is about ensuring interfaces get their old name back when being moved back. */
5278 arg_network_interfaces = strv_reverse(arg_network_interfaces);
5279
5b4855ab
DDM
5280 r = move_network_interfaces(parent_netns_fd, arg_network_interfaces);
5281 if (r < 0)
5282 log_error_errno(r, "Failed to move network interfaces back to parent network namespace: %m");
5283
5284 _exit(r < 0 ? EXIT_FAILURE : EXIT_SUCCESS);
5285 }
5286 }
5287
8f03de53 5288 r = wait_for_container(TAKE_PID(*pid), &container_status);
b0067625 5289
0bb0a9fa
ZJS
5290 /* Tell machined that we are gone. */
5291 if (bus)
5292 (void) unregister_machine(bus, arg_machine);
5293
b0067625
ZJS
5294 if (r < 0)
5295 /* We failed to wait for the container, or the container exited abnormally. */
5296 return r;
5297 if (r > 0 || container_status == CONTAINER_TERMINATED) {
27e29a1e
ZJS
5298 /* r > 0 → The container exited with a non-zero status.
5299 * As a special case, we need to replace 133 with a different value,
5300 * because 133 is special-cased in the service file to reboot the container.
5301 * otherwise → The container exited with zero status and a reboot was not requested.
5302 */
2a49b612 5303 if (r == EXIT_FORCE_RESTART)
27e29a1e 5304 r = EXIT_FAILURE; /* replace 133 with the general failure code */
b0067625 5305 *ret = r;
b0067625
ZJS
5306 return 0; /* finito */
5307 }
5308
5309 /* CONTAINER_REBOOTED, loop again */
5310
5311 if (arg_keep_unit) {
5312 /* Special handling if we are running as a service: instead of simply
5313 * restarting the machine we want to restart the entire service, so let's
5314 * inform systemd about this with the special exit code 133. The service
5315 * file uses RestartForceExitStatus=133 so that this results in a full
5316 * nspawn restart. This is necessary since we might have cgroup parameters
5317 * set we want to have flushed out. */
2a49b612
ZJS
5318 *ret = EXIT_FORCE_RESTART;
5319 return 0; /* finito */
b0067625
ZJS
5320 }
5321
deff68e7
FW
5322 expose_port_flush(&expose_args->fw_ctx, arg_expose_ports, AF_INET, &expose_args->address4);
5323 expose_port_flush(&expose_args->fw_ctx, arg_expose_ports, AF_INET6, &expose_args->address6);
b0067625
ZJS
5324
5325 (void) remove_veth_links(veth_name, arg_network_veth_extra);
5326 *veth_created = false;
5327 return 1; /* loop again */
5328}
5329
bf428efb 5330static int initialize_rlimits(void) {
852b6250 5331 /* The default resource limits the kernel passes to PID 1, as per kernel 5.16. Let's pass our container payload
bf428efb
LP
5332 * the same values as the kernel originally passed to PID 1, in order to minimize differences between host and
5333 * container execution environments. */
5334
5335 static const struct rlimit kernel_defaults[_RLIMIT_MAX] = {
852b6250
LP
5336 [RLIMIT_AS] = { RLIM_INFINITY, RLIM_INFINITY },
5337 [RLIMIT_CORE] = { 0, RLIM_INFINITY },
5338 [RLIMIT_CPU] = { RLIM_INFINITY, RLIM_INFINITY },
5339 [RLIMIT_DATA] = { RLIM_INFINITY, RLIM_INFINITY },
5340 [RLIMIT_FSIZE] = { RLIM_INFINITY, RLIM_INFINITY },
5341 [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY },
5342 [RLIMIT_MEMLOCK] = { DEFAULT_RLIMIT_MEMLOCK, DEFAULT_RLIMIT_MEMLOCK },
5343 [RLIMIT_MSGQUEUE] = { 819200, 819200 },
5344 [RLIMIT_NICE] = { 0, 0 },
5345 [RLIMIT_NOFILE] = { 1024, 4096 },
5346 [RLIMIT_RSS] = { RLIM_INFINITY, RLIM_INFINITY },
5347 [RLIMIT_RTPRIO] = { 0, 0 },
5348 [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY },
5349 [RLIMIT_STACK] = { 8388608, RLIM_INFINITY },
bf428efb
LP
5350
5351 /* The kernel scales the default for RLIMIT_NPROC and RLIMIT_SIGPENDING based on the system's amount of
5352 * RAM. To provide best compatibility we'll read these limits off PID 1 instead of hardcoding them
5353 * here. This is safe as we know that PID 1 doesn't change these two limits and thus the original
5354 * kernel's initialization should still be valid during runtime — at least if PID 1 is systemd. Note
5355 * that PID 1 changes a number of other resource limits during early initialization which is why we
5356 * don't read the other limits from PID 1 but prefer the static table above. */
5357 };
5358
5359 int rl;
5360
5361 for (rl = 0; rl < _RLIMIT_MAX; rl++) {
bf428efb
LP
5362 /* Let's only fill in what the user hasn't explicitly configured anyway */
5363 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)) == 0) {
5364 const struct rlimit *v;
5365 struct rlimit buffer;
5366
5367 if (IN_SET(rl, RLIMIT_NPROC, RLIMIT_SIGPENDING)) {
5368 /* For these two let's read the limits off PID 1. See above for an explanation. */
5369
5370 if (prlimit(1, rl, NULL, &buffer) < 0)
5371 return log_error_errno(errno, "Failed to read resource limit RLIMIT_%s of PID 1: %m", rlimit_to_string(rl));
5372
dbf1aca6
LP
5373 v = &buffer;
5374 } else if (rl == RLIMIT_NOFILE) {
5375 /* We nowadays bump RLIMIT_NOFILE's hard limit early in PID 1 for all
5376 * userspace. Given that nspawn containers are often run without our PID 1,
5377 * let's grant the containers a raised RLIMIT_NOFILE hard limit by default,
5378 * so that container userspace gets similar resources as host userspace
5379 * gets. */
5380 buffer = kernel_defaults[rl];
5381 buffer.rlim_max = MIN((rlim_t) read_nr_open(), (rlim_t) HIGH_RLIMIT_NOFILE);
bf428efb
LP
5382 v = &buffer;
5383 } else
5384 v = kernel_defaults + rl;
5385
5386 arg_rlimit[rl] = newdup(struct rlimit, v, 1);
5387 if (!arg_rlimit[rl])
5388 return log_oom();
5389 }
5390
5391 if (DEBUG_LOGGING) {
5392 _cleanup_free_ char *k = NULL;
5393
5394 (void) rlimit_format(arg_rlimit[rl], &k);
5395 log_debug("Setting RLIMIT_%s to %s.", rlimit_to_string(rl), k);
5396 }
5397 }
5398
5399 return 0;
5400}
5401
287b7376 5402static int cant_be_in_netns(void) {
254d1313 5403 _cleanup_close_ int fd = -EBADF;
287b7376
LP
5404 struct ucred ucred;
5405 int r;
5406
5407 /* Check if we are in the same netns as udev. If we aren't, then device monitoring (and thus waiting
5408 * for loopback block devices) won't work, and we will hang. Detect this case and exit early with a
5409 * nice message. */
5410
5411 if (!arg_image) /* only matters if --image= us used, i.e. we actually need to use loopback devices */
5412 return 0;
5413
5414 fd = socket(AF_UNIX, SOCK_SEQPACKET|SOCK_NONBLOCK|SOCK_CLOEXEC, 0);
5415 if (fd < 0)
5416 return log_error_errno(errno, "Failed to allocate udev control socket: %m");
5417
1861986a
LP
5418 r = connect_unix_path(fd, AT_FDCWD, "/run/udev/control");
5419 if (r < 0) {
5420 if (r == -ENOENT || ERRNO_IS_DISCONNECT(r))
287b7376
LP
5421 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
5422 "Sorry, but --image= requires access to the host's /run/ hierarchy, since we need access to udev.");
5423
1861986a 5424 return log_error_errno(r, "Failed to connect socket to udev control socket: %m");
287b7376
LP
5425 }
5426
5427 r = getpeercred(fd, &ucred);
5428 if (r < 0)
5429 return log_error_errno(r, "Failed to determine peer of udev control socket: %m");
5430
f7a2dc3d 5431 r = in_same_namespace(ucred.pid, 0, NAMESPACE_NET);
287b7376 5432 if (r < 0)
f7a2dc3d
CB
5433 return log_error_errno(r, "Failed to determine network namespace of udev: %m");
5434 if (r == 0)
287b7376
LP
5435 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
5436 "Sorry, but --image= is only supported in the main network namespace, since we need access to udev/AF_NETLINK.");
5437 return 0;
5438}
5439
44dbef90 5440static int run(int argc, char *argv[]) {
4c27749b 5441 bool remove_directory = false, remove_image = false, veth_created = false, remove_tmprootdir = false;
5bb1d7fb 5442 _cleanup_close_ int master = -EBADF;
03cfe0d5 5443 _cleanup_fdset_free_ FDSet *fds = NULL;
2d845785 5444 int r, n_fd_passed, ret = EXIT_SUCCESS;
5aa3eba5 5445 char veth_name[IFNAMSIZ] = "";
761cf19d 5446 struct ExposeArgs expose_args = {};
8e766630 5447 _cleanup_(release_lock_file) LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
c67b0082 5448 char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
2d845785 5449 _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
18b5886e 5450 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
761cf19d 5451 _cleanup_(fw_ctx_freep) FirewallContext *fw_ctx = NULL;
7bf011e3 5452 pid_t pid = 0;
03cfe0d5
LP
5453
5454 log_parse_environment();
5455 log_open();
415fc41c 5456
03cfe0d5
LP
5457 r = parse_argv(argc, argv);
5458 if (r <= 0)
5459 goto finish;
5460
38ee19c0
ZJS
5461 if (geteuid() != 0) {
5462 r = log_warning_errno(SYNTHETIC_ERRNO(EPERM),
5463 argc >= 2 ? "Need to be root." :
5464 "Need to be root (and some arguments are usually required).\nHint: try --help");
03cfe0d5 5465 goto finish;
38ee19c0 5466 }
fba868fa 5467
287b7376
LP
5468 r = cant_be_in_netns();
5469 if (r < 0)
5470 goto finish;
5471
bf428efb
LP
5472 r = initialize_rlimits();
5473 if (r < 0)
5474 goto finish;
5475
de40a303
LP
5476 r = load_oci_bundle();
5477 if (r < 0)
5478 goto finish;
5479
f757855e
LP
5480 r = determine_names();
5481 if (r < 0)
5482 goto finish;
5483
5484 r = load_settings();
5485 if (r < 0)
5486 goto finish;
5487
d4d99bc6 5488 r = cg_unified();
5eee8290
LP
5489 if (r < 0) {
5490 log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
5491 goto finish;
5492 }
5493
f757855e
LP
5494 r = verify_arguments();
5495 if (r < 0)
5496 goto finish;
03cfe0d5 5497
2f091b1b
TM
5498 r = verify_network_interfaces_initialized();
5499 if (r < 0)
5500 goto finish;
5501
49048684
ZJS
5502 /* Reapply environment settings. */
5503 (void) detect_unified_cgroup_hierarchy_from_environment();
8199d554 5504
2949ff26
LP
5505 /* Ignore SIGPIPE here, because we use splice() on the ptyfwd stuff and that will generate SIGPIPE if
5506 * the result is closed. Note that the container payload child will reset signal mask+handler anyway,
5507 * so just turning this off here means we only turn it off in nspawn itself, not any children. */
9c274488 5508 (void) ignore_signals(SIGPIPE);
2949ff26 5509
03cfe0d5
LP
5510 n_fd_passed = sd_listen_fds(false);
5511 if (n_fd_passed > 0) {
5512 r = fdset_new_listen_fds(&fds, false);
5513 if (r < 0) {
5514 log_error_errno(r, "Failed to collect file descriptors: %m");
5515 goto finish;
5516 }
5517 }
5518
83e803a9
ZJS
5519 /* The "default" umask. This is appropriate for most file and directory
5520 * operations performed by nspawn, and is the umask that will be used for
5521 * the child. Functions like copy_devnodes() change the umask temporarily. */
5522 umask(0022);
5523
03cfe0d5
LP
5524 if (arg_directory) {
5525 assert(!arg_image);
5526
b35ca61a
LP
5527 /* Safety precaution: let's not allow running images from the live host OS image, as long as
5528 * /var from the host will propagate into container dynamically (because bad things happen if
5529 * two systems write to the same /var). Let's allow it for the special cases where /var is
5530 * either copied (i.e. --ephemeral) or replaced (i.e. --volatile=yes|state). */
5531 if (path_equal(arg_directory, "/") && !(arg_ephemeral || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_STATE))) {
1406bd66
LP
5532 r = log_error_errno(SYNTHETIC_ERRNO(EINVAL),
5533 "Spawning container on root directory is not supported. Consider using --ephemeral, --volatile=yes or --volatile=state.");
03cfe0d5
LP
5534 goto finish;
5535 }
5536
5537 if (arg_ephemeral) {
5538 _cleanup_free_ char *np = NULL;
5539
f461a28d 5540 r = chase_and_update(&arg_directory, 0);
3f342ec4
LP
5541 if (r < 0)
5542 goto finish;
5543
7bf011e3
LP
5544 /* If the specified path is a mount point we generate the new snapshot immediately
5545 * inside it under a random name. However if the specified is not a mount point we
5546 * create the new snapshot in the parent directory, just next to it. */
e1873695 5547 r = path_is_mount_point(arg_directory, NULL, 0);
03cfe0d5
LP
5548 if (r < 0) {
5549 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
5550 goto finish;
5551 }
5552 if (r > 0)
770b5ce4 5553 r = tempfn_random_child(arg_directory, "machine.", &np);
03cfe0d5 5554 else
770b5ce4 5555 r = tempfn_random(arg_directory, "machine.", &np);
03cfe0d5 5556 if (r < 0) {
0f3be6ca 5557 log_error_errno(r, "Failed to generate name for directory snapshot: %m");
03cfe0d5
LP
5558 goto finish;
5559 }
5560
6992459c 5561 /* We take an exclusive lock on this image, since it's our private, ephemeral copy
162392b7 5562 * only owned by us and no one else. */
6992459c 5563 r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
03cfe0d5
LP
5564 if (r < 0) {
5565 log_error_errno(r, "Failed to lock %s: %m", np);
5566 goto finish;
5567 }
5568
7bf011e3
LP
5569 {
5570 BLOCK_SIGNALS(SIGINT);
5571 r = btrfs_subvol_snapshot(arg_directory, np,
5572 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
5573 BTRFS_SNAPSHOT_FALLBACK_COPY |
5574 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
5575 BTRFS_SNAPSHOT_RECURSIVE |
5576 BTRFS_SNAPSHOT_QUOTA |
5577 BTRFS_SNAPSHOT_SIGINT);
5578 }
5579 if (r == -EINTR) {
5580 log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", np);
5581 goto finish;
5582 }
03cfe0d5
LP
5583 if (r < 0) {
5584 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
5585 goto finish;
ec16945e
LP
5586 }
5587
1cc6c93a 5588 free_and_replace(arg_directory, np);
17cbb288 5589 remove_directory = true;
30535c16 5590 } else {
f461a28d 5591 r = chase_and_update(&arg_directory, arg_template ? CHASE_NONEXISTENT : 0);
8d4aa2bb
LP
5592 if (r < 0)
5593 goto finish;
5594
30535c16
LP
5595 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
5596 if (r == -EBUSY) {
5597 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
5598 goto finish;
5599 }
5600 if (r < 0) {
5601 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
476b8254 5602 goto finish;
30535c16
LP
5603 }
5604
5605 if (arg_template) {
f461a28d 5606 r = chase_and_update(&arg_template, 0);
3f342ec4
LP
5607 if (r < 0)
5608 goto finish;
5609
7bf011e3
LP
5610 {
5611 BLOCK_SIGNALS(SIGINT);
5612 r = btrfs_subvol_snapshot(arg_template, arg_directory,
5613 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
5614 BTRFS_SNAPSHOT_FALLBACK_COPY |
5615 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
5616 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
5617 BTRFS_SNAPSHOT_RECURSIVE |
5618 BTRFS_SNAPSHOT_QUOTA |
5619 BTRFS_SNAPSHOT_SIGINT);
5620 }
ff6c6cc1
LP
5621 if (r == -EEXIST)
5622 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
5623 "Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
7bf011e3
LP
5624 else if (r == -EINTR) {
5625 log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", arg_directory);
5626 goto finish;
5627 } else if (r < 0) {
83521414 5628 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
30535c16 5629 goto finish;
ff6c6cc1
LP
5630 } else
5631 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
5632 "Populated %s from template %s.", arg_directory, arg_template);
30535c16 5633 }
ec16945e
LP
5634 }
5635
7732f92b 5636 if (arg_start_mode == START_BOOT) {
aff7ae0d 5637 _cleanup_free_ char *b = NULL;
a5201ed6 5638 const char *p;
c9fe05e0 5639
aff7ae0d
LP
5640 if (arg_pivot_root_new) {
5641 b = path_join(arg_directory, arg_pivot_root_new);
5642 if (!b)
5643 return log_oom();
5644
5645 p = b;
5646 } else
a5201ed6 5647 p = arg_directory;
c9fe05e0
AR
5648
5649 if (path_is_os_tree(p) <= 0) {
aff7ae0d
LP
5650 r = log_error_errno(SYNTHETIC_ERRNO(EINVAL),
5651 "Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", p);
1b9e5b12
LP
5652 goto finish;
5653 }
5654 } else {
aff7ae0d 5655 _cleanup_free_ char *p = NULL;
c9fe05e0 5656
a5201ed6 5657 if (arg_pivot_root_new)
aff7ae0d 5658 p = path_join(arg_directory, arg_pivot_root_new, "/usr/");
a5201ed6 5659 else
aff7ae0d
LP
5660 p = path_join(arg_directory, "/usr/");
5661 if (!p)
5662 return log_oom();
1b9e5b12 5663
aff7ae0d
LP
5664 if (laccess(p, F_OK) < 0) {
5665 r = log_error_errno(SYNTHETIC_ERRNO(EINVAL),
5666 "Directory %s doesn't look like it has an OS tree (/usr/ directory is missing). Refusing.", arg_directory);
1b9e5b12 5667 goto finish;
1b9e5b12
LP
5668 }
5669 }
ec16945e 5670
6b9132a9 5671 } else {
d04faa4e 5672 DissectImageFlags dissect_image_flags =
4b5de5dd 5673 DISSECT_IMAGE_GENERIC_ROOT |
d04faa4e
LP
5674 DISSECT_IMAGE_REQUIRE_ROOT |
5675 DISSECT_IMAGE_RELAX_VAR_CHECK |
73d88b80
LP
5676 DISSECT_IMAGE_USR_NO_ROOT |
5677 DISSECT_IMAGE_ADD_PARTITION_DEVICES |
5678 DISSECT_IMAGE_PIN_PARTITION_DEVICES;
ec16945e
LP
5679 assert(arg_image);
5680 assert(!arg_template);
5681
f461a28d 5682 r = chase_and_update(&arg_image, 0);
3f342ec4
LP
5683 if (r < 0)
5684 goto finish;
5685
0f3be6ca
LP
5686 if (arg_ephemeral) {
5687 _cleanup_free_ char *np = NULL;
5688
5689 r = tempfn_random(arg_image, "machine.", &np);
5690 if (r < 0) {
5691 log_error_errno(r, "Failed to generate name for image snapshot: %m");
5692 goto finish;
5693 }
5694
6992459c
LP
5695 /* Always take an exclusive lock on our own ephemeral copy. */
5696 r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
0f3be6ca
LP
5697 if (r < 0) {
5698 r = log_error_errno(r, "Failed to create image lock: %m");
5699 goto finish;
5700 }
5701
7bf011e3
LP
5702 {
5703 BLOCK_SIGNALS(SIGINT);
7c2f5495
DDM
5704 r = copy_file_full(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600,
5705 FS_NOCOW_FL, FS_NOCOW_FL,
5706 COPY_REFLINK|COPY_CRTIME|COPY_SIGINT,
5707 NULL, NULL);
7bf011e3
LP
5708 }
5709 if (r == -EINTR) {
5710 log_error_errno(r, "Interrupted while copying image file to %s, removed again.", np);
5711 goto finish;
5712 }
0f3be6ca
LP
5713 if (r < 0) {
5714 r = log_error_errno(r, "Failed to copy image file: %m");
5715 goto finish;
5716 }
5717
1cc6c93a 5718 free_and_replace(arg_image, np);
0f3be6ca
LP
5719 remove_image = true;
5720 } else {
5721 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
5722 if (r == -EBUSY) {
5723 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
5724 goto finish;
5725 }
5726 if (r < 0) {
5727 r = log_error_errno(r, "Failed to create image lock: %m");
5728 goto finish;
5729 }
4623e8e6 5730
89e62e0b
LP
5731 r = verity_settings_load(
5732 &arg_verity_settings,
5733 arg_image, NULL, NULL);
e7cbe5cb
LB
5734 if (r < 0) {
5735 log_error_errno(r, "Failed to read verity artefacts for %s: %m", arg_image);
5736 goto finish;
78ebe980 5737 }
89e62e0b
LP
5738
5739 if (arg_verity_settings.data_path)
5740 dissect_image_flags |= DISSECT_IMAGE_NO_PARTITION_TABLE;
30535c16
LP
5741 }
5742
c67b0082 5743 if (!mkdtemp(tmprootdir)) {
0f3be6ca 5744 r = log_error_errno(errno, "Failed to create temporary directory: %m");
6b9132a9 5745 goto finish;
1b9e5b12 5746 }
6b9132a9 5747
c67b0082
LP
5748 remove_tmprootdir = true;
5749
5750 arg_directory = strdup(tmprootdir);
1b9e5b12
LP
5751 if (!arg_directory) {
5752 r = log_oom();
5753 goto finish;
6b9132a9 5754 }
88213476 5755
89e62e0b
LP
5756 r = loop_device_make_by_path(
5757 arg_image,
5758 arg_read_only ? O_RDONLY : O_RDWR,
22ee78a8 5759 /* sector_size= */ UINT32_MAX,
89e62e0b 5760 FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN,
7f52206a 5761 LOCK_SH,
89e62e0b 5762 &loop);
2d845785
LP
5763 if (r < 0) {
5764 log_error_errno(r, "Failed to set up loopback block device: %m");
842f3b0f
LP
5765 goto finish;
5766 }
1b9e5b12 5767
bad31660 5768 r = dissect_loop_device_and_warn(
bad31660 5769 loop,
89e62e0b 5770 &arg_verity_settings,
84be0c71
LP
5771 /* mount_options=*/ NULL,
5772 arg_image_policy ?: &image_policy_container,
e7cbe5cb 5773 dissect_image_flags,
e0f9e7bd 5774 &dissected_image);
2d845785 5775 if (r == -ENOPKG) {
4526113f 5776 /* dissected_image_and_warn() already printed a brief error message. Extend on that with more details */
2d845785
LP
5777 log_notice("Note that the disk image needs to\n"
5778 " a) either contain only a single MBR partition of type 0x83 that is marked bootable\n"
5779 " b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n"
db811444 5780 " c) or follow https://uapi-group.org/specifications/specs/discoverable_partitions_specification\n"
2d845785
LP
5781 " d) or contain a file system without a partition table\n"
5782 "in order to be bootable with systemd-nspawn.");
1b9e5b12 5783 goto finish;
2d845785 5784 }
4526113f 5785 if (r < 0)
842f3b0f 5786 goto finish;
1b9e5b12 5787
88b3300f
LP
5788 r = dissected_image_load_verity_sig_partition(
5789 dissected_image,
5790 loop->fd,
5791 &arg_verity_settings);
5792 if (r < 0)
5793 goto finish;
5794
8ee9615e
LP
5795 if (dissected_image->has_verity && !arg_verity_settings.root_hash && !dissected_image->has_verity_sig)
5796 log_notice("Note: image %s contains verity information, but no root hash specified and no embedded "
5797 "root hash signature found! Proceeding without integrity checking.", arg_image);
4623e8e6 5798
89e62e0b
LP
5799 r = dissected_image_decrypt_interactively(
5800 dissected_image,
5801 NULL,
5802 &arg_verity_settings,
e330f97a 5803 0);
1b9e5b12
LP
5804 if (r < 0)
5805 goto finish;
0f3be6ca
LP
5806
5807 /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */
5808 if (remove_image && unlink(arg_image) >= 0)
5809 remove_image = false;
4c27749b
LP
5810
5811 if (arg_architecture < 0)
5812 arg_architecture = dissected_image_architecture(dissected_image);
842f3b0f 5813 }
842f3b0f 5814
86c0dd4a 5815 r = custom_mount_prepare_all(arg_directory, arg_custom_mounts, arg_n_custom_mounts);
5a8af538
LP
5816 if (r < 0)
5817 goto finish;
5818
de40a303
LP
5819 if (arg_console_mode < 0)
5820 arg_console_mode =
5821 isatty(STDIN_FILENO) > 0 &&
5822 isatty(STDOUT_FILENO) > 0 ? CONSOLE_INTERACTIVE : CONSOLE_READ_ONLY;
9c857b9d 5823
de40a303
LP
5824 if (arg_console_mode == CONSOLE_PIPE) /* if we pass STDERR on to the container, don't add our own logs into it too */
5825 arg_quiet = true;
a258bf26 5826
9c857b9d 5827 if (!arg_quiet)
c85c2f79 5828 log_info("Spawning container %s on %s.\nPress Ctrl-] three times within 1s to kill container.",
9c857b9d
LP
5829 arg_machine, arg_image ?: arg_directory);
5830
988851b6 5831 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, SIGRTMIN+18, -1) >= 0);
a258bf26 5832
66edd963 5833 if (prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) < 0) {
03cfe0d5
LP
5834 r = log_error_errno(errno, "Failed to become subreaper: %m");
5835 goto finish;
5836 }
5837
761cf19d
FW
5838 if (arg_expose_ports) {
5839 r = fw_ctx_new(&fw_ctx);
5840 if (r < 0) {
5841 log_error_errno(r, "Cannot expose configured ports, firewall initialization failed: %m");
5842 goto finish;
5843 }
5844 expose_args.fw_ctx = fw_ctx;
5845 }
d87be9b0 5846 for (;;) {
3acc84eb 5847 r = run_container(dissected_image,
44dbef90
LP
5848 fds,
5849 veth_name, &veth_created,
761cf19d 5850 &expose_args, &master,
44dbef90 5851 &pid, &ret);
b0067625 5852 if (r <= 0)
d87be9b0 5853 break;
d87be9b0 5854 }
88213476
LP
5855
5856finish:
04f590a4
LP
5857 (void) sd_notify(false,
5858 r == 0 && ret == EXIT_FORCE_RESTART ? "STOPPING=1\nSTATUS=Restarting..." :
5859 "STOPPING=1\nSTATUS=Terminating...");
af4ec430 5860
9444b1f2 5861 if (pid > 0)
c67b0082 5862 (void) kill(pid, SIGKILL);
88213476 5863
503546da 5864 /* Try to flush whatever is still queued in the pty */
6a0f896b 5865 if (master >= 0) {
f5fbe71d 5866 (void) copy_bytes(master, STDOUT_FILENO, UINT64_MAX, 0);
6a0f896b
LP
5867 master = safe_close(master);
5868 }
5869
5870 if (pid > 0)
5871 (void) wait_for_terminate(pid, NULL);
503546da 5872
50ebcf6c
LP
5873 pager_close();
5874
17cbb288 5875 if (remove_directory && arg_directory) {
ec16945e
LP
5876 int k;
5877
17cbb288 5878 k = rm_rf(arg_directory, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
ec16945e 5879 if (k < 0)
17cbb288 5880 log_warning_errno(k, "Cannot remove '%s', ignoring: %m", arg_directory);
ec16945e
LP
5881 }
5882
0f3be6ca
LP
5883 if (remove_image && arg_image) {
5884 if (unlink(arg_image) < 0)
5885 log_warning_errno(errno, "Can't remove image file '%s', ignoring: %m", arg_image);
5886 }
5887
c67b0082
LP
5888 if (remove_tmprootdir) {
5889 if (rmdir(tmprootdir) < 0)
5890 log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir);
5891 }
5892
785890ac
LP
5893 if (arg_machine) {
5894 const char *p;
5895
63c372cb 5896 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
c6878637 5897 (void) rm_rf(p, REMOVE_ROOT);
785890ac
LP
5898 }
5899
deff68e7
FW
5900 expose_port_flush(&fw_ctx, arg_expose_ports, AF_INET, &expose_args.address4);
5901 expose_port_flush(&fw_ctx, arg_expose_ports, AF_INET6, &expose_args.address6);
7513c5b8
LP
5902
5903 if (veth_created)
5904 (void) remove_veth_links(veth_name, arg_network_veth_extra);
22b28dfd 5905 (void) remove_bridge(arg_network_zone);
f757855e 5906
f757855e
LP
5907 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
5908 expose_port_free_all(arg_expose_ports);
bf428efb 5909 rlimit_free_all(arg_rlimit);
b2645747 5910 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
3652872a 5911 credential_free_all(arg_credentials, arg_n_credentials);
6d0b55c2 5912
44dbef90
LP
5913 if (r < 0)
5914 return r;
5915
5916 return ret;
88213476 5917}
44dbef90
LP
5918
/* Program entry point: passes run() to the main-function helper macro.
 * run() returns its negative error code r when r < 0, and otherwise the
 * 'ret' value that run_container() filled in.
 * NOTE(review): presumably this macro defines main() and treats positive
 * return values from run() as failure exit codes — confirm against the
 * macro's definition, which is not visible here. */
5919DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run);