]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
fileio: port read_file_full() to use connect_unix_path()
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
db9ecf05 1/* SPDX-License-Identifier: LGPL-2.1-or-later */
88213476 2
349cc4a5 3#if HAVE_BLKID
8fe0087e 4#endif
88213476 5#include <errno.h>
88213476 6#include <getopt.h>
503f480f 7#include <linux/fs.h>
1b9e5b12 8#include <linux/loop.h>
349cc4a5 9#if HAVE_SELINUX
8fe0087e 10#include <selinux/selinux.h>
1b9e5b12 11#endif
8fe0087e 12#include <stdlib.h>
8fe0087e 13#include <sys/file.h>
335d2ead 14#include <sys/ioctl.h>
8fe0087e
LP
15#include <sys/personality.h>
16#include <sys/prctl.h>
17#include <sys/types.h>
6916b164 18#include <sys/wait.h>
335d2ead 19#include <termios.h>
8fe0087e 20#include <unistd.h>
1b9e5b12 21
b053cd5f 22#include "sd-bus.h"
1f0cd86b 23#include "sd-daemon.h"
1f0cd86b 24#include "sd-id128.h"
8fe0087e 25
b5efdb8a 26#include "alloc-util.h"
8fe0087e
LP
27#include "barrier.h"
28#include "base-filesystem.h"
29#include "blkid-util.h"
30#include "btrfs-util.h"
b8ea7a6e 31#include "bus-error.h"
b053cd5f 32#include "bus-util.h"
8fe0087e 33#include "cap-list.h"
430f0182 34#include "capability-util.h"
04d391da 35#include "cgroup-util.h"
f4351959 36#include "chase-symlinks.h"
8fe0087e 37#include "copy.h"
d107bb7d 38#include "cpu-set-util.h"
786d19fd 39#include "creds-util.h"
4fc9982c 40#include "dev-setup.h"
57f1b61b 41#include "discover-image.h"
2d845785 42#include "dissect-image.h"
8fe0087e 43#include "env-util.h"
3652872a 44#include "escape.h"
3ffd4af2 45#include "fd-util.h"
842f3b0f 46#include "fdset.h"
a5c32cff 47#include "fileio.h"
f97b34a6 48#include "format-util.h"
f4f15635 49#include "fs-util.h"
1b9e5b12 50#include "gpt.h"
4623e8e6 51#include "hexdecoct.h"
e2054217 52#include "hostname-setup.h"
8fe0087e 53#include "hostname-util.h"
910fd145 54#include "id128-util.h"
3652872a 55#include "io-util.h"
8fe0087e 56#include "log.h"
2d845785 57#include "loop-util.h"
8fe0087e 58#include "loopback-setup.h"
8fe0087e 59#include "macro.h"
44dbef90 60#include "main-func.h"
f5947a5e 61#include "missing_sched.h"
8fe0087e 62#include "mkdir.h"
4349cd7c 63#include "mount-util.h"
049af8ad 64#include "mountpoint-util.h"
0cb8e3d1 65#include "namespace-util.h"
8fe0087e 66#include "netlink-util.h"
2f893044 67#include "nspawn-bind-user.h"
07630cea 68#include "nspawn-cgroup.h"
3652872a 69#include "nspawn-creds.h"
3603efde 70#include "nspawn-def.h"
07630cea
LP
71#include "nspawn-expose-ports.h"
72#include "nspawn-mount.h"
73#include "nspawn-network.h"
de40a303 74#include "nspawn-oci.h"
7336138e 75#include "nspawn-patch-uid.h"
07630cea 76#include "nspawn-register.h"
910fd145 77#include "nspawn-seccomp.h"
07630cea
LP
78#include "nspawn-settings.h"
79#include "nspawn-setuid.h"
7732f92b 80#include "nspawn-stub-pid1.h"
c9394f4f 81#include "nspawn-util.h"
91181e07 82#include "nspawn.h"
d8b4d14d 83#include "nulstr-util.h"
d58ad743 84#include "os-util.h"
50ebcf6c 85#include "pager.h"
614b022c 86#include "parse-argument.h"
6bedfcbb 87#include "parse-util.h"
294bf0c3 88#include "pretty-print.h"
0b452006 89#include "process-util.h"
8fe0087e
LP
90#include "ptyfwd.h"
91#include "random-util.h"
8869a0b4 92#include "raw-clone.h"
86775e35 93#include "resolve-util.h"
bf428efb 94#include "rlimit-util.h"
8fe0087e 95#include "rm-rf.h"
de40a303
LP
96#if HAVE_SECCOMP
97#include "seccomp-util.h"
98#endif
68b02049 99#include "selinux-util.h"
8fe0087e 100#include "signal-util.h"
2583fbea 101#include "socket-util.h"
8fcde012 102#include "stat-util.h"
15a5e950 103#include "stdio-util.h"
5c828e66 104#include "string-table.h"
07630cea 105#include "string-util.h"
8fe0087e 106#include "strv.h"
de40a303 107#include "sysctl-util.h"
8fe0087e 108#include "terminal-util.h"
e4de7287 109#include "tmpfile-util.h"
affb60b1 110#include "umask-util.h"
43c3fb46 111#include "unit-name.h"
b1d4f8e1 112#include "user-util.h"
8fe0087e 113#include "util.h"
e9642be2 114
e96ceaba
LP
115/* The notify socket inside the container it can use to talk to nspawn using the sd_notify(3) protocol */
116#define NSPAWN_NOTIFY_SOCKET_PATH "/run/host/notify"
0e7ac751 117
2a49b612
ZJS
118#define EXIT_FORCE_RESTART 133
119
113cea80
DH
/* Two-valued status for a container: CONTAINER_TERMINATED or CONTAINER_REBOOTED.
 * NOTE(review): presumably reports how a container run ended; the users of this type are not
 * visible in this part of the file — confirm. */
120typedef enum ContainerStatus {
121 CONTAINER_TERMINATED,
6145bb4f 122 CONTAINER_REBOOTED,
113cea80
DH
123} ContainerStatus;
124
/* File-scope configuration state of this file, each variable initialized to its built-in default.
 * The parsing helpers further down (e.g. handle_arg_console(), parse_share_ns_env(),
 * parse_mount_settings_env(), parse_environment()) overwrite some of these and, for part of them,
 * record the override in arg_settings_mask. */
88213476 125static char *arg_directory = NULL;
ec16945e 126static char *arg_template = NULL;
5f932eb9 127static char *arg_chdir = NULL;
b53ede69
PW
128static char *arg_pivot_root_new = NULL;
129static char *arg_pivot_root_old = NULL;
687d0825 130static char *arg_user = NULL;
de40a303
LP
131static uid_t arg_uid = UID_INVALID;
132static gid_t arg_gid = GID_INVALID;
133static gid_t* arg_supplementary_gids = NULL;
134static size_t arg_n_supplementary_gids = 0;
9444b1f2 135static sd_id128_t arg_uuid = {};
3a9530e5
LP
136static char *arg_machine = NULL; /* The name used by the host to refer to this */
137static char *arg_hostname = NULL; /* The name the payload sees by default */
c74e630d
LP
138static const char *arg_selinux_context = NULL;
139static const char *arg_selinux_apifs_context = NULL;
de40a303 140static char *arg_slice = NULL;
ff01d048 141static bool arg_private_network = false;
bc2f673e 142static bool arg_read_only = false;
7732f92b 143static StartMode arg_start_mode = START_PID1;
ec16945e 144static bool arg_ephemeral = false;
57fb9fb5 145static LinkJournal arg_link_journal = LINK_AUTO;
574edc90 146static bool arg_link_journal_try = false;
/* Default capability mask, one bit per CAP_* number (the usage text describes --capability= as
 * retaining capabilities "in addition to the default"). */
520e0d54 147static uint64_t arg_caps_retain =
50b52222
LP
148 (1ULL << CAP_AUDIT_CONTROL) |
149 (1ULL << CAP_AUDIT_WRITE) |
5076f0cc
LP
150 (1ULL << CAP_CHOWN) |
151 (1ULL << CAP_DAC_OVERRIDE) |
152 (1ULL << CAP_DAC_READ_SEARCH) |
153 (1ULL << CAP_FOWNER) |
154 (1ULL << CAP_FSETID) |
155 (1ULL << CAP_IPC_OWNER) |
156 (1ULL << CAP_KILL) |
157 (1ULL << CAP_LEASE) |
158 (1ULL << CAP_LINUX_IMMUTABLE) |
50b52222 159 (1ULL << CAP_MKNOD) |
5076f0cc
LP
160 (1ULL << CAP_NET_BIND_SERVICE) |
161 (1ULL << CAP_NET_BROADCAST) |
162 (1ULL << CAP_NET_RAW) |
5076f0cc 163 (1ULL << CAP_SETFCAP) |
50b52222 164 (1ULL << CAP_SETGID) |
5076f0cc
LP
165 (1ULL << CAP_SETPCAP) |
166 (1ULL << CAP_SETUID) |
167 (1ULL << CAP_SYS_ADMIN) |
50b52222 168 (1ULL << CAP_SYS_BOOT) |
5076f0cc
LP
169 (1ULL << CAP_SYS_CHROOT) |
170 (1ULL << CAP_SYS_NICE) |
171 (1ULL << CAP_SYS_PTRACE) |
d87be9b0 172 (1ULL << CAP_SYS_RESOURCE) |
50b52222 173 (1ULL << CAP_SYS_TTY_CONFIG);
88fc9c9b 174static uint64_t arg_caps_ambient = 0;
de40a303 175static CapabilityQuintet arg_full_capabilities = CAPABILITY_QUINTET_NULL;
5a8af538 176static CustomMount *arg_custom_mounts = NULL;
88614c8a 177static size_t arg_n_custom_mounts = 0;
f4889f65 178static char **arg_setenv = NULL;
284c0b91 179static bool arg_quiet = false;
eb91eb18 180static bool arg_register = true;
89f7c846 181static bool arg_keep_unit = false;
aa28aefe 182static char **arg_network_interfaces = NULL;
c74e630d 183static char **arg_network_macvlan = NULL;
4bbfe7ad 184static char **arg_network_ipvlan = NULL;
69c79d3c 185static bool arg_network_veth = false;
f6d6bad1 186static char **arg_network_veth_extra = NULL;
f757855e 187static char *arg_network_bridge = NULL;
22b28dfd 188static char *arg_network_zone = NULL;
d7bea6b6 189static char *arg_network_namespace_path = NULL;
bb068de0 190static PagerFlags arg_pager_flags = 0;
050f7277 191static unsigned long arg_personality = PERSONALITY_INVALID;
ec16945e 192static char *arg_image = NULL;
de40a303 193static char *arg_oci_bundle = NULL;
f757855e 194static VolatileMode arg_volatile_mode = VOLATILE_NO;
6d0b55c2 195static ExposePort *arg_expose_ports = NULL;
f36933fe 196static char **arg_property = NULL;
de40a303 197static sd_bus_message *arg_property_message = NULL;
0de7acce 198static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
6dac160c 199static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
6c045a99 200static UserNamespaceOwnership arg_userns_ownership = _USER_NAMESPACE_OWNERSHIP_INVALID;
c6c8f6e2 201static int arg_kill_signal = 0;
5da38d07 202static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN;
f757855e
LP
203static SettingsMask arg_settings_mask = 0;
204static int arg_settings_trusted = -1;
205static char **arg_parameters = NULL;
6aadfa4c 206static const char *arg_container_service_name = "systemd-nspawn";
9c1e04d0 207static bool arg_notify_ready = false;
5a8ff0e6 208static bool arg_use_cgns = true;
0c582db0 209static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
1099ceeb 210static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_TMPFS_TMP;
aee36b4e 211static VeritySettings arg_verity_settings = VERITY_SETTINGS_DEFAULT;
6b000af4
LP
212static char **arg_syscall_allow_list = NULL;
213static char **arg_syscall_deny_list = NULL;
de40a303
LP
214#if HAVE_SECCOMP
215static scmp_filter_ctx arg_seccomp = NULL;
216#endif
bf428efb 217static struct rlimit *arg_rlimit[_RLIMIT_MAX] = {};
66edd963 218static bool arg_no_new_privileges = false;
81f345df
LP
219static int arg_oom_score_adjust = 0;
220static bool arg_oom_score_adjust_set = false;
0985c7c4 221static CPUSet arg_cpu_set = {};
09d423e9 222static ResolvConfMode arg_resolv_conf = RESOLV_CONF_AUTO;
1688841f 223static TimezoneMode arg_timezone = TIMEZONE_AUTO;
f5fbe71d 224static unsigned arg_console_width = UINT_MAX, arg_console_height = UINT_MAX;
de40a303
LP
225static DeviceNode* arg_extra_nodes = NULL;
226static size_t arg_n_extra_nodes = 0;
227static char **arg_sysctl = NULL;
228static ConsoleMode arg_console_mode = _CONSOLE_MODE_INVALID;
3652872a
LP
229static Credential *arg_credentials = NULL;
230static size_t arg_n_credentials = 0;
2f893044 231static char **arg_bind_user = NULL;
4a4654e0 232static bool arg_suppress_sync = false;
3603f151 233static char *arg_settings_filename = NULL;
88213476 234
6145bb4f
LP
/* Pairs each listed arg_* variable with the function named as its destructor (freep, strv_freep,
 * sd_bus_message_unrefp, ...). The arg_seccomp entry only exists when HAVE_SECCOMP is set, matching
 * the variable's own declaration.
 * NOTE(review): arg_custom_mounts, arg_expose_ports, arg_credentials, arg_extra_nodes and arg_rlimit
 * have no registration in this list; presumably they are released elsewhere — confirm. */
235STATIC_DESTRUCTOR_REGISTER(arg_directory, freep);
236STATIC_DESTRUCTOR_REGISTER(arg_template, freep);
237STATIC_DESTRUCTOR_REGISTER(arg_chdir, freep);
238STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_new, freep);
239STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_old, freep);
240STATIC_DESTRUCTOR_REGISTER(arg_user, freep);
241STATIC_DESTRUCTOR_REGISTER(arg_supplementary_gids, freep);
242STATIC_DESTRUCTOR_REGISTER(arg_machine, freep);
243STATIC_DESTRUCTOR_REGISTER(arg_hostname, freep);
244STATIC_DESTRUCTOR_REGISTER(arg_slice, freep);
245STATIC_DESTRUCTOR_REGISTER(arg_setenv, strv_freep);
246STATIC_DESTRUCTOR_REGISTER(arg_network_interfaces, strv_freep);
247STATIC_DESTRUCTOR_REGISTER(arg_network_macvlan, strv_freep);
248STATIC_DESTRUCTOR_REGISTER(arg_network_ipvlan, strv_freep);
249STATIC_DESTRUCTOR_REGISTER(arg_network_veth_extra, strv_freep);
250STATIC_DESTRUCTOR_REGISTER(arg_network_bridge, freep);
251STATIC_DESTRUCTOR_REGISTER(arg_network_zone, freep);
252STATIC_DESTRUCTOR_REGISTER(arg_network_namespace_path, freep);
253STATIC_DESTRUCTOR_REGISTER(arg_image, freep);
254STATIC_DESTRUCTOR_REGISTER(arg_oci_bundle, freep);
255STATIC_DESTRUCTOR_REGISTER(arg_property, strv_freep);
256STATIC_DESTRUCTOR_REGISTER(arg_property_message, sd_bus_message_unrefp);
257STATIC_DESTRUCTOR_REGISTER(arg_parameters, strv_freep);
89e62e0b 258STATIC_DESTRUCTOR_REGISTER(arg_verity_settings, verity_settings_done);
6b000af4
LP
259STATIC_DESTRUCTOR_REGISTER(arg_syscall_allow_list, strv_freep);
260STATIC_DESTRUCTOR_REGISTER(arg_syscall_deny_list, strv_freep);
6145bb4f
LP
261#if HAVE_SECCOMP
262STATIC_DESTRUCTOR_REGISTER(arg_seccomp, seccomp_releasep);
263#endif
0985c7c4 264STATIC_DESTRUCTOR_REGISTER(arg_cpu_set, cpu_set_reset);
6145bb4f 265STATIC_DESTRUCTOR_REGISTER(arg_sysctl, strv_freep);
2f893044 266STATIC_DESTRUCTOR_REGISTER(arg_bind_user, strv_freep);
3603f151 267STATIC_DESTRUCTOR_REGISTER(arg_settings_filename, freep);
6145bb4f 268
dce66ffe
ZJS
269static int handle_arg_console(const char *arg) {
270 if (streq(arg, "help")) {
10e8a60b
LP
271 puts("autopipe\n"
272 "interactive\n"
dce66ffe 273 "passive\n"
10e8a60b
LP
274 "pipe\n"
275 "read-only");
dce66ffe
ZJS
276 return 0;
277 }
278
279 if (streq(arg, "interactive"))
280 arg_console_mode = CONSOLE_INTERACTIVE;
281 else if (streq(arg, "read-only"))
282 arg_console_mode = CONSOLE_READ_ONLY;
283 else if (streq(arg, "passive"))
284 arg_console_mode = CONSOLE_PASSIVE;
554c4beb
LP
285 else if (streq(arg, "pipe")) {
286 if (isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0)
287 log_full(arg_quiet ? LOG_DEBUG : LOG_NOTICE,
288 "Console mode 'pipe' selected, but standard input/output are connected to an interactive TTY. "
289 "Most likely you want to use 'interactive' console mode for proper interactivity and shell job control. "
290 "Proceeding anyway.");
291
dce66ffe 292 arg_console_mode = CONSOLE_PIPE;
10e8a60b
LP
293 } else if (streq(arg, "autopipe")) {
294 if (isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0)
295 arg_console_mode = CONSOLE_INTERACTIVE;
296 else
297 arg_console_mode = CONSOLE_PIPE;
554c4beb 298 } else
dce66ffe
ZJS
299 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown console mode: %s", optarg);
300
301 arg_settings_mask |= SETTING_CONSOLE_MODE;
302 return 1;
303}
304
37ec0fdd
LP
/* Prints the usage text to stdout, after calling pager_open(arg_pager_flags). The positional printf
 * arguments are: %1$s program_invocation_short_name, %2$s the man page link produced by
 * terminal_urlify_man(), %3$s/%4$s ansi_underline()/ansi_normal() around section headings, and
 * %5$s/%6$s ansi_highlight()/ansi_normal() around the one-line summary. Returns 0 once printed; any
 * negative result from terminal_urlify_man() is reported through log_oom().
 *
 * NOTE(review): the --bind-ro= synopsis is missing its final closing bracket (compare the --bind=
 * synopsis two entries above it), and the --network-zone= description has a doubled "an" across its
 * line break ("...to an" / "an automatically..."). */
305static int help(void) {
306 _cleanup_free_ char *link = NULL;
307 int r;
308
384c2c32 309 pager_open(arg_pager_flags);
50ebcf6c 310
37ec0fdd
LP
311 r = terminal_urlify_man("systemd-nspawn", "1", &link);
312 if (r < 0)
313 return log_oom();
314
25148653 315 printf("%1$s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
37a92352 316 "%5$sSpawn a command or OS in a light-weight container.%6$s\n\n"
a8828ed9
DW
317 " -h --help Show this help\n"
318 " --version Print version string\n"
69c79d3c 319 " -q --quiet Do not show status information\n"
bb068de0 320 " --no-pager Do not pipe output into a pager\n"
25148653
LP
321 " --settings=BOOLEAN Load additional settings from .nspawn file\n\n"
322 "%3$sImage:%4$s\n"
1b9e5b12 323 " -D --directory=PATH Root directory for the container\n"
ec16945e
LP
324 " --template=PATH Initialize root directory from template directory,\n"
325 " if missing\n"
326 " -x --ephemeral Run container with snapshot of root directory, and\n"
327 " remove it after exit\n"
25e68fd3
LP
328 " -i --image=PATH Root file system disk image (or device node) for\n"
329 " the container\n"
de40a303 330 " --oci-bundle=PATH OCI bundle directory\n"
25148653
LP
331 " --read-only Mount the root directory read-only\n"
332 " --volatile[=MODE] Run the system in volatile mode\n"
25e68fd3 333 " --root-hash=HASH Specify verity root hash for root disk image\n"
c2923fdc
LB
334 " --root-hash-sig=SIG Specify pkcs7 signature of root hash for verity\n"
335 " as a DER encoded PKCS7, either as a path to a file\n"
336 " or as an ASCII base64 encoded string prefixed by\n"
337 " 'base64:'\n"
e7cbe5cb 338 " --verity-data=PATH Specify hash device for verity\n"
25148653
LP
339 " --pivot-root=PATH[:PATH]\n"
340 " Pivot root to given directory in the container\n\n"
341 "%3$sExecution:%4$s\n"
7732f92b 342 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
a8828ed9 343 " -b --boot Boot up full system (i.e. invoke init)\n"
5f932eb9 344 " --chdir=PATH Set working directory in the container\n"
0d2a0179 345 " -E --setenv=NAME[=VALUE] Pass an environment variable to PID 1\n"
25148653
LP
346 " -u --user=USER Run the command under specified user or UID\n"
347 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
4a4654e0
LP
348 " --notify-ready=BOOLEAN Receive notifications from the child init process\n"
349 " --suppress-sync=BOOLEAN\n"
350 " Suppress any form of disk data synchronization\n\n"
25148653 351 "%3$sSystem Identity:%4$s\n"
a8828ed9 352 " -M --machine=NAME Set the machine name for the container\n"
3a9530e5 353 " --hostname=NAME Override the hostname for the container\n"
25148653
LP
354 " --uuid=UUID Set a specific machine UUID for the container\n\n"
355 "%3$sProperties:%4$s\n"
a8828ed9 356 " -S --slice=SLICE Place the container in the specified slice\n"
f36933fe 357 " --property=NAME=VALUE Set scope unit property\n"
25148653
LP
358 " --register=BOOLEAN Register container as machine\n"
359 " --keep-unit Do not register a scope for the machine, reuse\n"
360 " the service unit nspawn is running in\n\n"
361 "%3$sUser Namespacing:%4$s\n"
90b4a64d 362 " -U --private-users=pick Run within user namespace, autoselect UID/GID range\n"
03cfe0d5 363 " --private-users[=UIDBASE[:NUIDS]]\n"
90b4a64d 364 " Similar, but with user configured UID/GID range\n"
6c045a99
LP
365 " --private-users-ownership=MODE\n"
366 " Adjust ('chown') or map ('map') OS tree ownership\n"
367 " to private UID/GID range\n\n"
25148653 368 "%3$sNetworking:%4$s\n"
69c79d3c
LP
369 " --private-network Disable network in container\n"
370 " --network-interface=INTERFACE\n"
371 " Assign an existing network interface to the\n"
372 " container\n"
c74e630d
LP
373 " --network-macvlan=INTERFACE\n"
374 " Create a macvlan network interface based on an\n"
375 " existing network interface to the container\n"
4bbfe7ad 376 " --network-ipvlan=INTERFACE\n"
387f6955 377 " Create an ipvlan network interface based on an\n"
4bbfe7ad 378 " existing network interface to the container\n"
a8eaaee7 379 " -n --network-veth Add a virtual Ethernet connection between host\n"
69c79d3c 380 " and container\n"
f6d6bad1
LP
381 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
382 " Add an additional virtual Ethernet link between\n"
383 " host and container\n"
ab046dde 384 " --network-bridge=INTERFACE\n"
90b4a64d
ZJS
385 " Add a virtual Ethernet connection to the container\n"
386 " and attach it to an existing bridge on the host\n"
387 " --network-zone=NAME Similar, but attach the new interface to an\n"
388 " an automatically managed bridge interface\n"
d7bea6b6
DP
389 " --network-namespace-path=PATH\n"
390 " Set network namespace to the one represented by\n"
391 " the specified kernel namespace file node\n"
6d0b55c2 392 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
25148653
LP
393 " Expose a container IP port on the host\n\n"
394 "%3$sSecurity:%4$s\n"
a8828ed9
DW
395 " --capability=CAP In addition to the default, retain specified\n"
396 " capability\n"
397 " --drop-capability=CAP Drop the specified capability from the default set\n"
88fc9c9b
TH
398 " --ambient-capability=CAP\n"
399 " Sets the specified capability for the started\n"
400 " process. Not useful if booting a machine.\n"
f4e803c8 401 " --no-new-privileges Set PR_SET_NO_NEW_PRIVS flag for container payload\n"
960e4569
LP
402 " --system-call-filter=LIST|~LIST\n"
403 " Permit/prohibit specific system calls\n"
25148653
LP
404 " -Z --selinux-context=SECLABEL\n"
405 " Set the SELinux security context to be used by\n"
406 " processes in the container\n"
407 " -L --selinux-apifs-context=SECLABEL\n"
408 " Set the SELinux security context to be used by\n"
409 " API/tmpfs file systems in the container\n\n"
410 "%3$sResources:%4$s\n"
bf428efb 411 " --rlimit=NAME=LIMIT Set a resource limit for the payload\n"
81f345df
LP
412 " --oom-score-adjust=VALUE\n"
413 " Adjust the OOM score value for the payload\n"
f4e803c8
LP
414 " --cpu-affinity=CPUS Adjust the CPU affinity of the container\n"
415 " --personality=ARCH Pick personality for this container\n\n"
25148653 416 "%3$sIntegration:%4$s\n"
09d423e9 417 " --resolv-conf=MODE Select mode of /etc/resolv.conf initialization\n"
1688841f 418 " --timezone=MODE Select mode of /etc/localtime initialization\n"
25148653
LP
419 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
420 " host, try-guest, try-host\n"
421 " -j Equivalent to --link-journal=try-guest\n\n"
422 "%3$sMounts:%4$s\n"
5e5bfa6e
EY
423 " --bind=PATH[:PATH[:OPTIONS]]\n"
424 " Bind mount a file or directory from the host into\n"
a8828ed9 425 " the container\n"
5e5bfa6e
EY
426 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
427 " Similar, but creates a read-only bind mount\n"
de40a303
LP
428 " --inaccessible=PATH Over-mount file node with inaccessible node to mask\n"
429 " it\n"
06c17c39 430 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
5a8af538
LP
431 " --overlay=PATH[:PATH...]:PATH\n"
432 " Create an overlay mount from the host to \n"
433 " the container\n"
434 " --overlay-ro=PATH[:PATH...]:PATH\n"
2f893044
LP
435 " Similar, but creates a read-only overlay mount\n"
436 " --bind-user=NAME Bind user from host to container\n\n"
25148653 437 "%3$sInput/Output:%4$s\n"
de40a303
LP
438 " --console=MODE Select how stdin/stdout/stderr and /dev/console are\n"
439 " set up for the container.\n"
3652872a
LP
440 " -P --pipe Equivalent to --console=pipe\n\n"
441 "%3$sCredentials:%4$s\n"
442 " --set-credential=ID:VALUE\n"
443 " Pass a credential with literal value to container.\n"
444 " --load-credential=ID:PATH\n"
445 " Load credential to pass to container from file or\n"
446 " AF_UNIX stream socket.\n"
bc556335
DDM
447 "\nSee the %2$s for details.\n",
448 program_invocation_short_name,
449 link,
450 ansi_underline(),
451 ansi_normal(),
452 ansi_highlight(),
453 ansi_normal());
37ec0fdd
LP
454
455 return 0;
88213476
LP
456}
457
86c0dd4a 458static int custom_mount_check_all(void) {
88614c8a 459 size_t i;
5a8af538 460
5a8af538
LP
461 for (i = 0; i < arg_n_custom_mounts; i++) {
462 CustomMount *m = &arg_custom_mounts[i];
463
0de7acce 464 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
6c045a99 465 if (arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_OFF)
baaa35ad 466 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
c920b863 467 "--private-users-ownership=own may not be combined with custom root mounts.");
6c045a99 468 if (arg_uid_shift == UID_INVALID)
baaa35ad
ZJS
469 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
470 "--private-users with automatic UID shift may not be combined with custom root mounts.");
825d5287 471 }
5a8af538
LP
472 }
473
474 return 0;
475}
476
8199d554 477static int detect_unified_cgroup_hierarchy_from_environment(void) {
c78c095b 478 const char *e, *var = "SYSTEMD_NSPAWN_UNIFIED_HIERARCHY";
415fc41c 479 int r;
5da38d07 480
efdb0237 481 /* Allow the user to control whether the unified hierarchy is used */
c78c095b
ZJS
482
483 e = getenv(var);
484 if (!e) {
d5fc5b2f 485 /* $UNIFIED_CGROUP_HIERARCHY has been renamed to $SYSTEMD_NSPAWN_UNIFIED_HIERARCHY. */
c78c095b
ZJS
486 var = "UNIFIED_CGROUP_HIERARCHY";
487 e = getenv(var);
c78c095b
ZJS
488 }
489
490 if (!isempty(e)) {
efdb0237
LP
491 r = parse_boolean(e);
492 if (r < 0)
c78c095b 493 return log_error_errno(r, "Failed to parse $%s: %m", var);
5da38d07
TH
494 if (r > 0)
495 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
496 else
497 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237
LP
498 }
499
8199d554
LP
500 return 0;
501}
502
503static int detect_unified_cgroup_hierarchy_from_image(const char *directory) {
504 int r;
505
75b0d8b8
ZJS
506 /* Let's inherit the mode to use from the host system, but let's take into consideration what systemd
507 * in the image actually supports. */
b4cccbc1
LP
508 r = cg_all_unified();
509 if (r < 0)
510 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
511 if (r > 0) {
a8725a06
ZJS
512 /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
513 * routine only detects 231, so we'll have a false negative here for 230. */
7e6821ed 514 r = systemd_installation_has_version(directory, "230");
a8725a06
ZJS
515 if (r < 0)
516 return log_error_errno(r, "Failed to determine systemd version in container: %m");
517 if (r > 0)
518 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
519 else
520 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
c22800e4 521 } else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
2977724b 522 /* Mixed cgroup hierarchy support was added in 233 */
7e6821ed 523 r = systemd_installation_has_version(directory, "233");
0fd9563f
ZJS
524 if (r < 0)
525 return log_error_errno(r, "Failed to determine systemd version in container: %m");
526 if (r > 0)
527 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
528 else
529 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
530 } else
5da38d07 531 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237 532
8199d554
LP
533 log_debug("Using %s hierarchy for container.",
534 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_NONE ? "legacy" :
535 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_SYSTEMD ? "hybrid" : "unified");
536
efdb0237
LP
537 return 0;
538}
539
8a99bd0c
ZJS
540static int parse_capability_spec(const char *spec, uint64_t *ret_mask) {
541 uint64_t mask = 0;
542 int r;
543
544 for (;;) {
545 _cleanup_free_ char *t = NULL;
546
547 r = extract_first_word(&spec, &t, ",", 0);
548 if (r < 0)
549 return log_error_errno(r, "Failed to parse capability %s.", t);
550 if (r == 0)
551 break;
552
553 if (streq(t, "help")) {
554 for (int i = 0; i < capability_list_length(); i++) {
555 const char *name;
556
557 name = capability_to_name(i);
558 if (name)
559 puts(name);
560 }
561
562 return 0; /* quit */
563 }
564
565 if (streq(t, "all"))
f5fbe71d 566 mask = UINT64_MAX;
8a99bd0c
ZJS
567 else {
568 r = capability_from_name(t);
569 if (r < 0)
570 return log_error_errno(r, "Failed to parse capability %s.", t);
571
572 mask |= 1ULL << r;
573 }
574 }
575
576 *ret_mask = mask;
577 return 1; /* continue */
578}
579
49048684 580static int parse_share_ns_env(const char *name, unsigned long ns_flag) {
0c582db0
LB
581 int r;
582
583 r = getenv_bool(name);
584 if (r == -ENXIO)
49048684 585 return 0;
0c582db0 586 if (r < 0)
49048684 587 return log_error_errno(r, "Failed to parse $%s: %m", name);
de40a303 588
0c582db0 589 arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
de40a303 590 arg_settings_mask |= SETTING_CLONE_NS_FLAGS;
49048684 591 return 0;
0c582db0
LB
592}
593
49048684 594static int parse_mount_settings_env(void) {
4f086aab 595 const char *e;
1099ceeb
LP
596 int r;
597
598 r = getenv_bool("SYSTEMD_NSPAWN_TMPFS_TMP");
49048684
ZJS
599 if (r < 0 && r != -ENXIO)
600 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_TMPFS_TMP: %m");
1099ceeb
LP
601 if (r >= 0)
602 SET_FLAG(arg_mount_settings, MOUNT_APPLY_TMPFS_TMP, r > 0);
4f086aab
SU
603
604 e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
49048684 605 if (streq_ptr(e, "network"))
4f086aab 606 arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS;
4f086aab 607
49048684
ZJS
608 else if (e) {
609 r = parse_boolean(e);
610 if (r < 0)
611 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_API_VFS_WRITABLE: %m");
612
613 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_RO, r == 0);
614 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_NETNS, false);
ab8ee0f2 615 }
4f086aab 616
49048684 617 return 0;
4f086aab
SU
618}
619
49048684 620static int parse_environment(void) {
d5455d2f
LP
621 const char *e;
622 int r;
623
49048684
ZJS
624 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
625 if (r < 0)
626 return r;
627 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
628 if (r < 0)
629 return r;
630 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
631 if (r < 0)
632 return r;
633 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
634 if (r < 0)
635 return r;
d5455d2f 636
49048684
ZJS
637 r = parse_mount_settings_env();
638 if (r < 0)
639 return r;
d5455d2f 640
489fae52
ZJS
641 /* SYSTEMD_NSPAWN_USE_CGNS=0 can be used to disable CLONE_NEWCGROUP use,
642 * even if it is supported. If not supported, it has no effect. */
de40a303 643 if (!cg_ns_supported())
489fae52 644 arg_use_cgns = false;
de40a303
LP
645 else {
646 r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
647 if (r < 0) {
648 if (r != -ENXIO)
49048684 649 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_USE_CGNS: %m");
de40a303
LP
650
651 arg_use_cgns = true;
652 } else {
653 arg_use_cgns = r > 0;
654 arg_settings_mask |= SETTING_USE_CGNS;
655 }
656 }
d5455d2f
LP
657
658 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
659 if (e)
660 arg_container_service_name = e;
661
4a4654e0
LP
662 r = getenv_bool("SYSTEMD_SUPPRESS_SYNC");
663 if (r >= 0)
664 arg_suppress_sync = r;
665 else if (r != -ENXIO)
666 log_debug_errno(r, "Failed to parse $SYSTEMD_SUPPRESS_SYNC, ignoring: %m");
667
49048684 668 return detect_unified_cgroup_hierarchy_from_environment();
d5455d2f
LP
669}
670
88213476 671static int parse_argv(int argc, char *argv[]) {
a41fe3a2 672 enum {
acbeb427
ZJS
673 ARG_VERSION = 0x100,
674 ARG_PRIVATE_NETWORK,
bc2f673e 675 ARG_UUID,
5076f0cc 676 ARG_READ_ONLY,
57fb9fb5 677 ARG_CAPABILITY,
88fc9c9b 678 ARG_AMBIENT_CAPABILITY,
420c7379 679 ARG_DROP_CAPABILITY,
17fe0523
LP
680 ARG_LINK_JOURNAL,
681 ARG_BIND,
f4889f65 682 ARG_BIND_RO,
06c17c39 683 ARG_TMPFS,
5a8af538
LP
684 ARG_OVERLAY,
685 ARG_OVERLAY_RO,
de40a303 686 ARG_INACCESSIBLE,
eb91eb18 687 ARG_SHARE_SYSTEM,
89f7c846 688 ARG_REGISTER,
aa28aefe 689 ARG_KEEP_UNIT,
69c79d3c 690 ARG_NETWORK_INTERFACE,
c74e630d 691 ARG_NETWORK_MACVLAN,
4bbfe7ad 692 ARG_NETWORK_IPVLAN,
ab046dde 693 ARG_NETWORK_BRIDGE,
22b28dfd 694 ARG_NETWORK_ZONE,
f6d6bad1 695 ARG_NETWORK_VETH_EXTRA,
d7bea6b6 696 ARG_NETWORK_NAMESPACE_PATH,
6afc95b7 697 ARG_PERSONALITY,
4d9f07b4 698 ARG_VOLATILE,
ec16945e 699 ARG_TEMPLATE,
f36933fe 700 ARG_PROPERTY,
6dac160c 701 ARG_PRIVATE_USERS,
c6c8f6e2 702 ARG_KILL_SIGNAL,
f757855e 703 ARG_SETTINGS,
5f932eb9 704 ARG_CHDIR,
b53ede69 705 ARG_PIVOT_ROOT,
7336138e 706 ARG_PRIVATE_USERS_CHOWN,
6c045a99 707 ARG_PRIVATE_USERS_OWNERSHIP,
9c1e04d0 708 ARG_NOTIFY_READY,
4623e8e6 709 ARG_ROOT_HASH,
89e62e0b
LP
710 ARG_ROOT_HASH_SIG,
711 ARG_VERITY_DATA,
960e4569 712 ARG_SYSTEM_CALL_FILTER,
bf428efb 713 ARG_RLIMIT,
3a9530e5 714 ARG_HOSTNAME,
66edd963 715 ARG_NO_NEW_PRIVILEGES,
81f345df 716 ARG_OOM_SCORE_ADJUST,
d107bb7d 717 ARG_CPU_AFFINITY,
09d423e9 718 ARG_RESOLV_CONF,
1688841f 719 ARG_TIMEZONE,
de40a303
LP
720 ARG_CONSOLE,
721 ARG_PIPE,
722 ARG_OCI_BUNDLE,
bb068de0 723 ARG_NO_PAGER,
3652872a
LP
724 ARG_SET_CREDENTIAL,
725 ARG_LOAD_CREDENTIAL,
2f893044 726 ARG_BIND_USER,
4a4654e0 727 ARG_SUPPRESS_SYNC,
a41fe3a2
LP
728 };
729
88213476 730 static const struct option options[] = {
d7bea6b6
DP
731 { "help", no_argument, NULL, 'h' },
732 { "version", no_argument, NULL, ARG_VERSION },
733 { "directory", required_argument, NULL, 'D' },
734 { "template", required_argument, NULL, ARG_TEMPLATE },
735 { "ephemeral", no_argument, NULL, 'x' },
736 { "user", required_argument, NULL, 'u' },
737 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
738 { "as-pid2", no_argument, NULL, 'a' },
739 { "boot", no_argument, NULL, 'b' },
740 { "uuid", required_argument, NULL, ARG_UUID },
741 { "read-only", no_argument, NULL, ARG_READ_ONLY },
742 { "capability", required_argument, NULL, ARG_CAPABILITY },
88fc9c9b 743 { "ambient-capability", required_argument, NULL, ARG_AMBIENT_CAPABILITY },
d7bea6b6 744 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
66edd963 745 { "no-new-privileges", required_argument, NULL, ARG_NO_NEW_PRIVILEGES },
d7bea6b6
DP
746 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
747 { "bind", required_argument, NULL, ARG_BIND },
748 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
749 { "tmpfs", required_argument, NULL, ARG_TMPFS },
750 { "overlay", required_argument, NULL, ARG_OVERLAY },
751 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
de40a303 752 { "inaccessible", required_argument, NULL, ARG_INACCESSIBLE },
d7bea6b6 753 { "machine", required_argument, NULL, 'M' },
3a9530e5 754 { "hostname", required_argument, NULL, ARG_HOSTNAME },
d7bea6b6
DP
755 { "slice", required_argument, NULL, 'S' },
756 { "setenv", required_argument, NULL, 'E' },
757 { "selinux-context", required_argument, NULL, 'Z' },
758 { "selinux-apifs-context", required_argument, NULL, 'L' },
759 { "quiet", no_argument, NULL, 'q' },
760 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */
761 { "register", required_argument, NULL, ARG_REGISTER },
762 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
763 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
764 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
765 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
766 { "network-veth", no_argument, NULL, 'n' },
767 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA },
768 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
769 { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
770 { "network-namespace-path", required_argument, NULL, ARG_NETWORK_NAMESPACE_PATH },
771 { "personality", required_argument, NULL, ARG_PERSONALITY },
772 { "image", required_argument, NULL, 'i' },
773 { "volatile", optional_argument, NULL, ARG_VOLATILE },
774 { "port", required_argument, NULL, 'p' },
775 { "property", required_argument, NULL, ARG_PROPERTY },
776 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
6c045a99
LP
777 { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN }, /* obsolete */
778 { "private-users-ownership",required_argument, NULL, ARG_PRIVATE_USERS_OWNERSHIP},
d7bea6b6
DP
779 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
780 { "settings", required_argument, NULL, ARG_SETTINGS },
781 { "chdir", required_argument, NULL, ARG_CHDIR },
782 { "pivot-root", required_argument, NULL, ARG_PIVOT_ROOT },
783 { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
784 { "root-hash", required_argument, NULL, ARG_ROOT_HASH },
89e62e0b
LP
785 { "root-hash-sig", required_argument, NULL, ARG_ROOT_HASH_SIG },
786 { "verity-data", required_argument, NULL, ARG_VERITY_DATA },
d7bea6b6 787 { "system-call-filter", required_argument, NULL, ARG_SYSTEM_CALL_FILTER },
bf428efb 788 { "rlimit", required_argument, NULL, ARG_RLIMIT },
81f345df 789 { "oom-score-adjust", required_argument, NULL, ARG_OOM_SCORE_ADJUST },
d107bb7d 790 { "cpu-affinity", required_argument, NULL, ARG_CPU_AFFINITY },
09d423e9 791 { "resolv-conf", required_argument, NULL, ARG_RESOLV_CONF },
1688841f 792 { "timezone", required_argument, NULL, ARG_TIMEZONE },
de40a303
LP
793 { "console", required_argument, NULL, ARG_CONSOLE },
794 { "pipe", no_argument, NULL, ARG_PIPE },
795 { "oci-bundle", required_argument, NULL, ARG_OCI_BUNDLE },
bb068de0 796 { "no-pager", no_argument, NULL, ARG_NO_PAGER },
3652872a
LP
797 { "set-credential", required_argument, NULL, ARG_SET_CREDENTIAL },
798 { "load-credential", required_argument, NULL, ARG_LOAD_CREDENTIAL },
2f893044 799 { "bind-user", required_argument, NULL, ARG_BIND_USER },
4a4654e0 800 { "suppress-sync", required_argument, NULL, ARG_SUPPRESS_SYNC },
eb9da376 801 {}
88213476
LP
802 };
803
9444b1f2 804 int c, r;
a42c8b54 805 uint64_t plus = 0, minus = 0;
f757855e 806 bool mask_all_settings = false, mask_no_settings = false;
88213476
LP
807
808 assert(argc >= 0);
809 assert(argv);
810
de40a303 811 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nUE:P", options, NULL)) >= 0)
88213476
LP
812 switch (c) {
813
814 case 'h':
37ec0fdd 815 return help();
88213476 816
acbeb427 817 case ARG_VERSION:
3f6fd1ba 818 return version();
acbeb427 819
88213476 820 case 'D':
614b022c 821 r = parse_path_argument(optarg, false, &arg_directory);
ec16945e 822 if (r < 0)
0f03c2a4 823 return r;
de40a303
LP
824
825 arg_settings_mask |= SETTING_DIRECTORY;
ec16945e
LP
826 break;
827
828 case ARG_TEMPLATE:
614b022c 829 r = parse_path_argument(optarg, false, &arg_template);
ec16945e 830 if (r < 0)
0f03c2a4 831 return r;
de40a303
LP
832
833 arg_settings_mask |= SETTING_DIRECTORY;
88213476
LP
834 break;
835
1b9e5b12 836 case 'i':
614b022c 837 r = parse_path_argument(optarg, false, &arg_image);
ec16945e 838 if (r < 0)
0f03c2a4 839 return r;
de40a303
LP
840
841 arg_settings_mask |= SETTING_DIRECTORY;
842 break;
843
844 case ARG_OCI_BUNDLE:
614b022c 845 r = parse_path_argument(optarg, false, &arg_oci_bundle);
de40a303
LP
846 if (r < 0)
847 return r;
848
ec16945e
LP
849 break;
850
851 case 'x':
852 arg_ephemeral = true;
a2f577fc 853 arg_settings_mask |= SETTING_EPHEMERAL;
1b9e5b12
LP
854 break;
855
687d0825 856 case 'u':
2fc09a9c
DM
857 r = free_and_strdup(&arg_user, optarg);
858 if (r < 0)
7027ff61 859 return log_oom();
687d0825 860
f757855e 861 arg_settings_mask |= SETTING_USER;
687d0825
MV
862 break;
863
22b28dfd
LP
864 case ARG_NETWORK_ZONE: {
865 char *j;
866
b910cc72 867 j = strjoin("vz-", optarg);
22b28dfd
LP
868 if (!j)
869 return log_oom();
870
871 if (!ifname_valid(j)) {
872 log_error("Network zone name not valid: %s", j);
873 free(j);
874 return -EINVAL;
875 }
876
df1fac6d 877 free_and_replace(arg_network_zone, j);
22b28dfd
LP
878
879 arg_network_veth = true;
880 arg_private_network = true;
881 arg_settings_mask |= SETTING_NETWORK;
882 break;
883 }
884
ab046dde 885 case ARG_NETWORK_BRIDGE:
ef76dff2 886
baaa35ad
ZJS
887 if (!ifname_valid(optarg))
888 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
889 "Bridge interface name not valid: %s", optarg);
ef76dff2 890
f757855e
LP
891 r = free_and_strdup(&arg_network_bridge, optarg);
892 if (r < 0)
893 return log_oom();
ab046dde 894
4831981d 895 _fallthrough_;
0dfaa006 896 case 'n':
69c79d3c
LP
897 arg_network_veth = true;
898 arg_private_network = true;
f757855e 899 arg_settings_mask |= SETTING_NETWORK;
69c79d3c
LP
900 break;
901
f6d6bad1
LP
902 case ARG_NETWORK_VETH_EXTRA:
903 r = veth_extra_parse(&arg_network_veth_extra, optarg);
904 if (r < 0)
905 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
906
907 arg_private_network = true;
908 arg_settings_mask |= SETTING_NETWORK;
909 break;
910
aa28aefe 911 case ARG_NETWORK_INTERFACE:
baaa35ad
ZJS
912 if (!ifname_valid(optarg))
913 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
914 "Network interface name not valid: %s", optarg);
ef76dff2 915
b390f178
DDM
916 r = test_network_interface_initialized(optarg);
917 if (r < 0)
918 return r;
919
c74e630d
LP
920 if (strv_extend(&arg_network_interfaces, optarg) < 0)
921 return log_oom();
922
923 arg_private_network = true;
f757855e 924 arg_settings_mask |= SETTING_NETWORK;
c74e630d
LP
925 break;
926
927 case ARG_NETWORK_MACVLAN:
ef76dff2 928
baaa35ad
ZJS
929 if (!ifname_valid(optarg))
930 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
931 "MACVLAN network interface name not valid: %s", optarg);
ef76dff2 932
b390f178
DDM
933 r = test_network_interface_initialized(optarg);
934 if (r < 0)
935 return r;
936
c74e630d 937 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
938 return log_oom();
939
4bbfe7ad 940 arg_private_network = true;
f757855e 941 arg_settings_mask |= SETTING_NETWORK;
4bbfe7ad
TG
942 break;
943
944 case ARG_NETWORK_IPVLAN:
ef76dff2 945
baaa35ad
ZJS
946 if (!ifname_valid(optarg))
947 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
948 "IPVLAN network interface name not valid: %s", optarg);
ef76dff2 949
b390f178
DDM
950 r = test_network_interface_initialized(optarg);
951 if (r < 0)
952 return r;
953
4bbfe7ad
TG
954 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
955 return log_oom();
956
4831981d 957 _fallthrough_;
ff01d048
LP
958 case ARG_PRIVATE_NETWORK:
959 arg_private_network = true;
f757855e 960 arg_settings_mask |= SETTING_NETWORK;
a41fe3a2
LP
961 break;
962
d7bea6b6 963 case ARG_NETWORK_NAMESPACE_PATH:
614b022c 964 r = parse_path_argument(optarg, false, &arg_network_namespace_path);
d7bea6b6
DP
965 if (r < 0)
966 return r;
967
de40a303 968 arg_settings_mask |= SETTING_NETWORK;
d7bea6b6
DP
969 break;
970
0f0dbc46 971 case 'b':
baaa35ad
ZJS
972 if (arg_start_mode == START_PID2)
973 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
974 "--boot and --as-pid2 may not be combined.");
7732f92b
LP
975
976 arg_start_mode = START_BOOT;
977 arg_settings_mask |= SETTING_START_MODE;
978 break;
979
980 case 'a':
baaa35ad
ZJS
981 if (arg_start_mode == START_BOOT)
982 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
983 "--boot and --as-pid2 may not be combined.");
7732f92b
LP
984
985 arg_start_mode = START_PID2;
986 arg_settings_mask |= SETTING_START_MODE;
0f0dbc46
LP
987 break;
988
144f0fc0 989 case ARG_UUID:
9444b1f2 990 r = sd_id128_from_string(optarg, &arg_uuid);
317feb4d
LP
991 if (r < 0)
992 return log_error_errno(r, "Invalid UUID: %s", optarg);
993
baaa35ad
ZJS
994 if (sd_id128_is_null(arg_uuid))
995 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
996 "Machine UUID may not be all zeroes.");
f757855e
LP
997
998 arg_settings_mask |= SETTING_MACHINE_ID;
9444b1f2 999 break;
aa96c6cb 1000
43c3fb46
LP
1001 case 'S': {
1002 _cleanup_free_ char *mangled = NULL;
1003
1004 r = unit_name_mangle_with_suffix(optarg, NULL, UNIT_NAME_MANGLE_WARN, ".slice", &mangled);
de40a303
LP
1005 if (r < 0)
1006 return log_oom();
1007
43c3fb46 1008 free_and_replace(arg_slice, mangled);
de40a303 1009 arg_settings_mask |= SETTING_SLICE;
144f0fc0 1010 break;
43c3fb46 1011 }
144f0fc0 1012
7027ff61 1013 case 'M':
c1521918 1014 if (isempty(optarg))
97b11eed 1015 arg_machine = mfree(arg_machine);
c1521918 1016 else {
52ef5dd7 1017 if (!hostname_is_valid(optarg, 0))
baaa35ad
ZJS
1018 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1019 "Invalid machine name: %s", optarg);
7027ff61 1020
0c3c4284
LP
1021 r = free_and_strdup(&arg_machine, optarg);
1022 if (r < 0)
eb91eb18 1023 return log_oom();
eb91eb18 1024 }
9ce6d1b3 1025 break;
7027ff61 1026
3a9530e5
LP
1027 case ARG_HOSTNAME:
1028 if (isempty(optarg))
1029 arg_hostname = mfree(arg_hostname);
1030 else {
52ef5dd7 1031 if (!hostname_is_valid(optarg, 0))
baaa35ad
ZJS
1032 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1033 "Invalid hostname: %s", optarg);
3a9530e5
LP
1034
1035 r = free_and_strdup(&arg_hostname, optarg);
1036 if (r < 0)
1037 return log_oom();
1038 }
1039
1040 arg_settings_mask |= SETTING_HOSTNAME;
1041 break;
1042
82adf6af
LP
1043 case 'Z':
1044 arg_selinux_context = optarg;
a8828ed9
DW
1045 break;
1046
82adf6af
LP
1047 case 'L':
1048 arg_selinux_apifs_context = optarg;
a8828ed9
DW
1049 break;
1050
bc2f673e
LP
1051 case ARG_READ_ONLY:
1052 arg_read_only = true;
f757855e 1053 arg_settings_mask |= SETTING_READ_ONLY;
bc2f673e
LP
1054 break;
1055
88fc9c9b
TH
1056 case ARG_AMBIENT_CAPABILITY: {
1057 uint64_t m;
1058 r = parse_capability_spec(optarg, &m);
1059 if (r <= 0)
1060 return r;
1061 arg_caps_ambient |= m;
1062 arg_settings_mask |= SETTING_CAPABILITY;
1063 break;
1064 }
420c7379
LP
1065 case ARG_CAPABILITY:
1066 case ARG_DROP_CAPABILITY: {
8a99bd0c
ZJS
1067 uint64_t m;
1068 r = parse_capability_spec(optarg, &m);
1069 if (r <= 0)
1070 return r;
5076f0cc 1071
8a99bd0c
ZJS
1072 if (c == ARG_CAPABILITY)
1073 plus |= m;
1074 else
1075 minus |= m;
f757855e 1076 arg_settings_mask |= SETTING_CAPABILITY;
5076f0cc
LP
1077 break;
1078 }
66edd963
LP
1079 case ARG_NO_NEW_PRIVILEGES:
1080 r = parse_boolean(optarg);
1081 if (r < 0)
1082 return log_error_errno(r, "Failed to parse --no-new-privileges= argument: %s", optarg);
1083
1084 arg_no_new_privileges = r;
1085 arg_settings_mask |= SETTING_NO_NEW_PRIVILEGES;
1086 break;
1087
57fb9fb5
LP
1088 case 'j':
1089 arg_link_journal = LINK_GUEST;
574edc90 1090 arg_link_journal_try = true;
4e1d6aa9 1091 arg_settings_mask |= SETTING_LINK_JOURNAL;
57fb9fb5
LP
1092 break;
1093
1094 case ARG_LINK_JOURNAL:
4e1d6aa9 1095 r = parse_link_journal(optarg, &arg_link_journal, &arg_link_journal_try);
c6147113
LP
1096 if (r < 0)
1097 return log_error_errno(r, "Failed to parse link journal mode %s", optarg);
57fb9fb5 1098
4e1d6aa9 1099 arg_settings_mask |= SETTING_LINK_JOURNAL;
57fb9fb5
LP
1100 break;
1101
17fe0523 1102 case ARG_BIND:
f757855e
LP
1103 case ARG_BIND_RO:
1104 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
1105 if (r < 0)
1106 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
17fe0523 1107
f757855e 1108 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
17fe0523 1109 break;
06c17c39 1110
f757855e
LP
1111 case ARG_TMPFS:
1112 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
1113 if (r < 0)
1114 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
5a8af538 1115
f757855e 1116 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
5a8af538 1117 break;
5a8af538
LP
1118
1119 case ARG_OVERLAY:
ad85779a
LP
1120 case ARG_OVERLAY_RO:
1121 r = overlay_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_OVERLAY_RO);
1122 if (r == -EADDRNOTAVAIL)
1123 return log_error_errno(r, "--overlay(-ro)= needs at least two colon-separated directories specified.");
1124 if (r < 0)
1125 return log_error_errno(r, "Failed to parse --overlay(-ro)= argument %s: %m", optarg);
06c17c39 1126
f757855e 1127 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
06c17c39 1128 break;
06c17c39 1129
de40a303
LP
1130 case ARG_INACCESSIBLE:
1131 r = inaccessible_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
1132 if (r < 0)
1133 return log_error_errno(r, "Failed to parse --inaccessible= argument %s: %m", optarg);
1134
1135 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1136 break;
1137
0d2a0179
ZJS
1138 case 'E':
1139 r = strv_env_replace_strdup_passthrough(&arg_setenv, optarg);
aaf057c4 1140 if (r < 0)
0d2a0179 1141 return log_error_errno(r, "Cannot assign environment variable %s: %m", optarg);
f4889f65 1142
f757855e 1143 arg_settings_mask |= SETTING_ENVIRONMENT;
f4889f65 1144 break;
f4889f65 1145
284c0b91
LP
1146 case 'q':
1147 arg_quiet = true;
1148 break;
1149
8a96d94e 1150 case ARG_SHARE_SYSTEM:
a6b5216c 1151 /* We don't officially support this anymore, except for compat reasons. People should use the
0c582db0 1152 * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
63d1c29f 1153 log_warning("Please do not use --share-system anymore, use $SYSTEMD_NSPAWN_SHARE_* instead.");
0c582db0 1154 arg_clone_ns_flags = 0;
8a96d94e
LP
1155 break;
1156
eb91eb18
LP
1157 case ARG_REGISTER:
1158 r = parse_boolean(optarg);
1159 if (r < 0) {
1160 log_error("Failed to parse --register= argument: %s", optarg);
1161 return r;
1162 }
1163
1164 arg_register = r;
1165 break;
1166
89f7c846
LP
1167 case ARG_KEEP_UNIT:
1168 arg_keep_unit = true;
1169 break;
1170
6afc95b7
LP
1171 case ARG_PERSONALITY:
1172
ac45f971 1173 arg_personality = personality_from_string(optarg);
baaa35ad
ZJS
1174 if (arg_personality == PERSONALITY_INVALID)
1175 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1176 "Unknown or unsupported personality '%s'.", optarg);
6afc95b7 1177
f757855e 1178 arg_settings_mask |= SETTING_PERSONALITY;
6afc95b7
LP
1179 break;
1180
4d9f07b4
LP
1181 case ARG_VOLATILE:
1182
1183 if (!optarg)
f757855e 1184 arg_volatile_mode = VOLATILE_YES;
5c828e66
LP
1185 else if (streq(optarg, "help")) {
1186 DUMP_STRING_TABLE(volatile_mode, VolatileMode, _VOLATILE_MODE_MAX);
1187 return 0;
1188 } else {
f757855e 1189 VolatileMode m;
4d9f07b4 1190
f757855e 1191 m = volatile_mode_from_string(optarg);
baaa35ad
ZJS
1192 if (m < 0)
1193 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1194 "Failed to parse --volatile= argument: %s", optarg);
1195 else
f757855e 1196 arg_volatile_mode = m;
6d0b55c2
LP
1197 }
1198
f757855e
LP
1199 arg_settings_mask |= SETTING_VOLATILE_MODE;
1200 break;
6d0b55c2 1201
f757855e
LP
1202 case 'p':
1203 r = expose_port_parse(&arg_expose_ports, optarg);
1204 if (r == -EEXIST)
1205 return log_error_errno(r, "Duplicate port specification: %s", optarg);
1206 if (r < 0)
1207 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
6d0b55c2 1208
f757855e 1209 arg_settings_mask |= SETTING_EXPOSE_PORTS;
6d0b55c2 1210 break;
6d0b55c2 1211
f36933fe
LP
1212 case ARG_PROPERTY:
1213 if (strv_extend(&arg_property, optarg) < 0)
1214 return log_oom();
1215
1216 break;
1217
ae209204 1218 case ARG_PRIVATE_USERS: {
33eac552 1219 int boolean;
0de7acce 1220
ae209204
ZJS
1221 if (!optarg)
1222 boolean = true;
1223 else if (!in_charset(optarg, DIGITS))
1224 /* do *not* parse numbers as booleans */
1225 boolean = parse_boolean(optarg);
33eac552
LP
1226 else
1227 boolean = -1;
ae209204 1228
33eac552 1229 if (boolean == 0) {
0de7acce
LP
1230 /* no: User namespacing off */
1231 arg_userns_mode = USER_NAMESPACE_NO;
1232 arg_uid_shift = UID_INVALID;
1233 arg_uid_range = UINT32_C(0x10000);
33eac552 1234 } else if (boolean > 0) {
0de7acce
LP
1235 /* yes: User namespacing on, UID range is read from root dir */
1236 arg_userns_mode = USER_NAMESPACE_FIXED;
1237 arg_uid_shift = UID_INVALID;
1238 arg_uid_range = UINT32_C(0x10000);
1239 } else if (streq(optarg, "pick")) {
1240 /* pick: User namespacing on, UID range is picked randomly */
6c045a99
LP
1241 arg_userns_mode = USER_NAMESPACE_PICK; /* Note that arg_userns_ownership is
1242 * implied by USER_NAMESPACE_PICK
33eac552 1243 * further down. */
0de7acce
LP
1244 arg_uid_shift = UID_INVALID;
1245 arg_uid_range = UINT32_C(0x10000);
33eac552
LP
1246
1247 } else if (streq(optarg, "identity")) {
1248 /* identity: User namespacing on, the 0…0xFFFF UID range is mapped to
1249 * itself, i.e. we don't actually map anything, but do take benefit of
1250 * isolation of capability sets. */
1251 arg_userns_mode = USER_NAMESPACE_FIXED;
1252 arg_uid_shift = 0;
1253 arg_uid_range = UINT32_C(0x10000);
0de7acce 1254 } else {
6c2058b3 1255 _cleanup_free_ char *buffer = NULL;
6dac160c
LP
1256 const char *range, *shift;
1257
0de7acce
LP
1258 /* anything else: User namespacing on, UID range is explicitly configured */
1259
6dac160c
LP
1260 range = strchr(optarg, ':');
1261 if (range) {
6c2058b3
ZJS
1262 buffer = strndup(optarg, range - optarg);
1263 if (!buffer)
1264 return log_oom();
1265 shift = buffer;
6dac160c
LP
1266
1267 range++;
bfd292ec
ZJS
1268 r = safe_atou32(range, &arg_uid_range);
1269 if (r < 0)
be715731 1270 return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range);
6dac160c
LP
1271 } else
1272 shift = optarg;
1273
be715731
ZJS
1274 r = parse_uid(shift, &arg_uid_shift);
1275 if (r < 0)
1276 return log_error_errno(r, "Failed to parse UID \"%s\": %m", optarg);
0de7acce
LP
1277
1278 arg_userns_mode = USER_NAMESPACE_FIXED;
6dac160c 1279
58e13de5
LP
1280 if (!userns_shift_range_valid(arg_uid_shift, arg_uid_range))
1281 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "UID range cannot be empty or go beyond " UID_FMT ".", UID_INVALID);
1282 }
be715731 1283
0de7acce 1284 arg_settings_mask |= SETTING_USERNS;
6dac160c 1285 break;
ae209204 1286 }
6dac160c 1287
0de7acce 1288 case 'U':
ccabee0d 1289 if (userns_supported()) {
6c045a99
LP
1290 arg_userns_mode = USER_NAMESPACE_PICK; /* Note that arg_userns_ownership is
1291 * implied by USER_NAMESPACE_PICK
33eac552 1292 * further down. */
ccabee0d
LP
1293 arg_uid_shift = UID_INVALID;
1294 arg_uid_range = UINT32_C(0x10000);
1295
1296 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
1297 }
1298
7336138e
LP
1299 break;
1300
0de7acce 1301 case ARG_PRIVATE_USERS_CHOWN:
6c045a99
LP
1302 arg_userns_ownership = USER_NAMESPACE_OWNERSHIP_CHOWN;
1303
1304 arg_settings_mask |= SETTING_USERNS;
1305 break;
1306
1307 case ARG_PRIVATE_USERS_OWNERSHIP:
1308 if (streq(optarg, "help")) {
1309 DUMP_STRING_TABLE(user_namespace_ownership, UserNamespaceOwnership, _USER_NAMESPACE_OWNERSHIP_MAX);
1310 return 0;
1311 }
1312
1313 arg_userns_ownership = user_namespace_ownership_from_string(optarg);
1314 if (arg_userns_ownership < 0)
1315 return log_error_errno(arg_userns_ownership, "Cannot parse --user-namespace-ownership= value: %s", optarg);
0de7acce
LP
1316
1317 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
1318 break;
1319
c6c8f6e2 1320 case ARG_KILL_SIGNAL:
5c828e66
LP
1321 if (streq(optarg, "help")) {
1322 DUMP_STRING_TABLE(signal, int, _NSIG);
1323 return 0;
1324 }
1325
29a3db75 1326 arg_kill_signal = signal_from_string(optarg);
baaa35ad 1327 if (arg_kill_signal < 0)
7211c853 1328 return log_error_errno(arg_kill_signal, "Cannot parse signal: %s", optarg);
c6c8f6e2 1329
f757855e
LP
1330 arg_settings_mask |= SETTING_KILL_SIGNAL;
1331 break;
1332
1333 case ARG_SETTINGS:
1334
1335 /* no → do not read files
1336 * yes → read files, do not override cmdline, trust only subset
1337 * override → read files, override cmdline, trust only subset
1338 * trusted → read files, do not override cmdline, trust all
1339 */
1340
1341 r = parse_boolean(optarg);
1342 if (r < 0) {
1343 if (streq(optarg, "trusted")) {
1344 mask_all_settings = false;
1345 mask_no_settings = false;
1346 arg_settings_trusted = true;
1347
1348 } else if (streq(optarg, "override")) {
1349 mask_all_settings = false;
1350 mask_no_settings = true;
1351 arg_settings_trusted = -1;
1352 } else
1353 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
1354 } else if (r > 0) {
1355 /* yes */
1356 mask_all_settings = false;
1357 mask_no_settings = false;
1358 arg_settings_trusted = -1;
1359 } else {
1360 /* no */
1361 mask_all_settings = true;
1362 mask_no_settings = false;
1363 arg_settings_trusted = false;
1364 }
1365
c6c8f6e2
LP
1366 break;
1367
5f932eb9 1368 case ARG_CHDIR:
baaa35ad
ZJS
1369 if (!path_is_absolute(optarg))
1370 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1371 "Working directory %s is not an absolute path.", optarg);
5f932eb9
LP
1372
1373 r = free_and_strdup(&arg_chdir, optarg);
1374 if (r < 0)
1375 return log_oom();
1376
1377 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
1378 break;
1379
b53ede69
PW
1380 case ARG_PIVOT_ROOT:
1381 r = pivot_root_parse(&arg_pivot_root_new, &arg_pivot_root_old, optarg);
1382 if (r < 0)
1383 return log_error_errno(r, "Failed to parse --pivot-root= argument %s: %m", optarg);
1384
1385 arg_settings_mask |= SETTING_PIVOT_ROOT;
1386 break;
1387
9c1e04d0
AP
1388 case ARG_NOTIFY_READY:
1389 r = parse_boolean(optarg);
baaa35ad
ZJS
1390 if (r < 0)
1391 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1392 "%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
9c1e04d0
AP
1393 arg_notify_ready = r;
1394 arg_settings_mask |= SETTING_NOTIFY_READY;
1395 break;
1396
4623e8e6 1397 case ARG_ROOT_HASH: {
89e62e0b 1398 _cleanup_free_ void *k = NULL;
4623e8e6
LP
1399 size_t l;
1400
1401 r = unhexmem(optarg, strlen(optarg), &k, &l);
1402 if (r < 0)
1403 return log_error_errno(r, "Failed to parse root hash: %s", optarg);
89e62e0b 1404 if (l < sizeof(sd_id128_t))
c6147113 1405 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Root hash must be at least 128bit long: %s", optarg);
4623e8e6 1406
89e62e0b
LP
1407 free_and_replace(arg_verity_settings.root_hash, k);
1408 arg_verity_settings.root_hash_size = l;
4623e8e6
LP
1409 break;
1410 }
1411
c2923fdc
LB
1412 case ARG_ROOT_HASH_SIG: {
1413 char *value;
89e62e0b
LP
1414 size_t l;
1415 void *p;
c2923fdc
LB
1416
1417 if ((value = startswith(optarg, "base64:"))) {
c2923fdc
LB
1418 r = unbase64mem(value, strlen(value), &p, &l);
1419 if (r < 0)
1420 return log_error_errno(r, "Failed to parse root hash signature '%s': %m", optarg);
1421
c2923fdc 1422 } else {
89e62e0b 1423 r = read_full_file(optarg, (char**) &p, &l);
c2923fdc 1424 if (r < 0)
89e62e0b 1425 return log_error_errno(r, "Failed parse root hash signature file '%s': %m", optarg);
c2923fdc
LB
1426 }
1427
89e62e0b
LP
1428 free_and_replace(arg_verity_settings.root_hash_sig, p);
1429 arg_verity_settings.root_hash_sig_size = l;
c2923fdc
LB
1430 break;
1431 }
1432
89e62e0b 1433 case ARG_VERITY_DATA:
614b022c 1434 r = parse_path_argument(optarg, false, &arg_verity_settings.data_path);
89e62e0b
LP
1435 if (r < 0)
1436 return r;
1437 break;
1438
960e4569
LP
1439 case ARG_SYSTEM_CALL_FILTER: {
1440 bool negative;
1441 const char *items;
1442
1443 negative = optarg[0] == '~';
1444 items = negative ? optarg + 1 : optarg;
1445
1446 for (;;) {
1447 _cleanup_free_ char *word = NULL;
1448
1449 r = extract_first_word(&items, &word, NULL, 0);
1450 if (r == 0)
1451 break;
1452 if (r == -ENOMEM)
1453 return log_oom();
1454 if (r < 0)
1455 return log_error_errno(r, "Failed to parse system call filter: %m");
1456
1457 if (negative)
6b000af4 1458 r = strv_extend(&arg_syscall_deny_list, word);
960e4569 1459 else
6b000af4 1460 r = strv_extend(&arg_syscall_allow_list, word);
960e4569
LP
1461 if (r < 0)
1462 return log_oom();
1463 }
1464
1465 arg_settings_mask |= SETTING_SYSCALL_FILTER;
1466 break;
1467 }
1468
bf428efb
LP
1469 case ARG_RLIMIT: {
1470 const char *eq;
622ecfa8 1471 _cleanup_free_ char *name = NULL;
bf428efb
LP
1472 int rl;
1473
5c828e66
LP
1474 if (streq(optarg, "help")) {
1475 DUMP_STRING_TABLE(rlimit, int, _RLIMIT_MAX);
1476 return 0;
1477 }
1478
bf428efb 1479 eq = strchr(optarg, '=');
baaa35ad
ZJS
1480 if (!eq)
1481 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1482 "--rlimit= expects an '=' assignment.");
bf428efb
LP
1483
1484 name = strndup(optarg, eq - optarg);
1485 if (!name)
1486 return log_oom();
1487
1488 rl = rlimit_from_string_harder(name);
baaa35ad 1489 if (rl < 0)
7211c853 1490 return log_error_errno(rl, "Unknown resource limit: %s", name);
bf428efb
LP
1491
1492 if (!arg_rlimit[rl]) {
1493 arg_rlimit[rl] = new0(struct rlimit, 1);
1494 if (!arg_rlimit[rl])
1495 return log_oom();
1496 }
1497
1498 r = rlimit_parse(rl, eq + 1, arg_rlimit[rl]);
1499 if (r < 0)
1500 return log_error_errno(r, "Failed to parse resource limit: %s", eq + 1);
1501
1502 arg_settings_mask |= SETTING_RLIMIT_FIRST << rl;
1503 break;
1504 }
1505
81f345df
LP
1506 case ARG_OOM_SCORE_ADJUST:
1507 r = parse_oom_score_adjust(optarg, &arg_oom_score_adjust);
1508 if (r < 0)
1509 return log_error_errno(r, "Failed to parse --oom-score-adjust= parameter: %s", optarg);
1510
1511 arg_oom_score_adjust_set = true;
1512 arg_settings_mask |= SETTING_OOM_SCORE_ADJUST;
1513 break;
1514
d107bb7d 1515 case ARG_CPU_AFFINITY: {
0985c7c4 1516 CPUSet cpuset;
d107bb7d
LP
1517
1518 r = parse_cpu_set(optarg, &cpuset);
1519 if (r < 0)
0985c7c4 1520 return log_error_errno(r, "Failed to parse CPU affinity mask %s: %m", optarg);
d107bb7d 1521
0985c7c4
ZJS
1522 cpu_set_reset(&arg_cpu_set);
1523 arg_cpu_set = cpuset;
d107bb7d
LP
1524 arg_settings_mask |= SETTING_CPU_AFFINITY;
1525 break;
1526 }
1527
09d423e9
LP
1528 case ARG_RESOLV_CONF:
1529 if (streq(optarg, "help")) {
1530 DUMP_STRING_TABLE(resolv_conf_mode, ResolvConfMode, _RESOLV_CONF_MODE_MAX);
1531 return 0;
1532 }
1533
1534 arg_resolv_conf = resolv_conf_mode_from_string(optarg);
baaa35ad 1535 if (arg_resolv_conf < 0)
7211c853 1536 return log_error_errno(arg_resolv_conf,
baaa35ad 1537 "Failed to parse /etc/resolv.conf mode: %s", optarg);
09d423e9
LP
1538
1539 arg_settings_mask |= SETTING_RESOLV_CONF;
1540 break;
1541
1688841f
LP
1542 case ARG_TIMEZONE:
1543 if (streq(optarg, "help")) {
1544 DUMP_STRING_TABLE(timezone_mode, TimezoneMode, _TIMEZONE_MODE_MAX);
1545 return 0;
1546 }
1547
1548 arg_timezone = timezone_mode_from_string(optarg);
baaa35ad 1549 if (arg_timezone < 0)
7211c853 1550 return log_error_errno(arg_timezone,
baaa35ad 1551 "Failed to parse /etc/localtime mode: %s", optarg);
1688841f
LP
1552
1553 arg_settings_mask |= SETTING_TIMEZONE;
1554 break;
1555
de40a303 1556 case ARG_CONSOLE:
dce66ffe
ZJS
1557 r = handle_arg_console(optarg);
1558 if (r <= 0)
1559 return r;
de40a303
LP
1560 break;
1561
1562 case 'P':
1563 case ARG_PIPE:
dce66ffe
ZJS
1564 r = handle_arg_console("pipe");
1565 if (r <= 0)
1566 return r;
de40a303
LP
1567 break;
1568
bb068de0
ZJS
1569 case ARG_NO_PAGER:
1570 arg_pager_flags |= PAGER_DISABLE;
1571 break;
1572
3652872a
LP
1573 case ARG_SET_CREDENTIAL: {
1574 _cleanup_free_ char *word = NULL, *data = NULL;
1575 const char *p = optarg;
1576 Credential *a;
e437538f 1577 ssize_t l;
3652872a
LP
1578
1579 r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
1580 if (r == -ENOMEM)
1581 return log_oom();
1582 if (r < 0)
1583 return log_error_errno(r, "Failed to parse --set-credential= parameter: %m");
1584 if (r == 0 || !p)
1585 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Missing value for --set-credential=: %s", optarg);
1586
1587 if (!credential_name_valid(word))
1588 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Credential name is not valid: %s", word);
1589
12d729b2 1590 for (size_t i = 0; i < arg_n_credentials; i++)
3652872a
LP
1591 if (streq(arg_credentials[i].id, word))
1592 return log_error_errno(SYNTHETIC_ERRNO(EEXIST), "Duplicate credential '%s', refusing.", word);
1593
1594 l = cunescape(p, UNESCAPE_ACCEPT_NUL, &data);
1595 if (l < 0)
1596 return log_error_errno(l, "Failed to unescape credential data: %s", p);
1597
1598 a = reallocarray(arg_credentials, arg_n_credentials + 1, sizeof(Credential));
1599 if (!a)
1600 return log_oom();
1601
1602 a[arg_n_credentials++] = (Credential) {
1603 .id = TAKE_PTR(word),
1604 .data = TAKE_PTR(data),
1605 .size = l,
1606 };
1607
1608 arg_credentials = a;
1609
1610 arg_settings_mask |= SETTING_CREDENTIALS;
1611 break;
1612 }
1613
1614 case ARG_LOAD_CREDENTIAL: {
1615 ReadFullFileFlags flags = READ_FULL_FILE_SECURE;
1616 _cleanup_(erase_and_freep) char *data = NULL;
1617 _cleanup_free_ char *word = NULL, *j = NULL;
1618 const char *p = optarg;
1619 Credential *a;
1620 size_t size, i;
1621
1622 r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
1623 if (r == -ENOMEM)
1624 return log_oom();
1625 if (r < 0)
1626 return log_error_errno(r, "Failed to parse --set-credential= parameter: %m");
1627 if (r == 0 || !p)
1628 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Missing value for --set-credential=: %s", optarg);
1629
1630 if (!credential_name_valid(word))
1631 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Credential name is not valid: %s", word);
1632
1633 for (i = 0; i < arg_n_credentials; i++)
1634 if (streq(arg_credentials[i].id, word))
1635 return log_error_errno(SYNTHETIC_ERRNO(EEXIST), "Duplicate credential '%s', refusing.", word);
1636
1637 if (path_is_absolute(p))
1638 flags |= READ_FULL_FILE_CONNECT_SOCKET;
1639 else {
1640 const char *e;
1641
786d19fd
LP
1642 r = get_credentials_dir(&e);
1643 if (r < 0)
1644 return log_error_errno(r, "Credential not available (no credentials passed at all): %s", word);
3652872a
LP
1645
1646 j = path_join(e, p);
1647 if (!j)
1648 return log_oom();
1649 }
1650
986311c2
LP
1651 r = read_full_file_full(AT_FDCWD, j ?: p, UINT64_MAX, SIZE_MAX,
1652 flags,
1653 NULL,
1654 &data, &size);
3652872a
LP
1655 if (r < 0)
1656 return log_error_errno(r, "Failed to read credential '%s': %m", j ?: p);
1657
1658 a = reallocarray(arg_credentials, arg_n_credentials + 1, sizeof(Credential));
1659 if (!a)
1660 return log_oom();
1661
1662 a[arg_n_credentials++] = (Credential) {
1663 .id = TAKE_PTR(word),
1664 .data = TAKE_PTR(data),
1665 .size = size,
1666 };
1667
1668 arg_credentials = a;
1669
1670 arg_settings_mask |= SETTING_CREDENTIALS;
1671 break;
1672 }
1673
2f893044
LP
1674 case ARG_BIND_USER:
1675 if (!valid_user_group_name(optarg, 0))
1676 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid user name to bind: %s", optarg);
1677
1678 if (strv_extend(&arg_bind_user, optarg) < 0)
1679 return log_oom();
1680
1681 arg_settings_mask |= SETTING_BIND_USER;
1682 break;
1683
4a4654e0
LP
1684 case ARG_SUPPRESS_SYNC:
1685 r = parse_boolean_argument("--suppress-sync=", optarg, &arg_suppress_sync);
1686 if (r < 0)
1687 return r;
1688
1689 arg_settings_mask |= SETTING_SUPPRESS_SYNC;
1690 break;
1691
88213476
LP
1692 case '?':
1693 return -EINVAL;
1694
1695 default:
04499a70 1696 assert_not_reached();
88213476 1697 }
88213476 1698
60f1ec13
LP
1699 if (argc > optind) {
1700 strv_free(arg_parameters);
1701 arg_parameters = strv_copy(argv + optind);
1702 if (!arg_parameters)
1703 return log_oom();
d7bea6b6 1704
60f1ec13
LP
1705 arg_settings_mask |= SETTING_START_MODE;
1706 }
1707
1708 if (arg_ephemeral && arg_template && !arg_directory)
1709 /* User asked for ephemeral execution but specified --template= instead of --directory=. Semantically
1710 * such an invocation makes some sense, see https://github.com/systemd/systemd/issues/3667. Let's
1711 * accept this here, and silently make "--ephemeral --template=" equivalent to "--ephemeral
1712 * --directory=". */
1713 arg_directory = TAKE_PTR(arg_template);
1714
bd4b15f2 1715 arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? UINT64_C(1) << CAP_NET_ADMIN : 0)) & ~minus;
60f1ec13 1716
de40a303 1717 /* Make sure to parse environment before we reset the settings mask below */
49048684
ZJS
1718 r = parse_environment();
1719 if (r < 0)
1720 return r;
de40a303 1721
60f1ec13
LP
1722 /* Load all settings from .nspawn files */
1723 if (mask_no_settings)
1724 arg_settings_mask = 0;
1725
1726 /* Don't load any settings from .nspawn files */
1727 if (mask_all_settings)
1728 arg_settings_mask = _SETTINGS_MASK_ALL;
1729
1730 return 1;
1731}
1732
1733static int verify_arguments(void) {
1734 int r;
a6b5216c 1735
75b0d8b8
ZJS
1736 if (arg_start_mode == START_PID2 && arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
1737 /* If we are running the stub init in the container, we don't need to look at what the init
1738 * in the container supports, because we are not using it. Let's immediately pick the right
1739 * setting based on the host system configuration.
1740 *
1741 * We only do this, if the user didn't use an environment variable to override the detection.
1742 */
1743
1744 r = cg_all_unified();
1745 if (r < 0)
1746 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
1747 if (r > 0)
1748 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
1749 else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0)
1750 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
1751 else
1752 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
1753 }
1754
4f086aab
SU
1755 if (arg_userns_mode != USER_NAMESPACE_NO)
1756 arg_mount_settings |= MOUNT_USE_USERNS;
1757
1758 if (arg_private_network)
1759 arg_mount_settings |= MOUNT_APPLY_APIVFS_NETNS;
1760
48a8d337
LB
1761 if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
1762 !(arg_clone_ns_flags & CLONE_NEWUTS)) {
eb91eb18 1763 arg_register = false;
baaa35ad 1764 if (arg_start_mode != START_PID1)
60f1ec13 1765 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--boot cannot be used without namespacing.");
0c582db0 1766 }
eb91eb18 1767
6c045a99
LP
1768 if (arg_userns_ownership < 0)
1769 arg_userns_ownership =
f61c7f88 1770 arg_userns_mode == USER_NAMESPACE_PICK ? USER_NAMESPACE_OWNERSHIP_AUTO :
6c045a99 1771 USER_NAMESPACE_OWNERSHIP_OFF;
0e7ac751 1772
60f1ec13
LP
1773 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
1774 arg_kill_signal = SIGRTMIN+3;
1775
e5a4bb0d
LP
1776 if (arg_volatile_mode != VOLATILE_NO) /* Make sure all file systems contained in the image are mounted read-only if we are in volatile mode */
1777 arg_read_only = true;
1778
2436ea76
DDM
1779 if (has_custom_root_mount(arg_custom_mounts, arg_n_custom_mounts))
1780 arg_read_only = true;
1781
baaa35ad 1782 if (arg_keep_unit && arg_register && cg_pid_get_owner_uid(0, NULL) >= 0)
8d9c2bca
AJ
1783 /* Save the user from accidentally registering either user-$SESSION.scope or user@.service.
1784 * The latter is not technically a user session, but we don't need to labour the point. */
60f1ec13 1785 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--keep-unit --register=yes may not be used when invoked from a user session.");
89f7c846 1786
baaa35ad 1787 if (arg_directory && arg_image)
60f1ec13 1788 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--directory= and --image= may not be combined.");
1b9e5b12 1789
baaa35ad 1790 if (arg_template && arg_image)
60f1ec13 1791 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= and --image= may not be combined.");
8cd328d8 1792
baaa35ad 1793 if (arg_template && !(arg_directory || arg_machine))
60f1ec13 1794 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= needs --directory= or --machine=.");
ec16945e 1795
baaa35ad 1796 if (arg_ephemeral && arg_template)
60f1ec13 1797 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --template= may not be combined.");
ec16945e 1798
baaa35ad 1799 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO))
60f1ec13 1800 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --link-journal= may not be combined.");
df9a75e4 1801
baaa35ad 1802 if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported())
60f1ec13 1803 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--private-users= is not supported, kernel compiled without user namespace support.");
7336138e 1804
6c045a99 1805 if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_CHOWN && arg_read_only)
de40a303 1806 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
6c045a99 1807 "--read-only and --private-users-ownership=chown may not be combined.");
f757855e 1808
6c045a99
LP
1809 /* We don't support --private-users-ownership=chown together with any of the volatile modes since we
1810 * couldn't change the read-only part of the tree (i.e. /usr) anyway, or because it would trigger a
1811 * massive copy-up (in case of overlay) making the entire exercise pointless. */
1812 if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_CHOWN && arg_volatile_mode != VOLATILE_NO)
1813 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--volatile= and --private-users-ownership=chown may not be combined.");
e5a4bb0d 1814
679ecd36
SZ
1815 /* If --network-namespace-path is given with any other network-related option (except --private-network),
1816 * we need to error out, to avoid conflicts between different network options. */
60f1ec13
LP
1817 if (arg_network_namespace_path &&
1818 (arg_network_interfaces || arg_network_macvlan ||
1819 arg_network_ipvlan || arg_network_veth_extra ||
1820 arg_network_bridge || arg_network_zone ||
679ecd36 1821 arg_network_veth))
de40a303 1822 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--network-namespace-path= cannot be combined with other network options.");
86c0dd4a 1823
60f1ec13 1824 if (arg_network_bridge && arg_network_zone)
de40a303
LP
1825 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1826 "--network-bridge= and --network-zone= may not be combined.");
f757855e 1827
baaa35ad 1828 if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network)
60f1ec13 1829 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
4f086aab 1830
baaa35ad 1831 if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO))
60f1ec13 1832 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot combine --private-users with read-write mounts.");
f757855e 1833
baaa35ad 1834 if (arg_expose_ports && !arg_private_network)
60f1ec13 1835 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot use --port= without private networking.");
6d0b55c2 1836
88fc9c9b 1837 if (arg_caps_ambient) {
f5fbe71d 1838 if (arg_caps_ambient == UINT64_MAX)
88fc9c9b
TH
1839 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= does not support the value all.");
1840
1841 if ((arg_caps_ambient & arg_caps_retain) != arg_caps_ambient)
1842 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= setting is not fully covered by Capability= setting.");
1843
1844 if (arg_start_mode == START_BOOT)
1845 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= setting is not useful for boot mode.");
1846 }
1847
2f893044
LP
1848 if (arg_userns_mode == USER_NAMESPACE_NO && !strv_isempty(arg_bind_user))
1849 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--bind-user= requires --private-users");
1850
1851 /* Drop duplicate --bind-user= entries */
1852 strv_uniq(arg_bind_user);
1853
60f1ec13
LP
1854 r = custom_mount_check_all();
1855 if (r < 0)
1856 return r;
c6c8f6e2 1857
f757855e 1858 return 0;
88213476
LP
1859}
1860
91181e07 1861int userns_lchown(const char *p, uid_t uid, gid_t gid) {
03cfe0d5
LP
1862 assert(p);
1863
0de7acce 1864 if (arg_userns_mode == USER_NAMESPACE_NO)
03cfe0d5
LP
1865 return 0;
1866
1867 if (uid == UID_INVALID && gid == GID_INVALID)
1868 return 0;
1869
1870 if (uid != UID_INVALID) {
1871 uid += arg_uid_shift;
1872
1873 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1874 return -EOVERFLOW;
1875 }
1876
1877 if (gid != GID_INVALID) {
1878 gid += (gid_t) arg_uid_shift;
1879
1880 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1881 return -EOVERFLOW;
1882 }
1883
7c248223 1884 return RET_NERRNO(lchown(p, uid, gid));
b12afc8c
LP
1885}
1886
/* Creates the directory "path" below "root" with the given mode and hands it to userns_lchown()
 * for the given uid/gid. An already existing directory is treated as success and is left
 * untouched (no chown). Returns 0 on success, negative errno on failure. */
int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;
        int r;

        full = prefix_roota(root, path);

        r = RET_NERRNO(mkdir(full, mode));
        if (r < 0)
                return r == -EEXIST ? 0 : r;

        return userns_lchown(full, uid, gid);
}
1900
/* Matches path against the two accepted zoneinfo prefixes and returns whatever
 * PATH_STARTSWITH_SET() yields for it (callers treat NULL as "not a zoneinfo path"). */
static const char *timezone_from_path(const char *path) {
        const char *zone;

        zone = PATH_STARTSWITH_SET(
                        path,
                        "../usr/share/zoneinfo/",
                        "/usr/share/zoneinfo/");

        return zone;
}
1907
83205269
LP
1908static bool etc_writable(void) {
1909 return !arg_read_only || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_OVERLAY);
1910}
1911
e58a1277 1912static int setup_timezone(const char *dest) {
1688841f
LP
1913 _cleanup_free_ char *p = NULL, *etc = NULL;
1914 const char *where, *check;
1915 TimezoneMode m;
d4036145 1916 int r;
f8440af5 1917
e58a1277
LP
1918 assert(dest);
1919
1688841f 1920 if (IN_SET(arg_timezone, TIMEZONE_AUTO, TIMEZONE_SYMLINK)) {
1688841f
LP
1921 r = readlink_malloc("/etc/localtime", &p);
1922 if (r == -ENOENT && arg_timezone == TIMEZONE_AUTO)
83205269 1923 m = etc_writable() ? TIMEZONE_DELETE : TIMEZONE_OFF;
1688841f 1924 else if (r == -EINVAL && arg_timezone == TIMEZONE_AUTO) /* regular file? */
83205269 1925 m = etc_writable() ? TIMEZONE_COPY : TIMEZONE_BIND;
1688841f
LP
1926 else if (r < 0) {
1927 log_warning_errno(r, "Failed to read host's /etc/localtime symlink, not updating container timezone: %m");
1928 /* To handle warning, delete /etc/localtime and replace it with a symbolic link to a time zone data
1929 * file.
1930 *
1931 * Example:
1932 * ln -s /usr/share/zoneinfo/UTC /etc/localtime
1933 */
1934 return 0;
1935 } else if (arg_timezone == TIMEZONE_AUTO)
83205269 1936 m = etc_writable() ? TIMEZONE_SYMLINK : TIMEZONE_BIND;
1688841f
LP
1937 else
1938 m = arg_timezone;
1939 } else
1940 m = arg_timezone;
1941
1942 if (m == TIMEZONE_OFF)
1943 return 0;
1944
a5648b80 1945 r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
d4036145 1946 if (r < 0) {
1688841f 1947 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
d4036145
LP
1948 return 0;
1949 }
1950
1688841f
LP
1951 where = strjoina(etc, "/localtime");
1952
1953 switch (m) {
1954
1955 case TIMEZONE_DELETE:
1956 if (unlink(where) < 0)
1957 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
1958
d4036145 1959 return 0;
d4036145 1960
1688841f
LP
1961 case TIMEZONE_SYMLINK: {
1962 _cleanup_free_ char *q = NULL;
1963 const char *z, *what;
4d1c38b8 1964
1688841f
LP
1965 z = timezone_from_path(p);
1966 if (!z) {
1967 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
d4036145 1968 return 0;
1688841f 1969 }
d4036145 1970
1688841f
LP
1971 r = readlink_malloc(where, &q);
1972 if (r >= 0 && streq_ptr(timezone_from_path(q), z))
1973 return 0; /* Already pointing to the right place? Then do nothing .. */
1974
1975 check = strjoina(dest, "/usr/share/zoneinfo/", z);
a5648b80 1976 r = chase_symlinks(check, dest, 0, NULL, NULL);
1688841f
LP
1977 if (r < 0)
1978 log_debug_errno(r, "Timezone %s does not exist (or is not accessible) in container, not creating symlink: %m", z);
1979 else {
1980 if (unlink(where) < 0 && errno != ENOENT) {
1981 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING, /* Don't complain on read-only images */
1982 errno, "Failed to remove existing timezone info %s in container, ignoring: %m", where);
1983 return 0;
1984 }
1985
1986 what = strjoina("../usr/share/zoneinfo/", z);
1987 if (symlink(what, where) < 0) {
1988 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING,
1989 errno, "Failed to correct timezone of container, ignoring: %m");
1990 return 0;
1991 }
1992
1993 break;
1994 }
1995
1996 _fallthrough_;
d4036145 1997 }
68fb0892 1998
1688841f
LP
1999 case TIMEZONE_BIND: {
2000 _cleanup_free_ char *resolved = NULL;
2001 int found;
2002
a5648b80 2003 found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved, NULL);
1688841f
LP
2004 if (found < 0) {
2005 log_warning_errno(found, "Failed to resolve /etc/localtime path in container, ignoring: %m");
2006 return 0;
2007 }
2008
2009 if (found == 0) /* missing? */
2010 (void) touch(resolved);
2011
511a8cfe 2012 r = mount_nofollow_verbose(LOG_WARNING, "/etc/localtime", resolved, NULL, MS_BIND, NULL);
1688841f 2013 if (r >= 0)
511a8cfe 2014 return mount_nofollow_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
1688841f
LP
2015
2016 _fallthrough_;
79d80fc1 2017 }
4d9f07b4 2018
1688841f
LP
2019 case TIMEZONE_COPY:
2020 /* If mounting failed, try to copy */
8a016c74 2021 r = copy_file_atomic("/etc/localtime", where, 0644, 0, 0, COPY_REFLINK|COPY_REPLACE);
1688841f
LP
2022 if (r < 0) {
2023 log_full_errno(IN_SET(r, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
2024 "Failed to copy /etc/localtime to %s, ignoring: %m", where);
2025 return 0;
2026 }
2027
2028 break;
2029
2030 default:
04499a70 2031 assert_not_reached();
d4036145 2032 }
e58a1277 2033
1688841f 2034 /* Fix permissions of the symlink or file copy we just created */
03cfe0d5
LP
2035 r = userns_lchown(where, 0, 0);
2036 if (r < 0)
1688841f 2037 log_warning_errno(r, "Failed to chown /etc/localtime, ignoring: %m");
03cfe0d5 2038
e58a1277 2039 return 0;
88213476
LP
2040}
2041
09d423e9
LP
/* Checks whether path exists: returns 1 if access(F_OK) succeeds, 0 if it fails with ENOENT,
 * and a negative errno (logged at debug level) for any other failure. */
static int have_resolv_conf(const char *path) {
        assert(path);

        if (access(path, F_OK) >= 0)
                return 1;

        if (errno == ENOENT)
                return 0;

        return log_debug_errno(errno, "Failed to determine whether '%s' is available: %m", path);
}
2054
7357272e 2055static int resolved_listening(void) {
b8ea7a6e 2056 _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
b053cd5f 2057 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
7357272e 2058 _cleanup_free_ char *dns_stub_listener_mode = NULL;
b053cd5f
LP
2059 int r;
2060
7357272e 2061 /* Check if resolved is listening */
b053cd5f
LP
2062
2063 r = sd_bus_open_system(&bus);
2064 if (r < 0)
b8ea7a6e 2065 return log_debug_errno(r, "Failed to open system bus: %m");
b053cd5f 2066
7357272e 2067 r = bus_name_has_owner(bus, "org.freedesktop.resolve1", NULL);
b8ea7a6e
LP
2068 if (r < 0)
2069 return log_debug_errno(r, "Failed to check whether the 'org.freedesktop.resolve1' bus name is taken: %m");
2070 if (r == 0)
2071 return 0;
7357272e
DM
2072
2073 r = sd_bus_get_property_string(bus,
2074 "org.freedesktop.resolve1",
2075 "/org/freedesktop/resolve1",
2076 "org.freedesktop.resolve1.Manager",
2077 "DNSStubListener",
b8ea7a6e 2078 &error,
7357272e
DM
2079 &dns_stub_listener_mode);
2080 if (r < 0)
b8ea7a6e 2081 return log_debug_errno(r, "Failed to query DNSStubListener property: %s", bus_error_message(&error, r));
7357272e
DM
2082
2083 return STR_IN_SET(dns_stub_listener_mode, "udp", "yes");
b053cd5f
LP
2084}
2085
2547bb41 2086static int setup_resolv_conf(const char *dest) {
09d423e9
LP
2087 _cleanup_free_ char *etc = NULL;
2088 const char *where, *what;
2089 ResolvConfMode m;
2090 int r;
2547bb41
LP
2091
2092 assert(dest);
2093
09d423e9
LP
2094 if (arg_resolv_conf == RESOLV_CONF_AUTO) {
2095 if (arg_private_network)
2096 m = RESOLV_CONF_OFF;
86775e35
LP
2097 else if (have_resolv_conf(PRIVATE_STUB_RESOLV_CONF) > 0 && resolved_listening() > 0)
2098 m = etc_writable() ? RESOLV_CONF_COPY_STUB : RESOLV_CONF_BIND_STUB;
09d423e9 2099 else if (have_resolv_conf("/etc/resolv.conf") > 0)
83205269 2100 m = etc_writable() ? RESOLV_CONF_COPY_HOST : RESOLV_CONF_BIND_HOST;
09d423e9 2101 else
83205269 2102 m = etc_writable() ? RESOLV_CONF_DELETE : RESOLV_CONF_OFF;
86775e35 2103
09d423e9
LP
2104 } else
2105 m = arg_resolv_conf;
2106
2107 if (m == RESOLV_CONF_OFF)
2547bb41
LP
2108 return 0;
2109
a5648b80 2110 r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
87447ae4
LP
2111 if (r < 0) {
2112 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
2113 return 0;
2114 }
2115
2116 where = strjoina(etc, "/resolv.conf");
09d423e9
LP
2117
2118 if (m == RESOLV_CONF_DELETE) {
2119 if (unlink(where) < 0)
2120 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
2121
87447ae4
LP
2122 return 0;
2123 }
79d80fc1 2124
86775e35
LP
2125 if (IN_SET(m, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_REPLACE_STATIC, RESOLV_CONF_COPY_STATIC))
2126 what = PRIVATE_STATIC_RESOLV_CONF;
2127 else if (IN_SET(m, RESOLV_CONF_BIND_UPLINK, RESOLV_CONF_REPLACE_UPLINK, RESOLV_CONF_COPY_UPLINK))
2128 what = PRIVATE_UPLINK_RESOLV_CONF;
2129 else if (IN_SET(m, RESOLV_CONF_BIND_STUB, RESOLV_CONF_REPLACE_STUB, RESOLV_CONF_COPY_STUB))
2130 what = PRIVATE_STUB_RESOLV_CONF;
09d423e9
LP
2131 else
2132 what = "/etc/resolv.conf";
87447ae4 2133
86775e35 2134 if (IN_SET(m, RESOLV_CONF_BIND_HOST, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_BIND_UPLINK, RESOLV_CONF_BIND_STUB)) {
09d423e9
LP
2135 _cleanup_free_ char *resolved = NULL;
2136 int found;
2137
a5648b80 2138 found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved, NULL);
09d423e9
LP
2139 if (found < 0) {
2140 log_warning_errno(found, "Failed to resolve /etc/resolv.conf path in container, ignoring: %m");
2141 return 0;
2142 }
3539724c 2143
87447ae4
LP
2144 if (found == 0) /* missing? */
2145 (void) touch(resolved);
5367354d 2146
511a8cfe 2147 r = mount_nofollow_verbose(LOG_WARNING, what, resolved, NULL, MS_BIND, NULL);
60e76d48 2148 if (r >= 0)
511a8cfe 2149 return mount_nofollow_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
86775e35
LP
2150
2151 /* If that didn't work, let's copy the file */
3539724c
LP
2152 }
2153
86775e35
LP
2154 if (IN_SET(m, RESOLV_CONF_REPLACE_HOST, RESOLV_CONF_REPLACE_STATIC, RESOLV_CONF_REPLACE_UPLINK, RESOLV_CONF_REPLACE_STUB))
2155 r = copy_file_atomic(what, where, 0644, 0, 0, COPY_REFLINK|COPY_REPLACE);
2156 else
2157 r = copy_file(what, where, O_TRUNC|O_NOFOLLOW, 0644, 0, 0, COPY_REFLINK);
79d80fc1 2158 if (r < 0) {
3539724c
LP
2159 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
2160 * resolved or something similar runs inside and the symlink points there.
68a313c5 2161 *
3539724c 2162 * If the disk image is read-only, there's also no point in complaining.
68a313c5 2163 */
86775e35
LP
2164 log_full_errno(!IN_SET(RESOLV_CONF_COPY_HOST, RESOLV_CONF_COPY_STATIC, RESOLV_CONF_COPY_UPLINK, RESOLV_CONF_COPY_STUB) &&
2165 IN_SET(r, -ELOOP, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
3539724c 2166 "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where);
79d80fc1
TG
2167 return 0;
2168 }
2547bb41 2169
03cfe0d5
LP
2170 r = userns_lchown(where, 0, 0);
2171 if (r < 0)
3539724c 2172 log_warning_errno(r, "Failed to chown /etc/resolv.conf, ignoring: %m");
03cfe0d5 2173
2547bb41
LP
2174 return 0;
2175}
2176
1e4f1671 2177static int setup_boot_id(void) {
cdde6ba6
LP
2178 _cleanup_(unlink_and_freep) char *from = NULL;
2179 _cleanup_free_ char *path = NULL;
3bbaff3e 2180 sd_id128_t rnd = SD_ID128_NULL;
cdde6ba6 2181 const char *to;
04bc4a3f
LP
2182 int r;
2183
1eacc470 2184 /* Generate a new randomized boot ID, so that each boot-up of the container gets a new one */
04bc4a3f 2185
1eacc470 2186 r = tempfn_random_child("/run", "proc-sys-kernel-random-boot-id", &path);
cdde6ba6
LP
2187 if (r < 0)
2188 return log_error_errno(r, "Failed to generate random boot ID path: %m");
04bc4a3f
LP
2189
2190 r = sd_id128_randomize(&rnd);
f647962d
MS
2191 if (r < 0)
2192 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 2193
cdde6ba6 2194 r = id128_write(path, ID128_UUID, rnd, false);
f647962d
MS
2195 if (r < 0)
2196 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f 2197
cdde6ba6
LP
2198 from = TAKE_PTR(path);
2199 to = "/proc/sys/kernel/random/boot_id";
2200
511a8cfe 2201 r = mount_nofollow_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
cdde6ba6
LP
2202 if (r < 0)
2203 return r;
04bc4a3f 2204
511a8cfe 2205 return mount_nofollow_verbose(LOG_ERR, NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
04bc4a3f
LP
2206}
2207
e58a1277 2208static int copy_devnodes(const char *dest) {
88213476
LP
2209 static const char devnodes[] =
2210 "null\0"
2211 "zero\0"
2212 "full\0"
2213 "random\0"
2214 "urandom\0"
85614d66
TG
2215 "tty\0"
2216 "net/tun\0";
88213476
LP
2217
2218 const char *d;
e58a1277 2219 int r = 0;
a258bf26
LP
2220
2221 assert(dest);
124640f1 2222
52f05ef2 2223 BLOCK_WITH_UMASK(0000);
88213476 2224
03cfe0d5
LP
2225 /* Create /dev/net, so that we can create /dev/net/tun in it */
2226 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
2227 return log_error_errno(r, "Failed to create /dev/net directory: %m");
2228
88213476 2229 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 2230 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 2231 struct stat st;
88213476 2232
c6134d3e 2233 from = path_join("/dev/", d);
8967f291
LP
2234 if (!from)
2235 return log_oom();
2236
c6134d3e 2237 to = path_join(dest, from);
8967f291
LP
2238 if (!to)
2239 return log_oom();
88213476
LP
2240
2241 if (stat(from, &st) < 0) {
2242
4a62c710
MS
2243 if (errno != ENOENT)
2244 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 2245
baaa35ad
ZJS
2246 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode))
2247 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2248 "%s is not a char or block device, cannot copy.", from);
2249 else {
8dfce114
LP
2250 _cleanup_free_ char *sl = NULL, *prefixed = NULL, *dn = NULL, *t = NULL;
2251
81f5049b 2252 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
8dbf71ec 2253 /* Explicitly warn the user when /dev is already populated. */
41eb4362 2254 if (errno == EEXIST)
8dbf71ec 2255 log_notice("%s/dev is pre-mounted and pre-populated. If a pre-mounted /dev is provided it needs to be an unpopulated file system.", dest);
81f5049b
AC
2256 if (errno != EPERM)
2257 return log_error_errno(errno, "mknod(%s) failed: %m", to);
2258
8dfce114 2259 /* Some systems abusively restrict mknod but allow bind mounts. */
81f5049b
AC
2260 r = touch(to);
2261 if (r < 0)
2262 return log_error_errno(r, "touch (%s) failed: %m", to);
511a8cfe 2263 r = mount_nofollow_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL);
60e76d48
ZJS
2264 if (r < 0)
2265 return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to);
81f5049b 2266 }
6278cf60 2267
03cfe0d5
LP
2268 r = userns_lchown(to, 0, 0);
2269 if (r < 0)
2270 return log_error_errno(r, "chown() of device node %s failed: %m", to);
8dfce114 2271
657ee2d8 2272 dn = path_join("/dev", S_ISCHR(st.st_mode) ? "char" : "block");
8dfce114
LP
2273 if (!dn)
2274 return log_oom();
2275
2276 r = userns_mkdir(dest, dn, 0755, 0, 0);
2277 if (r < 0)
2278 return log_error_errno(r, "Failed to create '%s': %m", dn);
2279
2280 if (asprintf(&sl, "%s/%u:%u", dn, major(st.st_rdev), minor(st.st_rdev)) < 0)
2281 return log_oom();
2282
c6134d3e 2283 prefixed = path_join(dest, sl);
8dfce114
LP
2284 if (!prefixed)
2285 return log_oom();
2286
2d9b74ba 2287 t = path_join("..", d);
8dfce114
LP
2288 if (!t)
2289 return log_oom();
2290
2291 if (symlink(t, prefixed) < 0)
2292 log_debug_errno(errno, "Failed to symlink '%s' to '%s': %m", t, prefixed);
88213476 2293 }
88213476
LP
2294 }
2295
e58a1277
LP
2296 return r;
2297}
88213476 2298
de40a303 2299static int make_extra_nodes(const char *dest) {
de40a303
LP
2300 size_t i;
2301 int r;
2302
52f05ef2 2303 BLOCK_WITH_UMASK(0000);
de40a303
LP
2304
2305 for (i = 0; i < arg_n_extra_nodes; i++) {
2306 _cleanup_free_ char *path = NULL;
2307 DeviceNode *n = arg_extra_nodes + i;
2308
c6134d3e 2309 path = path_join(dest, n->path);
de40a303
LP
2310 if (!path)
2311 return log_oom();
2312
2313 if (mknod(path, n->mode, S_ISCHR(n->mode) || S_ISBLK(n->mode) ? makedev(n->major, n->minor) : 0) < 0)
2314 return log_error_errno(errno, "Failed to create device node '%s': %m", path);
2315
2316 r = chmod_and_chown(path, n->mode, n->uid, n->gid);
2317 if (r < 0)
2318 return log_error_errno(r, "Failed to adjust device node ownership of '%s': %m", path);
2319 }
2320
2321 return 0;
2322}
2323
03cfe0d5
LP
2324static int setup_pts(const char *dest) {
2325 _cleanup_free_ char *options = NULL;
2326 const char *p;
709f6e46 2327 int r;
03cfe0d5 2328
349cc4a5 2329#if HAVE_SELINUX
03cfe0d5
LP
2330 if (arg_selinux_apifs_context)
2331 (void) asprintf(&options,
3dce8915 2332 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
03cfe0d5
LP
2333 arg_uid_shift + TTY_GID,
2334 arg_selinux_apifs_context);
2335 else
2336#endif
2337 (void) asprintf(&options,
3dce8915 2338 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
03cfe0d5 2339 arg_uid_shift + TTY_GID);
f2d88580 2340
03cfe0d5 2341 if (!options)
f2d88580
LP
2342 return log_oom();
2343
03cfe0d5 2344 /* Mount /dev/pts itself */
cc9fce65 2345 p = prefix_roota(dest, "/dev/pts");
3f692e2e 2346 r = RET_NERRNO(mkdir(p, 0755));
dae8b82e
ZJS
2347 if (r < 0)
2348 return log_error_errno(r, "Failed to create /dev/pts: %m");
2349
511a8cfe 2350 r = mount_nofollow_verbose(LOG_ERR, "devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options);
60e76d48
ZJS
2351 if (r < 0)
2352 return r;
709f6e46
MS
2353 r = userns_lchown(p, 0, 0);
2354 if (r < 0)
2355 return log_error_errno(r, "Failed to chown /dev/pts: %m");
03cfe0d5
LP
2356
2357 /* Create /dev/ptmx symlink */
2358 p = prefix_roota(dest, "/dev/ptmx");
4a62c710
MS
2359 if (symlink("pts/ptmx", p) < 0)
2360 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
709f6e46
MS
2361 r = userns_lchown(p, 0, 0);
2362 if (r < 0)
2363 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
f2d88580 2364
03cfe0d5
LP
2365 /* And fix /dev/pts/ptmx ownership */
2366 p = prefix_roota(dest, "/dev/pts/ptmx");
709f6e46
MS
2367 r = userns_lchown(p, 0, 0);
2368 if (r < 0)
2369 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
6278cf60 2370
f2d88580
LP
2371 return 0;
2372}
2373
3acc84eb 2374static int setup_stdio_as_dev_console(void) {
2fef50cd 2375 _cleanup_close_ int terminal = -1;
e58a1277 2376 int r;
e58a1277 2377
335d2ead
LP
2378 /* We open the TTY in O_NOCTTY mode, so that we do not become controller yet. We'll do that later
2379 * explicitly, if we are configured to. */
2380 terminal = open_terminal("/dev/console", O_RDWR|O_NOCTTY);
3acc84eb
FB
2381 if (terminal < 0)
2382 return log_error_errno(terminal, "Failed to open console: %m");
e58a1277 2383
3acc84eb
FB
2384 /* Make sure we can continue logging to the original stderr, even if
2385 * stderr points elsewhere now */
2386 r = log_dup_console();
2387 if (r < 0)
2388 return log_error_errno(r, "Failed to duplicate stderr: %m");
de40a303 2389
3acc84eb
FB
2390 /* invalidates 'terminal' on success and failure */
2391 r = rearrange_stdio(terminal, terminal, terminal);
2fef50cd 2392 TAKE_FD(terminal);
f647962d 2393 if (r < 0)
3acc84eb
FB
2394 return log_error_errno(r, "Failed to move console to stdin/stdout/stderr: %m");
2395
2396 return 0;
2397}
88213476 2398
3acc84eb
FB
2399static int setup_dev_console(const char *console) {
2400 _cleanup_free_ char *p = NULL;
2401 int r;
a258bf26 2402
3acc84eb
FB
2403 /* Create /dev/console symlink */
2404 r = path_make_relative("/dev", console, &p);
81f5049b 2405 if (r < 0)
3acc84eb
FB
2406 return log_error_errno(r, "Failed to create relative path: %m");
2407
2408 if (symlink(p, "/dev/console") < 0)
2409 return log_error_errno(errno, "Failed to create /dev/console symlink: %m");
a258bf26 2410
3acc84eb 2411 return 0;
e58a1277
LP
2412}
2413
8e5430c4
LP
2414static int setup_keyring(void) {
2415 key_serial_t keyring;
2416
6b000af4
LP
2417 /* Allocate a new session keyring for the container. This makes sure the keyring of the session
2418 * systemd-nspawn was invoked from doesn't leak into the container. Note that by default we block
2419 * keyctl() and request_key() anyway via seccomp so doing this operation isn't strictly necessary,
2420 * but in case people explicitly allow-list these system calls let's make sure we don't leak anything
2421 * into the container. */
8e5430c4
LP
2422
2423 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2424 if (keyring == -1) {
2425 if (errno == ENOSYS)
2426 log_debug_errno(errno, "Kernel keyring not supported, ignoring.");
065b4774 2427 else if (ERRNO_IS_PRIVILEGE(errno))
8e5430c4
LP
2428 log_debug_errno(errno, "Kernel keyring access prohibited, ignoring.");
2429 else
2430 return log_error_errno(errno, "Setting up kernel keyring failed: %m");
2431 }
2432
2433 return 0;
2434}
2435
3652872a
LP
2436static int setup_credentials(const char *root) {
2437 const char *q;
2438 int r;
2439
2440 if (arg_n_credentials <= 0)
2441 return 0;
2442
2443 r = userns_mkdir(root, "/run/host", 0755, 0, 0);
2444 if (r < 0)
2445 return log_error_errno(r, "Failed to create /run/host: %m");
2446
2447 r = userns_mkdir(root, "/run/host/credentials", 0700, 0, 0);
2448 if (r < 0)
2449 return log_error_errno(r, "Failed to create /run/host/credentials: %m");
2450
2451 q = prefix_roota(root, "/run/host/credentials");
511a8cfe 2452 r = mount_nofollow_verbose(LOG_ERR, NULL, q, "ramfs", MS_NOSUID|MS_NOEXEC|MS_NODEV, "mode=0700");
3652872a
LP
2453 if (r < 0)
2454 return r;
2455
2456 for (size_t i = 0; i < arg_n_credentials; i++) {
2457 _cleanup_free_ char *j = NULL;
2458 _cleanup_close_ int fd = -1;
2459
2460 j = path_join(q, arg_credentials[i].id);
2461 if (!j)
2462 return log_oom();
2463
2464 fd = open(j, O_CREAT|O_EXCL|O_WRONLY|O_CLOEXEC|O_NOFOLLOW, 0600);
2465 if (fd < 0)
2466 return log_error_errno(errno, "Failed to create credential file %s: %m", j);
2467
2468 r = loop_write(fd, arg_credentials[i].data, arg_credentials[i].size, /* do_poll= */ false);
2469 if (r < 0)
2470 return log_error_errno(r, "Failed to write credential to file %s: %m", j);
2471
2472 if (fchmod(fd, 0400) < 0)
2473 return log_error_errno(errno, "Failed to adjust access mode of %s: %m", j);
2474
2475 if (arg_userns_mode != USER_NAMESPACE_NO) {
2476 if (fchown(fd, arg_uid_shift, arg_uid_shift) < 0)
2477 return log_error_errno(errno, "Failed to adjust ownership of %s: %m", j);
2478 }
2479 }
2480
2481 if (chmod(q, 0500) < 0)
2482 return log_error_errno(errno, "Failed to adjust access mode of %s: %m", q);
2483
2484 r = userns_lchown(q, 0, 0);
2485 if (r < 0)
2486 return r;
2487
2488 /* Make both mount and superblock read-only now */
511a8cfe 2489 r = mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_REMOUNT|MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
3652872a
LP
2490 if (r < 0)
2491 return r;
2492
511a8cfe 2493 return mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, "mode=0500");
3652872a
LP
2494}
2495
1e4f1671 2496static int setup_kmsg(int kmsg_socket) {
9ec5a93c
LP
2497 _cleanup_(unlink_and_freep) char *from = NULL;
2498 _cleanup_free_ char *fifo = NULL;
2499 _cleanup_close_ int fd = -1;
9ec5a93c 2500 int r;
e58a1277 2501
e58a1277 2502 assert(kmsg_socket >= 0);
a258bf26 2503
52f05ef2 2504 BLOCK_WITH_UMASK(0000);
a258bf26 2505
1eacc470 2506 /* We create the kmsg FIFO as as temporary file in /run, but immediately delete it after bind mounting it to
9ec5a93c
LP
2507 * /proc/kmsg. While FIFOs on the reading side behave very similar to /proc/kmsg, their writing side behaves
2508 * differently from /dev/kmsg in that writing blocks when nothing is reading. In order to avoid any problems
2509 * with containers deadlocking due to this we simply make /dev/kmsg unavailable to the container. */
2510
1eacc470 2511 r = tempfn_random_child("/run", "proc-kmsg", &fifo);
9ec5a93c
LP
2512 if (r < 0)
2513 return log_error_errno(r, "Failed to generate kmsg path: %m");
e58a1277 2514
9ec5a93c 2515 if (mkfifo(fifo, 0600) < 0)
03cfe0d5 2516 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
9ec5a93c
LP
2517
2518 from = TAKE_PTR(fifo);
9ec5a93c 2519
511a8cfe 2520 r = mount_nofollow_verbose(LOG_ERR, from, "/proc/kmsg", NULL, MS_BIND, NULL);
60e76d48
ZJS
2521 if (r < 0)
2522 return r;
e58a1277 2523
669fc4e5 2524 fd = open(from, O_RDWR|O_NONBLOCK|O_CLOEXEC);
4a62c710
MS
2525 if (fd < 0)
2526 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 2527
9ec5a93c 2528 /* Store away the fd in the socket, so that it stays open as long as we run the child */
3ee897d6 2529 r = send_one_fd(kmsg_socket, fd, 0);
d9603714
DH
2530 if (r < 0)
2531 return log_error_errno(r, "Failed to send FIFO fd: %m");
a258bf26 2532
25ea79fe 2533 return 0;
88213476
LP
2534}
2535
761cf19d 2536struct ExposeArgs {
deff68e7
FW
2537 union in_addr_union address4;
2538 union in_addr_union address6;
761cf19d
FW
2539 struct FirewallContext *fw_ctx;
2540};
2541
1c4baffc 2542static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
761cf19d 2543 struct ExposeArgs *args = userdata;
6d0b55c2
LP
2544
2545 assert(rtnl);
2546 assert(m);
761cf19d 2547 assert(args);
6d0b55c2 2548
fb9044cb
LP
2549 (void) expose_port_execute(rtnl, &args->fw_ctx, arg_expose_ports, AF_INET, &args->address4);
2550 (void) expose_port_execute(rtnl, &args->fw_ctx, arg_expose_ports, AF_INET6, &args->address6);
6d0b55c2
LP
2551 return 0;
2552}
2553
3a74cea5 2554static int setup_hostname(void) {
c818eef1 2555 int r;
3a74cea5 2556
0c582db0 2557 if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
eb91eb18
LP
2558 return 0;
2559
c818eef1
LP
2560 r = sethostname_idempotent(arg_hostname ?: arg_machine);
2561 if (r < 0)
2562 return log_error_errno(r, "Failed to set hostname: %m");
3a74cea5 2563
7027ff61 2564 return 0;
3a74cea5
LP
2565}
2566
57fb9fb5 2567static int setup_journal(const char *directory) {
0f5e1382 2568 _cleanup_free_ char *d = NULL;
5980d463 2569 const char *p, *q;
b2238e38 2570 sd_id128_t this_id;
8054d749 2571 bool try;
57fb9fb5
LP
2572 int r;
2573
df9a75e4
LP
2574 /* Don't link journals in ephemeral mode */
2575 if (arg_ephemeral)
2576 return 0;
2577
8054d749
LP
2578 if (arg_link_journal == LINK_NO)
2579 return 0;
2580
2581 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
2582
4d680aee 2583 r = sd_id128_get_machine(&this_id);
f647962d
MS
2584 if (r < 0)
2585 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee 2586
e01ff70a 2587 if (sd_id128_equal(arg_uuid, this_id)) {
8054d749 2588 log_full(try ? LOG_WARNING : LOG_ERR,
85b55869 2589 "Host and machine ids are equal (%s): refusing to link journals", SD_ID128_TO_STRING(arg_uuid));
8054d749 2590 if (try)
4d680aee 2591 return 0;
df9a75e4 2592 return -EEXIST;
4d680aee
ZJS
2593 }
2594
369ca6da
ZJS
2595 FOREACH_STRING(dirname, "/var", "/var/log", "/var/log/journal") {
2596 r = userns_mkdir(directory, dirname, 0755, 0, 0);
2597 if (r < 0) {
2598 bool ignore = r == -EROFS && try;
2599 log_full_errno(ignore ? LOG_DEBUG : LOG_ERR, r,
2600 "Failed to create %s%s: %m", dirname, ignore ? ", ignoring" : "");
2601 return ignore ? 0 : r;
2602 }
2603 }
03cfe0d5 2604
85b55869 2605 p = strjoina("/var/log/journal/", SD_ID128_TO_STRING(arg_uuid));
03cfe0d5 2606 q = prefix_roota(directory, p);
27407a01 2607
e1873695 2608 if (path_is_mount_point(p, NULL, 0) > 0) {
8054d749
LP
2609 if (try)
2610 return 0;
27407a01 2611
baaa35ad
ZJS
2612 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2613 "%s: already a mount point, refusing to use for journal", p);
57fb9fb5
LP
2614 }
2615
e1873695 2616 if (path_is_mount_point(q, NULL, 0) > 0) {
8054d749
LP
2617 if (try)
2618 return 0;
57fb9fb5 2619
baaa35ad
ZJS
2620 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2621 "%s: already a mount point, refusing to use for journal", q);
57fb9fb5
LP
2622 }
2623
2624 r = readlink_and_make_absolute(p, &d);
2625 if (r >= 0) {
3742095b 2626 if (IN_SET(arg_link_journal, LINK_GUEST, LINK_AUTO) &&
57fb9fb5
LP
2627 path_equal(d, q)) {
2628
03cfe0d5 2629 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 2630 if (r < 0)
709f6e46 2631 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 2632 return 0;
57fb9fb5
LP
2633 }
2634
4a62c710
MS
2635 if (unlink(p) < 0)
2636 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
2637 } else if (r == -EINVAL) {
2638
2639 if (arg_link_journal == LINK_GUEST &&
2640 rmdir(p) < 0) {
2641
27407a01
ZJS
2642 if (errno == ENOTDIR) {
2643 log_error("%s already exists and is neither a symlink nor a directory", p);
2644 return r;
4314d33f
MS
2645 } else
2646 return log_error_errno(errno, "Failed to remove %s: %m", p);
57fb9fb5 2647 }
4314d33f
MS
2648 } else if (r != -ENOENT)
2649 return log_error_errno(r, "readlink(%s) failed: %m", p);
57fb9fb5
LP
2650
2651 if (arg_link_journal == LINK_GUEST) {
2652
2653 if (symlink(q, p) < 0) {
8054d749 2654 if (try) {
56f64d95 2655 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90 2656 return 0;
4314d33f
MS
2657 } else
2658 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
57fb9fb5
LP
2659 }
2660
03cfe0d5 2661 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 2662 if (r < 0)
709f6e46 2663 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 2664 return 0;
57fb9fb5
LP
2665 }
2666
2667 if (arg_link_journal == LINK_HOST) {
ccddd104 2668 /* don't create parents here — if the host doesn't have
574edc90 2669 * permanent journal set up, don't force it here */
ba8e6c4d 2670
3f692e2e 2671 r = RET_NERRNO(mkdir(p, 0755));
dae8b82e 2672 if (r < 0 && r != -EEXIST) {
8054d749 2673 if (try) {
dae8b82e 2674 log_debug_errno(r, "Failed to create %s, skipping journal setup: %m", p);
574edc90 2675 return 0;
4314d33f 2676 } else
dae8b82e 2677 return log_error_errno(r, "Failed to create %s: %m", p);
57fb9fb5
LP
2678 }
2679
27407a01
ZJS
2680 } else if (access(p, F_OK) < 0)
2681 return 0;
57fb9fb5 2682
db55bbf2 2683 if (dir_is_empty(q, /* ignore_hidden_or_backup= */ false) == 0)
cdb2b9d0
LP
2684 log_warning("%s is not empty, proceeding anyway.", q);
2685
03cfe0d5 2686 r = userns_mkdir(directory, p, 0755, 0, 0);
709f6e46
MS
2687 if (r < 0)
2688 return log_error_errno(r, "Failed to create %s: %m", q);
57fb9fb5 2689
511a8cfe 2690 r = mount_nofollow_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL);
60e76d48 2691 if (r < 0)
4a62c710 2692 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 2693
27407a01 2694 return 0;
57fb9fb5
LP
2695}
2696
de40a303
LP
2697static int drop_capabilities(uid_t uid) {
2698 CapabilityQuintet q;
2699
2700 /* Let's initialize all five capability sets to something valid. If the quintet was configured via
2701 * OCI use that, but fill in missing bits. If it wasn't then derive the quintet in full from
2702 * arg_caps_retain. */
2703
2704 if (capability_quintet_is_set(&arg_full_capabilities)) {
2705 q = arg_full_capabilities;
2706
f5fbe71d 2707 if (q.bounding == UINT64_MAX)
de40a303
LP
2708 q.bounding = uid == 0 ? arg_caps_retain : 0;
2709
f5fbe71d 2710 if (q.effective == UINT64_MAX)
de40a303
LP
2711 q.effective = uid == 0 ? q.bounding : 0;
2712
f5fbe71d 2713 if (q.inheritable == UINT64_MAX)
88fc9c9b 2714 q.inheritable = uid == 0 ? q.bounding : arg_caps_ambient;
de40a303 2715
f5fbe71d 2716 if (q.permitted == UINT64_MAX)
88fc9c9b 2717 q.permitted = uid == 0 ? q.bounding : arg_caps_ambient;
de40a303 2718
f5fbe71d 2719 if (q.ambient == UINT64_MAX && ambient_capabilities_supported())
88fc9c9b 2720 q.ambient = arg_caps_ambient;
f66ad460
AZ
2721
2722 if (capability_quintet_mangle(&q))
2723 return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Cannot set capabilities that are not in the current bounding set.");
2724
2725 } else {
de40a303
LP
2726 q = (CapabilityQuintet) {
2727 .bounding = arg_caps_retain,
2728 .effective = uid == 0 ? arg_caps_retain : 0,
88fc9c9b
TH
2729 .inheritable = uid == 0 ? arg_caps_retain : arg_caps_ambient,
2730 .permitted = uid == 0 ? arg_caps_retain : arg_caps_ambient,
f5fbe71d 2731 .ambient = ambient_capabilities_supported() ? arg_caps_ambient : UINT64_MAX,
de40a303
LP
2732 };
2733
f66ad460
AZ
2734 /* If we're not using OCI, proceed with mangled capabilities (so we don't error out)
2735 * in order to maintain the same behavior as systemd < 242. */
2736 if (capability_quintet_mangle(&q))
0ccdaa79
JT
2737 log_full(arg_quiet ? LOG_DEBUG : LOG_WARNING,
2738 "Some capabilities will not be set because they are not in the current bounding set.");
f66ad460
AZ
2739
2740 }
2741
de40a303 2742 return capability_quintet_enforce(&q);
88213476
LP
2743}
2744
db999e0f
LP
2745static int reset_audit_loginuid(void) {
2746 _cleanup_free_ char *p = NULL;
2747 int r;
2748
0c582db0 2749 if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
db999e0f
LP
2750 return 0;
2751
2752 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 2753 if (r == -ENOENT)
db999e0f 2754 return 0;
f647962d
MS
2755 if (r < 0)
2756 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
2757
2758 /* Already reset? */
2759 if (streq(p, "4294967295"))
2760 return 0;
2761
57512c89 2762 r = write_string_file("/proc/self/loginuid", "4294967295", WRITE_STRING_FILE_DISABLE_BUFFER);
db999e0f 2763 if (r < 0) {
10a87006
LP
2764 log_error_errno(r,
2765 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2766 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2767 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2768 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2769 "using systemd-nspawn. Sleeping for 5s... (%m)");
77b6e194 2770
db999e0f 2771 sleep(5);
77b6e194 2772 }
db999e0f
LP
2773
2774 return 0;
77b6e194
LP
2775}
2776
785890ac
LP
2777static int setup_propagate(const char *root) {
2778 const char *p, *q;
709f6e46 2779 int r;
785890ac
LP
2780
2781 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2782 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
63c372cb 2783 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
2784 (void) mkdir_p(p, 0600);
2785
5a27b395 2786 r = userns_mkdir(root, "/run/host", 0755, 0, 0);
709f6e46 2787 if (r < 0)
5a27b395 2788 return log_error_errno(r, "Failed to create /run/host: %m");
03cfe0d5 2789
5a27b395 2790 r = userns_mkdir(root, "/run/host/incoming", 0600, 0, 0);
709f6e46 2791 if (r < 0)
5a27b395 2792 return log_error_errno(r, "Failed to create /run/host/incoming: %m");
03cfe0d5 2793
5a27b395 2794 q = prefix_roota(root, "/run/host/incoming");
511a8cfe 2795 r = mount_nofollow_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
60e76d48
ZJS
2796 if (r < 0)
2797 return r;
785890ac 2798
511a8cfe 2799 r = mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
60e76d48
ZJS
2800 if (r < 0)
2801 return r;
785890ac 2802
5a27b395 2803 /* machined will MS_MOVE into that directory, and that's only supported for non-shared mounts. */
511a8cfe 2804 return mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_SLAVE, NULL);
785890ac
LP
2805}
2806
317feb4d 2807static int setup_machine_id(const char *directory) {
691675ba
LP
2808 const char *etc_machine_id;
2809 sd_id128_t id;
3bbaff3e 2810 int r;
e01ff70a 2811
317feb4d
LP
2812 /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
2813 * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
2814 * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
2815 * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
2816 * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
2817 * container behaves nicely). */
2818
e01ff70a
MS
2819 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
2820
c5fbeedb 2821 r = id128_read(etc_machine_id, ID128_PLAIN_OR_UNINIT, &id);
317feb4d
LP
2822 if (r < 0) {
2823 if (!IN_SET(r, -ENOENT, -ENOMEDIUM)) /* If the file is missing or empty, we don't mind */
2824 return log_error_errno(r, "Failed to read machine ID from container image: %m");
691675ba 2825
317feb4d
LP
2826 if (sd_id128_is_null(arg_uuid)) {
2827 r = sd_id128_randomize(&arg_uuid);
2828 if (r < 0)
2829 return log_error_errno(r, "Failed to acquire randomized machine UUID: %m");
2830 }
2831 } else {
baaa35ad
ZJS
2832 if (sd_id128_is_null(id))
2833 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2834 "Machine ID in container image is zero, refusing.");
e01ff70a 2835
317feb4d
LP
2836 arg_uuid = id;
2837 }
691675ba 2838
e01ff70a
MS
2839 return 0;
2840}
2841
7336138e
LP
2842static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
2843 int r;
2844
2845 assert(directory);
2846
6c045a99 2847 if (arg_userns_mode == USER_NAMESPACE_NO || arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_CHOWN)
7336138e
LP
2848 return 0;
2849
2850 r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
2851 if (r == -EOPNOTSUPP)
2852 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
2853 if (r == -EBADE)
2854 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
2855 if (r < 0)
2856 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
2857 if (r == 0)
2858 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
2859 else
2860 log_debug("Patched directory tree to match UID/GID range.");
2861
2862 return r;
2863}
2864
113cea80 2865/*
6d416b9c
LS
2866 * Return values:
2867 * < 0 : wait_for_terminate() failed to get the state of the
2868 * container, the container was terminated by a signal, or
2869 * failed for an unknown reason. No change is made to the
2870 * container argument.
2871 * > 0 : The program executed in the container terminated with an
2872 * error. The exit code of the program executed in the
919699ec
LP
2873 * container is returned. The container argument has been set
2874 * to CONTAINER_TERMINATED.
6d416b9c
LS
2875 * 0 : The container is being rebooted, has been shut down or exited
2876 * successfully. The container argument has been set to either
2877 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 2878 *
6d416b9c
LS
2879 * That is, success is indicated by a return value of zero, and an
2880 * error is indicated by a non-zero value.
113cea80
DH
2881 */
2882static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 2883 siginfo_t status;
919699ec 2884 int r;
113cea80
DH
2885
2886 r = wait_for_terminate(pid, &status);
f647962d
MS
2887 if (r < 0)
2888 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
2889
2890 switch (status.si_code) {
fddbb89c 2891
113cea80 2892 case CLD_EXITED:
b5a2179b 2893 if (status.si_status == 0)
919699ec 2894 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
b5a2179b 2895 else
919699ec 2896 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 2897
919699ec
LP
2898 *container = CONTAINER_TERMINATED;
2899 return status.si_status;
113cea80
DH
2900
2901 case CLD_KILLED:
2902 if (status.si_status == SIGINT) {
919699ec 2903 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 2904 *container = CONTAINER_TERMINATED;
919699ec
LP
2905 return 0;
2906
113cea80 2907 } else if (status.si_status == SIGHUP) {
919699ec 2908 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 2909 *container = CONTAINER_REBOOTED;
919699ec 2910 return 0;
113cea80 2911 }
919699ec 2912
4831981d 2913 _fallthrough_;
113cea80 2914 case CLD_DUMPED:
baaa35ad
ZJS
2915 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2916 "Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
113cea80
DH
2917
2918 default:
baaa35ad
ZJS
2919 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2920 "Container %s failed due to unknown reason.", arg_machine);
113cea80 2921 }
113cea80
DH
2922}
2923
023fb90b
LP
2924static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2925 pid_t pid;
2926
4a0b58c4 2927 pid = PTR_TO_PID(userdata);
023fb90b 2928 if (pid > 0) {
c6c8f6e2 2929 if (kill(pid, arg_kill_signal) >= 0) {
023fb90b
LP
2930 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2931 sd_event_source_set_userdata(s, NULL);
2932 return 0;
2933 }
2934 }
2935
2936 sd_event_exit(sd_event_source_get_event(s), 0);
2937 return 0;
2938}
2939
6916b164 2940static int on_sigchld(sd_event_source *s, const struct signalfd_siginfo *ssi, void *userdata) {
abdb9b08
LP
2941 pid_t pid;
2942
2943 assert(s);
2944 assert(ssi);
2945
2946 pid = PTR_TO_PID(userdata);
2947
6916b164
AU
2948 for (;;) {
2949 siginfo_t si = {};
abdb9b08 2950
6916b164
AU
2951 if (waitid(P_ALL, 0, &si, WNOHANG|WNOWAIT|WEXITED) < 0)
2952 return log_error_errno(errno, "Failed to waitid(): %m");
2953 if (si.si_pid == 0) /* No pending children. */
2954 break;
abdb9b08 2955 if (si.si_pid == pid) {
6916b164
AU
2956 /* The main process we care for has exited. Return from
2957 * signal handler but leave the zombie. */
2958 sd_event_exit(sd_event_source_get_event(s), 0);
2959 break;
2960 }
abdb9b08 2961
6916b164
AU
2962 /* Reap all other children. */
2963 (void) waitid(P_PID, si.si_pid, &si, WNOHANG|WEXITED);
2964 }
2965
2966 return 0;
2967}
2968
abdb9b08
LP
2969static int on_request_stop(sd_bus_message *m, void *userdata, sd_bus_error *error) {
2970 pid_t pid;
2971
2972 assert(m);
2973
2974 pid = PTR_TO_PID(userdata);
2975
2976 if (arg_kill_signal > 0) {
2977 log_info("Container termination requested. Attempting to halt container.");
2978 (void) kill(pid, arg_kill_signal);
2979 } else {
2980 log_info("Container termination requested. Exiting.");
2981 sd_event_exit(sd_bus_get_event(sd_bus_message_get_bus(m)), 0);
2982 }
2983
2984 return 0;
2985}
2986
ec16945e 2987static int determine_names(void) {
1b9cebf6 2988 int r;
ec16945e 2989
c1521918
LP
2990 if (arg_template && !arg_directory && arg_machine) {
2991
2992 /* If --template= was specified then we should not
2993 * search for a machine, but instead create a new one
2994 * in /var/lib/machine. */
2995
657ee2d8 2996 arg_directory = path_join("/var/lib/machines", arg_machine);
c1521918
LP
2997 if (!arg_directory)
2998 return log_oom();
2999 }
3000
ec16945e 3001 if (!arg_image && !arg_directory) {
1b9cebf6
LP
3002 if (arg_machine) {
3003 _cleanup_(image_unrefp) Image *i = NULL;
3004
d577d4a4 3005 r = image_find(IMAGE_MACHINE, arg_machine, NULL, &i);
3a6ce860
LP
3006 if (r == -ENOENT)
3007 return log_error_errno(r, "No image for machine '%s'.", arg_machine);
1b9cebf6
LP
3008 if (r < 0)
3009 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
1b9cebf6 3010
eb38edce 3011 if (IN_SET(i->type, IMAGE_RAW, IMAGE_BLOCK))
0f03c2a4 3012 r = free_and_strdup(&arg_image, i->path);
1b9cebf6 3013 else
0f03c2a4 3014 r = free_and_strdup(&arg_directory, i->path);
1b9cebf6 3015 if (r < 0)
0f3be6ca 3016 return log_oom();
1b9cebf6 3017
aee327b8
LP
3018 if (!arg_ephemeral)
3019 arg_read_only = arg_read_only || i->read_only;
d7249575
LP
3020 } else {
3021 r = safe_getcwd(&arg_directory);
3022 if (r < 0)
3023 return log_error_errno(r, "Failed to determine current directory: %m");
3024 }
ec16945e 3025
c6147113
LP
3026 if (!arg_directory && !arg_image)
3027 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine path, please use -D or -i.");
ec16945e
LP
3028 }
3029
3030 if (!arg_machine) {
b9ba4dab
LP
3031 if (arg_directory && path_equal(arg_directory, "/"))
3032 arg_machine = gethostname_malloc();
e9b88a6d
LP
3033 else if (arg_image) {
3034 char *e;
4827ab48 3035
e9b88a6d 3036 arg_machine = strdup(basename(arg_image));
4827ab48 3037
e9b88a6d
LP
3038 /* Truncate suffix if there is one */
3039 e = endswith(arg_machine, ".raw");
3040 if (e)
3041 *e = 0;
3042 } else
3043 arg_machine = strdup(basename(arg_directory));
ec16945e
LP
3044 if (!arg_machine)
3045 return log_oom();
3046
ae691c1d 3047 hostname_cleanup(arg_machine);
52ef5dd7 3048 if (!hostname_is_valid(arg_machine, 0))
c6147113 3049 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine machine name automatically, please use -M.");
b9ba4dab 3050
3603f151
LB
3051 /* Copy the machine name before the random suffix is added below, otherwise we won't be able
3052 * to match fixed config file names. */
3053 arg_settings_filename = strjoin(arg_machine, ".nspawn");
3054 if (!arg_settings_filename)
3055 return log_oom();
3056
e9b88a6d
LP
3057 /* Add a random suffix when this is an ephemeral machine, so that we can run many
3058 * instances at once without manually having to specify -M each time. */
3059 if (arg_ephemeral)
3060 if (strextendf(&arg_machine, "-%016" PRIx64, random_u64()) < 0)
b9ba4dab 3061 return log_oom();
3603f151
LB
3062 } else {
3063 arg_settings_filename = strjoin(arg_machine, ".nspawn");
3064 if (!arg_settings_filename)
3065 return log_oom();
ec16945e
LP
3066 }
3067
3068 return 0;
3069}
3070
/* Resolves the path stored in *p via chase_symlinks() (with the given flags) and replaces *p with
 * the result. A NULL *p is left untouched.
 *
 * Returns 0 if *p is NULL, the result of free_and_replace() on success, and a negative
 * errno-style value if resolving fails. */
static int chase_symlinks_and_update(char **p, unsigned flags) {
        char *resolved;
        int r;

        assert(p);

        if (*p == NULL)
                return 0;

        r = chase_symlinks(*p, NULL, flags, &resolved, NULL);
        if (r < 0)
                return log_error_errno(r, "Failed to resolve path %s: %m", *p);

        /* Frees the old string and takes over the resolved one. */
        return free_and_replace(*p, resolved);
}
3086
03cfe0d5 3087static int determine_uid_shift(const char *directory) {
6dac160c 3088
0de7acce 3089 if (arg_userns_mode == USER_NAMESPACE_NO) {
03cfe0d5 3090 arg_uid_shift = 0;
6dac160c 3091 return 0;
03cfe0d5 3092 }
6dac160c
LP
3093
3094 if (arg_uid_shift == UID_INVALID) {
3095 struct stat st;
3096
993da6d4
LP
3097 /* Read the UID shift off the image. Maybe we can reuse this to avoid chowning. */
3098
3099 if (stat(directory, &st) < 0)
03cfe0d5 3100 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
6dac160c
LP
3101
3102 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
3103
baaa35ad
ZJS
3104 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000)))
3105 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
3106 "UID and GID base of %s don't match.", directory);
6dac160c
LP
3107
3108 arg_uid_range = UINT32_C(0x10000);
f61c7f88
LP
3109
3110 if (arg_uid_shift != 0) {
3111 /* If the image is shifted already, then we'll fall back to classic chowning, for
3112 * compatibility (and simplicity), or refuse if mapping is explicitly requested. */
3113
3114 if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_AUTO) {
3115 log_debug("UID base of %s is non-zero, not using UID mapping.", directory);
3116 arg_userns_ownership = USER_NAMESPACE_OWNERSHIP_CHOWN;
3117 } else if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_MAP)
3118 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
3119 "UID base of %s is not zero, UID mapping not supported.", directory);
3120 }
6dac160c
LP
3121 }
3122
58e13de5
LP
3123 if (!userns_shift_range_valid(arg_uid_shift, arg_uid_range))
3124 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "UID base too high for UID range.");
6dac160c 3125
6dac160c
LP
3126 return 0;
3127}
3128
de40a303
LP
3129static unsigned long effective_clone_ns_flags(void) {
3130 unsigned long flags = arg_clone_ns_flags;
3131
3132 if (arg_private_network)
3133 flags |= CLONE_NEWNET;
3134 if (arg_use_cgns)
3135 flags |= CLONE_NEWCGROUP;
3136 if (arg_userns_mode != USER_NAMESPACE_NO)
3137 flags |= CLONE_NEWUSER;
3138
3139 return flags;
3140}
3141
3142static int patch_sysctl(void) {
3143
3144 /* This table is inspired by runc's sysctl() function */
3145 static const struct {
3146 const char *key;
3147 bool prefix;
3148 unsigned long clone_flags;
3149 } safe_sysctl[] = {
3150 { "kernel.hostname", false, CLONE_NEWUTS },
3151 { "kernel.domainname", false, CLONE_NEWUTS },
3152 { "kernel.msgmax", false, CLONE_NEWIPC },
3153 { "kernel.msgmnb", false, CLONE_NEWIPC },
3154 { "kernel.msgmni", false, CLONE_NEWIPC },
3155 { "kernel.sem", false, CLONE_NEWIPC },
3156 { "kernel.shmall", false, CLONE_NEWIPC },
3157 { "kernel.shmmax", false, CLONE_NEWIPC },
3158 { "kernel.shmmni", false, CLONE_NEWIPC },
3159 { "fs.mqueue.", true, CLONE_NEWIPC },
3160 { "net.", true, CLONE_NEWNET },
3161 };
3162
3163 unsigned long flags;
de40a303
LP
3164 int r;
3165
3166 flags = effective_clone_ns_flags();
3167
3168 STRV_FOREACH_PAIR(k, v, arg_sysctl) {
3169 bool good = false;
3170 size_t i;
3171
3172 for (i = 0; i < ELEMENTSOF(safe_sysctl); i++) {
3173
3174 if (!FLAGS_SET(flags, safe_sysctl[i].clone_flags))
3175 continue;
3176
3177 if (safe_sysctl[i].prefix)
3178 good = startswith(*k, safe_sysctl[i].key);
3179 else
3180 good = streq(*k, safe_sysctl[i].key);
3181
3182 if (good)
3183 break;
3184 }
3185
c6147113
LP
3186 if (!good)
3187 return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Refusing to write to sysctl '%s', as it is not safe in the selected namespaces.", *k);
de40a303
LP
3188
3189 r = sysctl_write(*k, *v);
3190 if (r < 0)
3191 return log_error_errno(r, "Failed to write sysctl '%s': %m", *k);
3192 }
3193
3194 return 0;
3195}
3196
03cfe0d5
LP
3197static int inner_child(
3198 Barrier *barrier,
3199 const char *directory,
3200 bool secondary,
3201 int kmsg_socket,
3202 int rtnl_socket,
3acc84eb 3203 int master_pty_socket,
e1bb4b0d
LB
3204 FDSet *fds,
3205 char **os_release_pairs) {
69c79d3c 3206
03cfe0d5 3207 _cleanup_free_ char *home = NULL;
88614c8a 3208 size_t n_env = 1;
4ab3d29f
ZJS
3209 char *envp[] = {
3210 (char*) "PATH=" DEFAULT_PATH_COMPAT,
6aadfa4c 3211 NULL, /* container */
03cfe0d5
LP
3212 NULL, /* TERM */
3213 NULL, /* HOME */
3214 NULL, /* USER */
3215 NULL, /* LOGNAME */
3216 NULL, /* container_uuid */
3217 NULL, /* LISTEN_FDS */
3218 NULL, /* LISTEN_PID */
9c1e04d0 3219 NULL, /* NOTIFY_SOCKET */
3652872a 3220 NULL, /* CREDENTIALS_DIRECTORY */
b626f695 3221 NULL, /* LANG */
03cfe0d5
LP
3222 NULL
3223 };
1a68e1e5 3224 const char *exec_target;
2371271c 3225 _cleanup_strv_free_ char **env_use = NULL;
de40a303 3226 int r, which_failed;
88213476 3227
b37469d7
LP
3228 /* This is the "inner" child process, i.e. the one forked off by the "outer" child process, which is the one
3229 * the container manager itself forked off. At the time of clone() it gained its own CLONE_NEWNS, CLONE_NEWPID,
3230 * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER namespaces. Note that it has its own CLONE_NEWNS namespace,
3231 * separate from the CLONE_NEWNS created for the "outer" child, and also separate from the host's CLONE_NEWNS
3232 * namespace. The reason for having two levels of CLONE_NEWNS namespaces is that the "inner" one is owned by
3233 * the CLONE_NEWUSER namespace of the container, while the "outer" one is owned by the host's CLONE_NEWUSER
3234 * namespace.
3235 *
3236 * Note at this point we have no CLONE_NEWNET namespace yet. We'll acquire that one later through
3237 * unshare(). See below. */
3238
03cfe0d5
LP
3239 assert(barrier);
3240 assert(directory);
3241 assert(kmsg_socket >= 0);
88213476 3242
de40a303
LP
3243 log_debug("Inner child is initializing.");
3244
0de7acce 3245 if (arg_userns_mode != USER_NAMESPACE_NO) {
03cfe0d5
LP
3246 /* Tell the parent, that it now can write the UID map. */
3247 (void) barrier_place(barrier); /* #1 */
7027ff61 3248
03cfe0d5 3249 /* Wait until the parent wrote the UID map */
baaa35ad 3250 if (!barrier_place_and_sync(barrier)) /* #2 */
2a2e78e9 3251 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Parent died too early");
88213476 3252
2a2e78e9
LP
3253 /* Become the new root user inside our namespace */
3254 r = reset_uid_gid();
3255 if (r < 0)
3256 return log_error_errno(r, "Couldn't become new root: %m");
3257
3258 /* Creating a new user namespace means all MS_SHARED mounts become MS_SLAVE. Let's put them
3259 * back to MS_SHARED here, since that's what we want as defaults. (This will not reconnect
3260 * propagation, but simply create new peer groups for all our mounts). */
511a8cfe 3261 r = mount_follow_verbose(LOG_ERR, NULL, "/", NULL, MS_SHARED|MS_REC, NULL);
2a2e78e9
LP
3262 if (r < 0)
3263 return r;
3264 }
6d66bd3b 3265
0de7acce 3266 r = mount_all(NULL,
4f086aab 3267 arg_mount_settings | MOUNT_IN_USERNS,
0de7acce 3268 arg_uid_shift,
0de7acce 3269 arg_selinux_apifs_context);
03cfe0d5
LP
3270 if (r < 0)
3271 return r;
3272
04413780
ZJS
3273 if (!arg_network_namespace_path && arg_private_network) {
3274 r = unshare(CLONE_NEWNET);
3275 if (r < 0)
3276 return log_error_errno(errno, "Failed to unshare network namespace: %m");
75116558
PS
3277
3278 /* Tell the parent that it can setup network interfaces. */
3279 (void) barrier_place(barrier); /* #3 */
04413780
ZJS
3280 }
3281
4f086aab 3282 r = mount_sysfs(NULL, arg_mount_settings);
d8fc6a00
LP
3283 if (r < 0)
3284 return r;
3285
03cfe0d5
LP
3286 /* Wait until we are cgroup-ified, so that we
3287 * can mount the right cgroup path writable */
baaa35ad
ZJS
3288 if (!barrier_place_and_sync(barrier)) /* #4 */
3289 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
3290 "Parent died too early");
88213476 3291
489fae52 3292 if (arg_use_cgns) {
0996ef00
CB
3293 r = unshare(CLONE_NEWCGROUP);
3294 if (r < 0)
04413780 3295 return log_error_errno(errno, "Failed to unshare cgroup namespace: %m");
0996ef00
CB
3296 r = mount_cgroups(
3297 "",
3298 arg_unified_cgroup_hierarchy,
3299 arg_userns_mode != USER_NAMESPACE_NO,
3300 arg_uid_shift,
3301 arg_uid_range,
5a8ff0e6 3302 arg_selinux_apifs_context,
ada54120 3303 true);
1433e0f2 3304 } else
0996ef00 3305 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
1433e0f2
LP
3306 if (r < 0)
3307 return r;
ec16945e 3308
1e4f1671 3309 r = setup_boot_id();
03cfe0d5
LP
3310 if (r < 0)
3311 return r;
ec16945e 3312
1e4f1671 3313 r = setup_kmsg(kmsg_socket);
03cfe0d5
LP
3314 if (r < 0)
3315 return r;
3316 kmsg_socket = safe_close(kmsg_socket);
ec16945e 3317
de40a303
LP
3318 r = mount_custom(
3319 "/",
3320 arg_custom_mounts,
3321 arg_n_custom_mounts,
de40a303 3322 0,
c0c8f718 3323 0,
de40a303 3324 arg_selinux_apifs_context,
5f0a6347 3325 MOUNT_NON_ROOT_ONLY | MOUNT_IN_USERNS);
de40a303
LP
3326 if (r < 0)
3327 return r;
3328
03cfe0d5
LP
3329 if (setsid() < 0)
3330 return log_error_errno(errno, "setsid() failed: %m");
3331
3332 if (arg_private_network)
df883de9 3333 (void) loopback_setup();
03cfe0d5 3334
7a8f6325
LP
3335 if (arg_expose_ports) {
3336 r = expose_port_send_rtnl(rtnl_socket);
3337 if (r < 0)
3338 return r;
3339 rtnl_socket = safe_close(rtnl_socket);
3340 }
03cfe0d5 3341
3acc84eb 3342 if (arg_console_mode != CONSOLE_PIPE) {
cd132992 3343 _cleanup_close_ int master = -1;
3acc84eb
FB
3344 _cleanup_free_ char *console = NULL;
3345
3346 /* Allocate a pty and make it available as /dev/console. */
dc98caea 3347 master = openpt_allocate(O_RDWR|O_NONBLOCK, &console);
3acc84eb 3348 if (master < 0)
dc98caea 3349 return log_error_errno(master, "Failed to allocate a pty: %m");
3acc84eb
FB
3350
3351 r = setup_dev_console(console);
3352 if (r < 0)
105a1a36 3353 return log_error_errno(r, "Failed to set up /dev/console: %m");
3acc84eb
FB
3354
3355 r = send_one_fd(master_pty_socket, master, 0);
3356 if (r < 0)
3357 return log_error_errno(r, "Failed to send master fd: %m");
3358 master_pty_socket = safe_close(master_pty_socket);
3359
3360 r = setup_stdio_as_dev_console();
3361 if (r < 0)
3362 return r;
3363 }
3364
de40a303
LP
3365 r = patch_sysctl();
3366 if (r < 0)
3367 return r;
3368
81f345df
LP
3369 if (arg_oom_score_adjust_set) {
3370 r = set_oom_score_adjust(arg_oom_score_adjust);
3371 if (r < 0)
3372 return log_error_errno(r, "Failed to adjust OOM score: %m");
3373 }
3374
0985c7c4
ZJS
3375 if (arg_cpu_set.set)
3376 if (sched_setaffinity(0, arg_cpu_set.allocated, arg_cpu_set.set) < 0)
d107bb7d
LP
3377 return log_error_errno(errno, "Failed to set CPU affinity: %m");
3378
c818eef1 3379 (void) setup_hostname();
03cfe0d5 3380
050f7277 3381 if (arg_personality != PERSONALITY_INVALID) {
21022b9d
LP
3382 r = safe_personality(arg_personality);
3383 if (r < 0)
3384 return log_error_errno(r, "personality() failed: %m");
03cfe0d5 3385 } else if (secondary) {
21022b9d
LP
3386 r = safe_personality(PER_LINUX32);
3387 if (r < 0)
3388 return log_error_errno(r, "personality() failed: %m");
03cfe0d5
LP
3389 }
3390
de40a303
LP
3391 r = setrlimit_closest_all((const struct rlimit *const*) arg_rlimit, &which_failed);
3392 if (r < 0)
3393 return log_error_errno(r, "Failed to apply resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
3394
3395#if HAVE_SECCOMP
3396 if (arg_seccomp) {
3397
3398 if (is_seccomp_available()) {
3399
3400 r = seccomp_load(arg_seccomp);
7bc5e0b1 3401 if (ERRNO_IS_SECCOMP_FATAL(r))
de40a303
LP
3402 return log_error_errno(r, "Failed to install seccomp filter: %m");
3403 if (r < 0)
3404 log_debug_errno(r, "Failed to install seccomp filter: %m");
3405 }
3406 } else
3407#endif
3408 {
6b000af4 3409 r = setup_seccomp(arg_caps_retain, arg_syscall_allow_list, arg_syscall_deny_list);
de40a303
LP
3410 if (r < 0)
3411 return r;
3412 }
3413
4a4654e0 3414 if (arg_suppress_sync) {
20e458ae 3415#if HAVE_SECCOMP
4a4654e0
LP
3416 r = seccomp_suppress_sync();
3417 if (r < 0)
3418 log_debug_errno(r, "Failed to install sync() suppression seccomp filter, ignoring: %m");
20e458ae 3419#else
2db32618 3420 log_debug("systemd is built without SECCOMP support. Ignoring --suppress-sync= command line option and SuppressSync= setting.");
20e458ae 3421#endif
4a4654e0
LP
3422 }
3423
349cc4a5 3424#if HAVE_SELINUX
03cfe0d5 3425 if (arg_selinux_context)
2ed96880 3426 if (setexeccon(arg_selinux_context) < 0)
03cfe0d5
LP
3427 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
3428#endif
3429
de40a303
LP
3430 /* Make sure we keep the caps across the uid/gid dropping, so that we can retain some selected caps
3431 * if we need to later on. */
3432 if (prctl(PR_SET_KEEPCAPS, 1) < 0)
3433 return log_error_errno(errno, "Failed to set PR_SET_KEEPCAPS: %m");
3434
3435 if (uid_is_valid(arg_uid) || gid_is_valid(arg_gid))
3462d773 3436 r = change_uid_gid_raw(arg_uid, arg_gid, arg_supplementary_gids, arg_n_supplementary_gids, arg_console_mode != CONSOLE_PIPE);
de40a303 3437 else
3462d773 3438 r = change_uid_gid(arg_user, arg_console_mode != CONSOLE_PIPE, &home);
03cfe0d5
LP
3439 if (r < 0)
3440 return r;
3441
de40a303
LP
3442 r = drop_capabilities(getuid());
3443 if (r < 0)
3444 return log_error_errno(r, "Dropping capabilities failed: %m");
3445
66edd963
LP
3446 if (arg_no_new_privileges)
3447 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
3448 return log_error_errno(errno, "Failed to disable new privileges: %m");
3449
6aadfa4c
ILG
3450 /* LXC sets container=lxc, so follow the scheme here */
3451 envp[n_env++] = strjoina("container=", arg_container_service_name);
3452
03cfe0d5
LP
3453 envp[n_env] = strv_find_prefix(environ, "TERM=");
3454 if (envp[n_env])
313cefa1 3455 n_env++;
03cfe0d5 3456
de40a303 3457 if (home || !uid_is_valid(arg_uid) || arg_uid == 0)
4ab3d29f 3458 if (asprintf(envp + n_env++, "HOME=%s", home ?: "/root") < 0)
de40a303
LP
3459 return log_oom();
3460
3461 if (arg_user || !uid_is_valid(arg_uid) || arg_uid == 0)
4ab3d29f
ZJS
3462 if (asprintf(envp + n_env++, "USER=%s", arg_user ?: "root") < 0 ||
3463 asprintf(envp + n_env++, "LOGNAME=%s", arg_user ? arg_user : "root") < 0)
de40a303 3464 return log_oom();
03cfe0d5 3465
3bbaff3e 3466 assert(!sd_id128_is_null(arg_uuid));
03cfe0d5 3467
b7416360 3468 if (asprintf(envp + n_env++, "container_uuid=%s", SD_ID128_TO_UUID_STRING(arg_uuid)) < 0)
e01ff70a 3469 return log_oom();
03cfe0d5
LP
3470
3471 if (fdset_size(fds) > 0) {
3472 r = fdset_cloexec(fds, false);
3473 if (r < 0)
3474 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
3475
4ab3d29f
ZJS
3476 if ((asprintf(envp + n_env++, "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
3477 (asprintf(envp + n_env++, "LISTEN_PID=1") < 0))
03cfe0d5
LP
3478 return log_oom();
3479 }
4ab3d29f 3480 if (asprintf(envp + n_env++, "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
9c1e04d0 3481 return log_oom();
03cfe0d5 3482
3652872a
LP
3483 if (arg_n_credentials > 0) {
3484 envp[n_env] = strdup("CREDENTIALS_DIRECTORY=/run/host/credentials");
3485 if (!envp[n_env])
3486 return log_oom();
3487 n_env++;
3488 }
3489
b626f695
DDM
3490 if (arg_start_mode != START_BOOT) {
3491 /* If we're running a command in the container, let's default to the C.UTF-8 locale as it's
3492 * part of glibc these days and was backported to most distros a long time before it got
3493 * added to upstream glibc. */
3494 envp[n_env] = strdup("LANG=C.UTF-8");
3495 if (!envp[n_env])
3496 return log_oom();
3497 n_env++;
3498 }
3499
4ab3d29f 3500 env_use = strv_env_merge(envp, os_release_pairs, arg_setenv);
2371271c
TG
3501 if (!env_use)
3502 return log_oom();
03cfe0d5
LP
3503
3504 /* Let the parent know that we are ready and
3505 * wait until the parent is ready with the
3506 * setup, too... */
baaa35ad 3507 if (!barrier_place_and_sync(barrier)) /* #5 */
335d2ead 3508 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Parent died too early");
03cfe0d5 3509
5f932eb9
LP
3510 if (arg_chdir)
3511 if (chdir(arg_chdir) < 0)
3512 return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
3513
7732f92b 3514 if (arg_start_mode == START_PID2) {
75bf701f 3515 r = stub_pid1(arg_uuid);
7732f92b
LP
3516 if (r < 0)
3517 return r;
3518 }
3519
335d2ead
LP
3520 if (arg_console_mode != CONSOLE_PIPE) {
3521 /* So far our pty wasn't controlled by any process. Finally, it's time to change that, if we
3522 * are configured for that. Acquire it as controlling tty. */
3523 if (ioctl(STDIN_FILENO, TIOCSCTTY) < 0)
3524 return log_error_errno(errno, "Failed to acquire controlling TTY: %m");
3525 }
3526
de40a303
LP
3527 log_debug("Inner child completed, invoking payload.");
3528
8ca082b4
LP
3529 /* Now, explicitly close the log, so that we then can close all remaining fds. Closing the log explicitly first
3530 * has the benefit that the logging subsystem knows about it, and is thus ready to be reopened should we need
3531 * it again. Note that the other fds closed here are at least the locking and barrier fds. */
03cfe0d5 3532 log_close();
8ca082b4
LP
3533 log_set_open_when_needed(true);
3534
03cfe0d5
LP
3535 (void) fdset_close_others(fds);
3536
7732f92b 3537 if (arg_start_mode == START_BOOT) {
03cfe0d5
LP
3538 char **a;
3539 size_t m;
3540
3541 /* Automatically search for the init system */
3542
75f32f04
ZJS
3543 m = strv_length(arg_parameters);
3544 a = newa(char*, m + 2);
3545 memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
3546 a[1 + m] = NULL;
03cfe0d5 3547
a5096641
LP
3548 FOREACH_STRING(init,
3549 "/usr/lib/systemd/systemd",
3550 "/lib/systemd/systemd",
3551 "/sbin/init") {
3552 a[0] = (char*) init;
3553 execve(a[0], a, env_use);
3554 }
ced58da7
LP
3555
3556 exec_target = "/usr/lib/systemd/systemd, /lib/systemd/systemd, /sbin/init";
1a68e1e5 3557 } else if (!strv_isempty(arg_parameters)) {
b6b180b7
LP
3558 const char *dollar_path;
3559
1a68e1e5 3560 exec_target = arg_parameters[0];
b6b180b7
LP
3561
3562 /* Use the user supplied search $PATH if there is one, or DEFAULT_PATH_COMPAT if not to search the
3563 * binary. */
3564 dollar_path = strv_env_get(env_use, "PATH");
3565 if (dollar_path) {
6f646e01 3566 if (setenv("PATH", dollar_path, 1) < 0)
b6b180b7
LP
3567 return log_error_errno(errno, "Failed to update $PATH: %m");
3568 }
3569
f757855e 3570 execvpe(arg_parameters[0], arg_parameters, env_use);
1a68e1e5 3571 } else {
5f932eb9 3572 if (!arg_chdir)
d929b0f9
ZJS
3573 /* If we cannot change the directory, we'll end up in /, that is expected. */
3574 (void) chdir(home ?: "/root");
5f932eb9 3575
53350c7b 3576 execle(DEFAULT_USER_SHELL, "-" DEFAULT_USER_SHELL_NAME, NULL, env_use);
3577 if (!streq(DEFAULT_USER_SHELL, "/bin/bash"))
3578 execle("/bin/bash", "-bash", NULL, env_use);
3579 if (!streq(DEFAULT_USER_SHELL, "/bin/sh"))
3580 execle("/bin/sh", "-sh", NULL, env_use);
ced58da7 3581
53350c7b 3582 exec_target = DEFAULT_USER_SHELL ", /bin/bash, /bin/sh";
03cfe0d5
LP
3583 }
3584
8ca082b4 3585 return log_error_errno(errno, "execv(%s) failed: %m", exec_target);
03cfe0d5
LP
3586}
3587
e96ceaba 3588static int setup_notify_child(void) {
271f518f 3589 _cleanup_close_ int fd = -1;
1eb874b9 3590 static const union sockaddr_union sa = {
44ed5214
LP
3591 .un.sun_family = AF_UNIX,
3592 .un.sun_path = NSPAWN_NOTIFY_SOCKET_PATH,
9c1e04d0
AP
3593 };
3594 int r;
3595
3596 fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
3597 if (fd < 0)
3598 return log_error_errno(errno, "Failed to allocate notification socket: %m");
3599
3600 (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
fbda85b0 3601 (void) sockaddr_un_unlink(&sa.un);
9c1e04d0 3602
9c1e04d0 3603 r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
271f518f 3604 if (r < 0)
44ed5214 3605 return log_error_errno(errno, "bind(" NSPAWN_NOTIFY_SOCKET_PATH ") failed: %m");
9c1e04d0 3606
adc7d9f0 3607 r = userns_lchown(NSPAWN_NOTIFY_SOCKET_PATH, 0, 0);
271f518f 3608 if (r < 0)
adc7d9f0 3609 return log_error_errno(r, "Failed to chown " NSPAWN_NOTIFY_SOCKET_PATH ": %m");
adc7d9f0 3610
2ff48e98 3611 r = setsockopt_int(fd, SOL_SOCKET, SO_PASSCRED, true);
271f518f 3612 if (r < 0)
2ff48e98 3613 return log_error_errno(r, "SO_PASSCRED failed: %m");
9c1e04d0 3614
271f518f 3615 return TAKE_FD(fd);
9c1e04d0
AP
3616}
3617
03cfe0d5
LP
3618static int outer_child(
3619 Barrier *barrier,
3620 const char *directory,
2d845785 3621 DissectedImage *dissected_image,
03cfe0d5
LP
3622 bool secondary,
3623 int pid_socket,
e01ff70a 3624 int uuid_socket,
9c1e04d0 3625 int notify_socket,
03cfe0d5
LP
3626 int kmsg_socket,
3627 int rtnl_socket,
825d5287 3628 int uid_shift_socket,
3acc84eb 3629 int master_pty_socket,
8199d554 3630 int unified_cgroup_hierarchy_socket,
d7bea6b6
DP
3631 FDSet *fds,
3632 int netns_fd) {
03cfe0d5 3633
2f893044 3634 _cleanup_(bind_user_context_freep) BindUserContext *bind_user_context = NULL;
e1bb4b0d 3635 _cleanup_strv_free_ char **os_release_pairs = NULL;
bf428efb 3636 _cleanup_close_ int fd = -1;
f61c7f88 3637 bool idmap = false;
e5f10caf 3638 const char *p;
03cfe0d5
LP
3639 pid_t pid;
3640 ssize_t l;
de40a303 3641 int r;
03cfe0d5 3642
d1d0b895
LP
3643 /* This is the "outer" child process, i.e the one forked off by the container manager itself. It
3644 * already has its own CLONE_NEWNS namespace (which was created by the clone()). It still lives in
3645 * the host's CLONE_NEWPID, CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER and CLONE_NEWNET
3646 * namespaces. After it completed a number of initializations a second child (the "inner" one) is
3647 * forked off it, and it exits. */
b37469d7 3648
03cfe0d5
LP
3649 assert(barrier);
3650 assert(directory);
03cfe0d5 3651 assert(pid_socket >= 0);
e01ff70a 3652 assert(uuid_socket >= 0);
9c1e04d0 3653 assert(notify_socket >= 0);
3acc84eb 3654 assert(master_pty_socket >= 0);
03cfe0d5
LP
3655 assert(kmsg_socket >= 0);
3656
de40a303
LP
3657 log_debug("Outer child is initializing.");
3658
e1bb4b0d
LB
3659 r = load_os_release_pairs_with_prefix("/", "container_host_", &os_release_pairs);
3660 if (r < 0)
3661 log_debug_errno(r, "Failed to read os-release from host for container, ignoring: %m");
3662
03cfe0d5
LP
3663 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
3664 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3665
03cfe0d5
LP
3666 r = reset_audit_loginuid();
3667 if (r < 0)
3668 return r;
3669
2a2e78e9
LP
3670 /* Mark everything as slave, so that we still receive mounts from the real root, but don't propagate
3671 * mounts to the real root. */
511a8cfe 3672 r = mount_follow_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
60e76d48
ZJS
3673 if (r < 0)
3674 return r;
03cfe0d5 3675
2d845785 3676 if (dissected_image) {
d1d0b895
LP
3677 /* If we are operating on a disk image, then mount its root directory now, but leave out the
3678 * rest. We can read the UID shift from it if we need to. Further down we'll mount the rest,
3679 * but then with the uid shift known. That way we can mount VFAT file systems shifted to the
3680 * right place right away. This makes sure ESP partitions and userns are compatible. */
2d3a5a73 3681
af187ab2 3682 r = dissected_image_mount_and_warn(
d04faa4e
LP
3683 dissected_image,
3684 directory,
3685 arg_uid_shift,
21b61b1d 3686 arg_uid_range,
d04faa4e
LP
3687 DISSECT_IMAGE_MOUNT_ROOT_ONLY|
3688 DISSECT_IMAGE_DISCARD_ON_LOOP|
3689 DISSECT_IMAGE_USR_NO_ROOT|
c65f854a 3690 (arg_read_only ? DISSECT_IMAGE_READ_ONLY : DISSECT_IMAGE_FSCK|DISSECT_IMAGE_GROWFS)|
af187ab2 3691 (arg_start_mode == START_BOOT ? DISSECT_IMAGE_VALIDATE_OS : 0));
2d845785 3692 if (r < 0)
af187ab2 3693 return r;
2d845785 3694 }
03cfe0d5 3695
391567f4
LP
3696 r = determine_uid_shift(directory);
3697 if (r < 0)
3698 return r;
3699
0de7acce 3700 if (arg_userns_mode != USER_NAMESPACE_NO) {
0e7ac751 3701 /* Let the parent know which UID shift we read from the image */
825d5287
RM
3702 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
3703 if (l < 0)
3704 return log_error_errno(errno, "Failed to send UID shift: %m");
baaa35ad
ZJS
3705 if (l != sizeof(arg_uid_shift))
3706 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3707 "Short write while sending UID shift.");
0e7ac751 3708
0de7acce 3709 if (arg_userns_mode == USER_NAMESPACE_PICK) {
d1d0b895
LP
3710 /* When we are supposed to pick the UID shift, the parent will check now whether the
3711 * UID shift we just read from the image is available. If yes, it will send the UID
3712 * shift back to us, if not it will pick a different one, and send it back to us. */
0e7ac751
LP
3713
3714 l = recv(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
3715 if (l < 0)
3716 return log_error_errno(errno, "Failed to recv UID shift: %m");
baaa35ad
ZJS
3717 if (l != sizeof(arg_uid_shift))
3718 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3719 "Short read while receiving UID shift.");
0e7ac751
LP
3720 }
3721
ff6c6cc1
LP
3722 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
3723 "Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
825d5287
RM
3724 }
3725
6f83d3d1
LP
3726 if (path_equal(directory, "/")) {
3727 /* If the directory we shall boot is the host, let's operate on a bind mount at a different
3728 * place, so that we can make changes to its mount structure (for example, to implement
3729 * --volatile=) without this interfering with our ability to access files such as
3730 * /etc/localtime to copy into the container. Note that we use a fixed place for this
3731 * (instead of a temporary directory, since we are living in our own mount namspace here
7802194a 3732 * already, and thus don't need to be afraid of colliding with anyone else's mounts). */
6f83d3d1
LP
3733 (void) mkdir_p("/run/systemd/nspawn-root", 0755);
3734
511a8cfe 3735 r = mount_nofollow_verbose(LOG_ERR, "/", "/run/systemd/nspawn-root", NULL, MS_BIND|MS_REC, NULL);
6f83d3d1
LP
3736 if (r < 0)
3737 return r;
3738
3739 directory = "/run/systemd/nspawn-root";
e50cd82f 3740 }
7d0ecdd6
LP
3741
3742 r = setup_pivot_root(
3743 directory,
3744 arg_pivot_root_new,
3745 arg_pivot_root_old);
3746 if (r < 0)
3747 return r;
3748
3749 r = setup_volatile_mode(
3750 directory,
3751 arg_volatile_mode,
7d0ecdd6 3752 arg_uid_shift,
8f1ed04a 3753 arg_selinux_apifs_context);
7d0ecdd6
LP
3754 if (r < 0)
3755 return r;
3756
2f893044
LP
3757 r = bind_user_prepare(
3758 directory,
3759 arg_bind_user,
3760 arg_uid_shift,
3761 arg_uid_range,
3762 &arg_custom_mounts, &arg_n_custom_mounts,
3763 &bind_user_context);
3764 if (r < 0)
3765 return r;
3766
3767 if (arg_userns_mode != USER_NAMESPACE_NO && bind_user_context) {
d1d0b895
LP
3768 /* Send the user maps we determined to the parent, so that it installs it in our user
3769 * namespace UID map table */
2f893044
LP
3770
3771 for (size_t i = 0; i < bind_user_context->n_data; i++) {
3772 uid_t map[] = {
3773 bind_user_context->data[i].payload_user->uid,
3774 bind_user_context->data[i].host_user->uid,
3775 (uid_t) bind_user_context->data[i].payload_group->gid,
3776 (uid_t) bind_user_context->data[i].host_group->gid,
3777 };
3778
3779 l = send(uid_shift_socket, map, sizeof(map), MSG_NOSIGNAL);
3780 if (l < 0)
3781 return log_error_errno(errno, "Failed to send user UID map: %m");
3782 if (l != sizeof(map))
3783 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3784 "Short write while sending user UID map.");
3785 }
3786 }
3787
5f0a6347
DDM
3788 r = mount_custom(
3789 directory,
3790 arg_custom_mounts,
3791 arg_n_custom_mounts,
5f0a6347 3792 arg_uid_shift,
c0c8f718 3793 arg_uid_range,
5f0a6347
DDM
3794 arg_selinux_apifs_context,
3795 MOUNT_ROOT_ONLY);
3796 if (r < 0)
3797 return r;
3798
5530dc87 3799 /* Make sure we always have a mount that we can move to root later on. */
14a25e1f
LP
3800 r = make_mount_point(directory);
3801 if (r < 0)
3802 return r;
5530dc87 3803
c0c8f718
AV
3804 if (arg_userns_mode != USER_NAMESPACE_NO &&
3805 IN_SET(arg_userns_ownership, USER_NAMESPACE_OWNERSHIP_MAP, USER_NAMESPACE_OWNERSHIP_AUTO) &&
3806 arg_uid_shift != 0) {
3807
50ae2966 3808 r = remount_idmap(directory, arg_uid_shift, arg_uid_range, REMOUNT_IDMAP_HOST_ROOT);
c0c8f718
AV
3809 if (r == -EINVAL || ERRNO_IS_NOT_SUPPORTED(r)) {
3810 /* This might fail because the kernel or file system doesn't support idmapping. We
3811 * can't really distinguish this nicely, nor do we have any guarantees about the
3812 * error codes we see, could be EOPNOTSUPP or EINVAL. */
3813 if (arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_AUTO)
3814 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
3815 "ID mapped mounts are apparently not available, sorry.");
3816
3817 log_debug("ID mapped mounts are apparently not available on this kernel or for the selected file system, reverting to recursive chown()ing.");
3818 arg_userns_ownership = USER_NAMESPACE_OWNERSHIP_CHOWN;
3819 } else if (r < 0)
3820 return log_error_errno(r, "Failed to set up ID mapped mounts: %m");
3821 else {
3822 log_debug("ID mapped mounts available, making use of them.");
3823 idmap = true;
3824 }
3825 }
3826
2d3a5a73
LP
3827 if (dissected_image) {
3828 /* Now we know the uid shift, let's now mount everything else that might be in the image. */
d04faa4e
LP
3829 r = dissected_image_mount(
3830 dissected_image,
3831 directory,
3832 arg_uid_shift,
21b61b1d 3833 arg_uid_range,
d04faa4e
LP
3834 DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY|
3835 DISSECT_IMAGE_DISCARD_ON_LOOP|
3836 DISSECT_IMAGE_USR_NO_ROOT|
f61c7f88
LP
3837 (arg_read_only ? DISSECT_IMAGE_READ_ONLY : DISSECT_IMAGE_FSCK|DISSECT_IMAGE_GROWFS)|
3838 (idmap ? DISSECT_IMAGE_MOUNT_IDMAPPED : 0));
4fcb96ce
LP
3839 if (r == -EUCLEAN)
3840 return log_error_errno(r, "File system check for image failed: %m");
2d3a5a73 3841 if (r < 0)
4fcb96ce 3842 return log_error_errno(r, "Failed to mount image file system: %m");
2d3a5a73
LP
3843 }
3844
8199d554
LP
3845 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
3846 /* OK, we don't know yet which cgroup mode to use yet. Let's figure it out, and tell the parent. */
3847
3848 r = detect_unified_cgroup_hierarchy_from_image(directory);
3849 if (r < 0)
3850 return r;
3851
3852 l = send(unified_cgroup_hierarchy_socket, &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), MSG_NOSIGNAL);
3853 if (l < 0)
3854 return log_error_errno(errno, "Failed to send cgroup mode: %m");
baaa35ad
ZJS
3855 if (l != sizeof(arg_unified_cgroup_hierarchy))
3856 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3857 "Short write while sending cgroup mode.");
8199d554
LP
3858
3859 unified_cgroup_hierarchy_socket = safe_close(unified_cgroup_hierarchy_socket);
3860 }
3861
d1d0b895
LP
3862 /* Mark everything as shared so our mounts get propagated down. This is required to make new bind
3863 * mounts available in systemd services inside the container that create a new mount namespace. See
3864 * https://github.com/systemd/systemd/issues/3860 Further submounts (such as /dev) done after this
3865 * will inherit the shared propagation mode.
5f0a6347 3866 *
d1d0b895
LP
3867 * IMPORTANT: Do not overmount the root directory anymore from now on to enable moving the root
3868 * directory mount to root later on.
5f0a6347
DDM
3869 * https://github.com/systemd/systemd/issues/3847#issuecomment-562735251
3870 */
511a8cfe 3871 r = mount_nofollow_verbose(LOG_ERR, NULL, directory, NULL, MS_SHARED|MS_REC, NULL);
4ad14eff
LP
3872 if (r < 0)
3873 return r;
3874
3875 r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
3876 if (r < 0)
3877 return r;
3878
03cfe0d5
LP
3879 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
3880 if (r < 0)
3881 return r;
3882
bbd407ea
DDM
3883 if (arg_read_only && arg_volatile_mode == VOLATILE_NO &&
3884 !has_custom_root_mount(arg_custom_mounts, arg_n_custom_mounts)) {
64e82c19 3885 r = bind_remount_recursive(directory, MS_RDONLY, MS_RDONLY, NULL);
03cfe0d5
LP
3886 if (r < 0)
3887 return log_error_errno(r, "Failed to make tree read-only: %m");
3888 }
3889
0de7acce 3890 r = mount_all(directory,
4f086aab 3891 arg_mount_settings,
0de7acce 3892 arg_uid_shift,
0de7acce 3893 arg_selinux_apifs_context);
03cfe0d5
LP
3894 if (r < 0)
3895 return r;
3896
07fa00f9
LP
3897 r = copy_devnodes(directory);
3898 if (r < 0)
03cfe0d5
LP
3899 return r;
3900
de40a303
LP
3901 r = make_extra_nodes(directory);
3902 if (r < 0)
3903 return r;
3904
3905 (void) dev_setup(directory, arg_uid_shift, arg_uid_shift);
e5f10caf 3906
9fac5029 3907 p = prefix_roota(directory, "/run/host");
e5f10caf 3908 (void) make_inaccessible_nodes(p, arg_uid_shift, arg_uid_shift);
03cfe0d5 3909
07fa00f9
LP
3910 r = setup_pts(directory);
3911 if (r < 0)
03cfe0d5
LP
3912 return r;
3913
3914 r = setup_propagate(directory);
3915 if (r < 0)
3916 return r;
3917
8e5430c4
LP
3918 r = setup_keyring();
3919 if (r < 0)
3920 return r;
3921
3652872a
LP
3922 r = setup_credentials(directory);
3923 if (r < 0)
3924 return r;
3925
2f893044
LP
3926 r = bind_user_setup(bind_user_context, directory);
3927 if (r < 0)
3928 return r;
3929
5c4deb9a
MJ
3930 r = mount_custom(
3931 directory,
3932 arg_custom_mounts,
3933 arg_n_custom_mounts,
3934 arg_uid_shift,
c0c8f718 3935 arg_uid_range,
5c4deb9a
MJ
3936 arg_selinux_apifs_context,
3937 MOUNT_NON_ROOT_ONLY);
3938 if (r < 0)
3939 return r;
3940
03cfe0d5
LP
3941 r = setup_timezone(directory);
3942 if (r < 0)
3943 return r;
3944
3945 r = setup_resolv_conf(directory);
3946 if (r < 0)
3947 return r;
3948
e01ff70a
MS
3949 r = setup_machine_id(directory);
3950 if (r < 0)
3951 return r;
3952
03cfe0d5
LP
3953 r = setup_journal(directory);
3954 if (r < 0)
3955 return r;
3956
0f48ba7b
LP
3957 /* The same stuff as the $container env var, but nicely readable for the entire payload */
3958 p = prefix_roota(directory, "/run/host/container-manager");
3959 (void) write_string_file(p, arg_container_service_name, WRITE_STRING_FILE_CREATE);
3960
3961 /* The same stuff as the $container_uuid env var */
3962 p = prefix_roota(directory, "/run/host/container-uuid");
3963 (void) write_string_filef(p, WRITE_STRING_FILE_CREATE, SD_ID128_UUID_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid));
3964
489fae52 3965 if (!arg_use_cgns) {
0996ef00
CB
3966 r = mount_cgroups(
3967 directory,
3968 arg_unified_cgroup_hierarchy,
3969 arg_userns_mode != USER_NAMESPACE_NO,
3970 arg_uid_shift,
3971 arg_uid_range,
5a8ff0e6 3972 arg_selinux_apifs_context,
ada54120 3973 false);
0996ef00
CB
3974 if (r < 0)
3975 return r;
3976 }
03cfe0d5
LP
3977
3978 r = mount_move_root(directory);
3979 if (r < 0)
3980 return log_error_errno(r, "Failed to move root directory: %m");
3981
e96ceaba 3982 fd = setup_notify_child();
9c1e04d0
AP
3983 if (fd < 0)
3984 return fd;
3985
03cfe0d5 3986 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
0c582db0 3987 arg_clone_ns_flags |
8869a0b4 3988 (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
03cfe0d5
LP
3989 if (pid < 0)
3990 return log_error_errno(errno, "Failed to fork inner child: %m");
03cfe0d5
LP
3991 if (pid == 0) {
3992 pid_socket = safe_close(pid_socket);
e01ff70a 3993 uuid_socket = safe_close(uuid_socket);
9c1e04d0 3994 notify_socket = safe_close(notify_socket);
825d5287 3995 uid_shift_socket = safe_close(uid_shift_socket);
03cfe0d5 3996
2a2e78e9
LP
3997 /* The inner child has all namespaces that are requested, so that we all are owned by the
3998 * user if user namespaces are turned on. */
03cfe0d5 3999
d7bea6b6
DP
4000 if (arg_network_namespace_path) {
4001 r = namespace_enter(-1, -1, netns_fd, -1, -1);
4002 if (r < 0)
e2d39e54 4003 return log_error_errno(r, "Failed to join network namespace: %m");
d7bea6b6
DP
4004 }
4005
e1bb4b0d 4006 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, master_pty_socket, fds, os_release_pairs);
03cfe0d5
LP
4007 if (r < 0)
4008 _exit(EXIT_FAILURE);
4009
4010 _exit(EXIT_SUCCESS);
4011 }
4012
4013 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
4014 if (l < 0)
4015 return log_error_errno(errno, "Failed to send PID: %m");
baaa35ad
ZJS
4016 if (l != sizeof(pid))
4017 return log_error_errno(SYNTHETIC_ERRNO(EIO),
4018 "Short write while sending PID.");
03cfe0d5 4019
e01ff70a
MS
4020 l = send(uuid_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
4021 if (l < 0)
4022 return log_error_errno(errno, "Failed to send machine ID: %m");
baaa35ad
ZJS
4023 if (l != sizeof(arg_uuid))
4024 return log_error_errno(SYNTHETIC_ERRNO(EIO),
4025 "Short write while sending machine ID.");
e01ff70a 4026
9c1e04d0
AP
4027 l = send_one_fd(notify_socket, fd, 0);
4028 if (l < 0)
ba72801d 4029 return log_error_errno(l, "Failed to send notify fd: %m");
9c1e04d0 4030
03cfe0d5 4031 pid_socket = safe_close(pid_socket);
e01ff70a 4032 uuid_socket = safe_close(uuid_socket);
9c1e04d0 4033 notify_socket = safe_close(notify_socket);
3acc84eb 4034 master_pty_socket = safe_close(master_pty_socket);
327e26d6
KN
4035 kmsg_socket = safe_close(kmsg_socket);
4036 rtnl_socket = safe_close(rtnl_socket);
d7bea6b6 4037 netns_fd = safe_close(netns_fd);
03cfe0d5
LP
4038
4039 return 0;
4040}
4041
0e7ac751 4042static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
d381c8a6 4043 bool tried_hashed = false;
0e7ac751
LP
4044 unsigned n_tries = 100;
4045 uid_t candidate;
4046 int r;
4047
4048 assert(shift);
4049 assert(ret_lock_file);
0de7acce 4050 assert(arg_userns_mode == USER_NAMESPACE_PICK);
0e7ac751
LP
4051 assert(arg_uid_range == 0x10000U);
4052
4053 candidate = *shift;
4054
4055 (void) mkdir("/run/systemd/nspawn-uid", 0755);
4056
4057 for (;;) {
fbd0b64f 4058 char lock_path[STRLEN("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
8e766630 4059 _cleanup_(release_lock_file) LockFile lf = LOCK_FILE_INIT;
0e7ac751
LP
4060
4061 if (--n_tries <= 0)
4062 return -EBUSY;
4063
87d5e4f2 4064 if (candidate < CONTAINER_UID_BASE_MIN || candidate > CONTAINER_UID_BASE_MAX)
0e7ac751
LP
4065 goto next;
4066 if ((candidate & UINT32_C(0xFFFF)) != 0)
4067 goto next;
4068
4069 xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
4070 r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
4071 if (r == -EBUSY) /* Range already taken by another nspawn instance */
4072 goto next;
4073 if (r < 0)
4074 return r;
4075
4076 /* Make some superficial checks whether the range is currently known in the user database */
4077 if (getpwuid(candidate))
4078 goto next;
4079 if (getpwuid(candidate + UINT32_C(0xFFFE)))
4080 goto next;
4081 if (getgrgid(candidate))
4082 goto next;
4083 if (getgrgid(candidate + UINT32_C(0xFFFE)))
4084 goto next;
4085
4086 *ret_lock_file = lf;
4087 lf = (struct LockFile) LOCK_FILE_INIT;
4088 *shift = candidate;
4089 return 0;
4090
4091 next:
d381c8a6
LP
4092 if (arg_machine && !tried_hashed) {
4093 /* Try to hash the base from the container name */
4094
4095 static const uint8_t hash_key[] = {
4096 0xe1, 0x56, 0xe0, 0xf0, 0x4a, 0xf0, 0x41, 0xaf,
4097 0x96, 0x41, 0xcf, 0x41, 0x33, 0x94, 0xff, 0x72
4098 };
4099
4100 candidate = (uid_t) siphash24(arg_machine, strlen(arg_machine), hash_key);
4101
4102 tried_hashed = true;
4103 } else
4104 random_bytes(&candidate, sizeof(candidate));
4105
87d5e4f2 4106 candidate = (candidate % (CONTAINER_UID_BASE_MAX - CONTAINER_UID_BASE_MIN)) + CONTAINER_UID_BASE_MIN;
0e7ac751
LP
4107 candidate &= (uid_t) UINT32_C(0xFFFF0000);
4108 }
4109}
4110
2f893044
LP
4111static int add_one_uid_map(
4112 char **p,
4113 uid_t container_uid,
4114 uid_t host_uid,
4115 uid_t range) {
4116
4117 return strextendf(p,
4118 UID_FMT " " UID_FMT " " UID_FMT "\n",
4119 container_uid, host_uid, range);
4120}
4121
4122static int make_uid_map_string(
4123 const uid_t bind_user_uid[],
4124 size_t n_bind_user_uid,
4125 size_t offset,
4126 char **ret) {
4127
4128 _cleanup_free_ char *s = NULL;
4129 uid_t previous_uid = 0;
4130 int r;
4131
4132 assert(n_bind_user_uid == 0 || bind_user_uid);
2f092762 4133 assert(IN_SET(offset, 0, 2)); /* used to switch between UID and GID map */
2f893044
LP
4134 assert(ret);
4135
4136 /* The bind_user_uid[] array is a series of 4 uid_t values, for each --bind-user= entry one
4137 * quadruplet, consisting of host and container UID + GID. */
4138
4139 for (size_t i = 0; i < n_bind_user_uid; i++) {
4140 uid_t payload_uid = bind_user_uid[i*2+offset],
4141 host_uid = bind_user_uid[i*2+offset+1];
4142
4143 assert(previous_uid <= payload_uid);
4144 assert(payload_uid < arg_uid_range);
4145
4146 /* Add a range to close the gap to previous entry */
4147 if (payload_uid > previous_uid) {
4148 r = add_one_uid_map(&s, previous_uid, arg_uid_shift + previous_uid, payload_uid - previous_uid);
4149 if (r < 0)
4150 return r;
4151 }
4152
4153 /* Map this specific user */
4154 r = add_one_uid_map(&s, payload_uid, host_uid, 1);
4155 if (r < 0)
4156 return r;
4157
4158 previous_uid = payload_uid + 1;
4159 }
4160
4161 /* And add a range to close the gap to finish the range */
4162 if (arg_uid_range > previous_uid) {
4163 r = add_one_uid_map(&s, previous_uid, arg_uid_shift + previous_uid, arg_uid_range - previous_uid);
4164 if (r < 0)
4165 return r;
4166 }
4167
4168 assert(s);
4169
4170 *ret = TAKE_PTR(s);
4171 return 0;
4172}
4173
4174static int setup_uid_map(
4175 pid_t pid,
4176 const uid_t bind_user_uid[],
4177 size_t n_bind_user_uid) {
4178
4179 char uid_map[STRLEN("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1];
4180 _cleanup_free_ char *s = NULL;
03cfe0d5
LP
4181 int r;
4182
4183 assert(pid > 1);
4184
2f893044
LP
4185 /* Build the UID map string */
4186 if (make_uid_map_string(bind_user_uid, n_bind_user_uid, 0, &s) < 0) /* offset=0 contains the UID pair */
4187 return log_oom();
4188
03cfe0d5 4189 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
2f893044 4190 r = write_string_file(uid_map, s, WRITE_STRING_FILE_DISABLE_BUFFER);
03cfe0d5
LP
4191 if (r < 0)
4192 return log_error_errno(r, "Failed to write UID map: %m");
4193
2f893044
LP
4194 /* And now build the GID map string */
4195 s = mfree(s);
4196 if (make_uid_map_string(bind_user_uid, n_bind_user_uid, 2, &s) < 0) /* offset=2 contains the GID pair */
4197 return log_oom();
4198
03cfe0d5 4199 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
2f893044 4200 r = write_string_file(uid_map, s, WRITE_STRING_FILE_DISABLE_BUFFER);
03cfe0d5
LP
4201 if (r < 0)
4202 return log_error_errno(r, "Failed to write GID map: %m");
4203
4204 return 0;
4205}
4206
9c1e04d0 4207static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
9c1e04d0
AP
4208 char buf[NOTIFY_BUFFER_MAX+1];
4209 char *p = NULL;
4210 struct iovec iovec = {
4211 .iov_base = buf,
4212 .iov_len = sizeof(buf)-1,
4213 };
fb29cdbe
LP
4214 CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred)) +
4215 CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)) control;
9c1e04d0
AP
4216 struct msghdr msghdr = {
4217 .msg_iov = &iovec,
4218 .msg_iovlen = 1,
4219 .msg_control = &control,
4220 .msg_controllen = sizeof(control),
4221 };
371d72e0 4222 struct ucred *ucred;
9c1e04d0
AP
4223 ssize_t n;
4224 pid_t inner_child_pid;
4225 _cleanup_strv_free_ char **tags = NULL;
4bf4f50f 4226 int r;
9c1e04d0
AP
4227
4228 assert(userdata);
4229
4230 inner_child_pid = PTR_TO_PID(userdata);
4231
4232 if (revents != EPOLLIN) {
4233 log_warning("Got unexpected poll event for notify fd.");
4234 return 0;
4235 }
4236
3691bcf3 4237 n = recvmsg_safe(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
8add30a0
YW
4238 if (n < 0) {
4239 if (ERRNO_IS_TRANSIENT(n))
4240 return 0;
4241 if (n == -EXFULL) {
4242 log_warning("Got message with truncated control data (too many fds sent?), ignoring.");
4243 return 0;
4244 }
3691bcf3 4245 return log_warning_errno(n, "Couldn't read notification socket: %m");
8add30a0 4246 }
9c1e04d0 4247
9c1e04d0
AP
4248 cmsg_close_all(&msghdr);
4249
371d72e0 4250 ucred = CMSG_FIND_DATA(&msghdr, SOL_SOCKET, SCM_CREDENTIALS, struct ucred);
9c1e04d0 4251 if (!ucred || ucred->pid != inner_child_pid) {
8cb57430 4252 log_debug("Received notify message without valid credentials. Ignoring.");
9c1e04d0
AP
4253 return 0;
4254 }
4255
4256 if ((size_t) n >= sizeof(buf)) {
4257 log_warning("Received notify message exceeded maximum size. Ignoring.");
4258 return 0;
4259 }
4260
4261 buf[n] = 0;
4262 tags = strv_split(buf, "\n\r");
4263 if (!tags)
4264 return log_oom();
4265
d29cc4d6 4266 if (strv_contains(tags, "READY=1")) {
d4341b76 4267 r = sd_notify(false, "READY=1\n");
4bf4f50f
ZJS
4268 if (r < 0)
4269 log_warning_errno(r, "Failed to send readiness notification, ignoring: %m");
4270 }
9c1e04d0
AP
4271
4272 p = strv_find_startswith(tags, "STATUS=");
4273 if (p)
04f590a4 4274 (void) sd_notifyf(false, "STATUS=Container running: %s", p);
9c1e04d0
AP
4275
4276 return 0;
4277}
4278
e96ceaba 4279static int setup_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid, sd_event_source **notify_event_source) {
9c1e04d0 4280 int r;
9c1e04d0 4281
5773024d 4282 r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
9c1e04d0
AP
4283 if (r < 0)
4284 return log_error_errno(r, "Failed to allocate notify event source: %m");
4285
5773024d 4286 (void) sd_event_source_set_description(*notify_event_source, "nspawn-notify");
9c1e04d0
AP
4287
4288 return 0;
4289}
4290
5d961407
LP
4291static int merge_settings(Settings *settings, const char *path) {
4292 int rl;
f757855e 4293
5d961407
LP
4294 assert(settings);
4295 assert(path);
f757855e 4296
5d961407
LP
4297 /* Copy over bits from the settings, unless they have been explicitly masked by command line switches. Note
4298 * that this steals the fields of the Settings* structure, and hence modifies it. */
f757855e 4299
7732f92b
LP
4300 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
4301 settings->start_mode >= 0) {
4302 arg_start_mode = settings->start_mode;
130d3d22 4303 strv_free_and_replace(arg_parameters, settings->parameters);
f757855e
LP
4304 }
4305
d3689b94
LP
4306 if ((arg_settings_mask & SETTING_EPHEMERAL) == 0 &&
4307 settings->ephemeral >= 0)
a2f577fc
JL
4308 arg_ephemeral = settings->ephemeral;
4309
de40a303
LP
4310 if ((arg_settings_mask & SETTING_DIRECTORY) == 0 &&
4311 settings->root) {
4312
4313 if (!arg_settings_trusted)
4314 log_warning("Ignoring root directory setting, file %s is not trusted.", path);
4315 else
4316 free_and_replace(arg_directory, settings->root);
4317 }
4318
b53ede69
PW
4319 if ((arg_settings_mask & SETTING_PIVOT_ROOT) == 0 &&
4320 settings->pivot_root_new) {
4321 free_and_replace(arg_pivot_root_new, settings->pivot_root_new);
4322 free_and_replace(arg_pivot_root_old, settings->pivot_root_old);
4323 }
4324
5f932eb9 4325 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
1cc6c93a
YW
4326 settings->working_directory)
4327 free_and_replace(arg_chdir, settings->working_directory);
5f932eb9 4328
f757855e 4329 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
130d3d22
YW
4330 settings->environment)
4331 strv_free_and_replace(arg_setenv, settings->environment);
f757855e 4332
de40a303
LP
4333 if ((arg_settings_mask & SETTING_USER) == 0) {
4334
4335 if (settings->user)
4336 free_and_replace(arg_user, settings->user);
4337
4338 if (uid_is_valid(settings->uid))
4339 arg_uid = settings->uid;
4340 if (gid_is_valid(settings->gid))
4341 arg_gid = settings->gid;
4342 if (settings->n_supplementary_gids > 0) {
4343 free_and_replace(arg_supplementary_gids, settings->supplementary_gids);
4344 arg_n_supplementary_gids = settings->n_supplementary_gids;
4345 }
4346 }
f757855e
LP
4347
4348 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
a3fc6b55 4349 uint64_t plus, minus;
7be830c6 4350 uint64_t network_minus = 0;
88fc9c9b 4351 uint64_t ambient;
f757855e 4352
de40a303
LP
4353 /* Note that we copy both the simple plus/minus caps here, and the full quintet from the
4354 * Settings structure */
4355
0e265674 4356 plus = settings->capability;
a3fc6b55
LP
4357 minus = settings->drop_capability;
4358
9baa294c
LP
4359 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
4360 settings_network_configured(settings)) {
a3fc6b55
LP
4361 if (settings_private_network(settings))
4362 plus |= UINT64_C(1) << CAP_NET_ADMIN;
4363 else
7be830c6 4364 network_minus |= UINT64_C(1) << CAP_NET_ADMIN;
a3fc6b55 4365 }
0e265674
LP
4366
4367 if (!arg_settings_trusted && plus != 0) {
4368 if (settings->capability != 0)
5d961407 4369 log_warning("Ignoring Capability= setting, file %s is not trusted.", path);
7be830c6
TH
4370 } else {
4371 arg_caps_retain &= ~network_minus;
520e0d54 4372 arg_caps_retain |= plus;
7be830c6 4373 }
f757855e 4374
a3fc6b55 4375 arg_caps_retain &= ~minus;
de40a303
LP
4376
4377 /* Copy the full capabilities over too */
4378 if (capability_quintet_is_set(&settings->full_capabilities)) {
4379 if (!arg_settings_trusted)
5238e957 4380 log_warning("Ignoring capability settings, file %s is not trusted.", path);
de40a303
LP
4381 else
4382 arg_full_capabilities = settings->full_capabilities;
4383 }
88fc9c9b
TH
4384
4385 ambient = settings->ambient_capability;
4386 if (!arg_settings_trusted && ambient != 0)
4387 log_warning("Ignoring AmbientCapability= setting, file %s is not trusted.", path);
4388 else
4389 arg_caps_ambient |= ambient;
f757855e
LP
4390 }
4391
4392 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
4393 settings->kill_signal > 0)
4394 arg_kill_signal = settings->kill_signal;
4395
4396 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
4397 settings->personality != PERSONALITY_INVALID)
4398 arg_personality = settings->personality;
4399
4400 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
4401 !sd_id128_is_null(settings->machine_id)) {
4402
4403 if (!arg_settings_trusted)
5d961407 4404 log_warning("Ignoring MachineID= setting, file %s is not trusted.", path);
f757855e
LP
4405 else
4406 arg_uuid = settings->machine_id;
4407 }
4408
4409 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
4410 settings->read_only >= 0)
4411 arg_read_only = settings->read_only;
4412
4413 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
4414 settings->volatile_mode != _VOLATILE_MODE_INVALID)
4415 arg_volatile_mode = settings->volatile_mode;
4416
4417 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
4418 settings->n_custom_mounts > 0) {
4419
4420 if (!arg_settings_trusted)
5d961407 4421 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", path);
f757855e
LP
4422 else {
4423 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
1cc6c93a 4424 arg_custom_mounts = TAKE_PTR(settings->custom_mounts);
f757855e 4425 arg_n_custom_mounts = settings->n_custom_mounts;
f757855e
LP
4426 settings->n_custom_mounts = 0;
4427 }
4428 }
4429
4430 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
a1dfd585 4431 settings_network_configured(settings)) {
f757855e
LP
4432
4433 if (!arg_settings_trusted)
5d961407 4434 log_warning("Ignoring network settings, file %s is not trusted.", path);
f757855e 4435 else {
f6d6bad1 4436 arg_network_veth = settings_network_veth(settings);
0e265674
LP
4437 arg_private_network = settings_private_network(settings);
4438
130d3d22
YW
4439 strv_free_and_replace(arg_network_interfaces, settings->network_interfaces);
4440 strv_free_and_replace(arg_network_macvlan, settings->network_macvlan);
4441 strv_free_and_replace(arg_network_ipvlan, settings->network_ipvlan);
4442 strv_free_and_replace(arg_network_veth_extra, settings->network_veth_extra);
f6d6bad1 4443
1cc6c93a
YW
4444 free_and_replace(arg_network_bridge, settings->network_bridge);
4445 free_and_replace(arg_network_zone, settings->network_zone);
de40a303
LP
4446
4447 free_and_replace(arg_network_namespace_path, settings->network_namespace_path);
f757855e
LP
4448 }
4449 }
4450
4451 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
4452 settings->expose_ports) {
4453
4454 if (!arg_settings_trusted)
5d961407 4455 log_warning("Ignoring Port= setting, file %s is not trusted.", path);
f757855e
LP
4456 else {
4457 expose_port_free_all(arg_expose_ports);
1cc6c93a 4458 arg_expose_ports = TAKE_PTR(settings->expose_ports);
f757855e
LP
4459 }
4460 }
4461
0de7acce
LP
4462 if ((arg_settings_mask & SETTING_USERNS) == 0 &&
4463 settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
4464
4465 if (!arg_settings_trusted)
5d961407 4466 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", path);
0de7acce
LP
4467 else {
4468 arg_userns_mode = settings->userns_mode;
4469 arg_uid_shift = settings->uid_shift;
4470 arg_uid_range = settings->uid_range;
6c045a99 4471 arg_userns_ownership = settings->userns_ownership;
0de7acce
LP
4472 }
4473 }
4474
0cc3c9f9
LP
4475 if ((arg_settings_mask & SETTING_BIND_USER) == 0 &&
4476 !strv_isempty(settings->bind_user))
2f893044
LP
4477 strv_free_and_replace(arg_bind_user, settings->bind_user);
4478
d3689b94
LP
4479 if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0 &&
4480 settings->notify_ready >= 0)
9c1e04d0
AP
4481 arg_notify_ready = settings->notify_ready;
4482
960e4569
LP
4483 if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) {
4484
2d09ea44
LP
4485 if (!strv_isempty(settings->syscall_allow_list) || !strv_isempty(settings->syscall_deny_list)) {
4486 if (!arg_settings_trusted && !strv_isempty(settings->syscall_allow_list))
4487 log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", path);
4488 else {
4489 strv_free_and_replace(arg_syscall_allow_list, settings->syscall_allow_list);
4490 strv_free_and_replace(arg_syscall_deny_list, settings->syscall_deny_list);
4491 }
960e4569 4492 }
de40a303
LP
4493
4494#if HAVE_SECCOMP
2d09ea44
LP
4495 if (settings->seccomp) {
4496 if (!arg_settings_trusted)
4497 log_warning("Ignoring SECCOMP filter, file %s is not trusted.", path);
4498 else {
4499 seccomp_release(arg_seccomp);
4500 arg_seccomp = TAKE_PTR(settings->seccomp);
4501 }
de40a303
LP
4502 }
4503#endif
960e4569
LP
4504 }
4505
bf428efb
LP
4506 for (rl = 0; rl < _RLIMIT_MAX; rl ++) {
4507 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)))
4508 continue;
4509
4510 if (!settings->rlimit[rl])
4511 continue;
4512
4513 if (!arg_settings_trusted) {
5d961407 4514 log_warning("Ignoring Limit%s= setting, file '%s' is not trusted.", rlimit_to_string(rl), path);
bf428efb
LP
4515 continue;
4516 }
4517
4518 free_and_replace(arg_rlimit[rl], settings->rlimit[rl]);
4519 }
4520
3a9530e5
LP
4521 if ((arg_settings_mask & SETTING_HOSTNAME) == 0 &&
4522 settings->hostname)
4523 free_and_replace(arg_hostname, settings->hostname);
4524
66edd963
LP
4525 if ((arg_settings_mask & SETTING_NO_NEW_PRIVILEGES) == 0 &&
4526 settings->no_new_privileges >= 0)
4527 arg_no_new_privileges = settings->no_new_privileges;
4528
81f345df
LP
4529 if ((arg_settings_mask & SETTING_OOM_SCORE_ADJUST) == 0 &&
4530 settings->oom_score_adjust_set) {
4531
4532 if (!arg_settings_trusted)
5d961407 4533 log_warning("Ignoring OOMScoreAdjust= setting, file '%s' is not trusted.", path);
81f345df
LP
4534 else {
4535 arg_oom_score_adjust = settings->oom_score_adjust;
4536 arg_oom_score_adjust_set = true;
4537 }
4538 }
4539
d107bb7d 4540 if ((arg_settings_mask & SETTING_CPU_AFFINITY) == 0 &&
0985c7c4 4541 settings->cpu_set.set) {
d107bb7d
LP
4542
4543 if (!arg_settings_trusted)
5d961407 4544 log_warning("Ignoring CPUAffinity= setting, file '%s' is not trusted.", path);
d107bb7d 4545 else {
0985c7c4
ZJS
4546 cpu_set_reset(&arg_cpu_set);
4547 arg_cpu_set = settings->cpu_set;
4548 settings->cpu_set = (CPUSet) {};
d107bb7d
LP
4549 }
4550 }
4551
09d423e9
LP
4552 if ((arg_settings_mask & SETTING_RESOLV_CONF) == 0 &&
4553 settings->resolv_conf != _RESOLV_CONF_MODE_INVALID)
4554 arg_resolv_conf = settings->resolv_conf;
4555
4e1d6aa9
LP
4556 if ((arg_settings_mask & SETTING_LINK_JOURNAL) == 0 &&
4557 settings->link_journal != _LINK_JOURNAL_INVALID) {
4558
4559 if (!arg_settings_trusted)
4560 log_warning("Ignoring journal link setting, file '%s' is not trusted.", path);
4561 else {
4562 arg_link_journal = settings->link_journal;
4563 arg_link_journal_try = settings->link_journal_try;
4564 }
4565 }
4566
1688841f
LP
4567 if ((arg_settings_mask & SETTING_TIMEZONE) == 0 &&
4568 settings->timezone != _TIMEZONE_MODE_INVALID)
4569 arg_timezone = settings->timezone;
4570
de40a303
LP
4571 if ((arg_settings_mask & SETTING_SLICE) == 0 &&
4572 settings->slice) {
4573
4574 if (!arg_settings_trusted)
4575 log_warning("Ignoring slice setting, file '%s' is not trusted.", path);
4576 else
4577 free_and_replace(arg_slice, settings->slice);
4578 }
4579
4580 if ((arg_settings_mask & SETTING_USE_CGNS) == 0 &&
4581 settings->use_cgns >= 0) {
4582
4583 if (!arg_settings_trusted)
4584 log_warning("Ignoring cgroup namespace setting, file '%s' is not trusted.", path);
4585 else
4586 arg_use_cgns = settings->use_cgns;
4587 }
4588
4589 if ((arg_settings_mask & SETTING_CLONE_NS_FLAGS) == 0 &&
f5fbe71d 4590 settings->clone_ns_flags != ULONG_MAX) {
de40a303
LP
4591
4592 if (!arg_settings_trusted)
4593 log_warning("Ignoring namespace setting, file '%s' is not trusted.", path);
4594 else
4595 arg_clone_ns_flags = settings->clone_ns_flags;
4596 }
4597
4598 if ((arg_settings_mask & SETTING_CONSOLE_MODE) == 0 &&
4599 settings->console_mode >= 0) {
4600
4601 if (!arg_settings_trusted)
4602 log_warning("Ignoring console mode setting, file '%s' is not trusted.", path);
4603 else
4604 arg_console_mode = settings->console_mode;
4605 }
4606
d3689b94
LP
4607 if ((arg_settings_mask & SETTING_SUPPRESS_SYNC) == 0 &&
4608 settings->suppress_sync >= 0)
4a4654e0
LP
4609 arg_suppress_sync = settings->suppress_sync;
4610
de40a303
LP
4611 /* The following properties can only be set through the OCI settings logic, not from the command line, hence we
4612 * don't consult arg_settings_mask for them. */
4613
4614 sd_bus_message_unref(arg_property_message);
4615 arg_property_message = TAKE_PTR(settings->properties);
4616
4617 arg_console_width = settings->console_width;
4618 arg_console_height = settings->console_height;
4619
b2645747 4620 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
de40a303
LP
4621 arg_extra_nodes = TAKE_PTR(settings->extra_nodes);
4622 arg_n_extra_nodes = settings->n_extra_nodes;
4623
f757855e
LP
4624 return 0;
4625}
4626
5d961407
LP
4627static int load_settings(void) {
4628 _cleanup_(settings_freep) Settings *settings = NULL;
4629 _cleanup_fclose_ FILE *f = NULL;
3603f151 4630 _cleanup_free_ char *p = NULL;
5d961407
LP
4631 int r;
4632
de40a303
LP
4633 if (arg_oci_bundle)
4634 return 0;
4635
5d961407
LP
4636 /* If all settings are masked, there's no point in looking for
4637 * the settings file */
d7a0f1f4 4638 if (FLAGS_SET(arg_settings_mask, _SETTINGS_MASK_ALL))
5d961407
LP
4639 return 0;
4640
5d961407
LP
4641 /* We first look in the admin's directories in /etc and /run */
4642 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
4643 _cleanup_free_ char *j = NULL;
4644
3603f151 4645 j = path_join(i, arg_settings_filename);
5d961407
LP
4646 if (!j)
4647 return log_oom();
4648
4649 f = fopen(j, "re");
4650 if (f) {
4651 p = TAKE_PTR(j);
4652
4653 /* By default, we trust configuration from /etc and /run */
4654 if (arg_settings_trusted < 0)
4655 arg_settings_trusted = true;
4656
4657 break;
4658 }
4659
4660 if (errno != ENOENT)
4661 return log_error_errno(errno, "Failed to open %s: %m", j);
4662 }
4663
4664 if (!f) {
4665 /* After that, let's look for a file next to the
4666 * actual image we shall boot. */
4667
4668 if (arg_image) {
3603f151 4669 p = file_in_same_dir(arg_image, arg_settings_filename);
5d961407
LP
4670 if (!p)
4671 return log_oom();
cd6e3914 4672 } else if (arg_directory && !path_equal(arg_directory, "/")) {
3603f151 4673 p = file_in_same_dir(arg_directory, arg_settings_filename);
5d961407
LP
4674 if (!p)
4675 return log_oom();
4676 }
4677
4678 if (p) {
4679 f = fopen(p, "re");
4680 if (!f && errno != ENOENT)
4681 return log_error_errno(errno, "Failed to open %s: %m", p);
4682
4683 /* By default, we do not trust configuration from /var/lib/machines */
4684 if (arg_settings_trusted < 0)
4685 arg_settings_trusted = false;
4686 }
4687 }
4688
4689 if (!f)
4690 return 0;
4691
4692 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
4693
4694 r = settings_load(f, p, &settings);
4695 if (r < 0)
4696 return r;
4697
4698 return merge_settings(settings, p);
4699}
4700
de40a303
LP
4701static int load_oci_bundle(void) {
4702 _cleanup_(settings_freep) Settings *settings = NULL;
4703 int r;
4704
4705 if (!arg_oci_bundle)
4706 return 0;
4707
4708 /* By default let's trust OCI bundles */
4709 if (arg_settings_trusted < 0)
4710 arg_settings_trusted = true;
4711
4712 r = oci_load(NULL, arg_oci_bundle, &settings);
4713 if (r < 0)
4714 return r;
4715
4716 return merge_settings(settings, arg_oci_bundle);
4717}
4718
3acc84eb 4719static int run_container(
2d845785 4720 DissectedImage *dissected_image,
b0067625
ZJS
4721 bool secondary,
4722 FDSet *fds,
4723 char veth_name[IFNAMSIZ], bool *veth_created,
761cf19d 4724 struct ExposeArgs *expose_args,
3acc84eb 4725 int *master, pid_t *pid, int *ret) {
b0067625
ZJS
4726
4727 static const struct sigaction sa = {
4728 .sa_handler = nop_signal_handler,
e28c7cd0 4729 .sa_flags = SA_NOCLDSTOP|SA_RESTART,
b0067625
ZJS
4730 };
4731
8e766630 4732 _cleanup_(release_lock_file) LockFile uid_shift_lock = LOCK_FILE_INIT;
b0067625
ZJS
4733 _cleanup_close_ int etc_passwd_lock = -1;
4734 _cleanup_close_pair_ int
4735 kmsg_socket_pair[2] = { -1, -1 },
4736 rtnl_socket_pair[2] = { -1, -1 },
4737 pid_socket_pair[2] = { -1, -1 },
4738 uuid_socket_pair[2] = { -1, -1 },
4739 notify_socket_pair[2] = { -1, -1 },
8199d554 4740 uid_shift_socket_pair[2] = { -1, -1 },
3acc84eb 4741 master_pty_socket_pair[2] = { -1, -1 },
8199d554
LP
4742 unified_cgroup_hierarchy_socket_pair[2] = { -1, -1};
4743
3acc84eb 4744 _cleanup_close_ int notify_socket = -1;
b0067625 4745 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
5773024d 4746 _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL;
b0067625
ZJS
4747 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
4748 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4749 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
abdb9b08 4750 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
2f893044
LP
4751 _cleanup_free_ uid_t *bind_user_uid = NULL;
4752 size_t n_bind_user_uid = 0;
b0067625 4753 ContainerStatus container_status = 0;
b0067625
ZJS
4754 int ifi = 0, r;
4755 ssize_t l;
4756 sigset_t mask_chld;
5b4855ab 4757 _cleanup_close_ int child_netns_fd = -1;
b0067625
ZJS
4758
4759 assert_se(sigemptyset(&mask_chld) == 0);
4760 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
4761
4762 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4763 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
4764 * check with getpwuid() if the specific user already exists. Note that /etc might be
4765 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
4766 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
4767 * really just an extra safety net. We kinda assume that the UID range we allocate from is
4768 * really ours. */
4769
4770 etc_passwd_lock = take_etc_passwd_lock(NULL);
4771 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS)
4772 return log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m");
4773 }
4774
4775 r = barrier_create(&barrier);
4776 if (r < 0)
4777 return log_error_errno(r, "Cannot initialize IPC barrier: %m");
4778
4779 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0)
4780 return log_error_errno(errno, "Failed to create kmsg socket pair: %m");
4781
4782 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0)
4783 return log_error_errno(errno, "Failed to create rtnl socket pair: %m");
4784
4785 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0)
4786 return log_error_errno(errno, "Failed to create pid socket pair: %m");
4787
4788 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0)
4789 return log_error_errno(errno, "Failed to create id socket pair: %m");
4790
4791 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0)
4792 return log_error_errno(errno, "Failed to create notify socket pair: %m");
4793
3acc84eb
FB
4794 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, master_pty_socket_pair) < 0)
4795 return log_error_errno(errno, "Failed to create console socket pair: %m");
4796
b0067625
ZJS
4797 if (arg_userns_mode != USER_NAMESPACE_NO)
4798 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0)
4799 return log_error_errno(errno, "Failed to create uid shift socket pair: %m");
4800
8199d554
LP
4801 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN)
4802 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, unified_cgroup_hierarchy_socket_pair) < 0)
4803 return log_error_errno(errno, "Failed to create unified cgroup socket pair: %m");
4804
b0067625
ZJS
4805 /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
4806 * parent's blocking calls and give it a chance to call wait() and terminate. */
4807 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
4808 if (r < 0)
4809 return log_error_errno(errno, "Failed to change the signal mask: %m");
4810
4811 r = sigaction(SIGCHLD, &sa, NULL);
4812 if (r < 0)
4813 return log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
4814
d7bea6b6 4815 if (arg_network_namespace_path) {
5b4855ab
DDM
4816 child_netns_fd = open(arg_network_namespace_path, O_RDONLY|O_NOCTTY|O_CLOEXEC);
4817 if (child_netns_fd < 0)
d7bea6b6
DP
4818 return log_error_errno(errno, "Cannot open file %s: %m", arg_network_namespace_path);
4819
54c2459d 4820 r = fd_is_ns(child_netns_fd, CLONE_NEWNET);
6619ad88
LP
4821 if (r == -EUCLEAN)
4822 log_debug_errno(r, "Cannot determine if passed network namespace path '%s' really refers to a network namespace, assuming it does.", arg_network_namespace_path);
4823 else if (r < 0)
d7bea6b6 4824 return log_error_errno(r, "Failed to check %s fs type: %m", arg_network_namespace_path);
c6147113
LP
4825 else if (r == 0)
4826 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
4827 "Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path);
d7bea6b6
DP
4828 }
4829
b0067625
ZJS
4830 *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
4831 if (*pid < 0)
4832 return log_error_errno(errno, "clone() failed%s: %m",
4833 errno == EINVAL ?
4834 ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
4835
4836 if (*pid == 0) {
4837 /* The outer child only has a file system namespace. */
4838 barrier_set_role(&barrier, BARRIER_CHILD);
4839
b0067625
ZJS
4840 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
4841 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4842 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
4843 uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
4844 notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
3acc84eb 4845 master_pty_socket_pair[0] = safe_close(master_pty_socket_pair[0]);
b0067625 4846 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
8199d554 4847 unified_cgroup_hierarchy_socket_pair[0] = safe_close(unified_cgroup_hierarchy_socket_pair[0]);
b0067625
ZJS
4848
4849 (void) reset_all_signal_handlers();
4850 (void) reset_signal_mask();
4851
4852 r = outer_child(&barrier,
4853 arg_directory,
2d845785 4854 dissected_image,
b0067625
ZJS
4855 secondary,
4856 pid_socket_pair[1],
4857 uuid_socket_pair[1],
4858 notify_socket_pair[1],
4859 kmsg_socket_pair[1],
4860 rtnl_socket_pair[1],
4861 uid_shift_socket_pair[1],
3acc84eb 4862 master_pty_socket_pair[1],
8199d554 4863 unified_cgroup_hierarchy_socket_pair[1],
d7bea6b6 4864 fds,
5b4855ab 4865 child_netns_fd);
b0067625
ZJS
4866 if (r < 0)
4867 _exit(EXIT_FAILURE);
4868
4869 _exit(EXIT_SUCCESS);
4870 }
4871
4872 barrier_set_role(&barrier, BARRIER_PARENT);
4873
e4077ff6 4874 fdset_close(fds);
b0067625
ZJS
4875
4876 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4877 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4878 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
4879 uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
4880 notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
3acc84eb 4881 master_pty_socket_pair[1] = safe_close(master_pty_socket_pair[1]);
b0067625 4882 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
8199d554 4883 unified_cgroup_hierarchy_socket_pair[1] = safe_close(unified_cgroup_hierarchy_socket_pair[1]);
b0067625
ZJS
4884
4885 if (arg_userns_mode != USER_NAMESPACE_NO) {
4886 /* The child just let us know the UID shift it might have read from the image. */
4887 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
4888 if (l < 0)
4889 return log_error_errno(errno, "Failed to read UID shift: %m");
c6147113
LP
4890 if (l != sizeof arg_uid_shift)
4891 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading UID shift.");
b0067625
ZJS
4892
4893 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4894 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
4895 * image, but if that's already in use, pick a new one, and report back to the child,
4896 * which one we now picked. */
4897
4898 r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
4899 if (r < 0)
4900 return log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
4901
4902 l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
4903 if (l < 0)
4904 return log_error_errno(errno, "Failed to send UID shift: %m");
c6147113
LP
4905 if (l != sizeof arg_uid_shift)
4906 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short write while writing UID shift.");
b0067625 4907 }
2f893044
LP
4908
4909 n_bind_user_uid = strv_length(arg_bind_user);
4910 if (n_bind_user_uid > 0) {
4911 /* Right after the UID shift, we'll receive the list of UID mappings for the
4912 * --bind-user= logic. Always a quadruplet of payload and host UID + GID. */
4913
4914 bind_user_uid = new(uid_t, n_bind_user_uid*4);
4915 if (!bind_user_uid)
4916 return log_oom();
4917
4918 for (size_t i = 0; i < n_bind_user_uid; i++) {
4919 l = recv(uid_shift_socket_pair[0], bind_user_uid + i*4, sizeof(uid_t)*4, 0);
4920 if (l < 0)
4921 return log_error_errno(errno, "Failed to read user UID map pair: %m");
4922 if (l != sizeof(uid_t)*4)
4923 return log_full_errno(l == 0 ? LOG_DEBUG : LOG_WARNING,
4924 SYNTHETIC_ERRNO(EIO),
4925 "Short read while reading bind user UID pairs.");
4926 }
4927 }
b0067625
ZJS
4928 }
4929
8199d554
LP
4930 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
4931 /* The child let us know the support cgroup mode it might have read from the image. */
4932 l = recv(unified_cgroup_hierarchy_socket_pair[0], &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), 0);
4933 if (l < 0)
4934 return log_error_errno(errno, "Failed to read cgroup mode: %m");
c6147113
LP
4935 if (l != sizeof(arg_unified_cgroup_hierarchy))
4936 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading cgroup mode (%zu bytes).%s",
4937 l, l == 0 ? " The child is most likely dead." : "");
8199d554
LP
4938 }
4939
b0067625 4940 /* Wait for the outer child. */
d2e0ac3d
LP
4941 r = wait_for_terminate_and_check("(sd-namespace)", *pid, WAIT_LOG_ABNORMAL);
4942 if (r < 0)
4943 return r;
4944 if (r != EXIT_SUCCESS)
4945 return -EIO;
b0067625
ZJS
4946
4947 /* And now retrieve the PID of the inner child. */
4948 l = recv(pid_socket_pair[0], pid, sizeof *pid, 0);
4949 if (l < 0)
4950 return log_error_errno(errno, "Failed to read inner child PID: %m");
c6147113
LP
4951 if (l != sizeof *pid)
4952 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading inner child PID.");
b0067625
ZJS
4953
4954 /* We also retrieve container UUID in case it was generated by outer child */
4955 l = recv(uuid_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
4956 if (l < 0)
4957 return log_error_errno(errno, "Failed to read container machine ID: %m");
c6147113
LP
4958 if (l != sizeof(arg_uuid))
4959 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading container machined ID.");
b0067625
ZJS
4960
4961 /* We also retrieve the socket used for notifications generated by outer child */
4962 notify_socket = receive_one_fd(notify_socket_pair[0], 0);
4963 if (notify_socket < 0)
4964 return log_error_errno(notify_socket,
4965 "Failed to receive notification socket from the outer child: %m");
4966
4967 log_debug("Init process invoked as PID "PID_FMT, *pid);
4968
4969 if (arg_userns_mode != USER_NAMESPACE_NO) {
c6147113
LP
4970 if (!barrier_place_and_sync(&barrier)) /* #1 */
4971 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
b0067625 4972
2f893044 4973 r = setup_uid_map(*pid, bind_user_uid, n_bind_user_uid);
b0067625
ZJS
4974 if (r < 0)
4975 return r;
4976
4977 (void) barrier_place(&barrier); /* #2 */
4978 }
4979
4980 if (arg_private_network) {
75116558
PS
4981 if (!arg_network_namespace_path) {
4982 /* Wait until the child has unshared its network namespace. */
c6147113
LP
4983 if (!barrier_place_and_sync(&barrier)) /* #3 */
4984 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early");
75116558
PS
4985 }
4986
5b4855ab
DDM
4987 if (child_netns_fd < 0) {
4988 /* Make sure we have an open file descriptor to the child's network
4989 * namespace so it stays alive even if the child exits. */
4990 r = namespace_open(*pid, NULL, NULL, &child_netns_fd, NULL, NULL);
4991 if (r < 0)
4992 return log_error_errno(r, "Failed to open child network namespace: %m");
4993 }
4994
4995 r = move_network_interfaces(child_netns_fd, arg_network_interfaces);
b0067625
ZJS
4996 if (r < 0)
4997 return r;
4998
4999 if (arg_network_veth) {
5000 r = setup_veth(arg_machine, *pid, veth_name,
5001 arg_network_bridge || arg_network_zone);
5002 if (r < 0)
5003 return r;
5004 else if (r > 0)
5005 ifi = r;
5006
5007 if (arg_network_bridge) {
5008 /* Add the interface to a bridge */
5009 r = setup_bridge(veth_name, arg_network_bridge, false);
5010 if (r < 0)
5011 return r;
5012 if (r > 0)
5013 ifi = r;
5014 } else if (arg_network_zone) {
5015 /* Add the interface to a bridge, possibly creating it */
5016 r = setup_bridge(veth_name, arg_network_zone, true);
5017 if (r < 0)
5018 return r;
5019 if (r > 0)
5020 ifi = r;
5021 }
5022 }
5023
5024 r = setup_veth_extra(arg_machine, *pid, arg_network_veth_extra);
5025 if (r < 0)
5026 return r;
5027
5028 /* We created the primary and extra veth links now; let's remember this, so that we know to
5029 remove them later on. Note that we don't bother with removing veth links that were created
5030 here when their setup failed half-way, because in that case the kernel should be able to
5031 remove them on its own, since they cannot be referenced by anything yet. */
5032 *veth_created = true;
5033
5034 r = setup_macvlan(arg_machine, *pid, arg_network_macvlan);
5035 if (r < 0)
5036 return r;
5037
5038 r = setup_ipvlan(arg_machine, *pid, arg_network_ipvlan);
5039 if (r < 0)
5040 return r;
5041 }
5042
abdb9b08
LP
5043 if (arg_register || !arg_keep_unit) {
5044 r = sd_bus_default_system(&bus);
5045 if (r < 0)
5046 return log_error_errno(r, "Failed to open system bus: %m");
e5a2d8b5
LP
5047
5048 r = sd_bus_set_close_on_exit(bus, false);
5049 if (r < 0)
5050 return log_error_errno(r, "Failed to disable close-on-exit behaviour: %m");
abdb9b08
LP
5051 }
5052
5053 if (!arg_keep_unit) {
5054 /* When a new scope is created for this container, then we'll be registered as its controller, in which
5055 * case PID 1 will send us a friendly RequestStop signal, when it is asked to terminate the
5056 * scope. Let's hook into that, and cleanly shut down the container, and print a friendly message. */
5057
75152a4d
LP
5058 r = sd_bus_match_signal_async(
5059 bus,
5060 NULL,
5061 "org.freedesktop.systemd1",
5062 NULL,
5063 "org.freedesktop.systemd1.Scope",
5064 "RequestStop",
5065 on_request_stop, NULL, PID_TO_PTR(*pid));
abdb9b08 5066 if (r < 0)
75152a4d 5067 return log_error_errno(r, "Failed to request RequestStop match: %m");
abdb9b08
LP
5068 }
5069
b0067625
ZJS
5070 if (arg_register) {
5071 r = register_machine(
abdb9b08 5072 bus,
b0067625
ZJS
5073 arg_machine,
5074 *pid,
5075 arg_directory,
5076 arg_uuid,
5077 ifi,
5078 arg_slice,
5079 arg_custom_mounts, arg_n_custom_mounts,
5080 arg_kill_signal,
5081 arg_property,
de40a303 5082 arg_property_message,
b0067625
ZJS
5083 arg_keep_unit,
5084 arg_container_service_name);
5085 if (r < 0)
5086 return r;
abdb9b08 5087
cd2dfc6f
LP
5088 } else if (!arg_keep_unit) {
5089 r = allocate_scope(
abdb9b08 5090 bus,
cd2dfc6f
LP
5091 arg_machine,
5092 *pid,
5093 arg_slice,
5094 arg_custom_mounts, arg_n_custom_mounts,
5095 arg_kill_signal,
de40a303
LP
5096 arg_property,
5097 arg_property_message);
cd2dfc6f
LP
5098 if (r < 0)
5099 return r;
5100
5101 } else if (arg_slice || arg_property)
5102 log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect.");
b0067625 5103
27da7ef0 5104 r = create_subcgroup(*pid, arg_keep_unit, arg_unified_cgroup_hierarchy);
b0067625
ZJS
5105 if (r < 0)
5106 return r;
5107
27da7ef0 5108 r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
720f0a2f
LP
5109 if (r < 0)
5110 return r;
b0067625 5111
de54e02d 5112 r = chown_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
b0067625
ZJS
5113 if (r < 0)
5114 return r;
5115
5116 /* Notify the child that the parent is ready with all
5117 * its setup (including cgroup-ification), and that
5118 * the child can now hand over control to the code to
5119 * run inside the container. */
75116558 5120 (void) barrier_place(&barrier); /* #4 */
b0067625
ZJS
5121
5122 /* Block SIGCHLD here, before notifying child.
5123 * process_pty() will handle it with the other signals. */
5124 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
5125
5126 /* Reset signal to default */
9c274488 5127 r = default_signals(SIGCHLD);
b0067625
ZJS
5128 if (r < 0)
5129 return log_error_errno(r, "Failed to reset SIGCHLD: %m");
5130
5131 r = sd_event_new(&event);
5132 if (r < 0)
5133 return log_error_errno(r, "Failed to get default event source: %m");
5134
8fd010bb
LP
5135 (void) sd_event_set_watchdog(event, true);
5136
abdb9b08
LP
5137 if (bus) {
5138 r = sd_bus_attach_event(bus, event, 0);
5139 if (r < 0)
5140 return log_error_errno(r, "Failed to attach bus to event loop: %m");
5141 }
5142
e96ceaba 5143 r = setup_notify_parent(event, notify_socket, PID_TO_PTR(*pid), &notify_event_source);
b0067625
ZJS
5144 if (r < 0)
5145 return r;
5146
5147 /* Let the child know that we are ready and wait that the child is completely ready now. */
c6147113
LP
5148 if (!barrier_place_and_sync(&barrier)) /* #5 */
5149 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
b0067625 5150
38ccb557 5151 /* At this point we have made use of the UID we picked, and thus nss-systemd/systemd-machined.service
b0067625
ZJS
5152 * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
5153 etc_passwd_lock = safe_close(etc_passwd_lock);
5154
04f590a4
LP
5155 (void) sd_notifyf(false,
5156 "STATUS=Container running.\n"
5157 "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
4bf4f50f
ZJS
5158 if (!arg_notify_ready) {
5159 r = sd_notify(false, "READY=1\n");
5160 if (r < 0)
5161 log_warning_errno(r, "Failed to send readiness notification, ignoring: %m");
5162 }
b0067625
ZJS
5163
5164 if (arg_kill_signal > 0) {
5165 /* Try to kill the init system on SIGINT or SIGTERM */
919f5ae0
LP
5166 (void) sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
5167 (void) sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
b0067625
ZJS
5168 } else {
5169 /* Immediately exit */
919f5ae0
LP
5170 (void) sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
5171 (void) sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
b0067625
ZJS
5172 }
5173
6916b164 5174 /* Exit when the child exits */
919f5ae0 5175 (void) sd_event_add_signal(event, NULL, SIGCHLD, on_sigchld, PID_TO_PTR(*pid));
b0067625
ZJS
5176
5177 if (arg_expose_ports) {
761cf19d 5178 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, expose_args, &rtnl);
b0067625
ZJS
5179 if (r < 0)
5180 return r;
5181
deff68e7
FW
5182 (void) expose_port_execute(rtnl, &expose_args->fw_ctx, arg_expose_ports, AF_INET, &expose_args->address4);
5183 (void) expose_port_execute(rtnl, &expose_args->fw_ctx, arg_expose_ports, AF_INET6, &expose_args->address6);
b0067625
ZJS
5184 }
5185
5186 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
5187
3acc84eb
FB
5188 if (arg_console_mode != CONSOLE_PIPE) {
5189 _cleanup_close_ int fd = -1;
5190 PTYForwardFlags flags = 0;
de40a303 5191
3acc84eb
FB
5192 /* Retrieve the master pty allocated by inner child */
5193 fd = receive_one_fd(master_pty_socket_pair[0], 0);
5194 if (fd < 0)
5195 return log_error_errno(fd, "Failed to receive master pty from the inner child: %m");
5196
5197 switch (arg_console_mode) {
de40a303 5198
3acc84eb
FB
5199 case CONSOLE_READ_ONLY:
5200 flags |= PTY_FORWARD_READ_ONLY;
5201
5202 _fallthrough_;
5203
5204 case CONSOLE_INTERACTIVE:
5205 flags |= PTY_FORWARD_IGNORE_VHANGUP;
5206
5207 r = pty_forward_new(event, fd, flags, &forward);
5208 if (r < 0)
5209 return log_error_errno(r, "Failed to create PTY forwarder: %m");
5210
f5fbe71d 5211 if (arg_console_width != UINT_MAX || arg_console_height != UINT_MAX)
3acc84eb
FB
5212 (void) pty_forward_set_width_height(forward,
5213 arg_console_width,
5214 arg_console_height);
5215 break;
5216
5217 default:
5218 assert(arg_console_mode == CONSOLE_PASSIVE);
5219 }
5220
5221 *master = TAKE_FD(fd);
de40a303 5222 }
b0067625
ZJS
5223
5224 r = sd_event_loop(event);
5225 if (r < 0)
5226 return log_error_errno(r, "Failed to run event loop: %m");
5227
de40a303
LP
5228 if (forward) {
5229 char last_char = 0;
b0067625 5230
de40a303
LP
5231 (void) pty_forward_get_last_char(forward, &last_char);
5232 forward = pty_forward_free(forward);
b0067625 5233
de40a303
LP
5234 if (!arg_quiet && last_char != '\n')
5235 putc('\n', stdout);
5236 }
b0067625
ZJS
5237
5238 /* Kill if it is not dead yet anyway */
0bb0a9fa
ZJS
5239 if (!arg_register && !arg_keep_unit && bus)
5240 terminate_scope(bus, arg_machine);
b0067625
ZJS
5241
5242 /* Normally redundant, but better safe than sorry */
c67b0082 5243 (void) kill(*pid, SIGKILL);
b0067625 5244
5b4855ab
DDM
5245 if (arg_private_network) {
5246 /* Move network interfaces back to the parent network namespace. We use `safe_fork`
5247 * to avoid having to move the parent to the child network namespace. */
5248 r = safe_fork(NULL, FORK_RESET_SIGNALS|FORK_DEATHSIG|FORK_WAIT|FORK_LOG, NULL);
5249 if (r < 0)
5250 return r;
5251
5252 if (r == 0) {
5253 _cleanup_close_ int parent_netns_fd = -1;
5254
5255 r = namespace_open(getpid(), NULL, NULL, &parent_netns_fd, NULL, NULL);
5256 if (r < 0) {
5257 log_error_errno(r, "Failed to open parent network namespace: %m");
5258 _exit(EXIT_FAILURE);
5259 }
5260
5261 r = namespace_enter(-1, -1, child_netns_fd, -1, -1);
5262 if (r < 0) {
5263 log_error_errno(r, "Failed to enter child network namespace: %m");
5264 _exit(EXIT_FAILURE);
5265 }
5266
5267 r = move_network_interfaces(parent_netns_fd, arg_network_interfaces);
5268 if (r < 0)
5269 log_error_errno(r, "Failed to move network interfaces back to parent network namespace: %m");
5270
5271 _exit(r < 0 ? EXIT_FAILURE : EXIT_SUCCESS);
5272 }
5273 }
5274
8f03de53 5275 r = wait_for_container(TAKE_PID(*pid), &container_status);
b0067625 5276
0bb0a9fa
ZJS
5277 /* Tell machined that we are gone. */
5278 if (bus)
5279 (void) unregister_machine(bus, arg_machine);
5280
b0067625
ZJS
5281 if (r < 0)
5282 /* We failed to wait for the container, or the container exited abnormally. */
5283 return r;
5284 if (r > 0 || container_status == CONTAINER_TERMINATED) {
27e29a1e
ZJS
5285 /* r > 0 → The container exited with a non-zero status.
5286 * As a special case, we need to replace 133 with a different value,
5287 * because 133 is special-cased in the service file to reboot the container.
5288 * otherwise → The container exited with zero status and a reboot was not requested.
5289 */
2a49b612 5290 if (r == EXIT_FORCE_RESTART)
27e29a1e 5291 r = EXIT_FAILURE; /* replace 133 with the general failure code */
b0067625 5292 *ret = r;
b0067625
ZJS
5293 return 0; /* finito */
5294 }
5295
5296 /* CONTAINER_REBOOTED, loop again */
5297
5298 if (arg_keep_unit) {
5299 /* Special handling if we are running as a service: instead of simply
5300 * restarting the machine we want to restart the entire service, so let's
5301 * inform systemd about this with the special exit code 133. The service
5302 * file uses RestartForceExitStatus=133 so that this results in a full
5303 * nspawn restart. This is necessary since we might have cgroup parameters
5304 * set we want to have flushed out. */
2a49b612
ZJS
5305 *ret = EXIT_FORCE_RESTART;
5306 return 0; /* finito */
b0067625
ZJS
5307 }
5308
deff68e7
FW
5309 expose_port_flush(&expose_args->fw_ctx, arg_expose_ports, AF_INET, &expose_args->address4);
5310 expose_port_flush(&expose_args->fw_ctx, arg_expose_ports, AF_INET6, &expose_args->address6);
b0067625
ZJS
5311
5312 (void) remove_veth_links(veth_name, arg_network_veth_extra);
5313 *veth_created = false;
5314 return 1; /* loop again */
5315}
5316
bf428efb 5317static int initialize_rlimits(void) {
852b6250 5318 /* The default resource limits the kernel passes to PID 1, as per kernel 5.16. Let's pass our container payload
bf428efb
LP
5319 * the same values as the kernel originally passed to PID 1, in order to minimize differences between host and
5320 * container execution environments. */
5321
5322 static const struct rlimit kernel_defaults[_RLIMIT_MAX] = {
852b6250
LP
5323 [RLIMIT_AS] = { RLIM_INFINITY, RLIM_INFINITY },
5324 [RLIMIT_CORE] = { 0, RLIM_INFINITY },
5325 [RLIMIT_CPU] = { RLIM_INFINITY, RLIM_INFINITY },
5326 [RLIMIT_DATA] = { RLIM_INFINITY, RLIM_INFINITY },
5327 [RLIMIT_FSIZE] = { RLIM_INFINITY, RLIM_INFINITY },
5328 [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY },
5329 [RLIMIT_MEMLOCK] = { DEFAULT_RLIMIT_MEMLOCK, DEFAULT_RLIMIT_MEMLOCK },
5330 [RLIMIT_MSGQUEUE] = { 819200, 819200 },
5331 [RLIMIT_NICE] = { 0, 0 },
5332 [RLIMIT_NOFILE] = { 1024, 4096 },
5333 [RLIMIT_RSS] = { RLIM_INFINITY, RLIM_INFINITY },
5334 [RLIMIT_RTPRIO] = { 0, 0 },
5335 [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY },
5336 [RLIMIT_STACK] = { 8388608, RLIM_INFINITY },
bf428efb
LP
5337
5338 /* The kernel scales the default for RLIMIT_NPROC and RLIMIT_SIGPENDING based on the system's amount of
5339 * RAM. To provide best compatibility we'll read these limits off PID 1 instead of hardcoding them
5340 * here. This is safe as we know that PID 1 doesn't change these two limits and thus the original
5341 * kernel's initialization should still be valid during runtime — at least if PID 1 is systemd. Note
5342 * that PID 1 changes a number of other resource limits during early initialization which is why we
5343 * don't read the other limits from PID 1 but prefer the static table above. */
5344 };
5345
5346 int rl;
5347
5348 for (rl = 0; rl < _RLIMIT_MAX; rl++) {
bf428efb
LP
5349 /* Let's only fill in what the user hasn't explicitly configured anyway */
5350 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)) == 0) {
5351 const struct rlimit *v;
5352 struct rlimit buffer;
5353
5354 if (IN_SET(rl, RLIMIT_NPROC, RLIMIT_SIGPENDING)) {
5355 /* For these two let's read the limits off PID 1. See above for an explanation. */
5356
5357 if (prlimit(1, rl, NULL, &buffer) < 0)
5358 return log_error_errno(errno, "Failed to read resource limit RLIMIT_%s of PID 1: %m", rlimit_to_string(rl));
5359
dbf1aca6
LP
5360 v = &buffer;
5361 } else if (rl == RLIMIT_NOFILE) {
5362 /* We nowadays bump RLIMIT_NOFILE's hard limit early in PID 1 for all
5363 * userspace. Given that nspawn containers are often run without our PID 1,
5364 * let's grant the containers a raised RLIMIT_NOFILE hard limit by default,
5365 * so that container userspace gets similar resources as host userspace
5366 * gets. */
5367 buffer = kernel_defaults[rl];
5368 buffer.rlim_max = MIN((rlim_t) read_nr_open(), (rlim_t) HIGH_RLIMIT_NOFILE);
bf428efb
LP
5369 v = &buffer;
5370 } else
5371 v = kernel_defaults + rl;
5372
5373 arg_rlimit[rl] = newdup(struct rlimit, v, 1);
5374 if (!arg_rlimit[rl])
5375 return log_oom();
5376 }
5377
5378 if (DEBUG_LOGGING) {
5379 _cleanup_free_ char *k = NULL;
5380
5381 (void) rlimit_format(arg_rlimit[rl], &k);
5382 log_debug("Setting RLIMIT_%s to %s.", rlimit_to_string(rl), k);
5383 }
5384 }
5385
5386 return 0;
5387}
5388
287b7376
LP
5389static int cant_be_in_netns(void) {
5390 union sockaddr_union sa = {
5391 .un = {
5392 .sun_family = AF_UNIX,
5393 .sun_path = "/run/udev/control",
5394 },
5395 };
5396 char udev_path[STRLEN("/proc//ns/net") + DECIMAL_STR_MAX(pid_t)];
5397 _cleanup_free_ char *udev_ns = NULL, *our_ns = NULL;
5398 _cleanup_close_ int fd = -1;
5399 struct ucred ucred;
5400 int r;
5401
5402 /* Check if we are in the same netns as udev. If we aren't, then device monitoring (and thus waiting
5403 * for loopback block devices) won't work, and we will hang. Detect this case and exit early with a
5404 * nice message. */
5405
5406 if (!arg_image) /* only matters if --image= us used, i.e. we actually need to use loopback devices */
5407 return 0;
5408
5409 fd = socket(AF_UNIX, SOCK_SEQPACKET|SOCK_NONBLOCK|SOCK_CLOEXEC, 0);
5410 if (fd < 0)
5411 return log_error_errno(errno, "Failed to allocate udev control socket: %m");
5412
32b9736a 5413 if (connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0) {
287b7376
LP
5414
5415 if (errno == ENOENT || ERRNO_IS_DISCONNECT(errno))
5416 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
5417 "Sorry, but --image= requires access to the host's /run/ hierarchy, since we need access to udev.");
5418
5419 return log_error_errno(errno, "Failed to connect socket to udev control socket: %m");
5420 }
5421
5422 r = getpeercred(fd, &ucred);
5423 if (r < 0)
5424 return log_error_errno(r, "Failed to determine peer of udev control socket: %m");
5425
5426 xsprintf(udev_path, "/proc/" PID_FMT "/ns/net", ucred.pid);
5427 r = readlink_malloc(udev_path, &udev_ns);
5428 if (r < 0)
5429 return log_error_errno(r, "Failed to read network namespace of udev: %m");
5430
5431 r = readlink_malloc("/proc/self/ns/net", &our_ns);
5432 if (r < 0)
5433 return log_error_errno(r, "Failed to read our own network namespace: %m");
5434
5435 if (!streq(our_ns, udev_ns))
5436 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
5437 "Sorry, but --image= is only supported in the main network namespace, since we need access to udev/AF_NETLINK.");
5438 return 0;
5439}
5440
44dbef90 5441static int run(int argc, char *argv[]) {
7bf011e3
LP
5442 bool secondary = false, remove_directory = false, remove_image = false,
5443 veth_created = false, remove_tmprootdir = false;
2d845785 5444 _cleanup_close_ int master = -1;
03cfe0d5 5445 _cleanup_fdset_free_ FDSet *fds = NULL;
2d845785 5446 int r, n_fd_passed, ret = EXIT_SUCCESS;
5aa3eba5 5447 char veth_name[IFNAMSIZ] = "";
761cf19d 5448 struct ExposeArgs expose_args = {};
8e766630 5449 _cleanup_(release_lock_file) LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
c67b0082 5450 char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
2d845785 5451 _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
18b5886e
LP
5452 _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
5453 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
761cf19d 5454 _cleanup_(fw_ctx_freep) FirewallContext *fw_ctx = NULL;
7bf011e3 5455 pid_t pid = 0;
03cfe0d5
LP
5456
5457 log_parse_environment();
5458 log_open();
415fc41c 5459
03cfe0d5
LP
5460 r = parse_argv(argc, argv);
5461 if (r <= 0)
5462 goto finish;
5463
38ee19c0
ZJS
5464 if (geteuid() != 0) {
5465 r = log_warning_errno(SYNTHETIC_ERRNO(EPERM),
5466 argc >= 2 ? "Need to be root." :
5467 "Need to be root (and some arguments are usually required).\nHint: try --help");
03cfe0d5 5468 goto finish;
38ee19c0 5469 }
fba868fa 5470
287b7376
LP
5471 r = cant_be_in_netns();
5472 if (r < 0)
5473 goto finish;
5474
bf428efb
LP
5475 r = initialize_rlimits();
5476 if (r < 0)
5477 goto finish;
5478
de40a303
LP
5479 r = load_oci_bundle();
5480 if (r < 0)
5481 goto finish;
5482
f757855e
LP
5483 r = determine_names();
5484 if (r < 0)
5485 goto finish;
5486
5487 r = load_settings();
5488 if (r < 0)
5489 goto finish;
5490
d4d99bc6 5491 r = cg_unified();
5eee8290
LP
5492 if (r < 0) {
5493 log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
5494 goto finish;
5495 }
5496
f757855e
LP
5497 r = verify_arguments();
5498 if (r < 0)
5499 goto finish;
03cfe0d5 5500
49048684
ZJS
5501 /* Reapply environment settings. */
5502 (void) detect_unified_cgroup_hierarchy_from_environment();
8199d554 5503
2949ff26
LP
5504 /* Ignore SIGPIPE here, because we use splice() on the ptyfwd stuff and that will generate SIGPIPE if
5505 * the result is closed. Note that the container payload child will reset signal mask+handler anyway,
5506 * so just turning this off here means we only turn it off in nspawn itself, not any children. */
9c274488 5507 (void) ignore_signals(SIGPIPE);
2949ff26 5508
03cfe0d5
LP
5509 n_fd_passed = sd_listen_fds(false);
5510 if (n_fd_passed > 0) {
5511 r = fdset_new_listen_fds(&fds, false);
5512 if (r < 0) {
5513 log_error_errno(r, "Failed to collect file descriptors: %m");
5514 goto finish;
5515 }
5516 }
5517
83e803a9
ZJS
5518 /* The "default" umask. This is appropriate for most file and directory
5519 * operations performed by nspawn, and is the umask that will be used for
5520 * the child. Functions like copy_devnodes() change the umask temporarily. */
5521 umask(0022);
5522
03cfe0d5
LP
5523 if (arg_directory) {
5524 assert(!arg_image);
5525
b35ca61a
LP
5526 /* Safety precaution: let's not allow running images from the live host OS image, as long as
5527 * /var from the host will propagate into container dynamically (because bad things happen if
5528 * two systems write to the same /var). Let's allow it for the special cases where /var is
5529 * either copied (i.e. --ephemeral) or replaced (i.e. --volatile=yes|state). */
5530 if (path_equal(arg_directory, "/") && !(arg_ephemeral || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_STATE))) {
5531 log_error("Spawning container on root directory is not supported. Consider using --ephemeral, --volatile=yes or --volatile=state.");
03cfe0d5
LP
5532 r = -EINVAL;
5533 goto finish;
5534 }
5535
5536 if (arg_ephemeral) {
5537 _cleanup_free_ char *np = NULL;
5538
8d4aa2bb 5539 r = chase_symlinks_and_update(&arg_directory, 0);
3f342ec4
LP
5540 if (r < 0)
5541 goto finish;
5542
7bf011e3
LP
5543 /* If the specified path is a mount point we generate the new snapshot immediately
5544 * inside it under a random name. However if the specified is not a mount point we
5545 * create the new snapshot in the parent directory, just next to it. */
e1873695 5546 r = path_is_mount_point(arg_directory, NULL, 0);
03cfe0d5
LP
5547 if (r < 0) {
5548 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
5549 goto finish;
5550 }
5551 if (r > 0)
770b5ce4 5552 r = tempfn_random_child(arg_directory, "machine.", &np);
03cfe0d5 5553 else
770b5ce4 5554 r = tempfn_random(arg_directory, "machine.", &np);
03cfe0d5 5555 if (r < 0) {
0f3be6ca 5556 log_error_errno(r, "Failed to generate name for directory snapshot: %m");
03cfe0d5
LP
5557 goto finish;
5558 }
5559
6992459c 5560 /* We take an exclusive lock on this image, since it's our private, ephemeral copy
162392b7 5561 * only owned by us and no one else. */
6992459c 5562 r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
03cfe0d5
LP
5563 if (r < 0) {
5564 log_error_errno(r, "Failed to lock %s: %m", np);
5565 goto finish;
5566 }
5567
7bf011e3
LP
5568 {
5569 BLOCK_SIGNALS(SIGINT);
5570 r = btrfs_subvol_snapshot(arg_directory, np,
5571 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
5572 BTRFS_SNAPSHOT_FALLBACK_COPY |
5573 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
5574 BTRFS_SNAPSHOT_RECURSIVE |
5575 BTRFS_SNAPSHOT_QUOTA |
5576 BTRFS_SNAPSHOT_SIGINT);
5577 }
5578 if (r == -EINTR) {
5579 log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", np);
5580 goto finish;
5581 }
03cfe0d5
LP
5582 if (r < 0) {
5583 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
5584 goto finish;
ec16945e
LP
5585 }
5586
1cc6c93a 5587 free_and_replace(arg_directory, np);
17cbb288 5588 remove_directory = true;
30535c16 5589 } else {
cb638b5e 5590 r = chase_symlinks_and_update(&arg_directory, arg_template ? CHASE_NONEXISTENT : 0);
8d4aa2bb
LP
5591 if (r < 0)
5592 goto finish;
5593
30535c16
LP
5594 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
5595 if (r == -EBUSY) {
5596 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
5597 goto finish;
5598 }
5599 if (r < 0) {
5600 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
476b8254 5601 goto finish;
30535c16
LP
5602 }
5603
5604 if (arg_template) {
8d4aa2bb 5605 r = chase_symlinks_and_update(&arg_template, 0);
3f342ec4
LP
5606 if (r < 0)
5607 goto finish;
5608
7bf011e3
LP
5609 {
5610 BLOCK_SIGNALS(SIGINT);
5611 r = btrfs_subvol_snapshot(arg_template, arg_directory,
5612 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
5613 BTRFS_SNAPSHOT_FALLBACK_COPY |
5614 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
5615 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
5616 BTRFS_SNAPSHOT_RECURSIVE |
5617 BTRFS_SNAPSHOT_QUOTA |
5618 BTRFS_SNAPSHOT_SIGINT);
5619 }
ff6c6cc1
LP
5620 if (r == -EEXIST)
5621 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
5622 "Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
7bf011e3
LP
5623 else if (r == -EINTR) {
5624 log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", arg_directory);
5625 goto finish;
5626 } else if (r < 0) {
83521414 5627 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
30535c16 5628 goto finish;
ff6c6cc1
LP
5629 } else
5630 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
5631 "Populated %s from template %s.", arg_directory, arg_template);
30535c16 5632 }
ec16945e
LP
5633 }
5634
7732f92b 5635 if (arg_start_mode == START_BOOT) {
aff7ae0d 5636 _cleanup_free_ char *b = NULL;
a5201ed6 5637 const char *p;
c9fe05e0 5638
aff7ae0d
LP
5639 if (arg_pivot_root_new) {
5640 b = path_join(arg_directory, arg_pivot_root_new);
5641 if (!b)
5642 return log_oom();
5643
5644 p = b;
5645 } else
a5201ed6 5646 p = arg_directory;
c9fe05e0
AR
5647
5648 if (path_is_os_tree(p) <= 0) {
aff7ae0d
LP
5649 r = log_error_errno(SYNTHETIC_ERRNO(EINVAL),
5650 "Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", p);
1b9e5b12
LP
5651 goto finish;
5652 }
5653 } else {
aff7ae0d 5654 _cleanup_free_ char *p = NULL;
c9fe05e0 5655
a5201ed6 5656 if (arg_pivot_root_new)
aff7ae0d 5657 p = path_join(arg_directory, arg_pivot_root_new, "/usr/");
a5201ed6 5658 else
aff7ae0d
LP
5659 p = path_join(arg_directory, "/usr/");
5660 if (!p)
5661 return log_oom();
1b9e5b12 5662
aff7ae0d
LP
5663 if (laccess(p, F_OK) < 0) {
5664 r = log_error_errno(SYNTHETIC_ERRNO(EINVAL),
5665 "Directory %s doesn't look like it has an OS tree (/usr/ directory is missing). Refusing.", arg_directory);
1b9e5b12 5666 goto finish;
1b9e5b12
LP
5667 }
5668 }
ec16945e 5669
6b9132a9 5670 } else {
d04faa4e 5671 DissectImageFlags dissect_image_flags =
4b5de5dd 5672 DISSECT_IMAGE_GENERIC_ROOT |
d04faa4e
LP
5673 DISSECT_IMAGE_REQUIRE_ROOT |
5674 DISSECT_IMAGE_RELAX_VAR_CHECK |
5675 DISSECT_IMAGE_USR_NO_ROOT;
ec16945e
LP
5676 assert(arg_image);
5677 assert(!arg_template);
5678
8d4aa2bb 5679 r = chase_symlinks_and_update(&arg_image, 0);
3f342ec4
LP
5680 if (r < 0)
5681 goto finish;
5682
0f3be6ca
LP
5683 if (arg_ephemeral) {
5684 _cleanup_free_ char *np = NULL;
5685
5686 r = tempfn_random(arg_image, "machine.", &np);
5687 if (r < 0) {
5688 log_error_errno(r, "Failed to generate name for image snapshot: %m");
5689 goto finish;
5690 }
5691
6992459c
LP
5692 /* Always take an exclusive lock on our own ephemeral copy. */
5693 r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
0f3be6ca
LP
5694 if (r < 0) {
5695 r = log_error_errno(r, "Failed to create image lock: %m");
5696 goto finish;
5697 }
5698
7bf011e3
LP
5699 {
5700 BLOCK_SIGNALS(SIGINT);
5701 r = copy_file(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600, FS_NOCOW_FL, FS_NOCOW_FL, COPY_REFLINK|COPY_CRTIME|COPY_SIGINT);
5702 }
5703 if (r == -EINTR) {
5704 log_error_errno(r, "Interrupted while copying image file to %s, removed again.", np);
5705 goto finish;
5706 }
0f3be6ca
LP
5707 if (r < 0) {
5708 r = log_error_errno(r, "Failed to copy image file: %m");
5709 goto finish;
5710 }
5711
1cc6c93a 5712 free_and_replace(arg_image, np);
0f3be6ca
LP
5713 remove_image = true;
5714 } else {
5715 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
5716 if (r == -EBUSY) {
5717 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
5718 goto finish;
5719 }
5720 if (r < 0) {
5721 r = log_error_errno(r, "Failed to create image lock: %m");
5722 goto finish;
5723 }
4623e8e6 5724
89e62e0b
LP
5725 r = verity_settings_load(
5726 &arg_verity_settings,
5727 arg_image, NULL, NULL);
e7cbe5cb
LB
5728 if (r < 0) {
5729 log_error_errno(r, "Failed to read verity artefacts for %s: %m", arg_image);
5730 goto finish;
78ebe980 5731 }
89e62e0b
LP
5732
5733 if (arg_verity_settings.data_path)
5734 dissect_image_flags |= DISSECT_IMAGE_NO_PARTITION_TABLE;
30535c16
LP
5735 }
5736
c67b0082 5737 if (!mkdtemp(tmprootdir)) {
0f3be6ca 5738 r = log_error_errno(errno, "Failed to create temporary directory: %m");
6b9132a9 5739 goto finish;
1b9e5b12 5740 }
6b9132a9 5741
c67b0082
LP
5742 remove_tmprootdir = true;
5743
5744 arg_directory = strdup(tmprootdir);
1b9e5b12
LP
5745 if (!arg_directory) {
5746 r = log_oom();
5747 goto finish;
6b9132a9 5748 }
88213476 5749
89e62e0b
LP
5750 r = loop_device_make_by_path(
5751 arg_image,
5752 arg_read_only ? O_RDONLY : O_RDWR,
5753 FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN,
5754 &loop);
2d845785
LP
5755 if (r < 0) {
5756 log_error_errno(r, "Failed to set up loopback block device: %m");
842f3b0f
LP
5757 goto finish;
5758 }
1b9e5b12 5759
41bc4849
LP
5760 /* Take a LOCK_SH lock on the device, so that udevd doesn't issue BLKRRPART in our back */
5761 r = loop_device_flock(loop, LOCK_SH);
5762 if (r < 0) {
5763 log_error_errno(r, "Failed to take lock on loopback block device: %m");
5764 goto finish;
5765 }
5766
4526113f 5767 r = dissect_image_and_warn(
e0f9e7bd 5768 loop->fd,
4526113f 5769 arg_image,
89e62e0b 5770 &arg_verity_settings,
18d73705 5771 NULL,
a3642997 5772 loop->diskseq,
75dc190d 5773 loop->uevent_seqnum_not_before,
4a62257d 5774 loop->timestamp_not_before,
e7cbe5cb 5775 dissect_image_flags,
e0f9e7bd 5776 &dissected_image);
2d845785 5777 if (r == -ENOPKG) {
4526113f 5778 /* dissected_image_and_warn() already printed a brief error message. Extend on that with more details */
2d845785
LP
5779 log_notice("Note that the disk image needs to\n"
5780 " a) either contain only a single MBR partition of type 0x83 that is marked bootable\n"
5781 " b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n"
19ac32cd 5782 " c) or follow https://systemd.io/DISCOVERABLE_PARTITIONS\n"
2d845785
LP
5783 " d) or contain a file system without a partition table\n"
5784 "in order to be bootable with systemd-nspawn.");
1b9e5b12 5785 goto finish;
2d845785 5786 }
4526113f 5787 if (r < 0)
842f3b0f 5788 goto finish;
1b9e5b12 5789
88b3300f
LP
5790 r = dissected_image_load_verity_sig_partition(
5791 dissected_image,
5792 loop->fd,
5793 &arg_verity_settings);
5794 if (r < 0)
5795 goto finish;
5796
8ee9615e
LP
5797 if (dissected_image->has_verity && !arg_verity_settings.root_hash && !dissected_image->has_verity_sig)
5798 log_notice("Note: image %s contains verity information, but no root hash specified and no embedded "
5799 "root hash signature found! Proceeding without integrity checking.", arg_image);
4623e8e6 5800
89e62e0b
LP
5801 r = dissected_image_decrypt_interactively(
5802 dissected_image,
5803 NULL,
5804 &arg_verity_settings,
5805 0,
5806 &decrypted_image);
1b9e5b12
LP
5807 if (r < 0)
5808 goto finish;
0f3be6ca
LP
5809
5810 /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */
5811 if (remove_image && unlink(arg_image) >= 0)
5812 remove_image = false;
842f3b0f 5813 }
842f3b0f 5814
86c0dd4a 5815 r = custom_mount_prepare_all(arg_directory, arg_custom_mounts, arg_n_custom_mounts);
5a8af538
LP
5816 if (r < 0)
5817 goto finish;
5818
de40a303
LP
5819 if (arg_console_mode < 0)
5820 arg_console_mode =
5821 isatty(STDIN_FILENO) > 0 &&
5822 isatty(STDOUT_FILENO) > 0 ? CONSOLE_INTERACTIVE : CONSOLE_READ_ONLY;
9c857b9d 5823
de40a303
LP
5824 if (arg_console_mode == CONSOLE_PIPE) /* if we pass STDERR on to the container, don't add our own logs into it too */
5825 arg_quiet = true;
a258bf26 5826
9c857b9d
LP
5827 if (!arg_quiet)
5828 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
5829 arg_machine, arg_image ?: arg_directory);
5830
72c0a2c2 5831 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
a258bf26 5832
66edd963 5833 if (prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) < 0) {
03cfe0d5
LP
5834 r = log_error_errno(errno, "Failed to become subreaper: %m");
5835 goto finish;
5836 }
5837
761cf19d
FW
5838 if (arg_expose_ports) {
5839 r = fw_ctx_new(&fw_ctx);
5840 if (r < 0) {
5841 log_error_errno(r, "Cannot expose configured ports, firewall initialization failed: %m");
5842 goto finish;
5843 }
5844 expose_args.fw_ctx = fw_ctx;
5845 }
d87be9b0 5846 for (;;) {
3acc84eb 5847 r = run_container(dissected_image,
44dbef90
LP
5848 secondary,
5849 fds,
5850 veth_name, &veth_created,
761cf19d 5851 &expose_args, &master,
44dbef90 5852 &pid, &ret);
b0067625 5853 if (r <= 0)
d87be9b0 5854 break;
d87be9b0 5855 }
88213476
LP
5856
5857finish:
04f590a4
LP
5858 (void) sd_notify(false,
5859 r == 0 && ret == EXIT_FORCE_RESTART ? "STOPPING=1\nSTATUS=Restarting..." :
5860 "STOPPING=1\nSTATUS=Terminating...");
af4ec430 5861
9444b1f2 5862 if (pid > 0)
c67b0082 5863 (void) kill(pid, SIGKILL);
88213476 5864
503546da 5865 /* Try to flush whatever is still queued in the pty */
6a0f896b 5866 if (master >= 0) {
f5fbe71d 5867 (void) copy_bytes(master, STDOUT_FILENO, UINT64_MAX, 0);
6a0f896b
LP
5868 master = safe_close(master);
5869 }
5870
5871 if (pid > 0)
5872 (void) wait_for_terminate(pid, NULL);
503546da 5873
50ebcf6c
LP
5874 pager_close();
5875
17cbb288 5876 if (remove_directory && arg_directory) {
ec16945e
LP
5877 int k;
5878
17cbb288 5879 k = rm_rf(arg_directory, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
ec16945e 5880 if (k < 0)
17cbb288 5881 log_warning_errno(k, "Cannot remove '%s', ignoring: %m", arg_directory);
ec16945e
LP
5882 }
5883
0f3be6ca
LP
5884 if (remove_image && arg_image) {
5885 if (unlink(arg_image) < 0)
5886 log_warning_errno(errno, "Can't remove image file '%s', ignoring: %m", arg_image);
5887 }
5888
c67b0082
LP
5889 if (remove_tmprootdir) {
5890 if (rmdir(tmprootdir) < 0)
5891 log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir);
5892 }
5893
785890ac
LP
5894 if (arg_machine) {
5895 const char *p;
5896
63c372cb 5897 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
c6878637 5898 (void) rm_rf(p, REMOVE_ROOT);
785890ac
LP
5899 }
5900
deff68e7
FW
5901 expose_port_flush(&fw_ctx, arg_expose_ports, AF_INET, &expose_args.address4);
5902 expose_port_flush(&fw_ctx, arg_expose_ports, AF_INET6, &expose_args.address6);
7513c5b8
LP
5903
5904 if (veth_created)
5905 (void) remove_veth_links(veth_name, arg_network_veth_extra);
22b28dfd 5906 (void) remove_bridge(arg_network_zone);
f757855e 5907
f757855e
LP
5908 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
5909 expose_port_free_all(arg_expose_ports);
bf428efb 5910 rlimit_free_all(arg_rlimit);
b2645747 5911 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
3652872a 5912 credential_free_all(arg_credentials, arg_n_credentials);
6d0b55c2 5913
44dbef90
LP
5914 if (r < 0)
5915 return r;
5916
5917 return ret;
88213476 5918}
44dbef90
LP
5919
/* Program entry point: hands run() to the main-function wrapper macro.
 * run() returns r when r < 0 and otherwise returns ret, the value filled in by run_container().
 * NOTE(review): presumably the macro maps a negative result to a failure exit status and passes
 * a positive result through as the process exit code — confirm against the macro's definition. */
5920DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run);