]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
dissect: wrap verity settings in new VeritySettings structure
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
53e1b683 1/* SPDX-License-Identifier: LGPL-2.1+ */
88213476 2
349cc4a5 3#if HAVE_BLKID
8fe0087e 4#endif
88213476 5#include <errno.h>
88213476 6#include <getopt.h>
503f480f 7#include <linux/fs.h>
1b9e5b12 8#include <linux/loop.h>
349cc4a5 9#if HAVE_SELINUX
8fe0087e 10#include <selinux/selinux.h>
1b9e5b12 11#endif
8fe0087e 12#include <stdlib.h>
8fe0087e 13#include <sys/file.h>
8fe0087e
LP
14#include <sys/personality.h>
15#include <sys/prctl.h>
16#include <sys/types.h>
6916b164 17#include <sys/wait.h>
8fe0087e 18#include <unistd.h>
1b9e5b12 19
b053cd5f 20#include "sd-bus.h"
1f0cd86b 21#include "sd-daemon.h"
1f0cd86b 22#include "sd-id128.h"
8fe0087e 23
b5efdb8a 24#include "alloc-util.h"
8fe0087e
LP
25#include "barrier.h"
26#include "base-filesystem.h"
27#include "blkid-util.h"
28#include "btrfs-util.h"
b8ea7a6e 29#include "bus-error.h"
b053cd5f 30#include "bus-util.h"
8fe0087e 31#include "cap-list.h"
430f0182 32#include "capability-util.h"
04d391da 33#include "cgroup-util.h"
8fe0087e 34#include "copy.h"
d107bb7d 35#include "cpu-set-util.h"
4fc9982c 36#include "dev-setup.h"
2d845785 37#include "dissect-image.h"
8fe0087e 38#include "env-util.h"
3652872a 39#include "escape.h"
3ffd4af2 40#include "fd-util.h"
842f3b0f 41#include "fdset.h"
a5c32cff 42#include "fileio.h"
f97b34a6 43#include "format-util.h"
f4f15635 44#include "fs-util.h"
1b9e5b12 45#include "gpt.h"
4623e8e6 46#include "hexdecoct.h"
8fe0087e 47#include "hostname-util.h"
910fd145 48#include "id128-util.h"
3652872a 49#include "io-util.h"
8fe0087e 50#include "log.h"
2d845785 51#include "loop-util.h"
8fe0087e 52#include "loopback-setup.h"
1b9cebf6 53#include "machine-image.h"
8fe0087e 54#include "macro.h"
44dbef90 55#include "main-func.h"
f5947a5e 56#include "missing_sched.h"
8fe0087e 57#include "mkdir.h"
4349cd7c 58#include "mount-util.h"
049af8ad 59#include "mountpoint-util.h"
0cb8e3d1 60#include "namespace-util.h"
8fe0087e 61#include "netlink-util.h"
07630cea 62#include "nspawn-cgroup.h"
3652872a 63#include "nspawn-creds.h"
3603efde 64#include "nspawn-def.h"
07630cea
LP
65#include "nspawn-expose-ports.h"
66#include "nspawn-mount.h"
67#include "nspawn-network.h"
de40a303 68#include "nspawn-oci.h"
7336138e 69#include "nspawn-patch-uid.h"
07630cea 70#include "nspawn-register.h"
910fd145 71#include "nspawn-seccomp.h"
07630cea
LP
72#include "nspawn-settings.h"
73#include "nspawn-setuid.h"
7732f92b 74#include "nspawn-stub-pid1.h"
d8b4d14d 75#include "nulstr-util.h"
d58ad743 76#include "os-util.h"
50ebcf6c 77#include "pager.h"
6bedfcbb 78#include "parse-util.h"
8fe0087e 79#include "path-util.h"
294bf0c3 80#include "pretty-print.h"
0b452006 81#include "process-util.h"
8fe0087e
LP
82#include "ptyfwd.h"
83#include "random-util.h"
8869a0b4 84#include "raw-clone.h"
86775e35 85#include "resolve-util.h"
bf428efb 86#include "rlimit-util.h"
8fe0087e 87#include "rm-rf.h"
de40a303
LP
88#if HAVE_SECCOMP
89#include "seccomp-util.h"
90#endif
68b02049 91#include "selinux-util.h"
8fe0087e 92#include "signal-util.h"
2583fbea 93#include "socket-util.h"
8fcde012 94#include "stat-util.h"
15a5e950 95#include "stdio-util.h"
5c828e66 96#include "string-table.h"
07630cea 97#include "string-util.h"
8fe0087e 98#include "strv.h"
de40a303 99#include "sysctl-util.h"
8fe0087e 100#include "terminal-util.h"
e4de7287 101#include "tmpfile-util.h"
affb60b1 102#include "umask-util.h"
43c3fb46 103#include "unit-name.h"
b1d4f8e1 104#include "user-util.h"
8fe0087e 105#include "util.h"
e9642be2 106
e96ceaba
LP
/* Path of the notification socket inside the container; the payload can use it to talk to nspawn via the
 * sd_notify(3) protocol. */
#define NSPAWN_NOTIFY_SOCKET_PATH "/run/host/notify"

/* NOTE(review): going by the name, the exit status that requests a forced restart of the container; its
 * users are outside this part of the file. */
#define EXIT_FORCE_RESTART 133

/* NOTE(review): going by the names, the way a container run ended; the consumers are outside this part of
 * the file. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,
        CONTAINER_REBOOTED,
} ContainerStatus;
116
88213476 117static char *arg_directory = NULL;
ec16945e 118static char *arg_template = NULL;
5f932eb9 119static char *arg_chdir = NULL;
b53ede69
PW
120static char *arg_pivot_root_new = NULL;
121static char *arg_pivot_root_old = NULL;
687d0825 122static char *arg_user = NULL;
de40a303
LP
123static uid_t arg_uid = UID_INVALID;
124static gid_t arg_gid = GID_INVALID;
125static gid_t* arg_supplementary_gids = NULL;
126static size_t arg_n_supplementary_gids = 0;
9444b1f2 127static sd_id128_t arg_uuid = {};
3a9530e5
LP
128static char *arg_machine = NULL; /* The name used by the host to refer to this */
129static char *arg_hostname = NULL; /* The name the payload sees by default */
c74e630d
LP
130static const char *arg_selinux_context = NULL;
131static const char *arg_selinux_apifs_context = NULL;
de40a303 132static char *arg_slice = NULL;
ff01d048 133static bool arg_private_network = false;
bc2f673e 134static bool arg_read_only = false;
7732f92b 135static StartMode arg_start_mode = START_PID1;
ec16945e 136static bool arg_ephemeral = false;
57fb9fb5 137static LinkJournal arg_link_journal = LINK_AUTO;
574edc90 138static bool arg_link_journal_try = false;
520e0d54 139static uint64_t arg_caps_retain =
50b52222
LP
140 (1ULL << CAP_AUDIT_CONTROL) |
141 (1ULL << CAP_AUDIT_WRITE) |
5076f0cc
LP
142 (1ULL << CAP_CHOWN) |
143 (1ULL << CAP_DAC_OVERRIDE) |
144 (1ULL << CAP_DAC_READ_SEARCH) |
145 (1ULL << CAP_FOWNER) |
146 (1ULL << CAP_FSETID) |
147 (1ULL << CAP_IPC_OWNER) |
148 (1ULL << CAP_KILL) |
149 (1ULL << CAP_LEASE) |
150 (1ULL << CAP_LINUX_IMMUTABLE) |
50b52222 151 (1ULL << CAP_MKNOD) |
5076f0cc
LP
152 (1ULL << CAP_NET_BIND_SERVICE) |
153 (1ULL << CAP_NET_BROADCAST) |
154 (1ULL << CAP_NET_RAW) |
5076f0cc 155 (1ULL << CAP_SETFCAP) |
50b52222 156 (1ULL << CAP_SETGID) |
5076f0cc
LP
157 (1ULL << CAP_SETPCAP) |
158 (1ULL << CAP_SETUID) |
159 (1ULL << CAP_SYS_ADMIN) |
50b52222 160 (1ULL << CAP_SYS_BOOT) |
5076f0cc
LP
161 (1ULL << CAP_SYS_CHROOT) |
162 (1ULL << CAP_SYS_NICE) |
163 (1ULL << CAP_SYS_PTRACE) |
d87be9b0 164 (1ULL << CAP_SYS_RESOURCE) |
50b52222 165 (1ULL << CAP_SYS_TTY_CONFIG);
de40a303 166static CapabilityQuintet arg_full_capabilities = CAPABILITY_QUINTET_NULL;
5a8af538 167static CustomMount *arg_custom_mounts = NULL;
88614c8a 168static size_t arg_n_custom_mounts = 0;
f4889f65 169static char **arg_setenv = NULL;
284c0b91 170static bool arg_quiet = false;
eb91eb18 171static bool arg_register = true;
89f7c846 172static bool arg_keep_unit = false;
aa28aefe 173static char **arg_network_interfaces = NULL;
c74e630d 174static char **arg_network_macvlan = NULL;
4bbfe7ad 175static char **arg_network_ipvlan = NULL;
69c79d3c 176static bool arg_network_veth = false;
f6d6bad1 177static char **arg_network_veth_extra = NULL;
f757855e 178static char *arg_network_bridge = NULL;
22b28dfd 179static char *arg_network_zone = NULL;
d7bea6b6 180static char *arg_network_namespace_path = NULL;
bb068de0 181static PagerFlags arg_pager_flags = 0;
050f7277 182static unsigned long arg_personality = PERSONALITY_INVALID;
ec16945e 183static char *arg_image = NULL;
de40a303 184static char *arg_oci_bundle = NULL;
f757855e 185static VolatileMode arg_volatile_mode = VOLATILE_NO;
6d0b55c2 186static ExposePort *arg_expose_ports = NULL;
f36933fe 187static char **arg_property = NULL;
de40a303 188static sd_bus_message *arg_property_message = NULL;
0de7acce 189static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
6dac160c 190static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
0de7acce 191static bool arg_userns_chown = false;
c6c8f6e2 192static int arg_kill_signal = 0;
5da38d07 193static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN;
f757855e
LP
194static SettingsMask arg_settings_mask = 0;
195static int arg_settings_trusted = -1;
196static char **arg_parameters = NULL;
6aadfa4c 197static const char *arg_container_service_name = "systemd-nspawn";
9c1e04d0 198static bool arg_notify_ready = false;
5a8ff0e6 199static bool arg_use_cgns = true;
0c582db0 200static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
1099ceeb 201static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_TMPFS_TMP;
89e62e0b 202static VeritySettings arg_verity_settings = {};
6b000af4
LP
203static char **arg_syscall_allow_list = NULL;
204static char **arg_syscall_deny_list = NULL;
de40a303
LP
205#if HAVE_SECCOMP
206static scmp_filter_ctx arg_seccomp = NULL;
207#endif
bf428efb 208static struct rlimit *arg_rlimit[_RLIMIT_MAX] = {};
66edd963 209static bool arg_no_new_privileges = false;
81f345df
LP
210static int arg_oom_score_adjust = 0;
211static bool arg_oom_score_adjust_set = false;
0985c7c4 212static CPUSet arg_cpu_set = {};
09d423e9 213static ResolvConfMode arg_resolv_conf = RESOLV_CONF_AUTO;
1688841f 214static TimezoneMode arg_timezone = TIMEZONE_AUTO;
de40a303
LP
215static unsigned arg_console_width = (unsigned) -1, arg_console_height = (unsigned) -1;
216static DeviceNode* arg_extra_nodes = NULL;
217static size_t arg_n_extra_nodes = 0;
218static char **arg_sysctl = NULL;
219static ConsoleMode arg_console_mode = _CONSOLE_MODE_INVALID;
3652872a
LP
220static Credential *arg_credentials = NULL;
221static size_t arg_n_credentials = 0;
88213476 222
6145bb4f
LP
/* Pair the option globals declared above with the helper that releases them. The order of the
 * registrations is kept as is. */

/* Plain heap strings and arrays */
STATIC_DESTRUCTOR_REGISTER(arg_directory, freep);
STATIC_DESTRUCTOR_REGISTER(arg_template, freep);
STATIC_DESTRUCTOR_REGISTER(arg_chdir, freep);
STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_new, freep);
STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_old, freep);
STATIC_DESTRUCTOR_REGISTER(arg_user, freep);
STATIC_DESTRUCTOR_REGISTER(arg_supplementary_gids, freep);
STATIC_DESTRUCTOR_REGISTER(arg_machine, freep);
STATIC_DESTRUCTOR_REGISTER(arg_hostname, freep);
STATIC_DESTRUCTOR_REGISTER(arg_slice, freep);

/* Environment and network configuration */
STATIC_DESTRUCTOR_REGISTER(arg_setenv, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_interfaces, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_macvlan, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_ipvlan, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_veth_extra, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_bridge, freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_zone, freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_namespace_path, freep);

/* Image, unit properties and payload parameters */
STATIC_DESTRUCTOR_REGISTER(arg_image, freep);
STATIC_DESTRUCTOR_REGISTER(arg_oci_bundle, freep);
STATIC_DESTRUCTOR_REGISTER(arg_property, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_property_message, sd_bus_message_unrefp);
STATIC_DESTRUCTOR_REGISTER(arg_parameters, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_verity_settings, verity_settings_done);

/* System call filtering, CPU affinity, sysctl */
STATIC_DESTRUCTOR_REGISTER(arg_syscall_allow_list, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_syscall_deny_list, strv_freep);
#if HAVE_SECCOMP
STATIC_DESTRUCTOR_REGISTER(arg_seccomp, seccomp_releasep);
#endif
STATIC_DESTRUCTOR_REGISTER(arg_cpu_set, cpu_set_reset);
STATIC_DESTRUCTOR_REGISTER(arg_sysctl, strv_freep);
254
dce66ffe
ZJS
255static int handle_arg_console(const char *arg) {
256 if (streq(arg, "help")) {
257 puts("interactive\n"
258 "read-only\n"
259 "passive\n"
260 "pipe");
261 return 0;
262 }
263
264 if (streq(arg, "interactive"))
265 arg_console_mode = CONSOLE_INTERACTIVE;
266 else if (streq(arg, "read-only"))
267 arg_console_mode = CONSOLE_READ_ONLY;
268 else if (streq(arg, "passive"))
269 arg_console_mode = CONSOLE_PASSIVE;
270 else if (streq(arg, "pipe"))
271 arg_console_mode = CONSOLE_PIPE;
272 else
273 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown console mode: %s", optarg);
274
275 arg_settings_mask |= SETTING_CONSOLE_MODE;
276 return 1;
277}
278
37ec0fdd
LP
279static int help(void) {
280 _cleanup_free_ char *link = NULL;
281 int r;
282
bb068de0 283 (void) pager_open(arg_pager_flags);
50ebcf6c 284
37ec0fdd
LP
285 r = terminal_urlify_man("systemd-nspawn", "1", &link);
286 if (r < 0)
287 return log_oom();
288
25148653 289 printf("%1$s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
37a92352 290 "%5$sSpawn a command or OS in a light-weight container.%6$s\n\n"
a8828ed9
DW
291 " -h --help Show this help\n"
292 " --version Print version string\n"
69c79d3c 293 " -q --quiet Do not show status information\n"
bb068de0 294 " --no-pager Do not pipe output into a pager\n"
25148653
LP
295 " --settings=BOOLEAN Load additional settings from .nspawn file\n\n"
296 "%3$sImage:%4$s\n"
1b9e5b12 297 " -D --directory=PATH Root directory for the container\n"
ec16945e
LP
298 " --template=PATH Initialize root directory from template directory,\n"
299 " if missing\n"
300 " -x --ephemeral Run container with snapshot of root directory, and\n"
301 " remove it after exit\n"
25e68fd3
LP
302 " -i --image=PATH Root file system disk image (or device node) for\n"
303 " the container\n"
de40a303 304 " --oci-bundle=PATH OCI bundle directory\n"
25148653
LP
305 " --read-only Mount the root directory read-only\n"
306 " --volatile[=MODE] Run the system in volatile mode\n"
25e68fd3 307 " --root-hash=HASH Specify verity root hash for root disk image\n"
c2923fdc
LB
308 " --root-hash-sig=SIG Specify pkcs7 signature of root hash for verity\n"
309 " as a DER encoded PKCS7, either as a path to a file\n"
310 " or as an ASCII base64 encoded string prefixed by\n"
311 " 'base64:'\n"
e7cbe5cb 312 " --verity-data=PATH Specify hash device for verity\n"
25148653
LP
313 " --pivot-root=PATH[:PATH]\n"
314 " Pivot root to given directory in the container\n\n"
315 "%3$sExecution:%4$s\n"
7732f92b 316 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
a8828ed9 317 " -b --boot Boot up full system (i.e. invoke init)\n"
5f932eb9 318 " --chdir=PATH Set working directory in the container\n"
25148653
LP
319 " -E --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
320 " -u --user=USER Run the command under specified user or UID\n"
321 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
322 " --notify-ready=BOOLEAN Receive notifications from the child init process\n\n"
323 "%3$sSystem Identity:%4$s\n"
a8828ed9 324 " -M --machine=NAME Set the machine name for the container\n"
3a9530e5 325 " --hostname=NAME Override the hostname for the container\n"
25148653
LP
326 " --uuid=UUID Set a specific machine UUID for the container\n\n"
327 "%3$sProperties:%4$s\n"
a8828ed9 328 " -S --slice=SLICE Place the container in the specified slice\n"
f36933fe 329 " --property=NAME=VALUE Set scope unit property\n"
25148653
LP
330 " --register=BOOLEAN Register container as machine\n"
331 " --keep-unit Do not register a scope for the machine, reuse\n"
332 " the service unit nspawn is running in\n\n"
333 "%3$sUser Namespacing:%4$s\n"
90b4a64d 334 " -U --private-users=pick Run within user namespace, autoselect UID/GID range\n"
03cfe0d5 335 " --private-users[=UIDBASE[:NUIDS]]\n"
90b4a64d 336 " Similar, but with user configured UID/GID range\n"
25148653
LP
337 " --private-users-chown Adjust OS tree ownership to private UID/GID range\n\n"
338 "%3$sNetworking:%4$s\n"
69c79d3c
LP
339 " --private-network Disable network in container\n"
340 " --network-interface=INTERFACE\n"
341 " Assign an existing network interface to the\n"
342 " container\n"
c74e630d
LP
343 " --network-macvlan=INTERFACE\n"
344 " Create a macvlan network interface based on an\n"
345 " existing network interface to the container\n"
4bbfe7ad
TG
346 " --network-ipvlan=INTERFACE\n"
347 " Create a ipvlan network interface based on an\n"
348 " existing network interface to the container\n"
a8eaaee7 349 " -n --network-veth Add a virtual Ethernet connection between host\n"
69c79d3c 350 " and container\n"
f6d6bad1
LP
351 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
352 " Add an additional virtual Ethernet link between\n"
353 " host and container\n"
ab046dde 354 " --network-bridge=INTERFACE\n"
90b4a64d
ZJS
355 " Add a virtual Ethernet connection to the container\n"
356 " and attach it to an existing bridge on the host\n"
357 " --network-zone=NAME Similar, but attach the new interface to an\n"
358 " an automatically managed bridge interface\n"
d7bea6b6
DP
359 " --network-namespace-path=PATH\n"
360 " Set network namespace to the one represented by\n"
361 " the specified kernel namespace file node\n"
6d0b55c2 362 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
25148653
LP
363 " Expose a container IP port on the host\n\n"
364 "%3$sSecurity:%4$s\n"
a8828ed9
DW
365 " --capability=CAP In addition to the default, retain specified\n"
366 " capability\n"
367 " --drop-capability=CAP Drop the specified capability from the default set\n"
f4e803c8 368 " --no-new-privileges Set PR_SET_NO_NEW_PRIVS flag for container payload\n"
960e4569
LP
369 " --system-call-filter=LIST|~LIST\n"
370 " Permit/prohibit specific system calls\n"
25148653
LP
371 " -Z --selinux-context=SECLABEL\n"
372 " Set the SELinux security context to be used by\n"
373 " processes in the container\n"
374 " -L --selinux-apifs-context=SECLABEL\n"
375 " Set the SELinux security context to be used by\n"
376 " API/tmpfs file systems in the container\n\n"
377 "%3$sResources:%4$s\n"
bf428efb 378 " --rlimit=NAME=LIMIT Set a resource limit for the payload\n"
81f345df
LP
379 " --oom-score-adjust=VALUE\n"
380 " Adjust the OOM score value for the payload\n"
f4e803c8
LP
381 " --cpu-affinity=CPUS Adjust the CPU affinity of the container\n"
382 " --personality=ARCH Pick personality for this container\n\n"
25148653 383 "%3$sIntegration:%4$s\n"
09d423e9 384 " --resolv-conf=MODE Select mode of /etc/resolv.conf initialization\n"
1688841f 385 " --timezone=MODE Select mode of /etc/localtime initialization\n"
25148653
LP
386 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
387 " host, try-guest, try-host\n"
388 " -j Equivalent to --link-journal=try-guest\n\n"
389 "%3$sMounts:%4$s\n"
5e5bfa6e
EY
390 " --bind=PATH[:PATH[:OPTIONS]]\n"
391 " Bind mount a file or directory from the host into\n"
a8828ed9 392 " the container\n"
5e5bfa6e
EY
393 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
394 " Similar, but creates a read-only bind mount\n"
de40a303
LP
395 " --inaccessible=PATH Over-mount file node with inaccessible node to mask\n"
396 " it\n"
06c17c39 397 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
5a8af538
LP
398 " --overlay=PATH[:PATH...]:PATH\n"
399 " Create an overlay mount from the host to \n"
400 " the container\n"
401 " --overlay-ro=PATH[:PATH...]:PATH\n"
25148653
LP
402 " Similar, but creates a read-only overlay mount\n\n"
403 "%3$sInput/Output:%4$s\n"
de40a303
LP
404 " --console=MODE Select how stdin/stdout/stderr and /dev/console are\n"
405 " set up for the container.\n"
3652872a
LP
406 " -P --pipe Equivalent to --console=pipe\n\n"
407 "%3$sCredentials:%4$s\n"
408 " --set-credential=ID:VALUE\n"
409 " Pass a credential with literal value to container.\n"
410 " --load-credential=ID:PATH\n"
411 " Load credential to pass to container from file or\n"
412 " AF_UNIX stream socket.\n"
25148653 413 "\nSee the %2$s for details.\n"
37ec0fdd
LP
414 , program_invocation_short_name
415 , link
37a92352
LP
416 , ansi_underline(), ansi_normal()
417 , ansi_highlight(), ansi_normal()
418 );
37ec0fdd
LP
419
420 return 0;
88213476
LP
421}
422
86c0dd4a 423static int custom_mount_check_all(void) {
88614c8a 424 size_t i;
5a8af538 425
5a8af538
LP
426 for (i = 0; i < arg_n_custom_mounts; i++) {
427 CustomMount *m = &arg_custom_mounts[i];
428
0de7acce 429 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
baaa35ad
ZJS
430 if (arg_userns_chown)
431 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
432 "--private-users-chown may not be combined with custom root mounts.");
433 else if (arg_uid_shift == UID_INVALID)
434 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
435 "--private-users with automatic UID shift may not be combined with custom root mounts.");
825d5287 436 }
5a8af538
LP
437 }
438
439 return 0;
440}
441
8199d554 442static int detect_unified_cgroup_hierarchy_from_environment(void) {
c78c095b 443 const char *e, *var = "SYSTEMD_NSPAWN_UNIFIED_HIERARCHY";
415fc41c 444 int r;
5da38d07 445
efdb0237 446 /* Allow the user to control whether the unified hierarchy is used */
c78c095b
ZJS
447
448 e = getenv(var);
449 if (!e) {
d5fc5b2f 450 /* $UNIFIED_CGROUP_HIERARCHY has been renamed to $SYSTEMD_NSPAWN_UNIFIED_HIERARCHY. */
c78c095b
ZJS
451 var = "UNIFIED_CGROUP_HIERARCHY";
452 e = getenv(var);
c78c095b
ZJS
453 }
454
455 if (!isempty(e)) {
efdb0237
LP
456 r = parse_boolean(e);
457 if (r < 0)
c78c095b 458 return log_error_errno(r, "Failed to parse $%s: %m", var);
5da38d07
TH
459 if (r > 0)
460 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
461 else
462 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237
LP
463 }
464
8199d554
LP
465 return 0;
466}
467
468static int detect_unified_cgroup_hierarchy_from_image(const char *directory) {
469 int r;
470
75b0d8b8
ZJS
471 /* Let's inherit the mode to use from the host system, but let's take into consideration what systemd
472 * in the image actually supports. */
b4cccbc1
LP
473 r = cg_all_unified();
474 if (r < 0)
475 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
476 if (r > 0) {
a8725a06
ZJS
477 /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
478 * routine only detects 231, so we'll have a false negative here for 230. */
479 r = systemd_installation_has_version(directory, 230);
480 if (r < 0)
481 return log_error_errno(r, "Failed to determine systemd version in container: %m");
482 if (r > 0)
483 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
484 else
485 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
c22800e4 486 } else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
2977724b
TH
487 /* Mixed cgroup hierarchy support was added in 233 */
488 r = systemd_installation_has_version(directory, 233);
0fd9563f
ZJS
489 if (r < 0)
490 return log_error_errno(r, "Failed to determine systemd version in container: %m");
491 if (r > 0)
492 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
493 else
494 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
495 } else
5da38d07 496 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237 497
8199d554
LP
498 log_debug("Using %s hierarchy for container.",
499 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_NONE ? "legacy" :
500 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_SYSTEMD ? "hybrid" : "unified");
501
efdb0237
LP
502 return 0;
503}
504
8a99bd0c
ZJS
505static int parse_capability_spec(const char *spec, uint64_t *ret_mask) {
506 uint64_t mask = 0;
507 int r;
508
509 for (;;) {
510 _cleanup_free_ char *t = NULL;
511
512 r = extract_first_word(&spec, &t, ",", 0);
513 if (r < 0)
514 return log_error_errno(r, "Failed to parse capability %s.", t);
515 if (r == 0)
516 break;
517
518 if (streq(t, "help")) {
519 for (int i = 0; i < capability_list_length(); i++) {
520 const char *name;
521
522 name = capability_to_name(i);
523 if (name)
524 puts(name);
525 }
526
527 return 0; /* quit */
528 }
529
530 if (streq(t, "all"))
531 mask = (uint64_t) -1;
532 else {
533 r = capability_from_name(t);
534 if (r < 0)
535 return log_error_errno(r, "Failed to parse capability %s.", t);
536
537 mask |= 1ULL << r;
538 }
539 }
540
541 *ret_mask = mask;
542 return 1; /* continue */
543}
544
49048684 545static int parse_share_ns_env(const char *name, unsigned long ns_flag) {
0c582db0
LB
546 int r;
547
548 r = getenv_bool(name);
549 if (r == -ENXIO)
49048684 550 return 0;
0c582db0 551 if (r < 0)
49048684 552 return log_error_errno(r, "Failed to parse $%s: %m", name);
de40a303 553
0c582db0 554 arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
de40a303 555 arg_settings_mask |= SETTING_CLONE_NS_FLAGS;
49048684 556 return 0;
0c582db0
LB
557}
558
49048684 559static int parse_mount_settings_env(void) {
4f086aab 560 const char *e;
1099ceeb
LP
561 int r;
562
563 r = getenv_bool("SYSTEMD_NSPAWN_TMPFS_TMP");
49048684
ZJS
564 if (r < 0 && r != -ENXIO)
565 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_TMPFS_TMP: %m");
1099ceeb
LP
566 if (r >= 0)
567 SET_FLAG(arg_mount_settings, MOUNT_APPLY_TMPFS_TMP, r > 0);
4f086aab
SU
568
569 e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
49048684 570 if (streq_ptr(e, "network"))
4f086aab 571 arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS;
4f086aab 572
49048684
ZJS
573 else if (e) {
574 r = parse_boolean(e);
575 if (r < 0)
576 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_API_VFS_WRITABLE: %m");
577
578 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_RO, r == 0);
579 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_NETNS, false);
ab8ee0f2 580 }
4f086aab 581
49048684 582 return 0;
4f086aab
SU
583}
584
49048684 585static int parse_environment(void) {
d5455d2f
LP
586 const char *e;
587 int r;
588
49048684
ZJS
589 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
590 if (r < 0)
591 return r;
592 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
593 if (r < 0)
594 return r;
595 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
596 if (r < 0)
597 return r;
598 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
599 if (r < 0)
600 return r;
d5455d2f 601
49048684
ZJS
602 r = parse_mount_settings_env();
603 if (r < 0)
604 return r;
d5455d2f 605
489fae52
ZJS
606 /* SYSTEMD_NSPAWN_USE_CGNS=0 can be used to disable CLONE_NEWCGROUP use,
607 * even if it is supported. If not supported, it has no effect. */
de40a303 608 if (!cg_ns_supported())
489fae52 609 arg_use_cgns = false;
de40a303
LP
610 else {
611 r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
612 if (r < 0) {
613 if (r != -ENXIO)
49048684 614 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_USE_CGNS: %m");
de40a303
LP
615
616 arg_use_cgns = true;
617 } else {
618 arg_use_cgns = r > 0;
619 arg_settings_mask |= SETTING_USE_CGNS;
620 }
621 }
d5455d2f
LP
622
623 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
624 if (e)
625 arg_container_service_name = e;
626
49048684 627 return detect_unified_cgroup_hierarchy_from_environment();
d5455d2f
LP
628}
629
88213476 630static int parse_argv(int argc, char *argv[]) {
a41fe3a2 631 enum {
acbeb427
ZJS
632 ARG_VERSION = 0x100,
633 ARG_PRIVATE_NETWORK,
bc2f673e 634 ARG_UUID,
5076f0cc 635 ARG_READ_ONLY,
57fb9fb5 636 ARG_CAPABILITY,
420c7379 637 ARG_DROP_CAPABILITY,
17fe0523
LP
638 ARG_LINK_JOURNAL,
639 ARG_BIND,
f4889f65 640 ARG_BIND_RO,
06c17c39 641 ARG_TMPFS,
5a8af538
LP
642 ARG_OVERLAY,
643 ARG_OVERLAY_RO,
de40a303 644 ARG_INACCESSIBLE,
eb91eb18 645 ARG_SHARE_SYSTEM,
89f7c846 646 ARG_REGISTER,
aa28aefe 647 ARG_KEEP_UNIT,
69c79d3c 648 ARG_NETWORK_INTERFACE,
c74e630d 649 ARG_NETWORK_MACVLAN,
4bbfe7ad 650 ARG_NETWORK_IPVLAN,
ab046dde 651 ARG_NETWORK_BRIDGE,
22b28dfd 652 ARG_NETWORK_ZONE,
f6d6bad1 653 ARG_NETWORK_VETH_EXTRA,
d7bea6b6 654 ARG_NETWORK_NAMESPACE_PATH,
6afc95b7 655 ARG_PERSONALITY,
4d9f07b4 656 ARG_VOLATILE,
ec16945e 657 ARG_TEMPLATE,
f36933fe 658 ARG_PROPERTY,
6dac160c 659 ARG_PRIVATE_USERS,
c6c8f6e2 660 ARG_KILL_SIGNAL,
f757855e 661 ARG_SETTINGS,
5f932eb9 662 ARG_CHDIR,
b53ede69 663 ARG_PIVOT_ROOT,
7336138e 664 ARG_PRIVATE_USERS_CHOWN,
9c1e04d0 665 ARG_NOTIFY_READY,
4623e8e6 666 ARG_ROOT_HASH,
89e62e0b
LP
667 ARG_ROOT_HASH_SIG,
668 ARG_VERITY_DATA,
960e4569 669 ARG_SYSTEM_CALL_FILTER,
bf428efb 670 ARG_RLIMIT,
3a9530e5 671 ARG_HOSTNAME,
66edd963 672 ARG_NO_NEW_PRIVILEGES,
81f345df 673 ARG_OOM_SCORE_ADJUST,
d107bb7d 674 ARG_CPU_AFFINITY,
09d423e9 675 ARG_RESOLV_CONF,
1688841f 676 ARG_TIMEZONE,
de40a303
LP
677 ARG_CONSOLE,
678 ARG_PIPE,
679 ARG_OCI_BUNDLE,
bb068de0 680 ARG_NO_PAGER,
3652872a
LP
681 ARG_SET_CREDENTIAL,
682 ARG_LOAD_CREDENTIAL,
a41fe3a2
LP
683 };
684
88213476 685 static const struct option options[] = {
d7bea6b6
DP
686 { "help", no_argument, NULL, 'h' },
687 { "version", no_argument, NULL, ARG_VERSION },
688 { "directory", required_argument, NULL, 'D' },
689 { "template", required_argument, NULL, ARG_TEMPLATE },
690 { "ephemeral", no_argument, NULL, 'x' },
691 { "user", required_argument, NULL, 'u' },
692 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
693 { "as-pid2", no_argument, NULL, 'a' },
694 { "boot", no_argument, NULL, 'b' },
695 { "uuid", required_argument, NULL, ARG_UUID },
696 { "read-only", no_argument, NULL, ARG_READ_ONLY },
697 { "capability", required_argument, NULL, ARG_CAPABILITY },
698 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
66edd963 699 { "no-new-privileges", required_argument, NULL, ARG_NO_NEW_PRIVILEGES },
d7bea6b6
DP
700 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
701 { "bind", required_argument, NULL, ARG_BIND },
702 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
703 { "tmpfs", required_argument, NULL, ARG_TMPFS },
704 { "overlay", required_argument, NULL, ARG_OVERLAY },
705 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
de40a303 706 { "inaccessible", required_argument, NULL, ARG_INACCESSIBLE },
d7bea6b6 707 { "machine", required_argument, NULL, 'M' },
3a9530e5 708 { "hostname", required_argument, NULL, ARG_HOSTNAME },
d7bea6b6
DP
709 { "slice", required_argument, NULL, 'S' },
710 { "setenv", required_argument, NULL, 'E' },
711 { "selinux-context", required_argument, NULL, 'Z' },
712 { "selinux-apifs-context", required_argument, NULL, 'L' },
713 { "quiet", no_argument, NULL, 'q' },
714 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */
715 { "register", required_argument, NULL, ARG_REGISTER },
716 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
717 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
718 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
719 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
720 { "network-veth", no_argument, NULL, 'n' },
721 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA },
722 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
723 { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
724 { "network-namespace-path", required_argument, NULL, ARG_NETWORK_NAMESPACE_PATH },
725 { "personality", required_argument, NULL, ARG_PERSONALITY },
726 { "image", required_argument, NULL, 'i' },
727 { "volatile", optional_argument, NULL, ARG_VOLATILE },
728 { "port", required_argument, NULL, 'p' },
729 { "property", required_argument, NULL, ARG_PROPERTY },
730 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
731 { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN },
732 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
733 { "settings", required_argument, NULL, ARG_SETTINGS },
734 { "chdir", required_argument, NULL, ARG_CHDIR },
735 { "pivot-root", required_argument, NULL, ARG_PIVOT_ROOT },
736 { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
737 { "root-hash", required_argument, NULL, ARG_ROOT_HASH },
89e62e0b
LP
738 { "root-hash-sig", required_argument, NULL, ARG_ROOT_HASH_SIG },
739 { "verity-data", required_argument, NULL, ARG_VERITY_DATA },
d7bea6b6 740 { "system-call-filter", required_argument, NULL, ARG_SYSTEM_CALL_FILTER },
bf428efb 741 { "rlimit", required_argument, NULL, ARG_RLIMIT },
81f345df 742 { "oom-score-adjust", required_argument, NULL, ARG_OOM_SCORE_ADJUST },
d107bb7d 743 { "cpu-affinity", required_argument, NULL, ARG_CPU_AFFINITY },
09d423e9 744 { "resolv-conf", required_argument, NULL, ARG_RESOLV_CONF },
1688841f 745 { "timezone", required_argument, NULL, ARG_TIMEZONE },
de40a303
LP
746 { "console", required_argument, NULL, ARG_CONSOLE },
747 { "pipe", no_argument, NULL, ARG_PIPE },
748 { "oci-bundle", required_argument, NULL, ARG_OCI_BUNDLE },
bb068de0 749 { "no-pager", no_argument, NULL, ARG_NO_PAGER },
3652872a
LP
750 { "set-credential", required_argument, NULL, ARG_SET_CREDENTIAL },
751 { "load-credential", required_argument, NULL, ARG_LOAD_CREDENTIAL },
eb9da376 752 {}
88213476
LP
753 };
754
9444b1f2 755 int c, r;
a42c8b54 756 uint64_t plus = 0, minus = 0;
f757855e 757 bool mask_all_settings = false, mask_no_settings = false;
88213476
LP
758
759 assert(argc >= 0);
760 assert(argv);
761
de40a303 762 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nUE:P", options, NULL)) >= 0)
88213476
LP
763 switch (c) {
764
765 case 'h':
37ec0fdd 766 return help();
88213476 767
acbeb427 768 case ARG_VERSION:
3f6fd1ba 769 return version();
acbeb427 770
88213476 771 case 'D':
0f03c2a4 772 r = parse_path_argument_and_warn(optarg, false, &arg_directory);
ec16945e 773 if (r < 0)
0f03c2a4 774 return r;
de40a303
LP
775
776 arg_settings_mask |= SETTING_DIRECTORY;
ec16945e
LP
777 break;
778
779 case ARG_TEMPLATE:
0f03c2a4 780 r = parse_path_argument_and_warn(optarg, false, &arg_template);
ec16945e 781 if (r < 0)
0f03c2a4 782 return r;
de40a303
LP
783
784 arg_settings_mask |= SETTING_DIRECTORY;
88213476
LP
785 break;
786
1b9e5b12 787 case 'i':
0f03c2a4 788 r = parse_path_argument_and_warn(optarg, false, &arg_image);
ec16945e 789 if (r < 0)
0f03c2a4 790 return r;
de40a303
LP
791
792 arg_settings_mask |= SETTING_DIRECTORY;
793 break;
794
795 case ARG_OCI_BUNDLE:
796 r = parse_path_argument_and_warn(optarg, false, &arg_oci_bundle);
797 if (r < 0)
798 return r;
799
ec16945e
LP
800 break;
801
802 case 'x':
803 arg_ephemeral = true;
a2f577fc 804 arg_settings_mask |= SETTING_EPHEMERAL;
1b9e5b12
LP
805 break;
806
687d0825 807 case 'u':
2fc09a9c
DM
808 r = free_and_strdup(&arg_user, optarg);
809 if (r < 0)
7027ff61 810 return log_oom();
687d0825 811
f757855e 812 arg_settings_mask |= SETTING_USER;
687d0825
MV
813 break;
814
22b28dfd
LP
815 case ARG_NETWORK_ZONE: {
816 char *j;
817
b910cc72 818 j = strjoin("vz-", optarg);
22b28dfd
LP
819 if (!j)
820 return log_oom();
821
822 if (!ifname_valid(j)) {
823 log_error("Network zone name not valid: %s", j);
824 free(j);
825 return -EINVAL;
826 }
827
df1fac6d 828 free_and_replace(arg_network_zone, j);
22b28dfd
LP
829
830 arg_network_veth = true;
831 arg_private_network = true;
832 arg_settings_mask |= SETTING_NETWORK;
833 break;
834 }
835
ab046dde 836 case ARG_NETWORK_BRIDGE:
ef76dff2 837
baaa35ad
ZJS
838 if (!ifname_valid(optarg))
839 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
840 "Bridge interface name not valid: %s", optarg);
ef76dff2 841
f757855e
LP
842 r = free_and_strdup(&arg_network_bridge, optarg);
843 if (r < 0)
844 return log_oom();
ab046dde 845
4831981d 846 _fallthrough_;
0dfaa006 847 case 'n':
69c79d3c
LP
848 arg_network_veth = true;
849 arg_private_network = true;
f757855e 850 arg_settings_mask |= SETTING_NETWORK;
69c79d3c
LP
851 break;
852
f6d6bad1
LP
853 case ARG_NETWORK_VETH_EXTRA:
854 r = veth_extra_parse(&arg_network_veth_extra, optarg);
855 if (r < 0)
856 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
857
858 arg_private_network = true;
859 arg_settings_mask |= SETTING_NETWORK;
860 break;
861
aa28aefe 862 case ARG_NETWORK_INTERFACE:
baaa35ad
ZJS
863 if (!ifname_valid(optarg))
864 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
865 "Network interface name not valid: %s", optarg);
ef76dff2 866
b390f178
DDM
867 r = test_network_interface_initialized(optarg);
868 if (r < 0)
869 return r;
870
c74e630d
LP
871 if (strv_extend(&arg_network_interfaces, optarg) < 0)
872 return log_oom();
873
874 arg_private_network = true;
f757855e 875 arg_settings_mask |= SETTING_NETWORK;
c74e630d
LP
876 break;
877
878 case ARG_NETWORK_MACVLAN:
ef76dff2 879
baaa35ad
ZJS
880 if (!ifname_valid(optarg))
881 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
882 "MACVLAN network interface name not valid: %s", optarg);
ef76dff2 883
b390f178
DDM
884 r = test_network_interface_initialized(optarg);
885 if (r < 0)
886 return r;
887
c74e630d 888 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
889 return log_oom();
890
4bbfe7ad 891 arg_private_network = true;
f757855e 892 arg_settings_mask |= SETTING_NETWORK;
4bbfe7ad
TG
893 break;
894
895 case ARG_NETWORK_IPVLAN:
ef76dff2 896
baaa35ad
ZJS
897 if (!ifname_valid(optarg))
898 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
899 "IPVLAN network interface name not valid: %s", optarg);
ef76dff2 900
b390f178
DDM
901 r = test_network_interface_initialized(optarg);
902 if (r < 0)
903 return r;
904
4bbfe7ad
TG
905 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
906 return log_oom();
907
4831981d 908 _fallthrough_;
ff01d048
LP
909 case ARG_PRIVATE_NETWORK:
910 arg_private_network = true;
f757855e 911 arg_settings_mask |= SETTING_NETWORK;
a41fe3a2
LP
912 break;
913
d7bea6b6
DP
914 case ARG_NETWORK_NAMESPACE_PATH:
915 r = parse_path_argument_and_warn(optarg, false, &arg_network_namespace_path);
916 if (r < 0)
917 return r;
918
de40a303 919 arg_settings_mask |= SETTING_NETWORK;
d7bea6b6
DP
920 break;
921
0f0dbc46 922 case 'b':
baaa35ad
ZJS
923 if (arg_start_mode == START_PID2)
924 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
925 "--boot and --as-pid2 may not be combined.");
7732f92b
LP
926
927 arg_start_mode = START_BOOT;
928 arg_settings_mask |= SETTING_START_MODE;
929 break;
930
931 case 'a':
baaa35ad
ZJS
932 if (arg_start_mode == START_BOOT)
933 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
934 "--boot and --as-pid2 may not be combined.");
7732f92b
LP
935
936 arg_start_mode = START_PID2;
937 arg_settings_mask |= SETTING_START_MODE;
0f0dbc46
LP
938 break;
939
144f0fc0 940 case ARG_UUID:
9444b1f2 941 r = sd_id128_from_string(optarg, &arg_uuid);
317feb4d
LP
942 if (r < 0)
943 return log_error_errno(r, "Invalid UUID: %s", optarg);
944
baaa35ad
ZJS
945 if (sd_id128_is_null(arg_uuid))
946 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
947 "Machine UUID may not be all zeroes.");
f757855e
LP
948
949 arg_settings_mask |= SETTING_MACHINE_ID;
9444b1f2 950 break;
aa96c6cb 951
43c3fb46
LP
952 case 'S': {
953 _cleanup_free_ char *mangled = NULL;
954
955 r = unit_name_mangle_with_suffix(optarg, NULL, UNIT_NAME_MANGLE_WARN, ".slice", &mangled);
de40a303
LP
956 if (r < 0)
957 return log_oom();
958
43c3fb46 959 free_and_replace(arg_slice, mangled);
de40a303 960 arg_settings_mask |= SETTING_SLICE;
144f0fc0 961 break;
43c3fb46 962 }
144f0fc0 963
7027ff61 964 case 'M':
c1521918 965 if (isempty(optarg))
97b11eed 966 arg_machine = mfree(arg_machine);
c1521918 967 else {
baaa35ad
ZJS
968 if (!machine_name_is_valid(optarg))
969 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
970 "Invalid machine name: %s", optarg);
7027ff61 971
0c3c4284
LP
972 r = free_and_strdup(&arg_machine, optarg);
973 if (r < 0)
eb91eb18 974 return log_oom();
eb91eb18 975 }
9ce6d1b3 976 break;
7027ff61 977
3a9530e5
LP
978 case ARG_HOSTNAME:
979 if (isempty(optarg))
980 arg_hostname = mfree(arg_hostname);
981 else {
baaa35ad
ZJS
982 if (!hostname_is_valid(optarg, false))
983 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
984 "Invalid hostname: %s", optarg);
3a9530e5
LP
985
986 r = free_and_strdup(&arg_hostname, optarg);
987 if (r < 0)
988 return log_oom();
989 }
990
991 arg_settings_mask |= SETTING_HOSTNAME;
992 break;
993
82adf6af
LP
994 case 'Z':
995 arg_selinux_context = optarg;
a8828ed9
DW
996 break;
997
82adf6af
LP
998 case 'L':
999 arg_selinux_apifs_context = optarg;
a8828ed9
DW
1000 break;
1001
bc2f673e
LP
1002 case ARG_READ_ONLY:
1003 arg_read_only = true;
f757855e 1004 arg_settings_mask |= SETTING_READ_ONLY;
bc2f673e
LP
1005 break;
1006
420c7379
LP
1007 case ARG_CAPABILITY:
1008 case ARG_DROP_CAPABILITY: {
8a99bd0c
ZJS
1009 uint64_t m;
1010 r = parse_capability_spec(optarg, &m);
1011 if (r <= 0)
1012 return r;
5076f0cc 1013
8a99bd0c
ZJS
1014 if (c == ARG_CAPABILITY)
1015 plus |= m;
1016 else
1017 minus |= m;
f757855e 1018 arg_settings_mask |= SETTING_CAPABILITY;
5076f0cc
LP
1019 break;
1020 }
66edd963
LP
1021 case ARG_NO_NEW_PRIVILEGES:
1022 r = parse_boolean(optarg);
1023 if (r < 0)
1024 return log_error_errno(r, "Failed to parse --no-new-privileges= argument: %s", optarg);
1025
1026 arg_no_new_privileges = r;
1027 arg_settings_mask |= SETTING_NO_NEW_PRIVILEGES;
1028 break;
1029
57fb9fb5
LP
1030 case 'j':
1031 arg_link_journal = LINK_GUEST;
574edc90 1032 arg_link_journal_try = true;
4e1d6aa9 1033 arg_settings_mask |= SETTING_LINK_JOURNAL;
57fb9fb5
LP
1034 break;
1035
1036 case ARG_LINK_JOURNAL:
4e1d6aa9 1037 r = parse_link_journal(optarg, &arg_link_journal, &arg_link_journal_try);
c6147113
LP
1038 if (r < 0)
1039 return log_error_errno(r, "Failed to parse link journal mode %s", optarg);
57fb9fb5 1040
4e1d6aa9 1041 arg_settings_mask |= SETTING_LINK_JOURNAL;
57fb9fb5
LP
1042 break;
1043
17fe0523 1044 case ARG_BIND:
f757855e
LP
1045 case ARG_BIND_RO:
1046 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
1047 if (r < 0)
1048 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
17fe0523 1049
f757855e 1050 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
17fe0523 1051 break;
06c17c39 1052
f757855e
LP
1053 case ARG_TMPFS:
1054 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
1055 if (r < 0)
1056 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
5a8af538 1057
f757855e 1058 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
5a8af538 1059 break;
5a8af538
LP
1060
1061 case ARG_OVERLAY:
ad85779a
LP
1062 case ARG_OVERLAY_RO:
1063 r = overlay_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_OVERLAY_RO);
1064 if (r == -EADDRNOTAVAIL)
1065 return log_error_errno(r, "--overlay(-ro)= needs at least two colon-separated directories specified.");
1066 if (r < 0)
1067 return log_error_errno(r, "Failed to parse --overlay(-ro)= argument %s: %m", optarg);
06c17c39 1068
f757855e 1069 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
06c17c39 1070 break;
06c17c39 1071
de40a303
LP
1072 case ARG_INACCESSIBLE:
1073 r = inaccessible_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
1074 if (r < 0)
1075 return log_error_errno(r, "Failed to parse --inaccessible= argument %s: %m", optarg);
1076
1077 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1078 break;
1079
a5f1cb3b 1080 case 'E': {
f4889f65
LP
1081 char **n;
1082
baaa35ad
ZJS
1083 if (!env_assignment_is_valid(optarg))
1084 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1085 "Environment variable assignment '%s' is not valid.", optarg);
f4889f65
LP
1086
1087 n = strv_env_set(arg_setenv, optarg);
1088 if (!n)
1089 return log_oom();
1090
130d3d22 1091 strv_free_and_replace(arg_setenv, n);
f757855e 1092 arg_settings_mask |= SETTING_ENVIRONMENT;
f4889f65
LP
1093 break;
1094 }
1095
284c0b91
LP
1096 case 'q':
1097 arg_quiet = true;
1098 break;
1099
8a96d94e 1100 case ARG_SHARE_SYSTEM:
a6b5216c 1101 /* We don't officially support this anymore, except for compat reasons. People should use the
0c582db0 1102 * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
63d1c29f 1103 log_warning("Please do not use --share-system anymore, use $SYSTEMD_NSPAWN_SHARE_* instead.");
0c582db0 1104 arg_clone_ns_flags = 0;
8a96d94e
LP
1105 break;
1106
eb91eb18
LP
1107 case ARG_REGISTER:
1108 r = parse_boolean(optarg);
1109 if (r < 0) {
1110 log_error("Failed to parse --register= argument: %s", optarg);
1111 return r;
1112 }
1113
1114 arg_register = r;
1115 break;
1116
89f7c846
LP
1117 case ARG_KEEP_UNIT:
1118 arg_keep_unit = true;
1119 break;
1120
6afc95b7
LP
1121 case ARG_PERSONALITY:
1122
ac45f971 1123 arg_personality = personality_from_string(optarg);
baaa35ad
ZJS
1124 if (arg_personality == PERSONALITY_INVALID)
1125 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1126 "Unknown or unsupported personality '%s'.", optarg);
6afc95b7 1127
f757855e 1128 arg_settings_mask |= SETTING_PERSONALITY;
6afc95b7
LP
1129 break;
1130
4d9f07b4
LP
1131 case ARG_VOLATILE:
1132
1133 if (!optarg)
f757855e 1134 arg_volatile_mode = VOLATILE_YES;
5c828e66
LP
1135 else if (streq(optarg, "help")) {
1136 DUMP_STRING_TABLE(volatile_mode, VolatileMode, _VOLATILE_MODE_MAX);
1137 return 0;
1138 } else {
f757855e 1139 VolatileMode m;
4d9f07b4 1140
f757855e 1141 m = volatile_mode_from_string(optarg);
baaa35ad
ZJS
1142 if (m < 0)
1143 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1144 "Failed to parse --volatile= argument: %s", optarg);
1145 else
f757855e 1146 arg_volatile_mode = m;
6d0b55c2
LP
1147 }
1148
f757855e
LP
1149 arg_settings_mask |= SETTING_VOLATILE_MODE;
1150 break;
6d0b55c2 1151
f757855e
LP
1152 case 'p':
1153 r = expose_port_parse(&arg_expose_ports, optarg);
1154 if (r == -EEXIST)
1155 return log_error_errno(r, "Duplicate port specification: %s", optarg);
1156 if (r < 0)
1157 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
6d0b55c2 1158
f757855e 1159 arg_settings_mask |= SETTING_EXPOSE_PORTS;
6d0b55c2 1160 break;
6d0b55c2 1161
f36933fe
LP
1162 case ARG_PROPERTY:
1163 if (strv_extend(&arg_property, optarg) < 0)
1164 return log_oom();
1165
1166 break;
1167
ae209204
ZJS
1168 case ARG_PRIVATE_USERS: {
1169 int boolean = -1;
0de7acce 1170
ae209204
ZJS
1171 if (!optarg)
1172 boolean = true;
1173 else if (!in_charset(optarg, DIGITS))
1174 /* do *not* parse numbers as booleans */
1175 boolean = parse_boolean(optarg);
1176
1177 if (boolean == false) {
0de7acce
LP
1178 /* no: User namespacing off */
1179 arg_userns_mode = USER_NAMESPACE_NO;
1180 arg_uid_shift = UID_INVALID;
1181 arg_uid_range = UINT32_C(0x10000);
ae209204 1182 } else if (boolean == true) {
0de7acce
LP
1183 /* yes: User namespacing on, UID range is read from root dir */
1184 arg_userns_mode = USER_NAMESPACE_FIXED;
1185 arg_uid_shift = UID_INVALID;
1186 arg_uid_range = UINT32_C(0x10000);
1187 } else if (streq(optarg, "pick")) {
1188 /* pick: User namespacing on, UID range is picked randomly */
1189 arg_userns_mode = USER_NAMESPACE_PICK;
1190 arg_uid_shift = UID_INVALID;
1191 arg_uid_range = UINT32_C(0x10000);
1192 } else {
6c2058b3 1193 _cleanup_free_ char *buffer = NULL;
6dac160c
LP
1194 const char *range, *shift;
1195
0de7acce
LP
1196 /* anything else: User namespacing on, UID range is explicitly configured */
1197
6dac160c
LP
1198 range = strchr(optarg, ':');
1199 if (range) {
6c2058b3
ZJS
1200 buffer = strndup(optarg, range - optarg);
1201 if (!buffer)
1202 return log_oom();
1203 shift = buffer;
6dac160c
LP
1204
1205 range++;
bfd292ec
ZJS
1206 r = safe_atou32(range, &arg_uid_range);
1207 if (r < 0)
be715731 1208 return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range);
6dac160c
LP
1209 } else
1210 shift = optarg;
1211
be715731
ZJS
1212 r = parse_uid(shift, &arg_uid_shift);
1213 if (r < 0)
1214 return log_error_errno(r, "Failed to parse UID \"%s\": %m", optarg);
0de7acce
LP
1215
1216 arg_userns_mode = USER_NAMESPACE_FIXED;
6dac160c
LP
1217 }
1218
baaa35ad
ZJS
1219 if (arg_uid_range <= 0)
1220 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1221 "UID range cannot be 0.");
be715731 1222
0de7acce 1223 arg_settings_mask |= SETTING_USERNS;
6dac160c 1224 break;
ae209204 1225 }
6dac160c 1226
0de7acce 1227 case 'U':
ccabee0d
LP
1228 if (userns_supported()) {
1229 arg_userns_mode = USER_NAMESPACE_PICK;
1230 arg_uid_shift = UID_INVALID;
1231 arg_uid_range = UINT32_C(0x10000);
1232
1233 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
1234 }
1235
7336138e
LP
1236 break;
1237
0de7acce 1238 case ARG_PRIVATE_USERS_CHOWN:
19aac838 1239 arg_userns_chown = true;
0de7acce
LP
1240
1241 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
1242 break;
1243
c6c8f6e2 1244 case ARG_KILL_SIGNAL:
5c828e66
LP
1245 if (streq(optarg, "help")) {
1246 DUMP_STRING_TABLE(signal, int, _NSIG);
1247 return 0;
1248 }
1249
29a3db75 1250 arg_kill_signal = signal_from_string(optarg);
baaa35ad
ZJS
1251 if (arg_kill_signal < 0)
1252 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1253 "Cannot parse signal: %s", optarg);
c6c8f6e2 1254
f757855e
LP
1255 arg_settings_mask |= SETTING_KILL_SIGNAL;
1256 break;
1257
1258 case ARG_SETTINGS:
1259
1260 /* no → do not read files
1261 * yes → read files, do not override cmdline, trust only subset
1262 * override → read files, override cmdline, trust only subset
1263 * trusted → read files, do not override cmdline, trust all
1264 */
1265
1266 r = parse_boolean(optarg);
1267 if (r < 0) {
1268 if (streq(optarg, "trusted")) {
1269 mask_all_settings = false;
1270 mask_no_settings = false;
1271 arg_settings_trusted = true;
1272
1273 } else if (streq(optarg, "override")) {
1274 mask_all_settings = false;
1275 mask_no_settings = true;
1276 arg_settings_trusted = -1;
1277 } else
1278 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
1279 } else if (r > 0) {
1280 /* yes */
1281 mask_all_settings = false;
1282 mask_no_settings = false;
1283 arg_settings_trusted = -1;
1284 } else {
1285 /* no */
1286 mask_all_settings = true;
1287 mask_no_settings = false;
1288 arg_settings_trusted = false;
1289 }
1290
c6c8f6e2
LP
1291 break;
1292
5f932eb9 1293 case ARG_CHDIR:
baaa35ad
ZJS
1294 if (!path_is_absolute(optarg))
1295 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1296 "Working directory %s is not an absolute path.", optarg);
5f932eb9
LP
1297
1298 r = free_and_strdup(&arg_chdir, optarg);
1299 if (r < 0)
1300 return log_oom();
1301
1302 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
1303 break;
1304
b53ede69
PW
1305 case ARG_PIVOT_ROOT:
1306 r = pivot_root_parse(&arg_pivot_root_new, &arg_pivot_root_old, optarg);
1307 if (r < 0)
1308 return log_error_errno(r, "Failed to parse --pivot-root= argument %s: %m", optarg);
1309
1310 arg_settings_mask |= SETTING_PIVOT_ROOT;
1311 break;
1312
9c1e04d0
AP
1313 case ARG_NOTIFY_READY:
1314 r = parse_boolean(optarg);
baaa35ad
ZJS
1315 if (r < 0)
1316 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1317 "%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
9c1e04d0
AP
1318 arg_notify_ready = r;
1319 arg_settings_mask |= SETTING_NOTIFY_READY;
1320 break;
1321
4623e8e6 1322 case ARG_ROOT_HASH: {
89e62e0b 1323 _cleanup_free_ void *k = NULL;
4623e8e6
LP
1324 size_t l;
1325
1326 r = unhexmem(optarg, strlen(optarg), &k, &l);
1327 if (r < 0)
1328 return log_error_errno(r, "Failed to parse root hash: %s", optarg);
89e62e0b 1329 if (l < sizeof(sd_id128_t))
c6147113 1330 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Root hash must be at least 128bit long: %s", optarg);
4623e8e6 1331
89e62e0b
LP
1332 free_and_replace(arg_verity_settings.root_hash, k);
1333 arg_verity_settings.root_hash_size = l;
4623e8e6
LP
1334 break;
1335 }
1336
c2923fdc
LB
1337 case ARG_ROOT_HASH_SIG: {
1338 char *value;
89e62e0b
LP
1339 size_t l;
1340 void *p;
c2923fdc
LB
1341
1342 if ((value = startswith(optarg, "base64:"))) {
c2923fdc
LB
1343 r = unbase64mem(value, strlen(value), &p, &l);
1344 if (r < 0)
1345 return log_error_errno(r, "Failed to parse root hash signature '%s': %m", optarg);
1346
c2923fdc 1347 } else {
89e62e0b 1348 r = read_full_file(optarg, (char**) &p, &l);
c2923fdc 1349 if (r < 0)
89e62e0b 1350 return log_error_errno(r, "Failed parse root hash signature file '%s': %m", optarg);
c2923fdc
LB
1351 }
1352
89e62e0b
LP
1353 free_and_replace(arg_verity_settings.root_hash_sig, p);
1354 arg_verity_settings.root_hash_sig_size = l;
c2923fdc
LB
1355 break;
1356 }
1357
89e62e0b
LP
1358 case ARG_VERITY_DATA:
1359 r = parse_path_argument_and_warn(optarg, false, &arg_verity_settings.data_path);
1360 if (r < 0)
1361 return r;
1362 break;
1363
960e4569
LP
1364 case ARG_SYSTEM_CALL_FILTER: {
1365 bool negative;
1366 const char *items;
1367
1368 negative = optarg[0] == '~';
1369 items = negative ? optarg + 1 : optarg;
1370
1371 for (;;) {
1372 _cleanup_free_ char *word = NULL;
1373
1374 r = extract_first_word(&items, &word, NULL, 0);
1375 if (r == 0)
1376 break;
1377 if (r == -ENOMEM)
1378 return log_oom();
1379 if (r < 0)
1380 return log_error_errno(r, "Failed to parse system call filter: %m");
1381
1382 if (negative)
6b000af4 1383 r = strv_extend(&arg_syscall_deny_list, word);
960e4569 1384 else
6b000af4 1385 r = strv_extend(&arg_syscall_allow_list, word);
960e4569
LP
1386 if (r < 0)
1387 return log_oom();
1388 }
1389
1390 arg_settings_mask |= SETTING_SYSCALL_FILTER;
1391 break;
1392 }
1393
bf428efb
LP
1394 case ARG_RLIMIT: {
1395 const char *eq;
622ecfa8 1396 _cleanup_free_ char *name = NULL;
bf428efb
LP
1397 int rl;
1398
5c828e66
LP
1399 if (streq(optarg, "help")) {
1400 DUMP_STRING_TABLE(rlimit, int, _RLIMIT_MAX);
1401 return 0;
1402 }
1403
bf428efb 1404 eq = strchr(optarg, '=');
baaa35ad
ZJS
1405 if (!eq)
1406 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1407 "--rlimit= expects an '=' assignment.");
bf428efb
LP
1408
1409 name = strndup(optarg, eq - optarg);
1410 if (!name)
1411 return log_oom();
1412
1413 rl = rlimit_from_string_harder(name);
baaa35ad
ZJS
1414 if (rl < 0)
1415 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1416 "Unknown resource limit: %s", name);
bf428efb
LP
1417
1418 if (!arg_rlimit[rl]) {
1419 arg_rlimit[rl] = new0(struct rlimit, 1);
1420 if (!arg_rlimit[rl])
1421 return log_oom();
1422 }
1423
1424 r = rlimit_parse(rl, eq + 1, arg_rlimit[rl]);
1425 if (r < 0)
1426 return log_error_errno(r, "Failed to parse resource limit: %s", eq + 1);
1427
1428 arg_settings_mask |= SETTING_RLIMIT_FIRST << rl;
1429 break;
1430 }
1431
81f345df
LP
1432 case ARG_OOM_SCORE_ADJUST:
1433 r = parse_oom_score_adjust(optarg, &arg_oom_score_adjust);
1434 if (r < 0)
1435 return log_error_errno(r, "Failed to parse --oom-score-adjust= parameter: %s", optarg);
1436
1437 arg_oom_score_adjust_set = true;
1438 arg_settings_mask |= SETTING_OOM_SCORE_ADJUST;
1439 break;
1440
d107bb7d 1441 case ARG_CPU_AFFINITY: {
0985c7c4 1442 CPUSet cpuset;
d107bb7d
LP
1443
1444 r = parse_cpu_set(optarg, &cpuset);
1445 if (r < 0)
0985c7c4 1446 return log_error_errno(r, "Failed to parse CPU affinity mask %s: %m", optarg);
d107bb7d 1447
0985c7c4
ZJS
1448 cpu_set_reset(&arg_cpu_set);
1449 arg_cpu_set = cpuset;
d107bb7d
LP
1450 arg_settings_mask |= SETTING_CPU_AFFINITY;
1451 break;
1452 }
1453
09d423e9
LP
1454 case ARG_RESOLV_CONF:
1455 if (streq(optarg, "help")) {
1456 DUMP_STRING_TABLE(resolv_conf_mode, ResolvConfMode, _RESOLV_CONF_MODE_MAX);
1457 return 0;
1458 }
1459
1460 arg_resolv_conf = resolv_conf_mode_from_string(optarg);
baaa35ad
ZJS
1461 if (arg_resolv_conf < 0)
1462 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1463 "Failed to parse /etc/resolv.conf mode: %s", optarg);
09d423e9
LP
1464
1465 arg_settings_mask |= SETTING_RESOLV_CONF;
1466 break;
1467
1688841f
LP
1468 case ARG_TIMEZONE:
1469 if (streq(optarg, "help")) {
1470 DUMP_STRING_TABLE(timezone_mode, TimezoneMode, _TIMEZONE_MODE_MAX);
1471 return 0;
1472 }
1473
1474 arg_timezone = timezone_mode_from_string(optarg);
baaa35ad
ZJS
1475 if (arg_timezone < 0)
1476 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1477 "Failed to parse /etc/localtime mode: %s", optarg);
1688841f
LP
1478
1479 arg_settings_mask |= SETTING_TIMEZONE;
1480 break;
1481
de40a303 1482 case ARG_CONSOLE:
dce66ffe
ZJS
1483 r = handle_arg_console(optarg);
1484 if (r <= 0)
1485 return r;
de40a303
LP
1486 break;
1487
1488 case 'P':
1489 case ARG_PIPE:
dce66ffe
ZJS
1490 r = handle_arg_console("pipe");
1491 if (r <= 0)
1492 return r;
de40a303
LP
1493 break;
1494
bb068de0
ZJS
1495 case ARG_NO_PAGER:
1496 arg_pager_flags |= PAGER_DISABLE;
1497 break;
1498
3652872a
LP
1499 case ARG_SET_CREDENTIAL: {
1500 _cleanup_free_ char *word = NULL, *data = NULL;
1501 const char *p = optarg;
1502 Credential *a;
1503 size_t i;
1504 int l;
1505
1506 r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
1507 if (r == -ENOMEM)
1508 return log_oom();
1509 if (r < 0)
1510 return log_error_errno(r, "Failed to parse --set-credential= parameter: %m");
1511 if (r == 0 || !p)
1512 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Missing value for --set-credential=: %s", optarg);
1513
1514 if (!credential_name_valid(word))
1515 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Credential name is not valid: %s", word);
1516
1517 for (i = 0; i < arg_n_credentials; i++)
1518 if (streq(arg_credentials[i].id, word))
1519 return log_error_errno(SYNTHETIC_ERRNO(EEXIST), "Duplicate credential '%s', refusing.", word);
1520
1521 l = cunescape(p, UNESCAPE_ACCEPT_NUL, &data);
1522 if (l < 0)
1523 return log_error_errno(l, "Failed to unescape credential data: %s", p);
1524
1525 a = reallocarray(arg_credentials, arg_n_credentials + 1, sizeof(Credential));
1526 if (!a)
1527 return log_oom();
1528
1529 a[arg_n_credentials++] = (Credential) {
1530 .id = TAKE_PTR(word),
1531 .data = TAKE_PTR(data),
1532 .size = l,
1533 };
1534
1535 arg_credentials = a;
1536
1537 arg_settings_mask |= SETTING_CREDENTIALS;
1538 break;
1539 }
1540
1541 case ARG_LOAD_CREDENTIAL: {
1542 ReadFullFileFlags flags = READ_FULL_FILE_SECURE;
1543 _cleanup_(erase_and_freep) char *data = NULL;
1544 _cleanup_free_ char *word = NULL, *j = NULL;
1545 const char *p = optarg;
1546 Credential *a;
1547 size_t size, i;
1548
1549 r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
1550 if (r == -ENOMEM)
1551 return log_oom();
1552 if (r < 0)
1553 return log_error_errno(r, "Failed to parse --set-credential= parameter: %m");
1554 if (r == 0 || !p)
1555 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Missing value for --set-credential=: %s", optarg);
1556
1557 if (!credential_name_valid(word))
1558 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Credential name is not valid: %s", word);
1559
1560 for (i = 0; i < arg_n_credentials; i++)
1561 if (streq(arg_credentials[i].id, word))
1562 return log_error_errno(SYNTHETIC_ERRNO(EEXIST), "Duplicate credential '%s', refusing.", word);
1563
1564 if (path_is_absolute(p))
1565 flags |= READ_FULL_FILE_CONNECT_SOCKET;
1566 else {
1567 const char *e;
1568
1569 e = getenv("CREDENTIALS_DIRECTORY");
1570 if (!e)
1571 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Credential not available (no credentials passed at all): %s", word);
1572
1573 j = path_join(e, p);
1574 if (!j)
1575 return log_oom();
1576 }
1577
1578 r = read_full_file_full(AT_FDCWD, j ?: p, flags, &data, &size);
1579 if (r < 0)
1580 return log_error_errno(r, "Failed to read credential '%s': %m", j ?: p);
1581
1582 a = reallocarray(arg_credentials, arg_n_credentials + 1, sizeof(Credential));
1583 if (!a)
1584 return log_oom();
1585
1586 a[arg_n_credentials++] = (Credential) {
1587 .id = TAKE_PTR(word),
1588 .data = TAKE_PTR(data),
1589 .size = size,
1590 };
1591
1592 arg_credentials = a;
1593
1594 arg_settings_mask |= SETTING_CREDENTIALS;
1595 break;
1596 }
1597
88213476
LP
1598 case '?':
1599 return -EINVAL;
1600
1601 default:
eb9da376 1602 assert_not_reached("Unhandled option");
88213476 1603 }
88213476 1604
60f1ec13
LP
1605 if (argc > optind) {
1606 strv_free(arg_parameters);
1607 arg_parameters = strv_copy(argv + optind);
1608 if (!arg_parameters)
1609 return log_oom();
d7bea6b6 1610
60f1ec13
LP
1611 arg_settings_mask |= SETTING_START_MODE;
1612 }
1613
1614 if (arg_ephemeral && arg_template && !arg_directory)
1615 /* User asked for ephemeral execution but specified --template= instead of --directory=. Semantically
1616 * such an invocation makes some sense, see https://github.com/systemd/systemd/issues/3667. Let's
1617 * accept this here, and silently make "--ephemeral --template=" equivalent to "--ephemeral
1618 * --directory=". */
1619 arg_directory = TAKE_PTR(arg_template);
1620
bd4b15f2 1621 arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? UINT64_C(1) << CAP_NET_ADMIN : 0)) & ~minus;
60f1ec13 1622
de40a303 1623 /* Make sure to parse environment before we reset the settings mask below */
49048684
ZJS
1624 r = parse_environment();
1625 if (r < 0)
1626 return r;
de40a303 1627
60f1ec13
LP
1628 /* Load all settings from .nspawn files */
1629 if (mask_no_settings)
1630 arg_settings_mask = 0;
1631
1632 /* Don't load any settings from .nspawn files */
1633 if (mask_all_settings)
1634 arg_settings_mask = _SETTINGS_MASK_ALL;
1635
1636 return 1;
1637}
1638
1639static int verify_arguments(void) {
1640 int r;
a6b5216c 1641
75b0d8b8
ZJS
1642 if (arg_start_mode == START_PID2 && arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
1643 /* If we are running the stub init in the container, we don't need to look at what the init
1644 * in the container supports, because we are not using it. Let's immediately pick the right
1645 * setting based on the host system configuration.
1646 *
1647 * We only do this, if the user didn't use an environment variable to override the detection.
1648 */
1649
1650 r = cg_all_unified();
1651 if (r < 0)
1652 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
1653 if (r > 0)
1654 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
1655 else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0)
1656 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
1657 else
1658 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
1659 }
1660
4f086aab
SU
1661 if (arg_userns_mode != USER_NAMESPACE_NO)
1662 arg_mount_settings |= MOUNT_USE_USERNS;
1663
1664 if (arg_private_network)
1665 arg_mount_settings |= MOUNT_APPLY_APIVFS_NETNS;
1666
48a8d337
LB
1667 if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
1668 !(arg_clone_ns_flags & CLONE_NEWUTS)) {
eb91eb18 1669 arg_register = false;
baaa35ad 1670 if (arg_start_mode != START_PID1)
60f1ec13 1671 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--boot cannot be used without namespacing.");
0c582db0 1672 }
eb91eb18 1673
0de7acce 1674 if (arg_userns_mode == USER_NAMESPACE_PICK)
0e7ac751
LP
1675 arg_userns_chown = true;
1676
60f1ec13
LP
1677 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
1678 arg_kill_signal = SIGRTMIN+3;
1679
e5a4bb0d
LP
1680 if (arg_volatile_mode != VOLATILE_NO) /* Make sure all file systems contained in the image are mounted read-only if we are in volatile mode */
1681 arg_read_only = true;
1682
2436ea76
DDM
1683 if (has_custom_root_mount(arg_custom_mounts, arg_n_custom_mounts))
1684 arg_read_only = true;
1685
baaa35ad 1686 if (arg_keep_unit && arg_register && cg_pid_get_owner_uid(0, NULL) >= 0)
8d9c2bca
AJ
1687 /* Save the user from accidentally registering either user-$SESSION.scope or user@.service.
1688 * The latter is not technically a user session, but we don't need to labour the point. */
60f1ec13 1689 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--keep-unit --register=yes may not be used when invoked from a user session.");
89f7c846 1690
baaa35ad 1691 if (arg_directory && arg_image)
60f1ec13 1692 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--directory= and --image= may not be combined.");
1b9e5b12 1693
baaa35ad 1694 if (arg_template && arg_image)
60f1ec13 1695 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= and --image= may not be combined.");
8cd328d8 1696
baaa35ad 1697 if (arg_template && !(arg_directory || arg_machine))
60f1ec13 1698 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= needs --directory= or --machine=.");
ec16945e 1699
baaa35ad 1700 if (arg_ephemeral && arg_template)
60f1ec13 1701 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --template= may not be combined.");
ec16945e 1702
baaa35ad 1703 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO))
60f1ec13 1704 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --link-journal= may not be combined.");
df9a75e4 1705
baaa35ad 1706 if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported())
60f1ec13 1707 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--private-users= is not supported, kernel compiled without user namespace support.");
7336138e 1708
baaa35ad 1709 if (arg_userns_chown && arg_read_only)
de40a303
LP
1710 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1711 "--read-only and --private-users-chown may not be combined.");
f757855e 1712
e5a4bb0d
LP
1713 /* We don't support --private-users-chown together with any of the volatile modes since we couldn't
1714 * change the read-only part of the tree (i.e. /usr) anyway, or because it would trigger a massive
5238e957 1715 * copy-up (in case of overlay) making the entire exercise pointless. */
e5a4bb0d
LP
1716 if (arg_userns_chown && arg_volatile_mode != VOLATILE_NO)
1717 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--volatile= and --private-users-chown may not be combined.");
1718
679ecd36
SZ
1719 /* If --network-namespace-path is given with any other network-related option (except --private-network),
1720 * we need to error out, to avoid conflicts between different network options. */
60f1ec13
LP
1721 if (arg_network_namespace_path &&
1722 (arg_network_interfaces || arg_network_macvlan ||
1723 arg_network_ipvlan || arg_network_veth_extra ||
1724 arg_network_bridge || arg_network_zone ||
679ecd36 1725 arg_network_veth))
de40a303 1726 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--network-namespace-path= cannot be combined with other network options.");
86c0dd4a 1727
60f1ec13 1728 if (arg_network_bridge && arg_network_zone)
de40a303
LP
1729 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1730 "--network-bridge= and --network-zone= may not be combined.");
f757855e 1731
baaa35ad 1732 if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network)
60f1ec13 1733 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
4f086aab 1734
baaa35ad 1735 if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO))
60f1ec13 1736 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot combine --private-users with read-write mounts.");
f757855e 1737
baaa35ad 1738 if (arg_expose_ports && !arg_private_network)
60f1ec13 1739 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot use --port= without private networking.");
6d0b55c2 1740
349cc4a5 1741#if ! HAVE_LIBIPTC
baaa35ad 1742 if (arg_expose_ports)
60f1ec13 1743 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--port= is not supported, compiled without libiptc support.");
1c1ea217
EV
1744#endif
1745
60f1ec13
LP
1746 r = custom_mount_check_all();
1747 if (r < 0)
1748 return r;
c6c8f6e2 1749
f757855e 1750 return 0;
88213476
LP
1751}
1752
03cfe0d5
LP
1753static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1754 assert(p);
1755
0de7acce 1756 if (arg_userns_mode == USER_NAMESPACE_NO)
03cfe0d5
LP
1757 return 0;
1758
1759 if (uid == UID_INVALID && gid == GID_INVALID)
1760 return 0;
1761
1762 if (uid != UID_INVALID) {
1763 uid += arg_uid_shift;
1764
1765 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1766 return -EOVERFLOW;
1767 }
1768
1769 if (gid != GID_INVALID) {
1770 gid += (gid_t) arg_uid_shift;
1771
1772 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1773 return -EOVERFLOW;
1774 }
1775
1776 if (lchown(p, uid, gid) < 0)
1777 return -errno;
b12afc8c
LP
1778
1779 return 0;
1780}
1781
03cfe0d5
LP
/* Creates the directory 'path' below 'root' with the given mode and, if it was newly created, hands
 * it to the (shifted) uid/gid via userns_lchown(). A directory that already exists is left
 * untouched and counts as success. Returns 0 on success, a negative errno-style value otherwise. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;
        int r;

        full = prefix_roota(root, path);

        r = mkdir_errno_wrapper(full, mode);
        if (r < 0)
                /* Pre-existing directories are fine, but we do not re-chown them */
                return r == -EEXIST ? 0 : r;

        return userns_lchown(full, uid, gid);
}
1795
/* Matches 'path' against the two zoneinfo directory prefixes we recognize (relative and absolute)
 * and returns whatever PATH_STARTSWITH_SET() yields. Callers treat a NULL result as "not a zoneinfo
 * path" and a non-NULL result as the zone name to append to ".../usr/share/zoneinfo/". */
static const char *timezone_from_path(const char *path) {
        const char *zone;

        zone = PATH_STARTSWITH_SET(
                        path,
                        "../usr/share/zoneinfo/",
                        "/usr/share/zoneinfo/");

        return zone;
}
1802
83205269
LP
1803static bool etc_writable(void) {
1804 return !arg_read_only || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_OVERLAY);
1805}
1806
e58a1277 1807static int setup_timezone(const char *dest) {
1688841f
LP
1808 _cleanup_free_ char *p = NULL, *etc = NULL;
1809 const char *where, *check;
1810 TimezoneMode m;
d4036145 1811 int r;
f8440af5 1812
e58a1277
LP
1813 assert(dest);
1814
1688841f 1815 if (IN_SET(arg_timezone, TIMEZONE_AUTO, TIMEZONE_SYMLINK)) {
1688841f
LP
1816 r = readlink_malloc("/etc/localtime", &p);
1817 if (r == -ENOENT && arg_timezone == TIMEZONE_AUTO)
83205269 1818 m = etc_writable() ? TIMEZONE_DELETE : TIMEZONE_OFF;
1688841f 1819 else if (r == -EINVAL && arg_timezone == TIMEZONE_AUTO) /* regular file? */
83205269 1820 m = etc_writable() ? TIMEZONE_COPY : TIMEZONE_BIND;
1688841f
LP
1821 else if (r < 0) {
1822 log_warning_errno(r, "Failed to read host's /etc/localtime symlink, not updating container timezone: %m");
1823 /* To handle warning, delete /etc/localtime and replace it with a symbolic link to a time zone data
1824 * file.
1825 *
1826 * Example:
1827 * ln -s /usr/share/zoneinfo/UTC /etc/localtime
1828 */
1829 return 0;
1830 } else if (arg_timezone == TIMEZONE_AUTO)
83205269 1831 m = etc_writable() ? TIMEZONE_SYMLINK : TIMEZONE_BIND;
1688841f
LP
1832 else
1833 m = arg_timezone;
1834 } else
1835 m = arg_timezone;
1836
1837 if (m == TIMEZONE_OFF)
1838 return 0;
1839
a5648b80 1840 r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
d4036145 1841 if (r < 0) {
1688841f 1842 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
d4036145
LP
1843 return 0;
1844 }
1845
1688841f
LP
1846 where = strjoina(etc, "/localtime");
1847
1848 switch (m) {
1849
1850 case TIMEZONE_DELETE:
1851 if (unlink(where) < 0)
1852 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
1853
d4036145 1854 return 0;
d4036145 1855
1688841f
LP
1856 case TIMEZONE_SYMLINK: {
1857 _cleanup_free_ char *q = NULL;
1858 const char *z, *what;
4d1c38b8 1859
1688841f
LP
1860 z = timezone_from_path(p);
1861 if (!z) {
1862 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
d4036145 1863 return 0;
1688841f 1864 }
d4036145 1865
1688841f
LP
1866 r = readlink_malloc(where, &q);
1867 if (r >= 0 && streq_ptr(timezone_from_path(q), z))
1868 return 0; /* Already pointing to the right place? Then do nothing .. */
1869
1870 check = strjoina(dest, "/usr/share/zoneinfo/", z);
a5648b80 1871 r = chase_symlinks(check, dest, 0, NULL, NULL);
1688841f
LP
1872 if (r < 0)
1873 log_debug_errno(r, "Timezone %s does not exist (or is not accessible) in container, not creating symlink: %m", z);
1874 else {
1875 if (unlink(where) < 0 && errno != ENOENT) {
1876 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING, /* Don't complain on read-only images */
1877 errno, "Failed to remove existing timezone info %s in container, ignoring: %m", where);
1878 return 0;
1879 }
1880
1881 what = strjoina("../usr/share/zoneinfo/", z);
1882 if (symlink(what, where) < 0) {
1883 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING,
1884 errno, "Failed to correct timezone of container, ignoring: %m");
1885 return 0;
1886 }
1887
1888 break;
1889 }
1890
1891 _fallthrough_;
d4036145 1892 }
68fb0892 1893
1688841f
LP
1894 case TIMEZONE_BIND: {
1895 _cleanup_free_ char *resolved = NULL;
1896 int found;
1897
a5648b80 1898 found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved, NULL);
1688841f
LP
1899 if (found < 0) {
1900 log_warning_errno(found, "Failed to resolve /etc/localtime path in container, ignoring: %m");
1901 return 0;
1902 }
1903
1904 if (found == 0) /* missing? */
1905 (void) touch(resolved);
1906
1907 r = mount_verbose(LOG_WARNING, "/etc/localtime", resolved, NULL, MS_BIND, NULL);
1908 if (r >= 0)
1909 return mount_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
1910
1911 _fallthrough_;
79d80fc1 1912 }
4d9f07b4 1913
1688841f
LP
1914 case TIMEZONE_COPY:
1915 /* If mounting failed, try to copy */
8a016c74 1916 r = copy_file_atomic("/etc/localtime", where, 0644, 0, 0, COPY_REFLINK|COPY_REPLACE);
1688841f
LP
1917 if (r < 0) {
1918 log_full_errno(IN_SET(r, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
1919 "Failed to copy /etc/localtime to %s, ignoring: %m", where);
1920 return 0;
1921 }
1922
1923 break;
1924
1925 default:
1926 assert_not_reached("unexpected mode");
d4036145 1927 }
e58a1277 1928
1688841f 1929 /* Fix permissions of the symlink or file copy we just created */
03cfe0d5
LP
1930 r = userns_lchown(where, 0, 0);
1931 if (r < 0)
1688841f 1932 log_warning_errno(r, "Failed to chown /etc/localtime, ignoring: %m");
03cfe0d5 1933
e58a1277 1934 return 0;
88213476
LP
1935}
1936
09d423e9
LP
/* Checks whether 'path' exists. Returns 1 if it does, 0 if it is missing (ENOENT), and a negative
 * errno-style value (after logging at debug level) if the check itself failed. */
static int have_resolv_conf(const char *path) {
        assert(path);

        if (access(path, F_OK) >= 0)
                return 1;

        if (errno == ENOENT)
                return 0;

        return log_debug_errno(errno, "Failed to determine whether '%s' is available: %m", path);
}
1949
7357272e 1950static int resolved_listening(void) {
b8ea7a6e 1951 _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
b053cd5f 1952 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
7357272e 1953 _cleanup_free_ char *dns_stub_listener_mode = NULL;
b053cd5f
LP
1954 int r;
1955
7357272e 1956 /* Check if resolved is listening */
b053cd5f
LP
1957
1958 r = sd_bus_open_system(&bus);
1959 if (r < 0)
b8ea7a6e 1960 return log_debug_errno(r, "Failed to open system bus: %m");
b053cd5f 1961
7357272e 1962 r = bus_name_has_owner(bus, "org.freedesktop.resolve1", NULL);
b8ea7a6e
LP
1963 if (r < 0)
1964 return log_debug_errno(r, "Failed to check whether the 'org.freedesktop.resolve1' bus name is taken: %m");
1965 if (r == 0)
1966 return 0;
7357272e
DM
1967
1968 r = sd_bus_get_property_string(bus,
1969 "org.freedesktop.resolve1",
1970 "/org/freedesktop/resolve1",
1971 "org.freedesktop.resolve1.Manager",
1972 "DNSStubListener",
b8ea7a6e 1973 &error,
7357272e
DM
1974 &dns_stub_listener_mode);
1975 if (r < 0)
b8ea7a6e 1976 return log_debug_errno(r, "Failed to query DNSStubListener property: %s", bus_error_message(&error, r));
7357272e
DM
1977
1978 return STR_IN_SET(dns_stub_listener_mode, "udp", "yes");
b053cd5f
LP
1979}
1980
2547bb41 1981static int setup_resolv_conf(const char *dest) {
09d423e9
LP
1982 _cleanup_free_ char *etc = NULL;
1983 const char *where, *what;
1984 ResolvConfMode m;
1985 int r;
2547bb41
LP
1986
1987 assert(dest);
1988
09d423e9
LP
1989 if (arg_resolv_conf == RESOLV_CONF_AUTO) {
1990 if (arg_private_network)
1991 m = RESOLV_CONF_OFF;
86775e35
LP
1992 else if (have_resolv_conf(PRIVATE_STUB_RESOLV_CONF) > 0 && resolved_listening() > 0)
1993 m = etc_writable() ? RESOLV_CONF_COPY_STUB : RESOLV_CONF_BIND_STUB;
09d423e9 1994 else if (have_resolv_conf("/etc/resolv.conf") > 0)
83205269 1995 m = etc_writable() ? RESOLV_CONF_COPY_HOST : RESOLV_CONF_BIND_HOST;
09d423e9 1996 else
83205269 1997 m = etc_writable() ? RESOLV_CONF_DELETE : RESOLV_CONF_OFF;
86775e35 1998
09d423e9
LP
1999 } else
2000 m = arg_resolv_conf;
2001
2002 if (m == RESOLV_CONF_OFF)
2547bb41
LP
2003 return 0;
2004
a5648b80 2005 r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
87447ae4
LP
2006 if (r < 0) {
2007 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
2008 return 0;
2009 }
2010
2011 where = strjoina(etc, "/resolv.conf");
09d423e9
LP
2012
2013 if (m == RESOLV_CONF_DELETE) {
2014 if (unlink(where) < 0)
2015 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
2016
87447ae4
LP
2017 return 0;
2018 }
79d80fc1 2019
86775e35
LP
2020 if (IN_SET(m, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_REPLACE_STATIC, RESOLV_CONF_COPY_STATIC))
2021 what = PRIVATE_STATIC_RESOLV_CONF;
2022 else if (IN_SET(m, RESOLV_CONF_BIND_UPLINK, RESOLV_CONF_REPLACE_UPLINK, RESOLV_CONF_COPY_UPLINK))
2023 what = PRIVATE_UPLINK_RESOLV_CONF;
2024 else if (IN_SET(m, RESOLV_CONF_BIND_STUB, RESOLV_CONF_REPLACE_STUB, RESOLV_CONF_COPY_STUB))
2025 what = PRIVATE_STUB_RESOLV_CONF;
09d423e9
LP
2026 else
2027 what = "/etc/resolv.conf";
87447ae4 2028
86775e35 2029 if (IN_SET(m, RESOLV_CONF_BIND_HOST, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_BIND_UPLINK, RESOLV_CONF_BIND_STUB)) {
09d423e9
LP
2030 _cleanup_free_ char *resolved = NULL;
2031 int found;
2032
a5648b80 2033 found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved, NULL);
09d423e9
LP
2034 if (found < 0) {
2035 log_warning_errno(found, "Failed to resolve /etc/resolv.conf path in container, ignoring: %m");
2036 return 0;
2037 }
3539724c 2038
87447ae4
LP
2039 if (found == 0) /* missing? */
2040 (void) touch(resolved);
5367354d 2041
09d423e9 2042 r = mount_verbose(LOG_WARNING, what, resolved, NULL, MS_BIND, NULL);
60e76d48 2043 if (r >= 0)
87447ae4 2044 return mount_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
86775e35
LP
2045
2046 /* If that didn't work, let's copy the file */
3539724c
LP
2047 }
2048
86775e35
LP
2049 if (IN_SET(m, RESOLV_CONF_REPLACE_HOST, RESOLV_CONF_REPLACE_STATIC, RESOLV_CONF_REPLACE_UPLINK, RESOLV_CONF_REPLACE_STUB))
2050 r = copy_file_atomic(what, where, 0644, 0, 0, COPY_REFLINK|COPY_REPLACE);
2051 else
2052 r = copy_file(what, where, O_TRUNC|O_NOFOLLOW, 0644, 0, 0, COPY_REFLINK);
79d80fc1 2053 if (r < 0) {
3539724c
LP
2054 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
2055 * resolved or something similar runs inside and the symlink points there.
68a313c5 2056 *
3539724c 2057 * If the disk image is read-only, there's also no point in complaining.
68a313c5 2058 */
86775e35
LP
2059 log_full_errno(!IN_SET(RESOLV_CONF_COPY_HOST, RESOLV_CONF_COPY_STATIC, RESOLV_CONF_COPY_UPLINK, RESOLV_CONF_COPY_STUB) &&
2060 IN_SET(r, -ELOOP, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
3539724c 2061 "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where);
79d80fc1
TG
2062 return 0;
2063 }
2547bb41 2064
03cfe0d5
LP
2065 r = userns_lchown(where, 0, 0);
2066 if (r < 0)
3539724c 2067 log_warning_errno(r, "Failed to chown /etc/resolv.conf, ignoring: %m");
03cfe0d5 2068
2547bb41
LP
2069 return 0;
2070}
2071
1e4f1671 2072static int setup_boot_id(void) {
cdde6ba6
LP
2073 _cleanup_(unlink_and_freep) char *from = NULL;
2074 _cleanup_free_ char *path = NULL;
3bbaff3e 2075 sd_id128_t rnd = SD_ID128_NULL;
cdde6ba6 2076 const char *to;
04bc4a3f
LP
2077 int r;
2078
1eacc470 2079 /* Generate a new randomized boot ID, so that each boot-up of the container gets a new one */
04bc4a3f 2080
1eacc470 2081 r = tempfn_random_child("/run", "proc-sys-kernel-random-boot-id", &path);
cdde6ba6
LP
2082 if (r < 0)
2083 return log_error_errno(r, "Failed to generate random boot ID path: %m");
04bc4a3f
LP
2084
2085 r = sd_id128_randomize(&rnd);
f647962d
MS
2086 if (r < 0)
2087 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 2088
cdde6ba6 2089 r = id128_write(path, ID128_UUID, rnd, false);
f647962d
MS
2090 if (r < 0)
2091 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f 2092
cdde6ba6
LP
2093 from = TAKE_PTR(path);
2094 to = "/proc/sys/kernel/random/boot_id";
2095
60e76d48 2096 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
cdde6ba6
LP
2097 if (r < 0)
2098 return r;
04bc4a3f 2099
cdde6ba6 2100 return mount_verbose(LOG_ERR, NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
04bc4a3f
LP
2101}
2102
e58a1277 2103static int copy_devnodes(const char *dest) {
88213476
LP
2104 static const char devnodes[] =
2105 "null\0"
2106 "zero\0"
2107 "full\0"
2108 "random\0"
2109 "urandom\0"
85614d66
TG
2110 "tty\0"
2111 "net/tun\0";
88213476 2112
de40a303 2113 _cleanup_umask_ mode_t u;
88213476 2114 const char *d;
e58a1277 2115 int r = 0;
a258bf26
LP
2116
2117 assert(dest);
124640f1
LP
2118
2119 u = umask(0000);
88213476 2120
03cfe0d5
LP
2121 /* Create /dev/net, so that we can create /dev/net/tun in it */
2122 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
2123 return log_error_errno(r, "Failed to create /dev/net directory: %m");
2124
88213476 2125 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 2126 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 2127 struct stat st;
88213476 2128
c6134d3e 2129 from = path_join("/dev/", d);
8967f291
LP
2130 if (!from)
2131 return log_oom();
2132
c6134d3e 2133 to = path_join(dest, from);
8967f291
LP
2134 if (!to)
2135 return log_oom();
88213476
LP
2136
2137 if (stat(from, &st) < 0) {
2138
4a62c710
MS
2139 if (errno != ENOENT)
2140 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 2141
baaa35ad
ZJS
2142 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode))
2143 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2144 "%s is not a char or block device, cannot copy.", from);
2145 else {
8dfce114
LP
2146 _cleanup_free_ char *sl = NULL, *prefixed = NULL, *dn = NULL, *t = NULL;
2147
81f5049b 2148 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
8dbf71ec 2149 /* Explicitly warn the user when /dev is already populated. */
41eb4362 2150 if (errno == EEXIST)
8dbf71ec 2151 log_notice("%s/dev is pre-mounted and pre-populated. If a pre-mounted /dev is provided it needs to be an unpopulated file system.", dest);
81f5049b
AC
2152 if (errno != EPERM)
2153 return log_error_errno(errno, "mknod(%s) failed: %m", to);
2154
8dfce114 2155 /* Some systems abusively restrict mknod but allow bind mounts. */
81f5049b
AC
2156 r = touch(to);
2157 if (r < 0)
2158 return log_error_errno(r, "touch (%s) failed: %m", to);
60e76d48
ZJS
2159 r = mount_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL);
2160 if (r < 0)
2161 return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to);
81f5049b 2162 }
6278cf60 2163
03cfe0d5
LP
2164 r = userns_lchown(to, 0, 0);
2165 if (r < 0)
2166 return log_error_errno(r, "chown() of device node %s failed: %m", to);
8dfce114 2167
657ee2d8 2168 dn = path_join("/dev", S_ISCHR(st.st_mode) ? "char" : "block");
8dfce114
LP
2169 if (!dn)
2170 return log_oom();
2171
2172 r = userns_mkdir(dest, dn, 0755, 0, 0);
2173 if (r < 0)
2174 return log_error_errno(r, "Failed to create '%s': %m", dn);
2175
2176 if (asprintf(&sl, "%s/%u:%u", dn, major(st.st_rdev), minor(st.st_rdev)) < 0)
2177 return log_oom();
2178
c6134d3e 2179 prefixed = path_join(dest, sl);
8dfce114
LP
2180 if (!prefixed)
2181 return log_oom();
2182
2d9b74ba 2183 t = path_join("..", d);
8dfce114
LP
2184 if (!t)
2185 return log_oom();
2186
2187 if (symlink(t, prefixed) < 0)
2188 log_debug_errno(errno, "Failed to symlink '%s' to '%s': %m", t, prefixed);
88213476 2189 }
88213476
LP
2190 }
2191
e58a1277
LP
2192 return r;
2193}
88213476 2194
de40a303
LP
2195static int make_extra_nodes(const char *dest) {
2196 _cleanup_umask_ mode_t u;
2197 size_t i;
2198 int r;
2199
2200 u = umask(0000);
2201
2202 for (i = 0; i < arg_n_extra_nodes; i++) {
2203 _cleanup_free_ char *path = NULL;
2204 DeviceNode *n = arg_extra_nodes + i;
2205
c6134d3e 2206 path = path_join(dest, n->path);
de40a303
LP
2207 if (!path)
2208 return log_oom();
2209
2210 if (mknod(path, n->mode, S_ISCHR(n->mode) || S_ISBLK(n->mode) ? makedev(n->major, n->minor) : 0) < 0)
2211 return log_error_errno(errno, "Failed to create device node '%s': %m", path);
2212
2213 r = chmod_and_chown(path, n->mode, n->uid, n->gid);
2214 if (r < 0)
2215 return log_error_errno(r, "Failed to adjust device node ownership of '%s': %m", path);
2216 }
2217
2218 return 0;
2219}
2220
03cfe0d5
LP
2221static int setup_pts(const char *dest) {
2222 _cleanup_free_ char *options = NULL;
2223 const char *p;
709f6e46 2224 int r;
03cfe0d5 2225
349cc4a5 2226#if HAVE_SELINUX
03cfe0d5
LP
2227 if (arg_selinux_apifs_context)
2228 (void) asprintf(&options,
3dce8915 2229 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
03cfe0d5
LP
2230 arg_uid_shift + TTY_GID,
2231 arg_selinux_apifs_context);
2232 else
2233#endif
2234 (void) asprintf(&options,
3dce8915 2235 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
03cfe0d5 2236 arg_uid_shift + TTY_GID);
f2d88580 2237
03cfe0d5 2238 if (!options)
f2d88580
LP
2239 return log_oom();
2240
03cfe0d5 2241 /* Mount /dev/pts itself */
cc9fce65 2242 p = prefix_roota(dest, "/dev/pts");
dae8b82e
ZJS
2243 r = mkdir_errno_wrapper(p, 0755);
2244 if (r < 0)
2245 return log_error_errno(r, "Failed to create /dev/pts: %m");
2246
60e76d48
ZJS
2247 r = mount_verbose(LOG_ERR, "devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options);
2248 if (r < 0)
2249 return r;
709f6e46
MS
2250 r = userns_lchown(p, 0, 0);
2251 if (r < 0)
2252 return log_error_errno(r, "Failed to chown /dev/pts: %m");
03cfe0d5
LP
2253
2254 /* Create /dev/ptmx symlink */
2255 p = prefix_roota(dest, "/dev/ptmx");
4a62c710
MS
2256 if (symlink("pts/ptmx", p) < 0)
2257 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
709f6e46
MS
2258 r = userns_lchown(p, 0, 0);
2259 if (r < 0)
2260 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
f2d88580 2261
03cfe0d5
LP
2262 /* And fix /dev/pts/ptmx ownership */
2263 p = prefix_roota(dest, "/dev/pts/ptmx");
709f6e46
MS
2264 r = userns_lchown(p, 0, 0);
2265 if (r < 0)
2266 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
6278cf60 2267
f2d88580
LP
2268 return 0;
2269}
2270
3acc84eb
FB
/* Opens /dev/console and installs it as stdin, stdout and stderr of the calling process, after
 * making sure logging can continue on the original stderr.
 *
 * Returns 0 on success, a negative errno-style value on failure. */
static int setup_stdio_as_dev_console(void) {
        int console_fd, r;

        console_fd = open_terminal("/dev/console", O_RDWR);
        if (console_fd < 0)
                return log_error_errno(console_fd, "Failed to open console: %m");

        /* Keep a copy of the current stderr for logging, since stderr is about to be replaced */
        r = log_dup_console();
        if (r < 0)
                return log_error_errno(r, "Failed to duplicate stderr: %m");

        /* Regardless of the outcome, 'console_fd' must not be used after this call */
        r = rearrange_stdio(console_fd, console_fd, console_fd);
        if (r < 0)
                return log_error_errno(r, "Failed to move console to stdin/stdout/stderr: %m");

        return 0;
}
88213476 2292
3acc84eb
FB
2293static int setup_dev_console(const char *console) {
2294 _cleanup_free_ char *p = NULL;
2295 int r;
a258bf26 2296
3acc84eb
FB
2297 /* Create /dev/console symlink */
2298 r = path_make_relative("/dev", console, &p);
81f5049b 2299 if (r < 0)
3acc84eb
FB
2300 return log_error_errno(r, "Failed to create relative path: %m");
2301
2302 if (symlink(p, "/dev/console") < 0)
2303 return log_error_errno(errno, "Failed to create /dev/console symlink: %m");
a258bf26 2304
3acc84eb 2305 return 0;
e58a1277
LP
2306}
2307
8e5430c4
LP
2308static int setup_keyring(void) {
2309 key_serial_t keyring;
2310
6b000af4
LP
2311 /* Allocate a new session keyring for the container. This makes sure the keyring of the session
2312 * systemd-nspawn was invoked from doesn't leak into the container. Note that by default we block
2313 * keyctl() and request_key() anyway via seccomp so doing this operation isn't strictly necessary,
2314 * but in case people explicitly allow-list these system calls let's make sure we don't leak anything
2315 * into the container. */
8e5430c4
LP
2316
2317 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2318 if (keyring == -1) {
2319 if (errno == ENOSYS)
2320 log_debug_errno(errno, "Kernel keyring not supported, ignoring.");
2321 else if (IN_SET(errno, EACCES, EPERM))
2322 log_debug_errno(errno, "Kernel keyring access prohibited, ignoring.");
2323 else
2324 return log_error_errno(errno, "Setting up kernel keyring failed: %m");
2325 }
2326
2327 return 0;
2328}
2329
3652872a
LP
2330static int setup_credentials(const char *root) {
2331 const char *q;
2332 int r;
2333
2334 if (arg_n_credentials <= 0)
2335 return 0;
2336
2337 r = userns_mkdir(root, "/run/host", 0755, 0, 0);
2338 if (r < 0)
2339 return log_error_errno(r, "Failed to create /run/host: %m");
2340
2341 r = userns_mkdir(root, "/run/host/credentials", 0700, 0, 0);
2342 if (r < 0)
2343 return log_error_errno(r, "Failed to create /run/host/credentials: %m");
2344
2345 q = prefix_roota(root, "/run/host/credentials");
2346 r = mount_verbose(LOG_ERR, NULL, q, "ramfs", MS_NOSUID|MS_NOEXEC|MS_NODEV, "mode=0700");
2347 if (r < 0)
2348 return r;
2349
2350 for (size_t i = 0; i < arg_n_credentials; i++) {
2351 _cleanup_free_ char *j = NULL;
2352 _cleanup_close_ int fd = -1;
2353
2354 j = path_join(q, arg_credentials[i].id);
2355 if (!j)
2356 return log_oom();
2357
2358 fd = open(j, O_CREAT|O_EXCL|O_WRONLY|O_CLOEXEC|O_NOFOLLOW, 0600);
2359 if (fd < 0)
2360 return log_error_errno(errno, "Failed to create credential file %s: %m", j);
2361
2362 r = loop_write(fd, arg_credentials[i].data, arg_credentials[i].size, /* do_poll= */ false);
2363 if (r < 0)
2364 return log_error_errno(r, "Failed to write credential to file %s: %m", j);
2365
2366 if (fchmod(fd, 0400) < 0)
2367 return log_error_errno(errno, "Failed to adjust access mode of %s: %m", j);
2368
2369 if (arg_userns_mode != USER_NAMESPACE_NO) {
2370 if (fchown(fd, arg_uid_shift, arg_uid_shift) < 0)
2371 return log_error_errno(errno, "Failed to adjust ownership of %s: %m", j);
2372 }
2373 }
2374
2375 if (chmod(q, 0500) < 0)
2376 return log_error_errno(errno, "Failed to adjust access mode of %s: %m", q);
2377
2378 r = userns_lchown(q, 0, 0);
2379 if (r < 0)
2380 return r;
2381
2382 /* Make both mount and superblock read-only now */
2383 r = mount_verbose(LOG_ERR, NULL, q, NULL, MS_REMOUNT|MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
2384 if (r < 0)
2385 return r;
2386
2387 return mount_verbose(LOG_ERR, NULL, q, NULL, MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, "mode=0500");
2388}
2389
1e4f1671 2390static int setup_kmsg(int kmsg_socket) {
9ec5a93c
LP
2391 _cleanup_(unlink_and_freep) char *from = NULL;
2392 _cleanup_free_ char *fifo = NULL;
2393 _cleanup_close_ int fd = -1;
7fd1b19b 2394 _cleanup_umask_ mode_t u;
9ec5a93c 2395 int r;
e58a1277 2396
e58a1277 2397 assert(kmsg_socket >= 0);
a258bf26 2398
e58a1277 2399 u = umask(0000);
a258bf26 2400
1eacc470 2401 /* We create the kmsg FIFO as as temporary file in /run, but immediately delete it after bind mounting it to
9ec5a93c
LP
2402 * /proc/kmsg. While FIFOs on the reading side behave very similar to /proc/kmsg, their writing side behaves
2403 * differently from /dev/kmsg in that writing blocks when nothing is reading. In order to avoid any problems
2404 * with containers deadlocking due to this we simply make /dev/kmsg unavailable to the container. */
2405
1eacc470 2406 r = tempfn_random_child("/run", "proc-kmsg", &fifo);
9ec5a93c
LP
2407 if (r < 0)
2408 return log_error_errno(r, "Failed to generate kmsg path: %m");
e58a1277 2409
9ec5a93c 2410 if (mkfifo(fifo, 0600) < 0)
03cfe0d5 2411 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
9ec5a93c
LP
2412
2413 from = TAKE_PTR(fifo);
9ec5a93c 2414
1eacc470 2415 r = mount_verbose(LOG_ERR, from, "/proc/kmsg", NULL, MS_BIND, NULL);
60e76d48
ZJS
2416 if (r < 0)
2417 return r;
e58a1277 2418
669fc4e5 2419 fd = open(from, O_RDWR|O_NONBLOCK|O_CLOEXEC);
4a62c710
MS
2420 if (fd < 0)
2421 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 2422
9ec5a93c 2423 /* Store away the fd in the socket, so that it stays open as long as we run the child */
3ee897d6 2424 r = send_one_fd(kmsg_socket, fd, 0);
d9603714
DH
2425 if (r < 0)
2426 return log_error_errno(r, "Failed to send FIFO fd: %m");
a258bf26 2427
25ea79fe 2428 return 0;
88213476
LP
2429}
2430
1c4baffc 2431static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
6d0b55c2
LP
2432 union in_addr_union *exposed = userdata;
2433
2434 assert(rtnl);
2435 assert(m);
2436 assert(exposed);
2437
7a8f6325 2438 expose_port_execute(rtnl, arg_expose_ports, exposed);
6d0b55c2
LP
2439 return 0;
2440}
2441
3a74cea5 2442static int setup_hostname(void) {
c818eef1 2443 int r;
3a74cea5 2444
0c582db0 2445 if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
eb91eb18
LP
2446 return 0;
2447
c818eef1
LP
2448 r = sethostname_idempotent(arg_hostname ?: arg_machine);
2449 if (r < 0)
2450 return log_error_errno(r, "Failed to set hostname: %m");
3a74cea5 2451
7027ff61 2452 return 0;
3a74cea5
LP
2453}
2454
57fb9fb5 2455static int setup_journal(const char *directory) {
0f5e1382 2456 _cleanup_free_ char *d = NULL;
5905d7cf 2457 char id[SD_ID128_STRING_MAX];
b2238e38
LP
2458 const char *dirname, *p, *q;
2459 sd_id128_t this_id;
8054d749 2460 bool try;
57fb9fb5
LP
2461 int r;
2462
df9a75e4
LP
2463 /* Don't link journals in ephemeral mode */
2464 if (arg_ephemeral)
2465 return 0;
2466
8054d749
LP
2467 if (arg_link_journal == LINK_NO)
2468 return 0;
2469
2470 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
2471
4d680aee 2472 r = sd_id128_get_machine(&this_id);
f647962d
MS
2473 if (r < 0)
2474 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee 2475
e01ff70a 2476 if (sd_id128_equal(arg_uuid, this_id)) {
8054d749 2477 log_full(try ? LOG_WARNING : LOG_ERR,
e192a281 2478 "Host and machine ids are equal (%s): refusing to link journals", sd_id128_to_string(arg_uuid, id));
8054d749 2479 if (try)
4d680aee 2480 return 0;
df9a75e4 2481 return -EEXIST;
4d680aee
ZJS
2482 }
2483
369ca6da
ZJS
2484 FOREACH_STRING(dirname, "/var", "/var/log", "/var/log/journal") {
2485 r = userns_mkdir(directory, dirname, 0755, 0, 0);
2486 if (r < 0) {
2487 bool ignore = r == -EROFS && try;
2488 log_full_errno(ignore ? LOG_DEBUG : LOG_ERR, r,
2489 "Failed to create %s%s: %m", dirname, ignore ? ", ignoring" : "");
2490 return ignore ? 0 : r;
2491 }
2492 }
03cfe0d5 2493
e01ff70a
MS
2494 (void) sd_id128_to_string(arg_uuid, id);
2495
03cfe0d5
LP
2496 p = strjoina("/var/log/journal/", id);
2497 q = prefix_roota(directory, p);
27407a01 2498
e1873695 2499 if (path_is_mount_point(p, NULL, 0) > 0) {
8054d749
LP
2500 if (try)
2501 return 0;
27407a01 2502
baaa35ad
ZJS
2503 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2504 "%s: already a mount point, refusing to use for journal", p);
57fb9fb5
LP
2505 }
2506
e1873695 2507 if (path_is_mount_point(q, NULL, 0) > 0) {
8054d749
LP
2508 if (try)
2509 return 0;
57fb9fb5 2510
baaa35ad
ZJS
2511 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2512 "%s: already a mount point, refusing to use for journal", q);
57fb9fb5
LP
2513 }
2514
2515 r = readlink_and_make_absolute(p, &d);
2516 if (r >= 0) {
3742095b 2517 if (IN_SET(arg_link_journal, LINK_GUEST, LINK_AUTO) &&
57fb9fb5
LP
2518 path_equal(d, q)) {
2519
03cfe0d5 2520 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 2521 if (r < 0)
709f6e46 2522 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 2523 return 0;
57fb9fb5
LP
2524 }
2525
4a62c710
MS
2526 if (unlink(p) < 0)
2527 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
2528 } else if (r == -EINVAL) {
2529
2530 if (arg_link_journal == LINK_GUEST &&
2531 rmdir(p) < 0) {
2532
27407a01
ZJS
2533 if (errno == ENOTDIR) {
2534 log_error("%s already exists and is neither a symlink nor a directory", p);
2535 return r;
4314d33f
MS
2536 } else
2537 return log_error_errno(errno, "Failed to remove %s: %m", p);
57fb9fb5 2538 }
4314d33f
MS
2539 } else if (r != -ENOENT)
2540 return log_error_errno(r, "readlink(%s) failed: %m", p);
57fb9fb5
LP
2541
2542 if (arg_link_journal == LINK_GUEST) {
2543
2544 if (symlink(q, p) < 0) {
8054d749 2545 if (try) {
56f64d95 2546 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90 2547 return 0;
4314d33f
MS
2548 } else
2549 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
57fb9fb5
LP
2550 }
2551
03cfe0d5 2552 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 2553 if (r < 0)
709f6e46 2554 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 2555 return 0;
57fb9fb5
LP
2556 }
2557
2558 if (arg_link_journal == LINK_HOST) {
ccddd104 2559 /* don't create parents here — if the host doesn't have
574edc90 2560 * permanent journal set up, don't force it here */
ba8e6c4d 2561
dae8b82e
ZJS
2562 r = mkdir_errno_wrapper(p, 0755);
2563 if (r < 0 && r != -EEXIST) {
8054d749 2564 if (try) {
dae8b82e 2565 log_debug_errno(r, "Failed to create %s, skipping journal setup: %m", p);
574edc90 2566 return 0;
4314d33f 2567 } else
dae8b82e 2568 return log_error_errno(r, "Failed to create %s: %m", p);
57fb9fb5
LP
2569 }
2570
27407a01
ZJS
2571 } else if (access(p, F_OK) < 0)
2572 return 0;
57fb9fb5 2573
cdb2b9d0
LP
2574 if (dir_is_empty(q) == 0)
2575 log_warning("%s is not empty, proceeding anyway.", q);
2576
03cfe0d5 2577 r = userns_mkdir(directory, p, 0755, 0, 0);
709f6e46
MS
2578 if (r < 0)
2579 return log_error_errno(r, "Failed to create %s: %m", q);
57fb9fb5 2580
60e76d48
ZJS
2581 r = mount_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL);
2582 if (r < 0)
4a62c710 2583 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 2584
27407a01 2585 return 0;
57fb9fb5
LP
2586}
2587
de40a303
LP
2588static int drop_capabilities(uid_t uid) {
2589 CapabilityQuintet q;
2590
2591 /* Let's initialize all five capability sets to something valid. If the quintet was configured via
2592 * OCI use that, but fill in missing bits. If it wasn't then derive the quintet in full from
2593 * arg_caps_retain. */
2594
2595 if (capability_quintet_is_set(&arg_full_capabilities)) {
2596 q = arg_full_capabilities;
2597
2598 if (q.bounding == (uint64_t) -1)
2599 q.bounding = uid == 0 ? arg_caps_retain : 0;
2600
2601 if (q.effective == (uint64_t) -1)
2602 q.effective = uid == 0 ? q.bounding : 0;
2603
2604 if (q.inheritable == (uint64_t) -1)
2605 q.inheritable = uid == 0 ? q.bounding : 0;
2606
2607 if (q.permitted == (uint64_t) -1)
2608 q.permitted = uid == 0 ? q.bounding : 0;
2609
2610 if (q.ambient == (uint64_t) -1 && ambient_capabilities_supported())
2611 q.ambient = 0;
f66ad460
AZ
2612
2613 if (capability_quintet_mangle(&q))
2614 return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Cannot set capabilities that are not in the current bounding set.");
2615
2616 } else {
de40a303
LP
2617 q = (CapabilityQuintet) {
2618 .bounding = arg_caps_retain,
2619 .effective = uid == 0 ? arg_caps_retain : 0,
2620 .inheritable = uid == 0 ? arg_caps_retain : 0,
2621 .permitted = uid == 0 ? arg_caps_retain : 0,
2622 .ambient = ambient_capabilities_supported() ? 0 : (uint64_t) -1,
2623 };
2624
f66ad460
AZ
2625 /* If we're not using OCI, proceed with mangled capabilities (so we don't error out)
2626 * in order to maintain the same behavior as systemd < 242. */
2627 if (capability_quintet_mangle(&q))
0ccdaa79
JT
2628 log_full(arg_quiet ? LOG_DEBUG : LOG_WARNING,
2629 "Some capabilities will not be set because they are not in the current bounding set.");
f66ad460
AZ
2630
2631 }
2632
de40a303 2633 return capability_quintet_enforce(&q);
88213476
LP
2634}
2635
db999e0f
LP
2636static int reset_audit_loginuid(void) {
2637 _cleanup_free_ char *p = NULL;
2638 int r;
2639
0c582db0 2640 if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
db999e0f
LP
2641 return 0;
2642
2643 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 2644 if (r == -ENOENT)
db999e0f 2645 return 0;
f647962d
MS
2646 if (r < 0)
2647 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
2648
2649 /* Already reset? */
2650 if (streq(p, "4294967295"))
2651 return 0;
2652
57512c89 2653 r = write_string_file("/proc/self/loginuid", "4294967295", WRITE_STRING_FILE_DISABLE_BUFFER);
db999e0f 2654 if (r < 0) {
10a87006
LP
2655 log_error_errno(r,
2656 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2657 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2658 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2659 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2660 "using systemd-nspawn. Sleeping for 5s... (%m)");
77b6e194 2661
db999e0f 2662 sleep(5);
77b6e194 2663 }
db999e0f
LP
2664
2665 return 0;
77b6e194
LP
2666}
2667
785890ac
LP
2668static int setup_propagate(const char *root) {
2669 const char *p, *q;
709f6e46 2670 int r;
785890ac
LP
2671
2672 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2673 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
63c372cb 2674 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
2675 (void) mkdir_p(p, 0600);
2676
5a27b395 2677 r = userns_mkdir(root, "/run/host", 0755, 0, 0);
709f6e46 2678 if (r < 0)
5a27b395 2679 return log_error_errno(r, "Failed to create /run/host: %m");
03cfe0d5 2680
5a27b395 2681 r = userns_mkdir(root, "/run/host/incoming", 0600, 0, 0);
709f6e46 2682 if (r < 0)
5a27b395 2683 return log_error_errno(r, "Failed to create /run/host/incoming: %m");
03cfe0d5 2684
5a27b395 2685 q = prefix_roota(root, "/run/host/incoming");
60e76d48
ZJS
2686 r = mount_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
2687 if (r < 0)
2688 return r;
785890ac 2689
60e76d48
ZJS
2690 r = mount_verbose(LOG_ERR, NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
2691 if (r < 0)
2692 return r;
785890ac 2693
5a27b395 2694 /* machined will MS_MOVE into that directory, and that's only supported for non-shared mounts. */
60e76d48 2695 return mount_verbose(LOG_ERR, NULL, q, NULL, MS_SLAVE, NULL);
785890ac
LP
2696}
2697
317feb4d 2698static int setup_machine_id(const char *directory) {
691675ba
LP
2699 const char *etc_machine_id;
2700 sd_id128_t id;
3bbaff3e 2701 int r;
e01ff70a 2702
317feb4d
LP
2703 /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
2704 * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
2705 * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
2706 * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
2707 * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
2708 * container behaves nicely). */
2709
e01ff70a
MS
2710 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
2711
691675ba 2712 r = id128_read(etc_machine_id, ID128_PLAIN, &id);
317feb4d
LP
2713 if (r < 0) {
2714 if (!IN_SET(r, -ENOENT, -ENOMEDIUM)) /* If the file is missing or empty, we don't mind */
2715 return log_error_errno(r, "Failed to read machine ID from container image: %m");
691675ba 2716
317feb4d
LP
2717 if (sd_id128_is_null(arg_uuid)) {
2718 r = sd_id128_randomize(&arg_uuid);
2719 if (r < 0)
2720 return log_error_errno(r, "Failed to acquire randomized machine UUID: %m");
2721 }
2722 } else {
baaa35ad
ZJS
2723 if (sd_id128_is_null(id))
2724 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2725 "Machine ID in container image is zero, refusing.");
e01ff70a 2726
317feb4d
LP
2727 arg_uuid = id;
2728 }
691675ba 2729
e01ff70a
MS
2730 return 0;
2731}
2732
7336138e
LP
2733static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
2734 int r;
2735
2736 assert(directory);
2737
0de7acce 2738 if (arg_userns_mode == USER_NAMESPACE_NO || !arg_userns_chown)
7336138e
LP
2739 return 0;
2740
2741 r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
2742 if (r == -EOPNOTSUPP)
2743 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
2744 if (r == -EBADE)
2745 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
2746 if (r < 0)
2747 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
2748 if (r == 0)
2749 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
2750 else
2751 log_debug("Patched directory tree to match UID/GID range.");
2752
2753 return r;
2754}
2755
113cea80 2756/*
6d416b9c
LS
2757 * Return values:
2758 * < 0 : wait_for_terminate() failed to get the state of the
2759 * container, the container was terminated by a signal, or
2760 * failed for an unknown reason. No change is made to the
2761 * container argument.
2762 * > 0 : The program executed in the container terminated with an
2763 * error. The exit code of the program executed in the
919699ec
LP
2764 * container is returned. The container argument has been set
2765 * to CONTAINER_TERMINATED.
6d416b9c
LS
2766 * 0 : The container is being rebooted, has been shut down or exited
2767 * successfully. The container argument has been set to either
2768 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 2769 *
6d416b9c
LS
2770 * That is, success is indicated by a return value of zero, and an
2771 * error is indicated by a non-zero value.
113cea80
DH
2772 */
2773static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 2774 siginfo_t status;
919699ec 2775 int r;
113cea80
DH
2776
2777 r = wait_for_terminate(pid, &status);
f647962d
MS
2778 if (r < 0)
2779 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
2780
2781 switch (status.si_code) {
fddbb89c 2782
113cea80 2783 case CLD_EXITED:
b5a2179b 2784 if (status.si_status == 0)
919699ec 2785 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
b5a2179b 2786 else
919699ec 2787 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 2788
919699ec
LP
2789 *container = CONTAINER_TERMINATED;
2790 return status.si_status;
113cea80
DH
2791
2792 case CLD_KILLED:
2793 if (status.si_status == SIGINT) {
919699ec 2794 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 2795 *container = CONTAINER_TERMINATED;
919699ec
LP
2796 return 0;
2797
113cea80 2798 } else if (status.si_status == SIGHUP) {
919699ec 2799 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 2800 *container = CONTAINER_REBOOTED;
919699ec 2801 return 0;
113cea80 2802 }
919699ec 2803
4831981d 2804 _fallthrough_;
113cea80 2805 case CLD_DUMPED:
baaa35ad
ZJS
2806 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2807 "Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
113cea80
DH
2808
2809 default:
baaa35ad
ZJS
2810 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2811 "Container %s failed due to unknown reason.", arg_machine);
113cea80 2812 }
113cea80
DH
2813}
2814
023fb90b
LP
2815static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2816 pid_t pid;
2817
4a0b58c4 2818 pid = PTR_TO_PID(userdata);
023fb90b 2819 if (pid > 0) {
c6c8f6e2 2820 if (kill(pid, arg_kill_signal) >= 0) {
023fb90b
LP
2821 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2822 sd_event_source_set_userdata(s, NULL);
2823 return 0;
2824 }
2825 }
2826
2827 sd_event_exit(sd_event_source_get_event(s), 0);
2828 return 0;
2829}
2830
6916b164 2831static int on_sigchld(sd_event_source *s, const struct signalfd_siginfo *ssi, void *userdata) {
abdb9b08
LP
2832 pid_t pid;
2833
2834 assert(s);
2835 assert(ssi);
2836
2837 pid = PTR_TO_PID(userdata);
2838
6916b164
AU
2839 for (;;) {
2840 siginfo_t si = {};
abdb9b08 2841
6916b164
AU
2842 if (waitid(P_ALL, 0, &si, WNOHANG|WNOWAIT|WEXITED) < 0)
2843 return log_error_errno(errno, "Failed to waitid(): %m");
2844 if (si.si_pid == 0) /* No pending children. */
2845 break;
abdb9b08 2846 if (si.si_pid == pid) {
6916b164
AU
2847 /* The main process we care for has exited. Return from
2848 * signal handler but leave the zombie. */
2849 sd_event_exit(sd_event_source_get_event(s), 0);
2850 break;
2851 }
abdb9b08 2852
6916b164
AU
2853 /* Reap all other children. */
2854 (void) waitid(P_PID, si.si_pid, &si, WNOHANG|WEXITED);
2855 }
2856
2857 return 0;
2858}
2859
abdb9b08
LP
2860static int on_request_stop(sd_bus_message *m, void *userdata, sd_bus_error *error) {
2861 pid_t pid;
2862
2863 assert(m);
2864
2865 pid = PTR_TO_PID(userdata);
2866
2867 if (arg_kill_signal > 0) {
2868 log_info("Container termination requested. Attempting to halt container.");
2869 (void) kill(pid, arg_kill_signal);
2870 } else {
2871 log_info("Container termination requested. Exiting.");
2872 sd_event_exit(sd_bus_get_event(sd_bus_message_get_bus(m)), 0);
2873 }
2874
2875 return 0;
2876}
2877
ec16945e 2878static int determine_names(void) {
1b9cebf6 2879 int r;
ec16945e 2880
c1521918
LP
2881 if (arg_template && !arg_directory && arg_machine) {
2882
2883 /* If --template= was specified then we should not
2884 * search for a machine, but instead create a new one
2885 * in /var/lib/machine. */
2886
657ee2d8 2887 arg_directory = path_join("/var/lib/machines", arg_machine);
c1521918
LP
2888 if (!arg_directory)
2889 return log_oom();
2890 }
2891
ec16945e 2892 if (!arg_image && !arg_directory) {
1b9cebf6
LP
2893 if (arg_machine) {
2894 _cleanup_(image_unrefp) Image *i = NULL;
2895
5ef46e5f 2896 r = image_find(IMAGE_MACHINE, arg_machine, &i);
3a6ce860
LP
2897 if (r == -ENOENT)
2898 return log_error_errno(r, "No image for machine '%s'.", arg_machine);
1b9cebf6
LP
2899 if (r < 0)
2900 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
1b9cebf6 2901
eb38edce 2902 if (IN_SET(i->type, IMAGE_RAW, IMAGE_BLOCK))
0f03c2a4 2903 r = free_and_strdup(&arg_image, i->path);
1b9cebf6 2904 else
0f03c2a4 2905 r = free_and_strdup(&arg_directory, i->path);
1b9cebf6 2906 if (r < 0)
0f3be6ca 2907 return log_oom();
1b9cebf6 2908
aee327b8
LP
2909 if (!arg_ephemeral)
2910 arg_read_only = arg_read_only || i->read_only;
d7249575
LP
2911 } else {
2912 r = safe_getcwd(&arg_directory);
2913 if (r < 0)
2914 return log_error_errno(r, "Failed to determine current directory: %m");
2915 }
ec16945e 2916
c6147113
LP
2917 if (!arg_directory && !arg_image)
2918 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine path, please use -D or -i.");
ec16945e
LP
2919 }
2920
2921 if (!arg_machine) {
b9ba4dab
LP
2922 if (arg_directory && path_equal(arg_directory, "/"))
2923 arg_machine = gethostname_malloc();
4827ab48
LP
2924 else {
2925 if (arg_image) {
2926 char *e;
2927
2928 arg_machine = strdup(basename(arg_image));
2929
2930 /* Truncate suffix if there is one */
2931 e = endswith(arg_machine, ".raw");
2932 if (e)
2933 *e = 0;
2934 } else
2935 arg_machine = strdup(basename(arg_directory));
2936 }
ec16945e
LP
2937 if (!arg_machine)
2938 return log_oom();
2939
ae691c1d 2940 hostname_cleanup(arg_machine);
c6147113
LP
2941 if (!machine_name_is_valid(arg_machine))
2942 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine machine name automatically, please use -M.");
b9ba4dab
LP
2943
2944 if (arg_ephemeral) {
2945 char *b;
2946
2947 /* Add a random suffix when this is an
2948 * ephemeral machine, so that we can run many
2949 * instances at once without manually having
2950 * to specify -M each time. */
2951
2952 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2953 return log_oom();
2954
2955 free(arg_machine);
2956 arg_machine = b;
2957 }
ec16945e
LP
2958 }
2959
2960 return 0;
2961}
2962
/* Resolves the path stored in *p via chase_symlinks() (with the given 'flags') and replaces *p with
 * the resolved path. A NULL *p is left alone.
 *
 * Returns a negative errno if resolving fails; otherwise 0 for a NULL *p, or the value of
 * free_and_replace(). */
static int chase_symlinks_and_update(char **p, unsigned flags) {
        char *resolved;
        int r;

        assert(p);

        /* Nothing configured, nothing to resolve. */
        if (!*p)
                return 0;

        r = chase_symlinks(*p, NULL, flags, &resolved, NULL);
        if (r < 0)
                return log_error_errno(r, "Failed to resolve path %s: %m", *p);

        /* Drop the old string and take over the resolved one. */
        return free_and_replace(*p, resolved);
}
2978
03cfe0d5 2979static int determine_uid_shift(const char *directory) {
6dac160c
LP
2980 int r;
2981
0de7acce 2982 if (arg_userns_mode == USER_NAMESPACE_NO) {
03cfe0d5 2983 arg_uid_shift = 0;
6dac160c 2984 return 0;
03cfe0d5 2985 }
6dac160c
LP
2986
2987 if (arg_uid_shift == UID_INVALID) {
2988 struct stat st;
2989
03cfe0d5 2990 r = stat(directory, &st);
6dac160c 2991 if (r < 0)
03cfe0d5 2992 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
6dac160c
LP
2993
2994 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2995
baaa35ad
ZJS
2996 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000)))
2997 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2998 "UID and GID base of %s don't match.", directory);
6dac160c
LP
2999
3000 arg_uid_range = UINT32_C(0x10000);
3001 }
3002
baaa35ad
ZJS
3003 if (arg_uid_shift > (uid_t) -1 - arg_uid_range)
3004 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
3005 "UID base too high for UID range.");
6dac160c 3006
6dac160c
LP
3007 return 0;
3008}
3009
de40a303
LP
3010static unsigned long effective_clone_ns_flags(void) {
3011 unsigned long flags = arg_clone_ns_flags;
3012
3013 if (arg_private_network)
3014 flags |= CLONE_NEWNET;
3015 if (arg_use_cgns)
3016 flags |= CLONE_NEWCGROUP;
3017 if (arg_userns_mode != USER_NAMESPACE_NO)
3018 flags |= CLONE_NEWUSER;
3019
3020 return flags;
3021}
3022
3023static int patch_sysctl(void) {
3024
3025 /* This table is inspired by runc's sysctl() function */
3026 static const struct {
3027 const char *key;
3028 bool prefix;
3029 unsigned long clone_flags;
3030 } safe_sysctl[] = {
3031 { "kernel.hostname", false, CLONE_NEWUTS },
3032 { "kernel.domainname", false, CLONE_NEWUTS },
3033 { "kernel.msgmax", false, CLONE_NEWIPC },
3034 { "kernel.msgmnb", false, CLONE_NEWIPC },
3035 { "kernel.msgmni", false, CLONE_NEWIPC },
3036 { "kernel.sem", false, CLONE_NEWIPC },
3037 { "kernel.shmall", false, CLONE_NEWIPC },
3038 { "kernel.shmmax", false, CLONE_NEWIPC },
3039 { "kernel.shmmni", false, CLONE_NEWIPC },
3040 { "fs.mqueue.", true, CLONE_NEWIPC },
3041 { "net.", true, CLONE_NEWNET },
3042 };
3043
3044 unsigned long flags;
3045 char **k, **v;
3046 int r;
3047
3048 flags = effective_clone_ns_flags();
3049
3050 STRV_FOREACH_PAIR(k, v, arg_sysctl) {
3051 bool good = false;
3052 size_t i;
3053
3054 for (i = 0; i < ELEMENTSOF(safe_sysctl); i++) {
3055
3056 if (!FLAGS_SET(flags, safe_sysctl[i].clone_flags))
3057 continue;
3058
3059 if (safe_sysctl[i].prefix)
3060 good = startswith(*k, safe_sysctl[i].key);
3061 else
3062 good = streq(*k, safe_sysctl[i].key);
3063
3064 if (good)
3065 break;
3066 }
3067
c6147113
LP
3068 if (!good)
3069 return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Refusing to write to sysctl '%s', as it is not safe in the selected namespaces.", *k);
de40a303
LP
3070
3071 r = sysctl_write(*k, *v);
3072 if (r < 0)
3073 return log_error_errno(r, "Failed to write sysctl '%s': %m", *k);
3074 }
3075
3076 return 0;
3077}
3078
03cfe0d5
LP
3079static int inner_child(
3080 Barrier *barrier,
3081 const char *directory,
3082 bool secondary,
3083 int kmsg_socket,
3084 int rtnl_socket,
3acc84eb 3085 int master_pty_socket,
e1bb4b0d
LB
3086 FDSet *fds,
3087 char **os_release_pairs) {
69c79d3c 3088
03cfe0d5 3089 _cleanup_free_ char *home = NULL;
b5ea030d 3090 char as_uuid[ID128_UUID_STRING_MAX];
88614c8a 3091 size_t n_env = 1;
03cfe0d5 3092 const char *envp[] = {
0c300adf 3093 "PATH=" DEFAULT_PATH_COMPAT,
6aadfa4c 3094 NULL, /* container */
03cfe0d5
LP
3095 NULL, /* TERM */
3096 NULL, /* HOME */
3097 NULL, /* USER */
3098 NULL, /* LOGNAME */
3099 NULL, /* container_uuid */
3100 NULL, /* LISTEN_FDS */
3101 NULL, /* LISTEN_PID */
9c1e04d0 3102 NULL, /* NOTIFY_SOCKET */
3652872a 3103 NULL, /* CREDENTIALS_DIRECTORY */
03cfe0d5
LP
3104 NULL
3105 };
1a68e1e5 3106 const char *exec_target;
2371271c 3107 _cleanup_strv_free_ char **env_use = NULL;
de40a303 3108 int r, which_failed;
88213476 3109
b37469d7
LP
3110 /* This is the "inner" child process, i.e. the one forked off by the "outer" child process, which is the one
3111 * the container manager itself forked off. At the time of clone() it gained its own CLONE_NEWNS, CLONE_NEWPID,
3112 * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER namespaces. Note that it has its own CLONE_NEWNS namespace,
3113 * separate from the CLONE_NEWNS created for the "outer" child, and also separate from the host's CLONE_NEWNS
3114 * namespace. The reason for having two levels of CLONE_NEWNS namespaces is that the "inner" one is owned by
3115 * the CLONE_NEWUSER namespace of the container, while the "outer" one is owned by the host's CLONE_NEWUSER
3116 * namespace.
3117 *
3118 * Note at this point we have no CLONE_NEWNET namespace yet. We'll acquire that one later through
3119 * unshare(). See below. */
3120
03cfe0d5
LP
3121 assert(barrier);
3122 assert(directory);
3123 assert(kmsg_socket >= 0);
88213476 3124
de40a303
LP
3125 log_debug("Inner child is initializing.");
3126
0de7acce 3127 if (arg_userns_mode != USER_NAMESPACE_NO) {
03cfe0d5
LP
3128 /* Tell the parent, that it now can write the UID map. */
3129 (void) barrier_place(barrier); /* #1 */
7027ff61 3130
03cfe0d5 3131 /* Wait until the parent wrote the UID map */
baaa35ad 3132 if (!barrier_place_and_sync(barrier)) /* #2 */
2a2e78e9 3133 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Parent died too early");
88213476 3134
2a2e78e9
LP
3135 /* Become the new root user inside our namespace */
3136 r = reset_uid_gid();
3137 if (r < 0)
3138 return log_error_errno(r, "Couldn't become new root: %m");
3139
3140 /* Creating a new user namespace means all MS_SHARED mounts become MS_SLAVE. Let's put them
3141 * back to MS_SHARED here, since that's what we want as defaults. (This will not reconnect
3142 * propagation, but simply create new peer groups for all our mounts). */
3143 r = mount_verbose(LOG_ERR, NULL, "/", NULL, MS_SHARED|MS_REC, NULL);
3144 if (r < 0)
3145 return r;
3146 }
6d66bd3b 3147
0de7acce 3148 r = mount_all(NULL,
4f086aab 3149 arg_mount_settings | MOUNT_IN_USERNS,
0de7acce 3150 arg_uid_shift,
0de7acce 3151 arg_selinux_apifs_context);
03cfe0d5
LP
3152 if (r < 0)
3153 return r;
3154
04413780
ZJS
3155 if (!arg_network_namespace_path && arg_private_network) {
3156 r = unshare(CLONE_NEWNET);
3157 if (r < 0)
3158 return log_error_errno(errno, "Failed to unshare network namespace: %m");
75116558
PS
3159
3160 /* Tell the parent that it can setup network interfaces. */
3161 (void) barrier_place(barrier); /* #3 */
04413780
ZJS
3162 }
3163
4f086aab 3164 r = mount_sysfs(NULL, arg_mount_settings);
d8fc6a00
LP
3165 if (r < 0)
3166 return r;
3167
03cfe0d5
LP
3168 /* Wait until we are cgroup-ified, so that we
3169 * can mount the right cgroup path writable */
baaa35ad
ZJS
3170 if (!barrier_place_and_sync(barrier)) /* #4 */
3171 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
3172 "Parent died too early");
88213476 3173
489fae52 3174 if (arg_use_cgns) {
0996ef00
CB
3175 r = unshare(CLONE_NEWCGROUP);
3176 if (r < 0)
04413780 3177 return log_error_errno(errno, "Failed to unshare cgroup namespace: %m");
0996ef00
CB
3178 r = mount_cgroups(
3179 "",
3180 arg_unified_cgroup_hierarchy,
3181 arg_userns_mode != USER_NAMESPACE_NO,
3182 arg_uid_shift,
3183 arg_uid_range,
5a8ff0e6 3184 arg_selinux_apifs_context,
ada54120 3185 true);
1433e0f2 3186 } else
0996ef00 3187 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
1433e0f2
LP
3188 if (r < 0)
3189 return r;
ec16945e 3190
1e4f1671 3191 r = setup_boot_id();
03cfe0d5
LP
3192 if (r < 0)
3193 return r;
ec16945e 3194
1e4f1671 3195 r = setup_kmsg(kmsg_socket);
03cfe0d5
LP
3196 if (r < 0)
3197 return r;
3198 kmsg_socket = safe_close(kmsg_socket);
ec16945e 3199
de40a303
LP
3200 r = mount_custom(
3201 "/",
3202 arg_custom_mounts,
3203 arg_n_custom_mounts,
de40a303
LP
3204 0,
3205 arg_selinux_apifs_context,
5f0a6347 3206 MOUNT_NON_ROOT_ONLY | MOUNT_IN_USERNS);
de40a303
LP
3207 if (r < 0)
3208 return r;
3209
03cfe0d5
LP
3210 if (setsid() < 0)
3211 return log_error_errno(errno, "setsid() failed: %m");
3212
3213 if (arg_private_network)
df883de9 3214 (void) loopback_setup();
03cfe0d5 3215
7a8f6325
LP
3216 if (arg_expose_ports) {
3217 r = expose_port_send_rtnl(rtnl_socket);
3218 if (r < 0)
3219 return r;
3220 rtnl_socket = safe_close(rtnl_socket);
3221 }
03cfe0d5 3222
3acc84eb 3223 if (arg_console_mode != CONSOLE_PIPE) {
cd132992 3224 _cleanup_close_ int master = -1;
3acc84eb
FB
3225 _cleanup_free_ char *console = NULL;
3226
3227 /* Allocate a pty and make it available as /dev/console. */
dc98caea 3228 master = openpt_allocate(O_RDWR|O_NONBLOCK, &console);
3acc84eb 3229 if (master < 0)
dc98caea 3230 return log_error_errno(master, "Failed to allocate a pty: %m");
3acc84eb
FB
3231
3232 r = setup_dev_console(console);
3233 if (r < 0)
105a1a36 3234 return log_error_errno(r, "Failed to set up /dev/console: %m");
3acc84eb
FB
3235
3236 r = send_one_fd(master_pty_socket, master, 0);
3237 if (r < 0)
3238 return log_error_errno(r, "Failed to send master fd: %m");
3239 master_pty_socket = safe_close(master_pty_socket);
3240
3241 r = setup_stdio_as_dev_console();
3242 if (r < 0)
3243 return r;
3244 }
3245
de40a303
LP
3246 r = patch_sysctl();
3247 if (r < 0)
3248 return r;
3249
81f345df
LP
3250 if (arg_oom_score_adjust_set) {
3251 r = set_oom_score_adjust(arg_oom_score_adjust);
3252 if (r < 0)
3253 return log_error_errno(r, "Failed to adjust OOM score: %m");
3254 }
3255
0985c7c4
ZJS
3256 if (arg_cpu_set.set)
3257 if (sched_setaffinity(0, arg_cpu_set.allocated, arg_cpu_set.set) < 0)
d107bb7d
LP
3258 return log_error_errno(errno, "Failed to set CPU affinity: %m");
3259
c818eef1 3260 (void) setup_hostname();
03cfe0d5 3261
050f7277 3262 if (arg_personality != PERSONALITY_INVALID) {
21022b9d
LP
3263 r = safe_personality(arg_personality);
3264 if (r < 0)
3265 return log_error_errno(r, "personality() failed: %m");
03cfe0d5 3266 } else if (secondary) {
21022b9d
LP
3267 r = safe_personality(PER_LINUX32);
3268 if (r < 0)
3269 return log_error_errno(r, "personality() failed: %m");
03cfe0d5
LP
3270 }
3271
de40a303
LP
3272 r = setrlimit_closest_all((const struct rlimit *const*) arg_rlimit, &which_failed);
3273 if (r < 0)
3274 return log_error_errno(r, "Failed to apply resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
3275
3276#if HAVE_SECCOMP
3277 if (arg_seccomp) {
3278
3279 if (is_seccomp_available()) {
3280
3281 r = seccomp_load(arg_seccomp);
7bc5e0b1 3282 if (ERRNO_IS_SECCOMP_FATAL(r))
de40a303
LP
3283 return log_error_errno(r, "Failed to install seccomp filter: %m");
3284 if (r < 0)
3285 log_debug_errno(r, "Failed to install seccomp filter: %m");
3286 }
3287 } else
3288#endif
3289 {
6b000af4 3290 r = setup_seccomp(arg_caps_retain, arg_syscall_allow_list, arg_syscall_deny_list);
de40a303
LP
3291 if (r < 0)
3292 return r;
3293 }
3294
349cc4a5 3295#if HAVE_SELINUX
03cfe0d5 3296 if (arg_selinux_context)
2ed96880 3297 if (setexeccon(arg_selinux_context) < 0)
03cfe0d5
LP
3298 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
3299#endif
3300
de40a303
LP
3301 /* Make sure we keep the caps across the uid/gid dropping, so that we can retain some selected caps
3302 * if we need to later on. */
3303 if (prctl(PR_SET_KEEPCAPS, 1) < 0)
3304 return log_error_errno(errno, "Failed to set PR_SET_KEEPCAPS: %m");
3305
3306 if (uid_is_valid(arg_uid) || gid_is_valid(arg_gid))
3307 r = change_uid_gid_raw(arg_uid, arg_gid, arg_supplementary_gids, arg_n_supplementary_gids);
3308 else
3309 r = change_uid_gid(arg_user, &home);
03cfe0d5
LP
3310 if (r < 0)
3311 return r;
3312
de40a303
LP
3313 r = drop_capabilities(getuid());
3314 if (r < 0)
3315 return log_error_errno(r, "Dropping capabilities failed: %m");
3316
66edd963
LP
3317 if (arg_no_new_privileges)
3318 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
3319 return log_error_errno(errno, "Failed to disable new privileges: %m");
3320
6aadfa4c
ILG
3321 /* LXC sets container=lxc, so follow the scheme here */
3322 envp[n_env++] = strjoina("container=", arg_container_service_name);
3323
03cfe0d5
LP
3324 envp[n_env] = strv_find_prefix(environ, "TERM=");
3325 if (envp[n_env])
313cefa1 3326 n_env++;
03cfe0d5 3327
de40a303
LP
3328 if (home || !uid_is_valid(arg_uid) || arg_uid == 0)
3329 if (asprintf((char**)(envp + n_env++), "HOME=%s", home ?: "/root") < 0)
3330 return log_oom();
3331
3332 if (arg_user || !uid_is_valid(arg_uid) || arg_uid == 0)
3333 if (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ?: "root") < 0 ||
3334 asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)
3335 return log_oom();
03cfe0d5 3336
3bbaff3e 3337 assert(!sd_id128_is_null(arg_uuid));
03cfe0d5 3338
691675ba 3339 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_to_uuid_string(arg_uuid, as_uuid)) < 0)
e01ff70a 3340 return log_oom();
03cfe0d5
LP
3341
3342 if (fdset_size(fds) > 0) {
3343 r = fdset_cloexec(fds, false);
3344 if (r < 0)
3345 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
3346
3347 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
3348 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
3349 return log_oom();
3350 }
9c1e04d0
AP
3351 if (asprintf((char **)(envp + n_env++), "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
3352 return log_oom();
03cfe0d5 3353
3652872a
LP
3354 if (arg_n_credentials > 0) {
3355 envp[n_env] = strdup("CREDENTIALS_DIRECTORY=/run/host/credentials");
3356 if (!envp[n_env])
3357 return log_oom();
3358 n_env++;
3359 }
3360
ed4512d0 3361 env_use = strv_env_merge(3, envp, os_release_pairs, arg_setenv);
2371271c
TG
3362 if (!env_use)
3363 return log_oom();
03cfe0d5
LP
3364
3365 /* Let the parent know that we are ready and
3366 * wait until the parent is ready with the
3367 * setup, too... */
baaa35ad
ZJS
3368 if (!barrier_place_and_sync(barrier)) /* #5 */
3369 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
3370 "Parent died too early");
03cfe0d5 3371
5f932eb9
LP
3372 if (arg_chdir)
3373 if (chdir(arg_chdir) < 0)
3374 return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
3375
7732f92b 3376 if (arg_start_mode == START_PID2) {
75bf701f 3377 r = stub_pid1(arg_uuid);
7732f92b
LP
3378 if (r < 0)
3379 return r;
3380 }
3381
de40a303
LP
3382 log_debug("Inner child completed, invoking payload.");
3383
8ca082b4
LP
3384 /* Now, explicitly close the log, so that we then can close all remaining fds. Closing the log explicitly first
3385 * has the benefit that the logging subsystem knows about it, and is thus ready to be reopened should we need
3386 * it again. Note that the other fds closed here are at least the locking and barrier fds. */
03cfe0d5 3387 log_close();
8ca082b4
LP
3388 log_set_open_when_needed(true);
3389
03cfe0d5
LP
3390 (void) fdset_close_others(fds);
3391
7732f92b 3392 if (arg_start_mode == START_BOOT) {
03cfe0d5
LP
3393 char **a;
3394 size_t m;
3395
3396 /* Automatically search for the init system */
3397
75f32f04
ZJS
3398 m = strv_length(arg_parameters);
3399 a = newa(char*, m + 2);
3400 memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
3401 a[1 + m] = NULL;
03cfe0d5 3402
ced58da7 3403 a[0] = (char*) "/usr/lib/systemd/systemd";
03cfe0d5
LP
3404 execve(a[0], a, env_use);
3405
ced58da7 3406 a[0] = (char*) "/lib/systemd/systemd";
03cfe0d5
LP
3407 execve(a[0], a, env_use);
3408
ced58da7 3409 a[0] = (char*) "/sbin/init";
03cfe0d5 3410 execve(a[0], a, env_use);
ced58da7
LP
3411
3412 exec_target = "/usr/lib/systemd/systemd, /lib/systemd/systemd, /sbin/init";
1a68e1e5 3413 } else if (!strv_isempty(arg_parameters)) {
b6b180b7
LP
3414 const char *dollar_path;
3415
1a68e1e5 3416 exec_target = arg_parameters[0];
b6b180b7
LP
3417
3418 /* Use the user supplied search $PATH if there is one, or DEFAULT_PATH_COMPAT if not to search the
3419 * binary. */
3420 dollar_path = strv_env_get(env_use, "PATH");
3421 if (dollar_path) {
6f646e01 3422 if (setenv("PATH", dollar_path, 1) < 0)
b6b180b7
LP
3423 return log_error_errno(errno, "Failed to update $PATH: %m");
3424 }
3425
f757855e 3426 execvpe(arg_parameters[0], arg_parameters, env_use);
1a68e1e5 3427 } else {
5f932eb9 3428 if (!arg_chdir)
d929b0f9
ZJS
3429 /* If we cannot change the directory, we'll end up in /, that is expected. */
3430 (void) chdir(home ?: "/root");
5f932eb9 3431
03cfe0d5
LP
3432 execle("/bin/bash", "-bash", NULL, env_use);
3433 execle("/bin/sh", "-sh", NULL, env_use);
ced58da7
LP
3434
3435 exec_target = "/bin/bash, /bin/sh";
03cfe0d5
LP
3436 }
3437
8ca082b4 3438 return log_error_errno(errno, "execv(%s) failed: %m", exec_target);
03cfe0d5
LP
3439}
3440
e96ceaba 3441static int setup_notify_child(void) {
271f518f 3442 _cleanup_close_ int fd = -1;
9c1e04d0 3443 union sockaddr_union sa = {
44ed5214
LP
3444 .un.sun_family = AF_UNIX,
3445 .un.sun_path = NSPAWN_NOTIFY_SOCKET_PATH,
9c1e04d0
AP
3446 };
3447 int r;
3448
3449 fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
3450 if (fd < 0)
3451 return log_error_errno(errno, "Failed to allocate notification socket: %m");
3452
3453 (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
fbda85b0 3454 (void) sockaddr_un_unlink(&sa.un);
9c1e04d0 3455
9c1e04d0 3456 r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
271f518f 3457 if (r < 0)
44ed5214 3458 return log_error_errno(errno, "bind(" NSPAWN_NOTIFY_SOCKET_PATH ") failed: %m");
9c1e04d0 3459
adc7d9f0 3460 r = userns_lchown(NSPAWN_NOTIFY_SOCKET_PATH, 0, 0);
271f518f 3461 if (r < 0)
adc7d9f0 3462 return log_error_errno(r, "Failed to chown " NSPAWN_NOTIFY_SOCKET_PATH ": %m");
adc7d9f0 3463
2ff48e98 3464 r = setsockopt_int(fd, SOL_SOCKET, SO_PASSCRED, true);
271f518f 3465 if (r < 0)
2ff48e98 3466 return log_error_errno(r, "SO_PASSCRED failed: %m");
9c1e04d0 3467
271f518f 3468 return TAKE_FD(fd);
9c1e04d0
AP
3469}
3470
03cfe0d5
LP
3471static int outer_child(
3472 Barrier *barrier,
3473 const char *directory,
2d845785 3474 DissectedImage *dissected_image,
03cfe0d5
LP
3475 bool secondary,
3476 int pid_socket,
e01ff70a 3477 int uuid_socket,
9c1e04d0 3478 int notify_socket,
03cfe0d5
LP
3479 int kmsg_socket,
3480 int rtnl_socket,
825d5287 3481 int uid_shift_socket,
3acc84eb 3482 int master_pty_socket,
8199d554 3483 int unified_cgroup_hierarchy_socket,
d7bea6b6
DP
3484 FDSet *fds,
3485 int netns_fd) {
03cfe0d5 3486
e1bb4b0d 3487 _cleanup_strv_free_ char **os_release_pairs = NULL;
bf428efb 3488 _cleanup_close_ int fd = -1;
e5f10caf 3489 const char *p;
03cfe0d5
LP
3490 pid_t pid;
3491 ssize_t l;
de40a303 3492 int r;
03cfe0d5 3493
b37469d7
LP
3494 /* This is the "outer" child process, i.e the one forked off by the container manager itself. It already has
3495 * its own CLONE_NEWNS namespace (which was created by the clone()). It still lives in the host's CLONE_NEWPID,
3496 * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER and CLONE_NEWNET namespaces. After it completed a number of
3497 * initializations a second child (the "inner" one) is forked off it, and it exits. */
3498
03cfe0d5
LP
3499 assert(barrier);
3500 assert(directory);
03cfe0d5 3501 assert(pid_socket >= 0);
e01ff70a 3502 assert(uuid_socket >= 0);
9c1e04d0 3503 assert(notify_socket >= 0);
3acc84eb 3504 assert(master_pty_socket >= 0);
03cfe0d5
LP
3505 assert(kmsg_socket >= 0);
3506
de40a303
LP
3507 log_debug("Outer child is initializing.");
3508
e1bb4b0d
LB
3509 r = load_os_release_pairs_with_prefix("/", "container_host_", &os_release_pairs);
3510 if (r < 0)
3511 log_debug_errno(r, "Failed to read os-release from host for container, ignoring: %m");
3512
03cfe0d5
LP
3513 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
3514 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3515
03cfe0d5
LP
3516 r = reset_audit_loginuid();
3517 if (r < 0)
3518 return r;
3519
2a2e78e9
LP
3520 /* Mark everything as slave, so that we still receive mounts from the real root, but don't propagate
3521 * mounts to the real root. */
60e76d48
ZJS
3522 r = mount_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
3523 if (r < 0)
3524 return r;
03cfe0d5 3525
2d845785 3526 if (dissected_image) {
2d3a5a73
LP
3527 /* If we are operating on a disk image, then mount its root directory now, but leave out the rest. We
3528 * can read the UID shift from it if we need to. Further down we'll mount the rest, but then with the
3529 * uid shift known. That way we can mount VFAT file systems shifted to the right place right away. This
3530 * makes sure ESP partitions and userns are compatible. */
3531
af187ab2
LP
3532 r = dissected_image_mount_and_warn(
3533 dissected_image, directory, arg_uid_shift,
3534 DISSECT_IMAGE_MOUNT_ROOT_ONLY|DISSECT_IMAGE_DISCARD_ON_LOOP|
3535 (arg_read_only ? DISSECT_IMAGE_READ_ONLY : DISSECT_IMAGE_FSCK)|
3536 (arg_start_mode == START_BOOT ? DISSECT_IMAGE_VALIDATE_OS : 0));
2d845785 3537 if (r < 0)
af187ab2 3538 return r;
2d845785 3539 }
03cfe0d5 3540
391567f4
LP
3541 r = determine_uid_shift(directory);
3542 if (r < 0)
3543 return r;
3544
0de7acce 3545 if (arg_userns_mode != USER_NAMESPACE_NO) {
0e7ac751 3546 /* Let the parent know which UID shift we read from the image */
825d5287
RM
3547 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
3548 if (l < 0)
3549 return log_error_errno(errno, "Failed to send UID shift: %m");
baaa35ad
ZJS
3550 if (l != sizeof(arg_uid_shift))
3551 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3552 "Short write while sending UID shift.");
0e7ac751 3553
0de7acce 3554 if (arg_userns_mode == USER_NAMESPACE_PICK) {
0e7ac751
LP
3555 /* When we are supposed to pick the UID shift, the parent will check now whether the UID shift
3556 * we just read from the image is available. If yes, it will send the UID shift back to us, if
3557 * not it will pick a different one, and send it back to us. */
3558
3559 l = recv(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
3560 if (l < 0)
3561 return log_error_errno(errno, "Failed to recv UID shift: %m");
baaa35ad
ZJS
3562 if (l != sizeof(arg_uid_shift))
3563 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3564 "Short read while receiving UID shift.");
0e7ac751
LP
3565 }
3566
ff6c6cc1
LP
3567 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
3568 "Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
825d5287
RM
3569 }
3570
6f83d3d1
LP
3571 if (path_equal(directory, "/")) {
3572 /* If the directory we shall boot is the host, let's operate on a bind mount at a different
3573 * place, so that we can make changes to its mount structure (for example, to implement
3574 * --volatile=) without this interfering with our ability to access files such as
3575 * /etc/localtime to copy into the container. Note that we use a fixed place for this
3576 * (instead of a temporary directory, since we are living in our own mount namspace here
3577 * already, and thus don't need to be afraid of colliding with anyone else's mounts).*/
3578 (void) mkdir_p("/run/systemd/nspawn-root", 0755);
3579
3580 r = mount_verbose(LOG_ERR, "/", "/run/systemd/nspawn-root", NULL, MS_BIND|MS_REC, NULL);
3581 if (r < 0)
3582 return r;
3583
3584 directory = "/run/systemd/nspawn-root";
e50cd82f 3585 }
7d0ecdd6
LP
3586
3587 r = setup_pivot_root(
3588 directory,
3589 arg_pivot_root_new,
3590 arg_pivot_root_old);
3591 if (r < 0)
3592 return r;
3593
3594 r = setup_volatile_mode(
3595 directory,
3596 arg_volatile_mode,
7d0ecdd6 3597 arg_uid_shift,
8f1ed04a 3598 arg_selinux_apifs_context);
7d0ecdd6
LP
3599 if (r < 0)
3600 return r;
3601
5f0a6347
DDM
3602 r = mount_custom(
3603 directory,
3604 arg_custom_mounts,
3605 arg_n_custom_mounts,
5f0a6347 3606 arg_uid_shift,
5f0a6347
DDM
3607 arg_selinux_apifs_context,
3608 MOUNT_ROOT_ONLY);
3609 if (r < 0)
3610 return r;
3611
5530dc87
DDM
3612 /* Make sure we always have a mount that we can move to root later on. */
3613 if (!path_is_mount_point(directory, NULL, 0)) {
3614 r = mount_verbose(LOG_ERR, directory, directory, NULL, MS_BIND|MS_REC, NULL);
3615 if (r < 0)
3616 return r;
3617 }
3618
2d3a5a73
LP
3619 if (dissected_image) {
3620 /* Now we know the uid shift, let's now mount everything else that might be in the image. */
3621 r = dissected_image_mount(dissected_image, directory, arg_uid_shift,
4fcb96ce
LP
3622 DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY|DISSECT_IMAGE_DISCARD_ON_LOOP|(arg_read_only ? DISSECT_IMAGE_READ_ONLY : DISSECT_IMAGE_FSCK));
3623 if (r == -EUCLEAN)
3624 return log_error_errno(r, "File system check for image failed: %m");
2d3a5a73 3625 if (r < 0)
4fcb96ce 3626 return log_error_errno(r, "Failed to mount image file system: %m");
2d3a5a73
LP
3627 }
3628
8199d554
LP
3629 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
3630 /* OK, we don't know yet which cgroup mode to use yet. Let's figure it out, and tell the parent. */
3631
3632 r = detect_unified_cgroup_hierarchy_from_image(directory);
3633 if (r < 0)
3634 return r;
3635
3636 l = send(unified_cgroup_hierarchy_socket, &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), MSG_NOSIGNAL);
3637 if (l < 0)
3638 return log_error_errno(errno, "Failed to send cgroup mode: %m");
baaa35ad
ZJS
3639 if (l != sizeof(arg_unified_cgroup_hierarchy))
3640 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3641 "Short write while sending cgroup mode.");
8199d554
LP
3642
3643 unified_cgroup_hierarchy_socket = safe_close(unified_cgroup_hierarchy_socket);
3644 }
3645
4ad14eff
LP
3646 /* Mark everything as shared so our mounts get propagated down. This is
3647 * required to make new bind mounts available in systemd services
5238e957 3648 * inside the container that create a new mount namespace.
4ad14eff
LP
3649 * See https://github.com/systemd/systemd/issues/3860
3650 * Further submounts (such as /dev) done after this will inherit the
5f0a6347
DDM
3651 * shared propagation mode.
3652 *
3653 * IMPORTANT: Do not overmount the root directory anymore from now on to
3654 * enable moving the root directory mount to root later on.
3655 * https://github.com/systemd/systemd/issues/3847#issuecomment-562735251
3656 */
4ad14eff
LP
3657 r = mount_verbose(LOG_ERR, NULL, directory, NULL, MS_SHARED|MS_REC, NULL);
3658 if (r < 0)
3659 return r;
3660
3661 r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
3662 if (r < 0)
3663 return r;
3664
03cfe0d5
LP
3665 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
3666 if (r < 0)
3667 return r;
3668
bbd407ea
DDM
3669 if (arg_read_only && arg_volatile_mode == VOLATILE_NO &&
3670 !has_custom_root_mount(arg_custom_mounts, arg_n_custom_mounts)) {
64e82c19 3671 r = bind_remount_recursive(directory, MS_RDONLY, MS_RDONLY, NULL);
03cfe0d5
LP
3672 if (r < 0)
3673 return log_error_errno(r, "Failed to make tree read-only: %m");
3674 }
3675
0de7acce 3676 r = mount_all(directory,
4f086aab 3677 arg_mount_settings,
0de7acce 3678 arg_uid_shift,
0de7acce 3679 arg_selinux_apifs_context);
03cfe0d5
LP
3680 if (r < 0)
3681 return r;
3682
07fa00f9
LP
3683 r = copy_devnodes(directory);
3684 if (r < 0)
03cfe0d5
LP
3685 return r;
3686
de40a303
LP
3687 r = make_extra_nodes(directory);
3688 if (r < 0)
3689 return r;
3690
3691 (void) dev_setup(directory, arg_uid_shift, arg_uid_shift);
e5f10caf 3692
9fac5029 3693 p = prefix_roota(directory, "/run/host");
e5f10caf 3694 (void) make_inaccessible_nodes(p, arg_uid_shift, arg_uid_shift);
03cfe0d5 3695
07fa00f9
LP
3696 r = setup_pts(directory);
3697 if (r < 0)
03cfe0d5
LP
3698 return r;
3699
3700 r = setup_propagate(directory);
3701 if (r < 0)
3702 return r;
3703
8e5430c4
LP
3704 r = setup_keyring();
3705 if (r < 0)
3706 return r;
3707
3652872a
LP
3708 r = setup_credentials(directory);
3709 if (r < 0)
3710 return r;
3711
5c4deb9a
MJ
3712 r = mount_custom(
3713 directory,
3714 arg_custom_mounts,
3715 arg_n_custom_mounts,
3716 arg_uid_shift,
3717 arg_selinux_apifs_context,
3718 MOUNT_NON_ROOT_ONLY);
3719 if (r < 0)
3720 return r;
3721
03cfe0d5
LP
3722 r = setup_timezone(directory);
3723 if (r < 0)
3724 return r;
3725
3726 r = setup_resolv_conf(directory);
3727 if (r < 0)
3728 return r;
3729
e01ff70a
MS
3730 r = setup_machine_id(directory);
3731 if (r < 0)
3732 return r;
3733
03cfe0d5
LP
3734 r = setup_journal(directory);
3735 if (r < 0)
3736 return r;
3737
0f48ba7b
LP
3738 /* The same stuff as the $container env var, but nicely readable for the entire payload */
3739 p = prefix_roota(directory, "/run/host/container-manager");
3740 (void) write_string_file(p, arg_container_service_name, WRITE_STRING_FILE_CREATE);
3741
3742 /* The same stuff as the $container_uuid env var */
3743 p = prefix_roota(directory, "/run/host/container-uuid");
3744 (void) write_string_filef(p, WRITE_STRING_FILE_CREATE, SD_ID128_UUID_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid));
3745
489fae52 3746 if (!arg_use_cgns) {
0996ef00
CB
3747 r = mount_cgroups(
3748 directory,
3749 arg_unified_cgroup_hierarchy,
3750 arg_userns_mode != USER_NAMESPACE_NO,
3751 arg_uid_shift,
3752 arg_uid_range,
5a8ff0e6 3753 arg_selinux_apifs_context,
ada54120 3754 false);
0996ef00
CB
3755 if (r < 0)
3756 return r;
3757 }
03cfe0d5
LP
3758
3759 r = mount_move_root(directory);
3760 if (r < 0)
3761 return log_error_errno(r, "Failed to move root directory: %m");
3762
e96ceaba 3763 fd = setup_notify_child();
9c1e04d0
AP
3764 if (fd < 0)
3765 return fd;
3766
03cfe0d5 3767 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
0c582db0 3768 arg_clone_ns_flags |
8869a0b4 3769 (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
03cfe0d5
LP
3770 if (pid < 0)
3771 return log_error_errno(errno, "Failed to fork inner child: %m");
03cfe0d5
LP
3772 if (pid == 0) {
3773 pid_socket = safe_close(pid_socket);
e01ff70a 3774 uuid_socket = safe_close(uuid_socket);
9c1e04d0 3775 notify_socket = safe_close(notify_socket);
825d5287 3776 uid_shift_socket = safe_close(uid_shift_socket);
03cfe0d5 3777
2a2e78e9
LP
3778 /* The inner child has all namespaces that are requested, so that we all are owned by the
3779 * user if user namespaces are turned on. */
03cfe0d5 3780
d7bea6b6
DP
3781 if (arg_network_namespace_path) {
3782 r = namespace_enter(-1, -1, netns_fd, -1, -1);
3783 if (r < 0)
e2d39e54 3784 return log_error_errno(r, "Failed to join network namespace: %m");
d7bea6b6
DP
3785 }
3786
e1bb4b0d 3787 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, master_pty_socket, fds, os_release_pairs);
03cfe0d5
LP
3788 if (r < 0)
3789 _exit(EXIT_FAILURE);
3790
3791 _exit(EXIT_SUCCESS);
3792 }
3793
3794 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
3795 if (l < 0)
3796 return log_error_errno(errno, "Failed to send PID: %m");
baaa35ad
ZJS
3797 if (l != sizeof(pid))
3798 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3799 "Short write while sending PID.");
03cfe0d5 3800
e01ff70a
MS
3801 l = send(uuid_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
3802 if (l < 0)
3803 return log_error_errno(errno, "Failed to send machine ID: %m");
baaa35ad
ZJS
3804 if (l != sizeof(arg_uuid))
3805 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3806 "Short write while sending machine ID.");
e01ff70a 3807
9c1e04d0
AP
3808 l = send_one_fd(notify_socket, fd, 0);
3809 if (l < 0)
ba72801d 3810 return log_error_errno(l, "Failed to send notify fd: %m");
9c1e04d0 3811
03cfe0d5 3812 pid_socket = safe_close(pid_socket);
e01ff70a 3813 uuid_socket = safe_close(uuid_socket);
9c1e04d0 3814 notify_socket = safe_close(notify_socket);
3acc84eb 3815 master_pty_socket = safe_close(master_pty_socket);
327e26d6
KN
3816 kmsg_socket = safe_close(kmsg_socket);
3817 rtnl_socket = safe_close(rtnl_socket);
d7bea6b6 3818 netns_fd = safe_close(netns_fd);
03cfe0d5
LP
3819
3820 return 0;
3821}
3822
0e7ac751 3823static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
d381c8a6 3824 bool tried_hashed = false;
0e7ac751
LP
3825 unsigned n_tries = 100;
3826 uid_t candidate;
3827 int r;
3828
3829 assert(shift);
3830 assert(ret_lock_file);
0de7acce 3831 assert(arg_userns_mode == USER_NAMESPACE_PICK);
0e7ac751
LP
3832 assert(arg_uid_range == 0x10000U);
3833
3834 candidate = *shift;
3835
3836 (void) mkdir("/run/systemd/nspawn-uid", 0755);
3837
3838 for (;;) {
fbd0b64f 3839 char lock_path[STRLEN("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
8e766630 3840 _cleanup_(release_lock_file) LockFile lf = LOCK_FILE_INIT;
0e7ac751
LP
3841
3842 if (--n_tries <= 0)
3843 return -EBUSY;
3844
87d5e4f2 3845 if (candidate < CONTAINER_UID_BASE_MIN || candidate > CONTAINER_UID_BASE_MAX)
0e7ac751
LP
3846 goto next;
3847 if ((candidate & UINT32_C(0xFFFF)) != 0)
3848 goto next;
3849
3850 xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
3851 r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
3852 if (r == -EBUSY) /* Range already taken by another nspawn instance */
3853 goto next;
3854 if (r < 0)
3855 return r;
3856
3857 /* Make some superficial checks whether the range is currently known in the user database */
3858 if (getpwuid(candidate))
3859 goto next;
3860 if (getpwuid(candidate + UINT32_C(0xFFFE)))
3861 goto next;
3862 if (getgrgid(candidate))
3863 goto next;
3864 if (getgrgid(candidate + UINT32_C(0xFFFE)))
3865 goto next;
3866
3867 *ret_lock_file = lf;
3868 lf = (struct LockFile) LOCK_FILE_INIT;
3869 *shift = candidate;
3870 return 0;
3871
3872 next:
d381c8a6
LP
3873 if (arg_machine && !tried_hashed) {
3874 /* Try to hash the base from the container name */
3875
3876 static const uint8_t hash_key[] = {
3877 0xe1, 0x56, 0xe0, 0xf0, 0x4a, 0xf0, 0x41, 0xaf,
3878 0x96, 0x41, 0xcf, 0x41, 0x33, 0x94, 0xff, 0x72
3879 };
3880
3881 candidate = (uid_t) siphash24(arg_machine, strlen(arg_machine), hash_key);
3882
3883 tried_hashed = true;
3884 } else
3885 random_bytes(&candidate, sizeof(candidate));
3886
87d5e4f2 3887 candidate = (candidate % (CONTAINER_UID_BASE_MAX - CONTAINER_UID_BASE_MIN)) + CONTAINER_UID_BASE_MIN;
0e7ac751
LP
3888 candidate &= (uid_t) UINT32_C(0xFFFF0000);
3889 }
3890}
3891
03cfe0d5 3892static int setup_uid_map(pid_t pid) {
fbd0b64f 3893 char uid_map[STRLEN("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
03cfe0d5
LP
3894 int r;
3895
3896 assert(pid > 1);
3897
3898 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
3899 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
57512c89 3900 r = write_string_file(uid_map, line, WRITE_STRING_FILE_DISABLE_BUFFER);
03cfe0d5
LP
3901 if (r < 0)
3902 return log_error_errno(r, "Failed to write UID map: %m");
3903
3904 /* We always assign the same UID and GID ranges */
3905 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
57512c89 3906 r = write_string_file(uid_map, line, WRITE_STRING_FILE_DISABLE_BUFFER);
03cfe0d5
LP
3907 if (r < 0)
3908 return log_error_errno(r, "Failed to write GID map: %m");
3909
3910 return 0;
3911}
3912
9c1e04d0 3913static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
9c1e04d0
AP
3914 char buf[NOTIFY_BUFFER_MAX+1];
3915 char *p = NULL;
3916 struct iovec iovec = {
3917 .iov_base = buf,
3918 .iov_len = sizeof(buf)-1,
3919 };
fb29cdbe
LP
3920 CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred)) +
3921 CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)) control;
9c1e04d0
AP
3922 struct msghdr msghdr = {
3923 .msg_iov = &iovec,
3924 .msg_iovlen = 1,
3925 .msg_control = &control,
3926 .msg_controllen = sizeof(control),
3927 };
371d72e0 3928 struct ucred *ucred;
9c1e04d0
AP
3929 ssize_t n;
3930 pid_t inner_child_pid;
3931 _cleanup_strv_free_ char **tags = NULL;
3932
3933 assert(userdata);
3934
3935 inner_child_pid = PTR_TO_PID(userdata);
3936
3937 if (revents != EPOLLIN) {
3938 log_warning("Got unexpected poll event for notify fd.");
3939 return 0;
3940 }
3941
3691bcf3
LP
3942 n = recvmsg_safe(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
3943 if (IN_SET(n, -EAGAIN, -EINTR))
3944 return 0;
3945 if (n < 0)
3946 return log_warning_errno(n, "Couldn't read notification socket: %m");
9c1e04d0 3947
9c1e04d0
AP
3948 cmsg_close_all(&msghdr);
3949
371d72e0 3950 ucred = CMSG_FIND_DATA(&msghdr, SOL_SOCKET, SCM_CREDENTIALS, struct ucred);
9c1e04d0 3951 if (!ucred || ucred->pid != inner_child_pid) {
8cb57430 3952 log_debug("Received notify message without valid credentials. Ignoring.");
9c1e04d0
AP
3953 return 0;
3954 }
3955
3956 if ((size_t) n >= sizeof(buf)) {
3957 log_warning("Received notify message exceeded maximum size. Ignoring.");
3958 return 0;
3959 }
3960
3961 buf[n] = 0;
3962 tags = strv_split(buf, "\n\r");
3963 if (!tags)
3964 return log_oom();
3965
3966 if (strv_find(tags, "READY=1"))
04f590a4 3967 (void) sd_notifyf(false, "READY=1\n");
9c1e04d0
AP
3968
3969 p = strv_find_startswith(tags, "STATUS=");
3970 if (p)
04f590a4 3971 (void) sd_notifyf(false, "STATUS=Container running: %s", p);
9c1e04d0
AP
3972
3973 return 0;
3974}
3975
e96ceaba 3976static int setup_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid, sd_event_source **notify_event_source) {
9c1e04d0 3977 int r;
9c1e04d0 3978
5773024d 3979 r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
9c1e04d0
AP
3980 if (r < 0)
3981 return log_error_errno(r, "Failed to allocate notify event source: %m");
3982
5773024d 3983 (void) sd_event_source_set_description(*notify_event_source, "nspawn-notify");
9c1e04d0
AP
3984
3985 return 0;
3986}
3987
5d961407
LP
3988static int merge_settings(Settings *settings, const char *path) {
3989 int rl;
f757855e 3990
5d961407
LP
3991 assert(settings);
3992 assert(path);
f757855e 3993
5d961407
LP
3994 /* Copy over bits from the settings, unless they have been explicitly masked by command line switches. Note
3995 * that this steals the fields of the Settings* structure, and hence modifies it. */
f757855e 3996
7732f92b
LP
3997 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
3998 settings->start_mode >= 0) {
3999 arg_start_mode = settings->start_mode;
130d3d22 4000 strv_free_and_replace(arg_parameters, settings->parameters);
f757855e
LP
4001 }
4002
a2f577fc
JL
4003 if ((arg_settings_mask & SETTING_EPHEMERAL) == 0)
4004 arg_ephemeral = settings->ephemeral;
4005
de40a303
LP
4006 if ((arg_settings_mask & SETTING_DIRECTORY) == 0 &&
4007 settings->root) {
4008
4009 if (!arg_settings_trusted)
4010 log_warning("Ignoring root directory setting, file %s is not trusted.", path);
4011 else
4012 free_and_replace(arg_directory, settings->root);
4013 }
4014
b53ede69
PW
4015 if ((arg_settings_mask & SETTING_PIVOT_ROOT) == 0 &&
4016 settings->pivot_root_new) {
4017 free_and_replace(arg_pivot_root_new, settings->pivot_root_new);
4018 free_and_replace(arg_pivot_root_old, settings->pivot_root_old);
4019 }
4020
5f932eb9 4021 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
1cc6c93a
YW
4022 settings->working_directory)
4023 free_and_replace(arg_chdir, settings->working_directory);
5f932eb9 4024
f757855e 4025 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
130d3d22
YW
4026 settings->environment)
4027 strv_free_and_replace(arg_setenv, settings->environment);
f757855e 4028
de40a303
LP
4029 if ((arg_settings_mask & SETTING_USER) == 0) {
4030
4031 if (settings->user)
4032 free_and_replace(arg_user, settings->user);
4033
4034 if (uid_is_valid(settings->uid))
4035 arg_uid = settings->uid;
4036 if (gid_is_valid(settings->gid))
4037 arg_gid = settings->gid;
4038 if (settings->n_supplementary_gids > 0) {
4039 free_and_replace(arg_supplementary_gids, settings->supplementary_gids);
4040 arg_n_supplementary_gids = settings->n_supplementary_gids;
4041 }
4042 }
f757855e
LP
4043
4044 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
a3fc6b55 4045 uint64_t plus, minus;
7be830c6 4046 uint64_t network_minus = 0;
f757855e 4047
de40a303
LP
4048 /* Note that we copy both the simple plus/minus caps here, and the full quintet from the
4049 * Settings structure */
4050
0e265674 4051 plus = settings->capability;
a3fc6b55
LP
4052 minus = settings->drop_capability;
4053
4054 if ((arg_settings_mask & SETTING_NETWORK) == 0) {
4055 if (settings_private_network(settings))
4056 plus |= UINT64_C(1) << CAP_NET_ADMIN;
4057 else
7be830c6 4058 network_minus |= UINT64_C(1) << CAP_NET_ADMIN;
a3fc6b55 4059 }
0e265674
LP
4060
4061 if (!arg_settings_trusted && plus != 0) {
4062 if (settings->capability != 0)
5d961407 4063 log_warning("Ignoring Capability= setting, file %s is not trusted.", path);
7be830c6
TH
4064 } else {
4065 arg_caps_retain &= ~network_minus;
520e0d54 4066 arg_caps_retain |= plus;
7be830c6 4067 }
f757855e 4068
a3fc6b55 4069 arg_caps_retain &= ~minus;
de40a303
LP
4070
4071 /* Copy the full capabilities over too */
4072 if (capability_quintet_is_set(&settings->full_capabilities)) {
4073 if (!arg_settings_trusted)
5238e957 4074 log_warning("Ignoring capability settings, file %s is not trusted.", path);
de40a303
LP
4075 else
4076 arg_full_capabilities = settings->full_capabilities;
4077 }
f757855e
LP
4078 }
4079
4080 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
4081 settings->kill_signal > 0)
4082 arg_kill_signal = settings->kill_signal;
4083
4084 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
4085 settings->personality != PERSONALITY_INVALID)
4086 arg_personality = settings->personality;
4087
4088 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
4089 !sd_id128_is_null(settings->machine_id)) {
4090
4091 if (!arg_settings_trusted)
5d961407 4092 log_warning("Ignoring MachineID= setting, file %s is not trusted.", path);
f757855e
LP
4093 else
4094 arg_uuid = settings->machine_id;
4095 }
4096
4097 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
4098 settings->read_only >= 0)
4099 arg_read_only = settings->read_only;
4100
4101 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
4102 settings->volatile_mode != _VOLATILE_MODE_INVALID)
4103 arg_volatile_mode = settings->volatile_mode;
4104
4105 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
4106 settings->n_custom_mounts > 0) {
4107
4108 if (!arg_settings_trusted)
5d961407 4109 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", path);
f757855e
LP
4110 else {
4111 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
1cc6c93a 4112 arg_custom_mounts = TAKE_PTR(settings->custom_mounts);
f757855e 4113 arg_n_custom_mounts = settings->n_custom_mounts;
f757855e
LP
4114 settings->n_custom_mounts = 0;
4115 }
4116 }
4117
4118 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
4119 (settings->private_network >= 0 ||
4120 settings->network_veth >= 0 ||
4121 settings->network_bridge ||
22b28dfd 4122 settings->network_zone ||
f757855e
LP
4123 settings->network_interfaces ||
4124 settings->network_macvlan ||
f6d6bad1 4125 settings->network_ipvlan ||
de40a303
LP
4126 settings->network_veth_extra ||
4127 settings->network_namespace_path)) {
f757855e
LP
4128
4129 if (!arg_settings_trusted)
5d961407 4130 log_warning("Ignoring network settings, file %s is not trusted.", path);
f757855e 4131 else {
f6d6bad1 4132 arg_network_veth = settings_network_veth(settings);
0e265674
LP
4133 arg_private_network = settings_private_network(settings);
4134
130d3d22
YW
4135 strv_free_and_replace(arg_network_interfaces, settings->network_interfaces);
4136 strv_free_and_replace(arg_network_macvlan, settings->network_macvlan);
4137 strv_free_and_replace(arg_network_ipvlan, settings->network_ipvlan);
4138 strv_free_and_replace(arg_network_veth_extra, settings->network_veth_extra);
f6d6bad1 4139
1cc6c93a
YW
4140 free_and_replace(arg_network_bridge, settings->network_bridge);
4141 free_and_replace(arg_network_zone, settings->network_zone);
de40a303
LP
4142
4143 free_and_replace(arg_network_namespace_path, settings->network_namespace_path);
f757855e
LP
4144 }
4145 }
4146
4147 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
4148 settings->expose_ports) {
4149
4150 if (!arg_settings_trusted)
5d961407 4151 log_warning("Ignoring Port= setting, file %s is not trusted.", path);
f757855e
LP
4152 else {
4153 expose_port_free_all(arg_expose_ports);
1cc6c93a 4154 arg_expose_ports = TAKE_PTR(settings->expose_ports);
f757855e
LP
4155 }
4156 }
4157
0de7acce
LP
4158 if ((arg_settings_mask & SETTING_USERNS) == 0 &&
4159 settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
4160
4161 if (!arg_settings_trusted)
5d961407 4162 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", path);
0de7acce
LP
4163 else {
4164 arg_userns_mode = settings->userns_mode;
4165 arg_uid_shift = settings->uid_shift;
4166 arg_uid_range = settings->uid_range;
4167 arg_userns_chown = settings->userns_chown;
4168 }
4169 }
4170
9c1e04d0
AP
4171 if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0)
4172 arg_notify_ready = settings->notify_ready;
4173
960e4569
LP
4174 if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) {
4175
6b000af4 4176 if (!arg_settings_trusted && !strv_isempty(settings->syscall_allow_list))
5d961407 4177 log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", path);
960e4569 4178 else {
6b000af4
LP
4179 strv_free_and_replace(arg_syscall_allow_list, settings->syscall_allow_list);
4180 strv_free_and_replace(arg_syscall_deny_list, settings->syscall_deny_list);
960e4569 4181 }
de40a303
LP
4182
4183#if HAVE_SECCOMP
4184 if (!arg_settings_trusted && settings->seccomp)
4185 log_warning("Ignoring SECCOMP filter, file %s is not trusted.", path);
4186 else {
4187 seccomp_release(arg_seccomp);
4188 arg_seccomp = TAKE_PTR(settings->seccomp);
4189 }
4190#endif
960e4569
LP
4191 }
4192
bf428efb
LP
4193 for (rl = 0; rl < _RLIMIT_MAX; rl ++) {
4194 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)))
4195 continue;
4196
4197 if (!settings->rlimit[rl])
4198 continue;
4199
4200 if (!arg_settings_trusted) {
5d961407 4201 log_warning("Ignoring Limit%s= setting, file '%s' is not trusted.", rlimit_to_string(rl), path);
bf428efb
LP
4202 continue;
4203 }
4204
4205 free_and_replace(arg_rlimit[rl], settings->rlimit[rl]);
4206 }
4207
3a9530e5
LP
4208 if ((arg_settings_mask & SETTING_HOSTNAME) == 0 &&
4209 settings->hostname)
4210 free_and_replace(arg_hostname, settings->hostname);
4211
66edd963
LP
4212 if ((arg_settings_mask & SETTING_NO_NEW_PRIVILEGES) == 0 &&
4213 settings->no_new_privileges >= 0)
4214 arg_no_new_privileges = settings->no_new_privileges;
4215
81f345df
LP
4216 if ((arg_settings_mask & SETTING_OOM_SCORE_ADJUST) == 0 &&
4217 settings->oom_score_adjust_set) {
4218
4219 if (!arg_settings_trusted)
5d961407 4220 log_warning("Ignoring OOMScoreAdjust= setting, file '%s' is not trusted.", path);
81f345df
LP
4221 else {
4222 arg_oom_score_adjust = settings->oom_score_adjust;
4223 arg_oom_score_adjust_set = true;
4224 }
4225 }
4226
d107bb7d 4227 if ((arg_settings_mask & SETTING_CPU_AFFINITY) == 0 &&
0985c7c4 4228 settings->cpu_set.set) {
d107bb7d
LP
4229
4230 if (!arg_settings_trusted)
5d961407 4231 log_warning("Ignoring CPUAffinity= setting, file '%s' is not trusted.", path);
d107bb7d 4232 else {
0985c7c4
ZJS
4233 cpu_set_reset(&arg_cpu_set);
4234 arg_cpu_set = settings->cpu_set;
4235 settings->cpu_set = (CPUSet) {};
d107bb7d
LP
4236 }
4237 }
4238
09d423e9
LP
4239 if ((arg_settings_mask & SETTING_RESOLV_CONF) == 0 &&
4240 settings->resolv_conf != _RESOLV_CONF_MODE_INVALID)
4241 arg_resolv_conf = settings->resolv_conf;
4242
4e1d6aa9
LP
4243 if ((arg_settings_mask & SETTING_LINK_JOURNAL) == 0 &&
4244 settings->link_journal != _LINK_JOURNAL_INVALID) {
4245
4246 if (!arg_settings_trusted)
4247 log_warning("Ignoring journal link setting, file '%s' is not trusted.", path);
4248 else {
4249 arg_link_journal = settings->link_journal;
4250 arg_link_journal_try = settings->link_journal_try;
4251 }
4252 }
4253
1688841f
LP
4254 if ((arg_settings_mask & SETTING_TIMEZONE) == 0 &&
4255 settings->timezone != _TIMEZONE_MODE_INVALID)
4256 arg_timezone = settings->timezone;
4257
de40a303
LP
4258 if ((arg_settings_mask & SETTING_SLICE) == 0 &&
4259 settings->slice) {
4260
4261 if (!arg_settings_trusted)
4262 log_warning("Ignoring slice setting, file '%s' is not trusted.", path);
4263 else
4264 free_and_replace(arg_slice, settings->slice);
4265 }
4266
4267 if ((arg_settings_mask & SETTING_USE_CGNS) == 0 &&
4268 settings->use_cgns >= 0) {
4269
4270 if (!arg_settings_trusted)
4271 log_warning("Ignoring cgroup namespace setting, file '%s' is not trusted.", path);
4272 else
4273 arg_use_cgns = settings->use_cgns;
4274 }
4275
4276 if ((arg_settings_mask & SETTING_CLONE_NS_FLAGS) == 0 &&
4277 settings->clone_ns_flags != (unsigned long) -1) {
4278
4279 if (!arg_settings_trusted)
4280 log_warning("Ignoring namespace setting, file '%s' is not trusted.", path);
4281 else
4282 arg_clone_ns_flags = settings->clone_ns_flags;
4283 }
4284
4285 if ((arg_settings_mask & SETTING_CONSOLE_MODE) == 0 &&
4286 settings->console_mode >= 0) {
4287
4288 if (!arg_settings_trusted)
4289 log_warning("Ignoring console mode setting, file '%s' is not trusted.", path);
4290 else
4291 arg_console_mode = settings->console_mode;
4292 }
4293
4294 /* The following properties can only be set through the OCI settings logic, not from the command line, hence we
4295 * don't consult arg_settings_mask for them. */
4296
4297 sd_bus_message_unref(arg_property_message);
4298 arg_property_message = TAKE_PTR(settings->properties);
4299
4300 arg_console_width = settings->console_width;
4301 arg_console_height = settings->console_height;
4302
b2645747 4303 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
de40a303
LP
4304 arg_extra_nodes = TAKE_PTR(settings->extra_nodes);
4305 arg_n_extra_nodes = settings->n_extra_nodes;
4306
f757855e
LP
4307 return 0;
4308}
4309
5d961407
LP
4310static int load_settings(void) {
4311 _cleanup_(settings_freep) Settings *settings = NULL;
4312 _cleanup_fclose_ FILE *f = NULL;
4313 _cleanup_free_ char *p = NULL;
4314 const char *fn, *i;
4315 int r;
4316
de40a303
LP
4317 if (arg_oci_bundle)
4318 return 0;
4319
5d961407
LP
4320 /* If all settings are masked, there's no point in looking for
4321 * the settings file */
4322 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
4323 return 0;
4324
4325 fn = strjoina(arg_machine, ".nspawn");
4326
4327 /* We first look in the admin's directories in /etc and /run */
4328 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
4329 _cleanup_free_ char *j = NULL;
4330
657ee2d8 4331 j = path_join(i, fn);
5d961407
LP
4332 if (!j)
4333 return log_oom();
4334
4335 f = fopen(j, "re");
4336 if (f) {
4337 p = TAKE_PTR(j);
4338
4339 /* By default, we trust configuration from /etc and /run */
4340 if (arg_settings_trusted < 0)
4341 arg_settings_trusted = true;
4342
4343 break;
4344 }
4345
4346 if (errno != ENOENT)
4347 return log_error_errno(errno, "Failed to open %s: %m", j);
4348 }
4349
4350 if (!f) {
4351 /* After that, let's look for a file next to the
4352 * actual image we shall boot. */
4353
4354 if (arg_image) {
4355 p = file_in_same_dir(arg_image, fn);
4356 if (!p)
4357 return log_oom();
cd6e3914 4358 } else if (arg_directory && !path_equal(arg_directory, "/")) {
5d961407
LP
4359 p = file_in_same_dir(arg_directory, fn);
4360 if (!p)
4361 return log_oom();
4362 }
4363
4364 if (p) {
4365 f = fopen(p, "re");
4366 if (!f && errno != ENOENT)
4367 return log_error_errno(errno, "Failed to open %s: %m", p);
4368
4369 /* By default, we do not trust configuration from /var/lib/machines */
4370 if (arg_settings_trusted < 0)
4371 arg_settings_trusted = false;
4372 }
4373 }
4374
4375 if (!f)
4376 return 0;
4377
4378 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
4379
4380 r = settings_load(f, p, &settings);
4381 if (r < 0)
4382 return r;
4383
4384 return merge_settings(settings, p);
4385}
4386
de40a303
LP
4387static int load_oci_bundle(void) {
4388 _cleanup_(settings_freep) Settings *settings = NULL;
4389 int r;
4390
4391 if (!arg_oci_bundle)
4392 return 0;
4393
4394 /* By default let's trust OCI bundles */
4395 if (arg_settings_trusted < 0)
4396 arg_settings_trusted = true;
4397
4398 r = oci_load(NULL, arg_oci_bundle, &settings);
4399 if (r < 0)
4400 return r;
4401
4402 return merge_settings(settings, arg_oci_bundle);
4403}
4404
3acc84eb 4405static int run_container(
2d845785 4406 DissectedImage *dissected_image,
b0067625
ZJS
4407 bool secondary,
4408 FDSet *fds,
4409 char veth_name[IFNAMSIZ], bool *veth_created,
4410 union in_addr_union *exposed,
3acc84eb 4411 int *master, pid_t *pid, int *ret) {
b0067625
ZJS
4412
4413 static const struct sigaction sa = {
4414 .sa_handler = nop_signal_handler,
e28c7cd0 4415 .sa_flags = SA_NOCLDSTOP|SA_RESTART,
b0067625
ZJS
4416 };
4417
8e766630 4418 _cleanup_(release_lock_file) LockFile uid_shift_lock = LOCK_FILE_INIT;
b0067625
ZJS
4419 _cleanup_close_ int etc_passwd_lock = -1;
4420 _cleanup_close_pair_ int
4421 kmsg_socket_pair[2] = { -1, -1 },
4422 rtnl_socket_pair[2] = { -1, -1 },
4423 pid_socket_pair[2] = { -1, -1 },
4424 uuid_socket_pair[2] = { -1, -1 },
4425 notify_socket_pair[2] = { -1, -1 },
8199d554 4426 uid_shift_socket_pair[2] = { -1, -1 },
3acc84eb 4427 master_pty_socket_pair[2] = { -1, -1 },
8199d554
LP
4428 unified_cgroup_hierarchy_socket_pair[2] = { -1, -1};
4429
3acc84eb 4430 _cleanup_close_ int notify_socket = -1;
b0067625 4431 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
5773024d 4432 _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL;
b0067625
ZJS
4433 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
4434 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4435 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
abdb9b08 4436 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
b0067625 4437 ContainerStatus container_status = 0;
b0067625
ZJS
4438 int ifi = 0, r;
4439 ssize_t l;
4440 sigset_t mask_chld;
5b4855ab 4441 _cleanup_close_ int child_netns_fd = -1;
b0067625
ZJS
4442
4443 assert_se(sigemptyset(&mask_chld) == 0);
4444 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
4445
4446 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4447 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
4448 * check with getpwuid() if the specific user already exists. Note that /etc might be
4449 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
4450 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
4451 * really just an extra safety net. We kinda assume that the UID range we allocate from is
4452 * really ours. */
4453
4454 etc_passwd_lock = take_etc_passwd_lock(NULL);
4455 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS)
4456 return log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m");
4457 }
4458
4459 r = barrier_create(&barrier);
4460 if (r < 0)
4461 return log_error_errno(r, "Cannot initialize IPC barrier: %m");
4462
4463 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0)
4464 return log_error_errno(errno, "Failed to create kmsg socket pair: %m");
4465
4466 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0)
4467 return log_error_errno(errno, "Failed to create rtnl socket pair: %m");
4468
4469 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0)
4470 return log_error_errno(errno, "Failed to create pid socket pair: %m");
4471
4472 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0)
4473 return log_error_errno(errno, "Failed to create id socket pair: %m");
4474
4475 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0)
4476 return log_error_errno(errno, "Failed to create notify socket pair: %m");
4477
3acc84eb
FB
4478 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, master_pty_socket_pair) < 0)
4479 return log_error_errno(errno, "Failed to create console socket pair: %m");
4480
b0067625
ZJS
4481 if (arg_userns_mode != USER_NAMESPACE_NO)
4482 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0)
4483 return log_error_errno(errno, "Failed to create uid shift socket pair: %m");
4484
8199d554
LP
4485 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN)
4486 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, unified_cgroup_hierarchy_socket_pair) < 0)
4487 return log_error_errno(errno, "Failed to create unified cgroup socket pair: %m");
4488
b0067625
ZJS
4489 /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
4490 * parent's blocking calls and give it a chance to call wait() and terminate. */
4491 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
4492 if (r < 0)
4493 return log_error_errno(errno, "Failed to change the signal mask: %m");
4494
4495 r = sigaction(SIGCHLD, &sa, NULL);
4496 if (r < 0)
4497 return log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
4498
d7bea6b6 4499 if (arg_network_namespace_path) {
5b4855ab
DDM
4500 child_netns_fd = open(arg_network_namespace_path, O_RDONLY|O_NOCTTY|O_CLOEXEC);
4501 if (child_netns_fd < 0)
d7bea6b6
DP
4502 return log_error_errno(errno, "Cannot open file %s: %m", arg_network_namespace_path);
4503
5b4855ab 4504 r = fd_is_network_ns(child_netns_fd);
6619ad88
LP
4505 if (r == -EUCLEAN)
4506 log_debug_errno(r, "Cannot determine if passed network namespace path '%s' really refers to a network namespace, assuming it does.", arg_network_namespace_path);
4507 else if (r < 0)
d7bea6b6 4508 return log_error_errno(r, "Failed to check %s fs type: %m", arg_network_namespace_path);
c6147113
LP
4509 else if (r == 0)
4510 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
4511 "Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path);
d7bea6b6
DP
4512 }
4513
b0067625
ZJS
4514 *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
4515 if (*pid < 0)
4516 return log_error_errno(errno, "clone() failed%s: %m",
4517 errno == EINVAL ?
4518 ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
4519
4520 if (*pid == 0) {
4521 /* The outer child only has a file system namespace. */
4522 barrier_set_role(&barrier, BARRIER_CHILD);
4523
b0067625
ZJS
4524 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
4525 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4526 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
4527 uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
4528 notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
3acc84eb 4529 master_pty_socket_pair[0] = safe_close(master_pty_socket_pair[0]);
b0067625 4530 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
8199d554 4531 unified_cgroup_hierarchy_socket_pair[0] = safe_close(unified_cgroup_hierarchy_socket_pair[0]);
b0067625
ZJS
4532
4533 (void) reset_all_signal_handlers();
4534 (void) reset_signal_mask();
4535
4536 r = outer_child(&barrier,
4537 arg_directory,
2d845785 4538 dissected_image,
b0067625
ZJS
4539 secondary,
4540 pid_socket_pair[1],
4541 uuid_socket_pair[1],
4542 notify_socket_pair[1],
4543 kmsg_socket_pair[1],
4544 rtnl_socket_pair[1],
4545 uid_shift_socket_pair[1],
3acc84eb 4546 master_pty_socket_pair[1],
8199d554 4547 unified_cgroup_hierarchy_socket_pair[1],
d7bea6b6 4548 fds,
5b4855ab 4549 child_netns_fd);
b0067625
ZJS
4550 if (r < 0)
4551 _exit(EXIT_FAILURE);
4552
4553 _exit(EXIT_SUCCESS);
4554 }
4555
4556 barrier_set_role(&barrier, BARRIER_PARENT);
4557
e4077ff6 4558 fdset_close(fds);
b0067625
ZJS
4559
4560 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4561 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4562 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
4563 uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
4564 notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
3acc84eb 4565 master_pty_socket_pair[1] = safe_close(master_pty_socket_pair[1]);
b0067625 4566 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
8199d554 4567 unified_cgroup_hierarchy_socket_pair[1] = safe_close(unified_cgroup_hierarchy_socket_pair[1]);
b0067625
ZJS
4568
4569 if (arg_userns_mode != USER_NAMESPACE_NO) {
4570 /* The child just let us know the UID shift it might have read from the image. */
4571 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
4572 if (l < 0)
4573 return log_error_errno(errno, "Failed to read UID shift: %m");
c6147113
LP
4574 if (l != sizeof arg_uid_shift)
4575 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading UID shift.");
b0067625
ZJS
4576
4577 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4578 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
4579 * image, but if that's already in use, pick a new one, and report back to the child,
4580 * which one we now picked. */
4581
4582 r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
4583 if (r < 0)
4584 return log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
4585
4586 l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
4587 if (l < 0)
4588 return log_error_errno(errno, "Failed to send UID shift: %m");
c6147113
LP
4589 if (l != sizeof arg_uid_shift)
4590 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short write while writing UID shift.");
b0067625
ZJS
4591 }
4592 }
4593
8199d554
LP
4594 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
4595 /* The child let us know the support cgroup mode it might have read from the image. */
4596 l = recv(unified_cgroup_hierarchy_socket_pair[0], &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), 0);
4597 if (l < 0)
4598 return log_error_errno(errno, "Failed to read cgroup mode: %m");
c6147113
LP
4599 if (l != sizeof(arg_unified_cgroup_hierarchy))
4600 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading cgroup mode (%zu bytes).%s",
4601 l, l == 0 ? " The child is most likely dead." : "");
8199d554
LP
4602 }
4603
b0067625 4604 /* Wait for the outer child. */
d2e0ac3d
LP
4605 r = wait_for_terminate_and_check("(sd-namespace)", *pid, WAIT_LOG_ABNORMAL);
4606 if (r < 0)
4607 return r;
4608 if (r != EXIT_SUCCESS)
4609 return -EIO;
b0067625
ZJS
4610
4611 /* And now retrieve the PID of the inner child. */
4612 l = recv(pid_socket_pair[0], pid, sizeof *pid, 0);
4613 if (l < 0)
4614 return log_error_errno(errno, "Failed to read inner child PID: %m");
c6147113
LP
4615 if (l != sizeof *pid)
4616 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading inner child PID.");
b0067625
ZJS
4617
4618 /* We also retrieve container UUID in case it was generated by outer child */
4619 l = recv(uuid_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
4620 if (l < 0)
4621 return log_error_errno(errno, "Failed to read container machine ID: %m");
c6147113
LP
4622 if (l != sizeof(arg_uuid))
4623 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading container machined ID.");
b0067625
ZJS
4624
4625 /* We also retrieve the socket used for notifications generated by outer child */
4626 notify_socket = receive_one_fd(notify_socket_pair[0], 0);
4627 if (notify_socket < 0)
4628 return log_error_errno(notify_socket,
4629 "Failed to receive notification socket from the outer child: %m");
4630
4631 log_debug("Init process invoked as PID "PID_FMT, *pid);
4632
4633 if (arg_userns_mode != USER_NAMESPACE_NO) {
c6147113
LP
4634 if (!barrier_place_and_sync(&barrier)) /* #1 */
4635 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
b0067625
ZJS
4636
4637 r = setup_uid_map(*pid);
4638 if (r < 0)
4639 return r;
4640
4641 (void) barrier_place(&barrier); /* #2 */
4642 }
4643
4644 if (arg_private_network) {
75116558
PS
4645 if (!arg_network_namespace_path) {
4646 /* Wait until the child has unshared its network namespace. */
c6147113
LP
4647 if (!barrier_place_and_sync(&barrier)) /* #3 */
4648 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early");
75116558
PS
4649 }
4650
5b4855ab
DDM
4651 if (child_netns_fd < 0) {
4652 /* Make sure we have an open file descriptor to the child's network
4653 * namespace so it stays alive even if the child exits. */
4654 r = namespace_open(*pid, NULL, NULL, &child_netns_fd, NULL, NULL);
4655 if (r < 0)
4656 return log_error_errno(r, "Failed to open child network namespace: %m");
4657 }
4658
4659 r = move_network_interfaces(child_netns_fd, arg_network_interfaces);
b0067625
ZJS
4660 if (r < 0)
4661 return r;
4662
4663 if (arg_network_veth) {
4664 r = setup_veth(arg_machine, *pid, veth_name,
4665 arg_network_bridge || arg_network_zone);
4666 if (r < 0)
4667 return r;
4668 else if (r > 0)
4669 ifi = r;
4670
4671 if (arg_network_bridge) {
4672 /* Add the interface to a bridge */
4673 r = setup_bridge(veth_name, arg_network_bridge, false);
4674 if (r < 0)
4675 return r;
4676 if (r > 0)
4677 ifi = r;
4678 } else if (arg_network_zone) {
4679 /* Add the interface to a bridge, possibly creating it */
4680 r = setup_bridge(veth_name, arg_network_zone, true);
4681 if (r < 0)
4682 return r;
4683 if (r > 0)
4684 ifi = r;
4685 }
4686 }
4687
4688 r = setup_veth_extra(arg_machine, *pid, arg_network_veth_extra);
4689 if (r < 0)
4690 return r;
4691
4692 /* We created the primary and extra veth links now; let's remember this, so that we know to
4693 remove them later on. Note that we don't bother with removing veth links that were created
4694 here when their setup failed half-way, because in that case the kernel should be able to
4695 remove them on its own, since they cannot be referenced by anything yet. */
4696 *veth_created = true;
4697
4698 r = setup_macvlan(arg_machine, *pid, arg_network_macvlan);
4699 if (r < 0)
4700 return r;
4701
4702 r = setup_ipvlan(arg_machine, *pid, arg_network_ipvlan);
4703 if (r < 0)
4704 return r;
4705 }
4706
abdb9b08
LP
4707 if (arg_register || !arg_keep_unit) {
4708 r = sd_bus_default_system(&bus);
4709 if (r < 0)
4710 return log_error_errno(r, "Failed to open system bus: %m");
e5a2d8b5
LP
4711
4712 r = sd_bus_set_close_on_exit(bus, false);
4713 if (r < 0)
4714 return log_error_errno(r, "Failed to disable close-on-exit behaviour: %m");
abdb9b08
LP
4715 }
4716
4717 if (!arg_keep_unit) {
4718 /* When a new scope is created for this container, then we'll be registered as its controller, in which
4719 * case PID 1 will send us a friendly RequestStop signal, when it is asked to terminate the
4720 * scope. Let's hook into that, and cleanly shut down the container, and print a friendly message. */
4721
75152a4d
LP
4722 r = sd_bus_match_signal_async(
4723 bus,
4724 NULL,
4725 "org.freedesktop.systemd1",
4726 NULL,
4727 "org.freedesktop.systemd1.Scope",
4728 "RequestStop",
4729 on_request_stop, NULL, PID_TO_PTR(*pid));
abdb9b08 4730 if (r < 0)
75152a4d 4731 return log_error_errno(r, "Failed to request RequestStop match: %m");
abdb9b08
LP
4732 }
4733
b0067625
ZJS
4734 if (arg_register) {
4735 r = register_machine(
abdb9b08 4736 bus,
b0067625
ZJS
4737 arg_machine,
4738 *pid,
4739 arg_directory,
4740 arg_uuid,
4741 ifi,
4742 arg_slice,
4743 arg_custom_mounts, arg_n_custom_mounts,
4744 arg_kill_signal,
4745 arg_property,
de40a303 4746 arg_property_message,
b0067625
ZJS
4747 arg_keep_unit,
4748 arg_container_service_name);
4749 if (r < 0)
4750 return r;
abdb9b08 4751
cd2dfc6f
LP
4752 } else if (!arg_keep_unit) {
4753 r = allocate_scope(
abdb9b08 4754 bus,
cd2dfc6f
LP
4755 arg_machine,
4756 *pid,
4757 arg_slice,
4758 arg_custom_mounts, arg_n_custom_mounts,
4759 arg_kill_signal,
de40a303
LP
4760 arg_property,
4761 arg_property_message);
cd2dfc6f
LP
4762 if (r < 0)
4763 return r;
4764
4765 } else if (arg_slice || arg_property)
4766 log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect.");
b0067625 4767
27da7ef0 4768 r = create_subcgroup(*pid, arg_keep_unit, arg_unified_cgroup_hierarchy);
b0067625
ZJS
4769 if (r < 0)
4770 return r;
4771
27da7ef0 4772 r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
720f0a2f
LP
4773 if (r < 0)
4774 return r;
b0067625 4775
de54e02d 4776 r = chown_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
b0067625
ZJS
4777 if (r < 0)
4778 return r;
4779
4780 /* Notify the child that the parent is ready with all
4781 * its setup (including cgroup-ification), and that
4782 * the child can now hand over control to the code to
4783 * run inside the container. */
75116558 4784 (void) barrier_place(&barrier); /* #4 */
b0067625
ZJS
4785
4786 /* Block SIGCHLD here, before notifying child.
4787 * process_pty() will handle it with the other signals. */
4788 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
4789
4790 /* Reset signal to default */
4791 r = default_signals(SIGCHLD, -1);
4792 if (r < 0)
4793 return log_error_errno(r, "Failed to reset SIGCHLD: %m");
4794
4795 r = sd_event_new(&event);
4796 if (r < 0)
4797 return log_error_errno(r, "Failed to get default event source: %m");
4798
8fd010bb
LP
4799 (void) sd_event_set_watchdog(event, true);
4800
abdb9b08
LP
4801 if (bus) {
4802 r = sd_bus_attach_event(bus, event, 0);
4803 if (r < 0)
4804 return log_error_errno(r, "Failed to attach bus to event loop: %m");
4805 }
4806
e96ceaba 4807 r = setup_notify_parent(event, notify_socket, PID_TO_PTR(*pid), &notify_event_source);
b0067625
ZJS
4808 if (r < 0)
4809 return r;
4810
4811 /* Let the child know that we are ready and wait that the child is completely ready now. */
c6147113
LP
4812 if (!barrier_place_and_sync(&barrier)) /* #5 */
4813 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
b0067625 4814
38ccb557 4815 /* At this point we have made use of the UID we picked, and thus nss-systemd/systemd-machined.service
b0067625
ZJS
4816 * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
4817 etc_passwd_lock = safe_close(etc_passwd_lock);
4818
04f590a4
LP
4819 (void) sd_notifyf(false,
4820 "STATUS=Container running.\n"
4821 "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
b0067625 4822 if (!arg_notify_ready)
919f5ae0 4823 (void) sd_notify(false, "READY=1\n");
b0067625
ZJS
4824
4825 if (arg_kill_signal > 0) {
4826 /* Try to kill the init system on SIGINT or SIGTERM */
919f5ae0
LP
4827 (void) sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
4828 (void) sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
b0067625
ZJS
4829 } else {
4830 /* Immediately exit */
919f5ae0
LP
4831 (void) sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4832 (void) sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
b0067625
ZJS
4833 }
4834
6916b164 4835 /* Exit when the child exits */
919f5ae0 4836 (void) sd_event_add_signal(event, NULL, SIGCHLD, on_sigchld, PID_TO_PTR(*pid));
b0067625
ZJS
4837
4838 if (arg_expose_ports) {
4839 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, exposed, &rtnl);
4840 if (r < 0)
4841 return r;
4842
4843 (void) expose_port_execute(rtnl, arg_expose_ports, exposed);
4844 }
4845
4846 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4847
3acc84eb
FB
4848 if (arg_console_mode != CONSOLE_PIPE) {
4849 _cleanup_close_ int fd = -1;
4850 PTYForwardFlags flags = 0;
de40a303 4851
3acc84eb
FB
4852 /* Retrieve the master pty allocated by inner child */
4853 fd = receive_one_fd(master_pty_socket_pair[0], 0);
4854 if (fd < 0)
4855 return log_error_errno(fd, "Failed to receive master pty from the inner child: %m");
4856
4857 switch (arg_console_mode) {
de40a303 4858
3acc84eb
FB
4859 case CONSOLE_READ_ONLY:
4860 flags |= PTY_FORWARD_READ_ONLY;
4861
4862 _fallthrough_;
4863
4864 case CONSOLE_INTERACTIVE:
4865 flags |= PTY_FORWARD_IGNORE_VHANGUP;
4866
4867 r = pty_forward_new(event, fd, flags, &forward);
4868 if (r < 0)
4869 return log_error_errno(r, "Failed to create PTY forwarder: %m");
4870
4871 if (arg_console_width != (unsigned) -1 || arg_console_height != (unsigned) -1)
4872 (void) pty_forward_set_width_height(forward,
4873 arg_console_width,
4874 arg_console_height);
4875 break;
4876
4877 default:
4878 assert(arg_console_mode == CONSOLE_PASSIVE);
4879 }
4880
4881 *master = TAKE_FD(fd);
de40a303 4882 }
b0067625
ZJS
4883
4884 r = sd_event_loop(event);
4885 if (r < 0)
4886 return log_error_errno(r, "Failed to run event loop: %m");
4887
de40a303
LP
4888 if (forward) {
4889 char last_char = 0;
b0067625 4890
de40a303
LP
4891 (void) pty_forward_get_last_char(forward, &last_char);
4892 forward = pty_forward_free(forward);
b0067625 4893
de40a303
LP
4894 if (!arg_quiet && last_char != '\n')
4895 putc('\n', stdout);
4896 }
b0067625
ZJS
4897
4898 /* Kill if it is not dead yet anyway */
0bb0a9fa
ZJS
4899 if (!arg_register && !arg_keep_unit && bus)
4900 terminate_scope(bus, arg_machine);
b0067625
ZJS
4901
4902 /* Normally redundant, but better safe than sorry */
c67b0082 4903 (void) kill(*pid, SIGKILL);
b0067625 4904
5b4855ab
DDM
4905 if (arg_private_network) {
4906 /* Move network interfaces back to the parent network namespace. We use `safe_fork`
4907 * to avoid having to move the parent to the child network namespace. */
4908 r = safe_fork(NULL, FORK_RESET_SIGNALS|FORK_DEATHSIG|FORK_WAIT|FORK_LOG, NULL);
4909 if (r < 0)
4910 return r;
4911
4912 if (r == 0) {
4913 _cleanup_close_ int parent_netns_fd = -1;
4914
4915 r = namespace_open(getpid(), NULL, NULL, &parent_netns_fd, NULL, NULL);
4916 if (r < 0) {
4917 log_error_errno(r, "Failed to open parent network namespace: %m");
4918 _exit(EXIT_FAILURE);
4919 }
4920
4921 r = namespace_enter(-1, -1, child_netns_fd, -1, -1);
4922 if (r < 0) {
4923 log_error_errno(r, "Failed to enter child network namespace: %m");
4924 _exit(EXIT_FAILURE);
4925 }
4926
4927 r = move_network_interfaces(parent_netns_fd, arg_network_interfaces);
4928 if (r < 0)
4929 log_error_errno(r, "Failed to move network interfaces back to parent network namespace: %m");
4930
4931 _exit(r < 0 ? EXIT_FAILURE : EXIT_SUCCESS);
4932 }
4933 }
4934
b0067625
ZJS
4935 r = wait_for_container(*pid, &container_status);
4936 *pid = 0;
4937
0bb0a9fa
ZJS
4938 /* Tell machined that we are gone. */
4939 if (bus)
4940 (void) unregister_machine(bus, arg_machine);
4941
b0067625
ZJS
4942 if (r < 0)
4943 /* We failed to wait for the container, or the container exited abnormally. */
4944 return r;
4945 if (r > 0 || container_status == CONTAINER_TERMINATED) {
27e29a1e
ZJS
4946 /* r > 0 → The container exited with a non-zero status.
4947 * As a special case, we need to replace 133 with a different value,
4948 * because 133 is special-cased in the service file to reboot the container.
4949 * otherwise → The container exited with zero status and a reboot was not requested.
4950 */
2a49b612 4951 if (r == EXIT_FORCE_RESTART)
27e29a1e 4952 r = EXIT_FAILURE; /* replace 133 with the general failure code */
b0067625 4953 *ret = r;
b0067625
ZJS
4954 return 0; /* finito */
4955 }
4956
4957 /* CONTAINER_REBOOTED, loop again */
4958
4959 if (arg_keep_unit) {
4960 /* Special handling if we are running as a service: instead of simply
4961 * restarting the machine we want to restart the entire service, so let's
4962 * inform systemd about this with the special exit code 133. The service
4963 * file uses RestartForceExitStatus=133 so that this results in a full
4964 * nspawn restart. This is necessary since we might have cgroup parameters
4965 * set we want to have flushed out. */
2a49b612
ZJS
4966 *ret = EXIT_FORCE_RESTART;
4967 return 0; /* finito */
b0067625
ZJS
4968 }
4969
4970 expose_port_flush(arg_expose_ports, exposed);
4971
4972 (void) remove_veth_links(veth_name, arg_network_veth_extra);
4973 *veth_created = false;
4974 return 1; /* loop again */
4975}
4976
bf428efb 4977static int initialize_rlimits(void) {
bf428efb
LP
4978 /* The default resource limits the kernel passes to PID 1, as per kernel 4.16. Let's pass our container payload
4979 * the same values as the kernel originally passed to PID 1, in order to minimize differences between host and
4980 * container execution environments. */
4981
4982 static const struct rlimit kernel_defaults[_RLIMIT_MAX] = {
4983 [RLIMIT_AS] = { RLIM_INFINITY, RLIM_INFINITY },
4984 [RLIMIT_CORE] = { 0, RLIM_INFINITY },
4985 [RLIMIT_CPU] = { RLIM_INFINITY, RLIM_INFINITY },
4986 [RLIMIT_DATA] = { RLIM_INFINITY, RLIM_INFINITY },
4987 [RLIMIT_FSIZE] = { RLIM_INFINITY, RLIM_INFINITY },
4988 [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY },
4989 [RLIMIT_MEMLOCK] = { 65536, 65536 },
4990 [RLIMIT_MSGQUEUE] = { 819200, 819200 },
4991 [RLIMIT_NICE] = { 0, 0 },
4992 [RLIMIT_NOFILE] = { 1024, 4096 },
4993 [RLIMIT_RSS] = { RLIM_INFINITY, RLIM_INFINITY },
4994 [RLIMIT_RTPRIO] = { 0, 0 },
4995 [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY },
4996 [RLIMIT_STACK] = { 8388608, RLIM_INFINITY },
4997
4998 /* The kernel scales the default for RLIMIT_NPROC and RLIMIT_SIGPENDING based on the system's amount of
4999 * RAM. To provide best compatibility we'll read these limits off PID 1 instead of hardcoding them
5000 * here. This is safe as we know that PID 1 doesn't change these two limits and thus the original
5001 * kernel's initialization should still be valid during runtime — at least if PID 1 is systemd. Note
5002 * that PID 1 changes a number of other resource limits during early initialization which is why we
5003 * don't read the other limits from PID 1 but prefer the static table above. */
5004 };
5005
5006 int rl;
5007
5008 for (rl = 0; rl < _RLIMIT_MAX; rl++) {
bf428efb
LP
5009 /* Let's only fill in what the user hasn't explicitly configured anyway */
5010 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)) == 0) {
5011 const struct rlimit *v;
5012 struct rlimit buffer;
5013
5014 if (IN_SET(rl, RLIMIT_NPROC, RLIMIT_SIGPENDING)) {
5015 /* For these two let's read the limits off PID 1. See above for an explanation. */
5016
5017 if (prlimit(1, rl, NULL, &buffer) < 0)
5018 return log_error_errno(errno, "Failed to read resource limit RLIMIT_%s of PID 1: %m", rlimit_to_string(rl));
5019
5020 v = &buffer;
5021 } else
5022 v = kernel_defaults + rl;
5023
5024 arg_rlimit[rl] = newdup(struct rlimit, v, 1);
5025 if (!arg_rlimit[rl])
5026 return log_oom();
5027 }
5028
5029 if (DEBUG_LOGGING) {
5030 _cleanup_free_ char *k = NULL;
5031
5032 (void) rlimit_format(arg_rlimit[rl], &k);
5033 log_debug("Setting RLIMIT_%s to %s.", rlimit_to_string(rl), k);
5034 }
5035 }
5036
5037 return 0;
5038}
5039
287b7376
LP
5040static int cant_be_in_netns(void) {
5041 union sockaddr_union sa = {
5042 .un = {
5043 .sun_family = AF_UNIX,
5044 .sun_path = "/run/udev/control",
5045 },
5046 };
5047 char udev_path[STRLEN("/proc//ns/net") + DECIMAL_STR_MAX(pid_t)];
5048 _cleanup_free_ char *udev_ns = NULL, *our_ns = NULL;
5049 _cleanup_close_ int fd = -1;
5050 struct ucred ucred;
5051 int r;
5052
5053 /* Check if we are in the same netns as udev. If we aren't, then device monitoring (and thus waiting
5054 * for loopback block devices) won't work, and we will hang. Detect this case and exit early with a
5055 * nice message. */
5056
5057 if (!arg_image) /* only matters if --image= us used, i.e. we actually need to use loopback devices */
5058 return 0;
5059
5060 fd = socket(AF_UNIX, SOCK_SEQPACKET|SOCK_NONBLOCK|SOCK_CLOEXEC, 0);
5061 if (fd < 0)
5062 return log_error_errno(errno, "Failed to allocate udev control socket: %m");
5063
5064 if (connect(fd, &sa.un, SOCKADDR_UN_LEN(sa.un)) < 0) {
5065
5066 if (errno == ENOENT || ERRNO_IS_DISCONNECT(errno))
5067 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
5068 "Sorry, but --image= requires access to the host's /run/ hierarchy, since we need access to udev.");
5069
5070 return log_error_errno(errno, "Failed to connect socket to udev control socket: %m");
5071 }
5072
5073 r = getpeercred(fd, &ucred);
5074 if (r < 0)
5075 return log_error_errno(r, "Failed to determine peer of udev control socket: %m");
5076
5077 xsprintf(udev_path, "/proc/" PID_FMT "/ns/net", ucred.pid);
5078 r = readlink_malloc(udev_path, &udev_ns);
5079 if (r < 0)
5080 return log_error_errno(r, "Failed to read network namespace of udev: %m");
5081
5082 r = readlink_malloc("/proc/self/ns/net", &our_ns);
5083 if (r < 0)
5084 return log_error_errno(r, "Failed to read our own network namespace: %m");
5085
5086 if (!streq(our_ns, udev_ns))
5087 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
5088 "Sorry, but --image= is only supported in the main network namespace, since we need access to udev/AF_NETLINK.");
5089 return 0;
5090}
5091
44dbef90 5092static int run(int argc, char *argv[]) {
7bf011e3
LP
5093 bool secondary = false, remove_directory = false, remove_image = false,
5094 veth_created = false, remove_tmprootdir = false;
2d845785 5095 _cleanup_close_ int master = -1;
03cfe0d5 5096 _cleanup_fdset_free_ FDSet *fds = NULL;
2d845785 5097 int r, n_fd_passed, ret = EXIT_SUCCESS;
5aa3eba5 5098 char veth_name[IFNAMSIZ] = "";
03cfe0d5 5099 union in_addr_union exposed = {};
8e766630 5100 _cleanup_(release_lock_file) LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
c67b0082 5101 char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
2d845785 5102 _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
18b5886e
LP
5103 _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
5104 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
7bf011e3 5105 pid_t pid = 0;
03cfe0d5
LP
5106
5107 log_parse_environment();
5108 log_open();
415fc41c 5109
03cfe0d5
LP
5110 r = parse_argv(argc, argv);
5111 if (r <= 0)
5112 goto finish;
5113
fba868fa
LP
5114 r = must_be_root();
5115 if (r < 0)
03cfe0d5 5116 goto finish;
fba868fa 5117
287b7376
LP
5118 r = cant_be_in_netns();
5119 if (r < 0)
5120 goto finish;
5121
bf428efb
LP
5122 r = initialize_rlimits();
5123 if (r < 0)
5124 goto finish;
5125
de40a303
LP
5126 r = load_oci_bundle();
5127 if (r < 0)
5128 goto finish;
5129
f757855e
LP
5130 r = determine_names();
5131 if (r < 0)
5132 goto finish;
5133
5134 r = load_settings();
5135 if (r < 0)
5136 goto finish;
5137
d4d99bc6 5138 r = cg_unified();
5eee8290
LP
5139 if (r < 0) {
5140 log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
5141 goto finish;
5142 }
5143
f757855e
LP
5144 r = verify_arguments();
5145 if (r < 0)
5146 goto finish;
03cfe0d5 5147
49048684
ZJS
5148 /* Reapply environment settings. */
5149 (void) detect_unified_cgroup_hierarchy_from_environment();
8199d554 5150
2949ff26
LP
5151 /* Ignore SIGPIPE here, because we use splice() on the ptyfwd stuff and that will generate SIGPIPE if
5152 * the result is closed. Note that the container payload child will reset signal mask+handler anyway,
5153 * so just turning this off here means we only turn it off in nspawn itself, not any children. */
5154 (void) ignore_signals(SIGPIPE, -1);
5155
03cfe0d5
LP
5156 n_fd_passed = sd_listen_fds(false);
5157 if (n_fd_passed > 0) {
5158 r = fdset_new_listen_fds(&fds, false);
5159 if (r < 0) {
5160 log_error_errno(r, "Failed to collect file descriptors: %m");
5161 goto finish;
5162 }
5163 }
5164
83e803a9
ZJS
5165 /* The "default" umask. This is appropriate for most file and directory
5166 * operations performed by nspawn, and is the umask that will be used for
5167 * the child. Functions like copy_devnodes() change the umask temporarily. */
5168 umask(0022);
5169
03cfe0d5
LP
5170 if (arg_directory) {
5171 assert(!arg_image);
5172
b35ca61a
LP
5173 /* Safety precaution: let's not allow running images from the live host OS image, as long as
5174 * /var from the host will propagate into container dynamically (because bad things happen if
5175 * two systems write to the same /var). Let's allow it for the special cases where /var is
5176 * either copied (i.e. --ephemeral) or replaced (i.e. --volatile=yes|state). */
5177 if (path_equal(arg_directory, "/") && !(arg_ephemeral || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_STATE))) {
5178 log_error("Spawning container on root directory is not supported. Consider using --ephemeral, --volatile=yes or --volatile=state.");
03cfe0d5
LP
5179 r = -EINVAL;
5180 goto finish;
5181 }
5182
5183 if (arg_ephemeral) {
5184 _cleanup_free_ char *np = NULL;
5185
8d4aa2bb 5186 r = chase_symlinks_and_update(&arg_directory, 0);
3f342ec4
LP
5187 if (r < 0)
5188 goto finish;
5189
7bf011e3
LP
5190 /* If the specified path is a mount point we generate the new snapshot immediately
5191 * inside it under a random name. However if the specified is not a mount point we
5192 * create the new snapshot in the parent directory, just next to it. */
e1873695 5193 r = path_is_mount_point(arg_directory, NULL, 0);
03cfe0d5
LP
5194 if (r < 0) {
5195 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
5196 goto finish;
5197 }
5198 if (r > 0)
770b5ce4 5199 r = tempfn_random_child(arg_directory, "machine.", &np);
03cfe0d5 5200 else
770b5ce4 5201 r = tempfn_random(arg_directory, "machine.", &np);
03cfe0d5 5202 if (r < 0) {
0f3be6ca 5203 log_error_errno(r, "Failed to generate name for directory snapshot: %m");
03cfe0d5
LP
5204 goto finish;
5205 }
5206
6992459c 5207 /* We take an exclusive lock on this image, since it's our private, ephemeral copy
162392b7 5208 * only owned by us and no one else. */
6992459c 5209 r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
03cfe0d5
LP
5210 if (r < 0) {
5211 log_error_errno(r, "Failed to lock %s: %m", np);
5212 goto finish;
5213 }
5214
7bf011e3
LP
5215 {
5216 BLOCK_SIGNALS(SIGINT);
5217 r = btrfs_subvol_snapshot(arg_directory, np,
5218 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
5219 BTRFS_SNAPSHOT_FALLBACK_COPY |
5220 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
5221 BTRFS_SNAPSHOT_RECURSIVE |
5222 BTRFS_SNAPSHOT_QUOTA |
5223 BTRFS_SNAPSHOT_SIGINT);
5224 }
5225 if (r == -EINTR) {
5226 log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", np);
5227 goto finish;
5228 }
03cfe0d5
LP
5229 if (r < 0) {
5230 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
5231 goto finish;
ec16945e
LP
5232 }
5233
1cc6c93a 5234 free_and_replace(arg_directory, np);
17cbb288 5235 remove_directory = true;
30535c16 5236 } else {
cb638b5e 5237 r = chase_symlinks_and_update(&arg_directory, arg_template ? CHASE_NONEXISTENT : 0);
8d4aa2bb
LP
5238 if (r < 0)
5239 goto finish;
5240
30535c16
LP
5241 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
5242 if (r == -EBUSY) {
5243 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
5244 goto finish;
5245 }
5246 if (r < 0) {
5247 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
476b8254 5248 goto finish;
30535c16
LP
5249 }
5250
5251 if (arg_template) {
8d4aa2bb 5252 r = chase_symlinks_and_update(&arg_template, 0);
3f342ec4
LP
5253 if (r < 0)
5254 goto finish;
5255
7bf011e3
LP
5256 {
5257 BLOCK_SIGNALS(SIGINT);
5258 r = btrfs_subvol_snapshot(arg_template, arg_directory,
5259 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
5260 BTRFS_SNAPSHOT_FALLBACK_COPY |
5261 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
5262 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
5263 BTRFS_SNAPSHOT_RECURSIVE |
5264 BTRFS_SNAPSHOT_QUOTA |
5265 BTRFS_SNAPSHOT_SIGINT);
5266 }
ff6c6cc1
LP
5267 if (r == -EEXIST)
5268 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
5269 "Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
7bf011e3
LP
5270 else if (r == -EINTR) {
5271 log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", arg_directory);
5272 goto finish;
5273 } else if (r < 0) {
83521414 5274 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
30535c16 5275 goto finish;
ff6c6cc1
LP
5276 } else
5277 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
5278 "Populated %s from template %s.", arg_directory, arg_template);
30535c16 5279 }
ec16945e
LP
5280 }
5281
7732f92b 5282 if (arg_start_mode == START_BOOT) {
a5201ed6 5283 const char *p;
c9fe05e0 5284
a5201ed6
LP
5285 if (arg_pivot_root_new)
5286 p = prefix_roota(arg_directory, arg_pivot_root_new);
5287 else
5288 p = arg_directory;
c9fe05e0
AR
5289
5290 if (path_is_os_tree(p) <= 0) {
5291 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", p);
ec16945e 5292 r = -EINVAL;
1b9e5b12
LP
5293 goto finish;
5294 }
5295 } else {
c9fe05e0
AR
5296 const char *p, *q;
5297
a5201ed6
LP
5298 if (arg_pivot_root_new)
5299 p = prefix_roota(arg_directory, arg_pivot_root_new);
5300 else
5301 p = arg_directory;
c9fe05e0
AR
5302
5303 q = strjoina(p, "/usr/");
1b9e5b12 5304
c9fe05e0
AR
5305 if (laccess(q, F_OK) < 0) {
5306 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", p);
ec16945e 5307 r = -EINVAL;
1b9e5b12 5308 goto finish;
1b9e5b12
LP
5309 }
5310 }
ec16945e 5311
6b9132a9 5312 } else {
e7cbe5cb 5313 DissectImageFlags dissect_image_flags = DISSECT_IMAGE_REQUIRE_ROOT | DISSECT_IMAGE_RELAX_VAR_CHECK;
ec16945e
LP
5314 assert(arg_image);
5315 assert(!arg_template);
5316
8d4aa2bb 5317 r = chase_symlinks_and_update(&arg_image, 0);
3f342ec4
LP
5318 if (r < 0)
5319 goto finish;
5320
0f3be6ca
LP
5321 if (arg_ephemeral) {
5322 _cleanup_free_ char *np = NULL;
5323
5324 r = tempfn_random(arg_image, "machine.", &np);
5325 if (r < 0) {
5326 log_error_errno(r, "Failed to generate name for image snapshot: %m");
5327 goto finish;
5328 }
5329
6992459c
LP
5330 /* Always take an exclusive lock on our own ephemeral copy. */
5331 r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
0f3be6ca
LP
5332 if (r < 0) {
5333 r = log_error_errno(r, "Failed to create image lock: %m");
5334 goto finish;
5335 }
5336
7bf011e3
LP
5337 {
5338 BLOCK_SIGNALS(SIGINT);
5339 r = copy_file(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600, FS_NOCOW_FL, FS_NOCOW_FL, COPY_REFLINK|COPY_CRTIME|COPY_SIGINT);
5340 }
5341 if (r == -EINTR) {
5342 log_error_errno(r, "Interrupted while copying image file to %s, removed again.", np);
5343 goto finish;
5344 }
0f3be6ca
LP
5345 if (r < 0) {
5346 r = log_error_errno(r, "Failed to copy image file: %m");
5347 goto finish;
5348 }
5349
1cc6c93a 5350 free_and_replace(arg_image, np);
0f3be6ca
LP
5351 remove_image = true;
5352 } else {
5353 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
5354 if (r == -EBUSY) {
5355 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
5356 goto finish;
5357 }
5358 if (r < 0) {
5359 r = log_error_errno(r, "Failed to create image lock: %m");
5360 goto finish;
5361 }
4623e8e6 5362
89e62e0b
LP
5363 r = verity_settings_load(
5364 &arg_verity_settings,
5365 arg_image, NULL, NULL);
e7cbe5cb
LB
5366 if (r < 0) {
5367 log_error_errno(r, "Failed to read verity artefacts for %s: %m", arg_image);
5368 goto finish;
78ebe980 5369 }
89e62e0b
LP
5370
5371 if (arg_verity_settings.data_path)
5372 dissect_image_flags |= DISSECT_IMAGE_NO_PARTITION_TABLE;
30535c16
LP
5373 }
5374
c67b0082 5375 if (!mkdtemp(tmprootdir)) {
0f3be6ca 5376 r = log_error_errno(errno, "Failed to create temporary directory: %m");
6b9132a9 5377 goto finish;
1b9e5b12 5378 }
6b9132a9 5379
c67b0082
LP
5380 remove_tmprootdir = true;
5381
5382 arg_directory = strdup(tmprootdir);
1b9e5b12
LP
5383 if (!arg_directory) {
5384 r = log_oom();
5385 goto finish;
6b9132a9 5386 }
88213476 5387
89e62e0b
LP
5388 r = loop_device_make_by_path(
5389 arg_image,
5390 arg_read_only ? O_RDONLY : O_RDWR,
5391 FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN,
5392 &loop);
2d845785
LP
5393 if (r < 0) {
5394 log_error_errno(r, "Failed to set up loopback block device: %m");
842f3b0f
LP
5395 goto finish;
5396 }
1b9e5b12 5397
4526113f 5398 r = dissect_image_and_warn(
e0f9e7bd 5399 loop->fd,
4526113f 5400 arg_image,
89e62e0b 5401 &arg_verity_settings,
18d73705 5402 NULL,
e7cbe5cb 5403 dissect_image_flags,
e0f9e7bd 5404 &dissected_image);
2d845785 5405 if (r == -ENOPKG) {
4526113f 5406 /* dissected_image_and_warn() already printed a brief error message. Extend on that with more details */
2d845785
LP
5407 log_notice("Note that the disk image needs to\n"
5408 " a) either contain only a single MBR partition of type 0x83 that is marked bootable\n"
5409 " b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n"
19ac32cd 5410 " c) or follow https://systemd.io/DISCOVERABLE_PARTITIONS\n"
2d845785
LP
5411 " d) or contain a file system without a partition table\n"
5412 "in order to be bootable with systemd-nspawn.");
1b9e5b12 5413 goto finish;
2d845785 5414 }
4526113f 5415 if (r < 0)
842f3b0f 5416 goto finish;
1b9e5b12 5417
89e62e0b 5418 if (!arg_verity_settings.root_hash && dissected_image->can_verity)
4623e8e6
LP
5419 log_notice("Note: image %s contains verity information, but no root hash specified! Proceeding without integrity checking.", arg_image);
5420
89e62e0b
LP
5421 r = dissected_image_decrypt_interactively(
5422 dissected_image,
5423 NULL,
5424 &arg_verity_settings,
5425 0,
5426 &decrypted_image);
1b9e5b12
LP
5427 if (r < 0)
5428 goto finish;
0f3be6ca
LP
5429
5430 /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */
5431 if (remove_image && unlink(arg_image) >= 0)
5432 remove_image = false;
842f3b0f 5433 }
842f3b0f 5434
86c0dd4a 5435 r = custom_mount_prepare_all(arg_directory, arg_custom_mounts, arg_n_custom_mounts);
5a8af538
LP
5436 if (r < 0)
5437 goto finish;
5438
de40a303
LP
5439 if (arg_console_mode < 0)
5440 arg_console_mode =
5441 isatty(STDIN_FILENO) > 0 &&
5442 isatty(STDOUT_FILENO) > 0 ? CONSOLE_INTERACTIVE : CONSOLE_READ_ONLY;
9c857b9d 5443
de40a303
LP
5444 if (arg_console_mode == CONSOLE_PIPE) /* if we pass STDERR on to the container, don't add our own logs into it too */
5445 arg_quiet = true;
a258bf26 5446
9c857b9d
LP
5447 if (!arg_quiet)
5448 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
5449 arg_machine, arg_image ?: arg_directory);
5450
72c0a2c2 5451 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
a258bf26 5452
66edd963 5453 if (prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) < 0) {
03cfe0d5
LP
5454 r = log_error_errno(errno, "Failed to become subreaper: %m");
5455 goto finish;
5456 }
5457
d87be9b0 5458 for (;;) {
3acc84eb 5459 r = run_container(dissected_image,
44dbef90
LP
5460 secondary,
5461 fds,
5462 veth_name, &veth_created,
3acc84eb 5463 &exposed, &master,
44dbef90 5464 &pid, &ret);
b0067625 5465 if (r <= 0)
d87be9b0 5466 break;
d87be9b0 5467 }
88213476
LP
5468
5469finish:
04f590a4
LP
5470 (void) sd_notify(false,
5471 r == 0 && ret == EXIT_FORCE_RESTART ? "STOPPING=1\nSTATUS=Restarting..." :
5472 "STOPPING=1\nSTATUS=Terminating...");
af4ec430 5473
9444b1f2 5474 if (pid > 0)
c67b0082 5475 (void) kill(pid, SIGKILL);
88213476 5476
503546da 5477 /* Try to flush whatever is still queued in the pty */
6a0f896b 5478 if (master >= 0) {
1c876927 5479 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, 0);
6a0f896b
LP
5480 master = safe_close(master);
5481 }
5482
5483 if (pid > 0)
5484 (void) wait_for_terminate(pid, NULL);
503546da 5485
50ebcf6c
LP
5486 pager_close();
5487
17cbb288 5488 if (remove_directory && arg_directory) {
ec16945e
LP
5489 int k;
5490
17cbb288 5491 k = rm_rf(arg_directory, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
ec16945e 5492 if (k < 0)
17cbb288 5493 log_warning_errno(k, "Cannot remove '%s', ignoring: %m", arg_directory);
ec16945e
LP
5494 }
5495
0f3be6ca
LP
5496 if (remove_image && arg_image) {
5497 if (unlink(arg_image) < 0)
5498 log_warning_errno(errno, "Can't remove image file '%s', ignoring: %m", arg_image);
5499 }
5500
c67b0082
LP
5501 if (remove_tmprootdir) {
5502 if (rmdir(tmprootdir) < 0)
5503 log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir);
5504 }
5505
785890ac
LP
5506 if (arg_machine) {
5507 const char *p;
5508
63c372cb 5509 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
c6878637 5510 (void) rm_rf(p, REMOVE_ROOT);
785890ac
LP
5511 }
5512
7a8f6325 5513 expose_port_flush(arg_expose_ports, &exposed);
7513c5b8
LP
5514
5515 if (veth_created)
5516 (void) remove_veth_links(veth_name, arg_network_veth_extra);
22b28dfd 5517 (void) remove_bridge(arg_network_zone);
f757855e 5518
f757855e
LP
5519 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
5520 expose_port_free_all(arg_expose_ports);
bf428efb 5521 rlimit_free_all(arg_rlimit);
b2645747 5522 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
3652872a 5523 credential_free_all(arg_credentials, arg_n_credentials);
6d0b55c2 5524
44dbef90
LP
5525 if (r < 0)
5526 return r;
5527
5528 return ret;
88213476 5529}
44dbef90
LP
5530
/* Hooks run() up as the program's entry point. NOTE(review): the macro presumably comes from
 * main-func.h and, judging by its name, treats positive return values of run() as failure exit
 * statuses; confirm against its definition. */
DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run);