]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
nspawn: fix fd leak on failure path
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
53e1b683 1/* SPDX-License-Identifier: LGPL-2.1+ */
88213476 2
349cc4a5 3#if HAVE_BLKID
8fe0087e 4#endif
88213476 5#include <errno.h>
88213476 6#include <getopt.h>
503f480f 7#include <linux/fs.h>
1b9e5b12 8#include <linux/loop.h>
349cc4a5 9#if HAVE_SELINUX
8fe0087e 10#include <selinux/selinux.h>
1b9e5b12 11#endif
8fe0087e 12#include <stdlib.h>
8fe0087e 13#include <sys/file.h>
8fe0087e
LP
14#include <sys/personality.h>
15#include <sys/prctl.h>
16#include <sys/types.h>
6916b164 17#include <sys/wait.h>
8fe0087e 18#include <unistd.h>
1b9e5b12 19
b053cd5f 20#include "sd-bus.h"
1f0cd86b 21#include "sd-daemon.h"
1f0cd86b 22#include "sd-id128.h"
8fe0087e 23
b5efdb8a 24#include "alloc-util.h"
8fe0087e
LP
25#include "barrier.h"
26#include "base-filesystem.h"
27#include "blkid-util.h"
28#include "btrfs-util.h"
b8ea7a6e 29#include "bus-error.h"
b053cd5f 30#include "bus-util.h"
8fe0087e 31#include "cap-list.h"
430f0182 32#include "capability-util.h"
04d391da 33#include "cgroup-util.h"
8fe0087e 34#include "copy.h"
d107bb7d 35#include "cpu-set-util.h"
4fc9982c 36#include "dev-setup.h"
2d845785 37#include "dissect-image.h"
8fe0087e 38#include "env-util.h"
3652872a 39#include "escape.h"
3ffd4af2 40#include "fd-util.h"
842f3b0f 41#include "fdset.h"
a5c32cff 42#include "fileio.h"
f97b34a6 43#include "format-util.h"
f4f15635 44#include "fs-util.h"
1b9e5b12 45#include "gpt.h"
4623e8e6 46#include "hexdecoct.h"
8fe0087e 47#include "hostname-util.h"
910fd145 48#include "id128-util.h"
3652872a 49#include "io-util.h"
8fe0087e 50#include "log.h"
2d845785 51#include "loop-util.h"
8fe0087e 52#include "loopback-setup.h"
1b9cebf6 53#include "machine-image.h"
8fe0087e 54#include "macro.h"
44dbef90 55#include "main-func.h"
f5947a5e 56#include "missing_sched.h"
8fe0087e 57#include "mkdir.h"
4349cd7c 58#include "mount-util.h"
049af8ad 59#include "mountpoint-util.h"
0cb8e3d1 60#include "namespace-util.h"
8fe0087e 61#include "netlink-util.h"
07630cea 62#include "nspawn-cgroup.h"
3652872a 63#include "nspawn-creds.h"
3603efde 64#include "nspawn-def.h"
07630cea
LP
65#include "nspawn-expose-ports.h"
66#include "nspawn-mount.h"
67#include "nspawn-network.h"
de40a303 68#include "nspawn-oci.h"
7336138e 69#include "nspawn-patch-uid.h"
07630cea 70#include "nspawn-register.h"
910fd145 71#include "nspawn-seccomp.h"
07630cea
LP
72#include "nspawn-settings.h"
73#include "nspawn-setuid.h"
7732f92b 74#include "nspawn-stub-pid1.h"
d8b4d14d 75#include "nulstr-util.h"
d58ad743 76#include "os-util.h"
50ebcf6c 77#include "pager.h"
6bedfcbb 78#include "parse-util.h"
8fe0087e 79#include "path-util.h"
294bf0c3 80#include "pretty-print.h"
0b452006 81#include "process-util.h"
8fe0087e
LP
82#include "ptyfwd.h"
83#include "random-util.h"
8869a0b4 84#include "raw-clone.h"
86775e35 85#include "resolve-util.h"
bf428efb 86#include "rlimit-util.h"
8fe0087e 87#include "rm-rf.h"
de40a303
LP
88#if HAVE_SECCOMP
89#include "seccomp-util.h"
90#endif
68b02049 91#include "selinux-util.h"
8fe0087e 92#include "signal-util.h"
2583fbea 93#include "socket-util.h"
8fcde012 94#include "stat-util.h"
15a5e950 95#include "stdio-util.h"
5c828e66 96#include "string-table.h"
07630cea 97#include "string-util.h"
8fe0087e 98#include "strv.h"
de40a303 99#include "sysctl-util.h"
8fe0087e 100#include "terminal-util.h"
e4de7287 101#include "tmpfile-util.h"
affb60b1 102#include "umask-util.h"
43c3fb46 103#include "unit-name.h"
b1d4f8e1 104#include "user-util.h"
8fe0087e 105#include "util.h"
e9642be2 106
e96ceaba
LP
107/* The notify socket inside the container it can use to talk to nspawn using the sd_notify(3) protocol */
108#define NSPAWN_NOTIFY_SOCKET_PATH "/run/host/notify"
0e7ac751 109
2a49b612
ZJS
110#define EXIT_FORCE_RESTART 133
111
113cea80
DH
112typedef enum ContainerStatus {
113 CONTAINER_TERMINATED,
6145bb4f 114 CONTAINER_REBOOTED,
113cea80
DH
115} ContainerStatus;
116
88213476 117static char *arg_directory = NULL;
ec16945e 118static char *arg_template = NULL;
5f932eb9 119static char *arg_chdir = NULL;
b53ede69
PW
120static char *arg_pivot_root_new = NULL;
121static char *arg_pivot_root_old = NULL;
687d0825 122static char *arg_user = NULL;
de40a303
LP
123static uid_t arg_uid = UID_INVALID;
124static gid_t arg_gid = GID_INVALID;
125static gid_t* arg_supplementary_gids = NULL;
126static size_t arg_n_supplementary_gids = 0;
9444b1f2 127static sd_id128_t arg_uuid = {};
3a9530e5
LP
128static char *arg_machine = NULL; /* The name used by the host to refer to this */
129static char *arg_hostname = NULL; /* The name the payload sees by default */
c74e630d
LP
130static const char *arg_selinux_context = NULL;
131static const char *arg_selinux_apifs_context = NULL;
de40a303 132static char *arg_slice = NULL;
ff01d048 133static bool arg_private_network = false;
bc2f673e 134static bool arg_read_only = false;
7732f92b 135static StartMode arg_start_mode = START_PID1;
ec16945e 136static bool arg_ephemeral = false;
57fb9fb5 137static LinkJournal arg_link_journal = LINK_AUTO;
574edc90 138static bool arg_link_journal_try = false;
520e0d54 139static uint64_t arg_caps_retain =
50b52222
LP
140 (1ULL << CAP_AUDIT_CONTROL) |
141 (1ULL << CAP_AUDIT_WRITE) |
5076f0cc
LP
142 (1ULL << CAP_CHOWN) |
143 (1ULL << CAP_DAC_OVERRIDE) |
144 (1ULL << CAP_DAC_READ_SEARCH) |
145 (1ULL << CAP_FOWNER) |
146 (1ULL << CAP_FSETID) |
147 (1ULL << CAP_IPC_OWNER) |
148 (1ULL << CAP_KILL) |
149 (1ULL << CAP_LEASE) |
150 (1ULL << CAP_LINUX_IMMUTABLE) |
50b52222 151 (1ULL << CAP_MKNOD) |
5076f0cc
LP
152 (1ULL << CAP_NET_BIND_SERVICE) |
153 (1ULL << CAP_NET_BROADCAST) |
154 (1ULL << CAP_NET_RAW) |
5076f0cc 155 (1ULL << CAP_SETFCAP) |
50b52222 156 (1ULL << CAP_SETGID) |
5076f0cc
LP
157 (1ULL << CAP_SETPCAP) |
158 (1ULL << CAP_SETUID) |
159 (1ULL << CAP_SYS_ADMIN) |
50b52222 160 (1ULL << CAP_SYS_BOOT) |
5076f0cc
LP
161 (1ULL << CAP_SYS_CHROOT) |
162 (1ULL << CAP_SYS_NICE) |
163 (1ULL << CAP_SYS_PTRACE) |
d87be9b0 164 (1ULL << CAP_SYS_RESOURCE) |
50b52222 165 (1ULL << CAP_SYS_TTY_CONFIG);
de40a303 166static CapabilityQuintet arg_full_capabilities = CAPABILITY_QUINTET_NULL;
5a8af538 167static CustomMount *arg_custom_mounts = NULL;
88614c8a 168static size_t arg_n_custom_mounts = 0;
f4889f65 169static char **arg_setenv = NULL;
284c0b91 170static bool arg_quiet = false;
eb91eb18 171static bool arg_register = true;
89f7c846 172static bool arg_keep_unit = false;
aa28aefe 173static char **arg_network_interfaces = NULL;
c74e630d 174static char **arg_network_macvlan = NULL;
4bbfe7ad 175static char **arg_network_ipvlan = NULL;
69c79d3c 176static bool arg_network_veth = false;
f6d6bad1 177static char **arg_network_veth_extra = NULL;
f757855e 178static char *arg_network_bridge = NULL;
22b28dfd 179static char *arg_network_zone = NULL;
d7bea6b6 180static char *arg_network_namespace_path = NULL;
bb068de0 181static PagerFlags arg_pager_flags = 0;
050f7277 182static unsigned long arg_personality = PERSONALITY_INVALID;
ec16945e 183static char *arg_image = NULL;
de40a303 184static char *arg_oci_bundle = NULL;
f757855e 185static VolatileMode arg_volatile_mode = VOLATILE_NO;
6d0b55c2 186static ExposePort *arg_expose_ports = NULL;
f36933fe 187static char **arg_property = NULL;
de40a303 188static sd_bus_message *arg_property_message = NULL;
0de7acce 189static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
6dac160c 190static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
0de7acce 191static bool arg_userns_chown = false;
c6c8f6e2 192static int arg_kill_signal = 0;
5da38d07 193static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN;
f757855e
LP
194static SettingsMask arg_settings_mask = 0;
195static int arg_settings_trusted = -1;
196static char **arg_parameters = NULL;
6aadfa4c 197static const char *arg_container_service_name = "systemd-nspawn";
9c1e04d0 198static bool arg_notify_ready = false;
5a8ff0e6 199static bool arg_use_cgns = true;
0c582db0 200static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
1099ceeb 201static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_TMPFS_TMP;
89e62e0b 202static VeritySettings arg_verity_settings = {};
6b000af4
LP
203static char **arg_syscall_allow_list = NULL;
204static char **arg_syscall_deny_list = NULL;
de40a303
LP
205#if HAVE_SECCOMP
206static scmp_filter_ctx arg_seccomp = NULL;
207#endif
bf428efb 208static struct rlimit *arg_rlimit[_RLIMIT_MAX] = {};
66edd963 209static bool arg_no_new_privileges = false;
81f345df
LP
210static int arg_oom_score_adjust = 0;
211static bool arg_oom_score_adjust_set = false;
0985c7c4 212static CPUSet arg_cpu_set = {};
09d423e9 213static ResolvConfMode arg_resolv_conf = RESOLV_CONF_AUTO;
1688841f 214static TimezoneMode arg_timezone = TIMEZONE_AUTO;
de40a303
LP
215static unsigned arg_console_width = (unsigned) -1, arg_console_height = (unsigned) -1;
216static DeviceNode* arg_extra_nodes = NULL;
217static size_t arg_n_extra_nodes = 0;
218static char **arg_sysctl = NULL;
219static ConsoleMode arg_console_mode = _CONSOLE_MODE_INVALID;
3652872a
LP
220static Credential *arg_credentials = NULL;
221static size_t arg_n_credentials = 0;
88213476 222
6145bb4f
LP
223STATIC_DESTRUCTOR_REGISTER(arg_directory, freep);
224STATIC_DESTRUCTOR_REGISTER(arg_template, freep);
225STATIC_DESTRUCTOR_REGISTER(arg_chdir, freep);
226STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_new, freep);
227STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_old, freep);
228STATIC_DESTRUCTOR_REGISTER(arg_user, freep);
229STATIC_DESTRUCTOR_REGISTER(arg_supplementary_gids, freep);
230STATIC_DESTRUCTOR_REGISTER(arg_machine, freep);
231STATIC_DESTRUCTOR_REGISTER(arg_hostname, freep);
232STATIC_DESTRUCTOR_REGISTER(arg_slice, freep);
233STATIC_DESTRUCTOR_REGISTER(arg_setenv, strv_freep);
234STATIC_DESTRUCTOR_REGISTER(arg_network_interfaces, strv_freep);
235STATIC_DESTRUCTOR_REGISTER(arg_network_macvlan, strv_freep);
236STATIC_DESTRUCTOR_REGISTER(arg_network_ipvlan, strv_freep);
237STATIC_DESTRUCTOR_REGISTER(arg_network_veth_extra, strv_freep);
238STATIC_DESTRUCTOR_REGISTER(arg_network_bridge, freep);
239STATIC_DESTRUCTOR_REGISTER(arg_network_zone, freep);
240STATIC_DESTRUCTOR_REGISTER(arg_network_namespace_path, freep);
241STATIC_DESTRUCTOR_REGISTER(arg_image, freep);
242STATIC_DESTRUCTOR_REGISTER(arg_oci_bundle, freep);
243STATIC_DESTRUCTOR_REGISTER(arg_property, strv_freep);
244STATIC_DESTRUCTOR_REGISTER(arg_property_message, sd_bus_message_unrefp);
245STATIC_DESTRUCTOR_REGISTER(arg_parameters, strv_freep);
89e62e0b 246STATIC_DESTRUCTOR_REGISTER(arg_verity_settings, verity_settings_done);
6b000af4
LP
247STATIC_DESTRUCTOR_REGISTER(arg_syscall_allow_list, strv_freep);
248STATIC_DESTRUCTOR_REGISTER(arg_syscall_deny_list, strv_freep);
6145bb4f
LP
249#if HAVE_SECCOMP
250STATIC_DESTRUCTOR_REGISTER(arg_seccomp, seccomp_releasep);
251#endif
0985c7c4 252STATIC_DESTRUCTOR_REGISTER(arg_cpu_set, cpu_set_reset);
6145bb4f
LP
253STATIC_DESTRUCTOR_REGISTER(arg_sysctl, strv_freep);
254
dce66ffe
ZJS
255static int handle_arg_console(const char *arg) {
256 if (streq(arg, "help")) {
257 puts("interactive\n"
258 "read-only\n"
259 "passive\n"
260 "pipe");
261 return 0;
262 }
263
264 if (streq(arg, "interactive"))
265 arg_console_mode = CONSOLE_INTERACTIVE;
266 else if (streq(arg, "read-only"))
267 arg_console_mode = CONSOLE_READ_ONLY;
268 else if (streq(arg, "passive"))
269 arg_console_mode = CONSOLE_PASSIVE;
554c4beb
LP
270 else if (streq(arg, "pipe")) {
271 if (isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0)
272 log_full(arg_quiet ? LOG_DEBUG : LOG_NOTICE,
273 "Console mode 'pipe' selected, but standard input/output are connected to an interactive TTY. "
274 "Most likely you want to use 'interactive' console mode for proper interactivity and shell job control. "
275 "Proceeding anyway.");
276
dce66ffe 277 arg_console_mode = CONSOLE_PIPE;
554c4beb 278 } else
dce66ffe
ZJS
279 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown console mode: %s", optarg);
280
281 arg_settings_mask |= SETTING_CONSOLE_MODE;
282 return 1;
283}
284
37ec0fdd
LP
285static int help(void) {
286 _cleanup_free_ char *link = NULL;
287 int r;
288
bb068de0 289 (void) pager_open(arg_pager_flags);
50ebcf6c 290
37ec0fdd
LP
291 r = terminal_urlify_man("systemd-nspawn", "1", &link);
292 if (r < 0)
293 return log_oom();
294
25148653 295 printf("%1$s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
37a92352 296 "%5$sSpawn a command or OS in a light-weight container.%6$s\n\n"
a8828ed9
DW
297 " -h --help Show this help\n"
298 " --version Print version string\n"
69c79d3c 299 " -q --quiet Do not show status information\n"
bb068de0 300 " --no-pager Do not pipe output into a pager\n"
25148653
LP
301 " --settings=BOOLEAN Load additional settings from .nspawn file\n\n"
302 "%3$sImage:%4$s\n"
1b9e5b12 303 " -D --directory=PATH Root directory for the container\n"
ec16945e
LP
304 " --template=PATH Initialize root directory from template directory,\n"
305 " if missing\n"
306 " -x --ephemeral Run container with snapshot of root directory, and\n"
307 " remove it after exit\n"
25e68fd3
LP
308 " -i --image=PATH Root file system disk image (or device node) for\n"
309 " the container\n"
de40a303 310 " --oci-bundle=PATH OCI bundle directory\n"
25148653
LP
311 " --read-only Mount the root directory read-only\n"
312 " --volatile[=MODE] Run the system in volatile mode\n"
25e68fd3 313 " --root-hash=HASH Specify verity root hash for root disk image\n"
c2923fdc
LB
314 " --root-hash-sig=SIG Specify pkcs7 signature of root hash for verity\n"
315 " as a DER encoded PKCS7, either as a path to a file\n"
316 " or as an ASCII base64 encoded string prefixed by\n"
317 " 'base64:'\n"
e7cbe5cb 318 " --verity-data=PATH Specify hash device for verity\n"
25148653
LP
319 " --pivot-root=PATH[:PATH]\n"
320 " Pivot root to given directory in the container\n\n"
321 "%3$sExecution:%4$s\n"
7732f92b 322 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
a8828ed9 323 " -b --boot Boot up full system (i.e. invoke init)\n"
5f932eb9 324 " --chdir=PATH Set working directory in the container\n"
25148653
LP
325 " -E --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
326 " -u --user=USER Run the command under specified user or UID\n"
327 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
328 " --notify-ready=BOOLEAN Receive notifications from the child init process\n\n"
329 "%3$sSystem Identity:%4$s\n"
a8828ed9 330 " -M --machine=NAME Set the machine name for the container\n"
3a9530e5 331 " --hostname=NAME Override the hostname for the container\n"
25148653
LP
332 " --uuid=UUID Set a specific machine UUID for the container\n\n"
333 "%3$sProperties:%4$s\n"
a8828ed9 334 " -S --slice=SLICE Place the container in the specified slice\n"
f36933fe 335 " --property=NAME=VALUE Set scope unit property\n"
25148653
LP
336 " --register=BOOLEAN Register container as machine\n"
337 " --keep-unit Do not register a scope for the machine, reuse\n"
338 " the service unit nspawn is running in\n\n"
339 "%3$sUser Namespacing:%4$s\n"
90b4a64d 340 " -U --private-users=pick Run within user namespace, autoselect UID/GID range\n"
03cfe0d5 341 " --private-users[=UIDBASE[:NUIDS]]\n"
90b4a64d 342 " Similar, but with user configured UID/GID range\n"
25148653
LP
343 " --private-users-chown Adjust OS tree ownership to private UID/GID range\n\n"
344 "%3$sNetworking:%4$s\n"
69c79d3c
LP
345 " --private-network Disable network in container\n"
346 " --network-interface=INTERFACE\n"
347 " Assign an existing network interface to the\n"
348 " container\n"
c74e630d
LP
349 " --network-macvlan=INTERFACE\n"
350 " Create a macvlan network interface based on an\n"
351 " existing network interface to the container\n"
4bbfe7ad
TG
352 " --network-ipvlan=INTERFACE\n"
353 " Create a ipvlan network interface based on an\n"
354 " existing network interface to the container\n"
a8eaaee7 355 " -n --network-veth Add a virtual Ethernet connection between host\n"
69c79d3c 356 " and container\n"
f6d6bad1
LP
357 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
358 " Add an additional virtual Ethernet link between\n"
359 " host and container\n"
ab046dde 360 " --network-bridge=INTERFACE\n"
90b4a64d
ZJS
361 " Add a virtual Ethernet connection to the container\n"
362 " and attach it to an existing bridge on the host\n"
363 " --network-zone=NAME Similar, but attach the new interface to an\n"
364 " an automatically managed bridge interface\n"
d7bea6b6
DP
365 " --network-namespace-path=PATH\n"
366 " Set network namespace to the one represented by\n"
367 " the specified kernel namespace file node\n"
6d0b55c2 368 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
25148653
LP
369 " Expose a container IP port on the host\n\n"
370 "%3$sSecurity:%4$s\n"
a8828ed9
DW
371 " --capability=CAP In addition to the default, retain specified\n"
372 " capability\n"
373 " --drop-capability=CAP Drop the specified capability from the default set\n"
f4e803c8 374 " --no-new-privileges Set PR_SET_NO_NEW_PRIVS flag for container payload\n"
960e4569
LP
375 " --system-call-filter=LIST|~LIST\n"
376 " Permit/prohibit specific system calls\n"
25148653
LP
377 " -Z --selinux-context=SECLABEL\n"
378 " Set the SELinux security context to be used by\n"
379 " processes in the container\n"
380 " -L --selinux-apifs-context=SECLABEL\n"
381 " Set the SELinux security context to be used by\n"
382 " API/tmpfs file systems in the container\n\n"
383 "%3$sResources:%4$s\n"
bf428efb 384 " --rlimit=NAME=LIMIT Set a resource limit for the payload\n"
81f345df
LP
385 " --oom-score-adjust=VALUE\n"
386 " Adjust the OOM score value for the payload\n"
f4e803c8
LP
387 " --cpu-affinity=CPUS Adjust the CPU affinity of the container\n"
388 " --personality=ARCH Pick personality for this container\n\n"
25148653 389 "%3$sIntegration:%4$s\n"
09d423e9 390 " --resolv-conf=MODE Select mode of /etc/resolv.conf initialization\n"
1688841f 391 " --timezone=MODE Select mode of /etc/localtime initialization\n"
25148653
LP
392 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
393 " host, try-guest, try-host\n"
394 " -j Equivalent to --link-journal=try-guest\n\n"
395 "%3$sMounts:%4$s\n"
5e5bfa6e
EY
396 " --bind=PATH[:PATH[:OPTIONS]]\n"
397 " Bind mount a file or directory from the host into\n"
a8828ed9 398 " the container\n"
5e5bfa6e
EY
399 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
400 " Similar, but creates a read-only bind mount\n"
de40a303
LP
401 " --inaccessible=PATH Over-mount file node with inaccessible node to mask\n"
402 " it\n"
06c17c39 403 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
5a8af538
LP
404 " --overlay=PATH[:PATH...]:PATH\n"
405 " Create an overlay mount from the host to \n"
406 " the container\n"
407 " --overlay-ro=PATH[:PATH...]:PATH\n"
25148653
LP
408 " Similar, but creates a read-only overlay mount\n\n"
409 "%3$sInput/Output:%4$s\n"
de40a303
LP
410 " --console=MODE Select how stdin/stdout/stderr and /dev/console are\n"
411 " set up for the container.\n"
3652872a
LP
412 " -P --pipe Equivalent to --console=pipe\n\n"
413 "%3$sCredentials:%4$s\n"
414 " --set-credential=ID:VALUE\n"
415 " Pass a credential with literal value to container.\n"
416 " --load-credential=ID:PATH\n"
417 " Load credential to pass to container from file or\n"
418 " AF_UNIX stream socket.\n"
25148653 419 "\nSee the %2$s for details.\n"
37ec0fdd
LP
420 , program_invocation_short_name
421 , link
37a92352
LP
422 , ansi_underline(), ansi_normal()
423 , ansi_highlight(), ansi_normal()
424 );
37ec0fdd
LP
425
426 return 0;
88213476
LP
427}
428
86c0dd4a 429static int custom_mount_check_all(void) {
88614c8a 430 size_t i;
5a8af538 431
5a8af538
LP
432 for (i = 0; i < arg_n_custom_mounts; i++) {
433 CustomMount *m = &arg_custom_mounts[i];
434
0de7acce 435 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
baaa35ad
ZJS
436 if (arg_userns_chown)
437 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
438 "--private-users-chown may not be combined with custom root mounts.");
439 else if (arg_uid_shift == UID_INVALID)
440 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
441 "--private-users with automatic UID shift may not be combined with custom root mounts.");
825d5287 442 }
5a8af538
LP
443 }
444
445 return 0;
446}
447
8199d554 448static int detect_unified_cgroup_hierarchy_from_environment(void) {
c78c095b 449 const char *e, *var = "SYSTEMD_NSPAWN_UNIFIED_HIERARCHY";
415fc41c 450 int r;
5da38d07 451
efdb0237 452 /* Allow the user to control whether the unified hierarchy is used */
c78c095b
ZJS
453
454 e = getenv(var);
455 if (!e) {
d5fc5b2f 456 /* $UNIFIED_CGROUP_HIERARCHY has been renamed to $SYSTEMD_NSPAWN_UNIFIED_HIERARCHY. */
c78c095b
ZJS
457 var = "UNIFIED_CGROUP_HIERARCHY";
458 e = getenv(var);
c78c095b
ZJS
459 }
460
461 if (!isempty(e)) {
efdb0237
LP
462 r = parse_boolean(e);
463 if (r < 0)
c78c095b 464 return log_error_errno(r, "Failed to parse $%s: %m", var);
5da38d07
TH
465 if (r > 0)
466 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
467 else
468 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237
LP
469 }
470
8199d554
LP
471 return 0;
472}
473
474static int detect_unified_cgroup_hierarchy_from_image(const char *directory) {
475 int r;
476
75b0d8b8
ZJS
477 /* Let's inherit the mode to use from the host system, but let's take into consideration what systemd
478 * in the image actually supports. */
b4cccbc1
LP
479 r = cg_all_unified();
480 if (r < 0)
481 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
482 if (r > 0) {
a8725a06
ZJS
483 /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
484 * routine only detects 231, so we'll have a false negative here for 230. */
485 r = systemd_installation_has_version(directory, 230);
486 if (r < 0)
487 return log_error_errno(r, "Failed to determine systemd version in container: %m");
488 if (r > 0)
489 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
490 else
491 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
c22800e4 492 } else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
2977724b
TH
493 /* Mixed cgroup hierarchy support was added in 233 */
494 r = systemd_installation_has_version(directory, 233);
0fd9563f
ZJS
495 if (r < 0)
496 return log_error_errno(r, "Failed to determine systemd version in container: %m");
497 if (r > 0)
498 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
499 else
500 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
501 } else
5da38d07 502 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237 503
8199d554
LP
504 log_debug("Using %s hierarchy for container.",
505 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_NONE ? "legacy" :
506 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_SYSTEMD ? "hybrid" : "unified");
507
efdb0237
LP
508 return 0;
509}
510
8a99bd0c
ZJS
511static int parse_capability_spec(const char *spec, uint64_t *ret_mask) {
512 uint64_t mask = 0;
513 int r;
514
515 for (;;) {
516 _cleanup_free_ char *t = NULL;
517
518 r = extract_first_word(&spec, &t, ",", 0);
519 if (r < 0)
520 return log_error_errno(r, "Failed to parse capability %s.", t);
521 if (r == 0)
522 break;
523
524 if (streq(t, "help")) {
525 for (int i = 0; i < capability_list_length(); i++) {
526 const char *name;
527
528 name = capability_to_name(i);
529 if (name)
530 puts(name);
531 }
532
533 return 0; /* quit */
534 }
535
536 if (streq(t, "all"))
537 mask = (uint64_t) -1;
538 else {
539 r = capability_from_name(t);
540 if (r < 0)
541 return log_error_errno(r, "Failed to parse capability %s.", t);
542
543 mask |= 1ULL << r;
544 }
545 }
546
547 *ret_mask = mask;
548 return 1; /* continue */
549}
550
49048684 551static int parse_share_ns_env(const char *name, unsigned long ns_flag) {
0c582db0
LB
552 int r;
553
554 r = getenv_bool(name);
555 if (r == -ENXIO)
49048684 556 return 0;
0c582db0 557 if (r < 0)
49048684 558 return log_error_errno(r, "Failed to parse $%s: %m", name);
de40a303 559
0c582db0 560 arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
de40a303 561 arg_settings_mask |= SETTING_CLONE_NS_FLAGS;
49048684 562 return 0;
0c582db0
LB
563}
564
49048684 565static int parse_mount_settings_env(void) {
4f086aab 566 const char *e;
1099ceeb
LP
567 int r;
568
569 r = getenv_bool("SYSTEMD_NSPAWN_TMPFS_TMP");
49048684
ZJS
570 if (r < 0 && r != -ENXIO)
571 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_TMPFS_TMP: %m");
1099ceeb
LP
572 if (r >= 0)
573 SET_FLAG(arg_mount_settings, MOUNT_APPLY_TMPFS_TMP, r > 0);
4f086aab
SU
574
575 e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
49048684 576 if (streq_ptr(e, "network"))
4f086aab 577 arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS;
4f086aab 578
49048684
ZJS
579 else if (e) {
580 r = parse_boolean(e);
581 if (r < 0)
582 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_API_VFS_WRITABLE: %m");
583
584 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_RO, r == 0);
585 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_NETNS, false);
ab8ee0f2 586 }
4f086aab 587
49048684 588 return 0;
4f086aab
SU
589}
590
49048684 591static int parse_environment(void) {
d5455d2f
LP
592 const char *e;
593 int r;
594
49048684
ZJS
595 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
596 if (r < 0)
597 return r;
598 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
599 if (r < 0)
600 return r;
601 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
602 if (r < 0)
603 return r;
604 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
605 if (r < 0)
606 return r;
d5455d2f 607
49048684
ZJS
608 r = parse_mount_settings_env();
609 if (r < 0)
610 return r;
d5455d2f 611
489fae52
ZJS
612 /* SYSTEMD_NSPAWN_USE_CGNS=0 can be used to disable CLONE_NEWCGROUP use,
613 * even if it is supported. If not supported, it has no effect. */
de40a303 614 if (!cg_ns_supported())
489fae52 615 arg_use_cgns = false;
de40a303
LP
616 else {
617 r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
618 if (r < 0) {
619 if (r != -ENXIO)
49048684 620 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_USE_CGNS: %m");
de40a303
LP
621
622 arg_use_cgns = true;
623 } else {
624 arg_use_cgns = r > 0;
625 arg_settings_mask |= SETTING_USE_CGNS;
626 }
627 }
d5455d2f
LP
628
629 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
630 if (e)
631 arg_container_service_name = e;
632
49048684 633 return detect_unified_cgroup_hierarchy_from_environment();
d5455d2f
LP
634}
635
88213476 636static int parse_argv(int argc, char *argv[]) {
a41fe3a2 637 enum {
acbeb427
ZJS
638 ARG_VERSION = 0x100,
639 ARG_PRIVATE_NETWORK,
bc2f673e 640 ARG_UUID,
5076f0cc 641 ARG_READ_ONLY,
57fb9fb5 642 ARG_CAPABILITY,
420c7379 643 ARG_DROP_CAPABILITY,
17fe0523
LP
644 ARG_LINK_JOURNAL,
645 ARG_BIND,
f4889f65 646 ARG_BIND_RO,
06c17c39 647 ARG_TMPFS,
5a8af538
LP
648 ARG_OVERLAY,
649 ARG_OVERLAY_RO,
de40a303 650 ARG_INACCESSIBLE,
eb91eb18 651 ARG_SHARE_SYSTEM,
89f7c846 652 ARG_REGISTER,
aa28aefe 653 ARG_KEEP_UNIT,
69c79d3c 654 ARG_NETWORK_INTERFACE,
c74e630d 655 ARG_NETWORK_MACVLAN,
4bbfe7ad 656 ARG_NETWORK_IPVLAN,
ab046dde 657 ARG_NETWORK_BRIDGE,
22b28dfd 658 ARG_NETWORK_ZONE,
f6d6bad1 659 ARG_NETWORK_VETH_EXTRA,
d7bea6b6 660 ARG_NETWORK_NAMESPACE_PATH,
6afc95b7 661 ARG_PERSONALITY,
4d9f07b4 662 ARG_VOLATILE,
ec16945e 663 ARG_TEMPLATE,
f36933fe 664 ARG_PROPERTY,
6dac160c 665 ARG_PRIVATE_USERS,
c6c8f6e2 666 ARG_KILL_SIGNAL,
f757855e 667 ARG_SETTINGS,
5f932eb9 668 ARG_CHDIR,
b53ede69 669 ARG_PIVOT_ROOT,
7336138e 670 ARG_PRIVATE_USERS_CHOWN,
9c1e04d0 671 ARG_NOTIFY_READY,
4623e8e6 672 ARG_ROOT_HASH,
89e62e0b
LP
673 ARG_ROOT_HASH_SIG,
674 ARG_VERITY_DATA,
960e4569 675 ARG_SYSTEM_CALL_FILTER,
bf428efb 676 ARG_RLIMIT,
3a9530e5 677 ARG_HOSTNAME,
66edd963 678 ARG_NO_NEW_PRIVILEGES,
81f345df 679 ARG_OOM_SCORE_ADJUST,
d107bb7d 680 ARG_CPU_AFFINITY,
09d423e9 681 ARG_RESOLV_CONF,
1688841f 682 ARG_TIMEZONE,
de40a303
LP
683 ARG_CONSOLE,
684 ARG_PIPE,
685 ARG_OCI_BUNDLE,
bb068de0 686 ARG_NO_PAGER,
3652872a
LP
687 ARG_SET_CREDENTIAL,
688 ARG_LOAD_CREDENTIAL,
a41fe3a2
LP
689 };
690
88213476 691 static const struct option options[] = {
d7bea6b6
DP
692 { "help", no_argument, NULL, 'h' },
693 { "version", no_argument, NULL, ARG_VERSION },
694 { "directory", required_argument, NULL, 'D' },
695 { "template", required_argument, NULL, ARG_TEMPLATE },
696 { "ephemeral", no_argument, NULL, 'x' },
697 { "user", required_argument, NULL, 'u' },
698 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
699 { "as-pid2", no_argument, NULL, 'a' },
700 { "boot", no_argument, NULL, 'b' },
701 { "uuid", required_argument, NULL, ARG_UUID },
702 { "read-only", no_argument, NULL, ARG_READ_ONLY },
703 { "capability", required_argument, NULL, ARG_CAPABILITY },
704 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
66edd963 705 { "no-new-privileges", required_argument, NULL, ARG_NO_NEW_PRIVILEGES },
d7bea6b6
DP
706 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
707 { "bind", required_argument, NULL, ARG_BIND },
708 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
709 { "tmpfs", required_argument, NULL, ARG_TMPFS },
710 { "overlay", required_argument, NULL, ARG_OVERLAY },
711 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
de40a303 712 { "inaccessible", required_argument, NULL, ARG_INACCESSIBLE },
d7bea6b6 713 { "machine", required_argument, NULL, 'M' },
3a9530e5 714 { "hostname", required_argument, NULL, ARG_HOSTNAME },
d7bea6b6
DP
715 { "slice", required_argument, NULL, 'S' },
716 { "setenv", required_argument, NULL, 'E' },
717 { "selinux-context", required_argument, NULL, 'Z' },
718 { "selinux-apifs-context", required_argument, NULL, 'L' },
719 { "quiet", no_argument, NULL, 'q' },
720 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */
721 { "register", required_argument, NULL, ARG_REGISTER },
722 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
723 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
724 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
725 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
726 { "network-veth", no_argument, NULL, 'n' },
727 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA },
728 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
729 { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
730 { "network-namespace-path", required_argument, NULL, ARG_NETWORK_NAMESPACE_PATH },
731 { "personality", required_argument, NULL, ARG_PERSONALITY },
732 { "image", required_argument, NULL, 'i' },
733 { "volatile", optional_argument, NULL, ARG_VOLATILE },
734 { "port", required_argument, NULL, 'p' },
735 { "property", required_argument, NULL, ARG_PROPERTY },
736 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
737 { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN },
738 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
739 { "settings", required_argument, NULL, ARG_SETTINGS },
740 { "chdir", required_argument, NULL, ARG_CHDIR },
741 { "pivot-root", required_argument, NULL, ARG_PIVOT_ROOT },
742 { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
743 { "root-hash", required_argument, NULL, ARG_ROOT_HASH },
89e62e0b
LP
744 { "root-hash-sig", required_argument, NULL, ARG_ROOT_HASH_SIG },
745 { "verity-data", required_argument, NULL, ARG_VERITY_DATA },
d7bea6b6 746 { "system-call-filter", required_argument, NULL, ARG_SYSTEM_CALL_FILTER },
bf428efb 747 { "rlimit", required_argument, NULL, ARG_RLIMIT },
81f345df 748 { "oom-score-adjust", required_argument, NULL, ARG_OOM_SCORE_ADJUST },
d107bb7d 749 { "cpu-affinity", required_argument, NULL, ARG_CPU_AFFINITY },
09d423e9 750 { "resolv-conf", required_argument, NULL, ARG_RESOLV_CONF },
1688841f 751 { "timezone", required_argument, NULL, ARG_TIMEZONE },
de40a303
LP
752 { "console", required_argument, NULL, ARG_CONSOLE },
753 { "pipe", no_argument, NULL, ARG_PIPE },
754 { "oci-bundle", required_argument, NULL, ARG_OCI_BUNDLE },
bb068de0 755 { "no-pager", no_argument, NULL, ARG_NO_PAGER },
3652872a
LP
756 { "set-credential", required_argument, NULL, ARG_SET_CREDENTIAL },
757 { "load-credential", required_argument, NULL, ARG_LOAD_CREDENTIAL },
eb9da376 758 {}
88213476
LP
759 };
760
9444b1f2 761 int c, r;
a42c8b54 762 uint64_t plus = 0, minus = 0;
f757855e 763 bool mask_all_settings = false, mask_no_settings = false;
88213476
LP
764
765 assert(argc >= 0);
766 assert(argv);
767
de40a303 768 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nUE:P", options, NULL)) >= 0)
88213476
LP
769 switch (c) {
770
771 case 'h':
37ec0fdd 772 return help();
88213476 773
acbeb427 774 case ARG_VERSION:
3f6fd1ba 775 return version();
acbeb427 776
88213476 777 case 'D':
0f03c2a4 778 r = parse_path_argument_and_warn(optarg, false, &arg_directory);
ec16945e 779 if (r < 0)
0f03c2a4 780 return r;
de40a303
LP
781
782 arg_settings_mask |= SETTING_DIRECTORY;
ec16945e
LP
783 break;
784
785 case ARG_TEMPLATE:
0f03c2a4 786 r = parse_path_argument_and_warn(optarg, false, &arg_template);
ec16945e 787 if (r < 0)
0f03c2a4 788 return r;
de40a303
LP
789
790 arg_settings_mask |= SETTING_DIRECTORY;
88213476
LP
791 break;
792
1b9e5b12 793 case 'i':
0f03c2a4 794 r = parse_path_argument_and_warn(optarg, false, &arg_image);
ec16945e 795 if (r < 0)
0f03c2a4 796 return r;
de40a303
LP
797
798 arg_settings_mask |= SETTING_DIRECTORY;
799 break;
800
801 case ARG_OCI_BUNDLE:
802 r = parse_path_argument_and_warn(optarg, false, &arg_oci_bundle);
803 if (r < 0)
804 return r;
805
ec16945e
LP
806 break;
807
808 case 'x':
809 arg_ephemeral = true;
a2f577fc 810 arg_settings_mask |= SETTING_EPHEMERAL;
1b9e5b12
LP
811 break;
812
687d0825 813 case 'u':
2fc09a9c
DM
814 r = free_and_strdup(&arg_user, optarg);
815 if (r < 0)
7027ff61 816 return log_oom();
687d0825 817
f757855e 818 arg_settings_mask |= SETTING_USER;
687d0825
MV
819 break;
820
22b28dfd
LP
821 case ARG_NETWORK_ZONE: {
822 char *j;
823
b910cc72 824 j = strjoin("vz-", optarg);
22b28dfd
LP
825 if (!j)
826 return log_oom();
827
828 if (!ifname_valid(j)) {
829 log_error("Network zone name not valid: %s", j);
830 free(j);
831 return -EINVAL;
832 }
833
df1fac6d 834 free_and_replace(arg_network_zone, j);
22b28dfd
LP
835
836 arg_network_veth = true;
837 arg_private_network = true;
838 arg_settings_mask |= SETTING_NETWORK;
839 break;
840 }
841
ab046dde 842 case ARG_NETWORK_BRIDGE:
ef76dff2 843
baaa35ad
ZJS
844 if (!ifname_valid(optarg))
845 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
846 "Bridge interface name not valid: %s", optarg);
ef76dff2 847
f757855e
LP
848 r = free_and_strdup(&arg_network_bridge, optarg);
849 if (r < 0)
850 return log_oom();
ab046dde 851
4831981d 852 _fallthrough_;
0dfaa006 853 case 'n':
69c79d3c
LP
854 arg_network_veth = true;
855 arg_private_network = true;
f757855e 856 arg_settings_mask |= SETTING_NETWORK;
69c79d3c
LP
857 break;
858
f6d6bad1
LP
859 case ARG_NETWORK_VETH_EXTRA:
860 r = veth_extra_parse(&arg_network_veth_extra, optarg);
861 if (r < 0)
862 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
863
864 arg_private_network = true;
865 arg_settings_mask |= SETTING_NETWORK;
866 break;
867
aa28aefe 868 case ARG_NETWORK_INTERFACE:
baaa35ad
ZJS
869 if (!ifname_valid(optarg))
870 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
871 "Network interface name not valid: %s", optarg);
ef76dff2 872
b390f178
DDM
873 r = test_network_interface_initialized(optarg);
874 if (r < 0)
875 return r;
876
c74e630d
LP
877 if (strv_extend(&arg_network_interfaces, optarg) < 0)
878 return log_oom();
879
880 arg_private_network = true;
f757855e 881 arg_settings_mask |= SETTING_NETWORK;
c74e630d
LP
882 break;
883
884 case ARG_NETWORK_MACVLAN:
ef76dff2 885
baaa35ad
ZJS
886 if (!ifname_valid(optarg))
887 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
888 "MACVLAN network interface name not valid: %s", optarg);
ef76dff2 889
b390f178
DDM
890 r = test_network_interface_initialized(optarg);
891 if (r < 0)
892 return r;
893
c74e630d 894 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
895 return log_oom();
896
4bbfe7ad 897 arg_private_network = true;
f757855e 898 arg_settings_mask |= SETTING_NETWORK;
4bbfe7ad
TG
899 break;
900
901 case ARG_NETWORK_IPVLAN:
ef76dff2 902
baaa35ad
ZJS
903 if (!ifname_valid(optarg))
904 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
905 "IPVLAN network interface name not valid: %s", optarg);
ef76dff2 906
b390f178
DDM
907 r = test_network_interface_initialized(optarg);
908 if (r < 0)
909 return r;
910
4bbfe7ad
TG
911 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
912 return log_oom();
913
4831981d 914 _fallthrough_;
ff01d048
LP
915 case ARG_PRIVATE_NETWORK:
916 arg_private_network = true;
f757855e 917 arg_settings_mask |= SETTING_NETWORK;
a41fe3a2
LP
918 break;
919
d7bea6b6
DP
920 case ARG_NETWORK_NAMESPACE_PATH:
921 r = parse_path_argument_and_warn(optarg, false, &arg_network_namespace_path);
922 if (r < 0)
923 return r;
924
de40a303 925 arg_settings_mask |= SETTING_NETWORK;
d7bea6b6
DP
926 break;
927
0f0dbc46 928 case 'b':
baaa35ad
ZJS
929 if (arg_start_mode == START_PID2)
930 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
931 "--boot and --as-pid2 may not be combined.");
7732f92b
LP
932
933 arg_start_mode = START_BOOT;
934 arg_settings_mask |= SETTING_START_MODE;
935 break;
936
937 case 'a':
baaa35ad
ZJS
938 if (arg_start_mode == START_BOOT)
939 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
940 "--boot and --as-pid2 may not be combined.");
7732f92b
LP
941
942 arg_start_mode = START_PID2;
943 arg_settings_mask |= SETTING_START_MODE;
0f0dbc46
LP
944 break;
945
144f0fc0 946 case ARG_UUID:
9444b1f2 947 r = sd_id128_from_string(optarg, &arg_uuid);
317feb4d
LP
948 if (r < 0)
949 return log_error_errno(r, "Invalid UUID: %s", optarg);
950
baaa35ad
ZJS
951 if (sd_id128_is_null(arg_uuid))
952 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
953 "Machine UUID may not be all zeroes.");
f757855e
LP
954
955 arg_settings_mask |= SETTING_MACHINE_ID;
9444b1f2 956 break;
aa96c6cb 957
43c3fb46
LP
958 case 'S': {
959 _cleanup_free_ char *mangled = NULL;
960
961 r = unit_name_mangle_with_suffix(optarg, NULL, UNIT_NAME_MANGLE_WARN, ".slice", &mangled);
de40a303
LP
962 if (r < 0)
963 return log_oom();
964
43c3fb46 965 free_and_replace(arg_slice, mangled);
de40a303 966 arg_settings_mask |= SETTING_SLICE;
144f0fc0 967 break;
43c3fb46 968 }
144f0fc0 969
7027ff61 970 case 'M':
c1521918 971 if (isempty(optarg))
97b11eed 972 arg_machine = mfree(arg_machine);
c1521918 973 else {
baaa35ad
ZJS
974 if (!machine_name_is_valid(optarg))
975 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
976 "Invalid machine name: %s", optarg);
7027ff61 977
0c3c4284
LP
978 r = free_and_strdup(&arg_machine, optarg);
979 if (r < 0)
eb91eb18 980 return log_oom();
eb91eb18 981 }
9ce6d1b3 982 break;
7027ff61 983
3a9530e5
LP
984 case ARG_HOSTNAME:
985 if (isempty(optarg))
986 arg_hostname = mfree(arg_hostname);
987 else {
baaa35ad
ZJS
988 if (!hostname_is_valid(optarg, false))
989 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
990 "Invalid hostname: %s", optarg);
3a9530e5
LP
991
992 r = free_and_strdup(&arg_hostname, optarg);
993 if (r < 0)
994 return log_oom();
995 }
996
997 arg_settings_mask |= SETTING_HOSTNAME;
998 break;
999
82adf6af
LP
1000 case 'Z':
1001 arg_selinux_context = optarg;
a8828ed9
DW
1002 break;
1003
82adf6af
LP
1004 case 'L':
1005 arg_selinux_apifs_context = optarg;
a8828ed9
DW
1006 break;
1007
bc2f673e
LP
1008 case ARG_READ_ONLY:
1009 arg_read_only = true;
f757855e 1010 arg_settings_mask |= SETTING_READ_ONLY;
bc2f673e
LP
1011 break;
1012
420c7379
LP
1013 case ARG_CAPABILITY:
1014 case ARG_DROP_CAPABILITY: {
8a99bd0c
ZJS
1015 uint64_t m;
1016 r = parse_capability_spec(optarg, &m);
1017 if (r <= 0)
1018 return r;
5076f0cc 1019
8a99bd0c
ZJS
1020 if (c == ARG_CAPABILITY)
1021 plus |= m;
1022 else
1023 minus |= m;
f757855e 1024 arg_settings_mask |= SETTING_CAPABILITY;
5076f0cc
LP
1025 break;
1026 }
66edd963
LP
1027 case ARG_NO_NEW_PRIVILEGES:
1028 r = parse_boolean(optarg);
1029 if (r < 0)
1030 return log_error_errno(r, "Failed to parse --no-new-privileges= argument: %s", optarg);
1031
1032 arg_no_new_privileges = r;
1033 arg_settings_mask |= SETTING_NO_NEW_PRIVILEGES;
1034 break;
1035
57fb9fb5
LP
1036 case 'j':
1037 arg_link_journal = LINK_GUEST;
574edc90 1038 arg_link_journal_try = true;
4e1d6aa9 1039 arg_settings_mask |= SETTING_LINK_JOURNAL;
57fb9fb5
LP
1040 break;
1041
1042 case ARG_LINK_JOURNAL:
4e1d6aa9 1043 r = parse_link_journal(optarg, &arg_link_journal, &arg_link_journal_try);
c6147113
LP
1044 if (r < 0)
1045 return log_error_errno(r, "Failed to parse link journal mode %s", optarg);
57fb9fb5 1046
4e1d6aa9 1047 arg_settings_mask |= SETTING_LINK_JOURNAL;
57fb9fb5
LP
1048 break;
1049
17fe0523 1050 case ARG_BIND:
f757855e
LP
1051 case ARG_BIND_RO:
1052 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
1053 if (r < 0)
1054 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
17fe0523 1055
f757855e 1056 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
17fe0523 1057 break;
06c17c39 1058
f757855e
LP
1059 case ARG_TMPFS:
1060 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
1061 if (r < 0)
1062 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
5a8af538 1063
f757855e 1064 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
5a8af538 1065 break;
5a8af538
LP
1066
1067 case ARG_OVERLAY:
ad85779a
LP
1068 case ARG_OVERLAY_RO:
1069 r = overlay_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_OVERLAY_RO);
1070 if (r == -EADDRNOTAVAIL)
1071 return log_error_errno(r, "--overlay(-ro)= needs at least two colon-separated directories specified.");
1072 if (r < 0)
1073 return log_error_errno(r, "Failed to parse --overlay(-ro)= argument %s: %m", optarg);
06c17c39 1074
f757855e 1075 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
06c17c39 1076 break;
06c17c39 1077
de40a303
LP
1078 case ARG_INACCESSIBLE:
1079 r = inaccessible_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
1080 if (r < 0)
1081 return log_error_errno(r, "Failed to parse --inaccessible= argument %s: %m", optarg);
1082
1083 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1084 break;
1085
a5f1cb3b 1086 case 'E': {
f4889f65
LP
1087 char **n;
1088
baaa35ad
ZJS
1089 if (!env_assignment_is_valid(optarg))
1090 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1091 "Environment variable assignment '%s' is not valid.", optarg);
f4889f65
LP
1092
1093 n = strv_env_set(arg_setenv, optarg);
1094 if (!n)
1095 return log_oom();
1096
130d3d22 1097 strv_free_and_replace(arg_setenv, n);
f757855e 1098 arg_settings_mask |= SETTING_ENVIRONMENT;
f4889f65
LP
1099 break;
1100 }
1101
284c0b91
LP
1102 case 'q':
1103 arg_quiet = true;
1104 break;
1105
8a96d94e 1106 case ARG_SHARE_SYSTEM:
a6b5216c 1107 /* We don't officially support this anymore, except for compat reasons. People should use the
0c582db0 1108 * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
63d1c29f 1109 log_warning("Please do not use --share-system anymore, use $SYSTEMD_NSPAWN_SHARE_* instead.");
0c582db0 1110 arg_clone_ns_flags = 0;
8a96d94e
LP
1111 break;
1112
eb91eb18
LP
1113 case ARG_REGISTER:
1114 r = parse_boolean(optarg);
1115 if (r < 0) {
1116 log_error("Failed to parse --register= argument: %s", optarg);
1117 return r;
1118 }
1119
1120 arg_register = r;
1121 break;
1122
89f7c846
LP
1123 case ARG_KEEP_UNIT:
1124 arg_keep_unit = true;
1125 break;
1126
6afc95b7
LP
1127 case ARG_PERSONALITY:
1128
ac45f971 1129 arg_personality = personality_from_string(optarg);
baaa35ad
ZJS
1130 if (arg_personality == PERSONALITY_INVALID)
1131 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1132 "Unknown or unsupported personality '%s'.", optarg);
6afc95b7 1133
f757855e 1134 arg_settings_mask |= SETTING_PERSONALITY;
6afc95b7
LP
1135 break;
1136
4d9f07b4
LP
1137 case ARG_VOLATILE:
1138
1139 if (!optarg)
f757855e 1140 arg_volatile_mode = VOLATILE_YES;
5c828e66
LP
1141 else if (streq(optarg, "help")) {
1142 DUMP_STRING_TABLE(volatile_mode, VolatileMode, _VOLATILE_MODE_MAX);
1143 return 0;
1144 } else {
f757855e 1145 VolatileMode m;
4d9f07b4 1146
f757855e 1147 m = volatile_mode_from_string(optarg);
baaa35ad
ZJS
1148 if (m < 0)
1149 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1150 "Failed to parse --volatile= argument: %s", optarg);
1151 else
f757855e 1152 arg_volatile_mode = m;
6d0b55c2
LP
1153 }
1154
f757855e
LP
1155 arg_settings_mask |= SETTING_VOLATILE_MODE;
1156 break;
6d0b55c2 1157
f757855e
LP
1158 case 'p':
1159 r = expose_port_parse(&arg_expose_ports, optarg);
1160 if (r == -EEXIST)
1161 return log_error_errno(r, "Duplicate port specification: %s", optarg);
1162 if (r < 0)
1163 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
6d0b55c2 1164
f757855e 1165 arg_settings_mask |= SETTING_EXPOSE_PORTS;
6d0b55c2 1166 break;
6d0b55c2 1167
f36933fe
LP
1168 case ARG_PROPERTY:
1169 if (strv_extend(&arg_property, optarg) < 0)
1170 return log_oom();
1171
1172 break;
1173
ae209204
ZJS
1174 case ARG_PRIVATE_USERS: {
1175 int boolean = -1;
0de7acce 1176
ae209204
ZJS
1177 if (!optarg)
1178 boolean = true;
1179 else if (!in_charset(optarg, DIGITS))
1180 /* do *not* parse numbers as booleans */
1181 boolean = parse_boolean(optarg);
1182
1183 if (boolean == false) {
0de7acce
LP
1184 /* no: User namespacing off */
1185 arg_userns_mode = USER_NAMESPACE_NO;
1186 arg_uid_shift = UID_INVALID;
1187 arg_uid_range = UINT32_C(0x10000);
ae209204 1188 } else if (boolean == true) {
0de7acce
LP
1189 /* yes: User namespacing on, UID range is read from root dir */
1190 arg_userns_mode = USER_NAMESPACE_FIXED;
1191 arg_uid_shift = UID_INVALID;
1192 arg_uid_range = UINT32_C(0x10000);
1193 } else if (streq(optarg, "pick")) {
1194 /* pick: User namespacing on, UID range is picked randomly */
1195 arg_userns_mode = USER_NAMESPACE_PICK;
1196 arg_uid_shift = UID_INVALID;
1197 arg_uid_range = UINT32_C(0x10000);
1198 } else {
6c2058b3 1199 _cleanup_free_ char *buffer = NULL;
6dac160c
LP
1200 const char *range, *shift;
1201
0de7acce
LP
1202 /* anything else: User namespacing on, UID range is explicitly configured */
1203
6dac160c
LP
1204 range = strchr(optarg, ':');
1205 if (range) {
6c2058b3
ZJS
1206 buffer = strndup(optarg, range - optarg);
1207 if (!buffer)
1208 return log_oom();
1209 shift = buffer;
6dac160c
LP
1210
1211 range++;
bfd292ec
ZJS
1212 r = safe_atou32(range, &arg_uid_range);
1213 if (r < 0)
be715731 1214 return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range);
6dac160c
LP
1215 } else
1216 shift = optarg;
1217
be715731
ZJS
1218 r = parse_uid(shift, &arg_uid_shift);
1219 if (r < 0)
1220 return log_error_errno(r, "Failed to parse UID \"%s\": %m", optarg);
0de7acce
LP
1221
1222 arg_userns_mode = USER_NAMESPACE_FIXED;
6dac160c
LP
1223 }
1224
baaa35ad
ZJS
1225 if (arg_uid_range <= 0)
1226 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1227 "UID range cannot be 0.");
be715731 1228
0de7acce 1229 arg_settings_mask |= SETTING_USERNS;
6dac160c 1230 break;
ae209204 1231 }
6dac160c 1232
0de7acce 1233 case 'U':
ccabee0d
LP
1234 if (userns_supported()) {
1235 arg_userns_mode = USER_NAMESPACE_PICK;
1236 arg_uid_shift = UID_INVALID;
1237 arg_uid_range = UINT32_C(0x10000);
1238
1239 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
1240 }
1241
7336138e
LP
1242 break;
1243
0de7acce 1244 case ARG_PRIVATE_USERS_CHOWN:
19aac838 1245 arg_userns_chown = true;
0de7acce
LP
1246
1247 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
1248 break;
1249
c6c8f6e2 1250 case ARG_KILL_SIGNAL:
5c828e66
LP
1251 if (streq(optarg, "help")) {
1252 DUMP_STRING_TABLE(signal, int, _NSIG);
1253 return 0;
1254 }
1255
29a3db75 1256 arg_kill_signal = signal_from_string(optarg);
baaa35ad
ZJS
1257 if (arg_kill_signal < 0)
1258 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1259 "Cannot parse signal: %s", optarg);
c6c8f6e2 1260
f757855e
LP
1261 arg_settings_mask |= SETTING_KILL_SIGNAL;
1262 break;
1263
1264 case ARG_SETTINGS:
1265
1266 /* no → do not read files
1267 * yes → read files, do not override cmdline, trust only subset
1268 * override → read files, override cmdline, trust only subset
1269 * trusted → read files, do not override cmdline, trust all
1270 */
1271
1272 r = parse_boolean(optarg);
1273 if (r < 0) {
1274 if (streq(optarg, "trusted")) {
1275 mask_all_settings = false;
1276 mask_no_settings = false;
1277 arg_settings_trusted = true;
1278
1279 } else if (streq(optarg, "override")) {
1280 mask_all_settings = false;
1281 mask_no_settings = true;
1282 arg_settings_trusted = -1;
1283 } else
1284 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
1285 } else if (r > 0) {
1286 /* yes */
1287 mask_all_settings = false;
1288 mask_no_settings = false;
1289 arg_settings_trusted = -1;
1290 } else {
1291 /* no */
1292 mask_all_settings = true;
1293 mask_no_settings = false;
1294 arg_settings_trusted = false;
1295 }
1296
c6c8f6e2
LP
1297 break;
1298
5f932eb9 1299 case ARG_CHDIR:
baaa35ad
ZJS
1300 if (!path_is_absolute(optarg))
1301 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1302 "Working directory %s is not an absolute path.", optarg);
5f932eb9
LP
1303
1304 r = free_and_strdup(&arg_chdir, optarg);
1305 if (r < 0)
1306 return log_oom();
1307
1308 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
1309 break;
1310
b53ede69
PW
1311 case ARG_PIVOT_ROOT:
1312 r = pivot_root_parse(&arg_pivot_root_new, &arg_pivot_root_old, optarg);
1313 if (r < 0)
1314 return log_error_errno(r, "Failed to parse --pivot-root= argument %s: %m", optarg);
1315
1316 arg_settings_mask |= SETTING_PIVOT_ROOT;
1317 break;
1318
9c1e04d0
AP
1319 case ARG_NOTIFY_READY:
1320 r = parse_boolean(optarg);
baaa35ad
ZJS
1321 if (r < 0)
1322 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1323 "%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
9c1e04d0
AP
1324 arg_notify_ready = r;
1325 arg_settings_mask |= SETTING_NOTIFY_READY;
1326 break;
1327
4623e8e6 1328 case ARG_ROOT_HASH: {
89e62e0b 1329 _cleanup_free_ void *k = NULL;
4623e8e6
LP
1330 size_t l;
1331
1332 r = unhexmem(optarg, strlen(optarg), &k, &l);
1333 if (r < 0)
1334 return log_error_errno(r, "Failed to parse root hash: %s", optarg);
89e62e0b 1335 if (l < sizeof(sd_id128_t))
c6147113 1336 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Root hash must be at least 128bit long: %s", optarg);
4623e8e6 1337
89e62e0b
LP
1338 free_and_replace(arg_verity_settings.root_hash, k);
1339 arg_verity_settings.root_hash_size = l;
4623e8e6
LP
1340 break;
1341 }
1342
c2923fdc
LB
1343 case ARG_ROOT_HASH_SIG: {
1344 char *value;
89e62e0b
LP
1345 size_t l;
1346 void *p;
c2923fdc
LB
1347
1348 if ((value = startswith(optarg, "base64:"))) {
c2923fdc
LB
1349 r = unbase64mem(value, strlen(value), &p, &l);
1350 if (r < 0)
1351 return log_error_errno(r, "Failed to parse root hash signature '%s': %m", optarg);
1352
c2923fdc 1353 } else {
89e62e0b 1354 r = read_full_file(optarg, (char**) &p, &l);
c2923fdc 1355 if (r < 0)
89e62e0b 1356 return log_error_errno(r, "Failed parse root hash signature file '%s': %m", optarg);
c2923fdc
LB
1357 }
1358
89e62e0b
LP
1359 free_and_replace(arg_verity_settings.root_hash_sig, p);
1360 arg_verity_settings.root_hash_sig_size = l;
c2923fdc
LB
1361 break;
1362 }
1363
89e62e0b
LP
1364 case ARG_VERITY_DATA:
1365 r = parse_path_argument_and_warn(optarg, false, &arg_verity_settings.data_path);
1366 if (r < 0)
1367 return r;
1368 break;
1369
960e4569
LP
1370 case ARG_SYSTEM_CALL_FILTER: {
1371 bool negative;
1372 const char *items;
1373
1374 negative = optarg[0] == '~';
1375 items = negative ? optarg + 1 : optarg;
1376
1377 for (;;) {
1378 _cleanup_free_ char *word = NULL;
1379
1380 r = extract_first_word(&items, &word, NULL, 0);
1381 if (r == 0)
1382 break;
1383 if (r == -ENOMEM)
1384 return log_oom();
1385 if (r < 0)
1386 return log_error_errno(r, "Failed to parse system call filter: %m");
1387
1388 if (negative)
6b000af4 1389 r = strv_extend(&arg_syscall_deny_list, word);
960e4569 1390 else
6b000af4 1391 r = strv_extend(&arg_syscall_allow_list, word);
960e4569
LP
1392 if (r < 0)
1393 return log_oom();
1394 }
1395
1396 arg_settings_mask |= SETTING_SYSCALL_FILTER;
1397 break;
1398 }
1399
bf428efb
LP
1400 case ARG_RLIMIT: {
1401 const char *eq;
622ecfa8 1402 _cleanup_free_ char *name = NULL;
bf428efb
LP
1403 int rl;
1404
5c828e66
LP
1405 if (streq(optarg, "help")) {
1406 DUMP_STRING_TABLE(rlimit, int, _RLIMIT_MAX);
1407 return 0;
1408 }
1409
bf428efb 1410 eq = strchr(optarg, '=');
baaa35ad
ZJS
1411 if (!eq)
1412 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1413 "--rlimit= expects an '=' assignment.");
bf428efb
LP
1414
1415 name = strndup(optarg, eq - optarg);
1416 if (!name)
1417 return log_oom();
1418
1419 rl = rlimit_from_string_harder(name);
baaa35ad
ZJS
1420 if (rl < 0)
1421 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1422 "Unknown resource limit: %s", name);
bf428efb
LP
1423
1424 if (!arg_rlimit[rl]) {
1425 arg_rlimit[rl] = new0(struct rlimit, 1);
1426 if (!arg_rlimit[rl])
1427 return log_oom();
1428 }
1429
1430 r = rlimit_parse(rl, eq + 1, arg_rlimit[rl]);
1431 if (r < 0)
1432 return log_error_errno(r, "Failed to parse resource limit: %s", eq + 1);
1433
1434 arg_settings_mask |= SETTING_RLIMIT_FIRST << rl;
1435 break;
1436 }
1437
81f345df
LP
1438 case ARG_OOM_SCORE_ADJUST:
1439 r = parse_oom_score_adjust(optarg, &arg_oom_score_adjust);
1440 if (r < 0)
1441 return log_error_errno(r, "Failed to parse --oom-score-adjust= parameter: %s", optarg);
1442
1443 arg_oom_score_adjust_set = true;
1444 arg_settings_mask |= SETTING_OOM_SCORE_ADJUST;
1445 break;
1446
d107bb7d 1447 case ARG_CPU_AFFINITY: {
0985c7c4 1448 CPUSet cpuset;
d107bb7d
LP
1449
1450 r = parse_cpu_set(optarg, &cpuset);
1451 if (r < 0)
0985c7c4 1452 return log_error_errno(r, "Failed to parse CPU affinity mask %s: %m", optarg);
d107bb7d 1453
0985c7c4
ZJS
1454 cpu_set_reset(&arg_cpu_set);
1455 arg_cpu_set = cpuset;
d107bb7d
LP
1456 arg_settings_mask |= SETTING_CPU_AFFINITY;
1457 break;
1458 }
1459
09d423e9
LP
1460 case ARG_RESOLV_CONF:
1461 if (streq(optarg, "help")) {
1462 DUMP_STRING_TABLE(resolv_conf_mode, ResolvConfMode, _RESOLV_CONF_MODE_MAX);
1463 return 0;
1464 }
1465
1466 arg_resolv_conf = resolv_conf_mode_from_string(optarg);
baaa35ad
ZJS
1467 if (arg_resolv_conf < 0)
1468 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1469 "Failed to parse /etc/resolv.conf mode: %s", optarg);
09d423e9
LP
1470
1471 arg_settings_mask |= SETTING_RESOLV_CONF;
1472 break;
1473
1688841f
LP
1474 case ARG_TIMEZONE:
1475 if (streq(optarg, "help")) {
1476 DUMP_STRING_TABLE(timezone_mode, TimezoneMode, _TIMEZONE_MODE_MAX);
1477 return 0;
1478 }
1479
1480 arg_timezone = timezone_mode_from_string(optarg);
baaa35ad
ZJS
1481 if (arg_timezone < 0)
1482 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1483 "Failed to parse /etc/localtime mode: %s", optarg);
1688841f
LP
1484
1485 arg_settings_mask |= SETTING_TIMEZONE;
1486 break;
1487
de40a303 1488 case ARG_CONSOLE:
dce66ffe
ZJS
1489 r = handle_arg_console(optarg);
1490 if (r <= 0)
1491 return r;
de40a303
LP
1492 break;
1493
1494 case 'P':
1495 case ARG_PIPE:
dce66ffe
ZJS
1496 r = handle_arg_console("pipe");
1497 if (r <= 0)
1498 return r;
de40a303
LP
1499 break;
1500
bb068de0
ZJS
1501 case ARG_NO_PAGER:
1502 arg_pager_flags |= PAGER_DISABLE;
1503 break;
1504
3652872a
LP
1505 case ARG_SET_CREDENTIAL: {
1506 _cleanup_free_ char *word = NULL, *data = NULL;
1507 const char *p = optarg;
1508 Credential *a;
1509 size_t i;
1510 int l;
1511
1512 r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
1513 if (r == -ENOMEM)
1514 return log_oom();
1515 if (r < 0)
1516 return log_error_errno(r, "Failed to parse --set-credential= parameter: %m");
1517 if (r == 0 || !p)
1518 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Missing value for --set-credential=: %s", optarg);
1519
1520 if (!credential_name_valid(word))
1521 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Credential name is not valid: %s", word);
1522
1523 for (i = 0; i < arg_n_credentials; i++)
1524 if (streq(arg_credentials[i].id, word))
1525 return log_error_errno(SYNTHETIC_ERRNO(EEXIST), "Duplicate credential '%s', refusing.", word);
1526
1527 l = cunescape(p, UNESCAPE_ACCEPT_NUL, &data);
1528 if (l < 0)
1529 return log_error_errno(l, "Failed to unescape credential data: %s", p);
1530
1531 a = reallocarray(arg_credentials, arg_n_credentials + 1, sizeof(Credential));
1532 if (!a)
1533 return log_oom();
1534
1535 a[arg_n_credentials++] = (Credential) {
1536 .id = TAKE_PTR(word),
1537 .data = TAKE_PTR(data),
1538 .size = l,
1539 };
1540
1541 arg_credentials = a;
1542
1543 arg_settings_mask |= SETTING_CREDENTIALS;
1544 break;
1545 }
1546
1547 case ARG_LOAD_CREDENTIAL: {
1548 ReadFullFileFlags flags = READ_FULL_FILE_SECURE;
1549 _cleanup_(erase_and_freep) char *data = NULL;
1550 _cleanup_free_ char *word = NULL, *j = NULL;
1551 const char *p = optarg;
1552 Credential *a;
1553 size_t size, i;
1554
1555 r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
1556 if (r == -ENOMEM)
1557 return log_oom();
1558 if (r < 0)
1559 return log_error_errno(r, "Failed to parse --set-credential= parameter: %m");
1560 if (r == 0 || !p)
1561 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Missing value for --set-credential=: %s", optarg);
1562
1563 if (!credential_name_valid(word))
1564 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Credential name is not valid: %s", word);
1565
1566 for (i = 0; i < arg_n_credentials; i++)
1567 if (streq(arg_credentials[i].id, word))
1568 return log_error_errno(SYNTHETIC_ERRNO(EEXIST), "Duplicate credential '%s', refusing.", word);
1569
1570 if (path_is_absolute(p))
1571 flags |= READ_FULL_FILE_CONNECT_SOCKET;
1572 else {
1573 const char *e;
1574
1575 e = getenv("CREDENTIALS_DIRECTORY");
1576 if (!e)
1577 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Credential not available (no credentials passed at all): %s", word);
1578
1579 j = path_join(e, p);
1580 if (!j)
1581 return log_oom();
1582 }
1583
1584 r = read_full_file_full(AT_FDCWD, j ?: p, flags, &data, &size);
1585 if (r < 0)
1586 return log_error_errno(r, "Failed to read credential '%s': %m", j ?: p);
1587
1588 a = reallocarray(arg_credentials, arg_n_credentials + 1, sizeof(Credential));
1589 if (!a)
1590 return log_oom();
1591
1592 a[arg_n_credentials++] = (Credential) {
1593 .id = TAKE_PTR(word),
1594 .data = TAKE_PTR(data),
1595 .size = size,
1596 };
1597
1598 arg_credentials = a;
1599
1600 arg_settings_mask |= SETTING_CREDENTIALS;
1601 break;
1602 }
1603
88213476
LP
1604 case '?':
1605 return -EINVAL;
1606
1607 default:
eb9da376 1608 assert_not_reached("Unhandled option");
88213476 1609 }
88213476 1610
60f1ec13
LP
1611 if (argc > optind) {
1612 strv_free(arg_parameters);
1613 arg_parameters = strv_copy(argv + optind);
1614 if (!arg_parameters)
1615 return log_oom();
d7bea6b6 1616
60f1ec13
LP
1617 arg_settings_mask |= SETTING_START_MODE;
1618 }
1619
1620 if (arg_ephemeral && arg_template && !arg_directory)
1621 /* User asked for ephemeral execution but specified --template= instead of --directory=. Semantically
1622 * such an invocation makes some sense, see https://github.com/systemd/systemd/issues/3667. Let's
1623 * accept this here, and silently make "--ephemeral --template=" equivalent to "--ephemeral
1624 * --directory=". */
1625 arg_directory = TAKE_PTR(arg_template);
1626
bd4b15f2 1627 arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? UINT64_C(1) << CAP_NET_ADMIN : 0)) & ~minus;
60f1ec13 1628
de40a303 1629 /* Make sure to parse environment before we reset the settings mask below */
49048684
ZJS
1630 r = parse_environment();
1631 if (r < 0)
1632 return r;
de40a303 1633
60f1ec13
LP
1634 /* Load all settings from .nspawn files */
1635 if (mask_no_settings)
1636 arg_settings_mask = 0;
1637
1638 /* Don't load any settings from .nspawn files */
1639 if (mask_all_settings)
1640 arg_settings_mask = _SETTINGS_MASK_ALL;
1641
1642 return 1;
1643}
1644
1645static int verify_arguments(void) {
1646 int r;
a6b5216c 1647
75b0d8b8
ZJS
1648 if (arg_start_mode == START_PID2 && arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
1649 /* If we are running the stub init in the container, we don't need to look at what the init
1650 * in the container supports, because we are not using it. Let's immediately pick the right
1651 * setting based on the host system configuration.
1652 *
1653 * We only do this, if the user didn't use an environment variable to override the detection.
1654 */
1655
1656 r = cg_all_unified();
1657 if (r < 0)
1658 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
1659 if (r > 0)
1660 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
1661 else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0)
1662 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
1663 else
1664 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
1665 }
1666
4f086aab
SU
1667 if (arg_userns_mode != USER_NAMESPACE_NO)
1668 arg_mount_settings |= MOUNT_USE_USERNS;
1669
1670 if (arg_private_network)
1671 arg_mount_settings |= MOUNT_APPLY_APIVFS_NETNS;
1672
48a8d337
LB
1673 if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
1674 !(arg_clone_ns_flags & CLONE_NEWUTS)) {
eb91eb18 1675 arg_register = false;
baaa35ad 1676 if (arg_start_mode != START_PID1)
60f1ec13 1677 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--boot cannot be used without namespacing.");
0c582db0 1678 }
eb91eb18 1679
0de7acce 1680 if (arg_userns_mode == USER_NAMESPACE_PICK)
0e7ac751
LP
1681 arg_userns_chown = true;
1682
60f1ec13
LP
1683 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
1684 arg_kill_signal = SIGRTMIN+3;
1685
e5a4bb0d
LP
1686 if (arg_volatile_mode != VOLATILE_NO) /* Make sure all file systems contained in the image are mounted read-only if we are in volatile mode */
1687 arg_read_only = true;
1688
2436ea76
DDM
1689 if (has_custom_root_mount(arg_custom_mounts, arg_n_custom_mounts))
1690 arg_read_only = true;
1691
baaa35ad 1692 if (arg_keep_unit && arg_register && cg_pid_get_owner_uid(0, NULL) >= 0)
8d9c2bca
AJ
1693 /* Save the user from accidentally registering either user-$SESSION.scope or user@.service.
1694 * The latter is not technically a user session, but we don't need to labour the point. */
60f1ec13 1695 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--keep-unit --register=yes may not be used when invoked from a user session.");
89f7c846 1696
baaa35ad 1697 if (arg_directory && arg_image)
60f1ec13 1698 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--directory= and --image= may not be combined.");
1b9e5b12 1699
baaa35ad 1700 if (arg_template && arg_image)
60f1ec13 1701 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= and --image= may not be combined.");
8cd328d8 1702
baaa35ad 1703 if (arg_template && !(arg_directory || arg_machine))
60f1ec13 1704 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= needs --directory= or --machine=.");
ec16945e 1705
baaa35ad 1706 if (arg_ephemeral && arg_template)
60f1ec13 1707 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --template= may not be combined.");
ec16945e 1708
baaa35ad 1709 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO))
60f1ec13 1710 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --link-journal= may not be combined.");
df9a75e4 1711
baaa35ad 1712 if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported())
60f1ec13 1713 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--private-users= is not supported, kernel compiled without user namespace support.");
7336138e 1714
baaa35ad 1715 if (arg_userns_chown && arg_read_only)
de40a303
LP
1716 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1717 "--read-only and --private-users-chown may not be combined.");
f757855e 1718
e5a4bb0d
LP
1719 /* We don't support --private-users-chown together with any of the volatile modes since we couldn't
1720 * change the read-only part of the tree (i.e. /usr) anyway, or because it would trigger a massive
5238e957 1721 * copy-up (in case of overlay) making the entire exercise pointless. */
e5a4bb0d
LP
1722 if (arg_userns_chown && arg_volatile_mode != VOLATILE_NO)
1723 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--volatile= and --private-users-chown may not be combined.");
1724
679ecd36
SZ
1725 /* If --network-namespace-path is given with any other network-related option (except --private-network),
1726 * we need to error out, to avoid conflicts between different network options. */
60f1ec13
LP
1727 if (arg_network_namespace_path &&
1728 (arg_network_interfaces || arg_network_macvlan ||
1729 arg_network_ipvlan || arg_network_veth_extra ||
1730 arg_network_bridge || arg_network_zone ||
679ecd36 1731 arg_network_veth))
de40a303 1732 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--network-namespace-path= cannot be combined with other network options.");
86c0dd4a 1733
60f1ec13 1734 if (arg_network_bridge && arg_network_zone)
de40a303
LP
1735 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1736 "--network-bridge= and --network-zone= may not be combined.");
f757855e 1737
baaa35ad 1738 if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network)
60f1ec13 1739 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
4f086aab 1740
baaa35ad 1741 if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO))
60f1ec13 1742 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot combine --private-users with read-write mounts.");
f757855e 1743
baaa35ad 1744 if (arg_expose_ports && !arg_private_network)
60f1ec13 1745 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot use --port= without private networking.");
6d0b55c2 1746
349cc4a5 1747#if ! HAVE_LIBIPTC
baaa35ad 1748 if (arg_expose_ports)
60f1ec13 1749 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--port= is not supported, compiled without libiptc support.");
1c1ea217
EV
1750#endif
1751
60f1ec13
LP
1752 r = custom_mount_check_all();
1753 if (r < 0)
1754 return r;
c6c8f6e2 1755
f757855e 1756 return 0;
88213476
LP
1757}
1758
03cfe0d5
LP
1759static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1760 assert(p);
1761
0de7acce 1762 if (arg_userns_mode == USER_NAMESPACE_NO)
03cfe0d5
LP
1763 return 0;
1764
1765 if (uid == UID_INVALID && gid == GID_INVALID)
1766 return 0;
1767
1768 if (uid != UID_INVALID) {
1769 uid += arg_uid_shift;
1770
1771 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1772 return -EOVERFLOW;
1773 }
1774
1775 if (gid != GID_INVALID) {
1776 gid += (gid_t) arg_uid_shift;
1777
1778 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1779 return -EOVERFLOW;
1780 }
1781
1782 if (lchown(p, uid, gid) < 0)
1783 return -errno;
b12afc8c
LP
1784
1785 return 0;
1786}
1787
03cfe0d5
LP
/* Creates directory 'path' below 'root' with the given mode and, if it was newly created, hands it
 * to the (shifted) uid/gid via userns_lchown(). An already existing directory is left untouched and
 * counts as success. Returns 0 on success, negative errno on failure. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;
        int r;

        full = prefix_roota(root, path);

        r = mkdir_errno_wrapper(full, mode);
        if (r >= 0)
                return userns_lchown(full, uid, gid);

        return r == -EEXIST ? 0 : r;
}
1801
/* Returns what PATH_STARTSWITH_SET() yields for 'path' against the two zoneinfo prefixes below,
 * i.e. a non-NULL result only if 'path' begins with one of them. */
static const char *timezone_from_path(const char *path) {
        const char *zone;

        zone = PATH_STARTSWITH_SET(path, "../usr/share/zoneinfo/", "/usr/share/zoneinfo/");

        return zone;
}
1808
83205269
LP
1809static bool etc_writable(void) {
1810 return !arg_read_only || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_OVERLAY);
1811}
1812
e58a1277 1813static int setup_timezone(const char *dest) {
1688841f
LP
1814 _cleanup_free_ char *p = NULL, *etc = NULL;
1815 const char *where, *check;
1816 TimezoneMode m;
d4036145 1817 int r;
f8440af5 1818
e58a1277
LP
1819 assert(dest);
1820
1688841f 1821 if (IN_SET(arg_timezone, TIMEZONE_AUTO, TIMEZONE_SYMLINK)) {
1688841f
LP
1822 r = readlink_malloc("/etc/localtime", &p);
1823 if (r == -ENOENT && arg_timezone == TIMEZONE_AUTO)
83205269 1824 m = etc_writable() ? TIMEZONE_DELETE : TIMEZONE_OFF;
1688841f 1825 else if (r == -EINVAL && arg_timezone == TIMEZONE_AUTO) /* regular file? */
83205269 1826 m = etc_writable() ? TIMEZONE_COPY : TIMEZONE_BIND;
1688841f
LP
1827 else if (r < 0) {
1828 log_warning_errno(r, "Failed to read host's /etc/localtime symlink, not updating container timezone: %m");
1829 /* To handle warning, delete /etc/localtime and replace it with a symbolic link to a time zone data
1830 * file.
1831 *
1832 * Example:
1833 * ln -s /usr/share/zoneinfo/UTC /etc/localtime
1834 */
1835 return 0;
1836 } else if (arg_timezone == TIMEZONE_AUTO)
83205269 1837 m = etc_writable() ? TIMEZONE_SYMLINK : TIMEZONE_BIND;
1688841f
LP
1838 else
1839 m = arg_timezone;
1840 } else
1841 m = arg_timezone;
1842
1843 if (m == TIMEZONE_OFF)
1844 return 0;
1845
a5648b80 1846 r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
d4036145 1847 if (r < 0) {
1688841f 1848 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
d4036145
LP
1849 return 0;
1850 }
1851
1688841f
LP
1852 where = strjoina(etc, "/localtime");
1853
1854 switch (m) {
1855
1856 case TIMEZONE_DELETE:
1857 if (unlink(where) < 0)
1858 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
1859
d4036145 1860 return 0;
d4036145 1861
1688841f
LP
1862 case TIMEZONE_SYMLINK: {
1863 _cleanup_free_ char *q = NULL;
1864 const char *z, *what;
4d1c38b8 1865
1688841f
LP
1866 z = timezone_from_path(p);
1867 if (!z) {
1868 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
d4036145 1869 return 0;
1688841f 1870 }
d4036145 1871
1688841f
LP
1872 r = readlink_malloc(where, &q);
1873 if (r >= 0 && streq_ptr(timezone_from_path(q), z))
1874 return 0; /* Already pointing to the right place? Then do nothing .. */
1875
1876 check = strjoina(dest, "/usr/share/zoneinfo/", z);
a5648b80 1877 r = chase_symlinks(check, dest, 0, NULL, NULL);
1688841f
LP
1878 if (r < 0)
1879 log_debug_errno(r, "Timezone %s does not exist (or is not accessible) in container, not creating symlink: %m", z);
1880 else {
1881 if (unlink(where) < 0 && errno != ENOENT) {
1882 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING, /* Don't complain on read-only images */
1883 errno, "Failed to remove existing timezone info %s in container, ignoring: %m", where);
1884 return 0;
1885 }
1886
1887 what = strjoina("../usr/share/zoneinfo/", z);
1888 if (symlink(what, where) < 0) {
1889 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING,
1890 errno, "Failed to correct timezone of container, ignoring: %m");
1891 return 0;
1892 }
1893
1894 break;
1895 }
1896
1897 _fallthrough_;
d4036145 1898 }
68fb0892 1899
1688841f
LP
1900 case TIMEZONE_BIND: {
1901 _cleanup_free_ char *resolved = NULL;
1902 int found;
1903
a5648b80 1904 found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved, NULL);
1688841f
LP
1905 if (found < 0) {
1906 log_warning_errno(found, "Failed to resolve /etc/localtime path in container, ignoring: %m");
1907 return 0;
1908 }
1909
1910 if (found == 0) /* missing? */
1911 (void) touch(resolved);
1912
1913 r = mount_verbose(LOG_WARNING, "/etc/localtime", resolved, NULL, MS_BIND, NULL);
1914 if (r >= 0)
1915 return mount_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
1916
1917 _fallthrough_;
79d80fc1 1918 }
4d9f07b4 1919
1688841f
LP
1920 case TIMEZONE_COPY:
1921 /* If mounting failed, try to copy */
8a016c74 1922 r = copy_file_atomic("/etc/localtime", where, 0644, 0, 0, COPY_REFLINK|COPY_REPLACE);
1688841f
LP
1923 if (r < 0) {
1924 log_full_errno(IN_SET(r, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
1925 "Failed to copy /etc/localtime to %s, ignoring: %m", where);
1926 return 0;
1927 }
1928
1929 break;
1930
1931 default:
1932 assert_not_reached("unexpected mode");
d4036145 1933 }
e58a1277 1934
1688841f 1935 /* Fix permissions of the symlink or file copy we just created */
03cfe0d5
LP
1936 r = userns_lchown(where, 0, 0);
1937 if (r < 0)
1688841f 1938 log_warning_errno(r, "Failed to chown /etc/localtime, ignoring: %m");
03cfe0d5 1939
e58a1277 1940 return 0;
88213476
LP
1941}
1942
09d423e9
LP
/* Checks for the existence of 'path'. Returns 1 if it exists, 0 if it does not (ENOENT), and a
 * negative errno (logged at debug level) if the check itself failed. */
static int have_resolv_conf(const char *path) {
        assert(path);

        if (access(path, F_OK) >= 0)
                return 1;

        if (errno == ENOENT)
                return 0;

        return log_debug_errno(errno, "Failed to determine whether '%s' is available: %m", path);
}
1955
7357272e 1956static int resolved_listening(void) {
b8ea7a6e 1957 _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
b053cd5f 1958 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
7357272e 1959 _cleanup_free_ char *dns_stub_listener_mode = NULL;
b053cd5f
LP
1960 int r;
1961
7357272e 1962 /* Check if resolved is listening */
b053cd5f
LP
1963
1964 r = sd_bus_open_system(&bus);
1965 if (r < 0)
b8ea7a6e 1966 return log_debug_errno(r, "Failed to open system bus: %m");
b053cd5f 1967
7357272e 1968 r = bus_name_has_owner(bus, "org.freedesktop.resolve1", NULL);
b8ea7a6e
LP
1969 if (r < 0)
1970 return log_debug_errno(r, "Failed to check whether the 'org.freedesktop.resolve1' bus name is taken: %m");
1971 if (r == 0)
1972 return 0;
7357272e
DM
1973
1974 r = sd_bus_get_property_string(bus,
1975 "org.freedesktop.resolve1",
1976 "/org/freedesktop/resolve1",
1977 "org.freedesktop.resolve1.Manager",
1978 "DNSStubListener",
b8ea7a6e 1979 &error,
7357272e
DM
1980 &dns_stub_listener_mode);
1981 if (r < 0)
b8ea7a6e 1982 return log_debug_errno(r, "Failed to query DNSStubListener property: %s", bus_error_message(&error, r));
7357272e
DM
1983
1984 return STR_IN_SET(dns_stub_listener_mode, "udp", "yes");
b053cd5f
LP
1985}
1986
2547bb41 1987static int setup_resolv_conf(const char *dest) {
09d423e9
LP
1988 _cleanup_free_ char *etc = NULL;
1989 const char *where, *what;
1990 ResolvConfMode m;
1991 int r;
2547bb41
LP
1992
1993 assert(dest);
1994
09d423e9
LP
1995 if (arg_resolv_conf == RESOLV_CONF_AUTO) {
1996 if (arg_private_network)
1997 m = RESOLV_CONF_OFF;
86775e35
LP
1998 else if (have_resolv_conf(PRIVATE_STUB_RESOLV_CONF) > 0 && resolved_listening() > 0)
1999 m = etc_writable() ? RESOLV_CONF_COPY_STUB : RESOLV_CONF_BIND_STUB;
09d423e9 2000 else if (have_resolv_conf("/etc/resolv.conf") > 0)
83205269 2001 m = etc_writable() ? RESOLV_CONF_COPY_HOST : RESOLV_CONF_BIND_HOST;
09d423e9 2002 else
83205269 2003 m = etc_writable() ? RESOLV_CONF_DELETE : RESOLV_CONF_OFF;
86775e35 2004
09d423e9
LP
2005 } else
2006 m = arg_resolv_conf;
2007
2008 if (m == RESOLV_CONF_OFF)
2547bb41
LP
2009 return 0;
2010
a5648b80 2011 r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
87447ae4
LP
2012 if (r < 0) {
2013 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
2014 return 0;
2015 }
2016
2017 where = strjoina(etc, "/resolv.conf");
09d423e9
LP
2018
2019 if (m == RESOLV_CONF_DELETE) {
2020 if (unlink(where) < 0)
2021 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
2022
87447ae4
LP
2023 return 0;
2024 }
79d80fc1 2025
86775e35
LP
2026 if (IN_SET(m, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_REPLACE_STATIC, RESOLV_CONF_COPY_STATIC))
2027 what = PRIVATE_STATIC_RESOLV_CONF;
2028 else if (IN_SET(m, RESOLV_CONF_BIND_UPLINK, RESOLV_CONF_REPLACE_UPLINK, RESOLV_CONF_COPY_UPLINK))
2029 what = PRIVATE_UPLINK_RESOLV_CONF;
2030 else if (IN_SET(m, RESOLV_CONF_BIND_STUB, RESOLV_CONF_REPLACE_STUB, RESOLV_CONF_COPY_STUB))
2031 what = PRIVATE_STUB_RESOLV_CONF;
09d423e9
LP
2032 else
2033 what = "/etc/resolv.conf";
87447ae4 2034
86775e35 2035 if (IN_SET(m, RESOLV_CONF_BIND_HOST, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_BIND_UPLINK, RESOLV_CONF_BIND_STUB)) {
09d423e9
LP
2036 _cleanup_free_ char *resolved = NULL;
2037 int found;
2038
a5648b80 2039 found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved, NULL);
09d423e9
LP
2040 if (found < 0) {
2041 log_warning_errno(found, "Failed to resolve /etc/resolv.conf path in container, ignoring: %m");
2042 return 0;
2043 }
3539724c 2044
87447ae4
LP
2045 if (found == 0) /* missing? */
2046 (void) touch(resolved);
5367354d 2047
09d423e9 2048 r = mount_verbose(LOG_WARNING, what, resolved, NULL, MS_BIND, NULL);
60e76d48 2049 if (r >= 0)
87447ae4 2050 return mount_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
86775e35
LP
2051
2052 /* If that didn't work, let's copy the file */
3539724c
LP
2053 }
2054
86775e35
LP
2055 if (IN_SET(m, RESOLV_CONF_REPLACE_HOST, RESOLV_CONF_REPLACE_STATIC, RESOLV_CONF_REPLACE_UPLINK, RESOLV_CONF_REPLACE_STUB))
2056 r = copy_file_atomic(what, where, 0644, 0, 0, COPY_REFLINK|COPY_REPLACE);
2057 else
2058 r = copy_file(what, where, O_TRUNC|O_NOFOLLOW, 0644, 0, 0, COPY_REFLINK);
79d80fc1 2059 if (r < 0) {
3539724c
LP
2060 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
2061 * resolved or something similar runs inside and the symlink points there.
68a313c5 2062 *
3539724c 2063 * If the disk image is read-only, there's also no point in complaining.
68a313c5 2064 */
86775e35
LP
2065 log_full_errno(!IN_SET(RESOLV_CONF_COPY_HOST, RESOLV_CONF_COPY_STATIC, RESOLV_CONF_COPY_UPLINK, RESOLV_CONF_COPY_STUB) &&
2066 IN_SET(r, -ELOOP, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
3539724c 2067 "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where);
79d80fc1
TG
2068 return 0;
2069 }
2547bb41 2070
03cfe0d5
LP
2071 r = userns_lchown(where, 0, 0);
2072 if (r < 0)
3539724c 2073 log_warning_errno(r, "Failed to chown /etc/resolv.conf, ignoring: %m");
03cfe0d5 2074
2547bb41
LP
2075 return 0;
2076}
2077
1e4f1671 2078static int setup_boot_id(void) {
cdde6ba6
LP
2079 _cleanup_(unlink_and_freep) char *from = NULL;
2080 _cleanup_free_ char *path = NULL;
3bbaff3e 2081 sd_id128_t rnd = SD_ID128_NULL;
cdde6ba6 2082 const char *to;
04bc4a3f
LP
2083 int r;
2084
1eacc470 2085 /* Generate a new randomized boot ID, so that each boot-up of the container gets a new one */
04bc4a3f 2086
1eacc470 2087 r = tempfn_random_child("/run", "proc-sys-kernel-random-boot-id", &path);
cdde6ba6
LP
2088 if (r < 0)
2089 return log_error_errno(r, "Failed to generate random boot ID path: %m");
04bc4a3f
LP
2090
2091 r = sd_id128_randomize(&rnd);
f647962d
MS
2092 if (r < 0)
2093 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 2094
cdde6ba6 2095 r = id128_write(path, ID128_UUID, rnd, false);
f647962d
MS
2096 if (r < 0)
2097 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f 2098
cdde6ba6
LP
2099 from = TAKE_PTR(path);
2100 to = "/proc/sys/kernel/random/boot_id";
2101
60e76d48 2102 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
cdde6ba6
LP
2103 if (r < 0)
2104 return r;
04bc4a3f 2105
cdde6ba6 2106 return mount_verbose(LOG_ERR, NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
04bc4a3f
LP
2107}
2108
e58a1277 2109static int copy_devnodes(const char *dest) {
88213476
LP
2110 static const char devnodes[] =
2111 "null\0"
2112 "zero\0"
2113 "full\0"
2114 "random\0"
2115 "urandom\0"
85614d66
TG
2116 "tty\0"
2117 "net/tun\0";
88213476 2118
de40a303 2119 _cleanup_umask_ mode_t u;
88213476 2120 const char *d;
e58a1277 2121 int r = 0;
a258bf26
LP
2122
2123 assert(dest);
124640f1
LP
2124
2125 u = umask(0000);
88213476 2126
03cfe0d5
LP
2127 /* Create /dev/net, so that we can create /dev/net/tun in it */
2128 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
2129 return log_error_errno(r, "Failed to create /dev/net directory: %m");
2130
88213476 2131 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 2132 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 2133 struct stat st;
88213476 2134
c6134d3e 2135 from = path_join("/dev/", d);
8967f291
LP
2136 if (!from)
2137 return log_oom();
2138
c6134d3e 2139 to = path_join(dest, from);
8967f291
LP
2140 if (!to)
2141 return log_oom();
88213476
LP
2142
2143 if (stat(from, &st) < 0) {
2144
4a62c710
MS
2145 if (errno != ENOENT)
2146 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 2147
baaa35ad
ZJS
2148 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode))
2149 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2150 "%s is not a char or block device, cannot copy.", from);
2151 else {
8dfce114
LP
2152 _cleanup_free_ char *sl = NULL, *prefixed = NULL, *dn = NULL, *t = NULL;
2153
81f5049b 2154 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
8dbf71ec 2155 /* Explicitly warn the user when /dev is already populated. */
41eb4362 2156 if (errno == EEXIST)
8dbf71ec 2157 log_notice("%s/dev is pre-mounted and pre-populated. If a pre-mounted /dev is provided it needs to be an unpopulated file system.", dest);
81f5049b
AC
2158 if (errno != EPERM)
2159 return log_error_errno(errno, "mknod(%s) failed: %m", to);
2160
8dfce114 2161 /* Some systems abusively restrict mknod but allow bind mounts. */
81f5049b
AC
2162 r = touch(to);
2163 if (r < 0)
2164 return log_error_errno(r, "touch (%s) failed: %m", to);
60e76d48
ZJS
2165 r = mount_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL);
2166 if (r < 0)
2167 return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to);
81f5049b 2168 }
6278cf60 2169
03cfe0d5
LP
2170 r = userns_lchown(to, 0, 0);
2171 if (r < 0)
2172 return log_error_errno(r, "chown() of device node %s failed: %m", to);
8dfce114 2173
657ee2d8 2174 dn = path_join("/dev", S_ISCHR(st.st_mode) ? "char" : "block");
8dfce114
LP
2175 if (!dn)
2176 return log_oom();
2177
2178 r = userns_mkdir(dest, dn, 0755, 0, 0);
2179 if (r < 0)
2180 return log_error_errno(r, "Failed to create '%s': %m", dn);
2181
2182 if (asprintf(&sl, "%s/%u:%u", dn, major(st.st_rdev), minor(st.st_rdev)) < 0)
2183 return log_oom();
2184
c6134d3e 2185 prefixed = path_join(dest, sl);
8dfce114
LP
2186 if (!prefixed)
2187 return log_oom();
2188
2d9b74ba 2189 t = path_join("..", d);
8dfce114
LP
2190 if (!t)
2191 return log_oom();
2192
2193 if (symlink(t, prefixed) < 0)
2194 log_debug_errno(errno, "Failed to symlink '%s' to '%s': %m", t, prefixed);
88213476 2195 }
88213476
LP
2196 }
2197
e58a1277
LP
2198 return r;
2199}
88213476 2200
de40a303
LP
2201static int make_extra_nodes(const char *dest) {
2202 _cleanup_umask_ mode_t u;
2203 size_t i;
2204 int r;
2205
2206 u = umask(0000);
2207
2208 for (i = 0; i < arg_n_extra_nodes; i++) {
2209 _cleanup_free_ char *path = NULL;
2210 DeviceNode *n = arg_extra_nodes + i;
2211
c6134d3e 2212 path = path_join(dest, n->path);
de40a303
LP
2213 if (!path)
2214 return log_oom();
2215
2216 if (mknod(path, n->mode, S_ISCHR(n->mode) || S_ISBLK(n->mode) ? makedev(n->major, n->minor) : 0) < 0)
2217 return log_error_errno(errno, "Failed to create device node '%s': %m", path);
2218
2219 r = chmod_and_chown(path, n->mode, n->uid, n->gid);
2220 if (r < 0)
2221 return log_error_errno(r, "Failed to adjust device node ownership of '%s': %m", path);
2222 }
2223
2224 return 0;
2225}
2226
03cfe0d5
LP
2227static int setup_pts(const char *dest) {
2228 _cleanup_free_ char *options = NULL;
2229 const char *p;
709f6e46 2230 int r;
03cfe0d5 2231
349cc4a5 2232#if HAVE_SELINUX
03cfe0d5
LP
2233 if (arg_selinux_apifs_context)
2234 (void) asprintf(&options,
3dce8915 2235 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
03cfe0d5
LP
2236 arg_uid_shift + TTY_GID,
2237 arg_selinux_apifs_context);
2238 else
2239#endif
2240 (void) asprintf(&options,
3dce8915 2241 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
03cfe0d5 2242 arg_uid_shift + TTY_GID);
f2d88580 2243
03cfe0d5 2244 if (!options)
f2d88580
LP
2245 return log_oom();
2246
03cfe0d5 2247 /* Mount /dev/pts itself */
cc9fce65 2248 p = prefix_roota(dest, "/dev/pts");
dae8b82e
ZJS
2249 r = mkdir_errno_wrapper(p, 0755);
2250 if (r < 0)
2251 return log_error_errno(r, "Failed to create /dev/pts: %m");
2252
60e76d48
ZJS
2253 r = mount_verbose(LOG_ERR, "devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options);
2254 if (r < 0)
2255 return r;
709f6e46
MS
2256 r = userns_lchown(p, 0, 0);
2257 if (r < 0)
2258 return log_error_errno(r, "Failed to chown /dev/pts: %m");
03cfe0d5
LP
2259
2260 /* Create /dev/ptmx symlink */
2261 p = prefix_roota(dest, "/dev/ptmx");
4a62c710
MS
2262 if (symlink("pts/ptmx", p) < 0)
2263 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
709f6e46
MS
2264 r = userns_lchown(p, 0, 0);
2265 if (r < 0)
2266 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
f2d88580 2267
03cfe0d5
LP
2268 /* And fix /dev/pts/ptmx ownership */
2269 p = prefix_roota(dest, "/dev/pts/ptmx");
709f6e46
MS
2270 r = userns_lchown(p, 0, 0);
2271 if (r < 0)
2272 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
6278cf60 2273
f2d88580
LP
2274 return 0;
2275}
2276
3acc84eb 2277static int setup_stdio_as_dev_console(void) {
2fef50cd 2278 _cleanup_close_ int terminal = -1;
e58a1277 2279 int r;
e58a1277 2280
3acc84eb
FB
2281 terminal = open_terminal("/dev/console", O_RDWR);
2282 if (terminal < 0)
2283 return log_error_errno(terminal, "Failed to open console: %m");
e58a1277 2284
3acc84eb
FB
2285 /* Make sure we can continue logging to the original stderr, even if
2286 * stderr points elsewhere now */
2287 r = log_dup_console();
2288 if (r < 0)
2289 return log_error_errno(r, "Failed to duplicate stderr: %m");
de40a303 2290
3acc84eb
FB
2291 /* invalidates 'terminal' on success and failure */
2292 r = rearrange_stdio(terminal, terminal, terminal);
2fef50cd 2293 TAKE_FD(terminal);
f647962d 2294 if (r < 0)
3acc84eb
FB
2295 return log_error_errno(r, "Failed to move console to stdin/stdout/stderr: %m");
2296
2297 return 0;
2298}
88213476 2299
3acc84eb
FB
2300static int setup_dev_console(const char *console) {
2301 _cleanup_free_ char *p = NULL;
2302 int r;
a258bf26 2303
3acc84eb
FB
2304 /* Create /dev/console symlink */
2305 r = path_make_relative("/dev", console, &p);
81f5049b 2306 if (r < 0)
3acc84eb
FB
2307 return log_error_errno(r, "Failed to create relative path: %m");
2308
2309 if (symlink(p, "/dev/console") < 0)
2310 return log_error_errno(errno, "Failed to create /dev/console symlink: %m");
a258bf26 2311
3acc84eb 2312 return 0;
e58a1277
LP
2313}
2314
8e5430c4
LP
2315static int setup_keyring(void) {
2316 key_serial_t keyring;
2317
6b000af4
LP
2318 /* Allocate a new session keyring for the container. This makes sure the keyring of the session
2319 * systemd-nspawn was invoked from doesn't leak into the container. Note that by default we block
2320 * keyctl() and request_key() anyway via seccomp so doing this operation isn't strictly necessary,
2321 * but in case people explicitly allow-list these system calls let's make sure we don't leak anything
2322 * into the container. */
8e5430c4
LP
2323
2324 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2325 if (keyring == -1) {
2326 if (errno == ENOSYS)
2327 log_debug_errno(errno, "Kernel keyring not supported, ignoring.");
2328 else if (IN_SET(errno, EACCES, EPERM))
2329 log_debug_errno(errno, "Kernel keyring access prohibited, ignoring.");
2330 else
2331 return log_error_errno(errno, "Setting up kernel keyring failed: %m");
2332 }
2333
2334 return 0;
2335}
2336
3652872a
LP
2337static int setup_credentials(const char *root) {
2338 const char *q;
2339 int r;
2340
2341 if (arg_n_credentials <= 0)
2342 return 0;
2343
2344 r = userns_mkdir(root, "/run/host", 0755, 0, 0);
2345 if (r < 0)
2346 return log_error_errno(r, "Failed to create /run/host: %m");
2347
2348 r = userns_mkdir(root, "/run/host/credentials", 0700, 0, 0);
2349 if (r < 0)
2350 return log_error_errno(r, "Failed to create /run/host/credentials: %m");
2351
2352 q = prefix_roota(root, "/run/host/credentials");
2353 r = mount_verbose(LOG_ERR, NULL, q, "ramfs", MS_NOSUID|MS_NOEXEC|MS_NODEV, "mode=0700");
2354 if (r < 0)
2355 return r;
2356
2357 for (size_t i = 0; i < arg_n_credentials; i++) {
2358 _cleanup_free_ char *j = NULL;
2359 _cleanup_close_ int fd = -1;
2360
2361 j = path_join(q, arg_credentials[i].id);
2362 if (!j)
2363 return log_oom();
2364
2365 fd = open(j, O_CREAT|O_EXCL|O_WRONLY|O_CLOEXEC|O_NOFOLLOW, 0600);
2366 if (fd < 0)
2367 return log_error_errno(errno, "Failed to create credential file %s: %m", j);
2368
2369 r = loop_write(fd, arg_credentials[i].data, arg_credentials[i].size, /* do_poll= */ false);
2370 if (r < 0)
2371 return log_error_errno(r, "Failed to write credential to file %s: %m", j);
2372
2373 if (fchmod(fd, 0400) < 0)
2374 return log_error_errno(errno, "Failed to adjust access mode of %s: %m", j);
2375
2376 if (arg_userns_mode != USER_NAMESPACE_NO) {
2377 if (fchown(fd, arg_uid_shift, arg_uid_shift) < 0)
2378 return log_error_errno(errno, "Failed to adjust ownership of %s: %m", j);
2379 }
2380 }
2381
2382 if (chmod(q, 0500) < 0)
2383 return log_error_errno(errno, "Failed to adjust access mode of %s: %m", q);
2384
2385 r = userns_lchown(q, 0, 0);
2386 if (r < 0)
2387 return r;
2388
2389 /* Make both mount and superblock read-only now */
2390 r = mount_verbose(LOG_ERR, NULL, q, NULL, MS_REMOUNT|MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
2391 if (r < 0)
2392 return r;
2393
2394 return mount_verbose(LOG_ERR, NULL, q, NULL, MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, "mode=0500");
2395}
2396
1e4f1671 2397static int setup_kmsg(int kmsg_socket) {
9ec5a93c
LP
2398 _cleanup_(unlink_and_freep) char *from = NULL;
2399 _cleanup_free_ char *fifo = NULL;
2400 _cleanup_close_ int fd = -1;
7fd1b19b 2401 _cleanup_umask_ mode_t u;
9ec5a93c 2402 int r;
e58a1277 2403
e58a1277 2404 assert(kmsg_socket >= 0);
a258bf26 2405
e58a1277 2406 u = umask(0000);
a258bf26 2407
1eacc470 2408 /* We create the kmsg FIFO as as temporary file in /run, but immediately delete it after bind mounting it to
9ec5a93c
LP
2409 * /proc/kmsg. While FIFOs on the reading side behave very similar to /proc/kmsg, their writing side behaves
2410 * differently from /dev/kmsg in that writing blocks when nothing is reading. In order to avoid any problems
2411 * with containers deadlocking due to this we simply make /dev/kmsg unavailable to the container. */
2412
1eacc470 2413 r = tempfn_random_child("/run", "proc-kmsg", &fifo);
9ec5a93c
LP
2414 if (r < 0)
2415 return log_error_errno(r, "Failed to generate kmsg path: %m");
e58a1277 2416
9ec5a93c 2417 if (mkfifo(fifo, 0600) < 0)
03cfe0d5 2418 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
9ec5a93c
LP
2419
2420 from = TAKE_PTR(fifo);
9ec5a93c 2421
1eacc470 2422 r = mount_verbose(LOG_ERR, from, "/proc/kmsg", NULL, MS_BIND, NULL);
60e76d48
ZJS
2423 if (r < 0)
2424 return r;
e58a1277 2425
669fc4e5 2426 fd = open(from, O_RDWR|O_NONBLOCK|O_CLOEXEC);
4a62c710
MS
2427 if (fd < 0)
2428 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 2429
9ec5a93c 2430 /* Store away the fd in the socket, so that it stays open as long as we run the child */
3ee897d6 2431 r = send_one_fd(kmsg_socket, fd, 0);
d9603714
DH
2432 if (r < 0)
2433 return log_error_errno(r, "Failed to send FIFO fd: %m");
a258bf26 2434
25ea79fe 2435 return 0;
88213476
LP
2436}
2437
1c4baffc 2438static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
6d0b55c2
LP
2439 union in_addr_union *exposed = userdata;
2440
2441 assert(rtnl);
2442 assert(m);
2443 assert(exposed);
2444
7a8f6325 2445 expose_port_execute(rtnl, arg_expose_ports, exposed);
6d0b55c2
LP
2446 return 0;
2447}
2448
3a74cea5 2449static int setup_hostname(void) {
c818eef1 2450 int r;
3a74cea5 2451
0c582db0 2452 if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
eb91eb18
LP
2453 return 0;
2454
c818eef1
LP
2455 r = sethostname_idempotent(arg_hostname ?: arg_machine);
2456 if (r < 0)
2457 return log_error_errno(r, "Failed to set hostname: %m");
3a74cea5 2458
7027ff61 2459 return 0;
3a74cea5
LP
2460}
2461
57fb9fb5 2462static int setup_journal(const char *directory) {
0f5e1382 2463 _cleanup_free_ char *d = NULL;
5905d7cf 2464 char id[SD_ID128_STRING_MAX];
b2238e38
LP
2465 const char *dirname, *p, *q;
2466 sd_id128_t this_id;
8054d749 2467 bool try;
57fb9fb5
LP
2468 int r;
2469
df9a75e4
LP
2470 /* Don't link journals in ephemeral mode */
2471 if (arg_ephemeral)
2472 return 0;
2473
8054d749
LP
2474 if (arg_link_journal == LINK_NO)
2475 return 0;
2476
2477 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
2478
4d680aee 2479 r = sd_id128_get_machine(&this_id);
f647962d
MS
2480 if (r < 0)
2481 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee 2482
e01ff70a 2483 if (sd_id128_equal(arg_uuid, this_id)) {
8054d749 2484 log_full(try ? LOG_WARNING : LOG_ERR,
e192a281 2485 "Host and machine ids are equal (%s): refusing to link journals", sd_id128_to_string(arg_uuid, id));
8054d749 2486 if (try)
4d680aee 2487 return 0;
df9a75e4 2488 return -EEXIST;
4d680aee
ZJS
2489 }
2490
369ca6da
ZJS
2491 FOREACH_STRING(dirname, "/var", "/var/log", "/var/log/journal") {
2492 r = userns_mkdir(directory, dirname, 0755, 0, 0);
2493 if (r < 0) {
2494 bool ignore = r == -EROFS && try;
2495 log_full_errno(ignore ? LOG_DEBUG : LOG_ERR, r,
2496 "Failed to create %s%s: %m", dirname, ignore ? ", ignoring" : "");
2497 return ignore ? 0 : r;
2498 }
2499 }
03cfe0d5 2500
e01ff70a
MS
2501 (void) sd_id128_to_string(arg_uuid, id);
2502
03cfe0d5
LP
2503 p = strjoina("/var/log/journal/", id);
2504 q = prefix_roota(directory, p);
27407a01 2505
e1873695 2506 if (path_is_mount_point(p, NULL, 0) > 0) {
8054d749
LP
2507 if (try)
2508 return 0;
27407a01 2509
baaa35ad
ZJS
2510 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2511 "%s: already a mount point, refusing to use for journal", p);
57fb9fb5
LP
2512 }
2513
e1873695 2514 if (path_is_mount_point(q, NULL, 0) > 0) {
8054d749
LP
2515 if (try)
2516 return 0;
57fb9fb5 2517
baaa35ad
ZJS
2518 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2519 "%s: already a mount point, refusing to use for journal", q);
57fb9fb5
LP
2520 }
2521
2522 r = readlink_and_make_absolute(p, &d);
2523 if (r >= 0) {
3742095b 2524 if (IN_SET(arg_link_journal, LINK_GUEST, LINK_AUTO) &&
57fb9fb5
LP
2525 path_equal(d, q)) {
2526
03cfe0d5 2527 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 2528 if (r < 0)
709f6e46 2529 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 2530 return 0;
57fb9fb5
LP
2531 }
2532
4a62c710
MS
2533 if (unlink(p) < 0)
2534 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
2535 } else if (r == -EINVAL) {
2536
2537 if (arg_link_journal == LINK_GUEST &&
2538 rmdir(p) < 0) {
2539
27407a01
ZJS
2540 if (errno == ENOTDIR) {
2541 log_error("%s already exists and is neither a symlink nor a directory", p);
2542 return r;
4314d33f
MS
2543 } else
2544 return log_error_errno(errno, "Failed to remove %s: %m", p);
57fb9fb5 2545 }
4314d33f
MS
2546 } else if (r != -ENOENT)
2547 return log_error_errno(r, "readlink(%s) failed: %m", p);
57fb9fb5
LP
2548
2549 if (arg_link_journal == LINK_GUEST) {
2550
2551 if (symlink(q, p) < 0) {
8054d749 2552 if (try) {
56f64d95 2553 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90 2554 return 0;
4314d33f
MS
2555 } else
2556 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
57fb9fb5
LP
2557 }
2558
03cfe0d5 2559 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 2560 if (r < 0)
709f6e46 2561 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 2562 return 0;
57fb9fb5
LP
2563 }
2564
2565 if (arg_link_journal == LINK_HOST) {
ccddd104 2566 /* don't create parents here — if the host doesn't have
574edc90 2567 * permanent journal set up, don't force it here */
ba8e6c4d 2568
dae8b82e
ZJS
2569 r = mkdir_errno_wrapper(p, 0755);
2570 if (r < 0 && r != -EEXIST) {
8054d749 2571 if (try) {
dae8b82e 2572 log_debug_errno(r, "Failed to create %s, skipping journal setup: %m", p);
574edc90 2573 return 0;
4314d33f 2574 } else
dae8b82e 2575 return log_error_errno(r, "Failed to create %s: %m", p);
57fb9fb5
LP
2576 }
2577
27407a01
ZJS
2578 } else if (access(p, F_OK) < 0)
2579 return 0;
57fb9fb5 2580
cdb2b9d0
LP
2581 if (dir_is_empty(q) == 0)
2582 log_warning("%s is not empty, proceeding anyway.", q);
2583
03cfe0d5 2584 r = userns_mkdir(directory, p, 0755, 0, 0);
709f6e46
MS
2585 if (r < 0)
2586 return log_error_errno(r, "Failed to create %s: %m", q);
57fb9fb5 2587
60e76d48
ZJS
2588 r = mount_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL);
2589 if (r < 0)
4a62c710 2590 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 2591
27407a01 2592 return 0;
57fb9fb5
LP
2593}
2594
de40a303
LP
2595static int drop_capabilities(uid_t uid) {
2596 CapabilityQuintet q;
2597
2598 /* Let's initialize all five capability sets to something valid. If the quintet was configured via
2599 * OCI use that, but fill in missing bits. If it wasn't then derive the quintet in full from
2600 * arg_caps_retain. */
2601
2602 if (capability_quintet_is_set(&arg_full_capabilities)) {
2603 q = arg_full_capabilities;
2604
2605 if (q.bounding == (uint64_t) -1)
2606 q.bounding = uid == 0 ? arg_caps_retain : 0;
2607
2608 if (q.effective == (uint64_t) -1)
2609 q.effective = uid == 0 ? q.bounding : 0;
2610
2611 if (q.inheritable == (uint64_t) -1)
2612 q.inheritable = uid == 0 ? q.bounding : 0;
2613
2614 if (q.permitted == (uint64_t) -1)
2615 q.permitted = uid == 0 ? q.bounding : 0;
2616
2617 if (q.ambient == (uint64_t) -1 && ambient_capabilities_supported())
2618 q.ambient = 0;
f66ad460
AZ
2619
2620 if (capability_quintet_mangle(&q))
2621 return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Cannot set capabilities that are not in the current bounding set.");
2622
2623 } else {
de40a303
LP
2624 q = (CapabilityQuintet) {
2625 .bounding = arg_caps_retain,
2626 .effective = uid == 0 ? arg_caps_retain : 0,
2627 .inheritable = uid == 0 ? arg_caps_retain : 0,
2628 .permitted = uid == 0 ? arg_caps_retain : 0,
2629 .ambient = ambient_capabilities_supported() ? 0 : (uint64_t) -1,
2630 };
2631
f66ad460
AZ
2632 /* If we're not using OCI, proceed with mangled capabilities (so we don't error out)
2633 * in order to maintain the same behavior as systemd < 242. */
2634 if (capability_quintet_mangle(&q))
0ccdaa79
JT
2635 log_full(arg_quiet ? LOG_DEBUG : LOG_WARNING,
2636 "Some capabilities will not be set because they are not in the current bounding set.");
f66ad460
AZ
2637
2638 }
2639
de40a303 2640 return capability_quintet_enforce(&q);
88213476
LP
2641}
2642
db999e0f
LP
2643static int reset_audit_loginuid(void) {
2644 _cleanup_free_ char *p = NULL;
2645 int r;
2646
0c582db0 2647 if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
db999e0f
LP
2648 return 0;
2649
2650 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 2651 if (r == -ENOENT)
db999e0f 2652 return 0;
f647962d
MS
2653 if (r < 0)
2654 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
2655
2656 /* Already reset? */
2657 if (streq(p, "4294967295"))
2658 return 0;
2659
57512c89 2660 r = write_string_file("/proc/self/loginuid", "4294967295", WRITE_STRING_FILE_DISABLE_BUFFER);
db999e0f 2661 if (r < 0) {
10a87006
LP
2662 log_error_errno(r,
2663 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2664 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2665 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2666 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2667 "using systemd-nspawn. Sleeping for 5s... (%m)");
77b6e194 2668
db999e0f 2669 sleep(5);
77b6e194 2670 }
db999e0f
LP
2671
2672 return 0;
77b6e194
LP
2673}
2674
785890ac
LP
2675static int setup_propagate(const char *root) {
2676 const char *p, *q;
709f6e46 2677 int r;
785890ac
LP
2678
2679 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2680 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
63c372cb 2681 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
2682 (void) mkdir_p(p, 0600);
2683
5a27b395 2684 r = userns_mkdir(root, "/run/host", 0755, 0, 0);
709f6e46 2685 if (r < 0)
5a27b395 2686 return log_error_errno(r, "Failed to create /run/host: %m");
03cfe0d5 2687
5a27b395 2688 r = userns_mkdir(root, "/run/host/incoming", 0600, 0, 0);
709f6e46 2689 if (r < 0)
5a27b395 2690 return log_error_errno(r, "Failed to create /run/host/incoming: %m");
03cfe0d5 2691
5a27b395 2692 q = prefix_roota(root, "/run/host/incoming");
60e76d48
ZJS
2693 r = mount_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
2694 if (r < 0)
2695 return r;
785890ac 2696
60e76d48
ZJS
2697 r = mount_verbose(LOG_ERR, NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
2698 if (r < 0)
2699 return r;
785890ac 2700
5a27b395 2701 /* machined will MS_MOVE into that directory, and that's only supported for non-shared mounts. */
60e76d48 2702 return mount_verbose(LOG_ERR, NULL, q, NULL, MS_SLAVE, NULL);
785890ac
LP
2703}
2704
317feb4d 2705static int setup_machine_id(const char *directory) {
691675ba
LP
2706 const char *etc_machine_id;
2707 sd_id128_t id;
3bbaff3e 2708 int r;
e01ff70a 2709
317feb4d
LP
2710 /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
2711 * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
2712 * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
2713 * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
2714 * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
2715 * container behaves nicely). */
2716
e01ff70a
MS
2717 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
2718
691675ba 2719 r = id128_read(etc_machine_id, ID128_PLAIN, &id);
317feb4d
LP
2720 if (r < 0) {
2721 if (!IN_SET(r, -ENOENT, -ENOMEDIUM)) /* If the file is missing or empty, we don't mind */
2722 return log_error_errno(r, "Failed to read machine ID from container image: %m");
691675ba 2723
317feb4d
LP
2724 if (sd_id128_is_null(arg_uuid)) {
2725 r = sd_id128_randomize(&arg_uuid);
2726 if (r < 0)
2727 return log_error_errno(r, "Failed to acquire randomized machine UUID: %m");
2728 }
2729 } else {
baaa35ad
ZJS
2730 if (sd_id128_is_null(id))
2731 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2732 "Machine ID in container image is zero, refusing.");
e01ff70a 2733
317feb4d
LP
2734 arg_uuid = id;
2735 }
691675ba 2736
e01ff70a
MS
2737 return 0;
2738}
2739
7336138e
LP
2740static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
2741 int r;
2742
2743 assert(directory);
2744
0de7acce 2745 if (arg_userns_mode == USER_NAMESPACE_NO || !arg_userns_chown)
7336138e
LP
2746 return 0;
2747
2748 r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
2749 if (r == -EOPNOTSUPP)
2750 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
2751 if (r == -EBADE)
2752 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
2753 if (r < 0)
2754 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
2755 if (r == 0)
2756 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
2757 else
2758 log_debug("Patched directory tree to match UID/GID range.");
2759
2760 return r;
2761}
2762
113cea80 2763/*
6d416b9c
LS
2764 * Return values:
2765 * < 0 : wait_for_terminate() failed to get the state of the
2766 * container, the container was terminated by a signal, or
2767 * failed for an unknown reason. No change is made to the
2768 * container argument.
2769 * > 0 : The program executed in the container terminated with an
2770 * error. The exit code of the program executed in the
919699ec
LP
2771 * container is returned. The container argument has been set
2772 * to CONTAINER_TERMINATED.
6d416b9c
LS
2773 * 0 : The container is being rebooted, has been shut down or exited
2774 * successfully. The container argument has been set to either
2775 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 2776 *
6d416b9c
LS
2777 * That is, success is indicated by a return value of zero, and an
2778 * error is indicated by a non-zero value.
113cea80
DH
2779 */
2780static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 2781 siginfo_t status;
919699ec 2782 int r;
113cea80
DH
2783
2784 r = wait_for_terminate(pid, &status);
f647962d
MS
2785 if (r < 0)
2786 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
2787
2788 switch (status.si_code) {
fddbb89c 2789
113cea80 2790 case CLD_EXITED:
b5a2179b 2791 if (status.si_status == 0)
919699ec 2792 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
b5a2179b 2793 else
919699ec 2794 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 2795
919699ec
LP
2796 *container = CONTAINER_TERMINATED;
2797 return status.si_status;
113cea80
DH
2798
2799 case CLD_KILLED:
2800 if (status.si_status == SIGINT) {
919699ec 2801 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 2802 *container = CONTAINER_TERMINATED;
919699ec
LP
2803 return 0;
2804
113cea80 2805 } else if (status.si_status == SIGHUP) {
919699ec 2806 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 2807 *container = CONTAINER_REBOOTED;
919699ec 2808 return 0;
113cea80 2809 }
919699ec 2810
4831981d 2811 _fallthrough_;
113cea80 2812 case CLD_DUMPED:
baaa35ad
ZJS
2813 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2814 "Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
113cea80
DH
2815
2816 default:
baaa35ad
ZJS
2817 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2818 "Container %s failed due to unknown reason.", arg_machine);
113cea80 2819 }
113cea80
DH
2820}
2821
023fb90b
LP
2822static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2823 pid_t pid;
2824
4a0b58c4 2825 pid = PTR_TO_PID(userdata);
023fb90b 2826 if (pid > 0) {
c6c8f6e2 2827 if (kill(pid, arg_kill_signal) >= 0) {
023fb90b
LP
2828 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2829 sd_event_source_set_userdata(s, NULL);
2830 return 0;
2831 }
2832 }
2833
2834 sd_event_exit(sd_event_source_get_event(s), 0);
2835 return 0;
2836}
2837
6916b164 2838static int on_sigchld(sd_event_source *s, const struct signalfd_siginfo *ssi, void *userdata) {
abdb9b08
LP
2839 pid_t pid;
2840
2841 assert(s);
2842 assert(ssi);
2843
2844 pid = PTR_TO_PID(userdata);
2845
6916b164
AU
2846 for (;;) {
2847 siginfo_t si = {};
abdb9b08 2848
6916b164
AU
2849 if (waitid(P_ALL, 0, &si, WNOHANG|WNOWAIT|WEXITED) < 0)
2850 return log_error_errno(errno, "Failed to waitid(): %m");
2851 if (si.si_pid == 0) /* No pending children. */
2852 break;
abdb9b08 2853 if (si.si_pid == pid) {
6916b164
AU
2854 /* The main process we care for has exited. Return from
2855 * signal handler but leave the zombie. */
2856 sd_event_exit(sd_event_source_get_event(s), 0);
2857 break;
2858 }
abdb9b08 2859
6916b164
AU
2860 /* Reap all other children. */
2861 (void) waitid(P_PID, si.si_pid, &si, WNOHANG|WEXITED);
2862 }
2863
2864 return 0;
2865}
2866
abdb9b08
LP
2867static int on_request_stop(sd_bus_message *m, void *userdata, sd_bus_error *error) {
2868 pid_t pid;
2869
2870 assert(m);
2871
2872 pid = PTR_TO_PID(userdata);
2873
2874 if (arg_kill_signal > 0) {
2875 log_info("Container termination requested. Attempting to halt container.");
2876 (void) kill(pid, arg_kill_signal);
2877 } else {
2878 log_info("Container termination requested. Exiting.");
2879 sd_event_exit(sd_bus_get_event(sd_bus_message_get_bus(m)), 0);
2880 }
2881
2882 return 0;
2883}
2884
ec16945e 2885static int determine_names(void) {
1b9cebf6 2886 int r;
ec16945e 2887
c1521918
LP
2888 if (arg_template && !arg_directory && arg_machine) {
2889
2890 /* If --template= was specified then we should not
2891 * search for a machine, but instead create a new one
2892 * in /var/lib/machine. */
2893
657ee2d8 2894 arg_directory = path_join("/var/lib/machines", arg_machine);
c1521918
LP
2895 if (!arg_directory)
2896 return log_oom();
2897 }
2898
ec16945e 2899 if (!arg_image && !arg_directory) {
1b9cebf6
LP
2900 if (arg_machine) {
2901 _cleanup_(image_unrefp) Image *i = NULL;
2902
5ef46e5f 2903 r = image_find(IMAGE_MACHINE, arg_machine, &i);
3a6ce860
LP
2904 if (r == -ENOENT)
2905 return log_error_errno(r, "No image for machine '%s'.", arg_machine);
1b9cebf6
LP
2906 if (r < 0)
2907 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
1b9cebf6 2908
eb38edce 2909 if (IN_SET(i->type, IMAGE_RAW, IMAGE_BLOCK))
0f03c2a4 2910 r = free_and_strdup(&arg_image, i->path);
1b9cebf6 2911 else
0f03c2a4 2912 r = free_and_strdup(&arg_directory, i->path);
1b9cebf6 2913 if (r < 0)
0f3be6ca 2914 return log_oom();
1b9cebf6 2915
aee327b8
LP
2916 if (!arg_ephemeral)
2917 arg_read_only = arg_read_only || i->read_only;
d7249575
LP
2918 } else {
2919 r = safe_getcwd(&arg_directory);
2920 if (r < 0)
2921 return log_error_errno(r, "Failed to determine current directory: %m");
2922 }
ec16945e 2923
c6147113
LP
2924 if (!arg_directory && !arg_image)
2925 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine path, please use -D or -i.");
ec16945e
LP
2926 }
2927
2928 if (!arg_machine) {
b9ba4dab
LP
2929 if (arg_directory && path_equal(arg_directory, "/"))
2930 arg_machine = gethostname_malloc();
4827ab48
LP
2931 else {
2932 if (arg_image) {
2933 char *e;
2934
2935 arg_machine = strdup(basename(arg_image));
2936
2937 /* Truncate suffix if there is one */
2938 e = endswith(arg_machine, ".raw");
2939 if (e)
2940 *e = 0;
2941 } else
2942 arg_machine = strdup(basename(arg_directory));
2943 }
ec16945e
LP
2944 if (!arg_machine)
2945 return log_oom();
2946
ae691c1d 2947 hostname_cleanup(arg_machine);
c6147113
LP
2948 if (!machine_name_is_valid(arg_machine))
2949 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine machine name automatically, please use -M.");
b9ba4dab
LP
2950
2951 if (arg_ephemeral) {
2952 char *b;
2953
2954 /* Add a random suffix when this is an
2955 * ephemeral machine, so that we can run many
2956 * instances at once without manually having
2957 * to specify -M each time. */
2958
2959 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2960 return log_oom();
2961
2962 free(arg_machine);
2963 arg_machine = b;
2964 }
ec16945e
LP
2965 }
2966
2967 return 0;
2968}
2969
/* Resolves all symlinks in the path stored at *p (if any) and replaces *p with the resolved path,
 * freeing the old string.
 *
 * A NULL *p is left alone. 'flags' is passed to chase_symlinks() unmodified.
 * Returns 0 on success, negative errno if the path cannot be resolved (in which case *p is kept). */
static int chase_symlinks_and_update(char **p, unsigned flags) {
        char *resolved;
        int r;

        assert(p);

        if (*p == NULL)
                return 0; /* nothing configured, nothing to resolve */

        r = chase_symlinks(*p, NULL, flags, &resolved, NULL);
        if (r < 0)
                return log_error_errno(r, "Failed to resolve path %s: %m", *p);

        return free_and_replace(*p, resolved);
}
2985
03cfe0d5 2986static int determine_uid_shift(const char *directory) {
6dac160c
LP
2987 int r;
2988
0de7acce 2989 if (arg_userns_mode == USER_NAMESPACE_NO) {
03cfe0d5 2990 arg_uid_shift = 0;
6dac160c 2991 return 0;
03cfe0d5 2992 }
6dac160c
LP
2993
2994 if (arg_uid_shift == UID_INVALID) {
2995 struct stat st;
2996
03cfe0d5 2997 r = stat(directory, &st);
6dac160c 2998 if (r < 0)
03cfe0d5 2999 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
6dac160c
LP
3000
3001 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
3002
baaa35ad
ZJS
3003 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000)))
3004 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
3005 "UID and GID base of %s don't match.", directory);
6dac160c
LP
3006
3007 arg_uid_range = UINT32_C(0x10000);
3008 }
3009
baaa35ad
ZJS
3010 if (arg_uid_shift > (uid_t) -1 - arg_uid_range)
3011 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
3012 "UID base too high for UID range.");
6dac160c 3013
6dac160c
LP
3014 return 0;
3015}
3016
de40a303
LP
3017static unsigned long effective_clone_ns_flags(void) {
3018 unsigned long flags = arg_clone_ns_flags;
3019
3020 if (arg_private_network)
3021 flags |= CLONE_NEWNET;
3022 if (arg_use_cgns)
3023 flags |= CLONE_NEWCGROUP;
3024 if (arg_userns_mode != USER_NAMESPACE_NO)
3025 flags |= CLONE_NEWUSER;
3026
3027 return flags;
3028}
3029
3030static int patch_sysctl(void) {
3031
3032 /* This table is inspired by runc's sysctl() function */
3033 static const struct {
3034 const char *key;
3035 bool prefix;
3036 unsigned long clone_flags;
3037 } safe_sysctl[] = {
3038 { "kernel.hostname", false, CLONE_NEWUTS },
3039 { "kernel.domainname", false, CLONE_NEWUTS },
3040 { "kernel.msgmax", false, CLONE_NEWIPC },
3041 { "kernel.msgmnb", false, CLONE_NEWIPC },
3042 { "kernel.msgmni", false, CLONE_NEWIPC },
3043 { "kernel.sem", false, CLONE_NEWIPC },
3044 { "kernel.shmall", false, CLONE_NEWIPC },
3045 { "kernel.shmmax", false, CLONE_NEWIPC },
3046 { "kernel.shmmni", false, CLONE_NEWIPC },
3047 { "fs.mqueue.", true, CLONE_NEWIPC },
3048 { "net.", true, CLONE_NEWNET },
3049 };
3050
3051 unsigned long flags;
3052 char **k, **v;
3053 int r;
3054
3055 flags = effective_clone_ns_flags();
3056
3057 STRV_FOREACH_PAIR(k, v, arg_sysctl) {
3058 bool good = false;
3059 size_t i;
3060
3061 for (i = 0; i < ELEMENTSOF(safe_sysctl); i++) {
3062
3063 if (!FLAGS_SET(flags, safe_sysctl[i].clone_flags))
3064 continue;
3065
3066 if (safe_sysctl[i].prefix)
3067 good = startswith(*k, safe_sysctl[i].key);
3068 else
3069 good = streq(*k, safe_sysctl[i].key);
3070
3071 if (good)
3072 break;
3073 }
3074
c6147113
LP
3075 if (!good)
3076 return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Refusing to write to sysctl '%s', as it is not safe in the selected namespaces.", *k);
de40a303
LP
3077
3078 r = sysctl_write(*k, *v);
3079 if (r < 0)
3080 return log_error_errno(r, "Failed to write sysctl '%s': %m", *k);
3081 }
3082
3083 return 0;
3084}
3085
03cfe0d5
LP
3086static int inner_child(
3087 Barrier *barrier,
3088 const char *directory,
3089 bool secondary,
3090 int kmsg_socket,
3091 int rtnl_socket,
3acc84eb 3092 int master_pty_socket,
e1bb4b0d
LB
3093 FDSet *fds,
3094 char **os_release_pairs) {
69c79d3c 3095
03cfe0d5 3096 _cleanup_free_ char *home = NULL;
b5ea030d 3097 char as_uuid[ID128_UUID_STRING_MAX];
88614c8a 3098 size_t n_env = 1;
03cfe0d5 3099 const char *envp[] = {
0c300adf 3100 "PATH=" DEFAULT_PATH_COMPAT,
6aadfa4c 3101 NULL, /* container */
03cfe0d5
LP
3102 NULL, /* TERM */
3103 NULL, /* HOME */
3104 NULL, /* USER */
3105 NULL, /* LOGNAME */
3106 NULL, /* container_uuid */
3107 NULL, /* LISTEN_FDS */
3108 NULL, /* LISTEN_PID */
9c1e04d0 3109 NULL, /* NOTIFY_SOCKET */
3652872a 3110 NULL, /* CREDENTIALS_DIRECTORY */
03cfe0d5
LP
3111 NULL
3112 };
1a68e1e5 3113 const char *exec_target;
2371271c 3114 _cleanup_strv_free_ char **env_use = NULL;
de40a303 3115 int r, which_failed;
88213476 3116
b37469d7
LP
3117 /* This is the "inner" child process, i.e. the one forked off by the "outer" child process, which is the one
3118 * the container manager itself forked off. At the time of clone() it gained its own CLONE_NEWNS, CLONE_NEWPID,
3119 * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER namespaces. Note that it has its own CLONE_NEWNS namespace,
3120 * separate from the CLONE_NEWNS created for the "outer" child, and also separate from the host's CLONE_NEWNS
3121 * namespace. The reason for having two levels of CLONE_NEWNS namespaces is that the "inner" one is owned by
3122 * the CLONE_NEWUSER namespace of the container, while the "outer" one is owned by the host's CLONE_NEWUSER
3123 * namespace.
3124 *
3125 * Note at this point we have no CLONE_NEWNET namespace yet. We'll acquire that one later through
3126 * unshare(). See below. */
3127
03cfe0d5
LP
3128 assert(barrier);
3129 assert(directory);
3130 assert(kmsg_socket >= 0);
88213476 3131
de40a303
LP
3132 log_debug("Inner child is initializing.");
3133
0de7acce 3134 if (arg_userns_mode != USER_NAMESPACE_NO) {
03cfe0d5
LP
3135 /* Tell the parent, that it now can write the UID map. */
3136 (void) barrier_place(barrier); /* #1 */
7027ff61 3137
03cfe0d5 3138 /* Wait until the parent wrote the UID map */
baaa35ad 3139 if (!barrier_place_and_sync(barrier)) /* #2 */
2a2e78e9 3140 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Parent died too early");
88213476 3141
2a2e78e9
LP
3142 /* Become the new root user inside our namespace */
3143 r = reset_uid_gid();
3144 if (r < 0)
3145 return log_error_errno(r, "Couldn't become new root: %m");
3146
3147 /* Creating a new user namespace means all MS_SHARED mounts become MS_SLAVE. Let's put them
3148 * back to MS_SHARED here, since that's what we want as defaults. (This will not reconnect
3149 * propagation, but simply create new peer groups for all our mounts). */
3150 r = mount_verbose(LOG_ERR, NULL, "/", NULL, MS_SHARED|MS_REC, NULL);
3151 if (r < 0)
3152 return r;
3153 }
6d66bd3b 3154
0de7acce 3155 r = mount_all(NULL,
4f086aab 3156 arg_mount_settings | MOUNT_IN_USERNS,
0de7acce 3157 arg_uid_shift,
0de7acce 3158 arg_selinux_apifs_context);
03cfe0d5
LP
3159 if (r < 0)
3160 return r;
3161
04413780
ZJS
3162 if (!arg_network_namespace_path && arg_private_network) {
3163 r = unshare(CLONE_NEWNET);
3164 if (r < 0)
3165 return log_error_errno(errno, "Failed to unshare network namespace: %m");
75116558
PS
3166
3167 /* Tell the parent that it can setup network interfaces. */
3168 (void) barrier_place(barrier); /* #3 */
04413780
ZJS
3169 }
3170
4f086aab 3171 r = mount_sysfs(NULL, arg_mount_settings);
d8fc6a00
LP
3172 if (r < 0)
3173 return r;
3174
03cfe0d5
LP
3175 /* Wait until we are cgroup-ified, so that we
3176 * can mount the right cgroup path writable */
baaa35ad
ZJS
3177 if (!barrier_place_and_sync(barrier)) /* #4 */
3178 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
3179 "Parent died too early");
88213476 3180
489fae52 3181 if (arg_use_cgns) {
0996ef00
CB
3182 r = unshare(CLONE_NEWCGROUP);
3183 if (r < 0)
04413780 3184 return log_error_errno(errno, "Failed to unshare cgroup namespace: %m");
0996ef00
CB
3185 r = mount_cgroups(
3186 "",
3187 arg_unified_cgroup_hierarchy,
3188 arg_userns_mode != USER_NAMESPACE_NO,
3189 arg_uid_shift,
3190 arg_uid_range,
5a8ff0e6 3191 arg_selinux_apifs_context,
ada54120 3192 true);
1433e0f2 3193 } else
0996ef00 3194 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
1433e0f2
LP
3195 if (r < 0)
3196 return r;
ec16945e 3197
1e4f1671 3198 r = setup_boot_id();
03cfe0d5
LP
3199 if (r < 0)
3200 return r;
ec16945e 3201
1e4f1671 3202 r = setup_kmsg(kmsg_socket);
03cfe0d5
LP
3203 if (r < 0)
3204 return r;
3205 kmsg_socket = safe_close(kmsg_socket);
ec16945e 3206
de40a303
LP
3207 r = mount_custom(
3208 "/",
3209 arg_custom_mounts,
3210 arg_n_custom_mounts,
de40a303
LP
3211 0,
3212 arg_selinux_apifs_context,
5f0a6347 3213 MOUNT_NON_ROOT_ONLY | MOUNT_IN_USERNS);
de40a303
LP
3214 if (r < 0)
3215 return r;
3216
03cfe0d5
LP
3217 if (setsid() < 0)
3218 return log_error_errno(errno, "setsid() failed: %m");
3219
3220 if (arg_private_network)
df883de9 3221 (void) loopback_setup();
03cfe0d5 3222
7a8f6325
LP
3223 if (arg_expose_ports) {
3224 r = expose_port_send_rtnl(rtnl_socket);
3225 if (r < 0)
3226 return r;
3227 rtnl_socket = safe_close(rtnl_socket);
3228 }
03cfe0d5 3229
3acc84eb 3230 if (arg_console_mode != CONSOLE_PIPE) {
cd132992 3231 _cleanup_close_ int master = -1;
3acc84eb
FB
3232 _cleanup_free_ char *console = NULL;
3233
3234 /* Allocate a pty and make it available as /dev/console. */
dc98caea 3235 master = openpt_allocate(O_RDWR|O_NONBLOCK, &console);
3acc84eb 3236 if (master < 0)
dc98caea 3237 return log_error_errno(master, "Failed to allocate a pty: %m");
3acc84eb
FB
3238
3239 r = setup_dev_console(console);
3240 if (r < 0)
105a1a36 3241 return log_error_errno(r, "Failed to set up /dev/console: %m");
3acc84eb
FB
3242
3243 r = send_one_fd(master_pty_socket, master, 0);
3244 if (r < 0)
3245 return log_error_errno(r, "Failed to send master fd: %m");
3246 master_pty_socket = safe_close(master_pty_socket);
3247
3248 r = setup_stdio_as_dev_console();
3249 if (r < 0)
3250 return r;
3251 }
3252
de40a303
LP
3253 r = patch_sysctl();
3254 if (r < 0)
3255 return r;
3256
81f345df
LP
3257 if (arg_oom_score_adjust_set) {
3258 r = set_oom_score_adjust(arg_oom_score_adjust);
3259 if (r < 0)
3260 return log_error_errno(r, "Failed to adjust OOM score: %m");
3261 }
3262
0985c7c4
ZJS
3263 if (arg_cpu_set.set)
3264 if (sched_setaffinity(0, arg_cpu_set.allocated, arg_cpu_set.set) < 0)
d107bb7d
LP
3265 return log_error_errno(errno, "Failed to set CPU affinity: %m");
3266
c818eef1 3267 (void) setup_hostname();
03cfe0d5 3268
050f7277 3269 if (arg_personality != PERSONALITY_INVALID) {
21022b9d
LP
3270 r = safe_personality(arg_personality);
3271 if (r < 0)
3272 return log_error_errno(r, "personality() failed: %m");
03cfe0d5 3273 } else if (secondary) {
21022b9d
LP
3274 r = safe_personality(PER_LINUX32);
3275 if (r < 0)
3276 return log_error_errno(r, "personality() failed: %m");
03cfe0d5
LP
3277 }
3278
de40a303
LP
3279 r = setrlimit_closest_all((const struct rlimit *const*) arg_rlimit, &which_failed);
3280 if (r < 0)
3281 return log_error_errno(r, "Failed to apply resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
3282
3283#if HAVE_SECCOMP
3284 if (arg_seccomp) {
3285
3286 if (is_seccomp_available()) {
3287
3288 r = seccomp_load(arg_seccomp);
7bc5e0b1 3289 if (ERRNO_IS_SECCOMP_FATAL(r))
de40a303
LP
3290 return log_error_errno(r, "Failed to install seccomp filter: %m");
3291 if (r < 0)
3292 log_debug_errno(r, "Failed to install seccomp filter: %m");
3293 }
3294 } else
3295#endif
3296 {
6b000af4 3297 r = setup_seccomp(arg_caps_retain, arg_syscall_allow_list, arg_syscall_deny_list);
de40a303
LP
3298 if (r < 0)
3299 return r;
3300 }
3301
349cc4a5 3302#if HAVE_SELINUX
03cfe0d5 3303 if (arg_selinux_context)
2ed96880 3304 if (setexeccon(arg_selinux_context) < 0)
03cfe0d5
LP
3305 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
3306#endif
3307
de40a303
LP
3308 /* Make sure we keep the caps across the uid/gid dropping, so that we can retain some selected caps
3309 * if we need to later on. */
3310 if (prctl(PR_SET_KEEPCAPS, 1) < 0)
3311 return log_error_errno(errno, "Failed to set PR_SET_KEEPCAPS: %m");
3312
3313 if (uid_is_valid(arg_uid) || gid_is_valid(arg_gid))
3314 r = change_uid_gid_raw(arg_uid, arg_gid, arg_supplementary_gids, arg_n_supplementary_gids);
3315 else
3316 r = change_uid_gid(arg_user, &home);
03cfe0d5
LP
3317 if (r < 0)
3318 return r;
3319
de40a303
LP
3320 r = drop_capabilities(getuid());
3321 if (r < 0)
3322 return log_error_errno(r, "Dropping capabilities failed: %m");
3323
66edd963
LP
3324 if (arg_no_new_privileges)
3325 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
3326 return log_error_errno(errno, "Failed to disable new privileges: %m");
3327
6aadfa4c
ILG
3328 /* LXC sets container=lxc, so follow the scheme here */
3329 envp[n_env++] = strjoina("container=", arg_container_service_name);
3330
03cfe0d5
LP
3331 envp[n_env] = strv_find_prefix(environ, "TERM=");
3332 if (envp[n_env])
313cefa1 3333 n_env++;
03cfe0d5 3334
de40a303
LP
3335 if (home || !uid_is_valid(arg_uid) || arg_uid == 0)
3336 if (asprintf((char**)(envp + n_env++), "HOME=%s", home ?: "/root") < 0)
3337 return log_oom();
3338
3339 if (arg_user || !uid_is_valid(arg_uid) || arg_uid == 0)
3340 if (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ?: "root") < 0 ||
3341 asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)
3342 return log_oom();
03cfe0d5 3343
3bbaff3e 3344 assert(!sd_id128_is_null(arg_uuid));
03cfe0d5 3345
691675ba 3346 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_to_uuid_string(arg_uuid, as_uuid)) < 0)
e01ff70a 3347 return log_oom();
03cfe0d5
LP
3348
3349 if (fdset_size(fds) > 0) {
3350 r = fdset_cloexec(fds, false);
3351 if (r < 0)
3352 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
3353
3354 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
3355 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
3356 return log_oom();
3357 }
9c1e04d0
AP
3358 if (asprintf((char **)(envp + n_env++), "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
3359 return log_oom();
03cfe0d5 3360
3652872a
LP
3361 if (arg_n_credentials > 0) {
3362 envp[n_env] = strdup("CREDENTIALS_DIRECTORY=/run/host/credentials");
3363 if (!envp[n_env])
3364 return log_oom();
3365 n_env++;
3366 }
3367
ed4512d0 3368 env_use = strv_env_merge(3, envp, os_release_pairs, arg_setenv);
2371271c
TG
3369 if (!env_use)
3370 return log_oom();
03cfe0d5
LP
3371
3372 /* Let the parent know that we are ready and
3373 * wait until the parent is ready with the
3374 * setup, too... */
baaa35ad
ZJS
3375 if (!barrier_place_and_sync(barrier)) /* #5 */
3376 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
3377 "Parent died too early");
03cfe0d5 3378
5f932eb9
LP
3379 if (arg_chdir)
3380 if (chdir(arg_chdir) < 0)
3381 return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
3382
7732f92b 3383 if (arg_start_mode == START_PID2) {
75bf701f 3384 r = stub_pid1(arg_uuid);
7732f92b
LP
3385 if (r < 0)
3386 return r;
3387 }
3388
de40a303
LP
3389 log_debug("Inner child completed, invoking payload.");
3390
8ca082b4
LP
3391 /* Now, explicitly close the log, so that we then can close all remaining fds. Closing the log explicitly first
3392 * has the benefit that the logging subsystem knows about it, and is thus ready to be reopened should we need
3393 * it again. Note that the other fds closed here are at least the locking and barrier fds. */
03cfe0d5 3394 log_close();
8ca082b4
LP
3395 log_set_open_when_needed(true);
3396
03cfe0d5
LP
3397 (void) fdset_close_others(fds);
3398
7732f92b 3399 if (arg_start_mode == START_BOOT) {
03cfe0d5
LP
3400 char **a;
3401 size_t m;
3402
3403 /* Automatically search for the init system */
3404
75f32f04
ZJS
3405 m = strv_length(arg_parameters);
3406 a = newa(char*, m + 2);
3407 memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
3408 a[1 + m] = NULL;
03cfe0d5 3409
ced58da7 3410 a[0] = (char*) "/usr/lib/systemd/systemd";
03cfe0d5
LP
3411 execve(a[0], a, env_use);
3412
ced58da7 3413 a[0] = (char*) "/lib/systemd/systemd";
03cfe0d5
LP
3414 execve(a[0], a, env_use);
3415
ced58da7 3416 a[0] = (char*) "/sbin/init";
03cfe0d5 3417 execve(a[0], a, env_use);
ced58da7
LP
3418
3419 exec_target = "/usr/lib/systemd/systemd, /lib/systemd/systemd, /sbin/init";
1a68e1e5 3420 } else if (!strv_isempty(arg_parameters)) {
b6b180b7
LP
3421 const char *dollar_path;
3422
1a68e1e5 3423 exec_target = arg_parameters[0];
b6b180b7
LP
3424
3425 /* Use the user supplied search $PATH if there is one, or DEFAULT_PATH_COMPAT if not to search the
3426 * binary. */
3427 dollar_path = strv_env_get(env_use, "PATH");
3428 if (dollar_path) {
6f646e01 3429 if (setenv("PATH", dollar_path, 1) < 0)
b6b180b7
LP
3430 return log_error_errno(errno, "Failed to update $PATH: %m");
3431 }
3432
f757855e 3433 execvpe(arg_parameters[0], arg_parameters, env_use);
1a68e1e5 3434 } else {
5f932eb9 3435 if (!arg_chdir)
d929b0f9
ZJS
3436 /* If we cannot change the directory, we'll end up in /, that is expected. */
3437 (void) chdir(home ?: "/root");
5f932eb9 3438
03cfe0d5
LP
3439 execle("/bin/bash", "-bash", NULL, env_use);
3440 execle("/bin/sh", "-sh", NULL, env_use);
ced58da7
LP
3441
3442 exec_target = "/bin/bash, /bin/sh";
03cfe0d5
LP
3443 }
3444
8ca082b4 3445 return log_error_errno(errno, "execv(%s) failed: %m", exec_target);
03cfe0d5
LP
3446}
3447
e96ceaba 3448static int setup_notify_child(void) {
271f518f 3449 _cleanup_close_ int fd = -1;
9c1e04d0 3450 union sockaddr_union sa = {
44ed5214
LP
3451 .un.sun_family = AF_UNIX,
3452 .un.sun_path = NSPAWN_NOTIFY_SOCKET_PATH,
9c1e04d0
AP
3453 };
3454 int r;
3455
3456 fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
3457 if (fd < 0)
3458 return log_error_errno(errno, "Failed to allocate notification socket: %m");
3459
3460 (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
fbda85b0 3461 (void) sockaddr_un_unlink(&sa.un);
9c1e04d0 3462
9c1e04d0 3463 r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
271f518f 3464 if (r < 0)
44ed5214 3465 return log_error_errno(errno, "bind(" NSPAWN_NOTIFY_SOCKET_PATH ") failed: %m");
9c1e04d0 3466
adc7d9f0 3467 r = userns_lchown(NSPAWN_NOTIFY_SOCKET_PATH, 0, 0);
271f518f 3468 if (r < 0)
adc7d9f0 3469 return log_error_errno(r, "Failed to chown " NSPAWN_NOTIFY_SOCKET_PATH ": %m");
adc7d9f0 3470
2ff48e98 3471 r = setsockopt_int(fd, SOL_SOCKET, SO_PASSCRED, true);
271f518f 3472 if (r < 0)
2ff48e98 3473 return log_error_errno(r, "SO_PASSCRED failed: %m");
9c1e04d0 3474
271f518f 3475 return TAKE_FD(fd);
9c1e04d0
AP
3476}
3477
03cfe0d5
LP
3478static int outer_child(
3479 Barrier *barrier,
3480 const char *directory,
2d845785 3481 DissectedImage *dissected_image,
03cfe0d5
LP
3482 bool secondary,
3483 int pid_socket,
e01ff70a 3484 int uuid_socket,
9c1e04d0 3485 int notify_socket,
03cfe0d5
LP
3486 int kmsg_socket,
3487 int rtnl_socket,
825d5287 3488 int uid_shift_socket,
3acc84eb 3489 int master_pty_socket,
8199d554 3490 int unified_cgroup_hierarchy_socket,
d7bea6b6
DP
3491 FDSet *fds,
3492 int netns_fd) {
03cfe0d5 3493
e1bb4b0d 3494 _cleanup_strv_free_ char **os_release_pairs = NULL;
bf428efb 3495 _cleanup_close_ int fd = -1;
e5f10caf 3496 const char *p;
03cfe0d5
LP
3497 pid_t pid;
3498 ssize_t l;
de40a303 3499 int r;
03cfe0d5 3500
b37469d7
LP
3501 /* This is the "outer" child process, i.e the one forked off by the container manager itself. It already has
3502 * its own CLONE_NEWNS namespace (which was created by the clone()). It still lives in the host's CLONE_NEWPID,
3503 * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER and CLONE_NEWNET namespaces. After it completed a number of
3504 * initializations a second child (the "inner" one) is forked off it, and it exits. */
3505
03cfe0d5
LP
3506 assert(barrier);
3507 assert(directory);
03cfe0d5 3508 assert(pid_socket >= 0);
e01ff70a 3509 assert(uuid_socket >= 0);
9c1e04d0 3510 assert(notify_socket >= 0);
3acc84eb 3511 assert(master_pty_socket >= 0);
03cfe0d5
LP
3512 assert(kmsg_socket >= 0);
3513
de40a303
LP
3514 log_debug("Outer child is initializing.");
3515
e1bb4b0d
LB
3516 r = load_os_release_pairs_with_prefix("/", "container_host_", &os_release_pairs);
3517 if (r < 0)
3518 log_debug_errno(r, "Failed to read os-release from host for container, ignoring: %m");
3519
03cfe0d5
LP
3520 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
3521 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3522
03cfe0d5
LP
3523 r = reset_audit_loginuid();
3524 if (r < 0)
3525 return r;
3526
2a2e78e9
LP
3527 /* Mark everything as slave, so that we still receive mounts from the real root, but don't propagate
3528 * mounts to the real root. */
60e76d48
ZJS
3529 r = mount_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
3530 if (r < 0)
3531 return r;
03cfe0d5 3532
2d845785 3533 if (dissected_image) {
2d3a5a73
LP
3534 /* If we are operating on a disk image, then mount its root directory now, but leave out the rest. We
3535 * can read the UID shift from it if we need to. Further down we'll mount the rest, but then with the
3536 * uid shift known. That way we can mount VFAT file systems shifted to the right place right away. This
3537 * makes sure ESP partitions and userns are compatible. */
3538
af187ab2
LP
3539 r = dissected_image_mount_and_warn(
3540 dissected_image, directory, arg_uid_shift,
3541 DISSECT_IMAGE_MOUNT_ROOT_ONLY|DISSECT_IMAGE_DISCARD_ON_LOOP|
3542 (arg_read_only ? DISSECT_IMAGE_READ_ONLY : DISSECT_IMAGE_FSCK)|
3543 (arg_start_mode == START_BOOT ? DISSECT_IMAGE_VALIDATE_OS : 0));
2d845785 3544 if (r < 0)
af187ab2 3545 return r;
2d845785 3546 }
03cfe0d5 3547
391567f4
LP
3548 r = determine_uid_shift(directory);
3549 if (r < 0)
3550 return r;
3551
0de7acce 3552 if (arg_userns_mode != USER_NAMESPACE_NO) {
0e7ac751 3553 /* Let the parent know which UID shift we read from the image */
825d5287
RM
3554 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
3555 if (l < 0)
3556 return log_error_errno(errno, "Failed to send UID shift: %m");
baaa35ad
ZJS
3557 if (l != sizeof(arg_uid_shift))
3558 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3559 "Short write while sending UID shift.");
0e7ac751 3560
0de7acce 3561 if (arg_userns_mode == USER_NAMESPACE_PICK) {
0e7ac751
LP
3562 /* When we are supposed to pick the UID shift, the parent will check now whether the UID shift
3563 * we just read from the image is available. If yes, it will send the UID shift back to us, if
3564 * not it will pick a different one, and send it back to us. */
3565
3566 l = recv(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
3567 if (l < 0)
3568 return log_error_errno(errno, "Failed to recv UID shift: %m");
baaa35ad
ZJS
3569 if (l != sizeof(arg_uid_shift))
3570 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3571 "Short read while receiving UID shift.");
0e7ac751
LP
3572 }
3573
ff6c6cc1
LP
3574 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
3575 "Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
825d5287
RM
3576 }
3577
6f83d3d1
LP
3578 if (path_equal(directory, "/")) {
3579 /* If the directory we shall boot is the host, let's operate on a bind mount at a different
3580 * place, so that we can make changes to its mount structure (for example, to implement
3581 * --volatile=) without this interfering with our ability to access files such as
3582 * /etc/localtime to copy into the container. Note that we use a fixed place for this
3583 * (instead of a temporary directory, since we are living in our own mount namspace here
3584 * already, and thus don't need to be afraid of colliding with anyone else's mounts).*/
3585 (void) mkdir_p("/run/systemd/nspawn-root", 0755);
3586
3587 r = mount_verbose(LOG_ERR, "/", "/run/systemd/nspawn-root", NULL, MS_BIND|MS_REC, NULL);
3588 if (r < 0)
3589 return r;
3590
3591 directory = "/run/systemd/nspawn-root";
e50cd82f 3592 }
7d0ecdd6
LP
3593
3594 r = setup_pivot_root(
3595 directory,
3596 arg_pivot_root_new,
3597 arg_pivot_root_old);
3598 if (r < 0)
3599 return r;
3600
3601 r = setup_volatile_mode(
3602 directory,
3603 arg_volatile_mode,
7d0ecdd6 3604 arg_uid_shift,
8f1ed04a 3605 arg_selinux_apifs_context);
7d0ecdd6
LP
3606 if (r < 0)
3607 return r;
3608
5f0a6347
DDM
3609 r = mount_custom(
3610 directory,
3611 arg_custom_mounts,
3612 arg_n_custom_mounts,
5f0a6347 3613 arg_uid_shift,
5f0a6347
DDM
3614 arg_selinux_apifs_context,
3615 MOUNT_ROOT_ONLY);
3616 if (r < 0)
3617 return r;
3618
5530dc87
DDM
3619 /* Make sure we always have a mount that we can move to root later on. */
3620 if (!path_is_mount_point(directory, NULL, 0)) {
3621 r = mount_verbose(LOG_ERR, directory, directory, NULL, MS_BIND|MS_REC, NULL);
3622 if (r < 0)
3623 return r;
3624 }
3625
2d3a5a73
LP
3626 if (dissected_image) {
3627 /* Now we know the uid shift, let's now mount everything else that might be in the image. */
3628 r = dissected_image_mount(dissected_image, directory, arg_uid_shift,
4fcb96ce
LP
3629 DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY|DISSECT_IMAGE_DISCARD_ON_LOOP|(arg_read_only ? DISSECT_IMAGE_READ_ONLY : DISSECT_IMAGE_FSCK));
3630 if (r == -EUCLEAN)
3631 return log_error_errno(r, "File system check for image failed: %m");
2d3a5a73 3632 if (r < 0)
4fcb96ce 3633 return log_error_errno(r, "Failed to mount image file system: %m");
2d3a5a73
LP
3634 }
3635
8199d554
LP
3636 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
3637 /* OK, we don't know yet which cgroup mode to use yet. Let's figure it out, and tell the parent. */
3638
3639 r = detect_unified_cgroup_hierarchy_from_image(directory);
3640 if (r < 0)
3641 return r;
3642
3643 l = send(unified_cgroup_hierarchy_socket, &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), MSG_NOSIGNAL);
3644 if (l < 0)
3645 return log_error_errno(errno, "Failed to send cgroup mode: %m");
baaa35ad
ZJS
3646 if (l != sizeof(arg_unified_cgroup_hierarchy))
3647 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3648 "Short write while sending cgroup mode.");
8199d554
LP
3649
3650 unified_cgroup_hierarchy_socket = safe_close(unified_cgroup_hierarchy_socket);
3651 }
3652
4ad14eff
LP
3653 /* Mark everything as shared so our mounts get propagated down. This is
3654 * required to make new bind mounts available in systemd services
5238e957 3655 * inside the container that create a new mount namespace.
4ad14eff
LP
3656 * See https://github.com/systemd/systemd/issues/3860
3657 * Further submounts (such as /dev) done after this will inherit the
5f0a6347
DDM
3658 * shared propagation mode.
3659 *
3660 * IMPORTANT: Do not overmount the root directory anymore from now on to
3661 * enable moving the root directory mount to root later on.
3662 * https://github.com/systemd/systemd/issues/3847#issuecomment-562735251
3663 */
4ad14eff
LP
3664 r = mount_verbose(LOG_ERR, NULL, directory, NULL, MS_SHARED|MS_REC, NULL);
3665 if (r < 0)
3666 return r;
3667
3668 r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
3669 if (r < 0)
3670 return r;
3671
03cfe0d5
LP
3672 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
3673 if (r < 0)
3674 return r;
3675
bbd407ea
DDM
3676 if (arg_read_only && arg_volatile_mode == VOLATILE_NO &&
3677 !has_custom_root_mount(arg_custom_mounts, arg_n_custom_mounts)) {
64e82c19 3678 r = bind_remount_recursive(directory, MS_RDONLY, MS_RDONLY, NULL);
03cfe0d5
LP
3679 if (r < 0)
3680 return log_error_errno(r, "Failed to make tree read-only: %m");
3681 }
3682
0de7acce 3683 r = mount_all(directory,
4f086aab 3684 arg_mount_settings,
0de7acce 3685 arg_uid_shift,
0de7acce 3686 arg_selinux_apifs_context);
03cfe0d5
LP
3687 if (r < 0)
3688 return r;
3689
07fa00f9
LP
3690 r = copy_devnodes(directory);
3691 if (r < 0)
03cfe0d5
LP
3692 return r;
3693
de40a303
LP
3694 r = make_extra_nodes(directory);
3695 if (r < 0)
3696 return r;
3697
3698 (void) dev_setup(directory, arg_uid_shift, arg_uid_shift);
e5f10caf 3699
9fac5029 3700 p = prefix_roota(directory, "/run/host");
e5f10caf 3701 (void) make_inaccessible_nodes(p, arg_uid_shift, arg_uid_shift);
03cfe0d5 3702
07fa00f9
LP
3703 r = setup_pts(directory);
3704 if (r < 0)
03cfe0d5
LP
3705 return r;
3706
3707 r = setup_propagate(directory);
3708 if (r < 0)
3709 return r;
3710
8e5430c4
LP
3711 r = setup_keyring();
3712 if (r < 0)
3713 return r;
3714
3652872a
LP
3715 r = setup_credentials(directory);
3716 if (r < 0)
3717 return r;
3718
5c4deb9a
MJ
3719 r = mount_custom(
3720 directory,
3721 arg_custom_mounts,
3722 arg_n_custom_mounts,
3723 arg_uid_shift,
3724 arg_selinux_apifs_context,
3725 MOUNT_NON_ROOT_ONLY);
3726 if (r < 0)
3727 return r;
3728
03cfe0d5
LP
3729 r = setup_timezone(directory);
3730 if (r < 0)
3731 return r;
3732
3733 r = setup_resolv_conf(directory);
3734 if (r < 0)
3735 return r;
3736
e01ff70a
MS
3737 r = setup_machine_id(directory);
3738 if (r < 0)
3739 return r;
3740
03cfe0d5
LP
3741 r = setup_journal(directory);
3742 if (r < 0)
3743 return r;
3744
0f48ba7b
LP
3745 /* The same stuff as the $container env var, but nicely readable for the entire payload */
3746 p = prefix_roota(directory, "/run/host/container-manager");
3747 (void) write_string_file(p, arg_container_service_name, WRITE_STRING_FILE_CREATE);
3748
3749 /* The same stuff as the $container_uuid env var */
3750 p = prefix_roota(directory, "/run/host/container-uuid");
3751 (void) write_string_filef(p, WRITE_STRING_FILE_CREATE, SD_ID128_UUID_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid));
3752
489fae52 3753 if (!arg_use_cgns) {
0996ef00
CB
3754 r = mount_cgroups(
3755 directory,
3756 arg_unified_cgroup_hierarchy,
3757 arg_userns_mode != USER_NAMESPACE_NO,
3758 arg_uid_shift,
3759 arg_uid_range,
5a8ff0e6 3760 arg_selinux_apifs_context,
ada54120 3761 false);
0996ef00
CB
3762 if (r < 0)
3763 return r;
3764 }
03cfe0d5
LP
3765
3766 r = mount_move_root(directory);
3767 if (r < 0)
3768 return log_error_errno(r, "Failed to move root directory: %m");
3769
e96ceaba 3770 fd = setup_notify_child();
9c1e04d0
AP
3771 if (fd < 0)
3772 return fd;
3773
03cfe0d5 3774 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
0c582db0 3775 arg_clone_ns_flags |
8869a0b4 3776 (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
03cfe0d5
LP
3777 if (pid < 0)
3778 return log_error_errno(errno, "Failed to fork inner child: %m");
03cfe0d5
LP
3779 if (pid == 0) {
3780 pid_socket = safe_close(pid_socket);
e01ff70a 3781 uuid_socket = safe_close(uuid_socket);
9c1e04d0 3782 notify_socket = safe_close(notify_socket);
825d5287 3783 uid_shift_socket = safe_close(uid_shift_socket);
03cfe0d5 3784
2a2e78e9
LP
3785 /* The inner child has all namespaces that are requested, so that we all are owned by the
3786 * user if user namespaces are turned on. */
03cfe0d5 3787
d7bea6b6
DP
3788 if (arg_network_namespace_path) {
3789 r = namespace_enter(-1, -1, netns_fd, -1, -1);
3790 if (r < 0)
e2d39e54 3791 return log_error_errno(r, "Failed to join network namespace: %m");
d7bea6b6
DP
3792 }
3793
e1bb4b0d 3794 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, master_pty_socket, fds, os_release_pairs);
03cfe0d5
LP
3795 if (r < 0)
3796 _exit(EXIT_FAILURE);
3797
3798 _exit(EXIT_SUCCESS);
3799 }
3800
3801 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
3802 if (l < 0)
3803 return log_error_errno(errno, "Failed to send PID: %m");
baaa35ad
ZJS
3804 if (l != sizeof(pid))
3805 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3806 "Short write while sending PID.");
03cfe0d5 3807
e01ff70a
MS
3808 l = send(uuid_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
3809 if (l < 0)
3810 return log_error_errno(errno, "Failed to send machine ID: %m");
baaa35ad
ZJS
3811 if (l != sizeof(arg_uuid))
3812 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3813 "Short write while sending machine ID.");
e01ff70a 3814
9c1e04d0
AP
3815 l = send_one_fd(notify_socket, fd, 0);
3816 if (l < 0)
ba72801d 3817 return log_error_errno(l, "Failed to send notify fd: %m");
9c1e04d0 3818
03cfe0d5 3819 pid_socket = safe_close(pid_socket);
e01ff70a 3820 uuid_socket = safe_close(uuid_socket);
9c1e04d0 3821 notify_socket = safe_close(notify_socket);
3acc84eb 3822 master_pty_socket = safe_close(master_pty_socket);
327e26d6
KN
3823 kmsg_socket = safe_close(kmsg_socket);
3824 rtnl_socket = safe_close(rtnl_socket);
d7bea6b6 3825 netns_fd = safe_close(netns_fd);
03cfe0d5
LP
3826
3827 return 0;
3828}
3829
0e7ac751 3830static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
d381c8a6 3831 bool tried_hashed = false;
0e7ac751
LP
3832 unsigned n_tries = 100;
3833 uid_t candidate;
3834 int r;
3835
3836 assert(shift);
3837 assert(ret_lock_file);
0de7acce 3838 assert(arg_userns_mode == USER_NAMESPACE_PICK);
0e7ac751
LP
3839 assert(arg_uid_range == 0x10000U);
3840
3841 candidate = *shift;
3842
3843 (void) mkdir("/run/systemd/nspawn-uid", 0755);
3844
3845 for (;;) {
fbd0b64f 3846 char lock_path[STRLEN("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
8e766630 3847 _cleanup_(release_lock_file) LockFile lf = LOCK_FILE_INIT;
0e7ac751
LP
3848
3849 if (--n_tries <= 0)
3850 return -EBUSY;
3851
87d5e4f2 3852 if (candidate < CONTAINER_UID_BASE_MIN || candidate > CONTAINER_UID_BASE_MAX)
0e7ac751
LP
3853 goto next;
3854 if ((candidate & UINT32_C(0xFFFF)) != 0)
3855 goto next;
3856
3857 xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
3858 r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
3859 if (r == -EBUSY) /* Range already taken by another nspawn instance */
3860 goto next;
3861 if (r < 0)
3862 return r;
3863
3864 /* Make some superficial checks whether the range is currently known in the user database */
3865 if (getpwuid(candidate))
3866 goto next;
3867 if (getpwuid(candidate + UINT32_C(0xFFFE)))
3868 goto next;
3869 if (getgrgid(candidate))
3870 goto next;
3871 if (getgrgid(candidate + UINT32_C(0xFFFE)))
3872 goto next;
3873
3874 *ret_lock_file = lf;
3875 lf = (struct LockFile) LOCK_FILE_INIT;
3876 *shift = candidate;
3877 return 0;
3878
3879 next:
d381c8a6
LP
3880 if (arg_machine && !tried_hashed) {
3881 /* Try to hash the base from the container name */
3882
3883 static const uint8_t hash_key[] = {
3884 0xe1, 0x56, 0xe0, 0xf0, 0x4a, 0xf0, 0x41, 0xaf,
3885 0x96, 0x41, 0xcf, 0x41, 0x33, 0x94, 0xff, 0x72
3886 };
3887
3888 candidate = (uid_t) siphash24(arg_machine, strlen(arg_machine), hash_key);
3889
3890 tried_hashed = true;
3891 } else
3892 random_bytes(&candidate, sizeof(candidate));
3893
87d5e4f2 3894 candidate = (candidate % (CONTAINER_UID_BASE_MAX - CONTAINER_UID_BASE_MIN)) + CONTAINER_UID_BASE_MIN;
0e7ac751
LP
3895 candidate &= (uid_t) UINT32_C(0xFFFF0000);
3896 }
3897}
3898
03cfe0d5 3899static int setup_uid_map(pid_t pid) {
fbd0b64f 3900 char uid_map[STRLEN("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
03cfe0d5
LP
3901 int r;
3902
3903 assert(pid > 1);
3904
3905 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
3906 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
57512c89 3907 r = write_string_file(uid_map, line, WRITE_STRING_FILE_DISABLE_BUFFER);
03cfe0d5
LP
3908 if (r < 0)
3909 return log_error_errno(r, "Failed to write UID map: %m");
3910
3911 /* We always assign the same UID and GID ranges */
3912 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
57512c89 3913 r = write_string_file(uid_map, line, WRITE_STRING_FILE_DISABLE_BUFFER);
03cfe0d5
LP
3914 if (r < 0)
3915 return log_error_errno(r, "Failed to write GID map: %m");
3916
3917 return 0;
3918}
3919
9c1e04d0 3920static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
9c1e04d0
AP
3921 char buf[NOTIFY_BUFFER_MAX+1];
3922 char *p = NULL;
3923 struct iovec iovec = {
3924 .iov_base = buf,
3925 .iov_len = sizeof(buf)-1,
3926 };
fb29cdbe
LP
3927 CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred)) +
3928 CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)) control;
9c1e04d0
AP
3929 struct msghdr msghdr = {
3930 .msg_iov = &iovec,
3931 .msg_iovlen = 1,
3932 .msg_control = &control,
3933 .msg_controllen = sizeof(control),
3934 };
371d72e0 3935 struct ucred *ucred;
9c1e04d0
AP
3936 ssize_t n;
3937 pid_t inner_child_pid;
3938 _cleanup_strv_free_ char **tags = NULL;
3939
3940 assert(userdata);
3941
3942 inner_child_pid = PTR_TO_PID(userdata);
3943
3944 if (revents != EPOLLIN) {
3945 log_warning("Got unexpected poll event for notify fd.");
3946 return 0;
3947 }
3948
3691bcf3
LP
3949 n = recvmsg_safe(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
3950 if (IN_SET(n, -EAGAIN, -EINTR))
3951 return 0;
3952 if (n < 0)
3953 return log_warning_errno(n, "Couldn't read notification socket: %m");
9c1e04d0 3954
9c1e04d0
AP
3955 cmsg_close_all(&msghdr);
3956
371d72e0 3957 ucred = CMSG_FIND_DATA(&msghdr, SOL_SOCKET, SCM_CREDENTIALS, struct ucred);
9c1e04d0 3958 if (!ucred || ucred->pid != inner_child_pid) {
8cb57430 3959 log_debug("Received notify message without valid credentials. Ignoring.");
9c1e04d0
AP
3960 return 0;
3961 }
3962
3963 if ((size_t) n >= sizeof(buf)) {
3964 log_warning("Received notify message exceeded maximum size. Ignoring.");
3965 return 0;
3966 }
3967
3968 buf[n] = 0;
3969 tags = strv_split(buf, "\n\r");
3970 if (!tags)
3971 return log_oom();
3972
3973 if (strv_find(tags, "READY=1"))
04f590a4 3974 (void) sd_notifyf(false, "READY=1\n");
9c1e04d0
AP
3975
3976 p = strv_find_startswith(tags, "STATUS=");
3977 if (p)
04f590a4 3978 (void) sd_notifyf(false, "STATUS=Container running: %s", p);
9c1e04d0
AP
3979
3980 return 0;
3981}
3982
e96ceaba 3983static int setup_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid, sd_event_source **notify_event_source) {
9c1e04d0 3984 int r;
9c1e04d0 3985
5773024d 3986 r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
9c1e04d0
AP
3987 if (r < 0)
3988 return log_error_errno(r, "Failed to allocate notify event source: %m");
3989
5773024d 3990 (void) sd_event_source_set_description(*notify_event_source, "nspawn-notify");
9c1e04d0
AP
3991
3992 return 0;
3993}
3994
5d961407
LP
3995static int merge_settings(Settings *settings, const char *path) {
3996 int rl;
f757855e 3997
5d961407
LP
3998 assert(settings);
3999 assert(path);
f757855e 4000
5d961407
LP
4001 /* Copy over bits from the settings, unless they have been explicitly masked by command line switches. Note
4002 * that this steals the fields of the Settings* structure, and hence modifies it. */
f757855e 4003
7732f92b
LP
4004 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
4005 settings->start_mode >= 0) {
4006 arg_start_mode = settings->start_mode;
130d3d22 4007 strv_free_and_replace(arg_parameters, settings->parameters);
f757855e
LP
4008 }
4009
a2f577fc
JL
4010 if ((arg_settings_mask & SETTING_EPHEMERAL) == 0)
4011 arg_ephemeral = settings->ephemeral;
4012
de40a303
LP
4013 if ((arg_settings_mask & SETTING_DIRECTORY) == 0 &&
4014 settings->root) {
4015
4016 if (!arg_settings_trusted)
4017 log_warning("Ignoring root directory setting, file %s is not trusted.", path);
4018 else
4019 free_and_replace(arg_directory, settings->root);
4020 }
4021
b53ede69
PW
4022 if ((arg_settings_mask & SETTING_PIVOT_ROOT) == 0 &&
4023 settings->pivot_root_new) {
4024 free_and_replace(arg_pivot_root_new, settings->pivot_root_new);
4025 free_and_replace(arg_pivot_root_old, settings->pivot_root_old);
4026 }
4027
5f932eb9 4028 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
1cc6c93a
YW
4029 settings->working_directory)
4030 free_and_replace(arg_chdir, settings->working_directory);
5f932eb9 4031
f757855e 4032 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
130d3d22
YW
4033 settings->environment)
4034 strv_free_and_replace(arg_setenv, settings->environment);
f757855e 4035
de40a303
LP
4036 if ((arg_settings_mask & SETTING_USER) == 0) {
4037
4038 if (settings->user)
4039 free_and_replace(arg_user, settings->user);
4040
4041 if (uid_is_valid(settings->uid))
4042 arg_uid = settings->uid;
4043 if (gid_is_valid(settings->gid))
4044 arg_gid = settings->gid;
4045 if (settings->n_supplementary_gids > 0) {
4046 free_and_replace(arg_supplementary_gids, settings->supplementary_gids);
4047 arg_n_supplementary_gids = settings->n_supplementary_gids;
4048 }
4049 }
f757855e
LP
4050
4051 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
a3fc6b55 4052 uint64_t plus, minus;
7be830c6 4053 uint64_t network_minus = 0;
f757855e 4054
de40a303
LP
4055 /* Note that we copy both the simple plus/minus caps here, and the full quintet from the
4056 * Settings structure */
4057
0e265674 4058 plus = settings->capability;
a3fc6b55
LP
4059 minus = settings->drop_capability;
4060
4061 if ((arg_settings_mask & SETTING_NETWORK) == 0) {
4062 if (settings_private_network(settings))
4063 plus |= UINT64_C(1) << CAP_NET_ADMIN;
4064 else
7be830c6 4065 network_minus |= UINT64_C(1) << CAP_NET_ADMIN;
a3fc6b55 4066 }
0e265674
LP
4067
4068 if (!arg_settings_trusted && plus != 0) {
4069 if (settings->capability != 0)
5d961407 4070 log_warning("Ignoring Capability= setting, file %s is not trusted.", path);
7be830c6
TH
4071 } else {
4072 arg_caps_retain &= ~network_minus;
520e0d54 4073 arg_caps_retain |= plus;
7be830c6 4074 }
f757855e 4075
a3fc6b55 4076 arg_caps_retain &= ~minus;
de40a303
LP
4077
4078 /* Copy the full capabilities over too */
4079 if (capability_quintet_is_set(&settings->full_capabilities)) {
4080 if (!arg_settings_trusted)
5238e957 4081 log_warning("Ignoring capability settings, file %s is not trusted.", path);
de40a303
LP
4082 else
4083 arg_full_capabilities = settings->full_capabilities;
4084 }
f757855e
LP
4085 }
4086
4087 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
4088 settings->kill_signal > 0)
4089 arg_kill_signal = settings->kill_signal;
4090
4091 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
4092 settings->personality != PERSONALITY_INVALID)
4093 arg_personality = settings->personality;
4094
4095 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
4096 !sd_id128_is_null(settings->machine_id)) {
4097
4098 if (!arg_settings_trusted)
5d961407 4099 log_warning("Ignoring MachineID= setting, file %s is not trusted.", path);
f757855e
LP
4100 else
4101 arg_uuid = settings->machine_id;
4102 }
4103
4104 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
4105 settings->read_only >= 0)
4106 arg_read_only = settings->read_only;
4107
4108 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
4109 settings->volatile_mode != _VOLATILE_MODE_INVALID)
4110 arg_volatile_mode = settings->volatile_mode;
4111
4112 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
4113 settings->n_custom_mounts > 0) {
4114
4115 if (!arg_settings_trusted)
5d961407 4116 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", path);
f757855e
LP
4117 else {
4118 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
1cc6c93a 4119 arg_custom_mounts = TAKE_PTR(settings->custom_mounts);
f757855e 4120 arg_n_custom_mounts = settings->n_custom_mounts;
f757855e
LP
4121 settings->n_custom_mounts = 0;
4122 }
4123 }
4124
4125 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
4126 (settings->private_network >= 0 ||
4127 settings->network_veth >= 0 ||
4128 settings->network_bridge ||
22b28dfd 4129 settings->network_zone ||
f757855e
LP
4130 settings->network_interfaces ||
4131 settings->network_macvlan ||
f6d6bad1 4132 settings->network_ipvlan ||
de40a303
LP
4133 settings->network_veth_extra ||
4134 settings->network_namespace_path)) {
f757855e
LP
4135
4136 if (!arg_settings_trusted)
5d961407 4137 log_warning("Ignoring network settings, file %s is not trusted.", path);
f757855e 4138 else {
f6d6bad1 4139 arg_network_veth = settings_network_veth(settings);
0e265674
LP
4140 arg_private_network = settings_private_network(settings);
4141
130d3d22
YW
4142 strv_free_and_replace(arg_network_interfaces, settings->network_interfaces);
4143 strv_free_and_replace(arg_network_macvlan, settings->network_macvlan);
4144 strv_free_and_replace(arg_network_ipvlan, settings->network_ipvlan);
4145 strv_free_and_replace(arg_network_veth_extra, settings->network_veth_extra);
f6d6bad1 4146
1cc6c93a
YW
4147 free_and_replace(arg_network_bridge, settings->network_bridge);
4148 free_and_replace(arg_network_zone, settings->network_zone);
de40a303
LP
4149
4150 free_and_replace(arg_network_namespace_path, settings->network_namespace_path);
f757855e
LP
4151 }
4152 }
4153
4154 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
4155 settings->expose_ports) {
4156
4157 if (!arg_settings_trusted)
5d961407 4158 log_warning("Ignoring Port= setting, file %s is not trusted.", path);
f757855e
LP
4159 else {
4160 expose_port_free_all(arg_expose_ports);
1cc6c93a 4161 arg_expose_ports = TAKE_PTR(settings->expose_ports);
f757855e
LP
4162 }
4163 }
4164
0de7acce
LP
4165 if ((arg_settings_mask & SETTING_USERNS) == 0 &&
4166 settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
4167
4168 if (!arg_settings_trusted)
5d961407 4169 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", path);
0de7acce
LP
4170 else {
4171 arg_userns_mode = settings->userns_mode;
4172 arg_uid_shift = settings->uid_shift;
4173 arg_uid_range = settings->uid_range;
4174 arg_userns_chown = settings->userns_chown;
4175 }
4176 }
4177
9c1e04d0
AP
4178 if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0)
4179 arg_notify_ready = settings->notify_ready;
4180
960e4569
LP
4181 if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) {
4182
6b000af4 4183 if (!arg_settings_trusted && !strv_isempty(settings->syscall_allow_list))
5d961407 4184 log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", path);
960e4569 4185 else {
6b000af4
LP
4186 strv_free_and_replace(arg_syscall_allow_list, settings->syscall_allow_list);
4187 strv_free_and_replace(arg_syscall_deny_list, settings->syscall_deny_list);
960e4569 4188 }
de40a303
LP
4189
4190#if HAVE_SECCOMP
4191 if (!arg_settings_trusted && settings->seccomp)
4192 log_warning("Ignoring SECCOMP filter, file %s is not trusted.", path);
4193 else {
4194 seccomp_release(arg_seccomp);
4195 arg_seccomp = TAKE_PTR(settings->seccomp);
4196 }
4197#endif
960e4569
LP
4198 }
4199
bf428efb
LP
4200 for (rl = 0; rl < _RLIMIT_MAX; rl ++) {
4201 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)))
4202 continue;
4203
4204 if (!settings->rlimit[rl])
4205 continue;
4206
4207 if (!arg_settings_trusted) {
5d961407 4208 log_warning("Ignoring Limit%s= setting, file '%s' is not trusted.", rlimit_to_string(rl), path);
bf428efb
LP
4209 continue;
4210 }
4211
4212 free_and_replace(arg_rlimit[rl], settings->rlimit[rl]);
4213 }
4214
3a9530e5
LP
4215 if ((arg_settings_mask & SETTING_HOSTNAME) == 0 &&
4216 settings->hostname)
4217 free_and_replace(arg_hostname, settings->hostname);
4218
66edd963
LP
4219 if ((arg_settings_mask & SETTING_NO_NEW_PRIVILEGES) == 0 &&
4220 settings->no_new_privileges >= 0)
4221 arg_no_new_privileges = settings->no_new_privileges;
4222
81f345df
LP
4223 if ((arg_settings_mask & SETTING_OOM_SCORE_ADJUST) == 0 &&
4224 settings->oom_score_adjust_set) {
4225
4226 if (!arg_settings_trusted)
5d961407 4227 log_warning("Ignoring OOMScoreAdjust= setting, file '%s' is not trusted.", path);
81f345df
LP
4228 else {
4229 arg_oom_score_adjust = settings->oom_score_adjust;
4230 arg_oom_score_adjust_set = true;
4231 }
4232 }
4233
d107bb7d 4234 if ((arg_settings_mask & SETTING_CPU_AFFINITY) == 0 &&
0985c7c4 4235 settings->cpu_set.set) {
d107bb7d
LP
4236
4237 if (!arg_settings_trusted)
5d961407 4238 log_warning("Ignoring CPUAffinity= setting, file '%s' is not trusted.", path);
d107bb7d 4239 else {
0985c7c4
ZJS
4240 cpu_set_reset(&arg_cpu_set);
4241 arg_cpu_set = settings->cpu_set;
4242 settings->cpu_set = (CPUSet) {};
d107bb7d
LP
4243 }
4244 }
4245
09d423e9
LP
4246 if ((arg_settings_mask & SETTING_RESOLV_CONF) == 0 &&
4247 settings->resolv_conf != _RESOLV_CONF_MODE_INVALID)
4248 arg_resolv_conf = settings->resolv_conf;
4249
4e1d6aa9
LP
4250 if ((arg_settings_mask & SETTING_LINK_JOURNAL) == 0 &&
4251 settings->link_journal != _LINK_JOURNAL_INVALID) {
4252
4253 if (!arg_settings_trusted)
4254 log_warning("Ignoring journal link setting, file '%s' is not trusted.", path);
4255 else {
4256 arg_link_journal = settings->link_journal;
4257 arg_link_journal_try = settings->link_journal_try;
4258 }
4259 }
4260
1688841f
LP
4261 if ((arg_settings_mask & SETTING_TIMEZONE) == 0 &&
4262 settings->timezone != _TIMEZONE_MODE_INVALID)
4263 arg_timezone = settings->timezone;
4264
de40a303
LP
4265 if ((arg_settings_mask & SETTING_SLICE) == 0 &&
4266 settings->slice) {
4267
4268 if (!arg_settings_trusted)
4269 log_warning("Ignoring slice setting, file '%s' is not trusted.", path);
4270 else
4271 free_and_replace(arg_slice, settings->slice);
4272 }
4273
4274 if ((arg_settings_mask & SETTING_USE_CGNS) == 0 &&
4275 settings->use_cgns >= 0) {
4276
4277 if (!arg_settings_trusted)
4278 log_warning("Ignoring cgroup namespace setting, file '%s' is not trusted.", path);
4279 else
4280 arg_use_cgns = settings->use_cgns;
4281 }
4282
4283 if ((arg_settings_mask & SETTING_CLONE_NS_FLAGS) == 0 &&
4284 settings->clone_ns_flags != (unsigned long) -1) {
4285
4286 if (!arg_settings_trusted)
4287 log_warning("Ignoring namespace setting, file '%s' is not trusted.", path);
4288 else
4289 arg_clone_ns_flags = settings->clone_ns_flags;
4290 }
4291
4292 if ((arg_settings_mask & SETTING_CONSOLE_MODE) == 0 &&
4293 settings->console_mode >= 0) {
4294
4295 if (!arg_settings_trusted)
4296 log_warning("Ignoring console mode setting, file '%s' is not trusted.", path);
4297 else
4298 arg_console_mode = settings->console_mode;
4299 }
4300
4301 /* The following properties can only be set through the OCI settings logic, not from the command line, hence we
4302 * don't consult arg_settings_mask for them. */
4303
4304 sd_bus_message_unref(arg_property_message);
4305 arg_property_message = TAKE_PTR(settings->properties);
4306
4307 arg_console_width = settings->console_width;
4308 arg_console_height = settings->console_height;
4309
b2645747 4310 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
de40a303
LP
4311 arg_extra_nodes = TAKE_PTR(settings->extra_nodes);
4312 arg_n_extra_nodes = settings->n_extra_nodes;
4313
f757855e
LP
4314 return 0;
4315}
4316
5d961407
LP
4317static int load_settings(void) {
4318 _cleanup_(settings_freep) Settings *settings = NULL;
4319 _cleanup_fclose_ FILE *f = NULL;
4320 _cleanup_free_ char *p = NULL;
4321 const char *fn, *i;
4322 int r;
4323
de40a303
LP
4324 if (arg_oci_bundle)
4325 return 0;
4326
5d961407
LP
4327 /* If all settings are masked, there's no point in looking for
4328 * the settings file */
4329 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
4330 return 0;
4331
4332 fn = strjoina(arg_machine, ".nspawn");
4333
4334 /* We first look in the admin's directories in /etc and /run */
4335 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
4336 _cleanup_free_ char *j = NULL;
4337
657ee2d8 4338 j = path_join(i, fn);
5d961407
LP
4339 if (!j)
4340 return log_oom();
4341
4342 f = fopen(j, "re");
4343 if (f) {
4344 p = TAKE_PTR(j);
4345
4346 /* By default, we trust configuration from /etc and /run */
4347 if (arg_settings_trusted < 0)
4348 arg_settings_trusted = true;
4349
4350 break;
4351 }
4352
4353 if (errno != ENOENT)
4354 return log_error_errno(errno, "Failed to open %s: %m", j);
4355 }
4356
4357 if (!f) {
4358 /* After that, let's look for a file next to the
4359 * actual image we shall boot. */
4360
4361 if (arg_image) {
4362 p = file_in_same_dir(arg_image, fn);
4363 if (!p)
4364 return log_oom();
cd6e3914 4365 } else if (arg_directory && !path_equal(arg_directory, "/")) {
5d961407
LP
4366 p = file_in_same_dir(arg_directory, fn);
4367 if (!p)
4368 return log_oom();
4369 }
4370
4371 if (p) {
4372 f = fopen(p, "re");
4373 if (!f && errno != ENOENT)
4374 return log_error_errno(errno, "Failed to open %s: %m", p);
4375
4376 /* By default, we do not trust configuration from /var/lib/machines */
4377 if (arg_settings_trusted < 0)
4378 arg_settings_trusted = false;
4379 }
4380 }
4381
4382 if (!f)
4383 return 0;
4384
4385 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
4386
4387 r = settings_load(f, p, &settings);
4388 if (r < 0)
4389 return r;
4390
4391 return merge_settings(settings, p);
4392}
4393
de40a303
LP
4394static int load_oci_bundle(void) {
4395 _cleanup_(settings_freep) Settings *settings = NULL;
4396 int r;
4397
4398 if (!arg_oci_bundle)
4399 return 0;
4400
4401 /* By default let's trust OCI bundles */
4402 if (arg_settings_trusted < 0)
4403 arg_settings_trusted = true;
4404
4405 r = oci_load(NULL, arg_oci_bundle, &settings);
4406 if (r < 0)
4407 return r;
4408
4409 return merge_settings(settings, arg_oci_bundle);
4410}
4411
3acc84eb 4412static int run_container(
2d845785 4413 DissectedImage *dissected_image,
b0067625
ZJS
4414 bool secondary,
4415 FDSet *fds,
4416 char veth_name[IFNAMSIZ], bool *veth_created,
4417 union in_addr_union *exposed,
3acc84eb 4418 int *master, pid_t *pid, int *ret) {
b0067625
ZJS
4419
4420 static const struct sigaction sa = {
4421 .sa_handler = nop_signal_handler,
e28c7cd0 4422 .sa_flags = SA_NOCLDSTOP|SA_RESTART,
b0067625
ZJS
4423 };
4424
8e766630 4425 _cleanup_(release_lock_file) LockFile uid_shift_lock = LOCK_FILE_INIT;
b0067625
ZJS
4426 _cleanup_close_ int etc_passwd_lock = -1;
4427 _cleanup_close_pair_ int
4428 kmsg_socket_pair[2] = { -1, -1 },
4429 rtnl_socket_pair[2] = { -1, -1 },
4430 pid_socket_pair[2] = { -1, -1 },
4431 uuid_socket_pair[2] = { -1, -1 },
4432 notify_socket_pair[2] = { -1, -1 },
8199d554 4433 uid_shift_socket_pair[2] = { -1, -1 },
3acc84eb 4434 master_pty_socket_pair[2] = { -1, -1 },
8199d554
LP
4435 unified_cgroup_hierarchy_socket_pair[2] = { -1, -1};
4436
3acc84eb 4437 _cleanup_close_ int notify_socket = -1;
b0067625 4438 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
5773024d 4439 _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL;
b0067625
ZJS
4440 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
4441 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4442 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
abdb9b08 4443 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
b0067625 4444 ContainerStatus container_status = 0;
b0067625
ZJS
4445 int ifi = 0, r;
4446 ssize_t l;
4447 sigset_t mask_chld;
5b4855ab 4448 _cleanup_close_ int child_netns_fd = -1;
b0067625
ZJS
4449
4450 assert_se(sigemptyset(&mask_chld) == 0);
4451 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
4452
4453 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4454 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
4455 * check with getpwuid() if the specific user already exists. Note that /etc might be
4456 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
4457 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
4458 * really just an extra safety net. We kinda assume that the UID range we allocate from is
4459 * really ours. */
4460
4461 etc_passwd_lock = take_etc_passwd_lock(NULL);
4462 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS)
4463 return log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m");
4464 }
4465
4466 r = barrier_create(&barrier);
4467 if (r < 0)
4468 return log_error_errno(r, "Cannot initialize IPC barrier: %m");
4469
4470 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0)
4471 return log_error_errno(errno, "Failed to create kmsg socket pair: %m");
4472
4473 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0)
4474 return log_error_errno(errno, "Failed to create rtnl socket pair: %m");
4475
4476 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0)
4477 return log_error_errno(errno, "Failed to create pid socket pair: %m");
4478
4479 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0)
4480 return log_error_errno(errno, "Failed to create id socket pair: %m");
4481
4482 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0)
4483 return log_error_errno(errno, "Failed to create notify socket pair: %m");
4484
3acc84eb
FB
4485 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, master_pty_socket_pair) < 0)
4486 return log_error_errno(errno, "Failed to create console socket pair: %m");
4487
b0067625
ZJS
4488 if (arg_userns_mode != USER_NAMESPACE_NO)
4489 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0)
4490 return log_error_errno(errno, "Failed to create uid shift socket pair: %m");
4491
8199d554
LP
4492 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN)
4493 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, unified_cgroup_hierarchy_socket_pair) < 0)
4494 return log_error_errno(errno, "Failed to create unified cgroup socket pair: %m");
4495
b0067625
ZJS
4496 /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
4497 * parent's blocking calls and give it a chance to call wait() and terminate. */
4498 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
4499 if (r < 0)
4500 return log_error_errno(errno, "Failed to change the signal mask: %m");
4501
4502 r = sigaction(SIGCHLD, &sa, NULL);
4503 if (r < 0)
4504 return log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
4505
d7bea6b6 4506 if (arg_network_namespace_path) {
5b4855ab
DDM
4507 child_netns_fd = open(arg_network_namespace_path, O_RDONLY|O_NOCTTY|O_CLOEXEC);
4508 if (child_netns_fd < 0)
d7bea6b6
DP
4509 return log_error_errno(errno, "Cannot open file %s: %m", arg_network_namespace_path);
4510
5b4855ab 4511 r = fd_is_network_ns(child_netns_fd);
6619ad88
LP
4512 if (r == -EUCLEAN)
4513 log_debug_errno(r, "Cannot determine if passed network namespace path '%s' really refers to a network namespace, assuming it does.", arg_network_namespace_path);
4514 else if (r < 0)
d7bea6b6 4515 return log_error_errno(r, "Failed to check %s fs type: %m", arg_network_namespace_path);
c6147113
LP
4516 else if (r == 0)
4517 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
4518 "Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path);
d7bea6b6
DP
4519 }
4520
b0067625
ZJS
4521 *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
4522 if (*pid < 0)
4523 return log_error_errno(errno, "clone() failed%s: %m",
4524 errno == EINVAL ?
4525 ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
4526
4527 if (*pid == 0) {
4528 /* The outer child only has a file system namespace. */
4529 barrier_set_role(&barrier, BARRIER_CHILD);
4530
b0067625
ZJS
4531 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
4532 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4533 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
4534 uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
4535 notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
3acc84eb 4536 master_pty_socket_pair[0] = safe_close(master_pty_socket_pair[0]);
b0067625 4537 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
8199d554 4538 unified_cgroup_hierarchy_socket_pair[0] = safe_close(unified_cgroup_hierarchy_socket_pair[0]);
b0067625
ZJS
4539
4540 (void) reset_all_signal_handlers();
4541 (void) reset_signal_mask();
4542
4543 r = outer_child(&barrier,
4544 arg_directory,
2d845785 4545 dissected_image,
b0067625
ZJS
4546 secondary,
4547 pid_socket_pair[1],
4548 uuid_socket_pair[1],
4549 notify_socket_pair[1],
4550 kmsg_socket_pair[1],
4551 rtnl_socket_pair[1],
4552 uid_shift_socket_pair[1],
3acc84eb 4553 master_pty_socket_pair[1],
8199d554 4554 unified_cgroup_hierarchy_socket_pair[1],
d7bea6b6 4555 fds,
5b4855ab 4556 child_netns_fd);
b0067625
ZJS
4557 if (r < 0)
4558 _exit(EXIT_FAILURE);
4559
4560 _exit(EXIT_SUCCESS);
4561 }
4562
4563 barrier_set_role(&barrier, BARRIER_PARENT);
4564
e4077ff6 4565 fdset_close(fds);
b0067625
ZJS
4566
4567 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4568 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4569 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
4570 uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
4571 notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
3acc84eb 4572 master_pty_socket_pair[1] = safe_close(master_pty_socket_pair[1]);
b0067625 4573 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
8199d554 4574 unified_cgroup_hierarchy_socket_pair[1] = safe_close(unified_cgroup_hierarchy_socket_pair[1]);
b0067625
ZJS
4575
4576 if (arg_userns_mode != USER_NAMESPACE_NO) {
4577 /* The child just let us know the UID shift it might have read from the image. */
4578 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
4579 if (l < 0)
4580 return log_error_errno(errno, "Failed to read UID shift: %m");
c6147113
LP
4581 if (l != sizeof arg_uid_shift)
4582 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading UID shift.");
b0067625
ZJS
4583
4584 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4585 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
4586 * image, but if that's already in use, pick a new one, and report back to the child,
4587 * which one we now picked. */
4588
4589 r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
4590 if (r < 0)
4591 return log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
4592
4593 l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
4594 if (l < 0)
4595 return log_error_errno(errno, "Failed to send UID shift: %m");
c6147113
LP
4596 if (l != sizeof arg_uid_shift)
4597 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short write while writing UID shift.");
b0067625
ZJS
4598 }
4599 }
4600
8199d554
LP
4601 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
4602 /* The child let us know the support cgroup mode it might have read from the image. */
4603 l = recv(unified_cgroup_hierarchy_socket_pair[0], &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), 0);
4604 if (l < 0)
4605 return log_error_errno(errno, "Failed to read cgroup mode: %m");
c6147113
LP
4606 if (l != sizeof(arg_unified_cgroup_hierarchy))
4607 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading cgroup mode (%zu bytes).%s",
4608 l, l == 0 ? " The child is most likely dead." : "");
8199d554
LP
4609 }
4610
b0067625 4611 /* Wait for the outer child. */
d2e0ac3d
LP
4612 r = wait_for_terminate_and_check("(sd-namespace)", *pid, WAIT_LOG_ABNORMAL);
4613 if (r < 0)
4614 return r;
4615 if (r != EXIT_SUCCESS)
4616 return -EIO;
b0067625
ZJS
4617
4618 /* And now retrieve the PID of the inner child. */
4619 l = recv(pid_socket_pair[0], pid, sizeof *pid, 0);
4620 if (l < 0)
4621 return log_error_errno(errno, "Failed to read inner child PID: %m");
c6147113
LP
4622 if (l != sizeof *pid)
4623 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading inner child PID.");
b0067625
ZJS
4624
4625 /* We also retrieve container UUID in case it was generated by outer child */
4626 l = recv(uuid_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
4627 if (l < 0)
4628 return log_error_errno(errno, "Failed to read container machine ID: %m");
c6147113
LP
4629 if (l != sizeof(arg_uuid))
4630 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading container machined ID.");
b0067625
ZJS
4631
4632 /* We also retrieve the socket used for notifications generated by outer child */
4633 notify_socket = receive_one_fd(notify_socket_pair[0], 0);
4634 if (notify_socket < 0)
4635 return log_error_errno(notify_socket,
4636 "Failed to receive notification socket from the outer child: %m");
4637
4638 log_debug("Init process invoked as PID "PID_FMT, *pid);
4639
4640 if (arg_userns_mode != USER_NAMESPACE_NO) {
c6147113
LP
4641 if (!barrier_place_and_sync(&barrier)) /* #1 */
4642 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
b0067625
ZJS
4643
4644 r = setup_uid_map(*pid);
4645 if (r < 0)
4646 return r;
4647
4648 (void) barrier_place(&barrier); /* #2 */
4649 }
4650
4651 if (arg_private_network) {
75116558
PS
4652 if (!arg_network_namespace_path) {
4653 /* Wait until the child has unshared its network namespace. */
c6147113
LP
4654 if (!barrier_place_and_sync(&barrier)) /* #3 */
4655 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early");
75116558
PS
4656 }
4657
5b4855ab
DDM
4658 if (child_netns_fd < 0) {
4659 /* Make sure we have an open file descriptor to the child's network
4660 * namespace so it stays alive even if the child exits. */
4661 r = namespace_open(*pid, NULL, NULL, &child_netns_fd, NULL, NULL);
4662 if (r < 0)
4663 return log_error_errno(r, "Failed to open child network namespace: %m");
4664 }
4665
4666 r = move_network_interfaces(child_netns_fd, arg_network_interfaces);
b0067625
ZJS
4667 if (r < 0)
4668 return r;
4669
4670 if (arg_network_veth) {
4671 r = setup_veth(arg_machine, *pid, veth_name,
4672 arg_network_bridge || arg_network_zone);
4673 if (r < 0)
4674 return r;
4675 else if (r > 0)
4676 ifi = r;
4677
4678 if (arg_network_bridge) {
4679 /* Add the interface to a bridge */
4680 r = setup_bridge(veth_name, arg_network_bridge, false);
4681 if (r < 0)
4682 return r;
4683 if (r > 0)
4684 ifi = r;
4685 } else if (arg_network_zone) {
4686 /* Add the interface to a bridge, possibly creating it */
4687 r = setup_bridge(veth_name, arg_network_zone, true);
4688 if (r < 0)
4689 return r;
4690 if (r > 0)
4691 ifi = r;
4692 }
4693 }
4694
4695 r = setup_veth_extra(arg_machine, *pid, arg_network_veth_extra);
4696 if (r < 0)
4697 return r;
4698
4699 /* We created the primary and extra veth links now; let's remember this, so that we know to
4700 remove them later on. Note that we don't bother with removing veth links that were created
4701 here when their setup failed half-way, because in that case the kernel should be able to
4702 remove them on its own, since they cannot be referenced by anything yet. */
4703 *veth_created = true;
4704
4705 r = setup_macvlan(arg_machine, *pid, arg_network_macvlan);
4706 if (r < 0)
4707 return r;
4708
4709 r = setup_ipvlan(arg_machine, *pid, arg_network_ipvlan);
4710 if (r < 0)
4711 return r;
4712 }
4713
abdb9b08
LP
4714 if (arg_register || !arg_keep_unit) {
4715 r = sd_bus_default_system(&bus);
4716 if (r < 0)
4717 return log_error_errno(r, "Failed to open system bus: %m");
e5a2d8b5
LP
4718
4719 r = sd_bus_set_close_on_exit(bus, false);
4720 if (r < 0)
4721 return log_error_errno(r, "Failed to disable close-on-exit behaviour: %m");
abdb9b08
LP
4722 }
4723
4724 if (!arg_keep_unit) {
4725 /* When a new scope is created for this container, then we'll be registered as its controller, in which
4726 * case PID 1 will send us a friendly RequestStop signal, when it is asked to terminate the
4727 * scope. Let's hook into that, and cleanly shut down the container, and print a friendly message. */
4728
75152a4d
LP
4729 r = sd_bus_match_signal_async(
4730 bus,
4731 NULL,
4732 "org.freedesktop.systemd1",
4733 NULL,
4734 "org.freedesktop.systemd1.Scope",
4735 "RequestStop",
4736 on_request_stop, NULL, PID_TO_PTR(*pid));
abdb9b08 4737 if (r < 0)
75152a4d 4738 return log_error_errno(r, "Failed to request RequestStop match: %m");
abdb9b08
LP
4739 }
4740
b0067625
ZJS
4741 if (arg_register) {
4742 r = register_machine(
abdb9b08 4743 bus,
b0067625
ZJS
4744 arg_machine,
4745 *pid,
4746 arg_directory,
4747 arg_uuid,
4748 ifi,
4749 arg_slice,
4750 arg_custom_mounts, arg_n_custom_mounts,
4751 arg_kill_signal,
4752 arg_property,
de40a303 4753 arg_property_message,
b0067625
ZJS
4754 arg_keep_unit,
4755 arg_container_service_name);
4756 if (r < 0)
4757 return r;
abdb9b08 4758
cd2dfc6f
LP
4759 } else if (!arg_keep_unit) {
4760 r = allocate_scope(
abdb9b08 4761 bus,
cd2dfc6f
LP
4762 arg_machine,
4763 *pid,
4764 arg_slice,
4765 arg_custom_mounts, arg_n_custom_mounts,
4766 arg_kill_signal,
de40a303
LP
4767 arg_property,
4768 arg_property_message);
cd2dfc6f
LP
4769 if (r < 0)
4770 return r;
4771
4772 } else if (arg_slice || arg_property)
4773 log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect.");
b0067625 4774
27da7ef0 4775 r = create_subcgroup(*pid, arg_keep_unit, arg_unified_cgroup_hierarchy);
b0067625
ZJS
4776 if (r < 0)
4777 return r;
4778
27da7ef0 4779 r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
720f0a2f
LP
4780 if (r < 0)
4781 return r;
b0067625 4782
de54e02d 4783 r = chown_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
b0067625
ZJS
4784 if (r < 0)
4785 return r;
4786
4787 /* Notify the child that the parent is ready with all
4788 * its setup (including cgroup-ification), and that
4789 * the child can now hand over control to the code to
4790 * run inside the container. */
75116558 4791 (void) barrier_place(&barrier); /* #4 */
b0067625
ZJS
4792
4793 /* Block SIGCHLD here, before notifying child.
4794 * process_pty() will handle it with the other signals. */
4795 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
4796
4797 /* Reset signal to default */
4798 r = default_signals(SIGCHLD, -1);
4799 if (r < 0)
4800 return log_error_errno(r, "Failed to reset SIGCHLD: %m");
4801
4802 r = sd_event_new(&event);
4803 if (r < 0)
4804 return log_error_errno(r, "Failed to get default event source: %m");
4805
8fd010bb
LP
4806 (void) sd_event_set_watchdog(event, true);
4807
abdb9b08
LP
4808 if (bus) {
4809 r = sd_bus_attach_event(bus, event, 0);
4810 if (r < 0)
4811 return log_error_errno(r, "Failed to attach bus to event loop: %m");
4812 }
4813
e96ceaba 4814 r = setup_notify_parent(event, notify_socket, PID_TO_PTR(*pid), &notify_event_source);
b0067625
ZJS
4815 if (r < 0)
4816 return r;
4817
4818 /* Let the child know that we are ready and wait that the child is completely ready now. */
c6147113
LP
4819 if (!barrier_place_and_sync(&barrier)) /* #5 */
4820 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
b0067625 4821
38ccb557 4822 /* At this point we have made use of the UID we picked, and thus nss-systemd/systemd-machined.service
b0067625
ZJS
4823 * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
4824 etc_passwd_lock = safe_close(etc_passwd_lock);
4825
04f590a4
LP
4826 (void) sd_notifyf(false,
4827 "STATUS=Container running.\n"
4828 "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
b0067625 4829 if (!arg_notify_ready)
919f5ae0 4830 (void) sd_notify(false, "READY=1\n");
b0067625
ZJS
4831
4832 if (arg_kill_signal > 0) {
4833 /* Try to kill the init system on SIGINT or SIGTERM */
919f5ae0
LP
4834 (void) sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
4835 (void) sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
b0067625
ZJS
4836 } else {
4837 /* Immediately exit */
919f5ae0
LP
4838 (void) sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4839 (void) sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
b0067625
ZJS
4840 }
4841
6916b164 4842 /* Exit when the child exits */
919f5ae0 4843 (void) sd_event_add_signal(event, NULL, SIGCHLD, on_sigchld, PID_TO_PTR(*pid));
b0067625
ZJS
4844
4845 if (arg_expose_ports) {
4846 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, exposed, &rtnl);
4847 if (r < 0)
4848 return r;
4849
4850 (void) expose_port_execute(rtnl, arg_expose_ports, exposed);
4851 }
4852
4853 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4854
3acc84eb
FB
4855 if (arg_console_mode != CONSOLE_PIPE) {
4856 _cleanup_close_ int fd = -1;
4857 PTYForwardFlags flags = 0;
de40a303 4858
3acc84eb
FB
4859 /* Retrieve the master pty allocated by inner child */
4860 fd = receive_one_fd(master_pty_socket_pair[0], 0);
4861 if (fd < 0)
4862 return log_error_errno(fd, "Failed to receive master pty from the inner child: %m");
4863
4864 switch (arg_console_mode) {
de40a303 4865
3acc84eb
FB
4866 case CONSOLE_READ_ONLY:
4867 flags |= PTY_FORWARD_READ_ONLY;
4868
4869 _fallthrough_;
4870
4871 case CONSOLE_INTERACTIVE:
4872 flags |= PTY_FORWARD_IGNORE_VHANGUP;
4873
4874 r = pty_forward_new(event, fd, flags, &forward);
4875 if (r < 0)
4876 return log_error_errno(r, "Failed to create PTY forwarder: %m");
4877
4878 if (arg_console_width != (unsigned) -1 || arg_console_height != (unsigned) -1)
4879 (void) pty_forward_set_width_height(forward,
4880 arg_console_width,
4881 arg_console_height);
4882 break;
4883
4884 default:
4885 assert(arg_console_mode == CONSOLE_PASSIVE);
4886 }
4887
4888 *master = TAKE_FD(fd);
de40a303 4889 }
b0067625
ZJS
4890
4891 r = sd_event_loop(event);
4892 if (r < 0)
4893 return log_error_errno(r, "Failed to run event loop: %m");
4894
de40a303
LP
4895 if (forward) {
4896 char last_char = 0;
b0067625 4897
de40a303
LP
4898 (void) pty_forward_get_last_char(forward, &last_char);
4899 forward = pty_forward_free(forward);
b0067625 4900
de40a303
LP
4901 if (!arg_quiet && last_char != '\n')
4902 putc('\n', stdout);
4903 }
b0067625
ZJS
4904
4905 /* Kill if it is not dead yet anyway */
0bb0a9fa
ZJS
4906 if (!arg_register && !arg_keep_unit && bus)
4907 terminate_scope(bus, arg_machine);
b0067625
ZJS
4908
4909 /* Normally redundant, but better safe than sorry */
c67b0082 4910 (void) kill(*pid, SIGKILL);
b0067625 4911
5b4855ab
DDM
4912 if (arg_private_network) {
4913 /* Move network interfaces back to the parent network namespace. We use `safe_fork`
4914 * to avoid having to move the parent to the child network namespace. */
4915 r = safe_fork(NULL, FORK_RESET_SIGNALS|FORK_DEATHSIG|FORK_WAIT|FORK_LOG, NULL);
4916 if (r < 0)
4917 return r;
4918
4919 if (r == 0) {
4920 _cleanup_close_ int parent_netns_fd = -1;
4921
4922 r = namespace_open(getpid(), NULL, NULL, &parent_netns_fd, NULL, NULL);
4923 if (r < 0) {
4924 log_error_errno(r, "Failed to open parent network namespace: %m");
4925 _exit(EXIT_FAILURE);
4926 }
4927
4928 r = namespace_enter(-1, -1, child_netns_fd, -1, -1);
4929 if (r < 0) {
4930 log_error_errno(r, "Failed to enter child network namespace: %m");
4931 _exit(EXIT_FAILURE);
4932 }
4933
4934 r = move_network_interfaces(parent_netns_fd, arg_network_interfaces);
4935 if (r < 0)
4936 log_error_errno(r, "Failed to move network interfaces back to parent network namespace: %m");
4937
4938 _exit(r < 0 ? EXIT_FAILURE : EXIT_SUCCESS);
4939 }
4940 }
4941
b0067625
ZJS
4942 r = wait_for_container(*pid, &container_status);
4943 *pid = 0;
4944
0bb0a9fa
ZJS
4945 /* Tell machined that we are gone. */
4946 if (bus)
4947 (void) unregister_machine(bus, arg_machine);
4948
b0067625
ZJS
4949 if (r < 0)
4950 /* We failed to wait for the container, or the container exited abnormally. */
4951 return r;
4952 if (r > 0 || container_status == CONTAINER_TERMINATED) {
27e29a1e
ZJS
4953 /* r > 0 → The container exited with a non-zero status.
4954 * As a special case, we need to replace 133 with a different value,
4955 * because 133 is special-cased in the service file to reboot the container.
4956 * otherwise → The container exited with zero status and a reboot was not requested.
4957 */
2a49b612 4958 if (r == EXIT_FORCE_RESTART)
27e29a1e 4959 r = EXIT_FAILURE; /* replace 133 with the general failure code */
b0067625 4960 *ret = r;
b0067625
ZJS
4961 return 0; /* finito */
4962 }
4963
4964 /* CONTAINER_REBOOTED, loop again */
4965
4966 if (arg_keep_unit) {
4967 /* Special handling if we are running as a service: instead of simply
4968 * restarting the machine we want to restart the entire service, so let's
4969 * inform systemd about this with the special exit code 133. The service
4970 * file uses RestartForceExitStatus=133 so that this results in a full
4971 * nspawn restart. This is necessary since we might have cgroup parameters
4972 * set we want to have flushed out. */
2a49b612
ZJS
4973 *ret = EXIT_FORCE_RESTART;
4974 return 0; /* finito */
b0067625
ZJS
4975 }
4976
4977 expose_port_flush(arg_expose_ports, exposed);
4978
4979 (void) remove_veth_links(veth_name, arg_network_veth_extra);
4980 *veth_created = false;
4981 return 1; /* loop again */
4982}
4983
bf428efb 4984static int initialize_rlimits(void) {
bf428efb
LP
4985 /* The default resource limits the kernel passes to PID 1, as per kernel 4.16. Let's pass our container payload
4986 * the same values as the kernel originally passed to PID 1, in order to minimize differences between host and
4987 * container execution environments. */
4988
4989 static const struct rlimit kernel_defaults[_RLIMIT_MAX] = {
4990 [RLIMIT_AS] = { RLIM_INFINITY, RLIM_INFINITY },
4991 [RLIMIT_CORE] = { 0, RLIM_INFINITY },
4992 [RLIMIT_CPU] = { RLIM_INFINITY, RLIM_INFINITY },
4993 [RLIMIT_DATA] = { RLIM_INFINITY, RLIM_INFINITY },
4994 [RLIMIT_FSIZE] = { RLIM_INFINITY, RLIM_INFINITY },
4995 [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY },
4996 [RLIMIT_MEMLOCK] = { 65536, 65536 },
4997 [RLIMIT_MSGQUEUE] = { 819200, 819200 },
4998 [RLIMIT_NICE] = { 0, 0 },
4999 [RLIMIT_NOFILE] = { 1024, 4096 },
5000 [RLIMIT_RSS] = { RLIM_INFINITY, RLIM_INFINITY },
5001 [RLIMIT_RTPRIO] = { 0, 0 },
5002 [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY },
5003 [RLIMIT_STACK] = { 8388608, RLIM_INFINITY },
5004
5005 /* The kernel scales the default for RLIMIT_NPROC and RLIMIT_SIGPENDING based on the system's amount of
5006 * RAM. To provide best compatibility we'll read these limits off PID 1 instead of hardcoding them
5007 * here. This is safe as we know that PID 1 doesn't change these two limits and thus the original
5008 * kernel's initialization should still be valid during runtime — at least if PID 1 is systemd. Note
5009 * that PID 1 changes a number of other resource limits during early initialization which is why we
5010 * don't read the other limits from PID 1 but prefer the static table above. */
5011 };
5012
5013 int rl;
5014
5015 for (rl = 0; rl < _RLIMIT_MAX; rl++) {
bf428efb
LP
5016 /* Let's only fill in what the user hasn't explicitly configured anyway */
5017 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)) == 0) {
5018 const struct rlimit *v;
5019 struct rlimit buffer;
5020
5021 if (IN_SET(rl, RLIMIT_NPROC, RLIMIT_SIGPENDING)) {
5022 /* For these two let's read the limits off PID 1. See above for an explanation. */
5023
5024 if (prlimit(1, rl, NULL, &buffer) < 0)
5025 return log_error_errno(errno, "Failed to read resource limit RLIMIT_%s of PID 1: %m", rlimit_to_string(rl));
5026
5027 v = &buffer;
5028 } else
5029 v = kernel_defaults + rl;
5030
5031 arg_rlimit[rl] = newdup(struct rlimit, v, 1);
5032 if (!arg_rlimit[rl])
5033 return log_oom();
5034 }
5035
5036 if (DEBUG_LOGGING) {
5037 _cleanup_free_ char *k = NULL;
5038
5039 (void) rlimit_format(arg_rlimit[rl], &k);
5040 log_debug("Setting RLIMIT_%s to %s.", rlimit_to_string(rl), k);
5041 }
5042 }
5043
5044 return 0;
5045}
5046
287b7376
LP
5047static int cant_be_in_netns(void) {
5048 union sockaddr_union sa = {
5049 .un = {
5050 .sun_family = AF_UNIX,
5051 .sun_path = "/run/udev/control",
5052 },
5053 };
5054 char udev_path[STRLEN("/proc//ns/net") + DECIMAL_STR_MAX(pid_t)];
5055 _cleanup_free_ char *udev_ns = NULL, *our_ns = NULL;
5056 _cleanup_close_ int fd = -1;
5057 struct ucred ucred;
5058 int r;
5059
5060 /* Check if we are in the same netns as udev. If we aren't, then device monitoring (and thus waiting
5061 * for loopback block devices) won't work, and we will hang. Detect this case and exit early with a
5062 * nice message. */
5063
5064 if (!arg_image) /* only matters if --image= us used, i.e. we actually need to use loopback devices */
5065 return 0;
5066
5067 fd = socket(AF_UNIX, SOCK_SEQPACKET|SOCK_NONBLOCK|SOCK_CLOEXEC, 0);
5068 if (fd < 0)
5069 return log_error_errno(errno, "Failed to allocate udev control socket: %m");
5070
5071 if (connect(fd, &sa.un, SOCKADDR_UN_LEN(sa.un)) < 0) {
5072
5073 if (errno == ENOENT || ERRNO_IS_DISCONNECT(errno))
5074 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
5075 "Sorry, but --image= requires access to the host's /run/ hierarchy, since we need access to udev.");
5076
5077 return log_error_errno(errno, "Failed to connect socket to udev control socket: %m");
5078 }
5079
5080 r = getpeercred(fd, &ucred);
5081 if (r < 0)
5082 return log_error_errno(r, "Failed to determine peer of udev control socket: %m");
5083
5084 xsprintf(udev_path, "/proc/" PID_FMT "/ns/net", ucred.pid);
5085 r = readlink_malloc(udev_path, &udev_ns);
5086 if (r < 0)
5087 return log_error_errno(r, "Failed to read network namespace of udev: %m");
5088
5089 r = readlink_malloc("/proc/self/ns/net", &our_ns);
5090 if (r < 0)
5091 return log_error_errno(r, "Failed to read our own network namespace: %m");
5092
5093 if (!streq(our_ns, udev_ns))
5094 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
5095 "Sorry, but --image= is only supported in the main network namespace, since we need access to udev/AF_NETLINK.");
5096 return 0;
5097}
5098
/* Top-level program logic: parses the command line, performs the preparatory checks, sets up the
 * container root (either a directory tree, or a disk image attached through a loop device and
 * dissected), and then calls run_container() repeatedly for as long as it returns a positive value.
 *
 * Every failure path jumps to the "finish" label, which kills/reaps a still-recorded child, drains the
 * pty master, removes ephemeral copies and the temporary root directory, and frees the argument
 * arrays.
 *
 * Returns r if it is negative, otherwise ret (initialized to EXIT_SUCCESS and handed to
 * run_container() by pointer). */
44dbef90 5099static int run(int argc, char *argv[]) {
7bf011e3
LP
5100 bool secondary = false, remove_directory = false, remove_image = false,
5101 veth_created = false, remove_tmprootdir = false;
2d845785 5102 _cleanup_close_ int master = -1;
03cfe0d5 5103 _cleanup_fdset_free_ FDSet *fds = NULL;
2d845785 5104 int r, n_fd_passed, ret = EXIT_SUCCESS;
5aa3eba5 5105 char veth_name[IFNAMSIZ] = "";
03cfe0d5 5106 union in_addr_union exposed = {};
8e766630 5107 _cleanup_(release_lock_file) LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
c67b0082 5108 char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
2d845785 5109 _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
18b5886e
LP
5110 _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
5111 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
7bf011e3 5112 pid_t pid = 0;
03cfe0d5
LP
5113
5114 log_parse_environment();
5115 log_open();
415fc41c 5116
03cfe0d5
LP
/* Only a positive return value lets us continue; zero and negative both leave through "finish". */
5117 r = parse_argv(argc, argv);
5118 if (r <= 0)
5119 goto finish;
5120
fba868fa
LP
5121 r = must_be_root();
5122 if (r < 0)
03cfe0d5 5123 goto finish;
fba868fa 5124
287b7376
LP
5125 r = cant_be_in_netns();
5126 if (r < 0)
5127 goto finish;
5128
bf428efb
LP
5129 r = initialize_rlimits();
5130 if (r < 0)
5131 goto finish;
5132
de40a303
LP
5133 r = load_oci_bundle();
5134 if (r < 0)
5135 goto finish;
5136
f757855e
LP
5137 r = determine_names();
5138 if (r < 0)
5139 goto finish;
5140
5141 r = load_settings();
5142 if (r < 0)
5143 goto finish;
5144
d4d99bc6 5145 r = cg_unified();
5eee8290
LP
5146 if (r < 0) {
5147 log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
5148 goto finish;
5149 }
5150
f757855e
LP
5151 r = verify_arguments();
5152 if (r < 0)
5153 goto finish;
03cfe0d5 5154
49048684
ZJS
5155 /* Reapply environment settings. */
5156 (void) detect_unified_cgroup_hierarchy_from_environment();
8199d554 5157
2949ff26
LP
5158 /* Ignore SIGPIPE here, because we use splice() on the ptyfwd stuff and that will generate SIGPIPE if
5159 * the result is closed. Note that the container payload child will reset signal mask+handler anyway,
5160 * so just turning this off here means we only turn it off in nspawn itself, not any children. */
5161 (void) ignore_signals(SIGPIPE, -1);
5162
03cfe0d5
LP
5163 n_fd_passed = sd_listen_fds(false);
5164 if (n_fd_passed > 0) {
5165 r = fdset_new_listen_fds(&fds, false);
5166 if (r < 0) {
5167 log_error_errno(r, "Failed to collect file descriptors: %m");
5168 goto finish;
5169 }
5170 }
5171
83e803a9
ZJS
5172 /* The "default" umask. This is appropriate for most file and directory
5173 * operations performed by nspawn, and is the umask that will be used for
5174 * the child. Functions like copy_devnodes() change the umask temporarily. */
5175 umask(0022);
5176
03cfe0d5
LP
/* Directory-based container. arg_directory and arg_image are mutually exclusive here (see the
 * assert below); the image-based case is handled in the else branch further down. */
5177 if (arg_directory) {
5178 assert(!arg_image);
5179
b35ca61a
LP
5180 /* Safety precaution: let's not allow running images from the live host OS image, as long as
5181 * /var from the host will propagate into container dynamically (because bad things happen if
5182 * two systems write to the same /var). Let's allow it for the special cases where /var is
5183 * either copied (i.e. --ephemeral) or replaced (i.e. --volatile=yes|state). */
5184 if (path_equal(arg_directory, "/") && !(arg_ephemeral || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_STATE))) {
5185 log_error("Spawning container on root directory is not supported. Consider using --ephemeral, --volatile=yes or --volatile=state.");
03cfe0d5
LP
5186 r = -EINVAL;
5187 goto finish;
5188 }
5189
/* Ephemeral mode: snapshot the tree under a generated "machine." name and run from that
 * snapshot instead of the original directory. */
5190 if (arg_ephemeral) {
5191 _cleanup_free_ char *np = NULL;
5192
8d4aa2bb 5193 r = chase_symlinks_and_update(&arg_directory, 0);
3f342ec4
LP
5194 if (r < 0)
5195 goto finish;
5196
7bf011e3
LP
5197 /* If the specified path is a mount point we generate the new snapshot immediately
5198 * inside it under a random name. However if the specified is not a mount point we
5199 * create the new snapshot in the parent directory, just next to it. */
e1873695 5200 r = path_is_mount_point(arg_directory, NULL, 0);
03cfe0d5
LP
5201 if (r < 0) {
5202 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
5203 goto finish;
5204 }
5205 if (r > 0)
770b5ce4 5206 r = tempfn_random_child(arg_directory, "machine.", &np);
03cfe0d5 5207 else
770b5ce4 5208 r = tempfn_random(arg_directory, "machine.", &np);
03cfe0d5 5209 if (r < 0) {
0f3be6ca 5210 log_error_errno(r, "Failed to generate name for directory snapshot: %m");
03cfe0d5
LP
5211 goto finish;
5212 }
5213
6992459c 5214 /* We take an exclusive lock on this image, since it's our private, ephemeral copy
162392b7 5215 * only owned by us and no one else. */
6992459c 5216 r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
03cfe0d5
LP
5217 if (r < 0) {
5218 log_error_errno(r, "Failed to lock %s: %m", np);
5219 goto finish;
5220 }
5221
7bf011e3
LP
5222 {
5223 BLOCK_SIGNALS(SIGINT);
5224 r = btrfs_subvol_snapshot(arg_directory, np,
5225 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
5226 BTRFS_SNAPSHOT_FALLBACK_COPY |
5227 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
5228 BTRFS_SNAPSHOT_RECURSIVE |
5229 BTRFS_SNAPSHOT_QUOTA |
5230 BTRFS_SNAPSHOT_SIGINT);
5231 }
5232 if (r == -EINTR) {
5233 log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", np);
5234 goto finish;
5235 }
03cfe0d5
LP
5236 if (r < 0) {
5237 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
5238 goto finish;
ec16945e
LP
5239 }
5240
/* Switch arg_directory over to the snapshot path and remember to delete it at "finish". */
1cc6c93a 5241 free_and_replace(arg_directory, np);
17cbb288 5242 remove_directory = true;
30535c16 5243 } else {
cb638b5e 5244 r = chase_symlinks_and_update(&arg_directory, arg_template ? CHASE_NONEXISTENT : 0);
8d4aa2bb
LP
5245 if (r < 0)
5246 goto finish;
5247
30535c16
LP
5248 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
5249 if (r == -EBUSY) {
5250 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
5251 goto finish;
5252 }
5253 if (r < 0) {
5254 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
476b8254 5255 goto finish;
30535c16
LP
5256 }
5257
/* Populate the directory from the template. -EEXIST from the snapshot call is not
 * treated as an error: it is only logged and we continue with the existing directory. */
5258 if (arg_template) {
8d4aa2bb 5259 r = chase_symlinks_and_update(&arg_template, 0);
3f342ec4
LP
5260 if (r < 0)
5261 goto finish;
5262
7bf011e3
LP
5263 {
5264 BLOCK_SIGNALS(SIGINT);
5265 r = btrfs_subvol_snapshot(arg_template, arg_directory,
5266 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
5267 BTRFS_SNAPSHOT_FALLBACK_COPY |
5268 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
5269 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
5270 BTRFS_SNAPSHOT_RECURSIVE |
5271 BTRFS_SNAPSHOT_QUOTA |
5272 BTRFS_SNAPSHOT_SIGINT);
5273 }
ff6c6cc1
LP
5274 if (r == -EEXIST)
5275 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
5276 "Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
7bf011e3
LP
5277 else if (r == -EINTR) {
5278 log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", arg_directory);
5279 goto finish;
5280 } else if (r < 0) {
83521414 5281 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
30535c16 5282 goto finish;
ff6c6cc1
LP
5283 } else
5284 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
5285 "Populated %s from template %s.", arg_directory, arg_template);
30535c16 5286 }
ec16945e
LP
5287 }
5288
/* Sanity-check the tree (below arg_pivot_root_new, if that is set): in boot mode
 * path_is_os_tree() must return a positive value, otherwise the presence of a /usr/
 * directory is sufficient. */
7732f92b 5289 if (arg_start_mode == START_BOOT) {
a5201ed6 5290 const char *p;
c9fe05e0 5291
a5201ed6
LP
5292 if (arg_pivot_root_new)
5293 p = prefix_roota(arg_directory, arg_pivot_root_new);
5294 else
5295 p = arg_directory;
c9fe05e0
AR
5296
5297 if (path_is_os_tree(p) <= 0) {
5298 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", p);
ec16945e 5299 r = -EINVAL;
1b9e5b12
LP
5300 goto finish;
5301 }
5302 } else {
c9fe05e0
AR
5303 const char *p, *q;
5304
a5201ed6
LP
5305 if (arg_pivot_root_new)
5306 p = prefix_roota(arg_directory, arg_pivot_root_new);
5307 else
5308 p = arg_directory;
c9fe05e0
AR
5309
5310 q = strjoina(p, "/usr/");
1b9e5b12 5311
c9fe05e0
AR
5312 if (laccess(q, F_OK) < 0) {
5313 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", p);
ec16945e 5314 r = -EINVAL;
1b9e5b12 5315 goto finish;
1b9e5b12
LP
5316 }
5317 }
ec16945e 5318
6b9132a9 5319 } else {
/* Image-based container: arg_image must be set and a template is not allowed (asserted below). */
e7cbe5cb 5320 DissectImageFlags dissect_image_flags = DISSECT_IMAGE_REQUIRE_ROOT | DISSECT_IMAGE_RELAX_VAR_CHECK;
ec16945e
LP
5321 assert(arg_image);
5322 assert(!arg_template);
5323
8d4aa2bb 5324 r = chase_symlinks_and_update(&arg_image, 0);
3f342ec4
LP
5325 if (r < 0)
5326 goto finish;
5327
0f3be6ca
LP
5328 if (arg_ephemeral) {
5329 _cleanup_free_ char *np = NULL;
5330
5331 r = tempfn_random(arg_image, "machine.", &np);
5332 if (r < 0) {
5333 log_error_errno(r, "Failed to generate name for image snapshot: %m");
5334 goto finish;
5335 }
5336
6992459c
LP
5337 /* Always take an exclusive lock on our own ephemeral copy. */
5338 r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
0f3be6ca
LP
5339 if (r < 0) {
5340 r = log_error_errno(r, "Failed to create image lock: %m");
5341 goto finish;
5342 }
5343
7bf011e3
LP
5344 {
5345 BLOCK_SIGNALS(SIGINT);
5346 r = copy_file(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600, FS_NOCOW_FL, FS_NOCOW_FL, COPY_REFLINK|COPY_CRTIME|COPY_SIGINT);
5347 }
5348 if (r == -EINTR) {
5349 log_error_errno(r, "Interrupted while copying image file to %s, removed again.", np);
5350 goto finish;
5351 }
0f3be6ca
LP
5352 if (r < 0) {
5353 r = log_error_errno(r, "Failed to copy image file: %m");
5354 goto finish;
5355 }
5356
/* Continue with the private copy as arg_image and remember to unlink it again. */
1cc6c93a 5357 free_and_replace(arg_image, np);
0f3be6ca
LP
5358 remove_image = true;
5359 } else {
5360 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
5361 if (r == -EBUSY) {
5362 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
5363 goto finish;
5364 }
5365 if (r < 0) {
5366 r = log_error_errno(r, "Failed to create image lock: %m");
5367 goto finish;
5368 }
4623e8e6 5369
89e62e0b
LP
5370 r = verity_settings_load(
5371 &arg_verity_settings,
5372 arg_image, NULL, NULL);
e7cbe5cb
LB
5373 if (r < 0) {
5374 log_error_errno(r, "Failed to read verity artefacts for %s: %m", arg_image);
5375 goto finish;
78ebe980 5376 }
89e62e0b
LP
5377
5378 if (arg_verity_settings.data_path)
5379 dissect_image_flags |= DISSECT_IMAGE_NO_PARTITION_TABLE;
30535c16
LP
5380 }
5381
/* Create a temporary directory from the tmprootdir template; a copy of its path becomes
 * arg_directory, and the directory itself is removed with rmdir() at "finish". */
c67b0082 5382 if (!mkdtemp(tmprootdir)) {
0f3be6ca 5383 r = log_error_errno(errno, "Failed to create temporary directory: %m");
6b9132a9 5384 goto finish;
1b9e5b12 5385 }
6b9132a9 5386
c67b0082
LP
5387 remove_tmprootdir = true;
5388
5389 arg_directory = strdup(tmprootdir);
1b9e5b12
LP
5390 if (!arg_directory) {
5391 r = log_oom();
5392 goto finish;
6b9132a9 5393 }
88213476 5394
89e62e0b
LP
/* Attach the image to a loop device, read-only if arg_read_only is set. Partition scanning is
 * requested unless DISSECT_IMAGE_NO_PARTITION_TABLE was set above. */
5395 r = loop_device_make_by_path(
5396 arg_image,
5397 arg_read_only ? O_RDONLY : O_RDWR,
5398 FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN,
5399 &loop);
2d845785
LP
5400 if (r < 0) {
5401 log_error_errno(r, "Failed to set up loopback block device: %m");
842f3b0f
LP
5402 goto finish;
5403 }
1b9e5b12 5404
4526113f 5405 r = dissect_image_and_warn(
e0f9e7bd 5406 loop->fd,
4526113f 5407 arg_image,
89e62e0b 5408 &arg_verity_settings,
18d73705 5409 NULL,
e7cbe5cb 5410 dissect_image_flags,
e0f9e7bd 5411 &dissected_image);
2d845785 5412 if (r == -ENOPKG) {
4526113f 5413 /* dissect_image_and_warn() already printed a brief error message. Extend on that with more details */
2d845785
LP
5414 log_notice("Note that the disk image needs to\n"
5415 " a) either contain only a single MBR partition of type 0x83 that is marked bootable\n"
5416 " b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n"
19ac32cd 5417 " c) or follow https://systemd.io/DISCOVERABLE_PARTITIONS\n"
2d845785
LP
5418 " d) or contain a file system without a partition table\n"
5419 "in order to be bootable with systemd-nspawn.");
1b9e5b12 5420 goto finish;
2d845785 5421 }
4526113f 5422 if (r < 0)
842f3b0f 5423 goto finish;
1b9e5b12 5424
89e62e0b 5425 if (!arg_verity_settings.root_hash && dissected_image->can_verity)
4623e8e6
LP
5426 log_notice("Note: image %s contains verity information, but no root hash specified! Proceeding without integrity checking.", arg_image);
5427
89e62e0b
LP
5428 r = dissected_image_decrypt_interactively(
5429 dissected_image,
5430 NULL,
5431 &arg_verity_settings,
5432 0,
5433 &decrypted_image);
1b9e5b12
LP
5434 if (r < 0)
5435 goto finish;
0f3be6ca
LP
5436
5437 /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */
5438 if (remove_image && unlink(arg_image) >= 0)
5439 remove_image = false;
842f3b0f 5440 }
842f3b0f 5441
86c0dd4a 5442 r = custom_mount_prepare_all(arg_directory, arg_custom_mounts, arg_n_custom_mounts);
5a8af538
LP
5443 if (r < 0)
5444 goto finish;
5445
de40a303
LP
/* No console mode chosen yet (negative value): pick interactive mode only if both stdin and
 * stdout are TTYs, read-only mode otherwise. */
5446 if (arg_console_mode < 0)
5447 arg_console_mode =
5448 isatty(STDIN_FILENO) > 0 &&
5449 isatty(STDOUT_FILENO) > 0 ? CONSOLE_INTERACTIVE : CONSOLE_READ_ONLY;
9c857b9d 5450
de40a303
LP
5451 if (arg_console_mode == CONSOLE_PIPE) /* if we pass STDERR on to the container, don't add our own logs into it too */
5452 arg_quiet = true;
a258bf26 5453
9c857b9d
LP
5454 if (!arg_quiet)
5455 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
5456 arg_machine, arg_image ?: arg_directory);
5457
72c0a2c2 5458 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
a258bf26 5459
66edd963 5460 if (prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) < 0) {
03cfe0d5
LP
5461 r = log_error_errno(errno, "Failed to become subreaper: %m");
5462 goto finish;
5463 }
5464
/* Keep invoking run_container() for as long as it returns a positive value; zero or a negative
 * value ends the loop and falls through to the cleanup below. */
d87be9b0 5465 for (;;) {
3acc84eb 5466 r = run_container(dissected_image,
44dbef90
LP
5467 secondary,
5468 fds,
5469 veth_name, &veth_created,
3acc84eb 5470 &exposed, &master,
44dbef90 5471 &pid, &ret);
b0067625 5472 if (r <= 0)
d87be9b0 5473 break;
d87be9b0 5474 }
88213476
LP
5475
5476finish:
04f590a4
LP
/* Common exit path, reached both by falling out of the loop above and through every
 * "goto finish". */
5477 (void) sd_notify(false,
5478 r == 0 && ret == EXIT_FORCE_RESTART ? "STOPPING=1\nSTATUS=Restarting..." :
5479 "STOPPING=1\nSTATUS=Terminating...");
af4ec430 5480
9444b1f2 5481 if (pid > 0)
c67b0082 5482 (void) kill(pid, SIGKILL);
88213476 5483
503546da 5484 /* Try to flush whatever is still queued in the pty */
6a0f896b 5485 if (master >= 0) {
1c876927 5486 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, 0);
6a0f896b
LP
5487 master = safe_close(master);
5488 }
5489
5490 if (pid > 0)
5491 (void) wait_for_terminate(pid, NULL);
503546da 5492
50ebcf6c
LP
5493 pager_close();
5494
17cbb288 5495 if (remove_directory && arg_directory) {
ec16945e
LP
5496 int k;
5497
17cbb288 5498 k = rm_rf(arg_directory, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
ec16945e 5499 if (k < 0)
17cbb288 5500 log_warning_errno(k, "Cannot remove '%s', ignoring: %m", arg_directory);
ec16945e
LP
5501 }
5502
0f3be6ca
LP
5503 if (remove_image && arg_image) {
5504 if (unlink(arg_image) < 0)
5505 log_warning_errno(errno, "Can't remove image file '%s', ignoring: %m", arg_image);
5506 }
5507
c67b0082
LP
5508 if (remove_tmprootdir) {
5509 if (rmdir(tmprootdir) < 0)
5510 log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir);
5511 }
5512
785890ac
LP
5513 if (arg_machine) {
5514 const char *p;
5515
63c372cb 5516 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
c6878637 5517 (void) rm_rf(p, REMOVE_ROOT);
785890ac
LP
5518 }
5519
7a8f6325 5520 expose_port_flush(arg_expose_ports, &exposed);
7513c5b8
LP
5521
5522 if (veth_created)
5523 (void) remove_veth_links(veth_name, arg_network_veth_extra);
22b28dfd 5524 (void) remove_bridge(arg_network_zone);
f757855e 5525
f757855e
LP
5526 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
5527 expose_port_free_all(arg_expose_ports);
bf428efb 5528 rlimit_free_all(arg_rlimit);
b2645747 5529 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
3652872a 5530 credential_free_all(arg_credentials, arg_n_credentials);
6d0b55c2 5531
44dbef90
LP
/* A negative r takes precedence over ret; otherwise ret is what we return. */
5532 if (r < 0)
5533 return r;
5534
5535 return ret;
88213476 5536}
44dbef90
LP
5537
/* NOTE(review): presumably expands to the program's main(), wrapping run() and treating a positive
 * return value as a failure exit status; the macro is not defined in this file (likely main-func.h),
 * so confirm there. */
5538DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run);