]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
Merge pull request #16929 from ssahani/network-bare-udp
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
53e1b683 1/* SPDX-License-Identifier: LGPL-2.1+ */
88213476 2
349cc4a5 3#if HAVE_BLKID
8fe0087e 4#endif
88213476 5#include <errno.h>
88213476 6#include <getopt.h>
503f480f 7#include <linux/fs.h>
1b9e5b12 8#include <linux/loop.h>
349cc4a5 9#if HAVE_SELINUX
8fe0087e 10#include <selinux/selinux.h>
1b9e5b12 11#endif
8fe0087e 12#include <stdlib.h>
8fe0087e 13#include <sys/file.h>
8fe0087e
LP
14#include <sys/personality.h>
15#include <sys/prctl.h>
16#include <sys/types.h>
6916b164 17#include <sys/wait.h>
8fe0087e 18#include <unistd.h>
1b9e5b12 19
b053cd5f 20#include "sd-bus.h"
1f0cd86b 21#include "sd-daemon.h"
1f0cd86b 22#include "sd-id128.h"
8fe0087e 23
b5efdb8a 24#include "alloc-util.h"
8fe0087e
LP
25#include "barrier.h"
26#include "base-filesystem.h"
27#include "blkid-util.h"
28#include "btrfs-util.h"
b8ea7a6e 29#include "bus-error.h"
b053cd5f 30#include "bus-util.h"
8fe0087e 31#include "cap-list.h"
430f0182 32#include "capability-util.h"
04d391da 33#include "cgroup-util.h"
8fe0087e 34#include "copy.h"
d107bb7d 35#include "cpu-set-util.h"
4fc9982c 36#include "dev-setup.h"
2d845785 37#include "dissect-image.h"
8fe0087e 38#include "env-util.h"
3652872a 39#include "escape.h"
3ffd4af2 40#include "fd-util.h"
842f3b0f 41#include "fdset.h"
a5c32cff 42#include "fileio.h"
f97b34a6 43#include "format-util.h"
f4f15635 44#include "fs-util.h"
1b9e5b12 45#include "gpt.h"
4623e8e6 46#include "hexdecoct.h"
8fe0087e 47#include "hostname-util.h"
910fd145 48#include "id128-util.h"
3652872a 49#include "io-util.h"
8fe0087e 50#include "log.h"
2d845785 51#include "loop-util.h"
8fe0087e 52#include "loopback-setup.h"
1b9cebf6 53#include "machine-image.h"
8fe0087e 54#include "macro.h"
44dbef90 55#include "main-func.h"
f5947a5e 56#include "missing_sched.h"
8fe0087e 57#include "mkdir.h"
4349cd7c 58#include "mount-util.h"
049af8ad 59#include "mountpoint-util.h"
0cb8e3d1 60#include "namespace-util.h"
8fe0087e 61#include "netlink-util.h"
07630cea 62#include "nspawn-cgroup.h"
3652872a 63#include "nspawn-creds.h"
3603efde 64#include "nspawn-def.h"
07630cea
LP
65#include "nspawn-expose-ports.h"
66#include "nspawn-mount.h"
67#include "nspawn-network.h"
de40a303 68#include "nspawn-oci.h"
7336138e 69#include "nspawn-patch-uid.h"
07630cea 70#include "nspawn-register.h"
910fd145 71#include "nspawn-seccomp.h"
07630cea
LP
72#include "nspawn-settings.h"
73#include "nspawn-setuid.h"
7732f92b 74#include "nspawn-stub-pid1.h"
d8b4d14d 75#include "nulstr-util.h"
d58ad743 76#include "os-util.h"
50ebcf6c 77#include "pager.h"
6bedfcbb 78#include "parse-util.h"
8fe0087e 79#include "path-util.h"
294bf0c3 80#include "pretty-print.h"
0b452006 81#include "process-util.h"
8fe0087e
LP
82#include "ptyfwd.h"
83#include "random-util.h"
8869a0b4 84#include "raw-clone.h"
86775e35 85#include "resolve-util.h"
bf428efb 86#include "rlimit-util.h"
8fe0087e 87#include "rm-rf.h"
de40a303
LP
88#if HAVE_SECCOMP
89#include "seccomp-util.h"
90#endif
68b02049 91#include "selinux-util.h"
8fe0087e 92#include "signal-util.h"
2583fbea 93#include "socket-util.h"
8fcde012 94#include "stat-util.h"
15a5e950 95#include "stdio-util.h"
5c828e66 96#include "string-table.h"
07630cea 97#include "string-util.h"
8fe0087e 98#include "strv.h"
de40a303 99#include "sysctl-util.h"
8fe0087e 100#include "terminal-util.h"
e4de7287 101#include "tmpfile-util.h"
affb60b1 102#include "umask-util.h"
43c3fb46 103#include "unit-name.h"
b1d4f8e1 104#include "user-util.h"
8fe0087e 105#include "util.h"
e9642be2 106
e96ceaba
LP
107/* The notify socket inside the container it can use to talk to nspawn using the sd_notify(3) protocol */
108#define NSPAWN_NOTIFY_SOCKET_PATH "/run/host/notify"
0e7ac751 109
2a49b612
ZJS
110#define EXIT_FORCE_RESTART 133
111
113cea80
DH
112typedef enum ContainerStatus {
113 CONTAINER_TERMINATED,
6145bb4f 114 CONTAINER_REBOOTED,
113cea80
DH
115} ContainerStatus;
116
88213476 117static char *arg_directory = NULL;
ec16945e 118static char *arg_template = NULL;
5f932eb9 119static char *arg_chdir = NULL;
b53ede69
PW
120static char *arg_pivot_root_new = NULL;
121static char *arg_pivot_root_old = NULL;
687d0825 122static char *arg_user = NULL;
de40a303
LP
123static uid_t arg_uid = UID_INVALID;
124static gid_t arg_gid = GID_INVALID;
125static gid_t* arg_supplementary_gids = NULL;
126static size_t arg_n_supplementary_gids = 0;
9444b1f2 127static sd_id128_t arg_uuid = {};
3a9530e5
LP
128static char *arg_machine = NULL; /* The name used by the host to refer to this */
129static char *arg_hostname = NULL; /* The name the payload sees by default */
c74e630d
LP
130static const char *arg_selinux_context = NULL;
131static const char *arg_selinux_apifs_context = NULL;
de40a303 132static char *arg_slice = NULL;
ff01d048 133static bool arg_private_network = false;
bc2f673e 134static bool arg_read_only = false;
7732f92b 135static StartMode arg_start_mode = START_PID1;
ec16945e 136static bool arg_ephemeral = false;
57fb9fb5 137static LinkJournal arg_link_journal = LINK_AUTO;
574edc90 138static bool arg_link_journal_try = false;
520e0d54 139static uint64_t arg_caps_retain =
50b52222
LP
140 (1ULL << CAP_AUDIT_CONTROL) |
141 (1ULL << CAP_AUDIT_WRITE) |
5076f0cc
LP
142 (1ULL << CAP_CHOWN) |
143 (1ULL << CAP_DAC_OVERRIDE) |
144 (1ULL << CAP_DAC_READ_SEARCH) |
145 (1ULL << CAP_FOWNER) |
146 (1ULL << CAP_FSETID) |
147 (1ULL << CAP_IPC_OWNER) |
148 (1ULL << CAP_KILL) |
149 (1ULL << CAP_LEASE) |
150 (1ULL << CAP_LINUX_IMMUTABLE) |
50b52222 151 (1ULL << CAP_MKNOD) |
5076f0cc
LP
152 (1ULL << CAP_NET_BIND_SERVICE) |
153 (1ULL << CAP_NET_BROADCAST) |
154 (1ULL << CAP_NET_RAW) |
5076f0cc 155 (1ULL << CAP_SETFCAP) |
50b52222 156 (1ULL << CAP_SETGID) |
5076f0cc
LP
157 (1ULL << CAP_SETPCAP) |
158 (1ULL << CAP_SETUID) |
159 (1ULL << CAP_SYS_ADMIN) |
50b52222 160 (1ULL << CAP_SYS_BOOT) |
5076f0cc
LP
161 (1ULL << CAP_SYS_CHROOT) |
162 (1ULL << CAP_SYS_NICE) |
163 (1ULL << CAP_SYS_PTRACE) |
d87be9b0 164 (1ULL << CAP_SYS_RESOURCE) |
50b52222 165 (1ULL << CAP_SYS_TTY_CONFIG);
de40a303 166static CapabilityQuintet arg_full_capabilities = CAPABILITY_QUINTET_NULL;
5a8af538 167static CustomMount *arg_custom_mounts = NULL;
88614c8a 168static size_t arg_n_custom_mounts = 0;
f4889f65 169static char **arg_setenv = NULL;
284c0b91 170static bool arg_quiet = false;
eb91eb18 171static bool arg_register = true;
89f7c846 172static bool arg_keep_unit = false;
aa28aefe 173static char **arg_network_interfaces = NULL;
c74e630d 174static char **arg_network_macvlan = NULL;
4bbfe7ad 175static char **arg_network_ipvlan = NULL;
69c79d3c 176static bool arg_network_veth = false;
f6d6bad1 177static char **arg_network_veth_extra = NULL;
f757855e 178static char *arg_network_bridge = NULL;
22b28dfd 179static char *arg_network_zone = NULL;
d7bea6b6 180static char *arg_network_namespace_path = NULL;
bb068de0 181static PagerFlags arg_pager_flags = 0;
050f7277 182static unsigned long arg_personality = PERSONALITY_INVALID;
ec16945e 183static char *arg_image = NULL;
de40a303 184static char *arg_oci_bundle = NULL;
f757855e 185static VolatileMode arg_volatile_mode = VOLATILE_NO;
6d0b55c2 186static ExposePort *arg_expose_ports = NULL;
f36933fe 187static char **arg_property = NULL;
de40a303 188static sd_bus_message *arg_property_message = NULL;
0de7acce 189static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
6dac160c 190static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
0de7acce 191static bool arg_userns_chown = false;
c6c8f6e2 192static int arg_kill_signal = 0;
5da38d07 193static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN;
f757855e
LP
194static SettingsMask arg_settings_mask = 0;
195static int arg_settings_trusted = -1;
196static char **arg_parameters = NULL;
6aadfa4c 197static const char *arg_container_service_name = "systemd-nspawn";
9c1e04d0 198static bool arg_notify_ready = false;
5a8ff0e6 199static bool arg_use_cgns = true;
0c582db0 200static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
1099ceeb 201static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_TMPFS_TMP;
4623e8e6 202static void *arg_root_hash = NULL;
e7cbe5cb 203static char *arg_verity_data = NULL;
c2923fdc
LB
204static char *arg_root_hash_sig_path = NULL;
205static void *arg_root_hash_sig = NULL;
206static size_t arg_root_hash_sig_size = 0;
4623e8e6 207static size_t arg_root_hash_size = 0;
6b000af4
LP
208static char **arg_syscall_allow_list = NULL;
209static char **arg_syscall_deny_list = NULL;
de40a303
LP
210#if HAVE_SECCOMP
211static scmp_filter_ctx arg_seccomp = NULL;
212#endif
bf428efb 213static struct rlimit *arg_rlimit[_RLIMIT_MAX] = {};
66edd963 214static bool arg_no_new_privileges = false;
81f345df
LP
215static int arg_oom_score_adjust = 0;
216static bool arg_oom_score_adjust_set = false;
0985c7c4 217static CPUSet arg_cpu_set = {};
09d423e9 218static ResolvConfMode arg_resolv_conf = RESOLV_CONF_AUTO;
1688841f 219static TimezoneMode arg_timezone = TIMEZONE_AUTO;
de40a303
LP
220static unsigned arg_console_width = (unsigned) -1, arg_console_height = (unsigned) -1;
221static DeviceNode* arg_extra_nodes = NULL;
222static size_t arg_n_extra_nodes = 0;
223static char **arg_sysctl = NULL;
224static ConsoleMode arg_console_mode = _CONSOLE_MODE_INVALID;
3652872a
LP
225static Credential *arg_credentials = NULL;
226static size_t arg_n_credentials = 0;
88213476 227
6145bb4f
LP
228STATIC_DESTRUCTOR_REGISTER(arg_directory, freep);
229STATIC_DESTRUCTOR_REGISTER(arg_template, freep);
230STATIC_DESTRUCTOR_REGISTER(arg_chdir, freep);
231STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_new, freep);
232STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_old, freep);
233STATIC_DESTRUCTOR_REGISTER(arg_user, freep);
234STATIC_DESTRUCTOR_REGISTER(arg_supplementary_gids, freep);
235STATIC_DESTRUCTOR_REGISTER(arg_machine, freep);
236STATIC_DESTRUCTOR_REGISTER(arg_hostname, freep);
237STATIC_DESTRUCTOR_REGISTER(arg_slice, freep);
238STATIC_DESTRUCTOR_REGISTER(arg_setenv, strv_freep);
239STATIC_DESTRUCTOR_REGISTER(arg_network_interfaces, strv_freep);
240STATIC_DESTRUCTOR_REGISTER(arg_network_macvlan, strv_freep);
241STATIC_DESTRUCTOR_REGISTER(arg_network_ipvlan, strv_freep);
242STATIC_DESTRUCTOR_REGISTER(arg_network_veth_extra, strv_freep);
243STATIC_DESTRUCTOR_REGISTER(arg_network_bridge, freep);
244STATIC_DESTRUCTOR_REGISTER(arg_network_zone, freep);
245STATIC_DESTRUCTOR_REGISTER(arg_network_namespace_path, freep);
246STATIC_DESTRUCTOR_REGISTER(arg_image, freep);
247STATIC_DESTRUCTOR_REGISTER(arg_oci_bundle, freep);
248STATIC_DESTRUCTOR_REGISTER(arg_property, strv_freep);
249STATIC_DESTRUCTOR_REGISTER(arg_property_message, sd_bus_message_unrefp);
250STATIC_DESTRUCTOR_REGISTER(arg_parameters, strv_freep);
251STATIC_DESTRUCTOR_REGISTER(arg_root_hash, freep);
e7cbe5cb 252STATIC_DESTRUCTOR_REGISTER(arg_verity_data, freep);
c2923fdc
LB
253STATIC_DESTRUCTOR_REGISTER(arg_root_hash_sig_path, freep);
254STATIC_DESTRUCTOR_REGISTER(arg_root_hash_sig, freep);
6b000af4
LP
255STATIC_DESTRUCTOR_REGISTER(arg_syscall_allow_list, strv_freep);
256STATIC_DESTRUCTOR_REGISTER(arg_syscall_deny_list, strv_freep);
6145bb4f
LP
257#if HAVE_SECCOMP
258STATIC_DESTRUCTOR_REGISTER(arg_seccomp, seccomp_releasep);
259#endif
0985c7c4 260STATIC_DESTRUCTOR_REGISTER(arg_cpu_set, cpu_set_reset);
6145bb4f
LP
261STATIC_DESTRUCTOR_REGISTER(arg_sysctl, strv_freep);
262
dce66ffe
ZJS
263static int handle_arg_console(const char *arg) {
264 if (streq(arg, "help")) {
265 puts("interactive\n"
266 "read-only\n"
267 "passive\n"
268 "pipe");
269 return 0;
270 }
271
272 if (streq(arg, "interactive"))
273 arg_console_mode = CONSOLE_INTERACTIVE;
274 else if (streq(arg, "read-only"))
275 arg_console_mode = CONSOLE_READ_ONLY;
276 else if (streq(arg, "passive"))
277 arg_console_mode = CONSOLE_PASSIVE;
278 else if (streq(arg, "pipe"))
279 arg_console_mode = CONSOLE_PIPE;
280 else
281 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown console mode: %s", optarg);
282
283 arg_settings_mask |= SETTING_CONSOLE_MODE;
284 return 1;
285}
286
37ec0fdd
LP
287static int help(void) {
288 _cleanup_free_ char *link = NULL;
289 int r;
290
bb068de0 291 (void) pager_open(arg_pager_flags);
50ebcf6c 292
37ec0fdd
LP
293 r = terminal_urlify_man("systemd-nspawn", "1", &link);
294 if (r < 0)
295 return log_oom();
296
25148653 297 printf("%1$s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
37a92352 298 "%5$sSpawn a command or OS in a light-weight container.%6$s\n\n"
a8828ed9
DW
299 " -h --help Show this help\n"
300 " --version Print version string\n"
69c79d3c 301 " -q --quiet Do not show status information\n"
bb068de0 302 " --no-pager Do not pipe output into a pager\n"
25148653
LP
303 " --settings=BOOLEAN Load additional settings from .nspawn file\n\n"
304 "%3$sImage:%4$s\n"
1b9e5b12 305 " -D --directory=PATH Root directory for the container\n"
ec16945e
LP
306 " --template=PATH Initialize root directory from template directory,\n"
307 " if missing\n"
308 " -x --ephemeral Run container with snapshot of root directory, and\n"
309 " remove it after exit\n"
25e68fd3
LP
310 " -i --image=PATH Root file system disk image (or device node) for\n"
311 " the container\n"
de40a303 312 " --oci-bundle=PATH OCI bundle directory\n"
25148653
LP
313 " --read-only Mount the root directory read-only\n"
314 " --volatile[=MODE] Run the system in volatile mode\n"
25e68fd3 315 " --root-hash=HASH Specify verity root hash for root disk image\n"
c2923fdc
LB
316 " --root-hash-sig=SIG Specify pkcs7 signature of root hash for verity\n"
317 " as a DER encoded PKCS7, either as a path to a file\n"
318 " or as an ASCII base64 encoded string prefixed by\n"
319 " 'base64:'\n"
e7cbe5cb 320 " --verity-data=PATH Specify hash device for verity\n"
25148653
LP
321 " --pivot-root=PATH[:PATH]\n"
322 " Pivot root to given directory in the container\n\n"
323 "%3$sExecution:%4$s\n"
7732f92b 324 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
a8828ed9 325 " -b --boot Boot up full system (i.e. invoke init)\n"
5f932eb9 326 " --chdir=PATH Set working directory in the container\n"
25148653
LP
327 " -E --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
328 " -u --user=USER Run the command under specified user or UID\n"
329 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
330 " --notify-ready=BOOLEAN Receive notifications from the child init process\n\n"
331 "%3$sSystem Identity:%4$s\n"
a8828ed9 332 " -M --machine=NAME Set the machine name for the container\n"
3a9530e5 333 " --hostname=NAME Override the hostname for the container\n"
25148653
LP
334 " --uuid=UUID Set a specific machine UUID for the container\n\n"
335 "%3$sProperties:%4$s\n"
a8828ed9 336 " -S --slice=SLICE Place the container in the specified slice\n"
f36933fe 337 " --property=NAME=VALUE Set scope unit property\n"
25148653
LP
338 " --register=BOOLEAN Register container as machine\n"
339 " --keep-unit Do not register a scope for the machine, reuse\n"
340 " the service unit nspawn is running in\n\n"
341 "%3$sUser Namespacing:%4$s\n"
90b4a64d 342 " -U --private-users=pick Run within user namespace, autoselect UID/GID range\n"
03cfe0d5 343 " --private-users[=UIDBASE[:NUIDS]]\n"
90b4a64d 344 " Similar, but with user configured UID/GID range\n"
25148653
LP
345 " --private-users-chown Adjust OS tree ownership to private UID/GID range\n\n"
346 "%3$sNetworking:%4$s\n"
69c79d3c
LP
347 " --private-network Disable network in container\n"
348 " --network-interface=INTERFACE\n"
349 " Assign an existing network interface to the\n"
350 " container\n"
c74e630d
LP
351 " --network-macvlan=INTERFACE\n"
352 " Create a macvlan network interface based on an\n"
353 " existing network interface to the container\n"
4bbfe7ad
TG
354 " --network-ipvlan=INTERFACE\n"
355 " Create a ipvlan network interface based on an\n"
356 " existing network interface to the container\n"
a8eaaee7 357 " -n --network-veth Add a virtual Ethernet connection between host\n"
69c79d3c 358 " and container\n"
f6d6bad1
LP
359 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
360 " Add an additional virtual Ethernet link between\n"
361 " host and container\n"
ab046dde 362 " --network-bridge=INTERFACE\n"
90b4a64d
ZJS
363 " Add a virtual Ethernet connection to the container\n"
364 " and attach it to an existing bridge on the host\n"
365 " --network-zone=NAME Similar, but attach the new interface to an\n"
366 " an automatically managed bridge interface\n"
d7bea6b6
DP
367 " --network-namespace-path=PATH\n"
368 " Set network namespace to the one represented by\n"
369 " the specified kernel namespace file node\n"
6d0b55c2 370 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
25148653
LP
371 " Expose a container IP port on the host\n\n"
372 "%3$sSecurity:%4$s\n"
a8828ed9
DW
373 " --capability=CAP In addition to the default, retain specified\n"
374 " capability\n"
375 " --drop-capability=CAP Drop the specified capability from the default set\n"
f4e803c8 376 " --no-new-privileges Set PR_SET_NO_NEW_PRIVS flag for container payload\n"
960e4569
LP
377 " --system-call-filter=LIST|~LIST\n"
378 " Permit/prohibit specific system calls\n"
25148653
LP
379 " -Z --selinux-context=SECLABEL\n"
380 " Set the SELinux security context to be used by\n"
381 " processes in the container\n"
382 " -L --selinux-apifs-context=SECLABEL\n"
383 " Set the SELinux security context to be used by\n"
384 " API/tmpfs file systems in the container\n\n"
385 "%3$sResources:%4$s\n"
bf428efb 386 " --rlimit=NAME=LIMIT Set a resource limit for the payload\n"
81f345df
LP
387 " --oom-score-adjust=VALUE\n"
388 " Adjust the OOM score value for the payload\n"
f4e803c8
LP
389 " --cpu-affinity=CPUS Adjust the CPU affinity of the container\n"
390 " --personality=ARCH Pick personality for this container\n\n"
25148653 391 "%3$sIntegration:%4$s\n"
09d423e9 392 " --resolv-conf=MODE Select mode of /etc/resolv.conf initialization\n"
1688841f 393 " --timezone=MODE Select mode of /etc/localtime initialization\n"
25148653
LP
394 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
395 " host, try-guest, try-host\n"
396 " -j Equivalent to --link-journal=try-guest\n\n"
397 "%3$sMounts:%4$s\n"
5e5bfa6e
EY
398 " --bind=PATH[:PATH[:OPTIONS]]\n"
399 " Bind mount a file or directory from the host into\n"
a8828ed9 400 " the container\n"
5e5bfa6e
EY
401 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
402 " Similar, but creates a read-only bind mount\n"
de40a303
LP
403 " --inaccessible=PATH Over-mount file node with inaccessible node to mask\n"
404 " it\n"
06c17c39 405 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
5a8af538
LP
406 " --overlay=PATH[:PATH...]:PATH\n"
407 " Create an overlay mount from the host to \n"
408 " the container\n"
409 " --overlay-ro=PATH[:PATH...]:PATH\n"
25148653
LP
410 " Similar, but creates a read-only overlay mount\n\n"
411 "%3$sInput/Output:%4$s\n"
de40a303
LP
412 " --console=MODE Select how stdin/stdout/stderr and /dev/console are\n"
413 " set up for the container.\n"
3652872a
LP
414 " -P --pipe Equivalent to --console=pipe\n\n"
415 "%3$sCredentials:%4$s\n"
416 " --set-credential=ID:VALUE\n"
417 " Pass a credential with literal value to container.\n"
418 " --load-credential=ID:PATH\n"
419 " Load credential to pass to container from file or\n"
420 " AF_UNIX stream socket.\n"
25148653 421 "\nSee the %2$s for details.\n"
37ec0fdd
LP
422 , program_invocation_short_name
423 , link
37a92352
LP
424 , ansi_underline(), ansi_normal()
425 , ansi_highlight(), ansi_normal()
426 );
37ec0fdd
LP
427
428 return 0;
88213476
LP
429}
430
86c0dd4a 431static int custom_mount_check_all(void) {
88614c8a 432 size_t i;
5a8af538 433
5a8af538
LP
434 for (i = 0; i < arg_n_custom_mounts; i++) {
435 CustomMount *m = &arg_custom_mounts[i];
436
0de7acce 437 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
baaa35ad
ZJS
438 if (arg_userns_chown)
439 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
440 "--private-users-chown may not be combined with custom root mounts.");
441 else if (arg_uid_shift == UID_INVALID)
442 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
443 "--private-users with automatic UID shift may not be combined with custom root mounts.");
825d5287 444 }
5a8af538
LP
445 }
446
447 return 0;
448}
449
8199d554 450static int detect_unified_cgroup_hierarchy_from_environment(void) {
c78c095b 451 const char *e, *var = "SYSTEMD_NSPAWN_UNIFIED_HIERARCHY";
415fc41c 452 int r;
5da38d07 453
efdb0237 454 /* Allow the user to control whether the unified hierarchy is used */
c78c095b
ZJS
455
456 e = getenv(var);
457 if (!e) {
d5fc5b2f 458 /* $UNIFIED_CGROUP_HIERARCHY has been renamed to $SYSTEMD_NSPAWN_UNIFIED_HIERARCHY. */
c78c095b
ZJS
459 var = "UNIFIED_CGROUP_HIERARCHY";
460 e = getenv(var);
c78c095b
ZJS
461 }
462
463 if (!isempty(e)) {
efdb0237
LP
464 r = parse_boolean(e);
465 if (r < 0)
c78c095b 466 return log_error_errno(r, "Failed to parse $%s: %m", var);
5da38d07
TH
467 if (r > 0)
468 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
469 else
470 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237
LP
471 }
472
8199d554
LP
473 return 0;
474}
475
476static int detect_unified_cgroup_hierarchy_from_image(const char *directory) {
477 int r;
478
75b0d8b8
ZJS
479 /* Let's inherit the mode to use from the host system, but let's take into consideration what systemd
480 * in the image actually supports. */
b4cccbc1
LP
481 r = cg_all_unified();
482 if (r < 0)
483 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
484 if (r > 0) {
a8725a06
ZJS
485 /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
486 * routine only detects 231, so we'll have a false negative here for 230. */
487 r = systemd_installation_has_version(directory, 230);
488 if (r < 0)
489 return log_error_errno(r, "Failed to determine systemd version in container: %m");
490 if (r > 0)
491 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
492 else
493 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
c22800e4 494 } else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
2977724b
TH
495 /* Mixed cgroup hierarchy support was added in 233 */
496 r = systemd_installation_has_version(directory, 233);
0fd9563f
ZJS
497 if (r < 0)
498 return log_error_errno(r, "Failed to determine systemd version in container: %m");
499 if (r > 0)
500 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
501 else
502 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
503 } else
5da38d07 504 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237 505
8199d554
LP
506 log_debug("Using %s hierarchy for container.",
507 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_NONE ? "legacy" :
508 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_SYSTEMD ? "hybrid" : "unified");
509
efdb0237
LP
510 return 0;
511}
512
8a99bd0c
ZJS
513static int parse_capability_spec(const char *spec, uint64_t *ret_mask) {
514 uint64_t mask = 0;
515 int r;
516
517 for (;;) {
518 _cleanup_free_ char *t = NULL;
519
520 r = extract_first_word(&spec, &t, ",", 0);
521 if (r < 0)
522 return log_error_errno(r, "Failed to parse capability %s.", t);
523 if (r == 0)
524 break;
525
526 if (streq(t, "help")) {
527 for (int i = 0; i < capability_list_length(); i++) {
528 const char *name;
529
530 name = capability_to_name(i);
531 if (name)
532 puts(name);
533 }
534
535 return 0; /* quit */
536 }
537
538 if (streq(t, "all"))
539 mask = (uint64_t) -1;
540 else {
541 r = capability_from_name(t);
542 if (r < 0)
543 return log_error_errno(r, "Failed to parse capability %s.", t);
544
545 mask |= 1ULL << r;
546 }
547 }
548
549 *ret_mask = mask;
550 return 1; /* continue */
551}
552
49048684 553static int parse_share_ns_env(const char *name, unsigned long ns_flag) {
0c582db0
LB
554 int r;
555
556 r = getenv_bool(name);
557 if (r == -ENXIO)
49048684 558 return 0;
0c582db0 559 if (r < 0)
49048684 560 return log_error_errno(r, "Failed to parse $%s: %m", name);
de40a303 561
0c582db0 562 arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
de40a303 563 arg_settings_mask |= SETTING_CLONE_NS_FLAGS;
49048684 564 return 0;
0c582db0
LB
565}
566
49048684 567static int parse_mount_settings_env(void) {
4f086aab 568 const char *e;
1099ceeb
LP
569 int r;
570
571 r = getenv_bool("SYSTEMD_NSPAWN_TMPFS_TMP");
49048684
ZJS
572 if (r < 0 && r != -ENXIO)
573 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_TMPFS_TMP: %m");
1099ceeb
LP
574 if (r >= 0)
575 SET_FLAG(arg_mount_settings, MOUNT_APPLY_TMPFS_TMP, r > 0);
4f086aab
SU
576
577 e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
49048684 578 if (streq_ptr(e, "network"))
4f086aab 579 arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS;
4f086aab 580
49048684
ZJS
581 else if (e) {
582 r = parse_boolean(e);
583 if (r < 0)
584 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_API_VFS_WRITABLE: %m");
585
586 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_RO, r == 0);
587 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_NETNS, false);
ab8ee0f2 588 }
4f086aab 589
49048684 590 return 0;
4f086aab
SU
591}
592
49048684 593static int parse_environment(void) {
d5455d2f
LP
594 const char *e;
595 int r;
596
49048684
ZJS
597 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
598 if (r < 0)
599 return r;
600 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
601 if (r < 0)
602 return r;
603 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
604 if (r < 0)
605 return r;
606 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
607 if (r < 0)
608 return r;
d5455d2f 609
49048684
ZJS
610 r = parse_mount_settings_env();
611 if (r < 0)
612 return r;
d5455d2f 613
489fae52
ZJS
614 /* SYSTEMD_NSPAWN_USE_CGNS=0 can be used to disable CLONE_NEWCGROUP use,
615 * even if it is supported. If not supported, it has no effect. */
de40a303 616 if (!cg_ns_supported())
489fae52 617 arg_use_cgns = false;
de40a303
LP
618 else {
619 r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
620 if (r < 0) {
621 if (r != -ENXIO)
49048684 622 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_USE_CGNS: %m");
de40a303
LP
623
624 arg_use_cgns = true;
625 } else {
626 arg_use_cgns = r > 0;
627 arg_settings_mask |= SETTING_USE_CGNS;
628 }
629 }
d5455d2f
LP
630
631 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
632 if (e)
633 arg_container_service_name = e;
634
49048684 635 return detect_unified_cgroup_hierarchy_from_environment();
d5455d2f
LP
636}
637
88213476 638static int parse_argv(int argc, char *argv[]) {
a41fe3a2 639 enum {
acbeb427
ZJS
640 ARG_VERSION = 0x100,
641 ARG_PRIVATE_NETWORK,
bc2f673e 642 ARG_UUID,
5076f0cc 643 ARG_READ_ONLY,
57fb9fb5 644 ARG_CAPABILITY,
420c7379 645 ARG_DROP_CAPABILITY,
17fe0523
LP
646 ARG_LINK_JOURNAL,
647 ARG_BIND,
f4889f65 648 ARG_BIND_RO,
06c17c39 649 ARG_TMPFS,
5a8af538
LP
650 ARG_OVERLAY,
651 ARG_OVERLAY_RO,
de40a303 652 ARG_INACCESSIBLE,
eb91eb18 653 ARG_SHARE_SYSTEM,
89f7c846 654 ARG_REGISTER,
aa28aefe 655 ARG_KEEP_UNIT,
69c79d3c 656 ARG_NETWORK_INTERFACE,
c74e630d 657 ARG_NETWORK_MACVLAN,
4bbfe7ad 658 ARG_NETWORK_IPVLAN,
ab046dde 659 ARG_NETWORK_BRIDGE,
22b28dfd 660 ARG_NETWORK_ZONE,
f6d6bad1 661 ARG_NETWORK_VETH_EXTRA,
d7bea6b6 662 ARG_NETWORK_NAMESPACE_PATH,
6afc95b7 663 ARG_PERSONALITY,
4d9f07b4 664 ARG_VOLATILE,
ec16945e 665 ARG_TEMPLATE,
f36933fe 666 ARG_PROPERTY,
6dac160c 667 ARG_PRIVATE_USERS,
c6c8f6e2 668 ARG_KILL_SIGNAL,
f757855e 669 ARG_SETTINGS,
5f932eb9 670 ARG_CHDIR,
b53ede69 671 ARG_PIVOT_ROOT,
7336138e 672 ARG_PRIVATE_USERS_CHOWN,
9c1e04d0 673 ARG_NOTIFY_READY,
4623e8e6 674 ARG_ROOT_HASH,
960e4569 675 ARG_SYSTEM_CALL_FILTER,
bf428efb 676 ARG_RLIMIT,
3a9530e5 677 ARG_HOSTNAME,
66edd963 678 ARG_NO_NEW_PRIVILEGES,
81f345df 679 ARG_OOM_SCORE_ADJUST,
d107bb7d 680 ARG_CPU_AFFINITY,
09d423e9 681 ARG_RESOLV_CONF,
1688841f 682 ARG_TIMEZONE,
de40a303
LP
683 ARG_CONSOLE,
684 ARG_PIPE,
685 ARG_OCI_BUNDLE,
bb068de0 686 ARG_NO_PAGER,
e7cbe5cb 687 ARG_VERITY_DATA,
c2923fdc 688 ARG_ROOT_HASH_SIG,
3652872a
LP
689 ARG_SET_CREDENTIAL,
690 ARG_LOAD_CREDENTIAL,
a41fe3a2
LP
691 };
692
88213476 693 static const struct option options[] = {
d7bea6b6
DP
694 { "help", no_argument, NULL, 'h' },
695 { "version", no_argument, NULL, ARG_VERSION },
696 { "directory", required_argument, NULL, 'D' },
697 { "template", required_argument, NULL, ARG_TEMPLATE },
698 { "ephemeral", no_argument, NULL, 'x' },
699 { "user", required_argument, NULL, 'u' },
700 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
701 { "as-pid2", no_argument, NULL, 'a' },
702 { "boot", no_argument, NULL, 'b' },
703 { "uuid", required_argument, NULL, ARG_UUID },
704 { "read-only", no_argument, NULL, ARG_READ_ONLY },
705 { "capability", required_argument, NULL, ARG_CAPABILITY },
706 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
66edd963 707 { "no-new-privileges", required_argument, NULL, ARG_NO_NEW_PRIVILEGES },
d7bea6b6
DP
708 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
709 { "bind", required_argument, NULL, ARG_BIND },
710 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
711 { "tmpfs", required_argument, NULL, ARG_TMPFS },
712 { "overlay", required_argument, NULL, ARG_OVERLAY },
713 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
de40a303 714 { "inaccessible", required_argument, NULL, ARG_INACCESSIBLE },
d7bea6b6 715 { "machine", required_argument, NULL, 'M' },
3a9530e5 716 { "hostname", required_argument, NULL, ARG_HOSTNAME },
d7bea6b6
DP
717 { "slice", required_argument, NULL, 'S' },
718 { "setenv", required_argument, NULL, 'E' },
719 { "selinux-context", required_argument, NULL, 'Z' },
720 { "selinux-apifs-context", required_argument, NULL, 'L' },
721 { "quiet", no_argument, NULL, 'q' },
722 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */
723 { "register", required_argument, NULL, ARG_REGISTER },
724 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
725 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
726 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
727 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
728 { "network-veth", no_argument, NULL, 'n' },
729 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA },
730 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
731 { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
732 { "network-namespace-path", required_argument, NULL, ARG_NETWORK_NAMESPACE_PATH },
733 { "personality", required_argument, NULL, ARG_PERSONALITY },
734 { "image", required_argument, NULL, 'i' },
735 { "volatile", optional_argument, NULL, ARG_VOLATILE },
736 { "port", required_argument, NULL, 'p' },
737 { "property", required_argument, NULL, ARG_PROPERTY },
738 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
739 { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN },
740 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
741 { "settings", required_argument, NULL, ARG_SETTINGS },
742 { "chdir", required_argument, NULL, ARG_CHDIR },
743 { "pivot-root", required_argument, NULL, ARG_PIVOT_ROOT },
744 { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
745 { "root-hash", required_argument, NULL, ARG_ROOT_HASH },
746 { "system-call-filter", required_argument, NULL, ARG_SYSTEM_CALL_FILTER },
bf428efb 747 { "rlimit", required_argument, NULL, ARG_RLIMIT },
81f345df 748 { "oom-score-adjust", required_argument, NULL, ARG_OOM_SCORE_ADJUST },
d107bb7d 749 { "cpu-affinity", required_argument, NULL, ARG_CPU_AFFINITY },
09d423e9 750 { "resolv-conf", required_argument, NULL, ARG_RESOLV_CONF },
1688841f 751 { "timezone", required_argument, NULL, ARG_TIMEZONE },
de40a303
LP
752 { "console", required_argument, NULL, ARG_CONSOLE },
753 { "pipe", no_argument, NULL, ARG_PIPE },
754 { "oci-bundle", required_argument, NULL, ARG_OCI_BUNDLE },
bb068de0 755 { "no-pager", no_argument, NULL, ARG_NO_PAGER },
e7cbe5cb 756 { "verity-data", required_argument, NULL, ARG_VERITY_DATA },
c2923fdc 757 { "root-hash-sig", required_argument, NULL, ARG_ROOT_HASH_SIG },
3652872a
LP
758 { "set-credential", required_argument, NULL, ARG_SET_CREDENTIAL },
759 { "load-credential", required_argument, NULL, ARG_LOAD_CREDENTIAL },
eb9da376 760 {}
88213476
LP
761 };
762
9444b1f2 763 int c, r;
a42c8b54 764 uint64_t plus = 0, minus = 0;
f757855e 765 bool mask_all_settings = false, mask_no_settings = false;
88213476
LP
766
767 assert(argc >= 0);
768 assert(argv);
769
de40a303 770 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nUE:P", options, NULL)) >= 0)
88213476
LP
771 switch (c) {
772
773 case 'h':
37ec0fdd 774 return help();
88213476 775
acbeb427 776 case ARG_VERSION:
3f6fd1ba 777 return version();
acbeb427 778
88213476 779 case 'D':
0f03c2a4 780 r = parse_path_argument_and_warn(optarg, false, &arg_directory);
ec16945e 781 if (r < 0)
0f03c2a4 782 return r;
de40a303
LP
783
784 arg_settings_mask |= SETTING_DIRECTORY;
ec16945e
LP
785 break;
786
787 case ARG_TEMPLATE:
0f03c2a4 788 r = parse_path_argument_and_warn(optarg, false, &arg_template);
ec16945e 789 if (r < 0)
0f03c2a4 790 return r;
de40a303
LP
791
792 arg_settings_mask |= SETTING_DIRECTORY;
88213476
LP
793 break;
794
1b9e5b12 795 case 'i':
0f03c2a4 796 r = parse_path_argument_and_warn(optarg, false, &arg_image);
ec16945e 797 if (r < 0)
0f03c2a4 798 return r;
de40a303
LP
799
800 arg_settings_mask |= SETTING_DIRECTORY;
801 break;
802
803 case ARG_OCI_BUNDLE:
804 r = parse_path_argument_and_warn(optarg, false, &arg_oci_bundle);
805 if (r < 0)
806 return r;
807
ec16945e
LP
808 break;
809
810 case 'x':
811 arg_ephemeral = true;
a2f577fc 812 arg_settings_mask |= SETTING_EPHEMERAL;
1b9e5b12
LP
813 break;
814
687d0825 815 case 'u':
2fc09a9c
DM
816 r = free_and_strdup(&arg_user, optarg);
817 if (r < 0)
7027ff61 818 return log_oom();
687d0825 819
f757855e 820 arg_settings_mask |= SETTING_USER;
687d0825
MV
821 break;
822
22b28dfd
LP
823 case ARG_NETWORK_ZONE: {
824 char *j;
825
b910cc72 826 j = strjoin("vz-", optarg);
22b28dfd
LP
827 if (!j)
828 return log_oom();
829
830 if (!ifname_valid(j)) {
831 log_error("Network zone name not valid: %s", j);
832 free(j);
833 return -EINVAL;
834 }
835
df1fac6d 836 free_and_replace(arg_network_zone, j);
22b28dfd
LP
837
838 arg_network_veth = true;
839 arg_private_network = true;
840 arg_settings_mask |= SETTING_NETWORK;
841 break;
842 }
843
ab046dde 844 case ARG_NETWORK_BRIDGE:
ef76dff2 845
baaa35ad
ZJS
846 if (!ifname_valid(optarg))
847 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
848 "Bridge interface name not valid: %s", optarg);
ef76dff2 849
f757855e
LP
850 r = free_and_strdup(&arg_network_bridge, optarg);
851 if (r < 0)
852 return log_oom();
ab046dde 853
4831981d 854 _fallthrough_;
0dfaa006 855 case 'n':
69c79d3c
LP
856 arg_network_veth = true;
857 arg_private_network = true;
f757855e 858 arg_settings_mask |= SETTING_NETWORK;
69c79d3c
LP
859 break;
860
f6d6bad1
LP
861 case ARG_NETWORK_VETH_EXTRA:
862 r = veth_extra_parse(&arg_network_veth_extra, optarg);
863 if (r < 0)
864 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
865
866 arg_private_network = true;
867 arg_settings_mask |= SETTING_NETWORK;
868 break;
869
aa28aefe 870 case ARG_NETWORK_INTERFACE:
baaa35ad
ZJS
871 if (!ifname_valid(optarg))
872 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
873 "Network interface name not valid: %s", optarg);
ef76dff2 874
b390f178
DDM
875 r = test_network_interface_initialized(optarg);
876 if (r < 0)
877 return r;
878
c74e630d
LP
879 if (strv_extend(&arg_network_interfaces, optarg) < 0)
880 return log_oom();
881
882 arg_private_network = true;
f757855e 883 arg_settings_mask |= SETTING_NETWORK;
c74e630d
LP
884 break;
885
886 case ARG_NETWORK_MACVLAN:
ef76dff2 887
baaa35ad
ZJS
888 if (!ifname_valid(optarg))
889 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
890 "MACVLAN network interface name not valid: %s", optarg);
ef76dff2 891
b390f178
DDM
892 r = test_network_interface_initialized(optarg);
893 if (r < 0)
894 return r;
895
c74e630d 896 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
897 return log_oom();
898
4bbfe7ad 899 arg_private_network = true;
f757855e 900 arg_settings_mask |= SETTING_NETWORK;
4bbfe7ad
TG
901 break;
902
903 case ARG_NETWORK_IPVLAN:
ef76dff2 904
baaa35ad
ZJS
905 if (!ifname_valid(optarg))
906 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
907 "IPVLAN network interface name not valid: %s", optarg);
ef76dff2 908
b390f178
DDM
909 r = test_network_interface_initialized(optarg);
910 if (r < 0)
911 return r;
912
4bbfe7ad
TG
913 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
914 return log_oom();
915
4831981d 916 _fallthrough_;
ff01d048
LP
917 case ARG_PRIVATE_NETWORK:
918 arg_private_network = true;
f757855e 919 arg_settings_mask |= SETTING_NETWORK;
a41fe3a2
LP
920 break;
921
d7bea6b6
DP
922 case ARG_NETWORK_NAMESPACE_PATH:
923 r = parse_path_argument_and_warn(optarg, false, &arg_network_namespace_path);
924 if (r < 0)
925 return r;
926
de40a303 927 arg_settings_mask |= SETTING_NETWORK;
d7bea6b6
DP
928 break;
929
0f0dbc46 930 case 'b':
baaa35ad
ZJS
931 if (arg_start_mode == START_PID2)
932 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
933 "--boot and --as-pid2 may not be combined.");
7732f92b
LP
934
935 arg_start_mode = START_BOOT;
936 arg_settings_mask |= SETTING_START_MODE;
937 break;
938
939 case 'a':
baaa35ad
ZJS
940 if (arg_start_mode == START_BOOT)
941 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
942 "--boot and --as-pid2 may not be combined.");
7732f92b
LP
943
944 arg_start_mode = START_PID2;
945 arg_settings_mask |= SETTING_START_MODE;
0f0dbc46
LP
946 break;
947
144f0fc0 948 case ARG_UUID:
9444b1f2 949 r = sd_id128_from_string(optarg, &arg_uuid);
317feb4d
LP
950 if (r < 0)
951 return log_error_errno(r, "Invalid UUID: %s", optarg);
952
baaa35ad
ZJS
953 if (sd_id128_is_null(arg_uuid))
954 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
955 "Machine UUID may not be all zeroes.");
f757855e
LP
956
957 arg_settings_mask |= SETTING_MACHINE_ID;
9444b1f2 958 break;
aa96c6cb 959
43c3fb46
LP
960 case 'S': {
961 _cleanup_free_ char *mangled = NULL;
962
963 r = unit_name_mangle_with_suffix(optarg, NULL, UNIT_NAME_MANGLE_WARN, ".slice", &mangled);
de40a303
LP
964 if (r < 0)
965 return log_oom();
966
43c3fb46 967 free_and_replace(arg_slice, mangled);
de40a303 968 arg_settings_mask |= SETTING_SLICE;
144f0fc0 969 break;
43c3fb46 970 }
144f0fc0 971
7027ff61 972 case 'M':
c1521918 973 if (isempty(optarg))
97b11eed 974 arg_machine = mfree(arg_machine);
c1521918 975 else {
baaa35ad
ZJS
976 if (!machine_name_is_valid(optarg))
977 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
978 "Invalid machine name: %s", optarg);
7027ff61 979
0c3c4284
LP
980 r = free_and_strdup(&arg_machine, optarg);
981 if (r < 0)
eb91eb18 982 return log_oom();
eb91eb18 983 }
9ce6d1b3 984 break;
7027ff61 985
3a9530e5
LP
986 case ARG_HOSTNAME:
987 if (isempty(optarg))
988 arg_hostname = mfree(arg_hostname);
989 else {
baaa35ad
ZJS
990 if (!hostname_is_valid(optarg, false))
991 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
992 "Invalid hostname: %s", optarg);
3a9530e5
LP
993
994 r = free_and_strdup(&arg_hostname, optarg);
995 if (r < 0)
996 return log_oom();
997 }
998
999 arg_settings_mask |= SETTING_HOSTNAME;
1000 break;
1001
82adf6af
LP
1002 case 'Z':
1003 arg_selinux_context = optarg;
a8828ed9
DW
1004 break;
1005
82adf6af
LP
1006 case 'L':
1007 arg_selinux_apifs_context = optarg;
a8828ed9
DW
1008 break;
1009
bc2f673e
LP
1010 case ARG_READ_ONLY:
1011 arg_read_only = true;
f757855e 1012 arg_settings_mask |= SETTING_READ_ONLY;
bc2f673e
LP
1013 break;
1014
420c7379
LP
1015 case ARG_CAPABILITY:
1016 case ARG_DROP_CAPABILITY: {
8a99bd0c
ZJS
1017 uint64_t m;
1018 r = parse_capability_spec(optarg, &m);
1019 if (r <= 0)
1020 return r;
5076f0cc 1021
8a99bd0c
ZJS
1022 if (c == ARG_CAPABILITY)
1023 plus |= m;
1024 else
1025 minus |= m;
f757855e 1026 arg_settings_mask |= SETTING_CAPABILITY;
5076f0cc
LP
1027 break;
1028 }
66edd963
LP
1029 case ARG_NO_NEW_PRIVILEGES:
1030 r = parse_boolean(optarg);
1031 if (r < 0)
1032 return log_error_errno(r, "Failed to parse --no-new-privileges= argument: %s", optarg);
1033
1034 arg_no_new_privileges = r;
1035 arg_settings_mask |= SETTING_NO_NEW_PRIVILEGES;
1036 break;
1037
57fb9fb5
LP
1038 case 'j':
1039 arg_link_journal = LINK_GUEST;
574edc90 1040 arg_link_journal_try = true;
4e1d6aa9 1041 arg_settings_mask |= SETTING_LINK_JOURNAL;
57fb9fb5
LP
1042 break;
1043
1044 case ARG_LINK_JOURNAL:
4e1d6aa9 1045 r = parse_link_journal(optarg, &arg_link_journal, &arg_link_journal_try);
c6147113
LP
1046 if (r < 0)
1047 return log_error_errno(r, "Failed to parse link journal mode %s", optarg);
57fb9fb5 1048
4e1d6aa9 1049 arg_settings_mask |= SETTING_LINK_JOURNAL;
57fb9fb5
LP
1050 break;
1051
17fe0523 1052 case ARG_BIND:
f757855e
LP
1053 case ARG_BIND_RO:
1054 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
1055 if (r < 0)
1056 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
17fe0523 1057
f757855e 1058 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
17fe0523 1059 break;
06c17c39 1060
f757855e
LP
1061 case ARG_TMPFS:
1062 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
1063 if (r < 0)
1064 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
5a8af538 1065
f757855e 1066 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
5a8af538 1067 break;
5a8af538
LP
1068
1069 case ARG_OVERLAY:
ad85779a
LP
1070 case ARG_OVERLAY_RO:
1071 r = overlay_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_OVERLAY_RO);
1072 if (r == -EADDRNOTAVAIL)
1073 return log_error_errno(r, "--overlay(-ro)= needs at least two colon-separated directories specified.");
1074 if (r < 0)
1075 return log_error_errno(r, "Failed to parse --overlay(-ro)= argument %s: %m", optarg);
06c17c39 1076
f757855e 1077 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
06c17c39 1078 break;
06c17c39 1079
de40a303
LP
1080 case ARG_INACCESSIBLE:
1081 r = inaccessible_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
1082 if (r < 0)
1083 return log_error_errno(r, "Failed to parse --inaccessible= argument %s: %m", optarg);
1084
1085 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1086 break;
1087
a5f1cb3b 1088 case 'E': {
f4889f65
LP
1089 char **n;
1090
baaa35ad
ZJS
1091 if (!env_assignment_is_valid(optarg))
1092 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1093 "Environment variable assignment '%s' is not valid.", optarg);
f4889f65
LP
1094
1095 n = strv_env_set(arg_setenv, optarg);
1096 if (!n)
1097 return log_oom();
1098
130d3d22 1099 strv_free_and_replace(arg_setenv, n);
f757855e 1100 arg_settings_mask |= SETTING_ENVIRONMENT;
f4889f65
LP
1101 break;
1102 }
1103
284c0b91
LP
1104 case 'q':
1105 arg_quiet = true;
1106 break;
1107
8a96d94e 1108 case ARG_SHARE_SYSTEM:
a6b5216c 1109 /* We don't officially support this anymore, except for compat reasons. People should use the
0c582db0 1110 * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
63d1c29f 1111 log_warning("Please do not use --share-system anymore, use $SYSTEMD_NSPAWN_SHARE_* instead.");
0c582db0 1112 arg_clone_ns_flags = 0;
8a96d94e
LP
1113 break;
1114
eb91eb18
LP
1115 case ARG_REGISTER:
1116 r = parse_boolean(optarg);
1117 if (r < 0) {
1118 log_error("Failed to parse --register= argument: %s", optarg);
1119 return r;
1120 }
1121
1122 arg_register = r;
1123 break;
1124
89f7c846
LP
1125 case ARG_KEEP_UNIT:
1126 arg_keep_unit = true;
1127 break;
1128
6afc95b7
LP
1129 case ARG_PERSONALITY:
1130
ac45f971 1131 arg_personality = personality_from_string(optarg);
baaa35ad
ZJS
1132 if (arg_personality == PERSONALITY_INVALID)
1133 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1134 "Unknown or unsupported personality '%s'.", optarg);
6afc95b7 1135
f757855e 1136 arg_settings_mask |= SETTING_PERSONALITY;
6afc95b7
LP
1137 break;
1138
4d9f07b4
LP
1139 case ARG_VOLATILE:
1140
1141 if (!optarg)
f757855e 1142 arg_volatile_mode = VOLATILE_YES;
5c828e66
LP
1143 else if (streq(optarg, "help")) {
1144 DUMP_STRING_TABLE(volatile_mode, VolatileMode, _VOLATILE_MODE_MAX);
1145 return 0;
1146 } else {
f757855e 1147 VolatileMode m;
4d9f07b4 1148
f757855e 1149 m = volatile_mode_from_string(optarg);
baaa35ad
ZJS
1150 if (m < 0)
1151 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1152 "Failed to parse --volatile= argument: %s", optarg);
1153 else
f757855e 1154 arg_volatile_mode = m;
6d0b55c2
LP
1155 }
1156
f757855e
LP
1157 arg_settings_mask |= SETTING_VOLATILE_MODE;
1158 break;
6d0b55c2 1159
f757855e
LP
1160 case 'p':
1161 r = expose_port_parse(&arg_expose_ports, optarg);
1162 if (r == -EEXIST)
1163 return log_error_errno(r, "Duplicate port specification: %s", optarg);
1164 if (r < 0)
1165 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
6d0b55c2 1166
f757855e 1167 arg_settings_mask |= SETTING_EXPOSE_PORTS;
6d0b55c2 1168 break;
6d0b55c2 1169
f36933fe
LP
1170 case ARG_PROPERTY:
1171 if (strv_extend(&arg_property, optarg) < 0)
1172 return log_oom();
1173
1174 break;
1175
ae209204
ZJS
1176 case ARG_PRIVATE_USERS: {
1177 int boolean = -1;
0de7acce 1178
ae209204
ZJS
1179 if (!optarg)
1180 boolean = true;
1181 else if (!in_charset(optarg, DIGITS))
1182 /* do *not* parse numbers as booleans */
1183 boolean = parse_boolean(optarg);
1184
1185 if (boolean == false) {
0de7acce
LP
1186 /* no: User namespacing off */
1187 arg_userns_mode = USER_NAMESPACE_NO;
1188 arg_uid_shift = UID_INVALID;
1189 arg_uid_range = UINT32_C(0x10000);
ae209204 1190 } else if (boolean == true) {
0de7acce
LP
1191 /* yes: User namespacing on, UID range is read from root dir */
1192 arg_userns_mode = USER_NAMESPACE_FIXED;
1193 arg_uid_shift = UID_INVALID;
1194 arg_uid_range = UINT32_C(0x10000);
1195 } else if (streq(optarg, "pick")) {
1196 /* pick: User namespacing on, UID range is picked randomly */
1197 arg_userns_mode = USER_NAMESPACE_PICK;
1198 arg_uid_shift = UID_INVALID;
1199 arg_uid_range = UINT32_C(0x10000);
1200 } else {
6c2058b3 1201 _cleanup_free_ char *buffer = NULL;
6dac160c
LP
1202 const char *range, *shift;
1203
0de7acce
LP
1204 /* anything else: User namespacing on, UID range is explicitly configured */
1205
6dac160c
LP
1206 range = strchr(optarg, ':');
1207 if (range) {
6c2058b3
ZJS
1208 buffer = strndup(optarg, range - optarg);
1209 if (!buffer)
1210 return log_oom();
1211 shift = buffer;
6dac160c
LP
1212
1213 range++;
bfd292ec
ZJS
1214 r = safe_atou32(range, &arg_uid_range);
1215 if (r < 0)
be715731 1216 return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range);
6dac160c
LP
1217 } else
1218 shift = optarg;
1219
be715731
ZJS
1220 r = parse_uid(shift, &arg_uid_shift);
1221 if (r < 0)
1222 return log_error_errno(r, "Failed to parse UID \"%s\": %m", optarg);
0de7acce
LP
1223
1224 arg_userns_mode = USER_NAMESPACE_FIXED;
6dac160c
LP
1225 }
1226
baaa35ad
ZJS
1227 if (arg_uid_range <= 0)
1228 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1229 "UID range cannot be 0.");
be715731 1230
0de7acce 1231 arg_settings_mask |= SETTING_USERNS;
6dac160c 1232 break;
ae209204 1233 }
6dac160c 1234
0de7acce 1235 case 'U':
ccabee0d
LP
1236 if (userns_supported()) {
1237 arg_userns_mode = USER_NAMESPACE_PICK;
1238 arg_uid_shift = UID_INVALID;
1239 arg_uid_range = UINT32_C(0x10000);
1240
1241 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
1242 }
1243
7336138e
LP
1244 break;
1245
0de7acce 1246 case ARG_PRIVATE_USERS_CHOWN:
19aac838 1247 arg_userns_chown = true;
0de7acce
LP
1248
1249 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
1250 break;
1251
c6c8f6e2 1252 case ARG_KILL_SIGNAL:
5c828e66
LP
1253 if (streq(optarg, "help")) {
1254 DUMP_STRING_TABLE(signal, int, _NSIG);
1255 return 0;
1256 }
1257
29a3db75 1258 arg_kill_signal = signal_from_string(optarg);
baaa35ad
ZJS
1259 if (arg_kill_signal < 0)
1260 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1261 "Cannot parse signal: %s", optarg);
c6c8f6e2 1262
f757855e
LP
1263 arg_settings_mask |= SETTING_KILL_SIGNAL;
1264 break;
1265
1266 case ARG_SETTINGS:
1267
1268 /* no → do not read files
1269 * yes → read files, do not override cmdline, trust only subset
1270 * override → read files, override cmdline, trust only subset
1271 * trusted → read files, do not override cmdline, trust all
1272 */
1273
1274 r = parse_boolean(optarg);
1275 if (r < 0) {
1276 if (streq(optarg, "trusted")) {
1277 mask_all_settings = false;
1278 mask_no_settings = false;
1279 arg_settings_trusted = true;
1280
1281 } else if (streq(optarg, "override")) {
1282 mask_all_settings = false;
1283 mask_no_settings = true;
1284 arg_settings_trusted = -1;
1285 } else
1286 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
1287 } else if (r > 0) {
1288 /* yes */
1289 mask_all_settings = false;
1290 mask_no_settings = false;
1291 arg_settings_trusted = -1;
1292 } else {
1293 /* no */
1294 mask_all_settings = true;
1295 mask_no_settings = false;
1296 arg_settings_trusted = false;
1297 }
1298
c6c8f6e2
LP
1299 break;
1300
5f932eb9 1301 case ARG_CHDIR:
baaa35ad
ZJS
1302 if (!path_is_absolute(optarg))
1303 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1304 "Working directory %s is not an absolute path.", optarg);
5f932eb9
LP
1305
1306 r = free_and_strdup(&arg_chdir, optarg);
1307 if (r < 0)
1308 return log_oom();
1309
1310 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
1311 break;
1312
b53ede69
PW
1313 case ARG_PIVOT_ROOT:
1314 r = pivot_root_parse(&arg_pivot_root_new, &arg_pivot_root_old, optarg);
1315 if (r < 0)
1316 return log_error_errno(r, "Failed to parse --pivot-root= argument %s: %m", optarg);
1317
1318 arg_settings_mask |= SETTING_PIVOT_ROOT;
1319 break;
1320
9c1e04d0
AP
1321 case ARG_NOTIFY_READY:
1322 r = parse_boolean(optarg);
baaa35ad
ZJS
1323 if (r < 0)
1324 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1325 "%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
9c1e04d0
AP
1326 arg_notify_ready = r;
1327 arg_settings_mask |= SETTING_NOTIFY_READY;
1328 break;
1329
4623e8e6
LP
1330 case ARG_ROOT_HASH: {
1331 void *k;
1332 size_t l;
1333
1334 r = unhexmem(optarg, strlen(optarg), &k, &l);
1335 if (r < 0)
1336 return log_error_errno(r, "Failed to parse root hash: %s", optarg);
1337 if (l < sizeof(sd_id128_t)) {
4623e8e6 1338 free(k);
c6147113 1339 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Root hash must be at least 128bit long: %s", optarg);
4623e8e6
LP
1340 }
1341
1342 free(arg_root_hash);
1343 arg_root_hash = k;
1344 arg_root_hash_size = l;
1345 break;
1346 }
1347
e7cbe5cb
LB
1348 case ARG_VERITY_DATA:
1349 r = parse_path_argument_and_warn(optarg, false, &arg_verity_data);
1350 if (r < 0)
1351 return r;
1352 break;
1353
c2923fdc
LB
1354 case ARG_ROOT_HASH_SIG: {
1355 char *value;
1356
1357 if ((value = startswith(optarg, "base64:"))) {
1358 void *p;
1359 size_t l;
1360
1361 r = unbase64mem(value, strlen(value), &p, &l);
1362 if (r < 0)
1363 return log_error_errno(r, "Failed to parse root hash signature '%s': %m", optarg);
1364
1365 free_and_replace(arg_root_hash_sig, p);
1366 arg_root_hash_sig_size = l;
1367 arg_root_hash_sig_path = mfree(arg_root_hash_sig_path);
1368 } else {
1369 r = parse_path_argument_and_warn(optarg, false, &arg_root_hash_sig_path);
1370 if (r < 0)
1371 return r;
1372 arg_root_hash_sig = mfree(arg_root_hash_sig);
1373 arg_root_hash_sig_size = 0;
1374 }
1375
1376 break;
1377 }
1378
960e4569
LP
1379 case ARG_SYSTEM_CALL_FILTER: {
1380 bool negative;
1381 const char *items;
1382
1383 negative = optarg[0] == '~';
1384 items = negative ? optarg + 1 : optarg;
1385
1386 for (;;) {
1387 _cleanup_free_ char *word = NULL;
1388
1389 r = extract_first_word(&items, &word, NULL, 0);
1390 if (r == 0)
1391 break;
1392 if (r == -ENOMEM)
1393 return log_oom();
1394 if (r < 0)
1395 return log_error_errno(r, "Failed to parse system call filter: %m");
1396
1397 if (negative)
6b000af4 1398 r = strv_extend(&arg_syscall_deny_list, word);
960e4569 1399 else
6b000af4 1400 r = strv_extend(&arg_syscall_allow_list, word);
960e4569
LP
1401 if (r < 0)
1402 return log_oom();
1403 }
1404
1405 arg_settings_mask |= SETTING_SYSCALL_FILTER;
1406 break;
1407 }
1408
bf428efb
LP
1409 case ARG_RLIMIT: {
1410 const char *eq;
622ecfa8 1411 _cleanup_free_ char *name = NULL;
bf428efb
LP
1412 int rl;
1413
5c828e66
LP
1414 if (streq(optarg, "help")) {
1415 DUMP_STRING_TABLE(rlimit, int, _RLIMIT_MAX);
1416 return 0;
1417 }
1418
bf428efb 1419 eq = strchr(optarg, '=');
baaa35ad
ZJS
1420 if (!eq)
1421 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1422 "--rlimit= expects an '=' assignment.");
bf428efb
LP
1423
1424 name = strndup(optarg, eq - optarg);
1425 if (!name)
1426 return log_oom();
1427
1428 rl = rlimit_from_string_harder(name);
baaa35ad
ZJS
1429 if (rl < 0)
1430 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1431 "Unknown resource limit: %s", name);
bf428efb
LP
1432
1433 if (!arg_rlimit[rl]) {
1434 arg_rlimit[rl] = new0(struct rlimit, 1);
1435 if (!arg_rlimit[rl])
1436 return log_oom();
1437 }
1438
1439 r = rlimit_parse(rl, eq + 1, arg_rlimit[rl]);
1440 if (r < 0)
1441 return log_error_errno(r, "Failed to parse resource limit: %s", eq + 1);
1442
1443 arg_settings_mask |= SETTING_RLIMIT_FIRST << rl;
1444 break;
1445 }
1446
81f345df
LP
1447 case ARG_OOM_SCORE_ADJUST:
1448 r = parse_oom_score_adjust(optarg, &arg_oom_score_adjust);
1449 if (r < 0)
1450 return log_error_errno(r, "Failed to parse --oom-score-adjust= parameter: %s", optarg);
1451
1452 arg_oom_score_adjust_set = true;
1453 arg_settings_mask |= SETTING_OOM_SCORE_ADJUST;
1454 break;
1455
d107bb7d 1456 case ARG_CPU_AFFINITY: {
0985c7c4 1457 CPUSet cpuset;
d107bb7d
LP
1458
1459 r = parse_cpu_set(optarg, &cpuset);
1460 if (r < 0)
0985c7c4 1461 return log_error_errno(r, "Failed to parse CPU affinity mask %s: %m", optarg);
d107bb7d 1462
0985c7c4
ZJS
1463 cpu_set_reset(&arg_cpu_set);
1464 arg_cpu_set = cpuset;
d107bb7d
LP
1465 arg_settings_mask |= SETTING_CPU_AFFINITY;
1466 break;
1467 }
1468
09d423e9
LP
1469 case ARG_RESOLV_CONF:
1470 if (streq(optarg, "help")) {
1471 DUMP_STRING_TABLE(resolv_conf_mode, ResolvConfMode, _RESOLV_CONF_MODE_MAX);
1472 return 0;
1473 }
1474
1475 arg_resolv_conf = resolv_conf_mode_from_string(optarg);
baaa35ad
ZJS
1476 if (arg_resolv_conf < 0)
1477 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1478 "Failed to parse /etc/resolv.conf mode: %s", optarg);
09d423e9
LP
1479
1480 arg_settings_mask |= SETTING_RESOLV_CONF;
1481 break;
1482
1688841f
LP
1483 case ARG_TIMEZONE:
1484 if (streq(optarg, "help")) {
1485 DUMP_STRING_TABLE(timezone_mode, TimezoneMode, _TIMEZONE_MODE_MAX);
1486 return 0;
1487 }
1488
1489 arg_timezone = timezone_mode_from_string(optarg);
baaa35ad
ZJS
1490 if (arg_timezone < 0)
1491 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1492 "Failed to parse /etc/localtime mode: %s", optarg);
1688841f
LP
1493
1494 arg_settings_mask |= SETTING_TIMEZONE;
1495 break;
1496
de40a303 1497 case ARG_CONSOLE:
dce66ffe
ZJS
1498 r = handle_arg_console(optarg);
1499 if (r <= 0)
1500 return r;
de40a303
LP
1501 break;
1502
1503 case 'P':
1504 case ARG_PIPE:
dce66ffe
ZJS
1505 r = handle_arg_console("pipe");
1506 if (r <= 0)
1507 return r;
de40a303
LP
1508 break;
1509
bb068de0
ZJS
1510 case ARG_NO_PAGER:
1511 arg_pager_flags |= PAGER_DISABLE;
1512 break;
1513
3652872a
LP
1514 case ARG_SET_CREDENTIAL: {
1515 _cleanup_free_ char *word = NULL, *data = NULL;
1516 const char *p = optarg;
1517 Credential *a;
1518 size_t i;
1519 int l;
1520
1521 r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
1522 if (r == -ENOMEM)
1523 return log_oom();
1524 if (r < 0)
1525 return log_error_errno(r, "Failed to parse --set-credential= parameter: %m");
1526 if (r == 0 || !p)
1527 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Missing value for --set-credential=: %s", optarg);
1528
1529 if (!credential_name_valid(word))
1530 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Credential name is not valid: %s", word);
1531
1532 for (i = 0; i < arg_n_credentials; i++)
1533 if (streq(arg_credentials[i].id, word))
1534 return log_error_errno(SYNTHETIC_ERRNO(EEXIST), "Duplicate credential '%s', refusing.", word);
1535
1536 l = cunescape(p, UNESCAPE_ACCEPT_NUL, &data);
1537 if (l < 0)
1538 return log_error_errno(l, "Failed to unescape credential data: %s", p);
1539
1540 a = reallocarray(arg_credentials, arg_n_credentials + 1, sizeof(Credential));
1541 if (!a)
1542 return log_oom();
1543
1544 a[arg_n_credentials++] = (Credential) {
1545 .id = TAKE_PTR(word),
1546 .data = TAKE_PTR(data),
1547 .size = l,
1548 };
1549
1550 arg_credentials = a;
1551
1552 arg_settings_mask |= SETTING_CREDENTIALS;
1553 break;
1554 }
1555
1556 case ARG_LOAD_CREDENTIAL: {
1557 ReadFullFileFlags flags = READ_FULL_FILE_SECURE;
1558 _cleanup_(erase_and_freep) char *data = NULL;
1559 _cleanup_free_ char *word = NULL, *j = NULL;
1560 const char *p = optarg;
1561 Credential *a;
1562 size_t size, i;
1563
1564 r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
1565 if (r == -ENOMEM)
1566 return log_oom();
1567 if (r < 0)
1568 return log_error_errno(r, "Failed to parse --set-credential= parameter: %m");
1569 if (r == 0 || !p)
1570 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Missing value for --set-credential=: %s", optarg);
1571
1572 if (!credential_name_valid(word))
1573 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Credential name is not valid: %s", word);
1574
1575 for (i = 0; i < arg_n_credentials; i++)
1576 if (streq(arg_credentials[i].id, word))
1577 return log_error_errno(SYNTHETIC_ERRNO(EEXIST), "Duplicate credential '%s', refusing.", word);
1578
1579 if (path_is_absolute(p))
1580 flags |= READ_FULL_FILE_CONNECT_SOCKET;
1581 else {
1582 const char *e;
1583
1584 e = getenv("CREDENTIALS_DIRECTORY");
1585 if (!e)
1586 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Credential not available (no credentials passed at all): %s", word);
1587
1588 j = path_join(e, p);
1589 if (!j)
1590 return log_oom();
1591 }
1592
1593 r = read_full_file_full(AT_FDCWD, j ?: p, flags, &data, &size);
1594 if (r < 0)
1595 return log_error_errno(r, "Failed to read credential '%s': %m", j ?: p);
1596
1597 a = reallocarray(arg_credentials, arg_n_credentials + 1, sizeof(Credential));
1598 if (!a)
1599 return log_oom();
1600
1601 a[arg_n_credentials++] = (Credential) {
1602 .id = TAKE_PTR(word),
1603 .data = TAKE_PTR(data),
1604 .size = size,
1605 };
1606
1607 arg_credentials = a;
1608
1609 arg_settings_mask |= SETTING_CREDENTIALS;
1610 break;
1611 }
1612
88213476
LP
1613 case '?':
1614 return -EINVAL;
1615
1616 default:
eb9da376 1617 assert_not_reached("Unhandled option");
88213476 1618 }
88213476 1619
60f1ec13
LP
1620 if (argc > optind) {
1621 strv_free(arg_parameters);
1622 arg_parameters = strv_copy(argv + optind);
1623 if (!arg_parameters)
1624 return log_oom();
d7bea6b6 1625
60f1ec13
LP
1626 arg_settings_mask |= SETTING_START_MODE;
1627 }
1628
1629 if (arg_ephemeral && arg_template && !arg_directory)
1630 /* User asked for ephemeral execution but specified --template= instead of --directory=. Semantically
1631 * such an invocation makes some sense, see https://github.com/systemd/systemd/issues/3667. Let's
1632 * accept this here, and silently make "--ephemeral --template=" equivalent to "--ephemeral
1633 * --directory=". */
1634 arg_directory = TAKE_PTR(arg_template);
1635
bd4b15f2 1636 arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? UINT64_C(1) << CAP_NET_ADMIN : 0)) & ~minus;
60f1ec13 1637
de40a303 1638 /* Make sure to parse environment before we reset the settings mask below */
49048684
ZJS
1639 r = parse_environment();
1640 if (r < 0)
1641 return r;
de40a303 1642
60f1ec13
LP
1643 /* Load all settings from .nspawn files */
1644 if (mask_no_settings)
1645 arg_settings_mask = 0;
1646
1647 /* Don't load any settings from .nspawn files */
1648 if (mask_all_settings)
1649 arg_settings_mask = _SETTINGS_MASK_ALL;
1650
1651 return 1;
1652}
1653
1654static int verify_arguments(void) {
1655 int r;
a6b5216c 1656
75b0d8b8
ZJS
1657 if (arg_start_mode == START_PID2 && arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
1658 /* If we are running the stub init in the container, we don't need to look at what the init
1659 * in the container supports, because we are not using it. Let's immediately pick the right
1660 * setting based on the host system configuration.
1661 *
1662 * We only do this, if the user didn't use an environment variable to override the detection.
1663 */
1664
1665 r = cg_all_unified();
1666 if (r < 0)
1667 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
1668 if (r > 0)
1669 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
1670 else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0)
1671 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
1672 else
1673 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
1674 }
1675
4f086aab
SU
1676 if (arg_userns_mode != USER_NAMESPACE_NO)
1677 arg_mount_settings |= MOUNT_USE_USERNS;
1678
1679 if (arg_private_network)
1680 arg_mount_settings |= MOUNT_APPLY_APIVFS_NETNS;
1681
48a8d337
LB
1682 if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
1683 !(arg_clone_ns_flags & CLONE_NEWUTS)) {
eb91eb18 1684 arg_register = false;
baaa35ad 1685 if (arg_start_mode != START_PID1)
60f1ec13 1686 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--boot cannot be used without namespacing.");
0c582db0 1687 }
eb91eb18 1688
0de7acce 1689 if (arg_userns_mode == USER_NAMESPACE_PICK)
0e7ac751
LP
1690 arg_userns_chown = true;
1691
60f1ec13
LP
1692 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
1693 arg_kill_signal = SIGRTMIN+3;
1694
e5a4bb0d
LP
1695 if (arg_volatile_mode != VOLATILE_NO) /* Make sure all file systems contained in the image are mounted read-only if we are in volatile mode */
1696 arg_read_only = true;
1697
2436ea76
DDM
1698 if (has_custom_root_mount(arg_custom_mounts, arg_n_custom_mounts))
1699 arg_read_only = true;
1700
baaa35ad 1701 if (arg_keep_unit && arg_register && cg_pid_get_owner_uid(0, NULL) >= 0)
8d9c2bca
AJ
1702 /* Save the user from accidentally registering either user-$SESSION.scope or user@.service.
1703 * The latter is not technically a user session, but we don't need to labour the point. */
60f1ec13 1704 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--keep-unit --register=yes may not be used when invoked from a user session.");
89f7c846 1705
baaa35ad 1706 if (arg_directory && arg_image)
60f1ec13 1707 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--directory= and --image= may not be combined.");
1b9e5b12 1708
baaa35ad 1709 if (arg_template && arg_image)
60f1ec13 1710 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= and --image= may not be combined.");
8cd328d8 1711
baaa35ad 1712 if (arg_template && !(arg_directory || arg_machine))
60f1ec13 1713 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= needs --directory= or --machine=.");
ec16945e 1714
baaa35ad 1715 if (arg_ephemeral && arg_template)
60f1ec13 1716 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --template= may not be combined.");
ec16945e 1717
baaa35ad 1718 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO))
60f1ec13 1719 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --link-journal= may not be combined.");
df9a75e4 1720
baaa35ad 1721 if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported())
60f1ec13 1722 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--private-users= is not supported, kernel compiled without user namespace support.");
7336138e 1723
baaa35ad 1724 if (arg_userns_chown && arg_read_only)
de40a303
LP
1725 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1726 "--read-only and --private-users-chown may not be combined.");
f757855e 1727
e5a4bb0d
LP
1728 /* We don't support --private-users-chown together with any of the volatile modes since we couldn't
1729 * change the read-only part of the tree (i.e. /usr) anyway, or because it would trigger a massive
5238e957 1730 * copy-up (in case of overlay) making the entire exercise pointless. */
e5a4bb0d
LP
1731 if (arg_userns_chown && arg_volatile_mode != VOLATILE_NO)
1732 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--volatile= and --private-users-chown may not be combined.");
1733
679ecd36
SZ
1734 /* If --network-namespace-path is given with any other network-related option (except --private-network),
1735 * we need to error out, to avoid conflicts between different network options. */
60f1ec13
LP
1736 if (arg_network_namespace_path &&
1737 (arg_network_interfaces || arg_network_macvlan ||
1738 arg_network_ipvlan || arg_network_veth_extra ||
1739 arg_network_bridge || arg_network_zone ||
679ecd36 1740 arg_network_veth))
de40a303 1741 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--network-namespace-path= cannot be combined with other network options.");
86c0dd4a 1742
60f1ec13 1743 if (arg_network_bridge && arg_network_zone)
de40a303
LP
1744 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1745 "--network-bridge= and --network-zone= may not be combined.");
f757855e 1746
baaa35ad 1747 if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network)
60f1ec13 1748 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
4f086aab 1749
baaa35ad 1750 if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO))
60f1ec13 1751 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot combine --private-users with read-write mounts.");
f757855e 1752
baaa35ad 1753 if (arg_expose_ports && !arg_private_network)
60f1ec13 1754 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot use --port= without private networking.");
6d0b55c2 1755
349cc4a5 1756#if ! HAVE_LIBIPTC
baaa35ad 1757 if (arg_expose_ports)
60f1ec13 1758 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--port= is not supported, compiled without libiptc support.");
1c1ea217
EV
1759#endif
1760
60f1ec13
LP
1761 r = custom_mount_check_all();
1762 if (r < 0)
1763 return r;
c6c8f6e2 1764
f757855e 1765 return 0;
88213476
LP
1766}
1767
03cfe0d5
LP
1768static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1769 assert(p);
1770
0de7acce 1771 if (arg_userns_mode == USER_NAMESPACE_NO)
03cfe0d5
LP
1772 return 0;
1773
1774 if (uid == UID_INVALID && gid == GID_INVALID)
1775 return 0;
1776
1777 if (uid != UID_INVALID) {
1778 uid += arg_uid_shift;
1779
1780 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1781 return -EOVERFLOW;
1782 }
1783
1784 if (gid != GID_INVALID) {
1785 gid += (gid_t) arg_uid_shift;
1786
1787 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1788 return -EOVERFLOW;
1789 }
1790
1791 if (lchown(p, uid, gid) < 0)
1792 return -errno;
b12afc8c
LP
1793
1794 return 0;
1795}
1796
03cfe0d5
LP
/* Creates the directory 'path' below 'root' with the given mode and, if it was newly created, passes it
 * to userns_lchown() with the given UID/GID. An already existing directory is left untouched and
 * reported as success (0). Other failures are returned as negative errno-style values. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;
        int k;

        full = prefix_roota(root, path);

        k = mkdir_errno_wrapper(full, mode);
        if (k < 0)
                return k == -EEXIST ? 0 : k;

        return userns_lchown(full, uid, gid);
}
1810
/* Matches 'path' against the two accepted zoneinfo prefixes and returns whatever PATH_STARTSWITH_SET()
 * yields for it. Callers in this file treat a NULL result as "does not point into
 * /usr/share/zoneinfo/". */
static const char *timezone_from_path(const char *path) {
        const char *zone;

        zone = PATH_STARTSWITH_SET(
                        path,
                        "../usr/share/zoneinfo/",
                        "/usr/share/zoneinfo/");

        return zone;
}
1817
83205269
LP
1818static bool etc_writable(void) {
1819 return !arg_read_only || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_OVERLAY);
1820}
1821
e58a1277 1822static int setup_timezone(const char *dest) {
1688841f
LP
1823 _cleanup_free_ char *p = NULL, *etc = NULL;
1824 const char *where, *check;
1825 TimezoneMode m;
d4036145 1826 int r;
f8440af5 1827
e58a1277
LP
1828 assert(dest);
1829
1688841f 1830 if (IN_SET(arg_timezone, TIMEZONE_AUTO, TIMEZONE_SYMLINK)) {
1688841f
LP
1831 r = readlink_malloc("/etc/localtime", &p);
1832 if (r == -ENOENT && arg_timezone == TIMEZONE_AUTO)
83205269 1833 m = etc_writable() ? TIMEZONE_DELETE : TIMEZONE_OFF;
1688841f 1834 else if (r == -EINVAL && arg_timezone == TIMEZONE_AUTO) /* regular file? */
83205269 1835 m = etc_writable() ? TIMEZONE_COPY : TIMEZONE_BIND;
1688841f
LP
1836 else if (r < 0) {
1837 log_warning_errno(r, "Failed to read host's /etc/localtime symlink, not updating container timezone: %m");
1838 /* To handle warning, delete /etc/localtime and replace it with a symbolic link to a time zone data
1839 * file.
1840 *
1841 * Example:
1842 * ln -s /usr/share/zoneinfo/UTC /etc/localtime
1843 */
1844 return 0;
1845 } else if (arg_timezone == TIMEZONE_AUTO)
83205269 1846 m = etc_writable() ? TIMEZONE_SYMLINK : TIMEZONE_BIND;
1688841f
LP
1847 else
1848 m = arg_timezone;
1849 } else
1850 m = arg_timezone;
1851
1852 if (m == TIMEZONE_OFF)
1853 return 0;
1854
a5648b80 1855 r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
d4036145 1856 if (r < 0) {
1688841f 1857 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
d4036145
LP
1858 return 0;
1859 }
1860
1688841f
LP
1861 where = strjoina(etc, "/localtime");
1862
1863 switch (m) {
1864
1865 case TIMEZONE_DELETE:
1866 if (unlink(where) < 0)
1867 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
1868
d4036145 1869 return 0;
d4036145 1870
1688841f
LP
1871 case TIMEZONE_SYMLINK: {
1872 _cleanup_free_ char *q = NULL;
1873 const char *z, *what;
4d1c38b8 1874
1688841f
LP
1875 z = timezone_from_path(p);
1876 if (!z) {
1877 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
d4036145 1878 return 0;
1688841f 1879 }
d4036145 1880
1688841f
LP
1881 r = readlink_malloc(where, &q);
1882 if (r >= 0 && streq_ptr(timezone_from_path(q), z))
1883 return 0; /* Already pointing to the right place? Then do nothing .. */
1884
1885 check = strjoina(dest, "/usr/share/zoneinfo/", z);
a5648b80 1886 r = chase_symlinks(check, dest, 0, NULL, NULL);
1688841f
LP
1887 if (r < 0)
1888 log_debug_errno(r, "Timezone %s does not exist (or is not accessible) in container, not creating symlink: %m", z);
1889 else {
1890 if (unlink(where) < 0 && errno != ENOENT) {
1891 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING, /* Don't complain on read-only images */
1892 errno, "Failed to remove existing timezone info %s in container, ignoring: %m", where);
1893 return 0;
1894 }
1895
1896 what = strjoina("../usr/share/zoneinfo/", z);
1897 if (symlink(what, where) < 0) {
1898 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING,
1899 errno, "Failed to correct timezone of container, ignoring: %m");
1900 return 0;
1901 }
1902
1903 break;
1904 }
1905
1906 _fallthrough_;
d4036145 1907 }
68fb0892 1908
1688841f
LP
1909 case TIMEZONE_BIND: {
1910 _cleanup_free_ char *resolved = NULL;
1911 int found;
1912
a5648b80 1913 found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved, NULL);
1688841f
LP
1914 if (found < 0) {
1915 log_warning_errno(found, "Failed to resolve /etc/localtime path in container, ignoring: %m");
1916 return 0;
1917 }
1918
1919 if (found == 0) /* missing? */
1920 (void) touch(resolved);
1921
1922 r = mount_verbose(LOG_WARNING, "/etc/localtime", resolved, NULL, MS_BIND, NULL);
1923 if (r >= 0)
1924 return mount_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
1925
1926 _fallthrough_;
79d80fc1 1927 }
4d9f07b4 1928
1688841f
LP
1929 case TIMEZONE_COPY:
1930 /* If mounting failed, try to copy */
8a016c74 1931 r = copy_file_atomic("/etc/localtime", where, 0644, 0, 0, COPY_REFLINK|COPY_REPLACE);
1688841f
LP
1932 if (r < 0) {
1933 log_full_errno(IN_SET(r, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
1934 "Failed to copy /etc/localtime to %s, ignoring: %m", where);
1935 return 0;
1936 }
1937
1938 break;
1939
1940 default:
1941 assert_not_reached("unexpected mode");
d4036145 1942 }
e58a1277 1943
1688841f 1944 /* Fix permissions of the symlink or file copy we just created */
03cfe0d5
LP
1945 r = userns_lchown(where, 0, 0);
1946 if (r < 0)
1688841f 1947 log_warning_errno(r, "Failed to chown /etc/localtime, ignoring: %m");
03cfe0d5 1948
e58a1277 1949 return 0;
88213476
LP
1950}
1951
09d423e9
LP
/* Checks via access(F_OK) whether 'path' exists. Returns 1 if it does, 0 if access() fails with ENOENT,
 * and the result of log_debug_errno() for any other access() failure. */
static int have_resolv_conf(const char *path) {
        assert(path);

        if (access(path, F_OK) >= 0)
                return 1;

        if (errno == ENOENT)
                return 0;

        return log_debug_errno(errno, "Failed to determine whether '%s' is available: %m", path);
}
1964
7357272e 1965static int resolved_listening(void) {
b8ea7a6e 1966 _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
b053cd5f 1967 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
7357272e 1968 _cleanup_free_ char *dns_stub_listener_mode = NULL;
b053cd5f
LP
1969 int r;
1970
7357272e 1971 /* Check if resolved is listening */
b053cd5f
LP
1972
1973 r = sd_bus_open_system(&bus);
1974 if (r < 0)
b8ea7a6e 1975 return log_debug_errno(r, "Failed to open system bus: %m");
b053cd5f 1976
7357272e 1977 r = bus_name_has_owner(bus, "org.freedesktop.resolve1", NULL);
b8ea7a6e
LP
1978 if (r < 0)
1979 return log_debug_errno(r, "Failed to check whether the 'org.freedesktop.resolve1' bus name is taken: %m");
1980 if (r == 0)
1981 return 0;
7357272e
DM
1982
1983 r = sd_bus_get_property_string(bus,
1984 "org.freedesktop.resolve1",
1985 "/org/freedesktop/resolve1",
1986 "org.freedesktop.resolve1.Manager",
1987 "DNSStubListener",
b8ea7a6e 1988 &error,
7357272e
DM
1989 &dns_stub_listener_mode);
1990 if (r < 0)
b8ea7a6e 1991 return log_debug_errno(r, "Failed to query DNSStubListener property: %s", bus_error_message(&error, r));
7357272e
DM
1992
1993 return STR_IN_SET(dns_stub_listener_mode, "udp", "yes");
b053cd5f
LP
1994}
1995
2547bb41 1996static int setup_resolv_conf(const char *dest) {
09d423e9
LP
1997 _cleanup_free_ char *etc = NULL;
1998 const char *where, *what;
1999 ResolvConfMode m;
2000 int r;
2547bb41
LP
2001
2002 assert(dest);
2003
09d423e9
LP
2004 if (arg_resolv_conf == RESOLV_CONF_AUTO) {
2005 if (arg_private_network)
2006 m = RESOLV_CONF_OFF;
86775e35
LP
2007 else if (have_resolv_conf(PRIVATE_STUB_RESOLV_CONF) > 0 && resolved_listening() > 0)
2008 m = etc_writable() ? RESOLV_CONF_COPY_STUB : RESOLV_CONF_BIND_STUB;
09d423e9 2009 else if (have_resolv_conf("/etc/resolv.conf") > 0)
83205269 2010 m = etc_writable() ? RESOLV_CONF_COPY_HOST : RESOLV_CONF_BIND_HOST;
09d423e9 2011 else
83205269 2012 m = etc_writable() ? RESOLV_CONF_DELETE : RESOLV_CONF_OFF;
86775e35 2013
09d423e9
LP
2014 } else
2015 m = arg_resolv_conf;
2016
2017 if (m == RESOLV_CONF_OFF)
2547bb41
LP
2018 return 0;
2019
a5648b80 2020 r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
87447ae4
LP
2021 if (r < 0) {
2022 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
2023 return 0;
2024 }
2025
2026 where = strjoina(etc, "/resolv.conf");
09d423e9
LP
2027
2028 if (m == RESOLV_CONF_DELETE) {
2029 if (unlink(where) < 0)
2030 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
2031
87447ae4
LP
2032 return 0;
2033 }
79d80fc1 2034
86775e35
LP
2035 if (IN_SET(m, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_REPLACE_STATIC, RESOLV_CONF_COPY_STATIC))
2036 what = PRIVATE_STATIC_RESOLV_CONF;
2037 else if (IN_SET(m, RESOLV_CONF_BIND_UPLINK, RESOLV_CONF_REPLACE_UPLINK, RESOLV_CONF_COPY_UPLINK))
2038 what = PRIVATE_UPLINK_RESOLV_CONF;
2039 else if (IN_SET(m, RESOLV_CONF_BIND_STUB, RESOLV_CONF_REPLACE_STUB, RESOLV_CONF_COPY_STUB))
2040 what = PRIVATE_STUB_RESOLV_CONF;
09d423e9
LP
2041 else
2042 what = "/etc/resolv.conf";
87447ae4 2043
86775e35 2044 if (IN_SET(m, RESOLV_CONF_BIND_HOST, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_BIND_UPLINK, RESOLV_CONF_BIND_STUB)) {
09d423e9
LP
2045 _cleanup_free_ char *resolved = NULL;
2046 int found;
2047
a5648b80 2048 found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved, NULL);
09d423e9
LP
2049 if (found < 0) {
2050 log_warning_errno(found, "Failed to resolve /etc/resolv.conf path in container, ignoring: %m");
2051 return 0;
2052 }
3539724c 2053
87447ae4
LP
2054 if (found == 0) /* missing? */
2055 (void) touch(resolved);
5367354d 2056
09d423e9 2057 r = mount_verbose(LOG_WARNING, what, resolved, NULL, MS_BIND, NULL);
60e76d48 2058 if (r >= 0)
87447ae4 2059 return mount_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
86775e35
LP
2060
2061 /* If that didn't work, let's copy the file */
3539724c
LP
2062 }
2063
86775e35
LP
2064 if (IN_SET(m, RESOLV_CONF_REPLACE_HOST, RESOLV_CONF_REPLACE_STATIC, RESOLV_CONF_REPLACE_UPLINK, RESOLV_CONF_REPLACE_STUB))
2065 r = copy_file_atomic(what, where, 0644, 0, 0, COPY_REFLINK|COPY_REPLACE);
2066 else
2067 r = copy_file(what, where, O_TRUNC|O_NOFOLLOW, 0644, 0, 0, COPY_REFLINK);
79d80fc1 2068 if (r < 0) {
3539724c
LP
2069 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
2070 * resolved or something similar runs inside and the symlink points there.
68a313c5 2071 *
3539724c 2072 * If the disk image is read-only, there's also no point in complaining.
68a313c5 2073 */
86775e35
LP
2074 log_full_errno(!IN_SET(RESOLV_CONF_COPY_HOST, RESOLV_CONF_COPY_STATIC, RESOLV_CONF_COPY_UPLINK, RESOLV_CONF_COPY_STUB) &&
2075 IN_SET(r, -ELOOP, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
3539724c 2076 "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where);
79d80fc1
TG
2077 return 0;
2078 }
2547bb41 2079
03cfe0d5
LP
2080 r = userns_lchown(where, 0, 0);
2081 if (r < 0)
3539724c 2082 log_warning_errno(r, "Failed to chown /etc/resolv.conf, ignoring: %m");
03cfe0d5 2083
2547bb41
LP
2084 return 0;
2085}
2086
1e4f1671 2087static int setup_boot_id(void) {
cdde6ba6
LP
2088 _cleanup_(unlink_and_freep) char *from = NULL;
2089 _cleanup_free_ char *path = NULL;
3bbaff3e 2090 sd_id128_t rnd = SD_ID128_NULL;
cdde6ba6 2091 const char *to;
04bc4a3f
LP
2092 int r;
2093
1eacc470 2094 /* Generate a new randomized boot ID, so that each boot-up of the container gets a new one */
04bc4a3f 2095
1eacc470 2096 r = tempfn_random_child("/run", "proc-sys-kernel-random-boot-id", &path);
cdde6ba6
LP
2097 if (r < 0)
2098 return log_error_errno(r, "Failed to generate random boot ID path: %m");
04bc4a3f
LP
2099
2100 r = sd_id128_randomize(&rnd);
f647962d
MS
2101 if (r < 0)
2102 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 2103
cdde6ba6 2104 r = id128_write(path, ID128_UUID, rnd, false);
f647962d
MS
2105 if (r < 0)
2106 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f 2107
cdde6ba6
LP
2108 from = TAKE_PTR(path);
2109 to = "/proc/sys/kernel/random/boot_id";
2110
60e76d48 2111 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
cdde6ba6
LP
2112 if (r < 0)
2113 return r;
04bc4a3f 2114
cdde6ba6 2115 return mount_verbose(LOG_ERR, NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
04bc4a3f
LP
2116}
2117
e58a1277 2118static int copy_devnodes(const char *dest) {
88213476
LP
2119 static const char devnodes[] =
2120 "null\0"
2121 "zero\0"
2122 "full\0"
2123 "random\0"
2124 "urandom\0"
85614d66
TG
2125 "tty\0"
2126 "net/tun\0";
88213476 2127
de40a303 2128 _cleanup_umask_ mode_t u;
88213476 2129 const char *d;
e58a1277 2130 int r = 0;
a258bf26
LP
2131
2132 assert(dest);
124640f1
LP
2133
2134 u = umask(0000);
88213476 2135
03cfe0d5
LP
2136 /* Create /dev/net, so that we can create /dev/net/tun in it */
2137 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
2138 return log_error_errno(r, "Failed to create /dev/net directory: %m");
2139
88213476 2140 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 2141 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 2142 struct stat st;
88213476 2143
c6134d3e 2144 from = path_join("/dev/", d);
8967f291
LP
2145 if (!from)
2146 return log_oom();
2147
c6134d3e 2148 to = path_join(dest, from);
8967f291
LP
2149 if (!to)
2150 return log_oom();
88213476
LP
2151
2152 if (stat(from, &st) < 0) {
2153
4a62c710
MS
2154 if (errno != ENOENT)
2155 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 2156
baaa35ad
ZJS
2157 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode))
2158 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2159 "%s is not a char or block device, cannot copy.", from);
2160 else {
8dfce114
LP
2161 _cleanup_free_ char *sl = NULL, *prefixed = NULL, *dn = NULL, *t = NULL;
2162
81f5049b 2163 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
8dbf71ec 2164 /* Explicitly warn the user when /dev is already populated. */
41eb4362 2165 if (errno == EEXIST)
8dbf71ec 2166 log_notice("%s/dev is pre-mounted and pre-populated. If a pre-mounted /dev is provided it needs to be an unpopulated file system.", dest);
81f5049b
AC
2167 if (errno != EPERM)
2168 return log_error_errno(errno, "mknod(%s) failed: %m", to);
2169
8dfce114 2170 /* Some systems abusively restrict mknod but allow bind mounts. */
81f5049b
AC
2171 r = touch(to);
2172 if (r < 0)
2173 return log_error_errno(r, "touch (%s) failed: %m", to);
60e76d48
ZJS
2174 r = mount_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL);
2175 if (r < 0)
2176 return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to);
81f5049b 2177 }
6278cf60 2178
03cfe0d5
LP
2179 r = userns_lchown(to, 0, 0);
2180 if (r < 0)
2181 return log_error_errno(r, "chown() of device node %s failed: %m", to);
8dfce114 2182
657ee2d8 2183 dn = path_join("/dev", S_ISCHR(st.st_mode) ? "char" : "block");
8dfce114
LP
2184 if (!dn)
2185 return log_oom();
2186
2187 r = userns_mkdir(dest, dn, 0755, 0, 0);
2188 if (r < 0)
2189 return log_error_errno(r, "Failed to create '%s': %m", dn);
2190
2191 if (asprintf(&sl, "%s/%u:%u", dn, major(st.st_rdev), minor(st.st_rdev)) < 0)
2192 return log_oom();
2193
c6134d3e 2194 prefixed = path_join(dest, sl);
8dfce114
LP
2195 if (!prefixed)
2196 return log_oom();
2197
2d9b74ba 2198 t = path_join("..", d);
8dfce114
LP
2199 if (!t)
2200 return log_oom();
2201
2202 if (symlink(t, prefixed) < 0)
2203 log_debug_errno(errno, "Failed to symlink '%s' to '%s': %m", t, prefixed);
88213476 2204 }
88213476
LP
2205 }
2206
e58a1277
LP
2207 return r;
2208}
88213476 2209
de40a303
LP
2210static int make_extra_nodes(const char *dest) {
2211 _cleanup_umask_ mode_t u;
2212 size_t i;
2213 int r;
2214
2215 u = umask(0000);
2216
2217 for (i = 0; i < arg_n_extra_nodes; i++) {
2218 _cleanup_free_ char *path = NULL;
2219 DeviceNode *n = arg_extra_nodes + i;
2220
c6134d3e 2221 path = path_join(dest, n->path);
de40a303
LP
2222 if (!path)
2223 return log_oom();
2224
2225 if (mknod(path, n->mode, S_ISCHR(n->mode) || S_ISBLK(n->mode) ? makedev(n->major, n->minor) : 0) < 0)
2226 return log_error_errno(errno, "Failed to create device node '%s': %m", path);
2227
2228 r = chmod_and_chown(path, n->mode, n->uid, n->gid);
2229 if (r < 0)
2230 return log_error_errno(r, "Failed to adjust device node ownership of '%s': %m", path);
2231 }
2232
2233 return 0;
2234}
2235
03cfe0d5
LP
2236static int setup_pts(const char *dest) {
2237 _cleanup_free_ char *options = NULL;
2238 const char *p;
709f6e46 2239 int r;
03cfe0d5 2240
349cc4a5 2241#if HAVE_SELINUX
03cfe0d5
LP
2242 if (arg_selinux_apifs_context)
2243 (void) asprintf(&options,
3dce8915 2244 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
03cfe0d5
LP
2245 arg_uid_shift + TTY_GID,
2246 arg_selinux_apifs_context);
2247 else
2248#endif
2249 (void) asprintf(&options,
3dce8915 2250 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
03cfe0d5 2251 arg_uid_shift + TTY_GID);
f2d88580 2252
03cfe0d5 2253 if (!options)
f2d88580
LP
2254 return log_oom();
2255
03cfe0d5 2256 /* Mount /dev/pts itself */
cc9fce65 2257 p = prefix_roota(dest, "/dev/pts");
dae8b82e
ZJS
2258 r = mkdir_errno_wrapper(p, 0755);
2259 if (r < 0)
2260 return log_error_errno(r, "Failed to create /dev/pts: %m");
2261
60e76d48
ZJS
2262 r = mount_verbose(LOG_ERR, "devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options);
2263 if (r < 0)
2264 return r;
709f6e46
MS
2265 r = userns_lchown(p, 0, 0);
2266 if (r < 0)
2267 return log_error_errno(r, "Failed to chown /dev/pts: %m");
03cfe0d5
LP
2268
2269 /* Create /dev/ptmx symlink */
2270 p = prefix_roota(dest, "/dev/ptmx");
4a62c710
MS
2271 if (symlink("pts/ptmx", p) < 0)
2272 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
709f6e46
MS
2273 r = userns_lchown(p, 0, 0);
2274 if (r < 0)
2275 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
f2d88580 2276
03cfe0d5
LP
2277 /* And fix /dev/pts/ptmx ownership */
2278 p = prefix_roota(dest, "/dev/pts/ptmx");
709f6e46
MS
2279 r = userns_lchown(p, 0, 0);
2280 if (r < 0)
2281 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
6278cf60 2282
f2d88580
LP
2283 return 0;
2284}
2285
3acc84eb
FB
/* Opens /dev/console and makes it stdin, stdout and stderr of the calling process, after duplicating
 * the console used for logging via log_dup_console().
 *
 * Returns 0 on success, or a negative errno-style value on failure. */
static int setup_stdio_as_dev_console(void) {
        int console_fd, r;

        console_fd = open_terminal("/dev/console", O_RDWR);
        if (console_fd < 0)
                return log_error_errno(console_fd, "Failed to open console: %m");

        /* Make sure we can continue logging to the original stderr, even if
         * stderr points elsewhere now */
        r = log_dup_console();
        if (r < 0)
                return log_error_errno(r, "Failed to duplicate stderr: %m");

        /* invalidates 'console_fd' on success and failure */
        r = rearrange_stdio(console_fd, console_fd, console_fd);
        if (r < 0)
                return log_error_errno(r, "Failed to move console to stdin/stdout/stderr: %m");

        return 0;
}
88213476 2307
3acc84eb
FB
2308static int setup_dev_console(const char *console) {
2309 _cleanup_free_ char *p = NULL;
2310 int r;
a258bf26 2311
3acc84eb
FB
2312 /* Create /dev/console symlink */
2313 r = path_make_relative("/dev", console, &p);
81f5049b 2314 if (r < 0)
3acc84eb
FB
2315 return log_error_errno(r, "Failed to create relative path: %m");
2316
2317 if (symlink(p, "/dev/console") < 0)
2318 return log_error_errno(errno, "Failed to create /dev/console symlink: %m");
a258bf26 2319
3acc84eb 2320 return 0;
e58a1277
LP
2321}
2322
8e5430c4
LP
2323static int setup_keyring(void) {
2324 key_serial_t keyring;
2325
6b000af4
LP
2326 /* Allocate a new session keyring for the container. This makes sure the keyring of the session
2327 * systemd-nspawn was invoked from doesn't leak into the container. Note that by default we block
2328 * keyctl() and request_key() anyway via seccomp so doing this operation isn't strictly necessary,
2329 * but in case people explicitly allow-list these system calls let's make sure we don't leak anything
2330 * into the container. */
8e5430c4
LP
2331
2332 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2333 if (keyring == -1) {
2334 if (errno == ENOSYS)
2335 log_debug_errno(errno, "Kernel keyring not supported, ignoring.");
2336 else if (IN_SET(errno, EACCES, EPERM))
2337 log_debug_errno(errno, "Kernel keyring access prohibited, ignoring.");
2338 else
2339 return log_error_errno(errno, "Setting up kernel keyring failed: %m");
2340 }
2341
2342 return 0;
2343}
2344
3652872a
LP
2345static int setup_credentials(const char *root) {
2346 const char *q;
2347 int r;
2348
2349 if (arg_n_credentials <= 0)
2350 return 0;
2351
2352 r = userns_mkdir(root, "/run/host", 0755, 0, 0);
2353 if (r < 0)
2354 return log_error_errno(r, "Failed to create /run/host: %m");
2355
2356 r = userns_mkdir(root, "/run/host/credentials", 0700, 0, 0);
2357 if (r < 0)
2358 return log_error_errno(r, "Failed to create /run/host/credentials: %m");
2359
2360 q = prefix_roota(root, "/run/host/credentials");
2361 r = mount_verbose(LOG_ERR, NULL, q, "ramfs", MS_NOSUID|MS_NOEXEC|MS_NODEV, "mode=0700");
2362 if (r < 0)
2363 return r;
2364
2365 for (size_t i = 0; i < arg_n_credentials; i++) {
2366 _cleanup_free_ char *j = NULL;
2367 _cleanup_close_ int fd = -1;
2368
2369 j = path_join(q, arg_credentials[i].id);
2370 if (!j)
2371 return log_oom();
2372
2373 fd = open(j, O_CREAT|O_EXCL|O_WRONLY|O_CLOEXEC|O_NOFOLLOW, 0600);
2374 if (fd < 0)
2375 return log_error_errno(errno, "Failed to create credential file %s: %m", j);
2376
2377 r = loop_write(fd, arg_credentials[i].data, arg_credentials[i].size, /* do_poll= */ false);
2378 if (r < 0)
2379 return log_error_errno(r, "Failed to write credential to file %s: %m", j);
2380
2381 if (fchmod(fd, 0400) < 0)
2382 return log_error_errno(errno, "Failed to adjust access mode of %s: %m", j);
2383
2384 if (arg_userns_mode != USER_NAMESPACE_NO) {
2385 if (fchown(fd, arg_uid_shift, arg_uid_shift) < 0)
2386 return log_error_errno(errno, "Failed to adjust ownership of %s: %m", j);
2387 }
2388 }
2389
2390 if (chmod(q, 0500) < 0)
2391 return log_error_errno(errno, "Failed to adjust access mode of %s: %m", q);
2392
2393 r = userns_lchown(q, 0, 0);
2394 if (r < 0)
2395 return r;
2396
2397 /* Make both mount and superblock read-only now */
2398 r = mount_verbose(LOG_ERR, NULL, q, NULL, MS_REMOUNT|MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
2399 if (r < 0)
2400 return r;
2401
2402 return mount_verbose(LOG_ERR, NULL, q, NULL, MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, "mode=0500");
2403}
2404
1e4f1671 2405static int setup_kmsg(int kmsg_socket) {
9ec5a93c
LP
2406 _cleanup_(unlink_and_freep) char *from = NULL;
2407 _cleanup_free_ char *fifo = NULL;
2408 _cleanup_close_ int fd = -1;
7fd1b19b 2409 _cleanup_umask_ mode_t u;
9ec5a93c 2410 int r;
e58a1277 2411
e58a1277 2412 assert(kmsg_socket >= 0);
a258bf26 2413
e58a1277 2414 u = umask(0000);
a258bf26 2415
1eacc470 2416 /* We create the kmsg FIFO as as temporary file in /run, but immediately delete it after bind mounting it to
9ec5a93c
LP
2417 * /proc/kmsg. While FIFOs on the reading side behave very similar to /proc/kmsg, their writing side behaves
2418 * differently from /dev/kmsg in that writing blocks when nothing is reading. In order to avoid any problems
2419 * with containers deadlocking due to this we simply make /dev/kmsg unavailable to the container. */
2420
1eacc470 2421 r = tempfn_random_child("/run", "proc-kmsg", &fifo);
9ec5a93c
LP
2422 if (r < 0)
2423 return log_error_errno(r, "Failed to generate kmsg path: %m");
e58a1277 2424
9ec5a93c 2425 if (mkfifo(fifo, 0600) < 0)
03cfe0d5 2426 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
9ec5a93c
LP
2427
2428 from = TAKE_PTR(fifo);
9ec5a93c 2429
1eacc470 2430 r = mount_verbose(LOG_ERR, from, "/proc/kmsg", NULL, MS_BIND, NULL);
60e76d48
ZJS
2431 if (r < 0)
2432 return r;
e58a1277 2433
669fc4e5 2434 fd = open(from, O_RDWR|O_NONBLOCK|O_CLOEXEC);
4a62c710
MS
2435 if (fd < 0)
2436 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 2437
9ec5a93c 2438 /* Store away the fd in the socket, so that it stays open as long as we run the child */
3ee897d6 2439 r = send_one_fd(kmsg_socket, fd, 0);
d9603714
DH
2440 if (r < 0)
2441 return log_error_errno(r, "Failed to send FIFO fd: %m");
a258bf26 2442
25ea79fe 2443 return 0;
88213476
LP
2444}
2445
1c4baffc 2446static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
6d0b55c2
LP
2447 union in_addr_union *exposed = userdata;
2448
2449 assert(rtnl);
2450 assert(m);
2451 assert(exposed);
2452
7a8f6325 2453 expose_port_execute(rtnl, arg_expose_ports, exposed);
6d0b55c2
LP
2454 return 0;
2455}
2456
3a74cea5 2457static int setup_hostname(void) {
c818eef1 2458 int r;
3a74cea5 2459
0c582db0 2460 if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
eb91eb18
LP
2461 return 0;
2462
c818eef1
LP
2463 r = sethostname_idempotent(arg_hostname ?: arg_machine);
2464 if (r < 0)
2465 return log_error_errno(r, "Failed to set hostname: %m");
3a74cea5 2466
7027ff61 2467 return 0;
3a74cea5
LP
2468}
2469
57fb9fb5 2470static int setup_journal(const char *directory) {
0f5e1382 2471 _cleanup_free_ char *d = NULL;
5905d7cf 2472 char id[SD_ID128_STRING_MAX];
b2238e38
LP
2473 const char *dirname, *p, *q;
2474 sd_id128_t this_id;
8054d749 2475 bool try;
57fb9fb5
LP
2476 int r;
2477
df9a75e4
LP
2478 /* Don't link journals in ephemeral mode */
2479 if (arg_ephemeral)
2480 return 0;
2481
8054d749
LP
2482 if (arg_link_journal == LINK_NO)
2483 return 0;
2484
2485 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
2486
4d680aee 2487 r = sd_id128_get_machine(&this_id);
f647962d
MS
2488 if (r < 0)
2489 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee 2490
e01ff70a 2491 if (sd_id128_equal(arg_uuid, this_id)) {
8054d749 2492 log_full(try ? LOG_WARNING : LOG_ERR,
e192a281 2493 "Host and machine ids are equal (%s): refusing to link journals", sd_id128_to_string(arg_uuid, id));
8054d749 2494 if (try)
4d680aee 2495 return 0;
df9a75e4 2496 return -EEXIST;
4d680aee
ZJS
2497 }
2498
369ca6da
ZJS
2499 FOREACH_STRING(dirname, "/var", "/var/log", "/var/log/journal") {
2500 r = userns_mkdir(directory, dirname, 0755, 0, 0);
2501 if (r < 0) {
2502 bool ignore = r == -EROFS && try;
2503 log_full_errno(ignore ? LOG_DEBUG : LOG_ERR, r,
2504 "Failed to create %s%s: %m", dirname, ignore ? ", ignoring" : "");
2505 return ignore ? 0 : r;
2506 }
2507 }
03cfe0d5 2508
e01ff70a
MS
2509 (void) sd_id128_to_string(arg_uuid, id);
2510
03cfe0d5
LP
2511 p = strjoina("/var/log/journal/", id);
2512 q = prefix_roota(directory, p);
27407a01 2513
e1873695 2514 if (path_is_mount_point(p, NULL, 0) > 0) {
8054d749
LP
2515 if (try)
2516 return 0;
27407a01 2517
baaa35ad
ZJS
2518 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2519 "%s: already a mount point, refusing to use for journal", p);
57fb9fb5
LP
2520 }
2521
e1873695 2522 if (path_is_mount_point(q, NULL, 0) > 0) {
8054d749
LP
2523 if (try)
2524 return 0;
57fb9fb5 2525
baaa35ad
ZJS
2526 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2527 "%s: already a mount point, refusing to use for journal", q);
57fb9fb5
LP
2528 }
2529
2530 r = readlink_and_make_absolute(p, &d);
2531 if (r >= 0) {
3742095b 2532 if (IN_SET(arg_link_journal, LINK_GUEST, LINK_AUTO) &&
57fb9fb5
LP
2533 path_equal(d, q)) {
2534
03cfe0d5 2535 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 2536 if (r < 0)
709f6e46 2537 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 2538 return 0;
57fb9fb5
LP
2539 }
2540
4a62c710
MS
2541 if (unlink(p) < 0)
2542 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
2543 } else if (r == -EINVAL) {
2544
2545 if (arg_link_journal == LINK_GUEST &&
2546 rmdir(p) < 0) {
2547
27407a01
ZJS
2548 if (errno == ENOTDIR) {
2549 log_error("%s already exists and is neither a symlink nor a directory", p);
2550 return r;
4314d33f
MS
2551 } else
2552 return log_error_errno(errno, "Failed to remove %s: %m", p);
57fb9fb5 2553 }
4314d33f
MS
2554 } else if (r != -ENOENT)
2555 return log_error_errno(r, "readlink(%s) failed: %m", p);
57fb9fb5
LP
2556
2557 if (arg_link_journal == LINK_GUEST) {
2558
2559 if (symlink(q, p) < 0) {
8054d749 2560 if (try) {
56f64d95 2561 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90 2562 return 0;
4314d33f
MS
2563 } else
2564 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
57fb9fb5
LP
2565 }
2566
03cfe0d5 2567 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 2568 if (r < 0)
709f6e46 2569 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 2570 return 0;
57fb9fb5
LP
2571 }
2572
2573 if (arg_link_journal == LINK_HOST) {
ccddd104 2574 /* don't create parents here — if the host doesn't have
574edc90 2575 * permanent journal set up, don't force it here */
ba8e6c4d 2576
dae8b82e
ZJS
2577 r = mkdir_errno_wrapper(p, 0755);
2578 if (r < 0 && r != -EEXIST) {
8054d749 2579 if (try) {
dae8b82e 2580 log_debug_errno(r, "Failed to create %s, skipping journal setup: %m", p);
574edc90 2581 return 0;
4314d33f 2582 } else
dae8b82e 2583 return log_error_errno(r, "Failed to create %s: %m", p);
57fb9fb5
LP
2584 }
2585
27407a01
ZJS
2586 } else if (access(p, F_OK) < 0)
2587 return 0;
57fb9fb5 2588
cdb2b9d0
LP
2589 if (dir_is_empty(q) == 0)
2590 log_warning("%s is not empty, proceeding anyway.", q);
2591
03cfe0d5 2592 r = userns_mkdir(directory, p, 0755, 0, 0);
709f6e46
MS
2593 if (r < 0)
2594 return log_error_errno(r, "Failed to create %s: %m", q);
57fb9fb5 2595
60e76d48
ZJS
2596 r = mount_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL);
2597 if (r < 0)
4a62c710 2598 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 2599
27407a01 2600 return 0;
57fb9fb5
LP
2601}
2602
de40a303
LP
2603static int drop_capabilities(uid_t uid) {
2604 CapabilityQuintet q;
2605
2606 /* Let's initialize all five capability sets to something valid. If the quintet was configured via
2607 * OCI use that, but fill in missing bits. If it wasn't then derive the quintet in full from
2608 * arg_caps_retain. */
2609
2610 if (capability_quintet_is_set(&arg_full_capabilities)) {
2611 q = arg_full_capabilities;
2612
2613 if (q.bounding == (uint64_t) -1)
2614 q.bounding = uid == 0 ? arg_caps_retain : 0;
2615
2616 if (q.effective == (uint64_t) -1)
2617 q.effective = uid == 0 ? q.bounding : 0;
2618
2619 if (q.inheritable == (uint64_t) -1)
2620 q.inheritable = uid == 0 ? q.bounding : 0;
2621
2622 if (q.permitted == (uint64_t) -1)
2623 q.permitted = uid == 0 ? q.bounding : 0;
2624
2625 if (q.ambient == (uint64_t) -1 && ambient_capabilities_supported())
2626 q.ambient = 0;
f66ad460
AZ
2627
2628 if (capability_quintet_mangle(&q))
2629 return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Cannot set capabilities that are not in the current bounding set.");
2630
2631 } else {
de40a303
LP
2632 q = (CapabilityQuintet) {
2633 .bounding = arg_caps_retain,
2634 .effective = uid == 0 ? arg_caps_retain : 0,
2635 .inheritable = uid == 0 ? arg_caps_retain : 0,
2636 .permitted = uid == 0 ? arg_caps_retain : 0,
2637 .ambient = ambient_capabilities_supported() ? 0 : (uint64_t) -1,
2638 };
2639
f66ad460
AZ
2640 /* If we're not using OCI, proceed with mangled capabilities (so we don't error out)
2641 * in order to maintain the same behavior as systemd < 242. */
2642 if (capability_quintet_mangle(&q))
0ccdaa79
JT
2643 log_full(arg_quiet ? LOG_DEBUG : LOG_WARNING,
2644 "Some capabilities will not be set because they are not in the current bounding set.");
f66ad460
AZ
2645
2646 }
2647
de40a303 2648 return capability_quintet_enforce(&q);
88213476
LP
2649}
2650
db999e0f
LP
2651static int reset_audit_loginuid(void) {
2652 _cleanup_free_ char *p = NULL;
2653 int r;
2654
0c582db0 2655 if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
db999e0f
LP
2656 return 0;
2657
2658 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 2659 if (r == -ENOENT)
db999e0f 2660 return 0;
f647962d
MS
2661 if (r < 0)
2662 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
2663
2664 /* Already reset? */
2665 if (streq(p, "4294967295"))
2666 return 0;
2667
57512c89 2668 r = write_string_file("/proc/self/loginuid", "4294967295", WRITE_STRING_FILE_DISABLE_BUFFER);
db999e0f 2669 if (r < 0) {
10a87006
LP
2670 log_error_errno(r,
2671 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2672 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2673 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2674 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2675 "using systemd-nspawn. Sleeping for 5s... (%m)");
77b6e194 2676
db999e0f 2677 sleep(5);
77b6e194 2678 }
db999e0f
LP
2679
2680 return 0;
77b6e194
LP
2681}
2682
785890ac
LP
2683static int setup_propagate(const char *root) {
2684 const char *p, *q;
709f6e46 2685 int r;
785890ac
LP
2686
2687 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2688 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
63c372cb 2689 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
2690 (void) mkdir_p(p, 0600);
2691
5a27b395 2692 r = userns_mkdir(root, "/run/host", 0755, 0, 0);
709f6e46 2693 if (r < 0)
5a27b395 2694 return log_error_errno(r, "Failed to create /run/host: %m");
03cfe0d5 2695
5a27b395 2696 r = userns_mkdir(root, "/run/host/incoming", 0600, 0, 0);
709f6e46 2697 if (r < 0)
5a27b395 2698 return log_error_errno(r, "Failed to create /run/host/incoming: %m");
03cfe0d5 2699
5a27b395 2700 q = prefix_roota(root, "/run/host/incoming");
60e76d48
ZJS
2701 r = mount_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
2702 if (r < 0)
2703 return r;
785890ac 2704
60e76d48
ZJS
2705 r = mount_verbose(LOG_ERR, NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
2706 if (r < 0)
2707 return r;
785890ac 2708
5a27b395 2709 /* machined will MS_MOVE into that directory, and that's only supported for non-shared mounts. */
60e76d48 2710 return mount_verbose(LOG_ERR, NULL, q, NULL, MS_SLAVE, NULL);
785890ac
LP
2711}
2712
317feb4d 2713static int setup_machine_id(const char *directory) {
691675ba
LP
2714 const char *etc_machine_id;
2715 sd_id128_t id;
3bbaff3e 2716 int r;
e01ff70a 2717
317feb4d
LP
2718 /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
2719 * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
2720 * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
2721 * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
2722 * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
2723 * container behaves nicely). */
2724
e01ff70a
MS
2725 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
2726
691675ba 2727 r = id128_read(etc_machine_id, ID128_PLAIN, &id);
317feb4d
LP
2728 if (r < 0) {
2729 if (!IN_SET(r, -ENOENT, -ENOMEDIUM)) /* If the file is missing or empty, we don't mind */
2730 return log_error_errno(r, "Failed to read machine ID from container image: %m");
691675ba 2731
317feb4d
LP
2732 if (sd_id128_is_null(arg_uuid)) {
2733 r = sd_id128_randomize(&arg_uuid);
2734 if (r < 0)
2735 return log_error_errno(r, "Failed to acquire randomized machine UUID: %m");
2736 }
2737 } else {
baaa35ad
ZJS
2738 if (sd_id128_is_null(id))
2739 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2740 "Machine ID in container image is zero, refusing.");
e01ff70a 2741
317feb4d
LP
2742 arg_uuid = id;
2743 }
691675ba 2744
e01ff70a
MS
2745 return 0;
2746}
2747
7336138e
LP
2748static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
2749 int r;
2750
2751 assert(directory);
2752
0de7acce 2753 if (arg_userns_mode == USER_NAMESPACE_NO || !arg_userns_chown)
7336138e
LP
2754 return 0;
2755
2756 r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
2757 if (r == -EOPNOTSUPP)
2758 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
2759 if (r == -EBADE)
2760 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
2761 if (r < 0)
2762 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
2763 if (r == 0)
2764 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
2765 else
2766 log_debug("Patched directory tree to match UID/GID range.");
2767
2768 return r;
2769}
2770
113cea80 2771/*
6d416b9c
LS
2772 * Return values:
2773 * < 0 : wait_for_terminate() failed to get the state of the
2774 * container, the container was terminated by a signal, or
2775 * failed for an unknown reason. No change is made to the
2776 * container argument.
2777 * > 0 : The program executed in the container terminated with an
2778 * error. The exit code of the program executed in the
919699ec
LP
2779 * container is returned. The container argument has been set
2780 * to CONTAINER_TERMINATED.
6d416b9c
LS
2781 * 0 : The container is being rebooted, has been shut down or exited
2782 * successfully. The container argument has been set to either
2783 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 2784 *
6d416b9c
LS
2785 * That is, success is indicated by a return value of zero, and an
2786 * error is indicated by a non-zero value.
113cea80
DH
2787 */
2788static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 2789 siginfo_t status;
919699ec 2790 int r;
113cea80
DH
2791
2792 r = wait_for_terminate(pid, &status);
f647962d
MS
2793 if (r < 0)
2794 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
2795
2796 switch (status.si_code) {
fddbb89c 2797
113cea80 2798 case CLD_EXITED:
b5a2179b 2799 if (status.si_status == 0)
919699ec 2800 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
b5a2179b 2801 else
919699ec 2802 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 2803
919699ec
LP
2804 *container = CONTAINER_TERMINATED;
2805 return status.si_status;
113cea80
DH
2806
2807 case CLD_KILLED:
2808 if (status.si_status == SIGINT) {
919699ec 2809 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 2810 *container = CONTAINER_TERMINATED;
919699ec
LP
2811 return 0;
2812
113cea80 2813 } else if (status.si_status == SIGHUP) {
919699ec 2814 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 2815 *container = CONTAINER_REBOOTED;
919699ec 2816 return 0;
113cea80 2817 }
919699ec 2818
4831981d 2819 _fallthrough_;
113cea80 2820 case CLD_DUMPED:
baaa35ad
ZJS
2821 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2822 "Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
113cea80
DH
2823
2824 default:
baaa35ad
ZJS
2825 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2826 "Container %s failed due to unknown reason.", arg_machine);
113cea80 2827 }
113cea80
DH
2828}
2829
023fb90b
LP
2830static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2831 pid_t pid;
2832
4a0b58c4 2833 pid = PTR_TO_PID(userdata);
023fb90b 2834 if (pid > 0) {
c6c8f6e2 2835 if (kill(pid, arg_kill_signal) >= 0) {
023fb90b
LP
2836 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2837 sd_event_source_set_userdata(s, NULL);
2838 return 0;
2839 }
2840 }
2841
2842 sd_event_exit(sd_event_source_get_event(s), 0);
2843 return 0;
2844}
2845
6916b164 2846static int on_sigchld(sd_event_source *s, const struct signalfd_siginfo *ssi, void *userdata) {
abdb9b08
LP
2847 pid_t pid;
2848
2849 assert(s);
2850 assert(ssi);
2851
2852 pid = PTR_TO_PID(userdata);
2853
6916b164
AU
2854 for (;;) {
2855 siginfo_t si = {};
abdb9b08 2856
6916b164
AU
2857 if (waitid(P_ALL, 0, &si, WNOHANG|WNOWAIT|WEXITED) < 0)
2858 return log_error_errno(errno, "Failed to waitid(): %m");
2859 if (si.si_pid == 0) /* No pending children. */
2860 break;
abdb9b08 2861 if (si.si_pid == pid) {
6916b164
AU
2862 /* The main process we care for has exited. Return from
2863 * signal handler but leave the zombie. */
2864 sd_event_exit(sd_event_source_get_event(s), 0);
2865 break;
2866 }
abdb9b08 2867
6916b164
AU
2868 /* Reap all other children. */
2869 (void) waitid(P_PID, si.si_pid, &si, WNOHANG|WEXITED);
2870 }
2871
2872 return 0;
2873}
2874
abdb9b08
LP
2875static int on_request_stop(sd_bus_message *m, void *userdata, sd_bus_error *error) {
2876 pid_t pid;
2877
2878 assert(m);
2879
2880 pid = PTR_TO_PID(userdata);
2881
2882 if (arg_kill_signal > 0) {
2883 log_info("Container termination requested. Attempting to halt container.");
2884 (void) kill(pid, arg_kill_signal);
2885 } else {
2886 log_info("Container termination requested. Exiting.");
2887 sd_event_exit(sd_bus_get_event(sd_bus_message_get_bus(m)), 0);
2888 }
2889
2890 return 0;
2891}
2892
ec16945e 2893static int determine_names(void) {
1b9cebf6 2894 int r;
ec16945e 2895
c1521918
LP
2896 if (arg_template && !arg_directory && arg_machine) {
2897
2898 /* If --template= was specified then we should not
2899 * search for a machine, but instead create a new one
2900 * in /var/lib/machine. */
2901
657ee2d8 2902 arg_directory = path_join("/var/lib/machines", arg_machine);
c1521918
LP
2903 if (!arg_directory)
2904 return log_oom();
2905 }
2906
ec16945e 2907 if (!arg_image && !arg_directory) {
1b9cebf6
LP
2908 if (arg_machine) {
2909 _cleanup_(image_unrefp) Image *i = NULL;
2910
5ef46e5f 2911 r = image_find(IMAGE_MACHINE, arg_machine, &i);
3a6ce860
LP
2912 if (r == -ENOENT)
2913 return log_error_errno(r, "No image for machine '%s'.", arg_machine);
1b9cebf6
LP
2914 if (r < 0)
2915 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
1b9cebf6 2916
eb38edce 2917 if (IN_SET(i->type, IMAGE_RAW, IMAGE_BLOCK))
0f03c2a4 2918 r = free_and_strdup(&arg_image, i->path);
1b9cebf6 2919 else
0f03c2a4 2920 r = free_and_strdup(&arg_directory, i->path);
1b9cebf6 2921 if (r < 0)
0f3be6ca 2922 return log_oom();
1b9cebf6 2923
aee327b8
LP
2924 if (!arg_ephemeral)
2925 arg_read_only = arg_read_only || i->read_only;
d7249575
LP
2926 } else {
2927 r = safe_getcwd(&arg_directory);
2928 if (r < 0)
2929 return log_error_errno(r, "Failed to determine current directory: %m");
2930 }
ec16945e 2931
c6147113
LP
2932 if (!arg_directory && !arg_image)
2933 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine path, please use -D or -i.");
ec16945e
LP
2934 }
2935
2936 if (!arg_machine) {
b9ba4dab
LP
2937 if (arg_directory && path_equal(arg_directory, "/"))
2938 arg_machine = gethostname_malloc();
4827ab48
LP
2939 else {
2940 if (arg_image) {
2941 char *e;
2942
2943 arg_machine = strdup(basename(arg_image));
2944
2945 /* Truncate suffix if there is one */
2946 e = endswith(arg_machine, ".raw");
2947 if (e)
2948 *e = 0;
2949 } else
2950 arg_machine = strdup(basename(arg_directory));
2951 }
ec16945e
LP
2952 if (!arg_machine)
2953 return log_oom();
2954
ae691c1d 2955 hostname_cleanup(arg_machine);
c6147113
LP
2956 if (!machine_name_is_valid(arg_machine))
2957 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine machine name automatically, please use -M.");
b9ba4dab
LP
2958
2959 if (arg_ephemeral) {
2960 char *b;
2961
2962 /* Add a random suffix when this is an
2963 * ephemeral machine, so that we can run many
2964 * instances at once without manually having
2965 * to specify -M each time. */
2966
2967 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2968 return log_oom();
2969
2970 free(arg_machine);
2971 arg_machine = b;
2972 }
ec16945e
LP
2973 }
2974
2975 return 0;
2976}
2977
/* Resolves the path stored in *p via chase_symlinks() (with the given 'flags') and replaces *p with the
 * result, freeing the previous string. A NULL *p is left alone.
 *
 * Returns a negative errno-style error if resolving fails, otherwise the value of free_and_replace()
 * (0 when nothing had to be done). */
static int chase_symlinks_and_update(char **p, unsigned flags) {
        char *resolved;
        int r;

        assert(p);

        if (*p) {
                r = chase_symlinks(*p, NULL, flags, &resolved, NULL);
                if (r < 0)
                        return log_error_errno(r, "Failed to resolve path %s: %m", *p);

                return free_and_replace(*p, resolved);
        }

        return 0;
}
2993
03cfe0d5 2994static int determine_uid_shift(const char *directory) {
6dac160c
LP
2995 int r;
2996
0de7acce 2997 if (arg_userns_mode == USER_NAMESPACE_NO) {
03cfe0d5 2998 arg_uid_shift = 0;
6dac160c 2999 return 0;
03cfe0d5 3000 }
6dac160c
LP
3001
3002 if (arg_uid_shift == UID_INVALID) {
3003 struct stat st;
3004
03cfe0d5 3005 r = stat(directory, &st);
6dac160c 3006 if (r < 0)
03cfe0d5 3007 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
6dac160c
LP
3008
3009 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
3010
baaa35ad
ZJS
3011 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000)))
3012 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
3013 "UID and GID base of %s don't match.", directory);
6dac160c
LP
3014
3015 arg_uid_range = UINT32_C(0x10000);
3016 }
3017
baaa35ad
ZJS
3018 if (arg_uid_shift > (uid_t) -1 - arg_uid_range)
3019 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
3020 "UID base too high for UID range.");
6dac160c 3021
6dac160c
LP
3022 return 0;
3023}
3024
de40a303
LP
3025static unsigned long effective_clone_ns_flags(void) {
3026 unsigned long flags = arg_clone_ns_flags;
3027
3028 if (arg_private_network)
3029 flags |= CLONE_NEWNET;
3030 if (arg_use_cgns)
3031 flags |= CLONE_NEWCGROUP;
3032 if (arg_userns_mode != USER_NAMESPACE_NO)
3033 flags |= CLONE_NEWUSER;
3034
3035 return flags;
3036}
3037
3038static int patch_sysctl(void) {
3039
3040 /* This table is inspired by runc's sysctl() function */
3041 static const struct {
3042 const char *key;
3043 bool prefix;
3044 unsigned long clone_flags;
3045 } safe_sysctl[] = {
3046 { "kernel.hostname", false, CLONE_NEWUTS },
3047 { "kernel.domainname", false, CLONE_NEWUTS },
3048 { "kernel.msgmax", false, CLONE_NEWIPC },
3049 { "kernel.msgmnb", false, CLONE_NEWIPC },
3050 { "kernel.msgmni", false, CLONE_NEWIPC },
3051 { "kernel.sem", false, CLONE_NEWIPC },
3052 { "kernel.shmall", false, CLONE_NEWIPC },
3053 { "kernel.shmmax", false, CLONE_NEWIPC },
3054 { "kernel.shmmni", false, CLONE_NEWIPC },
3055 { "fs.mqueue.", true, CLONE_NEWIPC },
3056 { "net.", true, CLONE_NEWNET },
3057 };
3058
3059 unsigned long flags;
3060 char **k, **v;
3061 int r;
3062
3063 flags = effective_clone_ns_flags();
3064
3065 STRV_FOREACH_PAIR(k, v, arg_sysctl) {
3066 bool good = false;
3067 size_t i;
3068
3069 for (i = 0; i < ELEMENTSOF(safe_sysctl); i++) {
3070
3071 if (!FLAGS_SET(flags, safe_sysctl[i].clone_flags))
3072 continue;
3073
3074 if (safe_sysctl[i].prefix)
3075 good = startswith(*k, safe_sysctl[i].key);
3076 else
3077 good = streq(*k, safe_sysctl[i].key);
3078
3079 if (good)
3080 break;
3081 }
3082
c6147113
LP
3083 if (!good)
3084 return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Refusing to write to sysctl '%s', as it is not safe in the selected namespaces.", *k);
de40a303
LP
3085
3086 r = sysctl_write(*k, *v);
3087 if (r < 0)
3088 return log_error_errno(r, "Failed to write sysctl '%s': %m", *k);
3089 }
3090
3091 return 0;
3092}
3093
03cfe0d5
LP
3094static int inner_child(
3095 Barrier *barrier,
3096 const char *directory,
3097 bool secondary,
3098 int kmsg_socket,
3099 int rtnl_socket,
3acc84eb 3100 int master_pty_socket,
e1bb4b0d
LB
3101 FDSet *fds,
3102 char **os_release_pairs) {
69c79d3c 3103
03cfe0d5 3104 _cleanup_free_ char *home = NULL;
b5ea030d 3105 char as_uuid[ID128_UUID_STRING_MAX];
88614c8a 3106 size_t n_env = 1;
03cfe0d5 3107 const char *envp[] = {
0c300adf 3108 "PATH=" DEFAULT_PATH_COMPAT,
6aadfa4c 3109 NULL, /* container */
03cfe0d5
LP
3110 NULL, /* TERM */
3111 NULL, /* HOME */
3112 NULL, /* USER */
3113 NULL, /* LOGNAME */
3114 NULL, /* container_uuid */
3115 NULL, /* LISTEN_FDS */
3116 NULL, /* LISTEN_PID */
9c1e04d0 3117 NULL, /* NOTIFY_SOCKET */
3652872a 3118 NULL, /* CREDENTIALS_DIRECTORY */
03cfe0d5
LP
3119 NULL
3120 };
1a68e1e5 3121 const char *exec_target;
2371271c 3122 _cleanup_strv_free_ char **env_use = NULL;
de40a303 3123 int r, which_failed;
88213476 3124
b37469d7
LP
3125 /* This is the "inner" child process, i.e. the one forked off by the "outer" child process, which is the one
3126 * the container manager itself forked off. At the time of clone() it gained its own CLONE_NEWNS, CLONE_NEWPID,
3127 * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER namespaces. Note that it has its own CLONE_NEWNS namespace,
3128 * separate from the CLONE_NEWNS created for the "outer" child, and also separate from the host's CLONE_NEWNS
3129 * namespace. The reason for having two levels of CLONE_NEWNS namespaces is that the "inner" one is owned by
3130 * the CLONE_NEWUSER namespace of the container, while the "outer" one is owned by the host's CLONE_NEWUSER
3131 * namespace.
3132 *
3133 * Note at this point we have no CLONE_NEWNET namespace yet. We'll acquire that one later through
3134 * unshare(). See below. */
3135
03cfe0d5
LP
3136 assert(barrier);
3137 assert(directory);
3138 assert(kmsg_socket >= 0);
88213476 3139
de40a303
LP
3140 log_debug("Inner child is initializing.");
3141
0de7acce 3142 if (arg_userns_mode != USER_NAMESPACE_NO) {
03cfe0d5
LP
3143 /* Tell the parent, that it now can write the UID map. */
3144 (void) barrier_place(barrier); /* #1 */
7027ff61 3145
03cfe0d5 3146 /* Wait until the parent wrote the UID map */
baaa35ad 3147 if (!barrier_place_and_sync(barrier)) /* #2 */
2a2e78e9 3148 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Parent died too early");
88213476 3149
2a2e78e9
LP
3150 /* Become the new root user inside our namespace */
3151 r = reset_uid_gid();
3152 if (r < 0)
3153 return log_error_errno(r, "Couldn't become new root: %m");
3154
3155 /* Creating a new user namespace means all MS_SHARED mounts become MS_SLAVE. Let's put them
3156 * back to MS_SHARED here, since that's what we want as defaults. (This will not reconnect
3157 * propagation, but simply create new peer groups for all our mounts). */
3158 r = mount_verbose(LOG_ERR, NULL, "/", NULL, MS_SHARED|MS_REC, NULL);
3159 if (r < 0)
3160 return r;
3161 }
6d66bd3b 3162
0de7acce 3163 r = mount_all(NULL,
4f086aab 3164 arg_mount_settings | MOUNT_IN_USERNS,
0de7acce 3165 arg_uid_shift,
0de7acce 3166 arg_selinux_apifs_context);
03cfe0d5
LP
3167 if (r < 0)
3168 return r;
3169
04413780
ZJS
3170 if (!arg_network_namespace_path && arg_private_network) {
3171 r = unshare(CLONE_NEWNET);
3172 if (r < 0)
3173 return log_error_errno(errno, "Failed to unshare network namespace: %m");
75116558
PS
3174
3175 /* Tell the parent that it can setup network interfaces. */
3176 (void) barrier_place(barrier); /* #3 */
04413780
ZJS
3177 }
3178
4f086aab 3179 r = mount_sysfs(NULL, arg_mount_settings);
d8fc6a00
LP
3180 if (r < 0)
3181 return r;
3182
03cfe0d5
LP
3183 /* Wait until we are cgroup-ified, so that we
3184 * can mount the right cgroup path writable */
baaa35ad
ZJS
3185 if (!barrier_place_and_sync(barrier)) /* #4 */
3186 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
3187 "Parent died too early");
88213476 3188
489fae52 3189 if (arg_use_cgns) {
0996ef00
CB
3190 r = unshare(CLONE_NEWCGROUP);
3191 if (r < 0)
04413780 3192 return log_error_errno(errno, "Failed to unshare cgroup namespace: %m");
0996ef00
CB
3193 r = mount_cgroups(
3194 "",
3195 arg_unified_cgroup_hierarchy,
3196 arg_userns_mode != USER_NAMESPACE_NO,
3197 arg_uid_shift,
3198 arg_uid_range,
5a8ff0e6 3199 arg_selinux_apifs_context,
ada54120 3200 true);
1433e0f2 3201 } else
0996ef00 3202 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
1433e0f2
LP
3203 if (r < 0)
3204 return r;
ec16945e 3205
1e4f1671 3206 r = setup_boot_id();
03cfe0d5
LP
3207 if (r < 0)
3208 return r;
ec16945e 3209
1e4f1671 3210 r = setup_kmsg(kmsg_socket);
03cfe0d5
LP
3211 if (r < 0)
3212 return r;
3213 kmsg_socket = safe_close(kmsg_socket);
ec16945e 3214
de40a303
LP
3215 r = mount_custom(
3216 "/",
3217 arg_custom_mounts,
3218 arg_n_custom_mounts,
de40a303
LP
3219 0,
3220 arg_selinux_apifs_context,
5f0a6347 3221 MOUNT_NON_ROOT_ONLY | MOUNT_IN_USERNS);
de40a303
LP
3222 if (r < 0)
3223 return r;
3224
03cfe0d5
LP
3225 if (setsid() < 0)
3226 return log_error_errno(errno, "setsid() failed: %m");
3227
3228 if (arg_private_network)
df883de9 3229 (void) loopback_setup();
03cfe0d5 3230
7a8f6325
LP
3231 if (arg_expose_ports) {
3232 r = expose_port_send_rtnl(rtnl_socket);
3233 if (r < 0)
3234 return r;
3235 rtnl_socket = safe_close(rtnl_socket);
3236 }
03cfe0d5 3237
3acc84eb 3238 if (arg_console_mode != CONSOLE_PIPE) {
cd132992 3239 _cleanup_close_ int master = -1;
3acc84eb
FB
3240 _cleanup_free_ char *console = NULL;
3241
3242 /* Allocate a pty and make it available as /dev/console. */
dc98caea 3243 master = openpt_allocate(O_RDWR|O_NONBLOCK, &console);
3acc84eb 3244 if (master < 0)
dc98caea 3245 return log_error_errno(master, "Failed to allocate a pty: %m");
3acc84eb
FB
3246
3247 r = setup_dev_console(console);
3248 if (r < 0)
105a1a36 3249 return log_error_errno(r, "Failed to set up /dev/console: %m");
3acc84eb
FB
3250
3251 r = send_one_fd(master_pty_socket, master, 0);
3252 if (r < 0)
3253 return log_error_errno(r, "Failed to send master fd: %m");
3254 master_pty_socket = safe_close(master_pty_socket);
3255
3256 r = setup_stdio_as_dev_console();
3257 if (r < 0)
3258 return r;
3259 }
3260
de40a303
LP
3261 r = patch_sysctl();
3262 if (r < 0)
3263 return r;
3264
81f345df
LP
3265 if (arg_oom_score_adjust_set) {
3266 r = set_oom_score_adjust(arg_oom_score_adjust);
3267 if (r < 0)
3268 return log_error_errno(r, "Failed to adjust OOM score: %m");
3269 }
3270
0985c7c4
ZJS
3271 if (arg_cpu_set.set)
3272 if (sched_setaffinity(0, arg_cpu_set.allocated, arg_cpu_set.set) < 0)
d107bb7d
LP
3273 return log_error_errno(errno, "Failed to set CPU affinity: %m");
3274
c818eef1 3275 (void) setup_hostname();
03cfe0d5 3276
050f7277 3277 if (arg_personality != PERSONALITY_INVALID) {
21022b9d
LP
3278 r = safe_personality(arg_personality);
3279 if (r < 0)
3280 return log_error_errno(r, "personality() failed: %m");
03cfe0d5 3281 } else if (secondary) {
21022b9d
LP
3282 r = safe_personality(PER_LINUX32);
3283 if (r < 0)
3284 return log_error_errno(r, "personality() failed: %m");
03cfe0d5
LP
3285 }
3286
de40a303
LP
3287 r = setrlimit_closest_all((const struct rlimit *const*) arg_rlimit, &which_failed);
3288 if (r < 0)
3289 return log_error_errno(r, "Failed to apply resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
3290
3291#if HAVE_SECCOMP
3292 if (arg_seccomp) {
3293
3294 if (is_seccomp_available()) {
3295
3296 r = seccomp_load(arg_seccomp);
7bc5e0b1 3297 if (ERRNO_IS_SECCOMP_FATAL(r))
de40a303
LP
3298 return log_error_errno(r, "Failed to install seccomp filter: %m");
3299 if (r < 0)
3300 log_debug_errno(r, "Failed to install seccomp filter: %m");
3301 }
3302 } else
3303#endif
3304 {
6b000af4 3305 r = setup_seccomp(arg_caps_retain, arg_syscall_allow_list, arg_syscall_deny_list);
de40a303
LP
3306 if (r < 0)
3307 return r;
3308 }
3309
349cc4a5 3310#if HAVE_SELINUX
03cfe0d5 3311 if (arg_selinux_context)
2ed96880 3312 if (setexeccon(arg_selinux_context) < 0)
03cfe0d5
LP
3313 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
3314#endif
3315
de40a303
LP
3316 /* Make sure we keep the caps across the uid/gid dropping, so that we can retain some selected caps
3317 * if we need to later on. */
3318 if (prctl(PR_SET_KEEPCAPS, 1) < 0)
3319 return log_error_errno(errno, "Failed to set PR_SET_KEEPCAPS: %m");
3320
3321 if (uid_is_valid(arg_uid) || gid_is_valid(arg_gid))
3322 r = change_uid_gid_raw(arg_uid, arg_gid, arg_supplementary_gids, arg_n_supplementary_gids);
3323 else
3324 r = change_uid_gid(arg_user, &home);
03cfe0d5
LP
3325 if (r < 0)
3326 return r;
3327
de40a303
LP
3328 r = drop_capabilities(getuid());
3329 if (r < 0)
3330 return log_error_errno(r, "Dropping capabilities failed: %m");
3331
66edd963
LP
3332 if (arg_no_new_privileges)
3333 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
3334 return log_error_errno(errno, "Failed to disable new privileges: %m");
3335
6aadfa4c
ILG
3336 /* LXC sets container=lxc, so follow the scheme here */
3337 envp[n_env++] = strjoina("container=", arg_container_service_name);
3338
03cfe0d5
LP
3339 envp[n_env] = strv_find_prefix(environ, "TERM=");
3340 if (envp[n_env])
313cefa1 3341 n_env++;
03cfe0d5 3342
de40a303
LP
3343 if (home || !uid_is_valid(arg_uid) || arg_uid == 0)
3344 if (asprintf((char**)(envp + n_env++), "HOME=%s", home ?: "/root") < 0)
3345 return log_oom();
3346
3347 if (arg_user || !uid_is_valid(arg_uid) || arg_uid == 0)
3348 if (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ?: "root") < 0 ||
3349 asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)
3350 return log_oom();
03cfe0d5 3351
3bbaff3e 3352 assert(!sd_id128_is_null(arg_uuid));
03cfe0d5 3353
691675ba 3354 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_to_uuid_string(arg_uuid, as_uuid)) < 0)
e01ff70a 3355 return log_oom();
03cfe0d5
LP
3356
3357 if (fdset_size(fds) > 0) {
3358 r = fdset_cloexec(fds, false);
3359 if (r < 0)
3360 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
3361
3362 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
3363 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
3364 return log_oom();
3365 }
9c1e04d0
AP
3366 if (asprintf((char **)(envp + n_env++), "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
3367 return log_oom();
03cfe0d5 3368
3652872a
LP
3369 if (arg_n_credentials > 0) {
3370 envp[n_env] = strdup("CREDENTIALS_DIRECTORY=/run/host/credentials");
3371 if (!envp[n_env])
3372 return log_oom();
3373 n_env++;
3374 }
3375
ed4512d0 3376 env_use = strv_env_merge(3, envp, os_release_pairs, arg_setenv);
2371271c
TG
3377 if (!env_use)
3378 return log_oom();
03cfe0d5
LP
3379
3380 /* Let the parent know that we are ready and
3381 * wait until the parent is ready with the
3382 * setup, too... */
baaa35ad
ZJS
3383 if (!barrier_place_and_sync(barrier)) /* #5 */
3384 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
3385 "Parent died too early");
03cfe0d5 3386
5f932eb9
LP
3387 if (arg_chdir)
3388 if (chdir(arg_chdir) < 0)
3389 return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
3390
7732f92b 3391 if (arg_start_mode == START_PID2) {
75bf701f 3392 r = stub_pid1(arg_uuid);
7732f92b
LP
3393 if (r < 0)
3394 return r;
3395 }
3396
de40a303
LP
3397 log_debug("Inner child completed, invoking payload.");
3398
8ca082b4
LP
3399 /* Now, explicitly close the log, so that we then can close all remaining fds. Closing the log explicitly first
3400 * has the benefit that the logging subsystem knows about it, and is thus ready to be reopened should we need
3401 * it again. Note that the other fds closed here are at least the locking and barrier fds. */
03cfe0d5 3402 log_close();
8ca082b4
LP
3403 log_set_open_when_needed(true);
3404
03cfe0d5
LP
3405 (void) fdset_close_others(fds);
3406
7732f92b 3407 if (arg_start_mode == START_BOOT) {
03cfe0d5
LP
3408 char **a;
3409 size_t m;
3410
3411 /* Automatically search for the init system */
3412
75f32f04
ZJS
3413 m = strv_length(arg_parameters);
3414 a = newa(char*, m + 2);
3415 memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
3416 a[1 + m] = NULL;
03cfe0d5 3417
ced58da7 3418 a[0] = (char*) "/usr/lib/systemd/systemd";
03cfe0d5
LP
3419 execve(a[0], a, env_use);
3420
ced58da7 3421 a[0] = (char*) "/lib/systemd/systemd";
03cfe0d5
LP
3422 execve(a[0], a, env_use);
3423
ced58da7 3424 a[0] = (char*) "/sbin/init";
03cfe0d5 3425 execve(a[0], a, env_use);
ced58da7
LP
3426
3427 exec_target = "/usr/lib/systemd/systemd, /lib/systemd/systemd, /sbin/init";
1a68e1e5 3428 } else if (!strv_isempty(arg_parameters)) {
b6b180b7
LP
3429 const char *dollar_path;
3430
1a68e1e5 3431 exec_target = arg_parameters[0];
b6b180b7
LP
3432
3433 /* Use the user supplied search $PATH if there is one, or DEFAULT_PATH_COMPAT if not to search the
3434 * binary. */
3435 dollar_path = strv_env_get(env_use, "PATH");
3436 if (dollar_path) {
6f646e01 3437 if (setenv("PATH", dollar_path, 1) < 0)
b6b180b7
LP
3438 return log_error_errno(errno, "Failed to update $PATH: %m");
3439 }
3440
f757855e 3441 execvpe(arg_parameters[0], arg_parameters, env_use);
1a68e1e5 3442 } else {
5f932eb9 3443 if (!arg_chdir)
d929b0f9
ZJS
3444 /* If we cannot change the directory, we'll end up in /, that is expected. */
3445 (void) chdir(home ?: "/root");
5f932eb9 3446
03cfe0d5
LP
3447 execle("/bin/bash", "-bash", NULL, env_use);
3448 execle("/bin/sh", "-sh", NULL, env_use);
ced58da7
LP
3449
3450 exec_target = "/bin/bash, /bin/sh";
03cfe0d5
LP
3451 }
3452
8ca082b4 3453 return log_error_errno(errno, "execv(%s) failed: %m", exec_target);
03cfe0d5
LP
3454}
3455
e96ceaba 3456static int setup_notify_child(void) {
271f518f 3457 _cleanup_close_ int fd = -1;
9c1e04d0 3458 union sockaddr_union sa = {
44ed5214
LP
3459 .un.sun_family = AF_UNIX,
3460 .un.sun_path = NSPAWN_NOTIFY_SOCKET_PATH,
9c1e04d0
AP
3461 };
3462 int r;
3463
3464 fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
3465 if (fd < 0)
3466 return log_error_errno(errno, "Failed to allocate notification socket: %m");
3467
3468 (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
fbda85b0 3469 (void) sockaddr_un_unlink(&sa.un);
9c1e04d0 3470
9c1e04d0 3471 r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
271f518f 3472 if (r < 0)
44ed5214 3473 return log_error_errno(errno, "bind(" NSPAWN_NOTIFY_SOCKET_PATH ") failed: %m");
9c1e04d0 3474
adc7d9f0 3475 r = userns_lchown(NSPAWN_NOTIFY_SOCKET_PATH, 0, 0);
271f518f 3476 if (r < 0)
adc7d9f0 3477 return log_error_errno(r, "Failed to chown " NSPAWN_NOTIFY_SOCKET_PATH ": %m");
adc7d9f0 3478
2ff48e98 3479 r = setsockopt_int(fd, SOL_SOCKET, SO_PASSCRED, true);
271f518f 3480 if (r < 0)
2ff48e98 3481 return log_error_errno(r, "SO_PASSCRED failed: %m");
9c1e04d0 3482
271f518f 3483 return TAKE_FD(fd);
9c1e04d0
AP
3484}
3485
03cfe0d5
LP
3486static int outer_child(
3487 Barrier *barrier,
3488 const char *directory,
2d845785 3489 DissectedImage *dissected_image,
03cfe0d5
LP
3490 bool secondary,
3491 int pid_socket,
e01ff70a 3492 int uuid_socket,
9c1e04d0 3493 int notify_socket,
03cfe0d5
LP
3494 int kmsg_socket,
3495 int rtnl_socket,
825d5287 3496 int uid_shift_socket,
3acc84eb 3497 int master_pty_socket,
8199d554 3498 int unified_cgroup_hierarchy_socket,
d7bea6b6
DP
3499 FDSet *fds,
3500 int netns_fd) {
03cfe0d5 3501
e1bb4b0d 3502 _cleanup_strv_free_ char **os_release_pairs = NULL;
bf428efb 3503 _cleanup_close_ int fd = -1;
e5f10caf 3504 const char *p;
03cfe0d5
LP
3505 pid_t pid;
3506 ssize_t l;
de40a303 3507 int r;
03cfe0d5 3508
b37469d7
LP
3509 /* This is the "outer" child process, i.e the one forked off by the container manager itself. It already has
3510 * its own CLONE_NEWNS namespace (which was created by the clone()). It still lives in the host's CLONE_NEWPID,
3511 * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER and CLONE_NEWNET namespaces. After it completed a number of
3512 * initializations a second child (the "inner" one) is forked off it, and it exits. */
3513
03cfe0d5
LP
3514 assert(barrier);
3515 assert(directory);
03cfe0d5 3516 assert(pid_socket >= 0);
e01ff70a 3517 assert(uuid_socket >= 0);
9c1e04d0 3518 assert(notify_socket >= 0);
3acc84eb 3519 assert(master_pty_socket >= 0);
03cfe0d5
LP
3520 assert(kmsg_socket >= 0);
3521
de40a303
LP
3522 log_debug("Outer child is initializing.");
3523
e1bb4b0d
LB
3524 r = load_os_release_pairs_with_prefix("/", "container_host_", &os_release_pairs);
3525 if (r < 0)
3526 log_debug_errno(r, "Failed to read os-release from host for container, ignoring: %m");
3527
03cfe0d5
LP
3528 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
3529 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3530
03cfe0d5
LP
3531 r = reset_audit_loginuid();
3532 if (r < 0)
3533 return r;
3534
2a2e78e9
LP
3535 /* Mark everything as slave, so that we still receive mounts from the real root, but don't propagate
3536 * mounts to the real root. */
60e76d48
ZJS
3537 r = mount_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
3538 if (r < 0)
3539 return r;
03cfe0d5 3540
2d845785 3541 if (dissected_image) {
2d3a5a73
LP
3542 /* If we are operating on a disk image, then mount its root directory now, but leave out the rest. We
3543 * can read the UID shift from it if we need to. Further down we'll mount the rest, but then with the
3544 * uid shift known. That way we can mount VFAT file systems shifted to the right place right away. This
3545 * makes sure ESP partitions and userns are compatible. */
3546
af187ab2
LP
3547 r = dissected_image_mount_and_warn(
3548 dissected_image, directory, arg_uid_shift,
3549 DISSECT_IMAGE_MOUNT_ROOT_ONLY|DISSECT_IMAGE_DISCARD_ON_LOOP|
3550 (arg_read_only ? DISSECT_IMAGE_READ_ONLY : DISSECT_IMAGE_FSCK)|
3551 (arg_start_mode == START_BOOT ? DISSECT_IMAGE_VALIDATE_OS : 0));
2d845785 3552 if (r < 0)
af187ab2 3553 return r;
2d845785 3554 }
03cfe0d5 3555
391567f4
LP
3556 r = determine_uid_shift(directory);
3557 if (r < 0)
3558 return r;
3559
0de7acce 3560 if (arg_userns_mode != USER_NAMESPACE_NO) {
0e7ac751 3561 /* Let the parent know which UID shift we read from the image */
825d5287
RM
3562 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
3563 if (l < 0)
3564 return log_error_errno(errno, "Failed to send UID shift: %m");
baaa35ad
ZJS
3565 if (l != sizeof(arg_uid_shift))
3566 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3567 "Short write while sending UID shift.");
0e7ac751 3568
0de7acce 3569 if (arg_userns_mode == USER_NAMESPACE_PICK) {
0e7ac751
LP
3570 /* When we are supposed to pick the UID shift, the parent will check now whether the UID shift
3571 * we just read from the image is available. If yes, it will send the UID shift back to us, if
3572 * not it will pick a different one, and send it back to us. */
3573
3574 l = recv(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
3575 if (l < 0)
3576 return log_error_errno(errno, "Failed to recv UID shift: %m");
baaa35ad
ZJS
3577 if (l != sizeof(arg_uid_shift))
3578 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3579 "Short read while receiving UID shift.");
0e7ac751
LP
3580 }
3581
ff6c6cc1
LP
3582 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
3583 "Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
825d5287
RM
3584 }
3585
6f83d3d1
LP
3586 if (path_equal(directory, "/")) {
3587 /* If the directory we shall boot is the host, let's operate on a bind mount at a different
3588 * place, so that we can make changes to its mount structure (for example, to implement
3589 * --volatile=) without this interfering with our ability to access files such as
3590 * /etc/localtime to copy into the container. Note that we use a fixed place for this
3591 * (instead of a temporary directory, since we are living in our own mount namspace here
3592 * already, and thus don't need to be afraid of colliding with anyone else's mounts).*/
3593 (void) mkdir_p("/run/systemd/nspawn-root", 0755);
3594
3595 r = mount_verbose(LOG_ERR, "/", "/run/systemd/nspawn-root", NULL, MS_BIND|MS_REC, NULL);
3596 if (r < 0)
3597 return r;
3598
3599 directory = "/run/systemd/nspawn-root";
e50cd82f 3600 }
7d0ecdd6
LP
3601
3602 r = setup_pivot_root(
3603 directory,
3604 arg_pivot_root_new,
3605 arg_pivot_root_old);
3606 if (r < 0)
3607 return r;
3608
3609 r = setup_volatile_mode(
3610 directory,
3611 arg_volatile_mode,
7d0ecdd6 3612 arg_uid_shift,
8f1ed04a 3613 arg_selinux_apifs_context);
7d0ecdd6
LP
3614 if (r < 0)
3615 return r;
3616
5f0a6347
DDM
3617 r = mount_custom(
3618 directory,
3619 arg_custom_mounts,
3620 arg_n_custom_mounts,
5f0a6347 3621 arg_uid_shift,
5f0a6347
DDM
3622 arg_selinux_apifs_context,
3623 MOUNT_ROOT_ONLY);
3624 if (r < 0)
3625 return r;
3626
5530dc87
DDM
3627 /* Make sure we always have a mount that we can move to root later on. */
3628 if (!path_is_mount_point(directory, NULL, 0)) {
3629 r = mount_verbose(LOG_ERR, directory, directory, NULL, MS_BIND|MS_REC, NULL);
3630 if (r < 0)
3631 return r;
3632 }
3633
2d3a5a73
LP
3634 if (dissected_image) {
3635 /* Now we know the uid shift, let's now mount everything else that might be in the image. */
3636 r = dissected_image_mount(dissected_image, directory, arg_uid_shift,
4fcb96ce
LP
3637 DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY|DISSECT_IMAGE_DISCARD_ON_LOOP|(arg_read_only ? DISSECT_IMAGE_READ_ONLY : DISSECT_IMAGE_FSCK));
3638 if (r == -EUCLEAN)
3639 return log_error_errno(r, "File system check for image failed: %m");
2d3a5a73 3640 if (r < 0)
4fcb96ce 3641 return log_error_errno(r, "Failed to mount image file system: %m");
2d3a5a73
LP
3642 }
3643
8199d554
LP
3644 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
3645 /* OK, we don't know yet which cgroup mode to use yet. Let's figure it out, and tell the parent. */
3646
3647 r = detect_unified_cgroup_hierarchy_from_image(directory);
3648 if (r < 0)
3649 return r;
3650
3651 l = send(unified_cgroup_hierarchy_socket, &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), MSG_NOSIGNAL);
3652 if (l < 0)
3653 return log_error_errno(errno, "Failed to send cgroup mode: %m");
baaa35ad
ZJS
3654 if (l != sizeof(arg_unified_cgroup_hierarchy))
3655 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3656 "Short write while sending cgroup mode.");
8199d554
LP
3657
3658 unified_cgroup_hierarchy_socket = safe_close(unified_cgroup_hierarchy_socket);
3659 }
3660
4ad14eff
LP
3661 /* Mark everything as shared so our mounts get propagated down. This is
3662 * required to make new bind mounts available in systemd services
5238e957 3663 * inside the container that create a new mount namespace.
4ad14eff
LP
3664 * See https://github.com/systemd/systemd/issues/3860
3665 * Further submounts (such as /dev) done after this will inherit the
5f0a6347
DDM
3666 * shared propagation mode.
3667 *
3668 * IMPORTANT: Do not overmount the root directory anymore from now on to
3669 * enable moving the root directory mount to root later on.
3670 * https://github.com/systemd/systemd/issues/3847#issuecomment-562735251
3671 */
4ad14eff
LP
3672 r = mount_verbose(LOG_ERR, NULL, directory, NULL, MS_SHARED|MS_REC, NULL);
3673 if (r < 0)
3674 return r;
3675
3676 r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
3677 if (r < 0)
3678 return r;
3679
03cfe0d5
LP
3680 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
3681 if (r < 0)
3682 return r;
3683
bbd407ea
DDM
3684 if (arg_read_only && arg_volatile_mode == VOLATILE_NO &&
3685 !has_custom_root_mount(arg_custom_mounts, arg_n_custom_mounts)) {
64e82c19 3686 r = bind_remount_recursive(directory, MS_RDONLY, MS_RDONLY, NULL);
03cfe0d5
LP
3687 if (r < 0)
3688 return log_error_errno(r, "Failed to make tree read-only: %m");
3689 }
3690
0de7acce 3691 r = mount_all(directory,
4f086aab 3692 arg_mount_settings,
0de7acce 3693 arg_uid_shift,
0de7acce 3694 arg_selinux_apifs_context);
03cfe0d5
LP
3695 if (r < 0)
3696 return r;
3697
07fa00f9
LP
3698 r = copy_devnodes(directory);
3699 if (r < 0)
03cfe0d5
LP
3700 return r;
3701
de40a303
LP
3702 r = make_extra_nodes(directory);
3703 if (r < 0)
3704 return r;
3705
3706 (void) dev_setup(directory, arg_uid_shift, arg_uid_shift);
e5f10caf 3707
9fac5029 3708 p = prefix_roota(directory, "/run/host");
e5f10caf 3709 (void) make_inaccessible_nodes(p, arg_uid_shift, arg_uid_shift);
03cfe0d5 3710
07fa00f9
LP
3711 r = setup_pts(directory);
3712 if (r < 0)
03cfe0d5
LP
3713 return r;
3714
3715 r = setup_propagate(directory);
3716 if (r < 0)
3717 return r;
3718
8e5430c4
LP
3719 r = setup_keyring();
3720 if (r < 0)
3721 return r;
3722
3652872a
LP
3723 r = setup_credentials(directory);
3724 if (r < 0)
3725 return r;
3726
5c4deb9a
MJ
3727 r = mount_custom(
3728 directory,
3729 arg_custom_mounts,
3730 arg_n_custom_mounts,
3731 arg_uid_shift,
3732 arg_selinux_apifs_context,
3733 MOUNT_NON_ROOT_ONLY);
3734 if (r < 0)
3735 return r;
3736
03cfe0d5
LP
3737 r = setup_timezone(directory);
3738 if (r < 0)
3739 return r;
3740
3741 r = setup_resolv_conf(directory);
3742 if (r < 0)
3743 return r;
3744
e01ff70a
MS
3745 r = setup_machine_id(directory);
3746 if (r < 0)
3747 return r;
3748
03cfe0d5
LP
3749 r = setup_journal(directory);
3750 if (r < 0)
3751 return r;
3752
0f48ba7b
LP
3753 /* The same stuff as the $container env var, but nicely readable for the entire payload */
3754 p = prefix_roota(directory, "/run/host/container-manager");
3755 (void) write_string_file(p, arg_container_service_name, WRITE_STRING_FILE_CREATE);
3756
3757 /* The same stuff as the $container_uuid env var */
3758 p = prefix_roota(directory, "/run/host/container-uuid");
3759 (void) write_string_filef(p, WRITE_STRING_FILE_CREATE, SD_ID128_UUID_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid));
3760
489fae52 3761 if (!arg_use_cgns) {
0996ef00
CB
3762 r = mount_cgroups(
3763 directory,
3764 arg_unified_cgroup_hierarchy,
3765 arg_userns_mode != USER_NAMESPACE_NO,
3766 arg_uid_shift,
3767 arg_uid_range,
5a8ff0e6 3768 arg_selinux_apifs_context,
ada54120 3769 false);
0996ef00
CB
3770 if (r < 0)
3771 return r;
3772 }
03cfe0d5
LP
3773
3774 r = mount_move_root(directory);
3775 if (r < 0)
3776 return log_error_errno(r, "Failed to move root directory: %m");
3777
e96ceaba 3778 fd = setup_notify_child();
9c1e04d0
AP
3779 if (fd < 0)
3780 return fd;
3781
03cfe0d5 3782 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
0c582db0 3783 arg_clone_ns_flags |
8869a0b4 3784 (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
03cfe0d5
LP
3785 if (pid < 0)
3786 return log_error_errno(errno, "Failed to fork inner child: %m");
03cfe0d5
LP
3787 if (pid == 0) {
3788 pid_socket = safe_close(pid_socket);
e01ff70a 3789 uuid_socket = safe_close(uuid_socket);
9c1e04d0 3790 notify_socket = safe_close(notify_socket);
825d5287 3791 uid_shift_socket = safe_close(uid_shift_socket);
03cfe0d5 3792
2a2e78e9
LP
3793 /* The inner child has all namespaces that are requested, so that we all are owned by the
3794 * user if user namespaces are turned on. */
03cfe0d5 3795
d7bea6b6
DP
3796 if (arg_network_namespace_path) {
3797 r = namespace_enter(-1, -1, netns_fd, -1, -1);
3798 if (r < 0)
e2d39e54 3799 return log_error_errno(r, "Failed to join network namespace: %m");
d7bea6b6
DP
3800 }
3801
e1bb4b0d 3802 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, master_pty_socket, fds, os_release_pairs);
03cfe0d5
LP
3803 if (r < 0)
3804 _exit(EXIT_FAILURE);
3805
3806 _exit(EXIT_SUCCESS);
3807 }
3808
3809 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
3810 if (l < 0)
3811 return log_error_errno(errno, "Failed to send PID: %m");
baaa35ad
ZJS
3812 if (l != sizeof(pid))
3813 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3814 "Short write while sending PID.");
03cfe0d5 3815
e01ff70a
MS
3816 l = send(uuid_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
3817 if (l < 0)
3818 return log_error_errno(errno, "Failed to send machine ID: %m");
baaa35ad
ZJS
3819 if (l != sizeof(arg_uuid))
3820 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3821 "Short write while sending machine ID.");
e01ff70a 3822
9c1e04d0
AP
3823 l = send_one_fd(notify_socket, fd, 0);
3824 if (l < 0)
ba72801d 3825 return log_error_errno(l, "Failed to send notify fd: %m");
9c1e04d0 3826
03cfe0d5 3827 pid_socket = safe_close(pid_socket);
e01ff70a 3828 uuid_socket = safe_close(uuid_socket);
9c1e04d0 3829 notify_socket = safe_close(notify_socket);
3acc84eb 3830 master_pty_socket = safe_close(master_pty_socket);
327e26d6
KN
3831 kmsg_socket = safe_close(kmsg_socket);
3832 rtnl_socket = safe_close(rtnl_socket);
d7bea6b6 3833 netns_fd = safe_close(netns_fd);
03cfe0d5
LP
3834
3835 return 0;
3836}
3837
0e7ac751 3838static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
d381c8a6 3839 bool tried_hashed = false;
0e7ac751
LP
3840 unsigned n_tries = 100;
3841 uid_t candidate;
3842 int r;
3843
3844 assert(shift);
3845 assert(ret_lock_file);
0de7acce 3846 assert(arg_userns_mode == USER_NAMESPACE_PICK);
0e7ac751
LP
3847 assert(arg_uid_range == 0x10000U);
3848
3849 candidate = *shift;
3850
3851 (void) mkdir("/run/systemd/nspawn-uid", 0755);
3852
3853 for (;;) {
fbd0b64f 3854 char lock_path[STRLEN("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
8e766630 3855 _cleanup_(release_lock_file) LockFile lf = LOCK_FILE_INIT;
0e7ac751
LP
3856
3857 if (--n_tries <= 0)
3858 return -EBUSY;
3859
87d5e4f2 3860 if (candidate < CONTAINER_UID_BASE_MIN || candidate > CONTAINER_UID_BASE_MAX)
0e7ac751
LP
3861 goto next;
3862 if ((candidate & UINT32_C(0xFFFF)) != 0)
3863 goto next;
3864
3865 xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
3866 r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
3867 if (r == -EBUSY) /* Range already taken by another nspawn instance */
3868 goto next;
3869 if (r < 0)
3870 return r;
3871
3872 /* Make some superficial checks whether the range is currently known in the user database */
3873 if (getpwuid(candidate))
3874 goto next;
3875 if (getpwuid(candidate + UINT32_C(0xFFFE)))
3876 goto next;
3877 if (getgrgid(candidate))
3878 goto next;
3879 if (getgrgid(candidate + UINT32_C(0xFFFE)))
3880 goto next;
3881
3882 *ret_lock_file = lf;
3883 lf = (struct LockFile) LOCK_FILE_INIT;
3884 *shift = candidate;
3885 return 0;
3886
3887 next:
d381c8a6
LP
3888 if (arg_machine && !tried_hashed) {
3889 /* Try to hash the base from the container name */
3890
3891 static const uint8_t hash_key[] = {
3892 0xe1, 0x56, 0xe0, 0xf0, 0x4a, 0xf0, 0x41, 0xaf,
3893 0x96, 0x41, 0xcf, 0x41, 0x33, 0x94, 0xff, 0x72
3894 };
3895
3896 candidate = (uid_t) siphash24(arg_machine, strlen(arg_machine), hash_key);
3897
3898 tried_hashed = true;
3899 } else
3900 random_bytes(&candidate, sizeof(candidate));
3901
87d5e4f2 3902 candidate = (candidate % (CONTAINER_UID_BASE_MAX - CONTAINER_UID_BASE_MIN)) + CONTAINER_UID_BASE_MIN;
0e7ac751
LP
3903 candidate &= (uid_t) UINT32_C(0xFFFF0000);
3904 }
3905}
3906
03cfe0d5 3907static int setup_uid_map(pid_t pid) {
fbd0b64f 3908 char uid_map[STRLEN("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
03cfe0d5
LP
3909 int r;
3910
3911 assert(pid > 1);
3912
3913 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
3914 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
57512c89 3915 r = write_string_file(uid_map, line, WRITE_STRING_FILE_DISABLE_BUFFER);
03cfe0d5
LP
3916 if (r < 0)
3917 return log_error_errno(r, "Failed to write UID map: %m");
3918
3919 /* We always assign the same UID and GID ranges */
3920 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
57512c89 3921 r = write_string_file(uid_map, line, WRITE_STRING_FILE_DISABLE_BUFFER);
03cfe0d5
LP
3922 if (r < 0)
3923 return log_error_errno(r, "Failed to write GID map: %m");
3924
3925 return 0;
3926}
3927
9c1e04d0 3928static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
9c1e04d0
AP
3929 char buf[NOTIFY_BUFFER_MAX+1];
3930 char *p = NULL;
3931 struct iovec iovec = {
3932 .iov_base = buf,
3933 .iov_len = sizeof(buf)-1,
3934 };
fb29cdbe
LP
3935 CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred)) +
3936 CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)) control;
9c1e04d0
AP
3937 struct msghdr msghdr = {
3938 .msg_iov = &iovec,
3939 .msg_iovlen = 1,
3940 .msg_control = &control,
3941 .msg_controllen = sizeof(control),
3942 };
371d72e0 3943 struct ucred *ucred;
9c1e04d0
AP
3944 ssize_t n;
3945 pid_t inner_child_pid;
3946 _cleanup_strv_free_ char **tags = NULL;
3947
3948 assert(userdata);
3949
3950 inner_child_pid = PTR_TO_PID(userdata);
3951
3952 if (revents != EPOLLIN) {
3953 log_warning("Got unexpected poll event for notify fd.");
3954 return 0;
3955 }
3956
3691bcf3
LP
3957 n = recvmsg_safe(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
3958 if (IN_SET(n, -EAGAIN, -EINTR))
3959 return 0;
3960 if (n < 0)
3961 return log_warning_errno(n, "Couldn't read notification socket: %m");
9c1e04d0 3962
9c1e04d0
AP
3963 cmsg_close_all(&msghdr);
3964
371d72e0 3965 ucred = CMSG_FIND_DATA(&msghdr, SOL_SOCKET, SCM_CREDENTIALS, struct ucred);
9c1e04d0 3966 if (!ucred || ucred->pid != inner_child_pid) {
8cb57430 3967 log_debug("Received notify message without valid credentials. Ignoring.");
9c1e04d0
AP
3968 return 0;
3969 }
3970
3971 if ((size_t) n >= sizeof(buf)) {
3972 log_warning("Received notify message exceeded maximum size. Ignoring.");
3973 return 0;
3974 }
3975
3976 buf[n] = 0;
3977 tags = strv_split(buf, "\n\r");
3978 if (!tags)
3979 return log_oom();
3980
3981 if (strv_find(tags, "READY=1"))
04f590a4 3982 (void) sd_notifyf(false, "READY=1\n");
9c1e04d0
AP
3983
3984 p = strv_find_startswith(tags, "STATUS=");
3985 if (p)
04f590a4 3986 (void) sd_notifyf(false, "STATUS=Container running: %s", p);
9c1e04d0
AP
3987
3988 return 0;
3989}
3990
e96ceaba 3991static int setup_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid, sd_event_source **notify_event_source) {
9c1e04d0 3992 int r;
9c1e04d0 3993
5773024d 3994 r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
9c1e04d0
AP
3995 if (r < 0)
3996 return log_error_errno(r, "Failed to allocate notify event source: %m");
3997
5773024d 3998 (void) sd_event_source_set_description(*notify_event_source, "nspawn-notify");
9c1e04d0
AP
3999
4000 return 0;
4001}
4002
5d961407
LP
4003static int merge_settings(Settings *settings, const char *path) {
4004 int rl;
f757855e 4005
5d961407
LP
4006 assert(settings);
4007 assert(path);
f757855e 4008
5d961407
LP
4009 /* Copy over bits from the settings, unless they have been explicitly masked by command line switches. Note
4010 * that this steals the fields of the Settings* structure, and hence modifies it. */
f757855e 4011
7732f92b
LP
4012 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
4013 settings->start_mode >= 0) {
4014 arg_start_mode = settings->start_mode;
130d3d22 4015 strv_free_and_replace(arg_parameters, settings->parameters);
f757855e
LP
4016 }
4017
a2f577fc
JL
4018 if ((arg_settings_mask & SETTING_EPHEMERAL) == 0)
4019 arg_ephemeral = settings->ephemeral;
4020
de40a303
LP
4021 if ((arg_settings_mask & SETTING_DIRECTORY) == 0 &&
4022 settings->root) {
4023
4024 if (!arg_settings_trusted)
4025 log_warning("Ignoring root directory setting, file %s is not trusted.", path);
4026 else
4027 free_and_replace(arg_directory, settings->root);
4028 }
4029
b53ede69
PW
4030 if ((arg_settings_mask & SETTING_PIVOT_ROOT) == 0 &&
4031 settings->pivot_root_new) {
4032 free_and_replace(arg_pivot_root_new, settings->pivot_root_new);
4033 free_and_replace(arg_pivot_root_old, settings->pivot_root_old);
4034 }
4035
5f932eb9 4036 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
1cc6c93a
YW
4037 settings->working_directory)
4038 free_and_replace(arg_chdir, settings->working_directory);
5f932eb9 4039
f757855e 4040 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
130d3d22
YW
4041 settings->environment)
4042 strv_free_and_replace(arg_setenv, settings->environment);
f757855e 4043
de40a303
LP
4044 if ((arg_settings_mask & SETTING_USER) == 0) {
4045
4046 if (settings->user)
4047 free_and_replace(arg_user, settings->user);
4048
4049 if (uid_is_valid(settings->uid))
4050 arg_uid = settings->uid;
4051 if (gid_is_valid(settings->gid))
4052 arg_gid = settings->gid;
4053 if (settings->n_supplementary_gids > 0) {
4054 free_and_replace(arg_supplementary_gids, settings->supplementary_gids);
4055 arg_n_supplementary_gids = settings->n_supplementary_gids;
4056 }
4057 }
f757855e
LP
4058
4059 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
a3fc6b55 4060 uint64_t plus, minus;
7be830c6 4061 uint64_t network_minus = 0;
f757855e 4062
de40a303
LP
4063 /* Note that we copy both the simple plus/minus caps here, and the full quintet from the
4064 * Settings structure */
4065
0e265674 4066 plus = settings->capability;
a3fc6b55
LP
4067 minus = settings->drop_capability;
4068
4069 if ((arg_settings_mask & SETTING_NETWORK) == 0) {
4070 if (settings_private_network(settings))
4071 plus |= UINT64_C(1) << CAP_NET_ADMIN;
4072 else
7be830c6 4073 network_minus |= UINT64_C(1) << CAP_NET_ADMIN;
a3fc6b55 4074 }
0e265674
LP
4075
4076 if (!arg_settings_trusted && plus != 0) {
4077 if (settings->capability != 0)
5d961407 4078 log_warning("Ignoring Capability= setting, file %s is not trusted.", path);
7be830c6
TH
4079 } else {
4080 arg_caps_retain &= ~network_minus;
520e0d54 4081 arg_caps_retain |= plus;
7be830c6 4082 }
f757855e 4083
a3fc6b55 4084 arg_caps_retain &= ~minus;
de40a303
LP
4085
4086 /* Copy the full capabilities over too */
4087 if (capability_quintet_is_set(&settings->full_capabilities)) {
4088 if (!arg_settings_trusted)
5238e957 4089 log_warning("Ignoring capability settings, file %s is not trusted.", path);
de40a303
LP
4090 else
4091 arg_full_capabilities = settings->full_capabilities;
4092 }
f757855e
LP
4093 }
4094
4095 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
4096 settings->kill_signal > 0)
4097 arg_kill_signal = settings->kill_signal;
4098
4099 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
4100 settings->personality != PERSONALITY_INVALID)
4101 arg_personality = settings->personality;
4102
4103 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
4104 !sd_id128_is_null(settings->machine_id)) {
4105
4106 if (!arg_settings_trusted)
5d961407 4107 log_warning("Ignoring MachineID= setting, file %s is not trusted.", path);
f757855e
LP
4108 else
4109 arg_uuid = settings->machine_id;
4110 }
4111
4112 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
4113 settings->read_only >= 0)
4114 arg_read_only = settings->read_only;
4115
4116 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
4117 settings->volatile_mode != _VOLATILE_MODE_INVALID)
4118 arg_volatile_mode = settings->volatile_mode;
4119
4120 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
4121 settings->n_custom_mounts > 0) {
4122
4123 if (!arg_settings_trusted)
5d961407 4124 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", path);
f757855e
LP
4125 else {
4126 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
1cc6c93a 4127 arg_custom_mounts = TAKE_PTR(settings->custom_mounts);
f757855e 4128 arg_n_custom_mounts = settings->n_custom_mounts;
f757855e
LP
4129 settings->n_custom_mounts = 0;
4130 }
4131 }
4132
4133 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
4134 (settings->private_network >= 0 ||
4135 settings->network_veth >= 0 ||
4136 settings->network_bridge ||
22b28dfd 4137 settings->network_zone ||
f757855e
LP
4138 settings->network_interfaces ||
4139 settings->network_macvlan ||
f6d6bad1 4140 settings->network_ipvlan ||
de40a303
LP
4141 settings->network_veth_extra ||
4142 settings->network_namespace_path)) {
f757855e
LP
4143
4144 if (!arg_settings_trusted)
5d961407 4145 log_warning("Ignoring network settings, file %s is not trusted.", path);
f757855e 4146 else {
f6d6bad1 4147 arg_network_veth = settings_network_veth(settings);
0e265674
LP
4148 arg_private_network = settings_private_network(settings);
4149
130d3d22
YW
4150 strv_free_and_replace(arg_network_interfaces, settings->network_interfaces);
4151 strv_free_and_replace(arg_network_macvlan, settings->network_macvlan);
4152 strv_free_and_replace(arg_network_ipvlan, settings->network_ipvlan);
4153 strv_free_and_replace(arg_network_veth_extra, settings->network_veth_extra);
f6d6bad1 4154
1cc6c93a
YW
4155 free_and_replace(arg_network_bridge, settings->network_bridge);
4156 free_and_replace(arg_network_zone, settings->network_zone);
de40a303
LP
4157
4158 free_and_replace(arg_network_namespace_path, settings->network_namespace_path);
f757855e
LP
4159 }
4160 }
4161
4162 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
4163 settings->expose_ports) {
4164
4165 if (!arg_settings_trusted)
5d961407 4166 log_warning("Ignoring Port= setting, file %s is not trusted.", path);
f757855e
LP
4167 else {
4168 expose_port_free_all(arg_expose_ports);
1cc6c93a 4169 arg_expose_ports = TAKE_PTR(settings->expose_ports);
f757855e
LP
4170 }
4171 }
4172
0de7acce
LP
4173 if ((arg_settings_mask & SETTING_USERNS) == 0 &&
4174 settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
4175
4176 if (!arg_settings_trusted)
5d961407 4177 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", path);
0de7acce
LP
4178 else {
4179 arg_userns_mode = settings->userns_mode;
4180 arg_uid_shift = settings->uid_shift;
4181 arg_uid_range = settings->uid_range;
4182 arg_userns_chown = settings->userns_chown;
4183 }
4184 }
4185
9c1e04d0
AP
4186 if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0)
4187 arg_notify_ready = settings->notify_ready;
4188
960e4569
LP
4189 if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) {
4190
6b000af4 4191 if (!arg_settings_trusted && !strv_isempty(settings->syscall_allow_list))
5d961407 4192 log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", path);
960e4569 4193 else {
6b000af4
LP
4194 strv_free_and_replace(arg_syscall_allow_list, settings->syscall_allow_list);
4195 strv_free_and_replace(arg_syscall_deny_list, settings->syscall_deny_list);
960e4569 4196 }
de40a303
LP
4197
4198#if HAVE_SECCOMP
4199 if (!arg_settings_trusted && settings->seccomp)
4200 log_warning("Ignoring SECCOMP filter, file %s is not trusted.", path);
4201 else {
4202 seccomp_release(arg_seccomp);
4203 arg_seccomp = TAKE_PTR(settings->seccomp);
4204 }
4205#endif
960e4569
LP
4206 }
4207
bf428efb
LP
4208 for (rl = 0; rl < _RLIMIT_MAX; rl ++) {
4209 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)))
4210 continue;
4211
4212 if (!settings->rlimit[rl])
4213 continue;
4214
4215 if (!arg_settings_trusted) {
5d961407 4216 log_warning("Ignoring Limit%s= setting, file '%s' is not trusted.", rlimit_to_string(rl), path);
bf428efb
LP
4217 continue;
4218 }
4219
4220 free_and_replace(arg_rlimit[rl], settings->rlimit[rl]);
4221 }
4222
3a9530e5
LP
4223 if ((arg_settings_mask & SETTING_HOSTNAME) == 0 &&
4224 settings->hostname)
4225 free_and_replace(arg_hostname, settings->hostname);
4226
66edd963
LP
4227 if ((arg_settings_mask & SETTING_NO_NEW_PRIVILEGES) == 0 &&
4228 settings->no_new_privileges >= 0)
4229 arg_no_new_privileges = settings->no_new_privileges;
4230
81f345df
LP
4231 if ((arg_settings_mask & SETTING_OOM_SCORE_ADJUST) == 0 &&
4232 settings->oom_score_adjust_set) {
4233
4234 if (!arg_settings_trusted)
5d961407 4235 log_warning("Ignoring OOMScoreAdjust= setting, file '%s' is not trusted.", path);
81f345df
LP
4236 else {
4237 arg_oom_score_adjust = settings->oom_score_adjust;
4238 arg_oom_score_adjust_set = true;
4239 }
4240 }
4241
d107bb7d 4242 if ((arg_settings_mask & SETTING_CPU_AFFINITY) == 0 &&
0985c7c4 4243 settings->cpu_set.set) {
d107bb7d
LP
4244
4245 if (!arg_settings_trusted)
5d961407 4246 log_warning("Ignoring CPUAffinity= setting, file '%s' is not trusted.", path);
d107bb7d 4247 else {
0985c7c4
ZJS
4248 cpu_set_reset(&arg_cpu_set);
4249 arg_cpu_set = settings->cpu_set;
4250 settings->cpu_set = (CPUSet) {};
d107bb7d
LP
4251 }
4252 }
4253
09d423e9
LP
4254 if ((arg_settings_mask & SETTING_RESOLV_CONF) == 0 &&
4255 settings->resolv_conf != _RESOLV_CONF_MODE_INVALID)
4256 arg_resolv_conf = settings->resolv_conf;
4257
4e1d6aa9
LP
4258 if ((arg_settings_mask & SETTING_LINK_JOURNAL) == 0 &&
4259 settings->link_journal != _LINK_JOURNAL_INVALID) {
4260
4261 if (!arg_settings_trusted)
4262 log_warning("Ignoring journal link setting, file '%s' is not trusted.", path);
4263 else {
4264 arg_link_journal = settings->link_journal;
4265 arg_link_journal_try = settings->link_journal_try;
4266 }
4267 }
4268
1688841f
LP
4269 if ((arg_settings_mask & SETTING_TIMEZONE) == 0 &&
4270 settings->timezone != _TIMEZONE_MODE_INVALID)
4271 arg_timezone = settings->timezone;
4272
de40a303
LP
4273 if ((arg_settings_mask & SETTING_SLICE) == 0 &&
4274 settings->slice) {
4275
4276 if (!arg_settings_trusted)
4277 log_warning("Ignoring slice setting, file '%s' is not trusted.", path);
4278 else
4279 free_and_replace(arg_slice, settings->slice);
4280 }
4281
4282 if ((arg_settings_mask & SETTING_USE_CGNS) == 0 &&
4283 settings->use_cgns >= 0) {
4284
4285 if (!arg_settings_trusted)
4286 log_warning("Ignoring cgroup namespace setting, file '%s' is not trusted.", path);
4287 else
4288 arg_use_cgns = settings->use_cgns;
4289 }
4290
4291 if ((arg_settings_mask & SETTING_CLONE_NS_FLAGS) == 0 &&
4292 settings->clone_ns_flags != (unsigned long) -1) {
4293
4294 if (!arg_settings_trusted)
4295 log_warning("Ignoring namespace setting, file '%s' is not trusted.", path);
4296 else
4297 arg_clone_ns_flags = settings->clone_ns_flags;
4298 }
4299
4300 if ((arg_settings_mask & SETTING_CONSOLE_MODE) == 0 &&
4301 settings->console_mode >= 0) {
4302
4303 if (!arg_settings_trusted)
4304 log_warning("Ignoring console mode setting, file '%s' is not trusted.", path);
4305 else
4306 arg_console_mode = settings->console_mode;
4307 }
4308
4309 /* The following properties can only be set through the OCI settings logic, not from the command line, hence we
4310 * don't consult arg_settings_mask for them. */
4311
4312 sd_bus_message_unref(arg_property_message);
4313 arg_property_message = TAKE_PTR(settings->properties);
4314
4315 arg_console_width = settings->console_width;
4316 arg_console_height = settings->console_height;
4317
b2645747 4318 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
de40a303
LP
4319 arg_extra_nodes = TAKE_PTR(settings->extra_nodes);
4320 arg_n_extra_nodes = settings->n_extra_nodes;
4321
f757855e
LP
4322 return 0;
4323}
4324
5d961407
LP
4325static int load_settings(void) {
4326 _cleanup_(settings_freep) Settings *settings = NULL;
4327 _cleanup_fclose_ FILE *f = NULL;
4328 _cleanup_free_ char *p = NULL;
4329 const char *fn, *i;
4330 int r;
4331
de40a303
LP
4332 if (arg_oci_bundle)
4333 return 0;
4334
5d961407
LP
4335 /* If all settings are masked, there's no point in looking for
4336 * the settings file */
4337 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
4338 return 0;
4339
4340 fn = strjoina(arg_machine, ".nspawn");
4341
4342 /* We first look in the admin's directories in /etc and /run */
4343 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
4344 _cleanup_free_ char *j = NULL;
4345
657ee2d8 4346 j = path_join(i, fn);
5d961407
LP
4347 if (!j)
4348 return log_oom();
4349
4350 f = fopen(j, "re");
4351 if (f) {
4352 p = TAKE_PTR(j);
4353
4354 /* By default, we trust configuration from /etc and /run */
4355 if (arg_settings_trusted < 0)
4356 arg_settings_trusted = true;
4357
4358 break;
4359 }
4360
4361 if (errno != ENOENT)
4362 return log_error_errno(errno, "Failed to open %s: %m", j);
4363 }
4364
4365 if (!f) {
4366 /* After that, let's look for a file next to the
4367 * actual image we shall boot. */
4368
4369 if (arg_image) {
4370 p = file_in_same_dir(arg_image, fn);
4371 if (!p)
4372 return log_oom();
cd6e3914 4373 } else if (arg_directory && !path_equal(arg_directory, "/")) {
5d961407
LP
4374 p = file_in_same_dir(arg_directory, fn);
4375 if (!p)
4376 return log_oom();
4377 }
4378
4379 if (p) {
4380 f = fopen(p, "re");
4381 if (!f && errno != ENOENT)
4382 return log_error_errno(errno, "Failed to open %s: %m", p);
4383
4384 /* By default, we do not trust configuration from /var/lib/machines */
4385 if (arg_settings_trusted < 0)
4386 arg_settings_trusted = false;
4387 }
4388 }
4389
4390 if (!f)
4391 return 0;
4392
4393 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
4394
4395 r = settings_load(f, p, &settings);
4396 if (r < 0)
4397 return r;
4398
4399 return merge_settings(settings, p);
4400}
4401
de40a303
LP
4402static int load_oci_bundle(void) {
4403 _cleanup_(settings_freep) Settings *settings = NULL;
4404 int r;
4405
4406 if (!arg_oci_bundle)
4407 return 0;
4408
4409 /* By default let's trust OCI bundles */
4410 if (arg_settings_trusted < 0)
4411 arg_settings_trusted = true;
4412
4413 r = oci_load(NULL, arg_oci_bundle, &settings);
4414 if (r < 0)
4415 return r;
4416
4417 return merge_settings(settings, arg_oci_bundle);
4418}
4419
3acc84eb 4420static int run_container(
2d845785 4421 DissectedImage *dissected_image,
b0067625
ZJS
4422 bool secondary,
4423 FDSet *fds,
4424 char veth_name[IFNAMSIZ], bool *veth_created,
4425 union in_addr_union *exposed,
3acc84eb 4426 int *master, pid_t *pid, int *ret) {
b0067625
ZJS
4427
4428 static const struct sigaction sa = {
4429 .sa_handler = nop_signal_handler,
e28c7cd0 4430 .sa_flags = SA_NOCLDSTOP|SA_RESTART,
b0067625
ZJS
4431 };
4432
8e766630 4433 _cleanup_(release_lock_file) LockFile uid_shift_lock = LOCK_FILE_INIT;
b0067625
ZJS
4434 _cleanup_close_ int etc_passwd_lock = -1;
4435 _cleanup_close_pair_ int
4436 kmsg_socket_pair[2] = { -1, -1 },
4437 rtnl_socket_pair[2] = { -1, -1 },
4438 pid_socket_pair[2] = { -1, -1 },
4439 uuid_socket_pair[2] = { -1, -1 },
4440 notify_socket_pair[2] = { -1, -1 },
8199d554 4441 uid_shift_socket_pair[2] = { -1, -1 },
3acc84eb 4442 master_pty_socket_pair[2] = { -1, -1 },
8199d554
LP
4443 unified_cgroup_hierarchy_socket_pair[2] = { -1, -1};
4444
3acc84eb 4445 _cleanup_close_ int notify_socket = -1;
b0067625 4446 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
5773024d 4447 _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL;
b0067625
ZJS
4448 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
4449 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4450 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
abdb9b08 4451 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
b0067625 4452 ContainerStatus container_status = 0;
b0067625
ZJS
4453 int ifi = 0, r;
4454 ssize_t l;
4455 sigset_t mask_chld;
5b4855ab 4456 _cleanup_close_ int child_netns_fd = -1;
b0067625
ZJS
4457
4458 assert_se(sigemptyset(&mask_chld) == 0);
4459 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
4460
4461 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4462 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
4463 * check with getpwuid() if the specific user already exists. Note that /etc might be
4464 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
4465 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
4466 * really just an extra safety net. We kinda assume that the UID range we allocate from is
4467 * really ours. */
4468
4469 etc_passwd_lock = take_etc_passwd_lock(NULL);
4470 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS)
4471 return log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m");
4472 }
4473
4474 r = barrier_create(&barrier);
4475 if (r < 0)
4476 return log_error_errno(r, "Cannot initialize IPC barrier: %m");
4477
4478 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0)
4479 return log_error_errno(errno, "Failed to create kmsg socket pair: %m");
4480
4481 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0)
4482 return log_error_errno(errno, "Failed to create rtnl socket pair: %m");
4483
4484 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0)
4485 return log_error_errno(errno, "Failed to create pid socket pair: %m");
4486
4487 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0)
4488 return log_error_errno(errno, "Failed to create id socket pair: %m");
4489
4490 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0)
4491 return log_error_errno(errno, "Failed to create notify socket pair: %m");
4492
3acc84eb
FB
4493 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, master_pty_socket_pair) < 0)
4494 return log_error_errno(errno, "Failed to create console socket pair: %m");
4495
b0067625
ZJS
4496 if (arg_userns_mode != USER_NAMESPACE_NO)
4497 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0)
4498 return log_error_errno(errno, "Failed to create uid shift socket pair: %m");
4499
8199d554
LP
4500 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN)
4501 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, unified_cgroup_hierarchy_socket_pair) < 0)
4502 return log_error_errno(errno, "Failed to create unified cgroup socket pair: %m");
4503
b0067625
ZJS
4504 /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
4505 * parent's blocking calls and give it a chance to call wait() and terminate. */
4506 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
4507 if (r < 0)
4508 return log_error_errno(errno, "Failed to change the signal mask: %m");
4509
4510 r = sigaction(SIGCHLD, &sa, NULL);
4511 if (r < 0)
4512 return log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
4513
d7bea6b6 4514 if (arg_network_namespace_path) {
5b4855ab
DDM
4515 child_netns_fd = open(arg_network_namespace_path, O_RDONLY|O_NOCTTY|O_CLOEXEC);
4516 if (child_netns_fd < 0)
d7bea6b6
DP
4517 return log_error_errno(errno, "Cannot open file %s: %m", arg_network_namespace_path);
4518
5b4855ab 4519 r = fd_is_network_ns(child_netns_fd);
6619ad88
LP
4520 if (r == -EUCLEAN)
4521 log_debug_errno(r, "Cannot determine if passed network namespace path '%s' really refers to a network namespace, assuming it does.", arg_network_namespace_path);
4522 else if (r < 0)
d7bea6b6 4523 return log_error_errno(r, "Failed to check %s fs type: %m", arg_network_namespace_path);
c6147113
LP
4524 else if (r == 0)
4525 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
4526 "Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path);
d7bea6b6
DP
4527 }
4528
b0067625
ZJS
4529 *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
4530 if (*pid < 0)
4531 return log_error_errno(errno, "clone() failed%s: %m",
4532 errno == EINVAL ?
4533 ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
4534
4535 if (*pid == 0) {
4536 /* The outer child only has a file system namespace. */
4537 barrier_set_role(&barrier, BARRIER_CHILD);
4538
b0067625
ZJS
4539 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
4540 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4541 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
4542 uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
4543 notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
3acc84eb 4544 master_pty_socket_pair[0] = safe_close(master_pty_socket_pair[0]);
b0067625 4545 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
8199d554 4546 unified_cgroup_hierarchy_socket_pair[0] = safe_close(unified_cgroup_hierarchy_socket_pair[0]);
b0067625
ZJS
4547
4548 (void) reset_all_signal_handlers();
4549 (void) reset_signal_mask();
4550
4551 r = outer_child(&barrier,
4552 arg_directory,
2d845785 4553 dissected_image,
b0067625
ZJS
4554 secondary,
4555 pid_socket_pair[1],
4556 uuid_socket_pair[1],
4557 notify_socket_pair[1],
4558 kmsg_socket_pair[1],
4559 rtnl_socket_pair[1],
4560 uid_shift_socket_pair[1],
3acc84eb 4561 master_pty_socket_pair[1],
8199d554 4562 unified_cgroup_hierarchy_socket_pair[1],
d7bea6b6 4563 fds,
5b4855ab 4564 child_netns_fd);
b0067625
ZJS
4565 if (r < 0)
4566 _exit(EXIT_FAILURE);
4567
4568 _exit(EXIT_SUCCESS);
4569 }
4570
4571 barrier_set_role(&barrier, BARRIER_PARENT);
4572
e4077ff6 4573 fdset_close(fds);
b0067625
ZJS
4574
4575 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4576 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4577 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
4578 uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
4579 notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
3acc84eb 4580 master_pty_socket_pair[1] = safe_close(master_pty_socket_pair[1]);
b0067625 4581 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
8199d554 4582 unified_cgroup_hierarchy_socket_pair[1] = safe_close(unified_cgroup_hierarchy_socket_pair[1]);
b0067625
ZJS
4583
4584 if (arg_userns_mode != USER_NAMESPACE_NO) {
4585 /* The child just let us know the UID shift it might have read from the image. */
4586 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
4587 if (l < 0)
4588 return log_error_errno(errno, "Failed to read UID shift: %m");
c6147113
LP
4589 if (l != sizeof arg_uid_shift)
4590 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading UID shift.");
b0067625
ZJS
4591
4592 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4593 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
4594 * image, but if that's already in use, pick a new one, and report back to the child,
4595 * which one we now picked. */
4596
4597 r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
4598 if (r < 0)
4599 return log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
4600
4601 l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
4602 if (l < 0)
4603 return log_error_errno(errno, "Failed to send UID shift: %m");
c6147113
LP
4604 if (l != sizeof arg_uid_shift)
4605 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short write while writing UID shift.");
b0067625
ZJS
4606 }
4607 }
4608
8199d554
LP
4609 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
4610 /* The child let us know the support cgroup mode it might have read from the image. */
4611 l = recv(unified_cgroup_hierarchy_socket_pair[0], &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), 0);
4612 if (l < 0)
4613 return log_error_errno(errno, "Failed to read cgroup mode: %m");
c6147113
LP
4614 if (l != sizeof(arg_unified_cgroup_hierarchy))
4615 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading cgroup mode (%zu bytes).%s",
4616 l, l == 0 ? " The child is most likely dead." : "");
8199d554
LP
4617 }
4618
b0067625 4619 /* Wait for the outer child. */
d2e0ac3d
LP
4620 r = wait_for_terminate_and_check("(sd-namespace)", *pid, WAIT_LOG_ABNORMAL);
4621 if (r < 0)
4622 return r;
4623 if (r != EXIT_SUCCESS)
4624 return -EIO;
b0067625
ZJS
4625
4626 /* And now retrieve the PID of the inner child. */
4627 l = recv(pid_socket_pair[0], pid, sizeof *pid, 0);
4628 if (l < 0)
4629 return log_error_errno(errno, "Failed to read inner child PID: %m");
c6147113
LP
4630 if (l != sizeof *pid)
4631 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading inner child PID.");
b0067625
ZJS
4632
4633 /* We also retrieve container UUID in case it was generated by outer child */
4634 l = recv(uuid_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
4635 if (l < 0)
4636 return log_error_errno(errno, "Failed to read container machine ID: %m");
c6147113
LP
4637 if (l != sizeof(arg_uuid))
4638 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading container machined ID.");
b0067625
ZJS
4639
4640 /* We also retrieve the socket used for notifications generated by outer child */
4641 notify_socket = receive_one_fd(notify_socket_pair[0], 0);
4642 if (notify_socket < 0)
4643 return log_error_errno(notify_socket,
4644 "Failed to receive notification socket from the outer child: %m");
4645
4646 log_debug("Init process invoked as PID "PID_FMT, *pid);
4647
4648 if (arg_userns_mode != USER_NAMESPACE_NO) {
c6147113
LP
4649 if (!barrier_place_and_sync(&barrier)) /* #1 */
4650 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
b0067625
ZJS
4651
4652 r = setup_uid_map(*pid);
4653 if (r < 0)
4654 return r;
4655
4656 (void) barrier_place(&barrier); /* #2 */
4657 }
4658
4659 if (arg_private_network) {
75116558
PS
4660 if (!arg_network_namespace_path) {
4661 /* Wait until the child has unshared its network namespace. */
c6147113
LP
4662 if (!barrier_place_and_sync(&barrier)) /* #3 */
4663 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early");
75116558
PS
4664 }
4665
5b4855ab
DDM
4666 if (child_netns_fd < 0) {
4667 /* Make sure we have an open file descriptor to the child's network
4668 * namespace so it stays alive even if the child exits. */
4669 r = namespace_open(*pid, NULL, NULL, &child_netns_fd, NULL, NULL);
4670 if (r < 0)
4671 return log_error_errno(r, "Failed to open child network namespace: %m");
4672 }
4673
4674 r = move_network_interfaces(child_netns_fd, arg_network_interfaces);
b0067625
ZJS
4675 if (r < 0)
4676 return r;
4677
4678 if (arg_network_veth) {
4679 r = setup_veth(arg_machine, *pid, veth_name,
4680 arg_network_bridge || arg_network_zone);
4681 if (r < 0)
4682 return r;
4683 else if (r > 0)
4684 ifi = r;
4685
4686 if (arg_network_bridge) {
4687 /* Add the interface to a bridge */
4688 r = setup_bridge(veth_name, arg_network_bridge, false);
4689 if (r < 0)
4690 return r;
4691 if (r > 0)
4692 ifi = r;
4693 } else if (arg_network_zone) {
4694 /* Add the interface to a bridge, possibly creating it */
4695 r = setup_bridge(veth_name, arg_network_zone, true);
4696 if (r < 0)
4697 return r;
4698 if (r > 0)
4699 ifi = r;
4700 }
4701 }
4702
4703 r = setup_veth_extra(arg_machine, *pid, arg_network_veth_extra);
4704 if (r < 0)
4705 return r;
4706
4707 /* We created the primary and extra veth links now; let's remember this, so that we know to
4708 remove them later on. Note that we don't bother with removing veth links that were created
4709 here when their setup failed half-way, because in that case the kernel should be able to
4710 remove them on its own, since they cannot be referenced by anything yet. */
4711 *veth_created = true;
4712
4713 r = setup_macvlan(arg_machine, *pid, arg_network_macvlan);
4714 if (r < 0)
4715 return r;
4716
4717 r = setup_ipvlan(arg_machine, *pid, arg_network_ipvlan);
4718 if (r < 0)
4719 return r;
4720 }
4721
abdb9b08
LP
4722 if (arg_register || !arg_keep_unit) {
4723 r = sd_bus_default_system(&bus);
4724 if (r < 0)
4725 return log_error_errno(r, "Failed to open system bus: %m");
e5a2d8b5
LP
4726
4727 r = sd_bus_set_close_on_exit(bus, false);
4728 if (r < 0)
4729 return log_error_errno(r, "Failed to disable close-on-exit behaviour: %m");
abdb9b08
LP
4730 }
4731
4732 if (!arg_keep_unit) {
4733 /* When a new scope is created for this container, then we'll be registered as its controller, in which
4734 * case PID 1 will send us a friendly RequestStop signal, when it is asked to terminate the
4735 * scope. Let's hook into that, and cleanly shut down the container, and print a friendly message. */
4736
75152a4d
LP
4737 r = sd_bus_match_signal_async(
4738 bus,
4739 NULL,
4740 "org.freedesktop.systemd1",
4741 NULL,
4742 "org.freedesktop.systemd1.Scope",
4743 "RequestStop",
4744 on_request_stop, NULL, PID_TO_PTR(*pid));
abdb9b08 4745 if (r < 0)
75152a4d 4746 return log_error_errno(r, "Failed to request RequestStop match: %m");
abdb9b08
LP
4747 }
4748
b0067625
ZJS
4749 if (arg_register) {
4750 r = register_machine(
abdb9b08 4751 bus,
b0067625
ZJS
4752 arg_machine,
4753 *pid,
4754 arg_directory,
4755 arg_uuid,
4756 ifi,
4757 arg_slice,
4758 arg_custom_mounts, arg_n_custom_mounts,
4759 arg_kill_signal,
4760 arg_property,
de40a303 4761 arg_property_message,
b0067625
ZJS
4762 arg_keep_unit,
4763 arg_container_service_name);
4764 if (r < 0)
4765 return r;
abdb9b08 4766
cd2dfc6f
LP
4767 } else if (!arg_keep_unit) {
4768 r = allocate_scope(
abdb9b08 4769 bus,
cd2dfc6f
LP
4770 arg_machine,
4771 *pid,
4772 arg_slice,
4773 arg_custom_mounts, arg_n_custom_mounts,
4774 arg_kill_signal,
de40a303
LP
4775 arg_property,
4776 arg_property_message);
cd2dfc6f
LP
4777 if (r < 0)
4778 return r;
4779
4780 } else if (arg_slice || arg_property)
4781 log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect.");
b0067625 4782
27da7ef0 4783 r = create_subcgroup(*pid, arg_keep_unit, arg_unified_cgroup_hierarchy);
b0067625
ZJS
4784 if (r < 0)
4785 return r;
4786
27da7ef0 4787 r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
720f0a2f
LP
4788 if (r < 0)
4789 return r;
b0067625 4790
de54e02d 4791 r = chown_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
b0067625
ZJS
4792 if (r < 0)
4793 return r;
4794
4795 /* Notify the child that the parent is ready with all
4796 * its setup (including cgroup-ification), and that
4797 * the child can now hand over control to the code to
4798 * run inside the container. */
75116558 4799 (void) barrier_place(&barrier); /* #4 */
b0067625
ZJS
4800
4801 /* Block SIGCHLD here, before notifying child.
4802 * process_pty() will handle it with the other signals. */
4803 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
4804
4805 /* Reset signal to default */
4806 r = default_signals(SIGCHLD, -1);
4807 if (r < 0)
4808 return log_error_errno(r, "Failed to reset SIGCHLD: %m");
4809
4810 r = sd_event_new(&event);
4811 if (r < 0)
4812 return log_error_errno(r, "Failed to get default event source: %m");
4813
8fd010bb
LP
4814 (void) sd_event_set_watchdog(event, true);
4815
abdb9b08
LP
4816 if (bus) {
4817 r = sd_bus_attach_event(bus, event, 0);
4818 if (r < 0)
4819 return log_error_errno(r, "Failed to attach bus to event loop: %m");
4820 }
4821
e96ceaba 4822 r = setup_notify_parent(event, notify_socket, PID_TO_PTR(*pid), &notify_event_source);
b0067625
ZJS
4823 if (r < 0)
4824 return r;
4825
4826 /* Let the child know that we are ready and wait that the child is completely ready now. */
c6147113
LP
4827 if (!barrier_place_and_sync(&barrier)) /* #5 */
4828 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
b0067625 4829
38ccb557 4830 /* At this point we have made use of the UID we picked, and thus nss-systemd/systemd-machined.service
b0067625
ZJS
4831 * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
4832 etc_passwd_lock = safe_close(etc_passwd_lock);
4833
04f590a4
LP
4834 (void) sd_notifyf(false,
4835 "STATUS=Container running.\n"
4836 "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
b0067625 4837 if (!arg_notify_ready)
919f5ae0 4838 (void) sd_notify(false, "READY=1\n");
b0067625
ZJS
4839
4840 if (arg_kill_signal > 0) {
4841 /* Try to kill the init system on SIGINT or SIGTERM */
919f5ae0
LP
4842 (void) sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
4843 (void) sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
b0067625
ZJS
4844 } else {
4845 /* Immediately exit */
919f5ae0
LP
4846 (void) sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4847 (void) sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
b0067625
ZJS
4848 }
4849
6916b164 4850 /* Exit when the child exits */
919f5ae0 4851 (void) sd_event_add_signal(event, NULL, SIGCHLD, on_sigchld, PID_TO_PTR(*pid));
b0067625
ZJS
4852
4853 if (arg_expose_ports) {
4854 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, exposed, &rtnl);
4855 if (r < 0)
4856 return r;
4857
4858 (void) expose_port_execute(rtnl, arg_expose_ports, exposed);
4859 }
4860
4861 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4862
3acc84eb
FB
4863 if (arg_console_mode != CONSOLE_PIPE) {
4864 _cleanup_close_ int fd = -1;
4865 PTYForwardFlags flags = 0;
de40a303 4866
3acc84eb
FB
4867 /* Retrieve the master pty allocated by inner child */
4868 fd = receive_one_fd(master_pty_socket_pair[0], 0);
4869 if (fd < 0)
4870 return log_error_errno(fd, "Failed to receive master pty from the inner child: %m");
4871
4872 switch (arg_console_mode) {
de40a303 4873
3acc84eb
FB
4874 case CONSOLE_READ_ONLY:
4875 flags |= PTY_FORWARD_READ_ONLY;
4876
4877 _fallthrough_;
4878
4879 case CONSOLE_INTERACTIVE:
4880 flags |= PTY_FORWARD_IGNORE_VHANGUP;
4881
4882 r = pty_forward_new(event, fd, flags, &forward);
4883 if (r < 0)
4884 return log_error_errno(r, "Failed to create PTY forwarder: %m");
4885
4886 if (arg_console_width != (unsigned) -1 || arg_console_height != (unsigned) -1)
4887 (void) pty_forward_set_width_height(forward,
4888 arg_console_width,
4889 arg_console_height);
4890 break;
4891
4892 default:
4893 assert(arg_console_mode == CONSOLE_PASSIVE);
4894 }
4895
4896 *master = TAKE_FD(fd);
de40a303 4897 }
b0067625
ZJS
4898
4899 r = sd_event_loop(event);
4900 if (r < 0)
4901 return log_error_errno(r, "Failed to run event loop: %m");
4902
de40a303
LP
4903 if (forward) {
4904 char last_char = 0;
b0067625 4905
de40a303
LP
4906 (void) pty_forward_get_last_char(forward, &last_char);
4907 forward = pty_forward_free(forward);
b0067625 4908
de40a303
LP
4909 if (!arg_quiet && last_char != '\n')
4910 putc('\n', stdout);
4911 }
b0067625
ZJS
4912
4913 /* Kill if it is not dead yet anyway */
0bb0a9fa
ZJS
4914 if (!arg_register && !arg_keep_unit && bus)
4915 terminate_scope(bus, arg_machine);
b0067625
ZJS
4916
4917 /* Normally redundant, but better safe than sorry */
c67b0082 4918 (void) kill(*pid, SIGKILL);
b0067625 4919
5b4855ab
DDM
4920 if (arg_private_network) {
4921 /* Move network interfaces back to the parent network namespace. We use `safe_fork`
4922 * to avoid having to move the parent to the child network namespace. */
4923 r = safe_fork(NULL, FORK_RESET_SIGNALS|FORK_DEATHSIG|FORK_WAIT|FORK_LOG, NULL);
4924 if (r < 0)
4925 return r;
4926
4927 if (r == 0) {
4928 _cleanup_close_ int parent_netns_fd = -1;
4929
4930 r = namespace_open(getpid(), NULL, NULL, &parent_netns_fd, NULL, NULL);
4931 if (r < 0) {
4932 log_error_errno(r, "Failed to open parent network namespace: %m");
4933 _exit(EXIT_FAILURE);
4934 }
4935
4936 r = namespace_enter(-1, -1, child_netns_fd, -1, -1);
4937 if (r < 0) {
4938 log_error_errno(r, "Failed to enter child network namespace: %m");
4939 _exit(EXIT_FAILURE);
4940 }
4941
4942 r = move_network_interfaces(parent_netns_fd, arg_network_interfaces);
4943 if (r < 0)
4944 log_error_errno(r, "Failed to move network interfaces back to parent network namespace: %m");
4945
4946 _exit(r < 0 ? EXIT_FAILURE : EXIT_SUCCESS);
4947 }
4948 }
4949
b0067625
ZJS
4950 r = wait_for_container(*pid, &container_status);
4951 *pid = 0;
4952
0bb0a9fa
ZJS
4953 /* Tell machined that we are gone. */
4954 if (bus)
4955 (void) unregister_machine(bus, arg_machine);
4956
b0067625
ZJS
4957 if (r < 0)
4958 /* We failed to wait for the container, or the container exited abnormally. */
4959 return r;
4960 if (r > 0 || container_status == CONTAINER_TERMINATED) {
27e29a1e
ZJS
4961 /* r > 0 → The container exited with a non-zero status.
4962 * As a special case, we need to replace 133 with a different value,
4963 * because 133 is special-cased in the service file to reboot the container.
4964 * otherwise → The container exited with zero status and a reboot was not requested.
4965 */
2a49b612 4966 if (r == EXIT_FORCE_RESTART)
27e29a1e 4967 r = EXIT_FAILURE; /* replace 133 with the general failure code */
b0067625 4968 *ret = r;
b0067625
ZJS
4969 return 0; /* finito */
4970 }
4971
4972 /* CONTAINER_REBOOTED, loop again */
4973
4974 if (arg_keep_unit) {
4975 /* Special handling if we are running as a service: instead of simply
4976 * restarting the machine we want to restart the entire service, so let's
4977 * inform systemd about this with the special exit code 133. The service
4978 * file uses RestartForceExitStatus=133 so that this results in a full
4979 * nspawn restart. This is necessary since we might have cgroup parameters
4980 * set we want to have flushed out. */
2a49b612
ZJS
4981 *ret = EXIT_FORCE_RESTART;
4982 return 0; /* finito */
b0067625
ZJS
4983 }
4984
4985 expose_port_flush(arg_expose_ports, exposed);
4986
4987 (void) remove_veth_links(veth_name, arg_network_veth_extra);
4988 *veth_created = false;
4989 return 1; /* loop again */
4990}
4991
bf428efb 4992static int initialize_rlimits(void) {
bf428efb
LP
4993 /* The default resource limits the kernel passes to PID 1, as per kernel 4.16. Let's pass our container payload
4994 * the same values as the kernel originally passed to PID 1, in order to minimize differences between host and
4995 * container execution environments. */
4996
4997 static const struct rlimit kernel_defaults[_RLIMIT_MAX] = {
4998 [RLIMIT_AS] = { RLIM_INFINITY, RLIM_INFINITY },
4999 [RLIMIT_CORE] = { 0, RLIM_INFINITY },
5000 [RLIMIT_CPU] = { RLIM_INFINITY, RLIM_INFINITY },
5001 [RLIMIT_DATA] = { RLIM_INFINITY, RLIM_INFINITY },
5002 [RLIMIT_FSIZE] = { RLIM_INFINITY, RLIM_INFINITY },
5003 [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY },
5004 [RLIMIT_MEMLOCK] = { 65536, 65536 },
5005 [RLIMIT_MSGQUEUE] = { 819200, 819200 },
5006 [RLIMIT_NICE] = { 0, 0 },
5007 [RLIMIT_NOFILE] = { 1024, 4096 },
5008 [RLIMIT_RSS] = { RLIM_INFINITY, RLIM_INFINITY },
5009 [RLIMIT_RTPRIO] = { 0, 0 },
5010 [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY },
5011 [RLIMIT_STACK] = { 8388608, RLIM_INFINITY },
5012
5013 /* The kernel scales the default for RLIMIT_NPROC and RLIMIT_SIGPENDING based on the system's amount of
5014 * RAM. To provide best compatibility we'll read these limits off PID 1 instead of hardcoding them
5015 * here. This is safe as we know that PID 1 doesn't change these two limits and thus the original
5016 * kernel's initialization should still be valid during runtime — at least if PID 1 is systemd. Note
5017 * that PID 1 changes a number of other resource limits during early initialization which is why we
5018 * don't read the other limits from PID 1 but prefer the static table above. */
5019 };
5020
5021 int rl;
5022
5023 for (rl = 0; rl < _RLIMIT_MAX; rl++) {
bf428efb
LP
5024 /* Let's only fill in what the user hasn't explicitly configured anyway */
5025 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)) == 0) {
5026 const struct rlimit *v;
5027 struct rlimit buffer;
5028
5029 if (IN_SET(rl, RLIMIT_NPROC, RLIMIT_SIGPENDING)) {
5030 /* For these two let's read the limits off PID 1. See above for an explanation. */
5031
5032 if (prlimit(1, rl, NULL, &buffer) < 0)
5033 return log_error_errno(errno, "Failed to read resource limit RLIMIT_%s of PID 1: %m", rlimit_to_string(rl));
5034
5035 v = &buffer;
5036 } else
5037 v = kernel_defaults + rl;
5038
5039 arg_rlimit[rl] = newdup(struct rlimit, v, 1);
5040 if (!arg_rlimit[rl])
5041 return log_oom();
5042 }
5043
5044 if (DEBUG_LOGGING) {
5045 _cleanup_free_ char *k = NULL;
5046
5047 (void) rlimit_format(arg_rlimit[rl], &k);
5048 log_debug("Setting RLIMIT_%s to %s.", rlimit_to_string(rl), k);
5049 }
5050 }
5051
5052 return 0;
5053}
5054
287b7376
LP
5055static int cant_be_in_netns(void) {
5056 union sockaddr_union sa = {
5057 .un = {
5058 .sun_family = AF_UNIX,
5059 .sun_path = "/run/udev/control",
5060 },
5061 };
5062 char udev_path[STRLEN("/proc//ns/net") + DECIMAL_STR_MAX(pid_t)];
5063 _cleanup_free_ char *udev_ns = NULL, *our_ns = NULL;
5064 _cleanup_close_ int fd = -1;
5065 struct ucred ucred;
5066 int r;
5067
5068 /* Check if we are in the same netns as udev. If we aren't, then device monitoring (and thus waiting
5069 * for loopback block devices) won't work, and we will hang. Detect this case and exit early with a
5070 * nice message. */
5071
5072 if (!arg_image) /* only matters if --image= us used, i.e. we actually need to use loopback devices */
5073 return 0;
5074
5075 fd = socket(AF_UNIX, SOCK_SEQPACKET|SOCK_NONBLOCK|SOCK_CLOEXEC, 0);
5076 if (fd < 0)
5077 return log_error_errno(errno, "Failed to allocate udev control socket: %m");
5078
5079 if (connect(fd, &sa.un, SOCKADDR_UN_LEN(sa.un)) < 0) {
5080
5081 if (errno == ENOENT || ERRNO_IS_DISCONNECT(errno))
5082 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
5083 "Sorry, but --image= requires access to the host's /run/ hierarchy, since we need access to udev.");
5084
5085 return log_error_errno(errno, "Failed to connect socket to udev control socket: %m");
5086 }
5087
5088 r = getpeercred(fd, &ucred);
5089 if (r < 0)
5090 return log_error_errno(r, "Failed to determine peer of udev control socket: %m");
5091
5092 xsprintf(udev_path, "/proc/" PID_FMT "/ns/net", ucred.pid);
5093 r = readlink_malloc(udev_path, &udev_ns);
5094 if (r < 0)
5095 return log_error_errno(r, "Failed to read network namespace of udev: %m");
5096
5097 r = readlink_malloc("/proc/self/ns/net", &our_ns);
5098 if (r < 0)
5099 return log_error_errno(r, "Failed to read our own network namespace: %m");
5100
5101 if (!streq(our_ns, udev_ns))
5102 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
5103 "Sorry, but --image= is only supported in the main network namespace, since we need access to udev/AF_NETLINK.");
5104 return 0;
5105}
5106
/* Main program logic of systemd-nspawn.
 *
 * Sequence, as visible in this function:
 *   1. Parse the command line, then call must_be_root(), cant_be_in_netns(), initialize_rlimits(),
 *      load_oci_bundle(), determine_names(), load_settings(), cg_unified() and verify_arguments().
 *   2. Prepare the container root. Either arg_directory is used (optionally replaced by an ephemeral
 *      snapshot, or populated from arg_template), or arg_image is attached to a loopback device, dissected
 *      and passed to dissected_image_decrypt_interactively(), with a freshly created temporary directory
 *      becoming arg_directory.
 *   3. Call run_container() in a loop until it returns <= 0.
 *   4. At "finish": notify the service manager, kill and reap the payload, flush the pty and remove the
 *      temporary artifacts recorded in the remove_* / veth_created flags.
 *
 * Returns r if it is negative when "finish" completes, otherwise ret (initialized to EXIT_SUCCESS and handed
 * to run_container() by pointer). */
44dbef90 5107static int run(int argc, char *argv[]) {
7bf011e3
LP
/* The remove_* flags and veth_created record what was created here and therefore has to be torn down at
 * "finish". NOTE(review): secondary is never assigned in this function, so run_container() always receives
 * false — confirm this is intended. */
5108 bool secondary = false, remove_directory = false, remove_image = false,
5109 veth_created = false, remove_tmprootdir = false;
2d845785 5110 _cleanup_close_ int master = -1;
03cfe0d5 5111 _cleanup_fdset_free_ FDSet *fds = NULL;
2d845785 5112 int r, n_fd_passed, ret = EXIT_SUCCESS;
5aa3eba5 5113 char veth_name[IFNAMSIZ] = "";
03cfe0d5 5114 union in_addr_union exposed = {};
8e766630 5115 _cleanup_(release_lock_file) LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
/* Template handed to mkdtemp() in the arg_image branch below; rmdir()ed at "finish" if remove_tmprootdir is set. */
c67b0082 5116 char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
2d845785 5117 _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
18b5886e
LP
5118 _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
5119 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
/* pid stays 0 until run_container() stores something in it; "finish" only kills/waits when pid > 0. */
7bf011e3 5120 pid_t pid = 0;
03cfe0d5
LP
5121
5122 log_parse_environment();
5123 log_open();
415fc41c 5124
03cfe0d5
LP
/* Both r < 0 and r == 0 skip straight to cleanup. With r == 0 the function ends up returning ret, which is
 * still EXIT_SUCCESS at this point. NOTE(review): r == 0 presumably means "nothing more to do" (e.g. help
 * output) — confirm against parse_argv(). */
5125 r = parse_argv(argc, argv);
5126 if (r <= 0)
5127 goto finish;
5128
fba868fa
LP
/* The helpers below are not followed by a log call here. NOTE(review): presumably each logs its own error
 * before returning a negative value — confirm. */
5129 r = must_be_root();
5130 if (r < 0)
03cfe0d5 5131 goto finish;
fba868fa 5132
287b7376
LP
5133 r = cant_be_in_netns();
5134 if (r < 0)
5135 goto finish;
5136
bf428efb
LP
5137 r = initialize_rlimits();
5138 if (r < 0)
5139 goto finish;
5140
de40a303
LP
5141 r = load_oci_bundle();
5142 if (r < 0)
5143 goto finish;
5144
f757855e
LP
5145 r = determine_names();
5146 if (r < 0)
5147 goto finish;
5148
5149 r = load_settings();
5150 if (r < 0)
5151 goto finish;
5152
/* Only the error case of cg_unified() is acted upon; a non-negative result is not used further here. */
d4d99bc6 5153 r = cg_unified();
5eee8290
LP
5154 if (r < 0) {
5155 log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
5156 goto finish;
5157 }
5158
f757855e
LP
5159 r = verify_arguments();
5160 if (r < 0)
5161 goto finish;
03cfe0d5 5162
49048684
ZJS
5163 /* Reapply environment settings. */
5164 (void) detect_unified_cgroup_hierarchy_from_environment();
8199d554 5165
2949ff26
LP
5166 /* Ignore SIGPIPE here, because we use splice() on the ptyfwd stuff and that will generate SIGPIPE if
5167 * the result is closed. Note that the container payload child will reset signal mask+handler anyway,
5168 * so just turning this off here means we only turn it off in nspawn itself, not any children. */
5169 (void) ignore_signals(SIGPIPE, -1);
5170
03cfe0d5
LP
/* Collect passed file descriptors into fds only if sd_listen_fds() reports at least one. A negative return
 * of sd_listen_fds() is not treated as an error here; fds then simply stays NULL. */
5171 n_fd_passed = sd_listen_fds(false);
5172 if (n_fd_passed > 0) {
5173 r = fdset_new_listen_fds(&fds, false);
5174 if (r < 0) {
5175 log_error_errno(r, "Failed to collect file descriptors: %m");
5176 goto finish;
5177 }
5178 }
5179
83e803a9
ZJS
5180 /* The "default" umask. This is appropriate for most file and directory
5181 * operations performed by nspawn, and is the umask that will be used for
5182 * the child. Functions like copy_devnodes() change the umask temporarily. */
5183 umask(0022);
5184
03cfe0d5
LP
/* Root preparation, variant 1: a directory tree. arg_directory and arg_image are mutually exclusive here
 * (asserted right below). */
5185 if (arg_directory) {
5186 assert(!arg_image);
5187
b35ca61a
LP
5188 /* Safety precaution: let's not allow running images from the live host OS image, as long as
5189 * /var from the host will propagate into container dynamically (because bad things happen if
5190 * two systems write to the same /var). Let's allow it for the special cases where /var is
5191 * either copied (i.e. --ephemeral) or replaced (i.e. --volatile=yes|state). */
5192 if (path_equal(arg_directory, "/") && !(arg_ephemeral || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_STATE))) {
5193 log_error("Spawning container on root directory is not supported. Consider using --ephemeral, --volatile=yes or --volatile=state.");
03cfe0d5
LP
5194 r = -EINVAL;
5195 goto finish;
5196 }
5197
5198 if (arg_ephemeral) {
/* np holds the generated path of the snapshot; it is moved into arg_directory once the snapshot exists. */
5199 _cleanup_free_ char *np = NULL;
5200
8d4aa2bb 5201 r = chase_symlinks_and_update(&arg_directory, 0);
3f342ec4
LP
5202 if (r < 0)
5203 goto finish;
5204
7bf011e3
LP
5205 /* If the specified path is a mount point we generate the new snapshot immediately
5206 * inside it under a random name. However if the specified is not a mount point we
5207 * create the new snapshot in the parent directory, just next to it. */
e1873695 5208 r = path_is_mount_point(arg_directory, NULL, 0);
03cfe0d5
LP
5209 if (r < 0) {
5210 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
5211 goto finish;
5212 }
5213 if (r > 0)
770b5ce4 5214 r = tempfn_random_child(arg_directory, "machine.", &np);
03cfe0d5 5215 else
770b5ce4 5216 r = tempfn_random(arg_directory, "machine.", &np);
03cfe0d5 5217 if (r < 0) {
0f3be6ca 5218 log_error_errno(r, "Failed to generate name for directory snapshot: %m");
03cfe0d5
LP
5219 goto finish;
5220 }
5221
6992459c 5222 /* We take an exclusive lock on this image, since it's our private, ephemeral copy
162392b7 5223 * only owned by us and no one else. */
6992459c 5224 r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
03cfe0d5
LP
5225 if (r < 0) {
5226 log_error_errno(r, "Failed to lock %s: %m", np);
5227 goto finish;
5228 }
5229
7bf011e3
LP
/* BLOCK_SIGNALS(SIGINT) is confined to this brace scope. NOTE(review): presumably the previous signal mask
 * is restored when the scope ends — confirm against the macro definition. BTRFS_SNAPSHOT_SIGINT is passed
 * and -EINTR is handled separately right after the scope. */
5230 {
5231 BLOCK_SIGNALS(SIGINT);
5232 r = btrfs_subvol_snapshot(arg_directory, np,
5233 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
5234 BTRFS_SNAPSHOT_FALLBACK_COPY |
5235 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
5236 BTRFS_SNAPSHOT_RECURSIVE |
5237 BTRFS_SNAPSHOT_QUOTA |
5238 BTRFS_SNAPSHOT_SIGINT);
5239 }
5240 if (r == -EINTR) {
5241 log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", np);
5242 goto finish;
5243 }
03cfe0d5
LP
5244 if (r < 0) {
5245 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
5246 goto finish;
ec16945e
LP
5247 }
5248
/* From here on arg_directory refers to the snapshot, and "finish" will remove it again. */
1cc6c93a 5249 free_and_replace(arg_directory, np);
17cbb288 5250 remove_directory = true;
30535c16 5251 } else {
/* Non-ephemeral: CHASE_NONEXISTENT is passed only when a template was given. */
cb638b5e 5252 r = chase_symlinks_and_update(&arg_directory, arg_template ? CHASE_NONEXISTENT : 0);
8d4aa2bb
LP
5253 if (r < 0)
5254 goto finish;
5255
30535c16
LP
/* Shared lock for read-only use, exclusive otherwise; LOCK_NB makes a busy tree fail with -EBUSY (reported
 * with its own message below) instead of blocking. */
5256 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
5257 if (r == -EBUSY) {
5258 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
5259 goto finish;
5260 }
5261 if (r < 0) {
5262 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
476b8254 5263 goto finish;
30535c16
LP
5264 }
5265
5266 if (arg_template) {
8d4aa2bb 5267 r = chase_symlinks_and_update(&arg_template, 0);
3f342ec4
LP
5268 if (r < 0)
5269 goto finish;
5270
7bf011e3
LP
/* Same scoped SIGINT blocking as for the ephemeral snapshot above, with BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE
 * additionally set. */
5271 {
5272 BLOCK_SIGNALS(SIGINT);
5273 r = btrfs_subvol_snapshot(arg_template, arg_directory,
5274 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
5275 BTRFS_SNAPSHOT_FALLBACK_COPY |
5276 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
5277 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
5278 BTRFS_SNAPSHOT_RECURSIVE |
5279 BTRFS_SNAPSHOT_QUOTA |
5280 BTRFS_SNAPSHOT_SIGINT);
5281 }
ff6c6cc1
LP
/* -EEXIST is not an error: the existing directory is used as is and this is only logged (at debug level
 * when arg_quiet is set). */
5282 if (r == -EEXIST)
5283 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
5284 "Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
7bf011e3
LP
5285 else if (r == -EINTR) {
5286 log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", arg_directory);
5287 goto finish;
5288 } else if (r < 0) {
83521414 5289 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
30535c16 5290 goto finish;
ff6c6cc1
LP
5291 } else
5292 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
5293 "Populated %s from template %s.", arg_directory, arg_template);
30535c16 5294 }
ec16945e
LP
5295 }
5296
/* Sanity check of the tree. In START_BOOT mode path_is_os_tree() must return > 0; otherwise it is enough
 * that "<root>/usr/" is accessible. If arg_pivot_root_new is set, the check is applied to that path below
 * arg_directory instead of arg_directory itself. */
7732f92b 5297 if (arg_start_mode == START_BOOT) {
a5201ed6 5298 const char *p;
c9fe05e0 5299
a5201ed6
LP
5300 if (arg_pivot_root_new)
5301 p = prefix_roota(arg_directory, arg_pivot_root_new);
5302 else
5303 p = arg_directory;
c9fe05e0
AR
5304
5305 if (path_is_os_tree(p) <= 0) {
5306 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", p);
ec16945e 5307 r = -EINVAL;
1b9e5b12
LP
5308 goto finish;
5309 }
5310 } else {
c9fe05e0
AR
5311 const char *p, *q;
5312
a5201ed6
LP
5313 if (arg_pivot_root_new)
5314 p = prefix_roota(arg_directory, arg_pivot_root_new);
5315 else
5316 p = arg_directory;
c9fe05e0
AR
5317
5318 q = strjoina(p, "/usr/");
1b9e5b12 5319
c9fe05e0
AR
5320 if (laccess(q, F_OK) < 0) {
5321 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", p);
ec16945e 5322 r = -EINVAL;
1b9e5b12 5323 goto finish;
1b9e5b12
LP
5324 }
5325 }
ec16945e 5326
/* Root preparation, variant 2: a disk image. A template cannot be combined with it (asserted below). */
6b9132a9 5327 } else {
e7cbe5cb 5328 DissectImageFlags dissect_image_flags = DISSECT_IMAGE_REQUIRE_ROOT | DISSECT_IMAGE_RELAX_VAR_CHECK;
ec16945e
LP
5329 assert(arg_image);
5330 assert(!arg_template);
5331
8d4aa2bb 5332 r = chase_symlinks_and_update(&arg_image, 0);
3f342ec4
LP
5333 if (r < 0)
5334 goto finish;
5335
0f3be6ca
LP
5336 if (arg_ephemeral) {
/* np is the path of the private copy of the image; it replaces arg_image once the copy succeeded. */
5337 _cleanup_free_ char *np = NULL;
5338
5339 r = tempfn_random(arg_image, "machine.", &np);
5340 if (r < 0) {
5341 log_error_errno(r, "Failed to generate name for image snapshot: %m");
5342 goto finish;
5343 }
5344
6992459c
LP
5345 /* Always take an exclusive lock on our own ephemeral copy. */
5346 r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
0f3be6ca
LP
5347 if (r < 0) {
5348 r = log_error_errno(r, "Failed to create image lock: %m");
5349 goto finish;
5350 }
5351
7bf011e3
LP
/* The copy is created with O_EXCL and mode 0400 (read-only) or 0600; SIGINT is blocked for the duration of
 * this scope and COPY_SIGINT is passed, with -EINTR handled right after. */
5352 {
5353 BLOCK_SIGNALS(SIGINT);
5354 r = copy_file(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600, FS_NOCOW_FL, FS_NOCOW_FL, COPY_REFLINK|COPY_CRTIME|COPY_SIGINT);
5355 }
5356 if (r == -EINTR) {
5357 log_error_errno(r, "Interrupted while copying image file to %s, removed again.", np);
5358 goto finish;
5359 }
0f3be6ca
LP
5360 if (r < 0) {
5361 r = log_error_errno(r, "Failed to copy image file: %m");
5362 goto finish;
5363 }
5364
1cc6c93a 5365 free_and_replace(arg_image, np);
0f3be6ca
LP
5366 remove_image = true;
5367 } else {
5368 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
5369 if (r == -EBUSY) {
5370 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
5371 goto finish;
5372 }
5373 if (r < 0) {
5374 r = log_error_errno(r, "Failed to create image lock: %m");
5375 goto finish;
5376 }
4623e8e6 5377
/* Verity metadata is only loaded in this non-ephemeral branch. Values that are already set (root hash,
 * verity data, root hash signature or its path) are not overwritten: NULL is passed for those outputs. */
0389f4fa 5378 r = verity_metadata_load(arg_image, NULL, arg_root_hash ? NULL : &arg_root_hash, &arg_root_hash_size,
c2923fdc
LB
5379 arg_verity_data ? NULL : &arg_verity_data,
5380 arg_root_hash_sig_path || arg_root_hash_sig ? NULL : &arg_root_hash_sig_path);
e7cbe5cb
LB
5381 if (r < 0) {
5382 log_error_errno(r, "Failed to read verity artefacts for %s: %m", arg_image);
5383 goto finish;
78ebe980 5384 }
/* With separate verity data present, DISSECT_IMAGE_NO_PARTITION_TABLE is added to the dissection flags. */
e7cbe5cb 5385 dissect_image_flags |= arg_verity_data ? DISSECT_IMAGE_NO_PARTITION_TABLE : 0;
30535c16
LP
5386 }
5387
/* mkdtemp() fills in the XXXXXX part of tmprootdir in place; a copy of the resulting path becomes
 * arg_directory, i.e. the root directory used for the rest of the function. */
c67b0082 5388 if (!mkdtemp(tmprootdir)) {
0f3be6ca 5389 r = log_error_errno(errno, "Failed to create temporary directory: %m");
6b9132a9 5390 goto finish;
1b9e5b12 5391 }
6b9132a9 5392
c67b0082
LP
5393 remove_tmprootdir = true;
5394
5395 arg_directory = strdup(tmprootdir);
1b9e5b12
LP
5396 if (!arg_directory) {
5397 r = log_oom();
5398 goto finish;
6b9132a9 5399 }
88213476 5400
/* Attach the image file to a loop device, opened read-only or read-write according to arg_read_only, with
 * LO_FLAGS_PARTSCAN set. */
e08f94ac 5401 r = loop_device_make_by_path(arg_image, arg_read_only ? O_RDONLY : O_RDWR, LO_FLAGS_PARTSCAN, &loop);
2d845785
LP
5402 if (r < 0) {
5403 log_error_errno(r, "Failed to set up loopback block device: %m");
842f3b0f
LP
5404 goto finish;
5405 }
1b9e5b12 5406
4526113f 5407 r = dissect_image_and_warn(
e0f9e7bd 5408 loop->fd,
4526113f 5409 arg_image,
e0f9e7bd 5410 arg_root_hash, arg_root_hash_size,
e7cbe5cb 5411 arg_verity_data,
18d73705 5412 NULL,
e7cbe5cb 5413 dissect_image_flags,
e0f9e7bd 5414 &dissected_image);
2d845785 5415 if (r == -ENOPKG) {
4526113f 5416 /* dissect_image_and_warn() already printed a brief error message. Extend on that with more details */
2d845785
LP
5417 log_notice("Note that the disk image needs to\n"
5418 " a) either contain only a single MBR partition of type 0x83 that is marked bootable\n"
5419 " b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n"
19ac32cd 5420 " c) or follow https://systemd.io/DISCOVERABLE_PARTITIONS\n"
2d845785
LP
5421 " d) or contain a file system without a partition table\n"
5422 "in order to be bootable with systemd-nspawn.");
1b9e5b12 5423 goto finish;
2d845785 5424 }
4526113f 5425 if (r < 0)
842f3b0f 5426 goto finish;
1b9e5b12 5427
4623e8e6
LP
5428 if (!arg_root_hash && dissected_image->can_verity)
5429 log_notice("Note: image %s contains verity information, but no root hash specified! Proceeding without integrity checking.", arg_image);
5430
c2923fdc 5431 r = dissected_image_decrypt_interactively(dissected_image, NULL, arg_root_hash, arg_root_hash_size, arg_verity_data, arg_root_hash_sig_path, arg_root_hash_sig, arg_root_hash_sig_size, 0, &decrypted_image);
1b9e5b12
LP
5432 if (r < 0)
5433 goto finish;
0f3be6ca
LP
5434
5435 /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */
/* If the unlink() succeeds, remove_image is cleared so that "finish" does not try a second time; if it
 * fails, the flag stays set and "finish" retries (and warns on failure). */
5436 if (remove_image && unlink(arg_image) >= 0)
5437 remove_image = false;
842f3b0f 5438 }
842f3b0f 5439
86c0dd4a 5440 r = custom_mount_prepare_all(arg_directory, arg_custom_mounts, arg_n_custom_mounts);
5a8af538
LP
5441 if (r < 0)
5442 goto finish;
5443
de40a303
LP
/* No console mode chosen yet (negative value): pick interactive mode only if both stdin and stdout are
 * terminals, read-only mode otherwise. */
5444 if (arg_console_mode < 0)
5445 arg_console_mode =
5446 isatty(STDIN_FILENO) > 0 &&
5447 isatty(STDOUT_FILENO) > 0 ? CONSOLE_INTERACTIVE : CONSOLE_READ_ONLY;
9c857b9d 5448
de40a303
LP
5449 if (arg_console_mode == CONSOLE_PIPE) /* if we pass STDERR on to the container, don't add our own logs into it too */
5450 arg_quiet = true;
a258bf26 5451
9c857b9d
LP
5452 if (!arg_quiet)
5453 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
5454 arg_machine, arg_image ?: arg_directory);
5455
/* Block SIGCHLD, SIGWINCH, SIGTERM and SIGINT in this process before starting the container.
 * NOTE(review): presumably so they can be consumed synchronously inside run_container() — confirm. */
72c0a2c2 5456 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
a258bf26 5457
66edd963 5458 if (prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) < 0) {
03cfe0d5
LP
5459 r = log_error_errno(errno, "Failed to become subreaper: %m");
5460 goto finish;
5461 }
5462
/* Keep calling run_container() for as long as it returns a positive value; zero or a negative value ends
 * the loop. NOTE(review): a positive return presumably signals a requested restart of the container —
 * confirm against run_container(). */
d87be9b0 5463 for (;;) {
3acc84eb 5464 r = run_container(dissected_image,
44dbef90
LP
5465 secondary,
5466 fds,
5467 veth_name, &veth_created,
3acc84eb 5468 &exposed, &master,
44dbef90 5469 &pid, &ret);
b0067625 5470 if (r <= 0)
d87be9b0 5471 break;
d87be9b0 5472 }
88213476
LP
5473
/* Common exit path, reached both on success and from every "goto finish" above. Everything below has to cope
 * with partially initialized state, hence the checks on pid, master and the remove_* flags. */
5474finish:
04f590a4
LP
5475 (void) sd_notify(false,
5476 r == 0 && ret == EXIT_FORCE_RESTART ? "STOPPING=1\nSTATUS=Restarting..." :
5477 "STOPPING=1\nSTATUS=Terminating...");
af4ec430 5478
/* If a payload pid is still recorded, kill it; it is reaped further down, after the pty has been flushed. */
9444b1f2 5479 if (pid > 0)
c67b0082 5480 (void) kill(pid, SIGKILL);
88213476 5481
503546da 5482 /* Try to flush whatever is still queued in the pty */
6a0f896b 5483 if (master >= 0) {
1c876927 5484 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, 0);
6a0f896b
LP
5485 master = safe_close(master);
5486 }
5487
5488 if (pid > 0)
5489 (void) wait_for_terminate(pid, NULL);
503546da 5490
50ebcf6c
LP
5491 pager_close();
5492
/* Removal of temporary artifacts is best effort: failures are logged (warning or debug level) and otherwise
 * ignored, and they do not change the return value. */
17cbb288 5493 if (remove_directory && arg_directory) {
ec16945e
LP
5494 int k;
5495
17cbb288 5496 k = rm_rf(arg_directory, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
ec16945e 5497 if (k < 0)
17cbb288 5498 log_warning_errno(k, "Cannot remove '%s', ignoring: %m", arg_directory);
ec16945e
LP
5499 }
5500
0f3be6ca
LP
5501 if (remove_image && arg_image) {
5502 if (unlink(arg_image) < 0)
5503 log_warning_errno(errno, "Can't remove image file '%s', ignoring: %m", arg_image);
5504 }
5505
c67b0082
LP
5506 if (remove_tmprootdir) {
5507 if (rmdir(tmprootdir) < 0)
5508 log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir);
5509 }
5510
785890ac
LP
/* Remove the per-machine propagation directory below /run/systemd/nspawn/propagate/, ignoring errors. */
5511 if (arg_machine) {
5512 const char *p;
5513
63c372cb 5514 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
c6878637 5515 (void) rm_rf(p, REMOVE_ROOT);
785890ac
LP
5516 }
5517
7a8f6325 5518 expose_port_flush(arg_expose_ports, &exposed);
7513c5b8
LP
5519
/* veth links are only removed if run_container() reported having created them; remove_bridge() is called
 * unconditionally with arg_network_zone. */
5520 if (veth_created)
5521 (void) remove_veth_links(veth_name, arg_network_veth_extra);
22b28dfd 5522 (void) remove_bridge(arg_network_zone);
f757855e 5523
f757855e
LP
/* Release the argument-derived arrays/lists. */
5524 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
5525 expose_port_free_all(arg_expose_ports);
bf428efb 5526 rlimit_free_all(arg_rlimit);
b2645747 5527 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
3652872a 5528 credential_free_all(arg_credentials, arg_n_credentials);
6d0b55c2 5529
44dbef90
LP
/* A negative r (setup or runtime error) takes precedence; otherwise the exit status stored in ret is
 * returned. */
5530 if (r < 0)
5531 return r;
5532
5533 return ret;
88213476 5534}
44dbef90
LP
5535
/* Hooks run() up as the program's entry point. NOTE(review): the macro is defined outside this chunk
 * (presumably in main-func.h, which this file includes). Judging by its name, a positive return value of
 * run() is treated as a failure exit status — confirm against the macro definition. */
5536DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run);