]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
test-network: disable NDISC on veth-peer
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
db9ecf05 1/* SPDX-License-Identifier: LGPL-2.1-or-later */
88213476 2
349cc4a5 3#if HAVE_BLKID
8fe0087e 4#endif
88213476 5#include <errno.h>
88213476 6#include <getopt.h>
503f480f 7#include <linux/fs.h>
1b9e5b12 8#include <linux/loop.h>
349cc4a5 9#if HAVE_SELINUX
8fe0087e 10#include <selinux/selinux.h>
1b9e5b12 11#endif
8fe0087e 12#include <stdlib.h>
8fe0087e 13#include <sys/file.h>
335d2ead 14#include <sys/ioctl.h>
8fe0087e
LP
15#include <sys/personality.h>
16#include <sys/prctl.h>
17#include <sys/types.h>
6916b164 18#include <sys/wait.h>
335d2ead 19#include <termios.h>
8fe0087e 20#include <unistd.h>
1b9e5b12 21
b053cd5f 22#include "sd-bus.h"
1f0cd86b 23#include "sd-daemon.h"
1f0cd86b 24#include "sd-id128.h"
8fe0087e 25
b5efdb8a 26#include "alloc-util.h"
8fe0087e
LP
27#include "barrier.h"
28#include "base-filesystem.h"
29#include "blkid-util.h"
30#include "btrfs-util.h"
b8ea7a6e 31#include "bus-error.h"
b053cd5f 32#include "bus-util.h"
8fe0087e 33#include "cap-list.h"
430f0182 34#include "capability-util.h"
04d391da 35#include "cgroup-util.h"
8fe0087e 36#include "copy.h"
d107bb7d 37#include "cpu-set-util.h"
4fc9982c 38#include "dev-setup.h"
2d845785 39#include "dissect-image.h"
8fe0087e 40#include "env-util.h"
3652872a 41#include "escape.h"
3ffd4af2 42#include "fd-util.h"
842f3b0f 43#include "fdset.h"
a5c32cff 44#include "fileio.h"
f97b34a6 45#include "format-util.h"
f4f15635 46#include "fs-util.h"
1b9e5b12 47#include "gpt.h"
4623e8e6 48#include "hexdecoct.h"
e2054217 49#include "hostname-setup.h"
8fe0087e 50#include "hostname-util.h"
910fd145 51#include "id128-util.h"
3652872a 52#include "io-util.h"
8fe0087e 53#include "log.h"
2d845785 54#include "loop-util.h"
8fe0087e 55#include "loopback-setup.h"
1b9cebf6 56#include "machine-image.h"
8fe0087e 57#include "macro.h"
44dbef90 58#include "main-func.h"
f5947a5e 59#include "missing_sched.h"
8fe0087e 60#include "mkdir.h"
4349cd7c 61#include "mount-util.h"
049af8ad 62#include "mountpoint-util.h"
0cb8e3d1 63#include "namespace-util.h"
8fe0087e 64#include "netlink-util.h"
07630cea 65#include "nspawn-cgroup.h"
3652872a 66#include "nspawn-creds.h"
3603efde 67#include "nspawn-def.h"
07630cea
LP
68#include "nspawn-expose-ports.h"
69#include "nspawn-mount.h"
70#include "nspawn-network.h"
de40a303 71#include "nspawn-oci.h"
7336138e 72#include "nspawn-patch-uid.h"
07630cea 73#include "nspawn-register.h"
910fd145 74#include "nspawn-seccomp.h"
07630cea
LP
75#include "nspawn-settings.h"
76#include "nspawn-setuid.h"
7732f92b 77#include "nspawn-stub-pid1.h"
d8b4d14d 78#include "nulstr-util.h"
d58ad743 79#include "os-util.h"
50ebcf6c 80#include "pager.h"
6bedfcbb 81#include "parse-util.h"
8fe0087e 82#include "path-util.h"
294bf0c3 83#include "pretty-print.h"
0b452006 84#include "process-util.h"
8fe0087e
LP
85#include "ptyfwd.h"
86#include "random-util.h"
8869a0b4 87#include "raw-clone.h"
86775e35 88#include "resolve-util.h"
bf428efb 89#include "rlimit-util.h"
8fe0087e 90#include "rm-rf.h"
de40a303
LP
91#if HAVE_SECCOMP
92#include "seccomp-util.h"
93#endif
68b02049 94#include "selinux-util.h"
8fe0087e 95#include "signal-util.h"
2583fbea 96#include "socket-util.h"
8fcde012 97#include "stat-util.h"
15a5e950 98#include "stdio-util.h"
5c828e66 99#include "string-table.h"
07630cea 100#include "string-util.h"
8fe0087e 101#include "strv.h"
de40a303 102#include "sysctl-util.h"
8fe0087e 103#include "terminal-util.h"
e4de7287 104#include "tmpfile-util.h"
affb60b1 105#include "umask-util.h"
43c3fb46 106#include "unit-name.h"
b1d4f8e1 107#include "user-util.h"
8fe0087e 108#include "util.h"
e9642be2 109
e96ceaba
LP
/* Path, as seen from inside the container, of the socket the payload can use to talk to nspawn via the
 * sd_notify(3) protocol. */
#define NSPAWN_NOTIFY_SOCKET_PATH "/run/host/notify"

/* NOTE(review): presumably the payload exit status that asks nspawn to restart the container; the code
 * consuming it is outside this part of the file — confirm. */
#define EXIT_FORCE_RESTART 133

/* How a container run came to an end. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,
        CONTAINER_REBOOTED,
} ContainerStatus;
119
88213476 120static char *arg_directory = NULL;
ec16945e 121static char *arg_template = NULL;
5f932eb9 122static char *arg_chdir = NULL;
b53ede69
PW
123static char *arg_pivot_root_new = NULL;
124static char *arg_pivot_root_old = NULL;
687d0825 125static char *arg_user = NULL;
de40a303
LP
126static uid_t arg_uid = UID_INVALID;
127static gid_t arg_gid = GID_INVALID;
128static gid_t* arg_supplementary_gids = NULL;
129static size_t arg_n_supplementary_gids = 0;
9444b1f2 130static sd_id128_t arg_uuid = {};
3a9530e5
LP
131static char *arg_machine = NULL; /* The name used by the host to refer to this */
132static char *arg_hostname = NULL; /* The name the payload sees by default */
c74e630d
LP
133static const char *arg_selinux_context = NULL;
134static const char *arg_selinux_apifs_context = NULL;
de40a303 135static char *arg_slice = NULL;
ff01d048 136static bool arg_private_network = false;
bc2f673e 137static bool arg_read_only = false;
7732f92b 138static StartMode arg_start_mode = START_PID1;
ec16945e 139static bool arg_ephemeral = false;
57fb9fb5 140static LinkJournal arg_link_journal = LINK_AUTO;
574edc90 141static bool arg_link_journal_try = false;
520e0d54 142static uint64_t arg_caps_retain =
50b52222
LP
143 (1ULL << CAP_AUDIT_CONTROL) |
144 (1ULL << CAP_AUDIT_WRITE) |
5076f0cc
LP
145 (1ULL << CAP_CHOWN) |
146 (1ULL << CAP_DAC_OVERRIDE) |
147 (1ULL << CAP_DAC_READ_SEARCH) |
148 (1ULL << CAP_FOWNER) |
149 (1ULL << CAP_FSETID) |
150 (1ULL << CAP_IPC_OWNER) |
151 (1ULL << CAP_KILL) |
152 (1ULL << CAP_LEASE) |
153 (1ULL << CAP_LINUX_IMMUTABLE) |
50b52222 154 (1ULL << CAP_MKNOD) |
5076f0cc
LP
155 (1ULL << CAP_NET_BIND_SERVICE) |
156 (1ULL << CAP_NET_BROADCAST) |
157 (1ULL << CAP_NET_RAW) |
5076f0cc 158 (1ULL << CAP_SETFCAP) |
50b52222 159 (1ULL << CAP_SETGID) |
5076f0cc
LP
160 (1ULL << CAP_SETPCAP) |
161 (1ULL << CAP_SETUID) |
162 (1ULL << CAP_SYS_ADMIN) |
50b52222 163 (1ULL << CAP_SYS_BOOT) |
5076f0cc
LP
164 (1ULL << CAP_SYS_CHROOT) |
165 (1ULL << CAP_SYS_NICE) |
166 (1ULL << CAP_SYS_PTRACE) |
d87be9b0 167 (1ULL << CAP_SYS_RESOURCE) |
50b52222 168 (1ULL << CAP_SYS_TTY_CONFIG);
88fc9c9b 169static uint64_t arg_caps_ambient = 0;
de40a303 170static CapabilityQuintet arg_full_capabilities = CAPABILITY_QUINTET_NULL;
5a8af538 171static CustomMount *arg_custom_mounts = NULL;
88614c8a 172static size_t arg_n_custom_mounts = 0;
f4889f65 173static char **arg_setenv = NULL;
284c0b91 174static bool arg_quiet = false;
eb91eb18 175static bool arg_register = true;
89f7c846 176static bool arg_keep_unit = false;
aa28aefe 177static char **arg_network_interfaces = NULL;
c74e630d 178static char **arg_network_macvlan = NULL;
4bbfe7ad 179static char **arg_network_ipvlan = NULL;
69c79d3c 180static bool arg_network_veth = false;
f6d6bad1 181static char **arg_network_veth_extra = NULL;
f757855e 182static char *arg_network_bridge = NULL;
22b28dfd 183static char *arg_network_zone = NULL;
d7bea6b6 184static char *arg_network_namespace_path = NULL;
bb068de0 185static PagerFlags arg_pager_flags = 0;
050f7277 186static unsigned long arg_personality = PERSONALITY_INVALID;
ec16945e 187static char *arg_image = NULL;
de40a303 188static char *arg_oci_bundle = NULL;
f757855e 189static VolatileMode arg_volatile_mode = VOLATILE_NO;
6d0b55c2 190static ExposePort *arg_expose_ports = NULL;
f36933fe 191static char **arg_property = NULL;
de40a303 192static sd_bus_message *arg_property_message = NULL;
0de7acce 193static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
6dac160c 194static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
0de7acce 195static bool arg_userns_chown = false;
c6c8f6e2 196static int arg_kill_signal = 0;
5da38d07 197static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN;
f757855e
LP
198static SettingsMask arg_settings_mask = 0;
199static int arg_settings_trusted = -1;
200static char **arg_parameters = NULL;
6aadfa4c 201static const char *arg_container_service_name = "systemd-nspawn";
9c1e04d0 202static bool arg_notify_ready = false;
5a8ff0e6 203static bool arg_use_cgns = true;
0c582db0 204static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
1099ceeb 205static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_TMPFS_TMP;
aee36b4e 206static VeritySettings arg_verity_settings = VERITY_SETTINGS_DEFAULT;
6b000af4
LP
207static char **arg_syscall_allow_list = NULL;
208static char **arg_syscall_deny_list = NULL;
de40a303
LP
209#if HAVE_SECCOMP
210static scmp_filter_ctx arg_seccomp = NULL;
211#endif
bf428efb 212static struct rlimit *arg_rlimit[_RLIMIT_MAX] = {};
66edd963 213static bool arg_no_new_privileges = false;
81f345df
LP
214static int arg_oom_score_adjust = 0;
215static bool arg_oom_score_adjust_set = false;
0985c7c4 216static CPUSet arg_cpu_set = {};
09d423e9 217static ResolvConfMode arg_resolv_conf = RESOLV_CONF_AUTO;
1688841f 218static TimezoneMode arg_timezone = TIMEZONE_AUTO;
de40a303
LP
219static unsigned arg_console_width = (unsigned) -1, arg_console_height = (unsigned) -1;
220static DeviceNode* arg_extra_nodes = NULL;
221static size_t arg_n_extra_nodes = 0;
222static char **arg_sysctl = NULL;
223static ConsoleMode arg_console_mode = _CONSOLE_MODE_INVALID;
3652872a
LP
224static Credential *arg_credentials = NULL;
225static size_t arg_n_credentials = 0;
88213476 226
6145bb4f
LP
/* Pair each owning setting with the function that releases it. Registration order is kept as is.
 * NOTE(review): the point at which the registered destructors run is decided by the macro's
 * implementation, which is not part of this file — confirm before relying on ordering. */

/* Plain heap strings and arrays */
STATIC_DESTRUCTOR_REGISTER(arg_directory, freep);
STATIC_DESTRUCTOR_REGISTER(arg_template, freep);
STATIC_DESTRUCTOR_REGISTER(arg_chdir, freep);
STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_new, freep);
STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_old, freep);
STATIC_DESTRUCTOR_REGISTER(arg_user, freep);
STATIC_DESTRUCTOR_REGISTER(arg_supplementary_gids, freep);
STATIC_DESTRUCTOR_REGISTER(arg_machine, freep);
STATIC_DESTRUCTOR_REGISTER(arg_hostname, freep);
STATIC_DESTRUCTOR_REGISTER(arg_slice, freep);

/* Environment and network settings */
STATIC_DESTRUCTOR_REGISTER(arg_setenv, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_interfaces, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_macvlan, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_ipvlan, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_veth_extra, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_bridge, freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_zone, freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_namespace_path, freep);

/* Image, unit properties and payload command line */
STATIC_DESTRUCTOR_REGISTER(arg_image, freep);
STATIC_DESTRUCTOR_REGISTER(arg_oci_bundle, freep);
STATIC_DESTRUCTOR_REGISTER(arg_property, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_property_message, sd_bus_message_unrefp);
STATIC_DESTRUCTOR_REGISTER(arg_parameters, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_verity_settings, verity_settings_done);

/* System call filtering */
STATIC_DESTRUCTOR_REGISTER(arg_syscall_allow_list, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_syscall_deny_list, strv_freep);
#if HAVE_SECCOMP
STATIC_DESTRUCTOR_REGISTER(arg_seccomp, seccomp_releasep);
#endif

STATIC_DESTRUCTOR_REGISTER(arg_cpu_set, cpu_set_reset);
STATIC_DESTRUCTOR_REGISTER(arg_sysctl, strv_freep);
258
dce66ffe
ZJS
259static int handle_arg_console(const char *arg) {
260 if (streq(arg, "help")) {
10e8a60b
LP
261 puts("autopipe\n"
262 "interactive\n"
dce66ffe 263 "passive\n"
10e8a60b
LP
264 "pipe\n"
265 "read-only");
dce66ffe
ZJS
266 return 0;
267 }
268
269 if (streq(arg, "interactive"))
270 arg_console_mode = CONSOLE_INTERACTIVE;
271 else if (streq(arg, "read-only"))
272 arg_console_mode = CONSOLE_READ_ONLY;
273 else if (streq(arg, "passive"))
274 arg_console_mode = CONSOLE_PASSIVE;
554c4beb
LP
275 else if (streq(arg, "pipe")) {
276 if (isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0)
277 log_full(arg_quiet ? LOG_DEBUG : LOG_NOTICE,
278 "Console mode 'pipe' selected, but standard input/output are connected to an interactive TTY. "
279 "Most likely you want to use 'interactive' console mode for proper interactivity and shell job control. "
280 "Proceeding anyway.");
281
dce66ffe 282 arg_console_mode = CONSOLE_PIPE;
10e8a60b
LP
283 } else if (streq(arg, "autopipe")) {
284 if (isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0)
285 arg_console_mode = CONSOLE_INTERACTIVE;
286 else
287 arg_console_mode = CONSOLE_PIPE;
554c4beb 288 } else
dce66ffe
ZJS
289 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown console mode: %s", optarg);
290
291 arg_settings_mask |= SETTING_CONSOLE_MODE;
292 return 1;
293}
294
37ec0fdd
LP
295static int help(void) {
296 _cleanup_free_ char *link = NULL;
297 int r;
298
bb068de0 299 (void) pager_open(arg_pager_flags);
50ebcf6c 300
37ec0fdd
LP
301 r = terminal_urlify_man("systemd-nspawn", "1", &link);
302 if (r < 0)
303 return log_oom();
304
25148653 305 printf("%1$s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
37a92352 306 "%5$sSpawn a command or OS in a light-weight container.%6$s\n\n"
a8828ed9
DW
307 " -h --help Show this help\n"
308 " --version Print version string\n"
69c79d3c 309 " -q --quiet Do not show status information\n"
bb068de0 310 " --no-pager Do not pipe output into a pager\n"
25148653
LP
311 " --settings=BOOLEAN Load additional settings from .nspawn file\n\n"
312 "%3$sImage:%4$s\n"
1b9e5b12 313 " -D --directory=PATH Root directory for the container\n"
ec16945e
LP
314 " --template=PATH Initialize root directory from template directory,\n"
315 " if missing\n"
316 " -x --ephemeral Run container with snapshot of root directory, and\n"
317 " remove it after exit\n"
25e68fd3
LP
318 " -i --image=PATH Root file system disk image (or device node) for\n"
319 " the container\n"
de40a303 320 " --oci-bundle=PATH OCI bundle directory\n"
25148653
LP
321 " --read-only Mount the root directory read-only\n"
322 " --volatile[=MODE] Run the system in volatile mode\n"
25e68fd3 323 " --root-hash=HASH Specify verity root hash for root disk image\n"
c2923fdc
LB
324 " --root-hash-sig=SIG Specify pkcs7 signature of root hash for verity\n"
325 " as a DER encoded PKCS7, either as a path to a file\n"
326 " or as an ASCII base64 encoded string prefixed by\n"
327 " 'base64:'\n"
e7cbe5cb 328 " --verity-data=PATH Specify hash device for verity\n"
25148653
LP
329 " --pivot-root=PATH[:PATH]\n"
330 " Pivot root to given directory in the container\n\n"
331 "%3$sExecution:%4$s\n"
7732f92b 332 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
a8828ed9 333 " -b --boot Boot up full system (i.e. invoke init)\n"
5f932eb9 334 " --chdir=PATH Set working directory in the container\n"
25148653
LP
335 " -E --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
336 " -u --user=USER Run the command under specified user or UID\n"
337 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
338 " --notify-ready=BOOLEAN Receive notifications from the child init process\n\n"
339 "%3$sSystem Identity:%4$s\n"
a8828ed9 340 " -M --machine=NAME Set the machine name for the container\n"
3a9530e5 341 " --hostname=NAME Override the hostname for the container\n"
25148653
LP
342 " --uuid=UUID Set a specific machine UUID for the container\n\n"
343 "%3$sProperties:%4$s\n"
a8828ed9 344 " -S --slice=SLICE Place the container in the specified slice\n"
f36933fe 345 " --property=NAME=VALUE Set scope unit property\n"
25148653
LP
346 " --register=BOOLEAN Register container as machine\n"
347 " --keep-unit Do not register a scope for the machine, reuse\n"
348 " the service unit nspawn is running in\n\n"
349 "%3$sUser Namespacing:%4$s\n"
90b4a64d 350 " -U --private-users=pick Run within user namespace, autoselect UID/GID range\n"
03cfe0d5 351 " --private-users[=UIDBASE[:NUIDS]]\n"
90b4a64d 352 " Similar, but with user configured UID/GID range\n"
25148653
LP
353 " --private-users-chown Adjust OS tree ownership to private UID/GID range\n\n"
354 "%3$sNetworking:%4$s\n"
69c79d3c
LP
355 " --private-network Disable network in container\n"
356 " --network-interface=INTERFACE\n"
357 " Assign an existing network interface to the\n"
358 " container\n"
c74e630d
LP
359 " --network-macvlan=INTERFACE\n"
360 " Create a macvlan network interface based on an\n"
361 " existing network interface to the container\n"
4bbfe7ad
TG
362 " --network-ipvlan=INTERFACE\n"
363 " Create a ipvlan network interface based on an\n"
364 " existing network interface to the container\n"
a8eaaee7 365 " -n --network-veth Add a virtual Ethernet connection between host\n"
69c79d3c 366 " and container\n"
f6d6bad1
LP
367 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
368 " Add an additional virtual Ethernet link between\n"
369 " host and container\n"
ab046dde 370 " --network-bridge=INTERFACE\n"
90b4a64d
ZJS
371 " Add a virtual Ethernet connection to the container\n"
372 " and attach it to an existing bridge on the host\n"
373 " --network-zone=NAME Similar, but attach the new interface to an\n"
374 " an automatically managed bridge interface\n"
d7bea6b6
DP
375 " --network-namespace-path=PATH\n"
376 " Set network namespace to the one represented by\n"
377 " the specified kernel namespace file node\n"
6d0b55c2 378 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
25148653
LP
379 " Expose a container IP port on the host\n\n"
380 "%3$sSecurity:%4$s\n"
a8828ed9
DW
381 " --capability=CAP In addition to the default, retain specified\n"
382 " capability\n"
383 " --drop-capability=CAP Drop the specified capability from the default set\n"
88fc9c9b
TH
384 " --ambient-capability=CAP\n"
385 " Sets the specified capability for the started\n"
386 " process. Not useful if booting a machine.\n"
f4e803c8 387 " --no-new-privileges Set PR_SET_NO_NEW_PRIVS flag for container payload\n"
960e4569
LP
388 " --system-call-filter=LIST|~LIST\n"
389 " Permit/prohibit specific system calls\n"
25148653
LP
390 " -Z --selinux-context=SECLABEL\n"
391 " Set the SELinux security context to be used by\n"
392 " processes in the container\n"
393 " -L --selinux-apifs-context=SECLABEL\n"
394 " Set the SELinux security context to be used by\n"
395 " API/tmpfs file systems in the container\n\n"
396 "%3$sResources:%4$s\n"
bf428efb 397 " --rlimit=NAME=LIMIT Set a resource limit for the payload\n"
81f345df
LP
398 " --oom-score-adjust=VALUE\n"
399 " Adjust the OOM score value for the payload\n"
f4e803c8
LP
400 " --cpu-affinity=CPUS Adjust the CPU affinity of the container\n"
401 " --personality=ARCH Pick personality for this container\n\n"
25148653 402 "%3$sIntegration:%4$s\n"
09d423e9 403 " --resolv-conf=MODE Select mode of /etc/resolv.conf initialization\n"
1688841f 404 " --timezone=MODE Select mode of /etc/localtime initialization\n"
25148653
LP
405 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
406 " host, try-guest, try-host\n"
407 " -j Equivalent to --link-journal=try-guest\n\n"
408 "%3$sMounts:%4$s\n"
5e5bfa6e
EY
409 " --bind=PATH[:PATH[:OPTIONS]]\n"
410 " Bind mount a file or directory from the host into\n"
a8828ed9 411 " the container\n"
5e5bfa6e
EY
412 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
413 " Similar, but creates a read-only bind mount\n"
de40a303
LP
414 " --inaccessible=PATH Over-mount file node with inaccessible node to mask\n"
415 " it\n"
06c17c39 416 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
5a8af538
LP
417 " --overlay=PATH[:PATH...]:PATH\n"
418 " Create an overlay mount from the host to \n"
419 " the container\n"
420 " --overlay-ro=PATH[:PATH...]:PATH\n"
25148653
LP
421 " Similar, but creates a read-only overlay mount\n\n"
422 "%3$sInput/Output:%4$s\n"
de40a303
LP
423 " --console=MODE Select how stdin/stdout/stderr and /dev/console are\n"
424 " set up for the container.\n"
3652872a
LP
425 " -P --pipe Equivalent to --console=pipe\n\n"
426 "%3$sCredentials:%4$s\n"
427 " --set-credential=ID:VALUE\n"
428 " Pass a credential with literal value to container.\n"
429 " --load-credential=ID:PATH\n"
430 " Load credential to pass to container from file or\n"
431 " AF_UNIX stream socket.\n"
bc556335
DDM
432 "\nSee the %2$s for details.\n",
433 program_invocation_short_name,
434 link,
435 ansi_underline(),
436 ansi_normal(),
437 ansi_highlight(),
438 ansi_normal());
37ec0fdd
LP
439
440 return 0;
88213476
LP
441}
442
86c0dd4a 443static int custom_mount_check_all(void) {
88614c8a 444 size_t i;
5a8af538 445
5a8af538
LP
446 for (i = 0; i < arg_n_custom_mounts; i++) {
447 CustomMount *m = &arg_custom_mounts[i];
448
0de7acce 449 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
baaa35ad
ZJS
450 if (arg_userns_chown)
451 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
452 "--private-users-chown may not be combined with custom root mounts.");
453 else if (arg_uid_shift == UID_INVALID)
454 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
455 "--private-users with automatic UID shift may not be combined with custom root mounts.");
825d5287 456 }
5a8af538
LP
457 }
458
459 return 0;
460}
461
8199d554 462static int detect_unified_cgroup_hierarchy_from_environment(void) {
c78c095b 463 const char *e, *var = "SYSTEMD_NSPAWN_UNIFIED_HIERARCHY";
415fc41c 464 int r;
5da38d07 465
efdb0237 466 /* Allow the user to control whether the unified hierarchy is used */
c78c095b
ZJS
467
468 e = getenv(var);
469 if (!e) {
d5fc5b2f 470 /* $UNIFIED_CGROUP_HIERARCHY has been renamed to $SYSTEMD_NSPAWN_UNIFIED_HIERARCHY. */
c78c095b
ZJS
471 var = "UNIFIED_CGROUP_HIERARCHY";
472 e = getenv(var);
c78c095b
ZJS
473 }
474
475 if (!isempty(e)) {
efdb0237
LP
476 r = parse_boolean(e);
477 if (r < 0)
c78c095b 478 return log_error_errno(r, "Failed to parse $%s: %m", var);
5da38d07
TH
479 if (r > 0)
480 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
481 else
482 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237
LP
483 }
484
8199d554
LP
485 return 0;
486}
487
488static int detect_unified_cgroup_hierarchy_from_image(const char *directory) {
489 int r;
490
75b0d8b8
ZJS
491 /* Let's inherit the mode to use from the host system, but let's take into consideration what systemd
492 * in the image actually supports. */
b4cccbc1
LP
493 r = cg_all_unified();
494 if (r < 0)
495 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
496 if (r > 0) {
a8725a06
ZJS
497 /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
498 * routine only detects 231, so we'll have a false negative here for 230. */
499 r = systemd_installation_has_version(directory, 230);
500 if (r < 0)
501 return log_error_errno(r, "Failed to determine systemd version in container: %m");
502 if (r > 0)
503 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
504 else
505 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
c22800e4 506 } else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
2977724b
TH
507 /* Mixed cgroup hierarchy support was added in 233 */
508 r = systemd_installation_has_version(directory, 233);
0fd9563f
ZJS
509 if (r < 0)
510 return log_error_errno(r, "Failed to determine systemd version in container: %m");
511 if (r > 0)
512 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
513 else
514 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
515 } else
5da38d07 516 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237 517
8199d554
LP
518 log_debug("Using %s hierarchy for container.",
519 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_NONE ? "legacy" :
520 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_SYSTEMD ? "hybrid" : "unified");
521
efdb0237
LP
522 return 0;
523}
524
8a99bd0c
ZJS
525static int parse_capability_spec(const char *spec, uint64_t *ret_mask) {
526 uint64_t mask = 0;
527 int r;
528
529 for (;;) {
530 _cleanup_free_ char *t = NULL;
531
532 r = extract_first_word(&spec, &t, ",", 0);
533 if (r < 0)
534 return log_error_errno(r, "Failed to parse capability %s.", t);
535 if (r == 0)
536 break;
537
538 if (streq(t, "help")) {
539 for (int i = 0; i < capability_list_length(); i++) {
540 const char *name;
541
542 name = capability_to_name(i);
543 if (name)
544 puts(name);
545 }
546
547 return 0; /* quit */
548 }
549
550 if (streq(t, "all"))
551 mask = (uint64_t) -1;
552 else {
553 r = capability_from_name(t);
554 if (r < 0)
555 return log_error_errno(r, "Failed to parse capability %s.", t);
556
557 mask |= 1ULL << r;
558 }
559 }
560
561 *ret_mask = mask;
562 return 1; /* continue */
563}
564
49048684 565static int parse_share_ns_env(const char *name, unsigned long ns_flag) {
0c582db0
LB
566 int r;
567
568 r = getenv_bool(name);
569 if (r == -ENXIO)
49048684 570 return 0;
0c582db0 571 if (r < 0)
49048684 572 return log_error_errno(r, "Failed to parse $%s: %m", name);
de40a303 573
0c582db0 574 arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
de40a303 575 arg_settings_mask |= SETTING_CLONE_NS_FLAGS;
49048684 576 return 0;
0c582db0
LB
577}
578
49048684 579static int parse_mount_settings_env(void) {
4f086aab 580 const char *e;
1099ceeb
LP
581 int r;
582
583 r = getenv_bool("SYSTEMD_NSPAWN_TMPFS_TMP");
49048684
ZJS
584 if (r < 0 && r != -ENXIO)
585 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_TMPFS_TMP: %m");
1099ceeb
LP
586 if (r >= 0)
587 SET_FLAG(arg_mount_settings, MOUNT_APPLY_TMPFS_TMP, r > 0);
4f086aab
SU
588
589 e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
49048684 590 if (streq_ptr(e, "network"))
4f086aab 591 arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS;
4f086aab 592
49048684
ZJS
593 else if (e) {
594 r = parse_boolean(e);
595 if (r < 0)
596 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_API_VFS_WRITABLE: %m");
597
598 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_RO, r == 0);
599 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_NETNS, false);
ab8ee0f2 600 }
4f086aab 601
49048684 602 return 0;
4f086aab
SU
603}
604
49048684 605static int parse_environment(void) {
d5455d2f
LP
606 const char *e;
607 int r;
608
49048684
ZJS
609 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
610 if (r < 0)
611 return r;
612 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
613 if (r < 0)
614 return r;
615 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
616 if (r < 0)
617 return r;
618 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
619 if (r < 0)
620 return r;
d5455d2f 621
49048684
ZJS
622 r = parse_mount_settings_env();
623 if (r < 0)
624 return r;
d5455d2f 625
489fae52
ZJS
626 /* SYSTEMD_NSPAWN_USE_CGNS=0 can be used to disable CLONE_NEWCGROUP use,
627 * even if it is supported. If not supported, it has no effect. */
de40a303 628 if (!cg_ns_supported())
489fae52 629 arg_use_cgns = false;
de40a303
LP
630 else {
631 r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
632 if (r < 0) {
633 if (r != -ENXIO)
49048684 634 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_USE_CGNS: %m");
de40a303
LP
635
636 arg_use_cgns = true;
637 } else {
638 arg_use_cgns = r > 0;
639 arg_settings_mask |= SETTING_USE_CGNS;
640 }
641 }
d5455d2f
LP
642
643 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
644 if (e)
645 arg_container_service_name = e;
646
49048684 647 return detect_unified_cgroup_hierarchy_from_environment();
d5455d2f
LP
648}
649
88213476 650static int parse_argv(int argc, char *argv[]) {
a41fe3a2 651 enum {
acbeb427
ZJS
652 ARG_VERSION = 0x100,
653 ARG_PRIVATE_NETWORK,
bc2f673e 654 ARG_UUID,
5076f0cc 655 ARG_READ_ONLY,
57fb9fb5 656 ARG_CAPABILITY,
88fc9c9b 657 ARG_AMBIENT_CAPABILITY,
420c7379 658 ARG_DROP_CAPABILITY,
17fe0523
LP
659 ARG_LINK_JOURNAL,
660 ARG_BIND,
f4889f65 661 ARG_BIND_RO,
06c17c39 662 ARG_TMPFS,
5a8af538
LP
663 ARG_OVERLAY,
664 ARG_OVERLAY_RO,
de40a303 665 ARG_INACCESSIBLE,
eb91eb18 666 ARG_SHARE_SYSTEM,
89f7c846 667 ARG_REGISTER,
aa28aefe 668 ARG_KEEP_UNIT,
69c79d3c 669 ARG_NETWORK_INTERFACE,
c74e630d 670 ARG_NETWORK_MACVLAN,
4bbfe7ad 671 ARG_NETWORK_IPVLAN,
ab046dde 672 ARG_NETWORK_BRIDGE,
22b28dfd 673 ARG_NETWORK_ZONE,
f6d6bad1 674 ARG_NETWORK_VETH_EXTRA,
d7bea6b6 675 ARG_NETWORK_NAMESPACE_PATH,
6afc95b7 676 ARG_PERSONALITY,
4d9f07b4 677 ARG_VOLATILE,
ec16945e 678 ARG_TEMPLATE,
f36933fe 679 ARG_PROPERTY,
6dac160c 680 ARG_PRIVATE_USERS,
c6c8f6e2 681 ARG_KILL_SIGNAL,
f757855e 682 ARG_SETTINGS,
5f932eb9 683 ARG_CHDIR,
b53ede69 684 ARG_PIVOT_ROOT,
7336138e 685 ARG_PRIVATE_USERS_CHOWN,
9c1e04d0 686 ARG_NOTIFY_READY,
4623e8e6 687 ARG_ROOT_HASH,
89e62e0b
LP
688 ARG_ROOT_HASH_SIG,
689 ARG_VERITY_DATA,
960e4569 690 ARG_SYSTEM_CALL_FILTER,
bf428efb 691 ARG_RLIMIT,
3a9530e5 692 ARG_HOSTNAME,
66edd963 693 ARG_NO_NEW_PRIVILEGES,
81f345df 694 ARG_OOM_SCORE_ADJUST,
d107bb7d 695 ARG_CPU_AFFINITY,
09d423e9 696 ARG_RESOLV_CONF,
1688841f 697 ARG_TIMEZONE,
de40a303
LP
698 ARG_CONSOLE,
699 ARG_PIPE,
700 ARG_OCI_BUNDLE,
bb068de0 701 ARG_NO_PAGER,
3652872a
LP
702 ARG_SET_CREDENTIAL,
703 ARG_LOAD_CREDENTIAL,
a41fe3a2
LP
704 };
705
88213476 706 static const struct option options[] = {
d7bea6b6
DP
707 { "help", no_argument, NULL, 'h' },
708 { "version", no_argument, NULL, ARG_VERSION },
709 { "directory", required_argument, NULL, 'D' },
710 { "template", required_argument, NULL, ARG_TEMPLATE },
711 { "ephemeral", no_argument, NULL, 'x' },
712 { "user", required_argument, NULL, 'u' },
713 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
714 { "as-pid2", no_argument, NULL, 'a' },
715 { "boot", no_argument, NULL, 'b' },
716 { "uuid", required_argument, NULL, ARG_UUID },
717 { "read-only", no_argument, NULL, ARG_READ_ONLY },
718 { "capability", required_argument, NULL, ARG_CAPABILITY },
88fc9c9b 719 { "ambient-capability", required_argument, NULL, ARG_AMBIENT_CAPABILITY },
d7bea6b6 720 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
66edd963 721 { "no-new-privileges", required_argument, NULL, ARG_NO_NEW_PRIVILEGES },
d7bea6b6
DP
722 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
723 { "bind", required_argument, NULL, ARG_BIND },
724 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
725 { "tmpfs", required_argument, NULL, ARG_TMPFS },
726 { "overlay", required_argument, NULL, ARG_OVERLAY },
727 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
de40a303 728 { "inaccessible", required_argument, NULL, ARG_INACCESSIBLE },
d7bea6b6 729 { "machine", required_argument, NULL, 'M' },
3a9530e5 730 { "hostname", required_argument, NULL, ARG_HOSTNAME },
d7bea6b6
DP
731 { "slice", required_argument, NULL, 'S' },
732 { "setenv", required_argument, NULL, 'E' },
733 { "selinux-context", required_argument, NULL, 'Z' },
734 { "selinux-apifs-context", required_argument, NULL, 'L' },
735 { "quiet", no_argument, NULL, 'q' },
736 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */
737 { "register", required_argument, NULL, ARG_REGISTER },
738 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
739 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
740 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
741 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
742 { "network-veth", no_argument, NULL, 'n' },
743 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA },
744 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
745 { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
746 { "network-namespace-path", required_argument, NULL, ARG_NETWORK_NAMESPACE_PATH },
747 { "personality", required_argument, NULL, ARG_PERSONALITY },
748 { "image", required_argument, NULL, 'i' },
749 { "volatile", optional_argument, NULL, ARG_VOLATILE },
750 { "port", required_argument, NULL, 'p' },
751 { "property", required_argument, NULL, ARG_PROPERTY },
752 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
753 { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN },
754 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
755 { "settings", required_argument, NULL, ARG_SETTINGS },
756 { "chdir", required_argument, NULL, ARG_CHDIR },
757 { "pivot-root", required_argument, NULL, ARG_PIVOT_ROOT },
758 { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
759 { "root-hash", required_argument, NULL, ARG_ROOT_HASH },
89e62e0b
LP
760 { "root-hash-sig", required_argument, NULL, ARG_ROOT_HASH_SIG },
761 { "verity-data", required_argument, NULL, ARG_VERITY_DATA },
d7bea6b6 762 { "system-call-filter", required_argument, NULL, ARG_SYSTEM_CALL_FILTER },
bf428efb 763 { "rlimit", required_argument, NULL, ARG_RLIMIT },
81f345df 764 { "oom-score-adjust", required_argument, NULL, ARG_OOM_SCORE_ADJUST },
d107bb7d 765 { "cpu-affinity", required_argument, NULL, ARG_CPU_AFFINITY },
09d423e9 766 { "resolv-conf", required_argument, NULL, ARG_RESOLV_CONF },
1688841f 767 { "timezone", required_argument, NULL, ARG_TIMEZONE },
de40a303
LP
768 { "console", required_argument, NULL, ARG_CONSOLE },
769 { "pipe", no_argument, NULL, ARG_PIPE },
770 { "oci-bundle", required_argument, NULL, ARG_OCI_BUNDLE },
bb068de0 771 { "no-pager", no_argument, NULL, ARG_NO_PAGER },
3652872a
LP
772 { "set-credential", required_argument, NULL, ARG_SET_CREDENTIAL },
773 { "load-credential", required_argument, NULL, ARG_LOAD_CREDENTIAL },
eb9da376 774 {}
88213476
LP
775 };
776
9444b1f2 777 int c, r;
a42c8b54 778 uint64_t plus = 0, minus = 0;
f757855e 779 bool mask_all_settings = false, mask_no_settings = false;
88213476
LP
780
781 assert(argc >= 0);
782 assert(argv);
783
de40a303 784 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nUE:P", options, NULL)) >= 0)
88213476
LP
785 switch (c) {
786
787 case 'h':
37ec0fdd 788 return help();
88213476 789
acbeb427 790 case ARG_VERSION:
3f6fd1ba 791 return version();
acbeb427 792
88213476 793 case 'D':
0f03c2a4 794 r = parse_path_argument_and_warn(optarg, false, &arg_directory);
ec16945e 795 if (r < 0)
0f03c2a4 796 return r;
de40a303
LP
797
798 arg_settings_mask |= SETTING_DIRECTORY;
ec16945e
LP
799 break;
800
801 case ARG_TEMPLATE:
0f03c2a4 802 r = parse_path_argument_and_warn(optarg, false, &arg_template);
ec16945e 803 if (r < 0)
0f03c2a4 804 return r;
de40a303
LP
805
806 arg_settings_mask |= SETTING_DIRECTORY;
88213476
LP
807 break;
808
1b9e5b12 809 case 'i':
0f03c2a4 810 r = parse_path_argument_and_warn(optarg, false, &arg_image);
ec16945e 811 if (r < 0)
0f03c2a4 812 return r;
de40a303
LP
813
814 arg_settings_mask |= SETTING_DIRECTORY;
815 break;
816
817 case ARG_OCI_BUNDLE:
818 r = parse_path_argument_and_warn(optarg, false, &arg_oci_bundle);
819 if (r < 0)
820 return r;
821
ec16945e
LP
822 break;
823
824 case 'x':
825 arg_ephemeral = true;
a2f577fc 826 arg_settings_mask |= SETTING_EPHEMERAL;
1b9e5b12
LP
827 break;
828
687d0825 829 case 'u':
2fc09a9c
DM
830 r = free_and_strdup(&arg_user, optarg);
831 if (r < 0)
7027ff61 832 return log_oom();
687d0825 833
f757855e 834 arg_settings_mask |= SETTING_USER;
687d0825
MV
835 break;
836
22b28dfd
LP
837 case ARG_NETWORK_ZONE: {
838 char *j;
839
b910cc72 840 j = strjoin("vz-", optarg);
22b28dfd
LP
841 if (!j)
842 return log_oom();
843
844 if (!ifname_valid(j)) {
845 log_error("Network zone name not valid: %s", j);
846 free(j);
847 return -EINVAL;
848 }
849
df1fac6d 850 free_and_replace(arg_network_zone, j);
22b28dfd
LP
851
852 arg_network_veth = true;
853 arg_private_network = true;
854 arg_settings_mask |= SETTING_NETWORK;
855 break;
856 }
857
ab046dde 858 case ARG_NETWORK_BRIDGE:
ef76dff2 859
baaa35ad
ZJS
860 if (!ifname_valid(optarg))
861 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
862 "Bridge interface name not valid: %s", optarg);
ef76dff2 863
f757855e
LP
864 r = free_and_strdup(&arg_network_bridge, optarg);
865 if (r < 0)
866 return log_oom();
ab046dde 867
4831981d 868 _fallthrough_;
0dfaa006 869 case 'n':
69c79d3c
LP
870 arg_network_veth = true;
871 arg_private_network = true;
f757855e 872 arg_settings_mask |= SETTING_NETWORK;
69c79d3c
LP
873 break;
874
f6d6bad1
LP
875 case ARG_NETWORK_VETH_EXTRA:
876 r = veth_extra_parse(&arg_network_veth_extra, optarg);
877 if (r < 0)
878 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
879
880 arg_private_network = true;
881 arg_settings_mask |= SETTING_NETWORK;
882 break;
883
aa28aefe 884 case ARG_NETWORK_INTERFACE:
baaa35ad
ZJS
885 if (!ifname_valid(optarg))
886 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
887 "Network interface name not valid: %s", optarg);
ef76dff2 888
b390f178
DDM
889 r = test_network_interface_initialized(optarg);
890 if (r < 0)
891 return r;
892
c74e630d
LP
893 if (strv_extend(&arg_network_interfaces, optarg) < 0)
894 return log_oom();
895
896 arg_private_network = true;
f757855e 897 arg_settings_mask |= SETTING_NETWORK;
c74e630d
LP
898 break;
899
900 case ARG_NETWORK_MACVLAN:
ef76dff2 901
baaa35ad
ZJS
902 if (!ifname_valid(optarg))
903 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
904 "MACVLAN network interface name not valid: %s", optarg);
ef76dff2 905
b390f178
DDM
906 r = test_network_interface_initialized(optarg);
907 if (r < 0)
908 return r;
909
c74e630d 910 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
911 return log_oom();
912
4bbfe7ad 913 arg_private_network = true;
f757855e 914 arg_settings_mask |= SETTING_NETWORK;
4bbfe7ad
TG
915 break;
916
917 case ARG_NETWORK_IPVLAN:
ef76dff2 918
baaa35ad
ZJS
919 if (!ifname_valid(optarg))
920 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
921 "IPVLAN network interface name not valid: %s", optarg);
ef76dff2 922
b390f178
DDM
923 r = test_network_interface_initialized(optarg);
924 if (r < 0)
925 return r;
926
4bbfe7ad
TG
927 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
928 return log_oom();
929
4831981d 930 _fallthrough_;
ff01d048
LP
931 case ARG_PRIVATE_NETWORK:
932 arg_private_network = true;
f757855e 933 arg_settings_mask |= SETTING_NETWORK;
a41fe3a2
LP
934 break;
935
d7bea6b6
DP
936 case ARG_NETWORK_NAMESPACE_PATH:
937 r = parse_path_argument_and_warn(optarg, false, &arg_network_namespace_path);
938 if (r < 0)
939 return r;
940
de40a303 941 arg_settings_mask |= SETTING_NETWORK;
d7bea6b6
DP
942 break;
943
0f0dbc46 944 case 'b':
baaa35ad
ZJS
945 if (arg_start_mode == START_PID2)
946 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
947 "--boot and --as-pid2 may not be combined.");
7732f92b
LP
948
949 arg_start_mode = START_BOOT;
950 arg_settings_mask |= SETTING_START_MODE;
951 break;
952
953 case 'a':
baaa35ad
ZJS
954 if (arg_start_mode == START_BOOT)
955 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
956 "--boot and --as-pid2 may not be combined.");
7732f92b
LP
957
958 arg_start_mode = START_PID2;
959 arg_settings_mask |= SETTING_START_MODE;
0f0dbc46
LP
960 break;
961
144f0fc0 962 case ARG_UUID:
9444b1f2 963 r = sd_id128_from_string(optarg, &arg_uuid);
317feb4d
LP
964 if (r < 0)
965 return log_error_errno(r, "Invalid UUID: %s", optarg);
966
baaa35ad
ZJS
967 if (sd_id128_is_null(arg_uuid))
968 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
969 "Machine UUID may not be all zeroes.");
f757855e
LP
970
971 arg_settings_mask |= SETTING_MACHINE_ID;
9444b1f2 972 break;
aa96c6cb 973
43c3fb46
LP
974 case 'S': {
975 _cleanup_free_ char *mangled = NULL;
976
977 r = unit_name_mangle_with_suffix(optarg, NULL, UNIT_NAME_MANGLE_WARN, ".slice", &mangled);
de40a303
LP
978 if (r < 0)
979 return log_oom();
980
43c3fb46 981 free_and_replace(arg_slice, mangled);
de40a303 982 arg_settings_mask |= SETTING_SLICE;
144f0fc0 983 break;
43c3fb46 984 }
144f0fc0 985
7027ff61 986 case 'M':
c1521918 987 if (isempty(optarg))
97b11eed 988 arg_machine = mfree(arg_machine);
c1521918 989 else {
52ef5dd7 990 if (!hostname_is_valid(optarg, 0))
baaa35ad
ZJS
991 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
992 "Invalid machine name: %s", optarg);
7027ff61 993
0c3c4284
LP
994 r = free_and_strdup(&arg_machine, optarg);
995 if (r < 0)
eb91eb18 996 return log_oom();
eb91eb18 997 }
9ce6d1b3 998 break;
7027ff61 999
3a9530e5
LP
1000 case ARG_HOSTNAME:
1001 if (isempty(optarg))
1002 arg_hostname = mfree(arg_hostname);
1003 else {
52ef5dd7 1004 if (!hostname_is_valid(optarg, 0))
baaa35ad
ZJS
1005 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1006 "Invalid hostname: %s", optarg);
3a9530e5
LP
1007
1008 r = free_and_strdup(&arg_hostname, optarg);
1009 if (r < 0)
1010 return log_oom();
1011 }
1012
1013 arg_settings_mask |= SETTING_HOSTNAME;
1014 break;
1015
82adf6af
LP
1016 case 'Z':
1017 arg_selinux_context = optarg;
a8828ed9
DW
1018 break;
1019
82adf6af
LP
1020 case 'L':
1021 arg_selinux_apifs_context = optarg;
a8828ed9
DW
1022 break;
1023
bc2f673e
LP
1024 case ARG_READ_ONLY:
1025 arg_read_only = true;
f757855e 1026 arg_settings_mask |= SETTING_READ_ONLY;
bc2f673e
LP
1027 break;
1028
88fc9c9b
TH
1029 case ARG_AMBIENT_CAPABILITY: {
1030 uint64_t m;
1031 r = parse_capability_spec(optarg, &m);
1032 if (r <= 0)
1033 return r;
1034 arg_caps_ambient |= m;
1035 arg_settings_mask |= SETTING_CAPABILITY;
1036 break;
1037 }
420c7379
LP
1038 case ARG_CAPABILITY:
1039 case ARG_DROP_CAPABILITY: {
8a99bd0c
ZJS
1040 uint64_t m;
1041 r = parse_capability_spec(optarg, &m);
1042 if (r <= 0)
1043 return r;
5076f0cc 1044
8a99bd0c
ZJS
1045 if (c == ARG_CAPABILITY)
1046 plus |= m;
1047 else
1048 minus |= m;
f757855e 1049 arg_settings_mask |= SETTING_CAPABILITY;
5076f0cc
LP
1050 break;
1051 }
66edd963
LP
1052 case ARG_NO_NEW_PRIVILEGES:
1053 r = parse_boolean(optarg);
1054 if (r < 0)
1055 return log_error_errno(r, "Failed to parse --no-new-privileges= argument: %s", optarg);
1056
1057 arg_no_new_privileges = r;
1058 arg_settings_mask |= SETTING_NO_NEW_PRIVILEGES;
1059 break;
1060
57fb9fb5
LP
1061 case 'j':
1062 arg_link_journal = LINK_GUEST;
574edc90 1063 arg_link_journal_try = true;
4e1d6aa9 1064 arg_settings_mask |= SETTING_LINK_JOURNAL;
57fb9fb5
LP
1065 break;
1066
1067 case ARG_LINK_JOURNAL:
4e1d6aa9 1068 r = parse_link_journal(optarg, &arg_link_journal, &arg_link_journal_try);
c6147113
LP
1069 if (r < 0)
1070 return log_error_errno(r, "Failed to parse link journal mode %s", optarg);
57fb9fb5 1071
4e1d6aa9 1072 arg_settings_mask |= SETTING_LINK_JOURNAL;
57fb9fb5
LP
1073 break;
1074
17fe0523 1075 case ARG_BIND:
f757855e
LP
1076 case ARG_BIND_RO:
1077 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
1078 if (r < 0)
1079 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
17fe0523 1080
f757855e 1081 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
17fe0523 1082 break;
06c17c39 1083
f757855e
LP
1084 case ARG_TMPFS:
1085 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
1086 if (r < 0)
1087 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
5a8af538 1088
f757855e 1089 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
5a8af538 1090 break;
5a8af538
LP
1091
1092 case ARG_OVERLAY:
ad85779a
LP
1093 case ARG_OVERLAY_RO:
1094 r = overlay_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_OVERLAY_RO);
1095 if (r == -EADDRNOTAVAIL)
1096 return log_error_errno(r, "--overlay(-ro)= needs at least two colon-separated directories specified.");
1097 if (r < 0)
1098 return log_error_errno(r, "Failed to parse --overlay(-ro)= argument %s: %m", optarg);
06c17c39 1099
f757855e 1100 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
06c17c39 1101 break;
06c17c39 1102
de40a303
LP
1103 case ARG_INACCESSIBLE:
1104 r = inaccessible_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
1105 if (r < 0)
1106 return log_error_errno(r, "Failed to parse --inaccessible= argument %s: %m", optarg);
1107
1108 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1109 break;
1110
a5f1cb3b 1111 case 'E': {
f4889f65
LP
1112 char **n;
1113
baaa35ad
ZJS
1114 if (!env_assignment_is_valid(optarg))
1115 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1116 "Environment variable assignment '%s' is not valid.", optarg);
f4889f65
LP
1117
1118 n = strv_env_set(arg_setenv, optarg);
1119 if (!n)
1120 return log_oom();
1121
130d3d22 1122 strv_free_and_replace(arg_setenv, n);
f757855e 1123 arg_settings_mask |= SETTING_ENVIRONMENT;
f4889f65
LP
1124 break;
1125 }
1126
284c0b91
LP
1127 case 'q':
1128 arg_quiet = true;
1129 break;
1130
8a96d94e 1131 case ARG_SHARE_SYSTEM:
a6b5216c 1132 /* We don't officially support this anymore, except for compat reasons. People should use the
0c582db0 1133 * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
63d1c29f 1134 log_warning("Please do not use --share-system anymore, use $SYSTEMD_NSPAWN_SHARE_* instead.");
0c582db0 1135 arg_clone_ns_flags = 0;
8a96d94e
LP
1136 break;
1137
eb91eb18
LP
1138 case ARG_REGISTER:
1139 r = parse_boolean(optarg);
1140 if (r < 0) {
1141 log_error("Failed to parse --register= argument: %s", optarg);
1142 return r;
1143 }
1144
1145 arg_register = r;
1146 break;
1147
89f7c846
LP
1148 case ARG_KEEP_UNIT:
1149 arg_keep_unit = true;
1150 break;
1151
6afc95b7
LP
1152 case ARG_PERSONALITY:
1153
ac45f971 1154 arg_personality = personality_from_string(optarg);
baaa35ad
ZJS
1155 if (arg_personality == PERSONALITY_INVALID)
1156 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1157 "Unknown or unsupported personality '%s'.", optarg);
6afc95b7 1158
f757855e 1159 arg_settings_mask |= SETTING_PERSONALITY;
6afc95b7
LP
1160 break;
1161
4d9f07b4
LP
1162 case ARG_VOLATILE:
1163
1164 if (!optarg)
f757855e 1165 arg_volatile_mode = VOLATILE_YES;
5c828e66
LP
1166 else if (streq(optarg, "help")) {
1167 DUMP_STRING_TABLE(volatile_mode, VolatileMode, _VOLATILE_MODE_MAX);
1168 return 0;
1169 } else {
f757855e 1170 VolatileMode m;
4d9f07b4 1171
f757855e 1172 m = volatile_mode_from_string(optarg);
baaa35ad
ZJS
1173 if (m < 0)
1174 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1175 "Failed to parse --volatile= argument: %s", optarg);
1176 else
f757855e 1177 arg_volatile_mode = m;
6d0b55c2
LP
1178 }
1179
f757855e
LP
1180 arg_settings_mask |= SETTING_VOLATILE_MODE;
1181 break;
6d0b55c2 1182
f757855e
LP
1183 case 'p':
1184 r = expose_port_parse(&arg_expose_ports, optarg);
1185 if (r == -EEXIST)
1186 return log_error_errno(r, "Duplicate port specification: %s", optarg);
1187 if (r < 0)
1188 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
6d0b55c2 1189
f757855e 1190 arg_settings_mask |= SETTING_EXPOSE_PORTS;
6d0b55c2 1191 break;
6d0b55c2 1192
f36933fe
LP
1193 case ARG_PROPERTY:
1194 if (strv_extend(&arg_property, optarg) < 0)
1195 return log_oom();
1196
1197 break;
1198
ae209204
ZJS
1199 case ARG_PRIVATE_USERS: {
1200 int boolean = -1;
0de7acce 1201
ae209204
ZJS
1202 if (!optarg)
1203 boolean = true;
1204 else if (!in_charset(optarg, DIGITS))
1205 /* do *not* parse numbers as booleans */
1206 boolean = parse_boolean(optarg);
1207
1208 if (boolean == false) {
0de7acce
LP
1209 /* no: User namespacing off */
1210 arg_userns_mode = USER_NAMESPACE_NO;
1211 arg_uid_shift = UID_INVALID;
1212 arg_uid_range = UINT32_C(0x10000);
ae209204 1213 } else if (boolean == true) {
0de7acce
LP
1214 /* yes: User namespacing on, UID range is read from root dir */
1215 arg_userns_mode = USER_NAMESPACE_FIXED;
1216 arg_uid_shift = UID_INVALID;
1217 arg_uid_range = UINT32_C(0x10000);
1218 } else if (streq(optarg, "pick")) {
1219 /* pick: User namespacing on, UID range is picked randomly */
1220 arg_userns_mode = USER_NAMESPACE_PICK;
1221 arg_uid_shift = UID_INVALID;
1222 arg_uid_range = UINT32_C(0x10000);
1223 } else {
6c2058b3 1224 _cleanup_free_ char *buffer = NULL;
6dac160c
LP
1225 const char *range, *shift;
1226
0de7acce
LP
1227 /* anything else: User namespacing on, UID range is explicitly configured */
1228
6dac160c
LP
1229 range = strchr(optarg, ':');
1230 if (range) {
6c2058b3
ZJS
1231 buffer = strndup(optarg, range - optarg);
1232 if (!buffer)
1233 return log_oom();
1234 shift = buffer;
6dac160c
LP
1235
1236 range++;
bfd292ec
ZJS
1237 r = safe_atou32(range, &arg_uid_range);
1238 if (r < 0)
be715731 1239 return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range);
6dac160c
LP
1240 } else
1241 shift = optarg;
1242
be715731
ZJS
1243 r = parse_uid(shift, &arg_uid_shift);
1244 if (r < 0)
1245 return log_error_errno(r, "Failed to parse UID \"%s\": %m", optarg);
0de7acce
LP
1246
1247 arg_userns_mode = USER_NAMESPACE_FIXED;
6dac160c
LP
1248 }
1249
baaa35ad
ZJS
1250 if (arg_uid_range <= 0)
1251 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1252 "UID range cannot be 0.");
be715731 1253
0de7acce 1254 arg_settings_mask |= SETTING_USERNS;
6dac160c 1255 break;
ae209204 1256 }
6dac160c 1257
0de7acce 1258 case 'U':
ccabee0d
LP
1259 if (userns_supported()) {
1260 arg_userns_mode = USER_NAMESPACE_PICK;
1261 arg_uid_shift = UID_INVALID;
1262 arg_uid_range = UINT32_C(0x10000);
1263
1264 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
1265 }
1266
7336138e
LP
1267 break;
1268
0de7acce 1269 case ARG_PRIVATE_USERS_CHOWN:
19aac838 1270 arg_userns_chown = true;
0de7acce
LP
1271
1272 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
1273 break;
1274
c6c8f6e2 1275 case ARG_KILL_SIGNAL:
5c828e66
LP
1276 if (streq(optarg, "help")) {
1277 DUMP_STRING_TABLE(signal, int, _NSIG);
1278 return 0;
1279 }
1280
29a3db75 1281 arg_kill_signal = signal_from_string(optarg);
baaa35ad
ZJS
1282 if (arg_kill_signal < 0)
1283 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1284 "Cannot parse signal: %s", optarg);
c6c8f6e2 1285
f757855e
LP
1286 arg_settings_mask |= SETTING_KILL_SIGNAL;
1287 break;
1288
1289 case ARG_SETTINGS:
1290
1291 /* no → do not read files
1292 * yes → read files, do not override cmdline, trust only subset
1293 * override → read files, override cmdline, trust only subset
1294 * trusted → read files, do not override cmdline, trust all
1295 */
1296
1297 r = parse_boolean(optarg);
1298 if (r < 0) {
1299 if (streq(optarg, "trusted")) {
1300 mask_all_settings = false;
1301 mask_no_settings = false;
1302 arg_settings_trusted = true;
1303
1304 } else if (streq(optarg, "override")) {
1305 mask_all_settings = false;
1306 mask_no_settings = true;
1307 arg_settings_trusted = -1;
1308 } else
1309 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
1310 } else if (r > 0) {
1311 /* yes */
1312 mask_all_settings = false;
1313 mask_no_settings = false;
1314 arg_settings_trusted = -1;
1315 } else {
1316 /* no */
1317 mask_all_settings = true;
1318 mask_no_settings = false;
1319 arg_settings_trusted = false;
1320 }
1321
c6c8f6e2
LP
1322 break;
1323
5f932eb9 1324 case ARG_CHDIR:
baaa35ad
ZJS
1325 if (!path_is_absolute(optarg))
1326 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1327 "Working directory %s is not an absolute path.", optarg);
5f932eb9
LP
1328
1329 r = free_and_strdup(&arg_chdir, optarg);
1330 if (r < 0)
1331 return log_oom();
1332
1333 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
1334 break;
1335
b53ede69
PW
1336 case ARG_PIVOT_ROOT:
1337 r = pivot_root_parse(&arg_pivot_root_new, &arg_pivot_root_old, optarg);
1338 if (r < 0)
1339 return log_error_errno(r, "Failed to parse --pivot-root= argument %s: %m", optarg);
1340
1341 arg_settings_mask |= SETTING_PIVOT_ROOT;
1342 break;
1343
9c1e04d0
AP
1344 case ARG_NOTIFY_READY:
1345 r = parse_boolean(optarg);
baaa35ad
ZJS
1346 if (r < 0)
1347 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1348 "%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
9c1e04d0
AP
1349 arg_notify_ready = r;
1350 arg_settings_mask |= SETTING_NOTIFY_READY;
1351 break;
1352
4623e8e6 1353 case ARG_ROOT_HASH: {
89e62e0b 1354 _cleanup_free_ void *k = NULL;
4623e8e6
LP
1355 size_t l;
1356
1357 r = unhexmem(optarg, strlen(optarg), &k, &l);
1358 if (r < 0)
1359 return log_error_errno(r, "Failed to parse root hash: %s", optarg);
89e62e0b 1360 if (l < sizeof(sd_id128_t))
c6147113 1361 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Root hash must be at least 128bit long: %s", optarg);
4623e8e6 1362
89e62e0b
LP
1363 free_and_replace(arg_verity_settings.root_hash, k);
1364 arg_verity_settings.root_hash_size = l;
4623e8e6
LP
1365 break;
1366 }
1367
c2923fdc
LB
1368 case ARG_ROOT_HASH_SIG: {
1369 char *value;
89e62e0b
LP
1370 size_t l;
1371 void *p;
c2923fdc
LB
1372
1373 if ((value = startswith(optarg, "base64:"))) {
c2923fdc
LB
1374 r = unbase64mem(value, strlen(value), &p, &l);
1375 if (r < 0)
1376 return log_error_errno(r, "Failed to parse root hash signature '%s': %m", optarg);
1377
c2923fdc 1378 } else {
89e62e0b 1379 r = read_full_file(optarg, (char**) &p, &l);
c2923fdc 1380 if (r < 0)
89e62e0b 1381 return log_error_errno(r, "Failed parse root hash signature file '%s': %m", optarg);
c2923fdc
LB
1382 }
1383
89e62e0b
LP
1384 free_and_replace(arg_verity_settings.root_hash_sig, p);
1385 arg_verity_settings.root_hash_sig_size = l;
c2923fdc
LB
1386 break;
1387 }
1388
89e62e0b
LP
1389 case ARG_VERITY_DATA:
1390 r = parse_path_argument_and_warn(optarg, false, &arg_verity_settings.data_path);
1391 if (r < 0)
1392 return r;
1393 break;
1394
960e4569
LP
1395 case ARG_SYSTEM_CALL_FILTER: {
1396 bool negative;
1397 const char *items;
1398
1399 negative = optarg[0] == '~';
1400 items = negative ? optarg + 1 : optarg;
1401
1402 for (;;) {
1403 _cleanup_free_ char *word = NULL;
1404
1405 r = extract_first_word(&items, &word, NULL, 0);
1406 if (r == 0)
1407 break;
1408 if (r == -ENOMEM)
1409 return log_oom();
1410 if (r < 0)
1411 return log_error_errno(r, "Failed to parse system call filter: %m");
1412
1413 if (negative)
6b000af4 1414 r = strv_extend(&arg_syscall_deny_list, word);
960e4569 1415 else
6b000af4 1416 r = strv_extend(&arg_syscall_allow_list, word);
960e4569
LP
1417 if (r < 0)
1418 return log_oom();
1419 }
1420
1421 arg_settings_mask |= SETTING_SYSCALL_FILTER;
1422 break;
1423 }
1424
bf428efb
LP
1425 case ARG_RLIMIT: {
1426 const char *eq;
622ecfa8 1427 _cleanup_free_ char *name = NULL;
bf428efb
LP
1428 int rl;
1429
5c828e66
LP
1430 if (streq(optarg, "help")) {
1431 DUMP_STRING_TABLE(rlimit, int, _RLIMIT_MAX);
1432 return 0;
1433 }
1434
bf428efb 1435 eq = strchr(optarg, '=');
baaa35ad
ZJS
1436 if (!eq)
1437 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1438 "--rlimit= expects an '=' assignment.");
bf428efb
LP
1439
1440 name = strndup(optarg, eq - optarg);
1441 if (!name)
1442 return log_oom();
1443
1444 rl = rlimit_from_string_harder(name);
baaa35ad
ZJS
1445 if (rl < 0)
1446 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1447 "Unknown resource limit: %s", name);
bf428efb
LP
1448
1449 if (!arg_rlimit[rl]) {
1450 arg_rlimit[rl] = new0(struct rlimit, 1);
1451 if (!arg_rlimit[rl])
1452 return log_oom();
1453 }
1454
1455 r = rlimit_parse(rl, eq + 1, arg_rlimit[rl]);
1456 if (r < 0)
1457 return log_error_errno(r, "Failed to parse resource limit: %s", eq + 1);
1458
1459 arg_settings_mask |= SETTING_RLIMIT_FIRST << rl;
1460 break;
1461 }
1462
81f345df
LP
1463 case ARG_OOM_SCORE_ADJUST:
1464 r = parse_oom_score_adjust(optarg, &arg_oom_score_adjust);
1465 if (r < 0)
1466 return log_error_errno(r, "Failed to parse --oom-score-adjust= parameter: %s", optarg);
1467
1468 arg_oom_score_adjust_set = true;
1469 arg_settings_mask |= SETTING_OOM_SCORE_ADJUST;
1470 break;
1471
d107bb7d 1472 case ARG_CPU_AFFINITY: {
0985c7c4 1473 CPUSet cpuset;
d107bb7d
LP
1474
1475 r = parse_cpu_set(optarg, &cpuset);
1476 if (r < 0)
0985c7c4 1477 return log_error_errno(r, "Failed to parse CPU affinity mask %s: %m", optarg);
d107bb7d 1478
0985c7c4
ZJS
1479 cpu_set_reset(&arg_cpu_set);
1480 arg_cpu_set = cpuset;
d107bb7d
LP
1481 arg_settings_mask |= SETTING_CPU_AFFINITY;
1482 break;
1483 }
1484
09d423e9
LP
1485 case ARG_RESOLV_CONF:
1486 if (streq(optarg, "help")) {
1487 DUMP_STRING_TABLE(resolv_conf_mode, ResolvConfMode, _RESOLV_CONF_MODE_MAX);
1488 return 0;
1489 }
1490
1491 arg_resolv_conf = resolv_conf_mode_from_string(optarg);
baaa35ad
ZJS
1492 if (arg_resolv_conf < 0)
1493 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1494 "Failed to parse /etc/resolv.conf mode: %s", optarg);
09d423e9
LP
1495
1496 arg_settings_mask |= SETTING_RESOLV_CONF;
1497 break;
1498
1688841f
LP
1499 case ARG_TIMEZONE:
1500 if (streq(optarg, "help")) {
1501 DUMP_STRING_TABLE(timezone_mode, TimezoneMode, _TIMEZONE_MODE_MAX);
1502 return 0;
1503 }
1504
1505 arg_timezone = timezone_mode_from_string(optarg);
baaa35ad
ZJS
1506 if (arg_timezone < 0)
1507 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1508 "Failed to parse /etc/localtime mode: %s", optarg);
1688841f
LP
1509
1510 arg_settings_mask |= SETTING_TIMEZONE;
1511 break;
1512
de40a303 1513 case ARG_CONSOLE:
dce66ffe
ZJS
1514 r = handle_arg_console(optarg);
1515 if (r <= 0)
1516 return r;
de40a303
LP
1517 break;
1518
1519 case 'P':
1520 case ARG_PIPE:
dce66ffe
ZJS
1521 r = handle_arg_console("pipe");
1522 if (r <= 0)
1523 return r;
de40a303
LP
1524 break;
1525
bb068de0
ZJS
1526 case ARG_NO_PAGER:
1527 arg_pager_flags |= PAGER_DISABLE;
1528 break;
1529
3652872a
LP
1530 case ARG_SET_CREDENTIAL: {
1531 _cleanup_free_ char *word = NULL, *data = NULL;
1532 const char *p = optarg;
1533 Credential *a;
1534 size_t i;
1535 int l;
1536
1537 r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
1538 if (r == -ENOMEM)
1539 return log_oom();
1540 if (r < 0)
1541 return log_error_errno(r, "Failed to parse --set-credential= parameter: %m");
1542 if (r == 0 || !p)
1543 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Missing value for --set-credential=: %s", optarg);
1544
1545 if (!credential_name_valid(word))
1546 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Credential name is not valid: %s", word);
1547
1548 for (i = 0; i < arg_n_credentials; i++)
1549 if (streq(arg_credentials[i].id, word))
1550 return log_error_errno(SYNTHETIC_ERRNO(EEXIST), "Duplicate credential '%s', refusing.", word);
1551
1552 l = cunescape(p, UNESCAPE_ACCEPT_NUL, &data);
1553 if (l < 0)
1554 return log_error_errno(l, "Failed to unescape credential data: %s", p);
1555
1556 a = reallocarray(arg_credentials, arg_n_credentials + 1, sizeof(Credential));
1557 if (!a)
1558 return log_oom();
1559
1560 a[arg_n_credentials++] = (Credential) {
1561 .id = TAKE_PTR(word),
1562 .data = TAKE_PTR(data),
1563 .size = l,
1564 };
1565
1566 arg_credentials = a;
1567
1568 arg_settings_mask |= SETTING_CREDENTIALS;
1569 break;
1570 }
1571
1572 case ARG_LOAD_CREDENTIAL: {
1573 ReadFullFileFlags flags = READ_FULL_FILE_SECURE;
1574 _cleanup_(erase_and_freep) char *data = NULL;
1575 _cleanup_free_ char *word = NULL, *j = NULL;
1576 const char *p = optarg;
1577 Credential *a;
1578 size_t size, i;
1579
1580 r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
1581 if (r == -ENOMEM)
1582 return log_oom();
1583 if (r < 0)
1584 return log_error_errno(r, "Failed to parse --set-credential= parameter: %m");
1585 if (r == 0 || !p)
1586 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Missing value for --set-credential=: %s", optarg);
1587
1588 if (!credential_name_valid(word))
1589 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Credential name is not valid: %s", word);
1590
1591 for (i = 0; i < arg_n_credentials; i++)
1592 if (streq(arg_credentials[i].id, word))
1593 return log_error_errno(SYNTHETIC_ERRNO(EEXIST), "Duplicate credential '%s', refusing.", word);
1594
1595 if (path_is_absolute(p))
1596 flags |= READ_FULL_FILE_CONNECT_SOCKET;
1597 else {
1598 const char *e;
1599
1600 e = getenv("CREDENTIALS_DIRECTORY");
1601 if (!e)
1602 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Credential not available (no credentials passed at all): %s", word);
1603
1604 j = path_join(e, p);
1605 if (!j)
1606 return log_oom();
1607 }
1608
986311c2
LP
1609 r = read_full_file_full(AT_FDCWD, j ?: p, UINT64_MAX, SIZE_MAX,
1610 flags,
1611 NULL,
1612 &data, &size);
3652872a
LP
1613 if (r < 0)
1614 return log_error_errno(r, "Failed to read credential '%s': %m", j ?: p);
1615
1616 a = reallocarray(arg_credentials, arg_n_credentials + 1, sizeof(Credential));
1617 if (!a)
1618 return log_oom();
1619
1620 a[arg_n_credentials++] = (Credential) {
1621 .id = TAKE_PTR(word),
1622 .data = TAKE_PTR(data),
1623 .size = size,
1624 };
1625
1626 arg_credentials = a;
1627
1628 arg_settings_mask |= SETTING_CREDENTIALS;
1629 break;
1630 }
1631
88213476
LP
1632 case '?':
1633 return -EINVAL;
1634
1635 default:
eb9da376 1636 assert_not_reached("Unhandled option");
88213476 1637 }
88213476 1638
60f1ec13
LP
1639 if (argc > optind) {
1640 strv_free(arg_parameters);
1641 arg_parameters = strv_copy(argv + optind);
1642 if (!arg_parameters)
1643 return log_oom();
d7bea6b6 1644
60f1ec13
LP
1645 arg_settings_mask |= SETTING_START_MODE;
1646 }
1647
1648 if (arg_ephemeral && arg_template && !arg_directory)
1649 /* User asked for ephemeral execution but specified --template= instead of --directory=. Semantically
1650 * such an invocation makes some sense, see https://github.com/systemd/systemd/issues/3667. Let's
1651 * accept this here, and silently make "--ephemeral --template=" equivalent to "--ephemeral
1652 * --directory=". */
1653 arg_directory = TAKE_PTR(arg_template);
1654
bd4b15f2 1655 arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? UINT64_C(1) << CAP_NET_ADMIN : 0)) & ~minus;
60f1ec13 1656
de40a303 1657 /* Make sure to parse environment before we reset the settings mask below */
49048684
ZJS
1658 r = parse_environment();
1659 if (r < 0)
1660 return r;
de40a303 1661
60f1ec13
LP
1662 /* Load all settings from .nspawn files */
1663 if (mask_no_settings)
1664 arg_settings_mask = 0;
1665
1666 /* Don't load any settings from .nspawn files */
1667 if (mask_all_settings)
1668 arg_settings_mask = _SETTINGS_MASK_ALL;
1669
1670 return 1;
1671}
1672
1673static int verify_arguments(void) {
1674 int r;
a6b5216c 1675
75b0d8b8
ZJS
1676 if (arg_start_mode == START_PID2 && arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
1677 /* If we are running the stub init in the container, we don't need to look at what the init
1678 * in the container supports, because we are not using it. Let's immediately pick the right
1679 * setting based on the host system configuration.
1680 *
1681 * We only do this, if the user didn't use an environment variable to override the detection.
1682 */
1683
1684 r = cg_all_unified();
1685 if (r < 0)
1686 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
1687 if (r > 0)
1688 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
1689 else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0)
1690 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
1691 else
1692 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
1693 }
1694
4f086aab
SU
1695 if (arg_userns_mode != USER_NAMESPACE_NO)
1696 arg_mount_settings |= MOUNT_USE_USERNS;
1697
1698 if (arg_private_network)
1699 arg_mount_settings |= MOUNT_APPLY_APIVFS_NETNS;
1700
48a8d337
LB
1701 if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
1702 !(arg_clone_ns_flags & CLONE_NEWUTS)) {
eb91eb18 1703 arg_register = false;
baaa35ad 1704 if (arg_start_mode != START_PID1)
60f1ec13 1705 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--boot cannot be used without namespacing.");
0c582db0 1706 }
eb91eb18 1707
0de7acce 1708 if (arg_userns_mode == USER_NAMESPACE_PICK)
0e7ac751
LP
1709 arg_userns_chown = true;
1710
60f1ec13
LP
1711 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
1712 arg_kill_signal = SIGRTMIN+3;
1713
e5a4bb0d
LP
1714 if (arg_volatile_mode != VOLATILE_NO) /* Make sure all file systems contained in the image are mounted read-only if we are in volatile mode */
1715 arg_read_only = true;
1716
2436ea76
DDM
1717 if (has_custom_root_mount(arg_custom_mounts, arg_n_custom_mounts))
1718 arg_read_only = true;
1719
baaa35ad 1720 if (arg_keep_unit && arg_register && cg_pid_get_owner_uid(0, NULL) >= 0)
8d9c2bca
AJ
1721 /* Save the user from accidentally registering either user-$SESSION.scope or user@.service.
1722 * The latter is not technically a user session, but we don't need to labour the point. */
60f1ec13 1723 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--keep-unit --register=yes may not be used when invoked from a user session.");
89f7c846 1724
baaa35ad 1725 if (arg_directory && arg_image)
60f1ec13 1726 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--directory= and --image= may not be combined.");
1b9e5b12 1727
baaa35ad 1728 if (arg_template && arg_image)
60f1ec13 1729 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= and --image= may not be combined.");
8cd328d8 1730
baaa35ad 1731 if (arg_template && !(arg_directory || arg_machine))
60f1ec13 1732 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= needs --directory= or --machine=.");
ec16945e 1733
baaa35ad 1734 if (arg_ephemeral && arg_template)
60f1ec13 1735 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --template= may not be combined.");
ec16945e 1736
baaa35ad 1737 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO))
60f1ec13 1738 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --link-journal= may not be combined.");
df9a75e4 1739
baaa35ad 1740 if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported())
60f1ec13 1741 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--private-users= is not supported, kernel compiled without user namespace support.");
7336138e 1742
baaa35ad 1743 if (arg_userns_chown && arg_read_only)
de40a303
LP
1744 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1745 "--read-only and --private-users-chown may not be combined.");
f757855e 1746
e5a4bb0d
LP
1747 /* We don't support --private-users-chown together with any of the volatile modes since we couldn't
1748 * change the read-only part of the tree (i.e. /usr) anyway, or because it would trigger a massive
5238e957 1749 * copy-up (in case of overlay) making the entire exercise pointless. */
e5a4bb0d
LP
1750 if (arg_userns_chown && arg_volatile_mode != VOLATILE_NO)
1751 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--volatile= and --private-users-chown may not be combined.");
1752
679ecd36
SZ
1753 /* If --network-namespace-path is given with any other network-related option (except --private-network),
1754 * we need to error out, to avoid conflicts between different network options. */
60f1ec13
LP
1755 if (arg_network_namespace_path &&
1756 (arg_network_interfaces || arg_network_macvlan ||
1757 arg_network_ipvlan || arg_network_veth_extra ||
1758 arg_network_bridge || arg_network_zone ||
679ecd36 1759 arg_network_veth))
de40a303 1760 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--network-namespace-path= cannot be combined with other network options.");
86c0dd4a 1761
60f1ec13 1762 if (arg_network_bridge && arg_network_zone)
de40a303
LP
1763 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1764 "--network-bridge= and --network-zone= may not be combined.");
f757855e 1765
baaa35ad 1766 if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network)
60f1ec13 1767 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
4f086aab 1768
baaa35ad 1769 if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO))
60f1ec13 1770 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot combine --private-users with read-write mounts.");
f757855e 1771
baaa35ad 1772 if (arg_expose_ports && !arg_private_network)
60f1ec13 1773 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot use --port= without private networking.");
6d0b55c2 1774
88fc9c9b
TH
1775 if (arg_caps_ambient) {
1776 if (arg_caps_ambient == (uint64_t)-1)
1777 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= does not support the value all.");
1778
1779 if ((arg_caps_ambient & arg_caps_retain) != arg_caps_ambient)
1780 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= setting is not fully covered by Capability= setting.");
1781
1782 if (arg_start_mode == START_BOOT)
1783 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= setting is not useful for boot mode.");
1784 }
1785
60f1ec13
LP
1786 r = custom_mount_check_all();
1787 if (r < 0)
1788 return r;
c6c8f6e2 1789
f757855e 1790 return 0;
88213476
LP
1791}
1792
03cfe0d5
LP
1793static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1794 assert(p);
1795
0de7acce 1796 if (arg_userns_mode == USER_NAMESPACE_NO)
03cfe0d5
LP
1797 return 0;
1798
1799 if (uid == UID_INVALID && gid == GID_INVALID)
1800 return 0;
1801
1802 if (uid != UID_INVALID) {
1803 uid += arg_uid_shift;
1804
1805 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1806 return -EOVERFLOW;
1807 }
1808
1809 if (gid != GID_INVALID) {
1810 gid += (gid_t) arg_uid_shift;
1811
1812 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1813 return -EOVERFLOW;
1814 }
1815
1816 if (lchown(p, uid, gid) < 0)
1817 return -errno;
b12afc8c
LP
1818
1819 return 0;
1820}
1821
03cfe0d5
LP
/* Creates the directory 'path' below 'root' with the given access mode and hands it to the
 * (shifted) UID/GID via userns_lchown(). A directory that already exists is left untouched,
 * including its ownership, and counts as success. Returns 0 or a negative errno. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;
        int r;

        full = prefix_roota(root, path);

        r = mkdir_errno_wrapper(full, mode);
        if (r >= 0)
                return userns_lchown(full, uid, gid);

        /* Already there? Then there's nothing to do, and we don't touch its ownership either. */
        return r == -EEXIST ? 0 : r;
}
1835
/* Matches 'path' against the two zoneinfo prefixes we know about (relative and absolute) and
 * returns whatever PATH_STARTSWITH_SET() yields for it. Callers treat a non-NULL result as the
 * timezone name and NULL as "does not point into the zoneinfo directory". */
static const char *timezone_from_path(const char *path) {
        const char *zone;

        zone = PATH_STARTSWITH_SET(
                        path,
                        "../usr/share/zoneinfo/",
                        "/usr/share/zoneinfo/");

        return zone;
}
1842
83205269
LP
1843static bool etc_writable(void) {
1844 return !arg_read_only || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_OVERLAY);
1845}
1846
e58a1277 1847static int setup_timezone(const char *dest) {
1688841f
LP
1848 _cleanup_free_ char *p = NULL, *etc = NULL;
1849 const char *where, *check;
1850 TimezoneMode m;
d4036145 1851 int r;
f8440af5 1852
e58a1277
LP
1853 assert(dest);
1854
1688841f 1855 if (IN_SET(arg_timezone, TIMEZONE_AUTO, TIMEZONE_SYMLINK)) {
1688841f
LP
1856 r = readlink_malloc("/etc/localtime", &p);
1857 if (r == -ENOENT && arg_timezone == TIMEZONE_AUTO)
83205269 1858 m = etc_writable() ? TIMEZONE_DELETE : TIMEZONE_OFF;
1688841f 1859 else if (r == -EINVAL && arg_timezone == TIMEZONE_AUTO) /* regular file? */
83205269 1860 m = etc_writable() ? TIMEZONE_COPY : TIMEZONE_BIND;
1688841f
LP
1861 else if (r < 0) {
1862 log_warning_errno(r, "Failed to read host's /etc/localtime symlink, not updating container timezone: %m");
1863 /* To handle warning, delete /etc/localtime and replace it with a symbolic link to a time zone data
1864 * file.
1865 *
1866 * Example:
1867 * ln -s /usr/share/zoneinfo/UTC /etc/localtime
1868 */
1869 return 0;
1870 } else if (arg_timezone == TIMEZONE_AUTO)
83205269 1871 m = etc_writable() ? TIMEZONE_SYMLINK : TIMEZONE_BIND;
1688841f
LP
1872 else
1873 m = arg_timezone;
1874 } else
1875 m = arg_timezone;
1876
1877 if (m == TIMEZONE_OFF)
1878 return 0;
1879
a5648b80 1880 r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
d4036145 1881 if (r < 0) {
1688841f 1882 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
d4036145
LP
1883 return 0;
1884 }
1885
1688841f
LP
1886 where = strjoina(etc, "/localtime");
1887
1888 switch (m) {
1889
1890 case TIMEZONE_DELETE:
1891 if (unlink(where) < 0)
1892 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
1893
d4036145 1894 return 0;
d4036145 1895
1688841f
LP
1896 case TIMEZONE_SYMLINK: {
1897 _cleanup_free_ char *q = NULL;
1898 const char *z, *what;
4d1c38b8 1899
1688841f
LP
1900 z = timezone_from_path(p);
1901 if (!z) {
1902 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
d4036145 1903 return 0;
1688841f 1904 }
d4036145 1905
1688841f
LP
1906 r = readlink_malloc(where, &q);
1907 if (r >= 0 && streq_ptr(timezone_from_path(q), z))
1908 return 0; /* Already pointing to the right place? Then do nothing .. */
1909
1910 check = strjoina(dest, "/usr/share/zoneinfo/", z);
a5648b80 1911 r = chase_symlinks(check, dest, 0, NULL, NULL);
1688841f
LP
1912 if (r < 0)
1913 log_debug_errno(r, "Timezone %s does not exist (or is not accessible) in container, not creating symlink: %m", z);
1914 else {
1915 if (unlink(where) < 0 && errno != ENOENT) {
1916 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING, /* Don't complain on read-only images */
1917 errno, "Failed to remove existing timezone info %s in container, ignoring: %m", where);
1918 return 0;
1919 }
1920
1921 what = strjoina("../usr/share/zoneinfo/", z);
1922 if (symlink(what, where) < 0) {
1923 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING,
1924 errno, "Failed to correct timezone of container, ignoring: %m");
1925 return 0;
1926 }
1927
1928 break;
1929 }
1930
1931 _fallthrough_;
d4036145 1932 }
68fb0892 1933
1688841f
LP
1934 case TIMEZONE_BIND: {
1935 _cleanup_free_ char *resolved = NULL;
1936 int found;
1937
a5648b80 1938 found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved, NULL);
1688841f
LP
1939 if (found < 0) {
1940 log_warning_errno(found, "Failed to resolve /etc/localtime path in container, ignoring: %m");
1941 return 0;
1942 }
1943
1944 if (found == 0) /* missing? */
1945 (void) touch(resolved);
1946
511a8cfe 1947 r = mount_nofollow_verbose(LOG_WARNING, "/etc/localtime", resolved, NULL, MS_BIND, NULL);
1688841f 1948 if (r >= 0)
511a8cfe 1949 return mount_nofollow_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
1688841f
LP
1950
1951 _fallthrough_;
79d80fc1 1952 }
4d9f07b4 1953
1688841f
LP
1954 case TIMEZONE_COPY:
1955 /* If mounting failed, try to copy */
8a016c74 1956 r = copy_file_atomic("/etc/localtime", where, 0644, 0, 0, COPY_REFLINK|COPY_REPLACE);
1688841f
LP
1957 if (r < 0) {
1958 log_full_errno(IN_SET(r, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
1959 "Failed to copy /etc/localtime to %s, ignoring: %m", where);
1960 return 0;
1961 }
1962
1963 break;
1964
1965 default:
1966 assert_not_reached("unexpected mode");
d4036145 1967 }
e58a1277 1968
1688841f 1969 /* Fix permissions of the symlink or file copy we just created */
03cfe0d5
LP
1970 r = userns_lchown(where, 0, 0);
1971 if (r < 0)
1688841f 1972 log_warning_errno(r, "Failed to chown /etc/localtime, ignoring: %m");
03cfe0d5 1973
e58a1277 1974 return 0;
88213476
LP
1975}
1976
09d423e9
LP
/* Checks whether 'path' exists. Returns 1 if it does, 0 if it is missing (ENOENT), and a negative
 * errno (logged at debug level) if the check itself failed. */
static int have_resolv_conf(const char *path) {
        assert(path);

        if (access(path, F_OK) >= 0)
                return 1;

        if (errno == ENOENT)
                return 0;

        return log_debug_errno(errno, "Failed to determine whether '%s' is available: %m", path);
}
1989
7357272e 1990static int resolved_listening(void) {
b8ea7a6e 1991 _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
b053cd5f 1992 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
7357272e 1993 _cleanup_free_ char *dns_stub_listener_mode = NULL;
b053cd5f
LP
1994 int r;
1995
7357272e 1996 /* Check if resolved is listening */
b053cd5f
LP
1997
1998 r = sd_bus_open_system(&bus);
1999 if (r < 0)
b8ea7a6e 2000 return log_debug_errno(r, "Failed to open system bus: %m");
b053cd5f 2001
7357272e 2002 r = bus_name_has_owner(bus, "org.freedesktop.resolve1", NULL);
b8ea7a6e
LP
2003 if (r < 0)
2004 return log_debug_errno(r, "Failed to check whether the 'org.freedesktop.resolve1' bus name is taken: %m");
2005 if (r == 0)
2006 return 0;
7357272e
DM
2007
2008 r = sd_bus_get_property_string(bus,
2009 "org.freedesktop.resolve1",
2010 "/org/freedesktop/resolve1",
2011 "org.freedesktop.resolve1.Manager",
2012 "DNSStubListener",
b8ea7a6e 2013 &error,
7357272e
DM
2014 &dns_stub_listener_mode);
2015 if (r < 0)
b8ea7a6e 2016 return log_debug_errno(r, "Failed to query DNSStubListener property: %s", bus_error_message(&error, r));
7357272e
DM
2017
2018 return STR_IN_SET(dns_stub_listener_mode, "udp", "yes");
b053cd5f
LP
2019}
2020
2547bb41 2021static int setup_resolv_conf(const char *dest) {
09d423e9
LP
2022 _cleanup_free_ char *etc = NULL;
2023 const char *where, *what;
2024 ResolvConfMode m;
2025 int r;
2547bb41
LP
2026
2027 assert(dest);
2028
09d423e9
LP
2029 if (arg_resolv_conf == RESOLV_CONF_AUTO) {
2030 if (arg_private_network)
2031 m = RESOLV_CONF_OFF;
86775e35
LP
2032 else if (have_resolv_conf(PRIVATE_STUB_RESOLV_CONF) > 0 && resolved_listening() > 0)
2033 m = etc_writable() ? RESOLV_CONF_COPY_STUB : RESOLV_CONF_BIND_STUB;
09d423e9 2034 else if (have_resolv_conf("/etc/resolv.conf") > 0)
83205269 2035 m = etc_writable() ? RESOLV_CONF_COPY_HOST : RESOLV_CONF_BIND_HOST;
09d423e9 2036 else
83205269 2037 m = etc_writable() ? RESOLV_CONF_DELETE : RESOLV_CONF_OFF;
86775e35 2038
09d423e9
LP
2039 } else
2040 m = arg_resolv_conf;
2041
2042 if (m == RESOLV_CONF_OFF)
2547bb41
LP
2043 return 0;
2044
a5648b80 2045 r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
87447ae4
LP
2046 if (r < 0) {
2047 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
2048 return 0;
2049 }
2050
2051 where = strjoina(etc, "/resolv.conf");
09d423e9
LP
2052
2053 if (m == RESOLV_CONF_DELETE) {
2054 if (unlink(where) < 0)
2055 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
2056
87447ae4
LP
2057 return 0;
2058 }
79d80fc1 2059
86775e35
LP
2060 if (IN_SET(m, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_REPLACE_STATIC, RESOLV_CONF_COPY_STATIC))
2061 what = PRIVATE_STATIC_RESOLV_CONF;
2062 else if (IN_SET(m, RESOLV_CONF_BIND_UPLINK, RESOLV_CONF_REPLACE_UPLINK, RESOLV_CONF_COPY_UPLINK))
2063 what = PRIVATE_UPLINK_RESOLV_CONF;
2064 else if (IN_SET(m, RESOLV_CONF_BIND_STUB, RESOLV_CONF_REPLACE_STUB, RESOLV_CONF_COPY_STUB))
2065 what = PRIVATE_STUB_RESOLV_CONF;
09d423e9
LP
2066 else
2067 what = "/etc/resolv.conf";
87447ae4 2068
86775e35 2069 if (IN_SET(m, RESOLV_CONF_BIND_HOST, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_BIND_UPLINK, RESOLV_CONF_BIND_STUB)) {
09d423e9
LP
2070 _cleanup_free_ char *resolved = NULL;
2071 int found;
2072
a5648b80 2073 found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved, NULL);
09d423e9
LP
2074 if (found < 0) {
2075 log_warning_errno(found, "Failed to resolve /etc/resolv.conf path in container, ignoring: %m");
2076 return 0;
2077 }
3539724c 2078
87447ae4
LP
2079 if (found == 0) /* missing? */
2080 (void) touch(resolved);
5367354d 2081
511a8cfe 2082 r = mount_nofollow_verbose(LOG_WARNING, what, resolved, NULL, MS_BIND, NULL);
60e76d48 2083 if (r >= 0)
511a8cfe 2084 return mount_nofollow_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
86775e35
LP
2085
2086 /* If that didn't work, let's copy the file */
3539724c
LP
2087 }
2088
86775e35
LP
2089 if (IN_SET(m, RESOLV_CONF_REPLACE_HOST, RESOLV_CONF_REPLACE_STATIC, RESOLV_CONF_REPLACE_UPLINK, RESOLV_CONF_REPLACE_STUB))
2090 r = copy_file_atomic(what, where, 0644, 0, 0, COPY_REFLINK|COPY_REPLACE);
2091 else
2092 r = copy_file(what, where, O_TRUNC|O_NOFOLLOW, 0644, 0, 0, COPY_REFLINK);
79d80fc1 2093 if (r < 0) {
3539724c
LP
2094 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
2095 * resolved or something similar runs inside and the symlink points there.
68a313c5 2096 *
3539724c 2097 * If the disk image is read-only, there's also no point in complaining.
68a313c5 2098 */
86775e35
LP
2099 log_full_errno(!IN_SET(RESOLV_CONF_COPY_HOST, RESOLV_CONF_COPY_STATIC, RESOLV_CONF_COPY_UPLINK, RESOLV_CONF_COPY_STUB) &&
2100 IN_SET(r, -ELOOP, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
3539724c 2101 "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where);
79d80fc1
TG
2102 return 0;
2103 }
2547bb41 2104
03cfe0d5
LP
2105 r = userns_lchown(where, 0, 0);
2106 if (r < 0)
3539724c 2107 log_warning_errno(r, "Failed to chown /etc/resolv.conf, ignoring: %m");
03cfe0d5 2108
2547bb41
LP
2109 return 0;
2110}
2111
1e4f1671 2112static int setup_boot_id(void) {
cdde6ba6
LP
2113 _cleanup_(unlink_and_freep) char *from = NULL;
2114 _cleanup_free_ char *path = NULL;
3bbaff3e 2115 sd_id128_t rnd = SD_ID128_NULL;
cdde6ba6 2116 const char *to;
04bc4a3f
LP
2117 int r;
2118
1eacc470 2119 /* Generate a new randomized boot ID, so that each boot-up of the container gets a new one */
04bc4a3f 2120
1eacc470 2121 r = tempfn_random_child("/run", "proc-sys-kernel-random-boot-id", &path);
cdde6ba6
LP
2122 if (r < 0)
2123 return log_error_errno(r, "Failed to generate random boot ID path: %m");
04bc4a3f
LP
2124
2125 r = sd_id128_randomize(&rnd);
f647962d
MS
2126 if (r < 0)
2127 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 2128
cdde6ba6 2129 r = id128_write(path, ID128_UUID, rnd, false);
f647962d
MS
2130 if (r < 0)
2131 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f 2132
cdde6ba6
LP
2133 from = TAKE_PTR(path);
2134 to = "/proc/sys/kernel/random/boot_id";
2135
511a8cfe 2136 r = mount_nofollow_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
cdde6ba6
LP
2137 if (r < 0)
2138 return r;
04bc4a3f 2139
511a8cfe 2140 return mount_nofollow_verbose(LOG_ERR, NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
04bc4a3f
LP
2141}
2142
e58a1277 2143static int copy_devnodes(const char *dest) {
88213476
LP
2144 static const char devnodes[] =
2145 "null\0"
2146 "zero\0"
2147 "full\0"
2148 "random\0"
2149 "urandom\0"
85614d66
TG
2150 "tty\0"
2151 "net/tun\0";
88213476 2152
de40a303 2153 _cleanup_umask_ mode_t u;
88213476 2154 const char *d;
e58a1277 2155 int r = 0;
a258bf26
LP
2156
2157 assert(dest);
124640f1
LP
2158
2159 u = umask(0000);
88213476 2160
03cfe0d5
LP
2161 /* Create /dev/net, so that we can create /dev/net/tun in it */
2162 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
2163 return log_error_errno(r, "Failed to create /dev/net directory: %m");
2164
88213476 2165 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 2166 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 2167 struct stat st;
88213476 2168
c6134d3e 2169 from = path_join("/dev/", d);
8967f291
LP
2170 if (!from)
2171 return log_oom();
2172
c6134d3e 2173 to = path_join(dest, from);
8967f291
LP
2174 if (!to)
2175 return log_oom();
88213476
LP
2176
2177 if (stat(from, &st) < 0) {
2178
4a62c710
MS
2179 if (errno != ENOENT)
2180 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 2181
baaa35ad
ZJS
2182 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode))
2183 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2184 "%s is not a char or block device, cannot copy.", from);
2185 else {
8dfce114
LP
2186 _cleanup_free_ char *sl = NULL, *prefixed = NULL, *dn = NULL, *t = NULL;
2187
81f5049b 2188 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
8dbf71ec 2189 /* Explicitly warn the user when /dev is already populated. */
41eb4362 2190 if (errno == EEXIST)
8dbf71ec 2191 log_notice("%s/dev is pre-mounted and pre-populated. If a pre-mounted /dev is provided it needs to be an unpopulated file system.", dest);
81f5049b
AC
2192 if (errno != EPERM)
2193 return log_error_errno(errno, "mknod(%s) failed: %m", to);
2194
8dfce114 2195 /* Some systems abusively restrict mknod but allow bind mounts. */
81f5049b
AC
2196 r = touch(to);
2197 if (r < 0)
2198 return log_error_errno(r, "touch (%s) failed: %m", to);
511a8cfe 2199 r = mount_nofollow_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL);
60e76d48
ZJS
2200 if (r < 0)
2201 return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to);
81f5049b 2202 }
6278cf60 2203
03cfe0d5
LP
2204 r = userns_lchown(to, 0, 0);
2205 if (r < 0)
2206 return log_error_errno(r, "chown() of device node %s failed: %m", to);
8dfce114 2207
657ee2d8 2208 dn = path_join("/dev", S_ISCHR(st.st_mode) ? "char" : "block");
8dfce114
LP
2209 if (!dn)
2210 return log_oom();
2211
2212 r = userns_mkdir(dest, dn, 0755, 0, 0);
2213 if (r < 0)
2214 return log_error_errno(r, "Failed to create '%s': %m", dn);
2215
2216 if (asprintf(&sl, "%s/%u:%u", dn, major(st.st_rdev), minor(st.st_rdev)) < 0)
2217 return log_oom();
2218
c6134d3e 2219 prefixed = path_join(dest, sl);
8dfce114
LP
2220 if (!prefixed)
2221 return log_oom();
2222
2d9b74ba 2223 t = path_join("..", d);
8dfce114
LP
2224 if (!t)
2225 return log_oom();
2226
2227 if (symlink(t, prefixed) < 0)
2228 log_debug_errno(errno, "Failed to symlink '%s' to '%s': %m", t, prefixed);
88213476 2229 }
88213476
LP
2230 }
2231
e58a1277
LP
2232 return r;
2233}
88213476 2234
de40a303
LP
2235static int make_extra_nodes(const char *dest) {
2236 _cleanup_umask_ mode_t u;
2237 size_t i;
2238 int r;
2239
2240 u = umask(0000);
2241
2242 for (i = 0; i < arg_n_extra_nodes; i++) {
2243 _cleanup_free_ char *path = NULL;
2244 DeviceNode *n = arg_extra_nodes + i;
2245
c6134d3e 2246 path = path_join(dest, n->path);
de40a303
LP
2247 if (!path)
2248 return log_oom();
2249
2250 if (mknod(path, n->mode, S_ISCHR(n->mode) || S_ISBLK(n->mode) ? makedev(n->major, n->minor) : 0) < 0)
2251 return log_error_errno(errno, "Failed to create device node '%s': %m", path);
2252
2253 r = chmod_and_chown(path, n->mode, n->uid, n->gid);
2254 if (r < 0)
2255 return log_error_errno(r, "Failed to adjust device node ownership of '%s': %m", path);
2256 }
2257
2258 return 0;
2259}
2260
03cfe0d5
LP
2261static int setup_pts(const char *dest) {
2262 _cleanup_free_ char *options = NULL;
2263 const char *p;
709f6e46 2264 int r;
03cfe0d5 2265
349cc4a5 2266#if HAVE_SELINUX
03cfe0d5
LP
2267 if (arg_selinux_apifs_context)
2268 (void) asprintf(&options,
3dce8915 2269 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
03cfe0d5
LP
2270 arg_uid_shift + TTY_GID,
2271 arg_selinux_apifs_context);
2272 else
2273#endif
2274 (void) asprintf(&options,
3dce8915 2275 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
03cfe0d5 2276 arg_uid_shift + TTY_GID);
f2d88580 2277
03cfe0d5 2278 if (!options)
f2d88580
LP
2279 return log_oom();
2280
03cfe0d5 2281 /* Mount /dev/pts itself */
cc9fce65 2282 p = prefix_roota(dest, "/dev/pts");
dae8b82e
ZJS
2283 r = mkdir_errno_wrapper(p, 0755);
2284 if (r < 0)
2285 return log_error_errno(r, "Failed to create /dev/pts: %m");
2286
511a8cfe 2287 r = mount_nofollow_verbose(LOG_ERR, "devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options);
60e76d48
ZJS
2288 if (r < 0)
2289 return r;
709f6e46
MS
2290 r = userns_lchown(p, 0, 0);
2291 if (r < 0)
2292 return log_error_errno(r, "Failed to chown /dev/pts: %m");
03cfe0d5
LP
2293
2294 /* Create /dev/ptmx symlink */
2295 p = prefix_roota(dest, "/dev/ptmx");
4a62c710
MS
2296 if (symlink("pts/ptmx", p) < 0)
2297 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
709f6e46
MS
2298 r = userns_lchown(p, 0, 0);
2299 if (r < 0)
2300 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
f2d88580 2301
03cfe0d5
LP
2302 /* And fix /dev/pts/ptmx ownership */
2303 p = prefix_roota(dest, "/dev/pts/ptmx");
709f6e46
MS
2304 r = userns_lchown(p, 0, 0);
2305 if (r < 0)
2306 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
6278cf60 2307
f2d88580
LP
2308 return 0;
2309}
2310
3acc84eb 2311static int setup_stdio_as_dev_console(void) {
2fef50cd 2312 _cleanup_close_ int terminal = -1;
e58a1277 2313 int r;
e58a1277 2314
335d2ead
LP
2315 /* We open the TTY in O_NOCTTY mode, so that we do not become controller yet. We'll do that later
2316 * explicitly, if we are configured to. */
2317 terminal = open_terminal("/dev/console", O_RDWR|O_NOCTTY);
3acc84eb
FB
2318 if (terminal < 0)
2319 return log_error_errno(terminal, "Failed to open console: %m");
e58a1277 2320
3acc84eb
FB
2321 /* Make sure we can continue logging to the original stderr, even if
2322 * stderr points elsewhere now */
2323 r = log_dup_console();
2324 if (r < 0)
2325 return log_error_errno(r, "Failed to duplicate stderr: %m");
de40a303 2326
3acc84eb
FB
2327 /* invalidates 'terminal' on success and failure */
2328 r = rearrange_stdio(terminal, terminal, terminal);
2fef50cd 2329 TAKE_FD(terminal);
f647962d 2330 if (r < 0)
3acc84eb
FB
2331 return log_error_errno(r, "Failed to move console to stdin/stdout/stderr: %m");
2332
2333 return 0;
2334}
88213476 2335
3acc84eb
FB
2336static int setup_dev_console(const char *console) {
2337 _cleanup_free_ char *p = NULL;
2338 int r;
a258bf26 2339
3acc84eb
FB
2340 /* Create /dev/console symlink */
2341 r = path_make_relative("/dev", console, &p);
81f5049b 2342 if (r < 0)
3acc84eb
FB
2343 return log_error_errno(r, "Failed to create relative path: %m");
2344
2345 if (symlink(p, "/dev/console") < 0)
2346 return log_error_errno(errno, "Failed to create /dev/console symlink: %m");
a258bf26 2347
3acc84eb 2348 return 0;
e58a1277
LP
2349}
2350
8e5430c4
LP
2351static int setup_keyring(void) {
2352 key_serial_t keyring;
2353
6b000af4
LP
2354 /* Allocate a new session keyring for the container. This makes sure the keyring of the session
2355 * systemd-nspawn was invoked from doesn't leak into the container. Note that by default we block
2356 * keyctl() and request_key() anyway via seccomp so doing this operation isn't strictly necessary,
2357 * but in case people explicitly allow-list these system calls let's make sure we don't leak anything
2358 * into the container. */
8e5430c4
LP
2359
2360 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2361 if (keyring == -1) {
2362 if (errno == ENOSYS)
2363 log_debug_errno(errno, "Kernel keyring not supported, ignoring.");
065b4774 2364 else if (ERRNO_IS_PRIVILEGE(errno))
8e5430c4
LP
2365 log_debug_errno(errno, "Kernel keyring access prohibited, ignoring.");
2366 else
2367 return log_error_errno(errno, "Setting up kernel keyring failed: %m");
2368 }
2369
2370 return 0;
2371}
2372
3652872a
LP
2373static int setup_credentials(const char *root) {
2374 const char *q;
2375 int r;
2376
2377 if (arg_n_credentials <= 0)
2378 return 0;
2379
2380 r = userns_mkdir(root, "/run/host", 0755, 0, 0);
2381 if (r < 0)
2382 return log_error_errno(r, "Failed to create /run/host: %m");
2383
2384 r = userns_mkdir(root, "/run/host/credentials", 0700, 0, 0);
2385 if (r < 0)
2386 return log_error_errno(r, "Failed to create /run/host/credentials: %m");
2387
2388 q = prefix_roota(root, "/run/host/credentials");
511a8cfe 2389 r = mount_nofollow_verbose(LOG_ERR, NULL, q, "ramfs", MS_NOSUID|MS_NOEXEC|MS_NODEV, "mode=0700");
3652872a
LP
2390 if (r < 0)
2391 return r;
2392
2393 for (size_t i = 0; i < arg_n_credentials; i++) {
2394 _cleanup_free_ char *j = NULL;
2395 _cleanup_close_ int fd = -1;
2396
2397 j = path_join(q, arg_credentials[i].id);
2398 if (!j)
2399 return log_oom();
2400
2401 fd = open(j, O_CREAT|O_EXCL|O_WRONLY|O_CLOEXEC|O_NOFOLLOW, 0600);
2402 if (fd < 0)
2403 return log_error_errno(errno, "Failed to create credential file %s: %m", j);
2404
2405 r = loop_write(fd, arg_credentials[i].data, arg_credentials[i].size, /* do_poll= */ false);
2406 if (r < 0)
2407 return log_error_errno(r, "Failed to write credential to file %s: %m", j);
2408
2409 if (fchmod(fd, 0400) < 0)
2410 return log_error_errno(errno, "Failed to adjust access mode of %s: %m", j);
2411
2412 if (arg_userns_mode != USER_NAMESPACE_NO) {
2413 if (fchown(fd, arg_uid_shift, arg_uid_shift) < 0)
2414 return log_error_errno(errno, "Failed to adjust ownership of %s: %m", j);
2415 }
2416 }
2417
2418 if (chmod(q, 0500) < 0)
2419 return log_error_errno(errno, "Failed to adjust access mode of %s: %m", q);
2420
2421 r = userns_lchown(q, 0, 0);
2422 if (r < 0)
2423 return r;
2424
2425 /* Make both mount and superblock read-only now */
511a8cfe 2426 r = mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_REMOUNT|MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
3652872a
LP
2427 if (r < 0)
2428 return r;
2429
511a8cfe 2430 return mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, "mode=0500");
3652872a
LP
2431}
2432
1e4f1671 2433static int setup_kmsg(int kmsg_socket) {
9ec5a93c
LP
2434 _cleanup_(unlink_and_freep) char *from = NULL;
2435 _cleanup_free_ char *fifo = NULL;
2436 _cleanup_close_ int fd = -1;
7fd1b19b 2437 _cleanup_umask_ mode_t u;
9ec5a93c 2438 int r;
e58a1277 2439
e58a1277 2440 assert(kmsg_socket >= 0);
a258bf26 2441
e58a1277 2442 u = umask(0000);
a258bf26 2443
1eacc470 2444 /* We create the kmsg FIFO as as temporary file in /run, but immediately delete it after bind mounting it to
9ec5a93c
LP
2445 * /proc/kmsg. While FIFOs on the reading side behave very similar to /proc/kmsg, their writing side behaves
2446 * differently from /dev/kmsg in that writing blocks when nothing is reading. In order to avoid any problems
2447 * with containers deadlocking due to this we simply make /dev/kmsg unavailable to the container. */
2448
1eacc470 2449 r = tempfn_random_child("/run", "proc-kmsg", &fifo);
9ec5a93c
LP
2450 if (r < 0)
2451 return log_error_errno(r, "Failed to generate kmsg path: %m");
e58a1277 2452
9ec5a93c 2453 if (mkfifo(fifo, 0600) < 0)
03cfe0d5 2454 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
9ec5a93c
LP
2455
2456 from = TAKE_PTR(fifo);
9ec5a93c 2457
511a8cfe 2458 r = mount_nofollow_verbose(LOG_ERR, from, "/proc/kmsg", NULL, MS_BIND, NULL);
60e76d48
ZJS
2459 if (r < 0)
2460 return r;
e58a1277 2461
669fc4e5 2462 fd = open(from, O_RDWR|O_NONBLOCK|O_CLOEXEC);
4a62c710
MS
2463 if (fd < 0)
2464 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 2465
9ec5a93c 2466 /* Store away the fd in the socket, so that it stays open as long as we run the child */
3ee897d6 2467 r = send_one_fd(kmsg_socket, fd, 0);
d9603714
DH
2468 if (r < 0)
2469 return log_error_errno(r, "Failed to send FIFO fd: %m");
a258bf26 2470
25ea79fe 2471 return 0;
88213476
LP
2472}
2473
761cf19d
FW
2474struct ExposeArgs {
2475 union in_addr_union address;
2476 struct FirewallContext *fw_ctx;
2477};
2478
1c4baffc 2479static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
761cf19d 2480 struct ExposeArgs *args = userdata;
6d0b55c2
LP
2481
2482 assert(rtnl);
2483 assert(m);
761cf19d 2484 assert(args);
6d0b55c2 2485
761cf19d 2486 expose_port_execute(rtnl, &args->fw_ctx, arg_expose_ports, &args->address);
6d0b55c2
LP
2487 return 0;
2488}
2489
3a74cea5 2490static int setup_hostname(void) {
c818eef1 2491 int r;
3a74cea5 2492
0c582db0 2493 if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
eb91eb18
LP
2494 return 0;
2495
c818eef1
LP
2496 r = sethostname_idempotent(arg_hostname ?: arg_machine);
2497 if (r < 0)
2498 return log_error_errno(r, "Failed to set hostname: %m");
3a74cea5 2499
7027ff61 2500 return 0;
3a74cea5
LP
2501}
2502
57fb9fb5 2503static int setup_journal(const char *directory) {
0f5e1382 2504 _cleanup_free_ char *d = NULL;
5905d7cf 2505 char id[SD_ID128_STRING_MAX];
b2238e38
LP
2506 const char *dirname, *p, *q;
2507 sd_id128_t this_id;
8054d749 2508 bool try;
57fb9fb5
LP
2509 int r;
2510
df9a75e4
LP
2511 /* Don't link journals in ephemeral mode */
2512 if (arg_ephemeral)
2513 return 0;
2514
8054d749
LP
2515 if (arg_link_journal == LINK_NO)
2516 return 0;
2517
2518 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
2519
4d680aee 2520 r = sd_id128_get_machine(&this_id);
f647962d
MS
2521 if (r < 0)
2522 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee 2523
e01ff70a 2524 if (sd_id128_equal(arg_uuid, this_id)) {
8054d749 2525 log_full(try ? LOG_WARNING : LOG_ERR,
e192a281 2526 "Host and machine ids are equal (%s): refusing to link journals", sd_id128_to_string(arg_uuid, id));
8054d749 2527 if (try)
4d680aee 2528 return 0;
df9a75e4 2529 return -EEXIST;
4d680aee
ZJS
2530 }
2531
369ca6da
ZJS
2532 FOREACH_STRING(dirname, "/var", "/var/log", "/var/log/journal") {
2533 r = userns_mkdir(directory, dirname, 0755, 0, 0);
2534 if (r < 0) {
2535 bool ignore = r == -EROFS && try;
2536 log_full_errno(ignore ? LOG_DEBUG : LOG_ERR, r,
2537 "Failed to create %s%s: %m", dirname, ignore ? ", ignoring" : "");
2538 return ignore ? 0 : r;
2539 }
2540 }
03cfe0d5 2541
e01ff70a
MS
2542 (void) sd_id128_to_string(arg_uuid, id);
2543
03cfe0d5
LP
2544 p = strjoina("/var/log/journal/", id);
2545 q = prefix_roota(directory, p);
27407a01 2546
e1873695 2547 if (path_is_mount_point(p, NULL, 0) > 0) {
8054d749
LP
2548 if (try)
2549 return 0;
27407a01 2550
baaa35ad
ZJS
2551 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2552 "%s: already a mount point, refusing to use for journal", p);
57fb9fb5
LP
2553 }
2554
e1873695 2555 if (path_is_mount_point(q, NULL, 0) > 0) {
8054d749
LP
2556 if (try)
2557 return 0;
57fb9fb5 2558
baaa35ad
ZJS
2559 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2560 "%s: already a mount point, refusing to use for journal", q);
57fb9fb5
LP
2561 }
2562
2563 r = readlink_and_make_absolute(p, &d);
2564 if (r >= 0) {
3742095b 2565 if (IN_SET(arg_link_journal, LINK_GUEST, LINK_AUTO) &&
57fb9fb5
LP
2566 path_equal(d, q)) {
2567
03cfe0d5 2568 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 2569 if (r < 0)
709f6e46 2570 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 2571 return 0;
57fb9fb5
LP
2572 }
2573
4a62c710
MS
2574 if (unlink(p) < 0)
2575 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
2576 } else if (r == -EINVAL) {
2577
2578 if (arg_link_journal == LINK_GUEST &&
2579 rmdir(p) < 0) {
2580
27407a01
ZJS
2581 if (errno == ENOTDIR) {
2582 log_error("%s already exists and is neither a symlink nor a directory", p);
2583 return r;
4314d33f
MS
2584 } else
2585 return log_error_errno(errno, "Failed to remove %s: %m", p);
57fb9fb5 2586 }
4314d33f
MS
2587 } else if (r != -ENOENT)
2588 return log_error_errno(r, "readlink(%s) failed: %m", p);
57fb9fb5
LP
2589
2590 if (arg_link_journal == LINK_GUEST) {
2591
2592 if (symlink(q, p) < 0) {
8054d749 2593 if (try) {
56f64d95 2594 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90 2595 return 0;
4314d33f
MS
2596 } else
2597 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
57fb9fb5
LP
2598 }
2599
03cfe0d5 2600 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 2601 if (r < 0)
709f6e46 2602 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 2603 return 0;
57fb9fb5
LP
2604 }
2605
2606 if (arg_link_journal == LINK_HOST) {
ccddd104 2607 /* don't create parents here — if the host doesn't have
574edc90 2608 * permanent journal set up, don't force it here */
ba8e6c4d 2609
dae8b82e
ZJS
2610 r = mkdir_errno_wrapper(p, 0755);
2611 if (r < 0 && r != -EEXIST) {
8054d749 2612 if (try) {
dae8b82e 2613 log_debug_errno(r, "Failed to create %s, skipping journal setup: %m", p);
574edc90 2614 return 0;
4314d33f 2615 } else
dae8b82e 2616 return log_error_errno(r, "Failed to create %s: %m", p);
57fb9fb5
LP
2617 }
2618
27407a01
ZJS
2619 } else if (access(p, F_OK) < 0)
2620 return 0;
57fb9fb5 2621
cdb2b9d0
LP
2622 if (dir_is_empty(q) == 0)
2623 log_warning("%s is not empty, proceeding anyway.", q);
2624
03cfe0d5 2625 r = userns_mkdir(directory, p, 0755, 0, 0);
709f6e46
MS
2626 if (r < 0)
2627 return log_error_errno(r, "Failed to create %s: %m", q);
57fb9fb5 2628
511a8cfe 2629 r = mount_nofollow_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL);
60e76d48 2630 if (r < 0)
4a62c710 2631 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 2632
27407a01 2633 return 0;
57fb9fb5
LP
2634}
2635
de40a303
LP
2636static int drop_capabilities(uid_t uid) {
2637 CapabilityQuintet q;
2638
2639 /* Let's initialize all five capability sets to something valid. If the quintet was configured via
2640 * OCI use that, but fill in missing bits. If it wasn't then derive the quintet in full from
2641 * arg_caps_retain. */
2642
2643 if (capability_quintet_is_set(&arg_full_capabilities)) {
2644 q = arg_full_capabilities;
2645
2646 if (q.bounding == (uint64_t) -1)
2647 q.bounding = uid == 0 ? arg_caps_retain : 0;
2648
2649 if (q.effective == (uint64_t) -1)
2650 q.effective = uid == 0 ? q.bounding : 0;
2651
2652 if (q.inheritable == (uint64_t) -1)
88fc9c9b 2653 q.inheritable = uid == 0 ? q.bounding : arg_caps_ambient;
de40a303
LP
2654
2655 if (q.permitted == (uint64_t) -1)
88fc9c9b 2656 q.permitted = uid == 0 ? q.bounding : arg_caps_ambient;
de40a303
LP
2657
2658 if (q.ambient == (uint64_t) -1 && ambient_capabilities_supported())
88fc9c9b 2659 q.ambient = arg_caps_ambient;
f66ad460
AZ
2660
2661 if (capability_quintet_mangle(&q))
2662 return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Cannot set capabilities that are not in the current bounding set.");
2663
2664 } else {
de40a303
LP
2665 q = (CapabilityQuintet) {
2666 .bounding = arg_caps_retain,
2667 .effective = uid == 0 ? arg_caps_retain : 0,
88fc9c9b
TH
2668 .inheritable = uid == 0 ? arg_caps_retain : arg_caps_ambient,
2669 .permitted = uid == 0 ? arg_caps_retain : arg_caps_ambient,
2670 .ambient = ambient_capabilities_supported() ? arg_caps_ambient : (uint64_t) -1,
de40a303
LP
2671 };
2672
f66ad460
AZ
2673 /* If we're not using OCI, proceed with mangled capabilities (so we don't error out)
2674 * in order to maintain the same behavior as systemd < 242. */
2675 if (capability_quintet_mangle(&q))
0ccdaa79
JT
2676 log_full(arg_quiet ? LOG_DEBUG : LOG_WARNING,
2677 "Some capabilities will not be set because they are not in the current bounding set.");
f66ad460
AZ
2678
2679 }
2680
de40a303 2681 return capability_quintet_enforce(&q);
88213476
LP
2682}
2683
db999e0f
LP
2684static int reset_audit_loginuid(void) {
2685 _cleanup_free_ char *p = NULL;
2686 int r;
2687
0c582db0 2688 if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
db999e0f
LP
2689 return 0;
2690
2691 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 2692 if (r == -ENOENT)
db999e0f 2693 return 0;
f647962d
MS
2694 if (r < 0)
2695 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
2696
2697 /* Already reset? */
2698 if (streq(p, "4294967295"))
2699 return 0;
2700
57512c89 2701 r = write_string_file("/proc/self/loginuid", "4294967295", WRITE_STRING_FILE_DISABLE_BUFFER);
db999e0f 2702 if (r < 0) {
10a87006
LP
2703 log_error_errno(r,
2704 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2705 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2706 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2707 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2708 "using systemd-nspawn. Sleeping for 5s... (%m)");
77b6e194 2709
db999e0f 2710 sleep(5);
77b6e194 2711 }
db999e0f
LP
2712
2713 return 0;
77b6e194
LP
2714}
2715
785890ac
LP
2716static int setup_propagate(const char *root) {
2717 const char *p, *q;
709f6e46 2718 int r;
785890ac
LP
2719
2720 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2721 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
63c372cb 2722 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
2723 (void) mkdir_p(p, 0600);
2724
5a27b395 2725 r = userns_mkdir(root, "/run/host", 0755, 0, 0);
709f6e46 2726 if (r < 0)
5a27b395 2727 return log_error_errno(r, "Failed to create /run/host: %m");
03cfe0d5 2728
5a27b395 2729 r = userns_mkdir(root, "/run/host/incoming", 0600, 0, 0);
709f6e46 2730 if (r < 0)
5a27b395 2731 return log_error_errno(r, "Failed to create /run/host/incoming: %m");
03cfe0d5 2732
5a27b395 2733 q = prefix_roota(root, "/run/host/incoming");
511a8cfe 2734 r = mount_nofollow_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
60e76d48
ZJS
2735 if (r < 0)
2736 return r;
785890ac 2737
511a8cfe 2738 r = mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
60e76d48
ZJS
2739 if (r < 0)
2740 return r;
785890ac 2741
5a27b395 2742 /* machined will MS_MOVE into that directory, and that's only supported for non-shared mounts. */
511a8cfe 2743 return mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_SLAVE, NULL);
785890ac
LP
2744}
2745
317feb4d 2746static int setup_machine_id(const char *directory) {
691675ba
LP
2747 const char *etc_machine_id;
2748 sd_id128_t id;
3bbaff3e 2749 int r;
e01ff70a 2750
317feb4d
LP
2751 /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
2752 * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
2753 * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
2754 * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
2755 * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
2756 * container behaves nicely). */
2757
e01ff70a
MS
2758 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
2759
c5fbeedb 2760 r = id128_read(etc_machine_id, ID128_PLAIN_OR_UNINIT, &id);
317feb4d
LP
2761 if (r < 0) {
2762 if (!IN_SET(r, -ENOENT, -ENOMEDIUM)) /* If the file is missing or empty, we don't mind */
2763 return log_error_errno(r, "Failed to read machine ID from container image: %m");
691675ba 2764
317feb4d
LP
2765 if (sd_id128_is_null(arg_uuid)) {
2766 r = sd_id128_randomize(&arg_uuid);
2767 if (r < 0)
2768 return log_error_errno(r, "Failed to acquire randomized machine UUID: %m");
2769 }
2770 } else {
baaa35ad
ZJS
2771 if (sd_id128_is_null(id))
2772 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2773 "Machine ID in container image is zero, refusing.");
e01ff70a 2774
317feb4d
LP
2775 arg_uuid = id;
2776 }
691675ba 2777
e01ff70a
MS
2778 return 0;
2779}
2780
7336138e
LP
2781static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
2782 int r;
2783
2784 assert(directory);
2785
0de7acce 2786 if (arg_userns_mode == USER_NAMESPACE_NO || !arg_userns_chown)
7336138e
LP
2787 return 0;
2788
2789 r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
2790 if (r == -EOPNOTSUPP)
2791 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
2792 if (r == -EBADE)
2793 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
2794 if (r < 0)
2795 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
2796 if (r == 0)
2797 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
2798 else
2799 log_debug("Patched directory tree to match UID/GID range.");
2800
2801 return r;
2802}
2803
113cea80 2804/*
6d416b9c
LS
2805 * Return values:
2806 * < 0 : wait_for_terminate() failed to get the state of the
2807 * container, the container was terminated by a signal, or
2808 * failed for an unknown reason. No change is made to the
2809 * container argument.
2810 * > 0 : The program executed in the container terminated with an
2811 * error. The exit code of the program executed in the
919699ec
LP
2812 * container is returned. The container argument has been set
2813 * to CONTAINER_TERMINATED.
6d416b9c
LS
2814 * 0 : The container is being rebooted, has been shut down or exited
2815 * successfully. The container argument has been set to either
2816 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 2817 *
6d416b9c
LS
2818 * That is, success is indicated by a return value of zero, and an
2819 * error is indicated by a non-zero value.
113cea80
DH
2820 */
2821static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 2822 siginfo_t status;
919699ec 2823 int r;
113cea80
DH
2824
2825 r = wait_for_terminate(pid, &status);
f647962d
MS
2826 if (r < 0)
2827 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
2828
2829 switch (status.si_code) {
fddbb89c 2830
113cea80 2831 case CLD_EXITED:
b5a2179b 2832 if (status.si_status == 0)
919699ec 2833 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
b5a2179b 2834 else
919699ec 2835 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 2836
919699ec
LP
2837 *container = CONTAINER_TERMINATED;
2838 return status.si_status;
113cea80
DH
2839
2840 case CLD_KILLED:
2841 if (status.si_status == SIGINT) {
919699ec 2842 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 2843 *container = CONTAINER_TERMINATED;
919699ec
LP
2844 return 0;
2845
113cea80 2846 } else if (status.si_status == SIGHUP) {
919699ec 2847 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 2848 *container = CONTAINER_REBOOTED;
919699ec 2849 return 0;
113cea80 2850 }
919699ec 2851
4831981d 2852 _fallthrough_;
113cea80 2853 case CLD_DUMPED:
baaa35ad
ZJS
2854 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2855 "Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
113cea80
DH
2856
2857 default:
baaa35ad
ZJS
2858 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2859 "Container %s failed due to unknown reason.", arg_machine);
113cea80 2860 }
113cea80
DH
2861}
2862
023fb90b
LP
2863static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2864 pid_t pid;
2865
4a0b58c4 2866 pid = PTR_TO_PID(userdata);
023fb90b 2867 if (pid > 0) {
c6c8f6e2 2868 if (kill(pid, arg_kill_signal) >= 0) {
023fb90b
LP
2869 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2870 sd_event_source_set_userdata(s, NULL);
2871 return 0;
2872 }
2873 }
2874
2875 sd_event_exit(sd_event_source_get_event(s), 0);
2876 return 0;
2877}
2878
6916b164 2879static int on_sigchld(sd_event_source *s, const struct signalfd_siginfo *ssi, void *userdata) {
abdb9b08
LP
2880 pid_t pid;
2881
2882 assert(s);
2883 assert(ssi);
2884
2885 pid = PTR_TO_PID(userdata);
2886
6916b164
AU
2887 for (;;) {
2888 siginfo_t si = {};
abdb9b08 2889
6916b164
AU
2890 if (waitid(P_ALL, 0, &si, WNOHANG|WNOWAIT|WEXITED) < 0)
2891 return log_error_errno(errno, "Failed to waitid(): %m");
2892 if (si.si_pid == 0) /* No pending children. */
2893 break;
abdb9b08 2894 if (si.si_pid == pid) {
6916b164
AU
2895 /* The main process we care for has exited. Return from
2896 * signal handler but leave the zombie. */
2897 sd_event_exit(sd_event_source_get_event(s), 0);
2898 break;
2899 }
abdb9b08 2900
6916b164
AU
2901 /* Reap all other children. */
2902 (void) waitid(P_PID, si.si_pid, &si, WNOHANG|WEXITED);
2903 }
2904
2905 return 0;
2906}
2907
abdb9b08
LP
2908static int on_request_stop(sd_bus_message *m, void *userdata, sd_bus_error *error) {
2909 pid_t pid;
2910
2911 assert(m);
2912
2913 pid = PTR_TO_PID(userdata);
2914
2915 if (arg_kill_signal > 0) {
2916 log_info("Container termination requested. Attempting to halt container.");
2917 (void) kill(pid, arg_kill_signal);
2918 } else {
2919 log_info("Container termination requested. Exiting.");
2920 sd_event_exit(sd_bus_get_event(sd_bus_message_get_bus(m)), 0);
2921 }
2922
2923 return 0;
2924}
2925
ec16945e 2926static int determine_names(void) {
1b9cebf6 2927 int r;
ec16945e 2928
c1521918
LP
2929 if (arg_template && !arg_directory && arg_machine) {
2930
2931 /* If --template= was specified then we should not
2932 * search for a machine, but instead create a new one
2933 * in /var/lib/machine. */
2934
657ee2d8 2935 arg_directory = path_join("/var/lib/machines", arg_machine);
c1521918
LP
2936 if (!arg_directory)
2937 return log_oom();
2938 }
2939
ec16945e 2940 if (!arg_image && !arg_directory) {
1b9cebf6
LP
2941 if (arg_machine) {
2942 _cleanup_(image_unrefp) Image *i = NULL;
2943
d577d4a4 2944 r = image_find(IMAGE_MACHINE, arg_machine, NULL, &i);
3a6ce860
LP
2945 if (r == -ENOENT)
2946 return log_error_errno(r, "No image for machine '%s'.", arg_machine);
1b9cebf6
LP
2947 if (r < 0)
2948 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
1b9cebf6 2949
eb38edce 2950 if (IN_SET(i->type, IMAGE_RAW, IMAGE_BLOCK))
0f03c2a4 2951 r = free_and_strdup(&arg_image, i->path);
1b9cebf6 2952 else
0f03c2a4 2953 r = free_and_strdup(&arg_directory, i->path);
1b9cebf6 2954 if (r < 0)
0f3be6ca 2955 return log_oom();
1b9cebf6 2956
aee327b8
LP
2957 if (!arg_ephemeral)
2958 arg_read_only = arg_read_only || i->read_only;
d7249575
LP
2959 } else {
2960 r = safe_getcwd(&arg_directory);
2961 if (r < 0)
2962 return log_error_errno(r, "Failed to determine current directory: %m");
2963 }
ec16945e 2964
c6147113
LP
2965 if (!arg_directory && !arg_image)
2966 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine path, please use -D or -i.");
ec16945e
LP
2967 }
2968
2969 if (!arg_machine) {
b9ba4dab
LP
2970 if (arg_directory && path_equal(arg_directory, "/"))
2971 arg_machine = gethostname_malloc();
4827ab48
LP
2972 else {
2973 if (arg_image) {
2974 char *e;
2975
2976 arg_machine = strdup(basename(arg_image));
2977
2978 /* Truncate suffix if there is one */
2979 e = endswith(arg_machine, ".raw");
2980 if (e)
2981 *e = 0;
2982 } else
2983 arg_machine = strdup(basename(arg_directory));
2984 }
ec16945e
LP
2985 if (!arg_machine)
2986 return log_oom();
2987
ae691c1d 2988 hostname_cleanup(arg_machine);
52ef5dd7 2989 if (!hostname_is_valid(arg_machine, 0))
c6147113 2990 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine machine name automatically, please use -M.");
b9ba4dab
LP
2991
2992 if (arg_ephemeral) {
2993 char *b;
2994
2995 /* Add a random suffix when this is an
2996 * ephemeral machine, so that we can run many
2997 * instances at once without manually having
2998 * to specify -M each time. */
2999
3000 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
3001 return log_oom();
3002
3003 free(arg_machine);
3004 arg_machine = b;
3005 }
ec16945e
LP
3006 }
3007
3008 return 0;
3009}
3010
/* Resolves the path stored in *p via chase_symlinks() using the given flags and stores the resolved
 * path back into *p, releasing the previous string. A NULL *p is left alone.
 *
 * Returns the result of free_and_replace() on success, a negative errno-style value if the path
 * could not be resolved. */
static int chase_symlinks_and_update(char **p, unsigned flags) {
        char *resolved;
        int r;

        assert(p);

        /* Nothing configured, nothing to resolve */
        if (!*p)
                return 0;

        r = chase_symlinks(*p, NULL, flags, &resolved, NULL);
        if (r < 0)
                return log_error_errno(r, "Failed to resolve path %s: %m", *p);

        return free_and_replace(*p, resolved);
}
3026
03cfe0d5 3027static int determine_uid_shift(const char *directory) {
6dac160c
LP
3028 int r;
3029
0de7acce 3030 if (arg_userns_mode == USER_NAMESPACE_NO) {
03cfe0d5 3031 arg_uid_shift = 0;
6dac160c 3032 return 0;
03cfe0d5 3033 }
6dac160c
LP
3034
3035 if (arg_uid_shift == UID_INVALID) {
3036 struct stat st;
3037
03cfe0d5 3038 r = stat(directory, &st);
6dac160c 3039 if (r < 0)
03cfe0d5 3040 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
6dac160c
LP
3041
3042 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
3043
baaa35ad
ZJS
3044 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000)))
3045 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
3046 "UID and GID base of %s don't match.", directory);
6dac160c
LP
3047
3048 arg_uid_range = UINT32_C(0x10000);
3049 }
3050
baaa35ad
ZJS
3051 if (arg_uid_shift > (uid_t) -1 - arg_uid_range)
3052 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
3053 "UID base too high for UID range.");
6dac160c 3054
6dac160c
LP
3055 return 0;
3056}
3057
de40a303
LP
3058static unsigned long effective_clone_ns_flags(void) {
3059 unsigned long flags = arg_clone_ns_flags;
3060
3061 if (arg_private_network)
3062 flags |= CLONE_NEWNET;
3063 if (arg_use_cgns)
3064 flags |= CLONE_NEWCGROUP;
3065 if (arg_userns_mode != USER_NAMESPACE_NO)
3066 flags |= CLONE_NEWUSER;
3067
3068 return flags;
3069}
3070
3071static int patch_sysctl(void) {
3072
3073 /* This table is inspired by runc's sysctl() function */
3074 static const struct {
3075 const char *key;
3076 bool prefix;
3077 unsigned long clone_flags;
3078 } safe_sysctl[] = {
3079 { "kernel.hostname", false, CLONE_NEWUTS },
3080 { "kernel.domainname", false, CLONE_NEWUTS },
3081 { "kernel.msgmax", false, CLONE_NEWIPC },
3082 { "kernel.msgmnb", false, CLONE_NEWIPC },
3083 { "kernel.msgmni", false, CLONE_NEWIPC },
3084 { "kernel.sem", false, CLONE_NEWIPC },
3085 { "kernel.shmall", false, CLONE_NEWIPC },
3086 { "kernel.shmmax", false, CLONE_NEWIPC },
3087 { "kernel.shmmni", false, CLONE_NEWIPC },
3088 { "fs.mqueue.", true, CLONE_NEWIPC },
3089 { "net.", true, CLONE_NEWNET },
3090 };
3091
3092 unsigned long flags;
3093 char **k, **v;
3094 int r;
3095
3096 flags = effective_clone_ns_flags();
3097
3098 STRV_FOREACH_PAIR(k, v, arg_sysctl) {
3099 bool good = false;
3100 size_t i;
3101
3102 for (i = 0; i < ELEMENTSOF(safe_sysctl); i++) {
3103
3104 if (!FLAGS_SET(flags, safe_sysctl[i].clone_flags))
3105 continue;
3106
3107 if (safe_sysctl[i].prefix)
3108 good = startswith(*k, safe_sysctl[i].key);
3109 else
3110 good = streq(*k, safe_sysctl[i].key);
3111
3112 if (good)
3113 break;
3114 }
3115
c6147113
LP
3116 if (!good)
3117 return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Refusing to write to sysctl '%s', as it is not safe in the selected namespaces.", *k);
de40a303
LP
3118
3119 r = sysctl_write(*k, *v);
3120 if (r < 0)
3121 return log_error_errno(r, "Failed to write sysctl '%s': %m", *k);
3122 }
3123
3124 return 0;
3125}
3126
03cfe0d5
LP
3127static int inner_child(
3128 Barrier *barrier,
3129 const char *directory,
3130 bool secondary,
3131 int kmsg_socket,
3132 int rtnl_socket,
3acc84eb 3133 int master_pty_socket,
e1bb4b0d
LB
3134 FDSet *fds,
3135 char **os_release_pairs) {
69c79d3c 3136
03cfe0d5 3137 _cleanup_free_ char *home = NULL;
b5ea030d 3138 char as_uuid[ID128_UUID_STRING_MAX];
88614c8a 3139 size_t n_env = 1;
03cfe0d5 3140 const char *envp[] = {
0c300adf 3141 "PATH=" DEFAULT_PATH_COMPAT,
6aadfa4c 3142 NULL, /* container */
03cfe0d5
LP
3143 NULL, /* TERM */
3144 NULL, /* HOME */
3145 NULL, /* USER */
3146 NULL, /* LOGNAME */
3147 NULL, /* container_uuid */
3148 NULL, /* LISTEN_FDS */
3149 NULL, /* LISTEN_PID */
9c1e04d0 3150 NULL, /* NOTIFY_SOCKET */
3652872a 3151 NULL, /* CREDENTIALS_DIRECTORY */
03cfe0d5
LP
3152 NULL
3153 };
1a68e1e5 3154 const char *exec_target;
2371271c 3155 _cleanup_strv_free_ char **env_use = NULL;
de40a303 3156 int r, which_failed;
88213476 3157
b37469d7
LP
3158 /* This is the "inner" child process, i.e. the one forked off by the "outer" child process, which is the one
3159 * the container manager itself forked off. At the time of clone() it gained its own CLONE_NEWNS, CLONE_NEWPID,
3160 * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER namespaces. Note that it has its own CLONE_NEWNS namespace,
3161 * separate from the CLONE_NEWNS created for the "outer" child, and also separate from the host's CLONE_NEWNS
3162 * namespace. The reason for having two levels of CLONE_NEWNS namespaces is that the "inner" one is owned by
3163 * the CLONE_NEWUSER namespace of the container, while the "outer" one is owned by the host's CLONE_NEWUSER
3164 * namespace.
3165 *
3166 * Note at this point we have no CLONE_NEWNET namespace yet. We'll acquire that one later through
3167 * unshare(). See below. */
3168
03cfe0d5
LP
3169 assert(barrier);
3170 assert(directory);
3171 assert(kmsg_socket >= 0);
88213476 3172
de40a303
LP
3173 log_debug("Inner child is initializing.");
3174
0de7acce 3175 if (arg_userns_mode != USER_NAMESPACE_NO) {
03cfe0d5
LP
3176 /* Tell the parent, that it now can write the UID map. */
3177 (void) barrier_place(barrier); /* #1 */
7027ff61 3178
03cfe0d5 3179 /* Wait until the parent wrote the UID map */
baaa35ad 3180 if (!barrier_place_and_sync(barrier)) /* #2 */
2a2e78e9 3181 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Parent died too early");
88213476 3182
2a2e78e9
LP
3183 /* Become the new root user inside our namespace */
3184 r = reset_uid_gid();
3185 if (r < 0)
3186 return log_error_errno(r, "Couldn't become new root: %m");
3187
3188 /* Creating a new user namespace means all MS_SHARED mounts become MS_SLAVE. Let's put them
3189 * back to MS_SHARED here, since that's what we want as defaults. (This will not reconnect
3190 * propagation, but simply create new peer groups for all our mounts). */
511a8cfe 3191 r = mount_follow_verbose(LOG_ERR, NULL, "/", NULL, MS_SHARED|MS_REC, NULL);
2a2e78e9
LP
3192 if (r < 0)
3193 return r;
3194 }
6d66bd3b 3195
0de7acce 3196 r = mount_all(NULL,
4f086aab 3197 arg_mount_settings | MOUNT_IN_USERNS,
0de7acce 3198 arg_uid_shift,
0de7acce 3199 arg_selinux_apifs_context);
03cfe0d5
LP
3200 if (r < 0)
3201 return r;
3202
04413780
ZJS
3203 if (!arg_network_namespace_path && arg_private_network) {
3204 r = unshare(CLONE_NEWNET);
3205 if (r < 0)
3206 return log_error_errno(errno, "Failed to unshare network namespace: %m");
75116558
PS
3207
3208 /* Tell the parent that it can setup network interfaces. */
3209 (void) barrier_place(barrier); /* #3 */
04413780
ZJS
3210 }
3211
4f086aab 3212 r = mount_sysfs(NULL, arg_mount_settings);
d8fc6a00
LP
3213 if (r < 0)
3214 return r;
3215
03cfe0d5
LP
3216 /* Wait until we are cgroup-ified, so that we
3217 * can mount the right cgroup path writable */
baaa35ad
ZJS
3218 if (!barrier_place_and_sync(barrier)) /* #4 */
3219 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
3220 "Parent died too early");
88213476 3221
489fae52 3222 if (arg_use_cgns) {
0996ef00
CB
3223 r = unshare(CLONE_NEWCGROUP);
3224 if (r < 0)
04413780 3225 return log_error_errno(errno, "Failed to unshare cgroup namespace: %m");
0996ef00
CB
3226 r = mount_cgroups(
3227 "",
3228 arg_unified_cgroup_hierarchy,
3229 arg_userns_mode != USER_NAMESPACE_NO,
3230 arg_uid_shift,
3231 arg_uid_range,
5a8ff0e6 3232 arg_selinux_apifs_context,
ada54120 3233 true);
1433e0f2 3234 } else
0996ef00 3235 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
1433e0f2
LP
3236 if (r < 0)
3237 return r;
ec16945e 3238
1e4f1671 3239 r = setup_boot_id();
03cfe0d5
LP
3240 if (r < 0)
3241 return r;
ec16945e 3242
1e4f1671 3243 r = setup_kmsg(kmsg_socket);
03cfe0d5
LP
3244 if (r < 0)
3245 return r;
3246 kmsg_socket = safe_close(kmsg_socket);
ec16945e 3247
de40a303
LP
3248 r = mount_custom(
3249 "/",
3250 arg_custom_mounts,
3251 arg_n_custom_mounts,
de40a303
LP
3252 0,
3253 arg_selinux_apifs_context,
5f0a6347 3254 MOUNT_NON_ROOT_ONLY | MOUNT_IN_USERNS);
de40a303
LP
3255 if (r < 0)
3256 return r;
3257
03cfe0d5
LP
3258 if (setsid() < 0)
3259 return log_error_errno(errno, "setsid() failed: %m");
3260
3261 if (arg_private_network)
df883de9 3262 (void) loopback_setup();
03cfe0d5 3263
7a8f6325
LP
3264 if (arg_expose_ports) {
3265 r = expose_port_send_rtnl(rtnl_socket);
3266 if (r < 0)
3267 return r;
3268 rtnl_socket = safe_close(rtnl_socket);
3269 }
03cfe0d5 3270
3acc84eb 3271 if (arg_console_mode != CONSOLE_PIPE) {
cd132992 3272 _cleanup_close_ int master = -1;
3acc84eb
FB
3273 _cleanup_free_ char *console = NULL;
3274
3275 /* Allocate a pty and make it available as /dev/console. */
dc98caea 3276 master = openpt_allocate(O_RDWR|O_NONBLOCK, &console);
3acc84eb 3277 if (master < 0)
dc98caea 3278 return log_error_errno(master, "Failed to allocate a pty: %m");
3acc84eb
FB
3279
3280 r = setup_dev_console(console);
3281 if (r < 0)
105a1a36 3282 return log_error_errno(r, "Failed to set up /dev/console: %m");
3acc84eb
FB
3283
3284 r = send_one_fd(master_pty_socket, master, 0);
3285 if (r < 0)
3286 return log_error_errno(r, "Failed to send master fd: %m");
3287 master_pty_socket = safe_close(master_pty_socket);
3288
3289 r = setup_stdio_as_dev_console();
3290 if (r < 0)
3291 return r;
3292 }
3293
de40a303
LP
3294 r = patch_sysctl();
3295 if (r < 0)
3296 return r;
3297
81f345df
LP
3298 if (arg_oom_score_adjust_set) {
3299 r = set_oom_score_adjust(arg_oom_score_adjust);
3300 if (r < 0)
3301 return log_error_errno(r, "Failed to adjust OOM score: %m");
3302 }
3303
0985c7c4
ZJS
3304 if (arg_cpu_set.set)
3305 if (sched_setaffinity(0, arg_cpu_set.allocated, arg_cpu_set.set) < 0)
d107bb7d
LP
3306 return log_error_errno(errno, "Failed to set CPU affinity: %m");
3307
c818eef1 3308 (void) setup_hostname();
03cfe0d5 3309
050f7277 3310 if (arg_personality != PERSONALITY_INVALID) {
21022b9d
LP
3311 r = safe_personality(arg_personality);
3312 if (r < 0)
3313 return log_error_errno(r, "personality() failed: %m");
03cfe0d5 3314 } else if (secondary) {
21022b9d
LP
3315 r = safe_personality(PER_LINUX32);
3316 if (r < 0)
3317 return log_error_errno(r, "personality() failed: %m");
03cfe0d5
LP
3318 }
3319
de40a303
LP
3320 r = setrlimit_closest_all((const struct rlimit *const*) arg_rlimit, &which_failed);
3321 if (r < 0)
3322 return log_error_errno(r, "Failed to apply resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
3323
3324#if HAVE_SECCOMP
3325 if (arg_seccomp) {
3326
3327 if (is_seccomp_available()) {
3328
3329 r = seccomp_load(arg_seccomp);
7bc5e0b1 3330 if (ERRNO_IS_SECCOMP_FATAL(r))
de40a303
LP
3331 return log_error_errno(r, "Failed to install seccomp filter: %m");
3332 if (r < 0)
3333 log_debug_errno(r, "Failed to install seccomp filter: %m");
3334 }
3335 } else
3336#endif
3337 {
6b000af4 3338 r = setup_seccomp(arg_caps_retain, arg_syscall_allow_list, arg_syscall_deny_list);
de40a303
LP
3339 if (r < 0)
3340 return r;
3341 }
3342
349cc4a5 3343#if HAVE_SELINUX
03cfe0d5 3344 if (arg_selinux_context)
2ed96880 3345 if (setexeccon(arg_selinux_context) < 0)
03cfe0d5
LP
3346 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
3347#endif
3348
de40a303
LP
3349 /* Make sure we keep the caps across the uid/gid dropping, so that we can retain some selected caps
3350 * if we need to later on. */
3351 if (prctl(PR_SET_KEEPCAPS, 1) < 0)
3352 return log_error_errno(errno, "Failed to set PR_SET_KEEPCAPS: %m");
3353
3354 if (uid_is_valid(arg_uid) || gid_is_valid(arg_gid))
3462d773 3355 r = change_uid_gid_raw(arg_uid, arg_gid, arg_supplementary_gids, arg_n_supplementary_gids, arg_console_mode != CONSOLE_PIPE);
de40a303 3356 else
3462d773 3357 r = change_uid_gid(arg_user, arg_console_mode != CONSOLE_PIPE, &home);
03cfe0d5
LP
3358 if (r < 0)
3359 return r;
3360
de40a303
LP
3361 r = drop_capabilities(getuid());
3362 if (r < 0)
3363 return log_error_errno(r, "Dropping capabilities failed: %m");
3364
66edd963
LP
3365 if (arg_no_new_privileges)
3366 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
3367 return log_error_errno(errno, "Failed to disable new privileges: %m");
3368
6aadfa4c
ILG
3369 /* LXC sets container=lxc, so follow the scheme here */
3370 envp[n_env++] = strjoina("container=", arg_container_service_name);
3371
03cfe0d5
LP
3372 envp[n_env] = strv_find_prefix(environ, "TERM=");
3373 if (envp[n_env])
313cefa1 3374 n_env++;
03cfe0d5 3375
de40a303
LP
3376 if (home || !uid_is_valid(arg_uid) || arg_uid == 0)
3377 if (asprintf((char**)(envp + n_env++), "HOME=%s", home ?: "/root") < 0)
3378 return log_oom();
3379
3380 if (arg_user || !uid_is_valid(arg_uid) || arg_uid == 0)
3381 if (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ?: "root") < 0 ||
3382 asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)
3383 return log_oom();
03cfe0d5 3384
3bbaff3e 3385 assert(!sd_id128_is_null(arg_uuid));
03cfe0d5 3386
691675ba 3387 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_to_uuid_string(arg_uuid, as_uuid)) < 0)
e01ff70a 3388 return log_oom();
03cfe0d5
LP
3389
3390 if (fdset_size(fds) > 0) {
3391 r = fdset_cloexec(fds, false);
3392 if (r < 0)
3393 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
3394
3395 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
3396 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
3397 return log_oom();
3398 }
9c1e04d0
AP
3399 if (asprintf((char **)(envp + n_env++), "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
3400 return log_oom();
03cfe0d5 3401
3652872a
LP
3402 if (arg_n_credentials > 0) {
3403 envp[n_env] = strdup("CREDENTIALS_DIRECTORY=/run/host/credentials");
3404 if (!envp[n_env])
3405 return log_oom();
3406 n_env++;
3407 }
3408
ed4512d0 3409 env_use = strv_env_merge(3, envp, os_release_pairs, arg_setenv);
2371271c
TG
3410 if (!env_use)
3411 return log_oom();
03cfe0d5
LP
3412
3413 /* Let the parent know that we are ready and
3414 * wait until the parent is ready with the
3415 * setup, too... */
baaa35ad 3416 if (!barrier_place_and_sync(barrier)) /* #5 */
335d2ead 3417 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Parent died too early");
03cfe0d5 3418
5f932eb9
LP
3419 if (arg_chdir)
3420 if (chdir(arg_chdir) < 0)
3421 return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
3422
7732f92b 3423 if (arg_start_mode == START_PID2) {
75bf701f 3424 r = stub_pid1(arg_uuid);
7732f92b
LP
3425 if (r < 0)
3426 return r;
3427 }
3428
335d2ead
LP
3429 if (arg_console_mode != CONSOLE_PIPE) {
3430 /* So far our pty wasn't controlled by any process. Finally, it's time to change that, if we
3431 * are configured for that. Acquire it as controlling tty. */
3432 if (ioctl(STDIN_FILENO, TIOCSCTTY) < 0)
3433 return log_error_errno(errno, "Failed to acquire controlling TTY: %m");
3434 }
3435
de40a303
LP
3436 log_debug("Inner child completed, invoking payload.");
3437
8ca082b4
LP
3438 /* Now, explicitly close the log, so that we then can close all remaining fds. Closing the log explicitly first
3439 * has the benefit that the logging subsystem knows about it, and is thus ready to be reopened should we need
3440 * it again. Note that the other fds closed here are at least the locking and barrier fds. */
03cfe0d5 3441 log_close();
8ca082b4
LP
3442 log_set_open_when_needed(true);
3443
03cfe0d5
LP
3444 (void) fdset_close_others(fds);
3445
7732f92b 3446 if (arg_start_mode == START_BOOT) {
03cfe0d5
LP
3447 char **a;
3448 size_t m;
3449
3450 /* Automatically search for the init system */
3451
75f32f04
ZJS
3452 m = strv_length(arg_parameters);
3453 a = newa(char*, m + 2);
3454 memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
3455 a[1 + m] = NULL;
03cfe0d5 3456
ced58da7 3457 a[0] = (char*) "/usr/lib/systemd/systemd";
03cfe0d5
LP
3458 execve(a[0], a, env_use);
3459
ced58da7 3460 a[0] = (char*) "/lib/systemd/systemd";
03cfe0d5
LP
3461 execve(a[0], a, env_use);
3462
ced58da7 3463 a[0] = (char*) "/sbin/init";
03cfe0d5 3464 execve(a[0], a, env_use);
ced58da7
LP
3465
3466 exec_target = "/usr/lib/systemd/systemd, /lib/systemd/systemd, /sbin/init";
1a68e1e5 3467 } else if (!strv_isempty(arg_parameters)) {
b6b180b7
LP
3468 const char *dollar_path;
3469
1a68e1e5 3470 exec_target = arg_parameters[0];
b6b180b7
LP
3471
3472 /* Use the user supplied search $PATH if there is one, or DEFAULT_PATH_COMPAT if not to search the
3473 * binary. */
3474 dollar_path = strv_env_get(env_use, "PATH");
3475 if (dollar_path) {
6f646e01 3476 if (setenv("PATH", dollar_path, 1) < 0)
b6b180b7
LP
3477 return log_error_errno(errno, "Failed to update $PATH: %m");
3478 }
3479
f757855e 3480 execvpe(arg_parameters[0], arg_parameters, env_use);
1a68e1e5 3481 } else {
5f932eb9 3482 if (!arg_chdir)
d929b0f9
ZJS
3483 /* If we cannot change the directory, we'll end up in /, that is expected. */
3484 (void) chdir(home ?: "/root");
5f932eb9 3485
03cfe0d5
LP
3486 execle("/bin/bash", "-bash", NULL, env_use);
3487 execle("/bin/sh", "-sh", NULL, env_use);
ced58da7
LP
3488
3489 exec_target = "/bin/bash, /bin/sh";
03cfe0d5
LP
3490 }
3491
8ca082b4 3492 return log_error_errno(errno, "execv(%s) failed: %m", exec_target);
03cfe0d5
LP
3493}
3494
e96ceaba 3495static int setup_notify_child(void) {
271f518f 3496 _cleanup_close_ int fd = -1;
9c1e04d0 3497 union sockaddr_union sa = {
44ed5214
LP
3498 .un.sun_family = AF_UNIX,
3499 .un.sun_path = NSPAWN_NOTIFY_SOCKET_PATH,
9c1e04d0
AP
3500 };
3501 int r;
3502
3503 fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
3504 if (fd < 0)
3505 return log_error_errno(errno, "Failed to allocate notification socket: %m");
3506
3507 (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
fbda85b0 3508 (void) sockaddr_un_unlink(&sa.un);
9c1e04d0 3509
9c1e04d0 3510 r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
271f518f 3511 if (r < 0)
44ed5214 3512 return log_error_errno(errno, "bind(" NSPAWN_NOTIFY_SOCKET_PATH ") failed: %m");
9c1e04d0 3513
adc7d9f0 3514 r = userns_lchown(NSPAWN_NOTIFY_SOCKET_PATH, 0, 0);
271f518f 3515 if (r < 0)
adc7d9f0 3516 return log_error_errno(r, "Failed to chown " NSPAWN_NOTIFY_SOCKET_PATH ": %m");
adc7d9f0 3517
2ff48e98 3518 r = setsockopt_int(fd, SOL_SOCKET, SO_PASSCRED, true);
271f518f 3519 if (r < 0)
2ff48e98 3520 return log_error_errno(r, "SO_PASSCRED failed: %m");
9c1e04d0 3521
271f518f 3522 return TAKE_FD(fd);
9c1e04d0
AP
3523}
3524
03cfe0d5
LP
3525static int outer_child(
3526 Barrier *barrier,
3527 const char *directory,
2d845785 3528 DissectedImage *dissected_image,
03cfe0d5
LP
3529 bool secondary,
3530 int pid_socket,
e01ff70a 3531 int uuid_socket,
9c1e04d0 3532 int notify_socket,
03cfe0d5
LP
3533 int kmsg_socket,
3534 int rtnl_socket,
825d5287 3535 int uid_shift_socket,
3acc84eb 3536 int master_pty_socket,
8199d554 3537 int unified_cgroup_hierarchy_socket,
d7bea6b6
DP
3538 FDSet *fds,
3539 int netns_fd) {
03cfe0d5 3540
e1bb4b0d 3541 _cleanup_strv_free_ char **os_release_pairs = NULL;
bf428efb 3542 _cleanup_close_ int fd = -1;
e5f10caf 3543 const char *p;
03cfe0d5
LP
3544 pid_t pid;
3545 ssize_t l;
de40a303 3546 int r;
03cfe0d5 3547
b37469d7
LP
3548 /* This is the "outer" child process, i.e the one forked off by the container manager itself. It already has
3549 * its own CLONE_NEWNS namespace (which was created by the clone()). It still lives in the host's CLONE_NEWPID,
3550 * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER and CLONE_NEWNET namespaces. After it completed a number of
3551 * initializations a second child (the "inner" one) is forked off it, and it exits. */
3552
03cfe0d5
LP
3553 assert(barrier);
3554 assert(directory);
03cfe0d5 3555 assert(pid_socket >= 0);
e01ff70a 3556 assert(uuid_socket >= 0);
9c1e04d0 3557 assert(notify_socket >= 0);
3acc84eb 3558 assert(master_pty_socket >= 0);
03cfe0d5
LP
3559 assert(kmsg_socket >= 0);
3560
de40a303
LP
3561 log_debug("Outer child is initializing.");
3562
e1bb4b0d
LB
3563 r = load_os_release_pairs_with_prefix("/", "container_host_", &os_release_pairs);
3564 if (r < 0)
3565 log_debug_errno(r, "Failed to read os-release from host for container, ignoring: %m");
3566
03cfe0d5
LP
3567 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
3568 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3569
03cfe0d5
LP
3570 r = reset_audit_loginuid();
3571 if (r < 0)
3572 return r;
3573
2a2e78e9
LP
3574 /* Mark everything as slave, so that we still receive mounts from the real root, but don't propagate
3575 * mounts to the real root. */
511a8cfe 3576 r = mount_follow_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
60e76d48
ZJS
3577 if (r < 0)
3578 return r;
03cfe0d5 3579
2d845785 3580 if (dissected_image) {
2d3a5a73
LP
3581 /* If we are operating on a disk image, then mount its root directory now, but leave out the rest. We
3582 * can read the UID shift from it if we need to. Further down we'll mount the rest, but then with the
3583 * uid shift known. That way we can mount VFAT file systems shifted to the right place right away. This
3584 * makes sure ESP partitions and userns are compatible. */
3585
af187ab2
LP
3586 r = dissected_image_mount_and_warn(
3587 dissected_image, directory, arg_uid_shift,
3588 DISSECT_IMAGE_MOUNT_ROOT_ONLY|DISSECT_IMAGE_DISCARD_ON_LOOP|
3589 (arg_read_only ? DISSECT_IMAGE_READ_ONLY : DISSECT_IMAGE_FSCK)|
3590 (arg_start_mode == START_BOOT ? DISSECT_IMAGE_VALIDATE_OS : 0));
2d845785 3591 if (r < 0)
af187ab2 3592 return r;
2d845785 3593 }
03cfe0d5 3594
391567f4
LP
3595 r = determine_uid_shift(directory);
3596 if (r < 0)
3597 return r;
3598
0de7acce 3599 if (arg_userns_mode != USER_NAMESPACE_NO) {
0e7ac751 3600 /* Let the parent know which UID shift we read from the image */
825d5287
RM
3601 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
3602 if (l < 0)
3603 return log_error_errno(errno, "Failed to send UID shift: %m");
baaa35ad
ZJS
3604 if (l != sizeof(arg_uid_shift))
3605 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3606 "Short write while sending UID shift.");
0e7ac751 3607
0de7acce 3608 if (arg_userns_mode == USER_NAMESPACE_PICK) {
0e7ac751
LP
3609 /* When we are supposed to pick the UID shift, the parent will check now whether the UID shift
3610 * we just read from the image is available. If yes, it will send the UID shift back to us, if
3611 * not it will pick a different one, and send it back to us. */
3612
3613 l = recv(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
3614 if (l < 0)
3615 return log_error_errno(errno, "Failed to recv UID shift: %m");
baaa35ad
ZJS
3616 if (l != sizeof(arg_uid_shift))
3617 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3618 "Short read while receiving UID shift.");
0e7ac751
LP
3619 }
3620
ff6c6cc1
LP
3621 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
3622 "Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
825d5287
RM
3623 }
3624
6f83d3d1
LP
3625 if (path_equal(directory, "/")) {
3626 /* If the directory we shall boot is the host, let's operate on a bind mount at a different
3627 * place, so that we can make changes to its mount structure (for example, to implement
3628 * --volatile=) without this interfering with our ability to access files such as
3629 * /etc/localtime to copy into the container. Note that we use a fixed place for this
3630 * (instead of a temporary directory, since we are living in our own mount namspace here
3631 * already, and thus don't need to be afraid of colliding with anyone else's mounts).*/
3632 (void) mkdir_p("/run/systemd/nspawn-root", 0755);
3633
511a8cfe 3634 r = mount_nofollow_verbose(LOG_ERR, "/", "/run/systemd/nspawn-root", NULL, MS_BIND|MS_REC, NULL);
6f83d3d1
LP
3635 if (r < 0)
3636 return r;
3637
3638 directory = "/run/systemd/nspawn-root";
e50cd82f 3639 }
7d0ecdd6
LP
3640
3641 r = setup_pivot_root(
3642 directory,
3643 arg_pivot_root_new,
3644 arg_pivot_root_old);
3645 if (r < 0)
3646 return r;
3647
3648 r = setup_volatile_mode(
3649 directory,
3650 arg_volatile_mode,
7d0ecdd6 3651 arg_uid_shift,
8f1ed04a 3652 arg_selinux_apifs_context);
7d0ecdd6
LP
3653 if (r < 0)
3654 return r;
3655
5f0a6347
DDM
3656 r = mount_custom(
3657 directory,
3658 arg_custom_mounts,
3659 arg_n_custom_mounts,
5f0a6347 3660 arg_uid_shift,
5f0a6347
DDM
3661 arg_selinux_apifs_context,
3662 MOUNT_ROOT_ONLY);
3663 if (r < 0)
3664 return r;
3665
5530dc87
DDM
3666 /* Make sure we always have a mount that we can move to root later on. */
3667 if (!path_is_mount_point(directory, NULL, 0)) {
511a8cfe 3668 r = mount_nofollow_verbose(LOG_ERR, directory, directory, NULL, MS_BIND|MS_REC, NULL);
5530dc87
DDM
3669 if (r < 0)
3670 return r;
3671 }
3672
2d3a5a73
LP
3673 if (dissected_image) {
3674 /* Now we know the uid shift, let's now mount everything else that might be in the image. */
3675 r = dissected_image_mount(dissected_image, directory, arg_uid_shift,
4fcb96ce
LP
3676 DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY|DISSECT_IMAGE_DISCARD_ON_LOOP|(arg_read_only ? DISSECT_IMAGE_READ_ONLY : DISSECT_IMAGE_FSCK));
3677 if (r == -EUCLEAN)
3678 return log_error_errno(r, "File system check for image failed: %m");
2d3a5a73 3679 if (r < 0)
4fcb96ce 3680 return log_error_errno(r, "Failed to mount image file system: %m");
2d3a5a73
LP
3681 }
3682
8199d554
LP
3683 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
3684 /* OK, we don't know yet which cgroup mode to use yet. Let's figure it out, and tell the parent. */
3685
3686 r = detect_unified_cgroup_hierarchy_from_image(directory);
3687 if (r < 0)
3688 return r;
3689
3690 l = send(unified_cgroup_hierarchy_socket, &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), MSG_NOSIGNAL);
3691 if (l < 0)
3692 return log_error_errno(errno, "Failed to send cgroup mode: %m");
baaa35ad
ZJS
3693 if (l != sizeof(arg_unified_cgroup_hierarchy))
3694 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3695 "Short write while sending cgroup mode.");
8199d554
LP
3696
3697 unified_cgroup_hierarchy_socket = safe_close(unified_cgroup_hierarchy_socket);
3698 }
3699
4ad14eff
LP
3700 /* Mark everything as shared so our mounts get propagated down. This is
3701 * required to make new bind mounts available in systemd services
5238e957 3702 * inside the container that create a new mount namespace.
4ad14eff
LP
3703 * See https://github.com/systemd/systemd/issues/3860
3704 * Further submounts (such as /dev) done after this will inherit the
5f0a6347
DDM
3705 * shared propagation mode.
3706 *
3707 * IMPORTANT: Do not overmount the root directory anymore from now on to
3708 * enable moving the root directory mount to root later on.
3709 * https://github.com/systemd/systemd/issues/3847#issuecomment-562735251
3710 */
511a8cfe 3711 r = mount_nofollow_verbose(LOG_ERR, NULL, directory, NULL, MS_SHARED|MS_REC, NULL);
4ad14eff
LP
3712 if (r < 0)
3713 return r;
3714
3715 r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
3716 if (r < 0)
3717 return r;
3718
03cfe0d5
LP
3719 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
3720 if (r < 0)
3721 return r;
3722
bbd407ea
DDM
3723 if (arg_read_only && arg_volatile_mode == VOLATILE_NO &&
3724 !has_custom_root_mount(arg_custom_mounts, arg_n_custom_mounts)) {
64e82c19 3725 r = bind_remount_recursive(directory, MS_RDONLY, MS_RDONLY, NULL);
03cfe0d5
LP
3726 if (r < 0)
3727 return log_error_errno(r, "Failed to make tree read-only: %m");
3728 }
3729
0de7acce 3730 r = mount_all(directory,
4f086aab 3731 arg_mount_settings,
0de7acce 3732 arg_uid_shift,
0de7acce 3733 arg_selinux_apifs_context);
03cfe0d5
LP
3734 if (r < 0)
3735 return r;
3736
07fa00f9
LP
3737 r = copy_devnodes(directory);
3738 if (r < 0)
03cfe0d5
LP
3739 return r;
3740
de40a303
LP
3741 r = make_extra_nodes(directory);
3742 if (r < 0)
3743 return r;
3744
3745 (void) dev_setup(directory, arg_uid_shift, arg_uid_shift);
e5f10caf 3746
9fac5029 3747 p = prefix_roota(directory, "/run/host");
e5f10caf 3748 (void) make_inaccessible_nodes(p, arg_uid_shift, arg_uid_shift);
03cfe0d5 3749
07fa00f9
LP
3750 r = setup_pts(directory);
3751 if (r < 0)
03cfe0d5
LP
3752 return r;
3753
3754 r = setup_propagate(directory);
3755 if (r < 0)
3756 return r;
3757
8e5430c4
LP
3758 r = setup_keyring();
3759 if (r < 0)
3760 return r;
3761
3652872a
LP
3762 r = setup_credentials(directory);
3763 if (r < 0)
3764 return r;
3765
5c4deb9a
MJ
3766 r = mount_custom(
3767 directory,
3768 arg_custom_mounts,
3769 arg_n_custom_mounts,
3770 arg_uid_shift,
3771 arg_selinux_apifs_context,
3772 MOUNT_NON_ROOT_ONLY);
3773 if (r < 0)
3774 return r;
3775
03cfe0d5
LP
3776 r = setup_timezone(directory);
3777 if (r < 0)
3778 return r;
3779
3780 r = setup_resolv_conf(directory);
3781 if (r < 0)
3782 return r;
3783
e01ff70a
MS
3784 r = setup_machine_id(directory);
3785 if (r < 0)
3786 return r;
3787
03cfe0d5
LP
3788 r = setup_journal(directory);
3789 if (r < 0)
3790 return r;
3791
0f48ba7b
LP
3792 /* The same stuff as the $container env var, but nicely readable for the entire payload */
3793 p = prefix_roota(directory, "/run/host/container-manager");
3794 (void) write_string_file(p, arg_container_service_name, WRITE_STRING_FILE_CREATE);
3795
3796 /* The same stuff as the $container_uuid env var */
3797 p = prefix_roota(directory, "/run/host/container-uuid");
3798 (void) write_string_filef(p, WRITE_STRING_FILE_CREATE, SD_ID128_UUID_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid));
3799
489fae52 3800 if (!arg_use_cgns) {
0996ef00
CB
3801 r = mount_cgroups(
3802 directory,
3803 arg_unified_cgroup_hierarchy,
3804 arg_userns_mode != USER_NAMESPACE_NO,
3805 arg_uid_shift,
3806 arg_uid_range,
5a8ff0e6 3807 arg_selinux_apifs_context,
ada54120 3808 false);
0996ef00
CB
3809 if (r < 0)
3810 return r;
3811 }
03cfe0d5
LP
3812
3813 r = mount_move_root(directory);
3814 if (r < 0)
3815 return log_error_errno(r, "Failed to move root directory: %m");
3816
e96ceaba 3817 fd = setup_notify_child();
9c1e04d0
AP
3818 if (fd < 0)
3819 return fd;
3820
03cfe0d5 3821 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
0c582db0 3822 arg_clone_ns_flags |
8869a0b4 3823 (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
03cfe0d5
LP
3824 if (pid < 0)
3825 return log_error_errno(errno, "Failed to fork inner child: %m");
03cfe0d5
LP
3826 if (pid == 0) {
3827 pid_socket = safe_close(pid_socket);
e01ff70a 3828 uuid_socket = safe_close(uuid_socket);
9c1e04d0 3829 notify_socket = safe_close(notify_socket);
825d5287 3830 uid_shift_socket = safe_close(uid_shift_socket);
03cfe0d5 3831
2a2e78e9
LP
3832 /* The inner child has all namespaces that are requested, so that we all are owned by the
3833 * user if user namespaces are turned on. */
03cfe0d5 3834
d7bea6b6
DP
3835 if (arg_network_namespace_path) {
3836 r = namespace_enter(-1, -1, netns_fd, -1, -1);
3837 if (r < 0)
e2d39e54 3838 return log_error_errno(r, "Failed to join network namespace: %m");
d7bea6b6
DP
3839 }
3840
e1bb4b0d 3841 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, master_pty_socket, fds, os_release_pairs);
03cfe0d5
LP
3842 if (r < 0)
3843 _exit(EXIT_FAILURE);
3844
3845 _exit(EXIT_SUCCESS);
3846 }
3847
3848 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
3849 if (l < 0)
3850 return log_error_errno(errno, "Failed to send PID: %m");
baaa35ad
ZJS
3851 if (l != sizeof(pid))
3852 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3853 "Short write while sending PID.");
03cfe0d5 3854
e01ff70a
MS
3855 l = send(uuid_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
3856 if (l < 0)
3857 return log_error_errno(errno, "Failed to send machine ID: %m");
baaa35ad
ZJS
3858 if (l != sizeof(arg_uuid))
3859 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3860 "Short write while sending machine ID.");
e01ff70a 3861
9c1e04d0
AP
3862 l = send_one_fd(notify_socket, fd, 0);
3863 if (l < 0)
ba72801d 3864 return log_error_errno(l, "Failed to send notify fd: %m");
9c1e04d0 3865
03cfe0d5 3866 pid_socket = safe_close(pid_socket);
e01ff70a 3867 uuid_socket = safe_close(uuid_socket);
9c1e04d0 3868 notify_socket = safe_close(notify_socket);
3acc84eb 3869 master_pty_socket = safe_close(master_pty_socket);
327e26d6
KN
3870 kmsg_socket = safe_close(kmsg_socket);
3871 rtnl_socket = safe_close(rtnl_socket);
d7bea6b6 3872 netns_fd = safe_close(netns_fd);
03cfe0d5
LP
3873
3874 return 0;
3875}
3876
0e7ac751 3877static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
d381c8a6 3878 bool tried_hashed = false;
0e7ac751
LP
3879 unsigned n_tries = 100;
3880 uid_t candidate;
3881 int r;
3882
3883 assert(shift);
3884 assert(ret_lock_file);
0de7acce 3885 assert(arg_userns_mode == USER_NAMESPACE_PICK);
0e7ac751
LP
3886 assert(arg_uid_range == 0x10000U);
3887
3888 candidate = *shift;
3889
3890 (void) mkdir("/run/systemd/nspawn-uid", 0755);
3891
3892 for (;;) {
fbd0b64f 3893 char lock_path[STRLEN("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
8e766630 3894 _cleanup_(release_lock_file) LockFile lf = LOCK_FILE_INIT;
0e7ac751
LP
3895
3896 if (--n_tries <= 0)
3897 return -EBUSY;
3898
87d5e4f2 3899 if (candidate < CONTAINER_UID_BASE_MIN || candidate > CONTAINER_UID_BASE_MAX)
0e7ac751
LP
3900 goto next;
3901 if ((candidate & UINT32_C(0xFFFF)) != 0)
3902 goto next;
3903
3904 xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
3905 r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
3906 if (r == -EBUSY) /* Range already taken by another nspawn instance */
3907 goto next;
3908 if (r < 0)
3909 return r;
3910
3911 /* Make some superficial checks whether the range is currently known in the user database */
3912 if (getpwuid(candidate))
3913 goto next;
3914 if (getpwuid(candidate + UINT32_C(0xFFFE)))
3915 goto next;
3916 if (getgrgid(candidate))
3917 goto next;
3918 if (getgrgid(candidate + UINT32_C(0xFFFE)))
3919 goto next;
3920
3921 *ret_lock_file = lf;
3922 lf = (struct LockFile) LOCK_FILE_INIT;
3923 *shift = candidate;
3924 return 0;
3925
3926 next:
d381c8a6
LP
3927 if (arg_machine && !tried_hashed) {
3928 /* Try to hash the base from the container name */
3929
3930 static const uint8_t hash_key[] = {
3931 0xe1, 0x56, 0xe0, 0xf0, 0x4a, 0xf0, 0x41, 0xaf,
3932 0x96, 0x41, 0xcf, 0x41, 0x33, 0x94, 0xff, 0x72
3933 };
3934
3935 candidate = (uid_t) siphash24(arg_machine, strlen(arg_machine), hash_key);
3936
3937 tried_hashed = true;
3938 } else
3939 random_bytes(&candidate, sizeof(candidate));
3940
87d5e4f2 3941 candidate = (candidate % (CONTAINER_UID_BASE_MAX - CONTAINER_UID_BASE_MIN)) + CONTAINER_UID_BASE_MIN;
0e7ac751
LP
3942 candidate &= (uid_t) UINT32_C(0xFFFF0000);
3943 }
3944}
3945
03cfe0d5 3946static int setup_uid_map(pid_t pid) {
fbd0b64f 3947 char uid_map[STRLEN("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
03cfe0d5
LP
3948 int r;
3949
3950 assert(pid > 1);
3951
3952 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
3953 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
57512c89 3954 r = write_string_file(uid_map, line, WRITE_STRING_FILE_DISABLE_BUFFER);
03cfe0d5
LP
3955 if (r < 0)
3956 return log_error_errno(r, "Failed to write UID map: %m");
3957
3958 /* We always assign the same UID and GID ranges */
3959 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
57512c89 3960 r = write_string_file(uid_map, line, WRITE_STRING_FILE_DISABLE_BUFFER);
03cfe0d5
LP
3961 if (r < 0)
3962 return log_error_errno(r, "Failed to write GID map: %m");
3963
3964 return 0;
3965}
3966
9c1e04d0 3967static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
9c1e04d0
AP
3968 char buf[NOTIFY_BUFFER_MAX+1];
3969 char *p = NULL;
3970 struct iovec iovec = {
3971 .iov_base = buf,
3972 .iov_len = sizeof(buf)-1,
3973 };
fb29cdbe
LP
3974 CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred)) +
3975 CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)) control;
9c1e04d0
AP
3976 struct msghdr msghdr = {
3977 .msg_iov = &iovec,
3978 .msg_iovlen = 1,
3979 .msg_control = &control,
3980 .msg_controllen = sizeof(control),
3981 };
371d72e0 3982 struct ucred *ucred;
9c1e04d0
AP
3983 ssize_t n;
3984 pid_t inner_child_pid;
3985 _cleanup_strv_free_ char **tags = NULL;
3986
3987 assert(userdata);
3988
3989 inner_child_pid = PTR_TO_PID(userdata);
3990
3991 if (revents != EPOLLIN) {
3992 log_warning("Got unexpected poll event for notify fd.");
3993 return 0;
3994 }
3995
3691bcf3
LP
3996 n = recvmsg_safe(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
3997 if (IN_SET(n, -EAGAIN, -EINTR))
3998 return 0;
741bfd7f
LP
3999 if (n == -EXFULL) {
4000 log_warning("Got message with truncated control data (too many fds sent?), ignoring.");
4001 return 0;
4002 }
3691bcf3
LP
4003 if (n < 0)
4004 return log_warning_errno(n, "Couldn't read notification socket: %m");
9c1e04d0 4005
9c1e04d0
AP
4006 cmsg_close_all(&msghdr);
4007
371d72e0 4008 ucred = CMSG_FIND_DATA(&msghdr, SOL_SOCKET, SCM_CREDENTIALS, struct ucred);
9c1e04d0 4009 if (!ucred || ucred->pid != inner_child_pid) {
8cb57430 4010 log_debug("Received notify message without valid credentials. Ignoring.");
9c1e04d0
AP
4011 return 0;
4012 }
4013
4014 if ((size_t) n >= sizeof(buf)) {
4015 log_warning("Received notify message exceeded maximum size. Ignoring.");
4016 return 0;
4017 }
4018
4019 buf[n] = 0;
4020 tags = strv_split(buf, "\n\r");
4021 if (!tags)
4022 return log_oom();
4023
4024 if (strv_find(tags, "READY=1"))
04f590a4 4025 (void) sd_notifyf(false, "READY=1\n");
9c1e04d0
AP
4026
4027 p = strv_find_startswith(tags, "STATUS=");
4028 if (p)
04f590a4 4029 (void) sd_notifyf(false, "STATUS=Container running: %s", p);
9c1e04d0
AP
4030
4031 return 0;
4032}
4033
e96ceaba 4034static int setup_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid, sd_event_source **notify_event_source) {
9c1e04d0 4035 int r;
9c1e04d0 4036
5773024d 4037 r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
9c1e04d0
AP
4038 if (r < 0)
4039 return log_error_errno(r, "Failed to allocate notify event source: %m");
4040
5773024d 4041 (void) sd_event_source_set_description(*notify_event_source, "nspawn-notify");
9c1e04d0
AP
4042
4043 return 0;
4044}
4045
5d961407
LP
4046static int merge_settings(Settings *settings, const char *path) {
4047 int rl;
f757855e 4048
5d961407
LP
4049 assert(settings);
4050 assert(path);
f757855e 4051
5d961407
LP
4052 /* Copy over bits from the settings, unless they have been explicitly masked by command line switches. Note
4053 * that this steals the fields of the Settings* structure, and hence modifies it. */
f757855e 4054
7732f92b
LP
4055 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
4056 settings->start_mode >= 0) {
4057 arg_start_mode = settings->start_mode;
130d3d22 4058 strv_free_and_replace(arg_parameters, settings->parameters);
f757855e
LP
4059 }
4060
a2f577fc
JL
4061 if ((arg_settings_mask & SETTING_EPHEMERAL) == 0)
4062 arg_ephemeral = settings->ephemeral;
4063
de40a303
LP
4064 if ((arg_settings_mask & SETTING_DIRECTORY) == 0 &&
4065 settings->root) {
4066
4067 if (!arg_settings_trusted)
4068 log_warning("Ignoring root directory setting, file %s is not trusted.", path);
4069 else
4070 free_and_replace(arg_directory, settings->root);
4071 }
4072
b53ede69
PW
4073 if ((arg_settings_mask & SETTING_PIVOT_ROOT) == 0 &&
4074 settings->pivot_root_new) {
4075 free_and_replace(arg_pivot_root_new, settings->pivot_root_new);
4076 free_and_replace(arg_pivot_root_old, settings->pivot_root_old);
4077 }
4078
5f932eb9 4079 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
1cc6c93a
YW
4080 settings->working_directory)
4081 free_and_replace(arg_chdir, settings->working_directory);
5f932eb9 4082
f757855e 4083 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
130d3d22
YW
4084 settings->environment)
4085 strv_free_and_replace(arg_setenv, settings->environment);
f757855e 4086
de40a303
LP
4087 if ((arg_settings_mask & SETTING_USER) == 0) {
4088
4089 if (settings->user)
4090 free_and_replace(arg_user, settings->user);
4091
4092 if (uid_is_valid(settings->uid))
4093 arg_uid = settings->uid;
4094 if (gid_is_valid(settings->gid))
4095 arg_gid = settings->gid;
4096 if (settings->n_supplementary_gids > 0) {
4097 free_and_replace(arg_supplementary_gids, settings->supplementary_gids);
4098 arg_n_supplementary_gids = settings->n_supplementary_gids;
4099 }
4100 }
f757855e
LP
4101
4102 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
a3fc6b55 4103 uint64_t plus, minus;
7be830c6 4104 uint64_t network_minus = 0;
88fc9c9b 4105 uint64_t ambient;
f757855e 4106
de40a303
LP
4107 /* Note that we copy both the simple plus/minus caps here, and the full quintet from the
4108 * Settings structure */
4109
0e265674 4110 plus = settings->capability;
a3fc6b55
LP
4111 minus = settings->drop_capability;
4112
4113 if ((arg_settings_mask & SETTING_NETWORK) == 0) {
4114 if (settings_private_network(settings))
4115 plus |= UINT64_C(1) << CAP_NET_ADMIN;
4116 else
7be830c6 4117 network_minus |= UINT64_C(1) << CAP_NET_ADMIN;
a3fc6b55 4118 }
0e265674
LP
4119
4120 if (!arg_settings_trusted && plus != 0) {
4121 if (settings->capability != 0)
5d961407 4122 log_warning("Ignoring Capability= setting, file %s is not trusted.", path);
7be830c6
TH
4123 } else {
4124 arg_caps_retain &= ~network_minus;
520e0d54 4125 arg_caps_retain |= plus;
7be830c6 4126 }
f757855e 4127
a3fc6b55 4128 arg_caps_retain &= ~minus;
de40a303
LP
4129
4130 /* Copy the full capabilities over too */
4131 if (capability_quintet_is_set(&settings->full_capabilities)) {
4132 if (!arg_settings_trusted)
5238e957 4133 log_warning("Ignoring capability settings, file %s is not trusted.", path);
de40a303
LP
4134 else
4135 arg_full_capabilities = settings->full_capabilities;
4136 }
88fc9c9b
TH
4137
4138 ambient = settings->ambient_capability;
4139 if (!arg_settings_trusted && ambient != 0)
4140 log_warning("Ignoring AmbientCapability= setting, file %s is not trusted.", path);
4141 else
4142 arg_caps_ambient |= ambient;
f757855e
LP
4143 }
4144
4145 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
4146 settings->kill_signal > 0)
4147 arg_kill_signal = settings->kill_signal;
4148
4149 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
4150 settings->personality != PERSONALITY_INVALID)
4151 arg_personality = settings->personality;
4152
4153 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
4154 !sd_id128_is_null(settings->machine_id)) {
4155
4156 if (!arg_settings_trusted)
5d961407 4157 log_warning("Ignoring MachineID= setting, file %s is not trusted.", path);
f757855e
LP
4158 else
4159 arg_uuid = settings->machine_id;
4160 }
4161
4162 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
4163 settings->read_only >= 0)
4164 arg_read_only = settings->read_only;
4165
4166 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
4167 settings->volatile_mode != _VOLATILE_MODE_INVALID)
4168 arg_volatile_mode = settings->volatile_mode;
4169
4170 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
4171 settings->n_custom_mounts > 0) {
4172
4173 if (!arg_settings_trusted)
5d961407 4174 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", path);
f757855e
LP
4175 else {
4176 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
1cc6c93a 4177 arg_custom_mounts = TAKE_PTR(settings->custom_mounts);
f757855e 4178 arg_n_custom_mounts = settings->n_custom_mounts;
f757855e
LP
4179 settings->n_custom_mounts = 0;
4180 }
4181 }
4182
4183 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
4184 (settings->private_network >= 0 ||
4185 settings->network_veth >= 0 ||
4186 settings->network_bridge ||
22b28dfd 4187 settings->network_zone ||
f757855e
LP
4188 settings->network_interfaces ||
4189 settings->network_macvlan ||
f6d6bad1 4190 settings->network_ipvlan ||
de40a303
LP
4191 settings->network_veth_extra ||
4192 settings->network_namespace_path)) {
f757855e
LP
4193
4194 if (!arg_settings_trusted)
5d961407 4195 log_warning("Ignoring network settings, file %s is not trusted.", path);
f757855e 4196 else {
f6d6bad1 4197 arg_network_veth = settings_network_veth(settings);
0e265674
LP
4198 arg_private_network = settings_private_network(settings);
4199
130d3d22
YW
4200 strv_free_and_replace(arg_network_interfaces, settings->network_interfaces);
4201 strv_free_and_replace(arg_network_macvlan, settings->network_macvlan);
4202 strv_free_and_replace(arg_network_ipvlan, settings->network_ipvlan);
4203 strv_free_and_replace(arg_network_veth_extra, settings->network_veth_extra);
f6d6bad1 4204
1cc6c93a
YW
4205 free_and_replace(arg_network_bridge, settings->network_bridge);
4206 free_and_replace(arg_network_zone, settings->network_zone);
de40a303
LP
4207
4208 free_and_replace(arg_network_namespace_path, settings->network_namespace_path);
f757855e
LP
4209 }
4210 }
4211
4212 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
4213 settings->expose_ports) {
4214
4215 if (!arg_settings_trusted)
5d961407 4216 log_warning("Ignoring Port= setting, file %s is not trusted.", path);
f757855e
LP
4217 else {
4218 expose_port_free_all(arg_expose_ports);
1cc6c93a 4219 arg_expose_ports = TAKE_PTR(settings->expose_ports);
f757855e
LP
4220 }
4221 }
4222
0de7acce
LP
4223 if ((arg_settings_mask & SETTING_USERNS) == 0 &&
4224 settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
4225
4226 if (!arg_settings_trusted)
5d961407 4227 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", path);
0de7acce
LP
4228 else {
4229 arg_userns_mode = settings->userns_mode;
4230 arg_uid_shift = settings->uid_shift;
4231 arg_uid_range = settings->uid_range;
4232 arg_userns_chown = settings->userns_chown;
4233 }
4234 }
4235
9c1e04d0
AP
4236 if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0)
4237 arg_notify_ready = settings->notify_ready;
4238
960e4569
LP
4239 if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) {
4240
6b000af4 4241 if (!arg_settings_trusted && !strv_isempty(settings->syscall_allow_list))
5d961407 4242 log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", path);
960e4569 4243 else {
6b000af4
LP
4244 strv_free_and_replace(arg_syscall_allow_list, settings->syscall_allow_list);
4245 strv_free_and_replace(arg_syscall_deny_list, settings->syscall_deny_list);
960e4569 4246 }
de40a303
LP
4247
4248#if HAVE_SECCOMP
4249 if (!arg_settings_trusted && settings->seccomp)
4250 log_warning("Ignoring SECCOMP filter, file %s is not trusted.", path);
4251 else {
4252 seccomp_release(arg_seccomp);
4253 arg_seccomp = TAKE_PTR(settings->seccomp);
4254 }
4255#endif
960e4569
LP
4256 }
4257
bf428efb
LP
4258 for (rl = 0; rl < _RLIMIT_MAX; rl ++) {
4259 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)))
4260 continue;
4261
4262 if (!settings->rlimit[rl])
4263 continue;
4264
4265 if (!arg_settings_trusted) {
5d961407 4266 log_warning("Ignoring Limit%s= setting, file '%s' is not trusted.", rlimit_to_string(rl), path);
bf428efb
LP
4267 continue;
4268 }
4269
4270 free_and_replace(arg_rlimit[rl], settings->rlimit[rl]);
4271 }
4272
3a9530e5
LP
4273 if ((arg_settings_mask & SETTING_HOSTNAME) == 0 &&
4274 settings->hostname)
4275 free_and_replace(arg_hostname, settings->hostname);
4276
66edd963
LP
4277 if ((arg_settings_mask & SETTING_NO_NEW_PRIVILEGES) == 0 &&
4278 settings->no_new_privileges >= 0)
4279 arg_no_new_privileges = settings->no_new_privileges;
4280
81f345df
LP
4281 if ((arg_settings_mask & SETTING_OOM_SCORE_ADJUST) == 0 &&
4282 settings->oom_score_adjust_set) {
4283
4284 if (!arg_settings_trusted)
5d961407 4285 log_warning("Ignoring OOMScoreAdjust= setting, file '%s' is not trusted.", path);
81f345df
LP
4286 else {
4287 arg_oom_score_adjust = settings->oom_score_adjust;
4288 arg_oom_score_adjust_set = true;
4289 }
4290 }
4291
d107bb7d 4292 if ((arg_settings_mask & SETTING_CPU_AFFINITY) == 0 &&
0985c7c4 4293 settings->cpu_set.set) {
d107bb7d
LP
4294
4295 if (!arg_settings_trusted)
5d961407 4296 log_warning("Ignoring CPUAffinity= setting, file '%s' is not trusted.", path);
d107bb7d 4297 else {
0985c7c4
ZJS
4298 cpu_set_reset(&arg_cpu_set);
4299 arg_cpu_set = settings->cpu_set;
4300 settings->cpu_set = (CPUSet) {};
d107bb7d
LP
4301 }
4302 }
4303
09d423e9
LP
4304 if ((arg_settings_mask & SETTING_RESOLV_CONF) == 0 &&
4305 settings->resolv_conf != _RESOLV_CONF_MODE_INVALID)
4306 arg_resolv_conf = settings->resolv_conf;
4307
4e1d6aa9
LP
4308 if ((arg_settings_mask & SETTING_LINK_JOURNAL) == 0 &&
4309 settings->link_journal != _LINK_JOURNAL_INVALID) {
4310
4311 if (!arg_settings_trusted)
4312 log_warning("Ignoring journal link setting, file '%s' is not trusted.", path);
4313 else {
4314 arg_link_journal = settings->link_journal;
4315 arg_link_journal_try = settings->link_journal_try;
4316 }
4317 }
4318
1688841f
LP
4319 if ((arg_settings_mask & SETTING_TIMEZONE) == 0 &&
4320 settings->timezone != _TIMEZONE_MODE_INVALID)
4321 arg_timezone = settings->timezone;
4322
de40a303
LP
4323 if ((arg_settings_mask & SETTING_SLICE) == 0 &&
4324 settings->slice) {
4325
4326 if (!arg_settings_trusted)
4327 log_warning("Ignoring slice setting, file '%s' is not trusted.", path);
4328 else
4329 free_and_replace(arg_slice, settings->slice);
4330 }
4331
4332 if ((arg_settings_mask & SETTING_USE_CGNS) == 0 &&
4333 settings->use_cgns >= 0) {
4334
4335 if (!arg_settings_trusted)
4336 log_warning("Ignoring cgroup namespace setting, file '%s' is not trusted.", path);
4337 else
4338 arg_use_cgns = settings->use_cgns;
4339 }
4340
4341 if ((arg_settings_mask & SETTING_CLONE_NS_FLAGS) == 0 &&
4342 settings->clone_ns_flags != (unsigned long) -1) {
4343
4344 if (!arg_settings_trusted)
4345 log_warning("Ignoring namespace setting, file '%s' is not trusted.", path);
4346 else
4347 arg_clone_ns_flags = settings->clone_ns_flags;
4348 }
4349
4350 if ((arg_settings_mask & SETTING_CONSOLE_MODE) == 0 &&
4351 settings->console_mode >= 0) {
4352
4353 if (!arg_settings_trusted)
4354 log_warning("Ignoring console mode setting, file '%s' is not trusted.", path);
4355 else
4356 arg_console_mode = settings->console_mode;
4357 }
4358
4359 /* The following properties can only be set through the OCI settings logic, not from the command line, hence we
4360 * don't consult arg_settings_mask for them. */
4361
4362 sd_bus_message_unref(arg_property_message);
4363 arg_property_message = TAKE_PTR(settings->properties);
4364
4365 arg_console_width = settings->console_width;
4366 arg_console_height = settings->console_height;
4367
b2645747 4368 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
de40a303
LP
4369 arg_extra_nodes = TAKE_PTR(settings->extra_nodes);
4370 arg_n_extra_nodes = settings->n_extra_nodes;
4371
f757855e
LP
4372 return 0;
4373}
4374
5d961407
LP
4375static int load_settings(void) {
4376 _cleanup_(settings_freep) Settings *settings = NULL;
4377 _cleanup_fclose_ FILE *f = NULL;
4378 _cleanup_free_ char *p = NULL;
4379 const char *fn, *i;
4380 int r;
4381
de40a303
LP
4382 if (arg_oci_bundle)
4383 return 0;
4384
5d961407
LP
4385 /* If all settings are masked, there's no point in looking for
4386 * the settings file */
d7a0f1f4 4387 if (FLAGS_SET(arg_settings_mask, _SETTINGS_MASK_ALL))
5d961407
LP
4388 return 0;
4389
4390 fn = strjoina(arg_machine, ".nspawn");
4391
4392 /* We first look in the admin's directories in /etc and /run */
4393 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
4394 _cleanup_free_ char *j = NULL;
4395
657ee2d8 4396 j = path_join(i, fn);
5d961407
LP
4397 if (!j)
4398 return log_oom();
4399
4400 f = fopen(j, "re");
4401 if (f) {
4402 p = TAKE_PTR(j);
4403
4404 /* By default, we trust configuration from /etc and /run */
4405 if (arg_settings_trusted < 0)
4406 arg_settings_trusted = true;
4407
4408 break;
4409 }
4410
4411 if (errno != ENOENT)
4412 return log_error_errno(errno, "Failed to open %s: %m", j);
4413 }
4414
4415 if (!f) {
4416 /* After that, let's look for a file next to the
4417 * actual image we shall boot. */
4418
4419 if (arg_image) {
4420 p = file_in_same_dir(arg_image, fn);
4421 if (!p)
4422 return log_oom();
cd6e3914 4423 } else if (arg_directory && !path_equal(arg_directory, "/")) {
5d961407
LP
4424 p = file_in_same_dir(arg_directory, fn);
4425 if (!p)
4426 return log_oom();
4427 }
4428
4429 if (p) {
4430 f = fopen(p, "re");
4431 if (!f && errno != ENOENT)
4432 return log_error_errno(errno, "Failed to open %s: %m", p);
4433
4434 /* By default, we do not trust configuration from /var/lib/machines */
4435 if (arg_settings_trusted < 0)
4436 arg_settings_trusted = false;
4437 }
4438 }
4439
4440 if (!f)
4441 return 0;
4442
4443 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
4444
4445 r = settings_load(f, p, &settings);
4446 if (r < 0)
4447 return r;
4448
4449 return merge_settings(settings, p);
4450}
4451
de40a303
LP
4452static int load_oci_bundle(void) {
4453 _cleanup_(settings_freep) Settings *settings = NULL;
4454 int r;
4455
4456 if (!arg_oci_bundle)
4457 return 0;
4458
4459 /* By default let's trust OCI bundles */
4460 if (arg_settings_trusted < 0)
4461 arg_settings_trusted = true;
4462
4463 r = oci_load(NULL, arg_oci_bundle, &settings);
4464 if (r < 0)
4465 return r;
4466
4467 return merge_settings(settings, arg_oci_bundle);
4468}
4469
3acc84eb 4470static int run_container(
2d845785 4471 DissectedImage *dissected_image,
b0067625
ZJS
4472 bool secondary,
4473 FDSet *fds,
4474 char veth_name[IFNAMSIZ], bool *veth_created,
761cf19d 4475 struct ExposeArgs *expose_args,
3acc84eb 4476 int *master, pid_t *pid, int *ret) {
b0067625
ZJS
4477
4478 static const struct sigaction sa = {
4479 .sa_handler = nop_signal_handler,
e28c7cd0 4480 .sa_flags = SA_NOCLDSTOP|SA_RESTART,
b0067625
ZJS
4481 };
4482
8e766630 4483 _cleanup_(release_lock_file) LockFile uid_shift_lock = LOCK_FILE_INIT;
b0067625
ZJS
4484 _cleanup_close_ int etc_passwd_lock = -1;
4485 _cleanup_close_pair_ int
4486 kmsg_socket_pair[2] = { -1, -1 },
4487 rtnl_socket_pair[2] = { -1, -1 },
4488 pid_socket_pair[2] = { -1, -1 },
4489 uuid_socket_pair[2] = { -1, -1 },
4490 notify_socket_pair[2] = { -1, -1 },
8199d554 4491 uid_shift_socket_pair[2] = { -1, -1 },
3acc84eb 4492 master_pty_socket_pair[2] = { -1, -1 },
8199d554
LP
4493 unified_cgroup_hierarchy_socket_pair[2] = { -1, -1};
4494
3acc84eb 4495 _cleanup_close_ int notify_socket = -1;
b0067625 4496 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
5773024d 4497 _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL;
b0067625
ZJS
4498 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
4499 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4500 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
abdb9b08 4501 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
b0067625 4502 ContainerStatus container_status = 0;
b0067625
ZJS
4503 int ifi = 0, r;
4504 ssize_t l;
4505 sigset_t mask_chld;
5b4855ab 4506 _cleanup_close_ int child_netns_fd = -1;
b0067625
ZJS
4507
4508 assert_se(sigemptyset(&mask_chld) == 0);
4509 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
4510
4511 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4512 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
4513 * check with getpwuid() if the specific user already exists. Note that /etc might be
4514 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
4515 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
4516 * really just an extra safety net. We kinda assume that the UID range we allocate from is
4517 * really ours. */
4518
4519 etc_passwd_lock = take_etc_passwd_lock(NULL);
4520 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS)
4521 return log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m");
4522 }
4523
4524 r = barrier_create(&barrier);
4525 if (r < 0)
4526 return log_error_errno(r, "Cannot initialize IPC barrier: %m");
4527
4528 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0)
4529 return log_error_errno(errno, "Failed to create kmsg socket pair: %m");
4530
4531 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0)
4532 return log_error_errno(errno, "Failed to create rtnl socket pair: %m");
4533
4534 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0)
4535 return log_error_errno(errno, "Failed to create pid socket pair: %m");
4536
4537 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0)
4538 return log_error_errno(errno, "Failed to create id socket pair: %m");
4539
4540 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0)
4541 return log_error_errno(errno, "Failed to create notify socket pair: %m");
4542
3acc84eb
FB
4543 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, master_pty_socket_pair) < 0)
4544 return log_error_errno(errno, "Failed to create console socket pair: %m");
4545
b0067625
ZJS
4546 if (arg_userns_mode != USER_NAMESPACE_NO)
4547 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0)
4548 return log_error_errno(errno, "Failed to create uid shift socket pair: %m");
4549
8199d554
LP
4550 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN)
4551 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, unified_cgroup_hierarchy_socket_pair) < 0)
4552 return log_error_errno(errno, "Failed to create unified cgroup socket pair: %m");
4553
b0067625
ZJS
4554 /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
4555 * parent's blocking calls and give it a chance to call wait() and terminate. */
4556 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
4557 if (r < 0)
4558 return log_error_errno(errno, "Failed to change the signal mask: %m");
4559
4560 r = sigaction(SIGCHLD, &sa, NULL);
4561 if (r < 0)
4562 return log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
4563
d7bea6b6 4564 if (arg_network_namespace_path) {
5b4855ab
DDM
4565 child_netns_fd = open(arg_network_namespace_path, O_RDONLY|O_NOCTTY|O_CLOEXEC);
4566 if (child_netns_fd < 0)
d7bea6b6
DP
4567 return log_error_errno(errno, "Cannot open file %s: %m", arg_network_namespace_path);
4568
5b4855ab 4569 r = fd_is_network_ns(child_netns_fd);
6619ad88
LP
4570 if (r == -EUCLEAN)
4571 log_debug_errno(r, "Cannot determine if passed network namespace path '%s' really refers to a network namespace, assuming it does.", arg_network_namespace_path);
4572 else if (r < 0)
d7bea6b6 4573 return log_error_errno(r, "Failed to check %s fs type: %m", arg_network_namespace_path);
c6147113
LP
4574 else if (r == 0)
4575 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
4576 "Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path);
d7bea6b6
DP
4577 }
4578
b0067625
ZJS
4579 *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
4580 if (*pid < 0)
4581 return log_error_errno(errno, "clone() failed%s: %m",
4582 errno == EINVAL ?
4583 ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
4584
4585 if (*pid == 0) {
4586 /* The outer child only has a file system namespace. */
4587 barrier_set_role(&barrier, BARRIER_CHILD);
4588
b0067625
ZJS
4589 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
4590 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4591 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
4592 uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
4593 notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
3acc84eb 4594 master_pty_socket_pair[0] = safe_close(master_pty_socket_pair[0]);
b0067625 4595 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
8199d554 4596 unified_cgroup_hierarchy_socket_pair[0] = safe_close(unified_cgroup_hierarchy_socket_pair[0]);
b0067625
ZJS
4597
4598 (void) reset_all_signal_handlers();
4599 (void) reset_signal_mask();
4600
4601 r = outer_child(&barrier,
4602 arg_directory,
2d845785 4603 dissected_image,
b0067625
ZJS
4604 secondary,
4605 pid_socket_pair[1],
4606 uuid_socket_pair[1],
4607 notify_socket_pair[1],
4608 kmsg_socket_pair[1],
4609 rtnl_socket_pair[1],
4610 uid_shift_socket_pair[1],
3acc84eb 4611 master_pty_socket_pair[1],
8199d554 4612 unified_cgroup_hierarchy_socket_pair[1],
d7bea6b6 4613 fds,
5b4855ab 4614 child_netns_fd);
b0067625
ZJS
4615 if (r < 0)
4616 _exit(EXIT_FAILURE);
4617
4618 _exit(EXIT_SUCCESS);
4619 }
4620
4621 barrier_set_role(&barrier, BARRIER_PARENT);
4622
e4077ff6 4623 fdset_close(fds);
b0067625
ZJS
4624
4625 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4626 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4627 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
4628 uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
4629 notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
3acc84eb 4630 master_pty_socket_pair[1] = safe_close(master_pty_socket_pair[1]);
b0067625 4631 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
8199d554 4632 unified_cgroup_hierarchy_socket_pair[1] = safe_close(unified_cgroup_hierarchy_socket_pair[1]);
b0067625
ZJS
4633
4634 if (arg_userns_mode != USER_NAMESPACE_NO) {
4635 /* The child just let us know the UID shift it might have read from the image. */
4636 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
4637 if (l < 0)
4638 return log_error_errno(errno, "Failed to read UID shift: %m");
c6147113
LP
4639 if (l != sizeof arg_uid_shift)
4640 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading UID shift.");
b0067625
ZJS
4641
4642 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4643 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
4644 * image, but if that's already in use, pick a new one, and report back to the child,
4645 * which one we now picked. */
4646
4647 r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
4648 if (r < 0)
4649 return log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
4650
4651 l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
4652 if (l < 0)
4653 return log_error_errno(errno, "Failed to send UID shift: %m");
c6147113
LP
4654 if (l != sizeof arg_uid_shift)
4655 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short write while writing UID shift.");
b0067625
ZJS
4656 }
4657 }
4658
8199d554
LP
4659 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
4660 /* The child let us know the support cgroup mode it might have read from the image. */
4661 l = recv(unified_cgroup_hierarchy_socket_pair[0], &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), 0);
4662 if (l < 0)
4663 return log_error_errno(errno, "Failed to read cgroup mode: %m");
c6147113
LP
4664 if (l != sizeof(arg_unified_cgroup_hierarchy))
4665 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading cgroup mode (%zu bytes).%s",
4666 l, l == 0 ? " The child is most likely dead." : "");
8199d554
LP
4667 }
4668
b0067625 4669 /* Wait for the outer child. */
d2e0ac3d
LP
4670 r = wait_for_terminate_and_check("(sd-namespace)", *pid, WAIT_LOG_ABNORMAL);
4671 if (r < 0)
4672 return r;
4673 if (r != EXIT_SUCCESS)
4674 return -EIO;
b0067625
ZJS
4675
4676 /* And now retrieve the PID of the inner child. */
4677 l = recv(pid_socket_pair[0], pid, sizeof *pid, 0);
4678 if (l < 0)
4679 return log_error_errno(errno, "Failed to read inner child PID: %m");
c6147113
LP
4680 if (l != sizeof *pid)
4681 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading inner child PID.");
b0067625
ZJS
4682
4683 /* We also retrieve container UUID in case it was generated by outer child */
4684 l = recv(uuid_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
4685 if (l < 0)
4686 return log_error_errno(errno, "Failed to read container machine ID: %m");
c6147113
LP
4687 if (l != sizeof(arg_uuid))
4688 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading container machined ID.");
b0067625
ZJS
4689
4690 /* We also retrieve the socket used for notifications generated by outer child */
4691 notify_socket = receive_one_fd(notify_socket_pair[0], 0);
4692 if (notify_socket < 0)
4693 return log_error_errno(notify_socket,
4694 "Failed to receive notification socket from the outer child: %m");
4695
4696 log_debug("Init process invoked as PID "PID_FMT, *pid);
4697
4698 if (arg_userns_mode != USER_NAMESPACE_NO) {
c6147113
LP
4699 if (!barrier_place_and_sync(&barrier)) /* #1 */
4700 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
b0067625
ZJS
4701
4702 r = setup_uid_map(*pid);
4703 if (r < 0)
4704 return r;
4705
4706 (void) barrier_place(&barrier); /* #2 */
4707 }
4708
4709 if (arg_private_network) {
75116558
PS
4710 if (!arg_network_namespace_path) {
4711 /* Wait until the child has unshared its network namespace. */
c6147113
LP
4712 if (!barrier_place_and_sync(&barrier)) /* #3 */
4713 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early");
75116558
PS
4714 }
4715
5b4855ab
DDM
4716 if (child_netns_fd < 0) {
4717 /* Make sure we have an open file descriptor to the child's network
4718 * namespace so it stays alive even if the child exits. */
4719 r = namespace_open(*pid, NULL, NULL, &child_netns_fd, NULL, NULL);
4720 if (r < 0)
4721 return log_error_errno(r, "Failed to open child network namespace: %m");
4722 }
4723
4724 r = move_network_interfaces(child_netns_fd, arg_network_interfaces);
b0067625
ZJS
4725 if (r < 0)
4726 return r;
4727
4728 if (arg_network_veth) {
4729 r = setup_veth(arg_machine, *pid, veth_name,
4730 arg_network_bridge || arg_network_zone);
4731 if (r < 0)
4732 return r;
4733 else if (r > 0)
4734 ifi = r;
4735
4736 if (arg_network_bridge) {
4737 /* Add the interface to a bridge */
4738 r = setup_bridge(veth_name, arg_network_bridge, false);
4739 if (r < 0)
4740 return r;
4741 if (r > 0)
4742 ifi = r;
4743 } else if (arg_network_zone) {
4744 /* Add the interface to a bridge, possibly creating it */
4745 r = setup_bridge(veth_name, arg_network_zone, true);
4746 if (r < 0)
4747 return r;
4748 if (r > 0)
4749 ifi = r;
4750 }
4751 }
4752
4753 r = setup_veth_extra(arg_machine, *pid, arg_network_veth_extra);
4754 if (r < 0)
4755 return r;
4756
4757 /* We created the primary and extra veth links now; let's remember this, so that we know to
4758 remove them later on. Note that we don't bother with removing veth links that were created
4759 here when their setup failed half-way, because in that case the kernel should be able to
4760 remove them on its own, since they cannot be referenced by anything yet. */
4761 *veth_created = true;
4762
4763 r = setup_macvlan(arg_machine, *pid, arg_network_macvlan);
4764 if (r < 0)
4765 return r;
4766
4767 r = setup_ipvlan(arg_machine, *pid, arg_network_ipvlan);
4768 if (r < 0)
4769 return r;
4770 }
4771
abdb9b08
LP
4772 if (arg_register || !arg_keep_unit) {
4773 r = sd_bus_default_system(&bus);
4774 if (r < 0)
4775 return log_error_errno(r, "Failed to open system bus: %m");
e5a2d8b5
LP
4776
4777 r = sd_bus_set_close_on_exit(bus, false);
4778 if (r < 0)
4779 return log_error_errno(r, "Failed to disable close-on-exit behaviour: %m");
abdb9b08
LP
4780 }
4781
4782 if (!arg_keep_unit) {
4783 /* When a new scope is created for this container, then we'll be registered as its controller, in which
4784 * case PID 1 will send us a friendly RequestStop signal, when it is asked to terminate the
4785 * scope. Let's hook into that, and cleanly shut down the container, and print a friendly message. */
4786
75152a4d
LP
4787 r = sd_bus_match_signal_async(
4788 bus,
4789 NULL,
4790 "org.freedesktop.systemd1",
4791 NULL,
4792 "org.freedesktop.systemd1.Scope",
4793 "RequestStop",
4794 on_request_stop, NULL, PID_TO_PTR(*pid));
abdb9b08 4795 if (r < 0)
75152a4d 4796 return log_error_errno(r, "Failed to request RequestStop match: %m");
abdb9b08
LP
4797 }
4798
b0067625
ZJS
4799 if (arg_register) {
4800 r = register_machine(
abdb9b08 4801 bus,
b0067625
ZJS
4802 arg_machine,
4803 *pid,
4804 arg_directory,
4805 arg_uuid,
4806 ifi,
4807 arg_slice,
4808 arg_custom_mounts, arg_n_custom_mounts,
4809 arg_kill_signal,
4810 arg_property,
de40a303 4811 arg_property_message,
b0067625
ZJS
4812 arg_keep_unit,
4813 arg_container_service_name);
4814 if (r < 0)
4815 return r;
abdb9b08 4816
cd2dfc6f
LP
4817 } else if (!arg_keep_unit) {
4818 r = allocate_scope(
abdb9b08 4819 bus,
cd2dfc6f
LP
4820 arg_machine,
4821 *pid,
4822 arg_slice,
4823 arg_custom_mounts, arg_n_custom_mounts,
4824 arg_kill_signal,
de40a303
LP
4825 arg_property,
4826 arg_property_message);
cd2dfc6f
LP
4827 if (r < 0)
4828 return r;
4829
4830 } else if (arg_slice || arg_property)
4831 log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect.");
b0067625 4832
27da7ef0 4833 r = create_subcgroup(*pid, arg_keep_unit, arg_unified_cgroup_hierarchy);
b0067625
ZJS
4834 if (r < 0)
4835 return r;
4836
27da7ef0 4837 r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
720f0a2f
LP
4838 if (r < 0)
4839 return r;
b0067625 4840
de54e02d 4841 r = chown_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
b0067625
ZJS
4842 if (r < 0)
4843 return r;
4844
4845 /* Notify the child that the parent is ready with all
4846 * its setup (including cgroup-ification), and that
4847 * the child can now hand over control to the code to
4848 * run inside the container. */
75116558 4849 (void) barrier_place(&barrier); /* #4 */
b0067625
ZJS
4850
4851 /* Block SIGCHLD here, before notifying child.
4852 * process_pty() will handle it with the other signals. */
4853 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
4854
4855 /* Reset signal to default */
4856 r = default_signals(SIGCHLD, -1);
4857 if (r < 0)
4858 return log_error_errno(r, "Failed to reset SIGCHLD: %m");
4859
4860 r = sd_event_new(&event);
4861 if (r < 0)
4862 return log_error_errno(r, "Failed to get default event source: %m");
4863
8fd010bb
LP
4864 (void) sd_event_set_watchdog(event, true);
4865
abdb9b08
LP
4866 if (bus) {
4867 r = sd_bus_attach_event(bus, event, 0);
4868 if (r < 0)
4869 return log_error_errno(r, "Failed to attach bus to event loop: %m");
4870 }
4871
e96ceaba 4872 r = setup_notify_parent(event, notify_socket, PID_TO_PTR(*pid), &notify_event_source);
b0067625
ZJS
4873 if (r < 0)
4874 return r;
4875
4876 /* Let the child know that we are ready and wait that the child is completely ready now. */
c6147113
LP
4877 if (!barrier_place_and_sync(&barrier)) /* #5 */
4878 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
b0067625 4879
38ccb557 4880 /* At this point we have made use of the UID we picked, and thus nss-systemd/systemd-machined.service
b0067625
ZJS
4881 * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
4882 etc_passwd_lock = safe_close(etc_passwd_lock);
4883
04f590a4
LP
4884 (void) sd_notifyf(false,
4885 "STATUS=Container running.\n"
4886 "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
b0067625 4887 if (!arg_notify_ready)
919f5ae0 4888 (void) sd_notify(false, "READY=1\n");
b0067625
ZJS
4889
4890 if (arg_kill_signal > 0) {
4891 /* Try to kill the init system on SIGINT or SIGTERM */
919f5ae0
LP
4892 (void) sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
4893 (void) sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
b0067625
ZJS
4894 } else {
4895 /* Immediately exit */
919f5ae0
LP
4896 (void) sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4897 (void) sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
b0067625
ZJS
4898 }
4899
6916b164 4900 /* Exit when the child exits */
919f5ae0 4901 (void) sd_event_add_signal(event, NULL, SIGCHLD, on_sigchld, PID_TO_PTR(*pid));
b0067625
ZJS
4902
4903 if (arg_expose_ports) {
761cf19d 4904 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, expose_args, &rtnl);
b0067625
ZJS
4905 if (r < 0)
4906 return r;
4907
761cf19d 4908 (void) expose_port_execute(rtnl, &expose_args->fw_ctx, arg_expose_ports, &expose_args->address);
b0067625
ZJS
4909 }
4910
4911 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4912
3acc84eb
FB
4913 if (arg_console_mode != CONSOLE_PIPE) {
4914 _cleanup_close_ int fd = -1;
4915 PTYForwardFlags flags = 0;
de40a303 4916
3acc84eb
FB
4917 /* Retrieve the master pty allocated by inner child */
4918 fd = receive_one_fd(master_pty_socket_pair[0], 0);
4919 if (fd < 0)
4920 return log_error_errno(fd, "Failed to receive master pty from the inner child: %m");
4921
4922 switch (arg_console_mode) {
de40a303 4923
3acc84eb
FB
4924 case CONSOLE_READ_ONLY:
4925 flags |= PTY_FORWARD_READ_ONLY;
4926
4927 _fallthrough_;
4928
4929 case CONSOLE_INTERACTIVE:
4930 flags |= PTY_FORWARD_IGNORE_VHANGUP;
4931
4932 r = pty_forward_new(event, fd, flags, &forward);
4933 if (r < 0)
4934 return log_error_errno(r, "Failed to create PTY forwarder: %m");
4935
4936 if (arg_console_width != (unsigned) -1 || arg_console_height != (unsigned) -1)
4937 (void) pty_forward_set_width_height(forward,
4938 arg_console_width,
4939 arg_console_height);
4940 break;
4941
4942 default:
4943 assert(arg_console_mode == CONSOLE_PASSIVE);
4944 }
4945
4946 *master = TAKE_FD(fd);
de40a303 4947 }
b0067625
ZJS
4948
4949 r = sd_event_loop(event);
4950 if (r < 0)
4951 return log_error_errno(r, "Failed to run event loop: %m");
4952
de40a303
LP
4953 if (forward) {
4954 char last_char = 0;
b0067625 4955
de40a303
LP
4956 (void) pty_forward_get_last_char(forward, &last_char);
4957 forward = pty_forward_free(forward);
b0067625 4958
de40a303
LP
4959 if (!arg_quiet && last_char != '\n')
4960 putc('\n', stdout);
4961 }
b0067625
ZJS
4962
4963 /* Kill if it is not dead yet anyway */
0bb0a9fa
ZJS
4964 if (!arg_register && !arg_keep_unit && bus)
4965 terminate_scope(bus, arg_machine);
b0067625
ZJS
4966
4967 /* Normally redundant, but better safe than sorry */
c67b0082 4968 (void) kill(*pid, SIGKILL);
b0067625 4969
5b4855ab
DDM
4970 if (arg_private_network) {
4971 /* Move network interfaces back to the parent network namespace. We use `safe_fork`
4972 * to avoid having to move the parent to the child network namespace. */
4973 r = safe_fork(NULL, FORK_RESET_SIGNALS|FORK_DEATHSIG|FORK_WAIT|FORK_LOG, NULL);
4974 if (r < 0)
4975 return r;
4976
4977 if (r == 0) {
4978 _cleanup_close_ int parent_netns_fd = -1;
4979
4980 r = namespace_open(getpid(), NULL, NULL, &parent_netns_fd, NULL, NULL);
4981 if (r < 0) {
4982 log_error_errno(r, "Failed to open parent network namespace: %m");
4983 _exit(EXIT_FAILURE);
4984 }
4985
4986 r = namespace_enter(-1, -1, child_netns_fd, -1, -1);
4987 if (r < 0) {
4988 log_error_errno(r, "Failed to enter child network namespace: %m");
4989 _exit(EXIT_FAILURE);
4990 }
4991
4992 r = move_network_interfaces(parent_netns_fd, arg_network_interfaces);
4993 if (r < 0)
4994 log_error_errno(r, "Failed to move network interfaces back to parent network namespace: %m");
4995
4996 _exit(r < 0 ? EXIT_FAILURE : EXIT_SUCCESS);
4997 }
4998 }
4999
b0067625
ZJS
5000 r = wait_for_container(*pid, &container_status);
5001 *pid = 0;
5002
0bb0a9fa
ZJS
5003 /* Tell machined that we are gone. */
5004 if (bus)
5005 (void) unregister_machine(bus, arg_machine);
5006
b0067625
ZJS
5007 if (r < 0)
5008 /* We failed to wait for the container, or the container exited abnormally. */
5009 return r;
5010 if (r > 0 || container_status == CONTAINER_TERMINATED) {
27e29a1e
ZJS
5011 /* r > 0 → The container exited with a non-zero status.
5012 * As a special case, we need to replace 133 with a different value,
5013 * because 133 is special-cased in the service file to reboot the container.
5014 * otherwise → The container exited with zero status and a reboot was not requested.
5015 */
2a49b612 5016 if (r == EXIT_FORCE_RESTART)
27e29a1e 5017 r = EXIT_FAILURE; /* replace 133 with the general failure code */
b0067625 5018 *ret = r;
b0067625
ZJS
5019 return 0; /* finito */
5020 }
5021
5022 /* CONTAINER_REBOOTED, loop again */
5023
5024 if (arg_keep_unit) {
5025 /* Special handling if we are running as a service: instead of simply
5026 * restarting the machine we want to restart the entire service, so let's
5027 * inform systemd about this with the special exit code 133. The service
5028 * file uses RestartForceExitStatus=133 so that this results in a full
5029 * nspawn restart. This is necessary since we might have cgroup parameters
5030 * set we want to have flushed out. */
2a49b612
ZJS
5031 *ret = EXIT_FORCE_RESTART;
5032 return 0; /* finito */
b0067625
ZJS
5033 }
5034
761cf19d 5035 expose_port_flush(&expose_args->fw_ctx, arg_expose_ports, &expose_args->address);
b0067625
ZJS
5036
5037 (void) remove_veth_links(veth_name, arg_network_veth_extra);
5038 *veth_created = false;
5039 return 1; /* loop again */
5040}
5041
bf428efb 5042static int initialize_rlimits(void) {
bf428efb
LP
5043 /* The default resource limits the kernel passes to PID 1, as per kernel 4.16. Let's pass our container payload
5044 * the same values as the kernel originally passed to PID 1, in order to minimize differences between host and
5045 * container execution environments. */
5046
5047 static const struct rlimit kernel_defaults[_RLIMIT_MAX] = {
5048 [RLIMIT_AS] = { RLIM_INFINITY, RLIM_INFINITY },
5049 [RLIMIT_CORE] = { 0, RLIM_INFINITY },
5050 [RLIMIT_CPU] = { RLIM_INFINITY, RLIM_INFINITY },
5051 [RLIMIT_DATA] = { RLIM_INFINITY, RLIM_INFINITY },
5052 [RLIMIT_FSIZE] = { RLIM_INFINITY, RLIM_INFINITY },
5053 [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY },
5054 [RLIMIT_MEMLOCK] = { 65536, 65536 },
5055 [RLIMIT_MSGQUEUE] = { 819200, 819200 },
5056 [RLIMIT_NICE] = { 0, 0 },
5057 [RLIMIT_NOFILE] = { 1024, 4096 },
5058 [RLIMIT_RSS] = { RLIM_INFINITY, RLIM_INFINITY },
5059 [RLIMIT_RTPRIO] = { 0, 0 },
5060 [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY },
5061 [RLIMIT_STACK] = { 8388608, RLIM_INFINITY },
5062
5063 /* The kernel scales the default for RLIMIT_NPROC and RLIMIT_SIGPENDING based on the system's amount of
5064 * RAM. To provide best compatibility we'll read these limits off PID 1 instead of hardcoding them
5065 * here. This is safe as we know that PID 1 doesn't change these two limits and thus the original
5066 * kernel's initialization should still be valid during runtime — at least if PID 1 is systemd. Note
5067 * that PID 1 changes a number of other resource limits during early initialization which is why we
5068 * don't read the other limits from PID 1 but prefer the static table above. */
5069 };
5070
5071 int rl;
5072
5073 for (rl = 0; rl < _RLIMIT_MAX; rl++) {
bf428efb
LP
5074 /* Let's only fill in what the user hasn't explicitly configured anyway */
5075 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)) == 0) {
5076 const struct rlimit *v;
5077 struct rlimit buffer;
5078
5079 if (IN_SET(rl, RLIMIT_NPROC, RLIMIT_SIGPENDING)) {
5080 /* For these two let's read the limits off PID 1. See above for an explanation. */
5081
5082 if (prlimit(1, rl, NULL, &buffer) < 0)
5083 return log_error_errno(errno, "Failed to read resource limit RLIMIT_%s of PID 1: %m", rlimit_to_string(rl));
5084
5085 v = &buffer;
5086 } else
5087 v = kernel_defaults + rl;
5088
5089 arg_rlimit[rl] = newdup(struct rlimit, v, 1);
5090 if (!arg_rlimit[rl])
5091 return log_oom();
5092 }
5093
5094 if (DEBUG_LOGGING) {
5095 _cleanup_free_ char *k = NULL;
5096
5097 (void) rlimit_format(arg_rlimit[rl], &k);
5098 log_debug("Setting RLIMIT_%s to %s.", rlimit_to_string(rl), k);
5099 }
5100 }
5101
5102 return 0;
5103}
5104
287b7376
LP
5105static int cant_be_in_netns(void) {
5106 union sockaddr_union sa = {
5107 .un = {
5108 .sun_family = AF_UNIX,
5109 .sun_path = "/run/udev/control",
5110 },
5111 };
5112 char udev_path[STRLEN("/proc//ns/net") + DECIMAL_STR_MAX(pid_t)];
5113 _cleanup_free_ char *udev_ns = NULL, *our_ns = NULL;
5114 _cleanup_close_ int fd = -1;
5115 struct ucred ucred;
5116 int r;
5117
5118 /* Check if we are in the same netns as udev. If we aren't, then device monitoring (and thus waiting
5119 * for loopback block devices) won't work, and we will hang. Detect this case and exit early with a
5120 * nice message. */
5121
5122 if (!arg_image) /* only matters if --image= us used, i.e. we actually need to use loopback devices */
5123 return 0;
5124
5125 fd = socket(AF_UNIX, SOCK_SEQPACKET|SOCK_NONBLOCK|SOCK_CLOEXEC, 0);
5126 if (fd < 0)
5127 return log_error_errno(errno, "Failed to allocate udev control socket: %m");
5128
5129 if (connect(fd, &sa.un, SOCKADDR_UN_LEN(sa.un)) < 0) {
5130
5131 if (errno == ENOENT || ERRNO_IS_DISCONNECT(errno))
5132 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
5133 "Sorry, but --image= requires access to the host's /run/ hierarchy, since we need access to udev.");
5134
5135 return log_error_errno(errno, "Failed to connect socket to udev control socket: %m");
5136 }
5137
5138 r = getpeercred(fd, &ucred);
5139 if (r < 0)
5140 return log_error_errno(r, "Failed to determine peer of udev control socket: %m");
5141
5142 xsprintf(udev_path, "/proc/" PID_FMT "/ns/net", ucred.pid);
5143 r = readlink_malloc(udev_path, &udev_ns);
5144 if (r < 0)
5145 return log_error_errno(r, "Failed to read network namespace of udev: %m");
5146
5147 r = readlink_malloc("/proc/self/ns/net", &our_ns);
5148 if (r < 0)
5149 return log_error_errno(r, "Failed to read our own network namespace: %m");
5150
5151 if (!streq(our_ns, udev_ns))
5152 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
5153 "Sorry, but --image= is only supported in the main network namespace, since we need access to udev/AF_NETLINK.");
5154 return 0;
5155}
5156
44dbef90 5157static int run(int argc, char *argv[]) {
7bf011e3
LP
5158 bool secondary = false, remove_directory = false, remove_image = false,
5159 veth_created = false, remove_tmprootdir = false;
2d845785 5160 _cleanup_close_ int master = -1;
03cfe0d5 5161 _cleanup_fdset_free_ FDSet *fds = NULL;
2d845785 5162 int r, n_fd_passed, ret = EXIT_SUCCESS;
5aa3eba5 5163 char veth_name[IFNAMSIZ] = "";
761cf19d 5164 struct ExposeArgs expose_args = {};
8e766630 5165 _cleanup_(release_lock_file) LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
c67b0082 5166 char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
2d845785 5167 _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
18b5886e
LP
5168 _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
5169 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
761cf19d 5170 _cleanup_(fw_ctx_freep) FirewallContext *fw_ctx = NULL;
7bf011e3 5171 pid_t pid = 0;
03cfe0d5
LP
5172
5173 log_parse_environment();
5174 log_open();
415fc41c 5175
03cfe0d5
LP
5176 r = parse_argv(argc, argv);
5177 if (r <= 0)
5178 goto finish;
5179
38ee19c0
ZJS
5180 if (geteuid() != 0) {
5181 r = log_warning_errno(SYNTHETIC_ERRNO(EPERM),
5182 argc >= 2 ? "Need to be root." :
5183 "Need to be root (and some arguments are usually required).\nHint: try --help");
03cfe0d5 5184 goto finish;
38ee19c0 5185 }
fba868fa 5186
287b7376
LP
5187 r = cant_be_in_netns();
5188 if (r < 0)
5189 goto finish;
5190
bf428efb
LP
5191 r = initialize_rlimits();
5192 if (r < 0)
5193 goto finish;
5194
de40a303
LP
5195 r = load_oci_bundle();
5196 if (r < 0)
5197 goto finish;
5198
f757855e
LP
5199 r = determine_names();
5200 if (r < 0)
5201 goto finish;
5202
5203 r = load_settings();
5204 if (r < 0)
5205 goto finish;
5206
d4d99bc6 5207 r = cg_unified();
5eee8290
LP
5208 if (r < 0) {
5209 log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
5210 goto finish;
5211 }
5212
f757855e
LP
5213 r = verify_arguments();
5214 if (r < 0)
5215 goto finish;
03cfe0d5 5216
49048684
ZJS
5217 /* Reapply environment settings. */
5218 (void) detect_unified_cgroup_hierarchy_from_environment();
8199d554 5219
2949ff26
LP
5220 /* Ignore SIGPIPE here, because we use splice() on the ptyfwd stuff and that will generate SIGPIPE if
5221 * the result is closed. Note that the container payload child will reset signal mask+handler anyway,
5222 * so just turning this off here means we only turn it off in nspawn itself, not any children. */
5223 (void) ignore_signals(SIGPIPE, -1);
5224
03cfe0d5
LP
5225 n_fd_passed = sd_listen_fds(false);
5226 if (n_fd_passed > 0) {
5227 r = fdset_new_listen_fds(&fds, false);
5228 if (r < 0) {
5229 log_error_errno(r, "Failed to collect file descriptors: %m");
5230 goto finish;
5231 }
5232 }
5233
83e803a9
ZJS
5234 /* The "default" umask. This is appropriate for most file and directory
5235 * operations performed by nspawn, and is the umask that will be used for
5236 * the child. Functions like copy_devnodes() change the umask temporarily. */
5237 umask(0022);
5238
03cfe0d5
LP
5239 if (arg_directory) {
5240 assert(!arg_image);
5241
b35ca61a
LP
5242 /* Safety precaution: let's not allow running images from the live host OS image, as long as
5243 * /var from the host will propagate into container dynamically (because bad things happen if
5244 * two systems write to the same /var). Let's allow it for the special cases where /var is
5245 * either copied (i.e. --ephemeral) or replaced (i.e. --volatile=yes|state). */
5246 if (path_equal(arg_directory, "/") && !(arg_ephemeral || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_STATE))) {
5247 log_error("Spawning container on root directory is not supported. Consider using --ephemeral, --volatile=yes or --volatile=state.");
03cfe0d5
LP
5248 r = -EINVAL;
5249 goto finish;
5250 }
5251
5252 if (arg_ephemeral) {
5253 _cleanup_free_ char *np = NULL;
5254
8d4aa2bb 5255 r = chase_symlinks_and_update(&arg_directory, 0);
3f342ec4
LP
5256 if (r < 0)
5257 goto finish;
5258
7bf011e3
LP
5259 /* If the specified path is a mount point we generate the new snapshot immediately
5260 * inside it under a random name. However if the specified is not a mount point we
5261 * create the new snapshot in the parent directory, just next to it. */
e1873695 5262 r = path_is_mount_point(arg_directory, NULL, 0);
03cfe0d5
LP
5263 if (r < 0) {
5264 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
5265 goto finish;
5266 }
5267 if (r > 0)
770b5ce4 5268 r = tempfn_random_child(arg_directory, "machine.", &np);
03cfe0d5 5269 else
770b5ce4 5270 r = tempfn_random(arg_directory, "machine.", &np);
03cfe0d5 5271 if (r < 0) {
0f3be6ca 5272 log_error_errno(r, "Failed to generate name for directory snapshot: %m");
03cfe0d5
LP
5273 goto finish;
5274 }
5275
6992459c 5276 /* We take an exclusive lock on this image, since it's our private, ephemeral copy
162392b7 5277 * only owned by us and no one else. */
6992459c 5278 r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
03cfe0d5
LP
5279 if (r < 0) {
5280 log_error_errno(r, "Failed to lock %s: %m", np);
5281 goto finish;
5282 }
5283
7bf011e3
LP
5284 {
5285 BLOCK_SIGNALS(SIGINT);
5286 r = btrfs_subvol_snapshot(arg_directory, np,
5287 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
5288 BTRFS_SNAPSHOT_FALLBACK_COPY |
5289 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
5290 BTRFS_SNAPSHOT_RECURSIVE |
5291 BTRFS_SNAPSHOT_QUOTA |
5292 BTRFS_SNAPSHOT_SIGINT);
5293 }
5294 if (r == -EINTR) {
5295 log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", np);
5296 goto finish;
5297 }
03cfe0d5
LP
5298 if (r < 0) {
5299 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
5300 goto finish;
ec16945e
LP
5301 }
5302
1cc6c93a 5303 free_and_replace(arg_directory, np);
17cbb288 5304 remove_directory = true;
30535c16 5305 } else {
cb638b5e 5306 r = chase_symlinks_and_update(&arg_directory, arg_template ? CHASE_NONEXISTENT : 0);
8d4aa2bb
LP
5307 if (r < 0)
5308 goto finish;
5309
30535c16
LP
5310 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
5311 if (r == -EBUSY) {
5312 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
5313 goto finish;
5314 }
5315 if (r < 0) {
5316 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
476b8254 5317 goto finish;
30535c16
LP
5318 }
5319
5320 if (arg_template) {
8d4aa2bb 5321 r = chase_symlinks_and_update(&arg_template, 0);
3f342ec4
LP
5322 if (r < 0)
5323 goto finish;
5324
7bf011e3
LP
5325 {
5326 BLOCK_SIGNALS(SIGINT);
5327 r = btrfs_subvol_snapshot(arg_template, arg_directory,
5328 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
5329 BTRFS_SNAPSHOT_FALLBACK_COPY |
5330 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
5331 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
5332 BTRFS_SNAPSHOT_RECURSIVE |
5333 BTRFS_SNAPSHOT_QUOTA |
5334 BTRFS_SNAPSHOT_SIGINT);
5335 }
ff6c6cc1
LP
5336 if (r == -EEXIST)
5337 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
5338 "Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
7bf011e3
LP
5339 else if (r == -EINTR) {
5340 log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", arg_directory);
5341 goto finish;
5342 } else if (r < 0) {
83521414 5343 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
30535c16 5344 goto finish;
ff6c6cc1
LP
5345 } else
5346 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
5347 "Populated %s from template %s.", arg_directory, arg_template);
30535c16 5348 }
ec16945e
LP
5349 }
5350
7732f92b 5351 if (arg_start_mode == START_BOOT) {
a5201ed6 5352 const char *p;
c9fe05e0 5353
a5201ed6
LP
5354 if (arg_pivot_root_new)
5355 p = prefix_roota(arg_directory, arg_pivot_root_new);
5356 else
5357 p = arg_directory;
c9fe05e0
AR
5358
5359 if (path_is_os_tree(p) <= 0) {
5360 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", p);
ec16945e 5361 r = -EINVAL;
1b9e5b12
LP
5362 goto finish;
5363 }
5364 } else {
c9fe05e0
AR
5365 const char *p, *q;
5366
a5201ed6
LP
5367 if (arg_pivot_root_new)
5368 p = prefix_roota(arg_directory, arg_pivot_root_new);
5369 else
5370 p = arg_directory;
c9fe05e0
AR
5371
5372 q = strjoina(p, "/usr/");
1b9e5b12 5373
c9fe05e0
AR
5374 if (laccess(q, F_OK) < 0) {
5375 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", p);
ec16945e 5376 r = -EINVAL;
1b9e5b12 5377 goto finish;
1b9e5b12
LP
5378 }
5379 }
ec16945e 5380
6b9132a9 5381 } else {
e7cbe5cb 5382 DissectImageFlags dissect_image_flags = DISSECT_IMAGE_REQUIRE_ROOT | DISSECT_IMAGE_RELAX_VAR_CHECK;
ec16945e
LP
5383 assert(arg_image);
5384 assert(!arg_template);
5385
8d4aa2bb 5386 r = chase_symlinks_and_update(&arg_image, 0);
3f342ec4
LP
5387 if (r < 0)
5388 goto finish;
5389
0f3be6ca
LP
5390 if (arg_ephemeral) {
5391 _cleanup_free_ char *np = NULL;
5392
5393 r = tempfn_random(arg_image, "machine.", &np);
5394 if (r < 0) {
5395 log_error_errno(r, "Failed to generate name for image snapshot: %m");
5396 goto finish;
5397 }
5398
6992459c
LP
5399 /* Always take an exclusive lock on our own ephemeral copy. */
5400 r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
0f3be6ca
LP
5401 if (r < 0) {
5402 r = log_error_errno(r, "Failed to create image lock: %m");
5403 goto finish;
5404 }
5405
7bf011e3
LP
5406 {
5407 BLOCK_SIGNALS(SIGINT);
5408 r = copy_file(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600, FS_NOCOW_FL, FS_NOCOW_FL, COPY_REFLINK|COPY_CRTIME|COPY_SIGINT);
5409 }
5410 if (r == -EINTR) {
5411 log_error_errno(r, "Interrupted while copying image file to %s, removed again.", np);
5412 goto finish;
5413 }
0f3be6ca
LP
5414 if (r < 0) {
5415 r = log_error_errno(r, "Failed to copy image file: %m");
5416 goto finish;
5417 }
5418
1cc6c93a 5419 free_and_replace(arg_image, np);
0f3be6ca
LP
5420 remove_image = true;
5421 } else {
5422 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
5423 if (r == -EBUSY) {
5424 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
5425 goto finish;
5426 }
5427 if (r < 0) {
5428 r = log_error_errno(r, "Failed to create image lock: %m");
5429 goto finish;
5430 }
4623e8e6 5431
89e62e0b
LP
5432 r = verity_settings_load(
5433 &arg_verity_settings,
5434 arg_image, NULL, NULL);
e7cbe5cb
LB
5435 if (r < 0) {
5436 log_error_errno(r, "Failed to read verity artefacts for %s: %m", arg_image);
5437 goto finish;
78ebe980 5438 }
89e62e0b
LP
5439
5440 if (arg_verity_settings.data_path)
5441 dissect_image_flags |= DISSECT_IMAGE_NO_PARTITION_TABLE;
30535c16
LP
5442 }
5443
c67b0082 5444 if (!mkdtemp(tmprootdir)) {
0f3be6ca 5445 r = log_error_errno(errno, "Failed to create temporary directory: %m");
6b9132a9 5446 goto finish;
1b9e5b12 5447 }
6b9132a9 5448
c67b0082
LP
5449 remove_tmprootdir = true;
5450
5451 arg_directory = strdup(tmprootdir);
1b9e5b12
LP
5452 if (!arg_directory) {
5453 r = log_oom();
5454 goto finish;
6b9132a9 5455 }
88213476 5456
89e62e0b
LP
5457 r = loop_device_make_by_path(
5458 arg_image,
5459 arg_read_only ? O_RDONLY : O_RDWR,
5460 FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN,
5461 &loop);
2d845785
LP
5462 if (r < 0) {
5463 log_error_errno(r, "Failed to set up loopback block device: %m");
842f3b0f
LP
5464 goto finish;
5465 }
1b9e5b12 5466
4526113f 5467 r = dissect_image_and_warn(
e0f9e7bd 5468 loop->fd,
4526113f 5469 arg_image,
89e62e0b 5470 &arg_verity_settings,
18d73705 5471 NULL,
e7cbe5cb 5472 dissect_image_flags,
e0f9e7bd 5473 &dissected_image);
2d845785 5474 if (r == -ENOPKG) {
4526113f 5475 /* dissected_image_and_warn() already printed a brief error message. Extend on that with more details */
2d845785
LP
5476 log_notice("Note that the disk image needs to\n"
5477 " a) either contain only a single MBR partition of type 0x83 that is marked bootable\n"
5478 " b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n"
19ac32cd 5479 " c) or follow https://systemd.io/DISCOVERABLE_PARTITIONS\n"
2d845785
LP
5480 " d) or contain a file system without a partition table\n"
5481 "in order to be bootable with systemd-nspawn.");
1b9e5b12 5482 goto finish;
2d845785 5483 }
4526113f 5484 if (r < 0)
842f3b0f 5485 goto finish;
1b9e5b12 5486
89e62e0b 5487 if (!arg_verity_settings.root_hash && dissected_image->can_verity)
4623e8e6
LP
5488 log_notice("Note: image %s contains verity information, but no root hash specified! Proceeding without integrity checking.", arg_image);
5489
89e62e0b
LP
5490 r = dissected_image_decrypt_interactively(
5491 dissected_image,
5492 NULL,
5493 &arg_verity_settings,
5494 0,
5495 &decrypted_image);
1b9e5b12
LP
5496 if (r < 0)
5497 goto finish;
0f3be6ca
LP
5498
5499 /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */
5500 if (remove_image && unlink(arg_image) >= 0)
5501 remove_image = false;
842f3b0f 5502 }
842f3b0f 5503
86c0dd4a 5504 r = custom_mount_prepare_all(arg_directory, arg_custom_mounts, arg_n_custom_mounts);
5a8af538
LP
5505 if (r < 0)
5506 goto finish;
5507
de40a303
LP
5508 if (arg_console_mode < 0)
5509 arg_console_mode =
5510 isatty(STDIN_FILENO) > 0 &&
5511 isatty(STDOUT_FILENO) > 0 ? CONSOLE_INTERACTIVE : CONSOLE_READ_ONLY;
9c857b9d 5512
de40a303
LP
5513 if (arg_console_mode == CONSOLE_PIPE) /* if we pass STDERR on to the container, don't add our own logs into it too */
5514 arg_quiet = true;
a258bf26 5515
9c857b9d
LP
5516 if (!arg_quiet)
5517 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
5518 arg_machine, arg_image ?: arg_directory);
5519
72c0a2c2 5520 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
a258bf26 5521
66edd963 5522 if (prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) < 0) {
03cfe0d5
LP
5523 r = log_error_errno(errno, "Failed to become subreaper: %m");
5524 goto finish;
5525 }
5526
761cf19d
FW
5527 if (arg_expose_ports) {
5528 r = fw_ctx_new(&fw_ctx);
5529 if (r < 0) {
5530 log_error_errno(r, "Cannot expose configured ports, firewall initialization failed: %m");
5531 goto finish;
5532 }
5533 expose_args.fw_ctx = fw_ctx;
5534 }
d87be9b0 5535 for (;;) {
3acc84eb 5536 r = run_container(dissected_image,
44dbef90
LP
5537 secondary,
5538 fds,
5539 veth_name, &veth_created,
761cf19d 5540 &expose_args, &master,
44dbef90 5541 &pid, &ret);
b0067625 5542 if (r <= 0)
d87be9b0 5543 break;
d87be9b0 5544 }
88213476
LP
5545
5546finish:
04f590a4
LP
5547 (void) sd_notify(false,
5548 r == 0 && ret == EXIT_FORCE_RESTART ? "STOPPING=1\nSTATUS=Restarting..." :
5549 "STOPPING=1\nSTATUS=Terminating...");
af4ec430 5550
9444b1f2 5551 if (pid > 0)
c67b0082 5552 (void) kill(pid, SIGKILL);
88213476 5553
503546da 5554 /* Try to flush whatever is still queued in the pty */
6a0f896b 5555 if (master >= 0) {
1c876927 5556 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, 0);
6a0f896b
LP
5557 master = safe_close(master);
5558 }
5559
5560 if (pid > 0)
5561 (void) wait_for_terminate(pid, NULL);
503546da 5562
50ebcf6c
LP
5563 pager_close();
5564
17cbb288 5565 if (remove_directory && arg_directory) {
ec16945e
LP
5566 int k;
5567
17cbb288 5568 k = rm_rf(arg_directory, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
ec16945e 5569 if (k < 0)
17cbb288 5570 log_warning_errno(k, "Cannot remove '%s', ignoring: %m", arg_directory);
ec16945e
LP
5571 }
5572
0f3be6ca
LP
5573 if (remove_image && arg_image) {
5574 if (unlink(arg_image) < 0)
5575 log_warning_errno(errno, "Can't remove image file '%s', ignoring: %m", arg_image);
5576 }
5577
c67b0082
LP
5578 if (remove_tmprootdir) {
5579 if (rmdir(tmprootdir) < 0)
5580 log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir);
5581 }
5582
785890ac
LP
5583 if (arg_machine) {
5584 const char *p;
5585
63c372cb 5586 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
c6878637 5587 (void) rm_rf(p, REMOVE_ROOT);
785890ac
LP
5588 }
5589
761cf19d 5590 expose_port_flush(&fw_ctx, arg_expose_ports, &expose_args.address);
7513c5b8
LP
5591
5592 if (veth_created)
5593 (void) remove_veth_links(veth_name, arg_network_veth_extra);
22b28dfd 5594 (void) remove_bridge(arg_network_zone);
f757855e 5595
f757855e
LP
5596 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
5597 expose_port_free_all(arg_expose_ports);
bf428efb 5598 rlimit_free_all(arg_rlimit);
b2645747 5599 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
3652872a 5600 credential_free_all(arg_credentials, arg_n_credentials);
6d0b55c2 5601
44dbef90
LP
5602 if (r < 0)
5603 return r;
5604
5605 return ret;
88213476 5606}
44dbef90
LP
5607
/* run() above returns either a negative errno-style error or the exit status it collected in 'ret'.
 * NOTE(review): presumably this macro (see main-func.h) expands to main() and turns both kinds of return
 * value into the process exit code — confirm against its definition. */
5608DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run);