]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
dissect-image: extend comment on returned errors a bit
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
db9ecf05 1/* SPDX-License-Identifier: LGPL-2.1-or-later */
88213476 2
349cc4a5 3#if HAVE_BLKID
8fe0087e 4#endif
88213476 5#include <errno.h>
88213476 6#include <getopt.h>
503f480f 7#include <linux/fs.h>
1b9e5b12 8#include <linux/loop.h>
349cc4a5 9#if HAVE_SELINUX
8fe0087e 10#include <selinux/selinux.h>
1b9e5b12 11#endif
8fe0087e 12#include <stdlib.h>
8fe0087e 13#include <sys/file.h>
335d2ead 14#include <sys/ioctl.h>
8fe0087e
LP
15#include <sys/personality.h>
16#include <sys/prctl.h>
17#include <sys/types.h>
6916b164 18#include <sys/wait.h>
335d2ead 19#include <termios.h>
8fe0087e 20#include <unistd.h>
1b9e5b12 21
b053cd5f 22#include "sd-bus.h"
1f0cd86b 23#include "sd-daemon.h"
1f0cd86b 24#include "sd-id128.h"
8fe0087e 25
b5efdb8a 26#include "alloc-util.h"
8fe0087e
LP
27#include "barrier.h"
28#include "base-filesystem.h"
29#include "blkid-util.h"
30#include "btrfs-util.h"
b8ea7a6e 31#include "bus-error.h"
b053cd5f 32#include "bus-util.h"
8fe0087e 33#include "cap-list.h"
430f0182 34#include "capability-util.h"
04d391da 35#include "cgroup-util.h"
8fe0087e 36#include "copy.h"
d107bb7d 37#include "cpu-set-util.h"
4fc9982c 38#include "dev-setup.h"
57f1b61b 39#include "discover-image.h"
2d845785 40#include "dissect-image.h"
8fe0087e 41#include "env-util.h"
3652872a 42#include "escape.h"
3ffd4af2 43#include "fd-util.h"
842f3b0f 44#include "fdset.h"
a5c32cff 45#include "fileio.h"
f97b34a6 46#include "format-util.h"
f4f15635 47#include "fs-util.h"
1b9e5b12 48#include "gpt.h"
4623e8e6 49#include "hexdecoct.h"
e2054217 50#include "hostname-setup.h"
8fe0087e 51#include "hostname-util.h"
910fd145 52#include "id128-util.h"
3652872a 53#include "io-util.h"
8fe0087e 54#include "log.h"
2d845785 55#include "loop-util.h"
8fe0087e 56#include "loopback-setup.h"
8fe0087e 57#include "macro.h"
44dbef90 58#include "main-func.h"
f5947a5e 59#include "missing_sched.h"
8fe0087e 60#include "mkdir.h"
4349cd7c 61#include "mount-util.h"
049af8ad 62#include "mountpoint-util.h"
0cb8e3d1 63#include "namespace-util.h"
8fe0087e 64#include "netlink-util.h"
07630cea 65#include "nspawn-cgroup.h"
3652872a 66#include "nspawn-creds.h"
3603efde 67#include "nspawn-def.h"
07630cea
LP
68#include "nspawn-expose-ports.h"
69#include "nspawn-mount.h"
70#include "nspawn-network.h"
de40a303 71#include "nspawn-oci.h"
7336138e 72#include "nspawn-patch-uid.h"
07630cea 73#include "nspawn-register.h"
910fd145 74#include "nspawn-seccomp.h"
07630cea
LP
75#include "nspawn-settings.h"
76#include "nspawn-setuid.h"
7732f92b 77#include "nspawn-stub-pid1.h"
d8b4d14d 78#include "nulstr-util.h"
d58ad743 79#include "os-util.h"
50ebcf6c 80#include "pager.h"
614b022c 81#include "parse-argument.h"
6bedfcbb 82#include "parse-util.h"
8fe0087e 83#include "path-util.h"
294bf0c3 84#include "pretty-print.h"
0b452006 85#include "process-util.h"
8fe0087e
LP
86#include "ptyfwd.h"
87#include "random-util.h"
8869a0b4 88#include "raw-clone.h"
86775e35 89#include "resolve-util.h"
bf428efb 90#include "rlimit-util.h"
8fe0087e 91#include "rm-rf.h"
de40a303
LP
92#if HAVE_SECCOMP
93#include "seccomp-util.h"
94#endif
68b02049 95#include "selinux-util.h"
8fe0087e 96#include "signal-util.h"
2583fbea 97#include "socket-util.h"
8fcde012 98#include "stat-util.h"
15a5e950 99#include "stdio-util.h"
5c828e66 100#include "string-table.h"
07630cea 101#include "string-util.h"
8fe0087e 102#include "strv.h"
de40a303 103#include "sysctl-util.h"
8fe0087e 104#include "terminal-util.h"
e4de7287 105#include "tmpfile-util.h"
affb60b1 106#include "umask-util.h"
43c3fb46 107#include "unit-name.h"
b1d4f8e1 108#include "user-util.h"
8fe0087e 109#include "util.h"
e9642be2 110
e96ceaba
LP
/* The notify socket inside the container that the container can use to talk to nspawn via the sd_notify(3) protocol */
112#define NSPAWN_NOTIFY_SOCKET_PATH "/run/host/notify"
0e7ac751 113
2a49b612
ZJS
114#define EXIT_FORCE_RESTART 133
115
113cea80
DH
/* Status values a container can be reported with.
 * NOTE(review): presumably describes how a container run ended (plain termination vs. reboot request) —
 * confirm against the code that produces and consumes these values. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,
        CONTAINER_REBOOTED,
} ContainerStatus;
120
88213476 121static char *arg_directory = NULL;
ec16945e 122static char *arg_template = NULL;
5f932eb9 123static char *arg_chdir = NULL;
b53ede69
PW
124static char *arg_pivot_root_new = NULL;
125static char *arg_pivot_root_old = NULL;
687d0825 126static char *arg_user = NULL;
de40a303
LP
127static uid_t arg_uid = UID_INVALID;
128static gid_t arg_gid = GID_INVALID;
129static gid_t* arg_supplementary_gids = NULL;
130static size_t arg_n_supplementary_gids = 0;
9444b1f2 131static sd_id128_t arg_uuid = {};
3a9530e5
LP
132static char *arg_machine = NULL; /* The name used by the host to refer to this */
133static char *arg_hostname = NULL; /* The name the payload sees by default */
c74e630d
LP
134static const char *arg_selinux_context = NULL;
135static const char *arg_selinux_apifs_context = NULL;
de40a303 136static char *arg_slice = NULL;
ff01d048 137static bool arg_private_network = false;
bc2f673e 138static bool arg_read_only = false;
7732f92b 139static StartMode arg_start_mode = START_PID1;
ec16945e 140static bool arg_ephemeral = false;
57fb9fb5 141static LinkJournal arg_link_journal = LINK_AUTO;
574edc90 142static bool arg_link_journal_try = false;
520e0d54 143static uint64_t arg_caps_retain =
50b52222
LP
144 (1ULL << CAP_AUDIT_CONTROL) |
145 (1ULL << CAP_AUDIT_WRITE) |
5076f0cc
LP
146 (1ULL << CAP_CHOWN) |
147 (1ULL << CAP_DAC_OVERRIDE) |
148 (1ULL << CAP_DAC_READ_SEARCH) |
149 (1ULL << CAP_FOWNER) |
150 (1ULL << CAP_FSETID) |
151 (1ULL << CAP_IPC_OWNER) |
152 (1ULL << CAP_KILL) |
153 (1ULL << CAP_LEASE) |
154 (1ULL << CAP_LINUX_IMMUTABLE) |
50b52222 155 (1ULL << CAP_MKNOD) |
5076f0cc
LP
156 (1ULL << CAP_NET_BIND_SERVICE) |
157 (1ULL << CAP_NET_BROADCAST) |
158 (1ULL << CAP_NET_RAW) |
5076f0cc 159 (1ULL << CAP_SETFCAP) |
50b52222 160 (1ULL << CAP_SETGID) |
5076f0cc
LP
161 (1ULL << CAP_SETPCAP) |
162 (1ULL << CAP_SETUID) |
163 (1ULL << CAP_SYS_ADMIN) |
50b52222 164 (1ULL << CAP_SYS_BOOT) |
5076f0cc
LP
165 (1ULL << CAP_SYS_CHROOT) |
166 (1ULL << CAP_SYS_NICE) |
167 (1ULL << CAP_SYS_PTRACE) |
d87be9b0 168 (1ULL << CAP_SYS_RESOURCE) |
50b52222 169 (1ULL << CAP_SYS_TTY_CONFIG);
88fc9c9b 170static uint64_t arg_caps_ambient = 0;
de40a303 171static CapabilityQuintet arg_full_capabilities = CAPABILITY_QUINTET_NULL;
5a8af538 172static CustomMount *arg_custom_mounts = NULL;
88614c8a 173static size_t arg_n_custom_mounts = 0;
f4889f65 174static char **arg_setenv = NULL;
284c0b91 175static bool arg_quiet = false;
eb91eb18 176static bool arg_register = true;
89f7c846 177static bool arg_keep_unit = false;
aa28aefe 178static char **arg_network_interfaces = NULL;
c74e630d 179static char **arg_network_macvlan = NULL;
4bbfe7ad 180static char **arg_network_ipvlan = NULL;
69c79d3c 181static bool arg_network_veth = false;
f6d6bad1 182static char **arg_network_veth_extra = NULL;
f757855e 183static char *arg_network_bridge = NULL;
22b28dfd 184static char *arg_network_zone = NULL;
d7bea6b6 185static char *arg_network_namespace_path = NULL;
bb068de0 186static PagerFlags arg_pager_flags = 0;
050f7277 187static unsigned long arg_personality = PERSONALITY_INVALID;
ec16945e 188static char *arg_image = NULL;
de40a303 189static char *arg_oci_bundle = NULL;
f757855e 190static VolatileMode arg_volatile_mode = VOLATILE_NO;
6d0b55c2 191static ExposePort *arg_expose_ports = NULL;
f36933fe 192static char **arg_property = NULL;
de40a303 193static sd_bus_message *arg_property_message = NULL;
0de7acce 194static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
6dac160c 195static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
0de7acce 196static bool arg_userns_chown = false;
c6c8f6e2 197static int arg_kill_signal = 0;
5da38d07 198static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN;
f757855e
LP
199static SettingsMask arg_settings_mask = 0;
200static int arg_settings_trusted = -1;
201static char **arg_parameters = NULL;
6aadfa4c 202static const char *arg_container_service_name = "systemd-nspawn";
9c1e04d0 203static bool arg_notify_ready = false;
5a8ff0e6 204static bool arg_use_cgns = true;
0c582db0 205static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
1099ceeb 206static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_TMPFS_TMP;
aee36b4e 207static VeritySettings arg_verity_settings = VERITY_SETTINGS_DEFAULT;
6b000af4
LP
208static char **arg_syscall_allow_list = NULL;
209static char **arg_syscall_deny_list = NULL;
de40a303
LP
210#if HAVE_SECCOMP
211static scmp_filter_ctx arg_seccomp = NULL;
212#endif
bf428efb 213static struct rlimit *arg_rlimit[_RLIMIT_MAX] = {};
66edd963 214static bool arg_no_new_privileges = false;
81f345df
LP
215static int arg_oom_score_adjust = 0;
216static bool arg_oom_score_adjust_set = false;
0985c7c4 217static CPUSet arg_cpu_set = {};
09d423e9 218static ResolvConfMode arg_resolv_conf = RESOLV_CONF_AUTO;
1688841f 219static TimezoneMode arg_timezone = TIMEZONE_AUTO;
f5fbe71d 220static unsigned arg_console_width = UINT_MAX, arg_console_height = UINT_MAX;
de40a303
LP
221static DeviceNode* arg_extra_nodes = NULL;
222static size_t arg_n_extra_nodes = 0;
223static char **arg_sysctl = NULL;
224static ConsoleMode arg_console_mode = _CONSOLE_MODE_INVALID;
3652872a
LP
225static Credential *arg_credentials = NULL;
226static size_t arg_n_credentials = 0;
88213476 227
6145bb4f
LP
/* Pair each global that owns a resource with the cleanup function to invoke on it. See the definition of
 * STATIC_DESTRUCTOR_REGISTER() for when the registered functions are actually run. */
STATIC_DESTRUCTOR_REGISTER(arg_directory, freep);
STATIC_DESTRUCTOR_REGISTER(arg_template, freep);
STATIC_DESTRUCTOR_REGISTER(arg_chdir, freep);
STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_new, freep);
STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_old, freep);
STATIC_DESTRUCTOR_REGISTER(arg_user, freep);
STATIC_DESTRUCTOR_REGISTER(arg_supplementary_gids, freep);
STATIC_DESTRUCTOR_REGISTER(arg_machine, freep);
STATIC_DESTRUCTOR_REGISTER(arg_hostname, freep);
STATIC_DESTRUCTOR_REGISTER(arg_slice, freep);
STATIC_DESTRUCTOR_REGISTER(arg_setenv, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_interfaces, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_macvlan, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_ipvlan, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_veth_extra, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_bridge, freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_zone, freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_namespace_path, freep);
STATIC_DESTRUCTOR_REGISTER(arg_image, freep);
STATIC_DESTRUCTOR_REGISTER(arg_oci_bundle, freep);
STATIC_DESTRUCTOR_REGISTER(arg_property, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_property_message, sd_bus_message_unrefp);
STATIC_DESTRUCTOR_REGISTER(arg_parameters, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_verity_settings, verity_settings_done);
STATIC_DESTRUCTOR_REGISTER(arg_syscall_allow_list, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_syscall_deny_list, strv_freep);
#if HAVE_SECCOMP
STATIC_DESTRUCTOR_REGISTER(arg_seccomp, seccomp_releasep);
#endif
STATIC_DESTRUCTOR_REGISTER(arg_cpu_set, cpu_set_reset);
STATIC_DESTRUCTOR_REGISTER(arg_sysctl, strv_freep);
259
dce66ffe
ZJS
260static int handle_arg_console(const char *arg) {
261 if (streq(arg, "help")) {
10e8a60b
LP
262 puts("autopipe\n"
263 "interactive\n"
dce66ffe 264 "passive\n"
10e8a60b
LP
265 "pipe\n"
266 "read-only");
dce66ffe
ZJS
267 return 0;
268 }
269
270 if (streq(arg, "interactive"))
271 arg_console_mode = CONSOLE_INTERACTIVE;
272 else if (streq(arg, "read-only"))
273 arg_console_mode = CONSOLE_READ_ONLY;
274 else if (streq(arg, "passive"))
275 arg_console_mode = CONSOLE_PASSIVE;
554c4beb
LP
276 else if (streq(arg, "pipe")) {
277 if (isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0)
278 log_full(arg_quiet ? LOG_DEBUG : LOG_NOTICE,
279 "Console mode 'pipe' selected, but standard input/output are connected to an interactive TTY. "
280 "Most likely you want to use 'interactive' console mode for proper interactivity and shell job control. "
281 "Proceeding anyway.");
282
dce66ffe 283 arg_console_mode = CONSOLE_PIPE;
10e8a60b
LP
284 } else if (streq(arg, "autopipe")) {
285 if (isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0)
286 arg_console_mode = CONSOLE_INTERACTIVE;
287 else
288 arg_console_mode = CONSOLE_PIPE;
554c4beb 289 } else
dce66ffe
ZJS
290 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown console mode: %s", optarg);
291
292 arg_settings_mask |= SETTING_CONSOLE_MODE;
293 return 1;
294}
295
37ec0fdd
LP
296static int help(void) {
297 _cleanup_free_ char *link = NULL;
298 int r;
299
bb068de0 300 (void) pager_open(arg_pager_flags);
50ebcf6c 301
37ec0fdd
LP
302 r = terminal_urlify_man("systemd-nspawn", "1", &link);
303 if (r < 0)
304 return log_oom();
305
25148653 306 printf("%1$s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
37a92352 307 "%5$sSpawn a command or OS in a light-weight container.%6$s\n\n"
a8828ed9
DW
308 " -h --help Show this help\n"
309 " --version Print version string\n"
69c79d3c 310 " -q --quiet Do not show status information\n"
bb068de0 311 " --no-pager Do not pipe output into a pager\n"
25148653
LP
312 " --settings=BOOLEAN Load additional settings from .nspawn file\n\n"
313 "%3$sImage:%4$s\n"
1b9e5b12 314 " -D --directory=PATH Root directory for the container\n"
ec16945e
LP
315 " --template=PATH Initialize root directory from template directory,\n"
316 " if missing\n"
317 " -x --ephemeral Run container with snapshot of root directory, and\n"
318 " remove it after exit\n"
25e68fd3
LP
319 " -i --image=PATH Root file system disk image (or device node) for\n"
320 " the container\n"
de40a303 321 " --oci-bundle=PATH OCI bundle directory\n"
25148653
LP
322 " --read-only Mount the root directory read-only\n"
323 " --volatile[=MODE] Run the system in volatile mode\n"
25e68fd3 324 " --root-hash=HASH Specify verity root hash for root disk image\n"
c2923fdc
LB
325 " --root-hash-sig=SIG Specify pkcs7 signature of root hash for verity\n"
326 " as a DER encoded PKCS7, either as a path to a file\n"
327 " or as an ASCII base64 encoded string prefixed by\n"
328 " 'base64:'\n"
e7cbe5cb 329 " --verity-data=PATH Specify hash device for verity\n"
25148653
LP
330 " --pivot-root=PATH[:PATH]\n"
331 " Pivot root to given directory in the container\n\n"
332 "%3$sExecution:%4$s\n"
7732f92b 333 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
a8828ed9 334 " -b --boot Boot up full system (i.e. invoke init)\n"
5f932eb9 335 " --chdir=PATH Set working directory in the container\n"
25148653
LP
336 " -E --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
337 " -u --user=USER Run the command under specified user or UID\n"
338 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
339 " --notify-ready=BOOLEAN Receive notifications from the child init process\n\n"
340 "%3$sSystem Identity:%4$s\n"
a8828ed9 341 " -M --machine=NAME Set the machine name for the container\n"
3a9530e5 342 " --hostname=NAME Override the hostname for the container\n"
25148653
LP
343 " --uuid=UUID Set a specific machine UUID for the container\n\n"
344 "%3$sProperties:%4$s\n"
a8828ed9 345 " -S --slice=SLICE Place the container in the specified slice\n"
f36933fe 346 " --property=NAME=VALUE Set scope unit property\n"
25148653
LP
347 " --register=BOOLEAN Register container as machine\n"
348 " --keep-unit Do not register a scope for the machine, reuse\n"
349 " the service unit nspawn is running in\n\n"
350 "%3$sUser Namespacing:%4$s\n"
90b4a64d 351 " -U --private-users=pick Run within user namespace, autoselect UID/GID range\n"
03cfe0d5 352 " --private-users[=UIDBASE[:NUIDS]]\n"
90b4a64d 353 " Similar, but with user configured UID/GID range\n"
25148653
LP
354 " --private-users-chown Adjust OS tree ownership to private UID/GID range\n\n"
355 "%3$sNetworking:%4$s\n"
69c79d3c
LP
356 " --private-network Disable network in container\n"
357 " --network-interface=INTERFACE\n"
358 " Assign an existing network interface to the\n"
359 " container\n"
c74e630d
LP
360 " --network-macvlan=INTERFACE\n"
361 " Create a macvlan network interface based on an\n"
362 " existing network interface to the container\n"
4bbfe7ad
TG
363 " --network-ipvlan=INTERFACE\n"
364 " Create a ipvlan network interface based on an\n"
365 " existing network interface to the container\n"
a8eaaee7 366 " -n --network-veth Add a virtual Ethernet connection between host\n"
69c79d3c 367 " and container\n"
f6d6bad1
LP
368 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
369 " Add an additional virtual Ethernet link between\n"
370 " host and container\n"
ab046dde 371 " --network-bridge=INTERFACE\n"
90b4a64d
ZJS
372 " Add a virtual Ethernet connection to the container\n"
373 " and attach it to an existing bridge on the host\n"
374 " --network-zone=NAME Similar, but attach the new interface to an\n"
375 " an automatically managed bridge interface\n"
d7bea6b6
DP
376 " --network-namespace-path=PATH\n"
377 " Set network namespace to the one represented by\n"
378 " the specified kernel namespace file node\n"
6d0b55c2 379 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
25148653
LP
380 " Expose a container IP port on the host\n\n"
381 "%3$sSecurity:%4$s\n"
a8828ed9
DW
382 " --capability=CAP In addition to the default, retain specified\n"
383 " capability\n"
384 " --drop-capability=CAP Drop the specified capability from the default set\n"
88fc9c9b
TH
385 " --ambient-capability=CAP\n"
386 " Sets the specified capability for the started\n"
387 " process. Not useful if booting a machine.\n"
f4e803c8 388 " --no-new-privileges Set PR_SET_NO_NEW_PRIVS flag for container payload\n"
960e4569
LP
389 " --system-call-filter=LIST|~LIST\n"
390 " Permit/prohibit specific system calls\n"
25148653
LP
391 " -Z --selinux-context=SECLABEL\n"
392 " Set the SELinux security context to be used by\n"
393 " processes in the container\n"
394 " -L --selinux-apifs-context=SECLABEL\n"
395 " Set the SELinux security context to be used by\n"
396 " API/tmpfs file systems in the container\n\n"
397 "%3$sResources:%4$s\n"
bf428efb 398 " --rlimit=NAME=LIMIT Set a resource limit for the payload\n"
81f345df
LP
399 " --oom-score-adjust=VALUE\n"
400 " Adjust the OOM score value for the payload\n"
f4e803c8
LP
401 " --cpu-affinity=CPUS Adjust the CPU affinity of the container\n"
402 " --personality=ARCH Pick personality for this container\n\n"
25148653 403 "%3$sIntegration:%4$s\n"
09d423e9 404 " --resolv-conf=MODE Select mode of /etc/resolv.conf initialization\n"
1688841f 405 " --timezone=MODE Select mode of /etc/localtime initialization\n"
25148653
LP
406 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
407 " host, try-guest, try-host\n"
408 " -j Equivalent to --link-journal=try-guest\n\n"
409 "%3$sMounts:%4$s\n"
5e5bfa6e
EY
410 " --bind=PATH[:PATH[:OPTIONS]]\n"
411 " Bind mount a file or directory from the host into\n"
a8828ed9 412 " the container\n"
5e5bfa6e
EY
413 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
414 " Similar, but creates a read-only bind mount\n"
de40a303
LP
415 " --inaccessible=PATH Over-mount file node with inaccessible node to mask\n"
416 " it\n"
06c17c39 417 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
5a8af538
LP
418 " --overlay=PATH[:PATH...]:PATH\n"
419 " Create an overlay mount from the host to \n"
420 " the container\n"
421 " --overlay-ro=PATH[:PATH...]:PATH\n"
25148653
LP
422 " Similar, but creates a read-only overlay mount\n\n"
423 "%3$sInput/Output:%4$s\n"
de40a303
LP
424 " --console=MODE Select how stdin/stdout/stderr and /dev/console are\n"
425 " set up for the container.\n"
3652872a
LP
426 " -P --pipe Equivalent to --console=pipe\n\n"
427 "%3$sCredentials:%4$s\n"
428 " --set-credential=ID:VALUE\n"
429 " Pass a credential with literal value to container.\n"
430 " --load-credential=ID:PATH\n"
431 " Load credential to pass to container from file or\n"
432 " AF_UNIX stream socket.\n"
bc556335
DDM
433 "\nSee the %2$s for details.\n",
434 program_invocation_short_name,
435 link,
436 ansi_underline(),
437 ansi_normal(),
438 ansi_highlight(),
439 ansi_normal());
37ec0fdd
LP
440
441 return 0;
88213476
LP
442}
443
86c0dd4a 444static int custom_mount_check_all(void) {
88614c8a 445 size_t i;
5a8af538 446
5a8af538
LP
447 for (i = 0; i < arg_n_custom_mounts; i++) {
448 CustomMount *m = &arg_custom_mounts[i];
449
0de7acce 450 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
baaa35ad
ZJS
451 if (arg_userns_chown)
452 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
453 "--private-users-chown may not be combined with custom root mounts.");
454 else if (arg_uid_shift == UID_INVALID)
455 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
456 "--private-users with automatic UID shift may not be combined with custom root mounts.");
825d5287 457 }
5a8af538
LP
458 }
459
460 return 0;
461}
462
8199d554 463static int detect_unified_cgroup_hierarchy_from_environment(void) {
c78c095b 464 const char *e, *var = "SYSTEMD_NSPAWN_UNIFIED_HIERARCHY";
415fc41c 465 int r;
5da38d07 466
efdb0237 467 /* Allow the user to control whether the unified hierarchy is used */
c78c095b
ZJS
468
469 e = getenv(var);
470 if (!e) {
d5fc5b2f 471 /* $UNIFIED_CGROUP_HIERARCHY has been renamed to $SYSTEMD_NSPAWN_UNIFIED_HIERARCHY. */
c78c095b
ZJS
472 var = "UNIFIED_CGROUP_HIERARCHY";
473 e = getenv(var);
c78c095b
ZJS
474 }
475
476 if (!isempty(e)) {
efdb0237
LP
477 r = parse_boolean(e);
478 if (r < 0)
c78c095b 479 return log_error_errno(r, "Failed to parse $%s: %m", var);
5da38d07
TH
480 if (r > 0)
481 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
482 else
483 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237
LP
484 }
485
8199d554
LP
486 return 0;
487}
488
489static int detect_unified_cgroup_hierarchy_from_image(const char *directory) {
490 int r;
491
75b0d8b8
ZJS
492 /* Let's inherit the mode to use from the host system, but let's take into consideration what systemd
493 * in the image actually supports. */
b4cccbc1
LP
494 r = cg_all_unified();
495 if (r < 0)
496 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
497 if (r > 0) {
a8725a06
ZJS
498 /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
499 * routine only detects 231, so we'll have a false negative here for 230. */
500 r = systemd_installation_has_version(directory, 230);
501 if (r < 0)
502 return log_error_errno(r, "Failed to determine systemd version in container: %m");
503 if (r > 0)
504 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
505 else
506 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
c22800e4 507 } else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
2977724b
TH
508 /* Mixed cgroup hierarchy support was added in 233 */
509 r = systemd_installation_has_version(directory, 233);
0fd9563f
ZJS
510 if (r < 0)
511 return log_error_errno(r, "Failed to determine systemd version in container: %m");
512 if (r > 0)
513 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
514 else
515 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
516 } else
5da38d07 517 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237 518
8199d554
LP
519 log_debug("Using %s hierarchy for container.",
520 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_NONE ? "legacy" :
521 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_SYSTEMD ? "hybrid" : "unified");
522
efdb0237
LP
523 return 0;
524}
525
8a99bd0c
ZJS
526static int parse_capability_spec(const char *spec, uint64_t *ret_mask) {
527 uint64_t mask = 0;
528 int r;
529
530 for (;;) {
531 _cleanup_free_ char *t = NULL;
532
533 r = extract_first_word(&spec, &t, ",", 0);
534 if (r < 0)
535 return log_error_errno(r, "Failed to parse capability %s.", t);
536 if (r == 0)
537 break;
538
539 if (streq(t, "help")) {
540 for (int i = 0; i < capability_list_length(); i++) {
541 const char *name;
542
543 name = capability_to_name(i);
544 if (name)
545 puts(name);
546 }
547
548 return 0; /* quit */
549 }
550
551 if (streq(t, "all"))
f5fbe71d 552 mask = UINT64_MAX;
8a99bd0c
ZJS
553 else {
554 r = capability_from_name(t);
555 if (r < 0)
556 return log_error_errno(r, "Failed to parse capability %s.", t);
557
558 mask |= 1ULL << r;
559 }
560 }
561
562 *ret_mask = mask;
563 return 1; /* continue */
564}
565
49048684 566static int parse_share_ns_env(const char *name, unsigned long ns_flag) {
0c582db0
LB
567 int r;
568
569 r = getenv_bool(name);
570 if (r == -ENXIO)
49048684 571 return 0;
0c582db0 572 if (r < 0)
49048684 573 return log_error_errno(r, "Failed to parse $%s: %m", name);
de40a303 574
0c582db0 575 arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
de40a303 576 arg_settings_mask |= SETTING_CLONE_NS_FLAGS;
49048684 577 return 0;
0c582db0
LB
578}
579
49048684 580static int parse_mount_settings_env(void) {
4f086aab 581 const char *e;
1099ceeb
LP
582 int r;
583
584 r = getenv_bool("SYSTEMD_NSPAWN_TMPFS_TMP");
49048684
ZJS
585 if (r < 0 && r != -ENXIO)
586 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_TMPFS_TMP: %m");
1099ceeb
LP
587 if (r >= 0)
588 SET_FLAG(arg_mount_settings, MOUNT_APPLY_TMPFS_TMP, r > 0);
4f086aab
SU
589
590 e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
49048684 591 if (streq_ptr(e, "network"))
4f086aab 592 arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS;
4f086aab 593
49048684
ZJS
594 else if (e) {
595 r = parse_boolean(e);
596 if (r < 0)
597 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_API_VFS_WRITABLE: %m");
598
599 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_RO, r == 0);
600 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_NETNS, false);
ab8ee0f2 601 }
4f086aab 602
49048684 603 return 0;
4f086aab
SU
604}
605
49048684 606static int parse_environment(void) {
d5455d2f
LP
607 const char *e;
608 int r;
609
49048684
ZJS
610 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
611 if (r < 0)
612 return r;
613 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
614 if (r < 0)
615 return r;
616 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
617 if (r < 0)
618 return r;
619 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
620 if (r < 0)
621 return r;
d5455d2f 622
49048684
ZJS
623 r = parse_mount_settings_env();
624 if (r < 0)
625 return r;
d5455d2f 626
489fae52
ZJS
627 /* SYSTEMD_NSPAWN_USE_CGNS=0 can be used to disable CLONE_NEWCGROUP use,
628 * even if it is supported. If not supported, it has no effect. */
de40a303 629 if (!cg_ns_supported())
489fae52 630 arg_use_cgns = false;
de40a303
LP
631 else {
632 r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
633 if (r < 0) {
634 if (r != -ENXIO)
49048684 635 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_USE_CGNS: %m");
de40a303
LP
636
637 arg_use_cgns = true;
638 } else {
639 arg_use_cgns = r > 0;
640 arg_settings_mask |= SETTING_USE_CGNS;
641 }
642 }
d5455d2f
LP
643
644 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
645 if (e)
646 arg_container_service_name = e;
647
49048684 648 return detect_unified_cgroup_hierarchy_from_environment();
d5455d2f
LP
649}
650
88213476 651static int parse_argv(int argc, char *argv[]) {
a41fe3a2 652 enum {
acbeb427
ZJS
653 ARG_VERSION = 0x100,
654 ARG_PRIVATE_NETWORK,
bc2f673e 655 ARG_UUID,
5076f0cc 656 ARG_READ_ONLY,
57fb9fb5 657 ARG_CAPABILITY,
88fc9c9b 658 ARG_AMBIENT_CAPABILITY,
420c7379 659 ARG_DROP_CAPABILITY,
17fe0523
LP
660 ARG_LINK_JOURNAL,
661 ARG_BIND,
f4889f65 662 ARG_BIND_RO,
06c17c39 663 ARG_TMPFS,
5a8af538
LP
664 ARG_OVERLAY,
665 ARG_OVERLAY_RO,
de40a303 666 ARG_INACCESSIBLE,
eb91eb18 667 ARG_SHARE_SYSTEM,
89f7c846 668 ARG_REGISTER,
aa28aefe 669 ARG_KEEP_UNIT,
69c79d3c 670 ARG_NETWORK_INTERFACE,
c74e630d 671 ARG_NETWORK_MACVLAN,
4bbfe7ad 672 ARG_NETWORK_IPVLAN,
ab046dde 673 ARG_NETWORK_BRIDGE,
22b28dfd 674 ARG_NETWORK_ZONE,
f6d6bad1 675 ARG_NETWORK_VETH_EXTRA,
d7bea6b6 676 ARG_NETWORK_NAMESPACE_PATH,
6afc95b7 677 ARG_PERSONALITY,
4d9f07b4 678 ARG_VOLATILE,
ec16945e 679 ARG_TEMPLATE,
f36933fe 680 ARG_PROPERTY,
6dac160c 681 ARG_PRIVATE_USERS,
c6c8f6e2 682 ARG_KILL_SIGNAL,
f757855e 683 ARG_SETTINGS,
5f932eb9 684 ARG_CHDIR,
b53ede69 685 ARG_PIVOT_ROOT,
7336138e 686 ARG_PRIVATE_USERS_CHOWN,
9c1e04d0 687 ARG_NOTIFY_READY,
4623e8e6 688 ARG_ROOT_HASH,
89e62e0b
LP
689 ARG_ROOT_HASH_SIG,
690 ARG_VERITY_DATA,
960e4569 691 ARG_SYSTEM_CALL_FILTER,
bf428efb 692 ARG_RLIMIT,
3a9530e5 693 ARG_HOSTNAME,
66edd963 694 ARG_NO_NEW_PRIVILEGES,
81f345df 695 ARG_OOM_SCORE_ADJUST,
d107bb7d 696 ARG_CPU_AFFINITY,
09d423e9 697 ARG_RESOLV_CONF,
1688841f 698 ARG_TIMEZONE,
de40a303
LP
699 ARG_CONSOLE,
700 ARG_PIPE,
701 ARG_OCI_BUNDLE,
bb068de0 702 ARG_NO_PAGER,
3652872a
LP
703 ARG_SET_CREDENTIAL,
704 ARG_LOAD_CREDENTIAL,
a41fe3a2
LP
705 };
706
88213476 707 static const struct option options[] = {
d7bea6b6
DP
708 { "help", no_argument, NULL, 'h' },
709 { "version", no_argument, NULL, ARG_VERSION },
710 { "directory", required_argument, NULL, 'D' },
711 { "template", required_argument, NULL, ARG_TEMPLATE },
712 { "ephemeral", no_argument, NULL, 'x' },
713 { "user", required_argument, NULL, 'u' },
714 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
715 { "as-pid2", no_argument, NULL, 'a' },
716 { "boot", no_argument, NULL, 'b' },
717 { "uuid", required_argument, NULL, ARG_UUID },
718 { "read-only", no_argument, NULL, ARG_READ_ONLY },
719 { "capability", required_argument, NULL, ARG_CAPABILITY },
88fc9c9b 720 { "ambient-capability", required_argument, NULL, ARG_AMBIENT_CAPABILITY },
d7bea6b6 721 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
66edd963 722 { "no-new-privileges", required_argument, NULL, ARG_NO_NEW_PRIVILEGES },
d7bea6b6
DP
723 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
724 { "bind", required_argument, NULL, ARG_BIND },
725 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
726 { "tmpfs", required_argument, NULL, ARG_TMPFS },
727 { "overlay", required_argument, NULL, ARG_OVERLAY },
728 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
de40a303 729 { "inaccessible", required_argument, NULL, ARG_INACCESSIBLE },
d7bea6b6 730 { "machine", required_argument, NULL, 'M' },
3a9530e5 731 { "hostname", required_argument, NULL, ARG_HOSTNAME },
d7bea6b6
DP
732 { "slice", required_argument, NULL, 'S' },
733 { "setenv", required_argument, NULL, 'E' },
734 { "selinux-context", required_argument, NULL, 'Z' },
735 { "selinux-apifs-context", required_argument, NULL, 'L' },
736 { "quiet", no_argument, NULL, 'q' },
737 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */
738 { "register", required_argument, NULL, ARG_REGISTER },
739 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
740 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
741 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
742 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
743 { "network-veth", no_argument, NULL, 'n' },
744 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA },
745 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
746 { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
747 { "network-namespace-path", required_argument, NULL, ARG_NETWORK_NAMESPACE_PATH },
748 { "personality", required_argument, NULL, ARG_PERSONALITY },
749 { "image", required_argument, NULL, 'i' },
750 { "volatile", optional_argument, NULL, ARG_VOLATILE },
751 { "port", required_argument, NULL, 'p' },
752 { "property", required_argument, NULL, ARG_PROPERTY },
753 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
754 { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN },
755 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
756 { "settings", required_argument, NULL, ARG_SETTINGS },
757 { "chdir", required_argument, NULL, ARG_CHDIR },
758 { "pivot-root", required_argument, NULL, ARG_PIVOT_ROOT },
759 { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
760 { "root-hash", required_argument, NULL, ARG_ROOT_HASH },
89e62e0b
LP
761 { "root-hash-sig", required_argument, NULL, ARG_ROOT_HASH_SIG },
762 { "verity-data", required_argument, NULL, ARG_VERITY_DATA },
d7bea6b6 763 { "system-call-filter", required_argument, NULL, ARG_SYSTEM_CALL_FILTER },
bf428efb 764 { "rlimit", required_argument, NULL, ARG_RLIMIT },
81f345df 765 { "oom-score-adjust", required_argument, NULL, ARG_OOM_SCORE_ADJUST },
d107bb7d 766 { "cpu-affinity", required_argument, NULL, ARG_CPU_AFFINITY },
09d423e9 767 { "resolv-conf", required_argument, NULL, ARG_RESOLV_CONF },
1688841f 768 { "timezone", required_argument, NULL, ARG_TIMEZONE },
de40a303
LP
769 { "console", required_argument, NULL, ARG_CONSOLE },
770 { "pipe", no_argument, NULL, ARG_PIPE },
771 { "oci-bundle", required_argument, NULL, ARG_OCI_BUNDLE },
bb068de0 772 { "no-pager", no_argument, NULL, ARG_NO_PAGER },
3652872a
LP
773 { "set-credential", required_argument, NULL, ARG_SET_CREDENTIAL },
774 { "load-credential", required_argument, NULL, ARG_LOAD_CREDENTIAL },
eb9da376 775 {}
88213476
LP
776 };
777
9444b1f2 778 int c, r;
a42c8b54 779 uint64_t plus = 0, minus = 0;
f757855e 780 bool mask_all_settings = false, mask_no_settings = false;
88213476
LP
781
782 assert(argc >= 0);
783 assert(argv);
784
de40a303 785 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nUE:P", options, NULL)) >= 0)
88213476
LP
786 switch (c) {
787
788 case 'h':
37ec0fdd 789 return help();
88213476 790
acbeb427 791 case ARG_VERSION:
3f6fd1ba 792 return version();
acbeb427 793
88213476 794 case 'D':
614b022c 795 r = parse_path_argument(optarg, false, &arg_directory);
ec16945e 796 if (r < 0)
0f03c2a4 797 return r;
de40a303
LP
798
799 arg_settings_mask |= SETTING_DIRECTORY;
ec16945e
LP
800 break;
801
802 case ARG_TEMPLATE:
614b022c 803 r = parse_path_argument(optarg, false, &arg_template);
ec16945e 804 if (r < 0)
0f03c2a4 805 return r;
de40a303
LP
806
807 arg_settings_mask |= SETTING_DIRECTORY;
88213476
LP
808 break;
809
1b9e5b12 810 case 'i':
614b022c 811 r = parse_path_argument(optarg, false, &arg_image);
ec16945e 812 if (r < 0)
0f03c2a4 813 return r;
de40a303
LP
814
815 arg_settings_mask |= SETTING_DIRECTORY;
816 break;
817
818 case ARG_OCI_BUNDLE:
614b022c 819 r = parse_path_argument(optarg, false, &arg_oci_bundle);
de40a303
LP
820 if (r < 0)
821 return r;
822
ec16945e
LP
823 break;
824
825 case 'x':
826 arg_ephemeral = true;
a2f577fc 827 arg_settings_mask |= SETTING_EPHEMERAL;
1b9e5b12
LP
828 break;
829
687d0825 830 case 'u':
2fc09a9c
DM
831 r = free_and_strdup(&arg_user, optarg);
832 if (r < 0)
7027ff61 833 return log_oom();
687d0825 834
f757855e 835 arg_settings_mask |= SETTING_USER;
687d0825
MV
836 break;
837
22b28dfd
LP
838 case ARG_NETWORK_ZONE: {
839 char *j;
840
b910cc72 841 j = strjoin("vz-", optarg);
22b28dfd
LP
842 if (!j)
843 return log_oom();
844
845 if (!ifname_valid(j)) {
846 log_error("Network zone name not valid: %s", j);
847 free(j);
848 return -EINVAL;
849 }
850
df1fac6d 851 free_and_replace(arg_network_zone, j);
22b28dfd
LP
852
853 arg_network_veth = true;
854 arg_private_network = true;
855 arg_settings_mask |= SETTING_NETWORK;
856 break;
857 }
858
ab046dde 859 case ARG_NETWORK_BRIDGE:
ef76dff2 860
baaa35ad
ZJS
861 if (!ifname_valid(optarg))
862 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
863 "Bridge interface name not valid: %s", optarg);
ef76dff2 864
f757855e
LP
865 r = free_and_strdup(&arg_network_bridge, optarg);
866 if (r < 0)
867 return log_oom();
ab046dde 868
4831981d 869 _fallthrough_;
0dfaa006 870 case 'n':
69c79d3c
LP
871 arg_network_veth = true;
872 arg_private_network = true;
f757855e 873 arg_settings_mask |= SETTING_NETWORK;
69c79d3c
LP
874 break;
875
f6d6bad1
LP
876 case ARG_NETWORK_VETH_EXTRA:
877 r = veth_extra_parse(&arg_network_veth_extra, optarg);
878 if (r < 0)
879 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
880
881 arg_private_network = true;
882 arg_settings_mask |= SETTING_NETWORK;
883 break;
884
aa28aefe 885 case ARG_NETWORK_INTERFACE:
baaa35ad
ZJS
886 if (!ifname_valid(optarg))
887 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
888 "Network interface name not valid: %s", optarg);
ef76dff2 889
b390f178
DDM
890 r = test_network_interface_initialized(optarg);
891 if (r < 0)
892 return r;
893
c74e630d
LP
894 if (strv_extend(&arg_network_interfaces, optarg) < 0)
895 return log_oom();
896
897 arg_private_network = true;
f757855e 898 arg_settings_mask |= SETTING_NETWORK;
c74e630d
LP
899 break;
900
901 case ARG_NETWORK_MACVLAN:
ef76dff2 902
baaa35ad
ZJS
903 if (!ifname_valid(optarg))
904 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
905 "MACVLAN network interface name not valid: %s", optarg);
ef76dff2 906
b390f178
DDM
907 r = test_network_interface_initialized(optarg);
908 if (r < 0)
909 return r;
910
c74e630d 911 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
912 return log_oom();
913
4bbfe7ad 914 arg_private_network = true;
f757855e 915 arg_settings_mask |= SETTING_NETWORK;
4bbfe7ad
TG
916 break;
917
918 case ARG_NETWORK_IPVLAN:
ef76dff2 919
baaa35ad
ZJS
920 if (!ifname_valid(optarg))
921 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
922 "IPVLAN network interface name not valid: %s", optarg);
ef76dff2 923
b390f178
DDM
924 r = test_network_interface_initialized(optarg);
925 if (r < 0)
926 return r;
927
4bbfe7ad
TG
928 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
929 return log_oom();
930
4831981d 931 _fallthrough_;
ff01d048
LP
932 case ARG_PRIVATE_NETWORK:
933 arg_private_network = true;
f757855e 934 arg_settings_mask |= SETTING_NETWORK;
a41fe3a2
LP
935 break;
936
d7bea6b6 937 case ARG_NETWORK_NAMESPACE_PATH:
614b022c 938 r = parse_path_argument(optarg, false, &arg_network_namespace_path);
d7bea6b6
DP
939 if (r < 0)
940 return r;
941
de40a303 942 arg_settings_mask |= SETTING_NETWORK;
d7bea6b6
DP
943 break;
944
0f0dbc46 945 case 'b':
baaa35ad
ZJS
946 if (arg_start_mode == START_PID2)
947 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
948 "--boot and --as-pid2 may not be combined.");
7732f92b
LP
949
950 arg_start_mode = START_BOOT;
951 arg_settings_mask |= SETTING_START_MODE;
952 break;
953
954 case 'a':
baaa35ad
ZJS
955 if (arg_start_mode == START_BOOT)
956 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
957 "--boot and --as-pid2 may not be combined.");
7732f92b
LP
958
959 arg_start_mode = START_PID2;
960 arg_settings_mask |= SETTING_START_MODE;
0f0dbc46
LP
961 break;
962
144f0fc0 963 case ARG_UUID:
9444b1f2 964 r = sd_id128_from_string(optarg, &arg_uuid);
317feb4d
LP
965 if (r < 0)
966 return log_error_errno(r, "Invalid UUID: %s", optarg);
967
baaa35ad
ZJS
968 if (sd_id128_is_null(arg_uuid))
969 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
970 "Machine UUID may not be all zeroes.");
f757855e
LP
971
972 arg_settings_mask |= SETTING_MACHINE_ID;
9444b1f2 973 break;
aa96c6cb 974
43c3fb46
LP
975 case 'S': {
976 _cleanup_free_ char *mangled = NULL;
977
978 r = unit_name_mangle_with_suffix(optarg, NULL, UNIT_NAME_MANGLE_WARN, ".slice", &mangled);
de40a303
LP
979 if (r < 0)
980 return log_oom();
981
43c3fb46 982 free_and_replace(arg_slice, mangled);
de40a303 983 arg_settings_mask |= SETTING_SLICE;
144f0fc0 984 break;
43c3fb46 985 }
144f0fc0 986
7027ff61 987 case 'M':
c1521918 988 if (isempty(optarg))
97b11eed 989 arg_machine = mfree(arg_machine);
c1521918 990 else {
52ef5dd7 991 if (!hostname_is_valid(optarg, 0))
baaa35ad
ZJS
992 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
993 "Invalid machine name: %s", optarg);
7027ff61 994
0c3c4284
LP
995 r = free_and_strdup(&arg_machine, optarg);
996 if (r < 0)
eb91eb18 997 return log_oom();
eb91eb18 998 }
9ce6d1b3 999 break;
7027ff61 1000
3a9530e5
LP
1001 case ARG_HOSTNAME:
1002 if (isempty(optarg))
1003 arg_hostname = mfree(arg_hostname);
1004 else {
52ef5dd7 1005 if (!hostname_is_valid(optarg, 0))
baaa35ad
ZJS
1006 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1007 "Invalid hostname: %s", optarg);
3a9530e5
LP
1008
1009 r = free_and_strdup(&arg_hostname, optarg);
1010 if (r < 0)
1011 return log_oom();
1012 }
1013
1014 arg_settings_mask |= SETTING_HOSTNAME;
1015 break;
1016
82adf6af
LP
1017 case 'Z':
1018 arg_selinux_context = optarg;
a8828ed9
DW
1019 break;
1020
82adf6af
LP
1021 case 'L':
1022 arg_selinux_apifs_context = optarg;
a8828ed9
DW
1023 break;
1024
bc2f673e
LP
1025 case ARG_READ_ONLY:
1026 arg_read_only = true;
f757855e 1027 arg_settings_mask |= SETTING_READ_ONLY;
bc2f673e
LP
1028 break;
1029
88fc9c9b
TH
1030 case ARG_AMBIENT_CAPABILITY: {
1031 uint64_t m;
1032 r = parse_capability_spec(optarg, &m);
1033 if (r <= 0)
1034 return r;
1035 arg_caps_ambient |= m;
1036 arg_settings_mask |= SETTING_CAPABILITY;
1037 break;
1038 }
420c7379
LP
1039 case ARG_CAPABILITY:
1040 case ARG_DROP_CAPABILITY: {
8a99bd0c
ZJS
1041 uint64_t m;
1042 r = parse_capability_spec(optarg, &m);
1043 if (r <= 0)
1044 return r;
5076f0cc 1045
8a99bd0c
ZJS
1046 if (c == ARG_CAPABILITY)
1047 plus |= m;
1048 else
1049 minus |= m;
f757855e 1050 arg_settings_mask |= SETTING_CAPABILITY;
5076f0cc
LP
1051 break;
1052 }
66edd963
LP
1053 case ARG_NO_NEW_PRIVILEGES:
1054 r = parse_boolean(optarg);
1055 if (r < 0)
1056 return log_error_errno(r, "Failed to parse --no-new-privileges= argument: %s", optarg);
1057
1058 arg_no_new_privileges = r;
1059 arg_settings_mask |= SETTING_NO_NEW_PRIVILEGES;
1060 break;
1061
57fb9fb5
LP
1062 case 'j':
1063 arg_link_journal = LINK_GUEST;
574edc90 1064 arg_link_journal_try = true;
4e1d6aa9 1065 arg_settings_mask |= SETTING_LINK_JOURNAL;
57fb9fb5
LP
1066 break;
1067
1068 case ARG_LINK_JOURNAL:
4e1d6aa9 1069 r = parse_link_journal(optarg, &arg_link_journal, &arg_link_journal_try);
c6147113
LP
1070 if (r < 0)
1071 return log_error_errno(r, "Failed to parse link journal mode %s", optarg);
57fb9fb5 1072
4e1d6aa9 1073 arg_settings_mask |= SETTING_LINK_JOURNAL;
57fb9fb5
LP
1074 break;
1075
17fe0523 1076 case ARG_BIND:
f757855e
LP
1077 case ARG_BIND_RO:
1078 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
1079 if (r < 0)
1080 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
17fe0523 1081
f757855e 1082 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
17fe0523 1083 break;
06c17c39 1084
f757855e
LP
1085 case ARG_TMPFS:
1086 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
1087 if (r < 0)
1088 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
5a8af538 1089
f757855e 1090 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
5a8af538 1091 break;
5a8af538
LP
1092
1093 case ARG_OVERLAY:
ad85779a
LP
1094 case ARG_OVERLAY_RO:
1095 r = overlay_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_OVERLAY_RO);
1096 if (r == -EADDRNOTAVAIL)
1097 return log_error_errno(r, "--overlay(-ro)= needs at least two colon-separated directories specified.");
1098 if (r < 0)
1099 return log_error_errno(r, "Failed to parse --overlay(-ro)= argument %s: %m", optarg);
06c17c39 1100
f757855e 1101 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
06c17c39 1102 break;
06c17c39 1103
de40a303
LP
1104 case ARG_INACCESSIBLE:
1105 r = inaccessible_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
1106 if (r < 0)
1107 return log_error_errno(r, "Failed to parse --inaccessible= argument %s: %m", optarg);
1108
1109 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1110 break;
1111
a5f1cb3b 1112 case 'E': {
baaa35ad
ZJS
1113 if (!env_assignment_is_valid(optarg))
1114 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1115 "Environment variable assignment '%s' is not valid.", optarg);
aaf057c4
ZJS
1116 r = strv_env_replace_strdup(&arg_setenv, optarg);
1117 if (r < 0)
1118 return r;
f4889f65 1119
f757855e 1120 arg_settings_mask |= SETTING_ENVIRONMENT;
f4889f65
LP
1121 break;
1122 }
1123
284c0b91
LP
1124 case 'q':
1125 arg_quiet = true;
1126 break;
1127
8a96d94e 1128 case ARG_SHARE_SYSTEM:
a6b5216c 1129 /* We don't officially support this anymore, except for compat reasons. People should use the
0c582db0 1130 * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
63d1c29f 1131 log_warning("Please do not use --share-system anymore, use $SYSTEMD_NSPAWN_SHARE_* instead.");
0c582db0 1132 arg_clone_ns_flags = 0;
8a96d94e
LP
1133 break;
1134
eb91eb18
LP
1135 case ARG_REGISTER:
1136 r = parse_boolean(optarg);
1137 if (r < 0) {
1138 log_error("Failed to parse --register= argument: %s", optarg);
1139 return r;
1140 }
1141
1142 arg_register = r;
1143 break;
1144
89f7c846
LP
1145 case ARG_KEEP_UNIT:
1146 arg_keep_unit = true;
1147 break;
1148
6afc95b7
LP
1149 case ARG_PERSONALITY:
1150
ac45f971 1151 arg_personality = personality_from_string(optarg);
baaa35ad
ZJS
1152 if (arg_personality == PERSONALITY_INVALID)
1153 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1154 "Unknown or unsupported personality '%s'.", optarg);
6afc95b7 1155
f757855e 1156 arg_settings_mask |= SETTING_PERSONALITY;
6afc95b7
LP
1157 break;
1158
4d9f07b4
LP
1159 case ARG_VOLATILE:
1160
1161 if (!optarg)
f757855e 1162 arg_volatile_mode = VOLATILE_YES;
5c828e66
LP
1163 else if (streq(optarg, "help")) {
1164 DUMP_STRING_TABLE(volatile_mode, VolatileMode, _VOLATILE_MODE_MAX);
1165 return 0;
1166 } else {
f757855e 1167 VolatileMode m;
4d9f07b4 1168
f757855e 1169 m = volatile_mode_from_string(optarg);
baaa35ad
ZJS
1170 if (m < 0)
1171 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1172 "Failed to parse --volatile= argument: %s", optarg);
1173 else
f757855e 1174 arg_volatile_mode = m;
6d0b55c2
LP
1175 }
1176
f757855e
LP
1177 arg_settings_mask |= SETTING_VOLATILE_MODE;
1178 break;
6d0b55c2 1179
f757855e
LP
1180 case 'p':
1181 r = expose_port_parse(&arg_expose_ports, optarg);
1182 if (r == -EEXIST)
1183 return log_error_errno(r, "Duplicate port specification: %s", optarg);
1184 if (r < 0)
1185 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
6d0b55c2 1186
f757855e 1187 arg_settings_mask |= SETTING_EXPOSE_PORTS;
6d0b55c2 1188 break;
6d0b55c2 1189
f36933fe
LP
1190 case ARG_PROPERTY:
1191 if (strv_extend(&arg_property, optarg) < 0)
1192 return log_oom();
1193
1194 break;
1195
ae209204
ZJS
1196 case ARG_PRIVATE_USERS: {
1197 int boolean = -1;
0de7acce 1198
ae209204
ZJS
1199 if (!optarg)
1200 boolean = true;
1201 else if (!in_charset(optarg, DIGITS))
1202 /* do *not* parse numbers as booleans */
1203 boolean = parse_boolean(optarg);
1204
1205 if (boolean == false) {
0de7acce
LP
1206 /* no: User namespacing off */
1207 arg_userns_mode = USER_NAMESPACE_NO;
1208 arg_uid_shift = UID_INVALID;
1209 arg_uid_range = UINT32_C(0x10000);
ae209204 1210 } else if (boolean == true) {
0de7acce
LP
1211 /* yes: User namespacing on, UID range is read from root dir */
1212 arg_userns_mode = USER_NAMESPACE_FIXED;
1213 arg_uid_shift = UID_INVALID;
1214 arg_uid_range = UINT32_C(0x10000);
1215 } else if (streq(optarg, "pick")) {
1216 /* pick: User namespacing on, UID range is picked randomly */
1217 arg_userns_mode = USER_NAMESPACE_PICK;
1218 arg_uid_shift = UID_INVALID;
1219 arg_uid_range = UINT32_C(0x10000);
1220 } else {
6c2058b3 1221 _cleanup_free_ char *buffer = NULL;
6dac160c
LP
1222 const char *range, *shift;
1223
0de7acce
LP
1224 /* anything else: User namespacing on, UID range is explicitly configured */
1225
6dac160c
LP
1226 range = strchr(optarg, ':');
1227 if (range) {
6c2058b3
ZJS
1228 buffer = strndup(optarg, range - optarg);
1229 if (!buffer)
1230 return log_oom();
1231 shift = buffer;
6dac160c
LP
1232
1233 range++;
bfd292ec
ZJS
1234 r = safe_atou32(range, &arg_uid_range);
1235 if (r < 0)
be715731 1236 return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range);
6dac160c
LP
1237 } else
1238 shift = optarg;
1239
be715731
ZJS
1240 r = parse_uid(shift, &arg_uid_shift);
1241 if (r < 0)
1242 return log_error_errno(r, "Failed to parse UID \"%s\": %m", optarg);
0de7acce
LP
1243
1244 arg_userns_mode = USER_NAMESPACE_FIXED;
6dac160c
LP
1245 }
1246
baaa35ad
ZJS
1247 if (arg_uid_range <= 0)
1248 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1249 "UID range cannot be 0.");
be715731 1250
0de7acce 1251 arg_settings_mask |= SETTING_USERNS;
6dac160c 1252 break;
ae209204 1253 }
6dac160c 1254
0de7acce 1255 case 'U':
ccabee0d
LP
1256 if (userns_supported()) {
1257 arg_userns_mode = USER_NAMESPACE_PICK;
1258 arg_uid_shift = UID_INVALID;
1259 arg_uid_range = UINT32_C(0x10000);
1260
1261 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
1262 }
1263
7336138e
LP
1264 break;
1265
0de7acce 1266 case ARG_PRIVATE_USERS_CHOWN:
19aac838 1267 arg_userns_chown = true;
0de7acce
LP
1268
1269 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
1270 break;
1271
c6c8f6e2 1272 case ARG_KILL_SIGNAL:
5c828e66
LP
1273 if (streq(optarg, "help")) {
1274 DUMP_STRING_TABLE(signal, int, _NSIG);
1275 return 0;
1276 }
1277
29a3db75 1278 arg_kill_signal = signal_from_string(optarg);
baaa35ad 1279 if (arg_kill_signal < 0)
7211c853 1280 return log_error_errno(arg_kill_signal, "Cannot parse signal: %s", optarg);
c6c8f6e2 1281
f757855e
LP
1282 arg_settings_mask |= SETTING_KILL_SIGNAL;
1283 break;
1284
1285 case ARG_SETTINGS:
1286
1287 /* no → do not read files
1288 * yes → read files, do not override cmdline, trust only subset
1289 * override → read files, override cmdline, trust only subset
1290 * trusted → read files, do not override cmdline, trust all
1291 */
1292
1293 r = parse_boolean(optarg);
1294 if (r < 0) {
1295 if (streq(optarg, "trusted")) {
1296 mask_all_settings = false;
1297 mask_no_settings = false;
1298 arg_settings_trusted = true;
1299
1300 } else if (streq(optarg, "override")) {
1301 mask_all_settings = false;
1302 mask_no_settings = true;
1303 arg_settings_trusted = -1;
1304 } else
1305 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
1306 } else if (r > 0) {
1307 /* yes */
1308 mask_all_settings = false;
1309 mask_no_settings = false;
1310 arg_settings_trusted = -1;
1311 } else {
1312 /* no */
1313 mask_all_settings = true;
1314 mask_no_settings = false;
1315 arg_settings_trusted = false;
1316 }
1317
c6c8f6e2
LP
1318 break;
1319
5f932eb9 1320 case ARG_CHDIR:
baaa35ad
ZJS
1321 if (!path_is_absolute(optarg))
1322 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1323 "Working directory %s is not an absolute path.", optarg);
5f932eb9
LP
1324
1325 r = free_and_strdup(&arg_chdir, optarg);
1326 if (r < 0)
1327 return log_oom();
1328
1329 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
1330 break;
1331
b53ede69
PW
1332 case ARG_PIVOT_ROOT:
1333 r = pivot_root_parse(&arg_pivot_root_new, &arg_pivot_root_old, optarg);
1334 if (r < 0)
1335 return log_error_errno(r, "Failed to parse --pivot-root= argument %s: %m", optarg);
1336
1337 arg_settings_mask |= SETTING_PIVOT_ROOT;
1338 break;
1339
9c1e04d0
AP
1340 case ARG_NOTIFY_READY:
1341 r = parse_boolean(optarg);
baaa35ad
ZJS
1342 if (r < 0)
1343 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1344 "%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
9c1e04d0
AP
1345 arg_notify_ready = r;
1346 arg_settings_mask |= SETTING_NOTIFY_READY;
1347 break;
1348
4623e8e6 1349 case ARG_ROOT_HASH: {
89e62e0b 1350 _cleanup_free_ void *k = NULL;
4623e8e6
LP
1351 size_t l;
1352
1353 r = unhexmem(optarg, strlen(optarg), &k, &l);
1354 if (r < 0)
1355 return log_error_errno(r, "Failed to parse root hash: %s", optarg);
89e62e0b 1356 if (l < sizeof(sd_id128_t))
c6147113 1357 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Root hash must be at least 128bit long: %s", optarg);
4623e8e6 1358
89e62e0b
LP
1359 free_and_replace(arg_verity_settings.root_hash, k);
1360 arg_verity_settings.root_hash_size = l;
4623e8e6
LP
1361 break;
1362 }
1363
c2923fdc
LB
1364 case ARG_ROOT_HASH_SIG: {
1365 char *value;
89e62e0b
LP
1366 size_t l;
1367 void *p;
c2923fdc
LB
1368
1369 if ((value = startswith(optarg, "base64:"))) {
c2923fdc
LB
1370 r = unbase64mem(value, strlen(value), &p, &l);
1371 if (r < 0)
1372 return log_error_errno(r, "Failed to parse root hash signature '%s': %m", optarg);
1373
c2923fdc 1374 } else {
89e62e0b 1375 r = read_full_file(optarg, (char**) &p, &l);
c2923fdc 1376 if (r < 0)
89e62e0b 1377 return log_error_errno(r, "Failed parse root hash signature file '%s': %m", optarg);
c2923fdc
LB
1378 }
1379
89e62e0b
LP
1380 free_and_replace(arg_verity_settings.root_hash_sig, p);
1381 arg_verity_settings.root_hash_sig_size = l;
c2923fdc
LB
1382 break;
1383 }
1384
89e62e0b 1385 case ARG_VERITY_DATA:
614b022c 1386 r = parse_path_argument(optarg, false, &arg_verity_settings.data_path);
89e62e0b
LP
1387 if (r < 0)
1388 return r;
1389 break;
1390
960e4569
LP
1391 case ARG_SYSTEM_CALL_FILTER: {
1392 bool negative;
1393 const char *items;
1394
1395 negative = optarg[0] == '~';
1396 items = negative ? optarg + 1 : optarg;
1397
1398 for (;;) {
1399 _cleanup_free_ char *word = NULL;
1400
1401 r = extract_first_word(&items, &word, NULL, 0);
1402 if (r == 0)
1403 break;
1404 if (r == -ENOMEM)
1405 return log_oom();
1406 if (r < 0)
1407 return log_error_errno(r, "Failed to parse system call filter: %m");
1408
1409 if (negative)
6b000af4 1410 r = strv_extend(&arg_syscall_deny_list, word);
960e4569 1411 else
6b000af4 1412 r = strv_extend(&arg_syscall_allow_list, word);
960e4569
LP
1413 if (r < 0)
1414 return log_oom();
1415 }
1416
1417 arg_settings_mask |= SETTING_SYSCALL_FILTER;
1418 break;
1419 }
1420
bf428efb
LP
1421 case ARG_RLIMIT: {
1422 const char *eq;
622ecfa8 1423 _cleanup_free_ char *name = NULL;
bf428efb
LP
1424 int rl;
1425
5c828e66
LP
1426 if (streq(optarg, "help")) {
1427 DUMP_STRING_TABLE(rlimit, int, _RLIMIT_MAX);
1428 return 0;
1429 }
1430
bf428efb 1431 eq = strchr(optarg, '=');
baaa35ad
ZJS
1432 if (!eq)
1433 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1434 "--rlimit= expects an '=' assignment.");
bf428efb
LP
1435
1436 name = strndup(optarg, eq - optarg);
1437 if (!name)
1438 return log_oom();
1439
1440 rl = rlimit_from_string_harder(name);
baaa35ad 1441 if (rl < 0)
7211c853 1442 return log_error_errno(rl, "Unknown resource limit: %s", name);
bf428efb
LP
1443
1444 if (!arg_rlimit[rl]) {
1445 arg_rlimit[rl] = new0(struct rlimit, 1);
1446 if (!arg_rlimit[rl])
1447 return log_oom();
1448 }
1449
1450 r = rlimit_parse(rl, eq + 1, arg_rlimit[rl]);
1451 if (r < 0)
1452 return log_error_errno(r, "Failed to parse resource limit: %s", eq + 1);
1453
1454 arg_settings_mask |= SETTING_RLIMIT_FIRST << rl;
1455 break;
1456 }
1457
81f345df
LP
1458 case ARG_OOM_SCORE_ADJUST:
1459 r = parse_oom_score_adjust(optarg, &arg_oom_score_adjust);
1460 if (r < 0)
1461 return log_error_errno(r, "Failed to parse --oom-score-adjust= parameter: %s", optarg);
1462
1463 arg_oom_score_adjust_set = true;
1464 arg_settings_mask |= SETTING_OOM_SCORE_ADJUST;
1465 break;
1466
d107bb7d 1467 case ARG_CPU_AFFINITY: {
0985c7c4 1468 CPUSet cpuset;
d107bb7d
LP
1469
1470 r = parse_cpu_set(optarg, &cpuset);
1471 if (r < 0)
0985c7c4 1472 return log_error_errno(r, "Failed to parse CPU affinity mask %s: %m", optarg);
d107bb7d 1473
0985c7c4
ZJS
1474 cpu_set_reset(&arg_cpu_set);
1475 arg_cpu_set = cpuset;
d107bb7d
LP
1476 arg_settings_mask |= SETTING_CPU_AFFINITY;
1477 break;
1478 }
1479
09d423e9
LP
1480 case ARG_RESOLV_CONF:
1481 if (streq(optarg, "help")) {
1482 DUMP_STRING_TABLE(resolv_conf_mode, ResolvConfMode, _RESOLV_CONF_MODE_MAX);
1483 return 0;
1484 }
1485
1486 arg_resolv_conf = resolv_conf_mode_from_string(optarg);
baaa35ad 1487 if (arg_resolv_conf < 0)
7211c853 1488 return log_error_errno(arg_resolv_conf,
baaa35ad 1489 "Failed to parse /etc/resolv.conf mode: %s", optarg);
09d423e9
LP
1490
1491 arg_settings_mask |= SETTING_RESOLV_CONF;
1492 break;
1493
1688841f
LP
1494 case ARG_TIMEZONE:
1495 if (streq(optarg, "help")) {
1496 DUMP_STRING_TABLE(timezone_mode, TimezoneMode, _TIMEZONE_MODE_MAX);
1497 return 0;
1498 }
1499
1500 arg_timezone = timezone_mode_from_string(optarg);
baaa35ad 1501 if (arg_timezone < 0)
7211c853 1502 return log_error_errno(arg_timezone,
baaa35ad 1503 "Failed to parse /etc/localtime mode: %s", optarg);
1688841f
LP
1504
1505 arg_settings_mask |= SETTING_TIMEZONE;
1506 break;
1507
de40a303 1508 case ARG_CONSOLE:
dce66ffe
ZJS
1509 r = handle_arg_console(optarg);
1510 if (r <= 0)
1511 return r;
de40a303
LP
1512 break;
1513
1514 case 'P':
1515 case ARG_PIPE:
dce66ffe
ZJS
1516 r = handle_arg_console("pipe");
1517 if (r <= 0)
1518 return r;
de40a303
LP
1519 break;
1520
bb068de0
ZJS
1521 case ARG_NO_PAGER:
1522 arg_pager_flags |= PAGER_DISABLE;
1523 break;
1524
3652872a
LP
1525 case ARG_SET_CREDENTIAL: {
1526 _cleanup_free_ char *word = NULL, *data = NULL;
1527 const char *p = optarg;
1528 Credential *a;
1529 size_t i;
1530 int l;
1531
1532 r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
1533 if (r == -ENOMEM)
1534 return log_oom();
1535 if (r < 0)
1536 return log_error_errno(r, "Failed to parse --set-credential= parameter: %m");
1537 if (r == 0 || !p)
1538 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Missing value for --set-credential=: %s", optarg);
1539
1540 if (!credential_name_valid(word))
1541 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Credential name is not valid: %s", word);
1542
1543 for (i = 0; i < arg_n_credentials; i++)
1544 if (streq(arg_credentials[i].id, word))
1545 return log_error_errno(SYNTHETIC_ERRNO(EEXIST), "Duplicate credential '%s', refusing.", word);
1546
1547 l = cunescape(p, UNESCAPE_ACCEPT_NUL, &data);
1548 if (l < 0)
1549 return log_error_errno(l, "Failed to unescape credential data: %s", p);
1550
1551 a = reallocarray(arg_credentials, arg_n_credentials + 1, sizeof(Credential));
1552 if (!a)
1553 return log_oom();
1554
1555 a[arg_n_credentials++] = (Credential) {
1556 .id = TAKE_PTR(word),
1557 .data = TAKE_PTR(data),
1558 .size = l,
1559 };
1560
1561 arg_credentials = a;
1562
1563 arg_settings_mask |= SETTING_CREDENTIALS;
1564 break;
1565 }
1566
1567 case ARG_LOAD_CREDENTIAL: {
1568 ReadFullFileFlags flags = READ_FULL_FILE_SECURE;
1569 _cleanup_(erase_and_freep) char *data = NULL;
1570 _cleanup_free_ char *word = NULL, *j = NULL;
1571 const char *p = optarg;
1572 Credential *a;
1573 size_t size, i;
1574
1575 r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
1576 if (r == -ENOMEM)
1577 return log_oom();
1578 if (r < 0)
1579 return log_error_errno(r, "Failed to parse --set-credential= parameter: %m");
1580 if (r == 0 || !p)
1581 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Missing value for --set-credential=: %s", optarg);
1582
1583 if (!credential_name_valid(word))
1584 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Credential name is not valid: %s", word);
1585
1586 for (i = 0; i < arg_n_credentials; i++)
1587 if (streq(arg_credentials[i].id, word))
1588 return log_error_errno(SYNTHETIC_ERRNO(EEXIST), "Duplicate credential '%s', refusing.", word);
1589
1590 if (path_is_absolute(p))
1591 flags |= READ_FULL_FILE_CONNECT_SOCKET;
1592 else {
1593 const char *e;
1594
1595 e = getenv("CREDENTIALS_DIRECTORY");
1596 if (!e)
1597 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Credential not available (no credentials passed at all): %s", word);
1598
1599 j = path_join(e, p);
1600 if (!j)
1601 return log_oom();
1602 }
1603
986311c2
LP
1604 r = read_full_file_full(AT_FDCWD, j ?: p, UINT64_MAX, SIZE_MAX,
1605 flags,
1606 NULL,
1607 &data, &size);
3652872a
LP
1608 if (r < 0)
1609 return log_error_errno(r, "Failed to read credential '%s': %m", j ?: p);
1610
1611 a = reallocarray(arg_credentials, arg_n_credentials + 1, sizeof(Credential));
1612 if (!a)
1613 return log_oom();
1614
1615 a[arg_n_credentials++] = (Credential) {
1616 .id = TAKE_PTR(word),
1617 .data = TAKE_PTR(data),
1618 .size = size,
1619 };
1620
1621 arg_credentials = a;
1622
1623 arg_settings_mask |= SETTING_CREDENTIALS;
1624 break;
1625 }
1626
88213476
LP
1627 case '?':
1628 return -EINVAL;
1629
1630 default:
eb9da376 1631 assert_not_reached("Unhandled option");
88213476 1632 }
88213476 1633
60f1ec13
LP
1634 if (argc > optind) {
1635 strv_free(arg_parameters);
1636 arg_parameters = strv_copy(argv + optind);
1637 if (!arg_parameters)
1638 return log_oom();
d7bea6b6 1639
60f1ec13
LP
1640 arg_settings_mask |= SETTING_START_MODE;
1641 }
1642
1643 if (arg_ephemeral && arg_template && !arg_directory)
1644 /* User asked for ephemeral execution but specified --template= instead of --directory=. Semantically
1645 * such an invocation makes some sense, see https://github.com/systemd/systemd/issues/3667. Let's
1646 * accept this here, and silently make "--ephemeral --template=" equivalent to "--ephemeral
1647 * --directory=". */
1648 arg_directory = TAKE_PTR(arg_template);
1649
bd4b15f2 1650 arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? UINT64_C(1) << CAP_NET_ADMIN : 0)) & ~minus;
60f1ec13 1651
de40a303 1652 /* Make sure to parse environment before we reset the settings mask below */
49048684
ZJS
1653 r = parse_environment();
1654 if (r < 0)
1655 return r;
de40a303 1656
60f1ec13
LP
1657 /* Load all settings from .nspawn files */
1658 if (mask_no_settings)
1659 arg_settings_mask = 0;
1660
1661 /* Don't load any settings from .nspawn files */
1662 if (mask_all_settings)
1663 arg_settings_mask = _SETTINGS_MASK_ALL;
1664
1665 return 1;
1666}
1667
1668static int verify_arguments(void) {
1669 int r;
a6b5216c 1670
75b0d8b8
ZJS
1671 if (arg_start_mode == START_PID2 && arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
1672 /* If we are running the stub init in the container, we don't need to look at what the init
1673 * in the container supports, because we are not using it. Let's immediately pick the right
1674 * setting based on the host system configuration.
1675 *
1676 * We only do this, if the user didn't use an environment variable to override the detection.
1677 */
1678
1679 r = cg_all_unified();
1680 if (r < 0)
1681 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
1682 if (r > 0)
1683 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
1684 else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0)
1685 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
1686 else
1687 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
1688 }
1689
4f086aab
SU
1690 if (arg_userns_mode != USER_NAMESPACE_NO)
1691 arg_mount_settings |= MOUNT_USE_USERNS;
1692
1693 if (arg_private_network)
1694 arg_mount_settings |= MOUNT_APPLY_APIVFS_NETNS;
1695
48a8d337
LB
1696 if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
1697 !(arg_clone_ns_flags & CLONE_NEWUTS)) {
eb91eb18 1698 arg_register = false;
baaa35ad 1699 if (arg_start_mode != START_PID1)
60f1ec13 1700 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--boot cannot be used without namespacing.");
0c582db0 1701 }
eb91eb18 1702
0de7acce 1703 if (arg_userns_mode == USER_NAMESPACE_PICK)
0e7ac751
LP
1704 arg_userns_chown = true;
1705
60f1ec13
LP
1706 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
1707 arg_kill_signal = SIGRTMIN+3;
1708
e5a4bb0d
LP
1709 if (arg_volatile_mode != VOLATILE_NO) /* Make sure all file systems contained in the image are mounted read-only if we are in volatile mode */
1710 arg_read_only = true;
1711
2436ea76
DDM
1712 if (has_custom_root_mount(arg_custom_mounts, arg_n_custom_mounts))
1713 arg_read_only = true;
1714
baaa35ad 1715 if (arg_keep_unit && arg_register && cg_pid_get_owner_uid(0, NULL) >= 0)
8d9c2bca
AJ
1716 /* Save the user from accidentally registering either user-$SESSION.scope or user@.service.
1717 * The latter is not technically a user session, but we don't need to labour the point. */
60f1ec13 1718 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--keep-unit --register=yes may not be used when invoked from a user session.");
89f7c846 1719
baaa35ad 1720 if (arg_directory && arg_image)
60f1ec13 1721 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--directory= and --image= may not be combined.");
1b9e5b12 1722
baaa35ad 1723 if (arg_template && arg_image)
60f1ec13 1724 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= and --image= may not be combined.");
8cd328d8 1725
baaa35ad 1726 if (arg_template && !(arg_directory || arg_machine))
60f1ec13 1727 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= needs --directory= or --machine=.");
ec16945e 1728
baaa35ad 1729 if (arg_ephemeral && arg_template)
60f1ec13 1730 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --template= may not be combined.");
ec16945e 1731
baaa35ad 1732 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO))
60f1ec13 1733 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --link-journal= may not be combined.");
df9a75e4 1734
baaa35ad 1735 if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported())
60f1ec13 1736 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--private-users= is not supported, kernel compiled without user namespace support.");
7336138e 1737
baaa35ad 1738 if (arg_userns_chown && arg_read_only)
de40a303
LP
1739 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1740 "--read-only and --private-users-chown may not be combined.");
f757855e 1741
e5a4bb0d
LP
1742 /* We don't support --private-users-chown together with any of the volatile modes since we couldn't
1743 * change the read-only part of the tree (i.e. /usr) anyway, or because it would trigger a massive
5238e957 1744 * copy-up (in case of overlay) making the entire exercise pointless. */
e5a4bb0d
LP
1745 if (arg_userns_chown && arg_volatile_mode != VOLATILE_NO)
1746 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--volatile= and --private-users-chown may not be combined.");
1747
679ecd36
SZ
1748 /* If --network-namespace-path is given with any other network-related option (except --private-network),
1749 * we need to error out, to avoid conflicts between different network options. */
60f1ec13
LP
1750 if (arg_network_namespace_path &&
1751 (arg_network_interfaces || arg_network_macvlan ||
1752 arg_network_ipvlan || arg_network_veth_extra ||
1753 arg_network_bridge || arg_network_zone ||
679ecd36 1754 arg_network_veth))
de40a303 1755 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--network-namespace-path= cannot be combined with other network options.");
86c0dd4a 1756
60f1ec13 1757 if (arg_network_bridge && arg_network_zone)
de40a303
LP
1758 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1759 "--network-bridge= and --network-zone= may not be combined.");
f757855e 1760
baaa35ad 1761 if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network)
60f1ec13 1762 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
4f086aab 1763
baaa35ad 1764 if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO))
60f1ec13 1765 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot combine --private-users with read-write mounts.");
f757855e 1766
baaa35ad 1767 if (arg_expose_ports && !arg_private_network)
60f1ec13 1768 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot use --port= without private networking.");
6d0b55c2 1769
88fc9c9b 1770 if (arg_caps_ambient) {
f5fbe71d 1771 if (arg_caps_ambient == UINT64_MAX)
88fc9c9b
TH
1772 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= does not support the value all.");
1773
1774 if ((arg_caps_ambient & arg_caps_retain) != arg_caps_ambient)
1775 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= setting is not fully covered by Capability= setting.");
1776
1777 if (arg_start_mode == START_BOOT)
1778 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= setting is not useful for boot mode.");
1779 }
1780
60f1ec13
LP
1781 r = custom_mount_check_all();
1782 if (r < 0)
1783 return r;
c6c8f6e2 1784
f757855e 1785 return 0;
88213476
LP
1786}
1787
03cfe0d5
LP
1788static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1789 assert(p);
1790
0de7acce 1791 if (arg_userns_mode == USER_NAMESPACE_NO)
03cfe0d5
LP
1792 return 0;
1793
1794 if (uid == UID_INVALID && gid == GID_INVALID)
1795 return 0;
1796
1797 if (uid != UID_INVALID) {
1798 uid += arg_uid_shift;
1799
1800 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1801 return -EOVERFLOW;
1802 }
1803
1804 if (gid != GID_INVALID) {
1805 gid += (gid_t) arg_uid_shift;
1806
1807 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1808 return -EOVERFLOW;
1809 }
1810
1811 if (lchown(p, uid, gid) < 0)
1812 return -errno;
b12afc8c
LP
1813
1814 return 0;
1815}
1816
03cfe0d5
LP
/* Creates directory 'path' below 'root' with the given mode and, if it was newly created, adjusts its
 * ownership via userns_lchown(). A directory that already exists is left untouched and counts as
 * success. Returns 0 on success, a negative errno-style value otherwise. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;
        int r;

        full = prefix_roota(root, path);

        r = mkdir_errno_wrapper(full, mode);
        if (r >= 0)
                return userns_lchown(full, uid, gid);

        /* Already there: do not touch the ownership of a pre-existing entry */
        if (r == -EEXIST)
                return 0;

        return r;
}
1830
/* Matches 'path' against the two zoneinfo prefixes we know about (relative and absolute) and returns
 * the result of PATH_STARTSWITH_SET(). Callers in this file treat NULL as "not inside
 * /usr/share/zoneinfo/" and a non-NULL result as the zone name following the prefix. */
static const char *timezone_from_path(const char *path) {
        const char *zone;

        zone = PATH_STARTSWITH_SET(
                        path,
                        "../usr/share/zoneinfo/",
                        "/usr/share/zoneinfo/");

        return zone;
}
1837
83205269
LP
1838static bool etc_writable(void) {
1839 return !arg_read_only || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_OVERLAY);
1840}
1841
e58a1277 1842static int setup_timezone(const char *dest) {
1688841f
LP
1843 _cleanup_free_ char *p = NULL, *etc = NULL;
1844 const char *where, *check;
1845 TimezoneMode m;
d4036145 1846 int r;
f8440af5 1847
e58a1277
LP
1848 assert(dest);
1849
1688841f 1850 if (IN_SET(arg_timezone, TIMEZONE_AUTO, TIMEZONE_SYMLINK)) {
1688841f
LP
1851 r = readlink_malloc("/etc/localtime", &p);
1852 if (r == -ENOENT && arg_timezone == TIMEZONE_AUTO)
83205269 1853 m = etc_writable() ? TIMEZONE_DELETE : TIMEZONE_OFF;
1688841f 1854 else if (r == -EINVAL && arg_timezone == TIMEZONE_AUTO) /* regular file? */
83205269 1855 m = etc_writable() ? TIMEZONE_COPY : TIMEZONE_BIND;
1688841f
LP
1856 else if (r < 0) {
1857 log_warning_errno(r, "Failed to read host's /etc/localtime symlink, not updating container timezone: %m");
1858 /* To handle warning, delete /etc/localtime and replace it with a symbolic link to a time zone data
1859 * file.
1860 *
1861 * Example:
1862 * ln -s /usr/share/zoneinfo/UTC /etc/localtime
1863 */
1864 return 0;
1865 } else if (arg_timezone == TIMEZONE_AUTO)
83205269 1866 m = etc_writable() ? TIMEZONE_SYMLINK : TIMEZONE_BIND;
1688841f
LP
1867 else
1868 m = arg_timezone;
1869 } else
1870 m = arg_timezone;
1871
1872 if (m == TIMEZONE_OFF)
1873 return 0;
1874
a5648b80 1875 r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
d4036145 1876 if (r < 0) {
1688841f 1877 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
d4036145
LP
1878 return 0;
1879 }
1880
1688841f
LP
1881 where = strjoina(etc, "/localtime");
1882
1883 switch (m) {
1884
1885 case TIMEZONE_DELETE:
1886 if (unlink(where) < 0)
1887 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
1888
d4036145 1889 return 0;
d4036145 1890
1688841f
LP
1891 case TIMEZONE_SYMLINK: {
1892 _cleanup_free_ char *q = NULL;
1893 const char *z, *what;
4d1c38b8 1894
1688841f
LP
1895 z = timezone_from_path(p);
1896 if (!z) {
1897 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
d4036145 1898 return 0;
1688841f 1899 }
d4036145 1900
1688841f
LP
1901 r = readlink_malloc(where, &q);
1902 if (r >= 0 && streq_ptr(timezone_from_path(q), z))
1903 return 0; /* Already pointing to the right place? Then do nothing .. */
1904
1905 check = strjoina(dest, "/usr/share/zoneinfo/", z);
a5648b80 1906 r = chase_symlinks(check, dest, 0, NULL, NULL);
1688841f
LP
1907 if (r < 0)
1908 log_debug_errno(r, "Timezone %s does not exist (or is not accessible) in container, not creating symlink: %m", z);
1909 else {
1910 if (unlink(where) < 0 && errno != ENOENT) {
1911 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING, /* Don't complain on read-only images */
1912 errno, "Failed to remove existing timezone info %s in container, ignoring: %m", where);
1913 return 0;
1914 }
1915
1916 what = strjoina("../usr/share/zoneinfo/", z);
1917 if (symlink(what, where) < 0) {
1918 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING,
1919 errno, "Failed to correct timezone of container, ignoring: %m");
1920 return 0;
1921 }
1922
1923 break;
1924 }
1925
1926 _fallthrough_;
d4036145 1927 }
68fb0892 1928
1688841f
LP
1929 case TIMEZONE_BIND: {
1930 _cleanup_free_ char *resolved = NULL;
1931 int found;
1932
a5648b80 1933 found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved, NULL);
1688841f
LP
1934 if (found < 0) {
1935 log_warning_errno(found, "Failed to resolve /etc/localtime path in container, ignoring: %m");
1936 return 0;
1937 }
1938
1939 if (found == 0) /* missing? */
1940 (void) touch(resolved);
1941
511a8cfe 1942 r = mount_nofollow_verbose(LOG_WARNING, "/etc/localtime", resolved, NULL, MS_BIND, NULL);
1688841f 1943 if (r >= 0)
511a8cfe 1944 return mount_nofollow_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
1688841f
LP
1945
1946 _fallthrough_;
79d80fc1 1947 }
4d9f07b4 1948
1688841f
LP
1949 case TIMEZONE_COPY:
1950 /* If mounting failed, try to copy */
8a016c74 1951 r = copy_file_atomic("/etc/localtime", where, 0644, 0, 0, COPY_REFLINK|COPY_REPLACE);
1688841f
LP
1952 if (r < 0) {
1953 log_full_errno(IN_SET(r, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
1954 "Failed to copy /etc/localtime to %s, ignoring: %m", where);
1955 return 0;
1956 }
1957
1958 break;
1959
1960 default:
1961 assert_not_reached("unexpected mode");
d4036145 1962 }
e58a1277 1963
1688841f 1964 /* Fix permissions of the symlink or file copy we just created */
03cfe0d5
LP
1965 r = userns_lchown(where, 0, 0);
1966 if (r < 0)
1688841f 1967 log_warning_errno(r, "Failed to chown /etc/localtime, ignoring: %m");
03cfe0d5 1968
e58a1277 1969 return 0;
88213476
LP
1970}
1971
09d423e9
LP
/* Checks whether 'path' exists. Returns 1 if access(F_OK) succeeds, 0 if it fails with ENOENT, and
 * otherwise the return value of log_debug_errno() for the failure. */
static int have_resolv_conf(const char *path) {
        assert(path);

        if (access(path, F_OK) >= 0)
                return 1;

        if (errno == ENOENT)
                return 0;

        return log_debug_errno(errno, "Failed to determine whether '%s' is available: %m", path);
}
1984
7357272e 1985static int resolved_listening(void) {
b8ea7a6e 1986 _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
b053cd5f 1987 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
7357272e 1988 _cleanup_free_ char *dns_stub_listener_mode = NULL;
b053cd5f
LP
1989 int r;
1990
7357272e 1991 /* Check if resolved is listening */
b053cd5f
LP
1992
1993 r = sd_bus_open_system(&bus);
1994 if (r < 0)
b8ea7a6e 1995 return log_debug_errno(r, "Failed to open system bus: %m");
b053cd5f 1996
7357272e 1997 r = bus_name_has_owner(bus, "org.freedesktop.resolve1", NULL);
b8ea7a6e
LP
1998 if (r < 0)
1999 return log_debug_errno(r, "Failed to check whether the 'org.freedesktop.resolve1' bus name is taken: %m");
2000 if (r == 0)
2001 return 0;
7357272e
DM
2002
2003 r = sd_bus_get_property_string(bus,
2004 "org.freedesktop.resolve1",
2005 "/org/freedesktop/resolve1",
2006 "org.freedesktop.resolve1.Manager",
2007 "DNSStubListener",
b8ea7a6e 2008 &error,
7357272e
DM
2009 &dns_stub_listener_mode);
2010 if (r < 0)
b8ea7a6e 2011 return log_debug_errno(r, "Failed to query DNSStubListener property: %s", bus_error_message(&error, r));
7357272e
DM
2012
2013 return STR_IN_SET(dns_stub_listener_mode, "udp", "yes");
b053cd5f
LP
2014}
2015
2547bb41 2016static int setup_resolv_conf(const char *dest) {
09d423e9
LP
2017 _cleanup_free_ char *etc = NULL;
2018 const char *where, *what;
2019 ResolvConfMode m;
2020 int r;
2547bb41
LP
2021
2022 assert(dest);
2023
09d423e9
LP
2024 if (arg_resolv_conf == RESOLV_CONF_AUTO) {
2025 if (arg_private_network)
2026 m = RESOLV_CONF_OFF;
86775e35
LP
2027 else if (have_resolv_conf(PRIVATE_STUB_RESOLV_CONF) > 0 && resolved_listening() > 0)
2028 m = etc_writable() ? RESOLV_CONF_COPY_STUB : RESOLV_CONF_BIND_STUB;
09d423e9 2029 else if (have_resolv_conf("/etc/resolv.conf") > 0)
83205269 2030 m = etc_writable() ? RESOLV_CONF_COPY_HOST : RESOLV_CONF_BIND_HOST;
09d423e9 2031 else
83205269 2032 m = etc_writable() ? RESOLV_CONF_DELETE : RESOLV_CONF_OFF;
86775e35 2033
09d423e9
LP
2034 } else
2035 m = arg_resolv_conf;
2036
2037 if (m == RESOLV_CONF_OFF)
2547bb41
LP
2038 return 0;
2039
a5648b80 2040 r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
87447ae4
LP
2041 if (r < 0) {
2042 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
2043 return 0;
2044 }
2045
2046 where = strjoina(etc, "/resolv.conf");
09d423e9
LP
2047
2048 if (m == RESOLV_CONF_DELETE) {
2049 if (unlink(where) < 0)
2050 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
2051
87447ae4
LP
2052 return 0;
2053 }
79d80fc1 2054
86775e35
LP
2055 if (IN_SET(m, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_REPLACE_STATIC, RESOLV_CONF_COPY_STATIC))
2056 what = PRIVATE_STATIC_RESOLV_CONF;
2057 else if (IN_SET(m, RESOLV_CONF_BIND_UPLINK, RESOLV_CONF_REPLACE_UPLINK, RESOLV_CONF_COPY_UPLINK))
2058 what = PRIVATE_UPLINK_RESOLV_CONF;
2059 else if (IN_SET(m, RESOLV_CONF_BIND_STUB, RESOLV_CONF_REPLACE_STUB, RESOLV_CONF_COPY_STUB))
2060 what = PRIVATE_STUB_RESOLV_CONF;
09d423e9
LP
2061 else
2062 what = "/etc/resolv.conf";
87447ae4 2063
86775e35 2064 if (IN_SET(m, RESOLV_CONF_BIND_HOST, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_BIND_UPLINK, RESOLV_CONF_BIND_STUB)) {
09d423e9
LP
2065 _cleanup_free_ char *resolved = NULL;
2066 int found;
2067
a5648b80 2068 found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved, NULL);
09d423e9
LP
2069 if (found < 0) {
2070 log_warning_errno(found, "Failed to resolve /etc/resolv.conf path in container, ignoring: %m");
2071 return 0;
2072 }
3539724c 2073
87447ae4
LP
2074 if (found == 0) /* missing? */
2075 (void) touch(resolved);
5367354d 2076
511a8cfe 2077 r = mount_nofollow_verbose(LOG_WARNING, what, resolved, NULL, MS_BIND, NULL);
60e76d48 2078 if (r >= 0)
511a8cfe 2079 return mount_nofollow_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
86775e35
LP
2080
2081 /* If that didn't work, let's copy the file */
3539724c
LP
2082 }
2083
86775e35
LP
2084 if (IN_SET(m, RESOLV_CONF_REPLACE_HOST, RESOLV_CONF_REPLACE_STATIC, RESOLV_CONF_REPLACE_UPLINK, RESOLV_CONF_REPLACE_STUB))
2085 r = copy_file_atomic(what, where, 0644, 0, 0, COPY_REFLINK|COPY_REPLACE);
2086 else
2087 r = copy_file(what, where, O_TRUNC|O_NOFOLLOW, 0644, 0, 0, COPY_REFLINK);
79d80fc1 2088 if (r < 0) {
3539724c
LP
2089 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
2090 * resolved or something similar runs inside and the symlink points there.
68a313c5 2091 *
3539724c 2092 * If the disk image is read-only, there's also no point in complaining.
68a313c5 2093 */
86775e35
LP
2094 log_full_errno(!IN_SET(RESOLV_CONF_COPY_HOST, RESOLV_CONF_COPY_STATIC, RESOLV_CONF_COPY_UPLINK, RESOLV_CONF_COPY_STUB) &&
2095 IN_SET(r, -ELOOP, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
3539724c 2096 "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where);
79d80fc1
TG
2097 return 0;
2098 }
2547bb41 2099
03cfe0d5
LP
2100 r = userns_lchown(where, 0, 0);
2101 if (r < 0)
3539724c 2102 log_warning_errno(r, "Failed to chown /etc/resolv.conf, ignoring: %m");
03cfe0d5 2103
2547bb41
LP
2104 return 0;
2105}
2106
1e4f1671 2107static int setup_boot_id(void) {
cdde6ba6
LP
2108 _cleanup_(unlink_and_freep) char *from = NULL;
2109 _cleanup_free_ char *path = NULL;
3bbaff3e 2110 sd_id128_t rnd = SD_ID128_NULL;
cdde6ba6 2111 const char *to;
04bc4a3f
LP
2112 int r;
2113
1eacc470 2114 /* Generate a new randomized boot ID, so that each boot-up of the container gets a new one */
04bc4a3f 2115
1eacc470 2116 r = tempfn_random_child("/run", "proc-sys-kernel-random-boot-id", &path);
cdde6ba6
LP
2117 if (r < 0)
2118 return log_error_errno(r, "Failed to generate random boot ID path: %m");
04bc4a3f
LP
2119
2120 r = sd_id128_randomize(&rnd);
f647962d
MS
2121 if (r < 0)
2122 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 2123
cdde6ba6 2124 r = id128_write(path, ID128_UUID, rnd, false);
f647962d
MS
2125 if (r < 0)
2126 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f 2127
cdde6ba6
LP
2128 from = TAKE_PTR(path);
2129 to = "/proc/sys/kernel/random/boot_id";
2130
511a8cfe 2131 r = mount_nofollow_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
cdde6ba6
LP
2132 if (r < 0)
2133 return r;
04bc4a3f 2134
511a8cfe 2135 return mount_nofollow_verbose(LOG_ERR, NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
04bc4a3f
LP
2136}
2137
e58a1277 2138static int copy_devnodes(const char *dest) {
88213476
LP
2139 static const char devnodes[] =
2140 "null\0"
2141 "zero\0"
2142 "full\0"
2143 "random\0"
2144 "urandom\0"
85614d66
TG
2145 "tty\0"
2146 "net/tun\0";
88213476 2147
de40a303 2148 _cleanup_umask_ mode_t u;
88213476 2149 const char *d;
e58a1277 2150 int r = 0;
a258bf26
LP
2151
2152 assert(dest);
124640f1
LP
2153
2154 u = umask(0000);
88213476 2155
03cfe0d5
LP
2156 /* Create /dev/net, so that we can create /dev/net/tun in it */
2157 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
2158 return log_error_errno(r, "Failed to create /dev/net directory: %m");
2159
88213476 2160 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 2161 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 2162 struct stat st;
88213476 2163
c6134d3e 2164 from = path_join("/dev/", d);
8967f291
LP
2165 if (!from)
2166 return log_oom();
2167
c6134d3e 2168 to = path_join(dest, from);
8967f291
LP
2169 if (!to)
2170 return log_oom();
88213476
LP
2171
2172 if (stat(from, &st) < 0) {
2173
4a62c710
MS
2174 if (errno != ENOENT)
2175 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 2176
baaa35ad
ZJS
2177 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode))
2178 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2179 "%s is not a char or block device, cannot copy.", from);
2180 else {
8dfce114
LP
2181 _cleanup_free_ char *sl = NULL, *prefixed = NULL, *dn = NULL, *t = NULL;
2182
81f5049b 2183 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
8dbf71ec 2184 /* Explicitly warn the user when /dev is already populated. */
41eb4362 2185 if (errno == EEXIST)
8dbf71ec 2186 log_notice("%s/dev is pre-mounted and pre-populated. If a pre-mounted /dev is provided it needs to be an unpopulated file system.", dest);
81f5049b
AC
2187 if (errno != EPERM)
2188 return log_error_errno(errno, "mknod(%s) failed: %m", to);
2189
8dfce114 2190 /* Some systems abusively restrict mknod but allow bind mounts. */
81f5049b
AC
2191 r = touch(to);
2192 if (r < 0)
2193 return log_error_errno(r, "touch (%s) failed: %m", to);
511a8cfe 2194 r = mount_nofollow_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL);
60e76d48
ZJS
2195 if (r < 0)
2196 return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to);
81f5049b 2197 }
6278cf60 2198
03cfe0d5
LP
2199 r = userns_lchown(to, 0, 0);
2200 if (r < 0)
2201 return log_error_errno(r, "chown() of device node %s failed: %m", to);
8dfce114 2202
657ee2d8 2203 dn = path_join("/dev", S_ISCHR(st.st_mode) ? "char" : "block");
8dfce114
LP
2204 if (!dn)
2205 return log_oom();
2206
2207 r = userns_mkdir(dest, dn, 0755, 0, 0);
2208 if (r < 0)
2209 return log_error_errno(r, "Failed to create '%s': %m", dn);
2210
2211 if (asprintf(&sl, "%s/%u:%u", dn, major(st.st_rdev), minor(st.st_rdev)) < 0)
2212 return log_oom();
2213
c6134d3e 2214 prefixed = path_join(dest, sl);
8dfce114
LP
2215 if (!prefixed)
2216 return log_oom();
2217
2d9b74ba 2218 t = path_join("..", d);
8dfce114
LP
2219 if (!t)
2220 return log_oom();
2221
2222 if (symlink(t, prefixed) < 0)
2223 log_debug_errno(errno, "Failed to symlink '%s' to '%s': %m", t, prefixed);
88213476 2224 }
88213476
LP
2225 }
2226
e58a1277
LP
2227 return r;
2228}
88213476 2229
de40a303
LP
2230static int make_extra_nodes(const char *dest) {
2231 _cleanup_umask_ mode_t u;
2232 size_t i;
2233 int r;
2234
2235 u = umask(0000);
2236
2237 for (i = 0; i < arg_n_extra_nodes; i++) {
2238 _cleanup_free_ char *path = NULL;
2239 DeviceNode *n = arg_extra_nodes + i;
2240
c6134d3e 2241 path = path_join(dest, n->path);
de40a303
LP
2242 if (!path)
2243 return log_oom();
2244
2245 if (mknod(path, n->mode, S_ISCHR(n->mode) || S_ISBLK(n->mode) ? makedev(n->major, n->minor) : 0) < 0)
2246 return log_error_errno(errno, "Failed to create device node '%s': %m", path);
2247
2248 r = chmod_and_chown(path, n->mode, n->uid, n->gid);
2249 if (r < 0)
2250 return log_error_errno(r, "Failed to adjust device node ownership of '%s': %m", path);
2251 }
2252
2253 return 0;
2254}
2255
03cfe0d5
LP
2256static int setup_pts(const char *dest) {
2257 _cleanup_free_ char *options = NULL;
2258 const char *p;
709f6e46 2259 int r;
03cfe0d5 2260
349cc4a5 2261#if HAVE_SELINUX
03cfe0d5
LP
2262 if (arg_selinux_apifs_context)
2263 (void) asprintf(&options,
3dce8915 2264 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
03cfe0d5
LP
2265 arg_uid_shift + TTY_GID,
2266 arg_selinux_apifs_context);
2267 else
2268#endif
2269 (void) asprintf(&options,
3dce8915 2270 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
03cfe0d5 2271 arg_uid_shift + TTY_GID);
f2d88580 2272
03cfe0d5 2273 if (!options)
f2d88580
LP
2274 return log_oom();
2275
03cfe0d5 2276 /* Mount /dev/pts itself */
cc9fce65 2277 p = prefix_roota(dest, "/dev/pts");
dae8b82e
ZJS
2278 r = mkdir_errno_wrapper(p, 0755);
2279 if (r < 0)
2280 return log_error_errno(r, "Failed to create /dev/pts: %m");
2281
511a8cfe 2282 r = mount_nofollow_verbose(LOG_ERR, "devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options);
60e76d48
ZJS
2283 if (r < 0)
2284 return r;
709f6e46
MS
2285 r = userns_lchown(p, 0, 0);
2286 if (r < 0)
2287 return log_error_errno(r, "Failed to chown /dev/pts: %m");
03cfe0d5
LP
2288
2289 /* Create /dev/ptmx symlink */
2290 p = prefix_roota(dest, "/dev/ptmx");
4a62c710
MS
2291 if (symlink("pts/ptmx", p) < 0)
2292 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
709f6e46
MS
2293 r = userns_lchown(p, 0, 0);
2294 if (r < 0)
2295 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
f2d88580 2296
03cfe0d5
LP
2297 /* And fix /dev/pts/ptmx ownership */
2298 p = prefix_roota(dest, "/dev/pts/ptmx");
709f6e46
MS
2299 r = userns_lchown(p, 0, 0);
2300 if (r < 0)
2301 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
6278cf60 2302
f2d88580
LP
2303 return 0;
2304}
2305
3acc84eb 2306static int setup_stdio_as_dev_console(void) {
2fef50cd 2307 _cleanup_close_ int terminal = -1;
e58a1277 2308 int r;
e58a1277 2309
335d2ead
LP
2310 /* We open the TTY in O_NOCTTY mode, so that we do not become controller yet. We'll do that later
2311 * explicitly, if we are configured to. */
2312 terminal = open_terminal("/dev/console", O_RDWR|O_NOCTTY);
3acc84eb
FB
2313 if (terminal < 0)
2314 return log_error_errno(terminal, "Failed to open console: %m");
e58a1277 2315
3acc84eb
FB
2316 /* Make sure we can continue logging to the original stderr, even if
2317 * stderr points elsewhere now */
2318 r = log_dup_console();
2319 if (r < 0)
2320 return log_error_errno(r, "Failed to duplicate stderr: %m");
de40a303 2321
3acc84eb
FB
2322 /* invalidates 'terminal' on success and failure */
2323 r = rearrange_stdio(terminal, terminal, terminal);
2fef50cd 2324 TAKE_FD(terminal);
f647962d 2325 if (r < 0)
3acc84eb
FB
2326 return log_error_errno(r, "Failed to move console to stdin/stdout/stderr: %m");
2327
2328 return 0;
2329}
88213476 2330
3acc84eb
FB
2331static int setup_dev_console(const char *console) {
2332 _cleanup_free_ char *p = NULL;
2333 int r;
a258bf26 2334
3acc84eb
FB
2335 /* Create /dev/console symlink */
2336 r = path_make_relative("/dev", console, &p);
81f5049b 2337 if (r < 0)
3acc84eb
FB
2338 return log_error_errno(r, "Failed to create relative path: %m");
2339
2340 if (symlink(p, "/dev/console") < 0)
2341 return log_error_errno(errno, "Failed to create /dev/console symlink: %m");
a258bf26 2342
3acc84eb 2343 return 0;
e58a1277
LP
2344}
2345
8e5430c4
LP
2346static int setup_keyring(void) {
2347 key_serial_t keyring;
2348
6b000af4
LP
2349 /* Allocate a new session keyring for the container. This makes sure the keyring of the session
2350 * systemd-nspawn was invoked from doesn't leak into the container. Note that by default we block
2351 * keyctl() and request_key() anyway via seccomp so doing this operation isn't strictly necessary,
2352 * but in case people explicitly allow-list these system calls let's make sure we don't leak anything
2353 * into the container. */
8e5430c4
LP
2354
2355 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2356 if (keyring == -1) {
2357 if (errno == ENOSYS)
2358 log_debug_errno(errno, "Kernel keyring not supported, ignoring.");
065b4774 2359 else if (ERRNO_IS_PRIVILEGE(errno))
8e5430c4
LP
2360 log_debug_errno(errno, "Kernel keyring access prohibited, ignoring.");
2361 else
2362 return log_error_errno(errno, "Setting up kernel keyring failed: %m");
2363 }
2364
2365 return 0;
2366}
2367
3652872a
LP
2368static int setup_credentials(const char *root) {
2369 const char *q;
2370 int r;
2371
2372 if (arg_n_credentials <= 0)
2373 return 0;
2374
2375 r = userns_mkdir(root, "/run/host", 0755, 0, 0);
2376 if (r < 0)
2377 return log_error_errno(r, "Failed to create /run/host: %m");
2378
2379 r = userns_mkdir(root, "/run/host/credentials", 0700, 0, 0);
2380 if (r < 0)
2381 return log_error_errno(r, "Failed to create /run/host/credentials: %m");
2382
2383 q = prefix_roota(root, "/run/host/credentials");
511a8cfe 2384 r = mount_nofollow_verbose(LOG_ERR, NULL, q, "ramfs", MS_NOSUID|MS_NOEXEC|MS_NODEV, "mode=0700");
3652872a
LP
2385 if (r < 0)
2386 return r;
2387
2388 for (size_t i = 0; i < arg_n_credentials; i++) {
2389 _cleanup_free_ char *j = NULL;
2390 _cleanup_close_ int fd = -1;
2391
2392 j = path_join(q, arg_credentials[i].id);
2393 if (!j)
2394 return log_oom();
2395
2396 fd = open(j, O_CREAT|O_EXCL|O_WRONLY|O_CLOEXEC|O_NOFOLLOW, 0600);
2397 if (fd < 0)
2398 return log_error_errno(errno, "Failed to create credential file %s: %m", j);
2399
2400 r = loop_write(fd, arg_credentials[i].data, arg_credentials[i].size, /* do_poll= */ false);
2401 if (r < 0)
2402 return log_error_errno(r, "Failed to write credential to file %s: %m", j);
2403
2404 if (fchmod(fd, 0400) < 0)
2405 return log_error_errno(errno, "Failed to adjust access mode of %s: %m", j);
2406
2407 if (arg_userns_mode != USER_NAMESPACE_NO) {
2408 if (fchown(fd, arg_uid_shift, arg_uid_shift) < 0)
2409 return log_error_errno(errno, "Failed to adjust ownership of %s: %m", j);
2410 }
2411 }
2412
2413 if (chmod(q, 0500) < 0)
2414 return log_error_errno(errno, "Failed to adjust access mode of %s: %m", q);
2415
2416 r = userns_lchown(q, 0, 0);
2417 if (r < 0)
2418 return r;
2419
2420 /* Make both mount and superblock read-only now */
511a8cfe 2421 r = mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_REMOUNT|MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
3652872a
LP
2422 if (r < 0)
2423 return r;
2424
511a8cfe 2425 return mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, "mode=0500");
3652872a
LP
2426}
2427
1e4f1671 2428static int setup_kmsg(int kmsg_socket) {
9ec5a93c
LP
2429 _cleanup_(unlink_and_freep) char *from = NULL;
2430 _cleanup_free_ char *fifo = NULL;
2431 _cleanup_close_ int fd = -1;
7fd1b19b 2432 _cleanup_umask_ mode_t u;
9ec5a93c 2433 int r;
e58a1277 2434
e58a1277 2435 assert(kmsg_socket >= 0);
a258bf26 2436
e58a1277 2437 u = umask(0000);
a258bf26 2438
1eacc470 2439 /* We create the kmsg FIFO as as temporary file in /run, but immediately delete it after bind mounting it to
9ec5a93c
LP
2440 * /proc/kmsg. While FIFOs on the reading side behave very similar to /proc/kmsg, their writing side behaves
2441 * differently from /dev/kmsg in that writing blocks when nothing is reading. In order to avoid any problems
2442 * with containers deadlocking due to this we simply make /dev/kmsg unavailable to the container. */
2443
1eacc470 2444 r = tempfn_random_child("/run", "proc-kmsg", &fifo);
9ec5a93c
LP
2445 if (r < 0)
2446 return log_error_errno(r, "Failed to generate kmsg path: %m");
e58a1277 2447
9ec5a93c 2448 if (mkfifo(fifo, 0600) < 0)
03cfe0d5 2449 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
9ec5a93c
LP
2450
2451 from = TAKE_PTR(fifo);
9ec5a93c 2452
511a8cfe 2453 r = mount_nofollow_verbose(LOG_ERR, from, "/proc/kmsg", NULL, MS_BIND, NULL);
60e76d48
ZJS
2454 if (r < 0)
2455 return r;
e58a1277 2456
669fc4e5 2457 fd = open(from, O_RDWR|O_NONBLOCK|O_CLOEXEC);
4a62c710
MS
2458 if (fd < 0)
2459 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 2460
9ec5a93c 2461 /* Store away the fd in the socket, so that it stays open as long as we run the child */
3ee897d6 2462 r = send_one_fd(kmsg_socket, fd, 0);
d9603714
DH
2463 if (r < 0)
2464 return log_error_errno(r, "Failed to send FIFO fd: %m");
a258bf26 2465
25ea79fe 2466 return 0;
88213476
LP
2467}
2468
761cf19d 2469struct ExposeArgs {
deff68e7
FW
2470 union in_addr_union address4;
2471 union in_addr_union address6;
761cf19d
FW
2472 struct FirewallContext *fw_ctx;
2473};
2474
1c4baffc 2475static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
761cf19d 2476 struct ExposeArgs *args = userdata;
6d0b55c2
LP
2477
2478 assert(rtnl);
2479 assert(m);
761cf19d 2480 assert(args);
6d0b55c2 2481
deff68e7
FW
2482 expose_port_execute(rtnl, &args->fw_ctx, arg_expose_ports, AF_INET, &args->address4);
2483 expose_port_execute(rtnl, &args->fw_ctx, arg_expose_ports, AF_INET6, &args->address6);
6d0b55c2
LP
2484 return 0;
2485}
2486
3a74cea5 2487static int setup_hostname(void) {
c818eef1 2488 int r;
3a74cea5 2489
0c582db0 2490 if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
eb91eb18
LP
2491 return 0;
2492
c818eef1
LP
2493 r = sethostname_idempotent(arg_hostname ?: arg_machine);
2494 if (r < 0)
2495 return log_error_errno(r, "Failed to set hostname: %m");
3a74cea5 2496
7027ff61 2497 return 0;
3a74cea5
LP
2498}
2499
57fb9fb5 2500static int setup_journal(const char *directory) {
0f5e1382 2501 _cleanup_free_ char *d = NULL;
5905d7cf 2502 char id[SD_ID128_STRING_MAX];
b2238e38
LP
2503 const char *dirname, *p, *q;
2504 sd_id128_t this_id;
8054d749 2505 bool try;
57fb9fb5
LP
2506 int r;
2507
df9a75e4
LP
2508 /* Don't link journals in ephemeral mode */
2509 if (arg_ephemeral)
2510 return 0;
2511
8054d749
LP
2512 if (arg_link_journal == LINK_NO)
2513 return 0;
2514
2515 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
2516
4d680aee 2517 r = sd_id128_get_machine(&this_id);
f647962d
MS
2518 if (r < 0)
2519 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee 2520
e01ff70a 2521 if (sd_id128_equal(arg_uuid, this_id)) {
8054d749 2522 log_full(try ? LOG_WARNING : LOG_ERR,
e192a281 2523 "Host and machine ids are equal (%s): refusing to link journals", sd_id128_to_string(arg_uuid, id));
8054d749 2524 if (try)
4d680aee 2525 return 0;
df9a75e4 2526 return -EEXIST;
4d680aee
ZJS
2527 }
2528
369ca6da
ZJS
2529 FOREACH_STRING(dirname, "/var", "/var/log", "/var/log/journal") {
2530 r = userns_mkdir(directory, dirname, 0755, 0, 0);
2531 if (r < 0) {
2532 bool ignore = r == -EROFS && try;
2533 log_full_errno(ignore ? LOG_DEBUG : LOG_ERR, r,
2534 "Failed to create %s%s: %m", dirname, ignore ? ", ignoring" : "");
2535 return ignore ? 0 : r;
2536 }
2537 }
03cfe0d5 2538
e01ff70a
MS
2539 (void) sd_id128_to_string(arg_uuid, id);
2540
03cfe0d5
LP
2541 p = strjoina("/var/log/journal/", id);
2542 q = prefix_roota(directory, p);
27407a01 2543
e1873695 2544 if (path_is_mount_point(p, NULL, 0) > 0) {
8054d749
LP
2545 if (try)
2546 return 0;
27407a01 2547
baaa35ad
ZJS
2548 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2549 "%s: already a mount point, refusing to use for journal", p);
57fb9fb5
LP
2550 }
2551
e1873695 2552 if (path_is_mount_point(q, NULL, 0) > 0) {
8054d749
LP
2553 if (try)
2554 return 0;
57fb9fb5 2555
baaa35ad
ZJS
2556 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2557 "%s: already a mount point, refusing to use for journal", q);
57fb9fb5
LP
2558 }
2559
2560 r = readlink_and_make_absolute(p, &d);
2561 if (r >= 0) {
3742095b 2562 if (IN_SET(arg_link_journal, LINK_GUEST, LINK_AUTO) &&
57fb9fb5
LP
2563 path_equal(d, q)) {
2564
03cfe0d5 2565 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 2566 if (r < 0)
709f6e46 2567 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 2568 return 0;
57fb9fb5
LP
2569 }
2570
4a62c710
MS
2571 if (unlink(p) < 0)
2572 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
2573 } else if (r == -EINVAL) {
2574
2575 if (arg_link_journal == LINK_GUEST &&
2576 rmdir(p) < 0) {
2577
27407a01
ZJS
2578 if (errno == ENOTDIR) {
2579 log_error("%s already exists and is neither a symlink nor a directory", p);
2580 return r;
4314d33f
MS
2581 } else
2582 return log_error_errno(errno, "Failed to remove %s: %m", p);
57fb9fb5 2583 }
4314d33f
MS
2584 } else if (r != -ENOENT)
2585 return log_error_errno(r, "readlink(%s) failed: %m", p);
57fb9fb5
LP
2586
2587 if (arg_link_journal == LINK_GUEST) {
2588
2589 if (symlink(q, p) < 0) {
8054d749 2590 if (try) {
56f64d95 2591 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90 2592 return 0;
4314d33f
MS
2593 } else
2594 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
57fb9fb5
LP
2595 }
2596
03cfe0d5 2597 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 2598 if (r < 0)
709f6e46 2599 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 2600 return 0;
57fb9fb5
LP
2601 }
2602
2603 if (arg_link_journal == LINK_HOST) {
ccddd104 2604 /* don't create parents here — if the host doesn't have
574edc90 2605 * permanent journal set up, don't force it here */
ba8e6c4d 2606
dae8b82e
ZJS
2607 r = mkdir_errno_wrapper(p, 0755);
2608 if (r < 0 && r != -EEXIST) {
8054d749 2609 if (try) {
dae8b82e 2610 log_debug_errno(r, "Failed to create %s, skipping journal setup: %m", p);
574edc90 2611 return 0;
4314d33f 2612 } else
dae8b82e 2613 return log_error_errno(r, "Failed to create %s: %m", p);
57fb9fb5
LP
2614 }
2615
27407a01
ZJS
2616 } else if (access(p, F_OK) < 0)
2617 return 0;
57fb9fb5 2618
cdb2b9d0
LP
2619 if (dir_is_empty(q) == 0)
2620 log_warning("%s is not empty, proceeding anyway.", q);
2621
03cfe0d5 2622 r = userns_mkdir(directory, p, 0755, 0, 0);
709f6e46
MS
2623 if (r < 0)
2624 return log_error_errno(r, "Failed to create %s: %m", q);
57fb9fb5 2625
511a8cfe 2626 r = mount_nofollow_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL);
60e76d48 2627 if (r < 0)
4a62c710 2628 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 2629
27407a01 2630 return 0;
57fb9fb5
LP
2631}
2632
de40a303
LP
2633static int drop_capabilities(uid_t uid) {
2634 CapabilityQuintet q;
2635
2636 /* Let's initialize all five capability sets to something valid. If the quintet was configured via
2637 * OCI use that, but fill in missing bits. If it wasn't then derive the quintet in full from
2638 * arg_caps_retain. */
2639
2640 if (capability_quintet_is_set(&arg_full_capabilities)) {
2641 q = arg_full_capabilities;
2642
f5fbe71d 2643 if (q.bounding == UINT64_MAX)
de40a303
LP
2644 q.bounding = uid == 0 ? arg_caps_retain : 0;
2645
f5fbe71d 2646 if (q.effective == UINT64_MAX)
de40a303
LP
2647 q.effective = uid == 0 ? q.bounding : 0;
2648
f5fbe71d 2649 if (q.inheritable == UINT64_MAX)
88fc9c9b 2650 q.inheritable = uid == 0 ? q.bounding : arg_caps_ambient;
de40a303 2651
f5fbe71d 2652 if (q.permitted == UINT64_MAX)
88fc9c9b 2653 q.permitted = uid == 0 ? q.bounding : arg_caps_ambient;
de40a303 2654
f5fbe71d 2655 if (q.ambient == UINT64_MAX && ambient_capabilities_supported())
88fc9c9b 2656 q.ambient = arg_caps_ambient;
f66ad460
AZ
2657
2658 if (capability_quintet_mangle(&q))
2659 return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Cannot set capabilities that are not in the current bounding set.");
2660
2661 } else {
de40a303
LP
2662 q = (CapabilityQuintet) {
2663 .bounding = arg_caps_retain,
2664 .effective = uid == 0 ? arg_caps_retain : 0,
88fc9c9b
TH
2665 .inheritable = uid == 0 ? arg_caps_retain : arg_caps_ambient,
2666 .permitted = uid == 0 ? arg_caps_retain : arg_caps_ambient,
f5fbe71d 2667 .ambient = ambient_capabilities_supported() ? arg_caps_ambient : UINT64_MAX,
de40a303
LP
2668 };
2669
f66ad460
AZ
2670 /* If we're not using OCI, proceed with mangled capabilities (so we don't error out)
2671 * in order to maintain the same behavior as systemd < 242. */
2672 if (capability_quintet_mangle(&q))
0ccdaa79
JT
2673 log_full(arg_quiet ? LOG_DEBUG : LOG_WARNING,
2674 "Some capabilities will not be set because they are not in the current bounding set.");
f66ad460
AZ
2675
2676 }
2677
de40a303 2678 return capability_quintet_enforce(&q);
88213476
LP
2679}
2680
db999e0f
LP
2681static int reset_audit_loginuid(void) {
2682 _cleanup_free_ char *p = NULL;
2683 int r;
2684
0c582db0 2685 if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
db999e0f
LP
2686 return 0;
2687
2688 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 2689 if (r == -ENOENT)
db999e0f 2690 return 0;
f647962d
MS
2691 if (r < 0)
2692 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
2693
2694 /* Already reset? */
2695 if (streq(p, "4294967295"))
2696 return 0;
2697
57512c89 2698 r = write_string_file("/proc/self/loginuid", "4294967295", WRITE_STRING_FILE_DISABLE_BUFFER);
db999e0f 2699 if (r < 0) {
10a87006
LP
2700 log_error_errno(r,
2701 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2702 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2703 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2704 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2705 "using systemd-nspawn. Sleeping for 5s... (%m)");
77b6e194 2706
db999e0f 2707 sleep(5);
77b6e194 2708 }
db999e0f
LP
2709
2710 return 0;
77b6e194
LP
2711}
2712
785890ac
LP
2713static int setup_propagate(const char *root) {
2714 const char *p, *q;
709f6e46 2715 int r;
785890ac
LP
2716
2717 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2718 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
63c372cb 2719 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
2720 (void) mkdir_p(p, 0600);
2721
5a27b395 2722 r = userns_mkdir(root, "/run/host", 0755, 0, 0);
709f6e46 2723 if (r < 0)
5a27b395 2724 return log_error_errno(r, "Failed to create /run/host: %m");
03cfe0d5 2725
5a27b395 2726 r = userns_mkdir(root, "/run/host/incoming", 0600, 0, 0);
709f6e46 2727 if (r < 0)
5a27b395 2728 return log_error_errno(r, "Failed to create /run/host/incoming: %m");
03cfe0d5 2729
5a27b395 2730 q = prefix_roota(root, "/run/host/incoming");
511a8cfe 2731 r = mount_nofollow_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
60e76d48
ZJS
2732 if (r < 0)
2733 return r;
785890ac 2734
511a8cfe 2735 r = mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
60e76d48
ZJS
2736 if (r < 0)
2737 return r;
785890ac 2738
5a27b395 2739 /* machined will MS_MOVE into that directory, and that's only supported for non-shared mounts. */
511a8cfe 2740 return mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_SLAVE, NULL);
785890ac
LP
2741}
2742
317feb4d 2743static int setup_machine_id(const char *directory) {
691675ba
LP
2744 const char *etc_machine_id;
2745 sd_id128_t id;
3bbaff3e 2746 int r;
e01ff70a 2747
317feb4d
LP
2748 /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
2749 * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
2750 * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
2751 * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
2752 * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
2753 * container behaves nicely). */
2754
e01ff70a
MS
2755 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
2756
c5fbeedb 2757 r = id128_read(etc_machine_id, ID128_PLAIN_OR_UNINIT, &id);
317feb4d
LP
2758 if (r < 0) {
2759 if (!IN_SET(r, -ENOENT, -ENOMEDIUM)) /* If the file is missing or empty, we don't mind */
2760 return log_error_errno(r, "Failed to read machine ID from container image: %m");
691675ba 2761
317feb4d
LP
2762 if (sd_id128_is_null(arg_uuid)) {
2763 r = sd_id128_randomize(&arg_uuid);
2764 if (r < 0)
2765 return log_error_errno(r, "Failed to acquire randomized machine UUID: %m");
2766 }
2767 } else {
baaa35ad
ZJS
2768 if (sd_id128_is_null(id))
2769 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2770 "Machine ID in container image is zero, refusing.");
e01ff70a 2771
317feb4d
LP
2772 arg_uuid = id;
2773 }
691675ba 2774
e01ff70a
MS
2775 return 0;
2776}
2777
7336138e
LP
2778static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
2779 int r;
2780
2781 assert(directory);
2782
0de7acce 2783 if (arg_userns_mode == USER_NAMESPACE_NO || !arg_userns_chown)
7336138e
LP
2784 return 0;
2785
2786 r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
2787 if (r == -EOPNOTSUPP)
2788 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
2789 if (r == -EBADE)
2790 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
2791 if (r < 0)
2792 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
2793 if (r == 0)
2794 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
2795 else
2796 log_debug("Patched directory tree to match UID/GID range.");
2797
2798 return r;
2799}
2800
113cea80 2801/*
6d416b9c
LS
2802 * Return values:
2803 * < 0 : wait_for_terminate() failed to get the state of the
2804 * container, the container was terminated by a signal, or
2805 * failed for an unknown reason. No change is made to the
2806 * container argument.
2807 * > 0 : The program executed in the container terminated with an
2808 * error. The exit code of the program executed in the
919699ec
LP
2809 * container is returned. The container argument has been set
2810 * to CONTAINER_TERMINATED.
6d416b9c
LS
2811 * 0 : The container is being rebooted, has been shut down or exited
2812 * successfully. The container argument has been set to either
2813 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 2814 *
6d416b9c
LS
2815 * That is, success is indicated by a return value of zero, and an
2816 * error is indicated by a non-zero value.
113cea80
DH
2817 */
2818static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 2819 siginfo_t status;
919699ec 2820 int r;
113cea80
DH
2821
2822 r = wait_for_terminate(pid, &status);
f647962d
MS
2823 if (r < 0)
2824 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
2825
2826 switch (status.si_code) {
fddbb89c 2827
113cea80 2828 case CLD_EXITED:
b5a2179b 2829 if (status.si_status == 0)
919699ec 2830 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
b5a2179b 2831 else
919699ec 2832 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 2833
919699ec
LP
2834 *container = CONTAINER_TERMINATED;
2835 return status.si_status;
113cea80
DH
2836
2837 case CLD_KILLED:
2838 if (status.si_status == SIGINT) {
919699ec 2839 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 2840 *container = CONTAINER_TERMINATED;
919699ec
LP
2841 return 0;
2842
113cea80 2843 } else if (status.si_status == SIGHUP) {
919699ec 2844 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 2845 *container = CONTAINER_REBOOTED;
919699ec 2846 return 0;
113cea80 2847 }
919699ec 2848
4831981d 2849 _fallthrough_;
113cea80 2850 case CLD_DUMPED:
baaa35ad
ZJS
2851 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2852 "Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
113cea80
DH
2853
2854 default:
baaa35ad
ZJS
2855 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2856 "Container %s failed due to unknown reason.", arg_machine);
113cea80 2857 }
113cea80
DH
2858}
2859
023fb90b
LP
2860static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2861 pid_t pid;
2862
4a0b58c4 2863 pid = PTR_TO_PID(userdata);
023fb90b 2864 if (pid > 0) {
c6c8f6e2 2865 if (kill(pid, arg_kill_signal) >= 0) {
023fb90b
LP
2866 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2867 sd_event_source_set_userdata(s, NULL);
2868 return 0;
2869 }
2870 }
2871
2872 sd_event_exit(sd_event_source_get_event(s), 0);
2873 return 0;
2874}
2875
6916b164 2876static int on_sigchld(sd_event_source *s, const struct signalfd_siginfo *ssi, void *userdata) {
abdb9b08
LP
2877 pid_t pid;
2878
2879 assert(s);
2880 assert(ssi);
2881
2882 pid = PTR_TO_PID(userdata);
2883
6916b164
AU
2884 for (;;) {
2885 siginfo_t si = {};
abdb9b08 2886
6916b164
AU
2887 if (waitid(P_ALL, 0, &si, WNOHANG|WNOWAIT|WEXITED) < 0)
2888 return log_error_errno(errno, "Failed to waitid(): %m");
2889 if (si.si_pid == 0) /* No pending children. */
2890 break;
abdb9b08 2891 if (si.si_pid == pid) {
6916b164
AU
2892 /* The main process we care for has exited. Return from
2893 * signal handler but leave the zombie. */
2894 sd_event_exit(sd_event_source_get_event(s), 0);
2895 break;
2896 }
abdb9b08 2897
6916b164
AU
2898 /* Reap all other children. */
2899 (void) waitid(P_PID, si.si_pid, &si, WNOHANG|WEXITED);
2900 }
2901
2902 return 0;
2903}
2904
abdb9b08
LP
2905static int on_request_stop(sd_bus_message *m, void *userdata, sd_bus_error *error) {
2906 pid_t pid;
2907
2908 assert(m);
2909
2910 pid = PTR_TO_PID(userdata);
2911
2912 if (arg_kill_signal > 0) {
2913 log_info("Container termination requested. Attempting to halt container.");
2914 (void) kill(pid, arg_kill_signal);
2915 } else {
2916 log_info("Container termination requested. Exiting.");
2917 sd_event_exit(sd_bus_get_event(sd_bus_message_get_bus(m)), 0);
2918 }
2919
2920 return 0;
2921}
2922
ec16945e 2923static int determine_names(void) {
1b9cebf6 2924 int r;
ec16945e 2925
c1521918
LP
2926 if (arg_template && !arg_directory && arg_machine) {
2927
2928 /* If --template= was specified then we should not
2929 * search for a machine, but instead create a new one
2930 * in /var/lib/machine. */
2931
657ee2d8 2932 arg_directory = path_join("/var/lib/machines", arg_machine);
c1521918
LP
2933 if (!arg_directory)
2934 return log_oom();
2935 }
2936
ec16945e 2937 if (!arg_image && !arg_directory) {
1b9cebf6
LP
2938 if (arg_machine) {
2939 _cleanup_(image_unrefp) Image *i = NULL;
2940
d577d4a4 2941 r = image_find(IMAGE_MACHINE, arg_machine, NULL, &i);
3a6ce860
LP
2942 if (r == -ENOENT)
2943 return log_error_errno(r, "No image for machine '%s'.", arg_machine);
1b9cebf6
LP
2944 if (r < 0)
2945 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
1b9cebf6 2946
eb38edce 2947 if (IN_SET(i->type, IMAGE_RAW, IMAGE_BLOCK))
0f03c2a4 2948 r = free_and_strdup(&arg_image, i->path);
1b9cebf6 2949 else
0f03c2a4 2950 r = free_and_strdup(&arg_directory, i->path);
1b9cebf6 2951 if (r < 0)
0f3be6ca 2952 return log_oom();
1b9cebf6 2953
aee327b8
LP
2954 if (!arg_ephemeral)
2955 arg_read_only = arg_read_only || i->read_only;
d7249575
LP
2956 } else {
2957 r = safe_getcwd(&arg_directory);
2958 if (r < 0)
2959 return log_error_errno(r, "Failed to determine current directory: %m");
2960 }
ec16945e 2961
c6147113
LP
2962 if (!arg_directory && !arg_image)
2963 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine path, please use -D or -i.");
ec16945e
LP
2964 }
2965
2966 if (!arg_machine) {
b9ba4dab
LP
2967 if (arg_directory && path_equal(arg_directory, "/"))
2968 arg_machine = gethostname_malloc();
4827ab48
LP
2969 else {
2970 if (arg_image) {
2971 char *e;
2972
2973 arg_machine = strdup(basename(arg_image));
2974
2975 /* Truncate suffix if there is one */
2976 e = endswith(arg_machine, ".raw");
2977 if (e)
2978 *e = 0;
2979 } else
2980 arg_machine = strdup(basename(arg_directory));
2981 }
ec16945e
LP
2982 if (!arg_machine)
2983 return log_oom();
2984
ae691c1d 2985 hostname_cleanup(arg_machine);
52ef5dd7 2986 if (!hostname_is_valid(arg_machine, 0))
c6147113 2987 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine machine name automatically, please use -M.");
b9ba4dab
LP
2988
2989 if (arg_ephemeral) {
2990 char *b;
2991
2992 /* Add a random suffix when this is an
2993 * ephemeral machine, so that we can run many
2994 * instances at once without manually having
2995 * to specify -M each time. */
2996
2997 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2998 return log_oom();
2999
3000 free(arg_machine);
3001 arg_machine = b;
3002 }
ec16945e
LP
3003 }
3004
3005 return 0;
3006}
3007
/* Resolves the path stored in *p via chase_symlinks() (passing 'flags' through) and replaces *p
 * with the result. A NULL *p is left alone. Returns a negative errno-style value if resolving
 * fails, otherwise the result of free_and_replace(). */
static int chase_symlinks_and_update(char **p, unsigned flags) {
        char *resolved;
        int r;

        assert(p);

        if (*p == NULL)
                return 0;

        r = chase_symlinks(*p, NULL, flags, &resolved, NULL);
        if (r < 0)
                return log_error_errno(r, "Failed to resolve path %s: %m", *p);

        return free_and_replace(*p, resolved);
}
3023
03cfe0d5 3024static int determine_uid_shift(const char *directory) {
6dac160c
LP
3025 int r;
3026
0de7acce 3027 if (arg_userns_mode == USER_NAMESPACE_NO) {
03cfe0d5 3028 arg_uid_shift = 0;
6dac160c 3029 return 0;
03cfe0d5 3030 }
6dac160c
LP
3031
3032 if (arg_uid_shift == UID_INVALID) {
3033 struct stat st;
3034
03cfe0d5 3035 r = stat(directory, &st);
6dac160c 3036 if (r < 0)
03cfe0d5 3037 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
6dac160c
LP
3038
3039 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
3040
baaa35ad
ZJS
3041 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000)))
3042 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
3043 "UID and GID base of %s don't match.", directory);
6dac160c
LP
3044
3045 arg_uid_range = UINT32_C(0x10000);
3046 }
3047
f5fbe71d 3048 if (arg_uid_shift > UID_INVALID - arg_uid_range)
baaa35ad
ZJS
3049 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
3050 "UID base too high for UID range.");
6dac160c 3051
6dac160c
LP
3052 return 0;
3053}
3054
de40a303
LP
3055static unsigned long effective_clone_ns_flags(void) {
3056 unsigned long flags = arg_clone_ns_flags;
3057
3058 if (arg_private_network)
3059 flags |= CLONE_NEWNET;
3060 if (arg_use_cgns)
3061 flags |= CLONE_NEWCGROUP;
3062 if (arg_userns_mode != USER_NAMESPACE_NO)
3063 flags |= CLONE_NEWUSER;
3064
3065 return flags;
3066}
3067
3068static int patch_sysctl(void) {
3069
3070 /* This table is inspired by runc's sysctl() function */
3071 static const struct {
3072 const char *key;
3073 bool prefix;
3074 unsigned long clone_flags;
3075 } safe_sysctl[] = {
3076 { "kernel.hostname", false, CLONE_NEWUTS },
3077 { "kernel.domainname", false, CLONE_NEWUTS },
3078 { "kernel.msgmax", false, CLONE_NEWIPC },
3079 { "kernel.msgmnb", false, CLONE_NEWIPC },
3080 { "kernel.msgmni", false, CLONE_NEWIPC },
3081 { "kernel.sem", false, CLONE_NEWIPC },
3082 { "kernel.shmall", false, CLONE_NEWIPC },
3083 { "kernel.shmmax", false, CLONE_NEWIPC },
3084 { "kernel.shmmni", false, CLONE_NEWIPC },
3085 { "fs.mqueue.", true, CLONE_NEWIPC },
3086 { "net.", true, CLONE_NEWNET },
3087 };
3088
3089 unsigned long flags;
3090 char **k, **v;
3091 int r;
3092
3093 flags = effective_clone_ns_flags();
3094
3095 STRV_FOREACH_PAIR(k, v, arg_sysctl) {
3096 bool good = false;
3097 size_t i;
3098
3099 for (i = 0; i < ELEMENTSOF(safe_sysctl); i++) {
3100
3101 if (!FLAGS_SET(flags, safe_sysctl[i].clone_flags))
3102 continue;
3103
3104 if (safe_sysctl[i].prefix)
3105 good = startswith(*k, safe_sysctl[i].key);
3106 else
3107 good = streq(*k, safe_sysctl[i].key);
3108
3109 if (good)
3110 break;
3111 }
3112
c6147113
LP
3113 if (!good)
3114 return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Refusing to write to sysctl '%s', as it is not safe in the selected namespaces.", *k);
de40a303
LP
3115
3116 r = sysctl_write(*k, *v);
3117 if (r < 0)
3118 return log_error_errno(r, "Failed to write sysctl '%s': %m", *k);
3119 }
3120
3121 return 0;
3122}
3123
03cfe0d5
LP
3124static int inner_child(
3125 Barrier *barrier,
3126 const char *directory,
3127 bool secondary,
3128 int kmsg_socket,
3129 int rtnl_socket,
3acc84eb 3130 int master_pty_socket,
e1bb4b0d
LB
3131 FDSet *fds,
3132 char **os_release_pairs) {
69c79d3c 3133
03cfe0d5 3134 _cleanup_free_ char *home = NULL;
b5ea030d 3135 char as_uuid[ID128_UUID_STRING_MAX];
88614c8a 3136 size_t n_env = 1;
03cfe0d5 3137 const char *envp[] = {
0c300adf 3138 "PATH=" DEFAULT_PATH_COMPAT,
6aadfa4c 3139 NULL, /* container */
03cfe0d5
LP
3140 NULL, /* TERM */
3141 NULL, /* HOME */
3142 NULL, /* USER */
3143 NULL, /* LOGNAME */
3144 NULL, /* container_uuid */
3145 NULL, /* LISTEN_FDS */
3146 NULL, /* LISTEN_PID */
9c1e04d0 3147 NULL, /* NOTIFY_SOCKET */
3652872a 3148 NULL, /* CREDENTIALS_DIRECTORY */
03cfe0d5
LP
3149 NULL
3150 };
1a68e1e5 3151 const char *exec_target;
2371271c 3152 _cleanup_strv_free_ char **env_use = NULL;
de40a303 3153 int r, which_failed;
88213476 3154
b37469d7
LP
3155 /* This is the "inner" child process, i.e. the one forked off by the "outer" child process, which is the one
3156 * the container manager itself forked off. At the time of clone() it gained its own CLONE_NEWNS, CLONE_NEWPID,
3157 * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER namespaces. Note that it has its own CLONE_NEWNS namespace,
3158 * separate from the CLONE_NEWNS created for the "outer" child, and also separate from the host's CLONE_NEWNS
3159 * namespace. The reason for having two levels of CLONE_NEWNS namespaces is that the "inner" one is owned by
3160 * the CLONE_NEWUSER namespace of the container, while the "outer" one is owned by the host's CLONE_NEWUSER
3161 * namespace.
3162 *
3163 * Note at this point we have no CLONE_NEWNET namespace yet. We'll acquire that one later through
3164 * unshare(). See below. */
3165
03cfe0d5
LP
3166 assert(barrier);
3167 assert(directory);
3168 assert(kmsg_socket >= 0);
88213476 3169
de40a303
LP
3170 log_debug("Inner child is initializing.");
3171
0de7acce 3172 if (arg_userns_mode != USER_NAMESPACE_NO) {
03cfe0d5
LP
3173 /* Tell the parent, that it now can write the UID map. */
3174 (void) barrier_place(barrier); /* #1 */
7027ff61 3175
03cfe0d5 3176 /* Wait until the parent wrote the UID map */
baaa35ad 3177 if (!barrier_place_and_sync(barrier)) /* #2 */
2a2e78e9 3178 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Parent died too early");
88213476 3179
2a2e78e9
LP
3180 /* Become the new root user inside our namespace */
3181 r = reset_uid_gid();
3182 if (r < 0)
3183 return log_error_errno(r, "Couldn't become new root: %m");
3184
3185 /* Creating a new user namespace means all MS_SHARED mounts become MS_SLAVE. Let's put them
3186 * back to MS_SHARED here, since that's what we want as defaults. (This will not reconnect
3187 * propagation, but simply create new peer groups for all our mounts). */
511a8cfe 3188 r = mount_follow_verbose(LOG_ERR, NULL, "/", NULL, MS_SHARED|MS_REC, NULL);
2a2e78e9
LP
3189 if (r < 0)
3190 return r;
3191 }
6d66bd3b 3192
0de7acce 3193 r = mount_all(NULL,
4f086aab 3194 arg_mount_settings | MOUNT_IN_USERNS,
0de7acce 3195 arg_uid_shift,
0de7acce 3196 arg_selinux_apifs_context);
03cfe0d5
LP
3197 if (r < 0)
3198 return r;
3199
04413780
ZJS
3200 if (!arg_network_namespace_path && arg_private_network) {
3201 r = unshare(CLONE_NEWNET);
3202 if (r < 0)
3203 return log_error_errno(errno, "Failed to unshare network namespace: %m");
75116558
PS
3204
3205 /* Tell the parent that it can setup network interfaces. */
3206 (void) barrier_place(barrier); /* #3 */
04413780
ZJS
3207 }
3208
4f086aab 3209 r = mount_sysfs(NULL, arg_mount_settings);
d8fc6a00
LP
3210 if (r < 0)
3211 return r;
3212
03cfe0d5
LP
3213 /* Wait until we are cgroup-ified, so that we
3214 * can mount the right cgroup path writable */
baaa35ad
ZJS
3215 if (!barrier_place_and_sync(barrier)) /* #4 */
3216 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
3217 "Parent died too early");
88213476 3218
489fae52 3219 if (arg_use_cgns) {
0996ef00
CB
3220 r = unshare(CLONE_NEWCGROUP);
3221 if (r < 0)
04413780 3222 return log_error_errno(errno, "Failed to unshare cgroup namespace: %m");
0996ef00
CB
3223 r = mount_cgroups(
3224 "",
3225 arg_unified_cgroup_hierarchy,
3226 arg_userns_mode != USER_NAMESPACE_NO,
3227 arg_uid_shift,
3228 arg_uid_range,
5a8ff0e6 3229 arg_selinux_apifs_context,
ada54120 3230 true);
1433e0f2 3231 } else
0996ef00 3232 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
1433e0f2
LP
3233 if (r < 0)
3234 return r;
ec16945e 3235
1e4f1671 3236 r = setup_boot_id();
03cfe0d5
LP
3237 if (r < 0)
3238 return r;
ec16945e 3239
1e4f1671 3240 r = setup_kmsg(kmsg_socket);
03cfe0d5
LP
3241 if (r < 0)
3242 return r;
3243 kmsg_socket = safe_close(kmsg_socket);
ec16945e 3244
de40a303
LP
3245 r = mount_custom(
3246 "/",
3247 arg_custom_mounts,
3248 arg_n_custom_mounts,
de40a303
LP
3249 0,
3250 arg_selinux_apifs_context,
5f0a6347 3251 MOUNT_NON_ROOT_ONLY | MOUNT_IN_USERNS);
de40a303
LP
3252 if (r < 0)
3253 return r;
3254
03cfe0d5
LP
3255 if (setsid() < 0)
3256 return log_error_errno(errno, "setsid() failed: %m");
3257
3258 if (arg_private_network)
df883de9 3259 (void) loopback_setup();
03cfe0d5 3260
7a8f6325
LP
3261 if (arg_expose_ports) {
3262 r = expose_port_send_rtnl(rtnl_socket);
3263 if (r < 0)
3264 return r;
3265 rtnl_socket = safe_close(rtnl_socket);
3266 }
03cfe0d5 3267
3acc84eb 3268 if (arg_console_mode != CONSOLE_PIPE) {
cd132992 3269 _cleanup_close_ int master = -1;
3acc84eb
FB
3270 _cleanup_free_ char *console = NULL;
3271
3272 /* Allocate a pty and make it available as /dev/console. */
dc98caea 3273 master = openpt_allocate(O_RDWR|O_NONBLOCK, &console);
3acc84eb 3274 if (master < 0)
dc98caea 3275 return log_error_errno(master, "Failed to allocate a pty: %m");
3acc84eb
FB
3276
3277 r = setup_dev_console(console);
3278 if (r < 0)
105a1a36 3279 return log_error_errno(r, "Failed to set up /dev/console: %m");
3acc84eb
FB
3280
3281 r = send_one_fd(master_pty_socket, master, 0);
3282 if (r < 0)
3283 return log_error_errno(r, "Failed to send master fd: %m");
3284 master_pty_socket = safe_close(master_pty_socket);
3285
3286 r = setup_stdio_as_dev_console();
3287 if (r < 0)
3288 return r;
3289 }
3290
de40a303
LP
3291 r = patch_sysctl();
3292 if (r < 0)
3293 return r;
3294
81f345df
LP
3295 if (arg_oom_score_adjust_set) {
3296 r = set_oom_score_adjust(arg_oom_score_adjust);
3297 if (r < 0)
3298 return log_error_errno(r, "Failed to adjust OOM score: %m");
3299 }
3300
0985c7c4
ZJS
3301 if (arg_cpu_set.set)
3302 if (sched_setaffinity(0, arg_cpu_set.allocated, arg_cpu_set.set) < 0)
d107bb7d
LP
3303 return log_error_errno(errno, "Failed to set CPU affinity: %m");
3304
c818eef1 3305 (void) setup_hostname();
03cfe0d5 3306
050f7277 3307 if (arg_personality != PERSONALITY_INVALID) {
21022b9d
LP
3308 r = safe_personality(arg_personality);
3309 if (r < 0)
3310 return log_error_errno(r, "personality() failed: %m");
03cfe0d5 3311 } else if (secondary) {
21022b9d
LP
3312 r = safe_personality(PER_LINUX32);
3313 if (r < 0)
3314 return log_error_errno(r, "personality() failed: %m");
03cfe0d5
LP
3315 }
3316
de40a303
LP
3317 r = setrlimit_closest_all((const struct rlimit *const*) arg_rlimit, &which_failed);
3318 if (r < 0)
3319 return log_error_errno(r, "Failed to apply resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
3320
3321#if HAVE_SECCOMP
3322 if (arg_seccomp) {
3323
3324 if (is_seccomp_available()) {
3325
3326 r = seccomp_load(arg_seccomp);
7bc5e0b1 3327 if (ERRNO_IS_SECCOMP_FATAL(r))
de40a303
LP
3328 return log_error_errno(r, "Failed to install seccomp filter: %m");
3329 if (r < 0)
3330 log_debug_errno(r, "Failed to install seccomp filter: %m");
3331 }
3332 } else
3333#endif
3334 {
6b000af4 3335 r = setup_seccomp(arg_caps_retain, arg_syscall_allow_list, arg_syscall_deny_list);
de40a303
LP
3336 if (r < 0)
3337 return r;
3338 }
3339
349cc4a5 3340#if HAVE_SELINUX
03cfe0d5 3341 if (arg_selinux_context)
2ed96880 3342 if (setexeccon(arg_selinux_context) < 0)
03cfe0d5
LP
3343 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
3344#endif
3345
de40a303
LP
3346 /* Make sure we keep the caps across the uid/gid dropping, so that we can retain some selected caps
3347 * if we need to later on. */
3348 if (prctl(PR_SET_KEEPCAPS, 1) < 0)
3349 return log_error_errno(errno, "Failed to set PR_SET_KEEPCAPS: %m");
3350
3351 if (uid_is_valid(arg_uid) || gid_is_valid(arg_gid))
3462d773 3352 r = change_uid_gid_raw(arg_uid, arg_gid, arg_supplementary_gids, arg_n_supplementary_gids, arg_console_mode != CONSOLE_PIPE);
de40a303 3353 else
3462d773 3354 r = change_uid_gid(arg_user, arg_console_mode != CONSOLE_PIPE, &home);
03cfe0d5
LP
3355 if (r < 0)
3356 return r;
3357
de40a303
LP
3358 r = drop_capabilities(getuid());
3359 if (r < 0)
3360 return log_error_errno(r, "Dropping capabilities failed: %m");
3361
66edd963
LP
3362 if (arg_no_new_privileges)
3363 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
3364 return log_error_errno(errno, "Failed to disable new privileges: %m");
3365
6aadfa4c
ILG
3366 /* LXC sets container=lxc, so follow the scheme here */
3367 envp[n_env++] = strjoina("container=", arg_container_service_name);
3368
03cfe0d5
LP
3369 envp[n_env] = strv_find_prefix(environ, "TERM=");
3370 if (envp[n_env])
313cefa1 3371 n_env++;
03cfe0d5 3372
de40a303
LP
3373 if (home || !uid_is_valid(arg_uid) || arg_uid == 0)
3374 if (asprintf((char**)(envp + n_env++), "HOME=%s", home ?: "/root") < 0)
3375 return log_oom();
3376
3377 if (arg_user || !uid_is_valid(arg_uid) || arg_uid == 0)
3378 if (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ?: "root") < 0 ||
3379 asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)
3380 return log_oom();
03cfe0d5 3381
3bbaff3e 3382 assert(!sd_id128_is_null(arg_uuid));
03cfe0d5 3383
691675ba 3384 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_to_uuid_string(arg_uuid, as_uuid)) < 0)
e01ff70a 3385 return log_oom();
03cfe0d5
LP
3386
3387 if (fdset_size(fds) > 0) {
3388 r = fdset_cloexec(fds, false);
3389 if (r < 0)
3390 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
3391
3392 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
3393 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
3394 return log_oom();
3395 }
9c1e04d0
AP
3396 if (asprintf((char **)(envp + n_env++), "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
3397 return log_oom();
03cfe0d5 3398
3652872a
LP
3399 if (arg_n_credentials > 0) {
3400 envp[n_env] = strdup("CREDENTIALS_DIRECTORY=/run/host/credentials");
3401 if (!envp[n_env])
3402 return log_oom();
3403 n_env++;
3404 }
3405
ed4512d0 3406 env_use = strv_env_merge(3, envp, os_release_pairs, arg_setenv);
2371271c
TG
3407 if (!env_use)
3408 return log_oom();
03cfe0d5
LP
3409
3410 /* Let the parent know that we are ready and
3411 * wait until the parent is ready with the
3412 * setup, too... */
baaa35ad 3413 if (!barrier_place_and_sync(barrier)) /* #5 */
335d2ead 3414 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Parent died too early");
03cfe0d5 3415
5f932eb9
LP
3416 if (arg_chdir)
3417 if (chdir(arg_chdir) < 0)
3418 return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
3419
7732f92b 3420 if (arg_start_mode == START_PID2) {
75bf701f 3421 r = stub_pid1(arg_uuid);
7732f92b
LP
3422 if (r < 0)
3423 return r;
3424 }
3425
335d2ead
LP
3426 if (arg_console_mode != CONSOLE_PIPE) {
3427 /* So far our pty wasn't controlled by any process. Finally, it's time to change that, if we
3428 * are configured for that. Acquire it as controlling tty. */
3429 if (ioctl(STDIN_FILENO, TIOCSCTTY) < 0)
3430 return log_error_errno(errno, "Failed to acquire controlling TTY: %m");
3431 }
3432
de40a303
LP
3433 log_debug("Inner child completed, invoking payload.");
3434
8ca082b4
LP
3435 /* Now, explicitly close the log, so that we then can close all remaining fds. Closing the log explicitly first
3436 * has the benefit that the logging subsystem knows about it, and is thus ready to be reopened should we need
3437 * it again. Note that the other fds closed here are at least the locking and barrier fds. */
03cfe0d5 3438 log_close();
8ca082b4
LP
3439 log_set_open_when_needed(true);
3440
03cfe0d5
LP
3441 (void) fdset_close_others(fds);
3442
7732f92b 3443 if (arg_start_mode == START_BOOT) {
03cfe0d5
LP
3444 char **a;
3445 size_t m;
3446
3447 /* Automatically search for the init system */
3448
75f32f04
ZJS
3449 m = strv_length(arg_parameters);
3450 a = newa(char*, m + 2);
3451 memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
3452 a[1 + m] = NULL;
03cfe0d5 3453
ced58da7 3454 a[0] = (char*) "/usr/lib/systemd/systemd";
03cfe0d5
LP
3455 execve(a[0], a, env_use);
3456
ced58da7 3457 a[0] = (char*) "/lib/systemd/systemd";
03cfe0d5
LP
3458 execve(a[0], a, env_use);
3459
ced58da7 3460 a[0] = (char*) "/sbin/init";
03cfe0d5 3461 execve(a[0], a, env_use);
ced58da7
LP
3462
3463 exec_target = "/usr/lib/systemd/systemd, /lib/systemd/systemd, /sbin/init";
1a68e1e5 3464 } else if (!strv_isempty(arg_parameters)) {
b6b180b7
LP
3465 const char *dollar_path;
3466
1a68e1e5 3467 exec_target = arg_parameters[0];
b6b180b7
LP
3468
3469 /* Use the user supplied search $PATH if there is one, or DEFAULT_PATH_COMPAT if not to search the
3470 * binary. */
3471 dollar_path = strv_env_get(env_use, "PATH");
3472 if (dollar_path) {
6f646e01 3473 if (setenv("PATH", dollar_path, 1) < 0)
b6b180b7
LP
3474 return log_error_errno(errno, "Failed to update $PATH: %m");
3475 }
3476
f757855e 3477 execvpe(arg_parameters[0], arg_parameters, env_use);
1a68e1e5 3478 } else {
5f932eb9 3479 if (!arg_chdir)
d929b0f9
ZJS
3480 /* If we cannot change the directory, we'll end up in /, that is expected. */
3481 (void) chdir(home ?: "/root");
5f932eb9 3482
03cfe0d5
LP
3483 execle("/bin/bash", "-bash", NULL, env_use);
3484 execle("/bin/sh", "-sh", NULL, env_use);
ced58da7
LP
3485
3486 exec_target = "/bin/bash, /bin/sh";
03cfe0d5
LP
3487 }
3488
8ca082b4 3489 return log_error_errno(errno, "execv(%s) failed: %m", exec_target);
03cfe0d5
LP
3490}
3491
e96ceaba 3492static int setup_notify_child(void) {
271f518f 3493 _cleanup_close_ int fd = -1;
9c1e04d0 3494 union sockaddr_union sa = {
44ed5214
LP
3495 .un.sun_family = AF_UNIX,
3496 .un.sun_path = NSPAWN_NOTIFY_SOCKET_PATH,
9c1e04d0
AP
3497 };
3498 int r;
3499
3500 fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
3501 if (fd < 0)
3502 return log_error_errno(errno, "Failed to allocate notification socket: %m");
3503
3504 (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
fbda85b0 3505 (void) sockaddr_un_unlink(&sa.un);
9c1e04d0 3506
9c1e04d0 3507 r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
271f518f 3508 if (r < 0)
44ed5214 3509 return log_error_errno(errno, "bind(" NSPAWN_NOTIFY_SOCKET_PATH ") failed: %m");
9c1e04d0 3510
adc7d9f0 3511 r = userns_lchown(NSPAWN_NOTIFY_SOCKET_PATH, 0, 0);
271f518f 3512 if (r < 0)
adc7d9f0 3513 return log_error_errno(r, "Failed to chown " NSPAWN_NOTIFY_SOCKET_PATH ": %m");
adc7d9f0 3514
2ff48e98 3515 r = setsockopt_int(fd, SOL_SOCKET, SO_PASSCRED, true);
271f518f 3516 if (r < 0)
2ff48e98 3517 return log_error_errno(r, "SO_PASSCRED failed: %m");
9c1e04d0 3518
271f518f 3519 return TAKE_FD(fd);
9c1e04d0
AP
3520}
3521
03cfe0d5
LP
3522static int outer_child(
3523 Barrier *barrier,
3524 const char *directory,
2d845785 3525 DissectedImage *dissected_image,
03cfe0d5
LP
3526 bool secondary,
3527 int pid_socket,
e01ff70a 3528 int uuid_socket,
9c1e04d0 3529 int notify_socket,
03cfe0d5
LP
3530 int kmsg_socket,
3531 int rtnl_socket,
825d5287 3532 int uid_shift_socket,
3acc84eb 3533 int master_pty_socket,
8199d554 3534 int unified_cgroup_hierarchy_socket,
d7bea6b6
DP
3535 FDSet *fds,
3536 int netns_fd) {
03cfe0d5 3537
e1bb4b0d 3538 _cleanup_strv_free_ char **os_release_pairs = NULL;
bf428efb 3539 _cleanup_close_ int fd = -1;
e5f10caf 3540 const char *p;
03cfe0d5
LP
3541 pid_t pid;
3542 ssize_t l;
de40a303 3543 int r;
03cfe0d5 3544
b37469d7
LP
3545 /* This is the "outer" child process, i.e the one forked off by the container manager itself. It already has
3546 * its own CLONE_NEWNS namespace (which was created by the clone()). It still lives in the host's CLONE_NEWPID,
3547 * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER and CLONE_NEWNET namespaces. After it completed a number of
3548 * initializations a second child (the "inner" one) is forked off it, and it exits. */
3549
03cfe0d5
LP
3550 assert(barrier);
3551 assert(directory);
03cfe0d5 3552 assert(pid_socket >= 0);
e01ff70a 3553 assert(uuid_socket >= 0);
9c1e04d0 3554 assert(notify_socket >= 0);
3acc84eb 3555 assert(master_pty_socket >= 0);
03cfe0d5
LP
3556 assert(kmsg_socket >= 0);
3557
de40a303
LP
3558 log_debug("Outer child is initializing.");
3559
e1bb4b0d
LB
3560 r = load_os_release_pairs_with_prefix("/", "container_host_", &os_release_pairs);
3561 if (r < 0)
3562 log_debug_errno(r, "Failed to read os-release from host for container, ignoring: %m");
3563
03cfe0d5
LP
3564 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
3565 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3566
03cfe0d5
LP
3567 r = reset_audit_loginuid();
3568 if (r < 0)
3569 return r;
3570
2a2e78e9
LP
3571 /* Mark everything as slave, so that we still receive mounts from the real root, but don't propagate
3572 * mounts to the real root. */
511a8cfe 3573 r = mount_follow_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
60e76d48
ZJS
3574 if (r < 0)
3575 return r;
03cfe0d5 3576
2d845785 3577 if (dissected_image) {
2d3a5a73
LP
3578 /* If we are operating on a disk image, then mount its root directory now, but leave out the rest. We
3579 * can read the UID shift from it if we need to. Further down we'll mount the rest, but then with the
3580 * uid shift known. That way we can mount VFAT file systems shifted to the right place right away. This
3581 * makes sure ESP partitions and userns are compatible. */
3582
af187ab2 3583 r = dissected_image_mount_and_warn(
d04faa4e
LP
3584 dissected_image,
3585 directory,
3586 arg_uid_shift,
3587 DISSECT_IMAGE_MOUNT_ROOT_ONLY|
3588 DISSECT_IMAGE_DISCARD_ON_LOOP|
3589 DISSECT_IMAGE_USR_NO_ROOT|
af187ab2
LP
3590 (arg_read_only ? DISSECT_IMAGE_READ_ONLY : DISSECT_IMAGE_FSCK)|
3591 (arg_start_mode == START_BOOT ? DISSECT_IMAGE_VALIDATE_OS : 0));
2d845785 3592 if (r < 0)
af187ab2 3593 return r;
2d845785 3594 }
03cfe0d5 3595
391567f4
LP
3596 r = determine_uid_shift(directory);
3597 if (r < 0)
3598 return r;
3599
0de7acce 3600 if (arg_userns_mode != USER_NAMESPACE_NO) {
0e7ac751 3601 /* Let the parent know which UID shift we read from the image */
825d5287
RM
3602 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
3603 if (l < 0)
3604 return log_error_errno(errno, "Failed to send UID shift: %m");
baaa35ad
ZJS
3605 if (l != sizeof(arg_uid_shift))
3606 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3607 "Short write while sending UID shift.");
0e7ac751 3608
0de7acce 3609 if (arg_userns_mode == USER_NAMESPACE_PICK) {
0e7ac751
LP
3610 /* When we are supposed to pick the UID shift, the parent will check now whether the UID shift
3611 * we just read from the image is available. If yes, it will send the UID shift back to us, if
3612 * not it will pick a different one, and send it back to us. */
3613
3614 l = recv(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
3615 if (l < 0)
3616 return log_error_errno(errno, "Failed to recv UID shift: %m");
baaa35ad
ZJS
3617 if (l != sizeof(arg_uid_shift))
3618 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3619 "Short read while receiving UID shift.");
0e7ac751
LP
3620 }
3621
ff6c6cc1
LP
3622 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
3623 "Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
825d5287
RM
3624 }
3625
6f83d3d1
LP
3626 if (path_equal(directory, "/")) {
3627 /* If the directory we shall boot is the host, let's operate on a bind mount at a different
3628 * place, so that we can make changes to its mount structure (for example, to implement
3629 * --volatile=) without this interfering with our ability to access files such as
3630 * /etc/localtime to copy into the container. Note that we use a fixed place for this
3631 * (instead of a temporary directory, since we are living in our own mount namspace here
3632 * already, and thus don't need to be afraid of colliding with anyone else's mounts).*/
3633 (void) mkdir_p("/run/systemd/nspawn-root", 0755);
3634
511a8cfe 3635 r = mount_nofollow_verbose(LOG_ERR, "/", "/run/systemd/nspawn-root", NULL, MS_BIND|MS_REC, NULL);
6f83d3d1
LP
3636 if (r < 0)
3637 return r;
3638
3639 directory = "/run/systemd/nspawn-root";
e50cd82f 3640 }
7d0ecdd6
LP
3641
3642 r = setup_pivot_root(
3643 directory,
3644 arg_pivot_root_new,
3645 arg_pivot_root_old);
3646 if (r < 0)
3647 return r;
3648
3649 r = setup_volatile_mode(
3650 directory,
3651 arg_volatile_mode,
7d0ecdd6 3652 arg_uid_shift,
8f1ed04a 3653 arg_selinux_apifs_context);
7d0ecdd6
LP
3654 if (r < 0)
3655 return r;
3656
5f0a6347
DDM
3657 r = mount_custom(
3658 directory,
3659 arg_custom_mounts,
3660 arg_n_custom_mounts,
5f0a6347 3661 arg_uid_shift,
5f0a6347
DDM
3662 arg_selinux_apifs_context,
3663 MOUNT_ROOT_ONLY);
3664 if (r < 0)
3665 return r;
3666
5530dc87
DDM
3667 /* Make sure we always have a mount that we can move to root later on. */
3668 if (!path_is_mount_point(directory, NULL, 0)) {
511a8cfe 3669 r = mount_nofollow_verbose(LOG_ERR, directory, directory, NULL, MS_BIND|MS_REC, NULL);
5530dc87
DDM
3670 if (r < 0)
3671 return r;
3672 }
3673
2d3a5a73
LP
3674 if (dissected_image) {
3675 /* Now we know the uid shift, let's now mount everything else that might be in the image. */
d04faa4e
LP
3676 r = dissected_image_mount(
3677 dissected_image,
3678 directory,
3679 arg_uid_shift,
3680 DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY|
3681 DISSECT_IMAGE_DISCARD_ON_LOOP|
3682 DISSECT_IMAGE_USR_NO_ROOT|
3683 (arg_read_only ? DISSECT_IMAGE_READ_ONLY : DISSECT_IMAGE_FSCK));
4fcb96ce
LP
3684 if (r == -EUCLEAN)
3685 return log_error_errno(r, "File system check for image failed: %m");
2d3a5a73 3686 if (r < 0)
4fcb96ce 3687 return log_error_errno(r, "Failed to mount image file system: %m");
2d3a5a73
LP
3688 }
3689
8199d554
LP
3690 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
3691 /* OK, we don't know yet which cgroup mode to use yet. Let's figure it out, and tell the parent. */
3692
3693 r = detect_unified_cgroup_hierarchy_from_image(directory);
3694 if (r < 0)
3695 return r;
3696
3697 l = send(unified_cgroup_hierarchy_socket, &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), MSG_NOSIGNAL);
3698 if (l < 0)
3699 return log_error_errno(errno, "Failed to send cgroup mode: %m");
baaa35ad
ZJS
3700 if (l != sizeof(arg_unified_cgroup_hierarchy))
3701 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3702 "Short write while sending cgroup mode.");
8199d554
LP
3703
3704 unified_cgroup_hierarchy_socket = safe_close(unified_cgroup_hierarchy_socket);
3705 }
3706
4ad14eff
LP
3707 /* Mark everything as shared so our mounts get propagated down. This is
3708 * required to make new bind mounts available in systemd services
5238e957 3709 * inside the container that create a new mount namespace.
4ad14eff
LP
3710 * See https://github.com/systemd/systemd/issues/3860
3711 * Further submounts (such as /dev) done after this will inherit the
5f0a6347
DDM
3712 * shared propagation mode.
3713 *
3714 * IMPORTANT: Do not overmount the root directory anymore from now on to
3715 * enable moving the root directory mount to root later on.
3716 * https://github.com/systemd/systemd/issues/3847#issuecomment-562735251
3717 */
511a8cfe 3718 r = mount_nofollow_verbose(LOG_ERR, NULL, directory, NULL, MS_SHARED|MS_REC, NULL);
4ad14eff
LP
3719 if (r < 0)
3720 return r;
3721
3722 r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
3723 if (r < 0)
3724 return r;
3725
03cfe0d5
LP
3726 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
3727 if (r < 0)
3728 return r;
3729
bbd407ea
DDM
3730 if (arg_read_only && arg_volatile_mode == VOLATILE_NO &&
3731 !has_custom_root_mount(arg_custom_mounts, arg_n_custom_mounts)) {
64e82c19 3732 r = bind_remount_recursive(directory, MS_RDONLY, MS_RDONLY, NULL);
03cfe0d5
LP
3733 if (r < 0)
3734 return log_error_errno(r, "Failed to make tree read-only: %m");
3735 }
3736
0de7acce 3737 r = mount_all(directory,
4f086aab 3738 arg_mount_settings,
0de7acce 3739 arg_uid_shift,
0de7acce 3740 arg_selinux_apifs_context);
03cfe0d5
LP
3741 if (r < 0)
3742 return r;
3743
07fa00f9
LP
3744 r = copy_devnodes(directory);
3745 if (r < 0)
03cfe0d5
LP
3746 return r;
3747
de40a303
LP
3748 r = make_extra_nodes(directory);
3749 if (r < 0)
3750 return r;
3751
3752 (void) dev_setup(directory, arg_uid_shift, arg_uid_shift);
e5f10caf 3753
9fac5029 3754 p = prefix_roota(directory, "/run/host");
e5f10caf 3755 (void) make_inaccessible_nodes(p, arg_uid_shift, arg_uid_shift);
03cfe0d5 3756
07fa00f9
LP
3757 r = setup_pts(directory);
3758 if (r < 0)
03cfe0d5
LP
3759 return r;
3760
3761 r = setup_propagate(directory);
3762 if (r < 0)
3763 return r;
3764
8e5430c4
LP
3765 r = setup_keyring();
3766 if (r < 0)
3767 return r;
3768
3652872a
LP
3769 r = setup_credentials(directory);
3770 if (r < 0)
3771 return r;
3772
5c4deb9a
MJ
3773 r = mount_custom(
3774 directory,
3775 arg_custom_mounts,
3776 arg_n_custom_mounts,
3777 arg_uid_shift,
3778 arg_selinux_apifs_context,
3779 MOUNT_NON_ROOT_ONLY);
3780 if (r < 0)
3781 return r;
3782
03cfe0d5
LP
3783 r = setup_timezone(directory);
3784 if (r < 0)
3785 return r;
3786
3787 r = setup_resolv_conf(directory);
3788 if (r < 0)
3789 return r;
3790
e01ff70a
MS
3791 r = setup_machine_id(directory);
3792 if (r < 0)
3793 return r;
3794
03cfe0d5
LP
3795 r = setup_journal(directory);
3796 if (r < 0)
3797 return r;
3798
0f48ba7b
LP
3799 /* The same stuff as the $container env var, but nicely readable for the entire payload */
3800 p = prefix_roota(directory, "/run/host/container-manager");
3801 (void) write_string_file(p, arg_container_service_name, WRITE_STRING_FILE_CREATE);
3802
3803 /* The same stuff as the $container_uuid env var */
3804 p = prefix_roota(directory, "/run/host/container-uuid");
3805 (void) write_string_filef(p, WRITE_STRING_FILE_CREATE, SD_ID128_UUID_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid));
3806
489fae52 3807 if (!arg_use_cgns) {
0996ef00
CB
3808 r = mount_cgroups(
3809 directory,
3810 arg_unified_cgroup_hierarchy,
3811 arg_userns_mode != USER_NAMESPACE_NO,
3812 arg_uid_shift,
3813 arg_uid_range,
5a8ff0e6 3814 arg_selinux_apifs_context,
ada54120 3815 false);
0996ef00
CB
3816 if (r < 0)
3817 return r;
3818 }
03cfe0d5
LP
3819
3820 r = mount_move_root(directory);
3821 if (r < 0)
3822 return log_error_errno(r, "Failed to move root directory: %m");
3823
e96ceaba 3824 fd = setup_notify_child();
9c1e04d0
AP
3825 if (fd < 0)
3826 return fd;
3827
03cfe0d5 3828 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
0c582db0 3829 arg_clone_ns_flags |
8869a0b4 3830 (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
03cfe0d5
LP
3831 if (pid < 0)
3832 return log_error_errno(errno, "Failed to fork inner child: %m");
03cfe0d5
LP
3833 if (pid == 0) {
3834 pid_socket = safe_close(pid_socket);
e01ff70a 3835 uuid_socket = safe_close(uuid_socket);
9c1e04d0 3836 notify_socket = safe_close(notify_socket);
825d5287 3837 uid_shift_socket = safe_close(uid_shift_socket);
03cfe0d5 3838
2a2e78e9
LP
3839 /* The inner child has all namespaces that are requested, so that we all are owned by the
3840 * user if user namespaces are turned on. */
03cfe0d5 3841
d7bea6b6
DP
3842 if (arg_network_namespace_path) {
3843 r = namespace_enter(-1, -1, netns_fd, -1, -1);
3844 if (r < 0)
e2d39e54 3845 return log_error_errno(r, "Failed to join network namespace: %m");
d7bea6b6
DP
3846 }
3847
e1bb4b0d 3848 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, master_pty_socket, fds, os_release_pairs);
03cfe0d5
LP
3849 if (r < 0)
3850 _exit(EXIT_FAILURE);
3851
3852 _exit(EXIT_SUCCESS);
3853 }
3854
3855 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
3856 if (l < 0)
3857 return log_error_errno(errno, "Failed to send PID: %m");
baaa35ad
ZJS
3858 if (l != sizeof(pid))
3859 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3860 "Short write while sending PID.");
03cfe0d5 3861
e01ff70a
MS
3862 l = send(uuid_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
3863 if (l < 0)
3864 return log_error_errno(errno, "Failed to send machine ID: %m");
baaa35ad
ZJS
3865 if (l != sizeof(arg_uuid))
3866 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3867 "Short write while sending machine ID.");
e01ff70a 3868
9c1e04d0
AP
3869 l = send_one_fd(notify_socket, fd, 0);
3870 if (l < 0)
ba72801d 3871 return log_error_errno(l, "Failed to send notify fd: %m");
9c1e04d0 3872
03cfe0d5 3873 pid_socket = safe_close(pid_socket);
e01ff70a 3874 uuid_socket = safe_close(uuid_socket);
9c1e04d0 3875 notify_socket = safe_close(notify_socket);
3acc84eb 3876 master_pty_socket = safe_close(master_pty_socket);
327e26d6
KN
3877 kmsg_socket = safe_close(kmsg_socket);
3878 rtnl_socket = safe_close(rtnl_socket);
d7bea6b6 3879 netns_fd = safe_close(netns_fd);
03cfe0d5
LP
3880
3881 return 0;
3882}
3883
0e7ac751 3884static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
d381c8a6 3885 bool tried_hashed = false;
0e7ac751
LP
3886 unsigned n_tries = 100;
3887 uid_t candidate;
3888 int r;
3889
3890 assert(shift);
3891 assert(ret_lock_file);
0de7acce 3892 assert(arg_userns_mode == USER_NAMESPACE_PICK);
0e7ac751
LP
3893 assert(arg_uid_range == 0x10000U);
3894
3895 candidate = *shift;
3896
3897 (void) mkdir("/run/systemd/nspawn-uid", 0755);
3898
3899 for (;;) {
fbd0b64f 3900 char lock_path[STRLEN("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
8e766630 3901 _cleanup_(release_lock_file) LockFile lf = LOCK_FILE_INIT;
0e7ac751
LP
3902
3903 if (--n_tries <= 0)
3904 return -EBUSY;
3905
87d5e4f2 3906 if (candidate < CONTAINER_UID_BASE_MIN || candidate > CONTAINER_UID_BASE_MAX)
0e7ac751
LP
3907 goto next;
3908 if ((candidate & UINT32_C(0xFFFF)) != 0)
3909 goto next;
3910
3911 xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
3912 r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
3913 if (r == -EBUSY) /* Range already taken by another nspawn instance */
3914 goto next;
3915 if (r < 0)
3916 return r;
3917
3918 /* Make some superficial checks whether the range is currently known in the user database */
3919 if (getpwuid(candidate))
3920 goto next;
3921 if (getpwuid(candidate + UINT32_C(0xFFFE)))
3922 goto next;
3923 if (getgrgid(candidate))
3924 goto next;
3925 if (getgrgid(candidate + UINT32_C(0xFFFE)))
3926 goto next;
3927
3928 *ret_lock_file = lf;
3929 lf = (struct LockFile) LOCK_FILE_INIT;
3930 *shift = candidate;
3931 return 0;
3932
3933 next:
d381c8a6
LP
3934 if (arg_machine && !tried_hashed) {
3935 /* Try to hash the base from the container name */
3936
3937 static const uint8_t hash_key[] = {
3938 0xe1, 0x56, 0xe0, 0xf0, 0x4a, 0xf0, 0x41, 0xaf,
3939 0x96, 0x41, 0xcf, 0x41, 0x33, 0x94, 0xff, 0x72
3940 };
3941
3942 candidate = (uid_t) siphash24(arg_machine, strlen(arg_machine), hash_key);
3943
3944 tried_hashed = true;
3945 } else
3946 random_bytes(&candidate, sizeof(candidate));
3947
87d5e4f2 3948 candidate = (candidate % (CONTAINER_UID_BASE_MAX - CONTAINER_UID_BASE_MIN)) + CONTAINER_UID_BASE_MIN;
0e7ac751
LP
3949 candidate &= (uid_t) UINT32_C(0xFFFF0000);
3950 }
3951}
3952
03cfe0d5 3953static int setup_uid_map(pid_t pid) {
fbd0b64f 3954 char uid_map[STRLEN("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
03cfe0d5
LP
3955 int r;
3956
3957 assert(pid > 1);
3958
3959 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
3960 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
57512c89 3961 r = write_string_file(uid_map, line, WRITE_STRING_FILE_DISABLE_BUFFER);
03cfe0d5
LP
3962 if (r < 0)
3963 return log_error_errno(r, "Failed to write UID map: %m");
3964
3965 /* We always assign the same UID and GID ranges */
3966 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
57512c89 3967 r = write_string_file(uid_map, line, WRITE_STRING_FILE_DISABLE_BUFFER);
03cfe0d5
LP
3968 if (r < 0)
3969 return log_error_errno(r, "Failed to write GID map: %m");
3970
3971 return 0;
3972}
3973
9c1e04d0 3974static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
9c1e04d0
AP
3975 char buf[NOTIFY_BUFFER_MAX+1];
3976 char *p = NULL;
3977 struct iovec iovec = {
3978 .iov_base = buf,
3979 .iov_len = sizeof(buf)-1,
3980 };
fb29cdbe
LP
3981 CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred)) +
3982 CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)) control;
9c1e04d0
AP
3983 struct msghdr msghdr = {
3984 .msg_iov = &iovec,
3985 .msg_iovlen = 1,
3986 .msg_control = &control,
3987 .msg_controllen = sizeof(control),
3988 };
371d72e0 3989 struct ucred *ucred;
9c1e04d0
AP
3990 ssize_t n;
3991 pid_t inner_child_pid;
3992 _cleanup_strv_free_ char **tags = NULL;
3993
3994 assert(userdata);
3995
3996 inner_child_pid = PTR_TO_PID(userdata);
3997
3998 if (revents != EPOLLIN) {
3999 log_warning("Got unexpected poll event for notify fd.");
4000 return 0;
4001 }
4002
3691bcf3
LP
4003 n = recvmsg_safe(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
4004 if (IN_SET(n, -EAGAIN, -EINTR))
4005 return 0;
741bfd7f
LP
4006 if (n == -EXFULL) {
4007 log_warning("Got message with truncated control data (too many fds sent?), ignoring.");
4008 return 0;
4009 }
3691bcf3
LP
4010 if (n < 0)
4011 return log_warning_errno(n, "Couldn't read notification socket: %m");
9c1e04d0 4012
9c1e04d0
AP
4013 cmsg_close_all(&msghdr);
4014
371d72e0 4015 ucred = CMSG_FIND_DATA(&msghdr, SOL_SOCKET, SCM_CREDENTIALS, struct ucred);
9c1e04d0 4016 if (!ucred || ucred->pid != inner_child_pid) {
8cb57430 4017 log_debug("Received notify message without valid credentials. Ignoring.");
9c1e04d0
AP
4018 return 0;
4019 }
4020
4021 if ((size_t) n >= sizeof(buf)) {
4022 log_warning("Received notify message exceeded maximum size. Ignoring.");
4023 return 0;
4024 }
4025
4026 buf[n] = 0;
4027 tags = strv_split(buf, "\n\r");
4028 if (!tags)
4029 return log_oom();
4030
4031 if (strv_find(tags, "READY=1"))
04f590a4 4032 (void) sd_notifyf(false, "READY=1\n");
9c1e04d0
AP
4033
4034 p = strv_find_startswith(tags, "STATUS=");
4035 if (p)
04f590a4 4036 (void) sd_notifyf(false, "STATUS=Container running: %s", p);
9c1e04d0
AP
4037
4038 return 0;
4039}
4040
e96ceaba 4041static int setup_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid, sd_event_source **notify_event_source) {
9c1e04d0 4042 int r;
9c1e04d0 4043
5773024d 4044 r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
9c1e04d0
AP
4045 if (r < 0)
4046 return log_error_errno(r, "Failed to allocate notify event source: %m");
4047
5773024d 4048 (void) sd_event_source_set_description(*notify_event_source, "nspawn-notify");
9c1e04d0
AP
4049
4050 return 0;
4051}
4052
5d961407
LP
4053static int merge_settings(Settings *settings, const char *path) {
4054 int rl;
f757855e 4055
5d961407
LP
4056 assert(settings);
4057 assert(path);
f757855e 4058
5d961407
LP
4059 /* Copy over bits from the settings, unless they have been explicitly masked by command line switches. Note
4060 * that this steals the fields of the Settings* structure, and hence modifies it. */
f757855e 4061
7732f92b
LP
4062 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
4063 settings->start_mode >= 0) {
4064 arg_start_mode = settings->start_mode;
130d3d22 4065 strv_free_and_replace(arg_parameters, settings->parameters);
f757855e
LP
4066 }
4067
a2f577fc
JL
4068 if ((arg_settings_mask & SETTING_EPHEMERAL) == 0)
4069 arg_ephemeral = settings->ephemeral;
4070
de40a303
LP
4071 if ((arg_settings_mask & SETTING_DIRECTORY) == 0 &&
4072 settings->root) {
4073
4074 if (!arg_settings_trusted)
4075 log_warning("Ignoring root directory setting, file %s is not trusted.", path);
4076 else
4077 free_and_replace(arg_directory, settings->root);
4078 }
4079
b53ede69
PW
4080 if ((arg_settings_mask & SETTING_PIVOT_ROOT) == 0 &&
4081 settings->pivot_root_new) {
4082 free_and_replace(arg_pivot_root_new, settings->pivot_root_new);
4083 free_and_replace(arg_pivot_root_old, settings->pivot_root_old);
4084 }
4085
5f932eb9 4086 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
1cc6c93a
YW
4087 settings->working_directory)
4088 free_and_replace(arg_chdir, settings->working_directory);
5f932eb9 4089
f757855e 4090 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
130d3d22
YW
4091 settings->environment)
4092 strv_free_and_replace(arg_setenv, settings->environment);
f757855e 4093
de40a303
LP
4094 if ((arg_settings_mask & SETTING_USER) == 0) {
4095
4096 if (settings->user)
4097 free_and_replace(arg_user, settings->user);
4098
4099 if (uid_is_valid(settings->uid))
4100 arg_uid = settings->uid;
4101 if (gid_is_valid(settings->gid))
4102 arg_gid = settings->gid;
4103 if (settings->n_supplementary_gids > 0) {
4104 free_and_replace(arg_supplementary_gids, settings->supplementary_gids);
4105 arg_n_supplementary_gids = settings->n_supplementary_gids;
4106 }
4107 }
f757855e
LP
4108
4109 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
a3fc6b55 4110 uint64_t plus, minus;
7be830c6 4111 uint64_t network_minus = 0;
88fc9c9b 4112 uint64_t ambient;
f757855e 4113
de40a303
LP
4114 /* Note that we copy both the simple plus/minus caps here, and the full quintet from the
4115 * Settings structure */
4116
0e265674 4117 plus = settings->capability;
a3fc6b55
LP
4118 minus = settings->drop_capability;
4119
4120 if ((arg_settings_mask & SETTING_NETWORK) == 0) {
4121 if (settings_private_network(settings))
4122 plus |= UINT64_C(1) << CAP_NET_ADMIN;
4123 else
7be830c6 4124 network_minus |= UINT64_C(1) << CAP_NET_ADMIN;
a3fc6b55 4125 }
0e265674
LP
4126
4127 if (!arg_settings_trusted && plus != 0) {
4128 if (settings->capability != 0)
5d961407 4129 log_warning("Ignoring Capability= setting, file %s is not trusted.", path);
7be830c6
TH
4130 } else {
4131 arg_caps_retain &= ~network_minus;
520e0d54 4132 arg_caps_retain |= plus;
7be830c6 4133 }
f757855e 4134
a3fc6b55 4135 arg_caps_retain &= ~minus;
de40a303
LP
4136
4137 /* Copy the full capabilities over too */
4138 if (capability_quintet_is_set(&settings->full_capabilities)) {
4139 if (!arg_settings_trusted)
5238e957 4140 log_warning("Ignoring capability settings, file %s is not trusted.", path);
de40a303
LP
4141 else
4142 arg_full_capabilities = settings->full_capabilities;
4143 }
88fc9c9b
TH
4144
4145 ambient = settings->ambient_capability;
4146 if (!arg_settings_trusted && ambient != 0)
4147 log_warning("Ignoring AmbientCapability= setting, file %s is not trusted.", path);
4148 else
4149 arg_caps_ambient |= ambient;
f757855e
LP
4150 }
4151
4152 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
4153 settings->kill_signal > 0)
4154 arg_kill_signal = settings->kill_signal;
4155
4156 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
4157 settings->personality != PERSONALITY_INVALID)
4158 arg_personality = settings->personality;
4159
4160 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
4161 !sd_id128_is_null(settings->machine_id)) {
4162
4163 if (!arg_settings_trusted)
5d961407 4164 log_warning("Ignoring MachineID= setting, file %s is not trusted.", path);
f757855e
LP
4165 else
4166 arg_uuid = settings->machine_id;
4167 }
4168
4169 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
4170 settings->read_only >= 0)
4171 arg_read_only = settings->read_only;
4172
4173 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
4174 settings->volatile_mode != _VOLATILE_MODE_INVALID)
4175 arg_volatile_mode = settings->volatile_mode;
4176
4177 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
4178 settings->n_custom_mounts > 0) {
4179
4180 if (!arg_settings_trusted)
5d961407 4181 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", path);
f757855e
LP
4182 else {
4183 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
1cc6c93a 4184 arg_custom_mounts = TAKE_PTR(settings->custom_mounts);
f757855e 4185 arg_n_custom_mounts = settings->n_custom_mounts;
f757855e
LP
4186 settings->n_custom_mounts = 0;
4187 }
4188 }
4189
4190 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
4191 (settings->private_network >= 0 ||
4192 settings->network_veth >= 0 ||
4193 settings->network_bridge ||
22b28dfd 4194 settings->network_zone ||
f757855e
LP
4195 settings->network_interfaces ||
4196 settings->network_macvlan ||
f6d6bad1 4197 settings->network_ipvlan ||
de40a303
LP
4198 settings->network_veth_extra ||
4199 settings->network_namespace_path)) {
f757855e
LP
4200
4201 if (!arg_settings_trusted)
5d961407 4202 log_warning("Ignoring network settings, file %s is not trusted.", path);
f757855e 4203 else {
f6d6bad1 4204 arg_network_veth = settings_network_veth(settings);
0e265674
LP
4205 arg_private_network = settings_private_network(settings);
4206
130d3d22
YW
4207 strv_free_and_replace(arg_network_interfaces, settings->network_interfaces);
4208 strv_free_and_replace(arg_network_macvlan, settings->network_macvlan);
4209 strv_free_and_replace(arg_network_ipvlan, settings->network_ipvlan);
4210 strv_free_and_replace(arg_network_veth_extra, settings->network_veth_extra);
f6d6bad1 4211
1cc6c93a
YW
4212 free_and_replace(arg_network_bridge, settings->network_bridge);
4213 free_and_replace(arg_network_zone, settings->network_zone);
de40a303
LP
4214
4215 free_and_replace(arg_network_namespace_path, settings->network_namespace_path);
f757855e
LP
4216 }
4217 }
4218
4219 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
4220 settings->expose_ports) {
4221
4222 if (!arg_settings_trusted)
5d961407 4223 log_warning("Ignoring Port= setting, file %s is not trusted.", path);
f757855e
LP
4224 else {
4225 expose_port_free_all(arg_expose_ports);
1cc6c93a 4226 arg_expose_ports = TAKE_PTR(settings->expose_ports);
f757855e
LP
4227 }
4228 }
4229
0de7acce
LP
4230 if ((arg_settings_mask & SETTING_USERNS) == 0 &&
4231 settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
4232
4233 if (!arg_settings_trusted)
5d961407 4234 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", path);
0de7acce
LP
4235 else {
4236 arg_userns_mode = settings->userns_mode;
4237 arg_uid_shift = settings->uid_shift;
4238 arg_uid_range = settings->uid_range;
4239 arg_userns_chown = settings->userns_chown;
4240 }
4241 }
4242
9c1e04d0
AP
4243 if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0)
4244 arg_notify_ready = settings->notify_ready;
4245
960e4569
LP
4246 if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) {
4247
6b000af4 4248 if (!arg_settings_trusted && !strv_isempty(settings->syscall_allow_list))
5d961407 4249 log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", path);
960e4569 4250 else {
6b000af4
LP
4251 strv_free_and_replace(arg_syscall_allow_list, settings->syscall_allow_list);
4252 strv_free_and_replace(arg_syscall_deny_list, settings->syscall_deny_list);
960e4569 4253 }
de40a303
LP
4254
4255#if HAVE_SECCOMP
4256 if (!arg_settings_trusted && settings->seccomp)
4257 log_warning("Ignoring SECCOMP filter, file %s is not trusted.", path);
4258 else {
4259 seccomp_release(arg_seccomp);
4260 arg_seccomp = TAKE_PTR(settings->seccomp);
4261 }
4262#endif
960e4569
LP
4263 }
4264
bf428efb
LP
4265 for (rl = 0; rl < _RLIMIT_MAX; rl ++) {
4266 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)))
4267 continue;
4268
4269 if (!settings->rlimit[rl])
4270 continue;
4271
4272 if (!arg_settings_trusted) {
5d961407 4273 log_warning("Ignoring Limit%s= setting, file '%s' is not trusted.", rlimit_to_string(rl), path);
bf428efb
LP
4274 continue;
4275 }
4276
4277 free_and_replace(arg_rlimit[rl], settings->rlimit[rl]);
4278 }
4279
3a9530e5
LP
4280 if ((arg_settings_mask & SETTING_HOSTNAME) == 0 &&
4281 settings->hostname)
4282 free_and_replace(arg_hostname, settings->hostname);
4283
66edd963
LP
4284 if ((arg_settings_mask & SETTING_NO_NEW_PRIVILEGES) == 0 &&
4285 settings->no_new_privileges >= 0)
4286 arg_no_new_privileges = settings->no_new_privileges;
4287
81f345df
LP
4288 if ((arg_settings_mask & SETTING_OOM_SCORE_ADJUST) == 0 &&
4289 settings->oom_score_adjust_set) {
4290
4291 if (!arg_settings_trusted)
5d961407 4292 log_warning("Ignoring OOMScoreAdjust= setting, file '%s' is not trusted.", path);
81f345df
LP
4293 else {
4294 arg_oom_score_adjust = settings->oom_score_adjust;
4295 arg_oom_score_adjust_set = true;
4296 }
4297 }
4298
d107bb7d 4299 if ((arg_settings_mask & SETTING_CPU_AFFINITY) == 0 &&
0985c7c4 4300 settings->cpu_set.set) {
d107bb7d
LP
4301
4302 if (!arg_settings_trusted)
5d961407 4303 log_warning("Ignoring CPUAffinity= setting, file '%s' is not trusted.", path);
d107bb7d 4304 else {
0985c7c4
ZJS
4305 cpu_set_reset(&arg_cpu_set);
4306 arg_cpu_set = settings->cpu_set;
4307 settings->cpu_set = (CPUSet) {};
d107bb7d
LP
4308 }
4309 }
4310
09d423e9
LP
4311 if ((arg_settings_mask & SETTING_RESOLV_CONF) == 0 &&
4312 settings->resolv_conf != _RESOLV_CONF_MODE_INVALID)
4313 arg_resolv_conf = settings->resolv_conf;
4314
4e1d6aa9
LP
4315 if ((arg_settings_mask & SETTING_LINK_JOURNAL) == 0 &&
4316 settings->link_journal != _LINK_JOURNAL_INVALID) {
4317
4318 if (!arg_settings_trusted)
4319 log_warning("Ignoring journal link setting, file '%s' is not trusted.", path);
4320 else {
4321 arg_link_journal = settings->link_journal;
4322 arg_link_journal_try = settings->link_journal_try;
4323 }
4324 }
4325
1688841f
LP
4326 if ((arg_settings_mask & SETTING_TIMEZONE) == 0 &&
4327 settings->timezone != _TIMEZONE_MODE_INVALID)
4328 arg_timezone = settings->timezone;
4329
de40a303
LP
4330 if ((arg_settings_mask & SETTING_SLICE) == 0 &&
4331 settings->slice) {
4332
4333 if (!arg_settings_trusted)
4334 log_warning("Ignoring slice setting, file '%s' is not trusted.", path);
4335 else
4336 free_and_replace(arg_slice, settings->slice);
4337 }
4338
4339 if ((arg_settings_mask & SETTING_USE_CGNS) == 0 &&
4340 settings->use_cgns >= 0) {
4341
4342 if (!arg_settings_trusted)
4343 log_warning("Ignoring cgroup namespace setting, file '%s' is not trusted.", path);
4344 else
4345 arg_use_cgns = settings->use_cgns;
4346 }
4347
4348 if ((arg_settings_mask & SETTING_CLONE_NS_FLAGS) == 0 &&
f5fbe71d 4349 settings->clone_ns_flags != ULONG_MAX) {
de40a303
LP
4350
4351 if (!arg_settings_trusted)
4352 log_warning("Ignoring namespace setting, file '%s' is not trusted.", path);
4353 else
4354 arg_clone_ns_flags = settings->clone_ns_flags;
4355 }
4356
4357 if ((arg_settings_mask & SETTING_CONSOLE_MODE) == 0 &&
4358 settings->console_mode >= 0) {
4359
4360 if (!arg_settings_trusted)
4361 log_warning("Ignoring console mode setting, file '%s' is not trusted.", path);
4362 else
4363 arg_console_mode = settings->console_mode;
4364 }
4365
4366 /* The following properties can only be set through the OCI settings logic, not from the command line, hence we
4367 * don't consult arg_settings_mask for them. */
4368
4369 sd_bus_message_unref(arg_property_message);
4370 arg_property_message = TAKE_PTR(settings->properties);
4371
4372 arg_console_width = settings->console_width;
4373 arg_console_height = settings->console_height;
4374
b2645747 4375 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
de40a303
LP
4376 arg_extra_nodes = TAKE_PTR(settings->extra_nodes);
4377 arg_n_extra_nodes = settings->n_extra_nodes;
4378
f757855e
LP
4379 return 0;
4380}
4381
5d961407
LP
4382static int load_settings(void) {
4383 _cleanup_(settings_freep) Settings *settings = NULL;
4384 _cleanup_fclose_ FILE *f = NULL;
4385 _cleanup_free_ char *p = NULL;
4386 const char *fn, *i;
4387 int r;
4388
de40a303
LP
4389 if (arg_oci_bundle)
4390 return 0;
4391
5d961407
LP
4392 /* If all settings are masked, there's no point in looking for
4393 * the settings file */
d7a0f1f4 4394 if (FLAGS_SET(arg_settings_mask, _SETTINGS_MASK_ALL))
5d961407
LP
4395 return 0;
4396
4397 fn = strjoina(arg_machine, ".nspawn");
4398
4399 /* We first look in the admin's directories in /etc and /run */
4400 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
4401 _cleanup_free_ char *j = NULL;
4402
657ee2d8 4403 j = path_join(i, fn);
5d961407
LP
4404 if (!j)
4405 return log_oom();
4406
4407 f = fopen(j, "re");
4408 if (f) {
4409 p = TAKE_PTR(j);
4410
4411 /* By default, we trust configuration from /etc and /run */
4412 if (arg_settings_trusted < 0)
4413 arg_settings_trusted = true;
4414
4415 break;
4416 }
4417
4418 if (errno != ENOENT)
4419 return log_error_errno(errno, "Failed to open %s: %m", j);
4420 }
4421
4422 if (!f) {
4423 /* After that, let's look for a file next to the
4424 * actual image we shall boot. */
4425
4426 if (arg_image) {
4427 p = file_in_same_dir(arg_image, fn);
4428 if (!p)
4429 return log_oom();
cd6e3914 4430 } else if (arg_directory && !path_equal(arg_directory, "/")) {
5d961407
LP
4431 p = file_in_same_dir(arg_directory, fn);
4432 if (!p)
4433 return log_oom();
4434 }
4435
4436 if (p) {
4437 f = fopen(p, "re");
4438 if (!f && errno != ENOENT)
4439 return log_error_errno(errno, "Failed to open %s: %m", p);
4440
4441 /* By default, we do not trust configuration from /var/lib/machines */
4442 if (arg_settings_trusted < 0)
4443 arg_settings_trusted = false;
4444 }
4445 }
4446
4447 if (!f)
4448 return 0;
4449
4450 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
4451
4452 r = settings_load(f, p, &settings);
4453 if (r < 0)
4454 return r;
4455
4456 return merge_settings(settings, p);
4457}
4458
de40a303
LP
4459static int load_oci_bundle(void) {
4460 _cleanup_(settings_freep) Settings *settings = NULL;
4461 int r;
4462
4463 if (!arg_oci_bundle)
4464 return 0;
4465
4466 /* By default let's trust OCI bundles */
4467 if (arg_settings_trusted < 0)
4468 arg_settings_trusted = true;
4469
4470 r = oci_load(NULL, arg_oci_bundle, &settings);
4471 if (r < 0)
4472 return r;
4473
4474 return merge_settings(settings, arg_oci_bundle);
4475}
4476
3acc84eb 4477static int run_container(
2d845785 4478 DissectedImage *dissected_image,
b0067625
ZJS
4479 bool secondary,
4480 FDSet *fds,
4481 char veth_name[IFNAMSIZ], bool *veth_created,
761cf19d 4482 struct ExposeArgs *expose_args,
3acc84eb 4483 int *master, pid_t *pid, int *ret) {
b0067625
ZJS
4484
4485 static const struct sigaction sa = {
4486 .sa_handler = nop_signal_handler,
e28c7cd0 4487 .sa_flags = SA_NOCLDSTOP|SA_RESTART,
b0067625
ZJS
4488 };
4489
8e766630 4490 _cleanup_(release_lock_file) LockFile uid_shift_lock = LOCK_FILE_INIT;
b0067625
ZJS
4491 _cleanup_close_ int etc_passwd_lock = -1;
4492 _cleanup_close_pair_ int
4493 kmsg_socket_pair[2] = { -1, -1 },
4494 rtnl_socket_pair[2] = { -1, -1 },
4495 pid_socket_pair[2] = { -1, -1 },
4496 uuid_socket_pair[2] = { -1, -1 },
4497 notify_socket_pair[2] = { -1, -1 },
8199d554 4498 uid_shift_socket_pair[2] = { -1, -1 },
3acc84eb 4499 master_pty_socket_pair[2] = { -1, -1 },
8199d554
LP
4500 unified_cgroup_hierarchy_socket_pair[2] = { -1, -1};
4501
3acc84eb 4502 _cleanup_close_ int notify_socket = -1;
b0067625 4503 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
5773024d 4504 _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL;
b0067625
ZJS
4505 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
4506 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4507 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
abdb9b08 4508 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
b0067625 4509 ContainerStatus container_status = 0;
b0067625
ZJS
4510 int ifi = 0, r;
4511 ssize_t l;
4512 sigset_t mask_chld;
5b4855ab 4513 _cleanup_close_ int child_netns_fd = -1;
b0067625
ZJS
4514
4515 assert_se(sigemptyset(&mask_chld) == 0);
4516 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
4517
4518 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4519 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
4520 * check with getpwuid() if the specific user already exists. Note that /etc might be
4521 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
4522 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
4523 * really just an extra safety net. We kinda assume that the UID range we allocate from is
4524 * really ours. */
4525
4526 etc_passwd_lock = take_etc_passwd_lock(NULL);
4527 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS)
4528 return log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m");
4529 }
4530
4531 r = barrier_create(&barrier);
4532 if (r < 0)
4533 return log_error_errno(r, "Cannot initialize IPC barrier: %m");
4534
4535 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0)
4536 return log_error_errno(errno, "Failed to create kmsg socket pair: %m");
4537
4538 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0)
4539 return log_error_errno(errno, "Failed to create rtnl socket pair: %m");
4540
4541 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0)
4542 return log_error_errno(errno, "Failed to create pid socket pair: %m");
4543
4544 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0)
4545 return log_error_errno(errno, "Failed to create id socket pair: %m");
4546
4547 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0)
4548 return log_error_errno(errno, "Failed to create notify socket pair: %m");
4549
3acc84eb
FB
4550 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, master_pty_socket_pair) < 0)
4551 return log_error_errno(errno, "Failed to create console socket pair: %m");
4552
b0067625
ZJS
4553 if (arg_userns_mode != USER_NAMESPACE_NO)
4554 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0)
4555 return log_error_errno(errno, "Failed to create uid shift socket pair: %m");
4556
8199d554
LP
4557 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN)
4558 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, unified_cgroup_hierarchy_socket_pair) < 0)
4559 return log_error_errno(errno, "Failed to create unified cgroup socket pair: %m");
4560
b0067625
ZJS
4561 /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
4562 * parent's blocking calls and give it a chance to call wait() and terminate. */
4563 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
4564 if (r < 0)
4565 return log_error_errno(errno, "Failed to change the signal mask: %m");
4566
4567 r = sigaction(SIGCHLD, &sa, NULL);
4568 if (r < 0)
4569 return log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
4570
d7bea6b6 4571 if (arg_network_namespace_path) {
5b4855ab
DDM
4572 child_netns_fd = open(arg_network_namespace_path, O_RDONLY|O_NOCTTY|O_CLOEXEC);
4573 if (child_netns_fd < 0)
d7bea6b6
DP
4574 return log_error_errno(errno, "Cannot open file %s: %m", arg_network_namespace_path);
4575
54c2459d 4576 r = fd_is_ns(child_netns_fd, CLONE_NEWNET);
6619ad88
LP
4577 if (r == -EUCLEAN)
4578 log_debug_errno(r, "Cannot determine if passed network namespace path '%s' really refers to a network namespace, assuming it does.", arg_network_namespace_path);
4579 else if (r < 0)
d7bea6b6 4580 return log_error_errno(r, "Failed to check %s fs type: %m", arg_network_namespace_path);
c6147113
LP
4581 else if (r == 0)
4582 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
4583 "Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path);
d7bea6b6
DP
4584 }
4585
b0067625
ZJS
4586 *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
4587 if (*pid < 0)
4588 return log_error_errno(errno, "clone() failed%s: %m",
4589 errno == EINVAL ?
4590 ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
4591
4592 if (*pid == 0) {
4593 /* The outer child only has a file system namespace. */
4594 barrier_set_role(&barrier, BARRIER_CHILD);
4595
b0067625
ZJS
4596 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
4597 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4598 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
4599 uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
4600 notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
3acc84eb 4601 master_pty_socket_pair[0] = safe_close(master_pty_socket_pair[0]);
b0067625 4602 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
8199d554 4603 unified_cgroup_hierarchy_socket_pair[0] = safe_close(unified_cgroup_hierarchy_socket_pair[0]);
b0067625
ZJS
4604
4605 (void) reset_all_signal_handlers();
4606 (void) reset_signal_mask();
4607
4608 r = outer_child(&barrier,
4609 arg_directory,
2d845785 4610 dissected_image,
b0067625
ZJS
4611 secondary,
4612 pid_socket_pair[1],
4613 uuid_socket_pair[1],
4614 notify_socket_pair[1],
4615 kmsg_socket_pair[1],
4616 rtnl_socket_pair[1],
4617 uid_shift_socket_pair[1],
3acc84eb 4618 master_pty_socket_pair[1],
8199d554 4619 unified_cgroup_hierarchy_socket_pair[1],
d7bea6b6 4620 fds,
5b4855ab 4621 child_netns_fd);
b0067625
ZJS
4622 if (r < 0)
4623 _exit(EXIT_FAILURE);
4624
4625 _exit(EXIT_SUCCESS);
4626 }
4627
4628 barrier_set_role(&barrier, BARRIER_PARENT);
4629
e4077ff6 4630 fdset_close(fds);
b0067625
ZJS
4631
4632 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4633 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4634 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
4635 uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
4636 notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
3acc84eb 4637 master_pty_socket_pair[1] = safe_close(master_pty_socket_pair[1]);
b0067625 4638 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
8199d554 4639 unified_cgroup_hierarchy_socket_pair[1] = safe_close(unified_cgroup_hierarchy_socket_pair[1]);
b0067625
ZJS
4640
4641 if (arg_userns_mode != USER_NAMESPACE_NO) {
4642 /* The child just let us know the UID shift it might have read from the image. */
4643 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
4644 if (l < 0)
4645 return log_error_errno(errno, "Failed to read UID shift: %m");
c6147113
LP
4646 if (l != sizeof arg_uid_shift)
4647 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading UID shift.");
b0067625
ZJS
4648
4649 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4650 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
4651 * image, but if that's already in use, pick a new one, and report back to the child,
4652 * which one we now picked. */
4653
4654 r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
4655 if (r < 0)
4656 return log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
4657
4658 l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
4659 if (l < 0)
4660 return log_error_errno(errno, "Failed to send UID shift: %m");
c6147113
LP
4661 if (l != sizeof arg_uid_shift)
4662 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short write while writing UID shift.");
b0067625
ZJS
4663 }
4664 }
4665
8199d554
LP
4666 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
4667 /* The child let us know the support cgroup mode it might have read from the image. */
4668 l = recv(unified_cgroup_hierarchy_socket_pair[0], &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), 0);
4669 if (l < 0)
4670 return log_error_errno(errno, "Failed to read cgroup mode: %m");
c6147113
LP
4671 if (l != sizeof(arg_unified_cgroup_hierarchy))
4672 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading cgroup mode (%zu bytes).%s",
4673 l, l == 0 ? " The child is most likely dead." : "");
8199d554
LP
4674 }
4675
b0067625 4676 /* Wait for the outer child. */
d2e0ac3d
LP
4677 r = wait_for_terminate_and_check("(sd-namespace)", *pid, WAIT_LOG_ABNORMAL);
4678 if (r < 0)
4679 return r;
4680 if (r != EXIT_SUCCESS)
4681 return -EIO;
b0067625
ZJS
4682
4683 /* And now retrieve the PID of the inner child. */
4684 l = recv(pid_socket_pair[0], pid, sizeof *pid, 0);
4685 if (l < 0)
4686 return log_error_errno(errno, "Failed to read inner child PID: %m");
c6147113
LP
4687 if (l != sizeof *pid)
4688 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading inner child PID.");
b0067625
ZJS
4689
4690 /* We also retrieve container UUID in case it was generated by outer child */
4691 l = recv(uuid_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
4692 if (l < 0)
4693 return log_error_errno(errno, "Failed to read container machine ID: %m");
c6147113
LP
4694 if (l != sizeof(arg_uuid))
4695 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading container machined ID.");
b0067625
ZJS
4696
4697 /* We also retrieve the socket used for notifications generated by outer child */
4698 notify_socket = receive_one_fd(notify_socket_pair[0], 0);
4699 if (notify_socket < 0)
4700 return log_error_errno(notify_socket,
4701 "Failed to receive notification socket from the outer child: %m");
4702
4703 log_debug("Init process invoked as PID "PID_FMT, *pid);
4704
4705 if (arg_userns_mode != USER_NAMESPACE_NO) {
c6147113
LP
4706 if (!barrier_place_and_sync(&barrier)) /* #1 */
4707 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
b0067625
ZJS
4708
4709 r = setup_uid_map(*pid);
4710 if (r < 0)
4711 return r;
4712
4713 (void) barrier_place(&barrier); /* #2 */
4714 }
4715
4716 if (arg_private_network) {
75116558
PS
4717 if (!arg_network_namespace_path) {
4718 /* Wait until the child has unshared its network namespace. */
c6147113
LP
4719 if (!barrier_place_and_sync(&barrier)) /* #3 */
4720 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early");
75116558
PS
4721 }
4722
5b4855ab
DDM
4723 if (child_netns_fd < 0) {
4724 /* Make sure we have an open file descriptor to the child's network
4725 * namespace so it stays alive even if the child exits. */
4726 r = namespace_open(*pid, NULL, NULL, &child_netns_fd, NULL, NULL);
4727 if (r < 0)
4728 return log_error_errno(r, "Failed to open child network namespace: %m");
4729 }
4730
4731 r = move_network_interfaces(child_netns_fd, arg_network_interfaces);
b0067625
ZJS
4732 if (r < 0)
4733 return r;
4734
4735 if (arg_network_veth) {
4736 r = setup_veth(arg_machine, *pid, veth_name,
4737 arg_network_bridge || arg_network_zone);
4738 if (r < 0)
4739 return r;
4740 else if (r > 0)
4741 ifi = r;
4742
4743 if (arg_network_bridge) {
4744 /* Add the interface to a bridge */
4745 r = setup_bridge(veth_name, arg_network_bridge, false);
4746 if (r < 0)
4747 return r;
4748 if (r > 0)
4749 ifi = r;
4750 } else if (arg_network_zone) {
4751 /* Add the interface to a bridge, possibly creating it */
4752 r = setup_bridge(veth_name, arg_network_zone, true);
4753 if (r < 0)
4754 return r;
4755 if (r > 0)
4756 ifi = r;
4757 }
4758 }
4759
4760 r = setup_veth_extra(arg_machine, *pid, arg_network_veth_extra);
4761 if (r < 0)
4762 return r;
4763
4764 /* We created the primary and extra veth links now; let's remember this, so that we know to
4765 remove them later on. Note that we don't bother with removing veth links that were created
4766 here when their setup failed half-way, because in that case the kernel should be able to
4767 remove them on its own, since they cannot be referenced by anything yet. */
4768 *veth_created = true;
4769
4770 r = setup_macvlan(arg_machine, *pid, arg_network_macvlan);
4771 if (r < 0)
4772 return r;
4773
4774 r = setup_ipvlan(arg_machine, *pid, arg_network_ipvlan);
4775 if (r < 0)
4776 return r;
4777 }
4778
abdb9b08
LP
4779 if (arg_register || !arg_keep_unit) {
4780 r = sd_bus_default_system(&bus);
4781 if (r < 0)
4782 return log_error_errno(r, "Failed to open system bus: %m");
e5a2d8b5
LP
4783
4784 r = sd_bus_set_close_on_exit(bus, false);
4785 if (r < 0)
4786 return log_error_errno(r, "Failed to disable close-on-exit behaviour: %m");
abdb9b08
LP
4787 }
4788
4789 if (!arg_keep_unit) {
4790 /* When a new scope is created for this container, then we'll be registered as its controller, in which
4791 * case PID 1 will send us a friendly RequestStop signal, when it is asked to terminate the
4792 * scope. Let's hook into that, and cleanly shut down the container, and print a friendly message. */
4793
75152a4d
LP
4794 r = sd_bus_match_signal_async(
4795 bus,
4796 NULL,
4797 "org.freedesktop.systemd1",
4798 NULL,
4799 "org.freedesktop.systemd1.Scope",
4800 "RequestStop",
4801 on_request_stop, NULL, PID_TO_PTR(*pid));
abdb9b08 4802 if (r < 0)
75152a4d 4803 return log_error_errno(r, "Failed to request RequestStop match: %m");
abdb9b08
LP
4804 }
4805
b0067625
ZJS
4806 if (arg_register) {
4807 r = register_machine(
abdb9b08 4808 bus,
b0067625
ZJS
4809 arg_machine,
4810 *pid,
4811 arg_directory,
4812 arg_uuid,
4813 ifi,
4814 arg_slice,
4815 arg_custom_mounts, arg_n_custom_mounts,
4816 arg_kill_signal,
4817 arg_property,
de40a303 4818 arg_property_message,
b0067625
ZJS
4819 arg_keep_unit,
4820 arg_container_service_name);
4821 if (r < 0)
4822 return r;
abdb9b08 4823
cd2dfc6f
LP
4824 } else if (!arg_keep_unit) {
4825 r = allocate_scope(
abdb9b08 4826 bus,
cd2dfc6f
LP
4827 arg_machine,
4828 *pid,
4829 arg_slice,
4830 arg_custom_mounts, arg_n_custom_mounts,
4831 arg_kill_signal,
de40a303
LP
4832 arg_property,
4833 arg_property_message);
cd2dfc6f
LP
4834 if (r < 0)
4835 return r;
4836
4837 } else if (arg_slice || arg_property)
4838 log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect.");
b0067625 4839
27da7ef0 4840 r = create_subcgroup(*pid, arg_keep_unit, arg_unified_cgroup_hierarchy);
b0067625
ZJS
4841 if (r < 0)
4842 return r;
4843
27da7ef0 4844 r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
720f0a2f
LP
4845 if (r < 0)
4846 return r;
b0067625 4847
de54e02d 4848 r = chown_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
b0067625
ZJS
4849 if (r < 0)
4850 return r;
4851
4852 /* Notify the child that the parent is ready with all
4853 * its setup (including cgroup-ification), and that
4854 * the child can now hand over control to the code to
4855 * run inside the container. */
75116558 4856 (void) barrier_place(&barrier); /* #4 */
b0067625
ZJS
4857
4858 /* Block SIGCHLD here, before notifying child.
4859 * process_pty() will handle it with the other signals. */
4860 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
4861
4862 /* Reset signal to default */
9c274488 4863 r = default_signals(SIGCHLD);
b0067625
ZJS
4864 if (r < 0)
4865 return log_error_errno(r, "Failed to reset SIGCHLD: %m");
4866
4867 r = sd_event_new(&event);
4868 if (r < 0)
4869 return log_error_errno(r, "Failed to get default event source: %m");
4870
8fd010bb
LP
4871 (void) sd_event_set_watchdog(event, true);
4872
abdb9b08
LP
4873 if (bus) {
4874 r = sd_bus_attach_event(bus, event, 0);
4875 if (r < 0)
4876 return log_error_errno(r, "Failed to attach bus to event loop: %m");
4877 }
4878
e96ceaba 4879 r = setup_notify_parent(event, notify_socket, PID_TO_PTR(*pid), &notify_event_source);
b0067625
ZJS
4880 if (r < 0)
4881 return r;
4882
4883 /* Let the child know that we are ready and wait that the child is completely ready now. */
c6147113
LP
4884 if (!barrier_place_and_sync(&barrier)) /* #5 */
4885 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
b0067625 4886
38ccb557 4887 /* At this point we have made use of the UID we picked, and thus nss-systemd/systemd-machined.service
b0067625
ZJS
4888 * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
4889 etc_passwd_lock = safe_close(etc_passwd_lock);
4890
04f590a4
LP
4891 (void) sd_notifyf(false,
4892 "STATUS=Container running.\n"
4893 "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
b0067625 4894 if (!arg_notify_ready)
919f5ae0 4895 (void) sd_notify(false, "READY=1\n");
b0067625
ZJS
4896
4897 if (arg_kill_signal > 0) {
4898 /* Try to kill the init system on SIGINT or SIGTERM */
919f5ae0
LP
4899 (void) sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
4900 (void) sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
b0067625
ZJS
4901 } else {
4902 /* Immediately exit */
919f5ae0
LP
4903 (void) sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4904 (void) sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
b0067625
ZJS
4905 }
4906
6916b164 4907 /* Exit when the child exits */
919f5ae0 4908 (void) sd_event_add_signal(event, NULL, SIGCHLD, on_sigchld, PID_TO_PTR(*pid));
b0067625
ZJS
4909
4910 if (arg_expose_ports) {
761cf19d 4911 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, expose_args, &rtnl);
b0067625
ZJS
4912 if (r < 0)
4913 return r;
4914
deff68e7
FW
4915 (void) expose_port_execute(rtnl, &expose_args->fw_ctx, arg_expose_ports, AF_INET, &expose_args->address4);
4916 (void) expose_port_execute(rtnl, &expose_args->fw_ctx, arg_expose_ports, AF_INET6, &expose_args->address6);
b0067625
ZJS
4917 }
4918
4919 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4920
3acc84eb
FB
4921 if (arg_console_mode != CONSOLE_PIPE) {
4922 _cleanup_close_ int fd = -1;
4923 PTYForwardFlags flags = 0;
de40a303 4924
3acc84eb
FB
4925 /* Retrieve the master pty allocated by inner child */
4926 fd = receive_one_fd(master_pty_socket_pair[0], 0);
4927 if (fd < 0)
4928 return log_error_errno(fd, "Failed to receive master pty from the inner child: %m");
4929
4930 switch (arg_console_mode) {
de40a303 4931
3acc84eb
FB
4932 case CONSOLE_READ_ONLY:
4933 flags |= PTY_FORWARD_READ_ONLY;
4934
4935 _fallthrough_;
4936
4937 case CONSOLE_INTERACTIVE:
4938 flags |= PTY_FORWARD_IGNORE_VHANGUP;
4939
4940 r = pty_forward_new(event, fd, flags, &forward);
4941 if (r < 0)
4942 return log_error_errno(r, "Failed to create PTY forwarder: %m");
4943
f5fbe71d 4944 if (arg_console_width != UINT_MAX || arg_console_height != UINT_MAX)
3acc84eb
FB
4945 (void) pty_forward_set_width_height(forward,
4946 arg_console_width,
4947 arg_console_height);
4948 break;
4949
4950 default:
4951 assert(arg_console_mode == CONSOLE_PASSIVE);
4952 }
4953
4954 *master = TAKE_FD(fd);
de40a303 4955 }
b0067625
ZJS
4956
4957 r = sd_event_loop(event);
4958 if (r < 0)
4959 return log_error_errno(r, "Failed to run event loop: %m");
4960
de40a303
LP
4961 if (forward) {
4962 char last_char = 0;
b0067625 4963
de40a303
LP
4964 (void) pty_forward_get_last_char(forward, &last_char);
4965 forward = pty_forward_free(forward);
b0067625 4966
de40a303
LP
4967 if (!arg_quiet && last_char != '\n')
4968 putc('\n', stdout);
4969 }
b0067625
ZJS
4970
4971 /* Kill if it is not dead yet anyway */
0bb0a9fa
ZJS
4972 if (!arg_register && !arg_keep_unit && bus)
4973 terminate_scope(bus, arg_machine);
b0067625
ZJS
4974
4975 /* Normally redundant, but better safe than sorry */
c67b0082 4976 (void) kill(*pid, SIGKILL);
b0067625 4977
5b4855ab
DDM
4978 if (arg_private_network) {
4979 /* Move network interfaces back to the parent network namespace. We use `safe_fork`
4980 * to avoid having to move the parent to the child network namespace. */
4981 r = safe_fork(NULL, FORK_RESET_SIGNALS|FORK_DEATHSIG|FORK_WAIT|FORK_LOG, NULL);
4982 if (r < 0)
4983 return r;
4984
4985 if (r == 0) {
4986 _cleanup_close_ int parent_netns_fd = -1;
4987
4988 r = namespace_open(getpid(), NULL, NULL, &parent_netns_fd, NULL, NULL);
4989 if (r < 0) {
4990 log_error_errno(r, "Failed to open parent network namespace: %m");
4991 _exit(EXIT_FAILURE);
4992 }
4993
4994 r = namespace_enter(-1, -1, child_netns_fd, -1, -1);
4995 if (r < 0) {
4996 log_error_errno(r, "Failed to enter child network namespace: %m");
4997 _exit(EXIT_FAILURE);
4998 }
4999
5000 r = move_network_interfaces(parent_netns_fd, arg_network_interfaces);
5001 if (r < 0)
5002 log_error_errno(r, "Failed to move network interfaces back to parent network namespace: %m");
5003
5004 _exit(r < 0 ? EXIT_FAILURE : EXIT_SUCCESS);
5005 }
5006 }
5007
b0067625
ZJS
5008 r = wait_for_container(*pid, &container_status);
5009 *pid = 0;
5010
0bb0a9fa
ZJS
5011 /* Tell machined that we are gone. */
5012 if (bus)
5013 (void) unregister_machine(bus, arg_machine);
5014
b0067625
ZJS
5015 if (r < 0)
5016 /* We failed to wait for the container, or the container exited abnormally. */
5017 return r;
5018 if (r > 0 || container_status == CONTAINER_TERMINATED) {
27e29a1e
ZJS
5019 /* r > 0 → The container exited with a non-zero status.
5020 * As a special case, we need to replace 133 with a different value,
5021 * because 133 is special-cased in the service file to reboot the container.
5022 * otherwise → The container exited with zero status and a reboot was not requested.
5023 */
2a49b612 5024 if (r == EXIT_FORCE_RESTART)
27e29a1e 5025 r = EXIT_FAILURE; /* replace 133 with the general failure code */
b0067625 5026 *ret = r;
b0067625
ZJS
5027 return 0; /* finito */
5028 }
5029
5030 /* CONTAINER_REBOOTED, loop again */
5031
5032 if (arg_keep_unit) {
5033 /* Special handling if we are running as a service: instead of simply
5034 * restarting the machine we want to restart the entire service, so let's
5035 * inform systemd about this with the special exit code 133. The service
5036 * file uses RestartForceExitStatus=133 so that this results in a full
5037 * nspawn restart. This is necessary since we might have cgroup parameters
5038 * set we want to have flushed out. */
2a49b612
ZJS
5039 *ret = EXIT_FORCE_RESTART;
5040 return 0; /* finito */
b0067625
ZJS
5041 }
5042
deff68e7
FW
5043 expose_port_flush(&expose_args->fw_ctx, arg_expose_ports, AF_INET, &expose_args->address4);
5044 expose_port_flush(&expose_args->fw_ctx, arg_expose_ports, AF_INET6, &expose_args->address6);
b0067625
ZJS
5045
5046 (void) remove_veth_links(veth_name, arg_network_veth_extra);
5047 *veth_created = false;
5048 return 1; /* loop again */
5049}
5050
bf428efb 5051static int initialize_rlimits(void) {
bf428efb
LP
5052 /* The default resource limits the kernel passes to PID 1, as per kernel 4.16. Let's pass our container payload
5053 * the same values as the kernel originally passed to PID 1, in order to minimize differences between host and
5054 * container execution environments. */
5055
5056 static const struct rlimit kernel_defaults[_RLIMIT_MAX] = {
5057 [RLIMIT_AS] = { RLIM_INFINITY, RLIM_INFINITY },
5058 [RLIMIT_CORE] = { 0, RLIM_INFINITY },
5059 [RLIMIT_CPU] = { RLIM_INFINITY, RLIM_INFINITY },
5060 [RLIMIT_DATA] = { RLIM_INFINITY, RLIM_INFINITY },
5061 [RLIMIT_FSIZE] = { RLIM_INFINITY, RLIM_INFINITY },
5062 [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY },
5063 [RLIMIT_MEMLOCK] = { 65536, 65536 },
5064 [RLIMIT_MSGQUEUE] = { 819200, 819200 },
5065 [RLIMIT_NICE] = { 0, 0 },
5066 [RLIMIT_NOFILE] = { 1024, 4096 },
5067 [RLIMIT_RSS] = { RLIM_INFINITY, RLIM_INFINITY },
5068 [RLIMIT_RTPRIO] = { 0, 0 },
5069 [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY },
5070 [RLIMIT_STACK] = { 8388608, RLIM_INFINITY },
5071
5072 /* The kernel scales the default for RLIMIT_NPROC and RLIMIT_SIGPENDING based on the system's amount of
5073 * RAM. To provide best compatibility we'll read these limits off PID 1 instead of hardcoding them
5074 * here. This is safe as we know that PID 1 doesn't change these two limits and thus the original
5075 * kernel's initialization should still be valid during runtime — at least if PID 1 is systemd. Note
5076 * that PID 1 changes a number of other resource limits during early initialization which is why we
5077 * don't read the other limits from PID 1 but prefer the static table above. */
5078 };
5079
5080 int rl;
5081
5082 for (rl = 0; rl < _RLIMIT_MAX; rl++) {
bf428efb
LP
5083 /* Let's only fill in what the user hasn't explicitly configured anyway */
5084 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)) == 0) {
5085 const struct rlimit *v;
5086 struct rlimit buffer;
5087
5088 if (IN_SET(rl, RLIMIT_NPROC, RLIMIT_SIGPENDING)) {
5089 /* For these two let's read the limits off PID 1. See above for an explanation. */
5090
5091 if (prlimit(1, rl, NULL, &buffer) < 0)
5092 return log_error_errno(errno, "Failed to read resource limit RLIMIT_%s of PID 1: %m", rlimit_to_string(rl));
5093
5094 v = &buffer;
5095 } else
5096 v = kernel_defaults + rl;
5097
5098 arg_rlimit[rl] = newdup(struct rlimit, v, 1);
5099 if (!arg_rlimit[rl])
5100 return log_oom();
5101 }
5102
5103 if (DEBUG_LOGGING) {
5104 _cleanup_free_ char *k = NULL;
5105
5106 (void) rlimit_format(arg_rlimit[rl], &k);
5107 log_debug("Setting RLIMIT_%s to %s.", rlimit_to_string(rl), k);
5108 }
5109 }
5110
5111 return 0;
5112}
5113
287b7376
LP
5114static int cant_be_in_netns(void) {
5115 union sockaddr_union sa = {
5116 .un = {
5117 .sun_family = AF_UNIX,
5118 .sun_path = "/run/udev/control",
5119 },
5120 };
5121 char udev_path[STRLEN("/proc//ns/net") + DECIMAL_STR_MAX(pid_t)];
5122 _cleanup_free_ char *udev_ns = NULL, *our_ns = NULL;
5123 _cleanup_close_ int fd = -1;
5124 struct ucred ucred;
5125 int r;
5126
5127 /* Check if we are in the same netns as udev. If we aren't, then device monitoring (and thus waiting
5128 * for loopback block devices) won't work, and we will hang. Detect this case and exit early with a
5129 * nice message. */
5130
5131 if (!arg_image) /* only matters if --image= us used, i.e. we actually need to use loopback devices */
5132 return 0;
5133
5134 fd = socket(AF_UNIX, SOCK_SEQPACKET|SOCK_NONBLOCK|SOCK_CLOEXEC, 0);
5135 if (fd < 0)
5136 return log_error_errno(errno, "Failed to allocate udev control socket: %m");
5137
5138 if (connect(fd, &sa.un, SOCKADDR_UN_LEN(sa.un)) < 0) {
5139
5140 if (errno == ENOENT || ERRNO_IS_DISCONNECT(errno))
5141 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
5142 "Sorry, but --image= requires access to the host's /run/ hierarchy, since we need access to udev.");
5143
5144 return log_error_errno(errno, "Failed to connect socket to udev control socket: %m");
5145 }
5146
5147 r = getpeercred(fd, &ucred);
5148 if (r < 0)
5149 return log_error_errno(r, "Failed to determine peer of udev control socket: %m");
5150
5151 xsprintf(udev_path, "/proc/" PID_FMT "/ns/net", ucred.pid);
5152 r = readlink_malloc(udev_path, &udev_ns);
5153 if (r < 0)
5154 return log_error_errno(r, "Failed to read network namespace of udev: %m");
5155
5156 r = readlink_malloc("/proc/self/ns/net", &our_ns);
5157 if (r < 0)
5158 return log_error_errno(r, "Failed to read our own network namespace: %m");
5159
5160 if (!streq(our_ns, udev_ns))
5161 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
5162 "Sorry, but --image= is only supported in the main network namespace, since we need access to udev/AF_NETLINK.");
5163 return 0;
5164}
5165
44dbef90 5166static int run(int argc, char *argv[]) {
7bf011e3
LP
5167 bool secondary = false, remove_directory = false, remove_image = false,
5168 veth_created = false, remove_tmprootdir = false;
2d845785 5169 _cleanup_close_ int master = -1;
03cfe0d5 5170 _cleanup_fdset_free_ FDSet *fds = NULL;
2d845785 5171 int r, n_fd_passed, ret = EXIT_SUCCESS;
5aa3eba5 5172 char veth_name[IFNAMSIZ] = "";
761cf19d 5173 struct ExposeArgs expose_args = {};
8e766630 5174 _cleanup_(release_lock_file) LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
c67b0082 5175 char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
2d845785 5176 _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
18b5886e
LP
5177 _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
5178 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
761cf19d 5179 _cleanup_(fw_ctx_freep) FirewallContext *fw_ctx = NULL;
7bf011e3 5180 pid_t pid = 0;
03cfe0d5
LP
5181
5182 log_parse_environment();
5183 log_open();
415fc41c 5184
03cfe0d5
LP
5185 r = parse_argv(argc, argv);
5186 if (r <= 0)
5187 goto finish;
5188
38ee19c0
ZJS
5189 if (geteuid() != 0) {
5190 r = log_warning_errno(SYNTHETIC_ERRNO(EPERM),
5191 argc >= 2 ? "Need to be root." :
5192 "Need to be root (and some arguments are usually required).\nHint: try --help");
03cfe0d5 5193 goto finish;
38ee19c0 5194 }
fba868fa 5195
287b7376
LP
5196 r = cant_be_in_netns();
5197 if (r < 0)
5198 goto finish;
5199
bf428efb
LP
5200 r = initialize_rlimits();
5201 if (r < 0)
5202 goto finish;
5203
de40a303
LP
5204 r = load_oci_bundle();
5205 if (r < 0)
5206 goto finish;
5207
f757855e
LP
5208 r = determine_names();
5209 if (r < 0)
5210 goto finish;
5211
5212 r = load_settings();
5213 if (r < 0)
5214 goto finish;
5215
d4d99bc6 5216 r = cg_unified();
5eee8290
LP
5217 if (r < 0) {
5218 log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
5219 goto finish;
5220 }
5221
f757855e
LP
5222 r = verify_arguments();
5223 if (r < 0)
5224 goto finish;
03cfe0d5 5225
49048684
ZJS
5226 /* Reapply environment settings. */
5227 (void) detect_unified_cgroup_hierarchy_from_environment();
8199d554 5228
2949ff26
LP
5229 /* Ignore SIGPIPE here, because we use splice() on the ptyfwd stuff and that will generate SIGPIPE if
5230 * the result is closed. Note that the container payload child will reset signal mask+handler anyway,
5231 * so just turning this off here means we only turn it off in nspawn itself, not any children. */
9c274488 5232 (void) ignore_signals(SIGPIPE);
2949ff26 5233
03cfe0d5
LP
5234 n_fd_passed = sd_listen_fds(false);
5235 if (n_fd_passed > 0) {
5236 r = fdset_new_listen_fds(&fds, false);
5237 if (r < 0) {
5238 log_error_errno(r, "Failed to collect file descriptors: %m");
5239 goto finish;
5240 }
5241 }
5242
83e803a9
ZJS
5243 /* The "default" umask. This is appropriate for most file and directory
5244 * operations performed by nspawn, and is the umask that will be used for
5245 * the child. Functions like copy_devnodes() change the umask temporarily. */
5246 umask(0022);
5247
03cfe0d5
LP
5248 if (arg_directory) {
5249 assert(!arg_image);
5250
b35ca61a
LP
5251 /* Safety precaution: let's not allow running images from the live host OS image, as long as
5252 * /var from the host will propagate into container dynamically (because bad things happen if
5253 * two systems write to the same /var). Let's allow it for the special cases where /var is
5254 * either copied (i.e. --ephemeral) or replaced (i.e. --volatile=yes|state). */
5255 if (path_equal(arg_directory, "/") && !(arg_ephemeral || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_STATE))) {
5256 log_error("Spawning container on root directory is not supported. Consider using --ephemeral, --volatile=yes or --volatile=state.");
03cfe0d5
LP
5257 r = -EINVAL;
5258 goto finish;
5259 }
5260
5261 if (arg_ephemeral) {
5262 _cleanup_free_ char *np = NULL;
5263
8d4aa2bb 5264 r = chase_symlinks_and_update(&arg_directory, 0);
3f342ec4
LP
5265 if (r < 0)
5266 goto finish;
5267
7bf011e3
LP
5268 /* If the specified path is a mount point we generate the new snapshot immediately
5269 * inside it under a random name. However if the specified is not a mount point we
5270 * create the new snapshot in the parent directory, just next to it. */
e1873695 5271 r = path_is_mount_point(arg_directory, NULL, 0);
03cfe0d5
LP
5272 if (r < 0) {
5273 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
5274 goto finish;
5275 }
5276 if (r > 0)
770b5ce4 5277 r = tempfn_random_child(arg_directory, "machine.", &np);
03cfe0d5 5278 else
770b5ce4 5279 r = tempfn_random(arg_directory, "machine.", &np);
03cfe0d5 5280 if (r < 0) {
0f3be6ca 5281 log_error_errno(r, "Failed to generate name for directory snapshot: %m");
03cfe0d5
LP
5282 goto finish;
5283 }
5284
6992459c 5285 /* We take an exclusive lock on this image, since it's our private, ephemeral copy
162392b7 5286 * only owned by us and no one else. */
6992459c 5287 r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
03cfe0d5
LP
5288 if (r < 0) {
5289 log_error_errno(r, "Failed to lock %s: %m", np);
5290 goto finish;
5291 }
5292
7bf011e3
LP
5293 {
5294 BLOCK_SIGNALS(SIGINT);
5295 r = btrfs_subvol_snapshot(arg_directory, np,
5296 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
5297 BTRFS_SNAPSHOT_FALLBACK_COPY |
5298 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
5299 BTRFS_SNAPSHOT_RECURSIVE |
5300 BTRFS_SNAPSHOT_QUOTA |
5301 BTRFS_SNAPSHOT_SIGINT);
5302 }
5303 if (r == -EINTR) {
5304 log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", np);
5305 goto finish;
5306 }
03cfe0d5
LP
5307 if (r < 0) {
5308 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
5309 goto finish;
ec16945e
LP
5310 }
5311
1cc6c93a 5312 free_and_replace(arg_directory, np);
17cbb288 5313 remove_directory = true;
30535c16 5314 } else {
cb638b5e 5315 r = chase_symlinks_and_update(&arg_directory, arg_template ? CHASE_NONEXISTENT : 0);
8d4aa2bb
LP
5316 if (r < 0)
5317 goto finish;
5318
30535c16
LP
5319 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
5320 if (r == -EBUSY) {
5321 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
5322 goto finish;
5323 }
5324 if (r < 0) {
5325 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
476b8254 5326 goto finish;
30535c16
LP
5327 }
5328
5329 if (arg_template) {
8d4aa2bb 5330 r = chase_symlinks_and_update(&arg_template, 0);
3f342ec4
LP
5331 if (r < 0)
5332 goto finish;
5333
7bf011e3
LP
5334 {
5335 BLOCK_SIGNALS(SIGINT);
5336 r = btrfs_subvol_snapshot(arg_template, arg_directory,
5337 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
5338 BTRFS_SNAPSHOT_FALLBACK_COPY |
5339 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
5340 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
5341 BTRFS_SNAPSHOT_RECURSIVE |
5342 BTRFS_SNAPSHOT_QUOTA |
5343 BTRFS_SNAPSHOT_SIGINT);
5344 }
ff6c6cc1
LP
5345 if (r == -EEXIST)
5346 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
5347 "Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
7bf011e3
LP
5348 else if (r == -EINTR) {
5349 log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", arg_directory);
5350 goto finish;
5351 } else if (r < 0) {
83521414 5352 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
30535c16 5353 goto finish;
ff6c6cc1
LP
5354 } else
5355 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
5356 "Populated %s from template %s.", arg_directory, arg_template);
30535c16 5357 }
ec16945e
LP
5358 }
5359
7732f92b 5360 if (arg_start_mode == START_BOOT) {
a5201ed6 5361 const char *p;
c9fe05e0 5362
a5201ed6
LP
5363 if (arg_pivot_root_new)
5364 p = prefix_roota(arg_directory, arg_pivot_root_new);
5365 else
5366 p = arg_directory;
c9fe05e0
AR
5367
5368 if (path_is_os_tree(p) <= 0) {
5369 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", p);
ec16945e 5370 r = -EINVAL;
1b9e5b12
LP
5371 goto finish;
5372 }
5373 } else {
c9fe05e0
AR
5374 const char *p, *q;
5375
a5201ed6
LP
5376 if (arg_pivot_root_new)
5377 p = prefix_roota(arg_directory, arg_pivot_root_new);
5378 else
5379 p = arg_directory;
c9fe05e0
AR
5380
5381 q = strjoina(p, "/usr/");
1b9e5b12 5382
c9fe05e0
AR
5383 if (laccess(q, F_OK) < 0) {
5384 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", p);
ec16945e 5385 r = -EINVAL;
1b9e5b12 5386 goto finish;
1b9e5b12
LP
5387 }
5388 }
ec16945e 5389
6b9132a9 5390 } else {
d04faa4e
LP
5391 DissectImageFlags dissect_image_flags =
5392 DISSECT_IMAGE_REQUIRE_ROOT |
5393 DISSECT_IMAGE_RELAX_VAR_CHECK |
5394 DISSECT_IMAGE_USR_NO_ROOT;
ec16945e
LP
5395 assert(arg_image);
5396 assert(!arg_template);
5397
8d4aa2bb 5398 r = chase_symlinks_and_update(&arg_image, 0);
3f342ec4
LP
5399 if (r < 0)
5400 goto finish;
5401
0f3be6ca
LP
5402 if (arg_ephemeral) {
5403 _cleanup_free_ char *np = NULL;
5404
5405 r = tempfn_random(arg_image, "machine.", &np);
5406 if (r < 0) {
5407 log_error_errno(r, "Failed to generate name for image snapshot: %m");
5408 goto finish;
5409 }
5410
6992459c
LP
5411 /* Always take an exclusive lock on our own ephemeral copy. */
5412 r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
0f3be6ca
LP
5413 if (r < 0) {
5414 r = log_error_errno(r, "Failed to create image lock: %m");
5415 goto finish;
5416 }
5417
7bf011e3
LP
5418 {
5419 BLOCK_SIGNALS(SIGINT);
5420 r = copy_file(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600, FS_NOCOW_FL, FS_NOCOW_FL, COPY_REFLINK|COPY_CRTIME|COPY_SIGINT);
5421 }
5422 if (r == -EINTR) {
5423 log_error_errno(r, "Interrupted while copying image file to %s, removed again.", np);
5424 goto finish;
5425 }
0f3be6ca
LP
5426 if (r < 0) {
5427 r = log_error_errno(r, "Failed to copy image file: %m");
5428 goto finish;
5429 }
5430
1cc6c93a 5431 free_and_replace(arg_image, np);
0f3be6ca
LP
5432 remove_image = true;
5433 } else {
5434 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
5435 if (r == -EBUSY) {
5436 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
5437 goto finish;
5438 }
5439 if (r < 0) {
5440 r = log_error_errno(r, "Failed to create image lock: %m");
5441 goto finish;
5442 }
4623e8e6 5443
89e62e0b
LP
5444 r = verity_settings_load(
5445 &arg_verity_settings,
5446 arg_image, NULL, NULL);
e7cbe5cb
LB
5447 if (r < 0) {
5448 log_error_errno(r, "Failed to read verity artefacts for %s: %m", arg_image);
5449 goto finish;
78ebe980 5450 }
89e62e0b
LP
5451
5452 if (arg_verity_settings.data_path)
5453 dissect_image_flags |= DISSECT_IMAGE_NO_PARTITION_TABLE;
30535c16
LP
5454 }
5455
c67b0082 5456 if (!mkdtemp(tmprootdir)) {
0f3be6ca 5457 r = log_error_errno(errno, "Failed to create temporary directory: %m");
6b9132a9 5458 goto finish;
1b9e5b12 5459 }
6b9132a9 5460
c67b0082
LP
5461 remove_tmprootdir = true;
5462
5463 arg_directory = strdup(tmprootdir);
1b9e5b12
LP
5464 if (!arg_directory) {
5465 r = log_oom();
5466 goto finish;
6b9132a9 5467 }
88213476 5468
89e62e0b
LP
5469 r = loop_device_make_by_path(
5470 arg_image,
5471 arg_read_only ? O_RDONLY : O_RDWR,
5472 FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN,
5473 &loop);
2d845785
LP
5474 if (r < 0) {
5475 log_error_errno(r, "Failed to set up loopback block device: %m");
842f3b0f
LP
5476 goto finish;
5477 }
1b9e5b12 5478
4526113f 5479 r = dissect_image_and_warn(
e0f9e7bd 5480 loop->fd,
4526113f 5481 arg_image,
89e62e0b 5482 &arg_verity_settings,
18d73705 5483 NULL,
e7cbe5cb 5484 dissect_image_flags,
e0f9e7bd 5485 &dissected_image);
2d845785 5486 if (r == -ENOPKG) {
4526113f 5487 /* dissected_image_and_warn() already printed a brief error message. Extend on that with more details */
2d845785
LP
5488 log_notice("Note that the disk image needs to\n"
5489 " a) either contain only a single MBR partition of type 0x83 that is marked bootable\n"
5490 " b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n"
19ac32cd 5491 " c) or follow https://systemd.io/DISCOVERABLE_PARTITIONS\n"
2d845785
LP
5492 " d) or contain a file system without a partition table\n"
5493 "in order to be bootable with systemd-nspawn.");
1b9e5b12 5494 goto finish;
2d845785 5495 }
4526113f 5496 if (r < 0)
842f3b0f 5497 goto finish;
1b9e5b12 5498
89e62e0b 5499 if (!arg_verity_settings.root_hash && dissected_image->can_verity)
4623e8e6
LP
5500 log_notice("Note: image %s contains verity information, but no root hash specified! Proceeding without integrity checking.", arg_image);
5501
89e62e0b
LP
5502 r = dissected_image_decrypt_interactively(
5503 dissected_image,
5504 NULL,
5505 &arg_verity_settings,
5506 0,
5507 &decrypted_image);
1b9e5b12
LP
5508 if (r < 0)
5509 goto finish;
0f3be6ca
LP
5510
5511 /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */
5512 if (remove_image && unlink(arg_image) >= 0)
5513 remove_image = false;
842f3b0f 5514 }
842f3b0f 5515
86c0dd4a 5516 r = custom_mount_prepare_all(arg_directory, arg_custom_mounts, arg_n_custom_mounts);
5a8af538
LP
5517 if (r < 0)
5518 goto finish;
5519
de40a303
LP
5520 if (arg_console_mode < 0)
5521 arg_console_mode =
5522 isatty(STDIN_FILENO) > 0 &&
5523 isatty(STDOUT_FILENO) > 0 ? CONSOLE_INTERACTIVE : CONSOLE_READ_ONLY;
9c857b9d 5524
de40a303
LP
5525 if (arg_console_mode == CONSOLE_PIPE) /* if we pass STDERR on to the container, don't add our own logs into it too */
5526 arg_quiet = true;
a258bf26 5527
9c857b9d
LP
5528 if (!arg_quiet)
5529 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
5530 arg_machine, arg_image ?: arg_directory);
5531
72c0a2c2 5532 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
a258bf26 5533
66edd963 5534 if (prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) < 0) {
03cfe0d5
LP
5535 r = log_error_errno(errno, "Failed to become subreaper: %m");
5536 goto finish;
5537 }
5538
761cf19d
FW
5539 if (arg_expose_ports) {
5540 r = fw_ctx_new(&fw_ctx);
5541 if (r < 0) {
5542 log_error_errno(r, "Cannot expose configured ports, firewall initialization failed: %m");
5543 goto finish;
5544 }
5545 expose_args.fw_ctx = fw_ctx;
5546 }
d87be9b0 5547 for (;;) {
3acc84eb 5548 r = run_container(dissected_image,
44dbef90
LP
5549 secondary,
5550 fds,
5551 veth_name, &veth_created,
761cf19d 5552 &expose_args, &master,
44dbef90 5553 &pid, &ret);
b0067625 5554 if (r <= 0)
d87be9b0 5555 break;
d87be9b0 5556 }
88213476
LP
5557
5558finish:
04f590a4
LP
5559 (void) sd_notify(false,
5560 r == 0 && ret == EXIT_FORCE_RESTART ? "STOPPING=1\nSTATUS=Restarting..." :
5561 "STOPPING=1\nSTATUS=Terminating...");
af4ec430 5562
9444b1f2 5563 if (pid > 0)
c67b0082 5564 (void) kill(pid, SIGKILL);
88213476 5565
503546da 5566 /* Try to flush whatever is still queued in the pty */
6a0f896b 5567 if (master >= 0) {
f5fbe71d 5568 (void) copy_bytes(master, STDOUT_FILENO, UINT64_MAX, 0);
6a0f896b
LP
5569 master = safe_close(master);
5570 }
5571
5572 if (pid > 0)
5573 (void) wait_for_terminate(pid, NULL);
503546da 5574
50ebcf6c
LP
5575 pager_close();
5576
17cbb288 5577 if (remove_directory && arg_directory) {
ec16945e
LP
5578 int k;
5579
17cbb288 5580 k = rm_rf(arg_directory, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
ec16945e 5581 if (k < 0)
17cbb288 5582 log_warning_errno(k, "Cannot remove '%s', ignoring: %m", arg_directory);
ec16945e
LP
5583 }
5584
0f3be6ca
LP
5585 if (remove_image && arg_image) {
5586 if (unlink(arg_image) < 0)
5587 log_warning_errno(errno, "Can't remove image file '%s', ignoring: %m", arg_image);
5588 }
5589
c67b0082
LP
5590 if (remove_tmprootdir) {
5591 if (rmdir(tmprootdir) < 0)
5592 log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir);
5593 }
5594
785890ac
LP
5595 if (arg_machine) {
5596 const char *p;
5597
63c372cb 5598 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
c6878637 5599 (void) rm_rf(p, REMOVE_ROOT);
785890ac
LP
5600 }
5601
deff68e7
FW
5602 expose_port_flush(&fw_ctx, arg_expose_ports, AF_INET, &expose_args.address4);
5603 expose_port_flush(&fw_ctx, arg_expose_ports, AF_INET6, &expose_args.address6);
7513c5b8
LP
5604
5605 if (veth_created)
5606 (void) remove_veth_links(veth_name, arg_network_veth_extra);
22b28dfd 5607 (void) remove_bridge(arg_network_zone);
f757855e 5608
f757855e
LP
5609 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
5610 expose_port_free_all(arg_expose_ports);
bf428efb 5611 rlimit_free_all(arg_rlimit);
b2645747 5612 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
3652872a 5613 credential_free_all(arg_credentials, arg_n_credentials);
6d0b55c2 5614
44dbef90
LP
5615 if (r < 0)
5616 return r;
5617
5618 return ret;
88213476 5619}
44dbef90
LP
5620
/* Generates main() around run(). NOTE(review): judging by the macro name, positive return values of run()
 * are treated as failure exit statuses — confirm against main-func.h. */
DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run);