]> git.ipfire.org Git - thirdparty/systemd.git/blame_incremental - src/nspawn/nspawn.c
Merge pull request #31531 from poettering/verity-userspace-optional
[thirdparty/systemd.git] / src / nspawn / nspawn.c
... / ...
CommitLineData
1/* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3#include <errno.h>
4#include <getopt.h>
5#include <linux/loop.h>
6#if HAVE_SELINUX
7#include <selinux/selinux.h>
8#endif
9#include <stdlib.h>
10#include <sys/file.h>
11#include <sys/ioctl.h>
12#include <sys/mount.h>
13#include <sys/personality.h>
14#include <sys/prctl.h>
15#include <sys/types.h>
16#include <sys/wait.h>
17#include <termios.h>
18#include <unistd.h>
19
20#include <linux/fs.h> /* Must be included after <sys/mount.h> */
21
22#include "sd-bus.h"
23#include "sd-daemon.h"
24#include "sd-id128.h"
25
26#include "alloc-util.h"
27#include "ether-addr-util.h"
28#include "barrier.h"
29#include "base-filesystem.h"
30#include "blkid-util.h"
31#include "btrfs-util.h"
32#include "build.h"
33#include "bus-error.h"
34#include "bus-locator.h"
35#include "bus-util.h"
36#include "cap-list.h"
37#include "capability-util.h"
38#include "cgroup-util.h"
39#include "chase.h"
40#include "common-signal.h"
41#include "copy.h"
42#include "cpu-set-util.h"
43#include "creds-util.h"
44#include "dev-setup.h"
45#include "discover-image.h"
46#include "dissect-image.h"
47#include "env-util.h"
48#include "escape.h"
49#include "fd-util.h"
50#include "fdset.h"
51#include "fileio.h"
52#include "format-util.h"
53#include "fs-util.h"
54#include "gpt.h"
55#include "hexdecoct.h"
56#include "hostname-setup.h"
57#include "hostname-util.h"
58#include "id128-util.h"
59#include "io-util.h"
60#include "log.h"
61#include "loop-util.h"
62#include "loopback-setup.h"
63#include "machine-credential.h"
64#include "macro.h"
65#include "main-func.h"
66#include "missing_sched.h"
67#include "mkdir.h"
68#include "mount-util.h"
69#include "mountpoint-util.h"
70#include "namespace-util.h"
71#include "netlink-util.h"
72#include "nspawn-bind-user.h"
73#include "nspawn-cgroup.h"
74#include "nspawn-def.h"
75#include "nspawn-expose-ports.h"
76#include "nspawn-mount.h"
77#include "nspawn-network.h"
78#include "nspawn-oci.h"
79#include "nspawn-patch-uid.h"
80#include "nspawn-register.h"
81#include "nspawn-seccomp.h"
82#include "nspawn-settings.h"
83#include "nspawn-setuid.h"
84#include "nspawn-stub-pid1.h"
85#include "nspawn-util.h"
86#include "nspawn.h"
87#include "nulstr-util.h"
88#include "os-util.h"
89#include "pager.h"
90#include "parse-argument.h"
91#include "parse-util.h"
92#include "pretty-print.h"
93#include "process-util.h"
94#include "ptyfwd.h"
95#include "random-util.h"
96#include "raw-clone.h"
97#include "resolve-util.h"
98#include "rlimit-util.h"
99#include "rm-rf.h"
100#include "seccomp-util.h"
101#include "selinux-util.h"
102#include "signal-util.h"
103#include "socket-util.h"
104#include "stat-util.h"
105#include "stdio-util.h"
106#include "string-table.h"
107#include "string-util.h"
108#include "strv.h"
109#include "sysctl-util.h"
110#include "terminal-util.h"
111#include "tmpfile-util.h"
112#include "umask-util.h"
113#include "unit-name.h"
114#include "user-util.h"
115#include "vpick.h"
116
/* Path of the notify socket inside the container, which the payload can use to talk to nspawn via the
 * sd_notify(3) protocol. */
#define NSPAWN_NOTIFY_SOCKET_PATH "/run/host/notify"
/* NOTE(review): presumably the in-container directory through which mounts are propagated from the
 * host — confirm against the users of this macro. */
#define NSPAWN_MOUNT_TUNNEL "/run/host/incoming"

/* NOTE(review): presumably a special exit status requesting that the container be restarted — confirm
 * against the users of this macro. */
#define EXIT_FORCE_RESTART 133
122
/* Distinguishes the two ways a container run can end.
 * NOTE(review): presumably reported by the code that waits for the container's PID 1 — confirm. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,
        CONTAINER_REBOOTED,
} ContainerStatus;
127
/* Runtime configuration of this nspawn invocation. The command line parser (parse_argv()) and the
 * environment parser (parse_environment()) in this file fill these in; whenever they set a value
 * explicitly they also raise the matching SETTING_* bit in arg_settings_mask. */

/* Container root and identity */
static char *arg_directory = NULL;
static char *arg_template = NULL;
static char *arg_chdir = NULL;
static char *arg_pivot_root_new = NULL;
static char *arg_pivot_root_old = NULL;
static char *arg_user = NULL;
static uid_t arg_uid = UID_INVALID;
static gid_t arg_gid = GID_INVALID;
static gid_t* arg_supplementary_gids = NULL;
static size_t arg_n_supplementary_gids = 0;
static sd_id128_t arg_uuid = {};
static char *arg_machine = NULL; /* The name used by the host to refer to this */
static char *arg_hostname = NULL; /* The name the payload sees by default */
static const char *arg_selinux_context = NULL;
static const char *arg_selinux_apifs_context = NULL;
static char *arg_slice = NULL;
static bool arg_private_network = false;
static bool arg_read_only = false;
static StartMode arg_start_mode = START_PID1;
static bool arg_ephemeral = false;
static LinkJournal arg_link_journal = LINK_AUTO;
static bool arg_link_journal_try = false;
/* Capability bitmask (one bit per CAP_* number) used as the default retained set; per the help text,
 * --capability= adds to and --drop-capability= removes from this default. */
static uint64_t arg_caps_retain =
        (1ULL << CAP_AUDIT_CONTROL) |
        (1ULL << CAP_AUDIT_WRITE) |
        (1ULL << CAP_CHOWN) |
        (1ULL << CAP_DAC_OVERRIDE) |
        (1ULL << CAP_DAC_READ_SEARCH) |
        (1ULL << CAP_FOWNER) |
        (1ULL << CAP_FSETID) |
        (1ULL << CAP_IPC_OWNER) |
        (1ULL << CAP_KILL) |
        (1ULL << CAP_LEASE) |
        (1ULL << CAP_LINUX_IMMUTABLE) |
        (1ULL << CAP_MKNOD) |
        (1ULL << CAP_NET_BIND_SERVICE) |
        (1ULL << CAP_NET_BROADCAST) |
        (1ULL << CAP_NET_RAW) |
        (1ULL << CAP_SETFCAP) |
        (1ULL << CAP_SETGID) |
        (1ULL << CAP_SETPCAP) |
        (1ULL << CAP_SETUID) |
        (1ULL << CAP_SYS_ADMIN) |
        (1ULL << CAP_SYS_BOOT) |
        (1ULL << CAP_SYS_CHROOT) |
        (1ULL << CAP_SYS_NICE) |
        (1ULL << CAP_SYS_PTRACE) |
        (1ULL << CAP_SYS_RESOURCE) |
        (1ULL << CAP_SYS_TTY_CONFIG);
static uint64_t arg_caps_ambient = 0;
static CapabilityQuintet arg_full_capabilities = CAPABILITY_QUINTET_NULL;
static CustomMount *arg_custom_mounts = NULL;
static size_t arg_n_custom_mounts = 0;
static char **arg_setenv = NULL;
static bool arg_quiet = false;
static bool arg_register = true;
static bool arg_keep_unit = false;

/* Networking */
static char **arg_network_interfaces = NULL;
static char **arg_network_macvlan = NULL;
static char **arg_network_ipvlan = NULL;
static bool arg_network_veth = false;
static char **arg_network_veth_extra = NULL;
static char *arg_network_bridge = NULL;
static char *arg_network_zone = NULL;
static char *arg_network_namespace_path = NULL;
/* Filled from $SYSTEMD_NSPAWN_NETWORK_MAC by parse_environment().
 * NOTE(review): unlike its neighbours this is not 'static' — presumably referenced from another
 * translation unit; confirm before changing the linkage. */
struct ether_addr arg_network_provided_mac = {};
static PagerFlags arg_pager_flags = 0;
static unsigned long arg_personality = PERSONALITY_INVALID;
static char *arg_image = NULL;
static char *arg_oci_bundle = NULL;
static VolatileMode arg_volatile_mode = VOLATILE_NO;
static ExposePort *arg_expose_ports = NULL;
static char **arg_property = NULL;
static sd_bus_message *arg_property_message = NULL;

/* User namespacing */
static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
static UserNamespaceOwnership arg_userns_ownership = _USER_NAMESPACE_OWNERSHIP_INVALID;
static int arg_kill_signal = 0;
/* Set either from the environment (detect_unified_cgroup_hierarchy_from_environment()) or by inspecting
 * the image (detect_unified_cgroup_hierarchy_from_image()). */
static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN;
/* Bitmask of SETTING_* flags recording which settings were configured explicitly. */
static SettingsMask arg_settings_mask = 0;
static int arg_settings_trusted = -1;
static char **arg_parameters = NULL;
/* May be overridden via $SYSTEMD_NSPAWN_CONTAINER_SERVICE, see parse_environment(). */
static const char *arg_container_service_name = "systemd-nspawn";
static bool arg_notify_ready = false;
/* Forced to false by parse_environment() if cgroup namespaces are unsupported, otherwise controlled by
 * $SYSTEMD_NSPAWN_USE_CGNS. */
static bool arg_use_cgns = true;
/* Namespaces to unshare; individual bits can be cleared through the $SYSTEMD_NSPAWN_SHARE_* variables
 * handled by parse_share_ns_env(). */
static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
/* Adjusted by parse_mount_settings_env() according to $SYSTEMD_NSPAWN_TMPFS_TMP and
 * $SYSTEMD_NSPAWN_API_VFS_WRITABLE. */
static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_TMPFS_TMP;
static VeritySettings arg_verity_settings = VERITY_SETTINGS_DEFAULT;
static char **arg_syscall_allow_list = NULL;
static char **arg_syscall_deny_list = NULL;
#if HAVE_SECCOMP
static scmp_filter_ctx arg_seccomp = NULL;
#endif

/* Resource control and integration */
static struct rlimit *arg_rlimit[_RLIMIT_MAX] = {};
static bool arg_no_new_privileges = false;
static int arg_oom_score_adjust = 0;
static bool arg_oom_score_adjust_set = false;
static CPUSet arg_cpu_set = {};
static ResolvConfMode arg_resolv_conf = RESOLV_CONF_AUTO;
static TimezoneMode arg_timezone = TIMEZONE_AUTO;
static unsigned arg_console_width = UINT_MAX, arg_console_height = UINT_MAX;
static DeviceNode* arg_extra_nodes = NULL;
static size_t arg_n_extra_nodes = 0;
static char **arg_sysctl = NULL;
/* Selected by handle_arg_console(). */
static ConsoleMode arg_console_mode = _CONSOLE_MODE_INVALID;
static MachineCredentialContext arg_credentials = {};
static char **arg_bind_user = NULL;
/* Initialized from $SYSTEMD_SUPPRESS_SYNC by parse_environment(). */
static bool arg_suppress_sync = false;
static char *arg_settings_filename = NULL;
static Architecture arg_architecture = _ARCHITECTURE_INVALID;
static ImagePolicy *arg_image_policy = NULL;
static char *arg_background = NULL;
240
/* Associate a cleanup function with each of the settings variables above that owns resources.
 * NOTE(review): presumably these run when the program's main function returns — confirm against the
 * STATIC_DESTRUCTOR_REGISTER() definition.
 * NOTE(review): arg_custom_mounts, arg_expose_ports, arg_rlimit and arg_extra_nodes are not registered
 * in this list; presumably they are released explicitly elsewhere — confirm. */
STATIC_DESTRUCTOR_REGISTER(arg_directory, freep);
STATIC_DESTRUCTOR_REGISTER(arg_template, freep);
STATIC_DESTRUCTOR_REGISTER(arg_chdir, freep);
STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_new, freep);
STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_old, freep);
STATIC_DESTRUCTOR_REGISTER(arg_user, freep);
STATIC_DESTRUCTOR_REGISTER(arg_supplementary_gids, freep);
STATIC_DESTRUCTOR_REGISTER(arg_machine, freep);
STATIC_DESTRUCTOR_REGISTER(arg_hostname, freep);
STATIC_DESTRUCTOR_REGISTER(arg_slice, freep);
STATIC_DESTRUCTOR_REGISTER(arg_setenv, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_interfaces, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_macvlan, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_ipvlan, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_veth_extra, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_bridge, freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_zone, freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_namespace_path, freep);
STATIC_DESTRUCTOR_REGISTER(arg_image, freep);
STATIC_DESTRUCTOR_REGISTER(arg_oci_bundle, freep);
STATIC_DESTRUCTOR_REGISTER(arg_property, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_property_message, sd_bus_message_unrefp);
STATIC_DESTRUCTOR_REGISTER(arg_parameters, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_verity_settings, verity_settings_done);
STATIC_DESTRUCTOR_REGISTER(arg_syscall_allow_list, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_syscall_deny_list, strv_freep);
#if HAVE_SECCOMP
/* Only present when built with seccomp support, matching the conditional declaration of arg_seccomp. */
STATIC_DESTRUCTOR_REGISTER(arg_seccomp, seccomp_releasep);
#endif
STATIC_DESTRUCTOR_REGISTER(arg_credentials, machine_credential_context_done);
STATIC_DESTRUCTOR_REGISTER(arg_cpu_set, cpu_set_reset);
STATIC_DESTRUCTOR_REGISTER(arg_sysctl, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_bind_user, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_settings_filename, freep);
STATIC_DESTRUCTOR_REGISTER(arg_image_policy, image_policy_freep);
STATIC_DESTRUCTOR_REGISTER(arg_background, freep);
277
278static int handle_arg_console(const char *arg) {
279 if (streq(arg, "help")) {
280 puts("autopipe\n"
281 "interactive\n"
282 "passive\n"
283 "pipe\n"
284 "read-only");
285 return 0;
286 }
287
288 if (streq(arg, "interactive"))
289 arg_console_mode = CONSOLE_INTERACTIVE;
290 else if (streq(arg, "read-only"))
291 arg_console_mode = CONSOLE_READ_ONLY;
292 else if (streq(arg, "passive"))
293 arg_console_mode = CONSOLE_PASSIVE;
294 else if (streq(arg, "pipe")) {
295 if (isatty(STDIN_FILENO) && isatty(STDOUT_FILENO))
296 log_full(arg_quiet ? LOG_DEBUG : LOG_NOTICE,
297 "Console mode 'pipe' selected, but standard input/output are connected to an interactive TTY. "
298 "Most likely you want to use 'interactive' console mode for proper interactivity and shell job control. "
299 "Proceeding anyway.");
300
301 arg_console_mode = CONSOLE_PIPE;
302 } else if (streq(arg, "autopipe")) {
303 if (isatty(STDIN_FILENO) && isatty(STDOUT_FILENO))
304 arg_console_mode = CONSOLE_INTERACTIVE;
305 else
306 arg_console_mode = CONSOLE_PIPE;
307 } else
308 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown console mode: %s", optarg);
309
310 arg_settings_mask |= SETTING_CONSOLE_MODE;
311 return 1;
312}
313
314static int help(void) {
315 _cleanup_free_ char *link = NULL;
316 int r;
317
318 pager_open(arg_pager_flags);
319
320 r = terminal_urlify_man("systemd-nspawn", "1", &link);
321 if (r < 0)
322 return log_oom();
323
324 printf("%1$s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
325 "%5$sSpawn a command or OS in a light-weight container.%6$s\n\n"
326 " -h --help Show this help\n"
327 " --version Print version string\n"
328 " -q --quiet Do not show status information\n"
329 " --no-pager Do not pipe output into a pager\n"
330 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
331 "\n%3$sImage:%4$s\n"
332 " -D --directory=PATH Root directory for the container\n"
333 " --template=PATH Initialize root directory from template directory,\n"
334 " if missing\n"
335 " -x --ephemeral Run container with snapshot of root directory, and\n"
336 " remove it after exit\n"
337 " -i --image=PATH Root file system disk image (or device node) for\n"
338 " the container\n"
339 " --image-policy=POLICY Specify disk image dissection policy\n"
340 " --oci-bundle=PATH OCI bundle directory\n"
341 " --read-only Mount the root directory read-only\n"
342 " --volatile[=MODE] Run the system in volatile mode\n"
343 " --root-hash=HASH Specify verity root hash for root disk image\n"
344 " --root-hash-sig=SIG Specify pkcs7 signature of root hash for verity\n"
345 " as a DER encoded PKCS7, either as a path to a file\n"
346 " or as an ASCII base64 encoded string prefixed by\n"
347 " 'base64:'\n"
348 " --verity-data=PATH Specify hash device for verity\n"
349 " --pivot-root=PATH[:PATH]\n"
350 " Pivot root to given directory in the container\n"
351 "\n%3$sExecution:%4$s\n"
352 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
353 " -b --boot Boot up full system (i.e. invoke init)\n"
354 " --chdir=PATH Set working directory in the container\n"
355 " -E --setenv=NAME[=VALUE] Pass an environment variable to PID 1\n"
356 " -u --user=USER Run the command under specified user or UID\n"
357 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
358 " --notify-ready=BOOLEAN Receive notifications from the child init process\n"
359 " --suppress-sync=BOOLEAN\n"
360 " Suppress any form of disk data synchronization\n"
361 "\n%3$sSystem Identity:%4$s\n"
362 " -M --machine=NAME Set the machine name for the container\n"
363 " --hostname=NAME Override the hostname for the container\n"
364 " --uuid=UUID Set a specific machine UUID for the container\n"
365 "\n%3$sProperties:%4$s\n"
366 " -S --slice=SLICE Place the container in the specified slice\n"
367 " --property=NAME=VALUE Set scope unit property\n"
368 " --register=BOOLEAN Register container as machine\n"
369 " --keep-unit Do not register a scope for the machine, reuse\n"
370 " the service unit nspawn is running in\n"
371 "\n%3$sUser Namespacing:%4$s\n"
372 " --private-users=no Run without user namespacing\n"
373 " --private-users=yes|pick|identity\n"
374 " Run within user namespace, autoselect UID/GID range\n"
375 " --private-users=UIDBASE[:NUIDS]\n"
376 " Similar, but with user configured UID/GID range\n"
377 " --private-users-ownership=MODE\n"
378 " Adjust ('chown') or map ('map') OS tree ownership\n"
379 " to private UID/GID range\n"
380 " -U Equivalent to --private-users=pick and\n"
381 " --private-users-ownership=auto\n"
382 "\n%3$sNetworking:%4$s\n"
383 " --private-network Disable network in container\n"
384 " --network-interface=HOSTIF[:CONTAINERIF]\n"
385 " Assign an existing network interface to the\n"
386 " container\n"
387 " --network-macvlan=HOSTIF[:CONTAINERIF]\n"
388 " Create a macvlan network interface based on an\n"
389 " existing network interface to the container\n"
390 " --network-ipvlan=HOSTIF[:CONTAINERIF]\n"
391 " Create an ipvlan network interface based on an\n"
392 " existing network interface to the container\n"
393 " -n --network-veth Add a virtual Ethernet connection between host\n"
394 " and container\n"
395 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
396 " Add an additional virtual Ethernet link between\n"
397 " host and container\n"
398 " --network-bridge=INTERFACE\n"
399 " Add a virtual Ethernet connection to the container\n"
400 " and attach it to an existing bridge on the host\n"
401 " --network-zone=NAME Similar, but attach the new interface to an\n"
402 " an automatically managed bridge interface\n"
403 " --network-namespace-path=PATH\n"
404 " Set network namespace to the one represented by\n"
405 " the specified kernel namespace file node\n"
406 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
407 " Expose a container IP port on the host\n"
408 "\n%3$sSecurity:%4$s\n"
409 " --capability=CAP In addition to the default, retain specified\n"
410 " capability\n"
411 " --drop-capability=CAP Drop the specified capability from the default set\n"
412 " --ambient-capability=CAP\n"
413 " Sets the specified capability for the started\n"
414 " process. Not useful if booting a machine.\n"
415 " --no-new-privileges Set PR_SET_NO_NEW_PRIVS flag for container payload\n"
416 " --system-call-filter=LIST|~LIST\n"
417 " Permit/prohibit specific system calls\n"
418 " -Z --selinux-context=SECLABEL\n"
419 " Set the SELinux security context to be used by\n"
420 " processes in the container\n"
421 " -L --selinux-apifs-context=SECLABEL\n"
422 " Set the SELinux security context to be used by\n"
423 " API/tmpfs file systems in the container\n"
424 "\n%3$sResources:%4$s\n"
425 " --rlimit=NAME=LIMIT Set a resource limit for the payload\n"
426 " --oom-score-adjust=VALUE\n"
427 " Adjust the OOM score value for the payload\n"
428 " --cpu-affinity=CPUS Adjust the CPU affinity of the container\n"
429 " --personality=ARCH Pick personality for this container\n"
430 "\n%3$sIntegration:%4$s\n"
431 " --resolv-conf=MODE Select mode of /etc/resolv.conf initialization\n"
432 " --timezone=MODE Select mode of /etc/localtime initialization\n"
433 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
434 " host, try-guest, try-host\n"
435 " -j Equivalent to --link-journal=try-guest\n"
436 "\n%3$sMounts:%4$s\n"
437 " --bind=PATH[:PATH[:OPTIONS]]\n"
438 " Bind mount a file or directory from the host into\n"
439 " the container\n"
440 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
441 " Similar, but creates a read-only bind mount\n"
442 " --inaccessible=PATH Over-mount file node with inaccessible node to mask\n"
443 " it\n"
444 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
445 " --overlay=PATH[:PATH...]:PATH\n"
446 " Create an overlay mount from the host to \n"
447 " the container\n"
448 " --overlay-ro=PATH[:PATH...]:PATH\n"
449 " Similar, but creates a read-only overlay mount\n"
450 " --bind-user=NAME Bind user from host to container\n"
451 "\n%3$sInput/Output:%4$s\n"
452 " --console=MODE Select how stdin/stdout/stderr and /dev/console are\n"
453 " set up for the container.\n"
454 " -P --pipe Equivalent to --console=pipe\n"
455 " --background=COLOR Set ANSI color for background\n"
456 "\n%3$sCredentials:%4$s\n"
457 " --set-credential=ID:VALUE\n"
458 " Pass a credential with literal value to container.\n"
459 " --load-credential=ID:PATH\n"
460 " Load credential to pass to container from file or\n"
461 " AF_UNIX stream socket.\n"
462 "\nSee the %2$s for details.\n",
463 program_invocation_short_name,
464 link,
465 ansi_underline(),
466 ansi_normal(),
467 ansi_highlight(),
468 ansi_normal());
469
470 return 0;
471}
472
473static int custom_mount_check_all(void) {
474 size_t i;
475
476 for (i = 0; i < arg_n_custom_mounts; i++) {
477 CustomMount *m = &arg_custom_mounts[i];
478
479 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
480 if (arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_OFF)
481 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
482 "--private-users-ownership=own may not be combined with custom root mounts.");
483 if (arg_uid_shift == UID_INVALID)
484 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
485 "--private-users with automatic UID shift may not be combined with custom root mounts.");
486 }
487 }
488
489 return 0;
490}
491
492static int detect_unified_cgroup_hierarchy_from_environment(void) {
493 const char *e, *var = "SYSTEMD_NSPAWN_UNIFIED_HIERARCHY";
494 int r;
495
496 /* Allow the user to control whether the unified hierarchy is used */
497
498 e = getenv(var);
499 if (!e) {
500 /* $UNIFIED_CGROUP_HIERARCHY has been renamed to $SYSTEMD_NSPAWN_UNIFIED_HIERARCHY. */
501 var = "UNIFIED_CGROUP_HIERARCHY";
502 e = getenv(var);
503 }
504
505 if (!isempty(e)) {
506 r = parse_boolean(e);
507 if (r < 0)
508 return log_error_errno(r, "Failed to parse $%s: %m", var);
509 if (r > 0)
510 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
511 else
512 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
513 }
514
515 return 0;
516}
517
518static int detect_unified_cgroup_hierarchy_from_image(const char *directory) {
519 int r;
520
521 /* Let's inherit the mode to use from the host system, but let's take into consideration what systemd
522 * in the image actually supports. */
523 r = cg_all_unified();
524 if (r < 0)
525 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
526 if (r > 0) {
527 /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
528 * routine only detects 231, so we'll have a false negative here for 230. */
529 r = systemd_installation_has_version(directory, "230");
530 if (r < 0)
531 return log_error_errno(r, "Failed to determine systemd version in container: %m");
532 if (r > 0)
533 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
534 else
535 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
536 } else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
537 /* Mixed cgroup hierarchy support was added in 233 */
538 r = systemd_installation_has_version(directory, "233");
539 if (r < 0)
540 return log_error_errno(r, "Failed to determine systemd version in container: %m");
541 if (r > 0)
542 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
543 else
544 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
545 } else
546 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
547
548 log_debug("Using %s hierarchy for container.",
549 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_NONE ? "legacy" :
550 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_SYSTEMD ? "hybrid" : "unified");
551
552 return 0;
553}
554
555static int parse_capability_spec(const char *spec, uint64_t *ret_mask) {
556 uint64_t mask = 0;
557 int r;
558
559 for (;;) {
560 _cleanup_free_ char *t = NULL;
561
562 r = extract_first_word(&spec, &t, ",", 0);
563 if (r < 0)
564 return log_error_errno(r, "Failed to parse capability %s.", t);
565 if (r == 0)
566 break;
567
568 if (streq(t, "help")) {
569 for (int i = 0; i < capability_list_length(); i++) {
570 const char *name;
571
572 name = capability_to_name(i);
573 if (name)
574 puts(name);
575 }
576
577 return 0; /* quit */
578 }
579
580 if (streq(t, "all"))
581 mask = UINT64_MAX;
582 else {
583 r = capability_from_name(t);
584 if (r < 0)
585 return log_error_errno(r, "Failed to parse capability %s.", t);
586
587 mask |= 1ULL << r;
588 }
589 }
590
591 *ret_mask = mask;
592 return 1; /* continue */
593}
594
595static int parse_share_ns_env(const char *name, unsigned long ns_flag) {
596 int r;
597
598 r = getenv_bool(name);
599 if (r == -ENXIO)
600 return 0;
601 if (r < 0)
602 return log_error_errno(r, "Failed to parse $%s: %m", name);
603
604 arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
605 arg_settings_mask |= SETTING_CLONE_NS_FLAGS;
606 return 0;
607}
608
609static int parse_mount_settings_env(void) {
610 const char *e;
611 int r;
612
613 r = getenv_bool("SYSTEMD_NSPAWN_TMPFS_TMP");
614 if (r < 0 && r != -ENXIO)
615 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_TMPFS_TMP: %m");
616 if (r >= 0)
617 SET_FLAG(arg_mount_settings, MOUNT_APPLY_TMPFS_TMP, r > 0);
618
619 e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
620 if (streq_ptr(e, "network"))
621 arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS;
622
623 else if (e) {
624 r = parse_boolean(e);
625 if (r < 0)
626 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_API_VFS_WRITABLE: %m");
627
628 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_RO, r == 0);
629 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_NETNS, false);
630 }
631
632 return 0;
633}
634
635static int parse_environment(void) {
636 const char *e;
637 int r;
638
639 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
640 if (r < 0)
641 return r;
642 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
643 if (r < 0)
644 return r;
645 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
646 if (r < 0)
647 return r;
648 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
649 if (r < 0)
650 return r;
651
652 r = parse_mount_settings_env();
653 if (r < 0)
654 return r;
655
656 /* SYSTEMD_NSPAWN_USE_CGNS=0 can be used to disable CLONE_NEWCGROUP use,
657 * even if it is supported. If not supported, it has no effect. */
658 if (!cg_ns_supported())
659 arg_use_cgns = false;
660 else {
661 r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
662 if (r < 0) {
663 if (r != -ENXIO)
664 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_USE_CGNS: %m");
665
666 arg_use_cgns = true;
667 } else {
668 arg_use_cgns = r > 0;
669 arg_settings_mask |= SETTING_USE_CGNS;
670 }
671 }
672
673 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
674 if (e)
675 arg_container_service_name = e;
676
677 e = getenv("SYSTEMD_NSPAWN_NETWORK_MAC");
678 if (e) {
679 r = parse_ether_addr(e, &arg_network_provided_mac);
680 if (r < 0)
681 return log_error_errno(r, "Failed to parse provided MAC address via environment variable");
682 }
683
684 r = getenv_bool("SYSTEMD_SUPPRESS_SYNC");
685 if (r >= 0)
686 arg_suppress_sync = r;
687 else if (r != -ENXIO)
688 log_debug_errno(r, "Failed to parse $SYSTEMD_SUPPRESS_SYNC, ignoring: %m");
689
690 return detect_unified_cgroup_hierarchy_from_environment();
691}
692
693static int parse_argv(int argc, char *argv[]) {
694 enum {
695 ARG_VERSION = 0x100,
696 ARG_PRIVATE_NETWORK,
697 ARG_UUID,
698 ARG_READ_ONLY,
699 ARG_CAPABILITY,
700 ARG_AMBIENT_CAPABILITY,
701 ARG_DROP_CAPABILITY,
702 ARG_LINK_JOURNAL,
703 ARG_BIND,
704 ARG_BIND_RO,
705 ARG_TMPFS,
706 ARG_OVERLAY,
707 ARG_OVERLAY_RO,
708 ARG_INACCESSIBLE,
709 ARG_SHARE_SYSTEM,
710 ARG_REGISTER,
711 ARG_KEEP_UNIT,
712 ARG_NETWORK_INTERFACE,
713 ARG_NETWORK_MACVLAN,
714 ARG_NETWORK_IPVLAN,
715 ARG_NETWORK_BRIDGE,
716 ARG_NETWORK_ZONE,
717 ARG_NETWORK_VETH_EXTRA,
718 ARG_NETWORK_NAMESPACE_PATH,
719 ARG_PERSONALITY,
720 ARG_VOLATILE,
721 ARG_TEMPLATE,
722 ARG_PROPERTY,
723 ARG_PRIVATE_USERS,
724 ARG_KILL_SIGNAL,
725 ARG_SETTINGS,
726 ARG_CHDIR,
727 ARG_PIVOT_ROOT,
728 ARG_PRIVATE_USERS_CHOWN,
729 ARG_PRIVATE_USERS_OWNERSHIP,
730 ARG_NOTIFY_READY,
731 ARG_ROOT_HASH,
732 ARG_ROOT_HASH_SIG,
733 ARG_VERITY_DATA,
734 ARG_SYSTEM_CALL_FILTER,
735 ARG_RLIMIT,
736 ARG_HOSTNAME,
737 ARG_NO_NEW_PRIVILEGES,
738 ARG_OOM_SCORE_ADJUST,
739 ARG_CPU_AFFINITY,
740 ARG_RESOLV_CONF,
741 ARG_TIMEZONE,
742 ARG_CONSOLE,
743 ARG_PIPE,
744 ARG_OCI_BUNDLE,
745 ARG_NO_PAGER,
746 ARG_SET_CREDENTIAL,
747 ARG_LOAD_CREDENTIAL,
748 ARG_BIND_USER,
749 ARG_SUPPRESS_SYNC,
750 ARG_IMAGE_POLICY,
751 ARG_BACKGROUND,
752 };
753
754 static const struct option options[] = {
755 { "help", no_argument, NULL, 'h' },
756 { "version", no_argument, NULL, ARG_VERSION },
757 { "directory", required_argument, NULL, 'D' },
758 { "template", required_argument, NULL, ARG_TEMPLATE },
759 { "ephemeral", no_argument, NULL, 'x' },
760 { "user", required_argument, NULL, 'u' },
761 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
762 { "as-pid2", no_argument, NULL, 'a' },
763 { "boot", no_argument, NULL, 'b' },
764 { "uuid", required_argument, NULL, ARG_UUID },
765 { "read-only", no_argument, NULL, ARG_READ_ONLY },
766 { "capability", required_argument, NULL, ARG_CAPABILITY },
767 { "ambient-capability", required_argument, NULL, ARG_AMBIENT_CAPABILITY },
768 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
769 { "no-new-privileges", required_argument, NULL, ARG_NO_NEW_PRIVILEGES },
770 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
771 { "bind", required_argument, NULL, ARG_BIND },
772 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
773 { "tmpfs", required_argument, NULL, ARG_TMPFS },
774 { "overlay", required_argument, NULL, ARG_OVERLAY },
775 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
776 { "inaccessible", required_argument, NULL, ARG_INACCESSIBLE },
777 { "machine", required_argument, NULL, 'M' },
778 { "hostname", required_argument, NULL, ARG_HOSTNAME },
779 { "slice", required_argument, NULL, 'S' },
780 { "setenv", required_argument, NULL, 'E' },
781 { "selinux-context", required_argument, NULL, 'Z' },
782 { "selinux-apifs-context", required_argument, NULL, 'L' },
783 { "quiet", no_argument, NULL, 'q' },
784 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */
785 { "register", required_argument, NULL, ARG_REGISTER },
786 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
787 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
788 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
789 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
790 { "network-veth", no_argument, NULL, 'n' },
791 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA },
792 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
793 { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
794 { "network-namespace-path", required_argument, NULL, ARG_NETWORK_NAMESPACE_PATH },
795 { "personality", required_argument, NULL, ARG_PERSONALITY },
796 { "image", required_argument, NULL, 'i' },
797 { "volatile", optional_argument, NULL, ARG_VOLATILE },
798 { "port", required_argument, NULL, 'p' },
799 { "property", required_argument, NULL, ARG_PROPERTY },
800 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
801 { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN }, /* obsolete */
802 { "private-users-ownership",required_argument, NULL, ARG_PRIVATE_USERS_OWNERSHIP},
803 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
804 { "settings", required_argument, NULL, ARG_SETTINGS },
805 { "chdir", required_argument, NULL, ARG_CHDIR },
806 { "pivot-root", required_argument, NULL, ARG_PIVOT_ROOT },
807 { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
808 { "root-hash", required_argument, NULL, ARG_ROOT_HASH },
809 { "root-hash-sig", required_argument, NULL, ARG_ROOT_HASH_SIG },
810 { "verity-data", required_argument, NULL, ARG_VERITY_DATA },
811 { "system-call-filter", required_argument, NULL, ARG_SYSTEM_CALL_FILTER },
812 { "rlimit", required_argument, NULL, ARG_RLIMIT },
813 { "oom-score-adjust", required_argument, NULL, ARG_OOM_SCORE_ADJUST },
814 { "cpu-affinity", required_argument, NULL, ARG_CPU_AFFINITY },
815 { "resolv-conf", required_argument, NULL, ARG_RESOLV_CONF },
816 { "timezone", required_argument, NULL, ARG_TIMEZONE },
817 { "console", required_argument, NULL, ARG_CONSOLE },
818 { "pipe", no_argument, NULL, ARG_PIPE },
819 { "oci-bundle", required_argument, NULL, ARG_OCI_BUNDLE },
820 { "no-pager", no_argument, NULL, ARG_NO_PAGER },
821 { "set-credential", required_argument, NULL, ARG_SET_CREDENTIAL },
822 { "load-credential", required_argument, NULL, ARG_LOAD_CREDENTIAL },
823 { "bind-user", required_argument, NULL, ARG_BIND_USER },
824 { "suppress-sync", required_argument, NULL, ARG_SUPPRESS_SYNC },
825 { "image-policy", required_argument, NULL, ARG_IMAGE_POLICY },
826 { "background", required_argument, NULL, ARG_BACKGROUND },
827 {}
828 };
829
830 int c, r;
831 uint64_t plus = 0, minus = 0;
832 bool mask_all_settings = false, mask_no_settings = false;
833
834 assert(argc >= 0);
835 assert(argv);
836
837 /* Resetting to 0 forces the invocation of an internal initialization routine of getopt_long()
838 * that checks for GNU extensions in optstring ('-' or '+' at the beginning). */
839 optind = 0;
840 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nUE:P", options, NULL)) >= 0)
841 switch (c) {
842
843 case 'h':
844 return help();
845
846 case ARG_VERSION:
847 return version();
848
849 case 'D':
850 r = parse_path_argument(optarg, false, &arg_directory);
851 if (r < 0)
852 return r;
853
854 arg_settings_mask |= SETTING_DIRECTORY;
855 break;
856
857 case ARG_TEMPLATE:
858 r = parse_path_argument(optarg, false, &arg_template);
859 if (r < 0)
860 return r;
861
862 arg_settings_mask |= SETTING_DIRECTORY;
863 break;
864
865 case 'i':
866 r = parse_path_argument(optarg, false, &arg_image);
867 if (r < 0)
868 return r;
869
870 arg_settings_mask |= SETTING_DIRECTORY;
871 break;
872
873 case ARG_OCI_BUNDLE:
874 r = parse_path_argument(optarg, false, &arg_oci_bundle);
875 if (r < 0)
876 return r;
877
878 break;
879
880 case 'x':
881 arg_ephemeral = true;
882 arg_settings_mask |= SETTING_EPHEMERAL;
883 break;
884
885 case 'u':
886 r = free_and_strdup(&arg_user, optarg);
887 if (r < 0)
888 return log_oom();
889
890 arg_settings_mask |= SETTING_USER;
891 break;
892
893 case ARG_NETWORK_ZONE: {
894 _cleanup_free_ char *j = NULL;
895
896 j = strjoin("vz-", optarg);
897 if (!j)
898 return log_oom();
899
900 if (!ifname_valid(j))
901 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
902 "Network zone name not valid: %s", j);
903
904 free_and_replace(arg_network_zone, j);
905
906 arg_network_veth = true;
907 arg_private_network = true;
908 arg_settings_mask |= SETTING_NETWORK;
909 break;
910 }
911
912 case ARG_NETWORK_BRIDGE:
913
914 if (!ifname_valid(optarg))
915 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
916 "Bridge interface name not valid: %s", optarg);
917
918 r = free_and_strdup(&arg_network_bridge, optarg);
919 if (r < 0)
920 return log_oom();
921
922 _fallthrough_;
923 case 'n':
924 arg_network_veth = true;
925 arg_private_network = true;
926 arg_settings_mask |= SETTING_NETWORK;
927 break;
928
929 case ARG_NETWORK_VETH_EXTRA:
930 r = veth_extra_parse(&arg_network_veth_extra, optarg);
931 if (r < 0)
932 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
933
934 arg_private_network = true;
935 arg_settings_mask |= SETTING_NETWORK;
936 break;
937
938 case ARG_NETWORK_INTERFACE:
939 r = interface_pair_parse(&arg_network_interfaces, optarg);
940 if (r < 0)
941 return r;
942
943 arg_private_network = true;
944 arg_settings_mask |= SETTING_NETWORK;
945 break;
946
947 case ARG_NETWORK_MACVLAN:
948 r = macvlan_pair_parse(&arg_network_macvlan, optarg);
949 if (r < 0)
950 return r;
951
952 arg_private_network = true;
953 arg_settings_mask |= SETTING_NETWORK;
954 break;
955
956 case ARG_NETWORK_IPVLAN:
957 r = ipvlan_pair_parse(&arg_network_ipvlan, optarg);
958 if (r < 0)
959 return r;
960
961 _fallthrough_;
962 case ARG_PRIVATE_NETWORK:
963 arg_private_network = true;
964 arg_settings_mask |= SETTING_NETWORK;
965 break;
966
967 case ARG_NETWORK_NAMESPACE_PATH:
968 r = parse_path_argument(optarg, false, &arg_network_namespace_path);
969 if (r < 0)
970 return r;
971
972 arg_settings_mask |= SETTING_NETWORK;
973 break;
974
975 case 'b':
976 if (arg_start_mode == START_PID2)
977 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
978 "--boot and --as-pid2 may not be combined.");
979
980 arg_start_mode = START_BOOT;
981 arg_settings_mask |= SETTING_START_MODE;
982 break;
983
984 case 'a':
985 if (arg_start_mode == START_BOOT)
986 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
987 "--boot and --as-pid2 may not be combined.");
988
989 arg_start_mode = START_PID2;
990 arg_settings_mask |= SETTING_START_MODE;
991 break;
992
993 case ARG_UUID:
994 r = id128_from_string_nonzero(optarg, &arg_uuid);
995 if (r == -ENXIO)
996 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
997 "Machine UUID may not be all zeroes.");
998 if (r < 0)
999 return log_error_errno(r, "Invalid UUID: %s", optarg);
1000
1001 arg_settings_mask |= SETTING_MACHINE_ID;
1002 break;
1003
1004 case 'S': {
1005 _cleanup_free_ char *mangled = NULL;
1006
1007 r = unit_name_mangle_with_suffix(optarg, NULL, UNIT_NAME_MANGLE_WARN, ".slice", &mangled);
1008 if (r < 0)
1009 return log_oom();
1010
1011 free_and_replace(arg_slice, mangled);
1012 arg_settings_mask |= SETTING_SLICE;
1013 break;
1014 }
1015
1016 case 'M':
1017 if (isempty(optarg))
1018 arg_machine = mfree(arg_machine);
1019 else {
1020 if (!hostname_is_valid(optarg, 0))
1021 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1022 "Invalid machine name: %s", optarg);
1023
1024 r = free_and_strdup(&arg_machine, optarg);
1025 if (r < 0)
1026 return log_oom();
1027 }
1028 break;
1029
1030 case ARG_HOSTNAME:
1031 if (isempty(optarg))
1032 arg_hostname = mfree(arg_hostname);
1033 else {
1034 if (!hostname_is_valid(optarg, 0))
1035 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1036 "Invalid hostname: %s", optarg);
1037
1038 r = free_and_strdup(&arg_hostname, optarg);
1039 if (r < 0)
1040 return log_oom();
1041 }
1042
1043 arg_settings_mask |= SETTING_HOSTNAME;
1044 break;
1045
1046 case 'Z':
1047 arg_selinux_context = optarg;
1048 break;
1049
1050 case 'L':
1051 arg_selinux_apifs_context = optarg;
1052 break;
1053
1054 case ARG_READ_ONLY:
1055 arg_read_only = true;
1056 arg_settings_mask |= SETTING_READ_ONLY;
1057 break;
1058
1059 case ARG_AMBIENT_CAPABILITY: {
1060 uint64_t m;
1061 r = parse_capability_spec(optarg, &m);
1062 if (r <= 0)
1063 return r;
1064 arg_caps_ambient |= m;
1065 arg_settings_mask |= SETTING_CAPABILITY;
1066 break;
1067 }
1068 case ARG_CAPABILITY:
1069 case ARG_DROP_CAPABILITY: {
1070 uint64_t m;
1071 r = parse_capability_spec(optarg, &m);
1072 if (r <= 0)
1073 return r;
1074
1075 if (c == ARG_CAPABILITY)
1076 plus |= m;
1077 else
1078 minus |= m;
1079 arg_settings_mask |= SETTING_CAPABILITY;
1080 break;
1081 }
1082 case ARG_NO_NEW_PRIVILEGES:
1083 r = parse_boolean(optarg);
1084 if (r < 0)
1085 return log_error_errno(r, "Failed to parse --no-new-privileges= argument: %s", optarg);
1086
1087 arg_no_new_privileges = r;
1088 arg_settings_mask |= SETTING_NO_NEW_PRIVILEGES;
1089 break;
1090
1091 case 'j':
1092 arg_link_journal = LINK_GUEST;
1093 arg_link_journal_try = true;
1094 arg_settings_mask |= SETTING_LINK_JOURNAL;
1095 break;
1096
1097 case ARG_LINK_JOURNAL:
1098 r = parse_link_journal(optarg, &arg_link_journal, &arg_link_journal_try);
1099 if (r < 0)
1100 return log_error_errno(r, "Failed to parse link journal mode %s", optarg);
1101
1102 arg_settings_mask |= SETTING_LINK_JOURNAL;
1103 break;
1104
1105 case ARG_BIND:
1106 case ARG_BIND_RO:
1107 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
1108 if (r < 0)
1109 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
1110
1111 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1112 break;
1113
1114 case ARG_TMPFS:
1115 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
1116 if (r < 0)
1117 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
1118
1119 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1120 break;
1121
1122 case ARG_OVERLAY:
1123 case ARG_OVERLAY_RO:
1124 r = overlay_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_OVERLAY_RO);
1125 if (r == -EADDRNOTAVAIL)
1126 return log_error_errno(r, "--overlay(-ro)= needs at least two colon-separated directories specified.");
1127 if (r < 0)
1128 return log_error_errno(r, "Failed to parse --overlay(-ro)= argument %s: %m", optarg);
1129
1130 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1131 break;
1132
1133 case ARG_INACCESSIBLE:
1134 r = inaccessible_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
1135 if (r < 0)
1136 return log_error_errno(r, "Failed to parse --inaccessible= argument %s: %m", optarg);
1137
1138 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1139 break;
1140
1141 case 'E':
1142 r = strv_env_replace_strdup_passthrough(&arg_setenv, optarg);
1143 if (r < 0)
1144 return log_error_errno(r, "Cannot assign environment variable %s: %m", optarg);
1145
1146 arg_settings_mask |= SETTING_ENVIRONMENT;
1147 break;
1148
1149 case 'q':
1150 arg_quiet = true;
1151 break;
1152
1153 case ARG_SHARE_SYSTEM:
1154 /* We don't officially support this anymore, except for compat reasons. People should use the
1155 * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
1156 log_warning("Please do not use --share-system anymore, use $SYSTEMD_NSPAWN_SHARE_* instead.");
1157 arg_clone_ns_flags = 0;
1158 break;
1159
1160 case ARG_REGISTER:
1161 r = parse_boolean(optarg);
1162 if (r < 0) {
1163 log_error("Failed to parse --register= argument: %s", optarg);
1164 return r;
1165 }
1166
1167 arg_register = r;
1168 break;
1169
1170 case ARG_KEEP_UNIT:
1171 arg_keep_unit = true;
1172 break;
1173
1174 case ARG_PERSONALITY:
1175
1176 arg_personality = personality_from_string(optarg);
1177 if (arg_personality == PERSONALITY_INVALID)
1178 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1179 "Unknown or unsupported personality '%s'.", optarg);
1180
1181 arg_settings_mask |= SETTING_PERSONALITY;
1182 break;
1183
1184 case ARG_VOLATILE:
1185
1186 if (!optarg)
1187 arg_volatile_mode = VOLATILE_YES;
1188 else if (streq(optarg, "help")) {
1189 DUMP_STRING_TABLE(volatile_mode, VolatileMode, _VOLATILE_MODE_MAX);
1190 return 0;
1191 } else {
1192 VolatileMode m;
1193
1194 m = volatile_mode_from_string(optarg);
1195 if (m < 0)
1196 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1197 "Failed to parse --volatile= argument: %s", optarg);
1198 else
1199 arg_volatile_mode = m;
1200 }
1201
1202 arg_settings_mask |= SETTING_VOLATILE_MODE;
1203 break;
1204
1205 case 'p':
1206 r = expose_port_parse(&arg_expose_ports, optarg);
1207 if (r == -EEXIST)
1208 return log_error_errno(r, "Duplicate port specification: %s", optarg);
1209 if (r < 0)
1210 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
1211
1212 arg_settings_mask |= SETTING_EXPOSE_PORTS;
1213 break;
1214
1215 case ARG_PROPERTY:
1216 if (strv_extend(&arg_property, optarg) < 0)
1217 return log_oom();
1218
1219 break;
1220
1221 case ARG_PRIVATE_USERS: {
1222 int boolean;
1223
1224 if (!optarg)
1225 boolean = true;
1226 else if (!in_charset(optarg, DIGITS))
1227 /* do *not* parse numbers as booleans */
1228 boolean = parse_boolean(optarg);
1229 else
1230 boolean = -1;
1231
1232 if (boolean == 0) {
1233 /* no: User namespacing off */
1234 arg_userns_mode = USER_NAMESPACE_NO;
1235 arg_uid_shift = UID_INVALID;
1236 arg_uid_range = UINT32_C(0x10000);
1237 } else if (boolean > 0) {
1238 /* yes: User namespacing on, UID range is read from root dir */
1239 arg_userns_mode = USER_NAMESPACE_FIXED;
1240 arg_uid_shift = UID_INVALID;
1241 arg_uid_range = UINT32_C(0x10000);
1242 } else if (streq(optarg, "pick")) {
1243 /* pick: User namespacing on, UID range is picked randomly */
1244 arg_userns_mode = USER_NAMESPACE_PICK; /* Note that arg_userns_ownership is
1245 * implied by USER_NAMESPACE_PICK
1246 * further down. */
1247 arg_uid_shift = UID_INVALID;
1248 arg_uid_range = UINT32_C(0x10000);
1249
1250 } else if (streq(optarg, "identity")) {
1251 /* identity: User namespaces on, UID range is map the 0…0xFFFF range to
1252 * itself, i.e. we don't actually map anything, but do take benefit of
1253 * isolation of capability sets. */
1254 arg_userns_mode = USER_NAMESPACE_FIXED;
1255 arg_uid_shift = 0;
1256 arg_uid_range = UINT32_C(0x10000);
1257 } else {
1258 /* anything else: User namespacing on, UID range is explicitly configured */
1259 r = parse_userns_uid_range(optarg, &arg_uid_shift, &arg_uid_range);
1260 if (r < 0)
1261 return r;
1262 arg_userns_mode = USER_NAMESPACE_FIXED;
1263 }
1264
1265 arg_settings_mask |= SETTING_USERNS;
1266 break;
1267 }
1268
1269 case 'U':
1270 if (userns_supported()) {
1271 arg_userns_mode = USER_NAMESPACE_PICK; /* Note that arg_userns_ownership is
1272 * implied by USER_NAMESPACE_PICK
1273 * further down. */
1274 arg_uid_shift = UID_INVALID;
1275 arg_uid_range = UINT32_C(0x10000);
1276
1277 arg_settings_mask |= SETTING_USERNS;
1278 }
1279
1280 break;
1281
1282 case ARG_PRIVATE_USERS_CHOWN:
1283 arg_userns_ownership = USER_NAMESPACE_OWNERSHIP_CHOWN;
1284
1285 arg_settings_mask |= SETTING_USERNS;
1286 break;
1287
1288 case ARG_PRIVATE_USERS_OWNERSHIP:
1289 if (streq(optarg, "help")) {
1290 DUMP_STRING_TABLE(user_namespace_ownership, UserNamespaceOwnership, _USER_NAMESPACE_OWNERSHIP_MAX);
1291 return 0;
1292 }
1293
1294 arg_userns_ownership = user_namespace_ownership_from_string(optarg);
1295 if (arg_userns_ownership < 0)
1296 return log_error_errno(arg_userns_ownership, "Cannot parse --user-namespace-ownership= value: %s", optarg);
1297
1298 arg_settings_mask |= SETTING_USERNS;
1299 break;
1300
1301 case ARG_KILL_SIGNAL:
1302 if (streq(optarg, "help")) {
1303 DUMP_STRING_TABLE(signal, int, _NSIG);
1304 return 0;
1305 }
1306
1307 arg_kill_signal = signal_from_string(optarg);
1308 if (arg_kill_signal < 0)
1309 return log_error_errno(arg_kill_signal, "Cannot parse signal: %s", optarg);
1310
1311 arg_settings_mask |= SETTING_KILL_SIGNAL;
1312 break;
1313
1314 case ARG_SETTINGS:
1315
1316 /* no → do not read files
1317 * yes → read files, do not override cmdline, trust only subset
1318 * override → read files, override cmdline, trust only subset
1319 * trusted → read files, do not override cmdline, trust all
1320 */
1321
1322 r = parse_boolean(optarg);
1323 if (r < 0) {
1324 if (streq(optarg, "trusted")) {
1325 mask_all_settings = false;
1326 mask_no_settings = false;
1327 arg_settings_trusted = true;
1328
1329 } else if (streq(optarg, "override")) {
1330 mask_all_settings = false;
1331 mask_no_settings = true;
1332 arg_settings_trusted = -1;
1333 } else
1334 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
1335 } else if (r > 0) {
1336 /* yes */
1337 mask_all_settings = false;
1338 mask_no_settings = false;
1339 arg_settings_trusted = -1;
1340 } else {
1341 /* no */
1342 mask_all_settings = true;
1343 mask_no_settings = false;
1344 arg_settings_trusted = false;
1345 }
1346
1347 break;
1348
1349 case ARG_CHDIR: {
1350 _cleanup_free_ char *wd = NULL;
1351
1352 if (!path_is_absolute(optarg))
1353 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1354 "Working directory %s is not an absolute path.", optarg);
1355
1356 r = path_simplify_alloc(optarg, &wd);
1357 if (r < 0)
1358 return log_error_errno(r, "Failed to simplify path %s: %m", optarg);
1359
1360 if (!path_is_normalized(wd))
1361 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Working directory path is not normalized: %s", wd);
1362
1363 if (path_below_api_vfs(wd))
1364 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Working directory is below API VFS, refusing: %s", wd);
1365
1366 free_and_replace(arg_chdir, wd);
1367 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
1368 break;
1369 }
1370
1371 case ARG_PIVOT_ROOT:
1372 r = pivot_root_parse(&arg_pivot_root_new, &arg_pivot_root_old, optarg);
1373 if (r < 0)
1374 return log_error_errno(r, "Failed to parse --pivot-root= argument %s: %m", optarg);
1375
1376 arg_settings_mask |= SETTING_PIVOT_ROOT;
1377 break;
1378
1379 case ARG_NOTIFY_READY:
1380 r = parse_boolean(optarg);
1381 if (r < 0)
1382 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1383 "%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
1384 arg_notify_ready = r;
1385 arg_settings_mask |= SETTING_NOTIFY_READY;
1386 break;
1387
1388 case ARG_ROOT_HASH: {
1389 _cleanup_free_ void *k = NULL;
1390 size_t l;
1391
1392 r = unhexmem(optarg, &k, &l);
1393 if (r < 0)
1394 return log_error_errno(r, "Failed to parse root hash: %s", optarg);
1395 if (l < sizeof(sd_id128_t))
1396 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Root hash must be at least 128-bit long: %s", optarg);
1397
1398 free_and_replace(arg_verity_settings.root_hash, k);
1399 arg_verity_settings.root_hash_size = l;
1400 break;
1401 }
1402
1403 case ARG_ROOT_HASH_SIG: {
1404 char *value;
1405 size_t l;
1406 void *p;
1407
1408 if ((value = startswith(optarg, "base64:"))) {
1409 r = unbase64mem(value, &p, &l);
1410 if (r < 0)
1411 return log_error_errno(r, "Failed to parse root hash signature '%s': %m", optarg);
1412
1413 } else {
1414 r = read_full_file(optarg, (char**) &p, &l);
1415 if (r < 0)
1416 return log_error_errno(r, "Failed parse root hash signature file '%s': %m", optarg);
1417 }
1418
1419 free_and_replace(arg_verity_settings.root_hash_sig, p);
1420 arg_verity_settings.root_hash_sig_size = l;
1421 break;
1422 }
1423
1424 case ARG_VERITY_DATA:
1425 r = parse_path_argument(optarg, false, &arg_verity_settings.data_path);
1426 if (r < 0)
1427 return r;
1428 break;
1429
1430 case ARG_SYSTEM_CALL_FILTER: {
1431 bool negative;
1432 const char *items;
1433
1434 negative = optarg[0] == '~';
1435 items = negative ? optarg + 1 : optarg;
1436
1437 for (;;) {
1438 _cleanup_free_ char *word = NULL;
1439
1440 r = extract_first_word(&items, &word, NULL, 0);
1441 if (r == 0)
1442 break;
1443 if (r == -ENOMEM)
1444 return log_oom();
1445 if (r < 0)
1446 return log_error_errno(r, "Failed to parse system call filter: %m");
1447
1448 if (negative)
1449 r = strv_extend(&arg_syscall_deny_list, word);
1450 else
1451 r = strv_extend(&arg_syscall_allow_list, word);
1452 if (r < 0)
1453 return log_oom();
1454 }
1455
1456 arg_settings_mask |= SETTING_SYSCALL_FILTER;
1457 break;
1458 }
1459
1460 case ARG_RLIMIT: {
1461 const char *eq;
1462 _cleanup_free_ char *name = NULL;
1463 int rl;
1464
1465 if (streq(optarg, "help")) {
1466 DUMP_STRING_TABLE(rlimit, int, _RLIMIT_MAX);
1467 return 0;
1468 }
1469
1470 eq = strchr(optarg, '=');
1471 if (!eq)
1472 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1473 "--rlimit= expects an '=' assignment.");
1474
1475 name = strndup(optarg, eq - optarg);
1476 if (!name)
1477 return log_oom();
1478
1479 rl = rlimit_from_string_harder(name);
1480 if (rl < 0)
1481 return log_error_errno(rl, "Unknown resource limit: %s", name);
1482
1483 if (!arg_rlimit[rl]) {
1484 arg_rlimit[rl] = new0(struct rlimit, 1);
1485 if (!arg_rlimit[rl])
1486 return log_oom();
1487 }
1488
1489 r = rlimit_parse(rl, eq + 1, arg_rlimit[rl]);
1490 if (r < 0)
1491 return log_error_errno(r, "Failed to parse resource limit: %s", eq + 1);
1492
1493 arg_settings_mask |= SETTING_RLIMIT_FIRST << rl;
1494 break;
1495 }
1496
1497 case ARG_OOM_SCORE_ADJUST:
1498 r = parse_oom_score_adjust(optarg, &arg_oom_score_adjust);
1499 if (r < 0)
1500 return log_error_errno(r, "Failed to parse --oom-score-adjust= parameter: %s", optarg);
1501
1502 arg_oom_score_adjust_set = true;
1503 arg_settings_mask |= SETTING_OOM_SCORE_ADJUST;
1504 break;
1505
1506 case ARG_CPU_AFFINITY: {
1507 CPUSet cpuset;
1508
1509 r = parse_cpu_set(optarg, &cpuset);
1510 if (r < 0)
1511 return log_error_errno(r, "Failed to parse CPU affinity mask %s: %m", optarg);
1512
1513 cpu_set_reset(&arg_cpu_set);
1514 arg_cpu_set = cpuset;
1515 arg_settings_mask |= SETTING_CPU_AFFINITY;
1516 break;
1517 }
1518
1519 case ARG_RESOLV_CONF:
1520 if (streq(optarg, "help")) {
1521 DUMP_STRING_TABLE(resolv_conf_mode, ResolvConfMode, _RESOLV_CONF_MODE_MAX);
1522 return 0;
1523 }
1524
1525 arg_resolv_conf = resolv_conf_mode_from_string(optarg);
1526 if (arg_resolv_conf < 0)
1527 return log_error_errno(arg_resolv_conf,
1528 "Failed to parse /etc/resolv.conf mode: %s", optarg);
1529
1530 arg_settings_mask |= SETTING_RESOLV_CONF;
1531 break;
1532
1533 case ARG_TIMEZONE:
1534 if (streq(optarg, "help")) {
1535 DUMP_STRING_TABLE(timezone_mode, TimezoneMode, _TIMEZONE_MODE_MAX);
1536 return 0;
1537 }
1538
1539 arg_timezone = timezone_mode_from_string(optarg);
1540 if (arg_timezone < 0)
1541 return log_error_errno(arg_timezone,
1542 "Failed to parse /etc/localtime mode: %s", optarg);
1543
1544 arg_settings_mask |= SETTING_TIMEZONE;
1545 break;
1546
1547 case ARG_CONSOLE:
1548 r = handle_arg_console(optarg);
1549 if (r <= 0)
1550 return r;
1551 break;
1552
1553 case 'P':
1554 case ARG_PIPE:
1555 r = handle_arg_console("pipe");
1556 if (r <= 0)
1557 return r;
1558 break;
1559
1560 case ARG_NO_PAGER:
1561 arg_pager_flags |= PAGER_DISABLE;
1562 break;
1563
1564 case ARG_SET_CREDENTIAL:
1565 r = machine_credential_set(&arg_credentials, optarg);
1566 if (r < 0)
1567 return r;
1568
1569 arg_settings_mask |= SETTING_CREDENTIALS;
1570 break;
1571
1572 case ARG_LOAD_CREDENTIAL:
1573 r = machine_credential_load(&arg_credentials, optarg);
1574 if (r < 0)
1575 return r;
1576
1577 arg_settings_mask |= SETTING_CREDENTIALS;
1578 break;
1579
1580 case ARG_BIND_USER:
1581 if (!valid_user_group_name(optarg, 0))
1582 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid user name to bind: %s", optarg);
1583
1584 if (strv_extend(&arg_bind_user, optarg) < 0)
1585 return log_oom();
1586
1587 arg_settings_mask |= SETTING_BIND_USER;
1588 break;
1589
1590 case ARG_SUPPRESS_SYNC:
1591 r = parse_boolean_argument("--suppress-sync=", optarg, &arg_suppress_sync);
1592 if (r < 0)
1593 return r;
1594
1595 arg_settings_mask |= SETTING_SUPPRESS_SYNC;
1596 break;
1597
1598 case ARG_IMAGE_POLICY:
1599 r = parse_image_policy_argument(optarg, &arg_image_policy);
1600 if (r < 0)
1601 return r;
1602 break;
1603
1604 case ARG_BACKGROUND:
1605 r = free_and_strdup_warn(&arg_background, optarg);
1606 if (r < 0)
1607 return r;
1608 break;
1609
1610 case '?':
1611 return -EINVAL;
1612
1613 default:
1614 assert_not_reached();
1615 }
1616
1617 if (argc > optind) {
1618 strv_free(arg_parameters);
1619 arg_parameters = strv_copy(argv + optind);
1620 if (!arg_parameters)
1621 return log_oom();
1622
1623 arg_settings_mask |= SETTING_START_MODE;
1624 }
1625
1626 if (arg_ephemeral && arg_template && !arg_directory)
1627 /* User asked for ephemeral execution but specified --template= instead of --directory=. Semantically
1628 * such an invocation makes some sense, see https://github.com/systemd/systemd/issues/3667. Let's
1629 * accept this here, and silently make "--ephemeral --template=" equivalent to "--ephemeral
1630 * --directory=". */
1631 arg_directory = TAKE_PTR(arg_template);
1632
1633 arg_caps_retain |= plus;
1634 arg_caps_retain |= arg_private_network ? UINT64_C(1) << CAP_NET_ADMIN : 0;
1635 arg_caps_retain &= ~minus;
1636
1637 /* Make sure to parse environment before we reset the settings mask below */
1638 r = parse_environment();
1639 if (r < 0)
1640 return r;
1641
1642 /* Load all settings from .nspawn files */
1643 if (mask_no_settings)
1644 arg_settings_mask = 0;
1645
1646 /* Don't load any settings from .nspawn files */
1647 if (mask_all_settings)
1648 arg_settings_mask = _SETTINGS_MASK_ALL;
1649
1650 return 1;
1651}
1652
1653static int verify_arguments(void) {
1654 int r;
1655
1656 if (arg_start_mode == START_PID2 && arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
1657 /* If we are running the stub init in the container, we don't need to look at what the init
1658 * in the container supports, because we are not using it. Let's immediately pick the right
1659 * setting based on the host system configuration.
1660 *
1661 * We only do this, if the user didn't use an environment variable to override the detection.
1662 */
1663
1664 r = cg_all_unified();
1665 if (r < 0)
1666 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
1667 if (r > 0)
1668 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
1669 else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0)
1670 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
1671 else
1672 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
1673 }
1674
1675 if (arg_userns_mode != USER_NAMESPACE_NO)
1676 arg_mount_settings |= MOUNT_USE_USERNS;
1677
1678 if (arg_private_network)
1679 arg_mount_settings |= MOUNT_APPLY_APIVFS_NETNS;
1680
1681 if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
1682 !(arg_clone_ns_flags & CLONE_NEWUTS)) {
1683 arg_register = false;
1684 if (arg_start_mode != START_PID1)
1685 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--boot cannot be used without namespacing.");
1686 }
1687
1688 if (arg_userns_ownership < 0)
1689 arg_userns_ownership =
1690 arg_userns_mode == USER_NAMESPACE_PICK ? USER_NAMESPACE_OWNERSHIP_AUTO :
1691 USER_NAMESPACE_OWNERSHIP_OFF;
1692
1693 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
1694 arg_kill_signal = SIGRTMIN+3;
1695
1696 if (arg_volatile_mode != VOLATILE_NO) /* Make sure all file systems contained in the image are mounted read-only if we are in volatile mode */
1697 arg_read_only = true;
1698
1699 if (has_custom_root_mount(arg_custom_mounts, arg_n_custom_mounts))
1700 arg_read_only = true;
1701
1702 if (arg_keep_unit && arg_register && cg_pid_get_owner_uid(0, NULL) >= 0)
1703 /* Save the user from accidentally registering either user-$SESSION.scope or user@.service.
1704 * The latter is not technically a user session, but we don't need to labour the point. */
1705 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--keep-unit --register=yes may not be used when invoked from a user session.");
1706
1707 if (arg_directory && arg_image)
1708 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--directory= and --image= may not be combined.");
1709
1710 if (arg_template && arg_image)
1711 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= and --image= may not be combined.");
1712
1713 if (arg_template && !(arg_directory || arg_machine))
1714 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= needs --directory= or --machine=.");
1715
1716 if (arg_ephemeral && arg_template)
1717 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --template= may not be combined.");
1718
1719 /* Permit --ephemeral with --link-journal=try-* to satisfy principle of the least astonishment
1720 * (by common sense, "try" means "do not fail if not possible") */
1721 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO) && !arg_link_journal_try)
1722 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --link-journal={host,guest} may not be combined.");
1723
1724 if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported())
1725 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--private-users= is not supported, kernel compiled without user namespace support.");
1726
1727 if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_CHOWN && arg_read_only)
1728 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1729 "--read-only and --private-users-ownership=chown may not be combined.");
1730
1731 /* We don't support --private-users-ownership=chown together with any of the volatile modes since we
1732 * couldn't change the read-only part of the tree (i.e. /usr) anyway, or because it would trigger a
1733 * massive copy-up (in case of overlay) making the entire exercise pointless. */
1734 if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_CHOWN && arg_volatile_mode != VOLATILE_NO)
1735 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--volatile= and --private-users-ownership=chown may not be combined.");
1736
1737 /* If --network-namespace-path is given with any other network-related option (except --private-network),
1738 * we need to error out, to avoid conflicts between different network options. */
1739 if (arg_network_namespace_path &&
1740 (arg_network_interfaces || arg_network_macvlan ||
1741 arg_network_ipvlan || arg_network_veth_extra ||
1742 arg_network_bridge || arg_network_zone ||
1743 arg_network_veth))
1744 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--network-namespace-path= cannot be combined with other network options.");
1745
1746 if (arg_network_bridge && arg_network_zone)
1747 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1748 "--network-bridge= and --network-zone= may not be combined.");
1749
1750 if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network)
1751 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
1752
1753 if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO))
1754 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot combine --private-users with read-write mounts.");
1755
1756 if (arg_expose_ports && !arg_private_network)
1757 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot use --port= without private networking.");
1758
1759 if (arg_caps_ambient) {
1760 if (arg_caps_ambient == UINT64_MAX)
1761 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= does not support the value all.");
1762
1763 if ((arg_caps_ambient & arg_caps_retain) != arg_caps_ambient)
1764 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= setting is not fully covered by Capability= setting.");
1765
1766 if (arg_start_mode == START_BOOT)
1767 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= setting is not useful for boot mode.");
1768 }
1769
1770 if (arg_userns_mode == USER_NAMESPACE_NO && !strv_isempty(arg_bind_user))
1771 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--bind-user= requires --private-users");
1772
1773 /* Drop duplicate --bind-user= entries */
1774 strv_uniq(arg_bind_user);
1775
1776 r = custom_mount_check_all();
1777 if (r < 0)
1778 return r;
1779
1780 return 0;
1781}
1782
1783static int verify_network_interfaces_initialized(void) {
1784 int r;
1785 r = test_network_interfaces_initialized(arg_network_interfaces);
1786 if (r < 0)
1787 return r;
1788
1789 r = test_network_interfaces_initialized(arg_network_macvlan);
1790 if (r < 0)
1791 return r;
1792
1793 r = test_network_interfaces_initialized(arg_network_ipvlan);
1794 if (r < 0)
1795 return r;
1796
1797 return 0;
1798}
1799
1800int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1801 assert(p);
1802
1803 if (arg_userns_mode == USER_NAMESPACE_NO)
1804 return 0;
1805
1806 if (uid == UID_INVALID && gid == GID_INVALID)
1807 return 0;
1808
1809 if (uid != UID_INVALID) {
1810 uid += arg_uid_shift;
1811
1812 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1813 return -EOVERFLOW;
1814 }
1815
1816 if (gid != GID_INVALID) {
1817 gid += (gid_t) arg_uid_shift;
1818
1819 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1820 return -EOVERFLOW;
1821 }
1822
1823 return RET_NERRNO(lchown(p, uid, gid));
1824}
1825
/* Creates the directory 'path' below 'root' with the given access mode and, if it was newly created,
 * passes it to userns_lchown() with 'uid'/'gid'.
 *
 * A directory that exists already is left alone and reported as success (0). Any other mkdir() failure
 * is returned as produced by RET_NERRNO(); otherwise the result of userns_lchown() is returned. */
int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;
        int k;

        full = prefix_roota(root, path);

        k = RET_NERRNO(mkdir(full, mode));
        if (k >= 0)
                return userns_lchown(full, uid, gid);

        /* -EEXIST is not an error for us, everything else is */
        return k == -EEXIST ? 0 : k;
}
1839
/* Matches 'path' against the two zoneinfo directory prefixes (relative and absolute form) via
 * PATH_STARTSWITH_SET() and returns whatever that macro yields. The caller in this file treats NULL as
 * "does not point into /usr/share/zoneinfo/" and a non-NULL result as the zone name. */
static const char *timezone_from_path(const char *path) {
        const char *zone;

        zone = PATH_STARTSWITH_SET(
                        path,
                        "../usr/share/zoneinfo/",
                        "/usr/share/zoneinfo/");

        return zone;
}
1846
1847static bool etc_writable(void) {
1848 return !arg_read_only || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_OVERLAY);
1849}
1850
1851static int setup_timezone(const char *dest) {
1852 _cleanup_free_ char *p = NULL, *etc = NULL;
1853 const char *where, *check;
1854 TimezoneMode m;
1855 int r;
1856
1857 assert(dest);
1858
1859 if (IN_SET(arg_timezone, TIMEZONE_AUTO, TIMEZONE_SYMLINK)) {
1860 r = readlink_malloc("/etc/localtime", &p);
1861 if (r == -ENOENT && arg_timezone == TIMEZONE_AUTO)
1862 m = etc_writable() ? TIMEZONE_DELETE : TIMEZONE_OFF;
1863 else if (r == -EINVAL && arg_timezone == TIMEZONE_AUTO) /* regular file? */
1864 m = etc_writable() ? TIMEZONE_COPY : TIMEZONE_BIND;
1865 else if (r < 0) {
1866 log_warning_errno(r, "Failed to read host's /etc/localtime symlink, not updating container timezone: %m");
1867 /* To handle warning, delete /etc/localtime and replace it with a symbolic link to a time zone data
1868 * file.
1869 *
1870 * Example:
1871 * ln -s /usr/share/zoneinfo/UTC /etc/localtime
1872 */
1873 return 0;
1874 } else if (arg_timezone == TIMEZONE_AUTO)
1875 m = etc_writable() ? TIMEZONE_SYMLINK : TIMEZONE_BIND;
1876 else
1877 m = arg_timezone;
1878 } else
1879 m = arg_timezone;
1880
1881 if (m == TIMEZONE_OFF)
1882 return 0;
1883
1884 r = chase("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
1885 if (r < 0) {
1886 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
1887 return 0;
1888 }
1889
1890 where = strjoina(etc, "/localtime");
1891
1892 switch (m) {
1893
1894 case TIMEZONE_DELETE:
1895 if (unlink(where) < 0)
1896 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
1897
1898 return 0;
1899
1900 case TIMEZONE_SYMLINK: {
1901 _cleanup_free_ char *q = NULL;
1902 const char *z, *what;
1903
1904 z = timezone_from_path(p);
1905 if (!z) {
1906 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1907 return 0;
1908 }
1909
1910 r = readlink_malloc(where, &q);
1911 if (r >= 0 && streq_ptr(timezone_from_path(q), z))
1912 return 0; /* Already pointing to the right place? Then do nothing .. */
1913
1914 check = strjoina(dest, "/usr/share/zoneinfo/", z);
1915 r = chase(check, dest, 0, NULL, NULL);
1916 if (r < 0)
1917 log_debug_errno(r, "Timezone %s does not exist (or is not accessible) in container, not creating symlink: %m", z);
1918 else {
1919 if (unlink(where) < 0 && errno != ENOENT) {
1920 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING, /* Don't complain on read-only images */
1921 errno, "Failed to remove existing timezone info %s in container, ignoring: %m", where);
1922 return 0;
1923 }
1924
1925 what = strjoina("../usr/share/zoneinfo/", z);
1926 if (symlink(what, where) < 0) {
1927 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING,
1928 errno, "Failed to correct timezone of container, ignoring: %m");
1929 return 0;
1930 }
1931
1932 break;
1933 }
1934
1935 _fallthrough_;
1936 }
1937
1938 case TIMEZONE_BIND: {
1939 _cleanup_free_ char *resolved = NULL;
1940 int found;
1941
1942 found = chase(where, dest, CHASE_NONEXISTENT, &resolved, NULL);
1943 if (found < 0) {
1944 log_warning_errno(found, "Failed to resolve /etc/localtime path in container, ignoring: %m");
1945 return 0;
1946 }
1947
1948 if (found == 0) /* missing? */
1949 (void) touch(resolved);
1950
1951 r = mount_nofollow_verbose(LOG_WARNING, "/etc/localtime", resolved, NULL, MS_BIND, NULL);
1952 if (r >= 0)
1953 return mount_nofollow_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
1954
1955 _fallthrough_;
1956 }
1957
1958 case TIMEZONE_COPY:
1959 /* If mounting failed, try to copy */
1960 r = copy_file_atomic("/etc/localtime", where, 0644, COPY_REFLINK|COPY_REPLACE);
1961 if (r < 0) {
1962 log_full_errno(IN_SET(r, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
1963 "Failed to copy /etc/localtime to %s, ignoring: %m", where);
1964 return 0;
1965 }
1966
1967 break;
1968
1969 default:
1970 assert_not_reached();
1971 }
1972
1973 /* Fix permissions of the symlink or file copy we just created */
1974 r = userns_lchown(where, 0, 0);
1975 if (r < 0)
1976 log_warning_errno(r, "Failed to chown /etc/localtime, ignoring: %m");
1977
1978 return 0;
1979}
1980
/* Probes for the existence of 'path' with access(F_OK). Returns 1 if it is there, 0 if access() reports
 * ENOENT, and otherwise the result of logging the error at debug level. */
static int have_resolv_conf(const char *path) {
        assert(path);

        if (access(path, F_OK) >= 0)
                return 1;

        if (errno == ENOENT)
                return 0;

        return log_debug_errno(errno, "Failed to determine whether '%s' is available: %m", path);
}
1993
1994static int resolved_listening(void) {
1995 _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
1996 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
1997 _cleanup_free_ char *dns_stub_listener_mode = NULL;
1998 int r;
1999
2000 /* Check if resolved is listening */
2001
2002 r = sd_bus_open_system(&bus);
2003 if (r < 0)
2004 return log_debug_errno(r, "Failed to open system bus: %m");
2005
2006 r = bus_name_has_owner(bus, "org.freedesktop.resolve1", NULL);
2007 if (r < 0)
2008 return log_debug_errno(r, "Failed to check whether the 'org.freedesktop.resolve1' bus name is taken: %m");
2009 if (r == 0)
2010 return 0;
2011
2012 r = bus_get_property_string(bus, bus_resolve_mgr, "DNSStubListener", &error, &dns_stub_listener_mode);
2013 if (r < 0)
2014 return log_debug_errno(r, "Failed to query DNSStubListener property: %s", bus_error_message(&error, r));
2015
2016 return STR_IN_SET(dns_stub_listener_mode, "udp", "yes");
2017}
2018
2019static int setup_resolv_conf(const char *dest) {
2020 _cleanup_free_ char *etc = NULL;
2021 const char *where, *what;
2022 ResolvConfMode m;
2023 int r;
2024
2025 assert(dest);
2026
2027 if (arg_resolv_conf == RESOLV_CONF_AUTO) {
2028 if (arg_private_network)
2029 m = RESOLV_CONF_OFF;
2030 else if (have_resolv_conf(PRIVATE_STUB_RESOLV_CONF) > 0 && resolved_listening() > 0)
2031 m = etc_writable() ? RESOLV_CONF_COPY_STUB : RESOLV_CONF_BIND_STUB;
2032 else if (have_resolv_conf("/etc/resolv.conf") > 0)
2033 m = etc_writable() ? RESOLV_CONF_COPY_HOST : RESOLV_CONF_BIND_HOST;
2034 else
2035 m = etc_writable() ? RESOLV_CONF_DELETE : RESOLV_CONF_OFF;
2036
2037 } else
2038 m = arg_resolv_conf;
2039
2040 if (m == RESOLV_CONF_OFF)
2041 return 0;
2042
2043 r = chase("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
2044 if (r < 0) {
2045 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
2046 return 0;
2047 }
2048
2049 where = strjoina(etc, "/resolv.conf");
2050
2051 if (m == RESOLV_CONF_DELETE) {
2052 if (unlink(where) < 0)
2053 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
2054
2055 return 0;
2056 }
2057
2058 if (IN_SET(m, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_REPLACE_STATIC, RESOLV_CONF_COPY_STATIC))
2059 what = PRIVATE_STATIC_RESOLV_CONF;
2060 else if (IN_SET(m, RESOLV_CONF_BIND_UPLINK, RESOLV_CONF_REPLACE_UPLINK, RESOLV_CONF_COPY_UPLINK))
2061 what = PRIVATE_UPLINK_RESOLV_CONF;
2062 else if (IN_SET(m, RESOLV_CONF_BIND_STUB, RESOLV_CONF_REPLACE_STUB, RESOLV_CONF_COPY_STUB))
2063 what = PRIVATE_STUB_RESOLV_CONF;
2064 else
2065 what = "/etc/resolv.conf";
2066
2067 if (IN_SET(m, RESOLV_CONF_BIND_HOST, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_BIND_UPLINK, RESOLV_CONF_BIND_STUB)) {
2068 _cleanup_free_ char *resolved = NULL;
2069 int found;
2070
2071 found = chase(where, dest, CHASE_NONEXISTENT|CHASE_NOFOLLOW, &resolved, NULL);
2072 if (found < 0) {
2073 log_warning_errno(found, "Failed to resolve /etc/resolv.conf path in container, ignoring: %m");
2074 return 0;
2075 }
2076
2077 if (found == 0) /* missing? */
2078 (void) touch(resolved);
2079
2080 r = mount_nofollow_verbose(LOG_WARNING, what, resolved, NULL, MS_BIND, NULL);
2081 if (r >= 0)
2082 return mount_nofollow_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
2083
2084 /* If that didn't work, let's copy the file */
2085 }
2086
2087 if (IN_SET(m, RESOLV_CONF_REPLACE_HOST, RESOLV_CONF_REPLACE_STATIC, RESOLV_CONF_REPLACE_UPLINK, RESOLV_CONF_REPLACE_STUB))
2088 r = copy_file_atomic(what, where, 0644, COPY_REFLINK|COPY_REPLACE);
2089 else
2090 r = copy_file(what, where, O_TRUNC|O_NOFOLLOW, 0644, COPY_REFLINK);
2091 if (r < 0) {
2092 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
2093 * resolved or something similar runs inside and the symlink points there.
2094 *
2095 * If the disk image is read-only, there's also no point in complaining.
2096 */
2097 log_full_errno(!IN_SET(RESOLV_CONF_COPY_HOST, RESOLV_CONF_COPY_STATIC, RESOLV_CONF_COPY_UPLINK, RESOLV_CONF_COPY_STUB) &&
2098 IN_SET(r, -ELOOP, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
2099 "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where);
2100 return 0;
2101 }
2102
2103 r = userns_lchown(where, 0, 0);
2104 if (r < 0)
2105 log_warning_errno(r, "Failed to chown /etc/resolv.conf, ignoring: %m");
2106
2107 return 0;
2108}
2109
2110static int setup_boot_id(void) {
2111 _cleanup_(unlink_and_freep) char *from = NULL;
2112 _cleanup_free_ char *path = NULL;
2113 sd_id128_t rnd = SD_ID128_NULL;
2114 const char *to;
2115 int r;
2116
2117 /* Generate a new randomized boot ID, so that each boot-up of the container gets a new one */
2118
2119 r = tempfn_random_child("/run", "proc-sys-kernel-random-boot-id", &path);
2120 if (r < 0)
2121 return log_error_errno(r, "Failed to generate random boot ID path: %m");
2122
2123 r = sd_id128_randomize(&rnd);
2124 if (r < 0)
2125 return log_error_errno(r, "Failed to generate random boot id: %m");
2126
2127 r = id128_write(path, ID128_FORMAT_UUID, rnd);
2128 if (r < 0)
2129 return log_error_errno(r, "Failed to write boot id: %m");
2130
2131 from = TAKE_PTR(path);
2132 to = "/proc/sys/kernel/random/boot_id";
2133
2134 r = mount_nofollow_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
2135 if (r < 0)
2136 return r;
2137
2138 return mount_nofollow_verbose(LOG_ERR, NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
2139}
2140
2141static int copy_devnodes(const char *dest) {
2142 static const char devnodes[] =
2143 "null\0"
2144 "zero\0"
2145 "full\0"
2146 "random\0"
2147 "urandom\0"
2148 "tty\0"
2149 "net/tun\0";
2150
2151 int r = 0;
2152
2153 assert(dest);
2154
2155 BLOCK_WITH_UMASK(0000);
2156
2157 /* Create /dev/net, so that we can create /dev/net/tun in it */
2158 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
2159 return log_error_errno(r, "Failed to create /dev/net directory: %m");
2160
2161 NULSTR_FOREACH(d, devnodes) {
2162 _cleanup_free_ char *from = NULL, *to = NULL;
2163 struct stat st;
2164
2165 from = path_join("/dev/", d);
2166 if (!from)
2167 return log_oom();
2168
2169 to = path_join(dest, from);
2170 if (!to)
2171 return log_oom();
2172
2173 if (stat(from, &st) < 0) {
2174
2175 if (errno != ENOENT)
2176 return log_error_errno(errno, "Failed to stat %s: %m", from);
2177
2178 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode))
2179 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2180 "%s is not a char or block device, cannot copy.", from);
2181 else {
2182 _cleanup_free_ char *sl = NULL, *prefixed = NULL, *dn = NULL, *t = NULL;
2183
2184 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
2185 /* Explicitly warn the user when /dev is already populated. */
2186 if (errno == EEXIST)
2187 log_notice("%s/dev/ is pre-mounted and pre-populated. If a pre-mounted /dev/ is provided it needs to be an unpopulated file system.", dest);
2188 if (errno != EPERM)
2189 return log_error_errno(errno, "mknod(%s) failed: %m", to);
2190
2191 /* Some systems abusively restrict mknod but allow bind mounts. */
2192 r = touch(to);
2193 if (r < 0)
2194 return log_error_errno(r, "touch (%s) failed: %m", to);
2195 r = mount_nofollow_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL);
2196 if (r < 0)
2197 return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to);
2198 }
2199
2200 r = userns_lchown(to, 0, 0);
2201 if (r < 0)
2202 return log_error_errno(r, "chown() of device node %s failed: %m", to);
2203
2204 dn = path_join("/dev", S_ISCHR(st.st_mode) ? "char" : "block");
2205 if (!dn)
2206 return log_oom();
2207
2208 r = userns_mkdir(dest, dn, 0755, 0, 0);
2209 if (r < 0)
2210 return log_error_errno(r, "Failed to create '%s': %m", dn);
2211
2212 if (asprintf(&sl, "%s/%u:%u", dn, major(st.st_rdev), minor(st.st_rdev)) < 0)
2213 return log_oom();
2214
2215 prefixed = path_join(dest, sl);
2216 if (!prefixed)
2217 return log_oom();
2218
2219 t = path_join("..", d);
2220 if (!t)
2221 return log_oom();
2222
2223 if (symlink(t, prefixed) < 0)
2224 log_debug_errno(errno, "Failed to symlink '%s' to '%s': %m", t, prefixed);
2225 }
2226 }
2227
2228 return r;
2229}
2230
2231static int make_extra_nodes(const char *dest) {
2232 size_t i;
2233 int r;
2234
2235 BLOCK_WITH_UMASK(0000);
2236
2237 for (i = 0; i < arg_n_extra_nodes; i++) {
2238 _cleanup_free_ char *path = NULL;
2239 DeviceNode *n = arg_extra_nodes + i;
2240
2241 path = path_join(dest, n->path);
2242 if (!path)
2243 return log_oom();
2244
2245 if (mknod(path, n->mode, S_ISCHR(n->mode) || S_ISBLK(n->mode) ? makedev(n->major, n->minor) : 0) < 0)
2246 return log_error_errno(errno, "Failed to create device node '%s': %m", path);
2247
2248 r = chmod_and_chown(path, n->mode, n->uid, n->gid);
2249 if (r < 0)
2250 return log_error_errno(r, "Failed to adjust device node ownership of '%s': %m", path);
2251 }
2252
2253 return 0;
2254}
2255
2256static int setup_pts(const char *dest) {
2257 _cleanup_free_ char *options = NULL;
2258 const char *p;
2259 int r;
2260
2261#if HAVE_SELINUX
2262 if (arg_selinux_apifs_context)
2263 (void) asprintf(&options,
2264 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
2265 arg_uid_shift + TTY_GID,
2266 arg_selinux_apifs_context);
2267 else
2268#endif
2269 (void) asprintf(&options,
2270 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
2271 arg_uid_shift + TTY_GID);
2272
2273 if (!options)
2274 return log_oom();
2275
2276 /* Mount /dev/pts itself */
2277 p = prefix_roota(dest, "/dev/pts");
2278 r = RET_NERRNO(mkdir(p, 0755));
2279 if (r < 0)
2280 return log_error_errno(r, "Failed to create /dev/pts: %m");
2281
2282 r = mount_nofollow_verbose(LOG_ERR, "devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options);
2283 if (r < 0)
2284 return r;
2285 r = userns_lchown(p, 0, 0);
2286 if (r < 0)
2287 return log_error_errno(r, "Failed to chown /dev/pts: %m");
2288
2289 /* Create /dev/ptmx symlink */
2290 p = prefix_roota(dest, "/dev/ptmx");
2291 if (symlink("pts/ptmx", p) < 0)
2292 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
2293 r = userns_lchown(p, 0, 0);
2294 if (r < 0)
2295 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
2296
2297 /* And fix /dev/pts/ptmx ownership */
2298 p = prefix_roota(dest, "/dev/pts/ptmx");
2299 r = userns_lchown(p, 0, 0);
2300 if (r < 0)
2301 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
2302
2303 return 0;
2304}
2305
2306static int setup_stdio_as_dev_console(void) {
2307 _cleanup_close_ int terminal = -EBADF;
2308 int r;
2309
2310 /* We open the TTY in O_NOCTTY mode, so that we do not become controller yet. We'll do that later
2311 * explicitly, if we are configured to. */
2312 terminal = open_terminal("/dev/console", O_RDWR|O_NOCTTY);
2313 if (terminal < 0)
2314 return log_error_errno(terminal, "Failed to open console: %m");
2315
2316 /* Make sure we can continue logging to the original stderr, even if
2317 * stderr points elsewhere now */
2318 r = log_dup_console();
2319 if (r < 0)
2320 return log_error_errno(r, "Failed to duplicate stderr: %m");
2321
2322 /* invalidates 'terminal' on success and failure */
2323 r = rearrange_stdio(terminal, terminal, terminal);
2324 TAKE_FD(terminal);
2325 if (r < 0)
2326 return log_error_errno(r, "Failed to move console to stdin/stdout/stderr: %m");
2327
2328 return 0;
2329}
2330
2331static int setup_dev_console(const char *console) {
2332 _cleanup_free_ char *p = NULL;
2333 int r;
2334
2335 /* Create /dev/console symlink */
2336 r = path_make_relative("/dev", console, &p);
2337 if (r < 0)
2338 return log_error_errno(r, "Failed to create relative path: %m");
2339
2340 if (symlink(p, "/dev/console") < 0)
2341 return log_error_errno(errno, "Failed to create /dev/console symlink: %m");
2342
2343 return 0;
2344}
2345
2346static int setup_keyring(void) {
2347 key_serial_t keyring;
2348
2349 /* Allocate a new session keyring for the container. This makes sure the keyring of the session
2350 * systemd-nspawn was invoked from doesn't leak into the container. Note that by default we block
2351 * keyctl() and request_key() anyway via seccomp so doing this operation isn't strictly necessary,
2352 * but in case people explicitly allow-list these system calls let's make sure we don't leak anything
2353 * into the container. */
2354
2355 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2356 if (keyring == -1) {
2357 if (errno == ENOSYS)
2358 log_debug_errno(errno, "Kernel keyring not supported, ignoring.");
2359 else if (ERRNO_IS_PRIVILEGE(errno))
2360 log_debug_errno(errno, "Kernel keyring access prohibited, ignoring.");
2361 else
2362 return log_error_errno(errno, "Setting up kernel keyring failed: %m");
2363 }
2364
2365 return 0;
2366}
2367
/* Makes sure /run/host exists below 'root' (mode 0755, created through userns_mkdir() with uid/gid 0).
 * Logs and returns the error if that fails, returns 0 otherwise. */
int make_run_host(const char *root) {
        int r;

        assert(root);

        r = userns_mkdir(root, "/run/host", 0755, 0, 0);
        return r < 0 ? log_error_errno(r, "Failed to create /run/host/: %m") : 0;
}
2379
2380static int setup_credentials(const char *root) {
2381 const char *q;
2382 int r;
2383
2384 if (arg_credentials.n_credentials == 0)
2385 return 0;
2386
2387 r = make_run_host(root);
2388 if (r < 0)
2389 return r;
2390
2391 r = userns_mkdir(root, "/run/host/credentials", 0700, 0, 0);
2392 if (r < 0)
2393 return log_error_errno(r, "Failed to create /run/host/credentials: %m");
2394
2395 q = prefix_roota(root, "/run/host/credentials");
2396 r = mount_nofollow_verbose(LOG_ERR, NULL, q, "ramfs", MS_NOSUID|MS_NOEXEC|MS_NODEV, "mode=0700");
2397 if (r < 0)
2398 return r;
2399
2400 FOREACH_ARRAY(cred, arg_credentials.credentials, arg_credentials.n_credentials) {
2401 _cleanup_free_ char *j = NULL;
2402 _cleanup_close_ int fd = -EBADF;
2403
2404 j = path_join(q, cred->id);
2405 if (!j)
2406 return log_oom();
2407
2408 fd = open(j, O_CREAT|O_EXCL|O_WRONLY|O_CLOEXEC|O_NOFOLLOW, 0600);
2409 if (fd < 0)
2410 return log_error_errno(errno, "Failed to create credential file %s: %m", j);
2411
2412 r = loop_write(fd, cred->data, cred->size);
2413 if (r < 0)
2414 return log_error_errno(r, "Failed to write credential to file %s: %m", j);
2415
2416 if (fchmod(fd, 0400) < 0)
2417 return log_error_errno(errno, "Failed to adjust access mode of %s: %m", j);
2418
2419 if (arg_userns_mode != USER_NAMESPACE_NO) {
2420 if (fchown(fd, arg_uid_shift, arg_uid_shift) < 0)
2421 return log_error_errno(errno, "Failed to adjust ownership of %s: %m", j);
2422 }
2423 }
2424
2425 if (chmod(q, 0500) < 0)
2426 return log_error_errno(errno, "Failed to adjust access mode of %s: %m", q);
2427
2428 r = userns_lchown(q, 0, 0);
2429 if (r < 0)
2430 return r;
2431
2432 /* Make both mount and superblock read-only now */
2433 r = mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_REMOUNT|MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
2434 if (r < 0)
2435 return r;
2436
2437 return mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, "mode=0500");
2438}
2439
2440static int setup_kmsg(int fd_inner_socket) {
2441 _cleanup_(unlink_and_freep) char *from = NULL;
2442 _cleanup_free_ char *fifo = NULL;
2443 _cleanup_close_ int fd = -EBADF;
2444 int r;
2445
2446 assert(fd_inner_socket >= 0);
2447
2448 BLOCK_WITH_UMASK(0000);
2449
2450 /* We create the kmsg FIFO as a temporary file in /run, but immediately delete it after bind mounting it to
2451 * /proc/kmsg. While FIFOs on the reading side behave very similar to /proc/kmsg, their writing side behaves
2452 * differently from /dev/kmsg in that writing blocks when nothing is reading. In order to avoid any problems
2453 * with containers deadlocking due to this we simply make /dev/kmsg unavailable to the container. */
2454
2455 r = tempfn_random_child("/run", "proc-kmsg", &fifo);
2456 if (r < 0)
2457 return log_error_errno(r, "Failed to generate kmsg path: %m");
2458
2459 if (mkfifo(fifo, 0600) < 0)
2460 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
2461
2462 from = TAKE_PTR(fifo);
2463
2464 r = mount_nofollow_verbose(LOG_ERR, from, "/proc/kmsg", NULL, MS_BIND, NULL);
2465 if (r < 0)
2466 return r;
2467
2468 fd = open(from, O_RDWR|O_NONBLOCK|O_CLOEXEC);
2469 if (fd < 0)
2470 return log_error_errno(errno, "Failed to open fifo: %m");
2471
2472 /* Store away the fd in the socket, so that it stays open as long as we run the child */
2473 r = send_one_fd(fd_inner_socket, fd, 0);
2474 if (r < 0)
2475 return log_error_errno(r, "Failed to send FIFO fd: %m");
2476
2477 return 0;
2478}
2479
2480struct ExposeArgs {
2481 union in_addr_union address4;
2482 union in_addr_union address6;
2483 struct FirewallContext *fw_ctx;
2484};
2485
2486static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
2487 struct ExposeArgs *args = ASSERT_PTR(userdata);
2488
2489 assert(rtnl);
2490 assert(m);
2491
2492 (void) expose_port_execute(rtnl, &args->fw_ctx, arg_expose_ports, AF_INET, &args->address4);
2493 (void) expose_port_execute(rtnl, &args->fw_ctx, arg_expose_ports, AF_INET6, &args->address6);
2494 return 0;
2495}
2496
2497static int setup_hostname(void) {
2498 int r;
2499
2500 if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
2501 return 0;
2502
2503 r = sethostname_idempotent(arg_hostname ?: arg_machine);
2504 if (r < 0)
2505 return log_error_errno(r, "Failed to set hostname: %m");
2506
2507 return 0;
2508}
2509
2510static int setup_journal(const char *directory) {
2511 _cleanup_free_ char *d = NULL;
2512 const char *p, *q;
2513 sd_id128_t this_id;
2514 bool try;
2515 int r;
2516
2517 /* Don't link journals in ephemeral mode */
2518 if (arg_ephemeral)
2519 return 0;
2520
2521 if (arg_link_journal == LINK_NO)
2522 return 0;
2523
2524 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
2525
2526 r = sd_id128_get_machine(&this_id);
2527 if (r < 0)
2528 return log_error_errno(r, "Failed to retrieve machine ID: %m");
2529
2530 if (sd_id128_equal(arg_uuid, this_id)) {
2531 log_full(try ? LOG_WARNING : LOG_ERR,
2532 "Host and machine ids are equal (%s): refusing to link journals", SD_ID128_TO_STRING(arg_uuid));
2533 if (try)
2534 return 0;
2535 return -EEXIST;
2536 }
2537
2538 FOREACH_STRING(dirname, "/var", "/var/log", "/var/log/journal") {
2539 r = userns_mkdir(directory, dirname, 0755, 0, 0);
2540 if (r < 0) {
2541 bool ignore = r == -EROFS && try;
2542 log_full_errno(ignore ? LOG_DEBUG : LOG_ERR, r,
2543 "Failed to create %s%s: %m", dirname, ignore ? ", ignoring" : "");
2544 return ignore ? 0 : r;
2545 }
2546 }
2547
2548 p = strjoina("/var/log/journal/", SD_ID128_TO_STRING(arg_uuid));
2549 q = prefix_roota(directory, p);
2550
2551 if (path_is_mount_point(p) > 0) {
2552 if (try)
2553 return 0;
2554
2555 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2556 "%s: already a mount point, refusing to use for journal", p);
2557 }
2558
2559 if (path_is_mount_point(q) > 0) {
2560 if (try)
2561 return 0;
2562
2563 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2564 "%s: already a mount point, refusing to use for journal", q);
2565 }
2566
2567 r = readlink_and_make_absolute(p, &d);
2568 if (r >= 0) {
2569 if (IN_SET(arg_link_journal, LINK_GUEST, LINK_AUTO) &&
2570 path_equal(d, q)) {
2571
2572 r = userns_mkdir(directory, p, 0755, 0, 0);
2573 if (r < 0)
2574 log_warning_errno(r, "Failed to create directory %s: %m", q);
2575 return 0;
2576 }
2577
2578 if (unlink(p) < 0)
2579 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
2580 } else if (r == -EINVAL) {
2581
2582 if (arg_link_journal == LINK_GUEST &&
2583 rmdir(p) < 0) {
2584
2585 if (errno == ENOTDIR) {
2586 log_error("%s already exists and is neither a symlink nor a directory", p);
2587 return r;
2588 } else
2589 return log_error_errno(errno, "Failed to remove %s: %m", p);
2590 }
2591 } else if (r != -ENOENT)
2592 return log_error_errno(r, "readlink(%s) failed: %m", p);
2593
2594 if (arg_link_journal == LINK_GUEST) {
2595
2596 if (symlink(q, p) < 0) {
2597 if (try) {
2598 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
2599 return 0;
2600 } else
2601 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
2602 }
2603
2604 r = userns_mkdir(directory, p, 0755, 0, 0);
2605 if (r < 0)
2606 log_warning_errno(r, "Failed to create directory %s: %m", q);
2607 return 0;
2608 }
2609
2610 if (arg_link_journal == LINK_HOST) {
2611 /* don't create parents here — if the host doesn't have
2612 * permanent journal set up, don't force it here */
2613
2614 r = RET_NERRNO(mkdir(p, 0755));
2615 if (r < 0 && r != -EEXIST) {
2616 if (try) {
2617 log_debug_errno(r, "Failed to create %s, skipping journal setup: %m", p);
2618 return 0;
2619 } else
2620 return log_error_errno(r, "Failed to create %s: %m", p);
2621 }
2622
2623 } else if (access(p, F_OK) < 0)
2624 return 0;
2625
2626 if (dir_is_empty(q, /* ignore_hidden_or_backup= */ false) == 0)
2627 log_warning("%s is not empty, proceeding anyway.", q);
2628
2629 r = userns_mkdir(directory, p, 0755, 0, 0);
2630 if (r < 0)
2631 return log_error_errno(r, "Failed to create %s: %m", q);
2632
2633 r = mount_nofollow_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL);
2634 if (r < 0)
2635 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
2636
2637 return 0;
2638}
2639
2640static int drop_capabilities(uid_t uid) {
2641 CapabilityQuintet q;
2642
2643 /* Let's initialize all five capability sets to something valid. If the quintet was configured via
2644 * OCI use that, but fill in missing bits. If it wasn't then derive the quintet in full from
2645 * arg_caps_retain. */
2646
2647 if (capability_quintet_is_set(&arg_full_capabilities)) {
2648 q = arg_full_capabilities;
2649
2650 if (q.bounding == UINT64_MAX)
2651 q.bounding = uid == 0 ? arg_caps_retain : 0;
2652
2653 if (q.effective == UINT64_MAX)
2654 q.effective = uid == 0 ? q.bounding : 0;
2655
2656 if (q.inheritable == UINT64_MAX)
2657 q.inheritable = uid == 0 ? q.bounding : arg_caps_ambient;
2658
2659 if (q.permitted == UINT64_MAX)
2660 q.permitted = uid == 0 ? q.bounding : arg_caps_ambient;
2661
2662 if (q.ambient == UINT64_MAX && ambient_capabilities_supported())
2663 q.ambient = arg_caps_ambient;
2664
2665 if (capability_quintet_mangle(&q))
2666 return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Cannot set capabilities that are not in the current bounding set.");
2667
2668 } else {
2669 q = (CapabilityQuintet) {
2670 .bounding = arg_caps_retain,
2671 .effective = uid == 0 ? arg_caps_retain : 0,
2672 .inheritable = uid == 0 ? arg_caps_retain : arg_caps_ambient,
2673 .permitted = uid == 0 ? arg_caps_retain : arg_caps_ambient,
2674 .ambient = ambient_capabilities_supported() ? arg_caps_ambient : UINT64_MAX,
2675 };
2676
2677 /* If we're not using OCI, proceed with mangled capabilities (so we don't error out)
2678 * in order to maintain the same behavior as systemd < 242. */
2679 if (capability_quintet_mangle(&q))
2680 log_full(arg_quiet ? LOG_DEBUG : LOG_WARNING,
2681 "Some capabilities will not be set because they are not in the current bounding set.");
2682
2683 }
2684
2685 return capability_quintet_enforce(&q);
2686}
2687
2688static int reset_audit_loginuid(void) {
2689 _cleanup_free_ char *p = NULL;
2690 int r;
2691
2692 if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
2693 return 0;
2694
2695 r = read_one_line_file("/proc/self/loginuid", &p);
2696 if (r == -ENOENT)
2697 return 0;
2698 if (r < 0)
2699 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2700
2701 /* Already reset? */
2702 if (streq(p, "4294967295"))
2703 return 0;
2704
2705 r = write_string_file("/proc/self/loginuid", "4294967295", WRITE_STRING_FILE_DISABLE_BUFFER);
2706 if (r < 0) {
2707 log_error_errno(r,
2708 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2709 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2710 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2711 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2712 "using systemd-nspawn. Sleeping for 5s... (%m)");
2713
2714 sleep(5);
2715 }
2716
2717 return 0;
2718}
2719
2720static int mount_tunnel_dig(const char *root) {
2721 const char *p, *q;
2722 int r;
2723
2724 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2725 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2726 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
2727 (void) mkdir_p(p, 0600);
2728
2729 r = make_run_host(root);
2730 if (r < 0)
2731 return r;
2732
2733 r = userns_mkdir(root, NSPAWN_MOUNT_TUNNEL, 0600, 0, 0);
2734 if (r < 0)
2735 return log_error_errno(r, "Failed to create "NSPAWN_MOUNT_TUNNEL": %m");
2736
2737 q = prefix_roota(root, NSPAWN_MOUNT_TUNNEL);
2738 r = mount_nofollow_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
2739 if (r < 0)
2740 return r;
2741
2742 r = mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
2743 if (r < 0)
2744 return r;
2745
2746 return 0;
2747}
2748
2749static int mount_tunnel_open(void) {
2750 int r;
2751
2752 r = mount_follow_verbose(LOG_ERR, NULL, NSPAWN_MOUNT_TUNNEL, NULL, MS_SLAVE, NULL);
2753 if (r < 0)
2754 return r;
2755
2756 return 0;
2757}
2758
2759static int setup_machine_id(const char *directory) {
2760 int r;
2761
2762 /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
2763 * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
2764 * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
2765 * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
2766 * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
2767 * container behaves nicely). */
2768
2769 r = id128_get_machine(directory, &arg_uuid);
2770 if (ERRNO_IS_NEG_MACHINE_ID_UNSET(r)) {
2771 /* If the file is missing, empty, or uninitialized, we don't mind */
2772 if (sd_id128_is_null(arg_uuid)) {
2773 r = sd_id128_randomize(&arg_uuid);
2774 if (r < 0)
2775 return log_error_errno(r, "Failed to acquire randomized machine UUID: %m");
2776 }
2777 } else if (r < 0)
2778 return log_error_errno(r, "Failed to read machine ID from container image: %m");
2779
2780 return 0;
2781}
2782
2783static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
2784 int r;
2785
2786 assert(directory);
2787
2788 if (arg_userns_mode == USER_NAMESPACE_NO || arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_CHOWN)
2789 return 0;
2790
2791 r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
2792 if (r == -EOPNOTSUPP)
2793 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
2794 if (r == -EBADE)
2795 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
2796 if (r < 0)
2797 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
2798 if (r == 0)
2799 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
2800 else
2801 log_debug("Patched directory tree to match UID/GID range.");
2802
2803 return r;
2804}
2805
2806/*
2807 * Return values:
2808 * < 0 : wait_for_terminate() failed to get the state of the
2809 * container, the container was terminated by a signal, or
2810 * failed for an unknown reason. No change is made to the
2811 * container argument.
2812 * > 0 : The program executed in the container terminated with an
2813 * error. The exit code of the program executed in the
2814 * container is returned. The container argument has been set
2815 * to CONTAINER_TERMINATED.
2816 * 0 : The container is being rebooted, has been shut down or exited
2817 * successfully. The container argument has been set to either
2818 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2819 *
2820 * That is, success is indicated by a return value of zero, and an
2821 * error is indicated by a non-zero value.
2822 */
2823static int wait_for_container(pid_t pid, ContainerStatus *container) {
2824 siginfo_t status;
2825 int r;
2826
2827 r = wait_for_terminate(pid, &status);
2828 if (r < 0)
2829 return log_warning_errno(r, "Failed to wait for container: %m");
2830
2831 switch (status.si_code) {
2832
2833 case CLD_EXITED:
2834 if (status.si_status == 0)
2835 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
2836 else
2837 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2838
2839 *container = CONTAINER_TERMINATED;
2840 return status.si_status;
2841
2842 case CLD_KILLED:
2843 if (status.si_status == SIGINT) {
2844 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
2845 *container = CONTAINER_TERMINATED;
2846 return 0;
2847
2848 } else if (status.si_status == SIGHUP) {
2849 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
2850 *container = CONTAINER_REBOOTED;
2851 return 0;
2852 }
2853
2854 _fallthrough_;
2855 case CLD_DUMPED:
2856 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2857 "Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2858
2859 default:
2860 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2861 "Container %s failed due to unknown reason.", arg_machine);
2862 }
2863}
2864
2865static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2866 pid_t pid;
2867
2868 pid = PTR_TO_PID(userdata);
2869 if (pid > 0) {
2870 if (kill(pid, arg_kill_signal) >= 0) {
2871 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2872 sd_event_source_set_userdata(s, NULL);
2873 return 0;
2874 }
2875 }
2876
2877 sd_event_exit(sd_event_source_get_event(s), 0);
2878 return 0;
2879}
2880
2881static int on_sigchld(sd_event_source *s, const struct signalfd_siginfo *ssi, void *userdata) {
2882 pid_t pid;
2883
2884 assert(s);
2885 assert(ssi);
2886
2887 pid = PTR_TO_PID(userdata);
2888
2889 for (;;) {
2890 siginfo_t si = {};
2891
2892 if (waitid(P_ALL, 0, &si, WNOHANG|WNOWAIT|WEXITED) < 0)
2893 return log_error_errno(errno, "Failed to waitid(): %m");
2894 if (si.si_pid == 0) /* No pending children. */
2895 break;
2896 if (si.si_pid == pid) {
2897 /* The main process we care for has exited. Return from
2898 * signal handler but leave the zombie. */
2899 sd_event_exit(sd_event_source_get_event(s), 0);
2900 break;
2901 }
2902
2903 /* Reap all other children. */
2904 (void) waitid(P_PID, si.si_pid, &si, WNOHANG|WEXITED);
2905 }
2906
2907 return 0;
2908}
2909
2910static int on_request_stop(sd_bus_message *m, void *userdata, sd_bus_error *error) {
2911 pid_t pid;
2912
2913 assert(m);
2914
2915 pid = PTR_TO_PID(userdata);
2916
2917 if (arg_kill_signal > 0) {
2918 log_info("Container termination requested. Attempting to halt container.");
2919 (void) kill(pid, arg_kill_signal);
2920 } else {
2921 log_info("Container termination requested. Exiting.");
2922 sd_event_exit(sd_bus_get_event(sd_bus_message_get_bus(m)), 0);
2923 }
2924
2925 return 0;
2926}
2927
2928static int pick_paths(void) {
2929 int r;
2930
2931 if (arg_directory) {
2932 _cleanup_(pick_result_done) PickResult result = PICK_RESULT_NULL;
2933 PickFilter filter = pick_filter_image_dir;
2934
2935 filter.architecture = arg_architecture;
2936
2937 r = path_pick_update_warn(
2938 &arg_directory,
2939 &filter,
2940 PICK_ARCHITECTURE|PICK_TRIES,
2941 &result);
2942 if (r < 0) {
2943 /* Accept ENOENT here so that the --template= logic can work */
2944 if (r != -ENOENT)
2945 return r;
2946 } else
2947 arg_architecture = result.architecture;
2948 }
2949
2950 if (arg_image) {
2951 _cleanup_(pick_result_done) PickResult result = PICK_RESULT_NULL;
2952 PickFilter filter = pick_filter_image_raw;
2953
2954 filter.architecture = arg_architecture;
2955
2956 r = path_pick_update_warn(
2957 &arg_image,
2958 &filter,
2959 PICK_ARCHITECTURE|PICK_TRIES,
2960 &result);
2961 if (r < 0)
2962 return r;
2963
2964 arg_architecture = result.architecture;
2965 }
2966
2967 if (arg_template) {
2968 _cleanup_(pick_result_done) PickResult result = PICK_RESULT_NULL;
2969 PickFilter filter = pick_filter_image_dir;
2970
2971 filter.architecture = arg_architecture;
2972
2973 r = path_pick_update_warn(
2974 &arg_template,
2975 &filter,
2976 PICK_ARCHITECTURE,
2977 &result);
2978 if (r < 0)
2979 return r;
2980
2981 arg_architecture = result.architecture;
2982 }
2983
2984 return 0;
2985}
2986
2987static int determine_names(void) {
2988 int r;
2989
2990 if (arg_template && !arg_directory && arg_machine) {
2991
2992 /* If --template= was specified then we should not search for a machine, but instead create a
2993 * new one in /var/lib/machine. */
2994
2995 arg_directory = path_join("/var/lib/machines", arg_machine);
2996 if (!arg_directory)
2997 return log_oom();
2998 }
2999
3000 if (!arg_image && !arg_directory) {
3001 if (arg_machine) {
3002 _cleanup_(image_unrefp) Image *i = NULL;
3003
3004 r = image_find(IMAGE_MACHINE, arg_machine, NULL, &i);
3005 if (r == -ENOENT)
3006 return log_error_errno(r, "No image for machine '%s'.", arg_machine);
3007 if (r < 0)
3008 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
3009
3010 if (IN_SET(i->type, IMAGE_RAW, IMAGE_BLOCK))
3011 r = free_and_strdup(&arg_image, i->path);
3012 else
3013 r = free_and_strdup(&arg_directory, i->path);
3014 if (r < 0)
3015 return log_oom();
3016
3017 if (!arg_ephemeral)
3018 arg_read_only = arg_read_only || i->read_only;
3019 } else {
3020 r = safe_getcwd(&arg_directory);
3021 if (r < 0)
3022 return log_error_errno(r, "Failed to determine current directory: %m");
3023 }
3024
3025 if (!arg_directory && !arg_image)
3026 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine path, please use -D or -i.");
3027 }
3028
3029 if (!arg_machine) {
3030 if (arg_directory && path_equal(arg_directory, "/")) {
3031 arg_machine = gethostname_malloc();
3032 if (!arg_machine)
3033 return log_oom();
3034 } else if (arg_image) {
3035 char *e;
3036
3037 r = path_extract_filename(arg_image, &arg_machine);
3038 if (r < 0)
3039 return log_error_errno(r, "Failed to extract file name from '%s': %m", arg_image);
3040
3041 /* Truncate suffix if there is one */
3042 e = endswith(arg_machine, ".raw");
3043 if (e)
3044 *e = 0;
3045 } else {
3046 r = path_extract_filename(arg_directory, &arg_machine);
3047 if (r < 0)
3048 return log_error_errno(r, "Failed to extract file name from '%s': %m", arg_directory);
3049 }
3050
3051 hostname_cleanup(arg_machine);
3052 if (!hostname_is_valid(arg_machine, 0))
3053 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine machine name automatically, please use -M.");
3054
3055 /* Copy the machine name before the random suffix is added below, otherwise we won't be able
3056 * to match fixed config file names. */
3057 arg_settings_filename = strjoin(arg_machine, ".nspawn");
3058 if (!arg_settings_filename)
3059 return log_oom();
3060
3061 /* Add a random suffix when this is an ephemeral machine, so that we can run many
3062 * instances at once without manually having to specify -M each time. */
3063 if (arg_ephemeral)
3064 if (strextendf(&arg_machine, "-%016" PRIx64, random_u64()) < 0)
3065 return log_oom();
3066 } else {
3067 arg_settings_filename = strjoin(arg_machine, ".nspawn");
3068 if (!arg_settings_filename)
3069 return log_oom();
3070 }
3071
3072 return 0;
3073}
3074
/* Resolves the path stored in *p via chase() with the given flags and replaces *p with the result,
 * freeing the old string. A NULL *p is left alone. Returns 0 on success (or if there was nothing to do),
 * negative errno if the path could not be resolved. */
static int chase_and_update(char **p, unsigned flags) {
        char *resolved = NULL;
        int r;

        assert(p);

        if (*p) {
                r = chase(*p, NULL, flags, &resolved, NULL);
                if (r < 0)
                        return log_error_errno(r, "Failed to resolve path %s: %m", *p);

                return free_and_replace(*p, resolved);
        }

        return 0;
}
3090
3091static int determine_uid_shift(const char *directory) {
3092
3093 if (arg_userns_mode == USER_NAMESPACE_NO) {
3094 arg_uid_shift = 0;
3095 return 0;
3096 }
3097
3098 if (arg_uid_shift == UID_INVALID) {
3099 struct stat st;
3100
3101 /* Read the UID shift off the image. Maybe we can reuse this to avoid chowning. */
3102
3103 if (stat(directory, &st) < 0)
3104 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
3105
3106 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
3107
3108 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000)))
3109 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
3110 "UID and GID base of %s don't match.", directory);
3111
3112 arg_uid_range = UINT32_C(0x10000);
3113
3114 if (arg_uid_shift != 0) {
3115 /* If the image is shifted already, then we'll fall back to classic chowning, for
3116 * compatibility (and simplicity), or refuse if mapping is explicitly requested. */
3117
3118 if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_AUTO) {
3119 log_debug("UID base of %s is non-zero, not using UID mapping.", directory);
3120 arg_userns_ownership = USER_NAMESPACE_OWNERSHIP_CHOWN;
3121 } else if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_MAP)
3122 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
3123 "UID base of %s is not zero, UID mapping not supported.", directory);
3124 }
3125 }
3126
3127 if (!userns_shift_range_valid(arg_uid_shift, arg_uid_range))
3128 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "UID base too high for UID range.");
3129
3130 return 0;
3131}
3132
3133static unsigned long effective_clone_ns_flags(void) {
3134 unsigned long flags = arg_clone_ns_flags;
3135
3136 if (arg_private_network)
3137 flags |= CLONE_NEWNET;
3138 if (arg_use_cgns)
3139 flags |= CLONE_NEWCGROUP;
3140 if (arg_userns_mode != USER_NAMESPACE_NO)
3141 flags |= CLONE_NEWUSER;
3142
3143 return flags;
3144}
3145
3146static int patch_sysctl(void) {
3147
3148 /* This table is inspired by runc's sysctl() function */
3149 static const struct {
3150 const char *key;
3151 bool prefix;
3152 unsigned long clone_flags;
3153 } safe_sysctl[] = {
3154 { "kernel.hostname", false, CLONE_NEWUTS },
3155 { "kernel.domainname", false, CLONE_NEWUTS },
3156 { "kernel.msgmax", false, CLONE_NEWIPC },
3157 { "kernel.msgmnb", false, CLONE_NEWIPC },
3158 { "kernel.msgmni", false, CLONE_NEWIPC },
3159 { "kernel.sem", false, CLONE_NEWIPC },
3160 { "kernel.shmall", false, CLONE_NEWIPC },
3161 { "kernel.shmmax", false, CLONE_NEWIPC },
3162 { "kernel.shmmni", false, CLONE_NEWIPC },
3163 { "fs.mqueue.", true, CLONE_NEWIPC },
3164 { "net.", true, CLONE_NEWNET },
3165 };
3166
3167 unsigned long flags;
3168 int r;
3169
3170 flags = effective_clone_ns_flags();
3171
3172 STRV_FOREACH_PAIR(k, v, arg_sysctl) {
3173 bool good = false;
3174 size_t i;
3175
3176 for (i = 0; i < ELEMENTSOF(safe_sysctl); i++) {
3177
3178 if (!FLAGS_SET(flags, safe_sysctl[i].clone_flags))
3179 continue;
3180
3181 if (safe_sysctl[i].prefix)
3182 good = startswith(*k, safe_sysctl[i].key);
3183 else
3184 good = streq(*k, safe_sysctl[i].key);
3185
3186 if (good)
3187 break;
3188 }
3189
3190 if (!good)
3191 return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Refusing to write to sysctl '%s', as it is not safe in the selected namespaces.", *k);
3192
3193 r = sysctl_write(*k, *v);
3194 if (r < 0)
3195 return log_error_errno(r, "Failed to write sysctl '%s': %m", *k);
3196 }
3197
3198 return 0;
3199}
3200
3201static int inner_child(
3202 Barrier *barrier,
3203 int fd_inner_socket,
3204 FDSet *fds,
3205 char **os_release_pairs) {
3206
3207 _cleanup_free_ char *home = NULL;
3208 size_t n_env = 1;
3209 char *envp[] = {
3210 (char*) "PATH=" DEFAULT_PATH_COMPAT,
3211 NULL, /* container */
3212 NULL, /* TERM */
3213 NULL, /* HOME */
3214 NULL, /* USER */
3215 NULL, /* LOGNAME */
3216 NULL, /* container_uuid */
3217 NULL, /* LISTEN_FDS */
3218 NULL, /* LISTEN_PID */
3219 NULL, /* NOTIFY_SOCKET */
3220 NULL, /* CREDENTIALS_DIRECTORY */
3221 NULL, /* LANG */
3222 NULL
3223 };
3224 const char *exec_target;
3225 _cleanup_strv_free_ char **env_use = NULL;
3226 int r, which_failed;
3227
3228 /* This is the "inner" child process, i.e. the one forked off by the "outer" child process, which is the one
3229 * the container manager itself forked off. At the time of clone() it gained its own CLONE_NEWNS, CLONE_NEWPID,
3230 * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER namespaces. Note that it has its own CLONE_NEWNS namespace,
3231 * separate from the CLONE_NEWNS created for the "outer" child, and also separate from the host's CLONE_NEWNS
3232 * namespace. The reason for having two levels of CLONE_NEWNS namespaces is that the "inner" one is owned by
3233 * the CLONE_NEWUSER namespace of the container, while the "outer" one is owned by the host's CLONE_NEWUSER
3234 * namespace.
3235 *
3236 * Note at this point we have no CLONE_NEWNET namespace yet. We'll acquire that one later through
3237 * unshare(). See below. */
3238
3239 assert(barrier);
3240 assert(fd_inner_socket >= 0);
3241
3242 log_debug("Inner child is initializing.");
3243
3244 if (arg_userns_mode != USER_NAMESPACE_NO) {
3245 /* Tell the parent, that it now can write the UID map. */
3246 (void) barrier_place(barrier); /* #1 */
3247
3248 /* Wait until the parent wrote the UID map */
3249 if (!barrier_place_and_sync(barrier)) /* #2 */
3250 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Parent died too early");
3251
3252 /* Become the new root user inside our namespace */
3253 r = reset_uid_gid();
3254 if (r < 0)
3255 return log_error_errno(r, "Couldn't become new root: %m");
3256
3257 /* Creating a new user namespace means all MS_SHARED mounts become MS_SLAVE. Let's put them
3258 * back to MS_SHARED here, since that's what we want as defaults. (This will not reconnect
3259 * propagation, but simply create new peer groups for all our mounts). */
3260 r = mount_follow_verbose(LOG_ERR, NULL, "/", NULL, MS_SHARED|MS_REC, NULL);
3261 if (r < 0)
3262 return r;
3263 }
3264
3265 r = mount_all(NULL,
3266 arg_mount_settings | MOUNT_IN_USERNS,
3267 arg_uid_shift,
3268 arg_selinux_apifs_context);
3269 if (r < 0)
3270 return r;
3271
3272 if (!arg_network_namespace_path && arg_private_network) {
3273 r = unshare(CLONE_NEWNET);
3274 if (r < 0)
3275 return log_error_errno(errno, "Failed to unshare network namespace: %m");
3276
3277 /* Tell the parent that it can setup network interfaces. */
3278 (void) barrier_place(barrier); /* #3 */
3279 }
3280
3281 r = mount_sysfs(NULL, arg_mount_settings);
3282 if (r < 0)
3283 return r;
3284
3285 /* Wait until we are cgroup-ified, so that we
3286 * can mount the right cgroup path writable */
3287 if (!barrier_place_and_sync(barrier)) /* #4 */
3288 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
3289 "Parent died too early");
3290
3291 if (arg_use_cgns) {
3292 r = unshare(CLONE_NEWCGROUP);
3293 if (r < 0)
3294 return log_error_errno(errno, "Failed to unshare cgroup namespace: %m");
3295 r = mount_cgroups(
3296 "",
3297 arg_unified_cgroup_hierarchy,
3298 arg_userns_mode != USER_NAMESPACE_NO,
3299 arg_uid_shift,
3300 arg_uid_range,
3301 arg_selinux_apifs_context,
3302 true);
3303 } else
3304 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
3305 if (r < 0)
3306 return r;
3307
3308 r = setup_boot_id();
3309 if (r < 0)
3310 return r;
3311
3312 r = setup_kmsg(fd_inner_socket);
3313 if (r < 0)
3314 return r;
3315
3316 r = mount_custom(
3317 "/",
3318 arg_custom_mounts,
3319 arg_n_custom_mounts,
3320 0,
3321 0,
3322 arg_selinux_apifs_context,
3323 MOUNT_NON_ROOT_ONLY | MOUNT_IN_USERNS);
3324 if (r < 0)
3325 return r;
3326
3327 if (setsid() < 0)
3328 return log_error_errno(errno, "setsid() failed: %m");
3329
3330 if (arg_private_network)
3331 (void) loopback_setup();
3332
3333 if (arg_expose_ports) {
3334 r = expose_port_send_rtnl(fd_inner_socket);
3335 if (r < 0)
3336 return r;
3337 }
3338
3339 if (arg_console_mode != CONSOLE_PIPE) {
3340 _cleanup_close_ int master = -EBADF;
3341 _cleanup_free_ char *console = NULL;
3342
3343 /* Allocate a pty and make it available as /dev/console. */
3344 master = openpt_allocate(O_RDWR|O_NONBLOCK, &console);
3345 if (master < 0)
3346 return log_error_errno(master, "Failed to allocate a pty: %m");
3347
3348 r = setup_dev_console(console);
3349 if (r < 0)
3350 return log_error_errno(r, "Failed to set up /dev/console: %m");
3351
3352 r = send_one_fd(fd_inner_socket, master, 0);
3353 if (r < 0)
3354 return log_error_errno(r, "Failed to send master fd: %m");
3355
3356 r = setup_stdio_as_dev_console();
3357 if (r < 0)
3358 return r;
3359 }
3360
3361 r = patch_sysctl();
3362 if (r < 0)
3363 return r;
3364
3365 if (arg_oom_score_adjust_set) {
3366 r = set_oom_score_adjust(arg_oom_score_adjust);
3367 if (r < 0)
3368 return log_error_errno(r, "Failed to adjust OOM score: %m");
3369 }
3370
3371 if (arg_cpu_set.set)
3372 if (sched_setaffinity(0, arg_cpu_set.allocated, arg_cpu_set.set) < 0)
3373 return log_error_errno(errno, "Failed to set CPU affinity: %m");
3374
3375 (void) setup_hostname();
3376
3377 if (arg_personality != PERSONALITY_INVALID) {
3378 r = safe_personality(arg_personality);
3379 if (r < 0)
3380 return log_error_errno(r, "personality() failed: %m");
3381#ifdef ARCHITECTURE_SECONDARY
3382 } else if (arg_architecture == ARCHITECTURE_SECONDARY) {
3383 r = safe_personality(PER_LINUX32);
3384 if (r < 0)
3385 return log_error_errno(r, "personality() failed: %m");
3386#endif
3387 } else if (!arg_quiet && arg_architecture >= 0 && arg_architecture != native_architecture())
3388 log_notice("Selected architecture '%s' not supported natively on the local CPU, assuming "
3389 "invocation with qemu userspace emulator (or equivalent) in effect.",
3390 architecture_to_string(arg_architecture));
3391
3392 r = setrlimit_closest_all((const struct rlimit *const*) arg_rlimit, &which_failed);
3393 if (r < 0)
3394 return log_error_errno(r, "Failed to apply resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
3395
3396#if HAVE_SECCOMP
3397 if (arg_seccomp) {
3398
3399 if (is_seccomp_available()) {
3400 r = seccomp_load(arg_seccomp);
3401 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
3402 return log_error_errno(r, "Failed to install seccomp filter: %m");
3403 if (r < 0)
3404 log_debug_errno(r, "Failed to install seccomp filter: %m");
3405 }
3406 } else
3407#endif
3408 {
3409 r = setup_seccomp(arg_caps_retain, arg_syscall_allow_list, arg_syscall_deny_list);
3410 if (r < 0)
3411 return r;
3412 }
3413
3414 if (arg_suppress_sync) {
3415#if HAVE_SECCOMP
3416 r = seccomp_suppress_sync();
3417 if (r < 0)
3418 log_debug_errno(r, "Failed to install sync() suppression seccomp filter, ignoring: %m");
3419#else
3420 log_debug("systemd is built without SECCOMP support. Ignoring --suppress-sync= command line option and SuppressSync= setting.");
3421#endif
3422 }
3423
3424#if HAVE_SELINUX
3425 if (arg_selinux_context)
3426 if (setexeccon(arg_selinux_context) < 0)
3427 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
3428#endif
3429
3430 /* Make sure we keep the caps across the uid/gid dropping, so that we can retain some selected caps
3431 * if we need to later on. */
3432 if (prctl(PR_SET_KEEPCAPS, 1) < 0)
3433 return log_error_errno(errno, "Failed to set PR_SET_KEEPCAPS: %m");
3434
3435 if (uid_is_valid(arg_uid) || gid_is_valid(arg_gid))
3436 r = change_uid_gid_raw(arg_uid, arg_gid, arg_supplementary_gids, arg_n_supplementary_gids, arg_console_mode != CONSOLE_PIPE);
3437 else
3438 r = change_uid_gid(arg_user, arg_console_mode != CONSOLE_PIPE, &home);
3439 if (r < 0)
3440 return r;
3441
3442 r = drop_capabilities(getuid());
3443 if (r < 0)
3444 return log_error_errno(r, "Dropping capabilities failed: %m");
3445
3446 if (arg_no_new_privileges)
3447 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
3448 return log_error_errno(errno, "Failed to disable new privileges: %m");
3449
3450 /* LXC sets container=lxc, so follow the scheme here */
3451 envp[n_env++] = strjoina("container=", arg_container_service_name);
3452
3453 envp[n_env] = strv_find_prefix(environ, "TERM=");
3454 if (envp[n_env])
3455 n_env++;
3456
3457 if (home || !uid_is_valid(arg_uid) || arg_uid == 0)
3458 if (asprintf(envp + n_env++, "HOME=%s", home ?: "/root") < 0)
3459 return log_oom();
3460
3461 if (arg_user || !uid_is_valid(arg_uid) || arg_uid == 0)
3462 if (asprintf(envp + n_env++, "USER=%s", arg_user ?: "root") < 0 ||
3463 asprintf(envp + n_env++, "LOGNAME=%s", arg_user ?: "root") < 0)
3464 return log_oom();
3465
3466 assert(!sd_id128_is_null(arg_uuid));
3467
3468 if (asprintf(envp + n_env++, "container_uuid=%s", SD_ID128_TO_UUID_STRING(arg_uuid)) < 0)
3469 return log_oom();
3470
3471 if (!fdset_isempty(fds)) {
3472 r = fdset_cloexec(fds, false);
3473 if (r < 0)
3474 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
3475
3476 if ((asprintf(envp + n_env++, "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
3477 (asprintf(envp + n_env++, "LISTEN_PID=1") < 0))
3478 return log_oom();
3479 }
3480 if (asprintf(envp + n_env++, "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
3481 return log_oom();
3482
3483 if (arg_credentials.n_credentials > 0) {
3484 envp[n_env] = strdup("CREDENTIALS_DIRECTORY=/run/host/credentials");
3485 if (!envp[n_env])
3486 return log_oom();
3487 n_env++;
3488 }
3489
3490 if (arg_start_mode != START_BOOT) {
3491 envp[n_env] = strdup("LANG=" SYSTEMD_NSPAWN_LOCALE);
3492 if (!envp[n_env])
3493 return log_oom();
3494 n_env++;
3495 }
3496
3497 env_use = strv_env_merge(envp, os_release_pairs, arg_setenv);
3498 if (!env_use)
3499 return log_oom();
3500
3501 /* Let the parent know that we are ready and wait until the parent is ready with the setup, too... */
3502 if (!barrier_place_and_sync(barrier)) /* #5 */
3503 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Parent died too early");
3504
3505 /* Note, this should be done this late (💣 and not moved earlier! 💣), so that all namespacing
3506 * changes are already in effect by now, so that any resolved paths here definitely reference
3507 * resources inside the container, and not outside of them. */
3508 if (arg_chdir)
3509 if (chdir(arg_chdir) < 0)
3510 return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
3511
3512 if (arg_start_mode == START_PID2) {
3513 r = stub_pid1(arg_uuid);
3514 if (r < 0)
3515 return r;
3516 }
3517
3518 if (arg_console_mode != CONSOLE_PIPE) {
3519 /* So far our pty wasn't controlled by any process. Finally, it's time to change that, if we
3520 * are configured for that. Acquire it as controlling tty. */
3521 if (ioctl(STDIN_FILENO, TIOCSCTTY) < 0)
3522 return log_error_errno(errno, "Failed to acquire controlling TTY: %m");
3523 }
3524
3525 log_debug("Inner child completed, invoking payload.");
3526
3527 /* Now, explicitly close the log, so that we then can close all remaining fds. Closing the log explicitly first
3528 * has the benefit that the logging subsystem knows about it, and is thus ready to be reopened should we need
3529 * it again. Note that the other fds closed here are at least the locking and barrier fds. */
3530 log_close();
3531 log_set_open_when_needed(true);
3532 log_settle_target();
3533
3534 (void) fdset_close_others(fds);
3535
3536 if (arg_start_mode == START_BOOT) {
3537 char **a;
3538 size_t m;
3539
3540 /* Automatically search for the init system */
3541
3542 m = strv_length(arg_parameters);
3543 a = newa(char*, m + 2);
3544 memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
3545 a[1 + m] = NULL;
3546
3547 FOREACH_STRING(init,
3548 "/usr/lib/systemd/systemd",
3549 "/lib/systemd/systemd",
3550 "/sbin/init") {
3551 a[0] = (char*) init;
3552 execve(a[0], a, env_use);
3553 }
3554
3555 exec_target = "/usr/lib/systemd/systemd, /lib/systemd/systemd, /sbin/init";
3556 } else if (!strv_isempty(arg_parameters)) {
3557 const char *dollar_path;
3558
3559 exec_target = arg_parameters[0];
3560
3561 /* Use the user supplied search $PATH if there is one, or DEFAULT_PATH_COMPAT if not to search the
3562 * binary. */
3563 dollar_path = strv_env_get(env_use, "PATH");
3564 if (dollar_path) {
3565 if (setenv("PATH", dollar_path, 1) < 0)
3566 return log_error_errno(errno, "Failed to update $PATH: %m");
3567 }
3568
3569 execvpe(arg_parameters[0], arg_parameters, env_use);
3570 } else {
3571 if (!arg_chdir)
3572 /* If we cannot change the directory, we'll end up in /, that is expected. */
3573 (void) chdir(home ?: "/root");
3574
3575 execle(DEFAULT_USER_SHELL, "-" DEFAULT_USER_SHELL_NAME, NULL, env_use);
3576 if (!streq(DEFAULT_USER_SHELL, "/bin/bash"))
3577 execle("/bin/bash", "-bash", NULL, env_use);
3578 if (!streq(DEFAULT_USER_SHELL, "/bin/sh"))
3579 execle("/bin/sh", "-sh", NULL, env_use);
3580
3581 exec_target = DEFAULT_USER_SHELL ", /bin/bash, /bin/sh";
3582 }
3583
3584 return log_error_errno(errno, "execv(%s) failed: %m", exec_target);
3585}
3586
3587static int setup_notify_child(void) {
3588 _cleanup_close_ int fd = -EBADF;
3589 static const union sockaddr_union sa = {
3590 .un.sun_family = AF_UNIX,
3591 .un.sun_path = NSPAWN_NOTIFY_SOCKET_PATH,
3592 };
3593 int r;
3594
3595 fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
3596 if (fd < 0)
3597 return log_error_errno(errno, "Failed to allocate notification socket: %m");
3598
3599 (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
3600 (void) sockaddr_un_unlink(&sa.un);
3601
3602 WITH_UMASK(0577) { /* only set "w" bit, which is all that's necessary for connecting from the container */
3603 r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
3604 if (r < 0)
3605 return log_error_errno(errno, "bind(" NSPAWN_NOTIFY_SOCKET_PATH ") failed: %m");
3606 }
3607
3608 r = userns_lchown(NSPAWN_NOTIFY_SOCKET_PATH, 0, 0);
3609 if (r < 0)
3610 return log_error_errno(r, "Failed to chown " NSPAWN_NOTIFY_SOCKET_PATH ": %m");
3611
3612 r = setsockopt_int(fd, SOL_SOCKET, SO_PASSCRED, true);
3613 if (r < 0)
3614 return log_error_errno(r, "SO_PASSCRED failed: %m");
3615
3616 return TAKE_FD(fd);
3617}
3618
3619static int setup_unix_export_dir_outside(char **ret) {
3620 int r;
3621
3622 assert(ret);
3623
3624 _cleanup_free_ char *p = NULL;
3625 p = path_join("/run/systemd/nspawn/unix-export", arg_machine);
3626 if (!p)
3627 return log_oom();
3628
3629 r = path_is_mount_point(p);
3630 if (r > 0)
3631 return log_error_errno(SYNTHETIC_ERRNO(EEXIST), "Mount point '%s' exists already, refusing.", p);
3632 if (r < 0 && r != -ENOENT)
3633 return log_error_errno(r, "Failed to detect if '%s' is a mount point: %m", p);
3634
3635 r = mkdir_p(p, 0755);
3636 if (r < 0)
3637 return log_error_errno(r, "Failed to create '%s': %m", p);
3638
3639 _cleanup_(rmdir_and_freep) char *q = TAKE_PTR(p);
3640
3641 /* Mount the "unix export" directory really tiny, just 64 inodes. We mark the superblock writable
3642 * (since the container shall bind sockets into it). */
3643 r = mount_nofollow_verbose(
3644 LOG_ERR,
3645 "tmpfs",
3646 q,
3647 "tmpfs",
3648 MS_NODEV|MS_NOEXEC|MS_NOSUID|ms_nosymfollow_supported(),
3649 "size=4M,nr_inodes=64,mode=0755");
3650 if (r < 0)
3651 return r;
3652
3653 _cleanup_(umount_and_rmdir_and_freep) char *w = TAKE_PTR(q);
3654
3655 /* After creating the superblock we change the bind mount to be read-only. This means that the fs
3656 * itself is writable, but not through the mount accessible from the host. */
3657 r = mount_nofollow_verbose(
3658 LOG_ERR,
3659 /* source= */ NULL,
3660 w,
3661 /* fstype= */ NULL,
3662 MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NODEV|MS_NOEXEC|MS_NOSUID|ms_nosymfollow_supported(),
3663 /* options= */ NULL);
3664 if (r < 0)
3665 return r;
3666
3667 *ret = TAKE_PTR(w);
3668 return 0;
3669}
3670
3671static int setup_unix_export_host_inside(const char *directory, const char *unix_export_path) {
3672 int r;
3673
3674 assert(directory);
3675 assert(unix_export_path);
3676
3677 r = make_run_host(directory);
3678 if (r < 0)
3679 return r;
3680
3681 _cleanup_free_ char *p = path_join(directory, "run/host/unix-export");
3682 if (!p)
3683 return log_oom();
3684
3685 if (mkdir(p, 0755) < 0)
3686 return log_error_errno(errno, "Failed to create '%s': %m", p);
3687
3688 r = mount_nofollow_verbose(
3689 LOG_ERR,
3690 unix_export_path,
3691 p,
3692 /* fstype= */ NULL,
3693 MS_BIND,
3694 /* options= */ NULL);
3695 if (r < 0)
3696 return r;
3697
3698 r = mount_nofollow_verbose(
3699 LOG_ERR,
3700 /* source= */ NULL,
3701 p,
3702 /* fstype= */ NULL,
3703 MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID|ms_nosymfollow_supported(),
3704 /* options= */ NULL);
3705 if (r < 0)
3706 return r;
3707
3708 r = userns_lchown(p, 0, 0);
3709 if (r < 0)
3710 return log_error_errno(r, "Failed to chown '%s': %m", p);
3711
3712 return 0;
3713}
3714
3715static DissectImageFlags determine_dissect_image_flags(void) {
3716 return
3717 DISSECT_IMAGE_USR_NO_ROOT |
3718 DISSECT_IMAGE_DISCARD_ON_LOOP |
3719 (arg_read_only ? DISSECT_IMAGE_READ_ONLY : DISSECT_IMAGE_FSCK|DISSECT_IMAGE_GROWFS) |
3720 DISSECT_IMAGE_ALLOW_USERSPACE_VERITY;
3721}
3722
3723static int outer_child(
3724 Barrier *barrier,
3725 const char *directory,
3726 DissectedImage *dissected_image,
3727 int fd_outer_socket,
3728 int fd_inner_socket,
3729 FDSet *fds,
3730 int netns_fd,
3731 const char *unix_export_path) {
3732
3733 _cleanup_(bind_user_context_freep) BindUserContext *bind_user_context = NULL;
3734 _cleanup_strv_free_ char **os_release_pairs = NULL;
3735 _cleanup_close_ int fd = -EBADF, mntns_fd = -EBADF;
3736 bool idmap = false;
3737 const char *p;
3738 pid_t pid;
3739 ssize_t l;
3740 int r;
3741
3742 /* This is the "outer" child process, i.e the one forked off by the container manager itself. It
3743 * already has its own CLONE_NEWNS namespace (which was created by the clone()). It still lives in
3744 * the host's CLONE_NEWPID, CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER and CLONE_NEWNET
3745 * namespaces. After it completed a number of initializations a second child (the "inner" one) is
3746 * forked off it, and it exits. */
3747
3748 assert(barrier);
3749 assert(directory);
3750 assert(fd_outer_socket >= 0);
3751 assert(fd_inner_socket >= 0);
3752
3753 log_debug("Outer child is initializing.");
3754
3755 r = load_os_release_pairs_with_prefix("/", "container_host_", &os_release_pairs);
3756 if (r < 0)
3757 log_debug_errno(r, "Failed to read os-release from host for container, ignoring: %m");
3758
3759 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
3760 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3761
3762 r = reset_audit_loginuid();
3763 if (r < 0)
3764 return r;
3765
3766 /* Mark everything as slave, so that we still receive mounts from the real root, but don't propagate
3767 * mounts to the real root. */
3768 r = mount_follow_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
3769 if (r < 0)
3770 return r;
3771
3772 if (dissected_image) {
3773 /* If we are operating on a disk image, then mount its root directory now, but leave out the
3774 * rest. We can read the UID shift from it if we need to. Further down we'll mount the rest,
3775 * but then with the uid shift known. That way we can mount VFAT file systems shifted to the
3776 * right place right away. This makes sure ESP partitions and userns are compatible. */
3777
3778 r = dissected_image_mount_and_warn(
3779 dissected_image,
3780 directory,
3781 arg_uid_shift,
3782 arg_uid_range,
3783 /* userns_fd= */ -EBADF,
3784 determine_dissect_image_flags()|
3785 DISSECT_IMAGE_MOUNT_ROOT_ONLY|
3786 (arg_start_mode == START_BOOT ? DISSECT_IMAGE_VALIDATE_OS : 0));
3787 if (r < 0)
3788 return r;
3789 }
3790
3791 r = determine_uid_shift(directory);
3792 if (r < 0)
3793 return r;
3794
3795 if (arg_userns_mode != USER_NAMESPACE_NO) {
3796 r = namespace_open(0,
3797 /* ret_pidns_fd = */ NULL,
3798 &mntns_fd,
3799 /* ret_netns_fd = */ NULL,
3800 /* ret_userns_fd = */ NULL,
3801 /* ret_root_fd = */ NULL);
3802 if (r < 0)
3803 return log_error_errno(r, "Failed to pin outer mount namespace: %m");
3804
3805 l = send_one_fd(fd_outer_socket, mntns_fd, 0);
3806 if (l < 0)
3807 return log_error_errno(l, "Failed to send outer mount namespace fd: %m");
3808 mntns_fd = safe_close(mntns_fd);
3809
3810 /* Let the parent know which UID shift we read from the image */
3811 l = send(fd_outer_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
3812 if (l < 0)
3813 return log_error_errno(errno, "Failed to send UID shift: %m");
3814 if (l != sizeof(arg_uid_shift))
3815 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3816 "Short write while sending UID shift.");
3817
3818 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3819 /* When we are supposed to pick the UID shift, the parent will check now whether the
3820 * UID shift we just read from the image is available. If yes, it will send the UID
3821 * shift back to us, if not it will pick a different one, and send it back to us. */
3822
3823 l = recv(fd_outer_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
3824 if (l < 0)
3825 return log_error_errno(errno, "Failed to recv UID shift: %m");
3826 if (l != sizeof(arg_uid_shift))
3827 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3828 "Short read while receiving UID shift.");
3829 }
3830
3831 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
3832 "Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
3833 }
3834
3835 if (path_equal(directory, "/")) {
3836 /* If the directory we shall boot is the host, let's operate on a bind mount at a different
3837 * place, so that we can make changes to its mount structure (for example, to implement
3838 * --volatile=) without this interfering with our ability to access files such as
3839 * /etc/localtime to copy into the container. Note that we use a fixed place for this
3840 * (instead of a temporary directory, since we are living in our own mount namespace here
3841 * already, and thus don't need to be afraid of colliding with anyone else's mounts). */
3842 (void) mkdir_p("/run/systemd/nspawn-root", 0755);
3843
3844 r = mount_nofollow_verbose(LOG_ERR, "/", "/run/systemd/nspawn-root", NULL, MS_BIND|MS_REC, NULL);
3845 if (r < 0)
3846 return r;
3847
3848 directory = "/run/systemd/nspawn-root";
3849 }
3850
3851 /* Make sure we always have a mount that we can move to root later on. */
3852 r = make_mount_point(directory);
3853 if (r < 0)
3854 return r;
3855
3856 /* So the whole tree is now MS_SLAVE, i.e. we'll still receive mount/umount events from the host
3857 * mount namespace. For the directory we are going to run our container let's turn this off, so that
3858 * we'll live in our own little world from now on, and propagation from the host may only happen via
3859 * the mount tunnel dir, or not at all. */
3860 r = mount_follow_verbose(LOG_ERR, NULL, directory, NULL, MS_PRIVATE|MS_REC, NULL);
3861 if (r < 0)
3862 return r;
3863
3864 r = setup_pivot_root(
3865 directory,
3866 arg_pivot_root_new,
3867 arg_pivot_root_old);
3868 if (r < 0)
3869 return r;
3870
3871 r = setup_volatile_mode(
3872 directory,
3873 arg_volatile_mode,
3874 arg_uid_shift,
3875 arg_selinux_apifs_context);
3876 if (r < 0)
3877 return r;
3878
3879 r = bind_user_prepare(
3880 directory,
3881 arg_bind_user,
3882 arg_uid_shift,
3883 arg_uid_range,
3884 &arg_custom_mounts, &arg_n_custom_mounts,
3885 &bind_user_context);
3886 if (r < 0)
3887 return r;
3888
3889 if (arg_userns_mode != USER_NAMESPACE_NO && bind_user_context) {
3890 /* Send the user maps we determined to the parent, so that it installs it in our user
3891 * namespace UID map table */
3892
3893 for (size_t i = 0; i < bind_user_context->n_data; i++) {
3894 uid_t map[] = {
3895 bind_user_context->data[i].payload_user->uid,
3896 bind_user_context->data[i].host_user->uid,
3897 (uid_t) bind_user_context->data[i].payload_group->gid,
3898 (uid_t) bind_user_context->data[i].host_group->gid,
3899 };
3900
3901 l = send(fd_outer_socket, map, sizeof(map), MSG_NOSIGNAL);
3902 if (l < 0)
3903 return log_error_errno(errno, "Failed to send user UID map: %m");
3904 if (l != sizeof(map))
3905 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3906 "Short write while sending user UID map.");
3907 }
3908 }
3909
3910 r = mount_custom(
3911 directory,
3912 arg_custom_mounts,
3913 arg_n_custom_mounts,
3914 arg_uid_shift,
3915 arg_uid_range,
3916 arg_selinux_apifs_context,
3917 MOUNT_ROOT_ONLY);
3918 if (r < 0)
3919 return r;
3920
3921 if (arg_userns_mode != USER_NAMESPACE_NO &&
3922 IN_SET(arg_userns_ownership, USER_NAMESPACE_OWNERSHIP_MAP, USER_NAMESPACE_OWNERSHIP_AUTO) &&
3923 arg_uid_shift != 0) {
3924 _cleanup_free_ char *usr_subtree = NULL;
3925 char *dirs[3];
3926 size_t i = 0;
3927
3928 dirs[i++] = (char*) directory;
3929
3930 if (dissected_image && dissected_image->partitions[PARTITION_USR].found) {
3931 usr_subtree = path_join(directory, "/usr");
3932 if (!usr_subtree)
3933 return log_oom();
3934
3935 dirs[i++] = usr_subtree;
3936 }
3937
3938 dirs[i] = NULL;
3939
3940 r = remount_idmap(dirs, arg_uid_shift, arg_uid_range, UID_INVALID, UID_INVALID, REMOUNT_IDMAPPING_HOST_ROOT);
3941 if (r == -EINVAL || ERRNO_IS_NEG_NOT_SUPPORTED(r)) {
3942 /* This might fail because the kernel or file system doesn't support idmapping. We
3943 * can't really distinguish this nicely, nor do we have any guarantees about the
3944 * error codes we see, could be EOPNOTSUPP or EINVAL. */
3945 if (arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_AUTO)
3946 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
3947 "ID mapped mounts are apparently not available, sorry.");
3948
3949 log_debug("ID mapped mounts are apparently not available on this kernel or for the selected file system, reverting to recursive chown()ing.");
3950 arg_userns_ownership = USER_NAMESPACE_OWNERSHIP_CHOWN;
3951 } else if (r < 0)
3952 return log_error_errno(r, "Failed to set up ID mapped mounts: %m");
3953 else {
3954 log_debug("ID mapped mounts available, making use of them.");
3955 idmap = true;
3956 }
3957 }
3958
3959 if (dissected_image) {
3960 /* Now we know the uid shift, let's now mount everything else that might be in the image. */
3961 r = dissected_image_mount(
3962 dissected_image,
3963 directory,
3964 arg_uid_shift,
3965 arg_uid_range,
3966 /* userns_fd= */ -EBADF,
3967 determine_dissect_image_flags()|
3968 DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY|
3969 (idmap ? DISSECT_IMAGE_MOUNT_IDMAPPED : 0));
3970 if (r == -EUCLEAN)
3971 return log_error_errno(r, "File system check for image failed: %m");
3972 if (r < 0)
3973 return log_error_errno(r, "Failed to mount image file system: %m");
3974 }
3975
3976 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
3977 /* OK, we don't know yet which cgroup mode to use yet. Let's figure it out, and tell the parent. */
3978
3979 r = detect_unified_cgroup_hierarchy_from_image(directory);
3980 if (r < 0)
3981 return r;
3982
3983 l = send(fd_outer_socket, &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), MSG_NOSIGNAL);
3984 if (l < 0)
3985 return log_error_errno(errno, "Failed to send cgroup mode: %m");
3986 if (l != sizeof(arg_unified_cgroup_hierarchy))
3987 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3988 "Short write while sending cgroup mode.");
3989 }
3990
3991 r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
3992 if (r < 0)
3993 return r;
3994
3995 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
3996 if (r < 0)
3997 return r;
3998
3999 if (arg_read_only && arg_volatile_mode == VOLATILE_NO &&
4000 !has_custom_root_mount(arg_custom_mounts, arg_n_custom_mounts)) {
4001 r = bind_remount_recursive(directory, MS_RDONLY, MS_RDONLY, NULL);
4002 if (r < 0)
4003 return log_error_errno(r, "Failed to make tree read-only: %m");
4004 }
4005
4006 r = mount_all(directory,
4007 arg_mount_settings,
4008 arg_uid_shift,
4009 arg_selinux_apifs_context);
4010 if (r < 0)
4011 return r;
4012
4013 r = copy_devnodes(directory);
4014 if (r < 0)
4015 return r;
4016
4017 r = make_extra_nodes(directory);
4018 if (r < 0)
4019 return r;
4020
4021 (void) dev_setup(directory, arg_uid_shift, arg_uid_shift);
4022
4023 p = prefix_roota(directory, "/run/host");
4024 (void) make_inaccessible_nodes(p, arg_uid_shift, arg_uid_shift);
4025
4026 r = setup_unix_export_host_inside(directory, unix_export_path);
4027 if (r < 0)
4028 return r;
4029
4030 r = setup_pts(directory);
4031 if (r < 0)
4032 return r;
4033
4034 r = mount_tunnel_dig(directory);
4035 if (r < 0)
4036 return r;
4037
4038 r = setup_keyring();
4039 if (r < 0)
4040 return r;
4041
4042 r = setup_credentials(directory);
4043 if (r < 0)
4044 return r;
4045
4046 r = bind_user_setup(bind_user_context, directory);
4047 if (r < 0)
4048 return r;
4049
4050 r = mount_custom(
4051 directory,
4052 arg_custom_mounts,
4053 arg_n_custom_mounts,
4054 arg_uid_shift,
4055 arg_uid_range,
4056 arg_selinux_apifs_context,
4057 MOUNT_NON_ROOT_ONLY);
4058 if (r < 0)
4059 return r;
4060
4061 r = setup_timezone(directory);
4062 if (r < 0)
4063 return r;
4064
4065 r = setup_resolv_conf(directory);
4066 if (r < 0)
4067 return r;
4068
4069 r = setup_machine_id(directory);
4070 if (r < 0)
4071 return r;
4072
4073 r = setup_journal(directory);
4074 if (r < 0)
4075 return r;
4076
4077 /* The same stuff as the $container env var, but nicely readable for the entire payload */
4078 p = prefix_roota(directory, "/run/host/container-manager");
4079 (void) write_string_file(p, arg_container_service_name, WRITE_STRING_FILE_CREATE|WRITE_STRING_FILE_MODE_0444);
4080
4081 /* The same stuff as the $container_uuid env var */
4082 p = prefix_roota(directory, "/run/host/container-uuid");
4083 (void) write_string_filef(p, WRITE_STRING_FILE_CREATE|WRITE_STRING_FILE_MODE_0444, SD_ID128_UUID_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid));
4084
4085 if (!arg_use_cgns) {
4086 r = mount_cgroups(
4087 directory,
4088 arg_unified_cgroup_hierarchy,
4089 arg_userns_mode != USER_NAMESPACE_NO,
4090 arg_uid_shift,
4091 arg_uid_range,
4092 arg_selinux_apifs_context,
4093 false);
4094 if (r < 0)
4095 return r;
4096 }
4097
4098 /* Mark everything as shared so our mounts get propagated down. This is required to make new bind
4099 * mounts available in systemd services inside the container that create a new mount namespace. See
4100 * https://github.com/systemd/systemd/issues/3860 Further submounts (such as /dev) done after this
4101 * will inherit the shared propagation mode.
4102 *
4103 * IMPORTANT: Do not overmount the root directory anymore from now on to enable moving the root
4104 * directory mount to root later on.
4105 * https://github.com/systemd/systemd/issues/3847#issuecomment-562735251
4106 */
4107 r = mount_switch_root(directory, MS_SHARED);
4108 if (r < 0)
4109 return log_error_errno(r, "Failed to move root directory: %m");
4110
4111 /* We finished setting up the rootfs which is a shared mount. The mount tunnel needs to be a
4112 * dependent mount otherwise we can't MS_MOVE mounts that were propagated from the host into
4113 * the container. */
4114 r = mount_tunnel_open();
4115 if (r < 0)
4116 return r;
4117
4118 if (arg_userns_mode != USER_NAMESPACE_NO) {
4119 /* In order to mount procfs and sysfs in an unprivileged container the kernel
4120 * requires that a fully visible instance is already present in the target mount
4121 * namespace. Mount one here so the inner child can mount its own instances. Later
4122 * we umount the temporary instances created here before we actually exec the
4123 * payload. Since the rootfs is shared the umount will propagate into the container.
4124 * Note, the inner child wouldn't be able to unmount the instances on its own since
4125 * it doesn't own the originating mount namespace. IOW, the outer child needs to do
4126 * this. */
4127 r = pin_fully_visible_fs();
4128 if (r < 0)
4129 return r;
4130 }
4131
4132 fd = setup_notify_child();
4133 if (fd < 0)
4134 return fd;
4135
4136 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
4137 arg_clone_ns_flags |
4138 (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
4139 if (pid < 0)
4140 return log_error_errno(errno, "Failed to fork inner child: %m");
4141 if (pid == 0) {
4142 fd_outer_socket = safe_close(fd_outer_socket);
4143
4144 /* The inner child has all namespaces that are requested, so that we all are owned by the
4145 * user if user namespaces are turned on. */
4146
4147 if (arg_network_namespace_path) {
4148 r = namespace_enter(/* pidns_fd = */ -EBADF,
4149 /* mntns_fd = */ -EBADF,
4150 netns_fd,
4151 /* userns_fd = */ -EBADF,
4152 /* root_fd = */ -EBADF);
4153 if (r < 0)
4154 return log_error_errno(r, "Failed to join network namespace: %m");
4155 }
4156
4157 r = inner_child(barrier, fd_inner_socket, fds, os_release_pairs);
4158 if (r < 0)
4159 _exit(EXIT_FAILURE);
4160
4161 _exit(EXIT_SUCCESS);
4162 }
4163
4164 l = send(fd_outer_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
4165 if (l < 0)
4166 return log_error_errno(errno, "Failed to send PID: %m");
4167 if (l != sizeof(pid))
4168 return log_error_errno(SYNTHETIC_ERRNO(EIO),
4169 "Short write while sending PID.");
4170
4171 l = send(fd_outer_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
4172 if (l < 0)
4173 return log_error_errno(errno, "Failed to send machine ID: %m");
4174 if (l != sizeof(arg_uuid))
4175 return log_error_errno(SYNTHETIC_ERRNO(EIO),
4176 "Short write while sending machine ID.");
4177
4178 l = send_one_fd(fd_outer_socket, fd, 0);
4179 if (l < 0)
4180 return log_error_errno(l, "Failed to send notify fd: %m");
4181
4182 fd_outer_socket = safe_close(fd_outer_socket);
4183 fd_inner_socket = safe_close(fd_inner_socket);
4184 netns_fd = safe_close(netns_fd);
4185
4186 return 0;
4187}
4188
4189static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
4190 bool tried_hashed = false;
4191 unsigned n_tries = 100;
4192 uid_t candidate;
4193 int r;
4194
4195 assert(shift);
4196 assert(ret_lock_file);
4197 assert(arg_userns_mode == USER_NAMESPACE_PICK);
4198 assert(arg_uid_range == 0x10000U);
4199
4200 candidate = *shift;
4201
4202 (void) mkdir("/run/systemd/nspawn-uid", 0755);
4203
4204 for (;;) {
4205 char lock_path[STRLEN("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
4206 _cleanup_(release_lock_file) LockFile lf = LOCK_FILE_INIT;
4207
4208 if (--n_tries <= 0)
4209 return -EBUSY;
4210
4211 if (candidate < CONTAINER_UID_BASE_MIN || candidate > CONTAINER_UID_BASE_MAX)
4212 goto next;
4213 if ((candidate & UINT32_C(0xFFFF)) != 0)
4214 goto next;
4215
4216 xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
4217 r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
4218 if (r == -EBUSY) /* Range already taken by another nspawn instance */
4219 goto next;
4220 if (r < 0)
4221 return r;
4222
4223 /* Make some superficial checks whether the range is currently known in the user database */
4224 if (getpwuid_malloc(candidate, /* ret= */ NULL) >= 0)
4225 goto next;
4226 if (getpwuid_malloc(candidate + UINT32_C(0xFFFE), /* ret= */ NULL) >= 0)
4227 goto next;
4228 if (getgrgid_malloc(candidate, /* ret= */ NULL) >= 0)
4229 goto next;
4230 if (getgrgid_malloc(candidate + UINT32_C(0xFFFE), /* ret= */ NULL) >= 0)
4231 goto next;
4232
4233 *ret_lock_file = lf;
4234 lf = (struct LockFile) LOCK_FILE_INIT;
4235 *shift = candidate;
4236 return 0;
4237
4238 next:
4239 if (arg_machine && !tried_hashed) {
4240 /* Try to hash the base from the container name */
4241
4242 static const uint8_t hash_key[] = {
4243 0xe1, 0x56, 0xe0, 0xf0, 0x4a, 0xf0, 0x41, 0xaf,
4244 0x96, 0x41, 0xcf, 0x41, 0x33, 0x94, 0xff, 0x72
4245 };
4246
4247 candidate = (uid_t) siphash24(arg_machine, strlen(arg_machine), hash_key);
4248
4249 tried_hashed = true;
4250 } else
4251 random_bytes(&candidate, sizeof(candidate));
4252
4253 candidate = (candidate % (CONTAINER_UID_BASE_MAX - CONTAINER_UID_BASE_MIN)) + CONTAINER_UID_BASE_MIN;
4254 candidate &= (uid_t) UINT32_C(0xFFFF0000);
4255 }
4256}
4257
4258static int add_one_uid_map(
4259 char **p,
4260 uid_t container_uid,
4261 uid_t host_uid,
4262 uid_t range) {
4263
4264 return strextendf(p,
4265 UID_FMT " " UID_FMT " " UID_FMT "\n",
4266 container_uid, host_uid, range);
4267}
4268
4269static int make_uid_map_string(
4270 const uid_t bind_user_uid[],
4271 size_t n_bind_user_uid,
4272 size_t offset,
4273 char **ret) {
4274
4275 _cleanup_free_ char *s = NULL;
4276 uid_t previous_uid = 0;
4277 int r;
4278
4279 assert(n_bind_user_uid == 0 || bind_user_uid);
4280 assert(IN_SET(offset, 0, 2)); /* used to switch between UID and GID map */
4281 assert(ret);
4282
4283 /* The bind_user_uid[] array is a series of 4 uid_t values, for each --bind-user= entry one
4284 * quadruplet, consisting of host and container UID + GID. */
4285
4286 for (size_t i = 0; i < n_bind_user_uid; i++) {
4287 uid_t payload_uid = bind_user_uid[i*4+offset],
4288 host_uid = bind_user_uid[i*4+offset+1];
4289
4290 assert(previous_uid <= payload_uid);
4291 assert(payload_uid < arg_uid_range);
4292
4293 /* Add a range to close the gap to previous entry */
4294 if (payload_uid > previous_uid) {
4295 r = add_one_uid_map(&s, previous_uid, arg_uid_shift + previous_uid, payload_uid - previous_uid);
4296 if (r < 0)
4297 return r;
4298 }
4299
4300 /* Map this specific user */
4301 r = add_one_uid_map(&s, payload_uid, host_uid, 1);
4302 if (r < 0)
4303 return r;
4304
4305 previous_uid = payload_uid + 1;
4306 }
4307
4308 /* And add a range to close the gap to finish the range */
4309 if (arg_uid_range > previous_uid) {
4310 r = add_one_uid_map(&s, previous_uid, arg_uid_shift + previous_uid, arg_uid_range - previous_uid);
4311 if (r < 0)
4312 return r;
4313 }
4314
4315 assert(s);
4316
4317 *ret = TAKE_PTR(s);
4318 return 0;
4319}
4320
4321static int setup_uid_map(
4322 pid_t pid,
4323 const uid_t bind_user_uid[],
4324 size_t n_bind_user_uid) {
4325
4326 char uid_map[STRLEN("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1];
4327 _cleanup_free_ char *s = NULL;
4328 int r;
4329
4330 assert(pid > 1);
4331
4332 /* Build the UID map string */
4333 if (make_uid_map_string(bind_user_uid, n_bind_user_uid, 0, &s) < 0) /* offset=0 contains the UID pair */
4334 return log_oom();
4335
4336 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
4337 r = write_string_file(uid_map, s, WRITE_STRING_FILE_DISABLE_BUFFER);
4338 if (r < 0)
4339 return log_error_errno(r, "Failed to write UID map: %m");
4340
4341 /* And now build the GID map string */
4342 s = mfree(s);
4343 if (make_uid_map_string(bind_user_uid, n_bind_user_uid, 2, &s) < 0) /* offset=2 contains the GID pair */
4344 return log_oom();
4345
4346 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
4347 r = write_string_file(uid_map, s, WRITE_STRING_FILE_DISABLE_BUFFER);
4348 if (r < 0)
4349 return log_error_errno(r, "Failed to write GID map: %m");
4350
4351 return 0;
4352}
4353
4354static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
4355 char buf[NOTIFY_BUFFER_MAX+1];
4356 char *p = NULL;
4357 struct iovec iovec = {
4358 .iov_base = buf,
4359 .iov_len = sizeof(buf)-1,
4360 };
4361 CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred)) +
4362 CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)) control;
4363 struct msghdr msghdr = {
4364 .msg_iov = &iovec,
4365 .msg_iovlen = 1,
4366 .msg_control = &control,
4367 .msg_controllen = sizeof(control),
4368 };
4369 struct ucred *ucred;
4370 ssize_t n;
4371 pid_t inner_child_pid;
4372 _cleanup_strv_free_ char **tags = NULL;
4373 int r;
4374
4375 assert(userdata);
4376
4377 inner_child_pid = PTR_TO_PID(userdata);
4378
4379 if (revents != EPOLLIN) {
4380 log_warning("Got unexpected poll event for notify fd.");
4381 return 0;
4382 }
4383
4384 n = recvmsg_safe(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
4385 if (ERRNO_IS_NEG_TRANSIENT(n))
4386 return 0;
4387 else if (n == -EXFULL) {
4388 log_warning("Got message with truncated control data (too many fds sent?), ignoring.");
4389 return 0;
4390 } else if (n < 0)
4391 return log_warning_errno(n, "Couldn't read notification socket: %m");
4392
4393 cmsg_close_all(&msghdr);
4394
4395 ucred = CMSG_FIND_DATA(&msghdr, SOL_SOCKET, SCM_CREDENTIALS, struct ucred);
4396 if (!ucred || ucred->pid != inner_child_pid) {
4397 log_debug("Received notify message without valid credentials. Ignoring.");
4398 return 0;
4399 }
4400
4401 if ((size_t) n >= sizeof(buf)) {
4402 log_warning("Received notify message exceeded maximum size. Ignoring.");
4403 return 0;
4404 }
4405
4406 buf[n] = 0;
4407 tags = strv_split(buf, "\n\r");
4408 if (!tags)
4409 return log_oom();
4410
4411 if (strv_contains(tags, "READY=1")) {
4412 r = sd_notify(false, "READY=1\n");
4413 if (r < 0)
4414 log_warning_errno(r, "Failed to send readiness notification, ignoring: %m");
4415 }
4416
4417 p = strv_find_startswith(tags, "STATUS=");
4418 if (p)
4419 (void) sd_notifyf(false, "STATUS=Container running: %s", p);
4420
4421 return 0;
4422}
4423
4424static int setup_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid, sd_event_source **notify_event_source) {
4425 int r;
4426
4427 r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
4428 if (r < 0)
4429 return log_error_errno(r, "Failed to allocate notify event source: %m");
4430
4431 (void) sd_event_source_set_description(*notify_event_source, "nspawn-notify");
4432
4433 return 0;
4434}
4435
4436static void set_window_title(PTYForward *f) {
4437 _cleanup_free_ char *hn = NULL, *dot = NULL;
4438
4439 assert(f);
4440
4441 (void) gethostname_strict(&hn);
4442
4443 if (emoji_enabled())
4444 dot = strjoin(special_glyph(SPECIAL_GLYPH_BLUE_CIRCLE), " ");
4445
4446 if (hn)
4447 (void) pty_forward_set_titlef(f, "%sContainer %s on %s", strempty(dot), arg_machine, hn);
4448 else
4449 (void) pty_forward_set_titlef(f, "%sContainer %s", strempty(dot), arg_machine);
4450
4451 if (dot)
4452 (void) pty_forward_set_title_prefix(f, dot);
4453}
4454
4455static int merge_settings(Settings *settings, const char *path) {
4456 int rl;
4457
4458 assert(settings);
4459 assert(path);
4460
4461 /* Copy over bits from the settings, unless they have been explicitly masked by command line switches. Note
4462 * that this steals the fields of the Settings* structure, and hence modifies it. */
4463
4464 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
4465 settings->start_mode >= 0) {
4466 arg_start_mode = settings->start_mode;
4467 strv_free_and_replace(arg_parameters, settings->parameters);
4468 }
4469
4470 if ((arg_settings_mask & SETTING_EPHEMERAL) == 0 &&
4471 settings->ephemeral >= 0)
4472 arg_ephemeral = settings->ephemeral;
4473
4474 if ((arg_settings_mask & SETTING_DIRECTORY) == 0 &&
4475 settings->root) {
4476
4477 if (!arg_settings_trusted)
4478 log_warning("Ignoring root directory setting, file %s is not trusted.", path);
4479 else
4480 free_and_replace(arg_directory, settings->root);
4481 }
4482
4483 if ((arg_settings_mask & SETTING_PIVOT_ROOT) == 0 &&
4484 settings->pivot_root_new) {
4485 free_and_replace(arg_pivot_root_new, settings->pivot_root_new);
4486 free_and_replace(arg_pivot_root_old, settings->pivot_root_old);
4487 }
4488
4489 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
4490 settings->working_directory)
4491 free_and_replace(arg_chdir, settings->working_directory);
4492
4493 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
4494 settings->environment)
4495 strv_free_and_replace(arg_setenv, settings->environment);
4496
4497 if ((arg_settings_mask & SETTING_USER) == 0) {
4498
4499 if (settings->user)
4500 free_and_replace(arg_user, settings->user);
4501
4502 if (uid_is_valid(settings->uid))
4503 arg_uid = settings->uid;
4504 if (gid_is_valid(settings->gid))
4505 arg_gid = settings->gid;
4506 if (settings->n_supplementary_gids > 0) {
4507 free_and_replace(arg_supplementary_gids, settings->supplementary_gids);
4508 arg_n_supplementary_gids = settings->n_supplementary_gids;
4509 }
4510 }
4511
4512 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
4513 uint64_t plus, minus;
4514 uint64_t network_minus = 0;
4515 uint64_t ambient;
4516
4517 /* Note that we copy both the simple plus/minus caps here, and the full quintet from the
4518 * Settings structure */
4519
4520 plus = settings->capability;
4521 minus = settings->drop_capability;
4522
4523 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
4524 settings_network_configured(settings)) {
4525 if (settings_private_network(settings))
4526 plus |= UINT64_C(1) << CAP_NET_ADMIN;
4527 else
4528 network_minus |= UINT64_C(1) << CAP_NET_ADMIN;
4529 }
4530
4531 if (!arg_settings_trusted && plus != 0) {
4532 if (settings->capability != 0)
4533 log_warning("Ignoring Capability= setting, file %s is not trusted.", path);
4534 } else {
4535 arg_caps_retain &= ~network_minus;
4536 arg_caps_retain |= plus;
4537 }
4538
4539 arg_caps_retain &= ~minus;
4540
4541 /* Copy the full capabilities over too */
4542 if (capability_quintet_is_set(&settings->full_capabilities)) {
4543 if (!arg_settings_trusted)
4544 log_warning("Ignoring capability settings, file %s is not trusted.", path);
4545 else
4546 arg_full_capabilities = settings->full_capabilities;
4547 }
4548
4549 ambient = settings->ambient_capability;
4550 if (!arg_settings_trusted && ambient != 0)
4551 log_warning("Ignoring AmbientCapability= setting, file %s is not trusted.", path);
4552 else
4553 arg_caps_ambient |= ambient;
4554 }
4555
4556 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
4557 settings->kill_signal > 0)
4558 arg_kill_signal = settings->kill_signal;
4559
4560 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
4561 settings->personality != PERSONALITY_INVALID)
4562 arg_personality = settings->personality;
4563
4564 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
4565 !sd_id128_is_null(settings->machine_id)) {
4566
4567 if (!arg_settings_trusted)
4568 log_warning("Ignoring MachineID= setting, file %s is not trusted.", path);
4569 else
4570 arg_uuid = settings->machine_id;
4571 }
4572
4573 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
4574 settings->read_only >= 0)
4575 arg_read_only = settings->read_only;
4576
4577 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
4578 settings->volatile_mode != _VOLATILE_MODE_INVALID)
4579 arg_volatile_mode = settings->volatile_mode;
4580
4581 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
4582 settings->n_custom_mounts > 0) {
4583
4584 if (!arg_settings_trusted)
4585 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", path);
4586 else {
4587 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
4588 arg_custom_mounts = TAKE_PTR(settings->custom_mounts);
4589 arg_n_custom_mounts = settings->n_custom_mounts;
4590 settings->n_custom_mounts = 0;
4591 }
4592 }
4593
4594 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
4595 settings_network_configured(settings)) {
4596
4597 if (!arg_settings_trusted)
4598 log_warning("Ignoring network settings, file %s is not trusted.", path);
4599 else {
4600 arg_network_veth = settings_network_veth(settings);
4601 arg_private_network = settings_private_network(settings);
4602
4603 strv_free_and_replace(arg_network_interfaces, settings->network_interfaces);
4604 strv_free_and_replace(arg_network_macvlan, settings->network_macvlan);
4605 strv_free_and_replace(arg_network_ipvlan, settings->network_ipvlan);
4606 strv_free_and_replace(arg_network_veth_extra, settings->network_veth_extra);
4607
4608 free_and_replace(arg_network_bridge, settings->network_bridge);
4609 free_and_replace(arg_network_zone, settings->network_zone);
4610
4611 free_and_replace(arg_network_namespace_path, settings->network_namespace_path);
4612 }
4613 }
4614
4615 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
4616 settings->expose_ports) {
4617
4618 if (!arg_settings_trusted)
4619 log_warning("Ignoring Port= setting, file %s is not trusted.", path);
4620 else {
4621 expose_port_free_all(arg_expose_ports);
4622 arg_expose_ports = TAKE_PTR(settings->expose_ports);
4623 }
4624 }
4625
4626 if ((arg_settings_mask & SETTING_USERNS) == 0 &&
4627 settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
4628
4629 if (!arg_settings_trusted)
4630 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", path);
4631 else {
4632 arg_userns_mode = settings->userns_mode;
4633 arg_uid_shift = settings->uid_shift;
4634 arg_uid_range = settings->uid_range;
4635 arg_userns_ownership = settings->userns_ownership;
4636 }
4637 }
4638
4639 if ((arg_settings_mask & SETTING_BIND_USER) == 0 &&
4640 !strv_isempty(settings->bind_user))
4641 strv_free_and_replace(arg_bind_user, settings->bind_user);
4642
4643 if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0 &&
4644 settings->notify_ready >= 0)
4645 arg_notify_ready = settings->notify_ready;
4646
4647 if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) {
4648
4649 if (!strv_isempty(settings->syscall_allow_list) || !strv_isempty(settings->syscall_deny_list)) {
4650 if (!arg_settings_trusted && !strv_isempty(settings->syscall_allow_list))
4651 log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", path);
4652 else {
4653 strv_free_and_replace(arg_syscall_allow_list, settings->syscall_allow_list);
4654 strv_free_and_replace(arg_syscall_deny_list, settings->syscall_deny_list);
4655 }
4656 }
4657
4658#if HAVE_SECCOMP
4659 if (settings->seccomp) {
4660 if (!arg_settings_trusted)
4661 log_warning("Ignoring SECCOMP filter, file %s is not trusted.", path);
4662 else {
4663 seccomp_release(arg_seccomp);
4664 arg_seccomp = TAKE_PTR(settings->seccomp);
4665 }
4666 }
4667#endif
4668 }
4669
4670 for (rl = 0; rl < _RLIMIT_MAX; rl++) {
4671 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)))
4672 continue;
4673
4674 if (!settings->rlimit[rl])
4675 continue;
4676
4677 if (!arg_settings_trusted) {
4678 log_warning("Ignoring Limit%s= setting, file '%s' is not trusted.", rlimit_to_string(rl), path);
4679 continue;
4680 }
4681
4682 free_and_replace(arg_rlimit[rl], settings->rlimit[rl]);
4683 }
4684
4685 if ((arg_settings_mask & SETTING_HOSTNAME) == 0 &&
4686 settings->hostname)
4687 free_and_replace(arg_hostname, settings->hostname);
4688
4689 if ((arg_settings_mask & SETTING_NO_NEW_PRIVILEGES) == 0 &&
4690 settings->no_new_privileges >= 0)
4691 arg_no_new_privileges = settings->no_new_privileges;
4692
4693 if ((arg_settings_mask & SETTING_OOM_SCORE_ADJUST) == 0 &&
4694 settings->oom_score_adjust_set) {
4695
4696 if (!arg_settings_trusted)
4697 log_warning("Ignoring OOMScoreAdjust= setting, file '%s' is not trusted.", path);
4698 else {
4699 arg_oom_score_adjust = settings->oom_score_adjust;
4700 arg_oom_score_adjust_set = true;
4701 }
4702 }
4703
4704 if ((arg_settings_mask & SETTING_CPU_AFFINITY) == 0 &&
4705 settings->cpu_set.set) {
4706
4707 if (!arg_settings_trusted)
4708 log_warning("Ignoring CPUAffinity= setting, file '%s' is not trusted.", path);
4709 else {
4710 cpu_set_reset(&arg_cpu_set);
4711 arg_cpu_set = TAKE_STRUCT(settings->cpu_set);
4712 }
4713 }
4714
4715 if ((arg_settings_mask & SETTING_RESOLV_CONF) == 0 &&
4716 settings->resolv_conf != _RESOLV_CONF_MODE_INVALID)
4717 arg_resolv_conf = settings->resolv_conf;
4718
4719 if ((arg_settings_mask & SETTING_LINK_JOURNAL) == 0 &&
4720 settings->link_journal != _LINK_JOURNAL_INVALID) {
4721
4722 if (!arg_settings_trusted)
4723 log_warning("Ignoring journal link setting, file '%s' is not trusted.", path);
4724 else {
4725 arg_link_journal = settings->link_journal;
4726 arg_link_journal_try = settings->link_journal_try;
4727 }
4728 }
4729
4730 if ((arg_settings_mask & SETTING_TIMEZONE) == 0 &&
4731 settings->timezone != _TIMEZONE_MODE_INVALID)
4732 arg_timezone = settings->timezone;
4733
4734 if ((arg_settings_mask & SETTING_SLICE) == 0 &&
4735 settings->slice) {
4736
4737 if (!arg_settings_trusted)
4738 log_warning("Ignoring slice setting, file '%s' is not trusted.", path);
4739 else
4740 free_and_replace(arg_slice, settings->slice);
4741 }
4742
4743 if ((arg_settings_mask & SETTING_USE_CGNS) == 0 &&
4744 settings->use_cgns >= 0) {
4745
4746 if (!arg_settings_trusted)
4747 log_warning("Ignoring cgroup namespace setting, file '%s' is not trusted.", path);
4748 else
4749 arg_use_cgns = settings->use_cgns;
4750 }
4751
4752 if ((arg_settings_mask & SETTING_CLONE_NS_FLAGS) == 0 &&
4753 settings->clone_ns_flags != ULONG_MAX) {
4754
4755 if (!arg_settings_trusted)
4756 log_warning("Ignoring namespace setting, file '%s' is not trusted.", path);
4757 else
4758 arg_clone_ns_flags = settings->clone_ns_flags;
4759 }
4760
4761 if ((arg_settings_mask & SETTING_CONSOLE_MODE) == 0 &&
4762 settings->console_mode >= 0) {
4763
4764 if (!arg_settings_trusted)
4765 log_warning("Ignoring console mode setting, file '%s' is not trusted.", path);
4766 else
4767 arg_console_mode = settings->console_mode;
4768 }
4769
4770 if ((arg_settings_mask & SETTING_SUPPRESS_SYNC) == 0 &&
4771 settings->suppress_sync >= 0)
4772 arg_suppress_sync = settings->suppress_sync;
4773
4774 /* The following properties can only be set through the OCI settings logic, not from the command line, hence we
4775 * don't consult arg_settings_mask for them. */
4776
4777 sd_bus_message_unref(arg_property_message);
4778 arg_property_message = TAKE_PTR(settings->properties);
4779
4780 arg_console_width = settings->console_width;
4781 arg_console_height = settings->console_height;
4782
4783 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
4784 arg_extra_nodes = TAKE_PTR(settings->extra_nodes);
4785 arg_n_extra_nodes = settings->n_extra_nodes;
4786 settings->n_extra_nodes = 0;
4787
4788 return 0;
4789}
4790
4791static int load_settings(void) {
4792 _cleanup_(settings_freep) Settings *settings = NULL;
4793 _cleanup_fclose_ FILE *f = NULL;
4794 _cleanup_free_ char *p = NULL;
4795 int r;
4796
4797 if (arg_oci_bundle)
4798 return 0;
4799
4800 /* If all settings are masked, there's no point in looking for
4801 * the settings file */
4802 if (FLAGS_SET(arg_settings_mask, _SETTINGS_MASK_ALL))
4803 return 0;
4804
4805 /* We first look in the admin's directories in /etc and /run */
4806 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
4807 _cleanup_free_ char *j = NULL;
4808
4809 j = path_join(i, arg_settings_filename);
4810 if (!j)
4811 return log_oom();
4812
4813 f = fopen(j, "re");
4814 if (f) {
4815 p = TAKE_PTR(j);
4816
4817 /* By default, we trust configuration from /etc and /run */
4818 if (arg_settings_trusted < 0)
4819 arg_settings_trusted = true;
4820
4821 break;
4822 }
4823
4824 if (errno != ENOENT)
4825 return log_error_errno(errno, "Failed to open %s: %m", j);
4826 }
4827
4828 if (!f) {
4829 /* After that, let's look for a file next to the
4830 * actual image we shall boot. */
4831
4832 if (arg_image) {
4833 r = file_in_same_dir(arg_image, arg_settings_filename, &p);
4834 if (r < 0)
4835 return log_error_errno(r, "Failed to generate settings path from image path: %m");
4836 } else if (arg_directory) {
4837 r = file_in_same_dir(arg_directory, arg_settings_filename, &p);
4838 if (r < 0 && r != -EADDRNOTAVAIL) /* if directory is root fs, don't complain */
4839 return log_error_errno(r, "Failed to generate settings path from directory path: %m");
4840 }
4841
4842 if (p) {
4843 f = fopen(p, "re");
4844 if (!f && errno != ENOENT)
4845 return log_error_errno(errno, "Failed to open %s: %m", p);
4846
4847 /* By default, we do not trust configuration from /var/lib/machines */
4848 if (arg_settings_trusted < 0)
4849 arg_settings_trusted = false;
4850 }
4851 }
4852
4853 if (!f)
4854 return 0;
4855
4856 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
4857
4858 r = settings_load(f, p, &settings);
4859 if (r < 0)
4860 return r;
4861
4862 return merge_settings(settings, p);
4863}
4864
4865static int load_oci_bundle(void) {
4866 _cleanup_(settings_freep) Settings *settings = NULL;
4867 int r;
4868
4869 if (!arg_oci_bundle)
4870 return 0;
4871
4872 /* By default let's trust OCI bundles */
4873 if (arg_settings_trusted < 0)
4874 arg_settings_trusted = true;
4875
4876 r = oci_load(NULL, arg_oci_bundle, &settings);
4877 if (r < 0)
4878 return r;
4879
4880 return merge_settings(settings, arg_oci_bundle);
4881}
4882
/* Runs one incarnation of the container: forks off the outer child, exchanges setup data with it over the two
 * socket pairs, sets up UID mapping, networking, machine/scope registration and cgroups for the inner child,
 * then runs the event loop until it finishes and finally collects the container's exit status.
 *
 * veth_name, veth_created: veth_name is handed to the veth/bridge helpers; *veth_created is set to true once
 *         the primary and extra veth links were set up, and reset to false before asking for another round.
 * master: receives the pty master fd sent by the inner child, unless the console mode is CONSOLE_PIPE.
 * pid: first holds the PID of the outer child, later the PID of the inner child as reported by the outer child.
 * ret: exit status to report; written on the paths that return 0.
 *
 * Returns < 0 on failure, 0 when we are finished ("finito", *ret is set), and 1 when the container rebooted
 * and shall be run again ("loop again"). */
4883static int run_container(
4884 DissectedImage *dissected_image,
4885 FDSet *fds,
4886 char veth_name[IFNAMSIZ], bool *veth_created,
4887 struct ExposeArgs *expose_args,
4888 int *master, pid_t *pid, int *ret) {
4889
4890 static const struct sigaction sa = {
4891 .sa_handler = nop_signal_handler,
4892 .sa_flags = SA_NOCLDSTOP|SA_RESTART,
4893 };
4894
4895 _cleanup_(release_lock_file) LockFile uid_shift_lock = LOCK_FILE_INIT;
4896 _cleanup_close_ int etc_passwd_lock = -EBADF;
4897 _cleanup_close_pair_ int
4898 fd_inner_socket_pair[2] = EBADF_PAIR,
4899 fd_outer_socket_pair[2] = EBADF_PAIR;
4900
4901 _cleanup_close_ int notify_socket = -EBADF, mntns_fd = -EBADF, fd_kmsg_fifo = -EBADF;
4902 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
4903 _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL;
4904 _cleanup_(umount_and_rmdir_and_freep) char *unix_export_host_dir = NULL;
4905 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
4906 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4907 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
4908 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
4909 _cleanup_free_ uid_t *bind_user_uid = NULL;
4910 size_t n_bind_user_uid = 0;
4911 ContainerStatus container_status = 0;
4912 int ifi = 0, r;
4913 ssize_t l;
4914 sigset_t mask_chld;
4915 _cleanup_close_ int child_netns_fd = -EBADF;
4916
4917 assert_se(sigemptyset(&mask_chld) == 0);
4918 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
4919
4920 /* Set up the unix export host directory on the host first */
4921 r = setup_unix_export_dir_outside(&unix_export_host_dir);
4922 if (r < 0)
4923 return r;
4924
4925 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4926 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
4927 * check with getpwuid() if the specific user already exists. Note that /etc might be
4928 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
4929 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
4930 * really just an extra safety net. We kinda assume that the UID range we allocate from is
4931 * really ours. */
4932
4933 etc_passwd_lock = take_etc_passwd_lock(NULL);
4934 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS)
4935 return log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m");
4936 }
4937
4938 r = barrier_create(&barrier);
4939 if (r < 0)
4940 return log_error_errno(r, "Cannot initialize IPC barrier: %m");
4941
/* Two socket pairs: in the child the [1] ends are handed to outer_child(), the parent keeps the [0] ends. */
4942 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, fd_inner_socket_pair) < 0)
4943 return log_error_errno(errno, "Failed to create inner socket pair: %m");
4944
4945 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, fd_outer_socket_pair) < 0)
4946 return log_error_errno(errno, "Failed to create outer socket pair: %m");
4947
4948 /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
4949 * parent's blocking calls and give it a chance to call wait() and terminate. */
4950 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
4951 if (r < 0)
4952 return log_error_errno(errno, "Failed to change the signal mask: %m");
4953
4954 r = sigaction(SIGCHLD, &sa, NULL);
4955 if (r < 0)
4956 return log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
4957
/* If an existing network namespace was requested, open it and make sure it really is one. */
4958 if (arg_network_namespace_path) {
4959 child_netns_fd = open(arg_network_namespace_path, O_RDONLY|O_NOCTTY|O_CLOEXEC);
4960 if (child_netns_fd < 0)
4961 return log_error_errno(errno, "Cannot open file %s: %m", arg_network_namespace_path);
4962
4963 r = fd_is_ns(child_netns_fd, CLONE_NEWNET);
4964 if (r == -EUCLEAN)
4965 log_debug_errno(r, "Cannot determine if passed network namespace path '%s' really refers to a network namespace, assuming it does.", arg_network_namespace_path);
4966 else if (r < 0)
4967 return log_error_errno(r, "Failed to check %s fs type: %m", arg_network_namespace_path);
4968 else if (r == 0)
4969 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
4970 "Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path);
4971 }
4972
4973 *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
4974 if (*pid < 0)
4975 return log_error_errno(errno, "clone() failed%s: %m",
4976 errno == EINVAL ?
4977 ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
4978
4979 if (*pid == 0) {
4980 /* The outer child only has a file system namespace. */
4981 barrier_set_role(&barrier, BARRIER_CHILD);
4982
/* The child drops the parent's ends of both socket pairs. */
4983 fd_inner_socket_pair[0] = safe_close(fd_inner_socket_pair[0]);
4984 fd_outer_socket_pair[0] = safe_close(fd_outer_socket_pair[0]);
4985
4986 (void) reset_all_signal_handlers();
4987 (void) reset_signal_mask();
4988
4989 r = outer_child(&barrier,
4990 arg_directory,
4991 dissected_image,
4992 fd_outer_socket_pair[1],
4993 fd_inner_socket_pair[1],
4994 fds,
4995 child_netns_fd,
4996 unix_export_host_dir);
4997 if (r < 0)
4998 _exit(EXIT_FAILURE);
4999
5000 _exit(EXIT_SUCCESS);
5001 }
5002
/* From here on we are in the parent. */
5003 barrier_set_role(&barrier, BARRIER_PARENT);
5004
5005 fdset_close(fds);
5006
/* The parent in turn drops the child's ends of both socket pairs. */
5007 fd_inner_socket_pair[1] = safe_close(fd_inner_socket_pair[1]);
5008 fd_outer_socket_pair[1] = safe_close(fd_outer_socket_pair[1]);
5009
5010 if (arg_userns_mode != USER_NAMESPACE_NO) {
5011 mntns_fd = receive_one_fd(fd_outer_socket_pair[0], 0);
5012 if (mntns_fd < 0)
5013 return log_error_errno(mntns_fd, "Failed to receive mount namespace fd from outer child: %m");
5014
5015 /* The child just let us know the UID shift it might have read from the image. */
5016 l = recv(fd_outer_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
5017 if (l < 0)
5018 return log_error_errno(errno, "Failed to read UID shift: %m");
5019 if (l != sizeof arg_uid_shift)
5020 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading UID shift.");
5021
5022 if (arg_userns_mode == USER_NAMESPACE_PICK) {
5023 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
5024 * image, but if that's already in use, pick a new one, and report back to the child,
5025 * which one we now picked. */
5026
5027 r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
5028 if (r < 0)
5029 return log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
5030
5031 l = send(fd_outer_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
5032 if (l < 0)
5033 return log_error_errno(errno, "Failed to send UID shift: %m");
5034 if (l != sizeof arg_uid_shift)
5035 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short write while writing UID shift.");
5036 }
5037
5038 n_bind_user_uid = strv_length(arg_bind_user);
5039 if (n_bind_user_uid > 0) {
5040 /* Right after the UID shift, we'll receive the list of UID mappings for the
5041 * --bind-user= logic. Always a quadruplet of payload and host UID + GID. */
5042
5043 bind_user_uid = new(uid_t, n_bind_user_uid*4);
5044 if (!bind_user_uid)
5045 return log_oom();
5046
5047 for (size_t i = 0; i < n_bind_user_uid; i++) {
5048 l = recv(fd_outer_socket_pair[0], bind_user_uid + i*4, sizeof(uid_t)*4, 0);
5049 if (l < 0)
5050 return log_error_errno(errno, "Failed to read user UID map pair: %m");
5051 if (l != sizeof(uid_t)*4)
5052 return log_full_errno(l == 0 ? LOG_DEBUG : LOG_WARNING,
5053 SYNTHETIC_ERRNO(EIO),
5054 "Short read while reading bind user UID pairs.");
5055 }
5056 }
5057 }
5058
5059 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
5060 /* The child let us know the supported cgroup mode it might have read from the image. */
5061 l = recv(fd_outer_socket_pair[0], &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), 0);
5062 if (l < 0)
5063 return log_error_errno(errno, "Failed to read cgroup mode: %m");
5064 if (l != sizeof(arg_unified_cgroup_hierarchy))
5065 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading cgroup mode (%zi bytes).%s",
5066 l, l == 0 ? " The child is most likely dead." : "");
5067 }
5068
5069 /* Wait for the outer child. */
5070 r = wait_for_terminate_and_check("(sd-namespace)", *pid, WAIT_LOG_ABNORMAL);
5071 if (r < 0)
5072 return r;
5073 if (r != EXIT_SUCCESS)
5074 return -EIO;
5075
5076 /* And now retrieve the PID of the inner child. */
5077 l = recv(fd_outer_socket_pair[0], pid, sizeof *pid, 0);
5078 if (l < 0)
5079 return log_error_errno(errno, "Failed to read inner child PID: %m");
5080 if (l != sizeof *pid)
5081 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading inner child PID.");
5082
5083 /* We also retrieve container UUID in case it was generated by outer child */
5084 l = recv(fd_outer_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
5085 if (l < 0)
5086 return log_error_errno(errno, "Failed to read container machine ID: %m");
5087 if (l != sizeof(arg_uuid))
5088 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading container machined ID.");
5089
5090 /* We also retrieve the socket used for notifications generated by outer child */
5091 notify_socket = receive_one_fd(fd_outer_socket_pair[0], 0);
5092 if (notify_socket < 0)
5093 return log_error_errno(notify_socket,
5094 "Failed to receive notification socket from the outer child: %m");
5095
5096 log_debug("Init process invoked as PID "PID_FMT, *pid);
5097
5098 if (arg_userns_mode != USER_NAMESPACE_NO) {
5099 if (!barrier_place_and_sync(&barrier)) /* #1 */
5100 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
5101
5102 r = setup_uid_map(*pid, bind_user_uid, n_bind_user_uid);
5103 if (r < 0)
5104 return r;
5105
5106 (void) barrier_place(&barrier); /* #2 */
5107 }
5108
5109 if (arg_private_network) {
5110 if (!arg_network_namespace_path) {
5111 /* Wait until the child has unshared its network namespace. */
5112 if (!barrier_place_and_sync(&barrier)) /* #3 */
5113 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early");
5114 }
5115
5116 if (child_netns_fd < 0) {
5117 /* Make sure we have an open file descriptor to the child's network
5118 * namespace so it stays alive even if the child exits. */
5119 r = namespace_open(*pid,
5120 /* ret_pidns_fd = */ NULL,
5121 /* ret_mntns_fd = */ NULL,
5122 &child_netns_fd,
5123 /* ret_userns_fd = */ NULL,
5124 /* ret_root_fd = */ NULL);
5125 if (r < 0)
5126 return log_error_errno(r, "Failed to open child network namespace: %m");
5127 }
5128
5129 r = move_network_interfaces(child_netns_fd, arg_network_interfaces);
5130 if (r < 0)
5131 return r;
5132
5133 if (arg_network_veth) {
5134 r = setup_veth(arg_machine, *pid, veth_name,
5135 arg_network_bridge || arg_network_zone, &arg_network_provided_mac);
5136 if (r < 0)
5137 return r;
5138 else if (r > 0)
5139 ifi = r;
5140
5141 if (arg_network_bridge) {
5142 /* Add the interface to a bridge */
5143 r = setup_bridge(veth_name, arg_network_bridge, false);
5144 if (r < 0)
5145 return r;
5146 if (r > 0)
5147 ifi = r;
5148 } else if (arg_network_zone) {
5149 /* Add the interface to a bridge, possibly creating it */
5150 r = setup_bridge(veth_name, arg_network_zone, true);
5151 if (r < 0)
5152 return r;
5153 if (r > 0)
5154 ifi = r;
5155 }
5156 }
5157
5158 r = setup_veth_extra(arg_machine, *pid, arg_network_veth_extra);
5159 if (r < 0)
5160 return r;
5161
5162 /* We created the primary and extra veth links now; let's remember this, so that we know to
5163 remove them later on. Note that we don't bother with removing veth links that were created
5164 here when their setup failed half-way, because in that case the kernel should be able to
5165 remove them on its own, since they cannot be referenced by anything yet. */
5166 *veth_created = true;
5167
5168 r = setup_macvlan(arg_machine, *pid, arg_network_macvlan);
5169 if (r < 0)
5170 return r;
5171
5172 r = setup_ipvlan(arg_machine, *pid, arg_network_ipvlan);
5173 if (r < 0)
5174 return r;
5175 }
5176
/* A bus connection is needed both for machine registration and for allocating our own scope. */
5177 if (arg_register || !arg_keep_unit) {
5178 r = sd_bus_default_system(&bus);
5179 if (r < 0)
5180 return log_error_errno(r, "Failed to open system bus: %m");
5181
5182 r = sd_bus_set_close_on_exit(bus, false);
5183 if (r < 0)
5184 return log_error_errno(r, "Failed to disable close-on-exit behaviour: %m");
5185 }
5186
5187 if (!arg_keep_unit) {
5188 /* When a new scope is created for this container, then we'll be registered as its controller, in which
5189 * case PID 1 will send us a friendly RequestStop signal, when it is asked to terminate the
5190 * scope. Let's hook into that, and cleanly shut down the container, and print a friendly message. */
5191
5192 r = sd_bus_match_signal_async(
5193 bus,
5194 NULL,
5195 "org.freedesktop.systemd1",
5196 NULL,
5197 "org.freedesktop.systemd1.Scope",
5198 "RequestStop",
5199 on_request_stop, NULL, PID_TO_PTR(*pid));
5200 if (r < 0)
5201 return log_error_errno(r, "Failed to request RequestStop match: %m");
5202 }
5203
5204 if (arg_register) {
5205 r = register_machine(
5206 bus,
5207 arg_machine,
5208 *pid,
5209 arg_directory,
5210 arg_uuid,
5211 ifi,
5212 arg_slice,
5213 arg_custom_mounts, arg_n_custom_mounts,
5214 arg_kill_signal,
5215 arg_property,
5216 arg_property_message,
5217 arg_keep_unit,
5218 arg_container_service_name,
5219 arg_start_mode);
5220 if (r < 0)
5221 return r;
5222
5223 } else if (!arg_keep_unit) {
5224 r = allocate_scope(
5225 bus,
5226 arg_machine,
5227 *pid,
5228 arg_slice,
5229 arg_custom_mounts, arg_n_custom_mounts,
5230 arg_kill_signal,
5231 arg_property,
5232 arg_property_message,
5233 /* allow_pidfds= */ true,
5234 arg_start_mode);
5235 if (r < 0)
5236 return r;
5237
5238 } else if (arg_slice || arg_property)
5239 log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect.");
5240
5241 r = create_subcgroup(*pid, arg_keep_unit, arg_unified_cgroup_hierarchy);
5242 if (r < 0)
5243 return r;
5244
5245 r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
5246 if (r < 0)
5247 return r;
5248
5249 r = chown_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
5250 if (r < 0)
5251 return r;
5252
5253 /* Notify the child that the parent is ready with all
5254 * its setup (including cgroup-ification), and that
5255 * the child can now hand over control to the code to
5256 * run inside the container. */
5257 (void) barrier_place(&barrier); /* #4 */
5258
5259 /* Block SIGCHLD here, before notifying child.
5260 * process_pty() will handle it with the other signals. */
5261 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
5262
5263 /* Reset signal to default */
5264 r = default_signals(SIGCHLD);
5265 if (r < 0)
5266 return log_error_errno(r, "Failed to reset SIGCHLD: %m");
5267
5268 r = sd_event_new(&event);
5269 if (r < 0)
5270 return log_error_errno(r, "Failed to get default event source: %m");
5271
5272 (void) sd_event_set_watchdog(event, true);
5273
5274 if (bus) {
5275 r = sd_bus_attach_event(bus, event, 0);
5276 if (r < 0)
5277 return log_error_errno(r, "Failed to attach bus to event loop: %m");
5278 }
5279
5280 r = setup_notify_parent(event, notify_socket, PID_TO_PTR(*pid), &notify_event_source);
5281 if (r < 0)
5282 return r;
5283
5284 /* Wait that the child is completely ready now, and has mounted their own copies of procfs and so on,
5285 * before we take the fully visible instances away. */
5286 if (!barrier_sync(&barrier)) /* #5.1 */
5287 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
5288
5289 if (arg_userns_mode != USER_NAMESPACE_NO) {
5290 r = wipe_fully_visible_fs(mntns_fd);
5291 if (r < 0)
5292 return r;
5293 mntns_fd = safe_close(mntns_fd);
5294 }
5295
5296 /* And now let the child know that we completed removing the procfs instances, and it can start the
5297 * payload. */
5298 if (!barrier_place(&barrier)) /* #5.2 */
5299 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
5300
5301 /* At this point we have made use of the UID we picked, and thus nss-systemd/systemd-machined.service
5302 * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
5303 etc_passwd_lock = safe_close(etc_passwd_lock);
5304
5305 (void) sd_notifyf(false,
5306 "STATUS=Container running.\n"
5307 "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
5308 if (!arg_notify_ready) {
5309 r = sd_notify(false, "READY=1\n");
5310 if (r < 0)
5311 log_warning_errno(r, "Failed to send readiness notification, ignoring: %m");
5312 }
5313
5314 if (arg_kill_signal > 0) {
5315 /* Try to kill the init system on SIGINT or SIGTERM */
5316 (void) sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
5317 (void) sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
5318 } else {
5319 /* Immediately exit */
5320 (void) sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
5321 (void) sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
5322 }
5323
5324 (void) sd_event_add_signal(event, NULL, SIGRTMIN+18, sigrtmin18_handler, NULL);
5325
5326 r = sd_event_add_memory_pressure(event, NULL, NULL, NULL);
5327 if (r < 0)
5328 log_debug_errno(r, "Failed allocate memory pressure event source, ignoring: %m");
5329
5330 /* Exit when the child exits */
5331 (void) sd_event_add_signal(event, NULL, SIGCHLD, on_sigchld, PID_TO_PTR(*pid));
5332
5333 /* Retrieve the kmsg fifo allocated by inner child */
5334 fd_kmsg_fifo = receive_one_fd(fd_inner_socket_pair[0], 0);
5335 if (fd_kmsg_fifo < 0)
5336 return log_error_errno(fd_kmsg_fifo, "Failed to receive kmsg fifo from inner child: %m");
5337
5338 if (arg_expose_ports) {
5339 r = expose_port_watch_rtnl(event, fd_inner_socket_pair[0], on_address_change, expose_args, &rtnl);
5340 if (r < 0)
5341 return r;
5342
5343 (void) expose_port_execute(rtnl, &expose_args->fw_ctx, arg_expose_ports, AF_INET, &expose_args->address4);
5344 (void) expose_port_execute(rtnl, &expose_args->fw_ctx, arg_expose_ports, AF_INET6, &expose_args->address6);
5345 }
5346
5347 if (arg_console_mode != CONSOLE_PIPE) {
5348 _cleanup_close_ int fd = -EBADF;
5349 PTYForwardFlags flags = 0;
5350
5351 /* Retrieve the master pty allocated by inner child */
5352 fd = receive_one_fd(fd_inner_socket_pair[0], 0);
5353 if (fd < 0)
5354 return log_error_errno(fd, "Failed to receive master pty from the inner child: %m");
5355
5356 switch (arg_console_mode) {
5357
5358 case CONSOLE_READ_ONLY:
5359 flags |= PTY_FORWARD_READ_ONLY;
5360
5361 _fallthrough_;
5362
5363 case CONSOLE_INTERACTIVE:
5364 flags |= PTY_FORWARD_IGNORE_VHANGUP;
5365
5366 r = pty_forward_new(event, fd, flags, &forward);
5367 if (r < 0)
5368 return log_error_errno(r, "Failed to create PTY forwarder: %m");
5369
5370 if (arg_console_width != UINT_MAX || arg_console_height != UINT_MAX)
5371 (void) pty_forward_set_width_height(
5372 forward,
5373 arg_console_width,
5374 arg_console_height);
5375
5376 if (!arg_background) {
5377 _cleanup_free_ char *bg = NULL;
5378
5379 r = terminal_tint_color(220 /* blue */, &bg);
5380 if (r < 0)
5381 log_debug_errno(r, "Failed to determine terminal background color, not tinting.");
5382 else
5383 (void) pty_forward_set_background_color(forward, bg);
5384 } else if (!isempty(arg_background))
5385 (void) pty_forward_set_background_color(forward, arg_background);
5386
5387 set_window_title(forward);
5388 break;
5389
5390 default:
5391 assert(arg_console_mode == CONSOLE_PASSIVE);
5392 }
5393
/* Pass the pty master fd on to the caller, so that the local cleanup handler doesn't close it. */
5394 *master = TAKE_FD(fd);
5395 }
5396
5397 fd_inner_socket_pair[0] = safe_close(fd_inner_socket_pair[0]);
5398
5399 r = sd_event_loop(event);
5400 if (r < 0)
5401 return log_error_errno(r, "Failed to run event loop: %m");
5402
5403 if (forward) {
5404 char last_char = 0;
5405
5406 (void) pty_forward_get_last_char(forward, &last_char);
5407 forward = pty_forward_free(forward);
5408
/* Emit a final newline unless the last forwarded character already was one (or --quiet is on). */
5409 if (!arg_quiet && last_char != '\n')
5410 putc('\n', stdout);
5411 }
5412
5413 /* Kill if it is not dead yet anyway */
5414 if (!arg_register && !arg_keep_unit && bus)
5415 terminate_scope(bus, arg_machine);
5416
5417 /* Normally redundant, but better safe than sorry */
5418 (void) kill(*pid, SIGKILL);
5419
5420 fd_kmsg_fifo = safe_close(fd_kmsg_fifo);
5421
5422 if (arg_private_network) {
5423 r = move_back_network_interfaces(child_netns_fd, arg_network_interfaces);
5424 if (r < 0)
5425 return r;
5426 }
5427
5428 r = wait_for_container(TAKE_PID(*pid), &container_status);
5429
5430 /* Tell machined that we are gone. */
5431 if (bus)
5432 (void) unregister_machine(bus, arg_machine);
5433
5434 if (r < 0)
5435 /* We failed to wait for the container, or the container exited abnormally. */
5436 return r;
5437 if (r > 0 || container_status == CONTAINER_TERMINATED) {
5438 /* r > 0 → The container exited with a non-zero status.
5439 * As a special case, we need to replace 133 with a different value,
5440 * because 133 is special-cased in the service file to reboot the container.
5441 * otherwise → The container exited with zero status and a reboot was not requested.
5442 */
5443 if (r == EXIT_FORCE_RESTART)
5444 r = EXIT_FAILURE; /* replace 133 with the general failure code */
5445 *ret = r;
5446 return 0; /* finito */
5447 }
5448
5449 /* CONTAINER_REBOOTED, loop again */
5450
5451 if (arg_keep_unit) {
5452 /* Special handling if we are running as a service: instead of simply
5453 * restarting the machine we want to restart the entire service, so let's
5454 * inform systemd about this with the special exit code 133. The service
5455 * file uses RestartForceExitStatus=133 so that this results in a full
5456 * nspawn restart. This is necessary since we might have cgroup parameters
5457 * set we want to have flushed out. */
5458 *ret = EXIT_FORCE_RESTART;
5459 return 0; /* finito */
5460 }
5461
/* Undo the port exposure and veth links of this round before going for the next one. */
5462 expose_port_flush(&expose_args->fw_ctx, arg_expose_ports, AF_INET, &expose_args->address4);
5463 expose_port_flush(&expose_args->fw_ctx, arg_expose_ports, AF_INET6, &expose_args->address6);
5464
5465 (void) remove_veth_links(veth_name, arg_network_veth_extra);
5466 *veth_created = false;
5467 return 1; /* loop again */
5468}
5469
5470static int initialize_rlimits(void) {
5471 /* The default resource limits the kernel passes to PID 1, as per kernel 5.16. Let's pass our container payload
5472 * the same values as the kernel originally passed to PID 1, in order to minimize differences between host and
5473 * container execution environments. */
5474
5475 static const struct rlimit kernel_defaults[_RLIMIT_MAX] = {
5476 [RLIMIT_AS] = { RLIM_INFINITY, RLIM_INFINITY },
5477 [RLIMIT_CORE] = { 0, RLIM_INFINITY },
5478 [RLIMIT_CPU] = { RLIM_INFINITY, RLIM_INFINITY },
5479 [RLIMIT_DATA] = { RLIM_INFINITY, RLIM_INFINITY },
5480 [RLIMIT_FSIZE] = { RLIM_INFINITY, RLIM_INFINITY },
5481 [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY },
5482 [RLIMIT_MEMLOCK] = { DEFAULT_RLIMIT_MEMLOCK, DEFAULT_RLIMIT_MEMLOCK },
5483 [RLIMIT_MSGQUEUE] = { 819200, 819200 },
5484 [RLIMIT_NICE] = { 0, 0 },
5485 [RLIMIT_NOFILE] = { 1024, 4096 },
5486 [RLIMIT_RSS] = { RLIM_INFINITY, RLIM_INFINITY },
5487 [RLIMIT_RTPRIO] = { 0, 0 },
5488 [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY },
5489 [RLIMIT_STACK] = { 8388608, RLIM_INFINITY },
5490
5491 /* The kernel scales the default for RLIMIT_NPROC and RLIMIT_SIGPENDING based on the system's amount of
5492 * RAM. To provide best compatibility we'll read these limits off PID 1 instead of hardcoding them
5493 * here. This is safe as we know that PID 1 doesn't change these two limits and thus the original
5494 * kernel's initialization should still be valid during runtime — at least if PID 1 is systemd. Note
5495 * that PID 1 changes a number of other resource limits during early initialization which is why we
5496 * don't read the other limits from PID 1 but prefer the static table above. */
5497 };
5498
5499 int rl, r;
5500
5501 for (rl = 0; rl < _RLIMIT_MAX; rl++) {
5502 /* Let's only fill in what the user hasn't explicitly configured anyway */
5503 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)) == 0) {
5504 const struct rlimit *v;
5505 struct rlimit buffer;
5506
5507 if (IN_SET(rl, RLIMIT_NPROC, RLIMIT_SIGPENDING)) {
5508 /* For these two let's read the limits off PID 1. See above for an explanation. */
5509
5510 r = pid_getrlimit(1, rl, &buffer);
5511 if (r < 0)
5512 return log_error_errno(r, "Failed to read resource limit RLIMIT_%s of PID 1: %m", rlimit_to_string(rl));
5513
5514 v = &buffer;
5515 } else if (rl == RLIMIT_NOFILE) {
5516 /* We nowadays bump RLIMIT_NOFILE's hard limit early in PID 1 for all
5517 * userspace. Given that nspawn containers are often run without our PID 1,
5518 * let's grant the containers a raised RLIMIT_NOFILE hard limit by default,
5519 * so that container userspace gets similar resources as host userspace
5520 * gets. */
5521 buffer = kernel_defaults[rl];
5522 buffer.rlim_max = MIN((rlim_t) read_nr_open(), (rlim_t) HIGH_RLIMIT_NOFILE);
5523 v = &buffer;
5524 } else
5525 v = kernel_defaults + rl;
5526
5527 arg_rlimit[rl] = newdup(struct rlimit, v, 1);
5528 if (!arg_rlimit[rl])
5529 return log_oom();
5530 }
5531
5532 if (DEBUG_LOGGING) {
5533 _cleanup_free_ char *k = NULL;
5534
5535 (void) rlimit_format(arg_rlimit[rl], &k);
5536 log_debug("Setting RLIMIT_%s to %s.", rlimit_to_string(rl), k);
5537 }
5538 }
5539
5540 return 0;
5541}
5542
5543static int cant_be_in_netns(void) {
5544 _cleanup_close_ int fd = -EBADF;
5545 struct ucred ucred;
5546 int r;
5547
5548 /* Check if we are in the same netns as udev. If we aren't, then device monitoring (and thus waiting
5549 * for loopback block devices) won't work, and we will hang. Detect this case and exit early with a
5550 * nice message. */
5551
5552 if (!arg_image) /* only matters if --image= us used, i.e. we actually need to use loopback devices */
5553 return 0;
5554
5555 fd = socket(AF_UNIX, SOCK_SEQPACKET|SOCK_NONBLOCK|SOCK_CLOEXEC, 0);
5556 if (fd < 0)
5557 return log_error_errno(errno, "Failed to allocate udev control socket: %m");
5558
5559 r = connect_unix_path(fd, AT_FDCWD, "/run/udev/control");
5560 if (r == -ENOENT || ERRNO_IS_NEG_DISCONNECT(r))
5561 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
5562 "Sorry, but --image= requires access to the host's /run/ hierarchy, since we need access to udev.");
5563 if (r < 0)
5564 return log_error_errno(r, "Failed to connect socket to udev control socket: %m");
5565
5566 r = getpeercred(fd, &ucred);
5567 if (r < 0)
5568 return log_error_errno(r, "Failed to determine peer of udev control socket: %m");
5569
5570 r = in_same_namespace(ucred.pid, 0, NAMESPACE_NET);
5571 if (r < 0)
5572 return log_error_errno(r, "Failed to determine network namespace of udev: %m");
5573 if (r == 0)
5574 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
5575 "Sorry, but --image= is only supported in the main network namespace, since we need access to udev/AF_NETLINK.");
5576 return 0;
5577}
5578
5579static int run(int argc, char *argv[]) {
5580 bool remove_directory = false, remove_image = false, veth_created = false, remove_tmprootdir = false;
5581 _cleanup_close_ int master = -EBADF;
5582 _cleanup_fdset_free_ FDSet *fds = NULL;
5583 int r, n_fd_passed, ret = EXIT_SUCCESS;
5584 char veth_name[IFNAMSIZ] = "";
5585 struct ExposeArgs expose_args = {};
5586 _cleanup_(release_lock_file) LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
5587 char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
5588 _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
5589 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
5590 _cleanup_(fw_ctx_freep) FirewallContext *fw_ctx = NULL;
5591 pid_t pid = 0;
5592
5593 log_parse_environment();
5594 log_open();
5595
5596 r = parse_argv(argc, argv);
5597 if (r <= 0)
5598 goto finish;
5599
5600 if (geteuid() != 0) {
5601 r = log_warning_errno(SYNTHETIC_ERRNO(EPERM),
5602 argc >= 2 ? "Need to be root." :
5603 "Need to be root (and some arguments are usually required).\nHint: try --help");
5604 goto finish;
5605 }
5606
5607 r = cant_be_in_netns();
5608 if (r < 0)
5609 goto finish;
5610
5611 r = initialize_rlimits();
5612 if (r < 0)
5613 goto finish;
5614
5615 r = load_oci_bundle();
5616 if (r < 0)
5617 goto finish;
5618
5619 r = pick_paths();
5620 if (r < 0)
5621 goto finish;
5622
5623 r = determine_names();
5624 if (r < 0)
5625 goto finish;
5626
5627 r = load_settings();
5628 if (r < 0)
5629 goto finish;
5630
5631 /* If we're not unsharing the network namespace and are unsharing the user namespace, we won't have
5632 * permissions to bind ports in the container, so let's drop the CAP_NET_BIND_SERVICE capability to
5633 * indicate that. */
5634 if (!arg_private_network && arg_userns_mode != USER_NAMESPACE_NO && arg_uid_shift > 0)
5635 arg_caps_retain &= ~(UINT64_C(1) << CAP_NET_BIND_SERVICE);
5636
5637 r = cg_unified();
5638 if (r < 0) {
5639 log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
5640 goto finish;
5641 }
5642
5643 r = verify_arguments();
5644 if (r < 0)
5645 goto finish;
5646
5647 r = resolve_network_interface_names(arg_network_interfaces);
5648 if (r < 0)
5649 goto finish;
5650
5651 r = verify_network_interfaces_initialized();
5652 if (r < 0)
5653 goto finish;
5654
5655 /* Reapply environment settings. */
5656 (void) detect_unified_cgroup_hierarchy_from_environment();
5657
5658 /* Ignore SIGPIPE here, because we use splice() on the ptyfwd stuff and that will generate SIGPIPE if
5659 * the result is closed. Note that the container payload child will reset signal mask+handler anyway,
5660 * so just turning this off here means we only turn it off in nspawn itself, not any children. */
5661 (void) ignore_signals(SIGPIPE);
5662
5663 n_fd_passed = sd_listen_fds(false);
5664 if (n_fd_passed > 0) {
5665 r = fdset_new_listen_fds(&fds, false);
5666 if (r < 0) {
5667 log_error_errno(r, "Failed to collect file descriptors: %m");
5668 goto finish;
5669 }
5670 }
5671
5672 /* The "default" umask. This is appropriate for most file and directory
5673 * operations performed by nspawn, and is the umask that will be used for
5674 * the child. Functions like copy_devnodes() change the umask temporarily. */
5675 umask(0022);
5676
5677 if (arg_directory) {
5678 assert(!arg_image);
5679
5680 /* Safety precaution: let's not allow running images from the live host OS image, as long as
5681 * /var from the host will propagate into container dynamically (because bad things happen if
5682 * two systems write to the same /var). Let's allow it for the special cases where /var is
5683 * either copied (i.e. --ephemeral) or replaced (i.e. --volatile=yes|state). */
5684 if (path_equal(arg_directory, "/") && !(arg_ephemeral || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_STATE))) {
5685 r = log_error_errno(SYNTHETIC_ERRNO(EINVAL),
5686 "Spawning container on root directory is not supported. Consider using --ephemeral, --volatile=yes or --volatile=state.");
5687 goto finish;
5688 }
5689
5690 if (arg_ephemeral) {
5691 _cleanup_free_ char *np = NULL;
5692
5693 r = chase_and_update(&arg_directory, 0);
5694 if (r < 0)
5695 goto finish;
5696
5697 /* If the specified path is a mount point we generate the new snapshot immediately
5698 * inside it under a random name. However if the specified is not a mount point we
5699 * create the new snapshot in the parent directory, just next to it. */
5700 r = path_is_mount_point(arg_directory);
5701 if (r < 0) {
5702 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
5703 goto finish;
5704 }
5705 if (r > 0)
5706 r = tempfn_random_child(arg_directory, "machine.", &np);
5707 else
5708 r = tempfn_random(arg_directory, "machine.", &np);
5709 if (r < 0) {
5710 log_error_errno(r, "Failed to generate name for directory snapshot: %m");
5711 goto finish;
5712 }
5713
5714 /* We take an exclusive lock on this image, since it's our private, ephemeral copy
5715 * only owned by us and no one else. */
5716 r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
5717 if (r < 0) {
5718 log_error_errno(r, "Failed to lock %s: %m", np);
5719 goto finish;
5720 }
5721
5722 {
5723 BLOCK_SIGNALS(SIGINT);
5724 r = btrfs_subvol_snapshot_at(AT_FDCWD, arg_directory, AT_FDCWD, np,
5725 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
5726 BTRFS_SNAPSHOT_FALLBACK_COPY |
5727 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
5728 BTRFS_SNAPSHOT_RECURSIVE |
5729 BTRFS_SNAPSHOT_QUOTA |
5730 BTRFS_SNAPSHOT_SIGINT);
5731 }
5732 if (r == -EINTR) {
5733 log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", np);
5734 goto finish;
5735 }
5736 if (r < 0) {
5737 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
5738 goto finish;
5739 }
5740
5741 free_and_replace(arg_directory, np);
5742 remove_directory = true;
5743 } else {
5744 r = chase_and_update(&arg_directory, arg_template ? CHASE_NONEXISTENT : 0);
5745 if (r < 0)
5746 goto finish;
5747
5748 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
5749 if (r == -EBUSY) {
5750 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
5751 goto finish;
5752 }
5753 if (r < 0) {
5754 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
5755 goto finish;
5756 }
5757
5758 if (arg_template) {
5759 r = chase_and_update(&arg_template, 0);
5760 if (r < 0)
5761 goto finish;
5762
5763 {
5764 BLOCK_SIGNALS(SIGINT);
5765 r = btrfs_subvol_snapshot_at(AT_FDCWD, arg_template, AT_FDCWD, arg_directory,
5766 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
5767 BTRFS_SNAPSHOT_FALLBACK_COPY |
5768 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
5769 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
5770 BTRFS_SNAPSHOT_RECURSIVE |
5771 BTRFS_SNAPSHOT_QUOTA |
5772 BTRFS_SNAPSHOT_SIGINT);
5773 }
5774 if (r == -EEXIST)
5775 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
5776 "Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
5777 else if (r == -EINTR) {
5778 log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", arg_directory);
5779 goto finish;
5780 } else if (r < 0) {
5781 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
5782 goto finish;
5783 } else
5784 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
5785 "Populated %s from template %s.", arg_directory, arg_template);
5786 }
5787 }
5788
5789 if (arg_start_mode == START_BOOT) {
5790 _cleanup_free_ char *b = NULL;
5791 const char *p;
5792 int check_os_release, is_os_tree;
5793
5794 if (arg_pivot_root_new) {
5795 b = path_join(arg_directory, arg_pivot_root_new);
5796 if (!b) {
5797 r = log_oom();
5798 goto finish;
5799 }
5800
5801 p = b;
5802 } else
5803 p = arg_directory;
5804
5805 check_os_release = getenv_bool("SYSTEMD_NSPAWN_CHECK_OS_RELEASE");
5806 if (check_os_release < 0 && check_os_release != -ENXIO) {
5807 r = log_error_errno(check_os_release, "Failed to parse $SYSTEMD_NSPAWN_CHECK_OS_RELEASE: %m");
5808 goto finish;
5809 }
5810
5811 is_os_tree = path_is_os_tree(p);
5812 if (is_os_tree == 0 && check_os_release == 0)
5813 log_debug("Directory %s is missing an os-release file, continuing anyway.", p);
5814 else if (is_os_tree <= 0) {
5815 r = log_error_errno(SYNTHETIC_ERRNO(EINVAL),
5816 "Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", p);
5817 goto finish;
5818 }
5819 } else {
5820 _cleanup_free_ char *p = NULL;
5821
5822 if (arg_pivot_root_new)
5823 p = path_join(arg_directory, arg_pivot_root_new, "/usr/");
5824 else
5825 p = path_join(arg_directory, "/usr/");
5826 if (!p) {
5827 r = log_oom();
5828 goto finish;
5829 }
5830
5831 if (laccess(p, F_OK) < 0) {
5832 r = log_error_errno(SYNTHETIC_ERRNO(EINVAL),
5833 "Directory %s doesn't look like it has an OS tree (/usr/ directory is missing). Refusing.", arg_directory);
5834 goto finish;
5835 }
5836 }
5837
5838 } else {
5839 DissectImageFlags dissect_image_flags =
5840 DISSECT_IMAGE_GENERIC_ROOT |
5841 DISSECT_IMAGE_REQUIRE_ROOT |
5842 DISSECT_IMAGE_RELAX_VAR_CHECK |
5843 DISSECT_IMAGE_USR_NO_ROOT |
5844 DISSECT_IMAGE_ADD_PARTITION_DEVICES |
5845 DISSECT_IMAGE_PIN_PARTITION_DEVICES;
5846 assert(arg_image);
5847 assert(!arg_template);
5848
5849 r = chase_and_update(&arg_image, 0);
5850 if (r < 0)
5851 goto finish;
5852
5853 if (arg_ephemeral) {
5854 _cleanup_free_ char *np = NULL;
5855
5856 r = tempfn_random(arg_image, "machine.", &np);
5857 if (r < 0) {
5858 log_error_errno(r, "Failed to generate name for image snapshot: %m");
5859 goto finish;
5860 }
5861
5862 /* Always take an exclusive lock on our own ephemeral copy. */
5863 r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
5864 if (r < 0) {
5865 log_error_errno(r, "Failed to create image lock: %m");
5866 goto finish;
5867 }
5868
5869 {
5870 BLOCK_SIGNALS(SIGINT);
5871 r = copy_file_full(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600,
5872 FS_NOCOW_FL, FS_NOCOW_FL,
5873 COPY_REFLINK|COPY_CRTIME|COPY_SIGINT,
5874 NULL, NULL);
5875 }
5876 if (r == -EINTR) {
5877 log_error_errno(r, "Interrupted while copying image file to %s, removed again.", np);
5878 goto finish;
5879 }
5880 if (r < 0) {
5881 r = log_error_errno(r, "Failed to copy image file: %m");
5882 goto finish;
5883 }
5884
5885 free_and_replace(arg_image, np);
5886 remove_image = true;
5887 } else {
5888 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
5889 if (r == -EBUSY) {
5890 log_error_errno(r, "Disk image %s is currently busy.", arg_image);
5891 goto finish;
5892 }
5893 if (r < 0) {
5894 log_error_errno(r, "Failed to create image lock: %m");
5895 goto finish;
5896 }
5897
5898 r = verity_settings_load(
5899 &arg_verity_settings,
5900 arg_image, NULL, NULL);
5901 if (r < 0) {
5902 log_error_errno(r, "Failed to read verity artefacts for %s: %m", arg_image);
5903 goto finish;
5904 }
5905
5906 if (arg_verity_settings.data_path)
5907 dissect_image_flags |= DISSECT_IMAGE_NO_PARTITION_TABLE;
5908 }
5909
5910 if (!mkdtemp(tmprootdir)) {
5911 r = log_error_errno(errno, "Failed to create temporary directory: %m");
5912 goto finish;
5913 }
5914
5915 remove_tmprootdir = true;
5916
5917 arg_directory = strdup(tmprootdir);
5918 if (!arg_directory) {
5919 r = log_oom();
5920 goto finish;
5921 }
5922
5923 r = loop_device_make_by_path(
5924 arg_image,
5925 arg_read_only ? O_RDONLY : O_RDWR,
5926 /* sector_size= */ UINT32_MAX,
5927 FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN,
5928 LOCK_SH,
5929 &loop);
5930 if (r < 0) {
5931 log_error_errno(r, "Failed to set up loopback block device: %m");
5932 goto finish;
5933 }
5934
5935 r = dissect_loop_device_and_warn(
5936 loop,
5937 &arg_verity_settings,
5938 /* mount_options=*/ NULL,
5939 arg_image_policy ?: &image_policy_container,
5940 dissect_image_flags,
5941 &dissected_image);
5942 if (r == -ENOPKG) {
5943 /* dissected_image_and_warn() already printed a brief error message. Extend on that with more details */
5944 log_notice("Note that the disk image needs to\n"
5945 " a) either contain only a single MBR partition of type 0x83 that is marked bootable\n"
5946 " b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n"
5947 " c) or follow https://uapi-group.org/specifications/specs/discoverable_partitions_specification\n"
5948 " d) or contain a file system without a partition table\n"
5949 "in order to be bootable with systemd-nspawn.");
5950 goto finish;
5951 }
5952 if (r < 0)
5953 goto finish;
5954
5955 r = dissected_image_load_verity_sig_partition(
5956 dissected_image,
5957 loop->fd,
5958 &arg_verity_settings);
5959 if (r < 0)
5960 goto finish;
5961
5962 if (dissected_image->has_verity && !arg_verity_settings.root_hash && !dissected_image->has_verity_sig)
5963 log_notice("Note: image %s contains verity information, but no root hash specified and no embedded "
5964 "root hash signature found! Proceeding without integrity checking.", arg_image);
5965
5966 r = dissected_image_decrypt_interactively(
5967 dissected_image,
5968 NULL,
5969 &arg_verity_settings,
5970 0);
5971 if (r < 0)
5972 goto finish;
5973
5974 /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */
5975 if (remove_image && unlink(arg_image) >= 0)
5976 remove_image = false;
5977
5978 if (arg_architecture < 0)
5979 arg_architecture = dissected_image_architecture(dissected_image);
5980 }
5981
5982 r = custom_mount_prepare_all(arg_directory, arg_custom_mounts, arg_n_custom_mounts);
5983 if (r < 0)
5984 goto finish;
5985
5986 if (arg_console_mode < 0)
5987 arg_console_mode = isatty(STDIN_FILENO) && isatty(STDOUT_FILENO) ?
5988 CONSOLE_INTERACTIVE : CONSOLE_READ_ONLY;
5989
5990 if (arg_console_mode == CONSOLE_PIPE) /* if we pass STDERR on to the container, don't add our own logs into it too */
5991 arg_quiet = true;
5992
5993 if (!arg_quiet) {
5994 const char *t = arg_image ?: arg_directory;
5995 _cleanup_free_ char *u = NULL;
5996 (void) terminal_urlify_path(t, t, &u);
5997
5998 log_info("%s %sSpawning container %s on %s.%s",
5999 special_glyph(SPECIAL_GLYPH_LIGHT_SHADE), ansi_grey(), arg_machine, u ?: t, ansi_normal());
6000
6001 if (arg_console_mode == CONSOLE_INTERACTIVE)
6002 log_info("%s %sPress %sCtrl-]%s three times within 1s to kill container.%s",
6003 special_glyph(SPECIAL_GLYPH_LIGHT_SHADE), ansi_grey(), ansi_highlight(), ansi_grey(), ansi_normal());
6004 }
6005
6006 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, SIGRTMIN+18) >= 0);
6007
6008 r = make_reaper_process(true);
6009 if (r < 0) {
6010 log_error_errno(r, "Failed to become subreaper: %m");
6011 goto finish;
6012 }
6013
6014 if (arg_expose_ports) {
6015 r = fw_ctx_new(&fw_ctx);
6016 if (r < 0) {
6017 log_error_errno(r, "Cannot expose configured ports, firewall initialization failed: %m");
6018 goto finish;
6019 }
6020 expose_args.fw_ctx = fw_ctx;
6021 }
6022 for (;;) {
6023 r = run_container(dissected_image,
6024 fds,
6025 veth_name, &veth_created,
6026 &expose_args, &master,
6027 &pid, &ret);
6028 if (r <= 0)
6029 break;
6030 }
6031
6032finish:
6033 (void) sd_notify(false,
6034 r == 0 && ret == EXIT_FORCE_RESTART ? "STOPPING=1\nSTATUS=Restarting..." :
6035 "STOPPING=1\nSTATUS=Terminating...");
6036
6037 if (pid > 0)
6038 (void) kill(pid, SIGKILL);
6039
6040 /* Try to flush whatever is still queued in the pty */
6041 if (master >= 0) {
6042 (void) copy_bytes(master, STDOUT_FILENO, UINT64_MAX, 0);
6043 master = safe_close(master);
6044 }
6045
6046 if (pid > 0)
6047 (void) wait_for_terminate(pid, NULL);
6048
6049 pager_close();
6050
6051 if (remove_directory && arg_directory) {
6052 int k;
6053
6054 k = rm_rf(arg_directory, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
6055 if (k < 0)
6056 log_warning_errno(k, "Cannot remove '%s', ignoring: %m", arg_directory);
6057 }
6058
6059 if (remove_image && arg_image) {
6060 if (unlink(arg_image) < 0)
6061 log_warning_errno(errno, "Can't remove image file '%s', ignoring: %m", arg_image);
6062 }
6063
6064 if (remove_tmprootdir) {
6065 if (rmdir(tmprootdir) < 0)
6066 log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir);
6067 }
6068
6069 if (arg_machine) {
6070 const char *p;
6071
6072 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
6073 (void) rm_rf(p, REMOVE_ROOT);
6074
6075 p = strjoina("/run/systemd/nspawn/unix-export/", arg_machine);
6076 (void) umount2(p, MNT_DETACH|UMOUNT_NOFOLLOW);
6077 (void) rmdir(p);
6078 }
6079
6080 expose_port_flush(&fw_ctx, arg_expose_ports, AF_INET, &expose_args.address4);
6081 expose_port_flush(&fw_ctx, arg_expose_ports, AF_INET6, &expose_args.address6);
6082
6083 if (veth_created)
6084 (void) remove_veth_links(veth_name, arg_network_veth_extra);
6085 (void) remove_bridge(arg_network_zone);
6086
6087 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
6088 expose_port_free_all(arg_expose_ports);
6089 rlimit_free_all(arg_rlimit);
6090 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
6091
6092 if (r < 0)
6093 return r;
6094
6095 return ret;
6096}
6097
/* Hooks run() up as the implementation of the program's main function.
 * NOTE(review): judging by the macro's name, positive return values of run() are presumably passed on as
 * failure exit codes, which would match run() returning the container's exit status in 'ret'. Confirm
 * against the macro's definition (main-func.h is included at the top of this file). */
DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run);