]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
Merge pull request #29788 from poettering/nspawn-barrier-fix
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #if HAVE_BLKID
4 #endif
5 #include <errno.h>
6 #include <getopt.h>
7 #include <linux/fs.h>
8 #include <linux/loop.h>
9 #if HAVE_SELINUX
10 #include <selinux/selinux.h>
11 #endif
12 #include <stdlib.h>
13 #include <sys/file.h>
14 #include <sys/ioctl.h>
15 #include <sys/personality.h>
16 #include <sys/prctl.h>
17 #include <sys/types.h>
18 #include <sys/wait.h>
19 #include <termios.h>
20 #include <unistd.h>
21
22 #include "sd-bus.h"
23 #include "sd-daemon.h"
24 #include "sd-id128.h"
25
26 #include "alloc-util.h"
27 #include "ether-addr-util.h"
28 #include "barrier.h"
29 #include "base-filesystem.h"
30 #include "blkid-util.h"
31 #include "btrfs-util.h"
32 #include "build.h"
33 #include "bus-error.h"
34 #include "bus-locator.h"
35 #include "bus-util.h"
36 #include "cap-list.h"
37 #include "capability-util.h"
38 #include "cgroup-util.h"
39 #include "chase.h"
40 #include "common-signal.h"
41 #include "copy.h"
42 #include "cpu-set-util.h"
43 #include "creds-util.h"
44 #include "dev-setup.h"
45 #include "discover-image.h"
46 #include "dissect-image.h"
47 #include "env-util.h"
48 #include "escape.h"
49 #include "fd-util.h"
50 #include "fdset.h"
51 #include "fileio.h"
52 #include "format-util.h"
53 #include "fs-util.h"
54 #include "gpt.h"
55 #include "hexdecoct.h"
56 #include "hostname-setup.h"
57 #include "hostname-util.h"
58 #include "id128-util.h"
59 #include "io-util.h"
60 #include "log.h"
61 #include "loop-util.h"
62 #include "loopback-setup.h"
63 #include "macro.h"
64 #include "main-func.h"
65 #include "missing_sched.h"
66 #include "mkdir.h"
67 #include "mount-util.h"
68 #include "mountpoint-util.h"
69 #include "namespace-util.h"
70 #include "netlink-util.h"
71 #include "nspawn-bind-user.h"
72 #include "nspawn-cgroup.h"
73 #include "nspawn-creds.h"
74 #include "nspawn-def.h"
75 #include "nspawn-expose-ports.h"
76 #include "nspawn-mount.h"
77 #include "nspawn-network.h"
78 #include "nspawn-oci.h"
79 #include "nspawn-patch-uid.h"
80 #include "nspawn-register.h"
81 #include "nspawn-seccomp.h"
82 #include "nspawn-settings.h"
83 #include "nspawn-setuid.h"
84 #include "nspawn-stub-pid1.h"
85 #include "nspawn-util.h"
86 #include "nspawn.h"
87 #include "nulstr-util.h"
88 #include "os-util.h"
89 #include "pager.h"
90 #include "parse-argument.h"
91 #include "parse-util.h"
92 #include "pretty-print.h"
93 #include "process-util.h"
94 #include "ptyfwd.h"
95 #include "random-util.h"
96 #include "raw-clone.h"
97 #include "resolve-util.h"
98 #include "rlimit-util.h"
99 #include "rm-rf.h"
100 #include "seccomp-util.h"
101 #include "selinux-util.h"
102 #include "signal-util.h"
103 #include "socket-util.h"
104 #include "stat-util.h"
105 #include "stdio-util.h"
106 #include "string-table.h"
107 #include "string-util.h"
108 #include "strv.h"
109 #include "sysctl-util.h"
110 #include "terminal-util.h"
111 #include "tmpfile-util.h"
112 #include "umask-util.h"
113 #include "unit-name.h"
114 #include "user-util.h"
115
116 /* Path, as seen from inside the container, of the notification socket the payload can use to talk to nspawn via the sd_notify(3) protocol */
117 #define NSPAWN_NOTIFY_SOCKET_PATH "/run/host/notify"
/* NOTE(review): presumably the in-container directory through which mounts are propagated from the host;
 * its use is not visible in this part of the file — confirm. */
118 #define NSPAWN_MOUNT_TUNNEL "/run/host/incoming"
119
/* NOTE(review): judging by the name, a special exit status requesting a restart of the container; the
 * code that interprets it is not visible in this part of the file — confirm. */
120 #define EXIT_FORCE_RESTART 133
121
/* How a container run ended: plain termination vs. reboot.
 * NOTE(review): the code producing and consuming these values is outside this part of the file. */
122 typedef enum ContainerStatus {
123 CONTAINER_TERMINATED,
124 CONTAINER_REBOOTED,
125 } ContainerStatus;
126
/* Process-wide option state. The initializers below are the built-in defaults; the parsers further down
 * (parse_environment(), parse_argv(), ...) overwrite them. Bits in arg_settings_mask are set by those
 * parsers whenever a setting was configured explicitly. */
127 static char *arg_directory = NULL;
128 static char *arg_template = NULL;
129 static char *arg_chdir = NULL;
130 static char *arg_pivot_root_new = NULL;
131 static char *arg_pivot_root_old = NULL;
132 static char *arg_user = NULL;
133 static uid_t arg_uid = UID_INVALID;
134 static gid_t arg_gid = GID_INVALID;
135 static gid_t* arg_supplementary_gids = NULL;
136 static size_t arg_n_supplementary_gids = 0;
137 static sd_id128_t arg_uuid = {};
138 static char *arg_machine = NULL; /* The name used by the host to refer to this */
139 static char *arg_hostname = NULL; /* The name the payload sees by default */
140 static const char *arg_selinux_context = NULL;
141 static const char *arg_selinux_apifs_context = NULL;
142 static char *arg_slice = NULL;
143 static bool arg_private_network = false;
144 static bool arg_read_only = false;
145 static StartMode arg_start_mode = START_PID1;
146 static bool arg_ephemeral = false;
147 static LinkJournal arg_link_journal = LINK_AUTO;
148 static bool arg_link_journal_try = false;
/* Default set of capabilities to retain, one bit per capability number. */
149 static uint64_t arg_caps_retain =
150 (1ULL << CAP_AUDIT_CONTROL) |
151 (1ULL << CAP_AUDIT_WRITE) |
152 (1ULL << CAP_CHOWN) |
153 (1ULL << CAP_DAC_OVERRIDE) |
154 (1ULL << CAP_DAC_READ_SEARCH) |
155 (1ULL << CAP_FOWNER) |
156 (1ULL << CAP_FSETID) |
157 (1ULL << CAP_IPC_OWNER) |
158 (1ULL << CAP_KILL) |
159 (1ULL << CAP_LEASE) |
160 (1ULL << CAP_LINUX_IMMUTABLE) |
161 (1ULL << CAP_MKNOD) |
162 (1ULL << CAP_NET_BIND_SERVICE) |
163 (1ULL << CAP_NET_BROADCAST) |
164 (1ULL << CAP_NET_RAW) |
165 (1ULL << CAP_SETFCAP) |
166 (1ULL << CAP_SETGID) |
167 (1ULL << CAP_SETPCAP) |
168 (1ULL << CAP_SETUID) |
169 (1ULL << CAP_SYS_ADMIN) |
170 (1ULL << CAP_SYS_BOOT) |
171 (1ULL << CAP_SYS_CHROOT) |
172 (1ULL << CAP_SYS_NICE) |
173 (1ULL << CAP_SYS_PTRACE) |
174 (1ULL << CAP_SYS_RESOURCE) |
175 (1ULL << CAP_SYS_TTY_CONFIG);
176 static uint64_t arg_caps_ambient = 0;
177 static CapabilityQuintet arg_full_capabilities = CAPABILITY_QUINTET_NULL;
178 static CustomMount *arg_custom_mounts = NULL;
179 static size_t arg_n_custom_mounts = 0;
180 static char **arg_setenv = NULL;
181 static bool arg_quiet = false;
182 static bool arg_register = true;
183 static bool arg_keep_unit = false;
184 static char **arg_network_interfaces = NULL;
185 static char **arg_network_macvlan = NULL;
186 static char **arg_network_ipvlan = NULL;
187 static bool arg_network_veth = false;
188 static char **arg_network_veth_extra = NULL;
189 static char *arg_network_bridge = NULL;
190 static char *arg_network_zone = NULL;
191 static char *arg_network_namespace_path = NULL;
/* Filled in from $SYSTEMD_NSPAWN_NETWORK_MAC by parse_environment().
 * NOTE(review): unlike every other arg_* variable here this one is not 'static' — confirm whether
 * external linkage is intended. */
192 struct ether_addr arg_network_provided_mac = {};
193 static PagerFlags arg_pager_flags = 0;
194 static unsigned long arg_personality = PERSONALITY_INVALID;
195 static char *arg_image = NULL;
196 static char *arg_oci_bundle = NULL;
197 static VolatileMode arg_volatile_mode = VOLATILE_NO;
198 static ExposePort *arg_expose_ports = NULL;
199 static char **arg_property = NULL;
200 static sd_bus_message *arg_property_message = NULL;
201 static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
202 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
203 static UserNamespaceOwnership arg_userns_ownership = _USER_NAMESPACE_OWNERSHIP_INVALID;
204 static int arg_kill_signal = 0;
205 static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN;
206 static SettingsMask arg_settings_mask = 0;
207 static int arg_settings_trusted = -1;
208 static char **arg_parameters = NULL;
/* May be replaced by the value of $SYSTEMD_NSPAWN_CONTAINER_SERVICE in parse_environment(). */
209 static const char *arg_container_service_name = "systemd-nspawn";
210 static bool arg_notify_ready = false;
/* Re-evaluated in parse_environment() based on cg_ns_supported() and $SYSTEMD_NSPAWN_USE_CGNS. */
211 static bool arg_use_cgns = true;
/* CLONE_NEW* flags used by default; parse_share_ns_env() sets or clears individual bits. */
212 static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
213 static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_TMPFS_TMP;
214 static VeritySettings arg_verity_settings = VERITY_SETTINGS_DEFAULT;
215 static char **arg_syscall_allow_list = NULL;
216 static char **arg_syscall_deny_list = NULL;
217 #if HAVE_SECCOMP
218 static scmp_filter_ctx arg_seccomp = NULL;
219 #endif
220 static struct rlimit *arg_rlimit[_RLIMIT_MAX] = {};
221 static bool arg_no_new_privileges = false;
222 static int arg_oom_score_adjust = 0;
223 static bool arg_oom_score_adjust_set = false;
224 static CPUSet arg_cpu_set = {};
225 static ResolvConfMode arg_resolv_conf = RESOLV_CONF_AUTO;
226 static TimezoneMode arg_timezone = TIMEZONE_AUTO;
227 static unsigned arg_console_width = UINT_MAX, arg_console_height = UINT_MAX;
228 static DeviceNode* arg_extra_nodes = NULL;
229 static size_t arg_n_extra_nodes = 0;
230 static char **arg_sysctl = NULL;
/* Stays at the invalid marker until handle_arg_console() (or other code not visible here) picks a mode. */
231 static ConsoleMode arg_console_mode = _CONSOLE_MODE_INVALID;
232 static Credential *arg_credentials = NULL;
233 static size_t arg_n_credentials = 0;
234 static char **arg_bind_user = NULL;
235 static bool arg_suppress_sync = false;
236 static char *arg_settings_filename = NULL;
237 static Architecture arg_architecture = _ARCHITECTURE_INVALID;
238 static ImagePolicy *arg_image_policy = NULL;
239
/* Pair each dynamically allocated option variable with the function that releases it.
 * NOTE(review): presumably these destructors run when the program's main function returns (see
 * main-func.h) — confirm. Several pointer-typed globals (arg_custom_mounts, arg_expose_ports, arg_rlimit,
 * arg_extra_nodes, arg_credentials) are not registered here; whether they are released elsewhere is not
 * visible in this part of the file. */
240 STATIC_DESTRUCTOR_REGISTER(arg_directory, freep);
241 STATIC_DESTRUCTOR_REGISTER(arg_template, freep);
242 STATIC_DESTRUCTOR_REGISTER(arg_chdir, freep);
243 STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_new, freep);
244 STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_old, freep);
245 STATIC_DESTRUCTOR_REGISTER(arg_user, freep);
246 STATIC_DESTRUCTOR_REGISTER(arg_supplementary_gids, freep);
247 STATIC_DESTRUCTOR_REGISTER(arg_machine, freep);
248 STATIC_DESTRUCTOR_REGISTER(arg_hostname, freep);
249 STATIC_DESTRUCTOR_REGISTER(arg_slice, freep);
250 STATIC_DESTRUCTOR_REGISTER(arg_setenv, strv_freep);
251 STATIC_DESTRUCTOR_REGISTER(arg_network_interfaces, strv_freep);
252 STATIC_DESTRUCTOR_REGISTER(arg_network_macvlan, strv_freep);
253 STATIC_DESTRUCTOR_REGISTER(arg_network_ipvlan, strv_freep);
254 STATIC_DESTRUCTOR_REGISTER(arg_network_veth_extra, strv_freep);
255 STATIC_DESTRUCTOR_REGISTER(arg_network_bridge, freep);
256 STATIC_DESTRUCTOR_REGISTER(arg_network_zone, freep);
257 STATIC_DESTRUCTOR_REGISTER(arg_network_namespace_path, freep);
258 STATIC_DESTRUCTOR_REGISTER(arg_image, freep);
259 STATIC_DESTRUCTOR_REGISTER(arg_oci_bundle, freep);
260 STATIC_DESTRUCTOR_REGISTER(arg_property, strv_freep);
261 STATIC_DESTRUCTOR_REGISTER(arg_property_message, sd_bus_message_unrefp);
262 STATIC_DESTRUCTOR_REGISTER(arg_parameters, strv_freep);
263 STATIC_DESTRUCTOR_REGISTER(arg_verity_settings, verity_settings_done);
264 STATIC_DESTRUCTOR_REGISTER(arg_syscall_allow_list, strv_freep);
265 STATIC_DESTRUCTOR_REGISTER(arg_syscall_deny_list, strv_freep);
/* arg_seccomp only exists when built with seccomp support, so its destructor is conditional too. */
266 #if HAVE_SECCOMP
267 STATIC_DESTRUCTOR_REGISTER(arg_seccomp, seccomp_releasep);
268 #endif
269 STATIC_DESTRUCTOR_REGISTER(arg_cpu_set, cpu_set_reset);
270 STATIC_DESTRUCTOR_REGISTER(arg_sysctl, strv_freep);
271 STATIC_DESTRUCTOR_REGISTER(arg_bind_user, strv_freep);
272 STATIC_DESTRUCTOR_REGISTER(arg_settings_filename, freep);
273 STATIC_DESTRUCTOR_REGISTER(arg_image_policy, image_policy_freep);
274
275 static int handle_arg_console(const char *arg) {
276 if (streq(arg, "help")) {
277 puts("autopipe\n"
278 "interactive\n"
279 "passive\n"
280 "pipe\n"
281 "read-only");
282 return 0;
283 }
284
285 if (streq(arg, "interactive"))
286 arg_console_mode = CONSOLE_INTERACTIVE;
287 else if (streq(arg, "read-only"))
288 arg_console_mode = CONSOLE_READ_ONLY;
289 else if (streq(arg, "passive"))
290 arg_console_mode = CONSOLE_PASSIVE;
291 else if (streq(arg, "pipe")) {
292 if (isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0)
293 log_full(arg_quiet ? LOG_DEBUG : LOG_NOTICE,
294 "Console mode 'pipe' selected, but standard input/output are connected to an interactive TTY. "
295 "Most likely you want to use 'interactive' console mode for proper interactivity and shell job control. "
296 "Proceeding anyway.");
297
298 arg_console_mode = CONSOLE_PIPE;
299 } else if (streq(arg, "autopipe")) {
300 if (isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0)
301 arg_console_mode = CONSOLE_INTERACTIVE;
302 else
303 arg_console_mode = CONSOLE_PIPE;
304 } else
305 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown console mode: %s", optarg);
306
307 arg_settings_mask |= SETTING_CONSOLE_MODE;
308 return 1;
309 }
310
311 static int help(void) {
312 _cleanup_free_ char *link = NULL;
313 int r;
314
315 pager_open(arg_pager_flags);
316
317 r = terminal_urlify_man("systemd-nspawn", "1", &link);
318 if (r < 0)
319 return log_oom();
320
321 printf("%1$s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
322 "%5$sSpawn a command or OS in a light-weight container.%6$s\n\n"
323 " -h --help Show this help\n"
324 " --version Print version string\n"
325 " -q --quiet Do not show status information\n"
326 " --no-pager Do not pipe output into a pager\n"
327 " --settings=BOOLEAN Load additional settings from .nspawn file\n\n"
328 "%3$sImage:%4$s\n"
329 " -D --directory=PATH Root directory for the container\n"
330 " --template=PATH Initialize root directory from template directory,\n"
331 " if missing\n"
332 " -x --ephemeral Run container with snapshot of root directory, and\n"
333 " remove it after exit\n"
334 " -i --image=PATH Root file system disk image (or device node) for\n"
335 " the container\n"
336 " --image-policy=POLICY Specify disk image dissection policy\n"
337 " --oci-bundle=PATH OCI bundle directory\n"
338 " --read-only Mount the root directory read-only\n"
339 " --volatile[=MODE] Run the system in volatile mode\n"
340 " --root-hash=HASH Specify verity root hash for root disk image\n"
341 " --root-hash-sig=SIG Specify pkcs7 signature of root hash for verity\n"
342 " as a DER encoded PKCS7, either as a path to a file\n"
343 " or as an ASCII base64 encoded string prefixed by\n"
344 " 'base64:'\n"
345 " --verity-data=PATH Specify hash device for verity\n"
346 " --pivot-root=PATH[:PATH]\n"
347 " Pivot root to given directory in the container\n\n"
348 "%3$sExecution:%4$s\n"
349 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
350 " -b --boot Boot up full system (i.e. invoke init)\n"
351 " --chdir=PATH Set working directory in the container\n"
352 " -E --setenv=NAME[=VALUE] Pass an environment variable to PID 1\n"
353 " -u --user=USER Run the command under specified user or UID\n"
354 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
355 " --notify-ready=BOOLEAN Receive notifications from the child init process\n"
356 " --suppress-sync=BOOLEAN\n"
357 " Suppress any form of disk data synchronization\n\n"
358 "%3$sSystem Identity:%4$s\n"
359 " -M --machine=NAME Set the machine name for the container\n"
360 " --hostname=NAME Override the hostname for the container\n"
361 " --uuid=UUID Set a specific machine UUID for the container\n\n"
362 "%3$sProperties:%4$s\n"
363 " -S --slice=SLICE Place the container in the specified slice\n"
364 " --property=NAME=VALUE Set scope unit property\n"
365 " --register=BOOLEAN Register container as machine\n"
366 " --keep-unit Do not register a scope for the machine, reuse\n"
367 " the service unit nspawn is running in\n\n"
368 "%3$sUser Namespacing:%4$s\n"
369 " --private-users=no Run without user namespacing\n"
370 " --private-users=yes|pick|identity\n"
371 " Run within user namespace, autoselect UID/GID range\n"
372 " --private-users=UIDBASE[:NUIDS]\n"
373 " Similar, but with user configured UID/GID range\n"
374 " --private-users-ownership=MODE\n"
375 " Adjust ('chown') or map ('map') OS tree ownership\n"
376 " to private UID/GID range\n"
377 " -U Equivalent to --private-users=pick and\n"
378 " --private-users-ownership=auto\n\n"
379 "%3$sNetworking:%4$s\n"
380 " --private-network Disable network in container\n"
381 " --network-interface=HOSTIF[:CONTAINERIF]\n"
382 " Assign an existing network interface to the\n"
383 " container\n"
384 " --network-macvlan=HOSTIF[:CONTAINERIF]\n"
385 " Create a macvlan network interface based on an\n"
386 " existing network interface to the container\n"
387 " --network-ipvlan=HOSTIF[:CONTAINERIF]\n"
388 " Create an ipvlan network interface based on an\n"
389 " existing network interface to the container\n"
390 " -n --network-veth Add a virtual Ethernet connection between host\n"
391 " and container\n"
392 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
393 " Add an additional virtual Ethernet link between\n"
394 " host and container\n"
395 " --network-bridge=INTERFACE\n"
396 " Add a virtual Ethernet connection to the container\n"
397 " and attach it to an existing bridge on the host\n"
398 " --network-zone=NAME Similar, but attach the new interface to an\n"
399 " an automatically managed bridge interface\n"
400 " --network-namespace-path=PATH\n"
401 " Set network namespace to the one represented by\n"
402 " the specified kernel namespace file node\n"
403 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
404 " Expose a container IP port on the host\n\n"
405 "%3$sSecurity:%4$s\n"
406 " --capability=CAP In addition to the default, retain specified\n"
407 " capability\n"
408 " --drop-capability=CAP Drop the specified capability from the default set\n"
409 " --ambient-capability=CAP\n"
410 " Sets the specified capability for the started\n"
411 " process. Not useful if booting a machine.\n"
412 " --no-new-privileges Set PR_SET_NO_NEW_PRIVS flag for container payload\n"
413 " --system-call-filter=LIST|~LIST\n"
414 " Permit/prohibit specific system calls\n"
415 " -Z --selinux-context=SECLABEL\n"
416 " Set the SELinux security context to be used by\n"
417 " processes in the container\n"
418 " -L --selinux-apifs-context=SECLABEL\n"
419 " Set the SELinux security context to be used by\n"
420 " API/tmpfs file systems in the container\n\n"
421 "%3$sResources:%4$s\n"
422 " --rlimit=NAME=LIMIT Set a resource limit for the payload\n"
423 " --oom-score-adjust=VALUE\n"
424 " Adjust the OOM score value for the payload\n"
425 " --cpu-affinity=CPUS Adjust the CPU affinity of the container\n"
426 " --personality=ARCH Pick personality for this container\n\n"
427 "%3$sIntegration:%4$s\n"
428 " --resolv-conf=MODE Select mode of /etc/resolv.conf initialization\n"
429 " --timezone=MODE Select mode of /etc/localtime initialization\n"
430 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
431 " host, try-guest, try-host\n"
432 " -j Equivalent to --link-journal=try-guest\n\n"
433 "%3$sMounts:%4$s\n"
434 " --bind=PATH[:PATH[:OPTIONS]]\n"
435 " Bind mount a file or directory from the host into\n"
436 " the container\n"
437 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
438 " Similar, but creates a read-only bind mount\n"
439 " --inaccessible=PATH Over-mount file node with inaccessible node to mask\n"
440 " it\n"
441 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
442 " --overlay=PATH[:PATH...]:PATH\n"
443 " Create an overlay mount from the host to \n"
444 " the container\n"
445 " --overlay-ro=PATH[:PATH...]:PATH\n"
446 " Similar, but creates a read-only overlay mount\n"
447 " --bind-user=NAME Bind user from host to container\n\n"
448 "%3$sInput/Output:%4$s\n"
449 " --console=MODE Select how stdin/stdout/stderr and /dev/console are\n"
450 " set up for the container.\n"
451 " -P --pipe Equivalent to --console=pipe\n\n"
452 "%3$sCredentials:%4$s\n"
453 " --set-credential=ID:VALUE\n"
454 " Pass a credential with literal value to container.\n"
455 " --load-credential=ID:PATH\n"
456 " Load credential to pass to container from file or\n"
457 " AF_UNIX stream socket.\n"
458 "\nSee the %2$s for details.\n",
459 program_invocation_short_name,
460 link,
461 ansi_underline(),
462 ansi_normal(),
463 ansi_highlight(),
464 ansi_normal());
465
466 return 0;
467 }
468
469 static int custom_mount_check_all(void) {
470 size_t i;
471
472 for (i = 0; i < arg_n_custom_mounts; i++) {
473 CustomMount *m = &arg_custom_mounts[i];
474
475 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
476 if (arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_OFF)
477 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
478 "--private-users-ownership=own may not be combined with custom root mounts.");
479 if (arg_uid_shift == UID_INVALID)
480 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
481 "--private-users with automatic UID shift may not be combined with custom root mounts.");
482 }
483 }
484
485 return 0;
486 }
487
488 static int detect_unified_cgroup_hierarchy_from_environment(void) {
489 const char *e, *var = "SYSTEMD_NSPAWN_UNIFIED_HIERARCHY";
490 int r;
491
492 /* Allow the user to control whether the unified hierarchy is used */
493
494 e = getenv(var);
495 if (!e) {
496 /* $UNIFIED_CGROUP_HIERARCHY has been renamed to $SYSTEMD_NSPAWN_UNIFIED_HIERARCHY. */
497 var = "UNIFIED_CGROUP_HIERARCHY";
498 e = getenv(var);
499 }
500
501 if (!isempty(e)) {
502 r = parse_boolean(e);
503 if (r < 0)
504 return log_error_errno(r, "Failed to parse $%s: %m", var);
505 if (r > 0)
506 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
507 else
508 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
509 }
510
511 return 0;
512 }
513
514 static int detect_unified_cgroup_hierarchy_from_image(const char *directory) {
515 int r;
516
517 /* Let's inherit the mode to use from the host system, but let's take into consideration what systemd
518 * in the image actually supports. */
519 r = cg_all_unified();
520 if (r < 0)
521 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
522 if (r > 0) {
523 /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
524 * routine only detects 231, so we'll have a false negative here for 230. */
525 r = systemd_installation_has_version(directory, "230");
526 if (r < 0)
527 return log_error_errno(r, "Failed to determine systemd version in container: %m");
528 if (r > 0)
529 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
530 else
531 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
532 } else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
533 /* Mixed cgroup hierarchy support was added in 233 */
534 r = systemd_installation_has_version(directory, "233");
535 if (r < 0)
536 return log_error_errno(r, "Failed to determine systemd version in container: %m");
537 if (r > 0)
538 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
539 else
540 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
541 } else
542 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
543
544 log_debug("Using %s hierarchy for container.",
545 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_NONE ? "legacy" :
546 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_SYSTEMD ? "hybrid" : "unified");
547
548 return 0;
549 }
550
551 static int parse_capability_spec(const char *spec, uint64_t *ret_mask) {
552 uint64_t mask = 0;
553 int r;
554
555 for (;;) {
556 _cleanup_free_ char *t = NULL;
557
558 r = extract_first_word(&spec, &t, ",", 0);
559 if (r < 0)
560 return log_error_errno(r, "Failed to parse capability %s.", t);
561 if (r == 0)
562 break;
563
564 if (streq(t, "help")) {
565 for (int i = 0; i < capability_list_length(); i++) {
566 const char *name;
567
568 name = capability_to_name(i);
569 if (name)
570 puts(name);
571 }
572
573 return 0; /* quit */
574 }
575
576 if (streq(t, "all"))
577 mask = UINT64_MAX;
578 else {
579 r = capability_from_name(t);
580 if (r < 0)
581 return log_error_errno(r, "Failed to parse capability %s.", t);
582
583 mask |= 1ULL << r;
584 }
585 }
586
587 *ret_mask = mask;
588 return 1; /* continue */
589 }
590
591 static int parse_share_ns_env(const char *name, unsigned long ns_flag) {
592 int r;
593
594 r = getenv_bool(name);
595 if (r == -ENXIO)
596 return 0;
597 if (r < 0)
598 return log_error_errno(r, "Failed to parse $%s: %m", name);
599
600 arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
601 arg_settings_mask |= SETTING_CLONE_NS_FLAGS;
602 return 0;
603 }
604
605 static int parse_mount_settings_env(void) {
606 const char *e;
607 int r;
608
609 r = getenv_bool("SYSTEMD_NSPAWN_TMPFS_TMP");
610 if (r < 0 && r != -ENXIO)
611 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_TMPFS_TMP: %m");
612 if (r >= 0)
613 SET_FLAG(arg_mount_settings, MOUNT_APPLY_TMPFS_TMP, r > 0);
614
615 e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
616 if (streq_ptr(e, "network"))
617 arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS;
618
619 else if (e) {
620 r = parse_boolean(e);
621 if (r < 0)
622 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_API_VFS_WRITABLE: %m");
623
624 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_RO, r == 0);
625 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_NETNS, false);
626 }
627
628 return 0;
629 }
630
631 static int parse_environment(void) {
632 const char *e;
633 int r;
634
635 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
636 if (r < 0)
637 return r;
638 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
639 if (r < 0)
640 return r;
641 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
642 if (r < 0)
643 return r;
644 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
645 if (r < 0)
646 return r;
647
648 r = parse_mount_settings_env();
649 if (r < 0)
650 return r;
651
652 /* SYSTEMD_NSPAWN_USE_CGNS=0 can be used to disable CLONE_NEWCGROUP use,
653 * even if it is supported. If not supported, it has no effect. */
654 if (!cg_ns_supported())
655 arg_use_cgns = false;
656 else {
657 r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
658 if (r < 0) {
659 if (r != -ENXIO)
660 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_USE_CGNS: %m");
661
662 arg_use_cgns = true;
663 } else {
664 arg_use_cgns = r > 0;
665 arg_settings_mask |= SETTING_USE_CGNS;
666 }
667 }
668
669 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
670 if (e)
671 arg_container_service_name = e;
672
673 e = getenv("SYSTEMD_NSPAWN_NETWORK_MAC");
674 if (e) {
675 r = parse_ether_addr(e, &arg_network_provided_mac);
676 if (r < 0)
677 return log_error_errno(r, "Failed to parse provided MAC address via environment variable");
678 }
679
680 r = getenv_bool("SYSTEMD_SUPPRESS_SYNC");
681 if (r >= 0)
682 arg_suppress_sync = r;
683 else if (r != -ENXIO)
684 log_debug_errno(r, "Failed to parse $SYSTEMD_SUPPRESS_SYNC, ignoring: %m");
685
686 return detect_unified_cgroup_hierarchy_from_environment();
687 }
688
689 static int parse_argv(int argc, char *argv[]) {
690 enum {
691 ARG_VERSION = 0x100,
692 ARG_PRIVATE_NETWORK,
693 ARG_UUID,
694 ARG_READ_ONLY,
695 ARG_CAPABILITY,
696 ARG_AMBIENT_CAPABILITY,
697 ARG_DROP_CAPABILITY,
698 ARG_LINK_JOURNAL,
699 ARG_BIND,
700 ARG_BIND_RO,
701 ARG_TMPFS,
702 ARG_OVERLAY,
703 ARG_OVERLAY_RO,
704 ARG_INACCESSIBLE,
705 ARG_SHARE_SYSTEM,
706 ARG_REGISTER,
707 ARG_KEEP_UNIT,
708 ARG_NETWORK_INTERFACE,
709 ARG_NETWORK_MACVLAN,
710 ARG_NETWORK_IPVLAN,
711 ARG_NETWORK_BRIDGE,
712 ARG_NETWORK_ZONE,
713 ARG_NETWORK_VETH_EXTRA,
714 ARG_NETWORK_NAMESPACE_PATH,
715 ARG_PERSONALITY,
716 ARG_VOLATILE,
717 ARG_TEMPLATE,
718 ARG_PROPERTY,
719 ARG_PRIVATE_USERS,
720 ARG_KILL_SIGNAL,
721 ARG_SETTINGS,
722 ARG_CHDIR,
723 ARG_PIVOT_ROOT,
724 ARG_PRIVATE_USERS_CHOWN,
725 ARG_PRIVATE_USERS_OWNERSHIP,
726 ARG_NOTIFY_READY,
727 ARG_ROOT_HASH,
728 ARG_ROOT_HASH_SIG,
729 ARG_VERITY_DATA,
730 ARG_SYSTEM_CALL_FILTER,
731 ARG_RLIMIT,
732 ARG_HOSTNAME,
733 ARG_NO_NEW_PRIVILEGES,
734 ARG_OOM_SCORE_ADJUST,
735 ARG_CPU_AFFINITY,
736 ARG_RESOLV_CONF,
737 ARG_TIMEZONE,
738 ARG_CONSOLE,
739 ARG_PIPE,
740 ARG_OCI_BUNDLE,
741 ARG_NO_PAGER,
742 ARG_SET_CREDENTIAL,
743 ARG_LOAD_CREDENTIAL,
744 ARG_BIND_USER,
745 ARG_SUPPRESS_SYNC,
746 ARG_IMAGE_POLICY,
747 };
748
749 static const struct option options[] = {
750 { "help", no_argument, NULL, 'h' },
751 { "version", no_argument, NULL, ARG_VERSION },
752 { "directory", required_argument, NULL, 'D' },
753 { "template", required_argument, NULL, ARG_TEMPLATE },
754 { "ephemeral", no_argument, NULL, 'x' },
755 { "user", required_argument, NULL, 'u' },
756 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
757 { "as-pid2", no_argument, NULL, 'a' },
758 { "boot", no_argument, NULL, 'b' },
759 { "uuid", required_argument, NULL, ARG_UUID },
760 { "read-only", no_argument, NULL, ARG_READ_ONLY },
761 { "capability", required_argument, NULL, ARG_CAPABILITY },
762 { "ambient-capability", required_argument, NULL, ARG_AMBIENT_CAPABILITY },
763 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
764 { "no-new-privileges", required_argument, NULL, ARG_NO_NEW_PRIVILEGES },
765 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
766 { "bind", required_argument, NULL, ARG_BIND },
767 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
768 { "tmpfs", required_argument, NULL, ARG_TMPFS },
769 { "overlay", required_argument, NULL, ARG_OVERLAY },
770 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
771 { "inaccessible", required_argument, NULL, ARG_INACCESSIBLE },
772 { "machine", required_argument, NULL, 'M' },
773 { "hostname", required_argument, NULL, ARG_HOSTNAME },
774 { "slice", required_argument, NULL, 'S' },
775 { "setenv", required_argument, NULL, 'E' },
776 { "selinux-context", required_argument, NULL, 'Z' },
777 { "selinux-apifs-context", required_argument, NULL, 'L' },
778 { "quiet", no_argument, NULL, 'q' },
779 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */
780 { "register", required_argument, NULL, ARG_REGISTER },
781 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
782 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
783 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
784 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
785 { "network-veth", no_argument, NULL, 'n' },
786 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA },
787 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
788 { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
789 { "network-namespace-path", required_argument, NULL, ARG_NETWORK_NAMESPACE_PATH },
790 { "personality", required_argument, NULL, ARG_PERSONALITY },
791 { "image", required_argument, NULL, 'i' },
792 { "volatile", optional_argument, NULL, ARG_VOLATILE },
793 { "port", required_argument, NULL, 'p' },
794 { "property", required_argument, NULL, ARG_PROPERTY },
795 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
796 { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN }, /* obsolete */
797 { "private-users-ownership",required_argument, NULL, ARG_PRIVATE_USERS_OWNERSHIP},
798 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
799 { "settings", required_argument, NULL, ARG_SETTINGS },
800 { "chdir", required_argument, NULL, ARG_CHDIR },
801 { "pivot-root", required_argument, NULL, ARG_PIVOT_ROOT },
802 { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
803 { "root-hash", required_argument, NULL, ARG_ROOT_HASH },
804 { "root-hash-sig", required_argument, NULL, ARG_ROOT_HASH_SIG },
805 { "verity-data", required_argument, NULL, ARG_VERITY_DATA },
806 { "system-call-filter", required_argument, NULL, ARG_SYSTEM_CALL_FILTER },
807 { "rlimit", required_argument, NULL, ARG_RLIMIT },
808 { "oom-score-adjust", required_argument, NULL, ARG_OOM_SCORE_ADJUST },
809 { "cpu-affinity", required_argument, NULL, ARG_CPU_AFFINITY },
810 { "resolv-conf", required_argument, NULL, ARG_RESOLV_CONF },
811 { "timezone", required_argument, NULL, ARG_TIMEZONE },
812 { "console", required_argument, NULL, ARG_CONSOLE },
813 { "pipe", no_argument, NULL, ARG_PIPE },
814 { "oci-bundle", required_argument, NULL, ARG_OCI_BUNDLE },
815 { "no-pager", no_argument, NULL, ARG_NO_PAGER },
816 { "set-credential", required_argument, NULL, ARG_SET_CREDENTIAL },
817 { "load-credential", required_argument, NULL, ARG_LOAD_CREDENTIAL },
818 { "bind-user", required_argument, NULL, ARG_BIND_USER },
819 { "suppress-sync", required_argument, NULL, ARG_SUPPRESS_SYNC },
820 { "image-policy", required_argument, NULL, ARG_IMAGE_POLICY },
821 {}
822 };
823
824 int c, r;
825 uint64_t plus = 0, minus = 0;
826 bool mask_all_settings = false, mask_no_settings = false;
827
828 assert(argc >= 0);
829 assert(argv);
830
831 /* Resetting to 0 forces the invocation of an internal initialization routine of getopt_long()
832 * that checks for GNU extensions in optstring ('-' or '+' at the beginning). */
833 optind = 0;
834 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nUE:P", options, NULL)) >= 0)
835 switch (c) {
836
837 case 'h':
838 return help();
839
840 case ARG_VERSION:
841 return version();
842
843 case 'D':
844 r = parse_path_argument(optarg, false, &arg_directory);
845 if (r < 0)
846 return r;
847
848 arg_settings_mask |= SETTING_DIRECTORY;
849 break;
850
851 case ARG_TEMPLATE:
852 r = parse_path_argument(optarg, false, &arg_template);
853 if (r < 0)
854 return r;
855
856 arg_settings_mask |= SETTING_DIRECTORY;
857 break;
858
859 case 'i':
860 r = parse_path_argument(optarg, false, &arg_image);
861 if (r < 0)
862 return r;
863
864 arg_settings_mask |= SETTING_DIRECTORY;
865 break;
866
867 case ARG_OCI_BUNDLE:
868 r = parse_path_argument(optarg, false, &arg_oci_bundle);
869 if (r < 0)
870 return r;
871
872 break;
873
874 case 'x':
875 arg_ephemeral = true;
876 arg_settings_mask |= SETTING_EPHEMERAL;
877 break;
878
879 case 'u':
880 r = free_and_strdup(&arg_user, optarg);
881 if (r < 0)
882 return log_oom();
883
884 arg_settings_mask |= SETTING_USER;
885 break;
886
887 case ARG_NETWORK_ZONE: {
888 _cleanup_free_ char *j = NULL;
889
890 j = strjoin("vz-", optarg);
891 if (!j)
892 return log_oom();
893
894 if (!ifname_valid(j))
895 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
896 "Network zone name not valid: %s", j);
897
898 free_and_replace(arg_network_zone, j);
899
900 arg_network_veth = true;
901 arg_private_network = true;
902 arg_settings_mask |= SETTING_NETWORK;
903 break;
904 }
905
906 case ARG_NETWORK_BRIDGE:
907
908 if (!ifname_valid(optarg))
909 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
910 "Bridge interface name not valid: %s", optarg);
911
912 r = free_and_strdup(&arg_network_bridge, optarg);
913 if (r < 0)
914 return log_oom();
915
916 _fallthrough_;
917 case 'n':
918 arg_network_veth = true;
919 arg_private_network = true;
920 arg_settings_mask |= SETTING_NETWORK;
921 break;
922
923 case ARG_NETWORK_VETH_EXTRA:
924 r = veth_extra_parse(&arg_network_veth_extra, optarg);
925 if (r < 0)
926 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
927
928 arg_private_network = true;
929 arg_settings_mask |= SETTING_NETWORK;
930 break;
931
932 case ARG_NETWORK_INTERFACE:
933 r = interface_pair_parse(&arg_network_interfaces, optarg);
934 if (r < 0)
935 return r;
936
937 arg_private_network = true;
938 arg_settings_mask |= SETTING_NETWORK;
939 break;
940
941 case ARG_NETWORK_MACVLAN:
942 r = macvlan_pair_parse(&arg_network_macvlan, optarg);
943 if (r < 0)
944 return r;
945
946 arg_private_network = true;
947 arg_settings_mask |= SETTING_NETWORK;
948 break;
949
950 case ARG_NETWORK_IPVLAN:
951 r = ipvlan_pair_parse(&arg_network_ipvlan, optarg);
952 if (r < 0)
953 return r;
954
955 _fallthrough_;
956 case ARG_PRIVATE_NETWORK:
957 arg_private_network = true;
958 arg_settings_mask |= SETTING_NETWORK;
959 break;
960
961 case ARG_NETWORK_NAMESPACE_PATH:
962 r = parse_path_argument(optarg, false, &arg_network_namespace_path);
963 if (r < 0)
964 return r;
965
966 arg_settings_mask |= SETTING_NETWORK;
967 break;
968
969 case 'b':
970 if (arg_start_mode == START_PID2)
971 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
972 "--boot and --as-pid2 may not be combined.");
973
974 arg_start_mode = START_BOOT;
975 arg_settings_mask |= SETTING_START_MODE;
976 break;
977
978 case 'a':
979 if (arg_start_mode == START_BOOT)
980 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
981 "--boot and --as-pid2 may not be combined.");
982
983 arg_start_mode = START_PID2;
984 arg_settings_mask |= SETTING_START_MODE;
985 break;
986
987 case ARG_UUID:
988 r = id128_from_string_nonzero(optarg, &arg_uuid);
989 if (r == -ENXIO)
990 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
991 "Machine UUID may not be all zeroes.");
992 if (r < 0)
993 return log_error_errno(r, "Invalid UUID: %s", optarg);
994
995 arg_settings_mask |= SETTING_MACHINE_ID;
996 break;
997
998 case 'S': {
999 _cleanup_free_ char *mangled = NULL;
1000
1001 r = unit_name_mangle_with_suffix(optarg, NULL, UNIT_NAME_MANGLE_WARN, ".slice", &mangled);
1002 if (r < 0)
1003 return log_oom();
1004
1005 free_and_replace(arg_slice, mangled);
1006 arg_settings_mask |= SETTING_SLICE;
1007 break;
1008 }
1009
1010 case 'M':
1011 if (isempty(optarg))
1012 arg_machine = mfree(arg_machine);
1013 else {
1014 if (!hostname_is_valid(optarg, 0))
1015 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1016 "Invalid machine name: %s", optarg);
1017
1018 r = free_and_strdup(&arg_machine, optarg);
1019 if (r < 0)
1020 return log_oom();
1021 }
1022 break;
1023
1024 case ARG_HOSTNAME:
1025 if (isempty(optarg))
1026 arg_hostname = mfree(arg_hostname);
1027 else {
1028 if (!hostname_is_valid(optarg, 0))
1029 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1030 "Invalid hostname: %s", optarg);
1031
1032 r = free_and_strdup(&arg_hostname, optarg);
1033 if (r < 0)
1034 return log_oom();
1035 }
1036
1037 arg_settings_mask |= SETTING_HOSTNAME;
1038 break;
1039
1040 case 'Z':
1041 arg_selinux_context = optarg;
1042 break;
1043
1044 case 'L':
1045 arg_selinux_apifs_context = optarg;
1046 break;
1047
1048 case ARG_READ_ONLY:
1049 arg_read_only = true;
1050 arg_settings_mask |= SETTING_READ_ONLY;
1051 break;
1052
1053 case ARG_AMBIENT_CAPABILITY: {
1054 uint64_t m;
1055 r = parse_capability_spec(optarg, &m);
1056 if (r <= 0)
1057 return r;
1058 arg_caps_ambient |= m;
1059 arg_settings_mask |= SETTING_CAPABILITY;
1060 break;
1061 }
1062 case ARG_CAPABILITY:
1063 case ARG_DROP_CAPABILITY: {
1064 uint64_t m;
1065 r = parse_capability_spec(optarg, &m);
1066 if (r <= 0)
1067 return r;
1068
1069 if (c == ARG_CAPABILITY)
1070 plus |= m;
1071 else
1072 minus |= m;
1073 arg_settings_mask |= SETTING_CAPABILITY;
1074 break;
1075 }
1076 case ARG_NO_NEW_PRIVILEGES:
1077 r = parse_boolean(optarg);
1078 if (r < 0)
1079 return log_error_errno(r, "Failed to parse --no-new-privileges= argument: %s", optarg);
1080
1081 arg_no_new_privileges = r;
1082 arg_settings_mask |= SETTING_NO_NEW_PRIVILEGES;
1083 break;
1084
1085 case 'j':
1086 arg_link_journal = LINK_GUEST;
1087 arg_link_journal_try = true;
1088 arg_settings_mask |= SETTING_LINK_JOURNAL;
1089 break;
1090
1091 case ARG_LINK_JOURNAL:
1092 r = parse_link_journal(optarg, &arg_link_journal, &arg_link_journal_try);
1093 if (r < 0)
1094 return log_error_errno(r, "Failed to parse link journal mode %s", optarg);
1095
1096 arg_settings_mask |= SETTING_LINK_JOURNAL;
1097 break;
1098
1099 case ARG_BIND:
1100 case ARG_BIND_RO:
1101 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
1102 if (r < 0)
1103 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
1104
1105 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1106 break;
1107
1108 case ARG_TMPFS:
1109 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
1110 if (r < 0)
1111 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
1112
1113 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1114 break;
1115
1116 case ARG_OVERLAY:
1117 case ARG_OVERLAY_RO:
1118 r = overlay_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_OVERLAY_RO);
1119 if (r == -EADDRNOTAVAIL)
1120 return log_error_errno(r, "--overlay(-ro)= needs at least two colon-separated directories specified.");
1121 if (r < 0)
1122 return log_error_errno(r, "Failed to parse --overlay(-ro)= argument %s: %m", optarg);
1123
1124 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1125 break;
1126
1127 case ARG_INACCESSIBLE:
1128 r = inaccessible_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
1129 if (r < 0)
1130 return log_error_errno(r, "Failed to parse --inaccessible= argument %s: %m", optarg);
1131
1132 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1133 break;
1134
1135 case 'E':
1136 r = strv_env_replace_strdup_passthrough(&arg_setenv, optarg);
1137 if (r < 0)
1138 return log_error_errno(r, "Cannot assign environment variable %s: %m", optarg);
1139
1140 arg_settings_mask |= SETTING_ENVIRONMENT;
1141 break;
1142
1143 case 'q':
1144 arg_quiet = true;
1145 break;
1146
1147 case ARG_SHARE_SYSTEM:
1148 /* We don't officially support this anymore, except for compat reasons. People should use the
1149 * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
1150 log_warning("Please do not use --share-system anymore, use $SYSTEMD_NSPAWN_SHARE_* instead.");
1151 arg_clone_ns_flags = 0;
1152 break;
1153
1154 case ARG_REGISTER:
1155 r = parse_boolean(optarg);
1156 if (r < 0) {
1157 log_error("Failed to parse --register= argument: %s", optarg);
1158 return r;
1159 }
1160
1161 arg_register = r;
1162 break;
1163
1164 case ARG_KEEP_UNIT:
1165 arg_keep_unit = true;
1166 break;
1167
1168 case ARG_PERSONALITY:
1169
1170 arg_personality = personality_from_string(optarg);
1171 if (arg_personality == PERSONALITY_INVALID)
1172 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1173 "Unknown or unsupported personality '%s'.", optarg);
1174
1175 arg_settings_mask |= SETTING_PERSONALITY;
1176 break;
1177
1178 case ARG_VOLATILE:
1179
1180 if (!optarg)
1181 arg_volatile_mode = VOLATILE_YES;
1182 else if (streq(optarg, "help")) {
1183 DUMP_STRING_TABLE(volatile_mode, VolatileMode, _VOLATILE_MODE_MAX);
1184 return 0;
1185 } else {
1186 VolatileMode m;
1187
1188 m = volatile_mode_from_string(optarg);
1189 if (m < 0)
1190 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1191 "Failed to parse --volatile= argument: %s", optarg);
1192 else
1193 arg_volatile_mode = m;
1194 }
1195
1196 arg_settings_mask |= SETTING_VOLATILE_MODE;
1197 break;
1198
1199 case 'p':
1200 r = expose_port_parse(&arg_expose_ports, optarg);
1201 if (r == -EEXIST)
1202 return log_error_errno(r, "Duplicate port specification: %s", optarg);
1203 if (r < 0)
1204 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
1205
1206 arg_settings_mask |= SETTING_EXPOSE_PORTS;
1207 break;
1208
1209 case ARG_PROPERTY:
1210 if (strv_extend(&arg_property, optarg) < 0)
1211 return log_oom();
1212
1213 break;
1214
1215 case ARG_PRIVATE_USERS: {
1216 int boolean;
1217
1218 if (!optarg)
1219 boolean = true;
1220 else if (!in_charset(optarg, DIGITS))
1221 /* do *not* parse numbers as booleans */
1222 boolean = parse_boolean(optarg);
1223 else
1224 boolean = -1;
1225
1226 if (boolean == 0) {
1227 /* no: User namespacing off */
1228 arg_userns_mode = USER_NAMESPACE_NO;
1229 arg_uid_shift = UID_INVALID;
1230 arg_uid_range = UINT32_C(0x10000);
1231 } else if (boolean > 0) {
1232 /* yes: User namespacing on, UID range is read from root dir */
1233 arg_userns_mode = USER_NAMESPACE_FIXED;
1234 arg_uid_shift = UID_INVALID;
1235 arg_uid_range = UINT32_C(0x10000);
1236 } else if (streq(optarg, "pick")) {
1237 /* pick: User namespacing on, UID range is picked randomly */
1238 arg_userns_mode = USER_NAMESPACE_PICK; /* Note that arg_userns_ownership is
1239 * implied by USER_NAMESPACE_PICK
1240 * further down. */
1241 arg_uid_shift = UID_INVALID;
1242 arg_uid_range = UINT32_C(0x10000);
1243
1244 } else if (streq(optarg, "identity")) {
1245 /* identity: User namespaces on, UID range is map the 0…0xFFFF range to
1246 * itself, i.e. we don't actually map anything, but do take benefit of
1247 * isolation of capability sets. */
1248 arg_userns_mode = USER_NAMESPACE_FIXED;
1249 arg_uid_shift = 0;
1250 arg_uid_range = UINT32_C(0x10000);
1251 } else {
1252 _cleanup_free_ char *buffer = NULL;
1253 const char *range, *shift;
1254
1255 /* anything else: User namespacing on, UID range is explicitly configured */
1256
1257 range = strchr(optarg, ':');
1258 if (range) {
1259 buffer = strndup(optarg, range - optarg);
1260 if (!buffer)
1261 return log_oom();
1262 shift = buffer;
1263
1264 range++;
1265 r = safe_atou32(range, &arg_uid_range);
1266 if (r < 0)
1267 return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range);
1268 } else
1269 shift = optarg;
1270
1271 r = parse_uid(shift, &arg_uid_shift);
1272 if (r < 0)
1273 return log_error_errno(r, "Failed to parse UID \"%s\": %m", optarg);
1274
1275 arg_userns_mode = USER_NAMESPACE_FIXED;
1276
1277 if (!userns_shift_range_valid(arg_uid_shift, arg_uid_range))
1278 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "UID range cannot be empty or go beyond " UID_FMT ".", UID_INVALID);
1279 }
1280
1281 arg_settings_mask |= SETTING_USERNS;
1282 break;
1283 }
1284
1285 case 'U':
1286 if (userns_supported()) {
1287 arg_userns_mode = USER_NAMESPACE_PICK; /* Note that arg_userns_ownership is
1288 * implied by USER_NAMESPACE_PICK
1289 * further down. */
1290 arg_uid_shift = UID_INVALID;
1291 arg_uid_range = UINT32_C(0x10000);
1292
1293 arg_settings_mask |= SETTING_USERNS;
1294 }
1295
1296 break;
1297
1298 case ARG_PRIVATE_USERS_CHOWN:
1299 arg_userns_ownership = USER_NAMESPACE_OWNERSHIP_CHOWN;
1300
1301 arg_settings_mask |= SETTING_USERNS;
1302 break;
1303
1304 case ARG_PRIVATE_USERS_OWNERSHIP:
1305 if (streq(optarg, "help")) {
1306 DUMP_STRING_TABLE(user_namespace_ownership, UserNamespaceOwnership, _USER_NAMESPACE_OWNERSHIP_MAX);
1307 return 0;
1308 }
1309
1310 arg_userns_ownership = user_namespace_ownership_from_string(optarg);
1311 if (arg_userns_ownership < 0)
1312 return log_error_errno(arg_userns_ownership, "Cannot parse --user-namespace-ownership= value: %s", optarg);
1313
1314 arg_settings_mask |= SETTING_USERNS;
1315 break;
1316
1317 case ARG_KILL_SIGNAL:
1318 if (streq(optarg, "help")) {
1319 DUMP_STRING_TABLE(signal, int, _NSIG);
1320 return 0;
1321 }
1322
1323 arg_kill_signal = signal_from_string(optarg);
1324 if (arg_kill_signal < 0)
1325 return log_error_errno(arg_kill_signal, "Cannot parse signal: %s", optarg);
1326
1327 arg_settings_mask |= SETTING_KILL_SIGNAL;
1328 break;
1329
1330 case ARG_SETTINGS:
1331
1332 /* no → do not read files
1333 * yes → read files, do not override cmdline, trust only subset
1334 * override → read files, override cmdline, trust only subset
1335 * trusted → read files, do not override cmdline, trust all
1336 */
1337
1338 r = parse_boolean(optarg);
1339 if (r < 0) {
1340 if (streq(optarg, "trusted")) {
1341 mask_all_settings = false;
1342 mask_no_settings = false;
1343 arg_settings_trusted = true;
1344
1345 } else if (streq(optarg, "override")) {
1346 mask_all_settings = false;
1347 mask_no_settings = true;
1348 arg_settings_trusted = -1;
1349 } else
1350 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
1351 } else if (r > 0) {
1352 /* yes */
1353 mask_all_settings = false;
1354 mask_no_settings = false;
1355 arg_settings_trusted = -1;
1356 } else {
1357 /* no */
1358 mask_all_settings = true;
1359 mask_no_settings = false;
1360 arg_settings_trusted = false;
1361 }
1362
1363 break;
1364
1365 case ARG_CHDIR:
1366 if (!path_is_absolute(optarg))
1367 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1368 "Working directory %s is not an absolute path.", optarg);
1369
1370 r = free_and_strdup(&arg_chdir, optarg);
1371 if (r < 0)
1372 return log_oom();
1373
1374 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
1375 break;
1376
1377 case ARG_PIVOT_ROOT:
1378 r = pivot_root_parse(&arg_pivot_root_new, &arg_pivot_root_old, optarg);
1379 if (r < 0)
1380 return log_error_errno(r, "Failed to parse --pivot-root= argument %s: %m", optarg);
1381
1382 arg_settings_mask |= SETTING_PIVOT_ROOT;
1383 break;
1384
1385 case ARG_NOTIFY_READY:
1386 r = parse_boolean(optarg);
1387 if (r < 0)
1388 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1389 "%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
1390 arg_notify_ready = r;
1391 arg_settings_mask |= SETTING_NOTIFY_READY;
1392 break;
1393
1394 case ARG_ROOT_HASH: {
1395 _cleanup_free_ void *k = NULL;
1396 size_t l;
1397
1398 r = unhexmem(optarg, strlen(optarg), &k, &l);
1399 if (r < 0)
1400 return log_error_errno(r, "Failed to parse root hash: %s", optarg);
1401 if (l < sizeof(sd_id128_t))
1402 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Root hash must be at least 128-bit long: %s", optarg);
1403
1404 free_and_replace(arg_verity_settings.root_hash, k);
1405 arg_verity_settings.root_hash_size = l;
1406 break;
1407 }
1408
1409 case ARG_ROOT_HASH_SIG: {
1410 char *value;
1411 size_t l;
1412 void *p;
1413
1414 if ((value = startswith(optarg, "base64:"))) {
1415 r = unbase64mem(value, strlen(value), &p, &l);
1416 if (r < 0)
1417 return log_error_errno(r, "Failed to parse root hash signature '%s': %m", optarg);
1418
1419 } else {
1420 r = read_full_file(optarg, (char**) &p, &l);
1421 if (r < 0)
1422 return log_error_errno(r, "Failed parse root hash signature file '%s': %m", optarg);
1423 }
1424
1425 free_and_replace(arg_verity_settings.root_hash_sig, p);
1426 arg_verity_settings.root_hash_sig_size = l;
1427 break;
1428 }
1429
1430 case ARG_VERITY_DATA:
1431 r = parse_path_argument(optarg, false, &arg_verity_settings.data_path);
1432 if (r < 0)
1433 return r;
1434 break;
1435
1436 case ARG_SYSTEM_CALL_FILTER: {
1437 bool negative;
1438 const char *items;
1439
1440 negative = optarg[0] == '~';
1441 items = negative ? optarg + 1 : optarg;
1442
1443 for (;;) {
1444 _cleanup_free_ char *word = NULL;
1445
1446 r = extract_first_word(&items, &word, NULL, 0);
1447 if (r == 0)
1448 break;
1449 if (r == -ENOMEM)
1450 return log_oom();
1451 if (r < 0)
1452 return log_error_errno(r, "Failed to parse system call filter: %m");
1453
1454 if (negative)
1455 r = strv_extend(&arg_syscall_deny_list, word);
1456 else
1457 r = strv_extend(&arg_syscall_allow_list, word);
1458 if (r < 0)
1459 return log_oom();
1460 }
1461
1462 arg_settings_mask |= SETTING_SYSCALL_FILTER;
1463 break;
1464 }
1465
1466 case ARG_RLIMIT: {
1467 const char *eq;
1468 _cleanup_free_ char *name = NULL;
1469 int rl;
1470
1471 if (streq(optarg, "help")) {
1472 DUMP_STRING_TABLE(rlimit, int, _RLIMIT_MAX);
1473 return 0;
1474 }
1475
1476 eq = strchr(optarg, '=');
1477 if (!eq)
1478 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1479 "--rlimit= expects an '=' assignment.");
1480
1481 name = strndup(optarg, eq - optarg);
1482 if (!name)
1483 return log_oom();
1484
1485 rl = rlimit_from_string_harder(name);
1486 if (rl < 0)
1487 return log_error_errno(rl, "Unknown resource limit: %s", name);
1488
1489 if (!arg_rlimit[rl]) {
1490 arg_rlimit[rl] = new0(struct rlimit, 1);
1491 if (!arg_rlimit[rl])
1492 return log_oom();
1493 }
1494
1495 r = rlimit_parse(rl, eq + 1, arg_rlimit[rl]);
1496 if (r < 0)
1497 return log_error_errno(r, "Failed to parse resource limit: %s", eq + 1);
1498
1499 arg_settings_mask |= SETTING_RLIMIT_FIRST << rl;
1500 break;
1501 }
1502
1503 case ARG_OOM_SCORE_ADJUST:
1504 r = parse_oom_score_adjust(optarg, &arg_oom_score_adjust);
1505 if (r < 0)
1506 return log_error_errno(r, "Failed to parse --oom-score-adjust= parameter: %s", optarg);
1507
1508 arg_oom_score_adjust_set = true;
1509 arg_settings_mask |= SETTING_OOM_SCORE_ADJUST;
1510 break;
1511
1512 case ARG_CPU_AFFINITY: {
1513 CPUSet cpuset;
1514
1515 r = parse_cpu_set(optarg, &cpuset);
1516 if (r < 0)
1517 return log_error_errno(r, "Failed to parse CPU affinity mask %s: %m", optarg);
1518
1519 cpu_set_reset(&arg_cpu_set);
1520 arg_cpu_set = cpuset;
1521 arg_settings_mask |= SETTING_CPU_AFFINITY;
1522 break;
1523 }
1524
1525 case ARG_RESOLV_CONF:
1526 if (streq(optarg, "help")) {
1527 DUMP_STRING_TABLE(resolv_conf_mode, ResolvConfMode, _RESOLV_CONF_MODE_MAX);
1528 return 0;
1529 }
1530
1531 arg_resolv_conf = resolv_conf_mode_from_string(optarg);
1532 if (arg_resolv_conf < 0)
1533 return log_error_errno(arg_resolv_conf,
1534 "Failed to parse /etc/resolv.conf mode: %s", optarg);
1535
1536 arg_settings_mask |= SETTING_RESOLV_CONF;
1537 break;
1538
1539 case ARG_TIMEZONE:
1540 if (streq(optarg, "help")) {
1541 DUMP_STRING_TABLE(timezone_mode, TimezoneMode, _TIMEZONE_MODE_MAX);
1542 return 0;
1543 }
1544
1545 arg_timezone = timezone_mode_from_string(optarg);
1546 if (arg_timezone < 0)
1547 return log_error_errno(arg_timezone,
1548 "Failed to parse /etc/localtime mode: %s", optarg);
1549
1550 arg_settings_mask |= SETTING_TIMEZONE;
1551 break;
1552
1553 case ARG_CONSOLE:
1554 r = handle_arg_console(optarg);
1555 if (r <= 0)
1556 return r;
1557 break;
1558
1559 case 'P':
1560 case ARG_PIPE:
1561 r = handle_arg_console("pipe");
1562 if (r <= 0)
1563 return r;
1564 break;
1565
1566 case ARG_NO_PAGER:
1567 arg_pager_flags |= PAGER_DISABLE;
1568 break;
1569
1570 case ARG_SET_CREDENTIAL: {
1571 _cleanup_free_ char *word = NULL, *data = NULL;
1572 const char *p = optarg;
1573 Credential *a;
1574 ssize_t l;
1575
1576 r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
1577 if (r == -ENOMEM)
1578 return log_oom();
1579 if (r < 0)
1580 return log_error_errno(r, "Failed to parse --set-credential= parameter: %m");
1581 if (r == 0 || !p)
1582 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Missing value for --set-credential=: %s", optarg);
1583
1584 if (!credential_name_valid(word))
1585 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Credential name is not valid: %s", word);
1586
1587 for (size_t i = 0; i < arg_n_credentials; i++)
1588 if (streq(arg_credentials[i].id, word))
1589 return log_error_errno(SYNTHETIC_ERRNO(EEXIST), "Duplicate credential '%s', refusing.", word);
1590
1591 l = cunescape(p, UNESCAPE_ACCEPT_NUL, &data);
1592 if (l < 0)
1593 return log_error_errno(l, "Failed to unescape credential data: %s", p);
1594
1595 a = reallocarray(arg_credentials, arg_n_credentials + 1, sizeof(Credential));
1596 if (!a)
1597 return log_oom();
1598
1599 a[arg_n_credentials++] = (Credential) {
1600 .id = TAKE_PTR(word),
1601 .data = TAKE_PTR(data),
1602 .size = l,
1603 };
1604
1605 arg_credentials = a;
1606
1607 arg_settings_mask |= SETTING_CREDENTIALS;
1608 break;
1609 }
1610
1611 case ARG_LOAD_CREDENTIAL: {
1612 ReadFullFileFlags flags = READ_FULL_FILE_SECURE;
1613 _cleanup_(erase_and_freep) char *data = NULL;
1614 _cleanup_free_ char *word = NULL, *j = NULL;
1615 const char *p = optarg;
1616 Credential *a;
1617 size_t size, i;
1618
1619 r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
1620 if (r == -ENOMEM)
1621 return log_oom();
1622 if (r < 0)
1623 return log_error_errno(r, "Failed to parse --load-credential= parameter: %m");
1624 if (r == 0 || !p)
1625 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Missing value for --load-credential=: %s", optarg);
1626
1627 if (!credential_name_valid(word))
1628 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Credential name is not valid: %s", word);
1629
1630 for (i = 0; i < arg_n_credentials; i++)
1631 if (streq(arg_credentials[i].id, word))
1632 return log_error_errno(SYNTHETIC_ERRNO(EEXIST), "Duplicate credential '%s', refusing.", word);
1633
1634 if (path_is_absolute(p))
1635 flags |= READ_FULL_FILE_CONNECT_SOCKET;
1636 else {
1637 const char *e;
1638
1639 r = get_credentials_dir(&e);
1640 if (r < 0)
1641 return log_error_errno(r, "Credential not available (no credentials passed at all): %s", word);
1642
1643 j = path_join(e, p);
1644 if (!j)
1645 return log_oom();
1646 }
1647
1648 r = read_full_file_full(AT_FDCWD, j ?: p, UINT64_MAX, SIZE_MAX,
1649 flags,
1650 NULL,
1651 &data, &size);
1652 if (r < 0)
1653 return log_error_errno(r, "Failed to read credential '%s': %m", j ?: p);
1654
1655 a = reallocarray(arg_credentials, arg_n_credentials + 1, sizeof(Credential));
1656 if (!a)
1657 return log_oom();
1658
1659 a[arg_n_credentials++] = (Credential) {
1660 .id = TAKE_PTR(word),
1661 .data = TAKE_PTR(data),
1662 .size = size,
1663 };
1664
1665 arg_credentials = a;
1666
1667 arg_settings_mask |= SETTING_CREDENTIALS;
1668 break;
1669 }
1670
1671 case ARG_BIND_USER:
1672 if (!valid_user_group_name(optarg, 0))
1673 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid user name to bind: %s", optarg);
1674
1675 if (strv_extend(&arg_bind_user, optarg) < 0)
1676 return log_oom();
1677
1678 arg_settings_mask |= SETTING_BIND_USER;
1679 break;
1680
1681 case ARG_SUPPRESS_SYNC:
1682 r = parse_boolean_argument("--suppress-sync=", optarg, &arg_suppress_sync);
1683 if (r < 0)
1684 return r;
1685
1686 arg_settings_mask |= SETTING_SUPPRESS_SYNC;
1687 break;
1688
1689 case ARG_IMAGE_POLICY:
1690 r = parse_image_policy_argument(optarg, &arg_image_policy);
1691 if (r < 0)
1692 return r;
1693 break;
1694
1695 case '?':
1696 return -EINVAL;
1697
1698 default:
1699 assert_not_reached();
1700 }
1701
1702 if (argc > optind) {
1703 strv_free(arg_parameters);
1704 arg_parameters = strv_copy(argv + optind);
1705 if (!arg_parameters)
1706 return log_oom();
1707
1708 arg_settings_mask |= SETTING_START_MODE;
1709 }
1710
1711 if (arg_ephemeral && arg_template && !arg_directory)
1712 /* User asked for ephemeral execution but specified --template= instead of --directory=. Semantically
1713 * such an invocation makes some sense, see https://github.com/systemd/systemd/issues/3667. Let's
1714 * accept this here, and silently make "--ephemeral --template=" equivalent to "--ephemeral
1715 * --directory=". */
1716 arg_directory = TAKE_PTR(arg_template);
1717
1718 arg_caps_retain |= plus;
1719 arg_caps_retain |= arg_private_network ? UINT64_C(1) << CAP_NET_ADMIN : 0;
1720
1721 /* If we're not unsharing the network namespace and are unsharing the user namespace, we won't have
1722 * permissions to bind ports in the container, so let's drop the CAP_NET_BIND_SERVICE capability to
1723 * indicate that. */
1724 if (!arg_private_network && arg_userns_mode != USER_NAMESPACE_NO && arg_uid_shift > 0)
1725 arg_caps_retain &= ~(UINT64_C(1) << CAP_NET_BIND_SERVICE);
1726
1727 arg_caps_retain &= ~minus;
1728
1729 /* Make sure to parse environment before we reset the settings mask below */
1730 r = parse_environment();
1731 if (r < 0)
1732 return r;
1733
1734 /* Load all settings from .nspawn files */
1735 if (mask_no_settings)
1736 arg_settings_mask = 0;
1737
1738 /* Don't load any settings from .nspawn files */
1739 if (mask_all_settings)
1740 arg_settings_mask = _SETTINGS_MASK_ALL;
1741
1742 return 1;
1743 }
1744
1745 static int verify_arguments(void) {
1746 int r;
1747
1748 if (arg_start_mode == START_PID2 && arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
1749 /* If we are running the stub init in the container, we don't need to look at what the init
1750 * in the container supports, because we are not using it. Let's immediately pick the right
1751 * setting based on the host system configuration.
1752 *
1753 * We only do this, if the user didn't use an environment variable to override the detection.
1754 */
1755
1756 r = cg_all_unified();
1757 if (r < 0)
1758 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
1759 if (r > 0)
1760 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
1761 else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0)
1762 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
1763 else
1764 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
1765 }
1766
1767 if (arg_userns_mode != USER_NAMESPACE_NO)
1768 arg_mount_settings |= MOUNT_USE_USERNS;
1769
1770 if (arg_private_network)
1771 arg_mount_settings |= MOUNT_APPLY_APIVFS_NETNS;
1772
1773 if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
1774 !(arg_clone_ns_flags & CLONE_NEWUTS)) {
1775 arg_register = false;
1776 if (arg_start_mode != START_PID1)
1777 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--boot cannot be used without namespacing.");
1778 }
1779
1780 if (arg_userns_ownership < 0)
1781 arg_userns_ownership =
1782 arg_userns_mode == USER_NAMESPACE_PICK ? USER_NAMESPACE_OWNERSHIP_AUTO :
1783 USER_NAMESPACE_OWNERSHIP_OFF;
1784
1785 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
1786 arg_kill_signal = SIGRTMIN+3;
1787
1788 if (arg_volatile_mode != VOLATILE_NO) /* Make sure all file systems contained in the image are mounted read-only if we are in volatile mode */
1789 arg_read_only = true;
1790
1791 if (has_custom_root_mount(arg_custom_mounts, arg_n_custom_mounts))
1792 arg_read_only = true;
1793
1794 if (arg_keep_unit && arg_register && cg_pid_get_owner_uid(0, NULL) >= 0)
1795 /* Save the user from accidentally registering either user-$SESSION.scope or user@.service.
1796 * The latter is not technically a user session, but we don't need to labour the point. */
1797 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--keep-unit --register=yes may not be used when invoked from a user session.");
1798
1799 if (arg_directory && arg_image)
1800 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--directory= and --image= may not be combined.");
1801
1802 if (arg_template && arg_image)
1803 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= and --image= may not be combined.");
1804
1805 if (arg_template && !(arg_directory || arg_machine))
1806 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= needs --directory= or --machine=.");
1807
1808 if (arg_ephemeral && arg_template)
1809 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --template= may not be combined.");
1810
1811 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO))
1812 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --link-journal= may not be combined.");
1813
1814 if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported())
1815 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--private-users= is not supported, kernel compiled without user namespace support.");
1816
1817 if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_CHOWN && arg_read_only)
1818 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1819 "--read-only and --private-users-ownership=chown may not be combined.");
1820
1821 /* We don't support --private-users-ownership=chown together with any of the volatile modes since we
1822 * couldn't change the read-only part of the tree (i.e. /usr) anyway, or because it would trigger a
1823 * massive copy-up (in case of overlay) making the entire exercise pointless. */
1824 if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_CHOWN && arg_volatile_mode != VOLATILE_NO)
1825 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--volatile= and --private-users-ownership=chown may not be combined.");
1826
1827 /* If --network-namespace-path is given with any other network-related option (except --private-network),
1828 * we need to error out, to avoid conflicts between different network options. */
1829 if (arg_network_namespace_path &&
1830 (arg_network_interfaces || arg_network_macvlan ||
1831 arg_network_ipvlan || arg_network_veth_extra ||
1832 arg_network_bridge || arg_network_zone ||
1833 arg_network_veth))
1834 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--network-namespace-path= cannot be combined with other network options.");
1835
1836 if (arg_network_bridge && arg_network_zone)
1837 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1838 "--network-bridge= and --network-zone= may not be combined.");
1839
1840 if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network)
1841 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
1842
1843 if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO))
1844 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot combine --private-users with read-write mounts.");
1845
1846 if (arg_expose_ports && !arg_private_network)
1847 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot use --port= without private networking.");
1848
1849 if (arg_caps_ambient) {
1850 if (arg_caps_ambient == UINT64_MAX)
1851 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= does not support the value all.");
1852
1853 if ((arg_caps_ambient & arg_caps_retain) != arg_caps_ambient)
1854 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= setting is not fully covered by Capability= setting.");
1855
1856 if (arg_start_mode == START_BOOT)
1857 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= setting is not useful for boot mode.");
1858 }
1859
1860 if (arg_userns_mode == USER_NAMESPACE_NO && !strv_isempty(arg_bind_user))
1861 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--bind-user= requires --private-users");
1862
1863 /* Drop duplicate --bind-user= entries */
1864 strv_uniq(arg_bind_user);
1865
1866 r = custom_mount_check_all();
1867 if (r < 0)
1868 return r;
1869
1870 return 0;
1871 }
1872
1873 static int verify_network_interfaces_initialized(void) {
1874 int r;
1875 r = test_network_interfaces_initialized(arg_network_interfaces);
1876 if (r < 0)
1877 return r;
1878
1879 r = test_network_interfaces_initialized(arg_network_macvlan);
1880 if (r < 0)
1881 return r;
1882
1883 r = test_network_interfaces_initialized(arg_network_ipvlan);
1884 if (r < 0)
1885 return r;
1886
1887 return 0;
1888 }
1889
1890 int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1891 assert(p);
1892
1893 if (arg_userns_mode == USER_NAMESPACE_NO)
1894 return 0;
1895
1896 if (uid == UID_INVALID && gid == GID_INVALID)
1897 return 0;
1898
1899 if (uid != UID_INVALID) {
1900 uid += arg_uid_shift;
1901
1902 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1903 return -EOVERFLOW;
1904 }
1905
1906 if (gid != GID_INVALID) {
1907 gid += (gid_t) arg_uid_shift;
1908
1909 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1910 return -EOVERFLOW;
1911 }
1912
1913 return RET_NERRNO(lchown(p, uid, gid));
1914 }
1915
/* Creates directory 'path' below 'root' with the given mode and, if it was newly created, hands it to
 * userns_lchown() for ownership adjustment. A directory that already exists is left alone and counts
 * as success. Returns 0 on success, negative errno on failure. */
int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;
        int r;

        full = prefix_roota(root, path);

        r = RET_NERRNO(mkdir(full, mode));
        if (r >= 0)
                return userns_lchown(full, uid, gid);

        /* Pre-existing directories are not touched, neither mode nor ownership */
        return r == -EEXIST ? 0 : r;
}
1929
/* Returns whatever PATH_STARTSWITH_SET() yields for 'path' against the relative and the absolute
 * zoneinfo directory prefix. Callers in this file treat a NULL result as "does not point into
 * /usr/share/zoneinfo/". */
static const char *timezone_from_path(const char *path) {
        const char *zone;

        zone = PATH_STARTSWITH_SET(
                        path,
                        "../usr/share/zoneinfo/",
                        "/usr/share/zoneinfo/");

        return zone;
}
1936
1937 static bool etc_writable(void) {
1938 return !arg_read_only || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_OVERLAY);
1939 }
1940
1941 static int setup_timezone(const char *dest) {
1942 _cleanup_free_ char *p = NULL, *etc = NULL;
1943 const char *where, *check;
1944 TimezoneMode m;
1945 int r;
1946
1947 assert(dest);
1948
1949 if (IN_SET(arg_timezone, TIMEZONE_AUTO, TIMEZONE_SYMLINK)) {
1950 r = readlink_malloc("/etc/localtime", &p);
1951 if (r == -ENOENT && arg_timezone == TIMEZONE_AUTO)
1952 m = etc_writable() ? TIMEZONE_DELETE : TIMEZONE_OFF;
1953 else if (r == -EINVAL && arg_timezone == TIMEZONE_AUTO) /* regular file? */
1954 m = etc_writable() ? TIMEZONE_COPY : TIMEZONE_BIND;
1955 else if (r < 0) {
1956 log_warning_errno(r, "Failed to read host's /etc/localtime symlink, not updating container timezone: %m");
1957 /* To handle warning, delete /etc/localtime and replace it with a symbolic link to a time zone data
1958 * file.
1959 *
1960 * Example:
1961 * ln -s /usr/share/zoneinfo/UTC /etc/localtime
1962 */
1963 return 0;
1964 } else if (arg_timezone == TIMEZONE_AUTO)
1965 m = etc_writable() ? TIMEZONE_SYMLINK : TIMEZONE_BIND;
1966 else
1967 m = arg_timezone;
1968 } else
1969 m = arg_timezone;
1970
1971 if (m == TIMEZONE_OFF)
1972 return 0;
1973
1974 r = chase("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
1975 if (r < 0) {
1976 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
1977 return 0;
1978 }
1979
1980 where = strjoina(etc, "/localtime");
1981
1982 switch (m) {
1983
1984 case TIMEZONE_DELETE:
1985 if (unlink(where) < 0)
1986 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
1987
1988 return 0;
1989
1990 case TIMEZONE_SYMLINK: {
1991 _cleanup_free_ char *q = NULL;
1992 const char *z, *what;
1993
1994 z = timezone_from_path(p);
1995 if (!z) {
1996 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1997 return 0;
1998 }
1999
2000 r = readlink_malloc(where, &q);
2001 if (r >= 0 && streq_ptr(timezone_from_path(q), z))
2002 return 0; /* Already pointing to the right place? Then do nothing .. */
2003
2004 check = strjoina(dest, "/usr/share/zoneinfo/", z);
2005 r = chase(check, dest, 0, NULL, NULL);
2006 if (r < 0)
2007 log_debug_errno(r, "Timezone %s does not exist (or is not accessible) in container, not creating symlink: %m", z);
2008 else {
2009 if (unlink(where) < 0 && errno != ENOENT) {
2010 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING, /* Don't complain on read-only images */
2011 errno, "Failed to remove existing timezone info %s in container, ignoring: %m", where);
2012 return 0;
2013 }
2014
2015 what = strjoina("../usr/share/zoneinfo/", z);
2016 if (symlink(what, where) < 0) {
2017 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING,
2018 errno, "Failed to correct timezone of container, ignoring: %m");
2019 return 0;
2020 }
2021
2022 break;
2023 }
2024
2025 _fallthrough_;
2026 }
2027
2028 case TIMEZONE_BIND: {
2029 _cleanup_free_ char *resolved = NULL;
2030 int found;
2031
2032 found = chase(where, dest, CHASE_NONEXISTENT, &resolved, NULL);
2033 if (found < 0) {
2034 log_warning_errno(found, "Failed to resolve /etc/localtime path in container, ignoring: %m");
2035 return 0;
2036 }
2037
2038 if (found == 0) /* missing? */
2039 (void) touch(resolved);
2040
2041 r = mount_nofollow_verbose(LOG_WARNING, "/etc/localtime", resolved, NULL, MS_BIND, NULL);
2042 if (r >= 0)
2043 return mount_nofollow_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
2044
2045 _fallthrough_;
2046 }
2047
2048 case TIMEZONE_COPY:
2049 /* If mounting failed, try to copy */
2050 r = copy_file_atomic("/etc/localtime", where, 0644, COPY_REFLINK|COPY_REPLACE);
2051 if (r < 0) {
2052 log_full_errno(IN_SET(r, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
2053 "Failed to copy /etc/localtime to %s, ignoring: %m", where);
2054 return 0;
2055 }
2056
2057 break;
2058
2059 default:
2060 assert_not_reached();
2061 }
2062
2063 /* Fix permissions of the symlink or file copy we just created */
2064 r = userns_lchown(where, 0, 0);
2065 if (r < 0)
2066 log_warning_errno(r, "Failed to chown /etc/localtime, ignoring: %m");
2067
2068 return 0;
2069 }
2070
/* Checks whether 'path' exists. Returns 1 if access() succeeds, 0 if it fails with ENOENT, and a
 * negative errno (logged at debug level) on any other failure. */
static int have_resolv_conf(const char *path) {
        assert(path);

        if (access(path, F_OK) >= 0)
                return 1;

        if (errno == ENOENT)
                return 0;

        return log_debug_errno(errno, "Failed to determine whether '%s' is available: %m", path);
}
2083
2084 static int resolved_listening(void) {
2085 _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
2086 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
2087 _cleanup_free_ char *dns_stub_listener_mode = NULL;
2088 int r;
2089
2090 /* Check if resolved is listening */
2091
2092 r = sd_bus_open_system(&bus);
2093 if (r < 0)
2094 return log_debug_errno(r, "Failed to open system bus: %m");
2095
2096 r = bus_name_has_owner(bus, "org.freedesktop.resolve1", NULL);
2097 if (r < 0)
2098 return log_debug_errno(r, "Failed to check whether the 'org.freedesktop.resolve1' bus name is taken: %m");
2099 if (r == 0)
2100 return 0;
2101
2102 r = bus_get_property_string(bus, bus_resolve_mgr, "DNSStubListener", &error, &dns_stub_listener_mode);
2103 if (r < 0)
2104 return log_debug_errno(r, "Failed to query DNSStubListener property: %s", bus_error_message(&error, r));
2105
2106 return STR_IN_SET(dns_stub_listener_mode, "udp", "yes");
2107 }
2108
2109 static int setup_resolv_conf(const char *dest) {
2110 _cleanup_free_ char *etc = NULL;
2111 const char *where, *what;
2112 ResolvConfMode m;
2113 int r;
2114
2115 assert(dest);
2116
2117 if (arg_resolv_conf == RESOLV_CONF_AUTO) {
2118 if (arg_private_network)
2119 m = RESOLV_CONF_OFF;
2120 else if (have_resolv_conf(PRIVATE_STUB_RESOLV_CONF) > 0 && resolved_listening() > 0)
2121 m = etc_writable() ? RESOLV_CONF_COPY_STUB : RESOLV_CONF_BIND_STUB;
2122 else if (have_resolv_conf("/etc/resolv.conf") > 0)
2123 m = etc_writable() ? RESOLV_CONF_COPY_HOST : RESOLV_CONF_BIND_HOST;
2124 else
2125 m = etc_writable() ? RESOLV_CONF_DELETE : RESOLV_CONF_OFF;
2126
2127 } else
2128 m = arg_resolv_conf;
2129
2130 if (m == RESOLV_CONF_OFF)
2131 return 0;
2132
2133 r = chase("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
2134 if (r < 0) {
2135 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
2136 return 0;
2137 }
2138
2139 where = strjoina(etc, "/resolv.conf");
2140
2141 if (m == RESOLV_CONF_DELETE) {
2142 if (unlink(where) < 0)
2143 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
2144
2145 return 0;
2146 }
2147
2148 if (IN_SET(m, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_REPLACE_STATIC, RESOLV_CONF_COPY_STATIC))
2149 what = PRIVATE_STATIC_RESOLV_CONF;
2150 else if (IN_SET(m, RESOLV_CONF_BIND_UPLINK, RESOLV_CONF_REPLACE_UPLINK, RESOLV_CONF_COPY_UPLINK))
2151 what = PRIVATE_UPLINK_RESOLV_CONF;
2152 else if (IN_SET(m, RESOLV_CONF_BIND_STUB, RESOLV_CONF_REPLACE_STUB, RESOLV_CONF_COPY_STUB))
2153 what = PRIVATE_STUB_RESOLV_CONF;
2154 else
2155 what = "/etc/resolv.conf";
2156
2157 if (IN_SET(m, RESOLV_CONF_BIND_HOST, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_BIND_UPLINK, RESOLV_CONF_BIND_STUB)) {
2158 _cleanup_free_ char *resolved = NULL;
2159 int found;
2160
2161 found = chase(where, dest, CHASE_NONEXISTENT|CHASE_NOFOLLOW, &resolved, NULL);
2162 if (found < 0) {
2163 log_warning_errno(found, "Failed to resolve /etc/resolv.conf path in container, ignoring: %m");
2164 return 0;
2165 }
2166
2167 if (found == 0) /* missing? */
2168 (void) touch(resolved);
2169
2170 r = mount_nofollow_verbose(LOG_WARNING, what, resolved, NULL, MS_BIND, NULL);
2171 if (r >= 0)
2172 return mount_nofollow_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
2173
2174 /* If that didn't work, let's copy the file */
2175 }
2176
2177 if (IN_SET(m, RESOLV_CONF_REPLACE_HOST, RESOLV_CONF_REPLACE_STATIC, RESOLV_CONF_REPLACE_UPLINK, RESOLV_CONF_REPLACE_STUB))
2178 r = copy_file_atomic(what, where, 0644, COPY_REFLINK|COPY_REPLACE);
2179 else
2180 r = copy_file(what, where, O_TRUNC|O_NOFOLLOW, 0644, COPY_REFLINK);
2181 if (r < 0) {
2182 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
2183 * resolved or something similar runs inside and the symlink points there.
2184 *
2185 * If the disk image is read-only, there's also no point in complaining.
2186 */
2187 log_full_errno(!IN_SET(RESOLV_CONF_COPY_HOST, RESOLV_CONF_COPY_STATIC, RESOLV_CONF_COPY_UPLINK, RESOLV_CONF_COPY_STUB) &&
2188 IN_SET(r, -ELOOP, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
2189 "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where);
2190 return 0;
2191 }
2192
2193 r = userns_lchown(where, 0, 0);
2194 if (r < 0)
2195 log_warning_errno(r, "Failed to chown /etc/resolv.conf, ignoring: %m");
2196
2197 return 0;
2198 }
2199
2200 static int setup_boot_id(void) {
2201 _cleanup_(unlink_and_freep) char *from = NULL;
2202 _cleanup_free_ char *path = NULL;
2203 sd_id128_t rnd = SD_ID128_NULL;
2204 const char *to;
2205 int r;
2206
2207 /* Generate a new randomized boot ID, so that each boot-up of the container gets a new one */
2208
2209 r = tempfn_random_child("/run", "proc-sys-kernel-random-boot-id", &path);
2210 if (r < 0)
2211 return log_error_errno(r, "Failed to generate random boot ID path: %m");
2212
2213 r = sd_id128_randomize(&rnd);
2214 if (r < 0)
2215 return log_error_errno(r, "Failed to generate random boot id: %m");
2216
2217 r = id128_write(path, ID128_FORMAT_UUID, rnd);
2218 if (r < 0)
2219 return log_error_errno(r, "Failed to write boot id: %m");
2220
2221 from = TAKE_PTR(path);
2222 to = "/proc/sys/kernel/random/boot_id";
2223
2224 r = mount_nofollow_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
2225 if (r < 0)
2226 return r;
2227
2228 return mount_nofollow_verbose(LOG_ERR, NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
2229 }
2230
2231 static int copy_devnodes(const char *dest) {
2232 static const char devnodes[] =
2233 "null\0"
2234 "zero\0"
2235 "full\0"
2236 "random\0"
2237 "urandom\0"
2238 "tty\0"
2239 "net/tun\0";
2240
2241 int r = 0;
2242
2243 assert(dest);
2244
2245 BLOCK_WITH_UMASK(0000);
2246
2247 /* Create /dev/net, so that we can create /dev/net/tun in it */
2248 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
2249 return log_error_errno(r, "Failed to create /dev/net directory: %m");
2250
2251 NULSTR_FOREACH(d, devnodes) {
2252 _cleanup_free_ char *from = NULL, *to = NULL;
2253 struct stat st;
2254
2255 from = path_join("/dev/", d);
2256 if (!from)
2257 return log_oom();
2258
2259 to = path_join(dest, from);
2260 if (!to)
2261 return log_oom();
2262
2263 if (stat(from, &st) < 0) {
2264
2265 if (errno != ENOENT)
2266 return log_error_errno(errno, "Failed to stat %s: %m", from);
2267
2268 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode))
2269 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2270 "%s is not a char or block device, cannot copy.", from);
2271 else {
2272 _cleanup_free_ char *sl = NULL, *prefixed = NULL, *dn = NULL, *t = NULL;
2273
2274 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
2275 /* Explicitly warn the user when /dev is already populated. */
2276 if (errno == EEXIST)
2277 log_notice("%s/dev is pre-mounted and pre-populated. If a pre-mounted /dev is provided it needs to be an unpopulated file system.", dest);
2278 if (errno != EPERM)
2279 return log_error_errno(errno, "mknod(%s) failed: %m", to);
2280
2281 /* Some systems abusively restrict mknod but allow bind mounts. */
2282 r = touch(to);
2283 if (r < 0)
2284 return log_error_errno(r, "touch (%s) failed: %m", to);
2285 r = mount_nofollow_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL);
2286 if (r < 0)
2287 return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to);
2288 }
2289
2290 r = userns_lchown(to, 0, 0);
2291 if (r < 0)
2292 return log_error_errno(r, "chown() of device node %s failed: %m", to);
2293
2294 dn = path_join("/dev", S_ISCHR(st.st_mode) ? "char" : "block");
2295 if (!dn)
2296 return log_oom();
2297
2298 r = userns_mkdir(dest, dn, 0755, 0, 0);
2299 if (r < 0)
2300 return log_error_errno(r, "Failed to create '%s': %m", dn);
2301
2302 if (asprintf(&sl, "%s/%u:%u", dn, major(st.st_rdev), minor(st.st_rdev)) < 0)
2303 return log_oom();
2304
2305 prefixed = path_join(dest, sl);
2306 if (!prefixed)
2307 return log_oom();
2308
2309 t = path_join("..", d);
2310 if (!t)
2311 return log_oom();
2312
2313 if (symlink(t, prefixed) < 0)
2314 log_debug_errno(errno, "Failed to symlink '%s' to '%s': %m", t, prefixed);
2315 }
2316 }
2317
2318 return r;
2319 }
2320
2321 static int make_extra_nodes(const char *dest) {
2322 size_t i;
2323 int r;
2324
2325 BLOCK_WITH_UMASK(0000);
2326
2327 for (i = 0; i < arg_n_extra_nodes; i++) {
2328 _cleanup_free_ char *path = NULL;
2329 DeviceNode *n = arg_extra_nodes + i;
2330
2331 path = path_join(dest, n->path);
2332 if (!path)
2333 return log_oom();
2334
2335 if (mknod(path, n->mode, S_ISCHR(n->mode) || S_ISBLK(n->mode) ? makedev(n->major, n->minor) : 0) < 0)
2336 return log_error_errno(errno, "Failed to create device node '%s': %m", path);
2337
2338 r = chmod_and_chown(path, n->mode, n->uid, n->gid);
2339 if (r < 0)
2340 return log_error_errno(r, "Failed to adjust device node ownership of '%s': %m", path);
2341 }
2342
2343 return 0;
2344 }
2345
2346 static int setup_pts(const char *dest) {
2347 _cleanup_free_ char *options = NULL;
2348 const char *p;
2349 int r;
2350
2351 #if HAVE_SELINUX
2352 if (arg_selinux_apifs_context)
2353 (void) asprintf(&options,
2354 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
2355 arg_uid_shift + TTY_GID,
2356 arg_selinux_apifs_context);
2357 else
2358 #endif
2359 (void) asprintf(&options,
2360 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
2361 arg_uid_shift + TTY_GID);
2362
2363 if (!options)
2364 return log_oom();
2365
2366 /* Mount /dev/pts itself */
2367 p = prefix_roota(dest, "/dev/pts");
2368 r = RET_NERRNO(mkdir(p, 0755));
2369 if (r < 0)
2370 return log_error_errno(r, "Failed to create /dev/pts: %m");
2371
2372 r = mount_nofollow_verbose(LOG_ERR, "devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options);
2373 if (r < 0)
2374 return r;
2375 r = userns_lchown(p, 0, 0);
2376 if (r < 0)
2377 return log_error_errno(r, "Failed to chown /dev/pts: %m");
2378
2379 /* Create /dev/ptmx symlink */
2380 p = prefix_roota(dest, "/dev/ptmx");
2381 if (symlink("pts/ptmx", p) < 0)
2382 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
2383 r = userns_lchown(p, 0, 0);
2384 if (r < 0)
2385 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
2386
2387 /* And fix /dev/pts/ptmx ownership */
2388 p = prefix_roota(dest, "/dev/pts/ptmx");
2389 r = userns_lchown(p, 0, 0);
2390 if (r < 0)
2391 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
2392
2393 return 0;
2394 }
2395
2396 static int setup_stdio_as_dev_console(void) {
2397 _cleanup_close_ int terminal = -EBADF;
2398 int r;
2399
2400 /* We open the TTY in O_NOCTTY mode, so that we do not become controller yet. We'll do that later
2401 * explicitly, if we are configured to. */
2402 terminal = open_terminal("/dev/console", O_RDWR|O_NOCTTY);
2403 if (terminal < 0)
2404 return log_error_errno(terminal, "Failed to open console: %m");
2405
2406 /* Make sure we can continue logging to the original stderr, even if
2407 * stderr points elsewhere now */
2408 r = log_dup_console();
2409 if (r < 0)
2410 return log_error_errno(r, "Failed to duplicate stderr: %m");
2411
2412 /* invalidates 'terminal' on success and failure */
2413 r = rearrange_stdio(terminal, terminal, terminal);
2414 TAKE_FD(terminal);
2415 if (r < 0)
2416 return log_error_errno(r, "Failed to move console to stdin/stdout/stderr: %m");
2417
2418 return 0;
2419 }
2420
2421 static int setup_dev_console(const char *console) {
2422 _cleanup_free_ char *p = NULL;
2423 int r;
2424
2425 /* Create /dev/console symlink */
2426 r = path_make_relative("/dev", console, &p);
2427 if (r < 0)
2428 return log_error_errno(r, "Failed to create relative path: %m");
2429
2430 if (symlink(p, "/dev/console") < 0)
2431 return log_error_errno(errno, "Failed to create /dev/console symlink: %m");
2432
2433 return 0;
2434 }
2435
2436 static int setup_keyring(void) {
2437 key_serial_t keyring;
2438
2439 /* Allocate a new session keyring for the container. This makes sure the keyring of the session
2440 * systemd-nspawn was invoked from doesn't leak into the container. Note that by default we block
2441 * keyctl() and request_key() anyway via seccomp so doing this operation isn't strictly necessary,
2442 * but in case people explicitly allow-list these system calls let's make sure we don't leak anything
2443 * into the container. */
2444
2445 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2446 if (keyring == -1) {
2447 if (errno == ENOSYS)
2448 log_debug_errno(errno, "Kernel keyring not supported, ignoring.");
2449 else if (ERRNO_IS_PRIVILEGE(errno))
2450 log_debug_errno(errno, "Kernel keyring access prohibited, ignoring.");
2451 else
2452 return log_error_errno(errno, "Setting up kernel keyring failed: %m");
2453 }
2454
2455 return 0;
2456 }
2457
2458 static int setup_credentials(const char *root) {
2459 const char *q;
2460 int r;
2461
2462 if (arg_n_credentials <= 0)
2463 return 0;
2464
2465 r = userns_mkdir(root, "/run/host", 0755, 0, 0);
2466 if (r < 0)
2467 return log_error_errno(r, "Failed to create /run/host: %m");
2468
2469 r = userns_mkdir(root, "/run/host/credentials", 0700, 0, 0);
2470 if (r < 0)
2471 return log_error_errno(r, "Failed to create /run/host/credentials: %m");
2472
2473 q = prefix_roota(root, "/run/host/credentials");
2474 r = mount_nofollow_verbose(LOG_ERR, NULL, q, "ramfs", MS_NOSUID|MS_NOEXEC|MS_NODEV, "mode=0700");
2475 if (r < 0)
2476 return r;
2477
2478 for (size_t i = 0; i < arg_n_credentials; i++) {
2479 _cleanup_free_ char *j = NULL;
2480 _cleanup_close_ int fd = -EBADF;
2481
2482 j = path_join(q, arg_credentials[i].id);
2483 if (!j)
2484 return log_oom();
2485
2486 fd = open(j, O_CREAT|O_EXCL|O_WRONLY|O_CLOEXEC|O_NOFOLLOW, 0600);
2487 if (fd < 0)
2488 return log_error_errno(errno, "Failed to create credential file %s: %m", j);
2489
2490 r = loop_write(fd, arg_credentials[i].data, arg_credentials[i].size);
2491 if (r < 0)
2492 return log_error_errno(r, "Failed to write credential to file %s: %m", j);
2493
2494 if (fchmod(fd, 0400) < 0)
2495 return log_error_errno(errno, "Failed to adjust access mode of %s: %m", j);
2496
2497 if (arg_userns_mode != USER_NAMESPACE_NO) {
2498 if (fchown(fd, arg_uid_shift, arg_uid_shift) < 0)
2499 return log_error_errno(errno, "Failed to adjust ownership of %s: %m", j);
2500 }
2501 }
2502
2503 if (chmod(q, 0500) < 0)
2504 return log_error_errno(errno, "Failed to adjust access mode of %s: %m", q);
2505
2506 r = userns_lchown(q, 0, 0);
2507 if (r < 0)
2508 return r;
2509
2510 /* Make both mount and superblock read-only now */
2511 r = mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_REMOUNT|MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
2512 if (r < 0)
2513 return r;
2514
2515 return mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, "mode=0500");
2516 }
2517
2518 static int setup_kmsg(int fd_inner_socket) {
2519 _cleanup_(unlink_and_freep) char *from = NULL;
2520 _cleanup_free_ char *fifo = NULL;
2521 _cleanup_close_ int fd = -EBADF;
2522 int r;
2523
2524 assert(fd_inner_socket >= 0);
2525
2526 BLOCK_WITH_UMASK(0000);
2527
2528 /* We create the kmsg FIFO as a temporary file in /run, but immediately delete it after bind mounting it to
2529 * /proc/kmsg. While FIFOs on the reading side behave very similar to /proc/kmsg, their writing side behaves
2530 * differently from /dev/kmsg in that writing blocks when nothing is reading. In order to avoid any problems
2531 * with containers deadlocking due to this we simply make /dev/kmsg unavailable to the container. */
2532
2533 r = tempfn_random_child("/run", "proc-kmsg", &fifo);
2534 if (r < 0)
2535 return log_error_errno(r, "Failed to generate kmsg path: %m");
2536
2537 if (mkfifo(fifo, 0600) < 0)
2538 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
2539
2540 from = TAKE_PTR(fifo);
2541
2542 r = mount_nofollow_verbose(LOG_ERR, from, "/proc/kmsg", NULL, MS_BIND, NULL);
2543 if (r < 0)
2544 return r;
2545
2546 fd = open(from, O_RDWR|O_NONBLOCK|O_CLOEXEC);
2547 if (fd < 0)
2548 return log_error_errno(errno, "Failed to open fifo: %m");
2549
2550 /* Store away the fd in the socket, so that it stays open as long as we run the child */
2551 r = send_one_fd(fd_inner_socket, fd, 0);
2552 if (r < 0)
2553 return log_error_errno(r, "Failed to send FIFO fd: %m");
2554
2555 return 0;
2556 }
2557
2558 struct ExposeArgs {
2559 union in_addr_union address4;
2560 union in_addr_union address6;
2561 struct FirewallContext *fw_ctx;
2562 };
2563
2564 static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
2565 struct ExposeArgs *args = ASSERT_PTR(userdata);
2566
2567 assert(rtnl);
2568 assert(m);
2569
2570 (void) expose_port_execute(rtnl, &args->fw_ctx, arg_expose_ports, AF_INET, &args->address4);
2571 (void) expose_port_execute(rtnl, &args->fw_ctx, arg_expose_ports, AF_INET6, &args->address6);
2572 return 0;
2573 }
2574
2575 static int setup_hostname(void) {
2576 int r;
2577
2578 if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
2579 return 0;
2580
2581 r = sethostname_idempotent(arg_hostname ?: arg_machine);
2582 if (r < 0)
2583 return log_error_errno(r, "Failed to set hostname: %m");
2584
2585 return 0;
2586 }
2587
2588 static int setup_journal(const char *directory) {
2589 _cleanup_free_ char *d = NULL;
2590 const char *p, *q;
2591 sd_id128_t this_id;
2592 bool try;
2593 int r;
2594
2595 /* Don't link journals in ephemeral mode */
2596 if (arg_ephemeral)
2597 return 0;
2598
2599 if (arg_link_journal == LINK_NO)
2600 return 0;
2601
2602 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
2603
2604 r = sd_id128_get_machine(&this_id);
2605 if (r < 0)
2606 return log_error_errno(r, "Failed to retrieve machine ID: %m");
2607
2608 if (sd_id128_equal(arg_uuid, this_id)) {
2609 log_full(try ? LOG_WARNING : LOG_ERR,
2610 "Host and machine ids are equal (%s): refusing to link journals", SD_ID128_TO_STRING(arg_uuid));
2611 if (try)
2612 return 0;
2613 return -EEXIST;
2614 }
2615
2616 FOREACH_STRING(dirname, "/var", "/var/log", "/var/log/journal") {
2617 r = userns_mkdir(directory, dirname, 0755, 0, 0);
2618 if (r < 0) {
2619 bool ignore = r == -EROFS && try;
2620 log_full_errno(ignore ? LOG_DEBUG : LOG_ERR, r,
2621 "Failed to create %s%s: %m", dirname, ignore ? ", ignoring" : "");
2622 return ignore ? 0 : r;
2623 }
2624 }
2625
2626 p = strjoina("/var/log/journal/", SD_ID128_TO_STRING(arg_uuid));
2627 q = prefix_roota(directory, p);
2628
2629 if (path_is_mount_point(p, NULL, 0) > 0) {
2630 if (try)
2631 return 0;
2632
2633 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2634 "%s: already a mount point, refusing to use for journal", p);
2635 }
2636
2637 if (path_is_mount_point(q, NULL, 0) > 0) {
2638 if (try)
2639 return 0;
2640
2641 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2642 "%s: already a mount point, refusing to use for journal", q);
2643 }
2644
2645 r = readlink_and_make_absolute(p, &d);
2646 if (r >= 0) {
2647 if (IN_SET(arg_link_journal, LINK_GUEST, LINK_AUTO) &&
2648 path_equal(d, q)) {
2649
2650 r = userns_mkdir(directory, p, 0755, 0, 0);
2651 if (r < 0)
2652 log_warning_errno(r, "Failed to create directory %s: %m", q);
2653 return 0;
2654 }
2655
2656 if (unlink(p) < 0)
2657 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
2658 } else if (r == -EINVAL) {
2659
2660 if (arg_link_journal == LINK_GUEST &&
2661 rmdir(p) < 0) {
2662
2663 if (errno == ENOTDIR) {
2664 log_error("%s already exists and is neither a symlink nor a directory", p);
2665 return r;
2666 } else
2667 return log_error_errno(errno, "Failed to remove %s: %m", p);
2668 }
2669 } else if (r != -ENOENT)
2670 return log_error_errno(r, "readlink(%s) failed: %m", p);
2671
2672 if (arg_link_journal == LINK_GUEST) {
2673
2674 if (symlink(q, p) < 0) {
2675 if (try) {
2676 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
2677 return 0;
2678 } else
2679 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
2680 }
2681
2682 r = userns_mkdir(directory, p, 0755, 0, 0);
2683 if (r < 0)
2684 log_warning_errno(r, "Failed to create directory %s: %m", q);
2685 return 0;
2686 }
2687
2688 if (arg_link_journal == LINK_HOST) {
2689 /* don't create parents here — if the host doesn't have
2690 * permanent journal set up, don't force it here */
2691
2692 r = RET_NERRNO(mkdir(p, 0755));
2693 if (r < 0 && r != -EEXIST) {
2694 if (try) {
2695 log_debug_errno(r, "Failed to create %s, skipping journal setup: %m", p);
2696 return 0;
2697 } else
2698 return log_error_errno(r, "Failed to create %s: %m", p);
2699 }
2700
2701 } else if (access(p, F_OK) < 0)
2702 return 0;
2703
2704 if (dir_is_empty(q, /* ignore_hidden_or_backup= */ false) == 0)
2705 log_warning("%s is not empty, proceeding anyway.", q);
2706
2707 r = userns_mkdir(directory, p, 0755, 0, 0);
2708 if (r < 0)
2709 return log_error_errno(r, "Failed to create %s: %m", q);
2710
2711 r = mount_nofollow_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL);
2712 if (r < 0)
2713 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
2714
2715 return 0;
2716 }
2717
2718 static int drop_capabilities(uid_t uid) {
2719 CapabilityQuintet q;
2720
2721 /* Let's initialize all five capability sets to something valid. If the quintet was configured via
2722 * OCI use that, but fill in missing bits. If it wasn't then derive the quintet in full from
2723 * arg_caps_retain. */
2724
2725 if (capability_quintet_is_set(&arg_full_capabilities)) {
2726 q = arg_full_capabilities;
2727
2728 if (q.bounding == UINT64_MAX)
2729 q.bounding = uid == 0 ? arg_caps_retain : 0;
2730
2731 if (q.effective == UINT64_MAX)
2732 q.effective = uid == 0 ? q.bounding : 0;
2733
2734 if (q.inheritable == UINT64_MAX)
2735 q.inheritable = uid == 0 ? q.bounding : arg_caps_ambient;
2736
2737 if (q.permitted == UINT64_MAX)
2738 q.permitted = uid == 0 ? q.bounding : arg_caps_ambient;
2739
2740 if (q.ambient == UINT64_MAX && ambient_capabilities_supported())
2741 q.ambient = arg_caps_ambient;
2742
2743 if (capability_quintet_mangle(&q))
2744 return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Cannot set capabilities that are not in the current bounding set.");
2745
2746 } else {
2747 q = (CapabilityQuintet) {
2748 .bounding = arg_caps_retain,
2749 .effective = uid == 0 ? arg_caps_retain : 0,
2750 .inheritable = uid == 0 ? arg_caps_retain : arg_caps_ambient,
2751 .permitted = uid == 0 ? arg_caps_retain : arg_caps_ambient,
2752 .ambient = ambient_capabilities_supported() ? arg_caps_ambient : UINT64_MAX,
2753 };
2754
2755 /* If we're not using OCI, proceed with mangled capabilities (so we don't error out)
2756 * in order to maintain the same behavior as systemd < 242. */
2757 if (capability_quintet_mangle(&q))
2758 log_full(arg_quiet ? LOG_DEBUG : LOG_WARNING,
2759 "Some capabilities will not be set because they are not in the current bounding set.");
2760
2761 }
2762
2763 return capability_quintet_enforce(&q);
2764 }
2765
2766 static int reset_audit_loginuid(void) {
2767 _cleanup_free_ char *p = NULL;
2768 int r;
2769
2770 if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
2771 return 0;
2772
2773 r = read_one_line_file("/proc/self/loginuid", &p);
2774 if (r == -ENOENT)
2775 return 0;
2776 if (r < 0)
2777 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2778
2779 /* Already reset? */
2780 if (streq(p, "4294967295"))
2781 return 0;
2782
2783 r = write_string_file("/proc/self/loginuid", "4294967295", WRITE_STRING_FILE_DISABLE_BUFFER);
2784 if (r < 0) {
2785 log_error_errno(r,
2786 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2787 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2788 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2789 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2790 "using systemd-nspawn. Sleeping for 5s... (%m)");
2791
2792 sleep(5);
2793 }
2794
2795 return 0;
2796 }
2797
2798 static int mount_tunnel_dig(const char *root) {
2799 const char *p, *q;
2800 int r;
2801
2802 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2803 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2804 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
2805 (void) mkdir_p(p, 0600);
2806
2807 r = userns_mkdir(root, "/run/host", 0755, 0, 0);
2808 if (r < 0)
2809 return log_error_errno(r, "Failed to create /run/host: %m");
2810
2811 r = userns_mkdir(root, NSPAWN_MOUNT_TUNNEL, 0600, 0, 0);
2812 if (r < 0)
2813 return log_error_errno(r, "Failed to create "NSPAWN_MOUNT_TUNNEL": %m");
2814
2815 q = prefix_roota(root, NSPAWN_MOUNT_TUNNEL);
2816 r = mount_nofollow_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
2817 if (r < 0)
2818 return r;
2819
2820 r = mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
2821 if (r < 0)
2822 return r;
2823
2824 return 0;
2825 }
2826
2827 static int mount_tunnel_open(void) {
2828 int r;
2829
2830 r = mount_follow_verbose(LOG_ERR, NULL, NSPAWN_MOUNT_TUNNEL, NULL, MS_SLAVE, NULL);
2831 if (r < 0)
2832 return r;
2833
2834 return 0;
2835 }
2836
2837 static int setup_machine_id(const char *directory) {
2838 int r;
2839
2840 /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
2841 * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
2842 * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
2843 * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
2844 * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
2845 * container behaves nicely). */
2846
2847 r = id128_get_machine(directory, &arg_uuid);
2848 if (ERRNO_IS_NEG_MACHINE_ID_UNSET(r)) {
2849 /* If the file is missing, empty, or uninitialized, we don't mind */
2850 if (sd_id128_is_null(arg_uuid)) {
2851 r = sd_id128_randomize(&arg_uuid);
2852 if (r < 0)
2853 return log_error_errno(r, "Failed to acquire randomized machine UUID: %m");
2854 }
2855 } else if (r < 0)
2856 return log_error_errno(r, "Failed to read machine ID from container image: %m");
2857
2858 return 0;
2859 }
2860
2861 static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
2862 int r;
2863
2864 assert(directory);
2865
2866 if (arg_userns_mode == USER_NAMESPACE_NO || arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_CHOWN)
2867 return 0;
2868
2869 r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
2870 if (r == -EOPNOTSUPP)
2871 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
2872 if (r == -EBADE)
2873 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
2874 if (r < 0)
2875 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
2876 if (r == 0)
2877 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
2878 else
2879 log_debug("Patched directory tree to match UID/GID range.");
2880
2881 return r;
2882 }
2883
2884 /*
2885 * Return values:
2886 * < 0 : wait_for_terminate() failed to get the state of the
2887 * container, the container was terminated by a signal, or
2888 * failed for an unknown reason. No change is made to the
2889 * container argument.
2890 * > 0 : The program executed in the container terminated with an
2891 * error. The exit code of the program executed in the
2892 * container is returned. The container argument has been set
2893 * to CONTAINER_TERMINATED.
2894 * 0 : The container is being rebooted, has been shut down or exited
2895 * successfully. The container argument has been set to either
2896 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2897 *
2898 * That is, success is indicated by a return value of zero, and an
2899 * error is indicated by a non-zero value.
2900 */
2901 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2902 siginfo_t status;
2903 int r;
2904
2905 r = wait_for_terminate(pid, &status);
2906 if (r < 0)
2907 return log_warning_errno(r, "Failed to wait for container: %m");
2908
2909 switch (status.si_code) {
2910
2911 case CLD_EXITED:
2912 if (status.si_status == 0)
2913 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
2914 else
2915 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2916
2917 *container = CONTAINER_TERMINATED;
2918 return status.si_status;
2919
2920 case CLD_KILLED:
2921 if (status.si_status == SIGINT) {
2922 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
2923 *container = CONTAINER_TERMINATED;
2924 return 0;
2925
2926 } else if (status.si_status == SIGHUP) {
2927 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
2928 *container = CONTAINER_REBOOTED;
2929 return 0;
2930 }
2931
2932 _fallthrough_;
2933 case CLD_DUMPED:
2934 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2935 "Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2936
2937 default:
2938 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2939 "Container %s failed due to unknown reason.", arg_machine);
2940 }
2941 }
2942
2943 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2944 pid_t pid;
2945
2946 pid = PTR_TO_PID(userdata);
2947 if (pid > 0) {
2948 if (kill(pid, arg_kill_signal) >= 0) {
2949 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2950 sd_event_source_set_userdata(s, NULL);
2951 return 0;
2952 }
2953 }
2954
2955 sd_event_exit(sd_event_source_get_event(s), 0);
2956 return 0;
2957 }
2958
2959 static int on_sigchld(sd_event_source *s, const struct signalfd_siginfo *ssi, void *userdata) {
2960 pid_t pid;
2961
2962 assert(s);
2963 assert(ssi);
2964
2965 pid = PTR_TO_PID(userdata);
2966
2967 for (;;) {
2968 siginfo_t si = {};
2969
2970 if (waitid(P_ALL, 0, &si, WNOHANG|WNOWAIT|WEXITED) < 0)
2971 return log_error_errno(errno, "Failed to waitid(): %m");
2972 if (si.si_pid == 0) /* No pending children. */
2973 break;
2974 if (si.si_pid == pid) {
2975 /* The main process we care for has exited. Return from
2976 * signal handler but leave the zombie. */
2977 sd_event_exit(sd_event_source_get_event(s), 0);
2978 break;
2979 }
2980
2981 /* Reap all other children. */
2982 (void) waitid(P_PID, si.si_pid, &si, WNOHANG|WEXITED);
2983 }
2984
2985 return 0;
2986 }
2987
2988 static int on_request_stop(sd_bus_message *m, void *userdata, sd_bus_error *error) {
2989 pid_t pid;
2990
2991 assert(m);
2992
2993 pid = PTR_TO_PID(userdata);
2994
2995 if (arg_kill_signal > 0) {
2996 log_info("Container termination requested. Attempting to halt container.");
2997 (void) kill(pid, arg_kill_signal);
2998 } else {
2999 log_info("Container termination requested. Exiting.");
3000 sd_event_exit(sd_bus_get_event(sd_bus_message_get_bus(m)), 0);
3001 }
3002
3003 return 0;
3004 }
3005
3006 static int determine_names(void) {
3007 int r;
3008
3009 if (arg_template && !arg_directory && arg_machine) {
3010
3011 /* If --template= was specified then we should not
3012 * search for a machine, but instead create a new one
3013 * in /var/lib/machine. */
3014
3015 arg_directory = path_join("/var/lib/machines", arg_machine);
3016 if (!arg_directory)
3017 return log_oom();
3018 }
3019
3020 if (!arg_image && !arg_directory) {
3021 if (arg_machine) {
3022 _cleanup_(image_unrefp) Image *i = NULL;
3023
3024 r = image_find(IMAGE_MACHINE, arg_machine, NULL, &i);
3025 if (r == -ENOENT)
3026 return log_error_errno(r, "No image for machine '%s'.", arg_machine);
3027 if (r < 0)
3028 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
3029
3030 if (IN_SET(i->type, IMAGE_RAW, IMAGE_BLOCK))
3031 r = free_and_strdup(&arg_image, i->path);
3032 else
3033 r = free_and_strdup(&arg_directory, i->path);
3034 if (r < 0)
3035 return log_oom();
3036
3037 if (!arg_ephemeral)
3038 arg_read_only = arg_read_only || i->read_only;
3039 } else {
3040 r = safe_getcwd(&arg_directory);
3041 if (r < 0)
3042 return log_error_errno(r, "Failed to determine current directory: %m");
3043 }
3044
3045 if (!arg_directory && !arg_image)
3046 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine path, please use -D or -i.");
3047 }
3048
3049 if (!arg_machine) {
3050 if (arg_directory && path_equal(arg_directory, "/"))
3051 arg_machine = gethostname_malloc();
3052 else if (arg_image) {
3053 char *e;
3054
3055 r = path_extract_filename(arg_image, &arg_machine);
3056 if (r < 0)
3057 return log_error_errno(r, "Failed to extract file name from '%s': %m", arg_image);
3058
3059 /* Truncate suffix if there is one */
3060 e = endswith(arg_machine, ".raw");
3061 if (e)
3062 *e = 0;
3063 } else {
3064 r = path_extract_filename(arg_directory, &arg_machine);
3065 if (r < 0)
3066 return log_error_errno(r, "Failed to extract file name from '%s': %m", arg_directory);
3067 }
3068
3069 hostname_cleanup(arg_machine);
3070 if (!hostname_is_valid(arg_machine, 0))
3071 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine machine name automatically, please use -M.");
3072
3073 /* Copy the machine name before the random suffix is added below, otherwise we won't be able
3074 * to match fixed config file names. */
3075 arg_settings_filename = strjoin(arg_machine, ".nspawn");
3076 if (!arg_settings_filename)
3077 return log_oom();
3078
3079 /* Add a random suffix when this is an ephemeral machine, so that we can run many
3080 * instances at once without manually having to specify -M each time. */
3081 if (arg_ephemeral)
3082 if (strextendf(&arg_machine, "-%016" PRIx64, random_u64()) < 0)
3083 return log_oom();
3084 } else {
3085 arg_settings_filename = strjoin(arg_machine, ".nspawn");
3086 if (!arg_settings_filename)
3087 return log_oom();
3088 }
3089
3090 return 0;
3091 }
3092
/* Resolves the path stored in *p via chase() using the given flags and, on success, replaces *p with
 * the resolved path. A NULL *p is left alone. Returns a negative errno-style value if resolution
 * fails (already logged). */
static int chase_and_update(char **p, unsigned flags) {
        char *resolved;
        int r;

        assert(p);

        /* Nothing configured, nothing to resolve. */
        if (!*p)
                return 0;

        r = chase(*p, NULL, flags, &resolved, NULL);
        if (r < 0)
                return log_error_errno(r, "Failed to resolve path %s: %m", *p);

        return free_and_replace(*p, resolved);
}
3108
3109 static int determine_uid_shift(const char *directory) {
3110
3111 if (arg_userns_mode == USER_NAMESPACE_NO) {
3112 arg_uid_shift = 0;
3113 return 0;
3114 }
3115
3116 if (arg_uid_shift == UID_INVALID) {
3117 struct stat st;
3118
3119 /* Read the UID shift off the image. Maybe we can reuse this to avoid chowning. */
3120
3121 if (stat(directory, &st) < 0)
3122 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
3123
3124 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
3125
3126 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000)))
3127 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
3128 "UID and GID base of %s don't match.", directory);
3129
3130 arg_uid_range = UINT32_C(0x10000);
3131
3132 if (arg_uid_shift != 0) {
3133 /* If the image is shifted already, then we'll fall back to classic chowning, for
3134 * compatibility (and simplicity), or refuse if mapping is explicitly requested. */
3135
3136 if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_AUTO) {
3137 log_debug("UID base of %s is non-zero, not using UID mapping.", directory);
3138 arg_userns_ownership = USER_NAMESPACE_OWNERSHIP_CHOWN;
3139 } else if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_MAP)
3140 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
3141 "UID base of %s is not zero, UID mapping not supported.", directory);
3142 }
3143 }
3144
3145 if (!userns_shift_range_valid(arg_uid_shift, arg_uid_range))
3146 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "UID base too high for UID range.");
3147
3148 return 0;
3149 }
3150
3151 static unsigned long effective_clone_ns_flags(void) {
3152 unsigned long flags = arg_clone_ns_flags;
3153
3154 if (arg_private_network)
3155 flags |= CLONE_NEWNET;
3156 if (arg_use_cgns)
3157 flags |= CLONE_NEWCGROUP;
3158 if (arg_userns_mode != USER_NAMESPACE_NO)
3159 flags |= CLONE_NEWUSER;
3160
3161 return flags;
3162 }
3163
3164 static int patch_sysctl(void) {
3165
3166 /* This table is inspired by runc's sysctl() function */
3167 static const struct {
3168 const char *key;
3169 bool prefix;
3170 unsigned long clone_flags;
3171 } safe_sysctl[] = {
3172 { "kernel.hostname", false, CLONE_NEWUTS },
3173 { "kernel.domainname", false, CLONE_NEWUTS },
3174 { "kernel.msgmax", false, CLONE_NEWIPC },
3175 { "kernel.msgmnb", false, CLONE_NEWIPC },
3176 { "kernel.msgmni", false, CLONE_NEWIPC },
3177 { "kernel.sem", false, CLONE_NEWIPC },
3178 { "kernel.shmall", false, CLONE_NEWIPC },
3179 { "kernel.shmmax", false, CLONE_NEWIPC },
3180 { "kernel.shmmni", false, CLONE_NEWIPC },
3181 { "fs.mqueue.", true, CLONE_NEWIPC },
3182 { "net.", true, CLONE_NEWNET },
3183 };
3184
3185 unsigned long flags;
3186 int r;
3187
3188 flags = effective_clone_ns_flags();
3189
3190 STRV_FOREACH_PAIR(k, v, arg_sysctl) {
3191 bool good = false;
3192 size_t i;
3193
3194 for (i = 0; i < ELEMENTSOF(safe_sysctl); i++) {
3195
3196 if (!FLAGS_SET(flags, safe_sysctl[i].clone_flags))
3197 continue;
3198
3199 if (safe_sysctl[i].prefix)
3200 good = startswith(*k, safe_sysctl[i].key);
3201 else
3202 good = streq(*k, safe_sysctl[i].key);
3203
3204 if (good)
3205 break;
3206 }
3207
3208 if (!good)
3209 return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Refusing to write to sysctl '%s', as it is not safe in the selected namespaces.", *k);
3210
3211 r = sysctl_write(*k, *v);
3212 if (r < 0)
3213 return log_error_errno(r, "Failed to write sysctl '%s': %m", *k);
3214 }
3215
3216 return 0;
3217 }
3218
3219 static int inner_child(
3220 Barrier *barrier,
3221 int fd_inner_socket,
3222 FDSet *fds,
3223 char **os_release_pairs) {
3224
3225 _cleanup_free_ char *home = NULL;
3226 size_t n_env = 1;
3227 char *envp[] = {
3228 (char*) "PATH=" DEFAULT_PATH_COMPAT,
3229 NULL, /* container */
3230 NULL, /* TERM */
3231 NULL, /* HOME */
3232 NULL, /* USER */
3233 NULL, /* LOGNAME */
3234 NULL, /* container_uuid */
3235 NULL, /* LISTEN_FDS */
3236 NULL, /* LISTEN_PID */
3237 NULL, /* NOTIFY_SOCKET */
3238 NULL, /* CREDENTIALS_DIRECTORY */
3239 NULL, /* LANG */
3240 NULL
3241 };
3242 const char *exec_target;
3243 _cleanup_strv_free_ char **env_use = NULL;
3244 int r, which_failed;
3245
3246 /* This is the "inner" child process, i.e. the one forked off by the "outer" child process, which is the one
3247 * the container manager itself forked off. At the time of clone() it gained its own CLONE_NEWNS, CLONE_NEWPID,
3248 * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER namespaces. Note that it has its own CLONE_NEWNS namespace,
3249 * separate from the CLONE_NEWNS created for the "outer" child, and also separate from the host's CLONE_NEWNS
3250 * namespace. The reason for having two levels of CLONE_NEWNS namespaces is that the "inner" one is owned by
3251 * the CLONE_NEWUSER namespace of the container, while the "outer" one is owned by the host's CLONE_NEWUSER
3252 * namespace.
3253 *
3254 * Note at this point we have no CLONE_NEWNET namespace yet. We'll acquire that one later through
3255 * unshare(). See below. */
3256
3257 assert(barrier);
3258 assert(fd_inner_socket >= 0);
3259
3260 log_debug("Inner child is initializing.");
3261
3262 if (arg_userns_mode != USER_NAMESPACE_NO) {
3263 /* Tell the parent, that it now can write the UID map. */
3264 (void) barrier_place(barrier); /* #1 */
3265
3266 /* Wait until the parent wrote the UID map */
3267 if (!barrier_place_and_sync(barrier)) /* #2 */
3268 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Parent died too early");
3269
3270 /* Become the new root user inside our namespace */
3271 r = reset_uid_gid();
3272 if (r < 0)
3273 return log_error_errno(r, "Couldn't become new root: %m");
3274
3275 /* Creating a new user namespace means all MS_SHARED mounts become MS_SLAVE. Let's put them
3276 * back to MS_SHARED here, since that's what we want as defaults. (This will not reconnect
3277 * propagation, but simply create new peer groups for all our mounts). */
3278 r = mount_follow_verbose(LOG_ERR, NULL, "/", NULL, MS_SHARED|MS_REC, NULL);
3279 if (r < 0)
3280 return r;
3281 }
3282
3283 r = mount_all(NULL,
3284 arg_mount_settings | MOUNT_IN_USERNS,
3285 arg_uid_shift,
3286 arg_selinux_apifs_context);
3287 if (r < 0)
3288 return r;
3289
3290 if (!arg_network_namespace_path && arg_private_network) {
3291 r = unshare(CLONE_NEWNET);
3292 if (r < 0)
3293 return log_error_errno(errno, "Failed to unshare network namespace: %m");
3294
3295 /* Tell the parent that it can setup network interfaces. */
3296 (void) barrier_place(barrier); /* #3 */
3297 }
3298
3299 r = mount_sysfs(NULL, arg_mount_settings);
3300 if (r < 0)
3301 return r;
3302
3303 /* Wait until we are cgroup-ified, so that we
3304 * can mount the right cgroup path writable */
3305 if (!barrier_place_and_sync(barrier)) /* #4 */
3306 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
3307 "Parent died too early");
3308
3309 if (arg_use_cgns) {
3310 r = unshare(CLONE_NEWCGROUP);
3311 if (r < 0)
3312 return log_error_errno(errno, "Failed to unshare cgroup namespace: %m");
3313 r = mount_cgroups(
3314 "",
3315 arg_unified_cgroup_hierarchy,
3316 arg_userns_mode != USER_NAMESPACE_NO,
3317 arg_uid_shift,
3318 arg_uid_range,
3319 arg_selinux_apifs_context,
3320 true);
3321 } else
3322 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
3323 if (r < 0)
3324 return r;
3325
3326 r = setup_boot_id();
3327 if (r < 0)
3328 return r;
3329
3330 r = setup_kmsg(fd_inner_socket);
3331 if (r < 0)
3332 return r;
3333
3334 r = mount_custom(
3335 "/",
3336 arg_custom_mounts,
3337 arg_n_custom_mounts,
3338 0,
3339 0,
3340 arg_selinux_apifs_context,
3341 MOUNT_NON_ROOT_ONLY | MOUNT_IN_USERNS);
3342 if (r < 0)
3343 return r;
3344
3345 if (setsid() < 0)
3346 return log_error_errno(errno, "setsid() failed: %m");
3347
3348 if (arg_private_network)
3349 (void) loopback_setup();
3350
3351 if (arg_expose_ports) {
3352 r = expose_port_send_rtnl(fd_inner_socket);
3353 if (r < 0)
3354 return r;
3355 }
3356
3357 if (arg_console_mode != CONSOLE_PIPE) {
3358 _cleanup_close_ int master = -EBADF;
3359 _cleanup_free_ char *console = NULL;
3360
3361 /* Allocate a pty and make it available as /dev/console. */
3362 master = openpt_allocate(O_RDWR|O_NONBLOCK, &console);
3363 if (master < 0)
3364 return log_error_errno(master, "Failed to allocate a pty: %m");
3365
3366 r = setup_dev_console(console);
3367 if (r < 0)
3368 return log_error_errno(r, "Failed to set up /dev/console: %m");
3369
3370 r = send_one_fd(fd_inner_socket, master, 0);
3371 if (r < 0)
3372 return log_error_errno(r, "Failed to send master fd: %m");
3373
3374 r = setup_stdio_as_dev_console();
3375 if (r < 0)
3376 return r;
3377 }
3378
3379 r = patch_sysctl();
3380 if (r < 0)
3381 return r;
3382
3383 if (arg_oom_score_adjust_set) {
3384 r = set_oom_score_adjust(arg_oom_score_adjust);
3385 if (r < 0)
3386 return log_error_errno(r, "Failed to adjust OOM score: %m");
3387 }
3388
3389 if (arg_cpu_set.set)
3390 if (sched_setaffinity(0, arg_cpu_set.allocated, arg_cpu_set.set) < 0)
3391 return log_error_errno(errno, "Failed to set CPU affinity: %m");
3392
3393 (void) setup_hostname();
3394
3395 if (arg_personality != PERSONALITY_INVALID) {
3396 r = safe_personality(arg_personality);
3397 if (r < 0)
3398 return log_error_errno(r, "personality() failed: %m");
3399 #ifdef ARCHITECTURE_SECONDARY
3400 } else if (arg_architecture == ARCHITECTURE_SECONDARY) {
3401 r = safe_personality(PER_LINUX32);
3402 if (r < 0)
3403 return log_error_errno(r, "personality() failed: %m");
3404 #endif
3405 } else if (!arg_quiet && arg_architecture >= 0 && arg_architecture != native_architecture())
3406 log_notice("Selected architecture '%s' not supported natively on the local CPU, assuming "
3407 "invocation with qemu userspace emulator (or equivalent) in effect.",
3408 architecture_to_string(arg_architecture));
3409
3410 r = setrlimit_closest_all((const struct rlimit *const*) arg_rlimit, &which_failed);
3411 if (r < 0)
3412 return log_error_errno(r, "Failed to apply resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
3413
3414 #if HAVE_SECCOMP
3415 if (arg_seccomp) {
3416
3417 if (is_seccomp_available()) {
3418 r = seccomp_load(arg_seccomp);
3419 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
3420 return log_error_errno(r, "Failed to install seccomp filter: %m");
3421 if (r < 0)
3422 log_debug_errno(r, "Failed to install seccomp filter: %m");
3423 }
3424 } else
3425 #endif
3426 {
3427 r = setup_seccomp(arg_caps_retain, arg_syscall_allow_list, arg_syscall_deny_list);
3428 if (r < 0)
3429 return r;
3430 }
3431
3432 if (arg_suppress_sync) {
3433 #if HAVE_SECCOMP
3434 r = seccomp_suppress_sync();
3435 if (r < 0)
3436 log_debug_errno(r, "Failed to install sync() suppression seccomp filter, ignoring: %m");
3437 #else
3438 log_debug("systemd is built without SECCOMP support. Ignoring --suppress-sync= command line option and SuppressSync= setting.");
3439 #endif
3440 }
3441
3442 #if HAVE_SELINUX
3443 if (arg_selinux_context)
3444 if (setexeccon(arg_selinux_context) < 0)
3445 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
3446 #endif
3447
3448 /* Make sure we keep the caps across the uid/gid dropping, so that we can retain some selected caps
3449 * if we need to later on. */
3450 if (prctl(PR_SET_KEEPCAPS, 1) < 0)
3451 return log_error_errno(errno, "Failed to set PR_SET_KEEPCAPS: %m");
3452
3453 if (uid_is_valid(arg_uid) || gid_is_valid(arg_gid))
3454 r = change_uid_gid_raw(arg_uid, arg_gid, arg_supplementary_gids, arg_n_supplementary_gids, arg_console_mode != CONSOLE_PIPE);
3455 else
3456 r = change_uid_gid(arg_user, arg_console_mode != CONSOLE_PIPE, &home);
3457 if (r < 0)
3458 return r;
3459
3460 r = drop_capabilities(getuid());
3461 if (r < 0)
3462 return log_error_errno(r, "Dropping capabilities failed: %m");
3463
3464 if (arg_no_new_privileges)
3465 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
3466 return log_error_errno(errno, "Failed to disable new privileges: %m");
3467
3468 /* LXC sets container=lxc, so follow the scheme here */
3469 envp[n_env++] = strjoina("container=", arg_container_service_name);
3470
3471 envp[n_env] = strv_find_prefix(environ, "TERM=");
3472 if (envp[n_env])
3473 n_env++;
3474
3475 if (home || !uid_is_valid(arg_uid) || arg_uid == 0)
3476 if (asprintf(envp + n_env++, "HOME=%s", home ?: "/root") < 0)
3477 return log_oom();
3478
3479 if (arg_user || !uid_is_valid(arg_uid) || arg_uid == 0)
3480 if (asprintf(envp + n_env++, "USER=%s", arg_user ?: "root") < 0 ||
3481 asprintf(envp + n_env++, "LOGNAME=%s", arg_user ?: "root") < 0)
3482 return log_oom();
3483
3484 assert(!sd_id128_is_null(arg_uuid));
3485
3486 if (asprintf(envp + n_env++, "container_uuid=%s", SD_ID128_TO_UUID_STRING(arg_uuid)) < 0)
3487 return log_oom();
3488
3489 if (fdset_size(fds) > 0) {
3490 r = fdset_cloexec(fds, false);
3491 if (r < 0)
3492 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
3493
3494 if ((asprintf(envp + n_env++, "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
3495 (asprintf(envp + n_env++, "LISTEN_PID=1") < 0))
3496 return log_oom();
3497 }
3498 if (asprintf(envp + n_env++, "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
3499 return log_oom();
3500
3501 if (arg_n_credentials > 0) {
3502 envp[n_env] = strdup("CREDENTIALS_DIRECTORY=/run/host/credentials");
3503 if (!envp[n_env])
3504 return log_oom();
3505 n_env++;
3506 }
3507
3508 if (arg_start_mode != START_BOOT) {
3509 envp[n_env] = strdup("LANG=" SYSTEMD_NSPAWN_LOCALE);
3510 if (!envp[n_env])
3511 return log_oom();
3512 n_env++;
3513 }
3514
3515 env_use = strv_env_merge(envp, os_release_pairs, arg_setenv);
3516 if (!env_use)
3517 return log_oom();
3518
3519 /* Let the parent know that we are ready and wait until the parent is ready with the setup, too... */
3520 if (!barrier_place_and_sync(barrier)) /* #5 */
3521 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Parent died too early");
3522
3523 if (arg_chdir)
3524 if (chdir(arg_chdir) < 0)
3525 return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
3526
3527 if (arg_start_mode == START_PID2) {
3528 r = stub_pid1(arg_uuid);
3529 if (r < 0)
3530 return r;
3531 }
3532
3533 if (arg_console_mode != CONSOLE_PIPE) {
3534 /* So far our pty wasn't controlled by any process. Finally, it's time to change that, if we
3535 * are configured for that. Acquire it as controlling tty. */
3536 if (ioctl(STDIN_FILENO, TIOCSCTTY) < 0)
3537 return log_error_errno(errno, "Failed to acquire controlling TTY: %m");
3538 }
3539
3540 log_debug("Inner child completed, invoking payload.");
3541
3542 /* Now, explicitly close the log, so that we then can close all remaining fds. Closing the log explicitly first
3543 * has the benefit that the logging subsystem knows about it, and is thus ready to be reopened should we need
3544 * it again. Note that the other fds closed here are at least the locking and barrier fds. */
3545 log_close();
3546 log_set_open_when_needed(true);
3547 log_settle_target();
3548
3549 (void) fdset_close_others(fds);
3550
3551 if (arg_start_mode == START_BOOT) {
3552 char **a;
3553 size_t m;
3554
3555 /* Automatically search for the init system */
3556
3557 m = strv_length(arg_parameters);
3558 a = newa(char*, m + 2);
3559 memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
3560 a[1 + m] = NULL;
3561
3562 FOREACH_STRING(init,
3563 "/usr/lib/systemd/systemd",
3564 "/lib/systemd/systemd",
3565 "/sbin/init") {
3566 a[0] = (char*) init;
3567 execve(a[0], a, env_use);
3568 }
3569
3570 exec_target = "/usr/lib/systemd/systemd, /lib/systemd/systemd, /sbin/init";
3571 } else if (!strv_isempty(arg_parameters)) {
3572 const char *dollar_path;
3573
3574 exec_target = arg_parameters[0];
3575
3576 /* Use the user supplied search $PATH if there is one, or DEFAULT_PATH_COMPAT if not to search the
3577 * binary. */
3578 dollar_path = strv_env_get(env_use, "PATH");
3579 if (dollar_path) {
3580 if (setenv("PATH", dollar_path, 1) < 0)
3581 return log_error_errno(errno, "Failed to update $PATH: %m");
3582 }
3583
3584 execvpe(arg_parameters[0], arg_parameters, env_use);
3585 } else {
3586 if (!arg_chdir)
3587 /* If we cannot change the directory, we'll end up in /, that is expected. */
3588 (void) chdir(home ?: "/root");
3589
3590 execle(DEFAULT_USER_SHELL, "-" DEFAULT_USER_SHELL_NAME, NULL, env_use);
3591 if (!streq(DEFAULT_USER_SHELL, "/bin/bash"))
3592 execle("/bin/bash", "-bash", NULL, env_use);
3593 if (!streq(DEFAULT_USER_SHELL, "/bin/sh"))
3594 execle("/bin/sh", "-sh", NULL, env_use);
3595
3596 exec_target = DEFAULT_USER_SHELL ", /bin/bash, /bin/sh";
3597 }
3598
3599 return log_error_errno(errno, "execv(%s) failed: %m", exec_target);
3600 }
3601
3602 static int setup_notify_child(void) {
3603 _cleanup_close_ int fd = -EBADF;
3604 static const union sockaddr_union sa = {
3605 .un.sun_family = AF_UNIX,
3606 .un.sun_path = NSPAWN_NOTIFY_SOCKET_PATH,
3607 };
3608 int r;
3609
3610 fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
3611 if (fd < 0)
3612 return log_error_errno(errno, "Failed to allocate notification socket: %m");
3613
3614 (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
3615 (void) sockaddr_un_unlink(&sa.un);
3616
3617 r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
3618 if (r < 0)
3619 return log_error_errno(errno, "bind(" NSPAWN_NOTIFY_SOCKET_PATH ") failed: %m");
3620
3621 r = userns_lchown(NSPAWN_NOTIFY_SOCKET_PATH, 0, 0);
3622 if (r < 0)
3623 return log_error_errno(r, "Failed to chown " NSPAWN_NOTIFY_SOCKET_PATH ": %m");
3624
3625 r = setsockopt_int(fd, SOL_SOCKET, SO_PASSCRED, true);
3626 if (r < 0)
3627 return log_error_errno(r, "SO_PASSCRED failed: %m");
3628
3629 return TAKE_FD(fd);
3630 }
3631
3632 static int outer_child(
3633 Barrier *barrier,
3634 const char *directory,
3635 DissectedImage *dissected_image,
3636 int fd_outer_socket,
3637 int fd_inner_socket,
3638 FDSet *fds,
3639 int netns_fd) {
3640
3641 _cleanup_(bind_user_context_freep) BindUserContext *bind_user_context = NULL;
3642 _cleanup_strv_free_ char **os_release_pairs = NULL;
3643 _cleanup_close_ int fd = -EBADF, mntns_fd = -EBADF;
3644 bool idmap = false;
3645 const char *p;
3646 pid_t pid;
3647 ssize_t l;
3648 int r;
3649
3650 /* This is the "outer" child process, i.e the one forked off by the container manager itself. It
3651 * already has its own CLONE_NEWNS namespace (which was created by the clone()). It still lives in
3652 * the host's CLONE_NEWPID, CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER and CLONE_NEWNET
3653 * namespaces. After it completed a number of initializations a second child (the "inner" one) is
3654 * forked off it, and it exits. */
3655
3656 assert(barrier);
3657 assert(directory);
3658 assert(fd_outer_socket >= 0);
3659 assert(fd_inner_socket >= 0);
3660
3661 log_debug("Outer child is initializing.");
3662
3663 r = load_os_release_pairs_with_prefix("/", "container_host_", &os_release_pairs);
3664 if (r < 0)
3665 log_debug_errno(r, "Failed to read os-release from host for container, ignoring: %m");
3666
3667 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
3668 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3669
3670 r = reset_audit_loginuid();
3671 if (r < 0)
3672 return r;
3673
3674 /* Mark everything as slave, so that we still receive mounts from the real root, but don't propagate
3675 * mounts to the real root. */
3676 r = mount_follow_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
3677 if (r < 0)
3678 return r;
3679
3680 if (dissected_image) {
3681 /* If we are operating on a disk image, then mount its root directory now, but leave out the
3682 * rest. We can read the UID shift from it if we need to. Further down we'll mount the rest,
3683 * but then with the uid shift known. That way we can mount VFAT file systems shifted to the
3684 * right place right away. This makes sure ESP partitions and userns are compatible. */
3685
3686 r = dissected_image_mount_and_warn(
3687 dissected_image,
3688 directory,
3689 arg_uid_shift,
3690 arg_uid_range,
3691 /* userns_fd= */ -EBADF,
3692 DISSECT_IMAGE_MOUNT_ROOT_ONLY|
3693 DISSECT_IMAGE_DISCARD_ON_LOOP|
3694 DISSECT_IMAGE_USR_NO_ROOT|
3695 (arg_read_only ? DISSECT_IMAGE_READ_ONLY : DISSECT_IMAGE_FSCK|DISSECT_IMAGE_GROWFS)|
3696 (arg_start_mode == START_BOOT ? DISSECT_IMAGE_VALIDATE_OS : 0));
3697 if (r < 0)
3698 return r;
3699 }
3700
3701 r = determine_uid_shift(directory);
3702 if (r < 0)
3703 return r;
3704
3705 if (arg_userns_mode != USER_NAMESPACE_NO) {
3706 r = namespace_open(0, NULL, &mntns_fd, NULL, NULL, NULL);
3707 if (r < 0)
3708 return log_error_errno(r, "Failed to pin outer mount namespace: %m");
3709
3710 l = send_one_fd(fd_outer_socket, mntns_fd, 0);
3711 if (l < 0)
3712 return log_error_errno(l, "Failed to send outer mount namespace fd: %m");
3713 mntns_fd = safe_close(mntns_fd);
3714
3715 /* Let the parent know which UID shift we read from the image */
3716 l = send(fd_outer_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
3717 if (l < 0)
3718 return log_error_errno(errno, "Failed to send UID shift: %m");
3719 if (l != sizeof(arg_uid_shift))
3720 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3721 "Short write while sending UID shift.");
3722
3723 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3724 /* When we are supposed to pick the UID shift, the parent will check now whether the
3725 * UID shift we just read from the image is available. If yes, it will send the UID
3726 * shift back to us, if not it will pick a different one, and send it back to us. */
3727
3728 l = recv(fd_outer_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
3729 if (l < 0)
3730 return log_error_errno(errno, "Failed to recv UID shift: %m");
3731 if (l != sizeof(arg_uid_shift))
3732 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3733 "Short read while receiving UID shift.");
3734 }
3735
3736 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
3737 "Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
3738 }
3739
3740 if (path_equal(directory, "/")) {
3741 /* If the directory we shall boot is the host, let's operate on a bind mount at a different
3742 * place, so that we can make changes to its mount structure (for example, to implement
3743 * --volatile=) without this interfering with our ability to access files such as
3744 * /etc/localtime to copy into the container. Note that we use a fixed place for this
3745 * (instead of a temporary directory, since we are living in our own mount namespace here
3746 * already, and thus don't need to be afraid of colliding with anyone else's mounts). */
3747 (void) mkdir_p("/run/systemd/nspawn-root", 0755);
3748
3749 r = mount_nofollow_verbose(LOG_ERR, "/", "/run/systemd/nspawn-root", NULL, MS_BIND|MS_REC, NULL);
3750 if (r < 0)
3751 return r;
3752
3753 directory = "/run/systemd/nspawn-root";
3754 }
3755
3756 /* Make sure we always have a mount that we can move to root later on. */
3757 r = make_mount_point(directory);
3758 if (r < 0)
3759 return r;
3760
3761 /* So the whole tree is now MS_SLAVE, i.e. we'll still receive mount/umount events from the host
3762 * mount namespace. For the directory we are going to run our container let's turn this off, so that
3763 * we'll live in our own little world from now on, and propagation from the host may only happen via
3764 * the mount tunnel dir, or not at all. */
3765 r = mount_follow_verbose(LOG_ERR, NULL, directory, NULL, MS_PRIVATE|MS_REC, NULL);
3766 if (r < 0)
3767 return r;
3768
3769 r = setup_pivot_root(
3770 directory,
3771 arg_pivot_root_new,
3772 arg_pivot_root_old);
3773 if (r < 0)
3774 return r;
3775
3776 r = setup_volatile_mode(
3777 directory,
3778 arg_volatile_mode,
3779 arg_uid_shift,
3780 arg_selinux_apifs_context);
3781 if (r < 0)
3782 return r;
3783
3784 r = bind_user_prepare(
3785 directory,
3786 arg_bind_user,
3787 arg_uid_shift,
3788 arg_uid_range,
3789 &arg_custom_mounts, &arg_n_custom_mounts,
3790 &bind_user_context);
3791 if (r < 0)
3792 return r;
3793
3794 if (arg_userns_mode != USER_NAMESPACE_NO && bind_user_context) {
3795 /* Send the user maps we determined to the parent, so that it installs it in our user
3796 * namespace UID map table */
3797
3798 for (size_t i = 0; i < bind_user_context->n_data; i++) {
3799 uid_t map[] = {
3800 bind_user_context->data[i].payload_user->uid,
3801 bind_user_context->data[i].host_user->uid,
3802 (uid_t) bind_user_context->data[i].payload_group->gid,
3803 (uid_t) bind_user_context->data[i].host_group->gid,
3804 };
3805
3806 l = send(fd_outer_socket, map, sizeof(map), MSG_NOSIGNAL);
3807 if (l < 0)
3808 return log_error_errno(errno, "Failed to send user UID map: %m");
3809 if (l != sizeof(map))
3810 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3811 "Short write while sending user UID map.");
3812 }
3813 }
3814
3815 r = mount_custom(
3816 directory,
3817 arg_custom_mounts,
3818 arg_n_custom_mounts,
3819 arg_uid_shift,
3820 arg_uid_range,
3821 arg_selinux_apifs_context,
3822 MOUNT_ROOT_ONLY);
3823 if (r < 0)
3824 return r;
3825
3826 if (arg_userns_mode != USER_NAMESPACE_NO &&
3827 IN_SET(arg_userns_ownership, USER_NAMESPACE_OWNERSHIP_MAP, USER_NAMESPACE_OWNERSHIP_AUTO) &&
3828 arg_uid_shift != 0) {
3829 _cleanup_free_ char *usr_subtree = NULL;
3830 char *dirs[3];
3831 size_t i = 0;
3832
3833 dirs[i++] = (char*) directory;
3834
3835 if (dissected_image && dissected_image->partitions[PARTITION_USR].found) {
3836 usr_subtree = path_join(directory, "/usr");
3837 if (!usr_subtree)
3838 return log_oom();
3839
3840 dirs[i++] = usr_subtree;
3841 }
3842
3843 dirs[i] = NULL;
3844
3845 r = remount_idmap(dirs, arg_uid_shift, arg_uid_range, UID_INVALID, REMOUNT_IDMAPPING_HOST_ROOT);
3846 if (r == -EINVAL || ERRNO_IS_NEG_NOT_SUPPORTED(r)) {
3847 /* This might fail because the kernel or file system doesn't support idmapping. We
3848 * can't really distinguish this nicely, nor do we have any guarantees about the
3849 * error codes we see, could be EOPNOTSUPP or EINVAL. */
3850 if (arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_AUTO)
3851 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
3852 "ID mapped mounts are apparently not available, sorry.");
3853
3854 log_debug("ID mapped mounts are apparently not available on this kernel or for the selected file system, reverting to recursive chown()ing.");
3855 arg_userns_ownership = USER_NAMESPACE_OWNERSHIP_CHOWN;
3856 } else if (r < 0)
3857 return log_error_errno(r, "Failed to set up ID mapped mounts: %m");
3858 else {
3859 log_debug("ID mapped mounts available, making use of them.");
3860 idmap = true;
3861 }
3862 }
3863
3864 if (dissected_image) {
3865 /* Now we know the uid shift, let's now mount everything else that might be in the image. */
3866 r = dissected_image_mount(
3867 dissected_image,
3868 directory,
3869 arg_uid_shift,
3870 arg_uid_range,
3871 /* userns_fd= */ -EBADF,
3872 DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY|
3873 DISSECT_IMAGE_DISCARD_ON_LOOP|
3874 DISSECT_IMAGE_USR_NO_ROOT|
3875 (arg_read_only ? DISSECT_IMAGE_READ_ONLY : DISSECT_IMAGE_FSCK|DISSECT_IMAGE_GROWFS)|
3876 (idmap ? DISSECT_IMAGE_MOUNT_IDMAPPED : 0));
3877 if (r == -EUCLEAN)
3878 return log_error_errno(r, "File system check for image failed: %m");
3879 if (r < 0)
3880 return log_error_errno(r, "Failed to mount image file system: %m");
3881 }
3882
3883 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
3884 /* OK, we don't know yet which cgroup mode to use. Let's figure it out, and tell the parent. */
3885
3886 r = detect_unified_cgroup_hierarchy_from_image(directory);
3887 if (r < 0)
3888 return r;
3889
3890 l = send(fd_outer_socket, &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), MSG_NOSIGNAL);
3891 if (l < 0)
3892 return log_error_errno(errno, "Failed to send cgroup mode: %m");
3893 if (l != sizeof(arg_unified_cgroup_hierarchy))
3894 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3895 "Short write while sending cgroup mode.");
3896 }
3897
3898 r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
3899 if (r < 0)
3900 return r;
3901
3902 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
3903 if (r < 0)
3904 return r;
3905
3906 if (arg_read_only && arg_volatile_mode == VOLATILE_NO &&
3907 !has_custom_root_mount(arg_custom_mounts, arg_n_custom_mounts)) {
3908 r = bind_remount_recursive(directory, MS_RDONLY, MS_RDONLY, NULL);
3909 if (r < 0)
3910 return log_error_errno(r, "Failed to make tree read-only: %m");
3911 }
3912
3913 r = mount_all(directory,
3914 arg_mount_settings,
3915 arg_uid_shift,
3916 arg_selinux_apifs_context);
3917 if (r < 0)
3918 return r;
3919
3920 r = copy_devnodes(directory);
3921 if (r < 0)
3922 return r;
3923
3924 r = make_extra_nodes(directory);
3925 if (r < 0)
3926 return r;
3927
3928 (void) dev_setup(directory, arg_uid_shift, arg_uid_shift);
3929
3930 p = prefix_roota(directory, "/run/host");
3931 (void) make_inaccessible_nodes(p, arg_uid_shift, arg_uid_shift);
3932
3933 r = setup_pts(directory);
3934 if (r < 0)
3935 return r;
3936
3937 r = mount_tunnel_dig(directory);
3938 if (r < 0)
3939 return r;
3940
3941 r = setup_keyring();
3942 if (r < 0)
3943 return r;
3944
3945 r = setup_credentials(directory);
3946 if (r < 0)
3947 return r;
3948
3949 r = bind_user_setup(bind_user_context, directory);
3950 if (r < 0)
3951 return r;
3952
3953 r = mount_custom(
3954 directory,
3955 arg_custom_mounts,
3956 arg_n_custom_mounts,
3957 arg_uid_shift,
3958 arg_uid_range,
3959 arg_selinux_apifs_context,
3960 MOUNT_NON_ROOT_ONLY);
3961 if (r < 0)
3962 return r;
3963
3964 r = setup_timezone(directory);
3965 if (r < 0)
3966 return r;
3967
3968 r = setup_resolv_conf(directory);
3969 if (r < 0)
3970 return r;
3971
3972 r = setup_machine_id(directory);
3973 if (r < 0)
3974 return r;
3975
3976 r = setup_journal(directory);
3977 if (r < 0)
3978 return r;
3979
3980 /* The same stuff as the $container env var, but nicely readable for the entire payload */
3981 p = prefix_roota(directory, "/run/host/container-manager");
3982 (void) write_string_file(p, arg_container_service_name, WRITE_STRING_FILE_CREATE);
3983
3984 /* The same stuff as the $container_uuid env var */
3985 p = prefix_roota(directory, "/run/host/container-uuid");
3986 (void) write_string_filef(p, WRITE_STRING_FILE_CREATE, SD_ID128_UUID_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid));
3987
3988 if (!arg_use_cgns) {
3989 r = mount_cgroups(
3990 directory,
3991 arg_unified_cgroup_hierarchy,
3992 arg_userns_mode != USER_NAMESPACE_NO,
3993 arg_uid_shift,
3994 arg_uid_range,
3995 arg_selinux_apifs_context,
3996 false);
3997 if (r < 0)
3998 return r;
3999 }
4000
4001 /* Mark everything as shared so our mounts get propagated down. This is required to make new bind
4002 * mounts available in systemd services inside the container that create a new mount namespace. See
4003 * https://github.com/systemd/systemd/issues/3860 Further submounts (such as /dev) done after this
4004 * will inherit the shared propagation mode.
4005 *
4006 * IMPORTANT: Do not overmount the root directory anymore from now on to enable moving the root
4007 * directory mount to root later on.
4008 * https://github.com/systemd/systemd/issues/3847#issuecomment-562735251
4009 */
4010 r = mount_switch_root(directory, MS_SHARED);
4011 if (r < 0)
4012 return log_error_errno(r, "Failed to move root directory: %m");
4013
4014 /* We finished setting up the rootfs which is a shared mount. The mount tunnel needs to be a
4015 * dependent mount otherwise we can't MS_MOVE mounts that were propagated from the host into
4016 * the container. */
4017 r = mount_tunnel_open();
4018 if (r < 0)
4019 return r;
4020
4021 if (arg_userns_mode != USER_NAMESPACE_NO) {
4022 /* In order to mount procfs and sysfs in an unprivileged container the kernel
4023 * requires that a fully visible instance is already present in the target mount
4024 * namespace. Mount one here so the inner child can mount its own instances. Later
4025 * we umount the temporary instances created here before we actually exec the
4026 * payload. Since the rootfs is shared the umount will propagate into the container.
4027 * Note, the inner child wouldn't be able to unmount the instances on its own since
4028 * it doesn't own the originating mount namespace. IOW, the outer child needs to do
4029 * this. */
4030 r = pin_fully_visible_fs();
4031 if (r < 0)
4032 return r;
4033 }
4034
4035 fd = setup_notify_child();
4036 if (fd < 0)
4037 return fd;
4038
4039 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
4040 arg_clone_ns_flags |
4041 (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
4042 if (pid < 0)
4043 return log_error_errno(errno, "Failed to fork inner child: %m");
4044 if (pid == 0) {
4045 fd_outer_socket = safe_close(fd_outer_socket);
4046
4047 /* The inner child has all namespaces that are requested, so that we all are owned by the
4048 * user if user namespaces are turned on. */
4049
4050 if (arg_network_namespace_path) {
4051 r = namespace_enter(-1, -1, netns_fd, -1, -1);
4052 if (r < 0)
4053 return log_error_errno(r, "Failed to join network namespace: %m");
4054 }
4055
4056 r = inner_child(barrier, fd_inner_socket, fds, os_release_pairs);
4057 if (r < 0)
4058 _exit(EXIT_FAILURE);
4059
4060 _exit(EXIT_SUCCESS);
4061 }
4062
4063 l = send(fd_outer_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
4064 if (l < 0)
4065 return log_error_errno(errno, "Failed to send PID: %m");
4066 if (l != sizeof(pid))
4067 return log_error_errno(SYNTHETIC_ERRNO(EIO),
4068 "Short write while sending PID.");
4069
4070 l = send(fd_outer_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
4071 if (l < 0)
4072 return log_error_errno(errno, "Failed to send machine ID: %m");
4073 if (l != sizeof(arg_uuid))
4074 return log_error_errno(SYNTHETIC_ERRNO(EIO),
4075 "Short write while sending machine ID.");
4076
4077 l = send_one_fd(fd_outer_socket, fd, 0);
4078 if (l < 0)
4079 return log_error_errno(l, "Failed to send notify fd: %m");
4080
4081 fd_outer_socket = safe_close(fd_outer_socket);
4082 fd_inner_socket = safe_close(fd_inner_socket);
4083 netns_fd = safe_close(netns_fd);
4084
4085 return 0;
4086 }
4087
4088 static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
4089 bool tried_hashed = false;
4090 unsigned n_tries = 100;
4091 uid_t candidate;
4092 int r;
4093
4094 assert(shift);
4095 assert(ret_lock_file);
4096 assert(arg_userns_mode == USER_NAMESPACE_PICK);
4097 assert(arg_uid_range == 0x10000U);
4098
4099 candidate = *shift;
4100
4101 (void) mkdir("/run/systemd/nspawn-uid", 0755);
4102
4103 for (;;) {
4104 char lock_path[STRLEN("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
4105 _cleanup_(release_lock_file) LockFile lf = LOCK_FILE_INIT;
4106
4107 if (--n_tries <= 0)
4108 return -EBUSY;
4109
4110 if (candidate < CONTAINER_UID_BASE_MIN || candidate > CONTAINER_UID_BASE_MAX)
4111 goto next;
4112 if ((candidate & UINT32_C(0xFFFF)) != 0)
4113 goto next;
4114
4115 xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
4116 r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
4117 if (r == -EBUSY) /* Range already taken by another nspawn instance */
4118 goto next;
4119 if (r < 0)
4120 return r;
4121
4122 /* Make some superficial checks whether the range is currently known in the user database */
4123 if (getpwuid(candidate))
4124 goto next;
4125 if (getpwuid(candidate + UINT32_C(0xFFFE)))
4126 goto next;
4127 if (getgrgid(candidate))
4128 goto next;
4129 if (getgrgid(candidate + UINT32_C(0xFFFE)))
4130 goto next;
4131
4132 *ret_lock_file = lf;
4133 lf = (struct LockFile) LOCK_FILE_INIT;
4134 *shift = candidate;
4135 return 0;
4136
4137 next:
4138 if (arg_machine && !tried_hashed) {
4139 /* Try to hash the base from the container name */
4140
4141 static const uint8_t hash_key[] = {
4142 0xe1, 0x56, 0xe0, 0xf0, 0x4a, 0xf0, 0x41, 0xaf,
4143 0x96, 0x41, 0xcf, 0x41, 0x33, 0x94, 0xff, 0x72
4144 };
4145
4146 candidate = (uid_t) siphash24(arg_machine, strlen(arg_machine), hash_key);
4147
4148 tried_hashed = true;
4149 } else
4150 random_bytes(&candidate, sizeof(candidate));
4151
4152 candidate = (candidate % (CONTAINER_UID_BASE_MAX - CONTAINER_UID_BASE_MIN)) + CONTAINER_UID_BASE_MIN;
4153 candidate &= (uid_t) UINT32_C(0xFFFF0000);
4154 }
4155 }
4156
4157 static int add_one_uid_map(
4158 char **p,
4159 uid_t container_uid,
4160 uid_t host_uid,
4161 uid_t range) {
4162
4163 return strextendf(p,
4164 UID_FMT " " UID_FMT " " UID_FMT "\n",
4165 container_uid, host_uid, range);
4166 }
4167
4168 static int make_uid_map_string(
4169 const uid_t bind_user_uid[],
4170 size_t n_bind_user_uid,
4171 size_t offset,
4172 char **ret) {
4173
4174 _cleanup_free_ char *s = NULL;
4175 uid_t previous_uid = 0;
4176 int r;
4177
4178 assert(n_bind_user_uid == 0 || bind_user_uid);
4179 assert(IN_SET(offset, 0, 2)); /* used to switch between UID and GID map */
4180 assert(ret);
4181
4182 /* The bind_user_uid[] array is a series of 4 uid_t values, for each --bind-user= entry one
4183 * quadruplet, consisting of host and container UID + GID. */
4184
4185 for (size_t i = 0; i < n_bind_user_uid; i++) {
4186 uid_t payload_uid = bind_user_uid[i*4+offset],
4187 host_uid = bind_user_uid[i*4+offset+1];
4188
4189 assert(previous_uid <= payload_uid);
4190 assert(payload_uid < arg_uid_range);
4191
4192 /* Add a range to close the gap to previous entry */
4193 if (payload_uid > previous_uid) {
4194 r = add_one_uid_map(&s, previous_uid, arg_uid_shift + previous_uid, payload_uid - previous_uid);
4195 if (r < 0)
4196 return r;
4197 }
4198
4199 /* Map this specific user */
4200 r = add_one_uid_map(&s, payload_uid, host_uid, 1);
4201 if (r < 0)
4202 return r;
4203
4204 previous_uid = payload_uid + 1;
4205 }
4206
4207 /* And add a range to close the gap to finish the range */
4208 if (arg_uid_range > previous_uid) {
4209 r = add_one_uid_map(&s, previous_uid, arg_uid_shift + previous_uid, arg_uid_range - previous_uid);
4210 if (r < 0)
4211 return r;
4212 }
4213
4214 assert(s);
4215
4216 *ret = TAKE_PTR(s);
4217 return 0;
4218 }
4219
4220 static int setup_uid_map(
4221 pid_t pid,
4222 const uid_t bind_user_uid[],
4223 size_t n_bind_user_uid) {
4224
4225 char uid_map[STRLEN("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1];
4226 _cleanup_free_ char *s = NULL;
4227 int r;
4228
4229 assert(pid > 1);
4230
4231 /* Build the UID map string */
4232 if (make_uid_map_string(bind_user_uid, n_bind_user_uid, 0, &s) < 0) /* offset=0 contains the UID pair */
4233 return log_oom();
4234
4235 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
4236 r = write_string_file(uid_map, s, WRITE_STRING_FILE_DISABLE_BUFFER);
4237 if (r < 0)
4238 return log_error_errno(r, "Failed to write UID map: %m");
4239
4240 /* And now build the GID map string */
4241 s = mfree(s);
4242 if (make_uid_map_string(bind_user_uid, n_bind_user_uid, 2, &s) < 0) /* offset=2 contains the GID pair */
4243 return log_oom();
4244
4245 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
4246 r = write_string_file(uid_map, s, WRITE_STRING_FILE_DISABLE_BUFFER);
4247 if (r < 0)
4248 return log_error_errno(r, "Failed to write GID map: %m");
4249
4250 return 0;
4251 }
4252
4253 static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
4254 char buf[NOTIFY_BUFFER_MAX+1];
4255 char *p = NULL;
4256 struct iovec iovec = {
4257 .iov_base = buf,
4258 .iov_len = sizeof(buf)-1,
4259 };
4260 CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred)) +
4261 CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)) control;
4262 struct msghdr msghdr = {
4263 .msg_iov = &iovec,
4264 .msg_iovlen = 1,
4265 .msg_control = &control,
4266 .msg_controllen = sizeof(control),
4267 };
4268 struct ucred *ucred;
4269 ssize_t n;
4270 pid_t inner_child_pid;
4271 _cleanup_strv_free_ char **tags = NULL;
4272 int r;
4273
4274 assert(userdata);
4275
4276 inner_child_pid = PTR_TO_PID(userdata);
4277
4278 if (revents != EPOLLIN) {
4279 log_warning("Got unexpected poll event for notify fd.");
4280 return 0;
4281 }
4282
4283 n = recvmsg_safe(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
4284 if (ERRNO_IS_NEG_TRANSIENT(n))
4285 return 0;
4286 else if (n == -EXFULL) {
4287 log_warning("Got message with truncated control data (too many fds sent?), ignoring.");
4288 return 0;
4289 } else if (n < 0)
4290 return log_warning_errno(n, "Couldn't read notification socket: %m");
4291
4292 cmsg_close_all(&msghdr);
4293
4294 ucred = CMSG_FIND_DATA(&msghdr, SOL_SOCKET, SCM_CREDENTIALS, struct ucred);
4295 if (!ucred || ucred->pid != inner_child_pid) {
4296 log_debug("Received notify message without valid credentials. Ignoring.");
4297 return 0;
4298 }
4299
4300 if ((size_t) n >= sizeof(buf)) {
4301 log_warning("Received notify message exceeded maximum size. Ignoring.");
4302 return 0;
4303 }
4304
4305 buf[n] = 0;
4306 tags = strv_split(buf, "\n\r");
4307 if (!tags)
4308 return log_oom();
4309
4310 if (strv_contains(tags, "READY=1")) {
4311 r = sd_notify(false, "READY=1\n");
4312 if (r < 0)
4313 log_warning_errno(r, "Failed to send readiness notification, ignoring: %m");
4314 }
4315
4316 p = strv_find_startswith(tags, "STATUS=");
4317 if (p)
4318 (void) sd_notifyf(false, "STATUS=Container running: %s", p);
4319
4320 return 0;
4321 }
4322
4323 static int setup_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid, sd_event_source **notify_event_source) {
4324 int r;
4325
4326 r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
4327 if (r < 0)
4328 return log_error_errno(r, "Failed to allocate notify event source: %m");
4329
4330 (void) sd_event_source_set_description(*notify_event_source, "nspawn-notify");
4331
4332 return 0;
4333 }
4334
4335 static int merge_settings(Settings *settings, const char *path) {
4336 int rl;
4337
4338 assert(settings);
4339 assert(path);
4340
4341 /* Copy over bits from the settings, unless they have been explicitly masked by command line switches. Note
4342 * that this steals the fields of the Settings* structure, and hence modifies it. */
4343
4344 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
4345 settings->start_mode >= 0) {
4346 arg_start_mode = settings->start_mode;
4347 strv_free_and_replace(arg_parameters, settings->parameters);
4348 }
4349
4350 if ((arg_settings_mask & SETTING_EPHEMERAL) == 0 &&
4351 settings->ephemeral >= 0)
4352 arg_ephemeral = settings->ephemeral;
4353
4354 if ((arg_settings_mask & SETTING_DIRECTORY) == 0 &&
4355 settings->root) {
4356
4357 if (!arg_settings_trusted)
4358 log_warning("Ignoring root directory setting, file %s is not trusted.", path);
4359 else
4360 free_and_replace(arg_directory, settings->root);
4361 }
4362
4363 if ((arg_settings_mask & SETTING_PIVOT_ROOT) == 0 &&
4364 settings->pivot_root_new) {
4365 free_and_replace(arg_pivot_root_new, settings->pivot_root_new);
4366 free_and_replace(arg_pivot_root_old, settings->pivot_root_old);
4367 }
4368
4369 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
4370 settings->working_directory)
4371 free_and_replace(arg_chdir, settings->working_directory);
4372
4373 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
4374 settings->environment)
4375 strv_free_and_replace(arg_setenv, settings->environment);
4376
4377 if ((arg_settings_mask & SETTING_USER) == 0) {
4378
4379 if (settings->user)
4380 free_and_replace(arg_user, settings->user);
4381
4382 if (uid_is_valid(settings->uid))
4383 arg_uid = settings->uid;
4384 if (gid_is_valid(settings->gid))
4385 arg_gid = settings->gid;
4386 if (settings->n_supplementary_gids > 0) {
4387 free_and_replace(arg_supplementary_gids, settings->supplementary_gids);
4388 arg_n_supplementary_gids = settings->n_supplementary_gids;
4389 }
4390 }
4391
4392 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
4393 uint64_t plus, minus;
4394 uint64_t network_minus = 0;
4395 uint64_t ambient;
4396
4397 /* Note that we copy both the simple plus/minus caps here, and the full quintet from the
4398 * Settings structure */
4399
4400 plus = settings->capability;
4401 minus = settings->drop_capability;
4402
4403 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
4404 settings_network_configured(settings)) {
4405 if (settings_private_network(settings))
4406 plus |= UINT64_C(1) << CAP_NET_ADMIN;
4407 else
4408 network_minus |= UINT64_C(1) << CAP_NET_ADMIN;
4409 }
4410
4411 if (!arg_settings_trusted && plus != 0) {
4412 if (settings->capability != 0)
4413 log_warning("Ignoring Capability= setting, file %s is not trusted.", path);
4414 } else {
4415 arg_caps_retain &= ~network_minus;
4416 arg_caps_retain |= plus;
4417 }
4418
4419 arg_caps_retain &= ~minus;
4420
4421 /* Copy the full capabilities over too */
4422 if (capability_quintet_is_set(&settings->full_capabilities)) {
4423 if (!arg_settings_trusted)
4424 log_warning("Ignoring capability settings, file %s is not trusted.", path);
4425 else
4426 arg_full_capabilities = settings->full_capabilities;
4427 }
4428
4429 ambient = settings->ambient_capability;
4430 if (!arg_settings_trusted && ambient != 0)
4431 log_warning("Ignoring AmbientCapability= setting, file %s is not trusted.", path);
4432 else
4433 arg_caps_ambient |= ambient;
4434 }
4435
4436 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
4437 settings->kill_signal > 0)
4438 arg_kill_signal = settings->kill_signal;
4439
4440 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
4441 settings->personality != PERSONALITY_INVALID)
4442 arg_personality = settings->personality;
4443
4444 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
4445 !sd_id128_is_null(settings->machine_id)) {
4446
4447 if (!arg_settings_trusted)
4448 log_warning("Ignoring MachineID= setting, file %s is not trusted.", path);
4449 else
4450 arg_uuid = settings->machine_id;
4451 }
4452
4453 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
4454 settings->read_only >= 0)
4455 arg_read_only = settings->read_only;
4456
4457 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
4458 settings->volatile_mode != _VOLATILE_MODE_INVALID)
4459 arg_volatile_mode = settings->volatile_mode;
4460
4461 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
4462 settings->n_custom_mounts > 0) {
4463
4464 if (!arg_settings_trusted)
4465 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", path);
4466 else {
4467 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
4468 arg_custom_mounts = TAKE_PTR(settings->custom_mounts);
4469 arg_n_custom_mounts = settings->n_custom_mounts;
4470 settings->n_custom_mounts = 0;
4471 }
4472 }
4473
4474 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
4475 settings_network_configured(settings)) {
4476
4477 if (!arg_settings_trusted)
4478 log_warning("Ignoring network settings, file %s is not trusted.", path);
4479 else {
4480 arg_network_veth = settings_network_veth(settings);
4481 arg_private_network = settings_private_network(settings);
4482
4483 strv_free_and_replace(arg_network_interfaces, settings->network_interfaces);
4484 strv_free_and_replace(arg_network_macvlan, settings->network_macvlan);
4485 strv_free_and_replace(arg_network_ipvlan, settings->network_ipvlan);
4486 strv_free_and_replace(arg_network_veth_extra, settings->network_veth_extra);
4487
4488 free_and_replace(arg_network_bridge, settings->network_bridge);
4489 free_and_replace(arg_network_zone, settings->network_zone);
4490
4491 free_and_replace(arg_network_namespace_path, settings->network_namespace_path);
4492 }
4493 }
4494
4495 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
4496 settings->expose_ports) {
4497
4498 if (!arg_settings_trusted)
4499 log_warning("Ignoring Port= setting, file %s is not trusted.", path);
4500 else {
4501 expose_port_free_all(arg_expose_ports);
4502 arg_expose_ports = TAKE_PTR(settings->expose_ports);
4503 }
4504 }
4505
4506 if ((arg_settings_mask & SETTING_USERNS) == 0 &&
4507 settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
4508
4509 if (!arg_settings_trusted)
4510 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", path);
4511 else {
4512 arg_userns_mode = settings->userns_mode;
4513 arg_uid_shift = settings->uid_shift;
4514 arg_uid_range = settings->uid_range;
4515 arg_userns_ownership = settings->userns_ownership;
4516 }
4517 }
4518
4519 if ((arg_settings_mask & SETTING_BIND_USER) == 0 &&
4520 !strv_isempty(settings->bind_user))
4521 strv_free_and_replace(arg_bind_user, settings->bind_user);
4522
4523 if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0 &&
4524 settings->notify_ready >= 0)
4525 arg_notify_ready = settings->notify_ready;
4526
4527 if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) {
4528
4529 if (!strv_isempty(settings->syscall_allow_list) || !strv_isempty(settings->syscall_deny_list)) {
4530 if (!arg_settings_trusted && !strv_isempty(settings->syscall_allow_list))
4531 log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", path);
4532 else {
4533 strv_free_and_replace(arg_syscall_allow_list, settings->syscall_allow_list);
4534 strv_free_and_replace(arg_syscall_deny_list, settings->syscall_deny_list);
4535 }
4536 }
4537
4538 #if HAVE_SECCOMP
4539 if (settings->seccomp) {
4540 if (!arg_settings_trusted)
4541 log_warning("Ignoring SECCOMP filter, file %s is not trusted.", path);
4542 else {
4543 seccomp_release(arg_seccomp);
4544 arg_seccomp = TAKE_PTR(settings->seccomp);
4545 }
4546 }
4547 #endif
4548 }
4549
4550 for (rl = 0; rl < _RLIMIT_MAX; rl ++) {
4551 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)))
4552 continue;
4553
4554 if (!settings->rlimit[rl])
4555 continue;
4556
4557 if (!arg_settings_trusted) {
4558 log_warning("Ignoring Limit%s= setting, file '%s' is not trusted.", rlimit_to_string(rl), path);
4559 continue;
4560 }
4561
4562 free_and_replace(arg_rlimit[rl], settings->rlimit[rl]);
4563 }
4564
4565 if ((arg_settings_mask & SETTING_HOSTNAME) == 0 &&
4566 settings->hostname)
4567 free_and_replace(arg_hostname, settings->hostname);
4568
4569 if ((arg_settings_mask & SETTING_NO_NEW_PRIVILEGES) == 0 &&
4570 settings->no_new_privileges >= 0)
4571 arg_no_new_privileges = settings->no_new_privileges;
4572
4573 if ((arg_settings_mask & SETTING_OOM_SCORE_ADJUST) == 0 &&
4574 settings->oom_score_adjust_set) {
4575
4576 if (!arg_settings_trusted)
4577 log_warning("Ignoring OOMScoreAdjust= setting, file '%s' is not trusted.", path);
4578 else {
4579 arg_oom_score_adjust = settings->oom_score_adjust;
4580 arg_oom_score_adjust_set = true;
4581 }
4582 }
4583
4584 if ((arg_settings_mask & SETTING_CPU_AFFINITY) == 0 &&
4585 settings->cpu_set.set) {
4586
4587 if (!arg_settings_trusted)
4588 log_warning("Ignoring CPUAffinity= setting, file '%s' is not trusted.", path);
4589 else {
4590 cpu_set_reset(&arg_cpu_set);
4591 arg_cpu_set = TAKE_STRUCT(settings->cpu_set);
4592 }
4593 }
4594
4595 if ((arg_settings_mask & SETTING_RESOLV_CONF) == 0 &&
4596 settings->resolv_conf != _RESOLV_CONF_MODE_INVALID)
4597 arg_resolv_conf = settings->resolv_conf;
4598
4599 if ((arg_settings_mask & SETTING_LINK_JOURNAL) == 0 &&
4600 settings->link_journal != _LINK_JOURNAL_INVALID) {
4601
4602 if (!arg_settings_trusted)
4603 log_warning("Ignoring journal link setting, file '%s' is not trusted.", path);
4604 else {
4605 arg_link_journal = settings->link_journal;
4606 arg_link_journal_try = settings->link_journal_try;
4607 }
4608 }
4609
4610 if ((arg_settings_mask & SETTING_TIMEZONE) == 0 &&
4611 settings->timezone != _TIMEZONE_MODE_INVALID)
4612 arg_timezone = settings->timezone;
4613
4614 if ((arg_settings_mask & SETTING_SLICE) == 0 &&
4615 settings->slice) {
4616
4617 if (!arg_settings_trusted)
4618 log_warning("Ignoring slice setting, file '%s' is not trusted.", path);
4619 else
4620 free_and_replace(arg_slice, settings->slice);
4621 }
4622
4623 if ((arg_settings_mask & SETTING_USE_CGNS) == 0 &&
4624 settings->use_cgns >= 0) {
4625
4626 if (!arg_settings_trusted)
4627 log_warning("Ignoring cgroup namespace setting, file '%s' is not trusted.", path);
4628 else
4629 arg_use_cgns = settings->use_cgns;
4630 }
4631
4632 if ((arg_settings_mask & SETTING_CLONE_NS_FLAGS) == 0 &&
4633 settings->clone_ns_flags != ULONG_MAX) {
4634
4635 if (!arg_settings_trusted)
4636 log_warning("Ignoring namespace setting, file '%s' is not trusted.", path);
4637 else
4638 arg_clone_ns_flags = settings->clone_ns_flags;
4639 }
4640
4641 if ((arg_settings_mask & SETTING_CONSOLE_MODE) == 0 &&
4642 settings->console_mode >= 0) {
4643
4644 if (!arg_settings_trusted)
4645 log_warning("Ignoring console mode setting, file '%s' is not trusted.", path);
4646 else
4647 arg_console_mode = settings->console_mode;
4648 }
4649
4650 if ((arg_settings_mask & SETTING_SUPPRESS_SYNC) == 0 &&
4651 settings->suppress_sync >= 0)
4652 arg_suppress_sync = settings->suppress_sync;
4653
4654 /* The following properties can only be set through the OCI settings logic, not from the command line, hence we
4655 * don't consult arg_settings_mask for them. */
4656
4657 sd_bus_message_unref(arg_property_message);
4658 arg_property_message = TAKE_PTR(settings->properties);
4659
4660 arg_console_width = settings->console_width;
4661 arg_console_height = settings->console_height;
4662
4663 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
4664 arg_extra_nodes = TAKE_PTR(settings->extra_nodes);
4665 arg_n_extra_nodes = settings->n_extra_nodes;
4666 settings->n_extra_nodes = 0;
4667
4668 return 0;
4669 }
4670
4671 static int load_settings(void) {
4672 _cleanup_(settings_freep) Settings *settings = NULL;
4673 _cleanup_fclose_ FILE *f = NULL;
4674 _cleanup_free_ char *p = NULL;
4675 int r;
4676
4677 if (arg_oci_bundle)
4678 return 0;
4679
4680 /* If all settings are masked, there's no point in looking for
4681 * the settings file */
4682 if (FLAGS_SET(arg_settings_mask, _SETTINGS_MASK_ALL))
4683 return 0;
4684
4685 /* We first look in the admin's directories in /etc and /run */
4686 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
4687 _cleanup_free_ char *j = NULL;
4688
4689 j = path_join(i, arg_settings_filename);
4690 if (!j)
4691 return log_oom();
4692
4693 f = fopen(j, "re");
4694 if (f) {
4695 p = TAKE_PTR(j);
4696
4697 /* By default, we trust configuration from /etc and /run */
4698 if (arg_settings_trusted < 0)
4699 arg_settings_trusted = true;
4700
4701 break;
4702 }
4703
4704 if (errno != ENOENT)
4705 return log_error_errno(errno, "Failed to open %s: %m", j);
4706 }
4707
4708 if (!f) {
4709 /* After that, let's look for a file next to the
4710 * actual image we shall boot. */
4711
4712 if (arg_image) {
4713 r = file_in_same_dir(arg_image, arg_settings_filename, &p);
4714 if (r < 0)
4715 return log_error_errno(r, "Failed to generate settings path from image path: %m");
4716 } else if (arg_directory) {
4717 r = file_in_same_dir(arg_directory, arg_settings_filename, &p);
4718 if (r < 0 && r != -EADDRNOTAVAIL) /* if directory is root fs, don't complain */
4719 return log_error_errno(r, "Failed to generate settings path from directory path: %m");
4720 }
4721
4722 if (p) {
4723 f = fopen(p, "re");
4724 if (!f && errno != ENOENT)
4725 return log_error_errno(errno, "Failed to open %s: %m", p);
4726
4727 /* By default, we do not trust configuration from /var/lib/machines */
4728 if (arg_settings_trusted < 0)
4729 arg_settings_trusted = false;
4730 }
4731 }
4732
4733 if (!f)
4734 return 0;
4735
4736 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
4737
4738 r = settings_load(f, p, &settings);
4739 if (r < 0)
4740 return r;
4741
4742 return merge_settings(settings, p);
4743 }
4744
4745 static int load_oci_bundle(void) {
4746 _cleanup_(settings_freep) Settings *settings = NULL;
4747 int r;
4748
4749 if (!arg_oci_bundle)
4750 return 0;
4751
4752 /* By default let's trust OCI bundles */
4753 if (arg_settings_trusted < 0)
4754 arg_settings_trusted = true;
4755
4756 r = oci_load(NULL, arg_oci_bundle, &settings);
4757 if (r < 0)
4758 return r;
4759
4760 return merge_settings(settings, arg_oci_bundle);
4761 }
4762
4763 static int run_container(
4764 DissectedImage *dissected_image,
4765 FDSet *fds,
4766 char veth_name[IFNAMSIZ], bool *veth_created,
4767 struct ExposeArgs *expose_args,
4768 int *master, pid_t *pid, int *ret) {
4769
4770 static const struct sigaction sa = {
4771 .sa_handler = nop_signal_handler,
4772 .sa_flags = SA_NOCLDSTOP|SA_RESTART,
4773 };
4774
4775 _cleanup_(release_lock_file) LockFile uid_shift_lock = LOCK_FILE_INIT;
4776 _cleanup_close_ int etc_passwd_lock = -EBADF;
4777 _cleanup_close_pair_ int
4778 fd_inner_socket_pair[2] = EBADF_PAIR,
4779 fd_outer_socket_pair[2] = EBADF_PAIR;
4780
4781 _cleanup_close_ int notify_socket = -EBADF, mntns_fd = -EBADF, fd_kmsg_fifo = -EBADF;
4782 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
4783 _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL;
4784 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
4785 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4786 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
4787 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
4788 _cleanup_free_ uid_t *bind_user_uid = NULL;
4789 size_t n_bind_user_uid = 0;
4790 ContainerStatus container_status = 0;
4791 int ifi = 0, r;
4792 ssize_t l;
4793 sigset_t mask_chld;
4794 _cleanup_close_ int child_netns_fd = -EBADF;
4795
4796 assert_se(sigemptyset(&mask_chld) == 0);
4797 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
4798
4799 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4800 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
4801 * check with getpwuid() if the specific user already exists. Note that /etc might be
4802 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
4803 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
4804 * really just an extra safety net. We kinda assume that the UID range we allocate from is
4805 * really ours. */
4806
4807 etc_passwd_lock = take_etc_passwd_lock(NULL);
4808 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS)
4809 return log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m");
4810 }
4811
4812 r = barrier_create(&barrier);
4813 if (r < 0)
4814 return log_error_errno(r, "Cannot initialize IPC barrier: %m");
4815
4816 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, fd_inner_socket_pair) < 0)
4817 return log_error_errno(errno, "Failed to create inner socket pair: %m");
4818
4819 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, fd_outer_socket_pair) < 0)
4820 return log_error_errno(errno, "Failed to create outer socket pair: %m");
4821
4822 /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
4823 * parent's blocking calls and give it a chance to call wait() and terminate. */
4824 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
4825 if (r < 0)
4826 return log_error_errno(errno, "Failed to change the signal mask: %m");
4827
4828 r = sigaction(SIGCHLD, &sa, NULL);
4829 if (r < 0)
4830 return log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
4831
4832 if (arg_network_namespace_path) {
4833 child_netns_fd = open(arg_network_namespace_path, O_RDONLY|O_NOCTTY|O_CLOEXEC);
4834 if (child_netns_fd < 0)
4835 return log_error_errno(errno, "Cannot open file %s: %m", arg_network_namespace_path);
4836
4837 r = fd_is_ns(child_netns_fd, CLONE_NEWNET);
4838 if (r == -EUCLEAN)
4839 log_debug_errno(r, "Cannot determine if passed network namespace path '%s' really refers to a network namespace, assuming it does.", arg_network_namespace_path);
4840 else if (r < 0)
4841 return log_error_errno(r, "Failed to check %s fs type: %m", arg_network_namespace_path);
4842 else if (r == 0)
4843 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
4844 "Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path);
4845 }
4846
4847 *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
4848 if (*pid < 0)
4849 return log_error_errno(errno, "clone() failed%s: %m",
4850 errno == EINVAL ?
4851 ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
4852
4853 if (*pid == 0) {
4854 /* The outer child only has a file system namespace. */
4855 barrier_set_role(&barrier, BARRIER_CHILD);
4856
4857 fd_inner_socket_pair[0] = safe_close(fd_inner_socket_pair[0]);
4858 fd_outer_socket_pair[0] = safe_close(fd_outer_socket_pair[0]);
4859
4860 (void) reset_all_signal_handlers();
4861 (void) reset_signal_mask();
4862
4863 r = outer_child(&barrier,
4864 arg_directory,
4865 dissected_image,
4866 fd_outer_socket_pair[1],
4867 fd_inner_socket_pair[1],
4868 fds,
4869 child_netns_fd);
4870 if (r < 0)
4871 _exit(EXIT_FAILURE);
4872
4873 _exit(EXIT_SUCCESS);
4874 }
4875
4876 barrier_set_role(&barrier, BARRIER_PARENT);
4877
4878 fdset_close(fds);
4879
4880 fd_inner_socket_pair[1] = safe_close(fd_inner_socket_pair[1]);
4881 fd_outer_socket_pair[1] = safe_close(fd_outer_socket_pair[1]);
4882
4883 if (arg_userns_mode != USER_NAMESPACE_NO) {
4884 mntns_fd = receive_one_fd(fd_outer_socket_pair[0], 0);
4885 if (mntns_fd < 0)
4886 return log_error_errno(mntns_fd, "Failed to receive mount namespace fd from outer child: %m");
4887
4888 /* The child just let us know the UID shift it might have read from the image. */
4889 l = recv(fd_outer_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
4890 if (l < 0)
4891 return log_error_errno(errno, "Failed to read UID shift: %m");
4892 if (l != sizeof arg_uid_shift)
4893 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading UID shift.");
4894
4895 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4896 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
4897 * image, but if that's already in use, pick a new one, and report back to the child,
4898 * which one we now picked. */
4899
4900 r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
4901 if (r < 0)
4902 return log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
4903
4904 l = send(fd_outer_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
4905 if (l < 0)
4906 return log_error_errno(errno, "Failed to send UID shift: %m");
4907 if (l != sizeof arg_uid_shift)
4908 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short write while writing UID shift.");
4909 }
4910
4911 n_bind_user_uid = strv_length(arg_bind_user);
4912 if (n_bind_user_uid > 0) {
4913 /* Right after the UID shift, we'll receive the list of UID mappings for the
4914 * --bind-user= logic. Always a quadruplet of payload and host UID + GID. */
4915
4916 bind_user_uid = new(uid_t, n_bind_user_uid*4);
4917 if (!bind_user_uid)
4918 return log_oom();
4919
4920 for (size_t i = 0; i < n_bind_user_uid; i++) {
4921 l = recv(fd_outer_socket_pair[0], bind_user_uid + i*4, sizeof(uid_t)*4, 0);
4922 if (l < 0)
4923 return log_error_errno(errno, "Failed to read user UID map pair: %m");
4924 if (l != sizeof(uid_t)*4)
4925 return log_full_errno(l == 0 ? LOG_DEBUG : LOG_WARNING,
4926 SYNTHETIC_ERRNO(EIO),
4927 "Short read while reading bind user UID pairs.");
4928 }
4929 }
4930 }
4931
4932 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
4933 /* The child let us know the support cgroup mode it might have read from the image. */
4934 l = recv(fd_outer_socket_pair[0], &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), 0);
4935 if (l < 0)
4936 return log_error_errno(errno, "Failed to read cgroup mode: %m");
4937 if (l != sizeof(arg_unified_cgroup_hierarchy))
4938 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading cgroup mode (%zi bytes).%s",
4939 l, l == 0 ? " The child is most likely dead." : "");
4940 }
4941
4942 /* Wait for the outer child. */
4943 r = wait_for_terminate_and_check("(sd-namespace)", *pid, WAIT_LOG_ABNORMAL);
4944 if (r < 0)
4945 return r;
4946 if (r != EXIT_SUCCESS)
4947 return -EIO;
4948
4949 /* And now retrieve the PID of the inner child. */
4950 l = recv(fd_outer_socket_pair[0], pid, sizeof *pid, 0);
4951 if (l < 0)
4952 return log_error_errno(errno, "Failed to read inner child PID: %m");
4953 if (l != sizeof *pid)
4954 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading inner child PID.");
4955
4956 /* We also retrieve container UUID in case it was generated by outer child */
4957 l = recv(fd_outer_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
4958 if (l < 0)
4959 return log_error_errno(errno, "Failed to read container machine ID: %m");
4960 if (l != sizeof(arg_uuid))
4961 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading container machined ID.");
4962
4963 /* We also retrieve the socket used for notifications generated by outer child */
4964 notify_socket = receive_one_fd(fd_outer_socket_pair[0], 0);
4965 if (notify_socket < 0)
4966 return log_error_errno(notify_socket,
4967 "Failed to receive notification socket from the outer child: %m");
4968
4969 log_debug("Init process invoked as PID "PID_FMT, *pid);
4970
4971 if (arg_userns_mode != USER_NAMESPACE_NO) {
4972 if (!barrier_place_and_sync(&barrier)) /* #1 */
4973 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
4974
4975 r = setup_uid_map(*pid, bind_user_uid, n_bind_user_uid);
4976 if (r < 0)
4977 return r;
4978
4979 (void) barrier_place(&barrier); /* #2 */
4980 }
4981
4982 if (arg_private_network) {
4983 if (!arg_network_namespace_path) {
4984 /* Wait until the child has unshared its network namespace. */
4985 if (!barrier_place_and_sync(&barrier)) /* #3 */
4986 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early");
4987 }
4988
4989 if (child_netns_fd < 0) {
4990 /* Make sure we have an open file descriptor to the child's network
4991 * namespace so it stays alive even if the child exits. */
4992 r = namespace_open(*pid, NULL, NULL, &child_netns_fd, NULL, NULL);
4993 if (r < 0)
4994 return log_error_errno(r, "Failed to open child network namespace: %m");
4995 }
4996
4997 r = move_network_interfaces(child_netns_fd, arg_network_interfaces);
4998 if (r < 0)
4999 return r;
5000
5001 if (arg_network_veth) {
5002 r = setup_veth(arg_machine, *pid, veth_name,
5003 arg_network_bridge || arg_network_zone, &arg_network_provided_mac);
5004 if (r < 0)
5005 return r;
5006 else if (r > 0)
5007 ifi = r;
5008
5009 if (arg_network_bridge) {
5010 /* Add the interface to a bridge */
5011 r = setup_bridge(veth_name, arg_network_bridge, false);
5012 if (r < 0)
5013 return r;
5014 if (r > 0)
5015 ifi = r;
5016 } else if (arg_network_zone) {
5017 /* Add the interface to a bridge, possibly creating it */
5018 r = setup_bridge(veth_name, arg_network_zone, true);
5019 if (r < 0)
5020 return r;
5021 if (r > 0)
5022 ifi = r;
5023 }
5024 }
5025
5026 r = setup_veth_extra(arg_machine, *pid, arg_network_veth_extra);
5027 if (r < 0)
5028 return r;
5029
5030 /* We created the primary and extra veth links now; let's remember this, so that we know to
5031 remove them later on. Note that we don't bother with removing veth links that were created
5032 here when their setup failed half-way, because in that case the kernel should be able to
5033 remove them on its own, since they cannot be referenced by anything yet. */
5034 *veth_created = true;
5035
5036 r = setup_macvlan(arg_machine, *pid, arg_network_macvlan);
5037 if (r < 0)
5038 return r;
5039
5040 r = setup_ipvlan(arg_machine, *pid, arg_network_ipvlan);
5041 if (r < 0)
5042 return r;
5043 }
5044
5045 if (arg_register || !arg_keep_unit) {
5046 r = sd_bus_default_system(&bus);
5047 if (r < 0)
5048 return log_error_errno(r, "Failed to open system bus: %m");
5049
5050 r = sd_bus_set_close_on_exit(bus, false);
5051 if (r < 0)
5052 return log_error_errno(r, "Failed to disable close-on-exit behaviour: %m");
5053 }
5054
5055 if (!arg_keep_unit) {
5056 /* When a new scope is created for this container, then we'll be registered as its controller, in which
5057 * case PID 1 will send us a friendly RequestStop signal, when it is asked to terminate the
5058 * scope. Let's hook into that, and cleanly shut down the container, and print a friendly message. */
5059
5060 r = sd_bus_match_signal_async(
5061 bus,
5062 NULL,
5063 "org.freedesktop.systemd1",
5064 NULL,
5065 "org.freedesktop.systemd1.Scope",
5066 "RequestStop",
5067 on_request_stop, NULL, PID_TO_PTR(*pid));
5068 if (r < 0)
5069 return log_error_errno(r, "Failed to request RequestStop match: %m");
5070 }
5071
5072 if (arg_register) {
5073 r = register_machine(
5074 bus,
5075 arg_machine,
5076 *pid,
5077 arg_directory,
5078 arg_uuid,
5079 ifi,
5080 arg_slice,
5081 arg_custom_mounts, arg_n_custom_mounts,
5082 arg_kill_signal,
5083 arg_property,
5084 arg_property_message,
5085 arg_keep_unit,
5086 arg_container_service_name,
5087 arg_start_mode);
5088 if (r < 0)
5089 return r;
5090
5091 } else if (!arg_keep_unit) {
5092 r = allocate_scope(
5093 bus,
5094 arg_machine,
5095 *pid,
5096 arg_slice,
5097 arg_custom_mounts, arg_n_custom_mounts,
5098 arg_kill_signal,
5099 arg_property,
5100 arg_property_message,
5101 /* allow_pidfds= */ true,
5102 arg_start_mode);
5103 if (r < 0)
5104 return r;
5105
5106 } else if (arg_slice || arg_property)
5107 log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect.");
5108
5109 r = create_subcgroup(*pid, arg_keep_unit, arg_unified_cgroup_hierarchy);
5110 if (r < 0)
5111 return r;
5112
5113 r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
5114 if (r < 0)
5115 return r;
5116
5117 r = chown_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
5118 if (r < 0)
5119 return r;
5120
5121 /* Notify the child that the parent is ready with all
5122 * its setup (including cgroup-ification), and that
5123 * the child can now hand over control to the code to
5124 * run inside the container. */
5125 (void) barrier_place(&barrier); /* #4 */
5126
5127 /* Block SIGCHLD here, before notifying child.
5128 * process_pty() will handle it with the other signals. */
5129 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
5130
5131 /* Reset signal to default */
5132 r = default_signals(SIGCHLD);
5133 if (r < 0)
5134 return log_error_errno(r, "Failed to reset SIGCHLD: %m");
5135
5136 r = sd_event_new(&event);
5137 if (r < 0)
5138 return log_error_errno(r, "Failed to get default event source: %m");
5139
5140 (void) sd_event_set_watchdog(event, true);
5141
5142 if (bus) {
5143 r = sd_bus_attach_event(bus, event, 0);
5144 if (r < 0)
5145 return log_error_errno(r, "Failed to attach bus to event loop: %m");
5146 }
5147
5148 r = setup_notify_parent(event, notify_socket, PID_TO_PTR(*pid), &notify_event_source);
5149 if (r < 0)
5150 return r;
5151
5152 /* Wait that the child is completely ready now, and has mounted their own copies of procfs and so on,
5153 * before we take the fully visible instances away. */
5154 if (!barrier_sync(&barrier)) /* #5.1 */
5155 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
5156
5157 if (arg_userns_mode != USER_NAMESPACE_NO) {
5158 r = wipe_fully_visible_fs(mntns_fd);
5159 if (r < 0)
5160 return r;
5161 mntns_fd = safe_close(mntns_fd);
5162 }
5163
5164 /* And now let the child know that we completed removing the procfs instances, and it can start the
5165 * payload. */
5166 if (!barrier_place(&barrier)) /* #5.2 */
5167 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
5168
5169 /* At this point we have made use of the UID we picked, and thus nss-systemd/systemd-machined.service
5170 * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
5171 etc_passwd_lock = safe_close(etc_passwd_lock);
5172
5173 (void) sd_notifyf(false,
5174 "STATUS=Container running.\n"
5175 "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
5176 if (!arg_notify_ready) {
5177 r = sd_notify(false, "READY=1\n");
5178 if (r < 0)
5179 log_warning_errno(r, "Failed to send readiness notification, ignoring: %m");
5180 }
5181
5182 if (arg_kill_signal > 0) {
5183 /* Try to kill the init system on SIGINT or SIGTERM */
5184 (void) sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
5185 (void) sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
5186 } else {
5187 /* Immediately exit */
5188 (void) sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
5189 (void) sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
5190 }
5191
5192 (void) sd_event_add_signal(event, NULL, SIGRTMIN+18, sigrtmin18_handler, NULL);
5193
5194 r = sd_event_add_memory_pressure(event, NULL, NULL, NULL);
5195 if (r < 0)
5196 log_debug_errno(r, "Failed allocate memory pressure event source, ignoring: %m");
5197
5198 /* Exit when the child exits */
5199 (void) sd_event_add_signal(event, NULL, SIGCHLD, on_sigchld, PID_TO_PTR(*pid));
5200
5201 /* Retrieve the kmsg fifo allocated by inner child */
5202 fd_kmsg_fifo = receive_one_fd(fd_inner_socket_pair[0], 0);
5203 if (fd_kmsg_fifo < 0)
5204 return log_error_errno(fd_kmsg_fifo, "Failed to receive kmsg fifo from inner child: %m");
5205
5206 if (arg_expose_ports) {
5207 r = expose_port_watch_rtnl(event, fd_inner_socket_pair[0], on_address_change, expose_args, &rtnl);
5208 if (r < 0)
5209 return r;
5210
5211 (void) expose_port_execute(rtnl, &expose_args->fw_ctx, arg_expose_ports, AF_INET, &expose_args->address4);
5212 (void) expose_port_execute(rtnl, &expose_args->fw_ctx, arg_expose_ports, AF_INET6, &expose_args->address6);
5213 }
5214
5215 if (arg_console_mode != CONSOLE_PIPE) {
5216 _cleanup_close_ int fd = -EBADF;
5217 PTYForwardFlags flags = 0;
5218
5219 /* Retrieve the master pty allocated by inner child */
5220 fd = receive_one_fd(fd_inner_socket_pair[0], 0);
5221 if (fd < 0)
5222 return log_error_errno(fd, "Failed to receive master pty from the inner child: %m");
5223
5224 switch (arg_console_mode) {
5225
5226 case CONSOLE_READ_ONLY:
5227 flags |= PTY_FORWARD_READ_ONLY;
5228
5229 _fallthrough_;
5230
5231 case CONSOLE_INTERACTIVE:
5232 flags |= PTY_FORWARD_IGNORE_VHANGUP;
5233
5234 r = pty_forward_new(event, fd, flags, &forward);
5235 if (r < 0)
5236 return log_error_errno(r, "Failed to create PTY forwarder: %m");
5237
5238 if (arg_console_width != UINT_MAX || arg_console_height != UINT_MAX)
5239 (void) pty_forward_set_width_height(forward,
5240 arg_console_width,
5241 arg_console_height);
5242 break;
5243
5244 default:
5245 assert(arg_console_mode == CONSOLE_PASSIVE);
5246 }
5247
5248 *master = TAKE_FD(fd);
5249 }
5250
5251 fd_inner_socket_pair[0] = safe_close(fd_inner_socket_pair[0]);
5252
5253 r = sd_event_loop(event);
5254 if (r < 0)
5255 return log_error_errno(r, "Failed to run event loop: %m");
5256
5257 if (forward) {
5258 char last_char = 0;
5259
5260 (void) pty_forward_get_last_char(forward, &last_char);
5261 forward = pty_forward_free(forward);
5262
5263 if (!arg_quiet && last_char != '\n')
5264 putc('\n', stdout);
5265 }
5266
5267 /* Kill if it is not dead yet anyway */
5268 if (!arg_register && !arg_keep_unit && bus)
5269 terminate_scope(bus, arg_machine);
5270
5271 /* Normally redundant, but better safe than sorry */
5272 (void) kill(*pid, SIGKILL);
5273
5274 fd_kmsg_fifo = safe_close(fd_kmsg_fifo);
5275
5276 if (arg_private_network) {
5277 /* Move network interfaces back to the parent network namespace. We use `safe_fork`
5278 * to avoid having to move the parent to the child network namespace. */
5279 r = safe_fork(NULL, FORK_RESET_SIGNALS|FORK_DEATHSIG|FORK_WAIT|FORK_LOG, NULL);
5280 if (r < 0)
5281 return r;
5282
5283 if (r == 0) {
5284 _cleanup_close_ int parent_netns_fd = -EBADF;
5285
5286 r = namespace_open(getpid_cached(), NULL, NULL, &parent_netns_fd, NULL, NULL);
5287 if (r < 0) {
5288 log_error_errno(r, "Failed to open parent network namespace: %m");
5289 _exit(EXIT_FAILURE);
5290 }
5291
5292 r = namespace_enter(-1, -1, child_netns_fd, -1, -1);
5293 if (r < 0) {
5294 log_error_errno(r, "Failed to enter child network namespace: %m");
5295 _exit(EXIT_FAILURE);
5296 }
5297
5298 /* Reverse network interfaces pair list so that interfaces get their initial name back.
5299 * This is about ensuring interfaces get their old name back when being moved back. */
5300 arg_network_interfaces = strv_reverse(arg_network_interfaces);
5301
5302 r = move_network_interfaces(parent_netns_fd, arg_network_interfaces);
5303 if (r < 0)
5304 log_error_errno(r, "Failed to move network interfaces back to parent network namespace: %m");
5305
5306 _exit(r < 0 ? EXIT_FAILURE : EXIT_SUCCESS);
5307 }
5308 }
5309
5310 r = wait_for_container(TAKE_PID(*pid), &container_status);
5311
5312 /* Tell machined that we are gone. */
5313 if (bus)
5314 (void) unregister_machine(bus, arg_machine);
5315
5316 if (r < 0)
5317 /* We failed to wait for the container, or the container exited abnormally. */
5318 return r;
5319 if (r > 0 || container_status == CONTAINER_TERMINATED) {
5320 /* r > 0 → The container exited with a non-zero status.
5321 * As a special case, we need to replace 133 with a different value,
5322 * because 133 is special-cased in the service file to reboot the container.
5323 * otherwise → The container exited with zero status and a reboot was not requested.
5324 */
5325 if (r == EXIT_FORCE_RESTART)
5326 r = EXIT_FAILURE; /* replace 133 with the general failure code */
5327 *ret = r;
5328 return 0; /* finito */
5329 }
5330
5331 /* CONTAINER_REBOOTED, loop again */
5332
5333 if (arg_keep_unit) {
5334 /* Special handling if we are running as a service: instead of simply
5335 * restarting the machine we want to restart the entire service, so let's
5336 * inform systemd about this with the special exit code 133. The service
5337 * file uses RestartForceExitStatus=133 so that this results in a full
5338 * nspawn restart. This is necessary since we might have cgroup parameters
5339 * set we want to have flushed out. */
5340 *ret = EXIT_FORCE_RESTART;
5341 return 0; /* finito */
5342 }
5343
5344 expose_port_flush(&expose_args->fw_ctx, arg_expose_ports, AF_INET, &expose_args->address4);
5345 expose_port_flush(&expose_args->fw_ctx, arg_expose_ports, AF_INET6, &expose_args->address6);
5346
5347 (void) remove_veth_links(veth_name, arg_network_veth_extra);
5348 *veth_created = false;
5349 return 1; /* loop again */
5350 }
5351
5352 static int initialize_rlimits(void) {
5353 /* The default resource limits the kernel passes to PID 1, as per kernel 5.16. Let's pass our container payload
5354 * the same values as the kernel originally passed to PID 1, in order to minimize differences between host and
5355 * container execution environments. */
5356
5357 static const struct rlimit kernel_defaults[_RLIMIT_MAX] = {
5358 [RLIMIT_AS] = { RLIM_INFINITY, RLIM_INFINITY },
5359 [RLIMIT_CORE] = { 0, RLIM_INFINITY },
5360 [RLIMIT_CPU] = { RLIM_INFINITY, RLIM_INFINITY },
5361 [RLIMIT_DATA] = { RLIM_INFINITY, RLIM_INFINITY },
5362 [RLIMIT_FSIZE] = { RLIM_INFINITY, RLIM_INFINITY },
5363 [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY },
5364 [RLIMIT_MEMLOCK] = { DEFAULT_RLIMIT_MEMLOCK, DEFAULT_RLIMIT_MEMLOCK },
5365 [RLIMIT_MSGQUEUE] = { 819200, 819200 },
5366 [RLIMIT_NICE] = { 0, 0 },
5367 [RLIMIT_NOFILE] = { 1024, 4096 },
5368 [RLIMIT_RSS] = { RLIM_INFINITY, RLIM_INFINITY },
5369 [RLIMIT_RTPRIO] = { 0, 0 },
5370 [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY },
5371 [RLIMIT_STACK] = { 8388608, RLIM_INFINITY },
5372
5373 /* The kernel scales the default for RLIMIT_NPROC and RLIMIT_SIGPENDING based on the system's amount of
5374 * RAM. To provide best compatibility we'll read these limits off PID 1 instead of hardcoding them
5375 * here. This is safe as we know that PID 1 doesn't change these two limits and thus the original
5376 * kernel's initialization should still be valid during runtime — at least if PID 1 is systemd. Note
5377 * that PID 1 changes a number of other resource limits during early initialization which is why we
5378 * don't read the other limits from PID 1 but prefer the static table above. */
5379 };
5380
5381 int rl;
5382
5383 for (rl = 0; rl < _RLIMIT_MAX; rl++) {
5384 /* Let's only fill in what the user hasn't explicitly configured anyway */
5385 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)) == 0) {
5386 const struct rlimit *v;
5387 struct rlimit buffer;
5388
5389 if (IN_SET(rl, RLIMIT_NPROC, RLIMIT_SIGPENDING)) {
5390 /* For these two let's read the limits off PID 1. See above for an explanation. */
5391
5392 if (prlimit(1, rl, NULL, &buffer) < 0)
5393 return log_error_errno(errno, "Failed to read resource limit RLIMIT_%s of PID 1: %m", rlimit_to_string(rl));
5394
5395 v = &buffer;
5396 } else if (rl == RLIMIT_NOFILE) {
5397 /* We nowadays bump RLIMIT_NOFILE's hard limit early in PID 1 for all
5398 * userspace. Given that nspawn containers are often run without our PID 1,
5399 * let's grant the containers a raised RLIMIT_NOFILE hard limit by default,
5400 * so that container userspace gets similar resources as host userspace
5401 * gets. */
5402 buffer = kernel_defaults[rl];
5403 buffer.rlim_max = MIN((rlim_t) read_nr_open(), (rlim_t) HIGH_RLIMIT_NOFILE);
5404 v = &buffer;
5405 } else
5406 v = kernel_defaults + rl;
5407
5408 arg_rlimit[rl] = newdup(struct rlimit, v, 1);
5409 if (!arg_rlimit[rl])
5410 return log_oom();
5411 }
5412
5413 if (DEBUG_LOGGING) {
5414 _cleanup_free_ char *k = NULL;
5415
5416 (void) rlimit_format(arg_rlimit[rl], &k);
5417 log_debug("Setting RLIMIT_%s to %s.", rlimit_to_string(rl), k);
5418 }
5419 }
5420
5421 return 0;
5422 }
5423
5424 static int cant_be_in_netns(void) {
5425 _cleanup_close_ int fd = -EBADF;
5426 struct ucred ucred;
5427 int r;
5428
5429 /* Check if we are in the same netns as udev. If we aren't, then device monitoring (and thus waiting
5430 * for loopback block devices) won't work, and we will hang. Detect this case and exit early with a
5431 * nice message. */
5432
5433 if (!arg_image) /* only matters if --image= us used, i.e. we actually need to use loopback devices */
5434 return 0;
5435
5436 fd = socket(AF_UNIX, SOCK_SEQPACKET|SOCK_NONBLOCK|SOCK_CLOEXEC, 0);
5437 if (fd < 0)
5438 return log_error_errno(errno, "Failed to allocate udev control socket: %m");
5439
5440 r = connect_unix_path(fd, AT_FDCWD, "/run/udev/control");
5441 if (r == -ENOENT || ERRNO_IS_NEG_DISCONNECT(r))
5442 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
5443 "Sorry, but --image= requires access to the host's /run/ hierarchy, since we need access to udev.");
5444 if (r < 0)
5445 return log_error_errno(r, "Failed to connect socket to udev control socket: %m");
5446
5447 r = getpeercred(fd, &ucred);
5448 if (r < 0)
5449 return log_error_errno(r, "Failed to determine peer of udev control socket: %m");
5450
5451 r = in_same_namespace(ucred.pid, 0, NAMESPACE_NET);
5452 if (r < 0)
5453 return log_error_errno(r, "Failed to determine network namespace of udev: %m");
5454 if (r == 0)
5455 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
5456 "Sorry, but --image= is only supported in the main network namespace, since we need access to udev/AF_NETLINK.");
5457 return 0;
5458 }
5459
5460 static int run(int argc, char *argv[]) {
5461 bool remove_directory = false, remove_image = false, veth_created = false, remove_tmprootdir = false;
5462 _cleanup_close_ int master = -EBADF;
5463 _cleanup_fdset_free_ FDSet *fds = NULL;
5464 int r, n_fd_passed, ret = EXIT_SUCCESS;
5465 char veth_name[IFNAMSIZ] = "";
5466 struct ExposeArgs expose_args = {};
5467 _cleanup_(release_lock_file) LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
5468 char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
5469 _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
5470 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
5471 _cleanup_(fw_ctx_freep) FirewallContext *fw_ctx = NULL;
5472 pid_t pid = 0;
5473
5474 log_parse_environment();
5475 log_open();
5476
5477 r = parse_argv(argc, argv);
5478 if (r <= 0)
5479 goto finish;
5480
5481 if (geteuid() != 0) {
5482 r = log_warning_errno(SYNTHETIC_ERRNO(EPERM),
5483 argc >= 2 ? "Need to be root." :
5484 "Need to be root (and some arguments are usually required).\nHint: try --help");
5485 goto finish;
5486 }
5487
5488 r = cant_be_in_netns();
5489 if (r < 0)
5490 goto finish;
5491
5492 r = initialize_rlimits();
5493 if (r < 0)
5494 goto finish;
5495
5496 r = load_oci_bundle();
5497 if (r < 0)
5498 goto finish;
5499
5500 r = determine_names();
5501 if (r < 0)
5502 goto finish;
5503
5504 r = load_settings();
5505 if (r < 0)
5506 goto finish;
5507
5508 r = cg_unified();
5509 if (r < 0) {
5510 log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
5511 goto finish;
5512 }
5513
5514 r = verify_arguments();
5515 if (r < 0)
5516 goto finish;
5517
5518 r = verify_network_interfaces_initialized();
5519 if (r < 0)
5520 goto finish;
5521
5522 /* Reapply environment settings. */
5523 (void) detect_unified_cgroup_hierarchy_from_environment();
5524
5525 /* Ignore SIGPIPE here, because we use splice() on the ptyfwd stuff and that will generate SIGPIPE if
5526 * the result is closed. Note that the container payload child will reset signal mask+handler anyway,
5527 * so just turning this off here means we only turn it off in nspawn itself, not any children. */
5528 (void) ignore_signals(SIGPIPE);
5529
5530 n_fd_passed = sd_listen_fds(false);
5531 if (n_fd_passed > 0) {
5532 r = fdset_new_listen_fds(&fds, false);
5533 if (r < 0) {
5534 log_error_errno(r, "Failed to collect file descriptors: %m");
5535 goto finish;
5536 }
5537 }
5538
5539 /* The "default" umask. This is appropriate for most file and directory
5540 * operations performed by nspawn, and is the umask that will be used for
5541 * the child. Functions like copy_devnodes() change the umask temporarily. */
5542 umask(0022);
5543
5544 if (arg_directory) {
5545 assert(!arg_image);
5546
5547 /* Safety precaution: let's not allow running images from the live host OS image, as long as
5548 * /var from the host will propagate into container dynamically (because bad things happen if
5549 * two systems write to the same /var). Let's allow it for the special cases where /var is
5550 * either copied (i.e. --ephemeral) or replaced (i.e. --volatile=yes|state). */
5551 if (path_equal(arg_directory, "/") && !(arg_ephemeral || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_STATE))) {
5552 r = log_error_errno(SYNTHETIC_ERRNO(EINVAL),
5553 "Spawning container on root directory is not supported. Consider using --ephemeral, --volatile=yes or --volatile=state.");
5554 goto finish;
5555 }
5556
5557 if (arg_ephemeral) {
5558 _cleanup_free_ char *np = NULL;
5559
5560 r = chase_and_update(&arg_directory, 0);
5561 if (r < 0)
5562 goto finish;
5563
5564 /* If the specified path is a mount point we generate the new snapshot immediately
5565 * inside it under a random name. However if the specified is not a mount point we
5566 * create the new snapshot in the parent directory, just next to it. */
5567 r = path_is_mount_point(arg_directory, NULL, 0);
5568 if (r < 0) {
5569 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
5570 goto finish;
5571 }
5572 if (r > 0)
5573 r = tempfn_random_child(arg_directory, "machine.", &np);
5574 else
5575 r = tempfn_random(arg_directory, "machine.", &np);
5576 if (r < 0) {
5577 log_error_errno(r, "Failed to generate name for directory snapshot: %m");
5578 goto finish;
5579 }
5580
5581 /* We take an exclusive lock on this image, since it's our private, ephemeral copy
5582 * only owned by us and no one else. */
5583 r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
5584 if (r < 0) {
5585 log_error_errno(r, "Failed to lock %s: %m", np);
5586 goto finish;
5587 }
5588
5589 {
5590 BLOCK_SIGNALS(SIGINT);
5591 r = btrfs_subvol_snapshot_at(AT_FDCWD, arg_directory, AT_FDCWD, np,
5592 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
5593 BTRFS_SNAPSHOT_FALLBACK_COPY |
5594 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
5595 BTRFS_SNAPSHOT_RECURSIVE |
5596 BTRFS_SNAPSHOT_QUOTA |
5597 BTRFS_SNAPSHOT_SIGINT);
5598 }
5599 if (r == -EINTR) {
5600 log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", np);
5601 goto finish;
5602 }
5603 if (r < 0) {
5604 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
5605 goto finish;
5606 }
5607
5608 free_and_replace(arg_directory, np);
5609 remove_directory = true;
5610 } else {
5611 r = chase_and_update(&arg_directory, arg_template ? CHASE_NONEXISTENT : 0);
5612 if (r < 0)
5613 goto finish;
5614
5615 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
5616 if (r == -EBUSY) {
5617 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
5618 goto finish;
5619 }
5620 if (r < 0) {
5621 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
5622 goto finish;
5623 }
5624
5625 if (arg_template) {
5626 r = chase_and_update(&arg_template, 0);
5627 if (r < 0)
5628 goto finish;
5629
5630 {
5631 BLOCK_SIGNALS(SIGINT);
5632 r = btrfs_subvol_snapshot_at(AT_FDCWD, arg_template, AT_FDCWD, arg_directory,
5633 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
5634 BTRFS_SNAPSHOT_FALLBACK_COPY |
5635 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
5636 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
5637 BTRFS_SNAPSHOT_RECURSIVE |
5638 BTRFS_SNAPSHOT_QUOTA |
5639 BTRFS_SNAPSHOT_SIGINT);
5640 }
5641 if (r == -EEXIST)
5642 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
5643 "Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
5644 else if (r == -EINTR) {
5645 log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", arg_directory);
5646 goto finish;
5647 } else if (r < 0) {
5648 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
5649 goto finish;
5650 } else
5651 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
5652 "Populated %s from template %s.", arg_directory, arg_template);
5653 }
5654 }
5655
5656 if (arg_start_mode == START_BOOT) {
5657 _cleanup_free_ char *b = NULL;
5658 const char *p;
5659
5660 if (arg_pivot_root_new) {
5661 b = path_join(arg_directory, arg_pivot_root_new);
5662 if (!b)
5663 return log_oom();
5664
5665 p = b;
5666 } else
5667 p = arg_directory;
5668
5669 if (path_is_os_tree(p) <= 0) {
5670 r = log_error_errno(SYNTHETIC_ERRNO(EINVAL),
5671 "Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", p);
5672 goto finish;
5673 }
5674 } else {
5675 _cleanup_free_ char *p = NULL;
5676
5677 if (arg_pivot_root_new)
5678 p = path_join(arg_directory, arg_pivot_root_new, "/usr/");
5679 else
5680 p = path_join(arg_directory, "/usr/");
5681 if (!p)
5682 return log_oom();
5683
5684 if (laccess(p, F_OK) < 0) {
5685 r = log_error_errno(SYNTHETIC_ERRNO(EINVAL),
5686 "Directory %s doesn't look like it has an OS tree (/usr/ directory is missing). Refusing.", arg_directory);
5687 goto finish;
5688 }
5689 }
5690
5691 } else {
5692 DissectImageFlags dissect_image_flags =
5693 DISSECT_IMAGE_GENERIC_ROOT |
5694 DISSECT_IMAGE_REQUIRE_ROOT |
5695 DISSECT_IMAGE_RELAX_VAR_CHECK |
5696 DISSECT_IMAGE_USR_NO_ROOT |
5697 DISSECT_IMAGE_ADD_PARTITION_DEVICES |
5698 DISSECT_IMAGE_PIN_PARTITION_DEVICES;
5699 assert(arg_image);
5700 assert(!arg_template);
5701
5702 r = chase_and_update(&arg_image, 0);
5703 if (r < 0)
5704 goto finish;
5705
5706 if (arg_ephemeral) {
5707 _cleanup_free_ char *np = NULL;
5708
5709 r = tempfn_random(arg_image, "machine.", &np);
5710 if (r < 0) {
5711 log_error_errno(r, "Failed to generate name for image snapshot: %m");
5712 goto finish;
5713 }
5714
5715 /* Always take an exclusive lock on our own ephemeral copy. */
5716 r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
5717 if (r < 0) {
5718 r = log_error_errno(r, "Failed to create image lock: %m");
5719 goto finish;
5720 }
5721
5722 {
5723 BLOCK_SIGNALS(SIGINT);
5724 r = copy_file_full(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600,
5725 FS_NOCOW_FL, FS_NOCOW_FL,
5726 COPY_REFLINK|COPY_CRTIME|COPY_SIGINT,
5727 NULL, NULL);
5728 }
5729 if (r == -EINTR) {
5730 log_error_errno(r, "Interrupted while copying image file to %s, removed again.", np);
5731 goto finish;
5732 }
5733 if (r < 0) {
5734 r = log_error_errno(r, "Failed to copy image file: %m");
5735 goto finish;
5736 }
5737
5738 free_and_replace(arg_image, np);
5739 remove_image = true;
5740 } else {
5741 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
5742 if (r == -EBUSY) {
5743 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
5744 goto finish;
5745 }
5746 if (r < 0) {
5747 r = log_error_errno(r, "Failed to create image lock: %m");
5748 goto finish;
5749 }
5750
5751 r = verity_settings_load(
5752 &arg_verity_settings,
5753 arg_image, NULL, NULL);
5754 if (r < 0) {
5755 log_error_errno(r, "Failed to read verity artefacts for %s: %m", arg_image);
5756 goto finish;
5757 }
5758
5759 if (arg_verity_settings.data_path)
5760 dissect_image_flags |= DISSECT_IMAGE_NO_PARTITION_TABLE;
5761 }
5762
5763 if (!mkdtemp(tmprootdir)) {
5764 r = log_error_errno(errno, "Failed to create temporary directory: %m");
5765 goto finish;
5766 }
5767
5768 remove_tmprootdir = true;
5769
5770 arg_directory = strdup(tmprootdir);
5771 if (!arg_directory) {
5772 r = log_oom();
5773 goto finish;
5774 }
5775
5776 r = loop_device_make_by_path(
5777 arg_image,
5778 arg_read_only ? O_RDONLY : O_RDWR,
5779 /* sector_size= */ UINT32_MAX,
5780 FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN,
5781 LOCK_SH,
5782 &loop);
5783 if (r < 0) {
5784 log_error_errno(r, "Failed to set up loopback block device: %m");
5785 goto finish;
5786 }
5787
5788 r = dissect_loop_device_and_warn(
5789 loop,
5790 &arg_verity_settings,
5791 /* mount_options=*/ NULL,
5792 arg_image_policy ?: &image_policy_container,
5793 dissect_image_flags,
5794 &dissected_image);
5795 if (r == -ENOPKG) {
5796 /* dissect_loop_device_and_warn() already printed a brief error message. Expand on that with more details. */
5797 log_notice("Note that the disk image needs to\n"
5798 " a) either contain only a single MBR partition of type 0x83 that is marked bootable\n"
5799 " b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n"
5800 " c) or follow https://uapi-group.org/specifications/specs/discoverable_partitions_specification\n"
5801 " d) or contain a file system without a partition table\n"
5802 "in order to be bootable with systemd-nspawn.");
5803 goto finish;
5804 }
5805 if (r < 0)
5806 goto finish;
5807
5808 r = dissected_image_load_verity_sig_partition(
5809 dissected_image,
5810 loop->fd,
5811 &arg_verity_settings);
5812 if (r < 0)
5813 goto finish;
5814
5815 if (dissected_image->has_verity && !arg_verity_settings.root_hash && !dissected_image->has_verity_sig)
5816 log_notice("Note: image %s contains verity information, but no root hash specified and no embedded "
5817 "root hash signature found! Proceeding without integrity checking.", arg_image);
5818
5819 r = dissected_image_decrypt_interactively(
5820 dissected_image,
5821 NULL,
5822 &arg_verity_settings,
5823 0);
5824 if (r < 0)
5825 goto finish;
5826
5827 /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */
5828 if (remove_image && unlink(arg_image) >= 0)
5829 remove_image = false;
5830
5831 if (arg_architecture < 0)
5832 arg_architecture = dissected_image_architecture(dissected_image);
5833 }
5834
5835 r = custom_mount_prepare_all(arg_directory, arg_custom_mounts, arg_n_custom_mounts);
5836 if (r < 0)
5837 goto finish;
5838
5839 if (arg_console_mode < 0)
5840 arg_console_mode =
5841 isatty(STDIN_FILENO) > 0 &&
5842 isatty(STDOUT_FILENO) > 0 ? CONSOLE_INTERACTIVE : CONSOLE_READ_ONLY;
5843
5844 if (arg_console_mode == CONSOLE_PIPE) /* if we pass STDERR on to the container, don't add our own logs into it too */
5845 arg_quiet = true;
5846
5847 if (!arg_quiet)
5848 log_info("Spawning container %s on %s.\nPress Ctrl-] three times within 1s to kill container.",
5849 arg_machine, arg_image ?: arg_directory);
5850
5851 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, SIGRTMIN+18, -1) >= 0);
5852
5853 r = make_reaper_process(true);
5854 if (r < 0) {
5855 log_error_errno(r, "Failed to become subreaper: %m");
5856 goto finish;
5857 }
5858
5859 if (arg_expose_ports) {
5860 r = fw_ctx_new(&fw_ctx);
5861 if (r < 0) {
5862 log_error_errno(r, "Cannot expose configured ports, firewall initialization failed: %m");
5863 goto finish;
5864 }
5865 expose_args.fw_ctx = fw_ctx;
5866 }
5867 for (;;) {
5868 r = run_container(dissected_image,
5869 fds,
5870 veth_name, &veth_created,
5871 &expose_args, &master,
5872 &pid, &ret);
5873 if (r <= 0)
5874 break;
5875 }
5876
5877 finish:
5878 (void) sd_notify(false,
5879 r == 0 && ret == EXIT_FORCE_RESTART ? "STOPPING=1\nSTATUS=Restarting..." :
5880 "STOPPING=1\nSTATUS=Terminating...");
5881
5882 if (pid > 0)
5883 (void) kill(pid, SIGKILL);
5884
5885 /* Try to flush whatever is still queued in the pty */
5886 if (master >= 0) {
5887 (void) copy_bytes(master, STDOUT_FILENO, UINT64_MAX, 0);
5888 master = safe_close(master);
5889 }
5890
5891 if (pid > 0)
5892 (void) wait_for_terminate(pid, NULL);
5893
5894 pager_close();
5895
5896 if (remove_directory && arg_directory) {
5897 int k;
5898
5899 k = rm_rf(arg_directory, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
5900 if (k < 0)
5901 log_warning_errno(k, "Cannot remove '%s', ignoring: %m", arg_directory);
5902 }
5903
5904 if (remove_image && arg_image) {
5905 if (unlink(arg_image) < 0)
5906 log_warning_errno(errno, "Can't remove image file '%s', ignoring: %m", arg_image);
5907 }
5908
5909 if (remove_tmprootdir) {
5910 if (rmdir(tmprootdir) < 0)
5911 log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir);
5912 }
5913
5914 if (arg_machine) {
5915 const char *p;
5916
5917 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
5918 (void) rm_rf(p, REMOVE_ROOT);
5919 }
5920
5921 expose_port_flush(&fw_ctx, arg_expose_ports, AF_INET, &expose_args.address4);
5922 expose_port_flush(&fw_ctx, arg_expose_ports, AF_INET6, &expose_args.address6);
5923
5924 if (veth_created)
5925 (void) remove_veth_links(veth_name, arg_network_veth_extra);
5926 (void) remove_bridge(arg_network_zone);
5927
5928 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
5929 expose_port_free_all(arg_expose_ports);
5930 rlimit_free_all(arg_rlimit);
5931 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
5932 credential_free_all(arg_credentials, arg_n_credentials);
5933
5934 if (r < 0)
5935 return r;
5936
5937 return ret;
5938 }
5939
/* Hands run() to the DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE() macro, which is defined outside
 * this file (presumably in main-func.h and presumably emitting the program's main() -- TODO confirm).
 * run() returns its negative error value when r < 0, and otherwise the 'ret' value that was filled in
 * through run_container().
 * NOTE(review): the macro name suggests that positive return values are treated as a failing exit
 * status rather than as success -- confirm against the macro definition. */
5940 DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run);