]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
f1bb8796443eb1ffa30f527d4450e51ccfd71520
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #if HAVE_BLKID
4 #endif
5 #include <errno.h>
6 #include <getopt.h>
7 #include <linux/fs.h>
8 #include <linux/loop.h>
9 #if HAVE_SELINUX
10 #include <selinux/selinux.h>
11 #endif
12 #include <stdlib.h>
13 #include <sys/file.h>
14 #include <sys/ioctl.h>
15 #include <sys/personality.h>
16 #include <sys/prctl.h>
17 #include <sys/types.h>
18 #include <sys/wait.h>
19 #include <termios.h>
20 #include <unistd.h>
21
22 #include "sd-bus.h"
23 #include "sd-daemon.h"
24 #include "sd-id128.h"
25
26 #include "alloc-util.h"
27 #include "ether-addr-util.h"
28 #include "barrier.h"
29 #include "base-filesystem.h"
30 #include "blkid-util.h"
31 #include "btrfs-util.h"
32 #include "build.h"
33 #include "bus-error.h"
34 #include "bus-locator.h"
35 #include "bus-util.h"
36 #include "cap-list.h"
37 #include "capability-util.h"
38 #include "cgroup-util.h"
39 #include "chase.h"
40 #include "common-signal.h"
41 #include "copy.h"
42 #include "cpu-set-util.h"
43 #include "creds-util.h"
44 #include "dev-setup.h"
45 #include "discover-image.h"
46 #include "dissect-image.h"
47 #include "env-util.h"
48 #include "escape.h"
49 #include "fd-util.h"
50 #include "fdset.h"
51 #include "fileio.h"
52 #include "format-util.h"
53 #include "fs-util.h"
54 #include "gpt.h"
55 #include "hexdecoct.h"
56 #include "hostname-setup.h"
57 #include "hostname-util.h"
58 #include "id128-util.h"
59 #include "io-util.h"
60 #include "log.h"
61 #include "loop-util.h"
62 #include "loopback-setup.h"
63 #include "macro.h"
64 #include "main-func.h"
65 #include "missing_sched.h"
66 #include "mkdir.h"
67 #include "mount-util.h"
68 #include "mountpoint-util.h"
69 #include "namespace-util.h"
70 #include "netlink-util.h"
71 #include "nspawn-bind-user.h"
72 #include "nspawn-cgroup.h"
73 #include "nspawn-creds.h"
74 #include "nspawn-def.h"
75 #include "nspawn-expose-ports.h"
76 #include "nspawn-mount.h"
77 #include "nspawn-network.h"
78 #include "nspawn-oci.h"
79 #include "nspawn-patch-uid.h"
80 #include "nspawn-register.h"
81 #include "nspawn-seccomp.h"
82 #include "nspawn-settings.h"
83 #include "nspawn-setuid.h"
84 #include "nspawn-stub-pid1.h"
85 #include "nspawn-util.h"
86 #include "nspawn.h"
87 #include "nulstr-util.h"
88 #include "os-util.h"
89 #include "pager.h"
90 #include "parse-argument.h"
91 #include "parse-util.h"
92 #include "pretty-print.h"
93 #include "process-util.h"
94 #include "ptyfwd.h"
95 #include "random-util.h"
96 #include "raw-clone.h"
97 #include "resolve-util.h"
98 #include "rlimit-util.h"
99 #include "rm-rf.h"
100 #include "seccomp-util.h"
101 #include "selinux-util.h"
102 #include "signal-util.h"
103 #include "socket-util.h"
104 #include "stat-util.h"
105 #include "stdio-util.h"
106 #include "string-table.h"
107 #include "string-util.h"
108 #include "strv.h"
109 #include "sysctl-util.h"
110 #include "terminal-util.h"
111 #include "tmpfile-util.h"
112 #include "umask-util.h"
113 #include "unit-name.h"
114 #include "user-util.h"
115
116 /* The notify socket inside the container, which the payload can use to talk to nspawn using the sd_notify(3) protocol */
117 #define NSPAWN_NOTIFY_SOCKET_PATH "/run/host/notify"
118 #define NSPAWN_MOUNT_TUNNEL "/run/host/incoming"
119
120 #define EXIT_FORCE_RESTART 133
121
/* How a container run came to an end. The enumerator names suggest "payload is gone for good" versus
 * "payload asked to be started again" -- NOTE(review): confirm against the code that consumes this. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1,
} ContainerStatus;
126
/* Global option state of this program. Every variable is defined with its built-in default; the parsers
 * further down in this file (parse_environment(), parse_argv()) overwrite a number of them. */

/* Root directory selection and process identity inside the container. */
127 static char *arg_directory = NULL;
128 static char *arg_template = NULL;
129 static char *arg_chdir = NULL;
130 static char *arg_pivot_root_new = NULL;
131 static char *arg_pivot_root_old = NULL;
132 static char *arg_user = NULL;
133 static uid_t arg_uid = UID_INVALID;
134 static gid_t arg_gid = GID_INVALID;
135 static gid_t* arg_supplementary_gids = NULL;
136 static size_t arg_n_supplementary_gids = 0;
137 static sd_id128_t arg_uuid = {};
138 static char *arg_machine = NULL; /* The name used by the host to refer to this */
139 static char *arg_hostname = NULL; /* The name the payload sees by default */
/* Declared const and not registered for destruction below, so these are not owned by us. */
140 static const char *arg_selinux_context = NULL;
141 static const char *arg_selinux_apifs_context = NULL;
142 static char *arg_slice = NULL;
143 static bool arg_private_network = false;
144 static bool arg_read_only = false;
145 static StartMode arg_start_mode = START_PID1;
146 static bool arg_ephemeral = false;
147 static LinkJournal arg_link_journal = LINK_AUTO;
148 static bool arg_link_journal_try = false;
/* Default capability set, as a bit mask with one bit per capability number (bit N <=> capability N).
 * The help text describes --capability=/--drop-capability= as adding to/removing from this default. */
149 static uint64_t arg_caps_retain =
150 (1ULL << CAP_AUDIT_CONTROL) |
151 (1ULL << CAP_AUDIT_WRITE) |
152 (1ULL << CAP_CHOWN) |
153 (1ULL << CAP_DAC_OVERRIDE) |
154 (1ULL << CAP_DAC_READ_SEARCH) |
155 (1ULL << CAP_FOWNER) |
156 (1ULL << CAP_FSETID) |
157 (1ULL << CAP_IPC_OWNER) |
158 (1ULL << CAP_KILL) |
159 (1ULL << CAP_LEASE) |
160 (1ULL << CAP_LINUX_IMMUTABLE) |
161 (1ULL << CAP_MKNOD) |
162 (1ULL << CAP_NET_BIND_SERVICE) |
163 (1ULL << CAP_NET_BROADCAST) |
164 (1ULL << CAP_NET_RAW) |
165 (1ULL << CAP_SETFCAP) |
166 (1ULL << CAP_SETGID) |
167 (1ULL << CAP_SETPCAP) |
168 (1ULL << CAP_SETUID) |
169 (1ULL << CAP_SYS_ADMIN) |
170 (1ULL << CAP_SYS_BOOT) |
171 (1ULL << CAP_SYS_CHROOT) |
172 (1ULL << CAP_SYS_NICE) |
173 (1ULL << CAP_SYS_PTRACE) |
174 (1ULL << CAP_SYS_RESOURCE) |
175 (1ULL << CAP_SYS_TTY_CONFIG);
176 static uint64_t arg_caps_ambient = 0;
177 static CapabilityQuintet arg_full_capabilities = CAPABILITY_QUINTET_NULL;
/* Array of custom mounts plus its element count; custom_mount_check_all() iterates over it. */
178 static CustomMount *arg_custom_mounts = NULL;
179 static size_t arg_n_custom_mounts = 0;
180 static char **arg_setenv = NULL;
181 static bool arg_quiet = false;
182 static bool arg_register = true;
183 static bool arg_keep_unit = false;
/* Networking. */
184 static char **arg_network_interfaces = NULL;
185 static char **arg_network_macvlan = NULL;
186 static char **arg_network_ipvlan = NULL;
187 static bool arg_network_veth = false;
188 static char **arg_network_veth_extra = NULL;
189 static char *arg_network_bridge = NULL;
190 static char *arg_network_zone = NULL;
191 static char *arg_network_namespace_path = NULL;
/* Filled in by parse_environment() from $SYSTEMD_NSPAWN_NETWORK_MAC.
 * NOTE(review): unlike every other variable in this list this one is not 'static' -- confirm whether
 * external linkage is intended before changing it. */
192 struct ether_addr arg_network_provided_mac = {};
193 static PagerFlags arg_pager_flags = 0;
194 static unsigned long arg_personality = PERSONALITY_INVALID;
195 static char *arg_image = NULL;
196 static char *arg_oci_bundle = NULL;
197 static VolatileMode arg_volatile_mode = VOLATILE_NO;
198 static ExposePort *arg_expose_ports = NULL;
199 static char **arg_property = NULL;
200 static sd_bus_message *arg_property_message = NULL;
/* User namespacing. The default UID range 0x10000 is 65536 IDs. */
201 static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
202 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
203 static UserNamespaceOwnership arg_userns_ownership = _USER_NAMESPACE_OWNERSHIP_INVALID;
204 static int arg_kill_signal = 0;
/* Set by detect_unified_cgroup_hierarchy_from_environment()/..._from_image() below. */
205 static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN;
/* Bit mask of SETTING_* flags; the parsers below OR a flag in whenever they set the matching option. */
206 static SettingsMask arg_settings_mask = 0;
/* NOTE(review): an int starting at -1, presumably a tri-state with -1 meaning "not specified" -- confirm. */
207 static int arg_settings_trusted = -1;
208 static char **arg_parameters = NULL;
/* May be pointed at the value of $SYSTEMD_NSPAWN_CONTAINER_SERVICE by parse_environment(). */
209 static const char *arg_container_service_name = "systemd-nspawn";
210 static bool arg_notify_ready = false;
211 static bool arg_use_cgns = true;
/* Namespace flags; parse_share_ns_env() clears a flag here when sharing of that namespace is requested. */
212 static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
213 static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_TMPFS_TMP;
214 static VeritySettings arg_verity_settings = VERITY_SETTINGS_DEFAULT;
215 static char **arg_syscall_allow_list = NULL;
216 static char **arg_syscall_deny_list = NULL;
217 #if HAVE_SECCOMP
218 static scmp_filter_ctx arg_seccomp = NULL;
219 #endif
/* Resource controls. */
220 static struct rlimit *arg_rlimit[_RLIMIT_MAX] = {};
221 static bool arg_no_new_privileges = false;
222 static int arg_oom_score_adjust = 0;
223 static bool arg_oom_score_adjust_set = false;
224 static CPUSet arg_cpu_set = {};
225 static ResolvConfMode arg_resolv_conf = RESOLV_CONF_AUTO;
226 static TimezoneMode arg_timezone = TIMEZONE_AUTO;
227 static unsigned arg_console_width = UINT_MAX, arg_console_height = UINT_MAX;
228 static DeviceNode* arg_extra_nodes = NULL;
229 static size_t arg_n_extra_nodes = 0;
230 static char **arg_sysctl = NULL;
/* Starts out invalid; handle_arg_console() stores the selected mode here. */
231 static ConsoleMode arg_console_mode = _CONSOLE_MODE_INVALID;
232 static Credential *arg_credentials = NULL;
233 static size_t arg_n_credentials = 0;
234 static char **arg_bind_user = NULL;
235 static bool arg_suppress_sync = false;
236 static char *arg_settings_filename = NULL;
237 static Architecture arg_architecture = _ARCHITECTURE_INVALID;
238 static ImagePolicy *arg_image_policy = NULL;
239
/* Pairs each global option variable that owns a resource with the function used to release it: freep for
 * plain allocations, strv_freep for string vectors, and type-specific helpers for the rest. When exactly
 * these run is defined by STATIC_DESTRUCTOR_REGISTER in the project headers, not in this file.
 *
 * NOTE(review): arg_custom_mounts, arg_expose_ports, arg_rlimit, arg_extra_nodes and arg_credentials have
 * no entry in this list; presumably they are released elsewhere -- confirm. */
240 STATIC_DESTRUCTOR_REGISTER(arg_directory, freep);
241 STATIC_DESTRUCTOR_REGISTER(arg_template, freep);
242 STATIC_DESTRUCTOR_REGISTER(arg_chdir, freep);
243 STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_new, freep);
244 STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_old, freep);
245 STATIC_DESTRUCTOR_REGISTER(arg_user, freep);
246 STATIC_DESTRUCTOR_REGISTER(arg_supplementary_gids, freep);
247 STATIC_DESTRUCTOR_REGISTER(arg_machine, freep);
248 STATIC_DESTRUCTOR_REGISTER(arg_hostname, freep);
249 STATIC_DESTRUCTOR_REGISTER(arg_slice, freep);
250 STATIC_DESTRUCTOR_REGISTER(arg_setenv, strv_freep);
251 STATIC_DESTRUCTOR_REGISTER(arg_network_interfaces, strv_freep);
252 STATIC_DESTRUCTOR_REGISTER(arg_network_macvlan, strv_freep);
253 STATIC_DESTRUCTOR_REGISTER(arg_network_ipvlan, strv_freep);
254 STATIC_DESTRUCTOR_REGISTER(arg_network_veth_extra, strv_freep);
255 STATIC_DESTRUCTOR_REGISTER(arg_network_bridge, freep);
256 STATIC_DESTRUCTOR_REGISTER(arg_network_zone, freep);
257 STATIC_DESTRUCTOR_REGISTER(arg_network_namespace_path, freep);
258 STATIC_DESTRUCTOR_REGISTER(arg_image, freep);
259 STATIC_DESTRUCTOR_REGISTER(arg_oci_bundle, freep);
260 STATIC_DESTRUCTOR_REGISTER(arg_property, strv_freep);
261 STATIC_DESTRUCTOR_REGISTER(arg_property_message, sd_bus_message_unrefp);
262 STATIC_DESTRUCTOR_REGISTER(arg_parameters, strv_freep);
263 STATIC_DESTRUCTOR_REGISTER(arg_verity_settings, verity_settings_done);
264 STATIC_DESTRUCTOR_REGISTER(arg_syscall_allow_list, strv_freep);
265 STATIC_DESTRUCTOR_REGISTER(arg_syscall_deny_list, strv_freep);
/* arg_seccomp only exists when built with seccomp support, so its destructor is conditional as well. */
266 #if HAVE_SECCOMP
267 STATIC_DESTRUCTOR_REGISTER(arg_seccomp, seccomp_releasep);
268 #endif
269 STATIC_DESTRUCTOR_REGISTER(arg_cpu_set, cpu_set_reset);
270 STATIC_DESTRUCTOR_REGISTER(arg_sysctl, strv_freep);
271 STATIC_DESTRUCTOR_REGISTER(arg_bind_user, strv_freep);
272 STATIC_DESTRUCTOR_REGISTER(arg_settings_filename, freep);
273 STATIC_DESTRUCTOR_REGISTER(arg_image_policy, image_policy_freep);
274
275 static int handle_arg_console(const char *arg) {
276 if (streq(arg, "help")) {
277 puts("autopipe\n"
278 "interactive\n"
279 "passive\n"
280 "pipe\n"
281 "read-only");
282 return 0;
283 }
284
285 if (streq(arg, "interactive"))
286 arg_console_mode = CONSOLE_INTERACTIVE;
287 else if (streq(arg, "read-only"))
288 arg_console_mode = CONSOLE_READ_ONLY;
289 else if (streq(arg, "passive"))
290 arg_console_mode = CONSOLE_PASSIVE;
291 else if (streq(arg, "pipe")) {
292 if (isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0)
293 log_full(arg_quiet ? LOG_DEBUG : LOG_NOTICE,
294 "Console mode 'pipe' selected, but standard input/output are connected to an interactive TTY. "
295 "Most likely you want to use 'interactive' console mode for proper interactivity and shell job control. "
296 "Proceeding anyway.");
297
298 arg_console_mode = CONSOLE_PIPE;
299 } else if (streq(arg, "autopipe")) {
300 if (isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0)
301 arg_console_mode = CONSOLE_INTERACTIVE;
302 else
303 arg_console_mode = CONSOLE_PIPE;
304 } else
305 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown console mode: %s", optarg);
306
307 arg_settings_mask |= SETTING_CONSOLE_MODE;
308 return 1;
309 }
310
311 static int help(void) {
312 _cleanup_free_ char *link = NULL;
313 int r;
314
315 pager_open(arg_pager_flags);
316
317 r = terminal_urlify_man("systemd-nspawn", "1", &link);
318 if (r < 0)
319 return log_oom();
320
321 printf("%1$s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
322 "%5$sSpawn a command or OS in a light-weight container.%6$s\n\n"
323 " -h --help Show this help\n"
324 " --version Print version string\n"
325 " -q --quiet Do not show status information\n"
326 " --no-pager Do not pipe output into a pager\n"
327 " --settings=BOOLEAN Load additional settings from .nspawn file\n\n"
328 "%3$sImage:%4$s\n"
329 " -D --directory=PATH Root directory for the container\n"
330 " --template=PATH Initialize root directory from template directory,\n"
331 " if missing\n"
332 " -x --ephemeral Run container with snapshot of root directory, and\n"
333 " remove it after exit\n"
334 " -i --image=PATH Root file system disk image (or device node) for\n"
335 " the container\n"
336 " --image-policy=POLICY Specify disk image dissection policy\n"
337 " --oci-bundle=PATH OCI bundle directory\n"
338 " --read-only Mount the root directory read-only\n"
339 " --volatile[=MODE] Run the system in volatile mode\n"
340 " --root-hash=HASH Specify verity root hash for root disk image\n"
341 " --root-hash-sig=SIG Specify pkcs7 signature of root hash for verity\n"
342 " as a DER encoded PKCS7, either as a path to a file\n"
343 " or as an ASCII base64 encoded string prefixed by\n"
344 " 'base64:'\n"
345 " --verity-data=PATH Specify hash device for verity\n"
346 " --pivot-root=PATH[:PATH]\n"
347 " Pivot root to given directory in the container\n\n"
348 "%3$sExecution:%4$s\n"
349 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
350 " -b --boot Boot up full system (i.e. invoke init)\n"
351 " --chdir=PATH Set working directory in the container\n"
352 " -E --setenv=NAME[=VALUE] Pass an environment variable to PID 1\n"
353 " -u --user=USER Run the command under specified user or UID\n"
354 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
355 " --notify-ready=BOOLEAN Receive notifications from the child init process\n"
356 " --suppress-sync=BOOLEAN\n"
357 " Suppress any form of disk data synchronization\n\n"
358 "%3$sSystem Identity:%4$s\n"
359 " -M --machine=NAME Set the machine name for the container\n"
360 " --hostname=NAME Override the hostname for the container\n"
361 " --uuid=UUID Set a specific machine UUID for the container\n\n"
362 "%3$sProperties:%4$s\n"
363 " -S --slice=SLICE Place the container in the specified slice\n"
364 " --property=NAME=VALUE Set scope unit property\n"
365 " --register=BOOLEAN Register container as machine\n"
366 " --keep-unit Do not register a scope for the machine, reuse\n"
367 " the service unit nspawn is running in\n\n"
368 "%3$sUser Namespacing:%4$s\n"
369 " --private-users=no Run without user namespacing\n"
370 " --private-users=yes|pick|identity\n"
371 " Run within user namespace, autoselect UID/GID range\n"
372 " --private-users=UIDBASE[:NUIDS]\n"
373 " Similar, but with user configured UID/GID range\n"
374 " --private-users-ownership=MODE\n"
375 " Adjust ('chown') or map ('map') OS tree ownership\n"
376 " to private UID/GID range\n"
377 " -U Equivalent to --private-users=pick and\n"
378 " --private-users-ownership=auto\n\n"
379 "%3$sNetworking:%4$s\n"
380 " --private-network Disable network in container\n"
381 " --network-interface=HOSTIF[:CONTAINERIF]\n"
382 " Assign an existing network interface to the\n"
383 " container\n"
384 " --network-macvlan=HOSTIF[:CONTAINERIF]\n"
385 " Create a macvlan network interface based on an\n"
386 " existing network interface to the container\n"
387 " --network-ipvlan=HOSTIF[:CONTAINERIF]\n"
388 " Create an ipvlan network interface based on an\n"
389 " existing network interface to the container\n"
390 " -n --network-veth Add a virtual Ethernet connection between host\n"
391 " and container\n"
392 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
393 " Add an additional virtual Ethernet link between\n"
394 " host and container\n"
395 " --network-bridge=INTERFACE\n"
396 " Add a virtual Ethernet connection to the container\n"
397 " and attach it to an existing bridge on the host\n"
398 " --network-zone=NAME Similar, but attach the new interface to an\n"
399 " an automatically managed bridge interface\n"
400 " --network-namespace-path=PATH\n"
401 " Set network namespace to the one represented by\n"
402 " the specified kernel namespace file node\n"
403 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
404 " Expose a container IP port on the host\n\n"
405 "%3$sSecurity:%4$s\n"
406 " --capability=CAP In addition to the default, retain specified\n"
407 " capability\n"
408 " --drop-capability=CAP Drop the specified capability from the default set\n"
409 " --ambient-capability=CAP\n"
410 " Sets the specified capability for the started\n"
411 " process. Not useful if booting a machine.\n"
412 " --no-new-privileges Set PR_SET_NO_NEW_PRIVS flag for container payload\n"
413 " --system-call-filter=LIST|~LIST\n"
414 " Permit/prohibit specific system calls\n"
415 " -Z --selinux-context=SECLABEL\n"
416 " Set the SELinux security context to be used by\n"
417 " processes in the container\n"
418 " -L --selinux-apifs-context=SECLABEL\n"
419 " Set the SELinux security context to be used by\n"
420 " API/tmpfs file systems in the container\n\n"
421 "%3$sResources:%4$s\n"
422 " --rlimit=NAME=LIMIT Set a resource limit for the payload\n"
423 " --oom-score-adjust=VALUE\n"
424 " Adjust the OOM score value for the payload\n"
425 " --cpu-affinity=CPUS Adjust the CPU affinity of the container\n"
426 " --personality=ARCH Pick personality for this container\n\n"
427 "%3$sIntegration:%4$s\n"
428 " --resolv-conf=MODE Select mode of /etc/resolv.conf initialization\n"
429 " --timezone=MODE Select mode of /etc/localtime initialization\n"
430 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
431 " host, try-guest, try-host\n"
432 " -j Equivalent to --link-journal=try-guest\n\n"
433 "%3$sMounts:%4$s\n"
434 " --bind=PATH[:PATH[:OPTIONS]]\n"
435 " Bind mount a file or directory from the host into\n"
436 " the container\n"
437 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
438 " Similar, but creates a read-only bind mount\n"
439 " --inaccessible=PATH Over-mount file node with inaccessible node to mask\n"
440 " it\n"
441 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
442 " --overlay=PATH[:PATH...]:PATH\n"
443 " Create an overlay mount from the host to \n"
444 " the container\n"
445 " --overlay-ro=PATH[:PATH...]:PATH\n"
446 " Similar, but creates a read-only overlay mount\n"
447 " --bind-user=NAME Bind user from host to container\n\n"
448 "%3$sInput/Output:%4$s\n"
449 " --console=MODE Select how stdin/stdout/stderr and /dev/console are\n"
450 " set up for the container.\n"
451 " -P --pipe Equivalent to --console=pipe\n\n"
452 "%3$sCredentials:%4$s\n"
453 " --set-credential=ID:VALUE\n"
454 " Pass a credential with literal value to container.\n"
455 " --load-credential=ID:PATH\n"
456 " Load credential to pass to container from file or\n"
457 " AF_UNIX stream socket.\n"
458 "\nSee the %2$s for details.\n",
459 program_invocation_short_name,
460 link,
461 ansi_underline(),
462 ansi_normal(),
463 ansi_highlight(),
464 ansi_normal());
465
466 return 0;
467 }
468
469 static int custom_mount_check_all(void) {
470 size_t i;
471
472 for (i = 0; i < arg_n_custom_mounts; i++) {
473 CustomMount *m = &arg_custom_mounts[i];
474
475 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
476 if (arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_OFF)
477 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
478 "--private-users-ownership=own may not be combined with custom root mounts.");
479 if (arg_uid_shift == UID_INVALID)
480 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
481 "--private-users with automatic UID shift may not be combined with custom root mounts.");
482 }
483 }
484
485 return 0;
486 }
487
488 static int detect_unified_cgroup_hierarchy_from_environment(void) {
489 const char *e, *var = "SYSTEMD_NSPAWN_UNIFIED_HIERARCHY";
490 int r;
491
492 /* Allow the user to control whether the unified hierarchy is used */
493
494 e = getenv(var);
495 if (!e) {
496 /* $UNIFIED_CGROUP_HIERARCHY has been renamed to $SYSTEMD_NSPAWN_UNIFIED_HIERARCHY. */
497 var = "UNIFIED_CGROUP_HIERARCHY";
498 e = getenv(var);
499 }
500
501 if (!isempty(e)) {
502 r = parse_boolean(e);
503 if (r < 0)
504 return log_error_errno(r, "Failed to parse $%s: %m", var);
505 if (r > 0)
506 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
507 else
508 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
509 }
510
511 return 0;
512 }
513
514 static int detect_unified_cgroup_hierarchy_from_image(const char *directory) {
515 int r;
516
517 /* Let's inherit the mode to use from the host system, but let's take into consideration what systemd
518 * in the image actually supports. */
519 r = cg_all_unified();
520 if (r < 0)
521 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
522 if (r > 0) {
523 /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
524 * routine only detects 231, so we'll have a false negative here for 230. */
525 r = systemd_installation_has_version(directory, "230");
526 if (r < 0)
527 return log_error_errno(r, "Failed to determine systemd version in container: %m");
528 if (r > 0)
529 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
530 else
531 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
532 } else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
533 /* Mixed cgroup hierarchy support was added in 233 */
534 r = systemd_installation_has_version(directory, "233");
535 if (r < 0)
536 return log_error_errno(r, "Failed to determine systemd version in container: %m");
537 if (r > 0)
538 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
539 else
540 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
541 } else
542 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
543
544 log_debug("Using %s hierarchy for container.",
545 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_NONE ? "legacy" :
546 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_SYSTEMD ? "hybrid" : "unified");
547
548 return 0;
549 }
550
551 static int parse_capability_spec(const char *spec, uint64_t *ret_mask) {
552 uint64_t mask = 0;
553 int r;
554
555 for (;;) {
556 _cleanup_free_ char *t = NULL;
557
558 r = extract_first_word(&spec, &t, ",", 0);
559 if (r < 0)
560 return log_error_errno(r, "Failed to parse capability %s.", t);
561 if (r == 0)
562 break;
563
564 if (streq(t, "help")) {
565 for (int i = 0; i < capability_list_length(); i++) {
566 const char *name;
567
568 name = capability_to_name(i);
569 if (name)
570 puts(name);
571 }
572
573 return 0; /* quit */
574 }
575
576 if (streq(t, "all"))
577 mask = UINT64_MAX;
578 else {
579 r = capability_from_name(t);
580 if (r < 0)
581 return log_error_errno(r, "Failed to parse capability %s.", t);
582
583 mask |= 1ULL << r;
584 }
585 }
586
587 *ret_mask = mask;
588 return 1; /* continue */
589 }
590
591 static int parse_share_ns_env(const char *name, unsigned long ns_flag) {
592 int r;
593
594 r = getenv_bool(name);
595 if (r == -ENXIO)
596 return 0;
597 if (r < 0)
598 return log_error_errno(r, "Failed to parse $%s: %m", name);
599
600 arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
601 arg_settings_mask |= SETTING_CLONE_NS_FLAGS;
602 return 0;
603 }
604
605 static int parse_mount_settings_env(void) {
606 const char *e;
607 int r;
608
609 r = getenv_bool("SYSTEMD_NSPAWN_TMPFS_TMP");
610 if (r < 0 && r != -ENXIO)
611 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_TMPFS_TMP: %m");
612 if (r >= 0)
613 SET_FLAG(arg_mount_settings, MOUNT_APPLY_TMPFS_TMP, r > 0);
614
615 e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
616 if (streq_ptr(e, "network"))
617 arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS;
618
619 else if (e) {
620 r = parse_boolean(e);
621 if (r < 0)
622 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_API_VFS_WRITABLE: %m");
623
624 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_RO, r == 0);
625 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_NETNS, false);
626 }
627
628 return 0;
629 }
630
631 static int parse_environment(void) {
632 const char *e;
633 int r;
634
635 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
636 if (r < 0)
637 return r;
638 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
639 if (r < 0)
640 return r;
641 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
642 if (r < 0)
643 return r;
644 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
645 if (r < 0)
646 return r;
647
648 r = parse_mount_settings_env();
649 if (r < 0)
650 return r;
651
652 /* SYSTEMD_NSPAWN_USE_CGNS=0 can be used to disable CLONE_NEWCGROUP use,
653 * even if it is supported. If not supported, it has no effect. */
654 if (!cg_ns_supported())
655 arg_use_cgns = false;
656 else {
657 r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
658 if (r < 0) {
659 if (r != -ENXIO)
660 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_USE_CGNS: %m");
661
662 arg_use_cgns = true;
663 } else {
664 arg_use_cgns = r > 0;
665 arg_settings_mask |= SETTING_USE_CGNS;
666 }
667 }
668
669 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
670 if (e)
671 arg_container_service_name = e;
672
673 e = getenv("SYSTEMD_NSPAWN_NETWORK_MAC");
674 if (e) {
675 r = parse_ether_addr(e, &arg_network_provided_mac);
676 if (r < 0)
677 return log_error_errno(r, "Failed to parse provided MAC address via environment variable");
678 }
679
680 r = getenv_bool("SYSTEMD_SUPPRESS_SYNC");
681 if (r >= 0)
682 arg_suppress_sync = r;
683 else if (r != -ENXIO)
684 log_debug_errno(r, "Failed to parse $SYSTEMD_SUPPRESS_SYNC, ignoring: %m");
685
686 return detect_unified_cgroup_hierarchy_from_environment();
687 }
688
689 static int parse_argv(int argc, char *argv[]) {
690 enum {
691 ARG_VERSION = 0x100,
692 ARG_PRIVATE_NETWORK,
693 ARG_UUID,
694 ARG_READ_ONLY,
695 ARG_CAPABILITY,
696 ARG_AMBIENT_CAPABILITY,
697 ARG_DROP_CAPABILITY,
698 ARG_LINK_JOURNAL,
699 ARG_BIND,
700 ARG_BIND_RO,
701 ARG_TMPFS,
702 ARG_OVERLAY,
703 ARG_OVERLAY_RO,
704 ARG_INACCESSIBLE,
705 ARG_SHARE_SYSTEM,
706 ARG_REGISTER,
707 ARG_KEEP_UNIT,
708 ARG_NETWORK_INTERFACE,
709 ARG_NETWORK_MACVLAN,
710 ARG_NETWORK_IPVLAN,
711 ARG_NETWORK_BRIDGE,
712 ARG_NETWORK_ZONE,
713 ARG_NETWORK_VETH_EXTRA,
714 ARG_NETWORK_NAMESPACE_PATH,
715 ARG_PERSONALITY,
716 ARG_VOLATILE,
717 ARG_TEMPLATE,
718 ARG_PROPERTY,
719 ARG_PRIVATE_USERS,
720 ARG_KILL_SIGNAL,
721 ARG_SETTINGS,
722 ARG_CHDIR,
723 ARG_PIVOT_ROOT,
724 ARG_PRIVATE_USERS_CHOWN,
725 ARG_PRIVATE_USERS_OWNERSHIP,
726 ARG_NOTIFY_READY,
727 ARG_ROOT_HASH,
728 ARG_ROOT_HASH_SIG,
729 ARG_VERITY_DATA,
730 ARG_SYSTEM_CALL_FILTER,
731 ARG_RLIMIT,
732 ARG_HOSTNAME,
733 ARG_NO_NEW_PRIVILEGES,
734 ARG_OOM_SCORE_ADJUST,
735 ARG_CPU_AFFINITY,
736 ARG_RESOLV_CONF,
737 ARG_TIMEZONE,
738 ARG_CONSOLE,
739 ARG_PIPE,
740 ARG_OCI_BUNDLE,
741 ARG_NO_PAGER,
742 ARG_SET_CREDENTIAL,
743 ARG_LOAD_CREDENTIAL,
744 ARG_BIND_USER,
745 ARG_SUPPRESS_SYNC,
746 ARG_IMAGE_POLICY,
747 };
748
749 static const struct option options[] = {
750 { "help", no_argument, NULL, 'h' },
751 { "version", no_argument, NULL, ARG_VERSION },
752 { "directory", required_argument, NULL, 'D' },
753 { "template", required_argument, NULL, ARG_TEMPLATE },
754 { "ephemeral", no_argument, NULL, 'x' },
755 { "user", required_argument, NULL, 'u' },
756 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
757 { "as-pid2", no_argument, NULL, 'a' },
758 { "boot", no_argument, NULL, 'b' },
759 { "uuid", required_argument, NULL, ARG_UUID },
760 { "read-only", no_argument, NULL, ARG_READ_ONLY },
761 { "capability", required_argument, NULL, ARG_CAPABILITY },
762 { "ambient-capability", required_argument, NULL, ARG_AMBIENT_CAPABILITY },
763 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
764 { "no-new-privileges", required_argument, NULL, ARG_NO_NEW_PRIVILEGES },
765 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
766 { "bind", required_argument, NULL, ARG_BIND },
767 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
768 { "tmpfs", required_argument, NULL, ARG_TMPFS },
769 { "overlay", required_argument, NULL, ARG_OVERLAY },
770 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
771 { "inaccessible", required_argument, NULL, ARG_INACCESSIBLE },
772 { "machine", required_argument, NULL, 'M' },
773 { "hostname", required_argument, NULL, ARG_HOSTNAME },
774 { "slice", required_argument, NULL, 'S' },
775 { "setenv", required_argument, NULL, 'E' },
776 { "selinux-context", required_argument, NULL, 'Z' },
777 { "selinux-apifs-context", required_argument, NULL, 'L' },
778 { "quiet", no_argument, NULL, 'q' },
779 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */
780 { "register", required_argument, NULL, ARG_REGISTER },
781 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
782 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
783 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
784 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
785 { "network-veth", no_argument, NULL, 'n' },
786 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA },
787 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
788 { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
789 { "network-namespace-path", required_argument, NULL, ARG_NETWORK_NAMESPACE_PATH },
790 { "personality", required_argument, NULL, ARG_PERSONALITY },
791 { "image", required_argument, NULL, 'i' },
792 { "volatile", optional_argument, NULL, ARG_VOLATILE },
793 { "port", required_argument, NULL, 'p' },
794 { "property", required_argument, NULL, ARG_PROPERTY },
795 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
796 { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN }, /* obsolete */
797 { "private-users-ownership",required_argument, NULL, ARG_PRIVATE_USERS_OWNERSHIP},
798 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
799 { "settings", required_argument, NULL, ARG_SETTINGS },
800 { "chdir", required_argument, NULL, ARG_CHDIR },
801 { "pivot-root", required_argument, NULL, ARG_PIVOT_ROOT },
802 { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
803 { "root-hash", required_argument, NULL, ARG_ROOT_HASH },
804 { "root-hash-sig", required_argument, NULL, ARG_ROOT_HASH_SIG },
805 { "verity-data", required_argument, NULL, ARG_VERITY_DATA },
806 { "system-call-filter", required_argument, NULL, ARG_SYSTEM_CALL_FILTER },
807 { "rlimit", required_argument, NULL, ARG_RLIMIT },
808 { "oom-score-adjust", required_argument, NULL, ARG_OOM_SCORE_ADJUST },
809 { "cpu-affinity", required_argument, NULL, ARG_CPU_AFFINITY },
810 { "resolv-conf", required_argument, NULL, ARG_RESOLV_CONF },
811 { "timezone", required_argument, NULL, ARG_TIMEZONE },
812 { "console", required_argument, NULL, ARG_CONSOLE },
813 { "pipe", no_argument, NULL, ARG_PIPE },
814 { "oci-bundle", required_argument, NULL, ARG_OCI_BUNDLE },
815 { "no-pager", no_argument, NULL, ARG_NO_PAGER },
816 { "set-credential", required_argument, NULL, ARG_SET_CREDENTIAL },
817 { "load-credential", required_argument, NULL, ARG_LOAD_CREDENTIAL },
818 { "bind-user", required_argument, NULL, ARG_BIND_USER },
819 { "suppress-sync", required_argument, NULL, ARG_SUPPRESS_SYNC },
820 { "image-policy", required_argument, NULL, ARG_IMAGE_POLICY },
821 {}
822 };
823
824 int c, r;
825 uint64_t plus = 0, minus = 0;
826 bool mask_all_settings = false, mask_no_settings = false;
827
828 assert(argc >= 0);
829 assert(argv);
830
831 /* Resetting to 0 forces the invocation of an internal initialization routine of getopt_long()
832 * that checks for GNU extensions in optstring ('-' or '+' at the beginning). */
833 optind = 0;
834 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nUE:P", options, NULL)) >= 0)
835 switch (c) {
836
837 case 'h':
838 return help();
839
840 case ARG_VERSION:
841 return version();
842
843 case 'D':
844 r = parse_path_argument(optarg, false, &arg_directory);
845 if (r < 0)
846 return r;
847
848 arg_settings_mask |= SETTING_DIRECTORY;
849 break;
850
851 case ARG_TEMPLATE:
852 r = parse_path_argument(optarg, false, &arg_template);
853 if (r < 0)
854 return r;
855
856 arg_settings_mask |= SETTING_DIRECTORY;
857 break;
858
859 case 'i':
860 r = parse_path_argument(optarg, false, &arg_image);
861 if (r < 0)
862 return r;
863
864 arg_settings_mask |= SETTING_DIRECTORY;
865 break;
866
867 case ARG_OCI_BUNDLE:
868 r = parse_path_argument(optarg, false, &arg_oci_bundle);
869 if (r < 0)
870 return r;
871
872 break;
873
874 case 'x':
875 arg_ephemeral = true;
876 arg_settings_mask |= SETTING_EPHEMERAL;
877 break;
878
879 case 'u':
880 r = free_and_strdup(&arg_user, optarg);
881 if (r < 0)
882 return log_oom();
883
884 arg_settings_mask |= SETTING_USER;
885 break;
886
887 case ARG_NETWORK_ZONE: {
888 _cleanup_free_ char *j = NULL;
889
890 j = strjoin("vz-", optarg);
891 if (!j)
892 return log_oom();
893
894 if (!ifname_valid(j))
895 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
896 "Network zone name not valid: %s", j);
897
898 free_and_replace(arg_network_zone, j);
899
900 arg_network_veth = true;
901 arg_private_network = true;
902 arg_settings_mask |= SETTING_NETWORK;
903 break;
904 }
905
906 case ARG_NETWORK_BRIDGE:
907
908 if (!ifname_valid(optarg))
909 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
910 "Bridge interface name not valid: %s", optarg);
911
912 r = free_and_strdup(&arg_network_bridge, optarg);
913 if (r < 0)
914 return log_oom();
915
916 _fallthrough_;
917 case 'n':
918 arg_network_veth = true;
919 arg_private_network = true;
920 arg_settings_mask |= SETTING_NETWORK;
921 break;
922
923 case ARG_NETWORK_VETH_EXTRA:
924 r = veth_extra_parse(&arg_network_veth_extra, optarg);
925 if (r < 0)
926 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
927
928 arg_private_network = true;
929 arg_settings_mask |= SETTING_NETWORK;
930 break;
931
932 case ARG_NETWORK_INTERFACE:
933 r = interface_pair_parse(&arg_network_interfaces, optarg);
934 if (r < 0)
935 return r;
936
937 arg_private_network = true;
938 arg_settings_mask |= SETTING_NETWORK;
939 break;
940
941 case ARG_NETWORK_MACVLAN:
942 r = macvlan_pair_parse(&arg_network_macvlan, optarg);
943 if (r < 0)
944 return r;
945
946 arg_private_network = true;
947 arg_settings_mask |= SETTING_NETWORK;
948 break;
949
950 case ARG_NETWORK_IPVLAN:
951 r = ipvlan_pair_parse(&arg_network_ipvlan, optarg);
952 if (r < 0)
953 return r;
954
955 _fallthrough_;
956 case ARG_PRIVATE_NETWORK:
957 arg_private_network = true;
958 arg_settings_mask |= SETTING_NETWORK;
959 break;
960
961 case ARG_NETWORK_NAMESPACE_PATH:
962 r = parse_path_argument(optarg, false, &arg_network_namespace_path);
963 if (r < 0)
964 return r;
965
966 arg_settings_mask |= SETTING_NETWORK;
967 break;
968
969 case 'b':
970 if (arg_start_mode == START_PID2)
971 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
972 "--boot and --as-pid2 may not be combined.");
973
974 arg_start_mode = START_BOOT;
975 arg_settings_mask |= SETTING_START_MODE;
976 break;
977
978 case 'a':
979 if (arg_start_mode == START_BOOT)
980 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
981 "--boot and --as-pid2 may not be combined.");
982
983 arg_start_mode = START_PID2;
984 arg_settings_mask |= SETTING_START_MODE;
985 break;
986
987 case ARG_UUID:
988 r = id128_from_string_nonzero(optarg, &arg_uuid);
989 if (r == -ENXIO)
990 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
991 "Machine UUID may not be all zeroes.");
992 if (r < 0)
993 return log_error_errno(r, "Invalid UUID: %s", optarg);
994
995 arg_settings_mask |= SETTING_MACHINE_ID;
996 break;
997
998 case 'S': {
999 _cleanup_free_ char *mangled = NULL;
1000
1001 r = unit_name_mangle_with_suffix(optarg, NULL, UNIT_NAME_MANGLE_WARN, ".slice", &mangled);
1002 if (r < 0)
1003 return log_oom();
1004
1005 free_and_replace(arg_slice, mangled);
1006 arg_settings_mask |= SETTING_SLICE;
1007 break;
1008 }
1009
1010 case 'M':
1011 if (isempty(optarg))
1012 arg_machine = mfree(arg_machine);
1013 else {
1014 if (!hostname_is_valid(optarg, 0))
1015 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1016 "Invalid machine name: %s", optarg);
1017
1018 r = free_and_strdup(&arg_machine, optarg);
1019 if (r < 0)
1020 return log_oom();
1021 }
1022 break;
1023
1024 case ARG_HOSTNAME:
1025 if (isempty(optarg))
1026 arg_hostname = mfree(arg_hostname);
1027 else {
1028 if (!hostname_is_valid(optarg, 0))
1029 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1030 "Invalid hostname: %s", optarg);
1031
1032 r = free_and_strdup(&arg_hostname, optarg);
1033 if (r < 0)
1034 return log_oom();
1035 }
1036
1037 arg_settings_mask |= SETTING_HOSTNAME;
1038 break;
1039
1040 case 'Z':
1041 arg_selinux_context = optarg;
1042 break;
1043
1044 case 'L':
1045 arg_selinux_apifs_context = optarg;
1046 break;
1047
1048 case ARG_READ_ONLY:
1049 arg_read_only = true;
1050 arg_settings_mask |= SETTING_READ_ONLY;
1051 break;
1052
1053 case ARG_AMBIENT_CAPABILITY: {
1054 uint64_t m;
1055 r = parse_capability_spec(optarg, &m);
1056 if (r <= 0)
1057 return r;
1058 arg_caps_ambient |= m;
1059 arg_settings_mask |= SETTING_CAPABILITY;
1060 break;
1061 }
1062 case ARG_CAPABILITY:
1063 case ARG_DROP_CAPABILITY: {
1064 uint64_t m;
1065 r = parse_capability_spec(optarg, &m);
1066 if (r <= 0)
1067 return r;
1068
1069 if (c == ARG_CAPABILITY)
1070 plus |= m;
1071 else
1072 minus |= m;
1073 arg_settings_mask |= SETTING_CAPABILITY;
1074 break;
1075 }
1076 case ARG_NO_NEW_PRIVILEGES:
1077 r = parse_boolean(optarg);
1078 if (r < 0)
1079 return log_error_errno(r, "Failed to parse --no-new-privileges= argument: %s", optarg);
1080
1081 arg_no_new_privileges = r;
1082 arg_settings_mask |= SETTING_NO_NEW_PRIVILEGES;
1083 break;
1084
1085 case 'j':
1086 arg_link_journal = LINK_GUEST;
1087 arg_link_journal_try = true;
1088 arg_settings_mask |= SETTING_LINK_JOURNAL;
1089 break;
1090
1091 case ARG_LINK_JOURNAL:
1092 r = parse_link_journal(optarg, &arg_link_journal, &arg_link_journal_try);
1093 if (r < 0)
1094 return log_error_errno(r, "Failed to parse link journal mode %s", optarg);
1095
1096 arg_settings_mask |= SETTING_LINK_JOURNAL;
1097 break;
1098
1099 case ARG_BIND:
1100 case ARG_BIND_RO:
1101 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
1102 if (r < 0)
1103 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
1104
1105 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1106 break;
1107
1108 case ARG_TMPFS:
1109 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
1110 if (r < 0)
1111 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
1112
1113 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1114 break;
1115
1116 case ARG_OVERLAY:
1117 case ARG_OVERLAY_RO:
1118 r = overlay_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_OVERLAY_RO);
1119 if (r == -EADDRNOTAVAIL)
1120 return log_error_errno(r, "--overlay(-ro)= needs at least two colon-separated directories specified.");
1121 if (r < 0)
1122 return log_error_errno(r, "Failed to parse --overlay(-ro)= argument %s: %m", optarg);
1123
1124 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1125 break;
1126
1127 case ARG_INACCESSIBLE:
1128 r = inaccessible_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
1129 if (r < 0)
1130 return log_error_errno(r, "Failed to parse --inaccessible= argument %s: %m", optarg);
1131
1132 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1133 break;
1134
1135 case 'E':
1136 r = strv_env_replace_strdup_passthrough(&arg_setenv, optarg);
1137 if (r < 0)
1138 return log_error_errno(r, "Cannot assign environment variable %s: %m", optarg);
1139
1140 arg_settings_mask |= SETTING_ENVIRONMENT;
1141 break;
1142
1143 case 'q':
1144 arg_quiet = true;
1145 break;
1146
1147 case ARG_SHARE_SYSTEM:
1148 /* We don't officially support this anymore, except for compat reasons. People should use the
1149 * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
1150 log_warning("Please do not use --share-system anymore, use $SYSTEMD_NSPAWN_SHARE_* instead.");
1151 arg_clone_ns_flags = 0;
1152 break;
1153
1154 case ARG_REGISTER:
1155 r = parse_boolean(optarg);
1156 if (r < 0) {
1157 log_error("Failed to parse --register= argument: %s", optarg);
1158 return r;
1159 }
1160
1161 arg_register = r;
1162 break;
1163
1164 case ARG_KEEP_UNIT:
1165 arg_keep_unit = true;
1166 break;
1167
1168 case ARG_PERSONALITY:
1169
1170 arg_personality = personality_from_string(optarg);
1171 if (arg_personality == PERSONALITY_INVALID)
1172 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1173 "Unknown or unsupported personality '%s'.", optarg);
1174
1175 arg_settings_mask |= SETTING_PERSONALITY;
1176 break;
1177
1178 case ARG_VOLATILE:
1179
1180 if (!optarg)
1181 arg_volatile_mode = VOLATILE_YES;
1182 else if (streq(optarg, "help")) {
1183 DUMP_STRING_TABLE(volatile_mode, VolatileMode, _VOLATILE_MODE_MAX);
1184 return 0;
1185 } else {
1186 VolatileMode m;
1187
1188 m = volatile_mode_from_string(optarg);
1189 if (m < 0)
1190 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1191 "Failed to parse --volatile= argument: %s", optarg);
1192 else
1193 arg_volatile_mode = m;
1194 }
1195
1196 arg_settings_mask |= SETTING_VOLATILE_MODE;
1197 break;
1198
1199 case 'p':
1200 r = expose_port_parse(&arg_expose_ports, optarg);
1201 if (r == -EEXIST)
1202 return log_error_errno(r, "Duplicate port specification: %s", optarg);
1203 if (r < 0)
1204 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
1205
1206 arg_settings_mask |= SETTING_EXPOSE_PORTS;
1207 break;
1208
1209 case ARG_PROPERTY:
1210 if (strv_extend(&arg_property, optarg) < 0)
1211 return log_oom();
1212
1213 break;
1214
1215 case ARG_PRIVATE_USERS: {
1216 int boolean;
1217
1218 if (!optarg)
1219 boolean = true;
1220 else if (!in_charset(optarg, DIGITS))
1221 /* do *not* parse numbers as booleans */
1222 boolean = parse_boolean(optarg);
1223 else
1224 boolean = -1;
1225
1226 if (boolean == 0) {
1227 /* no: User namespacing off */
1228 arg_userns_mode = USER_NAMESPACE_NO;
1229 arg_uid_shift = UID_INVALID;
1230 arg_uid_range = UINT32_C(0x10000);
1231 } else if (boolean > 0) {
1232 /* yes: User namespacing on, UID range is read from root dir */
1233 arg_userns_mode = USER_NAMESPACE_FIXED;
1234 arg_uid_shift = UID_INVALID;
1235 arg_uid_range = UINT32_C(0x10000);
1236 } else if (streq(optarg, "pick")) {
1237 /* pick: User namespacing on, UID range is picked randomly */
1238 arg_userns_mode = USER_NAMESPACE_PICK; /* Note that arg_userns_ownership is
1239 * implied by USER_NAMESPACE_PICK
1240 * further down. */
1241 arg_uid_shift = UID_INVALID;
1242 arg_uid_range = UINT32_C(0x10000);
1243
1244 } else if (streq(optarg, "identity")) {
1245 /* identity: User namespaces on, UID range is map the 0…0xFFFF range to
1246 * itself, i.e. we don't actually map anything, but do take benefit of
1247 * isolation of capability sets. */
1248 arg_userns_mode = USER_NAMESPACE_FIXED;
1249 arg_uid_shift = 0;
1250 arg_uid_range = UINT32_C(0x10000);
1251 } else {
1252 _cleanup_free_ char *buffer = NULL;
1253 const char *range, *shift;
1254
1255 /* anything else: User namespacing on, UID range is explicitly configured */
1256
1257 range = strchr(optarg, ':');
1258 if (range) {
1259 buffer = strndup(optarg, range - optarg);
1260 if (!buffer)
1261 return log_oom();
1262 shift = buffer;
1263
1264 range++;
1265 r = safe_atou32(range, &arg_uid_range);
1266 if (r < 0)
1267 return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range);
1268 } else
1269 shift = optarg;
1270
1271 r = parse_uid(shift, &arg_uid_shift);
1272 if (r < 0)
1273 return log_error_errno(r, "Failed to parse UID \"%s\": %m", optarg);
1274
1275 arg_userns_mode = USER_NAMESPACE_FIXED;
1276
1277 if (!userns_shift_range_valid(arg_uid_shift, arg_uid_range))
1278 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "UID range cannot be empty or go beyond " UID_FMT ".", UID_INVALID);
1279 }
1280
1281 arg_settings_mask |= SETTING_USERNS;
1282 break;
1283 }
1284
1285 case 'U':
1286 if (userns_supported()) {
1287 arg_userns_mode = USER_NAMESPACE_PICK; /* Note that arg_userns_ownership is
1288 * implied by USER_NAMESPACE_PICK
1289 * further down. */
1290 arg_uid_shift = UID_INVALID;
1291 arg_uid_range = UINT32_C(0x10000);
1292
1293 arg_settings_mask |= SETTING_USERNS;
1294 }
1295
1296 break;
1297
1298 case ARG_PRIVATE_USERS_CHOWN:
1299 arg_userns_ownership = USER_NAMESPACE_OWNERSHIP_CHOWN;
1300
1301 arg_settings_mask |= SETTING_USERNS;
1302 break;
1303
1304 case ARG_PRIVATE_USERS_OWNERSHIP:
1305 if (streq(optarg, "help")) {
1306 DUMP_STRING_TABLE(user_namespace_ownership, UserNamespaceOwnership, _USER_NAMESPACE_OWNERSHIP_MAX);
1307 return 0;
1308 }
1309
1310 arg_userns_ownership = user_namespace_ownership_from_string(optarg);
1311 if (arg_userns_ownership < 0)
1312 return log_error_errno(arg_userns_ownership, "Cannot parse --user-namespace-ownership= value: %s", optarg);
1313
1314 arg_settings_mask |= SETTING_USERNS;
1315 break;
1316
1317 case ARG_KILL_SIGNAL:
1318 if (streq(optarg, "help")) {
1319 DUMP_STRING_TABLE(signal, int, _NSIG);
1320 return 0;
1321 }
1322
1323 arg_kill_signal = signal_from_string(optarg);
1324 if (arg_kill_signal < 0)
1325 return log_error_errno(arg_kill_signal, "Cannot parse signal: %s", optarg);
1326
1327 arg_settings_mask |= SETTING_KILL_SIGNAL;
1328 break;
1329
1330 case ARG_SETTINGS:
1331
1332 /* no → do not read files
1333 * yes → read files, do not override cmdline, trust only subset
1334 * override → read files, override cmdline, trust only subset
1335 * trusted → read files, do not override cmdline, trust all
1336 */
1337
1338 r = parse_boolean(optarg);
1339 if (r < 0) {
1340 if (streq(optarg, "trusted")) {
1341 mask_all_settings = false;
1342 mask_no_settings = false;
1343 arg_settings_trusted = true;
1344
1345 } else if (streq(optarg, "override")) {
1346 mask_all_settings = false;
1347 mask_no_settings = true;
1348 arg_settings_trusted = -1;
1349 } else
1350 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
1351 } else if (r > 0) {
1352 /* yes */
1353 mask_all_settings = false;
1354 mask_no_settings = false;
1355 arg_settings_trusted = -1;
1356 } else {
1357 /* no */
1358 mask_all_settings = true;
1359 mask_no_settings = false;
1360 arg_settings_trusted = false;
1361 }
1362
1363 break;
1364
1365 case ARG_CHDIR:
1366 if (!path_is_absolute(optarg))
1367 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1368 "Working directory %s is not an absolute path.", optarg);
1369
1370 r = free_and_strdup(&arg_chdir, optarg);
1371 if (r < 0)
1372 return log_oom();
1373
1374 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
1375 break;
1376
1377 case ARG_PIVOT_ROOT:
1378 r = pivot_root_parse(&arg_pivot_root_new, &arg_pivot_root_old, optarg);
1379 if (r < 0)
1380 return log_error_errno(r, "Failed to parse --pivot-root= argument %s: %m", optarg);
1381
1382 arg_settings_mask |= SETTING_PIVOT_ROOT;
1383 break;
1384
1385 case ARG_NOTIFY_READY:
1386 r = parse_boolean(optarg);
1387 if (r < 0)
1388 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1389 "%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
1390 arg_notify_ready = r;
1391 arg_settings_mask |= SETTING_NOTIFY_READY;
1392 break;
1393
1394 case ARG_ROOT_HASH: {
1395 _cleanup_free_ void *k = NULL;
1396 size_t l;
1397
1398 r = unhexmem(optarg, strlen(optarg), &k, &l);
1399 if (r < 0)
1400 return log_error_errno(r, "Failed to parse root hash: %s", optarg);
1401 if (l < sizeof(sd_id128_t))
1402 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Root hash must be at least 128-bit long: %s", optarg);
1403
1404 free_and_replace(arg_verity_settings.root_hash, k);
1405 arg_verity_settings.root_hash_size = l;
1406 break;
1407 }
1408
1409 case ARG_ROOT_HASH_SIG: {
1410 char *value;
1411 size_t l;
1412 void *p;
1413
1414 if ((value = startswith(optarg, "base64:"))) {
1415 r = unbase64mem(value, strlen(value), &p, &l);
1416 if (r < 0)
1417 return log_error_errno(r, "Failed to parse root hash signature '%s': %m", optarg);
1418
1419 } else {
1420 r = read_full_file(optarg, (char**) &p, &l);
1421 if (r < 0)
1422 return log_error_errno(r, "Failed parse root hash signature file '%s': %m", optarg);
1423 }
1424
1425 free_and_replace(arg_verity_settings.root_hash_sig, p);
1426 arg_verity_settings.root_hash_sig_size = l;
1427 break;
1428 }
1429
1430 case ARG_VERITY_DATA:
1431 r = parse_path_argument(optarg, false, &arg_verity_settings.data_path);
1432 if (r < 0)
1433 return r;
1434 break;
1435
1436 case ARG_SYSTEM_CALL_FILTER: {
1437 bool negative;
1438 const char *items;
1439
1440 negative = optarg[0] == '~';
1441 items = negative ? optarg + 1 : optarg;
1442
1443 for (;;) {
1444 _cleanup_free_ char *word = NULL;
1445
1446 r = extract_first_word(&items, &word, NULL, 0);
1447 if (r == 0)
1448 break;
1449 if (r == -ENOMEM)
1450 return log_oom();
1451 if (r < 0)
1452 return log_error_errno(r, "Failed to parse system call filter: %m");
1453
1454 if (negative)
1455 r = strv_extend(&arg_syscall_deny_list, word);
1456 else
1457 r = strv_extend(&arg_syscall_allow_list, word);
1458 if (r < 0)
1459 return log_oom();
1460 }
1461
1462 arg_settings_mask |= SETTING_SYSCALL_FILTER;
1463 break;
1464 }
1465
1466 case ARG_RLIMIT: {
1467 const char *eq;
1468 _cleanup_free_ char *name = NULL;
1469 int rl;
1470
1471 if (streq(optarg, "help")) {
1472 DUMP_STRING_TABLE(rlimit, int, _RLIMIT_MAX);
1473 return 0;
1474 }
1475
1476 eq = strchr(optarg, '=');
1477 if (!eq)
1478 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1479 "--rlimit= expects an '=' assignment.");
1480
1481 name = strndup(optarg, eq - optarg);
1482 if (!name)
1483 return log_oom();
1484
1485 rl = rlimit_from_string_harder(name);
1486 if (rl < 0)
1487 return log_error_errno(rl, "Unknown resource limit: %s", name);
1488
1489 if (!arg_rlimit[rl]) {
1490 arg_rlimit[rl] = new0(struct rlimit, 1);
1491 if (!arg_rlimit[rl])
1492 return log_oom();
1493 }
1494
1495 r = rlimit_parse(rl, eq + 1, arg_rlimit[rl]);
1496 if (r < 0)
1497 return log_error_errno(r, "Failed to parse resource limit: %s", eq + 1);
1498
1499 arg_settings_mask |= SETTING_RLIMIT_FIRST << rl;
1500 break;
1501 }
1502
1503 case ARG_OOM_SCORE_ADJUST:
1504 r = parse_oom_score_adjust(optarg, &arg_oom_score_adjust);
1505 if (r < 0)
1506 return log_error_errno(r, "Failed to parse --oom-score-adjust= parameter: %s", optarg);
1507
1508 arg_oom_score_adjust_set = true;
1509 arg_settings_mask |= SETTING_OOM_SCORE_ADJUST;
1510 break;
1511
1512 case ARG_CPU_AFFINITY: {
1513 CPUSet cpuset;
1514
1515 r = parse_cpu_set(optarg, &cpuset);
1516 if (r < 0)
1517 return log_error_errno(r, "Failed to parse CPU affinity mask %s: %m", optarg);
1518
1519 cpu_set_reset(&arg_cpu_set);
1520 arg_cpu_set = cpuset;
1521 arg_settings_mask |= SETTING_CPU_AFFINITY;
1522 break;
1523 }
1524
1525 case ARG_RESOLV_CONF:
1526 if (streq(optarg, "help")) {
1527 DUMP_STRING_TABLE(resolv_conf_mode, ResolvConfMode, _RESOLV_CONF_MODE_MAX);
1528 return 0;
1529 }
1530
1531 arg_resolv_conf = resolv_conf_mode_from_string(optarg);
1532 if (arg_resolv_conf < 0)
1533 return log_error_errno(arg_resolv_conf,
1534 "Failed to parse /etc/resolv.conf mode: %s", optarg);
1535
1536 arg_settings_mask |= SETTING_RESOLV_CONF;
1537 break;
1538
1539 case ARG_TIMEZONE:
1540 if (streq(optarg, "help")) {
1541 DUMP_STRING_TABLE(timezone_mode, TimezoneMode, _TIMEZONE_MODE_MAX);
1542 return 0;
1543 }
1544
1545 arg_timezone = timezone_mode_from_string(optarg);
1546 if (arg_timezone < 0)
1547 return log_error_errno(arg_timezone,
1548 "Failed to parse /etc/localtime mode: %s", optarg);
1549
1550 arg_settings_mask |= SETTING_TIMEZONE;
1551 break;
1552
1553 case ARG_CONSOLE:
1554 r = handle_arg_console(optarg);
1555 if (r <= 0)
1556 return r;
1557 break;
1558
1559 case 'P':
1560 case ARG_PIPE:
1561 r = handle_arg_console("pipe");
1562 if (r <= 0)
1563 return r;
1564 break;
1565
1566 case ARG_NO_PAGER:
1567 arg_pager_flags |= PAGER_DISABLE;
1568 break;
1569
1570 case ARG_SET_CREDENTIAL: {
1571 _cleanup_free_ char *word = NULL, *data = NULL;
1572 const char *p = optarg;
1573 Credential *a;
1574 ssize_t l;
1575
1576 r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
1577 if (r == -ENOMEM)
1578 return log_oom();
1579 if (r < 0)
1580 return log_error_errno(r, "Failed to parse --set-credential= parameter: %m");
1581 if (r == 0 || !p)
1582 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Missing value for --set-credential=: %s", optarg);
1583
1584 if (!credential_name_valid(word))
1585 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Credential name is not valid: %s", word);
1586
1587 for (size_t i = 0; i < arg_n_credentials; i++)
1588 if (streq(arg_credentials[i].id, word))
1589 return log_error_errno(SYNTHETIC_ERRNO(EEXIST), "Duplicate credential '%s', refusing.", word);
1590
1591 l = cunescape(p, UNESCAPE_ACCEPT_NUL, &data);
1592 if (l < 0)
1593 return log_error_errno(l, "Failed to unescape credential data: %s", p);
1594
1595 a = reallocarray(arg_credentials, arg_n_credentials + 1, sizeof(Credential));
1596 if (!a)
1597 return log_oom();
1598
1599 a[arg_n_credentials++] = (Credential) {
1600 .id = TAKE_PTR(word),
1601 .data = TAKE_PTR(data),
1602 .size = l,
1603 };
1604
1605 arg_credentials = a;
1606
1607 arg_settings_mask |= SETTING_CREDENTIALS;
1608 break;
1609 }
1610
1611 case ARG_LOAD_CREDENTIAL: {
1612 ReadFullFileFlags flags = READ_FULL_FILE_SECURE;
1613 _cleanup_(erase_and_freep) char *data = NULL;
1614 _cleanup_free_ char *word = NULL, *j = NULL;
1615 const char *p = optarg;
1616 Credential *a;
1617 size_t size, i;
1618
1619 r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
1620 if (r == -ENOMEM)
1621 return log_oom();
1622 if (r < 0)
1623 return log_error_errno(r, "Failed to parse --load-credential= parameter: %m");
1624 if (r == 0 || !p)
1625 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Missing value for --load-credential=: %s", optarg);
1626
1627 if (!credential_name_valid(word))
1628 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Credential name is not valid: %s", word);
1629
1630 for (i = 0; i < arg_n_credentials; i++)
1631 if (streq(arg_credentials[i].id, word))
1632 return log_error_errno(SYNTHETIC_ERRNO(EEXIST), "Duplicate credential '%s', refusing.", word);
1633
1634 if (path_is_absolute(p))
1635 flags |= READ_FULL_FILE_CONNECT_SOCKET;
1636 else {
1637 const char *e;
1638
1639 r = get_credentials_dir(&e);
1640 if (r < 0)
1641 return log_error_errno(r, "Credential not available (no credentials passed at all): %s", word);
1642
1643 j = path_join(e, p);
1644 if (!j)
1645 return log_oom();
1646 }
1647
1648 r = read_full_file_full(AT_FDCWD, j ?: p, UINT64_MAX, SIZE_MAX,
1649 flags,
1650 NULL,
1651 &data, &size);
1652 if (r < 0)
1653 return log_error_errno(r, "Failed to read credential '%s': %m", j ?: p);
1654
1655 a = reallocarray(arg_credentials, arg_n_credentials + 1, sizeof(Credential));
1656 if (!a)
1657 return log_oom();
1658
1659 a[arg_n_credentials++] = (Credential) {
1660 .id = TAKE_PTR(word),
1661 .data = TAKE_PTR(data),
1662 .size = size,
1663 };
1664
1665 arg_credentials = a;
1666
1667 arg_settings_mask |= SETTING_CREDENTIALS;
1668 break;
1669 }
1670
1671 case ARG_BIND_USER:
1672 if (!valid_user_group_name(optarg, 0))
1673 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid user name to bind: %s", optarg);
1674
1675 if (strv_extend(&arg_bind_user, optarg) < 0)
1676 return log_oom();
1677
1678 arg_settings_mask |= SETTING_BIND_USER;
1679 break;
1680
1681 case ARG_SUPPRESS_SYNC:
1682 r = parse_boolean_argument("--suppress-sync=", optarg, &arg_suppress_sync);
1683 if (r < 0)
1684 return r;
1685
1686 arg_settings_mask |= SETTING_SUPPRESS_SYNC;
1687 break;
1688
1689 case ARG_IMAGE_POLICY:
1690 r = parse_image_policy_argument(optarg, &arg_image_policy);
1691 if (r < 0)
1692 return r;
1693 break;
1694
1695 case '?':
1696 return -EINVAL;
1697
1698 default:
1699 assert_not_reached();
1700 }
1701
1702 if (argc > optind) {
1703 strv_free(arg_parameters);
1704 arg_parameters = strv_copy(argv + optind);
1705 if (!arg_parameters)
1706 return log_oom();
1707
1708 arg_settings_mask |= SETTING_START_MODE;
1709 }
1710
1711 if (arg_ephemeral && arg_template && !arg_directory)
1712 /* User asked for ephemeral execution but specified --template= instead of --directory=. Semantically
1713 * such an invocation makes some sense, see https://github.com/systemd/systemd/issues/3667. Let's
1714 * accept this here, and silently make "--ephemeral --template=" equivalent to "--ephemeral
1715 * --directory=". */
1716 arg_directory = TAKE_PTR(arg_template);
1717
1718 arg_caps_retain |= plus;
1719 arg_caps_retain |= arg_private_network ? UINT64_C(1) << CAP_NET_ADMIN : 0;
1720
1721 /* If we're not unsharing the network namespace and are unsharing the user namespace, we won't have
1722 * permissions to bind ports in the container, so let's drop the CAP_NET_BIND_SERVICE capability to
1723 * indicate that. */
1724 if (!arg_private_network && arg_userns_mode != USER_NAMESPACE_NO && arg_uid_shift > 0)
1725 arg_caps_retain &= ~(UINT64_C(1) << CAP_NET_BIND_SERVICE);
1726
1727 arg_caps_retain &= ~minus;
1728
1729 /* Make sure to parse environment before we reset the settings mask below */
1730 r = parse_environment();
1731 if (r < 0)
1732 return r;
1733
1734 /* Load all settings from .nspawn files */
1735 if (mask_no_settings)
1736 arg_settings_mask = 0;
1737
1738 /* Don't load any settings from .nspawn files */
1739 if (mask_all_settings)
1740 arg_settings_mask = _SETTINGS_MASK_ALL;
1741
1742 return 1;
1743 }
1744
1745 static int verify_arguments(void) {
1746 int r;
1747
1748 if (arg_start_mode == START_PID2 && arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
1749 /* If we are running the stub init in the container, we don't need to look at what the init
1750 * in the container supports, because we are not using it. Let's immediately pick the right
1751 * setting based on the host system configuration.
1752 *
1753 * We only do this, if the user didn't use an environment variable to override the detection.
1754 */
1755
1756 r = cg_all_unified();
1757 if (r < 0)
1758 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
1759 if (r > 0)
1760 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
1761 else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0)
1762 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
1763 else
1764 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
1765 }
1766
1767 if (arg_userns_mode != USER_NAMESPACE_NO)
1768 arg_mount_settings |= MOUNT_USE_USERNS;
1769
1770 if (arg_private_network)
1771 arg_mount_settings |= MOUNT_APPLY_APIVFS_NETNS;
1772
1773 if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
1774 !(arg_clone_ns_flags & CLONE_NEWUTS)) {
1775 arg_register = false;
1776 if (arg_start_mode != START_PID1)
1777 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--boot cannot be used without namespacing.");
1778 }
1779
1780 if (arg_userns_ownership < 0)
1781 arg_userns_ownership =
1782 arg_userns_mode == USER_NAMESPACE_PICK ? USER_NAMESPACE_OWNERSHIP_AUTO :
1783 USER_NAMESPACE_OWNERSHIP_OFF;
1784
1785 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
1786 arg_kill_signal = SIGRTMIN+3;
1787
1788 if (arg_volatile_mode != VOLATILE_NO) /* Make sure all file systems contained in the image are mounted read-only if we are in volatile mode */
1789 arg_read_only = true;
1790
1791 if (has_custom_root_mount(arg_custom_mounts, arg_n_custom_mounts))
1792 arg_read_only = true;
1793
1794 if (arg_keep_unit && arg_register && cg_pid_get_owner_uid(0, NULL) >= 0)
1795 /* Save the user from accidentally registering either user-$SESSION.scope or user@.service.
1796 * The latter is not technically a user session, but we don't need to labour the point. */
1797 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--keep-unit --register=yes may not be used when invoked from a user session.");
1798
1799 if (arg_directory && arg_image)
1800 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--directory= and --image= may not be combined.");
1801
1802 if (arg_template && arg_image)
1803 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= and --image= may not be combined.");
1804
1805 if (arg_template && !(arg_directory || arg_machine))
1806 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= needs --directory= or --machine=.");
1807
1808 if (arg_ephemeral && arg_template)
1809 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --template= may not be combined.");
1810
1811 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO))
1812 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --link-journal= may not be combined.");
1813
1814 if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported())
1815 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--private-users= is not supported, kernel compiled without user namespace support.");
1816
1817 if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_CHOWN && arg_read_only)
1818 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1819 "--read-only and --private-users-ownership=chown may not be combined.");
1820
1821 /* We don't support --private-users-ownership=chown together with any of the volatile modes since we
1822 * couldn't change the read-only part of the tree (i.e. /usr) anyway, or because it would trigger a
1823 * massive copy-up (in case of overlay) making the entire exercise pointless. */
1824 if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_CHOWN && arg_volatile_mode != VOLATILE_NO)
1825 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--volatile= and --private-users-ownership=chown may not be combined.");
1826
1827 /* If --network-namespace-path is given with any other network-related option (except --private-network),
1828 * we need to error out, to avoid conflicts between different network options. */
1829 if (arg_network_namespace_path &&
1830 (arg_network_interfaces || arg_network_macvlan ||
1831 arg_network_ipvlan || arg_network_veth_extra ||
1832 arg_network_bridge || arg_network_zone ||
1833 arg_network_veth))
1834 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--network-namespace-path= cannot be combined with other network options.");
1835
1836 if (arg_network_bridge && arg_network_zone)
1837 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1838 "--network-bridge= and --network-zone= may not be combined.");
1839
1840 if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network)
1841 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
1842
1843 if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO))
1844 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot combine --private-users with read-write mounts.");
1845
1846 if (arg_expose_ports && !arg_private_network)
1847 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot use --port= without private networking.");
1848
1849 if (arg_caps_ambient) {
1850 if (arg_caps_ambient == UINT64_MAX)
1851 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= does not support the value all.");
1852
1853 if ((arg_caps_ambient & arg_caps_retain) != arg_caps_ambient)
1854 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= setting is not fully covered by Capability= setting.");
1855
1856 if (arg_start_mode == START_BOOT)
1857 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= setting is not useful for boot mode.");
1858 }
1859
1860 if (arg_userns_mode == USER_NAMESPACE_NO && !strv_isempty(arg_bind_user))
1861 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--bind-user= requires --private-users");
1862
1863 /* Drop duplicate --bind-user= entries */
1864 strv_uniq(arg_bind_user);
1865
1866 r = custom_mount_check_all();
1867 if (r < 0)
1868 return r;
1869
1870 return 0;
1871 }
1872
1873 static int verify_network_interfaces_initialized(void) {
1874 int r;
1875 r = test_network_interfaces_initialized(arg_network_interfaces);
1876 if (r < 0)
1877 return r;
1878
1879 r = test_network_interfaces_initialized(arg_network_macvlan);
1880 if (r < 0)
1881 return r;
1882
1883 r = test_network_interfaces_initialized(arg_network_ipvlan);
1884 if (r < 0)
1885 return r;
1886
1887 return 0;
1888 }
1889
1890 int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1891 assert(p);
1892
1893 if (arg_userns_mode == USER_NAMESPACE_NO)
1894 return 0;
1895
1896 if (uid == UID_INVALID && gid == GID_INVALID)
1897 return 0;
1898
1899 if (uid != UID_INVALID) {
1900 uid += arg_uid_shift;
1901
1902 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1903 return -EOVERFLOW;
1904 }
1905
1906 if (gid != GID_INVALID) {
1907 gid += (gid_t) arg_uid_shift;
1908
1909 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1910 return -EOVERFLOW;
1911 }
1912
1913 return RET_NERRNO(lchown(p, uid, gid));
1914 }
1915
/* Creates the directory 'path' below 'root' with the given mode and, if it was newly created,
 * adjusts its ownership through userns_lchown(). A directory that already exists is left alone and
 * reported as success (0). Any other mkdir() failure is returned as a negative errno. */
int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;
        int r;

        full = prefix_roota(root, path);

        r = RET_NERRNO(mkdir(full, mode));
        if (r < 0)
                return r == -EEXIST ? 0 : r;

        return userns_lchown(full, uid, gid);
}
1929
/* Matches 'path' against the two zoneinfo directory prefixes (relative and absolute form) via
 * PATH_STARTSWITH_SET() and hands back whatever that macro yields. Callers in this file treat a
 * NULL result as "does not point into /usr/share/zoneinfo/" and a non-NULL result as the timezone
 * name following the prefix. */
static const char *timezone_from_path(const char *path) {
        const char *suffix;

        suffix = PATH_STARTSWITH_SET(
                        path,
                        "../usr/share/zoneinfo/",
                        "/usr/share/zoneinfo/");

        return suffix;
}
1936
1937 static bool etc_writable(void) {
1938 return !arg_read_only || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_OVERLAY);
1939 }
1940
/* Propagates the host's /etc/localtime into the container rooted at 'dest', according to
 * arg_timezone. The effective mode is one of: off, delete the container's file, create a symlink
 * into the container's own zoneinfo, bind mount the host file, or copy the host file.
 *
 * Almost all failures are logged and ignored (return 0); the only error propagated to the caller is
 * a failure of the read-only remount that follows a successful bind mount. */
static int setup_timezone(const char *dest) {
        _cleanup_free_ char *p = NULL, *etc = NULL;
        const char *where, *check;
        TimezoneMode m;
        int r;

        assert(dest);

        /* Both "auto" and "symlink" need to know where the host's /etc/localtime points. In auto
         * mode the outcome of the readlink additionally selects the effective mode. */
        if (IN_SET(arg_timezone, TIMEZONE_AUTO, TIMEZONE_SYMLINK)) {
                r = readlink_malloc("/etc/localtime", &p);
                if (r == -ENOENT && arg_timezone == TIMEZONE_AUTO)
                        /* Host has no /etc/localtime: remove the container's too, if we can write there */
                        m = etc_writable() ? TIMEZONE_DELETE : TIMEZONE_OFF;
                else if (r == -EINVAL && arg_timezone == TIMEZONE_AUTO) /* regular file? */
                        m = etc_writable() ? TIMEZONE_COPY : TIMEZONE_BIND;
                else if (r < 0) {
                        log_warning_errno(r, "Failed to read host's /etc/localtime symlink, not updating container timezone: %m");
                        /* To handle warning, delete /etc/localtime and replace it with a symbolic link to a time zone data
                         * file.
                         *
                         * Example:
                         * ln -s /usr/share/zoneinfo/UTC /etc/localtime
                         */
                        return 0;
                } else if (arg_timezone == TIMEZONE_AUTO)
                        /* Host has a proper symlink: mirror it if /etc is writable, otherwise bind mount */
                        m = etc_writable() ? TIMEZONE_SYMLINK : TIMEZONE_BIND;
                else
                        m = arg_timezone;
        } else
                m = arg_timezone;

        if (m == TIMEZONE_OFF)
                return 0;

        /* Resolve /etc inside the container tree; 'etc' receives the resolved path */
        r = chase("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
        if (r < 0) {
                log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
                return 0;
        }

        where = strjoina(etc, "/localtime");

        switch (m) {

        case TIMEZONE_DELETE:
                if (unlink(where) < 0)
                        log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);

                return 0;

        case TIMEZONE_SYMLINK: {
                _cleanup_free_ char *q = NULL;
                const char *z, *what;

                /* 'p' is the host's symlink target, read above */
                z = timezone_from_path(p);
                if (!z) {
                        log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
                        return 0;
                }

                r = readlink_malloc(where, &q);
                if (r >= 0 && streq_ptr(timezone_from_path(q), z))
                        return 0; /* Already pointing to the right place? Then do nothing .. */

                /* Only create the symlink if the zone file can be resolved inside the container */
                check = strjoina(dest, "/usr/share/zoneinfo/", z);
                r = chase(check, dest, 0, NULL, NULL);
                if (r < 0)
                        log_debug_errno(r, "Timezone %s does not exist (or is not accessible) in container, not creating symlink: %m", z);
                else {
                        if (unlink(where) < 0 && errno != ENOENT) {
                                log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING, /* Don't complain on read-only images */
                                               errno, "Failed to remove existing timezone info %s in container, ignoring: %m", where);
                                return 0;
                        }

                        what = strjoina("../usr/share/zoneinfo/", z);
                        if (symlink(what, where) < 0) {
                                log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING,
                                               errno, "Failed to correct timezone of container, ignoring: %m");
                                return 0;
                        }

                        /* Symlink created: skip the bind/copy cases and go fix ownership below */
                        break;
                }

                /* Zone file could not be resolved in the container: fall back to bind mounting the host's file */
                _fallthrough_;
        }

        case TIMEZONE_BIND: {
                _cleanup_free_ char *resolved = NULL;
                int found;

                found = chase(where, dest, CHASE_NONEXISTENT, &resolved, NULL);
                if (found < 0) {
                        log_warning_errno(found, "Failed to resolve /etc/localtime path in container, ignoring: %m");
                        return 0;
                }

                if (found == 0) /* missing? */
                        (void) touch(resolved);

                /* Bind mount first, then remount read-only in a second step */
                r = mount_nofollow_verbose(LOG_WARNING, "/etc/localtime", resolved, NULL, MS_BIND, NULL);
                if (r >= 0)
                        return mount_nofollow_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);

                _fallthrough_;
        }

        case TIMEZONE_COPY:
                /* If mounting failed, try to copy */
                r = copy_file_atomic("/etc/localtime", where, 0644, COPY_REFLINK|COPY_REPLACE);
                if (r < 0) {
                        log_full_errno(IN_SET(r, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to copy /etc/localtime to %s, ignoring: %m", where);
                        return 0;
                }

                break;

        default:
                assert_not_reached();
        }

        /* Fix permissions of the symlink or file copy we just created */
        r = userns_lchown(where, 0, 0);
        if (r < 0)
                log_warning_errno(r, "Failed to chown /etc/localtime, ignoring: %m");

        return 0;
}
2070
/* Tests for the existence of 'path' with access(F_OK). Returns 1 if it exists, 0 if it is missing
 * (ENOENT), and a negative errno (logged at debug level) for any other failure. */
static int have_resolv_conf(const char *path) {
        assert(path);

        if (access(path, F_OK) >= 0)
                return 1;

        if (errno == ENOENT)
                return 0;

        return log_debug_errno(errno, "Failed to determine whether '%s' is available: %m", path);
}
2083
2084 static int resolved_listening(void) {
2085 _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
2086 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
2087 _cleanup_free_ char *dns_stub_listener_mode = NULL;
2088 int r;
2089
2090 /* Check if resolved is listening */
2091
2092 r = sd_bus_open_system(&bus);
2093 if (r < 0)
2094 return log_debug_errno(r, "Failed to open system bus: %m");
2095
2096 r = bus_name_has_owner(bus, "org.freedesktop.resolve1", NULL);
2097 if (r < 0)
2098 return log_debug_errno(r, "Failed to check whether the 'org.freedesktop.resolve1' bus name is taken: %m");
2099 if (r == 0)
2100 return 0;
2101
2102 r = bus_get_property_string(bus, bus_resolve_mgr, "DNSStubListener", &error, &dns_stub_listener_mode);
2103 if (r < 0)
2104 return log_debug_errno(r, "Failed to query DNSStubListener property: %s", bus_error_message(&error, r));
2105
2106 return STR_IN_SET(dns_stub_listener_mode, "udp", "yes");
2107 }
2108
2109 static int setup_resolv_conf(const char *dest) {
2110 _cleanup_free_ char *etc = NULL;
2111 const char *where, *what;
2112 ResolvConfMode m;
2113 int r;
2114
2115 assert(dest);
2116
2117 if (arg_resolv_conf == RESOLV_CONF_AUTO) {
2118 if (arg_private_network)
2119 m = RESOLV_CONF_OFF;
2120 else if (have_resolv_conf(PRIVATE_STUB_RESOLV_CONF) > 0 && resolved_listening() > 0)
2121 m = etc_writable() ? RESOLV_CONF_COPY_STUB : RESOLV_CONF_BIND_STUB;
2122 else if (have_resolv_conf("/etc/resolv.conf") > 0)
2123 m = etc_writable() ? RESOLV_CONF_COPY_HOST : RESOLV_CONF_BIND_HOST;
2124 else
2125 m = etc_writable() ? RESOLV_CONF_DELETE : RESOLV_CONF_OFF;
2126
2127 } else
2128 m = arg_resolv_conf;
2129
2130 if (m == RESOLV_CONF_OFF)
2131 return 0;
2132
2133 r = chase("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
2134 if (r < 0) {
2135 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
2136 return 0;
2137 }
2138
2139 where = strjoina(etc, "/resolv.conf");
2140
2141 if (m == RESOLV_CONF_DELETE) {
2142 if (unlink(where) < 0)
2143 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
2144
2145 return 0;
2146 }
2147
2148 if (IN_SET(m, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_REPLACE_STATIC, RESOLV_CONF_COPY_STATIC))
2149 what = PRIVATE_STATIC_RESOLV_CONF;
2150 else if (IN_SET(m, RESOLV_CONF_BIND_UPLINK, RESOLV_CONF_REPLACE_UPLINK, RESOLV_CONF_COPY_UPLINK))
2151 what = PRIVATE_UPLINK_RESOLV_CONF;
2152 else if (IN_SET(m, RESOLV_CONF_BIND_STUB, RESOLV_CONF_REPLACE_STUB, RESOLV_CONF_COPY_STUB))
2153 what = PRIVATE_STUB_RESOLV_CONF;
2154 else
2155 what = "/etc/resolv.conf";
2156
2157 if (IN_SET(m, RESOLV_CONF_BIND_HOST, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_BIND_UPLINK, RESOLV_CONF_BIND_STUB)) {
2158 _cleanup_free_ char *resolved = NULL;
2159 int found;
2160
2161 found = chase(where, dest, CHASE_NONEXISTENT|CHASE_NOFOLLOW, &resolved, NULL);
2162 if (found < 0) {
2163 log_warning_errno(found, "Failed to resolve /etc/resolv.conf path in container, ignoring: %m");
2164 return 0;
2165 }
2166
2167 if (found == 0) /* missing? */
2168 (void) touch(resolved);
2169
2170 r = mount_nofollow_verbose(LOG_WARNING, what, resolved, NULL, MS_BIND, NULL);
2171 if (r >= 0)
2172 return mount_nofollow_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
2173
2174 /* If that didn't work, let's copy the file */
2175 }
2176
2177 if (IN_SET(m, RESOLV_CONF_REPLACE_HOST, RESOLV_CONF_REPLACE_STATIC, RESOLV_CONF_REPLACE_UPLINK, RESOLV_CONF_REPLACE_STUB))
2178 r = copy_file_atomic(what, where, 0644, COPY_REFLINK|COPY_REPLACE);
2179 else
2180 r = copy_file(what, where, O_TRUNC|O_NOFOLLOW, 0644, COPY_REFLINK);
2181 if (r < 0) {
2182 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
2183 * resolved or something similar runs inside and the symlink points there.
2184 *
2185 * If the disk image is read-only, there's also no point in complaining.
2186 */
2187 log_full_errno(!IN_SET(RESOLV_CONF_COPY_HOST, RESOLV_CONF_COPY_STATIC, RESOLV_CONF_COPY_UPLINK, RESOLV_CONF_COPY_STUB) &&
2188 IN_SET(r, -ELOOP, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
2189 "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where);
2190 return 0;
2191 }
2192
2193 r = userns_lchown(where, 0, 0);
2194 if (r < 0)
2195 log_warning_errno(r, "Failed to chown /etc/resolv.conf, ignoring: %m");
2196
2197 return 0;
2198 }
2199
2200 static int setup_boot_id(void) {
2201 _cleanup_(unlink_and_freep) char *from = NULL;
2202 _cleanup_free_ char *path = NULL;
2203 sd_id128_t rnd = SD_ID128_NULL;
2204 const char *to;
2205 int r;
2206
2207 /* Generate a new randomized boot ID, so that each boot-up of the container gets a new one */
2208
2209 r = tempfn_random_child("/run", "proc-sys-kernel-random-boot-id", &path);
2210 if (r < 0)
2211 return log_error_errno(r, "Failed to generate random boot ID path: %m");
2212
2213 r = sd_id128_randomize(&rnd);
2214 if (r < 0)
2215 return log_error_errno(r, "Failed to generate random boot id: %m");
2216
2217 r = id128_write(path, ID128_FORMAT_UUID, rnd);
2218 if (r < 0)
2219 return log_error_errno(r, "Failed to write boot id: %m");
2220
2221 from = TAKE_PTR(path);
2222 to = "/proc/sys/kernel/random/boot_id";
2223
2224 r = mount_nofollow_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
2225 if (r < 0)
2226 return r;
2227
2228 return mount_nofollow_verbose(LOG_ERR, NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
2229 }
2230
2231 static int copy_devnodes(const char *dest) {
2232 static const char devnodes[] =
2233 "null\0"
2234 "zero\0"
2235 "full\0"
2236 "random\0"
2237 "urandom\0"
2238 "tty\0"
2239 "net/tun\0";
2240
2241 int r = 0;
2242
2243 assert(dest);
2244
2245 BLOCK_WITH_UMASK(0000);
2246
2247 /* Create /dev/net, so that we can create /dev/net/tun in it */
2248 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
2249 return log_error_errno(r, "Failed to create /dev/net directory: %m");
2250
2251 NULSTR_FOREACH(d, devnodes) {
2252 _cleanup_free_ char *from = NULL, *to = NULL;
2253 struct stat st;
2254
2255 from = path_join("/dev/", d);
2256 if (!from)
2257 return log_oom();
2258
2259 to = path_join(dest, from);
2260 if (!to)
2261 return log_oom();
2262
2263 if (stat(from, &st) < 0) {
2264
2265 if (errno != ENOENT)
2266 return log_error_errno(errno, "Failed to stat %s: %m", from);
2267
2268 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode))
2269 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2270 "%s is not a char or block device, cannot copy.", from);
2271 else {
2272 _cleanup_free_ char *sl = NULL, *prefixed = NULL, *dn = NULL, *t = NULL;
2273
2274 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
2275 /* Explicitly warn the user when /dev is already populated. */
2276 if (errno == EEXIST)
2277 log_notice("%s/dev is pre-mounted and pre-populated. If a pre-mounted /dev is provided it needs to be an unpopulated file system.", dest);
2278 if (errno != EPERM)
2279 return log_error_errno(errno, "mknod(%s) failed: %m", to);
2280
2281 /* Some systems abusively restrict mknod but allow bind mounts. */
2282 r = touch(to);
2283 if (r < 0)
2284 return log_error_errno(r, "touch (%s) failed: %m", to);
2285 r = mount_nofollow_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL);
2286 if (r < 0)
2287 return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to);
2288 }
2289
2290 r = userns_lchown(to, 0, 0);
2291 if (r < 0)
2292 return log_error_errno(r, "chown() of device node %s failed: %m", to);
2293
2294 dn = path_join("/dev", S_ISCHR(st.st_mode) ? "char" : "block");
2295 if (!dn)
2296 return log_oom();
2297
2298 r = userns_mkdir(dest, dn, 0755, 0, 0);
2299 if (r < 0)
2300 return log_error_errno(r, "Failed to create '%s': %m", dn);
2301
2302 if (asprintf(&sl, "%s/%u:%u", dn, major(st.st_rdev), minor(st.st_rdev)) < 0)
2303 return log_oom();
2304
2305 prefixed = path_join(dest, sl);
2306 if (!prefixed)
2307 return log_oom();
2308
2309 t = path_join("..", d);
2310 if (!t)
2311 return log_oom();
2312
2313 if (symlink(t, prefixed) < 0)
2314 log_debug_errno(errno, "Failed to symlink '%s' to '%s': %m", t, prefixed);
2315 }
2316 }
2317
2318 return r;
2319 }
2320
2321 static int make_extra_nodes(const char *dest) {
2322 size_t i;
2323 int r;
2324
2325 BLOCK_WITH_UMASK(0000);
2326
2327 for (i = 0; i < arg_n_extra_nodes; i++) {
2328 _cleanup_free_ char *path = NULL;
2329 DeviceNode *n = arg_extra_nodes + i;
2330
2331 path = path_join(dest, n->path);
2332 if (!path)
2333 return log_oom();
2334
2335 if (mknod(path, n->mode, S_ISCHR(n->mode) || S_ISBLK(n->mode) ? makedev(n->major, n->minor) : 0) < 0)
2336 return log_error_errno(errno, "Failed to create device node '%s': %m", path);
2337
2338 r = chmod_and_chown(path, n->mode, n->uid, n->gid);
2339 if (r < 0)
2340 return log_error_errno(r, "Failed to adjust device node ownership of '%s': %m", path);
2341 }
2342
2343 return 0;
2344 }
2345
2346 static int setup_pts(const char *dest) {
2347 _cleanup_free_ char *options = NULL;
2348 const char *p;
2349 int r;
2350
2351 #if HAVE_SELINUX
2352 if (arg_selinux_apifs_context)
2353 (void) asprintf(&options,
2354 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
2355 arg_uid_shift + TTY_GID,
2356 arg_selinux_apifs_context);
2357 else
2358 #endif
2359 (void) asprintf(&options,
2360 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
2361 arg_uid_shift + TTY_GID);
2362
2363 if (!options)
2364 return log_oom();
2365
2366 /* Mount /dev/pts itself */
2367 p = prefix_roota(dest, "/dev/pts");
2368 r = RET_NERRNO(mkdir(p, 0755));
2369 if (r < 0)
2370 return log_error_errno(r, "Failed to create /dev/pts: %m");
2371
2372 r = mount_nofollow_verbose(LOG_ERR, "devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options);
2373 if (r < 0)
2374 return r;
2375 r = userns_lchown(p, 0, 0);
2376 if (r < 0)
2377 return log_error_errno(r, "Failed to chown /dev/pts: %m");
2378
2379 /* Create /dev/ptmx symlink */
2380 p = prefix_roota(dest, "/dev/ptmx");
2381 if (symlink("pts/ptmx", p) < 0)
2382 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
2383 r = userns_lchown(p, 0, 0);
2384 if (r < 0)
2385 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
2386
2387 /* And fix /dev/pts/ptmx ownership */
2388 p = prefix_roota(dest, "/dev/pts/ptmx");
2389 r = userns_lchown(p, 0, 0);
2390 if (r < 0)
2391 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
2392
2393 return 0;
2394 }
2395
2396 static int setup_stdio_as_dev_console(void) {
2397 _cleanup_close_ int terminal = -EBADF;
2398 int r;
2399
2400 /* We open the TTY in O_NOCTTY mode, so that we do not become controller yet. We'll do that later
2401 * explicitly, if we are configured to. */
2402 terminal = open_terminal("/dev/console", O_RDWR|O_NOCTTY);
2403 if (terminal < 0)
2404 return log_error_errno(terminal, "Failed to open console: %m");
2405
2406 /* Make sure we can continue logging to the original stderr, even if
2407 * stderr points elsewhere now */
2408 r = log_dup_console();
2409 if (r < 0)
2410 return log_error_errno(r, "Failed to duplicate stderr: %m");
2411
2412 /* invalidates 'terminal' on success and failure */
2413 r = rearrange_stdio(terminal, terminal, terminal);
2414 TAKE_FD(terminal);
2415 if (r < 0)
2416 return log_error_errno(r, "Failed to move console to stdin/stdout/stderr: %m");
2417
2418 return 0;
2419 }
2420
2421 static int setup_dev_console(const char *console) {
2422 _cleanup_free_ char *p = NULL;
2423 int r;
2424
2425 /* Create /dev/console symlink */
2426 r = path_make_relative("/dev", console, &p);
2427 if (r < 0)
2428 return log_error_errno(r, "Failed to create relative path: %m");
2429
2430 if (symlink(p, "/dev/console") < 0)
2431 return log_error_errno(errno, "Failed to create /dev/console symlink: %m");
2432
2433 return 0;
2434 }
2435
2436 static int setup_keyring(void) {
2437 key_serial_t keyring;
2438
2439 /* Allocate a new session keyring for the container. This makes sure the keyring of the session
2440 * systemd-nspawn was invoked from doesn't leak into the container. Note that by default we block
2441 * keyctl() and request_key() anyway via seccomp so doing this operation isn't strictly necessary,
2442 * but in case people explicitly allow-list these system calls let's make sure we don't leak anything
2443 * into the container. */
2444
2445 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2446 if (keyring == -1) {
2447 if (errno == ENOSYS)
2448 log_debug_errno(errno, "Kernel keyring not supported, ignoring.");
2449 else if (ERRNO_IS_PRIVILEGE(errno))
2450 log_debug_errno(errno, "Kernel keyring access prohibited, ignoring.");
2451 else
2452 return log_error_errno(errno, "Setting up kernel keyring failed: %m");
2453 }
2454
2455 return 0;
2456 }
2457
2458 static int setup_credentials(const char *root) {
2459 const char *q;
2460 int r;
2461
2462 if (arg_n_credentials <= 0)
2463 return 0;
2464
2465 r = userns_mkdir(root, "/run/host", 0755, 0, 0);
2466 if (r < 0)
2467 return log_error_errno(r, "Failed to create /run/host: %m");
2468
2469 r = userns_mkdir(root, "/run/host/credentials", 0700, 0, 0);
2470 if (r < 0)
2471 return log_error_errno(r, "Failed to create /run/host/credentials: %m");
2472
2473 q = prefix_roota(root, "/run/host/credentials");
2474 r = mount_nofollow_verbose(LOG_ERR, NULL, q, "ramfs", MS_NOSUID|MS_NOEXEC|MS_NODEV, "mode=0700");
2475 if (r < 0)
2476 return r;
2477
2478 for (size_t i = 0; i < arg_n_credentials; i++) {
2479 _cleanup_free_ char *j = NULL;
2480 _cleanup_close_ int fd = -EBADF;
2481
2482 j = path_join(q, arg_credentials[i].id);
2483 if (!j)
2484 return log_oom();
2485
2486 fd = open(j, O_CREAT|O_EXCL|O_WRONLY|O_CLOEXEC|O_NOFOLLOW, 0600);
2487 if (fd < 0)
2488 return log_error_errno(errno, "Failed to create credential file %s: %m", j);
2489
2490 r = loop_write(fd, arg_credentials[i].data, arg_credentials[i].size);
2491 if (r < 0)
2492 return log_error_errno(r, "Failed to write credential to file %s: %m", j);
2493
2494 if (fchmod(fd, 0400) < 0)
2495 return log_error_errno(errno, "Failed to adjust access mode of %s: %m", j);
2496
2497 if (arg_userns_mode != USER_NAMESPACE_NO) {
2498 if (fchown(fd, arg_uid_shift, arg_uid_shift) < 0)
2499 return log_error_errno(errno, "Failed to adjust ownership of %s: %m", j);
2500 }
2501 }
2502
2503 if (chmod(q, 0500) < 0)
2504 return log_error_errno(errno, "Failed to adjust access mode of %s: %m", q);
2505
2506 r = userns_lchown(q, 0, 0);
2507 if (r < 0)
2508 return r;
2509
2510 /* Make both mount and superblock read-only now */
2511 r = mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_REMOUNT|MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
2512 if (r < 0)
2513 return r;
2514
2515 return mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, "mode=0500");
2516 }
2517
2518 static int setup_kmsg(int fd_inner_socket) {
2519 _cleanup_(unlink_and_freep) char *from = NULL;
2520 _cleanup_free_ char *fifo = NULL;
2521 _cleanup_close_ int fd = -EBADF;
2522 int r;
2523
2524 assert(fd_inner_socket >= 0);
2525
2526 BLOCK_WITH_UMASK(0000);
2527
2528 /* We create the kmsg FIFO as a temporary file in /run, but immediately delete it after bind mounting it to
2529 * /proc/kmsg. While FIFOs on the reading side behave very similar to /proc/kmsg, their writing side behaves
2530 * differently from /dev/kmsg in that writing blocks when nothing is reading. In order to avoid any problems
2531 * with containers deadlocking due to this we simply make /dev/kmsg unavailable to the container. */
2532
2533 r = tempfn_random_child("/run", "proc-kmsg", &fifo);
2534 if (r < 0)
2535 return log_error_errno(r, "Failed to generate kmsg path: %m");
2536
2537 if (mkfifo(fifo, 0600) < 0)
2538 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
2539
2540 from = TAKE_PTR(fifo);
2541
2542 r = mount_nofollow_verbose(LOG_ERR, from, "/proc/kmsg", NULL, MS_BIND, NULL);
2543 if (r < 0)
2544 return r;
2545
2546 fd = open(from, O_RDWR|O_NONBLOCK|O_CLOEXEC);
2547 if (fd < 0)
2548 return log_error_errno(errno, "Failed to open fifo: %m");
2549
2550 /* Store away the fd in the socket, so that it stays open as long as we run the child */
2551 r = send_one_fd(fd_inner_socket, fd, 0);
2552 if (r < 0)
2553 return log_error_errno(r, "Failed to send FIFO fd: %m");
2554
2555 return 0;
2556 }
2557
/* State handed as userdata to on_address_change(), which forwards the fields to
 * expose_port_execute(). */
struct ExposeArgs {
        union in_addr_union address4;   /* passed along with AF_INET */
        union in_addr_union address6;   /* passed along with AF_INET6 */
        struct FirewallContext *fw_ctx; /* passed by address, i.e. the callee may update the pointer */
};
2563
2564 static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
2565 struct ExposeArgs *args = ASSERT_PTR(userdata);
2566
2567 assert(rtnl);
2568 assert(m);
2569
2570 (void) expose_port_execute(rtnl, &args->fw_ctx, arg_expose_ports, AF_INET, &args->address4);
2571 (void) expose_port_execute(rtnl, &args->fw_ctx, arg_expose_ports, AF_INET6, &args->address6);
2572 return 0;
2573 }
2574
2575 static int setup_hostname(void) {
2576 int r;
2577
2578 if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
2579 return 0;
2580
2581 r = sethostname_idempotent(arg_hostname ?: arg_machine);
2582 if (r < 0)
2583 return log_error_errno(r, "Failed to set hostname: %m");
2584
2585 return 0;
2586 }
2587
2588 static int setup_journal(const char *directory) {
2589 _cleanup_free_ char *d = NULL;
2590 const char *p, *q;
2591 sd_id128_t this_id;
2592 bool try;
2593 int r;
2594
2595 /* Don't link journals in ephemeral mode */
2596 if (arg_ephemeral)
2597 return 0;
2598
2599 if (arg_link_journal == LINK_NO)
2600 return 0;
2601
2602 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
2603
2604 r = sd_id128_get_machine(&this_id);
2605 if (r < 0)
2606 return log_error_errno(r, "Failed to retrieve machine ID: %m");
2607
2608 if (sd_id128_equal(arg_uuid, this_id)) {
2609 log_full(try ? LOG_WARNING : LOG_ERR,
2610 "Host and machine ids are equal (%s): refusing to link journals", SD_ID128_TO_STRING(arg_uuid));
2611 if (try)
2612 return 0;
2613 return -EEXIST;
2614 }
2615
2616 FOREACH_STRING(dirname, "/var", "/var/log", "/var/log/journal") {
2617 r = userns_mkdir(directory, dirname, 0755, 0, 0);
2618 if (r < 0) {
2619 bool ignore = r == -EROFS && try;
2620 log_full_errno(ignore ? LOG_DEBUG : LOG_ERR, r,
2621 "Failed to create %s%s: %m", dirname, ignore ? ", ignoring" : "");
2622 return ignore ? 0 : r;
2623 }
2624 }
2625
2626 p = strjoina("/var/log/journal/", SD_ID128_TO_STRING(arg_uuid));
2627 q = prefix_roota(directory, p);
2628
2629 if (path_is_mount_point(p, NULL, 0) > 0) {
2630 if (try)
2631 return 0;
2632
2633 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2634 "%s: already a mount point, refusing to use for journal", p);
2635 }
2636
2637 if (path_is_mount_point(q, NULL, 0) > 0) {
2638 if (try)
2639 return 0;
2640
2641 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2642 "%s: already a mount point, refusing to use for journal", q);
2643 }
2644
2645 r = readlink_and_make_absolute(p, &d);
2646 if (r >= 0) {
2647 if (IN_SET(arg_link_journal, LINK_GUEST, LINK_AUTO) &&
2648 path_equal(d, q)) {
2649
2650 r = userns_mkdir(directory, p, 0755, 0, 0);
2651 if (r < 0)
2652 log_warning_errno(r, "Failed to create directory %s: %m", q);
2653 return 0;
2654 }
2655
2656 if (unlink(p) < 0)
2657 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
2658 } else if (r == -EINVAL) {
2659
2660 if (arg_link_journal == LINK_GUEST &&
2661 rmdir(p) < 0) {
2662
2663 if (errno == ENOTDIR) {
2664 log_error("%s already exists and is neither a symlink nor a directory", p);
2665 return r;
2666 } else
2667 return log_error_errno(errno, "Failed to remove %s: %m", p);
2668 }
2669 } else if (r != -ENOENT)
2670 return log_error_errno(r, "readlink(%s) failed: %m", p);
2671
2672 if (arg_link_journal == LINK_GUEST) {
2673
2674 if (symlink(q, p) < 0) {
2675 if (try) {
2676 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
2677 return 0;
2678 } else
2679 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
2680 }
2681
2682 r = userns_mkdir(directory, p, 0755, 0, 0);
2683 if (r < 0)
2684 log_warning_errno(r, "Failed to create directory %s: %m", q);
2685 return 0;
2686 }
2687
2688 if (arg_link_journal == LINK_HOST) {
2689 /* don't create parents here — if the host doesn't have
2690 * permanent journal set up, don't force it here */
2691
2692 r = RET_NERRNO(mkdir(p, 0755));
2693 if (r < 0 && r != -EEXIST) {
2694 if (try) {
2695 log_debug_errno(r, "Failed to create %s, skipping journal setup: %m", p);
2696 return 0;
2697 } else
2698 return log_error_errno(r, "Failed to create %s: %m", p);
2699 }
2700
2701 } else if (access(p, F_OK) < 0)
2702 return 0;
2703
2704 if (dir_is_empty(q, /* ignore_hidden_or_backup= */ false) == 0)
2705 log_warning("%s is not empty, proceeding anyway.", q);
2706
2707 r = userns_mkdir(directory, p, 0755, 0, 0);
2708 if (r < 0)
2709 return log_error_errno(r, "Failed to create %s: %m", q);
2710
2711 r = mount_nofollow_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL);
2712 if (r < 0)
2713 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
2714
2715 return 0;
2716 }
2717
2718 static int drop_capabilities(uid_t uid) {
2719 CapabilityQuintet q;
2720
2721 /* Let's initialize all five capability sets to something valid. If the quintet was configured via
2722 * OCI use that, but fill in missing bits. If it wasn't then derive the quintet in full from
2723 * arg_caps_retain. */
2724
2725 if (capability_quintet_is_set(&arg_full_capabilities)) {
2726 q = arg_full_capabilities;
2727
2728 if (q.bounding == UINT64_MAX)
2729 q.bounding = uid == 0 ? arg_caps_retain : 0;
2730
2731 if (q.effective == UINT64_MAX)
2732 q.effective = uid == 0 ? q.bounding : 0;
2733
2734 if (q.inheritable == UINT64_MAX)
2735 q.inheritable = uid == 0 ? q.bounding : arg_caps_ambient;
2736
2737 if (q.permitted == UINT64_MAX)
2738 q.permitted = uid == 0 ? q.bounding : arg_caps_ambient;
2739
2740 if (q.ambient == UINT64_MAX && ambient_capabilities_supported())
2741 q.ambient = arg_caps_ambient;
2742
2743 if (capability_quintet_mangle(&q))
2744 return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Cannot set capabilities that are not in the current bounding set.");
2745
2746 } else {
2747 q = (CapabilityQuintet) {
2748 .bounding = arg_caps_retain,
2749 .effective = uid == 0 ? arg_caps_retain : 0,
2750 .inheritable = uid == 0 ? arg_caps_retain : arg_caps_ambient,
2751 .permitted = uid == 0 ? arg_caps_retain : arg_caps_ambient,
2752 .ambient = ambient_capabilities_supported() ? arg_caps_ambient : UINT64_MAX,
2753 };
2754
2755 /* If we're not using OCI, proceed with mangled capabilities (so we don't error out)
2756 * in order to maintain the same behavior as systemd < 242. */
2757 if (capability_quintet_mangle(&q))
2758 log_full(arg_quiet ? LOG_DEBUG : LOG_WARNING,
2759 "Some capabilities will not be set because they are not in the current bounding set.");
2760
2761 }
2762
2763 return capability_quintet_enforce(&q);
2764 }
2765
2766 static int reset_audit_loginuid(void) {
2767 _cleanup_free_ char *p = NULL;
2768 int r;
2769
2770 if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
2771 return 0;
2772
2773 r = read_one_line_file("/proc/self/loginuid", &p);
2774 if (r == -ENOENT)
2775 return 0;
2776 if (r < 0)
2777 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2778
2779 /* Already reset? */
2780 if (streq(p, "4294967295"))
2781 return 0;
2782
2783 r = write_string_file("/proc/self/loginuid", "4294967295", WRITE_STRING_FILE_DISABLE_BUFFER);
2784 if (r < 0) {
2785 log_error_errno(r,
2786 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2787 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2788 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2789 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2790 "using systemd-nspawn. Sleeping for 5s... (%m)");
2791
2792 sleep(5);
2793 }
2794
2795 return 0;
2796 }
2797
2798 static int mount_tunnel_dig(const char *root) {
2799 const char *p, *q;
2800 int r;
2801
2802 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2803 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2804 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
2805 (void) mkdir_p(p, 0600);
2806
2807 r = userns_mkdir(root, "/run/host", 0755, 0, 0);
2808 if (r < 0)
2809 return log_error_errno(r, "Failed to create /run/host: %m");
2810
2811 r = userns_mkdir(root, NSPAWN_MOUNT_TUNNEL, 0600, 0, 0);
2812 if (r < 0)
2813 return log_error_errno(r, "Failed to create "NSPAWN_MOUNT_TUNNEL": %m");
2814
2815 q = prefix_roota(root, NSPAWN_MOUNT_TUNNEL);
2816 r = mount_nofollow_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
2817 if (r < 0)
2818 return r;
2819
2820 r = mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
2821 if (r < 0)
2822 return r;
2823
2824 return 0;
2825 }
2826
2827 static int mount_tunnel_open(void) {
2828 int r;
2829
2830 r = mount_follow_verbose(LOG_ERR, NULL, NSPAWN_MOUNT_TUNNEL, NULL, MS_SLAVE, NULL);
2831 if (r < 0)
2832 return r;
2833
2834 return 0;
2835 }
2836
2837 static int setup_machine_id(const char *directory) {
2838 int r;
2839
2840 /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
2841 * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
2842 * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
2843 * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
2844 * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
2845 * container behaves nicely). */
2846
2847 r = id128_get_machine(directory, &arg_uuid);
2848 if (ERRNO_IS_NEG_MACHINE_ID_UNSET(r)) {
2849 /* If the file is missing, empty, or uninitialized, we don't mind */
2850 if (sd_id128_is_null(arg_uuid)) {
2851 r = sd_id128_randomize(&arg_uuid);
2852 if (r < 0)
2853 return log_error_errno(r, "Failed to acquire randomized machine UUID: %m");
2854 }
2855 } else if (r < 0)
2856 return log_error_errno(r, "Failed to read machine ID from container image: %m");
2857
2858 return 0;
2859 }
2860
2861 static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
2862 int r;
2863
2864 assert(directory);
2865
2866 if (arg_userns_mode == USER_NAMESPACE_NO || arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_CHOWN)
2867 return 0;
2868
2869 r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
2870 if (r == -EOPNOTSUPP)
2871 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
2872 if (r == -EBADE)
2873 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
2874 if (r < 0)
2875 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
2876 if (r == 0)
2877 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
2878 else
2879 log_debug("Patched directory tree to match UID/GID range.");
2880
2881 return r;
2882 }
2883
2884 /*
2885 * Return values:
2886 * < 0 : wait_for_terminate() failed to get the state of the
2887 * container, the container was terminated by a signal, or
2888 * failed for an unknown reason. No change is made to the
2889 * container argument.
2890 * > 0 : The program executed in the container terminated with an
2891 * error. The exit code of the program executed in the
2892 * container is returned. The container argument has been set
2893 * to CONTAINER_TERMINATED.
2894 * 0 : The container is being rebooted, has been shut down or exited
2895 * successfully. The container argument has been set to either
2896 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2897 *
2898 * That is, success is indicated by a return value of zero, and an
2899 * error is indicated by a non-zero value.
2900 */
2901 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2902 siginfo_t status;
2903 int r;
2904
2905 r = wait_for_terminate(pid, &status);
2906 if (r < 0)
2907 return log_warning_errno(r, "Failed to wait for container: %m");
2908
2909 switch (status.si_code) {
2910
2911 case CLD_EXITED:
2912 if (status.si_status == 0)
2913 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
2914 else
2915 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2916
2917 *container = CONTAINER_TERMINATED;
2918 return status.si_status;
2919
2920 case CLD_KILLED:
2921 if (status.si_status == SIGINT) {
2922 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
2923 *container = CONTAINER_TERMINATED;
2924 return 0;
2925
2926 } else if (status.si_status == SIGHUP) {
2927 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
2928 *container = CONTAINER_REBOOTED;
2929 return 0;
2930 }
2931
2932 _fallthrough_;
2933 case CLD_DUMPED:
2934 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2935 "Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2936
2937 default:
2938 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2939 "Container %s failed due to unknown reason.", arg_machine);
2940 }
2941 }
2942
2943 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2944 pid_t pid;
2945
2946 pid = PTR_TO_PID(userdata);
2947 if (pid > 0) {
2948 if (kill(pid, arg_kill_signal) >= 0) {
2949 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2950 sd_event_source_set_userdata(s, NULL);
2951 return 0;
2952 }
2953 }
2954
2955 sd_event_exit(sd_event_source_get_event(s), 0);
2956 return 0;
2957 }
2958
2959 static int on_sigchld(sd_event_source *s, const struct signalfd_siginfo *ssi, void *userdata) {
2960 pid_t pid;
2961
2962 assert(s);
2963 assert(ssi);
2964
2965 pid = PTR_TO_PID(userdata);
2966
2967 for (;;) {
2968 siginfo_t si = {};
2969
2970 if (waitid(P_ALL, 0, &si, WNOHANG|WNOWAIT|WEXITED) < 0)
2971 return log_error_errno(errno, "Failed to waitid(): %m");
2972 if (si.si_pid == 0) /* No pending children. */
2973 break;
2974 if (si.si_pid == pid) {
2975 /* The main process we care for has exited. Return from
2976 * signal handler but leave the zombie. */
2977 sd_event_exit(sd_event_source_get_event(s), 0);
2978 break;
2979 }
2980
2981 /* Reap all other children. */
2982 (void) waitid(P_PID, si.si_pid, &si, WNOHANG|WEXITED);
2983 }
2984
2985 return 0;
2986 }
2987
2988 static int on_request_stop(sd_bus_message *m, void *userdata, sd_bus_error *error) {
2989 pid_t pid;
2990
2991 assert(m);
2992
2993 pid = PTR_TO_PID(userdata);
2994
2995 if (arg_kill_signal > 0) {
2996 log_info("Container termination requested. Attempting to halt container.");
2997 (void) kill(pid, arg_kill_signal);
2998 } else {
2999 log_info("Container termination requested. Exiting.");
3000 sd_event_exit(sd_bus_get_event(sd_bus_message_get_bus(m)), 0);
3001 }
3002
3003 return 0;
3004 }
3005
3006 static int determine_names(void) {
3007 int r;
3008
3009 if (arg_template && !arg_directory && arg_machine) {
3010
3011 /* If --template= was specified then we should not
3012 * search for a machine, but instead create a new one
3013 * in /var/lib/machine. */
3014
3015 arg_directory = path_join("/var/lib/machines", arg_machine);
3016 if (!arg_directory)
3017 return log_oom();
3018 }
3019
3020 if (!arg_image && !arg_directory) {
3021 if (arg_machine) {
3022 _cleanup_(image_unrefp) Image *i = NULL;
3023
3024 r = image_find(IMAGE_MACHINE, arg_machine, NULL, &i);
3025 if (r == -ENOENT)
3026 return log_error_errno(r, "No image for machine '%s'.", arg_machine);
3027 if (r < 0)
3028 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
3029
3030 if (IN_SET(i->type, IMAGE_RAW, IMAGE_BLOCK))
3031 r = free_and_strdup(&arg_image, i->path);
3032 else
3033 r = free_and_strdup(&arg_directory, i->path);
3034 if (r < 0)
3035 return log_oom();
3036
3037 if (!arg_ephemeral)
3038 arg_read_only = arg_read_only || i->read_only;
3039 } else {
3040 r = safe_getcwd(&arg_directory);
3041 if (r < 0)
3042 return log_error_errno(r, "Failed to determine current directory: %m");
3043 }
3044
3045 if (!arg_directory && !arg_image)
3046 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine path, please use -D or -i.");
3047 }
3048
3049 if (!arg_machine) {
3050 if (arg_directory && path_equal(arg_directory, "/"))
3051 arg_machine = gethostname_malloc();
3052 else if (arg_image) {
3053 char *e;
3054
3055 r = path_extract_filename(arg_image, &arg_machine);
3056 if (r < 0)
3057 return log_error_errno(r, "Failed to extract file name from '%s': %m", arg_image);
3058
3059 /* Truncate suffix if there is one */
3060 e = endswith(arg_machine, ".raw");
3061 if (e)
3062 *e = 0;
3063 } else {
3064 r = path_extract_filename(arg_directory, &arg_machine);
3065 if (r < 0)
3066 return log_error_errno(r, "Failed to extract file name from '%s': %m", arg_directory);
3067 }
3068
3069 hostname_cleanup(arg_machine);
3070 if (!hostname_is_valid(arg_machine, 0))
3071 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine machine name automatically, please use -M.");
3072
3073 /* Copy the machine name before the random suffix is added below, otherwise we won't be able
3074 * to match fixed config file names. */
3075 arg_settings_filename = strjoin(arg_machine, ".nspawn");
3076 if (!arg_settings_filename)
3077 return log_oom();
3078
3079 /* Add a random suffix when this is an ephemeral machine, so that we can run many
3080 * instances at once without manually having to specify -M each time. */
3081 if (arg_ephemeral)
3082 if (strextendf(&arg_machine, "-%016" PRIx64, random_u64()) < 0)
3083 return log_oom();
3084 } else {
3085 arg_settings_filename = strjoin(arg_machine, ".nspawn");
3086 if (!arg_settings_filename)
3087 return log_oom();
3088 }
3089
3090 return 0;
3091 }
3092
/* Resolves the path stored in *p via chase() using the given flags and replaces *p with the resolved
 * path. A NULL *p is left alone.
 *
 * Returns a negative errno-style value if resolution fails (already logged), otherwise 0 for a
 * NULL *p or the result of free_and_replace(). */
static int chase_and_update(char **p, unsigned flags) {
        char *resolved;
        int r;

        assert(p);

        if (*p) {
                r = chase(*p, NULL, flags, &resolved, NULL);
                if (r < 0)
                        return log_error_errno(r, "Failed to resolve path %s: %m", *p);

                return free_and_replace(*p, resolved);
        }

        return 0;
}
3108
3109 static int determine_uid_shift(const char *directory) {
3110
3111 if (arg_userns_mode == USER_NAMESPACE_NO) {
3112 arg_uid_shift = 0;
3113 return 0;
3114 }
3115
3116 if (arg_uid_shift == UID_INVALID) {
3117 struct stat st;
3118
3119 /* Read the UID shift off the image. Maybe we can reuse this to avoid chowning. */
3120
3121 if (stat(directory, &st) < 0)
3122 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
3123
3124 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
3125
3126 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000)))
3127 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
3128 "UID and GID base of %s don't match.", directory);
3129
3130 arg_uid_range = UINT32_C(0x10000);
3131
3132 if (arg_uid_shift != 0) {
3133 /* If the image is shifted already, then we'll fall back to classic chowning, for
3134 * compatibility (and simplicity), or refuse if mapping is explicitly requested. */
3135
3136 if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_AUTO) {
3137 log_debug("UID base of %s is non-zero, not using UID mapping.", directory);
3138 arg_userns_ownership = USER_NAMESPACE_OWNERSHIP_CHOWN;
3139 } else if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_MAP)
3140 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
3141 "UID base of %s is not zero, UID mapping not supported.", directory);
3142 }
3143 }
3144
3145 if (!userns_shift_range_valid(arg_uid_shift, arg_uid_range))
3146 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "UID base too high for UID range.");
3147
3148 return 0;
3149 }
3150
3151 static unsigned long effective_clone_ns_flags(void) {
3152 unsigned long flags = arg_clone_ns_flags;
3153
3154 if (arg_private_network)
3155 flags |= CLONE_NEWNET;
3156 if (arg_use_cgns)
3157 flags |= CLONE_NEWCGROUP;
3158 if (arg_userns_mode != USER_NAMESPACE_NO)
3159 flags |= CLONE_NEWUSER;
3160
3161 return flags;
3162 }
3163
3164 static int patch_sysctl(void) {
3165
3166 /* This table is inspired by runc's sysctl() function */
3167 static const struct {
3168 const char *key;
3169 bool prefix;
3170 unsigned long clone_flags;
3171 } safe_sysctl[] = {
3172 { "kernel.hostname", false, CLONE_NEWUTS },
3173 { "kernel.domainname", false, CLONE_NEWUTS },
3174 { "kernel.msgmax", false, CLONE_NEWIPC },
3175 { "kernel.msgmnb", false, CLONE_NEWIPC },
3176 { "kernel.msgmni", false, CLONE_NEWIPC },
3177 { "kernel.sem", false, CLONE_NEWIPC },
3178 { "kernel.shmall", false, CLONE_NEWIPC },
3179 { "kernel.shmmax", false, CLONE_NEWIPC },
3180 { "kernel.shmmni", false, CLONE_NEWIPC },
3181 { "fs.mqueue.", true, CLONE_NEWIPC },
3182 { "net.", true, CLONE_NEWNET },
3183 };
3184
3185 unsigned long flags;
3186 int r;
3187
3188 flags = effective_clone_ns_flags();
3189
3190 STRV_FOREACH_PAIR(k, v, arg_sysctl) {
3191 bool good = false;
3192 size_t i;
3193
3194 for (i = 0; i < ELEMENTSOF(safe_sysctl); i++) {
3195
3196 if (!FLAGS_SET(flags, safe_sysctl[i].clone_flags))
3197 continue;
3198
3199 if (safe_sysctl[i].prefix)
3200 good = startswith(*k, safe_sysctl[i].key);
3201 else
3202 good = streq(*k, safe_sysctl[i].key);
3203
3204 if (good)
3205 break;
3206 }
3207
3208 if (!good)
3209 return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Refusing to write to sysctl '%s', as it is not safe in the selected namespaces.", *k);
3210
3211 r = sysctl_write(*k, *v);
3212 if (r < 0)
3213 return log_error_errno(r, "Failed to write sysctl '%s': %m", *k);
3214 }
3215
3216 return 0;
3217 }
3218
3219 static int inner_child(
3220 Barrier *barrier,
3221 int fd_inner_socket,
3222 FDSet *fds,
3223 char **os_release_pairs) {
3224
3225 _cleanup_free_ char *home = NULL;
3226 size_t n_env = 1;
3227 char *envp[] = {
3228 (char*) "PATH=" DEFAULT_PATH_COMPAT,
3229 NULL, /* container */
3230 NULL, /* TERM */
3231 NULL, /* HOME */
3232 NULL, /* USER */
3233 NULL, /* LOGNAME */
3234 NULL, /* container_uuid */
3235 NULL, /* LISTEN_FDS */
3236 NULL, /* LISTEN_PID */
3237 NULL, /* NOTIFY_SOCKET */
3238 NULL, /* CREDENTIALS_DIRECTORY */
3239 NULL, /* LANG */
3240 NULL
3241 };
3242 const char *exec_target;
3243 _cleanup_strv_free_ char **env_use = NULL;
3244 int r, which_failed;
3245
3246 /* This is the "inner" child process, i.e. the one forked off by the "outer" child process, which is the one
3247 * the container manager itself forked off. At the time of clone() it gained its own CLONE_NEWNS, CLONE_NEWPID,
3248 * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER namespaces. Note that it has its own CLONE_NEWNS namespace,
3249 * separate from the CLONE_NEWNS created for the "outer" child, and also separate from the host's CLONE_NEWNS
3250 * namespace. The reason for having two levels of CLONE_NEWNS namespaces is that the "inner" one is owned by
3251 * the CLONE_NEWUSER namespace of the container, while the "outer" one is owned by the host's CLONE_NEWUSER
3252 * namespace.
3253 *
3254 * Note at this point we have no CLONE_NEWNET namespace yet. We'll acquire that one later through
3255 * unshare(). See below. */
3256
3257 assert(barrier);
3258 assert(fd_inner_socket >= 0);
3259
3260 log_debug("Inner child is initializing.");
3261
3262 if (arg_userns_mode != USER_NAMESPACE_NO) {
3263 /* Tell the parent, that it now can write the UID map. */
3264 (void) barrier_place(barrier); /* #1 */
3265
3266 /* Wait until the parent wrote the UID map */
3267 if (!barrier_place_and_sync(barrier)) /* #2 */
3268 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Parent died too early");
3269
3270 /* Become the new root user inside our namespace */
3271 r = reset_uid_gid();
3272 if (r < 0)
3273 return log_error_errno(r, "Couldn't become new root: %m");
3274
3275 /* Creating a new user namespace means all MS_SHARED mounts become MS_SLAVE. Let's put them
3276 * back to MS_SHARED here, since that's what we want as defaults. (This will not reconnect
3277 * propagation, but simply create new peer groups for all our mounts). */
3278 r = mount_follow_verbose(LOG_ERR, NULL, "/", NULL, MS_SHARED|MS_REC, NULL);
3279 if (r < 0)
3280 return r;
3281 }
3282
3283 r = mount_all(NULL,
3284 arg_mount_settings | MOUNT_IN_USERNS,
3285 arg_uid_shift,
3286 arg_selinux_apifs_context);
3287 if (r < 0)
3288 return r;
3289
3290 if (!arg_network_namespace_path && arg_private_network) {
3291 r = unshare(CLONE_NEWNET);
3292 if (r < 0)
3293 return log_error_errno(errno, "Failed to unshare network namespace: %m");
3294
3295 /* Tell the parent that it can setup network interfaces. */
3296 (void) barrier_place(barrier); /* #3 */
3297 }
3298
3299 r = mount_sysfs(NULL, arg_mount_settings);
3300 if (r < 0)
3301 return r;
3302
3303 /* Wait until we are cgroup-ified, so that we
3304 * can mount the right cgroup path writable */
3305 if (!barrier_place_and_sync(barrier)) /* #4 */
3306 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
3307 "Parent died too early");
3308
3309 if (arg_use_cgns) {
3310 r = unshare(CLONE_NEWCGROUP);
3311 if (r < 0)
3312 return log_error_errno(errno, "Failed to unshare cgroup namespace: %m");
3313 r = mount_cgroups(
3314 "",
3315 arg_unified_cgroup_hierarchy,
3316 arg_userns_mode != USER_NAMESPACE_NO,
3317 arg_uid_shift,
3318 arg_uid_range,
3319 arg_selinux_apifs_context,
3320 true);
3321 } else
3322 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
3323 if (r < 0)
3324 return r;
3325
3326 r = setup_boot_id();
3327 if (r < 0)
3328 return r;
3329
3330 r = setup_kmsg(fd_inner_socket);
3331 if (r < 0)
3332 return r;
3333
3334 r = mount_custom(
3335 "/",
3336 arg_custom_mounts,
3337 arg_n_custom_mounts,
3338 0,
3339 0,
3340 arg_selinux_apifs_context,
3341 MOUNT_NON_ROOT_ONLY | MOUNT_IN_USERNS);
3342 if (r < 0)
3343 return r;
3344
3345 if (setsid() < 0)
3346 return log_error_errno(errno, "setsid() failed: %m");
3347
3348 if (arg_private_network)
3349 (void) loopback_setup();
3350
3351 if (arg_expose_ports) {
3352 r = expose_port_send_rtnl(fd_inner_socket);
3353 if (r < 0)
3354 return r;
3355 }
3356
3357 if (arg_console_mode != CONSOLE_PIPE) {
3358 _cleanup_close_ int master = -EBADF;
3359 _cleanup_free_ char *console = NULL;
3360
3361 /* Allocate a pty and make it available as /dev/console. */
3362 master = openpt_allocate(O_RDWR|O_NONBLOCK, &console);
3363 if (master < 0)
3364 return log_error_errno(master, "Failed to allocate a pty: %m");
3365
3366 r = setup_dev_console(console);
3367 if (r < 0)
3368 return log_error_errno(r, "Failed to set up /dev/console: %m");
3369
3370 r = send_one_fd(fd_inner_socket, master, 0);
3371 if (r < 0)
3372 return log_error_errno(r, "Failed to send master fd: %m");
3373
3374 r = setup_stdio_as_dev_console();
3375 if (r < 0)
3376 return r;
3377 }
3378
3379 r = patch_sysctl();
3380 if (r < 0)
3381 return r;
3382
3383 if (arg_oom_score_adjust_set) {
3384 r = set_oom_score_adjust(arg_oom_score_adjust);
3385 if (r < 0)
3386 return log_error_errno(r, "Failed to adjust OOM score: %m");
3387 }
3388
3389 if (arg_cpu_set.set)
3390 if (sched_setaffinity(0, arg_cpu_set.allocated, arg_cpu_set.set) < 0)
3391 return log_error_errno(errno, "Failed to set CPU affinity: %m");
3392
3393 (void) setup_hostname();
3394
3395 if (arg_personality != PERSONALITY_INVALID) {
3396 r = safe_personality(arg_personality);
3397 if (r < 0)
3398 return log_error_errno(r, "personality() failed: %m");
3399 #ifdef ARCHITECTURE_SECONDARY
3400 } else if (arg_architecture == ARCHITECTURE_SECONDARY) {
3401 r = safe_personality(PER_LINUX32);
3402 if (r < 0)
3403 return log_error_errno(r, "personality() failed: %m");
3404 #endif
3405 } else if (!arg_quiet && arg_architecture >= 0 && arg_architecture != native_architecture())
3406 log_notice("Selected architecture '%s' not supported natively on the local CPU, assuming "
3407 "invocation with qemu userspace emulator (or equivalent) in effect.",
3408 architecture_to_string(arg_architecture));
3409
3410 r = setrlimit_closest_all((const struct rlimit *const*) arg_rlimit, &which_failed);
3411 if (r < 0)
3412 return log_error_errno(r, "Failed to apply resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
3413
3414 #if HAVE_SECCOMP
3415 if (arg_seccomp) {
3416
3417 if (is_seccomp_available()) {
3418 r = seccomp_load(arg_seccomp);
3419 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
3420 return log_error_errno(r, "Failed to install seccomp filter: %m");
3421 if (r < 0)
3422 log_debug_errno(r, "Failed to install seccomp filter: %m");
3423 }
3424 } else
3425 #endif
3426 {
3427 r = setup_seccomp(arg_caps_retain, arg_syscall_allow_list, arg_syscall_deny_list);
3428 if (r < 0)
3429 return r;
3430 }
3431
3432 if (arg_suppress_sync) {
3433 #if HAVE_SECCOMP
3434 r = seccomp_suppress_sync();
3435 if (r < 0)
3436 log_debug_errno(r, "Failed to install sync() suppression seccomp filter, ignoring: %m");
3437 #else
3438 log_debug("systemd is built without SECCOMP support. Ignoring --suppress-sync= command line option and SuppressSync= setting.");
3439 #endif
3440 }
3441
3442 #if HAVE_SELINUX
3443 if (arg_selinux_context)
3444 if (setexeccon(arg_selinux_context) < 0)
3445 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
3446 #endif
3447
3448 /* Make sure we keep the caps across the uid/gid dropping, so that we can retain some selected caps
3449 * if we need to later on. */
3450 if (prctl(PR_SET_KEEPCAPS, 1) < 0)
3451 return log_error_errno(errno, "Failed to set PR_SET_KEEPCAPS: %m");
3452
3453 if (uid_is_valid(arg_uid) || gid_is_valid(arg_gid))
3454 r = change_uid_gid_raw(arg_uid, arg_gid, arg_supplementary_gids, arg_n_supplementary_gids, arg_console_mode != CONSOLE_PIPE);
3455 else
3456 r = change_uid_gid(arg_user, arg_console_mode != CONSOLE_PIPE, &home);
3457 if (r < 0)
3458 return r;
3459
3460 r = drop_capabilities(getuid());
3461 if (r < 0)
3462 return log_error_errno(r, "Dropping capabilities failed: %m");
3463
3464 if (arg_no_new_privileges)
3465 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
3466 return log_error_errno(errno, "Failed to disable new privileges: %m");
3467
3468 /* LXC sets container=lxc, so follow the scheme here */
3469 envp[n_env++] = strjoina("container=", arg_container_service_name);
3470
3471 envp[n_env] = strv_find_prefix(environ, "TERM=");
3472 if (envp[n_env])
3473 n_env++;
3474
3475 if (home || !uid_is_valid(arg_uid) || arg_uid == 0)
3476 if (asprintf(envp + n_env++, "HOME=%s", home ?: "/root") < 0)
3477 return log_oom();
3478
3479 if (arg_user || !uid_is_valid(arg_uid) || arg_uid == 0)
3480 if (asprintf(envp + n_env++, "USER=%s", arg_user ?: "root") < 0 ||
3481 asprintf(envp + n_env++, "LOGNAME=%s", arg_user ?: "root") < 0)
3482 return log_oom();
3483
3484 assert(!sd_id128_is_null(arg_uuid));
3485
3486 if (asprintf(envp + n_env++, "container_uuid=%s", SD_ID128_TO_UUID_STRING(arg_uuid)) < 0)
3487 return log_oom();
3488
3489 if (fdset_size(fds) > 0) {
3490 r = fdset_cloexec(fds, false);
3491 if (r < 0)
3492 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
3493
3494 if ((asprintf(envp + n_env++, "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
3495 (asprintf(envp + n_env++, "LISTEN_PID=1") < 0))
3496 return log_oom();
3497 }
3498 if (asprintf(envp + n_env++, "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
3499 return log_oom();
3500
3501 if (arg_n_credentials > 0) {
3502 envp[n_env] = strdup("CREDENTIALS_DIRECTORY=/run/host/credentials");
3503 if (!envp[n_env])
3504 return log_oom();
3505 n_env++;
3506 }
3507
3508 if (arg_start_mode != START_BOOT) {
3509 envp[n_env] = strdup("LANG=" SYSTEMD_NSPAWN_LOCALE);
3510 if (!envp[n_env])
3511 return log_oom();
3512 n_env++;
3513 }
3514
3515 env_use = strv_env_merge(envp, os_release_pairs, arg_setenv);
3516 if (!env_use)
3517 return log_oom();
3518
3519 /* Let the parent know that we are ready and
3520 * wait until the parent is ready with the
3521 * setup, too... */
3522 if (!barrier_place_and_sync(barrier)) /* #5 */
3523 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Parent died too early");
3524
3525 if (arg_chdir)
3526 if (chdir(arg_chdir) < 0)
3527 return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
3528
3529 if (arg_start_mode == START_PID2) {
3530 r = stub_pid1(arg_uuid);
3531 if (r < 0)
3532 return r;
3533 }
3534
3535 if (arg_console_mode != CONSOLE_PIPE) {
3536 /* So far our pty wasn't controlled by any process. Finally, it's time to change that, if we
3537 * are configured for that. Acquire it as controlling tty. */
3538 if (ioctl(STDIN_FILENO, TIOCSCTTY) < 0)
3539 return log_error_errno(errno, "Failed to acquire controlling TTY: %m");
3540 }
3541
3542 log_debug("Inner child completed, invoking payload.");
3543
3544 /* Now, explicitly close the log, so that we then can close all remaining fds. Closing the log explicitly first
3545 * has the benefit that the logging subsystem knows about it, and is thus ready to be reopened should we need
3546 * it again. Note that the other fds closed here are at least the locking and barrier fds. */
3547 log_close();
3548 log_set_open_when_needed(true);
3549 log_settle_target();
3550
3551 (void) fdset_close_others(fds);
3552
3553 if (arg_start_mode == START_BOOT) {
3554 char **a;
3555 size_t m;
3556
3557 /* Automatically search for the init system */
3558
3559 m = strv_length(arg_parameters);
3560 a = newa(char*, m + 2);
3561 memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
3562 a[1 + m] = NULL;
3563
3564 FOREACH_STRING(init,
3565 "/usr/lib/systemd/systemd",
3566 "/lib/systemd/systemd",
3567 "/sbin/init") {
3568 a[0] = (char*) init;
3569 execve(a[0], a, env_use);
3570 }
3571
3572 exec_target = "/usr/lib/systemd/systemd, /lib/systemd/systemd, /sbin/init";
3573 } else if (!strv_isempty(arg_parameters)) {
3574 const char *dollar_path;
3575
3576 exec_target = arg_parameters[0];
3577
3578 /* Use the user supplied search $PATH if there is one, or DEFAULT_PATH_COMPAT if not to search the
3579 * binary. */
3580 dollar_path = strv_env_get(env_use, "PATH");
3581 if (dollar_path) {
3582 if (setenv("PATH", dollar_path, 1) < 0)
3583 return log_error_errno(errno, "Failed to update $PATH: %m");
3584 }
3585
3586 execvpe(arg_parameters[0], arg_parameters, env_use);
3587 } else {
3588 if (!arg_chdir)
3589 /* If we cannot change the directory, we'll end up in /, that is expected. */
3590 (void) chdir(home ?: "/root");
3591
3592 execle(DEFAULT_USER_SHELL, "-" DEFAULT_USER_SHELL_NAME, NULL, env_use);
3593 if (!streq(DEFAULT_USER_SHELL, "/bin/bash"))
3594 execle("/bin/bash", "-bash", NULL, env_use);
3595 if (!streq(DEFAULT_USER_SHELL, "/bin/sh"))
3596 execle("/bin/sh", "-sh", NULL, env_use);
3597
3598 exec_target = DEFAULT_USER_SHELL ", /bin/bash, /bin/sh";
3599 }
3600
3601 return log_error_errno(errno, "execv(%s) failed: %m", exec_target);
3602 }
3603
3604 static int setup_notify_child(void) {
3605 _cleanup_close_ int fd = -EBADF;
3606 static const union sockaddr_union sa = {
3607 .un.sun_family = AF_UNIX,
3608 .un.sun_path = NSPAWN_NOTIFY_SOCKET_PATH,
3609 };
3610 int r;
3611
3612 fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
3613 if (fd < 0)
3614 return log_error_errno(errno, "Failed to allocate notification socket: %m");
3615
3616 (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
3617 (void) sockaddr_un_unlink(&sa.un);
3618
3619 r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
3620 if (r < 0)
3621 return log_error_errno(errno, "bind(" NSPAWN_NOTIFY_SOCKET_PATH ") failed: %m");
3622
3623 r = userns_lchown(NSPAWN_NOTIFY_SOCKET_PATH, 0, 0);
3624 if (r < 0)
3625 return log_error_errno(r, "Failed to chown " NSPAWN_NOTIFY_SOCKET_PATH ": %m");
3626
3627 r = setsockopt_int(fd, SOL_SOCKET, SO_PASSCRED, true);
3628 if (r < 0)
3629 return log_error_errno(r, "SO_PASSCRED failed: %m");
3630
3631 return TAKE_FD(fd);
3632 }
3633
3634 static int outer_child(
3635 Barrier *barrier,
3636 const char *directory,
3637 DissectedImage *dissected_image,
3638 int fd_outer_socket,
3639 int fd_inner_socket,
3640 FDSet *fds,
3641 int netns_fd) {
3642
3643 _cleanup_(bind_user_context_freep) BindUserContext *bind_user_context = NULL;
3644 _cleanup_strv_free_ char **os_release_pairs = NULL;
3645 _cleanup_close_ int fd = -EBADF, mntns_fd = -EBADF;
3646 bool idmap = false;
3647 const char *p;
3648 pid_t pid;
3649 ssize_t l;
3650 int r;
3651
3652 /* This is the "outer" child process, i.e the one forked off by the container manager itself. It
3653 * already has its own CLONE_NEWNS namespace (which was created by the clone()). It still lives in
3654 * the host's CLONE_NEWPID, CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER and CLONE_NEWNET
3655 * namespaces. After it completed a number of initializations a second child (the "inner" one) is
3656 * forked off it, and it exits. */
3657
3658 assert(barrier);
3659 assert(directory);
3660 assert(fd_outer_socket >= 0);
3661 assert(fd_inner_socket >= 0);
3662
3663 log_debug("Outer child is initializing.");
3664
3665 r = load_os_release_pairs_with_prefix("/", "container_host_", &os_release_pairs);
3666 if (r < 0)
3667 log_debug_errno(r, "Failed to read os-release from host for container, ignoring: %m");
3668
3669 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
3670 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3671
3672 r = reset_audit_loginuid();
3673 if (r < 0)
3674 return r;
3675
3676 /* Mark everything as slave, so that we still receive mounts from the real root, but don't propagate
3677 * mounts to the real root. */
3678 r = mount_follow_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
3679 if (r < 0)
3680 return r;
3681
3682 if (dissected_image) {
3683 /* If we are operating on a disk image, then mount its root directory now, but leave out the
3684 * rest. We can read the UID shift from it if we need to. Further down we'll mount the rest,
3685 * but then with the uid shift known. That way we can mount VFAT file systems shifted to the
3686 * right place right away. This makes sure ESP partitions and userns are compatible. */
3687
3688 r = dissected_image_mount_and_warn(
3689 dissected_image,
3690 directory,
3691 arg_uid_shift,
3692 arg_uid_range,
3693 /* userns_fd= */ -EBADF,
3694 DISSECT_IMAGE_MOUNT_ROOT_ONLY|
3695 DISSECT_IMAGE_DISCARD_ON_LOOP|
3696 DISSECT_IMAGE_USR_NO_ROOT|
3697 (arg_read_only ? DISSECT_IMAGE_READ_ONLY : DISSECT_IMAGE_FSCK|DISSECT_IMAGE_GROWFS)|
3698 (arg_start_mode == START_BOOT ? DISSECT_IMAGE_VALIDATE_OS : 0));
3699 if (r < 0)
3700 return r;
3701 }
3702
3703 r = determine_uid_shift(directory);
3704 if (r < 0)
3705 return r;
3706
3707 if (arg_userns_mode != USER_NAMESPACE_NO) {
3708 r = namespace_open(0, NULL, &mntns_fd, NULL, NULL, NULL);
3709 if (r < 0)
3710 return log_error_errno(r, "Failed to pin outer mount namespace: %m");
3711
3712 l = send_one_fd(fd_outer_socket, mntns_fd, 0);
3713 if (l < 0)
3714 return log_error_errno(l, "Failed to send outer mount namespace fd: %m");
3715 mntns_fd = safe_close(mntns_fd);
3716
3717 /* Let the parent know which UID shift we read from the image */
3718 l = send(fd_outer_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
3719 if (l < 0)
3720 return log_error_errno(errno, "Failed to send UID shift: %m");
3721 if (l != sizeof(arg_uid_shift))
3722 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3723 "Short write while sending UID shift.");
3724
3725 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3726 /* When we are supposed to pick the UID shift, the parent will check now whether the
3727 * UID shift we just read from the image is available. If yes, it will send the UID
3728 * shift back to us, if not it will pick a different one, and send it back to us. */
3729
3730 l = recv(fd_outer_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
3731 if (l < 0)
3732 return log_error_errno(errno, "Failed to recv UID shift: %m");
3733 if (l != sizeof(arg_uid_shift))
3734 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3735 "Short read while receiving UID shift.");
3736 }
3737
3738 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
3739 "Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
3740 }
3741
3742 if (path_equal(directory, "/")) {
3743 /* If the directory we shall boot is the host, let's operate on a bind mount at a different
3744 * place, so that we can make changes to its mount structure (for example, to implement
3745 * --volatile=) without this interfering with our ability to access files such as
3746 * /etc/localtime to copy into the container. Note that we use a fixed place for this
3747 * (instead of a temporary directory, since we are living in our own mount namespace here
3748 * already, and thus don't need to be afraid of colliding with anyone else's mounts). */
3749 (void) mkdir_p("/run/systemd/nspawn-root", 0755);
3750
3751 r = mount_nofollow_verbose(LOG_ERR, "/", "/run/systemd/nspawn-root", NULL, MS_BIND|MS_REC, NULL);
3752 if (r < 0)
3753 return r;
3754
3755 directory = "/run/systemd/nspawn-root";
3756 }
3757
3758 /* Make sure we always have a mount that we can move to root later on. */
3759 r = make_mount_point(directory);
3760 if (r < 0)
3761 return r;
3762
3763 /* So the whole tree is now MS_SLAVE, i.e. we'll still receive mount/umount events from the host
3764 * mount namespace. For the directory we are going to run our container let's turn this off, so that
3765 * we'll live in our own little world from now on, and propagation from the host may only happen via
3766 * the mount tunnel dir, or not at all. */
3767 r = mount_follow_verbose(LOG_ERR, NULL, directory, NULL, MS_PRIVATE|MS_REC, NULL);
3768 if (r < 0)
3769 return r;
3770
3771 r = setup_pivot_root(
3772 directory,
3773 arg_pivot_root_new,
3774 arg_pivot_root_old);
3775 if (r < 0)
3776 return r;
3777
3778 r = setup_volatile_mode(
3779 directory,
3780 arg_volatile_mode,
3781 arg_uid_shift,
3782 arg_selinux_apifs_context);
3783 if (r < 0)
3784 return r;
3785
3786 r = bind_user_prepare(
3787 directory,
3788 arg_bind_user,
3789 arg_uid_shift,
3790 arg_uid_range,
3791 &arg_custom_mounts, &arg_n_custom_mounts,
3792 &bind_user_context);
3793 if (r < 0)
3794 return r;
3795
3796 if (arg_userns_mode != USER_NAMESPACE_NO && bind_user_context) {
3797 /* Send the user maps we determined to the parent, so that it installs it in our user
3798 * namespace UID map table */
3799
3800 for (size_t i = 0; i < bind_user_context->n_data; i++) {
3801 uid_t map[] = {
3802 bind_user_context->data[i].payload_user->uid,
3803 bind_user_context->data[i].host_user->uid,
3804 (uid_t) bind_user_context->data[i].payload_group->gid,
3805 (uid_t) bind_user_context->data[i].host_group->gid,
3806 };
3807
3808 l = send(fd_outer_socket, map, sizeof(map), MSG_NOSIGNAL);
3809 if (l < 0)
3810 return log_error_errno(errno, "Failed to send user UID map: %m");
3811 if (l != sizeof(map))
3812 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3813 "Short write while sending user UID map.");
3814 }
3815 }
3816
3817 r = mount_custom(
3818 directory,
3819 arg_custom_mounts,
3820 arg_n_custom_mounts,
3821 arg_uid_shift,
3822 arg_uid_range,
3823 arg_selinux_apifs_context,
3824 MOUNT_ROOT_ONLY);
3825 if (r < 0)
3826 return r;
3827
3828 if (arg_userns_mode != USER_NAMESPACE_NO &&
3829 IN_SET(arg_userns_ownership, USER_NAMESPACE_OWNERSHIP_MAP, USER_NAMESPACE_OWNERSHIP_AUTO) &&
3830 arg_uid_shift != 0) {
3831
3832 r = remount_idmap(directory, arg_uid_shift, arg_uid_range, UID_INVALID, REMOUNT_IDMAPPING_HOST_ROOT);
3833 if (r == -EINVAL || ERRNO_IS_NEG_NOT_SUPPORTED(r)) {
3834 /* This might fail because the kernel or file system doesn't support idmapping. We
3835 * can't really distinguish this nicely, nor do we have any guarantees about the
3836 * error codes we see, could be EOPNOTSUPP or EINVAL. */
3837 if (arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_AUTO)
3838 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
3839 "ID mapped mounts are apparently not available, sorry.");
3840
3841 log_debug("ID mapped mounts are apparently not available on this kernel or for the selected file system, reverting to recursive chown()ing.");
3842 arg_userns_ownership = USER_NAMESPACE_OWNERSHIP_CHOWN;
3843 } else if (r < 0)
3844 return log_error_errno(r, "Failed to set up ID mapped mounts: %m");
3845 else {
3846 log_debug("ID mapped mounts available, making use of them.");
3847 idmap = true;
3848 }
3849 }
3850
3851 if (dissected_image) {
3852 /* Now we know the uid shift, let's now mount everything else that might be in the image. */
3853 r = dissected_image_mount(
3854 dissected_image,
3855 directory,
3856 arg_uid_shift,
3857 arg_uid_range,
3858 /* userns_fd= */ -EBADF,
3859 DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY|
3860 DISSECT_IMAGE_DISCARD_ON_LOOP|
3861 DISSECT_IMAGE_USR_NO_ROOT|
3862 (arg_read_only ? DISSECT_IMAGE_READ_ONLY : DISSECT_IMAGE_FSCK|DISSECT_IMAGE_GROWFS)|
3863 (idmap ? DISSECT_IMAGE_MOUNT_IDMAPPED : 0));
3864 if (r == -EUCLEAN)
3865 return log_error_errno(r, "File system check for image failed: %m");
3866 if (r < 0)
3867 return log_error_errno(r, "Failed to mount image file system: %m");
3868 }
3869
3870 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
3871 /* OK, we don't know which cgroup mode to use yet. Let's figure it out, and tell the parent. */
3872
3873 r = detect_unified_cgroup_hierarchy_from_image(directory);
3874 if (r < 0)
3875 return r;
3876
3877 l = send(fd_outer_socket, &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), MSG_NOSIGNAL);
3878 if (l < 0)
3879 return log_error_errno(errno, "Failed to send cgroup mode: %m");
3880 if (l != sizeof(arg_unified_cgroup_hierarchy))
3881 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3882 "Short write while sending cgroup mode.");
3883 }
3884
3885 r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
3886 if (r < 0)
3887 return r;
3888
3889 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
3890 if (r < 0)
3891 return r;
3892
3893 if (arg_read_only && arg_volatile_mode == VOLATILE_NO &&
3894 !has_custom_root_mount(arg_custom_mounts, arg_n_custom_mounts)) {
3895 r = bind_remount_recursive(directory, MS_RDONLY, MS_RDONLY, NULL);
3896 if (r < 0)
3897 return log_error_errno(r, "Failed to make tree read-only: %m");
3898 }
3899
3900 r = mount_all(directory,
3901 arg_mount_settings,
3902 arg_uid_shift,
3903 arg_selinux_apifs_context);
3904 if (r < 0)
3905 return r;
3906
3907 r = copy_devnodes(directory);
3908 if (r < 0)
3909 return r;
3910
3911 r = make_extra_nodes(directory);
3912 if (r < 0)
3913 return r;
3914
3915 (void) dev_setup(directory, arg_uid_shift, arg_uid_shift);
3916
3917 p = prefix_roota(directory, "/run/host");
3918 (void) make_inaccessible_nodes(p, arg_uid_shift, arg_uid_shift);
3919
3920 r = setup_pts(directory);
3921 if (r < 0)
3922 return r;
3923
3924 r = mount_tunnel_dig(directory);
3925 if (r < 0)
3926 return r;
3927
3928 r = setup_keyring();
3929 if (r < 0)
3930 return r;
3931
3932 r = setup_credentials(directory);
3933 if (r < 0)
3934 return r;
3935
3936 r = bind_user_setup(bind_user_context, directory);
3937 if (r < 0)
3938 return r;
3939
3940 r = mount_custom(
3941 directory,
3942 arg_custom_mounts,
3943 arg_n_custom_mounts,
3944 arg_uid_shift,
3945 arg_uid_range,
3946 arg_selinux_apifs_context,
3947 MOUNT_NON_ROOT_ONLY);
3948 if (r < 0)
3949 return r;
3950
3951 r = setup_timezone(directory);
3952 if (r < 0)
3953 return r;
3954
3955 r = setup_resolv_conf(directory);
3956 if (r < 0)
3957 return r;
3958
3959 r = setup_machine_id(directory);
3960 if (r < 0)
3961 return r;
3962
3963 r = setup_journal(directory);
3964 if (r < 0)
3965 return r;
3966
3967 /* The same stuff as the $container env var, but nicely readable for the entire payload */
3968 p = prefix_roota(directory, "/run/host/container-manager");
3969 (void) write_string_file(p, arg_container_service_name, WRITE_STRING_FILE_CREATE);
3970
3971 /* The same stuff as the $container_uuid env var */
3972 p = prefix_roota(directory, "/run/host/container-uuid");
3973 (void) write_string_filef(p, WRITE_STRING_FILE_CREATE, SD_ID128_UUID_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid));
3974
3975 if (!arg_use_cgns) {
3976 r = mount_cgroups(
3977 directory,
3978 arg_unified_cgroup_hierarchy,
3979 arg_userns_mode != USER_NAMESPACE_NO,
3980 arg_uid_shift,
3981 arg_uid_range,
3982 arg_selinux_apifs_context,
3983 false);
3984 if (r < 0)
3985 return r;
3986 }
3987
3988 /* Mark everything as shared so our mounts get propagated down. This is required to make new bind
3989 * mounts available in systemd services inside the container that create a new mount namespace. See
3990 * https://github.com/systemd/systemd/issues/3860 Further submounts (such as /dev) done after this
3991 * will inherit the shared propagation mode.
3992 *
3993 * IMPORTANT: Do not overmount the root directory anymore from now on to enable moving the root
3994 * directory mount to root later on.
3995 * https://github.com/systemd/systemd/issues/3847#issuecomment-562735251
3996 */
3997 r = mount_switch_root(directory, MS_SHARED);
3998 if (r < 0)
3999 return log_error_errno(r, "Failed to move root directory: %m");
4000
4001 /* We finished setting up the rootfs which is a shared mount. The mount tunnel needs to be a
4002 * dependent mount otherwise we can't MS_MOVE mounts that were propagated from the host into
4003 * the container. */
4004 r = mount_tunnel_open();
4005 if (r < 0)
4006 return r;
4007
4008 if (arg_userns_mode != USER_NAMESPACE_NO) {
4009 /* In order to mount procfs and sysfs in an unprivileged container the kernel
4010 * requires that a fully visible instance is already present in the target mount
4011 * namespace. Mount one here so the inner child can mount its own instances. Later
4012 * we umount the temporary instances created here before we actually exec the
4013 * payload. Since the rootfs is shared the umount will propagate into the container.
4014 * Note, the inner child wouldn't be able to unmount the instances on its own since
4015 * it doesn't own the originating mount namespace. IOW, the outer child needs to do
4016 * this. */
4017 r = pin_fully_visible_fs();
4018 if (r < 0)
4019 return r;
4020 }
4021
4022 fd = setup_notify_child();
4023 if (fd < 0)
4024 return fd;
4025
4026 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
4027 arg_clone_ns_flags |
4028 (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
4029 if (pid < 0)
4030 return log_error_errno(errno, "Failed to fork inner child: %m");
4031 if (pid == 0) {
4032 fd_outer_socket = safe_close(fd_outer_socket);
4033
4034 /* The inner child has all namespaces that are requested, so that we all are owned by the
4035 * user if user namespaces are turned on. */
4036
4037 if (arg_network_namespace_path) {
4038 r = namespace_enter(-1, -1, netns_fd, -1, -1);
4039 if (r < 0)
4040 return log_error_errno(r, "Failed to join network namespace: %m");
4041 }
4042
4043 r = inner_child(barrier, fd_inner_socket, fds, os_release_pairs);
4044 if (r < 0)
4045 _exit(EXIT_FAILURE);
4046
4047 _exit(EXIT_SUCCESS);
4048 }
4049
4050 l = send(fd_outer_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
4051 if (l < 0)
4052 return log_error_errno(errno, "Failed to send PID: %m");
4053 if (l != sizeof(pid))
4054 return log_error_errno(SYNTHETIC_ERRNO(EIO),
4055 "Short write while sending PID.");
4056
4057 l = send(fd_outer_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
4058 if (l < 0)
4059 return log_error_errno(errno, "Failed to send machine ID: %m");
4060 if (l != sizeof(arg_uuid))
4061 return log_error_errno(SYNTHETIC_ERRNO(EIO),
4062 "Short write while sending machine ID.");
4063
4064 l = send_one_fd(fd_outer_socket, fd, 0);
4065 if (l < 0)
4066 return log_error_errno(l, "Failed to send notify fd: %m");
4067
4068 fd_outer_socket = safe_close(fd_outer_socket);
4069 fd_inner_socket = safe_close(fd_inner_socket);
4070 netns_fd = safe_close(netns_fd);
4071
4072 return 0;
4073 }
4074
4075 static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
4076 bool tried_hashed = false;
4077 unsigned n_tries = 100;
4078 uid_t candidate;
4079 int r;
4080
4081 assert(shift);
4082 assert(ret_lock_file);
4083 assert(arg_userns_mode == USER_NAMESPACE_PICK);
4084 assert(arg_uid_range == 0x10000U);
4085
4086 candidate = *shift;
4087
4088 (void) mkdir("/run/systemd/nspawn-uid", 0755);
4089
4090 for (;;) {
4091 char lock_path[STRLEN("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
4092 _cleanup_(release_lock_file) LockFile lf = LOCK_FILE_INIT;
4093
4094 if (--n_tries <= 0)
4095 return -EBUSY;
4096
4097 if (candidate < CONTAINER_UID_BASE_MIN || candidate > CONTAINER_UID_BASE_MAX)
4098 goto next;
4099 if ((candidate & UINT32_C(0xFFFF)) != 0)
4100 goto next;
4101
4102 xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
4103 r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
4104 if (r == -EBUSY) /* Range already taken by another nspawn instance */
4105 goto next;
4106 if (r < 0)
4107 return r;
4108
4109 /* Make some superficial checks whether the range is currently known in the user database */
4110 if (getpwuid(candidate))
4111 goto next;
4112 if (getpwuid(candidate + UINT32_C(0xFFFE)))
4113 goto next;
4114 if (getgrgid(candidate))
4115 goto next;
4116 if (getgrgid(candidate + UINT32_C(0xFFFE)))
4117 goto next;
4118
4119 *ret_lock_file = lf;
4120 lf = (struct LockFile) LOCK_FILE_INIT;
4121 *shift = candidate;
4122 return 0;
4123
4124 next:
4125 if (arg_machine && !tried_hashed) {
4126 /* Try to hash the base from the container name */
4127
4128 static const uint8_t hash_key[] = {
4129 0xe1, 0x56, 0xe0, 0xf0, 0x4a, 0xf0, 0x41, 0xaf,
4130 0x96, 0x41, 0xcf, 0x41, 0x33, 0x94, 0xff, 0x72
4131 };
4132
4133 candidate = (uid_t) siphash24(arg_machine, strlen(arg_machine), hash_key);
4134
4135 tried_hashed = true;
4136 } else
4137 random_bytes(&candidate, sizeof(candidate));
4138
4139 candidate = (candidate % (CONTAINER_UID_BASE_MAX - CONTAINER_UID_BASE_MIN)) + CONTAINER_UID_BASE_MIN;
4140 candidate &= (uid_t) UINT32_C(0xFFFF0000);
4141 }
4142 }
4143
4144 static int add_one_uid_map(
4145 char **p,
4146 uid_t container_uid,
4147 uid_t host_uid,
4148 uid_t range) {
4149
4150 return strextendf(p,
4151 UID_FMT " " UID_FMT " " UID_FMT "\n",
4152 container_uid, host_uid, range);
4153 }
4154
4155 static int make_uid_map_string(
4156 const uid_t bind_user_uid[],
4157 size_t n_bind_user_uid,
4158 size_t offset,
4159 char **ret) {
4160
4161 _cleanup_free_ char *s = NULL;
4162 uid_t previous_uid = 0;
4163 int r;
4164
4165 assert(n_bind_user_uid == 0 || bind_user_uid);
4166 assert(IN_SET(offset, 0, 2)); /* used to switch between UID and GID map */
4167 assert(ret);
4168
4169 /* The bind_user_uid[] array is a series of 4 uid_t values, for each --bind-user= entry one
4170 * quadruplet, consisting of host and container UID + GID. */
4171
4172 for (size_t i = 0; i < n_bind_user_uid; i++) {
4173 uid_t payload_uid = bind_user_uid[i*4+offset],
4174 host_uid = bind_user_uid[i*4+offset+1];
4175
4176 assert(previous_uid <= payload_uid);
4177 assert(payload_uid < arg_uid_range);
4178
4179 /* Add a range to close the gap to previous entry */
4180 if (payload_uid > previous_uid) {
4181 r = add_one_uid_map(&s, previous_uid, arg_uid_shift + previous_uid, payload_uid - previous_uid);
4182 if (r < 0)
4183 return r;
4184 }
4185
4186 /* Map this specific user */
4187 r = add_one_uid_map(&s, payload_uid, host_uid, 1);
4188 if (r < 0)
4189 return r;
4190
4191 previous_uid = payload_uid + 1;
4192 }
4193
4194 /* And add a range to close the gap to finish the range */
4195 if (arg_uid_range > previous_uid) {
4196 r = add_one_uid_map(&s, previous_uid, arg_uid_shift + previous_uid, arg_uid_range - previous_uid);
4197 if (r < 0)
4198 return r;
4199 }
4200
4201 assert(s);
4202
4203 *ret = TAKE_PTR(s);
4204 return 0;
4205 }
4206
4207 static int setup_uid_map(
4208 pid_t pid,
4209 const uid_t bind_user_uid[],
4210 size_t n_bind_user_uid) {
4211
4212 char uid_map[STRLEN("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1];
4213 _cleanup_free_ char *s = NULL;
4214 int r;
4215
4216 assert(pid > 1);
4217
4218 /* Build the UID map string */
4219 if (make_uid_map_string(bind_user_uid, n_bind_user_uid, 0, &s) < 0) /* offset=0 contains the UID pair */
4220 return log_oom();
4221
4222 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
4223 r = write_string_file(uid_map, s, WRITE_STRING_FILE_DISABLE_BUFFER);
4224 if (r < 0)
4225 return log_error_errno(r, "Failed to write UID map: %m");
4226
4227 /* And now build the GID map string */
4228 s = mfree(s);
4229 if (make_uid_map_string(bind_user_uid, n_bind_user_uid, 2, &s) < 0) /* offset=2 contains the GID pair */
4230 return log_oom();
4231
4232 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
4233 r = write_string_file(uid_map, s, WRITE_STRING_FILE_DISABLE_BUFFER);
4234 if (r < 0)
4235 return log_error_errno(r, "Failed to write GID map: %m");
4236
4237 return 0;
4238 }
4239
4240 static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
4241 char buf[NOTIFY_BUFFER_MAX+1];
4242 char *p = NULL;
4243 struct iovec iovec = {
4244 .iov_base = buf,
4245 .iov_len = sizeof(buf)-1,
4246 };
4247 CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred)) +
4248 CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)) control;
4249 struct msghdr msghdr = {
4250 .msg_iov = &iovec,
4251 .msg_iovlen = 1,
4252 .msg_control = &control,
4253 .msg_controllen = sizeof(control),
4254 };
4255 struct ucred *ucred;
4256 ssize_t n;
4257 pid_t inner_child_pid;
4258 _cleanup_strv_free_ char **tags = NULL;
4259 int r;
4260
4261 assert(userdata);
4262
4263 inner_child_pid = PTR_TO_PID(userdata);
4264
4265 if (revents != EPOLLIN) {
4266 log_warning("Got unexpected poll event for notify fd.");
4267 return 0;
4268 }
4269
4270 n = recvmsg_safe(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
4271 if (ERRNO_IS_NEG_TRANSIENT(n))
4272 return 0;
4273 else if (n == -EXFULL) {
4274 log_warning("Got message with truncated control data (too many fds sent?), ignoring.");
4275 return 0;
4276 } else if (n < 0)
4277 return log_warning_errno(n, "Couldn't read notification socket: %m");
4278
4279 cmsg_close_all(&msghdr);
4280
4281 ucred = CMSG_FIND_DATA(&msghdr, SOL_SOCKET, SCM_CREDENTIALS, struct ucred);
4282 if (!ucred || ucred->pid != inner_child_pid) {
4283 log_debug("Received notify message without valid credentials. Ignoring.");
4284 return 0;
4285 }
4286
4287 if ((size_t) n >= sizeof(buf)) {
4288 log_warning("Received notify message exceeded maximum size. Ignoring.");
4289 return 0;
4290 }
4291
4292 buf[n] = 0;
4293 tags = strv_split(buf, "\n\r");
4294 if (!tags)
4295 return log_oom();
4296
4297 if (strv_contains(tags, "READY=1")) {
4298 r = sd_notify(false, "READY=1\n");
4299 if (r < 0)
4300 log_warning_errno(r, "Failed to send readiness notification, ignoring: %m");
4301 }
4302
4303 p = strv_find_startswith(tags, "STATUS=");
4304 if (p)
4305 (void) sd_notifyf(false, "STATUS=Container running: %s", p);
4306
4307 return 0;
4308 }
4309
4310 static int setup_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid, sd_event_source **notify_event_source) {
4311 int r;
4312
4313 r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
4314 if (r < 0)
4315 return log_error_errno(r, "Failed to allocate notify event source: %m");
4316
4317 (void) sd_event_source_set_description(*notify_event_source, "nspawn-notify");
4318
4319 return 0;
4320 }
4321
/* Merges the values found in 'settings' into the global arg_* configuration variables.
 *
 * For each group of settings covered by arg_settings_mask, the value from 'settings' is applied only if
 * the matching SETTING_* bit is not set in the mask and the field in 'settings' actually carries a value
 * (i.e. is non-NULL, non-empty, >= 0 or different from its *_INVALID marker, depending on the type).
 * Several groups are furthermore skipped with a warning if arg_settings_trusted is false; 'path' is used
 * in these warnings only, to name the file the settings originate from.
 *
 * Heap-allocated fields are taken over from 'settings' rather than copied, hence 'settings' is modified.
 * Always returns 0. */
4322 static int merge_settings(Settings *settings, const char *path) {
4323 int rl;
4324
4325 assert(settings);
4326 assert(path);
4327
4328 /* Copy over bits from the settings, unless they have been explicitly masked by command line switches. Note
4329 * that this steals the fields of the Settings* structure, and hence modifies it. */
4330
4331 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
4332 settings->start_mode >= 0) {
4333 arg_start_mode = settings->start_mode;
4334 strv_free_and_replace(arg_parameters, settings->parameters);
4335 }
4336
4337 if ((arg_settings_mask & SETTING_EPHEMERAL) == 0 &&
4338 settings->ephemeral >= 0)
4339 arg_ephemeral = settings->ephemeral;
4340
4341 if ((arg_settings_mask & SETTING_DIRECTORY) == 0 &&
4342 settings->root) {
4343
4344 if (!arg_settings_trusted)
4345 log_warning("Ignoring root directory setting, file %s is not trusted.", path);
4346 else
4347 free_and_replace(arg_directory, settings->root);
4348 }
4349
4350 if ((arg_settings_mask & SETTING_PIVOT_ROOT) == 0 &&
4351 settings->pivot_root_new) {
4352 free_and_replace(arg_pivot_root_new, settings->pivot_root_new);
4353 free_and_replace(arg_pivot_root_old, settings->pivot_root_old);
4354 }
4355
4356 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
4357 settings->working_directory)
4358 free_and_replace(arg_chdir, settings->working_directory);
4359
4360 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
4361 settings->environment)
4362 strv_free_and_replace(arg_setenv, settings->environment);
4363
4364 if ((arg_settings_mask & SETTING_USER) == 0) {
4365
4366 if (settings->user)
4367 free_and_replace(arg_user, settings->user);
4368
4369 if (uid_is_valid(settings->uid))
4370 arg_uid = settings->uid;
4371 if (gid_is_valid(settings->gid))
4372 arg_gid = settings->gid;
4373 if (settings->n_supplementary_gids > 0) {
4374 free_and_replace(arg_supplementary_gids, settings->supplementary_gids);
4375 arg_n_supplementary_gids = settings->n_supplementary_gids;
4376 }
4377 }
4378
4379 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
4380 uint64_t plus, minus;
4381 uint64_t network_minus = 0;
4382 uint64_t ambient;
4383
4384 /* Note that we copy both the simple plus/minus caps here, and the full quintet from the
4385 * Settings structure */
4386
4387 plus = settings->capability;
4388 minus = settings->drop_capability;
4389
/* If the settings configure networking (and networking isn't masked), CAP_NET_ADMIN follows the
 * private network choice: it is added for a private network and removed otherwise. */
4390 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
4391 settings_network_configured(settings)) {
4392 if (settings_private_network(settings))
4393 plus |= UINT64_C(1) << CAP_NET_ADMIN;
4394 else
4395 network_minus |= UINT64_C(1) << CAP_NET_ADMIN;
4396 }
4397
/* Additions (including an implied CAP_NET_ADMIN) are refused as a whole for untrusted files, but the
 * warning is only shown if Capability= itself was set. */
4398 if (!arg_settings_trusted && plus != 0) {
4399 if (settings->capability != 0)
4400 log_warning("Ignoring Capability= setting, file %s is not trusted.", path);
4401 } else {
4402 arg_caps_retain &= ~network_minus;
4403 arg_caps_retain |= plus;
4404 }
4405
/* Dropping capabilities is applied regardless of trust. */
4406 arg_caps_retain &= ~minus;
4407
4408 /* Copy the full capabilities over too */
4409 if (capability_quintet_is_set(&settings->full_capabilities)) {
4410 if (!arg_settings_trusted)
4411 log_warning("Ignoring capability settings, file %s is not trusted.", path);
4412 else
4413 arg_full_capabilities = settings->full_capabilities;
4414 }
4415
4416 ambient = settings->ambient_capability;
4417 if (!arg_settings_trusted && ambient != 0)
4418 log_warning("Ignoring AmbientCapability= setting, file %s is not trusted.", path);
4419 else
4420 arg_caps_ambient |= ambient;
4421 }
4422
4423 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
4424 settings->kill_signal > 0)
4425 arg_kill_signal = settings->kill_signal;
4426
4427 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
4428 settings->personality != PERSONALITY_INVALID)
4429 arg_personality = settings->personality;
4430
4431 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
4432 !sd_id128_is_null(settings->machine_id)) {
4433
4434 if (!arg_settings_trusted)
4435 log_warning("Ignoring MachineID= setting, file %s is not trusted.", path);
4436 else
4437 arg_uuid = settings->machine_id;
4438 }
4439
4440 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
4441 settings->read_only >= 0)
4442 arg_read_only = settings->read_only;
4443
4444 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
4445 settings->volatile_mode != _VOLATILE_MODE_INVALID)
4446 arg_volatile_mode = settings->volatile_mode;
4447
4448 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
4449 settings->n_custom_mounts > 0) {
4450
4451 if (!arg_settings_trusted)
4452 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", path);
4453 else {
4454 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
4455 arg_custom_mounts = TAKE_PTR(settings->custom_mounts);
4456 arg_n_custom_mounts = settings->n_custom_mounts;
/* The array was taken over above, hence reset the count in 'settings' to match. */
4457 settings->n_custom_mounts = 0;
4458 }
4459 }
4460
4461 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
4462 settings_network_configured(settings)) {
4463
4464 if (!arg_settings_trusted)
4465 log_warning("Ignoring network settings, file %s is not trusted.", path);
4466 else {
4467 arg_network_veth = settings_network_veth(settings);
4468 arg_private_network = settings_private_network(settings);
4469
4470 strv_free_and_replace(arg_network_interfaces, settings->network_interfaces);
4471 strv_free_and_replace(arg_network_macvlan, settings->network_macvlan);
4472 strv_free_and_replace(arg_network_ipvlan, settings->network_ipvlan);
4473 strv_free_and_replace(arg_network_veth_extra, settings->network_veth_extra);
4474
4475 free_and_replace(arg_network_bridge, settings->network_bridge);
4476 free_and_replace(arg_network_zone, settings->network_zone);
4477
4478 free_and_replace(arg_network_namespace_path, settings->network_namespace_path);
4479 }
4480 }
4481
4482 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
4483 settings->expose_ports) {
4484
4485 if (!arg_settings_trusted)
4486 log_warning("Ignoring Port= setting, file %s is not trusted.", path);
4487 else {
4488 expose_port_free_all(arg_expose_ports);
4489 arg_expose_ports = TAKE_PTR(settings->expose_ports);
4490 }
4491 }
4492
4493 if ((arg_settings_mask & SETTING_USERNS) == 0 &&
4494 settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
4495
4496 if (!arg_settings_trusted)
4497 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", path);
4498 else {
4499 arg_userns_mode = settings->userns_mode;
4500 arg_uid_shift = settings->uid_shift;
4501 arg_uid_range = settings->uid_range;
4502 arg_userns_ownership = settings->userns_ownership;
4503 }
4504 }
4505
4506 if ((arg_settings_mask & SETTING_BIND_USER) == 0 &&
4507 !strv_isempty(settings->bind_user))
4508 strv_free_and_replace(arg_bind_user, settings->bind_user);
4509
4510 if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0 &&
4511 settings->notify_ready >= 0)
4512 arg_notify_ready = settings->notify_ready;
4513
4514 if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) {
4515
/* Only a non-empty allow list requires trust. A deny list on its own is accepted from untrusted
 * files, too. */
4516 if (!strv_isempty(settings->syscall_allow_list) || !strv_isempty(settings->syscall_deny_list)) {
4517 if (!arg_settings_trusted && !strv_isempty(settings->syscall_allow_list))
4518 log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", path);
4519 else {
4520 strv_free_and_replace(arg_syscall_allow_list, settings->syscall_allow_list);
4521 strv_free_and_replace(arg_syscall_deny_list, settings->syscall_deny_list);
4522 }
4523 }
4524
4525 #if HAVE_SECCOMP
4526 if (settings->seccomp) {
4527 if (!arg_settings_trusted)
4528 log_warning("Ignoring SECCOMP filter, file %s is not trusted.", path);
4529 else {
4530 seccomp_release(arg_seccomp);
4531 arg_seccomp = TAKE_PTR(settings->seccomp);
4532 }
4533 }
4534 #endif
4535 }
4536
/* Each resource limit has a mask bit of its own, counted up from SETTING_RLIMIT_FIRST. */
4537 for (rl = 0; rl < _RLIMIT_MAX; rl ++) {
4538 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)))
4539 continue;
4540
4541 if (!settings->rlimit[rl])
4542 continue;
4543
4544 if (!arg_settings_trusted) {
4545 log_warning("Ignoring Limit%s= setting, file '%s' is not trusted.", rlimit_to_string(rl), path);
4546 continue;
4547 }
4548
4549 free_and_replace(arg_rlimit[rl], settings->rlimit[rl]);
4550 }
4551
4552 if ((arg_settings_mask & SETTING_HOSTNAME) == 0 &&
4553 settings->hostname)
4554 free_and_replace(arg_hostname, settings->hostname);
4555
4556 if ((arg_settings_mask & SETTING_NO_NEW_PRIVILEGES) == 0 &&
4557 settings->no_new_privileges >= 0)
4558 arg_no_new_privileges = settings->no_new_privileges;
4559
4560 if ((arg_settings_mask & SETTING_OOM_SCORE_ADJUST) == 0 &&
4561 settings->oom_score_adjust_set) {
4562
4563 if (!arg_settings_trusted)
4564 log_warning("Ignoring OOMScoreAdjust= setting, file '%s' is not trusted.", path);
4565 else {
4566 arg_oom_score_adjust = settings->oom_score_adjust;
4567 arg_oom_score_adjust_set = true;
4568 }
4569 }
4570
4571 if ((arg_settings_mask & SETTING_CPU_AFFINITY) == 0 &&
4572 settings->cpu_set.set) {
4573
4574 if (!arg_settings_trusted)
4575 log_warning("Ignoring CPUAffinity= setting, file '%s' is not trusted.", path);
4576 else {
4577 cpu_set_reset(&arg_cpu_set);
4578 arg_cpu_set = TAKE_STRUCT(settings->cpu_set);
4579 }
4580 }
4581
4582 if ((arg_settings_mask & SETTING_RESOLV_CONF) == 0 &&
4583 settings->resolv_conf != _RESOLV_CONF_MODE_INVALID)
4584 arg_resolv_conf = settings->resolv_conf;
4585
4586 if ((arg_settings_mask & SETTING_LINK_JOURNAL) == 0 &&
4587 settings->link_journal != _LINK_JOURNAL_INVALID) {
4588
4589 if (!arg_settings_trusted)
4590 log_warning("Ignoring journal link setting, file '%s' is not trusted.", path);
4591 else {
4592 arg_link_journal = settings->link_journal;
4593 arg_link_journal_try = settings->link_journal_try;
4594 }
4595 }
4596
4597 if ((arg_settings_mask & SETTING_TIMEZONE) == 0 &&
4598 settings->timezone != _TIMEZONE_MODE_INVALID)
4599 arg_timezone = settings->timezone;
4600
4601 if ((arg_settings_mask & SETTING_SLICE) == 0 &&
4602 settings->slice) {
4603
4604 if (!arg_settings_trusted)
4605 log_warning("Ignoring slice setting, file '%s' is not trusted.", path);
4606 else
4607 free_and_replace(arg_slice, settings->slice);
4608 }
4609
4610 if ((arg_settings_mask & SETTING_USE_CGNS) == 0 &&
4611 settings->use_cgns >= 0) {
4612
4613 if (!arg_settings_trusted)
4614 log_warning("Ignoring cgroup namespace setting, file '%s' is not trusted.", path);
4615 else
4616 arg_use_cgns = settings->use_cgns;
4617 }
4618
4619 if ((arg_settings_mask & SETTING_CLONE_NS_FLAGS) == 0 &&
4620 settings->clone_ns_flags != ULONG_MAX) {
4621
4622 if (!arg_settings_trusted)
4623 log_warning("Ignoring namespace setting, file '%s' is not trusted.", path);
4624 else
4625 arg_clone_ns_flags = settings->clone_ns_flags;
4626 }
4627
4628 if ((arg_settings_mask & SETTING_CONSOLE_MODE) == 0 &&
4629 settings->console_mode >= 0) {
4630
4631 if (!arg_settings_trusted)
4632 log_warning("Ignoring console mode setting, file '%s' is not trusted.", path);
4633 else
4634 arg_console_mode = settings->console_mode;
4635 }
4636
4637 if ((arg_settings_mask & SETTING_SUPPRESS_SYNC) == 0 &&
4638 settings->suppress_sync >= 0)
4639 arg_suppress_sync = settings->suppress_sync;
4640
4641 /* The following properties can only be set through the OCI settings logic, not from the command line, hence we
4642 * don't consult arg_settings_mask for them. */
4643
/* Note that these are taken over unconditionally, i.e. any previous values of the arg_* variables
 * below are released/overwritten even if 'settings' carries nothing for them. */
4644 sd_bus_message_unref(arg_property_message);
4645 arg_property_message = TAKE_PTR(settings->properties);
4646
4647 arg_console_width = settings->console_width;
4648 arg_console_height = settings->console_height;
4649
4650 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
4651 arg_extra_nodes = TAKE_PTR(settings->extra_nodes);
4652 arg_n_extra_nodes = settings->n_extra_nodes;
4653 settings->n_extra_nodes = 0;
4654
4655 return 0;
4656 }
4657
4658 static int load_settings(void) {
4659 _cleanup_(settings_freep) Settings *settings = NULL;
4660 _cleanup_fclose_ FILE *f = NULL;
4661 _cleanup_free_ char *p = NULL;
4662 int r;
4663
4664 if (arg_oci_bundle)
4665 return 0;
4666
4667 /* If all settings are masked, there's no point in looking for
4668 * the settings file */
4669 if (FLAGS_SET(arg_settings_mask, _SETTINGS_MASK_ALL))
4670 return 0;
4671
4672 /* We first look in the admin's directories in /etc and /run */
4673 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
4674 _cleanup_free_ char *j = NULL;
4675
4676 j = path_join(i, arg_settings_filename);
4677 if (!j)
4678 return log_oom();
4679
4680 f = fopen(j, "re");
4681 if (f) {
4682 p = TAKE_PTR(j);
4683
4684 /* By default, we trust configuration from /etc and /run */
4685 if (arg_settings_trusted < 0)
4686 arg_settings_trusted = true;
4687
4688 break;
4689 }
4690
4691 if (errno != ENOENT)
4692 return log_error_errno(errno, "Failed to open %s: %m", j);
4693 }
4694
4695 if (!f) {
4696 /* After that, let's look for a file next to the
4697 * actual image we shall boot. */
4698
4699 if (arg_image) {
4700 r = file_in_same_dir(arg_image, arg_settings_filename, &p);
4701 if (r < 0)
4702 return log_error_errno(r, "Failed to generate settings path from image path: %m");
4703 } else if (arg_directory) {
4704 r = file_in_same_dir(arg_directory, arg_settings_filename, &p);
4705 if (r < 0 && r != -EADDRNOTAVAIL) /* if directory is root fs, don't complain */
4706 return log_error_errno(r, "Failed to generate settings path from directory path: %m");
4707 }
4708
4709 if (p) {
4710 f = fopen(p, "re");
4711 if (!f && errno != ENOENT)
4712 return log_error_errno(errno, "Failed to open %s: %m", p);
4713
4714 /* By default, we do not trust configuration from /var/lib/machines */
4715 if (arg_settings_trusted < 0)
4716 arg_settings_trusted = false;
4717 }
4718 }
4719
4720 if (!f)
4721 return 0;
4722
4723 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
4724
4725 r = settings_load(f, p, &settings);
4726 if (r < 0)
4727 return r;
4728
4729 return merge_settings(settings, p);
4730 }
4731
4732 static int load_oci_bundle(void) {
4733 _cleanup_(settings_freep) Settings *settings = NULL;
4734 int r;
4735
4736 if (!arg_oci_bundle)
4737 return 0;
4738
4739 /* By default let's trust OCI bundles */
4740 if (arg_settings_trusted < 0)
4741 arg_settings_trusted = true;
4742
4743 r = oci_load(NULL, arg_oci_bundle, &settings);
4744 if (r < 0)
4745 return r;
4746
4747 return merge_settings(settings, arg_oci_bundle);
4748 }
4749
4750 static int run_container(
4751 DissectedImage *dissected_image,
4752 FDSet *fds,
4753 char veth_name[IFNAMSIZ], bool *veth_created,
4754 struct ExposeArgs *expose_args,
4755 int *master, pid_t *pid, int *ret) {
4756
4757 static const struct sigaction sa = {
4758 .sa_handler = nop_signal_handler,
4759 .sa_flags = SA_NOCLDSTOP|SA_RESTART,
4760 };
4761
4762 _cleanup_(release_lock_file) LockFile uid_shift_lock = LOCK_FILE_INIT;
4763 _cleanup_close_ int etc_passwd_lock = -EBADF;
4764 _cleanup_close_pair_ int
4765 fd_inner_socket_pair[2] = PIPE_EBADF,
4766 fd_outer_socket_pair[2] = PIPE_EBADF;
4767
4768 _cleanup_close_ int notify_socket = -EBADF, mntns_fd = -EBADF, fd_kmsg_fifo = -EBADF;
4769 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
4770 _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL;
4771 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
4772 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4773 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
4774 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
4775 _cleanup_free_ uid_t *bind_user_uid = NULL;
4776 size_t n_bind_user_uid = 0;
4777 ContainerStatus container_status = 0;
4778 int ifi = 0, r;
4779 ssize_t l;
4780 sigset_t mask_chld;
4781 _cleanup_close_ int child_netns_fd = -EBADF;
4782
4783 assert_se(sigemptyset(&mask_chld) == 0);
4784 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
4785
4786 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4787 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
4788 * check with getpwuid() if the specific user already exists. Note that /etc might be
4789 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
4790 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
4791 * really just an extra safety net. We kinda assume that the UID range we allocate from is
4792 * really ours. */
4793
4794 etc_passwd_lock = take_etc_passwd_lock(NULL);
4795 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS)
4796 return log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m");
4797 }
4798
4799 r = barrier_create(&barrier);
4800 if (r < 0)
4801 return log_error_errno(r, "Cannot initialize IPC barrier: %m");
4802
4803 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, fd_inner_socket_pair) < 0)
4804 return log_error_errno(errno, "Failed to create inner socket pair: %m");
4805
4806 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, fd_outer_socket_pair) < 0)
4807 return log_error_errno(errno, "Failed to create outer socket pair: %m");
4808
4809 /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
4810 * parent's blocking calls and give it a chance to call wait() and terminate. */
4811 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
4812 if (r < 0)
4813 return log_error_errno(errno, "Failed to change the signal mask: %m");
4814
4815 r = sigaction(SIGCHLD, &sa, NULL);
4816 if (r < 0)
4817 return log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
4818
4819 if (arg_network_namespace_path) {
4820 child_netns_fd = open(arg_network_namespace_path, O_RDONLY|O_NOCTTY|O_CLOEXEC);
4821 if (child_netns_fd < 0)
4822 return log_error_errno(errno, "Cannot open file %s: %m", arg_network_namespace_path);
4823
4824 r = fd_is_ns(child_netns_fd, CLONE_NEWNET);
4825 if (r == -EUCLEAN)
4826 log_debug_errno(r, "Cannot determine if passed network namespace path '%s' really refers to a network namespace, assuming it does.", arg_network_namespace_path);
4827 else if (r < 0)
4828 return log_error_errno(r, "Failed to check %s fs type: %m", arg_network_namespace_path);
4829 else if (r == 0)
4830 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
4831 "Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path);
4832 }
4833
4834 *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
4835 if (*pid < 0)
4836 return log_error_errno(errno, "clone() failed%s: %m",
4837 errno == EINVAL ?
4838 ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
4839
4840 if (*pid == 0) {
4841 /* The outer child only has a file system namespace. */
4842 barrier_set_role(&barrier, BARRIER_CHILD);
4843
4844 fd_inner_socket_pair[0] = safe_close(fd_inner_socket_pair[0]);
4845 fd_outer_socket_pair[0] = safe_close(fd_outer_socket_pair[0]);
4846
4847 (void) reset_all_signal_handlers();
4848 (void) reset_signal_mask();
4849
4850 r = outer_child(&barrier,
4851 arg_directory,
4852 dissected_image,
4853 fd_outer_socket_pair[1],
4854 fd_inner_socket_pair[1],
4855 fds,
4856 child_netns_fd);
4857 if (r < 0)
4858 _exit(EXIT_FAILURE);
4859
4860 _exit(EXIT_SUCCESS);
4861 }
4862
4863 barrier_set_role(&barrier, BARRIER_PARENT);
4864
4865 fdset_close(fds);
4866
4867 fd_inner_socket_pair[1] = safe_close(fd_inner_socket_pair[1]);
4868 fd_outer_socket_pair[1] = safe_close(fd_outer_socket_pair[1]);
4869
4870 if (arg_userns_mode != USER_NAMESPACE_NO) {
4871 mntns_fd = receive_one_fd(fd_outer_socket_pair[0], 0);
4872 if (mntns_fd < 0)
4873 return log_error_errno(mntns_fd, "Failed to receive mount namespace fd from outer child: %m");
4874
4875 /* The child just let us know the UID shift it might have read from the image. */
4876 l = recv(fd_outer_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
4877 if (l < 0)
4878 return log_error_errno(errno, "Failed to read UID shift: %m");
4879 if (l != sizeof arg_uid_shift)
4880 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading UID shift.");
4881
4882 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4883 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
4884 * image, but if that's already in use, pick a new one, and report back to the child,
4885 * which one we now picked. */
4886
4887 r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
4888 if (r < 0)
4889 return log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
4890
4891 l = send(fd_outer_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
4892 if (l < 0)
4893 return log_error_errno(errno, "Failed to send UID shift: %m");
4894 if (l != sizeof arg_uid_shift)
4895 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short write while writing UID shift.");
4896 }
4897
4898 n_bind_user_uid = strv_length(arg_bind_user);
4899 if (n_bind_user_uid > 0) {
4900 /* Right after the UID shift, we'll receive the list of UID mappings for the
4901 * --bind-user= logic. Always a quadruplet of payload and host UID + GID. */
4902
4903 bind_user_uid = new(uid_t, n_bind_user_uid*4);
4904 if (!bind_user_uid)
4905 return log_oom();
4906
4907 for (size_t i = 0; i < n_bind_user_uid; i++) {
4908 l = recv(fd_outer_socket_pair[0], bind_user_uid + i*4, sizeof(uid_t)*4, 0);
4909 if (l < 0)
4910 return log_error_errno(errno, "Failed to read user UID map pair: %m");
4911 if (l != sizeof(uid_t)*4)
4912 return log_full_errno(l == 0 ? LOG_DEBUG : LOG_WARNING,
4913 SYNTHETIC_ERRNO(EIO),
4914 "Short read while reading bind user UID pairs.");
4915 }
4916 }
4917 }
4918
4919 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
4920 /* The child let us know the support cgroup mode it might have read from the image. */
4921 l = recv(fd_outer_socket_pair[0], &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), 0);
4922 if (l < 0)
4923 return log_error_errno(errno, "Failed to read cgroup mode: %m");
4924 if (l != sizeof(arg_unified_cgroup_hierarchy))
4925 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading cgroup mode (%zi bytes).%s",
4926 l, l == 0 ? " The child is most likely dead." : "");
4927 }
4928
4929 /* Wait for the outer child. */
4930 r = wait_for_terminate_and_check("(sd-namespace)", *pid, WAIT_LOG_ABNORMAL);
4931 if (r < 0)
4932 return r;
4933 if (r != EXIT_SUCCESS)
4934 return -EIO;
4935
4936 /* And now retrieve the PID of the inner child. */
4937 l = recv(fd_outer_socket_pair[0], pid, sizeof *pid, 0);
4938 if (l < 0)
4939 return log_error_errno(errno, "Failed to read inner child PID: %m");
4940 if (l != sizeof *pid)
4941 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading inner child PID.");
4942
4943 /* We also retrieve container UUID in case it was generated by outer child */
4944 l = recv(fd_outer_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
4945 if (l < 0)
4946 return log_error_errno(errno, "Failed to read container machine ID: %m");
4947 if (l != sizeof(arg_uuid))
4948 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading container machined ID.");
4949
4950 /* We also retrieve the socket used for notifications generated by outer child */
4951 notify_socket = receive_one_fd(fd_outer_socket_pair[0], 0);
4952 if (notify_socket < 0)
4953 return log_error_errno(notify_socket,
4954 "Failed to receive notification socket from the outer child: %m");
4955
4956 log_debug("Init process invoked as PID "PID_FMT, *pid);
4957
4958 if (arg_userns_mode != USER_NAMESPACE_NO) {
4959 if (!barrier_place_and_sync(&barrier)) /* #1 */
4960 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
4961
4962 r = setup_uid_map(*pid, bind_user_uid, n_bind_user_uid);
4963 if (r < 0)
4964 return r;
4965
4966 (void) barrier_place(&barrier); /* #2 */
4967 }
4968
4969 if (arg_private_network) {
4970 if (!arg_network_namespace_path) {
4971 /* Wait until the child has unshared its network namespace. */
4972 if (!barrier_place_and_sync(&barrier)) /* #3 */
4973 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early");
4974 }
4975
4976 if (child_netns_fd < 0) {
4977 /* Make sure we have an open file descriptor to the child's network
4978 * namespace so it stays alive even if the child exits. */
4979 r = namespace_open(*pid, NULL, NULL, &child_netns_fd, NULL, NULL);
4980 if (r < 0)
4981 return log_error_errno(r, "Failed to open child network namespace: %m");
4982 }
4983
4984 r = move_network_interfaces(child_netns_fd, arg_network_interfaces);
4985 if (r < 0)
4986 return r;
4987
4988 if (arg_network_veth) {
4989 r = setup_veth(arg_machine, *pid, veth_name,
4990 arg_network_bridge || arg_network_zone, &arg_network_provided_mac);
4991 if (r < 0)
4992 return r;
4993 else if (r > 0)
4994 ifi = r;
4995
4996 if (arg_network_bridge) {
4997 /* Add the interface to a bridge */
4998 r = setup_bridge(veth_name, arg_network_bridge, false);
4999 if (r < 0)
5000 return r;
5001 if (r > 0)
5002 ifi = r;
5003 } else if (arg_network_zone) {
5004 /* Add the interface to a bridge, possibly creating it */
5005 r = setup_bridge(veth_name, arg_network_zone, true);
5006 if (r < 0)
5007 return r;
5008 if (r > 0)
5009 ifi = r;
5010 }
5011 }
5012
5013 r = setup_veth_extra(arg_machine, *pid, arg_network_veth_extra);
5014 if (r < 0)
5015 return r;
5016
5017 /* We created the primary and extra veth links now; let's remember this, so that we know to
5018 remove them later on. Note that we don't bother with removing veth links that were created
5019 here when their setup failed half-way, because in that case the kernel should be able to
5020 remove them on its own, since they cannot be referenced by anything yet. */
5021 *veth_created = true;
5022
5023 r = setup_macvlan(arg_machine, *pid, arg_network_macvlan);
5024 if (r < 0)
5025 return r;
5026
5027 r = setup_ipvlan(arg_machine, *pid, arg_network_ipvlan);
5028 if (r < 0)
5029 return r;
5030 }
5031
5032 if (arg_register || !arg_keep_unit) {
5033 r = sd_bus_default_system(&bus);
5034 if (r < 0)
5035 return log_error_errno(r, "Failed to open system bus: %m");
5036
5037 r = sd_bus_set_close_on_exit(bus, false);
5038 if (r < 0)
5039 return log_error_errno(r, "Failed to disable close-on-exit behaviour: %m");
5040 }
5041
5042 if (!arg_keep_unit) {
5043 /* When a new scope is created for this container, then we'll be registered as its controller, in which
5044 * case PID 1 will send us a friendly RequestStop signal, when it is asked to terminate the
5045 * scope. Let's hook into that, and cleanly shut down the container, and print a friendly message. */
5046
5047 r = sd_bus_match_signal_async(
5048 bus,
5049 NULL,
5050 "org.freedesktop.systemd1",
5051 NULL,
5052 "org.freedesktop.systemd1.Scope",
5053 "RequestStop",
5054 on_request_stop, NULL, PID_TO_PTR(*pid));
5055 if (r < 0)
5056 return log_error_errno(r, "Failed to request RequestStop match: %m");
5057 }
5058
5059 if (arg_register) {
5060 r = register_machine(
5061 bus,
5062 arg_machine,
5063 *pid,
5064 arg_directory,
5065 arg_uuid,
5066 ifi,
5067 arg_slice,
5068 arg_custom_mounts, arg_n_custom_mounts,
5069 arg_kill_signal,
5070 arg_property,
5071 arg_property_message,
5072 arg_keep_unit,
5073 arg_container_service_name,
5074 arg_start_mode);
5075 if (r < 0)
5076 return r;
5077
5078 } else if (!arg_keep_unit) {
5079 r = allocate_scope(
5080 bus,
5081 arg_machine,
5082 *pid,
5083 arg_slice,
5084 arg_custom_mounts, arg_n_custom_mounts,
5085 arg_kill_signal,
5086 arg_property,
5087 arg_property_message,
5088 /* allow_pidfds= */ true,
5089 arg_start_mode);
5090 if (r < 0)
5091 return r;
5092
5093 } else if (arg_slice || arg_property)
5094 log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect.");
5095
5096 r = create_subcgroup(*pid, arg_keep_unit, arg_unified_cgroup_hierarchy);
5097 if (r < 0)
5098 return r;
5099
5100 r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
5101 if (r < 0)
5102 return r;
5103
5104 r = chown_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
5105 if (r < 0)
5106 return r;
5107
5108 /* Notify the child that the parent is ready with all
5109 * its setup (including cgroup-ification), and that
5110 * the child can now hand over control to the code to
5111 * run inside the container. */
5112 (void) barrier_place(&barrier); /* #4 */
5113
5114 /* Block SIGCHLD here, before notifying child.
5115 * process_pty() will handle it with the other signals. */
5116 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
5117
5118 /* Reset signal to default */
5119 r = default_signals(SIGCHLD);
5120 if (r < 0)
5121 return log_error_errno(r, "Failed to reset SIGCHLD: %m");
5122
5123 r = sd_event_new(&event);
5124 if (r < 0)
5125 return log_error_errno(r, "Failed to get default event source: %m");
5126
5127 (void) sd_event_set_watchdog(event, true);
5128
5129 if (bus) {
5130 r = sd_bus_attach_event(bus, event, 0);
5131 if (r < 0)
5132 return log_error_errno(r, "Failed to attach bus to event loop: %m");
5133 }
5134
5135 r = setup_notify_parent(event, notify_socket, PID_TO_PTR(*pid), &notify_event_source);
5136 if (r < 0)
5137 return r;
5138
5139 if (arg_userns_mode != USER_NAMESPACE_NO) {
5140 r = wipe_fully_visible_fs(mntns_fd);
5141 if (r < 0)
5142 return r;
5143 mntns_fd = safe_close(mntns_fd);
5144 }
5145
5146 /* Let the child know that we are ready and wait that the child is completely ready now. */
5147 if (!barrier_place_and_sync(&barrier)) /* #5 */
5148 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
5149
5150 /* At this point we have made use of the UID we picked, and thus nss-systemd/systemd-machined.service
5151 * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
5152 etc_passwd_lock = safe_close(etc_passwd_lock);
5153
5154 (void) sd_notifyf(false,
5155 "STATUS=Container running.\n"
5156 "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
5157 if (!arg_notify_ready) {
5158 r = sd_notify(false, "READY=1\n");
5159 if (r < 0)
5160 log_warning_errno(r, "Failed to send readiness notification, ignoring: %m");
5161 }
5162
5163 if (arg_kill_signal > 0) {
5164 /* Try to kill the init system on SIGINT or SIGTERM */
5165 (void) sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
5166 (void) sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
5167 } else {
5168 /* Immediately exit */
5169 (void) sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
5170 (void) sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
5171 }
5172
5173 (void) sd_event_add_signal(event, NULL, SIGRTMIN+18, sigrtmin18_handler, NULL);
5174
5175 r = sd_event_add_memory_pressure(event, NULL, NULL, NULL);
5176 if (r < 0)
5177 log_debug_errno(r, "Failed allocate memory pressure event source, ignoring: %m");
5178
5179 /* Exit when the child exits */
5180 (void) sd_event_add_signal(event, NULL, SIGCHLD, on_sigchld, PID_TO_PTR(*pid));
5181
5182 /* Retrieve the kmsg fifo allocated by inner child */
5183 fd_kmsg_fifo = receive_one_fd(fd_inner_socket_pair[0], 0);
5184 if (fd_kmsg_fifo < 0)
5185 return log_error_errno(fd_kmsg_fifo, "Failed to receive kmsg fifo from inner child: %m");
5186
5187 if (arg_expose_ports) {
5188 r = expose_port_watch_rtnl(event, fd_inner_socket_pair[0], on_address_change, expose_args, &rtnl);
5189 if (r < 0)
5190 return r;
5191
5192 (void) expose_port_execute(rtnl, &expose_args->fw_ctx, arg_expose_ports, AF_INET, &expose_args->address4);
5193 (void) expose_port_execute(rtnl, &expose_args->fw_ctx, arg_expose_ports, AF_INET6, &expose_args->address6);
5194 }
5195
5196 if (arg_console_mode != CONSOLE_PIPE) {
5197 _cleanup_close_ int fd = -EBADF;
5198 PTYForwardFlags flags = 0;
5199
5200 /* Retrieve the master pty allocated by inner child */
5201 fd = receive_one_fd(fd_inner_socket_pair[0], 0);
5202 if (fd < 0)
5203 return log_error_errno(fd, "Failed to receive master pty from the inner child: %m");
5204
5205 switch (arg_console_mode) {
5206
5207 case CONSOLE_READ_ONLY:
5208 flags |= PTY_FORWARD_READ_ONLY;
5209
5210 _fallthrough_;
5211
5212 case CONSOLE_INTERACTIVE:
5213 flags |= PTY_FORWARD_IGNORE_VHANGUP;
5214
5215 r = pty_forward_new(event, fd, flags, &forward);
5216 if (r < 0)
5217 return log_error_errno(r, "Failed to create PTY forwarder: %m");
5218
5219 if (arg_console_width != UINT_MAX || arg_console_height != UINT_MAX)
5220 (void) pty_forward_set_width_height(forward,
5221 arg_console_width,
5222 arg_console_height);
5223 break;
5224
5225 default:
5226 assert(arg_console_mode == CONSOLE_PASSIVE);
5227 }
5228
5229 *master = TAKE_FD(fd);
5230 }
5231
5232 fd_inner_socket_pair[0] = safe_close(fd_inner_socket_pair[0]);
5233
5234 r = sd_event_loop(event);
5235 if (r < 0)
5236 return log_error_errno(r, "Failed to run event loop: %m");
5237
5238 if (forward) {
5239 char last_char = 0;
5240
5241 (void) pty_forward_get_last_char(forward, &last_char);
5242 forward = pty_forward_free(forward);
5243
5244 if (!arg_quiet && last_char != '\n')
5245 putc('\n', stdout);
5246 }
5247
5248 /* Kill if it is not dead yet anyway */
5249 if (!arg_register && !arg_keep_unit && bus)
5250 terminate_scope(bus, arg_machine);
5251
5252 /* Normally redundant, but better safe than sorry */
5253 (void) kill(*pid, SIGKILL);
5254
5255 fd_kmsg_fifo = safe_close(fd_kmsg_fifo);
5256
5257 if (arg_private_network) {
5258 /* Move network interfaces back to the parent network namespace. We use `safe_fork`
5259 * to avoid having to move the parent to the child network namespace. */
5260 r = safe_fork(NULL, FORK_RESET_SIGNALS|FORK_DEATHSIG|FORK_WAIT|FORK_LOG, NULL);
5261 if (r < 0)
5262 return r;
5263
5264 if (r == 0) {
5265 _cleanup_close_ int parent_netns_fd = -EBADF;
5266
5267 r = namespace_open(getpid_cached(), NULL, NULL, &parent_netns_fd, NULL, NULL);
5268 if (r < 0) {
5269 log_error_errno(r, "Failed to open parent network namespace: %m");
5270 _exit(EXIT_FAILURE);
5271 }
5272
5273 r = namespace_enter(-1, -1, child_netns_fd, -1, -1);
5274 if (r < 0) {
5275 log_error_errno(r, "Failed to enter child network namespace: %m");
5276 _exit(EXIT_FAILURE);
5277 }
5278
5279 /* Reverse network interfaces pair list so that interfaces get their initial name back.
5280 * This is about ensuring interfaces get their old name back when being moved back. */
5281 arg_network_interfaces = strv_reverse(arg_network_interfaces);
5282
5283 r = move_network_interfaces(parent_netns_fd, arg_network_interfaces);
5284 if (r < 0)
5285 log_error_errno(r, "Failed to move network interfaces back to parent network namespace: %m");
5286
5287 _exit(r < 0 ? EXIT_FAILURE : EXIT_SUCCESS);
5288 }
5289 }
5290
5291 r = wait_for_container(TAKE_PID(*pid), &container_status);
5292
5293 /* Tell machined that we are gone. */
5294 if (bus)
5295 (void) unregister_machine(bus, arg_machine);
5296
5297 if (r < 0)
5298 /* We failed to wait for the container, or the container exited abnormally. */
5299 return r;
5300 if (r > 0 || container_status == CONTAINER_TERMINATED) {
5301 /* r > 0 → The container exited with a non-zero status.
5302 * As a special case, we need to replace 133 with a different value,
5303 * because 133 is special-cased in the service file to reboot the container.
5304 * otherwise → The container exited with zero status and a reboot was not requested.
5305 */
5306 if (r == EXIT_FORCE_RESTART)
5307 r = EXIT_FAILURE; /* replace 133 with the general failure code */
5308 *ret = r;
5309 return 0; /* finito */
5310 }
5311
5312 /* CONTAINER_REBOOTED, loop again */
5313
5314 if (arg_keep_unit) {
5315 /* Special handling if we are running as a service: instead of simply
5316 * restarting the machine we want to restart the entire service, so let's
5317 * inform systemd about this with the special exit code 133. The service
5318 * file uses RestartForceExitStatus=133 so that this results in a full
5319 * nspawn restart. This is necessary since we might have cgroup parameters
5320 * set we want to have flushed out. */
5321 *ret = EXIT_FORCE_RESTART;
5322 return 0; /* finito */
5323 }
5324
5325 expose_port_flush(&expose_args->fw_ctx, arg_expose_ports, AF_INET, &expose_args->address4);
5326 expose_port_flush(&expose_args->fw_ctx, arg_expose_ports, AF_INET6, &expose_args->address6);
5327
5328 (void) remove_veth_links(veth_name, arg_network_veth_extra);
5329 *veth_created = false;
5330 return 1; /* loop again */
5331 }
5332
5333 static int initialize_rlimits(void) {
5334 /* The default resource limits the kernel passes to PID 1, as per kernel 5.16. Let's pass our container payload
5335 * the same values as the kernel originally passed to PID 1, in order to minimize differences between host and
5336 * container execution environments. */
5337
5338 static const struct rlimit kernel_defaults[_RLIMIT_MAX] = {
5339 [RLIMIT_AS] = { RLIM_INFINITY, RLIM_INFINITY },
5340 [RLIMIT_CORE] = { 0, RLIM_INFINITY },
5341 [RLIMIT_CPU] = { RLIM_INFINITY, RLIM_INFINITY },
5342 [RLIMIT_DATA] = { RLIM_INFINITY, RLIM_INFINITY },
5343 [RLIMIT_FSIZE] = { RLIM_INFINITY, RLIM_INFINITY },
5344 [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY },
5345 [RLIMIT_MEMLOCK] = { DEFAULT_RLIMIT_MEMLOCK, DEFAULT_RLIMIT_MEMLOCK },
5346 [RLIMIT_MSGQUEUE] = { 819200, 819200 },
5347 [RLIMIT_NICE] = { 0, 0 },
5348 [RLIMIT_NOFILE] = { 1024, 4096 },
5349 [RLIMIT_RSS] = { RLIM_INFINITY, RLIM_INFINITY },
5350 [RLIMIT_RTPRIO] = { 0, 0 },
5351 [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY },
5352 [RLIMIT_STACK] = { 8388608, RLIM_INFINITY },
5353
5354 /* The kernel scales the default for RLIMIT_NPROC and RLIMIT_SIGPENDING based on the system's amount of
5355 * RAM. To provide best compatibility we'll read these limits off PID 1 instead of hardcoding them
5356 * here. This is safe as we know that PID 1 doesn't change these two limits and thus the original
5357 * kernel's initialization should still be valid during runtime — at least if PID 1 is systemd. Note
5358 * that PID 1 changes a number of other resource limits during early initialization which is why we
5359 * don't read the other limits from PID 1 but prefer the static table above. */
5360 };
5361
5362 int rl;
5363
5364 for (rl = 0; rl < _RLIMIT_MAX; rl++) {
5365 /* Let's only fill in what the user hasn't explicitly configured anyway */
5366 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)) == 0) {
5367 const struct rlimit *v;
5368 struct rlimit buffer;
5369
5370 if (IN_SET(rl, RLIMIT_NPROC, RLIMIT_SIGPENDING)) {
5371 /* For these two let's read the limits off PID 1. See above for an explanation. */
5372
5373 if (prlimit(1, rl, NULL, &buffer) < 0)
5374 return log_error_errno(errno, "Failed to read resource limit RLIMIT_%s of PID 1: %m", rlimit_to_string(rl));
5375
5376 v = &buffer;
5377 } else if (rl == RLIMIT_NOFILE) {
5378 /* We nowadays bump RLIMIT_NOFILE's hard limit early in PID 1 for all
5379 * userspace. Given that nspawn containers are often run without our PID 1,
5380 * let's grant the containers a raised RLIMIT_NOFILE hard limit by default,
5381 * so that container userspace gets similar resources as host userspace
5382 * gets. */
5383 buffer = kernel_defaults[rl];
5384 buffer.rlim_max = MIN((rlim_t) read_nr_open(), (rlim_t) HIGH_RLIMIT_NOFILE);
5385 v = &buffer;
5386 } else
5387 v = kernel_defaults + rl;
5388
5389 arg_rlimit[rl] = newdup(struct rlimit, v, 1);
5390 if (!arg_rlimit[rl])
5391 return log_oom();
5392 }
5393
5394 if (DEBUG_LOGGING) {
5395 _cleanup_free_ char *k = NULL;
5396
5397 (void) rlimit_format(arg_rlimit[rl], &k);
5398 log_debug("Setting RLIMIT_%s to %s.", rlimit_to_string(rl), k);
5399 }
5400 }
5401
5402 return 0;
5403 }
5404
5405 static int cant_be_in_netns(void) {
5406 _cleanup_close_ int fd = -EBADF;
5407 struct ucred ucred;
5408 int r;
5409
5410 /* Check if we are in the same netns as udev. If we aren't, then device monitoring (and thus waiting
5411 * for loopback block devices) won't work, and we will hang. Detect this case and exit early with a
5412 * nice message. */
5413
5414 if (!arg_image) /* only matters if --image= us used, i.e. we actually need to use loopback devices */
5415 return 0;
5416
5417 fd = socket(AF_UNIX, SOCK_SEQPACKET|SOCK_NONBLOCK|SOCK_CLOEXEC, 0);
5418 if (fd < 0)
5419 return log_error_errno(errno, "Failed to allocate udev control socket: %m");
5420
5421 r = connect_unix_path(fd, AT_FDCWD, "/run/udev/control");
5422 if (r == -ENOENT || ERRNO_IS_NEG_DISCONNECT(r))
5423 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
5424 "Sorry, but --image= requires access to the host's /run/ hierarchy, since we need access to udev.");
5425 if (r < 0)
5426 return log_error_errno(r, "Failed to connect socket to udev control socket: %m");
5427
5428 r = getpeercred(fd, &ucred);
5429 if (r < 0)
5430 return log_error_errno(r, "Failed to determine peer of udev control socket: %m");
5431
5432 r = in_same_namespace(ucred.pid, 0, NAMESPACE_NET);
5433 if (r < 0)
5434 return log_error_errno(r, "Failed to determine network namespace of udev: %m");
5435 if (r == 0)
5436 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
5437 "Sorry, but --image= is only supported in the main network namespace, since we need access to udev/AF_NETLINK.");
5438 return 0;
5439 }
5440
5441 static int run(int argc, char *argv[]) {
5442 bool remove_directory = false, remove_image = false, veth_created = false, remove_tmprootdir = false;
5443 _cleanup_close_ int master = -EBADF;
5444 _cleanup_fdset_free_ FDSet *fds = NULL;
5445 int r, n_fd_passed, ret = EXIT_SUCCESS;
5446 char veth_name[IFNAMSIZ] = "";
5447 struct ExposeArgs expose_args = {};
5448 _cleanup_(release_lock_file) LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
5449 char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
5450 _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
5451 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
5452 _cleanup_(fw_ctx_freep) FirewallContext *fw_ctx = NULL;
5453 pid_t pid = 0;
5454
5455 log_parse_environment();
5456 log_open();
5457
5458 r = parse_argv(argc, argv);
5459 if (r <= 0)
5460 goto finish;
5461
5462 if (geteuid() != 0) {
5463 r = log_warning_errno(SYNTHETIC_ERRNO(EPERM),
5464 argc >= 2 ? "Need to be root." :
5465 "Need to be root (and some arguments are usually required).\nHint: try --help");
5466 goto finish;
5467 }
5468
5469 r = cant_be_in_netns();
5470 if (r < 0)
5471 goto finish;
5472
5473 r = initialize_rlimits();
5474 if (r < 0)
5475 goto finish;
5476
5477 r = load_oci_bundle();
5478 if (r < 0)
5479 goto finish;
5480
5481 r = determine_names();
5482 if (r < 0)
5483 goto finish;
5484
5485 r = load_settings();
5486 if (r < 0)
5487 goto finish;
5488
5489 r = cg_unified();
5490 if (r < 0) {
5491 log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
5492 goto finish;
5493 }
5494
5495 r = verify_arguments();
5496 if (r < 0)
5497 goto finish;
5498
5499 r = verify_network_interfaces_initialized();
5500 if (r < 0)
5501 goto finish;
5502
5503 /* Reapply environment settings. */
5504 (void) detect_unified_cgroup_hierarchy_from_environment();
5505
5506 /* Ignore SIGPIPE here, because we use splice() on the ptyfwd stuff and that will generate SIGPIPE if
5507 * the result is closed. Note that the container payload child will reset signal mask+handler anyway,
5508 * so just turning this off here means we only turn it off in nspawn itself, not any children. */
5509 (void) ignore_signals(SIGPIPE);
5510
5511 n_fd_passed = sd_listen_fds(false);
5512 if (n_fd_passed > 0) {
5513 r = fdset_new_listen_fds(&fds, false);
5514 if (r < 0) {
5515 log_error_errno(r, "Failed to collect file descriptors: %m");
5516 goto finish;
5517 }
5518 }
5519
5520 /* The "default" umask. This is appropriate for most file and directory
5521 * operations performed by nspawn, and is the umask that will be used for
5522 * the child. Functions like copy_devnodes() change the umask temporarily. */
5523 umask(0022);
5524
5525 if (arg_directory) {
5526 assert(!arg_image);
5527
5528 /* Safety precaution: let's not allow running images from the live host OS image, as long as
5529 * /var from the host will propagate into container dynamically (because bad things happen if
5530 * two systems write to the same /var). Let's allow it for the special cases where /var is
5531 * either copied (i.e. --ephemeral) or replaced (i.e. --volatile=yes|state). */
5532 if (path_equal(arg_directory, "/") && !(arg_ephemeral || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_STATE))) {
5533 r = log_error_errno(SYNTHETIC_ERRNO(EINVAL),
5534 "Spawning container on root directory is not supported. Consider using --ephemeral, --volatile=yes or --volatile=state.");
5535 goto finish;
5536 }
5537
5538 if (arg_ephemeral) {
5539 _cleanup_free_ char *np = NULL;
5540
5541 r = chase_and_update(&arg_directory, 0);
5542 if (r < 0)
5543 goto finish;
5544
5545 /* If the specified path is a mount point we generate the new snapshot immediately
5546 * inside it under a random name. However if the specified is not a mount point we
5547 * create the new snapshot in the parent directory, just next to it. */
5548 r = path_is_mount_point(arg_directory, NULL, 0);
5549 if (r < 0) {
5550 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
5551 goto finish;
5552 }
5553 if (r > 0)
5554 r = tempfn_random_child(arg_directory, "machine.", &np);
5555 else
5556 r = tempfn_random(arg_directory, "machine.", &np);
5557 if (r < 0) {
5558 log_error_errno(r, "Failed to generate name for directory snapshot: %m");
5559 goto finish;
5560 }
5561
5562 /* We take an exclusive lock on this image, since it's our private, ephemeral copy
5563 * only owned by us and no one else. */
5564 r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
5565 if (r < 0) {
5566 log_error_errno(r, "Failed to lock %s: %m", np);
5567 goto finish;
5568 }
5569
5570 {
5571 BLOCK_SIGNALS(SIGINT);
5572 r = btrfs_subvol_snapshot_at(AT_FDCWD, arg_directory, AT_FDCWD, np,
5573 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
5574 BTRFS_SNAPSHOT_FALLBACK_COPY |
5575 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
5576 BTRFS_SNAPSHOT_RECURSIVE |
5577 BTRFS_SNAPSHOT_QUOTA |
5578 BTRFS_SNAPSHOT_SIGINT);
5579 }
5580 if (r == -EINTR) {
5581 log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", np);
5582 goto finish;
5583 }
5584 if (r < 0) {
5585 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
5586 goto finish;
5587 }
5588
5589 free_and_replace(arg_directory, np);
5590 remove_directory = true;
5591 } else {
5592 r = chase_and_update(&arg_directory, arg_template ? CHASE_NONEXISTENT : 0);
5593 if (r < 0)
5594 goto finish;
5595
5596 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
5597 if (r == -EBUSY) {
5598 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
5599 goto finish;
5600 }
5601 if (r < 0) {
5602 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
5603 goto finish;
5604 }
5605
5606 if (arg_template) {
5607 r = chase_and_update(&arg_template, 0);
5608 if (r < 0)
5609 goto finish;
5610
5611 {
5612 BLOCK_SIGNALS(SIGINT);
5613 r = btrfs_subvol_snapshot_at(AT_FDCWD, arg_template, AT_FDCWD, arg_directory,
5614 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
5615 BTRFS_SNAPSHOT_FALLBACK_COPY |
5616 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
5617 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
5618 BTRFS_SNAPSHOT_RECURSIVE |
5619 BTRFS_SNAPSHOT_QUOTA |
5620 BTRFS_SNAPSHOT_SIGINT);
5621 }
5622 if (r == -EEXIST)
5623 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
5624 "Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
5625 else if (r == -EINTR) {
5626 log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", arg_directory);
5627 goto finish;
5628 } else if (r < 0) {
5629 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
5630 goto finish;
5631 } else
5632 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
5633 "Populated %s from template %s.", arg_directory, arg_template);
5634 }
5635 }
5636
5637 if (arg_start_mode == START_BOOT) {
5638 _cleanup_free_ char *b = NULL;
5639 const char *p;
5640
5641 if (arg_pivot_root_new) {
5642 b = path_join(arg_directory, arg_pivot_root_new);
5643 if (!b)
5644 return log_oom();
5645
5646 p = b;
5647 } else
5648 p = arg_directory;
5649
5650 if (path_is_os_tree(p) <= 0) {
5651 r = log_error_errno(SYNTHETIC_ERRNO(EINVAL),
5652 "Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", p);
5653 goto finish;
5654 }
5655 } else {
5656 _cleanup_free_ char *p = NULL;
5657
5658 if (arg_pivot_root_new)
5659 p = path_join(arg_directory, arg_pivot_root_new, "/usr/");
5660 else
5661 p = path_join(arg_directory, "/usr/");
5662 if (!p)
5663 return log_oom();
5664
5665 if (laccess(p, F_OK) < 0) {
5666 r = log_error_errno(SYNTHETIC_ERRNO(EINVAL),
5667 "Directory %s doesn't look like it has an OS tree (/usr/ directory is missing). Refusing.", arg_directory);
5668 goto finish;
5669 }
5670 }
5671
5672 } else {
5673 DissectImageFlags dissect_image_flags =
5674 DISSECT_IMAGE_GENERIC_ROOT |
5675 DISSECT_IMAGE_REQUIRE_ROOT |
5676 DISSECT_IMAGE_RELAX_VAR_CHECK |
5677 DISSECT_IMAGE_USR_NO_ROOT |
5678 DISSECT_IMAGE_ADD_PARTITION_DEVICES |
5679 DISSECT_IMAGE_PIN_PARTITION_DEVICES;
5680 assert(arg_image);
5681 assert(!arg_template);
5682
5683 r = chase_and_update(&arg_image, 0);
5684 if (r < 0)
5685 goto finish;
5686
5687 if (arg_ephemeral) {
5688 _cleanup_free_ char *np = NULL;
5689
5690 r = tempfn_random(arg_image, "machine.", &np);
5691 if (r < 0) {
5692 log_error_errno(r, "Failed to generate name for image snapshot: %m");
5693 goto finish;
5694 }
5695
5696 /* Always take an exclusive lock on our own ephemeral copy. */
5697 r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
5698 if (r < 0) {
5699 r = log_error_errno(r, "Failed to create image lock: %m");
5700 goto finish;
5701 }
5702
5703 {
5704 BLOCK_SIGNALS(SIGINT);
5705 r = copy_file_full(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600,
5706 FS_NOCOW_FL, FS_NOCOW_FL,
5707 COPY_REFLINK|COPY_CRTIME|COPY_SIGINT,
5708 NULL, NULL);
5709 }
5710 if (r == -EINTR) {
5711 log_error_errno(r, "Interrupted while copying image file to %s, removed again.", np);
5712 goto finish;
5713 }
5714 if (r < 0) {
5715 r = log_error_errno(r, "Failed to copy image file: %m");
5716 goto finish;
5717 }
5718
5719 free_and_replace(arg_image, np);
5720 remove_image = true;
5721 } else {
5722 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
5723 if (r == -EBUSY) {
5724 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
5725 goto finish;
5726 }
5727 if (r < 0) {
5728 r = log_error_errno(r, "Failed to create image lock: %m");
5729 goto finish;
5730 }
5731
5732 r = verity_settings_load(
5733 &arg_verity_settings,
5734 arg_image, NULL, NULL);
5735 if (r < 0) {
5736 log_error_errno(r, "Failed to read verity artefacts for %s: %m", arg_image);
5737 goto finish;
5738 }
5739
5740 if (arg_verity_settings.data_path)
5741 dissect_image_flags |= DISSECT_IMAGE_NO_PARTITION_TABLE;
5742 }
5743
5744 if (!mkdtemp(tmprootdir)) {
5745 r = log_error_errno(errno, "Failed to create temporary directory: %m");
5746 goto finish;
5747 }
5748
5749 remove_tmprootdir = true;
5750
5751 arg_directory = strdup(tmprootdir);
5752 if (!arg_directory) {
5753 r = log_oom();
5754 goto finish;
5755 }
5756
5757 r = loop_device_make_by_path(
5758 arg_image,
5759 arg_read_only ? O_RDONLY : O_RDWR,
5760 /* sector_size= */ UINT32_MAX,
5761 FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN,
5762 LOCK_SH,
5763 &loop);
5764 if (r < 0) {
5765 log_error_errno(r, "Failed to set up loopback block device: %m");
5766 goto finish;
5767 }
5768
5769 r = dissect_loop_device_and_warn(
5770 loop,
5771 &arg_verity_settings,
5772 /* mount_options=*/ NULL,
5773 arg_image_policy ?: &image_policy_container,
5774 dissect_image_flags,
5775 &dissected_image);
5776 if (r == -ENOPKG) {
5777 /* dissect_loop_device_and_warn() already printed a brief error message. Extend on that with more details. */
5778 log_notice("Note that the disk image needs to\n"
5779 " a) either contain only a single MBR partition of type 0x83 that is marked bootable\n"
5780 " b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n"
5781 " c) or follow https://uapi-group.org/specifications/specs/discoverable_partitions_specification\n"
5782 " d) or contain a file system without a partition table\n"
5783 "in order to be bootable with systemd-nspawn.");
5784 goto finish;
5785 }
5786 if (r < 0)
5787 goto finish;
5788
5789 r = dissected_image_load_verity_sig_partition(
5790 dissected_image,
5791 loop->fd,
5792 &arg_verity_settings);
5793 if (r < 0)
5794 goto finish;
5795
5796 if (dissected_image->has_verity && !arg_verity_settings.root_hash && !dissected_image->has_verity_sig)
5797 log_notice("Note: image %s contains verity information, but no root hash specified and no embedded "
5798 "root hash signature found! Proceeding without integrity checking.", arg_image);
5799
5800 r = dissected_image_decrypt_interactively(
5801 dissected_image,
5802 NULL,
5803 &arg_verity_settings,
5804 0);
5805 if (r < 0)
5806 goto finish;
5807
5808 /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */
5809 if (remove_image && unlink(arg_image) >= 0)
5810 remove_image = false;
5811
5812 if (arg_architecture < 0)
5813 arg_architecture = dissected_image_architecture(dissected_image);
5814 }
5815
5816 r = custom_mount_prepare_all(arg_directory, arg_custom_mounts, arg_n_custom_mounts);
5817 if (r < 0)
5818 goto finish;
5819
5820 if (arg_console_mode < 0)
5821 arg_console_mode =
5822 isatty(STDIN_FILENO) > 0 &&
5823 isatty(STDOUT_FILENO) > 0 ? CONSOLE_INTERACTIVE : CONSOLE_READ_ONLY;
5824
5825 if (arg_console_mode == CONSOLE_PIPE) /* if we pass STDERR on to the container, don't add our own logs into it too */
5826 arg_quiet = true;
5827
5828 if (!arg_quiet)
5829 log_info("Spawning container %s on %s.\nPress Ctrl-] three times within 1s to kill container.",
5830 arg_machine, arg_image ?: arg_directory);
5831
5832 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, SIGRTMIN+18, -1) >= 0);
5833
5834 r = make_reaper_process(true);
5835 if (r < 0) {
5836 log_error_errno(r, "Failed to become subreaper: %m");
5837 goto finish;
5838 }
5839
5840 if (arg_expose_ports) {
5841 r = fw_ctx_new(&fw_ctx);
5842 if (r < 0) {
5843 log_error_errno(r, "Cannot expose configured ports, firewall initialization failed: %m");
5844 goto finish;
5845 }
5846 expose_args.fw_ctx = fw_ctx;
5847 }
5848 for (;;) {
5849 r = run_container(dissected_image,
5850 fds,
5851 veth_name, &veth_created,
5852 &expose_args, &master,
5853 &pid, &ret);
5854 if (r <= 0)
5855 break;
5856 }
5857
5858 finish:
5859 (void) sd_notify(false,
5860 r == 0 && ret == EXIT_FORCE_RESTART ? "STOPPING=1\nSTATUS=Restarting..." :
5861 "STOPPING=1\nSTATUS=Terminating...");
5862
5863 if (pid > 0)
5864 (void) kill(pid, SIGKILL);
5865
5866 /* Try to flush whatever is still queued in the pty */
5867 if (master >= 0) {
5868 (void) copy_bytes(master, STDOUT_FILENO, UINT64_MAX, 0);
5869 master = safe_close(master);
5870 }
5871
5872 if (pid > 0)
5873 (void) wait_for_terminate(pid, NULL);
5874
5875 pager_close();
5876
5877 if (remove_directory && arg_directory) {
5878 int k;
5879
5880 k = rm_rf(arg_directory, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
5881 if (k < 0)
5882 log_warning_errno(k, "Cannot remove '%s', ignoring: %m", arg_directory);
5883 }
5884
5885 if (remove_image && arg_image) {
5886 if (unlink(arg_image) < 0)
5887 log_warning_errno(errno, "Can't remove image file '%s', ignoring: %m", arg_image);
5888 }
5889
5890 if (remove_tmprootdir) {
5891 if (rmdir(tmprootdir) < 0)
5892 log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir);
5893 }
5894
5895 if (arg_machine) {
5896 const char *p;
5897
5898 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
5899 (void) rm_rf(p, REMOVE_ROOT);
5900 }
5901
5902 expose_port_flush(&fw_ctx, arg_expose_ports, AF_INET, &expose_args.address4);
5903 expose_port_flush(&fw_ctx, arg_expose_ports, AF_INET6, &expose_args.address6);
5904
5905 if (veth_created)
5906 (void) remove_veth_links(veth_name, arg_network_veth_extra);
5907 (void) remove_bridge(arg_network_zone);
5908
5909 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
5910 expose_port_free_all(arg_expose_ports);
5911 rlimit_free_all(arg_rlimit);
5912 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
5913 credential_free_all(arg_credentials, arg_n_credentials);
5914
5915 if (r < 0)
5916 return r;
5917
5918 return ret;
5919 }
5920
5921 DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run); /* run() above returns r when it is negative, otherwise ret. NOTE(review): presumably this macro generates main() around run() and maps positive results to failure exit codes — confirm against its definition (likely main-func.h). */