]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
Merge pull request #23654 from keszybz/gcc-warnings
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #if HAVE_BLKID
4 #endif
5 #include <errno.h>
6 #include <getopt.h>
7 #include <linux/fs.h>
8 #include <linux/loop.h>
9 #if HAVE_SELINUX
10 #include <selinux/selinux.h>
11 #endif
12 #include <stdlib.h>
13 #include <sys/file.h>
14 #include <sys/ioctl.h>
15 #include <sys/personality.h>
16 #include <sys/prctl.h>
17 #include <sys/types.h>
18 #include <sys/wait.h>
19 #include <termios.h>
20 #include <unistd.h>
21
22 #include "sd-bus.h"
23 #include "sd-daemon.h"
24 #include "sd-id128.h"
25
26 #include "alloc-util.h"
27 #include "barrier.h"
28 #include "base-filesystem.h"
29 #include "blkid-util.h"
30 #include "btrfs-util.h"
31 #include "bus-error.h"
32 #include "bus-util.h"
33 #include "cap-list.h"
34 #include "capability-util.h"
35 #include "cgroup-util.h"
36 #include "chase-symlinks.h"
37 #include "copy.h"
38 #include "cpu-set-util.h"
39 #include "creds-util.h"
40 #include "dev-setup.h"
41 #include "discover-image.h"
42 #include "dissect-image.h"
43 #include "env-util.h"
44 #include "escape.h"
45 #include "fd-util.h"
46 #include "fdset.h"
47 #include "fileio.h"
48 #include "format-util.h"
49 #include "fs-util.h"
50 #include "gpt.h"
51 #include "hexdecoct.h"
52 #include "hostname-setup.h"
53 #include "hostname-util.h"
54 #include "id128-util.h"
55 #include "io-util.h"
56 #include "log.h"
57 #include "loop-util.h"
58 #include "loopback-setup.h"
59 #include "macro.h"
60 #include "main-func.h"
61 #include "missing_sched.h"
62 #include "mkdir.h"
63 #include "mount-util.h"
64 #include "mountpoint-util.h"
65 #include "namespace-util.h"
66 #include "netlink-util.h"
67 #include "nspawn-bind-user.h"
68 #include "nspawn-cgroup.h"
69 #include "nspawn-creds.h"
70 #include "nspawn-def.h"
71 #include "nspawn-expose-ports.h"
72 #include "nspawn-mount.h"
73 #include "nspawn-network.h"
74 #include "nspawn-oci.h"
75 #include "nspawn-patch-uid.h"
76 #include "nspawn-register.h"
77 #include "nspawn-seccomp.h"
78 #include "nspawn-settings.h"
79 #include "nspawn-setuid.h"
80 #include "nspawn-stub-pid1.h"
81 #include "nspawn-util.h"
82 #include "nspawn.h"
83 #include "nulstr-util.h"
84 #include "os-util.h"
85 #include "pager.h"
86 #include "parse-argument.h"
87 #include "parse-util.h"
88 #include "pretty-print.h"
89 #include "process-util.h"
90 #include "ptyfwd.h"
91 #include "random-util.h"
92 #include "raw-clone.h"
93 #include "resolve-util.h"
94 #include "rlimit-util.h"
95 #include "rm-rf.h"
96 #if HAVE_SECCOMP
97 #include "seccomp-util.h"
98 #endif
99 #include "selinux-util.h"
100 #include "signal-util.h"
101 #include "socket-util.h"
102 #include "stat-util.h"
103 #include "stdio-util.h"
104 #include "string-table.h"
105 #include "string-util.h"
106 #include "strv.h"
107 #include "sysctl-util.h"
108 #include "terminal-util.h"
109 #include "tmpfile-util.h"
110 #include "umask-util.h"
111 #include "unit-name.h"
112 #include "user-util.h"
113 #include "util.h"
114
/* Path of the notify socket inside the container, which the payload can use to talk to nspawn via the sd_notify(3) protocol */
116 #define NSPAWN_NOTIFY_SOCKET_PATH "/run/host/notify"
117
118 #define EXIT_FORCE_RESTART 133
119
/* Final state of a container run. NOTE(review): the code consuming these values is not part of this
 * excerpt — presumably it distinguishes a plain exit from a reboot request; confirm at the call sites. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,
        CONTAINER_REBOOTED,
} ContainerStatus;
124
/* Process-wide configuration of the container to spawn. The initializers below are the defaults; the
 * values are overridden while parsing the command line (parse_argv()) and the environment
 * (parse_environment()). */
static char *arg_directory = NULL;
static char *arg_template = NULL;
static char *arg_chdir = NULL;
static char *arg_pivot_root_new = NULL;
static char *arg_pivot_root_old = NULL;
static char *arg_user = NULL;
static uid_t arg_uid = UID_INVALID;
static gid_t arg_gid = GID_INVALID;
static gid_t* arg_supplementary_gids = NULL;
static size_t arg_n_supplementary_gids = 0;
static sd_id128_t arg_uuid = {};
static char *arg_machine = NULL; /* The name used by the host to refer to this */
static char *arg_hostname = NULL; /* The name the payload sees by default */
static const char *arg_selinux_context = NULL;
static const char *arg_selinux_apifs_context = NULL;
static char *arg_slice = NULL;
static bool arg_private_network = false;
static bool arg_read_only = false;
static StartMode arg_start_mode = START_PID1;
static bool arg_ephemeral = false;
static LinkJournal arg_link_journal = LINK_AUTO;
static bool arg_link_journal_try = false;
/* Capabilities retained by default; per the --help text, --capability= adds to and --drop-capability=
 * removes from this default set. */
static uint64_t arg_caps_retain =
        (1ULL << CAP_AUDIT_CONTROL) |
        (1ULL << CAP_AUDIT_WRITE) |
        (1ULL << CAP_CHOWN) |
        (1ULL << CAP_DAC_OVERRIDE) |
        (1ULL << CAP_DAC_READ_SEARCH) |
        (1ULL << CAP_FOWNER) |
        (1ULL << CAP_FSETID) |
        (1ULL << CAP_IPC_OWNER) |
        (1ULL << CAP_KILL) |
        (1ULL << CAP_LEASE) |
        (1ULL << CAP_LINUX_IMMUTABLE) |
        (1ULL << CAP_MKNOD) |
        (1ULL << CAP_NET_BIND_SERVICE) |
        (1ULL << CAP_NET_BROADCAST) |
        (1ULL << CAP_NET_RAW) |
        (1ULL << CAP_SETFCAP) |
        (1ULL << CAP_SETGID) |
        (1ULL << CAP_SETPCAP) |
        (1ULL << CAP_SETUID) |
        (1ULL << CAP_SYS_ADMIN) |
        (1ULL << CAP_SYS_BOOT) |
        (1ULL << CAP_SYS_CHROOT) |
        (1ULL << CAP_SYS_NICE) |
        (1ULL << CAP_SYS_PTRACE) |
        (1ULL << CAP_SYS_RESOURCE) |
        (1ULL << CAP_SYS_TTY_CONFIG);
static uint64_t arg_caps_ambient = 0;
static CapabilityQuintet arg_full_capabilities = CAPABILITY_QUINTET_NULL;
static CustomMount *arg_custom_mounts = NULL;
static size_t arg_n_custom_mounts = 0;
static char **arg_setenv = NULL;
static bool arg_quiet = false;
static bool arg_register = true;
static bool arg_keep_unit = false;
static char **arg_network_interfaces = NULL;
static char **arg_network_macvlan = NULL;
static char **arg_network_ipvlan = NULL;
static bool arg_network_veth = false;
static char **arg_network_veth_extra = NULL;
static char *arg_network_bridge = NULL;
static char *arg_network_zone = NULL;
static char *arg_network_namespace_path = NULL;
static PagerFlags arg_pager_flags = 0;
static unsigned long arg_personality = PERSONALITY_INVALID;
static char *arg_image = NULL;
static char *arg_oci_bundle = NULL;
static VolatileMode arg_volatile_mode = VOLATILE_NO;
static ExposePort *arg_expose_ports = NULL;
static char **arg_property = NULL;
static sd_bus_message *arg_property_message = NULL;
static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
static UserNamespaceOwnership arg_userns_ownership = _USER_NAMESPACE_OWNERSHIP_INVALID;
static int arg_kill_signal = 0;
/* Stays CGROUP_UNIFIED_UNKNOWN until picked from the environment or by inspecting the image (see the
 * detect_unified_cgroup_hierarchy_from_*() helpers). */
static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN;
/* SETTING_* bits are ORed in as the corresponding settings are configured explicitly */
static SettingsMask arg_settings_mask = 0;
static int arg_settings_trusted = -1;
static char **arg_parameters = NULL;
static const char *arg_container_service_name = "systemd-nspawn"; /* Overridable via $SYSTEMD_NSPAWN_CONTAINER_SERVICE */
static bool arg_notify_ready = false;
static bool arg_use_cgns = true; /* Forced off in parse_environment() if cg_ns_supported() fails; see $SYSTEMD_NSPAWN_USE_CGNS */
/* CLONE_NEW* flags; individual bits can be toggled through the $SYSTEMD_NSPAWN_SHARE_* environment
 * variables (see parse_share_ns_env()). */
static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
/* Adjusted by $SYSTEMD_NSPAWN_TMPFS_TMP and $SYSTEMD_NSPAWN_API_VFS_WRITABLE (see parse_mount_settings_env()) */
static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_TMPFS_TMP;
static VeritySettings arg_verity_settings = VERITY_SETTINGS_DEFAULT;
static char **arg_syscall_allow_list = NULL;
static char **arg_syscall_deny_list = NULL;
#if HAVE_SECCOMP
static scmp_filter_ctx arg_seccomp = NULL;
#endif
static struct rlimit *arg_rlimit[_RLIMIT_MAX] = {};
static bool arg_no_new_privileges = false;
static int arg_oom_score_adjust = 0;
static bool arg_oom_score_adjust_set = false;
static CPUSet arg_cpu_set = {};
static ResolvConfMode arg_resolv_conf = RESOLV_CONF_AUTO;
static TimezoneMode arg_timezone = TIMEZONE_AUTO;
static unsigned arg_console_width = UINT_MAX, arg_console_height = UINT_MAX;
static DeviceNode* arg_extra_nodes = NULL;
static size_t arg_n_extra_nodes = 0;
static char **arg_sysctl = NULL;
static ConsoleMode arg_console_mode = _CONSOLE_MODE_INVALID; /* Selected by handle_arg_console() */
static Credential *arg_credentials = NULL;
static size_t arg_n_credentials = 0;
static char **arg_bind_user = NULL;
static bool arg_suppress_sync = false; /* Also settable through $SYSTEMD_SUPPRESS_SYNC */
static char *arg_settings_filename = NULL;
234
/* Pair each heap-owning setting with the function that releases it. NOTE(review): the macro itself is
 * defined outside this file — presumably the registered destructors run when the program's main
 * function returns; confirm against its definition. Plain strings use freep, string vectors strv_freep,
 * everything else its type-specific cleanup. */
STATIC_DESTRUCTOR_REGISTER(arg_directory, freep);
STATIC_DESTRUCTOR_REGISTER(arg_template, freep);
STATIC_DESTRUCTOR_REGISTER(arg_chdir, freep);
STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_new, freep);
STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_old, freep);
STATIC_DESTRUCTOR_REGISTER(arg_user, freep);
STATIC_DESTRUCTOR_REGISTER(arg_supplementary_gids, freep);
STATIC_DESTRUCTOR_REGISTER(arg_machine, freep);
STATIC_DESTRUCTOR_REGISTER(arg_hostname, freep);
STATIC_DESTRUCTOR_REGISTER(arg_slice, freep);
STATIC_DESTRUCTOR_REGISTER(arg_setenv, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_interfaces, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_macvlan, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_ipvlan, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_veth_extra, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_bridge, freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_zone, freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_namespace_path, freep);
STATIC_DESTRUCTOR_REGISTER(arg_image, freep);
STATIC_DESTRUCTOR_REGISTER(arg_oci_bundle, freep);
STATIC_DESTRUCTOR_REGISTER(arg_property, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_property_message, sd_bus_message_unrefp);
STATIC_DESTRUCTOR_REGISTER(arg_parameters, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_verity_settings, verity_settings_done);
STATIC_DESTRUCTOR_REGISTER(arg_syscall_allow_list, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_syscall_deny_list, strv_freep);
#if HAVE_SECCOMP
/* arg_seccomp only exists when built with seccomp support */
STATIC_DESTRUCTOR_REGISTER(arg_seccomp, seccomp_releasep);
#endif
STATIC_DESTRUCTOR_REGISTER(arg_cpu_set, cpu_set_reset);
STATIC_DESTRUCTOR_REGISTER(arg_sysctl, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_bind_user, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_settings_filename, freep);
268
269 static int handle_arg_console(const char *arg) {
270 if (streq(arg, "help")) {
271 puts("autopipe\n"
272 "interactive\n"
273 "passive\n"
274 "pipe\n"
275 "read-only");
276 return 0;
277 }
278
279 if (streq(arg, "interactive"))
280 arg_console_mode = CONSOLE_INTERACTIVE;
281 else if (streq(arg, "read-only"))
282 arg_console_mode = CONSOLE_READ_ONLY;
283 else if (streq(arg, "passive"))
284 arg_console_mode = CONSOLE_PASSIVE;
285 else if (streq(arg, "pipe")) {
286 if (isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0)
287 log_full(arg_quiet ? LOG_DEBUG : LOG_NOTICE,
288 "Console mode 'pipe' selected, but standard input/output are connected to an interactive TTY. "
289 "Most likely you want to use 'interactive' console mode for proper interactivity and shell job control. "
290 "Proceeding anyway.");
291
292 arg_console_mode = CONSOLE_PIPE;
293 } else if (streq(arg, "autopipe")) {
294 if (isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0)
295 arg_console_mode = CONSOLE_INTERACTIVE;
296 else
297 arg_console_mode = CONSOLE_PIPE;
298 } else
299 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown console mode: %s", optarg);
300
301 arg_settings_mask |= SETTING_CONSOLE_MODE;
302 return 1;
303 }
304
305 static int help(void) {
306 _cleanup_free_ char *link = NULL;
307 int r;
308
309 pager_open(arg_pager_flags);
310
311 r = terminal_urlify_man("systemd-nspawn", "1", &link);
312 if (r < 0)
313 return log_oom();
314
315 printf("%1$s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
316 "%5$sSpawn a command or OS in a light-weight container.%6$s\n\n"
317 " -h --help Show this help\n"
318 " --version Print version string\n"
319 " -q --quiet Do not show status information\n"
320 " --no-pager Do not pipe output into a pager\n"
321 " --settings=BOOLEAN Load additional settings from .nspawn file\n\n"
322 "%3$sImage:%4$s\n"
323 " -D --directory=PATH Root directory for the container\n"
324 " --template=PATH Initialize root directory from template directory,\n"
325 " if missing\n"
326 " -x --ephemeral Run container with snapshot of root directory, and\n"
327 " remove it after exit\n"
328 " -i --image=PATH Root file system disk image (or device node) for\n"
329 " the container\n"
330 " --oci-bundle=PATH OCI bundle directory\n"
331 " --read-only Mount the root directory read-only\n"
332 " --volatile[=MODE] Run the system in volatile mode\n"
333 " --root-hash=HASH Specify verity root hash for root disk image\n"
334 " --root-hash-sig=SIG Specify pkcs7 signature of root hash for verity\n"
335 " as a DER encoded PKCS7, either as a path to a file\n"
336 " or as an ASCII base64 encoded string prefixed by\n"
337 " 'base64:'\n"
338 " --verity-data=PATH Specify hash device for verity\n"
339 " --pivot-root=PATH[:PATH]\n"
340 " Pivot root to given directory in the container\n\n"
341 "%3$sExecution:%4$s\n"
342 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
343 " -b --boot Boot up full system (i.e. invoke init)\n"
344 " --chdir=PATH Set working directory in the container\n"
345 " -E --setenv=NAME[=VALUE] Pass an environment variable to PID 1\n"
346 " -u --user=USER Run the command under specified user or UID\n"
347 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
348 " --notify-ready=BOOLEAN Receive notifications from the child init process\n"
349 " --suppress-sync=BOOLEAN\n"
350 " Suppress any form of disk data synchronization\n\n"
351 "%3$sSystem Identity:%4$s\n"
352 " -M --machine=NAME Set the machine name for the container\n"
353 " --hostname=NAME Override the hostname for the container\n"
354 " --uuid=UUID Set a specific machine UUID for the container\n\n"
355 "%3$sProperties:%4$s\n"
356 " -S --slice=SLICE Place the container in the specified slice\n"
357 " --property=NAME=VALUE Set scope unit property\n"
358 " --register=BOOLEAN Register container as machine\n"
359 " --keep-unit Do not register a scope for the machine, reuse\n"
360 " the service unit nspawn is running in\n\n"
361 "%3$sUser Namespacing:%4$s\n"
362 " -U --private-users=pick Run within user namespace, autoselect UID/GID range\n"
363 " --private-users[=UIDBASE[:NUIDS]]\n"
364 " Similar, but with user configured UID/GID range\n"
365 " --private-users-ownership=MODE\n"
366 " Adjust ('chown') or map ('map') OS tree ownership\n"
367 " to private UID/GID range\n\n"
368 "%3$sNetworking:%4$s\n"
369 " --private-network Disable network in container\n"
370 " --network-interface=INTERFACE\n"
371 " Assign an existing network interface to the\n"
372 " container\n"
373 " --network-macvlan=INTERFACE\n"
374 " Create a macvlan network interface based on an\n"
375 " existing network interface to the container\n"
376 " --network-ipvlan=INTERFACE\n"
377 " Create an ipvlan network interface based on an\n"
378 " existing network interface to the container\n"
379 " -n --network-veth Add a virtual Ethernet connection between host\n"
380 " and container\n"
381 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
382 " Add an additional virtual Ethernet link between\n"
383 " host and container\n"
384 " --network-bridge=INTERFACE\n"
385 " Add a virtual Ethernet connection to the container\n"
386 " and attach it to an existing bridge on the host\n"
387 " --network-zone=NAME Similar, but attach the new interface to an\n"
388 " an automatically managed bridge interface\n"
389 " --network-namespace-path=PATH\n"
390 " Set network namespace to the one represented by\n"
391 " the specified kernel namespace file node\n"
392 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
393 " Expose a container IP port on the host\n\n"
394 "%3$sSecurity:%4$s\n"
395 " --capability=CAP In addition to the default, retain specified\n"
396 " capability\n"
397 " --drop-capability=CAP Drop the specified capability from the default set\n"
398 " --ambient-capability=CAP\n"
399 " Sets the specified capability for the started\n"
400 " process. Not useful if booting a machine.\n"
401 " --no-new-privileges Set PR_SET_NO_NEW_PRIVS flag for container payload\n"
402 " --system-call-filter=LIST|~LIST\n"
403 " Permit/prohibit specific system calls\n"
404 " -Z --selinux-context=SECLABEL\n"
405 " Set the SELinux security context to be used by\n"
406 " processes in the container\n"
407 " -L --selinux-apifs-context=SECLABEL\n"
408 " Set the SELinux security context to be used by\n"
409 " API/tmpfs file systems in the container\n\n"
410 "%3$sResources:%4$s\n"
411 " --rlimit=NAME=LIMIT Set a resource limit for the payload\n"
412 " --oom-score-adjust=VALUE\n"
413 " Adjust the OOM score value for the payload\n"
414 " --cpu-affinity=CPUS Adjust the CPU affinity of the container\n"
415 " --personality=ARCH Pick personality for this container\n\n"
416 "%3$sIntegration:%4$s\n"
417 " --resolv-conf=MODE Select mode of /etc/resolv.conf initialization\n"
418 " --timezone=MODE Select mode of /etc/localtime initialization\n"
419 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
420 " host, try-guest, try-host\n"
421 " -j Equivalent to --link-journal=try-guest\n\n"
422 "%3$sMounts:%4$s\n"
423 " --bind=PATH[:PATH[:OPTIONS]]\n"
424 " Bind mount a file or directory from the host into\n"
425 " the container\n"
426 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
427 " Similar, but creates a read-only bind mount\n"
428 " --inaccessible=PATH Over-mount file node with inaccessible node to mask\n"
429 " it\n"
430 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
431 " --overlay=PATH[:PATH...]:PATH\n"
432 " Create an overlay mount from the host to \n"
433 " the container\n"
434 " --overlay-ro=PATH[:PATH...]:PATH\n"
435 " Similar, but creates a read-only overlay mount\n"
436 " --bind-user=NAME Bind user from host to container\n\n"
437 "%3$sInput/Output:%4$s\n"
438 " --console=MODE Select how stdin/stdout/stderr and /dev/console are\n"
439 " set up for the container.\n"
440 " -P --pipe Equivalent to --console=pipe\n\n"
441 "%3$sCredentials:%4$s\n"
442 " --set-credential=ID:VALUE\n"
443 " Pass a credential with literal value to container.\n"
444 " --load-credential=ID:PATH\n"
445 " Load credential to pass to container from file or\n"
446 " AF_UNIX stream socket.\n"
447 "\nSee the %2$s for details.\n",
448 program_invocation_short_name,
449 link,
450 ansi_underline(),
451 ansi_normal(),
452 ansi_highlight(),
453 ansi_normal());
454
455 return 0;
456 }
457
458 static int custom_mount_check_all(void) {
459 size_t i;
460
461 for (i = 0; i < arg_n_custom_mounts; i++) {
462 CustomMount *m = &arg_custom_mounts[i];
463
464 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
465 if (arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_OFF)
466 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
467 "--private-users-ownership=own may not be combined with custom root mounts.");
468 if (arg_uid_shift == UID_INVALID)
469 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
470 "--private-users with automatic UID shift may not be combined with custom root mounts.");
471 }
472 }
473
474 return 0;
475 }
476
477 static int detect_unified_cgroup_hierarchy_from_environment(void) {
478 const char *e, *var = "SYSTEMD_NSPAWN_UNIFIED_HIERARCHY";
479 int r;
480
481 /* Allow the user to control whether the unified hierarchy is used */
482
483 e = getenv(var);
484 if (!e) {
485 /* $UNIFIED_CGROUP_HIERARCHY has been renamed to $SYSTEMD_NSPAWN_UNIFIED_HIERARCHY. */
486 var = "UNIFIED_CGROUP_HIERARCHY";
487 e = getenv(var);
488 }
489
490 if (!isempty(e)) {
491 r = parse_boolean(e);
492 if (r < 0)
493 return log_error_errno(r, "Failed to parse $%s: %m", var);
494 if (r > 0)
495 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
496 else
497 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
498 }
499
500 return 0;
501 }
502
503 static int detect_unified_cgroup_hierarchy_from_image(const char *directory) {
504 int r;
505
506 /* Let's inherit the mode to use from the host system, but let's take into consideration what systemd
507 * in the image actually supports. */
508 r = cg_all_unified();
509 if (r < 0)
510 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
511 if (r > 0) {
512 /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
513 * routine only detects 231, so we'll have a false negative here for 230. */
514 r = systemd_installation_has_version(directory, "230");
515 if (r < 0)
516 return log_error_errno(r, "Failed to determine systemd version in container: %m");
517 if (r > 0)
518 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
519 else
520 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
521 } else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
522 /* Mixed cgroup hierarchy support was added in 233 */
523 r = systemd_installation_has_version(directory, "233");
524 if (r < 0)
525 return log_error_errno(r, "Failed to determine systemd version in container: %m");
526 if (r > 0)
527 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
528 else
529 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
530 } else
531 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
532
533 log_debug("Using %s hierarchy for container.",
534 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_NONE ? "legacy" :
535 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_SYSTEMD ? "hybrid" : "unified");
536
537 return 0;
538 }
539
540 static int parse_capability_spec(const char *spec, uint64_t *ret_mask) {
541 uint64_t mask = 0;
542 int r;
543
544 for (;;) {
545 _cleanup_free_ char *t = NULL;
546
547 r = extract_first_word(&spec, &t, ",", 0);
548 if (r < 0)
549 return log_error_errno(r, "Failed to parse capability %s.", t);
550 if (r == 0)
551 break;
552
553 if (streq(t, "help")) {
554 for (int i = 0; i < capability_list_length(); i++) {
555 const char *name;
556
557 name = capability_to_name(i);
558 if (name)
559 puts(name);
560 }
561
562 return 0; /* quit */
563 }
564
565 if (streq(t, "all"))
566 mask = UINT64_MAX;
567 else {
568 r = capability_from_name(t);
569 if (r < 0)
570 return log_error_errno(r, "Failed to parse capability %s.", t);
571
572 mask |= 1ULL << r;
573 }
574 }
575
576 *ret_mask = mask;
577 return 1; /* continue */
578 }
579
580 static int parse_share_ns_env(const char *name, unsigned long ns_flag) {
581 int r;
582
583 r = getenv_bool(name);
584 if (r == -ENXIO)
585 return 0;
586 if (r < 0)
587 return log_error_errno(r, "Failed to parse $%s: %m", name);
588
589 arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
590 arg_settings_mask |= SETTING_CLONE_NS_FLAGS;
591 return 0;
592 }
593
594 static int parse_mount_settings_env(void) {
595 const char *e;
596 int r;
597
598 r = getenv_bool("SYSTEMD_NSPAWN_TMPFS_TMP");
599 if (r < 0 && r != -ENXIO)
600 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_TMPFS_TMP: %m");
601 if (r >= 0)
602 SET_FLAG(arg_mount_settings, MOUNT_APPLY_TMPFS_TMP, r > 0);
603
604 e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
605 if (streq_ptr(e, "network"))
606 arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS;
607
608 else if (e) {
609 r = parse_boolean(e);
610 if (r < 0)
611 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_API_VFS_WRITABLE: %m");
612
613 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_RO, r == 0);
614 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_NETNS, false);
615 }
616
617 return 0;
618 }
619
620 static int parse_environment(void) {
621 const char *e;
622 int r;
623
624 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
625 if (r < 0)
626 return r;
627 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
628 if (r < 0)
629 return r;
630 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
631 if (r < 0)
632 return r;
633 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
634 if (r < 0)
635 return r;
636
637 r = parse_mount_settings_env();
638 if (r < 0)
639 return r;
640
641 /* SYSTEMD_NSPAWN_USE_CGNS=0 can be used to disable CLONE_NEWCGROUP use,
642 * even if it is supported. If not supported, it has no effect. */
643 if (!cg_ns_supported())
644 arg_use_cgns = false;
645 else {
646 r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
647 if (r < 0) {
648 if (r != -ENXIO)
649 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_USE_CGNS: %m");
650
651 arg_use_cgns = true;
652 } else {
653 arg_use_cgns = r > 0;
654 arg_settings_mask |= SETTING_USE_CGNS;
655 }
656 }
657
658 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
659 if (e)
660 arg_container_service_name = e;
661
662 r = getenv_bool("SYSTEMD_SUPPRESS_SYNC");
663 if (r >= 0)
664 arg_suppress_sync = r;
665 else if (r != -ENXIO)
666 log_debug_errno(r, "Failed to parse $SYSTEMD_SUPPRESS_SYNC, ignoring: %m");
667
668 return detect_unified_cgroup_hierarchy_from_environment();
669 }
670
671 static int parse_argv(int argc, char *argv[]) {
672 enum {
673 ARG_VERSION = 0x100,
674 ARG_PRIVATE_NETWORK,
675 ARG_UUID,
676 ARG_READ_ONLY,
677 ARG_CAPABILITY,
678 ARG_AMBIENT_CAPABILITY,
679 ARG_DROP_CAPABILITY,
680 ARG_LINK_JOURNAL,
681 ARG_BIND,
682 ARG_BIND_RO,
683 ARG_TMPFS,
684 ARG_OVERLAY,
685 ARG_OVERLAY_RO,
686 ARG_INACCESSIBLE,
687 ARG_SHARE_SYSTEM,
688 ARG_REGISTER,
689 ARG_KEEP_UNIT,
690 ARG_NETWORK_INTERFACE,
691 ARG_NETWORK_MACVLAN,
692 ARG_NETWORK_IPVLAN,
693 ARG_NETWORK_BRIDGE,
694 ARG_NETWORK_ZONE,
695 ARG_NETWORK_VETH_EXTRA,
696 ARG_NETWORK_NAMESPACE_PATH,
697 ARG_PERSONALITY,
698 ARG_VOLATILE,
699 ARG_TEMPLATE,
700 ARG_PROPERTY,
701 ARG_PRIVATE_USERS,
702 ARG_KILL_SIGNAL,
703 ARG_SETTINGS,
704 ARG_CHDIR,
705 ARG_PIVOT_ROOT,
706 ARG_PRIVATE_USERS_CHOWN,
707 ARG_PRIVATE_USERS_OWNERSHIP,
708 ARG_NOTIFY_READY,
709 ARG_ROOT_HASH,
710 ARG_ROOT_HASH_SIG,
711 ARG_VERITY_DATA,
712 ARG_SYSTEM_CALL_FILTER,
713 ARG_RLIMIT,
714 ARG_HOSTNAME,
715 ARG_NO_NEW_PRIVILEGES,
716 ARG_OOM_SCORE_ADJUST,
717 ARG_CPU_AFFINITY,
718 ARG_RESOLV_CONF,
719 ARG_TIMEZONE,
720 ARG_CONSOLE,
721 ARG_PIPE,
722 ARG_OCI_BUNDLE,
723 ARG_NO_PAGER,
724 ARG_SET_CREDENTIAL,
725 ARG_LOAD_CREDENTIAL,
726 ARG_BIND_USER,
727 ARG_SUPPRESS_SYNC,
728 };
729
730 static const struct option options[] = {
731 { "help", no_argument, NULL, 'h' },
732 { "version", no_argument, NULL, ARG_VERSION },
733 { "directory", required_argument, NULL, 'D' },
734 { "template", required_argument, NULL, ARG_TEMPLATE },
735 { "ephemeral", no_argument, NULL, 'x' },
736 { "user", required_argument, NULL, 'u' },
737 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
738 { "as-pid2", no_argument, NULL, 'a' },
739 { "boot", no_argument, NULL, 'b' },
740 { "uuid", required_argument, NULL, ARG_UUID },
741 { "read-only", no_argument, NULL, ARG_READ_ONLY },
742 { "capability", required_argument, NULL, ARG_CAPABILITY },
743 { "ambient-capability", required_argument, NULL, ARG_AMBIENT_CAPABILITY },
744 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
745 { "no-new-privileges", required_argument, NULL, ARG_NO_NEW_PRIVILEGES },
746 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
747 { "bind", required_argument, NULL, ARG_BIND },
748 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
749 { "tmpfs", required_argument, NULL, ARG_TMPFS },
750 { "overlay", required_argument, NULL, ARG_OVERLAY },
751 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
752 { "inaccessible", required_argument, NULL, ARG_INACCESSIBLE },
753 { "machine", required_argument, NULL, 'M' },
754 { "hostname", required_argument, NULL, ARG_HOSTNAME },
755 { "slice", required_argument, NULL, 'S' },
756 { "setenv", required_argument, NULL, 'E' },
757 { "selinux-context", required_argument, NULL, 'Z' },
758 { "selinux-apifs-context", required_argument, NULL, 'L' },
759 { "quiet", no_argument, NULL, 'q' },
760 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */
761 { "register", required_argument, NULL, ARG_REGISTER },
762 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
763 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
764 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
765 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
766 { "network-veth", no_argument, NULL, 'n' },
767 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA },
768 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
769 { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
770 { "network-namespace-path", required_argument, NULL, ARG_NETWORK_NAMESPACE_PATH },
771 { "personality", required_argument, NULL, ARG_PERSONALITY },
772 { "image", required_argument, NULL, 'i' },
773 { "volatile", optional_argument, NULL, ARG_VOLATILE },
774 { "port", required_argument, NULL, 'p' },
775 { "property", required_argument, NULL, ARG_PROPERTY },
776 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
777 { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN }, /* obsolete */
778 { "private-users-ownership",required_argument, NULL, ARG_PRIVATE_USERS_OWNERSHIP},
779 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
780 { "settings", required_argument, NULL, ARG_SETTINGS },
781 { "chdir", required_argument, NULL, ARG_CHDIR },
782 { "pivot-root", required_argument, NULL, ARG_PIVOT_ROOT },
783 { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
784 { "root-hash", required_argument, NULL, ARG_ROOT_HASH },
785 { "root-hash-sig", required_argument, NULL, ARG_ROOT_HASH_SIG },
786 { "verity-data", required_argument, NULL, ARG_VERITY_DATA },
787 { "system-call-filter", required_argument, NULL, ARG_SYSTEM_CALL_FILTER },
788 { "rlimit", required_argument, NULL, ARG_RLIMIT },
789 { "oom-score-adjust", required_argument, NULL, ARG_OOM_SCORE_ADJUST },
790 { "cpu-affinity", required_argument, NULL, ARG_CPU_AFFINITY },
791 { "resolv-conf", required_argument, NULL, ARG_RESOLV_CONF },
792 { "timezone", required_argument, NULL, ARG_TIMEZONE },
793 { "console", required_argument, NULL, ARG_CONSOLE },
794 { "pipe", no_argument, NULL, ARG_PIPE },
795 { "oci-bundle", required_argument, NULL, ARG_OCI_BUNDLE },
796 { "no-pager", no_argument, NULL, ARG_NO_PAGER },
797 { "set-credential", required_argument, NULL, ARG_SET_CREDENTIAL },
798 { "load-credential", required_argument, NULL, ARG_LOAD_CREDENTIAL },
799 { "bind-user", required_argument, NULL, ARG_BIND_USER },
800 { "suppress-sync", required_argument, NULL, ARG_SUPPRESS_SYNC },
801 {}
802 };
803
804 int c, r;
805 uint64_t plus = 0, minus = 0;
806 bool mask_all_settings = false, mask_no_settings = false;
807
808 assert(argc >= 0);
809 assert(argv);
810
811 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nUE:P", options, NULL)) >= 0)
812 switch (c) {
813
814 case 'h':
815 return help();
816
817 case ARG_VERSION:
818 return version();
819
820 case 'D':
821 r = parse_path_argument(optarg, false, &arg_directory);
822 if (r < 0)
823 return r;
824
825 arg_settings_mask |= SETTING_DIRECTORY;
826 break;
827
828 case ARG_TEMPLATE:
829 r = parse_path_argument(optarg, false, &arg_template);
830 if (r < 0)
831 return r;
832
833 arg_settings_mask |= SETTING_DIRECTORY;
834 break;
835
836 case 'i':
837 r = parse_path_argument(optarg, false, &arg_image);
838 if (r < 0)
839 return r;
840
841 arg_settings_mask |= SETTING_DIRECTORY;
842 break;
843
844 case ARG_OCI_BUNDLE:
845 r = parse_path_argument(optarg, false, &arg_oci_bundle);
846 if (r < 0)
847 return r;
848
849 break;
850
851 case 'x':
852 arg_ephemeral = true;
853 arg_settings_mask |= SETTING_EPHEMERAL;
854 break;
855
856 case 'u':
857 r = free_and_strdup(&arg_user, optarg);
858 if (r < 0)
859 return log_oom();
860
861 arg_settings_mask |= SETTING_USER;
862 break;
863
864 case ARG_NETWORK_ZONE: {
865 char *j;
866
867 j = strjoin("vz-", optarg);
868 if (!j)
869 return log_oom();
870
871 if (!ifname_valid(j)) {
872 log_error("Network zone name not valid: %s", j);
873 free(j);
874 return -EINVAL;
875 }
876
877 free_and_replace(arg_network_zone, j);
878
879 arg_network_veth = true;
880 arg_private_network = true;
881 arg_settings_mask |= SETTING_NETWORK;
882 break;
883 }
884
885 case ARG_NETWORK_BRIDGE:
886
887 if (!ifname_valid(optarg))
888 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
889 "Bridge interface name not valid: %s", optarg);
890
891 r = free_and_strdup(&arg_network_bridge, optarg);
892 if (r < 0)
893 return log_oom();
894
895 _fallthrough_;
896 case 'n':
897 arg_network_veth = true;
898 arg_private_network = true;
899 arg_settings_mask |= SETTING_NETWORK;
900 break;
901
902 case ARG_NETWORK_VETH_EXTRA:
903 r = veth_extra_parse(&arg_network_veth_extra, optarg);
904 if (r < 0)
905 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
906
907 arg_private_network = true;
908 arg_settings_mask |= SETTING_NETWORK;
909 break;
910
911 case ARG_NETWORK_INTERFACE:
912 if (!ifname_valid(optarg))
913 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
914 "Network interface name not valid: %s", optarg);
915
916 r = test_network_interface_initialized(optarg);
917 if (r < 0)
918 return r;
919
920 if (strv_extend(&arg_network_interfaces, optarg) < 0)
921 return log_oom();
922
923 arg_private_network = true;
924 arg_settings_mask |= SETTING_NETWORK;
925 break;
926
927 case ARG_NETWORK_MACVLAN:
928
929 if (!ifname_valid(optarg))
930 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
931 "MACVLAN network interface name not valid: %s", optarg);
932
933 r = test_network_interface_initialized(optarg);
934 if (r < 0)
935 return r;
936
937 if (strv_extend(&arg_network_macvlan, optarg) < 0)
938 return log_oom();
939
940 arg_private_network = true;
941 arg_settings_mask |= SETTING_NETWORK;
942 break;
943
944 case ARG_NETWORK_IPVLAN:
945
946 if (!ifname_valid(optarg))
947 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
948 "IPVLAN network interface name not valid: %s", optarg);
949
950 r = test_network_interface_initialized(optarg);
951 if (r < 0)
952 return r;
953
954 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
955 return log_oom();
956
957 _fallthrough_;
958 case ARG_PRIVATE_NETWORK:
959 arg_private_network = true;
960 arg_settings_mask |= SETTING_NETWORK;
961 break;
962
963 case ARG_NETWORK_NAMESPACE_PATH:
964 r = parse_path_argument(optarg, false, &arg_network_namespace_path);
965 if (r < 0)
966 return r;
967
968 arg_settings_mask |= SETTING_NETWORK;
969 break;
970
971 case 'b':
972 if (arg_start_mode == START_PID2)
973 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
974 "--boot and --as-pid2 may not be combined.");
975
976 arg_start_mode = START_BOOT;
977 arg_settings_mask |= SETTING_START_MODE;
978 break;
979
980 case 'a':
981 if (arg_start_mode == START_BOOT)
982 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
983 "--boot and --as-pid2 may not be combined.");
984
985 arg_start_mode = START_PID2;
986 arg_settings_mask |= SETTING_START_MODE;
987 break;
988
989 case ARG_UUID:
990 r = sd_id128_from_string(optarg, &arg_uuid);
991 if (r < 0)
992 return log_error_errno(r, "Invalid UUID: %s", optarg);
993
994 if (sd_id128_is_null(arg_uuid))
995 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
996 "Machine UUID may not be all zeroes.");
997
998 arg_settings_mask |= SETTING_MACHINE_ID;
999 break;
1000
1001 case 'S': {
1002 _cleanup_free_ char *mangled = NULL;
1003
1004 r = unit_name_mangle_with_suffix(optarg, NULL, UNIT_NAME_MANGLE_WARN, ".slice", &mangled);
1005 if (r < 0)
1006 return log_oom();
1007
1008 free_and_replace(arg_slice, mangled);
1009 arg_settings_mask |= SETTING_SLICE;
1010 break;
1011 }
1012
1013 case 'M':
1014 if (isempty(optarg))
1015 arg_machine = mfree(arg_machine);
1016 else {
1017 if (!hostname_is_valid(optarg, 0))
1018 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1019 "Invalid machine name: %s", optarg);
1020
1021 r = free_and_strdup(&arg_machine, optarg);
1022 if (r < 0)
1023 return log_oom();
1024 }
1025 break;
1026
1027 case ARG_HOSTNAME:
1028 if (isempty(optarg))
1029 arg_hostname = mfree(arg_hostname);
1030 else {
1031 if (!hostname_is_valid(optarg, 0))
1032 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1033 "Invalid hostname: %s", optarg);
1034
1035 r = free_and_strdup(&arg_hostname, optarg);
1036 if (r < 0)
1037 return log_oom();
1038 }
1039
1040 arg_settings_mask |= SETTING_HOSTNAME;
1041 break;
1042
1043 case 'Z':
1044 arg_selinux_context = optarg;
1045 break;
1046
1047 case 'L':
1048 arg_selinux_apifs_context = optarg;
1049 break;
1050
1051 case ARG_READ_ONLY:
1052 arg_read_only = true;
1053 arg_settings_mask |= SETTING_READ_ONLY;
1054 break;
1055
1056 case ARG_AMBIENT_CAPABILITY: {
1057 uint64_t m;
1058 r = parse_capability_spec(optarg, &m);
1059 if (r <= 0)
1060 return r;
1061 arg_caps_ambient |= m;
1062 arg_settings_mask |= SETTING_CAPABILITY;
1063 break;
1064 }
1065 case ARG_CAPABILITY:
1066 case ARG_DROP_CAPABILITY: {
1067 uint64_t m;
1068 r = parse_capability_spec(optarg, &m);
1069 if (r <= 0)
1070 return r;
1071
1072 if (c == ARG_CAPABILITY)
1073 plus |= m;
1074 else
1075 minus |= m;
1076 arg_settings_mask |= SETTING_CAPABILITY;
1077 break;
1078 }
1079 case ARG_NO_NEW_PRIVILEGES:
1080 r = parse_boolean(optarg);
1081 if (r < 0)
1082 return log_error_errno(r, "Failed to parse --no-new-privileges= argument: %s", optarg);
1083
1084 arg_no_new_privileges = r;
1085 arg_settings_mask |= SETTING_NO_NEW_PRIVILEGES;
1086 break;
1087
1088 case 'j':
1089 arg_link_journal = LINK_GUEST;
1090 arg_link_journal_try = true;
1091 arg_settings_mask |= SETTING_LINK_JOURNAL;
1092 break;
1093
1094 case ARG_LINK_JOURNAL:
1095 r = parse_link_journal(optarg, &arg_link_journal, &arg_link_journal_try);
1096 if (r < 0)
1097 return log_error_errno(r, "Failed to parse link journal mode %s", optarg);
1098
1099 arg_settings_mask |= SETTING_LINK_JOURNAL;
1100 break;
1101
1102 case ARG_BIND:
1103 case ARG_BIND_RO:
1104 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
1105 if (r < 0)
1106 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
1107
1108 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1109 break;
1110
1111 case ARG_TMPFS:
1112 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
1113 if (r < 0)
1114 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
1115
1116 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1117 break;
1118
1119 case ARG_OVERLAY:
1120 case ARG_OVERLAY_RO:
1121 r = overlay_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_OVERLAY_RO);
1122 if (r == -EADDRNOTAVAIL)
1123 return log_error_errno(r, "--overlay(-ro)= needs at least two colon-separated directories specified.");
1124 if (r < 0)
1125 return log_error_errno(r, "Failed to parse --overlay(-ro)= argument %s: %m", optarg);
1126
1127 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1128 break;
1129
1130 case ARG_INACCESSIBLE:
1131 r = inaccessible_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
1132 if (r < 0)
1133 return log_error_errno(r, "Failed to parse --inaccessible= argument %s: %m", optarg);
1134
1135 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1136 break;
1137
1138 case 'E':
1139 r = strv_env_replace_strdup_passthrough(&arg_setenv, optarg);
1140 if (r < 0)
1141 return log_error_errno(r, "Cannot assign environment variable %s: %m", optarg);
1142
1143 arg_settings_mask |= SETTING_ENVIRONMENT;
1144 break;
1145
1146 case 'q':
1147 arg_quiet = true;
1148 break;
1149
1150 case ARG_SHARE_SYSTEM:
1151 /* We don't officially support this anymore, except for compat reasons. People should use the
1152 * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
1153 log_warning("Please do not use --share-system anymore, use $SYSTEMD_NSPAWN_SHARE_* instead.");
1154 arg_clone_ns_flags = 0;
1155 break;
1156
1157 case ARG_REGISTER:
1158 r = parse_boolean(optarg);
1159 if (r < 0) {
1160 log_error("Failed to parse --register= argument: %s", optarg);
1161 return r;
1162 }
1163
1164 arg_register = r;
1165 break;
1166
1167 case ARG_KEEP_UNIT:
1168 arg_keep_unit = true;
1169 break;
1170
1171 case ARG_PERSONALITY:
1172
1173 arg_personality = personality_from_string(optarg);
1174 if (arg_personality == PERSONALITY_INVALID)
1175 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1176 "Unknown or unsupported personality '%s'.", optarg);
1177
1178 arg_settings_mask |= SETTING_PERSONALITY;
1179 break;
1180
1181 case ARG_VOLATILE:
1182
1183 if (!optarg)
1184 arg_volatile_mode = VOLATILE_YES;
1185 else if (streq(optarg, "help")) {
1186 DUMP_STRING_TABLE(volatile_mode, VolatileMode, _VOLATILE_MODE_MAX);
1187 return 0;
1188 } else {
1189 VolatileMode m;
1190
1191 m = volatile_mode_from_string(optarg);
1192 if (m < 0)
1193 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1194 "Failed to parse --volatile= argument: %s", optarg);
1195 else
1196 arg_volatile_mode = m;
1197 }
1198
1199 arg_settings_mask |= SETTING_VOLATILE_MODE;
1200 break;
1201
1202 case 'p':
1203 r = expose_port_parse(&arg_expose_ports, optarg);
1204 if (r == -EEXIST)
1205 return log_error_errno(r, "Duplicate port specification: %s", optarg);
1206 if (r < 0)
1207 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
1208
1209 arg_settings_mask |= SETTING_EXPOSE_PORTS;
1210 break;
1211
1212 case ARG_PROPERTY:
1213 if (strv_extend(&arg_property, optarg) < 0)
1214 return log_oom();
1215
1216 break;
1217
1218 case ARG_PRIVATE_USERS: {
1219 int boolean;
1220
1221 if (!optarg)
1222 boolean = true;
1223 else if (!in_charset(optarg, DIGITS))
1224 /* do *not* parse numbers as booleans */
1225 boolean = parse_boolean(optarg);
1226 else
1227 boolean = -1;
1228
1229 if (boolean == 0) {
1230 /* no: User namespacing off */
1231 arg_userns_mode = USER_NAMESPACE_NO;
1232 arg_uid_shift = UID_INVALID;
1233 arg_uid_range = UINT32_C(0x10000);
1234 } else if (boolean > 0) {
1235 /* yes: User namespacing on, UID range is read from root dir */
1236 arg_userns_mode = USER_NAMESPACE_FIXED;
1237 arg_uid_shift = UID_INVALID;
1238 arg_uid_range = UINT32_C(0x10000);
1239 } else if (streq(optarg, "pick")) {
1240 /* pick: User namespacing on, UID range is picked randomly */
1241 arg_userns_mode = USER_NAMESPACE_PICK; /* Note that arg_userns_ownership is
1242 * implied by USER_NAMESPACE_PICK
1243 * further down. */
1244 arg_uid_shift = UID_INVALID;
1245 arg_uid_range = UINT32_C(0x10000);
1246
1247 } else if (streq(optarg, "identity")) {
1248 /* identitiy: User namespaces on, UID range is map the 0…0xFFFF range to
1249 * itself, i.e. we don't actually map anything, but do take benefit of
1250 * isolation of capability sets. */
1251 arg_userns_mode = USER_NAMESPACE_FIXED;
1252 arg_uid_shift = 0;
1253 arg_uid_range = UINT32_C(0x10000);
1254 } else {
1255 _cleanup_free_ char *buffer = NULL;
1256 const char *range, *shift;
1257
1258 /* anything else: User namespacing on, UID range is explicitly configured */
1259
1260 range = strchr(optarg, ':');
1261 if (range) {
1262 buffer = strndup(optarg, range - optarg);
1263 if (!buffer)
1264 return log_oom();
1265 shift = buffer;
1266
1267 range++;
1268 r = safe_atou32(range, &arg_uid_range);
1269 if (r < 0)
1270 return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range);
1271 } else
1272 shift = optarg;
1273
1274 r = parse_uid(shift, &arg_uid_shift);
1275 if (r < 0)
1276 return log_error_errno(r, "Failed to parse UID \"%s\": %m", optarg);
1277
1278 arg_userns_mode = USER_NAMESPACE_FIXED;
1279
1280 if (!userns_shift_range_valid(arg_uid_shift, arg_uid_range))
1281 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "UID range cannot be empty or go beyond " UID_FMT ".", UID_INVALID);
1282 }
1283
1284 arg_settings_mask |= SETTING_USERNS;
1285 break;
1286 }
1287
1288 case 'U':
1289 if (userns_supported()) {
1290 arg_userns_mode = USER_NAMESPACE_PICK; /* Note that arg_userns_ownership is
1291 * implied by USER_NAMESPACE_PICK
1292 * further down. */
1293 arg_uid_shift = UID_INVALID;
1294 arg_uid_range = UINT32_C(0x10000);
1295
1296 arg_settings_mask |= SETTING_USERNS;
1297 }
1298
1299 break;
1300
1301 case ARG_PRIVATE_USERS_CHOWN:
1302 arg_userns_ownership = USER_NAMESPACE_OWNERSHIP_CHOWN;
1303
1304 arg_settings_mask |= SETTING_USERNS;
1305 break;
1306
1307 case ARG_PRIVATE_USERS_OWNERSHIP:
1308 if (streq(optarg, "help")) {
1309 DUMP_STRING_TABLE(user_namespace_ownership, UserNamespaceOwnership, _USER_NAMESPACE_OWNERSHIP_MAX);
1310 return 0;
1311 }
1312
1313 arg_userns_ownership = user_namespace_ownership_from_string(optarg);
1314 if (arg_userns_ownership < 0)
1315 return log_error_errno(arg_userns_ownership, "Cannot parse --user-namespace-ownership= value: %s", optarg);
1316
1317 arg_settings_mask |= SETTING_USERNS;
1318 break;
1319
1320 case ARG_KILL_SIGNAL:
1321 if (streq(optarg, "help")) {
1322 DUMP_STRING_TABLE(signal, int, _NSIG);
1323 return 0;
1324 }
1325
1326 arg_kill_signal = signal_from_string(optarg);
1327 if (arg_kill_signal < 0)
1328 return log_error_errno(arg_kill_signal, "Cannot parse signal: %s", optarg);
1329
1330 arg_settings_mask |= SETTING_KILL_SIGNAL;
1331 break;
1332
1333 case ARG_SETTINGS:
1334
1335 /* no → do not read files
1336 * yes → read files, do not override cmdline, trust only subset
1337 * override → read files, override cmdline, trust only subset
1338 * trusted → read files, do not override cmdline, trust all
1339 */
1340
1341 r = parse_boolean(optarg);
1342 if (r < 0) {
1343 if (streq(optarg, "trusted")) {
1344 mask_all_settings = false;
1345 mask_no_settings = false;
1346 arg_settings_trusted = true;
1347
1348 } else if (streq(optarg, "override")) {
1349 mask_all_settings = false;
1350 mask_no_settings = true;
1351 arg_settings_trusted = -1;
1352 } else
1353 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
1354 } else if (r > 0) {
1355 /* yes */
1356 mask_all_settings = false;
1357 mask_no_settings = false;
1358 arg_settings_trusted = -1;
1359 } else {
1360 /* no */
1361 mask_all_settings = true;
1362 mask_no_settings = false;
1363 arg_settings_trusted = false;
1364 }
1365
1366 break;
1367
1368 case ARG_CHDIR:
1369 if (!path_is_absolute(optarg))
1370 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1371 "Working directory %s is not an absolute path.", optarg);
1372
1373 r = free_and_strdup(&arg_chdir, optarg);
1374 if (r < 0)
1375 return log_oom();
1376
1377 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
1378 break;
1379
1380 case ARG_PIVOT_ROOT:
1381 r = pivot_root_parse(&arg_pivot_root_new, &arg_pivot_root_old, optarg);
1382 if (r < 0)
1383 return log_error_errno(r, "Failed to parse --pivot-root= argument %s: %m", optarg);
1384
1385 arg_settings_mask |= SETTING_PIVOT_ROOT;
1386 break;
1387
1388 case ARG_NOTIFY_READY:
1389 r = parse_boolean(optarg);
1390 if (r < 0)
1391 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1392 "%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
1393 arg_notify_ready = r;
1394 arg_settings_mask |= SETTING_NOTIFY_READY;
1395 break;
1396
1397 case ARG_ROOT_HASH: {
1398 _cleanup_free_ void *k = NULL;
1399 size_t l;
1400
1401 r = unhexmem(optarg, strlen(optarg), &k, &l);
1402 if (r < 0)
1403 return log_error_errno(r, "Failed to parse root hash: %s", optarg);
1404 if (l < sizeof(sd_id128_t))
1405 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Root hash must be at least 128bit long: %s", optarg);
1406
1407 free_and_replace(arg_verity_settings.root_hash, k);
1408 arg_verity_settings.root_hash_size = l;
1409 break;
1410 }
1411
1412 case ARG_ROOT_HASH_SIG: {
1413 char *value;
1414 size_t l;
1415 void *p;
1416
1417 if ((value = startswith(optarg, "base64:"))) {
1418 r = unbase64mem(value, strlen(value), &p, &l);
1419 if (r < 0)
1420 return log_error_errno(r, "Failed to parse root hash signature '%s': %m", optarg);
1421
1422 } else {
1423 r = read_full_file(optarg, (char**) &p, &l);
1424 if (r < 0)
1425 return log_error_errno(r, "Failed parse root hash signature file '%s': %m", optarg);
1426 }
1427
1428 free_and_replace(arg_verity_settings.root_hash_sig, p);
1429 arg_verity_settings.root_hash_sig_size = l;
1430 break;
1431 }
1432
1433 case ARG_VERITY_DATA:
1434 r = parse_path_argument(optarg, false, &arg_verity_settings.data_path);
1435 if (r < 0)
1436 return r;
1437 break;
1438
1439 case ARG_SYSTEM_CALL_FILTER: {
1440 bool negative;
1441 const char *items;
1442
1443 negative = optarg[0] == '~';
1444 items = negative ? optarg + 1 : optarg;
1445
1446 for (;;) {
1447 _cleanup_free_ char *word = NULL;
1448
1449 r = extract_first_word(&items, &word, NULL, 0);
1450 if (r == 0)
1451 break;
1452 if (r == -ENOMEM)
1453 return log_oom();
1454 if (r < 0)
1455 return log_error_errno(r, "Failed to parse system call filter: %m");
1456
1457 if (negative)
1458 r = strv_extend(&arg_syscall_deny_list, word);
1459 else
1460 r = strv_extend(&arg_syscall_allow_list, word);
1461 if (r < 0)
1462 return log_oom();
1463 }
1464
1465 arg_settings_mask |= SETTING_SYSCALL_FILTER;
1466 break;
1467 }
1468
1469 case ARG_RLIMIT: {
1470 const char *eq;
1471 _cleanup_free_ char *name = NULL;
1472 int rl;
1473
1474 if (streq(optarg, "help")) {
1475 DUMP_STRING_TABLE(rlimit, int, _RLIMIT_MAX);
1476 return 0;
1477 }
1478
1479 eq = strchr(optarg, '=');
1480 if (!eq)
1481 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1482 "--rlimit= expects an '=' assignment.");
1483
1484 name = strndup(optarg, eq - optarg);
1485 if (!name)
1486 return log_oom();
1487
1488 rl = rlimit_from_string_harder(name);
1489 if (rl < 0)
1490 return log_error_errno(rl, "Unknown resource limit: %s", name);
1491
1492 if (!arg_rlimit[rl]) {
1493 arg_rlimit[rl] = new0(struct rlimit, 1);
1494 if (!arg_rlimit[rl])
1495 return log_oom();
1496 }
1497
1498 r = rlimit_parse(rl, eq + 1, arg_rlimit[rl]);
1499 if (r < 0)
1500 return log_error_errno(r, "Failed to parse resource limit: %s", eq + 1);
1501
1502 arg_settings_mask |= SETTING_RLIMIT_FIRST << rl;
1503 break;
1504 }
1505
1506 case ARG_OOM_SCORE_ADJUST:
1507 r = parse_oom_score_adjust(optarg, &arg_oom_score_adjust);
1508 if (r < 0)
1509 return log_error_errno(r, "Failed to parse --oom-score-adjust= parameter: %s", optarg);
1510
1511 arg_oom_score_adjust_set = true;
1512 arg_settings_mask |= SETTING_OOM_SCORE_ADJUST;
1513 break;
1514
1515 case ARG_CPU_AFFINITY: {
1516 CPUSet cpuset;
1517
1518 r = parse_cpu_set(optarg, &cpuset);
1519 if (r < 0)
1520 return log_error_errno(r, "Failed to parse CPU affinity mask %s: %m", optarg);
1521
1522 cpu_set_reset(&arg_cpu_set);
1523 arg_cpu_set = cpuset;
1524 arg_settings_mask |= SETTING_CPU_AFFINITY;
1525 break;
1526 }
1527
1528 case ARG_RESOLV_CONF:
1529 if (streq(optarg, "help")) {
1530 DUMP_STRING_TABLE(resolv_conf_mode, ResolvConfMode, _RESOLV_CONF_MODE_MAX);
1531 return 0;
1532 }
1533
1534 arg_resolv_conf = resolv_conf_mode_from_string(optarg);
1535 if (arg_resolv_conf < 0)
1536 return log_error_errno(arg_resolv_conf,
1537 "Failed to parse /etc/resolv.conf mode: %s", optarg);
1538
1539 arg_settings_mask |= SETTING_RESOLV_CONF;
1540 break;
1541
1542 case ARG_TIMEZONE:
1543 if (streq(optarg, "help")) {
1544 DUMP_STRING_TABLE(timezone_mode, TimezoneMode, _TIMEZONE_MODE_MAX);
1545 return 0;
1546 }
1547
1548 arg_timezone = timezone_mode_from_string(optarg);
1549 if (arg_timezone < 0)
1550 return log_error_errno(arg_timezone,
1551 "Failed to parse /etc/localtime mode: %s", optarg);
1552
1553 arg_settings_mask |= SETTING_TIMEZONE;
1554 break;
1555
1556 case ARG_CONSOLE:
1557 r = handle_arg_console(optarg);
1558 if (r <= 0)
1559 return r;
1560 break;
1561
1562 case 'P':
1563 case ARG_PIPE:
1564 r = handle_arg_console("pipe");
1565 if (r <= 0)
1566 return r;
1567 break;
1568
1569 case ARG_NO_PAGER:
1570 arg_pager_flags |= PAGER_DISABLE;
1571 break;
1572
1573 case ARG_SET_CREDENTIAL: {
1574 _cleanup_free_ char *word = NULL, *data = NULL;
1575 const char *p = optarg;
1576 Credential *a;
1577 ssize_t l;
1578
1579 r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
1580 if (r == -ENOMEM)
1581 return log_oom();
1582 if (r < 0)
1583 return log_error_errno(r, "Failed to parse --set-credential= parameter: %m");
1584 if (r == 0 || !p)
1585 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Missing value for --set-credential=: %s", optarg);
1586
1587 if (!credential_name_valid(word))
1588 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Credential name is not valid: %s", word);
1589
1590 for (size_t i = 0; i < arg_n_credentials; i++)
1591 if (streq(arg_credentials[i].id, word))
1592 return log_error_errno(SYNTHETIC_ERRNO(EEXIST), "Duplicate credential '%s', refusing.", word);
1593
1594 l = cunescape(p, UNESCAPE_ACCEPT_NUL, &data);
1595 if (l < 0)
1596 return log_error_errno(l, "Failed to unescape credential data: %s", p);
1597
1598 a = reallocarray(arg_credentials, arg_n_credentials + 1, sizeof(Credential));
1599 if (!a)
1600 return log_oom();
1601
1602 a[arg_n_credentials++] = (Credential) {
1603 .id = TAKE_PTR(word),
1604 .data = TAKE_PTR(data),
1605 .size = l,
1606 };
1607
1608 arg_credentials = a;
1609
1610 arg_settings_mask |= SETTING_CREDENTIALS;
1611 break;
1612 }
1613
1614 case ARG_LOAD_CREDENTIAL: {
1615 ReadFullFileFlags flags = READ_FULL_FILE_SECURE;
1616 _cleanup_(erase_and_freep) char *data = NULL;
1617 _cleanup_free_ char *word = NULL, *j = NULL;
1618 const char *p = optarg;
1619 Credential *a;
1620 size_t size, i;
1621
1622 r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
1623 if (r == -ENOMEM)
1624 return log_oom();
1625 if (r < 0)
1626 return log_error_errno(r, "Failed to parse --set-credential= parameter: %m");
1627 if (r == 0 || !p)
1628 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Missing value for --set-credential=: %s", optarg);
1629
1630 if (!credential_name_valid(word))
1631 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Credential name is not valid: %s", word);
1632
1633 for (i = 0; i < arg_n_credentials; i++)
1634 if (streq(arg_credentials[i].id, word))
1635 return log_error_errno(SYNTHETIC_ERRNO(EEXIST), "Duplicate credential '%s', refusing.", word);
1636
1637 if (path_is_absolute(p))
1638 flags |= READ_FULL_FILE_CONNECT_SOCKET;
1639 else {
1640 const char *e;
1641
1642 r = get_credentials_dir(&e);
1643 if (r < 0)
1644 return log_error_errno(r, "Credential not available (no credentials passed at all): %s", word);
1645
1646 j = path_join(e, p);
1647 if (!j)
1648 return log_oom();
1649 }
1650
1651 r = read_full_file_full(AT_FDCWD, j ?: p, UINT64_MAX, SIZE_MAX,
1652 flags,
1653 NULL,
1654 &data, &size);
1655 if (r < 0)
1656 return log_error_errno(r, "Failed to read credential '%s': %m", j ?: p);
1657
1658 a = reallocarray(arg_credentials, arg_n_credentials + 1, sizeof(Credential));
1659 if (!a)
1660 return log_oom();
1661
1662 a[arg_n_credentials++] = (Credential) {
1663 .id = TAKE_PTR(word),
1664 .data = TAKE_PTR(data),
1665 .size = size,
1666 };
1667
1668 arg_credentials = a;
1669
1670 arg_settings_mask |= SETTING_CREDENTIALS;
1671 break;
1672 }
1673
1674 case ARG_BIND_USER:
1675 if (!valid_user_group_name(optarg, 0))
1676 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid user name to bind: %s", optarg);
1677
1678 if (strv_extend(&arg_bind_user, optarg) < 0)
1679 return log_oom();
1680
1681 arg_settings_mask |= SETTING_BIND_USER;
1682 break;
1683
1684 case ARG_SUPPRESS_SYNC:
1685 r = parse_boolean_argument("--suppress-sync=", optarg, &arg_suppress_sync);
1686 if (r < 0)
1687 return r;
1688
1689 arg_settings_mask |= SETTING_SUPPRESS_SYNC;
1690 break;
1691
1692 case '?':
1693 return -EINVAL;
1694
1695 default:
1696 assert_not_reached();
1697 }
1698
1699 if (argc > optind) {
1700 strv_free(arg_parameters);
1701 arg_parameters = strv_copy(argv + optind);
1702 if (!arg_parameters)
1703 return log_oom();
1704
1705 arg_settings_mask |= SETTING_START_MODE;
1706 }
1707
1708 if (arg_ephemeral && arg_template && !arg_directory)
1709 /* User asked for ephemeral execution but specified --template= instead of --directory=. Semantically
1710 * such an invocation makes some sense, see https://github.com/systemd/systemd/issues/3667. Let's
1711 * accept this here, and silently make "--ephemeral --template=" equivalent to "--ephemeral
1712 * --directory=". */
1713 arg_directory = TAKE_PTR(arg_template);
1714
1715 arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? UINT64_C(1) << CAP_NET_ADMIN : 0)) & ~minus;
1716
1717 /* Make sure to parse environment before we reset the settings mask below */
1718 r = parse_environment();
1719 if (r < 0)
1720 return r;
1721
1722 /* Load all settings from .nspawn files */
1723 if (mask_no_settings)
1724 arg_settings_mask = 0;
1725
1726 /* Don't load any settings from .nspawn files */
1727 if (mask_all_settings)
1728 arg_settings_mask = _SETTINGS_MASK_ALL;
1729
1730 return 1;
1731 }
1732
1733 static int verify_arguments(void) {
1734 int r;
1735
1736 if (arg_start_mode == START_PID2 && arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
1737 /* If we are running the stub init in the container, we don't need to look at what the init
1738 * in the container supports, because we are not using it. Let's immediately pick the right
1739 * setting based on the host system configuration.
1740 *
1741 * We only do this, if the user didn't use an environment variable to override the detection.
1742 */
1743
1744 r = cg_all_unified();
1745 if (r < 0)
1746 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
1747 if (r > 0)
1748 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
1749 else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0)
1750 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
1751 else
1752 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
1753 }
1754
1755 if (arg_userns_mode != USER_NAMESPACE_NO)
1756 arg_mount_settings |= MOUNT_USE_USERNS;
1757
1758 if (arg_private_network)
1759 arg_mount_settings |= MOUNT_APPLY_APIVFS_NETNS;
1760
1761 if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
1762 !(arg_clone_ns_flags & CLONE_NEWUTS)) {
1763 arg_register = false;
1764 if (arg_start_mode != START_PID1)
1765 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--boot cannot be used without namespacing.");
1766 }
1767
1768 if (arg_userns_ownership < 0)
1769 arg_userns_ownership =
1770 arg_userns_mode == USER_NAMESPACE_PICK ? USER_NAMESPACE_OWNERSHIP_AUTO :
1771 USER_NAMESPACE_OWNERSHIP_OFF;
1772
1773 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
1774 arg_kill_signal = SIGRTMIN+3;
1775
1776 if (arg_volatile_mode != VOLATILE_NO) /* Make sure all file systems contained in the image are mounted read-only if we are in volatile mode */
1777 arg_read_only = true;
1778
1779 if (has_custom_root_mount(arg_custom_mounts, arg_n_custom_mounts))
1780 arg_read_only = true;
1781
1782 if (arg_keep_unit && arg_register && cg_pid_get_owner_uid(0, NULL) >= 0)
1783 /* Save the user from accidentally registering either user-$SESSION.scope or user@.service.
1784 * The latter is not technically a user session, but we don't need to labour the point. */
1785 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--keep-unit --register=yes may not be used when invoked from a user session.");
1786
1787 if (arg_directory && arg_image)
1788 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--directory= and --image= may not be combined.");
1789
1790 if (arg_template && arg_image)
1791 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= and --image= may not be combined.");
1792
1793 if (arg_template && !(arg_directory || arg_machine))
1794 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= needs --directory= or --machine=.");
1795
1796 if (arg_ephemeral && arg_template)
1797 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --template= may not be combined.");
1798
1799 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO))
1800 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --link-journal= may not be combined.");
1801
1802 if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported())
1803 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--private-users= is not supported, kernel compiled without user namespace support.");
1804
1805 if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_CHOWN && arg_read_only)
1806 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1807 "--read-only and --private-users-ownership=chown may not be combined.");
1808
1809 /* We don't support --private-users-ownership=chown together with any of the volatile modes since we
1810 * couldn't change the read-only part of the tree (i.e. /usr) anyway, or because it would trigger a
1811 * massive copy-up (in case of overlay) making the entire exercise pointless. */
1812 if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_CHOWN && arg_volatile_mode != VOLATILE_NO)
1813 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--volatile= and --private-users-ownership=chown may not be combined.");
1814
1815 /* If --network-namespace-path is given with any other network-related option (except --private-network),
1816 * we need to error out, to avoid conflicts between different network options. */
1817 if (arg_network_namespace_path &&
1818 (arg_network_interfaces || arg_network_macvlan ||
1819 arg_network_ipvlan || arg_network_veth_extra ||
1820 arg_network_bridge || arg_network_zone ||
1821 arg_network_veth))
1822 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--network-namespace-path= cannot be combined with other network options.");
1823
1824 if (arg_network_bridge && arg_network_zone)
1825 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1826 "--network-bridge= and --network-zone= may not be combined.");
1827
1828 if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network)
1829 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
1830
1831 if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO))
1832 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot combine --private-users with read-write mounts.");
1833
1834 if (arg_expose_ports && !arg_private_network)
1835 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot use --port= without private networking.");
1836
1837 if (arg_caps_ambient) {
1838 if (arg_caps_ambient == UINT64_MAX)
1839 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= does not support the value all.");
1840
1841 if ((arg_caps_ambient & arg_caps_retain) != arg_caps_ambient)
1842 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= setting is not fully covered by Capability= setting.");
1843
1844 if (arg_start_mode == START_BOOT)
1845 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= setting is not useful for boot mode.");
1846 }
1847
1848 if (arg_userns_mode == USER_NAMESPACE_NO && !strv_isempty(arg_bind_user))
1849 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--bind-user= requires --private-users");
1850
1851 /* Drop duplicate --bind-user= entries */
1852 strv_uniq(arg_bind_user);
1853
1854 r = custom_mount_check_all();
1855 if (r < 0)
1856 return r;
1857
1858 return 0;
1859 }
1860
1861 int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1862 assert(p);
1863
1864 if (arg_userns_mode == USER_NAMESPACE_NO)
1865 return 0;
1866
1867 if (uid == UID_INVALID && gid == GID_INVALID)
1868 return 0;
1869
1870 if (uid != UID_INVALID) {
1871 uid += arg_uid_shift;
1872
1873 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1874 return -EOVERFLOW;
1875 }
1876
1877 if (gid != GID_INVALID) {
1878 gid += (gid_t) arg_uid_shift;
1879
1880 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1881 return -EOVERFLOW;
1882 }
1883
1884 return RET_NERRNO(lchown(p, uid, gid));
1885 }
1886
/* Creates the directory 'path' below 'root' (the two are combined with prefix_roota()) with the given
 * access mode, and then hands the new directory to userns_lchown() for the given UID/GID.
 *
 * A directory that exists already is left alone, ownership included, and 0 is returned. Any other
 * mkdir() failure is returned as is; otherwise the result of userns_lchown() is returned. */
int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;
        int k;

        full = prefix_roota(root, path);

        k = RET_NERRNO(mkdir(full, mode));
        if (k >= 0)
                /* Freshly created, adjust ownership */
                return userns_lchown(full, uid, gid);

        return k == -EEXIST ? 0 : k;
}
1900
static const char *timezone_from_path(const char *path) {
        const char *zone;

        /* Matches 'path' against the relative and the absolute zoneinfo prefix and returns whatever
         * PATH_STARTSWITH_SET() yields for them. Callers in this file treat NULL as "does not point
         * into the zoneinfo directory". */
        zone = PATH_STARTSWITH_SET(
                        path,
                        "../usr/share/zoneinfo/",
                        "/usr/share/zoneinfo/");

        return zone;
}
1907
1908 static bool etc_writable(void) {
1909 return !arg_read_only || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_OVERLAY);
1910 }
1911
1912 static int setup_timezone(const char *dest) {
1913 _cleanup_free_ char *p = NULL, *etc = NULL;
1914 const char *where, *check;
1915 TimezoneMode m;
1916 int r;
1917
1918 assert(dest);
1919
1920 if (IN_SET(arg_timezone, TIMEZONE_AUTO, TIMEZONE_SYMLINK)) {
1921 r = readlink_malloc("/etc/localtime", &p);
1922 if (r == -ENOENT && arg_timezone == TIMEZONE_AUTO)
1923 m = etc_writable() ? TIMEZONE_DELETE : TIMEZONE_OFF;
1924 else if (r == -EINVAL && arg_timezone == TIMEZONE_AUTO) /* regular file? */
1925 m = etc_writable() ? TIMEZONE_COPY : TIMEZONE_BIND;
1926 else if (r < 0) {
1927 log_warning_errno(r, "Failed to read host's /etc/localtime symlink, not updating container timezone: %m");
1928 /* To handle warning, delete /etc/localtime and replace it with a symbolic link to a time zone data
1929 * file.
1930 *
1931 * Example:
1932 * ln -s /usr/share/zoneinfo/UTC /etc/localtime
1933 */
1934 return 0;
1935 } else if (arg_timezone == TIMEZONE_AUTO)
1936 m = etc_writable() ? TIMEZONE_SYMLINK : TIMEZONE_BIND;
1937 else
1938 m = arg_timezone;
1939 } else
1940 m = arg_timezone;
1941
1942 if (m == TIMEZONE_OFF)
1943 return 0;
1944
1945 r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
1946 if (r < 0) {
1947 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
1948 return 0;
1949 }
1950
1951 where = strjoina(etc, "/localtime");
1952
1953 switch (m) {
1954
1955 case TIMEZONE_DELETE:
1956 if (unlink(where) < 0)
1957 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
1958
1959 return 0;
1960
1961 case TIMEZONE_SYMLINK: {
1962 _cleanup_free_ char *q = NULL;
1963 const char *z, *what;
1964
1965 z = timezone_from_path(p);
1966 if (!z) {
1967 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1968 return 0;
1969 }
1970
1971 r = readlink_malloc(where, &q);
1972 if (r >= 0 && streq_ptr(timezone_from_path(q), z))
1973 return 0; /* Already pointing to the right place? Then do nothing .. */
1974
1975 check = strjoina(dest, "/usr/share/zoneinfo/", z);
1976 r = chase_symlinks(check, dest, 0, NULL, NULL);
1977 if (r < 0)
1978 log_debug_errno(r, "Timezone %s does not exist (or is not accessible) in container, not creating symlink: %m", z);
1979 else {
1980 if (unlink(where) < 0 && errno != ENOENT) {
1981 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING, /* Don't complain on read-only images */
1982 errno, "Failed to remove existing timezone info %s in container, ignoring: %m", where);
1983 return 0;
1984 }
1985
1986 what = strjoina("../usr/share/zoneinfo/", z);
1987 if (symlink(what, where) < 0) {
1988 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING,
1989 errno, "Failed to correct timezone of container, ignoring: %m");
1990 return 0;
1991 }
1992
1993 break;
1994 }
1995
1996 _fallthrough_;
1997 }
1998
1999 case TIMEZONE_BIND: {
2000 _cleanup_free_ char *resolved = NULL;
2001 int found;
2002
2003 found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved, NULL);
2004 if (found < 0) {
2005 log_warning_errno(found, "Failed to resolve /etc/localtime path in container, ignoring: %m");
2006 return 0;
2007 }
2008
2009 if (found == 0) /* missing? */
2010 (void) touch(resolved);
2011
2012 r = mount_nofollow_verbose(LOG_WARNING, "/etc/localtime", resolved, NULL, MS_BIND, NULL);
2013 if (r >= 0)
2014 return mount_nofollow_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
2015
2016 _fallthrough_;
2017 }
2018
2019 case TIMEZONE_COPY:
2020 /* If mounting failed, try to copy */
2021 r = copy_file_atomic("/etc/localtime", where, 0644, 0, 0, COPY_REFLINK|COPY_REPLACE);
2022 if (r < 0) {
2023 log_full_errno(IN_SET(r, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
2024 "Failed to copy /etc/localtime to %s, ignoring: %m", where);
2025 return 0;
2026 }
2027
2028 break;
2029
2030 default:
2031 assert_not_reached();
2032 }
2033
2034 /* Fix permissions of the symlink or file copy we just created */
2035 r = userns_lchown(where, 0, 0);
2036 if (r < 0)
2037 log_warning_errno(r, "Failed to chown /etc/localtime, ignoring: %m");
2038
2039 return 0;
2040 }
2041
static int have_resolv_conf(const char *path) {
        assert(path);

        /* Returns 1 if 'path' exists, 0 if it is missing, and a negative errno (logged at debug
         * level) if the check itself failed for any other reason. */

        if (access(path, F_OK) >= 0)
                return 1;

        if (errno == ENOENT)
                return 0;

        return log_debug_errno(errno, "Failed to determine whether '%s' is available: %m", path);
}
2054
2055 static int resolved_listening(void) {
2056 _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
2057 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
2058 _cleanup_free_ char *dns_stub_listener_mode = NULL;
2059 int r;
2060
2061 /* Check if resolved is listening */
2062
2063 r = sd_bus_open_system(&bus);
2064 if (r < 0)
2065 return log_debug_errno(r, "Failed to open system bus: %m");
2066
2067 r = bus_name_has_owner(bus, "org.freedesktop.resolve1", NULL);
2068 if (r < 0)
2069 return log_debug_errno(r, "Failed to check whether the 'org.freedesktop.resolve1' bus name is taken: %m");
2070 if (r == 0)
2071 return 0;
2072
2073 r = sd_bus_get_property_string(bus,
2074 "org.freedesktop.resolve1",
2075 "/org/freedesktop/resolve1",
2076 "org.freedesktop.resolve1.Manager",
2077 "DNSStubListener",
2078 &error,
2079 &dns_stub_listener_mode);
2080 if (r < 0)
2081 return log_debug_errno(r, "Failed to query DNSStubListener property: %s", bus_error_message(&error, r));
2082
2083 return STR_IN_SET(dns_stub_listener_mode, "udp", "yes");
2084 }
2085
2086 static int setup_resolv_conf(const char *dest) {
2087 _cleanup_free_ char *etc = NULL;
2088 const char *where, *what;
2089 ResolvConfMode m;
2090 int r;
2091
2092 assert(dest);
2093
2094 if (arg_resolv_conf == RESOLV_CONF_AUTO) {
2095 if (arg_private_network)
2096 m = RESOLV_CONF_OFF;
2097 else if (have_resolv_conf(PRIVATE_STUB_RESOLV_CONF) > 0 && resolved_listening() > 0)
2098 m = etc_writable() ? RESOLV_CONF_COPY_STUB : RESOLV_CONF_BIND_STUB;
2099 else if (have_resolv_conf("/etc/resolv.conf") > 0)
2100 m = etc_writable() ? RESOLV_CONF_COPY_HOST : RESOLV_CONF_BIND_HOST;
2101 else
2102 m = etc_writable() ? RESOLV_CONF_DELETE : RESOLV_CONF_OFF;
2103
2104 } else
2105 m = arg_resolv_conf;
2106
2107 if (m == RESOLV_CONF_OFF)
2108 return 0;
2109
2110 r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
2111 if (r < 0) {
2112 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
2113 return 0;
2114 }
2115
2116 where = strjoina(etc, "/resolv.conf");
2117
2118 if (m == RESOLV_CONF_DELETE) {
2119 if (unlink(where) < 0)
2120 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
2121
2122 return 0;
2123 }
2124
2125 if (IN_SET(m, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_REPLACE_STATIC, RESOLV_CONF_COPY_STATIC))
2126 what = PRIVATE_STATIC_RESOLV_CONF;
2127 else if (IN_SET(m, RESOLV_CONF_BIND_UPLINK, RESOLV_CONF_REPLACE_UPLINK, RESOLV_CONF_COPY_UPLINK))
2128 what = PRIVATE_UPLINK_RESOLV_CONF;
2129 else if (IN_SET(m, RESOLV_CONF_BIND_STUB, RESOLV_CONF_REPLACE_STUB, RESOLV_CONF_COPY_STUB))
2130 what = PRIVATE_STUB_RESOLV_CONF;
2131 else
2132 what = "/etc/resolv.conf";
2133
2134 if (IN_SET(m, RESOLV_CONF_BIND_HOST, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_BIND_UPLINK, RESOLV_CONF_BIND_STUB)) {
2135 _cleanup_free_ char *resolved = NULL;
2136 int found;
2137
2138 found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved, NULL);
2139 if (found < 0) {
2140 log_warning_errno(found, "Failed to resolve /etc/resolv.conf path in container, ignoring: %m");
2141 return 0;
2142 }
2143
2144 if (found == 0) /* missing? */
2145 (void) touch(resolved);
2146
2147 r = mount_nofollow_verbose(LOG_WARNING, what, resolved, NULL, MS_BIND, NULL);
2148 if (r >= 0)
2149 return mount_nofollow_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
2150
2151 /* If that didn't work, let's copy the file */
2152 }
2153
2154 if (IN_SET(m, RESOLV_CONF_REPLACE_HOST, RESOLV_CONF_REPLACE_STATIC, RESOLV_CONF_REPLACE_UPLINK, RESOLV_CONF_REPLACE_STUB))
2155 r = copy_file_atomic(what, where, 0644, 0, 0, COPY_REFLINK|COPY_REPLACE);
2156 else
2157 r = copy_file(what, where, O_TRUNC|O_NOFOLLOW, 0644, 0, 0, COPY_REFLINK);
2158 if (r < 0) {
2159 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
2160 * resolved or something similar runs inside and the symlink points there.
2161 *
2162 * If the disk image is read-only, there's also no point in complaining.
2163 */
2164 log_full_errno(!IN_SET(RESOLV_CONF_COPY_HOST, RESOLV_CONF_COPY_STATIC, RESOLV_CONF_COPY_UPLINK, RESOLV_CONF_COPY_STUB) &&
2165 IN_SET(r, -ELOOP, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
2166 "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where);
2167 return 0;
2168 }
2169
2170 r = userns_lchown(where, 0, 0);
2171 if (r < 0)
2172 log_warning_errno(r, "Failed to chown /etc/resolv.conf, ignoring: %m");
2173
2174 return 0;
2175 }
2176
2177 static int setup_boot_id(void) {
2178 _cleanup_(unlink_and_freep) char *from = NULL;
2179 _cleanup_free_ char *path = NULL;
2180 sd_id128_t rnd = SD_ID128_NULL;
2181 const char *to;
2182 int r;
2183
2184 /* Generate a new randomized boot ID, so that each boot-up of the container gets a new one */
2185
2186 r = tempfn_random_child("/run", "proc-sys-kernel-random-boot-id", &path);
2187 if (r < 0)
2188 return log_error_errno(r, "Failed to generate random boot ID path: %m");
2189
2190 r = sd_id128_randomize(&rnd);
2191 if (r < 0)
2192 return log_error_errno(r, "Failed to generate random boot id: %m");
2193
2194 r = id128_write(path, ID128_UUID, rnd, false);
2195 if (r < 0)
2196 return log_error_errno(r, "Failed to write boot id: %m");
2197
2198 from = TAKE_PTR(path);
2199 to = "/proc/sys/kernel/random/boot_id";
2200
2201 r = mount_nofollow_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
2202 if (r < 0)
2203 return r;
2204
2205 return mount_nofollow_verbose(LOG_ERR, NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
2206 }
2207
2208 static int copy_devnodes(const char *dest) {
2209 static const char devnodes[] =
2210 "null\0"
2211 "zero\0"
2212 "full\0"
2213 "random\0"
2214 "urandom\0"
2215 "tty\0"
2216 "net/tun\0";
2217
2218 const char *d;
2219 int r = 0;
2220
2221 assert(dest);
2222
2223 BLOCK_WITH_UMASK(0000);
2224
2225 /* Create /dev/net, so that we can create /dev/net/tun in it */
2226 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
2227 return log_error_errno(r, "Failed to create /dev/net directory: %m");
2228
2229 NULSTR_FOREACH(d, devnodes) {
2230 _cleanup_free_ char *from = NULL, *to = NULL;
2231 struct stat st;
2232
2233 from = path_join("/dev/", d);
2234 if (!from)
2235 return log_oom();
2236
2237 to = path_join(dest, from);
2238 if (!to)
2239 return log_oom();
2240
2241 if (stat(from, &st) < 0) {
2242
2243 if (errno != ENOENT)
2244 return log_error_errno(errno, "Failed to stat %s: %m", from);
2245
2246 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode))
2247 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2248 "%s is not a char or block device, cannot copy.", from);
2249 else {
2250 _cleanup_free_ char *sl = NULL, *prefixed = NULL, *dn = NULL, *t = NULL;
2251
2252 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
2253 /* Explicitly warn the user when /dev is already populated. */
2254 if (errno == EEXIST)
2255 log_notice("%s/dev is pre-mounted and pre-populated. If a pre-mounted /dev is provided it needs to be an unpopulated file system.", dest);
2256 if (errno != EPERM)
2257 return log_error_errno(errno, "mknod(%s) failed: %m", to);
2258
2259 /* Some systems abusively restrict mknod but allow bind mounts. */
2260 r = touch(to);
2261 if (r < 0)
2262 return log_error_errno(r, "touch (%s) failed: %m", to);
2263 r = mount_nofollow_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL);
2264 if (r < 0)
2265 return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to);
2266 }
2267
2268 r = userns_lchown(to, 0, 0);
2269 if (r < 0)
2270 return log_error_errno(r, "chown() of device node %s failed: %m", to);
2271
2272 dn = path_join("/dev", S_ISCHR(st.st_mode) ? "char" : "block");
2273 if (!dn)
2274 return log_oom();
2275
2276 r = userns_mkdir(dest, dn, 0755, 0, 0);
2277 if (r < 0)
2278 return log_error_errno(r, "Failed to create '%s': %m", dn);
2279
2280 if (asprintf(&sl, "%s/%u:%u", dn, major(st.st_rdev), minor(st.st_rdev)) < 0)
2281 return log_oom();
2282
2283 prefixed = path_join(dest, sl);
2284 if (!prefixed)
2285 return log_oom();
2286
2287 t = path_join("..", d);
2288 if (!t)
2289 return log_oom();
2290
2291 if (symlink(t, prefixed) < 0)
2292 log_debug_errno(errno, "Failed to symlink '%s' to '%s': %m", t, prefixed);
2293 }
2294 }
2295
2296 return r;
2297 }
2298
2299 static int make_extra_nodes(const char *dest) {
2300 size_t i;
2301 int r;
2302
2303 BLOCK_WITH_UMASK(0000);
2304
2305 for (i = 0; i < arg_n_extra_nodes; i++) {
2306 _cleanup_free_ char *path = NULL;
2307 DeviceNode *n = arg_extra_nodes + i;
2308
2309 path = path_join(dest, n->path);
2310 if (!path)
2311 return log_oom();
2312
2313 if (mknod(path, n->mode, S_ISCHR(n->mode) || S_ISBLK(n->mode) ? makedev(n->major, n->minor) : 0) < 0)
2314 return log_error_errno(errno, "Failed to create device node '%s': %m", path);
2315
2316 r = chmod_and_chown(path, n->mode, n->uid, n->gid);
2317 if (r < 0)
2318 return log_error_errno(r, "Failed to adjust device node ownership of '%s': %m", path);
2319 }
2320
2321 return 0;
2322 }
2323
2324 static int setup_pts(const char *dest) {
2325 _cleanup_free_ char *options = NULL;
2326 const char *p;
2327 int r;
2328
2329 #if HAVE_SELINUX
2330 if (arg_selinux_apifs_context)
2331 (void) asprintf(&options,
2332 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
2333 arg_uid_shift + TTY_GID,
2334 arg_selinux_apifs_context);
2335 else
2336 #endif
2337 (void) asprintf(&options,
2338 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
2339 arg_uid_shift + TTY_GID);
2340
2341 if (!options)
2342 return log_oom();
2343
2344 /* Mount /dev/pts itself */
2345 p = prefix_roota(dest, "/dev/pts");
2346 r = RET_NERRNO(mkdir(p, 0755));
2347 if (r < 0)
2348 return log_error_errno(r, "Failed to create /dev/pts: %m");
2349
2350 r = mount_nofollow_verbose(LOG_ERR, "devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options);
2351 if (r < 0)
2352 return r;
2353 r = userns_lchown(p, 0, 0);
2354 if (r < 0)
2355 return log_error_errno(r, "Failed to chown /dev/pts: %m");
2356
2357 /* Create /dev/ptmx symlink */
2358 p = prefix_roota(dest, "/dev/ptmx");
2359 if (symlink("pts/ptmx", p) < 0)
2360 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
2361 r = userns_lchown(p, 0, 0);
2362 if (r < 0)
2363 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
2364
2365 /* And fix /dev/pts/ptmx ownership */
2366 p = prefix_roota(dest, "/dev/pts/ptmx");
2367 r = userns_lchown(p, 0, 0);
2368 if (r < 0)
2369 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
2370
2371 return 0;
2372 }
2373
2374 static int setup_stdio_as_dev_console(void) {
2375 _cleanup_close_ int terminal = -1;
2376 int r;
2377
2378 /* We open the TTY in O_NOCTTY mode, so that we do not become controller yet. We'll do that later
2379 * explicitly, if we are configured to. */
2380 terminal = open_terminal("/dev/console", O_RDWR|O_NOCTTY);
2381 if (terminal < 0)
2382 return log_error_errno(terminal, "Failed to open console: %m");
2383
2384 /* Make sure we can continue logging to the original stderr, even if
2385 * stderr points elsewhere now */
2386 r = log_dup_console();
2387 if (r < 0)
2388 return log_error_errno(r, "Failed to duplicate stderr: %m");
2389
2390 /* invalidates 'terminal' on success and failure */
2391 r = rearrange_stdio(terminal, terminal, terminal);
2392 TAKE_FD(terminal);
2393 if (r < 0)
2394 return log_error_errno(r, "Failed to move console to stdin/stdout/stderr: %m");
2395
2396 return 0;
2397 }
2398
2399 static int setup_dev_console(const char *console) {
2400 _cleanup_free_ char *p = NULL;
2401 int r;
2402
2403 /* Create /dev/console symlink */
2404 r = path_make_relative("/dev", console, &p);
2405 if (r < 0)
2406 return log_error_errno(r, "Failed to create relative path: %m");
2407
2408 if (symlink(p, "/dev/console") < 0)
2409 return log_error_errno(errno, "Failed to create /dev/console symlink: %m");
2410
2411 return 0;
2412 }
2413
2414 static int setup_keyring(void) {
2415 key_serial_t keyring;
2416
2417 /* Allocate a new session keyring for the container. This makes sure the keyring of the session
2418 * systemd-nspawn was invoked from doesn't leak into the container. Note that by default we block
2419 * keyctl() and request_key() anyway via seccomp so doing this operation isn't strictly necessary,
2420 * but in case people explicitly allow-list these system calls let's make sure we don't leak anything
2421 * into the container. */
2422
2423 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2424 if (keyring == -1) {
2425 if (errno == ENOSYS)
2426 log_debug_errno(errno, "Kernel keyring not supported, ignoring.");
2427 else if (ERRNO_IS_PRIVILEGE(errno))
2428 log_debug_errno(errno, "Kernel keyring access prohibited, ignoring.");
2429 else
2430 return log_error_errno(errno, "Setting up kernel keyring failed: %m");
2431 }
2432
2433 return 0;
2434 }
2435
2436 static int setup_credentials(const char *root) {
2437 const char *q;
2438 int r;
2439
2440 if (arg_n_credentials <= 0)
2441 return 0;
2442
2443 r = userns_mkdir(root, "/run/host", 0755, 0, 0);
2444 if (r < 0)
2445 return log_error_errno(r, "Failed to create /run/host: %m");
2446
2447 r = userns_mkdir(root, "/run/host/credentials", 0700, 0, 0);
2448 if (r < 0)
2449 return log_error_errno(r, "Failed to create /run/host/credentials: %m");
2450
2451 q = prefix_roota(root, "/run/host/credentials");
2452 r = mount_nofollow_verbose(LOG_ERR, NULL, q, "ramfs", MS_NOSUID|MS_NOEXEC|MS_NODEV, "mode=0700");
2453 if (r < 0)
2454 return r;
2455
2456 for (size_t i = 0; i < arg_n_credentials; i++) {
2457 _cleanup_free_ char *j = NULL;
2458 _cleanup_close_ int fd = -1;
2459
2460 j = path_join(q, arg_credentials[i].id);
2461 if (!j)
2462 return log_oom();
2463
2464 fd = open(j, O_CREAT|O_EXCL|O_WRONLY|O_CLOEXEC|O_NOFOLLOW, 0600);
2465 if (fd < 0)
2466 return log_error_errno(errno, "Failed to create credential file %s: %m", j);
2467
2468 r = loop_write(fd, arg_credentials[i].data, arg_credentials[i].size, /* do_poll= */ false);
2469 if (r < 0)
2470 return log_error_errno(r, "Failed to write credential to file %s: %m", j);
2471
2472 if (fchmod(fd, 0400) < 0)
2473 return log_error_errno(errno, "Failed to adjust access mode of %s: %m", j);
2474
2475 if (arg_userns_mode != USER_NAMESPACE_NO) {
2476 if (fchown(fd, arg_uid_shift, arg_uid_shift) < 0)
2477 return log_error_errno(errno, "Failed to adjust ownership of %s: %m", j);
2478 }
2479 }
2480
2481 if (chmod(q, 0500) < 0)
2482 return log_error_errno(errno, "Failed to adjust access mode of %s: %m", q);
2483
2484 r = userns_lchown(q, 0, 0);
2485 if (r < 0)
2486 return r;
2487
2488 /* Make both mount and superblock read-only now */
2489 r = mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_REMOUNT|MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
2490 if (r < 0)
2491 return r;
2492
2493 return mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, "mode=0500");
2494 }
2495
2496 static int setup_kmsg(int kmsg_socket) {
2497 _cleanup_(unlink_and_freep) char *from = NULL;
2498 _cleanup_free_ char *fifo = NULL;
2499 _cleanup_close_ int fd = -1;
2500 int r;
2501
2502 assert(kmsg_socket >= 0);
2503
2504 BLOCK_WITH_UMASK(0000);
2505
2506 /* We create the kmsg FIFO as as temporary file in /run, but immediately delete it after bind mounting it to
2507 * /proc/kmsg. While FIFOs on the reading side behave very similar to /proc/kmsg, their writing side behaves
2508 * differently from /dev/kmsg in that writing blocks when nothing is reading. In order to avoid any problems
2509 * with containers deadlocking due to this we simply make /dev/kmsg unavailable to the container. */
2510
2511 r = tempfn_random_child("/run", "proc-kmsg", &fifo);
2512 if (r < 0)
2513 return log_error_errno(r, "Failed to generate kmsg path: %m");
2514
2515 if (mkfifo(fifo, 0600) < 0)
2516 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
2517
2518 from = TAKE_PTR(fifo);
2519
2520 r = mount_nofollow_verbose(LOG_ERR, from, "/proc/kmsg", NULL, MS_BIND, NULL);
2521 if (r < 0)
2522 return r;
2523
2524 fd = open(from, O_RDWR|O_NONBLOCK|O_CLOEXEC);
2525 if (fd < 0)
2526 return log_error_errno(errno, "Failed to open fifo: %m");
2527
2528 /* Store away the fd in the socket, so that it stays open as long as we run the child */
2529 r = send_one_fd(kmsg_socket, fd, 0);
2530 if (r < 0)
2531 return log_error_errno(r, "Failed to send FIFO fd: %m");
2532
2533 return 0;
2534 }
2535
2536 struct ExposeArgs {
2537 union in_addr_union address4;
2538 union in_addr_union address6;
2539 struct FirewallContext *fw_ctx;
2540 };
2541
2542 static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
2543 struct ExposeArgs *args = userdata;
2544
2545 assert(rtnl);
2546 assert(m);
2547 assert(args);
2548
2549 (void) expose_port_execute(rtnl, &args->fw_ctx, arg_expose_ports, AF_INET, &args->address4);
2550 (void) expose_port_execute(rtnl, &args->fw_ctx, arg_expose_ports, AF_INET6, &args->address6);
2551 return 0;
2552 }
2553
2554 static int setup_hostname(void) {
2555 int r;
2556
2557 if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
2558 return 0;
2559
2560 r = sethostname_idempotent(arg_hostname ?: arg_machine);
2561 if (r < 0)
2562 return log_error_errno(r, "Failed to set hostname: %m");
2563
2564 return 0;
2565 }
2566
2567 static int setup_journal(const char *directory) {
2568 _cleanup_free_ char *d = NULL;
2569 const char *p, *q;
2570 sd_id128_t this_id;
2571 bool try;
2572 int r;
2573
2574 /* Don't link journals in ephemeral mode */
2575 if (arg_ephemeral)
2576 return 0;
2577
2578 if (arg_link_journal == LINK_NO)
2579 return 0;
2580
2581 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
2582
2583 r = sd_id128_get_machine(&this_id);
2584 if (r < 0)
2585 return log_error_errno(r, "Failed to retrieve machine ID: %m");
2586
2587 if (sd_id128_equal(arg_uuid, this_id)) {
2588 log_full(try ? LOG_WARNING : LOG_ERR,
2589 "Host and machine ids are equal (%s): refusing to link journals", SD_ID128_TO_STRING(arg_uuid));
2590 if (try)
2591 return 0;
2592 return -EEXIST;
2593 }
2594
2595 FOREACH_STRING(dirname, "/var", "/var/log", "/var/log/journal") {
2596 r = userns_mkdir(directory, dirname, 0755, 0, 0);
2597 if (r < 0) {
2598 bool ignore = r == -EROFS && try;
2599 log_full_errno(ignore ? LOG_DEBUG : LOG_ERR, r,
2600 "Failed to create %s%s: %m", dirname, ignore ? ", ignoring" : "");
2601 return ignore ? 0 : r;
2602 }
2603 }
2604
2605 p = strjoina("/var/log/journal/", SD_ID128_TO_STRING(arg_uuid));
2606 q = prefix_roota(directory, p);
2607
2608 if (path_is_mount_point(p, NULL, 0) > 0) {
2609 if (try)
2610 return 0;
2611
2612 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2613 "%s: already a mount point, refusing to use for journal", p);
2614 }
2615
2616 if (path_is_mount_point(q, NULL, 0) > 0) {
2617 if (try)
2618 return 0;
2619
2620 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2621 "%s: already a mount point, refusing to use for journal", q);
2622 }
2623
2624 r = readlink_and_make_absolute(p, &d);
2625 if (r >= 0) {
2626 if (IN_SET(arg_link_journal, LINK_GUEST, LINK_AUTO) &&
2627 path_equal(d, q)) {
2628
2629 r = userns_mkdir(directory, p, 0755, 0, 0);
2630 if (r < 0)
2631 log_warning_errno(r, "Failed to create directory %s: %m", q);
2632 return 0;
2633 }
2634
2635 if (unlink(p) < 0)
2636 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
2637 } else if (r == -EINVAL) {
2638
2639 if (arg_link_journal == LINK_GUEST &&
2640 rmdir(p) < 0) {
2641
2642 if (errno == ENOTDIR) {
2643 log_error("%s already exists and is neither a symlink nor a directory", p);
2644 return r;
2645 } else
2646 return log_error_errno(errno, "Failed to remove %s: %m", p);
2647 }
2648 } else if (r != -ENOENT)
2649 return log_error_errno(r, "readlink(%s) failed: %m", p);
2650
2651 if (arg_link_journal == LINK_GUEST) {
2652
2653 if (symlink(q, p) < 0) {
2654 if (try) {
2655 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
2656 return 0;
2657 } else
2658 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
2659 }
2660
2661 r = userns_mkdir(directory, p, 0755, 0, 0);
2662 if (r < 0)
2663 log_warning_errno(r, "Failed to create directory %s: %m", q);
2664 return 0;
2665 }
2666
2667 if (arg_link_journal == LINK_HOST) {
2668 /* don't create parents here — if the host doesn't have
2669 * permanent journal set up, don't force it here */
2670
2671 r = RET_NERRNO(mkdir(p, 0755));
2672 if (r < 0 && r != -EEXIST) {
2673 if (try) {
2674 log_debug_errno(r, "Failed to create %s, skipping journal setup: %m", p);
2675 return 0;
2676 } else
2677 return log_error_errno(r, "Failed to create %s: %m", p);
2678 }
2679
2680 } else if (access(p, F_OK) < 0)
2681 return 0;
2682
2683 if (dir_is_empty(q, /* ignore_hidden_or_backup= */ false) == 0)
2684 log_warning("%s is not empty, proceeding anyway.", q);
2685
2686 r = userns_mkdir(directory, p, 0755, 0, 0);
2687 if (r < 0)
2688 return log_error_errno(r, "Failed to create %s: %m", q);
2689
2690 r = mount_nofollow_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL);
2691 if (r < 0)
2692 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
2693
2694 return 0;
2695 }
2696
2697 static int drop_capabilities(uid_t uid) {
2698 CapabilityQuintet q;
2699
2700 /* Let's initialize all five capability sets to something valid. If the quintet was configured via
2701 * OCI use that, but fill in missing bits. If it wasn't then derive the quintet in full from
2702 * arg_caps_retain. */
2703
2704 if (capability_quintet_is_set(&arg_full_capabilities)) {
2705 q = arg_full_capabilities;
2706
2707 if (q.bounding == UINT64_MAX)
2708 q.bounding = uid == 0 ? arg_caps_retain : 0;
2709
2710 if (q.effective == UINT64_MAX)
2711 q.effective = uid == 0 ? q.bounding : 0;
2712
2713 if (q.inheritable == UINT64_MAX)
2714 q.inheritable = uid == 0 ? q.bounding : arg_caps_ambient;
2715
2716 if (q.permitted == UINT64_MAX)
2717 q.permitted = uid == 0 ? q.bounding : arg_caps_ambient;
2718
2719 if (q.ambient == UINT64_MAX && ambient_capabilities_supported())
2720 q.ambient = arg_caps_ambient;
2721
2722 if (capability_quintet_mangle(&q))
2723 return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Cannot set capabilities that are not in the current bounding set.");
2724
2725 } else {
2726 q = (CapabilityQuintet) {
2727 .bounding = arg_caps_retain,
2728 .effective = uid == 0 ? arg_caps_retain : 0,
2729 .inheritable = uid == 0 ? arg_caps_retain : arg_caps_ambient,
2730 .permitted = uid == 0 ? arg_caps_retain : arg_caps_ambient,
2731 .ambient = ambient_capabilities_supported() ? arg_caps_ambient : UINT64_MAX,
2732 };
2733
2734 /* If we're not using OCI, proceed with mangled capabilities (so we don't error out)
2735 * in order to maintain the same behavior as systemd < 242. */
2736 if (capability_quintet_mangle(&q))
2737 log_full(arg_quiet ? LOG_DEBUG : LOG_WARNING,
2738 "Some capabilities will not be set because they are not in the current bounding set.");
2739
2740 }
2741
2742 return capability_quintet_enforce(&q);
2743 }
2744
2745 static int reset_audit_loginuid(void) {
2746 _cleanup_free_ char *p = NULL;
2747 int r;
2748
2749 if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
2750 return 0;
2751
2752 r = read_one_line_file("/proc/self/loginuid", &p);
2753 if (r == -ENOENT)
2754 return 0;
2755 if (r < 0)
2756 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2757
2758 /* Already reset? */
2759 if (streq(p, "4294967295"))
2760 return 0;
2761
2762 r = write_string_file("/proc/self/loginuid", "4294967295", WRITE_STRING_FILE_DISABLE_BUFFER);
2763 if (r < 0) {
2764 log_error_errno(r,
2765 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2766 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2767 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2768 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2769 "using systemd-nspawn. Sleeping for 5s... (%m)");
2770
2771 sleep(5);
2772 }
2773
2774 return 0;
2775 }
2776
2777 static int setup_propagate(const char *root) {
2778 const char *p, *q;
2779 int r;
2780
2781 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2782 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2783 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
2784 (void) mkdir_p(p, 0600);
2785
2786 r = userns_mkdir(root, "/run/host", 0755, 0, 0);
2787 if (r < 0)
2788 return log_error_errno(r, "Failed to create /run/host: %m");
2789
2790 r = userns_mkdir(root, "/run/host/incoming", 0600, 0, 0);
2791 if (r < 0)
2792 return log_error_errno(r, "Failed to create /run/host/incoming: %m");
2793
2794 q = prefix_roota(root, "/run/host/incoming");
2795 r = mount_nofollow_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
2796 if (r < 0)
2797 return r;
2798
2799 r = mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
2800 if (r < 0)
2801 return r;
2802
2803 /* machined will MS_MOVE into that directory, and that's only supported for non-shared mounts. */
2804 return mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_SLAVE, NULL);
2805 }
2806
2807 static int setup_machine_id(const char *directory) {
2808 const char *etc_machine_id;
2809 sd_id128_t id;
2810 int r;
2811
2812 /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
2813 * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
2814 * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
2815 * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
2816 * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
2817 * container behaves nicely). */
2818
2819 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
2820
2821 r = id128_read(etc_machine_id, ID128_PLAIN_OR_UNINIT, &id);
2822 if (r < 0) {
2823 if (!IN_SET(r, -ENOENT, -ENOMEDIUM)) /* If the file is missing or empty, we don't mind */
2824 return log_error_errno(r, "Failed to read machine ID from container image: %m");
2825
2826 if (sd_id128_is_null(arg_uuid)) {
2827 r = sd_id128_randomize(&arg_uuid);
2828 if (r < 0)
2829 return log_error_errno(r, "Failed to acquire randomized machine UUID: %m");
2830 }
2831 } else {
2832 if (sd_id128_is_null(id))
2833 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2834 "Machine ID in container image is zero, refusing.");
2835
2836 arg_uuid = id;
2837 }
2838
2839 return 0;
2840 }
2841
2842 static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
2843 int r;
2844
2845 assert(directory);
2846
2847 if (arg_userns_mode == USER_NAMESPACE_NO || arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_CHOWN)
2848 return 0;
2849
2850 r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
2851 if (r == -EOPNOTSUPP)
2852 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
2853 if (r == -EBADE)
2854 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
2855 if (r < 0)
2856 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
2857 if (r == 0)
2858 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
2859 else
2860 log_debug("Patched directory tree to match UID/GID range.");
2861
2862 return r;
2863 }
2864
2865 /*
2866 * Return values:
2867 * < 0 : wait_for_terminate() failed to get the state of the
2868 * container, the container was terminated by a signal, or
2869 * failed for an unknown reason. No change is made to the
2870 * container argument.
2871 * > 0 : The program executed in the container terminated with an
2872 * error. The exit code of the program executed in the
2873 * container is returned. The container argument has been set
2874 * to CONTAINER_TERMINATED.
2875 * 0 : The container is being rebooted, has been shut down or exited
2876 * successfully. The container argument has been set to either
2877 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2878 *
2879 * That is, success is indicated by a return value of zero, and an
2880 * error is indicated by a non-zero value.
2881 */
2882 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2883 siginfo_t status;
2884 int r;
2885
2886 r = wait_for_terminate(pid, &status);
2887 if (r < 0)
2888 return log_warning_errno(r, "Failed to wait for container: %m");
2889
2890 switch (status.si_code) {
2891
2892 case CLD_EXITED:
2893 if (status.si_status == 0)
2894 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
2895 else
2896 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2897
2898 *container = CONTAINER_TERMINATED;
2899 return status.si_status;
2900
2901 case CLD_KILLED:
2902 if (status.si_status == SIGINT) {
2903 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
2904 *container = CONTAINER_TERMINATED;
2905 return 0;
2906
2907 } else if (status.si_status == SIGHUP) {
2908 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
2909 *container = CONTAINER_REBOOTED;
2910 return 0;
2911 }
2912
2913 _fallthrough_;
2914 case CLD_DUMPED:
2915 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2916 "Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2917
2918 default:
2919 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2920 "Container %s failed due to unknown reason.", arg_machine);
2921 }
2922 }
2923
2924 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2925 pid_t pid;
2926
2927 pid = PTR_TO_PID(userdata);
2928 if (pid > 0) {
2929 if (kill(pid, arg_kill_signal) >= 0) {
2930 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2931 sd_event_source_set_userdata(s, NULL);
2932 return 0;
2933 }
2934 }
2935
2936 sd_event_exit(sd_event_source_get_event(s), 0);
2937 return 0;
2938 }
2939
2940 static int on_sigchld(sd_event_source *s, const struct signalfd_siginfo *ssi, void *userdata) {
2941 pid_t pid;
2942
2943 assert(s);
2944 assert(ssi);
2945
2946 pid = PTR_TO_PID(userdata);
2947
2948 for (;;) {
2949 siginfo_t si = {};
2950
2951 if (waitid(P_ALL, 0, &si, WNOHANG|WNOWAIT|WEXITED) < 0)
2952 return log_error_errno(errno, "Failed to waitid(): %m");
2953 if (si.si_pid == 0) /* No pending children. */
2954 break;
2955 if (si.si_pid == pid) {
2956 /* The main process we care for has exited. Return from
2957 * signal handler but leave the zombie. */
2958 sd_event_exit(sd_event_source_get_event(s), 0);
2959 break;
2960 }
2961
2962 /* Reap all other children. */
2963 (void) waitid(P_PID, si.si_pid, &si, WNOHANG|WEXITED);
2964 }
2965
2966 return 0;
2967 }
2968
2969 static int on_request_stop(sd_bus_message *m, void *userdata, sd_bus_error *error) {
2970 pid_t pid;
2971
2972 assert(m);
2973
2974 pid = PTR_TO_PID(userdata);
2975
2976 if (arg_kill_signal > 0) {
2977 log_info("Container termination requested. Attempting to halt container.");
2978 (void) kill(pid, arg_kill_signal);
2979 } else {
2980 log_info("Container termination requested. Exiting.");
2981 sd_event_exit(sd_bus_get_event(sd_bus_message_get_bus(m)), 0);
2982 }
2983
2984 return 0;
2985 }
2986
2987 static int determine_names(void) {
2988 int r;
2989
2990 if (arg_template && !arg_directory && arg_machine) {
2991
2992 /* If --template= was specified then we should not
2993 * search for a machine, but instead create a new one
2994 * in /var/lib/machine. */
2995
2996 arg_directory = path_join("/var/lib/machines", arg_machine);
2997 if (!arg_directory)
2998 return log_oom();
2999 }
3000
3001 if (!arg_image && !arg_directory) {
3002 if (arg_machine) {
3003 _cleanup_(image_unrefp) Image *i = NULL;
3004
3005 r = image_find(IMAGE_MACHINE, arg_machine, NULL, &i);
3006 if (r == -ENOENT)
3007 return log_error_errno(r, "No image for machine '%s'.", arg_machine);
3008 if (r < 0)
3009 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
3010
3011 if (IN_SET(i->type, IMAGE_RAW, IMAGE_BLOCK))
3012 r = free_and_strdup(&arg_image, i->path);
3013 else
3014 r = free_and_strdup(&arg_directory, i->path);
3015 if (r < 0)
3016 return log_oom();
3017
3018 if (!arg_ephemeral)
3019 arg_read_only = arg_read_only || i->read_only;
3020 } else {
3021 r = safe_getcwd(&arg_directory);
3022 if (r < 0)
3023 return log_error_errno(r, "Failed to determine current directory: %m");
3024 }
3025
3026 if (!arg_directory && !arg_image)
3027 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine path, please use -D or -i.");
3028 }
3029
3030 if (!arg_machine) {
3031 if (arg_directory && path_equal(arg_directory, "/"))
3032 arg_machine = gethostname_malloc();
3033 else if (arg_image) {
3034 char *e;
3035
3036 arg_machine = strdup(basename(arg_image));
3037
3038 /* Truncate suffix if there is one */
3039 e = endswith(arg_machine, ".raw");
3040 if (e)
3041 *e = 0;
3042 } else
3043 arg_machine = strdup(basename(arg_directory));
3044 if (!arg_machine)
3045 return log_oom();
3046
3047 hostname_cleanup(arg_machine);
3048 if (!hostname_is_valid(arg_machine, 0))
3049 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine machine name automatically, please use -M.");
3050
3051 /* Copy the machine name before the random suffix is added below, otherwise we won't be able
3052 * to match fixed config file names. */
3053 arg_settings_filename = strjoin(arg_machine, ".nspawn");
3054 if (!arg_settings_filename)
3055 return log_oom();
3056
3057 /* Add a random suffix when this is an ephemeral machine, so that we can run many
3058 * instances at once without manually having to specify -M each time. */
3059 if (arg_ephemeral)
3060 if (strextendf(&arg_machine, "-%016" PRIx64, random_u64()) < 0)
3061 return log_oom();
3062 } else {
3063 arg_settings_filename = strjoin(arg_machine, ".nspawn");
3064 if (!arg_settings_filename)
3065 return log_oom();
3066 }
3067
3068 return 0;
3069 }
3070
/* Resolve the path stored in '*p' via chase_symlinks() (with the given flags) and replace '*p' with the
 * resolved path, freeing the old string. A NULL '*p' is left alone.
 *
 * Returns 0 on success, a negative errno-style value if the path cannot be resolved. */
static int chase_symlinks_and_update(char **p, unsigned flags) {
        char *resolved;
        int r;

        assert(p);

        if (*p) {
                r = chase_symlinks(*p, NULL, flags, &resolved, NULL);
                if (r < 0)
                        return log_error_errno(r, "Failed to resolve path %s: %m", *p);

                return free_and_replace(*p, resolved);
        }

        return 0;
}
3086
3087 static int determine_uid_shift(const char *directory) {
3088
3089 if (arg_userns_mode == USER_NAMESPACE_NO) {
3090 arg_uid_shift = 0;
3091 return 0;
3092 }
3093
3094 if (arg_uid_shift == UID_INVALID) {
3095 struct stat st;
3096
3097 /* Read the UID shift off the image. Maybe we can reuse this to avoid chowning. */
3098
3099 if (stat(directory, &st) < 0)
3100 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
3101
3102 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
3103
3104 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000)))
3105 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
3106 "UID and GID base of %s don't match.", directory);
3107
3108 arg_uid_range = UINT32_C(0x10000);
3109
3110 if (arg_uid_shift != 0) {
3111 /* If the image is shifted already, then we'll fall back to classic chowning, for
3112 * compatibility (and simplicity), or refuse if mapping is explicitly requested. */
3113
3114 if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_AUTO) {
3115 log_debug("UID base of %s is non-zero, not using UID mapping.", directory);
3116 arg_userns_ownership = USER_NAMESPACE_OWNERSHIP_CHOWN;
3117 } else if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_MAP)
3118 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
3119 "UID base of %s is not zero, UID mapping not supported.", directory);
3120 }
3121 }
3122
3123 if (!userns_shift_range_valid(arg_uid_shift, arg_uid_range))
3124 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "UID base too high for UID range.");
3125
3126 return 0;
3127 }
3128
3129 static unsigned long effective_clone_ns_flags(void) {
3130 unsigned long flags = arg_clone_ns_flags;
3131
3132 if (arg_private_network)
3133 flags |= CLONE_NEWNET;
3134 if (arg_use_cgns)
3135 flags |= CLONE_NEWCGROUP;
3136 if (arg_userns_mode != USER_NAMESPACE_NO)
3137 flags |= CLONE_NEWUSER;
3138
3139 return flags;
3140 }
3141
3142 static int patch_sysctl(void) {
3143
3144 /* This table is inspired by runc's sysctl() function */
3145 static const struct {
3146 const char *key;
3147 bool prefix;
3148 unsigned long clone_flags;
3149 } safe_sysctl[] = {
3150 { "kernel.hostname", false, CLONE_NEWUTS },
3151 { "kernel.domainname", false, CLONE_NEWUTS },
3152 { "kernel.msgmax", false, CLONE_NEWIPC },
3153 { "kernel.msgmnb", false, CLONE_NEWIPC },
3154 { "kernel.msgmni", false, CLONE_NEWIPC },
3155 { "kernel.sem", false, CLONE_NEWIPC },
3156 { "kernel.shmall", false, CLONE_NEWIPC },
3157 { "kernel.shmmax", false, CLONE_NEWIPC },
3158 { "kernel.shmmni", false, CLONE_NEWIPC },
3159 { "fs.mqueue.", true, CLONE_NEWIPC },
3160 { "net.", true, CLONE_NEWNET },
3161 };
3162
3163 unsigned long flags;
3164 int r;
3165
3166 flags = effective_clone_ns_flags();
3167
3168 STRV_FOREACH_PAIR(k, v, arg_sysctl) {
3169 bool good = false;
3170 size_t i;
3171
3172 for (i = 0; i < ELEMENTSOF(safe_sysctl); i++) {
3173
3174 if (!FLAGS_SET(flags, safe_sysctl[i].clone_flags))
3175 continue;
3176
3177 if (safe_sysctl[i].prefix)
3178 good = startswith(*k, safe_sysctl[i].key);
3179 else
3180 good = streq(*k, safe_sysctl[i].key);
3181
3182 if (good)
3183 break;
3184 }
3185
3186 if (!good)
3187 return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Refusing to write to sysctl '%s', as it is not safe in the selected namespaces.", *k);
3188
3189 r = sysctl_write(*k, *v);
3190 if (r < 0)
3191 return log_error_errno(r, "Failed to write sysctl '%s': %m", *k);
3192 }
3193
3194 return 0;
3195 }
3196
/* Body of the "inner" child process: finishes setting up the container environment from the inside and
 * finally executes the payload.
 *
 * Parameters:
 *   barrier           - synchronization primitive shared with the parent; the numbered barrier steps
 *                       (#1..#5) below are the hand-over points.
 *   directory         - only checked to be non-NULL here.
 *   secondary         - if true and no explicit personality is configured, PER_LINUX32 is selected.
 *   kmsg_socket       - passed to setup_kmsg(), closed afterwards.
 *   rtnl_socket       - passed to expose_port_send_rtnl() if port exposure is configured, closed
 *                       afterwards.
 *   master_pty_socket - socket the master side of the freshly allocated console pty is sent over.
 *   fds               - file descriptors to hand to the payload; announced via $LISTEN_FDS/$LISTEN_PID.
 *   os_release_pairs  - extra environment assignments merged into the payload's environment.
 *
 * On success this function does not return, since the process image is replaced by the payload. It
 * returns a negative errno-style value if any setup step or the final exec fails. */
static int inner_child(
                Barrier *barrier,
                const char *directory,
                bool secondary,
                int kmsg_socket,
                int rtnl_socket,
                int master_pty_socket,
                FDSet *fds,
                char **os_release_pairs) {

        _cleanup_free_ char *home = NULL;
        /* Index of the next free slot in envp[]; slot 0 is pre-filled with PATH. */
        size_t n_env = 1;
        char *envp[] = {
                (char*) "PATH=" DEFAULT_PATH_COMPAT,
                NULL, /* container */
                NULL, /* TERM */
                NULL, /* HOME */
                NULL, /* USER */
                NULL, /* LOGNAME */
                NULL, /* container_uuid */
                NULL, /* LISTEN_FDS */
                NULL, /* LISTEN_PID */
                NULL, /* NOTIFY_SOCKET */
                NULL, /* CREDENTIALS_DIRECTORY */
                NULL, /* LANG */
                NULL
        };
        const char *exec_target;
        _cleanup_strv_free_ char **env_use = NULL;
        int r, which_failed;

        /* This is the "inner" child process, i.e. the one forked off by the "outer" child process, which is the one
         * the container manager itself forked off. At the time of clone() it gained its own CLONE_NEWNS, CLONE_NEWPID,
         * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER namespaces. Note that it has its own CLONE_NEWNS namespace,
         * separate from the CLONE_NEWNS created for the "outer" child, and also separate from the host's CLONE_NEWNS
         * namespace. The reason for having two levels of CLONE_NEWNS namespaces is that the "inner" one is owned by
         * the CLONE_NEWUSER namespace of the container, while the "outer" one is owned by the host's CLONE_NEWUSER
         * namespace.
         *
         * Note at this point we have no CLONE_NEWNET namespace yet. We'll acquire that one later through
         * unshare(). See below. */

        assert(barrier);
        assert(directory);
        assert(kmsg_socket >= 0);

        log_debug("Inner child is initializing.");

        if (arg_userns_mode != USER_NAMESPACE_NO) {
                /* Tell the parent, that it now can write the UID map. */
                (void) barrier_place(barrier); /* #1 */

                /* Wait until the parent wrote the UID map */
                if (!barrier_place_and_sync(barrier)) /* #2 */
                        return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Parent died too early");

                /* Become the new root user inside our namespace */
                r = reset_uid_gid();
                if (r < 0)
                        return log_error_errno(r, "Couldn't become new root: %m");

                /* Creating a new user namespace means all MS_SHARED mounts become MS_SLAVE. Let's put them
                 * back to MS_SHARED here, since that's what we want as defaults. (This will not reconnect
                 * propagation, but simply create new peer groups for all our mounts). */
                r = mount_follow_verbose(LOG_ERR, NULL, "/", NULL, MS_SHARED|MS_REC, NULL);
                if (r < 0)
                        return r;
        }

        r = mount_all(NULL,
                      arg_mount_settings | MOUNT_IN_USERNS,
                      arg_uid_shift,
                      arg_selinux_apifs_context);
        if (r < 0)
                return r;

        /* Only create a fresh network namespace if a private network was requested and no existing
         * namespace path was configured. */
        if (!arg_network_namespace_path && arg_private_network) {
                r = unshare(CLONE_NEWNET);
                if (r < 0)
                        return log_error_errno(errno, "Failed to unshare network namespace: %m");

                /* Tell the parent that it can setup network interfaces. */
                (void) barrier_place(barrier); /* #3 */
        }

        r = mount_sysfs(NULL, arg_mount_settings);
        if (r < 0)
                return r;

        /* Wait until we are cgroup-ified, so that we
         * can mount the right cgroup path writable */
        if (!barrier_place_and_sync(barrier)) /* #4 */
                return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
                                       "Parent died too early");

        if (arg_use_cgns) {
                r = unshare(CLONE_NEWCGROUP);
                if (r < 0)
                        return log_error_errno(errno, "Failed to unshare cgroup namespace: %m");
                r = mount_cgroups(
                                "",
                                arg_unified_cgroup_hierarchy,
                                arg_userns_mode != USER_NAMESPACE_NO,
                                arg_uid_shift,
                                arg_uid_range,
                                arg_selinux_apifs_context,
                                true);
        } else
                r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
        /* This check covers the result of whichever of the two cgroup mount calls above was made. */
        if (r < 0)
                return r;

        r = setup_boot_id();
        if (r < 0)
                return r;

        r = setup_kmsg(kmsg_socket);
        if (r < 0)
                return r;
        kmsg_socket = safe_close(kmsg_socket);

        r = mount_custom(
                        "/",
                        arg_custom_mounts,
                        arg_n_custom_mounts,
                        0,
                        0,
                        arg_selinux_apifs_context,
                        MOUNT_NON_ROOT_ONLY | MOUNT_IN_USERNS);
        if (r < 0)
                return r;

        if (setsid() < 0)
                return log_error_errno(errno, "setsid() failed: %m");

        /* Best effort: a failure to bring up the loopback device is not fatal. */
        if (arg_private_network)
                (void) loopback_setup();

        if (arg_expose_ports) {
                r = expose_port_send_rtnl(rtnl_socket);
                if (r < 0)
                        return r;
                rtnl_socket = safe_close(rtnl_socket);
        }

        if (arg_console_mode != CONSOLE_PIPE) {
                _cleanup_close_ int master = -1;
                _cleanup_free_ char *console = NULL;

                /* Allocate a pty and make it available as /dev/console. */
                master = openpt_allocate(O_RDWR|O_NONBLOCK, &console);
                if (master < 0)
                        return log_error_errno(master, "Failed to allocate a pty: %m");

                r = setup_dev_console(console);
                if (r < 0)
                        return log_error_errno(r, "Failed to set up /dev/console: %m");

                /* Hand the master side over via the socket. */
                r = send_one_fd(master_pty_socket, master, 0);
                if (r < 0)
                        return log_error_errno(r, "Failed to send master fd: %m");
                master_pty_socket = safe_close(master_pty_socket);

                r = setup_stdio_as_dev_console();
                if (r < 0)
                        return r;
        }

        r = patch_sysctl();
        if (r < 0)
                return r;

        if (arg_oom_score_adjust_set) {
                r = set_oom_score_adjust(arg_oom_score_adjust);
                if (r < 0)
                        return log_error_errno(r, "Failed to adjust OOM score: %m");
        }

        if (arg_cpu_set.set)
                if (sched_setaffinity(0, arg_cpu_set.allocated, arg_cpu_set.set) < 0)
                        return log_error_errno(errno, "Failed to set CPU affinity: %m");

        /* Best effort: the result of setting the hostname is ignored. */
        (void) setup_hostname();

        /* An explicitly configured personality takes precedence over the one implied by 'secondary'. */
        if (arg_personality != PERSONALITY_INVALID) {
                r = safe_personality(arg_personality);
                if (r < 0)
                        return log_error_errno(r, "personality() failed: %m");
        } else if (secondary) {
                r = safe_personality(PER_LINUX32);
                if (r < 0)
                        return log_error_errno(r, "personality() failed: %m");
        }

        r = setrlimit_closest_all((const struct rlimit *const*) arg_rlimit, &which_failed);
        if (r < 0)
                return log_error_errno(r, "Failed to apply resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));

        /* If a seccomp filter object is configured (arg_seccomp) it is loaded as is; otherwise, and always
         * when built without seccomp support, setup_seccomp() is called instead. Note how the
         * preprocessor conditional splices the "else" in front of the unconditional block. */
#if HAVE_SECCOMP
        if (arg_seccomp) {

                if (is_seccomp_available()) {

                        r = seccomp_load(arg_seccomp);
                        if (ERRNO_IS_SECCOMP_FATAL(r))
                                return log_error_errno(r, "Failed to install seccomp filter: %m");
                        if (r < 0)
                                log_debug_errno(r, "Failed to install seccomp filter: %m");
                }
        } else
#endif
        {
                r = setup_seccomp(arg_caps_retain, arg_syscall_allow_list, arg_syscall_deny_list);
                if (r < 0)
                        return r;
        }

        if (arg_suppress_sync) {
#if HAVE_SECCOMP
                r = seccomp_suppress_sync();
                if (r < 0)
                        log_debug_errno(r, "Failed to install sync() suppression seccomp filter, ignoring: %m");
#else
                log_debug("systemd is built without SECCOMP support. Ignoring --suppress-sync= command line option and SuppressSync= setting.");
#endif
        }

#if HAVE_SELINUX
        if (arg_selinux_context)
                if (setexeccon(arg_selinux_context) < 0)
                        return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
#endif

        /* Make sure we keep the caps across the uid/gid dropping, so that we can retain some selected caps
         * if we need to later on. */
        if (prctl(PR_SET_KEEPCAPS, 1) < 0)
                return log_error_errno(errno, "Failed to set PR_SET_KEEPCAPS: %m");

        /* Numeric UID/GID settings are applied directly; otherwise the user is switched to by name
         * (arg_user), which may also fill in 'home'. */
        if (uid_is_valid(arg_uid) || gid_is_valid(arg_gid))
                r = change_uid_gid_raw(arg_uid, arg_gid, arg_supplementary_gids, arg_n_supplementary_gids, arg_console_mode != CONSOLE_PIPE);
        else
                r = change_uid_gid(arg_user, arg_console_mode != CONSOLE_PIPE, &home);
        if (r < 0)
                return r;

        r = drop_capabilities(getuid());
        if (r < 0)
                return log_error_errno(r, "Dropping capabilities failed: %m");

        if (arg_no_new_privileges)
                if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
                        return log_error_errno(errno, "Failed to disable new privileges: %m");

        /* From here on the payload's environment block is assembled in envp[], one slot at a time. */

        /* LXC sets container=lxc, so follow the scheme here */
        envp[n_env++] = strjoina("container=", arg_container_service_name);

        /* Pass our own $TERM through, if there is one. */
        envp[n_env] = strv_find_prefix(environ, "TERM=");
        if (envp[n_env])
                n_env++;

        if (home || !uid_is_valid(arg_uid) || arg_uid == 0)
                if (asprintf(envp + n_env++, "HOME=%s", home ?: "/root") < 0)
                        return log_oom();

        if (arg_user || !uid_is_valid(arg_uid) || arg_uid == 0)
                if (asprintf(envp + n_env++, "USER=%s", arg_user ?: "root") < 0 ||
                    asprintf(envp + n_env++, "LOGNAME=%s", arg_user ? arg_user : "root") < 0)
                        return log_oom();

        assert(!sd_id128_is_null(arg_uuid));

        if (asprintf(envp + n_env++, "container_uuid=%s", SD_ID128_TO_UUID_STRING(arg_uuid)) < 0)
                return log_oom();

        if (fdset_size(fds) > 0) {
                /* The fds are meant to be inherited by the payload, hence must survive the exec. */
                r = fdset_cloexec(fds, false);
                if (r < 0)
                        return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");

                if ((asprintf(envp + n_env++, "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
                    (asprintf(envp + n_env++, "LISTEN_PID=1") < 0))
                        return log_oom();
        }
        if (asprintf(envp + n_env++, "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
                return log_oom();

        if (arg_n_credentials > 0) {
                envp[n_env] = strdup("CREDENTIALS_DIRECTORY=/run/host/credentials");
                if (!envp[n_env])
                        return log_oom();
                n_env++;
        }

        /* $LANG is only set when we are not booting an init system. */
        if (arg_start_mode != START_BOOT) {
                envp[n_env] = strdup("LANG=" SYSTEMD_NSPAWN_LOCALE);
                if (!envp[n_env])
                        return log_oom();
                n_env++;
        }

        /* Combine our own settings with the os-release data and the user-configured variables. */
        env_use = strv_env_merge(envp, os_release_pairs, arg_setenv);
        if (!env_use)
                return log_oom();

        /* Let the parent know that we are ready and
         * wait until the parent is ready with the
         * setup, too... */
        if (!barrier_place_and_sync(barrier)) /* #5 */
                return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Parent died too early");

        if (arg_chdir)
                if (chdir(arg_chdir) < 0)
                        return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);

        if (arg_start_mode == START_PID2) {
                r = stub_pid1(arg_uuid);
                if (r < 0)
                        return r;
        }

        if (arg_console_mode != CONSOLE_PIPE) {
                /* So far our pty wasn't controlled by any process. Finally, it's time to change that, if we
                 * are configured for that. Acquire it as controlling tty. */
                if (ioctl(STDIN_FILENO, TIOCSCTTY) < 0)
                        return log_error_errno(errno, "Failed to acquire controlling TTY: %m");
        }

        log_debug("Inner child completed, invoking payload.");

        /* Now, explicitly close the log, so that we then can close all remaining fds. Closing the log explicitly first
         * has the benefit that the logging subsystem knows about it, and is thus ready to be reopened should we need
         * it again. Note that the other fds closed here are at least the locking and barrier fds. */
        log_close();
        log_set_open_when_needed(true);

        (void) fdset_close_others(fds);

        if (arg_start_mode == START_BOOT) {
                char **a;
                size_t m;

                /* Automatically search for the init system */

                /* Build an argument vector with one free slot in front (for the init binary's path),
                 * followed by the configured parameters and the terminating NULL. */
                m = strv_length(arg_parameters);
                a = newa(char*, m + 2);
                memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
                a[1 + m] = NULL;

                /* execve() only returns on failure, in which case the next candidate is tried. */
                FOREACH_STRING(init,
                               "/usr/lib/systemd/systemd",
                               "/lib/systemd/systemd",
                               "/sbin/init") {
                        a[0] = (char*) init;
                        execve(a[0], a, env_use);
                }

                exec_target = "/usr/lib/systemd/systemd, /lib/systemd/systemd, /sbin/init";
        } else if (!strv_isempty(arg_parameters)) {
                const char *dollar_path;

                exec_target = arg_parameters[0];

                /* Use the user supplied search $PATH if there is one, or DEFAULT_PATH_COMPAT if not to search the
                 * binary. */
                dollar_path = strv_env_get(env_use, "PATH");
                if (dollar_path) {
                        if (setenv("PATH", dollar_path, 1) < 0)
                                return log_error_errno(errno, "Failed to update $PATH: %m");
                }

                execvpe(arg_parameters[0], arg_parameters, env_use);
        } else {
                if (!arg_chdir)
                        /* If we cannot change the directory, we'll end up in /, that is expected. */
                        (void) chdir(home ?: "/root");

                /* No command given: start a login shell (note the leading dash in argv[0]), trying the
                 * default shell first and then the usual fallbacks. */
                execle(DEFAULT_USER_SHELL, "-" DEFAULT_USER_SHELL_NAME, NULL, env_use);
                if (!streq(DEFAULT_USER_SHELL, "/bin/bash"))
                        execle("/bin/bash", "-bash", NULL, env_use);
                if (!streq(DEFAULT_USER_SHELL, "/bin/sh"))
                        execle("/bin/sh", "-sh", NULL, env_use);

                exec_target = DEFAULT_USER_SHELL ", /bin/bash, /bin/sh";
        }

        /* Only reached if every exec attempt above failed; errno stems from the last attempt. */
        return log_error_errno(errno, "execv(%s) failed: %m", exec_target);
}
3584
3585 static int setup_notify_child(void) {
3586 _cleanup_close_ int fd = -1;
3587 static const union sockaddr_union sa = {
3588 .un.sun_family = AF_UNIX,
3589 .un.sun_path = NSPAWN_NOTIFY_SOCKET_PATH,
3590 };
3591 int r;
3592
3593 fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
3594 if (fd < 0)
3595 return log_error_errno(errno, "Failed to allocate notification socket: %m");
3596
3597 (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
3598 (void) sockaddr_un_unlink(&sa.un);
3599
3600 r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
3601 if (r < 0)
3602 return log_error_errno(errno, "bind(" NSPAWN_NOTIFY_SOCKET_PATH ") failed: %m");
3603
3604 r = userns_lchown(NSPAWN_NOTIFY_SOCKET_PATH, 0, 0);
3605 if (r < 0)
3606 return log_error_errno(r, "Failed to chown " NSPAWN_NOTIFY_SOCKET_PATH ": %m");
3607
3608 r = setsockopt_int(fd, SOL_SOCKET, SO_PASSCRED, true);
3609 if (r < 0)
3610 return log_error_errno(r, "SO_PASSCRED failed: %m");
3611
3612 return TAKE_FD(fd);
3613 }
3614
3615 static int outer_child(
3616 Barrier *barrier,
3617 const char *directory,
3618 DissectedImage *dissected_image,
3619 bool secondary,
3620 int pid_socket,
3621 int uuid_socket,
3622 int notify_socket,
3623 int kmsg_socket,
3624 int rtnl_socket,
3625 int uid_shift_socket,
3626 int master_pty_socket,
3627 int unified_cgroup_hierarchy_socket,
3628 FDSet *fds,
3629 int netns_fd) {
3630
3631 _cleanup_(bind_user_context_freep) BindUserContext *bind_user_context = NULL;
3632 _cleanup_strv_free_ char **os_release_pairs = NULL;
3633 _cleanup_close_ int fd = -1;
3634 bool idmap = false;
3635 const char *p;
3636 pid_t pid;
3637 ssize_t l;
3638 int r;
3639
3640 /* This is the "outer" child process, i.e the one forked off by the container manager itself. It
3641 * already has its own CLONE_NEWNS namespace (which was created by the clone()). It still lives in
3642 * the host's CLONE_NEWPID, CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER and CLONE_NEWNET
3643 * namespaces. After it completed a number of initializations a second child (the "inner" one) is
3644 * forked off it, and it exits. */
3645
3646 assert(barrier);
3647 assert(directory);
3648 assert(pid_socket >= 0);
3649 assert(uuid_socket >= 0);
3650 assert(notify_socket >= 0);
3651 assert(master_pty_socket >= 0);
3652 assert(kmsg_socket >= 0);
3653
3654 log_debug("Outer child is initializing.");
3655
3656 r = load_os_release_pairs_with_prefix("/", "container_host_", &os_release_pairs);
3657 if (r < 0)
3658 log_debug_errno(r, "Failed to read os-release from host for container, ignoring: %m");
3659
3660 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
3661 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3662
3663 r = reset_audit_loginuid();
3664 if (r < 0)
3665 return r;
3666
3667 /* Mark everything as slave, so that we still receive mounts from the real root, but don't propagate
3668 * mounts to the real root. */
3669 r = mount_follow_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
3670 if (r < 0)
3671 return r;
3672
3673 if (dissected_image) {
3674 /* If we are operating on a disk image, then mount its root directory now, but leave out the
3675 * rest. We can read the UID shift from it if we need to. Further down we'll mount the rest,
3676 * but then with the uid shift known. That way we can mount VFAT file systems shifted to the
3677 * right place right away. This makes sure ESP partitions and userns are compatible. */
3678
3679 r = dissected_image_mount_and_warn(
3680 dissected_image,
3681 directory,
3682 arg_uid_shift,
3683 arg_uid_range,
3684 DISSECT_IMAGE_MOUNT_ROOT_ONLY|
3685 DISSECT_IMAGE_DISCARD_ON_LOOP|
3686 DISSECT_IMAGE_USR_NO_ROOT|
3687 (arg_read_only ? DISSECT_IMAGE_READ_ONLY : DISSECT_IMAGE_FSCK|DISSECT_IMAGE_GROWFS)|
3688 (arg_start_mode == START_BOOT ? DISSECT_IMAGE_VALIDATE_OS : 0));
3689 if (r < 0)
3690 return r;
3691 }
3692
3693 r = determine_uid_shift(directory);
3694 if (r < 0)
3695 return r;
3696
3697 if (arg_userns_mode != USER_NAMESPACE_NO) {
3698 /* Let the parent know which UID shift we read from the image */
3699 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
3700 if (l < 0)
3701 return log_error_errno(errno, "Failed to send UID shift: %m");
3702 if (l != sizeof(arg_uid_shift))
3703 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3704 "Short write while sending UID shift.");
3705
3706 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3707 /* When we are supposed to pick the UID shift, the parent will check now whether the
3708 * UID shift we just read from the image is available. If yes, it will send the UID
3709 * shift back to us, if not it will pick a different one, and send it back to us. */
3710
3711 l = recv(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
3712 if (l < 0)
3713 return log_error_errno(errno, "Failed to recv UID shift: %m");
3714 if (l != sizeof(arg_uid_shift))
3715 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3716 "Short read while receiving UID shift.");
3717 }
3718
3719 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
3720 "Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
3721 }
3722
3723 if (path_equal(directory, "/")) {
3724 /* If the directory we shall boot is the host, let's operate on a bind mount at a different
3725 * place, so that we can make changes to its mount structure (for example, to implement
3726 * --volatile=) without this interfering with our ability to access files such as
3727 * /etc/localtime to copy into the container. Note that we use a fixed place for this
3728 * (instead of a temporary directory, since we are living in our own mount namspace here
3729 * already, and thus don't need to be afraid of colliding with anyone else's mounts). */
3730 (void) mkdir_p("/run/systemd/nspawn-root", 0755);
3731
3732 r = mount_nofollow_verbose(LOG_ERR, "/", "/run/systemd/nspawn-root", NULL, MS_BIND|MS_REC, NULL);
3733 if (r < 0)
3734 return r;
3735
3736 directory = "/run/systemd/nspawn-root";
3737 }
3738
3739 r = setup_pivot_root(
3740 directory,
3741 arg_pivot_root_new,
3742 arg_pivot_root_old);
3743 if (r < 0)
3744 return r;
3745
3746 r = setup_volatile_mode(
3747 directory,
3748 arg_volatile_mode,
3749 arg_uid_shift,
3750 arg_selinux_apifs_context);
3751 if (r < 0)
3752 return r;
3753
3754 r = bind_user_prepare(
3755 directory,
3756 arg_bind_user,
3757 arg_uid_shift,
3758 arg_uid_range,
3759 &arg_custom_mounts, &arg_n_custom_mounts,
3760 &bind_user_context);
3761 if (r < 0)
3762 return r;
3763
3764 if (arg_userns_mode != USER_NAMESPACE_NO && bind_user_context) {
3765 /* Send the user maps we determined to the parent, so that it installs it in our user
3766 * namespace UID map table */
3767
3768 for (size_t i = 0; i < bind_user_context->n_data; i++) {
3769 uid_t map[] = {
3770 bind_user_context->data[i].payload_user->uid,
3771 bind_user_context->data[i].host_user->uid,
3772 (uid_t) bind_user_context->data[i].payload_group->gid,
3773 (uid_t) bind_user_context->data[i].host_group->gid,
3774 };
3775
3776 l = send(uid_shift_socket, map, sizeof(map), MSG_NOSIGNAL);
3777 if (l < 0)
3778 return log_error_errno(errno, "Failed to send user UID map: %m");
3779 if (l != sizeof(map))
3780 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3781 "Short write while sending user UID map.");
3782 }
3783 }
3784
3785 r = mount_custom(
3786 directory,
3787 arg_custom_mounts,
3788 arg_n_custom_mounts,
3789 arg_uid_shift,
3790 arg_uid_range,
3791 arg_selinux_apifs_context,
3792 MOUNT_ROOT_ONLY);
3793 if (r < 0)
3794 return r;
3795
3796 /* Make sure we always have a mount that we can move to root later on. */
3797 r = make_mount_point(directory);
3798 if (r < 0)
3799 return r;
3800
3801 if (arg_userns_mode != USER_NAMESPACE_NO &&
3802 IN_SET(arg_userns_ownership, USER_NAMESPACE_OWNERSHIP_MAP, USER_NAMESPACE_OWNERSHIP_AUTO) &&
3803 arg_uid_shift != 0) {
3804
3805 r = remount_idmap(directory, arg_uid_shift, arg_uid_range, REMOUNT_IDMAP_HOST_ROOT);
3806 if (r == -EINVAL || ERRNO_IS_NOT_SUPPORTED(r)) {
3807 /* This might fail because the kernel or file system doesn't support idmapping. We
3808 * can't really distinguish this nicely, nor do we have any guarantees about the
3809 * error codes we see, could be EOPNOTSUPP or EINVAL. */
3810 if (arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_AUTO)
3811 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
3812 "ID mapped mounts are apparently not available, sorry.");
3813
3814 log_debug("ID mapped mounts are apparently not available on this kernel or for the selected file system, reverting to recursive chown()ing.");
3815 arg_userns_ownership = USER_NAMESPACE_OWNERSHIP_CHOWN;
3816 } else if (r < 0)
3817 return log_error_errno(r, "Failed to set up ID mapped mounts: %m");
3818 else {
3819 log_debug("ID mapped mounts available, making use of them.");
3820 idmap = true;
3821 }
3822 }
3823
3824 if (dissected_image) {
3825 /* Now we know the uid shift, let's now mount everything else that might be in the image. */
3826 r = dissected_image_mount(
3827 dissected_image,
3828 directory,
3829 arg_uid_shift,
3830 arg_uid_range,
3831 DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY|
3832 DISSECT_IMAGE_DISCARD_ON_LOOP|
3833 DISSECT_IMAGE_USR_NO_ROOT|
3834 (arg_read_only ? DISSECT_IMAGE_READ_ONLY : DISSECT_IMAGE_FSCK|DISSECT_IMAGE_GROWFS)|
3835 (idmap ? DISSECT_IMAGE_MOUNT_IDMAPPED : 0));
3836 if (r == -EUCLEAN)
3837 return log_error_errno(r, "File system check for image failed: %m");
3838 if (r < 0)
3839 return log_error_errno(r, "Failed to mount image file system: %m");
3840 }
3841
3842 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
3843 /* OK, we don't know yet which cgroup mode to use yet. Let's figure it out, and tell the parent. */
3844
3845 r = detect_unified_cgroup_hierarchy_from_image(directory);
3846 if (r < 0)
3847 return r;
3848
3849 l = send(unified_cgroup_hierarchy_socket, &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), MSG_NOSIGNAL);
3850 if (l < 0)
3851 return log_error_errno(errno, "Failed to send cgroup mode: %m");
3852 if (l != sizeof(arg_unified_cgroup_hierarchy))
3853 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3854 "Short write while sending cgroup mode.");
3855
3856 unified_cgroup_hierarchy_socket = safe_close(unified_cgroup_hierarchy_socket);
3857 }
3858
3859 /* Mark everything as shared so our mounts get propagated down. This is required to make new bind
3860 * mounts available in systemd services inside the container that create a new mount namespace. See
3861 * https://github.com/systemd/systemd/issues/3860 Further submounts (such as /dev) done after this
3862 * will inherit the shared propagation mode.
3863 *
3864 * IMPORTANT: Do not overmount the root directory anymore from now on to enable moving the root
3865 * directory mount to root later on.
3866 * https://github.com/systemd/systemd/issues/3847#issuecomment-562735251
3867 */
3868 r = mount_nofollow_verbose(LOG_ERR, NULL, directory, NULL, MS_SHARED|MS_REC, NULL);
3869 if (r < 0)
3870 return r;
3871
3872 r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
3873 if (r < 0)
3874 return r;
3875
3876 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
3877 if (r < 0)
3878 return r;
3879
3880 if (arg_read_only && arg_volatile_mode == VOLATILE_NO &&
3881 !has_custom_root_mount(arg_custom_mounts, arg_n_custom_mounts)) {
3882 r = bind_remount_recursive(directory, MS_RDONLY, MS_RDONLY, NULL);
3883 if (r < 0)
3884 return log_error_errno(r, "Failed to make tree read-only: %m");
3885 }
3886
3887 r = mount_all(directory,
3888 arg_mount_settings,
3889 arg_uid_shift,
3890 arg_selinux_apifs_context);
3891 if (r < 0)
3892 return r;
3893
3894 r = copy_devnodes(directory);
3895 if (r < 0)
3896 return r;
3897
3898 r = make_extra_nodes(directory);
3899 if (r < 0)
3900 return r;
3901
3902 (void) dev_setup(directory, arg_uid_shift, arg_uid_shift);
3903
3904 p = prefix_roota(directory, "/run/host");
3905 (void) make_inaccessible_nodes(p, arg_uid_shift, arg_uid_shift);
3906
3907 r = setup_pts(directory);
3908 if (r < 0)
3909 return r;
3910
3911 r = setup_propagate(directory);
3912 if (r < 0)
3913 return r;
3914
3915 r = setup_keyring();
3916 if (r < 0)
3917 return r;
3918
3919 r = setup_credentials(directory);
3920 if (r < 0)
3921 return r;
3922
3923 r = bind_user_setup(bind_user_context, directory);
3924 if (r < 0)
3925 return r;
3926
3927 r = mount_custom(
3928 directory,
3929 arg_custom_mounts,
3930 arg_n_custom_mounts,
3931 arg_uid_shift,
3932 arg_uid_range,
3933 arg_selinux_apifs_context,
3934 MOUNT_NON_ROOT_ONLY);
3935 if (r < 0)
3936 return r;
3937
3938 r = setup_timezone(directory);
3939 if (r < 0)
3940 return r;
3941
3942 r = setup_resolv_conf(directory);
3943 if (r < 0)
3944 return r;
3945
3946 r = setup_machine_id(directory);
3947 if (r < 0)
3948 return r;
3949
3950 r = setup_journal(directory);
3951 if (r < 0)
3952 return r;
3953
3954 /* The same stuff as the $container env var, but nicely readable for the entire payload */
3955 p = prefix_roota(directory, "/run/host/container-manager");
3956 (void) write_string_file(p, arg_container_service_name, WRITE_STRING_FILE_CREATE);
3957
3958 /* The same stuff as the $container_uuid env var */
3959 p = prefix_roota(directory, "/run/host/container-uuid");
3960 (void) write_string_filef(p, WRITE_STRING_FILE_CREATE, SD_ID128_UUID_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid));
3961
3962 if (!arg_use_cgns) {
3963 r = mount_cgroups(
3964 directory,
3965 arg_unified_cgroup_hierarchy,
3966 arg_userns_mode != USER_NAMESPACE_NO,
3967 arg_uid_shift,
3968 arg_uid_range,
3969 arg_selinux_apifs_context,
3970 false);
3971 if (r < 0)
3972 return r;
3973 }
3974
3975 r = mount_move_root(directory);
3976 if (r < 0)
3977 return log_error_errno(r, "Failed to move root directory: %m");
3978
3979 fd = setup_notify_child();
3980 if (fd < 0)
3981 return fd;
3982
3983 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
3984 arg_clone_ns_flags |
3985 (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
3986 if (pid < 0)
3987 return log_error_errno(errno, "Failed to fork inner child: %m");
3988 if (pid == 0) {
3989 pid_socket = safe_close(pid_socket);
3990 uuid_socket = safe_close(uuid_socket);
3991 notify_socket = safe_close(notify_socket);
3992 uid_shift_socket = safe_close(uid_shift_socket);
3993
3994 /* The inner child has all namespaces that are requested, so that we all are owned by the
3995 * user if user namespaces are turned on. */
3996
3997 if (arg_network_namespace_path) {
3998 r = namespace_enter(-1, -1, netns_fd, -1, -1);
3999 if (r < 0)
4000 return log_error_errno(r, "Failed to join network namespace: %m");
4001 }
4002
4003 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, master_pty_socket, fds, os_release_pairs);
4004 if (r < 0)
4005 _exit(EXIT_FAILURE);
4006
4007 _exit(EXIT_SUCCESS);
4008 }
4009
4010 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
4011 if (l < 0)
4012 return log_error_errno(errno, "Failed to send PID: %m");
4013 if (l != sizeof(pid))
4014 return log_error_errno(SYNTHETIC_ERRNO(EIO),
4015 "Short write while sending PID.");
4016
4017 l = send(uuid_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
4018 if (l < 0)
4019 return log_error_errno(errno, "Failed to send machine ID: %m");
4020 if (l != sizeof(arg_uuid))
4021 return log_error_errno(SYNTHETIC_ERRNO(EIO),
4022 "Short write while sending machine ID.");
4023
4024 l = send_one_fd(notify_socket, fd, 0);
4025 if (l < 0)
4026 return log_error_errno(l, "Failed to send notify fd: %m");
4027
4028 pid_socket = safe_close(pid_socket);
4029 uuid_socket = safe_close(uuid_socket);
4030 notify_socket = safe_close(notify_socket);
4031 master_pty_socket = safe_close(master_pty_socket);
4032 kmsg_socket = safe_close(kmsg_socket);
4033 rtnl_socket = safe_close(rtnl_socket);
4034 netns_fd = safe_close(netns_fd);
4035
4036 return 0;
4037 }
4038
4039 static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
4040 bool tried_hashed = false;
4041 unsigned n_tries = 100;
4042 uid_t candidate;
4043 int r;
4044
4045 assert(shift);
4046 assert(ret_lock_file);
4047 assert(arg_userns_mode == USER_NAMESPACE_PICK);
4048 assert(arg_uid_range == 0x10000U);
4049
4050 candidate = *shift;
4051
4052 (void) mkdir("/run/systemd/nspawn-uid", 0755);
4053
4054 for (;;) {
4055 char lock_path[STRLEN("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
4056 _cleanup_(release_lock_file) LockFile lf = LOCK_FILE_INIT;
4057
4058 if (--n_tries <= 0)
4059 return -EBUSY;
4060
4061 if (candidate < CONTAINER_UID_BASE_MIN || candidate > CONTAINER_UID_BASE_MAX)
4062 goto next;
4063 if ((candidate & UINT32_C(0xFFFF)) != 0)
4064 goto next;
4065
4066 xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
4067 r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
4068 if (r == -EBUSY) /* Range already taken by another nspawn instance */
4069 goto next;
4070 if (r < 0)
4071 return r;
4072
4073 /* Make some superficial checks whether the range is currently known in the user database */
4074 if (getpwuid(candidate))
4075 goto next;
4076 if (getpwuid(candidate + UINT32_C(0xFFFE)))
4077 goto next;
4078 if (getgrgid(candidate))
4079 goto next;
4080 if (getgrgid(candidate + UINT32_C(0xFFFE)))
4081 goto next;
4082
4083 *ret_lock_file = lf;
4084 lf = (struct LockFile) LOCK_FILE_INIT;
4085 *shift = candidate;
4086 return 0;
4087
4088 next:
4089 if (arg_machine && !tried_hashed) {
4090 /* Try to hash the base from the container name */
4091
4092 static const uint8_t hash_key[] = {
4093 0xe1, 0x56, 0xe0, 0xf0, 0x4a, 0xf0, 0x41, 0xaf,
4094 0x96, 0x41, 0xcf, 0x41, 0x33, 0x94, 0xff, 0x72
4095 };
4096
4097 candidate = (uid_t) siphash24(arg_machine, strlen(arg_machine), hash_key);
4098
4099 tried_hashed = true;
4100 } else
4101 random_bytes(&candidate, sizeof(candidate));
4102
4103 candidate = (candidate % (CONTAINER_UID_BASE_MAX - CONTAINER_UID_BASE_MIN)) + CONTAINER_UID_BASE_MIN;
4104 candidate &= (uid_t) UINT32_C(0xFFFF0000);
4105 }
4106 }
4107
4108 static int add_one_uid_map(
4109 char **p,
4110 uid_t container_uid,
4111 uid_t host_uid,
4112 uid_t range) {
4113
4114 return strextendf(p,
4115 UID_FMT " " UID_FMT " " UID_FMT "\n",
4116 container_uid, host_uid, range);
4117 }
4118
4119 static int make_uid_map_string(
4120 const uid_t bind_user_uid[],
4121 size_t n_bind_user_uid,
4122 size_t offset,
4123 char **ret) {
4124
4125 _cleanup_free_ char *s = NULL;
4126 uid_t previous_uid = 0;
4127 int r;
4128
4129 assert(n_bind_user_uid == 0 || bind_user_uid);
4130 assert(IN_SET(offset, 0, 2)); /* used to switch between UID and GID map */
4131 assert(ret);
4132
4133 /* The bind_user_uid[] array is a series of 4 uid_t values, for each --bind-user= entry one
4134 * quadruplet, consisting of host and container UID + GID. */
4135
4136 for (size_t i = 0; i < n_bind_user_uid; i++) {
4137 uid_t payload_uid = bind_user_uid[i*2+offset],
4138 host_uid = bind_user_uid[i*2+offset+1];
4139
4140 assert(previous_uid <= payload_uid);
4141 assert(payload_uid < arg_uid_range);
4142
4143 /* Add a range to close the gap to previous entry */
4144 if (payload_uid > previous_uid) {
4145 r = add_one_uid_map(&s, previous_uid, arg_uid_shift + previous_uid, payload_uid - previous_uid);
4146 if (r < 0)
4147 return r;
4148 }
4149
4150 /* Map this specific user */
4151 r = add_one_uid_map(&s, payload_uid, host_uid, 1);
4152 if (r < 0)
4153 return r;
4154
4155 previous_uid = payload_uid + 1;
4156 }
4157
4158 /* And add a range to close the gap to finish the range */
4159 if (arg_uid_range > previous_uid) {
4160 r = add_one_uid_map(&s, previous_uid, arg_uid_shift + previous_uid, arg_uid_range - previous_uid);
4161 if (r < 0)
4162 return r;
4163 }
4164
4165 assert(s);
4166
4167 *ret = TAKE_PTR(s);
4168 return 0;
4169 }
4170
4171 static int setup_uid_map(
4172 pid_t pid,
4173 const uid_t bind_user_uid[],
4174 size_t n_bind_user_uid) {
4175
4176 char uid_map[STRLEN("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1];
4177 _cleanup_free_ char *s = NULL;
4178 int r;
4179
4180 assert(pid > 1);
4181
4182 /* Build the UID map string */
4183 if (make_uid_map_string(bind_user_uid, n_bind_user_uid, 0, &s) < 0) /* offset=0 contains the UID pair */
4184 return log_oom();
4185
4186 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
4187 r = write_string_file(uid_map, s, WRITE_STRING_FILE_DISABLE_BUFFER);
4188 if (r < 0)
4189 return log_error_errno(r, "Failed to write UID map: %m");
4190
4191 /* And now build the GID map string */
4192 s = mfree(s);
4193 if (make_uid_map_string(bind_user_uid, n_bind_user_uid, 2, &s) < 0) /* offset=2 contains the GID pair */
4194 return log_oom();
4195
4196 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
4197 r = write_string_file(uid_map, s, WRITE_STRING_FILE_DISABLE_BUFFER);
4198 if (r < 0)
4199 return log_error_errno(r, "Failed to write GID map: %m");
4200
4201 return 0;
4202 }
4203
4204 static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
4205 char buf[NOTIFY_BUFFER_MAX+1];
4206 char *p = NULL;
4207 struct iovec iovec = {
4208 .iov_base = buf,
4209 .iov_len = sizeof(buf)-1,
4210 };
4211 CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred)) +
4212 CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)) control;
4213 struct msghdr msghdr = {
4214 .msg_iov = &iovec,
4215 .msg_iovlen = 1,
4216 .msg_control = &control,
4217 .msg_controllen = sizeof(control),
4218 };
4219 struct ucred *ucred;
4220 ssize_t n;
4221 pid_t inner_child_pid;
4222 _cleanup_strv_free_ char **tags = NULL;
4223 int r;
4224
4225 assert(userdata);
4226
4227 inner_child_pid = PTR_TO_PID(userdata);
4228
4229 if (revents != EPOLLIN) {
4230 log_warning("Got unexpected poll event for notify fd.");
4231 return 0;
4232 }
4233
4234 n = recvmsg_safe(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
4235 if (n < 0) {
4236 if (ERRNO_IS_TRANSIENT(n))
4237 return 0;
4238 if (n == -EXFULL) {
4239 log_warning("Got message with truncated control data (too many fds sent?), ignoring.");
4240 return 0;
4241 }
4242 return log_warning_errno(n, "Couldn't read notification socket: %m");
4243 }
4244
4245 cmsg_close_all(&msghdr);
4246
4247 ucred = CMSG_FIND_DATA(&msghdr, SOL_SOCKET, SCM_CREDENTIALS, struct ucred);
4248 if (!ucred || ucred->pid != inner_child_pid) {
4249 log_debug("Received notify message without valid credentials. Ignoring.");
4250 return 0;
4251 }
4252
4253 if ((size_t) n >= sizeof(buf)) {
4254 log_warning("Received notify message exceeded maximum size. Ignoring.");
4255 return 0;
4256 }
4257
4258 buf[n] = 0;
4259 tags = strv_split(buf, "\n\r");
4260 if (!tags)
4261 return log_oom();
4262
4263 if (strv_contains(tags, "READY=1")) {
4264 r = sd_notify(false, "READY=1\n");
4265 if (r < 0)
4266 log_warning_errno(r, "Failed to send readiness notification, ignoring: %m");
4267 }
4268
4269 p = strv_find_startswith(tags, "STATUS=");
4270 if (p)
4271 (void) sd_notifyf(false, "STATUS=Container running: %s", p);
4272
4273 return 0;
4274 }
4275
4276 static int setup_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid, sd_event_source **notify_event_source) {
4277 int r;
4278
4279 r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
4280 if (r < 0)
4281 return log_error_errno(r, "Failed to allocate notify event source: %m");
4282
4283 (void) sd_event_source_set_description(*notify_event_source, "nspawn-notify");
4284
4285 return 0;
4286 }
4287
4288 static int merge_settings(Settings *settings, const char *path) {
4289 int rl;
4290
4291 assert(settings);
4292 assert(path);
4293
4294 /* Copy over bits from the settings, unless they have been explicitly masked by command line switches. Note
4295 * that this steals the fields of the Settings* structure, and hence modifies it. */
4296
4297 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
4298 settings->start_mode >= 0) {
4299 arg_start_mode = settings->start_mode;
4300 strv_free_and_replace(arg_parameters, settings->parameters);
4301 }
4302
4303 if ((arg_settings_mask & SETTING_EPHEMERAL) == 0 &&
4304 settings->ephemeral >= 0)
4305 arg_ephemeral = settings->ephemeral;
4306
4307 if ((arg_settings_mask & SETTING_DIRECTORY) == 0 &&
4308 settings->root) {
4309
4310 if (!arg_settings_trusted)
4311 log_warning("Ignoring root directory setting, file %s is not trusted.", path);
4312 else
4313 free_and_replace(arg_directory, settings->root);
4314 }
4315
4316 if ((arg_settings_mask & SETTING_PIVOT_ROOT) == 0 &&
4317 settings->pivot_root_new) {
4318 free_and_replace(arg_pivot_root_new, settings->pivot_root_new);
4319 free_and_replace(arg_pivot_root_old, settings->pivot_root_old);
4320 }
4321
4322 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
4323 settings->working_directory)
4324 free_and_replace(arg_chdir, settings->working_directory);
4325
4326 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
4327 settings->environment)
4328 strv_free_and_replace(arg_setenv, settings->environment);
4329
4330 if ((arg_settings_mask & SETTING_USER) == 0) {
4331
4332 if (settings->user)
4333 free_and_replace(arg_user, settings->user);
4334
4335 if (uid_is_valid(settings->uid))
4336 arg_uid = settings->uid;
4337 if (gid_is_valid(settings->gid))
4338 arg_gid = settings->gid;
4339 if (settings->n_supplementary_gids > 0) {
4340 free_and_replace(arg_supplementary_gids, settings->supplementary_gids);
4341 arg_n_supplementary_gids = settings->n_supplementary_gids;
4342 }
4343 }
4344
4345 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
4346 uint64_t plus, minus;
4347 uint64_t network_minus = 0;
4348 uint64_t ambient;
4349
4350 /* Note that we copy both the simple plus/minus caps here, and the full quintet from the
4351 * Settings structure */
4352
4353 plus = settings->capability;
4354 minus = settings->drop_capability;
4355
4356 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
4357 settings_network_configured(settings)) {
4358 if (settings_private_network(settings))
4359 plus |= UINT64_C(1) << CAP_NET_ADMIN;
4360 else
4361 network_minus |= UINT64_C(1) << CAP_NET_ADMIN;
4362 }
4363
4364 if (!arg_settings_trusted && plus != 0) {
4365 if (settings->capability != 0)
4366 log_warning("Ignoring Capability= setting, file %s is not trusted.", path);
4367 } else {
4368 arg_caps_retain &= ~network_minus;
4369 arg_caps_retain |= plus;
4370 }
4371
4372 arg_caps_retain &= ~minus;
4373
4374 /* Copy the full capabilities over too */
4375 if (capability_quintet_is_set(&settings->full_capabilities)) {
4376 if (!arg_settings_trusted)
4377 log_warning("Ignoring capability settings, file %s is not trusted.", path);
4378 else
4379 arg_full_capabilities = settings->full_capabilities;
4380 }
4381
4382 ambient = settings->ambient_capability;
4383 if (!arg_settings_trusted && ambient != 0)
4384 log_warning("Ignoring AmbientCapability= setting, file %s is not trusted.", path);
4385 else
4386 arg_caps_ambient |= ambient;
4387 }
4388
4389 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
4390 settings->kill_signal > 0)
4391 arg_kill_signal = settings->kill_signal;
4392
4393 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
4394 settings->personality != PERSONALITY_INVALID)
4395 arg_personality = settings->personality;
4396
4397 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
4398 !sd_id128_is_null(settings->machine_id)) {
4399
4400 if (!arg_settings_trusted)
4401 log_warning("Ignoring MachineID= setting, file %s is not trusted.", path);
4402 else
4403 arg_uuid = settings->machine_id;
4404 }
4405
4406 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
4407 settings->read_only >= 0)
4408 arg_read_only = settings->read_only;
4409
4410 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
4411 settings->volatile_mode != _VOLATILE_MODE_INVALID)
4412 arg_volatile_mode = settings->volatile_mode;
4413
4414 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
4415 settings->n_custom_mounts > 0) {
4416
4417 if (!arg_settings_trusted)
4418 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", path);
4419 else {
4420 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
4421 arg_custom_mounts = TAKE_PTR(settings->custom_mounts);
4422 arg_n_custom_mounts = settings->n_custom_mounts;
4423 settings->n_custom_mounts = 0;
4424 }
4425 }
4426
4427 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
4428 settings_network_configured(settings)) {
4429
4430 if (!arg_settings_trusted)
4431 log_warning("Ignoring network settings, file %s is not trusted.", path);
4432 else {
4433 arg_network_veth = settings_network_veth(settings);
4434 arg_private_network = settings_private_network(settings);
4435
4436 strv_free_and_replace(arg_network_interfaces, settings->network_interfaces);
4437 strv_free_and_replace(arg_network_macvlan, settings->network_macvlan);
4438 strv_free_and_replace(arg_network_ipvlan, settings->network_ipvlan);
4439 strv_free_and_replace(arg_network_veth_extra, settings->network_veth_extra);
4440
4441 free_and_replace(arg_network_bridge, settings->network_bridge);
4442 free_and_replace(arg_network_zone, settings->network_zone);
4443
4444 free_and_replace(arg_network_namespace_path, settings->network_namespace_path);
4445 }
4446 }
4447
4448 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
4449 settings->expose_ports) {
4450
4451 if (!arg_settings_trusted)
4452 log_warning("Ignoring Port= setting, file %s is not trusted.", path);
4453 else {
4454 expose_port_free_all(arg_expose_ports);
4455 arg_expose_ports = TAKE_PTR(settings->expose_ports);
4456 }
4457 }
4458
4459 if ((arg_settings_mask & SETTING_USERNS) == 0 &&
4460 settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
4461
4462 if (!arg_settings_trusted)
4463 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", path);
4464 else {
4465 arg_userns_mode = settings->userns_mode;
4466 arg_uid_shift = settings->uid_shift;
4467 arg_uid_range = settings->uid_range;
4468 arg_userns_ownership = settings->userns_ownership;
4469 }
4470 }
4471
4472 if ((arg_settings_mask & SETTING_BIND_USER) == 0 &&
4473 !strv_isempty(settings->bind_user))
4474 strv_free_and_replace(arg_bind_user, settings->bind_user);
4475
4476 if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0 &&
4477 settings->notify_ready >= 0)
4478 arg_notify_ready = settings->notify_ready;
4479
4480 if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) {
4481
4482 if (!strv_isempty(settings->syscall_allow_list) || !strv_isempty(settings->syscall_deny_list)) {
4483 if (!arg_settings_trusted && !strv_isempty(settings->syscall_allow_list))
4484 log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", path);
4485 else {
4486 strv_free_and_replace(arg_syscall_allow_list, settings->syscall_allow_list);
4487 strv_free_and_replace(arg_syscall_deny_list, settings->syscall_deny_list);
4488 }
4489 }
4490
4491 #if HAVE_SECCOMP
4492 if (settings->seccomp) {
4493 if (!arg_settings_trusted)
4494 log_warning("Ignoring SECCOMP filter, file %s is not trusted.", path);
4495 else {
4496 seccomp_release(arg_seccomp);
4497 arg_seccomp = TAKE_PTR(settings->seccomp);
4498 }
4499 }
4500 #endif
4501 }
4502
4503 for (rl = 0; rl < _RLIMIT_MAX; rl ++) {
4504 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)))
4505 continue;
4506
4507 if (!settings->rlimit[rl])
4508 continue;
4509
4510 if (!arg_settings_trusted) {
4511 log_warning("Ignoring Limit%s= setting, file '%s' is not trusted.", rlimit_to_string(rl), path);
4512 continue;
4513 }
4514
4515 free_and_replace(arg_rlimit[rl], settings->rlimit[rl]);
4516 }
4517
4518 if ((arg_settings_mask & SETTING_HOSTNAME) == 0 &&
4519 settings->hostname)
4520 free_and_replace(arg_hostname, settings->hostname);
4521
4522 if ((arg_settings_mask & SETTING_NO_NEW_PRIVILEGES) == 0 &&
4523 settings->no_new_privileges >= 0)
4524 arg_no_new_privileges = settings->no_new_privileges;
4525
4526 if ((arg_settings_mask & SETTING_OOM_SCORE_ADJUST) == 0 &&
4527 settings->oom_score_adjust_set) {
4528
4529 if (!arg_settings_trusted)
4530 log_warning("Ignoring OOMScoreAdjust= setting, file '%s' is not trusted.", path);
4531 else {
4532 arg_oom_score_adjust = settings->oom_score_adjust;
4533 arg_oom_score_adjust_set = true;
4534 }
4535 }
4536
4537 if ((arg_settings_mask & SETTING_CPU_AFFINITY) == 0 &&
4538 settings->cpu_set.set) {
4539
4540 if (!arg_settings_trusted)
4541 log_warning("Ignoring CPUAffinity= setting, file '%s' is not trusted.", path);
4542 else {
4543 cpu_set_reset(&arg_cpu_set);
4544 arg_cpu_set = settings->cpu_set;
4545 settings->cpu_set = (CPUSet) {};
4546 }
4547 }
4548
4549 if ((arg_settings_mask & SETTING_RESOLV_CONF) == 0 &&
4550 settings->resolv_conf != _RESOLV_CONF_MODE_INVALID)
4551 arg_resolv_conf = settings->resolv_conf;
4552
4553 if ((arg_settings_mask & SETTING_LINK_JOURNAL) == 0 &&
4554 settings->link_journal != _LINK_JOURNAL_INVALID) {
4555
4556 if (!arg_settings_trusted)
4557 log_warning("Ignoring journal link setting, file '%s' is not trusted.", path);
4558 else {
4559 arg_link_journal = settings->link_journal;
4560 arg_link_journal_try = settings->link_journal_try;
4561 }
4562 }
4563
4564 if ((arg_settings_mask & SETTING_TIMEZONE) == 0 &&
4565 settings->timezone != _TIMEZONE_MODE_INVALID)
4566 arg_timezone = settings->timezone;
4567
4568 if ((arg_settings_mask & SETTING_SLICE) == 0 &&
4569 settings->slice) {
4570
4571 if (!arg_settings_trusted)
4572 log_warning("Ignoring slice setting, file '%s' is not trusted.", path);
4573 else
4574 free_and_replace(arg_slice, settings->slice);
4575 }
4576
4577 if ((arg_settings_mask & SETTING_USE_CGNS) == 0 &&
4578 settings->use_cgns >= 0) {
4579
4580 if (!arg_settings_trusted)
4581 log_warning("Ignoring cgroup namespace setting, file '%s' is not trusted.", path);
4582 else
4583 arg_use_cgns = settings->use_cgns;
4584 }
4585
4586 if ((arg_settings_mask & SETTING_CLONE_NS_FLAGS) == 0 &&
4587 settings->clone_ns_flags != ULONG_MAX) {
4588
4589 if (!arg_settings_trusted)
4590 log_warning("Ignoring namespace setting, file '%s' is not trusted.", path);
4591 else
4592 arg_clone_ns_flags = settings->clone_ns_flags;
4593 }
4594
4595 if ((arg_settings_mask & SETTING_CONSOLE_MODE) == 0 &&
4596 settings->console_mode >= 0) {
4597
4598 if (!arg_settings_trusted)
4599 log_warning("Ignoring console mode setting, file '%s' is not trusted.", path);
4600 else
4601 arg_console_mode = settings->console_mode;
4602 }
4603
4604 if ((arg_settings_mask & SETTING_SUPPRESS_SYNC) == 0 &&
4605 settings->suppress_sync >= 0)
4606 arg_suppress_sync = settings->suppress_sync;
4607
4608 /* The following properties can only be set through the OCI settings logic, not from the command line, hence we
4609 * don't consult arg_settings_mask for them. */
4610
4611 sd_bus_message_unref(arg_property_message);
4612 arg_property_message = TAKE_PTR(settings->properties);
4613
4614 arg_console_width = settings->console_width;
4615 arg_console_height = settings->console_height;
4616
4617 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
4618 arg_extra_nodes = TAKE_PTR(settings->extra_nodes);
4619 arg_n_extra_nodes = settings->n_extra_nodes;
4620
4621 return 0;
4622 }
4623
4624 static int load_settings(void) {
4625 _cleanup_(settings_freep) Settings *settings = NULL;
4626 _cleanup_fclose_ FILE *f = NULL;
4627 _cleanup_free_ char *p = NULL;
4628 int r;
4629
4630 if (arg_oci_bundle)
4631 return 0;
4632
4633 /* If all settings are masked, there's no point in looking for
4634 * the settings file */
4635 if (FLAGS_SET(arg_settings_mask, _SETTINGS_MASK_ALL))
4636 return 0;
4637
4638 /* We first look in the admin's directories in /etc and /run */
4639 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
4640 _cleanup_free_ char *j = NULL;
4641
4642 j = path_join(i, arg_settings_filename);
4643 if (!j)
4644 return log_oom();
4645
4646 f = fopen(j, "re");
4647 if (f) {
4648 p = TAKE_PTR(j);
4649
4650 /* By default, we trust configuration from /etc and /run */
4651 if (arg_settings_trusted < 0)
4652 arg_settings_trusted = true;
4653
4654 break;
4655 }
4656
4657 if (errno != ENOENT)
4658 return log_error_errno(errno, "Failed to open %s: %m", j);
4659 }
4660
4661 if (!f) {
4662 /* After that, let's look for a file next to the
4663 * actual image we shall boot. */
4664
4665 if (arg_image) {
4666 p = file_in_same_dir(arg_image, arg_settings_filename);
4667 if (!p)
4668 return log_oom();
4669 } else if (arg_directory && !path_equal(arg_directory, "/")) {
4670 p = file_in_same_dir(arg_directory, arg_settings_filename);
4671 if (!p)
4672 return log_oom();
4673 }
4674
4675 if (p) {
4676 f = fopen(p, "re");
4677 if (!f && errno != ENOENT)
4678 return log_error_errno(errno, "Failed to open %s: %m", p);
4679
4680 /* By default, we do not trust configuration from /var/lib/machines */
4681 if (arg_settings_trusted < 0)
4682 arg_settings_trusted = false;
4683 }
4684 }
4685
4686 if (!f)
4687 return 0;
4688
4689 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
4690
4691 r = settings_load(f, p, &settings);
4692 if (r < 0)
4693 return r;
4694
4695 return merge_settings(settings, p);
4696 }
4697
4698 static int load_oci_bundle(void) {
4699 _cleanup_(settings_freep) Settings *settings = NULL;
4700 int r;
4701
4702 if (!arg_oci_bundle)
4703 return 0;
4704
4705 /* By default let's trust OCI bundles */
4706 if (arg_settings_trusted < 0)
4707 arg_settings_trusted = true;
4708
4709 r = oci_load(NULL, arg_oci_bundle, &settings);
4710 if (r < 0)
4711 return r;
4712
4713 return merge_settings(settings, arg_oci_bundle);
4714 }
4715
4716 static int run_container(
4717 DissectedImage *dissected_image,
4718 bool secondary,
4719 FDSet *fds,
4720 char veth_name[IFNAMSIZ], bool *veth_created,
4721 struct ExposeArgs *expose_args,
4722 int *master, pid_t *pid, int *ret) {
4723
4724 static const struct sigaction sa = {
4725 .sa_handler = nop_signal_handler,
4726 .sa_flags = SA_NOCLDSTOP|SA_RESTART,
4727 };
4728
4729 _cleanup_(release_lock_file) LockFile uid_shift_lock = LOCK_FILE_INIT;
4730 _cleanup_close_ int etc_passwd_lock = -1;
4731 _cleanup_close_pair_ int
4732 kmsg_socket_pair[2] = { -1, -1 },
4733 rtnl_socket_pair[2] = { -1, -1 },
4734 pid_socket_pair[2] = { -1, -1 },
4735 uuid_socket_pair[2] = { -1, -1 },
4736 notify_socket_pair[2] = { -1, -1 },
4737 uid_shift_socket_pair[2] = { -1, -1 },
4738 master_pty_socket_pair[2] = { -1, -1 },
4739 unified_cgroup_hierarchy_socket_pair[2] = { -1, -1};
4740
4741 _cleanup_close_ int notify_socket = -1;
4742 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
4743 _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL;
4744 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
4745 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4746 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
4747 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
4748 _cleanup_free_ uid_t *bind_user_uid = NULL;
4749 size_t n_bind_user_uid = 0;
4750 ContainerStatus container_status = 0;
4751 int ifi = 0, r;
4752 ssize_t l;
4753 sigset_t mask_chld;
4754 _cleanup_close_ int child_netns_fd = -1;
4755
4756 assert_se(sigemptyset(&mask_chld) == 0);
4757 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
4758
4759 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4760 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
4761 * check with getpwuid() if the specific user already exists. Note that /etc might be
4762 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
4763 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
4764 * really just an extra safety net. We kinda assume that the UID range we allocate from is
4765 * really ours. */
4766
4767 etc_passwd_lock = take_etc_passwd_lock(NULL);
4768 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS)
4769 return log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m");
4770 }
4771
4772 r = barrier_create(&barrier);
4773 if (r < 0)
4774 return log_error_errno(r, "Cannot initialize IPC barrier: %m");
4775
4776 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0)
4777 return log_error_errno(errno, "Failed to create kmsg socket pair: %m");
4778
4779 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0)
4780 return log_error_errno(errno, "Failed to create rtnl socket pair: %m");
4781
4782 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0)
4783 return log_error_errno(errno, "Failed to create pid socket pair: %m");
4784
4785 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0)
4786 return log_error_errno(errno, "Failed to create id socket pair: %m");
4787
4788 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0)
4789 return log_error_errno(errno, "Failed to create notify socket pair: %m");
4790
4791 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, master_pty_socket_pair) < 0)
4792 return log_error_errno(errno, "Failed to create console socket pair: %m");
4793
4794 if (arg_userns_mode != USER_NAMESPACE_NO)
4795 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0)
4796 return log_error_errno(errno, "Failed to create uid shift socket pair: %m");
4797
4798 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN)
4799 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, unified_cgroup_hierarchy_socket_pair) < 0)
4800 return log_error_errno(errno, "Failed to create unified cgroup socket pair: %m");
4801
4802 /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
4803 * parent's blocking calls and give it a chance to call wait() and terminate. */
4804 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
4805 if (r < 0)
4806 return log_error_errno(errno, "Failed to change the signal mask: %m");
4807
4808 r = sigaction(SIGCHLD, &sa, NULL);
4809 if (r < 0)
4810 return log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
4811
4812 if (arg_network_namespace_path) {
4813 child_netns_fd = open(arg_network_namespace_path, O_RDONLY|O_NOCTTY|O_CLOEXEC);
4814 if (child_netns_fd < 0)
4815 return log_error_errno(errno, "Cannot open file %s: %m", arg_network_namespace_path);
4816
4817 r = fd_is_ns(child_netns_fd, CLONE_NEWNET);
4818 if (r == -EUCLEAN)
4819 log_debug_errno(r, "Cannot determine if passed network namespace path '%s' really refers to a network namespace, assuming it does.", arg_network_namespace_path);
4820 else if (r < 0)
4821 return log_error_errno(r, "Failed to check %s fs type: %m", arg_network_namespace_path);
4822 else if (r == 0)
4823 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
4824 "Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path);
4825 }
4826
4827 *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
4828 if (*pid < 0)
4829 return log_error_errno(errno, "clone() failed%s: %m",
4830 errno == EINVAL ?
4831 ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
4832
4833 if (*pid == 0) {
4834 /* The outer child only has a file system namespace. */
4835 barrier_set_role(&barrier, BARRIER_CHILD);
4836
4837 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
4838 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4839 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
4840 uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
4841 notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
4842 master_pty_socket_pair[0] = safe_close(master_pty_socket_pair[0]);
4843 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
4844 unified_cgroup_hierarchy_socket_pair[0] = safe_close(unified_cgroup_hierarchy_socket_pair[0]);
4845
4846 (void) reset_all_signal_handlers();
4847 (void) reset_signal_mask();
4848
4849 r = outer_child(&barrier,
4850 arg_directory,
4851 dissected_image,
4852 secondary,
4853 pid_socket_pair[1],
4854 uuid_socket_pair[1],
4855 notify_socket_pair[1],
4856 kmsg_socket_pair[1],
4857 rtnl_socket_pair[1],
4858 uid_shift_socket_pair[1],
4859 master_pty_socket_pair[1],
4860 unified_cgroup_hierarchy_socket_pair[1],
4861 fds,
4862 child_netns_fd);
4863 if (r < 0)
4864 _exit(EXIT_FAILURE);
4865
4866 _exit(EXIT_SUCCESS);
4867 }
4868
4869 barrier_set_role(&barrier, BARRIER_PARENT);
4870
4871 fdset_close(fds);
4872
4873 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4874 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4875 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
4876 uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
4877 notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
4878 master_pty_socket_pair[1] = safe_close(master_pty_socket_pair[1]);
4879 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
4880 unified_cgroup_hierarchy_socket_pair[1] = safe_close(unified_cgroup_hierarchy_socket_pair[1]);
4881
4882 if (arg_userns_mode != USER_NAMESPACE_NO) {
4883 /* The child just let us know the UID shift it might have read from the image. */
4884 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
4885 if (l < 0)
4886 return log_error_errno(errno, "Failed to read UID shift: %m");
4887 if (l != sizeof arg_uid_shift)
4888 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading UID shift.");
4889
4890 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4891 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
4892 * image, but if that's already in use, pick a new one, and report back to the child,
4893 * which one we now picked. */
4894
4895 r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
4896 if (r < 0)
4897 return log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
4898
4899 l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
4900 if (l < 0)
4901 return log_error_errno(errno, "Failed to send UID shift: %m");
4902 if (l != sizeof arg_uid_shift)
4903 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short write while writing UID shift.");
4904 }
4905
4906 n_bind_user_uid = strv_length(arg_bind_user);
4907 if (n_bind_user_uid > 0) {
4908 /* Right after the UID shift, we'll receive the list of UID mappings for the
4909 * --bind-user= logic. Always a quadruplet of payload and host UID + GID. */
4910
4911 bind_user_uid = new(uid_t, n_bind_user_uid*4);
4912 if (!bind_user_uid)
4913 return log_oom();
4914
4915 for (size_t i = 0; i < n_bind_user_uid; i++) {
4916 l = recv(uid_shift_socket_pair[0], bind_user_uid + i*4, sizeof(uid_t)*4, 0);
4917 if (l < 0)
4918 return log_error_errno(errno, "Failed to read user UID map pair: %m");
4919 if (l != sizeof(uid_t)*4)
4920 return log_full_errno(l == 0 ? LOG_DEBUG : LOG_WARNING,
4921 SYNTHETIC_ERRNO(EIO),
4922 "Short read while reading bind user UID pairs.");
4923 }
4924 }
4925 }
4926
4927 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
4928 /* The child let us know the support cgroup mode it might have read from the image. */
4929 l = recv(unified_cgroup_hierarchy_socket_pair[0], &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), 0);
4930 if (l < 0)
4931 return log_error_errno(errno, "Failed to read cgroup mode: %m");
4932 if (l != sizeof(arg_unified_cgroup_hierarchy))
4933 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading cgroup mode (%zu bytes).%s",
4934 l, l == 0 ? " The child is most likely dead." : "");
4935 }
4936
4937 /* Wait for the outer child. */
4938 r = wait_for_terminate_and_check("(sd-namespace)", *pid, WAIT_LOG_ABNORMAL);
4939 if (r < 0)
4940 return r;
4941 if (r != EXIT_SUCCESS)
4942 return -EIO;
4943
4944 /* And now retrieve the PID of the inner child. */
4945 l = recv(pid_socket_pair[0], pid, sizeof *pid, 0);
4946 if (l < 0)
4947 return log_error_errno(errno, "Failed to read inner child PID: %m");
4948 if (l != sizeof *pid)
4949 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading inner child PID.");
4950
4951 /* We also retrieve container UUID in case it was generated by outer child */
4952 l = recv(uuid_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
4953 if (l < 0)
4954 return log_error_errno(errno, "Failed to read container machine ID: %m");
4955 if (l != sizeof(arg_uuid))
4956 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading container machined ID.");
4957
4958 /* We also retrieve the socket used for notifications generated by outer child */
4959 notify_socket = receive_one_fd(notify_socket_pair[0], 0);
4960 if (notify_socket < 0)
4961 return log_error_errno(notify_socket,
4962 "Failed to receive notification socket from the outer child: %m");
4963
4964 log_debug("Init process invoked as PID "PID_FMT, *pid);
4965
4966 if (arg_userns_mode != USER_NAMESPACE_NO) {
4967 if (!barrier_place_and_sync(&barrier)) /* #1 */
4968 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
4969
4970 r = setup_uid_map(*pid, bind_user_uid, n_bind_user_uid);
4971 if (r < 0)
4972 return r;
4973
4974 (void) barrier_place(&barrier); /* #2 */
4975 }
4976
4977 if (arg_private_network) {
4978 if (!arg_network_namespace_path) {
4979 /* Wait until the child has unshared its network namespace. */
4980 if (!barrier_place_and_sync(&barrier)) /* #3 */
4981 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early");
4982 }
4983
4984 if (child_netns_fd < 0) {
4985 /* Make sure we have an open file descriptor to the child's network
4986 * namespace so it stays alive even if the child exits. */
4987 r = namespace_open(*pid, NULL, NULL, &child_netns_fd, NULL, NULL);
4988 if (r < 0)
4989 return log_error_errno(r, "Failed to open child network namespace: %m");
4990 }
4991
4992 r = move_network_interfaces(child_netns_fd, arg_network_interfaces);
4993 if (r < 0)
4994 return r;
4995
4996 if (arg_network_veth) {
4997 r = setup_veth(arg_machine, *pid, veth_name,
4998 arg_network_bridge || arg_network_zone);
4999 if (r < 0)
5000 return r;
5001 else if (r > 0)
5002 ifi = r;
5003
5004 if (arg_network_bridge) {
5005 /* Add the interface to a bridge */
5006 r = setup_bridge(veth_name, arg_network_bridge, false);
5007 if (r < 0)
5008 return r;
5009 if (r > 0)
5010 ifi = r;
5011 } else if (arg_network_zone) {
5012 /* Add the interface to a bridge, possibly creating it */
5013 r = setup_bridge(veth_name, arg_network_zone, true);
5014 if (r < 0)
5015 return r;
5016 if (r > 0)
5017 ifi = r;
5018 }
5019 }
5020
5021 r = setup_veth_extra(arg_machine, *pid, arg_network_veth_extra);
5022 if (r < 0)
5023 return r;
5024
5025 /* We created the primary and extra veth links now; let's remember this, so that we know to
5026 remove them later on. Note that we don't bother with removing veth links that were created
5027 here when their setup failed half-way, because in that case the kernel should be able to
5028 remove them on its own, since they cannot be referenced by anything yet. */
5029 *veth_created = true;
5030
5031 r = setup_macvlan(arg_machine, *pid, arg_network_macvlan);
5032 if (r < 0)
5033 return r;
5034
5035 r = setup_ipvlan(arg_machine, *pid, arg_network_ipvlan);
5036 if (r < 0)
5037 return r;
5038 }
5039
5040 if (arg_register || !arg_keep_unit) {
5041 r = sd_bus_default_system(&bus);
5042 if (r < 0)
5043 return log_error_errno(r, "Failed to open system bus: %m");
5044
5045 r = sd_bus_set_close_on_exit(bus, false);
5046 if (r < 0)
5047 return log_error_errno(r, "Failed to disable close-on-exit behaviour: %m");
5048 }
5049
5050 if (!arg_keep_unit) {
5051 /* When a new scope is created for this container, then we'll be registered as its controller, in which
5052 * case PID 1 will send us a friendly RequestStop signal, when it is asked to terminate the
5053 * scope. Let's hook into that, and cleanly shut down the container, and print a friendly message. */
5054
5055 r = sd_bus_match_signal_async(
5056 bus,
5057 NULL,
5058 "org.freedesktop.systemd1",
5059 NULL,
5060 "org.freedesktop.systemd1.Scope",
5061 "RequestStop",
5062 on_request_stop, NULL, PID_TO_PTR(*pid));
5063 if (r < 0)
5064 return log_error_errno(r, "Failed to request RequestStop match: %m");
5065 }
5066
5067 if (arg_register) {
5068 r = register_machine(
5069 bus,
5070 arg_machine,
5071 *pid,
5072 arg_directory,
5073 arg_uuid,
5074 ifi,
5075 arg_slice,
5076 arg_custom_mounts, arg_n_custom_mounts,
5077 arg_kill_signal,
5078 arg_property,
5079 arg_property_message,
5080 arg_keep_unit,
5081 arg_container_service_name);
5082 if (r < 0)
5083 return r;
5084
5085 } else if (!arg_keep_unit) {
5086 r = allocate_scope(
5087 bus,
5088 arg_machine,
5089 *pid,
5090 arg_slice,
5091 arg_custom_mounts, arg_n_custom_mounts,
5092 arg_kill_signal,
5093 arg_property,
5094 arg_property_message);
5095 if (r < 0)
5096 return r;
5097
5098 } else if (arg_slice || arg_property)
5099 log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect.");
5100
5101 r = create_subcgroup(*pid, arg_keep_unit, arg_unified_cgroup_hierarchy);
5102 if (r < 0)
5103 return r;
5104
5105 r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
5106 if (r < 0)
5107 return r;
5108
5109 r = chown_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
5110 if (r < 0)
5111 return r;
5112
5113 /* Notify the child that the parent is ready with all
5114 * its setup (including cgroup-ification), and that
5115 * the child can now hand over control to the code to
5116 * run inside the container. */
5117 (void) barrier_place(&barrier); /* #4 */
5118
5119 /* Block SIGCHLD here, before notifying child.
5120 * process_pty() will handle it with the other signals. */
5121 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
5122
5123 /* Reset signal to default */
5124 r = default_signals(SIGCHLD);
5125 if (r < 0)
5126 return log_error_errno(r, "Failed to reset SIGCHLD: %m");
5127
5128 r = sd_event_new(&event);
5129 if (r < 0)
5130 return log_error_errno(r, "Failed to get default event source: %m");
5131
5132 (void) sd_event_set_watchdog(event, true);
5133
5134 if (bus) {
5135 r = sd_bus_attach_event(bus, event, 0);
5136 if (r < 0)
5137 return log_error_errno(r, "Failed to attach bus to event loop: %m");
5138 }
5139
5140 r = setup_notify_parent(event, notify_socket, PID_TO_PTR(*pid), &notify_event_source);
5141 if (r < 0)
5142 return r;
5143
5144 /* Let the child know that we are ready and wait that the child is completely ready now. */
5145 if (!barrier_place_and_sync(&barrier)) /* #5 */
5146 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
5147
5148 /* At this point we have made use of the UID we picked, and thus nss-systemd/systemd-machined.service
5149 * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
5150 etc_passwd_lock = safe_close(etc_passwd_lock);
5151
5152 (void) sd_notifyf(false,
5153 "STATUS=Container running.\n"
5154 "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
5155 if (!arg_notify_ready) {
5156 r = sd_notify(false, "READY=1\n");
5157 if (r < 0)
5158 log_warning_errno(r, "Failed to send readiness notification, ignoring: %m");
5159 }
5160
5161 if (arg_kill_signal > 0) {
5162 /* Try to kill the init system on SIGINT or SIGTERM */
5163 (void) sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
5164 (void) sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
5165 } else {
5166 /* Immediately exit */
5167 (void) sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
5168 (void) sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
5169 }
5170
5171 /* Exit when the child exits */
5172 (void) sd_event_add_signal(event, NULL, SIGCHLD, on_sigchld, PID_TO_PTR(*pid));
5173
5174 if (arg_expose_ports) {
5175 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, expose_args, &rtnl);
5176 if (r < 0)
5177 return r;
5178
5179 (void) expose_port_execute(rtnl, &expose_args->fw_ctx, arg_expose_ports, AF_INET, &expose_args->address4);
5180 (void) expose_port_execute(rtnl, &expose_args->fw_ctx, arg_expose_ports, AF_INET6, &expose_args->address6);
5181 }
5182
5183 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
5184
5185 if (arg_console_mode != CONSOLE_PIPE) {
5186 _cleanup_close_ int fd = -1;
5187 PTYForwardFlags flags = 0;
5188
5189 /* Retrieve the master pty allocated by inner child */
5190 fd = receive_one_fd(master_pty_socket_pair[0], 0);
5191 if (fd < 0)
5192 return log_error_errno(fd, "Failed to receive master pty from the inner child: %m");
5193
5194 switch (arg_console_mode) {
5195
5196 case CONSOLE_READ_ONLY:
5197 flags |= PTY_FORWARD_READ_ONLY;
5198
5199 _fallthrough_;
5200
5201 case CONSOLE_INTERACTIVE:
5202 flags |= PTY_FORWARD_IGNORE_VHANGUP;
5203
5204 r = pty_forward_new(event, fd, flags, &forward);
5205 if (r < 0)
5206 return log_error_errno(r, "Failed to create PTY forwarder: %m");
5207
5208 if (arg_console_width != UINT_MAX || arg_console_height != UINT_MAX)
5209 (void) pty_forward_set_width_height(forward,
5210 arg_console_width,
5211 arg_console_height);
5212 break;
5213
5214 default:
5215 assert(arg_console_mode == CONSOLE_PASSIVE);
5216 }
5217
5218 *master = TAKE_FD(fd);
5219 }
5220
5221 r = sd_event_loop(event);
5222 if (r < 0)
5223 return log_error_errno(r, "Failed to run event loop: %m");
5224
5225 if (forward) {
5226 char last_char = 0;
5227
5228 (void) pty_forward_get_last_char(forward, &last_char);
5229 forward = pty_forward_free(forward);
5230
5231 if (!arg_quiet && last_char != '\n')
5232 putc('\n', stdout);
5233 }
5234
5235 /* Kill if it is not dead yet anyway */
5236 if (!arg_register && !arg_keep_unit && bus)
5237 terminate_scope(bus, arg_machine);
5238
5239 /* Normally redundant, but better safe than sorry */
5240 (void) kill(*pid, SIGKILL);
5241
5242 if (arg_private_network) {
5243 /* Move network interfaces back to the parent network namespace. We use `safe_fork`
5244 * to avoid having to move the parent to the child network namespace. */
5245 r = safe_fork(NULL, FORK_RESET_SIGNALS|FORK_DEATHSIG|FORK_WAIT|FORK_LOG, NULL);
5246 if (r < 0)
5247 return r;
5248
5249 if (r == 0) {
5250 _cleanup_close_ int parent_netns_fd = -1;
5251
5252 r = namespace_open(getpid(), NULL, NULL, &parent_netns_fd, NULL, NULL);
5253 if (r < 0) {
5254 log_error_errno(r, "Failed to open parent network namespace: %m");
5255 _exit(EXIT_FAILURE);
5256 }
5257
5258 r = namespace_enter(-1, -1, child_netns_fd, -1, -1);
5259 if (r < 0) {
5260 log_error_errno(r, "Failed to enter child network namespace: %m");
5261 _exit(EXIT_FAILURE);
5262 }
5263
5264 r = move_network_interfaces(parent_netns_fd, arg_network_interfaces);
5265 if (r < 0)
5266 log_error_errno(r, "Failed to move network interfaces back to parent network namespace: %m");
5267
5268 _exit(r < 0 ? EXIT_FAILURE : EXIT_SUCCESS);
5269 }
5270 }
5271
5272 r = wait_for_container(TAKE_PID(*pid), &container_status);
5273
5274 /* Tell machined that we are gone. */
5275 if (bus)
5276 (void) unregister_machine(bus, arg_machine);
5277
5278 if (r < 0)
5279 /* We failed to wait for the container, or the container exited abnormally. */
5280 return r;
5281 if (r > 0 || container_status == CONTAINER_TERMINATED) {
5282 /* r > 0 → The container exited with a non-zero status.
5283 * As a special case, we need to replace 133 with a different value,
5284 * because 133 is special-cased in the service file to reboot the container.
5285 * otherwise → The container exited with zero status and a reboot was not requested.
5286 */
5287 if (r == EXIT_FORCE_RESTART)
5288 r = EXIT_FAILURE; /* replace 133 with the general failure code */
5289 *ret = r;
5290 return 0; /* finito */
5291 }
5292
5293 /* CONTAINER_REBOOTED, loop again */
5294
5295 if (arg_keep_unit) {
5296 /* Special handling if we are running as a service: instead of simply
5297 * restarting the machine we want to restart the entire service, so let's
5298 * inform systemd about this with the special exit code 133. The service
5299 * file uses RestartForceExitStatus=133 so that this results in a full
5300 * nspawn restart. This is necessary since we might have cgroup parameters
5301 * set we want to have flushed out. */
5302 *ret = EXIT_FORCE_RESTART;
5303 return 0; /* finito */
5304 }
5305
5306 expose_port_flush(&expose_args->fw_ctx, arg_expose_ports, AF_INET, &expose_args->address4);
5307 expose_port_flush(&expose_args->fw_ctx, arg_expose_ports, AF_INET6, &expose_args->address6);
5308
5309 (void) remove_veth_links(veth_name, arg_network_veth_extra);
5310 *veth_created = false;
5311 return 1; /* loop again */
5312 }
5313
5314 static int initialize_rlimits(void) {
5315 /* The default resource limits the kernel passes to PID 1, as per kernel 5.16. Let's pass our container payload
5316 * the same values as the kernel originally passed to PID 1, in order to minimize differences between host and
5317 * container execution environments. */
5318
5319 static const struct rlimit kernel_defaults[_RLIMIT_MAX] = {
5320 [RLIMIT_AS] = { RLIM_INFINITY, RLIM_INFINITY },
5321 [RLIMIT_CORE] = { 0, RLIM_INFINITY },
5322 [RLIMIT_CPU] = { RLIM_INFINITY, RLIM_INFINITY },
5323 [RLIMIT_DATA] = { RLIM_INFINITY, RLIM_INFINITY },
5324 [RLIMIT_FSIZE] = { RLIM_INFINITY, RLIM_INFINITY },
5325 [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY },
5326 [RLIMIT_MEMLOCK] = { DEFAULT_RLIMIT_MEMLOCK, DEFAULT_RLIMIT_MEMLOCK },
5327 [RLIMIT_MSGQUEUE] = { 819200, 819200 },
5328 [RLIMIT_NICE] = { 0, 0 },
5329 [RLIMIT_NOFILE] = { 1024, 4096 },
5330 [RLIMIT_RSS] = { RLIM_INFINITY, RLIM_INFINITY },
5331 [RLIMIT_RTPRIO] = { 0, 0 },
5332 [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY },
5333 [RLIMIT_STACK] = { 8388608, RLIM_INFINITY },
5334
5335 /* The kernel scales the default for RLIMIT_NPROC and RLIMIT_SIGPENDING based on the system's amount of
5336 * RAM. To provide best compatibility we'll read these limits off PID 1 instead of hardcoding them
5337 * here. This is safe as we know that PID 1 doesn't change these two limits and thus the original
5338 * kernel's initialization should still be valid during runtime — at least if PID 1 is systemd. Note
5339 * that PID 1 changes a number of other resource limits during early initialization which is why we
5340 * don't read the other limits from PID 1 but prefer the static table above. */
5341 };
5342
5343 int rl;
5344
5345 for (rl = 0; rl < _RLIMIT_MAX; rl++) {
5346 /* Let's only fill in what the user hasn't explicitly configured anyway */
5347 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)) == 0) {
5348 const struct rlimit *v;
5349 struct rlimit buffer;
5350
5351 if (IN_SET(rl, RLIMIT_NPROC, RLIMIT_SIGPENDING)) {
5352 /* For these two let's read the limits off PID 1. See above for an explanation. */
5353
5354 if (prlimit(1, rl, NULL, &buffer) < 0)
5355 return log_error_errno(errno, "Failed to read resource limit RLIMIT_%s of PID 1: %m", rlimit_to_string(rl));
5356
5357 v = &buffer;
5358 } else if (rl == RLIMIT_NOFILE) {
5359 /* We nowadays bump RLIMIT_NOFILE's hard limit early in PID 1 for all
5360 * userspace. Given that nspawn containers are often run without our PID 1,
5361 * let's grant the containers a raised RLIMIT_NOFILE hard limit by default,
5362 * so that container userspace gets similar resources as host userspace
5363 * gets. */
5364 buffer = kernel_defaults[rl];
5365 buffer.rlim_max = MIN((rlim_t) read_nr_open(), (rlim_t) HIGH_RLIMIT_NOFILE);
5366 v = &buffer;
5367 } else
5368 v = kernel_defaults + rl;
5369
5370 arg_rlimit[rl] = newdup(struct rlimit, v, 1);
5371 if (!arg_rlimit[rl])
5372 return log_oom();
5373 }
5374
5375 if (DEBUG_LOGGING) {
5376 _cleanup_free_ char *k = NULL;
5377
5378 (void) rlimit_format(arg_rlimit[rl], &k);
5379 log_debug("Setting RLIMIT_%s to %s.", rlimit_to_string(rl), k);
5380 }
5381 }
5382
5383 return 0;
5384 }
5385
5386 static int cant_be_in_netns(void) {
5387 char udev_path[STRLEN("/proc//ns/net") + DECIMAL_STR_MAX(pid_t)];
5388 _cleanup_free_ char *udev_ns = NULL, *our_ns = NULL;
5389 _cleanup_close_ int fd = -1;
5390 struct ucred ucred;
5391 int r;
5392
5393 /* Check if we are in the same netns as udev. If we aren't, then device monitoring (and thus waiting
5394 * for loopback block devices) won't work, and we will hang. Detect this case and exit early with a
5395 * nice message. */
5396
5397 if (!arg_image) /* only matters if --image= us used, i.e. we actually need to use loopback devices */
5398 return 0;
5399
5400 fd = socket(AF_UNIX, SOCK_SEQPACKET|SOCK_NONBLOCK|SOCK_CLOEXEC, 0);
5401 if (fd < 0)
5402 return log_error_errno(errno, "Failed to allocate udev control socket: %m");
5403
5404 r = connect_unix_path(fd, AT_FDCWD, "/run/udev/control");
5405 if (r < 0) {
5406 if (r == -ENOENT || ERRNO_IS_DISCONNECT(r))
5407 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
5408 "Sorry, but --image= requires access to the host's /run/ hierarchy, since we need access to udev.");
5409
5410 return log_error_errno(r, "Failed to connect socket to udev control socket: %m");
5411 }
5412
5413 r = getpeercred(fd, &ucred);
5414 if (r < 0)
5415 return log_error_errno(r, "Failed to determine peer of udev control socket: %m");
5416
5417 xsprintf(udev_path, "/proc/" PID_FMT "/ns/net", ucred.pid);
5418 r = readlink_malloc(udev_path, &udev_ns);
5419 if (r < 0)
5420 return log_error_errno(r, "Failed to read network namespace of udev: %m");
5421
5422 r = readlink_malloc("/proc/self/ns/net", &our_ns);
5423 if (r < 0)
5424 return log_error_errno(r, "Failed to read our own network namespace: %m");
5425
5426 if (!streq(our_ns, udev_ns))
5427 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
5428 "Sorry, but --image= is only supported in the main network namespace, since we need access to udev/AF_NETLINK.");
5429 return 0;
5430 }
5431
5432 static int run(int argc, char *argv[]) {
5433 bool secondary = false, remove_directory = false, remove_image = false,
5434 veth_created = false, remove_tmprootdir = false;
5435 _cleanup_close_ int master = -1;
5436 _cleanup_fdset_free_ FDSet *fds = NULL;
5437 int r, n_fd_passed, ret = EXIT_SUCCESS;
5438 char veth_name[IFNAMSIZ] = "";
5439 struct ExposeArgs expose_args = {};
5440 _cleanup_(release_lock_file) LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
5441 char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
5442 _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
5443 _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
5444 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
5445 _cleanup_(fw_ctx_freep) FirewallContext *fw_ctx = NULL;
5446 pid_t pid = 0;
5447
5448 log_parse_environment();
5449 log_open();
5450
5451 r = parse_argv(argc, argv);
5452 if (r <= 0)
5453 goto finish;
5454
5455 if (geteuid() != 0) {
5456 r = log_warning_errno(SYNTHETIC_ERRNO(EPERM),
5457 argc >= 2 ? "Need to be root." :
5458 "Need to be root (and some arguments are usually required).\nHint: try --help");
5459 goto finish;
5460 }
5461
5462 r = cant_be_in_netns();
5463 if (r < 0)
5464 goto finish;
5465
5466 r = initialize_rlimits();
5467 if (r < 0)
5468 goto finish;
5469
5470 r = load_oci_bundle();
5471 if (r < 0)
5472 goto finish;
5473
5474 r = determine_names();
5475 if (r < 0)
5476 goto finish;
5477
5478 r = load_settings();
5479 if (r < 0)
5480 goto finish;
5481
5482 r = cg_unified();
5483 if (r < 0) {
5484 log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
5485 goto finish;
5486 }
5487
5488 r = verify_arguments();
5489 if (r < 0)
5490 goto finish;
5491
5492 /* Reapply environment settings. */
5493 (void) detect_unified_cgroup_hierarchy_from_environment();
5494
5495 /* Ignore SIGPIPE here, because we use splice() on the ptyfwd stuff and that will generate SIGPIPE if
5496 * the result is closed. Note that the container payload child will reset signal mask+handler anyway,
5497 * so just turning this off here means we only turn it off in nspawn itself, not any children. */
5498 (void) ignore_signals(SIGPIPE);
5499
5500 n_fd_passed = sd_listen_fds(false);
5501 if (n_fd_passed > 0) {
5502 r = fdset_new_listen_fds(&fds, false);
5503 if (r < 0) {
5504 log_error_errno(r, "Failed to collect file descriptors: %m");
5505 goto finish;
5506 }
5507 }
5508
5509 /* The "default" umask. This is appropriate for most file and directory
5510 * operations performed by nspawn, and is the umask that will be used for
5511 * the child. Functions like copy_devnodes() change the umask temporarily. */
5512 umask(0022);
5513
5514 if (arg_directory) {
5515 assert(!arg_image);
5516
5517 /* Safety precaution: let's not allow running images from the live host OS image, as long as
5518 * /var from the host will propagate into container dynamically (because bad things happen if
5519 * two systems write to the same /var). Let's allow it for the special cases where /var is
5520 * either copied (i.e. --ephemeral) or replaced (i.e. --volatile=yes|state). */
5521 if (path_equal(arg_directory, "/") && !(arg_ephemeral || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_STATE))) {
5522 log_error("Spawning container on root directory is not supported. Consider using --ephemeral, --volatile=yes or --volatile=state.");
5523 r = -EINVAL;
5524 goto finish;
5525 }
5526
5527 if (arg_ephemeral) {
5528 _cleanup_free_ char *np = NULL;
5529
5530 r = chase_symlinks_and_update(&arg_directory, 0);
5531 if (r < 0)
5532 goto finish;
5533
5534 /* If the specified path is a mount point we generate the new snapshot immediately
5535 * inside it under a random name. However if the specified is not a mount point we
5536 * create the new snapshot in the parent directory, just next to it. */
5537 r = path_is_mount_point(arg_directory, NULL, 0);
5538 if (r < 0) {
5539 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
5540 goto finish;
5541 }
5542 if (r > 0)
5543 r = tempfn_random_child(arg_directory, "machine.", &np);
5544 else
5545 r = tempfn_random(arg_directory, "machine.", &np);
5546 if (r < 0) {
5547 log_error_errno(r, "Failed to generate name for directory snapshot: %m");
5548 goto finish;
5549 }
5550
5551 /* We take an exclusive lock on this image, since it's our private, ephemeral copy
5552 * only owned by us and no one else. */
5553 r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
5554 if (r < 0) {
5555 log_error_errno(r, "Failed to lock %s: %m", np);
5556 goto finish;
5557 }
5558
5559 {
5560 BLOCK_SIGNALS(SIGINT);
5561 r = btrfs_subvol_snapshot(arg_directory, np,
5562 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
5563 BTRFS_SNAPSHOT_FALLBACK_COPY |
5564 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
5565 BTRFS_SNAPSHOT_RECURSIVE |
5566 BTRFS_SNAPSHOT_QUOTA |
5567 BTRFS_SNAPSHOT_SIGINT);
5568 }
5569 if (r == -EINTR) {
5570 log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", np);
5571 goto finish;
5572 }
5573 if (r < 0) {
5574 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
5575 goto finish;
5576 }
5577
5578 free_and_replace(arg_directory, np);
5579 remove_directory = true;
5580 } else {
5581 r = chase_symlinks_and_update(&arg_directory, arg_template ? CHASE_NONEXISTENT : 0);
5582 if (r < 0)
5583 goto finish;
5584
5585 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
5586 if (r == -EBUSY) {
5587 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
5588 goto finish;
5589 }
5590 if (r < 0) {
5591 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
5592 goto finish;
5593 }
5594
5595 if (arg_template) {
5596 r = chase_symlinks_and_update(&arg_template, 0);
5597 if (r < 0)
5598 goto finish;
5599
5600 {
5601 BLOCK_SIGNALS(SIGINT);
5602 r = btrfs_subvol_snapshot(arg_template, arg_directory,
5603 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
5604 BTRFS_SNAPSHOT_FALLBACK_COPY |
5605 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
5606 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
5607 BTRFS_SNAPSHOT_RECURSIVE |
5608 BTRFS_SNAPSHOT_QUOTA |
5609 BTRFS_SNAPSHOT_SIGINT);
5610 }
5611 if (r == -EEXIST)
5612 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
5613 "Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
5614 else if (r == -EINTR) {
5615 log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", arg_directory);
5616 goto finish;
5617 } else if (r < 0) {
5618 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
5619 goto finish;
5620 } else
5621 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
5622 "Populated %s from template %s.", arg_directory, arg_template);
5623 }
5624 }
5625
5626 if (arg_start_mode == START_BOOT) {
5627 _cleanup_free_ char *b = NULL;
5628 const char *p;
5629
5630 if (arg_pivot_root_new) {
5631 b = path_join(arg_directory, arg_pivot_root_new);
5632 if (!b)
5633 return log_oom();
5634
5635 p = b;
5636 } else
5637 p = arg_directory;
5638
5639 if (path_is_os_tree(p) <= 0) {
5640 r = log_error_errno(SYNTHETIC_ERRNO(EINVAL),
5641 "Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", p);
5642 goto finish;
5643 }
5644 } else {
5645 _cleanup_free_ char *p = NULL;
5646
5647 if (arg_pivot_root_new)
5648 p = path_join(arg_directory, arg_pivot_root_new, "/usr/");
5649 else
5650 p = path_join(arg_directory, "/usr/");
5651 if (!p)
5652 return log_oom();
5653
5654 if (laccess(p, F_OK) < 0) {
5655 r = log_error_errno(SYNTHETIC_ERRNO(EINVAL),
5656 "Directory %s doesn't look like it has an OS tree (/usr/ directory is missing). Refusing.", arg_directory);
5657 goto finish;
5658 }
5659 }
5660
5661 } else {
5662 DissectImageFlags dissect_image_flags =
5663 DISSECT_IMAGE_GENERIC_ROOT |
5664 DISSECT_IMAGE_REQUIRE_ROOT |
5665 DISSECT_IMAGE_RELAX_VAR_CHECK |
5666 DISSECT_IMAGE_USR_NO_ROOT;
5667 assert(arg_image);
5668 assert(!arg_template);
5669
5670 r = chase_symlinks_and_update(&arg_image, 0);
5671 if (r < 0)
5672 goto finish;
5673
5674 if (arg_ephemeral) {
5675 _cleanup_free_ char *np = NULL;
5676
5677 r = tempfn_random(arg_image, "machine.", &np);
5678 if (r < 0) {
5679 log_error_errno(r, "Failed to generate name for image snapshot: %m");
5680 goto finish;
5681 }
5682
5683 /* Always take an exclusive lock on our own ephemeral copy. */
5684 r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
5685 if (r < 0) {
5686 r = log_error_errno(r, "Failed to create image lock: %m");
5687 goto finish;
5688 }
5689
5690 {
5691 BLOCK_SIGNALS(SIGINT);
5692 r = copy_file(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600, FS_NOCOW_FL, FS_NOCOW_FL, COPY_REFLINK|COPY_CRTIME|COPY_SIGINT);
5693 }
5694 if (r == -EINTR) {
5695 log_error_errno(r, "Interrupted while copying image file to %s, removed again.", np);
5696 goto finish;
5697 }
5698 if (r < 0) {
5699 r = log_error_errno(r, "Failed to copy image file: %m");
5700 goto finish;
5701 }
5702
5703 free_and_replace(arg_image, np);
5704 remove_image = true;
5705 } else {
5706 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
5707 if (r == -EBUSY) {
5708 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
5709 goto finish;
5710 }
5711 if (r < 0) {
5712 r = log_error_errno(r, "Failed to create image lock: %m");
5713 goto finish;
5714 }
5715
5716 r = verity_settings_load(
5717 &arg_verity_settings,
5718 arg_image, NULL, NULL);
5719 if (r < 0) {
5720 log_error_errno(r, "Failed to read verity artefacts for %s: %m", arg_image);
5721 goto finish;
5722 }
5723
5724 if (arg_verity_settings.data_path)
5725 dissect_image_flags |= DISSECT_IMAGE_NO_PARTITION_TABLE;
5726 }
5727
5728 if (!mkdtemp(tmprootdir)) {
5729 r = log_error_errno(errno, "Failed to create temporary directory: %m");
5730 goto finish;
5731 }
5732
5733 remove_tmprootdir = true;
5734
5735 arg_directory = strdup(tmprootdir);
5736 if (!arg_directory) {
5737 r = log_oom();
5738 goto finish;
5739 }
5740
5741 r = loop_device_make_by_path(
5742 arg_image,
5743 arg_read_only ? O_RDONLY : O_RDWR,
5744 FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN,
5745 &loop);
5746 if (r < 0) {
5747 log_error_errno(r, "Failed to set up loopback block device: %m");
5748 goto finish;
5749 }
5750
5751 /* Take a LOCK_SH lock on the device, so that udevd doesn't issue BLKRRPART behind our back */
5752 r = loop_device_flock(loop, LOCK_SH);
5753 if (r < 0) {
5754 log_error_errno(r, "Failed to take lock on loopback block device: %m");
5755 goto finish;
5756 }
5757
5758 r = dissect_image_and_warn(
5759 loop->fd,
5760 arg_image,
5761 &arg_verity_settings,
5762 NULL,
5763 loop->diskseq,
5764 loop->uevent_seqnum_not_before,
5765 loop->timestamp_not_before,
5766 dissect_image_flags,
5767 &dissected_image);
5768 if (r == -ENOPKG) {
5769 /* dissect_image_and_warn() already printed a brief error message. Expand on that with more details */
5770 log_notice("Note that the disk image needs to\n"
5771 " a) either contain only a single MBR partition of type 0x83 that is marked bootable\n"
5772 " b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n"
5773 " c) or follow https://systemd.io/DISCOVERABLE_PARTITIONS\n"
5774 " d) or contain a file system without a partition table\n"
5775 "in order to be bootable with systemd-nspawn.");
5776 goto finish;
5777 }
5778 if (r < 0)
5779 goto finish;
5780
5781 r = dissected_image_load_verity_sig_partition(
5782 dissected_image,
5783 loop->fd,
5784 &arg_verity_settings);
5785 if (r < 0)
5786 goto finish;
5787
5788 if (dissected_image->has_verity && !arg_verity_settings.root_hash && !dissected_image->has_verity_sig)
5789 log_notice("Note: image %s contains verity information, but no root hash specified and no embedded "
5790 "root hash signature found! Proceeding without integrity checking.", arg_image);
5791
5792 r = dissected_image_decrypt_interactively(
5793 dissected_image,
5794 NULL,
5795 &arg_verity_settings,
5796 0,
5797 &decrypted_image);
5798 if (r < 0)
5799 goto finish;
5800
5801 /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */
5802 if (remove_image && unlink(arg_image) >= 0)
5803 remove_image = false;
5804 }
5805
5806 r = custom_mount_prepare_all(arg_directory, arg_custom_mounts, arg_n_custom_mounts);
5807 if (r < 0)
5808 goto finish;
5809
5810 if (arg_console_mode < 0)
5811 arg_console_mode =
5812 isatty(STDIN_FILENO) > 0 &&
5813 isatty(STDOUT_FILENO) > 0 ? CONSOLE_INTERACTIVE : CONSOLE_READ_ONLY;
5814
5815 if (arg_console_mode == CONSOLE_PIPE) /* if we pass STDERR on to the container, don't add our own logs into it too */
5816 arg_quiet = true;
5817
5818 if (!arg_quiet)
5819 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
5820 arg_machine, arg_image ?: arg_directory);
5821
5822 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
5823
5824 if (prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) < 0) {
5825 r = log_error_errno(errno, "Failed to become subreaper: %m");
5826 goto finish;
5827 }
5828
5829 if (arg_expose_ports) {
5830 r = fw_ctx_new(&fw_ctx);
5831 if (r < 0) {
5832 log_error_errno(r, "Cannot expose configured ports, firewall initialization failed: %m");
5833 goto finish;
5834 }
5835 expose_args.fw_ctx = fw_ctx;
5836 }
5837 for (;;) {
5838 r = run_container(dissected_image,
5839 secondary,
5840 fds,
5841 veth_name, &veth_created,
5842 &expose_args, &master,
5843 &pid, &ret);
5844 if (r <= 0)
5845 break;
5846 }
5847
5848 finish:
5849 (void) sd_notify(false,
5850 r == 0 && ret == EXIT_FORCE_RESTART ? "STOPPING=1\nSTATUS=Restarting..." :
5851 "STOPPING=1\nSTATUS=Terminating...");
5852
5853 if (pid > 0)
5854 (void) kill(pid, SIGKILL);
5855
5856 /* Try to flush whatever is still queued in the pty */
5857 if (master >= 0) {
5858 (void) copy_bytes(master, STDOUT_FILENO, UINT64_MAX, 0);
5859 master = safe_close(master);
5860 }
5861
5862 if (pid > 0)
5863 (void) wait_for_terminate(pid, NULL);
5864
5865 pager_close();
5866
5867 if (remove_directory && arg_directory) {
5868 int k;
5869
5870 k = rm_rf(arg_directory, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
5871 if (k < 0)
5872 log_warning_errno(k, "Cannot remove '%s', ignoring: %m", arg_directory);
5873 }
5874
5875 if (remove_image && arg_image) {
5876 if (unlink(arg_image) < 0)
5877 log_warning_errno(errno, "Can't remove image file '%s', ignoring: %m", arg_image);
5878 }
5879
5880 if (remove_tmprootdir) {
5881 if (rmdir(tmprootdir) < 0)
5882 log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir);
5883 }
5884
5885 if (arg_machine) {
5886 const char *p;
5887
5888 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
5889 (void) rm_rf(p, REMOVE_ROOT);
5890 }
5891
5892 expose_port_flush(&fw_ctx, arg_expose_ports, AF_INET, &expose_args.address4);
5893 expose_port_flush(&fw_ctx, arg_expose_ports, AF_INET6, &expose_args.address6);
5894
5895 if (veth_created)
5896 (void) remove_veth_links(veth_name, arg_network_veth_extra);
5897 (void) remove_bridge(arg_network_zone);
5898
5899 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
5900 expose_port_free_all(arg_expose_ports);
5901 rlimit_free_all(arg_rlimit);
5902 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
5903 credential_free_all(arg_credentials, arg_n_credentials);
5904
5905 if (r < 0)
5906 return r;
5907
5908 return ret;
5909 }
5910
/* Program entry point: hands run() to the main-function helper macro (presumably provided by
 * main-func.h, which this file includes; verify there). run() returns a negative errno-style value
 * when it fails itself, and otherwise the 'ret' value filled in by run_container().
 * NOTE(review): judging by the macro's name, positive return values are reported as a failing
 * exit status rather than as success; confirm against the macro definition. */
5911 DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run);